apache-airflow-providers-google 15.1.0rc1__py3-none-any.whl → 19.1.0rc1__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the packages exactly as they appear in their respective public registries.
- airflow/providers/google/3rd-party-licenses/NOTICE +2 -12
- airflow/providers/google/__init__.py +3 -3
- airflow/providers/google/ads/hooks/ads.py +39 -5
- airflow/providers/google/ads/operators/ads.py +2 -2
- airflow/providers/google/ads/transfers/ads_to_gcs.py +2 -2
- airflow/providers/google/assets/gcs.py +1 -11
- airflow/providers/google/cloud/bundles/__init__.py +16 -0
- airflow/providers/google/cloud/bundles/gcs.py +161 -0
- airflow/providers/google/cloud/hooks/bigquery.py +166 -281
- airflow/providers/google/cloud/hooks/cloud_composer.py +287 -14
- airflow/providers/google/cloud/hooks/cloud_logging.py +109 -0
- airflow/providers/google/cloud/hooks/cloud_run.py +17 -9
- airflow/providers/google/cloud/hooks/cloud_sql.py +101 -22
- airflow/providers/google/cloud/hooks/cloud_storage_transfer_service.py +27 -6
- airflow/providers/google/cloud/hooks/compute_ssh.py +5 -1
- airflow/providers/google/cloud/hooks/datacatalog.py +9 -1
- airflow/providers/google/cloud/hooks/dataflow.py +71 -94
- airflow/providers/google/cloud/hooks/datafusion.py +1 -1
- airflow/providers/google/cloud/hooks/dataplex.py +1 -1
- airflow/providers/google/cloud/hooks/dataprep.py +1 -1
- airflow/providers/google/cloud/hooks/dataproc.py +72 -71
- airflow/providers/google/cloud/hooks/gcs.py +111 -14
- airflow/providers/google/cloud/hooks/gen_ai.py +196 -0
- airflow/providers/google/cloud/hooks/kubernetes_engine.py +2 -2
- airflow/providers/google/cloud/hooks/looker.py +6 -1
- airflow/providers/google/cloud/hooks/mlengine.py +3 -2
- airflow/providers/google/cloud/hooks/secret_manager.py +102 -10
- airflow/providers/google/cloud/hooks/spanner.py +73 -8
- airflow/providers/google/cloud/hooks/stackdriver.py +10 -8
- airflow/providers/google/cloud/hooks/translate.py +1 -1
- airflow/providers/google/cloud/hooks/vertex_ai/auto_ml.py +0 -209
- airflow/providers/google/cloud/hooks/vertex_ai/batch_prediction_job.py +2 -2
- airflow/providers/google/cloud/hooks/vertex_ai/custom_job.py +27 -1
- airflow/providers/google/cloud/hooks/vertex_ai/experiment_service.py +202 -0
- airflow/providers/google/cloud/hooks/vertex_ai/feature_store.py +307 -7
- airflow/providers/google/cloud/hooks/vertex_ai/generative_model.py +79 -75
- airflow/providers/google/cloud/hooks/vertex_ai/ray.py +223 -0
- airflow/providers/google/cloud/hooks/vision.py +2 -2
- airflow/providers/google/cloud/hooks/workflows.py +1 -1
- airflow/providers/google/cloud/links/alloy_db.py +0 -46
- airflow/providers/google/cloud/links/base.py +77 -13
- airflow/providers/google/cloud/links/bigquery.py +0 -47
- airflow/providers/google/cloud/links/bigquery_dts.py +0 -20
- airflow/providers/google/cloud/links/bigtable.py +0 -48
- airflow/providers/google/cloud/links/cloud_build.py +0 -73
- airflow/providers/google/cloud/links/cloud_functions.py +0 -33
- airflow/providers/google/cloud/links/cloud_memorystore.py +0 -58
- airflow/providers/google/cloud/links/{life_sciences.py → cloud_run.py} +5 -27
- airflow/providers/google/cloud/links/cloud_sql.py +0 -33
- airflow/providers/google/cloud/links/cloud_storage_transfer.py +17 -44
- airflow/providers/google/cloud/links/cloud_tasks.py +7 -26
- airflow/providers/google/cloud/links/compute.py +0 -58
- airflow/providers/google/cloud/links/data_loss_prevention.py +0 -169
- airflow/providers/google/cloud/links/datacatalog.py +23 -54
- airflow/providers/google/cloud/links/dataflow.py +0 -34
- airflow/providers/google/cloud/links/dataform.py +0 -64
- airflow/providers/google/cloud/links/datafusion.py +1 -96
- airflow/providers/google/cloud/links/dataplex.py +0 -154
- airflow/providers/google/cloud/links/dataprep.py +0 -24
- airflow/providers/google/cloud/links/dataproc.py +11 -95
- airflow/providers/google/cloud/links/datastore.py +0 -31
- airflow/providers/google/cloud/links/kubernetes_engine.py +9 -60
- airflow/providers/google/cloud/links/managed_kafka.py +0 -70
- airflow/providers/google/cloud/links/mlengine.py +0 -70
- airflow/providers/google/cloud/links/pubsub.py +0 -32
- airflow/providers/google/cloud/links/spanner.py +0 -33
- airflow/providers/google/cloud/links/stackdriver.py +0 -30
- airflow/providers/google/cloud/links/translate.py +17 -187
- airflow/providers/google/cloud/links/vertex_ai.py +28 -195
- airflow/providers/google/cloud/links/workflows.py +0 -52
- airflow/providers/google/cloud/log/gcs_task_handler.py +17 -9
- airflow/providers/google/cloud/log/stackdriver_task_handler.py +9 -6
- airflow/providers/google/cloud/openlineage/CloudStorageTransferJobFacet.json +68 -0
- airflow/providers/google/cloud/openlineage/CloudStorageTransferRunFacet.json +60 -0
- airflow/providers/google/cloud/openlineage/DataFusionRunFacet.json +32 -0
- airflow/providers/google/cloud/openlineage/facets.py +102 -1
- airflow/providers/google/cloud/openlineage/mixins.py +10 -8
- airflow/providers/google/cloud/openlineage/utils.py +15 -1
- airflow/providers/google/cloud/operators/alloy_db.py +70 -55
- airflow/providers/google/cloud/operators/bigquery.py +73 -636
- airflow/providers/google/cloud/operators/bigquery_dts.py +3 -5
- airflow/providers/google/cloud/operators/bigtable.py +36 -7
- airflow/providers/google/cloud/operators/cloud_base.py +21 -1
- airflow/providers/google/cloud/operators/cloud_batch.py +2 -2
- airflow/providers/google/cloud/operators/cloud_build.py +75 -32
- airflow/providers/google/cloud/operators/cloud_composer.py +128 -40
- airflow/providers/google/cloud/operators/cloud_logging_sink.py +341 -0
- airflow/providers/google/cloud/operators/cloud_memorystore.py +69 -43
- airflow/providers/google/cloud/operators/cloud_run.py +23 -5
- airflow/providers/google/cloud/operators/cloud_sql.py +8 -16
- airflow/providers/google/cloud/operators/cloud_storage_transfer_service.py +92 -11
- airflow/providers/google/cloud/operators/compute.py +8 -40
- airflow/providers/google/cloud/operators/datacatalog.py +157 -21
- airflow/providers/google/cloud/operators/dataflow.py +38 -15
- airflow/providers/google/cloud/operators/dataform.py +15 -5
- airflow/providers/google/cloud/operators/datafusion.py +41 -20
- airflow/providers/google/cloud/operators/dataplex.py +193 -109
- airflow/providers/google/cloud/operators/dataprep.py +1 -5
- airflow/providers/google/cloud/operators/dataproc.py +78 -35
- airflow/providers/google/cloud/operators/dataproc_metastore.py +96 -88
- airflow/providers/google/cloud/operators/datastore.py +22 -6
- airflow/providers/google/cloud/operators/dlp.py +6 -29
- airflow/providers/google/cloud/operators/functions.py +16 -7
- airflow/providers/google/cloud/operators/gcs.py +10 -8
- airflow/providers/google/cloud/operators/gen_ai.py +389 -0
- airflow/providers/google/cloud/operators/kubernetes_engine.py +60 -99
- airflow/providers/google/cloud/operators/looker.py +1 -1
- airflow/providers/google/cloud/operators/managed_kafka.py +107 -52
- airflow/providers/google/cloud/operators/natural_language.py +1 -1
- airflow/providers/google/cloud/operators/pubsub.py +60 -14
- airflow/providers/google/cloud/operators/spanner.py +25 -12
- airflow/providers/google/cloud/operators/speech_to_text.py +1 -2
- airflow/providers/google/cloud/operators/stackdriver.py +1 -9
- airflow/providers/google/cloud/operators/tasks.py +1 -12
- airflow/providers/google/cloud/operators/text_to_speech.py +1 -2
- airflow/providers/google/cloud/operators/translate.py +40 -16
- airflow/providers/google/cloud/operators/translate_speech.py +1 -2
- airflow/providers/google/cloud/operators/vertex_ai/auto_ml.py +39 -19
- airflow/providers/google/cloud/operators/vertex_ai/batch_prediction_job.py +29 -9
- airflow/providers/google/cloud/operators/vertex_ai/custom_job.py +54 -26
- airflow/providers/google/cloud/operators/vertex_ai/dataset.py +70 -8
- airflow/providers/google/cloud/operators/vertex_ai/endpoint_service.py +43 -9
- airflow/providers/google/cloud/operators/vertex_ai/experiment_service.py +435 -0
- airflow/providers/google/cloud/operators/vertex_ai/feature_store.py +532 -1
- airflow/providers/google/cloud/operators/vertex_ai/generative_model.py +135 -116
- airflow/providers/google/cloud/operators/vertex_ai/hyperparameter_tuning_job.py +11 -9
- airflow/providers/google/cloud/operators/vertex_ai/model_service.py +57 -11
- airflow/providers/google/cloud/operators/vertex_ai/pipeline_job.py +30 -7
- airflow/providers/google/cloud/operators/vertex_ai/ray.py +393 -0
- airflow/providers/google/cloud/operators/video_intelligence.py +1 -1
- airflow/providers/google/cloud/operators/vision.py +2 -2
- airflow/providers/google/cloud/operators/workflows.py +18 -15
- airflow/providers/google/cloud/sensors/bigquery.py +2 -2
- airflow/providers/google/cloud/sensors/bigquery_dts.py +2 -2
- airflow/providers/google/cloud/sensors/bigtable.py +11 -4
- airflow/providers/google/cloud/sensors/cloud_composer.py +533 -29
- airflow/providers/google/cloud/sensors/cloud_storage_transfer_service.py +2 -2
- airflow/providers/google/cloud/sensors/dataflow.py +26 -9
- airflow/providers/google/cloud/sensors/dataform.py +2 -2
- airflow/providers/google/cloud/sensors/datafusion.py +4 -4
- airflow/providers/google/cloud/sensors/dataplex.py +2 -2
- airflow/providers/google/cloud/sensors/dataprep.py +2 -2
- airflow/providers/google/cloud/sensors/dataproc.py +2 -2
- airflow/providers/google/cloud/sensors/dataproc_metastore.py +2 -2
- airflow/providers/google/cloud/sensors/gcs.py +4 -4
- airflow/providers/google/cloud/sensors/looker.py +2 -2
- airflow/providers/google/cloud/sensors/pubsub.py +4 -4
- airflow/providers/google/cloud/sensors/tasks.py +2 -2
- airflow/providers/google/cloud/sensors/vertex_ai/feature_store.py +2 -2
- airflow/providers/google/cloud/sensors/workflows.py +2 -2
- airflow/providers/google/cloud/transfers/adls_to_gcs.py +1 -1
- airflow/providers/google/cloud/transfers/azure_blob_to_gcs.py +2 -2
- airflow/providers/google/cloud/transfers/azure_fileshare_to_gcs.py +2 -2
- airflow/providers/google/cloud/transfers/bigquery_to_bigquery.py +11 -8
- airflow/providers/google/cloud/transfers/bigquery_to_gcs.py +4 -4
- airflow/providers/google/cloud/transfers/bigquery_to_mssql.py +7 -3
- airflow/providers/google/cloud/transfers/bigquery_to_mysql.py +12 -1
- airflow/providers/google/cloud/transfers/bigquery_to_postgres.py +24 -10
- airflow/providers/google/cloud/transfers/bigquery_to_sql.py +104 -5
- airflow/providers/google/cloud/transfers/calendar_to_gcs.py +1 -1
- airflow/providers/google/cloud/transfers/cassandra_to_gcs.py +2 -2
- airflow/providers/google/cloud/transfers/facebook_ads_to_gcs.py +3 -3
- airflow/providers/google/cloud/transfers/gcs_to_bigquery.py +20 -12
- airflow/providers/google/cloud/transfers/gcs_to_gcs.py +2 -2
- airflow/providers/google/cloud/transfers/gcs_to_local.py +5 -3
- airflow/providers/google/cloud/transfers/gcs_to_sftp.py +10 -4
- airflow/providers/google/cloud/transfers/gdrive_to_gcs.py +6 -2
- airflow/providers/google/cloud/transfers/gdrive_to_local.py +2 -2
- airflow/providers/google/cloud/transfers/http_to_gcs.py +193 -0
- airflow/providers/google/cloud/transfers/local_to_gcs.py +2 -2
- airflow/providers/google/cloud/transfers/mssql_to_gcs.py +1 -1
- airflow/providers/google/cloud/transfers/oracle_to_gcs.py +36 -11
- airflow/providers/google/cloud/transfers/postgres_to_gcs.py +42 -9
- airflow/providers/google/cloud/transfers/s3_to_gcs.py +12 -6
- airflow/providers/google/cloud/transfers/salesforce_to_gcs.py +2 -2
- airflow/providers/google/cloud/transfers/sftp_to_gcs.py +13 -4
- airflow/providers/google/cloud/transfers/sheets_to_gcs.py +3 -3
- airflow/providers/google/cloud/transfers/sql_to_gcs.py +10 -10
- airflow/providers/google/cloud/triggers/bigquery.py +75 -34
- airflow/providers/google/cloud/triggers/cloud_build.py +1 -1
- airflow/providers/google/cloud/triggers/cloud_composer.py +302 -46
- airflow/providers/google/cloud/triggers/cloud_run.py +2 -2
- airflow/providers/google/cloud/triggers/cloud_storage_transfer_service.py +91 -1
- airflow/providers/google/cloud/triggers/dataflow.py +122 -0
- airflow/providers/google/cloud/triggers/datafusion.py +1 -1
- airflow/providers/google/cloud/triggers/dataplex.py +14 -2
- airflow/providers/google/cloud/triggers/dataproc.py +122 -52
- airflow/providers/google/cloud/triggers/kubernetes_engine.py +45 -27
- airflow/providers/google/cloud/triggers/mlengine.py +1 -1
- airflow/providers/google/cloud/triggers/pubsub.py +15 -19
- airflow/providers/google/cloud/utils/bigquery_get_data.py +1 -1
- airflow/providers/google/cloud/utils/credentials_provider.py +1 -1
- airflow/providers/google/cloud/utils/field_validator.py +1 -2
- airflow/providers/google/common/auth_backend/google_openid.py +4 -4
- airflow/providers/google/common/deprecated.py +2 -1
- airflow/providers/google/common/hooks/base_google.py +27 -8
- airflow/providers/google/common/links/storage.py +0 -22
- airflow/providers/google/common/utils/get_secret.py +31 -0
- airflow/providers/google/common/utils/id_token_credentials.py +3 -4
- airflow/providers/google/firebase/operators/firestore.py +2 -2
- airflow/providers/google/get_provider_info.py +56 -52
- airflow/providers/google/go_module_utils.py +35 -3
- airflow/providers/google/leveldb/hooks/leveldb.py +26 -1
- airflow/providers/google/leveldb/operators/leveldb.py +2 -2
- airflow/providers/google/marketing_platform/hooks/display_video.py +3 -109
- airflow/providers/google/marketing_platform/links/analytics_admin.py +5 -14
- airflow/providers/google/marketing_platform/operators/analytics_admin.py +1 -2
- airflow/providers/google/marketing_platform/operators/campaign_manager.py +5 -5
- airflow/providers/google/marketing_platform/operators/display_video.py +28 -489
- airflow/providers/google/marketing_platform/operators/search_ads.py +2 -2
- airflow/providers/google/marketing_platform/sensors/campaign_manager.py +2 -2
- airflow/providers/google/marketing_platform/sensors/display_video.py +3 -63
- airflow/providers/google/suite/hooks/calendar.py +1 -1
- airflow/providers/google/suite/hooks/sheets.py +15 -1
- airflow/providers/google/suite/operators/sheets.py +8 -3
- airflow/providers/google/suite/sensors/drive.py +2 -2
- airflow/providers/google/suite/transfers/gcs_to_gdrive.py +2 -2
- airflow/providers/google/suite/transfers/gcs_to_sheets.py +1 -1
- airflow/providers/google/suite/transfers/local_to_drive.py +3 -3
- airflow/providers/google/suite/transfers/sql_to_sheets.py +5 -4
- airflow/providers/google/version_compat.py +15 -1
- {apache_airflow_providers_google-15.1.0rc1.dist-info → apache_airflow_providers_google-19.1.0rc1.dist-info}/METADATA +92 -48
- apache_airflow_providers_google-19.1.0rc1.dist-info/RECORD +331 -0
- apache_airflow_providers_google-19.1.0rc1.dist-info/licenses/NOTICE +5 -0
- airflow/providers/google/cloud/hooks/automl.py +0 -673
- airflow/providers/google/cloud/hooks/life_sciences.py +0 -159
- airflow/providers/google/cloud/links/automl.py +0 -193
- airflow/providers/google/cloud/operators/automl.py +0 -1362
- airflow/providers/google/cloud/operators/life_sciences.py +0 -119
- airflow/providers/google/cloud/operators/mlengine.py +0 -112
- apache_airflow_providers_google-15.1.0rc1.dist-info/RECORD +0 -321
- {apache_airflow_providers_google-15.1.0rc1.dist-info → apache_airflow_providers_google-19.1.0rc1.dist-info}/WHEEL +0 -0
- {apache_airflow_providers_google-15.1.0rc1.dist-info → apache_airflow_providers_google-19.1.0rc1.dist-info}/entry_points.txt +0 -0
- {airflow/providers/google → apache_airflow_providers_google-19.1.0rc1.dist-info/licenses}/LICENSE +0 -0
airflow/providers/google/cloud/hooks/dataproc.py

@@ -47,7 +47,7 @@ from google.cloud.dataproc_v1 import (
 )
 from airflow.exceptions import AirflowException
 from airflow.providers.google.common.consts import CLIENT_INFO
-from airflow.providers.google.common.hooks.base_google import GoogleBaseHook
+from airflow.providers.google.common.hooks.base_google import GoogleBaseAsyncHook, GoogleBaseHook
 from airflow.version import version as airflow_version

 if TYPE_CHECKING:
@@ -298,7 +298,7 @@ class DataprocHook(GoogleBaseHook):
         success_code = 0

         with self.provide_authorized_gcloud():
-            proc = subprocess.run(cmd, capture_output=True)
+            proc = subprocess.run(cmd, check=False, capture_output=True)

         if proc.returncode != success_code:
             stderr_last_20_lines = "\n".join(proc.stderr.decode().strip().splitlines()[-20:])
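The `check=False` addition is behavior-preserving: `subprocess.run` already defaults to `check=False`, so the new argument only makes explicit that a non-zero exit is handled by inspecting `returncode` rather than by catching `CalledProcessError`. A minimal sketch of the same pattern (the command is illustrative):

    import subprocess

    # check=False (the default, now explicit): a failing command does not raise;
    # the caller inspects returncode and surfaces only the tail of stderr.
    proc = subprocess.run(["gcloud", "--version"], check=False, capture_output=True)
    if proc.returncode != 0:
        tail = "\n".join(proc.stderr.decode().strip().splitlines()[-20:])
        raise RuntimeError(f"command failed:\n{tail}")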
@@ -912,12 +912,15 @@ class DataprocHook(GoogleBaseHook):
         state = None
         start = time.monotonic()
         while state not in (JobStatus.State.ERROR, JobStatus.State.DONE, JobStatus.State.CANCELLED):
+            self.log.debug("Waiting for job %s to complete", job_id)
             if timeout and start + timeout < time.monotonic():
                 raise AirflowException(f"Timeout: dataproc job {job_id} is not ready after {timeout}s")
+            self.log.debug("Sleeping for %s seconds", wait_time)
             time.sleep(wait_time)
             try:
                 job = self.get_job(project_id=project_id, region=region, job_id=job_id)
                 state = job.status.state
+                self.log.debug("Job %s is in state %s", job_id, state)
             except ServerError as err:
                 self.log.info("Retrying. Dataproc API returned server error when waiting for job: %s", err)

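The surrounding method is a standard poll-until-terminal loop: a monotonic-clock deadline plus a fixed sleep between status fetches, with the new debug lines tracing each iteration. A standalone sketch of the pattern, where `fetch_state` is a hypothetical callable standing in for `get_job`:

    import time

    TERMINAL = {"ERROR", "DONE", "CANCELLED"}

    def wait_until_done(fetch_state, wait_time=10.0, timeout=None):
        # Poll fetch_state() until a terminal state is reached or the deadline passes.
        start = time.monotonic()
        state = None
        while state not in TERMINAL:
            if timeout and time.monotonic() > start + timeout:
                raise TimeoutError(f"job not ready after {timeout}s")
            time.sleep(wait_time)
            state = fetch_state()
        return state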
@@ -1269,7 +1272,7 @@ class DataprocHook(GoogleBaseHook):
         return all([word in error_msg for word in key_words])


-class DataprocAsyncHook(GoogleBaseHook):
+class DataprocAsyncHook(GoogleBaseAsyncHook):
     """
     Asynchronous interaction with Google Cloud Dataproc APIs.

@@ -1277,6 +1280,8 @@ class DataprocAsyncHook(GoogleBaseHook):
     keyword arguments rather than positional.
     """

+    sync_hook_class = DataprocHook
+
     def __init__(
         self,
         gcp_conn_id: str = "google_cloud_default",
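`sync_hook_class` is how `GoogleBaseAsyncHook` subclasses declare which synchronous hook carries their connection and credential logic; the async hook instantiates it lazily and reuses it from coroutines. A rough sketch of the idea (not the provider's actual implementation):

    import asyncio

    class SyncHook:
        def get_credentials(self):
            return "credentials"  # placeholder for the real lookup

    class BaseAsyncHook:
        sync_hook_class = SyncHook  # subclasses override, as DataprocAsyncHook does above

        def __init__(self):
            self._sync_hook = None

        async def get_sync_hook(self):
            # Build the sync hook off the event loop so blocking I/O
            # (e.g. reading key files) cannot stall other coroutines.
            if self._sync_hook is None:
                self._sync_hook = await asyncio.to_thread(self.sync_hook_class)
            return self._sync_hook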
@@ -1286,53 +1291,90 @@ class DataprocAsyncHook(GoogleBaseHook):
         super().__init__(gcp_conn_id=gcp_conn_id, impersonation_chain=impersonation_chain, **kwargs)
         self._cached_client: JobControllerAsyncClient | None = None

-    def get_cluster_client(self, region: str | None = None) -> ClusterControllerAsyncClient:
+    async def get_cluster_client(self, region: str | None = None) -> ClusterControllerAsyncClient:
         """Create a ClusterControllerAsyncClient."""
         client_options = None
         if region and region != "global":
             client_options = ClientOptions(api_endpoint=f"{region}-dataproc.googleapis.com:443")

+        sync_hook = await self.get_sync_hook()
         return ClusterControllerAsyncClient(
-            credentials=self.get_credentials(), client_info=CLIENT_INFO, client_options=client_options
+            credentials=sync_hook.get_credentials(), client_info=CLIENT_INFO, client_options=client_options
         )

-    def get_template_client(self, region: str | None = None) -> WorkflowTemplateServiceAsyncClient:
+    async def get_template_client(self, region: str | None = None) -> WorkflowTemplateServiceAsyncClient:
         """Create a WorkflowTemplateServiceAsyncClient."""
         client_options = None
         if region and region != "global":
             client_options = ClientOptions(api_endpoint=f"{region}-dataproc.googleapis.com:443")

+        sync_hook = await self.get_sync_hook()
         return WorkflowTemplateServiceAsyncClient(
-            credentials=self.get_credentials(), client_info=CLIENT_INFO, client_options=client_options
+            credentials=sync_hook.get_credentials(), client_info=CLIENT_INFO, client_options=client_options
         )

-    def get_job_client(self, region: str | None = None) -> JobControllerAsyncClient:
+    async def get_job_client(self, region: str | None = None) -> JobControllerAsyncClient:
         """Create a JobControllerAsyncClient."""
         if self._cached_client is None:
             client_options = None
             if region and region != "global":
                 client_options = ClientOptions(api_endpoint=f"{region}-dataproc.googleapis.com:443")

+            sync_hook = await self.get_sync_hook()
             self._cached_client = JobControllerAsyncClient(
-                credentials=self.get_credentials(),
+                credentials=sync_hook.get_credentials(),
                 client_info=CLIENT_INFO,
                 client_options=client_options,
             )
         return self._cached_client

-    def get_batch_client(self, region: str | None = None) -> BatchControllerAsyncClient:
+    async def get_batch_client(self, region: str | None = None) -> BatchControllerAsyncClient:
         """Create a BatchControllerAsyncClient."""
         client_options = None
         if region and region != "global":
             client_options = ClientOptions(api_endpoint=f"{region}-dataproc.googleapis.com:443")

+        sync_hook = await self.get_sync_hook()
         return BatchControllerAsyncClient(
-            credentials=self.get_credentials(), client_info=CLIENT_INFO, client_options=client_options
+            credentials=sync_hook.get_credentials(), client_info=CLIENT_INFO, client_options=client_options
         )

-    def get_operations_client(self, region: str) -> OperationsClient:
+    async def get_operations_client(self, region: str) -> OperationsClient:
         """Create a OperationsClient."""
-        return self.get_template_client(region=region).transport.operations_client
+        template_client = await self.get_template_client(region=region)
+        return template_client.transport.operations_client
+
+    @GoogleBaseHook.fallback_to_default_project_id
+    async def get_cluster(
+        self,
+        region: str,
+        cluster_name: str,
+        project_id: str,
+        retry: AsyncRetry | _MethodDefault = DEFAULT,
+        timeout: float | None = None,
+        metadata: Sequence[tuple[str, str]] = (),
+    ) -> Cluster:
+        """
+        Get a cluster.
+
+        :param region: Cloud Dataproc region in which to handle the request.
+        :param cluster_name: Name of the cluster to get.
+        :param project_id: Google Cloud project ID that the cluster belongs to.
+        :param retry: A retry object used to retry requests. If *None*, requests
+            will not be retried.
+        :param timeout: The amount of time, in seconds, to wait for the request
+            to complete. If *retry* is specified, the timeout applies to each
+            individual attempt.
+        :param metadata: Additional metadata that is provided to the method.
+        """
+        client = await self.get_cluster_client(region=region)
+        result = await client.get_cluster(
+            request={"project_id": project_id, "region": region, "cluster_name": cluster_name},
+            retry=retry,
+            timeout=timeout,
+            metadata=metadata,
+        )
+        return result

     @GoogleBaseHook.fallback_to_default_project_id
     async def create_cluster(
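Because every client getter is now a coroutine, callers must await it before awaiting the RPC itself; the hook's own methods follow that two-await shape. A hedged usage sketch with illustrative project, region, and cluster values:

    import asyncio

    from airflow.providers.google.cloud.hooks.dataproc import DataprocAsyncHook

    async def cluster_state() -> str:
        hook = DataprocAsyncHook(gcp_conn_id="google_cloud_default")
        # get_cluster awaits get_cluster_client internally before issuing the RPC.
        cluster = await hook.get_cluster(
            project_id="my-project", region="us-central1", cluster_name="my-cluster"
        )
        return cluster.status.state.name

    # asyncio.run(cluster_state())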
@@ -1390,7 +1432,7 @@ class DataprocAsyncHook(GoogleBaseHook):
         cluster["config"] = cluster_config  # type: ignore
         cluster["labels"] = labels  # type: ignore

-        client = self.get_cluster_client(region=region)
+        client = await self.get_cluster_client(region=region)
         result = await client.create_cluster(
             request={
                 "project_id": project_id,
@@ -1435,7 +1477,7 @@ class DataprocAsyncHook(GoogleBaseHook):
             individual attempt.
         :param metadata: Additional metadata that is provided to the method.
         """
-        client = self.get_cluster_client(region=region)
+        client = await self.get_cluster_client(region=region)
         result = await client.delete_cluster(
             request={
                 "project_id": project_id,
@@ -1483,7 +1525,7 @@ class DataprocAsyncHook(GoogleBaseHook):
             individual attempt.
         :param metadata: Additional metadata that is provided to the method.
         """
-        client = self.get_cluster_client(region=region)
+        client = await self.get_cluster_client(region=region)
         result = await client.diagnose_cluster(
             request={
                 "project_id": project_id,
@@ -1500,38 +1542,6 @@ class DataprocAsyncHook(GoogleBaseHook):
         )
         return result

-    @GoogleBaseHook.fallback_to_default_project_id
-    async def get_cluster(
-        self,
-        region: str,
-        cluster_name: str,
-        project_id: str,
-        retry: AsyncRetry | _MethodDefault = DEFAULT,
-        timeout: float | None = None,
-        metadata: Sequence[tuple[str, str]] = (),
-    ) -> Cluster:
-        """
-        Get the resource representation for a cluster in a project.
-
-        :param project_id: Google Cloud project ID that the cluster belongs to.
-        :param region: Cloud Dataproc region to handle the request.
-        :param cluster_name: The cluster name.
-        :param retry: A retry object used to retry requests. If *None*, requests
-            will not be retried.
-        :param timeout: The amount of time, in seconds, to wait for the request
-            to complete. If *retry* is specified, the timeout applies to each
-            individual attempt.
-        :param metadata: Additional metadata that is provided to the method.
-        """
-        client = self.get_cluster_client(region=region)
-        result = await client.get_cluster(
-            request={"project_id": project_id, "region": region, "cluster_name": cluster_name},
-            retry=retry,
-            timeout=timeout,
-            metadata=metadata,
-        )
-        return result
-
     @GoogleBaseHook.fallback_to_default_project_id
     async def list_clusters(
         self,
@@ -1561,7 +1571,7 @@ class DataprocAsyncHook(GoogleBaseHook):
             individual attempt.
         :param metadata: Additional metadata that is provided to the method.
         """
-        client = self.get_cluster_client(region=region)
+        client = await self.get_cluster_client(region=region)
         result = await client.list_clusters(
             request={"project_id": project_id, "region": region, "filter": filter_, "page_size": page_size},
             retry=retry,
@@ -1638,7 +1648,7 @@ class DataprocAsyncHook(GoogleBaseHook):
         """
         if region is None:
             raise TypeError("missing 1 required keyword argument: 'region'")
-        client = self.get_cluster_client(region=region)
+        client = await self.get_cluster_client(region=region)
         operation = await client.update_cluster(
             request={
                 "project_id": project_id,
@@ -1680,10 +1690,8 @@ class DataprocAsyncHook(GoogleBaseHook):
             individual attempt.
         :param metadata: Additional metadata that is provided to the method.
         """
-        if region is None:
-            raise TypeError("missing 1 required keyword argument: 'region'")
         metadata = metadata or ()
-        client = self.get_template_client(region)
+        client = await self.get_template_client(region)
         parent = f"projects/{project_id}/regions/{region}"
         return await client.create_workflow_template(
             request={"parent": parent, "template": template}, retry=retry, timeout=timeout, metadata=metadata
@@ -1725,10 +1733,8 @@ class DataprocAsyncHook(GoogleBaseHook):
             individual attempt.
         :param metadata: Additional metadata that is provided to the method.
         """
-        if region is None:
-            raise TypeError("missing 1 required keyword argument: 'region'")
         metadata = metadata or ()
-        client = self.get_template_client(region)
+        client = await self.get_template_client(region)
         name = f"projects/{project_id}/regions/{region}/workflowTemplates/{template_name}"
         operation = await client.instantiate_workflow_template(
             request={"name": name, "version": version, "request_id": request_id, "parameters": parameters},
@@ -1767,10 +1773,8 @@ class DataprocAsyncHook(GoogleBaseHook):
             individual attempt.
         :param metadata: Additional metadata that is provided to the method.
         """
-        if region is None:
-            raise TypeError("missing 1 required keyword argument: 'region'")
         metadata = metadata or ()
-        client = self.get_template_client(region)
+        client = await self.get_template_client(region)
         parent = f"projects/{project_id}/regions/{region}"
         operation = await client.instantiate_inline_workflow_template(
             request={"parent": parent, "template": template, "request_id": request_id},
@@ -1781,7 +1785,8 @@ class DataprocAsyncHook(GoogleBaseHook):
         return operation

     async def get_operation(self, region, operation_name):
-        return await self.get_operations_client(region).get_operation(name=operation_name)
+        operations_client = await self.get_operations_client(region)
+        return await operations_client.get_operation(name=operation_name)

     @GoogleBaseHook.fallback_to_default_project_id
     async def get_job(
@@ -1806,9 +1811,7 @@ class DataprocAsyncHook(GoogleBaseHook):
             individual attempt.
         :param metadata: Additional metadata that is provided to the method.
         """
-        if region is None:
-            raise TypeError("missing 1 required keyword argument: 'region'")
-        client = self.get_job_client(region=region)
+        client = await self.get_job_client(region=region)
         job = await client.get_job(
             request={"project_id": project_id, "region": region, "job_id": job_id},
             retry=retry,
@@ -1845,9 +1848,7 @@ class DataprocAsyncHook(GoogleBaseHook):
             individual attempt.
         :param metadata: Additional metadata that is provided to the method.
         """
-        if region is None:
-            raise TypeError("missing 1 required keyword argument: 'region'")
-        client = self.get_job_client(region=region)
+        client = await self.get_job_client(region=region)
         return await client.submit_job(
             request={"project_id": project_id, "region": region, "job": job, "request_id": request_id},
             retry=retry,
@@ -1878,7 +1879,7 @@ class DataprocAsyncHook(GoogleBaseHook):
             individual attempt.
         :param metadata: Additional metadata that is provided to the method.
         """
-        client = self.get_job_client(region=region)
+        client = await self.get_job_client(region=region)

         job = await client.cancel_job(
             request={"project_id": project_id, "region": region, "job_id": job_id},
@@ -1920,7 +1921,7 @@ class DataprocAsyncHook(GoogleBaseHook):
             individual attempt.
         :param metadata: Additional metadata that is provided to the method.
         """
-        client = self.get_batch_client(region)
+        client = await self.get_batch_client(region)
         parent = f"projects/{project_id}/regions/{region}"

         result = await client.create_batch(
@@ -1959,7 +1960,7 @@ class DataprocAsyncHook(GoogleBaseHook):
             individual attempt.
         :param metadata: Additional metadata that is provided to the method.
         """
-        client = self.get_batch_client(region)
+        client = await self.get_batch_client(region)
         name = f"projects/{project_id}/locations/{region}/batches/{batch_id}"

         await client.delete_batch(
@@ -1994,7 +1995,7 @@ class DataprocAsyncHook(GoogleBaseHook):
             individual attempt.
         :param metadata: Additional metadata that is provided to the method.
         """
-        client = self.get_batch_client(region)
+        client = await self.get_batch_client(region)
         name = f"projects/{project_id}/locations/{region}/batches/{batch_id}"

         result = await client.get_batch(
@@ -2039,7 +2040,7 @@ class DataprocAsyncHook(GoogleBaseHook):
         :param filter: Result filters as specified in ListBatchesRequest
         :param order_by: How to order results as specified in ListBatchesRequest
         """
-        client = self.get_batch_client(region)
+        client = await self.get_batch_client(region)
         parent = f"projects/{project_id}/regions/{region}"

         result = await client.list_batches(
airflow/providers/google/cloud/hooks/gcs.py

@@ -26,24 +26,26 @@ import os
 import shutil
 import time
 import warnings
-from collections.abc import Generator, Sequence
+from collections.abc import Callable, Generator, Sequence
 from contextlib import contextmanager
+from datetime import datetime
 from functools import partial
 from io import BytesIO
+from pathlib import Path
 from tempfile import NamedTemporaryFile
-from typing import IO, TYPE_CHECKING, Any, TypeVar, cast, overload
+from typing import IO, TYPE_CHECKING, Any, ParamSpec, TypeVar, cast, overload
 from urllib.parse import urlsplit

+# Make mypy happy by importing as aliases
+import google.cloud.storage as storage
 from gcloud.aio.storage import Storage
 from google.api_core.exceptions import GoogleAPICallError, NotFound
-
-# not sure why but mypy complains on missing `storage` but it is clearly there and is importable
-from google.cloud import storage  # type: ignore[attr-defined]
 from google.cloud.exceptions import GoogleCloudError
 from google.cloud.storage.retry import DEFAULT_RETRY

 from airflow.exceptions import AirflowException, AirflowProviderDeprecationWarning
 from airflow.providers.common.compat.lineage.hook import get_hook_lineage_collector
+from airflow.providers.common.compat.sdk import timezone
 from airflow.providers.google.cloud.utils.helpers import normalize_directory_path
 from airflow.providers.google.common.consts import CLIENT_INFO
 from airflow.providers.google.common.hooks.base_google import (
@@ -51,13 +53,9 @@ from airflow.providers.google.common.hooks.base_google import (
     GoogleBaseAsyncHook,
     GoogleBaseHook,
 )
-from airflow.typing_compat import ParamSpec
-from airflow.utils import timezone
 from airflow.version import version

 if TYPE_CHECKING:
-    from datetime import datetime
-
     from aiohttp import ClientSession
     from google.api_core.retry import Retry
     from google.cloud.storage.blob import Blob
@@ -373,8 +371,7 @@ class GCSHook(GoogleBaseHook):
                     num_max_attempts,
                 )
                 raise
-
-            raise NotImplementedError  # should not reach this, but makes mypy happy
+        raise NotImplementedError  # should not reach this, but makes mypy happy

     def download_as_byte_array(
         self,
@@ -549,13 +546,13 @@ class GCSHook(GoogleBaseHook):
         if cache_control:
             blob.cache_control = cache_control

-        if filename and data:
+        if filename is not None and data is not None:
             raise ValueError(
                 "'filename' and 'data' parameter provided. Please "
                 "specify a single parameter, either 'filename' for "
                 "local file uploads or 'data' for file content uploads."
             )
-        if filename:
+        if filename is not None:
             if not mime_type:
                 mime_type = "application/octet-stream"
             if gzip:
@@ -575,7 +572,7 @@ class GCSHook(GoogleBaseHook):
             if gzip:
                 os.remove(filename)
             self.log.info("File %s uploaded to %s in %s bucket", filename, object_name, bucket_name)
-        elif data:
+        elif data is not None:
             if not mime_type:
                 mime_type = "text/plain"
             if gzip:
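Switching from truthiness to `is not None` matters for empty payloads: `b""` and `""` are falsy, so an upload of intentionally empty content previously fell through both branches instead of creating an empty object. A minimal illustration:

    data = b""  # an intentionally empty object body

    if data:                 # old check: empty bytes are falsy
        print("uploaded")    # never reached for b""

    if data is not None:     # new check: only a missing argument skips the branch
        print("uploaded")    # reached; empty objects upload correctly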
@@ -1251,6 +1248,106 @@ class GCSHook(GoogleBaseHook):

         self.log.info("Completed successfully.")

+    def _sync_to_local_dir_delete_stale_local_files(self, current_gcs_objects: list[Path], local_dir: Path):
+        current_gcs_keys = {key.resolve() for key in current_gcs_objects}
+
+        for item in local_dir.rglob("*"):
+            if item.is_file():
+                if item.resolve() not in current_gcs_keys:
+                    self.log.debug("Deleting stale local file: %s", item)
+                    item.unlink()
+        # Clean up empty directories
+        for root, dirs, _ in os.walk(local_dir, topdown=False):
+            for d in dirs:
+                dir_path = os.path.join(root, d)
+                if not os.listdir(dir_path):
+                    self.log.debug("Deleting stale empty directory: %s", dir_path)
+                    os.rmdir(dir_path)
+
+    def _sync_to_local_dir_if_changed(self, blob: Blob, local_target_path: Path):
+        should_download = False
+        download_msg = ""
+        if not local_target_path.exists():
+            should_download = True
+            download_msg = f"Local file {local_target_path} does not exist."
+        else:
+            local_stats = local_target_path.stat()
+            # Reload blob to get fresh metadata, including size and updated time
+            blob.reload()
+
+            if blob.size != local_stats.st_size:
+                should_download = True
+                download_msg = (
+                    f"GCS object size ({blob.size}) and local file size ({local_stats.st_size}) differ."
+                )
+
+            gcs_last_modified = blob.updated
+            if (
+                not should_download
+                and gcs_last_modified
+                and local_stats.st_mtime < gcs_last_modified.timestamp()
+            ):
+                should_download = True
+                download_msg = f"GCS object last modified ({gcs_last_modified}) is newer than local file last modified ({datetime.fromtimestamp(local_stats.st_mtime, tz=timezone.utc)})."
+
+        if should_download:
+            self.log.debug("%s Downloading %s to %s", download_msg, blob.name, local_target_path.as_posix())
+            self.download(
+                bucket_name=blob.bucket.name, object_name=blob.name, filename=str(local_target_path)
+            )
+        else:
+            self.log.debug(
+                "Local file %s is up-to-date with GCS object %s. Skipping download.",
+                local_target_path.as_posix(),
+                blob.name,
+            )
+
+    def sync_to_local_dir(
+        self,
+        bucket_name: str,
+        local_dir: str | Path,
+        prefix: str | None = None,
+        delete_stale: bool = False,
+    ) -> None:
+        """
+        Download files from a GCS bucket to a local directory.
+
+        It will download all files from the given ``prefix`` and create the corresponding
+        directory structure in the ``local_dir``.
+
+        If ``delete_stale`` is ``True``, it will delete all local files that do not exist in the GCS bucket.
+
+        :param bucket_name: The name of the GCS bucket.
+        :param local_dir: The local directory to which the files will be downloaded.
+        :param prefix: The prefix of the files to be downloaded.
+        :param delete_stale: If ``True``, deletes local files that don't exist in the bucket.
+        """
+        prefix = prefix or ""
+        local_dir_path = Path(local_dir)
+        self.log.debug("Downloading data from gs://%s/%s to %s", bucket_name, prefix, local_dir_path)
+
+        gcs_bucket = self.get_bucket(bucket_name)
+        local_gcs_objects = []
+
+        for blob in gcs_bucket.list_blobs(prefix=prefix):
+            # GCS lists "directories" as objects ending with a slash. We should skip them.
+            if blob.name.endswith("/"):
+                continue
+
+            blob_path = Path(blob.name)
+            local_target_path = local_dir_path.joinpath(blob_path.relative_to(prefix))
+            if not local_target_path.parent.exists():
+                local_target_path.parent.mkdir(parents=True, exist_ok=True)
+                self.log.debug("Created local directory: %s", local_target_path.parent)
+
+            self._sync_to_local_dir_if_changed(blob=blob, local_target_path=local_target_path)
+            local_gcs_objects.append(local_target_path)
+
+        if delete_stale:
+            self._sync_to_local_dir_delete_stale_local_files(
+                current_gcs_objects=local_gcs_objects, local_dir=local_dir_path
+            )
+
     def sync(
         self,
         source_bucket: str,
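The new `sync_to_local_dir` mirrors a bucket prefix into a local directory, downloading only objects that are missing locally, differ in size, or are newer than the local copy, and optionally pruning stale local files. A hedged usage sketch (bucket name and paths are illustrative):

    from airflow.providers.google.cloud.hooks.gcs import GCSHook

    hook = GCSHook(gcp_conn_id="google_cloud_default")
    # Mirror gs://example-bucket/dags/ into /tmp/dags and delete local files
    # that no longer exist under the prefix.
    hook.sync_to_local_dir(
        bucket_name="example-bucket",
        local_dir="/tmp/dags",
        prefix="dags/",
        delete_stale=True,
    )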