apache-airflow-providers-google 15.1.0rc1__py3-none-any.whl → 19.1.0rc1__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only and reflects the packages exactly as they appear in their respective public registries.
Files changed (234)
  1. airflow/providers/google/3rd-party-licenses/NOTICE +2 -12
  2. airflow/providers/google/__init__.py +3 -3
  3. airflow/providers/google/ads/hooks/ads.py +39 -5
  4. airflow/providers/google/ads/operators/ads.py +2 -2
  5. airflow/providers/google/ads/transfers/ads_to_gcs.py +2 -2
  6. airflow/providers/google/assets/gcs.py +1 -11
  7. airflow/providers/google/cloud/bundles/__init__.py +16 -0
  8. airflow/providers/google/cloud/bundles/gcs.py +161 -0
  9. airflow/providers/google/cloud/hooks/bigquery.py +166 -281
  10. airflow/providers/google/cloud/hooks/cloud_composer.py +287 -14
  11. airflow/providers/google/cloud/hooks/cloud_logging.py +109 -0
  12. airflow/providers/google/cloud/hooks/cloud_run.py +17 -9
  13. airflow/providers/google/cloud/hooks/cloud_sql.py +101 -22
  14. airflow/providers/google/cloud/hooks/cloud_storage_transfer_service.py +27 -6
  15. airflow/providers/google/cloud/hooks/compute_ssh.py +5 -1
  16. airflow/providers/google/cloud/hooks/datacatalog.py +9 -1
  17. airflow/providers/google/cloud/hooks/dataflow.py +71 -94
  18. airflow/providers/google/cloud/hooks/datafusion.py +1 -1
  19. airflow/providers/google/cloud/hooks/dataplex.py +1 -1
  20. airflow/providers/google/cloud/hooks/dataprep.py +1 -1
  21. airflow/providers/google/cloud/hooks/dataproc.py +72 -71
  22. airflow/providers/google/cloud/hooks/gcs.py +111 -14
  23. airflow/providers/google/cloud/hooks/gen_ai.py +196 -0
  24. airflow/providers/google/cloud/hooks/kubernetes_engine.py +2 -2
  25. airflow/providers/google/cloud/hooks/looker.py +6 -1
  26. airflow/providers/google/cloud/hooks/mlengine.py +3 -2
  27. airflow/providers/google/cloud/hooks/secret_manager.py +102 -10
  28. airflow/providers/google/cloud/hooks/spanner.py +73 -8
  29. airflow/providers/google/cloud/hooks/stackdriver.py +10 -8
  30. airflow/providers/google/cloud/hooks/translate.py +1 -1
  31. airflow/providers/google/cloud/hooks/vertex_ai/auto_ml.py +0 -209
  32. airflow/providers/google/cloud/hooks/vertex_ai/batch_prediction_job.py +2 -2
  33. airflow/providers/google/cloud/hooks/vertex_ai/custom_job.py +27 -1
  34. airflow/providers/google/cloud/hooks/vertex_ai/experiment_service.py +202 -0
  35. airflow/providers/google/cloud/hooks/vertex_ai/feature_store.py +307 -7
  36. airflow/providers/google/cloud/hooks/vertex_ai/generative_model.py +79 -75
  37. airflow/providers/google/cloud/hooks/vertex_ai/ray.py +223 -0
  38. airflow/providers/google/cloud/hooks/vision.py +2 -2
  39. airflow/providers/google/cloud/hooks/workflows.py +1 -1
  40. airflow/providers/google/cloud/links/alloy_db.py +0 -46
  41. airflow/providers/google/cloud/links/base.py +77 -13
  42. airflow/providers/google/cloud/links/bigquery.py +0 -47
  43. airflow/providers/google/cloud/links/bigquery_dts.py +0 -20
  44. airflow/providers/google/cloud/links/bigtable.py +0 -48
  45. airflow/providers/google/cloud/links/cloud_build.py +0 -73
  46. airflow/providers/google/cloud/links/cloud_functions.py +0 -33
  47. airflow/providers/google/cloud/links/cloud_memorystore.py +0 -58
  48. airflow/providers/google/cloud/links/{life_sciences.py → cloud_run.py} +5 -27
  49. airflow/providers/google/cloud/links/cloud_sql.py +0 -33
  50. airflow/providers/google/cloud/links/cloud_storage_transfer.py +17 -44
  51. airflow/providers/google/cloud/links/cloud_tasks.py +7 -26
  52. airflow/providers/google/cloud/links/compute.py +0 -58
  53. airflow/providers/google/cloud/links/data_loss_prevention.py +0 -169
  54. airflow/providers/google/cloud/links/datacatalog.py +23 -54
  55. airflow/providers/google/cloud/links/dataflow.py +0 -34
  56. airflow/providers/google/cloud/links/dataform.py +0 -64
  57. airflow/providers/google/cloud/links/datafusion.py +1 -96
  58. airflow/providers/google/cloud/links/dataplex.py +0 -154
  59. airflow/providers/google/cloud/links/dataprep.py +0 -24
  60. airflow/providers/google/cloud/links/dataproc.py +11 -95
  61. airflow/providers/google/cloud/links/datastore.py +0 -31
  62. airflow/providers/google/cloud/links/kubernetes_engine.py +9 -60
  63. airflow/providers/google/cloud/links/managed_kafka.py +0 -70
  64. airflow/providers/google/cloud/links/mlengine.py +0 -70
  65. airflow/providers/google/cloud/links/pubsub.py +0 -32
  66. airflow/providers/google/cloud/links/spanner.py +0 -33
  67. airflow/providers/google/cloud/links/stackdriver.py +0 -30
  68. airflow/providers/google/cloud/links/translate.py +17 -187
  69. airflow/providers/google/cloud/links/vertex_ai.py +28 -195
  70. airflow/providers/google/cloud/links/workflows.py +0 -52
  71. airflow/providers/google/cloud/log/gcs_task_handler.py +17 -9
  72. airflow/providers/google/cloud/log/stackdriver_task_handler.py +9 -6
  73. airflow/providers/google/cloud/openlineage/CloudStorageTransferJobFacet.json +68 -0
  74. airflow/providers/google/cloud/openlineage/CloudStorageTransferRunFacet.json +60 -0
  75. airflow/providers/google/cloud/openlineage/DataFusionRunFacet.json +32 -0
  76. airflow/providers/google/cloud/openlineage/facets.py +102 -1
  77. airflow/providers/google/cloud/openlineage/mixins.py +10 -8
  78. airflow/providers/google/cloud/openlineage/utils.py +15 -1
  79. airflow/providers/google/cloud/operators/alloy_db.py +70 -55
  80. airflow/providers/google/cloud/operators/bigquery.py +73 -636
  81. airflow/providers/google/cloud/operators/bigquery_dts.py +3 -5
  82. airflow/providers/google/cloud/operators/bigtable.py +36 -7
  83. airflow/providers/google/cloud/operators/cloud_base.py +21 -1
  84. airflow/providers/google/cloud/operators/cloud_batch.py +2 -2
  85. airflow/providers/google/cloud/operators/cloud_build.py +75 -32
  86. airflow/providers/google/cloud/operators/cloud_composer.py +128 -40
  87. airflow/providers/google/cloud/operators/cloud_logging_sink.py +341 -0
  88. airflow/providers/google/cloud/operators/cloud_memorystore.py +69 -43
  89. airflow/providers/google/cloud/operators/cloud_run.py +23 -5
  90. airflow/providers/google/cloud/operators/cloud_sql.py +8 -16
  91. airflow/providers/google/cloud/operators/cloud_storage_transfer_service.py +92 -11
  92. airflow/providers/google/cloud/operators/compute.py +8 -40
  93. airflow/providers/google/cloud/operators/datacatalog.py +157 -21
  94. airflow/providers/google/cloud/operators/dataflow.py +38 -15
  95. airflow/providers/google/cloud/operators/dataform.py +15 -5
  96. airflow/providers/google/cloud/operators/datafusion.py +41 -20
  97. airflow/providers/google/cloud/operators/dataplex.py +193 -109
  98. airflow/providers/google/cloud/operators/dataprep.py +1 -5
  99. airflow/providers/google/cloud/operators/dataproc.py +78 -35
  100. airflow/providers/google/cloud/operators/dataproc_metastore.py +96 -88
  101. airflow/providers/google/cloud/operators/datastore.py +22 -6
  102. airflow/providers/google/cloud/operators/dlp.py +6 -29
  103. airflow/providers/google/cloud/operators/functions.py +16 -7
  104. airflow/providers/google/cloud/operators/gcs.py +10 -8
  105. airflow/providers/google/cloud/operators/gen_ai.py +389 -0
  106. airflow/providers/google/cloud/operators/kubernetes_engine.py +60 -99
  107. airflow/providers/google/cloud/operators/looker.py +1 -1
  108. airflow/providers/google/cloud/operators/managed_kafka.py +107 -52
  109. airflow/providers/google/cloud/operators/natural_language.py +1 -1
  110. airflow/providers/google/cloud/operators/pubsub.py +60 -14
  111. airflow/providers/google/cloud/operators/spanner.py +25 -12
  112. airflow/providers/google/cloud/operators/speech_to_text.py +1 -2
  113. airflow/providers/google/cloud/operators/stackdriver.py +1 -9
  114. airflow/providers/google/cloud/operators/tasks.py +1 -12
  115. airflow/providers/google/cloud/operators/text_to_speech.py +1 -2
  116. airflow/providers/google/cloud/operators/translate.py +40 -16
  117. airflow/providers/google/cloud/operators/translate_speech.py +1 -2
  118. airflow/providers/google/cloud/operators/vertex_ai/auto_ml.py +39 -19
  119. airflow/providers/google/cloud/operators/vertex_ai/batch_prediction_job.py +29 -9
  120. airflow/providers/google/cloud/operators/vertex_ai/custom_job.py +54 -26
  121. airflow/providers/google/cloud/operators/vertex_ai/dataset.py +70 -8
  122. airflow/providers/google/cloud/operators/vertex_ai/endpoint_service.py +43 -9
  123. airflow/providers/google/cloud/operators/vertex_ai/experiment_service.py +435 -0
  124. airflow/providers/google/cloud/operators/vertex_ai/feature_store.py +532 -1
  125. airflow/providers/google/cloud/operators/vertex_ai/generative_model.py +135 -116
  126. airflow/providers/google/cloud/operators/vertex_ai/hyperparameter_tuning_job.py +11 -9
  127. airflow/providers/google/cloud/operators/vertex_ai/model_service.py +57 -11
  128. airflow/providers/google/cloud/operators/vertex_ai/pipeline_job.py +30 -7
  129. airflow/providers/google/cloud/operators/vertex_ai/ray.py +393 -0
  130. airflow/providers/google/cloud/operators/video_intelligence.py +1 -1
  131. airflow/providers/google/cloud/operators/vision.py +2 -2
  132. airflow/providers/google/cloud/operators/workflows.py +18 -15
  133. airflow/providers/google/cloud/sensors/bigquery.py +2 -2
  134. airflow/providers/google/cloud/sensors/bigquery_dts.py +2 -2
  135. airflow/providers/google/cloud/sensors/bigtable.py +11 -4
  136. airflow/providers/google/cloud/sensors/cloud_composer.py +533 -29
  137. airflow/providers/google/cloud/sensors/cloud_storage_transfer_service.py +2 -2
  138. airflow/providers/google/cloud/sensors/dataflow.py +26 -9
  139. airflow/providers/google/cloud/sensors/dataform.py +2 -2
  140. airflow/providers/google/cloud/sensors/datafusion.py +4 -4
  141. airflow/providers/google/cloud/sensors/dataplex.py +2 -2
  142. airflow/providers/google/cloud/sensors/dataprep.py +2 -2
  143. airflow/providers/google/cloud/sensors/dataproc.py +2 -2
  144. airflow/providers/google/cloud/sensors/dataproc_metastore.py +2 -2
  145. airflow/providers/google/cloud/sensors/gcs.py +4 -4
  146. airflow/providers/google/cloud/sensors/looker.py +2 -2
  147. airflow/providers/google/cloud/sensors/pubsub.py +4 -4
  148. airflow/providers/google/cloud/sensors/tasks.py +2 -2
  149. airflow/providers/google/cloud/sensors/vertex_ai/feature_store.py +2 -2
  150. airflow/providers/google/cloud/sensors/workflows.py +2 -2
  151. airflow/providers/google/cloud/transfers/adls_to_gcs.py +1 -1
  152. airflow/providers/google/cloud/transfers/azure_blob_to_gcs.py +2 -2
  153. airflow/providers/google/cloud/transfers/azure_fileshare_to_gcs.py +2 -2
  154. airflow/providers/google/cloud/transfers/bigquery_to_bigquery.py +11 -8
  155. airflow/providers/google/cloud/transfers/bigquery_to_gcs.py +4 -4
  156. airflow/providers/google/cloud/transfers/bigquery_to_mssql.py +7 -3
  157. airflow/providers/google/cloud/transfers/bigquery_to_mysql.py +12 -1
  158. airflow/providers/google/cloud/transfers/bigquery_to_postgres.py +24 -10
  159. airflow/providers/google/cloud/transfers/bigquery_to_sql.py +104 -5
  160. airflow/providers/google/cloud/transfers/calendar_to_gcs.py +1 -1
  161. airflow/providers/google/cloud/transfers/cassandra_to_gcs.py +2 -2
  162. airflow/providers/google/cloud/transfers/facebook_ads_to_gcs.py +3 -3
  163. airflow/providers/google/cloud/transfers/gcs_to_bigquery.py +20 -12
  164. airflow/providers/google/cloud/transfers/gcs_to_gcs.py +2 -2
  165. airflow/providers/google/cloud/transfers/gcs_to_local.py +5 -3
  166. airflow/providers/google/cloud/transfers/gcs_to_sftp.py +10 -4
  167. airflow/providers/google/cloud/transfers/gdrive_to_gcs.py +6 -2
  168. airflow/providers/google/cloud/transfers/gdrive_to_local.py +2 -2
  169. airflow/providers/google/cloud/transfers/http_to_gcs.py +193 -0
  170. airflow/providers/google/cloud/transfers/local_to_gcs.py +2 -2
  171. airflow/providers/google/cloud/transfers/mssql_to_gcs.py +1 -1
  172. airflow/providers/google/cloud/transfers/oracle_to_gcs.py +36 -11
  173. airflow/providers/google/cloud/transfers/postgres_to_gcs.py +42 -9
  174. airflow/providers/google/cloud/transfers/s3_to_gcs.py +12 -6
  175. airflow/providers/google/cloud/transfers/salesforce_to_gcs.py +2 -2
  176. airflow/providers/google/cloud/transfers/sftp_to_gcs.py +13 -4
  177. airflow/providers/google/cloud/transfers/sheets_to_gcs.py +3 -3
  178. airflow/providers/google/cloud/transfers/sql_to_gcs.py +10 -10
  179. airflow/providers/google/cloud/triggers/bigquery.py +75 -34
  180. airflow/providers/google/cloud/triggers/cloud_build.py +1 -1
  181. airflow/providers/google/cloud/triggers/cloud_composer.py +302 -46
  182. airflow/providers/google/cloud/triggers/cloud_run.py +2 -2
  183. airflow/providers/google/cloud/triggers/cloud_storage_transfer_service.py +91 -1
  184. airflow/providers/google/cloud/triggers/dataflow.py +122 -0
  185. airflow/providers/google/cloud/triggers/datafusion.py +1 -1
  186. airflow/providers/google/cloud/triggers/dataplex.py +14 -2
  187. airflow/providers/google/cloud/triggers/dataproc.py +122 -52
  188. airflow/providers/google/cloud/triggers/kubernetes_engine.py +45 -27
  189. airflow/providers/google/cloud/triggers/mlengine.py +1 -1
  190. airflow/providers/google/cloud/triggers/pubsub.py +15 -19
  191. airflow/providers/google/cloud/utils/bigquery_get_data.py +1 -1
  192. airflow/providers/google/cloud/utils/credentials_provider.py +1 -1
  193. airflow/providers/google/cloud/utils/field_validator.py +1 -2
  194. airflow/providers/google/common/auth_backend/google_openid.py +4 -4
  195. airflow/providers/google/common/deprecated.py +2 -1
  196. airflow/providers/google/common/hooks/base_google.py +27 -8
  197. airflow/providers/google/common/links/storage.py +0 -22
  198. airflow/providers/google/common/utils/get_secret.py +31 -0
  199. airflow/providers/google/common/utils/id_token_credentials.py +3 -4
  200. airflow/providers/google/firebase/operators/firestore.py +2 -2
  201. airflow/providers/google/get_provider_info.py +56 -52
  202. airflow/providers/google/go_module_utils.py +35 -3
  203. airflow/providers/google/leveldb/hooks/leveldb.py +26 -1
  204. airflow/providers/google/leveldb/operators/leveldb.py +2 -2
  205. airflow/providers/google/marketing_platform/hooks/display_video.py +3 -109
  206. airflow/providers/google/marketing_platform/links/analytics_admin.py +5 -14
  207. airflow/providers/google/marketing_platform/operators/analytics_admin.py +1 -2
  208. airflow/providers/google/marketing_platform/operators/campaign_manager.py +5 -5
  209. airflow/providers/google/marketing_platform/operators/display_video.py +28 -489
  210. airflow/providers/google/marketing_platform/operators/search_ads.py +2 -2
  211. airflow/providers/google/marketing_platform/sensors/campaign_manager.py +2 -2
  212. airflow/providers/google/marketing_platform/sensors/display_video.py +3 -63
  213. airflow/providers/google/suite/hooks/calendar.py +1 -1
  214. airflow/providers/google/suite/hooks/sheets.py +15 -1
  215. airflow/providers/google/suite/operators/sheets.py +8 -3
  216. airflow/providers/google/suite/sensors/drive.py +2 -2
  217. airflow/providers/google/suite/transfers/gcs_to_gdrive.py +2 -2
  218. airflow/providers/google/suite/transfers/gcs_to_sheets.py +1 -1
  219. airflow/providers/google/suite/transfers/local_to_drive.py +3 -3
  220. airflow/providers/google/suite/transfers/sql_to_sheets.py +5 -4
  221. airflow/providers/google/version_compat.py +15 -1
  222. {apache_airflow_providers_google-15.1.0rc1.dist-info → apache_airflow_providers_google-19.1.0rc1.dist-info}/METADATA +92 -48
  223. apache_airflow_providers_google-19.1.0rc1.dist-info/RECORD +331 -0
  224. apache_airflow_providers_google-19.1.0rc1.dist-info/licenses/NOTICE +5 -0
  225. airflow/providers/google/cloud/hooks/automl.py +0 -673
  226. airflow/providers/google/cloud/hooks/life_sciences.py +0 -159
  227. airflow/providers/google/cloud/links/automl.py +0 -193
  228. airflow/providers/google/cloud/operators/automl.py +0 -1362
  229. airflow/providers/google/cloud/operators/life_sciences.py +0 -119
  230. airflow/providers/google/cloud/operators/mlengine.py +0 -112
  231. apache_airflow_providers_google-15.1.0rc1.dist-info/RECORD +0 -321
  232. {apache_airflow_providers_google-15.1.0rc1.dist-info → apache_airflow_providers_google-19.1.0rc1.dist-info}/WHEEL +0 -0
  233. {apache_airflow_providers_google-15.1.0rc1.dist-info → apache_airflow_providers_google-19.1.0rc1.dist-info}/entry_points.txt +0 -0
  234. {airflow/providers/google → apache_airflow_providers_google-19.1.0rc1.dist-info/licenses}/LICENSE +0 -0
airflow/providers/google/cloud/hooks/dataproc.py

@@ -47,7 +47,7 @@ from google.cloud.dataproc_v1 import (
 
 from airflow.exceptions import AirflowException
 from airflow.providers.google.common.consts import CLIENT_INFO
-from airflow.providers.google.common.hooks.base_google import GoogleBaseHook
+from airflow.providers.google.common.hooks.base_google import GoogleBaseAsyncHook, GoogleBaseHook
 from airflow.version import version as airflow_version
 
 if TYPE_CHECKING:
@@ -298,7 +298,7 @@ class DataprocHook(GoogleBaseHook):
         success_code = 0
 
         with self.provide_authorized_gcloud():
-            proc = subprocess.run(cmd, capture_output=True)
+            proc = subprocess.run(cmd, check=False, capture_output=True)
 
         if proc.returncode != success_code:
             stderr_last_20_lines = "\n".join(proc.stderr.decode().strip().splitlines()[-20:])
@@ -912,12 +912,15 @@ class DataprocHook(GoogleBaseHook):
         state = None
         start = time.monotonic()
         while state not in (JobStatus.State.ERROR, JobStatus.State.DONE, JobStatus.State.CANCELLED):
+            self.log.debug("Waiting for job %s to complete", job_id)
             if timeout and start + timeout < time.monotonic():
                 raise AirflowException(f"Timeout: dataproc job {job_id} is not ready after {timeout}s")
+            self.log.debug("Sleeping for %s seconds", wait_time)
             time.sleep(wait_time)
             try:
                 job = self.get_job(project_id=project_id, region=region, job_id=job_id)
                 state = job.status.state
+                self.log.debug("Job %s is in state %s", job_id, state)
             except ServerError as err:
                 self.log.info("Retrying. Dataproc API returned server error when waiting for job: %s", err)
 
@@ -1269,7 +1272,7 @@ class DataprocHook(GoogleBaseHook):
         return all([word in error_msg for word in key_words])
 
 
-class DataprocAsyncHook(GoogleBaseHook):
+class DataprocAsyncHook(GoogleBaseAsyncHook):
     """
     Asynchronous interaction with Google Cloud Dataproc APIs.
 
@@ -1277,6 +1280,8 @@ class DataprocAsyncHook(GoogleBaseHook):
     keyword arguments rather than positional.
     """
 
+    sync_hook_class = DataprocHook
+
    def __init__(
         self,
         gcp_conn_id: str = "google_cloud_default",
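
For context: the GoogleBaseAsyncHook base class adopted here delegates credential handling to the synchronous hook named in sync_hook_class, which it builds lazily via the get_sync_hook() coroutine. A minimal sketch of the pattern; the subclass and method names below are illustrative, not part of this diff:

from airflow.providers.google.cloud.hooks.dataproc import DataprocHook
from airflow.providers.google.common.hooks.base_google import GoogleBaseAsyncHook


class ExampleDataprocAsyncHook(GoogleBaseAsyncHook):
    # The synchronous counterpart; GoogleBaseAsyncHook instantiates it lazily,
    # so blocking credential lookups do not stall the event loop.
    sync_hook_class = DataprocHook

    async def example_credentials(self):
        sync_hook = await self.get_sync_hook()  # builds the DataprocHook on first use
        return sync_hook.get_credentials()
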
@@ -1286,53 +1291,90 @@ class DataprocAsyncHook(GoogleBaseHook):
         super().__init__(gcp_conn_id=gcp_conn_id, impersonation_chain=impersonation_chain, **kwargs)
         self._cached_client: JobControllerAsyncClient | None = None
 
-    def get_cluster_client(self, region: str | None = None) -> ClusterControllerAsyncClient:
+    async def get_cluster_client(self, region: str | None = None) -> ClusterControllerAsyncClient:
         """Create a ClusterControllerAsyncClient."""
         client_options = None
         if region and region != "global":
             client_options = ClientOptions(api_endpoint=f"{region}-dataproc.googleapis.com:443")
 
+        sync_hook = await self.get_sync_hook()
         return ClusterControllerAsyncClient(
-            credentials=self.get_credentials(), client_info=CLIENT_INFO, client_options=client_options
+            credentials=sync_hook.get_credentials(), client_info=CLIENT_INFO, client_options=client_options
         )
 
-    def get_template_client(self, region: str | None = None) -> WorkflowTemplateServiceAsyncClient:
+    async def get_template_client(self, region: str | None = None) -> WorkflowTemplateServiceAsyncClient:
         """Create a WorkflowTemplateServiceAsyncClient."""
         client_options = None
         if region and region != "global":
             client_options = ClientOptions(api_endpoint=f"{region}-dataproc.googleapis.com:443")
 
+        sync_hook = await self.get_sync_hook()
         return WorkflowTemplateServiceAsyncClient(
-            credentials=self.get_credentials(), client_info=CLIENT_INFO, client_options=client_options
+            credentials=sync_hook.get_credentials(), client_info=CLIENT_INFO, client_options=client_options
         )
 
-    def get_job_client(self, region: str | None = None) -> JobControllerAsyncClient:
+    async def get_job_client(self, region: str | None = None) -> JobControllerAsyncClient:
         """Create a JobControllerAsyncClient."""
         if self._cached_client is None:
             client_options = None
             if region and region != "global":
                 client_options = ClientOptions(api_endpoint=f"{region}-dataproc.googleapis.com:443")
 
+            sync_hook = await self.get_sync_hook()
             self._cached_client = JobControllerAsyncClient(
-                credentials=self.get_credentials(),
+                credentials=sync_hook.get_credentials(),
                 client_info=CLIENT_INFO,
                 client_options=client_options,
             )
         return self._cached_client
 
-    def get_batch_client(self, region: str | None = None) -> BatchControllerAsyncClient:
+    async def get_batch_client(self, region: str | None = None) -> BatchControllerAsyncClient:
         """Create a BatchControllerAsyncClient."""
         client_options = None
         if region and region != "global":
             client_options = ClientOptions(api_endpoint=f"{region}-dataproc.googleapis.com:443")
 
+        sync_hook = await self.get_sync_hook()
         return BatchControllerAsyncClient(
-            credentials=self.get_credentials(), client_info=CLIENT_INFO, client_options=client_options
+            credentials=sync_hook.get_credentials(), client_info=CLIENT_INFO, client_options=client_options
         )
 
-    def get_operations_client(self, region: str) -> OperationsClient:
+    async def get_operations_client(self, region: str) -> OperationsClient:
         """Create a OperationsClient."""
-        return self.get_template_client(region=region).transport.operations_client
+        template_client = await self.get_template_client(region=region)
+        return template_client.transport.operations_client
+
+    @GoogleBaseHook.fallback_to_default_project_id
+    async def get_cluster(
+        self,
+        region: str,
+        cluster_name: str,
+        project_id: str,
+        retry: AsyncRetry | _MethodDefault = DEFAULT,
+        timeout: float | None = None,
+        metadata: Sequence[tuple[str, str]] = (),
+    ) -> Cluster:
+        """
+        Get a cluster.
+
+        :param region: Cloud Dataproc region in which to handle the request.
+        :param cluster_name: Name of the cluster to get.
+        :param project_id: Google Cloud project ID that the cluster belongs to.
+        :param retry: A retry object used to retry requests. If *None*, requests
+            will not be retried.
+        :param timeout: The amount of time, in seconds, to wait for the request
+            to complete. If *retry* is specified, the timeout applies to each
+            individual attempt.
+        :param metadata: Additional metadata that is provided to the method.
+        """
+        client = await self.get_cluster_client(region=region)
+        result = await client.get_cluster(
+            request={"project_id": project_id, "region": region, "cluster_name": cluster_name},
+            retry=retry,
+            timeout=timeout,
+            metadata=metadata,
+        )
+        return result
 
     @GoogleBaseHook.fallback_to_default_project_id
     async def create_cluster(
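
Because every client getter is now a coroutine, callers such as deferrable triggers must await the getter, or a high-level method that wraps it. A minimal usage sketch, assuming a project, region, and cluster that already exist; the identifiers are placeholders:

from airflow.providers.google.cloud.hooks.dataproc import DataprocAsyncHook


async def fetch_cluster_state() -> str:
    hook = DataprocAsyncHook(gcp_conn_id="google_cloud_default")
    # get_cluster awaits get_cluster_client() internally, so one await suffices here.
    cluster = await hook.get_cluster(
        project_id="my-project",
        region="us-central1",
        cluster_name="my-cluster",
    )
    return cluster.status.state.name


# Run with: asyncio.run(fetch_cluster_state())
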
@@ -1390,7 +1432,7 @@ class DataprocAsyncHook(GoogleBaseHook):
             cluster["config"] = cluster_config  # type: ignore
             cluster["labels"] = labels  # type: ignore
 
-        client = self.get_cluster_client(region=region)
+        client = await self.get_cluster_client(region=region)
         result = await client.create_cluster(
             request={
                 "project_id": project_id,
@@ -1435,7 +1477,7 @@ class DataprocAsyncHook(GoogleBaseHook):
             individual attempt.
         :param metadata: Additional metadata that is provided to the method.
         """
-        client = self.get_cluster_client(region=region)
+        client = await self.get_cluster_client(region=region)
         result = await client.delete_cluster(
             request={
                 "project_id": project_id,
@@ -1483,7 +1525,7 @@ class DataprocAsyncHook(GoogleBaseHook):
             individual attempt.
         :param metadata: Additional metadata that is provided to the method.
         """
-        client = self.get_cluster_client(region=region)
+        client = await self.get_cluster_client(region=region)
         result = await client.diagnose_cluster(
             request={
                 "project_id": project_id,
@@ -1500,38 +1542,6 @@ class DataprocAsyncHook(GoogleBaseHook):
         )
         return result
 
-    @GoogleBaseHook.fallback_to_default_project_id
-    async def get_cluster(
-        self,
-        region: str,
-        cluster_name: str,
-        project_id: str,
-        retry: AsyncRetry | _MethodDefault = DEFAULT,
-        timeout: float | None = None,
-        metadata: Sequence[tuple[str, str]] = (),
-    ) -> Cluster:
-        """
-        Get the resource representation for a cluster in a project.
-
-        :param project_id: Google Cloud project ID that the cluster belongs to.
-        :param region: Cloud Dataproc region to handle the request.
-        :param cluster_name: The cluster name.
-        :param retry: A retry object used to retry requests. If *None*, requests
-            will not be retried.
-        :param timeout: The amount of time, in seconds, to wait for the request
-            to complete. If *retry* is specified, the timeout applies to each
-            individual attempt.
-        :param metadata: Additional metadata that is provided to the method.
-        """
-        client = self.get_cluster_client(region=region)
-        result = await client.get_cluster(
-            request={"project_id": project_id, "region": region, "cluster_name": cluster_name},
-            retry=retry,
-            timeout=timeout,
-            metadata=metadata,
-        )
-        return result
-
     @GoogleBaseHook.fallback_to_default_project_id
     async def list_clusters(
         self,
@@ -1561,7 +1571,7 @@ class DataprocAsyncHook(GoogleBaseHook):
             individual attempt.
         :param metadata: Additional metadata that is provided to the method.
         """
-        client = self.get_cluster_client(region=region)
+        client = await self.get_cluster_client(region=region)
         result = await client.list_clusters(
             request={"project_id": project_id, "region": region, "filter": filter_, "page_size": page_size},
             retry=retry,
@@ -1638,7 +1648,7 @@ class DataprocAsyncHook(GoogleBaseHook):
         """
         if region is None:
             raise TypeError("missing 1 required keyword argument: 'region'")
-        client = self.get_cluster_client(region=region)
+        client = await self.get_cluster_client(region=region)
         operation = await client.update_cluster(
             request={
                 "project_id": project_id,
@@ -1680,10 +1690,8 @@ class DataprocAsyncHook(GoogleBaseHook):
             individual attempt.
         :param metadata: Additional metadata that is provided to the method.
         """
-        if region is None:
-            raise TypeError("missing 1 required keyword argument: 'region'")
         metadata = metadata or ()
-        client = self.get_template_client(region)
+        client = await self.get_template_client(region)
         parent = f"projects/{project_id}/regions/{region}"
         return await client.create_workflow_template(
             request={"parent": parent, "template": template}, retry=retry, timeout=timeout, metadata=metadata
@@ -1725,10 +1733,8 @@ class DataprocAsyncHook(GoogleBaseHook):
             individual attempt.
         :param metadata: Additional metadata that is provided to the method.
         """
-        if region is None:
-            raise TypeError("missing 1 required keyword argument: 'region'")
         metadata = metadata or ()
-        client = self.get_template_client(region)
+        client = await self.get_template_client(region)
         name = f"projects/{project_id}/regions/{region}/workflowTemplates/{template_name}"
         operation = await client.instantiate_workflow_template(
             request={"name": name, "version": version, "request_id": request_id, "parameters": parameters},
@@ -1767,10 +1773,8 @@ class DataprocAsyncHook(GoogleBaseHook):
             individual attempt.
         :param metadata: Additional metadata that is provided to the method.
         """
-        if region is None:
-            raise TypeError("missing 1 required keyword argument: 'region'")
         metadata = metadata or ()
-        client = self.get_template_client(region)
+        client = await self.get_template_client(region)
         parent = f"projects/{project_id}/regions/{region}"
         operation = await client.instantiate_inline_workflow_template(
             request={"parent": parent, "template": template, "request_id": request_id},
@@ -1781,7 +1785,8 @@ class DataprocAsyncHook(GoogleBaseHook):
         return operation
 
     async def get_operation(self, region, operation_name):
-        return await self.get_operations_client(region).get_operation(name=operation_name)
+        operations_client = await self.get_operations_client(region)
+        return await operations_client.get_operation(name=operation_name)
 
     @GoogleBaseHook.fallback_to_default_project_id
     async def get_job(
@@ -1806,9 +1811,7 @@ class DataprocAsyncHook(GoogleBaseHook):
             individual attempt.
         :param metadata: Additional metadata that is provided to the method.
         """
-        if region is None:
-            raise TypeError("missing 1 required keyword argument: 'region'")
-        client = self.get_job_client(region=region)
+        client = await self.get_job_client(region=region)
         job = await client.get_job(
             request={"project_id": project_id, "region": region, "job_id": job_id},
             retry=retry,
@@ -1845,9 +1848,7 @@ class DataprocAsyncHook(GoogleBaseHook):
             individual attempt.
         :param metadata: Additional metadata that is provided to the method.
         """
-        if region is None:
-            raise TypeError("missing 1 required keyword argument: 'region'")
-        client = self.get_job_client(region=region)
+        client = await self.get_job_client(region=region)
         return await client.submit_job(
             request={"project_id": project_id, "region": region, "job": job, "request_id": request_id},
             retry=retry,
@@ -1878,7 +1879,7 @@ class DataprocAsyncHook(GoogleBaseHook):
             individual attempt.
         :param metadata: Additional metadata that is provided to the method.
         """
-        client = self.get_job_client(region=region)
+        client = await self.get_job_client(region=region)
 
         job = await client.cancel_job(
             request={"project_id": project_id, "region": region, "job_id": job_id},
@@ -1920,7 +1921,7 @@ class DataprocAsyncHook(GoogleBaseHook):
             individual attempt.
         :param metadata: Additional metadata that is provided to the method.
         """
-        client = self.get_batch_client(region)
+        client = await self.get_batch_client(region)
         parent = f"projects/{project_id}/regions/{region}"
 
         result = await client.create_batch(
@@ -1959,7 +1960,7 @@ class DataprocAsyncHook(GoogleBaseHook):
             individual attempt.
         :param metadata: Additional metadata that is provided to the method.
         """
-        client = self.get_batch_client(region)
+        client = await self.get_batch_client(region)
         name = f"projects/{project_id}/locations/{region}/batches/{batch_id}"
 
         await client.delete_batch(
@@ -1994,7 +1995,7 @@ class DataprocAsyncHook(GoogleBaseHook):
             individual attempt.
         :param metadata: Additional metadata that is provided to the method.
         """
-        client = self.get_batch_client(region)
+        client = await self.get_batch_client(region)
         name = f"projects/{project_id}/locations/{region}/batches/{batch_id}"
 
         result = await client.get_batch(
@@ -2039,7 +2040,7 @@ class DataprocAsyncHook(GoogleBaseHook):
         :param filter: Result filters as specified in ListBatchesRequest
         :param order_by: How to order results as specified in ListBatchesRequest
         """
-        client = self.get_batch_client(region)
+        client = await self.get_batch_client(region)
         parent = f"projects/{project_id}/regions/{region}"
 
         result = await client.list_batches(
airflow/providers/google/cloud/hooks/gcs.py

@@ -26,24 +26,26 @@ import os
 import shutil
 import time
 import warnings
-from collections.abc import Generator, Sequence
+from collections.abc import Callable, Generator, Sequence
 from contextlib import contextmanager
+from datetime import datetime
 from functools import partial
 from io import BytesIO
+from pathlib import Path
 from tempfile import NamedTemporaryFile
-from typing import IO, TYPE_CHECKING, Any, Callable, TypeVar, cast, overload
+from typing import IO, TYPE_CHECKING, Any, ParamSpec, TypeVar, cast, overload
 from urllib.parse import urlsplit
 
+# Make mypy happy by importing as aliases
+import google.cloud.storage as storage
 from gcloud.aio.storage import Storage
 from google.api_core.exceptions import GoogleAPICallError, NotFound
-
-# not sure why but mypy complains on missing `storage` but it is clearly there and is importable
-from google.cloud import storage  # type: ignore[attr-defined]
 from google.cloud.exceptions import GoogleCloudError
 from google.cloud.storage.retry import DEFAULT_RETRY
 
 from airflow.exceptions import AirflowException, AirflowProviderDeprecationWarning
 from airflow.providers.common.compat.lineage.hook import get_hook_lineage_collector
+from airflow.providers.common.compat.sdk import timezone
 from airflow.providers.google.cloud.utils.helpers import normalize_directory_path
 from airflow.providers.google.common.consts import CLIENT_INFO
 from airflow.providers.google.common.hooks.base_google import (
@@ -51,13 +53,9 @@ from airflow.providers.google.common.hooks.base_google import (
     GoogleBaseAsyncHook,
     GoogleBaseHook,
 )
-from airflow.typing_compat import ParamSpec
-from airflow.utils import timezone
 from airflow.version import version
 
 if TYPE_CHECKING:
-    from datetime import datetime
-
     from aiohttp import ClientSession
     from google.api_core.retry import Retry
     from google.cloud.storage.blob import Blob
@@ -373,8 +371,7 @@ class GCSHook(GoogleBaseHook):
                         num_max_attempts,
                     )
                     raise
-        else:
-            raise NotImplementedError  # should not reach this, but makes mypy happy
+        raise NotImplementedError  # should not reach this, but makes mypy happy
 
     def download_as_byte_array(
         self,
@@ -549,13 +546,13 @@ class GCSHook(GoogleBaseHook):
         if cache_control:
             blob.cache_control = cache_control
 
-        if filename and data:
+        if filename is not None and data is not None:
             raise ValueError(
                 "'filename' and 'data' parameter provided. Please "
                 "specify a single parameter, either 'filename' for "
                 "local file uploads or 'data' for file content uploads."
             )
-        if filename:
+        if filename is not None:
             if not mime_type:
                 mime_type = "application/octet-stream"
             if gzip:
@@ -575,7 +572,7 @@ class GCSHook(GoogleBaseHook):
             if gzip:
                 os.remove(filename)
             self.log.info("File %s uploaded to %s in %s bucket", filename, object_name, bucket_name)
-        elif data:
+        elif data is not None:
             if not mime_type:
                 mime_type = "text/plain"
             if gzip:
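
The switch from truthiness checks to explicit "is not None" comparisons matters for empty payloads: data=b"" is falsy, so the old "elif data:" branch silently skipped zero-byte uploads. A minimal sketch of the now-supported call; the bucket and object names are placeholders:

from airflow.providers.google.cloud.hooks.gcs import GCSHook

hook = GCSHook(gcp_conn_id="google_cloud_default")
hook.upload(
    bucket_name="my-bucket",
    object_name="markers/_SUCCESS",  # zero-byte marker object
    data=b"",  # empty payload: previously ignored by the truthiness check, now uploaded
)
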
@@ -1251,6 +1248,106 @@
 
         self.log.info("Completed successfully.")
 
+    def _sync_to_local_dir_delete_stale_local_files(self, current_gcs_objects: List[Path], local_dir: Path):
+        current_gcs_keys = {key.resolve() for key in current_gcs_objects}
+
+        for item in local_dir.rglob("*"):
+            if item.is_file():
+                if item.resolve() not in current_gcs_keys:
+                    self.log.debug("Deleting stale local file: %s", item)
+                    item.unlink()
+        # Clean up empty directories
+        for root, dirs, _ in os.walk(local_dir, topdown=False):
+            for d in dirs:
+                dir_path = os.path.join(root, d)
+                if not os.listdir(dir_path):
+                    self.log.debug("Deleting stale empty directory: %s", dir_path)
+                    os.rmdir(dir_path)
+
+    def _sync_to_local_dir_if_changed(self, blob: Blob, local_target_path: Path):
+        should_download = False
+        download_msg = ""
+        if not local_target_path.exists():
+            should_download = True
+            download_msg = f"Local file {local_target_path} does not exist."
+        else:
+            local_stats = local_target_path.stat()
+            # Reload blob to get fresh metadata, including size and updated time
+            blob.reload()
+
+            if blob.size != local_stats.st_size:
+                should_download = True
+                download_msg = (
+                    f"GCS object size ({blob.size}) and local file size ({local_stats.st_size}) differ."
+                )
+
+            gcs_last_modified = blob.updated
+            if (
+                not should_download
+                and gcs_last_modified
+                and local_stats.st_mtime < gcs_last_modified.timestamp()
+            ):
+                should_download = True
+                download_msg = f"GCS object last modified ({gcs_last_modified}) is newer than local file last modified ({datetime.fromtimestamp(local_stats.st_mtime, tz=timezone.utc)})."
+
+        if should_download:
+            self.log.debug("%s Downloading %s to %s", download_msg, blob.name, local_target_path.as_posix())
+            self.download(
+                bucket_name=blob.bucket.name, object_name=blob.name, filename=str(local_target_path)
+            )
+        else:
+            self.log.debug(
+                "Local file %s is up-to-date with GCS object %s. Skipping download.",
+                local_target_path.as_posix(),
+                blob.name,
+            )
+
+    def sync_to_local_dir(
+        self,
+        bucket_name: str,
+        local_dir: str | Path,
+        prefix: str | None = None,
+        delete_stale: bool = False,
+    ) -> None:
+        """
+        Download files from a GCS bucket to a local directory.
+
+        It will download all files from the given ``prefix`` and create the corresponding
+        directory structure in the ``local_dir``.
+
+        If ``delete_stale`` is ``True``, it will delete all local files that do not exist in the GCS bucket.
+
+        :param bucket_name: The name of the GCS bucket.
+        :param local_dir: The local directory to which the files will be downloaded.
+        :param prefix: The prefix of the files to be downloaded.
+        :param delete_stale: If ``True``, deletes local files that don't exist in the bucket.
+        """
+        prefix = prefix or ""
+        local_dir_path = Path(local_dir)
+        self.log.debug("Downloading data from gs://%s/%s to %s", bucket_name, prefix, local_dir_path)
+
+        gcs_bucket = self.get_bucket(bucket_name)
+        local_gcs_objects = []
+
+        for blob in gcs_bucket.list_blobs(prefix=prefix):
+            # GCS lists "directories" as objects ending with a slash. We should skip them.
+            if blob.name.endswith("/"):
+                continue
+
+            blob_path = Path(blob.name)
+            local_target_path = local_dir_path.joinpath(blob_path.relative_to(prefix))
+            if not local_target_path.parent.exists():
+                local_target_path.parent.mkdir(parents=True, exist_ok=True)
+                self.log.debug("Created local directory: %s", local_target_path.parent)
+
+            self._sync_to_local_dir_if_changed(blob=blob, local_target_path=local_target_path)
+            local_gcs_objects.append(local_target_path)
+
+        if delete_stale:
+            self._sync_to_local_dir_delete_stale_local_files(
+                current_gcs_objects=local_gcs_objects, local_dir=local_dir_path
+            )
+
     def sync(
         self,
         source_bucket: str,
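
A minimal usage sketch for the GCSHook.sync_to_local_dir method added above; the bucket, prefix, and local path are placeholders, not values taken from this diff:

from airflow.providers.google.cloud.hooks.gcs import GCSHook

hook = GCSHook(gcp_conn_id="google_cloud_default")
hook.sync_to_local_dir(
    bucket_name="my-dag-bundle-bucket",
    prefix="dags/",  # only objects under this prefix are mirrored
    local_dir="/tmp/dag-bundle",  # the directory tree is created as needed
    delete_stale=True,  # remove local files that no longer exist in the bucket
)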