apache-airflow-providers-google 10.12.0__py3-none-any.whl → 10.13.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. airflow/providers/google/__init__.py +3 -3
  2. airflow/providers/google/cloud/fs/gcs.py +16 -13
  3. airflow/providers/google/cloud/hooks/bigquery_dts.py +2 -1
  4. airflow/providers/google/cloud/hooks/cloud_build.py +2 -1
  5. airflow/providers/google/cloud/hooks/cloud_composer.py +4 -3
  6. airflow/providers/google/cloud/hooks/compute_ssh.py +18 -6
  7. airflow/providers/google/cloud/hooks/dataflow.py +61 -3
  8. airflow/providers/google/cloud/hooks/dataplex.py +2 -1
  9. airflow/providers/google/cloud/hooks/dataproc.py +19 -18
  10. airflow/providers/google/cloud/hooks/gcs.py +10 -6
  11. airflow/providers/google/cloud/hooks/pubsub.py +3 -2
  12. airflow/providers/google/cloud/log/gcs_task_handler.py +2 -39
  13. airflow/providers/google/cloud/log/stackdriver_task_handler.py +2 -11
  14. airflow/providers/google/cloud/operators/bigquery.py +47 -47
  15. airflow/providers/google/cloud/operators/cloud_composer.py +1 -1
  16. airflow/providers/google/cloud/operators/cloud_run.py +3 -3
  17. airflow/providers/google/cloud/operators/dataflow.py +6 -0
  18. airflow/providers/google/cloud/operators/dataplex.py +530 -1
  19. airflow/providers/google/cloud/operators/dataproc.py +11 -11
  20. airflow/providers/google/cloud/operators/gcs.py +90 -15
  21. airflow/providers/google/cloud/operators/kubernetes_engine.py +2 -3
  22. airflow/providers/google/cloud/operators/pubsub.py +47 -55
  23. airflow/providers/google/cloud/secrets/secret_manager.py +22 -1
  24. airflow/providers/google/cloud/sensors/cloud_composer.py +14 -1
  25. airflow/providers/google/cloud/sensors/dataplex.py +118 -0
  26. airflow/providers/google/cloud/sensors/gcs.py +10 -1
  27. airflow/providers/google/cloud/transfers/adls_to_gcs.py +5 -5
  28. airflow/providers/google/cloud/transfers/gcs_to_gcs.py +42 -42
  29. airflow/providers/google/cloud/transfers/mssql_to_gcs.py +9 -9
  30. airflow/providers/google/cloud/triggers/cloud_run.py +7 -7
  31. airflow/providers/google/cloud/triggers/dataplex.py +82 -0
  32. airflow/providers/google/cloud/triggers/dataproc.py +2 -5
  33. airflow/providers/google/cloud/triggers/gcs.py +13 -3
  34. airflow/providers/google/cloud/triggers/kubernetes_engine.py +3 -1
  35. airflow/providers/google/common/hooks/base_google.py +6 -4
  36. airflow/providers/google/get_provider_info.py +14 -13
  37. {apache_airflow_providers_google-10.12.0.dist-info → apache_airflow_providers_google-10.13.0.dist-info}/METADATA +30 -30
  38. {apache_airflow_providers_google-10.12.0.dist-info → apache_airflow_providers_google-10.13.0.dist-info}/RECORD +40 -40
  39. {apache_airflow_providers_google-10.12.0.dist-info → apache_airflow_providers_google-10.13.0.dist-info}/WHEEL +0 -0
  40. {apache_airflow_providers_google-10.12.0.dist-info → apache_airflow_providers_google-10.13.0.dist-info}/entry_points.txt +0 -0
@@ -188,11 +188,11 @@ class GCSListObjectsOperator(GoogleCloudBaseOperator):
  folder in ``data`` bucket. ::

  GCS_Files = GCSListOperator(
- task_id='GCS_Files',
- bucket='data',
- prefix='sales/sales-2017/',
- match_glob='**/*/.avro',
- gcp_conn_id=google_cloud_conn_id
+ task_id="GCS_Files",
+ bucket="data",
+ prefix="sales/sales-2017/",
+ match_glob="**/*/.avro",
+ gcp_conn_id=google_cloud_conn_id,
  )
  """

@@ -313,6 +313,7 @@ class GCSDeleteObjectsOperator(GoogleCloudBaseOperator):
  )
  raise ValueError(err_message)

+ self._objects: list[str] = []
  super().__init__(**kwargs)

  def execute(self, context: Context) -> None:
@@ -322,13 +323,47 @@ class GCSDeleteObjectsOperator(GoogleCloudBaseOperator):
  )

  if self.objects is not None:
- objects = self.objects
+ self._objects = self.objects
  else:
- objects = hook.list(bucket_name=self.bucket_name, prefix=self.prefix)
- self.log.info("Deleting %s objects from %s", len(objects), self.bucket_name)
- for object_name in objects:
+ self._objects = hook.list(bucket_name=self.bucket_name, prefix=self.prefix)
+ self.log.info("Deleting %s objects from %s", len(self._objects), self.bucket_name)
+ for object_name in self._objects:
  hook.delete(bucket_name=self.bucket_name, object_name=object_name)

+ def get_openlineage_facets_on_complete(self, task_instance):
+ """Implementing on_complete as execute() resolves object names."""
+ from openlineage.client.facet import (
+ LifecycleStateChange,
+ LifecycleStateChangeDatasetFacet,
+ LifecycleStateChangeDatasetFacetPreviousIdentifier,
+ )
+ from openlineage.client.run import Dataset
+
+ from airflow.providers.openlineage.extractors import OperatorLineage
+
+ if not self._objects:
+ return OperatorLineage()
+
+ bucket_url = f"gs://{self.bucket_name}"
+ input_datasets = [
+ Dataset(
+ namespace=bucket_url,
+ name=object_name,
+ facets={
+ "lifecycleStateChange": LifecycleStateChangeDatasetFacet(
+ lifecycleStateChange=LifecycleStateChange.DROP.value,
+ previousIdentifier=LifecycleStateChangeDatasetFacetPreviousIdentifier(
+ namespace=bucket_url,
+ name=object_name,
+ ),
+ )
+ },
+ )
+ for object_name in self._objects
+ ]
+
+ return OperatorLineage(inputs=input_datasets)
+

  class GCSBucketCreateAclEntryOperator(GoogleCloudBaseOperator):
  """
@@ -596,6 +631,22 @@ class GCSFileTransformOperator(GoogleCloudBaseOperator):
  filename=destination_file.name,
  )

+ def get_openlineage_facets_on_start(self):
+ from openlineage.client.run import Dataset
+
+ from airflow.providers.openlineage.extractors import OperatorLineage
+
+ input_dataset = Dataset(
+ namespace=f"gs://{self.source_bucket}",
+ name=self.source_object,
+ )
+ output_dataset = Dataset(
+ namespace=f"gs://{self.destination_bucket}",
+ name=self.destination_object,
+ )
+
+ return OperatorLineage(inputs=[input_dataset], outputs=[output_dataset])
+

  class GCSTimeSpanFileTransformOperator(GoogleCloudBaseOperator):
  """
@@ -722,6 +773,9 @@ class GCSTimeSpanFileTransformOperator(GoogleCloudBaseOperator):
  self.upload_continue_on_fail = upload_continue_on_fail
  self.upload_num_attempts = upload_num_attempts

+ self._source_object_names: list[str] = []
+ self._destination_object_names: list[str] = []
+
  def execute(self, context: Context) -> list[str]:
  # Define intervals and prefixes.
  try:
@@ -773,7 +827,7 @@ class GCSTimeSpanFileTransformOperator(GoogleCloudBaseOperator):
  )

  # Fetch list of files.
- blobs_to_transform = source_hook.list_by_timespan(
+ self._source_object_names = source_hook.list_by_timespan(
  bucket_name=self.source_bucket,
  prefix=source_prefix_interp,
  timespan_start=timespan_start,
@@ -785,7 +839,7 @@ class GCSTimeSpanFileTransformOperator(GoogleCloudBaseOperator):
  temp_output_dir_path = Path(temp_output_dir)

  # TODO: download in parallel.
- for blob_to_transform in blobs_to_transform:
+ for blob_to_transform in self._source_object_names:
  destination_file = temp_input_dir_path / blob_to_transform
  destination_file.parent.mkdir(parents=True, exist_ok=True)
  try:
@@ -822,8 +876,6 @@ class GCSTimeSpanFileTransformOperator(GoogleCloudBaseOperator):

  self.log.info("Transformation succeeded. Output temporarily located at %s", temp_output_dir_path)

- files_uploaded = []
-
  # TODO: upload in parallel.
  for upload_file in temp_output_dir_path.glob("**/*"):
  if upload_file.is_dir():
@@ -844,12 +896,35 @@ class GCSTimeSpanFileTransformOperator(GoogleCloudBaseOperator):
  chunk_size=self.chunk_size,
  num_max_attempts=self.upload_num_attempts,
  )
- files_uploaded.append(str(upload_file_name))
+ self._destination_object_names.append(str(upload_file_name))
  except GoogleCloudError:
  if not self.upload_continue_on_fail:
  raise

- return files_uploaded
+ return self._destination_object_names
+
+ def get_openlineage_facets_on_complete(self, task_instance):
+ """Implementing on_complete as execute() resolves object names."""
+ from openlineage.client.run import Dataset
+
+ from airflow.providers.openlineage.extractors import OperatorLineage
+
+ input_datasets = [
+ Dataset(
+ namespace=f"gs://{self.source_bucket}",
+ name=object_name,
+ )
+ for object_name in self._source_object_names
+ ]
+ output_datasets = [
+ Dataset(
+ namespace=f"gs://{self.destination_bucket}",
+ name=object_name,
+ )
+ for object_name in self._destination_object_names
+ ]
+
+ return OperatorLineage(inputs=input_datasets, outputs=output_datasets)


  class GCSDeleteBucketOperator(GoogleCloudBaseOperator):
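The time-span transform hunks above make the operator keep both the downloaded and the uploaded object names so that lineage can be emitted once execute() finishes. A hedged usage sketch; the bucket names, prefixes, and transform script path are invented for illustration:

    from airflow.providers.google.cloud.operators.gcs import GCSTimeSpanFileTransformOperator

    transform_window = GCSTimeSpanFileTransformOperator(
        task_id="transform_window",
        source_bucket="raw-data",           # illustrative
        source_prefix="events/%Y/%m/%d/",   # interpolated with the data-interval start
        source_gcp_conn_id="google_cloud_default",
        destination_bucket="curated-data",  # illustrative
        destination_prefix="events/%Y/%m/%d/",
        destination_gcp_conn_id="google_cloud_default",
        transform_script=["python", "/opt/scripts/transform.py"],  # illustrative path
    )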
@@ -192,15 +192,14 @@ class GKECreateClusterOperator(GoogleCloudBaseOperator):
  The **minimum** required to define a cluster to create is:

  ``dict()`` ::
- cluster_def = {'name': 'my-cluster-name',
- 'initial_node_count': 1}
+ cluster_def = {"name": "my-cluster-name", "initial_node_count": 1}

  or

  ``Cluster`` proto ::
  from google.cloud.container_v1.types import Cluster

- cluster_def = Cluster(name='my-cluster-name', initial_node_count=1)
+ cluster_def = Cluster(name="my-cluster-name", initial_node_count=1)

  **Operator Creation**: ::

@@ -58,24 +58,22 @@ class PubSubCreateTopicOperator(GoogleCloudBaseOperator):
  By default, if the topic already exists, this operator will
  not cause the DAG to fail. ::

- with DAG('successful DAG') as dag:
- (
- PubSubCreateTopicOperator(project_id='my-project', topic='my_new_topic')
- >> PubSubCreateTopicOperator(project_id='my-project', topic='my_new_topic')
- )
+ with DAG("successful DAG") as dag:
+ create_topic = PubSubCreateTopicOperator(project_id="my-project", topic="my_new_topic")
+ create_topic_again = PubSubCreateTopicOperator(project_id="my-project", topic="my_new_topic")
+
+ create_topic >> create_topic_again

  The operator can be configured to fail if the topic already exists. ::

- with DAG('failing DAG') as dag:
- (
- PubSubCreateTopicOperator(project_id='my-project', topic='my_new_topic')
- >> PubSubCreateTopicOperator(
- project_id='my-project',
- topic='my_new_topic',
- fail_if_exists=True,
- )
+ with DAG("failing DAG") as dag:
+ create_topic = PubSubCreateTopicOperator(project_id="my-project", topic="my_new_topic")
+ create_topic_again = PubSubCreateTopicOperator(
+ project_id="my-project", topic="my_new_topic", fail_if_exists=True
  )

+ create_topic >> create_topic_again
+
  Both ``project_id`` and ``topic`` are templated so you can use Jinja templating in their values.

  :param project_id: Optional, the Google Cloud project ID where the topic will be created.
@@ -197,43 +195,35 @@ class PubSubCreateSubscriptionOperator(GoogleCloudBaseOperator):
  By default, if the subscription already exists, this operator will
  not cause the DAG to fail. However, the topic must exist in the project. ::

- with DAG('successful DAG') as dag:
- (
- PubSubCreateSubscriptionOperator(
- project_id='my-project',
- topic='my-topic',
- subscription='my-subscription'
- )
- >> PubSubCreateSubscriptionOperator(
- project_id='my-project',
- topic='my-topic',
- subscription='my-subscription',
- )
+ with DAG("successful DAG") as dag:
+ create_subscription = PubSubCreateSubscriptionOperator(
+ project_id="my-project", topic="my-topic", subscription="my-subscription"
+ )
+ create_subscription_again = PubSubCreateSubscriptionOperator(
+ project_id="my-project", topic="my-topic", subscription="my-subscription"
  )

+ create_subscription >> create_subscription_again
+
+
  The operator can be configured to fail if the subscription already exists.
  ::

- with DAG('failing DAG') as dag:
- (
- PubSubCreateSubscriptionOperator(
- project_id='my-project',
- topic='my-topic',
- subscription='my-subscription',
- )
- >> PubSubCreateSubscriptionOperator(
- project_id='my-project',
- topic='my-topic',
- subscription='my-subscription',
- fail_if_exists=True,
- )
+ with DAG("failing DAG") as dag:
+ create_subscription = PubSubCreateSubscriptionOperator(
+ project_id="my-project", topic="my-topic", subscription="my-subscription"
  )
+ create_subscription_again = PubSubCreateSubscriptionOperator(
+ project_id="my-project", topic="my-topic", subscription="my-subscription", fail_if_exists=True
+ )
+
+ create_subscription >> create_subscription_again

  Finally, subscription is not required. If not passed, the operator will
  generated a universally unique identifier for the subscription's name. ::

- with DAG('DAG') as dag:
- PubSubCreateSubscriptionOperator(project_id='my-project', topic='my-topic')
+ with DAG("DAG") as dag:
+ PubSubCreateSubscriptionOperator(project_id="my-project", topic="my-topic")

  ``project_id``, ``topic``, ``subscription``, ``subscription_project_id`` and
  ``impersonation_chain`` are templated so you can use Jinja templating in their values.
@@ -410,14 +400,16 @@ class PubSubDeleteTopicOperator(GoogleCloudBaseOperator):
  By default, if the topic does not exist, this operator will
  not cause the DAG to fail. ::

- with DAG('successful DAG') as dag:
- PubSubDeleteTopicOperator(project_id='my-project', topic='non_existing_topic')
+ with DAG("successful DAG") as dag:
+ PubSubDeleteTopicOperator(project_id="my-project", topic="non_existing_topic")

  The operator can be configured to fail if the topic does not exist. ::

- with DAG('failing DAG') as dag:
+ with DAG("failing DAG") as dag:
  PubSubDeleteTopicOperator(
- project_id='my-project', topic='non_existing_topic', fail_if_not_exists=True,
+ project_id="my-project",
+ topic="non_existing_topic",
+ fail_if_not_exists=True,
  )

  Both ``project_id`` and ``topic`` are templated so you can use Jinja templating in their values.
@@ -506,16 +498,18 @@ class PubSubDeleteSubscriptionOperator(GoogleCloudBaseOperator):
  By default, if the subscription does not exist, this operator will
  not cause the DAG to fail. ::

- with DAG('successful DAG') as dag:
- PubSubDeleteSubscriptionOperator(project_id='my-project', subscription='non-existing')
+ with DAG("successful DAG") as dag:
+ PubSubDeleteSubscriptionOperator(project_id="my-project", subscription="non-existing")

  The operator can be configured to fail if the subscription already exists.

  ::

- with DAG('failing DAG') as dag:
+ with DAG("failing DAG") as dag:
  PubSubDeleteSubscriptionOperator(
- project_id='my-project', subscription='non-existing', fail_if_not_exists=True,
+ project_id="my-project",
+ subscription="non-existing",
+ fail_if_not_exists=True,
  )

  ``project_id``, and ``subscription`` are templated so you can use Jinja templating in their values.
@@ -605,15 +599,13 @@ class PubSubPublishMessageOperator(GoogleCloudBaseOperator):
  in a single Google Cloud project. If the topic does not exist, this
  task will fail. ::

- m1 = {'data': b'Hello, World!',
- 'attributes': {'type': 'greeting'}
- }
- m2 = {'data': b'Knock, knock'}
- m3 = {'attributes': {'foo': ''}}
+ m1 = {"data": b"Hello, World!", "attributes": {"type": "greeting"}}
+ m2 = {"data": b"Knock, knock"}
+ m3 = {"attributes": {"foo": ""}}

  t1 = PubSubPublishMessageOperator(
- project_id='my-project',
- topic='my_topic',
+ project_id="my-project",
+ topic="my_topic",
  messages=[m1, m2, m3],
  create_topic=True,
  dag=dag,
@@ -20,12 +20,16 @@ from __future__ import annotations
  import logging
  import re
  import warnings
+ from typing import Sequence

  from google.auth.exceptions import DefaultCredentialsError

  from airflow.exceptions import AirflowException, AirflowProviderDeprecationWarning
  from airflow.providers.google.cloud._internal_client.secret_manager_client import _SecretManagerClient
- from airflow.providers.google.cloud.utils.credentials_provider import get_credentials_and_project_id
+ from airflow.providers.google.cloud.utils.credentials_provider import (
+ _get_target_principal_and_delegates,
+ get_credentials_and_project_id,
+ )
  from airflow.secrets import BaseSecretsBackend
  from airflow.utils.log.logging_mixin import LoggingMixin

@@ -76,6 +80,14 @@ class CloudSecretManagerBackend(BaseSecretsBackend, LoggingMixin):
  :param project_id: Project ID to read the secrets from. If not passed, the project ID from credentials
  will be used.
  :param sep: Separator used to concatenate connections_prefix and conn_id. Default: "-"
+ :param impersonation_chain: Optional service account to impersonate using
+ short-term credentials, or chained list of accounts required to get the
+ access token of the last account in the list, which will be impersonated
+ in the request. If set as a string, the account must grant the
+ originating account the Service Account Token Creator IAM role. If set
+ as a sequence, the identities from the list must grant Service Account
+ Token Creator IAM role to the directly preceding identity, with first
+ account from the list granting this role to the originating account.
  """

  def __init__(
@@ -89,6 +101,7 @@ class CloudSecretManagerBackend(BaseSecretsBackend, LoggingMixin):
  gcp_scopes: str | None = None,
  project_id: str | None = None,
  sep: str = "-",
+ impersonation_chain: str | Sequence[str] | None = None,
  **kwargs,
  ) -> None:
  super().__init__(**kwargs)
@@ -103,11 +116,19 @@ class CloudSecretManagerBackend(BaseSecretsBackend, LoggingMixin):
  f"follows that pattern {SECRET_ID_PATTERN}"
  )
  try:
+ if impersonation_chain:
+ target_principal, delegates = _get_target_principal_and_delegates(impersonation_chain)
+ else:
+ target_principal = None
+ delegates = None
+
  self.credentials, self.project_id = get_credentials_and_project_id(
  keyfile_dict=gcp_keyfile_dict,
  key_path=gcp_key_path,
  credential_config_file=gcp_credential_config_file,
  scopes=gcp_scopes,
+ target_principal=target_principal,
+ delegates=delegates,
  )
  except (DefaultCredentialsError, FileNotFoundError):
  log.exception(
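The secrets-backend hunks above thread an optional impersonation_chain through to the credential lookup. A hedged sketch of the new parameter; the prefixes, project, and service-account address are placeholders, and in a deployment the same keyword arguments would normally be passed via the [secrets] backend_kwargs setting rather than by instantiating the class directly:

    from airflow.providers.google.cloud.secrets.secret_manager import CloudSecretManagerBackend

    backend = CloudSecretManagerBackend(
        connections_prefix="airflow-connections",
        variables_prefix="airflow-variables",
        project_id="my-project",
        # New in 10.13.0: impersonate a service account (or a chain of accounts)
        # with short-term credentials when reading secrets.
        impersonation_chain="secrets-reader@my-project.iam.gserviceaccount.com",
    )
    conn_value = backend.get_conn_value("google_cloud_default")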
@@ -19,9 +19,10 @@

  from __future__ import annotations

+ import warnings
  from typing import TYPE_CHECKING, Any, Sequence

- from airflow.exceptions import AirflowException, AirflowSkipException
+ from airflow.exceptions import AirflowException, AirflowProviderDeprecationWarning, AirflowSkipException
  from airflow.providers.google.cloud.triggers.cloud_composer import CloudComposerExecutionTrigger
  from airflow.sensors.base import BaseSensorOperator

@@ -33,6 +34,11 @@ class CloudComposerEnvironmentSensor(BaseSensorOperator):
  """
  Check the status of the Cloud Composer Environment task.

+ This Sensor is deprecated. You can achieve the same functionality by using Cloud Composer Operators
+ CloudComposerCreateEnvironmentOperator, CloudComposerDeleteEnvironmentOperator and
+ CloudComposerUpdateEnvironmentOperator in deferrable or non-deferrable mode, since every operator
+ gives user a possibility to wait (asynchronously or synchronously) until Operation will be finished.
+
  :param project_id: Required. The ID of the Google Cloud project that the service belongs to.
  :param region: Required. The ID of the Google Cloud region that the service belongs to.
  :param operation_name: The name of the operation resource
@@ -59,6 +65,13 @@ class CloudComposerEnvironmentSensor(BaseSensorOperator):
  pooling_period_seconds: int = 30,
  **kwargs,
  ):
+ warnings.warn(
+ f"The `{self.__class__.__name__}` operator is deprecated. You can achieve the same functionality "
+ f"by using operators in deferrable or non-deferrable mode, since every operator for Cloud "
+ f"Composer will wait for the operation to complete.",
+ AirflowProviderDeprecationWarning,
+ stacklevel=2,
+ )
  super().__init__(**kwargs)
  self.project_id = project_id
  self.region = region
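Since the sensor above is now deprecated, the suggested replacement is to let the Cloud Composer operators wait for their own long-running operation. A hedged sketch; the project, region, environment id, and environment config are placeholders, and deferrable mode assumes a running triggerer:

    from airflow.providers.google.cloud.operators.cloud_composer import (
        CloudComposerCreateEnvironmentOperator,
    )

    create_env = CloudComposerCreateEnvironmentOperator(
        task_id="create_env",
        project_id="my-project",
        region="us-central1",
        environment_id="my-environment",
        environment={"config": {"software_config": {"image_version": "composer-2-airflow-2"}}},
        # The operator waits for the operation itself; with deferrable=True the wait
        # is handed off to the triggerer instead of blocking a worker slot.
        deferrable=True,
    )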
@@ -259,3 +259,121 @@ class DataplexDataQualityJobStatusSensor(BaseSensorOperator):
  raise AirflowSkipException(message)
  raise AirflowDataQualityScanException(message)
  return job_status == DataScanJob.State.SUCCEEDED
+
+
+ class DataplexDataProfileJobStatusSensor(BaseSensorOperator):
+ """
+ Check the status of the Dataplex DataProfile job.
+
+ :param project_id: Required. The ID of the Google Cloud project that the task belongs to.
+ :param region: Required. The ID of the Google Cloud region that the task belongs to.
+ :param data_scan_id: Required. Data Quality scan identifier.
+ :param job_id: Required. Job ID.
+ :param api_version: The version of the api that will be requested for example 'v3'.
+ :param retry: A retry object used to retry requests. If `None` is specified, requests
+ will not be retried.
+ :param metadata: Additional metadata that is provided to the method.
+ :param gcp_conn_id: The connection ID to use when fetching connection info.
+ :param impersonation_chain: Optional service account to impersonate using short-term
+ credentials, or chained list of accounts required to get the access_token
+ of the last account in the list, which will be impersonated in the request.
+ If set as a string, the account must grant the originating account
+ the Service Account Token Creator IAM role.
+ If set as a sequence, the identities from the list must grant
+ Service Account Token Creator IAM role to the directly preceding identity, with first
+ account from the list granting this role to the originating account (templated).
+ :param result_timeout: Value in seconds for which operator will wait for the Data Quality scan result.
+ Throws exception if there is no result found after specified amount of seconds.
+
+ :return: Boolean indicating if the job run has reached the ``DataScanJob.State.SUCCEEDED``.
+ """
+
+ template_fields = ["job_id"]
+
+ def __init__(
+ self,
+ project_id: str,
+ region: str,
+ data_scan_id: str,
+ job_id: str,
+ api_version: str = "v1",
+ retry: Retry | _MethodDefault = DEFAULT,
+ metadata: Sequence[tuple[str, str]] = (),
+ gcp_conn_id: str = "google_cloud_default",
+ impersonation_chain: str | Sequence[str] | None = None,
+ result_timeout: float = 60.0 * 10,
+ start_sensor_time: float | None = None,
+ *args,
+ **kwargs,
+ ) -> None:
+ super().__init__(*args, **kwargs)
+ self.project_id = project_id
+ self.region = region
+ self.data_scan_id = data_scan_id
+ self.job_id = job_id
+ self.api_version = api_version
+ self.retry = retry
+ self.metadata = metadata
+ self.gcp_conn_id = gcp_conn_id
+ self.impersonation_chain = impersonation_chain
+ self.result_timeout = result_timeout
+ self.start_sensor_time = start_sensor_time
+
+ def _duration(self):
+ if not self.start_sensor_time:
+ self.start_sensor_time = time.monotonic()
+ return time.monotonic() - self.start_sensor_time
+
+ def poke(self, context: Context) -> bool:
+ self.log.info("Waiting for job %s to be %s", self.job_id, DataScanJob.State.SUCCEEDED)
+ if self.result_timeout:
+ duration = self._duration()
+ if duration > self.result_timeout:
+ # TODO: remove this if check when min_airflow_version is set to higher than 2.7.1
+ message = (
+ f"Timeout: Data Profile scan {self.job_id} is not ready after {self.result_timeout}s"
+ )
+ if self.soft_fail:
+ raise AirflowSkipException(message)
+ raise AirflowDataQualityScanResultTimeoutException(message)
+
+ hook = DataplexHook(
+ gcp_conn_id=self.gcp_conn_id,
+ api_version=self.api_version,
+ impersonation_chain=self.impersonation_chain,
+ )
+
+ try:
+ job = hook.get_data_scan_job(
+ project_id=self.project_id,
+ region=self.region,
+ data_scan_id=self.data_scan_id,
+ job_id=self.job_id,
+ timeout=self.timeout,
+ retry=self.retry,
+ metadata=self.metadata,
+ )
+ except GoogleAPICallError as e:
+ # TODO: remove this if check when min_airflow_version is set to higher than 2.7.1
+ message = f"Error occurred when trying to retrieve Data Profile scan job: {self.data_scan_id}"
+ if self.soft_fail:
+ raise AirflowSkipException(message, e)
+ raise AirflowException(message, e)
+
+ job_status = job.state
+ self.log.info(
+ "Current status of the Dataplex Data Profile scan job %s => %s", self.job_id, job_status
+ )
+ if job_status == DataScanJob.State.FAILED:
+ # TODO: remove this if check when min_airflow_version is set to higher than 2.7.1
+ message = f"Data Profile scan job failed: {self.job_id}"
+ if self.soft_fail:
+ raise AirflowSkipException(message)
+ raise AirflowException(message)
+ if job_status == DataScanJob.State.CANCELLED:
+ # TODO: remove this if check when min_airflow_version is set to higher than 2.7.1
+ message = f"Data Profile scan job cancelled: {self.job_id}"
+ if self.soft_fail:
+ raise AirflowSkipException(message)
+ raise AirflowException(message)
+ return job_status == DataScanJob.State.SUCCEEDED
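A hedged usage sketch for the new DataplexDataProfileJobStatusSensor; the project, region, scan id, and the upstream task id "run_profile_scan" are placeholders:

    from airflow.providers.google.cloud.sensors.dataplex import DataplexDataProfileJobStatusSensor

    wait_for_profile_scan = DataplexDataProfileJobStatusSensor(
        task_id="wait_for_profile_scan",
        project_id="my-project",
        region="us-central1",
        data_scan_id="my-profile-scan",
        # job_id is a template field, so it can be pulled from whichever task started the scan.
        job_id="{{ task_instance.xcom_pull('run_profile_scan') }}",
        poke_interval=60,
        result_timeout=600,
    )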
@@ -50,6 +50,7 @@ class GCSObjectExistenceSensor(BaseSensorOperator):
  :param bucket: The Google Cloud Storage bucket where the object is.
  :param object: The name of the object to check in the Google cloud
  storage bucket.
+ :param use_glob: When set to True the object parameter is interpreted as glob
  :param google_cloud_conn_id: The connection ID to use when
  connecting to Google Cloud Storage.
  :param impersonation_chain: Optional service account to impersonate using short-term
@@ -75,6 +76,7 @@ class GCSObjectExistenceSensor(BaseSensorOperator):
  *,
  bucket: str,
  object: str,
+ use_glob: bool = False,
  google_cloud_conn_id: str = "google_cloud_default",
  impersonation_chain: str | Sequence[str] | None = None,
  retry: Retry = DEFAULT_RETRY,
@@ -84,7 +86,9 @@ class GCSObjectExistenceSensor(BaseSensorOperator):
  super().__init__(**kwargs)
  self.bucket = bucket
  self.object = object
+ self.use_glob = use_glob
  self.google_cloud_conn_id = google_cloud_conn_id
+ self._matches: list[str] = []
  self.impersonation_chain = impersonation_chain
  self.retry = retry

@@ -96,7 +100,11 @@ class GCSObjectExistenceSensor(BaseSensorOperator):
  gcp_conn_id=self.google_cloud_conn_id,
  impersonation_chain=self.impersonation_chain,
  )
- return hook.exists(self.bucket, self.object, self.retry)
+ if self.use_glob:
+ self._matches = hook.list(self.bucket, match_glob=self.object)
+ return bool(self._matches)
+ else:
+ return hook.exists(self.bucket, self.object, self.retry)

  def execute(self, context: Context) -> None:
  """Airflow runs this method on the worker and defers using the trigger."""
@@ -109,6 +117,7 @@ class GCSObjectExistenceSensor(BaseSensorOperator):
  trigger=GCSBlobTrigger(
  bucket=self.bucket,
  object_name=self.object,
+ use_glob=self.use_glob,
  poke_interval=self.poke_interval,
  google_cloud_conn_id=self.google_cloud_conn_id,
  hook_params={
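A hedged usage sketch for the new use_glob flag on GCSObjectExistenceSensor; the bucket and pattern are placeholders:

    from airflow.providers.google.cloud.sensors.gcs import GCSObjectExistenceSensor

    wait_for_avro = GCSObjectExistenceSensor(
        task_id="wait_for_avro",
        bucket="data",
        # With use_glob=True the object parameter is treated as a match_glob
        # pattern rather than an exact object name.
        object="sales/sales-2017/**/*.avro",
        use_glob=True,
    )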
@@ -58,12 +58,12 @@ class ADLSToGCSOperator(ADLSListOperator):
  resulting gcs path will be ``gs://mybucket/hello/world.avro`` ::

  copy_single_file = AdlsToGoogleCloudStorageOperator(
- task_id='copy_single_file',
- src_adls='hello/world.avro',
- dest_gcs='gs://mybucket',
+ task_id="copy_single_file",
+ src_adls="hello/world.avro",
+ dest_gcs="gs://mybucket",
  replace=False,
- azure_data_lake_conn_id='azure_data_lake_default',
- gcp_conn_id='google_cloud_default'
+ azure_data_lake_conn_id="azure_data_lake_default",
+ gcp_conn_id="google_cloud_default",
  )

  The following Operator would copy all parquet files from ADLS