apache-airflow-providers-google 15.1.0rc1__py3-none-any.whl → 16.0.0rc1__py3-none-any.whl

This diff compares the contents of two publicly released package versions as they appear in their respective public registries, and is provided for informational purposes only.
Files changed (36)
  1. airflow/providers/google/__init__.py +3 -3
  2. airflow/providers/google/ads/hooks/ads.py +34 -0
  3. airflow/providers/google/cloud/hooks/bigquery.py +63 -76
  4. airflow/providers/google/cloud/hooks/dataflow.py +67 -5
  5. airflow/providers/google/cloud/hooks/gcs.py +3 -3
  6. airflow/providers/google/cloud/hooks/looker.py +5 -0
  7. airflow/providers/google/cloud/hooks/vertex_ai/auto_ml.py +0 -36
  8. airflow/providers/google/cloud/hooks/vertex_ai/generative_model.py +1 -66
  9. airflow/providers/google/cloud/hooks/vertex_ai/ray.py +223 -0
  10. airflow/providers/google/cloud/links/cloud_run.py +59 -0
  11. airflow/providers/google/cloud/links/vertex_ai.py +49 -0
  12. airflow/providers/google/cloud/log/gcs_task_handler.py +7 -5
  13. airflow/providers/google/cloud/operators/bigquery.py +49 -10
  14. airflow/providers/google/cloud/operators/cloud_run.py +20 -2
  15. airflow/providers/google/cloud/operators/gcs.py +1 -0
  16. airflow/providers/google/cloud/operators/kubernetes_engine.py +4 -86
  17. airflow/providers/google/cloud/operators/pubsub.py +2 -1
  18. airflow/providers/google/cloud/operators/vertex_ai/generative_model.py +0 -92
  19. airflow/providers/google/cloud/operators/vertex_ai/pipeline_job.py +4 -0
  20. airflow/providers/google/cloud/operators/vertex_ai/ray.py +388 -0
  21. airflow/providers/google/cloud/transfers/bigquery_to_bigquery.py +9 -5
  22. airflow/providers/google/cloud/transfers/facebook_ads_to_gcs.py +1 -1
  23. airflow/providers/google/cloud/transfers/gcs_to_bigquery.py +2 -0
  24. airflow/providers/google/cloud/transfers/http_to_gcs.py +193 -0
  25. airflow/providers/google/cloud/transfers/s3_to_gcs.py +11 -5
  26. airflow/providers/google/cloud/triggers/bigquery.py +32 -5
  27. airflow/providers/google/cloud/triggers/dataflow.py +122 -0
  28. airflow/providers/google/cloud/triggers/dataproc.py +62 -10
  29. airflow/providers/google/get_provider_info.py +18 -5
  30. airflow/providers/google/leveldb/hooks/leveldb.py +25 -0
  31. airflow/providers/google/version_compat.py +0 -1
  32. {apache_airflow_providers_google-15.1.0rc1.dist-info → apache_airflow_providers_google-16.0.0rc1.dist-info}/METADATA +97 -90
  33. {apache_airflow_providers_google-15.1.0rc1.dist-info → apache_airflow_providers_google-16.0.0rc1.dist-info}/RECORD +35 -32
  34. airflow/providers/google/cloud/links/automl.py +0 -193
  35. {apache_airflow_providers_google-15.1.0rc1.dist-info → apache_airflow_providers_google-16.0.0rc1.dist-info}/WHEEL +0 -0
  36. {apache_airflow_providers_google-15.1.0rc1.dist-info → apache_airflow_providers_google-16.0.0rc1.dist-info}/entry_points.txt +0 -0
airflow/providers/google/cloud/transfers/s3_to_gcs.py
@@ -181,21 +181,27 @@ class S3ToGCSOperator(S3ListOperator):
                 'The destination Google Cloud Storage path must end with a slash "/" or be empty.'
             )
 
-    def execute(self, context: Context):
-        self._check_inputs()
+    def _get_files(self, context: Context, gcs_hook: GCSHook) -> list[str]:
         # use the super method to list all the files in an S3 bucket/key
         s3_objects = super().execute(context)
 
+        if not self.replace:
+            s3_objects = self.exclude_existing_objects(s3_objects=s3_objects, gcs_hook=gcs_hook)
+
+        return s3_objects
+
+    def execute(self, context: Context):
+        self._check_inputs()
         gcs_hook = GCSHook(
             gcp_conn_id=self.gcp_conn_id,
             impersonation_chain=self.google_impersonation_chain,
         )
-        if not self.replace:
-            s3_objects = self.exclude_existing_objects(s3_objects=s3_objects, gcs_hook=gcs_hook)
-
         s3_hook = S3Hook(aws_conn_id=self.aws_conn_id, verify=self.verify)
+
+        s3_objects = self._get_files(context, gcs_hook)
         if not s3_objects:
             self.log.info("In sync, no files needed to be uploaded to Google Cloud Storage")
+
         elif self.deferrable:
             self.transfer_files_async(s3_objects, gcs_hook, s3_hook)
         else:
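
The hunk above extracts the S3 listing and the existing-object filtering into a `_get_files` helper, so the set of objects to upload can be customized without re-implementing `execute`. A minimal sketch of that extension point, using a hypothetical subclass that is not part of the provider:

from airflow.providers.google.cloud.hooks.gcs import GCSHook
from airflow.providers.google.cloud.transfers.s3_to_gcs import S3ToGCSOperator


class CsvOnlyS3ToGCSOperator(S3ToGCSOperator):
    """Hypothetical subclass for illustration: only upload .csv keys."""

    def _get_files(self, context, gcs_hook: GCSHook) -> list[str]:
        # Reuse the new helper (listing plus optional exclusion of existing objects),
        # then narrow the result before execute() hands it to the transfer logic.
        s3_objects = super()._get_files(context, gcs_hook)
        return [key for key in s3_objects if key.endswith(".csv")]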
airflow/providers/google/cloud/triggers/bigquery.py
@@ -22,10 +22,12 @@ from typing import TYPE_CHECKING, Any, SupportsAbs
 
 from aiohttp import ClientSession
 from aiohttp.client_exceptions import ClientResponseError
+from asgiref.sync import sync_to_async
 
 from airflow.exceptions import AirflowException
 from airflow.models.taskinstance import TaskInstance
 from airflow.providers.google.cloud.hooks.bigquery import BigQueryAsyncHook, BigQueryTableAsyncHook
+from airflow.providers.google.version_compat import AIRFLOW_V_3_0_PLUS
 from airflow.triggers.base import BaseTrigger, TriggerEvent
 from airflow.utils.session import provide_session
 from airflow.utils.state import TaskInstanceState
@@ -116,16 +118,41 @@ class BigQueryInsertJobTrigger(BaseTrigger):
         )
         return task_instance
 
-    def safe_to_cancel(self) -> bool:
+    async def get_task_state(self):
+        from airflow.sdk.execution_time.task_runner import RuntimeTaskInstance
+
+        task_states_response = await sync_to_async(RuntimeTaskInstance.get_task_states)(
+            dag_id=self.task_instance.dag_id,
+            task_ids=[self.task_instance.task_id],
+            run_ids=[self.task_instance.run_id],
+            map_index=self.task_instance.map_index,
+        )
+        try:
+            task_state = task_states_response[self.task_instance.run_id][self.task_instance.task_id]
+        except Exception:
+            raise AirflowException(
+                "TaskInstance with dag_id: %s, task_id: %s, run_id: %s and map_index: %s is not found",
+                self.task_instance.dag_id,
+                self.task_instance.task_id,
+                self.task_instance.run_id,
+                self.task_instance.map_index,
+            )
+        return task_state
+
+    async def safe_to_cancel(self) -> bool:
         """
         Whether it is safe to cancel the external job which is being executed by this trigger.
 
         This is to avoid the case that `asyncio.CancelledError` is called because the trigger itself is stopped.
         Because in those cases, we should NOT cancel the external job.
         """
-        # Database query is needed to get the latest state of the task instance.
-        task_instance = self.get_task_instance()  # type: ignore[call-arg]
-        return task_instance.state != TaskInstanceState.DEFERRED
+        if AIRFLOW_V_3_0_PLUS:
+            task_state = await self.get_task_state()
+        else:
+            # Database query is needed to get the latest state of the task instance.
+            task_instance = self.get_task_instance()  # type: ignore[call-arg]
+            task_state = task_instance.state
+        return task_state != TaskInstanceState.DEFERRED
 
     async def run(self) -> AsyncIterator[TriggerEvent]:  # type: ignore[override]
         """Get current job execution status and yields a TriggerEvent."""
@@ -155,7 +182,7 @@ class BigQueryInsertJobTrigger(BaseTrigger):
                     )
                     await asyncio.sleep(self.poll_interval)
         except asyncio.CancelledError:
-            if self.job_id and self.cancel_on_kill and self.safe_to_cancel():
+            if self.job_id and self.cancel_on_kill and await self.safe_to_cancel():
                 self.log.info(
                     "The job is safe to cancel the as airflow TaskInstance is not in deferred state."
                 )
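
These hunks make `safe_to_cancel` asynchronous: on Airflow 3 the trigger no longer queries the metadata database and instead reads the task state through the Task SDK, wrapping the synchronous call with `sync_to_async` so the triggerer's event loop is not blocked. A condensed, standalone sketch of that lookup; the `task_instance` argument is assumed to carry `dag_id`, `task_id`, `run_id`, and `map_index` attributes, as in the trigger:

from asgiref.sync import sync_to_async

from airflow.utils.state import TaskInstanceState


async def is_safe_to_cancel(task_instance) -> bool:
    # Airflow 3 path: fetch the current task state via the Task SDK instead of the DB.
    from airflow.sdk.execution_time.task_runner import RuntimeTaskInstance

    states = await sync_to_async(RuntimeTaskInstance.get_task_states)(
        dag_id=task_instance.dag_id,
        task_ids=[task_instance.task_id],
        run_ids=[task_instance.run_id],
        map_index=task_instance.map_index,
    )
    task_state = states[task_instance.run_id][task_instance.task_id]
    # Only cancel the external job when the task is NOT deferred; a deferred task
    # means the trigger itself is being torn down, not the task being killed.
    return task_state != TaskInstanceState.DEFERRED

The Dataproc triggers further down receive the same treatment.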
airflow/providers/google/cloud/triggers/dataflow.py
@@ -788,3 +788,125 @@ class DataflowJobMessagesTrigger(BaseTrigger):
             poll_sleep=self.poll_sleep,
             impersonation_chain=self.impersonation_chain,
         )
+
+
+class DataflowJobStateCompleteTrigger(BaseTrigger):
+    """
+    Trigger that monitors if a Dataflow job has reached any of successful terminal state meant for that job.
+
+    :param job_id: Required. ID of the job.
+    :param project_id: Required. The Google Cloud project ID in which the job was started.
+    :param location: Optional. The location where the job is executed. If set to None then
+        the value of DEFAULT_DATAFLOW_LOCATION will be used.
+    :param wait_until_finished: Optional. Dataflow option to block pipeline until completion.
+    :param gcp_conn_id: The connection ID to use for connecting to Google Cloud.
+    :param poll_sleep: Time (seconds) to wait between two consecutive calls to check the job.
+    :param impersonation_chain: Optional. Service account to impersonate using short-term
+        credentials, or chained list of accounts required to get the access_token
+        of the last account in the list, which will be impersonated in the request.
+        If set as a string, the account must grant the originating account
+        the Service Account Token Creator IAM role.
+        If set as a sequence, the identities from the list must grant
+        Service Account Token Creator IAM role to the directly preceding identity, with first
+        account from the list granting this role to the originating account (templated).
+    """
+
+    def __init__(
+        self,
+        job_id: str,
+        project_id: str | None,
+        location: str = DEFAULT_DATAFLOW_LOCATION,
+        wait_until_finished: bool | None = None,
+        gcp_conn_id: str = "google_cloud_default",
+        poll_sleep: int = 10,
+        impersonation_chain: str | Sequence[str] | None = None,
+    ):
+        super().__init__()
+        self.job_id = job_id
+        self.project_id = project_id
+        self.location = location
+        self.wait_until_finished = wait_until_finished
+        self.gcp_conn_id = gcp_conn_id
+        self.poll_sleep = poll_sleep
+        self.impersonation_chain = impersonation_chain
+
+    def serialize(self) -> tuple[str, dict[str, Any]]:
+        """Serialize class arguments and classpath."""
+        return (
+            "airflow.providers.google.cloud.triggers.dataflow.DataflowJobStateCompleteTrigger",
+            {
+                "job_id": self.job_id,
+                "project_id": self.project_id,
+                "location": self.location,
+                "wait_until_finished": self.wait_until_finished,
+                "gcp_conn_id": self.gcp_conn_id,
+                "poll_sleep": self.poll_sleep,
+                "impersonation_chain": self.impersonation_chain,
+            },
+        )
+
+    async def run(self):
+        """
+        Loop until the job reaches successful final or error state.
+
+        Yields a TriggerEvent with success status, if the job reaches successful state for own type.
+
+        Yields a TriggerEvent with error status, if the client returns an unexpected terminal
+        job status or any exception is raised while looping.
+
+        In any other case the Trigger will wait for a specified amount of time
+        stored in self.poll_sleep variable.
+        """
+        try:
+            while True:
+                job = await self.async_hook.get_job(
+                    project_id=self.project_id,
+                    job_id=self.job_id,
+                    location=self.location,
+                )
+                job_state = job.current_state.name
+                job_type_name = job.type_.name
+
+                FAILED_STATES = DataflowJobStatus.FAILED_END_STATES | {DataflowJobStatus.JOB_STATE_DRAINED}
+                if job_state in FAILED_STATES:
+                    yield TriggerEvent(
+                        {
+                            "status": "error",
+                            "message": (
+                                f"Job with id '{self.job_id}' is in failed terminal state: {job_state}"
+                            ),
+                        }
+                    )
+                    return
+
+                if self.async_hook.job_reached_terminal_state(
+                    job={"id": self.job_id, "currentState": job_state, "type": job_type_name},
+                    wait_until_finished=self.wait_until_finished,
+                ):
+                    yield TriggerEvent(
+                        {
+                            "status": "success",
+                            "message": (
+                                f"Job with id '{self.job_id}' has reached successful final state: {job_state}"
+                            ),
+                        }
+                    )
+                    return
+                self.log.info("Sleeping for %s seconds.", self.poll_sleep)
+                await asyncio.sleep(self.poll_sleep)
+        except Exception as e:
+            self.log.error("Exception occurred while checking for job state!")
+            yield TriggerEvent(
+                {
+                    "status": "error",
+                    "message": str(e),
+                }
+            )
+
+    @cached_property
+    def async_hook(self) -> AsyncDataflowHook:
+        return AsyncDataflowHook(
+            gcp_conn_id=self.gcp_conn_id,
+            poll_sleep=self.poll_sleep,
+            impersonation_chain=self.impersonation_chain,
+        )
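
Like the provider's other triggers, `DataflowJobStateCompleteTrigger` must round-trip every constructor argument through `serialize()` so the triggerer can persist and re-create it. A minimal sketch of instantiating it and checking that round trip, with placeholder project and job identifiers:

from airflow.providers.google.cloud.triggers.dataflow import DataflowJobStateCompleteTrigger

# Placeholder project/job values for illustration only.
trigger = DataflowJobStateCompleteTrigger(
    job_id="2025-01-01_00_00_00-1234567890123456789",
    project_id="my-gcp-project",
    location="us-central1",
    gcp_conn_id="google_cloud_default",
    poll_sleep=10,
)

classpath, kwargs = trigger.serialize()
assert classpath.endswith("DataflowJobStateCompleteTrigger")
assert kwargs["job_id"] == trigger.job_id

A deferrable Dataflow operator would hand such an instance to `self.defer()` and resume when the trigger yields its success or error `TriggerEvent`.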
airflow/providers/google/cloud/triggers/dataproc.py
@@ -25,6 +25,7 @@ import time
 from collections.abc import AsyncIterator, Sequence
 from typing import TYPE_CHECKING, Any
 
+from asgiref.sync import sync_to_async
 from google.api_core.exceptions import NotFound
 from google.cloud.dataproc_v1 import Batch, Cluster, ClusterStatus, JobStatus
 
@@ -33,6 +34,7 @@ from airflow.models.taskinstance import TaskInstance
 from airflow.providers.google.cloud.hooks.dataproc import DataprocAsyncHook, DataprocHook
 from airflow.providers.google.cloud.utils.dataproc import DataprocOperationType
 from airflow.providers.google.common.hooks.base_google import PROVIDE_PROJECT_ID
+from airflow.providers.google.version_compat import AIRFLOW_V_3_0_PLUS
 from airflow.triggers.base import BaseTrigger, TriggerEvent
 from airflow.utils.session import provide_session
 from airflow.utils.state import TaskInstanceState
@@ -141,16 +143,41 @@ class DataprocSubmitTrigger(DataprocBaseTrigger):
         )
         return task_instance
 
-    def safe_to_cancel(self) -> bool:
+    async def get_task_state(self):
+        from airflow.sdk.execution_time.task_runner import RuntimeTaskInstance
+
+        task_states_response = await sync_to_async(RuntimeTaskInstance.get_task_states)(
+            dag_id=self.task_instance.dag_id,
+            task_ids=[self.task_instance.task_id],
+            run_ids=[self.task_instance.run_id],
+            map_index=self.task_instance.map_index,
+        )
+        try:
+            task_state = task_states_response[self.task_instance.run_id][self.task_instance.task_id]
+        except Exception:
+            raise AirflowException(
+                "TaskInstance with dag_id: %s, task_id: %s, run_id: %s and map_index: %s is not found",
+                self.task_instance.dag_id,
+                self.task_instance.task_id,
+                self.task_instance.run_id,
+                self.task_instance.map_index,
+            )
+        return task_state
+
+    async def safe_to_cancel(self) -> bool:
         """
         Whether it is safe to cancel the external job which is being executed by this trigger.
 
         This is to avoid the case that `asyncio.CancelledError` is called because the trigger itself is stopped.
         Because in those cases, we should NOT cancel the external job.
         """
-        # Database query is needed to get the latest state of the task instance.
-        task_instance = self.get_task_instance()  # type: ignore[call-arg]
-        return task_instance.state != TaskInstanceState.DEFERRED
+        if AIRFLOW_V_3_0_PLUS:
+            task_state = await self.get_task_state()
+        else:
+            # Database query is needed to get the latest state of the task instance.
+            task_instance = self.get_task_instance()  # type: ignore[call-arg]
+            task_state = task_instance.state
+        return task_state != TaskInstanceState.DEFERRED
 
     async def run(self):
         try:
@@ -167,7 +194,7 @@ class DataprocSubmitTrigger(DataprocBaseTrigger):
         except asyncio.CancelledError:
             self.log.info("Task got cancelled.")
             try:
-                if self.job_id and self.cancel_on_kill and self.safe_to_cancel():
+                if self.job_id and self.cancel_on_kill and await self.safe_to_cancel():
                     self.log.info(
                         "Cancelling the job as it is safe to do so. Note that the airflow TaskInstance is not"
                         " in deferred state."
@@ -243,16 +270,41 @@ class DataprocClusterTrigger(DataprocBaseTrigger):
         )
         return task_instance
 
-    def safe_to_cancel(self) -> bool:
+    async def get_task_state(self):
+        from airflow.sdk.execution_time.task_runner import RuntimeTaskInstance
+
+        task_states_response = await sync_to_async(RuntimeTaskInstance.get_task_states)(
+            dag_id=self.task_instance.dag_id,
+            task_ids=[self.task_instance.task_id],
+            run_ids=[self.task_instance.run_id],
+            map_index=self.task_instance.map_index,
+        )
+        try:
+            task_state = task_states_response[self.task_instance.run_id][self.task_instance.task_id]
+        except Exception:
+            raise AirflowException(
+                "TaskInstance with dag_id: %s, task_id: %s, run_id: %s and map_index: %s is not found",
+                self.task_instance.dag_id,
+                self.task_instance.task_id,
+                self.task_instance.run_id,
+                self.task_instance.map_index,
+            )
+        return task_state
+
+    async def safe_to_cancel(self) -> bool:
         """
         Whether it is safe to cancel the external job which is being executed by this trigger.
 
         This is to avoid the case that `asyncio.CancelledError` is called because the trigger itself is stopped.
         Because in those cases, we should NOT cancel the external job.
         """
-        # Database query is needed to get the latest state of the task instance.
-        task_instance = self.get_task_instance()  # type: ignore[call-arg]
-        return task_instance.state != TaskInstanceState.DEFERRED
+        if AIRFLOW_V_3_0_PLUS:
+            task_state = await self.get_task_state()
+        else:
+            # Database query is needed to get the latest state of the task instance.
+            task_instance = self.get_task_instance()  # type: ignore[call-arg]
+            task_state = task_instance.state
+        return task_state != TaskInstanceState.DEFERRED
 
     async def run(self) -> AsyncIterator[TriggerEvent]:
         try:
@@ -283,7 +335,7 @@ class DataprocClusterTrigger(DataprocBaseTrigger):
                 await asyncio.sleep(self.polling_interval_seconds)
         except asyncio.CancelledError:
             try:
-                if self.delete_on_error and self.safe_to_cancel():
+                if self.delete_on_error and await self.safe_to_cancel():
                     self.log.info(
                         "Deleting the cluster as it is safe to delete as the airflow TaskInstance is not in "
                         "deferred state."
airflow/providers/google/get_provider_info.py
@@ -675,6 +675,7 @@ def get_provider_info():
                 "airflow.providers.google.cloud.operators.vertex_ai.pipeline_job",
                 "airflow.providers.google.cloud.operators.vertex_ai.generative_model",
                 "airflow.providers.google.cloud.operators.vertex_ai.feature_store",
+                "airflow.providers.google.cloud.operators.vertex_ai.ray",
             ],
         },
         {
@@ -1041,6 +1042,7 @@ def get_provider_info():
                 "airflow.providers.google.cloud.hooks.vertex_ai.generative_model",
                 "airflow.providers.google.cloud.hooks.vertex_ai.prediction_service",
                 "airflow.providers.google.cloud.hooks.vertex_ai.feature_store",
+                "airflow.providers.google.cloud.hooks.vertex_ai.ray",
             ],
         },
         {
@@ -1336,6 +1338,12 @@ def get_provider_info():
                 "python-module": "airflow.providers.google.cloud.transfers.azure_blob_to_gcs",
                 "how-to-guide": "/docs/apache-airflow-providers-google/operators/transfer/azure_blob_to_gcs.rst",
             },
+            {
+                "source-integration-name": "Hypertext Transfer Protocol (HTTP)",
+                "target-integration-name": "Google Cloud Storage (GCS)",
+                "python-module": "airflow.providers.google.cloud.transfers.http_to_gcs",
+                "how-to-guide": "/docs/apache-airflow-providers-google/operators/transfer/http_to_gcs.rst",
+            },
         ],
         "connection-types": [
             {
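
The transfer registration above points at the new http_to_gcs module that ships in this release. A minimal sketch of how such a transfer might be declared in a DAG file; the class name HttpToGCSOperator and its argument names are assumptions modeled on the provider's other transfer operators and should be verified against airflow/providers/google/cloud/transfers/http_to_gcs.py:

# Sketch only: the class and parameter names are assumptions inferred from the registered
# module path, not confirmed by this diff; verify against the shipped http_to_gcs.py.
from airflow.providers.google.cloud.transfers.http_to_gcs import HttpToGCSOperator

download_report = HttpToGCSOperator(
    task_id="http_report_to_gcs",
    http_conn_id="http_default",        # assumed: connection pointing at the HTTP source
    endpoint="/reports/daily.csv",      # assumed: path fetched from that source
    gcp_conn_id="google_cloud_default",
    bucket_name="my-landing-bucket",    # placeholder destination bucket
    object_name="reports/daily.csv",    # placeholder destination object
)

The remaining get_provider_info.py hunks below register the new google_ads and gcp_looker connection types, add the Vertex AI Ray cluster links, and drop the removed AutoML links.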
airflow/providers/google/get_provider_info.py
@@ -1366,6 +1374,14 @@ def get_provider_info():
                 "hook-class-name": "airflow.providers.google.leveldb.hooks.leveldb.LevelDBHook",
                 "connection-type": "leveldb",
             },
+            {
+                "hook-class-name": "airflow.providers.google.ads.hooks.ads.GoogleAdsHook",
+                "connection-type": "google_ads",
+            },
+            {
+                "hook-class-name": "airflow.providers.google.cloud.hooks.looker.LookerHook",
+                "connection-type": "gcp_looker",
+            },
         ],
         "extra-links": [
             "airflow.providers.google.cloud.links.alloy_db.AlloyDBBackupsLink",
@@ -1427,6 +1443,8 @@ def get_provider_info():
             "airflow.providers.google.cloud.links.vertex_ai.VertexAIEndpointListLink",
             "airflow.providers.google.cloud.links.vertex_ai.VertexAIPipelineJobLink",
             "airflow.providers.google.cloud.links.vertex_ai.VertexAIPipelineJobListLink",
+            "airflow.providers.google.cloud.links.vertex_ai.VertexAIRayClusterLink",
+            "airflow.providers.google.cloud.links.vertex_ai.VertexAIRayClusterListLink",
             "airflow.providers.google.cloud.links.workflows.WorkflowsWorkflowDetailsLink",
             "airflow.providers.google.cloud.links.workflows.WorkflowsListOfWorkflowsLink",
             "airflow.providers.google.cloud.links.workflows.WorkflowsExecutionLink",
@@ -1457,11 +1475,6 @@ def get_provider_info():
             "airflow.providers.google.cloud.links.cloud_build.CloudBuildListLink",
             "airflow.providers.google.cloud.links.cloud_build.CloudBuildTriggersListLink",
             "airflow.providers.google.cloud.links.cloud_build.CloudBuildTriggerDetailsLink",
-            "airflow.providers.google.cloud.links.automl.AutoMLDatasetLink",
-            "airflow.providers.google.cloud.links.automl.AutoMLDatasetListLink",
-            "airflow.providers.google.cloud.links.automl.AutoMLModelLink",
-            "airflow.providers.google.cloud.links.automl.AutoMLModelTrainLink",
-            "airflow.providers.google.cloud.links.automl.AutoMLModelPredictLink",
             "airflow.providers.google.cloud.links.life_sciences.LifeSciencesLink",
             "airflow.providers.google.cloud.links.cloud_functions.CloudFunctionsDetailsLink",
             "airflow.providers.google.cloud.links.cloud_functions.CloudFunctionsListLink",
airflow/providers/google/leveldb/hooks/leveldb.py
@@ -18,6 +18,8 @@
 
 from __future__ import annotations
 
+from typing import Any
+
 from airflow.exceptions import AirflowException, AirflowOptionalProviderFeatureException
 from airflow.hooks.base import BaseHook
 
@@ -46,6 +48,29 @@ class LevelDBHook(BaseHook):
     conn_type = "leveldb"
     hook_name = "LevelDB"
 
+    @classmethod
+    def get_connection_form_widgets(cls) -> dict[str, Any]:
+        """Return connection widgets to add to LevelDB connection form."""
+        from flask_babel import lazy_gettext
+        from wtforms import BooleanField
+
+        return {
+            "create_if_missing": BooleanField(
+                lazy_gettext("Create a database if it does not exist"), default=False
+            ),
+            "error_if_exists": BooleanField(
+                lazy_gettext("Raise an exception if the database already exists"), default=False
+            ),
+        }
+
+    @classmethod
+    def get_ui_field_behaviour(cls) -> dict[str, Any]:
+        """Return custom UI field behaviour for LevelDB connection."""
+        return {
+            "hidden_fields": ["login", "password", "schema", "port"],
+            "relabeling": {},
+        }
+
     def __init__(self, leveldb_conn_id: str = default_conn_name):
         super().__init__()
         self.leveldb_conn_id = leveldb_conn_id
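
The two new classmethods only shape the LevelDB connection form in the UI; the flags they expose end up in the connection's extras. A minimal sketch of a programmatically created connection carrying the same fields (whether and how LevelDBHook consumes these extras at get_conn() time is not shown in this diff and is an assumption):

from airflow.models.connection import Connection

# Hypothetical connection mirroring the new form fields; the values land in the extras JSON.
leveldb_conn = Connection(
    conn_id="leveldb_default",
    conn_type="leveldb",
    host="/tmp/airflow_leveldb",  # assumed: filesystem path used as the database location
    extra='{"create_if_missing": true, "error_if_exists": false}',
)
print(leveldb_conn.extra_dejson["create_if_missing"])  # True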
airflow/providers/google/version_compat.py
@@ -32,5 +32,4 @@ def get_base_airflow_version_tuple() -> tuple[int, int, int]:
     return airflow_version.major, airflow_version.minor, airflow_version.micro
 
 
-AIRFLOW_V_2_10_PLUS = get_base_airflow_version_tuple() >= (2, 10, 0)
 AIRFLOW_V_3_0_PLUS = get_base_airflow_version_tuple() >= (3, 0, 0)