apache-airflow-providers-google 15.1.0rc1__py3-none-any.whl → 16.0.0a1__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
@@ -32,8 +32,8 @@ __all__ = ["__version__"]
 __version__ = "15.1.0"
 
 if packaging.version.parse(packaging.version.parse(airflow_version).base_version) < packaging.version.parse(
-    "2.9.0"
+    "2.10.0"
 ):
     raise RuntimeError(
-        f"The package `apache-airflow-providers-google:{__version__}` needs Apache Airflow 2.9.0+"
+        f"The package `apache-airflow-providers-google:{__version__}` needs Apache Airflow 2.10.0+"
     )
@@ -185,7 +185,67 @@ class DataflowJobType:
     JOB_TYPE_STREAMING = "JOB_TYPE_STREAMING"
 
 
-class _DataflowJobsController(LoggingMixin):
+class DataflowJobTerminalStateHelper(LoggingMixin):
+    """Helper to define and validate the dataflow job terminal state."""
+
+    @staticmethod
+    def expected_terminal_state_is_allowed(expected_terminal_state):
+        job_allowed_terminal_states = DataflowJobStatus.TERMINAL_STATES | {
+            DataflowJobStatus.JOB_STATE_RUNNING
+        }
+        if expected_terminal_state not in job_allowed_terminal_states:
+            raise AirflowException(
+                f"Google Cloud Dataflow job's expected terminal state "
+                f"'{expected_terminal_state}' is invalid."
+                f" The value should be any of the following: {job_allowed_terminal_states}"
+            )
+        return True
+
+    @staticmethod
+    def expected_terminal_state_is_valid_for_job_type(expected_terminal_state, is_streaming: bool):
+        if is_streaming:
+            invalid_terminal_state = DataflowJobStatus.JOB_STATE_DONE
+            job_type = "streaming"
+        else:
+            invalid_terminal_state = DataflowJobStatus.JOB_STATE_DRAINED
+            job_type = "batch"
+
+        if expected_terminal_state == invalid_terminal_state:
+            raise AirflowException(
+                f"Google Cloud Dataflow job's expected terminal state cannot be {invalid_terminal_state} while it is a {job_type} job"
+            )
+        return True
+
+    def job_reached_terminal_state(self, job, wait_until_finished=None, custom_terminal_state=None) -> bool:
+        """
+        Check the job reached terminal state, if job failed raise exception.
+
+        :return: True if job is done.
+        :raise: Exception
+        """
+        current_state = job["currentState"]
+        is_streaming = job.get("type") == DataflowJobType.JOB_TYPE_STREAMING
+        expected_terminal_state = (
+            DataflowJobStatus.JOB_STATE_RUNNING if is_streaming else DataflowJobStatus.JOB_STATE_DONE
+        )
+        if custom_terminal_state is not None:
+            expected_terminal_state = custom_terminal_state
+        self.expected_terminal_state_is_allowed(expected_terminal_state)
+        self.expected_terminal_state_is_valid_for_job_type(expected_terminal_state, is_streaming=is_streaming)
+        if current_state == expected_terminal_state:
+            if expected_terminal_state == DataflowJobStatus.JOB_STATE_RUNNING and wait_until_finished:
+                return False
+            return True
+        if current_state in DataflowJobStatus.AWAITING_STATES:
+            return wait_until_finished is False
+        self.log.debug("Current job: %s", job)
+        raise AirflowException(
+            f"Google Cloud Dataflow job {job['name']} is in an unexpected terminal state: {current_state}, "
+            f"expected terminal state: {expected_terminal_state}"
+        )
+
+
+class _DataflowJobsController(DataflowJobTerminalStateHelper):
     """
     Interface for communication with Google Cloud Dataflow API.
 
@@ -462,7 +522,10 @@ class _DataflowJobsController(LoggingMixin):
         """Wait for result of submitted job."""
         self.log.info("Start waiting for done.")
         self._refresh_jobs()
-        while self._jobs and not all(self._check_dataflow_job_state(job) for job in self._jobs):
+        while self._jobs and not all(
+            self.job_reached_terminal_state(job, self._wait_until_finished, self._expected_terminal_state)
+            for job in self._jobs
+        ):
             self.log.info("Waiting for done. Sleep %s s", self._poll_sleep)
             time.sleep(self._poll_sleep)
             self._refresh_jobs()
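
The two hunks above move terminal-state evaluation out of the private `_DataflowJobsController._check_dataflow_job_state` into the reusable `DataflowJobTerminalStateHelper`, which the controller (and `AsyncDataflowHook`, below) now inherit. A minimal sketch of how the helper evaluates job payloads, assuming it is importable from the dataflow hooks module shown above; the job dicts are hypothetical examples of a Dataflow jobs.get response:

    from airflow.providers.google.cloud.hooks.dataflow import (
        DataflowJobStatus,
        DataflowJobTerminalStateHelper,
    )

    helper = DataflowJobTerminalStateHelper()

    # Batch job that has reached the default expected terminal state (JOB_STATE_DONE).
    batch_job = {"name": "example-batch", "type": "JOB_TYPE_BATCH", "currentState": "JOB_STATE_DONE"}
    assert helper.job_reached_terminal_state(batch_job) is True

    # Streaming jobs default to JOB_STATE_RUNNING as the expected terminal state, so a
    # running streaming job counts as "done" unless wait_until_finished is requested.
    streaming_job = {"name": "example-streaming", "type": "JOB_TYPE_STREAMING", "currentState": "JOB_STATE_RUNNING"}
    assert helper.job_reached_terminal_state(streaming_job) is True
    assert helper.job_reached_terminal_state(streaming_job, wait_until_finished=True) is False
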
@@ -1295,8 +1358,7 @@ class DataflowHook(GoogleBaseHook):
             location=location,
         )
         job = job_controller.fetch_job_by_id(job_id)
-
-        return job_controller._check_dataflow_job_state(job)
+        return job_controller.job_reached_terminal_state(job)
 
     @GoogleBaseHook.fallback_to_default_project_id
     def create_data_pipeline(
@@ -1425,7 +1487,7 @@ class DataflowHook(GoogleBaseHook):
         return f"projects/{project_id}/locations/{location}"
 
 
-class AsyncDataflowHook(GoogleBaseAsyncHook):
+class AsyncDataflowHook(GoogleBaseAsyncHook, DataflowJobTerminalStateHelper):
    """Async hook class for dataflow service."""

    sync_hook_class = DataflowHook
@@ -0,0 +1,223 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""This module contains a Google Cloud Vertex AI hook."""
+
+from __future__ import annotations
+
+import dataclasses
+from typing import Any
+
+import vertex_ray
+from google._upb._message import ScalarMapContainer
+from google.cloud import aiplatform
+from google.cloud.aiplatform.vertex_ray.util import resources
+from google.cloud.aiplatform_v1 import (
+    PersistentResourceServiceClient,
+)
+from proto.marshal.collections.repeated import Repeated
+
+from airflow.providers.google.common.hooks.base_google import GoogleBaseHook
+
+
+class RayHook(GoogleBaseHook):
+    """Hook for Google Cloud Vertex AI Ray APIs."""
+
+    def extract_cluster_id(self, cluster_path) -> str:
+        """Extract cluster_id from cluster_path."""
+        cluster_id = PersistentResourceServiceClient.parse_persistent_resource_path(cluster_path)[
+            "persistent_resource"
+        ]
+        return cluster_id
+
+    def serialize_cluster_obj(self, cluster_obj: resources.Cluster) -> dict:
+        """Serialize Cluster dataclass to dict."""
+
+        def __encode_value(value: Any) -> Any:
+            if isinstance(value, (list, Repeated)):
+                return [__encode_value(nested_value) for nested_value in value]
+            if isinstance(value, ScalarMapContainer):
+                return {key: __encode_value(nested_value) for key, nested_value in dict(value).items()}
+            if dataclasses.is_dataclass(value):
+                return dataclasses.asdict(value)
+            return value
+
+        return {
+            field.name: __encode_value(getattr(cluster_obj, field.name))
+            for field in dataclasses.fields(cluster_obj)
+        }
+
+    @GoogleBaseHook.fallback_to_default_project_id
+    def create_ray_cluster(
+        self,
+        project_id: str,
+        location: str,
+        head_node_type: resources.Resources = resources.Resources(),
+        python_version: str = "3.10",
+        ray_version: str = "2.33",
+        network: str | None = None,
+        service_account: str | None = None,
+        cluster_name: str | None = None,
+        worker_node_types: list[resources.Resources] | None = None,
+        custom_images: resources.NodeImages | None = None,
+        enable_metrics_collection: bool = True,
+        enable_logging: bool = True,
+        psc_interface_config: resources.PscIConfig | None = None,
+        reserved_ip_ranges: list[str] | None = None,
+        labels: dict[str, str] | None = None,
+    ) -> str:
+        """
+        Create a Ray cluster on the Vertex AI.
+
+        :param project_id: Required. The ID of the Google Cloud project that the service belongs to.
+        :param location: Required. The ID of the Google Cloud location that the service belongs to.
+        :param head_node_type: The head node resource. Resources.node_count must be 1. If not set, default
+            value of Resources() class will be used.
+        :param python_version: Python version for the ray cluster.
+        :param ray_version: Ray version for the ray cluster. Default is 2.33.0.
+        :param network: Virtual private cloud (VPC) network. For Ray Client, VPC peering is required to
+            connect to the Ray Cluster managed in the Vertex API service. For Ray Job API, VPC network is not
+            required because Ray Cluster connection can be accessed through dashboard address.
+        :param service_account: Service account to be used for running Ray programs on the cluster.
+        :param cluster_name: This value may be up to 63 characters, and valid characters are `[a-z0-9_-]`.
+            The first character cannot be a number or hyphen.
+        :param worker_node_types: The list of Resources of the worker nodes. The same Resources object should
+            not appear multiple times in the list.
+        :param custom_images: The NodeImages which specifies head node and worker nodes images. All the
+            workers will share the same image. If each Resource has a specific custom image, use
+            `Resources.custom_image` for head/worker_node_type(s). Note that configuring
+            `Resources.custom_image` will override `custom_images` here. Allowlist only.
+        :param enable_metrics_collection: Enable Ray metrics collection for visualization.
+        :param enable_logging: Enable exporting Ray logs to Cloud Logging.
+        :param psc_interface_config: PSC-I config.
+        :param reserved_ip_ranges: A list of names for the reserved IP ranges under the VPC network that can
+            be used for this cluster. If set, we will deploy the cluster within the provided IP ranges.
+            Otherwise, the cluster is deployed to any IP ranges under the provided VPC network.
+            Example: ["vertex-ai-ip-range"].
+        :param labels: The labels with user-defined metadata to organize Ray cluster.
+            Label keys and values can be no longer than 64 characters (Unicode codepoints), can only contain
+            lowercase letters, numeric characters, underscores and dashes. International characters are allowed.
+            See https://goo.gl/xmQnxf for more information and examples of labels.
+        """
+        aiplatform.init(project=project_id, location=location, credentials=self.get_credentials())
+        cluster_path = vertex_ray.create_ray_cluster(
+            head_node_type=head_node_type,
+            python_version=python_version,
+            ray_version=ray_version,
+            network=network,
+            service_account=service_account,
+            cluster_name=cluster_name,
+            worker_node_types=worker_node_types,
+            custom_images=custom_images,
+            enable_metrics_collection=enable_metrics_collection,
+            enable_logging=enable_logging,
+            psc_interface_config=psc_interface_config,
+            reserved_ip_ranges=reserved_ip_ranges,
+            labels=labels,
+        )
+        return cluster_path
+
+    @GoogleBaseHook.fallback_to_default_project_id
+    def list_ray_clusters(
+        self,
+        project_id: str,
+        location: str,
+    ) -> list[resources.Cluster]:
+        """
+        List Ray clusters under the currently authenticated project.
+
+        :param project_id: Required. The ID of the Google Cloud project that the service belongs to.
+        :param location: Required. The ID of the Google Cloud location that the service belongs to.
+        """
+        aiplatform.init(project=project_id, location=location, credentials=self.get_credentials())
+        ray_clusters = vertex_ray.list_ray_clusters()
+        return ray_clusters
+
+    @GoogleBaseHook.fallback_to_default_project_id
+    def get_ray_cluster(
+        self,
+        project_id: str,
+        location: str,
+        cluster_id: str,
+    ) -> resources.Cluster:
+        """
+        Get Ray cluster.
+
+        :param project_id: Required. The ID of the Google Cloud project that the service belongs to.
+        :param location: Required. The ID of the Google Cloud location that the service belongs to.
+        :param cluster_id: Cluster resource ID.
+        """
+        aiplatform.init(project=project_id, location=location, credentials=self.get_credentials())
+        ray_cluster_name = PersistentResourceServiceClient.persistent_resource_path(
+            project=project_id,
+            location=location,
+            persistent_resource=cluster_id,
+        )
+        ray_cluster = vertex_ray.get_ray_cluster(
+            cluster_resource_name=ray_cluster_name,
+        )
+        return ray_cluster
+
+    @GoogleBaseHook.fallback_to_default_project_id
+    def update_ray_cluster(
+        self,
+        project_id: str,
+        location: str,
+        cluster_id: str,
+        worker_node_types: list[resources.Resources],
+    ) -> str:
+        """
+        Update Ray cluster (currently support resizing node counts for worker nodes).
+
+        :param project_id: Required. The ID of the Google Cloud project that the service belongs to.
+        :param location: Required. The ID of the Google Cloud location that the service belongs to.
+        :param cluster_id: Cluster resource ID.
+        :param worker_node_types: The list of Resources of the resized worker nodes. The same Resources
+            object should not appear multiple times in the list.
+        """
+        aiplatform.init(project=project_id, location=location, credentials=self.get_credentials())
+        ray_cluster_name = PersistentResourceServiceClient.persistent_resource_path(
+            project=project_id,
+            location=location,
+            persistent_resource=cluster_id,
+        )
+        updated_ray_cluster_name = vertex_ray.update_ray_cluster(
+            cluster_resource_name=ray_cluster_name, worker_node_types=worker_node_types
+        )
+        return updated_ray_cluster_name
+
+    @GoogleBaseHook.fallback_to_default_project_id
+    def delete_ray_cluster(
+        self,
+        project_id: str,
+        location: str,
+        cluster_id: str,
+    ) -> None:
+        """
+        Delete Ray cluster.
+
+        :param project_id: Required. The ID of the Google Cloud project that the service belongs to.
+        :param location: Required. The ID of the Google Cloud location that the service belongs to.
+        :param cluster_id: Cluster resource ID.
+        """
+        aiplatform.init(project=project_id, location=location, credentials=self.get_credentials())
+        ray_cluster_name = PersistentResourceServiceClient.persistent_resource_path(
+            project=project_id,
+            location=location,
+            persistent_resource=cluster_id,
+        )
+        vertex_ray.delete_ray_cluster(cluster_resource_name=ray_cluster_name)
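
The new `RayHook` is a thin wrapper around the `vertex_ray` client from google-cloud-aiplatform. A hedged usage sketch (not part of the diff), assuming the file ships as `airflow.providers.google.cloud.hooks.vertex_ai.ray`; the connection id, project, region, cluster name, and machine shapes are placeholders:

    from airflow.providers.google.cloud.hooks.vertex_ai.ray import RayHook
    from google.cloud.aiplatform.vertex_ray.util import resources

    hook = RayHook(gcp_conn_id="google_cloud_default")

    # Create a small cluster: one head node and two workers (illustrative shapes only).
    cluster_path = hook.create_ray_cluster(
        project_id="my-project",
        location="us-central1",
        head_node_type=resources.Resources(machine_type="n1-standard-16", node_count=1),
        worker_node_types=[resources.Resources(machine_type="n1-standard-8", node_count=2)],
        cluster_name="example-ray-cluster",
    )

    # The create call returns the full persistent-resource path; the hook can split out the id
    # and serialize the Cluster dataclass for XCom-friendly output.
    cluster_id = hook.extract_cluster_id(cluster_path)
    cluster = hook.get_ray_cluster(project_id="my-project", location="us-central1", cluster_id=cluster_id)
    print(hook.serialize_cluster_obj(cluster))
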
@@ -54,6 +54,10 @@ VERTEX_AI_PIPELINE_JOB_LINK = (
     VERTEX_AI_BASE_LINK + "/locations/{region}/pipelines/runs/{pipeline_id}?project={project_id}"
 )
 VERTEX_AI_PIPELINE_JOB_LIST_LINK = VERTEX_AI_BASE_LINK + "/pipelines/runs?project={project_id}"
+VERTEX_AI_RAY_CLUSTER_LINK = (
+    VERTEX_AI_BASE_LINK + "/locations/{location}/ray-clusters/{cluster_id}?project={project_id}"
+)
+VERTEX_AI_RAY_CLUSTER_LIST_LINK = VERTEX_AI_BASE_LINK + "/ray?project={project_id}"
 
 
 class VertexAIModelLink(BaseGoogleLink):
@@ -369,3 +373,48 @@ class VertexAIPipelineJobListLink(BaseGoogleLink):
             "project_id": task_instance.project_id,
         },
     )
+
+
+class VertexAIRayClusterLink(BaseGoogleLink):
+    """Helper class for constructing Vertex AI Ray Cluster link."""
+
+    name = "Ray Cluster"
+    key = "ray_cluster_conf"
+    format_str = VERTEX_AI_RAY_CLUSTER_LINK
+
+    @staticmethod
+    def persist(
+        context: Context,
+        task_instance,
+        cluster_id: str,
+    ):
+        task_instance.xcom_push(
+            context=context,
+            key=VertexAIRayClusterLink.key,
+            value={
+                "location": task_instance.location,
+                "cluster_id": cluster_id,
+                "project_id": task_instance.project_id,
+            },
+        )
+
+
+class VertexAIRayClusterListLink(BaseGoogleLink):
+    """Helper class for constructing Vertex AI Ray Cluster List link."""
+
+    name = "Ray Cluster List"
+    key = "ray_cluster_list_conf"
+    format_str = VERTEX_AI_RAY_CLUSTER_LIST_LINK
+
+    @staticmethod
+    def persist(
+        context: Context,
+        task_instance,
+    ):
+        task_instance.xcom_push(
+            context=context,
+            key=VertexAIRayClusterListLink.key,
+            value={
+                "project_id": task_instance.project_id,
+            },
+        )
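
The two new link classes follow the usual extra-link pattern: `persist()` pushes the parameters needed to fill `format_str` into XCom, and the link reads `project_id` and `location` off the task instance. A hedged sketch (not from the diff) of a hypothetical operator wiring up `VertexAIRayClusterLink`, assuming the links live in `airflow.providers.google.cloud.links.vertex_ai`:

    from __future__ import annotations

    from airflow.models import BaseOperator
    from airflow.providers.google.cloud.links.vertex_ai import VertexAIRayClusterLink


    class ExampleRayClusterLinkOperator(BaseOperator):
        """Hypothetical operator that only demonstrates persisting the new link."""

        operator_extra_links = (VertexAIRayClusterLink(),)

        def __init__(self, project_id: str, location: str, cluster_id: str, **kwargs):
            super().__init__(**kwargs)
            # persist() reads project_id and location from the task instance, so they must be attributes.
            self.project_id = project_id
            self.location = location
            self.cluster_id = cluster_id

        def execute(self, context):
            # Stores the link payload under VertexAIRayClusterLink.key ("ray_cluster_conf") in XCom,
            # which the UI uses to render the "Ray Cluster" button on the task instance.
            VertexAIRayClusterLink.persist(context=context, task_instance=self, cluster_id=self.cluster_id)
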
@@ -61,13 +61,15 @@ class GCSRemoteLogIO(LoggingMixin):  # noqa: D101
     remote_base: str
     base_log_folder: Path = attrs.field(converter=Path)
     delete_local_copy: bool
+    project_id: str
 
     gcp_key_path: str | None
     gcp_keyfile_dict: dict | None
     scopes: Collection[str] | None
-    project_id: str
 
-    def upload(self, path: os.PathLike, ti: RuntimeTI):
+    processors = ()
+
+    def upload(self, path: os.PathLike | str, ti: RuntimeTI):
         """Upload the given log path to the remote storage."""
         path = Path(path)
         if path.is_absolute():
@@ -265,7 +265,16 @@ class CloudRunExecuteJobOperator(GoogleCloudBaseOperator):
     :param deferrable: Run the operator in deferrable mode.
     """
 
-    template_fields = ("project_id", "region", "gcp_conn_id", "impersonation_chain", "job_name", "overrides")
+    template_fields = (
+        "project_id",
+        "region",
+        "gcp_conn_id",
+        "impersonation_chain",
+        "job_name",
+        "overrides",
+        "polling_period_seconds",
+        "timeout_seconds",
+    )
 
     def __init__(
         self,
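
With `polling_period_seconds` and `timeout_seconds` added to `template_fields`, both can now be rendered from the task context. An illustrative sketch (not from the diff; project, region, job name, and params are placeholders, and the rendered Jinja values arrive as strings, so plain numeric arguments remain the simpler choice when templating is not needed):

    from airflow.providers.google.cloud.operators.cloud_run import CloudRunExecuteJobOperator

    execute_job = CloudRunExecuteJobOperator(
        task_id="execute_cloud_run_job",
        project_id="my-project",
        region="europe-west1",
        job_name="my-job",
        deferrable=True,
        # The two newly templated fields can be driven by Jinja, e.g. from DAG params.
        polling_period_seconds="{{ params.poll_seconds }}",
        timeout_seconds="{{ params.timeout_seconds }}",
    )
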
@@ -636,7 +636,7 @@ class GKEStartPodOperator(GKEOperatorMixin, KubernetesPodOperator):
     """
 
     template_fields: Sequence[str] = tuple(
-        {"on_finish_action", "deferrable"}
+        {"deferrable"}
         | (set(KubernetesPodOperator.template_fields) - {"is_delete_operator_pod", "regional"})
        | set(GKEOperatorMixin.template_fields)
    )