PyPI - zenml-nightly - Versions diffs - 0.83.1.dev20250702__py3-none-any.whl → 0.83.1.dev20250703__py3-none-any.whl - Mend

zenml-nightly 0.83.1.dev20250702__py3-none-any.whl → 0.83.1.dev20250703__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (27) hide show

zenml/VERSION CHANGED Viewed

	@@ -1 +1 @@
1	- 0.83.1.dev20250702
1	+ 0.83.1.dev20250703

zenml/cli/pipeline.py CHANGED Viewed

@@ -34,7 +34,7 @@ from zenml.models import (
     ScheduleFilter,
 )
 from zenml.pipelines.pipeline_definition import Pipeline
-from zenml.utils import source_utils, uuid_utils
+from zenml.utils import run_utils, source_utils, uuid_utils
 from zenml.utils.yaml_utils import write_yaml
 logger = get_logger(__name__)
@@ -511,6 +511,59 @@ def list_pipeline_runs(**kwargs: Any) -> None:
         cli_utils.print_page_info(pipeline_runs)
+@runs.command("stop")
+@click.argument("run_name_or_id", type=str, required=True)
+@click.option(
+    "--graceful",
+    "-g",
+    is_flag=True,
+    default=False,
+    help="Use graceful shutdown (default is False).",
+)
+@click.option(
+    "--yes",
+    "-y",
+    is_flag=True,
+    default=False,
+    help="Don't ask for confirmation.",
+)
+def stop_pipeline_run(
+    run_name_or_id: str,
+    graceful: bool = False,
+    yes: bool = False,
+) -> None:
+    """Stop a running pipeline.
+    Args:
+        run_name_or_id: The name or ID of the pipeline run to stop.
+        graceful: If True, uses graceful shutdown. If False, forces immediate termination.
+        yes: If set, don't ask for confirmation.
+    """
+    # Ask for confirmation to stop run.
+    if not yes:
+        action = "gracefully stop" if graceful else "force stop"
+        confirmation = cli_utils.confirmation(
+            f"Are you sure you want to {action} pipeline run `{run_name_or_id}`?"
+        )
+        if not confirmation:
+            cli_utils.declare("Not stopping the pipeline run.")
+            return
+    # Stop run.
+    try:
+        run = Client().get_pipeline_run(name_id_or_prefix=run_name_or_id)
+        run_utils.stop_run(run=run, graceful=graceful)
+        action = "Gracefully stopped" if graceful else "Force stopped"
+        cli_utils.declare(f"{action} pipeline run '{run.name}'.")
+    except NotImplementedError:
+        cli_utils.error(
+            "The orchestrator used for this pipeline run does not support "
+            f"{'gracefully' if graceful else 'forcefully'} stopping runs."
+        )
+    except Exception as e:
+        cli_utils.error(f"Failed to stop pipeline run: {e}")
 @runs.command("delete")
 @click.argument("run_name_or_id", type=str, required=True)
 @click.option(

zenml/cli/utils.py CHANGED Viewed

@@ -2214,6 +2214,8 @@ def get_execution_status_emoji(status: "ExecutionStatus") -> str:
         return ":white_check_mark:"
     if status == ExecutionStatus.CACHED:
         return ":package:"
+    if status == ExecutionStatus.STOPPED or status == ExecutionStatus.STOPPING:
+        return ":stop_sign:"
     raise RuntimeError(f"Unknown status: {status}")

zenml/constants.py CHANGED Viewed

@@ -416,6 +416,7 @@ STATISTICS = "/statistics"
 STATUS = "/status"
 STEP_CONFIGURATION = "/step-configuration"
 STEPS = "/steps"
+STOP = "/stop"
 TAGS = "/tags"
 TAG_RESOURCES = "/tag_resources"
 TRIGGERS = "/triggers"

zenml/enums.py CHANGED Viewed

@@ -71,25 +71,28 @@ class ZenMLServiceType(StrEnum):
 class ExecutionStatus(StrEnum):
-    """Enum that represents the current status of a step or pipeline run."""
+    """Enum that represents the execution status of a step or pipeline run."""
     INITIALIZING = "initializing"
     FAILED = "failed"
     COMPLETED = "completed"
     RUNNING = "running"
     CACHED = "cached"
+    STOPPED = "stopped"
+    STOPPING = "stopping"
     @property
     def is_finished(self) -> bool:
-        """Whether the execution status refers to a finished execution.
+        """Returns whether the execution status is in a finished state.
         Returns:
-            Whether the execution status refers to a finished execution.
+            Whether the execution status is finished.
         """
         return self in {
             ExecutionStatus.FAILED,
             ExecutionStatus.COMPLETED,
             ExecutionStatus.CACHED,
+            ExecutionStatus.STOPPED,
         }

zenml/exceptions.py CHANGED Viewed

@@ -122,6 +122,14 @@ class IllegalOperationError(ZenMLBaseException):
     """Raised when an illegal operation is attempted."""
+class RunStoppedException(ZenMLBaseException):
+    """Raised when a ZenML pipeline run gets stopped by the user."""
+class RunInterruptedException(ZenMLBaseException):
+    """Raised when a ZenML step gets interrupted for an unknown reason."""
 class MethodNotAllowedError(ZenMLBaseException):
     """Raised when the server does not allow a request method."""

zenml/integrations/aws/orchestrators/sagemaker_orchestrator.py CHANGED Viewed

@@ -853,12 +853,16 @@ class SagemakerOrchestrator(ContainerizedOrchestrator):
         )["PipelineExecutionStatus"]
         # Map the potential outputs to ZenML ExecutionStatus. Potential values:
-        # https://cloud.google.com/vertex-ai/docs/reference/rest/v1beta1/PipelineState
-        if status in ["Executing", "Stopping"]:
+        # https://docs.aws.amazon.com/sagemaker/latest/APIReference/API_DescribePipelineExecution.html
+        if status == "Executing":
             return ExecutionStatus.RUNNING
-        elif status in ["Stopped", "Failed"]:
+        elif status == "Stopping":
+            return ExecutionStatus.STOPPING
+        elif status == "Stopped":
+            return ExecutionStatus.STOPPED
+        elif status == "Failed":
             return ExecutionStatus.FAILED
-        elif status in ["Succeeded"]:
+        elif status == "Succeeded":
             return ExecutionStatus.COMPLETED
         else:
             raise ValueError("Unknown status for the pipeline execution.")

zenml/integrations/azure/orchestrators/azureml_orchestrator.py CHANGED Viewed

@@ -515,14 +515,16 @@ class AzureMLOrchestrator(ContainerizedOrchestrator):
             return ExecutionStatus.INITIALIZING
         elif status in ["Running", "Finalizing"]:
             return ExecutionStatus.RUNNING
+        elif status == "CancelRequested":
+            return ExecutionStatus.STOPPING
+        elif status == "Canceled":
+            return ExecutionStatus.STOPPED
         elif status in [
-            "CancelRequested",
             "Failed",
-            "Canceled",
             "NotResponding",
         ]:
             return ExecutionStatus.FAILED
-        elif status in ["Completed"]:
+        elif status == "Completed":
             return ExecutionStatus.COMPLETED
         else:
             raise ValueError("Unknown status for the pipeline job.")

zenml/integrations/gcp/orchestrators/vertex_orchestrator.py CHANGED Viewed

@@ -942,7 +942,7 @@ class VertexOrchestrator(ContainerizedOrchestrator, GoogleCredentialsMixin):
         # Map the potential outputs to ZenML ExecutionStatus. Potential values:
         # https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/sagemaker/client/describe_pipeline_execution.html#
-        if status in [PipelineState.PIPELINE_STATE_UNSPECIFIED]:
+        if status == PipelineState.PIPELINE_STATE_UNSPECIFIED:
             return run.status
         elif status in [
             PipelineState.PIPELINE_STATE_QUEUED,
@@ -954,14 +954,13 @@ class VertexOrchestrator(ContainerizedOrchestrator, GoogleCredentialsMixin):
             PipelineState.PIPELINE_STATE_PAUSED,
         ]:
             return ExecutionStatus.RUNNING
-        elif status in [PipelineState.PIPELINE_STATE_SUCCEEDED]:
+        elif status == PipelineState.PIPELINE_STATE_SUCCEEDED:
             return ExecutionStatus.COMPLETED
-        elif status in [
-            PipelineState.PIPELINE_STATE_FAILED,
-            PipelineState.PIPELINE_STATE_CANCELLING,
-            PipelineState.PIPELINE_STATE_CANCELLED,
-        ]:
+        elif status == PipelineState.PIPELINE_STATE_CANCELLING:
+            return ExecutionStatus.STOPPING
+        elif status == PipelineState.PIPELINE_STATE_CANCELLED:
+            return ExecutionStatus.STOPPED
+        elif status == PipelineState.PIPELINE_STATE_FAILED:
             return ExecutionStatus.FAILED
         else:
             raise ValueError("Unknown status for the pipeline job.")

zenml/integrations/kubernetes/flavors/kubernetes_orchestrator_flavor.py CHANGED Viewed

@@ -69,6 +69,8 @@ class KubernetesOrchestratorSettings(BaseSettings):
             scheduling a pipeline.
         prevent_orchestrator_pod_caching: If `True`, the orchestrator pod will
             not try to compute cached steps before starting the step pods.
+        pod_stop_grace_period: When stopping a pipeline run, the amount of
+            seconds to wait for a step pod to shutdown gracefully.
     """
     synchronous: bool = True
@@ -88,6 +90,7 @@ class KubernetesOrchestratorSettings(BaseSettings):
     failed_jobs_history_limit: Optional[NonNegativeInt] = None
     ttl_seconds_after_finished: Optional[NonNegativeInt] = None
     prevent_orchestrator_pod_caching: bool = False
+    pod_stop_grace_period: PositiveInt = 30
 class KubernetesOrchestratorConfig(

zenml/integrations/kubernetes/orchestrators/kubernetes_orchestrator.py CHANGED Viewed

@@ -545,6 +545,7 @@ class KubernetesOrchestrator(ContainerizedOrchestrator):
                 successful_jobs_history_limit=settings.successful_jobs_history_limit,
                 failed_jobs_history_limit=settings.failed_jobs_history_limit,
                 ttl_seconds_after_finished=settings.ttl_seconds_after_finished,
+                termination_grace_period_seconds=settings.pod_stop_grace_period,
                 labels=orchestrator_pod_labels,
             )
@@ -570,6 +571,7 @@ class KubernetesOrchestrator(ContainerizedOrchestrator):
                 env=environment,
                 labels=orchestrator_pod_labels,
                 mount_local_stores=self.config.is_local,
+                termination_grace_period_seconds=settings.pod_stop_grace_period,
             )
             kube_utils.create_and_wait_for_pod_to_start(
@@ -663,6 +665,92 @@ class KubernetesOrchestrator(ContainerizedOrchestrator):
                 f"{ENV_ZENML_KUBERNETES_RUN_ID}."
             )
+    def _stop_run(
+        self, run: "PipelineRunResponse", graceful: bool = True
+    ) -> None:
+        """Stops a specific pipeline run by terminating step pods.
+        Args:
+            run: The run that was executed by this orchestrator.
+            graceful: If True, does nothing (lets the orchestrator and steps finish naturally).
+                If False, stops all running step pods.
+        Raises:
+            RuntimeError: If we fail to stop the run.
+        """
+        # If graceful, do nothing and let the orchestrator handle the stop naturally
+        if graceful:
+            logger.info(
+                "Graceful stop requested - the orchestrator pod will handle "
+                "stopping naturally"
+            )
+            return
+        pods_stopped = []
+        errors = []
+        # Find all pods with the orchestrator run ID label
+        label_selector = f"run_id={kube_utils.sanitize_label(str(run.id))}"
+        try:
+            pods = self._k8s_core_api.list_namespaced_pod(
+                namespace=self.config.kubernetes_namespace,
+                label_selector=label_selector,
+            )
+        except Exception as e:
+            raise RuntimeError(
+                f"Failed to list step pods with run ID {run.id}: {e}"
+            )
+        # Filter to only include running or pending pods
+        for pod in pods.items:
+            if pod.status.phase not in ["Running", "Pending"]:
+                logger.debug(
+                    f"Skipping pod {pod.metadata.name} with status {pod.status.phase}"
+                )
+                continue
+            try:
+                self._k8s_core_api.delete_namespaced_pod(
+                    name=pod.metadata.name,
+                    namespace=self.config.kubernetes_namespace,
+                )
+                pods_stopped.append(f"step pod: {pod.metadata.name}")
+                logger.debug(
+                    f"Successfully initiated graceful stop of step pod: {pod.metadata.name}"
+                )
+            except Exception as e:
+                error_msg = f"Failed to stop step pod {pod.metadata.name}: {e}"
+                logger.warning(error_msg)
+                errors.append(error_msg)
+        # Summary logging
+        settings = cast(KubernetesOrchestratorSettings, self.get_settings(run))
+        grace_period_seconds = settings.pod_stop_grace_period
+        if pods_stopped:
+            logger.debug(
+                f"Successfully initiated graceful termination of: {', '.join(pods_stopped)}. "
+                f"Pods will terminate within {grace_period_seconds} seconds."
+            )
+        if errors:
+            error_summary = "; ".join(errors)
+            if not pods_stopped:
+                # If nothing was stopped successfully, raise an error
+                raise RuntimeError(
+                    f"Failed to stop pipeline run: {error_summary}"
+                )
+            else:
+                # If some things were stopped but others failed, raise an error
+                raise RuntimeError(
+                    f"Partial stop operation completed with errors: {error_summary}"
+                )
+        # If no step pods were found and no errors occurred
+        if not pods_stopped and not errors:
+            logger.info(
+                f"No running step pods found for pipeline run with ID: {run.id}"
+            )
     def get_pipeline_run_metadata(
         self, run_id: UUID
     ) -> Dict[str, "MetadataType"]:

zenml/integrations/kubernetes/orchestrators/kubernetes_orchestrator_entrypoint.py CHANGED Viewed

@@ -18,6 +18,7 @@ import socket
 from typing import Callable, Dict, Optional, cast
 from kubernetes import client as k8s_client
+from kubernetes.client.rest import ApiException
 from zenml.client import Client
 from zenml.entrypoints.step_entrypoint_configuration import (
@@ -248,6 +249,7 @@ def main() -> None:
             or settings.service_account_name,
             mount_local_stores=mount_local_stores,
             owner_references=owner_references,
+            termination_grace_period_seconds=settings.pod_stop_grace_period,
             labels=step_pod_labels,
         )
@@ -330,6 +332,38 @@ def main() -> None:
             # as the pipeline run status will already have been published.
             pass
+    def check_pipeline_cancellation() -> bool:
+        """Check if the pipeline should continue execution.
+        Returns:
+            True if execution should continue, False if it should stop.
+        """
+        try:
+            run = client.get_pipeline_run(
+                name_id_or_prefix=pipeline_run.id,
+                project=pipeline_run.project_id,
+                hydrate=False,  # We only need status, not full hydration
+            )
+            # If the run is STOPPING or STOPPED, we should stop the execution
+            if run.status in [
+                ExecutionStatus.STOPPING,
+                ExecutionStatus.STOPPED,
+            ]:
+                logger.info(
+                    f"Pipeline run is in {run.status} state, stopping execution"
+                )
+                return False
+            return True
+        except Exception as e:
+            # If we can't check the status, assume we should continue
+            logger.warning(
+                f"Failed to check pipeline cancellation status: {e}"
+            )
+            return True
     parallel_node_startup_waiting_period = (
         orchestrator.config.parallel_step_startup_waiting_period or 0.0
     )
@@ -344,6 +378,7 @@ def main() -> None:
             run_fn=run_step_on_kubernetes,
             preparation_fn=pre_step_run,
             finalize_fn=finalize_run,
+            continue_fn=check_pipeline_cancellation,
             parallel_node_startup_waiting_period=parallel_node_startup_waiting_period,
             max_parallelism=pipeline_settings.max_parallelism,
         ).run()
@@ -360,7 +395,7 @@ def main() -> None:
                     namespace=namespace,
                     secret_name=secret_name,
                 )
-            except k8s_client.rest.ApiException as e:
+            except ApiException as e:
                 logger.error(f"Error cleaning up secret {secret_name}: {e}")

zenml/integrations/kubernetes/orchestrators/manifest_utils.py CHANGED Viewed

@@ -106,6 +106,7 @@ def build_pod_manifest(
     labels: Optional[Dict[str, str]] = None,
     mount_local_stores: bool = False,
     owner_references: Optional[List[k8s_client.V1OwnerReference]] = None,
+    termination_grace_period_seconds: Optional[int] = 30,
 ) -> k8s_client.V1Pod:
     """Build a Kubernetes pod manifest for a ZenML run or step.
@@ -124,6 +125,8 @@ def build_pod_manifest(
         mount_local_stores: Whether to mount the local stores path inside the
             pod.
         owner_references: List of owner references for the pod.
+        termination_grace_period_seconds: The amount of seconds to wait for a
+            pod to shutdown gracefully.
     Returns:
         Pod manifest.
@@ -154,19 +157,20 @@ def build_pod_manifest(
         containers=[container_spec],
         restart_policy="Never",
         image_pull_secrets=image_pull_secrets,
+        termination_grace_period_seconds=termination_grace_period_seconds,
     )
     if service_account_name is not None:
         pod_spec.service_account_name = service_account_name
+    # Apply pod settings if provided
     labels = labels or {}
     if pod_settings:
         add_pod_settings(pod_spec, pod_settings)
-        # Add pod_settings.labels to the labels
-        if pod_settings.labels:
-            labels.update(pod_settings.labels)
+    if pod_settings and pod_settings.labels:
+        labels.update(pod_settings.labels)
     pod_metadata = k8s_client.V1ObjectMeta(
         name=pod_name,
@@ -273,6 +277,7 @@ def build_cron_job_manifest(
     successful_jobs_history_limit: Optional[int] = None,
     failed_jobs_history_limit: Optional[int] = None,
     ttl_seconds_after_finished: Optional[int] = None,
+    termination_grace_period_seconds: Optional[int] = 30,
 ) -> k8s_client.V1CronJob:
     """Create a manifest for launching a pod as scheduled CRON job.
@@ -295,6 +300,8 @@ def build_cron_job_manifest(
         failed_jobs_history_limit: The number of failed jobs to retain.
         ttl_seconds_after_finished: The amount of seconds to keep finished jobs
             before deleting them.
+        termination_grace_period_seconds: The amount of seconds to wait for a
+            pod to shutdown gracefully.
     Returns:
         CRON job manifest.
@@ -310,6 +317,7 @@ def build_cron_job_manifest(
         env=env,
         labels=labels,
         mount_local_stores=mount_local_stores,
+        termination_grace_period_seconds=termination_grace_period_seconds,
     )
     job_spec = k8s_client.V1CronJobSpec(

zenml/models/v2/core/pipeline_run.py CHANGED Viewed

@@ -343,7 +343,7 @@ class PipelineRunResponse(
             if self.stack is None:
                 raise ValueError(
                     "The stack that this pipeline run response was executed on"
-                    "has been deleted."
+                    "is either not accessible or has been deleted."
                 )
             # Create the orchestrator instance
@@ -358,7 +358,7 @@ class PipelineRunResponse(
             if len(orchestrator_list) == 0:
                 raise ValueError(
                     "The orchestrator that this pipeline run response was "
-                    "executed with has been deleted."
+                    "executed with is either not accessible or has been deleted."
                 )
             orchestrator = cast(

zenml/orchestrators/base_orchestrator.py CHANGED Viewed

@@ -38,6 +38,7 @@ from zenml.logger import get_logger
 from zenml.metadata.metadata_types import MetadataType
 from zenml.orchestrators.publish_utils import (
     publish_pipeline_run_metadata,
+    publish_pipeline_run_status_update,
     publish_schedule_metadata,
 )
 from zenml.orchestrators.step_launcher import StepLauncher
@@ -210,6 +211,8 @@ class BaseOrchestrator(StackComponent, ABC):
                 This will be deleted in case the pipeline deployment failed.
         Raises:
+            KeyboardInterrupt: If the orchestrator is synchronous and the
+                pipeline run is keyboard interrupted.
             RunMonitoringError: If a failure happened while monitoring the
                 pipeline run.
         """
@@ -324,8 +327,17 @@ class BaseOrchestrator(StackComponent, ABC):
                     if submission_result.wait_for_completion:
                         try:
                             submission_result.wait_for_completion()
+                        except KeyboardInterrupt:
+                            error_message = "Received KeyboardInterrupt. Note that the run is still executing. "
+                            if placeholder_run:
+                                error_message += (
+                                    "If you want to stop the pipeline run, please use: "
+                                    f"`zenml pipeline runs stop {placeholder_run.id}`"
+                                )
+                            raise KeyboardInterrupt(error_message)
                         except BaseException as e:
                             raise RunMonitoringError(original_exception=e)
         finally:
             self._cleanup_run()
@@ -391,6 +403,64 @@ class BaseOrchestrator(StackComponent, ABC):
             f"'{self.__class__.__name__}' orchestrator."
         )
+    def stop_run(
+        self, run: "PipelineRunResponse", graceful: bool = False
+    ) -> None:
+        """Stops a specific pipeline run.
+        This method should only be called if the orchestrator's
+        supports_cancellation property is True.
+        Args:
+            run: A pipeline run response to stop.
+            graceful: If True, allows for graceful shutdown where possible.
+                If False, forces immediate termination. Default is False.
+        Raises:
+            NotImplementedError: If any orchestrator inheriting from the base
+                class does not implement this logic.
+        """
+        # Check if the orchestrator supports cancellation
+        if (
+            getattr(self._stop_run, "__func__", None)
+            is BaseOrchestrator._stop_run
+        ):
+            raise NotImplementedError(
+                f"The '{self.__class__.__name__}' orchestrator does not "
+                "support stopping pipeline runs."
+            )
+        # Update pipeline status to STOPPING before calling concrete implementation
+        publish_pipeline_run_status_update(
+            pipeline_run_id=run.id,
+            status=ExecutionStatus.STOPPING,
+        )
+        # Now call the concrete implementation
+        self._stop_run(run=run, graceful=graceful)
+    def _stop_run(
+        self, run: "PipelineRunResponse", graceful: bool = False
+    ) -> None:
+        """Concrete implementation of pipeline stopping logic.
+        This method should be implemented by concrete orchestrator classes
+        instead of stop_run to ensure proper status management.
+        Args:
+            run: A pipeline run response to stop (already updated to STOPPING status).
+            graceful: If True, allows for graceful shutdown where possible.
+                If False, forces immediate termination. Default is True.
+        Raises:
+            NotImplementedError: If any orchestrator inheriting from the base
+                class does not implement this logic.
+        """
+        raise NotImplementedError(
+            "The stop run functionality is not implemented for the "
+            f"'{self.__class__.__name__}' orchestrator."
+        )
 class BaseOrchestratorFlavor(Flavor):
     """Base orchestrator flavor class."""

zenml-nightly 0.83.1.dev20250702__py3-none-any.whl → 0.83.1.dev20250703__py3-none-any.whl

zenml-nightly 0.83.1.dev20250702__py3-none-any.whl → 0.83.1.dev20250703__py3-none-any.whl