PyPI - zenml-nightly - Versions diffs - 0.83.1.dev20250702__py3-none-any.whl → 0.83.1.dev20250704__py3-none-any.whl - Mend

zenml-nightly 0.83.1.dev20250702__py3-none-any.whl → 0.83.1.dev20250704__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (37) hide show

zenml/models/v2/core/pipeline_run.py CHANGED Viewed

@@ -343,7 +343,7 @@ class PipelineRunResponse(
             if self.stack is None:
                 raise ValueError(
                     "The stack that this pipeline run response was executed on"
-                    "has been deleted."
+                    "is either not accessible or has been deleted."
                 )
             # Create the orchestrator instance
@@ -358,7 +358,7 @@ class PipelineRunResponse(
             if len(orchestrator_list) == 0:
                 raise ValueError(
                     "The orchestrator that this pipeline run response was "
-                    "executed with has been deleted."
+                    "executed with is either not accessible or has been deleted."
                 )
             orchestrator = cast(

zenml/orchestrators/base_orchestrator.py CHANGED Viewed

@@ -38,6 +38,7 @@ from zenml.logger import get_logger
 from zenml.metadata.metadata_types import MetadataType
 from zenml.orchestrators.publish_utils import (
     publish_pipeline_run_metadata,
+    publish_pipeline_run_status_update,
     publish_schedule_metadata,
 )
 from zenml.orchestrators.step_launcher import StepLauncher
@@ -210,6 +211,8 @@ class BaseOrchestrator(StackComponent, ABC):
                 This will be deleted in case the pipeline deployment failed.
         Raises:
+            KeyboardInterrupt: If the orchestrator is synchronous and the
+                pipeline run is keyboard interrupted.
             RunMonitoringError: If a failure happened while monitoring the
                 pipeline run.
         """
@@ -324,8 +327,17 @@ class BaseOrchestrator(StackComponent, ABC):
                     if submission_result.wait_for_completion:
                         try:
                             submission_result.wait_for_completion()
+                        except KeyboardInterrupt:
+                            error_message = "Received KeyboardInterrupt. Note that the run is still executing. "
+                            if placeholder_run:
+                                error_message += (
+                                    "If you want to stop the pipeline run, please use: "
+                                    f"`zenml pipeline runs stop {placeholder_run.id}`"
+                                )
+                            raise KeyboardInterrupt(error_message)
                         except BaseException as e:
                             raise RunMonitoringError(original_exception=e)
         finally:
             self._cleanup_run()
@@ -391,6 +403,64 @@ class BaseOrchestrator(StackComponent, ABC):
             f"'{self.__class__.__name__}' orchestrator."
         )
+    def stop_run(
+        self, run: "PipelineRunResponse", graceful: bool = False
+    ) -> None:
+        """Stops a specific pipeline run.
+        This method should only be called if the orchestrator's
+        supports_cancellation property is True.
+        Args:
+            run: A pipeline run response to stop.
+            graceful: If True, allows for graceful shutdown where possible.
+                If False, forces immediate termination. Default is False.
+        Raises:
+            NotImplementedError: If any orchestrator inheriting from the base
+                class does not implement this logic.
+        """
+        # Check if the orchestrator supports cancellation
+        if (
+            getattr(self._stop_run, "__func__", None)
+            is BaseOrchestrator._stop_run
+        ):
+            raise NotImplementedError(
+                f"The '{self.__class__.__name__}' orchestrator does not "
+                "support stopping pipeline runs."
+            )
+        # Update pipeline status to STOPPING before calling concrete implementation
+        publish_pipeline_run_status_update(
+            pipeline_run_id=run.id,
+            status=ExecutionStatus.STOPPING,
+        )
+        # Now call the concrete implementation
+        self._stop_run(run=run, graceful=graceful)
+    def _stop_run(
+        self, run: "PipelineRunResponse", graceful: bool = False
+    ) -> None:
+        """Concrete implementation of pipeline stopping logic.
+        This method should be implemented by concrete orchestrator classes
+        instead of stop_run to ensure proper status management.
+        Args:
+            run: A pipeline run response to stop (already updated to STOPPING status).
+            graceful: If True, allows for graceful shutdown where possible.
+                If False, forces immediate termination. Default is True.
+        Raises:
+            NotImplementedError: If any orchestrator inheriting from the base
+                class does not implement this logic.
+        """
+        raise NotImplementedError(
+            "The stop run functionality is not implemented for the "
+            f"'{self.__class__.__name__}' orchestrator."
+        )
 class BaseOrchestratorFlavor(Flavor):
     """Base orchestrator flavor class."""

zenml/orchestrators/containerized_orchestrator.py CHANGED Viewed

@@ -53,6 +53,19 @@ class ContainerizedOrchestrator(BaseOrchestrator, ABC):
             component_key=ORCHESTRATOR_DOCKER_IMAGE_KEY, step=step_name
         )
+    def should_build_pipeline_image(
+        self, deployment: "PipelineDeploymentBase"
+    ) -> bool:
+        """Whether to build the pipeline image.
+        Args:
+            deployment: The pipeline deployment.
+        Returns:
+            Whether to build the pipeline image.
+        """
+        return False
     def get_docker_builds(
         self, deployment: "PipelineDeploymentBase"
     ) -> List["BuildConfiguration"]:
@@ -87,4 +100,13 @@ class ContainerizedOrchestrator(BaseOrchestrator, ABC):
                 builds.append(pipeline_build)
                 included_pipeline_build = True
+        if not included_pipeline_build and self.should_build_pipeline_image(
+            deployment
+        ):
+            pipeline_build = BuildConfiguration(
+                key=ORCHESTRATOR_DOCKER_IMAGE_KEY,
+                settings=pipeline_settings,
+            )
+            builds.append(pipeline_build)
         return builds

zenml/orchestrators/dag_runner.py CHANGED Viewed

@@ -56,6 +56,7 @@ class NodeStatus(Enum):
     RUNNING = "running"
     COMPLETED = "completed"
     FAILED = "failed"
+    CANCELLED = "cancelled"
 class ThreadedDagRunner:
@@ -76,6 +77,7 @@ class ThreadedDagRunner:
         finalize_fn: Optional[Callable[[Dict[str, NodeStatus]], None]] = None,
         parallel_node_startup_waiting_period: float = 0.0,
         max_parallelism: Optional[int] = None,
+        continue_fn: Optional[Callable[[], bool]] = None,
     ) -> None:
         """Define attributes and initialize all nodes in waiting state.
@@ -92,6 +94,9 @@ class ThreadedDagRunner:
             parallel_node_startup_waiting_period: Delay in seconds to wait in
                 between starting parallel nodes.
             max_parallelism: Maximum number of nodes to run in parallel
+            continue_fn: A function that returns True if the run should continue
+                after each step execution, False if it should stop (e.g., due
+                to cancellation). If None, execution continues normally.
         Raises:
             ValueError: If max_parallelism is not greater than 0.
@@ -108,12 +113,15 @@ class ThreadedDagRunner:
         self.run_fn = run_fn
         self.preparation_fn = preparation_fn
         self.finalize_fn = finalize_fn
+        self.continue_fn = continue_fn
         self.nodes = dag.keys()
         self.node_states = {
             node: NodeStatus.NOT_STARTED for node in self.nodes
         }
         self._lock = threading.Lock()
+        self._stop_requested = False
     def _can_run(self, node: str) -> bool:
         """Determine whether a node is ready to be run.
@@ -173,6 +181,15 @@ class ThreadedDagRunner:
         """
         self._prepare_node_run(node)
+        # Check if execution should continue (e.g., check for cancellation)
+        if self.continue_fn:
+            self._stop_requested = (
+                self._stop_requested or not self.continue_fn()
+            )
+            if self._stop_requested:
+                self._finish_node(node, cancelled=True)
+                return
         if self.preparation_fn:
             run_required = self.preparation_fn(node)
             if not run_required:
@@ -204,24 +221,26 @@ class ThreadedDagRunner:
         thread.start()
         return thread
-    def _finish_node(self, node: str, failed: bool = False) -> None:
-        """Finish a node run.
-        First updates the node status to completed.
-        Then starts all other nodes that can now be run and waits for them.
+    def _finish_node(
+        self, node: str, failed: bool = False, cancelled: bool = False
+    ) -> None:
+        """Mark a node as finished and potentially start new nodes.
         Args:
-            node: The node.
+            node: The node to mark as finished.
             failed: Whether the node failed.
+            cancelled: Whether the node was cancelled.
         """
         with self._lock:
             if failed:
                 self.node_states[node] = NodeStatus.FAILED
+            elif cancelled:
+                self.node_states[node] = NodeStatus.CANCELLED
             else:
                 self.node_states[node] = NodeStatus.COMPLETED
-        if failed:
-            # If the node failed, we don't need to run any downstream nodes.
+        if failed or cancelled:
+            # If the node failed or was cancelled, we don't need to run any downstream nodes.
             return
         # Run downstream nodes.

zenml/orchestrators/local_docker/local_docker_orchestrator.py CHANGED Viewed

@@ -63,6 +63,15 @@ class LocalDockerOrchestrator(ContainerizedOrchestrator):
         """
         return LocalDockerOrchestratorSettings
+    @property
+    def config(self) -> "LocalDockerOrchestratorConfig":
+        """Returns the `LocalDockerOrchestratorConfig` config.
+        Returns:
+            The configuration.
+        """
+        return cast(LocalDockerOrchestratorConfig, self._config)
     @property
     def validator(self) -> Optional[StackValidator]:
         """Ensures there is an image builder in the stack.

zenml/orchestrators/publish_utils.py CHANGED Viewed

@@ -13,7 +13,8 @@
 #  permissions and limitations under the License.
 """Utilities to publish pipeline and step runs."""
-from typing import TYPE_CHECKING, Dict, List
+from datetime import datetime
+from typing import TYPE_CHECKING, Dict, List, Optional
 from zenml.client import Client
 from zenml.enums import ExecutionStatus, MetadataResourceTypes
@@ -54,6 +55,40 @@ def publish_successful_step_run(
     )
+def publish_step_run_status_update(
+    step_run_id: "UUID",
+    status: "ExecutionStatus",
+    end_time: Optional[datetime] = None,
+) -> "StepRunResponse":
+    """Publishes a step run update.
+    Args:
+        step_run_id: ID of the step run.
+        status: New status of the step run.
+        end_time: New end time of the step run.
+    Returns:
+        The updated step run.
+    Raises:
+        ValueError: If the end time is set for a non-finished step run.
+    """
+    from zenml.client import Client
+    if end_time is not None and not status.is_finished:
+        raise ValueError("End time cannot be set for a non-finished step run.")
+    step_run = Client().zen_store.update_run_step(
+        step_run_id=step_run_id,
+        step_run_update=StepRunUpdate(
+            status=status,
+            end_time=end_time,
+        ),
+    )
+    return step_run
 def publish_failed_step_run(step_run_id: "UUID") -> "StepRunResponse":
     """Publishes a failed step run.
@@ -63,12 +98,10 @@ def publish_failed_step_run(step_run_id: "UUID") -> "StepRunResponse":
     Returns:
         The updated step run.
     """
-    return Client().zen_store.update_run_step(
+    return publish_step_run_status_update(
         step_run_id=step_run_id,
-        step_run_update=StepRunUpdate(
-            status=ExecutionStatus.FAILED,
-            end_time=utc_now(),
-        ),
+        status=ExecutionStatus.FAILED,
+        end_time=utc_now(),
     )
@@ -92,27 +125,81 @@ def publish_failed_pipeline_run(
     )
+def publish_pipeline_run_status_update(
+    pipeline_run_id: "UUID",
+    status: ExecutionStatus,
+    end_time: Optional[datetime] = None,
+) -> "PipelineRunResponse":
+    """Publishes a pipeline run status update.
+    Args:
+        pipeline_run_id: The ID of the pipeline run to update.
+        status: The new status for the pipeline run.
+        end_time: The end time for the pipeline run. If None, will be set to current time
+            for finished statuses.
+    Returns:
+        The updated pipeline run.
+    """
+    if end_time is None and status.is_finished:
+        end_time = utc_now()
+    return Client().zen_store.update_run(
+        run_id=pipeline_run_id,
+        run_update=PipelineRunUpdate(
+            status=status,
+            end_time=end_time,
+        ),
+    )
 def get_pipeline_run_status(
-    step_statuses: List[ExecutionStatus], num_steps: int
+    run_status: ExecutionStatus,
+    step_statuses: List[ExecutionStatus],
+    num_steps: int,
 ) -> ExecutionStatus:
     """Gets the pipeline run status for the given step statuses.
     Args:
+        run_status: The status of the run.
         step_statuses: The status of steps in this run.
         num_steps: The total amount of steps in this run.
     Returns:
         The run status.
     """
-    if ExecutionStatus.FAILED in step_statuses:
-        return ExecutionStatus.FAILED
-    if (
-        ExecutionStatus.RUNNING in step_statuses
-        or len(step_statuses) < num_steps
+    # STOPPING state
+    if run_status == ExecutionStatus.STOPPING:
+        if all(status.is_finished for status in step_statuses):
+            return ExecutionStatus.STOPPED
+        else:
+            return ExecutionStatus.STOPPING
+    # If there is a stopped step, the run is stopped or stopping
+    if ExecutionStatus.STOPPED in step_statuses:
+        if all(status.is_finished for status in step_statuses):
+            return ExecutionStatus.STOPPED
+        else:
+            return ExecutionStatus.STOPPING
+    # Otherwise, if there is a failed step, the run is failed
+    elif (
+        ExecutionStatus.FAILED in step_statuses
+        or run_status == ExecutionStatus.FAILED
     ):
+        return ExecutionStatus.FAILED
+    # If there is a running step, the run is running
+    elif ExecutionStatus.RUNNING in step_statuses:
+        return ExecutionStatus.RUNNING
+    # If there are less steps than the total number of steps, it is running
+    elif len(step_statuses) < num_steps:
         return ExecutionStatus.RUNNING
-    return ExecutionStatus.COMPLETED
+    # Any other state is completed
+    else:
+        return ExecutionStatus.COMPLETED
 def publish_pipeline_run_metadata(

zenml/orchestrators/step_launcher.py CHANGED Viewed

@@ -14,10 +14,11 @@
 """Class to launch (run directly or using a step operator) steps."""
 import os
+import signal
 import time
 from contextlib import nullcontext
 from functools import partial
-from typing import TYPE_CHECKING, Any, Callable, Dict, Tuple
+from typing import TYPE_CHECKING, Any, Callable, Dict, Optional, Tuple
 from zenml.client import Client
 from zenml.config.step_configurations import Step
@@ -29,6 +30,7 @@ from zenml.constants import (
 )
 from zenml.enums import ExecutionStatus
 from zenml.environment import get_run_environment_dict
+from zenml.exceptions import RunInterruptedException, RunStoppedException
 from zenml.logger import get_logger
 from zenml.logging import step_logging
 from zenml.models import (
@@ -53,7 +55,7 @@ logger = get_logger(__name__)
 def _get_step_operator(
-    stack: "Stack", step_operator_name: str
+    stack: "Stack", step_operator_name: Optional[str]
 ) -> "BaseStepOperator":
     """Fetches the step operator from the stack.
@@ -76,7 +78,7 @@ def _get_step_operator(
             f"No step operator specified for active stack '{stack.name}'."
         )
-    if step_operator_name != step_operator.name:
+    if step_operator_name and step_operator_name != step_operator.name:
         raise RuntimeError(
             f"No step operator named '{step_operator_name}' in active "
             f"stack '{stack.name}'."
@@ -131,11 +133,86 @@ class StepLauncher:
         self._stack = Stack.from_model(deployment.stack)
         self._step_name = step.spec.pipeline_parameter_name
+        # Internal properties and methods
+        self._step_run: Optional[StepRunResponse] = None
+        self._setup_signal_handlers()
+    def _setup_signal_handlers(self) -> None:
+        """Set up signal handlers for graceful shutdown, chaining previous handlers."""
+        # Save previous handlers
+        self._prev_sigterm_handler = signal.getsignal(signal.SIGTERM)
+        self._prev_sigint_handler = signal.getsignal(signal.SIGINT)
+        def signal_handler(signum: int, frame: Any) -> None:
+            """Handle shutdown signals gracefully.
+            Args:
+                signum: The signal number.
+                frame: The frame of the signal handler.
+            Raises:
+                RunStoppedException: If the pipeline run is stopped by the user.
+                RunInterruptedException: If the execution is interrupted for any
+                    other reason.
+            """
+            logger.info(
+                f"Received signal shutdown {signum}. Requesting shutdown "
+                f"for step '{self._step_name}'..."
+            )
+            try:
+                client = Client()
+                pipeline_run = None
+                if self._step_run:
+                    pipeline_run = client.get_pipeline_run(
+                        self._step_run.pipeline_run_id
+                    )
+                else:
+                    raise RunInterruptedException(
+                        "The execution was interrupted and the step does not "
+                        "exist yet."
+                    )
+                if pipeline_run and pipeline_run.status in [
+                    ExecutionStatus.STOPPING,
+                    ExecutionStatus.STOPPED,
+                ]:
+                    if self._step_run:
+                        publish_utils.publish_step_run_status_update(
+                            step_run_id=self._step_run.id,
+                            status=ExecutionStatus.STOPPED,
+                            end_time=utc_now(),
+                        )
+                    raise RunStoppedException("Pipeline run in stopped.")
+                else:
+                    raise RunInterruptedException(
+                        "The execution was interrupted."
+                    )
+            except (RunStoppedException, RunInterruptedException):
+                raise
+            except Exception as e:
+                raise RunInterruptedException(str(e))
+            finally:
+                # Chain to previous handler if it exists and is not default/ignore
+                if signum == signal.SIGTERM and callable(
+                    self._prev_sigterm_handler
+                ):
+                    self._prev_sigterm_handler(signum, frame)
+                elif signum == signal.SIGINT and callable(
+                    self._prev_sigint_handler
+                ):
+                    self._prev_sigint_handler(signum, frame)
+        # Register handlers for common termination signals
+        signal.signal(signal.SIGTERM, signal_handler)
+        signal.signal(signal.SIGINT, signal_handler)
     def launch(self) -> None:
         """Launches the step.
         Raises:
-            BaseException: If the step failed to launch, run, or publish.
+            RunStoppedException: If the pipeline run is stopped by the user.
         """
         pipeline_run, run_was_created = self._create_or_reuse_run()
@@ -207,6 +284,8 @@ class StepLauncher:
                     step_run = Client().zen_store.create_run_step(
                         step_run_request
                     )
+                    # Store step run ID for signal handler
+                    self._step_run = step_run
                     if model_version := step_run.model_version:
                         step_run_utils.log_model_version_dashboard_url(
                             model_version=model_version
@@ -259,6 +338,8 @@ class StepLauncher:
                                 force_write_logs=force_write_logs,
                             )
                             break
+                        except RunStoppedException as e:
+                            raise e
                         except BaseException as e:  # noqa: E722
                             retries += 1
                             if retries < max_retries:
@@ -292,10 +373,11 @@ class StepLauncher:
                             artifacts=step_run.outputs,
                             model_version=model_version,
                         )
+        except RunStoppedException:
+            logger.info(f"Pipeline run `{pipeline_run.name}` stopped.")
+            raise
         except:  # noqa: E722
             logger.error(f"Pipeline run `{pipeline_run.name}` failed.")
-            publish_utils.publish_failed_pipeline_run(pipeline_run.id)
             raise
     def _create_or_reuse_run(self) -> Tuple[PipelineRunResponse, bool]:
@@ -367,8 +449,12 @@ class StepLauncher:
         start_time = time.time()
         try:
             if self._step.config.step_operator:
+                step_operator_name = None
+                if isinstance(self._step.config.step_operator, str):
+                    step_operator_name = self._step.config.step_operator
                 self._run_step_with_step_operator(
-                    step_operator_name=self._step.config.step_operator,
+                    step_operator_name=step_operator_name,
                     step_run_info=step_run_info,
                     last_retry=last_retry,
                 )
@@ -395,7 +481,7 @@ class StepLauncher:
     def _run_step_with_step_operator(
         self,
-        step_operator_name: str,
+        step_operator_name: Optional[str],
         step_run_info: StepRunInfo,
         last_retry: bool,
     ) -> None:

zenml/stack/stack.py CHANGED Viewed

@@ -849,10 +849,10 @@ class Stack:
                 If the component is used in this step.
             """
             if component.type == StackComponentType.STEP_OPERATOR:
-                return component.name == step_config.step_operator
+                return step_config.uses_step_operator(component.name)
             if component.type == StackComponentType.EXPERIMENT_TRACKER:
-                return component.name == step_config.experiment_tracker
+                return step_config.uses_experiment_tracker(component.name)
             return True

zenml/utils/run_utils.py ADDED Viewed

@@ -0,0 +1,74 @@
+#  Copyright (c) ZenML GmbH 2025. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at:
+#
+#       https://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+#  or implied. See the License for the specific language governing
+#  permissions and limitations under the License.
+"""Utility functions for runs."""
+from typing import cast
+from zenml.enums import ExecutionStatus
+from zenml.exceptions import IllegalOperationError
+from zenml.models import PipelineRunResponse
+def stop_run(run: PipelineRunResponse, graceful: bool = False) -> None:
+    """Stop a pipeline run.
+    Args:
+        run: The pipeline run to stop.
+        graceful: Whether to stop the run gracefully.
+    Raises:
+        IllegalOperationError: If the run is already stopped or being stopped.
+        ValueError: If the stack is not accessible.
+    """
+    # Check if the stack is still accessible
+    if run.stack is None:
+        raise ValueError(
+            "The stack that this pipeline run response was executed on "
+            "is either not accessible or has been deleted."
+        )
+    # Check if pipeline can be stopped
+    if run.status == ExecutionStatus.COMPLETED:
+        raise IllegalOperationError(
+            "Cannot stop a run that is already completed."
+        )
+    if run.status == ExecutionStatus.STOPPED:
+        raise IllegalOperationError("Run is already stopped.")
+    if run.status == ExecutionStatus.STOPPING:
+        raise IllegalOperationError("Run is already being stopped.")
+    # Create the orchestrator instance
+    from zenml.enums import StackComponentType
+    from zenml.orchestrators.base_orchestrator import BaseOrchestrator
+    from zenml.stack.stack_component import StackComponent
+    # Check if the stack is still accessible
+    orchestrator_list = run.stack.components.get(
+        StackComponentType.ORCHESTRATOR, []
+    )
+    if len(orchestrator_list) == 0:
+        raise ValueError(
+            "The orchestrator that this pipeline run response was "
+            "executed with is either not accessible or has been deleted."
+        )
+    orchestrator = cast(
+        BaseOrchestrator,
+        StackComponent.from_model(component_model=orchestrator_list[0]),
+    )
+    # Stop the run
+    orchestrator.stop_run(run=run, graceful=graceful)

zenml-nightly 0.83.1.dev20250702__py3-none-any.whl → 0.83.1.dev20250704__py3-none-any.whl

zenml-nightly 0.83.1.dev20250702__py3-none-any.whl → 0.83.1.dev20250704__py3-none-any.whl