PyPI - zenml-nightly - Versions diffs - 0.83.1.dev20250701__py3-none-any.whl → 0.83.1.dev20250703__py3-none-any.whl - Mend

zenml-nightly 0.83.1.dev20250701__py3-none-any.whl → 0.83.1.dev20250703__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (27) hide show

zenml/orchestrators/dag_runner.py CHANGED Viewed

@@ -56,6 +56,7 @@ class NodeStatus(Enum):
     RUNNING = "running"
     COMPLETED = "completed"
     FAILED = "failed"
+    CANCELLED = "cancelled"
 class ThreadedDagRunner:
@@ -76,6 +77,7 @@ class ThreadedDagRunner:
         finalize_fn: Optional[Callable[[Dict[str, NodeStatus]], None]] = None,
         parallel_node_startup_waiting_period: float = 0.0,
         max_parallelism: Optional[int] = None,
+        continue_fn: Optional[Callable[[], bool]] = None,
     ) -> None:
         """Define attributes and initialize all nodes in waiting state.
@@ -92,6 +94,9 @@ class ThreadedDagRunner:
             parallel_node_startup_waiting_period: Delay in seconds to wait in
                 between starting parallel nodes.
             max_parallelism: Maximum number of nodes to run in parallel
+            continue_fn: A function that returns True if the run should continue
+                after each step execution, False if it should stop (e.g., due
+                to cancellation). If None, execution continues normally.
         Raises:
             ValueError: If max_parallelism is not greater than 0.
@@ -108,12 +113,15 @@ class ThreadedDagRunner:
         self.run_fn = run_fn
         self.preparation_fn = preparation_fn
         self.finalize_fn = finalize_fn
+        self.continue_fn = continue_fn
         self.nodes = dag.keys()
         self.node_states = {
             node: NodeStatus.NOT_STARTED for node in self.nodes
         }
         self._lock = threading.Lock()
+        self._stop_requested = False
     def _can_run(self, node: str) -> bool:
         """Determine whether a node is ready to be run.
@@ -173,6 +181,15 @@ class ThreadedDagRunner:
         """
         self._prepare_node_run(node)
+        # Check if execution should continue (e.g., check for cancellation)
+        if self.continue_fn:
+            self._stop_requested = (
+                self._stop_requested or not self.continue_fn()
+            )
+            if self._stop_requested:
+                self._finish_node(node, cancelled=True)
+                return
         if self.preparation_fn:
             run_required = self.preparation_fn(node)
             if not run_required:
@@ -204,24 +221,26 @@ class ThreadedDagRunner:
         thread.start()
         return thread
-    def _finish_node(self, node: str, failed: bool = False) -> None:
-        """Finish a node run.
-        First updates the node status to completed.
-        Then starts all other nodes that can now be run and waits for them.
+    def _finish_node(
+        self, node: str, failed: bool = False, cancelled: bool = False
+    ) -> None:
+        """Mark a node as finished and potentially start new nodes.
         Args:
-            node: The node.
+            node: The node to mark as finished.
             failed: Whether the node failed.
+            cancelled: Whether the node was cancelled.
         """
         with self._lock:
             if failed:
                 self.node_states[node] = NodeStatus.FAILED
+            elif cancelled:
+                self.node_states[node] = NodeStatus.CANCELLED
             else:
                 self.node_states[node] = NodeStatus.COMPLETED
-        if failed:
-            # If the node failed, we don't need to run any downstream nodes.
+        if failed or cancelled:
+            # If the node failed or was cancelled, we don't need to run any downstream nodes.
             return
         # Run downstream nodes.

zenml/orchestrators/local_docker/local_docker_orchestrator.py CHANGED Viewed

@@ -63,6 +63,15 @@ class LocalDockerOrchestrator(ContainerizedOrchestrator):
         """
         return LocalDockerOrchestratorSettings
+    @property
+    def config(self) -> "LocalDockerOrchestratorConfig":
+        """Returns the `LocalDockerOrchestratorConfig` config.
+        Returns:
+            The configuration.
+        """
+        return cast(LocalDockerOrchestratorConfig, self._config)
     @property
     def validator(self) -> Optional[StackValidator]:
         """Ensures there is an image builder in the stack.

zenml/orchestrators/publish_utils.py CHANGED Viewed

@@ -13,7 +13,8 @@
 #  permissions and limitations under the License.
 """Utilities to publish pipeline and step runs."""
-from typing import TYPE_CHECKING, Dict, List
+from datetime import datetime
+from typing import TYPE_CHECKING, Dict, List, Optional
 from zenml.client import Client
 from zenml.enums import ExecutionStatus, MetadataResourceTypes
@@ -54,6 +55,40 @@ def publish_successful_step_run(
     )
+def publish_step_run_status_update(
+    step_run_id: "UUID",
+    status: "ExecutionStatus",
+    end_time: Optional[datetime] = None,
+) -> "StepRunResponse":
+    """Publishes a step run update.
+    Args:
+        step_run_id: ID of the step run.
+        status: New status of the step run.
+        end_time: New end time of the step run.
+    Returns:
+        The updated step run.
+    Raises:
+        ValueError: If the end time is set for a non-finished step run.
+    """
+    from zenml.client import Client
+    if end_time is not None and not status.is_finished:
+        raise ValueError("End time cannot be set for a non-finished step run.")
+    step_run = Client().zen_store.update_run_step(
+        step_run_id=step_run_id,
+        step_run_update=StepRunUpdate(
+            status=status,
+            end_time=end_time,
+        ),
+    )
+    return step_run
 def publish_failed_step_run(step_run_id: "UUID") -> "StepRunResponse":
     """Publishes a failed step run.
@@ -63,12 +98,10 @@ def publish_failed_step_run(step_run_id: "UUID") -> "StepRunResponse":
     Returns:
         The updated step run.
     """
-    return Client().zen_store.update_run_step(
+    return publish_step_run_status_update(
         step_run_id=step_run_id,
-        step_run_update=StepRunUpdate(
-            status=ExecutionStatus.FAILED,
-            end_time=utc_now(),
-        ),
+        status=ExecutionStatus.FAILED,
+        end_time=utc_now(),
     )
@@ -92,27 +125,81 @@ def publish_failed_pipeline_run(
     )
+def publish_pipeline_run_status_update(
+    pipeline_run_id: "UUID",
+    status: ExecutionStatus,
+    end_time: Optional[datetime] = None,
+) -> "PipelineRunResponse":
+    """Publishes a pipeline run status update.
+    Args:
+        pipeline_run_id: The ID of the pipeline run to update.
+        status: The new status for the pipeline run.
+        end_time: The end time for the pipeline run. If None, will be set to current time
+            for finished statuses.
+    Returns:
+        The updated pipeline run.
+    """
+    if end_time is None and status.is_finished:
+        end_time = utc_now()
+    return Client().zen_store.update_run(
+        run_id=pipeline_run_id,
+        run_update=PipelineRunUpdate(
+            status=status,
+            end_time=end_time,
+        ),
+    )
 def get_pipeline_run_status(
-    step_statuses: List[ExecutionStatus], num_steps: int
+    run_status: ExecutionStatus,
+    step_statuses: List[ExecutionStatus],
+    num_steps: int,
 ) -> ExecutionStatus:
     """Gets the pipeline run status for the given step statuses.
     Args:
+        run_status: The status of the run.
         step_statuses: The status of steps in this run.
         num_steps: The total amount of steps in this run.
     Returns:
         The run status.
     """
-    if ExecutionStatus.FAILED in step_statuses:
-        return ExecutionStatus.FAILED
-    if (
-        ExecutionStatus.RUNNING in step_statuses
-        or len(step_statuses) < num_steps
+    # STOPPING state
+    if run_status == ExecutionStatus.STOPPING:
+        if all(status.is_finished for status in step_statuses):
+            return ExecutionStatus.STOPPED
+        else:
+            return ExecutionStatus.STOPPING
+    # If there is a stopped step, the run is stopped or stopping
+    if ExecutionStatus.STOPPED in step_statuses:
+        if all(status.is_finished for status in step_statuses):
+            return ExecutionStatus.STOPPED
+        else:
+            return ExecutionStatus.STOPPING
+    # Otherwise, if there is a failed step, the run is failed
+    elif (
+        ExecutionStatus.FAILED in step_statuses
+        or run_status == ExecutionStatus.FAILED
     ):
+        return ExecutionStatus.FAILED
+    # If there is a running step, the run is running
+    elif ExecutionStatus.RUNNING in step_statuses:
+        return ExecutionStatus.RUNNING
+    # If there are less steps than the total number of steps, it is running
+    elif len(step_statuses) < num_steps:
         return ExecutionStatus.RUNNING
-    return ExecutionStatus.COMPLETED
+    # Any other state is completed
+    else:
+        return ExecutionStatus.COMPLETED
 def publish_pipeline_run_metadata(

zenml/orchestrators/step_launcher.py CHANGED Viewed

@@ -14,10 +14,11 @@
 """Class to launch (run directly or using a step operator) steps."""
 import os
+import signal
 import time
 from contextlib import nullcontext
 from functools import partial
-from typing import TYPE_CHECKING, Any, Callable, Dict, Tuple
+from typing import TYPE_CHECKING, Any, Callable, Dict, Optional, Tuple
 from zenml.client import Client
 from zenml.config.step_configurations import Step
@@ -29,6 +30,7 @@ from zenml.constants import (
 )
 from zenml.enums import ExecutionStatus
 from zenml.environment import get_run_environment_dict
+from zenml.exceptions import RunInterruptedException, RunStoppedException
 from zenml.logger import get_logger
 from zenml.logging import step_logging
 from zenml.models import (
@@ -131,11 +133,86 @@ class StepLauncher:
         self._stack = Stack.from_model(deployment.stack)
         self._step_name = step.spec.pipeline_parameter_name
+        # Internal properties and methods
+        self._step_run: Optional[StepRunResponse] = None
+        self._setup_signal_handlers()
+    def _setup_signal_handlers(self) -> None:
+        """Set up signal handlers for graceful shutdown, chaining previous handlers."""
+        # Save previous handlers
+        self._prev_sigterm_handler = signal.getsignal(signal.SIGTERM)
+        self._prev_sigint_handler = signal.getsignal(signal.SIGINT)
+        def signal_handler(signum: int, frame: Any) -> None:
+            """Handle shutdown signals gracefully.
+            Args:
+                signum: The signal number.
+                frame: The frame of the signal handler.
+            Raises:
+                RunStoppedException: If the pipeline run is stopped by the user.
+                RunInterruptedException: If the execution is interrupted for any
+                    other reason.
+            """
+            logger.info(
+                f"Received signal shutdown {signum}. Requesting shutdown "
+                f"for step '{self._step_name}'..."
+            )
+            try:
+                client = Client()
+                pipeline_run = None
+                if self._step_run:
+                    pipeline_run = client.get_pipeline_run(
+                        self._step_run.pipeline_run_id
+                    )
+                else:
+                    raise RunInterruptedException(
+                        "The execution was interrupted and the step does not "
+                        "exist yet."
+                    )
+                if pipeline_run and pipeline_run.status in [
+                    ExecutionStatus.STOPPING,
+                    ExecutionStatus.STOPPED,
+                ]:
+                    if self._step_run:
+                        publish_utils.publish_step_run_status_update(
+                            step_run_id=self._step_run.id,
+                            status=ExecutionStatus.STOPPED,
+                            end_time=utc_now(),
+                        )
+                    raise RunStoppedException("Pipeline run in stopped.")
+                else:
+                    raise RunInterruptedException(
+                        "The execution was interrupted."
+                    )
+            except (RunStoppedException, RunInterruptedException):
+                raise
+            except Exception as e:
+                raise RunInterruptedException(str(e))
+            finally:
+                # Chain to previous handler if it exists and is not default/ignore
+                if signum == signal.SIGTERM and callable(
+                    self._prev_sigterm_handler
+                ):
+                    self._prev_sigterm_handler(signum, frame)
+                elif signum == signal.SIGINT and callable(
+                    self._prev_sigint_handler
+                ):
+                    self._prev_sigint_handler(signum, frame)
+        # Register handlers for common termination signals
+        signal.signal(signal.SIGTERM, signal_handler)
+        signal.signal(signal.SIGINT, signal_handler)
     def launch(self) -> None:
         """Launches the step.
         Raises:
-            BaseException: If the step failed to launch, run, or publish.
+            RunStoppedException: If the pipeline run is stopped by the user.
         """
         pipeline_run, run_was_created = self._create_or_reuse_run()
@@ -207,6 +284,8 @@ class StepLauncher:
                     step_run = Client().zen_store.create_run_step(
                         step_run_request
                     )
+                    # Store step run ID for signal handler
+                    self._step_run = step_run
                     if model_version := step_run.model_version:
                         step_run_utils.log_model_version_dashboard_url(
                             model_version=model_version
@@ -259,6 +338,8 @@ class StepLauncher:
                                 force_write_logs=force_write_logs,
                             )
                             break
+                        except RunStoppedException as e:
+                            raise e
                         except BaseException as e:  # noqa: E722
                             retries += 1
                             if retries < max_retries:
@@ -292,10 +373,11 @@ class StepLauncher:
                             artifacts=step_run.outputs,
                             model_version=model_version,
                         )
+        except RunStoppedException:
+            logger.info(f"Pipeline run `{pipeline_run.name}` stopped.")
+            raise
         except:  # noqa: E722
             logger.error(f"Pipeline run `{pipeline_run.name}` failed.")
-            publish_utils.publish_failed_pipeline_run(pipeline_run.id)
             raise
     def _create_or_reuse_run(self) -> Tuple[PipelineRunResponse, bool]:

zenml/utils/run_utils.py ADDED Viewed

@@ -0,0 +1,74 @@
+#  Copyright (c) ZenML GmbH 2025. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at:
+#
+#       https://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+#  or implied. See the License for the specific language governing
+#  permissions and limitations under the License.
+"""Utility functions for runs."""
+from typing import cast
+from zenml.enums import ExecutionStatus
+from zenml.exceptions import IllegalOperationError
+from zenml.models import PipelineRunResponse
+def stop_run(run: PipelineRunResponse, graceful: bool = False) -> None:
+    """Stop a pipeline run.
+    Args:
+        run: The pipeline run to stop.
+        graceful: Whether to stop the run gracefully.
+    Raises:
+        IllegalOperationError: If the run is already stopped or being stopped.
+        ValueError: If the stack is not accessible.
+    """
+    # Check if the stack is still accessible
+    if run.stack is None:
+        raise ValueError(
+            "The stack that this pipeline run response was executed on "
+            "is either not accessible or has been deleted."
+        )
+    # Check if pipeline can be stopped
+    if run.status == ExecutionStatus.COMPLETED:
+        raise IllegalOperationError(
+            "Cannot stop a run that is already completed."
+        )
+    if run.status == ExecutionStatus.STOPPED:
+        raise IllegalOperationError("Run is already stopped.")
+    if run.status == ExecutionStatus.STOPPING:
+        raise IllegalOperationError("Run is already being stopped.")
+    # Create the orchestrator instance
+    from zenml.enums import StackComponentType
+    from zenml.orchestrators.base_orchestrator import BaseOrchestrator
+    from zenml.stack.stack_component import StackComponent
+    # Check if the stack is still accessible
+    orchestrator_list = run.stack.components.get(
+        StackComponentType.ORCHESTRATOR, []
+    )
+    if len(orchestrator_list) == 0:
+        raise ValueError(
+            "The orchestrator that this pipeline run response was "
+            "executed with is either not accessible or has been deleted."
+        )
+    orchestrator = cast(
+        BaseOrchestrator,
+        StackComponent.from_model(component_model=orchestrator_list[0]),
+    )
+    # Stop the run
+    orchestrator.stop_run(run=run, graceful=graceful)

zenml/zen_server/routers/runs_endpoints.py CHANGED Viewed

@@ -25,6 +25,7 @@ from zenml.constants import (
     RUNS,
     STATUS,
     STEPS,
+    STOP,
     VERSION_1,
 )
 from zenml.enums import ExecutionStatus, StackComponentType
@@ -40,6 +41,7 @@ from zenml.models import (
     StepRunFilter,
     StepRunResponse,
 )
+from zenml.utils import run_utils
 from zenml.zen_server.auth import AuthContext, authorize
 from zenml.zen_server.exceptions import error_response
 from zenml.zen_server.rbac.endpoint_utils import (
@@ -51,6 +53,7 @@ from zenml.zen_server.rbac.endpoint_utils import (
 )
 from zenml.zen_server.rbac.models import Action, ResourceType
 from zenml.zen_server.rbac.utils import (
+    dehydrate_response_model,
     verify_permission_for_model,
 )
 from zenml.zen_server.routers.projects_endpoints import workspace_router
@@ -389,38 +392,39 @@ def refresh_run_status(
     Args:
         run_id: ID of the pipeline run to refresh.
-    Raises:
-        RuntimeError: If the stack or the orchestrator of the run is deleted.
     """
-    # Verify access to the run
     run = verify_permissions_and_get_entity(
         id=run_id,
         get_method=zen_store().get_run,
         hydrate=True,
     )
-    # Check the stack and its orchestrator
-    if run.stack is not None:
-        orchestrators = run.stack.components.get(
-            StackComponentType.ORCHESTRATOR, []
-        )
-        if orchestrators:
-            verify_permission_for_model(
-                model=orchestrators[0], action=Action.READ
-            )
-        else:
-            raise RuntimeError(
-                f"The orchestrator, the run '{run.id}' was executed with, is "
-                "deleted."
-            )
-    else:
-        raise RuntimeError(
-            f"The stack, the run '{run.id}' was executed on, is deleted."
-        )
     run.refresh_run_status()
+@router.post(
+    "/{run_id}" + STOP,
+    responses={401: error_response, 404: error_response, 422: error_response},
+)
+@async_fastapi_endpoint_wrapper
+def stop_run(
+    run_id: UUID,
+    graceful: bool = False,
+    _: AuthContext = Security(authorize),
+) -> None:
+    """Stops a specific pipeline run.
+    Args:
+        run_id: ID of the pipeline run to stop.
+        graceful: If True, allows for graceful shutdown where possible.
+            If False, forces immediate termination. Default is False.
+    """
+    run = zen_store().get_run(run_id, hydrate=True)
+    verify_permission_for_model(run, action=Action.READ)
+    verify_permission_for_model(run, action=Action.UPDATE)
+    dehydrate_response_model(run)
+    run_utils.stop_run(run=run, graceful=graceful)
 @router.get(
     "/{run_id}/logs",
     responses={

zenml/zen_stores/sql_zen_store.py CHANGED Viewed

@@ -6119,7 +6119,14 @@ class SqlZenStore(BaseZenStore):
                 resources=existing_run,
                 session=session,
             )
+            if run_update.status is not None:
+                self._update_pipeline_run_status(
+                    pipeline_run_id=run_id,
+                    session=session,
+                )
             session.refresh(existing_run)
             return existing_run.to_model(
                 include_metadata=True, include_resources=True
             )
@@ -8824,6 +8831,7 @@ class SqlZenStore(BaseZenStore):
         Raises:
             EntityExistsError: if the step run already exists.
+            IllegalOperationError: if the pipeline run is stopped or stopping.
         """
         with Session(self.engine) as session:
             self._set_request_user_id(request_model=step_run, session=session)
@@ -8835,6 +8843,16 @@ class SqlZenStore(BaseZenStore):
                 reference_id=step_run.pipeline_run_id,
                 session=session,
             )
+            # Validate pipeline status before creating step
+            if run.status in [
+                ExecutionStatus.STOPPING,
+                ExecutionStatus.STOPPED,
+            ]:
+                raise IllegalOperationError(
+                    f"Cannot create step '{step_run.name}' for pipeline in "
+                    f"{run.status} state. Pipeline run ID: {step_run.pipeline_run_id}"
+                )
             self._get_reference_schema_by_id(
                 resource=step_run,
                 reference_schema=StepRunSchema,
@@ -8996,6 +9014,8 @@ class SqlZenStore(BaseZenStore):
                         session=session,
                     )
+            session.commit()
             if step_run.status != ExecutionStatus.RUNNING:
                 self._update_pipeline_run_status(
                     pipeline_run_id=step_run.pipeline_run_id, session=session
@@ -9130,15 +9150,14 @@ class SqlZenStore(BaseZenStore):
                     input_type=StepRunInputArtifactType.MANUAL,
                     session=session,
                 )
+            session.commit()
+            session.refresh(existing_step_run)
             self._update_pipeline_run_status(
                 pipeline_run_id=existing_step_run.pipeline_run_id,
                 session=session,
             )
-            session.commit()
-            session.refresh(existing_step_run)
             return existing_step_run.to_model(
                 include_metadata=True, include_resources=True
             )
@@ -9375,6 +9394,7 @@ class SqlZenStore(BaseZenStore):
         assert pipeline_run.deployment
         num_steps = pipeline_run.deployment.step_count
         new_status = get_pipeline_run_status(
+            run_status=ExecutionStatus(pipeline_run.status),
             step_statuses=[
                 ExecutionStatus(status) for status in step_run_statuses
             ],

{zenml_nightly-0.83.1.dev20250701.dist-info → zenml_nightly-0.83.1.dev20250703.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: zenml-nightly
-Version: 0.83.1.dev20250701
+Version: 0.83.1.dev20250703
 Summary: ZenML: Write production-ready ML code.
 License: Apache-2.0
 Keywords: machine learning,production,pipeline,mlops,devops

zenml-nightly 0.83.1.dev20250701__py3-none-any.whl → 0.83.1.dev20250703__py3-none-any.whl

zenml-nightly 0.83.1.dev20250701__py3-none-any.whl → 0.83.1.dev20250703__py3-none-any.whl