zenml-nightly 0.83.1.dev20250702__py3-none-any.whl → 0.83.1.dev20250704__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37)
  1. zenml/VERSION +1 -1
  2. zenml/cli/pipeline.py +54 -1
  3. zenml/cli/utils.py +2 -0
  4. zenml/config/compiler.py +19 -3
  5. zenml/config/step_configurations.py +34 -2
  6. zenml/constants.py +1 -0
  7. zenml/enums.py +6 -3
  8. zenml/exceptions.py +8 -0
  9. zenml/integrations/aws/orchestrators/sagemaker_orchestrator.py +8 -4
  10. zenml/integrations/aws/step_operators/sagemaker_step_operator.py +1 -1
  11. zenml/integrations/azure/orchestrators/azureml_orchestrator.py +5 -3
  12. zenml/integrations/azure/step_operators/azureml_step_operator.py +1 -1
  13. zenml/integrations/gcp/orchestrators/vertex_orchestrator.py +7 -8
  14. zenml/integrations/gcp/step_operators/vertex_step_operator.py +1 -1
  15. zenml/integrations/kubernetes/flavors/kubernetes_orchestrator_flavor.py +6 -0
  16. zenml/integrations/kubernetes/orchestrators/kubernetes_orchestrator.py +109 -1
  17. zenml/integrations/kubernetes/orchestrators/kubernetes_orchestrator_entrypoint.py +36 -1
  18. zenml/integrations/kubernetes/orchestrators/manifest_utils.py +11 -3
  19. zenml/integrations/kubernetes/step_operators/kubernetes_step_operator.py +1 -1
  20. zenml/integrations/modal/step_operators/modal_step_operator.py +1 -1
  21. zenml/integrations/spark/step_operators/kubernetes_step_operator.py +1 -1
  22. zenml/models/v2/core/pipeline_run.py +2 -2
  23. zenml/orchestrators/base_orchestrator.py +70 -0
  24. zenml/orchestrators/containerized_orchestrator.py +22 -0
  25. zenml/orchestrators/dag_runner.py +27 -8
  26. zenml/orchestrators/local_docker/local_docker_orchestrator.py +9 -0
  27. zenml/orchestrators/publish_utils.py +100 -13
  28. zenml/orchestrators/step_launcher.py +94 -8
  29. zenml/stack/stack.py +2 -2
  30. zenml/utils/run_utils.py +74 -0
  31. zenml/zen_server/routers/runs_endpoints.py +27 -23
  32. zenml/zen_stores/sql_zen_store.py +23 -3
  33. {zenml_nightly-0.83.1.dev20250702.dist-info → zenml_nightly-0.83.1.dev20250704.dist-info}/METADATA +1 -1
  34. {zenml_nightly-0.83.1.dev20250702.dist-info → zenml_nightly-0.83.1.dev20250704.dist-info}/RECORD +37 -36
  35. {zenml_nightly-0.83.1.dev20250702.dist-info → zenml_nightly-0.83.1.dev20250704.dist-info}/LICENSE +0 -0
  36. {zenml_nightly-0.83.1.dev20250702.dist-info → zenml_nightly-0.83.1.dev20250704.dist-info}/WHEEL +0 -0
  37. {zenml_nightly-0.83.1.dev20250702.dist-info → zenml_nightly-0.83.1.dev20250704.dist-info}/entry_points.txt +0 -0
zenml/VERSION CHANGED
@@ -1 +1 @@
1
- 0.83.1.dev20250702
1
+ 0.83.1.dev20250704
zenml/cli/pipeline.py CHANGED
@@ -34,7 +34,7 @@ from zenml.models import (
34
34
  ScheduleFilter,
35
35
  )
36
36
  from zenml.pipelines.pipeline_definition import Pipeline
37
- from zenml.utils import source_utils, uuid_utils
37
+ from zenml.utils import run_utils, source_utils, uuid_utils
38
38
  from zenml.utils.yaml_utils import write_yaml
39
39
 
40
40
  logger = get_logger(__name__)
@@ -511,6 +511,59 @@ def list_pipeline_runs(**kwargs: Any) -> None:
511
511
  cli_utils.print_page_info(pipeline_runs)
512
512
 
513
513
 
514
+ @runs.command("stop")
515
+ @click.argument("run_name_or_id", type=str, required=True)
516
+ @click.option(
517
+ "--graceful",
518
+ "-g",
519
+ is_flag=True,
520
+ default=False,
521
+ help="Use graceful shutdown (default is False).",
522
+ )
523
+ @click.option(
524
+ "--yes",
525
+ "-y",
526
+ is_flag=True,
527
+ default=False,
528
+ help="Don't ask for confirmation.",
529
+ )
530
+ def stop_pipeline_run(
531
+ run_name_or_id: str,
532
+ graceful: bool = False,
533
+ yes: bool = False,
534
+ ) -> None:
535
+ """Stop a running pipeline.
536
+
537
+ Args:
538
+ run_name_or_id: The name or ID of the pipeline run to stop.
539
+ graceful: If True, uses graceful shutdown. If False, forces immediate termination.
540
+ yes: If set, don't ask for confirmation.
541
+ """
542
+ # Ask for confirmation to stop run.
543
+ if not yes:
544
+ action = "gracefully stop" if graceful else "force stop"
545
+ confirmation = cli_utils.confirmation(
546
+ f"Are you sure you want to {action} pipeline run `{run_name_or_id}`?"
547
+ )
548
+ if not confirmation:
549
+ cli_utils.declare("Not stopping the pipeline run.")
550
+ return
551
+
552
+ # Stop run.
553
+ try:
554
+ run = Client().get_pipeline_run(name_id_or_prefix=run_name_or_id)
555
+ run_utils.stop_run(run=run, graceful=graceful)
556
+ action = "Gracefully stopped" if graceful else "Force stopped"
557
+ cli_utils.declare(f"{action} pipeline run '{run.name}'.")
558
+ except NotImplementedError:
559
+ cli_utils.error(
560
+ "The orchestrator used for this pipeline run does not support "
561
+ f"{'gracefully' if graceful else 'forcefully'} stopping runs."
562
+ )
563
+ except Exception as e:
564
+ cli_utils.error(f"Failed to stop pipeline run: {e}")
565
+
566
+
514
567
  @runs.command("delete")
515
568
  @click.argument("run_name_or_id", type=str, required=True)
516
569
  @click.option(
zenml/cli/utils.py CHANGED
@@ -2214,6 +2214,8 @@ def get_execution_status_emoji(status: "ExecutionStatus") -> str:
2214
2214
  return ":white_check_mark:"
2215
2215
  if status == ExecutionStatus.CACHED:
2216
2216
  return ":package:"
2217
+ if status == ExecutionStatus.STOPPED or status == ExecutionStatus.STOPPING:
2218
+ return ":stop_sign:"
2217
2219
  raise RuntimeError(f"Unknown status: {status}")
2218
2220
 
2219
2221
 
zenml/config/compiler.py CHANGED
@@ -556,7 +556,16 @@ class Compiler:
556
556
 
557
557
  for name, step in steps.items():
558
558
  step_operator = step.config.step_operator
559
- if step_operator and step_operator not in available_step_operators:
559
+ if step_operator is True:
560
+ if not available_step_operators:
561
+ raise StackValidationError(
562
+ f"Step `{name}` requires a step operator, but no step "
563
+ f"operators are configured in the stack '{stack.name}'."
564
+ )
565
+ elif (
566
+ isinstance(step_operator, str)
567
+ and step_operator not in available_step_operators
568
+ ):
560
569
  raise StackValidationError(
561
570
  f"Step `{name}` requires step operator "
562
571
  f"'{step_operator}' which is not configured in "
@@ -565,8 +574,15 @@ class Compiler:
565
574
  )
566
575
 
567
576
  experiment_tracker = step.config.experiment_tracker
568
- if (
569
- experiment_tracker
577
+ if experiment_tracker is True:
578
+ if not available_experiment_trackers:
579
+ raise StackValidationError(
580
+ f"Step `{name}` requires an experiment tracker, but no "
581
+ f"experiment trackers are configured in the stack "
582
+ f"'{stack.name}'."
583
+ )
584
+ elif (
585
+ isinstance(experiment_tracker, str)
570
586
  and experiment_tracker not in available_experiment_trackers
571
587
  ):
572
588
  raise StackValidationError(
@@ -145,8 +145,8 @@ class StepConfigurationUpdate(StrictBaseModel):
145
145
  enable_artifact_metadata: Optional[bool] = None
146
146
  enable_artifact_visualization: Optional[bool] = None
147
147
  enable_step_logs: Optional[bool] = None
148
- step_operator: Optional[str] = None
149
- experiment_tracker: Optional[str] = None
148
+ step_operator: Optional[Union[bool, str]] = None
149
+ experiment_tracker: Optional[Union[bool, str]] = None
150
150
  parameters: Dict[str, Any] = {}
151
151
  settings: Dict[str, SerializeAsAny[BaseSettings]] = {}
152
152
  extra: Dict[str, Any] = {}
@@ -158,6 +158,38 @@ class StepConfigurationUpdate(StrictBaseModel):
158
158
 
159
159
  outputs: Mapping[str, PartialArtifactConfiguration] = {}
160
160
 
161
+ def uses_step_operator(self, name: str) -> bool:
162
+ """Checks if the step configuration uses the given step operator.
163
+
164
+ Args:
165
+ name: The name of the step operator.
166
+
167
+ Returns:
168
+ If the step configuration uses the given step operator.
169
+ """
170
+ if self.step_operator is True:
171
+ return True
172
+ elif isinstance(self.step_operator, str):
173
+ return self.step_operator == name
174
+ else:
175
+ return False
176
+
177
+ def uses_experiment_tracker(self, name: str) -> bool:
178
+ """Checks if the step configuration uses the given experiment tracker.
179
+
180
+ Args:
181
+ name: The name of the experiment tracker.
182
+
183
+ Returns:
184
+ If the step configuration uses the given experiment tracker.
185
+ """
186
+ if self.experiment_tracker is True:
187
+ return True
188
+ elif isinstance(self.experiment_tracker, str):
189
+ return self.experiment_tracker == name
190
+ else:
191
+ return False
192
+
161
193
 
162
194
  class PartialStepConfiguration(StepConfigurationUpdate):
163
195
  """Class representing a partial step configuration."""
zenml/constants.py CHANGED
@@ -416,6 +416,7 @@ STATISTICS = "/statistics"
416
416
  STATUS = "/status"
417
417
  STEP_CONFIGURATION = "/step-configuration"
418
418
  STEPS = "/steps"
419
+ STOP = "/stop"
419
420
  TAGS = "/tags"
420
421
  TAG_RESOURCES = "/tag_resources"
421
422
  TRIGGERS = "/triggers"
zenml/enums.py CHANGED
@@ -71,25 +71,28 @@ class ZenMLServiceType(StrEnum):
71
71
 
72
72
 
73
73
  class ExecutionStatus(StrEnum):
74
- """Enum that represents the current status of a step or pipeline run."""
74
+ """Enum that represents the execution status of a step or pipeline run."""
75
75
 
76
76
  INITIALIZING = "initializing"
77
77
  FAILED = "failed"
78
78
  COMPLETED = "completed"
79
79
  RUNNING = "running"
80
80
  CACHED = "cached"
81
+ STOPPED = "stopped"
82
+ STOPPING = "stopping"
81
83
 
82
84
  @property
83
85
  def is_finished(self) -> bool:
84
- """Whether the execution status refers to a finished execution.
86
+ """Returns whether the execution status is in a finished state.
85
87
 
86
88
  Returns:
87
- Whether the execution status refers to a finished execution.
89
+ Whether the execution status is finished.
88
90
  """
89
91
  return self in {
90
92
  ExecutionStatus.FAILED,
91
93
  ExecutionStatus.COMPLETED,
92
94
  ExecutionStatus.CACHED,
95
+ ExecutionStatus.STOPPED,
93
96
  }
94
97
 
95
98
 
zenml/exceptions.py CHANGED
@@ -122,6 +122,14 @@ class IllegalOperationError(ZenMLBaseException):
122
122
  """Raised when an illegal operation is attempted."""
123
123
 
124
124
 
125
+ class RunStoppedException(ZenMLBaseException):
126
+ """Raised when a ZenML pipeline run gets stopped by the user."""
127
+
128
+
129
+ class RunInterruptedException(ZenMLBaseException):
130
+ """Raised when a ZenML step gets interrupted for an unknown reason."""
131
+
132
+
125
133
  class MethodNotAllowedError(ZenMLBaseException):
126
134
  """Raised when the server does not allow a request method."""
127
135
 
@@ -853,12 +853,16 @@ class SagemakerOrchestrator(ContainerizedOrchestrator):
853
853
  )["PipelineExecutionStatus"]
854
854
 
855
855
  # Map the potential outputs to ZenML ExecutionStatus. Potential values:
856
- # https://cloud.google.com/vertex-ai/docs/reference/rest/v1beta1/PipelineState
857
- if status in ["Executing", "Stopping"]:
856
+ # https://docs.aws.amazon.com/sagemaker/latest/APIReference/API_DescribePipelineExecution.html
857
+ if status == "Executing":
858
858
  return ExecutionStatus.RUNNING
859
- elif status in ["Stopped", "Failed"]:
859
+ elif status == "Stopping":
860
+ return ExecutionStatus.STOPPING
861
+ elif status == "Stopped":
862
+ return ExecutionStatus.STOPPED
863
+ elif status == "Failed":
860
864
  return ExecutionStatus.FAILED
861
- elif status in ["Succeeded"]:
865
+ elif status == "Succeeded":
862
866
  return ExecutionStatus.COMPLETED
863
867
  else:
864
868
  raise ValueError("Unknown status for the pipeline execution.")
@@ -152,7 +152,7 @@ class SagemakerStepOperator(BaseStepOperator):
152
152
  """
153
153
  builds = []
154
154
  for step_name, step in deployment.step_configurations.items():
155
- if step.config.step_operator == self.name:
155
+ if step.config.uses_step_operator(self.name):
156
156
  build = BuildConfiguration(
157
157
  key=SAGEMAKER_DOCKER_IMAGE_KEY,
158
158
  settings=step.config.docker_settings,
@@ -515,14 +515,16 @@ class AzureMLOrchestrator(ContainerizedOrchestrator):
515
515
  return ExecutionStatus.INITIALIZING
516
516
  elif status in ["Running", "Finalizing"]:
517
517
  return ExecutionStatus.RUNNING
518
+ elif status == "CancelRequested":
519
+ return ExecutionStatus.STOPPING
520
+ elif status == "Canceled":
521
+ return ExecutionStatus.STOPPED
518
522
  elif status in [
519
- "CancelRequested",
520
523
  "Failed",
521
- "Canceled",
522
524
  "NotResponding",
523
525
  ]:
524
526
  return ExecutionStatus.FAILED
525
- elif status in ["Completed"]:
527
+ elif status == "Completed":
526
528
  return ExecutionStatus.COMPLETED
527
529
  else:
528
530
  raise ValueError("Unknown status for the pipeline job.")
@@ -149,7 +149,7 @@ class AzureMLStepOperator(BaseStepOperator):
149
149
  """
150
150
  builds = []
151
151
  for step_name, step in deployment.step_configurations.items():
152
- if step.config.step_operator == self.name:
152
+ if step.config.uses_step_operator(self.name):
153
153
  build = BuildConfiguration(
154
154
  key=AZUREML_STEP_OPERATOR_DOCKER_IMAGE_KEY,
155
155
  settings=step.config.docker_settings,
@@ -942,7 +942,7 @@ class VertexOrchestrator(ContainerizedOrchestrator, GoogleCredentialsMixin):
942
942
 
943
943
  # Map the potential outputs to ZenML ExecutionStatus. Potential values:
944
944
  # https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/sagemaker/client/describe_pipeline_execution.html#
945
- if status in [PipelineState.PIPELINE_STATE_UNSPECIFIED]:
945
+ if status == PipelineState.PIPELINE_STATE_UNSPECIFIED:
946
946
  return run.status
947
947
  elif status in [
948
948
  PipelineState.PIPELINE_STATE_QUEUED,
@@ -954,14 +954,13 @@ class VertexOrchestrator(ContainerizedOrchestrator, GoogleCredentialsMixin):
954
954
  PipelineState.PIPELINE_STATE_PAUSED,
955
955
  ]:
956
956
  return ExecutionStatus.RUNNING
957
- elif status in [PipelineState.PIPELINE_STATE_SUCCEEDED]:
957
+ elif status == PipelineState.PIPELINE_STATE_SUCCEEDED:
958
958
  return ExecutionStatus.COMPLETED
959
-
960
- elif status in [
961
- PipelineState.PIPELINE_STATE_FAILED,
962
- PipelineState.PIPELINE_STATE_CANCELLING,
963
- PipelineState.PIPELINE_STATE_CANCELLED,
964
- ]:
959
+ elif status == PipelineState.PIPELINE_STATE_CANCELLING:
960
+ return ExecutionStatus.STOPPING
961
+ elif status == PipelineState.PIPELINE_STATE_CANCELLED:
962
+ return ExecutionStatus.STOPPED
963
+ elif status == PipelineState.PIPELINE_STATE_FAILED:
965
964
  return ExecutionStatus.FAILED
966
965
  else:
967
966
  raise ValueError("Unknown status for the pipeline job.")
@@ -161,7 +161,7 @@ class VertexStepOperator(BaseStepOperator, GoogleCredentialsMixin):
161
161
  """
162
162
  builds = []
163
163
  for step_name, step in deployment.step_configurations.items():
164
- if step.config.step_operator == self.name:
164
+ if step.config.uses_step_operator(self.name):
165
165
  build = BuildConfiguration(
166
166
  key=VERTEX_DOCKER_IMAGE_KEY,
167
167
  settings=step.config.docker_settings,
@@ -69,6 +69,10 @@ class KubernetesOrchestratorSettings(BaseSettings):
69
69
  scheduling a pipeline.
70
70
  prevent_orchestrator_pod_caching: If `True`, the orchestrator pod will
71
71
  not try to compute cached steps before starting the step pods.
72
+ always_build_pipeline_image: If `True`, the orchestrator will always
73
+ build the pipeline image, even if all steps have a custom build.
74
+ pod_stop_grace_period: When stopping a pipeline run, the amount of
75
+ seconds to wait for a step pod to shutdown gracefully.
72
76
  """
73
77
 
74
78
  synchronous: bool = True
@@ -88,6 +92,8 @@ class KubernetesOrchestratorSettings(BaseSettings):
88
92
  failed_jobs_history_limit: Optional[NonNegativeInt] = None
89
93
  ttl_seconds_after_finished: Optional[NonNegativeInt] = None
90
94
  prevent_orchestrator_pod_caching: bool = False
95
+ always_build_pipeline_image: bool = False
96
+ pod_stop_grace_period: PositiveInt = 30
91
97
 
92
98
 
93
99
  class KubernetesOrchestratorConfig(
@@ -70,7 +70,11 @@ from zenml.orchestrators.utils import get_orchestrator_run_name
70
70
  from zenml.stack import StackValidator
71
71
 
72
72
  if TYPE_CHECKING:
73
- from zenml.models import PipelineDeploymentResponse, PipelineRunResponse
73
+ from zenml.models import (
74
+ PipelineDeploymentBase,
75
+ PipelineDeploymentResponse,
76
+ PipelineRunResponse,
77
+ )
74
78
  from zenml.stack import Stack
75
79
 
76
80
  logger = get_logger(__name__)
@@ -84,6 +88,22 @@ class KubernetesOrchestrator(ContainerizedOrchestrator):
84
88
 
85
89
  _k8s_client: Optional[k8s_client.ApiClient] = None
86
90
 
91
+ def should_build_pipeline_image(
92
+ self, deployment: "PipelineDeploymentBase"
93
+ ) -> bool:
94
+ """Whether to always build the pipeline image.
95
+
96
+ Args:
97
+ deployment: The pipeline deployment.
98
+
99
+ Returns:
100
+ Whether to always build the pipeline image.
101
+ """
102
+ settings = cast(
103
+ KubernetesOrchestratorSettings, self.get_settings(deployment)
104
+ )
105
+ return settings.always_build_pipeline_image
106
+
87
107
  def get_kube_client(
88
108
  self, incluster: Optional[bool] = None
89
109
  ) -> k8s_client.ApiClient:
@@ -545,6 +565,7 @@ class KubernetesOrchestrator(ContainerizedOrchestrator):
545
565
  successful_jobs_history_limit=settings.successful_jobs_history_limit,
546
566
  failed_jobs_history_limit=settings.failed_jobs_history_limit,
547
567
  ttl_seconds_after_finished=settings.ttl_seconds_after_finished,
568
+ termination_grace_period_seconds=settings.pod_stop_grace_period,
548
569
  labels=orchestrator_pod_labels,
549
570
  )
550
571
 
@@ -570,6 +591,7 @@ class KubernetesOrchestrator(ContainerizedOrchestrator):
570
591
  env=environment,
571
592
  labels=orchestrator_pod_labels,
572
593
  mount_local_stores=self.config.is_local,
594
+ termination_grace_period_seconds=settings.pod_stop_grace_period,
573
595
  )
574
596
 
575
597
  kube_utils.create_and_wait_for_pod_to_start(
@@ -663,6 +685,92 @@ class KubernetesOrchestrator(ContainerizedOrchestrator):
663
685
  f"{ENV_ZENML_KUBERNETES_RUN_ID}."
664
686
  )
665
687
 
688
+ def _stop_run(
689
+ self, run: "PipelineRunResponse", graceful: bool = True
690
+ ) -> None:
691
+ """Stops a specific pipeline run by terminating step pods.
692
+
693
+ Args:
694
+ run: The run that was executed by this orchestrator.
695
+ graceful: If True, does nothing (lets the orchestrator and steps finish naturally).
696
+ If False, stops all running step pods.
697
+
698
+ Raises:
699
+ RuntimeError: If we fail to stop the run.
700
+ """
701
+ # If graceful, do nothing and let the orchestrator handle the stop naturally
702
+ if graceful:
703
+ logger.info(
704
+ "Graceful stop requested - the orchestrator pod will handle "
705
+ "stopping naturally"
706
+ )
707
+ return
708
+
709
+ pods_stopped = []
710
+ errors = []
711
+
712
+ # Find all pods with the orchestrator run ID label
713
+ label_selector = f"run_id={kube_utils.sanitize_label(str(run.id))}"
714
+ try:
715
+ pods = self._k8s_core_api.list_namespaced_pod(
716
+ namespace=self.config.kubernetes_namespace,
717
+ label_selector=label_selector,
718
+ )
719
+ except Exception as e:
720
+ raise RuntimeError(
721
+ f"Failed to list step pods with run ID {run.id}: {e}"
722
+ )
723
+
724
+ # Filter to only include running or pending pods
725
+ for pod in pods.items:
726
+ if pod.status.phase not in ["Running", "Pending"]:
727
+ logger.debug(
728
+ f"Skipping pod {pod.metadata.name} with status {pod.status.phase}"
729
+ )
730
+ continue
731
+
732
+ try:
733
+ self._k8s_core_api.delete_namespaced_pod(
734
+ name=pod.metadata.name,
735
+ namespace=self.config.kubernetes_namespace,
736
+ )
737
+ pods_stopped.append(f"step pod: {pod.metadata.name}")
738
+ logger.debug(
739
+ f"Successfully initiated graceful stop of step pod: {pod.metadata.name}"
740
+ )
741
+ except Exception as e:
742
+ error_msg = f"Failed to stop step pod {pod.metadata.name}: {e}"
743
+ logger.warning(error_msg)
744
+ errors.append(error_msg)
745
+
746
+ # Summary logging
747
+ settings = cast(KubernetesOrchestratorSettings, self.get_settings(run))
748
+ grace_period_seconds = settings.pod_stop_grace_period
749
+ if pods_stopped:
750
+ logger.debug(
751
+ f"Successfully initiated graceful termination of: {', '.join(pods_stopped)}. "
752
+ f"Pods will terminate within {grace_period_seconds} seconds."
753
+ )
754
+
755
+ if errors:
756
+ error_summary = "; ".join(errors)
757
+ if not pods_stopped:
758
+ # If nothing was stopped successfully, raise an error
759
+ raise RuntimeError(
760
+ f"Failed to stop pipeline run: {error_summary}"
761
+ )
762
+ else:
763
+ # If some things were stopped but others failed, raise an error
764
+ raise RuntimeError(
765
+ f"Partial stop operation completed with errors: {error_summary}"
766
+ )
767
+
768
+ # If no step pods were found and no errors occurred
769
+ if not pods_stopped and not errors:
770
+ logger.info(
771
+ f"No running step pods found for pipeline run with ID: {run.id}"
772
+ )
773
+
666
774
  def get_pipeline_run_metadata(
667
775
  self, run_id: UUID
668
776
  ) -> Dict[str, "MetadataType"]:
@@ -18,6 +18,7 @@ import socket
18
18
  from typing import Callable, Dict, Optional, cast
19
19
 
20
20
  from kubernetes import client as k8s_client
21
+ from kubernetes.client.rest import ApiException
21
22
 
22
23
  from zenml.client import Client
23
24
  from zenml.entrypoints.step_entrypoint_configuration import (
@@ -248,6 +249,7 @@ def main() -> None:
248
249
  or settings.service_account_name,
249
250
  mount_local_stores=mount_local_stores,
250
251
  owner_references=owner_references,
252
+ termination_grace_period_seconds=settings.pod_stop_grace_period,
251
253
  labels=step_pod_labels,
252
254
  )
253
255
 
@@ -330,6 +332,38 @@ def main() -> None:
330
332
  # as the pipeline run status will already have been published.
331
333
  pass
332
334
 
335
+ def check_pipeline_cancellation() -> bool:
336
+ """Check if the pipeline should continue execution.
337
+
338
+ Returns:
339
+ True if execution should continue, False if it should stop.
340
+ """
341
+ try:
342
+ run = client.get_pipeline_run(
343
+ name_id_or_prefix=pipeline_run.id,
344
+ project=pipeline_run.project_id,
345
+ hydrate=False, # We only need status, not full hydration
346
+ )
347
+
348
+ # If the run is STOPPING or STOPPED, we should stop the execution
349
+ if run.status in [
350
+ ExecutionStatus.STOPPING,
351
+ ExecutionStatus.STOPPED,
352
+ ]:
353
+ logger.info(
354
+ f"Pipeline run is in {run.status} state, stopping execution"
355
+ )
356
+ return False
357
+
358
+ return True
359
+
360
+ except Exception as e:
361
+ # If we can't check the status, assume we should continue
362
+ logger.warning(
363
+ f"Failed to check pipeline cancellation status: {e}"
364
+ )
365
+ return True
366
+
333
367
  parallel_node_startup_waiting_period = (
334
368
  orchestrator.config.parallel_step_startup_waiting_period or 0.0
335
369
  )
@@ -344,6 +378,7 @@ def main() -> None:
344
378
  run_fn=run_step_on_kubernetes,
345
379
  preparation_fn=pre_step_run,
346
380
  finalize_fn=finalize_run,
381
+ continue_fn=check_pipeline_cancellation,
347
382
  parallel_node_startup_waiting_period=parallel_node_startup_waiting_period,
348
383
  max_parallelism=pipeline_settings.max_parallelism,
349
384
  ).run()
@@ -360,7 +395,7 @@ def main() -> None:
360
395
  namespace=namespace,
361
396
  secret_name=secret_name,
362
397
  )
363
- except k8s_client.rest.ApiException as e:
398
+ except ApiException as e:
364
399
  logger.error(f"Error cleaning up secret {secret_name}: {e}")
365
400
 
366
401
 
@@ -106,6 +106,7 @@ def build_pod_manifest(
106
106
  labels: Optional[Dict[str, str]] = None,
107
107
  mount_local_stores: bool = False,
108
108
  owner_references: Optional[List[k8s_client.V1OwnerReference]] = None,
109
+ termination_grace_period_seconds: Optional[int] = 30,
109
110
  ) -> k8s_client.V1Pod:
110
111
  """Build a Kubernetes pod manifest for a ZenML run or step.
111
112
 
@@ -124,6 +125,8 @@ def build_pod_manifest(
124
125
  mount_local_stores: Whether to mount the local stores path inside the
125
126
  pod.
126
127
  owner_references: List of owner references for the pod.
128
+ termination_grace_period_seconds: The amount of seconds to wait for a
129
+ pod to shutdown gracefully.
127
130
 
128
131
  Returns:
129
132
  Pod manifest.
@@ -154,19 +157,20 @@ def build_pod_manifest(
154
157
  containers=[container_spec],
155
158
  restart_policy="Never",
156
159
  image_pull_secrets=image_pull_secrets,
160
+ termination_grace_period_seconds=termination_grace_period_seconds,
157
161
  )
158
162
 
159
163
  if service_account_name is not None:
160
164
  pod_spec.service_account_name = service_account_name
161
165
 
166
+ # Apply pod settings if provided
162
167
  labels = labels or {}
163
168
 
164
169
  if pod_settings:
165
170
  add_pod_settings(pod_spec, pod_settings)
166
171
 
167
- # Add pod_settings.labels to the labels
168
- if pod_settings.labels:
169
- labels.update(pod_settings.labels)
172
+ if pod_settings and pod_settings.labels:
173
+ labels.update(pod_settings.labels)
170
174
 
171
175
  pod_metadata = k8s_client.V1ObjectMeta(
172
176
  name=pod_name,
@@ -273,6 +277,7 @@ def build_cron_job_manifest(
273
277
  successful_jobs_history_limit: Optional[int] = None,
274
278
  failed_jobs_history_limit: Optional[int] = None,
275
279
  ttl_seconds_after_finished: Optional[int] = None,
280
+ termination_grace_period_seconds: Optional[int] = 30,
276
281
  ) -> k8s_client.V1CronJob:
277
282
  """Create a manifest for launching a pod as scheduled CRON job.
278
283
 
@@ -295,6 +300,8 @@ def build_cron_job_manifest(
295
300
  failed_jobs_history_limit: The number of failed jobs to retain.
296
301
  ttl_seconds_after_finished: The amount of seconds to keep finished jobs
297
302
  before deleting them.
303
+ termination_grace_period_seconds: The amount of seconds to wait for a
304
+ pod to shutdown gracefully.
298
305
 
299
306
  Returns:
300
307
  CRON job manifest.
@@ -310,6 +317,7 @@ def build_cron_job_manifest(
310
317
  env=env,
311
318
  labels=labels,
312
319
  mount_local_stores=mount_local_stores,
320
+ termination_grace_period_seconds=termination_grace_period_seconds,
313
321
  )
314
322
 
315
323
  job_spec = k8s_client.V1CronJobSpec(
@@ -120,7 +120,7 @@ class KubernetesStepOperator(BaseStepOperator):
120
120
  """
121
121
  builds = []
122
122
  for step_name, step in deployment.step_configurations.items():
123
- if step.config.step_operator == self.name:
123
+ if step.config.uses_step_operator(self.name):
124
124
  build = BuildConfiguration(
125
125
  key=KUBERNETES_STEP_OPERATOR_DOCKER_IMAGE_KEY,
126
126
  settings=step.config.docker_settings,
@@ -139,7 +139,7 @@ class ModalStepOperator(BaseStepOperator):
139
139
  """
140
140
  builds = []
141
141
  for step_name, step in deployment.step_configurations.items():
142
- if step.config.step_operator == self.name:
142
+ if step.config.uses_step_operator(self.name):
143
143
  build = BuildConfiguration(
144
144
  key=MODAL_STEP_OPERATOR_DOCKER_IMAGE_KEY,
145
145
  settings=step.config.docker_settings,
@@ -124,7 +124,7 @@ class KubernetesSparkStepOperator(SparkStepOperator):
124
124
  builds = []
125
125
  extra_files = {ENTRYPOINT_NAME: LOCAL_ENTRYPOINT}
126
126
  for step_name, step in deployment.step_configurations.items():
127
- if step.config.step_operator == self.name:
127
+ if step.config.uses_step_operator(self.name):
128
128
  build = BuildConfiguration(
129
129
  key=SPARK_DOCKER_IMAGE_KEY,
130
130
  settings=step.config.docker_settings,