zenml-nightly 0.83.1.dev20250702__py3-none-any.whl → 0.83.1.dev20250704__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- zenml/VERSION +1 -1
- zenml/cli/pipeline.py +54 -1
- zenml/cli/utils.py +2 -0
- zenml/config/compiler.py +19 -3
- zenml/config/step_configurations.py +34 -2
- zenml/constants.py +1 -0
- zenml/enums.py +6 -3
- zenml/exceptions.py +8 -0
- zenml/integrations/aws/orchestrators/sagemaker_orchestrator.py +8 -4
- zenml/integrations/aws/step_operators/sagemaker_step_operator.py +1 -1
- zenml/integrations/azure/orchestrators/azureml_orchestrator.py +5 -3
- zenml/integrations/azure/step_operators/azureml_step_operator.py +1 -1
- zenml/integrations/gcp/orchestrators/vertex_orchestrator.py +7 -8
- zenml/integrations/gcp/step_operators/vertex_step_operator.py +1 -1
- zenml/integrations/kubernetes/flavors/kubernetes_orchestrator_flavor.py +6 -0
- zenml/integrations/kubernetes/orchestrators/kubernetes_orchestrator.py +109 -1
- zenml/integrations/kubernetes/orchestrators/kubernetes_orchestrator_entrypoint.py +36 -1
- zenml/integrations/kubernetes/orchestrators/manifest_utils.py +11 -3
- zenml/integrations/kubernetes/step_operators/kubernetes_step_operator.py +1 -1
- zenml/integrations/modal/step_operators/modal_step_operator.py +1 -1
- zenml/integrations/spark/step_operators/kubernetes_step_operator.py +1 -1
- zenml/models/v2/core/pipeline_run.py +2 -2
- zenml/orchestrators/base_orchestrator.py +70 -0
- zenml/orchestrators/containerized_orchestrator.py +22 -0
- zenml/orchestrators/dag_runner.py +27 -8
- zenml/orchestrators/local_docker/local_docker_orchestrator.py +9 -0
- zenml/orchestrators/publish_utils.py +100 -13
- zenml/orchestrators/step_launcher.py +94 -8
- zenml/stack/stack.py +2 -2
- zenml/utils/run_utils.py +74 -0
- zenml/zen_server/routers/runs_endpoints.py +27 -23
- zenml/zen_stores/sql_zen_store.py +23 -3
- {zenml_nightly-0.83.1.dev20250702.dist-info → zenml_nightly-0.83.1.dev20250704.dist-info}/METADATA +1 -1
- {zenml_nightly-0.83.1.dev20250702.dist-info → zenml_nightly-0.83.1.dev20250704.dist-info}/RECORD +37 -36
- {zenml_nightly-0.83.1.dev20250702.dist-info → zenml_nightly-0.83.1.dev20250704.dist-info}/LICENSE +0 -0
- {zenml_nightly-0.83.1.dev20250702.dist-info → zenml_nightly-0.83.1.dev20250704.dist-info}/WHEEL +0 -0
- {zenml_nightly-0.83.1.dev20250702.dist-info → zenml_nightly-0.83.1.dev20250704.dist-info}/entry_points.txt +0 -0
zenml/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.83.1.dev20250702
|
1
|
+
0.83.1.dev20250704
|
zenml/cli/pipeline.py
CHANGED
@@ -34,7 +34,7 @@ from zenml.models import (
|
|
34
34
|
ScheduleFilter,
|
35
35
|
)
|
36
36
|
from zenml.pipelines.pipeline_definition import Pipeline
|
37
|
-
from zenml.utils import source_utils, uuid_utils
|
37
|
+
from zenml.utils import run_utils, source_utils, uuid_utils
|
38
38
|
from zenml.utils.yaml_utils import write_yaml
|
39
39
|
|
40
40
|
logger = get_logger(__name__)
|
@@ -511,6 +511,59 @@ def list_pipeline_runs(**kwargs: Any) -> None:
|
|
511
511
|
cli_utils.print_page_info(pipeline_runs)
|
512
512
|
|
513
513
|
|
514
|
+
@runs.command("stop")
|
515
|
+
@click.argument("run_name_or_id", type=str, required=True)
|
516
|
+
@click.option(
|
517
|
+
"--graceful",
|
518
|
+
"-g",
|
519
|
+
is_flag=True,
|
520
|
+
default=False,
|
521
|
+
help="Use graceful shutdown (default is False).",
|
522
|
+
)
|
523
|
+
@click.option(
|
524
|
+
"--yes",
|
525
|
+
"-y",
|
526
|
+
is_flag=True,
|
527
|
+
default=False,
|
528
|
+
help="Don't ask for confirmation.",
|
529
|
+
)
|
530
|
+
def stop_pipeline_run(
|
531
|
+
run_name_or_id: str,
|
532
|
+
graceful: bool = False,
|
533
|
+
yes: bool = False,
|
534
|
+
) -> None:
|
535
|
+
"""Stop a running pipeline.
|
536
|
+
|
537
|
+
Args:
|
538
|
+
run_name_or_id: The name or ID of the pipeline run to stop.
|
539
|
+
graceful: If True, uses graceful shutdown. If False, forces immediate termination.
|
540
|
+
yes: If set, don't ask for confirmation.
|
541
|
+
"""
|
542
|
+
# Ask for confirmation to stop run.
|
543
|
+
if not yes:
|
544
|
+
action = "gracefully stop" if graceful else "force stop"
|
545
|
+
confirmation = cli_utils.confirmation(
|
546
|
+
f"Are you sure you want to {action} pipeline run `{run_name_or_id}`?"
|
547
|
+
)
|
548
|
+
if not confirmation:
|
549
|
+
cli_utils.declare("Not stopping the pipeline run.")
|
550
|
+
return
|
551
|
+
|
552
|
+
# Stop run.
|
553
|
+
try:
|
554
|
+
run = Client().get_pipeline_run(name_id_or_prefix=run_name_or_id)
|
555
|
+
run_utils.stop_run(run=run, graceful=graceful)
|
556
|
+
action = "Gracefully stopped" if graceful else "Force stopped"
|
557
|
+
cli_utils.declare(f"{action} pipeline run '{run.name}'.")
|
558
|
+
except NotImplementedError:
|
559
|
+
cli_utils.error(
|
560
|
+
"The orchestrator used for this pipeline run does not support "
|
561
|
+
f"{'gracefully' if graceful else 'forcefully'} stopping runs."
|
562
|
+
)
|
563
|
+
except Exception as e:
|
564
|
+
cli_utils.error(f"Failed to stop pipeline run: {e}")
|
565
|
+
|
566
|
+
|
514
567
|
@runs.command("delete")
|
515
568
|
@click.argument("run_name_or_id", type=str, required=True)
|
516
569
|
@click.option(
|
zenml/cli/utils.py
CHANGED
@@ -2214,6 +2214,8 @@ def get_execution_status_emoji(status: "ExecutionStatus") -> str:
|
|
2214
2214
|
return ":white_check_mark:"
|
2215
2215
|
if status == ExecutionStatus.CACHED:
|
2216
2216
|
return ":package:"
|
2217
|
+
if status == ExecutionStatus.STOPPED or status == ExecutionStatus.STOPPING:
|
2218
|
+
return ":stop_sign:"
|
2217
2219
|
raise RuntimeError(f"Unknown status: {status}")
|
2218
2220
|
|
2219
2221
|
|
zenml/config/compiler.py
CHANGED
@@ -556,7 +556,16 @@ class Compiler:
|
|
556
556
|
|
557
557
|
for name, step in steps.items():
|
558
558
|
step_operator = step.config.step_operator
|
559
|
-
if step_operator
|
559
|
+
if step_operator is True:
|
560
|
+
if not available_step_operators:
|
561
|
+
raise StackValidationError(
|
562
|
+
f"Step `{name}` requires a step operator, but no step "
|
563
|
+
f"operators are configured in the stack '{stack.name}'."
|
564
|
+
)
|
565
|
+
elif (
|
566
|
+
isinstance(step_operator, str)
|
567
|
+
and step_operator not in available_step_operators
|
568
|
+
):
|
560
569
|
raise StackValidationError(
|
561
570
|
f"Step `{name}` requires step operator "
|
562
571
|
f"'{step_operator}' which is not configured in "
|
@@ -565,8 +574,15 @@ class Compiler:
|
|
565
574
|
)
|
566
575
|
|
567
576
|
experiment_tracker = step.config.experiment_tracker
|
568
|
-
if
|
569
|
-
|
577
|
+
if experiment_tracker is True:
|
578
|
+
if not available_experiment_trackers:
|
579
|
+
raise StackValidationError(
|
580
|
+
f"Step `{name}` requires an experiment tracker, but no "
|
581
|
+
f"experiment trackers are configured in the stack "
|
582
|
+
f"'{stack.name}'."
|
583
|
+
)
|
584
|
+
elif (
|
585
|
+
isinstance(experiment_tracker, str)
|
570
586
|
and experiment_tracker not in available_experiment_trackers
|
571
587
|
):
|
572
588
|
raise StackValidationError(
|
zenml/config/step_configurations.py
CHANGED
@@ -145,8 +145,8 @@ class StepConfigurationUpdate(StrictBaseModel):
|
|
145
145
|
enable_artifact_metadata: Optional[bool] = None
|
146
146
|
enable_artifact_visualization: Optional[bool] = None
|
147
147
|
enable_step_logs: Optional[bool] = None
|
148
|
-
step_operator: Optional[str] = None
|
149
|
-
experiment_tracker: Optional[str] = None
|
148
|
+
step_operator: Optional[Union[bool, str]] = None
|
149
|
+
experiment_tracker: Optional[Union[bool, str]] = None
|
150
150
|
parameters: Dict[str, Any] = {}
|
151
151
|
settings: Dict[str, SerializeAsAny[BaseSettings]] = {}
|
152
152
|
extra: Dict[str, Any] = {}
|
@@ -158,6 +158,38 @@ class StepConfigurationUpdate(StrictBaseModel):
|
|
158
158
|
|
159
159
|
outputs: Mapping[str, PartialArtifactConfiguration] = {}
|
160
160
|
|
161
|
+
def uses_step_operator(self, name: str) -> bool:
|
162
|
+
"""Checks if the step configuration uses the given step operator.
|
163
|
+
|
164
|
+
Args:
|
165
|
+
name: The name of the step operator.
|
166
|
+
|
167
|
+
Returns:
|
168
|
+
If the step configuration uses the given step operator.
|
169
|
+
"""
|
170
|
+
if self.step_operator is True:
|
171
|
+
return True
|
172
|
+
elif isinstance(self.step_operator, str):
|
173
|
+
return self.step_operator == name
|
174
|
+
else:
|
175
|
+
return False
|
176
|
+
|
177
|
+
def uses_experiment_tracker(self, name: str) -> bool:
|
178
|
+
"""Checks if the step configuration uses the given experiment tracker.
|
179
|
+
|
180
|
+
Args:
|
181
|
+
name: The name of the experiment tracker.
|
182
|
+
|
183
|
+
Returns:
|
184
|
+
If the step configuration uses the given experiment tracker.
|
185
|
+
"""
|
186
|
+
if self.experiment_tracker is True:
|
187
|
+
return True
|
188
|
+
elif isinstance(self.experiment_tracker, str):
|
189
|
+
return self.experiment_tracker == name
|
190
|
+
else:
|
191
|
+
return False
|
192
|
+
|
161
193
|
|
162
194
|
class PartialStepConfiguration(StepConfigurationUpdate):
|
163
195
|
"""Class representing a partial step configuration."""
|
zenml/constants.py
CHANGED
zenml/enums.py
CHANGED
@@ -71,25 +71,28 @@ class ZenMLServiceType(StrEnum):
|
|
71
71
|
|
72
72
|
|
73
73
|
class ExecutionStatus(StrEnum):
|
74
|
-
"""Enum that represents the
|
74
|
+
"""Enum that represents the execution status of a step or pipeline run."""
|
75
75
|
|
76
76
|
INITIALIZING = "initializing"
|
77
77
|
FAILED = "failed"
|
78
78
|
COMPLETED = "completed"
|
79
79
|
RUNNING = "running"
|
80
80
|
CACHED = "cached"
|
81
|
+
STOPPED = "stopped"
|
82
|
+
STOPPING = "stopping"
|
81
83
|
|
82
84
|
@property
|
83
85
|
def is_finished(self) -> bool:
|
84
|
-
"""
|
86
|
+
"""Returns whether the execution status is in a finished state.
|
85
87
|
|
86
88
|
Returns:
|
87
|
-
Whether the execution status
|
89
|
+
Whether the execution status is finished.
|
88
90
|
"""
|
89
91
|
return self in {
|
90
92
|
ExecutionStatus.FAILED,
|
91
93
|
ExecutionStatus.COMPLETED,
|
92
94
|
ExecutionStatus.CACHED,
|
95
|
+
ExecutionStatus.STOPPED,
|
93
96
|
}
|
94
97
|
|
95
98
|
|
zenml/exceptions.py
CHANGED
@@ -122,6 +122,14 @@ class IllegalOperationError(ZenMLBaseException):
|
|
122
122
|
"""Raised when an illegal operation is attempted."""
|
123
123
|
|
124
124
|
|
125
|
+
class RunStoppedException(ZenMLBaseException):
|
126
|
+
"""Raised when a ZenML pipeline run gets stopped by the user."""
|
127
|
+
|
128
|
+
|
129
|
+
class RunInterruptedException(ZenMLBaseException):
|
130
|
+
"""Raised when a ZenML step gets interrupted for an unknown reason."""
|
131
|
+
|
132
|
+
|
125
133
|
class MethodNotAllowedError(ZenMLBaseException):
|
126
134
|
"""Raised when the server does not allow a request method."""
|
127
135
|
|
zenml/integrations/aws/orchestrators/sagemaker_orchestrator.py
CHANGED
@@ -853,12 +853,16 @@ class SagemakerOrchestrator(ContainerizedOrchestrator):
|
|
853
853
|
)["PipelineExecutionStatus"]
|
854
854
|
|
855
855
|
# Map the potential outputs to ZenML ExecutionStatus. Potential values:
|
856
|
-
# https://
|
857
|
-
if status
|
856
|
+
# https://docs.aws.amazon.com/sagemaker/latest/APIReference/API_DescribePipelineExecution.html
|
857
|
+
if status == "Executing":
|
858
858
|
return ExecutionStatus.RUNNING
|
859
|
-
elif status
|
859
|
+
elif status == "Stopping":
|
860
|
+
return ExecutionStatus.STOPPING
|
861
|
+
elif status == "Stopped":
|
862
|
+
return ExecutionStatus.STOPPED
|
863
|
+
elif status == "Failed":
|
860
864
|
return ExecutionStatus.FAILED
|
861
|
-
elif status
|
865
|
+
elif status == "Succeeded":
|
862
866
|
return ExecutionStatus.COMPLETED
|
863
867
|
else:
|
864
868
|
raise ValueError("Unknown status for the pipeline execution.")
|
zenml/integrations/aws/step_operators/sagemaker_step_operator.py
CHANGED
@@ -152,7 +152,7 @@ class SagemakerStepOperator(BaseStepOperator):
|
|
152
152
|
"""
|
153
153
|
builds = []
|
154
154
|
for step_name, step in deployment.step_configurations.items():
|
155
|
-
if step.config.
|
155
|
+
if step.config.uses_step_operator(self.name):
|
156
156
|
build = BuildConfiguration(
|
157
157
|
key=SAGEMAKER_DOCKER_IMAGE_KEY,
|
158
158
|
settings=step.config.docker_settings,
|
zenml/integrations/azure/orchestrators/azureml_orchestrator.py
CHANGED
@@ -515,14 +515,16 @@ class AzureMLOrchestrator(ContainerizedOrchestrator):
|
|
515
515
|
return ExecutionStatus.INITIALIZING
|
516
516
|
elif status in ["Running", "Finalizing"]:
|
517
517
|
return ExecutionStatus.RUNNING
|
518
|
+
elif status == "CancelRequested":
|
519
|
+
return ExecutionStatus.STOPPING
|
520
|
+
elif status == "Canceled":
|
521
|
+
return ExecutionStatus.STOPPED
|
518
522
|
elif status in [
|
519
|
-
"CancelRequested",
|
520
523
|
"Failed",
|
521
|
-
"Canceled",
|
522
524
|
"NotResponding",
|
523
525
|
]:
|
524
526
|
return ExecutionStatus.FAILED
|
525
|
-
elif status
|
527
|
+
elif status == "Completed":
|
526
528
|
return ExecutionStatus.COMPLETED
|
527
529
|
else:
|
528
530
|
raise ValueError("Unknown status for the pipeline job.")
|
zenml/integrations/azure/step_operators/azureml_step_operator.py
CHANGED
@@ -149,7 +149,7 @@ class AzureMLStepOperator(BaseStepOperator):
|
|
149
149
|
"""
|
150
150
|
builds = []
|
151
151
|
for step_name, step in deployment.step_configurations.items():
|
152
|
-
if step.config.
|
152
|
+
if step.config.uses_step_operator(self.name):
|
153
153
|
build = BuildConfiguration(
|
154
154
|
key=AZUREML_STEP_OPERATOR_DOCKER_IMAGE_KEY,
|
155
155
|
settings=step.config.docker_settings,
|
zenml/integrations/gcp/orchestrators/vertex_orchestrator.py
CHANGED
@@ -942,7 +942,7 @@ class VertexOrchestrator(ContainerizedOrchestrator, GoogleCredentialsMixin):
|
|
942
942
|
|
943
943
|
# Map the potential outputs to ZenML ExecutionStatus. Potential values:
|
944
944
|
# https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/sagemaker/client/describe_pipeline_execution.html#
|
945
|
-
if status
|
945
|
+
if status == PipelineState.PIPELINE_STATE_UNSPECIFIED:
|
946
946
|
return run.status
|
947
947
|
elif status in [
|
948
948
|
PipelineState.PIPELINE_STATE_QUEUED,
|
@@ -954,14 +954,13 @@ class VertexOrchestrator(ContainerizedOrchestrator, GoogleCredentialsMixin):
|
|
954
954
|
PipelineState.PIPELINE_STATE_PAUSED,
|
955
955
|
]:
|
956
956
|
return ExecutionStatus.RUNNING
|
957
|
-
elif status
|
957
|
+
elif status == PipelineState.PIPELINE_STATE_SUCCEEDED:
|
958
958
|
return ExecutionStatus.COMPLETED
|
959
|
-
|
960
|
-
|
961
|
-
|
962
|
-
|
963
|
-
|
964
|
-
]:
|
959
|
+
elif status == PipelineState.PIPELINE_STATE_CANCELLING:
|
960
|
+
return ExecutionStatus.STOPPING
|
961
|
+
elif status == PipelineState.PIPELINE_STATE_CANCELLED:
|
962
|
+
return ExecutionStatus.STOPPED
|
963
|
+
elif status == PipelineState.PIPELINE_STATE_FAILED:
|
965
964
|
return ExecutionStatus.FAILED
|
966
965
|
else:
|
967
966
|
raise ValueError("Unknown status for the pipeline job.")
|
zenml/integrations/gcp/step_operators/vertex_step_operator.py
CHANGED
@@ -161,7 +161,7 @@ class VertexStepOperator(BaseStepOperator, GoogleCredentialsMixin):
|
|
161
161
|
"""
|
162
162
|
builds = []
|
163
163
|
for step_name, step in deployment.step_configurations.items():
|
164
|
-
if step.config.
|
164
|
+
if step.config.uses_step_operator(self.name):
|
165
165
|
build = BuildConfiguration(
|
166
166
|
key=VERTEX_DOCKER_IMAGE_KEY,
|
167
167
|
settings=step.config.docker_settings,
|
zenml/integrations/kubernetes/flavors/kubernetes_orchestrator_flavor.py
CHANGED
@@ -69,6 +69,10 @@ class KubernetesOrchestratorSettings(BaseSettings):
|
|
69
69
|
scheduling a pipeline.
|
70
70
|
prevent_orchestrator_pod_caching: If `True`, the orchestrator pod will
|
71
71
|
not try to compute cached steps before starting the step pods.
|
72
|
+
always_build_pipeline_image: If `True`, the orchestrator will always
|
73
|
+
build the pipeline image, even if all steps have a custom build.
|
74
|
+
pod_stop_grace_period: When stopping a pipeline run, the amount of
|
75
|
+
seconds to wait for a step pod to shutdown gracefully.
|
72
76
|
"""
|
73
77
|
|
74
78
|
synchronous: bool = True
|
@@ -88,6 +92,8 @@ class KubernetesOrchestratorSettings(BaseSettings):
|
|
88
92
|
failed_jobs_history_limit: Optional[NonNegativeInt] = None
|
89
93
|
ttl_seconds_after_finished: Optional[NonNegativeInt] = None
|
90
94
|
prevent_orchestrator_pod_caching: bool = False
|
95
|
+
always_build_pipeline_image: bool = False
|
96
|
+
pod_stop_grace_period: PositiveInt = 30
|
91
97
|
|
92
98
|
|
93
99
|
class KubernetesOrchestratorConfig(
|
zenml/integrations/kubernetes/orchestrators/kubernetes_orchestrator.py
CHANGED
@@ -70,7 +70,11 @@ from zenml.orchestrators.utils import get_orchestrator_run_name
|
|
70
70
|
from zenml.stack import StackValidator
|
71
71
|
|
72
72
|
if TYPE_CHECKING:
|
73
|
-
from zenml.models import
|
73
|
+
from zenml.models import (
|
74
|
+
PipelineDeploymentBase,
|
75
|
+
PipelineDeploymentResponse,
|
76
|
+
PipelineRunResponse,
|
77
|
+
)
|
74
78
|
from zenml.stack import Stack
|
75
79
|
|
76
80
|
logger = get_logger(__name__)
|
@@ -84,6 +88,22 @@ class KubernetesOrchestrator(ContainerizedOrchestrator):
|
|
84
88
|
|
85
89
|
_k8s_client: Optional[k8s_client.ApiClient] = None
|
86
90
|
|
91
|
+
def should_build_pipeline_image(
|
92
|
+
self, deployment: "PipelineDeploymentBase"
|
93
|
+
) -> bool:
|
94
|
+
"""Whether to always build the pipeline image.
|
95
|
+
|
96
|
+
Args:
|
97
|
+
deployment: The pipeline deployment.
|
98
|
+
|
99
|
+
Returns:
|
100
|
+
Whether to always build the pipeline image.
|
101
|
+
"""
|
102
|
+
settings = cast(
|
103
|
+
KubernetesOrchestratorSettings, self.get_settings(deployment)
|
104
|
+
)
|
105
|
+
return settings.always_build_pipeline_image
|
106
|
+
|
87
107
|
def get_kube_client(
|
88
108
|
self, incluster: Optional[bool] = None
|
89
109
|
) -> k8s_client.ApiClient:
|
@@ -545,6 +565,7 @@ class KubernetesOrchestrator(ContainerizedOrchestrator):
|
|
545
565
|
successful_jobs_history_limit=settings.successful_jobs_history_limit,
|
546
566
|
failed_jobs_history_limit=settings.failed_jobs_history_limit,
|
547
567
|
ttl_seconds_after_finished=settings.ttl_seconds_after_finished,
|
568
|
+
termination_grace_period_seconds=settings.pod_stop_grace_period,
|
548
569
|
labels=orchestrator_pod_labels,
|
549
570
|
)
|
550
571
|
|
@@ -570,6 +591,7 @@ class KubernetesOrchestrator(ContainerizedOrchestrator):
|
|
570
591
|
env=environment,
|
571
592
|
labels=orchestrator_pod_labels,
|
572
593
|
mount_local_stores=self.config.is_local,
|
594
|
+
termination_grace_period_seconds=settings.pod_stop_grace_period,
|
573
595
|
)
|
574
596
|
|
575
597
|
kube_utils.create_and_wait_for_pod_to_start(
|
@@ -663,6 +685,92 @@ class KubernetesOrchestrator(ContainerizedOrchestrator):
|
|
663
685
|
f"{ENV_ZENML_KUBERNETES_RUN_ID}."
|
664
686
|
)
|
665
687
|
|
688
|
+
def _stop_run(
|
689
|
+
self, run: "PipelineRunResponse", graceful: bool = True
|
690
|
+
) -> None:
|
691
|
+
"""Stops a specific pipeline run by terminating step pods.
|
692
|
+
|
693
|
+
Args:
|
694
|
+
run: The run that was executed by this orchestrator.
|
695
|
+
graceful: If True, does nothing (lets the orchestrator and steps finish naturally).
|
696
|
+
If False, stops all running step pods.
|
697
|
+
|
698
|
+
Raises:
|
699
|
+
RuntimeError: If we fail to stop the run.
|
700
|
+
"""
|
701
|
+
# If graceful, do nothing and let the orchestrator handle the stop naturally
|
702
|
+
if graceful:
|
703
|
+
logger.info(
|
704
|
+
"Graceful stop requested - the orchestrator pod will handle "
|
705
|
+
"stopping naturally"
|
706
|
+
)
|
707
|
+
return
|
708
|
+
|
709
|
+
pods_stopped = []
|
710
|
+
errors = []
|
711
|
+
|
712
|
+
# Find all pods with the orchestrator run ID label
|
713
|
+
label_selector = f"run_id={kube_utils.sanitize_label(str(run.id))}"
|
714
|
+
try:
|
715
|
+
pods = self._k8s_core_api.list_namespaced_pod(
|
716
|
+
namespace=self.config.kubernetes_namespace,
|
717
|
+
label_selector=label_selector,
|
718
|
+
)
|
719
|
+
except Exception as e:
|
720
|
+
raise RuntimeError(
|
721
|
+
f"Failed to list step pods with run ID {run.id}: {e}"
|
722
|
+
)
|
723
|
+
|
724
|
+
# Filter to only include running or pending pods
|
725
|
+
for pod in pods.items:
|
726
|
+
if pod.status.phase not in ["Running", "Pending"]:
|
727
|
+
logger.debug(
|
728
|
+
f"Skipping pod {pod.metadata.name} with status {pod.status.phase}"
|
729
|
+
)
|
730
|
+
continue
|
731
|
+
|
732
|
+
try:
|
733
|
+
self._k8s_core_api.delete_namespaced_pod(
|
734
|
+
name=pod.metadata.name,
|
735
|
+
namespace=self.config.kubernetes_namespace,
|
736
|
+
)
|
737
|
+
pods_stopped.append(f"step pod: {pod.metadata.name}")
|
738
|
+
logger.debug(
|
739
|
+
f"Successfully initiated graceful stop of step pod: {pod.metadata.name}"
|
740
|
+
)
|
741
|
+
except Exception as e:
|
742
|
+
error_msg = f"Failed to stop step pod {pod.metadata.name}: {e}"
|
743
|
+
logger.warning(error_msg)
|
744
|
+
errors.append(error_msg)
|
745
|
+
|
746
|
+
# Summary logging
|
747
|
+
settings = cast(KubernetesOrchestratorSettings, self.get_settings(run))
|
748
|
+
grace_period_seconds = settings.pod_stop_grace_period
|
749
|
+
if pods_stopped:
|
750
|
+
logger.debug(
|
751
|
+
f"Successfully initiated graceful termination of: {', '.join(pods_stopped)}. "
|
752
|
+
f"Pods will terminate within {grace_period_seconds} seconds."
|
753
|
+
)
|
754
|
+
|
755
|
+
if errors:
|
756
|
+
error_summary = "; ".join(errors)
|
757
|
+
if not pods_stopped:
|
758
|
+
# If nothing was stopped successfully, raise an error
|
759
|
+
raise RuntimeError(
|
760
|
+
f"Failed to stop pipeline run: {error_summary}"
|
761
|
+
)
|
762
|
+
else:
|
763
|
+
# If some things were stopped but others failed, raise an error
|
764
|
+
raise RuntimeError(
|
765
|
+
f"Partial stop operation completed with errors: {error_summary}"
|
766
|
+
)
|
767
|
+
|
768
|
+
# If no step pods were found and no errors occurred
|
769
|
+
if not pods_stopped and not errors:
|
770
|
+
logger.info(
|
771
|
+
f"No running step pods found for pipeline run with ID: {run.id}"
|
772
|
+
)
|
773
|
+
|
666
774
|
def get_pipeline_run_metadata(
|
667
775
|
self, run_id: UUID
|
668
776
|
) -> Dict[str, "MetadataType"]:
|
zenml/integrations/kubernetes/orchestrators/kubernetes_orchestrator_entrypoint.py
CHANGED
@@ -18,6 +18,7 @@ import socket
|
|
18
18
|
from typing import Callable, Dict, Optional, cast
|
19
19
|
|
20
20
|
from kubernetes import client as k8s_client
|
21
|
+
from kubernetes.client.rest import ApiException
|
21
22
|
|
22
23
|
from zenml.client import Client
|
23
24
|
from zenml.entrypoints.step_entrypoint_configuration import (
|
@@ -248,6 +249,7 @@ def main() -> None:
|
|
248
249
|
or settings.service_account_name,
|
249
250
|
mount_local_stores=mount_local_stores,
|
250
251
|
owner_references=owner_references,
|
252
|
+
termination_grace_period_seconds=settings.pod_stop_grace_period,
|
251
253
|
labels=step_pod_labels,
|
252
254
|
)
|
253
255
|
|
@@ -330,6 +332,38 @@ def main() -> None:
|
|
330
332
|
# as the pipeline run status will already have been published.
|
331
333
|
pass
|
332
334
|
|
335
|
+
def check_pipeline_cancellation() -> bool:
|
336
|
+
"""Check if the pipeline should continue execution.
|
337
|
+
|
338
|
+
Returns:
|
339
|
+
True if execution should continue, False if it should stop.
|
340
|
+
"""
|
341
|
+
try:
|
342
|
+
run = client.get_pipeline_run(
|
343
|
+
name_id_or_prefix=pipeline_run.id,
|
344
|
+
project=pipeline_run.project_id,
|
345
|
+
hydrate=False, # We only need status, not full hydration
|
346
|
+
)
|
347
|
+
|
348
|
+
# If the run is STOPPING or STOPPED, we should stop the execution
|
349
|
+
if run.status in [
|
350
|
+
ExecutionStatus.STOPPING,
|
351
|
+
ExecutionStatus.STOPPED,
|
352
|
+
]:
|
353
|
+
logger.info(
|
354
|
+
f"Pipeline run is in {run.status} state, stopping execution"
|
355
|
+
)
|
356
|
+
return False
|
357
|
+
|
358
|
+
return True
|
359
|
+
|
360
|
+
except Exception as e:
|
361
|
+
# If we can't check the status, assume we should continue
|
362
|
+
logger.warning(
|
363
|
+
f"Failed to check pipeline cancellation status: {e}"
|
364
|
+
)
|
365
|
+
return True
|
366
|
+
|
333
367
|
parallel_node_startup_waiting_period = (
|
334
368
|
orchestrator.config.parallel_step_startup_waiting_period or 0.0
|
335
369
|
)
|
@@ -344,6 +378,7 @@ def main() -> None:
|
|
344
378
|
run_fn=run_step_on_kubernetes,
|
345
379
|
preparation_fn=pre_step_run,
|
346
380
|
finalize_fn=finalize_run,
|
381
|
+
continue_fn=check_pipeline_cancellation,
|
347
382
|
parallel_node_startup_waiting_period=parallel_node_startup_waiting_period,
|
348
383
|
max_parallelism=pipeline_settings.max_parallelism,
|
349
384
|
).run()
|
@@ -360,7 +395,7 @@ def main() -> None:
|
|
360
395
|
namespace=namespace,
|
361
396
|
secret_name=secret_name,
|
362
397
|
)
|
363
|
-
except
|
398
|
+
except ApiException as e:
|
364
399
|
logger.error(f"Error cleaning up secret {secret_name}: {e}")
|
365
400
|
|
366
401
|
|
zenml/integrations/kubernetes/orchestrators/manifest_utils.py
CHANGED
@@ -106,6 +106,7 @@ def build_pod_manifest(
|
|
106
106
|
labels: Optional[Dict[str, str]] = None,
|
107
107
|
mount_local_stores: bool = False,
|
108
108
|
owner_references: Optional[List[k8s_client.V1OwnerReference]] = None,
|
109
|
+
termination_grace_period_seconds: Optional[int] = 30,
|
109
110
|
) -> k8s_client.V1Pod:
|
110
111
|
"""Build a Kubernetes pod manifest for a ZenML run or step.
|
111
112
|
|
@@ -124,6 +125,8 @@ def build_pod_manifest(
|
|
124
125
|
mount_local_stores: Whether to mount the local stores path inside the
|
125
126
|
pod.
|
126
127
|
owner_references: List of owner references for the pod.
|
128
|
+
termination_grace_period_seconds: The amount of seconds to wait for a
|
129
|
+
pod to shutdown gracefully.
|
127
130
|
|
128
131
|
Returns:
|
129
132
|
Pod manifest.
|
@@ -154,19 +157,20 @@ def build_pod_manifest(
|
|
154
157
|
containers=[container_spec],
|
155
158
|
restart_policy="Never",
|
156
159
|
image_pull_secrets=image_pull_secrets,
|
160
|
+
termination_grace_period_seconds=termination_grace_period_seconds,
|
157
161
|
)
|
158
162
|
|
159
163
|
if service_account_name is not None:
|
160
164
|
pod_spec.service_account_name = service_account_name
|
161
165
|
|
166
|
+
# Apply pod settings if provided
|
162
167
|
labels = labels or {}
|
163
168
|
|
164
169
|
if pod_settings:
|
165
170
|
add_pod_settings(pod_spec, pod_settings)
|
166
171
|
|
167
|
-
|
168
|
-
|
169
|
-
labels.update(pod_settings.labels)
|
172
|
+
if pod_settings and pod_settings.labels:
|
173
|
+
labels.update(pod_settings.labels)
|
170
174
|
|
171
175
|
pod_metadata = k8s_client.V1ObjectMeta(
|
172
176
|
name=pod_name,
|
@@ -273,6 +277,7 @@ def build_cron_job_manifest(
|
|
273
277
|
successful_jobs_history_limit: Optional[int] = None,
|
274
278
|
failed_jobs_history_limit: Optional[int] = None,
|
275
279
|
ttl_seconds_after_finished: Optional[int] = None,
|
280
|
+
termination_grace_period_seconds: Optional[int] = 30,
|
276
281
|
) -> k8s_client.V1CronJob:
|
277
282
|
"""Create a manifest for launching a pod as scheduled CRON job.
|
278
283
|
|
@@ -295,6 +300,8 @@ def build_cron_job_manifest(
|
|
295
300
|
failed_jobs_history_limit: The number of failed jobs to retain.
|
296
301
|
ttl_seconds_after_finished: The amount of seconds to keep finished jobs
|
297
302
|
before deleting them.
|
303
|
+
termination_grace_period_seconds: The amount of seconds to wait for a
|
304
|
+
pod to shutdown gracefully.
|
298
305
|
|
299
306
|
Returns:
|
300
307
|
CRON job manifest.
|
@@ -310,6 +317,7 @@ def build_cron_job_manifest(
|
|
310
317
|
env=env,
|
311
318
|
labels=labels,
|
312
319
|
mount_local_stores=mount_local_stores,
|
320
|
+
termination_grace_period_seconds=termination_grace_period_seconds,
|
313
321
|
)
|
314
322
|
|
315
323
|
job_spec = k8s_client.V1CronJobSpec(
|
zenml/integrations/kubernetes/step_operators/kubernetes_step_operator.py
CHANGED
@@ -120,7 +120,7 @@ class KubernetesStepOperator(BaseStepOperator):
|
|
120
120
|
"""
|
121
121
|
builds = []
|
122
122
|
for step_name, step in deployment.step_configurations.items():
|
123
|
-
if step.config.
|
123
|
+
if step.config.uses_step_operator(self.name):
|
124
124
|
build = BuildConfiguration(
|
125
125
|
key=KUBERNETES_STEP_OPERATOR_DOCKER_IMAGE_KEY,
|
126
126
|
settings=step.config.docker_settings,
|
zenml/integrations/modal/step_operators/modal_step_operator.py
CHANGED
@@ -139,7 +139,7 @@ class ModalStepOperator(BaseStepOperator):
|
|
139
139
|
"""
|
140
140
|
builds = []
|
141
141
|
for step_name, step in deployment.step_configurations.items():
|
142
|
-
if step.config.
|
142
|
+
if step.config.uses_step_operator(self.name):
|
143
143
|
build = BuildConfiguration(
|
144
144
|
key=MODAL_STEP_OPERATOR_DOCKER_IMAGE_KEY,
|
145
145
|
settings=step.config.docker_settings,
|
zenml/integrations/spark/step_operators/kubernetes_step_operator.py
CHANGED
@@ -124,7 +124,7 @@ class KubernetesSparkStepOperator(SparkStepOperator):
|
|
124
124
|
builds = []
|
125
125
|
extra_files = {ENTRYPOINT_NAME: LOCAL_ENTRYPOINT}
|
126
126
|
for step_name, step in deployment.step_configurations.items():
|
127
|
-
if step.config.
|
127
|
+
if step.config.uses_step_operator(self.name):
|
128
128
|
build = BuildConfiguration(
|
129
129
|
key=SPARK_DOCKER_IMAGE_KEY,
|
130
130
|
settings=step.config.docker_settings,
|