zenml-nightly 0.84.1.dev20250805__py3-none-any.whl → 0.84.1.dev20250806__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- zenml/VERSION +1 -1
- zenml/integrations/kubernetes/constants.py +27 -0
- zenml/integrations/kubernetes/flavors/kubernetes_orchestrator_flavor.py +79 -36
- zenml/integrations/kubernetes/flavors/kubernetes_step_operator_flavor.py +55 -24
- zenml/integrations/kubernetes/orchestrators/dag_runner.py +367 -0
- zenml/integrations/kubernetes/orchestrators/kube_utils.py +368 -1
- zenml/integrations/kubernetes/orchestrators/kubernetes_orchestrator.py +144 -262
- zenml/integrations/kubernetes/orchestrators/kubernetes_orchestrator_entrypoint.py +392 -244
- zenml/integrations/kubernetes/orchestrators/manifest_utils.py +53 -85
- zenml/integrations/kubernetes/step_operators/kubernetes_step_operator.py +74 -32
- zenml/logging/step_logging.py +33 -30
- zenml/steps/base_step.py +6 -6
- zenml/steps/step_decorator.py +4 -4
- {zenml_nightly-0.84.1.dev20250805.dist-info → zenml_nightly-0.84.1.dev20250806.dist-info}/METADATA +1 -1
- {zenml_nightly-0.84.1.dev20250805.dist-info → zenml_nightly-0.84.1.dev20250806.dist-info}/RECORD +18 -16
- {zenml_nightly-0.84.1.dev20250805.dist-info → zenml_nightly-0.84.1.dev20250806.dist-info}/LICENSE +0 -0
- {zenml_nightly-0.84.1.dev20250805.dist-info → zenml_nightly-0.84.1.dev20250806.dist-info}/WHEEL +0 -0
- {zenml_nightly-0.84.1.dev20250805.dist-info → zenml_nightly-0.84.1.dev20250806.dist-info}/entry_points.txt +0 -0
zenml/integrations/kubernetes/orchestrators/kubernetes_orchestrator.py

@@ -31,6 +31,7 @@
 """Kubernetes-native orchestrator."""
 
 import os
+import random
 from typing import (
     TYPE_CHECKING,
     Dict,
@@ -50,6 +51,13 @@ from zenml.constants import (
     METADATA_ORCHESTRATOR_RUN_ID,
 )
 from zenml.enums import ExecutionStatus, StackComponentType
+from zenml.integrations.kubernetes.constants import (
+    ENV_ZENML_KUBERNETES_RUN_ID,
+    KUBERNETES_CRON_JOB_METADATA_KEY,
+    KUBERNETES_SECRET_TOKEN_KEY_NAME,
+    ORCHESTRATOR_ANNOTATION_KEY,
+    STEP_NAME_ANNOTATION_KEY,
+)
 from zenml.integrations.kubernetes.flavors.kubernetes_orchestrator_flavor import (
     KubernetesOrchestratorConfig,
     KubernetesOrchestratorSettings,
@@ -60,14 +68,15 @@ from zenml.integrations.kubernetes.orchestrators.kubernetes_orchestrator_entrypo
 )
 from zenml.integrations.kubernetes.orchestrators.manifest_utils import (
     build_cron_job_manifest,
+    build_job_manifest,
     build_pod_manifest,
+    job_template_manifest_from_job,
+    pod_template_manifest_from_pod,
 )
-from zenml.integrations.kubernetes.pod_settings import KubernetesPodSettings
 from zenml.logger import get_logger
 from zenml.metadata.metadata_types import MetadataType
 from zenml.models.v2.core.schedule import ScheduleUpdate
 from zenml.orchestrators import ContainerizedOrchestrator, SubmissionResult
-from zenml.orchestrators.utils import get_orchestrator_run_name
 from zenml.stack import StackValidator
 
 if TYPE_CHECKING:
@@ -81,10 +90,6 @@ if TYPE_CHECKING:
 
 logger = get_logger(__name__)
 
-ENV_ZENML_KUBERNETES_RUN_ID = "ZENML_KUBERNETES_RUN_ID"
-KUBERNETES_SECRET_TOKEN_KEY_NAME = "zenml_api_token"
-KUBERNETES_CRON_JOB_METADATA_KEY = "cron_job_name"
-
 
 class KubernetesOrchestrator(ContainerizedOrchestrator):
     """Orchestrator for running ZenML pipelines using native Kubernetes."""
@@ -364,45 +369,6 @@ class KubernetesOrchestrator(ContainerizedOrchestrator):
             custom_validation_function=_validate_local_requirements,
         )
 
-    @classmethod
-    def apply_default_resource_requests(
-        cls,
-        memory: str,
-        cpu: Optional[str] = None,
-        pod_settings: Optional[KubernetesPodSettings] = None,
-    ) -> KubernetesPodSettings:
-        """Applies default resource requests to a pod settings object.
-
-        Args:
-            memory: The memory resource request.
-            cpu: The CPU resource request.
-            pod_settings: The pod settings to update. A new one will be created
-                if not provided.
-
-        Returns:
-            The new or updated pod settings.
-        """
-        resources = {
-            "requests": {"memory": memory},
-        }
-        if cpu:
-            resources["requests"]["cpu"] = cpu
-        if not pod_settings:
-            pod_settings = KubernetesPodSettings(resources=resources)
-        elif not pod_settings.resources:
-            # We can't update the pod settings in place (because it's a frozen
-            # pydantic model), so we have to create a new one.
-            pod_settings = KubernetesPodSettings(
-                **pod_settings.model_dump(exclude_unset=True),
-                resources=resources,
-            )
-        else:
-            set_requests = pod_settings.resources.get("requests", {})
-            resources["requests"].update(set_requests)
-            pod_settings.resources["requests"] = resources["requests"]
-
-        return pod_settings
-
     def get_token_secret_name(self, deployment_id: UUID) -> str:
         """Returns the name of the secret that contains the ZenML token.
 
@@ -463,27 +429,6 @@ class KubernetesOrchestrator(ContainerizedOrchestrator):
             KubernetesOrchestratorSettings, self.get_settings(deployment)
         )
 
-        # We already make sure the orchestrator run name has the correct length
-        # to make sure we don't cut off the randomized suffix later when
-        # sanitizing the pod name. This avoids any pod naming collisions.
-        max_length = kube_utils.calculate_max_pod_name_length_for_namespace(
-            namespace=self.config.kubernetes_namespace
-        )
-        orchestrator_run_name = get_orchestrator_run_name(
-            pipeline_name, max_length=max_length
-        )
-
-        if settings.pod_name_prefix:
-            pod_name = get_orchestrator_run_name(
-                settings.pod_name_prefix, max_length=max_length
-            )
-        else:
-            pod_name = orchestrator_run_name
-
-        pod_name = kube_utils.sanitize_pod_name(
-            pod_name, namespace=self.config.kubernetes_namespace
-        )
-
         assert stack.container_registry
 
         # Get Docker image for the orchestrator pod
@@ -514,7 +459,7 @@ class KubernetesOrchestrator(ContainerizedOrchestrator):
         # takes up some memory resources itself and, if not specified, the pod
         # will be scheduled on any node regardless of available memory and risk
         # negatively impacting or even crashing the node due to memory pressure.
-        orchestrator_pod_settings = self.apply_default_resource_requests(
+        orchestrator_pod_settings = kube_utils.apply_default_resource_requests(
             memory="400Mi",
             cpu="100m",
             pod_settings=settings.orchestrator_pod_settings,
@@ -550,10 +495,74 @@ class KubernetesOrchestrator(ContainerizedOrchestrator):
                 str(placeholder_run.id)
             )
             orchestrator_pod_labels["run_name"] = kube_utils.sanitize_label(
-
+                placeholder_run.name
             )
 
-
+        pod_manifest = build_pod_manifest(
+            pod_name=None,
+            image_name=image,
+            command=command,
+            args=args,
+            privileged=False,
+            pod_settings=orchestrator_pod_settings,
+            service_account_name=service_account_name,
+            env=environment,
+            labels=orchestrator_pod_labels,
+            mount_local_stores=self.config.is_local,
+            termination_grace_period_seconds=settings.pod_stop_grace_period,
+        )
+
+        pod_failure_policy = settings.pod_failure_policy or {
+            # These rules are applied sequentially. This means any failure in
+            # the main container will count towards the max retries. Any other
+            # disruption will not count towards the max retries.
+            "rules": [
+                # If the main container fails, we count it towards the max
+                # retries.
+                {
+                    "action": "Count",
+                    "onExitCodes": {
+                        "containerName": "main",
+                        "operator": "NotIn",
+                        "values": [0],
+                    },
+                },
+                # If the pod is interrupted at any other time, we don't count
+                # it as a retry
+                {
+                    "action": "Ignore",
+                    "onPodConditions": [
+                        {
+                            "type": "DisruptionTarget",
+                            "status": "True",
+                        }
+                    ],
+                },
+            ]
+        }
+
+        job_name = settings.job_name_prefix or ""
+        random_prefix = "".join(random.choices("0123456789abcdef", k=8))
+        job_name += (
+            f"-{random_prefix}-{deployment.pipeline_configuration.name}"
+        )
+        # The job name will be used as a label on the pods, so we need to make
+        # sure it doesn't exceed the label length limit
+        job_name = kube_utils.sanitize_label(job_name)
+
+        job_manifest = build_job_manifest(
+            job_name=job_name,
+            pod_template=pod_template_manifest_from_pod(pod_manifest),
+            backoff_limit=settings.orchestrator_job_backoff_limit,
+            ttl_seconds_after_finished=settings.ttl_seconds_after_finished,
+            active_deadline_seconds=settings.active_deadline_seconds,
+            pod_failure_policy=pod_failure_policy,
+            labels=orchestrator_pod_labels,
+            annotations={
+                ORCHESTRATOR_ANNOTATION_KEY: str(self.id),
+            },
+        )
+
         if deployment.schedule:
             if not deployment.schedule.cron_expression:
                 raise RuntimeError(
@@ -564,20 +573,9 @@ class KubernetesOrchestrator(ContainerizedOrchestrator):
             cron_expression = deployment.schedule.cron_expression
             cron_job_manifest = build_cron_job_manifest(
                 cron_expression=cron_expression,
-
-                image_name=image,
-                command=command,
-                args=args,
-                service_account_name=service_account_name,
-                privileged=False,
-                pod_settings=orchestrator_pod_settings,
-                env=environment,
-                mount_local_stores=self.config.is_local,
+                job_template=job_template_manifest_from_job(job_manifest),
                 successful_jobs_history_limit=settings.successful_jobs_history_limit,
                 failed_jobs_history_limit=settings.failed_jobs_history_limit,
-                ttl_seconds_after_finished=settings.ttl_seconds_after_finished,
-                termination_grace_period_seconds=settings.pod_stop_grace_period,
-                labels=orchestrator_pod_labels,
             )
 
             cron_job = self._k8s_batch_api.create_namespaced_cron_job(
@@ -585,8 +583,8 @@ class KubernetesOrchestrator(ContainerizedOrchestrator):
                 namespace=self.config.kubernetes_namespace,
             )
             logger.info(
-                f"
-                f
+                f"Created Kubernetes CronJob `{cron_job.metadata.name}` "
+                f"with CRON expression `{cron_expression}`."
             )
             return SubmissionResult(
                 metadata={
@@ -594,32 +592,11 @@ class KubernetesOrchestrator(ContainerizedOrchestrator):
                 }
             )
         else:
-            # Create and run the orchestrator pod.
-            pod_manifest = build_pod_manifest(
-                pod_name=pod_name,
-                image_name=image,
-                command=command,
-                args=args,
-                privileged=False,
-                pod_settings=orchestrator_pod_settings,
-                service_account_name=service_account_name,
-                env=environment,
-                labels=orchestrator_pod_labels,
-                mount_local_stores=self.config.is_local,
-                termination_grace_period_seconds=settings.pod_stop_grace_period,
-            )
-
             try:
-                kube_utils.
-
-                    pod_display_name="Kubernetes orchestrator pod",
-                    pod_name=pod_name,
-                    pod_manifest=pod_manifest,
+                kube_utils.create_job(
+                    batch_api=self._k8s_batch_api,
                     namespace=self.config.kubernetes_namespace,
-
-                    startup_failure_delay=settings.pod_failure_retry_delay,
-                    startup_failure_backoff=settings.pod_failure_backoff,
-                    startup_timeout=settings.pod_startup_timeout,
+                    job_manifest=job_manifest,
                 )
             except Exception as e:
                 if self.config.pass_zenml_token_as_secret:
@@ -638,40 +615,31 @@ class KubernetesOrchestrator(ContainerizedOrchestrator):
                 )
             raise e
 
-        metadata: Dict[str, MetadataType] = {
-            METADATA_ORCHESTRATOR_RUN_ID: pod_name,
-        }
-
-        # Wait for the orchestrator pod to finish and stream logs.
         if settings.synchronous:
 
             def _wait_for_run_to_finish() -> None:
-                logger.info(
-
-
-
-                    kube_client_fn=self.get_kube_client,
-                    pod_name=pod_name,
+                logger.info("Waiting for orchestrator job to finish...")
+                kube_utils.wait_for_job_to_finish(
+                    batch_api=self._k8s_batch_api,
+                    core_api=self._k8s_core_api,
                     namespace=self.config.kubernetes_namespace,
-
-
+                    job_name=job_name,
+                    backoff_interval=settings.job_monitoring_interval,
+                    fail_on_container_waiting_reasons=settings.fail_on_container_waiting_reasons,
                     stream_logs=True,
                 )
 
             return SubmissionResult(
-                metadata=metadata,
                 wait_for_completion=_wait_for_run_to_finish,
             )
         else:
             logger.info(
-                f"
-                f"`{self.config.kubernetes_namespace}:{pod_name}`. "
+                f"Orchestrator job `{job_name}` started. "
                 f"Run the following command to inspect the logs: "
-                f"`kubectl
-
-            return SubmissionResult(
-                metadata=metadata,
+                f"`kubectl -n {self.config.kubernetes_namespace} logs "
+                f"job/{job_name}`"
             )
+            return None
 
     def _get_service_account_name(
         self, settings: KubernetesOrchestratorSettings
@@ -744,7 +712,8 @@ class KubernetesOrchestrator(ContainerizedOrchestrator):
         # Find all jobs running steps of the pipeline
         label_selector = f"run_id={kube_utils.sanitize_label(str(run.id))}"
         try:
-
+            job_list = kube_utils.list_jobs(
+                batch_api=self._k8s_batch_api,
                 namespace=self.config.kubernetes_namespace,
                 label_selector=label_selector,
             )
@@ -753,8 +722,12 @@ class KubernetesOrchestrator(ContainerizedOrchestrator):
                 f"Failed to list step jobs with run ID {run.id}: {e}"
             )
 
-        for job in
-            if job
+        for job in job_list.items:
+            if not kube_utils.is_step_job(job):
+                # This is the orchestrator job which stops by itself
+                continue
+
+            if job.status and job.status.conditions:
                 # Don't delete completed/failed jobs
                 for condition in job.status.conditions:
                     if (
@@ -825,94 +798,59 @@ class KubernetesOrchestrator(ContainerizedOrchestrator):
             A tuple of (pipeline_status, step_statuses).
             If include_steps is False, step_statuses will be None.
             If include_steps is True, step_statuses will be a dict (possibly empty).
-
-        Raises:
-            ValueError: If the orchestrator run ID cannot be found or if the
-                stack components are not accessible.
         """
-
-
-        if not orchestrator_run_id:
-            raise ValueError(
-                "Cannot determine orchestrator run ID for the run. "
-                "Unable to fetch the status."
-            )
+        pipeline_status = None
+        include_run_status = not run.status.is_finished
 
-
-
-
-
-
-
-            orchestrator_pod_phase
+        label_selector = f"run_id={kube_utils.sanitize_label(str(run.id))}"
+        try:
+            job_list = kube_utils.list_jobs(
+                batch_api=self._k8s_batch_api,
+                namespace=self.config.kubernetes_namespace,
+                label_selector=label_selector,
             )
-
-
-
-
-        step_statuses = None
-        if include_steps:
-            step_statuses = self._fetch_step_statuses(run)
+        except Exception as e:
+            logger.warning(f"Failed to list jobs for run {run.id}: {e}")
+            return None, None
 
-
+        step_statuses = {}
+        # Only fetch steps if we really need them
+        steps_dict = run.steps if include_steps else {}
 
-
-
-
-    ) -> kube_utils.PodPhase:
-        """Check pod status and handle deletion scenarios for both orchestrator and step pods.
+        for job in job_list.items:
+            if not job.metadata or not job.metadata.annotations:
+                continue
 
-
+            is_orchestrator_job = (
+                ORCHESTRATOR_ANNOTATION_KEY in job.metadata.annotations
+            )
+            if is_orchestrator_job:
+                if include_run_status:
+                    pipeline_status = self._map_job_status_to_execution_status(
+                        job
+                    )
+                continue
 
-
-
+            step_name = job.metadata.annotations.get(
+                STEP_NAME_ANNOTATION_KEY, None
+            )
+            if not include_steps or not step_name:
+                continue
 
-
-            The pod phase if the pod exists, or PodPhase.FAILED if pod was deleted.
-        """
-        pod = kube_utils.get_pod(
-            core_api=self._k8s_core_api,
-            pod_name=pod_name,
-            namespace=self.config.kubernetes_namespace,
-        )
+            step_response = steps_dict.get(step_name, None)
 
-
-
-                return kube_utils.PodPhase(pod.status.phase)
-            except ValueError:
-                # Handle unknown pod phases
-                logger.warning(
-                    f"Unknown pod phase for pod {pod_name}: {pod.status.phase}"
-                )
-                return kube_utils.PodPhase.UNKNOWN
-        else:
-            logger.warning(
-                f"Can't fetch the status of pod {pod_name} "
-                f"in namespace {self.config.kubernetes_namespace}."
-            )
-            return kube_utils.PodPhase.UNKNOWN
+            if step_response is None:
+                continue
 
-
-
-
-        """Map Kubernetes pod phase to ZenML execution status.
+            # If the step is already in a finished state, skip
+            if step_response and step_response.status.is_finished:
+                continue
 
-
-
+            execution_status = self._map_job_status_to_execution_status(job)
+            if execution_status is not None:
+                step_statuses[step_name] = execution_status
 
-
-            The corresponding ZenML execution status.
-        """
-        if pod_phase == kube_utils.PodPhase.PENDING:
-            return ExecutionStatus.INITIALIZING
-        elif pod_phase == kube_utils.PodPhase.RUNNING:
-            return ExecutionStatus.RUNNING
-        elif pod_phase == kube_utils.PodPhase.SUCCEEDED:
-            return ExecutionStatus.COMPLETED
-        elif pod_phase == kube_utils.PodPhase.FAILED:
-            return ExecutionStatus.FAILED
-        else:  # UNKNOWN - no update
-            return None
+        return pipeline_status, step_statuses
 
     def _map_job_status_to_execution_status(
         self, job: k8s_client.V1Job
@@ -925,7 +863,6 @@ class KubernetesOrchestrator(ContainerizedOrchestrator):
         Returns:
             The corresponding ZenML execution status, or None if no clear status.
         """
-        # Check job conditions first
         if job.status and job.status.conditions:
             for condition in job.status.conditions:
                 if condition.type == "Complete" and condition.status == "True":
@@ -936,61 +873,6 @@ class KubernetesOrchestrator(ContainerizedOrchestrator):
         # Return None if no clear status - don't update
         return None
 
-    def _fetch_step_statuses(
-        self, run: "PipelineRunResponse"
-    ) -> Dict[str, ExecutionStatus]:
-        """Fetch the statuses of individual pipeline steps.
-
-        Args:
-            run: The pipeline run response.
-
-        Returns:
-            A dictionary mapping step names to their execution statuses.
-        """
-        step_statuses = {}
-
-        # Query all jobs for this run and match them to steps
-        label_selector = f"run_id={kube_utils.sanitize_label(str(run.id))}"
-
-        try:
-            jobs = self._k8s_batch_api.list_namespaced_job(
-                namespace=self.config.kubernetes_namespace,
-                label_selector=label_selector,
-            )
-        except Exception as e:
-            logger.warning(f"Failed to list jobs for run {run.id}: {e}")
-            return {}
-
-        # Fetch the steps from the run response
-        steps_dict = run.steps
-
-        for job in jobs.items:
-            # Extract step name from job labels
-            if not job.metadata or not job.metadata.labels:
-                continue
-
-            step_name = job.metadata.labels.get("step_name")
-            if not step_name:
-                continue
-
-            # Check if this step is already finished
-            step_response = steps_dict.get(step_name, None)
-
-            # If the step is not in the run response yet, skip, we can't update
-            if step_response is None:
-                continue
-
-            # If the step is already in a finished state, skip
-            if step_response and step_response.status.is_finished:
-                continue
-
-            # Check job status and map to execution status
-            execution_status = self._map_job_status_to_execution_status(job)
-            if execution_status is not None:
-                step_statuses[step_name] = execution_status
-
-        return step_statuses
-
     def get_pipeline_run_metadata(
         self, run_id: UUID
     ) -> Dict[str, "MetadataType"]: