zenml-nightly 0.84.1.dev20250804__py3-none-any.whl → 0.84.1.dev20250806__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -31,6 +31,7 @@
 """Kubernetes-native orchestrator."""
 
 import os
+import random
 from typing import (
     TYPE_CHECKING,
     Dict,
@@ -50,6 +51,13 @@ from zenml.constants import (
     METADATA_ORCHESTRATOR_RUN_ID,
 )
 from zenml.enums import ExecutionStatus, StackComponentType
+from zenml.integrations.kubernetes.constants import (
+    ENV_ZENML_KUBERNETES_RUN_ID,
+    KUBERNETES_CRON_JOB_METADATA_KEY,
+    KUBERNETES_SECRET_TOKEN_KEY_NAME,
+    ORCHESTRATOR_ANNOTATION_KEY,
+    STEP_NAME_ANNOTATION_KEY,
+)
 from zenml.integrations.kubernetes.flavors.kubernetes_orchestrator_flavor import (
     KubernetesOrchestratorConfig,
     KubernetesOrchestratorSettings,
@@ -60,14 +68,15 @@ from zenml.integrations.kubernetes.orchestrators.kubernetes_orchestrator_entrypo
 )
 from zenml.integrations.kubernetes.orchestrators.manifest_utils import (
     build_cron_job_manifest,
+    build_job_manifest,
     build_pod_manifest,
+    job_template_manifest_from_job,
+    pod_template_manifest_from_pod,
 )
-from zenml.integrations.kubernetes.pod_settings import KubernetesPodSettings
 from zenml.logger import get_logger
 from zenml.metadata.metadata_types import MetadataType
 from zenml.models.v2.core.schedule import ScheduleUpdate
 from zenml.orchestrators import ContainerizedOrchestrator, SubmissionResult
-from zenml.orchestrators.utils import get_orchestrator_run_name
 from zenml.stack import StackValidator
 
 if TYPE_CHECKING:
@@ -81,10 +90,6 @@ if TYPE_CHECKING:
 
 logger = get_logger(__name__)
 
-ENV_ZENML_KUBERNETES_RUN_ID = "ZENML_KUBERNETES_RUN_ID"
-KUBERNETES_SECRET_TOKEN_KEY_NAME = "zenml_api_token"
-KUBERNETES_CRON_JOB_METADATA_KEY = "cron_job_name"
-
 
 class KubernetesOrchestrator(ContainerizedOrchestrator):
     """Orchestrator for running ZenML pipelines using native Kubernetes."""
@@ -364,45 +369,6 @@ class KubernetesOrchestrator(ContainerizedOrchestrator):
             custom_validation_function=_validate_local_requirements,
         )
 
-    @classmethod
-    def apply_default_resource_requests(
-        cls,
-        memory: str,
-        cpu: Optional[str] = None,
-        pod_settings: Optional[KubernetesPodSettings] = None,
-    ) -> KubernetesPodSettings:
-        """Applies default resource requests to a pod settings object.
-
-        Args:
-            memory: The memory resource request.
-            cpu: The CPU resource request.
-            pod_settings: The pod settings to update. A new one will be created
-                if not provided.
-
-        Returns:
-            The new or updated pod settings.
-        """
-        resources = {
-            "requests": {"memory": memory},
-        }
-        if cpu:
-            resources["requests"]["cpu"] = cpu
-        if not pod_settings:
-            pod_settings = KubernetesPodSettings(resources=resources)
-        elif not pod_settings.resources:
-            # We can't update the pod settings in place (because it's a frozen
-            # pydantic model), so we have to create a new one.
-            pod_settings = KubernetesPodSettings(
-                **pod_settings.model_dump(exclude_unset=True),
-                resources=resources,
-            )
-        else:
-            set_requests = pod_settings.resources.get("requests", {})
-            resources["requests"].update(set_requests)
-            pod_settings.resources["requests"] = resources["requests"]
-
-        return pod_settings
-
     def get_token_secret_name(self, deployment_id: UUID) -> str:
         """Returns the name of the secret that contains the ZenML token.
 
@@ -463,27 +429,6 @@ class KubernetesOrchestrator(ContainerizedOrchestrator):
             KubernetesOrchestratorSettings, self.get_settings(deployment)
         )
 
-        # We already make sure the orchestrator run name has the correct length
-        # to make sure we don't cut off the randomized suffix later when
-        # sanitizing the pod name. This avoids any pod naming collisions.
-        max_length = kube_utils.calculate_max_pod_name_length_for_namespace(
-            namespace=self.config.kubernetes_namespace
-        )
-        orchestrator_run_name = get_orchestrator_run_name(
-            pipeline_name, max_length=max_length
-        )
-
-        if settings.pod_name_prefix:
-            pod_name = get_orchestrator_run_name(
-                settings.pod_name_prefix, max_length=max_length
-            )
-        else:
-            pod_name = orchestrator_run_name
-
-        pod_name = kube_utils.sanitize_pod_name(
-            pod_name, namespace=self.config.kubernetes_namespace
-        )
-
         assert stack.container_registry
 
         # Get Docker image for the orchestrator pod
@@ -514,7 +459,7 @@ class KubernetesOrchestrator(ContainerizedOrchestrator):
         # takes up some memory resources itself and, if not specified, the pod
         # will be scheduled on any node regardless of available memory and risk
         # negatively impacting or even crashing the node due to memory pressure.
-        orchestrator_pod_settings = self.apply_default_resource_requests(
+        orchestrator_pod_settings = kube_utils.apply_default_resource_requests(
            memory="400Mi",
            cpu="100m",
            pod_settings=settings.orchestrator_pod_settings,
@@ -550,10 +495,74 @@ class KubernetesOrchestrator(ContainerizedOrchestrator):
                 str(placeholder_run.id)
             )
             orchestrator_pod_labels["run_name"] = kube_utils.sanitize_label(
-                str(placeholder_run.name)
+                placeholder_run.name
             )
 
-        # Schedule as CRON job if CRON schedule is given.
+        pod_manifest = build_pod_manifest(
+            pod_name=None,
+            image_name=image,
+            command=command,
+            args=args,
+            privileged=False,
+            pod_settings=orchestrator_pod_settings,
+            service_account_name=service_account_name,
+            env=environment,
+            labels=orchestrator_pod_labels,
+            mount_local_stores=self.config.is_local,
+            termination_grace_period_seconds=settings.pod_stop_grace_period,
+        )
+
+        pod_failure_policy = settings.pod_failure_policy or {
+            # These rules are applied sequentially. This means any failure in
+            # the main container will count towards the max retries. Any other
+            # disruption will not count towards the max retries.
+            "rules": [
+                # If the main container fails, we count it towards the max
+                # retries.
+                {
+                    "action": "Count",
+                    "onExitCodes": {
+                        "containerName": "main",
+                        "operator": "NotIn",
+                        "values": [0],
+                    },
+                },
+                # If the pod is interrupted at any other time, we don't count
+                # it as a retry
+                {
+                    "action": "Ignore",
+                    "onPodConditions": [
+                        {
+                            "type": "DisruptionTarget",
+                            "status": "True",
+                        }
+                    ],
+                },
+            ]
+        }
+
+        job_name = settings.job_name_prefix or ""
+        random_prefix = "".join(random.choices("0123456789abcdef", k=8))
+        job_name += (
+            f"-{random_prefix}-{deployment.pipeline_configuration.name}"
+        )
+        # The job name will be used as a label on the pods, so we need to make
+        # sure it doesn't exceed the label length limit
+        job_name = kube_utils.sanitize_label(job_name)
+
+        job_manifest = build_job_manifest(
+            job_name=job_name,
+            pod_template=pod_template_manifest_from_pod(pod_manifest),
+            backoff_limit=settings.orchestrator_job_backoff_limit,
+            ttl_seconds_after_finished=settings.ttl_seconds_after_finished,
+            active_deadline_seconds=settings.active_deadline_seconds,
+            pod_failure_policy=pod_failure_policy,
+            labels=orchestrator_pod_labels,
+            annotations={
+                ORCHESTRATOR_ANNOTATION_KEY: str(self.id),
+            },
+        )
+
         if deployment.schedule:
             if not deployment.schedule.cron_expression:
                 raise RuntimeError(
@@ -564,20 +573,9 @@ class KubernetesOrchestrator(ContainerizedOrchestrator):
             cron_expression = deployment.schedule.cron_expression
             cron_job_manifest = build_cron_job_manifest(
                 cron_expression=cron_expression,
-                pod_name=pod_name,
-                image_name=image,
-                command=command,
-                args=args,
-                service_account_name=service_account_name,
-                privileged=False,
-                pod_settings=orchestrator_pod_settings,
-                env=environment,
-                mount_local_stores=self.config.is_local,
+                job_template=job_template_manifest_from_job(job_manifest),
                 successful_jobs_history_limit=settings.successful_jobs_history_limit,
                 failed_jobs_history_limit=settings.failed_jobs_history_limit,
-                ttl_seconds_after_finished=settings.ttl_seconds_after_finished,
-                termination_grace_period_seconds=settings.pod_stop_grace_period,
-                labels=orchestrator_pod_labels,
             )
 
             cron_job = self._k8s_batch_api.create_namespaced_cron_job(
@@ -585,8 +583,8 @@ class KubernetesOrchestrator(ContainerizedOrchestrator):
                 namespace=self.config.kubernetes_namespace,
             )
             logger.info(
-                f"Scheduling Kubernetes run `{pod_name}` with CRON expression "
-                f'`"{cron_expression}"`.'
+                f"Created Kubernetes CronJob `{cron_job.metadata.name}` "
+                f"with CRON expression `{cron_expression}`."
             )
             return SubmissionResult(
                 metadata={
@@ -594,32 +592,11 @@ class KubernetesOrchestrator(ContainerizedOrchestrator):
                 }
             )
         else:
-            # Create and run the orchestrator pod.
-            pod_manifest = build_pod_manifest(
-                pod_name=pod_name,
-                image_name=image,
-                command=command,
-                args=args,
-                privileged=False,
-                pod_settings=orchestrator_pod_settings,
-                service_account_name=service_account_name,
-                env=environment,
-                labels=orchestrator_pod_labels,
-                mount_local_stores=self.config.is_local,
-                termination_grace_period_seconds=settings.pod_stop_grace_period,
-            )
-
             try:
-                kube_utils.create_and_wait_for_pod_to_start(
-                    core_api=self._k8s_core_api,
-                    pod_display_name="Kubernetes orchestrator pod",
-                    pod_name=pod_name,
-                    pod_manifest=pod_manifest,
+                kube_utils.create_job(
+                    batch_api=self._k8s_batch_api,
                     namespace=self.config.kubernetes_namespace,
-                    startup_max_retries=settings.pod_failure_max_retries,
-                    startup_failure_delay=settings.pod_failure_retry_delay,
-                    startup_failure_backoff=settings.pod_failure_backoff,
-                    startup_timeout=settings.pod_startup_timeout,
+                    job_manifest=job_manifest,
                 )
             except Exception as e:
                 if self.config.pass_zenml_token_as_secret:
@@ -638,40 +615,31 @@ class KubernetesOrchestrator(ContainerizedOrchestrator):
                         )
                 raise e
 
-            metadata: Dict[str, MetadataType] = {
-                METADATA_ORCHESTRATOR_RUN_ID: pod_name,
-            }
-
-            # Wait for the orchestrator pod to finish and stream logs.
             if settings.synchronous:
 
                 def _wait_for_run_to_finish() -> None:
-                    logger.info(
-                        "Waiting for Kubernetes orchestrator pod to finish..."
-                    )
-                    kube_utils.wait_pod(
-                        kube_client_fn=self.get_kube_client,
-                        pod_name=pod_name,
+                    logger.info("Waiting for orchestrator job to finish...")
+                    kube_utils.wait_for_job_to_finish(
+                        batch_api=self._k8s_batch_api,
+                        core_api=self._k8s_core_api,
                         namespace=self.config.kubernetes_namespace,
-                        exit_condition_lambda=kube_utils.pod_is_done,
-                        timeout_sec=settings.timeout,
+                        job_name=job_name,
+                        backoff_interval=settings.job_monitoring_interval,
+                        fail_on_container_waiting_reasons=settings.fail_on_container_waiting_reasons,
                         stream_logs=True,
                     )
 
                 return SubmissionResult(
-                    metadata=metadata,
                     wait_for_completion=_wait_for_run_to_finish,
                 )
             else:
                 logger.info(
-                    f"Orchestration started asynchronously in pod "
-                    f"`{self.config.kubernetes_namespace}:{pod_name}`. "
+                    f"Orchestrator job `{job_name}` started. "
                     f"Run the following command to inspect the logs: "
-                    f"`kubectl logs {pod_name} -n {self.config.kubernetes_namespace}`."
-                )
-                return SubmissionResult(
-                    metadata=metadata,
+                    f"`kubectl -n {self.config.kubernetes_namespace} logs "
+                    f"job/{job_name}`"
                 )
+                return None
 
     def _get_service_account_name(
         self, settings: KubernetesOrchestratorSettings
@@ -744,7 +712,8 @@ class KubernetesOrchestrator(ContainerizedOrchestrator):
         # Find all jobs running steps of the pipeline
         label_selector = f"run_id={kube_utils.sanitize_label(str(run.id))}"
         try:
-            jobs = self._k8s_batch_api.list_namespaced_job(
+            job_list = kube_utils.list_jobs(
+                batch_api=self._k8s_batch_api,
                 namespace=self.config.kubernetes_namespace,
                 label_selector=label_selector,
            )
@@ -753,8 +722,12 @@ class KubernetesOrchestrator(ContainerizedOrchestrator):
                 f"Failed to list step jobs with run ID {run.id}: {e}"
             )
 
-        for job in jobs.items:
-            if job.status.conditions:
+        for job in job_list.items:
+            if not kube_utils.is_step_job(job):
+                # This is the orchestrator job which stops by itself
+                continue
+
+            if job.status and job.status.conditions:
                 # Don't delete completed/failed jobs
                 for condition in job.status.conditions:
                     if (
@@ -825,94 +798,59 @@ class KubernetesOrchestrator(ContainerizedOrchestrator):
             A tuple of (pipeline_status, step_statuses).
            If include_steps is False, step_statuses will be None.
            If include_steps is True, step_statuses will be a dict (possibly empty).
-
-        Raises:
-            ValueError: If the orchestrator run ID cannot be found or if the
-                stack components are not accessible.
         """
-        # Get the orchestrator run ID which corresponds to the orchestrator pod name
-        orchestrator_run_id = run.orchestrator_run_id
-        if not orchestrator_run_id:
-            raise ValueError(
-                "Cannot determine orchestrator run ID for the run. "
-                "Unable to fetch the status."
-            )
+        pipeline_status = None
+        include_run_status = not run.status.is_finished
 
-        # Check the orchestrator pod status (only if run is not finished)
-        if not run.status.is_finished:
-            orchestrator_pod_phase = self._check_pod_status(
-                pod_name=orchestrator_run_id,
-            )
-            pipeline_status = self._map_pod_phase_to_execution_status(
-                orchestrator_pod_phase
+        label_selector = f"run_id={kube_utils.sanitize_label(str(run.id))}"
+        try:
+            job_list = kube_utils.list_jobs(
+                batch_api=self._k8s_batch_api,
+                namespace=self.config.kubernetes_namespace,
+                label_selector=label_selector,
            )
-        else:
-            # Run is already finished, don't change status
-            pipeline_status = None
-
-        step_statuses = None
-        if include_steps:
-            step_statuses = self._fetch_step_statuses(run)
+        except Exception as e:
+            logger.warning(f"Failed to list jobs for run {run.id}: {e}")
+            return None, None
 
-        return pipeline_status, step_statuses
+        step_statuses = {}
+        # Only fetch steps if we really need them
+        steps_dict = run.steps if include_steps else {}
 
-    def _check_pod_status(
-        self,
-        pod_name: str,
-    ) -> kube_utils.PodPhase:
-        """Check pod status and handle deletion scenarios for both orchestrator and step pods.
+        for job in job_list.items:
+            if not job.metadata or not job.metadata.annotations:
+                continue
 
-        This method should only be called for non-finished pipeline runs/steps.
+            is_orchestrator_job = (
+                ORCHESTRATOR_ANNOTATION_KEY in job.metadata.annotations
+            )
+            if is_orchestrator_job:
+                if include_run_status:
+                    pipeline_status = self._map_job_status_to_execution_status(
+                        job
+                    )
+                continue
 
-        Args:
-            pod_name: The name of the pod to check.
+            step_name = job.metadata.annotations.get(
+                STEP_NAME_ANNOTATION_KEY, None
+            )
+            if not include_steps or not step_name:
+                continue
 
-        Returns:
-            The pod phase if the pod exists, or PodPhase.FAILED if pod was deleted.
-        """
-        pod = kube_utils.get_pod(
-            core_api=self._k8s_core_api,
-            pod_name=pod_name,
-            namespace=self.config.kubernetes_namespace,
-        )
+            step_response = steps_dict.get(step_name, None)
 
-        if pod and pod.status and pod.status.phase:
-            try:
-                return kube_utils.PodPhase(pod.status.phase)
-            except ValueError:
-                # Handle unknown pod phases
-                logger.warning(
-                    f"Unknown pod phase for pod {pod_name}: {pod.status.phase}"
-                )
-                return kube_utils.PodPhase.UNKNOWN
-        else:
-            logger.warning(
-                f"Can't fetch the status of pod {pod_name} "
-                f"in namespace {self.config.kubernetes_namespace}."
-            )
-            return kube_utils.PodPhase.UNKNOWN
+            if step_response is None:
+                continue
 
-    def _map_pod_phase_to_execution_status(
-        self, pod_phase: kube_utils.PodPhase
-    ) -> Optional[ExecutionStatus]:
-        """Map Kubernetes pod phase to ZenML execution status.
+            # If the step is already in a finished state, skip
+            if step_response and step_response.status.is_finished:
+                continue
 
-        Args:
-            pod_phase: The Kubernetes pod phase.
+            execution_status = self._map_job_status_to_execution_status(job)
+            if execution_status is not None:
+                step_statuses[step_name] = execution_status
 
-        Returns:
-            The corresponding ZenML execution status.
-        """
-        if pod_phase == kube_utils.PodPhase.PENDING:
-            return ExecutionStatus.INITIALIZING
-        elif pod_phase == kube_utils.PodPhase.RUNNING:
-            return ExecutionStatus.RUNNING
-        elif pod_phase == kube_utils.PodPhase.SUCCEEDED:
-            return ExecutionStatus.COMPLETED
-        elif pod_phase == kube_utils.PodPhase.FAILED:
-            return ExecutionStatus.FAILED
-        else:  # UNKNOWN - no update
-            return None
+        return pipeline_status, step_statuses
 
     def _map_job_status_to_execution_status(
         self, job: k8s_client.V1Job
@@ -925,7 +863,6 @@ class KubernetesOrchestrator(ContainerizedOrchestrator):
         Returns:
             The corresponding ZenML execution status, or None if no clear status.
         """
-        # Check job conditions first
        if job.status and job.status.conditions:
             for condition in job.status.conditions:
                 if condition.type == "Complete" and condition.status == "True":
@@ -936,61 +873,6 @@ class KubernetesOrchestrator(ContainerizedOrchestrator):
         # Return None if no clear status - don't update
         return None
 
-    def _fetch_step_statuses(
-        self, run: "PipelineRunResponse"
-    ) -> Dict[str, ExecutionStatus]:
-        """Fetch the statuses of individual pipeline steps.
-
-        Args:
-            run: The pipeline run response.
-
-        Returns:
-            A dictionary mapping step names to their execution statuses.
-        """
-        step_statuses = {}
-
-        # Query all jobs for this run and match them to steps
-        label_selector = f"run_id={kube_utils.sanitize_label(str(run.id))}"
-
-        try:
-            jobs = self._k8s_batch_api.list_namespaced_job(
-                namespace=self.config.kubernetes_namespace,
-                label_selector=label_selector,
-            )
-        except Exception as e:
-            logger.warning(f"Failed to list jobs for run {run.id}: {e}")
-            return {}
-
-        # Fetch the steps from the run response
-        steps_dict = run.steps
-
-        for job in jobs.items:
-            # Extract step name from job labels
-            if not job.metadata or not job.metadata.labels:
-                continue
-
-            step_name = job.metadata.labels.get("step_name")
-            if not step_name:
-                continue
-
-            # Check if this step is already finished
-            step_response = steps_dict.get(step_name, None)
-
-            # If the step is not in the run response yet, skip, we can't update
-            if step_response is None:
-                continue
-
-            # If the step is already in a finished state, skip
-            if step_response and step_response.status.is_finished:
-                continue
-
-            # Check job status and map to execution status
-            execution_status = self._map_job_status_to_execution_status(job)
-            if execution_status is not None:
-                step_statuses[step_name] = execution_status
-
-        return step_statuses
-
     def get_pipeline_run_metadata(
         self, run_id: UUID
     ) -> Dict[str, "MetadataType"]:
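Note on the change above: the orchestrator pod is now wrapped in a Kubernetes Job whose default pod failure policy counts only non-zero exits of the main container against the backoff limit and ignores pod disruptions. The sketch below shows roughly what such a Job spec looks like when built directly with the official kubernetes Python client; it is an illustration only (the helper name example_orchestrator_job, the image, the command, and the numeric limits are placeholders, not ZenML's build_job_manifest or its settings defaults), and it assumes a client and cluster recent enough to support podFailurePolicy (Kubernetes 1.26+).

# Minimal, self-contained sketch with assumed placeholder values.
from kubernetes import client, config


def example_orchestrator_job() -> client.V1Job:
    """Build a Job roughly equivalent to the manifest assembled in the diff."""
    pod_template = client.V1PodTemplateSpec(
        metadata=client.V1ObjectMeta(labels={"run_id": "example-run"}),
        spec=client.V1PodSpec(
            # podFailurePolicy requires the pod template's restartPolicy
            # to be "Never".
            restart_policy="Never",
            containers=[
                client.V1Container(
                    # Must match the containerName in the onExitCodes rule.
                    name="main",
                    image="example.io/zenml-pipeline:latest",  # placeholder
                    command=["python", "-m", "example_entrypoint"],  # placeholder
                )
            ],
        ),
    )

    pod_failure_policy = client.V1PodFailurePolicy(
        rules=[
            # Non-zero exits of the main container count towards backoffLimit.
            client.V1PodFailurePolicyRule(
                action="Count",
                on_exit_codes=client.V1PodFailurePolicyOnExitCodesRequirement(
                    container_name="main", operator="NotIn", values=[0]
                ),
            ),
            # Disruptions (eviction, preemption, node drain) are ignored and
            # do not consume retries.
            client.V1PodFailurePolicyRule(
                action="Ignore",
                on_pod_conditions=[
                    client.V1PodFailurePolicyOnPodConditionsPattern(
                        type="DisruptionTarget", status="True"
                    )
                ],
            ),
        ]
    )

    return client.V1Job(
        api_version="batch/v1",
        kind="Job",
        metadata=client.V1ObjectMeta(generate_name="zenml-orchestrator-"),
        spec=client.V1JobSpec(
            template=pod_template,
            backoff_limit=3,  # placeholder for orchestrator_job_backoff_limit
            ttl_seconds_after_finished=3600,  # placeholder
            pod_failure_policy=pod_failure_policy,
        ),
    )


if __name__ == "__main__":
    config.load_kube_config()  # assumes a configured kubeconfig
    job = example_orchestrator_job()
    client.BatchV1Api().create_namespaced_job(namespace="zenml", body=job)

Tying retries to the main container's exit code while ignoring DisruptionTarget conditions keeps evictions and node drains from consuming the retry budget, which matches the rationale given in the in-code comments of the diff.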