zenml-nightly 0.84.1.dev20250805__py3-none-any.whl → 0.84.1.dev20250806__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- zenml/VERSION +1 -1
- zenml/integrations/kubernetes/constants.py +27 -0
- zenml/integrations/kubernetes/flavors/kubernetes_orchestrator_flavor.py +79 -36
- zenml/integrations/kubernetes/flavors/kubernetes_step_operator_flavor.py +55 -24
- zenml/integrations/kubernetes/orchestrators/dag_runner.py +367 -0
- zenml/integrations/kubernetes/orchestrators/kube_utils.py +368 -1
- zenml/integrations/kubernetes/orchestrators/kubernetes_orchestrator.py +144 -262
- zenml/integrations/kubernetes/orchestrators/kubernetes_orchestrator_entrypoint.py +392 -244
- zenml/integrations/kubernetes/orchestrators/manifest_utils.py +53 -85
- zenml/integrations/kubernetes/step_operators/kubernetes_step_operator.py +74 -32
- zenml/logging/step_logging.py +33 -30
- zenml/steps/base_step.py +6 -6
- zenml/steps/step_decorator.py +4 -4
- {zenml_nightly-0.84.1.dev20250805.dist-info → zenml_nightly-0.84.1.dev20250806.dist-info}/METADATA +1 -1
- {zenml_nightly-0.84.1.dev20250805.dist-info → zenml_nightly-0.84.1.dev20250806.dist-info}/RECORD +18 -16
- {zenml_nightly-0.84.1.dev20250805.dist-info → zenml_nightly-0.84.1.dev20250806.dist-info}/LICENSE +0 -0
- {zenml_nightly-0.84.1.dev20250805.dist-info → zenml_nightly-0.84.1.dev20250806.dist-info}/WHEEL +0 -0
- {zenml_nightly-0.84.1.dev20250805.dist-info → zenml_nightly-0.84.1.dev20250806.dist-info}/entry_points.txt +0 -0
zenml/integrations/kubernetes/orchestrators/kubernetes_orchestrator.py

@@ -31,6 +31,7 @@
 """Kubernetes-native orchestrator."""
 
 import os
+import random
 from typing import (
     TYPE_CHECKING,
     Dict,
@@ -50,6 +51,13 @@ from zenml.constants import (
     METADATA_ORCHESTRATOR_RUN_ID,
 )
 from zenml.enums import ExecutionStatus, StackComponentType
+from zenml.integrations.kubernetes.constants import (
+    ENV_ZENML_KUBERNETES_RUN_ID,
+    KUBERNETES_CRON_JOB_METADATA_KEY,
+    KUBERNETES_SECRET_TOKEN_KEY_NAME,
+    ORCHESTRATOR_ANNOTATION_KEY,
+    STEP_NAME_ANNOTATION_KEY,
+)
 from zenml.integrations.kubernetes.flavors.kubernetes_orchestrator_flavor import (
     KubernetesOrchestratorConfig,
     KubernetesOrchestratorSettings,
@@ -60,14 +68,15 @@ from zenml.integrations.kubernetes.orchestrators.kubernetes_orchestrator_entrypo
 )
 from zenml.integrations.kubernetes.orchestrators.manifest_utils import (
     build_cron_job_manifest,
+    build_job_manifest,
     build_pod_manifest,
+    job_template_manifest_from_job,
+    pod_template_manifest_from_pod,
 )
-from zenml.integrations.kubernetes.pod_settings import KubernetesPodSettings
 from zenml.logger import get_logger
 from zenml.metadata.metadata_types import MetadataType
 from zenml.models.v2.core.schedule import ScheduleUpdate
 from zenml.orchestrators import ContainerizedOrchestrator, SubmissionResult
-from zenml.orchestrators.utils import get_orchestrator_run_name
 from zenml.stack import StackValidator
 
 if TYPE_CHECKING:
@@ -81,10 +90,6 @@ if TYPE_CHECKING:
 
 logger = get_logger(__name__)
 
-ENV_ZENML_KUBERNETES_RUN_ID = "ZENML_KUBERNETES_RUN_ID"
-KUBERNETES_SECRET_TOKEN_KEY_NAME = "zenml_api_token"
-KUBERNETES_CRON_JOB_METADATA_KEY = "cron_job_name"
-
 
 class KubernetesOrchestrator(ContainerizedOrchestrator):
     """Orchestrator for running ZenML pipelines using native Kubernetes."""
@@ -364,45 +369,6 @@ class KubernetesOrchestrator(ContainerizedOrchestrator):
             custom_validation_function=_validate_local_requirements,
         )
 
-    @classmethod
-    def apply_default_resource_requests(
-        cls,
-        memory: str,
-        cpu: Optional[str] = None,
-        pod_settings: Optional[KubernetesPodSettings] = None,
-    ) -> KubernetesPodSettings:
-        """Applies default resource requests to a pod settings object.
-
-        Args:
-            memory: The memory resource request.
-            cpu: The CPU resource request.
-            pod_settings: The pod settings to update. A new one will be created
-                if not provided.
-
-        Returns:
-            The new or updated pod settings.
-        """
-        resources = {
-            "requests": {"memory": memory},
-        }
-        if cpu:
-            resources["requests"]["cpu"] = cpu
-        if not pod_settings:
-            pod_settings = KubernetesPodSettings(resources=resources)
-        elif not pod_settings.resources:
-            # We can't update the pod settings in place (because it's a frozen
-            # pydantic model), so we have to create a new one.
-            pod_settings = KubernetesPodSettings(
-                **pod_settings.model_dump(exclude_unset=True),
-                resources=resources,
-            )
-        else:
-            set_requests = pod_settings.resources.get("requests", {})
-            resources["requests"].update(set_requests)
-            pod_settings.resources["requests"] = resources["requests"]
-
-        return pod_settings
-
     def get_token_secret_name(self, deployment_id: UUID) -> str:
         """Returns the name of the secret that contains the ZenML token.
 
@@ -463,27 +429,6 @@ class KubernetesOrchestrator(ContainerizedOrchestrator):
             KubernetesOrchestratorSettings, self.get_settings(deployment)
         )
 
-        # We already make sure the orchestrator run name has the correct length
-        # to make sure we don't cut off the randomized suffix later when
-        # sanitizing the pod name. This avoids any pod naming collisions.
-        max_length = kube_utils.calculate_max_pod_name_length_for_namespace(
-            namespace=self.config.kubernetes_namespace
-        )
-        orchestrator_run_name = get_orchestrator_run_name(
-            pipeline_name, max_length=max_length
-        )
-
-        if settings.pod_name_prefix:
-            pod_name = get_orchestrator_run_name(
-                settings.pod_name_prefix, max_length=max_length
-            )
-        else:
-            pod_name = orchestrator_run_name
-
-        pod_name = kube_utils.sanitize_pod_name(
-            pod_name, namespace=self.config.kubernetes_namespace
-        )
-
         assert stack.container_registry
 
         # Get Docker image for the orchestrator pod
@@ -514,7 +459,7 @@ class KubernetesOrchestrator(ContainerizedOrchestrator):
         # takes up some memory resources itself and, if not specified, the pod
         # will be scheduled on any node regardless of available memory and risk
         # negatively impacting or even crashing the node due to memory pressure.
-        orchestrator_pod_settings = self.apply_default_resource_requests(
+        orchestrator_pod_settings = kube_utils.apply_default_resource_requests(
             memory="400Mi",
             cpu="100m",
             pod_settings=settings.orchestrator_pod_settings,
@@ -550,10 +495,74 @@ class KubernetesOrchestrator(ContainerizedOrchestrator):
                 str(placeholder_run.id)
             )
             orchestrator_pod_labels["run_name"] = kube_utils.sanitize_label(
-
+                placeholder_run.name
             )
 
-
+        pod_manifest = build_pod_manifest(
+            pod_name=None,
+            image_name=image,
+            command=command,
+            args=args,
+            privileged=False,
+            pod_settings=orchestrator_pod_settings,
+            service_account_name=service_account_name,
+            env=environment,
+            labels=orchestrator_pod_labels,
+            mount_local_stores=self.config.is_local,
+            termination_grace_period_seconds=settings.pod_stop_grace_period,
+        )
+
+        pod_failure_policy = settings.pod_failure_policy or {
+            # These rules are applied sequentially. This means any failure in
+            # the main container will count towards the max retries. Any other
+            # disruption will not count towards the max retries.
+            "rules": [
+                # If the main container fails, we count it towards the max
+                # retries.
+                {
+                    "action": "Count",
+                    "onExitCodes": {
+                        "containerName": "main",
+                        "operator": "NotIn",
+                        "values": [0],
+                    },
+                },
+                # If the pod is interrupted at any other time, we don't count
+                # it as a retry
+                {
+                    "action": "Ignore",
+                    "onPodConditions": [
+                        {
+                            "type": "DisruptionTarget",
+                            "status": "True",
+                        }
+                    ],
+                },
+            ]
+        }
+
+        job_name = settings.job_name_prefix or ""
+        random_prefix = "".join(random.choices("0123456789abcdef", k=8))
+        job_name += (
+            f"-{random_prefix}-{deployment.pipeline_configuration.name}"
+        )
+        # The job name will be used as a label on the pods, so we need to make
+        # sure it doesn't exceed the label length limit
+        job_name = kube_utils.sanitize_label(job_name)
+
+        job_manifest = build_job_manifest(
+            job_name=job_name,
+            pod_template=pod_template_manifest_from_pod(pod_manifest),
+            backoff_limit=settings.orchestrator_job_backoff_limit,
+            ttl_seconds_after_finished=settings.ttl_seconds_after_finished,
+            active_deadline_seconds=settings.active_deadline_seconds,
+            pod_failure_policy=pod_failure_policy,
+            labels=orchestrator_pod_labels,
+            annotations={
+                ORCHESTRATOR_ANNOTATION_KEY: str(self.id),
+            },
+        )
+
         if deployment.schedule:
             if not deployment.schedule.cron_expression:
                 raise RuntimeError(
@@ -564,20 +573,9 @@ class KubernetesOrchestrator(ContainerizedOrchestrator):
             cron_expression = deployment.schedule.cron_expression
             cron_job_manifest = build_cron_job_manifest(
                 cron_expression=cron_expression,
-
-                image_name=image,
-                command=command,
-                args=args,
-                service_account_name=service_account_name,
-                privileged=False,
-                pod_settings=orchestrator_pod_settings,
-                env=environment,
-                mount_local_stores=self.config.is_local,
+                job_template=job_template_manifest_from_job(job_manifest),
                 successful_jobs_history_limit=settings.successful_jobs_history_limit,
                 failed_jobs_history_limit=settings.failed_jobs_history_limit,
-                ttl_seconds_after_finished=settings.ttl_seconds_after_finished,
-                termination_grace_period_seconds=settings.pod_stop_grace_period,
-                labels=orchestrator_pod_labels,
             )
 
             cron_job = self._k8s_batch_api.create_namespaced_cron_job(
@@ -585,8 +583,8 @@ class KubernetesOrchestrator(ContainerizedOrchestrator):
                 namespace=self.config.kubernetes_namespace,
             )
             logger.info(
-                f"
-                f
+                f"Created Kubernetes CronJob `{cron_job.metadata.name}` "
+                f"with CRON expression `{cron_expression}`."
             )
             return SubmissionResult(
                 metadata={
@@ -594,32 +592,11 @@ class KubernetesOrchestrator(ContainerizedOrchestrator):
                 }
             )
         else:
-            # Create and run the orchestrator pod.
-            pod_manifest = build_pod_manifest(
-                pod_name=pod_name,
-                image_name=image,
-                command=command,
-                args=args,
-                privileged=False,
-                pod_settings=orchestrator_pod_settings,
-                service_account_name=service_account_name,
-                env=environment,
-                labels=orchestrator_pod_labels,
-                mount_local_stores=self.config.is_local,
-                termination_grace_period_seconds=settings.pod_stop_grace_period,
-            )
-
             try:
-                kube_utils.
-
-                    pod_display_name="Kubernetes orchestrator pod",
-                    pod_name=pod_name,
-                    pod_manifest=pod_manifest,
+                kube_utils.create_job(
+                    batch_api=self._k8s_batch_api,
                     namespace=self.config.kubernetes_namespace,
-
-                    startup_failure_delay=settings.pod_failure_retry_delay,
-                    startup_failure_backoff=settings.pod_failure_backoff,
-                    startup_timeout=settings.pod_startup_timeout,
+                    job_manifest=job_manifest,
                 )
             except Exception as e:
                 if self.config.pass_zenml_token_as_secret:
@@ -638,40 +615,31 @@ class KubernetesOrchestrator(ContainerizedOrchestrator):
                 )
             raise e
 
-        metadata: Dict[str, MetadataType] = {
-            METADATA_ORCHESTRATOR_RUN_ID: pod_name,
-        }
-
-        # Wait for the orchestrator pod to finish and stream logs.
         if settings.synchronous:
 
             def _wait_for_run_to_finish() -> None:
-                logger.info(
-
-
-
-                    kube_client_fn=self.get_kube_client,
-                    pod_name=pod_name,
+                logger.info("Waiting for orchestrator job to finish...")
+                kube_utils.wait_for_job_to_finish(
+                    batch_api=self._k8s_batch_api,
+                    core_api=self._k8s_core_api,
                     namespace=self.config.kubernetes_namespace,
-
-
+                    job_name=job_name,
+                    backoff_interval=settings.job_monitoring_interval,
+                    fail_on_container_waiting_reasons=settings.fail_on_container_waiting_reasons,
                     stream_logs=True,
                 )
 
             return SubmissionResult(
-                metadata=metadata,
                 wait_for_completion=_wait_for_run_to_finish,
             )
         else:
             logger.info(
-                f"
-                f"`{self.config.kubernetes_namespace}:{pod_name}`. "
+                f"Orchestrator job `{job_name}` started. "
                 f"Run the following command to inspect the logs: "
-                f"`kubectl
-
-            return SubmissionResult(
-                metadata=metadata,
+                f"`kubectl -n {self.config.kubernetes_namespace} logs "
+                f"job/{job_name}`"
             )
+            return None
 
     def _get_service_account_name(
         self, settings: KubernetesOrchestratorSettings
@@ -744,7 +712,8 @@ class KubernetesOrchestrator(ContainerizedOrchestrator):
         # Find all jobs running steps of the pipeline
         label_selector = f"run_id={kube_utils.sanitize_label(str(run.id))}"
         try:
-
+            job_list = kube_utils.list_jobs(
+                batch_api=self._k8s_batch_api,
                 namespace=self.config.kubernetes_namespace,
                 label_selector=label_selector,
             )
@@ -753,8 +722,12 @@ class KubernetesOrchestrator(ContainerizedOrchestrator):
                 f"Failed to list step jobs with run ID {run.id}: {e}"
             )
 
-        for job in
-            if job
+        for job in job_list.items:
+            if not kube_utils.is_step_job(job):
+                # This is the orchestrator job which stops by itself
+                continue
+
+            if job.status and job.status.conditions:
                 # Don't delete completed/failed jobs
                 for condition in job.status.conditions:
                     if (
@@ -825,94 +798,59 @@ class KubernetesOrchestrator(ContainerizedOrchestrator):
             A tuple of (pipeline_status, step_statuses).
             If include_steps is False, step_statuses will be None.
             If include_steps is True, step_statuses will be a dict (possibly empty).
-
-        Raises:
-            ValueError: If the orchestrator run ID cannot be found or if the
-                stack components are not accessible.
         """
-
-
-        if not orchestrator_run_id:
-            raise ValueError(
-                "Cannot determine orchestrator run ID for the run. "
-                "Unable to fetch the status."
-            )
+        pipeline_status = None
+        include_run_status = not run.status.is_finished
 
-
-
-
-
-
-
-            orchestrator_pod_phase
+        label_selector = f"run_id={kube_utils.sanitize_label(str(run.id))}"
+        try:
+            job_list = kube_utils.list_jobs(
+                batch_api=self._k8s_batch_api,
+                namespace=self.config.kubernetes_namespace,
+                label_selector=label_selector,
             )
-
-
-
-
-        step_statuses = None
-        if include_steps:
-            step_statuses = self._fetch_step_statuses(run)
+        except Exception as e:
+            logger.warning(f"Failed to list jobs for run {run.id}: {e}")
+            return None, None
 
-
+        step_statuses = {}
+        # Only fetch steps if we really need them
+        steps_dict = run.steps if include_steps else {}
 
-
-
-
-    ) -> kube_utils.PodPhase:
-        """Check pod status and handle deletion scenarios for both orchestrator and step pods.
+        for job in job_list.items:
+            if not job.metadata or not job.metadata.annotations:
+                continue
 
-
+            is_orchestrator_job = (
+                ORCHESTRATOR_ANNOTATION_KEY in job.metadata.annotations
+            )
+            if is_orchestrator_job:
+                if include_run_status:
+                    pipeline_status = self._map_job_status_to_execution_status(
+                        job
+                    )
+                continue
 
-
-
+            step_name = job.metadata.annotations.get(
+                STEP_NAME_ANNOTATION_KEY, None
+            )
+            if not include_steps or not step_name:
+                continue
 
-
-            The pod phase if the pod exists, or PodPhase.FAILED if pod was deleted.
-        """
-        pod = kube_utils.get_pod(
-            core_api=self._k8s_core_api,
-            pod_name=pod_name,
-            namespace=self.config.kubernetes_namespace,
-        )
+            step_response = steps_dict.get(step_name, None)
 
-
-
-                return kube_utils.PodPhase(pod.status.phase)
-            except ValueError:
-                # Handle unknown pod phases
-                logger.warning(
-                    f"Unknown pod phase for pod {pod_name}: {pod.status.phase}"
-                )
-                return kube_utils.PodPhase.UNKNOWN
-        else:
-            logger.warning(
-                f"Can't fetch the status of pod {pod_name} "
-                f"in namespace {self.config.kubernetes_namespace}."
-            )
-            return kube_utils.PodPhase.UNKNOWN
+            if step_response is None:
+                continue
 
-
-
-
-        """Map Kubernetes pod phase to ZenML execution status.
+            # If the step is already in a finished state, skip
+            if step_response and step_response.status.is_finished:
+                continue
 
-
-
+            execution_status = self._map_job_status_to_execution_status(job)
+            if execution_status is not None:
+                step_statuses[step_name] = execution_status
 
-
-            The corresponding ZenML execution status.
-        """
-        if pod_phase == kube_utils.PodPhase.PENDING:
-            return ExecutionStatus.INITIALIZING
-        elif pod_phase == kube_utils.PodPhase.RUNNING:
-            return ExecutionStatus.RUNNING
-        elif pod_phase == kube_utils.PodPhase.SUCCEEDED:
-            return ExecutionStatus.COMPLETED
-        elif pod_phase == kube_utils.PodPhase.FAILED:
-            return ExecutionStatus.FAILED
-        else:  # UNKNOWN - no update
-            return None
+        return pipeline_status, step_statuses
 
     def _map_job_status_to_execution_status(
         self, job: k8s_client.V1Job
@@ -925,7 +863,6 @@ class KubernetesOrchestrator(ContainerizedOrchestrator):
         Returns:
             The corresponding ZenML execution status, or None if no clear status.
         """
-        # Check job conditions first
         if job.status and job.status.conditions:
             for condition in job.status.conditions:
                 if condition.type == "Complete" and condition.status == "True":
@@ -936,61 +873,6 @@ class KubernetesOrchestrator(ContainerizedOrchestrator):
         # Return None if no clear status - don't update
         return None
 
-    def _fetch_step_statuses(
-        self, run: "PipelineRunResponse"
-    ) -> Dict[str, ExecutionStatus]:
-        """Fetch the statuses of individual pipeline steps.
-
-        Args:
-            run: The pipeline run response.
-
-        Returns:
-            A dictionary mapping step names to their execution statuses.
-        """
-        step_statuses = {}
-
-        # Query all jobs for this run and match them to steps
-        label_selector = f"run_id={kube_utils.sanitize_label(str(run.id))}"
-
-        try:
-            jobs = self._k8s_batch_api.list_namespaced_job(
-                namespace=self.config.kubernetes_namespace,
-                label_selector=label_selector,
-            )
-        except Exception as e:
-            logger.warning(f"Failed to list jobs for run {run.id}: {e}")
-            return {}
-
-        # Fetch the steps from the run response
-        steps_dict = run.steps
-
-        for job in jobs.items:
-            # Extract step name from job labels
-            if not job.metadata or not job.metadata.labels:
-                continue
-
-            step_name = job.metadata.labels.get("step_name")
-            if not step_name:
-                continue
-
-            # Check if this step is already finished
-            step_response = steps_dict.get(step_name, None)
-
-            # If the step is not in the run response yet, skip, we can't update
-            if step_response is None:
-                continue
-
-            # If the step is already in a finished state, skip
-            if step_response and step_response.status.is_finished:
-                continue
-
-            # Check job status and map to execution status
-            execution_status = self._map_job_status_to_execution_status(job)
-            if execution_status is not None:
-                step_statuses[step_name] = execution_status
-
-        return step_statuses
-
     def get_pipeline_run_metadata(
         self, run_id: UUID
     ) -> Dict[str, "MetadataType"]: