zenml-nightly 0.80.2.dev20250414__py3-none-any.whl → 0.80.2.dev20250416__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54) hide show
  1. zenml/VERSION +1 -1
  2. zenml/artifacts/utils.py +7 -2
  3. zenml/cli/utils.py +13 -11
  4. zenml/config/compiler.py +1 -0
  5. zenml/config/global_config.py +1 -1
  6. zenml/config/pipeline_configurations.py +1 -0
  7. zenml/config/pipeline_run_configuration.py +1 -0
  8. zenml/config/server_config.py +7 -0
  9. zenml/constants.py +8 -0
  10. zenml/integrations/gcp/orchestrators/vertex_orchestrator.py +47 -5
  11. zenml/integrations/gcp/vertex_custom_job_parameters.py +15 -1
  12. zenml/integrations/kubernetes/flavors/kubernetes_step_operator_flavor.py +12 -0
  13. zenml/integrations/kubernetes/orchestrators/kube_utils.py +92 -0
  14. zenml/integrations/kubernetes/orchestrators/kubernetes_orchestrator.py +12 -3
  15. zenml/integrations/kubernetes/orchestrators/kubernetes_orchestrator_entrypoint.py +11 -65
  16. zenml/integrations/kubernetes/step_operators/kubernetes_step_operator.py +11 -3
  17. zenml/logging/step_logging.py +41 -21
  18. zenml/login/credentials_store.py +31 -0
  19. zenml/materializers/path_materializer.py +17 -2
  20. zenml/models/v2/base/base.py +8 -4
  21. zenml/models/v2/base/filter.py +1 -1
  22. zenml/models/v2/core/pipeline_run.py +19 -0
  23. zenml/orchestrators/step_launcher.py +2 -3
  24. zenml/orchestrators/step_runner.py +2 -2
  25. zenml/orchestrators/utils.py +2 -5
  26. zenml/pipelines/pipeline_context.py +1 -0
  27. zenml/pipelines/pipeline_decorator.py +4 -0
  28. zenml/pipelines/pipeline_definition.py +83 -22
  29. zenml/pipelines/run_utils.py +4 -0
  30. zenml/steps/utils.py +1 -1
  31. zenml/utils/io_utils.py +23 -0
  32. zenml/zen_server/auth.py +96 -64
  33. zenml/zen_server/cloud_utils.py +7 -1
  34. zenml/zen_server/download_utils.py +123 -0
  35. zenml/zen_server/jwt.py +0 -14
  36. zenml/zen_server/rbac/rbac_interface.py +10 -3
  37. zenml/zen_server/rbac/utils.py +13 -3
  38. zenml/zen_server/rbac/zenml_cloud_rbac.py +14 -8
  39. zenml/zen_server/routers/artifact_version_endpoints.py +86 -3
  40. zenml/zen_server/routers/auth_endpoints.py +5 -36
  41. zenml/zen_server/routers/pipeline_deployments_endpoints.py +63 -26
  42. zenml/zen_server/routers/runs_endpoints.py +57 -0
  43. zenml/zen_server/routers/users_endpoints.py +13 -8
  44. zenml/zen_server/template_execution/utils.py +3 -3
  45. zenml/zen_stores/migrations/versions/ff538a321a92_migrate_onboarding_state.py +123 -0
  46. zenml/zen_stores/rest_zen_store.py +16 -13
  47. zenml/zen_stores/schemas/pipeline_run_schemas.py +1 -0
  48. zenml/zen_stores/schemas/server_settings_schemas.py +4 -1
  49. zenml/zen_stores/sql_zen_store.py +18 -0
  50. {zenml_nightly-0.80.2.dev20250414.dist-info → zenml_nightly-0.80.2.dev20250416.dist-info}/METADATA +2 -1
  51. {zenml_nightly-0.80.2.dev20250414.dist-info → zenml_nightly-0.80.2.dev20250416.dist-info}/RECORD +54 -52
  52. {zenml_nightly-0.80.2.dev20250414.dist-info → zenml_nightly-0.80.2.dev20250416.dist-info}/LICENSE +0 -0
  53. {zenml_nightly-0.80.2.dev20250414.dist-info → zenml_nightly-0.80.2.dev20250416.dist-info}/WHEEL +0 -0
  54. {zenml_nightly-0.80.2.dev20250414.dist-info → zenml_nightly-0.80.2.dev20250416.dist-info}/entry_points.txt +0 -0
zenml/VERSION CHANGED
@@ -1 +1 @@
1
- 0.80.2.dev20250414
1
+ 0.80.2.dev20250416
zenml/artifacts/utils.py CHANGED
@@ -35,7 +35,9 @@ from zenml.artifacts.preexisting_data_materializer import (
35
35
  PreexistingDataMaterializer,
36
36
  )
37
37
  from zenml.client import Client
38
- from zenml.constants import MODEL_METADATA_YAML_FILE_NAME
38
+ from zenml.constants import (
39
+ MODEL_METADATA_YAML_FILE_NAME,
40
+ )
39
41
  from zenml.enums import (
40
42
  ArtifactSaveType,
41
43
  ArtifactType,
@@ -43,7 +45,10 @@ from zenml.enums import (
43
45
  StackComponentType,
44
46
  VisualizationType,
45
47
  )
46
- from zenml.exceptions import DoesNotExistException, StepContextError
48
+ from zenml.exceptions import (
49
+ DoesNotExistException,
50
+ StepContextError,
51
+ )
47
52
  from zenml.io import fileio
48
53
  from zenml.logger import get_logger
49
54
  from zenml.metadata.metadata_types import validate_metadata
zenml/cli/utils.py CHANGED
@@ -308,13 +308,13 @@ def print_pydantic_models(
308
308
  if isinstance(model, BaseIdentifiedResponse):
309
309
  include_columns = ["id"]
310
310
 
311
- if "name" in model.model_fields:
311
+ if "name" in type(model).model_fields:
312
312
  include_columns.append("name")
313
313
 
314
314
  include_columns.extend(
315
315
  [
316
316
  k
317
- for k in model.get_body().model_fields.keys()
317
+ for k in type(model.get_body()).model_fields.keys()
318
318
  if k not in exclude_columns
319
319
  ]
320
320
  )
@@ -323,7 +323,9 @@ def print_pydantic_models(
323
323
  include_columns.extend(
324
324
  [
325
325
  k
326
- for k in model.get_metadata().model_fields.keys()
326
+ for k in type(
327
+ model.get_metadata()
328
+ ).model_fields.keys()
327
329
  if k not in exclude_columns
328
330
  ]
329
331
  )
@@ -347,7 +349,7 @@ def print_pydantic_models(
347
349
  # we want to attempt to represent them by name, if they contain
348
350
  # such a field, else the id is used
349
351
  if isinstance(value, BaseIdentifiedResponse):
350
- if "name" in value.model_fields:
352
+ if "name" in type(value).model_fields:
351
353
  items[k] = str(getattr(value, "name"))
352
354
  else:
353
355
  items[k] = str(value.id)
@@ -357,7 +359,7 @@ def print_pydantic_models(
357
359
  elif isinstance(value, list):
358
360
  for v in value:
359
361
  if isinstance(v, BaseIdentifiedResponse):
360
- if "name" in v.model_fields:
362
+ if "name" in type(v).model_fields:
361
363
  items.setdefault(k, []).append(
362
364
  str(getattr(v, "name"))
363
365
  )
@@ -448,13 +450,13 @@ def print_pydantic_model(
448
450
  if isinstance(model, BaseIdentifiedResponse):
449
451
  include_columns = ["id"]
450
452
 
451
- if "name" in model.model_fields:
453
+ if "name" in type(model).model_fields:
452
454
  include_columns.append("name")
453
455
 
454
456
  include_columns.extend(
455
457
  [
456
458
  k
457
- for k in model.get_body().model_fields.keys()
459
+ for k in type(model.get_body()).model_fields.keys()
458
460
  if k not in exclude_columns
459
461
  ]
460
462
  )
@@ -463,7 +465,7 @@ def print_pydantic_model(
463
465
  include_columns.extend(
464
466
  [
465
467
  k
466
- for k in model.get_metadata().model_fields.keys()
468
+ for k in type(model.get_metadata()).model_fields.keys()
467
469
  if k not in exclude_columns
468
470
  ]
469
471
  )
@@ -482,7 +484,7 @@ def print_pydantic_model(
482
484
  for k in include_columns:
483
485
  value = getattr(model, k)
484
486
  if isinstance(value, BaseIdentifiedResponse):
485
- if "name" in value.model_fields:
487
+ if "name" in type(value).model_fields:
486
488
  items[k] = str(getattr(value, "name"))
487
489
  else:
488
490
  items[k] = str(value.id)
@@ -492,7 +494,7 @@ def print_pydantic_model(
492
494
  elif isinstance(value, list):
493
495
  for v in value:
494
496
  if isinstance(v, BaseIdentifiedResponse):
495
- if "name" in v.model_fields:
497
+ if "name" in type(v).model_fields:
496
498
  items.setdefault(k, []).append(str(getattr(v, "name")))
497
499
  else:
498
500
  items.setdefault(k, []).append(str(v.id))
@@ -2138,7 +2140,7 @@ def _scrub_secret(config: StackComponentConfig) -> Dict[str, Any]:
2138
2140
  A configuration with secret values removed.
2139
2141
  """
2140
2142
  config_dict = {}
2141
- config_fields = config.__class__.model_fields
2143
+ config_fields = type(config).model_fields
2142
2144
  for key, value in config_fields.items():
2143
2145
  if getattr(config, key):
2144
2146
  if secret_utils.is_secret_field(value):
zenml/config/compiler.py CHANGED
@@ -210,6 +210,7 @@ class Compiler:
210
210
  enable_artifact_metadata=config.enable_artifact_metadata,
211
211
  enable_artifact_visualization=config.enable_artifact_visualization,
212
212
  enable_step_logs=config.enable_step_logs,
213
+ enable_pipeline_logs=config.enable_pipeline_logs,
213
214
  settings=config.settings,
214
215
  tags=config.tags,
215
216
  extra=config.extra,
@@ -447,7 +447,7 @@ class GlobalConfiguration(BaseModel, metaclass=GlobalConfigMetaClass):
447
447
  """
448
448
  environment_vars = {}
449
449
 
450
- for key in self.model_fields.keys():
450
+ for key in type(self).model_fields.keys():
451
451
  if key == "store":
452
452
  # The store configuration uses its own environment variable
453
453
  # naming scheme
@@ -41,6 +41,7 @@ class PipelineConfigurationUpdate(StrictBaseModel):
41
41
  enable_artifact_metadata: Optional[bool] = None
42
42
  enable_artifact_visualization: Optional[bool] = None
43
43
  enable_step_logs: Optional[bool] = None
44
+ enable_pipeline_logs: Optional[bool] = None
44
45
  settings: Dict[str, SerializeAsAny[BaseSettings]] = {}
45
46
  tags: Optional[List[Union[str, "Tag"]]] = None
46
47
  extra: Dict[str, Any] = {}
@@ -40,6 +40,7 @@ class PipelineRunConfiguration(
40
40
  enable_artifact_metadata: Optional[bool] = None
41
41
  enable_artifact_visualization: Optional[bool] = None
42
42
  enable_step_logs: Optional[bool] = None
43
+ enable_pipeline_logs: Optional[bool] = None
43
44
  schedule: Optional[Schedule] = None
44
45
  build: Union[PipelineBuildBase, UUID, None] = Field(
45
46
  default=None, union_mode="left_to_right"
@@ -34,6 +34,7 @@ from zenml.constants import (
34
34
  DEFAULT_ZENML_JWT_TOKEN_LEEWAY,
35
35
  DEFAULT_ZENML_SERVER_DEVICE_AUTH_POLLING,
36
36
  DEFAULT_ZENML_SERVER_DEVICE_AUTH_TIMEOUT,
37
+ DEFAULT_ZENML_SERVER_FILE_DOWNLOAD_SIZE_LIMIT,
37
38
  DEFAULT_ZENML_SERVER_GENERIC_API_TOKEN_LIFETIME,
38
39
  DEFAULT_ZENML_SERVER_GENERIC_API_TOKEN_MAX_LIFETIME,
39
40
  DEFAULT_ZENML_SERVER_LOGIN_RATE_LIMIT_DAY,
@@ -245,6 +246,8 @@ class ServerConfiguration(BaseModel):
245
246
  memcache_default_expiry: The default expiry time in seconds for cache
246
247
  entries. If not specified, the default value of 30 seconds will be
247
248
  used.
249
+ file_download_size_limit: The maximum size of the file download in
250
+ bytes. If not specified, the default value of 2GB will be used.
248
251
  """
249
252
 
250
253
  deployment_type: ServerDeploymentType = ServerDeploymentType.OTHER
@@ -346,6 +349,10 @@ class ServerConfiguration(BaseModel):
346
349
  memcache_max_capacity: int = 1000
347
350
  memcache_default_expiry: int = 30
348
351
 
352
+ file_download_size_limit: int = (
353
+ DEFAULT_ZENML_SERVER_FILE_DOWNLOAD_SIZE_LIMIT
354
+ )
355
+
349
356
  _deployment_id: Optional[UUID] = None
350
357
 
351
358
  @model_validator(mode="before")
zenml/constants.py CHANGED
@@ -168,6 +168,7 @@ ENV_ZENML_SKIP_STACK_VALIDATION = "ZENML_SKIP_STACK_VALIDATION"
168
168
  ENV_ZENML_SERVER = "ZENML_SERVER"
169
169
  ENV_ZENML_ENFORCE_TYPE_ANNOTATIONS = "ZENML_ENFORCE_TYPE_ANNOTATIONS"
170
170
  ENV_ZENML_ENABLE_IMPLICIT_AUTH_METHODS = "ZENML_ENABLE_IMPLICIT_AUTH_METHODS"
171
+ ENV_ZENML_DISABLE_PIPELINE_LOGS_STORAGE = "ZENML_DISABLE_PIPELINE_LOGS_STORAGE"
171
172
  ENV_ZENML_DISABLE_STEP_LOGS_STORAGE = "ZENML_DISABLE_STEP_LOGS_STORAGE"
172
173
  ENV_ZENML_DISABLE_STEP_NAMES_IN_LOGS = "ZENML_DISABLE_STEP_NAMES_IN_LOGS"
173
174
  ENV_ZENML_IGNORE_FAILURE_HOOK = "ZENML_IGNORE_FAILURE_HOOK"
@@ -192,12 +193,16 @@ ENV_ZENML_SERVER_PRO_PREFIX = "ZENML_SERVER_PRO_"
192
193
  ENV_ZENML_SERVER_DEPLOYMENT_TYPE = f"{ENV_ZENML_SERVER_PREFIX}DEPLOYMENT_TYPE"
193
194
  ENV_ZENML_SERVER_AUTH_SCHEME = f"{ENV_ZENML_SERVER_PREFIX}AUTH_SCHEME"
194
195
  ENV_ZENML_SERVER_AUTO_ACTIVATE = f"{ENV_ZENML_SERVER_PREFIX}AUTO_ACTIVATE"
196
+
195
197
  ENV_ZENML_RUN_SINGLE_STEPS_WITHOUT_STACK = (
196
198
  "ZENML_RUN_SINGLE_STEPS_WITHOUT_STACK"
197
199
  )
198
200
  ENV_ZENML_PREVENT_CLIENT_SIDE_CACHING = "ZENML_PREVENT_CLIENT_SIDE_CACHING"
199
201
  ENV_ZENML_DISABLE_CREDENTIALS_DISK_CACHING = "DISABLE_CREDENTIALS_DISK_CACHING"
200
202
  ENV_ZENML_RUNNER_IMAGE_DISABLE_UV = "ZENML_RUNNER_IMAGE_DISABLE_UV"
203
+ ENV_ZENML_WORKLOAD_TOKEN_EXPIRATION_LEEWAY = (
204
+ "ZENML_WORKLOAD_TOKEN_EXPIRATION_LEEWAY"
205
+ )
201
206
  # Logging variables
202
207
  IS_DEBUG_ENV: bool = handle_bool_env_var(ENV_ZENML_DEBUG, default=False)
203
208
 
@@ -284,6 +289,7 @@ DEFAULT_ZENML_SERVER_GENERIC_API_TOKEN_LIFETIME = 60 * 60 # 1 hour
284
289
  DEFAULT_ZENML_SERVER_GENERIC_API_TOKEN_MAX_LIFETIME = (
285
290
  60 * 60 * 24 * 7
286
291
  ) # 7 days
292
+ DEFAULT_ZENML_SERVER_FILE_DOWNLOAD_SIZE_LIMIT = 2 * 1024 * 1024 * 1024 # 2 GB
287
293
 
288
294
  DEFAULT_ZENML_SERVER_SECURE_HEADERS_HSTS = (
289
295
  "max-age=63072000; includeSubdomains"
@@ -350,10 +356,12 @@ CODE_REPOSITORIES = "/code_repositories"
350
356
  COMPONENT_TYPES = "/component-types"
351
357
  CONFIG = "/config"
352
358
  CURRENT_USER = "/current-user"
359
+ DATA = "/data"
353
360
  DEACTIVATE = "/deactivate"
354
361
  DEVICES = "/devices"
355
362
  DEVICE_AUTHORIZATION = "/device_authorization"
356
363
  DEVICE_VERIFY = "/verify"
364
+ DOWNLOAD_TOKEN = "/download-token"
357
365
  EMAIL_ANALYTICS = "/email-opt-in"
358
366
  EVENT_FLAVORS = "/event-flavors"
359
367
  EVENT_SOURCES = "/event-sources"
@@ -341,13 +341,55 @@ class VertexOrchestrator(ContainerizedOrchestrator, GoogleCredentialsMixin):
341
341
  self.config.workload_service_account
342
342
  )
343
343
 
344
+ # Create a dictionary of explicit parameters
345
+ params = custom_job_parameters.model_dump(
346
+ exclude_none=True, exclude={"additional_training_job_args"}
347
+ )
348
+
349
+ # Remove None values to let defaults be set by the function
350
+ params = {k: v for k, v in params.items() if v is not None}
351
+
352
+ # Add environment variables
353
+ params["env"] = [
354
+ {"name": key, "value": value} for key, value in environment.items()
355
+ ]
356
+
357
+ # Check if any advanced parameters will override explicit parameters
358
+ if custom_job_parameters.additional_training_job_args:
359
+ overridden_params = set(params.keys()) & set(
360
+ custom_job_parameters.additional_training_job_args.keys()
361
+ )
362
+ if overridden_params:
363
+ logger.warning(
364
+ f"The following explicit parameters are being overridden by values in "
365
+ f"additional_training_job_args: {', '.join(overridden_params)}. "
366
+ f"This may lead to unexpected behavior. Consider using either explicit "
367
+ f"parameters or additional_training_job_args, but not both for the same parameters."
368
+ )
369
+
370
+ # Add any advanced parameters - these will override explicit parameters if provided
371
+ params.update(custom_job_parameters.additional_training_job_args)
372
+
373
+ # Add other parameters from orchestrator config if not already in params
374
+ if self.config.network and "network" not in params:
375
+ params["network"] = self.config.network
376
+
377
+ if (
378
+ self.config.encryption_spec_key_name
379
+ and "encryption_spec_key_name" not in params
380
+ ):
381
+ params["encryption_spec_key_name"] = (
382
+ self.config.encryption_spec_key_name
383
+ )
384
+ if (
385
+ self.config.workload_service_account
386
+ and "service_account" not in params
387
+ ):
388
+ params["service_account"] = self.config.workload_service_account
389
+
344
390
  custom_job_component = create_custom_training_job_from_component(
345
391
  component_spec=component,
346
- env=[
347
- {"name": key, "value": value}
348
- for key, value in environment.items()
349
- ],
350
- **custom_job_parameters.model_dump(),
392
+ **params,
351
393
  )
352
394
 
353
395
  return custom_job_component
@@ -13,7 +13,7 @@
13
13
  # permissions and limitations under the License.
14
14
  """Vertex custom job parameter model."""
15
15
 
16
- from typing import Optional
16
+ from typing import Any, Dict, Optional
17
17
 
18
18
  from pydantic import BaseModel
19
19
 
@@ -37,8 +37,21 @@ class VertexCustomJobParameters(BaseModel):
37
37
  boot_disk_type: Type of the boot disk. (Default: pd-ssd)
38
38
  https://cloud.google.com/vertex-ai/docs/training/configure-compute#boot_disk_options
39
39
  persistent_resource_id: The ID of the persistent resource to use for the job.
40
+ If empty (default), the job will not use a persistent resource.
41
+ When using a persistent resource, you must also specify a service_account.
42
+ Conversely, when explicitly setting this to an empty string, you
43
+ should not specify a service_account (ZenML will handle this automatically).
40
44
  https://cloud.google.com/vertex-ai/docs/training/persistent-resource-overview
41
45
  service_account: Specifies the service account to be used.
46
+ This is required when using a persistent_resource_id, and
47
+ should not be set when persistent_resource_id="".
48
+ additional_training_job_args: Additional arguments to pass to the create_custom_training_job_from_component
49
+ function. This allows passing any additional parameters supported by the Google
50
+ Cloud Pipeline Components library without requiring ZenML to update its API.
51
+ Note: If you specify parameters in this dictionary that are also defined as explicit
52
+ attributes (like machine_type or boot_disk_size_gb), the values in this dictionary
53
+ will override the explicit values.
54
+ See: https://google-cloud-pipeline-components.readthedocs.io/en/google-cloud-pipeline-components-2.19.0/api/v1/custom_job.html
42
55
  """
43
56
 
44
57
  accelerator_type: Optional[str] = None
@@ -48,3 +61,4 @@ class VertexCustomJobParameters(BaseModel):
48
61
  boot_disk_type: str = "pd-ssd"
49
62
  persistent_resource_id: Optional[str] = None
50
63
  service_account: Optional[str] = None
64
+ additional_training_job_args: Dict[str, Any] = {}
@@ -35,11 +35,23 @@ class KubernetesStepOperatorSettings(BaseSettings):
35
35
  pod_settings: Pod settings to apply to pods executing the steps.
36
36
  service_account_name: Name of the service account to use for the pod.
37
37
  privileged: If the container should be run in privileged mode.
38
+ pod_startup_timeout: The maximum time to wait for a pending step pod to
39
+ start (in seconds).
40
+ pod_failure_max_retries: The maximum number of times to retry a step
41
+ pod if the Kubernetes pod fails to start.
42
+ pod_failure_retry_delay: The delay in seconds between pod
43
+ failure retries and pod startup retries.
44
+ pod_failure_backoff: The backoff factor for pod failure retries and
45
+ pod startup retries.
38
46
  """
39
47
 
40
48
  pod_settings: Optional[KubernetesPodSettings] = None
41
49
  service_account_name: Optional[str] = None
42
50
  privileged: bool = False
51
+ pod_startup_timeout: int = 60 * 10 # Default 10 minutes
52
+ pod_failure_max_retries: int = 3
53
+ pod_failure_retry_delay: int = 10
54
+ pod_failure_backoff: float = 1.0
43
55
 
44
56
 
45
57
  class KubernetesStepOperatorConfig(
@@ -462,3 +462,95 @@ def delete_secret(
462
462
  name=secret_name,
463
463
  namespace=namespace,
464
464
  )
465
+
466
+
467
+ def create_and_wait_for_pod_to_start(
468
+ core_api: k8s_client.CoreV1Api,
469
+ pod_display_name: str,
470
+ pod_name: str,
471
+ pod_manifest: k8s_client.V1Pod,
472
+ namespace: str,
473
+ startup_max_retries: int,
474
+ startup_failure_delay: float,
475
+ startup_failure_backoff: float,
476
+ startup_timeout: float,
477
+ ) -> None:
478
+ """Create a pod and wait for it to reach a desired state.
479
+
480
+ Args:
481
+ core_api: Client of Core V1 API of Kubernetes API.
482
+ pod_display_name: The display name of the pod to use in logs.
483
+ pod_name: The name of the pod to create.
484
+ pod_manifest: The manifest of the pod to create.
485
+ namespace: The namespace in which to create the pod.
486
+ startup_max_retries: The maximum number of retries for the pod startup.
487
+ startup_failure_delay: The delay between retries for the pod startup.
488
+ startup_failure_backoff: The backoff factor for the pod startup.
489
+ startup_timeout: The maximum time to wait for the pod to start.
490
+
491
+ Raises:
492
+ TimeoutError: If the pod is still in a pending state after the maximum
493
+ wait time has elapsed.
494
+ Exception: If the pod fails to start after the maximum number of
495
+ retries.
496
+ """
497
+ retries = 0
498
+
499
+ while retries < startup_max_retries:
500
+ try:
501
+ # Create and run pod.
502
+ core_api.create_namespaced_pod(
503
+ namespace=namespace,
504
+ body=pod_manifest,
505
+ )
506
+ break
507
+ except Exception as e:
508
+ retries += 1
509
+ if retries < startup_max_retries:
510
+ logger.debug(f"The {pod_display_name} failed to start: {e}")
511
+ logger.error(
512
+ f"Failed to create {pod_display_name}. "
513
+ f"Retrying in {startup_failure_delay} seconds..."
514
+ )
515
+ time.sleep(startup_failure_delay)
516
+ startup_failure_delay *= startup_failure_backoff
517
+ else:
518
+ logger.error(
519
+ f"Failed to create {pod_display_name} after "
520
+ f"{startup_max_retries} retries. Exiting."
521
+ )
522
+ raise
523
+
524
+ # Wait for pod to start
525
+ logger.info(f"Waiting for {pod_display_name} to start...")
526
+ max_wait = startup_timeout
527
+ total_wait: float = 0
528
+ delay = startup_failure_delay
529
+ while True:
530
+ pod = get_pod(
531
+ core_api=core_api,
532
+ pod_name=pod_name,
533
+ namespace=namespace,
534
+ )
535
+ if not pod or pod_is_not_pending(pod):
536
+ break
537
+ if total_wait >= max_wait:
538
+ # Have to delete the pending pod so it doesn't start running
539
+ # later on.
540
+ try:
541
+ core_api.delete_namespaced_pod(
542
+ name=pod_name,
543
+ namespace=namespace,
544
+ )
545
+ except Exception:
546
+ pass
547
+ raise TimeoutError(
548
+ f"The {pod_display_name} is still in a pending state "
549
+ f"after {total_wait} seconds. Exiting."
550
+ )
551
+
552
+ if total_wait + delay > max_wait:
553
+ delay = max_wait - total_wait
554
+ total_wait += delay
555
+ time.sleep(delay)
556
+ delay *= startup_failure_backoff
@@ -543,14 +543,23 @@ class KubernetesOrchestrator(ContainerizedOrchestrator):
543
543
  mount_local_stores=self.config.is_local,
544
544
  )
545
545
 
546
- self._k8s_core_api.create_namespaced_pod(
546
+ kube_utils.create_and_wait_for_pod_to_start(
547
+ core_api=self._k8s_core_api,
548
+ pod_display_name="Kubernetes orchestrator pod",
549
+ pod_name=pod_name,
550
+ pod_manifest=pod_manifest,
547
551
  namespace=self.config.kubernetes_namespace,
548
- body=pod_manifest,
552
+ startup_max_retries=settings.pod_failure_max_retries,
553
+ startup_failure_delay=settings.pod_failure_retry_delay,
554
+ startup_failure_backoff=settings.pod_failure_backoff,
555
+ startup_timeout=settings.pod_startup_timeout,
549
556
  )
550
557
 
551
558
  # Wait for the orchestrator pod to finish and stream logs.
552
559
  if settings.synchronous:
553
- logger.info("Waiting for Kubernetes orchestrator pod...")
560
+ logger.info(
561
+ "Waiting for Kubernetes orchestrator pod to finish..."
562
+ )
554
563
  kube_utils.wait_pod(
555
564
  kube_client_fn=self.get_kube_client,
556
565
  pod_name=pod_name,
@@ -15,7 +15,6 @@
15
15
 
16
16
  import argparse
17
17
  import socket
18
- import time
19
18
  from typing import Any, Dict
20
19
  from uuid import UUID
21
20
 
@@ -103,8 +102,6 @@ def main() -> None:
103
102
 
104
103
  Raises:
105
104
  Exception: If the pod fails to start.
106
- TimeoutError: If the pod is still in a pending state after the
107
- maximum wait time has elapsed.
108
105
  """
109
106
  # Define Kubernetes pod name.
110
107
  pod_name = f"{orchestrator_run_id}-{step_name}"
@@ -176,68 +173,17 @@ def main() -> None:
176
173
  mount_local_stores=mount_local_stores,
177
174
  )
178
175
 
179
- retries = 0
180
- max_retries = settings.pod_failure_max_retries
181
- delay: float = settings.pod_failure_retry_delay
182
- backoff = settings.pod_failure_backoff
183
-
184
- while retries < max_retries:
185
- try:
186
- # Create and run pod.
187
- core_api.create_namespaced_pod(
188
- namespace=args.kubernetes_namespace,
189
- body=pod_manifest,
190
- )
191
- break
192
- except Exception as e:
193
- retries += 1
194
- if retries < max_retries:
195
- logger.debug(
196
- f"Pod for step `{step_name}` failed to start: {e}"
197
- )
198
- logger.error(
199
- f"Failed to create pod for step `{step_name}`. "
200
- f"Retrying in {delay} seconds..."
201
- )
202
- time.sleep(delay)
203
- delay *= backoff
204
- else:
205
- logger.error(
206
- f"Failed to create pod for step `{step_name}` after "
207
- f"{max_retries} retries. Exiting."
208
- )
209
- raise
210
-
211
- # Wait for pod to start
212
- max_wait = settings.pod_startup_timeout
213
- total_wait: float = 0
214
- delay = settings.pod_failure_retry_delay
215
- while True:
216
- pod = kube_utils.get_pod(
217
- core_api, pod_name, args.kubernetes_namespace
218
- )
219
- if not pod or kube_utils.pod_is_not_pending(pod):
220
- break
221
- if total_wait >= max_wait:
222
- # Have to delete the pending pod so it doesn't start running
223
- # later on.
224
- try:
225
- core_api.delete_namespaced_pod(
226
- name=pod_name,
227
- namespace=args.kubernetes_namespace,
228
- )
229
- except Exception:
230
- pass
231
- raise TimeoutError(
232
- f"Pod for step `{step_name}` is still in a pending state "
233
- f"after {total_wait} seconds. Exiting."
234
- )
235
-
236
- if total_wait + delay > max_wait:
237
- delay = max_wait - total_wait
238
- total_wait += delay
239
- time.sleep(delay)
240
- delay *= backoff
176
+ kube_utils.create_and_wait_for_pod_to_start(
177
+ core_api=core_api,
178
+ pod_display_name=f"pod for step `{step_name}`",
179
+ pod_name=pod_name,
180
+ pod_manifest=pod_manifest,
181
+ namespace=args.kubernetes_namespace,
182
+ startup_max_retries=settings.pod_failure_max_retries,
183
+ startup_failure_delay=settings.pod_failure_retry_delay,
184
+ startup_failure_backoff=settings.pod_failure_backoff,
185
+ startup_timeout=settings.pod_startup_timeout,
186
+ )
241
187
 
242
188
  # Wait for pod to finish.
243
189
  logger.info(f"Waiting for pod of step `{step_name}` to finish...")
@@ -218,13 +218,21 @@ class KubernetesStepOperator(BaseStepOperator):
218
218
  mount_local_stores=False,
219
219
  )
220
220
 
221
- self._k8s_core_api.create_namespaced_pod(
221
+ kube_utils.create_and_wait_for_pod_to_start(
222
+ core_api=self._k8s_core_api,
223
+ pod_display_name=f"pod of step `{info.pipeline_step_name}`",
224
+ pod_name=pod_name,
225
+ pod_manifest=pod_manifest,
222
226
  namespace=self.config.kubernetes_namespace,
223
- body=pod_manifest,
227
+ startup_max_retries=settings.pod_failure_max_retries,
228
+ startup_failure_delay=settings.pod_failure_retry_delay,
229
+ startup_failure_backoff=settings.pod_failure_backoff,
230
+ startup_timeout=settings.pod_startup_timeout,
224
231
  )
225
232
 
226
233
  logger.info(
227
- "Waiting for pod of step `%s` to start...", info.pipeline_step_name
234
+ "Waiting for pod of step `%s` to finish...",
235
+ info.pipeline_step_name,
228
236
  )
229
237
  kube_utils.wait_pod(
230
238
  kube_client_fn=self.get_kube_client,