skypilot-nightly 1.0.0.dev2024053101__py3-none-any.whl → 1.0.0.dev2025022801__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (299)
  1. sky/__init__.py +64 -32
  2. sky/adaptors/aws.py +23 -6
  3. sky/adaptors/azure.py +432 -15
  4. sky/adaptors/cloudflare.py +5 -5
  5. sky/adaptors/common.py +19 -9
  6. sky/adaptors/do.py +20 -0
  7. sky/adaptors/gcp.py +3 -2
  8. sky/adaptors/kubernetes.py +122 -88
  9. sky/adaptors/nebius.py +100 -0
  10. sky/adaptors/oci.py +39 -1
  11. sky/adaptors/vast.py +29 -0
  12. sky/admin_policy.py +101 -0
  13. sky/authentication.py +117 -98
  14. sky/backends/backend.py +52 -20
  15. sky/backends/backend_utils.py +669 -557
  16. sky/backends/cloud_vm_ray_backend.py +1099 -808
  17. sky/backends/local_docker_backend.py +14 -8
  18. sky/backends/wheel_utils.py +38 -20
  19. sky/benchmark/benchmark_utils.py +22 -23
  20. sky/check.py +76 -27
  21. sky/cli.py +1586 -1139
  22. sky/client/__init__.py +1 -0
  23. sky/client/cli.py +5683 -0
  24. sky/client/common.py +345 -0
  25. sky/client/sdk.py +1765 -0
  26. sky/cloud_stores.py +283 -19
  27. sky/clouds/__init__.py +7 -2
  28. sky/clouds/aws.py +303 -112
  29. sky/clouds/azure.py +185 -179
  30. sky/clouds/cloud.py +115 -37
  31. sky/clouds/cudo.py +29 -22
  32. sky/clouds/do.py +313 -0
  33. sky/clouds/fluidstack.py +44 -54
  34. sky/clouds/gcp.py +206 -65
  35. sky/clouds/ibm.py +26 -21
  36. sky/clouds/kubernetes.py +345 -91
  37. sky/clouds/lambda_cloud.py +40 -29
  38. sky/clouds/nebius.py +297 -0
  39. sky/clouds/oci.py +129 -90
  40. sky/clouds/paperspace.py +22 -18
  41. sky/clouds/runpod.py +53 -34
  42. sky/clouds/scp.py +28 -24
  43. sky/clouds/service_catalog/__init__.py +19 -13
  44. sky/clouds/service_catalog/aws_catalog.py +29 -12
  45. sky/clouds/service_catalog/azure_catalog.py +33 -6
  46. sky/clouds/service_catalog/common.py +95 -75
  47. sky/clouds/service_catalog/constants.py +3 -3
  48. sky/clouds/service_catalog/cudo_catalog.py +13 -3
  49. sky/clouds/service_catalog/data_fetchers/fetch_aws.py +36 -21
  50. sky/clouds/service_catalog/data_fetchers/fetch_azure.py +31 -4
  51. sky/clouds/service_catalog/data_fetchers/fetch_cudo.py +8 -117
  52. sky/clouds/service_catalog/data_fetchers/fetch_fluidstack.py +197 -44
  53. sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +224 -36
  54. sky/clouds/service_catalog/data_fetchers/fetch_lambda_cloud.py +44 -24
  55. sky/clouds/service_catalog/data_fetchers/fetch_vast.py +147 -0
  56. sky/clouds/service_catalog/data_fetchers/fetch_vsphere.py +1 -1
  57. sky/clouds/service_catalog/do_catalog.py +111 -0
  58. sky/clouds/service_catalog/fluidstack_catalog.py +2 -2
  59. sky/clouds/service_catalog/gcp_catalog.py +16 -2
  60. sky/clouds/service_catalog/ibm_catalog.py +2 -2
  61. sky/clouds/service_catalog/kubernetes_catalog.py +192 -70
  62. sky/clouds/service_catalog/lambda_catalog.py +8 -3
  63. sky/clouds/service_catalog/nebius_catalog.py +116 -0
  64. sky/clouds/service_catalog/oci_catalog.py +31 -4
  65. sky/clouds/service_catalog/paperspace_catalog.py +2 -2
  66. sky/clouds/service_catalog/runpod_catalog.py +2 -2
  67. sky/clouds/service_catalog/scp_catalog.py +2 -2
  68. sky/clouds/service_catalog/vast_catalog.py +104 -0
  69. sky/clouds/service_catalog/vsphere_catalog.py +2 -2
  70. sky/clouds/utils/aws_utils.py +65 -0
  71. sky/clouds/utils/azure_utils.py +91 -0
  72. sky/clouds/utils/gcp_utils.py +5 -9
  73. sky/clouds/utils/oci_utils.py +47 -5
  74. sky/clouds/utils/scp_utils.py +4 -3
  75. sky/clouds/vast.py +280 -0
  76. sky/clouds/vsphere.py +22 -18
  77. sky/core.py +361 -107
  78. sky/dag.py +41 -28
  79. sky/data/data_transfer.py +37 -0
  80. sky/data/data_utils.py +211 -32
  81. sky/data/mounting_utils.py +182 -30
  82. sky/data/storage.py +2118 -270
  83. sky/data/storage_utils.py +126 -5
  84. sky/exceptions.py +179 -8
  85. sky/execution.py +158 -85
  86. sky/global_user_state.py +150 -34
  87. sky/jobs/__init__.py +12 -10
  88. sky/jobs/client/__init__.py +0 -0
  89. sky/jobs/client/sdk.py +302 -0
  90. sky/jobs/constants.py +49 -11
  91. sky/jobs/controller.py +161 -99
  92. sky/jobs/dashboard/dashboard.py +171 -25
  93. sky/jobs/dashboard/templates/index.html +572 -60
  94. sky/jobs/recovery_strategy.py +157 -156
  95. sky/jobs/scheduler.py +307 -0
  96. sky/jobs/server/__init__.py +1 -0
  97. sky/jobs/server/core.py +598 -0
  98. sky/jobs/server/dashboard_utils.py +69 -0
  99. sky/jobs/server/server.py +190 -0
  100. sky/jobs/state.py +627 -122
  101. sky/jobs/utils.py +615 -206
  102. sky/models.py +27 -0
  103. sky/optimizer.py +142 -83
  104. sky/provision/__init__.py +20 -5
  105. sky/provision/aws/config.py +124 -42
  106. sky/provision/aws/instance.py +130 -53
  107. sky/provision/azure/__init__.py +7 -0
  108. sky/{skylet/providers → provision}/azure/azure-config-template.json +19 -7
  109. sky/provision/azure/config.py +220 -0
  110. sky/provision/azure/instance.py +1012 -37
  111. sky/provision/common.py +31 -3
  112. sky/provision/constants.py +25 -0
  113. sky/provision/cudo/__init__.py +2 -1
  114. sky/provision/cudo/cudo_utils.py +112 -0
  115. sky/provision/cudo/cudo_wrapper.py +37 -16
  116. sky/provision/cudo/instance.py +28 -12
  117. sky/provision/do/__init__.py +11 -0
  118. sky/provision/do/config.py +14 -0
  119. sky/provision/do/constants.py +10 -0
  120. sky/provision/do/instance.py +287 -0
  121. sky/provision/do/utils.py +301 -0
  122. sky/provision/docker_utils.py +82 -46
  123. sky/provision/fluidstack/fluidstack_utils.py +57 -125
  124. sky/provision/fluidstack/instance.py +15 -43
  125. sky/provision/gcp/config.py +19 -9
  126. sky/provision/gcp/constants.py +7 -1
  127. sky/provision/gcp/instance.py +55 -34
  128. sky/provision/gcp/instance_utils.py +339 -80
  129. sky/provision/gcp/mig_utils.py +210 -0
  130. sky/provision/instance_setup.py +172 -133
  131. sky/provision/kubernetes/__init__.py +1 -0
  132. sky/provision/kubernetes/config.py +104 -90
  133. sky/provision/kubernetes/constants.py +8 -0
  134. sky/provision/kubernetes/instance.py +680 -325
  135. sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml +3 -0
  136. sky/provision/kubernetes/network.py +54 -20
  137. sky/provision/kubernetes/network_utils.py +70 -21
  138. sky/provision/kubernetes/utils.py +1370 -251
  139. sky/provision/lambda_cloud/__init__.py +11 -0
  140. sky/provision/lambda_cloud/config.py +10 -0
  141. sky/provision/lambda_cloud/instance.py +265 -0
  142. sky/{clouds/utils → provision/lambda_cloud}/lambda_utils.py +24 -23
  143. sky/provision/logging.py +1 -1
  144. sky/provision/nebius/__init__.py +11 -0
  145. sky/provision/nebius/config.py +11 -0
  146. sky/provision/nebius/instance.py +285 -0
  147. sky/provision/nebius/utils.py +318 -0
  148. sky/provision/oci/__init__.py +15 -0
  149. sky/provision/oci/config.py +51 -0
  150. sky/provision/oci/instance.py +436 -0
  151. sky/provision/oci/query_utils.py +681 -0
  152. sky/provision/paperspace/constants.py +6 -0
  153. sky/provision/paperspace/instance.py +4 -3
  154. sky/provision/paperspace/utils.py +2 -0
  155. sky/provision/provisioner.py +207 -130
  156. sky/provision/runpod/__init__.py +1 -0
  157. sky/provision/runpod/api/__init__.py +3 -0
  158. sky/provision/runpod/api/commands.py +119 -0
  159. sky/provision/runpod/api/pods.py +142 -0
  160. sky/provision/runpod/instance.py +64 -8
  161. sky/provision/runpod/utils.py +239 -23
  162. sky/provision/vast/__init__.py +10 -0
  163. sky/provision/vast/config.py +11 -0
  164. sky/provision/vast/instance.py +247 -0
  165. sky/provision/vast/utils.py +162 -0
  166. sky/provision/vsphere/common/vim_utils.py +1 -1
  167. sky/provision/vsphere/instance.py +8 -18
  168. sky/provision/vsphere/vsphere_utils.py +1 -1
  169. sky/resources.py +247 -102
  170. sky/serve/__init__.py +9 -9
  171. sky/serve/autoscalers.py +361 -299
  172. sky/serve/client/__init__.py +0 -0
  173. sky/serve/client/sdk.py +366 -0
  174. sky/serve/constants.py +12 -3
  175. sky/serve/controller.py +106 -36
  176. sky/serve/load_balancer.py +63 -12
  177. sky/serve/load_balancing_policies.py +84 -2
  178. sky/serve/replica_managers.py +42 -34
  179. sky/serve/serve_state.py +62 -32
  180. sky/serve/serve_utils.py +271 -160
  181. sky/serve/server/__init__.py +0 -0
  182. sky/serve/{core.py → server/core.py} +271 -90
  183. sky/serve/server/server.py +112 -0
  184. sky/serve/service.py +52 -16
  185. sky/serve/service_spec.py +95 -32
  186. sky/server/__init__.py +1 -0
  187. sky/server/common.py +430 -0
  188. sky/server/constants.py +21 -0
  189. sky/server/html/log.html +174 -0
  190. sky/server/requests/__init__.py +0 -0
  191. sky/server/requests/executor.py +472 -0
  192. sky/server/requests/payloads.py +487 -0
  193. sky/server/requests/queues/__init__.py +0 -0
  194. sky/server/requests/queues/mp_queue.py +76 -0
  195. sky/server/requests/requests.py +567 -0
  196. sky/server/requests/serializers/__init__.py +0 -0
  197. sky/server/requests/serializers/decoders.py +192 -0
  198. sky/server/requests/serializers/encoders.py +166 -0
  199. sky/server/server.py +1106 -0
  200. sky/server/stream_utils.py +141 -0
  201. sky/setup_files/MANIFEST.in +2 -5
  202. sky/setup_files/dependencies.py +159 -0
  203. sky/setup_files/setup.py +14 -125
  204. sky/sky_logging.py +59 -14
  205. sky/skylet/autostop_lib.py +2 -2
  206. sky/skylet/constants.py +183 -50
  207. sky/skylet/events.py +22 -10
  208. sky/skylet/job_lib.py +403 -258
  209. sky/skylet/log_lib.py +111 -71
  210. sky/skylet/log_lib.pyi +6 -0
  211. sky/skylet/providers/command_runner.py +6 -8
  212. sky/skylet/providers/ibm/node_provider.py +2 -2
  213. sky/skylet/providers/scp/config.py +11 -3
  214. sky/skylet/providers/scp/node_provider.py +8 -8
  215. sky/skylet/skylet.py +3 -1
  216. sky/skylet/subprocess_daemon.py +69 -17
  217. sky/skypilot_config.py +119 -57
  218. sky/task.py +205 -64
  219. sky/templates/aws-ray.yml.j2 +37 -7
  220. sky/templates/azure-ray.yml.j2 +27 -82
  221. sky/templates/cudo-ray.yml.j2 +7 -3
  222. sky/templates/do-ray.yml.j2 +98 -0
  223. sky/templates/fluidstack-ray.yml.j2 +7 -4
  224. sky/templates/gcp-ray.yml.j2 +26 -6
  225. sky/templates/ibm-ray.yml.j2 +3 -2
  226. sky/templates/jobs-controller.yaml.j2 +46 -11
  227. sky/templates/kubernetes-ingress.yml.j2 +7 -0
  228. sky/templates/kubernetes-loadbalancer.yml.j2 +7 -0
  229. sky/templates/{kubernetes-port-forward-proxy-command.sh.j2 → kubernetes-port-forward-proxy-command.sh} +51 -7
  230. sky/templates/kubernetes-ray.yml.j2 +292 -25
  231. sky/templates/lambda-ray.yml.j2 +30 -40
  232. sky/templates/nebius-ray.yml.j2 +79 -0
  233. sky/templates/oci-ray.yml.j2 +18 -57
  234. sky/templates/paperspace-ray.yml.j2 +10 -6
  235. sky/templates/runpod-ray.yml.j2 +26 -4
  236. sky/templates/scp-ray.yml.j2 +3 -2
  237. sky/templates/sky-serve-controller.yaml.j2 +12 -1
  238. sky/templates/skypilot-server-kubernetes-proxy.sh +36 -0
  239. sky/templates/vast-ray.yml.j2 +70 -0
  240. sky/templates/vsphere-ray.yml.j2 +8 -3
  241. sky/templates/websocket_proxy.py +64 -0
  242. sky/usage/constants.py +10 -1
  243. sky/usage/usage_lib.py +130 -37
  244. sky/utils/accelerator_registry.py +35 -51
  245. sky/utils/admin_policy_utils.py +147 -0
  246. sky/utils/annotations.py +51 -0
  247. sky/utils/cli_utils/status_utils.py +81 -23
  248. sky/utils/cluster_utils.py +356 -0
  249. sky/utils/command_runner.py +452 -89
  250. sky/utils/command_runner.pyi +77 -3
  251. sky/utils/common.py +54 -0
  252. sky/utils/common_utils.py +319 -108
  253. sky/utils/config_utils.py +204 -0
  254. sky/utils/control_master_utils.py +48 -0
  255. sky/utils/controller_utils.py +548 -266
  256. sky/utils/dag_utils.py +93 -32
  257. sky/utils/db_utils.py +18 -4
  258. sky/utils/env_options.py +29 -7
  259. sky/utils/kubernetes/create_cluster.sh +8 -60
  260. sky/utils/kubernetes/deploy_remote_cluster.sh +243 -0
  261. sky/utils/kubernetes/exec_kubeconfig_converter.py +73 -0
  262. sky/utils/kubernetes/generate_kubeconfig.sh +336 -0
  263. sky/utils/kubernetes/gpu_labeler.py +4 -4
  264. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +4 -3
  265. sky/utils/kubernetes/kubernetes_deploy_utils.py +228 -0
  266. sky/utils/kubernetes/rsync_helper.sh +24 -0
  267. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +1 -1
  268. sky/utils/log_utils.py +240 -33
  269. sky/utils/message_utils.py +81 -0
  270. sky/utils/registry.py +127 -0
  271. sky/utils/resources_utils.py +94 -22
  272. sky/utils/rich_utils.py +247 -18
  273. sky/utils/schemas.py +284 -64
  274. sky/{status_lib.py → utils/status_lib.py} +12 -7
  275. sky/utils/subprocess_utils.py +212 -46
  276. sky/utils/timeline.py +12 -7
  277. sky/utils/ux_utils.py +168 -15
  278. skypilot_nightly-1.0.0.dev2025022801.dist-info/METADATA +363 -0
  279. skypilot_nightly-1.0.0.dev2025022801.dist-info/RECORD +352 -0
  280. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/WHEEL +1 -1
  281. sky/clouds/cloud_registry.py +0 -31
  282. sky/jobs/core.py +0 -330
  283. sky/skylet/providers/azure/__init__.py +0 -2
  284. sky/skylet/providers/azure/azure-vm-template.json +0 -301
  285. sky/skylet/providers/azure/config.py +0 -170
  286. sky/skylet/providers/azure/node_provider.py +0 -466
  287. sky/skylet/providers/lambda_cloud/__init__.py +0 -2
  288. sky/skylet/providers/lambda_cloud/node_provider.py +0 -320
  289. sky/skylet/providers/oci/__init__.py +0 -2
  290. sky/skylet/providers/oci/node_provider.py +0 -488
  291. sky/skylet/providers/oci/query_helper.py +0 -383
  292. sky/skylet/providers/oci/utils.py +0 -21
  293. sky/utils/cluster_yaml_utils.py +0 -24
  294. sky/utils/kubernetes/generate_static_kubeconfig.sh +0 -137
  295. skypilot_nightly-1.0.0.dev2024053101.dist-info/METADATA +0 -315
  296. skypilot_nightly-1.0.0.dev2024053101.dist-info/RECORD +0 -275
  297. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/LICENSE +0 -0
  298. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/entry_points.txt +0 -0
  299. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/top_level.txt +0 -0
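The section below is the rendered source diff for sky/provision/kubernetes/instance.py (entry 134 above, +680 -325), which reworks the Kubernetes pod provisioning path: pod filtering moves to kubernetes_utils.filter_pods, SSH and environment-variable setup is consolidated into a parallel pre_init step, and the Kubernetes API calls become context-aware.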
@@ -1,78 +1,43 @@
1
1
  """Kubernetes instance provisioning."""
2
2
  import copy
3
+ import json
3
4
  import time
4
- from typing import Any, Dict, List, Optional
5
+ from typing import Any, Callable, Dict, List, Optional, Union
5
6
  import uuid
6
7
 
7
8
  from sky import exceptions
8
9
  from sky import sky_logging
9
10
  from sky import skypilot_config
10
- from sky import status_lib
11
11
  from sky.adaptors import kubernetes
12
12
  from sky.provision import common
13
+ from sky.provision import constants
13
14
  from sky.provision import docker_utils
14
15
  from sky.provision.kubernetes import config as config_lib
16
+ from sky.provision.kubernetes import network_utils
15
17
  from sky.provision.kubernetes import utils as kubernetes_utils
18
+ from sky.utils import command_runner
16
19
  from sky.utils import common_utils
17
20
  from sky.utils import kubernetes_enums
21
+ from sky.utils import status_lib
22
+ from sky.utils import subprocess_utils
23
+ from sky.utils import timeline
18
24
  from sky.utils import ux_utils
19
25
 
20
26
  POLL_INTERVAL = 2
21
27
  _TIMEOUT_FOR_POD_TERMINATION = 60 # 1 minutes
28
+ _MAX_RETRIES = 3
29
+ _NUM_THREADS = subprocess_utils.get_parallel_threads('kubernetes')
22
30
 
23
31
  logger = sky_logging.init_logger(__name__)
24
32
  TAG_RAY_CLUSTER_NAME = 'ray-cluster-name'
25
33
  TAG_SKYPILOT_CLUSTER_NAME = 'skypilot-cluster-name'
26
- TAG_RAY_NODE_KIND = 'ray-node-type' # legacy tag for backward compatibility
27
34
  TAG_POD_INITIALIZED = 'skypilot-initialized'
28
35
 
29
- POD_STATUSES = {
30
- 'Pending', 'Running', 'Succeeded', 'Failed', 'Unknown', 'Terminating'
31
- }
32
-
33
-
34
- def to_label_selector(tags):
35
- label_selector = ''
36
- for k, v in tags.items():
37
- if label_selector != '':
38
- label_selector += ','
39
- label_selector += '{}={}'.format(k, v)
40
- return label_selector
41
-
42
-
43
- def _get_namespace(provider_config: Dict[str, Any]) -> str:
44
- return provider_config.get(
45
- 'namespace',
46
- kubernetes_utils.get_current_kube_config_context_namespace())
47
-
48
-
49
- def _filter_pods(namespace: str, tag_filters: Dict[str, str],
50
- status_filters: Optional[List[str]]) -> Dict[str, Any]:
51
- """Filters pods by tags and status."""
52
- non_included_pod_statuses = POD_STATUSES.copy()
53
-
54
- field_selector = ''
55
- if status_filters is not None:
56
- non_included_pod_statuses -= set(status_filters)
57
- field_selector = ','.join(
58
- [f'status.phase!={status}' for status in non_included_pod_statuses])
59
-
60
- label_selector = to_label_selector(tag_filters)
61
- pod_list = kubernetes.core_api().list_namespaced_pod(
62
- namespace, field_selector=field_selector, label_selector=label_selector)
63
-
64
- # Don't return pods marked for deletion,
65
- # i.e. pods with non-null metadata.DeletionTimestamp.
66
- pods = [
67
- pod for pod in pod_list.items if pod.metadata.deletion_timestamp is None
68
- ]
69
- return {pod.metadata.name: pod for pod in pods}
70
-
71
36
 
72
37
  def _get_head_pod_name(pods: Dict[str, Any]) -> Optional[str]:
73
38
  head_pod_name = None
74
39
  for pod_name, pod in pods.items():
75
- if pod.metadata.labels[TAG_RAY_NODE_KIND] == 'head':
40
+ if pod.metadata.labels[constants.TAG_RAY_NODE_KIND] == 'head':
76
41
  head_pod_name = pod_name
77
42
  break
78
43
  return head_pod_name
@@ -83,16 +48,85 @@ def head_service_selector(cluster_name: str) -> Dict[str, str]:
83
48
  return {'component': f'{cluster_name}-head'}
84
49
 
85
50
 
86
- def _raise_pod_scheduling_errors(namespace, new_nodes):
51
+ def _formatted_resource_requirements(pod_or_spec: Union[Any, dict]) -> str:
52
+ # Returns a formatted string of resource requirements for a pod.
53
+ resource_requirements = {}
54
+
55
+ if isinstance(pod_or_spec, dict):
56
+ containers = pod_or_spec.get('spec', {}).get('containers', [])
57
+ else:
58
+ containers = pod_or_spec.spec.containers
59
+
60
+ for container in containers:
61
+ if isinstance(container, dict):
62
+ resources = container.get('resources', {})
63
+ requests = resources.get('requests', {})
64
+ else:
65
+ resources = container.resources
66
+ requests = resources.requests or {}
67
+
68
+ for resource, value in requests.items():
69
+ if resource not in resource_requirements:
70
+ resource_requirements[resource] = 0
71
+ if resource == 'memory':
72
+ int_value = kubernetes_utils.parse_memory_resource(value)
73
+ else:
74
+ int_value = kubernetes_utils.parse_cpu_or_gpu_resource(value)
75
+ resource_requirements[resource] += int(int_value)
76
+ return ', '.join(f'{resource}={value}'
77
+ for resource, value in resource_requirements.items())
78
+
79
+
80
+ def _formatted_node_selector(pod_or_spec: Union[Any, dict]) -> Optional[str]:
81
+ # Returns a formatted string of node selectors for a pod.
82
+ node_selectors = []
83
+
84
+ if isinstance(pod_or_spec, dict):
85
+ selectors = pod_or_spec.get('spec', {}).get('nodeSelector', {})
86
+ else:
87
+ selectors = pod_or_spec.spec.node_selector
88
+
89
+ if not selectors:
90
+ return None
91
+
92
+ for label_key, label_value in selectors.items():
93
+ node_selectors.append(f'{label_key}={label_value}')
94
+ return ', '.join(node_selectors)
95
+
96
+
97
+ def _lack_resource_msg(resource: str,
98
+ pod_or_spec: Union[Any, dict],
99
+ extra_msg: Optional[str] = None,
100
+ details: Optional[str] = None) -> str:
101
+ resource_requirements = _formatted_resource_requirements(pod_or_spec)
102
+ node_selectors = _formatted_node_selector(pod_or_spec)
103
+ node_selector_str = f' and labels ({node_selectors})' if (
104
+ node_selectors) else ''
105
+ msg = (f'Insufficient {resource} capacity on the cluster. '
106
+ f'Required resources ({resource_requirements}){node_selector_str} '
107
+ 'were not found in a single node. Other SkyPilot tasks or pods may '
108
+ 'be using resources. Check resource usage by running '
109
+ '`kubectl describe nodes`.')
110
+ if extra_msg:
111
+ msg += f' {extra_msg}'
112
+ if details:
113
+ msg += f'\nFull error: {details}'
114
+ return msg
115
+
116
+
117
+ def _raise_pod_scheduling_errors(namespace, context, new_nodes):
87
118
  """Raise pod scheduling failure reason.
88
119
 
89
120
  When a pod fails to schedule in Kubernetes, the reasons for the failure
90
121
  are recorded as events. This function retrieves those events and raises
91
122
  descriptive errors for better debugging and user feedback.
92
123
  """
124
+ timeout_err_msg = ('Timed out while waiting for nodes to start. '
125
+ 'Cluster may be out of resources or '
126
+ 'may be too slow to autoscale.')
93
127
  for new_node in new_nodes:
94
- pod = kubernetes.core_api().read_namespaced_pod(new_node.metadata.name,
95
- namespace)
128
+ pod = kubernetes.core_api(context).read_namespaced_pod(
129
+ new_node.metadata.name, namespace)
96
130
  pod_status = pod.status.phase
97
131
  # When there are multiple pods involved while launching instance,
98
132
  # there may be a single pod causing issue while others are
@@ -101,7 +135,7 @@ def _raise_pod_scheduling_errors(namespace, new_nodes):
101
135
  if pod_status != 'Pending':
102
136
  continue
103
137
  pod_name = pod._metadata._name # pylint: disable=protected-access
104
- events = kubernetes.core_api().list_namespaced_event(
138
+ events = kubernetes.core_api(context).list_namespaced_event(
105
139
  namespace,
106
140
  field_selector=(f'involvedObject.name={pod_name},'
107
141
  'involvedObject.kind=Pod'))
@@ -118,24 +152,25 @@ def _raise_pod_scheduling_errors(namespace, new_nodes):
118
152
  if event.reason == 'FailedScheduling':
119
153
  event_message = event.message
120
154
  break
121
- timeout_err_msg = ('Timed out while waiting for nodes to start. '
122
- 'Cluster may be out of resources or '
123
- 'may be too slow to autoscale.')
124
- lack_resource_msg = (
125
- 'Insufficient {resource} capacity on the cluster. '
126
- 'Other SkyPilot tasks or pods may be using resources. '
127
- 'Check resource usage by running `kubectl describe nodes`.')
128
155
  if event_message is not None:
129
156
  if pod_status == 'Pending':
157
+ logger.info(event_message)
130
158
  if 'Insufficient cpu' in event_message:
131
159
  raise config_lib.KubernetesError(
132
- lack_resource_msg.format(resource='CPU'))
160
+ _lack_resource_msg('CPU', pod, details=event_message))
133
161
  if 'Insufficient memory' in event_message:
134
162
  raise config_lib.KubernetesError(
135
- lack_resource_msg.format(resource='memory'))
163
+ _lack_resource_msg('memory', pod,
164
+ details=event_message))
165
+ if 'Insufficient smarter-devices/fuse' in event_message:
166
+ raise config_lib.KubernetesError(
167
+ 'Something went wrong with FUSE device daemonset.'
168
+ ' Try restarting your FUSE pods by running '
169
+ '`kubectl delete pods -n skypilot-system -l name=smarter-device-manager`.' # pylint: disable=line-too-long
170
+ f' Full error: {event_message}')
136
171
  gpu_lf_keys = [
137
- lf.get_label_key()
138
- for lf in kubernetes_utils.LABEL_FORMATTER_REGISTRY
172
+ key for lf in kubernetes_utils.LABEL_FORMATTER_REGISTRY
173
+ for key in lf.get_label_keys()
139
174
  ]
140
175
  if pod.spec.node_selector:
141
176
  for label_key in pod.spec.node_selector.keys():
@@ -143,22 +178,52 @@ def _raise_pod_scheduling_errors(namespace, new_nodes):
143
178
  # TODO(romilb): We may have additional node
144
179
  # affinity selectors in the future - in that
145
180
  # case we will need to update this logic.
146
- if (('Insufficient nvidia.com/gpu'
147
- in event_message) or
148
- ('didn\'t match Pod\'s node affinity/selector'
149
- in event_message)):
150
- msg = lack_resource_msg.format(resource='GPU')
151
- raise config_lib.KubernetesError(
152
- f'{msg} Verify if '
181
+ # TODO(Doyoung): Update the error message raised
182
+ # with the multi-host TPU support.
183
+ gpu_resource_key = kubernetes_utils.get_gpu_resource_key() # pylint: disable=line-too-long
184
+ if 'Insufficient google.com/tpu' in event_message:
185
+ extra_msg = (
186
+ f'Verify if '
153
187
  f'{pod.spec.node_selector[label_key]}'
154
- ' is available in the cluster.')
188
+ ' is available in the cluster. Note '
189
+ 'that multi-host TPU podslices are '
190
+ 'currently not unsupported.')
191
+ raise config_lib.KubernetesError(
192
+ _lack_resource_msg('TPU',
193
+ pod,
194
+ extra_msg,
195
+ details=event_message))
196
+ elif ((f'Insufficient {gpu_resource_key}'
197
+ in event_message) or
198
+ ('didn\'t match Pod\'s node affinity/selector'
199
+ in event_message)):
200
+ extra_msg = (
201
+ f'Verify if any node matching label '
202
+ f'{pod.spec.node_selector[label_key]} and '
203
+ f'sufficient resource {gpu_resource_key} '
204
+ f'is available in the cluster.')
205
+ raise config_lib.KubernetesError(
206
+ _lack_resource_msg('GPU',
207
+ pod,
208
+ extra_msg,
209
+ details=event_message))
155
210
  raise config_lib.KubernetesError(f'{timeout_err_msg} '
156
211
  f'Pod status: {pod_status}'
157
212
  f'Details: \'{event_message}\' ')
158
213
  raise config_lib.KubernetesError(f'{timeout_err_msg}')
159
214
 
160
215
 
161
- def _wait_for_pods_to_schedule(namespace, new_nodes, timeout: int):
216
+ def _raise_command_running_error(message: str, command: str, pod_name: str,
217
+ rc: int, stdout: str) -> None:
218
+ if rc == 0:
219
+ return
220
+ raise config_lib.KubernetesError(
221
+ f'Failed to {message} for pod {pod_name} with return '
222
+ f'code {rc}: {command!r}\nOutput: {stdout}.')
223
+
224
+
225
+ @timeline.event
226
+ def _wait_for_pods_to_schedule(namespace, context, new_nodes, timeout: int):
162
227
  """Wait for all pods to be scheduled.
163
228
 
164
229
  Wait for all pods including jump pod to be scheduled, and if it
@@ -168,6 +233,10 @@ def _wait_for_pods_to_schedule(namespace, new_nodes, timeout: int):
168
233
 
169
234
  If timeout is set to a negative value, this method will wait indefinitely.
170
235
  """
236
+ # Create a set of pod names we're waiting for
237
+ if not new_nodes:
238
+ return
239
+ expected_pod_names = {node.metadata.name for node in new_nodes}
171
240
  start_time = time.time()
172
241
 
173
242
  def _evaluate_timeout() -> bool:
@@ -177,25 +246,40 @@ def _wait_for_pods_to_schedule(namespace, new_nodes, timeout: int):
177
246
  return time.time() - start_time < timeout
178
247
 
179
248
  while _evaluate_timeout():
180
- all_pods_scheduled = True
181
- for node in new_nodes:
182
- # Iterate over each pod to check their status
183
- pod = kubernetes.core_api().read_namespaced_pod(
184
- node.metadata.name, namespace)
185
- if pod.status.phase == 'Pending':
249
+ # Get all pods in a single API call using the cluster name label
250
+ # which all pods in new_nodes should share
251
+ cluster_name = new_nodes[0].metadata.labels[TAG_SKYPILOT_CLUSTER_NAME]
252
+ pods = kubernetes.core_api(context).list_namespaced_pod(
253
+ namespace,
254
+ label_selector=f'{TAG_SKYPILOT_CLUSTER_NAME}={cluster_name}').items
255
+
256
+ # Get the set of found pod names and check if we have all expected pods
257
+ found_pod_names = {pod.metadata.name for pod in pods}
258
+ missing_pods = expected_pod_names - found_pod_names
259
+ if missing_pods:
260
+ logger.info('Retrying waiting for pods: '
261
+ f'Missing pods: {missing_pods}')
262
+ time.sleep(0.5)
263
+ continue
264
+
265
+ # Check if all pods are scheduled
266
+ all_scheduled = True
267
+ for pod in pods:
268
+ if (pod.metadata.name in expected_pod_names and
269
+ pod.status.phase == 'Pending'):
186
270
  # If container_statuses is None, then the pod hasn't
187
271
  # been scheduled yet.
188
272
  if pod.status.container_statuses is None:
189
- all_pods_scheduled = False
273
+ all_scheduled = False
190
274
  break
191
275
 
192
- if all_pods_scheduled:
276
+ if all_scheduled:
193
277
  return
194
278
  time.sleep(1)
195
279
 
196
280
  # Handle pod scheduling errors
197
281
  try:
198
- _raise_pod_scheduling_errors(namespace, new_nodes)
282
+ _raise_pod_scheduling_errors(namespace, context, new_nodes)
199
283
  except config_lib.KubernetesError:
200
284
  raise
201
285
  except Exception as e:
@@ -205,19 +289,64 @@ def _wait_for_pods_to_schedule(namespace, new_nodes, timeout: int):
205
289
  f'Error: {common_utils.format_exception(e)}') from None
206
290
 
207
291
 
208
- def _wait_for_pods_to_run(namespace, new_nodes):
292
+ @timeline.event
293
+ def _wait_for_pods_to_run(namespace, context, new_nodes):
209
294
  """Wait for pods and their containers to be ready.
210
295
 
211
296
  Pods may be pulling images or may be in the process of container
212
297
  creation.
213
298
  """
299
+ if not new_nodes:
300
+ return
301
+
302
+ # Create a set of pod names we're waiting for
303
+ expected_pod_names = {node.metadata.name for node in new_nodes}
304
+
305
+ def _check_init_containers(pod):
306
+ # Check if any of the init containers failed
307
+ # to start. Could be because the init container
308
+ # command failed or failed to pull image etc.
309
+ for init_status in pod.status.init_container_statuses:
310
+ init_terminated = init_status.state.terminated
311
+ if init_terminated:
312
+ if init_terminated.exit_code != 0:
313
+ msg = init_terminated.message if (
314
+ init_terminated.message) else str(init_terminated)
315
+ raise config_lib.KubernetesError(
316
+ 'Failed to run init container for pod '
317
+ f'{pod.metadata.name}. Error details: {msg}.')
318
+ continue
319
+ init_waiting = init_status.state.waiting
320
+ if (init_waiting is not None and init_waiting.reason
321
+ not in ['ContainerCreating', 'PodInitializing']):
322
+ # TODO(romilb): There may be more states to check for. Add
323
+ # them as needed.
324
+ msg = init_waiting.message if (
325
+ init_waiting.message) else str(init_waiting)
326
+ raise config_lib.KubernetesError(
327
+ 'Failed to create init container for pod '
328
+ f'{pod.metadata.name}. Error details: {msg}.')
329
+
214
330
  while True:
215
- all_pods_running = True
216
- # Iterate over each pod to check their status
217
- for node in new_nodes:
218
- pod = kubernetes.core_api().read_namespaced_pod(
219
- node.metadata.name, namespace)
331
+ # Get all pods in a single API call
332
+ cluster_name = new_nodes[0].metadata.labels[TAG_SKYPILOT_CLUSTER_NAME]
333
+ all_pods = kubernetes.core_api(context).list_namespaced_pod(
334
+ namespace,
335
+ label_selector=f'{TAG_SKYPILOT_CLUSTER_NAME}={cluster_name}').items
336
+
337
+ # Get the set of found pod names and check if we have all expected pods
338
+ found_pod_names = {pod.metadata.name for pod in all_pods}
339
+ missing_pods = expected_pod_names - found_pod_names
340
+ if missing_pods:
341
+ logger.info('Retrying running pods check: '
342
+ f'Missing pods: {missing_pods}')
343
+ time.sleep(0.5)
344
+ continue
220
345
 
346
+ all_pods_running = True
347
+ for pod in all_pods:
348
+ if pod.metadata.name not in expected_pod_names:
349
+ continue
221
350
  # Continue if pod and all the containers within the
222
351
  # pod are successfully created and running.
223
352
  if pod.status.phase == 'Running' and all(
@@ -235,12 +364,15 @@ def _wait_for_pods_to_run(namespace, new_nodes):
235
364
  # See list of possible reasons for waiting here:
236
365
  # https://stackoverflow.com/a/57886025
237
366
  waiting = container_status.state.waiting
238
- if (waiting is not None and
239
- waiting.reason != 'ContainerCreating'):
240
- raise config_lib.KubernetesError(
241
- 'Failed to create container while launching '
242
- 'the node. Error details: '
243
- f'{container_status.state.waiting.message}.')
367
+ if waiting is not None:
368
+ if waiting.reason == 'PodInitializing':
369
+ _check_init_containers(pod)
370
+ elif waiting.reason != 'ContainerCreating':
371
+ msg = waiting.message if waiting.message else str(
372
+ waiting)
373
+ raise config_lib.KubernetesError(
374
+ 'Failed to create container while launching '
375
+ f'the node. Error details: {msg}.')
244
376
  # Reaching this point means that one of the pods had an issue,
245
377
  # so break out of the loop, and wait until next second.
246
378
  break
@@ -250,145 +382,188 @@ def _wait_for_pods_to_run(namespace, new_nodes):
250
382
  time.sleep(1)
251
383
 
252
384
 
253
- def _run_command_on_pods(node_name: str,
254
- node_namespace: str,
255
- command: List[str],
256
- stream_logs: bool = False):
257
- """Run command on Kubernetes pods.
385
+ def _run_function_with_retries(func: Callable,
386
+ operation_name: str,
387
+ max_retries: int = _MAX_RETRIES,
388
+ retry_delay: int = 5) -> Any:
389
+ """Runs a function with retries on Kubernetes errors.
390
+
391
+ Args:
392
+ func: Function to retry
393
+ operation_name: Name of the operation for logging
394
+ max_retries: Maximum number of retry attempts
395
+ retry_delay: Delay between retries in seconds
258
396
 
259
- If `stream_logs` is True, we poll for output and error messages while the
260
- command is executing, and the stdout and stderr is written to logger.info.
261
- When called from the provisioner, this logger.info is written to the
262
- provision.log file (see setup_provision_logging()).
397
+ Raises:
398
+ The last exception encountered if all retries fail.
263
399
  """
264
- cmd_output = kubernetes.stream()(
265
- kubernetes.core_api().connect_get_namespaced_pod_exec,
266
- node_name,
267
- node_namespace,
268
- command=command,
269
- stderr=True,
270
- stdin=False,
271
- stdout=True,
272
- tty=False,
273
- _preload_content=(not stream_logs),
274
- _request_timeout=kubernetes.API_TIMEOUT)
275
- if stream_logs:
276
- while cmd_output.is_open():
277
- cmd_output.update(timeout=1)
278
- if cmd_output.peek_stdout():
279
- logger.info(f'{cmd_output.read_stdout().strip()}')
280
- if cmd_output.peek_stderr():
281
- logger.info(f'{cmd_output.read_stderr().strip()}')
282
- cmd_output.close()
283
- return cmd_output
284
-
285
-
286
- def _set_env_vars_in_pods(namespace: str, new_pods: List):
287
- """Setting environment variables in pods.
288
-
289
- Once all containers are ready, we can exec into them and set env vars.
290
- Kubernetes automatically populates containers with critical
291
- environment variables, such as those for discovering services running
292
- in the cluster and CUDA/nvidia environment variables. We need to
293
- make sure these env vars are available in every task and ssh session.
294
- This is needed for GPU support and service discovery.
295
- See https://github.com/skypilot-org/skypilot/issues/2287 for
296
- more details.
297
-
298
- To do so, we capture env vars from the pod's runtime and write them to
299
- /etc/profile.d/, making them available for all users in future
300
- shell sessions.
400
+ for attempt in range(max_retries + 1):
401
+ try:
402
+ return func()
403
+ except config_lib.KubernetesError:
404
+ if attempt < max_retries:
405
+ logger.warning(f'Failed to {operation_name} - '
406
+ f'retrying in {retry_delay} seconds.')
407
+ time.sleep(retry_delay)
408
+ else:
409
+ raise
410
+
411
+
412
+ @timeline.event
413
+ def pre_init(namespace: str, context: Optional[str], new_nodes: List) -> None:
414
+ """Pre-initialization step for SkyPilot pods.
415
+
416
+ This step is run in the pod right after it is created and before the
417
+ SkyPilot runtime is setup.
418
+
419
+ This step includes three key steps:
420
+
421
+ 1. Privilege check: Checks if the default user has sufficient privilege
422
+ to set up the kubernetes instance pod.
423
+ 2. SSH setup: Sets up SSH for the pod instance.
424
+ 3. Environment variable setup to populate k8s env vars in the pod.
425
+
426
+ Make sure commands used in these methods are generic and work
427
+ on most base images. E.g., do not use Python, since that may not
428
+ be installed by default.
429
+
430
+ If you run any apt commands, be sure to check if the lock is available.
431
+ It is possible the `apt update` run in the pod container args may still
432
+ be running.
433
+
434
+ Args:
435
+ namespace (str): Kubernetes namespace.
436
+ context (Optional[str]): Kubernetes context.
437
+ new_nodes (List): List of new pod instances.
438
+
439
+ Raises:
440
+ config_lib.KubernetesError: If user privileges are insufficient or
441
+ setup fails.
301
442
  """
302
- set_k8s_env_var_cmd = [
303
- '/bin/sh',
304
- '-c',
305
- docker_utils.SETUP_ENV_VARS_CMD,
306
- ]
307
-
308
- for new_pod in new_pods:
309
- _run_command_on_pods(new_pod.metadata.name, namespace,
310
- set_k8s_env_var_cmd)
311
-
312
-
313
- def _check_user_privilege(namespace: str, new_nodes: List) -> None:
314
- # Checks if the default user has sufficient privilege to set up
315
- # the kubernetes instance pod.
316
- check_k8s_user_sudo_cmd = [
317
- '/bin/sh',
318
- '-c',
319
- (
320
- 'if [ $(id -u) -eq 0 ]; then'
321
- # If user is root, create an alias for sudo used in skypilot setup
322
- ' echo \'alias sudo=""\' >> ~/.bashrc; '
323
- 'else '
324
- ' if command -v sudo >/dev/null 2>&1; then '
325
- ' timeout 2 sudo -l >/dev/null 2>&1 || '
326
- f' ( echo {exceptions.INSUFFICIENT_PRIVILEGES_CODE!r}; ); '
327
- ' else '
328
- f' ( echo {exceptions.INSUFFICIENT_PRIVILEGES_CODE!r}; ); '
329
- ' fi; '
330
- 'fi')
331
- ]
332
443
 
333
- for new_node in new_nodes:
334
- privilege_check = _run_command_on_pods(new_node.metadata.name,
335
- namespace,
336
- check_k8s_user_sudo_cmd)
337
- if privilege_check == str(exceptions.INSUFFICIENT_PRIVILEGES_CODE):
444
+ check_k8s_user_sudo_cmd = (
445
+ 'if [ $(id -u) -eq 0 ]; then'
446
+ # If user is root, create an alias for sudo used in skypilot setup
447
+ ' echo \'alias sudo=""\' >> ~/.bashrc; echo succeed;'
448
+ 'else '
449
+ ' if command -v sudo >/dev/null 2>&1; then '
450
+ ' timeout 2 sudo -l >/dev/null 2>&1 && echo succeed || '
451
+ f' ( echo {exceptions.INSUFFICIENT_PRIVILEGES_CODE!r}; '
452
+ f' exit {exceptions.INSUFFICIENT_PRIVILEGES_CODE}; ); '
453
+ ' else '
454
+ f' ( echo {exceptions.INSUFFICIENT_PRIVILEGES_CODE!r}; '
455
+ f' exit {exceptions.INSUFFICIENT_PRIVILEGES_CODE}; ); '
456
+ ' fi; '
457
+ 'fi;')
458
+
459
+ # Kubernetes automatically populates containers with critical
460
+ # environment variables, such as those for discovering services running
461
+ # in the cluster and CUDA/nvidia environment variables. We need to
462
+ # make sure these env vars are available in every task and ssh session.
463
+ # This is needed for GPU support and service discovery.
464
+ # See https://github.com/skypilot-org/skypilot/issues/2287 for more details.
465
+ # To do so, we capture env vars from the pod's runtime and write them to
466
+ # /etc/profile.d/, making them available for all users in future
467
+ # shell sessions.
468
+ set_k8s_env_var_cmd = docker_utils.SETUP_ENV_VARS_CMD
469
+
470
+ check_apt_update_complete_cmd = (
471
+ 'echo "Checking if apt update from container init is complete..."; '
472
+ 'timeout_secs=600; '
473
+ 'start_time=$(date +%s); '
474
+ 'while ! grep -q "Fetched" /tmp/apt-update.log 2>/dev/null; do '
475
+ ' echo "apt update still running. Logs:"; '
476
+ ' cat /tmp/apt-update.log || true; '
477
+ ' current_time=$(date +%s); '
478
+ ' elapsed=$((current_time - start_time)); '
479
+ ' if [ $elapsed -ge $timeout_secs ]; then '
480
+ ' echo "Timed out waiting for apt update"; '
481
+ ' exit 1; '
482
+ ' fi; '
483
+ ' sleep 5; '
484
+ 'done; '
485
+ 'echo "apt update complete."; ')
486
+
487
+ install_ssh_k8s_cmd = (
488
+ 'prefix_cmd() '
489
+ '{ if [ $(id -u) -ne 0 ]; then echo "sudo"; else echo ""; fi; }; '
490
+ 'export DEBIAN_FRONTEND=noninteractive;'
491
+ 'echo "Installing missing packages..."; '
492
+ 'for i in {1..5}; do '
493
+ ' output=$($(prefix_cmd) apt install openssh-server rsync -y 2>&1); '
494
+ ' rc=$?; '
495
+ ' if [ $rc -eq 0 ]; then '
496
+ ' break; '
497
+ ' fi; '
498
+ ' echo "$output" | grep -qi "could not get lock" || '
499
+ ' grep -qi "Unable to acquire the dpkg frontend lock"; '
500
+ ' if [ $? -eq 0 ]; then '
501
+ ' echo "apt install failed due to lock, retrying. (Attempt $i/5)"; '
502
+ ' sleep 5; '
503
+ ' else '
504
+ ' echo "apt install failed for a non-lock reason: $output"; '
505
+ ' exit $rc; '
506
+ ' fi; '
507
+ 'done; '
508
+ 'if [ $rc -ne 0 ]; then '
509
+ ' echo "apt install failed after 5 attempts due to lock errors."; '
510
+ ' exit $rc; '
511
+ 'fi; '
512
+ '$(prefix_cmd) mkdir -p /var/run/sshd; '
513
+ '$(prefix_cmd) '
514
+ 'sed -i "s/PermitRootLogin prohibit-password/PermitRootLogin yes/" '
515
+ '/etc/ssh/sshd_config; '
516
+ '$(prefix_cmd) sed '
517
+ '"s@session\\s*required\\s*pam_loginuid.so@session optional '
518
+ 'pam_loginuid.so@g" -i /etc/pam.d/sshd; '
519
+ 'cd /etc/ssh/ && $(prefix_cmd) ssh-keygen -A; '
520
+ '$(prefix_cmd) mkdir -p ~/.ssh; '
521
+ '$(prefix_cmd) chown -R $(whoami) ~/.ssh;'
522
+ '$(prefix_cmd) chmod 700 ~/.ssh; '
523
+ '$(prefix_cmd) cat /etc/secret-volume/ssh-publickey* > '
524
+ '~/.ssh/authorized_keys; '
525
+ '$(prefix_cmd) chmod 644 ~/.ssh/authorized_keys; '
526
+ '$(prefix_cmd) service ssh restart; '
527
+ # Eliminate the error
528
+ # `mesg: ttyname failed: inappropriate ioctl for device`.
529
+ # See https://www.educative.io/answers/error-mesg-ttyname-failed-inappropriate-ioctl-for-device # pylint: disable=line-too-long
530
+ '$(prefix_cmd) sed -i "s/mesg n/tty -s \\&\\& mesg n/" ~/.profile;')
531
+
532
+ pre_init_cmd = ('set -ex; ' + check_k8s_user_sudo_cmd +
533
+ set_k8s_env_var_cmd + check_apt_update_complete_cmd +
534
+ install_ssh_k8s_cmd)
535
+
536
+ def _pre_init_thread(new_node):
537
+ pod_name = new_node.metadata.name
538
+ logger.info(f'{"-"*20}Start: Pre-init in pod {pod_name!r} {"-"*20}')
539
+ runner = command_runner.KubernetesCommandRunner(
540
+ ((namespace, context), pod_name))
541
+
542
+ # Run the combined pre-init command
543
+ rc, stdout, _ = runner.run(pre_init_cmd,
544
+ require_outputs=True,
545
+ stream_logs=False)
546
+ if rc == exceptions.INSUFFICIENT_PRIVILEGES_CODE:
338
547
  raise config_lib.KubernetesError(
339
548
  'Insufficient system privileges detected. '
340
549
  'Ensure the default user has root access or '
341
550
  '"sudo" is installed and the user is added to the sudoers '
342
551
  'from the image.')
343
552
 
553
+ op_name = 'pre-init'
554
+ _raise_command_running_error(op_name, pre_init_cmd, pod_name, rc,
555
+ stdout)
344
556
 
345
- def _setup_ssh_in_pods(namespace: str, new_nodes: List) -> None:
346
- # Setting up ssh for the pod instance. This is already setup for
347
- # the jump pod so it does not need to be run for it.
348
- set_k8s_ssh_cmd = [
349
- '/bin/sh',
350
- '-c',
351
- (
352
- 'set -x; '
353
- 'prefix_cmd() '
354
- '{ if [ $(id -u) -ne 0 ]; then echo "sudo"; else echo ""; fi; }; '
355
- 'export DEBIAN_FRONTEND=noninteractive;'
356
- '$(prefix_cmd) apt-get update;'
357
- '$(prefix_cmd) apt install openssh-server rsync -y; '
358
- '$(prefix_cmd) mkdir -p /var/run/sshd; '
359
- '$(prefix_cmd) '
360
- 'sed -i "s/PermitRootLogin prohibit-password/PermitRootLogin yes/" '
361
- '/etc/ssh/sshd_config; '
362
- '$(prefix_cmd) sed '
363
- '"s@session\\s*required\\s*pam_loginuid.so@session optional '
364
- 'pam_loginuid.so@g" -i /etc/pam.d/sshd; '
365
- 'cd /etc/ssh/ && $(prefix_cmd) ssh-keygen -A; '
366
- '$(prefix_cmd) mkdir -p ~/.ssh; '
367
- '$(prefix_cmd) chown -R $(whoami) ~/.ssh;'
368
- '$(prefix_cmd) chmod 700 ~/.ssh; '
369
- '$(prefix_cmd) chmod 644 ~/.ssh/authorized_keys; '
370
- '$(prefix_cmd) cat /etc/secret-volume/ssh-publickey* > '
371
- '~/.ssh/authorized_keys; '
372
- '$(prefix_cmd) service ssh restart; '
373
- # Eliminate the error
374
- # `mesg: ttyname failed: inappropriate ioctl for device`.
375
- # See https://www.educative.io/answers/error-mesg-ttyname-failed-inappropriate-ioctl-for-device # pylint: disable=line-too-long
376
- '$(prefix_cmd) sed -i "s/mesg n/tty -s \\&\\& mesg n/" ~/.profile;')
377
- ]
378
- # TODO(romilb): Parallelize the setup of SSH in pods for multi-node clusters
379
- for new_node in new_nodes:
380
- pod_name = new_node.metadata.name
381
- logger.info(f'{"-"*20}Start: Set up SSH in pod {pod_name!r} {"-"*20}')
382
- _run_command_on_pods(new_node.metadata.name,
383
- namespace,
384
- set_k8s_ssh_cmd,
385
- stream_logs=True)
386
- logger.info(f'{"-"*20}End: Set up SSH in pod {pod_name!r} {"-"*20}')
557
+ logger.info(f'{"-"*20}End: Pre-init in pod {pod_name!r} {"-"*20}')
558
+
559
+ # Run pre_init in parallel across all new_nodes
560
+ subprocess_utils.run_in_parallel(_pre_init_thread, new_nodes, _NUM_THREADS)
387
561
 
388
562
 
389
- def _label_pod(namespace: str, pod_name: str, label: Dict[str, str]) -> None:
563
+ def _label_pod(namespace: str, context: Optional[str], pod_name: str,
564
+ label: Dict[str, str]) -> None:
390
565
  """Label a pod."""
391
- kubernetes.core_api().patch_namespaced_pod(
566
+ kubernetes.core_api(context).patch_namespaced_pod(
392
567
  pod_name,
393
568
  namespace, {'metadata': {
394
569
  'labels': label
@@ -396,11 +571,92 @@ def _label_pod(namespace: str, pod_name: str, label: Dict[str, str]) -> None:
396
571
  _request_timeout=kubernetes.API_TIMEOUT)
397
572
 
398
573
 
574
+ @timeline.event
575
+ def _create_namespaced_pod_with_retries(namespace: str, pod_spec: dict,
576
+ context: Optional[str]) -> Any:
577
+ """Attempts to create a Kubernetes Pod and handle any errors.
578
+
579
+ Currently, we handle errors due to the AppArmor annotation and retry if
580
+ it fails due to the `FieldValueForbidden` error.
581
+ See https://github.com/skypilot-org/skypilot/issues/4174 for details.
582
+
583
+ Returns: The created Pod object.
584
+ """
585
+ try:
586
+ # Attempt to create the Pod with the AppArmor annotation
587
+ pod = kubernetes.core_api(context).create_namespaced_pod(
588
+ namespace, pod_spec)
589
+ return pod
590
+ except kubernetes.api_exception() as e:
591
+ try:
592
+ error_body = json.loads(e.body)
593
+ error_message = error_body.get('message', '')
594
+ except json.JSONDecodeError:
595
+ error_message = str(e.body)
596
+ # Check if the error is due to the AppArmor annotation and retry.
597
+ # We add an AppArmor annotation to set it as unconfined in our
598
+ # base template in kubernetes-ray.yml.j2. This is required for
599
+ # FUSE to work in the pod on most Kubernetes distributions.
600
+ # However, some distributions do not support the AppArmor annotation
601
+ # and will fail to create the pod. In this case, we retry without
602
+ # the annotation.
603
+ if (e.status == 422 and 'FieldValueForbidden' in error_message and
604
+ 'AppArmorProfile: nil' in error_message):
605
+ logger.warning('AppArmor annotation caused pod creation to fail. '
606
+ 'Retrying without the annotation. '
607
+ 'Note: this may cause bucket mounting to fail.')
608
+
609
+ # Remove the AppArmor annotation
610
+ annotations = pod_spec.get('metadata', {}).get('annotations', {})
611
+ if ('container.apparmor.security.beta.kubernetes.io/ray-node'
612
+ in annotations):
613
+ del annotations[
614
+ 'container.apparmor.security.beta.kubernetes.io/ray-node']
615
+ pod_spec['metadata']['annotations'] = annotations
616
+ logger.info('AppArmor annotation removed from Pod spec.')
617
+ else:
618
+ logger.warning('AppArmor annotation not found in pod spec, '
619
+ 'retrying will not help. '
620
+ f'Current annotations: {annotations}')
621
+ raise e
622
+
623
+ # Retry Pod creation without the AppArmor annotation
624
+ try:
625
+ pod = kubernetes.core_api(context).create_namespaced_pod(
626
+ namespace, pod_spec)
627
+ logger.info(f'Pod {pod.metadata.name} created successfully '
628
+ 'without AppArmor annotation.')
629
+ return pod
630
+ except kubernetes.api_exception() as retry_exception:
631
+ logger.info('Failed to create Pod without AppArmor annotation: '
632
+ f'{retry_exception}')
633
+ raise retry_exception
634
+ # Unlike other error from resource lackage on CPU/GPU/Memory, TPU
635
+ # lackage error is raised when pod is attemtped to be created.
636
+ # TODO(Doyoung): Update the error message raised with the multi-host
637
+ # TPU support.
638
+ elif 'Invalid resource requests for google.com/tpu.' in error_message:
639
+ extra_message = ('Verify if the cluster has a TPU slice node with '
640
+ 'a topology matching the number of TPU(s) '
641
+ 'requested. Note that multi-host TPU podslices '
642
+ 'are currently not unsupported.')
643
+ raise config_lib.KubernetesError(
644
+ _lack_resource_msg('TPU',
645
+ pod_spec,
646
+ details=error_message,
647
+ extra_msg=extra_message))
648
+ else:
649
+ # Re-raise the exception if it's a different error
650
+ raise e
651
+
652
+
653
+ @timeline.event
399
654
  def _create_pods(region: str, cluster_name_on_cloud: str,
400
655
  config: common.ProvisionConfig) -> common.ProvisionRecord:
401
656
  """Create pods based on the config."""
402
657
  provider_config = config.provider_config
403
- namespace = _get_namespace(provider_config)
658
+ namespace = kubernetes_utils.get_namespace_from_config(provider_config)
659
+ context = kubernetes_utils.get_context_from_config(provider_config)
404
660
  pod_spec = copy.deepcopy(config.node_config)
405
661
  tags = {
406
662
  TAG_RAY_CLUSTER_NAME: cluster_name_on_cloud,
@@ -413,17 +669,19 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
413
669
  pod_spec['metadata']['labels'].update(
414
670
  {TAG_SKYPILOT_CLUSTER_NAME: cluster_name_on_cloud})
415
671
 
416
- terminating_pods = _filter_pods(namespace, tags, ['Terminating'])
672
+ terminating_pods = kubernetes_utils.filter_pods(namespace, context, tags,
673
+ ['Terminating'])
417
674
  start_time = time.time()
418
- while (len(terminating_pods) > 0 and
675
+ while (terminating_pods and
419
676
  time.time() - start_time < _TIMEOUT_FOR_POD_TERMINATION):
420
677
  logger.debug(f'run_instances: Found {len(terminating_pods)} '
421
678
  'terminating pods. Waiting them to finish: '
422
679
  f'{list(terminating_pods.keys())}')
423
680
  time.sleep(POLL_INTERVAL)
424
- terminating_pods = _filter_pods(namespace, tags, ['Terminating'])
681
+ terminating_pods = kubernetes_utils.filter_pods(namespace, context,
682
+ tags, ['Terminating'])
425
683
 
426
- if len(terminating_pods) > 0:
684
+ if terminating_pods:
427
685
  # If there are still terminating pods, we force delete them.
428
686
  logger.debug(f'run_instances: Found {len(terminating_pods)} '
429
687
  'terminating pods still in terminating state after '
@@ -432,13 +690,14 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
432
690
  for pod_name in terminating_pods.keys():
433
691
  # grace_period_seconds=0 means force delete the pod.
434
692
  # https://github.com/kubernetes-client/python/issues/508#issuecomment-1695759777
435
- kubernetes.core_api().delete_namespaced_pod(
693
+ kubernetes.core_api(context).delete_namespaced_pod(
436
694
  pod_name,
437
695
  namespace,
438
696
  _request_timeout=config_lib.DELETION_TIMEOUT,
439
697
  grace_period_seconds=0)
440
698
 
441
- running_pods = _filter_pods(namespace, tags, ['Pending', 'Running'])
699
+ running_pods = kubernetes_utils.filter_pods(namespace, context, tags,
700
+ ['Pending', 'Running'])
442
701
  head_pod_name = _get_head_pod_name(running_pods)
443
702
  logger.debug(f'Found {len(running_pods)} existing pods: '
444
703
  f'{list(running_pods.keys())}')
@@ -456,7 +715,8 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
456
715
  # Add nvidia runtime class if it exists
457
716
  nvidia_runtime_exists = False
458
717
  try:
459
- nvidia_runtime_exists = kubernetes_utils.check_nvidia_runtime_class()
718
+ nvidia_runtime_exists = kubernetes_utils.check_nvidia_runtime_class(
719
+ context)
460
720
  except kubernetes.kubernetes.client.ApiException as e:
461
721
  logger.warning('run_instances: Error occurred while checking for '
462
722
  f'nvidia RuntimeClass - '
@@ -464,32 +724,45 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
464
724
  'Continuing without using nvidia RuntimeClass.\n'
465
725
  'If you are on a K3s cluster, manually '
466
726
  'override runtimeClassName in ~/.sky/config.yaml. '
467
- 'For more details, refer to https://skypilot.readthedocs.io/en/latest/reference/config.html') # pylint: disable=line-too-long
727
+ 'For more details, refer to https://docs.skypilot.co/en/latest/reference/config.html') # pylint: disable=line-too-long
728
+
729
+ needs_gpus = False
730
+ limits = pod_spec['spec']['containers'][0].get('resources',
731
+ {}).get('limits')
732
+ if limits is not None:
733
+ needs_gpus = limits.get(kubernetes_utils.get_gpu_resource_key(), 0) > 0
468
734
 
469
- if nvidia_runtime_exists:
735
+ # TPU pods provisioned on GKE use the default containerd runtime.
736
+ # Reference: https://cloud.google.com/kubernetes-engine/docs/how-to/migrate-containerd#overview # pylint: disable=line-too-long
737
+ if nvidia_runtime_exists and needs_gpus:
470
738
  pod_spec['spec']['runtimeClassName'] = 'nvidia'
471
739
 
472
740
  created_pods = {}
473
741
  logger.debug(f'run_instances: calling create_namespaced_pod '
474
742
  f'(count={to_start_count}).')
475
- for _ in range(to_start_count):
476
- if head_pod_name is None:
477
- pod_spec['metadata']['labels'][TAG_RAY_NODE_KIND] = 'head'
743
+
744
+ def _create_pod_thread(i: int):
745
+ pod_spec_copy = copy.deepcopy(pod_spec)
746
+ if head_pod_name is None and i == 0:
747
+ # First pod should be head if no head exists
748
+ pod_spec_copy['metadata']['labels'].update(constants.HEAD_NODE_TAGS)
478
749
  head_selector = head_service_selector(cluster_name_on_cloud)
479
- pod_spec['metadata']['labels'].update(head_selector)
480
- pod_spec['metadata']['name'] = f'{cluster_name_on_cloud}-head'
750
+ pod_spec_copy['metadata']['labels'].update(head_selector)
751
+ pod_spec_copy['metadata']['name'] = f'{cluster_name_on_cloud}-head'
481
752
  else:
482
- pod_spec['metadata']['labels'][TAG_RAY_NODE_KIND] = 'worker'
483
- pod_uuid = str(uuid.uuid4())[:4]
753
+ # Worker pods
754
+ pod_spec_copy['metadata']['labels'].update(
755
+ constants.WORKER_NODE_TAGS)
756
+ pod_uuid = str(uuid.uuid4())[:6]
484
757
  pod_name = f'{cluster_name_on_cloud}-{pod_uuid}'
485
- pod_spec['metadata']['name'] = f'{pod_name}-worker'
758
+ pod_spec_copy['metadata']['name'] = f'{pod_name}-worker'
486
759
  # For multi-node support, we put a soft-constraint to schedule
487
760
  # worker pods on different nodes than the head pod.
488
761
  # This is not set as a hard constraint because if different nodes
489
762
  # are not available, we still want to be able to schedule worker
490
763
  # pods on larger nodes which may be able to fit multiple SkyPilot
491
764
  # "nodes".
492
- pod_spec['spec']['affinity'] = {
765
+ pod_spec_copy['spec']['affinity'] = {
493
766
  'podAntiAffinity': {
494
767
  # Set as a soft constraint
495
768
  'preferredDuringSchedulingIgnoredDuringExecution': [{
@@ -510,67 +783,67 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
                 }
             }

-        pod = kubernetes.core_api().create_namespaced_pod(namespace, pod_spec)
+        # TPU slice nodes are given a taint, google.com/tpu=present:NoSchedule.
+        # This is to prevent non-TPU workloads from being scheduled on TPU
+        # slice nodes. We need this toleration to allow the pod to be scheduled
+        # on TPU nodes.
+        # Reference: https://cloud.google.com/kubernetes-engine/docs/concepts/tpus#how_tpus_work # pylint: disable=line-too-long
+        tpu_label = kubernetes_utils.GKELabelFormatter.TPU_LABEL_KEY
+        if tpu_label in config.node_config.get('spec',
+                                               {}).get('nodeSelector', {}):
+            tpu_toleration = {
+                'key': kubernetes_utils.TPU_RESOURCE_KEY,
+                'operator': 'Equal',
+                'value': 'present',
+                'effect': 'NoSchedule'
+            }
+            # Preserve existing tolerations if any
+            existing_tolerations = pod_spec_copy['spec'].get('tolerations', [])
+            pod_spec_copy['spec']['tolerations'] = existing_tolerations + [
+                tpu_toleration
+            ]
+
+        return _create_namespaced_pod_with_retries(namespace, pod_spec_copy,
+                                                   context)
+
+    # Create pods in parallel
+    pods = subprocess_utils.run_in_parallel(_create_pod_thread,
+                                            list(range(to_start_count)),
+                                            _NUM_THREADS)
+
+    # Process created pods
+    for pod in pods:
         created_pods[pod.metadata.name] = pod
-        if head_pod_name is None:
+        if head_pod_name is None and pod.metadata.labels.get(
+                constants.TAG_RAY_NODE_KIND) == 'head':
             head_pod_name = pod.metadata.name

-    # Adding the jump pod to the new_nodes list as well so it can be
-    # checked if it's scheduled and running along with other pods.
-    ssh_jump_pod_name = pod_spec['metadata']['labels']['skypilot-ssh-jump']
-    jump_pod = kubernetes.core_api().read_namespaced_pod(
-        ssh_jump_pod_name, namespace)
-    wait_pods_dict = _filter_pods(namespace, tags, ['Pending'])
-    wait_pods = list(wait_pods_dict.values())
-    wait_pods.append(jump_pod)
+    networking_mode = network_utils.get_networking_mode(
+        config.provider_config.get('networking_mode'))
+    if networking_mode == kubernetes_enums.KubernetesNetworkingMode.NODEPORT:
+        # Adding the jump pod to the new_nodes list as well so it can be
+        # checked if it's scheduled and running along with other pods.
+        ssh_jump_pod_name = pod_spec['metadata']['labels']['skypilot-ssh-jump']
+        jump_pod = kubernetes.core_api(context).read_namespaced_pod(
+            ssh_jump_pod_name, namespace)
+        pods.append(jump_pod)
     provision_timeout = provider_config['timeout']

     wait_str = ('indefinitely'
                 if provision_timeout < 0 else f'for {provision_timeout}s')
     logger.debug(f'run_instances: waiting {wait_str} for pods to schedule and '
-                 f'run: {list(wait_pods_dict.keys())}')
+                 f'run: {[pod.metadata.name for pod in pods]}')

     # Wait until the pods are scheduled and surface cause for error
     # if there is one
-    _wait_for_pods_to_schedule(namespace, wait_pods, provision_timeout)
+    _wait_for_pods_to_schedule(namespace, context, pods, provision_timeout)
     # Wait until the pods and their containers are up and running, and
     # fail early if there is an error
     logger.debug(f'run_instances: waiting for pods to be running (pulling '
-                 f'images): {list(wait_pods_dict.keys())}')
-    _wait_for_pods_to_run(namespace, wait_pods)
+                 f'images): {[pod.metadata.name for pod in pods]}')
+    _wait_for_pods_to_run(namespace, context, pods)
     logger.debug(f'run_instances: all pods are scheduled and running: '
-                 f'{list(wait_pods_dict.keys())}')
-
-    running_pods = _filter_pods(namespace, tags, ['Running'])
-    initialized_pods = _filter_pods(namespace, {
-        TAG_POD_INITIALIZED: 'true',
-        **tags
-    }, ['Running'])
-    uninitialized_pods = {
-        pod_name: pod
-        for pod_name, pod in running_pods.items()
-        if pod_name not in initialized_pods
-    }
-    if len(uninitialized_pods) > 0:
-        logger.debug(f'run_instances: Initializing {len(uninitialized_pods)} '
-                     f'pods: {list(uninitialized_pods.keys())}')
-        uninitialized_pods_list = list(uninitialized_pods.values())
-
-        # Setup SSH and environment variables in pods.
-        # Make sure commands used in these methods are generic and work
-        # on most base images. E.g., do not use Python, since that may not
-        # be installed by default.
-        _check_user_privilege(namespace, uninitialized_pods_list)
-        _setup_ssh_in_pods(namespace, uninitialized_pods_list)
-        _set_env_vars_in_pods(namespace, uninitialized_pods_list)
-
-        for pod in uninitialized_pods.values():
-            _label_pod(namespace,
-                       pod.metadata.name,
-                       label={
-                           TAG_POD_INITIALIZED: 'true',
-                           **pod.metadata.labels
-                       })
+                 f'{[pod.metadata.name for pod in pods]}')

     assert head_pod_name is not None, 'head_instance_id should not be None'
     return common.ProvisionRecord(
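# ---------------------------------------------------------------------------
# The hunk above replaces a serial create loop with
# subprocess_utils.run_in_parallel(_create_pod_thread, ..., _NUM_THREADS).
# A minimal sketch of that fan-out pattern using only the standard library;
# the real helper's semantics (thread reuse, error handling) may differ.
from concurrent.futures import ThreadPoolExecutor

def run_in_parallel(func, args, num_threads):
    with ThreadPoolExecutor(max_workers=num_threads) as pool:
        # map() keeps input order and re-raises any worker exception here.
        return list(pool.map(func, args))

# By analogy with the diff: results come back in index order, so the i == 0
# head pod is still identifiable after the parallel fan-out.
results = run_in_parallel(lambda i: f'pod-{i}', range(4), num_threads=2)
assert results == ['pod-0', 'pod-1', 'pod-2', 'pod-3']
# ---------------------------------------------------------------------------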
@@ -590,7 +863,9 @@ def run_instances(region: str, cluster_name_on_cloud: str,
     try:
         return _create_pods(region, cluster_name_on_cloud, config)
     except (kubernetes.api_exception(), config_lib.KubernetesError) as e:
-        logger.warning(f'run_instances: Error occurred when creating pods: {e}')
+        e_msg = common_utils.format_exception(e).replace('\n', ' ')
+        logger.warning('run_instances: Error occurred when creating pods: '
+                       f'{e_msg}')
         raise

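# ---------------------------------------------------------------------------
# Sketch of the log-flattening change above: exception text can span several
# lines, which splits a single log record. common_utils.format_exception is
# SkyPilot's formatter; plain str() stands in here as an assumption.
err = ValueError('pod rejected:\nadmission webhook denied the request')
e_msg = str(err).replace('\n', ' ')
assert '\n' not in e_msg  # one greppable log line
# ---------------------------------------------------------------------------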
@@ -607,35 +882,66 @@ def stop_instances(
     raise NotImplementedError()


-def _terminate_node(namespace: str, pod_name: str) -> None:
+def _terminate_node(namespace: str, context: Optional[str],
+                    pod_name: str) -> None:
     """Terminate a pod."""
     logger.debug('terminate_instances: calling delete_namespaced_pod')
-    try:
-        kubernetes_utils.clean_zombie_ssh_jump_pod(namespace, pod_name)
-    except Exception as e:  # pylint: disable=broad-except
-        logger.warning('terminate_instances: Error occurred when analyzing '
-                       f'SSH Jump pod: {e}')
-    try:
-        kubernetes.core_api().delete_namespaced_service(
-            pod_name, namespace, _request_timeout=config_lib.DELETION_TIMEOUT)
-        kubernetes.core_api().delete_namespaced_service(
-            f'{pod_name}-ssh',
-            namespace,
-            _request_timeout=config_lib.DELETION_TIMEOUT)
-    except kubernetes.api_exception():
-        pass
+
+    def _delete_k8s_resource_with_retry(delete_func: Callable,
+                                        resource_type: str,
+                                        resource_name: str) -> None:
+        """Helper to delete Kubernetes resources with 404 handling and retries.
+
+        Args:
+            delete_func: Function to call to delete the resource
+            resource_type: Type of resource being deleted (e.g. 'service'),
+                used in logging
+            resource_name: Name of the resource being deleted, used in logging
+        """
+        max_retries = 3
+        retry_delay = 5  # seconds
+
+        for attempt in range(max_retries):
+            try:
+                delete_func()
+                return
+            except kubernetes.api_exception() as e:
+                if e.status == 404:
+                    logger.warning(
+                        f'terminate_instances: Tried to delete {resource_type} '
+                        f'{resource_name}, but the {resource_type} was not '
+                        'found (404).')
+                    return
+                elif attempt < max_retries - 1:
+                    logger.warning(f'terminate_instances: Failed to delete '
+                                   f'{resource_type} {resource_name} (attempt '
+                                   f'{attempt + 1}/{max_retries}). Error: {e}. '
+                                   f'Retrying in {retry_delay} seconds...')
+                    time.sleep(retry_delay)
+                else:
+                    raise
+
+    # Delete services for the pod
+    for service_name in [pod_name, f'{pod_name}-ssh']:
+        _delete_k8s_resource_with_retry(
+            delete_func=lambda name=service_name: kubernetes.core_api(
+                context).delete_namespaced_service(name=name,
+                                                   namespace=namespace,
+                                                   _request_timeout=config_lib.
+                                                   DELETION_TIMEOUT),
+            resource_type='service',
+            resource_name=service_name)
+
     # Note - delete pod after all other resources are deleted.
     # This is to ensure there are no leftover resources if this down is run
     # from within the pod, e.g., for autodown.
-    try:
-        kubernetes.core_api().delete_namespaced_pod(
-            pod_name, namespace, _request_timeout=config_lib.DELETION_TIMEOUT)
-    except kubernetes.api_exception() as e:
-        if e.status == 404:
-            logger.warning('terminate_instances: Tried to delete pod '
-                           f'{pod_name}, but the pod was not found (404).')
-        else:
-            raise
+    _delete_k8s_resource_with_retry(
+        delete_func=lambda: kubernetes.core_api(context).delete_namespaced_pod(
+            name=pod_name,
+            namespace=namespace,
+            _request_timeout=config_lib.DELETION_TIMEOUT),
+        resource_type='pod',
+        resource_name=pod_name)


 def terminate_instances(
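# ---------------------------------------------------------------------------
# The service-deletion loop above binds the loop variable through a lambda
# default (name=service_name) on purpose: Python closures capture variables
# late, so without the default every callback built in the loop would target
# the final service name. A minimal demonstration:
names = ['pod-svc', 'pod-svc-ssh']
late = [lambda: n for n in names]
assert [f() for f in late] == ['pod-svc-ssh', 'pod-svc-ssh']  # late binding
bound = [lambda n=n: n for n in names]  # default is evaluated at definition
assert [f() for f in bound] == ['pod-svc', 'pod-svc-ssh']
# ---------------------------------------------------------------------------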
@@ -644,20 +950,38 @@ def terminate_instances(
     worker_only: bool = False,
 ) -> None:
     """See sky/provision/__init__.py"""
-    namespace = _get_namespace(provider_config)
+    namespace = kubernetes_utils.get_namespace_from_config(provider_config)
+    context = kubernetes_utils.get_context_from_config(provider_config)
     tag_filters = {
         TAG_RAY_CLUSTER_NAME: cluster_name_on_cloud,
     }
-    pods = _filter_pods(namespace, tag_filters, None)
+    pods = kubernetes_utils.filter_pods(namespace, context, tag_filters, None)
+
+    # Clean up the SSH jump pod if in use
+    networking_mode = network_utils.get_networking_mode(
+        provider_config.get('networking_mode'))
+    if networking_mode == kubernetes_enums.KubernetesNetworkingMode.NODEPORT:
+        pod_name = list(pods.keys())[0]
+        try:
+            kubernetes_utils.clean_zombie_ssh_jump_pod(namespace, context,
+                                                       pod_name)
+        except Exception as e:  # pylint: disable=broad-except
+            logger.warning('terminate_instances: Error occurred when analyzing '
+                           f'SSH Jump pod: {e}')

     def _is_head(pod) -> bool:
-        return pod.metadata.labels[TAG_RAY_NODE_KIND] == 'head'
+        return pod.metadata.labels[constants.TAG_RAY_NODE_KIND] == 'head'

-    for pod_name, pod in pods.items():
-        logger.debug(f'Terminating instance {pod_name}: {pod}')
+    def _terminate_pod_thread(pod_info):
+        pod_name, pod = pod_info
         if _is_head(pod) and worker_only:
-            continue
-        _terminate_node(namespace, pod_name)
+            return
+        logger.debug(f'Terminating instance {pod_name}: {pod}')
+        _terminate_node(namespace, context, pod_name)
+
+    # Run pod termination in parallel
+    subprocess_utils.run_in_parallel(_terminate_pod_thread, list(pods.items()),
+                                     _NUM_THREADS)


 def get_cluster_info(
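# ---------------------------------------------------------------------------
# Sketch of the worker_only filter used above: head and worker pods are told
# apart purely by a node-kind label, so skipping the head is a label check
# rather than an ordering assumption. The label key here is illustrative.
pods = {
    'mycluster-head': {'ray-node-kind': 'head'},
    'mycluster-ab12cd-worker': {'ray-node-kind': 'worker'},
}
to_terminate = [name for name, labels in pods.items()
                if labels['ray-node-kind'] != 'head']
assert to_terminate == ['mycluster-ab12cd-worker']
# ---------------------------------------------------------------------------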
@@ -666,12 +990,15 @@ def get_cluster_info(
     provider_config: Optional[Dict[str, Any]] = None) -> common.ClusterInfo:
     del region  # unused
     assert provider_config is not None
-    namespace = _get_namespace(provider_config)
+    namespace = kubernetes_utils.get_namespace_from_config(provider_config)
+    context = kubernetes_utils.get_context_from_config(provider_config)
     tag_filters = {
         TAG_RAY_CLUSTER_NAME: cluster_name_on_cloud,
     }

-    running_pods = _filter_pods(namespace, tag_filters, ['Running'])
+    running_pods = kubernetes_utils.filter_pods(namespace, context, tag_filters,
+                                                ['Running'])
+
     pods: Dict[str, List[common.InstanceInfo]] = {}
     head_pod_name = None
@@ -680,11 +1007,11 @@ def get_cluster_info(
         port_forward_mode.value)
     network_mode = kubernetes_enums.KubernetesNetworkingMode.from_str(
         network_mode_str)
-    external_ip = kubernetes_utils.get_external_ip(network_mode)
+    external_ip = kubernetes_utils.get_external_ip(network_mode, context)
     port = 22
     if not provider_config.get('use_internal_ips', False):
         port = kubernetes_utils.get_head_ssh_port(cluster_name_on_cloud,
-                                                  namespace)
+                                                  namespace, context)

     head_pod_name = None
     cpu_request = None
@@ -700,7 +1027,7 @@ def get_cluster_info(
                 tags=pod.metadata.labels,
             )
         ]
-        if pod.metadata.labels[TAG_RAY_NODE_KIND] == 'head':
+        if pod.metadata.labels[constants.TAG_RAY_NODE_KIND] == 'head':
             head_pod_name = pod_name
             head_spec = pod.spec
             assert head_spec is not None, pod
@@ -709,11 +1036,17 @@ def get_cluster_info(
     assert cpu_request is not None, 'cpu_request should not be None'

     ssh_user = 'sky'
-    get_k8s_ssh_user_cmd = ['/bin/sh', '-c', ('echo $(whoami)')]
+    get_k8s_ssh_user_cmd = 'echo $(whoami)'
     assert head_pod_name is not None
-    ssh_user = _run_command_on_pods(head_pod_name, namespace,
-                                    get_k8s_ssh_user_cmd)
-    ssh_user = ssh_user.strip()
+    runner = command_runner.KubernetesCommandRunner(
+        ((namespace, context), head_pod_name))
+    rc, stdout, stderr = runner.run(get_k8s_ssh_user_cmd,
+                                    require_outputs=True,
+                                    separate_stderr=True,
+                                    stream_logs=False)
+    _raise_command_running_error('get ssh user', get_k8s_ssh_user_cmd,
+                                 head_pod_name, rc, stdout + stderr)
+    ssh_user = stdout.strip()
     logger.debug(
         f'Using ssh user {ssh_user} for cluster {cluster_name_on_cloud}')

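# ---------------------------------------------------------------------------
# Sketch of the probe pattern the hunk above switches to: run a small shell
# command, keep stdout and stderr separate, and raise on a non-zero exit code
# instead of silently parsing noise. subprocess stands in for the
# KubernetesCommandRunner here.
import subprocess

proc = subprocess.run(['sh', '-c', 'echo $(whoami)'],
                      capture_output=True, text=True)
if proc.returncode != 0:
    raise RuntimeError(f'get ssh user failed: {proc.stderr.strip()}')
ssh_user = proc.stdout.strip()  # clean value, no stderr mixed in
# ---------------------------------------------------------------------------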
@@ -737,7 +1070,6 @@ def query_instances(
     provider_config: Optional[Dict[str, Any]] = None,
     non_terminated_only: bool = True
 ) -> Dict[str, Optional[status_lib.ClusterStatus]]:
-    del provider_config  # unused
     status_map = {
         'Pending': status_lib.ClusterStatus.INIT,
         'Running': status_lib.ClusterStatus.UP,
@@ -747,11 +1079,13 @@ def query_instances(
         'Terminating': None,
     }

-    namespace = kubernetes_utils.get_current_kube_config_context_namespace()
+    assert provider_config is not None
+    namespace = kubernetes_utils.get_namespace_from_config(provider_config)
+    context = kubernetes_utils.get_context_from_config(provider_config)

     # Get all the pods with the label skypilot-cluster: <cluster_name>
     try:
-        pods = kubernetes.core_api().list_namespaced_pod(
+        pods = kubernetes.core_api(context).list_namespaced_pod(
             namespace,
             label_selector=f'skypilot-cluster={cluster_name_on_cloud}',
             _request_timeout=kubernetes.API_TIMEOUT).items
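# ---------------------------------------------------------------------------
# The hunks in this file thread an optional kubeconfig context into every
# kubernetes.core_api(context) call. A minimal sketch of a context-aware
# client factory using the official kubernetes Python client (these are real
# kubernetes-client APIs; SkyPilot's adaptor may add caching and error
# handling on top).
from kubernetes import client, config

def core_api(context=None):
    # context=None falls back to the kubeconfig's current-context.
    config.load_kube_config(context=context)
    return client.CoreV1Api()
# ---------------------------------------------------------------------------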
@@ -776,3 +1110,24 @@ def query_instances(
             continue
         cluster_status[pod.metadata.name] = pod_status
     return cluster_status
+
+
+def get_command_runners(
+    cluster_info: common.ClusterInfo,
+    **credentials: Dict[str, Any],
+) -> List[command_runner.CommandRunner]:
+    """Get a command runner for the given cluster."""
+    assert cluster_info.provider_config is not None, cluster_info
+    instances = cluster_info.instances
+    namespace = kubernetes_utils.get_namespace_from_config(
+        cluster_info.provider_config)
+    context = kubernetes_utils.get_context_from_config(
+        cluster_info.provider_config)
+    node_list = []
+    if cluster_info.head_instance_id is not None:
+        node_list = [((namespace, context), cluster_info.head_instance_id)]
+    node_list.extend(((namespace, context), pod_name)
+                     for pod_name in instances.keys()
+                     if pod_name != cluster_info.head_instance_id)
+    return command_runner.KubernetesCommandRunner.make_runner_list(
+        node_list=node_list, **credentials)
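# ---------------------------------------------------------------------------
# Sketch of the head-first ordering get_command_runners builds above: the
# head pod is placed at index 0 and workers follow, so callers can rely on
# runners[0] addressing the head. Values are illustrative.
instances = {'c-head': [], 'c-ab12cd-worker': [], 'c-ef34gh-worker': []}
head_instance_id = 'c-head'
node_list = [(('ns', 'ctx'), head_instance_id)]
node_list.extend((('ns', 'ctx'), name)
                 for name in instances
                 if name != head_instance_id)
assert node_list[0][1] == head_instance_id and len(node_list) == 3
# ---------------------------------------------------------------------------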