skypilot-nightly 1.0.0.dev2024053101__py3-none-any.whl → 1.0.0.dev2025022801__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (299)
  1. sky/__init__.py +64 -32
  2. sky/adaptors/aws.py +23 -6
  3. sky/adaptors/azure.py +432 -15
  4. sky/adaptors/cloudflare.py +5 -5
  5. sky/adaptors/common.py +19 -9
  6. sky/adaptors/do.py +20 -0
  7. sky/adaptors/gcp.py +3 -2
  8. sky/adaptors/kubernetes.py +122 -88
  9. sky/adaptors/nebius.py +100 -0
  10. sky/adaptors/oci.py +39 -1
  11. sky/adaptors/vast.py +29 -0
  12. sky/admin_policy.py +101 -0
  13. sky/authentication.py +117 -98
  14. sky/backends/backend.py +52 -20
  15. sky/backends/backend_utils.py +669 -557
  16. sky/backends/cloud_vm_ray_backend.py +1099 -808
  17. sky/backends/local_docker_backend.py +14 -8
  18. sky/backends/wheel_utils.py +38 -20
  19. sky/benchmark/benchmark_utils.py +22 -23
  20. sky/check.py +76 -27
  21. sky/cli.py +1586 -1139
  22. sky/client/__init__.py +1 -0
  23. sky/client/cli.py +5683 -0
  24. sky/client/common.py +345 -0
  25. sky/client/sdk.py +1765 -0
  26. sky/cloud_stores.py +283 -19
  27. sky/clouds/__init__.py +7 -2
  28. sky/clouds/aws.py +303 -112
  29. sky/clouds/azure.py +185 -179
  30. sky/clouds/cloud.py +115 -37
  31. sky/clouds/cudo.py +29 -22
  32. sky/clouds/do.py +313 -0
  33. sky/clouds/fluidstack.py +44 -54
  34. sky/clouds/gcp.py +206 -65
  35. sky/clouds/ibm.py +26 -21
  36. sky/clouds/kubernetes.py +345 -91
  37. sky/clouds/lambda_cloud.py +40 -29
  38. sky/clouds/nebius.py +297 -0
  39. sky/clouds/oci.py +129 -90
  40. sky/clouds/paperspace.py +22 -18
  41. sky/clouds/runpod.py +53 -34
  42. sky/clouds/scp.py +28 -24
  43. sky/clouds/service_catalog/__init__.py +19 -13
  44. sky/clouds/service_catalog/aws_catalog.py +29 -12
  45. sky/clouds/service_catalog/azure_catalog.py +33 -6
  46. sky/clouds/service_catalog/common.py +95 -75
  47. sky/clouds/service_catalog/constants.py +3 -3
  48. sky/clouds/service_catalog/cudo_catalog.py +13 -3
  49. sky/clouds/service_catalog/data_fetchers/fetch_aws.py +36 -21
  50. sky/clouds/service_catalog/data_fetchers/fetch_azure.py +31 -4
  51. sky/clouds/service_catalog/data_fetchers/fetch_cudo.py +8 -117
  52. sky/clouds/service_catalog/data_fetchers/fetch_fluidstack.py +197 -44
  53. sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +224 -36
  54. sky/clouds/service_catalog/data_fetchers/fetch_lambda_cloud.py +44 -24
  55. sky/clouds/service_catalog/data_fetchers/fetch_vast.py +147 -0
  56. sky/clouds/service_catalog/data_fetchers/fetch_vsphere.py +1 -1
  57. sky/clouds/service_catalog/do_catalog.py +111 -0
  58. sky/clouds/service_catalog/fluidstack_catalog.py +2 -2
  59. sky/clouds/service_catalog/gcp_catalog.py +16 -2
  60. sky/clouds/service_catalog/ibm_catalog.py +2 -2
  61. sky/clouds/service_catalog/kubernetes_catalog.py +192 -70
  62. sky/clouds/service_catalog/lambda_catalog.py +8 -3
  63. sky/clouds/service_catalog/nebius_catalog.py +116 -0
  64. sky/clouds/service_catalog/oci_catalog.py +31 -4
  65. sky/clouds/service_catalog/paperspace_catalog.py +2 -2
  66. sky/clouds/service_catalog/runpod_catalog.py +2 -2
  67. sky/clouds/service_catalog/scp_catalog.py +2 -2
  68. sky/clouds/service_catalog/vast_catalog.py +104 -0
  69. sky/clouds/service_catalog/vsphere_catalog.py +2 -2
  70. sky/clouds/utils/aws_utils.py +65 -0
  71. sky/clouds/utils/azure_utils.py +91 -0
  72. sky/clouds/utils/gcp_utils.py +5 -9
  73. sky/clouds/utils/oci_utils.py +47 -5
  74. sky/clouds/utils/scp_utils.py +4 -3
  75. sky/clouds/vast.py +280 -0
  76. sky/clouds/vsphere.py +22 -18
  77. sky/core.py +361 -107
  78. sky/dag.py +41 -28
  79. sky/data/data_transfer.py +37 -0
  80. sky/data/data_utils.py +211 -32
  81. sky/data/mounting_utils.py +182 -30
  82. sky/data/storage.py +2118 -270
  83. sky/data/storage_utils.py +126 -5
  84. sky/exceptions.py +179 -8
  85. sky/execution.py +158 -85
  86. sky/global_user_state.py +150 -34
  87. sky/jobs/__init__.py +12 -10
  88. sky/jobs/client/__init__.py +0 -0
  89. sky/jobs/client/sdk.py +302 -0
  90. sky/jobs/constants.py +49 -11
  91. sky/jobs/controller.py +161 -99
  92. sky/jobs/dashboard/dashboard.py +171 -25
  93. sky/jobs/dashboard/templates/index.html +572 -60
  94. sky/jobs/recovery_strategy.py +157 -156
  95. sky/jobs/scheduler.py +307 -0
  96. sky/jobs/server/__init__.py +1 -0
  97. sky/jobs/server/core.py +598 -0
  98. sky/jobs/server/dashboard_utils.py +69 -0
  99. sky/jobs/server/server.py +190 -0
  100. sky/jobs/state.py +627 -122
  101. sky/jobs/utils.py +615 -206
  102. sky/models.py +27 -0
  103. sky/optimizer.py +142 -83
  104. sky/provision/__init__.py +20 -5
  105. sky/provision/aws/config.py +124 -42
  106. sky/provision/aws/instance.py +130 -53
  107. sky/provision/azure/__init__.py +7 -0
  108. sky/{skylet/providers → provision}/azure/azure-config-template.json +19 -7
  109. sky/provision/azure/config.py +220 -0
  110. sky/provision/azure/instance.py +1012 -37
  111. sky/provision/common.py +31 -3
  112. sky/provision/constants.py +25 -0
  113. sky/provision/cudo/__init__.py +2 -1
  114. sky/provision/cudo/cudo_utils.py +112 -0
  115. sky/provision/cudo/cudo_wrapper.py +37 -16
  116. sky/provision/cudo/instance.py +28 -12
  117. sky/provision/do/__init__.py +11 -0
  118. sky/provision/do/config.py +14 -0
  119. sky/provision/do/constants.py +10 -0
  120. sky/provision/do/instance.py +287 -0
  121. sky/provision/do/utils.py +301 -0
  122. sky/provision/docker_utils.py +82 -46
  123. sky/provision/fluidstack/fluidstack_utils.py +57 -125
  124. sky/provision/fluidstack/instance.py +15 -43
  125. sky/provision/gcp/config.py +19 -9
  126. sky/provision/gcp/constants.py +7 -1
  127. sky/provision/gcp/instance.py +55 -34
  128. sky/provision/gcp/instance_utils.py +339 -80
  129. sky/provision/gcp/mig_utils.py +210 -0
  130. sky/provision/instance_setup.py +172 -133
  131. sky/provision/kubernetes/__init__.py +1 -0
  132. sky/provision/kubernetes/config.py +104 -90
  133. sky/provision/kubernetes/constants.py +8 -0
  134. sky/provision/kubernetes/instance.py +680 -325
  135. sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml +3 -0
  136. sky/provision/kubernetes/network.py +54 -20
  137. sky/provision/kubernetes/network_utils.py +70 -21
  138. sky/provision/kubernetes/utils.py +1370 -251
  139. sky/provision/lambda_cloud/__init__.py +11 -0
  140. sky/provision/lambda_cloud/config.py +10 -0
  141. sky/provision/lambda_cloud/instance.py +265 -0
  142. sky/{clouds/utils → provision/lambda_cloud}/lambda_utils.py +24 -23
  143. sky/provision/logging.py +1 -1
  144. sky/provision/nebius/__init__.py +11 -0
  145. sky/provision/nebius/config.py +11 -0
  146. sky/provision/nebius/instance.py +285 -0
  147. sky/provision/nebius/utils.py +318 -0
  148. sky/provision/oci/__init__.py +15 -0
  149. sky/provision/oci/config.py +51 -0
  150. sky/provision/oci/instance.py +436 -0
  151. sky/provision/oci/query_utils.py +681 -0
  152. sky/provision/paperspace/constants.py +6 -0
  153. sky/provision/paperspace/instance.py +4 -3
  154. sky/provision/paperspace/utils.py +2 -0
  155. sky/provision/provisioner.py +207 -130
  156. sky/provision/runpod/__init__.py +1 -0
  157. sky/provision/runpod/api/__init__.py +3 -0
  158. sky/provision/runpod/api/commands.py +119 -0
  159. sky/provision/runpod/api/pods.py +142 -0
  160. sky/provision/runpod/instance.py +64 -8
  161. sky/provision/runpod/utils.py +239 -23
  162. sky/provision/vast/__init__.py +10 -0
  163. sky/provision/vast/config.py +11 -0
  164. sky/provision/vast/instance.py +247 -0
  165. sky/provision/vast/utils.py +162 -0
  166. sky/provision/vsphere/common/vim_utils.py +1 -1
  167. sky/provision/vsphere/instance.py +8 -18
  168. sky/provision/vsphere/vsphere_utils.py +1 -1
  169. sky/resources.py +247 -102
  170. sky/serve/__init__.py +9 -9
  171. sky/serve/autoscalers.py +361 -299
  172. sky/serve/client/__init__.py +0 -0
  173. sky/serve/client/sdk.py +366 -0
  174. sky/serve/constants.py +12 -3
  175. sky/serve/controller.py +106 -36
  176. sky/serve/load_balancer.py +63 -12
  177. sky/serve/load_balancing_policies.py +84 -2
  178. sky/serve/replica_managers.py +42 -34
  179. sky/serve/serve_state.py +62 -32
  180. sky/serve/serve_utils.py +271 -160
  181. sky/serve/server/__init__.py +0 -0
  182. sky/serve/{core.py → server/core.py} +271 -90
  183. sky/serve/server/server.py +112 -0
  184. sky/serve/service.py +52 -16
  185. sky/serve/service_spec.py +95 -32
  186. sky/server/__init__.py +1 -0
  187. sky/server/common.py +430 -0
  188. sky/server/constants.py +21 -0
  189. sky/server/html/log.html +174 -0
  190. sky/server/requests/__init__.py +0 -0
  191. sky/server/requests/executor.py +472 -0
  192. sky/server/requests/payloads.py +487 -0
  193. sky/server/requests/queues/__init__.py +0 -0
  194. sky/server/requests/queues/mp_queue.py +76 -0
  195. sky/server/requests/requests.py +567 -0
  196. sky/server/requests/serializers/__init__.py +0 -0
  197. sky/server/requests/serializers/decoders.py +192 -0
  198. sky/server/requests/serializers/encoders.py +166 -0
  199. sky/server/server.py +1106 -0
  200. sky/server/stream_utils.py +141 -0
  201. sky/setup_files/MANIFEST.in +2 -5
  202. sky/setup_files/dependencies.py +159 -0
  203. sky/setup_files/setup.py +14 -125
  204. sky/sky_logging.py +59 -14
  205. sky/skylet/autostop_lib.py +2 -2
  206. sky/skylet/constants.py +183 -50
  207. sky/skylet/events.py +22 -10
  208. sky/skylet/job_lib.py +403 -258
  209. sky/skylet/log_lib.py +111 -71
  210. sky/skylet/log_lib.pyi +6 -0
  211. sky/skylet/providers/command_runner.py +6 -8
  212. sky/skylet/providers/ibm/node_provider.py +2 -2
  213. sky/skylet/providers/scp/config.py +11 -3
  214. sky/skylet/providers/scp/node_provider.py +8 -8
  215. sky/skylet/skylet.py +3 -1
  216. sky/skylet/subprocess_daemon.py +69 -17
  217. sky/skypilot_config.py +119 -57
  218. sky/task.py +205 -64
  219. sky/templates/aws-ray.yml.j2 +37 -7
  220. sky/templates/azure-ray.yml.j2 +27 -82
  221. sky/templates/cudo-ray.yml.j2 +7 -3
  222. sky/templates/do-ray.yml.j2 +98 -0
  223. sky/templates/fluidstack-ray.yml.j2 +7 -4
  224. sky/templates/gcp-ray.yml.j2 +26 -6
  225. sky/templates/ibm-ray.yml.j2 +3 -2
  226. sky/templates/jobs-controller.yaml.j2 +46 -11
  227. sky/templates/kubernetes-ingress.yml.j2 +7 -0
  228. sky/templates/kubernetes-loadbalancer.yml.j2 +7 -0
  229. sky/templates/{kubernetes-port-forward-proxy-command.sh.j2 → kubernetes-port-forward-proxy-command.sh} +51 -7
  230. sky/templates/kubernetes-ray.yml.j2 +292 -25
  231. sky/templates/lambda-ray.yml.j2 +30 -40
  232. sky/templates/nebius-ray.yml.j2 +79 -0
  233. sky/templates/oci-ray.yml.j2 +18 -57
  234. sky/templates/paperspace-ray.yml.j2 +10 -6
  235. sky/templates/runpod-ray.yml.j2 +26 -4
  236. sky/templates/scp-ray.yml.j2 +3 -2
  237. sky/templates/sky-serve-controller.yaml.j2 +12 -1
  238. sky/templates/skypilot-server-kubernetes-proxy.sh +36 -0
  239. sky/templates/vast-ray.yml.j2 +70 -0
  240. sky/templates/vsphere-ray.yml.j2 +8 -3
  241. sky/templates/websocket_proxy.py +64 -0
  242. sky/usage/constants.py +10 -1
  243. sky/usage/usage_lib.py +130 -37
  244. sky/utils/accelerator_registry.py +35 -51
  245. sky/utils/admin_policy_utils.py +147 -0
  246. sky/utils/annotations.py +51 -0
  247. sky/utils/cli_utils/status_utils.py +81 -23
  248. sky/utils/cluster_utils.py +356 -0
  249. sky/utils/command_runner.py +452 -89
  250. sky/utils/command_runner.pyi +77 -3
  251. sky/utils/common.py +54 -0
  252. sky/utils/common_utils.py +319 -108
  253. sky/utils/config_utils.py +204 -0
  254. sky/utils/control_master_utils.py +48 -0
  255. sky/utils/controller_utils.py +548 -266
  256. sky/utils/dag_utils.py +93 -32
  257. sky/utils/db_utils.py +18 -4
  258. sky/utils/env_options.py +29 -7
  259. sky/utils/kubernetes/create_cluster.sh +8 -60
  260. sky/utils/kubernetes/deploy_remote_cluster.sh +243 -0
  261. sky/utils/kubernetes/exec_kubeconfig_converter.py +73 -0
  262. sky/utils/kubernetes/generate_kubeconfig.sh +336 -0
  263. sky/utils/kubernetes/gpu_labeler.py +4 -4
  264. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +4 -3
  265. sky/utils/kubernetes/kubernetes_deploy_utils.py +228 -0
  266. sky/utils/kubernetes/rsync_helper.sh +24 -0
  267. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +1 -1
  268. sky/utils/log_utils.py +240 -33
  269. sky/utils/message_utils.py +81 -0
  270. sky/utils/registry.py +127 -0
  271. sky/utils/resources_utils.py +94 -22
  272. sky/utils/rich_utils.py +247 -18
  273. sky/utils/schemas.py +284 -64
  274. sky/{status_lib.py → utils/status_lib.py} +12 -7
  275. sky/utils/subprocess_utils.py +212 -46
  276. sky/utils/timeline.py +12 -7
  277. sky/utils/ux_utils.py +168 -15
  278. skypilot_nightly-1.0.0.dev2025022801.dist-info/METADATA +363 -0
  279. skypilot_nightly-1.0.0.dev2025022801.dist-info/RECORD +352 -0
  280. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/WHEEL +1 -1
  281. sky/clouds/cloud_registry.py +0 -31
  282. sky/jobs/core.py +0 -330
  283. sky/skylet/providers/azure/__init__.py +0 -2
  284. sky/skylet/providers/azure/azure-vm-template.json +0 -301
  285. sky/skylet/providers/azure/config.py +0 -170
  286. sky/skylet/providers/azure/node_provider.py +0 -466
  287. sky/skylet/providers/lambda_cloud/__init__.py +0 -2
  288. sky/skylet/providers/lambda_cloud/node_provider.py +0 -320
  289. sky/skylet/providers/oci/__init__.py +0 -2
  290. sky/skylet/providers/oci/node_provider.py +0 -488
  291. sky/skylet/providers/oci/query_helper.py +0 -383
  292. sky/skylet/providers/oci/utils.py +0 -21
  293. sky/utils/cluster_yaml_utils.py +0 -24
  294. sky/utils/kubernetes/generate_static_kubeconfig.sh +0 -137
  295. skypilot_nightly-1.0.0.dev2024053101.dist-info/METADATA +0 -315
  296. skypilot_nightly-1.0.0.dev2024053101.dist-info/RECORD +0 -275
  297. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/LICENSE +0 -0
  298. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/entry_points.txt +0 -0
  299. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/top_level.txt +0 -0
sky/provision/kubernetes/utils.py

@@ -1,9 +1,14 @@
 """Kubernetes utilities for SkyPilot."""
+import dataclasses
+import functools
 import json
 import math
 import os
 import re
+import shutil
 import subprocess
+import time
+import typing
 from typing import Any, Dict, List, Optional, Set, Tuple, Union
 from urllib.parse import urlparse
 
@@ -11,17 +16,30 @@ import jinja2
 import yaml
 
 import sky
+from sky import clouds
 from sky import exceptions
+from sky import models
 from sky import sky_logging
 from sky import skypilot_config
 from sky.adaptors import kubernetes
+from sky.provision import constants as provision_constants
+from sky.provision.kubernetes import constants as kubernetes_constants
 from sky.provision.kubernetes import network_utils
+from sky.skylet import constants
+from sky.utils import annotations
 from sky.utils import common_utils
+from sky.utils import config_utils
 from sky.utils import env_options
 from sky.utils import kubernetes_enums
 from sky.utils import schemas
+from sky.utils import status_lib
+from sky.utils import timeline
 from sky.utils import ux_utils
 
+if typing.TYPE_CHECKING:
+    from sky import backends
+    from sky import resources as resources_lib
+
 # TODO(romilb): Move constants to constants.py
 DEFAULT_NAMESPACE = 'default'
 
@@ -35,10 +53,18 @@ MEMORY_SIZE_UNITS = {
     'T': 2**40,
     'P': 2**50,
 }
-NO_GPU_HELP_MESSAGE = ('If your cluster contains GPUs, make sure '
-                       'nvidia.com/gpu resource is available on the nodes and '
-                       'the node labels for identifying GPUs '
-                       '(e.g., skypilot.co/accelerator) are setup correctly. ')
+
+# The resource keys used by Kubernetes to track NVIDIA GPUs and Google TPUs on
+# nodes. These keys are typically used in the node's status.allocatable
+# or status.capacity fields to indicate the available resources on the node.
+GPU_RESOURCE_KEY = 'nvidia.com/gpu'
+TPU_RESOURCE_KEY = 'google.com/tpu'
+
+NO_ACCELERATOR_HELP_MESSAGE = (
+    'If your cluster contains GPUs or TPUs, make sure '
+    f'{GPU_RESOURCE_KEY} or {TPU_RESOURCE_KEY} resource is available '
+    'on the nodes and the node labels for identifying GPUs/TPUs '
+    '(e.g., skypilot.co/accelerator) are setup correctly. ')
 
 KUBERNETES_AUTOSCALER_NOTE = (
     'Note: Kubernetes cluster autoscaling is enabled. '
@@ -53,8 +79,106 @@ ENDPOINTS_DEBUG_MESSAGE = ('Additionally, make sure your {endpoint_type} '
 
 KIND_CONTEXT_NAME = 'kind-skypilot'  # Context name used by sky local up
 
+# Port-forward proxy command constants
+PORT_FORWARD_PROXY_CMD_TEMPLATE = 'kubernetes-port-forward-proxy-command.sh'
+# We add a version suffix to the port-forward proxy command to ensure backward
+# compatibility and avoid overwriting the older version.
+PORT_FORWARD_PROXY_CMD_VERSION = 2
+PORT_FORWARD_PROXY_CMD_PATH = ('~/.sky/kubernetes-port-forward-proxy-command-'
+                               f'v{PORT_FORWARD_PROXY_CMD_VERSION}.sh')
+
+# Mapping used to get generation for TPU accelerator name.
+# https://cloud.google.com/kubernetes-engine/docs/how-to/tpus#run
+GKE_TPU_ACCELERATOR_TO_GENERATION = {
+    'tpu-v4-podslice': 'v4',
+    # Only Single-host v5e TPU configurations are allowed.
+    'tpu-v5-lite-device': 'v5e',
+    # Multi-host compatible v5e TPU configurations allowed.
+    'tpu-v5-lite-podslice': 'v5e',
+    'tpu-v5p-slice': 'v5p',
+}
+
+POD_STATUSES = {
+    'Pending', 'Running', 'Succeeded', 'Failed', 'Unknown', 'Terminating'
+}
+AUTODOWN_ANNOTATION_KEY = 'skypilot.co/autodown'
+IDLE_MINUTES_TO_AUTOSTOP_ANNOTATION_KEY = (
+    'skypilot.co/idle_minutes_to_autostop')
+ANNOTATIONS_POD_NOT_FOUND_ERROR_MSG = ('Pod {pod_name} not found in namespace '
+                                       '{namespace} while trying to {action} '
+                                       'an annotation {annotation}.')
+
 logger = sky_logging.init_logger(__name__)
 
+# Default retry settings for Kubernetes API calls
+DEFAULT_MAX_RETRIES = 3
+DEFAULT_RETRY_INTERVAL_SECONDS = 1
+
+
+def _retry_on_error(max_retries=DEFAULT_MAX_RETRIES,
+                    retry_interval=DEFAULT_RETRY_INTERVAL_SECONDS,
+                    resource_type: Optional[str] = None):
+    """Decorator to retry Kubernetes API calls on transient failures.
+
+    Args:
+        max_retries: Maximum number of retry attempts
+        retry_interval: Initial seconds to wait between retries
+        resource_type: Type of resource being accessed (e.g. 'node', 'pod').
+            Used to provide more specific error messages.
+    """

+
+    def decorator(func):
+
+        @functools.wraps(func)
+        def wrapper(*args, **kwargs):
+            last_exception = None
+            backoff = common_utils.Backoff(initial_backoff=retry_interval,
+                                           max_backoff_factor=3)
+
+            for attempt in range(max_retries):
+                try:
+                    return func(*args, **kwargs)
+                except (kubernetes.max_retry_error(),
+                        kubernetes.api_exception(),
+                        kubernetes.config_exception()) as e:
+                    last_exception = e
+                    # Don't retry on permanent errors like 401 (Unauthorized)
+                    # or 403 (Forbidden)
+                    if (isinstance(e, kubernetes.api_exception()) and
+                            e.status in (401, 403)):
+                        raise
+                    if attempt < max_retries - 1:
+                        sleep_time = backoff.current_backoff()
+                        logger.debug(f'Kubernetes API call {func.__name__} '
+                                     f'failed with {str(e)}. Retrying in '
+                                     f'{sleep_time:.1f}s...')
+                        time.sleep(sleep_time)
+                        continue
+
+            # Format error message based on the type of exception
+            resource_msg = f' when trying to get {resource_type} info' \
+                if resource_type else ''
+            debug_cmd = f' To debug, run: kubectl get {resource_type}s' \
+                if resource_type else ''
+
+            if isinstance(last_exception, kubernetes.max_retry_error()):
+                error_msg = f'Timed out{resource_msg} from Kubernetes cluster.'
+            elif isinstance(last_exception, kubernetes.api_exception()):
+                error_msg = (f'Kubernetes API error{resource_msg}: '
+                             f'{str(last_exception)}')
+            else:
+                error_msg = (f'Kubernetes configuration error{resource_msg}: '
+                             f'{str(last_exception)}')
+
+            raise exceptions.ResourcesUnavailableError(
+                f'{error_msg}'
+                f' Please check if the cluster is healthy and retry.'
+                f'{debug_cmd}') from last_exception
+
+        return wrapper
+
+    return decorator
+
 
 class GPULabelFormatter:
     """Base class to define a GPU label formatter for a Kubernetes cluster
@@ -65,15 +189,41 @@ class GPULabelFormatter:
     """
 
     @classmethod
-    def get_label_key(cls) -> str:
+    def get_tpu_topology_label_key(cls) -> str:
+        """Returns the label for TPU topology used by the Kubernetes cluster.
+
+        Only implemented by formatters that support TPUs.
+        """
+        raise NotImplementedError
+
+    @classmethod
+    def get_tpu_topology_label_value(cls, acc_type: str, acc_count: int) -> str:
+        """Returns the TPU topology value for the given TPU type and count.
+
+        Only implemented by formatters that support TPUs.
+        """
+        raise NotImplementedError
+
+    @classmethod
+    def get_label_key(cls, accelerator: Optional[str] = None) -> str:
         """Returns the label key for GPU type used by the Kubernetes cluster"""
         raise NotImplementedError
 
+    @classmethod
+    def get_label_keys(cls) -> List[str]:
+        """Returns a list of label keys for GPU used by Kubernetes cluster."""
+        raise NotImplementedError
+
     @classmethod
     def get_label_value(cls, accelerator: str) -> str:
         """Given a GPU type, returns the label value to be used"""
         raise NotImplementedError
 
+    @classmethod
+    def match_label_key(cls, label_key: str) -> bool:
+        """Checks if the given label key matches the formatter's label keys"""
+        raise NotImplementedError
+
     @classmethod
     def get_accelerator_from_label_value(cls, value: str) -> str:
         """Given a label value, returns the GPU type"""
@@ -95,14 +245,21 @@
 
 
 def get_gke_accelerator_name(accelerator: str) -> str:
-    """Returns the accelerator name for GKE clusters
+    """Returns the accelerator name for GKE clusters.
 
     Uses the format - nvidia-tesla-<accelerator>.
-    A100-80GB, H100-80GB and L4 are an exception. They use nvidia-<accelerator>.
+    A100-80GB, H100-80GB, L4 are an exception. They use nvidia-<accelerator>.
+    TPU types are an exception as well, keeping the given name.
     """
-    if accelerator in ('A100-80GB', 'L4', 'H100-80GB'):
-        # A100-80GB, L4 and H100-80GB have a different name pattern.
+    if accelerator == 'H100':
+        # H100 is named as H100-80GB in GKE.
+        accelerator = 'H100-80GB'
+    if accelerator in ('A100-80GB', 'L4', 'H100-80GB', 'H100-MEGA-80GB'):
+        # A100-80GB, L4, H100-80GB and H100-MEGA-80GB
+        # have a different name pattern.
         return 'nvidia-{}'.format(accelerator.lower())
+    elif accelerator.startswith('tpu-'):
+        return accelerator
     else:
         return 'nvidia-tesla-{}'.format(accelerator.lower())
 
@@ -117,15 +274,23 @@ class SkyPilotLabelFormatter(GPULabelFormatter):
     LABEL_KEY = 'skypilot.co/accelerator'
 
     @classmethod
-    def get_label_key(cls) -> str:
+    def get_label_key(cls, accelerator: Optional[str] = None) -> str:
         return cls.LABEL_KEY
 
+    @classmethod
+    def get_label_keys(cls) -> List[str]:
+        return [cls.LABEL_KEY]
+
     @classmethod
     def get_label_value(cls, accelerator: str) -> str:
         # For SkyPilot formatter, we use the accelerator str directly.
         # See sky.utils.kubernetes.gpu_labeler.
         return accelerator.lower()
 
+    @classmethod
+    def match_label_key(cls, label_key: str) -> bool:
+        return label_key == cls.LABEL_KEY
+
     @classmethod
     def get_accelerator_from_label_value(cls, value: str) -> str:
         return value.upper()
@@ -149,13 +314,21 @@ class CoreWeaveLabelFormatter(GPULabelFormatter):
     LABEL_KEY = 'gpu.nvidia.com/class'
 
     @classmethod
-    def get_label_key(cls) -> str:
+    def get_label_key(cls, accelerator: Optional[str] = None) -> str:
         return cls.LABEL_KEY
 
+    @classmethod
+    def get_label_keys(cls) -> List[str]:
+        return [cls.LABEL_KEY]
+
     @classmethod
     def get_label_value(cls, accelerator: str) -> str:
         return accelerator.upper()
 
+    @classmethod
+    def match_label_key(cls, label_key: str) -> bool:
+        return label_key == cls.LABEL_KEY
+
     @classmethod
     def get_accelerator_from_label_value(cls, value: str) -> str:
         return value
@@ -167,12 +340,67 @@ class GKELabelFormatter(GPULabelFormatter):
     GKE nodes by default are populated with `cloud.google.com/gke-accelerator`
     label, which is used to identify the GPU type.
     """
+    GPU_LABEL_KEY = 'cloud.google.com/gke-accelerator'
+    TPU_LABEL_KEY = 'cloud.google.com/gke-tpu-accelerator'
+    ACCELERATOR_COUNT_LABEL_KEY = 'cloud.google.com/gke-accelerator-count'
+    TPU_TOPOLOGY_LABEL_KEY = 'cloud.google.com/gke-tpu-topology'
+
+    # Mapping from TPU type to {count: topologies}. Used to determine topology
+    # label to use in an autoscaling environment. For list of topologies, see:
+    # tpu v5e: https://cloud.google.com/tpu/docs/tpus-in-gke
+    # tpu v5p: https://cloud.google.com/tpu/docs/v5p
+    # TODO(romilb): Add support for TPU v4 and v6.
+    GKE_TPU_TOPOLOGIES = {
+        'tpu-v5-lite-podslice': {
+            1: '1x1',
+            4: '2x2',
+            8: '2x4'
+        },
+        'tpu-v5-lite-device': {
+            1: '1x1',
+            4: '2x2',
+            8: '2x4'
+        },
+        'tpu-v5p-slice': {
+            4: '2x2x1'
+        },
+    }
 
-    LABEL_KEY = 'cloud.google.com/gke-accelerator'
+    @classmethod
+    def get_label_key(cls, accelerator: Optional[str] = None) -> str:
+        if accelerator is not None and accelerator.startswith('tpu-'):
+            return cls.TPU_LABEL_KEY
+        return cls.GPU_LABEL_KEY
 
     @classmethod
-    def get_label_key(cls) -> str:
-        return cls.LABEL_KEY
+    def get_label_keys(cls) -> List[str]:
+        return [cls.GPU_LABEL_KEY, cls.TPU_LABEL_KEY]
+
+    @classmethod
+    def match_label_key(cls, label_key: str) -> bool:
+        return label_key in cls.get_label_keys()
+
+    @classmethod
+    def get_tpu_topology_label_key(cls) -> str:
+        return cls.TPU_TOPOLOGY_LABEL_KEY
+
+    @classmethod
+    def get_tpu_topology_label_value(cls, acc_type: str, acc_count: int) -> str:
+        """Returns the TPU topology label value for the given TPU count.
+
+        e.g. tpu-v5-lite-podslice:8 -> '2x4'
+        """
+        count_to_topology = cls.GKE_TPU_TOPOLOGIES.get(acc_type,
+                                                       {}).get(acc_count, None)
+        if count_to_topology is None:
+            supported_tpus = {
+                tpu: list(topologies.values())
+                for tpu, topologies in cls.GKE_TPU_TOPOLOGIES.items()
+            }
+            raise ValueError(
+                f'No TPU topology found for {acc_type} with count {acc_count}. '
+                f'Supported TPU types and counts: {supported_tpus}')
+        return count_to_topology
 
     @classmethod
     def get_label_value(cls, accelerator: str) -> str:
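The GKE_TPU_TOPOLOGIES table above picks a topology label value (e.g. '2x4') for a requested chip count, and later in the diff a node's topology label is reduced back to a chip count via reduce_tpu_topology, which is not shown in this excerpt. A small sketch of the assumed round trip, taking the chip count to be the product of the topology dimensions:

from functools import reduce
from operator import mul

GKE_TPU_TOPOLOGIES = {
    'tpu-v5-lite-podslice': {1: '1x1', 4: '2x2', 8: '2x4'},
    'tpu-v5-lite-device': {1: '1x1', 4: '2x2', 8: '2x4'},
    'tpu-v5p-slice': {4: '2x2x1'},
}


def chip_count(topology: str) -> int:
    # Assumed behavior of reduce_tpu_topology: '2x4' -> 8, '2x2x1' -> 4.
    return reduce(mul, (int(dim) for dim in topology.split('x')))


assert GKE_TPU_TOPOLOGIES['tpu-v5-lite-podslice'][8] == '2x4'
assert chip_count('2x4') == 8
assert chip_count('2x2x1') == 4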
@@ -183,12 +411,85 @@ class GKELabelFormatter(GPULabelFormatter):
         if value.startswith('nvidia-tesla-'):
             return value.replace('nvidia-tesla-', '').upper()
         elif value.startswith('nvidia-'):
-            return value.replace('nvidia-', '').upper()
+            acc = value.replace('nvidia-', '').upper()
+            if acc == 'H100-80GB':
+                # H100 can be either H100-80GB or H100-MEGA-80GB in GKE
+                # we map H100 ---> H100-80GB and keep H100-MEGA-80GB
+                # to distinguish between a3-high and a3-mega instances
+                return 'H100'
+            return acc
+        elif is_tpu_on_gke(value):
+            return value
         else:
             raise ValueError(
                 f'Invalid accelerator name in GKE cluster: {value}')
 
 
+class GFDLabelFormatter(GPULabelFormatter):
+    """GPU Feature Discovery label formatter
+
+    NVIDIA GPUs nodes are labeled by GPU feature discovery
+    e.g. nvidia.com/gpu.product=NVIDIA-H100-80GB-HBM3
+    https://github.com/NVIDIA/gpu-feature-discovery
+
+    GPU feature discovery is included as part of the
+    NVIDIA GPU Operator:
+    https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/latest/overview.html
+
+    This LabelFormatter can't be used in autoscaling clusters since accelerators
+    may map to multiple labels, so we're not implementing `get_label_value`.
+    """
+
+    LABEL_KEY = 'nvidia.com/gpu.product'
+
+    @classmethod
+    def get_label_key(cls, accelerator: Optional[str] = None) -> str:
+        return cls.LABEL_KEY
+
+    @classmethod
+    def get_label_keys(cls) -> List[str]:
+        return [cls.LABEL_KEY]
+
+    @classmethod
+    def get_label_value(cls, accelerator: str) -> str:
+        """An accelerator can map to many Nvidia GFD labels
+        (e.g., A100-80GB-PCIE vs. A100-SXM4-80GB).
+        As a result, we do not support get_label_value for GFDLabelFormatter."""
+        raise NotImplementedError
+
+    @classmethod
+    def match_label_key(cls, label_key: str) -> bool:
+        return label_key == cls.LABEL_KEY
+
+    @classmethod
+    def get_accelerator_from_label_value(cls, value: str) -> str:
+        """Searches against a canonical list of NVIDIA GPUs and pattern
+        matches the canonical GPU name against the GFD label.
+        """
+        canonical_gpu_names = [
+            'A100-80GB', 'A100', 'A10G', 'H100', 'K80', 'M60', 'T4g', 'T4',
+            'V100', 'A10', 'P4000', 'P100', 'P40', 'P4', 'L40', 'L4'
+        ]
+        for canonical_name in canonical_gpu_names:
+            # A100-80G accelerator is A100-SXM-80GB or A100-PCIE-80GB
+            if canonical_name == 'A100-80GB' and re.search(
+                    r'A100.*-80GB', value):
+                return canonical_name
+            # Use word boundary matching to prevent substring matches
+            elif re.search(rf'\b{re.escape(canonical_name)}\b', value):
+                return canonical_name
+
+        # If we didn't find a canonical name:
+        # 1. remove 'NVIDIA-' (e.g., 'NVIDIA-RTX-A6000' -> 'RTX-A6000')
+        # 2. remove 'GEFORCE-' (e.g., 'NVIDIA-GEFORCE-RTX-3070' -> 'RTX-3070')
+        # 3. remove 'RTX-' (e.g. 'RTX-6000' -> 'RTX6000')
+        # Same logic, but uppercased, as the SkyPilot labeler job found in
+        # sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml
+        return value.upper().replace('NVIDIA-',
                                      '').replace('GEFORCE-',
                                                  '').replace('RTX-', 'RTX')
+
+
 class KarpenterLabelFormatter(SkyPilotLabelFormatter):
     """Karpenter label formatter
     Karpenter uses the label `karpenter.k8s.aws/instance-gpu-name` to identify
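The GFD formatter's reverse mapping can be exercised on its own. A trimmed sketch of the matching logic above, with a shortened GPU list and an illustrative helper name:

import re


def canonicalize(value: str) -> str:
    """Map a GFD nvidia.com/gpu.product value to a canonical GPU name."""
    names = ['A100-80GB', 'A100', 'H100', 'T4', 'V100', 'L4']
    for name in names:
        # A100-80GB appears as A100-SXM4-80GB or A100-PCIE-80GB.
        if name == 'A100-80GB' and re.search(r'A100.*-80GB', value):
            return name
        if re.search(rf'\b{re.escape(name)}\b', value):
            return name
    # Fallback: strip vendor prefixes, as the labeler job does.
    return value.upper().replace('NVIDIA-', '').replace('GEFORCE-',
                                                        '').replace(
                                                            'RTX-', 'RTX')


assert canonicalize('NVIDIA-H100-80GB-HBM3') == 'H100'
assert canonicalize('NVIDIA-A100-SXM4-80GB') == 'A100-80GB'
assert canonicalize('NVIDIA-GEFORCE-RTX-3070') == 'RTX3070'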
@@ -203,8 +504,8 @@ class KarpenterLabelFormatter(SkyPilotLabelFormatter):
 # it will be used to determine the priority of the label formats when
 # auto-detecting the GPU label type.
 LABEL_FORMATTER_REGISTRY = [
-    SkyPilotLabelFormatter, CoreWeaveLabelFormatter, GKELabelFormatter,
-    KarpenterLabelFormatter
+    SkyPilotLabelFormatter, GKELabelFormatter, KarpenterLabelFormatter,
+    GFDLabelFormatter, CoreWeaveLabelFormatter
 ]
 
 # Mapping of autoscaler type to label formatter
@@ -215,7 +516,9 @@ AUTOSCALER_TO_LABEL_FORMATTER = {
 }
 
 
+@annotations.lru_cache(scope='request')
 def detect_gpu_label_formatter(
+    context: Optional[str]
 ) -> Tuple[Optional[GPULabelFormatter], Dict[str, List[Tuple[str, str]]]]:
     """Detects the GPU label formatter for the Kubernetes cluster
 
@@ -226,7 +529,7 @@
     """
     # Get all labels across all nodes
     node_labels: Dict[str, List[Tuple[str, str]]] = {}
-    nodes = get_kubernetes_nodes()
+    nodes = get_kubernetes_nodes(context)
     for node in nodes:
         node_labels[node.metadata.name] = []
         for label, value in node.metadata.labels.items():
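detect_gpu_label_formatter and the other detection helpers are now memoized per server request with annotations.lru_cache(scope='request'), keyed on the context argument, so repeated failover checks do not re-scan the cluster. A rough standard-library analogue of that caching behavior, with SkyPilot's request-scoped expiry approximated by an explicit cache_clear and an illustrative context name:

import functools
from typing import Optional


@functools.lru_cache(maxsize=10)
def detect_for_context(context: Optional[str]) -> str:
    # Stand-in for an expensive Kubernetes API scan; runs once per
    # distinct context until the cache is cleared.
    print(f'scanning cluster for context {context!r}')
    return 'skypilot.co/accelerator'


detect_for_context('gke_my-project_us-central1_my-cluster')  # scans
detect_for_context('gke_my-project_us-central1_my-cluster')  # cached
detect_for_context.cache_clear()  # what request-scope expiry amounts to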
@@ -236,63 +539,72 @@
 
     # Check if the node labels contain any of the GPU label prefixes
     for lf in LABEL_FORMATTER_REGISTRY:
-        label_key = lf.get_label_key()
         for _, label_list in node_labels.items():
             for label, _ in label_list:
-                if label.startswith(label_key):
+                if lf.match_label_key(label):
                     label_formatter = lf()
                     return label_formatter, node_labels
 
     return label_formatter, node_labels
 
 
-def detect_gpu_resource() -> Tuple[bool, Set[str]]:
-    """Checks if the Kubernetes cluster has nvidia.com/gpu resource.
+@annotations.lru_cache(scope='request', maxsize=10)
+def detect_accelerator_resource(
+        context: Optional[str]) -> Tuple[bool, Set[str]]:
+    """Checks if the Kubernetes cluster has GPU/TPU resource.
 
-    If nvidia.com/gpu resource is missing, that typically means that the
-    Kubernetes cluster does not have GPUs or the nvidia GPU operator and/or
-    device drivers are not installed.
+    Two types of accelerator resources are available which are each checked
+    with nvidia.com/gpu and google.com/tpu. If nvidia.com/gpu resource is
+    missing, that typically means that the Kubernetes cluster does not have
+    GPUs or the nvidia GPU operator and/or device drivers are not installed.
 
     Returns:
-        bool: True if the cluster has nvidia.com/gpu resource, False otherwise.
+        bool: True if the cluster has GPU_RESOURCE_KEY or TPU_RESOURCE_KEY
+            resource, False otherwise.
     """
     # Get the set of resources across all nodes
     cluster_resources: Set[str] = set()
-    nodes = get_kubernetes_nodes()
+    nodes = get_kubernetes_nodes(context)
     for node in nodes:
         cluster_resources.update(node.status.allocatable.keys())
-    has_gpu = 'nvidia.com/gpu' in cluster_resources
+    has_accelerator = (get_gpu_resource_key() in cluster_resources or
+                       TPU_RESOURCE_KEY in cluster_resources)
 
-    return has_gpu, cluster_resources
+    return has_accelerator, cluster_resources
 
 
-def get_kubernetes_nodes() -> List[Any]:
-    # TODO(romilb): Calling kube API can take between 10-100ms depending on
-    # the control plane. Consider caching calls to this function (using
-    # kubecontext hash as key).
-    try:
-        nodes = kubernetes.core_api().list_node(
-            _request_timeout=kubernetes.API_TIMEOUT).items
-    except kubernetes.max_retry_error():
-        raise exceptions.ResourcesUnavailableError(
-            'Timed out when trying to get node info from Kubernetes cluster. '
-            'Please check if the cluster is healthy and retry.') from None
+@annotations.lru_cache(scope='request', maxsize=10)
+@_retry_on_error(resource_type='node')
+def get_kubernetes_nodes(context: Optional[str] = None) -> List[Any]:
+    """Gets the kubernetes nodes in the context.
+
+    If context is None, gets the nodes in the current context.
+    """
+    if context is None:
+        context = get_current_kube_config_context_name()
+
+    nodes = kubernetes.core_api(context).list_node(
+        _request_timeout=kubernetes.API_TIMEOUT).items
     return nodes
 
 
-def get_kubernetes_pods() -> List[Any]:
-    try:
-        ns = get_current_kube_config_context_namespace()
-        pods = kubernetes.core_api().list_namespaced_pod(
-            ns, _request_timeout=kubernetes.API_TIMEOUT).items
-    except kubernetes.max_retry_error():
-        raise exceptions.ResourcesUnavailableError(
-            'Timed out when trying to get pod info from Kubernetes cluster. '
-            'Please check if the cluster is healthy and retry.') from None
+@_retry_on_error(resource_type='pod')
+def get_all_pods_in_kubernetes_cluster(
+        context: Optional[str] = None) -> List[Any]:
+    """Gets pods in all namespaces in kubernetes cluster indicated by context.
+
+    Used for computing cluster resource usage.
+    """
+    if context is None:
+        context = get_current_kube_config_context_name()
+
+    pods = kubernetes.core_api(context).list_pod_for_all_namespaces(
+        _request_timeout=kubernetes.API_TIMEOUT).items
     return pods
 
 
-def check_instance_fits(instance: str) -> Tuple[bool, Optional[str]]:
+def check_instance_fits(context: Optional[str],
+                        instance: str) -> Tuple[bool, Optional[str]]:
     """Checks if the instance fits on the Kubernetes cluster.
 
     If the instance has GPU requirements, checks if the GPU type is
@@ -307,6 +619,9 @@ def check_instance_fits(instance: str) -> Tuple[bool, Optional[str]]:
         Optional[str]: Error message if the instance does not fit.
     """
 
+    # TODO(zhwu): this should check the node for specific context, instead
+    # of the default context to make failover fully functional.
+
    def check_cpu_mem_fits(candidate_instance_type: 'KubernetesInstanceType',
                           node_list: List[Any]) -> Tuple[bool, Optional[str]]:
        """Checks if the instance fits on the cluster based on CPU and memory.
@@ -333,15 +648,53 @@
             'Maximum resources found on a single node: '
             f'{max_cpu} CPUs, {common_utils.format_float(max_mem)}G Memory')
 
-    nodes = get_kubernetes_nodes()
+    def check_tpu_fits(candidate_instance_type: 'KubernetesInstanceType',
+                       node_list: List[Any]) -> Tuple[bool, Optional[str]]:
+        """Checks if the instance fits on the cluster based on requested TPU.
+
+        It checks if the TPU type and count on each node match the required
+        number of TPU chips for the instance. In the case of multi-host TPU
+        podslice, the function ensures that the number of TPU chips on a single
+        node (node_tpu_chip_count) and the total TPU chips across the entire
+        podslice (topology_chip_count) are correctly handled.
+        """
+        acc_type = candidate_instance_type.accelerator_type
+        acc_count = candidate_instance_type.accelerator_count
+        tpu_list_in_cluster = []
+        for node in node_list:
+            if acc_type == node.metadata.labels[
+                    GKELabelFormatter.TPU_LABEL_KEY]:
+                # TODO(Doyoung): Update the logic when adding support for
+                # multi-host TPUs.
+                if is_multi_host_tpu(node.metadata.labels):
+                    continue
+                node_tpu_chip_count = int(node.metadata.labels[
+                    GKELabelFormatter.ACCELERATOR_COUNT_LABEL_KEY])
+                tpu_type = f'{acc_type}:{node_tpu_chip_count}'
+                tpu_list_in_cluster.append(tpu_type)
+                if node_tpu_chip_count == acc_count:
+                    return True, None
+        tpu_list_in_cluster_str = ','.join(tpu_list_in_cluster)
+        # TODO(Doyoung): Update the error message raised with the multi-host
+        # TPU support.
+        return False, ('Requested TPU type was not found in the cluster. TPU '
+                       'types found in the cluster: '
+                       f'{tpu_list_in_cluster_str}. Note that multi-host TPU '
+                       'podslices are currently not supported.')
+
+    nodes = get_kubernetes_nodes(context)
     k8s_instance_type = KubernetesInstanceType.\
         from_instance_type(instance)
     acc_type = k8s_instance_type.accelerator_type
+    acc_count = k8s_instance_type.accelerator_count
     if acc_type is not None:
-        # If GPUs are requested, check if GPU type is available, and if so,
-        # check if CPU and memory requirements on the specific node are met.
+        # If GPU/TPUs are requested, check if GPU/TPU type is available, and
+        # if so, check if CPU and memory requirements on the specific node are
+        # met.
+        assert acc_count is not None, (acc_type, acc_count)
         try:
-            gpu_label_key, gpu_label_val = get_gpu_label_key_value(acc_type)
+            gpu_label_key, gpu_label_val, _, _ = (
+                get_accelerator_label_key_value(context, acc_type, acc_count))
         except exceptions.ResourcesUnavailableError as e:
             # If GPU not found, return empty list and error message.
             return False, str(e)
@@ -350,14 +703,26 @@
             node for node in nodes if gpu_label_key in node.metadata.labels and
             node.metadata.labels[gpu_label_key] == gpu_label_val
         ]
-        assert len(gpu_nodes) > 0, 'GPU nodes not found'
+        assert gpu_nodes, 'GPU nodes not found'
+        if is_tpu_on_gke(acc_type):
+            # If requested accelerator is a TPU type, check if the cluster
+            # has sufficient TPU resource to meet the requirement.
+            fits, reason = check_tpu_fits(k8s_instance_type, gpu_nodes)
+            if reason is not None:
+                return fits, reason
+
         candidate_nodes = gpu_nodes
-        not_fit_reason_prefix = (f'GPU nodes with {acc_type} do not have '
-                                 'enough CPU and/or memory. ')
+        not_fit_reason_prefix = (
+            f'GPU nodes with {acc_type} do not have '
+            f'enough CPU (> {k8s_instance_type.cpus} CPUs) and/or '
+            f'memory (> {k8s_instance_type.memory} G). ')
     else:
         candidate_nodes = nodes
-        not_fit_reason_prefix = 'No nodes found with enough CPU and/or memory. '
-        # Check if CPU and memory requirements are met on at least one
+        not_fit_reason_prefix = (f'No nodes found with enough '
+                                 f'CPU (> {k8s_instance_type.cpus} CPUs) '
+                                 'and/or memory '
+                                 f'(> {k8s_instance_type.memory} G). ')
+    # Check if CPU and memory requirements are met on at least one
     # candidate node.
     fits, reason = check_cpu_mem_fits(k8s_instance_type, candidate_nodes)
     if not fits:
@@ -368,23 +733,33 @@
     return fits, reason
 
 
-def get_gpu_label_key_value(acc_type: str, check_mode=False) -> Tuple[str, str]:
-    """Returns the label key and value for the given GPU type.
+def get_accelerator_label_key_value(
+        context: Optional[str],
+        acc_type: str,
+        acc_count: int,
+        check_mode=False
+) -> Tuple[Optional[str], Optional[str], Optional[str], Optional[str]]:
+    """Returns the label key and value for the given GPU/TPU type.
 
     Args:
-        acc_type: The GPU type required by the task.
-        check_mode: If True, only checks if the cluster has GPU resources and
-            labels are setup on the cluster. acc_type is ignore does not return
-            the label key and value. Useful for checking if GPUs are configured
-            correctly on the cluster without explicitly requesting a acc_type.
+        acc_type: The GPU/TPU type required by the task.
+        acc_count: Number of GPU/TPUs required by the task.
+        check_mode: If True, only checks if the cluster has GPU/TPU resources
+            and labels are setup on the cluster. acc_type is ignored and does
+            not return the label key and value. Useful for checking if GPUs
+            are configured correctly on the cluster without explicitly
+            requesting an acc_type.
     Returns:
-        A tuple of the label key and value. Returns empty strings if check_mode
-        is True.
+        A tuple of the accelerator label key, value, topology label key, and
+        topology value. The topology label key and value are populated only if
+        the requested accelerator type is TPU. Returns None if check_mode is
+        True.
     Raises:
         ResourcesUnavailableError: Can be raised from the following conditions:
-            - The cluster does not have GPU resources (nvidia.com/gpu)
-            - The cluster does not have GPU labels setup correctly
-            - The cluster doesn't have any nodes with acc_type GPU
+            - The cluster does not have GPU/TPU resources
+                (nvidia.com/gpu, google.com/tpu)
+            - The cluster does not have GPU/TPU labels setup correctly
+            - The cluster doesn't have any nodes with acc_type GPU/TPU
     """
     # Check if the cluster has GPU resources
     # TODO(romilb): This assumes the accelerator is a nvidia GPU. We
@@ -403,23 +778,33 @@ def get_gpu_label_key_value(acc_type: str, check_mode=False) -> Tuple[str, str]:
         # If check mode is enabled and autoscaler is set, we can return
         # early since we assume the cluster autoscaler will handle GPU
         # node provisioning.
-        return '', ''
+        return None, None, None, None
        formatter = AUTOSCALER_TO_LABEL_FORMATTER.get(autoscaler_type)
        assert formatter is not None, ('Unsupported autoscaler type:'
                                       f' {autoscaler_type}')
-        return formatter.get_label_key(), formatter.get_label_value(acc_type)
-
-    has_gpus, cluster_resources = detect_gpu_resource()
+        tpu_topology_label_key = None
+        tpu_topology_label_value = None
+        if is_tpu_on_gke(acc_type):
+            assert formatter == GKELabelFormatter, formatter
+            tpu_topology_label_key = formatter.get_tpu_topology_label_key()
+            tpu_topology_label_value = formatter.get_tpu_topology_label_value(
+                acc_type, acc_count)
+        return formatter.get_label_key(acc_type), formatter.get_label_value(
+            acc_type), tpu_topology_label_key, tpu_topology_label_value
+
+    has_gpus, cluster_resources = detect_accelerator_resource(context)
     if has_gpus:
         # Check if the cluster has GPU labels setup correctly
         label_formatter, node_labels = \
-            detect_gpu_label_formatter()
+            detect_gpu_label_formatter(context)
         if label_formatter is None:
             # If none of the GPU labels from LABEL_FORMATTER_REGISTRY are
             # detected, raise error
             with ux_utils.print_exception_no_traceback():
-                supported_formats = ', '.join(
-                    [f.get_label_key() for f in LABEL_FORMATTER_REGISTRY])
+                supported_formats = ', '.join([
+                    key for f in LABEL_FORMATTER_REGISTRY
+                    for key in f.get_label_keys()
+                ])
                 suffix = ''
                 if env_options.Options.SHOW_DEBUG_INFO.get():
                     suffix = f' Found node labels: {node_labels}'
@@ -430,12 +815,12 @@
                     f'{supported_formats}. Please refer to '
                     'the documentation on how to set up node labels.'
                     f'{suffix}')
-        if label_formatter is not None:
+        else:
             # Validate the label value on all nodes labels to ensure they are
             # correctly setup and will behave as expected.
             for node_name, label_list in node_labels.items():
                 for label, value in label_list:
-                    if label == label_formatter.get_label_key():
+                    if label_formatter.match_label_key(label):
                         is_valid, reason = label_formatter.validate_label_value(
                             value)
                         if not is_valid:
@@ -445,9 +830,7 @@
             if check_mode:
                 # If check mode is enabled and we reached so far, we can
                 # conclude that the cluster is setup correctly and return.
-                return '', ''
-            k8s_acc_label_key = label_formatter.get_label_key()
-            k8s_acc_label_value = label_formatter.get_label_value(acc_type)
+                return None, None, None, None
             # Search in node_labels to see if any node has the requested
             # GPU type.
             # Note - this only checks if the label is available on a
@@ -455,12 +838,43 @@
             # quantity is available since that is dynamic and can change
             # during scheduling.
             for node_name, label_list in node_labels.items():
+                node_metadata_labels = dict(label_list)
+                # TODO(Doyoung): Update the logic when adding support for
+                # multi-host TPUs.
+                if is_multi_host_tpu(node_metadata_labels):
+                    continue
                 for label, value in label_list:
-                    if (label == k8s_acc_label_key and
-                            value == k8s_acc_label_value):
-                        # If a node is found, we can break out of the loop
-                        # and proceed to deploy.
-                        return k8s_acc_label_key, k8s_acc_label_value
+                    if (label_formatter.match_label_key(label) and
+                            label_formatter.get_accelerator_from_label_value(
+                                value) == acc_type):
+                        if is_tpu_on_gke(acc_type):
+                            assert isinstance(label_formatter,
+                                              GKELabelFormatter)
+                            if node_metadata_labels.get(
+                                    label_formatter.TPU_LABEL_KEY) == acc_type:
+                                topology_label_key = (
+                                    label_formatter.get_tpu_topology_label_key(
+                                    ))
+                                # Instead of using get_tpu_topology_label_value,
+                                # we use the node's label value to determine the
+                                # topology. This is to make sure the node's
+                                # available topology matches our request.
+                                topology_value = node_metadata_labels.get(
+                                    topology_label_key)
+                                assert topology_value is not None
+                                tpu_topology_chip_count = reduce_tpu_topology(
+                                    topology_value)
+                                # For single-host TPUs, there aren't multiple
+                                # different topologies that map to an identical
+                                # number of TPU chips.
+                                if tpu_topology_chip_count == acc_count:
+                                    return (label, value, topology_label_key,
+                                            topology_value)
+                                else:
+                                    continue
+                        else:
+                            return label, value, None, None
+
             # If no node is found with the requested acc_type, raise error
             with ux_utils.print_exception_no_traceback():
                 suffix = ''
@@ -468,15 +882,19 @@ def get_gpu_label_key_value(acc_type: str, check_mode=False) -> Tuple[str, str]:
468
882
  all_labels = []
469
883
  for node_name, label_list in node_labels.items():
470
884
  all_labels.extend(label_list)
471
- gpus_available = set(
472
- v for k, v in all_labels if k == k8s_acc_label_key)
473
- suffix = f' Available GPUs on the cluster: {gpus_available}'
885
+ acc_available = set(v for k, v in all_labels
886
+ if label_formatter.match_label_key(k))
887
+ suffix = (' Available GPU/TPUs on the cluster: '
888
+ f'{acc_available}')
889
+ # TODO(Doyoung): Update the error message raised with the
890
+ # multi-host TPU support.
474
891
  raise exceptions.ResourcesUnavailableError(
475
892
  'Could not find any node in the Kubernetes cluster '
476
- f'with {acc_type} GPU. Please ensure at least '
477
- f'one node in the cluster has {acc_type} GPU and node '
478
- 'labels are setup correctly. '
479
- f'Please refer to the documentation for more. {suffix}')
893
+ f'with {acc_type}. Please ensure at least one node in the '
894
+ f'cluster has {acc_type} and node labels are setup '
895
+ 'correctly. Please refer to the documentration for more. '
896
+ f'{suffix}. Note that multi-host TPU podslices are '
897
+ 'currently not unsupported.')
480
898
  else:
481
899
  # If GPU resources are not detected, raise error
482
900
  with ux_utils.print_exception_no_traceback():
@@ -485,55 +903,62 @@ def get_gpu_label_key_value(acc_type: str, check_mode=False) -> Tuple[str, str]:
485
903
  suffix = (' Available resources on the cluster: '
486
904
  f'{cluster_resources}')
487
905
  raise exceptions.ResourcesUnavailableError(
488
- 'Could not detect GPU resources (`nvidia.com/gpu`) in '
489
- 'Kubernetes cluster. If this cluster contains GPUs, please '
490
- 'ensure GPU drivers are installed on the node. Check if the '
491
- 'GPUs are setup correctly by running `kubectl describe nodes` '
492
- 'and looking for the nvidia.com/gpu resource. '
493
- 'Please refer to the documentation on how '
494
- f'to set up GPUs.{suffix}')
495
-
496
-
497
- def get_head_ssh_port(cluster_name: str, namespace: str) -> int:
906
+ f'Could not detect GPU/TPU resources ({GPU_RESOURCE_KEY!r} or '
907
+ f'{TPU_RESOURCE_KEY!r}) in Kubernetes cluster. If this cluster'
908
+ ' contains GPUs, please ensure GPU drivers are installed on '
909
+ 'the node. Check if the GPUs are setup correctly by running '
910
+ '`kubectl describe nodes` and looking for the '
911
+ f'{GPU_RESOURCE_KEY!r} or {TPU_RESOURCE_KEY!r} resource. '
912
+ 'Please refer to the documentation on how to set up GPUs.'
913
+ f'{suffix}')
914
+ assert False, 'This should not be reached'
915
+
916
+
917
+ def get_head_ssh_port(cluster_name: str, namespace: str,
918
+ context: Optional[str]) -> int:
498
919
  svc_name = f'{cluster_name}-head-ssh'
499
- return get_port(svc_name, namespace)
920
+ return get_port(svc_name, namespace, context)
500
921
 
501
922
 
502
- def get_port(svc_name: str, namespace: str) -> int:
923
+ def get_port(svc_name: str, namespace: str, context: Optional[str]) -> int:
503
924
  """Gets the nodeport of the specified service.
504
925
 
505
926
  Args:
506
927
  svc_name (str): Name of the kubernetes service. Note that this may be
507
928
  different from the cluster name.
508
929
  namespace (str): Kubernetes namespace to look for the service in.
930
+ context (str): Kubernetes context to use.
509
931
  """
510
- head_service = kubernetes.core_api().read_namespaced_service(
932
+ head_service = kubernetes.core_api(context).read_namespaced_service(
511
933
  svc_name, namespace)
512
934
  return head_service.spec.ports[0].node_port
513
935
 
514
936
 
515
- def get_external_ip(
516
- network_mode: Optional[kubernetes_enums.KubernetesNetworkingMode]):
937
+ def get_external_ip(network_mode: Optional[
938
+ kubernetes_enums.KubernetesNetworkingMode], context: Optional[str]) -> str:
517
939
  if network_mode == kubernetes_enums.KubernetesNetworkingMode.PORTFORWARD:
518
940
  return '127.0.0.1'
519
941
  # Return the IP address of the first node with an external IP
520
- nodes = kubernetes.core_api().list_node().items
942
+ nodes = kubernetes.core_api(context).list_node().items
521
943
  for node in nodes:
522
944
  if node.status.addresses:
523
945
  for address in node.status.addresses:
524
946
  if address.type == 'ExternalIP':
525
947
  return address.address
526
948
  # If no external IP is found, use the API server IP
527
- api_host = kubernetes.core_api().api_client.configuration.host
949
+ api_host = kubernetes.core_api(context).api_client.configuration.host
528
950
  parsed_url = urlparse(api_host)
529
951
  return parsed_url.hostname
530
952
 
531
953
 
532
- def check_credentials(timeout: int = kubernetes.API_TIMEOUT) -> \
954
+ def check_credentials(context: Optional[str],
955
+ timeout: int = kubernetes.API_TIMEOUT) -> \
533
956
  Tuple[bool, Optional[str]]:
534
957
  """Check if the credentials in kubeconfig file are valid
535
958
 
536
959
  Args:
960
+ context (Optional[str]): The Kubernetes context to use. If none, uses
961
+ in-cluster auth to check credentials, if available.
537
962
  timeout (int): Timeout in seconds for the test API call
538
963
 
539
964
  Returns:
@@ -541,8 +966,9 @@ def check_credentials(timeout: int = kubernetes.API_TIMEOUT) -> \
541
966
  str: Error message if credentials are invalid, None otherwise
542
967
  """
543
968
  try:
544
- ns = get_current_kube_config_context_namespace()
545
- kubernetes.core_api().list_namespaced_pod(ns, _request_timeout=timeout)
969
+ namespace = get_kube_config_context_namespace(context)
970
+ kubernetes.core_api(context).list_namespaced_pod(
971
+ namespace, _request_timeout=timeout)
546
972
  except ImportError:
547
973
  # TODO(romilb): Update these error strs to also include link to docs
548
974
  # when docs are ready.
@@ -571,7 +997,7 @@ def check_credentials(timeout: int = kubernetes.API_TIMEOUT) -> \
571
997
  # We now do softer checks to check if exec based auth is used and to
572
998
  # see if the cluster is GPU-enabled.
573
999
 
574
- _, exec_msg = is_kubeconfig_exec_auth()
1000
+ _, exec_msg = is_kubeconfig_exec_auth(context)
575
1001
 
576
1002
  # We now check if GPUs are available and labels are set correctly on the
577
1003
  # cluster, and if not we return hints that may help debug any issues.
@@ -580,7 +1006,10 @@ def check_credentials(timeout: int = kubernetes.API_TIMEOUT) -> \
580
1006
  # provider if their cluster GPUs are not setup correctly.
581
1007
  gpu_msg = ''
582
1008
  try:
583
- _, _ = get_gpu_label_key_value(acc_type='', check_mode=True)
1009
+ get_accelerator_label_key_value(context,
1010
+ acc_type='',
1011
+ acc_count=0,
1012
+ check_mode=True)
584
1013
  except exceptions.ResourcesUnavailableError as e:
585
1014
  # If GPUs are not available, we return cluster as enabled (since it can
586
1015
  # be a CPU-only cluster) but we also return the exception message which
@@ -596,7 +1025,54 @@ def check_credentials(timeout: int = kubernetes.API_TIMEOUT) -> \
596
1025
  return True, None
597
1026
 
598
1027
 
599
- def is_kubeconfig_exec_auth() -> Tuple[bool, Optional[str]]:
1028
+ def check_pod_config(pod_config: dict) \
1029
+ -> Tuple[bool, Optional[str]]:
1030
+ """Check if the pod_config is a valid pod config
1031
+
1032
+ Uses the kubernetes deserialize API to check whether pod_config is valid.
1033
+
1034
+ Returns:
1035
+ bool: True if pod_config is valid.
1036
+ str: Error message about why the pod_config is invalid, None otherwise.
1037
+ """
1038
+ errors = []
1039
+ # This api_client won't be used to send any requests, so there is no need to
1040
+ # load kubeconfig
1041
+ api_client = kubernetes.kubernetes.client.ApiClient()
1042
+
1043
+ # Wrapper for the kubernetes api_client deserialize function, which reads
1044
+ # the response's `data` attribute. For details, see:
1045
+ # https://github.com/kubernetes-client/python/blob/master/kubernetes/client/api_client.py#L244
1046
+ class InnerResponse():
1047
+
1048
+ def __init__(self, data: dict):
1049
+ self.data = json.dumps(data)
1050
+
1051
+ try:
1052
+ # Validate metadata if present
1053
+ if 'metadata' in pod_config:
1054
+ try:
1055
+ value = InnerResponse(pod_config['metadata'])
1056
+ api_client.deserialize(
1057
+ value, kubernetes.kubernetes.client.V1ObjectMeta)
1058
+ except ValueError as e:
1059
+ errors.append(f'Invalid metadata: {str(e)}')
1060
+ # Validate spec if present
1061
+ if 'spec' in pod_config:
1062
+ try:
1063
+ value = InnerResponse(pod_config['spec'])
1064
+ api_client.deserialize(value,
1065
+ kubernetes.kubernetes.client.V1PodSpec)
1066
+ except ValueError as e:
1067
+ errors.append(f'Invalid spec: {str(e)}')
1068
+ return len(errors) == 0, '.'.join(errors) if errors else None
1069
+ except Exception as e: # pylint: disable=broad-except
1070
+ errors.append(f'Validation error: {str(e)}')
1071
+ return False, '.'.join(errors)
1072
+
1073
+
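To make the validation contract above concrete, here is a minimal usage sketch. It is illustrative only: it assumes this module is importable as `kubernetes_utils`, and the pod_config contents are hypothetical.

    # Hypothetical usage sketch for check_pod_config.
    pod_config = {
        'metadata': {'labels': {'team': 'ml'}},
        'spec': {'containers': [{'name': 'main', 'image': 'ubuntu:22.04'}]},
    }
    valid, err = kubernetes_utils.check_pod_config(pod_config)
    if not valid:
        print(f'Invalid pod_config: {err}')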
1074
+ def is_kubeconfig_exec_auth(
1075
+ context: Optional[str] = None) -> Tuple[bool, Optional[str]]:
600
1076
  """Checks if the kubeconfig file uses exec-based authentication
601
1077
 
602
1078
  Exec-based auth is commonly used for authenticating with cloud hosted
@@ -623,6 +1099,9 @@ def is_kubeconfig_exec_auth() -> Tuple[bool, Optional[str]]:
623
1099
  str: Error message if exec-based authentication is used, None otherwise
624
1100
  """
625
1101
  k8s = kubernetes.kubernetes
1102
+ if context == kubernetes.in_cluster_context_name():
1103
+ # If in-cluster config is used, exec-based auth is not used.
1104
+ return False, None
626
1105
  try:
627
1106
  k8s.config.load_kube_config()
628
1107
  except kubernetes.config_exception():
@@ -630,8 +1109,16 @@ def is_kubeconfig_exec_auth() -> Tuple[bool, Optional[str]]:
630
1109
  return False, None
631
1110
 
632
1111
  # Get active context and user from kubeconfig using k8s api
633
- _, current_context = k8s.config.list_kube_config_contexts()
634
- target_username = current_context['context']['user']
1112
+ all_contexts, current_context = k8s.config.list_kube_config_contexts()
1113
+ context_obj = current_context
1114
+ if context is not None:
1115
+ for c in all_contexts:
1116
+ if c['name'] == context:
1117
+ context_obj = c
1118
+ break
1119
+ else:
1120
+ raise ValueError(f'Kubernetes context {context!r} not found.')
1121
+ target_username = context_obj['context']['user']
635
1122
 
636
1123
  # K8s api does not provide a mechanism to get the user details from the
637
1124
  # context. We need to load the kubeconfig file and parse it to get the
@@ -654,7 +1141,7 @@ def is_kubeconfig_exec_auth() -> Tuple[bool, Optional[str]]:
654
1141
  schemas.get_default_remote_identity('kubernetes'))
655
1142
  if ('exec' in user_details.get('user', {}) and remote_identity
656
1143
  == schemas.RemoteIdentityOptions.LOCAL_CREDENTIALS.value):
657
- ctx_name = current_context['name']
1144
+ ctx_name = context_obj['name']
658
1145
  exec_msg = ('exec-based authentication is used for '
659
1146
  f'Kubernetes context {ctx_name!r}.'
660
1147
  ' This may cause issues with autodown or when running '
@@ -664,12 +1151,13 @@ def is_kubeconfig_exec_auth() -> Tuple[bool, Optional[str]]:
664
1151
  '~/.sky/config.yaml:\n'
665
1152
  ' kubernetes:\n'
666
1153
  ' remote_identity: SERVICE_ACCOUNT\n'
667
- ' More: https://skypilot.readthedocs.io/en/latest/'
1154
+ ' More: https://docs.skypilot.co/en/latest/'
668
1155
  'reference/config.html')
669
1156
  return True, exec_msg
670
1157
  return False, None
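A short, hedged sketch of how a caller might surface the warning produced above; the context name is hypothetical and `kubernetes_utils` is an assumed import alias for this module.

    # Hypothetical sketch: warn early if a context relies on exec-based auth.
    uses_exec, msg = kubernetes_utils.is_kubeconfig_exec_auth(
        context='gke_my-project_us-central1_my-cluster')
    if uses_exec:
        logger.warning(msg)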
671
1158
 
672
1159
 
1160
+ @annotations.lru_cache(scope='request')
673
1161
  def get_current_kube_config_context_name() -> Optional[str]:
674
1162
  """Get the current kubernetes context from the kubeconfig file
675
1163
 
@@ -684,18 +1172,90 @@ def get_current_kube_config_context_name() -> Optional[str]:
684
1172
  return None
685
1173
 
686
1174
 
687
- def get_current_kube_config_context_namespace() -> str:
1175
+ def is_incluster_config_available() -> bool:
1176
+ """Check if in-cluster auth is available.
1177
+
1178
+ Note: We cannot use load_incluster_config() to check if in-cluster config
1179
+ is available because it will load the in-cluster config (if available)
1180
+ and modify the current global kubernetes config. We simply check if the
1181
+ service account token file exists to determine if in-cluster config may
1182
+ be available.
1183
+ """
1184
+ return os.path.exists('/var/run/secrets/kubernetes.io/serviceaccount/token')
1185
+
1186
+
1187
+ def get_all_kube_context_names() -> List[str]:
1188
+ """Get all kubernetes context names available in the environment.
1189
+
1190
+ Fetches context names from the kubeconfig file and in-cluster auth, if any.
1191
+
1192
+ If running in-cluster and IN_CLUSTER_CONTEXT_NAME_ENV_VAR is not set,
1193
+ returns the default in-cluster kubernetes context name.
1194
+
1195
+ We should not cache the result of this function as the admin policy may
1196
+ update the contexts.
1197
+
1198
+ Returns:
1199
+ List[str]: The list of kubernetes context names if
1200
+ available, an empty list otherwise.
1201
+ """
1202
+ k8s = kubernetes.kubernetes
1203
+ context_names = []
1204
+ try:
1205
+ all_contexts, _ = k8s.config.list_kube_config_contexts()
1206
+ # all_contexts will always have at least one context. If kubeconfig
1207
+ # does not have any contexts defined, it will raise ConfigException.
1208
+ context_names = [context['name'] for context in all_contexts]
1209
+ except k8s.config.config_exception.ConfigException:
1210
+ # If no config found, continue
1211
+ pass
1212
+ if is_incluster_config_available():
1213
+ context_names.append(kubernetes.in_cluster_context_name())
1214
+ return context_names
1215
+
1216
+
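A hedged sketch combining this helper with check_credentials above to validate every reachable context (the `kubernetes_utils` import alias is an assumption):

    # Hypothetical sketch: validate credentials for every available context.
    for ctx in kubernetes_utils.get_all_kube_context_names():
        ok, reason = kubernetes_utils.check_credentials(ctx, timeout=5)
        print(f'{ctx}: {"ok" if ok else reason}')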
1217
+ @annotations.lru_cache(scope='request')
1218
+ def get_kube_config_context_namespace(
1219
+ context_name: Optional[str] = None) -> str:
688
1220
  """Get the current kubernetes context namespace from the kubeconfig file
689
1221
 
690
1222
  Returns:
691
- str | None: The current kubernetes context namespace if it exists, else
1223
+ str: The current kubernetes context namespace if it exists, else
692
1224
  the default namespace.
693
1225
  """
694
1226
  k8s = kubernetes.kubernetes
1227
+ ns_path = '/var/run/secrets/kubernetes.io/serviceaccount/namespace'
1228
+ # If using in-cluster context, first check for the environment variable,
1229
+ # then fall back to the service account namespace file. Uses the same logic
1230
+ # as adaptors.kubernetes._load_config() to stay consistent with in-cluster
1231
+ # config loading.
1232
+ if (context_name == kubernetes.in_cluster_context_name() or
1233
+ context_name is None):
1234
+ # First check for environment variable. We allow the env var to take
1235
+ # effect only when using in-cluster auth because the recommended way to
1236
+ # set the namespace when using kubeconfig is to change the namespace
1237
+ # configured in the context.
1238
+ env_namespace = os.getenv(
1239
+ kubernetes_constants.KUBERNETES_IN_CLUSTER_NAMESPACE_ENV_VAR)
1240
+ if env_namespace:
1241
+ return env_namespace
1242
+ # Fall back to service account namespace file
1243
+ if os.path.exists(ns_path):
1244
+ with open(ns_path, encoding='utf-8') as f:
1245
+ return f.read().strip()
1246
+ # If not in-cluster, get the namespace from kubeconfig
695
1247
  try:
696
- _, current_context = k8s.config.list_kube_config_contexts()
697
- if 'namespace' in current_context['context']:
698
- return current_context['context']['namespace']
1248
+ contexts, current_context = k8s.config.list_kube_config_contexts()
1249
+ if context_name is None:
1250
+ context = current_context
1251
+ else:
1252
+ context = next((c for c in contexts if c['name'] == context_name),
1253
+ None)
1254
+ if context is None:
1255
+ return DEFAULT_NAMESPACE
1256
+
1257
+ if 'namespace' in context['context']:
1258
+ return context['context']['namespace']
699
1259
  else:
700
1260
  return DEFAULT_NAMESPACE
701
1261
  except k8s.config.config_exception.ConfigException:
@@ -742,13 +1302,13 @@ class KubernetesInstanceType:
742
1302
  - Accelerators
743
1303
  The name format is "{n}CPU--{k}GB" where n is the number of vCPUs and
744
1304
  k is the amount of memory in GB. Accelerators can be specified by
745
- appending "--{a}{type}" where a is the number of accelerators and
746
- type is the accelerator type.
1305
+ appending "--{type}:{a}" where type is the accelerator type and a
1306
+ is the number of accelerators.
747
1307
  CPU and memory can be specified as floats. Accelerator count must be int.
748
1308
  Examples:
749
1309
  - 4CPU--16GB
750
1310
  - 0.5CPU--1.5GB
751
- - 4CPU--16GB--1V100
1311
+ - 4CPU--16GB--V100:1
752
1312
  """
753
1313
 
754
1314
  def __init__(self,
@@ -769,13 +1329,18 @@ class KubernetesInstanceType:
769
1329
  name = (f'{common_utils.format_float(self.cpus)}CPU--'
770
1330
  f'{common_utils.format_float(self.memory)}GB')
771
1331
  if self.accelerator_count:
772
- name += f'--{self.accelerator_count}{self.accelerator_type}'
1332
+ # Replace spaces with underscores in accelerator type to make it a
1333
+ # valid logical instance type name.
1334
+ assert self.accelerator_type is not None, self.accelerator_count
1335
+ acc_name = self.accelerator_type.replace(' ', '_')
1336
+ name += f'--{acc_name}:{self.accelerator_count}'
773
1337
  return name
774
1338
 
775
1339
  @staticmethod
776
1340
  def is_valid_instance_type(name: str) -> bool:
777
1341
  """Returns whether the given name is a valid instance type."""
778
- pattern = re.compile(r'^(\d+(\.\d+)?CPU--\d+(\.\d+)?GB)(--\d+\S+)?$')
1342
+ pattern = re.compile(
1343
+ r'^(\d+(\.\d+)?CPU--\d+(\.\d+)?GB)(--[\w\d-]+:\d+)?$')
779
1344
  return bool(pattern.match(name))
780
1345
 
781
1346
  @classmethod
@@ -790,7 +1355,7 @@ class KubernetesInstanceType:
790
1355
  accelerator_type | str: Type of accelerator
791
1356
  """
792
1357
  pattern = re.compile(
793
- r'^(?P<cpus>\d+(\.\d+)?)CPU--(?P<memory>\d+(\.\d+)?)GB(?:--(?P<accelerator_count>\d+)(?P<accelerator_type>\S+))?$' # pylint: disable=line-too-long
1358
+ r'^(?P<cpus>\d+(\.\d+)?)CPU--(?P<memory>\d+(\.\d+)?)GB(?:--(?P<accelerator_type>[\w\d-]+):(?P<accelerator_count>\d+))?$' # pylint: disable=line-too-long
794
1359
  )
795
1360
  match = pattern.match(name)
796
1361
  if match:
@@ -800,7 +1365,9 @@ class KubernetesInstanceType:
800
1365
  accelerator_type = match.group('accelerator_type')
801
1366
  if accelerator_count:
802
1367
  accelerator_count = int(accelerator_count)
803
- accelerator_type = str(accelerator_type)
1368
+ # This is to revert the accelerator types with spaces back to
1369
+ # the original format.
1370
+ accelerator_type = str(accelerator_type).replace('_', ' ')
804
1371
  else:
805
1372
  accelerator_count = None
806
1373
  accelerator_type = None
@@ -834,7 +1401,7 @@ class KubernetesInstanceType:
834
1401
  # Round up accelerator_count if it is not an int.
835
1402
  accelerator_count = math.ceil(accelerator_count)
836
1403
  if accelerator_count > 0:
837
- name += f'--{accelerator_count}{accelerator_type}'
1404
+ name += f'--{accelerator_type}:{accelerator_count}'
838
1405
  return cls(cpus=cpus,
839
1406
  memory=memory,
840
1407
  accelerator_count=accelerator_count,
@@ -844,30 +1411,49 @@ class KubernetesInstanceType:
844
1411
  return self.name
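A small sketch of the name format described in the class docstring; only is_valid_instance_type from above is exercised, and the accelerator names are hypothetical.

    # The logical instance type format is '{n}CPU--{k}GB[--{type}:{count}]'.
    assert KubernetesInstanceType.is_valid_instance_type('4CPU--16GB--V100:1')
    assert KubernetesInstanceType.is_valid_instance_type('0.5CPU--1.5GB')
    # Accelerator types containing spaces are encoded with underscores, so a
    # hypothetical 'TPU V4' would round-trip as '8CPU--32GB--TPU_V4:8'.
    assert KubernetesInstanceType.is_valid_instance_type('8CPU--32GB--TPU_V4:8')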
845
1412
 
846
1413
 
847
- def construct_ssh_jump_command(private_key_path: str,
848
- ssh_jump_ip: str,
849
- ssh_jump_port: Optional[int] = None,
850
- proxy_cmd_path: Optional[str] = None) -> str:
1414
+ def construct_ssh_jump_command(
1415
+ private_key_path: str,
1416
+ ssh_jump_ip: str,
1417
+ ssh_jump_port: Optional[int] = None,
1418
+ ssh_jump_user: str = 'sky',
1419
+ proxy_cmd_path: Optional[str] = None,
1420
+ proxy_cmd_target_pod: Optional[str] = None,
1421
+ current_kube_context: Optional[str] = None,
1422
+ current_kube_namespace: Optional[str] = None) -> str:
851
1423
  ssh_jump_proxy_command = (f'ssh -tt -i {private_key_path} '
852
1424
  '-o StrictHostKeyChecking=no '
853
1425
  '-o UserKnownHostsFile=/dev/null '
854
1426
  f'-o IdentitiesOnly=yes '
855
- f'-W %h:%p sky@{ssh_jump_ip}')
1427
+ r'-W \[%h\]:%p '
1428
+ f'{ssh_jump_user}@{ssh_jump_ip}')
856
1429
  if ssh_jump_port is not None:
857
1430
  ssh_jump_proxy_command += f' -p {ssh_jump_port} '
858
1431
  if proxy_cmd_path is not None:
859
1432
  proxy_cmd_path = os.path.expanduser(proxy_cmd_path)
860
1433
  # adding execution permission to the proxy command script
861
1434
  os.chmod(proxy_cmd_path, os.stat(proxy_cmd_path).st_mode | 0o111)
862
- ssh_jump_proxy_command += f' -o ProxyCommand=\'{proxy_cmd_path}\' '
1435
+ kube_context_flag = f'-c {current_kube_context} ' if (
1436
+ current_kube_context is not None) else ''
1437
+ kube_namespace_flag = f'-n {current_kube_namespace} ' if (
1438
+ current_kube_namespace is not None) else ''
1439
+ ssh_jump_proxy_command += (f' -o ProxyCommand=\'{proxy_cmd_path} '
1440
+ f'{kube_context_flag}'
1441
+ f'{kube_namespace_flag}'
1442
+ f'{proxy_cmd_target_pod}\'')
863
1443
  return ssh_jump_proxy_command
864
1444
 
865
1445
 
866
1446
  def get_ssh_proxy_command(
867
- private_key_path: str, ssh_jump_name: str,
868
- network_mode: kubernetes_enums.KubernetesNetworkingMode, namespace: str,
869
- port_fwd_proxy_cmd_path: str, port_fwd_proxy_cmd_template: str) -> str:
870
- """Generates the SSH proxy command to connect through the SSH jump pod.
1447
+ k8s_ssh_target: str,
1448
+ network_mode: kubernetes_enums.KubernetesNetworkingMode,
1449
+ private_key_path: str,
1450
+ context: Optional[str],
1451
+ namespace: str,
1452
+ ) -> str:
1453
+ """Generates the SSH proxy command to connect to the pod.
1454
+
1455
+ Uses a jump pod if the network mode is NODEPORT, and direct port-forwarding
1456
+ if the network mode is PORTFORWARD.
871
1457
 
872
1458
  By default, establishing an SSH connection creates a communication
873
1459
  channel to a remote node by setting up a TCP connection. When a
@@ -883,58 +1469,87 @@ def get_ssh_proxy_command(
883
1469
 
884
1470
  With the NodePort networking mode, a NodePort service is launched. This
885
1471
  service opens an external port on the node which redirects to the desired
886
- port within the pod. When establishing an SSH session in this mode, the
1472
+ port to an SSH jump pod. When establishing an SSH session in this mode, the
887
1473
  ProxyCommand makes use of this external port to create a communication
888
1474
  channel directly to port 22, which is the default port ssh server listens
889
1475
  on, of the jump pod.
890
1476
 
891
1477
  With Port-forward mode, instead of directly exposing an external port,
892
1478
  'kubectl port-forward' sets up a tunnel between a local port
893
- (127.0.0.1:23100) and port 22 of the jump pod. Then we establish a TCP
1479
+ (127.0.0.1:23100) and port 22 of the provisioned pod. Then we establish a TCP
894
1480
  connection to the local end of this tunnel, 127.0.0.1:23100, using 'socat'.
895
- This is setup in the inner ProxyCommand of the nested ProxyCommand, and the
896
- rest is the same as NodePort approach, which the outer ProxyCommand
897
- establishes a communication channel between 127.0.0.1:23100 and port 22 on
898
- the jump pod. Consequently, any stdin provided on the local machine is
899
- forwarded through this tunnel to the application (SSH server) listening in
900
- the pod. Similarly, any output from the application in the pod is tunneled
901
- back and displayed in the terminal on the local machine.
1481
+ All of this is done in a ProxyCommand script. Any stdin provided on the
1482
+ local machine is forwarded through this tunnel to the application
1483
+ (SSH server) listening in the pod. Similarly, any output from the
1484
+ application in the pod is tunneled back and displayed in the terminal on
1485
+ the local machine.
902
1486
 
903
1487
  Args:
904
- private_key_path: str; Path to the private key to use for SSH.
905
- This key must be authorized to access the SSH jump pod.
906
- ssh_jump_name: str; Name of the SSH jump service to use
1488
+ k8s_ssh_target: str; The Kubernetes object that will be used as the
1489
+ target for SSH. If network_mode is NODEPORT, this is the name of the
1490
+ service. If network_mode is PORTFORWARD, this is the pod name.
907
1491
  network_mode: KubernetesNetworkingMode; networking mode for ssh
908
1492
  session. It is either 'NODEPORT' or 'PORTFORWARD'
909
- namespace: Kubernetes namespace to use
910
- port_fwd_proxy_cmd_path: str; path to the script used as Proxycommand
911
- with 'kubectl port-forward'
912
- port_fwd_proxy_cmd_template: str; template used to create
913
- 'kubectl port-forward' Proxycommand
1493
+ private_key_path: str; Path to the private key to use for SSH.
1494
+ This key must be authorized to access the SSH jump pod.
1495
+ Required for NODEPORT networking mode.
1496
+ namespace: Kubernetes namespace to use.
1497
+ Required for NODEPORT networking mode.
914
1498
  """
915
1499
  # Fetch IP to connect to for the jump svc
916
- ssh_jump_ip = get_external_ip(network_mode)
1500
+ ssh_jump_ip = get_external_ip(network_mode, context)
1501
+ assert private_key_path is not None, 'Private key path must be provided'
917
1502
  if network_mode == kubernetes_enums.KubernetesNetworkingMode.NODEPORT:
918
- ssh_jump_port = get_port(ssh_jump_name, namespace)
1503
+ assert namespace is not None, 'Namespace must be provided for NodePort'
1504
+ ssh_jump_port = get_port(k8s_ssh_target, namespace, context)
919
1505
  ssh_jump_proxy_command = construct_ssh_jump_command(
920
1506
  private_key_path, ssh_jump_ip, ssh_jump_port=ssh_jump_port)
921
- # Setting kubectl port-forward/socat to establish ssh session using
922
- # ClusterIP service to disallow any ports opened
923
1507
  else:
924
- vars_to_fill = {
925
- 'ssh_jump_name': ssh_jump_name,
926
- }
927
- common_utils.fill_template(port_fwd_proxy_cmd_template,
928
- vars_to_fill,
929
- output_path=port_fwd_proxy_cmd_path)
1508
+ ssh_jump_proxy_command_path = create_proxy_command_script()
930
1509
  ssh_jump_proxy_command = construct_ssh_jump_command(
931
1510
  private_key_path,
932
1511
  ssh_jump_ip,
933
- proxy_cmd_path=port_fwd_proxy_cmd_path)
1512
+ ssh_jump_user=constants.SKY_SSH_USER_PLACEHOLDER,
1513
+ proxy_cmd_path=ssh_jump_proxy_command_path,
1514
+ proxy_cmd_target_pod=k8s_ssh_target,
1515
+ # We embed both the current context and namespace to the SSH proxy
1516
+ # command to make sure SSH still works when the current
1517
+ # context/namespace is changed by the user.
1518
+ current_kube_context=context,
1519
+ current_kube_namespace=namespace)
934
1520
  return ssh_jump_proxy_command
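A hedged sketch of invoking get_ssh_proxy_command in PORTFORWARD mode. All names are hypothetical; note the returned string may still contain the SSH user placeholder, which is substituted elsewhere before use.

    # Hypothetical sketch: generate the ProxyCommand for a provisioned pod.
    proxy_cmd = get_ssh_proxy_command(
        k8s_ssh_target='my-cluster-head',  # pod name in PORTFORWARD mode
        network_mode=kubernetes_enums.KubernetesNetworkingMode.PORTFORWARD,
        private_key_path='~/.ssh/sky-key',
        context='my-context',
        namespace='default')
    # The backend later substitutes the SSH user placeholder and writes this
    # string into the generated SSH config as the ProxyCommand option.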
935
1521
 
936
1522
 
1523
+ def create_proxy_command_script() -> str:
1524
+ """Creates a ProxyCommand script that uses kubectl port-forward to setup
1525
+ a tunnel between a local port and the SSH server in the pod.
1526
+
1527
+ Returns:
1528
+ str: Path to the ProxyCommand script.
1529
+ """
1530
+ port_fwd_proxy_cmd_path = os.path.expanduser(PORT_FORWARD_PROXY_CMD_PATH)
1531
+ os.makedirs(os.path.dirname(port_fwd_proxy_cmd_path),
1532
+ exist_ok=True,
1533
+ mode=0o700)
1534
+
1535
+ root_dir = os.path.dirname(os.path.dirname(os.path.dirname(__file__)))
1536
+ template_path = os.path.join(root_dir, 'templates',
1537
+ PORT_FORWARD_PROXY_CMD_TEMPLATE)
1538
+ # Copy the template to the proxy command path. We create a copy to allow
1539
+ # different users sharing the same SkyPilot installation to have their own
1540
+ # proxy command scripts.
1541
+ shutil.copy(template_path, port_fwd_proxy_cmd_path)
1542
+ # Set the permissions to 700 to ensure only the owner can read, write,
1543
+ # and execute the file.
1544
+ os.chmod(port_fwd_proxy_cmd_path, 0o700)
1545
+ # Return the path to the proxy command script without expanding the user
1546
+ # home directory to be compatible when a SSH is called from a client in
1547
+ # client-server mode.
1548
+ return PORT_FORWARD_PROXY_CMD_PATH
1549
+
1550
+
937
1551
  def setup_ssh_jump_svc(ssh_jump_name: str, namespace: str,
1552
+ context: Optional[str],
938
1553
  service_type: kubernetes_enums.KubernetesServiceType):
939
1554
  """Sets up Kubernetes service resource to access for SSH jump pod.
940
1555
 
@@ -956,13 +1571,14 @@ def setup_ssh_jump_svc(ssh_jump_name: str, namespace: str,
956
1571
 
957
1572
  # Create service
958
1573
  try:
959
- kubernetes.core_api().create_namespaced_service(namespace,
960
- content['service_spec'])
1574
+ kubernetes.core_api(context).create_namespaced_service(
1575
+ namespace, content['service_spec'])
961
1576
  except kubernetes.api_exception() as e:
962
1577
  # SSH Jump Pod service already exists.
963
1578
  if e.status == 409:
964
- ssh_jump_service = kubernetes.core_api().read_namespaced_service(
965
- name=ssh_jump_name, namespace=namespace)
1579
+ ssh_jump_service = kubernetes.core_api(
1580
+ context).read_namespaced_service(name=ssh_jump_name,
1581
+ namespace=namespace)
966
1582
  curr_svc_type = ssh_jump_service.spec.type
967
1583
  if service_type.value == curr_svc_type:
968
1584
  # If the currently existing SSH Jump service's type is identical
@@ -974,9 +1590,9 @@ def setup_ssh_jump_svc(ssh_jump_name: str, namespace: str,
974
1590
  # If a different type of service type for SSH Jump pod compared
975
1591
  # to user's configuration for networking mode exists, we remove
976
1592
  # existing servie to create a new one following user's config
977
- kubernetes.core_api().delete_namespaced_service(
1593
+ kubernetes.core_api(context).delete_namespaced_service(
978
1594
  name=ssh_jump_name, namespace=namespace)
979
- kubernetes.core_api().create_namespaced_service(
1595
+ kubernetes.core_api(context).create_namespaced_service(
980
1596
  namespace, content['service_spec'])
981
1597
  port_forward_mode = (
982
1598
  kubernetes_enums.KubernetesNetworkingMode.PORTFORWARD.value)
@@ -1005,7 +1621,8 @@ def setup_ssh_jump_svc(ssh_jump_name: str, namespace: str,
1005
1621
 
1006
1622
 
1007
1623
  def setup_ssh_jump_pod(ssh_jump_name: str, ssh_jump_image: str,
1008
- ssh_key_secret: str, namespace: str):
1624
+ ssh_key_secret: str, namespace: str,
1625
+ context: Optional[str]):
1009
1626
  """Sets up Kubernetes RBAC and pod for SSH jump host.
1010
1627
 
1011
1628
  Our Kubernetes implementation uses a SSH jump pod to reach SkyPilot clusters
@@ -1034,7 +1651,7 @@ def setup_ssh_jump_pod(ssh_jump_name: str, ssh_jump_image: str,
1034
1651
 
1035
1652
  # ServiceAccount
1036
1653
  try:
1037
- kubernetes.core_api().create_namespaced_service_account(
1654
+ kubernetes.core_api(context).create_namespaced_service_account(
1038
1655
  namespace, content['service_account'])
1039
1656
  except kubernetes.api_exception() as e:
1040
1657
  if e.status == 409:
@@ -1047,7 +1664,8 @@ def setup_ssh_jump_pod(ssh_jump_name: str, ssh_jump_image: str,
1047
1664
  logger.info('Created SSH Jump ServiceAccount.')
1048
1665
  # Role
1049
1666
  try:
1050
- kubernetes.auth_api().create_namespaced_role(namespace, content['role'])
1667
+ kubernetes.auth_api(context).create_namespaced_role(
1668
+ namespace, content['role'])
1051
1669
  except kubernetes.api_exception() as e:
1052
1670
  if e.status == 409:
1053
1671
  logger.info(
@@ -1058,7 +1676,7 @@ def setup_ssh_jump_pod(ssh_jump_name: str, ssh_jump_image: str,
1058
1676
  logger.info('Created SSH Jump Role.')
1059
1677
  # RoleBinding
1060
1678
  try:
1061
- kubernetes.auth_api().create_namespaced_role_binding(
1679
+ kubernetes.auth_api(context).create_namespaced_role_binding(
1062
1680
  namespace, content['role_binding'])
1063
1681
  except kubernetes.api_exception() as e:
1064
1682
  if e.status == 409:
@@ -1071,8 +1689,8 @@ def setup_ssh_jump_pod(ssh_jump_name: str, ssh_jump_image: str,
1071
1689
  logger.info('Created SSH Jump RoleBinding.')
1072
1690
  # Pod
1073
1691
  try:
1074
- kubernetes.core_api().create_namespaced_pod(namespace,
1075
- content['pod_spec'])
1692
+ kubernetes.core_api(context).create_namespaced_pod(
1693
+ namespace, content['pod_spec'])
1076
1694
  except kubernetes.api_exception() as e:
1077
1695
  if e.status == 409:
1078
1696
  logger.info(
@@ -1084,7 +1702,8 @@ def setup_ssh_jump_pod(ssh_jump_name: str, ssh_jump_image: str,
1084
1702
  logger.info(f'Created SSH Jump Host {ssh_jump_name}.')
1085
1703
 
1086
1704
 
1087
- def clean_zombie_ssh_jump_pod(namespace: str, node_id: str):
1705
+ def clean_zombie_ssh_jump_pod(namespace: str, context: Optional[str],
1706
+ node_id: str):
1088
1707
  """Analyzes SSH jump pod and removes if it is in a bad state
1089
1708
 
1090
1709
  Prevents the existence of a dangling SSH jump pod. This could happen
@@ -1100,11 +1719,12 @@ def clean_zombie_ssh_jump_pod(namespace: str, node_id: str):
1100
1719
  def find(l, predicate):
1101
1720
  """Utility function to find element in given list"""
1102
1721
  results = [x for x in l if predicate(x)]
1103
- return results[0] if len(results) > 0 else None
1722
+ return results[0] if results else None
1104
1723
 
1105
1724
  # Get the SSH jump pod name from the head pod
1106
1725
  try:
1107
- pod = kubernetes.core_api().read_namespaced_pod(node_id, namespace)
1726
+ pod = kubernetes.core_api(context).read_namespaced_pod(
1727
+ node_id, namespace)
1108
1728
  except kubernetes.api_exception() as e:
1109
1729
  if e.status == 404:
1110
1730
  logger.warning(f'Failed to get pod {node_id},'
@@ -1113,7 +1733,7 @@ def clean_zombie_ssh_jump_pod(namespace: str, node_id: str):
1113
1733
  else:
1114
1734
  ssh_jump_name = pod.metadata.labels.get('skypilot-ssh-jump')
1115
1735
  try:
1116
- ssh_jump_pod = kubernetes.core_api().read_namespaced_pod(
1736
+ ssh_jump_pod = kubernetes.core_api(context).read_namespaced_pod(
1117
1737
  ssh_jump_name, namespace)
1118
1738
  cont_ready_cond = find(ssh_jump_pod.status.conditions,
1119
1739
  lambda c: c.type == 'ContainersReady')
@@ -1124,9 +1744,9 @@ def clean_zombie_ssh_jump_pod(namespace: str, node_id: str):
1124
1744
  # ssh jump pod, let's remove it and the service. Otherwise, main
1125
1745
  # container is ready and its lifecycle management script takes
1126
1746
  # care of the cleaning.
1127
- kubernetes.core_api().delete_namespaced_pod(ssh_jump_name,
1128
- namespace)
1129
- kubernetes.core_api().delete_namespaced_service(
1747
+ kubernetes.core_api(context).delete_namespaced_pod(
1748
+ ssh_jump_name, namespace)
1749
+ kubernetes.core_api(context).delete_namespaced_service(
1130
1750
  ssh_jump_name, namespace)
1131
1751
  except kubernetes.api_exception() as e:
1132
1752
  # We keep the warning in debug to avoid polluting the `sky launch`
@@ -1138,7 +1758,7 @@ def clean_zombie_ssh_jump_pod(namespace: str, node_id: str):
1138
1758
  # We encountered an issue while checking ssh jump pod. To be on
1139
1759
  # the safe side, let's remove its service so the port is freed
1140
1760
  try:
1141
- kubernetes.core_api().delete_namespaced_service(
1761
+ kubernetes.core_api(context).delete_namespaced_service(
1142
1762
  ssh_jump_name, namespace)
1143
1763
  except kubernetes.api_exception():
1144
1764
  pass
@@ -1245,50 +1865,12 @@ def get_endpoint_debug_message() -> str:
1245
1865
  debug_cmd=debug_cmd)
1246
1866
 
1247
1867
 
1248
- def merge_dicts(source: Dict[Any, Any], destination: Dict[Any, Any]):
1249
- """Merge two dictionaries into the destination dictionary.
1250
-
1251
- Updates nested dictionaries instead of replacing them.
1252
- If a list is encountered, it will be appended to the destination list.
1253
-
1254
- An exception is when the key is 'containers', in which case the
1255
- first container in the list will be fetched and merge_dict will be
1256
- called on it with the first container in the destination list.
1257
- """
1258
- for key, value in source.items():
1259
- if isinstance(value, dict) and key in destination:
1260
- merge_dicts(value, destination[key])
1261
- elif isinstance(value, list) and key in destination:
1262
- assert isinstance(destination[key], list), \
1263
- f'Expected {key} to be a list, found {destination[key]}'
1264
- if key == 'containers':
1265
- # If the key is 'containers', we take the first and only
1266
- # container in the list and merge it.
1267
- assert len(value) == 1, \
1268
- f'Expected only one container, found {value}'
1269
- merge_dicts(value[0], destination[key][0])
1270
- elif key in ['volumes', 'volumeMounts']:
1271
- # If the key is 'volumes' or 'volumeMounts', we search for
1272
- # item with the same name and merge it.
1273
- for new_volume in value:
1274
- new_volume_name = new_volume.get('name')
1275
- if new_volume_name is not None:
1276
- destination_volume = next(
1277
- (v for v in destination[key]
1278
- if v.get('name') == new_volume_name), None)
1279
- if destination_volume is not None:
1280
- merge_dicts(new_volume, destination_volume)
1281
- else:
1282
- destination[key].append(new_volume)
1283
- else:
1284
- destination[key].extend(value)
1285
- else:
1286
- destination[key] = value
1287
-
1288
-
1289
- def combine_pod_config_fields(cluster_yaml_path: str) -> None:
1290
- """Adds or updates fields in the YAML with fields from the ~/.sky/config's
1291
- kubernetes.pod_spec dict.
1868
+ def combine_pod_config_fields(
1869
+ cluster_yaml_path: str,
1870
+ cluster_config_overrides: Dict[str, Any],
1871
+ ) -> None:
1872
+ """Adds or updates fields in the YAML with fields from the
1873
+ ~/.sky/config.yaml's kubernetes.pod_spec dict.
1292
1874
  This can be used to add fields to the YAML that are not supported by
1293
1875
  SkyPilot yet, or require simple configuration (e.g., adding an
1294
1876
  imagePullSecrets field).
@@ -1328,13 +1910,19 @@ def combine_pod_config_fields(cluster_yaml_path: str) -> None:
1328
1910
  with open(cluster_yaml_path, 'r', encoding='utf-8') as f:
1329
1911
  yaml_content = f.read()
1330
1912
  yaml_obj = yaml.safe_load(yaml_content)
1913
+ # We don't use override_configs in `skypilot_config.get_nested`, as merging
1914
+ # the pod config requires special handling.
1331
1915
  kubernetes_config = skypilot_config.get_nested(('kubernetes', 'pod_config'),
1332
- {})
1916
+ default_value={},
1917
+ override_configs={})
1918
+ override_pod_config = (cluster_config_overrides.get('kubernetes', {}).get(
1919
+ 'pod_config', {}))
1920
+ config_utils.merge_k8s_configs(kubernetes_config, override_pod_config)
1333
1921
 
1334
1922
  # Merge the kubernetes config into the YAML for both head and worker nodes.
1335
- merge_dicts(
1336
- kubernetes_config,
1337
- yaml_obj['available_node_types']['ray_head_default']['node_config'])
1923
+ config_utils.merge_k8s_configs(
1924
+ yaml_obj['available_node_types']['ray_head_default']['node_config'],
1925
+ kubernetes_config)
1338
1926
 
1339
1927
  # Write the updated YAML back to the file
1340
1928
  common_utils.dump_yaml(cluster_yaml_path, yaml_obj)
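The merge semantics inherited from the removed merge_dicts helper (nested dicts merge, lists append, and the single 'containers' entries merge element-wise) are easiest to see in a small sketch. The override values below are hypothetical; merge_k8s_configs updates its first argument in place.

    # Hypothetical illustration of config_utils.merge_k8s_configs semantics.
    destination = {'spec': {'containers': [{'name': 'ray', 'env': []}]}}
    override = {
        'spec': {
            'containers': [{'env': [{'name': 'MY_VAR', 'value': '1'}]}],
            'imagePullSecrets': [{'name': 'my-secret'}],
        }
    }
    config_utils.merge_k8s_configs(destination, override)
    # destination now has the env var appended to the first container and the
    # new imagePullSecrets entry added.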
@@ -1342,7 +1930,7 @@ def combine_pod_config_fields(cluster_yaml_path: str) -> None:
1342
1930
 
1343
1931
  def combine_metadata_fields(cluster_yaml_path: str) -> None:
1344
1932
  """Updates the metadata for all Kubernetes objects created by SkyPilot with
1345
- fields from the ~/.sky/config's kubernetes.custom_metadata dict.
1933
+ fields from the ~/.sky/config.yaml's kubernetes.custom_metadata dict.
1346
1934
 
1347
1935
  Obeys the same add or update semantics as combine_pod_config_fields().
1348
1936
  """
@@ -1368,7 +1956,7 @@ def combine_metadata_fields(cluster_yaml_path: str) -> None:
1368
1956
  ]
1369
1957
 
1370
1958
  for destination in combination_destinations:
1371
- merge_dicts(custom_metadata, destination)
1959
+ config_utils.merge_k8s_configs(destination, custom_metadata)
1372
1960
 
1373
1961
  # Write the updated YAML back to the file
1374
1962
  common_utils.dump_yaml(cluster_yaml_path, yaml_obj)
@@ -1381,13 +1969,13 @@ def merge_custom_metadata(original_metadata: Dict[str, Any]) -> None:
1381
1969
  """
1382
1970
  custom_metadata = skypilot_config.get_nested(
1383
1971
  ('kubernetes', 'custom_metadata'), {})
1384
- merge_dicts(custom_metadata, original_metadata)
1972
+ config_utils.merge_k8s_configs(original_metadata, custom_metadata)
1385
1973
 
1386
1974
 
1387
- def check_nvidia_runtime_class() -> bool:
1975
+ def check_nvidia_runtime_class(context: Optional[str] = None) -> bool:
1388
1976
  """Checks if the 'nvidia' RuntimeClass exists in the cluster"""
1389
1977
  # Fetch the list of available RuntimeClasses
1390
- runtime_classes = kubernetes.node_api().list_runtime_class()
1978
+ runtime_classes = kubernetes.node_api(context).list_runtime_class()
1391
1979
 
1392
1980
  # Check if 'nvidia' RuntimeClass exists
1393
1981
  nvidia_exists = any(
@@ -1395,7 +1983,8 @@ def check_nvidia_runtime_class() -> bool:
1395
1983
  return nvidia_exists
1396
1984
 
1397
1985
 
1398
- def check_secret_exists(secret_name: str, namespace: str) -> bool:
1986
+ def check_secret_exists(secret_name: str, namespace: str,
1987
+ context: Optional[str]) -> bool:
1399
1988
  """Checks if a secret exists in a namespace
1400
1989
 
1401
1990
  Args:
@@ -1404,7 +1993,7 @@ def check_secret_exists(secret_name: str, namespace: str) -> bool:
1404
1993
  """
1405
1994
 
1406
1995
  try:
1407
- kubernetes.core_api().read_namespaced_secret(
1996
+ kubernetes.core_api(context).read_namespaced_secret(
1408
1997
  secret_name, namespace, _request_timeout=kubernetes.API_TIMEOUT)
1409
1998
  except kubernetes.api_exception() as e:
1410
1999
  if e.status == 404:
@@ -1414,20 +2003,29 @@ def check_secret_exists(secret_name: str, namespace: str) -> bool:
1414
2003
  return True
1415
2004
 
1416
2005
 
1417
- def create_namespace(namespace: str) -> None:
2006
+ def create_namespace(namespace: str, context: Optional[str]) -> None:
1418
2007
  """Creates a namespace in the cluster.
1419
2008
 
1420
2009
  If the namespace already exists, logs a message and does nothing.
1421
2010
 
1422
2011
  Args:
1423
2012
  namespace: Name of the namespace to create
2013
+ context: Name of the context to use. Can be none to use default context.
1424
2014
  """
1425
2015
  kubernetes_client = kubernetes.kubernetes.client
2016
+ try:
2017
+ kubernetes.core_api(context).read_namespace(namespace)
2018
+ except kubernetes.api_exception() as e:
2019
+ if e.status != 404:
2020
+ raise
2021
+ else:
2022
+ return
2023
+
1426
2024
  ns_metadata = dict(name=namespace, labels={'parent': 'skypilot'})
1427
2025
  merge_custom_metadata(ns_metadata)
1428
2026
  namespace_obj = kubernetes_client.V1Namespace(metadata=ns_metadata)
1429
2027
  try:
1430
- kubernetes.core_api().create_namespace(namespace_obj)
2028
+ kubernetes.core_api(context).create_namespace(namespace_obj)
1431
2029
  except kubernetes.api_exception() as e:
1432
2030
  if e.status == 409:
1433
2031
  logger.info(f'Namespace {namespace} already exists in the cluster.')
@@ -1453,7 +2051,7 @@ def get_head_pod_name(cluster_name_on_cloud: str):
1453
2051
  def get_autoscaler_type(
1454
2052
  ) -> Optional[kubernetes_enums.KubernetesAutoscalerType]:
1455
2053
  """Returns the autoscaler type by reading from config"""
1456
- autoscaler_type = skypilot_config.get_nested(['kubernetes', 'autoscaler'],
2054
+ autoscaler_type = skypilot_config.get_nested(('kubernetes', 'autoscaler'),
1457
2055
  None)
1458
2056
  if autoscaler_type is not None:
1459
2057
  autoscaler_type = kubernetes_enums.KubernetesAutoscalerType(
@@ -1461,6 +2059,45 @@ def get_autoscaler_type(
1461
2059
  return autoscaler_type
1462
2060
 
1463
2061
 
2062
+ # Mapping of known spot label keys and values for different cluster types
2063
+ # Add new cluster types here if they support spot instances along with the
2064
+ # corresponding spot label key and value.
2065
+ SPOT_LABEL_MAP = {
2066
+ kubernetes_enums.KubernetesAutoscalerType.GKE.value:
2067
+ ('cloud.google.com/gke-spot', 'true')
2068
+ }
2069
+
2070
+
2071
+ def get_spot_label(
2072
+ context: Optional[str] = None) -> Tuple[Optional[str], Optional[str]]:
2073
+ """Get the spot label key and value for using spot instances, if supported.
2074
+
2075
+ Checks if the underlying cluster supports spot instances by checking nodes
2076
+ for known spot label keys and values. If found, returns the spot label key
2077
+ and value. If not, checks if autoscaler is configured and returns
2078
+ appropriate labels. If neither are found, returns None.
2079
+
2080
+ Returns:
2081
+ Tuple[str, str]: Tuple containing the spot label key and value. Returns
2082
+ None if spot instances are not supported.
2083
+ """
2084
+ # Check if the cluster supports spot instances by checking nodes for known
2085
+ # spot label keys and values
2086
+ for node in get_kubernetes_nodes(context):
2087
+ for _, (key, value) in SPOT_LABEL_MAP.items():
2088
+ if key in node.metadata.labels and node.metadata.labels[
2089
+ key] == value:
2090
+ return key, value
2091
+
2092
+ # Check if autoscaler is configured. Allow spot instances if autoscaler type
2093
+ # is known to support spot instances.
2094
+ autoscaler_type = get_autoscaler_type()
2095
+ if autoscaler_type == kubernetes_enums.KubernetesAutoscalerType.GKE:
2096
+ return SPOT_LABEL_MAP[autoscaler_type.value]
2097
+
2098
+ return None, None
2099
+
2100
+
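A hedged sketch of consuming get_spot_label when constructing a pod spec; the context name is hypothetical, and the GKE label shown comes from SPOT_LABEL_MAP above.

    # Hypothetical sketch: request spot nodes only if the cluster supports them.
    key, value = get_spot_label(context='my-context')
    node_selector = {}
    if key is not None:
        node_selector[key] = value  # e.g. {'cloud.google.com/gke-spot': 'true'}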
1464
2101
  def dict_to_k8s_object(object_dict: Dict[str, Any], object_type: 'str') -> Any:
1465
2102
  """Converts a dictionary to a Kubernetes object.
1466
2103
 
@@ -1479,3 +2116,485 @@ def dict_to_k8s_object(object_dict: Dict[str, Any], object_type: 'str') -> Any:
1479
2116
 
1480
2117
  fake_kube_response = FakeKubeResponse(object_dict)
1481
2118
  return kubernetes.api_client().deserialize(fake_kube_response, object_type)
2119
+
2120
+
2121
+ def get_kubernetes_node_info(
2122
+ context: Optional[str] = None) -> Dict[str, models.KubernetesNodeInfo]:
2123
+ """Gets the resource information for all the nodes in the cluster.
2124
+
2125
+ Currently only GPU resources are supported. The function returns the total
2126
+ number of GPUs available on the node and the number of free GPUs on the
2127
+ node.
2128
+
2129
+ If the user does not have sufficient permissions to list pods in all
2130
+ namespaces, the function will return free GPUs as -1.
2131
+
2132
+ Returns:
2133
+ Dict[str, KubernetesNodeInfo]: Dictionary containing the node name as
2134
+ key and the KubernetesNodeInfo object as value
2135
+ """
2136
+ nodes = get_kubernetes_nodes(context)
2137
+ # Get the pods to get the real-time resource usage
2138
+ try:
2139
+ pods = get_all_pods_in_kubernetes_cluster(context)
2140
+ except kubernetes.api_exception() as e:
2141
+ if e.status == 403:
2142
+ pods = None
2143
+ else:
2144
+ raise
2145
+
2146
+ lf, _ = detect_gpu_label_formatter(context)
2147
+ if not lf:
2148
+ label_keys = []
2149
+ else:
2150
+ label_keys = lf.get_label_keys()
2151
+
2152
+ node_info_dict: Dict[str, models.KubernetesNodeInfo] = {}
2153
+
2154
+ for node in nodes:
2155
+ accelerator_name = None
2156
+ # Determine the accelerator name from the node labels and pick the
2157
+ # first one found. We assume that the node has only one accelerator type
2158
+ # (e.g., either GPU or TPU).
2159
+ for label_key in label_keys:
2160
+ if lf is not None and label_key in node.metadata.labels:
2161
+ accelerator_name = lf.get_accelerator_from_label_value(
2162
+ node.metadata.labels.get(label_key))
2163
+ break
2164
+
2165
+ allocated_qty = 0
2166
+ accelerator_count = get_node_accelerator_count(node.status.allocatable)
2167
+
2168
+ if pods is None:
2169
+ accelerators_available = -1
2170
2171
+ else:
2172
+ for pod in pods:
2173
+ # Get all the pods running on the node
2174
+ if (pod.spec.node_name == node.metadata.name and
2175
+ pod.status.phase in ['Running', 'Pending']):
2176
+ # Iterate over all the containers in the pod and sum the
2177
+ # GPU requests
2178
+ for container in pod.spec.containers:
2179
+ if container.resources.requests:
2180
+ allocated_qty += get_node_accelerator_count(
2181
+ container.resources.requests)
2182
+
2183
+ accelerators_available = accelerator_count - allocated_qty
2184
+
2185
+ # Exclude multi-host TPUs from being processed.
2186
+ # TODO(Doyoung): Remove the logic when adding support for
2187
+ # multi-host TPUs.
2188
+ if is_multi_host_tpu(node.metadata.labels):
2189
+ continue
2190
+
2191
+ node_info_dict[node.metadata.name] = models.KubernetesNodeInfo(
2192
+ name=node.metadata.name,
2193
+ accelerator_type=accelerator_name,
2194
+ total={'accelerator_count': int(accelerator_count)},
2195
+ free={'accelerators_available': int(accelerators_available)})
2196
+
2197
+ return node_info_dict
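A hedged usage sketch; the field names follow the models.KubernetesNodeInfo construction above, and -1 free accelerators marks missing pod-list permissions. The context name is hypothetical.

    # Hypothetical sketch: summarize per-node accelerator availability.
    for name, info in get_kubernetes_node_info(context='my-context').items():
        total = info.total['accelerator_count']
        free = info.free['accelerators_available']
        print(f'{name}: {info.accelerator_type or "-"} '
              f'{"unknown" if free < 0 else free}/{total} free')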
2198
+
2199
+
2200
+ def to_label_selector(tags):
2201
+ return ','.join('{}={}'.format(k, v) for k, v in tags.items())
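For instance, a quick sketch of the selector string this produces:

    # to_label_selector({'app': 'sky', 'tier': 'head'}) -> 'app=sky,tier=head'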
2207
+
2208
+
2209
+ def get_namespace_from_config(provider_config: Dict[str, Any]) -> str:
2210
+ context = get_context_from_config(provider_config)
2211
+ return provider_config.get('namespace',
2212
+ get_kube_config_context_namespace(context))
2213
+
2214
+
2215
+ @timeline.event
2216
+ def filter_pods(namespace: str,
2217
+ context: Optional[str],
2218
+ tag_filters: Dict[str, str],
2219
+ status_filters: Optional[List[str]] = None) -> Dict[str, Any]:
2220
+ """Filters pods by tags and status."""
2221
+ non_included_pod_statuses = POD_STATUSES.copy()
2222
+
2223
+ field_selector = ''
2224
+ if status_filters is not None:
2225
+ non_included_pod_statuses -= set(status_filters)
2226
+ field_selector = ','.join(
2227
+ [f'status.phase!={status}' for status in non_included_pod_statuses])
2228
+
2229
+ label_selector = to_label_selector(tag_filters)
2230
+ pod_list = kubernetes.core_api(context).list_namespaced_pod(
2231
+ namespace, field_selector=field_selector, label_selector=label_selector)
2232
+
2233
+ # Don't return pods marked for deletion,
2234
+ # i.e. pods with non-null metadata.DeletionTimestamp.
2235
+ pods = [
2236
+ pod for pod in pod_list.items if pod.metadata.deletion_timestamp is None
2237
+ ]
2238
+ return {pod.metadata.name: pod for pod in pods}
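A hedged sketch of a filter_pods call fetching only the Running pods of one cluster; the namespace, context, and cluster name are hypothetical, and the 'skypilot-cluster' label mirrors the selector used by get_skypilot_pods below.

    # Hypothetical sketch: fetch the Running pods of a single SkyPilot cluster.
    running = filter_pods(namespace='default',
                          context='my-context',
                          tag_filters={'skypilot-cluster': 'mycluster-2ea4'},
                          status_filters=['Running'])
    print(sorted(running.keys()))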
2239
+
2240
+
2241
+ def _remove_pod_annotation(pod: Any,
2242
+ annotation_key: str,
2243
+ namespace: str,
2244
+ context: Optional[str] = None) -> None:
2245
+ """Removes specified Annotations from a Kubernetes pod."""
2246
+ try:
2247
+ # Remove the specified annotation
2248
+ if pod.metadata.annotations:
2249
+ if annotation_key in pod.metadata.annotations:
2250
+ # Patch the pod with the updated metadata.
2251
+ body = {'metadata': {'annotations': {annotation_key: None}}}
2252
+ kubernetes.core_api(context).patch_namespaced_pod(
2253
+ name=pod.metadata.name,
2254
+ namespace=namespace,
2255
+ body=body,
2256
+ _request_timeout=kubernetes.API_TIMEOUT)
2257
+
2258
+ except kubernetes.api_exception() as e:
2259
+ if e.status == 404:
2260
+ logger.warning(
2261
+ ANNOTATIONS_POD_NOT_FOUND_ERROR_MSG.format(
2262
+ pod_name=pod.metadata.name,
2263
+ namespace=namespace,
2264
+ action='remove',
2265
+ annotation=annotation_key))
2266
+ else:
2267
+ with ux_utils.print_exception_no_traceback():
2268
+ raise
2269
+
2270
+
2271
+ def _add_pod_annotation(pod: Any,
2272
+ annotation: Dict[str, str],
2273
+ namespace: str,
2274
+ context: Optional[str] = None) -> None:
2275
+ """Adds specified Annotations on a Kubernetes pod."""
2276
+ try:
2277
+ # Patch the pod with the updated metadata
2278
+ body = {'metadata': {'annotations': annotation}}
2279
+ kubernetes.core_api(context).patch_namespaced_pod(
2280
+ name=pod.metadata.name,
2281
+ namespace=namespace,
2282
+ body=body,
2283
+ _request_timeout=kubernetes.API_TIMEOUT)
2284
+
2285
+ except kubernetes.api_exception() as e:
2286
+ if e.status == 404:
2287
+ logger.warning(
2288
+ ANNOTATIONS_POD_NOT_FOUND_ERROR_MSG.format(
2289
+ pod_name=pod.metadata.name,
2290
+ namespace=namespace,
2291
+ action='add',
2292
+ annotation=annotation))
2293
+ else:
2294
+ with ux_utils.print_exception_no_traceback():
2295
+ raise
2296
+
2297
+
2298
+ def set_autodown_annotations(handle: 'backends.CloudVmRayResourceHandle',
2299
+ idle_minutes_to_autostop: Optional[int],
2300
+ down: bool = False) -> None:
2301
+ """Adds or removes Annotations of autodown on Kubernetes pods."""
2302
+ tags = {
2303
+ provision_constants.TAG_RAY_CLUSTER_NAME: handle.cluster_name_on_cloud,
2304
+ }
2305
+ ray_config = common_utils.read_yaml(handle.cluster_yaml)
2306
+ provider_config = ray_config['provider']
2307
+ namespace = get_namespace_from_config(provider_config)
2308
+ context = get_context_from_config(provider_config)
2309
+ running_pods = filter_pods(namespace, context, tags)
2310
+
2311
+ for _, pod in running_pods.items():
2312
+ if down:
2313
+ idle_minutes_to_autostop_annotation = {
2314
+ IDLE_MINUTES_TO_AUTOSTOP_ANNOTATION_KEY:
2315
+ str(idle_minutes_to_autostop)
2316
+ }
2317
+ autodown_annotation = {AUTODOWN_ANNOTATION_KEY: 'true'}
2318
+ _add_pod_annotation(pod=pod,
2319
+ annotation=idle_minutes_to_autostop_annotation,
2320
+ namespace=namespace,
2321
+ context=context)
2322
+ _add_pod_annotation(pod=pod,
2323
+ annotation=autodown_annotation,
2324
+ namespace=namespace,
2325
+ context=context)
2326
+
2327
+ # If idle_minutes_to_autostop is negative, it indicates a request to
2328
+ # cancel autostop using the --cancel flag with the `sky autostop`
2329
+ # command.
2330
+ elif (idle_minutes_to_autostop is not None and
2331
+ idle_minutes_to_autostop < 0):
2332
+ _remove_pod_annotation(
2333
+ pod=pod,
2334
+ annotation_key=IDLE_MINUTES_TO_AUTOSTOP_ANNOTATION_KEY,
2335
+ namespace=namespace,
2336
+ context=context)
2337
+ _remove_pod_annotation(pod=pod,
2338
+ annotation_key=AUTODOWN_ANNOTATION_KEY,
2339
+ namespace=namespace,
2340
+ context=context)
2341
+
2342
+
2343
+ def get_context_from_config(provider_config: Dict[str, Any]) -> Optional[str]:
2344
+ context = provider_config.get('context',
2345
+ get_current_kube_config_context_name())
2346
+ if context == kubernetes.in_cluster_context_name():
2347
+ # If the context (also used as the region) is in-cluster, we need to
2348
+ # we need to use in-cluster auth by setting the context to None.
2349
+ context = None
2350
+ return context
2351
+
2352
+
2353
+ def get_skypilot_pods(context: Optional[str] = None) -> List[Any]:
2354
+ """Gets all SkyPilot pods in the Kubernetes cluster.
2355
+
2356
+ Args:
2357
+ context: Kubernetes context to use. If None, uses the current context.
2358
+
2359
+ Returns:
2360
+ A list of Kubernetes pod objects.
2361
+ """
2362
+ if context is None:
2363
+ context = get_current_kube_config_context_name()
2364
+
2365
+ try:
2366
+ pods = kubernetes.core_api(context).list_pod_for_all_namespaces(
2367
+ label_selector='skypilot-cluster',
2368
+ _request_timeout=kubernetes.API_TIMEOUT).items
2369
+ except kubernetes.max_retry_error():
2370
+ raise exceptions.ResourcesUnavailableError(
2371
+ 'Timed out trying to get SkyPilot pods from Kubernetes cluster. '
2372
+ 'Please check if the cluster is healthy and retry. To debug, run: '
2373
+ 'kubectl get pods --selector=skypilot-cluster --all-namespaces'
2374
+ ) from None
2375
+ return pods
2376
+
2377
+
2378
+ def is_tpu_on_gke(accelerator: str) -> bool:
2379
+ """Determines if the given accelerator is a TPU supported on GKE."""
2380
+ return accelerator in GKE_TPU_ACCELERATOR_TO_GENERATION
2381
+
2382
+
2383
+ def get_node_accelerator_count(attribute_dict: dict) -> int:
2384
+ """Retrieves the count of accelerators from a node's resource dictionary.
2385
+
2386
+ This method checks the node's allocatable resources or the accelerators
2387
+ already deployed on the node, using pod objects that describe resource
2388
+ requests.
2389
+
2390
+ Args:
2391
+ attribute_dict: Containing resource information from a node, such as
2392
+ allocatable or requested resources.
2393
+
2394
+ Returns:
2395
+ Number of accelerators allocated or available from the node. If no
2396
+ resource is found, it returns 0.
2397
+ """
2398
+ gpu_resource_name = get_gpu_resource_key()
2399
+ assert not (gpu_resource_name in attribute_dict and
2400
+ TPU_RESOURCE_KEY in attribute_dict)
2401
+ if gpu_resource_name in attribute_dict:
2402
+ return int(attribute_dict[gpu_resource_name])
2403
+ elif TPU_RESOURCE_KEY in attribute_dict:
2404
+ return int(attribute_dict[TPU_RESOURCE_KEY])
2405
+ return 0
2406
+
2407
+
2408
+ def reduce_tpu_topology(topology: str) -> int:
2409
+ """Computes the number of TPU chips from its topology string."""
2410
+ chip_dimensions = [int(chip_count) for chip_count in topology.split('x')]
2411
+ # tpu_topology_chip_count represents the total number of TPU chips in the
2412
+ # entire podslice, whether it is a single-host or multi-host TPU podslice.
2413
+ tpu_topology_chip_count = functools.reduce(lambda x, y: x * y,
2414
+ chip_dimensions)
2415
+ return tpu_topology_chip_count
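For example, the arithmetic for two common topology strings (a sketch; the values follow directly from the product of the dimensions):

    # '2x2x4' describes a 16-chip podslice; '2x2x1' a 4-chip single host.
    assert reduce_tpu_topology('2x2x4') == 2 * 2 * 4 == 16
    assert reduce_tpu_topology('2x2x1') == 4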
2416
+
2417
+
2418
+ def is_multi_host_tpu(node_metadata_labels: dict) -> bool:
2419
+ """Determines whether the given node is a multi-host TPU configuration."""
2420
+ if GKELabelFormatter.TPU_LABEL_KEY in node_metadata_labels:
2421
+ assert GKELabelFormatter.TPU_TOPOLOGY_LABEL_KEY in node_metadata_labels
2422
+ topology_value = (
2423
+ node_metadata_labels[GKELabelFormatter.TPU_TOPOLOGY_LABEL_KEY])
2424
+ accelerator_count_label_key = (
2425
+ GKELabelFormatter.ACCELERATOR_COUNT_LABEL_KEY)
2426
+ assert accelerator_count_label_key in node_metadata_labels
2427
+ # node_tpu_chip_count represents the number of TPU chips
2428
+ # available in this node. If the node is part of a node pool
2429
+ # forming a multi-host TPU podslice, it only reflects the
2430
+ # number of TPU chips in this individual node, not the entire
2431
+ # multi-host TPU podslice.
2432
+ node_tpu_chip_count = int(
2433
+ node_metadata_labels[accelerator_count_label_key])
2434
+ topology_chip_count = reduce_tpu_topology(topology_value)
2435
+ # For multi-host TPU podslices, topology_chip_count and
2436
+ # node_tpu_chip_count will differ, as topology_chip_count
2437
+ # reflects the total across all hosts, while
2438
+ # node_tpu_chip_count reflects only the chips in a single node.
2439
+ if node_tpu_chip_count != topology_chip_count:
2440
+ return True
2441
+ return False
2442
+
2443
+
2444
+ def multi_host_tpu_exists_in_cluster(context: Optional[str] = None) -> bool:
2445
+ """Checks if there exists a multi-host TPU within the cluster."""
2446
+ nodes = get_kubernetes_nodes(context)
2447
+ for node in nodes:
2448
+ if is_multi_host_tpu(node.metadata.labels):
2449
+ return True
2450
+ return False
2451
+
2452
+
2453
+ @dataclasses.dataclass
2454
+ class KubernetesSkyPilotClusterInfo:
2455
+ cluster_name_on_cloud: str
2456
+ cluster_name: str
2457
+ user: str
2458
+ status: status_lib.ClusterStatus
2459
+ pods: List[Any]
2460
+ launched_at: float
2461
+ resources: 'resources_lib.Resources'
2462
+ resources_str: str
2463
+
2464
+
2465
+ @dataclasses.dataclass
2466
+ class KubernetesSkyPilotClusterInfoPayload:
2467
+ """SkyPilot Cluster on Kubernetes payload."""
2468
+ cluster_name_on_cloud: str
2469
+ cluster_name: str
2470
+ user: str
2471
+ status: status_lib.ClusterStatus
2472
+ resources_str: str
2473
+ launched_at: float
2474
+
2475
+ @classmethod
2476
+ def from_cluster(
2477
+ cls, cluster: KubernetesSkyPilotClusterInfo
2478
+ ) -> 'KubernetesSkyPilotClusterInfoPayload':
2479
+ resources_str = f'{len(cluster.pods)}x {cluster.resources}'
2480
+ return cls(
2481
+ cluster_name_on_cloud=cluster.cluster_name_on_cloud,
2482
+ cluster_name=cluster.cluster_name,
2483
+ user=cluster.user,
2484
+ status=cluster.status,
2485
+ resources_str=resources_str,
2486
+ launched_at=cluster.launched_at,
2487
+ )
2488
+
2489
+
2490
+ def process_skypilot_pods(
2491
+ pods: List[Any],
2492
+ context: Optional[str] = None
2493
+ ) -> Tuple[List[KubernetesSkyPilotClusterInfo],
2494
+ List[KubernetesSkyPilotClusterInfo],
2495
+ List[KubernetesSkyPilotClusterInfo]]:
2496
+ """Process SkyPilot pods on k8s to extract cluster and controller info.
2497
+
2498
+ Args:
2499
+ pods: List of Kubernetes pod objects.
2500
+ context: Kubernetes context name, used to detect GPU label formatter.
2501
+
2502
+ Returns:
2503
+ A tuple containing:
2504
+ - List of KubernetesSkyPilotClusterInfo with all cluster info.
2505
+ - List of KubernetesSkyPilotClusterInfo with job controller info.
2506
+ - List of KubernetesSkyPilotClusterInfo with serve controller info.
2507
+ """
2508
+ # pylint: disable=import-outside-toplevel
2509
+ from sky import resources as resources_lib
2510
+ clusters: Dict[str, KubernetesSkyPilotClusterInfo] = {}
2511
+ jobs_controllers: List[KubernetesSkyPilotClusterInfo] = []
2512
+ serve_controllers: List[KubernetesSkyPilotClusterInfo] = []
2513
+
2514
+ for pod in pods:
2515
+ cluster_name_on_cloud = pod.metadata.labels.get('skypilot-cluster')
2516
+ cluster_name = cluster_name_on_cloud.rsplit(
2517
+ '-', 1
2518
+ )[0] # Remove the user hash to get cluster name (e.g., mycluster-2ea4)
2519
+ if cluster_name_on_cloud not in clusters:
2520
+ # Parse the start time for the cluster
2521
+ start_time = pod.status.start_time
2522
+ if start_time is not None:
2523
+ start_time = pod.status.start_time.timestamp()
2524
+
2525
+ # Parse resources
2526
+ cpu_request = parse_cpu_or_gpu_resource(
2527
+ pod.spec.containers[0].resources.requests.get('cpu', '0'))
2528
+ memory_request = parse_memory_resource(
2529
+ pod.spec.containers[0].resources.requests.get('memory', '0'),
2530
+ unit='G')
2531
+ gpu_count = parse_cpu_or_gpu_resource(
2532
+ pod.spec.containers[0].resources.requests.get(
2533
+ 'nvidia.com/gpu', '0'))
2534
+ gpu_name = None
2535
+ if gpu_count > 0:
2536
+ label_formatter, _ = (detect_gpu_label_formatter(context))
2537
+ assert label_formatter is not None, (
2538
+ 'GPU label formatter cannot be None if there are pods '
2539
+ f'requesting GPUs: {pod.metadata.name}')
2540
+ gpu_label = label_formatter.get_label_key()
2541
+ # Get GPU name from pod node selector
2542
+ if pod.spec.node_selector is not None:
2543
+ gpu_name = label_formatter.get_accelerator_from_label_value(
2544
+ pod.spec.node_selector.get(gpu_label))
2545
+
2546
+ resources = resources_lib.Resources(
2547
+ cloud=clouds.Kubernetes(),
2548
+ cpus=int(cpu_request),
2549
+ memory=int(memory_request),
2550
+ accelerators=(f'{gpu_name}:{gpu_count}'
2551
+ if gpu_count > 0 else None))
2552
+ if pod.status.phase == 'Pending':
2553
+ # If pod is pending, do not show it in the status
2554
+ continue
2555
+
2556
+ cluster_info = KubernetesSkyPilotClusterInfo(
2557
+ cluster_name_on_cloud=cluster_name_on_cloud,
2558
+ cluster_name=cluster_name,
2559
+ user=pod.metadata.labels.get('skypilot-user'),
2560
+ status=status_lib.ClusterStatus.UP,
2561
+ pods=[],
2562
+ launched_at=start_time,
2563
+ resources=resources,
2564
+ resources_str='')
2565
+ clusters[cluster_name_on_cloud] = cluster_info
2566
+ # Check if cluster name is name of a controller
2567
+ # Can't use controller_utils.Controllers.from_name(cluster_name)
2568
+ # because hash is different across users
2569
+ if 'sky-jobs-controller' in cluster_name_on_cloud:
2570
+ jobs_controllers.append(cluster_info)
2571
+ elif 'sky-serve-controller' in cluster_name_on_cloud:
2572
+ serve_controllers.append(cluster_info)
2573
+ else:
2574
+ # Update start_time if this pod started earlier
2575
+ pod_start_time = pod.status.start_time
2576
+ if pod_start_time is not None:
2577
+ pod_start_time = pod_start_time.timestamp()
2578
+ if pod_start_time < clusters[cluster_name_on_cloud].launched_at:
2579
+ clusters[cluster_name_on_cloud].launched_at = pod_start_time
2580
+ clusters[cluster_name_on_cloud].pods.append(pod)
2581
+ # Update resources_str in clusters:
2582
+ for cluster in clusters.values():
2583
+ num_pods = len(cluster.pods)
2584
+ cluster.resources_str = f'{num_pods}x {cluster.resources}'
2585
+ return list(clusters.values()), jobs_controllers, serve_controllers
2586
+
2587
+
2588
+ def get_gpu_resource_key():
2589
+ """Get the GPU resource name to use in kubernetes.
2590
+ The function first checks the CUSTOM_GPU_RESOURCE_KEY environment variable.
2591
+ If it is set, its value is used; otherwise, the default GPU_RESOURCE_KEY
2592
+ ("nvidia.com/gpu") is returned.
2594
+ Returns:
2595
+ str: The selected GPU resource name.
2596
+ """
2597
+ # Retrieve GPU resource name from environment variable, if set.
2598
+ # Else use default.
2599
+ # E.g., can be nvidia.com/gpu-h100, amd.com/gpu etc.
2600
+ return os.getenv('CUSTOM_GPU_RESOURCE_KEY', default=GPU_RESOURCE_KEY)
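A hedged sketch of overriding the resource key for a non-NVIDIA device; the environment variable name comes from the code above, while the 'amd.com/gpu' value is an illustrative assumption.

    # Hypothetical sketch: point SkyPilot at a custom GPU resource key.
    os.environ['CUSTOM_GPU_RESOURCE_KEY'] = 'amd.com/gpu'
    assert get_gpu_resource_key() == 'amd.com/gpu'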