skypilot-nightly 1.0.0.dev20250612__py3-none-any.whl → 1.0.0.dev20250614__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +4 -2
- sky/adaptors/hyperbolic.py +8 -0
- sky/adaptors/kubernetes.py +3 -2
- sky/authentication.py +20 -2
- sky/backends/backend_utils.py +11 -3
- sky/backends/cloud_vm_ray_backend.py +2 -1
- sky/benchmark/benchmark_state.py +2 -1
- sky/catalog/data_fetchers/fetch_aws.py +1 -1
- sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
- sky/catalog/data_fetchers/fetch_vast.py +1 -1
- sky/catalog/hyperbolic_catalog.py +133 -0
- sky/check.py +2 -1
- sky/cli.py +1 -1
- sky/client/cli.py +1 -1
- sky/clouds/__init__.py +2 -0
- sky/clouds/cloud.py +1 -1
- sky/clouds/gcp.py +1 -1
- sky/clouds/hyperbolic.py +276 -0
- sky/clouds/kubernetes.py +8 -2
- sky/clouds/ssh.py +7 -3
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/chunks/37-7754056a4b503e1d.js +6 -0
- sky/dashboard/out/_next/static/chunks/600.bd2ed8c076b720ec.js +16 -0
- sky/dashboard/out/_next/static/chunks/{856-0776dc6ed6000c39.js → 856-c2c39c0912285e54.js} +1 -1
- sky/dashboard/out/_next/static/chunks/938-245c9ac4c9e8bf15.js +1 -0
- sky/dashboard/out/_next/static/chunks/{webpack-208a9812ab4f61c9.js → webpack-27de3d9d450d81c6.js} +1 -1
- sky/dashboard/out/_next/static/css/{5d71bfc09f184bab.css → 6f84444b8f3c656c.css} +1 -1
- sky/dashboard/out/_next/static/{G3DXdMFu2Jzd-Dody9iq1 → nm5jrKpUZh2W0SxzyDKhz}/_buildManifest.js +1 -1
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/storage.py +2 -2
- sky/jobs/state.py +43 -44
- sky/provision/__init__.py +1 -0
- sky/provision/common.py +1 -1
- sky/provision/gcp/config.py +1 -1
- sky/provision/hyperbolic/__init__.py +11 -0
- sky/provision/hyperbolic/config.py +10 -0
- sky/provision/hyperbolic/instance.py +423 -0
- sky/provision/hyperbolic/utils.py +373 -0
- sky/provision/kubernetes/instance.py +2 -1
- sky/provision/kubernetes/utils.py +60 -13
- sky/resources.py +2 -2
- sky/serve/serve_state.py +81 -15
- sky/server/requests/preconditions.py +1 -1
- sky/server/requests/requests.py +11 -6
- sky/setup_files/dependencies.py +2 -1
- sky/skylet/configs.py +26 -19
- sky/skylet/constants.py +1 -1
- sky/skylet/job_lib.py +3 -5
- sky/task.py +1 -1
- sky/templates/hyperbolic-ray.yml.j2 +67 -0
- sky/templates/kubernetes-ray.yml.j2 +1 -1
- sky/users/permission.py +2 -0
- sky/utils/common_utils.py +6 -0
- sky/utils/context.py +1 -1
- sky/utils/infra_utils.py +1 -1
- sky/utils/kubernetes/generate_kubeconfig.sh +1 -1
- {skypilot_nightly-1.0.0.dev20250612.dist-info → skypilot_nightly-1.0.0.dev20250614.dist-info}/METADATA +2 -1
- {skypilot_nightly-1.0.0.dev20250612.dist-info → skypilot_nightly-1.0.0.dev20250614.dist-info}/RECORD +79 -70
- sky/dashboard/out/_next/static/chunks/37-d8aebf1683522a0b.js +0 -6
- sky/dashboard/out/_next/static/chunks/600.15a0009177e86b86.js +0 -16
- sky/dashboard/out/_next/static/chunks/938-ab185187a63f9cdb.js +0 -1
- /sky/dashboard/out/_next/static/chunks/{843-6fcc4bf91ac45b39.js → 843-5011affc9540757f.js} +0 -0
- /sky/dashboard/out/_next/static/chunks/pages/{_app-7bbd9d39d6f9a98a.js → _app-664031f6ae737f80.js} +0 -0
- /sky/dashboard/out/_next/static/chunks/pages/clusters/{[cluster]-451a14e7e755ebbc.js → [cluster]-20210f8cd809063d.js} +0 -0
- /sky/dashboard/out/_next/static/chunks/pages/{jobs-fe233baf3d073491.js → jobs-ae7a5e9fa5a5b5f0.js} +0 -0
- /sky/dashboard/out/_next/static/{G3DXdMFu2Jzd-Dody9iq1 → nm5jrKpUZh2W0SxzyDKhz}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250612.dist-info → skypilot_nightly-1.0.0.dev20250614.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250612.dist-info → skypilot_nightly-1.0.0.dev20250614.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250612.dist-info → skypilot_nightly-1.0.0.dev20250614.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250612.dist-info → skypilot_nightly-1.0.0.dev20250614.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,373 @@
|
|
1
|
+
"""Hyperbolic API utilities."""
|
2
|
+
import enum
|
3
|
+
import json
|
4
|
+
import os
|
5
|
+
import time
|
6
|
+
from typing import Any, Dict, Optional, Tuple
|
7
|
+
|
8
|
+
import requests
|
9
|
+
|
10
|
+
from sky import authentication
|
11
|
+
from sky import sky_logging
|
12
|
+
from sky.utils import status_lib
|
13
|
+
|
14
|
+
# Module-level configuration for the Hyperbolic API client.
#TODO update to prod endpoint
BASE_URL = 'https://api.hyperbolic.xyz'
# Location of the user's API key on disk; expanded with os.path.expanduser
# by HyperbolicClient.__init__ before reading.
API_KEY_PATH = '~/.hyperbolic/api_key'

# NOTE(review): MAX_RETRIES and RETRY_DELAY are defined but not referenced
# anywhere in this module's visible code — confirm whether callers use them
# or whether they are dead constants.
MAX_RETRIES = 3
RETRY_DELAY = 2  # seconds
# Default timeout (seconds) for wait_for_instance polling; note that
# _make_request uses a hardcoded 120 rather than this constant.
TIMEOUT = 120

logger = sky_logging.init_logger(__name__)
|
23
|
+
|
24
|
+
|
25
|
+
class HyperbolicError(Exception):
    """Base exception for Hyperbolic API errors.

    All failures raised by this module (HTTP errors, unexpected API
    payloads, unknown statuses) are wrapped in this type so callers can
    catch a single exception class.
    """
    # A docstring already forms the class body, so no `pass` is needed.
|
28
|
+
|
29
|
+
|
30
|
+
class HyperbolicInstanceStatus(enum.Enum):
    """Statuses enum for Hyperbolic instances."""
    UNKNOWN = 'unknown'
    ONLINE = 'online'
    OFFLINE = 'offline'
    STARTING = 'starting'
    STOPPING = 'stopping'
    BUSY = 'busy'
    RESTARTING = 'restarting'
    CREATING = 'creating'
    FAILED = 'failed'
    ERROR = 'error'
    TERMINATED = 'terminated'

    @classmethod
    def cluster_status_map(
        cls
    ) -> Dict['HyperbolicInstanceStatus', Optional[status_lib.ClusterStatus]]:
        """Return the mapping from provider status to SkyPilot status.

        Only ONLINE maps to UP and TERMINATED maps to None (gone);
        every other provider status is treated as INIT.
        """
        init = status_lib.ClusterStatus.INIT
        return {
            cls.CREATING: init,
            cls.STARTING: init,
            cls.ONLINE: status_lib.ClusterStatus.UP,
            cls.FAILED: init,
            cls.ERROR: init,
            cls.RESTARTING: init,
            cls.STOPPING: init,
            cls.UNKNOWN: init,
            cls.BUSY: init,
            cls.OFFLINE: init,
            cls.TERMINATED: None,
        }

    @classmethod
    def from_raw_status(cls, status: str) -> 'HyperbolicInstanceStatus':
        """Parse a raw API status string (case-insensitive) into the enum.

        Raises:
            HyperbolicError: if the string matches no known status value.
        """
        normalized = status.lower()
        try:
            return cls(normalized)
        except ValueError as exc:
            raise HyperbolicError(f'Unknown instance status: {status}') from exc

    def to_cluster_status(self) -> Optional[status_lib.ClusterStatus]:
        """Translate this provider status into a SkyPilot cluster status."""
        mapping = self.cluster_status_map()
        return mapping.get(self)
|
73
|
+
|
74
|
+
|
75
|
+
class HyperbolicClient:
    """Client for interacting with the Hyperbolic API.

    Reads the API key from API_KEY_PATH at construction time and exposes
    instance lifecycle operations (launch / list / terminate / wait).
    """

    def __init__(self):
        """Initialize the Hyperbolic client with API credentials.

        Raises:
            RuntimeError: if no API key file exists at API_KEY_PATH.
        """
        cred_path = os.path.expanduser(API_KEY_PATH)
        if not os.path.exists(cred_path):
            raise RuntimeError(f'API key not found at {cred_path}')
        with open(cred_path, 'r', encoding='utf-8') as f:
            self.api_key = f.read().strip()
        # NOTE(review): self.headers and self.api_url are set here but
        # _make_request rebuilds its own headers and uses BASE_URL directly,
        # so these two attributes appear unused within this class.
        self.headers = {'Authorization': f'Bearer {self.api_key}'}
        self.api_url = BASE_URL

    def _make_request(
            self,
            method: str,
            endpoint: str,
            payload: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
        """Make an API request to Hyperbolic.

        Args:
            method: 'GET' or 'POST'; any other value raises HyperbolicError.
            endpoint: path appended to BASE_URL, e.g. '/v1/marketplace/...'.
            payload: JSON body for POST requests.

        Returns:
            The decoded JSON response, or {} for a successful non-JSON body.

        Raises:
            HyperbolicError: on HTTP errors, non-2xx responses, or any
                unexpected failure (all exceptions are wrapped).
        """
        url = f'{BASE_URL}{endpoint}'
        headers = {
            'Authorization': f'Bearer {self.api_key}',
            'Content-Type': 'application/json'
        }

        # Debug logging for request
        logger.debug(f'Making {method} request to {url}')
        if payload:
            logger.debug(f'Request payload: {json.dumps(payload, indent=2)}')

        try:
            # NOTE(review): timeout is hardcoded to 120 here even though a
            # module-level TIMEOUT constant (also 120) exists — consider
            # using the constant to keep them in sync.
            if method == 'GET':
                response = requests.get(url, headers=headers, timeout=120)
            elif method == 'POST':
                response = requests.post(url,
                                         headers=headers,
                                         json=payload,
                                         timeout=120)
            else:
                raise HyperbolicError(f'Unsupported HTTP method: {method}')

            # Debug logging for response
            logger.debug(f'Response status code: {response.status_code}')
            logger.debug(f'Response headers: {dict(response.headers)}')

            # Try to parse response as JSON
            try:
                response_data = response.json()
                logger.debug(
                    f'Response body: {json.dumps(response_data, indent=2)}')
            except json.JSONDecodeError as exc:
                # If response is not JSON, use the raw text
                response_text = response.text
                logger.debug(f'Response body (raw): {response_text}')
                if not response.ok:
                    raise HyperbolicError(f'API request failed with status '
                                          f'{response.status_code}: '
                                          f'{response_text}') from exc
                # If response is OK but not JSON, return empty dict
                return {}

            if not response.ok:
                # Prefer the API's structured error message, falling back to
                # 'message' and finally the raw body text.
                error_msg = response_data.get(
                    'error', response_data.get('message', response.text))
                raise HyperbolicError(
                    f'API request failed with status {response.status_code}: '
                    f'{error_msg}')

            return response_data
        except requests.exceptions.RequestException as e:
            raise HyperbolicError(f'Request failed: {str(e)}') from e
        except Exception as e:
            # NOTE(review): this broad handler also re-wraps HyperbolicError
            # instances raised above, nesting their messages under
            # 'Unexpected error during API request'.
            raise HyperbolicError(
                f'Unexpected error during API request: {str(e)}') from e

    def launch_instance(self, gpu_model: str, gpu_count: int,
                        name: str) -> Tuple[str, str]:
        """Launch a new instance with the specified configuration.

        Blocks until the instance is ONLINE and has an SSH command.

        Args:
            gpu_model: GPU model name understood by the Hyperbolic API.
            gpu_count: number of GPUs to request (sent as a string).
            name: SkyPilot cluster name, recorded in instance metadata.

        Returns:
            (instance_id, ssh_command) for the launched instance.

        Raises:
            HyperbolicError: if the launch fails, the instance never reaches
                ONLINE, or no SSH command becomes available.
        """
        # Initialize config with basic instance info
        config = {
            'gpuModel': gpu_model,
            'gpuCount': str(gpu_count),
            'userMetadata': {
                'skypilot': {
                    'cluster_name': name,
                    'launch_time': str(int(time.time()))
                }
            }
        }

        # Injects SSH public key material into the request config.
        config = authentication.setup_hyperbolic_authentication(config)

        endpoint = '/v2/marketplace/instances/create-cheapest'
        try:
            response = self._make_request('POST', endpoint, payload=config)
            logger.debug(f'Launch response: {json.dumps(response, indent=2)}')

            # The create-cheapest endpoint returns the id under 'instanceName'.
            instance_id = response.get('instanceName')
            if not instance_id:
                logger.error(f'No instance ID in response: {response}')
                raise HyperbolicError('No instance ID returned from API')

            logger.info(f'Successfully launched instance {instance_id}, '
                        f'waiting for it to be ready...')

            # Wait for instance to be ready
            if not self.wait_for_instance(
                    instance_id, HyperbolicInstanceStatus.ONLINE.value):
                raise HyperbolicError(
                    f'Instance {instance_id} failed to reach ONLINE state')

            # Get instance details to get SSH command
            instances = self.list_instances(
                metadata={'skypilot': {
                    'cluster_name': name
                }})
            instance = instances.get(instance_id)
            if not instance:
                raise HyperbolicError(
                    f'Instance {instance_id} not found after launch')

            ssh_command = instance.get('sshCommand')
            if not ssh_command:
                logger.error(
                    f'No SSH command available for instance {instance_id}')
                raise HyperbolicError('No SSH command available for instance')

            logger.info(f'Instance {instance_id} is ready with SSH command')
            return instance_id, ssh_command

        except Exception as e:
            # NOTE(review): HyperbolicError raised above is wrapped again
            # here, producing 'Failed to launch instance: ...' prefixes on
            # already-descriptive messages.
            logger.error(f'Failed to launch instance: {str(e)}')
            raise HyperbolicError(f'Failed to launch instance: {str(e)}') from e

    def list_instances(
        self,
        status: Optional[str] = None,
        metadata: Optional[Dict[str, Dict[str, str]]] = None
    ) -> Dict[str, Dict[str, Any]]:
        """List all instances, optionally filtered by status and metadata.

        Args:
            status: if given, only instances whose (lowercased) status
                matches are returned.
            metadata: if given, expects {'skypilot': {'cluster_name': ...}};
                instances are matched by cluster-name PREFIX (startswith),
                not exact equality.

        Returns:
            Mapping of instance id -> flattened instance-details dict.

        Raises:
            HyperbolicError: if the underlying API request fails.
        """
        endpoint = '/v1/marketplace/instances'
        try:
            response = self._make_request('GET', endpoint)
            logger.debug(f'Raw API response: {json.dumps(response, indent=2)}')
            instances = {}
            for instance in response.get('instances', []):
                # Each entry nests the real details under 'instance'.
                instance_info = instance.get('instance', {})
                current_status = instance_info.get('status')
                logger.debug(
                    f'Instance {instance.get("id")} status: {current_status}')

                # Convert raw status to enum
                try:
                    instance_status = HyperbolicInstanceStatus.from_raw_status(
                        current_status)
                except HyperbolicError as e:
                    # Instances with unparseable statuses are skipped, not
                    # surfaced as errors.
                    logger.warning(f'Failed to parse status for instance '
                                   f'{instance.get("id")}: {e}')
                    continue

                if status and instance_status.value != status.lower():
                    continue

                if metadata:
                    skypilot_metadata: Dict[str,
                                            str] = metadata.get('skypilot', {})
                    cluster_name = skypilot_metadata.get('cluster_name', '')
                    instance_skypilot = instance.get('userMetadata',
                                                     {}).get('skypilot', {})
                    # Prefix match: an empty cluster_name matches everything.
                    if not instance_skypilot.get('cluster_name',
                                                 '').startswith(cluster_name):
                        logger.debug(
                            f'Skipping instance {instance.get("id")} - '
                            f'skypilot metadata {instance_skypilot} '
                            f'does not match {skypilot_metadata}')
                        continue
                    logger.debug(f'Including instance {instance.get("id")} '
                                 f'- skypilot metadata matches')

                hardware = instance_info.get('hardware', {})
                instances[instance.get('id')] = {
                    'id': instance.get('id'),
                    'created': instance.get('created'),
                    'sshCommand': instance.get('sshCommand'),
                    'status': instance_status.value,
                    'gpu_count': instance_info.get('gpu_count'),
                    'gpus_total': instance_info.get('gpus_total'),
                    'owner': instance_info.get('owner'),
                    'cpus': hardware.get('cpus'),
                    'gpus': hardware.get('gpus'),
                    'ram': hardware.get('ram'),
                    'storage': hardware.get('storage'),
                    'pricing': instance_info.get('pricing'),
                    'metadata': instance.get('userMetadata', {})
                }
            return instances
        except Exception as e:
            raise HyperbolicError(f'Failed to list instances: {str(e)}') from e

    def terminate_instance(self, instance_id: str) -> None:
        """Terminate an instance by ID.

        Raises:
            HyperbolicError: if the terminate request fails.
        """
        endpoint = '/v1/marketplace/instances/terminate'
        data = {'id': instance_id}
        try:
            self._make_request('POST', endpoint, payload=data)
        except Exception as e:
            raise HyperbolicError(
                f'Failed to terminate instance {instance_id}: {str(e)}') from e

    def wait_for_instance(self,
                          instance_id: str,
                          target_status: str,
                          timeout: int = TIMEOUT) -> bool:
        """Wait for an instance to reach a specific status.

        Polls list_instances every 5 seconds until the instance reports
        the target status AND has an SSH command, or until timeout.

        Args:
            instance_id: id of the instance to watch.
            target_status: raw status string, validated via from_raw_status.
            timeout: max seconds to wait (default TIMEOUT).

        Returns:
            True on success; False on timeout or a terminal status
            (failed / error / terminated).
        """
        start_time = time.time()
        target_status_enum = HyperbolicInstanceStatus.from_raw_status(
            target_status)
        logger.info(
            f'Waiting for instance {instance_id} '
            f'to reach status {target_status_enum.value} and have SSH command')

        while True:
            elapsed = time.time() - start_time
            if elapsed >= timeout:
                logger.error(f'Timeout after {int(elapsed)}s '
                             f'waiting for instance {instance_id}')
                return False

            try:
                instances = self.list_instances()
                instance = instances.get(instance_id)

                if not instance:
                    # Instance may not be visible yet right after launch.
                    logger.warning(f'Instance {instance_id} not found')
                    time.sleep(5)
                    continue

                current_status = instance.get('status', '').lower()
                ssh_command = instance.get('sshCommand')
                logger.debug(f'Current status: {current_status}, '
                             f'Target status: {target_status_enum.value}, '
                             f'SSH command: {ssh_command}')

                # Success requires both the status match and an SSH command.
                if current_status == target_status_enum.value and ssh_command:
                    logger.info(f'Instance {instance_id} reached '
                                f'target status {target_status_enum.value} '
                                f'and has SSH command after {int(elapsed)}s')
                    return True

                if current_status in ['failed', 'error', 'terminated']:
                    logger.error(f'Instance {instance_id} reached '
                                 f'terminal status: {current_status} '
                                 f'after {int(elapsed)}s')
                    return False

                time.sleep(5)
            except Exception as e:  # pylint: disable=broad-except
                # Transient API/listing errors are retried until the
                # overall timeout expires.
                logger.warning(
                    f'Error while waiting for instance {instance_id}: {str(e)}')
                time.sleep(5)
|
336
|
+
|
337
|
+
# Lazily-created module-level singleton client.
_client = None


def get_client() -> HyperbolicClient:
    """Return the shared HyperbolicClient, creating it on first use."""
    global _client
    if _client is not None:
        return _client
    _client = HyperbolicClient()
    return _client
|
347
|
+
|
348
|
+
|
349
|
+
# Backward-compatible wrapper functions
def launch_instance(gpu_model: str, gpu_count: int,
                    name: str) -> Tuple[str, str]:
    """Launch a new instance with the specified configuration."""
    client = get_client()
    return client.launch_instance(gpu_model, gpu_count, name)
|
354
|
+
|
355
|
+
|
356
|
+
def list_instances(
    status: Optional[str] = None,
    metadata: Optional[Dict[str, Dict[str, str]]] = None
) -> Dict[str, Dict[str, Any]]:
    """List all instances, optionally filtered by status and metadata."""
    client = get_client()
    return client.list_instances(status=status, metadata=metadata)
|
362
|
+
|
363
|
+
|
364
|
+
def terminate_instance(instance_id: str) -> None:
    """Terminate an instance by ID."""
    client = get_client()
    client.terminate_instance(instance_id)
|
367
|
+
|
368
|
+
|
369
|
+
def wait_for_instance(instance_id: str,
                      target_status: str,
                      timeout: int = TIMEOUT) -> bool:
    """Wait for an instance to reach a specific status."""
    client = get_client()
    return client.wait_for_instance(instance_id, target_status, timeout)
|
@@ -1277,7 +1277,8 @@ def query_instances(
|
|
1277
1277
|
except kubernetes.max_retry_error():
|
1278
1278
|
with ux_utils.print_exception_no_traceback():
|
1279
1279
|
if is_ssh:
|
1280
|
-
node_pool =
|
1280
|
+
node_pool = common_utils.removeprefix(context,
|
1281
|
+
'ssh-') if context else ''
|
1281
1282
|
msg = (
|
1282
1283
|
f'Cannot connect to SSH Node Pool {node_pool}. '
|
1283
1284
|
'Please check if the SSH Node Pool is up and accessible. '
|
@@ -133,6 +133,30 @@ DEFAULT_MAX_RETRIES = 3
|
|
133
133
|
DEFAULT_RETRY_INTERVAL_SECONDS = 1
|
134
134
|
|
135
135
|
|
136
|
+
# GCP TPU-name patterns mapped to their k8s-compatible accelerator names.
# Compiled once at import time: this function sits on hot lookup paths
# (e.g. is_tpu_on_gke, autoscaler capacity checks), so recompiling the five
# regexes on every call is wasted work.
_GCP_TO_K8S_TPU_PATTERNS = [
    (re.compile(r'^tpu-v6e-(\d+)$'), 'tpu-v6e-slice'),
    (re.compile(r'^tpu-v5p-(\d+)$'), 'tpu-v5p-slice'),
    (re.compile(r'^tpu-v5litepod-(\d+)$'), 'tpu-v5-lite-podslice'),
    (re.compile(r'^tpu-v5lite-(\d+)$'), 'tpu-v5-lite-device'),
    (re.compile(r'^tpu-v4-(\d+)$'), 'tpu-v4-podslice'),
]


def normalize_tpu_accelerator_name(accelerator: str) -> Tuple[str, int]:
    """Normalize TPU names to the k8s-compatible name and extract count.

    Examples:
        'tpu-v6e-8' -> ('tpu-v6e-slice', 8)
        'tpu-v5litepod-4' -> ('tpu-v5-lite-podslice', 4)

    Args:
        accelerator: Accelerator name, possibly in GCP TPU form
            ('tpu-<generation>-<count>').

    Returns:
        A (k8s_accelerator_name, chip_count) tuple. Names that do not match
        any known GCP TPU pattern are returned unchanged with a count of 1.
    """
    for pattern, replacement in _GCP_TO_K8S_TPU_PATTERNS:
        match = pattern.match(accelerator)
        if match:
            # The single capture group is the trailing chip count.
            count = int(match.group(1))
            return replacement, count

    # Default fallback: not a recognized GCP TPU name.
    return accelerator, 1
|
158
|
+
|
159
|
+
|
136
160
|
def _retry_on_error(max_retries=DEFAULT_MAX_RETRIES,
|
137
161
|
retry_interval=DEFAULT_RETRY_INTERVAL_SECONDS,
|
138
162
|
resource_type: Optional[str] = None):
|
@@ -427,6 +451,7 @@ class GKELabelFormatter(GPULabelFormatter):
|
|
427
451
|
|
428
452
|
e.g. tpu-v5-lite-podslice:8 -> '2x4'
|
429
453
|
"""
|
454
|
+
acc_type, acc_count = normalize_tpu_accelerator_name(acc_type)
|
430
455
|
count_to_topology = cls.GKE_TPU_TOPOLOGIES.get(acc_type,
|
431
456
|
{}).get(acc_count, None)
|
432
457
|
if count_to_topology is None:
|
@@ -461,6 +486,14 @@ class GKELabelFormatter(GPULabelFormatter):
|
|
461
486
|
raise ValueError(
|
462
487
|
f'Invalid accelerator name in GKE cluster: {value}')
|
463
488
|
|
489
|
+
@classmethod
def validate_label_value(cls, value: str) -> Tuple[bool, str]:
    """Check whether a GPU label value parses as a valid accelerator name.

    Returns:
        (True, '') when get_accelerator_from_label_value accepts the value,
        otherwise (False, <error message>).
    """
    try:
        cls.get_accelerator_from_label_value(value)
    except ValueError as e:
        return False, str(e)
    return True, ''
|
496
|
+
|
464
497
|
|
465
498
|
class GFDLabelFormatter(GPULabelFormatter):
|
466
499
|
"""GPU Feature Discovery label formatter
|
@@ -565,17 +598,29 @@ def detect_gpu_label_formatter(
|
|
565
598
|
for label, value in node.metadata.labels.items():
|
566
599
|
node_labels[node.metadata.name].append((label, value))
|
567
600
|
|
568
|
-
label_formatter = None
|
569
|
-
|
570
601
|
# Check if the node labels contain any of the GPU label prefixes
|
571
602
|
for lf in LABEL_FORMATTER_REGISTRY:
|
603
|
+
skip = False
|
572
604
|
for _, label_list in node_labels.items():
|
573
|
-
for label,
|
605
|
+
for label, value in label_list:
|
574
606
|
if lf.match_label_key(label):
|
575
|
-
|
576
|
-
|
607
|
+
valid, reason = lf.validate_label_value(value)
|
608
|
+
if valid:
|
609
|
+
return lf(), node_labels
|
610
|
+
else:
|
611
|
+
logger.warning(f'GPU label {label} matched for label '
|
612
|
+
f'formatter {lf.__class__.__name__}, '
|
613
|
+
f'but has invalid value {value}. '
|
614
|
+
f'Reason: {reason}. '
|
615
|
+
'Skipping...')
|
616
|
+
skip = True
|
617
|
+
break
|
618
|
+
if skip:
|
619
|
+
break
|
620
|
+
if skip:
|
621
|
+
continue
|
577
622
|
|
578
|
-
return
|
623
|
+
return None, node_labels
|
579
624
|
|
580
625
|
|
581
626
|
class Autoscaler:
|
@@ -754,6 +799,8 @@ class GKEAutoscaler(Autoscaler):
|
|
754
799
|
f'checking {node_pool_name} for TPU {requested_acc_type}:'
|
755
800
|
f'{requested_acc_count}')
|
756
801
|
if 'resourceLabels' in node_config:
|
802
|
+
requested_acc_type, requested_acc_count = normalize_tpu_accelerator_name(
|
803
|
+
requested_acc_type)
|
757
804
|
accelerator_exists = cls._node_pool_has_tpu_capacity(
|
758
805
|
node_config['resourceLabels'], machine_type,
|
759
806
|
requested_acc_type, requested_acc_count)
|
@@ -993,7 +1040,7 @@ def check_instance_fits(context: Optional[str],
|
|
993
1040
|
'Maximum resources found on a single node: '
|
994
1041
|
f'{max_cpu} CPUs, {common_utils.format_float(max_mem)}G Memory')
|
995
1042
|
|
996
|
-
def check_tpu_fits(
|
1043
|
+
def check_tpu_fits(acc_type: str, acc_count: int,
|
997
1044
|
node_list: List[Any]) -> Tuple[bool, Optional[str]]:
|
998
1045
|
"""Checks if the instance fits on the cluster based on requested TPU.
|
999
1046
|
|
@@ -1003,8 +1050,6 @@ def check_instance_fits(context: Optional[str],
|
|
1003
1050
|
node (node_tpu_chip_count) and the total TPU chips across the entire
|
1004
1051
|
podslice (topology_chip_count) are correctly handled.
|
1005
1052
|
"""
|
1006
|
-
acc_type = candidate_instance_type.accelerator_type
|
1007
|
-
acc_count = candidate_instance_type.accelerator_count
|
1008
1053
|
tpu_list_in_cluster = []
|
1009
1054
|
for node in node_list:
|
1010
1055
|
if acc_type == node.metadata.labels[
|
@@ -1055,7 +1100,8 @@ def check_instance_fits(context: Optional[str],
|
|
1055
1100
|
if is_tpu_on_gke(acc_type):
|
1056
1101
|
# If requested accelerator is a TPU type, check if the cluster
|
1057
1102
|
# has sufficient TPU resource to meet the requirement.
|
1058
|
-
|
1103
|
+
acc_type, acc_count = normalize_tpu_accelerator_name(acc_type)
|
1104
|
+
fits, reason = check_tpu_fits(acc_type, acc_count, gpu_nodes)
|
1059
1105
|
if reason is not None:
|
1060
1106
|
return fits, reason
|
1061
1107
|
else:
|
@@ -1141,8 +1187,8 @@ def get_accelerator_label_key_values(
|
|
1141
1187
|
|
1142
1188
|
is_ssh_node_pool = context.startswith('ssh-') if context else False
|
1143
1189
|
cloud_name = 'SSH Node Pool' if is_ssh_node_pool else 'Kubernetes cluster'
|
1144
|
-
context_display_name =
|
1145
|
-
context and is_ssh_node_pool) else context
|
1190
|
+
context_display_name = common_utils.removeprefix(
|
1191
|
+
context, 'ssh-') if (context and is_ssh_node_pool) else context
|
1146
1192
|
|
1147
1193
|
autoscaler_type = get_autoscaler_type()
|
1148
1194
|
if autoscaler_type is not None:
|
@@ -2911,7 +2957,8 @@ def get_skypilot_pods(context: Optional[str] = None) -> List[Any]:
|
|
2911
2957
|
|
2912
2958
|
def is_tpu_on_gke(accelerator: str) -> bool:
|
2913
2959
|
"""Determines if the given accelerator is a TPU supported on GKE."""
|
2914
|
-
|
2960
|
+
normalized, _ = normalize_tpu_accelerator_name(accelerator)
|
2961
|
+
return normalized in GKE_TPU_ACCELERATOR_TO_GENERATION
|
2915
2962
|
|
2916
2963
|
|
2917
2964
|
def get_node_accelerator_count(attribute_dict: dict) -> int:
|
sky/resources.py
CHANGED
@@ -480,7 +480,7 @@ class Resources:
|
|
480
480
|
if self.region is not None:
|
481
481
|
region_name = self.region
|
482
482
|
if self.region.startswith('ssh-'):
|
483
|
-
region_name = self.region
|
483
|
+
region_name = common_utils.removeprefix(self.region, 'ssh-')
|
484
484
|
region_str = f', region={region_name}'
|
485
485
|
zone_str = ''
|
486
486
|
if self.zone is not None:
|
@@ -1868,7 +1868,7 @@ class Resources:
|
|
1868
1868
|
not isinstance(accelerators, set)):
|
1869
1869
|
with ux_utils.print_exception_no_traceback():
|
1870
1870
|
raise ValueError(
|
1871
|
-
'Cannot specify multiple "accelerators" with
|
1871
|
+
'Cannot specify multiple "accelerators" with preferred '
|
1872
1872
|
'order (i.e., list of accelerators) with "any_of" '
|
1873
1873
|
'in resources.')
|
1874
1874
|
|