skypilot-nightly 1.0.0.dev20250909__py3-none-any.whl → 1.0.0.dev20250912__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (97)
  1. sky/__init__.py +2 -2
  2. sky/authentication.py +19 -4
  3. sky/backends/backend_utils.py +160 -23
  4. sky/backends/cloud_vm_ray_backend.py +226 -74
  5. sky/catalog/__init__.py +7 -0
  6. sky/catalog/aws_catalog.py +4 -0
  7. sky/catalog/common.py +18 -0
  8. sky/catalog/data_fetchers/fetch_aws.py +13 -1
  9. sky/client/cli/command.py +2 -71
  10. sky/client/sdk.py +20 -0
  11. sky/client/sdk_async.py +23 -18
  12. sky/clouds/aws.py +26 -6
  13. sky/clouds/cloud.py +8 -0
  14. sky/dashboard/out/404.html +1 -1
  15. sky/dashboard/out/_next/static/chunks/3294.ba6586f9755b0edb.js +6 -0
  16. sky/dashboard/out/_next/static/chunks/{webpack-d4fabc08788e14af.js → webpack-e8a0c4c3c6f408fb.js} +1 -1
  17. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  18. sky/dashboard/out/clusters/[cluster].html +1 -1
  19. sky/dashboard/out/clusters.html +1 -1
  20. sky/dashboard/out/config.html +1 -1
  21. sky/dashboard/out/index.html +1 -1
  22. sky/dashboard/out/infra/[context].html +1 -1
  23. sky/dashboard/out/infra.html +1 -1
  24. sky/dashboard/out/jobs/[job].html +1 -1
  25. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  26. sky/dashboard/out/jobs.html +1 -1
  27. sky/dashboard/out/users.html +1 -1
  28. sky/dashboard/out/volumes.html +1 -1
  29. sky/dashboard/out/workspace/new.html +1 -1
  30. sky/dashboard/out/workspaces/[name].html +1 -1
  31. sky/dashboard/out/workspaces.html +1 -1
  32. sky/data/storage.py +5 -1
  33. sky/execution.py +21 -14
  34. sky/global_user_state.py +34 -0
  35. sky/jobs/client/sdk_async.py +4 -2
  36. sky/jobs/constants.py +3 -0
  37. sky/jobs/controller.py +734 -310
  38. sky/jobs/recovery_strategy.py +251 -129
  39. sky/jobs/scheduler.py +247 -174
  40. sky/jobs/server/core.py +20 -4
  41. sky/jobs/server/utils.py +2 -2
  42. sky/jobs/state.py +709 -508
  43. sky/jobs/utils.py +90 -40
  44. sky/logs/agent.py +10 -2
  45. sky/provision/aws/config.py +4 -1
  46. sky/provision/gcp/config.py +6 -1
  47. sky/provision/kubernetes/config.py +7 -2
  48. sky/provision/kubernetes/instance.py +84 -41
  49. sky/provision/kubernetes/utils.py +17 -8
  50. sky/provision/provisioner.py +1 -0
  51. sky/provision/vast/instance.py +1 -1
  52. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  53. sky/serve/replica_managers.py +0 -7
  54. sky/serve/serve_utils.py +5 -0
  55. sky/serve/server/impl.py +1 -2
  56. sky/serve/service.py +0 -2
  57. sky/server/common.py +8 -3
  58. sky/server/config.py +55 -27
  59. sky/server/constants.py +1 -0
  60. sky/server/daemons.py +7 -11
  61. sky/server/metrics.py +41 -8
  62. sky/server/requests/executor.py +41 -4
  63. sky/server/requests/serializers/encoders.py +1 -1
  64. sky/server/server.py +9 -1
  65. sky/server/uvicorn.py +11 -5
  66. sky/setup_files/dependencies.py +4 -2
  67. sky/skylet/attempt_skylet.py +1 -0
  68. sky/skylet/constants.py +14 -7
  69. sky/skylet/events.py +2 -10
  70. sky/skylet/log_lib.py +11 -0
  71. sky/skylet/log_lib.pyi +9 -0
  72. sky/task.py +62 -0
  73. sky/templates/kubernetes-ray.yml.j2 +120 -3
  74. sky/utils/accelerator_registry.py +3 -1
  75. sky/utils/command_runner.py +35 -11
  76. sky/utils/command_runner.pyi +25 -3
  77. sky/utils/common_utils.py +11 -1
  78. sky/utils/context_utils.py +15 -2
  79. sky/utils/controller_utils.py +5 -0
  80. sky/utils/db/db_utils.py +31 -2
  81. sky/utils/db/migration_utils.py +1 -1
  82. sky/utils/git.py +559 -1
  83. sky/utils/resource_checker.py +8 -7
  84. sky/utils/rich_utils.py +3 -1
  85. sky/utils/subprocess_utils.py +9 -0
  86. sky/volumes/volume.py +2 -0
  87. sky/workspaces/core.py +57 -21
  88. {skypilot_nightly-1.0.0.dev20250909.dist-info → skypilot_nightly-1.0.0.dev20250912.dist-info}/METADATA +38 -36
  89. {skypilot_nightly-1.0.0.dev20250909.dist-info → skypilot_nightly-1.0.0.dev20250912.dist-info}/RECORD +95 -95
  90. sky/client/cli/git.py +0 -549
  91. sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
  92. /sky/dashboard/out/_next/static/{eWytLgin5zvayQw3Xk46m → DAiq7V2xJnO1LSfmunZl6}/_buildManifest.js +0 -0
  93. /sky/dashboard/out/_next/static/{eWytLgin5zvayQw3Xk46m → DAiq7V2xJnO1LSfmunZl6}/_ssgManifest.js +0 -0
  94. {skypilot_nightly-1.0.0.dev20250909.dist-info → skypilot_nightly-1.0.0.dev20250912.dist-info}/WHEEL +0 -0
  95. {skypilot_nightly-1.0.0.dev20250909.dist-info → skypilot_nightly-1.0.0.dev20250912.dist-info}/entry_points.txt +0 -0
  96. {skypilot_nightly-1.0.0.dev20250909.dist-info → skypilot_nightly-1.0.0.dev20250912.dist-info}/licenses/LICENSE +0 -0
  97. {skypilot_nightly-1.0.0.dev20250909.dist-info → skypilot_nightly-1.0.0.dev20250912.dist-info}/top_level.txt +0 -0
sky/jobs/utils.py CHANGED
@@ -4,9 +4,11 @@ NOTE: whenever an API change is made in this file, we need to bump the
 jobs.constants.MANAGED_JOBS_VERSION and handle the API change in the
 ManagedJobCodeGen.
 """
+import asyncio
 import collections
 import datetime
 import enum
+import logging
 import os
 import pathlib
 import shlex
@@ -14,11 +16,11 @@ import textwrap
 import time
 import traceback
 import typing
-from typing import Any, Deque, Dict, List, Optional, Set, TextIO, Tuple, Union
+from typing import (Any, Deque, Dict, List, Literal, Optional, Set, TextIO,
+                    Tuple, Union)

 import colorama
 import filelock
-from typing_extensions import Literal

 from sky import backends
 from sky import exceptions
@@ -37,6 +39,7 @@ from sky.usage import usage_lib
 from sky.utils import annotations
 from sky.utils import command_runner
 from sky.utils import common_utils
+from sky.utils import context_utils
 from sky.utils import controller_utils
 from sky.utils import infra_utils
 from sky.utils import log_utils
@@ -56,9 +59,9 @@ else:

 logger = sky_logging.init_logger(__name__)

-SIGNAL_FILE_PREFIX = '/tmp/sky_jobs_controller_signal_{}'
 # Controller checks its job's status every this many seconds.
-JOB_STATUS_CHECK_GAP_SECONDS = 20
+# This is a tradeoff between the latency and the resource usage.
+JOB_STATUS_CHECK_GAP_SECONDS = 15

 # Controller checks if its job has started every this many seconds.
 JOB_STARTED_STATUS_CHECK_GAP_SECONDS = 5
@@ -82,7 +85,7 @@ _JOB_CANCELLED_MESSAGE = (
 # blocking for a long time. This should be significantly longer than the
 # JOB_STATUS_CHECK_GAP_SECONDS to avoid timing out before the controller can
 # update the state.
-_FINAL_JOB_STATUS_WAIT_TIMEOUT_SECONDS = 40
+_FINAL_JOB_STATUS_WAIT_TIMEOUT_SECONDS = 120


 class ManagedJobQueueResultType(enum.Enum):
@@ -99,7 +102,11 @@ class UserSignal(enum.Enum):


 # ====== internal functions ======
-def terminate_cluster(cluster_name: str, max_retry: int = 6) -> None:
+def terminate_cluster(
+    cluster_name: str,
+    max_retry: int = 6,
+    _logger: logging.Logger = logger,  # pylint: disable=invalid-name
+) -> None:
     """Terminate the cluster."""
     from sky import core  # pylint: disable=import-outside-toplevel
     retry_cnt = 0
@@ -122,18 +129,18 @@ def terminate_cluster(cluster_name: str, max_retry: int = 6) -> None:
             return
         except exceptions.ClusterDoesNotExist:
             # The cluster is already down.
-            logger.debug(f'The cluster {cluster_name} is already down.')
+            _logger.debug(f'The cluster {cluster_name} is already down.')
             return
         except Exception as e:  # pylint: disable=broad-except
             retry_cnt += 1
             if retry_cnt >= max_retry:
                 raise RuntimeError(
                     f'Failed to terminate the cluster {cluster_name}.') from e
-            logger.error(
+            _logger.error(
                 f'Failed to terminate the cluster {cluster_name}. Retrying.'
                 f'Details: {common_utils.format_exception(e)}')
             with ux_utils.enable_traceback():
-                logger.error(f'  Traceback: {traceback.format_exc()}')
+                _logger.error(f'  Traceback: {traceback.format_exc()}')
             time.sleep(backoff.current_backoff())


@@ -183,6 +190,9 @@ def _validate_consolidation_mode_config(
 # Use LRU Cache so that the check is only done once.
 @annotations.lru_cache(scope='request', maxsize=1)
 def is_consolidation_mode() -> bool:
+    if os.environ.get(constants.OVERRIDE_CONSOLIDATION_MODE) is not None:
+        return True
+
     consolidation_mode = skypilot_config.get_nested(
         ('jobs', 'controller', 'consolidation_mode'), default_value=False)
     # We should only do this check on API server, as the controller will not
@@ -199,6 +209,7 @@ def ha_recovery_for_consolidation_mode():
     # already has all runtime installed. Directly start jobs recovery here.
     # Refers to sky/templates/kubernetes-ray.yml.j2 for more details.
     runner = command_runner.LocalProcessCommandRunner()
+    scheduler.maybe_start_controllers()
     with open(constants.HA_PERSISTENT_RECOVERY_LOG_PATH.format('jobs_'),
               'w',
               encoding='utf-8') as f:
@@ -214,7 +225,7 @@ def ha_recovery_for_consolidation_mode():
             # just keep running.
             if controller_pid is not None:
                 try:
-                    if _controller_process_alive(controller_pid, job_id):
+                    if controller_process_alive(controller_pid, job_id):
                         f.write(f'Controller pid {controller_pid} for '
                                 f'job {job_id} is still running. '
                                 'Skipping recovery.\n')
@@ -227,7 +238,7 @@ def ha_recovery_for_consolidation_mode():

             if job['schedule_state'] not in [
                     managed_job_state.ManagedJobScheduleState.DONE,
-                    managed_job_state.ManagedJobScheduleState.WAITING
+                    managed_job_state.ManagedJobScheduleState.WAITING,
             ]:
                 script = managed_job_state.get_ha_recovery_script(job_id)
                 if script is None:
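
Note: the hunk that follows converts get_job_status into a coroutine, offloading blocking calls through context_utils.to_thread (a SkyPilot helper) and sleeping with asyncio.sleep so a single process can drive many job controllers. A minimal stdlib-only sketch of the same offloading pattern, with illustrative names (fetch_status_blocking, poll_status, 'my-cluster') that do not exist in SkyPilot:

    import asyncio
    import time


    def fetch_status_blocking(cluster_name: str) -> str:
        """Stand-in for a blocking backend call (e.g. an SSH round trip)."""
        time.sleep(1)
        return f'{cluster_name}: RUNNING'


    async def poll_status(cluster_name: str, interval: float = 15.0) -> None:
        """Poll a cluster without blocking the event loop running other tasks."""
        for _ in range(3):
            # Run the blocking call in a worker thread and await its result.
            status = await asyncio.to_thread(fetch_status_blocking, cluster_name)
            print(status)
            # asyncio.sleep (not time.sleep) lets other coroutines make progress.
            await asyncio.sleep(interval)


    if __name__ == '__main__':
        asyncio.run(poll_status('my-cluster', interval=0.1))
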
@@ -242,56 +253,66 @@ def ha_recovery_for_consolidation_mode():
         f.write(f'Total recovery time: {time.time() - start} seconds\n')


-def get_job_status(backend: 'backends.CloudVmRayBackend', cluster_name: str,
-                   job_id: Optional[int]) -> Optional['job_lib.JobStatus']:
+async def get_job_status(
+        backend: 'backends.CloudVmRayBackend', cluster_name: str,
+        job_id: Optional[int],
+        job_logger: logging.Logger) -> Optional['job_lib.JobStatus']:
     """Check the status of the job running on a managed job cluster.

     It can be None, INIT, RUNNING, SUCCEEDED, FAILED, FAILED_DRIVER,
     FAILED_SETUP or CANCELLED.
     """
-    handle = global_user_state.get_handle_from_cluster_name(cluster_name)
+    # TODO(luca) make this async
+    handle = await context_utils.to_thread(
+        global_user_state.get_handle_from_cluster_name, cluster_name)
     if handle is None:
         # This can happen if the cluster was preempted and background status
         # refresh already noticed and cleaned it up.
-        logger.info(f'Cluster {cluster_name} not found.')
+        job_logger.info(f'Cluster {cluster_name} not found.')
         return None
     assert isinstance(handle, backends.CloudVmRayResourceHandle), handle
     job_ids = None if job_id is None else [job_id]
     for i in range(_JOB_STATUS_FETCH_MAX_RETRIES):
         try:
-            logger.info('=== Checking the job status... ===')
-            statuses = backend.get_job_status(handle,
-                                              job_ids=job_ids,
-                                              stream_logs=False)
+            job_logger.info('=== Checking the job status... ===')
+            statuses = await context_utils.to_thread(backend.get_job_status,
+                                                     handle,
+                                                     job_ids=job_ids,
+                                                     stream_logs=False)
             status = list(statuses.values())[0]
             if status is None:
-                logger.info('No job found.')
+                job_logger.info('No job found.')
             else:
-                logger.info(f'Job status: {status}')
-                logger.info('=' * 34)
+                job_logger.info(f'Job status: {status}')
+                job_logger.info('=' * 34)
             return status
         except exceptions.CommandError as e:
             # Retry on k8s transient network errors. This is useful when using
             # coreweave which may have transient network issue sometimes.
             if (e.detailed_reason is not None and
                     _JOB_K8S_TRANSIENT_NW_MSG in e.detailed_reason):
-                logger.info('Failed to connect to the cluster. Retrying '
-                            f'({i + 1}/{_JOB_STATUS_FETCH_MAX_RETRIES})...')
-                logger.info('=' * 34)
-                time.sleep(1)
+                job_logger.info('Failed to connect to the cluster. Retrying '
+                                f'({i + 1}/{_JOB_STATUS_FETCH_MAX_RETRIES})...')
+                job_logger.info('=' * 34)
+                await asyncio.sleep(1)
             else:
-                logger.info(f'Failed to get job status: {e.detailed_reason}')
-                logger.info('=' * 34)
+                job_logger.info(
+                    f'Failed to get job status: {e.detailed_reason}')
+                job_logger.info('=' * 34)
                 return None
     return None


-def _controller_process_alive(pid: int, job_id: int) -> bool:
+def controller_process_alive(pid: int, job_id: int) -> bool:
     """Check if the controller process is alive."""
     try:
+        if pid < 0:
+            # new job controller process will always be negative
+            pid = -pid
         process = psutil.Process(pid)
         cmd_str = ' '.join(process.cmdline())
-        return process.is_running() and f'--job-id {job_id}' in cmd_str
+        return process.is_running() and ((f'--job-id {job_id}' in cmd_str) or
+                                         ('controller' in cmd_str))
     except psutil.NoSuchProcess:
         return False

@@ -466,7 +487,7 @@ def update_managed_jobs_statuses(job_id: Optional[int] = None):
             failure_reason = f'No controller pid set for {schedule_state.value}'
         else:
             logger.debug(f'Checking controller pid {pid}')
-            if _controller_process_alive(pid, job_id):
+            if controller_process_alive(pid, job_id):
                 # The controller is still running, so this job is fine.
                 continue

@@ -565,7 +586,9 @@ def try_to_get_job_end_time(backend: 'backends.CloudVmRayBackend',
             raise


-def event_callback_func(job_id: int, task_id: int, task: 'sky.Task'):
+def event_callback_func(
+        job_id: int, task_id: Optional[int],
+        task: Optional['sky.Task']) -> managed_job_state.AsyncCallbackType:
     """Run event callback for the task."""

     def callback_func(status: str):
@@ -604,7 +627,10 @@ def event_callback_func(job_id: int, task_id: int, task: 'sky.Task'):
             f'Bash:{event_callback},log_path:{log_path},result:{result}')
         logger.info(f'=== END: event callback for {status!r} ===')

-    return callback_func
+    async def async_callback_func(status: str):
+        return await context_utils.to_thread(callback_func, status)
+
+    return async_callback_func


 # ======== user functions ========
@@ -651,16 +677,41 @@ def cancel_jobs_by_id(job_ids: Optional[List[int]],
             logger.info(f'Job {job_id} is already in terminal state '
                         f'{job_status.value}. Skipped.')
             continue
+        elif job_status == managed_job_state.ManagedJobStatus.PENDING:
+            # the if is a short circuit, this will be atomic.
+            cancelled = managed_job_state.set_pending_cancelled(job_id)
+            if cancelled:
+                cancelled_job_ids.append(job_id)
+                continue

         update_managed_jobs_statuses(job_id)

+        job_controller_pid = managed_job_state.get_job_controller_pid(job_id)
+        if job_controller_pid is not None and job_controller_pid < 0:
+            # This is a consolidated job controller, so we need to cancel the
+            # with the controller server API
+            try:
+                # we create a file as a signal to the controller server
+                signal_file = pathlib.Path(
+                    managed_job_constants.CONSOLIDATED_SIGNAL_PATH, f'{job_id}')
+                signal_file.touch()
+                cancelled_job_ids.append(job_id)
+            except OSError as e:
+                logger.error(f'Failed to cancel job {job_id} '
+                             f'with controller server: {e}')
+                # don't add it to the to be cancelled job ids, since we don't
+                # know for sure yet.
+                continue
+            continue
+
         job_workspace = managed_job_state.get_workspace(job_id)
         if current_workspace is not None and job_workspace != current_workspace:
             wrong_workspace_job_ids.append(job_id)
             continue

         # Send the signal to the jobs controller.
-        signal_file = pathlib.Path(SIGNAL_FILE_PREFIX.format(job_id))
+        signal_file = (pathlib.Path(
+            managed_job_constants.SIGNAL_FILE_PREFIX.format(job_id)))
         # Filelock is needed to prevent race condition between signal
         # check/removal and signal writing.
         with filelock.FileLock(str(signal_file) + '.lock'):
@@ -1159,8 +1210,7 @@ def dump_managed_job_queue(
                 # It's possible for a WAITING/ALIVE_WAITING job to be ready to
                 # launch, but the scheduler just hasn't run yet.
                 managed_job_state.ManagedJobScheduleState.WAITING,
-                managed_job_state.ManagedJobScheduleState.ALIVE_WAITING,
-        ):
+                managed_job_state.ManagedJobScheduleState.ALIVE_WAITING):
             # This job will not block others.
             continue

@@ -1370,12 +1420,12 @@ def load_managed_job_queue(
     """Load job queue from json string."""
     result = message_utils.decode_payload(payload)
     result_type = ManagedJobQueueResultType.DICT
-    status_counts = {}
+    status_counts: Dict[str, int] = {}
     if isinstance(result, dict):
-        jobs = result['jobs']
-        total = result['total']
+        jobs: List[Dict[str, Any]] = result['jobs']
+        total: int = result['total']
         status_counts = result.get('status_counts', {})
-        total_no_filter = result.get('total_no_filter', total)
+        total_no_filter: int = result.get('total_no_filter', total)
     else:
         jobs = result
         total = len(jobs)
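
Note: for consolidation-mode jobs (recorded with a negative controller PID), cancel_jobs_by_id above requests cancellation by touching a file named after the job id under managed_job_constants.CONSOLIDATED_SIGNAL_PATH instead of signalling a dedicated controller process. A minimal sketch of that file-based handshake, assuming a hypothetical signal directory; the controller-side consumption shown here is illustrative and not part of this diff:

    import pathlib

    # Hypothetical directory; the real value comes from
    # sky.jobs.constants.CONSOLIDATED_SIGNAL_PATH.
    SIGNAL_DIR = pathlib.Path('/tmp/sky_consolidated_signals')


    def request_cancel(job_id: int) -> None:
        """Caller side: an empty file named after the job id is the signal."""
        SIGNAL_DIR.mkdir(parents=True, exist_ok=True)
        (SIGNAL_DIR / str(job_id)).touch()


    def consume_cancel_signal(job_id: int) -> bool:
        """Controller side (sketch): check for the file and remove it if seen."""
        signal_file = SIGNAL_DIR / str(job_id)
        if signal_file.exists():
            signal_file.unlink(missing_ok=True)
            return True
        return False
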
sky/logs/agent.py CHANGED
@@ -35,9 +35,17 @@ class FluentbitAgent(LoggingAgent):
                           cluster_name: resources_utils.ClusterName) -> str:
         install_cmd = (
             'if ! command -v fluent-bit >/dev/null 2>&1; then '
-            'sudo apt-get install -y gnupg; '
+            'sudo apt-get update; sudo apt-get install -y gnupg; '
             # pylint: disable=line-too-long
-            'curl https://raw.githubusercontent.com/fluent/fluent-bit/master/install.sh | sh; '
+            'sudo sh -c \'curl https://packages.fluentbit.io/fluentbit.key | gpg --dearmor > /usr/share/keyrings/fluentbit-keyring.gpg\'; '
+            # pylint: disable=line-too-long
+            'os_id=$(grep -oP \'(?<=^ID=).*\' /etc/os-release 2>/dev/null || lsb_release -is 2>/dev/null | tr \'[:upper:]\' \'[:lower:]\'); '
+            # pylint: disable=line-too-long
+            'codename=$(grep -oP \'(?<=VERSION_CODENAME=).*\' /etc/os-release 2>/dev/null || lsb_release -cs 2>/dev/null); '
+            # pylint: disable=line-too-long
+            'echo "deb [signed-by=/usr/share/keyrings/fluentbit-keyring.gpg] https://packages.fluentbit.io/$os_id/$codename $codename main" | sudo tee /etc/apt/sources.list.d/fluent-bit.list; '
+            'sudo apt-get update; '
+            'sudo apt-get install -y fluent-bit; '
             'fi')
         cfg = self.fluentbit_config(cluster_name)
         cfg_path = os.path.join(constants.LOGGING_CONFIG_DIR, 'fluentbit.yaml')
sky/provision/aws/config.py CHANGED
@@ -305,7 +305,10 @@ def _get_route_tables(ec2: 'mypy_boto3_ec2.ServiceResource',
     Returns:
         A list of route tables associated with the options VPC and region
     """
-    filters = [{'Name': 'association.main', 'Values': [str(main).lower()]}]
+    filters: List['ec2_type_defs.FilterTypeDef'] = [{
+        'Name': 'association.main',
+        'Values': [str(main).lower()],
+    }]
     if vpc_id is not None:
         filters.append({'Name': 'vpc-id', 'Values': [vpc_id]})
     logger.debug(
sky/provision/gcp/config.py CHANGED
@@ -5,6 +5,8 @@ import time
 import typing
 from typing import Any, Dict, List, Set, Tuple

+from typing_extensions import TypedDict
+
 from sky.adaptors import gcp
 from sky.clouds.utils import gcp_utils
 from sky.provision import common
@@ -415,6 +417,9 @@ def _configure_iam_role(config: common.ProvisionConfig, crm, iam) -> dict:
     return iam_role


+AllowedList = TypedDict('AllowedList', {'IPProtocol': str, 'ports': List[str]})
+
+
 def _check_firewall_rules(cluster_name: str, vpc_name: str, project_id: str,
                           compute):
     """Check if the firewall rules in the VPC are sufficient."""
@@ -466,7 +471,7 @@ def _check_firewall_rules(cluster_name: str, vpc_name: str, project_id: str,
     }
     """
     source2rules: Dict[Tuple[str, str], Dict[str, Set[int]]] = {}
-    source2allowed_list: Dict[Tuple[str, str], List[Dict[str, str]]] = {}
+    source2allowed_list: Dict[Tuple[str, str], List[AllowedList]] = {}
     for rule in rules:
         # Rules applied to specific VM (targetTags) may not work for the
         # current VM, so should be skipped.
sky/provision/kubernetes/config.py CHANGED
@@ -3,7 +3,7 @@ import copy
 import logging
 import math
 import os
-from typing import Any, Dict, Optional, Union
+from typing import Any, Dict, List, Optional, Union

 from sky.adaptors import kubernetes
 from sky.provision import common
@@ -666,4 +666,9 @@ def _configure_services(namespace: str, context: Optional[str],


 class KubernetesError(Exception):
-    pass
+
+    def __init__(self,
+                 *args,
+                 insufficent_resources: Optional[List[str]] = None):
+        self.insufficent_resources = insufficent_resources
+        super().__init__(*args)
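
Note: KubernetesError now carries an optional insufficent_resources list (spelling as in the source), so callers can distinguish capacity shortfalls from other scheduling failures. A sketch of how a caller might inspect it; explain_capacity_error and the provisioning call are hypothetical, and only the exception shape comes from this diff:

    from typing import List, Optional

    from sky.provision.kubernetes import config as config_lib


    def explain_capacity_error(err: config_lib.KubernetesError) -> Optional[str]:
        """Hypothetical helper: summarize which resources the cluster lacked."""
        lacking: Optional[List[str]] = getattr(err, 'insufficent_resources', None)
        if not lacking:
            return None  # Not a capacity problem, or raised without the field.
        return 'Kubernetes cluster lacks capacity for: ' + ', '.join(lacking)


    # Usage sketch (provision_pods is a placeholder for the real call site):
    # try:
    #     provision_pods(...)
    # except config_lib.KubernetesError as e:
    #     msg = explain_capacity_error(e)
    #     if msg is not None:
    #         print(msg)
    #     raise
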
sky/provision/kubernetes/instance.py CHANGED
@@ -3,6 +3,7 @@ import copy
 import datetime
 import json
 import re
+import sys
 import time
 from typing import Any, Callable, Dict, List, Optional, Tuple, Union

@@ -191,14 +192,20 @@ def _raise_pod_scheduling_errors(namespace, context, new_nodes):
                 break
         if event_message is not None:
             if pod_status == 'Pending':
-                logger.info(event_message)
+                out_of = {}
+                # key: resource name, value: (extra message, nice name)
                 if 'Insufficient cpu' in event_message:
-                    raise config_lib.KubernetesError(
-                        _lack_resource_msg('CPU', pod, details=event_message))
+                    out_of['CPU'] = (': Run \'kubectl get nodes -o '
+                                     'custom-columns=NAME:.metadata.name,'
+                                     'CPU:.status.allocatable.cpu\' to check '
+                                     'the available CPUs on the node.', 'CPUs')
                 if 'Insufficient memory' in event_message:
-                    raise config_lib.KubernetesError(
-                        _lack_resource_msg('memory', pod,
-                                           details=event_message))
+                    out_of['memory'] = (': Run \'kubectl get nodes -o '
+                                        'custom-columns=NAME:.metadata.name,'
+                                        'MEMORY:.status.allocatable.memory\' '
+                                        'to check the available memory on the '
+                                        'node.', 'Memory')
+
                 # TODO(aylei): after switching from smarter-device-manager to
                 # fusermount-server, we need a new way to check whether the
                 # fusermount-server daemonset is ready.
@@ -206,41 +213,77 @@ def _raise_pod_scheduling_errors(namespace, context, new_nodes):
                     key for lf in kubernetes_utils.LABEL_FORMATTER_REGISTRY
                     for key in lf.get_label_keys()
                 ]
-                if pod.spec.node_selector:
-                    for label_key in pod.spec.node_selector.keys():
-                        if label_key in gpu_lf_keys:
-                            # TODO(romilb): We may have additional node
-                            # affinity selectors in the future - in that
-                            # case we will need to update this logic.
-                            # TODO(Doyoung): Update the error message raised
-                            # with the multi-host TPU support.
-                            gpu_resource_key = kubernetes_utils.get_gpu_resource_key(context)  # pylint: disable=line-too-long
-                            if 'Insufficient google.com/tpu' in event_message:
-                                extra_msg = (
-                                    f'Verify if '
-                                    f'{pod.spec.node_selector[label_key]}'
-                                    ' is available in the cluster. Note '
-                                    'that multi-host TPU podslices are '
-                                    'currently not unsupported.')
-                                raise config_lib.KubernetesError(
-                                    _lack_resource_msg('TPU',
-                                                       pod,
-                                                       extra_msg,
-                                                       details=event_message))
-                            elif ((f'Insufficient {gpu_resource_key}'
-                                   in event_message) or
-                                  ('didn\'t match Pod\'s node affinity/selector'
-                                   in event_message)):
-                                extra_msg = (
-                                    f'Verify if any node matching label '
-                                    f'{pod.spec.node_selector[label_key]} and '
-                                    f'sufficient resource {gpu_resource_key} '
-                                    f'is available in the cluster.')
-                                raise config_lib.KubernetesError(
-                                    _lack_resource_msg('GPU',
-                                                       pod,
-                                                       extra_msg,
-                                                       details=event_message))
+                for label_key in gpu_lf_keys:
+                    # TODO(romilb): We may have additional node
+                    # affinity selectors in the future - in that
+                    # case we will need to update this logic.
+                    # TODO(Doyoung): Update the error message raised
+                    # with the multi-host TPU support.
+                    gpu_resource_key = kubernetes_utils.get_gpu_resource_key(
+                        context)  # pylint: disable=line-too-long
+                    if ((f'Insufficient {gpu_resource_key}' in event_message) or
+                        ('didn\'t match Pod\'s node affinity/selector'
+                         in event_message) and pod.spec.node_selector):
+                        if 'gpu' in gpu_resource_key.lower():
+                            info_msg = (
+                                ': Run \'sky show-gpus --infra kubernetes\' to '
+                                'see the available GPUs.')
+                        else:
+                            info_msg = ': '
+                        if (pod.spec.node_selector and
+                                label_key in pod.spec.node_selector):
+                            extra_msg = (
+                                f'Verify if any node matching label '
+                                f'{pod.spec.node_selector[label_key]} and '
+                                f'sufficient resource {gpu_resource_key} '
+                                f'is available in the cluster.')
+                            extra_msg = info_msg + ' ' + extra_msg
+                        else:
+                            extra_msg = info_msg
+                        if gpu_resource_key not in out_of or len(
+                                out_of[gpu_resource_key][0]) < len(extra_msg):
+                            out_of[f'{gpu_resource_key}'] = (extra_msg, 'GPUs')
+
+                if len(out_of) > 0:
+                    # We are out of some resources. We should raise an error.
+                    rsrc_err_msg = 'Insufficient resource capacity on the '
+                    rsrc_err_msg += 'cluster:\n'
+                    out_of_keys = list(out_of.keys())
+                    for i in range(len(out_of_keys)):
+                        rsrc = out_of_keys[i]
+                        (extra_msg, nice_name) = out_of[rsrc]
+                        extra_msg = extra_msg if extra_msg else ''
+                        if i == len(out_of_keys) - 1:
+                            indent = '└──'
+                        else:
+                            indent = '├──'
+                        rsrc_err_msg += (f'{indent} Cluster does not have '
+                                         f'sufficient {nice_name} for your request'
+                                         f'{extra_msg}')
+                        if i != len(out_of_keys) - 1:
+                            rsrc_err_msg += '\n'
+
+                    # Emit the error message without logging prefixes for better UX.
+                    tmp_handler = sky_logging.EnvAwareHandler(sys.stdout)
+                    tmp_handler.flush = sys.stdout.flush
+                    tmp_handler.setFormatter(sky_logging.NO_PREFIX_FORMATTER)
+                    tmp_handler.setLevel(sky_logging.ERROR)
+                    prev_propagate = logger.propagate
+                    try:
+                        logger.addHandler(tmp_handler)
+                        logger.propagate = False
+                        logger.error(ux_utils.error_message(f'{rsrc_err_msg}'))
+                    finally:
+                        logger.removeHandler(tmp_handler)
+                        logger.propagate = prev_propagate
+                    nice_names = [out_of[rsrc][1] for rsrc in out_of_keys]
+                    raise config_lib.KubernetesError(
+                        f'{timeout_err_msg} '
+                        f'Pod status: {pod_status} '
+                        f'Details: \'{event_message}\' ',
+                        insufficent_resources=nice_names,
+                    )
+
             raise config_lib.KubernetesError(f'{timeout_err_msg} '
                                              f'Pod status: {pod_status} '
                                              f'Details: \'{event_message}\' ')
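
Note: to print the capacity summary without the usual log prefixes, the hunk above temporarily attaches a prefix-free handler, disables propagation, and restores both in a finally block. EnvAwareHandler and NO_PREFIX_FORMATTER are SkyPilot internals; a stdlib-only sketch of the same pattern, with an illustrative logger name and helper:

    import logging
    import sys

    logger = logging.getLogger('sky.provision.kubernetes')  # illustrative name


    def log_clean_error(message: str) -> None:
        """Emit one message without prefixes, then restore the logger state."""
        tmp_handler = logging.StreamHandler(sys.stdout)
        tmp_handler.setFormatter(logging.Formatter('%(message)s'))  # no prefix
        tmp_handler.setLevel(logging.ERROR)
        prev_propagate = logger.propagate
        try:
            logger.addHandler(tmp_handler)
            logger.propagate = False  # avoid double-printing via parent handlers
            logger.error(message)
        finally:
            logger.removeHandler(tmp_handler)
            logger.propagate = prev_propagate
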
sky/provision/kubernetes/utils.py CHANGED
@@ -451,6 +451,9 @@ class CoreWeaveLabelFormatter(GPULabelFormatter):

     LABEL_KEY = 'gpu.nvidia.com/class'

+    # TODO (kyuds): fill in more label values for different accelerators.
+    ACC_VALUE_MAPPINGS = {'H100_NVLINK_80GB': 'H100'}
+
     @classmethod
     def get_label_key(cls, accelerator: Optional[str] = None) -> str:
         return cls.LABEL_KEY
@@ -469,7 +472,8 @@ class CoreWeaveLabelFormatter(GPULabelFormatter):

     @classmethod
     def get_accelerator_from_label_value(cls, value: str) -> str:
-        return value
+        # return original label value if not found in mappings.
+        return cls.ACC_VALUE_MAPPINGS.get(value, value)


 class GKELabelFormatter(GPULabelFormatter):
@@ -1012,15 +1016,16 @@ class GKEAutoscaler(Autoscaler):
         to fit the instance type.
         """
         for accelerator in node_pool_accelerators:
+            raw_value = accelerator['acceleratorType']
             node_accelerator_type = (
-                GKELabelFormatter.get_accelerator_from_label_value(
-                    accelerator['acceleratorType']))
+                GKELabelFormatter.get_accelerator_from_label_value(raw_value))
             # handle heterogenous nodes.
             if not node_accelerator_type:
                 continue
             node_accelerator_count = accelerator['acceleratorCount']
-            if node_accelerator_type == requested_gpu_type and int(
-                    node_accelerator_count) >= requested_gpu_count:
+            viable_names = [node_accelerator_type.lower(), raw_value.lower()]
+            if (requested_gpu_type.lower() in viable_names and
+                    int(node_accelerator_count) >= requested_gpu_count):
                 return True
         return False

@@ -1448,9 +1453,13 @@ def get_accelerator_label_key_values(
             if is_multi_host_tpu(node_metadata_labels):
                 continue
             for label, value in label_list:
-                if (label_formatter.match_label_key(label) and
-                        label_formatter.get_accelerator_from_label_value(
-                            value).lower() == acc_type.lower()):
+                if label_formatter.match_label_key(label):
+                    # match either canonicalized name or raw name
+                    accelerator = (label_formatter.
+                                   get_accelerator_from_label_value(value))
+                    viable = [value.lower(), accelerator.lower()]
+                    if acc_type.lower() not in viable:
+                        continue
                     if is_tpu_on_gke(acc_type):
                         assert isinstance(label_formatter,
                                           GKELabelFormatter)
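
Note: the label-formatter changes above canonicalize provider label values (for example 'H100_NVLINK_80GB' to 'H100') and match a requested accelerator against both the raw and the canonical name, case-insensitively. A self-contained sketch of that matching rule, assuming a plain dict in place of CoreWeaveLabelFormatter.ACC_VALUE_MAPPINGS:

    # Maps provider-specific label values to canonical accelerator names.
    ACC_VALUE_MAPPINGS = {'H100_NVLINK_80GB': 'H100'}


    def canonicalize(label_value: str) -> str:
        # Fall back to the raw label value when no mapping is known.
        return ACC_VALUE_MAPPINGS.get(label_value, label_value)


    def matches_request(label_value: str, requested: str) -> bool:
        # Accept either the raw label value or its canonical name.
        viable = {label_value.lower(), canonicalize(label_value).lower()}
        return requested.lower() in viable


    assert matches_request('H100_NVLINK_80GB', 'h100')
    assert matches_request('H100_NVLINK_80GB', 'H100_NVLINK_80GB')
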
sky/provision/provisioner.py CHANGED
@@ -526,6 +526,7 @@ def _post_provision_setup(
             status.update(
                 ux_utils.spinner_message(
                     'Checking controller version compatibility'))
+
             try:
                 server_jobs_utils.check_version_mismatch_and_non_terminal_jobs()
             except exceptions.ClusterNotUpError:
sky/provision/vast/instance.py CHANGED
@@ -39,7 +39,7 @@ def _filter_instances(cluster_name_on_cloud: str,

 def _get_head_instance_id(instances: Dict[str, Any]) -> Optional[str]:
     for inst_id, inst in instances.items():
-        if inst['name'].endswith('-head'):
+        if inst.get('name') and inst['name'].endswith('-head'):
             return inst_id
     return None

sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py ADDED
@@ -0,0 +1,34 @@
+"""Add skylet_ssh_tunnel_metadata to clusters.
+
+Revision ID: 008
+Revises: 007
+Create Date: 2025-09-09
+
+"""
+# pylint: disable=invalid-name
+from typing import Sequence, Union
+
+from alembic import op
+import sqlalchemy as sa
+
+from sky.utils.db import db_utils
+
+# revision identifiers, used by Alembic.
+revision: str = '008'
+down_revision: Union[str, Sequence[str], None] = '007'
+branch_labels: Union[str, Sequence[str], None] = None
+depends_on: Union[str, Sequence[str], None] = None
+
+
+def upgrade():
+    """Add skylet_ssh_tunnel_metadata column to clusters."""
+    with op.get_context().autocommit_block():
+        db_utils.add_column_to_table_alembic('clusters',
+                                             'skylet_ssh_tunnel_metadata',
+                                             sa.LargeBinary(),
+                                             server_default=None)
+
+
+def downgrade():
+    """No-op for backward compatibility."""
+    pass