skypilot-nightly 1.0.0.dev20250910__py3-none-any.whl → 1.0.0.dev20250913__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of skypilot-nightly might be problematic.
- sky/__init__.py +4 -2
- sky/adaptors/seeweb.py +103 -0
- sky/authentication.py +38 -0
- sky/backends/backend_utils.py +148 -30
- sky/backends/cloud_vm_ray_backend.py +606 -223
- sky/catalog/__init__.py +7 -0
- sky/catalog/aws_catalog.py +4 -0
- sky/catalog/common.py +18 -0
- sky/catalog/data_fetchers/fetch_aws.py +13 -37
- sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
- sky/catalog/seeweb_catalog.py +184 -0
- sky/client/cli/command.py +2 -71
- sky/client/sdk_async.py +5 -2
- sky/clouds/__init__.py +2 -0
- sky/clouds/aws.py +23 -5
- sky/clouds/cloud.py +8 -0
- sky/clouds/kubernetes.py +2 -0
- sky/clouds/seeweb.py +463 -0
- sky/core.py +46 -12
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/{3SYxqNGnvvPS8h3gdD2T7 → Y0Q7LyrxiFoWWbTdwb5nh}/_buildManifest.js +1 -1
- sky/dashboard/out/_next/static/chunks/1141-159df2d4c441a9d1.js +1 -0
- sky/dashboard/out/_next/static/chunks/3015-2ea98b57e318bd6e.js +1 -0
- sky/dashboard/out/_next/static/chunks/3294.03e02ae73455f48e.js +6 -0
- sky/dashboard/out/_next/static/chunks/3785.0fa442e16dd3f00e.js +1 -0
- sky/dashboard/out/_next/static/chunks/5339.c033b29835da0f35.js +51 -0
- sky/dashboard/out/_next/static/chunks/6856-e0754534b3015377.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-11c8e9b982e8ffec.js +1 -0
- sky/dashboard/out/_next/static/chunks/9037-f9800e64eb05dd1c.js +6 -0
- sky/dashboard/out/_next/static/chunks/{webpack-1d7e11230da3ca89.js → webpack-d1e29b3aa66bf4cf.js} +1 -1
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/exceptions.py +5 -0
- sky/global_user_state.py +75 -26
- sky/jobs/client/sdk_async.py +4 -2
- sky/jobs/controller.py +4 -2
- sky/jobs/recovery_strategy.py +1 -1
- sky/jobs/state.py +26 -16
- sky/jobs/utils.py +67 -24
- sky/logs/agent.py +10 -2
- sky/provision/__init__.py +1 -0
- sky/provision/kubernetes/config.py +7 -2
- sky/provision/kubernetes/instance.py +84 -41
- sky/provision/kubernetes/utils.py +14 -3
- sky/provision/seeweb/__init__.py +11 -0
- sky/provision/seeweb/config.py +13 -0
- sky/provision/seeweb/instance.py +806 -0
- sky/provision/vast/instance.py +1 -1
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/schemas/generated/jobsv1_pb2.py +86 -0
- sky/schemas/generated/jobsv1_pb2.pyi +252 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
- sky/server/config.py +14 -5
- sky/server/metrics.py +41 -8
- sky/server/requests/executor.py +41 -4
- sky/server/server.py +1 -0
- sky/server/uvicorn.py +11 -5
- sky/setup_files/dependencies.py +8 -1
- sky/skylet/constants.py +14 -8
- sky/skylet/job_lib.py +128 -10
- sky/skylet/log_lib.py +14 -3
- sky/skylet/log_lib.pyi +9 -0
- sky/skylet/services.py +203 -0
- sky/skylet/skylet.py +4 -0
- sky/task.py +62 -0
- sky/templates/kubernetes-ray.yml.j2 +120 -3
- sky/templates/seeweb-ray.yml.j2 +108 -0
- sky/utils/accelerator_registry.py +3 -1
- sky/utils/command_runner.py +35 -11
- sky/utils/command_runner.pyi +22 -0
- sky/utils/context_utils.py +15 -2
- sky/utils/controller_utils.py +11 -5
- sky/utils/db/migration_utils.py +1 -1
- sky/utils/git.py +559 -1
- sky/utils/resource_checker.py +8 -7
- sky/workspaces/core.py +57 -21
- {skypilot_nightly-1.0.0.dev20250910.dist-info → skypilot_nightly-1.0.0.dev20250913.dist-info}/METADATA +40 -35
- {skypilot_nightly-1.0.0.dev20250910.dist-info → skypilot_nightly-1.0.0.dev20250913.dist-info}/RECORD +96 -85
- sky/client/cli/git.py +0 -549
- sky/dashboard/out/_next/static/chunks/1141-943efc7aff0f0c06.js +0 -1
- sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +0 -1
- sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
- sky/dashboard/out/_next/static/chunks/3785.4872a2f3aa489880.js +0 -1
- sky/dashboard/out/_next/static/chunks/5339.3fda4a4010ff4e06.js +0 -51
- sky/dashboard/out/_next/static/chunks/6856-6e2bc8a6fd0867af.js +0 -1
- sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
- sky/dashboard/out/_next/static/chunks/9037-fa1737818d0a0969.js +0 -6
- /sky/dashboard/out/_next/static/{3SYxqNGnvvPS8h3gdD2T7 → Y0Q7LyrxiFoWWbTdwb5nh}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250910.dist-info → skypilot_nightly-1.0.0.dev20250913.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250910.dist-info → skypilot_nightly-1.0.0.dev20250913.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250910.dist-info → skypilot_nightly-1.0.0.dev20250913.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250910.dist-info → skypilot_nightly-1.0.0.dev20250913.dist-info}/top_level.txt +0 -0
sky/jobs/recovery_strategy.py
CHANGED
@@ -543,7 +543,7 @@ class StrategyExecutor:
 
                 except exceptions.NoClusterLaunchedError:
                     # Update the status to PENDING during backoff.
-                    state.set_backoff_pending_async(self.job_id, self.task_id)
+                    await state.set_backoff_pending_async(self.job_id, self.task_id)
                     # Calculate the backoff time and sleep.
                     gap_seconds = (backoff.current_backoff()
                                    if self.pool is None else 1)

sky/jobs/state.py
CHANGED
@@ -238,6 +238,7 @@ def _init_db_async(func):
                 last_exc = e
                 logger.debug(f'DB error: {last_exc}')
                 await asyncio.sleep(backoff.current_backoff())
+        assert last_exc is not None
         raise last_exc
 
     return wrapper
@@ -266,6 +267,7 @@ def _init_db(func):
                 last_exc = e
                 logger.debug(f'DB error: {last_exc}')
                 time.sleep(backoff.current_backoff())
+        assert last_exc is not None
         raise last_exc
 
     return wrapper
@@ -735,16 +737,21 @@ def set_pending_cancelled(job_id: int):
         # Subquery to get the spot_job_ids that match the joined condition
         subquery = session.query(spot_table.c.job_id).join(
             job_info_table,
-            spot_table.c.spot_job_id == job_info_table.c.spot_job_id
+            spot_table.c.spot_job_id == job_info_table.c.spot_job_id
+        ).filter(
+            spot_table.c.spot_job_id == job_id,
+            spot_table.c.status == ManagedJobStatus.PENDING.value,
+            # Note: it's possible that a WAITING job actually needs to be
+            # cleaned up, if we are in the middle of an upgrade/recovery and
+            # the job is waiting to be reclaimed by a new controller. But,
+            # in this case the status will not be PENDING.
+            sqlalchemy.or_(
+                job_info_table.c.schedule_state ==
+                ManagedJobScheduleState.WAITING.value,
+                job_info_table.c.schedule_state ==
+                ManagedJobScheduleState.INACTIVE.value,
+            ),
+        ).subquery()
 
         count = session.query(spot_table).filter(
             spot_table.c.job_id.in_(subquery)).update(
@@ -1105,8 +1112,11 @@ async def set_job_id_on_pool_cluster_async(job_id: int,
     """Set the job id on the pool cluster for a job."""
     assert _SQLALCHEMY_ENGINE_ASYNC is not None
     async with sql_async.AsyncSession(_SQLALCHEMY_ENGINE_ASYNC) as session:
-        await session.execute(
+        await session.execute(
+            sqlalchemy.update(job_info_table).
+            where(job_info_table.c.spot_job_id == job_id).values({
+                job_info_table.c.job_id_on_pool_cluster: job_id_on_pool_cluster
+            }))
         await session.commit()
 
 
@@ -1130,12 +1140,12 @@ async def get_pool_submit_info_async(
         job_id: int) -> Tuple[Optional[str], Optional[int]]:
     """Get the cluster name and job id on the pool from the managed job id."""
     assert _SQLALCHEMY_ENGINE_ASYNC is not None
-    async with
+    async with sql_async.AsyncSession(_SQLALCHEMY_ENGINE_ASYNC) as session:
+        result = await session.execute(
             sqlalchemy.select(job_info_table.c.current_cluster_name,
                               job_info_table.c.job_id_on_pool_cluster).where(
-                                  job_info_table.c.spot_job_id == job_id)
+                                  job_info_table.c.spot_job_id == job_id))
+        info = result.fetchone()
     if info is None:
         return None, None
     return info[0], info[1]

sky/jobs/utils.py
CHANGED
@@ -29,6 +29,7 @@ from sky import sky_logging
 from sky import skypilot_config
 from sky.adaptors import common as adaptors_common
 from sky.backends import backend_utils
+from sky.backends import cloud_vm_ray_backend
 from sky.jobs import constants as managed_job_constants
 from sky.jobs import scheduler
 from sky.jobs import state as managed_job_state
@@ -50,12 +51,16 @@ from sky.utils import subprocess_utils
 from sky.utils import ux_utils
 
 if typing.TYPE_CHECKING:
+    import grpc
     import psutil
 
     import sky
     from sky import dag as dag_lib
+    from sky.schemas.generated import jobsv1_pb2
 else:
     psutil = adaptors_common.LazyImport('psutil')
+    grpc = adaptors_common.LazyImport('grpc')
+    jobsv1_pb2 = adaptors_common.LazyImport('sky.schemas.generated.jobsv1_pb2')
 
 logger = sky_logging.init_logger(__name__)
 
@@ -286,19 +291,34 @@ async def get_job_status(
             job_logger.info(f'Job status: {status}')
             job_logger.info('=' * 34)
             return status
-        except exceptions.CommandError
+        except (exceptions.CommandError, grpc.RpcError,
+                grpc.FutureTimeoutError) as e:
             # Retry on k8s transient network errors. This is useful when using
             # coreweave which may have transient network issue sometimes.
+            is_transient_error = False
+            detailed_reason = None
+            if isinstance(e, exceptions.CommandError):
+                detailed_reason = e.detailed_reason
+                if (detailed_reason is not None and
+                        _JOB_K8S_TRANSIENT_NW_MSG in detailed_reason):
+                    is_transient_error = True
+            elif isinstance(e, grpc.RpcError):
+                detailed_reason = e.details()
+                if e.code() in [
+                        grpc.StatusCode.UNAVAILABLE,
+                        grpc.StatusCode.DEADLINE_EXCEEDED
+                ]:
+                    is_transient_error = True
+            elif isinstance(e, grpc.FutureTimeoutError):
+                detailed_reason = 'Timeout'
+            if is_transient_error:
+                logger.info('Failed to connect to the cluster. Retrying '
+                            f'({i + 1}/{_JOB_STATUS_FETCH_MAX_RETRIES})...')
+                logger.info('=' * 34)
                 await asyncio.sleep(1)
             else:
-                job_logger.info('=' * 34)
+                logger.info(f'Failed to get job status: {detailed_reason}')
+                logger.info('=' * 34)
                 return None
     return None
 
@@ -547,9 +567,32 @@ def update_managed_jobs_statuses(job_id: Optional[int] = None):
 def get_job_timestamp(backend: 'backends.CloudVmRayBackend', cluster_name: str,
                       job_id: Optional[int], get_end_time: bool) -> float:
     """Get the submitted/ended time of the job."""
-    code = job_lib.JobLibCodeGen.get_job_submitted_or_ended_timestamp_payload(
-        job_id=job_id, get_ended_time=get_end_time)
     handle = global_user_state.get_handle_from_cluster_name(cluster_name)
+    assert handle is not None, (
+        f'handle for cluster {cluster_name!r} should not be None')
+    if handle.is_grpc_enabled_with_flag:
+        try:
+            if get_end_time:
+                end_ts_request = jobsv1_pb2.GetJobEndedTimestampRequest(
+                    job_id=job_id)
+                end_ts_response = backend_utils.invoke_skylet_with_retries(
+                    lambda: cloud_vm_ray_backend.SkyletClient(
+                        handle.get_grpc_channel()).get_job_ended_timestamp(
+                            end_ts_request))
+                return end_ts_response.timestamp
+            else:
+                submit_ts_request = jobsv1_pb2.GetJobSubmittedTimestampRequest(
+                    job_id=job_id)
+                submit_ts_response = backend_utils.invoke_skylet_with_retries(
+                    lambda: cloud_vm_ray_backend.SkyletClient(
+                        handle.get_grpc_channel()).get_job_submitted_timestamp(
+                            submit_ts_request))
+                return submit_ts_response.timestamp
+        except exceptions.SkyletMethodNotImplementedError:
+            pass
+
+    code = (job_lib.JobLibCodeGen.get_job_submitted_or_ended_timestamp_payload(
+        job_id=job_id, get_ended_time=get_end_time))
     returncode, stdout, stderr = backend.run_on_head(handle,
                                                      code,
                                                      stream_logs=False,
@@ -573,8 +616,13 @@ def try_to_get_job_end_time(backend: 'backends.CloudVmRayBackend',
                                     cluster_name,
                                     job_id=job_id,
                                     get_end_time=True)
-    except exceptions.CommandError
+    except (exceptions.CommandError, grpc.RpcError,
+            grpc.FutureTimeoutError) as e:
+        if isinstance(e, exceptions.CommandError) and e.returncode == 255 or \
+            (isinstance(e, grpc.RpcError) and e.code() in [
+                grpc.StatusCode.UNAVAILABLE,
+                grpc.StatusCode.DEADLINE_EXCEEDED,
+            ]) or isinstance(e, grpc.FutureTimeoutError):
             # Failed to connect - probably the instance was preempted since the
             # job completed. We shouldn't crash here, so just log and use the
             # current time.
@@ -586,7 +634,9 @@ def try_to_get_job_end_time(backend: 'backends.CloudVmRayBackend',
         raise
 
 
-def event_callback_func(
+def event_callback_func(
+        job_id: int, task_id: Optional[int],
+        task: Optional['sky.Task']) -> managed_job_state.AsyncCallbackType:
     """Run event callback for the task."""
 
     def callback_func(status: str):
@@ -625,17 +675,10 @@ def event_callback_func(job_id: int, task_id: int, task: 'sky.Task'):
             f'Bash:{event_callback},log_path:{log_path},result:{result}')
         logger.info(f'=== END: event callback for {status!r} ===')
 
-        # In async context
-        async def async_callback_func(status: str):
-            return await context_utils.to_thread(callback_func, status)
+    async def async_callback_func(status: str):
+        return await context_utils.to_thread(callback_func, status)
 
-    except RuntimeError:
-        # Not in async context
-        return callback_func
+    return async_callback_func
 
 
 # ======== user functions ========

sky/logs/agent.py
CHANGED
@@ -35,9 +35,17 @@ class FluentbitAgent(LoggingAgent):
                      cluster_name: resources_utils.ClusterName) -> str:
         install_cmd = (
             'if ! command -v fluent-bit >/dev/null 2>&1; then '
-            'sudo apt-get install -y gnupg; '
+            'sudo apt-get update; sudo apt-get install -y gnupg; '
             # pylint: disable=line-too-long
-            'curl https://
+            'sudo sh -c \'curl https://packages.fluentbit.io/fluentbit.key | gpg --dearmor > /usr/share/keyrings/fluentbit-keyring.gpg\'; '
+            # pylint: disable=line-too-long
+            'os_id=$(grep -oP \'(?<=^ID=).*\' /etc/os-release 2>/dev/null || lsb_release -is 2>/dev/null | tr \'[:upper:]\' \'[:lower:]\'); '
+            # pylint: disable=line-too-long
+            'codename=$(grep -oP \'(?<=VERSION_CODENAME=).*\' /etc/os-release 2>/dev/null || lsb_release -cs 2>/dev/null); '
+            # pylint: disable=line-too-long
+            'echo "deb [signed-by=/usr/share/keyrings/fluentbit-keyring.gpg] https://packages.fluentbit.io/$os_id/$codename $codename main" | sudo tee /etc/apt/sources.list.d/fluent-bit.list; '
+            'sudo apt-get update; '
+            'sudo apt-get install -y fluent-bit; '
             'fi')
         cfg = self.fluentbit_config(cluster_name)
         cfg_path = os.path.join(constants.LOGGING_CONFIG_DIR, 'fluentbit.yaml')

sky/provision/__init__.py
CHANGED
@@ -26,6 +26,7 @@ from sky.provision import nebius
 from sky.provision import oci
 from sky.provision import runpod
 from sky.provision import scp
+from sky.provision import seeweb
 from sky.provision import ssh
 from sky.provision import vast
 from sky.provision import vsphere

sky/provision/kubernetes/config.py
CHANGED
@@ -3,7 +3,7 @@ import copy
 import logging
 import math
 import os
-from typing import Any, Dict, Optional, Union
+from typing import Any, Dict, List, Optional, Union
 
 from sky.adaptors import kubernetes
 from sky.provision import common
@@ -666,4 +666,9 @@ def _configure_services(namespace: str, context: Optional[str],
 
 
 class KubernetesError(Exception):
+
+    def __init__(self,
+                 *args,
+                 insufficent_resources: Optional[List[str]] = None):
+        self.insufficent_resources = insufficent_resources
+        super().__init__(*args)

sky/provision/kubernetes/instance.py
CHANGED
@@ -3,6 +3,7 @@ import copy
 import datetime
 import json
 import re
+import sys
 import time
 from typing import Any, Callable, Dict, List, Optional, Tuple, Union
 
@@ -191,14 +192,20 @@ def _raise_pod_scheduling_errors(namespace, context, new_nodes):
                 break
         if event_message is not None:
             if pod_status == 'Pending':
+                out_of = {}
+                # key: resource name, value: (extra message, nice name)
                 if 'Insufficient cpu' in event_message:
+                    out_of['CPU'] = (': Run \'kubectl get nodes -o '
+                                     'custom-columns=NAME:.metadata.name,'
+                                     'CPU:.status.allocatable.cpu\' to check '
+                                     'the available CPUs on the node.', 'CPUs')
                 if 'Insufficient memory' in event_message:
+                    out_of['memory'] = (': Run \'kubectl get nodes -o '
+                                        'custom-columns=NAME:.metadata.name,'
+                                        'MEMORY:.status.allocatable.memory\' '
+                                        'to check the available memory on the '
+                                        'node.', 'Memory')
+
                 # TODO(aylei): after switching from smarter-device-manager to
                 # fusermount-server, we need a new way to check whether the
                 # fusermount-server daemonset is ready.
@@ -206,41 +213,77 @@ def _raise_pod_scheduling_errors(namespace, context, new_nodes):
                     key for lf in kubernetes_utils.LABEL_FORMATTER_REGISTRY
                     for key in lf.get_label_keys()
                 ]
+                for label_key in gpu_lf_keys:
+                    # TODO(romilb): We may have additional node
+                    # affinity selectors in the future - in that
+                    # case we will need to update this logic.
+                    # TODO(Doyoung): Update the error message raised
+                    # with the multi-host TPU support.
+                    gpu_resource_key = kubernetes_utils.get_gpu_resource_key(
+                        context)  # pylint: disable=line-too-long
+                    if ((f'Insufficient {gpu_resource_key}' in event_message) or
+                            ('didn\'t match Pod\'s node affinity/selector'
+                             in event_message) and pod.spec.node_selector):
+                        if 'gpu' in gpu_resource_key.lower():
+                            info_msg = (
+                                ': Run \'sky show-gpus --infra kubernetes\' to '
+                                'see the available GPUs.')
+                        else:
+                            info_msg = ': '
+                        if (pod.spec.node_selector and
+                                label_key in pod.spec.node_selector):
+                            extra_msg = (
+                                f'Verify if any node matching label '
+                                f'{pod.spec.node_selector[label_key]} and '
+                                f'sufficient resource {gpu_resource_key} '
+                                f'is available in the cluster.')
+                            extra_msg = info_msg + ' ' + extra_msg
+                        else:
+                            extra_msg = info_msg
+                        if gpu_resource_key not in out_of or len(
+                                out_of[gpu_resource_key][0]) < len(extra_msg):
+                            out_of[f'{gpu_resource_key}'] = (extra_msg, 'GPUs')
+
+                if len(out_of) > 0:
+                    # We are out of some resources. We should raise an error.
+                    rsrc_err_msg = 'Insufficient resource capacity on the '
+                    rsrc_err_msg += 'cluster:\n'
+                    out_of_keys = list(out_of.keys())
+                    for i in range(len(out_of_keys)):
+                        rsrc = out_of_keys[i]
+                        (extra_msg, nice_name) = out_of[rsrc]
+                        extra_msg = extra_msg if extra_msg else ''
+                        if i == len(out_of_keys) - 1:
+                            indent = '└──'
+                        else:
+                            indent = '├──'
+                        rsrc_err_msg += (f'{indent} Cluster does not have '
+                                         f'sufficient {nice_name} for your request'
+                                         f'{extra_msg}')
+                        if i != len(out_of_keys) - 1:
+                            rsrc_err_msg += '\n'
+
+                    # Emit the error message without logging prefixes for better UX.
+                    tmp_handler = sky_logging.EnvAwareHandler(sys.stdout)
+                    tmp_handler.flush = sys.stdout.flush
+                    tmp_handler.setFormatter(sky_logging.NO_PREFIX_FORMATTER)
+                    tmp_handler.setLevel(sky_logging.ERROR)
+                    prev_propagate = logger.propagate
+                    try:
+                        logger.addHandler(tmp_handler)
+                        logger.propagate = False
+                        logger.error(ux_utils.error_message(f'{rsrc_err_msg}'))
+                    finally:
+                        logger.removeHandler(tmp_handler)
+                        logger.propagate = prev_propagate
+                    nice_names = [out_of[rsrc][1] for rsrc in out_of_keys]
+                    raise config_lib.KubernetesError(
+                        f'{timeout_err_msg} '
+                        f'Pod status: {pod_status} '
+                        f'Details: \'{event_message}\' ',
+                        insufficent_resources=nice_names,
+                    )
+
             raise config_lib.KubernetesError(f'{timeout_err_msg} '
                                              f'Pod status: {pod_status} '
                                              f'Details: \'{event_message}\' ')

sky/provision/kubernetes/utils.py
CHANGED
@@ -3550,9 +3550,20 @@ def process_skypilot_pods(
                          f'requesting GPUs: {pod.metadata.name}')
             gpu_label = label_formatter.get_label_key()
             # Get GPU name from pod node selector
+            node_selector_terms = (
+                pod.spec.affinity.node_affinity.
+                required_during_scheduling_ignored_during_execution.
+                node_selector_terms)
+            if node_selector_terms is not None:
+                expressions = []
+                for term in node_selector_terms:
+                    if term.match_expressions:
+                        expressions.extend(term.match_expressions)
+                for expression in expressions:
+                    if expression.key == gpu_label and expression.operator == 'In':
+                        gpu_name = label_formatter.get_accelerator_from_label_value(
+                            expression.values[0])
+                        break
 
         resources = resources_lib.Resources(
             cloud=clouds.Kubernetes(),

sky/provision/seeweb/__init__.py
ADDED
@@ -0,0 +1,11 @@
+"""Seeweb provisioner for SkyPilot."""
+
+from sky.provision.seeweb.config import bootstrap_instances
+from sky.provision.seeweb.instance import cleanup_ports
+from sky.provision.seeweb.instance import get_cluster_info
+from sky.provision.seeweb.instance import open_ports
+from sky.provision.seeweb.instance import query_instances
+from sky.provision.seeweb.instance import run_instances
+from sky.provision.seeweb.instance import stop_instances
+from sky.provision.seeweb.instance import terminate_instances
+from sky.provision.seeweb.instance import wait_instances

sky/provision/seeweb/config.py
ADDED
@@ -0,0 +1,13 @@
+"""Configuration for Seeweb provisioning."""
+
+from typing import Any, Dict
+
+
+def bootstrap_instances(*args, **_kwargs) -> Dict[str, Any]:
+    """Bootstrap instances for Seeweb.
+
+    Seeweb doesn't require any special configuration bootstrapping,
+    so we just return the config as-is.
+    """
+    config = args[2]
+    return config