skypilot-nightly 1.0.0.dev20250427__py3-none-any.whl → 1.0.0.dev20250429__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (73)
  1. sky/__init__.py +2 -2
  2. sky/adaptors/nebius.py +28 -40
  3. sky/backends/backend_utils.py +19 -2
  4. sky/backends/cloud_vm_ray_backend.py +33 -8
  5. sky/backends/local_docker_backend.py +1 -2
  6. sky/cli.py +91 -38
  7. sky/client/cli.py +91 -38
  8. sky/client/sdk.py +3 -2
  9. sky/clouds/aws.py +12 -6
  10. sky/clouds/azure.py +3 -0
  11. sky/clouds/cloud.py +8 -2
  12. sky/clouds/cudo.py +2 -0
  13. sky/clouds/do.py +3 -0
  14. sky/clouds/fluidstack.py +3 -0
  15. sky/clouds/gcp.py +7 -0
  16. sky/clouds/ibm.py +2 -0
  17. sky/clouds/kubernetes.py +42 -19
  18. sky/clouds/lambda_cloud.py +1 -0
  19. sky/clouds/nebius.py +18 -10
  20. sky/clouds/oci.py +6 -3
  21. sky/clouds/paperspace.py +2 -0
  22. sky/clouds/runpod.py +2 -0
  23. sky/clouds/scp.py +2 -0
  24. sky/clouds/service_catalog/constants.py +1 -1
  25. sky/clouds/service_catalog/kubernetes_catalog.py +7 -7
  26. sky/clouds/vast.py +2 -0
  27. sky/clouds/vsphere.py +2 -0
  28. sky/core.py +58 -29
  29. sky/dashboard/out/404.html +1 -1
  30. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  31. sky/dashboard/out/clusters/[cluster].html +1 -1
  32. sky/dashboard/out/clusters.html +1 -1
  33. sky/dashboard/out/favicon.ico +0 -0
  34. sky/dashboard/out/index.html +1 -1
  35. sky/dashboard/out/jobs/[job].html +1 -1
  36. sky/dashboard/out/jobs.html +1 -1
  37. sky/exceptions.py +6 -0
  38. sky/execution.py +19 -4
  39. sky/global_user_state.py +1 -0
  40. sky/optimizer.py +35 -11
  41. sky/provision/common.py +2 -5
  42. sky/provision/docker_utils.py +22 -16
  43. sky/provision/instance_setup.py +1 -1
  44. sky/provision/kubernetes/instance.py +276 -93
  45. sky/provision/kubernetes/network.py +1 -1
  46. sky/provision/kubernetes/utils.py +36 -24
  47. sky/provision/provisioner.py +6 -0
  48. sky/serve/replica_managers.py +51 -5
  49. sky/serve/serve_state.py +41 -0
  50. sky/serve/service.py +108 -63
  51. sky/server/common.py +6 -3
  52. sky/server/config.py +184 -0
  53. sky/server/requests/executor.py +17 -156
  54. sky/server/server.py +4 -4
  55. sky/setup_files/dependencies.py +0 -1
  56. sky/skylet/constants.py +7 -0
  57. sky/skypilot_config.py +27 -6
  58. sky/task.py +1 -1
  59. sky/templates/kubernetes-ray.yml.j2 +145 -15
  60. sky/templates/nebius-ray.yml.j2 +63 -0
  61. sky/utils/command_runner.py +17 -3
  62. sky/utils/command_runner.pyi +2 -0
  63. sky/utils/controller_utils.py +24 -0
  64. sky/utils/kubernetes/rsync_helper.sh +20 -4
  65. sky/utils/schemas.py +13 -0
  66. {skypilot_nightly-1.0.0.dev20250427.dist-info → skypilot_nightly-1.0.0.dev20250429.dist-info}/METADATA +2 -2
  67. {skypilot_nightly-1.0.0.dev20250427.dist-info → skypilot_nightly-1.0.0.dev20250429.dist-info}/RECORD +73 -72
  68. {skypilot_nightly-1.0.0.dev20250427.dist-info → skypilot_nightly-1.0.0.dev20250429.dist-info}/WHEEL +1 -1
  69. /sky/dashboard/out/_next/static/{kTfCjujxwqIQ4b7YvP7Uq → BMtJJ079_cyYmtW2-7nVS}/_buildManifest.js +0 -0
  70. /sky/dashboard/out/_next/static/{kTfCjujxwqIQ4b7YvP7Uq → BMtJJ079_cyYmtW2-7nVS}/_ssgManifest.js +0 -0
  71. {skypilot_nightly-1.0.0.dev20250427.dist-info → skypilot_nightly-1.0.0.dev20250429.dist-info}/entry_points.txt +0 -0
  72. {skypilot_nightly-1.0.0.dev20250427.dist-info → skypilot_nightly-1.0.0.dev20250429.dist-info}/licenses/LICENSE +0 -0
  73. {skypilot_nightly-1.0.0.dev20250427.dist-info → skypilot_nightly-1.0.0.dev20250429.dist-info}/top_level.txt +0 -0
sky/client/cli.py CHANGED
@@ -23,6 +23,7 @@ NOTE: the order of command definitions in this file corresponds to how they are
 listed in "sky --help". Take care to put logically connected commands close to
 each other.
 """
+import collections
 import copy
 import datetime
 import functools
@@ -162,7 +163,7 @@ def _get_cluster_records_and_set_ssh_config(
         '-o StrictHostKeyChecking=no '
         '-o UserKnownHostsFile=/dev/null '
         '-o IdentitiesOnly=yes '
-        '-W %h:%p '
+        '-W \'[%h]:%p\' '
         f'{handle.ssh_user}@127.0.0.1 '
         '-o ProxyCommand='
         # TODO(zhwu): write the template to a temp file, don't use
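
Note on the -W change above: OpenSSH splits the -W destination on the last colon, so a bare IPv6 literal in %h would be misparsed; bracketing the host token is harmless for IPv4 addresses and hostnames and required for IPv6. A minimal sketch of the idea (the helper name is hypothetical, not part of the diff):

    def forward_destination(host: str, port: int) -> str:
        # Brackets disambiguate host from port when the host contains colons.
        return f"-W '[{host}]:{port}'"

    assert forward_destination('::1', 22) == "-W '[::1]:22'"
    assert forward_destination('10.0.0.5', 22) == "-W '[10.0.0.5]:22'"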
@@ -3413,7 +3414,7 @@ show_gpus(

     # TODO(zhwu,romilb): We should move most of these kubernetes related
     # queries into the backend, especially behind the server.
-    def _get_kubernetes_realtime_gpu_table(
+    def _get_kubernetes_realtime_gpu_tables(
             context: Optional[str] = None,
             name_filter: Optional[str] = None,
             quantity_filter: Optional[int] = None):
@@ -3423,15 +3424,14 @@ show_gpus(
         else:
             qty_header = 'REQUESTABLE_QTY_PER_NODE'
             free_header = 'TOTAL_FREE_GPUS'
-        realtime_gpu_table = log_utils.create_table(
-            ['GPU', qty_header, 'TOTAL_GPUS', free_header])
-        realtime_gpu_availability_list = sdk.stream_and_get(
+
+        realtime_gpu_availability_lists = sdk.stream_and_get(
             sdk.realtime_kubernetes_gpu_availability(
                 context=context,
                 name_filter=name_filter,
                 quantity_filter=quantity_filter))
-        if not realtime_gpu_availability_list:
-            err_msg = 'No GPUs found in Kubernetes cluster. '
+        if not realtime_gpu_availability_lists:
+            err_msg = 'No GPUs found in any allowed Kubernetes cluster. '
             debug_msg = 'To further debug, run: sky check '
             if name_filter is not None:
                 gpu_info_msg = f' {name_filter!r}'
@@ -3439,26 +3439,52 @@ show_gpus(
                 gpu_info_msg += (' with requested quantity'
                                  f' {quantity_filter}')
             err_msg = (f'Resources{gpu_info_msg} not found '
-                       'in Kubernetes cluster. ')
+                       'in any allowed Kubernetes cluster. ')
             debug_msg = ('To show available accelerators on kubernetes,'
                          ' run: sky show-gpus --cloud kubernetes ')
             full_err_msg = (err_msg + kubernetes_constants.NO_GPU_HELP_MESSAGE +
                             debug_msg)
             raise ValueError(full_err_msg)
         no_permissions_str = '<no permissions>'
-        for realtime_gpu_availability in sorted(realtime_gpu_availability_list):
-            gpu_availability = models.RealtimeGpuAvailability(
-                *realtime_gpu_availability)
-            available_qty = (gpu_availability.available
-                             if gpu_availability.available != -1 else
-                             no_permissions_str)
-            realtime_gpu_table.add_row([
-                gpu_availability.gpu,
-                _list_to_str(gpu_availability.counts),
-                gpu_availability.capacity,
-                available_qty,
-            ])
-        return realtime_gpu_table
+        realtime_gpu_infos = []
+        total_gpu_info: Dict[str, List[int]] = collections.defaultdict(
+            lambda: [0, 0])
+
+        for (ctx, availability_list) in realtime_gpu_availability_lists:
+            realtime_gpu_table = log_utils.create_table(
+                ['GPU', qty_header, 'TOTAL_GPUS', free_header])
+            for realtime_gpu_availability in sorted(availability_list):
+                gpu_availability = models.RealtimeGpuAvailability(
+                    *realtime_gpu_availability)
+                available_qty = (gpu_availability.available
+                                 if gpu_availability.available != -1 else
+                                 no_permissions_str)
+                realtime_gpu_table.add_row([
+                    gpu_availability.gpu,
+                    _list_to_str(gpu_availability.counts),
+                    gpu_availability.capacity,
+                    available_qty,
+                ])
+                gpu = gpu_availability.gpu
+                capacity = gpu_availability.capacity
+                # we want total, so skip permission denied.
+                available = max(gpu_availability.available, 0)
+                if capacity > 0:
+                    total_gpu_info[gpu][0] += capacity
+                    total_gpu_info[gpu][1] += available
+            realtime_gpu_infos.append((ctx, realtime_gpu_table))
+
+        # display an aggregated table for all contexts
+        # if there are more than one contexts with GPUs
+        if len(realtime_gpu_infos) > 1:
+            total_realtime_gpu_table = log_utils.create_table(
+                ['GPU', 'TOTAL_GPUS', free_header])
+            for gpu, stats in total_gpu_info.items():
+                total_realtime_gpu_table.add_row([gpu, stats[0], stats[1]])
+        else:
+            total_realtime_gpu_table = None
+
+        return realtime_gpu_infos, total_realtime_gpu_table

     def _format_kubernetes_node_info(context: Optional[str]):
         node_table = log_utils.create_table(
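
Note on the aggregation above: per-context tables are kept as-is, while a defaultdict of [capacity, available] pairs accumulates cross-context totals, treating the -1 "no permissions" sentinel as zero. A self-contained sketch of the pattern (the input shape here is hypothetical; the real code unpacks models.RealtimeGpuAvailability tuples):

    import collections
    from typing import Dict, List, Tuple

    def aggregate(per_context: List[Tuple[str, List[Tuple[str, int, int]]]]
                 ) -> Dict[str, List[int]]:
        totals: Dict[str, List[int]] = collections.defaultdict(lambda: [0, 0])
        for _ctx, rows in per_context:
            for gpu, capacity, available in rows:
                available = max(available, 0)  # -1 marks "no permissions"
                if capacity > 0:
                    totals[gpu][0] += capacity
                    totals[gpu][1] += available
        return totals

    print(dict(aggregate([('ctx-a', [('H100', 8, 4)]),
                          ('ctx-b', [('H100', 8, -1)])])))
    # {'H100': [16, 4]}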
@@ -3479,7 +3505,7 @@ show_gpus(
             'Kubernetes per node accelerator availability ')
         if nodes_info.hint:
             k8s_per_node_acc_message += nodes_info.hint
-        return (f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
+        return (f'{colorama.Fore.LIGHTMAGENTA_EX}{colorama.Style.NORMAL}'
                 f'{k8s_per_node_acc_message}'
                 f'{colorama.Style.RESET_ALL}\n'
                 f'{node_table.get_string()}')
@@ -3516,8 +3542,7 @@ show_gpus(
                 # If --cloud kubernetes is not specified, we want to catch
                 # the case where no GPUs are available on the cluster and
                 # print the warning at the end.
-                k8s_realtime_table = _get_kubernetes_realtime_gpu_table(
-                    context)
+                k8s_realtime_infos, total_table = _get_kubernetes_realtime_gpu_tables(context)  # pylint: disable=line-too-long
             except ValueError as e:
                 if not cloud_is_kubernetes:
                     # Make it a note if cloud is not kubernetes
@@ -3525,13 +3550,24 @@ show_gpus(
                     k8s_messages += str(e)
             else:
                 print_section_titles = True
-                context_str = f'(Context: {context})' if context else ''
-                yield (f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
-                       f'Kubernetes GPUs {context_str}'
-                       f'{colorama.Style.RESET_ALL}\n')
-                yield from k8s_realtime_table.get_string()
-                yield '\n\n'
-                yield _format_kubernetes_node_info(context)
+
+                # print total table
+                if total_table is not None:
+                    yield (f'{colorama.Fore.GREEN}{colorama.Style.BRIGHT}'
+                           'Total Kubernetes GPUs'
+                           f'{colorama.Style.RESET_ALL}\n')
+                    yield from total_table.get_string()
+                    yield '\n-----\n\n'
+
+                # print individual infos.
+                for (ctx, k8s_realtime_table) in k8s_realtime_infos:
+                    context_str = f'(Context: {ctx})' if ctx else ''
+                    yield (f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
+                           f'Kubernetes GPUs {context_str}'
+                           f'{colorama.Style.RESET_ALL}\n')
+                    yield from k8s_realtime_table.get_string()
+                    yield '\n\n'
+                    yield _format_kubernetes_node_info(ctx) + '\n-----\n\n'
             if kubernetes_autoscaling:
                 k8s_messages += (
                     '\n' + kubernetes_utils.KUBERNETES_AUTOSCALER_NOTE)
@@ -3620,13 +3656,29 @@ show_gpus(
             # Print section title if not showing all and instead a specific
             # accelerator is requested
             print_section_titles = True
-            yield (f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
-                   f'Kubernetes GPUs{colorama.Style.RESET_ALL}\n')
             # TODO(romilb): Show filtered per node GPU availability here as well
             try:
-                k8s_realtime_table = _get_kubernetes_realtime_gpu_table(
-                    name_filter=name, quantity_filter=quantity)
-                yield from k8s_realtime_table.get_string()
+                k8s_realtime_infos, total_table = _get_kubernetes_realtime_gpu_tables(  # pylint: disable=line-too-long
+                    context=region,
+                    name_filter=name,
+                    quantity_filter=quantity)
+
+                # print total table
+                if total_table is not None:
+                    yield (f'{colorama.Fore.GREEN}{colorama.Style.BRIGHT}'
+                           'Total Kubernetes GPUs'
+                           f'{colorama.Style.RESET_ALL}\n')
+                    yield from total_table.get_string()
+                    yield '\n-----\n\n'
+
+                # print individual tables
+                for (ctx, k8s_realtime_table) in k8s_realtime_infos:
+                    context_str = f'(Context: {ctx})' if ctx else ''
+                    yield (f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
+                           f'Kubernetes GPUs {context_str}'
+                           f'{colorama.Style.RESET_ALL}\n')
+                    yield from k8s_realtime_table.get_string()
+                    yield '\n\n'
             except ValueError as e:
                 # In the case of a specific accelerator, show the error message
                 # immediately (e.g., "Resources H100 not found ...")
@@ -5911,11 +5963,12 @@ def api_info():
     user_name = os.getenv(constants.USER_ENV_VAR, getpass.getuser())
     user_hash = common_utils.get_user_hash()
     dashboard_url = server_common.get_dashboard_url(url)
-    click.echo(f'Using SkyPilot API server: {url} Dashboard: {dashboard_url}\n'
+    click.echo(f'Using SkyPilot API server: {url}\n'
               f'{ux_utils.INDENT_SYMBOL}Status: {api_server_info["status"]}, '
               f'commit: {api_server_info["commit"]}, '
               f'version: {api_server_info["version"]}\n'
-              f'{ux_utils.INDENT_LAST_SYMBOL}User: {user_name} ({user_hash})')
+              f'{ux_utils.INDENT_SYMBOL}User: {user_name} ({user_hash})\n'
+              f'{ux_utils.INDENT_LAST_SYMBOL}Dashboard: {dashboard_url}')


 def main():
sky/client/sdk.py CHANGED
@@ -1840,6 +1840,7 @@ def api_login(endpoint: Optional[str] = None) -> None:
     dashboard_url = server_common.get_dashboard_url(endpoint)
     dashboard_msg = f'Dashboard: {dashboard_url}'
     click.secho(
-        f'Logged in to SkyPilot API server at {endpoint}.'
-        f' {dashboard_msg}',
+        f'Logged into SkyPilot API server at: {endpoint}'
+        f'\n{ux_utils.INDENT_LAST_SYMBOL}{colorama.Fore.GREEN}'
+        f'{dashboard_msg}',
         fg='green')
sky/clouds/aws.py CHANGED
@@ -161,13 +161,19 @@ class AWS(clouds.Cloud):
     def _unsupported_features_for_resources(
         cls, resources: 'resources_lib.Resources'
     ) -> Dict[clouds.CloudImplementationFeatures, str]:
+        unsupported_features = {}
         if resources.use_spot:
-            return {
-                clouds.CloudImplementationFeatures.STOP:
-                    ('Stopping spot instances is currently not supported on'
-                     f' {cls._REPR}.'),
-            }
-        return {}
+            unsupported_features[clouds.CloudImplementationFeatures.STOP] = (
+                f'Stopping spot instances is currently not supported on {cls._REPR}.'
+            )
+
+        unsupported_features[
+            clouds.CloudImplementationFeatures.
+            HIGH_AVAILABILITY_CONTROLLERS] = (
+                f'High availability controllers are not supported on {cls._REPR}.'
+            )
+
+        return unsupported_features

     @classmethod
     def max_cluster_name_length(cls) -> Optional[int]:
sky/clouds/azure.py CHANGED
@@ -90,6 +90,9 @@ class Azure(clouds.Cloud):
         features = {
             clouds.CloudImplementationFeatures.CLONE_DISK_FROM_CLUSTER:
                 (f'Migrating disk is currently not supported on {cls._REPR}.'),
+            clouds.CloudImplementationFeatures.HIGH_AVAILABILITY_CONTROLLERS: (
+                f'High availability controllers are not supported on {cls._REPR}.'
+            ),
         }
         if resources.use_spot:
             features[clouds.CloudImplementationFeatures.STOP] = (
sky/clouds/cloud.py CHANGED
@@ -47,6 +47,9 @@ class CloudImplementationFeatures(enum.Enum):
     OPEN_PORTS = 'open_ports'
     STORAGE_MOUNTING = 'storage_mounting'
     HOST_CONTROLLERS = 'host_controllers'  # Can run jobs/serve controllers
+    HIGH_AVAILABILITY_CONTROLLERS = ('high_availability_controllers'
+                                    )  # Controller can auto-restart
+    AUTO_TERMINATE = 'auto_terminate'  # Pod/VM can stop or down itself
     AUTOSTOP = 'autostop'  # Pod/VM can stop itself
     AUTODOWN = 'autodown'  # Pod/VM can down itself

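Note on the new enum member above: each cloud declares the features it cannot provide, keyed by CloudImplementationFeatures, and a launch requiring an unsupported feature is rejected with the stored reason. A hedged sketch of that gate (simplified names; the real code raises exceptions.NotSupportedError from check_features_are_supported):

    import enum

    class Feature(enum.Enum):
        HIGH_AVAILABILITY_CONTROLLERS = 'high_availability_controllers'

    def check_supported(unsupported: dict, required: set) -> None:
        missing = required & set(unsupported)
        if missing:
            raise NotImplementedError(
                '; '.join(unsupported[f] for f in missing))

    unsupported = {Feature.HIGH_AVAILABILITY_CONTROLLERS:
                   'High availability controllers are not supported here.'}
    try:
        check_supported(unsupported, {Feature.HIGH_AVAILABILITY_CONTROLLERS})
    except NotImplementedError as e:
        print(e)
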
@@ -415,13 +418,16 @@ class Cloud:
         try:
             self.check_features_are_supported(resources,
                                               resources_required_features)
-        except exceptions.NotSupportedError:
+        except exceptions.NotSupportedError as e:
             # TODO(zhwu): The resources are now silently filtered out. We
             # should have some logging telling the user why the resources
             # are not considered.
+            # UPDATE(kyuds): passing in NotSupportedError reason string
+            # to hint for issue #5344. Did not remove above comment as
+            # reason is not displayed when other resources are valid.
             return resources_utils.FeasibleResources(resources_list=[],
                                                      fuzzy_candidate_list=[],
-                                                     hint=None)
+                                                     hint=str(e))
         return self._get_feasible_launchable_resources(resources)

     def _get_feasible_launchable_resources(
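
Note on the hint change above: instead of silently returning an empty result, the NotSupportedError message now rides along as the hint, so the optimizer can tell the user why a cloud was filtered out. A minimal sketch of the flow (class and names simplified from resources_utils):

    class FeasibleResources:
        def __init__(self, resources_list, fuzzy_candidate_list, hint):
            self.resources_list = resources_list
            self.fuzzy_candidate_list = fuzzy_candidate_list
            self.hint = hint

    def get_feasible(check) -> FeasibleResources:
        try:
            check()
        except Exception as e:  # NotSupportedError in the real code
            return FeasibleResources([], [], hint=str(e))
        return FeasibleResources(['<candidates>'], [], hint=None)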
sky/clouds/cudo.py CHANGED
@@ -68,6 +68,8 @@ class Cudo(clouds.Cloud):
             'Cudo Compute cannot host a controller as it does not '
             'autostopping, which will leave the controller to run indefinitely.'
         ),
+        clouds.CloudImplementationFeatures.HIGH_AVAILABILITY_CONTROLLERS:
+            ('High availability controllers are not supported on Cudo.'),
     }
     _MAX_CLUSTER_NAME_LEN_LIMIT = 60

sky/clouds/do.py CHANGED
@@ -33,6 +33,9 @@ class DO(clouds.Cloud):
         clouds.CloudImplementationFeatures.CUSTOM_DISK_TIER:
             'Custom disk tiers'
             f' is not supported in {_REPR}.',
+        clouds.CloudImplementationFeatures.HIGH_AVAILABILITY_CONTROLLERS:
+            ('High availability controllers are not supported in '
+             f'{_REPR}.'),
     }
     # DO maximum node name length defined as <= 255
     # https://docs.digitalocean.com/reference/api/api-reference/#operation/droplets_create
sky/clouds/fluidstack.py CHANGED
@@ -56,6 +56,9 @@ class Fluidstack(clouds.Cloud):
         clouds.CloudImplementationFeatures.HOST_CONTROLLERS:
             'Host controllers'
             f' are not supported in {_REPR}.',
+        clouds.CloudImplementationFeatures.HIGH_AVAILABILITY_CONTROLLERS:
+            ('High availability controllers are not supported in '
+             f'{_REPR}.'),
     }
     # Using the latest SkyPilot provisioner API to provision and check status.
     PROVISIONER_VERSION = clouds.ProvisionerVersion.SKYPILOT
sky/clouds/gcp.py CHANGED
@@ -232,6 +232,13 @@ class GCP(clouds.Cloud):
             unsupported[clouds.CloudImplementationFeatures.SPOT_INSTANCE] = (
                 'Managed Instance Group with DWS does not support '
                 'spot instances.')
+
+        unsupported[
+            clouds.CloudImplementationFeatures.
+            HIGH_AVAILABILITY_CONTROLLERS] = (
+                f'High availability controllers are not supported on {cls._REPR}.'
+            )
+
         return unsupported

     @classmethod
sky/clouds/ibm.py CHANGED
@@ -50,6 +50,8 @@ class IBM(clouds.Cloud):
         ),
         clouds.CloudImplementationFeatures.OPEN_PORTS:
             (f'Opening ports is currently not supported on {cls._REPR}.'),
+        clouds.CloudImplementationFeatures.HIGH_AVAILABILITY_CONTROLLERS:
+            ('High availability controllers are not supported on IBM.'),
     }
     if resources.use_spot:
         features[clouds.CloudImplementationFeatures.STOP] = (
sky/clouds/kubernetes.py CHANGED
@@ -429,28 +429,32 @@ class Kubernetes(clouds.Cloud):
         acc_count = k.accelerator_count if k.accelerator_count else 0
         acc_type = k.accelerator_type if k.accelerator_type else None

-        image_id_dict = resources.image_id
-        if image_id_dict is not None:
-            # Use custom image specified in resources
-            if None in image_id_dict:
-                image_id = image_id_dict[None]
-            else:
-                assert resources.region in image_id_dict, image_id_dict
-                image_id = image_id_dict[resources.region]
-            if image_id.startswith('docker:'):
-                image_id = image_id[len('docker:'):]
-        else:
-            # Select image based on whether we are using GPUs or not.
-            image_id = self.IMAGE_GPU if acc_count > 0 else self.IMAGE_CPU
-            # Get the container image ID from the service catalog.
-            image_id = service_catalog.get_image_id_from_tag(
-                image_id, clouds='kubernetes')
+        def _get_image_id(resources: 'resources_lib.Resources') -> str:
+            image_id_dict = resources.image_id
+            if image_id_dict is not None:
+                # Use custom image specified in resources
+                if None in image_id_dict:
+                    image_id = image_id_dict[None]
+                else:
+                    assert resources.region in image_id_dict, image_id_dict
+                    image_id = image_id_dict[resources.region]
+                if image_id.startswith('docker:'):
+                    image_id = image_id[len('docker:'):]
+            else:
+                # Select image based on whether we are using GPUs or not.
+                image_id = self.IMAGE_GPU if acc_count > 0 else self.IMAGE_CPU
+                # Get the container image ID from the service catalog.
+                image_id = service_catalog.get_image_id_from_tag(
+                    image_id, clouds='kubernetes')
+            return image_id
+
+        image_id = _get_image_id(resources)
         # TODO(romilb): Create a lightweight image for SSH jump host
         ssh_jump_image = service_catalog.get_image_id_from_tag(
             self.IMAGE_CPU, clouds='kubernetes')

         k8s_acc_label_key = None
-        k8s_acc_label_value = None
+        k8s_acc_label_values = None
         k8s_topology_label_key = None
         k8s_topology_label_value = None
         k8s_resource_key = None
@@ -458,9 +462,9 @@ class Kubernetes(clouds.Cloud):

         # If GPU/TPUs are requested, set node label to match the GPU/TPU type.
         if acc_count > 0 and acc_type is not None:
-            (k8s_acc_label_key, k8s_acc_label_value, k8s_topology_label_key,
+            (k8s_acc_label_key, k8s_acc_label_values, k8s_topology_label_key,
              k8s_topology_label_value) = (
-                 kubernetes_utils.get_accelerator_label_key_value(
+                 kubernetes_utils.get_accelerator_label_key_values(
                      context, acc_type, acc_count))
             if (k8s_acc_label_key ==
                     kubernetes_utils.GKELabelFormatter.TPU_LABEL_KEY):
@@ -540,6 +544,13 @@ class Kubernetes(clouds.Cloud):
             # cpus is <1.
             'num-cpus': str(max(int(cpus), 1)),
         }
+
+        # Get the storage class name for high availability controller's PVC
+        k8s_ha_storage_class_name = skypilot_config.get_nested(
+            ('kubernetes', 'high_availability', 'storage_class_name'),
+            None,
+            override_configs=resources.cluster_config_overrides)
+
         deploy_vars = {
             'instance_type': resources.instance_type,
             'custom_resources': custom_resources,
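
Note on the config lookup above: the storage class for the HA controller's PVC comes from the nested key kubernetes.high_availability.storage_class_name. A minimal sketch of a get_nested-style walk (the real skypilot_config.get_nested also merges per-resource override_configs, which this omits):

    from typing import Any, Optional, Tuple

    def get_nested(config: dict, keys: Tuple[str, ...],
                   default: Optional[Any] = None) -> Any:
        node: Any = config
        for key in keys:
            if not isinstance(node, dict) or key not in node:
                return default
            node = node[key]
        return node

    cfg = {'kubernetes': {'high_availability': {'storage_class_name': 'fast-ssd'}}}
    assert get_nested(cfg, ('kubernetes', 'high_availability',
                            'storage_class_name')) == 'fast-ssd'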
@@ -551,7 +562,7 @@ class Kubernetes(clouds.Cloud):
             'k8s_networking_mode': network_utils.get_networking_mode().value,
             'k8s_ssh_key_secret_name': self.SKY_SSH_KEY_SECRET_NAME,
             'k8s_acc_label_key': k8s_acc_label_key,
-            'k8s_acc_label_value': k8s_acc_label_value,
+            'k8s_acc_label_values': k8s_acc_label_values,
             'k8s_ssh_jump_name': self.SKY_SSH_JUMP_NAME,
             'k8s_ssh_jump_image': ssh_jump_image,
             'k8s_service_account_name': k8s_service_account_name,
@@ -574,6 +585,18 @@ class Kubernetes(clouds.Cloud):
             'skypilot_ray_port': constants.SKY_REMOTE_RAY_PORT,
             'ray_worker_start_command': instance_setup.ray_worker_start_command(
                 custom_resources, custom_ray_options, no_restart=False),
+            'k8s_high_availability_deployment_volume_mount_name':
+                (kubernetes_utils.HIGH_AVAILABILITY_DEPLOYMENT_VOLUME_MOUNT_NAME
+                ),
+            'k8s_high_availability_deployment_volume_mount_path':
+                (kubernetes_utils.HIGH_AVAILABILITY_DEPLOYMENT_VOLUME_MOUNT_PATH
+                ),
+            'k8s_high_availability_deployment_setup_script_path':
+                (constants.PERSISTENT_SETUP_SCRIPT_PATH),
+            'k8s_high_availability_deployment_run_script_dir':
+                (constants.PERSISTENT_RUN_SCRIPT_DIR),
+            'k8s_high_availability_storage_class_name':
+                (k8s_ha_storage_class_name),
         }

         # Add kubecontext if it is set. It may be None if SkyPilot is running
sky/clouds/lambda_cloud.py CHANGED
@@ -44,6 +44,7 @@ class Lambda(clouds.Cloud):
         clouds.CloudImplementationFeatures.IMAGE_ID: f'Specifying image ID is not supported in {_REPR}.',
         clouds.CloudImplementationFeatures.CUSTOM_DISK_TIER: f'Custom disk tiers are not supported in {_REPR}.',
         clouds.CloudImplementationFeatures.HOST_CONTROLLERS: f'Host controllers are not supported in {_REPR}.',
+        clouds.CloudImplementationFeatures.HIGH_AVAILABILITY_CONTROLLERS: f'High availability controllers are not supported on {_REPR}.',
     }

     PROVISIONER_VERSION = clouds.ProvisionerVersion.SKYPILOT
sky/clouds/nebius.py CHANGED
@@ -1,5 +1,4 @@
 """ Nebius Cloud. """
-import logging
 import os
 import typing
 from typing import Dict, Iterator, List, Optional, Tuple, Union
@@ -7,6 +6,7 @@ from typing import Dict, Iterator, List, Optional, Tuple, Union
 from sky import clouds
 from sky.adaptors import nebius
 from sky.clouds import service_catalog
+from sky.utils import annotations
 from sky.utils import registry
 from sky.utils import resources_utils

@@ -59,12 +59,10 @@ class Nebius(clouds.Cloud):
             ('Spot is not supported, as Nebius API does not implement spot.'),
         clouds.CloudImplementationFeatures.CLONE_DISK_FROM_CLUSTER:
             (f'Migrating disk is currently not supported on {_REPR}.'),
-        clouds.CloudImplementationFeatures.DOCKER_IMAGE:
-            (f'Docker image is currently not supported on {_REPR}. '
-             'You can try running docker command inside the '
-             '`run` section in task.yaml.'),
         clouds.CloudImplementationFeatures.CUSTOM_DISK_TIER:
             (f'Custom disk tier is currently not supported on {_REPR}.'),
+        clouds.CloudImplementationFeatures.HIGH_AVAILABILITY_CONTROLLERS:
+            ('High availability controllers are not supported on Nebius.'),
     }
     # Nebius maximum instance name length defined as <= 63 as a hostname length
     # 63 - 8 - 5 = 50 characters since
@@ -211,7 +209,8 @@ class Nebius(clouds.Cloud):
         else:
             raise RuntimeError('Unsupported instance type for Nebius cloud:'
                                f' {resources.instance_type}')
-        return {
+
+        resources_vars = {
             'instance_type': resources.instance_type,
             'custom_resources': custom_resources,
             'region': region.name,
@@ -220,6 +219,14 @@ class Nebius(clouds.Cloud):
             'zones': None,
         }

+        if acc_dict is not None:
+            # Nebius cloud's docker runtime information does not contain
+            # 'nvidia-container-runtime', causing no GPU option to be added to
+            # the docker run command. We patch this by adding it here.
+            resources_vars['docker_run_options'] = ['--gpus all']
+
+        return resources_vars
+
     def _get_feasible_launchable_resources(
         self, resources: 'resources_lib.Resources'
     ) -> 'resources_utils.FeasibleResources':
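
Note on docker_run_options above: provisioners splice these options into the docker invocation, so GPU instances on Nebius get '--gpus all' even though the runtime list lacks 'nvidia-container-runtime'. An illustrative sketch of where the option lands (command assembly simplified):

    def docker_run_command(image: str, run_options: list) -> str:
        return ' '.join(['docker', 'run', '-d', *run_options, image])

    print(docker_run_command('my-image:latest', ['--gpus all']))
    # docker run -d --gpus all my-image:latest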
@@ -275,16 +282,16 @@ class Nebius(clouds.Cloud):
                                                      fuzzy_candidate_list, None)

     @classmethod
+    @annotations.lru_cache(scope='request')
     def _check_compute_credentials(cls) -> Tuple[bool, Optional[str]]:
         """Checks if the user has access credentials to
         Nebius's compute service."""
-        logging.debug('Nebius cloud check credentials')
         token_cred_msg = (
             f'{_INDENT_PREFIX}Credentials can be set up by running: \n'
             f'{_INDENT_PREFIX} $ nebius iam get-access-token > {nebius.NEBIUS_IAM_TOKEN_PATH} \n'  # pylint: disable=line-too-long
-            f'{_INDENT_PREFIX} or generate ~/.nebius/credentials.json')
+            f'{_INDENT_PREFIX} or generate ~/.nebius/credentials.json \n')

-        tenant_msg = (f'{_INDENT_PREFIX}Copy your tenat ID from the web console and save it to file \n'  # pylint: disable=line-too-long
+        tenant_msg = (f'{_INDENT_PREFIX} Copy your tenat ID from the web console and save it to file \n'  # pylint: disable=line-too-long
                       f'{_INDENT_PREFIX} $ echo $NEBIUS_TENANT_ID_PATH > {nebius.NEBIUS_TENANT_ID_PATH} \n'  # pylint: disable=line-too-long
                       f'{_INDENT_PREFIX} Or if you have 1 tenant you can run:\n'  # pylint: disable=line-too-long
                       f'{_INDENT_PREFIX} $ nebius --format json iam whoami|jq -r \'.user_profile.tenants[0].tenant_id\' > {nebius.NEBIUS_TENANT_ID_PATH} \n')  # pylint: disable=line-too-long
@@ -301,11 +308,12 @@ class Nebius(clouds.Cloud):
         except nebius.request_error() as e:
             return False, (
                 f'{e.status} \n'  # First line is indented by 4 spaces
-                f'{token_cred_msg}'
+                f'{token_cred_msg} \n'
                 f'{tenant_msg}')
         return True, None

     @classmethod
+    @annotations.lru_cache(scope='request')
     def _check_storage_credentials(cls) -> Tuple[bool, Optional[str]]:
         """Checks if the user has access credentials to Nebius Object Storage.

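Note on the new decorators above: credential checks hit the Nebius API, and several code paths may trigger them within a single API-server request, so @annotations.lru_cache(scope='request') memoizes the result per request. A plain functools.lru_cache shows the memoization half (SkyPilot's wrapper additionally clears per request, which this sketch does not reproduce):

    import functools

    @functools.lru_cache(maxsize=1)
    def check_compute_credentials() -> tuple:
        print('hitting the cloud API once')
        return True, None

    check_compute_credentials()  # prints, then caches
    check_compute_credentials()  # served from cache
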
sky/clouds/oci.py CHANGED
@@ -69,19 +69,22 @@ class OCI(clouds.Cloud):
     def _unsupported_features_for_resources(
         cls, resources: 'resources_lib.Resources'
     ) -> Dict[clouds.CloudImplementationFeatures, str]:
-        features = {
+        unsupported_features = {
             clouds.CloudImplementationFeatures.CLONE_DISK_FROM_CLUSTER:
                 (f'Migrating disk is currently not supported on {cls._REPR}.'),
             clouds.CloudImplementationFeatures.DOCKER_IMAGE:
                 (f'Docker image is currently not supported on {cls._REPR}. '
                  'You can try running docker command inside the '
                  '`run` section in task.yaml.'),
+            clouds.CloudImplementationFeatures.HIGH_AVAILABILITY_CONTROLLERS:
+                ('High availability controllers are not supported on '
+                 f'{cls._REPR}.'),
         }
         if resources.use_spot:
-            features[clouds.CloudImplementationFeatures.STOP] = (
+            unsupported_features[clouds.CloudImplementationFeatures.STOP] = (
                 f'Stopping spot instances is currently not supported on '
                 f'{cls._REPR}.')
-        return features
+        return unsupported_features

     @classmethod
     def max_cluster_name_length(cls) -> Optional[int]:
sky/clouds/paperspace.py CHANGED
@@ -41,6 +41,8 @@ class Paperspace(clouds.Cloud):
         clouds.CloudImplementationFeatures.CUSTOM_DISK_TIER:
             'Custom disk tiers'
             f' is not supported in {_REPR}.',
+        clouds.CloudImplementationFeatures.HIGH_AVAILABILITY_CONTROLLERS:
+            (f'High availability controllers are not supported in {_REPR}.'),
     }
     _MAX_CLUSTER_NAME_LEN_LIMIT = 120
     _regions: List[clouds.Region] = []
sky/clouds/runpod.py CHANGED
@@ -34,6 +34,8 @@ class RunPod(clouds.Cloud):
         ('Mounting object stores is not supported on RunPod. To read data '
          'from object stores on RunPod, use `mode: COPY` to copy the data '
          'to local disk.'),
+        clouds.CloudImplementationFeatures.HIGH_AVAILABILITY_CONTROLLERS:
+            ('High availability controllers are not supported on RunPod.'),
     }
     _MAX_CLUSTER_NAME_LEN_LIMIT = 120
     _regions: List[clouds.Region] = []
sky/clouds/scp.py CHANGED
@@ -58,6 +58,8 @@ class SCP(clouds.Cloud):
             (f'Custom disk tiers are not supported in {_REPR}.'),
         clouds.CloudImplementationFeatures.OPEN_PORTS:
             (f'Opening ports is currently not supported on {_REPR}.'),
+        clouds.CloudImplementationFeatures.HIGH_AVAILABILITY_CONTROLLERS:
+            (f'High availability controllers are not supported on {_REPR}.'),
     }

     _INDENT_PREFIX = ' '
sky/clouds/service_catalog/constants.py CHANGED
@@ -1,6 +1,6 @@
 """Constants used for service catalog."""
 HOSTED_CATALOG_DIR_URL = 'https://raw.githubusercontent.com/skypilot-org/skypilot-catalog/master/catalogs'  # pylint: disable=line-too-long
-CATALOG_SCHEMA_VERSION = 'v6'
+CATALOG_SCHEMA_VERSION = 'v7'
 CATALOG_DIR = '~/.sky/catalogs'
 ALL_CLOUDS = ('aws', 'azure', 'gcp', 'ibm', 'lambda', 'scp', 'oci',
               'kubernetes', 'runpod', 'vast', 'vsphere', 'cudo', 'fluidstack',
sky/clouds/service_catalog/kubernetes_catalog.py CHANGED
@@ -261,16 +261,16 @@ def _list_accelerators(

             accelerators_available = accelerator_count - allocated_qty

-            # Initialize the entry if it doesn't exist yet
-            if accelerator_name not in total_accelerators_available:
-                total_accelerators_available[accelerator_name] = 0
-
             if accelerators_available >= min_quantity_filter:
                 quantized_availability = min_quantity_filter * (
                     accelerators_available // min_quantity_filter)
-                total_accelerators_available[accelerator_name] = (
-                    total_accelerators_available.get(accelerator_name, 0) +
-                    quantized_availability)
+                if quantized_availability > 0:
+                    # only increment when quantized availability is positive
+                    # to avoid assertion errors checking keyset sizes in
+                    # core.py _realtime_kubernetes_gpu_availability_single
+                    total_accelerators_available[accelerator_name] = (
+                        total_accelerators_available.get(
+                            accelerator_name, 0) + quantized_availability)

     result = []
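
Worked example of the quantization above: availability is rounded down to a multiple of min_quantity_filter, and zero results are now skipped so the aggregate dict only records GPUs that can actually satisfy the filter:

    def quantized_availability(available: int, min_quantity_filter: int) -> int:
        return min_quantity_filter * (available // min_quantity_filter)

    assert quantized_availability(7, 4) == 4   # one usable block of 4
    assert quantized_availability(3, 4) == 0   # now skipped, not recorded as 0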