skypilot-nightly 1.0.0.dev20251203__py3-none-any.whl → 1.0.0.dev20260112__py3-none-any.whl
This diff shows the changes between two publicly released versions of this package, as they appear in their public registry. It is provided for informational purposes only.
- sky/__init__.py +6 -2
- sky/adaptors/aws.py +1 -61
- sky/adaptors/slurm.py +565 -0
- sky/backends/backend_utils.py +95 -12
- sky/backends/cloud_vm_ray_backend.py +224 -65
- sky/backends/task_codegen.py +380 -4
- sky/catalog/__init__.py +0 -3
- sky/catalog/data_fetchers/fetch_gcp.py +9 -1
- sky/catalog/data_fetchers/fetch_nebius.py +1 -1
- sky/catalog/data_fetchers/fetch_vast.py +4 -2
- sky/catalog/kubernetes_catalog.py +12 -4
- sky/catalog/seeweb_catalog.py +30 -15
- sky/catalog/shadeform_catalog.py +5 -2
- sky/catalog/slurm_catalog.py +236 -0
- sky/catalog/vast_catalog.py +30 -6
- sky/check.py +25 -11
- sky/client/cli/command.py +391 -32
- sky/client/interactive_utils.py +190 -0
- sky/client/sdk.py +64 -2
- sky/client/sdk_async.py +9 -0
- sky/clouds/__init__.py +2 -0
- sky/clouds/aws.py +60 -2
- sky/clouds/azure.py +2 -0
- sky/clouds/cloud.py +7 -0
- sky/clouds/kubernetes.py +2 -0
- sky/clouds/runpod.py +38 -7
- sky/clouds/slurm.py +610 -0
- sky/clouds/ssh.py +3 -2
- sky/clouds/vast.py +39 -16
- sky/core.py +197 -37
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/3nu-b8raeKRNABZ2d4GAG/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1871-0565f8975a7dcd10.js +6 -0
- sky/dashboard/out/_next/static/chunks/2109-55a1546d793574a7.js +11 -0
- sky/dashboard/out/_next/static/chunks/2521-099b07cd9e4745bf.js +26 -0
- sky/dashboard/out/_next/static/chunks/2755.a636e04a928a700e.js +31 -0
- sky/dashboard/out/_next/static/chunks/3495.05eab4862217c1a5.js +6 -0
- sky/dashboard/out/_next/static/chunks/3785.cfc5dcc9434fd98c.js +1 -0
- sky/dashboard/out/_next/static/chunks/3850-fd5696f3bbbaddae.js +1 -0
- sky/dashboard/out/_next/static/chunks/3981.645d01bf9c8cad0c.js +21 -0
- sky/dashboard/out/_next/static/chunks/4083-0115d67c1fb57d6c.js +21 -0
- sky/dashboard/out/_next/static/chunks/{8640.5b9475a2d18c5416.js → 429.a58e9ba9742309ed.js} +2 -2
- sky/dashboard/out/_next/static/chunks/4555.8e221537181b5dc1.js +6 -0
- sky/dashboard/out/_next/static/chunks/4725.937865b81fdaaebb.js +6 -0
- sky/dashboard/out/_next/static/chunks/6082-edabd8f6092300ce.js +25 -0
- sky/dashboard/out/_next/static/chunks/6989-49cb7dca83a7a62d.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-630bd2a2257275f8.js +1 -0
- sky/dashboard/out/_next/static/chunks/7248-a99800d4db8edabd.js +1 -0
- sky/dashboard/out/_next/static/chunks/754-cfc5d4ad1b843d29.js +18 -0
- sky/dashboard/out/_next/static/chunks/8050-dd8aa107b17dce00.js +16 -0
- sky/dashboard/out/_next/static/chunks/8056-d4ae1e0cb81e7368.js +1 -0
- sky/dashboard/out/_next/static/chunks/8555.011023e296c127b3.js +6 -0
- sky/dashboard/out/_next/static/chunks/8821-93c25df904a8362b.js +1 -0
- sky/dashboard/out/_next/static/chunks/8969-0662594b69432ade.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.f15c91c97d124a5f.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-7ad6bd01858556f1.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-5a86569acad99764.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-8297476714acb4ac.js +6 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-337c3ba1085f1210.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/{clusters-ee39056f9851a3ff.js → clusters-57632ff3684a8b5c.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{config-dfb9bf07b13045f4.js → config-718cdc365de82689.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-5fd3a453c079c2ea.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-9f85c02c9c6cae9e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-90f16972cbecf354.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-2dd42fc37aad427a.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-ed806aeace26b972.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/plugins/[...slug]-449a9f5a3bb20fb3.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-bec34706b36f3524.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/{volumes-b84b948ff357c43e.js → volumes-a83ba9b38dff7ea9.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-84a40f8c7c627fe4.js → [name]-c781e9c3e52ef9fc.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-91e0942f47310aae.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-cfe59cf684ee13b9.js +1 -0
- sky/dashboard/out/_next/static/css/b0dbca28f027cc19.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/plugins/[...slug].html +1 -0
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/data_utils.py +26 -12
- sky/data/mounting_utils.py +44 -5
- sky/global_user_state.py +111 -19
- sky/jobs/client/sdk.py +8 -3
- sky/jobs/controller.py +191 -31
- sky/jobs/recovery_strategy.py +109 -11
- sky/jobs/server/core.py +81 -4
- sky/jobs/server/server.py +14 -0
- sky/jobs/state.py +417 -19
- sky/jobs/utils.py +73 -80
- sky/models.py +11 -0
- sky/optimizer.py +8 -6
- sky/provision/__init__.py +12 -9
- sky/provision/common.py +20 -0
- sky/provision/docker_utils.py +15 -2
- sky/provision/kubernetes/utils.py +163 -20
- sky/provision/kubernetes/volume.py +52 -17
- sky/provision/provisioner.py +17 -7
- sky/provision/runpod/instance.py +3 -1
- sky/provision/runpod/utils.py +13 -1
- sky/provision/runpod/volume.py +25 -9
- sky/provision/slurm/__init__.py +12 -0
- sky/provision/slurm/config.py +13 -0
- sky/provision/slurm/instance.py +618 -0
- sky/provision/slurm/utils.py +689 -0
- sky/provision/vast/instance.py +4 -1
- sky/provision/vast/utils.py +11 -6
- sky/resources.py +135 -13
- sky/schemas/api/responses.py +4 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +1 -1
- sky/schemas/db/spot_jobs/008_add_full_resources.py +34 -0
- sky/schemas/db/spot_jobs/009_job_events.py +32 -0
- sky/schemas/db/spot_jobs/010_job_events_timestamp_with_timezone.py +43 -0
- sky/schemas/db/spot_jobs/011_add_links.py +34 -0
- sky/schemas/generated/jobsv1_pb2.py +9 -5
- sky/schemas/generated/jobsv1_pb2.pyi +12 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +44 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +32 -28
- sky/schemas/generated/managed_jobsv1_pb2.pyi +11 -2
- sky/serve/serve_utils.py +232 -40
- sky/serve/server/impl.py +1 -1
- sky/server/common.py +17 -0
- sky/server/constants.py +1 -1
- sky/server/metrics.py +6 -3
- sky/server/plugins.py +238 -0
- sky/server/requests/executor.py +5 -2
- sky/server/requests/payloads.py +30 -1
- sky/server/requests/request_names.py +4 -0
- sky/server/requests/requests.py +33 -11
- sky/server/requests/serializers/encoders.py +22 -0
- sky/server/requests/serializers/return_value_serializers.py +70 -0
- sky/server/server.py +506 -109
- sky/server/server_utils.py +30 -0
- sky/server/uvicorn.py +5 -0
- sky/setup_files/MANIFEST.in +1 -0
- sky/setup_files/dependencies.py +22 -9
- sky/sky_logging.py +2 -1
- sky/skylet/attempt_skylet.py +13 -3
- sky/skylet/constants.py +55 -13
- sky/skylet/events.py +10 -4
- sky/skylet/executor/__init__.py +1 -0
- sky/skylet/executor/slurm.py +187 -0
- sky/skylet/job_lib.py +91 -5
- sky/skylet/log_lib.py +22 -6
- sky/skylet/log_lib.pyi +8 -6
- sky/skylet/services.py +18 -3
- sky/skylet/skylet.py +5 -1
- sky/skylet/subprocess_daemon.py +2 -1
- sky/ssh_node_pools/constants.py +12 -0
- sky/ssh_node_pools/core.py +40 -3
- sky/ssh_node_pools/deploy/__init__.py +4 -0
- sky/{utils/kubernetes/deploy_ssh_node_pools.py → ssh_node_pools/deploy/deploy.py} +279 -504
- sky/ssh_node_pools/deploy/tunnel/ssh-tunnel.sh +379 -0
- sky/ssh_node_pools/deploy/tunnel_utils.py +199 -0
- sky/ssh_node_pools/deploy/utils.py +173 -0
- sky/ssh_node_pools/server.py +11 -13
- sky/{utils/kubernetes/ssh_utils.py → ssh_node_pools/utils.py} +9 -6
- sky/templates/kubernetes-ray.yml.j2 +12 -6
- sky/templates/slurm-ray.yml.j2 +115 -0
- sky/templates/vast-ray.yml.j2 +1 -0
- sky/templates/websocket_proxy.py +18 -41
- sky/users/model.conf +1 -1
- sky/users/permission.py +85 -52
- sky/users/rbac.py +31 -3
- sky/utils/annotations.py +108 -8
- sky/utils/auth_utils.py +42 -0
- sky/utils/cli_utils/status_utils.py +19 -5
- sky/utils/cluster_utils.py +10 -3
- sky/utils/command_runner.py +389 -35
- sky/utils/command_runner.pyi +43 -4
- sky/utils/common_utils.py +47 -31
- sky/utils/context.py +32 -0
- sky/utils/db/db_utils.py +36 -6
- sky/utils/db/migration_utils.py +41 -21
- sky/utils/infra_utils.py +5 -1
- sky/utils/instance_links.py +139 -0
- sky/utils/interactive_utils.py +49 -0
- sky/utils/kubernetes/generate_kubeconfig.sh +42 -33
- sky/utils/kubernetes/kubernetes_deploy_utils.py +2 -94
- sky/utils/kubernetes/rsync_helper.sh +5 -1
- sky/utils/kubernetes/ssh-tunnel.sh +7 -376
- sky/utils/plugin_extensions/__init__.py +14 -0
- sky/utils/plugin_extensions/external_failure_source.py +176 -0
- sky/utils/resources_utils.py +10 -8
- sky/utils/rich_utils.py +9 -11
- sky/utils/schemas.py +93 -19
- sky/utils/status_lib.py +7 -0
- sky/utils/subprocess_utils.py +17 -0
- sky/volumes/client/sdk.py +6 -3
- sky/volumes/server/core.py +65 -27
- sky_templates/ray/start_cluster +8 -4
- {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/METADATA +67 -59
- {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/RECORD +208 -180
- sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +0 -11
- sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +0 -6
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +0 -1
- sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +0 -1
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +0 -15
- sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +0 -26
- sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +0 -1
- sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +0 -1
- sky/dashboard/out/_next/static/chunks/3800-7b45f9fbb6308557.js +0 -1
- sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +0 -1
- sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +0 -1
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +0 -15
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +0 -13
- sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +0 -1
- sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +0 -1
- sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +0 -1
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +0 -30
- sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +0 -41
- sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +0 -1
- sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +0 -1
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +0 -6
- sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +0 -1
- sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +0 -31
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +0 -30
- sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +0 -34
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +0 -21
- sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-531b2f8c4bf89f82.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +0 -1
- sky/dashboard/out/_next/static/css/0748ce22df867032.css +0 -3
- /sky/dashboard/out/_next/static/{96_E2yl3QAiIJGOYCkSpB → 3nu-b8raeKRNABZ2d4GAG}/_ssgManifest.js +0 -0
- /sky/{utils/kubernetes → ssh_node_pools/deploy/tunnel}/cleanup-tunnel.sh +0 -0
- {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/top_level.txt +0 -0
sky/client/cli/command.py
CHANGED
@@ -189,6 +189,7 @@ def _get_cluster_records_and_set_ssh_config(
         # can still exist in the record, and we check for credentials to avoid
         # updating the SSH config for non-existent clusters.
         credentials = record['credentials']
+        ips = handle.cached_external_ips
         if isinstance(handle.launched_resources.cloud, clouds.Kubernetes):
            # Replace the proxy command to proxy through the SkyPilot API
            # server with websocket.

@@ -215,12 +216,28 @@ def _get_cluster_records_and_set_ssh_config(
                f'\"{escaped_executable_path} '
                f'{escaped_websocket_proxy_path} '
                f'{server_common.get_server_url()} '
-               f'{handle.cluster_name}
+               f'{handle.cluster_name} '
+               f'kubernetes-pod-ssh-proxy\"')
+           credentials['ssh_proxy_command'] = proxy_command
+       elif isinstance(handle.launched_resources.cloud, clouds.Slurm):
+           # Replace the proxy command to proxy through the SkyPilot API
+           # server with websocket.
+           escaped_executable_path = shlex.quote(sys.executable)
+           escaped_websocket_proxy_path = shlex.quote(
+               f'{directory_utils.get_sky_dir()}/templates/websocket_proxy.py')
+           # %w is a placeholder for the node index, substituted per-node
+           # in cluster_utils.SSHConfigHelper.add_cluster().
+           proxy_command = (f'{escaped_executable_path} '
+                            f'{escaped_websocket_proxy_path} '
+                            f'{server_common.get_server_url()} '
+                            f'{handle.cluster_name} '
+                            f'slurm-job-ssh-proxy %w')
            credentials['ssh_proxy_command'] = proxy_command

        cluster_utils.SSHConfigHelper.add_cluster(
            handle.cluster_name,
-           handle.
+           handle.cluster_name_on_cloud,
+           ips,
            credentials,
            handle.cached_external_ssh_ports,
            handle.docker_user,

@@ -832,7 +849,19 @@ class _NaturalOrderGroup(click.Group):
    """

    def list_commands(self, ctx):  # pylint: disable=unused-argument
-
+       # Preserve definition order but hide aliases (same command object) and
+       # commands explicitly marked as hidden.
+       seen_commands = set()
+       names = []
+       for name, command in self.commands.items():
+           if getattr(command, 'hidden', False):
+               continue
+           command_id = id(command)
+           if command_id in seen_commands:
+               continue
+           seen_commands.add(command_id)
+           names.append(name)
+       return names

    @usage_lib.entrypoint('sky.cli', fallback=True)
    def invoke(self, ctx):

@@ -3424,7 +3453,12 @@ def _down_or_stop_clusters(
            click.echo(f' {name} ({first})')

    if failures:
-
+       failure_str = 'Cluster(s) failed. See details above.'
+       if down:
+           failure_str += (
+               ' If you want to ignore the errors and remove the '
+               'cluster(s) from the status table, use `sky down --purge`.')
+       click.echo(failure_str)


 @cli.command(cls=_DocumentedCodeCommand)

@@ -3535,6 +3569,10 @@ def show_gpus(
    maximum quantities of the GPU available on a single node and the real-time
    availability of the GPU across all nodes in the Kubernetes cluster.

+   If ``--cloud slurm`` is specified, it will show the maximum quantities of
+   the GPU available on a single node and the real-time availability of the
+   GPU across all nodes in the Slurm cluster.
+
    Definitions of certain fields:

    * ``DEVICE_MEM``: Memory of a single device; does not depend on the device

@@ -3590,6 +3628,8 @@ def show_gpus(
    cloud_is_kubernetes = isinstance(
        cloud_obj, clouds.Kubernetes) and not isinstance(cloud_obj, clouds.SSH)
    cloud_is_ssh = isinstance(cloud_obj, clouds.SSH)
+   cloud_is_slurm = isinstance(cloud_obj, clouds.Slurm)
+
    # TODO(romilb): We should move this to the backend.
    kubernetes_autoscaling = skypilot_config.get_effective_region_config(
        cloud='kubernetes',

@@ -3598,6 +3638,7 @@ def show_gpus(
        default_value=None) is not None
    kubernetes_is_enabled = clouds.Kubernetes.canonical_name() in enabled_clouds
    ssh_is_enabled = clouds.SSH.canonical_name() in enabled_clouds
+   slurm_is_enabled = clouds.Slurm.canonical_name() in enabled_clouds
    query_k8s_realtime_gpu = (kubernetes_is_enabled and
                              (cloud_name is None or cloud_is_kubernetes))
    query_ssh_realtime_gpu = (ssh_is_enabled and

@@ -3657,8 +3698,9 @@ def show_gpus(
            raise ValueError(full_err_msg)
        no_permissions_str = '<no permissions>'
        realtime_gpu_infos = []
+       # Stores per-GPU totals as [ready_capacity, available, not_ready].
        total_gpu_info: Dict[str, List[int]] = collections.defaultdict(
-           lambda: [0, 0])
+           lambda: [0, 0, 0])
        all_nodes_info = []

        # display an aggregated table for all contexts

@@ -3669,6 +3711,33 @@ def show_gpus(

        num_filtered_contexts = 0

+       def _count_not_ready_gpus(
+           nodes_info: Optional['models.KubernetesNodesInfo']
+       ) -> Dict[str, int]:
+           """Return counts of GPUs on not ready nodes keyed by GPU type."""
+           not_ready_counts: Dict[str, int] = collections.defaultdict(int)
+           if nodes_info is None:
+               return not_ready_counts
+
+           node_info_dict = getattr(nodes_info, 'node_info_dict', {}) or {}
+           for node_info in node_info_dict.values():
+               accelerator_type = getattr(node_info, 'accelerator_type', None)
+               if not accelerator_type:
+                   continue
+
+               total_info = getattr(node_info, 'total', {})
+               accelerator_count = 0
+               if isinstance(total_info, dict):
+                   accelerator_count = int(
+                       total_info.get('accelerator_count', 0))
+               if accelerator_count <= 0:
+                   continue
+
+               node_is_ready = getattr(node_info, 'is_ready', True)
+               if not node_is_ready:
+                   not_ready_counts[accelerator_type] += accelerator_count
+           return not_ready_counts
+
        if realtime_gpu_availability_lists:
            for (ctx, availability_list) in realtime_gpu_availability_lists:
                if not _filter_ctx(ctx):

@@ -3678,6 +3747,12 @@ def show_gpus(
                else:
                    display_ctx = ctx
                num_filtered_contexts += 1
+               # Collect node info for this context before building tables so
+               # we can exclude GPUs on not ready nodes from the totals.
+               nodes_info = sdk.stream_and_get(
+                   sdk.kubernetes_node_info(context=ctx))
+               context_not_ready_counts = _count_not_ready_gpus(nodes_info)
+
                realtime_gpu_table = log_utils.create_table(
                    ['GPU', qty_header, 'UTILIZATION'])
                for realtime_gpu_availability in sorted(availability_list):

@@ -3686,24 +3761,116 @@ def show_gpus(
                    available_qty = (gpu_availability.available
                                     if gpu_availability.available != -1 else
                                     no_permissions_str)
+                   # Exclude GPUs on not ready nodes from capacity counts.
+                   not_ready_count = min(
+                       context_not_ready_counts.get(gpu_availability.gpu, 0),
+                       gpu_availability.capacity)
+                   # Ensure capacity is never below the reported available
+                   # quantity (if available is unknown, treat as 0 for totals).
+                   available_for_totals = max(
+                       gpu_availability.available
+                       if gpu_availability.available != -1 else 0, 0)
+                   effective_capacity = max(
+                       gpu_availability.capacity - not_ready_count,
+                       available_for_totals)
+                   utilization = (
+                       f'{available_qty} of {effective_capacity} free')
+                   if not_ready_count > 0:
+                       utilization += f' ({not_ready_count} not ready)'
                    realtime_gpu_table.add_row([
                        gpu_availability.gpu,
                        _list_to_str(gpu_availability.counts),
-
+                       utilization,
                    ])
                    gpu = gpu_availability.gpu
-                   capacity = gpu_availability.capacity
                    # we want total, so skip permission denied.
-
-
-                   total_gpu_info[gpu][
-                   total_gpu_info[gpu][
+                   if effective_capacity > 0 or not_ready_count > 0:
+                       total_gpu_info[gpu][0] += effective_capacity
+                       total_gpu_info[gpu][1] += available_for_totals
+                       total_gpu_info[gpu][2] += not_ready_count
                realtime_gpu_infos.append((display_ctx, realtime_gpu_table))
-               # Collect node info for this context
-               nodes_info = sdk.stream_and_get(
-                   sdk.kubernetes_node_info(context=ctx))
                all_nodes_info.append((display_ctx, nodes_info))
        if num_filtered_contexts > 1:
+           total_realtime_gpu_table = log_utils.create_table(
+               ['GPU', 'UTILIZATION'])
+           for gpu, stats in total_gpu_info.items():
+               not_ready = stats[2]
+               utilization = f'{stats[1]} of {stats[0]} free'
+               if not_ready > 0:
+                   utilization += f' ({not_ready} not ready)'
+               total_realtime_gpu_table.add_row([gpu, utilization])
+       else:
+           total_realtime_gpu_table = None
+
+       return realtime_gpu_infos, total_realtime_gpu_table, all_nodes_info
+
+   def _get_slurm_realtime_gpu_tables(
+       name_filter: Optional[str] = None,
+       quantity_filter: Optional[int] = None
+   ) -> Tuple[List[Tuple[str, 'prettytable.PrettyTable']],
+              Optional['prettytable.PrettyTable']]:
+       """Get Slurm GPU availability tables.
+
+       Args:
+           name_filter: Filter GPUs by name.
+           quantity_filter: Filter GPUs by quantity.
+
+       Returns:
+           A tuple of (realtime_gpu_infos, total_realtime_gpu_table).
+       """
+       if quantity_filter:
+           qty_header = 'QTY_FILTER'
+       else:
+           qty_header = 'REQUESTABLE_QTY_PER_NODE'
+
+       realtime_gpu_availability_lists = sdk.stream_and_get(
+           sdk.realtime_slurm_gpu_availability(
+               name_filter=name_filter, quantity_filter=quantity_filter))
+       if not realtime_gpu_availability_lists:
+           err_msg = 'No GPUs found in any Slurm partition. '
+           debug_msg = 'To further debug, run: sky check slurm '
+           if name_filter is not None:
+               gpu_info_msg = f' {name_filter!r}'
+               if quantity_filter is not None:
+                   gpu_info_msg += (' with requested quantity'
+                                    f' {quantity_filter}')
+               err_msg = (f'Resources{gpu_info_msg} not found '
+                          'in any Slurm partition. ')
+               debug_msg = ('To show available accelerators on Slurm,'
+                            ' run: sky show-gpus --cloud slurm ')
+           raise ValueError(err_msg + debug_msg)
+
+       realtime_gpu_infos = []
+       total_gpu_info: Dict[str, List[int]] = collections.defaultdict(
+           lambda: [0, 0])
+
+       for (slurm_cluster,
+            availability_list) in realtime_gpu_availability_lists:
+           realtime_gpu_table = log_utils.create_table(
+               ['GPU', qty_header, 'UTILIZATION'])
+           for realtime_gpu_availability in sorted(availability_list):
+               gpu_availability = models.RealtimeGpuAvailability(
+                   *realtime_gpu_availability)
+               # Use the counts directly from the backend, which are already
+               # generated in powers of 2 (plus any actual maximums)
+               requestable_quantities = gpu_availability.counts
+               realtime_gpu_table.add_row([
+                   gpu_availability.gpu,
+                   _list_to_str(requestable_quantities),
+                   (f'{gpu_availability.available} of '
+                    f'{gpu_availability.capacity} free'),
+               ])
+               gpu = gpu_availability.gpu
+               capacity = gpu_availability.capacity
+               available = gpu_availability.available
+               if capacity > 0:
+                   total_gpu_info[gpu][0] += capacity
+                   total_gpu_info[gpu][1] += available
+           realtime_gpu_infos.append((slurm_cluster, realtime_gpu_table))
+
+       # display an aggregated table for all partitions
+       # if there are more than one partitions with GPUs
+       if len(realtime_gpu_infos) > 1:
            total_realtime_gpu_table = log_utils.create_table(
                ['GPU', 'UTILIZATION'])
            for gpu, stats in total_gpu_info.items():

@@ -3712,14 +3879,16 @@ def show_gpus(
        else:
            total_realtime_gpu_table = None

-       return realtime_gpu_infos, total_realtime_gpu_table
+       return realtime_gpu_infos, total_realtime_gpu_table

    def _format_kubernetes_node_info_combined(
        contexts_info: List[Tuple[str, 'models.KubernetesNodesInfo']],
        cloud_str: str = 'Kubernetes',
        context_title_str: str = 'CONTEXT') -> str:
-       node_table = log_utils.create_table(
-
+       node_table = log_utils.create_table([
+           context_title_str, 'NODE', 'vCPU', 'Memory (GB)', 'GPU',
+           'GPU UTILIZATION'
+       ])

        no_permissions_str = '<no permissions>'
        hints = []

@@ -3736,10 +3905,56 @@ def show_gpus(
                acc_type = node_info.accelerator_type
                if acc_type is None:
                    acc_type = '-'
+
+               # Format CPU and memory: "X of Y free" or just "Y" if
+               # free is unknown
+               cpu_str = '-'
+               if node_info.cpu_count is not None:
+                   cpu_total_str = common_utils.format_float(
+                       node_info.cpu_count, precision=0)
+
+                   # Check if we have free CPU info (use hasattr to
+                   # check if field exists, then access directly)
+                   cpu_free = None
+                   if hasattr(node_info, 'cpu_free'):
+                       cpu_free = node_info.cpu_free
+                   if cpu_free is not None:
+                       cpu_free_str = common_utils.format_float(cpu_free,
+                                                                precision=0)
+                       cpu_str = f'{cpu_free_str} of {cpu_total_str} free'
+                   else:
+                       cpu_str = cpu_total_str
+
+               memory_str = '-'
+               if node_info.memory_gb is not None:
+                   memory_total_str = common_utils.format_float(
+                       node_info.memory_gb, precision=0)
+
+                   # Check if we have free memory info (use hasattr
+                   # to check if field exists, then access directly)
+                   memory_free_gb = None
+                   if hasattr(node_info, 'memory_free_gb'):
+                       memory_free_gb = node_info.memory_free_gb
+                   if memory_free_gb is not None:
+                       memory_free_str = common_utils.format_float(
+                           memory_free_gb, precision=0)
+                       memory_str = (
+                           f'{memory_free_str} of {memory_total_str} free')
+                   else:
+                       memory_str = memory_total_str
+
+               utilization_str = (
+                   f'{available} of '
+                   f'{node_info.total["accelerator_count"]} free')
+               # Check if node is ready (defaults to True for backward
+               # compatibility with older server versions)
+               node_is_ready = getattr(node_info, 'is_ready', True)
+               if not node_is_ready:
+                   utilization_str += ' (Node NotReady)'
+
                node_table.add_row([
-                   context_name, node_name, acc_type,
-
-                   'free'
+                   context_name, node_name, cpu_str, memory_str, acc_type,
+                   utilization_str
                ])

        k8s_per_node_acc_message = (f'{cloud_str} per-node GPU availability')

@@ -3751,6 +3966,42 @@ def show_gpus(
                f'{colorama.Style.RESET_ALL}\n'
                f'{node_table.get_string()}')

+   def _format_slurm_node_info(slurm_cluster_names: List[str]) -> str:
+       node_table = log_utils.create_table([
+           'CLUSTER',
+           'NODE',
+           'PARTITION',
+           'STATE',
+           'GPU',
+           'UTILIZATION',
+       ])
+
+       request_ids = [(cluster_name,
+                       sdk.slurm_node_info(slurm_cluster_name=cluster_name))
+                      for cluster_name in slurm_cluster_names]
+
+       for cluster_name, request_id in request_ids:
+           nodes_info = sdk.stream_and_get(request_id)
+
+           for node_info in nodes_info:
+               node_table.add_row([
+                   cluster_name,
+                   node_info.get('node_name'),
+                   node_info.get('partition', '-'),
+                   node_info.get('node_state'),
+                   node_info.get('gpu_type') or '',
+                   (f'{node_info.get("free_gpus", 0)} of '
+                    f'{node_info.get("total_gpus", 0)} free'),
+               ])
+
+       slurm_per_node_msg = 'Slurm per node accelerator availability'
+       # Optional: Add hint message if needed, similar to k8s
+
+       return (f'{colorama.Fore.LIGHTMAGENTA_EX}{colorama.Style.NORMAL}'
+               f'{slurm_per_node_msg}'
+               f'{colorama.Style.RESET_ALL}\n'
+               f'{node_table.get_string()}')
+
    def _format_kubernetes_realtime_gpu(
        total_table: Optional['prettytable.PrettyTable'],
        k8s_realtime_infos: List[Tuple[str, 'prettytable.PrettyTable']],

@@ -3880,6 +4131,29 @@ def show_gpus(
            return True, print_section_titles
        return False, print_section_titles

+   def _format_slurm_realtime_gpu(
+           total_table, slurm_realtime_infos,
+           show_node_info: bool) -> Generator[str, None, None]:
+       # print total table
+       yield (f'{colorama.Fore.GREEN}{colorama.Style.BRIGHT}'
+              'Slurm GPUs'
+              f'{colorama.Style.RESET_ALL}\n')
+       if total_table is not None:
+           yield from total_table.get_string()
+           yield '\n'
+
+       # print individual infos.
+       for (partition, slurm_realtime_table) in slurm_realtime_infos:
+           partition_str = f'Slurm Cluster: {partition}'
+           yield (f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
+                  f'{partition_str}'
+                  f'{colorama.Style.RESET_ALL}\n')
+           yield from slurm_realtime_table.get_string()
+           yield '\n'
+       if show_node_info:
+           cluster_names = [cluster for cluster, _ in slurm_realtime_infos]
+           yield _format_slurm_node_info(cluster_names)
+
    def _output() -> Generator[str, None, None]:
        gpu_table = log_utils.create_table(
            ['COMMON_GPU', 'AVAILABLE_QUANTITIES'])

@@ -3897,10 +4171,12 @@ def show_gpus(
        if cloud_name is None:
            clouds_to_list = [
                c for c in constants.ALL_CLOUDS
-               if c != 'kubernetes' and c != 'ssh'
+               if c != 'kubernetes' and c != 'ssh' and c != 'slurm'
            ]

        k8s_messages = ''
+       slurm_messages = ''
+       k8s_printed = False
        if accelerator_str is None:
            # Collect k8s related messages in k8s_messages and print them at end
            print_section_titles = False

@@ -3912,6 +4188,7 @@ def show_gpus(
                    yield '\n\n'
                stop_iter_one, print_section_titles_one, k8s_messages_one = (
                    yield from _possibly_show_k8s_like_realtime(is_ssh))
+               k8s_printed = True
                stop_iter = stop_iter or stop_iter_one
                print_section_titles = (print_section_titles or
                                        print_section_titles_one)

@@ -3919,11 +4196,45 @@ def show_gpus(
                prev_print_section_titles = print_section_titles_one
            if stop_iter:
                return
+           # If cloud is slurm, we want to show real-time capacity
+           if slurm_is_enabled and (cloud_name is None or cloud_is_slurm):
+               try:
+                   # If --cloud slurm is not specified, we want to catch
+                   # the case where no GPUs are available on the cluster and
+                   # print the warning at the end.
+                   slurm_realtime_infos, total_table = (
+                       _get_slurm_realtime_gpu_tables())
+               except ValueError as e:
+                   if not cloud_is_slurm:
+                       # Make it a note if cloud is not slurm
+                       slurm_messages += 'Note: '
+                   slurm_messages += str(e)
+               else:
+                   print_section_titles = True
+                   if k8s_printed:
+                       yield '\n'
+
+                   yield from _format_slurm_realtime_gpu(total_table,
+                                                         slurm_realtime_infos,
+                                                         show_node_info=True)
+
+           if cloud_is_slurm:
+               # Do not show clouds if --cloud slurm is specified
+               if not slurm_is_enabled:
+                   yield ('Slurm is not enabled. To fix, run: '
+                          'sky check slurm ')
+               yield slurm_messages
+               return

            # For show_all, show the k8s message at the start since output is
            # long and the user may not scroll to the end.
-           if show_all and k8s_messages:
-
+           if show_all and (k8s_messages or slurm_messages):
+               if k8s_messages:
+                   yield k8s_messages
+               if slurm_messages:
+                   if k8s_messages:
+                       yield '\n'
+                   yield slurm_messages
                yield '\n\n'

            list_accelerator_counts_result = sdk.stream_and_get(

@@ -3971,9 +4282,10 @@ def show_gpus(
            else:
                yield ('\n\nHint: use -a/--all to see all accelerators '
                       '(including non-common ones) and pricing.')
-           if k8s_messages:
+           if k8s_messages or slurm_messages:
                yield '\n'
                yield k8s_messages
+               yield slurm_messages
            return
        else:
            # Parse accelerator string

@@ -4013,6 +4325,31 @@ def show_gpus(
            if stop_iter:
                return

+       # Handle Slurm filtering by name and quantity
+       if (slurm_is_enabled and (cloud_name is None or cloud_is_slurm) and
+               not show_all):
+           # Print section title if not showing all and instead a specific
+           # accelerator is requested
+           print_section_titles = True
+           try:
+               slurm_realtime_infos, total_table = (
+                   _get_slurm_realtime_gpu_tables(name_filter=name,
+                                                  quantity_filter=quantity))
+
+               yield from _format_slurm_realtime_gpu(total_table,
+                                                     slurm_realtime_infos,
+                                                     show_node_info=False)
+           except ValueError as e:
+               # In the case of a specific accelerator, show the error message
+               # immediately (e.g., "Resources A10G not found ...")
+               yield str(e)
+               yield slurm_messages
+           if cloud_is_slurm:
+               # Do not show clouds if --cloud slurm is specified
+               if not slurm_is_enabled:
+                   yield ('Slurm is not enabled. To fix, run: '
+                          'sky check slurm ')
+               return
        # For clouds other than Kubernetes, get the accelerator details
        # Case-sensitive
        list_accelerators_result = sdk.stream_and_get(

@@ -4398,6 +4735,13 @@ def volumes_ls(verbose: bool):
              is_flag=True,
              required=False,
              help='Delete all volumes.')
+@click.option('--purge',
+              '-p',
+              default=False,
+              is_flag=True,
+              required=False,
+              help=('Forcibly delete the volume from the volumes table even '
+                    'if the deletion API fails.'))
 @click.option('--yes',
              '-y',
              default=False,

@@ -4406,7 +4750,12 @@ def volumes_ls(verbose: bool):
              help='Skip confirmation prompt.')
 @_add_click_options(flags.COMMON_OPTIONS)
 @usage_lib.entrypoint
-def volumes_delete(
+def volumes_delete(
+       names: List[str],
+       all: bool,  # pylint: disable=redefined-builtin
+       purge: bool,
+       yes: bool,
+       async_call: bool):
    """Delete volumes.

    Examples:

@@ -4421,6 +4770,9 @@ def volumes_delete(names: List[str], all: bool, yes: bool, async_call: bool): #
    \b
    # Delete all volumes.
    sky volumes delete -a
+   \b
+   # Forcibly delete a volume.
+   sky volumes delete pvc1 -p
    """
    if sum([bool(names), all]) != 1:
        raise click.UsageError('Either --all or a name must be specified.')

@@ -4447,8 +4799,8 @@ def volumes_delete(names: List[str], all: bool, yes: bool, async_call: bool): #
        show_default=True)

    try:
-       _async_call_or_wait(volumes_sdk.delete(names
-                           'sky.volumes.delete')
+       _async_call_or_wait(volumes_sdk.delete(names, purge=purge),
+                           async_call, 'sky.volumes.delete')
    except Exception as e:  # pylint: disable=broad-except
        logger.error(f'{colorama.Fore.RED}Error deleting volumes {names}: '
                     f'{str(e)}{colorama.Style.RESET_ALL}')

@@ -5120,9 +5472,14 @@ def jobs_pool_apply(
 @flags.config_option(expose_value=False)
 @flags.verbose_option()
 @click.argument('pool_names', required=False, type=str, nargs=-1)
+@click.option('--all',
+              '-a',
+              'show_all',
+              is_flag=True,
+              default=False,
+              help='Show all workers.')
 @usage_lib.entrypoint
-
-def jobs_pool_status(verbose: bool, pool_names: List[str]):
+def jobs_pool_status(verbose: bool, pool_names: List[str], show_all: bool):
    """Show statuses of pools.

    Show detailed statuses of one or more pools. If POOL_NAME is not

@@ -5135,7 +5492,7 @@ def jobs_pool_status(verbose: bool, pool_names: List[str]):
    pool_status_request_id = managed_jobs.pool_status(pool_names_to_query)
    _, msg = _handle_services_request(pool_status_request_id,
                                      service_names=pool_names_to_query,
-                                     show_all=verbose,
+                                     show_all=verbose or show_all,
                                      show_endpoint=False,
                                      pool=True,
                                      is_called_by_user=True)

@@ -6438,9 +6795,11 @@ def api_status(request_id_prefixes: Optional[List[str]], all_status: bool,
        if not verbose:
            r_id = common_utils.truncate_long_string(r_id, 36)
        req_status = requests.RequestStatus(request.status)
-
+       user_display = status_utils.get_user_display_name(
+           request.user_name or '-', request.user_id)
+       row = [r_id, user_display, request.name]
        if verbose:
-           row.append(request.cluster_name)
+           row.append(request.cluster_name or '-')
        row.extend([
            log_utils.readable_time_duration(request.created_at),
            req_status.colored_str()