skypilot-nightly 1.0.0.dev20251203__py3-none-any.whl → 1.0.0.dev20251210__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +4 -2
- sky/adaptors/aws.py +1 -61
- sky/adaptors/slurm.py +478 -0
- sky/backends/backend_utils.py +45 -4
- sky/backends/cloud_vm_ray_backend.py +32 -33
- sky/backends/task_codegen.py +340 -2
- sky/catalog/__init__.py +0 -3
- sky/catalog/kubernetes_catalog.py +12 -4
- sky/catalog/slurm_catalog.py +243 -0
- sky/check.py +14 -3
- sky/client/cli/command.py +329 -22
- sky/client/sdk.py +56 -2
- sky/clouds/__init__.py +2 -0
- sky/clouds/cloud.py +7 -0
- sky/clouds/slurm.py +578 -0
- sky/clouds/ssh.py +2 -1
- sky/clouds/vast.py +10 -0
- sky/core.py +128 -36
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/KYAhEFa3FTfq4JyKVgo-s/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/3294.ddda8c6c6f9f24dc.js +1 -0
- sky/dashboard/out/_next/static/chunks/3850-fd5696f3bbbaddae.js +1 -0
- sky/dashboard/out/_next/static/chunks/6856-da20c5fd999f319c.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-09cbf02d3cd518c3.js +1 -0
- sky/dashboard/out/_next/static/chunks/9353-8369df1cf105221c.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-68b647e26f9d2793.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-33f525539665fdfd.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/{[cluster]-abfcac9c137aa543.js → [cluster]-a7565f586ef86467.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{clusters-ee39056f9851a3ff.js → clusters-9e5d47818b9bdadd.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{config-dfb9bf07b13045f4.js → config-718cdc365de82689.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-c0b5935149902e6f.js → [context]-12c559ec4d81fdbd.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{infra-aed0ea19df7cf961.js → infra-d187cd0413d72475.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-895847b6cf200b04.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/{[pool]-9faf940b253e3e06.js → [pool]-8d0f4655400b4eb9.js} +2 -2
- sky/dashboard/out/_next/static/chunks/pages/{jobs-2072b48b617989c9.js → jobs-e5a98f17f8513a96.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/plugins/[...slug]-4f46050ca065d8f8.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/{users-f42674164aa73423.js → users-2f7646eb77785a2c.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{volumes-b84b948ff357c43e.js → volumes-ef19d49c6d0e8500.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-84a40f8c7c627fe4.js → [name]-96e0f298308da7e2.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{workspaces-531b2f8c4bf89f82.js → workspaces-cb4da3abe08ebf19.js} +1 -1
- sky/dashboard/out/_next/static/chunks/{webpack-64e05f17bf2cf8ce.js → webpack-fba3de387ff6bb08.js} +1 -1
- sky/dashboard/out/_next/static/css/c5a4cfd2600fc715.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/plugins/[...slug].html +1 -0
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/mounting_utils.py +16 -2
- sky/global_user_state.py +3 -3
- sky/models.py +2 -0
- sky/optimizer.py +6 -5
- sky/provision/__init__.py +1 -0
- sky/provision/common.py +20 -0
- sky/provision/docker_utils.py +15 -2
- sky/provision/kubernetes/utils.py +42 -6
- sky/provision/provisioner.py +15 -6
- sky/provision/slurm/__init__.py +12 -0
- sky/provision/slurm/config.py +13 -0
- sky/provision/slurm/instance.py +572 -0
- sky/provision/slurm/utils.py +583 -0
- sky/provision/vast/instance.py +4 -1
- sky/provision/vast/utils.py +10 -6
- sky/serve/server/impl.py +1 -1
- sky/server/constants.py +1 -1
- sky/server/plugins.py +222 -0
- sky/server/requests/executor.py +5 -2
- sky/server/requests/payloads.py +12 -1
- sky/server/requests/request_names.py +2 -0
- sky/server/requests/requests.py +5 -1
- sky/server/requests/serializers/encoders.py +17 -0
- sky/server/requests/serializers/return_value_serializers.py +60 -0
- sky/server/server.py +78 -8
- sky/server/server_utils.py +30 -0
- sky/setup_files/dependencies.py +2 -0
- sky/skylet/attempt_skylet.py +13 -3
- sky/skylet/constants.py +34 -9
- sky/skylet/events.py +10 -4
- sky/skylet/executor/__init__.py +1 -0
- sky/skylet/executor/slurm.py +189 -0
- sky/skylet/job_lib.py +2 -1
- sky/skylet/log_lib.py +22 -6
- sky/skylet/log_lib.pyi +8 -6
- sky/skylet/skylet.py +5 -1
- sky/skylet/subprocess_daemon.py +2 -1
- sky/ssh_node_pools/constants.py +12 -0
- sky/ssh_node_pools/core.py +40 -3
- sky/ssh_node_pools/deploy/__init__.py +4 -0
- sky/{utils/kubernetes/deploy_ssh_node_pools.py → ssh_node_pools/deploy/deploy.py} +279 -504
- sky/ssh_node_pools/deploy/tunnel_utils.py +199 -0
- sky/ssh_node_pools/deploy/utils.py +173 -0
- sky/ssh_node_pools/server.py +11 -13
- sky/{utils/kubernetes/ssh_utils.py → ssh_node_pools/utils.py} +9 -6
- sky/templates/kubernetes-ray.yml.j2 +8 -0
- sky/templates/slurm-ray.yml.j2 +85 -0
- sky/templates/vast-ray.yml.j2 +1 -0
- sky/users/model.conf +1 -1
- sky/users/permission.py +24 -1
- sky/users/rbac.py +31 -3
- sky/utils/annotations.py +108 -8
- sky/utils/command_runner.py +197 -5
- sky/utils/command_runner.pyi +27 -4
- sky/utils/common_utils.py +18 -3
- sky/utils/kubernetes/kubernetes_deploy_utils.py +2 -94
- sky/utils/kubernetes/ssh-tunnel.sh +7 -376
- sky/utils/schemas.py +31 -0
- {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/METADATA +48 -36
- {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/RECORD +125 -107
- sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +0 -1
- sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +0 -1
- sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +0 -1
- sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +0 -1
- sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +0 -34
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +0 -16
- sky/dashboard/out/_next/static/css/0748ce22df867032.css +0 -3
- sky/utils/kubernetes/cleanup-tunnel.sh +0 -62
- /sky/dashboard/out/_next/static/{96_E2yl3QAiIJGOYCkSpB → KYAhEFa3FTfq4JyKVgo-s}/_ssgManifest.js +0 -0
- /sky/dashboard/out/_next/static/chunks/{1141-e6aa9ab418717c59.js → 1141-9c810f01ff4f398a.js} +0 -0
- /sky/dashboard/out/_next/static/chunks/{3800-7b45f9fbb6308557.js → 3800-b589397dc09c5b4e.js} +0 -0
- {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/top_level.txt +0 -0
sky/client/cli/command.py
CHANGED
@@ -189,6 +189,7 @@ def _get_cluster_records_and_set_ssh_config(
         # can still exist in the record, and we check for credentials to avoid
         # updating the SSH config for non-existent clusters.
         credentials = record['credentials']
+        ips = handle.cached_external_ips
         if isinstance(handle.launched_resources.cloud, clouds.Kubernetes):
             # Replace the proxy command to proxy through the SkyPilot API
             # server with websocket.
@@ -217,10 +218,44 @@ _get_cluster_records_and_set_ssh_config(
                 f'{server_common.get_server_url()} '
                 f'{handle.cluster_name}\"')
             credentials['ssh_proxy_command'] = proxy_command
+        elif isinstance(handle.launched_resources.cloud, clouds.Slurm):
+            # TODO(kevin): This is a temporary workaround, ideally we want to
+            # get a shell through srun --pty bash on the existing sbatch job.
+
+            # Proxy through the controller/login node to reach the worker node.
+            if (handle.cached_internal_ips is None or
+                    not handle.cached_internal_ips):
+                logger.debug(
+                    f'Cluster {name} does not have cached internal IPs. '
+                    'Skipping SSH config update.')
+                cluster_utils.SSHConfigHelper.remove_cluster(name)
+                continue
+
+            escaped_key_path = shlex.quote(
+                cluster_utils.SSHConfigHelper.generate_local_key_file(
+                    handle.cluster_name, credentials))
+            controller_host = handle.cached_external_ips[0]
+
+            # Build jump proxy: ssh to worker via controller/login node
+            proxy_command = (f'ssh -tt -i {escaped_key_path} '
+                             '-o StrictHostKeyChecking=no '
+                             '-o UserKnownHostsFile=/dev/null '
+                             '-o IdentitiesOnly=yes '
+                             '-W %h:%p '
+                             f'{handle.ssh_user}@{controller_host}')
+            original_proxy = credentials.get('ssh_proxy_command')
+            if original_proxy:
+                proxy_command += (
+                    f' -o ProxyCommand={shlex.quote(original_proxy)}')
+
+            credentials['ssh_proxy_command'] = proxy_command
+
+            # For Slurm, use the worker's internal IP as the SSH target
+            ips = handle.cached_internal_ips

         cluster_utils.SSHConfigHelper.add_cluster(
             handle.cluster_name,
-
+            ips,
             credentials,
             handle.cached_external_ssh_ports,
             handle.docker_user,
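Reviewer note: as a reference for the Slurm branch above, a minimal standalone sketch of the jump ProxyCommand it assembles. The key path, user, and login-node host below are hypothetical placeholders, not values from the diff.

import shlex

# Hypothetical stand-ins for the handle.* fields used in the branch above.
escaped_key_path = shlex.quote('/home/alice/.ssh/sky-key')
ssh_user, controller_host = 'alice', 'login.hpc.example.com'
original_proxy = None  # set when the login node itself sits behind a proxy

proxy_command = (f'ssh -tt -i {escaped_key_path} '
                 '-o StrictHostKeyChecking=no '
                 '-o UserKnownHostsFile=/dev/null '
                 '-o IdentitiesOnly=yes '
                 '-W %h:%p '
                 f'{ssh_user}@{controller_host}')
if original_proxy:
    proxy_command += f' -o ProxyCommand={shlex.quote(original_proxy)}'
print(proxy_command)

When SSHConfigHelper.add_cluster writes this into the SSH config, ssh expands %h:%p to the worker's internal IP and port, so the final `ssh <cluster>` connection hops through the login node to the compute node.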
@@ -832,7 +867,19 @@ class _NaturalOrderGroup(click.Group):
     """

     def list_commands(self, ctx):  # pylint: disable=unused-argument
-
+        # Preserve definition order but hide aliases (same command object) and
+        # commands explicitly marked as hidden.
+        seen_commands = set()
+        names = []
+        for name, command in self.commands.items():
+            if getattr(command, 'hidden', False):
+                continue
+            command_id = id(command)
+            if command_id in seen_commands:
+                continue
+            seen_commands.add(command_id)
+            names.append(name)
+        return names

     @usage_lib.entrypoint('sky.cli', fallback=True)
     def invoke(self, ctx):
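Reviewer note: a small self-contained sketch (not SkyPilot's actual CLI wiring) of why deduplicating on id(command) hides aliases: an alias is simply the same click command object registered under a second name, so only its first name survives.

import click


class NaturalOrderGroup(click.Group):
    """Sketch of the dedup logic above; not the real _NaturalOrderGroup."""

    def list_commands(self, ctx):  # pylint: disable=unused-argument
        seen, names = set(), []
        for name, command in self.commands.items():
            if getattr(command, 'hidden', False) or id(command) in seen:
                continue
            seen.add(id(command))
            names.append(name)
        return names


@click.group(cls=NaturalOrderGroup)
def cli():
    pass


@cli.command()
def launch():
    """Stand-in command for the sketch."""


# Hypothetical alias: the same command object registered under a second name.
cli.add_command(launch, name='up')

print(cli.list_commands(ctx=None))  # ['launch'] -- the 'up' alias is hidden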
@@ -3535,6 +3582,10 @@ def show_gpus(
     maximum quantities of the GPU available on a single node and the real-time
     availability of the GPU across all nodes in the Kubernetes cluster.

+    If ``--cloud slurm`` is specified, it will show the maximum quantities of
+    the GPU available on a single node and the real-time availability of the
+    GPU across all nodes in the Slurm cluster.
+
     Definitions of certain fields:

     * ``DEVICE_MEM``: Memory of a single device; does not depend on the device
@@ -3590,6 +3641,8 @@ def show_gpus(
     cloud_is_kubernetes = isinstance(
         cloud_obj, clouds.Kubernetes) and not isinstance(cloud_obj, clouds.SSH)
     cloud_is_ssh = isinstance(cloud_obj, clouds.SSH)
+    cloud_is_slurm = isinstance(cloud_obj, clouds.Slurm)
+
     # TODO(romilb): We should move this to the backend.
     kubernetes_autoscaling = skypilot_config.get_effective_region_config(
         cloud='kubernetes',
@@ -3598,6 +3651,7 @@ def show_gpus(
         default_value=None) is not None
     kubernetes_is_enabled = clouds.Kubernetes.canonical_name() in enabled_clouds
     ssh_is_enabled = clouds.SSH.canonical_name() in enabled_clouds
+    slurm_is_enabled = clouds.Slurm.canonical_name() in enabled_clouds
     query_k8s_realtime_gpu = (kubernetes_is_enabled and
                               (cloud_name is None or cloud_is_kubernetes))
     query_ssh_realtime_gpu = (ssh_is_enabled and
@@ -3657,8 +3711,9 @@ def show_gpus(
             raise ValueError(full_err_msg)
         no_permissions_str = '<no permissions>'
         realtime_gpu_infos = []
+        # Stores per-GPU totals as [ready_capacity, available, not_ready].
         total_gpu_info: Dict[str, List[int]] = collections.defaultdict(
-            lambda: [0, 0])
+            lambda: [0, 0, 0])
         all_nodes_info = []

         # display an aggregated table for all contexts
@@ -3669,6 +3724,33 @@ def show_gpus(

         num_filtered_contexts = 0

+        def _count_not_ready_gpus(
+            nodes_info: Optional['models.KubernetesNodesInfo']
+        ) -> Dict[str, int]:
+            """Return counts of GPUs on not ready nodes keyed by GPU type."""
+            not_ready_counts: Dict[str, int] = collections.defaultdict(int)
+            if nodes_info is None:
+                return not_ready_counts
+
+            node_info_dict = getattr(nodes_info, 'node_info_dict', {}) or {}
+            for node_info in node_info_dict.values():
+                accelerator_type = getattr(node_info, 'accelerator_type', None)
+                if not accelerator_type:
+                    continue
+
+                total_info = getattr(node_info, 'total', {})
+                accelerator_count = 0
+                if isinstance(total_info, dict):
+                    accelerator_count = int(
+                        total_info.get('accelerator_count', 0))
+                if accelerator_count <= 0:
+                    continue
+
+                node_is_ready = getattr(node_info, 'is_ready', True)
+                if not node_is_ready:
+                    not_ready_counts[accelerator_type] += accelerator_count
+            return not_ready_counts
+
         if realtime_gpu_availability_lists:
             for (ctx, availability_list) in realtime_gpu_availability_lists:
                 if not _filter_ctx(ctx):
@@ -3678,6 +3760,12 @@ def show_gpus(
                 else:
                     display_ctx = ctx
                 num_filtered_contexts += 1
+                # Collect node info for this context before building tables so
+                # we can exclude GPUs on not ready nodes from the totals.
+                nodes_info = sdk.stream_and_get(
+                    sdk.kubernetes_node_info(context=ctx))
+                context_not_ready_counts = _count_not_ready_gpus(nodes_info)
+
                 realtime_gpu_table = log_utils.create_table(
                     ['GPU', qty_header, 'UTILIZATION'])
                 for realtime_gpu_availability in sorted(availability_list):
@@ -3686,24 +3774,116 @@ def show_gpus(
                     available_qty = (gpu_availability.available
                                      if gpu_availability.available != -1 else
                                      no_permissions_str)
+                    # Exclude GPUs on not ready nodes from capacity counts.
+                    not_ready_count = min(
+                        context_not_ready_counts.get(gpu_availability.gpu, 0),
+                        gpu_availability.capacity)
+                    # Ensure capacity is never below the reported available
+                    # quantity (if available is unknown, treat as 0 for totals).
+                    available_for_totals = max(
+                        gpu_availability.available
+                        if gpu_availability.available != -1 else 0, 0)
+                    effective_capacity = max(
+                        gpu_availability.capacity - not_ready_count,
+                        available_for_totals)
+                    utilization = (
+                        f'{available_qty} of {effective_capacity} free')
+                    if not_ready_count > 0:
+                        utilization += f' ({not_ready_count} not ready)'
                     realtime_gpu_table.add_row([
                         gpu_availability.gpu,
                         _list_to_str(gpu_availability.counts),
-
+                        utilization,
                     ])
                     gpu = gpu_availability.gpu
-                    capacity = gpu_availability.capacity
                     # we want total, so skip permission denied.
-
-
-                    total_gpu_info[gpu][
-                    total_gpu_info[gpu][
+                    if effective_capacity > 0 or not_ready_count > 0:
+                        total_gpu_info[gpu][0] += effective_capacity
+                        total_gpu_info[gpu][1] += available_for_totals
+                        total_gpu_info[gpu][2] += not_ready_count
                 realtime_gpu_infos.append((display_ctx, realtime_gpu_table))
-                # Collect node info for this context
-                nodes_info = sdk.stream_and_get(
-                    sdk.kubernetes_node_info(context=ctx))
                 all_nodes_info.append((display_ctx, nodes_info))
         if num_filtered_contexts > 1:
+            total_realtime_gpu_table = log_utils.create_table(
+                ['GPU', 'UTILIZATION'])
+            for gpu, stats in total_gpu_info.items():
+                not_ready = stats[2]
+                utilization = f'{stats[1]} of {stats[0]} free'
+                if not_ready > 0:
+                    utilization += f' ({not_ready} not ready)'
+                total_realtime_gpu_table.add_row([gpu, utilization])
+        else:
+            total_realtime_gpu_table = None
+
+        return realtime_gpu_infos, total_realtime_gpu_table, all_nodes_info
+
+    def _get_slurm_realtime_gpu_tables(
+        name_filter: Optional[str] = None,
+        quantity_filter: Optional[int] = None
+    ) -> Tuple[List[Tuple[str, 'prettytable.PrettyTable']],
+               Optional['prettytable.PrettyTable']]:
+        """Get Slurm GPU availability tables.
+
+        Args:
+            name_filter: Filter GPUs by name.
+            quantity_filter: Filter GPUs by quantity.
+
+        Returns:
+            A tuple of (realtime_gpu_infos, total_realtime_gpu_table).
+        """
+        if quantity_filter:
+            qty_header = 'QTY_FILTER'
+        else:
+            qty_header = 'REQUESTABLE_QTY_PER_NODE'
+
+        realtime_gpu_availability_lists = sdk.stream_and_get(
+            sdk.realtime_slurm_gpu_availability(
+                name_filter=name_filter, quantity_filter=quantity_filter))
+        if not realtime_gpu_availability_lists:
+            err_msg = 'No GPUs found in any Slurm partition. '
+            debug_msg = 'To further debug, run: sky check slurm '
+            if name_filter is not None:
+                gpu_info_msg = f' {name_filter!r}'
+                if quantity_filter is not None:
+                    gpu_info_msg += (' with requested quantity'
+                                     f' {quantity_filter}')
+                err_msg = (f'Resources{gpu_info_msg} not found '
+                           'in any Slurm partition. ')
+                debug_msg = ('To show available accelerators on Slurm,'
+                             ' run: sky show-gpus --cloud slurm ')
+            raise ValueError(err_msg + debug_msg)
+
+        realtime_gpu_infos = []
+        total_gpu_info: Dict[str, List[int]] = collections.defaultdict(
+            lambda: [0, 0])
+
+        for (slurm_cluster,
+             availability_list) in realtime_gpu_availability_lists:
+            realtime_gpu_table = log_utils.create_table(
+                ['GPU', qty_header, 'UTILIZATION'])
+            for realtime_gpu_availability in sorted(availability_list):
+                gpu_availability = models.RealtimeGpuAvailability(
+                    *realtime_gpu_availability)
+                # Use the counts directly from the backend, which are already
+                # generated in powers of 2 (plus any actual maximums)
+                requestable_quantities = gpu_availability.counts
+                realtime_gpu_table.add_row([
+                    gpu_availability.gpu,
+                    _list_to_str(requestable_quantities),
+                    (f'{gpu_availability.available} of '
+                     f'{gpu_availability.capacity} free'),
+                ])
+                gpu = gpu_availability.gpu
+                capacity = gpu_availability.capacity
+                available = gpu_availability.available
+                if capacity > 0:
+                    total_gpu_info[gpu][0] += capacity
+                    total_gpu_info[gpu][1] += available
+            realtime_gpu_infos.append((slurm_cluster, realtime_gpu_table))
+
+        # display an aggregated table for all partitions
+        # if there are more than one partitions with GPUs
+        if len(realtime_gpu_infos) > 1:
             total_realtime_gpu_table = log_utils.create_table(
                 ['GPU', 'UTILIZATION'])
             for gpu, stats in total_gpu_info.items():
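Reviewer note: the new [ready_capacity, available, not_ready] bookkeeping above reduces to a small piece of arithmetic; a numeric sketch with hypothetical values:

# Hypothetical context: 8 GPUs total, 3 reported free, one 4-GPU node NotReady.
capacity, available, not_ready_on_nodes = 8, 3, 4

not_ready_count = min(not_ready_on_nodes, capacity)                  # 4
available_for_totals = max(available if available != -1 else 0, 0)   # 3
effective_capacity = max(capacity - not_ready_count,
                         available_for_totals)                       # 4

# Rendered as '3 of 4 free (4 not ready)' rather than '3 of 8 free'.
print(f'{available_for_totals} of {effective_capacity} free '
      f'({not_ready_count} not ready)')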
@@ -3712,7 +3892,7 @@ def show_gpus(
         else:
             total_realtime_gpu_table = None

-        return realtime_gpu_infos, total_realtime_gpu_table
+        return realtime_gpu_infos, total_realtime_gpu_table

     def _format_kubernetes_node_info_combined(
         contexts_info: List[Tuple[str, 'models.KubernetesNodesInfo']],
@@ -3736,11 +3916,16 @@ def show_gpus(
                 acc_type = node_info.accelerator_type
                 if acc_type is None:
                     acc_type = '-'
-
-
-                    f'{
-
-
+                utilization_str = (
+                    f'{available} of '
+                    f'{node_info.total["accelerator_count"]} free')
+                # Check if node is ready (defaults to True for backward
+                # compatibility with older server versions)
+                node_is_ready = getattr(node_info, 'is_ready', True)
+                if not node_is_ready:
+                    utilization_str += ' (Node NotReady)'
+                node_table.add_row(
+                    [context_name, node_name, acc_type, utilization_str])

         k8s_per_node_acc_message = (f'{cloud_str} per-node GPU availability')
         if hints:
@@ -3751,6 +3936,43 @@ def show_gpus(
                 f'{colorama.Style.RESET_ALL}\n'
                 f'{node_table.get_string()}')

+    def _format_slurm_node_info() -> str:
+        node_table = log_utils.create_table([
+            'CLUSTER',
+            'NODE',
+            'PARTITION',
+            'STATE',
+            'GPU',
+            'UTILIZATION',
+        ])
+
+        # Get all cluster names
+        slurm_cluster_names = clouds.Slurm.existing_allowed_clusters()
+
+        # Query each cluster
+        for cluster_name in slurm_cluster_names:
+            nodes_info = sdk.stream_and_get(
+                sdk.slurm_node_info(slurm_cluster_name=cluster_name))
+
+            for node_info in nodes_info:
+                node_table.add_row([
+                    cluster_name,
+                    node_info.get('node_name'),
+                    node_info.get('partition', '-'),
+                    node_info.get('node_state'),
+                    node_info.get('gpu_type') or '',
+                    (f'{node_info.get("free_gpus", 0)} of '
+                     f'{node_info.get("total_gpus", 0)} free'),
+                ])
+
+        slurm_per_node_msg = 'Slurm per node accelerator availability'
+        # Optional: Add hint message if needed, similar to k8s
+
+        return (f'{colorama.Fore.LIGHTMAGENTA_EX}{colorama.Style.NORMAL}'
+                f'{slurm_per_node_msg}'
+                f'{colorama.Style.RESET_ALL}\n'
+                f'{node_table.get_string()}')
+
     def _format_kubernetes_realtime_gpu(
         total_table: Optional['prettytable.PrettyTable'],
         k8s_realtime_infos: List[Tuple[str, 'prettytable.PrettyTable']],
@@ -3880,6 +4102,28 @@ def show_gpus(
                 return True, print_section_titles
         return False, print_section_titles

+    def _format_slurm_realtime_gpu(
+            total_table, slurm_realtime_infos,
+            show_node_info: bool) -> Generator[str, None, None]:
+        # print total table
+        yield (f'{colorama.Fore.GREEN}{colorama.Style.BRIGHT}'
+               'Slurm GPUs'
+               f'{colorama.Style.RESET_ALL}\n')
+        if total_table is not None:
+            yield from total_table.get_string()
+            yield '\n'
+
+        # print individual infos.
+        for (partition, slurm_realtime_table) in slurm_realtime_infos:
+            partition_str = f'Slurm Cluster: {partition}'
+            yield (f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
+                   f'{partition_str}'
+                   f'{colorama.Style.RESET_ALL}\n')
+            yield from slurm_realtime_table.get_string()
+            yield '\n'
+        if show_node_info:
+            yield _format_slurm_node_info()
+
     def _output() -> Generator[str, None, None]:
         gpu_table = log_utils.create_table(
             ['COMMON_GPU', 'AVAILABLE_QUANTITIES'])
@@ -3897,10 +4141,12 @@ def show_gpus(
         if cloud_name is None:
             clouds_to_list = [
                 c for c in constants.ALL_CLOUDS
-                if c != 'kubernetes' and c != 'ssh'
+                if c != 'kubernetes' and c != 'ssh' and c != 'slurm'
             ]

         k8s_messages = ''
+        slurm_messages = ''
+        k8s_printed = False
         if accelerator_str is None:
             # Collect k8s related messages in k8s_messages and print them at end
             print_section_titles = False
@@ -3912,6 +4158,7 @@ def show_gpus(
                     yield '\n\n'
                 stop_iter_one, print_section_titles_one, k8s_messages_one = (
                     yield from _possibly_show_k8s_like_realtime(is_ssh))
+                k8s_printed = True
                 stop_iter = stop_iter or stop_iter_one
                 print_section_titles = (print_section_titles or
                                         print_section_titles_one)
@@ -3919,11 +4166,45 @@ def show_gpus(
                 prev_print_section_titles = print_section_titles_one
             if stop_iter:
                 return
+            # If cloud is slurm, we want to show real-time capacity
+            if slurm_is_enabled and (cloud_name is None or cloud_is_slurm):
+                try:
+                    # If --cloud slurm is not specified, we want to catch
+                    # the case where no GPUs are available on the cluster and
+                    # print the warning at the end.
+                    slurm_realtime_infos, total_table = (
+                        _get_slurm_realtime_gpu_tables())
+                except ValueError as e:
+                    if not cloud_is_slurm:
+                        # Make it a note if cloud is not slurm
+                        slurm_messages += 'Note: '
+                    slurm_messages += str(e)
+                else:
+                    print_section_titles = True
+                    if k8s_printed:
+                        yield '\n'
+
+                    yield from _format_slurm_realtime_gpu(total_table,
+                                                          slurm_realtime_infos,
+                                                          show_node_info=True)
+
+            if cloud_is_slurm:
+                # Do not show clouds if --cloud slurm is specified
+                if not slurm_is_enabled:
+                    yield ('Slurm is not enabled. To fix, run: '
+                           'sky check slurm ')
+                yield slurm_messages
+                return

             # For show_all, show the k8s message at the start since output is
             # long and the user may not scroll to the end.
-            if show_all and k8s_messages:
-
+            if show_all and (k8s_messages or slurm_messages):
+                if k8s_messages:
+                    yield k8s_messages
+                if slurm_messages:
+                    if k8s_messages:
+                        yield '\n'
+                    yield slurm_messages
                 yield '\n\n'

             list_accelerator_counts_result = sdk.stream_and_get(
@@ -3971,9 +4252,10 @@ def show_gpus(
             else:
                 yield ('\n\nHint: use -a/--all to see all accelerators '
                        '(including non-common ones) and pricing.')
-            if k8s_messages:
+            if k8s_messages or slurm_messages:
                 yield '\n'
                 yield k8s_messages
+                yield slurm_messages
             return
         else:
             # Parse accelerator string
@@ -4013,6 +4295,31 @@ def show_gpus(
             if stop_iter:
                 return

+            # Handle Slurm filtering by name and quantity
+            if (slurm_is_enabled and (cloud_name is None or cloud_is_slurm) and
+                    not show_all):
+                # Print section title if not showing all and instead a specific
+                # accelerator is requested
+                print_section_titles = True
+                try:
+                    slurm_realtime_infos, total_table = (
+                        _get_slurm_realtime_gpu_tables(name_filter=name,
+                                                       quantity_filter=quantity))
+
+                    yield from _format_slurm_realtime_gpu(total_table,
+                                                          slurm_realtime_infos,
+                                                          show_node_info=False)
+                except ValueError as e:
+                    # In the case of a specific accelerator, show the error message
+                    # immediately (e.g., "Resources A10G not found ...")
+                    yield str(e)
+                    yield slurm_messages
+                if cloud_is_slurm:
+                    # Do not show clouds if --cloud slurm is specified
+                    if not slurm_is_enabled:
+                        yield ('Slurm is not enabled. To fix, run: '
+                               'sky check slurm ')
+                    return
             # For clouds other than Kubernetes, get the accelerator details
             # Case-sensitive
             list_accelerators_result = sdk.stream_and_get(
sky/client/sdk.py
CHANGED
@@ -42,6 +42,7 @@ from sky.server.requests import request_names
 from sky.server.requests import requests as requests_lib
 from sky.skylet import autostop_lib
 from sky.skylet import constants
+from sky.ssh_node_pools import utils as ssh_utils
 from sky.usage import usage_lib
 from sky.utils import admin_policy_utils
 from sky.utils import annotations
@@ -57,7 +58,6 @@ from sky.utils import status_lib
 from sky.utils import subprocess_utils
 from sky.utils import ux_utils
 from sky.utils import yaml_utils
-from sky.utils.kubernetes import ssh_utils

 if typing.TYPE_CHECKING:
     import base64
@@ -675,7 +675,7 @@ def _launch(
     clusters = get(status_request_id)
     cluster_user_hash = common_utils.get_user_hash()
     cluster_user_hash_str = ''
-    current_user = common_utils.
+    current_user = common_utils.get_local_user_name()
     cluster_user_name = current_user
     if not clusters:
         # Show the optimize log before the prompt if the cluster does not
@@ -2744,3 +2744,57 @@ def api_logout() -> None:
     _clear_api_server_config()
     logger.info(f'{colorama.Fore.GREEN}Logged out of SkyPilot API server.'
                 f'{colorama.Style.RESET_ALL}')
+
+
+@usage_lib.entrypoint
+@server_common.check_server_healthy_or_start
+@versions.minimal_api_version(24)
+@annotations.client_api
+def realtime_slurm_gpu_availability(
+        name_filter: Optional[str] = None,
+        quantity_filter: Optional[int] = None) -> server_common.RequestId:
+    """Gets the real-time Slurm GPU availability.
+
+    Args:
+        name_filter: Optional name filter for GPUs.
+        quantity_filter: Optional quantity filter for GPUs.
+
+    Returns:
+        The request ID of the Slurm GPU availability request.
+    """
+    body = payloads.SlurmGpuAvailabilityRequestBody(
+        name_filter=name_filter,
+        quantity_filter=quantity_filter,
+    )
+    response = server_common.make_authenticated_request(
+        'POST',
+        '/slurm_gpu_availability',
+        json=json.loads(body.model_dump_json()),
+    )
+    return server_common.get_request_id(response)
+
+
+@usage_lib.entrypoint
+@server_common.check_server_healthy_or_start
+@versions.minimal_api_version(24)
+@annotations.client_api
+def slurm_node_info(
+        slurm_cluster_name: Optional[str] = None) -> server_common.RequestId:
+    """Gets the resource information for all nodes in the Slurm cluster.
+
+    Returns:
+        The request ID of the Slurm node info request.
+
+    Request Returns:
+        List[Dict[str, Any]]: A list of dictionaries, each containing info
+            for a single Slurm node (node_name, partition, node_state,
+            gpu_type, total_gpus, free_gpus, vcpu_count, memory_gb).
+    """
+    body = payloads.SlurmNodeInfoRequestBody(
+        slurm_cluster_name=slurm_cluster_name)
+    response = server_common.make_authenticated_request(
+        'GET',
+        '/slurm_node_info',
+        json=json.loads(body.model_dump_json()),
+    )
+    return server_common.get_request_id(response)
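Reviewer note: both new endpoints follow the client SDK's usual asynchronous pattern, returning a request ID that is resolved with sdk.stream_and_get. A usage sketch (the cluster name 'my-slurm' and the H100 filter are hypothetical):

from sky.client import sdk

# Aggregate real-time GPU availability across Slurm partitions.
availability = sdk.stream_and_get(
    sdk.realtime_slurm_gpu_availability(name_filter='H100',
                                        quantity_filter=8))

# Per-node resource details for one configured Slurm cluster.
nodes = sdk.stream_and_get(
    sdk.slurm_node_info(slurm_cluster_name='my-slurm'))
for node in nodes:
    print(node['node_name'], node['node_state'],
          f"{node.get('free_gpus', 0)} of {node.get('total_gpus', 0)} free")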
sky/clouds/__init__.py
CHANGED
@@ -31,6 +31,7 @@ from sky.clouds.runpod import RunPod
 from sky.clouds.scp import SCP
 from sky.clouds.seeweb import Seeweb
 from sky.clouds.shadeform import Shadeform
+from sky.clouds.slurm import Slurm
 from sky.clouds.ssh import SSH
 from sky.clouds.vast import Vast
 from sky.clouds.vsphere import Vsphere
@@ -48,6 +49,7 @@ __all__ = [
     'Paperspace',
     'PrimeIntellect',
     'SCP',
+    'Slurm',
     'RunPod',
     'Shadeform',
     'Vast',
sky/clouds/cloud.py
CHANGED
@@ -182,6 +182,13 @@ class Cloud:
         """
         return cls._SUPPORTS_SERVICE_ACCOUNT_ON_REMOTE

+    @classmethod
+    def uses_ray(cls) -> bool:
+        """Returns whether this cloud uses Ray as the distributed
+        execution framework.
+        """
+        return True
+
     #### Regions/Zones ####

     @classmethod
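Reviewer note: the new hook defaults to True so existing clouds keep their Ray-based execution path; a cloud whose jobs are driven by an external scheduler would override it. Illustrative sketch only (this diff does not show how sky/clouds/slurm.py implements the override):

from sky.clouds import cloud


class NonRayCloud(cloud.Cloud):
    """Illustrative subclass for the sketch; not part of this diff."""

    @classmethod
    def uses_ray(cls) -> bool:
        # Execution is handed to the backend's own scheduler (for Slurm this
        # would be sbatch/srun) instead of a Ray cluster on the nodes.
        return False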