skypilot-nightly 1.0.0.dev20251203__py3-none-any.whl → 1.0.0.dev20251210__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +4 -2
- sky/adaptors/aws.py +1 -61
- sky/adaptors/slurm.py +478 -0
- sky/backends/backend_utils.py +45 -4
- sky/backends/cloud_vm_ray_backend.py +32 -33
- sky/backends/task_codegen.py +340 -2
- sky/catalog/__init__.py +0 -3
- sky/catalog/kubernetes_catalog.py +12 -4
- sky/catalog/slurm_catalog.py +243 -0
- sky/check.py +14 -3
- sky/client/cli/command.py +329 -22
- sky/client/sdk.py +56 -2
- sky/clouds/__init__.py +2 -0
- sky/clouds/cloud.py +7 -0
- sky/clouds/slurm.py +578 -0
- sky/clouds/ssh.py +2 -1
- sky/clouds/vast.py +10 -0
- sky/core.py +128 -36
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/KYAhEFa3FTfq4JyKVgo-s/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/3294.ddda8c6c6f9f24dc.js +1 -0
- sky/dashboard/out/_next/static/chunks/3850-fd5696f3bbbaddae.js +1 -0
- sky/dashboard/out/_next/static/chunks/6856-da20c5fd999f319c.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-09cbf02d3cd518c3.js +1 -0
- sky/dashboard/out/_next/static/chunks/9353-8369df1cf105221c.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-68b647e26f9d2793.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-33f525539665fdfd.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/{[cluster]-abfcac9c137aa543.js → [cluster]-a7565f586ef86467.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{clusters-ee39056f9851a3ff.js → clusters-9e5d47818b9bdadd.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{config-dfb9bf07b13045f4.js → config-718cdc365de82689.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-c0b5935149902e6f.js → [context]-12c559ec4d81fdbd.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{infra-aed0ea19df7cf961.js → infra-d187cd0413d72475.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-895847b6cf200b04.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/{[pool]-9faf940b253e3e06.js → [pool]-8d0f4655400b4eb9.js} +2 -2
- sky/dashboard/out/_next/static/chunks/pages/{jobs-2072b48b617989c9.js → jobs-e5a98f17f8513a96.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/plugins/[...slug]-4f46050ca065d8f8.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/{users-f42674164aa73423.js → users-2f7646eb77785a2c.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{volumes-b84b948ff357c43e.js → volumes-ef19d49c6d0e8500.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-84a40f8c7c627fe4.js → [name]-96e0f298308da7e2.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{workspaces-531b2f8c4bf89f82.js → workspaces-cb4da3abe08ebf19.js} +1 -1
- sky/dashboard/out/_next/static/chunks/{webpack-64e05f17bf2cf8ce.js → webpack-fba3de387ff6bb08.js} +1 -1
- sky/dashboard/out/_next/static/css/c5a4cfd2600fc715.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/plugins/[...slug].html +1 -0
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/mounting_utils.py +16 -2
- sky/global_user_state.py +3 -3
- sky/models.py +2 -0
- sky/optimizer.py +6 -5
- sky/provision/__init__.py +1 -0
- sky/provision/common.py +20 -0
- sky/provision/docker_utils.py +15 -2
- sky/provision/kubernetes/utils.py +42 -6
- sky/provision/provisioner.py +15 -6
- sky/provision/slurm/__init__.py +12 -0
- sky/provision/slurm/config.py +13 -0
- sky/provision/slurm/instance.py +572 -0
- sky/provision/slurm/utils.py +583 -0
- sky/provision/vast/instance.py +4 -1
- sky/provision/vast/utils.py +10 -6
- sky/serve/server/impl.py +1 -1
- sky/server/constants.py +1 -1
- sky/server/plugins.py +222 -0
- sky/server/requests/executor.py +5 -2
- sky/server/requests/payloads.py +12 -1
- sky/server/requests/request_names.py +2 -0
- sky/server/requests/requests.py +5 -1
- sky/server/requests/serializers/encoders.py +17 -0
- sky/server/requests/serializers/return_value_serializers.py +60 -0
- sky/server/server.py +78 -8
- sky/server/server_utils.py +30 -0
- sky/setup_files/dependencies.py +2 -0
- sky/skylet/attempt_skylet.py +13 -3
- sky/skylet/constants.py +34 -9
- sky/skylet/events.py +10 -4
- sky/skylet/executor/__init__.py +1 -0
- sky/skylet/executor/slurm.py +189 -0
- sky/skylet/job_lib.py +2 -1
- sky/skylet/log_lib.py +22 -6
- sky/skylet/log_lib.pyi +8 -6
- sky/skylet/skylet.py +5 -1
- sky/skylet/subprocess_daemon.py +2 -1
- sky/ssh_node_pools/constants.py +12 -0
- sky/ssh_node_pools/core.py +40 -3
- sky/ssh_node_pools/deploy/__init__.py +4 -0
- sky/{utils/kubernetes/deploy_ssh_node_pools.py → ssh_node_pools/deploy/deploy.py} +279 -504
- sky/ssh_node_pools/deploy/tunnel_utils.py +199 -0
- sky/ssh_node_pools/deploy/utils.py +173 -0
- sky/ssh_node_pools/server.py +11 -13
- sky/{utils/kubernetes/ssh_utils.py → ssh_node_pools/utils.py} +9 -6
- sky/templates/kubernetes-ray.yml.j2 +8 -0
- sky/templates/slurm-ray.yml.j2 +85 -0
- sky/templates/vast-ray.yml.j2 +1 -0
- sky/users/model.conf +1 -1
- sky/users/permission.py +24 -1
- sky/users/rbac.py +31 -3
- sky/utils/annotations.py +108 -8
- sky/utils/command_runner.py +197 -5
- sky/utils/command_runner.pyi +27 -4
- sky/utils/common_utils.py +18 -3
- sky/utils/kubernetes/kubernetes_deploy_utils.py +2 -94
- sky/utils/kubernetes/ssh-tunnel.sh +7 -376
- sky/utils/schemas.py +31 -0
- {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/METADATA +48 -36
- {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/RECORD +125 -107
- sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +0 -1
- sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +0 -1
- sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +0 -1
- sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +0 -1
- sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +0 -34
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +0 -16
- sky/dashboard/out/_next/static/css/0748ce22df867032.css +0 -3
- sky/utils/kubernetes/cleanup-tunnel.sh +0 -62
- /sky/dashboard/out/_next/static/{96_E2yl3QAiIJGOYCkSpB → KYAhEFa3FTfq4JyKVgo-s}/_ssgManifest.js +0 -0
- /sky/dashboard/out/_next/static/chunks/{1141-e6aa9ab418717c59.js → 1141-9c810f01ff4f398a.js} +0 -0
- /sky/dashboard/out/_next/static/chunks/{3800-7b45f9fbb6308557.js → 3800-b589397dc09c5b4e.js} +0 -0
- {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/top_level.txt +0 -0
sky/client/cli/command.py
CHANGED
@@ -189,6 +189,7 @@ def _get_cluster_records_and_set_ssh_config(
         # can still exist in the record, and we check for credentials to avoid
         # updating the SSH config for non-existent clusters.
         credentials = record['credentials']
+        ips = handle.cached_external_ips
         if isinstance(handle.launched_resources.cloud, clouds.Kubernetes):
             # Replace the proxy command to proxy through the SkyPilot API
             # server with websocket.
@@ -217,10 +218,44 @@ _get_cluster_records_and_set_ssh_config(
                 f'{server_common.get_server_url()} '
                 f'{handle.cluster_name}\"')
             credentials['ssh_proxy_command'] = proxy_command
+        elif isinstance(handle.launched_resources.cloud, clouds.Slurm):
+            # TODO(kevin): This is a temporary workaround, ideally we want to
+            # get a shell through srun --pty bash on the existing sbatch job.
+
+            # Proxy through the controller/login node to reach the worker node.
+            if (handle.cached_internal_ips is None or
+                    not handle.cached_internal_ips):
+                logger.debug(
+                    f'Cluster {name} does not have cached internal IPs. '
+                    'Skipping SSH config update.')
+                cluster_utils.SSHConfigHelper.remove_cluster(name)
+                continue
+
+            escaped_key_path = shlex.quote(
+                cluster_utils.SSHConfigHelper.generate_local_key_file(
+                    handle.cluster_name, credentials))
+            controller_host = handle.cached_external_ips[0]
+
+            # Build jump proxy: ssh to worker via controller/login node
+            proxy_command = (f'ssh -tt -i {escaped_key_path} '
+                             '-o StrictHostKeyChecking=no '
+                             '-o UserKnownHostsFile=/dev/null '
+                             '-o IdentitiesOnly=yes '
+                             '-W %h:%p '
+                             f'{handle.ssh_user}@{controller_host}')
+            original_proxy = credentials.get('ssh_proxy_command')
+            if original_proxy:
+                proxy_command += (
+                    f' -o ProxyCommand={shlex.quote(original_proxy)}')
+
+            credentials['ssh_proxy_command'] = proxy_command
+
+            # For Slurm, use the worker's internal IP as the SSH target
+            ips = handle.cached_internal_ips

         cluster_utils.SSHConfigHelper.add_cluster(
             handle.cluster_name,
-
+            ips,
             credentials,
             handle.cached_external_ssh_ports,
             handle.docker_user,
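Reviewer note: as a reference for the Slurm branch above, a minimal standalone sketch of the jump ProxyCommand it assembles. The key path, user, and login-node host below are hypothetical placeholders, not values from the diff.

import shlex

# Hypothetical stand-ins for the handle.* fields used in the branch above.
escaped_key_path = shlex.quote('/home/alice/.ssh/sky-key')
ssh_user, controller_host = 'alice', 'login.hpc.example.com'
original_proxy = None  # set when the login node itself sits behind a proxy

proxy_command = (f'ssh -tt -i {escaped_key_path} '
                 '-o StrictHostKeyChecking=no '
                 '-o UserKnownHostsFile=/dev/null '
                 '-o IdentitiesOnly=yes '
                 '-W %h:%p '
                 f'{ssh_user}@{controller_host}')
if original_proxy:
    proxy_command += f' -o ProxyCommand={shlex.quote(original_proxy)}'
print(proxy_command)

When SSHConfigHelper.add_cluster writes this into the SSH config, ssh expands %h:%p to the worker's internal IP and port, so the final `ssh <cluster>` connection hops through the login node to the compute node.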
@@ -832,7 +867,19 @@ class _NaturalOrderGroup(click.Group):
     """

     def list_commands(self, ctx):  # pylint: disable=unused-argument
-
+        # Preserve definition order but hide aliases (same command object) and
+        # commands explicitly marked as hidden.
+        seen_commands = set()
+        names = []
+        for name, command in self.commands.items():
+            if getattr(command, 'hidden', False):
+                continue
+            command_id = id(command)
+            if command_id in seen_commands:
+                continue
+            seen_commands.add(command_id)
+            names.append(name)
+        return names

     @usage_lib.entrypoint('sky.cli', fallback=True)
     def invoke(self, ctx):
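Reviewer note: a small self-contained sketch (not SkyPilot's actual CLI wiring) of why deduplicating on id(command) hides aliases: an alias is simply the same click command object registered under a second name, so only its first name survives.

import click


class NaturalOrderGroup(click.Group):
    """Sketch of the dedup logic above; not the real _NaturalOrderGroup."""

    def list_commands(self, ctx):  # pylint: disable=unused-argument
        seen, names = set(), []
        for name, command in self.commands.items():
            if getattr(command, 'hidden', False) or id(command) in seen:
                continue
            seen.add(id(command))
            names.append(name)
        return names


@click.group(cls=NaturalOrderGroup)
def cli():
    pass


@cli.command()
def launch():
    """Stand-in command for the sketch."""


# Hypothetical alias: the same command object registered under a second name.
cli.add_command(launch, name='up')

print(cli.list_commands(ctx=None))  # ['launch'] -- the 'up' alias is hidden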
@@ -3535,6 +3582,10 @@ def show_gpus(
     maximum quantities of the GPU available on a single node and the real-time
     availability of the GPU across all nodes in the Kubernetes cluster.

+    If ``--cloud slurm`` is specified, it will show the maximum quantities of
+    the GPU available on a single node and the real-time availability of the
+    GPU across all nodes in the Slurm cluster.
+
     Definitions of certain fields:

     * ``DEVICE_MEM``: Memory of a single device; does not depend on the device
@@ -3590,6 +3641,8 @@ def show_gpus(
     cloud_is_kubernetes = isinstance(
         cloud_obj, clouds.Kubernetes) and not isinstance(cloud_obj, clouds.SSH)
     cloud_is_ssh = isinstance(cloud_obj, clouds.SSH)
+    cloud_is_slurm = isinstance(cloud_obj, clouds.Slurm)
+
     # TODO(romilb): We should move this to the backend.
     kubernetes_autoscaling = skypilot_config.get_effective_region_config(
         cloud='kubernetes',
@@ -3598,6 +3651,7 @@ def show_gpus(
         default_value=None) is not None
     kubernetes_is_enabled = clouds.Kubernetes.canonical_name() in enabled_clouds
     ssh_is_enabled = clouds.SSH.canonical_name() in enabled_clouds
+    slurm_is_enabled = clouds.Slurm.canonical_name() in enabled_clouds
     query_k8s_realtime_gpu = (kubernetes_is_enabled and
                               (cloud_name is None or cloud_is_kubernetes))
     query_ssh_realtime_gpu = (ssh_is_enabled and
@@ -3657,8 +3711,9 @@ def show_gpus(
             raise ValueError(full_err_msg)
         no_permissions_str = '<no permissions>'
         realtime_gpu_infos = []
+        # Stores per-GPU totals as [ready_capacity, available, not_ready].
         total_gpu_info: Dict[str, List[int]] = collections.defaultdict(
-            lambda: [0, 0])
+            lambda: [0, 0, 0])
         all_nodes_info = []

         # display an aggregated table for all contexts
@@ -3669,6 +3724,33 @@ def show_gpus(

         num_filtered_contexts = 0

+        def _count_not_ready_gpus(
+            nodes_info: Optional['models.KubernetesNodesInfo']
+        ) -> Dict[str, int]:
+            """Return counts of GPUs on not ready nodes keyed by GPU type."""
+            not_ready_counts: Dict[str, int] = collections.defaultdict(int)
+            if nodes_info is None:
+                return not_ready_counts
+
+            node_info_dict = getattr(nodes_info, 'node_info_dict', {}) or {}
+            for node_info in node_info_dict.values():
+                accelerator_type = getattr(node_info, 'accelerator_type', None)
+                if not accelerator_type:
+                    continue
+
+                total_info = getattr(node_info, 'total', {})
+                accelerator_count = 0
+                if isinstance(total_info, dict):
+                    accelerator_count = int(
+                        total_info.get('accelerator_count', 0))
+                if accelerator_count <= 0:
+                    continue
+
+                node_is_ready = getattr(node_info, 'is_ready', True)
+                if not node_is_ready:
+                    not_ready_counts[accelerator_type] += accelerator_count
+            return not_ready_counts
+
         if realtime_gpu_availability_lists:
             for (ctx, availability_list) in realtime_gpu_availability_lists:
                 if not _filter_ctx(ctx):
@@ -3678,6 +3760,12 @@ def show_gpus(
                 else:
                     display_ctx = ctx
                 num_filtered_contexts += 1
+                # Collect node info for this context before building tables so
+                # we can exclude GPUs on not ready nodes from the totals.
+                nodes_info = sdk.stream_and_get(
+                    sdk.kubernetes_node_info(context=ctx))
+                context_not_ready_counts = _count_not_ready_gpus(nodes_info)
+
                 realtime_gpu_table = log_utils.create_table(
                     ['GPU', qty_header, 'UTILIZATION'])
                 for realtime_gpu_availability in sorted(availability_list):
@@ -3686,24 +3774,116 @@ def show_gpus(
                     available_qty = (gpu_availability.available
                                      if gpu_availability.available != -1 else
                                      no_permissions_str)
+                    # Exclude GPUs on not ready nodes from capacity counts.
+                    not_ready_count = min(
+                        context_not_ready_counts.get(gpu_availability.gpu, 0),
+                        gpu_availability.capacity)
+                    # Ensure capacity is never below the reported available
+                    # quantity (if available is unknown, treat as 0 for totals).
+                    available_for_totals = max(
+                        gpu_availability.available
+                        if gpu_availability.available != -1 else 0, 0)
+                    effective_capacity = max(
+                        gpu_availability.capacity - not_ready_count,
+                        available_for_totals)
+                    utilization = (
+                        f'{available_qty} of {effective_capacity} free')
+                    if not_ready_count > 0:
+                        utilization += f' ({not_ready_count} not ready)'
                     realtime_gpu_table.add_row([
                         gpu_availability.gpu,
                         _list_to_str(gpu_availability.counts),
-
+                        utilization,
                     ])
                     gpu = gpu_availability.gpu
-                    capacity = gpu_availability.capacity
                     # we want total, so skip permission denied.
-
-
-                    total_gpu_info[gpu][
-                    total_gpu_info[gpu][
+                    if effective_capacity > 0 or not_ready_count > 0:
+                        total_gpu_info[gpu][0] += effective_capacity
+                        total_gpu_info[gpu][1] += available_for_totals
+                        total_gpu_info[gpu][2] += not_ready_count
                 realtime_gpu_infos.append((display_ctx, realtime_gpu_table))
-                # Collect node info for this context
-                nodes_info = sdk.stream_and_get(
-                    sdk.kubernetes_node_info(context=ctx))
                 all_nodes_info.append((display_ctx, nodes_info))
         if num_filtered_contexts > 1:
+            total_realtime_gpu_table = log_utils.create_table(
+                ['GPU', 'UTILIZATION'])
+            for gpu, stats in total_gpu_info.items():
+                not_ready = stats[2]
+                utilization = f'{stats[1]} of {stats[0]} free'
+                if not_ready > 0:
+                    utilization += f' ({not_ready} not ready)'
+                total_realtime_gpu_table.add_row([gpu, utilization])
+        else:
+            total_realtime_gpu_table = None
+
+        return realtime_gpu_infos, total_realtime_gpu_table, all_nodes_info
+
+    def _get_slurm_realtime_gpu_tables(
+        name_filter: Optional[str] = None,
+        quantity_filter: Optional[int] = None
+    ) -> Tuple[List[Tuple[str, 'prettytable.PrettyTable']],
+               Optional['prettytable.PrettyTable']]:
+        """Get Slurm GPU availability tables.
+
+        Args:
+            name_filter: Filter GPUs by name.
+            quantity_filter: Filter GPUs by quantity.
+
+        Returns:
+            A tuple of (realtime_gpu_infos, total_realtime_gpu_table).
+        """
+        if quantity_filter:
+            qty_header = 'QTY_FILTER'
+        else:
+            qty_header = 'REQUESTABLE_QTY_PER_NODE'
+
+        realtime_gpu_availability_lists = sdk.stream_and_get(
+            sdk.realtime_slurm_gpu_availability(
+                name_filter=name_filter, quantity_filter=quantity_filter))
+        if not realtime_gpu_availability_lists:
+            err_msg = 'No GPUs found in any Slurm partition. '
+            debug_msg = 'To further debug, run: sky check slurm '
+            if name_filter is not None:
+                gpu_info_msg = f' {name_filter!r}'
+                if quantity_filter is not None:
+                    gpu_info_msg += (' with requested quantity'
+                                     f' {quantity_filter}')
+                err_msg = (f'Resources{gpu_info_msg} not found '
+                           'in any Slurm partition. ')
+                debug_msg = ('To show available accelerators on Slurm,'
+                             ' run: sky show-gpus --cloud slurm ')
+            raise ValueError(err_msg + debug_msg)
+
+        realtime_gpu_infos = []
+        total_gpu_info: Dict[str, List[int]] = collections.defaultdict(
+            lambda: [0, 0])
+
+        for (slurm_cluster,
+             availability_list) in realtime_gpu_availability_lists:
+            realtime_gpu_table = log_utils.create_table(
+                ['GPU', qty_header, 'UTILIZATION'])
+            for realtime_gpu_availability in sorted(availability_list):
+                gpu_availability = models.RealtimeGpuAvailability(
+                    *realtime_gpu_availability)
+                # Use the counts directly from the backend, which are already
+                # generated in powers of 2 (plus any actual maximums)
+                requestable_quantities = gpu_availability.counts
+                realtime_gpu_table.add_row([
+                    gpu_availability.gpu,
+                    _list_to_str(requestable_quantities),
+                    (f'{gpu_availability.available} of '
+                     f'{gpu_availability.capacity} free'),
+                ])
+                gpu = gpu_availability.gpu
+                capacity = gpu_availability.capacity
+                available = gpu_availability.available
+                if capacity > 0:
+                    total_gpu_info[gpu][0] += capacity
+                    total_gpu_info[gpu][1] += available
+            realtime_gpu_infos.append((slurm_cluster, realtime_gpu_table))
+
+        # display an aggregated table for all partitions
+        # if there are more than one partitions with GPUs
+        if len(realtime_gpu_infos) > 1:
             total_realtime_gpu_table = log_utils.create_table(
                 ['GPU', 'UTILIZATION'])
             for gpu, stats in total_gpu_info.items():
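Reviewer note: the new [ready_capacity, available, not_ready] bookkeeping above reduces to a small piece of arithmetic; a numeric sketch with hypothetical values:

# Hypothetical context: 8 GPUs total, 3 reported free, one 4-GPU node NotReady.
capacity, available, not_ready_on_nodes = 8, 3, 4

not_ready_count = min(not_ready_on_nodes, capacity)                  # 4
available_for_totals = max(available if available != -1 else 0, 0)   # 3
effective_capacity = max(capacity - not_ready_count,
                         available_for_totals)                       # 4

# Rendered as '3 of 4 free (4 not ready)' rather than '3 of 8 free'.
print(f'{available_for_totals} of {effective_capacity} free '
      f'({not_ready_count} not ready)')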
@@ -3712,7 +3892,7 @@ def show_gpus(
         else:
             total_realtime_gpu_table = None

-        return realtime_gpu_infos, total_realtime_gpu_table
+        return realtime_gpu_infos, total_realtime_gpu_table

     def _format_kubernetes_node_info_combined(
         contexts_info: List[Tuple[str, 'models.KubernetesNodesInfo']],
@@ -3736,11 +3916,16 @@ def show_gpus(
                 acc_type = node_info.accelerator_type
                 if acc_type is None:
                     acc_type = '-'
-
-
-                    f'{
-
-
+                utilization_str = (
+                    f'{available} of '
+                    f'{node_info.total["accelerator_count"]} free')
+                # Check if node is ready (defaults to True for backward
+                # compatibility with older server versions)
+                node_is_ready = getattr(node_info, 'is_ready', True)
+                if not node_is_ready:
+                    utilization_str += ' (Node NotReady)'
+                node_table.add_row(
+                    [context_name, node_name, acc_type, utilization_str])

         k8s_per_node_acc_message = (f'{cloud_str} per-node GPU availability')
         if hints:
@@ -3751,6 +3936,43 @@ def show_gpus(
                 f'{colorama.Style.RESET_ALL}\n'
                 f'{node_table.get_string()}')

+    def _format_slurm_node_info() -> str:
+        node_table = log_utils.create_table([
+            'CLUSTER',
+            'NODE',
+            'PARTITION',
+            'STATE',
+            'GPU',
+            'UTILIZATION',
+        ])
+
+        # Get all cluster names
+        slurm_cluster_names = clouds.Slurm.existing_allowed_clusters()
+
+        # Query each cluster
+        for cluster_name in slurm_cluster_names:
+            nodes_info = sdk.stream_and_get(
+                sdk.slurm_node_info(slurm_cluster_name=cluster_name))
+
+            for node_info in nodes_info:
+                node_table.add_row([
+                    cluster_name,
+                    node_info.get('node_name'),
+                    node_info.get('partition', '-'),
+                    node_info.get('node_state'),
+                    node_info.get('gpu_type') or '',
+                    (f'{node_info.get("free_gpus", 0)} of '
+                     f'{node_info.get("total_gpus", 0)} free'),
+                ])
+
+        slurm_per_node_msg = 'Slurm per node accelerator availability'
+        # Optional: Add hint message if needed, similar to k8s
+
+        return (f'{colorama.Fore.LIGHTMAGENTA_EX}{colorama.Style.NORMAL}'
+                f'{slurm_per_node_msg}'
+                f'{colorama.Style.RESET_ALL}\n'
+                f'{node_table.get_string()}')
+
     def _format_kubernetes_realtime_gpu(
         total_table: Optional['prettytable.PrettyTable'],
         k8s_realtime_infos: List[Tuple[str, 'prettytable.PrettyTable']],
@@ -3880,6 +4102,28 @@ def show_gpus(
                 return True, print_section_titles
         return False, print_section_titles

+    def _format_slurm_realtime_gpu(
+            total_table, slurm_realtime_infos,
+            show_node_info: bool) -> Generator[str, None, None]:
+        # print total table
+        yield (f'{colorama.Fore.GREEN}{colorama.Style.BRIGHT}'
+               'Slurm GPUs'
+               f'{colorama.Style.RESET_ALL}\n')
+        if total_table is not None:
+            yield from total_table.get_string()
+            yield '\n'
+
+        # print individual infos.
+        for (partition, slurm_realtime_table) in slurm_realtime_infos:
+            partition_str = f'Slurm Cluster: {partition}'
+            yield (f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
+                   f'{partition_str}'
+                   f'{colorama.Style.RESET_ALL}\n')
+            yield from slurm_realtime_table.get_string()
+            yield '\n'
+        if show_node_info:
+            yield _format_slurm_node_info()
+
     def _output() -> Generator[str, None, None]:
         gpu_table = log_utils.create_table(
             ['COMMON_GPU', 'AVAILABLE_QUANTITIES'])
@@ -3897,10 +4141,12 @@ def show_gpus(
         if cloud_name is None:
             clouds_to_list = [
                 c for c in constants.ALL_CLOUDS
-                if c != 'kubernetes' and c != 'ssh'
+                if c != 'kubernetes' and c != 'ssh' and c != 'slurm'
             ]

         k8s_messages = ''
+        slurm_messages = ''
+        k8s_printed = False
         if accelerator_str is None:
             # Collect k8s related messages in k8s_messages and print them at end
             print_section_titles = False
@@ -3912,6 +4158,7 @@ def show_gpus(
                     yield '\n\n'
                 stop_iter_one, print_section_titles_one, k8s_messages_one = (
                     yield from _possibly_show_k8s_like_realtime(is_ssh))
+                k8s_printed = True
                 stop_iter = stop_iter or stop_iter_one
                 print_section_titles = (print_section_titles or
                                         print_section_titles_one)
@@ -3919,11 +4166,45 @@ def show_gpus(
                 prev_print_section_titles = print_section_titles_one
             if stop_iter:
                 return
+            # If cloud is slurm, we want to show real-time capacity
+            if slurm_is_enabled and (cloud_name is None or cloud_is_slurm):
+                try:
+                    # If --cloud slurm is not specified, we want to catch
+                    # the case where no GPUs are available on the cluster and
+                    # print the warning at the end.
+                    slurm_realtime_infos, total_table = (
+                        _get_slurm_realtime_gpu_tables())
+                except ValueError as e:
+                    if not cloud_is_slurm:
+                        # Make it a note if cloud is not slurm
+                        slurm_messages += 'Note: '
+                    slurm_messages += str(e)
+                else:
+                    print_section_titles = True
+                    if k8s_printed:
+                        yield '\n'
+
+                    yield from _format_slurm_realtime_gpu(total_table,
+                                                          slurm_realtime_infos,
+                                                          show_node_info=True)
+
+            if cloud_is_slurm:
+                # Do not show clouds if --cloud slurm is specified
+                if not slurm_is_enabled:
+                    yield ('Slurm is not enabled. To fix, run: '
+                           'sky check slurm ')
+                yield slurm_messages
+                return

             # For show_all, show the k8s message at the start since output is
             # long and the user may not scroll to the end.
-            if show_all and k8s_messages:
-
+            if show_all and (k8s_messages or slurm_messages):
+                if k8s_messages:
+                    yield k8s_messages
+                if slurm_messages:
+                    if k8s_messages:
+                        yield '\n'
+                    yield slurm_messages
                 yield '\n\n'

             list_accelerator_counts_result = sdk.stream_and_get(
@@ -3971,9 +4252,10 @@ def show_gpus(
             else:
                 yield ('\n\nHint: use -a/--all to see all accelerators '
                        '(including non-common ones) and pricing.')
-            if k8s_messages:
+            if k8s_messages or slurm_messages:
                 yield '\n'
                 yield k8s_messages
+                yield slurm_messages
             return
         else:
             # Parse accelerator string
@@ -4013,6 +4295,31 @@ def show_gpus(
             if stop_iter:
                 return

+            # Handle Slurm filtering by name and quantity
+            if (slurm_is_enabled and (cloud_name is None or cloud_is_slurm) and
+                    not show_all):
+                # Print section title if not showing all and instead a specific
+                # accelerator is requested
+                print_section_titles = True
+                try:
+                    slurm_realtime_infos, total_table = (
+                        _get_slurm_realtime_gpu_tables(name_filter=name,
+                                                       quantity_filter=quantity))
+
+                    yield from _format_slurm_realtime_gpu(total_table,
+                                                          slurm_realtime_infos,
+                                                          show_node_info=False)
+                except ValueError as e:
+                    # In the case of a specific accelerator, show the error message
+                    # immediately (e.g., "Resources A10G not found ...")
+                    yield str(e)
+                    yield slurm_messages
+                if cloud_is_slurm:
+                    # Do not show clouds if --cloud slurm is specified
+                    if not slurm_is_enabled:
+                        yield ('Slurm is not enabled. To fix, run: '
+                               'sky check slurm ')
+                    return
             # For clouds other than Kubernetes, get the accelerator details
             # Case-sensitive
             list_accelerators_result = sdk.stream_and_get(
sky/client/sdk.py
CHANGED
@@ -42,6 +42,7 @@ from sky.server.requests import request_names
 from sky.server.requests import requests as requests_lib
 from sky.skylet import autostop_lib
 from sky.skylet import constants
+from sky.ssh_node_pools import utils as ssh_utils
 from sky.usage import usage_lib
 from sky.utils import admin_policy_utils
 from sky.utils import annotations
@@ -57,7 +58,6 @@ from sky.utils import status_lib
 from sky.utils import subprocess_utils
 from sky.utils import ux_utils
 from sky.utils import yaml_utils
-from sky.utils.kubernetes import ssh_utils

 if typing.TYPE_CHECKING:
     import base64
@@ -675,7 +675,7 @@ def _launch(
     clusters = get(status_request_id)
     cluster_user_hash = common_utils.get_user_hash()
     cluster_user_hash_str = ''
-    current_user = common_utils.
+    current_user = common_utils.get_local_user_name()
     cluster_user_name = current_user
     if not clusters:
         # Show the optimize log before the prompt if the cluster does not
@@ -2744,3 +2744,57 @@ def api_logout() -> None:
     _clear_api_server_config()
     logger.info(f'{colorama.Fore.GREEN}Logged out of SkyPilot API server.'
                 f'{colorama.Style.RESET_ALL}')
+
+
+@usage_lib.entrypoint
+@server_common.check_server_healthy_or_start
+@versions.minimal_api_version(24)
+@annotations.client_api
+def realtime_slurm_gpu_availability(
+        name_filter: Optional[str] = None,
+        quantity_filter: Optional[int] = None) -> server_common.RequestId:
+    """Gets the real-time Slurm GPU availability.
+
+    Args:
+        name_filter: Optional name filter for GPUs.
+        quantity_filter: Optional quantity filter for GPUs.
+
+    Returns:
+        The request ID of the Slurm GPU availability request.
+    """
+    body = payloads.SlurmGpuAvailabilityRequestBody(
+        name_filter=name_filter,
+        quantity_filter=quantity_filter,
+    )
+    response = server_common.make_authenticated_request(
+        'POST',
+        '/slurm_gpu_availability',
+        json=json.loads(body.model_dump_json()),
+    )
+    return server_common.get_request_id(response)
+
+
+@usage_lib.entrypoint
+@server_common.check_server_healthy_or_start
+@versions.minimal_api_version(24)
+@annotations.client_api
+def slurm_node_info(
+        slurm_cluster_name: Optional[str] = None) -> server_common.RequestId:
+    """Gets the resource information for all nodes in the Slurm cluster.
+
+    Returns:
+        The request ID of the Slurm node info request.
+
+    Request Returns:
+        List[Dict[str, Any]]: A list of dictionaries, each containing info
+            for a single Slurm node (node_name, partition, node_state,
+            gpu_type, total_gpus, free_gpus, vcpu_count, memory_gb).
+    """
+    body = payloads.SlurmNodeInfoRequestBody(
+        slurm_cluster_name=slurm_cluster_name)
+    response = server_common.make_authenticated_request(
+        'GET',
+        '/slurm_node_info',
+        json=json.loads(body.model_dump_json()),
+    )
+    return server_common.get_request_id(response)
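Reviewer note: both new endpoints follow the client SDK's usual asynchronous pattern, returning a request ID that is resolved with sdk.stream_and_get. A usage sketch (the cluster name 'my-slurm' and the H100 filter are hypothetical):

from sky.client import sdk

# Aggregate real-time GPU availability across Slurm partitions.
availability = sdk.stream_and_get(
    sdk.realtime_slurm_gpu_availability(name_filter='H100',
                                        quantity_filter=8))

# Per-node resource details for one configured Slurm cluster.
nodes = sdk.stream_and_get(
    sdk.slurm_node_info(slurm_cluster_name='my-slurm'))
for node in nodes:
    print(node['node_name'], node['node_state'],
          f"{node.get('free_gpus', 0)} of {node.get('total_gpus', 0)} free")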
sky/clouds/__init__.py
CHANGED
@@ -31,6 +31,7 @@ from sky.clouds.runpod import RunPod
 from sky.clouds.scp import SCP
 from sky.clouds.seeweb import Seeweb
 from sky.clouds.shadeform import Shadeform
+from sky.clouds.slurm import Slurm
 from sky.clouds.ssh import SSH
 from sky.clouds.vast import Vast
 from sky.clouds.vsphere import Vsphere
@@ -48,6 +49,7 @@ __all__ = [
     'Paperspace',
     'PrimeIntellect',
     'SCP',
+    'Slurm',
     'RunPod',
     'Shadeform',
     'Vast',
sky/clouds/cloud.py
CHANGED
@@ -182,6 +182,13 @@ class Cloud:
         """
         return cls._SUPPORTS_SERVICE_ACCOUNT_ON_REMOTE

+    @classmethod
+    def uses_ray(cls) -> bool:
+        """Returns whether this cloud uses Ray as the distributed
+        execution framework.
+        """
+        return True
+
     #### Regions/Zones ####

     @classmethod
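Reviewer note: the new hook defaults to True so existing clouds keep their Ray-based execution path; a cloud whose jobs are driven by an external scheduler would override it. Illustrative sketch only (this diff does not show how sky/clouds/slurm.py implements the override):

from sky.clouds import cloud


class NonRayCloud(cloud.Cloud):
    """Illustrative subclass for the sketch; not part of this diff."""

    @classmethod
    def uses_ray(cls) -> bool:
        # Execution is handed to the backend's own scheduler (for Slurm this
        # would be sbatch/srun) instead of a Ray cluster on the nodes.
        return False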