skypilot-nightly 1.0.0.dev20251203__py3-none-any.whl → 1.0.0.dev20251210__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (136)
  1. sky/__init__.py +4 -2
  2. sky/adaptors/aws.py +1 -61
  3. sky/adaptors/slurm.py +478 -0
  4. sky/backends/backend_utils.py +45 -4
  5. sky/backends/cloud_vm_ray_backend.py +32 -33
  6. sky/backends/task_codegen.py +340 -2
  7. sky/catalog/__init__.py +0 -3
  8. sky/catalog/kubernetes_catalog.py +12 -4
  9. sky/catalog/slurm_catalog.py +243 -0
  10. sky/check.py +14 -3
  11. sky/client/cli/command.py +329 -22
  12. sky/client/sdk.py +56 -2
  13. sky/clouds/__init__.py +2 -0
  14. sky/clouds/cloud.py +7 -0
  15. sky/clouds/slurm.py +578 -0
  16. sky/clouds/ssh.py +2 -1
  17. sky/clouds/vast.py +10 -0
  18. sky/core.py +128 -36
  19. sky/dashboard/out/404.html +1 -1
  20. sky/dashboard/out/_next/static/KYAhEFa3FTfq4JyKVgo-s/_buildManifest.js +1 -0
  21. sky/dashboard/out/_next/static/chunks/3294.ddda8c6c6f9f24dc.js +1 -0
  22. sky/dashboard/out/_next/static/chunks/3850-fd5696f3bbbaddae.js +1 -0
  23. sky/dashboard/out/_next/static/chunks/6856-da20c5fd999f319c.js +1 -0
  24. sky/dashboard/out/_next/static/chunks/6990-09cbf02d3cd518c3.js +1 -0
  25. sky/dashboard/out/_next/static/chunks/9353-8369df1cf105221c.js +1 -0
  26. sky/dashboard/out/_next/static/chunks/pages/_app-68b647e26f9d2793.js +34 -0
  27. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-33f525539665fdfd.js +16 -0
  28. sky/dashboard/out/_next/static/chunks/pages/clusters/{[cluster]-abfcac9c137aa543.js → [cluster]-a7565f586ef86467.js} +1 -1
  29. sky/dashboard/out/_next/static/chunks/pages/{clusters-ee39056f9851a3ff.js → clusters-9e5d47818b9bdadd.js} +1 -1
  30. sky/dashboard/out/_next/static/chunks/pages/{config-dfb9bf07b13045f4.js → config-718cdc365de82689.js} +1 -1
  31. sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-c0b5935149902e6f.js → [context]-12c559ec4d81fdbd.js} +1 -1
  32. sky/dashboard/out/_next/static/chunks/pages/{infra-aed0ea19df7cf961.js → infra-d187cd0413d72475.js} +1 -1
  33. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-895847b6cf200b04.js +16 -0
  34. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/{[pool]-9faf940b253e3e06.js → [pool]-8d0f4655400b4eb9.js} +2 -2
  35. sky/dashboard/out/_next/static/chunks/pages/{jobs-2072b48b617989c9.js → jobs-e5a98f17f8513a96.js} +1 -1
  36. sky/dashboard/out/_next/static/chunks/pages/plugins/[...slug]-4f46050ca065d8f8.js +1 -0
  37. sky/dashboard/out/_next/static/chunks/pages/{users-f42674164aa73423.js → users-2f7646eb77785a2c.js} +1 -1
  38. sky/dashboard/out/_next/static/chunks/pages/{volumes-b84b948ff357c43e.js → volumes-ef19d49c6d0e8500.js} +1 -1
  39. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-84a40f8c7c627fe4.js → [name]-96e0f298308da7e2.js} +1 -1
  40. sky/dashboard/out/_next/static/chunks/pages/{workspaces-531b2f8c4bf89f82.js → workspaces-cb4da3abe08ebf19.js} +1 -1
  41. sky/dashboard/out/_next/static/chunks/{webpack-64e05f17bf2cf8ce.js → webpack-fba3de387ff6bb08.js} +1 -1
  42. sky/dashboard/out/_next/static/css/c5a4cfd2600fc715.css +3 -0
  43. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  44. sky/dashboard/out/clusters/[cluster].html +1 -1
  45. sky/dashboard/out/clusters.html +1 -1
  46. sky/dashboard/out/config.html +1 -1
  47. sky/dashboard/out/index.html +1 -1
  48. sky/dashboard/out/infra/[context].html +1 -1
  49. sky/dashboard/out/infra.html +1 -1
  50. sky/dashboard/out/jobs/[job].html +1 -1
  51. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  52. sky/dashboard/out/jobs.html +1 -1
  53. sky/dashboard/out/plugins/[...slug].html +1 -0
  54. sky/dashboard/out/users.html +1 -1
  55. sky/dashboard/out/volumes.html +1 -1
  56. sky/dashboard/out/workspace/new.html +1 -1
  57. sky/dashboard/out/workspaces/[name].html +1 -1
  58. sky/dashboard/out/workspaces.html +1 -1
  59. sky/data/mounting_utils.py +16 -2
  60. sky/global_user_state.py +3 -3
  61. sky/models.py +2 -0
  62. sky/optimizer.py +6 -5
  63. sky/provision/__init__.py +1 -0
  64. sky/provision/common.py +20 -0
  65. sky/provision/docker_utils.py +15 -2
  66. sky/provision/kubernetes/utils.py +42 -6
  67. sky/provision/provisioner.py +15 -6
  68. sky/provision/slurm/__init__.py +12 -0
  69. sky/provision/slurm/config.py +13 -0
  70. sky/provision/slurm/instance.py +572 -0
  71. sky/provision/slurm/utils.py +583 -0
  72. sky/provision/vast/instance.py +4 -1
  73. sky/provision/vast/utils.py +10 -6
  74. sky/serve/server/impl.py +1 -1
  75. sky/server/constants.py +1 -1
  76. sky/server/plugins.py +222 -0
  77. sky/server/requests/executor.py +5 -2
  78. sky/server/requests/payloads.py +12 -1
  79. sky/server/requests/request_names.py +2 -0
  80. sky/server/requests/requests.py +5 -1
  81. sky/server/requests/serializers/encoders.py +17 -0
  82. sky/server/requests/serializers/return_value_serializers.py +60 -0
  83. sky/server/server.py +78 -8
  84. sky/server/server_utils.py +30 -0
  85. sky/setup_files/dependencies.py +2 -0
  86. sky/skylet/attempt_skylet.py +13 -3
  87. sky/skylet/constants.py +34 -9
  88. sky/skylet/events.py +10 -4
  89. sky/skylet/executor/__init__.py +1 -0
  90. sky/skylet/executor/slurm.py +189 -0
  91. sky/skylet/job_lib.py +2 -1
  92. sky/skylet/log_lib.py +22 -6
  93. sky/skylet/log_lib.pyi +8 -6
  94. sky/skylet/skylet.py +5 -1
  95. sky/skylet/subprocess_daemon.py +2 -1
  96. sky/ssh_node_pools/constants.py +12 -0
  97. sky/ssh_node_pools/core.py +40 -3
  98. sky/ssh_node_pools/deploy/__init__.py +4 -0
  99. sky/{utils/kubernetes/deploy_ssh_node_pools.py → ssh_node_pools/deploy/deploy.py} +279 -504
  100. sky/ssh_node_pools/deploy/tunnel_utils.py +199 -0
  101. sky/ssh_node_pools/deploy/utils.py +173 -0
  102. sky/ssh_node_pools/server.py +11 -13
  103. sky/{utils/kubernetes/ssh_utils.py → ssh_node_pools/utils.py} +9 -6
  104. sky/templates/kubernetes-ray.yml.j2 +8 -0
  105. sky/templates/slurm-ray.yml.j2 +85 -0
  106. sky/templates/vast-ray.yml.j2 +1 -0
  107. sky/users/model.conf +1 -1
  108. sky/users/permission.py +24 -1
  109. sky/users/rbac.py +31 -3
  110. sky/utils/annotations.py +108 -8
  111. sky/utils/command_runner.py +197 -5
  112. sky/utils/command_runner.pyi +27 -4
  113. sky/utils/common_utils.py +18 -3
  114. sky/utils/kubernetes/kubernetes_deploy_utils.py +2 -94
  115. sky/utils/kubernetes/ssh-tunnel.sh +7 -376
  116. sky/utils/schemas.py +31 -0
  117. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/METADATA +48 -36
  118. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/RECORD +125 -107
  119. sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +0 -1
  120. sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +0 -1
  121. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +0 -1
  122. sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +0 -1
  123. sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +0 -1
  124. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +0 -1
  125. sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +0 -34
  126. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +0 -16
  127. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +0 -16
  128. sky/dashboard/out/_next/static/css/0748ce22df867032.css +0 -3
  129. sky/utils/kubernetes/cleanup-tunnel.sh +0 -62
  130. /sky/dashboard/out/_next/static/{96_E2yl3QAiIJGOYCkSpB → KYAhEFa3FTfq4JyKVgo-s}/_ssgManifest.js +0 -0
  131. /sky/dashboard/out/_next/static/chunks/{1141-e6aa9ab418717c59.js → 1141-9c810f01ff4f398a.js} +0 -0
  132. /sky/dashboard/out/_next/static/chunks/{3800-7b45f9fbb6308557.js → 3800-b589397dc09c5b4e.js} +0 -0
  133. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/WHEEL +0 -0
  134. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/entry_points.txt +0 -0
  135. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/licenses/LICENSE +0 -0
  136. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/top_level.txt +0 -0
sky/client/cli/command.py CHANGED
@@ -189,6 +189,7 @@ def _get_cluster_records_and_set_ssh_config(
  # can still exist in the record, and we check for credentials to avoid
  # updating the SSH config for non-existent clusters.
  credentials = record['credentials']
+ ips = handle.cached_external_ips
  if isinstance(handle.launched_resources.cloud, clouds.Kubernetes):
  # Replace the proxy command to proxy through the SkyPilot API
  # server with websocket.
@@ -217,10 +218,44 @@ def _get_cluster_records_and_set_ssh_config(
  f'{server_common.get_server_url()} '
  f'{handle.cluster_name}\"')
  credentials['ssh_proxy_command'] = proxy_command
+ elif isinstance(handle.launched_resources.cloud, clouds.Slurm):
+ # TODO(kevin): This is a temporary workaround, ideally we want to
+ # get a shell through srun --pty bash on the existing sbatch job.
+
+ # Proxy through the controller/login node to reach the worker node.
+ if (handle.cached_internal_ips is None or
+ not handle.cached_internal_ips):
+ logger.debug(
+ f'Cluster {name} does not have cached internal IPs. '
+ 'Skipping SSH config update.')
+ cluster_utils.SSHConfigHelper.remove_cluster(name)
+ continue
+
+ escaped_key_path = shlex.quote(
+ cluster_utils.SSHConfigHelper.generate_local_key_file(
+ handle.cluster_name, credentials))
+ controller_host = handle.cached_external_ips[0]
+
+ # Build jump proxy: ssh to worker via controller/login node
+ proxy_command = (f'ssh -tt -i {escaped_key_path} '
+ '-o StrictHostKeyChecking=no '
+ '-o UserKnownHostsFile=/dev/null '
+ '-o IdentitiesOnly=yes '
+ '-W %h:%p '
+ f'{handle.ssh_user}@{controller_host}')
+ original_proxy = credentials.get('ssh_proxy_command')
+ if original_proxy:
+ proxy_command += (
+ f' -o ProxyCommand={shlex.quote(original_proxy)}')
+
+ credentials['ssh_proxy_command'] = proxy_command
+
+ # For Slurm, use the worker's internal IP as the SSH target
+ ips = handle.cached_internal_ips

  cluster_utils.SSHConfigHelper.add_cluster(
  handle.cluster_name,
- handle.cached_external_ips,
+ ips,
  credentials,
  handle.cached_external_ssh_ports,
  handle.docker_user,
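
The Slurm branch above reaches a worker node by jumping through the controller/login node with `ssh -W %h:%p`, optionally chaining any pre-existing proxy in front. A minimal sketch of that construction, assuming a key path, SSH user, and login-node address are already known (the helper name below is hypothetical, not part of the release):

import shlex
from typing import Optional

def build_slurm_jump_proxy(key_path: str, ssh_user: str,
                           controller_host: str,
                           original_proxy: Optional[str] = None) -> str:
    # 'ssh -W %h:%p' turns the login node into a stdio forwarder, so the
    # outer ssh connection can target the worker's internal IP directly.
    cmd = (f'ssh -tt -i {shlex.quote(key_path)} '
           '-o StrictHostKeyChecking=no '
           '-o UserKnownHostsFile=/dev/null '
           '-o IdentitiesOnly=yes '
           '-W %h:%p '
           f'{ssh_user}@{controller_host}')
    if original_proxy:
        # Chain an existing proxy (e.g. one through the API server) in
        # front of the jump, mirroring the diff above.
        cmd += f' -o ProxyCommand={shlex.quote(original_proxy)}'
    return cmd
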
@@ -832,7 +867,19 @@ class _NaturalOrderGroup(click.Group):
  """

  def list_commands(self, ctx): # pylint: disable=unused-argument
- return self.commands.keys()
+ # Preserve definition order but hide aliases (same command object) and
+ # commands explicitly marked as hidden.
+ seen_commands = set()
+ names = []
+ for name, command in self.commands.items():
+ if getattr(command, 'hidden', False):
+ continue
+ command_id = id(command)
+ if command_id in seen_commands:
+ continue
+ seen_commands.add(command_id)
+ names.append(name)
+ return names

  @usage_lib.entrypoint('sky.cli', fallback=True)
  def invoke(self, ctx):
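
`list_commands` now filters out hidden commands and aliases that point at the same `click.Command` object, while preserving definition order. A small self-contained sketch of that behavior (the `up` alias below is made up for illustration):

import click

class NaturalOrderGroup(click.Group):
    def list_commands(self, ctx):  # same dedup logic as the diff above
        seen = set()
        names = []
        for name, command in self.commands.items():
            if getattr(command, 'hidden', False):
                continue  # skip commands explicitly marked hidden
            if id(command) in seen:
                continue  # skip aliases of an already-listed command
            seen.add(id(command))
            names.append(name)
        return names

@click.group(cls=NaturalOrderGroup)
def cli():
    pass

@cli.command()
def launch():
    pass

# Hypothetical alias: the same Command object registered under a second name.
cli.add_command(launch, name='up')

assert cli.list_commands(None) == ['launch']  # 'up' is hidden from listings
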
@@ -3535,6 +3582,10 @@ def show_gpus(
  maximum quantities of the GPU available on a single node and the real-time
  availability of the GPU across all nodes in the Kubernetes cluster.

+ If ``--cloud slurm`` is specified, it will show the maximum quantities of
+ the GPU available on a single node and the real-time availability of the
+ GPU across all nodes in the Slurm cluster.
+
  Definitions of certain fields:

  * ``DEVICE_MEM``: Memory of a single device; does not depend on the device
@@ -3590,6 +3641,8 @@ def show_gpus(
  cloud_is_kubernetes = isinstance(
  cloud_obj, clouds.Kubernetes) and not isinstance(cloud_obj, clouds.SSH)
  cloud_is_ssh = isinstance(cloud_obj, clouds.SSH)
+ cloud_is_slurm = isinstance(cloud_obj, clouds.Slurm)
+
  # TODO(romilb): We should move this to the backend.
  kubernetes_autoscaling = skypilot_config.get_effective_region_config(
  cloud='kubernetes',
@@ -3598,6 +3651,7 @@ def show_gpus(
  default_value=None) is not None
  kubernetes_is_enabled = clouds.Kubernetes.canonical_name() in enabled_clouds
  ssh_is_enabled = clouds.SSH.canonical_name() in enabled_clouds
+ slurm_is_enabled = clouds.Slurm.canonical_name() in enabled_clouds
  query_k8s_realtime_gpu = (kubernetes_is_enabled and
  (cloud_name is None or cloud_is_kubernetes))
  query_ssh_realtime_gpu = (ssh_is_enabled and
@@ -3657,8 +3711,9 @@ def show_gpus(
  raise ValueError(full_err_msg)
  no_permissions_str = '<no permissions>'
  realtime_gpu_infos = []
+ # Stores per-GPU totals as [ready_capacity, available, not_ready].
  total_gpu_info: Dict[str, List[int]] = collections.defaultdict(
- lambda: [0, 0])
+ lambda: [0, 0, 0])
  all_nodes_info = []

  # display an aggregated table for all contexts
@@ -3669,6 +3724,33 @@ def show_gpus(

  num_filtered_contexts = 0

+ def _count_not_ready_gpus(
+ nodes_info: Optional['models.KubernetesNodesInfo']
+ ) -> Dict[str, int]:
+ """Return counts of GPUs on not ready nodes keyed by GPU type."""
+ not_ready_counts: Dict[str, int] = collections.defaultdict(int)
+ if nodes_info is None:
+ return not_ready_counts
+
+ node_info_dict = getattr(nodes_info, 'node_info_dict', {}) or {}
+ for node_info in node_info_dict.values():
+ accelerator_type = getattr(node_info, 'accelerator_type', None)
+ if not accelerator_type:
+ continue
+
+ total_info = getattr(node_info, 'total', {})
+ accelerator_count = 0
+ if isinstance(total_info, dict):
+ accelerator_count = int(
+ total_info.get('accelerator_count', 0))
+ if accelerator_count <= 0:
+ continue
+
+ node_is_ready = getattr(node_info, 'is_ready', True)
+ if not node_is_ready:
+ not_ready_counts[accelerator_type] += accelerator_count
+ return not_ready_counts
+
  if realtime_gpu_availability_lists:
  for (ctx, availability_list) in realtime_gpu_availability_lists:
  if not _filter_ctx(ctx):
@@ -3678,6 +3760,12 @@ def show_gpus(
  else:
  display_ctx = ctx
  num_filtered_contexts += 1
+ # Collect node info for this context before building tables so
+ # we can exclude GPUs on not ready nodes from the totals.
+ nodes_info = sdk.stream_and_get(
+ sdk.kubernetes_node_info(context=ctx))
+ context_not_ready_counts = _count_not_ready_gpus(nodes_info)
+
  realtime_gpu_table = log_utils.create_table(
  ['GPU', qty_header, 'UTILIZATION'])
  for realtime_gpu_availability in sorted(availability_list):
@@ -3686,24 +3774,116 @@ def show_gpus(
  available_qty = (gpu_availability.available
  if gpu_availability.available != -1 else
  no_permissions_str)
+ # Exclude GPUs on not ready nodes from capacity counts.
+ not_ready_count = min(
+ context_not_ready_counts.get(gpu_availability.gpu, 0),
+ gpu_availability.capacity)
+ # Ensure capacity is never below the reported available
+ # quantity (if available is unknown, treat as 0 for totals).
+ available_for_totals = max(
+ gpu_availability.available
+ if gpu_availability.available != -1 else 0, 0)
+ effective_capacity = max(
+ gpu_availability.capacity - not_ready_count,
+ available_for_totals)
+ utilization = (
+ f'{available_qty} of {effective_capacity} free')
+ if not_ready_count > 0:
+ utilization += f' ({not_ready_count} not ready)'
  realtime_gpu_table.add_row([
  gpu_availability.gpu,
  _list_to_str(gpu_availability.counts),
- f'{available_qty} of {gpu_availability.capacity} free',
+ utilization,
  ])
  gpu = gpu_availability.gpu
- capacity = gpu_availability.capacity
  # we want total, so skip permission denied.
- available = max(gpu_availability.available, 0)
- if capacity > 0:
- total_gpu_info[gpu][0] += capacity
- total_gpu_info[gpu][1] += available
+ if effective_capacity > 0 or not_ready_count > 0:
+ total_gpu_info[gpu][0] += effective_capacity
+ total_gpu_info[gpu][1] += available_for_totals
+ total_gpu_info[gpu][2] += not_ready_count
  realtime_gpu_infos.append((display_ctx, realtime_gpu_table))
- # Collect node info for this context
- nodes_info = sdk.stream_and_get(
- sdk.kubernetes_node_info(context=ctx))
  all_nodes_info.append((display_ctx, nodes_info))
  if num_filtered_contexts > 1:
+ total_realtime_gpu_table = log_utils.create_table(
+ ['GPU', 'UTILIZATION'])
+ for gpu, stats in total_gpu_info.items():
+ not_ready = stats[2]
+ utilization = f'{stats[1]} of {stats[0]} free'
+ if not_ready > 0:
+ utilization += f' ({not_ready} not ready)'
+ total_realtime_gpu_table.add_row([gpu, utilization])
+ else:
+ total_realtime_gpu_table = None
+
+ return realtime_gpu_infos, total_realtime_gpu_table, all_nodes_info
+
+ def _get_slurm_realtime_gpu_tables(
+ name_filter: Optional[str] = None,
+ quantity_filter: Optional[int] = None
+ ) -> Tuple[List[Tuple[str, 'prettytable.PrettyTable']],
+ Optional['prettytable.PrettyTable']]:
+ """Get Slurm GPU availability tables.
+
+ Args:
+ name_filter: Filter GPUs by name.
+ quantity_filter: Filter GPUs by quantity.
+
+ Returns:
+ A tuple of (realtime_gpu_infos, total_realtime_gpu_table).
+ """
+ if quantity_filter:
+ qty_header = 'QTY_FILTER'
+ else:
+ qty_header = 'REQUESTABLE_QTY_PER_NODE'
+
+ realtime_gpu_availability_lists = sdk.stream_and_get(
+ sdk.realtime_slurm_gpu_availability(
+ name_filter=name_filter, quantity_filter=quantity_filter))
+ if not realtime_gpu_availability_lists:
+ err_msg = 'No GPUs found in any Slurm partition. '
+ debug_msg = 'To further debug, run: sky check slurm '
+ if name_filter is not None:
+ gpu_info_msg = f' {name_filter!r}'
+ if quantity_filter is not None:
+ gpu_info_msg += (' with requested quantity'
+ f' {quantity_filter}')
+ err_msg = (f'Resources{gpu_info_msg} not found '
+ 'in any Slurm partition. ')
+ debug_msg = ('To show available accelerators on Slurm,'
+ ' run: sky show-gpus --cloud slurm ')
+ raise ValueError(err_msg + debug_msg)
+
+ realtime_gpu_infos = []
+ total_gpu_info: Dict[str, List[int]] = collections.defaultdict(
+ lambda: [0, 0])
+
+ for (slurm_cluster,
+ availability_list) in realtime_gpu_availability_lists:
+ realtime_gpu_table = log_utils.create_table(
+ ['GPU', qty_header, 'UTILIZATION'])
+ for realtime_gpu_availability in sorted(availability_list):
+ gpu_availability = models.RealtimeGpuAvailability(
+ *realtime_gpu_availability)
+ # Use the counts directly from the backend, which are already
+ # generated in powers of 2 (plus any actual maximums)
+ requestable_quantities = gpu_availability.counts
+ realtime_gpu_table.add_row([
+ gpu_availability.gpu,
+ _list_to_str(requestable_quantities),
+ (f'{gpu_availability.available} of '
+ f'{gpu_availability.capacity} free'),
+ ])
+ gpu = gpu_availability.gpu
+ capacity = gpu_availability.capacity
+ available = gpu_availability.available
+ if capacity > 0:
+ total_gpu_info[gpu][0] += capacity
+ total_gpu_info[gpu][1] += available
+ realtime_gpu_infos.append((slurm_cluster, realtime_gpu_table))
+
+ # display an aggregated table for all partitions
+ # if there are more than one partitions with GPUs
+ if len(realtime_gpu_infos) > 1:
  total_realtime_gpu_table = log_utils.create_table(
  ['GPU', 'UTILIZATION'])
  for gpu, stats in total_gpu_info.items():
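
The Kubernetes totals above now track a third counter for GPUs on NotReady nodes, and capacity is reduced by that amount but never below the reported free count. A worked example with illustrative numbers only:

# Illustrative values only.
capacity = 8          # GPUs reported for a context
not_ready_count = 2   # GPUs sitting on NotReady nodes (capped at capacity)
available = 3         # currently free GPUs; -1 would mean 'no permissions'

available_for_totals = max(available if available != -1 else 0, 0)
effective_capacity = max(capacity - not_ready_count, available_for_totals)

print(f'{available} of {effective_capacity} free ({not_ready_count} not ready)')
# -> 3 of 6 free (2 not ready)
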
@@ -3712,7 +3892,7 @@ def show_gpus(
  else:
  total_realtime_gpu_table = None

- return realtime_gpu_infos, total_realtime_gpu_table, all_nodes_info
+ return realtime_gpu_infos, total_realtime_gpu_table

  def _format_kubernetes_node_info_combined(
  contexts_info: List[Tuple[str, 'models.KubernetesNodesInfo']],
@@ -3736,11 +3916,16 @@ def show_gpus(
  acc_type = node_info.accelerator_type
  if acc_type is None:
  acc_type = '-'
- node_table.add_row([
- context_name, node_name, acc_type,
- f'{available} of {node_info.total["accelerator_count"]} '
- 'free'
- ])
+ utilization_str = (
+ f'{available} of '
+ f'{node_info.total["accelerator_count"]} free')
+ # Check if node is ready (defaults to True for backward
+ # compatibility with older server versions)
+ node_is_ready = getattr(node_info, 'is_ready', True)
+ if not node_is_ready:
+ utilization_str += ' (Node NotReady)'
+ node_table.add_row(
+ [context_name, node_name, acc_type, utilization_str])

  k8s_per_node_acc_message = (f'{cloud_str} per-node GPU availability')
  if hints:
@@ -3751,6 +3936,43 @@ def show_gpus(
  f'{colorama.Style.RESET_ALL}\n'
  f'{node_table.get_string()}')

+ def _format_slurm_node_info() -> str:
+ node_table = log_utils.create_table([
+ 'CLUSTER',
+ 'NODE',
+ 'PARTITION',
+ 'STATE',
+ 'GPU',
+ 'UTILIZATION',
+ ])
+
+ # Get all cluster names
+ slurm_cluster_names = clouds.Slurm.existing_allowed_clusters()
+
+ # Query each cluster
+ for cluster_name in slurm_cluster_names:
+ nodes_info = sdk.stream_and_get(
+ sdk.slurm_node_info(slurm_cluster_name=cluster_name))
+
+ for node_info in nodes_info:
+ node_table.add_row([
+ cluster_name,
+ node_info.get('node_name'),
+ node_info.get('partition', '-'),
+ node_info.get('node_state'),
+ node_info.get('gpu_type') or '',
+ (f'{node_info.get("free_gpus", 0)} of '
+ f'{node_info.get("total_gpus", 0)} free'),
+ ])
+
+ slurm_per_node_msg = 'Slurm per node accelerator availability'
+ # Optional: Add hint message if needed, similar to k8s
+
+ return (f'{colorama.Fore.LIGHTMAGENTA_EX}{colorama.Style.NORMAL}'
+ f'{slurm_per_node_msg}'
+ f'{colorama.Style.RESET_ALL}\n'
+ f'{node_table.get_string()}')
+
  def _format_kubernetes_realtime_gpu(
  total_table: Optional['prettytable.PrettyTable'],
  k8s_realtime_infos: List[Tuple[str, 'prettytable.PrettyTable']],
@@ -3880,6 +4102,28 @@ def show_gpus(
  return True, print_section_titles
  return False, print_section_titles

+ def _format_slurm_realtime_gpu(
+ total_table, slurm_realtime_infos,
+ show_node_info: bool) -> Generator[str, None, None]:
+ # print total table
+ yield (f'{colorama.Fore.GREEN}{colorama.Style.BRIGHT}'
+ 'Slurm GPUs'
+ f'{colorama.Style.RESET_ALL}\n')
+ if total_table is not None:
+ yield from total_table.get_string()
+ yield '\n'
+
+ # print individual infos.
+ for (partition, slurm_realtime_table) in slurm_realtime_infos:
+ partition_str = f'Slurm Cluster: {partition}'
+ yield (f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
+ f'{partition_str}'
+ f'{colorama.Style.RESET_ALL}\n')
+ yield from slurm_realtime_table.get_string()
+ yield '\n'
+ if show_node_info:
+ yield _format_slurm_node_info()
+
  def _output() -> Generator[str, None, None]:
  gpu_table = log_utils.create_table(
  ['COMMON_GPU', 'AVAILABLE_QUANTITIES'])
@@ -3897,10 +4141,12 @@ def show_gpus(
  if cloud_name is None:
  clouds_to_list = [
  c for c in constants.ALL_CLOUDS
- if c != 'kubernetes' and c != 'ssh'
+ if c != 'kubernetes' and c != 'ssh' and c != 'slurm'
  ]

  k8s_messages = ''
+ slurm_messages = ''
+ k8s_printed = False
  if accelerator_str is None:
  # Collect k8s related messages in k8s_messages and print them at end
  print_section_titles = False
@@ -3912,6 +4158,7 @@ def show_gpus(
  yield '\n\n'
  stop_iter_one, print_section_titles_one, k8s_messages_one = (
  yield from _possibly_show_k8s_like_realtime(is_ssh))
+ k8s_printed = True
  stop_iter = stop_iter or stop_iter_one
  print_section_titles = (print_section_titles or
  print_section_titles_one)
@@ -3919,11 +4166,45 @@ def show_gpus(
  prev_print_section_titles = print_section_titles_one
  if stop_iter:
  return
+ # If cloud is slurm, we want to show real-time capacity
+ if slurm_is_enabled and (cloud_name is None or cloud_is_slurm):
+ try:
+ # If --cloud slurm is not specified, we want to catch
+ # the case where no GPUs are available on the cluster and
+ # print the warning at the end.
+ slurm_realtime_infos, total_table = (
+ _get_slurm_realtime_gpu_tables())
+ except ValueError as e:
+ if not cloud_is_slurm:
+ # Make it a note if cloud is not slurm
+ slurm_messages += 'Note: '
+ slurm_messages += str(e)
+ else:
+ print_section_titles = True
+ if k8s_printed:
+ yield '\n'
+
+ yield from _format_slurm_realtime_gpu(total_table,
+ slurm_realtime_infos,
+ show_node_info=True)
+
+ if cloud_is_slurm:
+ # Do not show clouds if --cloud slurm is specified
+ if not slurm_is_enabled:
+ yield ('Slurm is not enabled. To fix, run: '
+ 'sky check slurm ')
+ yield slurm_messages
+ return

  # For show_all, show the k8s message at the start since output is
  # long and the user may not scroll to the end.
- if show_all and k8s_messages:
- yield k8s_messages
+ if show_all and (k8s_messages or slurm_messages):
+ if k8s_messages:
+ yield k8s_messages
+ if slurm_messages:
+ if k8s_messages:
+ yield '\n'
+ yield slurm_messages
  yield '\n\n'

  list_accelerator_counts_result = sdk.stream_and_get(
@@ -3971,9 +4252,10 @@ def show_gpus(
  else:
  yield ('\n\nHint: use -a/--all to see all accelerators '
  '(including non-common ones) and pricing.')
- if k8s_messages:
+ if k8s_messages or slurm_messages:
  yield '\n'
  yield k8s_messages
+ yield slurm_messages
  return
  else:
  # Parse accelerator string
@@ -4013,6 +4295,31 @@ def show_gpus(
  if stop_iter:
  return

+ # Handle Slurm filtering by name and quantity
+ if (slurm_is_enabled and (cloud_name is None or cloud_is_slurm) and
+ not show_all):
+ # Print section title if not showing all and instead a specific
+ # accelerator is requested
+ print_section_titles = True
+ try:
+ slurm_realtime_infos, total_table = (
+ _get_slurm_realtime_gpu_tables(name_filter=name,
+ quantity_filter=quantity))
+
+ yield from _format_slurm_realtime_gpu(total_table,
+ slurm_realtime_infos,
+ show_node_info=False)
+ except ValueError as e:
+ # In the case of a specific accelerator, show the error message
+ # immediately (e.g., "Resources A10G not found ...")
+ yield str(e)
+ yield slurm_messages
+ if cloud_is_slurm:
+ # Do not show clouds if --cloud slurm is specified
+ if not slurm_is_enabled:
+ yield ('Slurm is not enabled. To fix, run: '
+ 'sky check slurm ')
+ return
  # For clouds other than Kubernetes, get the accelerator details
  # Case-sensitive
  list_accelerators_result = sdk.stream_and_get(
sky/client/sdk.py CHANGED
@@ -42,6 +42,7 @@ from sky.server.requests import request_names
  from sky.server.requests import requests as requests_lib
  from sky.skylet import autostop_lib
  from sky.skylet import constants
+ from sky.ssh_node_pools import utils as ssh_utils
  from sky.usage import usage_lib
  from sky.utils import admin_policy_utils
  from sky.utils import annotations
@@ -57,7 +58,6 @@ from sky.utils import status_lib
  from sky.utils import subprocess_utils
  from sky.utils import ux_utils
  from sky.utils import yaml_utils
- from sky.utils.kubernetes import ssh_utils

  if typing.TYPE_CHECKING:
  import base64
@@ -675,7 +675,7 @@ def _launch(
  clusters = get(status_request_id)
  cluster_user_hash = common_utils.get_user_hash()
  cluster_user_hash_str = ''
- current_user = common_utils.get_current_user_name()
+ current_user = common_utils.get_local_user_name()
  cluster_user_name = current_user
  if not clusters:
  # Show the optimize log before the prompt if the cluster does not
@@ -2744,3 +2744,57 @@ def api_logout() -> None:
  _clear_api_server_config()
  logger.info(f'{colorama.Fore.GREEN}Logged out of SkyPilot API server.'
  f'{colorama.Style.RESET_ALL}')
+
+
+ @usage_lib.entrypoint
+ @server_common.check_server_healthy_or_start
+ @versions.minimal_api_version(24)
+ @annotations.client_api
+ def realtime_slurm_gpu_availability(
+ name_filter: Optional[str] = None,
+ quantity_filter: Optional[int] = None) -> server_common.RequestId:
+ """Gets the real-time Slurm GPU availability.
+
+ Args:
+ name_filter: Optional name filter for GPUs.
+ quantity_filter: Optional quantity filter for GPUs.
+
+ Returns:
+ The request ID of the Slurm GPU availability request.
+ """
+ body = payloads.SlurmGpuAvailabilityRequestBody(
+ name_filter=name_filter,
+ quantity_filter=quantity_filter,
+ )
+ response = server_common.make_authenticated_request(
+ 'POST',
+ '/slurm_gpu_availability',
+ json=json.loads(body.model_dump_json()),
+ )
+ return server_common.get_request_id(response)
+
+
+ @usage_lib.entrypoint
+ @server_common.check_server_healthy_or_start
+ @versions.minimal_api_version(24)
+ @annotations.client_api
+ def slurm_node_info(
+ slurm_cluster_name: Optional[str] = None) -> server_common.RequestId:
+ """Gets the resource information for all nodes in the Slurm cluster.
+
+ Returns:
+ The request ID of the Slurm node info request.
+
+ Request Returns:
+ List[Dict[str, Any]]: A list of dictionaries, each containing info
+ for a single Slurm node (node_name, partition, node_state,
+ gpu_type, total_gpus, free_gpus, vcpu_count, memory_gb).
+ """
+ body = payloads.SlurmNodeInfoRequestBody(
+ slurm_cluster_name=slurm_cluster_name)
+ response = server_common.make_authenticated_request(
+ 'GET',
+ '/slurm_node_info',
+ json=json.loads(body.model_dump_json()),
+ )
+ return server_common.get_request_id(response)
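
Both new SDK calls follow the existing asynchronous pattern: they return a request ID that is resolved with `sdk.stream_and_get()`, exactly as the `show-gpus` code paths above do. A hedged usage sketch (the cluster name below is hypothetical):

from sky.client import sdk

availability = sdk.stream_and_get(
    sdk.realtime_slurm_gpu_availability(name_filter='A100',
                                        quantity_filter=4))

# Hypothetical cluster name; the argument is optional per the signature above.
nodes = sdk.stream_and_get(
    sdk.slurm_node_info(slurm_cluster_name='my-slurm-cluster'))
for node in nodes:
    # Fields per the docstring: node_name, partition, node_state, gpu_type,
    # total_gpus, free_gpus, vcpu_count, memory_gb.
    print(node['node_name'], node['node_state'], node.get('free_gpus', 0))
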
sky/clouds/__init__.py CHANGED
@@ -31,6 +31,7 @@ from sky.clouds.runpod import RunPod
  from sky.clouds.scp import SCP
  from sky.clouds.seeweb import Seeweb
  from sky.clouds.shadeform import Shadeform
+ from sky.clouds.slurm import Slurm
  from sky.clouds.ssh import SSH
  from sky.clouds.vast import Vast
  from sky.clouds.vsphere import Vsphere
@@ -48,6 +49,7 @@ __all__ = [
  'Paperspace',
  'PrimeIntellect',
  'SCP',
+ 'Slurm',
  'RunPod',
  'Shadeform',
  'Vast',
sky/clouds/cloud.py CHANGED
@@ -182,6 +182,13 @@ class Cloud:
  """
  return cls._SUPPORTS_SERVICE_ACCOUNT_ON_REMOTE

+ @classmethod
+ def uses_ray(cls) -> bool:
+ """Returns whether this cloud uses Ray as the distributed
+ execution framework.
+ """
+ return True
+
  #### Regions/Zones ####

  @classmethod
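
The new `uses_ray()` hook defaults to True, so existing clouds keep the Ray-based execution path. Presumably a non-Ray backend such as the new Slurm cloud overrides it; a minimal sketch of such an override (an assumption, not copied from the release):

class Slurm(Cloud):

    @classmethod
    def uses_ray(cls) -> bool:
        # Assumption: Slurm tasks are launched via sbatch/srun rather than
        # a Ray cluster, so Ray-specific code paths can be skipped.
        return False
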