skypilot-nightly 1.0.0.dev20251203__py3-none-any.whl → 1.0.0.dev20260112__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (245)
  1. sky/__init__.py +6 -2
  2. sky/adaptors/aws.py +1 -61
  3. sky/adaptors/slurm.py +565 -0
  4. sky/backends/backend_utils.py +95 -12
  5. sky/backends/cloud_vm_ray_backend.py +224 -65
  6. sky/backends/task_codegen.py +380 -4
  7. sky/catalog/__init__.py +0 -3
  8. sky/catalog/data_fetchers/fetch_gcp.py +9 -1
  9. sky/catalog/data_fetchers/fetch_nebius.py +1 -1
  10. sky/catalog/data_fetchers/fetch_vast.py +4 -2
  11. sky/catalog/kubernetes_catalog.py +12 -4
  12. sky/catalog/seeweb_catalog.py +30 -15
  13. sky/catalog/shadeform_catalog.py +5 -2
  14. sky/catalog/slurm_catalog.py +236 -0
  15. sky/catalog/vast_catalog.py +30 -6
  16. sky/check.py +25 -11
  17. sky/client/cli/command.py +391 -32
  18. sky/client/interactive_utils.py +190 -0
  19. sky/client/sdk.py +64 -2
  20. sky/client/sdk_async.py +9 -0
  21. sky/clouds/__init__.py +2 -0
  22. sky/clouds/aws.py +60 -2
  23. sky/clouds/azure.py +2 -0
  24. sky/clouds/cloud.py +7 -0
  25. sky/clouds/kubernetes.py +2 -0
  26. sky/clouds/runpod.py +38 -7
  27. sky/clouds/slurm.py +610 -0
  28. sky/clouds/ssh.py +3 -2
  29. sky/clouds/vast.py +39 -16
  30. sky/core.py +197 -37
  31. sky/dashboard/out/404.html +1 -1
  32. sky/dashboard/out/_next/static/3nu-b8raeKRNABZ2d4GAG/_buildManifest.js +1 -0
  33. sky/dashboard/out/_next/static/chunks/1871-0565f8975a7dcd10.js +6 -0
  34. sky/dashboard/out/_next/static/chunks/2109-55a1546d793574a7.js +11 -0
  35. sky/dashboard/out/_next/static/chunks/2521-099b07cd9e4745bf.js +26 -0
  36. sky/dashboard/out/_next/static/chunks/2755.a636e04a928a700e.js +31 -0
  37. sky/dashboard/out/_next/static/chunks/3495.05eab4862217c1a5.js +6 -0
  38. sky/dashboard/out/_next/static/chunks/3785.cfc5dcc9434fd98c.js +1 -0
  39. sky/dashboard/out/_next/static/chunks/3850-fd5696f3bbbaddae.js +1 -0
  40. sky/dashboard/out/_next/static/chunks/3981.645d01bf9c8cad0c.js +21 -0
  41. sky/dashboard/out/_next/static/chunks/4083-0115d67c1fb57d6c.js +21 -0
  42. sky/dashboard/out/_next/static/chunks/{8640.5b9475a2d18c5416.js → 429.a58e9ba9742309ed.js} +2 -2
  43. sky/dashboard/out/_next/static/chunks/4555.8e221537181b5dc1.js +6 -0
  44. sky/dashboard/out/_next/static/chunks/4725.937865b81fdaaebb.js +6 -0
  45. sky/dashboard/out/_next/static/chunks/6082-edabd8f6092300ce.js +25 -0
  46. sky/dashboard/out/_next/static/chunks/6989-49cb7dca83a7a62d.js +1 -0
  47. sky/dashboard/out/_next/static/chunks/6990-630bd2a2257275f8.js +1 -0
  48. sky/dashboard/out/_next/static/chunks/7248-a99800d4db8edabd.js +1 -0
  49. sky/dashboard/out/_next/static/chunks/754-cfc5d4ad1b843d29.js +18 -0
  50. sky/dashboard/out/_next/static/chunks/8050-dd8aa107b17dce00.js +16 -0
  51. sky/dashboard/out/_next/static/chunks/8056-d4ae1e0cb81e7368.js +1 -0
  52. sky/dashboard/out/_next/static/chunks/8555.011023e296c127b3.js +6 -0
  53. sky/dashboard/out/_next/static/chunks/8821-93c25df904a8362b.js +1 -0
  54. sky/dashboard/out/_next/static/chunks/8969-0662594b69432ade.js +1 -0
  55. sky/dashboard/out/_next/static/chunks/9025.f15c91c97d124a5f.js +6 -0
  56. sky/dashboard/out/_next/static/chunks/9353-7ad6bd01858556f1.js +1 -0
  57. sky/dashboard/out/_next/static/chunks/pages/_app-5a86569acad99764.js +34 -0
  58. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-8297476714acb4ac.js +6 -0
  59. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-337c3ba1085f1210.js +1 -0
  60. sky/dashboard/out/_next/static/chunks/pages/{clusters-ee39056f9851a3ff.js → clusters-57632ff3684a8b5c.js} +1 -1
  61. sky/dashboard/out/_next/static/chunks/pages/{config-dfb9bf07b13045f4.js → config-718cdc365de82689.js} +1 -1
  62. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-5fd3a453c079c2ea.js +1 -0
  63. sky/dashboard/out/_next/static/chunks/pages/infra-9f85c02c9c6cae9e.js +1 -0
  64. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-90f16972cbecf354.js +1 -0
  65. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-2dd42fc37aad427a.js +16 -0
  66. sky/dashboard/out/_next/static/chunks/pages/jobs-ed806aeace26b972.js +1 -0
  67. sky/dashboard/out/_next/static/chunks/pages/plugins/[...slug]-449a9f5a3bb20fb3.js +1 -0
  68. sky/dashboard/out/_next/static/chunks/pages/users-bec34706b36f3524.js +1 -0
  69. sky/dashboard/out/_next/static/chunks/pages/{volumes-b84b948ff357c43e.js → volumes-a83ba9b38dff7ea9.js} +1 -1
  70. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-84a40f8c7c627fe4.js → [name]-c781e9c3e52ef9fc.js} +1 -1
  71. sky/dashboard/out/_next/static/chunks/pages/workspaces-91e0942f47310aae.js +1 -0
  72. sky/dashboard/out/_next/static/chunks/webpack-cfe59cf684ee13b9.js +1 -0
  73. sky/dashboard/out/_next/static/css/b0dbca28f027cc19.css +3 -0
  74. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  75. sky/dashboard/out/clusters/[cluster].html +1 -1
  76. sky/dashboard/out/clusters.html +1 -1
  77. sky/dashboard/out/config.html +1 -1
  78. sky/dashboard/out/index.html +1 -1
  79. sky/dashboard/out/infra/[context].html +1 -1
  80. sky/dashboard/out/infra.html +1 -1
  81. sky/dashboard/out/jobs/[job].html +1 -1
  82. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  83. sky/dashboard/out/jobs.html +1 -1
  84. sky/dashboard/out/plugins/[...slug].html +1 -0
  85. sky/dashboard/out/users.html +1 -1
  86. sky/dashboard/out/volumes.html +1 -1
  87. sky/dashboard/out/workspace/new.html +1 -1
  88. sky/dashboard/out/workspaces/[name].html +1 -1
  89. sky/dashboard/out/workspaces.html +1 -1
  90. sky/data/data_utils.py +26 -12
  91. sky/data/mounting_utils.py +44 -5
  92. sky/global_user_state.py +111 -19
  93. sky/jobs/client/sdk.py +8 -3
  94. sky/jobs/controller.py +191 -31
  95. sky/jobs/recovery_strategy.py +109 -11
  96. sky/jobs/server/core.py +81 -4
  97. sky/jobs/server/server.py +14 -0
  98. sky/jobs/state.py +417 -19
  99. sky/jobs/utils.py +73 -80
  100. sky/models.py +11 -0
  101. sky/optimizer.py +8 -6
  102. sky/provision/__init__.py +12 -9
  103. sky/provision/common.py +20 -0
  104. sky/provision/docker_utils.py +15 -2
  105. sky/provision/kubernetes/utils.py +163 -20
  106. sky/provision/kubernetes/volume.py +52 -17
  107. sky/provision/provisioner.py +17 -7
  108. sky/provision/runpod/instance.py +3 -1
  109. sky/provision/runpod/utils.py +13 -1
  110. sky/provision/runpod/volume.py +25 -9
  111. sky/provision/slurm/__init__.py +12 -0
  112. sky/provision/slurm/config.py +13 -0
  113. sky/provision/slurm/instance.py +618 -0
  114. sky/provision/slurm/utils.py +689 -0
  115. sky/provision/vast/instance.py +4 -1
  116. sky/provision/vast/utils.py +11 -6
  117. sky/resources.py +135 -13
  118. sky/schemas/api/responses.py +4 -0
  119. sky/schemas/db/global_user_state/010_save_ssh_key.py +1 -1
  120. sky/schemas/db/spot_jobs/008_add_full_resources.py +34 -0
  121. sky/schemas/db/spot_jobs/009_job_events.py +32 -0
  122. sky/schemas/db/spot_jobs/010_job_events_timestamp_with_timezone.py +43 -0
  123. sky/schemas/db/spot_jobs/011_add_links.py +34 -0
  124. sky/schemas/generated/jobsv1_pb2.py +9 -5
  125. sky/schemas/generated/jobsv1_pb2.pyi +12 -0
  126. sky/schemas/generated/jobsv1_pb2_grpc.py +44 -0
  127. sky/schemas/generated/managed_jobsv1_pb2.py +32 -28
  128. sky/schemas/generated/managed_jobsv1_pb2.pyi +11 -2
  129. sky/serve/serve_utils.py +232 -40
  130. sky/serve/server/impl.py +1 -1
  131. sky/server/common.py +17 -0
  132. sky/server/constants.py +1 -1
  133. sky/server/metrics.py +6 -3
  134. sky/server/plugins.py +238 -0
  135. sky/server/requests/executor.py +5 -2
  136. sky/server/requests/payloads.py +30 -1
  137. sky/server/requests/request_names.py +4 -0
  138. sky/server/requests/requests.py +33 -11
  139. sky/server/requests/serializers/encoders.py +22 -0
  140. sky/server/requests/serializers/return_value_serializers.py +70 -0
  141. sky/server/server.py +506 -109
  142. sky/server/server_utils.py +30 -0
  143. sky/server/uvicorn.py +5 -0
  144. sky/setup_files/MANIFEST.in +1 -0
  145. sky/setup_files/dependencies.py +22 -9
  146. sky/sky_logging.py +2 -1
  147. sky/skylet/attempt_skylet.py +13 -3
  148. sky/skylet/constants.py +55 -13
  149. sky/skylet/events.py +10 -4
  150. sky/skylet/executor/__init__.py +1 -0
  151. sky/skylet/executor/slurm.py +187 -0
  152. sky/skylet/job_lib.py +91 -5
  153. sky/skylet/log_lib.py +22 -6
  154. sky/skylet/log_lib.pyi +8 -6
  155. sky/skylet/services.py +18 -3
  156. sky/skylet/skylet.py +5 -1
  157. sky/skylet/subprocess_daemon.py +2 -1
  158. sky/ssh_node_pools/constants.py +12 -0
  159. sky/ssh_node_pools/core.py +40 -3
  160. sky/ssh_node_pools/deploy/__init__.py +4 -0
  161. sky/{utils/kubernetes/deploy_ssh_node_pools.py → ssh_node_pools/deploy/deploy.py} +279 -504
  162. sky/ssh_node_pools/deploy/tunnel/ssh-tunnel.sh +379 -0
  163. sky/ssh_node_pools/deploy/tunnel_utils.py +199 -0
  164. sky/ssh_node_pools/deploy/utils.py +173 -0
  165. sky/ssh_node_pools/server.py +11 -13
  166. sky/{utils/kubernetes/ssh_utils.py → ssh_node_pools/utils.py} +9 -6
  167. sky/templates/kubernetes-ray.yml.j2 +12 -6
  168. sky/templates/slurm-ray.yml.j2 +115 -0
  169. sky/templates/vast-ray.yml.j2 +1 -0
  170. sky/templates/websocket_proxy.py +18 -41
  171. sky/users/model.conf +1 -1
  172. sky/users/permission.py +85 -52
  173. sky/users/rbac.py +31 -3
  174. sky/utils/annotations.py +108 -8
  175. sky/utils/auth_utils.py +42 -0
  176. sky/utils/cli_utils/status_utils.py +19 -5
  177. sky/utils/cluster_utils.py +10 -3
  178. sky/utils/command_runner.py +389 -35
  179. sky/utils/command_runner.pyi +43 -4
  180. sky/utils/common_utils.py +47 -31
  181. sky/utils/context.py +32 -0
  182. sky/utils/db/db_utils.py +36 -6
  183. sky/utils/db/migration_utils.py +41 -21
  184. sky/utils/infra_utils.py +5 -1
  185. sky/utils/instance_links.py +139 -0
  186. sky/utils/interactive_utils.py +49 -0
  187. sky/utils/kubernetes/generate_kubeconfig.sh +42 -33
  188. sky/utils/kubernetes/kubernetes_deploy_utils.py +2 -94
  189. sky/utils/kubernetes/rsync_helper.sh +5 -1
  190. sky/utils/kubernetes/ssh-tunnel.sh +7 -376
  191. sky/utils/plugin_extensions/__init__.py +14 -0
  192. sky/utils/plugin_extensions/external_failure_source.py +176 -0
  193. sky/utils/resources_utils.py +10 -8
  194. sky/utils/rich_utils.py +9 -11
  195. sky/utils/schemas.py +93 -19
  196. sky/utils/status_lib.py +7 -0
  197. sky/utils/subprocess_utils.py +17 -0
  198. sky/volumes/client/sdk.py +6 -3
  199. sky/volumes/server/core.py +65 -27
  200. sky_templates/ray/start_cluster +8 -4
  201. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/METADATA +67 -59
  202. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/RECORD +208 -180
  203. sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +0 -1
  204. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +0 -11
  205. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +0 -6
  206. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +0 -1
  207. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +0 -1
  208. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +0 -15
  209. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +0 -26
  210. sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +0 -1
  211. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +0 -1
  212. sky/dashboard/out/_next/static/chunks/3800-7b45f9fbb6308557.js +0 -1
  213. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +0 -1
  214. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +0 -1
  215. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +0 -15
  216. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +0 -13
  217. sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +0 -1
  218. sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +0 -1
  219. sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +0 -1
  220. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +0 -30
  221. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +0 -41
  222. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +0 -1
  223. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +0 -1
  224. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +0 -6
  225. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +0 -1
  226. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +0 -31
  227. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +0 -30
  228. sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +0 -34
  229. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +0 -16
  230. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +0 -1
  231. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +0 -1
  232. sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +0 -1
  233. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +0 -16
  234. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +0 -21
  235. sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +0 -1
  236. sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +0 -1
  237. sky/dashboard/out/_next/static/chunks/pages/workspaces-531b2f8c4bf89f82.js +0 -1
  238. sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +0 -1
  239. sky/dashboard/out/_next/static/css/0748ce22df867032.css +0 -3
  240. /sky/dashboard/out/_next/static/{96_E2yl3QAiIJGOYCkSpB → 3nu-b8raeKRNABZ2d4GAG}/_ssgManifest.js +0 -0
  241. /sky/{utils/kubernetes → ssh_node_pools/deploy/tunnel}/cleanup-tunnel.sh +0 -0
  242. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/WHEEL +0 -0
  243. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/entry_points.txt +0 -0
  244. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/licenses/LICENSE +0 -0
  245. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/top_level.txt +0 -0
--- /dev/null
+++ b/sky/provision/slurm/utils.py
@@ -0,0 +1,689 @@
+"""Slurm utilities for SkyPilot."""
+import json
+import math
+import os
+import re
+import shlex
+import time
+from typing import Any, Dict, List, Optional, Tuple, Union
+
+from paramiko.config import SSHConfig
+
+from sky import exceptions
+from sky import sky_logging
+from sky.adaptors import slurm
+from sky.skylet import constants
+from sky.utils import annotations
+from sky.utils import common_utils
+from sky.utils.db import kv_cache
+
+logger = sky_logging.init_logger(__name__)
+
+DEFAULT_SLURM_PATH = '~/.slurm/config'
+SLURM_MARKER_FILE = '.sky_slurm_cluster'
+
+# Regex pattern for parsing GPU GRES strings.
+# Format: 'gpu[:acc_type]:acc_count(optional_extra_info)'
+# Examples: 'gpu:8', 'gpu:H100:8', 'gpu:nvidia_h100_80gb_hbm3:8(S:0-1)'
+_GRES_GPU_PATTERN = re.compile(r'\bgpu:(?:(?P<type>[^:(]+):)?(?P<count>\d+)',
+                               re.IGNORECASE)
+
+_SLURM_NODES_INFO_CACHE_TTL = 30 * 60
+
+
+def get_gpu_type_and_count(gres_str: str) -> Tuple[Optional[str], int]:
+    """Parses GPU type and count from a GRES string.
+
+    Returns:
+        A tuple of (GPU type, GPU count). If no GPU is found, returns (None, 0).
+    """
+    match = _GRES_GPU_PATTERN.search(gres_str)
+    if not match:
+        return None, 0
+    return match.group('type'), int(match.group('count'))
+
+
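Aside (not part of the diff): a minimal sketch of how the GRES parser above handles the example strings listed in its own comment; the module path is taken from the file list, everything else is illustrative.

```python
# Illustrative only; exercises get_gpu_type_and_count() from the new module.
from sky.provision.slurm import utils as slurm_utils

assert slurm_utils.get_gpu_type_and_count('gpu:8') == (None, 8)
assert slurm_utils.get_gpu_type_and_count('gpu:H100:8') == ('H100', 8)
assert slurm_utils.get_gpu_type_and_count(
    'gpu:nvidia_h100_80gb_hbm3:8(S:0-1)') == ('nvidia_h100_80gb_hbm3', 8)
# GRES strings without a 'gpu:' entry report no GPUs.
assert slurm_utils.get_gpu_type_and_count('') == (None, 0)
```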
+# SSH host key filename for sshd.
+SLURM_SSHD_HOST_KEY_FILENAME = 'skypilot_host_key'
+
+
+def get_slurm_ssh_config() -> SSHConfig:
+    """Get the Slurm SSH config."""
+    slurm_config_path = os.path.expanduser(DEFAULT_SLURM_PATH)
+    slurm_config = SSHConfig.from_path(slurm_config_path)
+    return slurm_config
+
+
+@annotations.lru_cache(scope='request')
+def _get_slurm_nodes_info(cluster: str) -> List[slurm.NodeInfo]:
+    cache_key = f'slurm:nodes_info:{cluster}'
+    cached = kv_cache.get_cache_entry(cache_key)
+    if cached is not None:
+        logger.debug(f'Slurm nodes info found in cache ({cache_key})')
+        return [slurm.NodeInfo(**item) for item in json.loads(cached)]
+
+    ssh_config = get_slurm_ssh_config()
+    ssh_config_dict = ssh_config.lookup(cluster)
+    client = slurm.SlurmClient(
+        ssh_config_dict['hostname'],
+        int(ssh_config_dict.get('port', 22)),
+        ssh_config_dict['user'],
+        ssh_config_dict['identityfile'][0],
+        ssh_proxy_command=ssh_config_dict.get('proxycommand', None),
+        ssh_proxy_jump=ssh_config_dict.get('proxyjump', None),
+    )
+    nodes_info = client.info_nodes()
+
+    try:
+        # Nodes in a cluster are unlikely to change frequently, so cache
+        # the result for a short period of time.
+        kv_cache.add_or_update_cache_entry(
+            cache_key, json.dumps([n._asdict() for n in nodes_info]),
+            time.time() + _SLURM_NODES_INFO_CACHE_TTL)
+    except Exception as e:  # pylint: disable=broad-except
+        # Catch the error and continue.
+        # Failure to cache the result is not critical to the
+        # success of this function.
+        logger.debug(f'Failed to cache slurm nodes info for {cluster}: '
+                     f'{common_utils.format_exception(e)}')
+
+    return nodes_info
+
+
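Aside (not part of the diff): `_get_slurm_nodes_info` resolves each cluster through an OpenSSH-style config at `~/.slurm/config` and reads the `hostname`, `port`, `user`, `identityfile`, and optional `proxycommand`/`proxyjump` fields via paramiko. A hypothetical config entry and lookup, with made-up host values, might look like:

```python
# Hypothetical ~/.slurm/config entry (OpenSSH client-config syntax):
#
#   Host my-slurm-cluster
#       HostName login.example.com
#       User alice
#       IdentityFile ~/.ssh/id_rsa
#       ProxyJump bastion.example.com
#
# paramiko lowercases the keys; 'identityfile' comes back as a list.
from paramiko.config import SSHConfig

cfg = SSHConfig.from_path('/home/alice/.slurm/config')  # hypothetical path
entry = cfg.lookup('my-slurm-cluster')
print(entry['hostname'], entry.get('port', 22), entry['user'],
      entry['identityfile'][0])
```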
+class SlurmInstanceType:
+    """Class to represent the "Instance Type" in a Slurm cluster.
+
+    Since Slurm does not have a notion of instances, we generate
+    virtual instance types that represent the resources requested by a
+    Slurm worker node.
+
+    This name captures the following resource requests:
+    - CPU
+    - Memory
+    - Accelerators
+
+    The name format is "{n}CPU--{k}GB" where n is the number of vCPUs and
+    k is the amount of memory in GB. Accelerators can be specified by
+    appending "--{type}:{a}" where type is the accelerator type and a
+    is the number of accelerators.
+    CPU and memory can be specified as floats. Accelerator count must be int.
+
+    Examples:
+    - 4CPU--16GB
+    - 0.5CPU--1.5GB
+    - 4CPU--16GB--V100:1
+    """
+
+    def __init__(self,
+                 cpus: float,
+                 memory: float,
+                 accelerator_count: Optional[int] = None,
+                 accelerator_type: Optional[str] = None):
+        self.cpus = cpus
+        self.memory = memory
+        self.accelerator_count = accelerator_count
+        self.accelerator_type = accelerator_type
+
+    @property
+    def name(self) -> str:
+        """Returns the name of the instance."""
+        assert self.cpus is not None
+        assert self.memory is not None
+        name = (f'{common_utils.format_float(self.cpus)}CPU--'
+                f'{common_utils.format_float(self.memory)}GB')
+        if self.accelerator_count is not None:
+            # Replace spaces with underscores in accelerator type to make it a
+            # valid logical instance type name.
+            assert self.accelerator_type is not None, self.accelerator_count
+            acc_name = self.accelerator_type.replace(' ', '_')
+            name += f'--{acc_name}:{self.accelerator_count}'
+        return name
+
+    @staticmethod
+    def is_valid_instance_type(name: str) -> bool:
+        """Returns whether the given name is a valid instance type."""
+        pattern = re.compile(
+            r'^(\d+(\.\d+)?CPU--\d+(\.\d+)?GB)(--[\w\d-]+:\d+)?$')
+        return bool(pattern.match(name))
+
+    @classmethod
+    def _parse_instance_type(
+            cls,
+            name: str) -> Tuple[float, float, Optional[int], Optional[str]]:
+        """Parses and returns resources from the given InstanceType name.
+
+        Returns:
+            cpus | float: Number of CPUs
+            memory | float: Amount of memory in GB
+            accelerator_count | float: Number of accelerators
+            accelerator_type | str: Type of accelerator
+        """
+        pattern = re.compile(
+            r'^(?P<cpus>\d+(\.\d+)?)CPU--(?P<memory>\d+(\.\d+)?)GB(?:--(?P<accelerator_type>[\w\d-]+):(?P<accelerator_count>\d+))?$'  # pylint: disable=line-too-long
+        )
+        match = pattern.match(name)
+        if match is not None:
+            cpus = float(match.group('cpus'))
+            memory = float(match.group('memory'))
+            accelerator_count = match.group('accelerator_count')
+            accelerator_type = match.group('accelerator_type')
+            if accelerator_count is not None:
+                accelerator_count = int(accelerator_count)
+                # This is to revert the accelerator types with spaces back to
+                # the original format.
+                accelerator_type = str(accelerator_type).replace(' ', '_')
+            else:
+                accelerator_count = None
+                accelerator_type = None
+            return cpus, memory, accelerator_count, accelerator_type
+        else:
+            raise ValueError(f'Invalid instance name: {name}')
+
+    @classmethod
+    def from_instance_type(cls, name: str) -> 'SlurmInstanceType':
+        """Returns an instance name object from the given name."""
+        if not cls.is_valid_instance_type(name):
+            raise ValueError(f'Invalid instance name: {name}')
+        cpus, memory, accelerator_count, accelerator_type = \
+            cls._parse_instance_type(name)
+        return cls(cpus=cpus,
+                   memory=memory,
+                   accelerator_count=accelerator_count,
+                   accelerator_type=accelerator_type)
+
+    @classmethod
+    def from_resources(cls,
+                       cpus: float,
+                       memory: float,
+                       accelerator_count: Union[float, int] = 0,
+                       accelerator_type: str = '') -> 'SlurmInstanceType':
+        """Returns an instance name object from the given resources.
+
+        If accelerator_count is not an int, it will be rounded up since GPU
+        requests in Slurm must be int.
+
+        NOTE: Should we take MIG management into account? See
+        https://slurm.schedmd.com/gres.html#MIG_Management.
+        """
+        name = f'{cpus}CPU--{memory}GB'
+        # Round up accelerator_count if it is not an int.
+        accelerator_count = math.ceil(accelerator_count)
+        if accelerator_count > 0:
+            name += f'--{accelerator_type}:{accelerator_count}'
+        return cls(cpus=cpus,
+                   memory=memory,
+                   accelerator_count=accelerator_count,
+                   accelerator_type=accelerator_type)
+
+    def __str__(self):
+        return self.name
+
+    def __repr__(self):
+        return (f'SlurmInstanceType(cpus={self.cpus!r}, '
+                f'memory={self.memory!r}, '
+                f'accelerator_count={self.accelerator_count!r}, '
+                f'accelerator_type={self.accelerator_type!r})')
+
+
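Aside (not part of the diff): a short sketch of the virtual instance-type naming round trip implemented by the class above; the resource values are arbitrary.

```python
from sky.provision.slurm.utils import SlurmInstanceType

# Build a name from resources; fractional accelerator counts are rounded up.
it = SlurmInstanceType.from_resources(cpus=4,
                                      memory=16,
                                      accelerator_count=1,
                                      accelerator_type='V100')
print(it.name)  # expected: 4CPU--16GB--V100:1

# Parse a name back into resources.
parsed = SlurmInstanceType.from_instance_type('4CPU--16GB--V100:1')
print(parsed.cpus, parsed.memory, parsed.accelerator_type,
      parsed.accelerator_count)  # 4.0 16.0 V100 1

assert SlurmInstanceType.is_valid_instance_type('0.5CPU--1.5GB')
assert not SlurmInstanceType.is_valid_instance_type('V100:1')
```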
+def instance_id(job_id: str, node: str) -> str:
+    """Generates the SkyPilot-defined instance ID for Slurm.
+
+    A (job id, node) pair is unique within a Slurm cluster.
+    """
+    return f'job{job_id}-{node}'
+
+
+def get_partition_from_config(provider_config: Dict[str, Any]) -> str:
+    """Return the partition from the provider config.
+
+    The concept of partition can be mapped to a cloud zone.
+    """
+    partition = provider_config.get('partition')
+    if partition is None:
+        raise ValueError('Partition not specified in provider config.')
+    return partition
+
+
+@annotations.lru_cache(scope='request')
+def get_cluster_default_partition(cluster_name: str) -> Optional[str]:
+    """Get the default partition for a Slurm cluster.
+
+    Queries the Slurm cluster for the partition marked with an asterisk (*)
+    in sinfo output. If no default partition is marked, returns None.
+    """
+    try:
+        ssh_config = get_slurm_ssh_config()
+        ssh_config_dict = ssh_config.lookup(cluster_name)
+    except Exception as e:
+        raise ValueError(
+            f'Failed to load SSH configuration from {DEFAULT_SLURM_PATH}: '
+            f'{common_utils.format_exception(e)}') from e
+
+    client = slurm.SlurmClient(
+        ssh_config_dict['hostname'],
+        int(ssh_config_dict.get('port', 22)),
+        ssh_config_dict['user'],
+        ssh_config_dict['identityfile'][0],
+        ssh_proxy_command=ssh_config_dict.get('proxycommand', None),
+        ssh_proxy_jump=ssh_config_dict.get('proxyjump', None),
+    )
+
+    return client.get_default_partition()
+
+
+def get_all_slurm_cluster_names() -> List[str]:
+    """Get all Slurm cluster names available in the environment.
+
+    Returns:
+        List[str]: The list of Slurm cluster names if available,
+        an empty list otherwise.
+    """
+    try:
+        ssh_config = get_slurm_ssh_config()
+    except FileNotFoundError:
+        return []
+    except Exception as e:
+        raise ValueError(
+            f'Failed to load SSH configuration from {DEFAULT_SLURM_PATH}: '
+            f'{common_utils.format_exception(e)}') from e
+
+    cluster_names = []
+    for cluster in ssh_config.get_hostnames():
+        if cluster == '*':
+            continue
+
+        cluster_names.append(cluster)
+
+    return cluster_names
+
+
+def _check_cpu_mem_fits(
+        candidate_instance_type: SlurmInstanceType,
+        node_list: List[slurm.NodeInfo]) -> Tuple[bool, Optional[str]]:
+    """Checks if instance fits on candidate nodes based on CPU and memory.
+
+    We check capacity (not allocatable) because availability can change
+    during scheduling, and we want to let the Slurm scheduler handle that.
+    """
+    # We log max CPU and memory found on the GPU nodes for debugging.
+    max_cpu = 0
+    max_mem_gb = 0.0
+
+    for node_info in node_list:
+        node_cpus = node_info.cpus
+        node_mem_gb = node_info.memory_gb
+
+        if node_cpus > max_cpu:
+            max_cpu = node_cpus
+            max_mem_gb = node_mem_gb
+
+        if (node_cpus >= candidate_instance_type.cpus and
+                node_mem_gb >= candidate_instance_type.memory):
+            return True, None
+
+    return False, (f'Max found: {max_cpu} CPUs, '
+                   f'{common_utils.format_float(max_mem_gb)}G memory')
+
+
+def check_instance_fits(
+        cluster: str,
+        instance_type: str,
+        partition: Optional[str] = None) -> Tuple[bool, Optional[str]]:
+    """Check if the given instance type fits in the given cluster/partition.
+
+    Args:
+        cluster: Name of the Slurm cluster.
+        instance_type: The instance type to check.
+        partition: Optional partition name. If None, checks all partitions.
+
+    Returns:
+        Tuple of (fits, reason) where fits is True if available.
+    """
+    # Get Slurm node list in the given cluster (region).
+    try:
+        nodes = _get_slurm_nodes_info(cluster)
+    except FileNotFoundError:
+        return (False, f'Could not query Slurm cluster {cluster} '
+                f'because the Slurm configuration file '
+                f'{DEFAULT_SLURM_PATH} does not exist.')
+    except Exception as e:  # pylint: disable=broad-except
+        return (False, f'Could not query Slurm cluster {cluster} '
+                f'because Slurm SSH configuration at {DEFAULT_SLURM_PATH} '
+                f'could not be loaded: {common_utils.format_exception(e)}.')
+
+    default_partition = get_cluster_default_partition(cluster)
+
+    def is_default_partition(node_partition: str) -> bool:
+        if default_partition is None:
+            return False
+
+        # info_nodes does not strip the '*' from the default partition name.
+        # But non-default partition names can also end with '*',
+        # so we need to check whether the partition name without the '*'
+        # is the same as the default partition name.
+        return (node_partition.endswith('*') and
+                node_partition[:-1] == default_partition)
+
+    partition_suffix = ''
+    if partition is not None:
+        filtered = []
+        for node_info in nodes:
+            node_partition = node_info.partition
+            if is_default_partition(node_partition):
+                # Strip '*' from default partition name.
+                node_partition = node_partition[:-1]
+            if node_partition == partition:
+                filtered.append(node_info)
+        nodes = filtered
+        partition_suffix = f' in partition {partition}'
+
+    slurm_instance_type = SlurmInstanceType.from_instance_type(instance_type)
+    acc_count = (slurm_instance_type.accelerator_count
+                 if slurm_instance_type.accelerator_count is not None else 0)
+    acc_type = slurm_instance_type.accelerator_type
+    candidate_nodes = nodes
+    not_fit_reason_prefix = (
+        f'No nodes found with enough '
+        f'CPU (> {slurm_instance_type.cpus} CPUs) and/or '
+        f'memory (> {slurm_instance_type.memory} G){partition_suffix}. ')
+    if acc_type is not None:
+        assert acc_count is not None, (acc_type, acc_count)
+
+        gpu_nodes = []
+        for node_info in nodes:
+            # Extract the GPU type and count from the GRES string
+            node_acc_type, node_acc_count = get_gpu_type_and_count(
+                node_info.gres)
+            if node_acc_type is None:
+                continue
+
+            # TODO(jwj): Handle status check.
+
+            # Check if the node has the requested GPU type and at least the
+            # requested count
+            if (node_acc_type.lower() == acc_type.lower() and
+                    node_acc_count >= acc_count):
+                gpu_nodes.append(node_info)
+        if len(gpu_nodes) == 0:
+            return (False,
+                    f'No GPU nodes found with at least {acc_type}:{acc_count} '
+                    f'on the cluster.')
+
+        candidate_nodes = gpu_nodes
+        not_fit_reason_prefix = (
+            f'GPU nodes with {acc_type}{partition_suffix} do not have '
+            f'enough CPU (> {slurm_instance_type.cpus} CPUs) and/or '
+            f'memory (> {slurm_instance_type.memory} G). ')
+
+    # Check if CPU and memory requirements are met on at least one
+    # candidate node.
+    fits, reason = _check_cpu_mem_fits(slurm_instance_type, candidate_nodes)
+    if not fits and reason is not None:
+        reason = not_fit_reason_prefix + reason
+    return fits, reason
+
+
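Aside (not part of the diff): how the fit check above is typically consumed; the cluster and partition names here are hypothetical.

```python
from sky.provision.slurm import utils as slurm_utils

# Returns (fits, reason); reason is None when some node can hold the request.
fits, reason = slurm_utils.check_instance_fits(
    cluster='my-slurm-cluster',  # hypothetical entry in ~/.slurm/config
    instance_type='8CPU--32GB--H100:1',
    partition='gpu')  # optional; None checks nodes in all partitions
if not fits:
    print(f'Request does not fit: {reason}')
```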
+# GRES names are highly unlikely to change within a cluster.
+# TODO(kevin): Cache using sky/utils/db/kv_cache.py too.
+@annotations.lru_cache(scope='global', maxsize=10)
+def get_gres_gpu_type(cluster: str, requested_gpu_type: str) -> str:
+    """Get the actual GPU type as it appears in the cluster's GRES.
+
+    Args:
+        cluster: Name of the Slurm cluster.
+        requested_gpu_type: The GPU type requested by the user.
+
+    Returns:
+        The actual GPU type as it appears in the cluster's GRES string.
+        Falls back to the requested type if not found.
+    """
+    try:
+        ssh_config = get_slurm_ssh_config()
+        ssh_config_dict = ssh_config.lookup(cluster)
+        client = slurm.SlurmClient(
+            ssh_config_dict['hostname'],
+            int(ssh_config_dict.get('port', 22)),
+            ssh_config_dict['user'],
+            ssh_config_dict['identityfile'][0],
+            ssh_proxy_command=ssh_config_dict.get('proxycommand', None),
+            ssh_proxy_jump=ssh_config_dict.get('proxyjump', None),
+        )
+
+        nodes = client.info_nodes()
+
+        for node_info in nodes:
+            node_gpu_type, _ = get_gpu_type_and_count(node_info.gres)
+            if node_gpu_type is None:
+                continue
+            if node_gpu_type.lower() == requested_gpu_type.lower():
+                return node_gpu_type
+    except Exception as e:  # pylint: disable=broad-except
+        logger.warning(
+            'Failed to determine the exact GPU GRES type from the Slurm '
+            f'cluster {cluster!r}. Falling back to '
+            f'{requested_gpu_type.lower()!r}. This may cause issues if the '
+            f'casing is incorrect. Error: {common_utils.format_exception(e)}')
+
+    # GRES names are more commonly in lowercase from what we've seen so far.
+    return requested_gpu_type.lower()
+
+
+def _get_slurm_node_info_list(
+        slurm_cluster_name: Optional[str] = None) -> List[Dict[str, Any]]:
+    """Gathers detailed information about each node in the Slurm cluster.
+
+    Raises:
+        FileNotFoundError: If the Slurm configuration file does not exist.
+        ValueError: If no Slurm cluster name is found in the Slurm
+            configuration file.
+    """
+    # 1. Get node state and GRES using sinfo
+
+    # can raise FileNotFoundError if config file does not exist.
+    slurm_config = get_slurm_ssh_config()
+    if slurm_cluster_name is None:
+        slurm_cluster_names = get_all_slurm_cluster_names()
+        if slurm_cluster_names:
+            slurm_cluster_name = slurm_cluster_names[0]
+    if slurm_cluster_name is None:
+        raise ValueError(
+            f'No Slurm cluster name found in the {DEFAULT_SLURM_PATH} '
+            f'configuration.')
+    slurm_config_dict = slurm_config.lookup(slurm_cluster_name)
+    logger.debug(f'Slurm config dict: {slurm_config_dict}')
+    slurm_client = slurm.SlurmClient(
+        slurm_config_dict['hostname'],
+        int(slurm_config_dict.get('port', 22)),
+        slurm_config_dict['user'],
+        slurm_config_dict['identityfile'][0],
+        ssh_proxy_command=slurm_config_dict.get('proxycommand', None),
+        ssh_proxy_jump=slurm_config_dict.get('proxyjump', None),
+    )
+    node_infos = slurm_client.info_nodes()
+
+    if not node_infos:
+        logger.warning(
+            f'`sinfo -N` returned no output on cluster {slurm_cluster_name}. '
+            f'No nodes found?')
+        return []
+
+    # 2. Process each node, aggregating partitions per node
+    slurm_nodes_info: Dict[str, Dict[str, Any]] = {}
+
+    nodes_to_jobs_gres = slurm_client.get_all_jobs_gres()
+    for node_info in node_infos:
+        node_name = node_info.node
+        state = node_info.state
+        gres_str = node_info.gres
+        partition = node_info.partition
+
+        if node_name in slurm_nodes_info:
+            slurm_nodes_info[node_name]['partitions'].append(partition)
+            continue
+
+        # Extract GPU info from GRES
+        node_gpu_type, total_gpus = get_gpu_type_and_count(gres_str)
+        if total_gpus > 0:
+            if node_gpu_type is not None:
+                node_gpu_type = node_gpu_type.upper()
+            else:
+                node_gpu_type = 'GPU'
+
+        # Get allocated GPUs
+        allocated_gpus = 0
+        # TODO(zhwu): move to enum
+        if state in ('alloc', 'mix', 'drain', 'drng', 'drained', 'resv',
+                     'comp'):
+            jobs_gres = nodes_to_jobs_gres.get(node_name, [])
+            if jobs_gres:
+                for job_line in jobs_gres:
+                    _, job_gpu_count = get_gpu_type_and_count(job_line)
+                    allocated_gpus += job_gpu_count
+            elif state == 'alloc':
+                # If no GRES info found but node is fully allocated,
+                # assume all GPUs are in use.
+                allocated_gpus = total_gpus
+        elif state == 'idle':
+            allocated_gpus = 0
+
+        free_gpus = total_gpus - allocated_gpus if state not in ('down',
+                                                                 'drain',
+                                                                 'drng',
+                                                                 'maint') else 0
+        free_gpus = max(0, free_gpus)
+
+        slurm_nodes_info[node_name] = {
+            'node_name': node_name,
+            'slurm_cluster_name': slurm_cluster_name,
+            'partitions': [partition],
+            'node_state': state,
+            'gpu_type': node_gpu_type,
+            'total_gpus': total_gpus,
+            'free_gpus': free_gpus,
+            'vcpu_count': node_info.cpus,
+            'memory_gb': round(node_info.memory_gb, 2),
+        }
+
+    for node_info in slurm_nodes_info.values():
+        partitions = node_info.pop('partitions')
+        node_info['partition'] = ','.join(str(p) for p in partitions)
+
+    return list(slurm_nodes_info.values())
+
+
+def slurm_node_info(
+        slurm_cluster_name: Optional[str] = None) -> List[Dict[str, Any]]:
+    """Gets detailed information for each node in the Slurm cluster.
+
+    Returns:
+        List[Dict[str, Any]]: A list of dictionaries, each containing node info.
+    """
+    try:
+        node_list = _get_slurm_node_info_list(
+            slurm_cluster_name=slurm_cluster_name)
+    except (RuntimeError, exceptions.NotSupportedError) as e:
+        logger.debug(f'Could not retrieve Slurm node info: {e}')
+        return []
+    return node_list
+
+
+def is_inside_slurm_cluster() -> bool:
+    # Check for the marker file in the current home directory. When run by
+    # the skylet on a compute node, the HOME environment variable is set to
+    # the cluster's sky home directory by the SlurmCommandRunner.
+    marker_file = os.path.join(os.path.expanduser('~'), SLURM_MARKER_FILE)
+    return os.path.exists(marker_file)
+
+
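Aside (not part of the diff): each dictionary returned by `slurm_node_info` above carries the keys shown below; the values are invented for illustration.

```python
# Shape of one slurm_node_info() entry for a hypothetical 8x H100 node.
example_node = {
    'node_name': 'node001',
    'slurm_cluster_name': 'my-slurm-cluster',
    'partition': 'gpu,debug',  # per-node partitions joined with ','
    'node_state': 'mix',
    'gpu_type': 'H100',  # upper-cased GRES type, or 'GPU' if untyped
    'total_gpus': 8,
    'free_gpus': 3,  # total minus GPUs allocated to running jobs
    'vcpu_count': 128,
    'memory_gb': 1024.0,
}
```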
+@annotations.lru_cache(scope='request')
+def get_partitions(cluster_name: str) -> List[str]:
+    """Get unique partition names available in a Slurm cluster.
+
+    Args:
+        cluster_name: Name of the Slurm cluster.
+
+    Returns:
+        List of unique partition names available in the cluster.
+        The default partition appears first,
+        and the rest are sorted alphabetically.
+    """
+    try:
+        slurm_config = SSHConfig.from_path(
+            os.path.expanduser(DEFAULT_SLURM_PATH))
+        slurm_config_dict = slurm_config.lookup(cluster_name)
+
+        client = slurm.SlurmClient(
+            slurm_config_dict['hostname'],
+            int(slurm_config_dict.get('port', 22)),
+            slurm_config_dict['user'],
+            slurm_config_dict['identityfile'][0],
+            ssh_proxy_command=slurm_config_dict.get('proxycommand', None),
+            ssh_proxy_jump=slurm_config_dict.get('proxyjump', None),
+        )
+
+        partitions_info = client.get_partitions_info()
+        default_partitions = []
+        other_partitions = []
+        for partition in partitions_info:
+            if partition.is_default:
+                default_partitions.append(partition.name)
+            else:
+                other_partitions.append(partition.name)
+        return default_partitions + sorted(other_partitions)
+    except Exception as e:  # pylint: disable=broad-except
+        raise ValueError(
+            f'Failed to get partitions for cluster '
+            f'{cluster_name}: {common_utils.format_exception(e)}') from e
+
+
+def srun_sshd_command(
+    job_id: str,
+    target_node: str,
+    unix_user: str,
+) -> str:
+    """Build srun command for launching sshd -i inside a Slurm job.
+
+    This is used by the API server to proxy SSH connections to Slurm jobs
+    via sshd running in inetd mode within srun.
+
+    Args:
+        job_id: The Slurm job ID
+        target_node: The target compute node hostname
+        unix_user: The Unix user for the job
+
+    Returns:
+        List of command arguments to be extended to ssh base command
+    """
+    # We use ~username to ensure we use the real home of the user ssh'ing in,
+    # because we override the home directory in SlurmCommandRunner.run.
+    user_home_ssh_dir = f'~{unix_user}/.ssh'
+    return shlex.join([
+        'srun',
+        '--quiet',
+        '--unbuffered',
+        '--overlap',
+        '--jobid',
+        job_id,
+        '-w',
+        target_node,
+        '/usr/sbin/sshd',
+        '-i',  # Uses stdin/stdout
+        '-e',  # Writes errors to stderr
+        '-f',  # Use /dev/null to avoid reading system sshd_config
+        '/dev/null',
+        '-h',
+        f'{user_home_ssh_dir}/{SLURM_SSHD_HOST_KEY_FILENAME}',
+        '-o',
+        f'AuthorizedKeysFile={user_home_ssh_dir}/authorized_keys',
+        '-o',
+        'PasswordAuthentication=no',
+        '-o',
+        'PubkeyAuthentication=yes',
+        # If UsePAM is enabled, we will not be able to run sshd(8)
+        # as a non-root user.
+        # See https://man7.org/linux/man-pages/man5/sshd_config.5.html
+        '-o',
+        'UsePAM=no',
+        '-o',
+        f'AcceptEnv={constants.SKY_CLUSTER_NAME_ENV_VAR_KEY}',
+    ])
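Aside (not part of the diff): with hypothetical arguments, the helper above renders a single shlex-joined string that the API server can splice into an SSH transport command to reach the job; roughly:

```python
from sky.provision.slurm.utils import srun_sshd_command

# Hypothetical job id, node name, and Unix user.
cmd = srun_sshd_command(job_id='12345', target_node='node001',
                        unix_user='alice')
print(cmd)
# Roughly: srun --quiet --unbuffered --overlap --jobid 12345 -w node001
#   /usr/sbin/sshd -i -e -f /dev/null -h '~alice/.ssh/skypilot_host_key'
#   -o 'AuthorizedKeysFile=~alice/.ssh/authorized_keys' ...
#   (actually one line, with quoting applied by shlex.join)
```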