skypilot-nightly 1.0.0.dev20251203__py3-none-any.whl → 1.0.0.dev20260112__py3-none-any.whl

This diff compares the contents of publicly available package versions as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (245)
  1. sky/__init__.py +6 -2
  2. sky/adaptors/aws.py +1 -61
  3. sky/adaptors/slurm.py +565 -0
  4. sky/backends/backend_utils.py +95 -12
  5. sky/backends/cloud_vm_ray_backend.py +224 -65
  6. sky/backends/task_codegen.py +380 -4
  7. sky/catalog/__init__.py +0 -3
  8. sky/catalog/data_fetchers/fetch_gcp.py +9 -1
  9. sky/catalog/data_fetchers/fetch_nebius.py +1 -1
  10. sky/catalog/data_fetchers/fetch_vast.py +4 -2
  11. sky/catalog/kubernetes_catalog.py +12 -4
  12. sky/catalog/seeweb_catalog.py +30 -15
  13. sky/catalog/shadeform_catalog.py +5 -2
  14. sky/catalog/slurm_catalog.py +236 -0
  15. sky/catalog/vast_catalog.py +30 -6
  16. sky/check.py +25 -11
  17. sky/client/cli/command.py +391 -32
  18. sky/client/interactive_utils.py +190 -0
  19. sky/client/sdk.py +64 -2
  20. sky/client/sdk_async.py +9 -0
  21. sky/clouds/__init__.py +2 -0
  22. sky/clouds/aws.py +60 -2
  23. sky/clouds/azure.py +2 -0
  24. sky/clouds/cloud.py +7 -0
  25. sky/clouds/kubernetes.py +2 -0
  26. sky/clouds/runpod.py +38 -7
  27. sky/clouds/slurm.py +610 -0
  28. sky/clouds/ssh.py +3 -2
  29. sky/clouds/vast.py +39 -16
  30. sky/core.py +197 -37
  31. sky/dashboard/out/404.html +1 -1
  32. sky/dashboard/out/_next/static/3nu-b8raeKRNABZ2d4GAG/_buildManifest.js +1 -0
  33. sky/dashboard/out/_next/static/chunks/1871-0565f8975a7dcd10.js +6 -0
  34. sky/dashboard/out/_next/static/chunks/2109-55a1546d793574a7.js +11 -0
  35. sky/dashboard/out/_next/static/chunks/2521-099b07cd9e4745bf.js +26 -0
  36. sky/dashboard/out/_next/static/chunks/2755.a636e04a928a700e.js +31 -0
  37. sky/dashboard/out/_next/static/chunks/3495.05eab4862217c1a5.js +6 -0
  38. sky/dashboard/out/_next/static/chunks/3785.cfc5dcc9434fd98c.js +1 -0
  39. sky/dashboard/out/_next/static/chunks/3850-fd5696f3bbbaddae.js +1 -0
  40. sky/dashboard/out/_next/static/chunks/3981.645d01bf9c8cad0c.js +21 -0
  41. sky/dashboard/out/_next/static/chunks/4083-0115d67c1fb57d6c.js +21 -0
  42. sky/dashboard/out/_next/static/chunks/{8640.5b9475a2d18c5416.js → 429.a58e9ba9742309ed.js} +2 -2
  43. sky/dashboard/out/_next/static/chunks/4555.8e221537181b5dc1.js +6 -0
  44. sky/dashboard/out/_next/static/chunks/4725.937865b81fdaaebb.js +6 -0
  45. sky/dashboard/out/_next/static/chunks/6082-edabd8f6092300ce.js +25 -0
  46. sky/dashboard/out/_next/static/chunks/6989-49cb7dca83a7a62d.js +1 -0
  47. sky/dashboard/out/_next/static/chunks/6990-630bd2a2257275f8.js +1 -0
  48. sky/dashboard/out/_next/static/chunks/7248-a99800d4db8edabd.js +1 -0
  49. sky/dashboard/out/_next/static/chunks/754-cfc5d4ad1b843d29.js +18 -0
  50. sky/dashboard/out/_next/static/chunks/8050-dd8aa107b17dce00.js +16 -0
  51. sky/dashboard/out/_next/static/chunks/8056-d4ae1e0cb81e7368.js +1 -0
  52. sky/dashboard/out/_next/static/chunks/8555.011023e296c127b3.js +6 -0
  53. sky/dashboard/out/_next/static/chunks/8821-93c25df904a8362b.js +1 -0
  54. sky/dashboard/out/_next/static/chunks/8969-0662594b69432ade.js +1 -0
  55. sky/dashboard/out/_next/static/chunks/9025.f15c91c97d124a5f.js +6 -0
  56. sky/dashboard/out/_next/static/chunks/9353-7ad6bd01858556f1.js +1 -0
  57. sky/dashboard/out/_next/static/chunks/pages/_app-5a86569acad99764.js +34 -0
  58. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-8297476714acb4ac.js +6 -0
  59. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-337c3ba1085f1210.js +1 -0
  60. sky/dashboard/out/_next/static/chunks/pages/{clusters-ee39056f9851a3ff.js → clusters-57632ff3684a8b5c.js} +1 -1
  61. sky/dashboard/out/_next/static/chunks/pages/{config-dfb9bf07b13045f4.js → config-718cdc365de82689.js} +1 -1
  62. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-5fd3a453c079c2ea.js +1 -0
  63. sky/dashboard/out/_next/static/chunks/pages/infra-9f85c02c9c6cae9e.js +1 -0
  64. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-90f16972cbecf354.js +1 -0
  65. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-2dd42fc37aad427a.js +16 -0
  66. sky/dashboard/out/_next/static/chunks/pages/jobs-ed806aeace26b972.js +1 -0
  67. sky/dashboard/out/_next/static/chunks/pages/plugins/[...slug]-449a9f5a3bb20fb3.js +1 -0
  68. sky/dashboard/out/_next/static/chunks/pages/users-bec34706b36f3524.js +1 -0
  69. sky/dashboard/out/_next/static/chunks/pages/{volumes-b84b948ff357c43e.js → volumes-a83ba9b38dff7ea9.js} +1 -1
  70. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-84a40f8c7c627fe4.js → [name]-c781e9c3e52ef9fc.js} +1 -1
  71. sky/dashboard/out/_next/static/chunks/pages/workspaces-91e0942f47310aae.js +1 -0
  72. sky/dashboard/out/_next/static/chunks/webpack-cfe59cf684ee13b9.js +1 -0
  73. sky/dashboard/out/_next/static/css/b0dbca28f027cc19.css +3 -0
  74. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  75. sky/dashboard/out/clusters/[cluster].html +1 -1
  76. sky/dashboard/out/clusters.html +1 -1
  77. sky/dashboard/out/config.html +1 -1
  78. sky/dashboard/out/index.html +1 -1
  79. sky/dashboard/out/infra/[context].html +1 -1
  80. sky/dashboard/out/infra.html +1 -1
  81. sky/dashboard/out/jobs/[job].html +1 -1
  82. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  83. sky/dashboard/out/jobs.html +1 -1
  84. sky/dashboard/out/plugins/[...slug].html +1 -0
  85. sky/dashboard/out/users.html +1 -1
  86. sky/dashboard/out/volumes.html +1 -1
  87. sky/dashboard/out/workspace/new.html +1 -1
  88. sky/dashboard/out/workspaces/[name].html +1 -1
  89. sky/dashboard/out/workspaces.html +1 -1
  90. sky/data/data_utils.py +26 -12
  91. sky/data/mounting_utils.py +44 -5
  92. sky/global_user_state.py +111 -19
  93. sky/jobs/client/sdk.py +8 -3
  94. sky/jobs/controller.py +191 -31
  95. sky/jobs/recovery_strategy.py +109 -11
  96. sky/jobs/server/core.py +81 -4
  97. sky/jobs/server/server.py +14 -0
  98. sky/jobs/state.py +417 -19
  99. sky/jobs/utils.py +73 -80
  100. sky/models.py +11 -0
  101. sky/optimizer.py +8 -6
  102. sky/provision/__init__.py +12 -9
  103. sky/provision/common.py +20 -0
  104. sky/provision/docker_utils.py +15 -2
  105. sky/provision/kubernetes/utils.py +163 -20
  106. sky/provision/kubernetes/volume.py +52 -17
  107. sky/provision/provisioner.py +17 -7
  108. sky/provision/runpod/instance.py +3 -1
  109. sky/provision/runpod/utils.py +13 -1
  110. sky/provision/runpod/volume.py +25 -9
  111. sky/provision/slurm/__init__.py +12 -0
  112. sky/provision/slurm/config.py +13 -0
  113. sky/provision/slurm/instance.py +618 -0
  114. sky/provision/slurm/utils.py +689 -0
  115. sky/provision/vast/instance.py +4 -1
  116. sky/provision/vast/utils.py +11 -6
  117. sky/resources.py +135 -13
  118. sky/schemas/api/responses.py +4 -0
  119. sky/schemas/db/global_user_state/010_save_ssh_key.py +1 -1
  120. sky/schemas/db/spot_jobs/008_add_full_resources.py +34 -0
  121. sky/schemas/db/spot_jobs/009_job_events.py +32 -0
  122. sky/schemas/db/spot_jobs/010_job_events_timestamp_with_timezone.py +43 -0
  123. sky/schemas/db/spot_jobs/011_add_links.py +34 -0
  124. sky/schemas/generated/jobsv1_pb2.py +9 -5
  125. sky/schemas/generated/jobsv1_pb2.pyi +12 -0
  126. sky/schemas/generated/jobsv1_pb2_grpc.py +44 -0
  127. sky/schemas/generated/managed_jobsv1_pb2.py +32 -28
  128. sky/schemas/generated/managed_jobsv1_pb2.pyi +11 -2
  129. sky/serve/serve_utils.py +232 -40
  130. sky/serve/server/impl.py +1 -1
  131. sky/server/common.py +17 -0
  132. sky/server/constants.py +1 -1
  133. sky/server/metrics.py +6 -3
  134. sky/server/plugins.py +238 -0
  135. sky/server/requests/executor.py +5 -2
  136. sky/server/requests/payloads.py +30 -1
  137. sky/server/requests/request_names.py +4 -0
  138. sky/server/requests/requests.py +33 -11
  139. sky/server/requests/serializers/encoders.py +22 -0
  140. sky/server/requests/serializers/return_value_serializers.py +70 -0
  141. sky/server/server.py +506 -109
  142. sky/server/server_utils.py +30 -0
  143. sky/server/uvicorn.py +5 -0
  144. sky/setup_files/MANIFEST.in +1 -0
  145. sky/setup_files/dependencies.py +22 -9
  146. sky/sky_logging.py +2 -1
  147. sky/skylet/attempt_skylet.py +13 -3
  148. sky/skylet/constants.py +55 -13
  149. sky/skylet/events.py +10 -4
  150. sky/skylet/executor/__init__.py +1 -0
  151. sky/skylet/executor/slurm.py +187 -0
  152. sky/skylet/job_lib.py +91 -5
  153. sky/skylet/log_lib.py +22 -6
  154. sky/skylet/log_lib.pyi +8 -6
  155. sky/skylet/services.py +18 -3
  156. sky/skylet/skylet.py +5 -1
  157. sky/skylet/subprocess_daemon.py +2 -1
  158. sky/ssh_node_pools/constants.py +12 -0
  159. sky/ssh_node_pools/core.py +40 -3
  160. sky/ssh_node_pools/deploy/__init__.py +4 -0
  161. sky/{utils/kubernetes/deploy_ssh_node_pools.py → ssh_node_pools/deploy/deploy.py} +279 -504
  162. sky/ssh_node_pools/deploy/tunnel/ssh-tunnel.sh +379 -0
  163. sky/ssh_node_pools/deploy/tunnel_utils.py +199 -0
  164. sky/ssh_node_pools/deploy/utils.py +173 -0
  165. sky/ssh_node_pools/server.py +11 -13
  166. sky/{utils/kubernetes/ssh_utils.py → ssh_node_pools/utils.py} +9 -6
  167. sky/templates/kubernetes-ray.yml.j2 +12 -6
  168. sky/templates/slurm-ray.yml.j2 +115 -0
  169. sky/templates/vast-ray.yml.j2 +1 -0
  170. sky/templates/websocket_proxy.py +18 -41
  171. sky/users/model.conf +1 -1
  172. sky/users/permission.py +85 -52
  173. sky/users/rbac.py +31 -3
  174. sky/utils/annotations.py +108 -8
  175. sky/utils/auth_utils.py +42 -0
  176. sky/utils/cli_utils/status_utils.py +19 -5
  177. sky/utils/cluster_utils.py +10 -3
  178. sky/utils/command_runner.py +389 -35
  179. sky/utils/command_runner.pyi +43 -4
  180. sky/utils/common_utils.py +47 -31
  181. sky/utils/context.py +32 -0
  182. sky/utils/db/db_utils.py +36 -6
  183. sky/utils/db/migration_utils.py +41 -21
  184. sky/utils/infra_utils.py +5 -1
  185. sky/utils/instance_links.py +139 -0
  186. sky/utils/interactive_utils.py +49 -0
  187. sky/utils/kubernetes/generate_kubeconfig.sh +42 -33
  188. sky/utils/kubernetes/kubernetes_deploy_utils.py +2 -94
  189. sky/utils/kubernetes/rsync_helper.sh +5 -1
  190. sky/utils/kubernetes/ssh-tunnel.sh +7 -376
  191. sky/utils/plugin_extensions/__init__.py +14 -0
  192. sky/utils/plugin_extensions/external_failure_source.py +176 -0
  193. sky/utils/resources_utils.py +10 -8
  194. sky/utils/rich_utils.py +9 -11
  195. sky/utils/schemas.py +93 -19
  196. sky/utils/status_lib.py +7 -0
  197. sky/utils/subprocess_utils.py +17 -0
  198. sky/volumes/client/sdk.py +6 -3
  199. sky/volumes/server/core.py +65 -27
  200. sky_templates/ray/start_cluster +8 -4
  201. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/METADATA +67 -59
  202. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/RECORD +208 -180
  203. sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +0 -1
  204. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +0 -11
  205. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +0 -6
  206. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +0 -1
  207. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +0 -1
  208. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +0 -15
  209. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +0 -26
  210. sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +0 -1
  211. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +0 -1
  212. sky/dashboard/out/_next/static/chunks/3800-7b45f9fbb6308557.js +0 -1
  213. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +0 -1
  214. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +0 -1
  215. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +0 -15
  216. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +0 -13
  217. sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +0 -1
  218. sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +0 -1
  219. sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +0 -1
  220. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +0 -30
  221. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +0 -41
  222. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +0 -1
  223. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +0 -1
  224. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +0 -6
  225. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +0 -1
  226. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +0 -31
  227. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +0 -30
  228. sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +0 -34
  229. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +0 -16
  230. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +0 -1
  231. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +0 -1
  232. sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +0 -1
  233. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +0 -16
  234. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +0 -21
  235. sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +0 -1
  236. sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +0 -1
  237. sky/dashboard/out/_next/static/chunks/pages/workspaces-531b2f8c4bf89f82.js +0 -1
  238. sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +0 -1
  239. sky/dashboard/out/_next/static/css/0748ce22df867032.css +0 -3
  240. /sky/dashboard/out/_next/static/{96_E2yl3QAiIJGOYCkSpB → 3nu-b8raeKRNABZ2d4GAG}/_ssgManifest.js +0 -0
  241. /sky/{utils/kubernetes → ssh_node_pools/deploy/tunnel}/cleanup-tunnel.sh +0 -0
  242. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/WHEEL +0 -0
  243. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/entry_points.txt +0 -0
  244. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/licenses/LICENSE +0 -0
  245. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/top_level.txt +0 -0
sky/clouds/slurm.py ADDED
@@ -0,0 +1,610 @@
+ """Slurm."""
+
+ import typing
+ from typing import Dict, Iterator, List, Optional, Tuple, Union
+
+ from sky import catalog
+ from sky import clouds
+ from sky import sky_logging
+ from sky import skypilot_config
+ from sky.adaptors import slurm
+ from sky.provision.slurm import utils as slurm_utils
+ from sky.skylet import constants
+ from sky.utils import annotations
+ from sky.utils import common_utils
+ from sky.utils import registry
+ from sky.utils import resources_utils
+
+ if typing.TYPE_CHECKING:
+     from sky import resources as resources_lib
+     from sky.utils import volume as volume_lib
+
+ logger = sky_logging.init_logger(__name__)
+
+ CREDENTIAL_PATH = slurm_utils.DEFAULT_SLURM_PATH
+
+
+ @registry.CLOUD_REGISTRY.register
+ class Slurm(clouds.Cloud):
+     """Slurm."""
+
+     _REPR = 'Slurm'
+     _CLOUD_UNSUPPORTED_FEATURES = {
+         clouds.CloudImplementationFeatures.AUTOSTOP: 'Slurm does not '
+             'support autostop.',
+         clouds.CloudImplementationFeatures.STOP: 'Slurm does not support '
+             'stopping instances.',
+         clouds.CloudImplementationFeatures.SPOT_INSTANCE: 'Spot instances are '
+             'not supported in '
+             'Slurm.',
+         clouds.CloudImplementationFeatures.CUSTOM_MULTI_NETWORK:
+             'Customized multiple network interfaces are not supported in '
+             'Slurm.',
+         clouds.CloudImplementationFeatures.OPEN_PORTS: 'Opening ports is not '
+             'supported in Slurm.',
+         clouds.CloudImplementationFeatures.HOST_CONTROLLERS:
+             'Running '
+             'controllers is not '
+             'well tested with '
+             'Slurm.',
+         clouds.CloudImplementationFeatures.IMAGE_ID: 'Specifying image ID is '
+             'not supported in Slurm.',
+         clouds.CloudImplementationFeatures.DOCKER_IMAGE: 'Docker image is not '
+             'supported in Slurm.',
+     }
+     _MAX_CLUSTER_NAME_LEN_LIMIT = 120
+     _regions: List[clouds.Region] = []
+     _INDENT_PREFIX = ' '
+
+     # Same as Kubernetes.
+     _DEFAULT_NUM_VCPUS_WITH_GPU = 4
+     _DEFAULT_MEMORY_CPU_RATIO_WITH_GPU = 4
+
+     # Using the latest SkyPilot provisioner API to provision and check status.
+     PROVISIONER_VERSION = clouds.ProvisionerVersion.SKYPILOT
+     STATUS_VERSION = clouds.StatusVersion.SKYPILOT
+
+     _SSH_CONFIG_KEY_MAPPING = {
+         'identityfile': 'IdentityFile',
+         'user': 'User',
+         'hostname': 'HostName',
+     }
+
+     @classmethod
+     def _unsupported_features_for_resources(
+         cls,
+         resources: 'resources_lib.Resources',
+         region: Optional[str] = None,
+     ) -> Dict[clouds.CloudImplementationFeatures, str]:
+         del region  # unused
+         # logger.critical('[BYPASS] Check Slurm's unsupported features...')
+         return cls._CLOUD_UNSUPPORTED_FEATURES
+
+     @classmethod
+     def _max_cluster_name_length(cls) -> Optional[int]:
+         return cls._MAX_CLUSTER_NAME_LEN_LIMIT
+
+     @classmethod
+     def uses_ray(cls) -> bool:
+         return False
+
+     @classmethod
+     def get_vcpus_mem_from_instance_type(
+         cls,
+         instance_type: str,
+     ) -> Tuple[Optional[float], Optional[float]]:
+         inst = slurm_utils.SlurmInstanceType.from_instance_type(instance_type)
+         return inst.cpus, inst.memory
+
+     @classmethod
+     def zones_provision_loop(
+         cls,
+         *,
+         region: str,
+         num_nodes: int,
+         instance_type: str,
+         accelerators: Optional[Dict[str, int]] = None,
+         use_spot: bool = False,
+     ) -> Iterator[Optional[List[clouds.Zone]]]:
+         """Iterate over partitions (zones) for provisioning with failover.
+
+         Yields one partition at a time for failover retry logic.
+         """
+         del num_nodes  # unused
+
+         regions = cls.regions_with_offering(instance_type,
+                                             accelerators,
+                                             use_spot,
+                                             region=region,
+                                             zone=None)
+
+         for r in regions:
+             if r.zones:
+                 # Yield one partition at a time for failover
+                 for zone in r.zones:
+                     yield [zone]
+             else:
+                 # No partitions discovered, use default
+                 yield None
+
+     @classmethod
+     @annotations.lru_cache(scope='global', maxsize=1)
+     def _log_skipped_clusters_once(cls, skipped_clusters: Tuple[str,
+                                                                 ...]) -> None:
+         """Log skipped clusters for only once.
+
+         We don't directly cache the result of existing_allowed_clusters
+         as the config may update the allowed clusters.
+         """
+         if skipped_clusters:
+             logger.warning(
+                 f'Slurm clusters {set(skipped_clusters)!r} specified in '
+                 '"allowed_clusters" not found in ~/.slurm/config. '
+                 'Ignoring these clusters.')
+
+     @classmethod
+     def existing_allowed_clusters(cls, silent: bool = False) -> List[str]:
+         """Get existing allowed clusters.
+
+         Returns clusters based on the following logic:
+         1. If 'allowed_clusters' is set to 'all' in ~/.sky/config.yaml,
+            return all clusters from ~/.slurm/config
+         2. If specific clusters are listed in 'allowed_clusters',
+            return only those that exist in ~/.slurm/config
+         3. If no configuration is specified, return all clusters
+            from ~/.slurm/config (default behavior)
+         """
+         all_clusters = slurm_utils.get_all_slurm_cluster_names()
+         if len(all_clusters) == 0:
+             return []
+
+         all_clusters = set(all_clusters)
+
+         # Workspace-level allowed_clusters should take precedence over
+         # the global allowed_clusters.
+         allowed_clusters = skypilot_config.get_workspace_cloud('slurm').get(
+             'allowed_clusters', None)
+         if allowed_clusters is None:
+             allowed_clusters = skypilot_config.get_effective_region_config(
+                 cloud='slurm',
+                 region=None,
+                 keys=('allowed_clusters',),
+                 default_value=None)
+
+         allow_all_clusters = allowed_clusters == 'all'
+         if allow_all_clusters:
+             allowed_clusters = list(all_clusters)
+
+         if allowed_clusters is None:
+             # Default to all clusters if no configuration is specified
+             allowed_clusters = list(all_clusters)
+
+         existing_clusters = []
+         skipped_clusters = []
+         for cluster in allowed_clusters:
+             if cluster in all_clusters:
+                 existing_clusters.append(cluster)
+             else:
+                 skipped_clusters.append(cluster)
+
+         if not silent:
+             cls._log_skipped_clusters_once(tuple(sorted(skipped_clusters)))
+
+         return existing_clusters
+
+     @classmethod
+     def regions_with_offering(
+             cls,
+             instance_type: Optional[str],
+             accelerators: Optional[Dict[str, int]],
+             use_spot: bool,
+             region: Optional[str],
+             zone: Optional[str],
+             resources: Optional['resources_lib.Resources'] = None
+     ) -> List[clouds.Region]:
+         del accelerators, use_spot, resources  # unused
+         existing_clusters = cls.existing_allowed_clusters()
+
+         regions: List[clouds.Region] = []
+         for cluster in existing_clusters:
+             # Filter by region if specified
+             if region is not None and cluster != region:
+                 continue
+
+             # Fetch partitions for this cluster and attach as zones
+             try:
+                 partitions = slurm_utils.get_partitions(cluster)
+                 if zone is not None:
+                     # Filter by zone (partition) if specified
+                     partitions = [p for p in partitions if p == zone]
+                 zones = [clouds.Zone(p) for p in partitions]
+             except Exception as e:  # pylint: disable=broad-except
+                 logger.debug(f'Failed to get partitions for {cluster}: {e}')
+                 zones = []
+
+             r = clouds.Region(cluster)
+             if zones:
+                 r.set_zones(zones)
+             regions.append(r)
+
+         # Check if requested instance type will fit in the cluster.
+         if instance_type is None:
+             return regions
+
+         regions_to_return = []
+         for r in regions:
+             cluster = r.name
+
+             # Check each partition (zone) in the cluster
+             partitions_to_check = [z.name for z in r.zones] if r.zones else []
+             valid_zones = []
+
+             # TODO(kevin): Batch this check to reduce number of roundtrips.
+             for partition in partitions_to_check:
+                 fits, reason = slurm_utils.check_instance_fits(
+                     cluster, instance_type, partition)
+                 if fits:
+                     if partition:
+                         valid_zones.append(clouds.Zone(partition))
+                 else:
+                     logger.debug(
+                         f'Instance type {instance_type} does not fit in '
+                         f'{cluster}/{partition}: {reason}')
+
+             if valid_zones:
+                 r.set_zones(valid_zones)
+                 regions_to_return.append(r)
+
+         return regions_to_return
+
+     def instance_type_to_hourly_cost(self,
+                                      instance_type: str,
+                                      use_spot: bool,
+                                      region: Optional[str] = None,
+                                      zone: Optional[str] = None) -> float:
+         """For now, we assume zero cost for Slurm clusters."""
+         return 0.0
+
+     def accelerators_to_hourly_cost(self,
+                                     accelerators: Dict[str, int],
+                                     use_spot: bool,
+                                     region: Optional[str] = None,
+                                     zone: Optional[str] = None) -> float:
+         """Returns the hourly cost of the accelerators, in dollars/hour."""
+         del accelerators, use_spot, region, zone  # unused
+         return 0.0
+
+     def get_egress_cost(self, num_gigabytes: float) -> float:
+         return 0.0
+
+     def __repr__(self):
+         return self._REPR
+
+     def is_same_cloud(self, other: clouds.Cloud) -> bool:
+         # Returns true if the two clouds are the same cloud type.
+         return isinstance(other, Slurm)
+
+     @classmethod
+     def get_default_instance_type(cls,
+                                   cpus: Optional[str] = None,
+                                   memory: Optional[str] = None,
+                                   disk_tier: Optional[
+                                       resources_utils.DiskTier] = None,
+                                   region: Optional[str] = None,
+                                   zone: Optional[str] = None) -> Optional[str]:
+         """Returns the default instance type for Slurm."""
+         return catalog.get_default_instance_type(cpus=cpus,
+                                                  memory=memory,
+                                                  disk_tier=disk_tier,
+                                                  region=region,
+                                                  zone=zone,
+                                                  clouds='slurm')
+
+     @classmethod
+     def get_accelerators_from_instance_type(
+             cls, instance_type: str) -> Optional[Dict[str, Union[int, float]]]:
+         inst = slurm_utils.SlurmInstanceType.from_instance_type(instance_type)
+         return {
+             inst.accelerator_type: inst.accelerator_count
+         } if (inst.accelerator_count is not None and
+               inst.accelerator_type is not None) else None
+
+     @classmethod
+     def get_zone_shell_cmd(cls) -> Optional[str]:
+         return None
+
+     def make_deploy_resources_variables(
+         self,
+         resources: 'resources_lib.Resources',
+         cluster_name: 'resources_utils.ClusterName',
+         region: Optional['clouds.Region'],
+         zones: Optional[List['clouds.Zone']],
+         num_nodes: int,
+         dryrun: bool = False,
+         volume_mounts: Optional[List['volume_lib.VolumeMount']] = None,
+     ) -> Dict[str, Optional[str]]:
+         del cluster_name, dryrun, volume_mounts  # Unused.
+         if region is not None:
+             cluster = region.name
+         else:
+             cluster = 'localcluster'
+         assert cluster is not None, 'No available Slurm cluster found.'
+
+         # Use zone as partition if specified, otherwise default
+         if zones and len(zones) > 0:
+             partition = zones[0].name
+         else:
+             partitions = slurm_utils.get_partitions(cluster)
+             if not partitions:
+                 raise ValueError(f'No partitions found for cluster {cluster}.')
+             # get_partitions returns the default partition first, then sorted
+             # alphabetically, so this also handles the case where the cluster
+             # does not have a default partition.
+             partition = partitions[0]
+
+         # cluster is our target slurmctld host.
+         ssh_config = slurm_utils.get_slurm_ssh_config()
+         ssh_config_dict = ssh_config.lookup(cluster)
+
+         resources = resources.assert_launchable()
+         acc_dict = self.get_accelerators_from_instance_type(
+             resources.instance_type)
+         custom_resources = resources_utils.make_ray_custom_resources_str(
+             acc_dict)
+
+         # resources.memory and cpus are none if they are not explicitly set.
+         # we fetch the default values for the instance type in that case.
+         s = slurm_utils.SlurmInstanceType.from_instance_type(
+             resources.instance_type)
+         cpus = s.cpus
+         mem = s.memory
+         # Optionally populate accelerator information.
+         acc_count = s.accelerator_count if s.accelerator_count else 0
+         acc_type = s.accelerator_type if s.accelerator_type else None
+         # Resolve the actual GPU type as it appears in the cluster's GRES.
+         # Slurm GRES types are case-sensitive.
+         if acc_type:
+             acc_type = slurm_utils.get_gres_gpu_type(cluster, acc_type)
+
+         deploy_vars = {
+             'instance_type': resources.instance_type,
+             'custom_resources': custom_resources,
+             'cpus': str(cpus),
+             'memory': str(mem),
+             'accelerator_count': str(acc_count),
+             'accelerator_type': acc_type,
+             'slurm_cluster': cluster,
+             'slurm_partition': partition,
+             # TODO(jwj): Pass SSH config in a smarter way
+             'ssh_hostname': ssh_config_dict['hostname'],
+             'ssh_port': str(ssh_config_dict.get('port', 22)),
+             'ssh_user': ssh_config_dict['user'],
+             'slurm_proxy_command': ssh_config_dict.get('proxycommand', None),
+             'slurm_proxy_jump': ssh_config_dict.get('proxyjump', None),
+             # TODO(jwj): Solve naming collision with 'ssh_private_key'.
+             # Please refer to slurm-ray.yml.j2 'ssh' and 'auth' sections.
+             'slurm_private_key': ssh_config_dict['identityfile'][0],
+             'slurm_sshd_host_key_filename':
+                 (slurm_utils.SLURM_SSHD_HOST_KEY_FILENAME),
+             'slurm_cluster_name_env_var':
+                 (constants.SKY_CLUSTER_NAME_ENV_VAR_KEY),
+         }
+
+         return deploy_vars
+
+     def _get_feasible_launchable_resources(
+         self, resources: 'resources_lib.Resources'
+     ) -> 'resources_utils.FeasibleResources':
+         """Returns a list of feasible resources for the given resources."""
+         if resources.instance_type is not None:
+             assert resources.is_launchable(), resources
+             # Check if the instance type is available in at least one cluster
+             available_regions = self.regions_with_offering(
+                 resources.instance_type,
+                 accelerators=None,
+                 use_spot=resources.use_spot,
+                 region=resources.region,
+                 zone=resources.zone)
+             if not available_regions:
+                 return resources_utils.FeasibleResources([], [], None)
+
+             # Return a single resource without region set.
+             # The optimizer will call make_launchables_for_valid_region_zones()
+             # which will create one resource per region/cluster.
+             resources = resources.copy(accelerators=None)
+             return resources_utils.FeasibleResources([resources], [], None)
+
+         def _make(instance_list):
+             resource_list = []
+             for instance_type in instance_list:
+                 r = resources.copy(
+                     cloud=Slurm(),
+                     instance_type=instance_type,
+                     accelerators=None,
+                 )
+                 resource_list.append(r)
+             return resource_list
+
+         # Currently, handle a filter on accelerators only.
+         accelerators = resources.accelerators
+
+         default_instance_type = Slurm.get_default_instance_type(
+             cpus=resources.cpus,
+             memory=resources.memory,
+             disk_tier=resources.disk_tier,
+             region=resources.region,
+             zone=resources.zone)
+         if default_instance_type is None:
+             return resources_utils.FeasibleResources([], [], None)
+
+         if accelerators is None:
+             chosen_instance_type = default_instance_type
+         else:
+             assert len(accelerators) == 1, resources
+
+             # Build GPU-enabled instance type.
+             acc_type, acc_count = list(accelerators.items())[0]
+
+             slurm_instance_type = (slurm_utils.SlurmInstanceType.
+                                    from_instance_type(default_instance_type))
+
+             gpu_task_cpus = slurm_instance_type.cpus
+             if resources.cpus is None:
+                 gpu_task_cpus = self._DEFAULT_NUM_VCPUS_WITH_GPU * acc_count
+             # Special handling to bump up memory multiplier for GPU instances
+             gpu_task_memory = (float(resources.memory.strip('+')) if
+                                resources.memory is not None else gpu_task_cpus *
+                                self._DEFAULT_MEMORY_CPU_RATIO_WITH_GPU)
+
+             chosen_instance_type = (
+                 slurm_utils.SlurmInstanceType.from_resources(
+                     gpu_task_cpus, gpu_task_memory, acc_count, acc_type).name)
+
+         # Check the availability of the specified instance type in all
+         # Slurm clusters.
+         available_regions = self.regions_with_offering(
+             chosen_instance_type,
+             accelerators=None,
+             use_spot=resources.use_spot,
+             region=resources.region,
+             zone=resources.zone)
+         if not available_regions:
+             return resources_utils.FeasibleResources([], [], None)
+
+         return resources_utils.FeasibleResources(_make([chosen_instance_type]),
+                                                  [], None)
+
+     @classmethod
+     def _check_compute_credentials(
+             cls) -> Tuple[bool, Optional[Union[str, Dict[str, str]]]]:
+         """Checks if the user has access credentials to the Slurm cluster."""
+         try:
+             ssh_config = slurm_utils.get_slurm_ssh_config()
+         except FileNotFoundError:
+             return (
+                 False,
+                 f'Slurm configuration file {slurm_utils.DEFAULT_SLURM_PATH} '
+                 'does not exist.\n'
+                 f'{cls._INDENT_PREFIX}For more info: '
+                 'https://docs.skypilot.co/en/latest/getting-started/'
+                 'installation.html#slurm-installation')
+         except Exception as e:  # pylint: disable=broad-except
+             return (False, 'Failed to load SSH configuration from '
+                     f'{slurm_utils.DEFAULT_SLURM_PATH}: '
+                     f'{common_utils.format_exception(e)}.')
+         existing_allowed_clusters = cls.existing_allowed_clusters()
+
+         if not existing_allowed_clusters:
+             return (False, 'No Slurm clusters found in ~/.slurm/config. '
+                     'Please configure at least one Slurm cluster.')
+
+         # Check credentials for each cluster and return ctx2text mapping
+         ctx2text = {}
+         success = False
+         for cluster in existing_allowed_clusters:
+             # Retrieve the config options for a given SlurmctldHost name alias.
+             ssh_config_dict = ssh_config.lookup(cluster)
+             try:
+                 client = slurm.SlurmClient(
+                     ssh_config_dict['hostname'],
+                     int(ssh_config_dict.get('port', 22)),
+                     ssh_config_dict['user'],
+                     ssh_config_dict['identityfile'][0],
+                     ssh_proxy_command=ssh_config_dict.get('proxycommand', None),
+                     ssh_proxy_jump=ssh_config_dict.get('proxyjump', None))
+                 info = client.info()
+                 logger.debug(f'Slurm cluster {cluster} sinfo: {info}')
+                 ctx2text[cluster] = 'enabled'
+                 success = True
+             except KeyError as e:
+                 key = e.args[0]
+                 ctx2text[cluster] = (
+                     f'disabled. '
+                     f'{cls._SSH_CONFIG_KEY_MAPPING.get(key, key.capitalize())} '
+                     'is missing, please check your ~/.slurm/config '
+                     'and try again.')
+             except Exception as e:  # pylint: disable=broad-except
+                 error_msg = (f'Credential check failed: '
+                              f'{common_utils.format_exception(e)}')
+                 ctx2text[cluster] = f'disabled. {error_msg}'
+
+         return success, ctx2text
+
+     def get_credential_file_mounts(self) -> Dict[str, str]:
+         ########
+         # TODO #
+         ########
+         # Return dictionary of credential file paths. This may look
+         # something like:
+         return {}
+
+     @classmethod
+     def get_current_user_identity(cls) -> Optional[List[str]]:
+         # NOTE: used for very advanced SkyPilot functionality
+         # Can implement later if desired
+         return None
+
+     def instance_type_exists(self, instance_type: str) -> bool:
+         return catalog.instance_type_exists(instance_type, 'slurm')
+
+     def validate_region_zone(self, region: Optional[str], zone: Optional[str]):
+         """Validate region (cluster) and zone (partition).
+
+         Args:
+             region: Slurm cluster name.
+             zone: Slurm partition name (optional).
+
+         Returns:
+             Tuple of (region, zone) if valid.
+
+         Raises:
+             ValueError: If cluster or partition not found.
+         """
+         all_clusters = slurm_utils.get_all_slurm_cluster_names()
+         if region and region not in all_clusters:
+             raise ValueError(
+                 f'Cluster {region} not found in Slurm config. Slurm only '
+                 'supports cluster names as regions. Available '
+                 f'clusters: {all_clusters}')
+
+         # Validate partition (zone) if specified
+         if zone is not None:
+             if region is None:
+                 raise ValueError(
+                     'Cannot specify partition (zone) without specifying '
+                     'cluster (region) for Slurm.')
+
+             partitions = slurm_utils.get_partitions(region)
+             if zone not in partitions:
+                 raise ValueError(
+                     f'Partition {zone!r} not found in cluster {region!r}. '
+                     f'Available partitions: {partitions}')
+
+         return region, zone
+
+     def accelerator_in_region_or_zone(self,
+                                       accelerator: str,
+                                       acc_count: int,
+                                       region: Optional[str] = None,
+                                       zone: Optional[str] = None) -> bool:
+         del zone  # unused for now
+         regions = catalog.get_region_zones_for_accelerators(accelerator,
+                                                             acc_count,
+                                                             use_spot=False,
+                                                             clouds='slurm')
+         if not regions:
+             return False
+         if region is None:
+             return True
+         return any(r.name == region for r in regions)
+
+     @classmethod
+     def expand_infras(cls) -> List[str]:
+         """Returns a list of enabled Slurm clusters.
+
+         Each is returned as 'Slurm/cluster-name'.
+         """
+         infras = []
+         for cluster in cls.existing_allowed_clusters(silent=True):
+             infras.append(f'{cls.canonical_name()}/{cluster}')
+         return infras
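The new module above maps Slurm onto SkyPilot's existing cloud model: a Slurm cluster plays the role of a region, a partition plays the role of a zone, and all costs are reported as zero. Below is a minimal illustrative sketch (not part of the diff) of how the registered cloud could be exercised; the cluster and partition names are hypothetical and a populated ~/.slurm/config is assumed.

from sky.utils import registry

# The @registry.CLOUD_REGISTRY.register decorator makes the new cloud
# resolvable by name, like the other clouds.
slurm_cloud = registry.CLOUD_REGISTRY.from_str('slurm')

# Region == Slurm cluster name, zone == partition name; both are checked
# against ~/.slurm/config and the cluster's partition list.
region, zone = slurm_cloud.validate_region_zone('my-cluster', 'gpu-partition')

# Slurm capacity is treated as free by the optimizer.
assert slurm_cloud.get_egress_cost(num_gigabytes=10.0) == 0.0
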
sky/clouds/ssh.py CHANGED
@@ -9,6 +9,7 @@ from sky import skypilot_config
  from sky.adaptors import kubernetes as kubernetes_adaptor
  from sky.clouds import kubernetes
  from sky.provision.kubernetes import utils as kubernetes_utils
+ from sky.ssh_node_pools import constants as ssh_constants
  from sky.utils import annotations
  from sky.utils import common_utils
  from sky.utils import registry
@@ -20,7 +21,7 @@ if typing.TYPE_CHECKING:

  logger = sky_logging.init_logger(__name__)

- SSH_NODE_POOLS_PATH = os.path.expanduser('~/.sky/ssh_node_pools.yaml')
+ SSH_NODE_POOLS_PATH = ssh_constants.DEFAULT_SSH_NODE_POOLS_PATH


  @registry.CLOUD_REGISTRY.register()
@@ -254,7 +255,7 @@ class SSH(kubernetes.Kubernetes):
      @classmethod
      def expand_infras(cls) -> List[str]:
          return [
-             f'{cls.canonical_name()}/{c.lstrip("ssh-")}'
+             f'{cls.canonical_name()}/{common_utils.removeprefix(c, "ssh-")}'
              for c in cls.existing_allowed_contexts(silent=True)
          ]
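A note on the final hunk above: str.lstrip('ssh-') strips any leading run of the characters 's', 'h', and '-', not the literal 'ssh-' prefix, so context names beginning with those characters get truncated; the new common_utils.removeprefix helper removes only the exact prefix (the built-in str.removeprefix needs Python 3.9+). A quick sketch of the difference, using a made-up context name:

# 'ssh-shared-h100' is a hypothetical SSH Node Pool context name.
context = 'ssh-shared-h100'

# lstrip treats its argument as a character set and keeps stripping
# 's', 'h', and '-', mangling the name:
print(context.lstrip('ssh-'))        # -> 'ared-h100'

# Removing only the literal prefix keeps the rest of the name intact:
print(context.removeprefix('ssh-'))  # -> 'shared-h100'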