skypilot-nightly 1.0.0.dev20251203__py3-none-any.whl → 1.0.0.dev20260112__py3-none-any.whl

This diff compares the contents of two publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the versions as they appear in their public registries.
Files changed (245)
  1. sky/__init__.py +6 -2
  2. sky/adaptors/aws.py +1 -61
  3. sky/adaptors/slurm.py +565 -0
  4. sky/backends/backend_utils.py +95 -12
  5. sky/backends/cloud_vm_ray_backend.py +224 -65
  6. sky/backends/task_codegen.py +380 -4
  7. sky/catalog/__init__.py +0 -3
  8. sky/catalog/data_fetchers/fetch_gcp.py +9 -1
  9. sky/catalog/data_fetchers/fetch_nebius.py +1 -1
  10. sky/catalog/data_fetchers/fetch_vast.py +4 -2
  11. sky/catalog/kubernetes_catalog.py +12 -4
  12. sky/catalog/seeweb_catalog.py +30 -15
  13. sky/catalog/shadeform_catalog.py +5 -2
  14. sky/catalog/slurm_catalog.py +236 -0
  15. sky/catalog/vast_catalog.py +30 -6
  16. sky/check.py +25 -11
  17. sky/client/cli/command.py +391 -32
  18. sky/client/interactive_utils.py +190 -0
  19. sky/client/sdk.py +64 -2
  20. sky/client/sdk_async.py +9 -0
  21. sky/clouds/__init__.py +2 -0
  22. sky/clouds/aws.py +60 -2
  23. sky/clouds/azure.py +2 -0
  24. sky/clouds/cloud.py +7 -0
  25. sky/clouds/kubernetes.py +2 -0
  26. sky/clouds/runpod.py +38 -7
  27. sky/clouds/slurm.py +610 -0
  28. sky/clouds/ssh.py +3 -2
  29. sky/clouds/vast.py +39 -16
  30. sky/core.py +197 -37
  31. sky/dashboard/out/404.html +1 -1
  32. sky/dashboard/out/_next/static/3nu-b8raeKRNABZ2d4GAG/_buildManifest.js +1 -0
  33. sky/dashboard/out/_next/static/chunks/1871-0565f8975a7dcd10.js +6 -0
  34. sky/dashboard/out/_next/static/chunks/2109-55a1546d793574a7.js +11 -0
  35. sky/dashboard/out/_next/static/chunks/2521-099b07cd9e4745bf.js +26 -0
  36. sky/dashboard/out/_next/static/chunks/2755.a636e04a928a700e.js +31 -0
  37. sky/dashboard/out/_next/static/chunks/3495.05eab4862217c1a5.js +6 -0
  38. sky/dashboard/out/_next/static/chunks/3785.cfc5dcc9434fd98c.js +1 -0
  39. sky/dashboard/out/_next/static/chunks/3850-fd5696f3bbbaddae.js +1 -0
  40. sky/dashboard/out/_next/static/chunks/3981.645d01bf9c8cad0c.js +21 -0
  41. sky/dashboard/out/_next/static/chunks/4083-0115d67c1fb57d6c.js +21 -0
  42. sky/dashboard/out/_next/static/chunks/{8640.5b9475a2d18c5416.js → 429.a58e9ba9742309ed.js} +2 -2
  43. sky/dashboard/out/_next/static/chunks/4555.8e221537181b5dc1.js +6 -0
  44. sky/dashboard/out/_next/static/chunks/4725.937865b81fdaaebb.js +6 -0
  45. sky/dashboard/out/_next/static/chunks/6082-edabd8f6092300ce.js +25 -0
  46. sky/dashboard/out/_next/static/chunks/6989-49cb7dca83a7a62d.js +1 -0
  47. sky/dashboard/out/_next/static/chunks/6990-630bd2a2257275f8.js +1 -0
  48. sky/dashboard/out/_next/static/chunks/7248-a99800d4db8edabd.js +1 -0
  49. sky/dashboard/out/_next/static/chunks/754-cfc5d4ad1b843d29.js +18 -0
  50. sky/dashboard/out/_next/static/chunks/8050-dd8aa107b17dce00.js +16 -0
  51. sky/dashboard/out/_next/static/chunks/8056-d4ae1e0cb81e7368.js +1 -0
  52. sky/dashboard/out/_next/static/chunks/8555.011023e296c127b3.js +6 -0
  53. sky/dashboard/out/_next/static/chunks/8821-93c25df904a8362b.js +1 -0
  54. sky/dashboard/out/_next/static/chunks/8969-0662594b69432ade.js +1 -0
  55. sky/dashboard/out/_next/static/chunks/9025.f15c91c97d124a5f.js +6 -0
  56. sky/dashboard/out/_next/static/chunks/9353-7ad6bd01858556f1.js +1 -0
  57. sky/dashboard/out/_next/static/chunks/pages/_app-5a86569acad99764.js +34 -0
  58. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-8297476714acb4ac.js +6 -0
  59. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-337c3ba1085f1210.js +1 -0
  60. sky/dashboard/out/_next/static/chunks/pages/{clusters-ee39056f9851a3ff.js → clusters-57632ff3684a8b5c.js} +1 -1
  61. sky/dashboard/out/_next/static/chunks/pages/{config-dfb9bf07b13045f4.js → config-718cdc365de82689.js} +1 -1
  62. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-5fd3a453c079c2ea.js +1 -0
  63. sky/dashboard/out/_next/static/chunks/pages/infra-9f85c02c9c6cae9e.js +1 -0
  64. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-90f16972cbecf354.js +1 -0
  65. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-2dd42fc37aad427a.js +16 -0
  66. sky/dashboard/out/_next/static/chunks/pages/jobs-ed806aeace26b972.js +1 -0
  67. sky/dashboard/out/_next/static/chunks/pages/plugins/[...slug]-449a9f5a3bb20fb3.js +1 -0
  68. sky/dashboard/out/_next/static/chunks/pages/users-bec34706b36f3524.js +1 -0
  69. sky/dashboard/out/_next/static/chunks/pages/{volumes-b84b948ff357c43e.js → volumes-a83ba9b38dff7ea9.js} +1 -1
  70. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-84a40f8c7c627fe4.js → [name]-c781e9c3e52ef9fc.js} +1 -1
  71. sky/dashboard/out/_next/static/chunks/pages/workspaces-91e0942f47310aae.js +1 -0
  72. sky/dashboard/out/_next/static/chunks/webpack-cfe59cf684ee13b9.js +1 -0
  73. sky/dashboard/out/_next/static/css/b0dbca28f027cc19.css +3 -0
  74. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  75. sky/dashboard/out/clusters/[cluster].html +1 -1
  76. sky/dashboard/out/clusters.html +1 -1
  77. sky/dashboard/out/config.html +1 -1
  78. sky/dashboard/out/index.html +1 -1
  79. sky/dashboard/out/infra/[context].html +1 -1
  80. sky/dashboard/out/infra.html +1 -1
  81. sky/dashboard/out/jobs/[job].html +1 -1
  82. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  83. sky/dashboard/out/jobs.html +1 -1
  84. sky/dashboard/out/plugins/[...slug].html +1 -0
  85. sky/dashboard/out/users.html +1 -1
  86. sky/dashboard/out/volumes.html +1 -1
  87. sky/dashboard/out/workspace/new.html +1 -1
  88. sky/dashboard/out/workspaces/[name].html +1 -1
  89. sky/dashboard/out/workspaces.html +1 -1
  90. sky/data/data_utils.py +26 -12
  91. sky/data/mounting_utils.py +44 -5
  92. sky/global_user_state.py +111 -19
  93. sky/jobs/client/sdk.py +8 -3
  94. sky/jobs/controller.py +191 -31
  95. sky/jobs/recovery_strategy.py +109 -11
  96. sky/jobs/server/core.py +81 -4
  97. sky/jobs/server/server.py +14 -0
  98. sky/jobs/state.py +417 -19
  99. sky/jobs/utils.py +73 -80
  100. sky/models.py +11 -0
  101. sky/optimizer.py +8 -6
  102. sky/provision/__init__.py +12 -9
  103. sky/provision/common.py +20 -0
  104. sky/provision/docker_utils.py +15 -2
  105. sky/provision/kubernetes/utils.py +163 -20
  106. sky/provision/kubernetes/volume.py +52 -17
  107. sky/provision/provisioner.py +17 -7
  108. sky/provision/runpod/instance.py +3 -1
  109. sky/provision/runpod/utils.py +13 -1
  110. sky/provision/runpod/volume.py +25 -9
  111. sky/provision/slurm/__init__.py +12 -0
  112. sky/provision/slurm/config.py +13 -0
  113. sky/provision/slurm/instance.py +618 -0
  114. sky/provision/slurm/utils.py +689 -0
  115. sky/provision/vast/instance.py +4 -1
  116. sky/provision/vast/utils.py +11 -6
  117. sky/resources.py +135 -13
  118. sky/schemas/api/responses.py +4 -0
  119. sky/schemas/db/global_user_state/010_save_ssh_key.py +1 -1
  120. sky/schemas/db/spot_jobs/008_add_full_resources.py +34 -0
  121. sky/schemas/db/spot_jobs/009_job_events.py +32 -0
  122. sky/schemas/db/spot_jobs/010_job_events_timestamp_with_timezone.py +43 -0
  123. sky/schemas/db/spot_jobs/011_add_links.py +34 -0
  124. sky/schemas/generated/jobsv1_pb2.py +9 -5
  125. sky/schemas/generated/jobsv1_pb2.pyi +12 -0
  126. sky/schemas/generated/jobsv1_pb2_grpc.py +44 -0
  127. sky/schemas/generated/managed_jobsv1_pb2.py +32 -28
  128. sky/schemas/generated/managed_jobsv1_pb2.pyi +11 -2
  129. sky/serve/serve_utils.py +232 -40
  130. sky/serve/server/impl.py +1 -1
  131. sky/server/common.py +17 -0
  132. sky/server/constants.py +1 -1
  133. sky/server/metrics.py +6 -3
  134. sky/server/plugins.py +238 -0
  135. sky/server/requests/executor.py +5 -2
  136. sky/server/requests/payloads.py +30 -1
  137. sky/server/requests/request_names.py +4 -0
  138. sky/server/requests/requests.py +33 -11
  139. sky/server/requests/serializers/encoders.py +22 -0
  140. sky/server/requests/serializers/return_value_serializers.py +70 -0
  141. sky/server/server.py +506 -109
  142. sky/server/server_utils.py +30 -0
  143. sky/server/uvicorn.py +5 -0
  144. sky/setup_files/MANIFEST.in +1 -0
  145. sky/setup_files/dependencies.py +22 -9
  146. sky/sky_logging.py +2 -1
  147. sky/skylet/attempt_skylet.py +13 -3
  148. sky/skylet/constants.py +55 -13
  149. sky/skylet/events.py +10 -4
  150. sky/skylet/executor/__init__.py +1 -0
  151. sky/skylet/executor/slurm.py +187 -0
  152. sky/skylet/job_lib.py +91 -5
  153. sky/skylet/log_lib.py +22 -6
  154. sky/skylet/log_lib.pyi +8 -6
  155. sky/skylet/services.py +18 -3
  156. sky/skylet/skylet.py +5 -1
  157. sky/skylet/subprocess_daemon.py +2 -1
  158. sky/ssh_node_pools/constants.py +12 -0
  159. sky/ssh_node_pools/core.py +40 -3
  160. sky/ssh_node_pools/deploy/__init__.py +4 -0
  161. sky/{utils/kubernetes/deploy_ssh_node_pools.py → ssh_node_pools/deploy/deploy.py} +279 -504
  162. sky/ssh_node_pools/deploy/tunnel/ssh-tunnel.sh +379 -0
  163. sky/ssh_node_pools/deploy/tunnel_utils.py +199 -0
  164. sky/ssh_node_pools/deploy/utils.py +173 -0
  165. sky/ssh_node_pools/server.py +11 -13
  166. sky/{utils/kubernetes/ssh_utils.py → ssh_node_pools/utils.py} +9 -6
  167. sky/templates/kubernetes-ray.yml.j2 +12 -6
  168. sky/templates/slurm-ray.yml.j2 +115 -0
  169. sky/templates/vast-ray.yml.j2 +1 -0
  170. sky/templates/websocket_proxy.py +18 -41
  171. sky/users/model.conf +1 -1
  172. sky/users/permission.py +85 -52
  173. sky/users/rbac.py +31 -3
  174. sky/utils/annotations.py +108 -8
  175. sky/utils/auth_utils.py +42 -0
  176. sky/utils/cli_utils/status_utils.py +19 -5
  177. sky/utils/cluster_utils.py +10 -3
  178. sky/utils/command_runner.py +389 -35
  179. sky/utils/command_runner.pyi +43 -4
  180. sky/utils/common_utils.py +47 -31
  181. sky/utils/context.py +32 -0
  182. sky/utils/db/db_utils.py +36 -6
  183. sky/utils/db/migration_utils.py +41 -21
  184. sky/utils/infra_utils.py +5 -1
  185. sky/utils/instance_links.py +139 -0
  186. sky/utils/interactive_utils.py +49 -0
  187. sky/utils/kubernetes/generate_kubeconfig.sh +42 -33
  188. sky/utils/kubernetes/kubernetes_deploy_utils.py +2 -94
  189. sky/utils/kubernetes/rsync_helper.sh +5 -1
  190. sky/utils/kubernetes/ssh-tunnel.sh +7 -376
  191. sky/utils/plugin_extensions/__init__.py +14 -0
  192. sky/utils/plugin_extensions/external_failure_source.py +176 -0
  193. sky/utils/resources_utils.py +10 -8
  194. sky/utils/rich_utils.py +9 -11
  195. sky/utils/schemas.py +93 -19
  196. sky/utils/status_lib.py +7 -0
  197. sky/utils/subprocess_utils.py +17 -0
  198. sky/volumes/client/sdk.py +6 -3
  199. sky/volumes/server/core.py +65 -27
  200. sky_templates/ray/start_cluster +8 -4
  201. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/METADATA +67 -59
  202. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/RECORD +208 -180
  203. sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +0 -1
  204. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +0 -11
  205. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +0 -6
  206. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +0 -1
  207. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +0 -1
  208. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +0 -15
  209. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +0 -26
  210. sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +0 -1
  211. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +0 -1
  212. sky/dashboard/out/_next/static/chunks/3800-7b45f9fbb6308557.js +0 -1
  213. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +0 -1
  214. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +0 -1
  215. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +0 -15
  216. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +0 -13
  217. sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +0 -1
  218. sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +0 -1
  219. sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +0 -1
  220. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +0 -30
  221. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +0 -41
  222. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +0 -1
  223. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +0 -1
  224. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +0 -6
  225. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +0 -1
  226. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +0 -31
  227. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +0 -30
  228. sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +0 -34
  229. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +0 -16
  230. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +0 -1
  231. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +0 -1
  232. sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +0 -1
  233. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +0 -16
  234. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +0 -21
  235. sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +0 -1
  236. sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +0 -1
  237. sky/dashboard/out/_next/static/chunks/pages/workspaces-531b2f8c4bf89f82.js +0 -1
  238. sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +0 -1
  239. sky/dashboard/out/_next/static/css/0748ce22df867032.css +0 -3
  240. /sky/dashboard/out/_next/static/{96_E2yl3QAiIJGOYCkSpB → 3nu-b8raeKRNABZ2d4GAG}/_ssgManifest.js +0 -0
  241. /sky/{utils/kubernetes → ssh_node_pools/deploy/tunnel}/cleanup-tunnel.sh +0 -0
  242. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/WHEEL +0 -0
  243. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/entry_points.txt +0 -0
  244. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/licenses/LICENSE +0 -0
  245. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/top_level.txt +0 -0
@@ -81,6 +81,7 @@ from sky.utils import timeline
 from sky.utils import ux_utils
 from sky.utils import volume as volume_lib
 from sky.utils import yaml_utils
+from sky.utils.plugin_extensions import ExternalFailureSource

 if typing.TYPE_CHECKING:
     import grpc
@@ -192,18 +193,6 @@ _RAY_UP_WITH_MONKEY_PATCHED_HASH_LAUNCH_CONF_PATH = (
     pathlib.Path(directory_utils.get_sky_dir()) / 'backends' /
     'monkey_patches' / 'monkey_patch_ray_up.py')

-# The maximum size of a command line arguments is 128 KB, i.e. the command
-# executed with /bin/sh should be less than 128KB.
-# https://github.com/torvalds/linux/blob/master/include/uapi/linux/binfmts.h
-#
-# If a user have very long run or setup commands, the generated command may
-# exceed the limit, as we directly include scripts in job submission commands.
-# If the command is too long, we instead write it to a file, rsync and execute
-# it.
-#
-# We use 100KB as a threshold to be safe for other arguments that
-# might be added during ssh.
-_MAX_INLINE_SCRIPT_LENGTH = 100 * 1024
 _EXCEPTION_MSG_AND_RETURNCODE_FOR_DUMP_INLINE_SCRIPT = [
     ('too long', 255),
     ('request-uri too large', 1),
@@ -218,18 +207,6 @@ _RESOURCES_UNAVAILABLE_LOG = (
 _CLUSTER_LOCK_TIMEOUT = 5.0


-def _is_command_length_over_limit(command: str) -> bool:
-    """Check if the length of the command exceeds the limit.
-
-    We calculate the length of the command after quoting the command twice as
-    when it is executed by the CommandRunner, the command will be quoted twice
-    to ensure the correctness, which will add significant length to the command.
-    """
-
-    quoted_length = len(shlex.quote(shlex.quote(command)))
-    return quoted_length > _MAX_INLINE_SCRIPT_LENGTH
-
-
 def _is_message_too_long(returncode: int,
                          output: Optional[str] = None,
                          file_path: Optional[str] = None) -> bool:
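Note: the helper removed here is not gone — later hunks in this file call it as backend_utils.is_command_length_over_limit. A minimal sketch of the check as it existed before the move, assuming the relocated helper keeps the same double-quoting logic and 100 KB threshold (the ~128 KB figure is the kernel per-argument limit cited in the removed comment):

import shlex

# 100 KB threshold, leaving headroom under the ~128 KB kernel limit on a
# single command-line argument (see linux/binfmts.h).
_MAX_INLINE_SCRIPT_LENGTH = 100 * 1024


def is_command_length_over_limit(command: str) -> bool:
    # The CommandRunner quotes the command twice before executing it over
    # ssh, so measure the doubly-quoted length.
    quoted_length = len(shlex.quote(shlex.quote(command)))
    return quoted_length > _MAX_INLINE_SCRIPT_LENGTH

When the check trips, the backend falls back to writing the script to a file, rsyncing it to the cluster, and executing it there, as described in the removed comment block.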
@@ -294,6 +271,7 @@ def _get_cluster_config_template(cloud):
         clouds.Lambda: 'lambda-ray.yml.j2',
         clouds.IBM: 'ibm-ray.yml.j2',
         clouds.SCP: 'scp-ray.yml.j2',
+        clouds.Slurm: 'slurm-ray.yml.j2',
         clouds.OCI: 'oci-ray.yml.j2',
         clouds.Paperspace: 'paperspace-ray.yml.j2',
         clouds.PrimeIntellect: 'primeintellect-ray.yml.j2',
@@ -938,8 +916,10 @@ class RetryingVmProvisioner(object):
         elif to_provision.region is not None and to_provision.cloud is not None:
             # For public clouds, provision.region is always set.
             if clouds.SSH().is_same_cloud(to_provision.cloud):
+                ssh_node_pool_name = common_utils.removeprefix(
+                    to_provision.region, 'ssh-')
                 message += (
-                    f'in SSH Node Pool ({to_provision.region.lstrip("ssh-")}) '
+                    f'in SSH Node Pool ({ssh_node_pool_name}) '
                     f'for {requested_resources}. The SSH Node Pool may not '
                     'have enough resources.')
             elif clouds.Kubernetes().is_same_cloud(to_provision.cloud):
@@ -1199,7 +1179,9 @@ class RetryingVmProvisioner(object):
             if isinstance(to_provision.cloud, clouds.Kubernetes):
                 suffix = '.'
                 if region.name.startswith('ssh-'):
-                    suffix = f' ({region.name.lstrip("ssh-")})'
+                    ssh_node_pool_name = common_utils.removeprefix(
+                        region.name, 'ssh-')
+                    suffix = f' ({ssh_node_pool_name})'
             logger.info(
                 ux_utils.starting_message(
                     f'Launching{controller_str} on '
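The two removeprefix changes above fix a subtle string bug: str.lstrip('ssh-') strips any leading run of the characters 's', 'h' and '-', not the literal 'ssh-' prefix, so pool names that begin with those letters get mangled. A quick illustration (common_utils.removeprefix is assumed to behave like Python 3.9's str.removeprefix):

pool = 'ssh-shared-pool'

# lstrip() treats its argument as a set of characters to strip, so the
# leading 's' and 'h' of 'shared' are eaten as well.
print(pool.lstrip('ssh-'))        # -> 'ared-pool'   (wrong)

# removeprefix() drops only the exact leading substring.
print(pool.removeprefix('ssh-'))  # -> 'shared-pool' (intended)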
@@ -2516,7 +2498,9 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
     @property
     def is_grpc_enabled_with_flag(self) -> bool:
         """Returns whether this handle has gRPC enabled and gRPC flag is set."""
-        return env_options.Options.ENABLE_GRPC.get() and self.is_grpc_enabled
+        return (env_options.Options.ENABLE_GRPC.get() and
+                self.is_grpc_enabled and
+                not isinstance(self.launched_resources.cloud, clouds.Slurm))

     def __getstate__(self):
         state = self.__dict__.copy()
@@ -2753,6 +2737,13 @@ class SkyletClient:
     ) -> 'jobsv1_pb2.GetLogDirsForJobsResponse':
         return self._jobs_stub.GetLogDirsForJobs(request, timeout=timeout)

+    def get_job_exit_codes(
+        self,
+        request: 'jobsv1_pb2.GetJobExitCodesRequest',
+        timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
+    ) -> 'jobsv1_pb2.GetJobExitCodesResponse':
+        return self._jobs_stub.GetJobExitCodes(request, timeout=timeout)
+
     def tail_logs(
         self,
         request: 'jobsv1_pb2.TailLogsRequest',
@@ -3061,6 +3052,26 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                 'sky api status -v | grep '
                 f'{cluster_name}'))

+    def _maybe_clear_external_cluster_failures(
+            self, cluster_name: str,
+            prev_cluster_status: Optional[status_lib.ClusterStatus]) -> None:
+        """Clear any existing cluster failures when reusing a cluster.
+
+        Clear any existing cluster failures when reusing a cluster. This ensures
+        that when a cluster failure is detected (causing the cluster to be
+        marked as INIT), the user can recover the cluster via `sky start` or
+        `sky launch` and clear the failure.
+        """
+        if prev_cluster_status is not None:
+            failures = ExternalFailureSource.clear(cluster_name=cluster_name)
+            if failures:
+                failure_details = [f'"{f["failure_mode"]}"' for f in failures]
+                plural = 's' if len(failures) > 1 else ''
+                logger.info(f'{colorama.Style.DIM}Cleared {len(failures)} '
+                            f'existing cluster failure{plural} for cluster '
+                            f'{cluster_name!r}: {", ".join(failure_details)}'
+                            f'{colorama.Style.RESET_ALL}')
+
     def _locked_provision(
         self,
         lock_id: str,
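ExternalFailureSource comes from the new sky/utils/plugin_extensions package listed above. Judging from this hunk, clear() returns the failure records it removed, each carrying at least a 'failure_mode' field; a rough sketch of consuming that return value (the cluster name and any fields beyond 'failure_mode' are illustrative assumptions, not confirmed by the diff):

# Hypothetical caller, mirroring _maybe_clear_external_cluster_failures above.
failures = ExternalFailureSource.clear(cluster_name='my-cluster')
if failures:
    cleared = ', '.join(f'"{f["failure_mode"]}"' for f in failures)
    print(f'Cleared {len(failures)} existing cluster failure(s): {cleared}')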
@@ -3091,6 +3102,9 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             to_provision_config.num_nodes, to_provision_config.resources)
         usage_lib.messages.usage.update_cluster_status(prev_cluster_status)

+        self._maybe_clear_external_cluster_failures(cluster_name,
+                                                    prev_cluster_status)
+
         # TODO(suquark): once we have sky on PyPI, we should directly
         # install sky from PyPI.
         # NOTE: can take ~2s.
@@ -3449,7 +3463,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             ssh_user=handle.ssh_user,
             docker_user=handle.docker_user)
         cluster_utils.SSHConfigHelper.add_cluster(
-            handle.cluster_name, handle.cached_external_ips, auth_config,
+            handle.cluster_name, handle.cluster_name_on_cloud,
+            handle.cached_external_ips, auth_config,
             handle.cached_external_ssh_ports, handle.docker_user,
             handle.ssh_user)

@@ -3596,6 +3611,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):

     def _setup(self, handle: CloudVmRayResourceHandle, task: task_lib.Task,
                detach_setup: bool) -> None:
+
         start = time.time()

         if task.setup is None:
@@ -3647,7 +3663,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             _dump_final_script(setup_script,
                                constants.PERSISTENT_SETUP_SCRIPT_PATH)

-        if detach_setup or _is_command_length_over_limit(encoded_script):
+        if (detach_setup or
+                backend_utils.is_command_length_over_limit(encoded_script)):
             _dump_final_script(setup_script)
             create_script_code = 'true'
         else:
3788
3805
  up=True,
3789
3806
  stream_logs=False)
3790
3807
 
3791
- cd = f'cd {SKY_REMOTE_WORKDIR}'
3792
- mkdir_code = (f'{cd} && mkdir -p {remote_log_dir} && '
3793
- f'touch {remote_log_path}')
3808
+ mkdir_code = f'mkdir -p {remote_log_dir} && touch {remote_log_path}'
3794
3809
  encoded_script = shlex.quote(codegen)
3795
3810
  create_script_code = f'{{ echo {encoded_script} > {script_path}; }}'
3796
3811
  job_submit_cmd = (
3797
3812
  # JOB_CMD_IDENTIFIER is used for identifying the process
3798
3813
  # retrieved with pid is the same driver process.
3799
3814
  f'{job_lib.JOB_CMD_IDENTIFIER.format(job_id)} && '
3800
- f'{cd} && {constants.SKY_PYTHON_CMD} -u {script_path}'
3815
+ f'{constants.SKY_PYTHON_CMD} -u {script_path}'
3801
3816
  # Do not use &>, which is not POSIX and may not work.
3802
3817
  # Note that the order of ">filename 2>&1" matters.
3803
3818
  f'> {remote_log_path} 2>&1')
3804
3819
  code = job_lib.JobLibCodeGen.queue_job(job_id, job_submit_cmd)
3820
+
3821
+ # For Slurm, we need to wait for the job to complete before exiting,
3822
+ # because Slurm's proctrack/cgroup kills all processes when the srun
3823
+ # job step ends, including child processes launched as a separate
3824
+ # process group.
3825
+ # So this keeps srun alive so the job driver process that was spawned
3826
+ # (and runs in the background) by job_lib.JobScheduler.schedule_step()
3827
+ # does not get killed.
3828
+ # Note: proctrack/cgroup is enabled by default on Nebius' Managed
3829
+ # Soperator.
3830
+ is_slurm = isinstance(handle.launched_resources.cloud, clouds.Slurm)
3831
+ if is_slurm:
3832
+ wait_code = job_lib.JobLibCodeGen.wait_for_job(job_id)
3833
+ code = code + ' && ' + wait_code
3834
+
3805
3835
  job_submit_cmd = ' && '.join([mkdir_code, create_script_code, code])
3806
3836
 
3807
- # Should also be ealier than _is_command_length_over_limit
3837
+ # Should also be ealier than is_command_length_over_limit
3808
3838
  # Same reason as in _setup
3809
3839
  if self._dump_final_script:
3810
3840
  _dump_code_to_file(job_submit_cmd,
@@ -3837,7 +3867,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3837
3867
  tasks=managed_job_tasks,
3838
3868
  user_id=managed_job_user_id)
3839
3869
 
3840
- if _is_command_length_over_limit(codegen):
3870
+ if backend_utils.is_command_length_over_limit(codegen):
3841
3871
  _dump_code_to_file(codegen)
3842
3872
  queue_job_request = jobsv1_pb2.QueueJobRequest(
3843
3873
  job_id=job_id,
@@ -3859,7 +3889,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3859
3889
  use_legacy = True
3860
3890
 
3861
3891
  if use_legacy:
3862
- if _is_command_length_over_limit(job_submit_cmd):
3892
+ if backend_utils.is_command_length_over_limit(job_submit_cmd):
3863
3893
  _dump_code_to_file(codegen)
3864
3894
  job_submit_cmd = f'{mkdir_code} && {code}'
3865
3895
 
@@ -3886,10 +3916,15 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3886
3916
 
3887
3917
  job_submit_cmd = _maybe_add_managed_job_code(job_submit_cmd)
3888
3918
 
3889
- returncode, stdout, stderr = self.run_on_head(handle,
3890
- job_submit_cmd,
3891
- stream_logs=False,
3892
- require_outputs=True)
3919
+ # For Slurm, run in background so that SSH returns immediately.
3920
+ # This is needed because we add the wait_for_job code above which
3921
+ # makes the command block until the job completes.
3922
+ returncode, stdout, stderr = self.run_on_head(
3923
+ handle,
3924
+ job_submit_cmd,
3925
+ stream_logs=False,
3926
+ require_outputs=True,
3927
+ run_in_background=is_slurm)
3893
3928
  # Happens when someone calls `sky exec` but remote is outdated for
3894
3929
  # running a job. Necessitating calling `sky launch`.
3895
3930
  backend_utils.check_stale_runtime_on_remote(returncode, stderr,
@@ -3906,11 +3941,13 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3906
3941
  _dump_code_to_file(codegen)
3907
3942
  job_submit_cmd = f'{mkdir_code} && {code}'
3908
3943
  job_submit_cmd = _maybe_add_managed_job_code(job_submit_cmd)
3944
+ # See comment above for why run_in_background=is_slurm.
3909
3945
  returncode, stdout, stderr = self.run_on_head(
3910
3946
  handle,
3911
3947
  job_submit_cmd,
3912
3948
  stream_logs=False,
3913
- require_outputs=True)
3949
+ require_outputs=True,
3950
+ run_in_background=is_slurm)
3914
3951
 
3915
3952
  subprocess_utils.handle_returncode(
3916
3953
  returncode,
@@ -4969,6 +5006,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
4969
5006
  ports_cleaned_up = True
4970
5007
  except exceptions.PortDoesNotExistError:
4971
5008
  logger.debug('Ports do not exist. Skipping cleanup.')
5009
+ ports_cleaned_up = True
4972
5010
  except Exception as e: # pylint: disable=broad-except
4973
5011
  if purge:
4974
5012
  msg = common_utils.format_exception(e, use_bracket=True)
@@ -5041,11 +5079,11 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
5041
5079
  config['provider'],
5042
5080
  non_terminated_only=False)
5043
5081
 
5044
- unexpected_node_state: Optional[Tuple[str, str]] = None
5082
+ unexpected_nodes = []
5045
5083
  for node_id, node_status_tuple in node_status_dict.items():
5046
5084
  node_status, reason = node_status_tuple
5047
- reason = '' if reason is None else f' ({reason})'
5048
- logger.debug(f'{node_id} status: {node_status}{reason}')
5085
+ reason_str = '' if reason is None else f' ({reason})'
5086
+ logger.debug(f'{node_id} status: {node_status}{reason_str}')
5049
5087
  # FIXME(cooperc): Some clouds (e.g. GCP) do not distinguish
5050
5088
  # between "stopping/stopped" and "terminating/terminated",
5051
5089
  # so we allow for either status instead of casing on
@@ -5053,19 +5091,22 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
5053
5091
  if node_status not in [
5054
5092
  None, status_lib.ClusterStatus.STOPPED
5055
5093
  ]:
5056
- unexpected_node_state = (node_id, node_status)
5057
- break
5094
+ unexpected_nodes.append((node_id, node_status, reason))
5058
5095
 
5059
- if unexpected_node_state is None:
5096
+ if not unexpected_nodes:
5060
5097
  break
5061
5098
 
5062
5099
  attempts += 1
5063
5100
  if attempts < _TEARDOWN_WAIT_MAX_ATTEMPTS:
5064
5101
  time.sleep(_TEARDOWN_WAIT_BETWEEN_ATTEMPS_SECONDS)
5065
5102
  else:
5066
- (node_id, node_status) = unexpected_node_state
5067
- raise RuntimeError(f'Instance {node_id} in unexpected '
5068
- f'state {node_status}.')
5103
+ unexpected_nodes_str = '\n'.join([
5104
+ f' - {node_id}: {node_status}' +
5105
+ (f' ({reason})' if reason else '')
5106
+ for node_id, node_status, reason in unexpected_nodes
5107
+ ])
5108
+ raise RuntimeError(f'Instances in unexpected state:\n'
5109
+ f'{unexpected_nodes_str}')
5069
5110
 
5070
5111
  # If cluster_yaml is None, the cluster should ensured to be terminated,
5071
5112
  # so we don't need to do the double check.
@@ -5352,6 +5393,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
5352
5393
  assert handle is not None
5353
5394
  # Cluster already exists.
5354
5395
  self.check_resources_fit_cluster(handle, task)
5396
+
5355
5397
  # Use the existing cluster.
5356
5398
  assert handle.launched_resources is not None, (cluster_name, handle)
5357
5399
  # Take a random resource in order to get resource info that applies
@@ -5403,27 +5445,31 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
5403
5445
  for resource in task.resources:
5404
5446
  assert (resource.cluster_config_overrides ==
5405
5447
  one_task_resource.cluster_config_overrides)
5406
- if isinstance(to_provision.cloud, clouds.Kubernetes):
5448
+
5449
+ cluster_yaml_str = global_user_state.get_cluster_yaml_str(
5450
+ cluster_name)
5451
+ cluster_yaml_obj = (yaml_utils.safe_load(cluster_yaml_str)
5452
+ if cluster_yaml_str is not None else None)
5453
+
5454
+ def _get_pod_config(yaml_obj: Dict[str, Any]) -> Dict[str, Any]:
5455
+ return (yaml_obj.get('available_node_types',
5456
+ {}).get('ray_head_default',
5457
+ {}).get('node_config', {}))
5458
+
5459
+ if isinstance(to_provision.cloud,
5460
+ clouds.Kubernetes) and cluster_yaml_obj is not None:
5407
5461
  # Warn users if the Kubernetes pod config is different
5408
5462
  # from the existing cluster.
5409
- cluster_yaml_str = global_user_state.get_cluster_yaml_str(
5410
- cluster_name)
5411
- actual_cluster_yaml_obj = yaml_utils.safe_load(cluster_yaml_str)
5412
5463
  desired_cluster_yaml_obj = (
5413
5464
  kubernetes_utils.combine_pod_config_fields_and_metadata(
5414
- actual_cluster_yaml_obj,
5465
+ cluster_yaml_obj,
5415
5466
  cluster_config_overrides=one_task_resource.
5416
5467
  cluster_config_overrides,
5417
5468
  cloud=to_provision.cloud,
5418
5469
  context=to_provision.region))
5419
5470
 
5420
- def _get_pod_config(yaml_obj: Dict[str, Any]) -> Dict[str, Any]:
5421
- return (yaml_obj.get('available_node_types',
5422
- {}).get('ray_head_default',
5423
- {}).get('node_config', {}))
5424
-
5425
5471
  if _get_pod_config(desired_cluster_yaml_obj) != _get_pod_config(
5426
- actual_cluster_yaml_obj):
5472
+ cluster_yaml_obj):
5427
5473
  # pylint: disable=line-too-long
5428
5474
  logger.warning(
5429
5475
  f'{colorama.Fore.YELLOW}WARNING: Kubernetes pod config mismatch detected. Task requires different '
@@ -5434,6 +5480,101 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
5434
5480
  f' • Or restart this cluster: sky down {cluster_name}; sky launch -c {cluster_name} ...'
5435
5481
  f'{colorama.Style.RESET_ALL}')
5436
5482
 
5483
+ # Check for volume mount warnings
5484
+ if task.volume_mounts:
5485
+ # Get existing cluster's volume mounts from cluster yaml
5486
+ existing_volume_names = set()
5487
+ try:
5488
+ if cluster_yaml_obj is not None:
5489
+ # Extract volume names from existing cluster
5490
+ node_config = _get_pod_config(cluster_yaml_obj)
5491
+
5492
+ if isinstance(to_provision.cloud, clouds.Kubernetes):
5493
+ # Check for K8s-style persistent volumes
5494
+ # (spec.volumes)
5495
+ # See sky/templates/kubernetes-ray.yml.j2.
5496
+ volumes = node_config.get('spec',
5497
+ {}).get('volumes', [])
5498
+ for vol in volumes:
5499
+ # Volume from PVC has structure:
5500
+ # - name: <volume_name>
5501
+ # persistentVolumeClaim:
5502
+ # claimName: <volume_name_on_cloud>
5503
+ if 'persistentVolumeClaim' in vol:
5504
+ pvc = vol.get('persistentVolumeClaim', {})
5505
+ # Use claimName (volume_name_on_cloud) to
5506
+ # be consistent with RunPod.
5507
+ vol_name_on_cloud = pvc.get('claimName')
5508
+ if vol_name_on_cloud:
5509
+ existing_volume_names.add(
5510
+ vol_name_on_cloud)
5511
+
5512
+ # Check for K8s ephemeral volumes
5513
+ # See sky/templates/kubernetes-ray.yml.j2.
5514
+ provider_config = cluster_yaml_obj.get(
5515
+ 'provider', {})
5516
+ ephemeral_specs = provider_config.get(
5517
+ 'ephemeral_volume_specs', [])
5518
+ for spec in ephemeral_specs:
5519
+ # For ephemeral volumes, we check the mount
5520
+ # path.
5521
+ mount_path = spec.get('path')
5522
+ if mount_path:
5523
+ existing_volume_names.add(mount_path)
5524
+
5525
+ elif isinstance(to_provision.cloud, clouds.RunPod):
5526
+ # Check for custom VolumeMounts config
5527
+ # (e.g. RunPod)
5528
+ # See sky/templates/runpod-ray.yml.j2.
5529
+ volume_mounts_config = node_config.get(
5530
+ 'VolumeMounts', [])
5531
+ for vol_mount in volume_mounts_config:
5532
+ vol_name = vol_mount.get('VolumeNameOnCloud')
5533
+ if vol_name:
5534
+ existing_volume_names.add(vol_name)
5535
+ except Exception as e: # pylint: disable=broad-except
5536
+ # If we can't get the existing volume mounts, log debug
5537
+ # and skip the warning check
5538
+ logger.debug(f'Failed to check existing volume mounts: {e}',
5539
+ exc_info=True)
5540
+
5541
+ # Check if task has new volumes not in existing cluster
5542
+ new_ephemeral_volumes = []
5543
+ new_persistent_volumes = []
5544
+ for volume_mount in task.volume_mounts:
5545
+ # Compare using volume_name for user-facing name
5546
+ if volume_mount.is_ephemeral:
5547
+ if volume_mount.path not in existing_volume_names:
5548
+ new_ephemeral_volumes.append(volume_mount.path)
5549
+ elif (volume_mount.volume_name not in existing_volume_names
5550
+ and volume_mount.volume_config.name_on_cloud
5551
+ not in existing_volume_names):
5552
+ new_persistent_volumes.append(volume_mount.volume_name)
5553
+
5554
+ if new_ephemeral_volumes or new_persistent_volumes:
5555
+ msg_parts = []
5556
+ if new_ephemeral_volumes:
5557
+ msg_parts.append(f'new ephemeral volume(s) with path '
5558
+ f'{", ".join(new_ephemeral_volumes)}')
5559
+ if new_persistent_volumes:
5560
+ msg_parts.append(
5561
+ f'new volume(s) {", ".join(new_persistent_volumes)}'
5562
+ )
5563
+
5564
+ volume_msg = ' and '.join(msg_parts)
5565
+ # Capitalize the first letter of the message
5566
+ volume_msg = volume_msg[0].upper() + volume_msg[1:]
5567
+
5568
+ logger.warning(
5569
+ f'{colorama.Fore.YELLOW}WARNING: {volume_msg} '
5570
+ f'specified in task but not '
5571
+ f'mounted to existing cluster "{cluster_name}". '
5572
+ f'These volumes will not be mounted to the cluster. '
5573
+ f'To mount new volumes, either:\n'
5574
+ f' • Use a new cluster, or\n'
5575
+ f' • Terminate and recreate this cluster'
5576
+ f'{colorama.Style.RESET_ALL}')
5577
+
5437
5578
  return RetryingVmProvisioner.ToProvisionConfig(
5438
5579
  cluster_name,
5439
5580
  to_provision,
@@ -5850,6 +5991,22 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
5850
5991
  return task.envs[constants.USER_ID_ENV_VAR]
5851
5992
  return None
5852
5993
 
5994
+ def _get_task_codegen_class(
5995
+ self, handle: CloudVmRayResourceHandle) -> task_codegen.TaskCodeGen:
5996
+ """Returns the appropriate TaskCodeGen for the given handle."""
5997
+ if isinstance(handle.launched_resources.cloud, clouds.Slurm):
5998
+ assert (handle.cached_cluster_info
5999
+ is not None), ('cached_cluster_info must be set')
6000
+ head_instance = handle.cached_cluster_info.get_head_instance()
6001
+ assert (head_instance is not None), (
6002
+ 'Head instance not found in cached cluster info')
6003
+ slurm_job_id = head_instance.tags.get('job_id')
6004
+ assert (slurm_job_id
6005
+ is not None), ('job_id tag not found in head instance')
6006
+ return task_codegen.SlurmCodeGen(slurm_job_id=slurm_job_id)
6007
+ else:
6008
+ return task_codegen.RayCodeGen()
6009
+
5853
6010
  def _execute_task_one_node(self, handle: CloudVmRayResourceHandle,
5854
6011
  task: task_lib.Task, job_id: int,
5855
6012
  remote_log_dir: str) -> None:
@@ -5862,15 +6019,16 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
5862
6019
 
5863
6020
  task_env_vars = self._get_task_env_vars(task, job_id, handle)
5864
6021
 
5865
- codegen = task_codegen.RayCodeGen()
6022
+ codegen = self._get_task_codegen_class(handle)
6023
+
5866
6024
  codegen.add_prologue(job_id)
5867
6025
  codegen.add_setup(
5868
6026
  1,
5869
6027
  resources_dict,
5870
6028
  stable_cluster_internal_ips=internal_ips,
5871
6029
  env_vars=task_env_vars,
6030
+ log_dir=log_dir,
5872
6031
  setup_cmd=self._setup_cmd,
5873
- setup_log_path=os.path.join(log_dir, 'setup.log'),
5874
6032
  )
5875
6033
 
5876
6034
  codegen.add_task(
@@ -5907,15 +6065,16 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
5907
6065
  num_actual_nodes = task.num_nodes * handle.num_ips_per_node
5908
6066
  task_env_vars = self._get_task_env_vars(task, job_id, handle)
5909
6067
 
5910
- codegen = task_codegen.RayCodeGen()
6068
+ codegen = self._get_task_codegen_class(handle)
6069
+
5911
6070
  codegen.add_prologue(job_id)
5912
6071
  codegen.add_setup(
5913
6072
  num_actual_nodes,
5914
6073
  resources_dict,
5915
6074
  stable_cluster_internal_ips=internal_ips,
5916
6075
  env_vars=task_env_vars,
6076
+ log_dir=log_dir,
5917
6077
  setup_cmd=self._setup_cmd,
5918
- setup_log_path=os.path.join(log_dir, 'setup.log'),
5919
6078
  )
5920
6079
 
5921
6080
  codegen.add_task(