skypilot-nightly 1.0.0.dev20251203__py3-none-any.whl → 1.0.0.dev20260112__py3-none-any.whl

This diff compares two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
Files changed (245)
  1. sky/__init__.py +6 -2
  2. sky/adaptors/aws.py +1 -61
  3. sky/adaptors/slurm.py +565 -0
  4. sky/backends/backend_utils.py +95 -12
  5. sky/backends/cloud_vm_ray_backend.py +224 -65
  6. sky/backends/task_codegen.py +380 -4
  7. sky/catalog/__init__.py +0 -3
  8. sky/catalog/data_fetchers/fetch_gcp.py +9 -1
  9. sky/catalog/data_fetchers/fetch_nebius.py +1 -1
  10. sky/catalog/data_fetchers/fetch_vast.py +4 -2
  11. sky/catalog/kubernetes_catalog.py +12 -4
  12. sky/catalog/seeweb_catalog.py +30 -15
  13. sky/catalog/shadeform_catalog.py +5 -2
  14. sky/catalog/slurm_catalog.py +236 -0
  15. sky/catalog/vast_catalog.py +30 -6
  16. sky/check.py +25 -11
  17. sky/client/cli/command.py +391 -32
  18. sky/client/interactive_utils.py +190 -0
  19. sky/client/sdk.py +64 -2
  20. sky/client/sdk_async.py +9 -0
  21. sky/clouds/__init__.py +2 -0
  22. sky/clouds/aws.py +60 -2
  23. sky/clouds/azure.py +2 -0
  24. sky/clouds/cloud.py +7 -0
  25. sky/clouds/kubernetes.py +2 -0
  26. sky/clouds/runpod.py +38 -7
  27. sky/clouds/slurm.py +610 -0
  28. sky/clouds/ssh.py +3 -2
  29. sky/clouds/vast.py +39 -16
  30. sky/core.py +197 -37
  31. sky/dashboard/out/404.html +1 -1
  32. sky/dashboard/out/_next/static/3nu-b8raeKRNABZ2d4GAG/_buildManifest.js +1 -0
  33. sky/dashboard/out/_next/static/chunks/1871-0565f8975a7dcd10.js +6 -0
  34. sky/dashboard/out/_next/static/chunks/2109-55a1546d793574a7.js +11 -0
  35. sky/dashboard/out/_next/static/chunks/2521-099b07cd9e4745bf.js +26 -0
  36. sky/dashboard/out/_next/static/chunks/2755.a636e04a928a700e.js +31 -0
  37. sky/dashboard/out/_next/static/chunks/3495.05eab4862217c1a5.js +6 -0
  38. sky/dashboard/out/_next/static/chunks/3785.cfc5dcc9434fd98c.js +1 -0
  39. sky/dashboard/out/_next/static/chunks/3850-fd5696f3bbbaddae.js +1 -0
  40. sky/dashboard/out/_next/static/chunks/3981.645d01bf9c8cad0c.js +21 -0
  41. sky/dashboard/out/_next/static/chunks/4083-0115d67c1fb57d6c.js +21 -0
  42. sky/dashboard/out/_next/static/chunks/{8640.5b9475a2d18c5416.js → 429.a58e9ba9742309ed.js} +2 -2
  43. sky/dashboard/out/_next/static/chunks/4555.8e221537181b5dc1.js +6 -0
  44. sky/dashboard/out/_next/static/chunks/4725.937865b81fdaaebb.js +6 -0
  45. sky/dashboard/out/_next/static/chunks/6082-edabd8f6092300ce.js +25 -0
  46. sky/dashboard/out/_next/static/chunks/6989-49cb7dca83a7a62d.js +1 -0
  47. sky/dashboard/out/_next/static/chunks/6990-630bd2a2257275f8.js +1 -0
  48. sky/dashboard/out/_next/static/chunks/7248-a99800d4db8edabd.js +1 -0
  49. sky/dashboard/out/_next/static/chunks/754-cfc5d4ad1b843d29.js +18 -0
  50. sky/dashboard/out/_next/static/chunks/8050-dd8aa107b17dce00.js +16 -0
  51. sky/dashboard/out/_next/static/chunks/8056-d4ae1e0cb81e7368.js +1 -0
  52. sky/dashboard/out/_next/static/chunks/8555.011023e296c127b3.js +6 -0
  53. sky/dashboard/out/_next/static/chunks/8821-93c25df904a8362b.js +1 -0
  54. sky/dashboard/out/_next/static/chunks/8969-0662594b69432ade.js +1 -0
  55. sky/dashboard/out/_next/static/chunks/9025.f15c91c97d124a5f.js +6 -0
  56. sky/dashboard/out/_next/static/chunks/9353-7ad6bd01858556f1.js +1 -0
  57. sky/dashboard/out/_next/static/chunks/pages/_app-5a86569acad99764.js +34 -0
  58. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-8297476714acb4ac.js +6 -0
  59. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-337c3ba1085f1210.js +1 -0
  60. sky/dashboard/out/_next/static/chunks/pages/{clusters-ee39056f9851a3ff.js → clusters-57632ff3684a8b5c.js} +1 -1
  61. sky/dashboard/out/_next/static/chunks/pages/{config-dfb9bf07b13045f4.js → config-718cdc365de82689.js} +1 -1
  62. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-5fd3a453c079c2ea.js +1 -0
  63. sky/dashboard/out/_next/static/chunks/pages/infra-9f85c02c9c6cae9e.js +1 -0
  64. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-90f16972cbecf354.js +1 -0
  65. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-2dd42fc37aad427a.js +16 -0
  66. sky/dashboard/out/_next/static/chunks/pages/jobs-ed806aeace26b972.js +1 -0
  67. sky/dashboard/out/_next/static/chunks/pages/plugins/[...slug]-449a9f5a3bb20fb3.js +1 -0
  68. sky/dashboard/out/_next/static/chunks/pages/users-bec34706b36f3524.js +1 -0
  69. sky/dashboard/out/_next/static/chunks/pages/{volumes-b84b948ff357c43e.js → volumes-a83ba9b38dff7ea9.js} +1 -1
  70. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-84a40f8c7c627fe4.js → [name]-c781e9c3e52ef9fc.js} +1 -1
  71. sky/dashboard/out/_next/static/chunks/pages/workspaces-91e0942f47310aae.js +1 -0
  72. sky/dashboard/out/_next/static/chunks/webpack-cfe59cf684ee13b9.js +1 -0
  73. sky/dashboard/out/_next/static/css/b0dbca28f027cc19.css +3 -0
  74. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  75. sky/dashboard/out/clusters/[cluster].html +1 -1
  76. sky/dashboard/out/clusters.html +1 -1
  77. sky/dashboard/out/config.html +1 -1
  78. sky/dashboard/out/index.html +1 -1
  79. sky/dashboard/out/infra/[context].html +1 -1
  80. sky/dashboard/out/infra.html +1 -1
  81. sky/dashboard/out/jobs/[job].html +1 -1
  82. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  83. sky/dashboard/out/jobs.html +1 -1
  84. sky/dashboard/out/plugins/[...slug].html +1 -0
  85. sky/dashboard/out/users.html +1 -1
  86. sky/dashboard/out/volumes.html +1 -1
  87. sky/dashboard/out/workspace/new.html +1 -1
  88. sky/dashboard/out/workspaces/[name].html +1 -1
  89. sky/dashboard/out/workspaces.html +1 -1
  90. sky/data/data_utils.py +26 -12
  91. sky/data/mounting_utils.py +44 -5
  92. sky/global_user_state.py +111 -19
  93. sky/jobs/client/sdk.py +8 -3
  94. sky/jobs/controller.py +191 -31
  95. sky/jobs/recovery_strategy.py +109 -11
  96. sky/jobs/server/core.py +81 -4
  97. sky/jobs/server/server.py +14 -0
  98. sky/jobs/state.py +417 -19
  99. sky/jobs/utils.py +73 -80
  100. sky/models.py +11 -0
  101. sky/optimizer.py +8 -6
  102. sky/provision/__init__.py +12 -9
  103. sky/provision/common.py +20 -0
  104. sky/provision/docker_utils.py +15 -2
  105. sky/provision/kubernetes/utils.py +163 -20
  106. sky/provision/kubernetes/volume.py +52 -17
  107. sky/provision/provisioner.py +17 -7
  108. sky/provision/runpod/instance.py +3 -1
  109. sky/provision/runpod/utils.py +13 -1
  110. sky/provision/runpod/volume.py +25 -9
  111. sky/provision/slurm/__init__.py +12 -0
  112. sky/provision/slurm/config.py +13 -0
  113. sky/provision/slurm/instance.py +618 -0
  114. sky/provision/slurm/utils.py +689 -0
  115. sky/provision/vast/instance.py +4 -1
  116. sky/provision/vast/utils.py +11 -6
  117. sky/resources.py +135 -13
  118. sky/schemas/api/responses.py +4 -0
  119. sky/schemas/db/global_user_state/010_save_ssh_key.py +1 -1
  120. sky/schemas/db/spot_jobs/008_add_full_resources.py +34 -0
  121. sky/schemas/db/spot_jobs/009_job_events.py +32 -0
  122. sky/schemas/db/spot_jobs/010_job_events_timestamp_with_timezone.py +43 -0
  123. sky/schemas/db/spot_jobs/011_add_links.py +34 -0
  124. sky/schemas/generated/jobsv1_pb2.py +9 -5
  125. sky/schemas/generated/jobsv1_pb2.pyi +12 -0
  126. sky/schemas/generated/jobsv1_pb2_grpc.py +44 -0
  127. sky/schemas/generated/managed_jobsv1_pb2.py +32 -28
  128. sky/schemas/generated/managed_jobsv1_pb2.pyi +11 -2
  129. sky/serve/serve_utils.py +232 -40
  130. sky/serve/server/impl.py +1 -1
  131. sky/server/common.py +17 -0
  132. sky/server/constants.py +1 -1
  133. sky/server/metrics.py +6 -3
  134. sky/server/plugins.py +238 -0
  135. sky/server/requests/executor.py +5 -2
  136. sky/server/requests/payloads.py +30 -1
  137. sky/server/requests/request_names.py +4 -0
  138. sky/server/requests/requests.py +33 -11
  139. sky/server/requests/serializers/encoders.py +22 -0
  140. sky/server/requests/serializers/return_value_serializers.py +70 -0
  141. sky/server/server.py +506 -109
  142. sky/server/server_utils.py +30 -0
  143. sky/server/uvicorn.py +5 -0
  144. sky/setup_files/MANIFEST.in +1 -0
  145. sky/setup_files/dependencies.py +22 -9
  146. sky/sky_logging.py +2 -1
  147. sky/skylet/attempt_skylet.py +13 -3
  148. sky/skylet/constants.py +55 -13
  149. sky/skylet/events.py +10 -4
  150. sky/skylet/executor/__init__.py +1 -0
  151. sky/skylet/executor/slurm.py +187 -0
  152. sky/skylet/job_lib.py +91 -5
  153. sky/skylet/log_lib.py +22 -6
  154. sky/skylet/log_lib.pyi +8 -6
  155. sky/skylet/services.py +18 -3
  156. sky/skylet/skylet.py +5 -1
  157. sky/skylet/subprocess_daemon.py +2 -1
  158. sky/ssh_node_pools/constants.py +12 -0
  159. sky/ssh_node_pools/core.py +40 -3
  160. sky/ssh_node_pools/deploy/__init__.py +4 -0
  161. sky/{utils/kubernetes/deploy_ssh_node_pools.py → ssh_node_pools/deploy/deploy.py} +279 -504
  162. sky/ssh_node_pools/deploy/tunnel/ssh-tunnel.sh +379 -0
  163. sky/ssh_node_pools/deploy/tunnel_utils.py +199 -0
  164. sky/ssh_node_pools/deploy/utils.py +173 -0
  165. sky/ssh_node_pools/server.py +11 -13
  166. sky/{utils/kubernetes/ssh_utils.py → ssh_node_pools/utils.py} +9 -6
  167. sky/templates/kubernetes-ray.yml.j2 +12 -6
  168. sky/templates/slurm-ray.yml.j2 +115 -0
  169. sky/templates/vast-ray.yml.j2 +1 -0
  170. sky/templates/websocket_proxy.py +18 -41
  171. sky/users/model.conf +1 -1
  172. sky/users/permission.py +85 -52
  173. sky/users/rbac.py +31 -3
  174. sky/utils/annotations.py +108 -8
  175. sky/utils/auth_utils.py +42 -0
  176. sky/utils/cli_utils/status_utils.py +19 -5
  177. sky/utils/cluster_utils.py +10 -3
  178. sky/utils/command_runner.py +389 -35
  179. sky/utils/command_runner.pyi +43 -4
  180. sky/utils/common_utils.py +47 -31
  181. sky/utils/context.py +32 -0
  182. sky/utils/db/db_utils.py +36 -6
  183. sky/utils/db/migration_utils.py +41 -21
  184. sky/utils/infra_utils.py +5 -1
  185. sky/utils/instance_links.py +139 -0
  186. sky/utils/interactive_utils.py +49 -0
  187. sky/utils/kubernetes/generate_kubeconfig.sh +42 -33
  188. sky/utils/kubernetes/kubernetes_deploy_utils.py +2 -94
  189. sky/utils/kubernetes/rsync_helper.sh +5 -1
  190. sky/utils/kubernetes/ssh-tunnel.sh +7 -376
  191. sky/utils/plugin_extensions/__init__.py +14 -0
  192. sky/utils/plugin_extensions/external_failure_source.py +176 -0
  193. sky/utils/resources_utils.py +10 -8
  194. sky/utils/rich_utils.py +9 -11
  195. sky/utils/schemas.py +93 -19
  196. sky/utils/status_lib.py +7 -0
  197. sky/utils/subprocess_utils.py +17 -0
  198. sky/volumes/client/sdk.py +6 -3
  199. sky/volumes/server/core.py +65 -27
  200. sky_templates/ray/start_cluster +8 -4
  201. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/METADATA +67 -59
  202. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/RECORD +208 -180
  203. sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +0 -1
  204. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +0 -11
  205. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +0 -6
  206. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +0 -1
  207. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +0 -1
  208. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +0 -15
  209. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +0 -26
  210. sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +0 -1
  211. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +0 -1
  212. sky/dashboard/out/_next/static/chunks/3800-7b45f9fbb6308557.js +0 -1
  213. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +0 -1
  214. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +0 -1
  215. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +0 -15
  216. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +0 -13
  217. sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +0 -1
  218. sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +0 -1
  219. sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +0 -1
  220. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +0 -30
  221. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +0 -41
  222. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +0 -1
  223. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +0 -1
  224. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +0 -6
  225. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +0 -1
  226. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +0 -31
  227. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +0 -30
  228. sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +0 -34
  229. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +0 -16
  230. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +0 -1
  231. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +0 -1
  232. sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +0 -1
  233. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +0 -16
  234. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +0 -21
  235. sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +0 -1
  236. sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +0 -1
  237. sky/dashboard/out/_next/static/chunks/pages/workspaces-531b2f8c4bf89f82.js +0 -1
  238. sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +0 -1
  239. sky/dashboard/out/_next/static/css/0748ce22df867032.css +0 -3
  240. /sky/dashboard/out/_next/static/{96_E2yl3QAiIJGOYCkSpB → 3nu-b8raeKRNABZ2d4GAG}/_ssgManifest.js +0 -0
  241. /sky/{utils/kubernetes → ssh_node_pools/deploy/tunnel}/cleanup-tunnel.sh +0 -0
  242. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/WHEEL +0 -0
  243. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/entry_points.txt +0 -0
  244. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/licenses/LICENSE +0 -0
  245. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/top_level.txt +0 -0
@@ -69,6 +69,7 @@ from sky.utils import timeline
 from sky.utils import ux_utils
 from sky.utils import volume as volume_utils
 from sky.utils import yaml_utils
+from sky.utils.plugin_extensions import ExternalFailureSource
 from sky.workspaces import core as workspaces_core
 
 if typing.TYPE_CHECKING:
@@ -147,6 +148,19 @@ CLUSTER_TUNNEL_LOCK_TIMEOUT_SECONDS = 10.0
 # Remote dir that holds our runtime files.
 _REMOTE_RUNTIME_FILES_DIR = '~/.sky/.runtime_files'
 
+# The maximum size of a command line arguments is 128 KB, i.e. the command
+# executed with /bin/sh should be less than 128KB.
+# https://github.com/torvalds/linux/blob/master/include/uapi/linux/binfmts.h
+#
+# If a user have very long run or setup commands, the generated command may
+# exceed the limit, as we directly include scripts in job submission commands.
+# If the command is too long, we instead write it to a file, rsync and execute
+# it.
+#
+# We use 100KB as a threshold to be safe for other arguments that
+# might be added during ssh.
+_MAX_INLINE_SCRIPT_LENGTH = 100 * 1024
+
 _ENDPOINTS_RETRY_MESSAGE = ('If the cluster was recently started, '
                             'please retry after a while.')
 
@@ -225,6 +239,18 @@ _ACK_MESSAGE = 'ack'
 _FORWARDING_FROM_MESSAGE = 'Forwarding from'
 
 
+def is_command_length_over_limit(command: str) -> bool:
+    """Check if the length of the command exceeds the limit.
+
+    We calculate the length of the command after quoting the command twice as
+    when it is executed by the CommandRunner, the command will be quoted twice
+    to ensure the correctness, which will add significant length to the command.
+    """
+
+    quoted_length = len(shlex.quote(shlex.quote(command)))
+    return quoted_length > _MAX_INLINE_SCRIPT_LENGTH
+
+
 def is_ip(s: str) -> bool:
     """Returns whether this string matches IP_ADDR_REGEX."""
     return len(re.findall(IP_ADDR_REGEX, s)) == 1
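
To make the threshold above concrete, here is a minimal standalone sketch of the same double-quoting check (not part of this package diff); only _MAX_INLINE_SCRIPT_LENGTH and the double shlex.quote scheme come from the hunk above, and the sample command is hypothetical:

    import shlex

    _MAX_INLINE_SCRIPT_LENGTH = 100 * 1024  # 100 KB threshold, as in the diff above


    def is_command_length_over_limit(command: str) -> bool:
        # Quote twice, mirroring the docstring above: the CommandRunner quotes
        # the command twice, which adds escaping overhead on top of the raw
        # script length.
        return len(shlex.quote(shlex.quote(command))) > _MAX_INLINE_SCRIPT_LENGTH


    # Hypothetical generated run script that exceeds the threshold once quoted.
    long_script = 'echo "step"; ' * 8000
    print(len(long_script))                           # 104000 characters raw
    print(is_command_length_over_limit(long_script))  # True

When the check returns True, the surrounding code (per the comment in the previous hunk) falls back to writing the script to a file, rsyncing it, and executing it, instead of inlining it in the submission command.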
@@ -738,7 +764,20 @@ def write_cluster_config(
                 keys=('allowed_contexts',),
                 default_value=None)
             if allowed_contexts is None:
-                excluded_clouds.add(cloud)
+                # Exclude both Kubernetes and SSH explicitly since:
+                # 1. isinstance(cloud, clouds.Kubernetes) matches both (SSH
+                #    inherits from Kubernetes)
+                # 2. Both share the same get_credential_file_mounts() which
+                #    returns the kubeconfig. So if we don't exclude both, the
+                #    unexcluded one will upload the kubeconfig.
+                # TODO(romilb): This is a workaround. The right long-term fix
+                # is to have SSH Node Pools use its own kubeconfig instead of
+                # sharing the global kubeconfig at ~/.kube/config. In the
+                # interim, SSH Node Pools' get_credential_file_mounts can filter
+                # contexts starting with ssh- and create a temp kubeconfig
+                # to upload.
+                excluded_clouds.add(clouds.Kubernetes())
+                excluded_clouds.add(clouds.SSH())
             else:
                 excluded_clouds.add(cloud)
 
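The first point in the comment above hinges on SSH being a subclass of Kubernetes. A tiny standalone illustration with stand-in classes (not SkyPilot's real clouds module; the actual implementations are in sky/clouds/kubernetes.py and sky/clouds/ssh.py from the file list above):

    # Stand-in classes for illustration only.
    class Kubernetes:
        def get_credential_file_mounts(self):
            # Both clouds would upload the same shared kubeconfig.
            return {'~/.kube/config': '~/.kube/config'}


    class SSH(Kubernetes):
        pass


    cloud = SSH()
    # isinstance against the parent matches the subclass too, so excluding only
    # `cloud` would leave the other class free to upload the shared kubeconfig.
    print(isinstance(cloud, Kubernetes))  # True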
@@ -946,6 +985,9 @@ def write_cluster_config(
                     '{conda_auto_activate}',
                     conda_auto_activate).replace('{is_custom_docker}',
                                                  is_custom_docker),
+                # Currently only used by Slurm. For other clouds, it is
+                # already part of ray_skypilot_installation_commands
+                'setup_sky_dirs_commands': constants.SETUP_SKY_DIRS_COMMANDS,
                 'ray_skypilot_installation_commands':
                     (constants.RAY_SKYPILOT_INSTALLATION_COMMANDS.replace(
                         '{sky_wheel_hash}',
@@ -1058,7 +1100,11 @@ def write_cluster_config(
         with open(tmp_yaml_path, 'w', encoding='utf-8') as f:
             f.write(restored_yaml_content)
 
-    config_dict['cluster_name_on_cloud'] = cluster_name_on_cloud
+    # Read the cluster_name_on_cloud from the restored yaml. This is a hack to
+    # make sure that launching on the same cluster across multiple users works
+    # correctly. See #8232.
+    yaml_config = yaml_utils.read_yaml(tmp_yaml_path)
+    config_dict['cluster_name_on_cloud'] = yaml_config['cluster_name']
 
     # Make sure to do this before we optimize file mounts. Optimization is
     # non-deterministic, but everything else before this point should be
@@ -1105,17 +1151,21 @@ def _add_auth_to_cluster_config(cloud: clouds.Cloud, tmp_yaml_path: str):
     """
     config = yaml_utils.read_yaml(tmp_yaml_path)
     # Check the availability of the cloud type.
-    if isinstance(cloud, (
+    if isinstance(
+            cloud,
+            (
             clouds.AWS,
             clouds.OCI,
             clouds.SCP,
+            # TODO(jwj): Handle Slurm-specific auth logic
+            clouds.Slurm,
             clouds.Vsphere,
             clouds.Cudo,
             clouds.Paperspace,
             clouds.Azure,
             clouds.DO,
             clouds.Nebius,
-    )):
+            )):
         config = auth.configure_ssh_info(config)
     elif isinstance(cloud, clouds.GCP):
         config = auth.setup_gcp_authentication(config)
@@ -2226,6 +2276,12 @@ def _update_cluster_status(
                         for status in node_statuses) and
                     len(node_statuses) == handle.launched_nodes)
 
+    external_cluster_failures = ExternalFailureSource.get(
+        cluster_hash=record['cluster_hash'])
+    logger.debug(f'Cluster {cluster_name} with cluster_hash '
+                 f'{record["cluster_hash"]} has external cluster failures: '
+                 f'{external_cluster_failures}')
+
     def get_node_counts_from_ray_status(
             runner: command_runner.CommandRunner) -> Tuple[int, int, str, str]:
         rc, output, stderr = runner.run(
@@ -2361,7 +2417,13 @@ def _update_cluster_status(
     # remain healthy for a while before the cloud completely preempts the VMs.
     # We have mitigated this by again first querying the VM state from the cloud
     # provider.
-    if all_nodes_up and run_ray_status_to_check_ray_cluster_healthy():
+    cloud = handle.launched_resources.cloud
+
+    # For Slurm, skip Ray health check since it doesn't use Ray.
+    should_check_ray = cloud is not None and cloud.uses_ray()
+    if (all_nodes_up and (not should_check_ray or
+                          run_ray_status_to_check_ray_cluster_healthy()) and
+            not external_cluster_failures):
         # NOTE: all_nodes_up calculation is fast due to calling cloud CLI;
         # run_ray_status_to_check_all_nodes_up() is slow due to calling `ray get
         # head-ip/worker-ips`.
@@ -2464,15 +2526,15 @@ def _update_cluster_status(
     # (2) Otherwise, we will reset the autostop setting, unless the cluster is
     # autostopping/autodowning.
     some_nodes_terminated = 0 < len(node_statuses) < handle.launched_nodes
-    # If all nodes are up and ray cluster is health, we would have returned
-    # earlier. So if all_nodes_up is True and we are here, it means the ray
-    # cluster must have been unhealthy.
-    ray_cluster_unhealthy = all_nodes_up
     some_nodes_not_stopped = any(status[0] != status_lib.ClusterStatus.STOPPED
                                  for status in node_statuses)
     is_abnormal = (some_nodes_terminated or some_nodes_not_stopped)
 
-    if is_abnormal:
+    if is_abnormal and not external_cluster_failures:
+        # If all nodes are up and ray cluster is healthy, we would have returned
+        # earlier. So if all_nodes_up is True and we are here, it means the ray
+        # cluster must have been unhealthy.
+        ray_cluster_unhealthy = all_nodes_up
         status_reason = ', '.join(
             [status[1] for status in node_statuses if status[1] is not None])
 
@@ -2600,8 +2662,25 @@ def _update_cluster_status(
             cluster_name,
             include_user_info=include_user_info,
             summary_response=summary_response)
-    # Now is_abnormal is False: either node_statuses is empty or all nodes are
-    # STOPPED.
+    # Now either:
+    # (1) is_abnormal is False: either node_statuses is empty or all nodes are
+    #     STOPPED
+    # or
+    # (2) there are external cluster failures reported by a plugin.
+
+    # If there are external cluster failures and the cluster has not been
+    # terminated on cloud (to_terminate), we can return the cluster record as is.
+    # This is because when an external failure is detected, the cluster will be
+    # marked as INIT with a reason indicating the details of the failure. So, we
+    # do not want to modify the cluster status in this function except for in the
+    # case where the cluster has been terminated on cloud, in which case we should
+    # clean up the cluster from SkyPilot's global state.
+    if external_cluster_failures and not to_terminate:
+        return global_user_state.get_cluster_from_name(
+            cluster_name,
+            include_user_info=include_user_info,
+            summary_response=summary_response)
+
     verb = 'terminated' if to_terminate else 'stopped'
     backend = backends.CloudVmRayBackend()
     global_user_state.add_cluster_event(
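
A simplified, self-contained sketch of the decision the comment above describes; all names here are illustrative stand-ins, not the actual _update_cluster_status implementation:

    # Toy model: a plugin-reported failure leaves the stored record untouched
    # unless the cluster was actually terminated on the cloud.
    def resolve_record(stored_record, external_failures, to_terminate):
        if external_failures and not to_terminate:
            # The plugin already marked the cluster INIT with a failure reason,
            # so return the stored record as-is.
            return stored_record
        if to_terminate:
            # Cluster is gone on the cloud: drop it from global state.
            return None
        # Normal handling of a stopped/healthy cluster would go here (simplified).
        return stored_record


    record = {'name': 'my-cluster', 'status': 'INIT',
              'status_reason': 'external failure reported by plugin'}
    print(resolve_record(record, external_failures=['gpu fault'], to_terminate=False))
    print(resolve_record(record, external_failures=['gpu fault'], to_terminate=True))  # None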
@@ -3327,6 +3406,8 @@ def get_clusters(
         handle = record['handle']
         record['nodes'] = handle.launched_nodes
         if handle.launched_resources is None:
+            # Set default values when launched_resources is None
+            record['labels'] = {}
             continue
         record['cloud'] = (f'{handle.launched_resources.cloud}'
                            if handle.launched_resources.cloud else None)
@@ -3339,6 +3420,8 @@ def get_clusters(
             record['accelerators'] = (
                 f'{handle.launched_resources.accelerators}'
                 if handle.launched_resources.accelerators else None)
+            record['labels'] = (handle.launched_resources.labels
+                                if handle.launched_resources.labels else {})
         if not include_handle:
             record.pop('handle', None)