skypilot-nightly 1.0.0.dev20251203__py3-none-any.whl → 1.0.0.dev20260112__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (245)
  1. sky/__init__.py +6 -2
  2. sky/adaptors/aws.py +1 -61
  3. sky/adaptors/slurm.py +565 -0
  4. sky/backends/backend_utils.py +95 -12
  5. sky/backends/cloud_vm_ray_backend.py +224 -65
  6. sky/backends/task_codegen.py +380 -4
  7. sky/catalog/__init__.py +0 -3
  8. sky/catalog/data_fetchers/fetch_gcp.py +9 -1
  9. sky/catalog/data_fetchers/fetch_nebius.py +1 -1
  10. sky/catalog/data_fetchers/fetch_vast.py +4 -2
  11. sky/catalog/kubernetes_catalog.py +12 -4
  12. sky/catalog/seeweb_catalog.py +30 -15
  13. sky/catalog/shadeform_catalog.py +5 -2
  14. sky/catalog/slurm_catalog.py +236 -0
  15. sky/catalog/vast_catalog.py +30 -6
  16. sky/check.py +25 -11
  17. sky/client/cli/command.py +391 -32
  18. sky/client/interactive_utils.py +190 -0
  19. sky/client/sdk.py +64 -2
  20. sky/client/sdk_async.py +9 -0
  21. sky/clouds/__init__.py +2 -0
  22. sky/clouds/aws.py +60 -2
  23. sky/clouds/azure.py +2 -0
  24. sky/clouds/cloud.py +7 -0
  25. sky/clouds/kubernetes.py +2 -0
  26. sky/clouds/runpod.py +38 -7
  27. sky/clouds/slurm.py +610 -0
  28. sky/clouds/ssh.py +3 -2
  29. sky/clouds/vast.py +39 -16
  30. sky/core.py +197 -37
  31. sky/dashboard/out/404.html +1 -1
  32. sky/dashboard/out/_next/static/3nu-b8raeKRNABZ2d4GAG/_buildManifest.js +1 -0
  33. sky/dashboard/out/_next/static/chunks/1871-0565f8975a7dcd10.js +6 -0
  34. sky/dashboard/out/_next/static/chunks/2109-55a1546d793574a7.js +11 -0
  35. sky/dashboard/out/_next/static/chunks/2521-099b07cd9e4745bf.js +26 -0
  36. sky/dashboard/out/_next/static/chunks/2755.a636e04a928a700e.js +31 -0
  37. sky/dashboard/out/_next/static/chunks/3495.05eab4862217c1a5.js +6 -0
  38. sky/dashboard/out/_next/static/chunks/3785.cfc5dcc9434fd98c.js +1 -0
  39. sky/dashboard/out/_next/static/chunks/3850-fd5696f3bbbaddae.js +1 -0
  40. sky/dashboard/out/_next/static/chunks/3981.645d01bf9c8cad0c.js +21 -0
  41. sky/dashboard/out/_next/static/chunks/4083-0115d67c1fb57d6c.js +21 -0
  42. sky/dashboard/out/_next/static/chunks/{8640.5b9475a2d18c5416.js → 429.a58e9ba9742309ed.js} +2 -2
  43. sky/dashboard/out/_next/static/chunks/4555.8e221537181b5dc1.js +6 -0
  44. sky/dashboard/out/_next/static/chunks/4725.937865b81fdaaebb.js +6 -0
  45. sky/dashboard/out/_next/static/chunks/6082-edabd8f6092300ce.js +25 -0
  46. sky/dashboard/out/_next/static/chunks/6989-49cb7dca83a7a62d.js +1 -0
  47. sky/dashboard/out/_next/static/chunks/6990-630bd2a2257275f8.js +1 -0
  48. sky/dashboard/out/_next/static/chunks/7248-a99800d4db8edabd.js +1 -0
  49. sky/dashboard/out/_next/static/chunks/754-cfc5d4ad1b843d29.js +18 -0
  50. sky/dashboard/out/_next/static/chunks/8050-dd8aa107b17dce00.js +16 -0
  51. sky/dashboard/out/_next/static/chunks/8056-d4ae1e0cb81e7368.js +1 -0
  52. sky/dashboard/out/_next/static/chunks/8555.011023e296c127b3.js +6 -0
  53. sky/dashboard/out/_next/static/chunks/8821-93c25df904a8362b.js +1 -0
  54. sky/dashboard/out/_next/static/chunks/8969-0662594b69432ade.js +1 -0
  55. sky/dashboard/out/_next/static/chunks/9025.f15c91c97d124a5f.js +6 -0
  56. sky/dashboard/out/_next/static/chunks/9353-7ad6bd01858556f1.js +1 -0
  57. sky/dashboard/out/_next/static/chunks/pages/_app-5a86569acad99764.js +34 -0
  58. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-8297476714acb4ac.js +6 -0
  59. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-337c3ba1085f1210.js +1 -0
  60. sky/dashboard/out/_next/static/chunks/pages/{clusters-ee39056f9851a3ff.js → clusters-57632ff3684a8b5c.js} +1 -1
  61. sky/dashboard/out/_next/static/chunks/pages/{config-dfb9bf07b13045f4.js → config-718cdc365de82689.js} +1 -1
  62. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-5fd3a453c079c2ea.js +1 -0
  63. sky/dashboard/out/_next/static/chunks/pages/infra-9f85c02c9c6cae9e.js +1 -0
  64. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-90f16972cbecf354.js +1 -0
  65. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-2dd42fc37aad427a.js +16 -0
  66. sky/dashboard/out/_next/static/chunks/pages/jobs-ed806aeace26b972.js +1 -0
  67. sky/dashboard/out/_next/static/chunks/pages/plugins/[...slug]-449a9f5a3bb20fb3.js +1 -0
  68. sky/dashboard/out/_next/static/chunks/pages/users-bec34706b36f3524.js +1 -0
  69. sky/dashboard/out/_next/static/chunks/pages/{volumes-b84b948ff357c43e.js → volumes-a83ba9b38dff7ea9.js} +1 -1
  70. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-84a40f8c7c627fe4.js → [name]-c781e9c3e52ef9fc.js} +1 -1
  71. sky/dashboard/out/_next/static/chunks/pages/workspaces-91e0942f47310aae.js +1 -0
  72. sky/dashboard/out/_next/static/chunks/webpack-cfe59cf684ee13b9.js +1 -0
  73. sky/dashboard/out/_next/static/css/b0dbca28f027cc19.css +3 -0
  74. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  75. sky/dashboard/out/clusters/[cluster].html +1 -1
  76. sky/dashboard/out/clusters.html +1 -1
  77. sky/dashboard/out/config.html +1 -1
  78. sky/dashboard/out/index.html +1 -1
  79. sky/dashboard/out/infra/[context].html +1 -1
  80. sky/dashboard/out/infra.html +1 -1
  81. sky/dashboard/out/jobs/[job].html +1 -1
  82. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  83. sky/dashboard/out/jobs.html +1 -1
  84. sky/dashboard/out/plugins/[...slug].html +1 -0
  85. sky/dashboard/out/users.html +1 -1
  86. sky/dashboard/out/volumes.html +1 -1
  87. sky/dashboard/out/workspace/new.html +1 -1
  88. sky/dashboard/out/workspaces/[name].html +1 -1
  89. sky/dashboard/out/workspaces.html +1 -1
  90. sky/data/data_utils.py +26 -12
  91. sky/data/mounting_utils.py +44 -5
  92. sky/global_user_state.py +111 -19
  93. sky/jobs/client/sdk.py +8 -3
  94. sky/jobs/controller.py +191 -31
  95. sky/jobs/recovery_strategy.py +109 -11
  96. sky/jobs/server/core.py +81 -4
  97. sky/jobs/server/server.py +14 -0
  98. sky/jobs/state.py +417 -19
  99. sky/jobs/utils.py +73 -80
  100. sky/models.py +11 -0
  101. sky/optimizer.py +8 -6
  102. sky/provision/__init__.py +12 -9
  103. sky/provision/common.py +20 -0
  104. sky/provision/docker_utils.py +15 -2
  105. sky/provision/kubernetes/utils.py +163 -20
  106. sky/provision/kubernetes/volume.py +52 -17
  107. sky/provision/provisioner.py +17 -7
  108. sky/provision/runpod/instance.py +3 -1
  109. sky/provision/runpod/utils.py +13 -1
  110. sky/provision/runpod/volume.py +25 -9
  111. sky/provision/slurm/__init__.py +12 -0
  112. sky/provision/slurm/config.py +13 -0
  113. sky/provision/slurm/instance.py +618 -0
  114. sky/provision/slurm/utils.py +689 -0
  115. sky/provision/vast/instance.py +4 -1
  116. sky/provision/vast/utils.py +11 -6
  117. sky/resources.py +135 -13
  118. sky/schemas/api/responses.py +4 -0
  119. sky/schemas/db/global_user_state/010_save_ssh_key.py +1 -1
  120. sky/schemas/db/spot_jobs/008_add_full_resources.py +34 -0
  121. sky/schemas/db/spot_jobs/009_job_events.py +32 -0
  122. sky/schemas/db/spot_jobs/010_job_events_timestamp_with_timezone.py +43 -0
  123. sky/schemas/db/spot_jobs/011_add_links.py +34 -0
  124. sky/schemas/generated/jobsv1_pb2.py +9 -5
  125. sky/schemas/generated/jobsv1_pb2.pyi +12 -0
  126. sky/schemas/generated/jobsv1_pb2_grpc.py +44 -0
  127. sky/schemas/generated/managed_jobsv1_pb2.py +32 -28
  128. sky/schemas/generated/managed_jobsv1_pb2.pyi +11 -2
  129. sky/serve/serve_utils.py +232 -40
  130. sky/serve/server/impl.py +1 -1
  131. sky/server/common.py +17 -0
  132. sky/server/constants.py +1 -1
  133. sky/server/metrics.py +6 -3
  134. sky/server/plugins.py +238 -0
  135. sky/server/requests/executor.py +5 -2
  136. sky/server/requests/payloads.py +30 -1
  137. sky/server/requests/request_names.py +4 -0
  138. sky/server/requests/requests.py +33 -11
  139. sky/server/requests/serializers/encoders.py +22 -0
  140. sky/server/requests/serializers/return_value_serializers.py +70 -0
  141. sky/server/server.py +506 -109
  142. sky/server/server_utils.py +30 -0
  143. sky/server/uvicorn.py +5 -0
  144. sky/setup_files/MANIFEST.in +1 -0
  145. sky/setup_files/dependencies.py +22 -9
  146. sky/sky_logging.py +2 -1
  147. sky/skylet/attempt_skylet.py +13 -3
  148. sky/skylet/constants.py +55 -13
  149. sky/skylet/events.py +10 -4
  150. sky/skylet/executor/__init__.py +1 -0
  151. sky/skylet/executor/slurm.py +187 -0
  152. sky/skylet/job_lib.py +91 -5
  153. sky/skylet/log_lib.py +22 -6
  154. sky/skylet/log_lib.pyi +8 -6
  155. sky/skylet/services.py +18 -3
  156. sky/skylet/skylet.py +5 -1
  157. sky/skylet/subprocess_daemon.py +2 -1
  158. sky/ssh_node_pools/constants.py +12 -0
  159. sky/ssh_node_pools/core.py +40 -3
  160. sky/ssh_node_pools/deploy/__init__.py +4 -0
  161. sky/{utils/kubernetes/deploy_ssh_node_pools.py → ssh_node_pools/deploy/deploy.py} +279 -504
  162. sky/ssh_node_pools/deploy/tunnel/ssh-tunnel.sh +379 -0
  163. sky/ssh_node_pools/deploy/tunnel_utils.py +199 -0
  164. sky/ssh_node_pools/deploy/utils.py +173 -0
  165. sky/ssh_node_pools/server.py +11 -13
  166. sky/{utils/kubernetes/ssh_utils.py → ssh_node_pools/utils.py} +9 -6
  167. sky/templates/kubernetes-ray.yml.j2 +12 -6
  168. sky/templates/slurm-ray.yml.j2 +115 -0
  169. sky/templates/vast-ray.yml.j2 +1 -0
  170. sky/templates/websocket_proxy.py +18 -41
  171. sky/users/model.conf +1 -1
  172. sky/users/permission.py +85 -52
  173. sky/users/rbac.py +31 -3
  174. sky/utils/annotations.py +108 -8
  175. sky/utils/auth_utils.py +42 -0
  176. sky/utils/cli_utils/status_utils.py +19 -5
  177. sky/utils/cluster_utils.py +10 -3
  178. sky/utils/command_runner.py +389 -35
  179. sky/utils/command_runner.pyi +43 -4
  180. sky/utils/common_utils.py +47 -31
  181. sky/utils/context.py +32 -0
  182. sky/utils/db/db_utils.py +36 -6
  183. sky/utils/db/migration_utils.py +41 -21
  184. sky/utils/infra_utils.py +5 -1
  185. sky/utils/instance_links.py +139 -0
  186. sky/utils/interactive_utils.py +49 -0
  187. sky/utils/kubernetes/generate_kubeconfig.sh +42 -33
  188. sky/utils/kubernetes/kubernetes_deploy_utils.py +2 -94
  189. sky/utils/kubernetes/rsync_helper.sh +5 -1
  190. sky/utils/kubernetes/ssh-tunnel.sh +7 -376
  191. sky/utils/plugin_extensions/__init__.py +14 -0
  192. sky/utils/plugin_extensions/external_failure_source.py +176 -0
  193. sky/utils/resources_utils.py +10 -8
  194. sky/utils/rich_utils.py +9 -11
  195. sky/utils/schemas.py +93 -19
  196. sky/utils/status_lib.py +7 -0
  197. sky/utils/subprocess_utils.py +17 -0
  198. sky/volumes/client/sdk.py +6 -3
  199. sky/volumes/server/core.py +65 -27
  200. sky_templates/ray/start_cluster +8 -4
  201. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/METADATA +67 -59
  202. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/RECORD +208 -180
  203. sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +0 -1
  204. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +0 -11
  205. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +0 -6
  206. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +0 -1
  207. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +0 -1
  208. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +0 -15
  209. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +0 -26
  210. sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +0 -1
  211. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +0 -1
  212. sky/dashboard/out/_next/static/chunks/3800-7b45f9fbb6308557.js +0 -1
  213. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +0 -1
  214. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +0 -1
  215. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +0 -15
  216. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +0 -13
  217. sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +0 -1
  218. sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +0 -1
  219. sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +0 -1
  220. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +0 -30
  221. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +0 -41
  222. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +0 -1
  223. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +0 -1
  224. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +0 -6
  225. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +0 -1
  226. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +0 -31
  227. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +0 -30
  228. sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +0 -34
  229. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +0 -16
  230. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +0 -1
  231. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +0 -1
  232. sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +0 -1
  233. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +0 -16
  234. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +0 -21
  235. sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +0 -1
  236. sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +0 -1
  237. sky/dashboard/out/_next/static/chunks/pages/workspaces-531b2f8c4bf89f82.js +0 -1
  238. sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +0 -1
  239. sky/dashboard/out/_next/static/css/0748ce22df867032.css +0 -3
  240. /sky/dashboard/out/_next/static/{96_E2yl3QAiIJGOYCkSpB → 3nu-b8raeKRNABZ2d4GAG}/_ssgManifest.js +0 -0
  241. /sky/{utils/kubernetes → ssh_node_pools/deploy/tunnel}/cleanup-tunnel.sh +0 -0
  242. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/WHEEL +0 -0
  243. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/entry_points.txt +0 -0
  244. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/licenses/LICENSE +0 -0
  245. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/top_level.txt +0 -0
sky/provision/slurm/instance.py
@@ -0,0 +1,618 @@
+ """Slurm instance provisioning."""
+
+ import tempfile
+ import textwrap
+ import time
+ from typing import Any, cast, Dict, List, Optional, Tuple
+
+ from sky import sky_logging
+ from sky import skypilot_config
+ from sky.adaptors import slurm
+ from sky.provision import common
+ from sky.provision import constants
+ from sky.provision.slurm import utils as slurm_utils
+ from sky.utils import command_runner
+ from sky.utils import common_utils
+ from sky.utils import status_lib
+ from sky.utils import subprocess_utils
+ from sky.utils import timeline
+
+ logger = sky_logging.init_logger(__name__)
+
+ # TODO(kevin): This assumes $HOME is in a shared filesystem.
+ # We should probably make it configurable, and add a check
+ # during sky check.
+ SHARED_ROOT_SKY_DIRECTORY = '~/.sky_clusters'
+ PROVISION_SCRIPTS_DIRECTORY_NAME = '.sky_provision'
+ PROVISION_SCRIPTS_DIRECTORY = f'~/{PROVISION_SCRIPTS_DIRECTORY_NAME}'
+
+ POLL_INTERVAL_SECONDS = 2
+ # Default KillWait is 30 seconds, so we add some buffer time here.
+ _JOB_TERMINATION_TIMEOUT_SECONDS = 60
+ _SKY_DIR_CREATION_TIMEOUT_SECONDS = 30
+
+
+ def _sky_cluster_home_dir(cluster_name_on_cloud: str) -> str:
+     """Returns the SkyPilot cluster's home directory path on the Slurm cluster.
+
+     This path is assumed to be on a shared NFS mount accessible by all nodes.
+     To support clusters with non-NFS home directories, we would need to let
+     users specify an NFS-backed "working directory" or use a different
+     coordination mechanism.
+     """
+     return f'{SHARED_ROOT_SKY_DIRECTORY}/{cluster_name_on_cloud}'
+
+
+ def _sbatch_provision_script_path(filename: str) -> str:
+     """Returns the path to the sbatch provision script on the login node."""
+     # Put sbatch script in $HOME instead of /tmp as there can be
+     # multiple login nodes, and different SSH connections
+     # can land on different login nodes.
+     return f'{PROVISION_SCRIPTS_DIRECTORY}/{filename}'
+
+
+ def _skypilot_runtime_dir(cluster_name_on_cloud: str) -> str:
+     """Returns the SkyPilot runtime directory path on the Slurm cluster."""
+     return f'/tmp/{cluster_name_on_cloud}'
+
+
+ @timeline.event
+ def _create_virtual_instance(
+         region: str, cluster_name_on_cloud: str,
+         config: common.ProvisionConfig) -> common.ProvisionRecord:
+     """Creates a Slurm virtual instance from the config.
+
+     A Slurm virtual instance is created by submitting a long-running
+     job with sbatch, to mimic a cloud VM.
+     """
+     provider_config = config.provider_config
+     ssh_config_dict = provider_config['ssh']
+     ssh_host = ssh_config_dict['hostname']
+     ssh_port = int(ssh_config_dict['port'])
+     ssh_user = ssh_config_dict['user']
+     ssh_key = ssh_config_dict['private_key']
+     ssh_proxy_command = ssh_config_dict.get('proxycommand', None)
+     ssh_proxy_jump = ssh_config_dict.get('proxyjump', None)
+     partition = slurm_utils.get_partition_from_config(provider_config)
+
+     client = slurm.SlurmClient(
+         ssh_host,
+         ssh_port,
+         ssh_user,
+         ssh_key,
+         ssh_proxy_command=ssh_proxy_command,
+         ssh_proxy_jump=ssh_proxy_jump,
+     )
+
+     # COMPLETING state occurs when a job is being terminated - during this
+     # phase, slurmd sends SIGTERM to tasks, waits for KillWait period, sends
+     # SIGKILL if needed, runs epilog scripts, and notifies slurmctld. This
+     # typically happens when a previous job with the same name is being
+     # cancelled or has finished. Jobs can get stuck in COMPLETING if epilog
+     # scripts hang or tasks don't respond to signals, so we wait with a
+     # timeout.
+     completing_jobs = client.query_jobs(
+         cluster_name_on_cloud,
+         ['completing'],
+     )
+     start_time = time.time()
+     while (completing_jobs and
+            time.time() - start_time < _JOB_TERMINATION_TIMEOUT_SECONDS):
+         logger.debug(f'Found {len(completing_jobs)} completing jobs. '
+                      f'Waiting for them to finish: {completing_jobs}')
+         time.sleep(POLL_INTERVAL_SECONDS)
+         completing_jobs = client.query_jobs(
+             cluster_name_on_cloud,
+             ['completing'],
+         )
+     if completing_jobs:
+         # TODO(kevin): Automatically handle this, following the suggestions in
+         # https://slurm.schedmd.com/troubleshoot.html#completing
+         raise RuntimeError(f'Found {len(completing_jobs)} jobs still in '
+                            'completing state after '
+                            f'{_JOB_TERMINATION_TIMEOUT_SECONDS}s. '
+                            'This is typically due to non-killable processes '
+                            'associated with the job.')
+
+     # Check if job already exists
+     existing_jobs = client.query_jobs(
+         cluster_name_on_cloud,
+         ['pending', 'running'],
+     )
+
+     # Get provision_timeout from config. If not specified, use None,
+     # which will use the default timeout specified in the Slurm adaptor.
+     provision_timeout = skypilot_config.get_effective_region_config(
+         cloud='slurm',
+         region=region,
+         keys=('provision_timeout',),
+         default_value=None)
+
+     if existing_jobs:
+         assert len(existing_jobs) == 1, (
+             f'Multiple jobs found with name {cluster_name_on_cloud}: '
+             f'{existing_jobs}')
+
+         job_id = existing_jobs[0]
+         logger.debug(f'Job with name {cluster_name_on_cloud} already exists '
+                      f'(JOBID: {job_id})')
+
+         # Wait for nodes to be allocated (job might be in PENDING state)
+         nodes, _ = client.get_job_nodes(job_id,
+                                         wait=True,
+                                         timeout=provision_timeout)
+         return common.ProvisionRecord(provider_name='slurm',
+                                       region=region,
+                                       zone=partition,
+                                       cluster_name=cluster_name_on_cloud,
+                                       head_instance_id=slurm_utils.instance_id(
+                                           job_id, nodes[0]),
+                                       resumed_instance_ids=[],
+                                       created_instance_ids=[])
+
+     resources = config.node_config
+
+     # Note: By default Slurm terminates the entire job allocation if any node
+     # fails in its range of allocated nodes.
+     # In the future we can consider running sbatch with --no-kill to not
+     # automatically terminate a job if one of the nodes it has been
+     # allocated fails.
+     num_nodes = config.count
+
+     accelerator_type = resources.get('accelerator_type')
+     accelerator_count_raw = resources.get('accelerator_count')
+     try:
+         accelerator_count = int(
+             accelerator_count_raw) if accelerator_count_raw is not None else 0
+     except (TypeError, ValueError):
+         accelerator_count = 0
+
+     skypilot_runtime_dir = _skypilot_runtime_dir(cluster_name_on_cloud)
+     sky_home_dir = _sky_cluster_home_dir(cluster_name_on_cloud)
+     ready_signal = f'{sky_home_dir}/.sky_sbatch_ready'
+     slurm_marker_file = f'{sky_home_dir}/{slurm_utils.SLURM_MARKER_FILE}'
+
+     # Build the sbatch script
+     gpu_directive = ''
+     if (accelerator_type is not None and accelerator_type.upper() != 'NONE' and
+             accelerator_count > 0):
+         gpu_directive = (f'#SBATCH --gres=gpu:{accelerator_type}:'
+                          f'{accelerator_count}')
+
+     # By default stdout and stderr will be written to $HOME/slurm-%j.out
+     # (because we invoke sbatch from $HOME). Redirect elsewhere to not pollute
+     # the home directory.
+     provision_script = textwrap.dedent(f"""\
+         #!/bin/bash
+         #SBATCH --job-name={cluster_name_on_cloud}
+         #SBATCH --output={PROVISION_SCRIPTS_DIRECTORY_NAME}/slurm-%j.out
+         #SBATCH --error={PROVISION_SCRIPTS_DIRECTORY_NAME}/slurm-%j.out
+         #SBATCH --nodes={num_nodes}
+         #SBATCH --wait-all-nodes=1
+         # Let the job be terminated rather than requeued implicitly.
+         #SBATCH --no-requeue
+         #SBATCH --cpus-per-task={int(resources["cpus"])}
+         #SBATCH --mem={int(resources["memory"])}G
+         {gpu_directive}
+
+         # Cleanup function to remove cluster dirs on job termination.
+         cleanup() {{
+             # The Skylet is daemonized, so it is not automatically terminated when
+             # the Slurm job is terminated, we need to kill it manually.
+             echo "Terminating Skylet..."
+             if [ -f "{skypilot_runtime_dir}/.sky/skylet_pid" ]; then
+                 kill $(cat "{skypilot_runtime_dir}/.sky/skylet_pid") 2>/dev/null || true
+             fi
+             echo "Cleaning up sky directories..."
+             # Clean up sky runtime directory on each node.
+             # NOTE: We can do this because --nodes for both this srun and the
+             # sbatch is the same number. Otherwise, there are no guarantees
+             # that this srun will run on the same subset of nodes as the srun
+             # that created the sky directories.
+             srun --nodes={num_nodes} rm -rf {skypilot_runtime_dir}
+             rm -rf {sky_home_dir}
+         }}
+         trap cleanup TERM
+
+         # Create sky home directory for the cluster.
+         mkdir -p {sky_home_dir}
+         # Create sky runtime directory on each node.
+         srun --nodes={num_nodes} mkdir -p {skypilot_runtime_dir}
+         # Marker file to indicate we're in a Slurm cluster.
+         touch {slurm_marker_file}
+         # Suppress login messages.
+         touch {sky_home_dir}/.hushlogin
+         # Signal that the sbatch script has completed setup.
+         touch {ready_signal}
+         sleep infinity
+         """)
+
+     # To bootstrap things, we need to do it with SSHCommandRunner first.
+     # SlurmCommandRunner is for after the virtual instances are created.
+     login_node_runner = command_runner.SSHCommandRunner(
+         (ssh_host, ssh_port),
+         ssh_user,
+         ssh_key,
+         ssh_proxy_command=ssh_proxy_command,
+         ssh_proxy_jump=ssh_proxy_jump,
+     )
+
+     cmd = f'mkdir -p {PROVISION_SCRIPTS_DIRECTORY}'
+     rc, stdout, stderr = login_node_runner.run(cmd,
+                                                require_outputs=True,
+                                                stream_logs=False)
+     subprocess_utils.handle_returncode(
+         rc,
+         cmd,
+         'Failed to create provision scripts directory on login node.',
+         stderr=f'{stdout}\n{stderr}')
+     # Rsync the provision script to the login node
+     with tempfile.NamedTemporaryFile(mode='w', suffix='.sh', delete=True) as f:
+         f.write(provision_script)
+         f.flush()
+         src_path = f.name
+         tgt_path = _sbatch_provision_script_path(f'{cluster_name_on_cloud}.sh')
+         login_node_runner.rsync(src_path, tgt_path, up=True, stream_logs=False)
+
+     job_id = client.submit_job(partition, cluster_name_on_cloud, tgt_path)
+     logger.debug(f'Successfully submitted Slurm job {job_id} to partition '
+                  f'{partition} for cluster {cluster_name_on_cloud} '
+                  f'with {num_nodes} nodes')
+
+     nodes, _ = client.get_job_nodes(job_id,
+                                     wait=True,
+                                     timeout=provision_timeout)
+     created_instance_ids = [
+         slurm_utils.instance_id(job_id, node) for node in nodes
+     ]
+
+     # Wait for the sbatch script to create the cluster's sky directories,
+     # to avoid a race condition where post-provision commands try to
+     # access the directories before they are created.
+     ready_check_cmd = (f'end=$((SECONDS+{_SKY_DIR_CREATION_TIMEOUT_SECONDS})); '
+                        f'while [ ! -f {ready_signal} ]; do '
+                        'if (( SECONDS >= end )); then '
+                        'exit 1; fi; '
+                        'sleep 0.5; '
+                        'done')
+     rc, stdout, stderr = login_node_runner.run(ready_check_cmd,
+                                                require_outputs=True,
+                                                stream_logs=False)
+     subprocess_utils.handle_returncode(
+         rc,
+         ready_check_cmd,
+         'Failed to verify sky directories creation.',
+         stderr=f'{stdout}\n{stderr}')
+
+     return common.ProvisionRecord(provider_name='slurm',
+                                   region=region,
+                                   zone=partition,
+                                   cluster_name=cluster_name_on_cloud,
+                                   head_instance_id=created_instance_ids[0],
+                                   resumed_instance_ids=[],
+                                   created_instance_ids=created_instance_ids)
+
+
+ @common_utils.retry
+ def query_instances(
+     cluster_name: str,
+     cluster_name_on_cloud: str,
+     provider_config: Optional[Dict[str, Any]] = None,
+     non_terminated_only: bool = True,
+     retry_if_missing: bool = False,
+ ) -> Dict[str, Tuple[Optional[status_lib.ClusterStatus], Optional[str]]]:
+     """See sky/provision/__init__.py"""
+     del cluster_name, retry_if_missing  # Unused for Slurm
+     assert provider_config is not None, (cluster_name_on_cloud, provider_config)
+
+     ssh_config_dict = provider_config['ssh']
+     ssh_host = ssh_config_dict['hostname']
+     ssh_port = int(ssh_config_dict['port'])
+     ssh_user = ssh_config_dict['user']
+     ssh_key = ssh_config_dict['private_key']
+     ssh_proxy_command = ssh_config_dict.get('proxycommand', None)
+     ssh_proxy_jump = ssh_config_dict.get('proxyjump', None)
+
+     client = slurm.SlurmClient(
+         ssh_host,
+         ssh_port,
+         ssh_user,
+         ssh_key,
+         ssh_proxy_command=ssh_proxy_command,
+         ssh_proxy_jump=ssh_proxy_jump,
+     )
+
+     # Map Slurm job states to SkyPilot ClusterStatus
+     # Slurm states:
+     # https://slurm.schedmd.com/squeue.html#SECTION_JOB-STATE-CODES
+     # TODO(kevin): Include more states here.
+     status_map = {
+         'pending': status_lib.ClusterStatus.INIT,
+         'running': status_lib.ClusterStatus.UP,
+         'completing': status_lib.ClusterStatus.UP,
+         'completed': None,
+         'cancelled': None,
+         # NOTE: Jobs that get cancelled (from sky down) will go to failed state
+         # with the reason 'NonZeroExitCode' and remain in the squeue output for
+         # a while.
+         'failed': None,
+         'node_fail': None,
+     }
+
+     statuses: Dict[str, Tuple[Optional[status_lib.ClusterStatus],
+                               Optional[str]]] = {}
+     for state, sky_status in status_map.items():
+         jobs = client.query_jobs(
+             cluster_name_on_cloud,
+             [state],
+         )
+
+         for job_id in jobs:
+             if state in ('pending', 'failed', 'node_fail', 'cancelled',
+                          'completed'):
+                 reason = client.get_job_reason(job_id)
+                 if non_terminated_only and sky_status is None:
+                     # TODO(kevin): For better UX, we should also find out
+                     # which node(s) exactly that failed if it's a node_fail
+                     # state.
+                     logger.debug(f'Job {job_id} is terminated, but '
+                                  'query_instances is called with '
+                                  f'non_terminated_only=True. State: {state}, '
+                                  f'Reason: {reason}')
+                     continue
+                 statuses[job_id] = (sky_status, reason)
+             else:
+                 nodes, _ = client.get_job_nodes(job_id, wait=False)
+                 for node in nodes:
+                     instance_id = slurm_utils.instance_id(job_id, node)
+                     statuses[instance_id] = (sky_status, None)
+
+     # TODO(kevin): Query sacct too to get more historical job info.
+     # squeue only includes completed jobs that finished in the last
+     # MinJobAge seconds (default 300s). Or could be earlier if it
+     # reaches MaxJobCount first (default 10_000).
+
+     return statuses
+
+
+ def run_instances(
+         region: str,
+         cluster_name: str,  # pylint: disable=unused-argument
+         cluster_name_on_cloud: str,
+         config: common.ProvisionConfig) -> common.ProvisionRecord:
+     """Run instances for the given cluster (Slurm in this case)."""
+     return _create_virtual_instance(region, cluster_name_on_cloud, config)
+
+
+ def wait_instances(region: str, cluster_name_on_cloud: str,
+                    state: Optional[status_lib.ClusterStatus]) -> None:
+     """See sky/provision/__init__.py"""
+     del region, cluster_name_on_cloud, state
+     # We already wait for the instances to be running in run_instances.
+     # So we don't need to wait here.
+
+
+ def get_cluster_info(
+         region: str,
+         cluster_name_on_cloud: str,
+         provider_config: Optional[Dict[str, Any]] = None) -> common.ClusterInfo:
+     del region
+     assert provider_config is not None, cluster_name_on_cloud
+
+     # The SSH host is the remote machine running slurmctld daemon.
+     # Cross-cluster operations are supported by interacting with
+     # the current controller. For details, please refer to
+     # https://slurm.schedmd.com/multi_cluster.html.
+     ssh_config_dict = provider_config['ssh']
+     ssh_host = ssh_config_dict['hostname']
+     ssh_port = int(ssh_config_dict['port'])
+     ssh_user = ssh_config_dict['user']
+     ssh_key = ssh_config_dict['private_key']
+     ssh_proxy_command = ssh_config_dict.get('proxycommand', None)
+     ssh_proxy_jump = ssh_config_dict.get('proxyjump', None)
+
+     client = slurm.SlurmClient(
+         ssh_host,
+         ssh_port,
+         ssh_user,
+         ssh_key,
+         ssh_proxy_command=ssh_proxy_command,
+         ssh_proxy_jump=ssh_proxy_jump,
+     )
+
+     # Find running job for this cluster
+     running_jobs = client.query_jobs(
+         cluster_name_on_cloud,
+         ['running'],
+     )
+
+     if not running_jobs:
+         # No running jobs found - cluster may be in pending or terminated state
+         return common.ClusterInfo(
+             instances={},
+             head_instance_id=None,
+             ssh_user=ssh_user,
+             provider_name='slurm',
+             provider_config=provider_config,
+         )
+     assert len(running_jobs) == 1, (
+         f'Multiple running jobs found for cluster {cluster_name_on_cloud}: '
+         f'{running_jobs}')
+
+     job_id = running_jobs[0]
+     # Running jobs should already have nodes allocated, so don't wait
+     nodes, node_ips = client.get_job_nodes(job_id, wait=False)
+
+     instances = {
+         f'{slurm_utils.instance_id(job_id, node)}': [
+             common.InstanceInfo(
+                 instance_id=slurm_utils.instance_id(job_id, node),
+                 internal_ip=node_ip,
+                 external_ip=ssh_host,
+                 ssh_port=ssh_port,
+                 tags={
+                     constants.TAG_SKYPILOT_CLUSTER_NAME: cluster_name_on_cloud,
+                     'job_id': job_id,
+                     'node': node,
+                 },
+             )
+         ] for node, node_ip in zip(nodes, node_ips)
+     }
+
+     return common.ClusterInfo(
+         instances=instances,
+         head_instance_id=slurm_utils.instance_id(job_id, nodes[0]),
+         ssh_user=ssh_user,
+         provider_name='slurm',
+         provider_config=provider_config,
+     )
+
+
+ def stop_instances(
+     cluster_name_on_cloud: str,
+     provider_config: Optional[Dict[str, Any]] = None,
+     worker_only: bool = False,
+ ) -> None:
+     """Keep the Slurm virtual instances running."""
+     raise NotImplementedError()
+
+
+ def terminate_instances(
+     cluster_name_on_cloud: str,
+     provider_config: Optional[Dict[str, Any]] = None,
+     worker_only: bool = False,
+ ) -> None:
+     """See sky/provision/__init__.py"""
+     assert provider_config is not None, cluster_name_on_cloud
+
+     if worker_only:
+         logger.warning(
+             'worker_only=True is not supported for Slurm, this is a no-op.')
+         return
+
+     # Check if we are running inside a Slurm cluster (only happens with
+     # autodown, where the Skylet invokes terminate_instances on the remote
+     # cluster). In this case, use local execution instead of SSH.
+     # This assumes that the compute node is able to run scancel.
+     # TODO(kevin): Validate this assumption.
+     if slurm_utils.is_inside_slurm_cluster():
+         logger.debug('Running inside a Slurm cluster, using local execution')
+         client = slurm.SlurmClient(is_inside_slurm_cluster=True)
+     else:
+         ssh_config_dict = provider_config['ssh']
+         ssh_host = ssh_config_dict['hostname']
+         ssh_port = int(ssh_config_dict['port'])
+         ssh_user = ssh_config_dict['user']
+         ssh_private_key = ssh_config_dict['private_key']
+         ssh_proxy_command = ssh_config_dict.get('proxycommand', None)
+         ssh_proxy_jump = ssh_config_dict.get('proxyjump', None)
+
+         client = slurm.SlurmClient(
+             ssh_host,
+             ssh_port,
+             ssh_user,
+             ssh_private_key,
+             ssh_proxy_command=ssh_proxy_command,
+             ssh_proxy_jump=ssh_proxy_jump,
+         )
+     jobs_state = client.get_jobs_state_by_name(cluster_name_on_cloud)
+     if not jobs_state:
+         logger.debug(f'Job for cluster {cluster_name_on_cloud} not found, '
+                      'it may have been terminated.')
+         return
+     assert len(jobs_state) == 1, (
+         f'Multiple jobs found for cluster {cluster_name_on_cloud}: {jobs_state}'
+     )
+
+     job_state = jobs_state[0].strip()
+     # Terminal states where scancel is not needed or will fail.
+     terminal_states = {
+         'COMPLETED', 'CANCELLED', 'FAILED', 'TIMEOUT', 'NODE_FAIL', 'PREEMPTED',
+         'SPECIAL_EXIT'
+     }
+     if job_state in terminal_states:
+         logger.debug(
+             f'Job for cluster {cluster_name_on_cloud} is already in a terminal '
+             f'state {job_state}. No action needed.')
+         return
+
+     if job_state in ('PENDING', 'CONFIGURING'):
+         # For pending/configuring jobs, cancel without signal to avoid hangs.
+         client.cancel_jobs_by_name(cluster_name_on_cloud, signal=None)
+     elif job_state == 'COMPLETING':
+         # Job is already being terminated. No action needed.
+         logger.debug(
+             f'Job for cluster {cluster_name_on_cloud} is already completing. '
+             'No action needed.')
+     else:
+         # For other states (e.g., RUNNING, SUSPENDED), send a TERM signal.
+         client.cancel_jobs_by_name(cluster_name_on_cloud,
+                                    signal='TERM',
+                                    full=True)
+
+
+ def open_ports(
+     cluster_name_on_cloud: str,
+     ports: List[str],
+     provider_config: Optional[Dict[str, Any]] = None,
+ ) -> None:
+     """See sky/provision/__init__.py"""
+     del cluster_name_on_cloud, ports, provider_config
+     pass
+
+
+ def cleanup_ports(
+     cluster_name_on_cloud: str,
+     ports: List[str],
+     provider_config: Optional[Dict[str, Any]] = None,
+ ) -> None:
+     """See sky/provision/__init__.py"""
+     del cluster_name_on_cloud, ports, provider_config
+     pass
+
+
+ def get_command_runners(
+     cluster_info: common.ClusterInfo,
+     **credentials: Dict[str, Any],
+ ) -> List[command_runner.SlurmCommandRunner]:
+     """Get a command runner for the given cluster."""
+     assert cluster_info.provider_config is not None, cluster_info
+
+     if cluster_info.head_instance_id is None:
+         # No running job found
+         return []
+
+     head_instance = cluster_info.get_head_instance()
+     assert head_instance is not None, 'Head instance not found'
+     cluster_name_on_cloud = head_instance.tags.get(
+         constants.TAG_SKYPILOT_CLUSTER_NAME, None)
+     assert cluster_name_on_cloud is not None, cluster_info
+
+     # There can only be one InstanceInfo per instance_id.
+     instances = [
+         instance_infos[0] for instance_infos in cluster_info.instances.values()
+     ]
+
+     # Note: For Slurm, the external IP for all instances is the same,
+     # it is the login node's. The internal IP is the private IP of the node.
+     ssh_user = cast(str, credentials.pop('ssh_user'))
+     ssh_private_key = cast(str, credentials.pop('ssh_private_key'))
+     # ssh_proxy_jump is Slurm-specific, it does not exist in the auth section
+     # of the cluster yaml.
+     ssh_proxy_jump = cluster_info.provider_config.get('ssh', {}).get(
+         'proxyjump', None)
+     runners = [
+         command_runner.SlurmCommandRunner(
+             (instance_info.external_ip or '', instance_info.ssh_port),
+             ssh_user,
+             ssh_private_key,
+             sky_dir=_sky_cluster_home_dir(cluster_name_on_cloud),
+             skypilot_runtime_dir=_skypilot_runtime_dir(cluster_name_on_cloud),
+             job_id=instance_info.tags['job_id'],
+             slurm_node=instance_info.tags['node'],
+             ssh_proxy_jump=ssh_proxy_jump,
+             enable_interactive_auth=True,
+             **credentials) for instance_info in instances
+     ]
+
+     return runners
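
Illustrative aside (not part of the packaged file above): the provisioner models a SkyPilot node allocation as one long-running sbatch job that creates the shared and per-node directories, touches a readiness marker, and then holds its nodes with `sleep infinity` until cancellation delivers SIGTERM to the `cleanup` trap. A minimal, self-contained sketch of the kind of script `_create_virtual_instance()` renders, with made-up values for the cluster name and node shape:

# Sketch only: renders an sbatch script comparable to the template embedded
# in _create_virtual_instance(). All concrete values below are hypothetical.
import textwrap

cluster = 'sky-demo-cluster'                 # hypothetical cluster_name_on_cloud
num_nodes, cpus, mem_gb = 2, 8, 32           # hypothetical node_config values
runtime_dir = f'/tmp/{cluster}'              # per-node runtime dir
home_dir = f'~/.sky_clusters/{cluster}'      # shared (NFS) cluster dir

sbatch_script = textwrap.dedent(f"""\
    #!/bin/bash
    #SBATCH --job-name={cluster}
    #SBATCH --output=.sky_provision/slurm-%j.out
    #SBATCH --error=.sky_provision/slurm-%j.out
    #SBATCH --nodes={num_nodes}
    #SBATCH --wait-all-nodes=1
    #SBATCH --no-requeue
    #SBATCH --cpus-per-task={cpus}
    #SBATCH --mem={mem_gb}G

    cleanup() {{
        # Remove the per-node and shared cluster directories on cancellation.
        srun --nodes={num_nodes} rm -rf {runtime_dir}
        rm -rf {home_dir}
    }}
    trap cleanup TERM

    mkdir -p {home_dir}
    srun --nodes={num_nodes} mkdir -p {runtime_dir}
    touch {home_dir}/.sky_sbatch_ready
    # Hold the allocation open so the job behaves like a persistent VM.
    sleep infinity
    """)
print(sbatch_script)

Tearing the cluster down then reduces to cancelling that single job: terminate_instances() asks the Slurm client to cancel it with a TERM signal (or a plain cancel while it is still pending), and the trap above removes the per-node and shared directories.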