skypilot-nightly 1.0.0.dev20251203__py3-none-any.whl → 1.0.0.dev20260112__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (245)
  1. sky/__init__.py +6 -2
  2. sky/adaptors/aws.py +1 -61
  3. sky/adaptors/slurm.py +565 -0
  4. sky/backends/backend_utils.py +95 -12
  5. sky/backends/cloud_vm_ray_backend.py +224 -65
  6. sky/backends/task_codegen.py +380 -4
  7. sky/catalog/__init__.py +0 -3
  8. sky/catalog/data_fetchers/fetch_gcp.py +9 -1
  9. sky/catalog/data_fetchers/fetch_nebius.py +1 -1
  10. sky/catalog/data_fetchers/fetch_vast.py +4 -2
  11. sky/catalog/kubernetes_catalog.py +12 -4
  12. sky/catalog/seeweb_catalog.py +30 -15
  13. sky/catalog/shadeform_catalog.py +5 -2
  14. sky/catalog/slurm_catalog.py +236 -0
  15. sky/catalog/vast_catalog.py +30 -6
  16. sky/check.py +25 -11
  17. sky/client/cli/command.py +391 -32
  18. sky/client/interactive_utils.py +190 -0
  19. sky/client/sdk.py +64 -2
  20. sky/client/sdk_async.py +9 -0
  21. sky/clouds/__init__.py +2 -0
  22. sky/clouds/aws.py +60 -2
  23. sky/clouds/azure.py +2 -0
  24. sky/clouds/cloud.py +7 -0
  25. sky/clouds/kubernetes.py +2 -0
  26. sky/clouds/runpod.py +38 -7
  27. sky/clouds/slurm.py +610 -0
  28. sky/clouds/ssh.py +3 -2
  29. sky/clouds/vast.py +39 -16
  30. sky/core.py +197 -37
  31. sky/dashboard/out/404.html +1 -1
  32. sky/dashboard/out/_next/static/3nu-b8raeKRNABZ2d4GAG/_buildManifest.js +1 -0
  33. sky/dashboard/out/_next/static/chunks/1871-0565f8975a7dcd10.js +6 -0
  34. sky/dashboard/out/_next/static/chunks/2109-55a1546d793574a7.js +11 -0
  35. sky/dashboard/out/_next/static/chunks/2521-099b07cd9e4745bf.js +26 -0
  36. sky/dashboard/out/_next/static/chunks/2755.a636e04a928a700e.js +31 -0
  37. sky/dashboard/out/_next/static/chunks/3495.05eab4862217c1a5.js +6 -0
  38. sky/dashboard/out/_next/static/chunks/3785.cfc5dcc9434fd98c.js +1 -0
  39. sky/dashboard/out/_next/static/chunks/3850-fd5696f3bbbaddae.js +1 -0
  40. sky/dashboard/out/_next/static/chunks/3981.645d01bf9c8cad0c.js +21 -0
  41. sky/dashboard/out/_next/static/chunks/4083-0115d67c1fb57d6c.js +21 -0
  42. sky/dashboard/out/_next/static/chunks/{8640.5b9475a2d18c5416.js → 429.a58e9ba9742309ed.js} +2 -2
  43. sky/dashboard/out/_next/static/chunks/4555.8e221537181b5dc1.js +6 -0
  44. sky/dashboard/out/_next/static/chunks/4725.937865b81fdaaebb.js +6 -0
  45. sky/dashboard/out/_next/static/chunks/6082-edabd8f6092300ce.js +25 -0
  46. sky/dashboard/out/_next/static/chunks/6989-49cb7dca83a7a62d.js +1 -0
  47. sky/dashboard/out/_next/static/chunks/6990-630bd2a2257275f8.js +1 -0
  48. sky/dashboard/out/_next/static/chunks/7248-a99800d4db8edabd.js +1 -0
  49. sky/dashboard/out/_next/static/chunks/754-cfc5d4ad1b843d29.js +18 -0
  50. sky/dashboard/out/_next/static/chunks/8050-dd8aa107b17dce00.js +16 -0
  51. sky/dashboard/out/_next/static/chunks/8056-d4ae1e0cb81e7368.js +1 -0
  52. sky/dashboard/out/_next/static/chunks/8555.011023e296c127b3.js +6 -0
  53. sky/dashboard/out/_next/static/chunks/8821-93c25df904a8362b.js +1 -0
  54. sky/dashboard/out/_next/static/chunks/8969-0662594b69432ade.js +1 -0
  55. sky/dashboard/out/_next/static/chunks/9025.f15c91c97d124a5f.js +6 -0
  56. sky/dashboard/out/_next/static/chunks/9353-7ad6bd01858556f1.js +1 -0
  57. sky/dashboard/out/_next/static/chunks/pages/_app-5a86569acad99764.js +34 -0
  58. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-8297476714acb4ac.js +6 -0
  59. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-337c3ba1085f1210.js +1 -0
  60. sky/dashboard/out/_next/static/chunks/pages/{clusters-ee39056f9851a3ff.js → clusters-57632ff3684a8b5c.js} +1 -1
  61. sky/dashboard/out/_next/static/chunks/pages/{config-dfb9bf07b13045f4.js → config-718cdc365de82689.js} +1 -1
  62. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-5fd3a453c079c2ea.js +1 -0
  63. sky/dashboard/out/_next/static/chunks/pages/infra-9f85c02c9c6cae9e.js +1 -0
  64. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-90f16972cbecf354.js +1 -0
  65. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-2dd42fc37aad427a.js +16 -0
  66. sky/dashboard/out/_next/static/chunks/pages/jobs-ed806aeace26b972.js +1 -0
  67. sky/dashboard/out/_next/static/chunks/pages/plugins/[...slug]-449a9f5a3bb20fb3.js +1 -0
  68. sky/dashboard/out/_next/static/chunks/pages/users-bec34706b36f3524.js +1 -0
  69. sky/dashboard/out/_next/static/chunks/pages/{volumes-b84b948ff357c43e.js → volumes-a83ba9b38dff7ea9.js} +1 -1
  70. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-84a40f8c7c627fe4.js → [name]-c781e9c3e52ef9fc.js} +1 -1
  71. sky/dashboard/out/_next/static/chunks/pages/workspaces-91e0942f47310aae.js +1 -0
  72. sky/dashboard/out/_next/static/chunks/webpack-cfe59cf684ee13b9.js +1 -0
  73. sky/dashboard/out/_next/static/css/b0dbca28f027cc19.css +3 -0
  74. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  75. sky/dashboard/out/clusters/[cluster].html +1 -1
  76. sky/dashboard/out/clusters.html +1 -1
  77. sky/dashboard/out/config.html +1 -1
  78. sky/dashboard/out/index.html +1 -1
  79. sky/dashboard/out/infra/[context].html +1 -1
  80. sky/dashboard/out/infra.html +1 -1
  81. sky/dashboard/out/jobs/[job].html +1 -1
  82. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  83. sky/dashboard/out/jobs.html +1 -1
  84. sky/dashboard/out/plugins/[...slug].html +1 -0
  85. sky/dashboard/out/users.html +1 -1
  86. sky/dashboard/out/volumes.html +1 -1
  87. sky/dashboard/out/workspace/new.html +1 -1
  88. sky/dashboard/out/workspaces/[name].html +1 -1
  89. sky/dashboard/out/workspaces.html +1 -1
  90. sky/data/data_utils.py +26 -12
  91. sky/data/mounting_utils.py +44 -5
  92. sky/global_user_state.py +111 -19
  93. sky/jobs/client/sdk.py +8 -3
  94. sky/jobs/controller.py +191 -31
  95. sky/jobs/recovery_strategy.py +109 -11
  96. sky/jobs/server/core.py +81 -4
  97. sky/jobs/server/server.py +14 -0
  98. sky/jobs/state.py +417 -19
  99. sky/jobs/utils.py +73 -80
  100. sky/models.py +11 -0
  101. sky/optimizer.py +8 -6
  102. sky/provision/__init__.py +12 -9
  103. sky/provision/common.py +20 -0
  104. sky/provision/docker_utils.py +15 -2
  105. sky/provision/kubernetes/utils.py +163 -20
  106. sky/provision/kubernetes/volume.py +52 -17
  107. sky/provision/provisioner.py +17 -7
  108. sky/provision/runpod/instance.py +3 -1
  109. sky/provision/runpod/utils.py +13 -1
  110. sky/provision/runpod/volume.py +25 -9
  111. sky/provision/slurm/__init__.py +12 -0
  112. sky/provision/slurm/config.py +13 -0
  113. sky/provision/slurm/instance.py +618 -0
  114. sky/provision/slurm/utils.py +689 -0
  115. sky/provision/vast/instance.py +4 -1
  116. sky/provision/vast/utils.py +11 -6
  117. sky/resources.py +135 -13
  118. sky/schemas/api/responses.py +4 -0
  119. sky/schemas/db/global_user_state/010_save_ssh_key.py +1 -1
  120. sky/schemas/db/spot_jobs/008_add_full_resources.py +34 -0
  121. sky/schemas/db/spot_jobs/009_job_events.py +32 -0
  122. sky/schemas/db/spot_jobs/010_job_events_timestamp_with_timezone.py +43 -0
  123. sky/schemas/db/spot_jobs/011_add_links.py +34 -0
  124. sky/schemas/generated/jobsv1_pb2.py +9 -5
  125. sky/schemas/generated/jobsv1_pb2.pyi +12 -0
  126. sky/schemas/generated/jobsv1_pb2_grpc.py +44 -0
  127. sky/schemas/generated/managed_jobsv1_pb2.py +32 -28
  128. sky/schemas/generated/managed_jobsv1_pb2.pyi +11 -2
  129. sky/serve/serve_utils.py +232 -40
  130. sky/serve/server/impl.py +1 -1
  131. sky/server/common.py +17 -0
  132. sky/server/constants.py +1 -1
  133. sky/server/metrics.py +6 -3
  134. sky/server/plugins.py +238 -0
  135. sky/server/requests/executor.py +5 -2
  136. sky/server/requests/payloads.py +30 -1
  137. sky/server/requests/request_names.py +4 -0
  138. sky/server/requests/requests.py +33 -11
  139. sky/server/requests/serializers/encoders.py +22 -0
  140. sky/server/requests/serializers/return_value_serializers.py +70 -0
  141. sky/server/server.py +506 -109
  142. sky/server/server_utils.py +30 -0
  143. sky/server/uvicorn.py +5 -0
  144. sky/setup_files/MANIFEST.in +1 -0
  145. sky/setup_files/dependencies.py +22 -9
  146. sky/sky_logging.py +2 -1
  147. sky/skylet/attempt_skylet.py +13 -3
  148. sky/skylet/constants.py +55 -13
  149. sky/skylet/events.py +10 -4
  150. sky/skylet/executor/__init__.py +1 -0
  151. sky/skylet/executor/slurm.py +187 -0
  152. sky/skylet/job_lib.py +91 -5
  153. sky/skylet/log_lib.py +22 -6
  154. sky/skylet/log_lib.pyi +8 -6
  155. sky/skylet/services.py +18 -3
  156. sky/skylet/skylet.py +5 -1
  157. sky/skylet/subprocess_daemon.py +2 -1
  158. sky/ssh_node_pools/constants.py +12 -0
  159. sky/ssh_node_pools/core.py +40 -3
  160. sky/ssh_node_pools/deploy/__init__.py +4 -0
  161. sky/{utils/kubernetes/deploy_ssh_node_pools.py → ssh_node_pools/deploy/deploy.py} +279 -504
  162. sky/ssh_node_pools/deploy/tunnel/ssh-tunnel.sh +379 -0
  163. sky/ssh_node_pools/deploy/tunnel_utils.py +199 -0
  164. sky/ssh_node_pools/deploy/utils.py +173 -0
  165. sky/ssh_node_pools/server.py +11 -13
  166. sky/{utils/kubernetes/ssh_utils.py → ssh_node_pools/utils.py} +9 -6
  167. sky/templates/kubernetes-ray.yml.j2 +12 -6
  168. sky/templates/slurm-ray.yml.j2 +115 -0
  169. sky/templates/vast-ray.yml.j2 +1 -0
  170. sky/templates/websocket_proxy.py +18 -41
  171. sky/users/model.conf +1 -1
  172. sky/users/permission.py +85 -52
  173. sky/users/rbac.py +31 -3
  174. sky/utils/annotations.py +108 -8
  175. sky/utils/auth_utils.py +42 -0
  176. sky/utils/cli_utils/status_utils.py +19 -5
  177. sky/utils/cluster_utils.py +10 -3
  178. sky/utils/command_runner.py +389 -35
  179. sky/utils/command_runner.pyi +43 -4
  180. sky/utils/common_utils.py +47 -31
  181. sky/utils/context.py +32 -0
  182. sky/utils/db/db_utils.py +36 -6
  183. sky/utils/db/migration_utils.py +41 -21
  184. sky/utils/infra_utils.py +5 -1
  185. sky/utils/instance_links.py +139 -0
  186. sky/utils/interactive_utils.py +49 -0
  187. sky/utils/kubernetes/generate_kubeconfig.sh +42 -33
  188. sky/utils/kubernetes/kubernetes_deploy_utils.py +2 -94
  189. sky/utils/kubernetes/rsync_helper.sh +5 -1
  190. sky/utils/kubernetes/ssh-tunnel.sh +7 -376
  191. sky/utils/plugin_extensions/__init__.py +14 -0
  192. sky/utils/plugin_extensions/external_failure_source.py +176 -0
  193. sky/utils/resources_utils.py +10 -8
  194. sky/utils/rich_utils.py +9 -11
  195. sky/utils/schemas.py +93 -19
  196. sky/utils/status_lib.py +7 -0
  197. sky/utils/subprocess_utils.py +17 -0
  198. sky/volumes/client/sdk.py +6 -3
  199. sky/volumes/server/core.py +65 -27
  200. sky_templates/ray/start_cluster +8 -4
  201. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/METADATA +67 -59
  202. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/RECORD +208 -180
  203. sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +0 -1
  204. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +0 -11
  205. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +0 -6
  206. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +0 -1
  207. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +0 -1
  208. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +0 -15
  209. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +0 -26
  210. sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +0 -1
  211. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +0 -1
  212. sky/dashboard/out/_next/static/chunks/3800-7b45f9fbb6308557.js +0 -1
  213. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +0 -1
  214. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +0 -1
  215. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +0 -15
  216. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +0 -13
  217. sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +0 -1
  218. sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +0 -1
  219. sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +0 -1
  220. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +0 -30
  221. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +0 -41
  222. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +0 -1
  223. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +0 -1
  224. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +0 -6
  225. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +0 -1
  226. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +0 -31
  227. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +0 -30
  228. sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +0 -34
  229. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +0 -16
  230. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +0 -1
  231. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +0 -1
  232. sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +0 -1
  233. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +0 -16
  234. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +0 -21
  235. sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +0 -1
  236. sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +0 -1
  237. sky/dashboard/out/_next/static/chunks/pages/workspaces-531b2f8c4bf89f82.js +0 -1
  238. sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +0 -1
  239. sky/dashboard/out/_next/static/css/0748ce22df867032.css +0 -3
  240. /sky/dashboard/out/_next/static/{96_E2yl3QAiIJGOYCkSpB → 3nu-b8raeKRNABZ2d4GAG}/_ssgManifest.js +0 -0
  241. /sky/{utils/kubernetes → ssh_node_pools/deploy/tunnel}/cleanup-tunnel.sh +0 -0
  242. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/WHEEL +0 -0
  243. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/entry_points.txt +0 -0
  244. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/licenses/LICENSE +0 -0
  245. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/top_level.txt +0 -0
sky/server/server_utils.py ADDED
@@ -0,0 +1,30 @@
+"""Utilities for the API server."""
+
+from typing import Optional, Type, TypeVar
+
+import fastapi
+
+from sky.server.requests import payloads
+from sky.skylet import constants
+
+_BodyT = TypeVar('_BodyT', bound=payloads.RequestBody)
+
+
+# TODO(aylei): remove this and disable request body construction at server-side
+def build_body_at_server(request: Optional[fastapi.Request],
+                         body_type: Type[_BodyT], **data) -> _BodyT:
+    """Builds the request body at the server.
+
+    For historical reasons, some handlers mimic a client request body
+    at server-side in order to coordinate with the interface of executor.
+    This will cause issues where the client info like user identity is not
+    respected in these handlers. This function is a helper to build the request
+    body at server-side with the auth user overridden.
+    """
+    request_body = body_type(**data)
+    if request is not None:
+        auth_user = getattr(request.state, 'auth_user', None)
+        if auth_user:
+            request_body.env_vars[constants.USER_ID_ENV_VAR] = auth_user.id
+            request_body.env_vars[constants.USER_ENV_VAR] = auth_user.name
+    return request_body
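For context, a handler passes its fastapi.Request through this helper so the authenticated user's identity lands in the body's env_vars. A minimal sketch, assuming a hypothetical route and using the base payloads.RequestBody as the body type (real handlers would use a concrete payload class):

    import fastapi

    from sky.server import server_utils
    from sky.server.requests import payloads

    app = fastapi.FastAPI()


    @app.post('/example')
    async def example_handler(request: fastapi.Request) -> dict:
        # The auth middleware is assumed to have set request.state.auth_user;
        # build_body_at_server then stamps the user id/name env vars into the
        # body before it is handed to the executor.
        body = server_utils.build_body_at_server(request, payloads.RequestBody)
        return {'env_vars': body.env_vars}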
sky/server/uvicorn.py CHANGED
@@ -20,6 +20,7 @@ from uvicorn.supervisors import multiprocess
 from sky import sky_logging
 from sky.server import daemons
 from sky.server import metrics as metrics_lib
+from sky.server import plugins
 from sky.server import state
 from sky.server.requests import requests as requests_lib
 from sky.skylet import constants
@@ -237,6 +238,10 @@ def run(config: uvicorn.Config, max_db_connections: Optional[int] = None):
     server = Server(config=config, max_db_connections=max_db_connections)
     try:
         if config.workers is not None and config.workers > 1:
+            # When workers > 1, uvicorn does not run server app in the main
+            # process. In this case, plugins are not loaded at this point, so
+            # load plugins here without uvicorn app.
+            plugins.load_plugins(plugins.ExtensionContext())
             sock = config.bind_socket()
             SlowStartMultiprocess(config, target=server.run,
                                   sockets=[sock]).run()
sky/setup_files/MANIFEST.in CHANGED
@@ -15,6 +15,7 @@ include sky/jobs/dashboard/templates/*
 include sky/jobs/dashboard/static/*
 include sky/templates/*
 include sky/utils/kubernetes/*
+include sky/ssh_node_pools/deploy/tunnel/*
 include sky/server/html/*
 recursive-include sky/dashboard/out *
 include sky/users/*.conf
sky/setup_files/dependencies.py CHANGED
@@ -84,6 +84,7 @@ install_requires = [
     'bcrypt==4.0.1',
     'pyjwt',
     'gitpython',
+    'paramiko',
     'types-paramiko',
     'alembic',
     'aiohttp',
@@ -143,9 +144,11 @@ aws_dependencies = [
     'awscli>=1.27.10',
     'botocore>=1.29.10',
     'boto3>=1.26.1',
-    # NOTE: required by awscli. To avoid ray automatically installing
-    # the latest version.
-    'colorama < 0.4.5',
+    # NOTE: colorama is a dependency of awscli. We pin it to match the
+    # version constraint in awscli (<0.4.7) to prevent potential conflicts
+    # with other packages like ray, which might otherwise install a newer
+    # version.
+    'colorama<0.4.7',
 ]
 
 # Kubernetes 32.0.0 has an authentication bug:
@@ -203,12 +206,21 @@ cloud_dependencies: Dict[str, List[str]] = {
     'ssh': kubernetes_dependencies,
     # For the container registry auth api. Reference:
     # https://github.com/runpod/runpod-python/releases/tag/1.6.1
-    # RunPod needs a TOML parser to read ~/.runpod/config.toml. On Python 3.11+
-    # stdlib provides tomllib; on lower versions we depend on tomli explicitly.
-    # Instead of installing tomli conditionally, we install it explicitly.
-    # This is because the conditional installation of tomli does not work
-    # with controller package installation code.
-    'runpod': ['runpod>=1.6.1', 'tomli'],
+    'runpod': [
+        # For the container registry auth api. Reference:
+        # https://github.com/runpod/runpod-python/releases/tag/1.6.1
+        'runpod>=1.6.1',
+        # RunPod needs a TOML parser to read ~/.runpod/config.toml. On Python
+        # 3.11+ stdlib provides tomllib; on lower versions we depend on tomli
+        # explicitly. Instead of installing tomli conditionally, we install it
+        # explicitly. This is because the conditional installation of tomli does
+        # not work with controller package installation code.
+        'tomli',
+        # runpod installs aiodns (via aiohttp[speedups]), which is incompatible
+        # with pycares 5.0.0 due to deprecations.
+        # See https://github.com/aio-libs/aiodns/issues/214
+        'pycares<5',
+    ],
     'fluidstack': [],  # No dependencies needed for fluidstack
     'cudo': ['cudo-compute>=0.1.10'],
     'paperspace': [],  # No dependencies needed for paperspace
@@ -234,6 +246,7 @@ cloud_dependencies: Dict[str, List[str]] = {
     'hyperbolic': [],  # No dependencies needed for hyperbolic
     'seeweb': ['ecsapi==0.4.0'],
     'shadeform': [],  # No dependencies needed for shadeform
+    'slurm': ['python-hostlist'],
 }
 
 # Calculate which clouds should be included in the [all] installation.
sky/sky_logging.py CHANGED
@@ -15,7 +15,8 @@ from sky.utils import env_options
 from sky.utils import rich_utils
 
 # UX: Should we show logging prefixes and some extra information in optimizer?
-_FORMAT = '%(levelname).1s %(asctime)s %(filename)s:%(lineno)d] %(message)s'
+_FORMAT = ('%(levelname).1s %(asctime)s.%(msecs)03d PID=%(process)d '
+           '%(filename)s:%(lineno)d] %(message)s')
 _DATE_FORMAT = '%m-%d %H:%M:%S'
 _SENSITIVE_LOGGER = ['sky.provisioner', 'sky.optimizer']
 
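The new format adds millisecond precision and the emitting process's PID to every log line. A quick way to see the resulting shape (output values are approximate):

    import logging

    _FORMAT = ('%(levelname).1s %(asctime)s.%(msecs)03d PID=%(process)d '
               '%(filename)s:%(lineno)d] %(message)s')
    _DATE_FORMAT = '%m-%d %H:%M:%S'

    logging.basicConfig(format=_FORMAT, datefmt=_DATE_FORMAT)
    logging.getLogger(__name__).warning('hello')
    # Prints something like:
    # W 01-12 10:42:03.123 PID=4242 example.py:9] hello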
sky/skylet/attempt_skylet.py CHANGED
@@ -9,6 +9,7 @@ import psutil
 
 from sky.skylet import constants
 from sky.skylet import runtime_utils
+from sky.utils import common_utils
 
 VERSION_FILE = runtime_utils.get_runtime_dir_path(constants.SKYLET_VERSION_FILE)
 SKYLET_LOG_FILE = runtime_utils.get_runtime_dir_path(constants.SKYLET_LOG_FILE)
@@ -97,8 +98,13 @@ def restart_skylet():
     for pid in _find_running_skylet_pids():
         try:
             os.kill(pid, signal.SIGKILL)
-        except (OSError, ProcessLookupError):
-            # Process died between detection and kill
+            # Wait until process fully terminates so its socket gets released.
+            # Without this, find_free_port may race with the kernel closing the
+            # socket and fail to bind to the port that's supposed to be free.
+            psutil.Process(pid).wait(timeout=5)
+        except (OSError, ProcessLookupError, psutil.NoSuchProcess,
+                psutil.TimeoutExpired):
+            # Process died between detection and kill, or timeout waiting
             pass
     # Clean up the PID file
     try:
@@ -106,7 +112,11 @@ def restart_skylet():
     except OSError:
         pass  # Best effort cleanup
 
-    port = constants.SKYLET_GRPC_PORT
+    # TODO(kevin): Handle race conditions here. Race conditions can only
+    # happen on Slurm, where there could be multiple clusters running in
+    # one network namespace. For other clouds, the behaviour will be that
+    # it always gets port 46590 (default port).
+    port = common_utils.find_free_port(constants.SKYLET_GRPC_PORT)
     subprocess.run(
         # We have made sure that `attempt_skylet.py` is executed with the
        # skypilot runtime env activated, so that skylet can access the cloud
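The helper referenced here scans upward for a bindable port starting from the default gRPC port. A minimal sketch of such a helper, noting that the real sky.utils.common_utils.find_free_port may differ in details:

    import socket


    def find_free_port(start_port: int) -> int:
        """Return the first port >= start_port that can be bound (sketch)."""
        for port in range(start_port, 65536):
            with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
                # SO_REUSEADDR lets us grab a port lingering in TIME_WAIT,
                # which is exactly the race the diff's wait() tries to avoid.
                sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
                try:
                    sock.bind(('', port))
                    return port
                except OSError:
                    continue
        raise OSError('No free port found.')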
sky/skylet/constants.py CHANGED
@@ -20,11 +20,13 @@ SKY_RUNTIME_DIR = '${SKY_RUNTIME_DIR:-$HOME}'
 # os.path.expanduser(os.environ.get(SKY_RUNTIME_DIR_ENV_VAR_KEY, '~')),
 # '.sky/jobs.db')
 SKY_RUNTIME_DIR_ENV_VAR_KEY = 'SKY_RUNTIME_DIR'
+SKY_CLUSTER_NAME_ENV_VAR_KEY = 'SKY_CLUSTER_NAME'
 # We keep sky_logs and sky_workdir in $HOME, because
 # these are artifacts that users can access, and having
 # them be in $HOME makes it more convenient.
 SKY_LOGS_DIRECTORY = '~/sky_logs'
 SKY_REMOTE_WORKDIR = '~/sky_workdir'
+SKY_TEMPLATES_DIRECTORY = '~/sky_templates'
 SKY_IGNORE_FILE = '.skyignore'
 GIT_IGNORE_FILE = '.gitignore'
 
@@ -45,7 +47,19 @@ SKY_REMOTE_RAY_PORT_FILE = '.sky/ray_port.json'
 SKY_REMOTE_RAY_TEMPDIR = '/tmp/ray_skypilot'
 SKY_REMOTE_RAY_VERSION = '2.9.3'
 
-SKY_UNSET_PYTHONPATH = 'env -u PYTHONPATH'
+# To avoid user image causing issue with the SkyPilot runtime, we run SkyPilot
+# commands the following prefix:
+# 1. env -u PYTHONPATH: unset PYTHONPATH to avoid any package specified in
+# PYTHONPATH interfering with the SkyPilot runtime.
+# 2. env -C $HOME: set the execution directory to $HOME to avoid the case when
+# a user's WORKDIR in Dockerfile is a Python site-packages directory. Python
+# adds CWD to the beginning of sys.path, so if WORKDIR contains packages (e.g.,
+# compiled for a different Python version), imports will fail with errors like
+# "ModuleNotFoundError: No module named 'rpds.rpds'".
+#
+# TODO(zhwu): Switch -C $HOME to PYTHONSAFEPATH=1, once we moved our runtime to
+# Python 3.11 for a more robust setup.
+SKY_UNSET_PYTHONPATH_AND_SET_CWD = 'env -u PYTHONPATH -C $HOME'
 # We store the absolute path of the python executable (/opt/conda/bin/python3)
 # in this file, so that any future internal commands that need to use python
 # can use this path. This is useful for the case where the user has a custom
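The failure mode motivating `-C $HOME` above is easy to reproduce: Python puts the working directory at the front of sys.path, so a directory in the CWD sharing a name with an installed package shadows it. An illustration (the package name mirrors the error in the comment; everything here is made up for demonstration):

    import pathlib
    import subprocess

    # A directory named like an installed package sitting in the CWD...
    pathlib.Path('rpds').mkdir(exist_ok=True)
    pathlib.Path('rpds/__init__.py').touch()

    # ...wins the import over site-packages when Python starts here:
    out = subprocess.run(
        ['python3', '-c', 'import rpds; print(rpds.__file__)'],
        capture_output=True, text=True, check=True).stdout
    print(out)  # .../rpds/__init__.py from the CWD, not site-packages.
    # Running the same command under `env -C $HOME python3 ...` changes the
    # working directory before the interpreter starts, avoiding the shadowing.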
@@ -57,7 +71,8 @@ SKY_GET_PYTHON_PATH_CMD = (f'[ -s {SKY_PYTHON_PATH_FILE} ] && '
                            f'cat {SKY_PYTHON_PATH_FILE} 2> /dev/null || '
                            'which python3')
 # Python executable, e.g., /opt/conda/bin/python3
-SKY_PYTHON_CMD = f'{SKY_UNSET_PYTHONPATH} $({SKY_GET_PYTHON_PATH_CMD})'
+SKY_PYTHON_CMD = (f'{SKY_UNSET_PYTHONPATH_AND_SET_CWD} '
+                  f'$({SKY_GET_PYTHON_PATH_CMD})')
 # Prefer SKY_UV_PIP_CMD, which is faster.
 # TODO(cooperc): remove remaining usage (GCP TPU setup).
 SKY_PIP_CMD = f'{SKY_PYTHON_CMD} -m pip'
@@ -67,17 +82,30 @@ SKY_PIP_CMD = f'{SKY_PYTHON_CMD} -m pip'
 # #!/opt/conda/bin/python3
 SKY_RAY_CMD = (f'{SKY_PYTHON_CMD} $([ -s {SKY_RAY_PATH_FILE} ] && '
                f'cat {SKY_RAY_PATH_FILE} 2> /dev/null || which ray)')
+
+# Use $(which env) to find env, falling back to /usr/bin/env if which is
+# unavailable. This works around a Slurm quirk where srun's execvp() doesn't
+# check execute permissions, failing when $HOME/.local/bin/env (non-executable,
+# from uv installation) shadows /usr/bin/env.
+SKY_SLURM_UNSET_PYTHONPATH = ('$(which env 2>/dev/null || echo /usr/bin/env) '
+                              '-u PYTHONPATH')
+SKY_SLURM_PYTHON_CMD = (f'{SKY_SLURM_UNSET_PYTHONPATH} '
+                        f'$({SKY_GET_PYTHON_PATH_CMD})')
+
 # Separate env for SkyPilot runtime dependencies.
 SKY_REMOTE_PYTHON_ENV_NAME = 'skypilot-runtime'
 SKY_REMOTE_PYTHON_ENV: str = f'{SKY_RUNTIME_DIR}/{SKY_REMOTE_PYTHON_ENV_NAME}'
 ACTIVATE_SKY_REMOTE_PYTHON_ENV = f'source {SKY_REMOTE_PYTHON_ENV}/bin/activate'
+# Place the conda root in the runtime directory, as installing to $HOME
+# on an NFS takes too long (1-2m slower).
+SKY_CONDA_ROOT = f'{SKY_RUNTIME_DIR}/miniconda3'
 # uv is used for venv and pip, much faster than python implementations.
 SKY_UV_INSTALL_DIR = '"$HOME/.local/bin"'
 # set UV_SYSTEM_PYTHON to false in case the
 # user provided docker image set it to true.
 # unset PYTHONPATH in case the user provided docker image set it.
 SKY_UV_CMD = ('UV_SYSTEM_PYTHON=false '
-              f'{SKY_UNSET_PYTHONPATH} {SKY_UV_INSTALL_DIR}/uv')
+              f'{SKY_UNSET_PYTHONPATH_AND_SET_CWD} {SKY_UV_INSTALL_DIR}/uv')
 # This won't reinstall uv if it's already installed, so it's safe to re-run.
 SKY_UV_INSTALL_CMD = (f'{SKY_UV_CMD} -V >/dev/null 2>&1 || '
                       'curl -LsSf https://astral.sh/uv/install.sh '
@@ -116,7 +144,7 @@ TASK_ID_LIST_ENV_VAR = f'{SKYPILOT_ENV_VAR_PREFIX}TASK_IDS'
 # cluster yaml is updated.
 #
 # TODO(zongheng,zhanghao): make the upgrading of skylet automatic?
-SKYLET_VERSION = '27'
+SKYLET_VERSION = '29'
 # The version of the lib files that skylet/jobs use. Whenever there is an API
 # change for the job_lib or log_lib, we need to bump this version, so that the
 # user can be notified to update their SkyPilot version on the remote cluster.
@@ -162,6 +190,10 @@ DISABLE_GPU_ECC_COMMAND = (
     '{ sudo reboot || echo "Failed to reboot. ECC mode may not be disabled"; } '
     '|| true; ')
 
+SETUP_SKY_DIRS_COMMANDS = (f'mkdir -p ~/sky_workdir && '
+                           f'mkdir -p ~/.sky/sky_app && '
+                           f'mkdir -p {SKY_RUNTIME_DIR}/.sky;')
+
 # Install conda on the remote cluster if it is not already installed.
 # We use conda with python 3.10 to be consistent across multiple clouds with
 # best effort.
@@ -178,8 +210,9 @@ CONDA_INSTALLATION_COMMANDS = (
     # because for some images, conda is already installed, but not initialized.
     # In this case, we need to initialize conda and set auto_activate_base to
     # true.
-    '{ bash Miniconda3-Linux.sh -b || true; '
-    'eval "$(~/miniconda3/bin/conda shell.bash hook)" && conda init && '
+    '{ '
+    f'bash Miniconda3-Linux.sh -b -p "{SKY_CONDA_ROOT}" || true; '
+    f'eval "$({SKY_CONDA_ROOT}/bin/conda shell.bash hook)" && conda init && '
     # Caller should replace {conda_auto_activate} with either true or false.
     'conda config --set auto_activate_base {conda_auto_activate} && '
     'conda activate base; }; '
@@ -222,7 +255,7 @@ _sky_version = str(version.parse(sky.__version__))
 RAY_STATUS = f'RAY_ADDRESS=127.0.0.1:{SKY_REMOTE_RAY_PORT} {SKY_RAY_CMD} status'
 RAY_INSTALLATION_COMMANDS = (
     f'{SKY_UV_INSTALL_CMD};'
-    'mkdir -p ~/sky_workdir && mkdir -p ~/.sky/sky_app;'
+    f'{SETUP_SKY_DIRS_COMMANDS}'
     # Print the PATH in provision.log to help debug PATH issues.
     'echo PATH=$PATH; '
     # Install setuptools<=69.5.1 to avoid the issue with the latest setuptools
@@ -256,7 +289,7 @@ RAY_INSTALLATION_COMMANDS = (
     #
     # Here, we add ~/.local/bin to the end of the PATH to make sure the issues
     # mentioned above are resolved.
-    'export PATH=$PATH:$HOME/.local/bin; '
+    f'export PATH=$PATH:{SKY_RUNTIME_DIR}/.local/bin; '
     # Writes ray path to file if it does not exist or the file is empty.
     f'[ -s {SKY_RAY_PATH_FILE} ] || '
     f'{{ {SKY_UV_RUN_CMD} '
@@ -264,18 +297,23 @@ RAY_INSTALLATION_COMMANDS = (
 
 # Copy SkyPilot templates from the installed wheel to ~/sky_templates.
 # This must run after the skypilot wheel is installed.
+# Note: We remove ~/sky_templates first to avoid import conflicts where Python
+# would import from ~/sky_templates instead of site-packages (because
+# sky_templates itself is a package), leading to src == dst error when
+# launching on an existing cluster.
 COPY_SKYPILOT_TEMPLATES_COMMANDS = (
+    f'rm -rf {SKY_TEMPLATES_DIRECTORY}; '
     f'{ACTIVATE_SKY_REMOTE_PYTHON_ENV}; '
     f'{SKY_PYTHON_CMD} -c \''
     'import sky_templates, shutil, os; '
     'src = os.path.dirname(sky_templates.__file__); '
-    'dst = os.path.expanduser(\"~/sky_templates\"); '
+    f'dst = os.path.expanduser(\"{SKY_TEMPLATES_DIRECTORY}\"); '
    'print(f\"Copying templates from {src} to {dst}...\"); '
-    'shutil.copytree(src, dst, dirs_exist_ok=True); '
+    'shutil.copytree(src, dst); '
     'print(f\"Templates copied successfully\")\'; '
     # Make scripts executable.
-    'find ~/sky_templates -type f ! -name "*.py" ! -name "*.md" '
-    '-exec chmod +x {} \\; ')
+    f'find {SKY_TEMPLATES_DIRECTORY} -type f ! -name "*.py" ! -name "*.md" '
+    '-exec chmod +x {} + ; ')
 
 SKYPILOT_WHEEL_INSTALLATION_COMMANDS = (
     f'{SKY_UV_INSTALL_CMD};'
@@ -438,6 +476,7 @@ OVERRIDEABLE_CONFIG_KEYS_IN_TASK: List[Tuple[str, ...]] = [
     ('gcp', 'enable_gvnic'),
     ('gcp', 'enable_gpu_direct'),
     ('gcp', 'placement_policy'),
+    ('vast', 'datacenter_only'),
     ('active_workspace',),
 ]
 # When overriding the SkyPilot configs on the API server with the client one,
@@ -498,6 +537,9 @@ SKY_USER_FILE_PATH = '~/.sky/generated'
 ENV_VAR_IS_SKYPILOT_SERVER = 'IS_SKYPILOT_SERVER'
 OVERRIDE_CONSOLIDATION_MODE = 'IS_SKYPILOT_JOB_CONTROLLER'
 IS_SKYPILOT_SERVE_CONTROLLER = 'IS_SKYPILOT_SERVE_CONTROLLER'
+# Environment variable that is set to 'true' if rolling update strategy is
+# enabled for the API server deployment.
+SKYPILOT_ROLLING_UPDATE_ENABLED = 'SKYPILOT_ROLLING_UPDATE_ENABLED'
 
 SERVE_OVERRIDE_CONCURRENT_LAUNCHES = (
     f'{SKYPILOT_ENV_VAR_PREFIX}SERVE_OVERRIDE_CONCURRENT_LAUNCHES')
@@ -532,7 +574,7 @@ CATALOG_SCHEMA_VERSION = 'v8'
 CATALOG_DIR = '~/.sky/catalogs'
 ALL_CLOUDS = ('aws', 'azure', 'gcp', 'ibm', 'lambda', 'scp', 'oci',
               'kubernetes', 'runpod', 'vast', 'vsphere', 'cudo', 'fluidstack',
-              'paperspace', 'primeintellect', 'do', 'nebius', 'ssh',
+              'paperspace', 'primeintellect', 'do', 'nebius', 'ssh', 'slurm',
               'hyperbolic', 'seeweb', 'shadeform')
 # END constants used for service catalog.
 
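A subtle pairing in the templates hunk above: dropping dirs_exist_ok=True means shutil.copytree() now raises FileExistsError if the destination exists, which is exactly why the `rm -rf` was added in front. The equivalent Python semantics (paths are illustrative):

    import os
    import shutil

    src = '/path/to/site-packages/sky_templates'  # illustrative path
    dst = os.path.expanduser('~/sky_templates')

    # copytree without dirs_exist_ok fails when dst already exists, so the
    # destination is cleared first, mirroring the `rm -rf` in the diff.
    shutil.rmtree(dst, ignore_errors=True)
    shutil.copytree(src, dst)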
sky/skylet/events.py CHANGED
@@ -236,7 +236,7 @@ class AutostopEvent(SkyletEvent):
                 RAY_PROVISIONER_SKYPILOT_TERMINATOR):
             logger.info('Using new provisioner to stop the cluster.')
             self._stop_cluster_with_new_provisioner(autostop_config, config,
-                                                    provider_name)
+                                                    provider_name, cloud)
             return
         logger.info('Not using new provisioner to stop the cluster. '
                     f'Cloud of this cluster: {provider_name}')
@@ -314,7 +314,8 @@ class AutostopEvent(SkyletEvent):
         raise NotImplementedError
 
     def _stop_cluster_with_new_provisioner(self, autostop_config,
-                                           cluster_config, provider_name):
+                                           cluster_config, provider_name,
+                                           cloud):
         # pylint: disable=import-outside-toplevel
         from sky import provision as provision_lib
         autostop_lib.set_autostopping_started()
@@ -334,8 +335,13 @@ class AutostopEvent(SkyletEvent):
 
         # Stop the ray autoscaler to avoid scaling up, during
         # stopping/terminating of the cluster.
-        logger.info('Stopping the ray cluster.')
-        subprocess.run(f'{constants.SKY_RAY_CMD} stop', shell=True, check=True)
+        if not cloud.uses_ray():
+            logger.info('Skipping ray stop as cloud does not use Ray.')
+        else:
+            logger.info('Stopping the ray cluster.')
+            subprocess.run(f'{constants.SKY_RAY_CMD} stop',
+                           shell=True,
+                           check=True)
 
         operation_fn = provision_lib.stop_instances
         if autostop_config.down:
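The cloud.uses_ray() gate presumably corresponds to the new Cloud method in sky/clouds/cloud.py (+7 -0 in the file list). A plausible shape, assuming a default of True with the new Slurm cloud opting out; the actual signatures may differ:

    class Cloud:

        def uses_ray(self) -> bool:
            """Whether this cloud's runtime is orchestrated by Ray (assumed)."""
            return True


    class Slurm(Cloud):

        def uses_ray(self) -> bool:
            # Slurm tasks run under srun rather than a Ray cluster (assumed).
            return False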
sky/skylet/executor/__init__.py ADDED
@@ -0,0 +1 @@
+"""Task Executors"""
sky/skylet/executor/slurm.py ADDED
@@ -0,0 +1,187 @@
+"""Slurm distributed task executor for SkyPilot.
+
+This module is invoked on each Slurm compute node via:
+    srun python -m sky.skylet.executor.slurm --script=... --log-dir=...
+"""
+import argparse
+import json
+import os
+import pathlib
+import socket
+import subprocess
+import sys
+import time
+
+import colorama
+
+from sky.skylet.log_lib import run_bash_command_with_log
+
+
+def _get_ip_address() -> str:
+    """Get the IP address of the current node."""
+    # Use socket.gethostbyname to be consistent with _get_job_node_ips(),
+    # which resolves hostnames the same way. Using `hostname -I` can return
+    # Docker bridge IPs (172.17.x.x) first, causing IP mismatch errors.
+    return socket.gethostbyname(socket.gethostname())
+
+
+def _get_job_node_ips() -> str:
+    """Get IPs of all nodes in the current Slurm job."""
+    nodelist = os.environ.get('SLURM_JOB_NODELIST', '')
+    assert nodelist, 'SLURM_JOB_NODELIST is not set'
+
+    # Expand compressed nodelist (e.g., "node[1-3,5]"
+    # -> "node1\nnode2\nnode3\nnode5")
+    result = subprocess.run(['scontrol', 'show', 'hostnames', nodelist],
+                            capture_output=True,
+                            text=True,
+                            check=False)
+    if result.returncode != 0:
+        raise RuntimeError(f'Failed to get hostnames for: {nodelist}')
+
+    hostnames = result.stdout.strip().split('\n')
+    ips = []
+    for hostname in hostnames:
+        try:
+            ip = socket.gethostbyname(hostname)
+            ips.append(ip)
+        except socket.gaierror as e:
+            raise RuntimeError('Failed to get IP for hostname: '
+                               f'{hostname}') from e
+
+    return '\n'.join(ips)
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description='SkyPilot Slurm task runner for distributed execution')
+    parser.add_argument('--script', help='User script (inline, shell-quoted)')
+    parser.add_argument('--script-path',
+                        help='Path to script file (if too long for inline)')
+    parser.add_argument('--env-vars',
+                        default='{}',
+                        help='JSON-encoded environment variables')
+    parser.add_argument('--log-dir',
+                        required=True,
+                        help='Directory for log files')
+    parser.add_argument('--cluster-num-nodes',
+                        type=int,
+                        required=True,
+                        help='Total number of nodes in the cluster')
+    parser.add_argument('--cluster-ips',
+                        required=True,
+                        help='Comma-separated list of cluster node IPs')
+    parser.add_argument('--task-name',
+                        default=None,
+                        help='Task name for single-node log prefix')
+    parser.add_argument(
+        '--is-setup',
+        action='store_true',
+        help=
+        'Whether this is a setup command (affects logging prefix and filename)')
+    parser.add_argument('--alloc-signal-file',
+                        help='Path to allocation signal file')
+    parser.add_argument('--setup-done-signal-file',
+                        help='Path to setup-done signal file')
+    args = parser.parse_args()
+
+    assert args.script is not None or args.script_path is not None, (
+        'Either '
+        '--script or --script-path must be provided')
+
+    # Task rank, different from index of the node in the cluster.
+    rank = int(os.environ['SLURM_PROCID'])
+    num_nodes = int(os.environ.get('SLURM_NNODES', 1))
+    is_single_node_cluster = (args.cluster_num_nodes == 1)
+
+    # Determine node index from IP (like Ray's cluster_ips_to_node_id)
+    cluster_ips = args.cluster_ips.split(',')
+    ip_addr = _get_ip_address()
+    try:
+        node_idx = cluster_ips.index(ip_addr)
+    except ValueError as e:
+        raise RuntimeError(f'IP address {ip_addr} not found in '
+                           f'cluster IPs: {cluster_ips}') from e
+    node_name = 'head' if node_idx == 0 else f'worker{node_idx}'
+
+    # Log files are written to a shared filesystem, so each node must use a
+    # unique filename to avoid collisions.
+    if args.is_setup:
+        # TODO(kevin): This is inconsistent with other clouds, where it is
+        # simply called 'setup.log'. On Slurm that is obviously not possible,
+        # since the ~/sky_logs directory is shared by all nodes, so
+        # 'setup.log' will be overwritten by other nodes.
+        # Perhaps we should apply this naming convention to other clouds.
+        log_filename = f'setup-{node_name}.log'
+    elif is_single_node_cluster:
+        log_filename = 'run.log'
+    else:
+        log_filename = f'{rank}-{node_name}.log'
+    log_path = os.path.join(args.log_dir, log_filename)
+
+    if args.script_path:
+        with open(args.script_path, 'r', encoding='utf-8') as f:
+            script = f.read()
+    else:
+        script = args.script
+
+    # Parse env vars and add SKYPILOT environment variables
+    env_vars = json.loads(args.env_vars)
+    if not args.is_setup:
+        # For setup, env vars are set in CloudVmRayBackend._setup.
+        env_vars['SKYPILOT_NODE_RANK'] = str(rank)
+        env_vars['SKYPILOT_NUM_NODES'] = str(num_nodes)
+        env_vars['SKYPILOT_NODE_IPS'] = _get_job_node_ips()
+
+    # Signal file coordination for setup/run synchronization
+    # Rank 0 touches the allocation signal to indicate resources acquired
+    if args.alloc_signal_file is not None and rank == 0:
+        pathlib.Path(args.alloc_signal_file).touch()
+
+    # Wait for setup to complete.
+    while args.setup_done_signal_file is not None and not os.path.exists(
+            args.setup_done_signal_file):
+        time.sleep(0.1)
+
+    # Build log prefix
+    # For setup on head: (setup pid={pid})
+    # For setup on workers: (setup pid={pid}, ip=1.2.3.4)
+    # For single-node cluster: (task_name, pid={pid})
+    # For multi-node on head: (head, rank=0, pid={pid})
+    # For multi-node on workers: (worker1, rank=1, pid={pid}, ip=1.2.3.4)
+    # The {pid} placeholder will be replaced by run_with_log
+    if args.is_setup:
+        # Setup prefix: head (node_idx=0) shows no IP, workers show IP
+        if node_idx == 0:
+            prefix = (f'{colorama.Fore.CYAN}(setup pid={{pid}})'
+                      f'{colorama.Style.RESET_ALL} ')
+        else:
+            prefix = (f'{colorama.Fore.CYAN}(setup pid={{pid}}, ip={ip_addr})'
+                      f'{colorama.Style.RESET_ALL} ')
+    elif is_single_node_cluster:
+        # Single-node cluster: use task name
+        name_str = args.task_name if args.task_name else 'task'
+        prefix = (f'{colorama.Fore.CYAN}({name_str}, pid={{pid}})'
+                  f'{colorama.Style.RESET_ALL} ')
+    else:
+        # Multi-node cluster: head (node_idx=0) shows no IP, workers show IP
+        if node_idx == 0:
+            prefix = (
+                f'{colorama.Fore.CYAN}({node_name}, rank={rank}, pid={{pid}})'
+                f'{colorama.Style.RESET_ALL} ')
+        else:
+            prefix = (f'{colorama.Fore.CYAN}'
+                      f'({node_name}, rank={rank}, pid={{pid}}, ip={ip_addr})'
+                      f'{colorama.Style.RESET_ALL} ')
+
+    returncode = run_bash_command_with_log(script,
+                                           log_path,
+                                           env_vars=env_vars,
+                                           stream_logs=True,
+                                           streaming_prefix=prefix)
+
+    sys.exit(returncode)
+
+
+if __name__ == '__main__':
+    main()
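Putting the executor together end to end, a driver-side launch over a two-node allocation would look roughly like the following; all flag values are illustrative, and the real codegen in sky/backends/task_codegen.py may build the command differently:

    import subprocess

    # One srun task per node; each task derives its rank from SLURM_PROCID
    # and resolves every node's IP from SLURM_JOB_NODELIST.
    cmd = [
        'srun', '--nodes=2', '--ntasks=2',
        'python', '-m', 'sky.skylet.executor.slurm',
        '--script', 'echo "rank $SKYPILOT_NODE_RANK of $SKYPILOT_NUM_NODES"',
        '--log-dir', '/home/user/sky_logs/sky-2026-01-12-task',
        '--cluster-num-nodes', '2',
        '--cluster-ips', '10.0.0.1,10.0.0.2',
    ]
    subprocess.run(cmd, check=True)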