skypilot-nightly 1.0.0.dev20250905__py3-none-any.whl → 1.0.0.dev20251210__py3-none-any.whl

This diff shows the changes between two publicly released versions of this package, as they appear in their public registries. It is provided for informational purposes only.
Files changed (429)
  1. sky/__init__.py +12 -2
  2. sky/adaptors/aws.py +27 -22
  3. sky/adaptors/common.py +25 -2
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/gcp.py +11 -0
  7. sky/adaptors/ibm.py +5 -2
  8. sky/adaptors/kubernetes.py +64 -0
  9. sky/adaptors/nebius.py +3 -1
  10. sky/adaptors/primeintellect.py +1 -0
  11. sky/adaptors/seeweb.py +183 -0
  12. sky/adaptors/shadeform.py +89 -0
  13. sky/adaptors/slurm.py +478 -0
  14. sky/admin_policy.py +20 -0
  15. sky/authentication.py +157 -263
  16. sky/backends/__init__.py +3 -2
  17. sky/backends/backend.py +11 -3
  18. sky/backends/backend_utils.py +630 -185
  19. sky/backends/cloud_vm_ray_backend.py +1111 -928
  20. sky/backends/local_docker_backend.py +9 -5
  21. sky/backends/task_codegen.py +971 -0
  22. sky/backends/wheel_utils.py +18 -0
  23. sky/catalog/__init__.py +8 -3
  24. sky/catalog/aws_catalog.py +4 -0
  25. sky/catalog/common.py +19 -1
  26. sky/catalog/data_fetchers/fetch_aws.py +102 -80
  27. sky/catalog/data_fetchers/fetch_gcp.py +30 -3
  28. sky/catalog/data_fetchers/fetch_nebius.py +9 -6
  29. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  30. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  31. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  32. sky/catalog/kubernetes_catalog.py +36 -32
  33. sky/catalog/primeintellect_catalog.py +95 -0
  34. sky/catalog/runpod_catalog.py +5 -1
  35. sky/catalog/seeweb_catalog.py +184 -0
  36. sky/catalog/shadeform_catalog.py +165 -0
  37. sky/catalog/slurm_catalog.py +243 -0
  38. sky/check.py +87 -46
  39. sky/client/cli/command.py +1004 -434
  40. sky/client/cli/flags.py +4 -2
  41. sky/{volumes/utils.py → client/cli/table_utils.py} +111 -13
  42. sky/client/cli/utils.py +79 -0
  43. sky/client/common.py +12 -2
  44. sky/client/sdk.py +188 -65
  45. sky/client/sdk_async.py +34 -33
  46. sky/cloud_stores.py +82 -3
  47. sky/clouds/__init__.py +8 -0
  48. sky/clouds/aws.py +337 -129
  49. sky/clouds/azure.py +24 -18
  50. sky/clouds/cloud.py +47 -13
  51. sky/clouds/cudo.py +16 -13
  52. sky/clouds/do.py +9 -7
  53. sky/clouds/fluidstack.py +12 -5
  54. sky/clouds/gcp.py +14 -7
  55. sky/clouds/hyperbolic.py +12 -5
  56. sky/clouds/ibm.py +12 -5
  57. sky/clouds/kubernetes.py +80 -45
  58. sky/clouds/lambda_cloud.py +12 -5
  59. sky/clouds/nebius.py +23 -9
  60. sky/clouds/oci.py +19 -12
  61. sky/clouds/paperspace.py +4 -1
  62. sky/clouds/primeintellect.py +317 -0
  63. sky/clouds/runpod.py +85 -24
  64. sky/clouds/scp.py +12 -8
  65. sky/clouds/seeweb.py +477 -0
  66. sky/clouds/shadeform.py +400 -0
  67. sky/clouds/slurm.py +578 -0
  68. sky/clouds/ssh.py +6 -3
  69. sky/clouds/utils/scp_utils.py +61 -50
  70. sky/clouds/vast.py +43 -27
  71. sky/clouds/vsphere.py +14 -16
  72. sky/core.py +296 -195
  73. sky/dashboard/out/404.html +1 -1
  74. sky/dashboard/out/_next/static/KYAhEFa3FTfq4JyKVgo-s/_buildManifest.js +1 -0
  75. sky/dashboard/out/_next/static/chunks/1141-9c810f01ff4f398a.js +11 -0
  76. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
  77. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  78. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  79. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
  80. sky/dashboard/out/_next/static/chunks/3294.ddda8c6c6f9f24dc.js +1 -0
  81. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
  82. sky/dashboard/out/_next/static/chunks/{6601-06114c982db410b6.js → 3800-b589397dc09c5b4e.js} +1 -1
  83. sky/dashboard/out/_next/static/chunks/3850-fd5696f3bbbaddae.js +1 -0
  84. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
  85. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  86. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  87. sky/dashboard/out/_next/static/chunks/6856-da20c5fd999f319c.js +1 -0
  88. sky/dashboard/out/_next/static/chunks/6990-09cbf02d3cd518c3.js +1 -0
  89. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  90. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
  91. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  92. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
  93. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  94. sky/dashboard/out/_next/static/chunks/9353-8369df1cf105221c.js +1 -0
  95. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
  96. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  97. sky/dashboard/out/_next/static/chunks/pages/_app-68b647e26f9d2793.js +34 -0
  98. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-33f525539665fdfd.js +16 -0
  99. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a7565f586ef86467.js +1 -0
  100. sky/dashboard/out/_next/static/chunks/pages/clusters-9e5d47818b9bdadd.js +1 -0
  101. sky/dashboard/out/_next/static/chunks/pages/{config-dfb9bf07b13045f4.js → config-718cdc365de82689.js} +1 -1
  102. sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-6563820e094f68ca.js → [context]-12c559ec4d81fdbd.js} +1 -1
  103. sky/dashboard/out/_next/static/chunks/pages/{infra-aabba60d57826e0f.js → infra-d187cd0413d72475.js} +1 -1
  104. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-895847b6cf200b04.js +16 -0
  105. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-8d0f4655400b4eb9.js +21 -0
  106. sky/dashboard/out/_next/static/chunks/pages/jobs-e5a98f17f8513a96.js +1 -0
  107. sky/dashboard/out/_next/static/chunks/pages/plugins/[...slug]-4f46050ca065d8f8.js +1 -0
  108. sky/dashboard/out/_next/static/chunks/pages/users-2f7646eb77785a2c.js +1 -0
  109. sky/dashboard/out/_next/static/chunks/pages/volumes-ef19d49c6d0e8500.js +1 -0
  110. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-af76bb06dbb3954f.js → [name]-96e0f298308da7e2.js} +1 -1
  111. sky/dashboard/out/_next/static/chunks/pages/{workspaces-7598c33a746cdc91.js → workspaces-cb4da3abe08ebf19.js} +1 -1
  112. sky/dashboard/out/_next/static/chunks/webpack-fba3de387ff6bb08.js +1 -0
  113. sky/dashboard/out/_next/static/css/c5a4cfd2600fc715.css +3 -0
  114. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  115. sky/dashboard/out/clusters/[cluster].html +1 -1
  116. sky/dashboard/out/clusters.html +1 -1
  117. sky/dashboard/out/config.html +1 -1
  118. sky/dashboard/out/index.html +1 -1
  119. sky/dashboard/out/infra/[context].html +1 -1
  120. sky/dashboard/out/infra.html +1 -1
  121. sky/dashboard/out/jobs/[job].html +1 -1
  122. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  123. sky/dashboard/out/jobs.html +1 -1
  124. sky/dashboard/out/plugins/[...slug].html +1 -0
  125. sky/dashboard/out/users.html +1 -1
  126. sky/dashboard/out/volumes.html +1 -1
  127. sky/dashboard/out/workspace/new.html +1 -1
  128. sky/dashboard/out/workspaces/[name].html +1 -1
  129. sky/dashboard/out/workspaces.html +1 -1
  130. sky/data/data_utils.py +92 -1
  131. sky/data/mounting_utils.py +177 -30
  132. sky/data/storage.py +200 -19
  133. sky/data/storage_utils.py +10 -45
  134. sky/exceptions.py +18 -7
  135. sky/execution.py +74 -31
  136. sky/global_user_state.py +605 -191
  137. sky/jobs/__init__.py +2 -0
  138. sky/jobs/client/sdk.py +101 -4
  139. sky/jobs/client/sdk_async.py +31 -5
  140. sky/jobs/constants.py +15 -8
  141. sky/jobs/controller.py +726 -284
  142. sky/jobs/file_content_utils.py +128 -0
  143. sky/jobs/log_gc.py +193 -0
  144. sky/jobs/recovery_strategy.py +250 -100
  145. sky/jobs/scheduler.py +271 -173
  146. sky/jobs/server/core.py +367 -114
  147. sky/jobs/server/server.py +81 -35
  148. sky/jobs/server/utils.py +89 -35
  149. sky/jobs/state.py +1498 -620
  150. sky/jobs/utils.py +771 -306
  151. sky/logs/agent.py +40 -5
  152. sky/logs/aws.py +9 -19
  153. sky/metrics/utils.py +282 -39
  154. sky/models.py +2 -0
  155. sky/optimizer.py +7 -6
  156. sky/provision/__init__.py +38 -1
  157. sky/provision/aws/config.py +34 -13
  158. sky/provision/aws/instance.py +5 -2
  159. sky/provision/azure/instance.py +5 -3
  160. sky/provision/common.py +22 -0
  161. sky/provision/cudo/instance.py +4 -3
  162. sky/provision/do/instance.py +4 -3
  163. sky/provision/docker_utils.py +112 -28
  164. sky/provision/fluidstack/instance.py +6 -5
  165. sky/provision/gcp/config.py +6 -1
  166. sky/provision/gcp/instance.py +4 -2
  167. sky/provision/hyperbolic/instance.py +4 -2
  168. sky/provision/instance_setup.py +66 -20
  169. sky/provision/kubernetes/__init__.py +2 -0
  170. sky/provision/kubernetes/config.py +7 -44
  171. sky/provision/kubernetes/constants.py +0 -1
  172. sky/provision/kubernetes/instance.py +609 -213
  173. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  174. sky/provision/kubernetes/network.py +12 -8
  175. sky/provision/kubernetes/network_utils.py +8 -25
  176. sky/provision/kubernetes/utils.py +422 -422
  177. sky/provision/kubernetes/volume.py +150 -18
  178. sky/provision/lambda_cloud/instance.py +16 -13
  179. sky/provision/nebius/instance.py +6 -2
  180. sky/provision/nebius/utils.py +103 -86
  181. sky/provision/oci/instance.py +4 -2
  182. sky/provision/paperspace/instance.py +4 -3
  183. sky/provision/primeintellect/__init__.py +10 -0
  184. sky/provision/primeintellect/config.py +11 -0
  185. sky/provision/primeintellect/instance.py +454 -0
  186. sky/provision/primeintellect/utils.py +398 -0
  187. sky/provision/provisioner.py +45 -15
  188. sky/provision/runpod/__init__.py +2 -0
  189. sky/provision/runpod/instance.py +4 -3
  190. sky/provision/runpod/volume.py +69 -13
  191. sky/provision/scp/instance.py +307 -130
  192. sky/provision/seeweb/__init__.py +11 -0
  193. sky/provision/seeweb/config.py +13 -0
  194. sky/provision/seeweb/instance.py +812 -0
  195. sky/provision/shadeform/__init__.py +11 -0
  196. sky/provision/shadeform/config.py +12 -0
  197. sky/provision/shadeform/instance.py +351 -0
  198. sky/provision/shadeform/shadeform_utils.py +83 -0
  199. sky/provision/slurm/__init__.py +12 -0
  200. sky/provision/slurm/config.py +13 -0
  201. sky/provision/slurm/instance.py +572 -0
  202. sky/provision/slurm/utils.py +583 -0
  203. sky/provision/vast/instance.py +9 -4
  204. sky/provision/vast/utils.py +10 -6
  205. sky/provision/volume.py +164 -0
  206. sky/provision/vsphere/common/ssl_helper.py +1 -1
  207. sky/provision/vsphere/common/vapiconnect.py +2 -1
  208. sky/provision/vsphere/common/vim_utils.py +3 -2
  209. sky/provision/vsphere/instance.py +8 -6
  210. sky/provision/vsphere/vsphere_utils.py +8 -1
  211. sky/resources.py +11 -3
  212. sky/schemas/api/responses.py +107 -6
  213. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  214. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  215. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  216. sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
  217. sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
  218. sky/schemas/db/serve_state/002_yaml_content.py +34 -0
  219. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  220. sky/schemas/db/spot_jobs/002_cluster_pool.py +3 -3
  221. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  222. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  223. sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
  224. sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
  225. sky/schemas/generated/jobsv1_pb2.py +86 -0
  226. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  227. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  228. sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
  229. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  230. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  231. sky/schemas/generated/servev1_pb2.py +58 -0
  232. sky/schemas/generated/servev1_pb2.pyi +115 -0
  233. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  234. sky/serve/autoscalers.py +2 -0
  235. sky/serve/client/impl.py +55 -21
  236. sky/serve/constants.py +4 -3
  237. sky/serve/controller.py +17 -11
  238. sky/serve/load_balancing_policies.py +1 -1
  239. sky/serve/replica_managers.py +219 -142
  240. sky/serve/serve_rpc_utils.py +179 -0
  241. sky/serve/serve_state.py +63 -54
  242. sky/serve/serve_utils.py +145 -109
  243. sky/serve/server/core.py +46 -25
  244. sky/serve/server/impl.py +311 -162
  245. sky/serve/server/server.py +21 -19
  246. sky/serve/service.py +84 -68
  247. sky/serve/service_spec.py +45 -7
  248. sky/server/auth/loopback.py +38 -0
  249. sky/server/auth/oauth2_proxy.py +12 -7
  250. sky/server/common.py +47 -24
  251. sky/server/config.py +62 -28
  252. sky/server/constants.py +9 -1
  253. sky/server/daemons.py +109 -38
  254. sky/server/metrics.py +76 -96
  255. sky/server/middleware_utils.py +166 -0
  256. sky/server/plugins.py +222 -0
  257. sky/server/requests/executor.py +384 -145
  258. sky/server/requests/payloads.py +83 -19
  259. sky/server/requests/preconditions.py +15 -13
  260. sky/server/requests/request_names.py +123 -0
  261. sky/server/requests/requests.py +511 -157
  262. sky/server/requests/serializers/decoders.py +48 -17
  263. sky/server/requests/serializers/encoders.py +102 -20
  264. sky/server/requests/serializers/return_value_serializers.py +60 -0
  265. sky/server/requests/threads.py +117 -0
  266. sky/server/rest.py +116 -24
  267. sky/server/server.py +497 -179
  268. sky/server/server_utils.py +30 -0
  269. sky/server/stream_utils.py +219 -45
  270. sky/server/uvicorn.py +30 -19
  271. sky/setup_files/MANIFEST.in +6 -1
  272. sky/setup_files/alembic.ini +8 -0
  273. sky/setup_files/dependencies.py +64 -19
  274. sky/setup_files/setup.py +44 -44
  275. sky/sky_logging.py +13 -5
  276. sky/skylet/attempt_skylet.py +116 -24
  277. sky/skylet/configs.py +3 -1
  278. sky/skylet/constants.py +139 -29
  279. sky/skylet/events.py +74 -14
  280. sky/skylet/executor/__init__.py +1 -0
  281. sky/skylet/executor/slurm.py +189 -0
  282. sky/skylet/job_lib.py +143 -105
  283. sky/skylet/log_lib.py +252 -8
  284. sky/skylet/log_lib.pyi +47 -7
  285. sky/skylet/providers/ibm/node_provider.py +12 -8
  286. sky/skylet/providers/ibm/vpc_provider.py +13 -12
  287. sky/skylet/runtime_utils.py +21 -0
  288. sky/skylet/services.py +524 -0
  289. sky/skylet/skylet.py +27 -2
  290. sky/skylet/subprocess_daemon.py +104 -28
  291. sky/skypilot_config.py +99 -79
  292. sky/ssh_node_pools/constants.py +12 -0
  293. sky/ssh_node_pools/core.py +40 -3
  294. sky/ssh_node_pools/deploy/__init__.py +4 -0
  295. sky/ssh_node_pools/deploy/deploy.py +952 -0
  296. sky/ssh_node_pools/deploy/tunnel_utils.py +199 -0
  297. sky/ssh_node_pools/deploy/utils.py +173 -0
  298. sky/ssh_node_pools/server.py +20 -21
  299. sky/{utils/kubernetes/ssh_utils.py → ssh_node_pools/utils.py} +9 -6
  300. sky/task.py +221 -104
  301. sky/templates/aws-ray.yml.j2 +1 -0
  302. sky/templates/azure-ray.yml.j2 +1 -0
  303. sky/templates/cudo-ray.yml.j2 +1 -0
  304. sky/templates/do-ray.yml.j2 +1 -0
  305. sky/templates/fluidstack-ray.yml.j2 +1 -0
  306. sky/templates/gcp-ray.yml.j2 +1 -0
  307. sky/templates/hyperbolic-ray.yml.j2 +1 -0
  308. sky/templates/ibm-ray.yml.j2 +2 -1
  309. sky/templates/jobs-controller.yaml.j2 +3 -0
  310. sky/templates/kubernetes-ray.yml.j2 +204 -55
  311. sky/templates/lambda-ray.yml.j2 +1 -0
  312. sky/templates/nebius-ray.yml.j2 +3 -0
  313. sky/templates/oci-ray.yml.j2 +1 -0
  314. sky/templates/paperspace-ray.yml.j2 +1 -0
  315. sky/templates/primeintellect-ray.yml.j2 +72 -0
  316. sky/templates/runpod-ray.yml.j2 +1 -0
  317. sky/templates/scp-ray.yml.j2 +1 -0
  318. sky/templates/seeweb-ray.yml.j2 +171 -0
  319. sky/templates/shadeform-ray.yml.j2 +73 -0
  320. sky/templates/slurm-ray.yml.j2 +85 -0
  321. sky/templates/vast-ray.yml.j2 +2 -0
  322. sky/templates/vsphere-ray.yml.j2 +1 -0
  323. sky/templates/websocket_proxy.py +188 -43
  324. sky/usage/usage_lib.py +16 -4
  325. sky/users/model.conf +1 -1
  326. sky/users/permission.py +84 -44
  327. sky/users/rbac.py +31 -3
  328. sky/utils/accelerator_registry.py +6 -3
  329. sky/utils/admin_policy_utils.py +18 -5
  330. sky/utils/annotations.py +128 -6
  331. sky/utils/asyncio_utils.py +78 -0
  332. sky/utils/atomic.py +1 -1
  333. sky/utils/auth_utils.py +153 -0
  334. sky/utils/cli_utils/status_utils.py +12 -7
  335. sky/utils/cluster_utils.py +28 -6
  336. sky/utils/command_runner.py +283 -30
  337. sky/utils/command_runner.pyi +63 -7
  338. sky/utils/common.py +3 -1
  339. sky/utils/common_utils.py +55 -7
  340. sky/utils/config_utils.py +1 -14
  341. sky/utils/context.py +127 -40
  342. sky/utils/context_utils.py +73 -18
  343. sky/utils/controller_utils.py +229 -70
  344. sky/utils/db/db_utils.py +95 -18
  345. sky/utils/db/kv_cache.py +149 -0
  346. sky/utils/db/migration_utils.py +24 -7
  347. sky/utils/env_options.py +4 -0
  348. sky/utils/git.py +559 -1
  349. sky/utils/kubernetes/create_cluster.sh +15 -30
  350. sky/utils/kubernetes/delete_cluster.sh +10 -7
  351. sky/utils/kubernetes/generate_kind_config.py +6 -66
  352. sky/utils/kubernetes/gpu_labeler.py +13 -3
  353. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
  354. sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
  355. sky/utils/kubernetes/kubernetes_deploy_utils.py +187 -260
  356. sky/utils/kubernetes/rsync_helper.sh +11 -3
  357. sky/utils/kubernetes/ssh-tunnel.sh +7 -376
  358. sky/utils/kubernetes_enums.py +7 -15
  359. sky/utils/lock_events.py +4 -4
  360. sky/utils/locks.py +128 -31
  361. sky/utils/log_utils.py +0 -319
  362. sky/utils/resource_checker.py +13 -10
  363. sky/utils/resources_utils.py +53 -29
  364. sky/utils/rich_utils.py +8 -4
  365. sky/utils/schemas.py +138 -52
  366. sky/utils/subprocess_utils.py +17 -4
  367. sky/utils/thread_utils.py +91 -0
  368. sky/utils/timeline.py +2 -1
  369. sky/utils/ux_utils.py +35 -1
  370. sky/utils/volume.py +88 -4
  371. sky/utils/yaml_utils.py +9 -0
  372. sky/volumes/client/sdk.py +48 -10
  373. sky/volumes/server/core.py +59 -22
  374. sky/volumes/server/server.py +46 -17
  375. sky/volumes/volume.py +54 -42
  376. sky/workspaces/core.py +57 -21
  377. sky/workspaces/server.py +13 -12
  378. sky_templates/README.md +3 -0
  379. sky_templates/__init__.py +3 -0
  380. sky_templates/ray/__init__.py +0 -0
  381. sky_templates/ray/start_cluster +183 -0
  382. sky_templates/ray/stop_cluster +75 -0
  383. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/METADATA +343 -65
  384. skypilot_nightly-1.0.0.dev20251210.dist-info/RECORD +629 -0
  385. skypilot_nightly-1.0.0.dev20251210.dist-info/top_level.txt +2 -0
  386. sky/client/cli/git.py +0 -549
  387. sky/dashboard/out/_next/static/chunks/1121-408ed10b2f9fce17.js +0 -1
  388. sky/dashboard/out/_next/static/chunks/1141-943efc7aff0f0c06.js +0 -1
  389. sky/dashboard/out/_next/static/chunks/1836-37fede578e2da5f8.js +0 -40
  390. sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +0 -1
  391. sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
  392. sky/dashboard/out/_next/static/chunks/3785.4872a2f3aa489880.js +0 -1
  393. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +0 -1
  394. sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +0 -21
  395. sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +0 -10
  396. sky/dashboard/out/_next/static/chunks/4725.10f7a9a5d3ea8208.js +0 -1
  397. sky/dashboard/out/_next/static/chunks/5339.3fda4a4010ff4e06.js +0 -51
  398. sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
  399. sky/dashboard/out/_next/static/chunks/649.b9d7f7d10c1b8c53.js +0 -45
  400. sky/dashboard/out/_next/static/chunks/6856-dca7962af4814e1b.js +0 -1
  401. sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
  402. sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +0 -6
  403. sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +0 -18
  404. sky/dashboard/out/_next/static/chunks/7669.1f5d9a402bf5cc42.js +0 -36
  405. sky/dashboard/out/_next/static/chunks/8969-0be3036bf86f8256.js +0 -1
  406. sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
  407. sky/dashboard/out/_next/static/chunks/9037-fa1737818d0a0969.js +0 -6
  408. sky/dashboard/out/_next/static/chunks/pages/_app-ce361c6959bc2001.js +0 -34
  409. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1cbba24bd1bd35f8.js +0 -16
  410. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-0b4b35dc1dfe046c.js +0 -16
  411. sky/dashboard/out/_next/static/chunks/pages/clusters-469814d711d63b1b.js +0 -1
  412. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-dd64309c3fe67ed2.js +0 -11
  413. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-07349868f7905d37.js +0 -16
  414. sky/dashboard/out/_next/static/chunks/pages/jobs-1f70d9faa564804f.js +0 -1
  415. sky/dashboard/out/_next/static/chunks/pages/users-018bf31cda52e11b.js +0 -1
  416. sky/dashboard/out/_next/static/chunks/pages/volumes-739726d6b823f532.js +0 -1
  417. sky/dashboard/out/_next/static/chunks/webpack-4fe903277b57b523.js +0 -1
  418. sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
  419. sky/dashboard/out/_next/static/mS-4qZPSkRuA1u-g2wQhg/_buildManifest.js +0 -1
  420. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  421. sky/utils/kubernetes/cleanup-tunnel.sh +0 -62
  422. sky/utils/kubernetes/deploy_remote_cluster.py +0 -1299
  423. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  424. skypilot_nightly-1.0.0.dev20250905.dist-info/RECORD +0 -547
  425. skypilot_nightly-1.0.0.dev20250905.dist-info/top_level.txt +0 -1
  426. /sky/dashboard/out/_next/static/{mS-4qZPSkRuA1u-g2wQhg → KYAhEFa3FTfq4JyKVgo-s}/_ssgManifest.js +0 -0
  427. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/WHEEL +0 -0
  428. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/entry_points.txt +0 -0
  429. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/licenses/LICENSE +0 -0
sky/backends/task_codegen.py (new file)
@@ -0,0 +1,971 @@
+ """Code generator for task execution."""
+
+ import copy
+ import inspect
+ import json
+ import math
+ import os
+ import textwrap
+ from typing import Dict, List, Optional, Tuple
+
+ import colorama
+
+ from sky import sky_logging
+ from sky.skylet import constants
+ from sky.skylet import log_lib
+ from sky.utils import accelerator_registry
+ from sky.utils import ux_utils
+
+ # Unset RAY_RAYLET_PID to prevent the Ray cluster in the SkyPilot runtime
+ # from interfering with the Ray cluster in the user's task (if any).
+ UNSET_RAY_ENV_VARS = ['RAY_RAYLET_PID']
+
+ logger = sky_logging.init_logger(__name__)
+
+
+ class TaskCodeGen:
+ """Base code generator for task execution on Ray and Slurm."""
+
+ def __init__(self) -> None:
+ # Code generated so far, to be joined via '\n'.
+ self._code: List[str] = []
+ # Guard method calling order.
+ self._has_prologue: bool = False
+ self._has_epilogue: bool = False
+ self._has_setup: bool = False
+ # Job ID is used to identify the job (also this generated code).
+ self.job_id: Optional[int] = None
+
+ def _add_common_imports(self) -> None:
+ """Add common imports for both Ray and Slurm execution."""
+ self._code.append(
+ textwrap.dedent("""\
+ import functools
+ import getpass
+ import hashlib
+ import io
+ import os
+ import pathlib
+ import selectors
+ import shlex
+ import subprocess
+ import sys
+ import tempfile
+ import textwrap
+ import time
+ from typing import Dict, List, Optional, Tuple, Union
+ """))
+
+ def _add_skylet_imports(self) -> None:
+ """Add SkyPilot skylet imports."""
+ self._code.append(
+ textwrap.dedent("""\
+ from sky.skylet import autostop_lib
+ from sky.skylet import constants
+ from sky.skylet import job_lib
+ from sky.utils import log_utils
+ from sky.utils import subprocess_utils
+ """))
+
+ def _add_logging_functions(self) -> None:
+ """Add log streaming functions from log_lib."""
+ self._code += [
+ # FIXME: This is a hack to make sure that the functions can be found
+ # by ray.remote. This should be removed once we have a better way to
+ # specify dependencies for ray.
+ inspect.getsource(log_lib._ProcessingArgs), # pylint: disable=protected-access
+ inspect.getsource(log_lib._get_context), # pylint: disable=protected-access
+ inspect.getsource(log_lib._handle_io_stream), # pylint: disable=protected-access
+ inspect.getsource(log_lib.process_subprocess_stream),
+ inspect.getsource(log_lib.run_with_log),
+ inspect.getsource(log_lib.make_task_bash_script),
+ inspect.getsource(log_lib.add_ray_env_vars),
+ inspect.getsource(log_lib.run_bash_command_with_log),
+ inspect.getsource(log_lib.run_bash_command_with_log_and_return_pid),
+ ]
+
+ def _add_waiting_for_resources_msg(self, num_nodes: int) -> None:
+ self._code.append(
+ textwrap.dedent(f"""\
+ plural = 's' if {num_nodes} > 1 else ''
+ node_str = f'{num_nodes} node{{plural}}'
+ message = ('{ux_utils.INDENT_SYMBOL}{colorama.Style.DIM}'
+ 'Waiting for task resources on '
+ f'{{node_str}}.{colorama.Style.RESET_ALL}')
+ print(message, flush=True)"""))
+
+ def _get_job_started_msg(self) -> str:
+ """Returns the 'Job started' streaming message with ANSI formatting."""
+ return (
+ f'{ux_utils.INDENT_LAST_SYMBOL}Job started. Streaming logs... '
+ f'{colorama.Style.DIM}(Ctrl-C to exit log streaming; job will not '
+ f'be killed){colorama.Style.RESET_ALL}')
+
+ def _add_job_started_msg(self) -> None:
+ streaming_message = self._get_job_started_msg()
+ self._code.append(f'print({streaming_message!r}, flush=True)')
+
+ def _get_accelerator_details(
+ self,
+ resources_dict: Dict[str, float],
+ ) -> Tuple[Optional[str], float]:
+ resources_copy = resources_dict.copy()
+ resources_copy.pop('CPU', None)
+
+ if not resources_copy:
+ return None, 0.0
+
+ assert len(resources_copy) == 1, (
+ 'There can only be one type of accelerator per instance. '
+ f'Found: {resources_copy}.')
+
+ acc_name, acc_count = list(resources_copy.items())[0]
+ return acc_name, float(acc_count)
+
+ def _add_constants(self) -> None:
+ self._code.append(
+ textwrap.dedent(f"""\
+ SKY_REMOTE_WORKDIR = {constants.SKY_REMOTE_WORKDIR!r}
+
+ CANCELLED_RETURN_CODE = 137
+ """))
+
+ def _get_rclone_flush_script(self) -> str:
+ """Generate rclone flush script for cached storage mounts.
+
+ This script blocks job completion until all storage mounted with
+ CACHED_MOUNT mode is uploaded to remote.
+
+ Returns:
+ Bash script as string
+ """
+ return textwrap.dedent(f"""\
+
+ # Only waits if cached mount is enabled (RCLONE_MOUNT_CACHED_LOG_DIR is not empty)
+ # findmnt alone is not enough, as some clouds (e.g. AWS on ARM64) uses
+ # rclone for normal mounts as well.
+ if [ $(findmnt -t fuse.rclone --noheading | wc -l) -gt 0 ] && \
+ [ -d {constants.RCLONE_MOUNT_CACHED_LOG_DIR} ] && \
+ [ "$(ls -A {constants.RCLONE_MOUNT_CACHED_LOG_DIR})" ]; then
+ flushed=0
+ # extra second on top of --vfs-cache-poll-interval to
+ # avoid race condition between rclone log line creation and this check.
+ sleep 1
+ while [ $flushed -eq 0 ]; do
+ # sleep for the same interval as --vfs-cache-poll-interval
+ sleep {constants.RCLONE_CACHE_REFRESH_INTERVAL}
+ flushed=1
+ for file in {constants.RCLONE_MOUNT_CACHED_LOG_DIR}/*; do
+ exitcode=0
+ tac $file | grep "vfs cache: cleaned:" -m 1 | grep "in use 0, to upload 0, uploading 0" -q || exitcode=$?
+ if [ $exitcode -ne 0 ]; then
+ echo "skypilot: cached mount is still uploading to remote"
+ flushed=0
+ break
+ fi
+ done
+ done
+ echo "skypilot: cached mount uploaded complete"
+ fi""")
+
+ def add_prologue(self, job_id: int) -> None:
+ """Initialize code generator and add prologue code.
+
+ Args:
+ job_id: SkyPilot internal job ID
+ """
+ raise NotImplementedError
+
+ def add_setup(
+ self,
+ num_nodes: int,
+ resources_dict: Dict[str, float],
+ stable_cluster_internal_ips: List[str],
+ env_vars: Dict[str, str],
+ log_dir: str,
+ setup_cmd: Optional[str] = None,
+ ) -> None:
+ """Generates code to set up the task on each node.
+
+ stable_cluster_internal_ips is used to ensure that the
+ SKYPILOT_NODE_RANK environment variable is assigned in a
+ deterministic order whenever a new task is added.
+ """
+ raise NotImplementedError
+
+ def add_task(
+ self,
+ num_nodes: int,
+ bash_script: Optional[str],
+ task_name: Optional[str],
+ resources_dict: Dict[str, float],
+ log_dir: str,
+ env_vars: Optional[Dict[str, str]] = None,
+ ) -> None:
+ """Generates code to run the bash command on all num_nodes nodes."""
+ raise NotImplementedError
+
+ def add_epilogue(self) -> None:
+ """Generate code that checks return codes and updates job status."""
+ assert self._has_prologue, 'Call add_prologue() before add_epilogue().'
+ assert not self._has_epilogue, 'add_epilogue() called twice?'
+ self._has_epilogue = True
+
+ self._code += [
+ textwrap.dedent(f"""\
+ if sum(returncodes) != 0:
+ job_lib.set_status({self.job_id!r}, job_lib.JobStatus.FAILED)
+ # Schedule the next pending job immediately to make the job
+ # scheduling more efficient.
+ job_lib.scheduler.schedule_step()
+ # This waits for all streaming logs to finish.
+ time.sleep(0.5)
+ reason = ''
+ # 139 is the return code of SIGSEGV, i.e. Segmentation Fault.
+ if any(r == 139 for r in returncodes):
+ reason = '(likely due to Segmentation Fault)'
+ if any(r == 137 for r in returncodes):
+ # Find the first non-137 return code
+ non_137 = next(r for r in returncodes if r != 137)
+ reason = f'(A Worker failed with return code {{non_137}}, SkyPilot cleaned up the processes on other nodes with return code 137)'
+ print('ERROR: {colorama.Fore.RED}Job {self.job_id} failed with '
+ 'return code list:{colorama.Style.RESET_ALL}',
+ returncodes,
+ reason,
+ flush=True)
+ # Need this to set the job status in ray job to be FAILED.
+ sys.exit(1)
+ else:
+ job_lib.set_status({self.job_id!r}, job_lib.JobStatus.SUCCEEDED)
+ # Schedule the next pending job immediately to make the job
+ # scheduling more efficient.
+ job_lib.scheduler.schedule_step()
+ # This waits for all streaming logs to finish.
+ time.sleep(0.5)
+ """)
+ ]
+
+ def build(self) -> str:
+ """Returns the entire generated program."""
+ assert self._has_epilogue, 'Call add_epilogue() before build().'
+ return '\n'.join(self._code)
+
+
+ class RayCodeGen(TaskCodeGen):
+ """Code generator of a Ray program that executes a sky.Task.
+
+ Usage:
+
+ >> codegen = RayCodegen()
+ >> codegen.add_prologue()
+
+ >> codegen.add_task(...)
+ >> codegen.add_task(...)
+
+ >> codegen.add_epilogue()
+ >> code = codegen.build()
+ """
+
+ def add_prologue(self, job_id: int) -> None:
+ assert not self._has_prologue, 'add_prologue() called twice?'
+ self._has_prologue = True
+ self.job_id = job_id
+ # Should use 'auto' or 'ray://<internal_head_ip>:10001' rather than
+ # 'ray://localhost:10001', or 'ray://127.0.0.1:10001', for public cloud.
+ # Otherwise, ray will fail to get the placement group because of a bug
+ # in ray job.
+ ray_address = 'auto'
+
+ # Add common imports
+ self._add_common_imports()
+
+ # Add Ray-specific setup
+ self._code.append(
+ textwrap.dedent("""\
+ # Set the environment variables to avoid deduplicating logs and
+ # scheduler events. This should be set in driver code, since we are
+ # not using `ray job submit` anymore, and the environment variables
+ # from the ray cluster is not inherited.
+ os.environ['RAY_DEDUP_LOGS'] = '0'
+ os.environ['RAY_SCHEDULER_EVENTS'] = '0'
+
+ import ray
+ import ray.util as ray_util
+ """))
+
+ self._add_skylet_imports()
+
+ self._add_constants()
+
+ # Add Ray configuration
+ self._code.append(
+ textwrap.dedent(f"""\
+ kwargs = dict()
+ # Only set the `_temp_dir` to SkyPilot's ray cluster directory when
+ # the directory exists for backward compatibility for the VM
+ # launched before #1790.
+ if os.path.exists({constants.SKY_REMOTE_RAY_TEMPDIR!r}):
+ kwargs['_temp_dir'] = {constants.SKY_REMOTE_RAY_TEMPDIR!r}
+ ray.init(
+ address={ray_address!r},
+ namespace='__sky__{job_id}__',
+ log_to_driver=True,
+ **kwargs
+ )
+ def get_or_fail(futures, pg) -> List[int]:
+ \"\"\"Wait for tasks, if any fails, cancel all unready.\"\"\"
+ if not futures:
+ return [], []
+ returncodes = [1] * len(futures)
+ pids = [None] * len(futures)
+ failed = False
+ # Wait for 1 task to be ready.
+ ready = []
+ # Keep invoking ray.wait if ready is empty. This is because
+ # ray.wait with timeout=None will only wait for 10**6 seconds,
+ # which will cause tasks running for more than 12 days to return
+ # before becoming ready.
+ # (Such tasks are common in serving jobs.)
+ # Reference: https://github.com/ray-project/ray/blob/ray-2.9.3/python/ray/_private/worker.py#L2845-L2846
+
+ def handle_ready_tasks(tasks: List[ray.ObjectRef]) -> None:
+ nonlocal returncodes, pids, failed
+ for task in tasks:
+ idx = futures.index(task)
+ res = ray.get(task)
+ returncodes[idx] = res['return_code']
+ pids[idx] = res['pid']
+ if res['return_code'] != 0:
+ failed = True
+
+ while not ready:
+ ready, unready = ray.wait(futures)
+ handle_ready_tasks(ready)
+ while unready:
+ if failed:
+ for task in unready:
+ # ray.cancel without force fails to kill tasks.
+ # We use force=True to kill unready tasks.
+ ray.cancel(task, force=True)
+ # Use SIGKILL=128+9 to indicate the task is forcely
+ # killed.
+ idx = futures.index(task)
+ returncodes[idx] = CANCELLED_RETURN_CODE
+ break
+ ready, unready = ray.wait(unready)
+ handle_ready_tasks(ready)
+ # Remove the placement group after all tasks are done, so that
+ # the next job can be scheduled on the released resources
+ # immediately.
+ ray_util.remove_placement_group(pg)
+ sys.stdout.flush()
+ return returncodes, pids
+
+ futures = []
+ """))
+
+ self._add_logging_functions()
+
+ self._code += [
+ 'run_bash_command_with_log = run_bash_command_with_log',
+ 'run_bash_command_with_log_and_return_pid = \
+ ray.remote(run_bash_command_with_log_and_return_pid)',
+ 'autostop_lib.set_last_active_time_to_now()',
+ f'job_lib.set_status({job_id!r}, job_lib.JobStatus.PENDING)',
+ ]
+
+ def add_setup(
+ self,
+ num_nodes: int,
+ resources_dict: Dict[str, float],
+ stable_cluster_internal_ips: List[str],
+ env_vars: Dict[str, str],
+ log_dir: str,
+ setup_cmd: Optional[str] = None,
+ ) -> None:
+ assert self._has_prologue, ('Call add_prologue() before '
+ 'add_setup().')
+ self._has_setup = True
+
+ setup_log_path = os.path.join(log_dir, 'setup.log')
+
+ bundles = [copy.copy(resources_dict) for _ in range(num_nodes)]
+ # Set CPU to avoid ray hanging the resources allocation
+ # for remote functions, since the task will request 1 CPU
+ # by default.
+ task_cpu_demand = resources_dict.pop('CPU')
+
+ if resources_dict:
+ assert len(resources_dict) == 1, (
+ 'There can only be one type of accelerator per instance. '
+ f'Found: {resources_dict}.')
+ acc_name, acc_count = list(resources_dict.items())[0]
+ gpu_dict = {'GPU': acc_count}
+ # gpu_dict should be empty when the accelerator is not GPU.
+ # TODO(zongheng,zhanghao): an alternative is to start the remote
+ # cluster with custom resource 'GPU': <n> even if the accelerator(s)
+ # are not GPU. We opt for the current solution for now.
+ if accelerator_registry.is_schedulable_non_gpu_accelerator(
+ acc_name):
+ gpu_dict = {}
+ for bundle in bundles:
+ bundle.update({
+ # Set the GPU to avoid ray hanging the resources allocation
+ **gpu_dict,
+ })
+
+ self._code.append(
+ f'pg = ray_util.placement_group({json.dumps(bundles)}, '
+ f'\'STRICT_SPREAD\')')
+ self._add_waiting_for_resources_msg(num_nodes)
+ self._code.append(
+ textwrap.dedent("""\
+ # FIXME: This will print the error message from autoscaler if
+ # it is waiting for other task to finish. We should hide the
+ # error message.
+ ray.get(pg.ready())"""))
+ self._add_job_started_msg()
+
+ job_id = self.job_id
+ if setup_cmd is not None:
+ setup_envs = env_vars.copy()
+ setup_envs[constants.SKYPILOT_NUM_NODES] = str(num_nodes)
+ self._code += [
+ textwrap.dedent(f"""\
+ setup_cmd = {setup_cmd!r}
+ _SETUP_CPUS = 0.0001
+ # The setup command will be run as a ray task with num_cpus=_SETUP_CPUS as the
+ # requirement; this means Ray will set CUDA_VISIBLE_DEVICES to an empty string.
+ # We unset it so that user setup command may properly use this env var.
+ setup_cmd = 'unset CUDA_VISIBLE_DEVICES; ' + setup_cmd
+ job_lib.set_status({job_id!r}, job_lib.JobStatus.SETTING_UP)
+
+ # The schedule_step should be called after the job status is set to non-PENDING,
+ # otherwise, the scheduler will think the current job is not submitted yet, and
+ # skip the scheduling step.
+ job_lib.scheduler.schedule_step()
+
+ # If some nodes are down and then new nodes are added after launching again,
+ # the result of `ray.nodes()` will include all the nodes, so we need to get
+ # the alive nodes.
+ alive_nodes = [n for n in ray.nodes() if 'Alive' in n and n['Alive']]
+ total_num_nodes = len(alive_nodes)
+ setup_bundles = [{{"CPU": _SETUP_CPUS}} for _ in range(total_num_nodes)]
+ setup_pg = ray.util.placement_group(setup_bundles, strategy='STRICT_SPREAD')
+ setup_workers = [run_bash_command_with_log_and_return_pid \\
+ .options(
+ name='setup',
+ num_cpus=_SETUP_CPUS,
+ scheduling_strategy=ray.util.scheduling_strategies.PlacementGroupSchedulingStrategy(
+ placement_group=setup_pg,
+ placement_group_bundle_index=i)
+ ) \\
+ .remote(
+ setup_cmd,
+ os.path.expanduser({setup_log_path!r}),
+ env_vars={setup_envs!r},
+ stream_logs=True,
+ with_ray=True,
+ ) for i in range(total_num_nodes)]
+ setup_returncodes, setup_pids = get_or_fail(setup_workers, setup_pg)
+ success = True
+ failed_workers_and_returncodes = []
+ for i in range(len(setup_returncodes)):
+ returncode = setup_returncodes[i]
+ pid = setup_pids[i]
+ if pid == None:
+ pid = os.getpid()
+ if returncode != 0 and returncode != CANCELLED_RETURN_CODE:
+ success = False
+ failed_workers_and_returncodes.append((pid, returncode))
+ if not success:
+ msg = f'ERROR: {colorama.Fore.RED}Job {self.job_id}\\'s setup failed. '
+ msg += f'Failed workers: ' + ', '.join([f'(pid={{pid}}, returncode={{returncode}})' for pid, returncode in failed_workers_and_returncodes])
+ msg += f'. See error logs above for more details.{colorama.Style.RESET_ALL}'
+ print(msg, flush=True)
+ job_lib.set_status({self.job_id!r}, job_lib.JobStatus.FAILED_SETUP)
+ # This waits for all streaming logs to finish.
+ time.sleep(1)
+ # Need this to set the job status in ray job to be FAILED.
+ sys.exit(1)
+ """)
+ ]
+
+ self._code.append(f'job_lib.set_job_started({self.job_id!r})')
+ if setup_cmd is None:
+ # Need to call schedule_step() to make sure the scheduler
+ # schedule the next pending job.
+ self._code.append('job_lib.scheduler.schedule_step()')
+
+ # Export IP and node rank to the environment variables.
+ self._code += [
+ textwrap.dedent(f"""\
+ @ray.remote
+ def check_ip():
+ return ray.util.get_node_ip_address()
+ gang_scheduling_id_to_ip = ray.get([
+ check_ip.options(
+ num_cpus={task_cpu_demand},
+ scheduling_strategy=ray.util.scheduling_strategies.PlacementGroupSchedulingStrategy(
+ placement_group=pg,
+ placement_group_bundle_index=i
+ )).remote()
+ for i in range(pg.bundle_count)
+ ])
+
+ cluster_ips_to_node_id = {{ip: i for i, ip in enumerate({stable_cluster_internal_ips!r})}}
+ job_ip_rank_list = sorted(gang_scheduling_id_to_ip, key=cluster_ips_to_node_id.get)
+ job_ip_rank_map = {{ip: i for i, ip in enumerate(job_ip_rank_list)}}
+ job_ip_list_str = '\\n'.join(job_ip_rank_list)
+ """),
+ ]
+
+ def add_task(self,
+ num_nodes: int,
+ bash_script: Optional[str],
+ task_name: Optional[str],
+ resources_dict: Dict[str, float],
+ log_dir: str,
+ env_vars: Optional[Dict[str, str]] = None) -> None:
+ # TODO(zhwu): The resources limitation for multi-node ray.tune and
+ # horovod should be considered.
+ for i in range(num_nodes):
+ # Ray's per-node resources, to constrain scheduling each command to
+ # the corresponding node, represented by private IPs.
+ self._add_ray_task(bash_script=bash_script,
+ task_name=task_name,
+ resources_dict=resources_dict.copy(),
+ log_dir=log_dir,
+ env_vars=env_vars,
+ gang_scheduling_id=i)
+
+ def _add_ray_task(self,
+ bash_script: Optional[str],
+ task_name: Optional[str],
+ resources_dict: Dict[str, float],
+ log_dir: str,
+ env_vars: Optional[Dict[str, str]] = None,
+ gang_scheduling_id: int = 0) -> None:
+ """Generates code for a ray remote task that runs a bash command."""
+ assert self._has_setup, 'Call add_setup() before add_task().'
+
+ task_cpu_demand = resources_dict.pop('CPU')
+ # Build remote_task.options(...)
+ # resources=...
+ # num_gpus=...
+ options = []
+ options.append(f'num_cpus={task_cpu_demand}')
+
+ acc_name, acc_count = self._get_accelerator_details(resources_dict)
+ num_gpus = 0.0
+ if acc_name is not None:
+ assert resources_dict, ('There can only be one type of accelerator '
+ 'per instance.')
+ options.append(f'resources={json.dumps(resources_dict)}')
+ if not accelerator_registry.is_schedulable_non_gpu_accelerator(
+ acc_name):
+ num_gpus = acc_count
+ options.append(f'num_gpus={num_gpus}')
+ options.append(
+ 'scheduling_strategy=ray.util.scheduling_strategies.PlacementGroupSchedulingStrategy(' # pylint: disable=line-too-long
+ 'placement_group=pg, '
+ f'placement_group_bundle_index={gang_scheduling_id})')
+
+ sky_env_vars_dict_str = [
+ textwrap.dedent(f"""\
+ sky_env_vars_dict = {{}}
+ sky_env_vars_dict['{constants.SKYPILOT_NODE_IPS}'] = job_ip_list_str
+ sky_env_vars_dict['{constants.SKYPILOT_NUM_NODES}'] = len(job_ip_rank_list)
+ """)
+ ]
+
+ if env_vars is not None:
+ sky_env_vars_dict_str.extend(f'sky_env_vars_dict[{k!r}] = {v!r}'
+ for k, v in env_vars.items())
+ sky_env_vars_dict_str = '\n'.join(sky_env_vars_dict_str)
+
+ options_str = ', '.join(options)
+ logger.debug('Added Task with options: '
+ f'{options_str}')
+ rclone_flush_script = self._get_rclone_flush_script()
+ unset_ray_env_vars = ' && '.join(
+ [f'unset {var}' for var in UNSET_RAY_ENV_VARS])
+ self._code += [
+ sky_env_vars_dict_str,
+ textwrap.dedent(f"""\
+ script = {bash_script!r}
+ rclone_flush_script = {rclone_flush_script!r}
+
+ if script is not None:
+ script=f'{unset_ray_env_vars}; {{script}}'
+ script += rclone_flush_script
+ sky_env_vars_dict['{constants.SKYPILOT_NUM_GPUS_PER_NODE}'] = {int(math.ceil(num_gpus))!r}
+
+ ip = gang_scheduling_id_to_ip[{gang_scheduling_id!r}]
+ rank = job_ip_rank_map[ip]
+
+ if len(cluster_ips_to_node_id) == 1: # Single-node task on single-node cluter
+ name_str = '{task_name},' if {task_name!r} != None else 'task,'
+ log_path = os.path.expanduser(os.path.join({log_dir!r}, 'run.log'))
+ else: # Single-node or multi-node task on multi-node cluster
+ idx_in_cluster = cluster_ips_to_node_id[ip]
+ if cluster_ips_to_node_id[ip] == 0:
+ node_name = 'head'
+ else:
+ node_name = f'worker{{idx_in_cluster}}'
+ name_str = f'{{node_name}}, rank={{rank}},'
+ log_path = os.path.expanduser(os.path.join({log_dir!r}, f'{{rank}}-{{node_name}}.log'))
+ sky_env_vars_dict['{constants.SKYPILOT_NODE_RANK}'] = rank
+
+ sky_env_vars_dict['SKYPILOT_INTERNAL_JOB_ID'] = {self.job_id}
+
+ futures.append(run_bash_command_with_log_and_return_pid \\
+ .options(name=name_str, {options_str}) \\
+ .remote(
+ script,
+ log_path,
+ env_vars=sky_env_vars_dict,
+ stream_logs=True,
+ with_ray=True,
+ ))""")
+ ]
+
+ def add_epilogue(self) -> None:
+ """Generates code that waits for all tasks, then exits."""
+ self._code.append('returncodes, _ = get_or_fail(futures, pg)')
+ super().add_epilogue()
+
+
+ class SlurmCodeGen(TaskCodeGen):
+ """Code generator for task execution on Slurm using native srun."""
+
+ def __init__(self, slurm_job_id: str):
+ """Initialize SlurmCodeGen
+
+ Args:
+ slurm_job_id: The Slurm job ID, i.e. SLURM_JOB_ID
+ """
+ super().__init__()
+ self._slurm_job_id = slurm_job_id
+
+ def add_prologue(self, job_id: int) -> None:
+ assert not self._has_prologue, 'add_prologue() called twice?'
+ self._has_prologue = True
+ self.job_id = job_id
+
+ self._add_common_imports()
+
+ self._code.append(
+ textwrap.dedent("""\
+ import colorama
+ import copy
+ import json
+ import multiprocessing
+ import signal
+ import threading
+ from sky.backends import backend_utils
+ """))
+ self._add_skylet_imports()
+
+ self._add_constants()
+
+ self._add_logging_functions()
+
+ self._code.append(
+ textwrap.dedent(f"""\
+ def _cancel_slurm_job_steps():
+ slurm_job_id = {self._slurm_job_id!r}
+ assert slurm_job_id is not None, 'SLURM_JOB_ID is not set'
+ try:
+ # Query steps for this job: squeue -s -j JOBID -h -o "%i %j"
+ # Output format: "JOBID.STEPID STEPNAME"
+ # TODO(kevin): This assumes that compute node is able
+ # to run client commands against the controller.
+ # Validate this assumption.
+ result = subprocess.run(
+ ['squeue', '-s', '-j', slurm_job_id, '-h', '-o', '%i %j'],
+ capture_output=True, text=True, check=False)
+ for line in result.stdout.strip().split('\\n'):
+ if not line:
+ continue
+ parts = line.split()
+ assert len(parts) >= 2, 'Expected at least 2 parts'
+ step_id, step_name = parts[0], parts[1]
+ if step_name == f'sky-{self.job_id}':
+ subprocess.run(['scancel', step_id],
+ check=False, capture_output=True)
+ except Exception as e:
+ print(f'Error in _cancel_slurm_job_steps: {{e}}', flush=True)
+ pass
+
+ def _slurm_cleanup_handler(signum, _frame):
+ _cancel_slurm_job_steps()
+ # Re-raise to let default handler terminate.
+ signal.signal(signum, signal.SIG_DFL)
+ os.kill(os.getpid(), signum)
+
+ signal.signal(signal.SIGTERM, _slurm_cleanup_handler)
+ """))
+
+ self._code += [
+ 'autostop_lib.set_last_active_time_to_now()',
+ f'job_lib.set_status({job_id!r}, job_lib.JobStatus.PENDING)',
+ ]
+
+ self._setup_cmd: Optional[str] = None
+ self._setup_envs: Optional[Dict[str, str]] = None
+ self._setup_log_dir: Optional[str] = None
+ self._setup_num_nodes: Optional[int] = None
+
+ def add_setup(
+ self,
+ num_nodes: int,
+ resources_dict: Dict[str, float],
+ stable_cluster_internal_ips: List[str],
+ env_vars: Dict[str, str],
+ log_dir: str,
+ setup_cmd: Optional[str] = None,
+ ) -> None:
+ assert self._has_prologue, ('Call add_prologue() before add_setup().')
+ self._has_setup = True
+ self._cluster_num_nodes = len(stable_cluster_internal_ips)
+ self._stable_cluster_ips = stable_cluster_internal_ips
+
+ self._add_waiting_for_resources_msg(num_nodes)
+
+ # Store setup information for use in add_task().
+ if setup_cmd is not None:
+ setup_envs = env_vars.copy()
+ setup_envs[constants.SKYPILOT_NUM_NODES] = str(num_nodes)
+ self._setup_cmd = setup_cmd
+ self._setup_envs = setup_envs
+ self._setup_log_dir = log_dir
+ self._setup_num_nodes = num_nodes
+
+ def add_task(
+ self,
+ num_nodes: int,
+ bash_script: Optional[str],
+ task_name: Optional[str],
+ resources_dict: Dict[str, float],
+ log_dir: str,
+ env_vars: Optional[Dict[str, str]] = None,
+ ) -> None:
+ """Generates code for invoking a bash command
+ using srun within sbatch allocation.
+ """
+ assert self._has_setup, 'Call add_setup() before add_task().'
+ env_vars = env_vars or {}
+ task_name = task_name if task_name is not None else 'task'
+
+ acc_name, acc_count = self._get_accelerator_details(resources_dict)
+ num_gpus = 0
+ if (acc_name is not None and
+ not accelerator_registry.is_schedulable_non_gpu_accelerator(
+ acc_name)):
+ num_gpus = int(math.ceil(acc_count))
+
+ # Slurm does not support fractional CPUs.
+ task_cpu_demand = int(math.ceil(resources_dict.pop('CPU')))
+
+ sky_env_vars_dict_str = [
+ textwrap.dedent(f"""\
+ sky_env_vars_dict = {{}}
+ sky_env_vars_dict['SKYPILOT_INTERNAL_JOB_ID'] = {self.job_id}
+ """)
+ ]
+
+ if env_vars:
+ sky_env_vars_dict_str.extend(f'sky_env_vars_dict[{k!r}] = {v!r}'
+ for k, v in env_vars.items())
+ sky_env_vars_dict_str = '\n'.join(sky_env_vars_dict_str)
+
+ rclone_flush_script = self._get_rclone_flush_script()
+ streaming_msg = self._get_job_started_msg()
+ has_setup_cmd = self._setup_cmd is not None
+
+ self._code += [
+ sky_env_vars_dict_str,
+ textwrap.dedent(f"""\
+ script = {bash_script!r}
+ if script is None:
+ script = ''
+ rclone_flush_script = {rclone_flush_script!r}
+
+ if script or {has_setup_cmd!r}:
+ script += rclone_flush_script
+ sky_env_vars_dict['{constants.SKYPILOT_NUM_GPUS_PER_NODE}'] = {num_gpus}
+
+ # Signal files for setup/run synchronization:
+ # 1. alloc_signal_file: srun has acquired allocation
+ # 2. setup_done_signal_file: Driver has finished setup, run can proceed
+ #
+ # Signal files are stored in home directory, which is
+ # assumed to be on a shared NFS mount accessible by all nodes.
+ # To support clusters with non-NFS home directories, we would
+ # need to let users specify an NFS-backed "working directory"
+ # or use a different coordination mechanism.
+ alloc_signal_file = f'~/.sky_alloc_{self._slurm_job_id}_{self.job_id}'
+ alloc_signal_file = os.path.expanduser(alloc_signal_file)
+ setup_done_signal_file = f'~/.sky_setup_done_{self._slurm_job_id}_{self.job_id}'
+ setup_done_signal_file = os.path.expanduser(setup_done_signal_file)
+
+ # Start exclusive srun in a thread to reserve allocation (similar to ray.get(pg.ready()))
+ gpu_arg = f'--gpus-per-node={num_gpus}' if {num_gpus} > 0 else ''
+
+ def build_task_runner_cmd(user_script, extra_flags, log_dir, env_vars_dict,
+ task_name=None, is_setup=False,
+ alloc_signal=None, setup_done_signal=None):
+ env_vars_json = json.dumps(env_vars_dict)
+
+ log_dir = shlex.quote(log_dir)
+ env_vars = shlex.quote(env_vars_json)
+ cluster_ips = shlex.quote(",".join({self._stable_cluster_ips!r}))
+
+ runner_args = f'--log-dir={{log_dir}} --env-vars={{env_vars}} --cluster-num-nodes={self._cluster_num_nodes} --cluster-ips={{cluster_ips}}'
+
+ if task_name is not None:
+ runner_args += f' --task-name={{shlex.quote(task_name)}}'
+
+ if is_setup:
+ runner_args += ' --is-setup'
+
+ if alloc_signal is not None:
+ runner_args += f' --alloc-signal-file={{shlex.quote(alloc_signal)}}'
+
+ if setup_done_signal is not None:
+ runner_args += f' --setup-done-signal-file={{shlex.quote(setup_done_signal)}}'
+
+ script_path = None
+ prefix = 'sky_setup_' if is_setup else 'sky_task_'
+ if backend_utils.is_command_length_over_limit(user_script):
+ with tempfile.NamedTemporaryFile('w', prefix=prefix, suffix='.sh', delete=False) as f:
+ f.write(user_script)
+ script_path = f.name
+ runner_args += f' --script-path={{shlex.quote(script_path)}}'
+ else:
+ runner_args += f' --script={{shlex.quote(user_script)}}'
+
+ # Use /usr/bin/env explicitly to work around a Slurm quirk where
+ # srun's execvp() doesn't check execute permissions, failing when
+ # $HOME/.local/bin/env (non-executable, from uv installation)
+ # shadows /usr/bin/env.
+ job_suffix = '-setup' if is_setup else ''
+ srun_cmd = (
+ f'srun --export=ALL --quiet --unbuffered --kill-on-bad-exit --jobid={self._slurm_job_id} '
+ f'--job-name=sky-{self.job_id}{{job_suffix}} --ntasks-per-node=1 {{extra_flags}} '
+ f'{{constants.SKY_SLURM_PYTHON_CMD}} -m sky.skylet.executor.slurm {{runner_args}}'
+ )
+ return srun_cmd, script_path
+
+ def run_thread_func():
+ # This blocks until Slurm allocates resources (--exclusive)
+ # --mem=0 to match RayCodeGen's behavior where we don't explicitly request memory.
+ run_flags = f'--nodes={num_nodes} --cpus-per-task={task_cpu_demand} --mem=0 {{gpu_arg}} --exclusive'
+ srun_cmd, task_script_path = build_task_runner_cmd(
+ script, run_flags, {log_dir!r}, sky_env_vars_dict,
+ task_name={task_name!r},
+ alloc_signal=alloc_signal_file,
+ setup_done_signal=setup_done_signal_file
+ )
+
+ proc = subprocess.Popen(srun_cmd, shell=True,
+ stdout=subprocess.PIPE,
+ stderr=subprocess.STDOUT,
+ text=True)
+ for line in proc.stdout:
+ print(line, end='', flush=True)
+ proc.wait()
+
+ if task_script_path is not None:
+ os.remove(task_script_path)
+ return {{'return_code': proc.returncode, 'pid': proc.pid}}
+
+ run_thread_result = {{'result': None}}
+ def run_thread_wrapper():
+ run_thread_result['result'] = run_thread_func()
+
+ run_thread = threading.Thread(target=run_thread_wrapper)
+ run_thread.start()
+
+ # Wait for allocation signal from inside srun
+ while not os.path.exists(alloc_signal_file):
+ if not run_thread.is_alive():
+ # srun failed before creating the signal file.
+ run_thread.join()
+ result = run_thread_result['result']
+ returncode = int(result.get('return_code', 1))
+ pid = result.get('pid', os.getpid())
+ msg = f'ERROR: {colorama.Fore.RED}Job {self.job_id}\\'s setup failed with return code {{returncode}} (pid={{pid}}).'
+ msg += f' See error logs above for more details.{colorama.Style.RESET_ALL}'
+ print(msg, flush=True)
+ returncodes = [returncode]
+ job_lib.set_status({self.job_id!r}, job_lib.JobStatus.FAILED_SETUP)
+ sys.exit(1)
+ time.sleep(0.1)
+
+ print({streaming_msg!r}, flush=True)
+
+ if {has_setup_cmd!r}:
+ job_lib.set_status({self.job_id!r}, job_lib.JobStatus.SETTING_UP)
+
+ # The schedule_step should be called after the job status is set to
+ # non-PENDING, otherwise, the scheduler will think the current job
+ # is not submitted yet, and skip the scheduling step.
+ job_lib.scheduler.schedule_step()
+
+ # --overlap as we have already secured allocation with the srun for the run section,
+ # and otherwise this srun would get blocked and deadlock.
+ setup_flags = f'--overlap --nodes={self._setup_num_nodes}'
+ setup_srun, setup_script_path = build_task_runner_cmd(
+ {self._setup_cmd!r}, setup_flags, {self._setup_log_dir!r}, {self._setup_envs!r},
+ is_setup=True
+ )
+
+ # Run setup srun directly, streaming output to driver stdout
+ setup_proc = subprocess.Popen(setup_srun, shell=True,
+ stdout=subprocess.PIPE,
+ stderr=subprocess.STDOUT,
+ text=True)
+ for line in setup_proc.stdout:
+ print(line, end='', flush=True)
+ setup_proc.wait()
+
+ if setup_script_path is not None:
+ os.remove(setup_script_path)
+
+ setup_returncode = setup_proc.returncode
+ if setup_returncode != 0:
+ setup_pid = setup_proc.pid
+ msg = f'ERROR: {colorama.Fore.RED}Job {self.job_id}\\'s setup failed with return code {{setup_returncode}} (pid={{setup_pid}}).'
+ msg += f' See error logs above for more details.{colorama.Style.RESET_ALL}'
+ print(msg, flush=True)
+ job_lib.set_status({self.job_id!r}, job_lib.JobStatus.FAILED_SETUP)
+ # Cancel the srun spawned by run_thread_func.
+ _cancel_slurm_job_steps()
+ sys.exit(1)
+
+ job_lib.set_job_started({self.job_id!r})
+ if not {has_setup_cmd!r}:
+ # Need to call schedule_step() to make sure the scheduler
+ # schedule the next pending job.
+ job_lib.scheduler.schedule_step()
+
+ # Signal run thread to proceed.
+ pathlib.Path(setup_done_signal_file).touch()
+
+ # Wait for run thread to complete.
+ run_thread.join()
+ result = run_thread_result['result']
+
+ # Cleanup signal files
+ if os.path.exists(alloc_signal_file):
+ os.remove(alloc_signal_file)
+ if os.path.exists(setup_done_signal_file):
+ os.remove(setup_done_signal_file)
+
+ returncodes = [int(result.get('return_code', 1))]
+ else:
+ returncodes = [0]
+ """),
+ ]
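
The new sky/backends/task_codegen.py module above is driven in the order spelled out in the RayCodeGen docstring and the method signatures: add_prologue(), add_setup(), one or more add_task() calls, add_epilogue(), then build(). The sketch below is illustrative only and not part of the diff; the job ID, resources, IPs, log directory, environment variables, and commands are hypothetical placeholder values.

    # Minimal usage sketch (hypothetical values); real callers pass these in
    # from the SkyPilot backend.
    from sky.backends.task_codegen import RayCodeGen

    job_id = 42
    log_dir = '~/sky_logs/sky-job-42'
    resources = {'CPU': 4.0, 'A100': 1.0}

    codegen = RayCodeGen()
    codegen.add_prologue(job_id)
    codegen.add_setup(
        num_nodes=2,
        # add_setup() pops 'CPU' from the dict, so pass a copy.
        resources_dict=dict(resources),
        stable_cluster_internal_ips=['10.0.0.1', '10.0.0.2'],
        env_vars={'MY_VAR': 'value'},
        log_dir=log_dir,
        setup_cmd='pip install -r requirements.txt',
    )
    codegen.add_task(
        num_nodes=2,
        bash_script='python train.py',
        task_name='train',
        resources_dict=dict(resources),
        log_dir=log_dir,
        env_vars={'MY_VAR': 'value'},
    )
    codegen.add_epilogue()
    driver_program = codegen.build()  # Python source to run as the job's driver.

SlurmCodeGen follows the same call order but is constructed with the Slurm job ID (SLURM_JOB_ID) and emits srun-based code instead of Ray remote tasks.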