skypilot-nightly 1.0.0.dev20250905__py3-none-any.whl → 1.0.0.dev20251210__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (429)
  1. sky/__init__.py +12 -2
  2. sky/adaptors/aws.py +27 -22
  3. sky/adaptors/common.py +25 -2
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/gcp.py +11 -0
  7. sky/adaptors/ibm.py +5 -2
  8. sky/adaptors/kubernetes.py +64 -0
  9. sky/adaptors/nebius.py +3 -1
  10. sky/adaptors/primeintellect.py +1 -0
  11. sky/adaptors/seeweb.py +183 -0
  12. sky/adaptors/shadeform.py +89 -0
  13. sky/adaptors/slurm.py +478 -0
  14. sky/admin_policy.py +20 -0
  15. sky/authentication.py +157 -263
  16. sky/backends/__init__.py +3 -2
  17. sky/backends/backend.py +11 -3
  18. sky/backends/backend_utils.py +630 -185
  19. sky/backends/cloud_vm_ray_backend.py +1111 -928
  20. sky/backends/local_docker_backend.py +9 -5
  21. sky/backends/task_codegen.py +971 -0
  22. sky/backends/wheel_utils.py +18 -0
  23. sky/catalog/__init__.py +8 -3
  24. sky/catalog/aws_catalog.py +4 -0
  25. sky/catalog/common.py +19 -1
  26. sky/catalog/data_fetchers/fetch_aws.py +102 -80
  27. sky/catalog/data_fetchers/fetch_gcp.py +30 -3
  28. sky/catalog/data_fetchers/fetch_nebius.py +9 -6
  29. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  30. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  31. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  32. sky/catalog/kubernetes_catalog.py +36 -32
  33. sky/catalog/primeintellect_catalog.py +95 -0
  34. sky/catalog/runpod_catalog.py +5 -1
  35. sky/catalog/seeweb_catalog.py +184 -0
  36. sky/catalog/shadeform_catalog.py +165 -0
  37. sky/catalog/slurm_catalog.py +243 -0
  38. sky/check.py +87 -46
  39. sky/client/cli/command.py +1004 -434
  40. sky/client/cli/flags.py +4 -2
  41. sky/{volumes/utils.py → client/cli/table_utils.py} +111 -13
  42. sky/client/cli/utils.py +79 -0
  43. sky/client/common.py +12 -2
  44. sky/client/sdk.py +188 -65
  45. sky/client/sdk_async.py +34 -33
  46. sky/cloud_stores.py +82 -3
  47. sky/clouds/__init__.py +8 -0
  48. sky/clouds/aws.py +337 -129
  49. sky/clouds/azure.py +24 -18
  50. sky/clouds/cloud.py +47 -13
  51. sky/clouds/cudo.py +16 -13
  52. sky/clouds/do.py +9 -7
  53. sky/clouds/fluidstack.py +12 -5
  54. sky/clouds/gcp.py +14 -7
  55. sky/clouds/hyperbolic.py +12 -5
  56. sky/clouds/ibm.py +12 -5
  57. sky/clouds/kubernetes.py +80 -45
  58. sky/clouds/lambda_cloud.py +12 -5
  59. sky/clouds/nebius.py +23 -9
  60. sky/clouds/oci.py +19 -12
  61. sky/clouds/paperspace.py +4 -1
  62. sky/clouds/primeintellect.py +317 -0
  63. sky/clouds/runpod.py +85 -24
  64. sky/clouds/scp.py +12 -8
  65. sky/clouds/seeweb.py +477 -0
  66. sky/clouds/shadeform.py +400 -0
  67. sky/clouds/slurm.py +578 -0
  68. sky/clouds/ssh.py +6 -3
  69. sky/clouds/utils/scp_utils.py +61 -50
  70. sky/clouds/vast.py +43 -27
  71. sky/clouds/vsphere.py +14 -16
  72. sky/core.py +296 -195
  73. sky/dashboard/out/404.html +1 -1
  74. sky/dashboard/out/_next/static/KYAhEFa3FTfq4JyKVgo-s/_buildManifest.js +1 -0
  75. sky/dashboard/out/_next/static/chunks/1141-9c810f01ff4f398a.js +11 -0
  76. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
  77. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  78. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  79. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
  80. sky/dashboard/out/_next/static/chunks/3294.ddda8c6c6f9f24dc.js +1 -0
  81. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
  82. sky/dashboard/out/_next/static/chunks/{6601-06114c982db410b6.js → 3800-b589397dc09c5b4e.js} +1 -1
  83. sky/dashboard/out/_next/static/chunks/3850-fd5696f3bbbaddae.js +1 -0
  84. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
  85. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  86. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  87. sky/dashboard/out/_next/static/chunks/6856-da20c5fd999f319c.js +1 -0
  88. sky/dashboard/out/_next/static/chunks/6990-09cbf02d3cd518c3.js +1 -0
  89. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  90. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
  91. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  92. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
  93. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  94. sky/dashboard/out/_next/static/chunks/9353-8369df1cf105221c.js +1 -0
  95. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
  96. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  97. sky/dashboard/out/_next/static/chunks/pages/_app-68b647e26f9d2793.js +34 -0
  98. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-33f525539665fdfd.js +16 -0
  99. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a7565f586ef86467.js +1 -0
  100. sky/dashboard/out/_next/static/chunks/pages/clusters-9e5d47818b9bdadd.js +1 -0
  101. sky/dashboard/out/_next/static/chunks/pages/{config-dfb9bf07b13045f4.js → config-718cdc365de82689.js} +1 -1
  102. sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-6563820e094f68ca.js → [context]-12c559ec4d81fdbd.js} +1 -1
  103. sky/dashboard/out/_next/static/chunks/pages/{infra-aabba60d57826e0f.js → infra-d187cd0413d72475.js} +1 -1
  104. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-895847b6cf200b04.js +16 -0
  105. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-8d0f4655400b4eb9.js +21 -0
  106. sky/dashboard/out/_next/static/chunks/pages/jobs-e5a98f17f8513a96.js +1 -0
  107. sky/dashboard/out/_next/static/chunks/pages/plugins/[...slug]-4f46050ca065d8f8.js +1 -0
  108. sky/dashboard/out/_next/static/chunks/pages/users-2f7646eb77785a2c.js +1 -0
  109. sky/dashboard/out/_next/static/chunks/pages/volumes-ef19d49c6d0e8500.js +1 -0
  110. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-af76bb06dbb3954f.js → [name]-96e0f298308da7e2.js} +1 -1
  111. sky/dashboard/out/_next/static/chunks/pages/{workspaces-7598c33a746cdc91.js → workspaces-cb4da3abe08ebf19.js} +1 -1
  112. sky/dashboard/out/_next/static/chunks/webpack-fba3de387ff6bb08.js +1 -0
  113. sky/dashboard/out/_next/static/css/c5a4cfd2600fc715.css +3 -0
  114. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  115. sky/dashboard/out/clusters/[cluster].html +1 -1
  116. sky/dashboard/out/clusters.html +1 -1
  117. sky/dashboard/out/config.html +1 -1
  118. sky/dashboard/out/index.html +1 -1
  119. sky/dashboard/out/infra/[context].html +1 -1
  120. sky/dashboard/out/infra.html +1 -1
  121. sky/dashboard/out/jobs/[job].html +1 -1
  122. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  123. sky/dashboard/out/jobs.html +1 -1
  124. sky/dashboard/out/plugins/[...slug].html +1 -0
  125. sky/dashboard/out/users.html +1 -1
  126. sky/dashboard/out/volumes.html +1 -1
  127. sky/dashboard/out/workspace/new.html +1 -1
  128. sky/dashboard/out/workspaces/[name].html +1 -1
  129. sky/dashboard/out/workspaces.html +1 -1
  130. sky/data/data_utils.py +92 -1
  131. sky/data/mounting_utils.py +177 -30
  132. sky/data/storage.py +200 -19
  133. sky/data/storage_utils.py +10 -45
  134. sky/exceptions.py +18 -7
  135. sky/execution.py +74 -31
  136. sky/global_user_state.py +605 -191
  137. sky/jobs/__init__.py +2 -0
  138. sky/jobs/client/sdk.py +101 -4
  139. sky/jobs/client/sdk_async.py +31 -5
  140. sky/jobs/constants.py +15 -8
  141. sky/jobs/controller.py +726 -284
  142. sky/jobs/file_content_utils.py +128 -0
  143. sky/jobs/log_gc.py +193 -0
  144. sky/jobs/recovery_strategy.py +250 -100
  145. sky/jobs/scheduler.py +271 -173
  146. sky/jobs/server/core.py +367 -114
  147. sky/jobs/server/server.py +81 -35
  148. sky/jobs/server/utils.py +89 -35
  149. sky/jobs/state.py +1498 -620
  150. sky/jobs/utils.py +771 -306
  151. sky/logs/agent.py +40 -5
  152. sky/logs/aws.py +9 -19
  153. sky/metrics/utils.py +282 -39
  154. sky/models.py +2 -0
  155. sky/optimizer.py +7 -6
  156. sky/provision/__init__.py +38 -1
  157. sky/provision/aws/config.py +34 -13
  158. sky/provision/aws/instance.py +5 -2
  159. sky/provision/azure/instance.py +5 -3
  160. sky/provision/common.py +22 -0
  161. sky/provision/cudo/instance.py +4 -3
  162. sky/provision/do/instance.py +4 -3
  163. sky/provision/docker_utils.py +112 -28
  164. sky/provision/fluidstack/instance.py +6 -5
  165. sky/provision/gcp/config.py +6 -1
  166. sky/provision/gcp/instance.py +4 -2
  167. sky/provision/hyperbolic/instance.py +4 -2
  168. sky/provision/instance_setup.py +66 -20
  169. sky/provision/kubernetes/__init__.py +2 -0
  170. sky/provision/kubernetes/config.py +7 -44
  171. sky/provision/kubernetes/constants.py +0 -1
  172. sky/provision/kubernetes/instance.py +609 -213
  173. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  174. sky/provision/kubernetes/network.py +12 -8
  175. sky/provision/kubernetes/network_utils.py +8 -25
  176. sky/provision/kubernetes/utils.py +422 -422
  177. sky/provision/kubernetes/volume.py +150 -18
  178. sky/provision/lambda_cloud/instance.py +16 -13
  179. sky/provision/nebius/instance.py +6 -2
  180. sky/provision/nebius/utils.py +103 -86
  181. sky/provision/oci/instance.py +4 -2
  182. sky/provision/paperspace/instance.py +4 -3
  183. sky/provision/primeintellect/__init__.py +10 -0
  184. sky/provision/primeintellect/config.py +11 -0
  185. sky/provision/primeintellect/instance.py +454 -0
  186. sky/provision/primeintellect/utils.py +398 -0
  187. sky/provision/provisioner.py +45 -15
  188. sky/provision/runpod/__init__.py +2 -0
  189. sky/provision/runpod/instance.py +4 -3
  190. sky/provision/runpod/volume.py +69 -13
  191. sky/provision/scp/instance.py +307 -130
  192. sky/provision/seeweb/__init__.py +11 -0
  193. sky/provision/seeweb/config.py +13 -0
  194. sky/provision/seeweb/instance.py +812 -0
  195. sky/provision/shadeform/__init__.py +11 -0
  196. sky/provision/shadeform/config.py +12 -0
  197. sky/provision/shadeform/instance.py +351 -0
  198. sky/provision/shadeform/shadeform_utils.py +83 -0
  199. sky/provision/slurm/__init__.py +12 -0
  200. sky/provision/slurm/config.py +13 -0
  201. sky/provision/slurm/instance.py +572 -0
  202. sky/provision/slurm/utils.py +583 -0
  203. sky/provision/vast/instance.py +9 -4
  204. sky/provision/vast/utils.py +10 -6
  205. sky/provision/volume.py +164 -0
  206. sky/provision/vsphere/common/ssl_helper.py +1 -1
  207. sky/provision/vsphere/common/vapiconnect.py +2 -1
  208. sky/provision/vsphere/common/vim_utils.py +3 -2
  209. sky/provision/vsphere/instance.py +8 -6
  210. sky/provision/vsphere/vsphere_utils.py +8 -1
  211. sky/resources.py +11 -3
  212. sky/schemas/api/responses.py +107 -6
  213. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  214. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  215. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  216. sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
  217. sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
  218. sky/schemas/db/serve_state/002_yaml_content.py +34 -0
  219. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  220. sky/schemas/db/spot_jobs/002_cluster_pool.py +3 -3
  221. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  222. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  223. sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
  224. sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
  225. sky/schemas/generated/jobsv1_pb2.py +86 -0
  226. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  227. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  228. sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
  229. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  230. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  231. sky/schemas/generated/servev1_pb2.py +58 -0
  232. sky/schemas/generated/servev1_pb2.pyi +115 -0
  233. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  234. sky/serve/autoscalers.py +2 -0
  235. sky/serve/client/impl.py +55 -21
  236. sky/serve/constants.py +4 -3
  237. sky/serve/controller.py +17 -11
  238. sky/serve/load_balancing_policies.py +1 -1
  239. sky/serve/replica_managers.py +219 -142
  240. sky/serve/serve_rpc_utils.py +179 -0
  241. sky/serve/serve_state.py +63 -54
  242. sky/serve/serve_utils.py +145 -109
  243. sky/serve/server/core.py +46 -25
  244. sky/serve/server/impl.py +311 -162
  245. sky/serve/server/server.py +21 -19
  246. sky/serve/service.py +84 -68
  247. sky/serve/service_spec.py +45 -7
  248. sky/server/auth/loopback.py +38 -0
  249. sky/server/auth/oauth2_proxy.py +12 -7
  250. sky/server/common.py +47 -24
  251. sky/server/config.py +62 -28
  252. sky/server/constants.py +9 -1
  253. sky/server/daemons.py +109 -38
  254. sky/server/metrics.py +76 -96
  255. sky/server/middleware_utils.py +166 -0
  256. sky/server/plugins.py +222 -0
  257. sky/server/requests/executor.py +384 -145
  258. sky/server/requests/payloads.py +83 -19
  259. sky/server/requests/preconditions.py +15 -13
  260. sky/server/requests/request_names.py +123 -0
  261. sky/server/requests/requests.py +511 -157
  262. sky/server/requests/serializers/decoders.py +48 -17
  263. sky/server/requests/serializers/encoders.py +102 -20
  264. sky/server/requests/serializers/return_value_serializers.py +60 -0
  265. sky/server/requests/threads.py +117 -0
  266. sky/server/rest.py +116 -24
  267. sky/server/server.py +497 -179
  268. sky/server/server_utils.py +30 -0
  269. sky/server/stream_utils.py +219 -45
  270. sky/server/uvicorn.py +30 -19
  271. sky/setup_files/MANIFEST.in +6 -1
  272. sky/setup_files/alembic.ini +8 -0
  273. sky/setup_files/dependencies.py +64 -19
  274. sky/setup_files/setup.py +44 -44
  275. sky/sky_logging.py +13 -5
  276. sky/skylet/attempt_skylet.py +116 -24
  277. sky/skylet/configs.py +3 -1
  278. sky/skylet/constants.py +139 -29
  279. sky/skylet/events.py +74 -14
  280. sky/skylet/executor/__init__.py +1 -0
  281. sky/skylet/executor/slurm.py +189 -0
  282. sky/skylet/job_lib.py +143 -105
  283. sky/skylet/log_lib.py +252 -8
  284. sky/skylet/log_lib.pyi +47 -7
  285. sky/skylet/providers/ibm/node_provider.py +12 -8
  286. sky/skylet/providers/ibm/vpc_provider.py +13 -12
  287. sky/skylet/runtime_utils.py +21 -0
  288. sky/skylet/services.py +524 -0
  289. sky/skylet/skylet.py +27 -2
  290. sky/skylet/subprocess_daemon.py +104 -28
  291. sky/skypilot_config.py +99 -79
  292. sky/ssh_node_pools/constants.py +12 -0
  293. sky/ssh_node_pools/core.py +40 -3
  294. sky/ssh_node_pools/deploy/__init__.py +4 -0
  295. sky/ssh_node_pools/deploy/deploy.py +952 -0
  296. sky/ssh_node_pools/deploy/tunnel_utils.py +199 -0
  297. sky/ssh_node_pools/deploy/utils.py +173 -0
  298. sky/ssh_node_pools/server.py +20 -21
  299. sky/{utils/kubernetes/ssh_utils.py → ssh_node_pools/utils.py} +9 -6
  300. sky/task.py +221 -104
  301. sky/templates/aws-ray.yml.j2 +1 -0
  302. sky/templates/azure-ray.yml.j2 +1 -0
  303. sky/templates/cudo-ray.yml.j2 +1 -0
  304. sky/templates/do-ray.yml.j2 +1 -0
  305. sky/templates/fluidstack-ray.yml.j2 +1 -0
  306. sky/templates/gcp-ray.yml.j2 +1 -0
  307. sky/templates/hyperbolic-ray.yml.j2 +1 -0
  308. sky/templates/ibm-ray.yml.j2 +2 -1
  309. sky/templates/jobs-controller.yaml.j2 +3 -0
  310. sky/templates/kubernetes-ray.yml.j2 +204 -55
  311. sky/templates/lambda-ray.yml.j2 +1 -0
  312. sky/templates/nebius-ray.yml.j2 +3 -0
  313. sky/templates/oci-ray.yml.j2 +1 -0
  314. sky/templates/paperspace-ray.yml.j2 +1 -0
  315. sky/templates/primeintellect-ray.yml.j2 +72 -0
  316. sky/templates/runpod-ray.yml.j2 +1 -0
  317. sky/templates/scp-ray.yml.j2 +1 -0
  318. sky/templates/seeweb-ray.yml.j2 +171 -0
  319. sky/templates/shadeform-ray.yml.j2 +73 -0
  320. sky/templates/slurm-ray.yml.j2 +85 -0
  321. sky/templates/vast-ray.yml.j2 +2 -0
  322. sky/templates/vsphere-ray.yml.j2 +1 -0
  323. sky/templates/websocket_proxy.py +188 -43
  324. sky/usage/usage_lib.py +16 -4
  325. sky/users/model.conf +1 -1
  326. sky/users/permission.py +84 -44
  327. sky/users/rbac.py +31 -3
  328. sky/utils/accelerator_registry.py +6 -3
  329. sky/utils/admin_policy_utils.py +18 -5
  330. sky/utils/annotations.py +128 -6
  331. sky/utils/asyncio_utils.py +78 -0
  332. sky/utils/atomic.py +1 -1
  333. sky/utils/auth_utils.py +153 -0
  334. sky/utils/cli_utils/status_utils.py +12 -7
  335. sky/utils/cluster_utils.py +28 -6
  336. sky/utils/command_runner.py +283 -30
  337. sky/utils/command_runner.pyi +63 -7
  338. sky/utils/common.py +3 -1
  339. sky/utils/common_utils.py +55 -7
  340. sky/utils/config_utils.py +1 -14
  341. sky/utils/context.py +127 -40
  342. sky/utils/context_utils.py +73 -18
  343. sky/utils/controller_utils.py +229 -70
  344. sky/utils/db/db_utils.py +95 -18
  345. sky/utils/db/kv_cache.py +149 -0
  346. sky/utils/db/migration_utils.py +24 -7
  347. sky/utils/env_options.py +4 -0
  348. sky/utils/git.py +559 -1
  349. sky/utils/kubernetes/create_cluster.sh +15 -30
  350. sky/utils/kubernetes/delete_cluster.sh +10 -7
  351. sky/utils/kubernetes/generate_kind_config.py +6 -66
  352. sky/utils/kubernetes/gpu_labeler.py +13 -3
  353. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
  354. sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
  355. sky/utils/kubernetes/kubernetes_deploy_utils.py +187 -260
  356. sky/utils/kubernetes/rsync_helper.sh +11 -3
  357. sky/utils/kubernetes/ssh-tunnel.sh +7 -376
  358. sky/utils/kubernetes_enums.py +7 -15
  359. sky/utils/lock_events.py +4 -4
  360. sky/utils/locks.py +128 -31
  361. sky/utils/log_utils.py +0 -319
  362. sky/utils/resource_checker.py +13 -10
  363. sky/utils/resources_utils.py +53 -29
  364. sky/utils/rich_utils.py +8 -4
  365. sky/utils/schemas.py +138 -52
  366. sky/utils/subprocess_utils.py +17 -4
  367. sky/utils/thread_utils.py +91 -0
  368. sky/utils/timeline.py +2 -1
  369. sky/utils/ux_utils.py +35 -1
  370. sky/utils/volume.py +88 -4
  371. sky/utils/yaml_utils.py +9 -0
  372. sky/volumes/client/sdk.py +48 -10
  373. sky/volumes/server/core.py +59 -22
  374. sky/volumes/server/server.py +46 -17
  375. sky/volumes/volume.py +54 -42
  376. sky/workspaces/core.py +57 -21
  377. sky/workspaces/server.py +13 -12
  378. sky_templates/README.md +3 -0
  379. sky_templates/__init__.py +3 -0
  380. sky_templates/ray/__init__.py +0 -0
  381. sky_templates/ray/start_cluster +183 -0
  382. sky_templates/ray/stop_cluster +75 -0
  383. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/METADATA +343 -65
  384. skypilot_nightly-1.0.0.dev20251210.dist-info/RECORD +629 -0
  385. skypilot_nightly-1.0.0.dev20251210.dist-info/top_level.txt +2 -0
  386. sky/client/cli/git.py +0 -549
  387. sky/dashboard/out/_next/static/chunks/1121-408ed10b2f9fce17.js +0 -1
  388. sky/dashboard/out/_next/static/chunks/1141-943efc7aff0f0c06.js +0 -1
  389. sky/dashboard/out/_next/static/chunks/1836-37fede578e2da5f8.js +0 -40
  390. sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +0 -1
  391. sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
  392. sky/dashboard/out/_next/static/chunks/3785.4872a2f3aa489880.js +0 -1
  393. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +0 -1
  394. sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +0 -21
  395. sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +0 -10
  396. sky/dashboard/out/_next/static/chunks/4725.10f7a9a5d3ea8208.js +0 -1
  397. sky/dashboard/out/_next/static/chunks/5339.3fda4a4010ff4e06.js +0 -51
  398. sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
  399. sky/dashboard/out/_next/static/chunks/649.b9d7f7d10c1b8c53.js +0 -45
  400. sky/dashboard/out/_next/static/chunks/6856-dca7962af4814e1b.js +0 -1
  401. sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
  402. sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +0 -6
  403. sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +0 -18
  404. sky/dashboard/out/_next/static/chunks/7669.1f5d9a402bf5cc42.js +0 -36
  405. sky/dashboard/out/_next/static/chunks/8969-0be3036bf86f8256.js +0 -1
  406. sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
  407. sky/dashboard/out/_next/static/chunks/9037-fa1737818d0a0969.js +0 -6
  408. sky/dashboard/out/_next/static/chunks/pages/_app-ce361c6959bc2001.js +0 -34
  409. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1cbba24bd1bd35f8.js +0 -16
  410. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-0b4b35dc1dfe046c.js +0 -16
  411. sky/dashboard/out/_next/static/chunks/pages/clusters-469814d711d63b1b.js +0 -1
  412. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-dd64309c3fe67ed2.js +0 -11
  413. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-07349868f7905d37.js +0 -16
  414. sky/dashboard/out/_next/static/chunks/pages/jobs-1f70d9faa564804f.js +0 -1
  415. sky/dashboard/out/_next/static/chunks/pages/users-018bf31cda52e11b.js +0 -1
  416. sky/dashboard/out/_next/static/chunks/pages/volumes-739726d6b823f532.js +0 -1
  417. sky/dashboard/out/_next/static/chunks/webpack-4fe903277b57b523.js +0 -1
  418. sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
  419. sky/dashboard/out/_next/static/mS-4qZPSkRuA1u-g2wQhg/_buildManifest.js +0 -1
  420. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  421. sky/utils/kubernetes/cleanup-tunnel.sh +0 -62
  422. sky/utils/kubernetes/deploy_remote_cluster.py +0 -1299
  423. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  424. skypilot_nightly-1.0.0.dev20250905.dist-info/RECORD +0 -547
  425. skypilot_nightly-1.0.0.dev20250905.dist-info/top_level.txt +0 -1
  426. /sky/dashboard/out/_next/static/{mS-4qZPSkRuA1u-g2wQhg → KYAhEFa3FTfq4JyKVgo-s}/_ssgManifest.js +0 -0
  427. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/WHEEL +0 -0
  428. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/entry_points.txt +0 -0
  429. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/licenses/LICENSE +0 -0
sky/volumes/volume.py CHANGED
@@ -13,6 +13,10 @@ VOLUME_TYPE_TO_CLOUD = {
     volume_lib.VolumeType.PVC: clouds.Kubernetes(),
     volume_lib.VolumeType.RUNPOD_NETWORK_VOLUME: clouds.RunPod(),
 }
+CLOUD_TO_VOLUME_TYPE = {
+    clouds.Kubernetes(): [volume_lib.VolumeType.PVC],
+    clouds.RunPod(): [volume_lib.VolumeType.RUNPOD_NETWORK_VOLUME],
+}
 
 
 class Volume:
@@ -25,7 +29,7 @@ class Volume:
                  infra: Optional[str] = None,
                  size: Optional[str] = None,
                  labels: Optional[Dict[str, str]] = None,
-                 resource_name: Optional[str] = None,
+                 use_existing: Optional[bool] = None,
                  config: Optional[Dict[str, Any]] = None):
         """Initialize a Volume instance.
 
@@ -35,6 +39,7 @@ class Volume:
             infra: Infrastructure specification
             size: Volume size
             labels: Volume labels
+            use_existing: Whether to use an existing volume
            config: Additional configuration
        """
        self.name = name
@@ -42,7 +47,7 @@ class Volume:
         self.infra = infra
         self.size = size
         self.labels = labels or {}
-        self.resource_name = resource_name
+        self.use_existing = use_existing
         self.config = config or {}
 
         self.cloud: Optional[str] = None
@@ -70,17 +75,16 @@ class Volume:
                 infra=config.get('infra'),
                 size=config.get('size'),
                 labels=config.get('labels'),
-                resource_name=config.get('resource_name'),
+                use_existing=config.get('use_existing'),
                 config=config.get('config', {}))
         if vt == volume_lib.VolumeType.RUNPOD_NETWORK_VOLUME:
-            return RunpodNetworkVolume(
-                name=config.get('name'),
-                type=vol_type_val,
-                infra=config.get('infra'),
-                size=config.get('size'),
-                labels=config.get('labels'),
-                resource_name=config.get('resource_name'),
-                config=config.get('config', {}))
+            return RunpodNetworkVolume(name=config.get('name'),
+                                       type=vol_type_val,
+                                       infra=config.get('infra'),
+                                       size=config.get('size'),
+                                       labels=config.get('labels'),
+                                       use_existing=config.get('use_existing'),
+                                       config=config.get('config', {}))
 
         raise ValueError(f'Invalid volume type: {vol_type_val}')
 
@@ -92,7 +96,7 @@ class Volume:
             'infra': self.infra,
             'size': self.size,
             'labels': self.labels,
-            'resource_name': self.resource_name,
+            'use_existing': self.use_existing,
             'config': self.config,
             'cloud': self.cloud,
             'region': self.region,
@@ -100,7 +104,7 @@ class Volume:
         }
 
     def _normalize_config(self) -> None:
-        """Adjust and validate the config."""
+        """Normalize and validate the config."""
         # Validate schema
         common_utils.validate_schema(self.to_yaml_config(),
                                      schemas.get_volume_schema(),
@@ -115,8 +119,17 @@ class Volume:
         self.region = infra_info.region
         self.zone = infra_info.zone
 
-        # Validate the volume config
-        self._validate_config()
+        # Set cloud from volume type if not specified
+        cloud_obj_from_type = VOLUME_TYPE_TO_CLOUD.get(
+            volume_lib.VolumeType(self.type))
+        if self.cloud:
+            cloud_obj = registry.CLOUD_REGISTRY.from_str(self.cloud)
+            assert cloud_obj is not None
+            if not cloud_obj.is_same_cloud(cloud_obj_from_type):
+                raise ValueError(
+                    f'Invalid cloud {self.cloud} for volume type {self.type}')
+        else:
+            self.cloud = str(cloud_obj_from_type)
 
     def _adjust_config(self) -> None:
         """Adjust the volume config (e.g., parse size)."""
@@ -132,41 +145,41 @@ class Volume:
         except ValueError as e:
             raise ValueError(f'Invalid size {self.size}: {e}') from e
 
-    def _validate_config(self) -> None:
-        """Validate the volume config."""
-        cloud_obj_from_type = VOLUME_TYPE_TO_CLOUD.get(
-            volume_lib.VolumeType(self.type))
-        if self.cloud:
-            cloud_obj = registry.CLOUD_REGISTRY.from_str(self.cloud)
-            assert cloud_obj is not None
-            if not cloud_obj.is_same_cloud(cloud_obj_from_type):
-                raise ValueError(
-                    f'Invalid cloud {self.cloud} for volume type {self.type}')
-        else:
-            self.cloud = str(cloud_obj_from_type)
-            cloud_obj = cloud_obj_from_type
-        assert cloud_obj is not None
+    def validate(self, skip_cloud_compatibility: bool = False) -> None:
+        """Validates the volume."""
+        self.validate_name()
+        self.validate_size()
+        if not skip_cloud_compatibility:
+            self.validate_cloud_compatibility()
+        # Extra, type-specific validations
+        self._validate_config_extra()
 
-        self.region, self.zone = cloud_obj.validate_region_zone(
-            self.region, self.zone)
+    def validate_name(self) -> None:
+        """Validates if the volume name is set."""
+        assert self.name is not None, 'Volume name must be set'
+
+    def validate_size(self) -> None:
+        """Validates that size is specified for new volumes."""
+        if not self.use_existing and not self.size:
+            raise ValueError('Size is required for new volumes. '
+                             'Please specify the size in the YAML file or '
+                             'use the --size flag.')
+
+    def validate_cloud_compatibility(self) -> None:
+        """Validates region, zone, name, labels with the cloud."""
+        cloud_obj = registry.CLOUD_REGISTRY.from_str(self.cloud)
+        assert cloud_obj is not None
 
         valid, err_msg = cloud_obj.is_volume_name_valid(self.name)
         if not valid:
             raise ValueError(f'Invalid volume name: {err_msg}')
 
-        if not self.resource_name and not self.size:
-            raise ValueError('Size is required for new volumes. '
-                             'Please specify the size in the YAML file or '
-                             'use the --size flag.')
         if self.labels:
             for key, value in self.labels.items():
                 valid, err_msg = cloud_obj.is_label_valid(key, value)
                 if not valid:
                     raise ValueError(f'{err_msg}')
 
-        # Extra, type-specific validations
-        self._validate_config_extra()
-
     # Hook methods for subclasses
     def _validate_config_extra(self) -> None:
         """Additional type-specific validation.
@@ -185,7 +198,7 @@ class RunpodNetworkVolume(Volume):
     """RunPod Network Volume."""
 
     def _validate_config_extra(self) -> None:
-        if self.size is not None:
+        if not self.use_existing and self.size is not None:
             try:
                 size_int = int(self.size)
                 if size_int < volume_lib.MIN_RUNPOD_NETWORK_VOLUME_SIZE_GB:
@@ -196,8 +209,7 @@ class RunpodNetworkVolume(Volume):
                 raise ValueError(f'Invalid volume size {self.size!r}: '
                                  f'{e}') from e
         if not self.zone:
-            raise ValueError(
-                'RunPod DataCenterId is required to create a network '
-                'volume. Set the zone in the infra field.')
+            raise ValueError('RunPod DataCenterId is required for network '
+                             'volumes. Set the zone in the infra field.')
 
         return
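The hunks above replace the `resource_name` field with a `use_existing` flag and split the old private `_validate_config()` into a public `validate()` with finer-grained checks. A minimal sketch of the resulting call pattern; the `sky.utils.volume` import path and the `from_yaml_config` entry point are assumptions inferred from the file list and the `config.get(...)` calls in this diff, not confirmed by it:

```python
# Hedged sketch of the post-diff Volume API; import paths are assumptions.
from sky.utils import volume as volume_lib  # assumed home of VolumeType
from sky.volumes import volume as volume_mod

vol = volume_mod.Volume.from_yaml_config({
    'name': 'my-data',
    'type': volume_lib.VolumeType.PVC.value,
    'infra': 'k8s',
    'size': '100Gi',
    'use_existing': False,  # replaces the removed `resource_name` field
})
# Name/size checks only; cloud-compatibility checks require self.cloud to be
# resolved first (see _normalize_config above), hence the skip flag.
vol.validate(skip_cloud_compatibility=True)
```

Per `validate_size()`, `size` may be omitted only when `use_existing` is true.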
sky/workspaces/core.py CHANGED
@@ -14,6 +14,7 @@ from sky.backends import backend_utils
 from sky.skylet import constants
 from sky.usage import usage_lib
 from sky.users import permission
+from sky.users import rbac
 from sky.utils import annotations
 from sky.utils import common_utils
 from sky.utils import config_utils
@@ -147,11 +148,15 @@ def _compare_workspace_configs(
     private_new = new_config.get('private', False)
     private_changed = private_old != private_new
 
+    admin_user_ids = permission.permission_service.get_users_for_role(
+        rbac.RoleName.ADMIN.value)
     # Get allowed users (resolve to user IDs for comparison)
     allowed_users_old = workspaces_utils.get_workspace_users(
         current_config) if private_old else []
+    allowed_users_old += admin_user_ids
     allowed_users_new = workspaces_utils.get_workspace_users(
         new_config) if private_new else []
+    allowed_users_new += admin_user_ids
 
     # Convert to sets for easier comparison
     old_users_set = set(allowed_users_old)
@@ -188,6 +193,24 @@ def _compare_workspace_configs(
         added_users=added_users)
 
 
+def _validate_workspace_config_changes_with_lock(
+        workspace_name: str, current_config: Dict[str, Any],
+        new_config: Dict[str, Any]) -> None:
+    lock_id = backend_utils.workspace_lock_id(workspace_name)
+    lock_timeout = backend_utils.WORKSPACE_LOCK_TIMEOUT_SECONDS
+    try:
+        with locks.get_lock(lock_id, lock_timeout):
+            # Validate the configuration changes based on active resources
+            _validate_workspace_config_changes(workspace_name, current_config,
+                                               new_config)
+    except locks.LockTimeout as e:
+        raise RuntimeError(
+            f'Failed to validate workspace {workspace_name!r} due to '
+            'a timeout when trying to access database. Please '
+            f'try again or manually remove the lock at {lock_id}. '
+            f'{common_utils.format_exception(e)}') from None
+
+
 def _validate_workspace_config_changes(workspace_name: str,
                                        current_config: Dict[str, Any],
                                        new_config: Dict[str, Any]) -> None:
@@ -232,7 +255,7 @@ def _validate_workspace_config_changes(workspace_name: str,
                     f' private. Checking that all active resources belong'
                     f' to allowed users.')
 
-        error_summary, missed_users_names = (
+        error_summary, missed_users_names, _ = (
             resource_checker.check_users_workspaces_active_resources(
                 config_comparison.allowed_users_new, [workspace_name]))
         if error_summary:
@@ -259,11 +282,35 @@ def _validate_workspace_config_changes(workspace_name: str,
                     f'Checking that removed users'
                     f' {config_comparison.removed_users} do not have'
                     f' active resources in workspace {workspace_name!r}.')
-        user_operations = []
-        for user_id in config_comparison.removed_users:
-            user_operations.append((user_id, 'remove'))
-        resource_checker.check_no_active_resources_for_users(
-            user_operations)
+        error_summary, missed_users_names, missed_user_dict = (
+            resource_checker.check_users_workspaces_active_resources(
+                config_comparison.allowed_users_new, [workspace_name]))
+        if error_summary:
+            error_user_ids = []
+            for user_id in config_comparison.removed_users:
+                if user_id in missed_user_dict:
+                    error_user_ids.append(user_id)
+            error_user_names = []
+            if error_user_ids:
+                error_user_names = [
+                    missed_user_dict[user_id]
+                    for user_id in error_user_ids
+                ]
+
+            error_msg = 'Cannot '
+            error_users_list = ', '.join(error_user_names)
+            if len(error_user_names) == 1:
+                error_msg += f'remove user {error_users_list!r} ' \
+                             f'from workspace {workspace_name!r} because the ' \
+                             f'user has {error_summary}'
+            else:
+                error_msg += f'remove users {error_users_list!r}' \
+                             f' from workspace {workspace_name!r} because the' \
+                             f' users have {error_summary}'
+            error_msg += ', but not in the allowed_users list.' \
+                         ' Please either add the users to allowed_users or' \
+                         ' ask them to terminate their resources.'
+            raise ValueError(error_msg)
     else:
         # Other configuration changes - check that workspace has no active
         # resources
@@ -310,20 +357,8 @@ def update_workspace(workspace_name: str, config: Dict[str,
                                                    default_value={})
     current_config = current_workspaces.get(workspace_name, {})
 
-    if current_config:
-        lock_id = backend_utils.workspace_lock_id(workspace_name)
-        lock_timeout = backend_utils.WORKSPACE_LOCK_TIMEOUT_SECONDS
-        try:
-            with locks.get_lock(lock_id, lock_timeout):
-                # Validate the configuration changes based on active resources
-                _validate_workspace_config_changes(workspace_name,
-                                                   current_config, config)
-        except locks.LockTimeout as e:
-            raise RuntimeError(
-                f'Failed to validate workspace {workspace_name!r} due to '
-                'a timeout when trying to access database. Please '
-                f'try again or manually remove the lock at {lock_id}. '
-                f'{common_utils.format_exception(e)}') from None
+    _validate_workspace_config_changes_with_lock(workspace_name, current_config,
+                                                 config)
 
     def update_workspace_fn(workspaces: Dict[str, Any]) -> None:
         """Function to update workspace inside the lock."""
@@ -510,7 +545,8 @@ def update_config(config: Dict[str, Any]) -> Dict[str, Any]:
         # If workspace configuration is changing, validate and mark for checking
         if current_workspace_config != new_workspace_config:
             _validate_workspace_config(workspace_name, new_workspace_config)
-            workspaces_to_check.append((workspace_name, 'update'))
+            _validate_workspace_config_changes_with_lock(
+                workspace_name, current_workspace_config, new_workspace_config)
             users = workspaces_utils.get_workspace_users(new_workspace_config)
             workspaces_to_check_policy['update'][workspace_name] = users
 
sky/workspaces/server.py CHANGED
@@ -4,6 +4,7 @@ import fastapi
 
 from sky.server.requests import executor
 from sky.server.requests import payloads
+from sky.server.requests import request_names
 from sky.server.requests import requests as api_requests
 from sky.workspaces import core
 
@@ -22,9 +23,9 @@ async def get(request: fastapi.Request) -> None:
     } if auth_user else {}
     request_body = payloads.RequestBody(**auth_user_env_vars_kwargs)
 
-    executor.schedule_request(
+    await executor.schedule_request_async(
         request_id=request.state.request_id,
-        request_name='workspaces.get',
+        request_name=request_names.RequestName.WORKSPACES_GET,
         request_body=request_body,
         func=core.get_workspaces,
         schedule_type=api_requests.ScheduleType.SHORT,
@@ -35,9 +36,9 @@ async def get(request: fastapi.Request) -> None:
 async def update(request: fastapi.Request,
                  update_workspace_body: payloads.UpdateWorkspaceBody) -> None:
     """Updates a specific workspace configuration."""
-    executor.schedule_request(
+    await executor.schedule_request_async(
         request_id=request.state.request_id,
-        request_name='workspaces.update',
+        request_name=request_names.RequestName.WORKSPACES_UPDATE,
         request_body=update_workspace_body,
         func=core.update_workspace,
         schedule_type=api_requests.ScheduleType.SHORT,
@@ -48,9 +49,9 @@ async def update(request: fastapi.Request,
 async def create(request: fastapi.Request,
                  create_workspace_body: payloads.CreateWorkspaceBody) -> None:
     """Creates a new workspace configuration."""
-    executor.schedule_request(
+    await executor.schedule_request_async(
         request_id=request.state.request_id,
-        request_name='workspaces.create',
+        request_name=request_names.RequestName.WORKSPACES_CREATE,
         request_body=create_workspace_body,
         func=core.create_workspace,
         schedule_type=api_requests.ScheduleType.SHORT,
@@ -61,9 +62,9 @@ async def create(request: fastapi.Request,
 async def delete(request: fastapi.Request,
                  delete_workspace_body: payloads.DeleteWorkspaceBody) -> None:
     """Deletes a workspace configuration."""
-    executor.schedule_request(
+    await executor.schedule_request_async(
         request_id=request.state.request_id,
-        request_name='workspaces.delete',
+        request_name=request_names.RequestName.WORKSPACES_DELETE,
         request_body=delete_workspace_body,
         func=core.delete_workspace,
         schedule_type=api_requests.ScheduleType.SHORT,
@@ -78,9 +79,9 @@ async def get_config(request: fastapi.Request) -> None:
         'env_vars': auth_user.to_env_vars()
     } if auth_user else {}
     get_config_body = payloads.GetConfigBody(**auth_user_env_vars_kwargs)
-    executor.schedule_request(
+    await executor.schedule_request_async(
         request_id=request.state.request_id,
-        request_name='workspaces.get_config',
+        request_name=request_names.RequestName.WORKSPACES_GET_CONFIG,
         request_body=get_config_body,
         func=core.get_config,
         schedule_type=api_requests.ScheduleType.SHORT,
@@ -91,9 +92,9 @@ async def get_config(request: fastapi.Request) -> None:
 async def update_config(request: fastapi.Request,
                         update_config_body: payloads.UpdateConfigBody) -> None:
     """Updates the entire SkyPilot configuration."""
-    executor.schedule_request(
+    await executor.schedule_request_async(
         request_id=request.state.request_id,
-        request_name='workspaces.update_config',
+        request_name=request_names.RequestName.WORKSPACES_UPDATE_CONFIG,
         request_body=update_config_body,
         func=core.update_config,
         schedule_type=api_requests.ScheduleType.SHORT,
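Every workspace endpoint above moves from the synchronous `executor.schedule_request` to `await executor.schedule_request_async`, and request names change from string literals to `request_names.RequestName` enum members. A hedged sketch of the full handler shape after the migration, assembled only from names that appear in this diff:

```python
# Sketch assembled from the hunks above; fastapi wiring (router decorator,
# auth-user extraction) is elided for brevity.
import fastapi

from sky.server.requests import executor
from sky.server.requests import payloads
from sky.server.requests import request_names
from sky.server.requests import requests as api_requests
from sky.workspaces import core


async def get(request: fastapi.Request) -> None:
    """Schedules a short-lived 'workspaces.get' request on the executor."""
    request_body = payloads.RequestBody()  # env_vars kwargs omitted here
    await executor.schedule_request_async(  # async variant used in this diff
        request_id=request.state.request_id,
        request_name=request_names.RequestName.WORKSPACES_GET,  # enum, not str
        request_body=request_body,
        func=core.get_workspaces,
        schedule_type=api_requests.ScheduleType.SHORT,
    )
```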
sky_templates/README.md ADDED
@@ -0,0 +1,3 @@
+# SkyPilot Templates
+
+This package contains templates for users to use in their SkyPilot clusters, jobs, and services.
sky_templates/__init__.py ADDED
@@ -0,0 +1,3 @@
+"""SkyPilot templates."""
+
+__version__ = '1.0.0-dev0'
sky_templates/ray/__init__.py ADDED
File without changes
sky_templates/ray/start_cluster ADDED
@@ -0,0 +1,183 @@
+#!/bin/bash
+# Starts a Ray cluster on a SkyPilot cluster.
+#
+# This script starts a Ray cluster using default Ray ports (6379, 8265),
+# which are different from SkyPilot's system Ray ports (6380, 8266).
+# This allows users to run their own Ray applications independently of
+# SkyPilot's internal Ray cluster.
+#
+# Environment Variables:
+#   RAY_HEAD_PORT=6379               - Ray head node port
+#   RAY_DASHBOARD_PORT=8265          - Ray dashboard port
+#   RAY_DASHBOARD_HOST=127.0.0.1     - Dashboard host (set to 0.0.0.0 to expose externally)
+#   RAY_DASHBOARD_AGENT_LISTEN_PORT= - (Optional) Dashboard agent listen port
+#   RAY_HEAD_IP_ADDRESS=             - (Optional) Node IP address
+#   RAY_CMD=ray                      - (Optional) Command to invoke Ray (e.g., "uv run ray")
+#
+# Usage:
+#   ~/sky_templates/ray/start_cluster
+#
+#   # With custom configurations
+#   export RAY_DASHBOARD_HOST=0.0.0.0
+#   export RAY_DASHBOARD_PORT=8280
+#   ~/sky_templates/ray/start_cluster
+#
+#   # With uv
+#   export RAY_CMD="uv run ray"
+#   ~/sky_templates/ray/start_cluster
+
+set -e
+
+# Color codes for output
+RED='\033[0;31m'
+GREEN='\033[0;32m'
+YELLOW='\033[1;33m'
+NC='\033[0m' # No Color
+
+RAY_HEAD_PORT=${RAY_HEAD_PORT:-6379}
+RAY_DASHBOARD_PORT=${RAY_DASHBOARD_PORT:-8265}
+RAY_DASHBOARD_HOST=${RAY_DASHBOARD_HOST:-127.0.0.1}
+RAY_DASHBOARD_AGENT_LISTEN_PORT=${RAY_DASHBOARD_AGENT_LISTEN_PORT:-}
+RAY_HEAD_IP_ADDRESS=${RAY_HEAD_IP_ADDRESS:-}
+
+RAY_CMD=${RAY_CMD:-ray}
+# Tokenize the command string into an array so multi-word commands
+# (e.g., "uv run ray") are handled safely when expanded later.
+eval "RAY_CMD_ARR=( ${RAY_CMD} )"
+
+# Convenience wrapper to invoke the configured Ray command with arbitrary args.
+run_ray() {
+    "${RAY_CMD_ARR[@]}" "$@"
+}
+
+echo -e "${GREEN}Starting Ray cluster...${NC}"
+
+# Ensure ray[default] is installed (we need [default] to do `ray list nodes`)
+# Pin to existing version if Ray is already installed to avoid upgrading existing version.
+RAY_VERSION=$(run_ray --version 2>/dev/null | cut -d' ' -f3 || echo "")
+if [ -n "${RAY_VERSION}" ]; then
+    # Pin to existing version.
+    VERSION_SPEC="==${RAY_VERSION}"
+else
+    echo -e "${YELLOW}Installing ray[default]...${NC}"
+    VERSION_SPEC=""
+fi
+
+# Pin click<8.3.0 to avoid incompatibility with Ray on Python 3.10
+# click 8.3.0 and 8.3.1 breaks Ray CLI due to deepcopy issues with sentinel values
+# See: https://github.com/ray-project/ray/issues/56747
+# TODO(kevin): Remove this once the issue is fixed in a future click release
+RAY_INSTALL_SPEC="ray[default]${VERSION_SPEC} click<8.3.0"
+uv pip install ${RAY_INSTALL_SPEC} || uv pip install --system ${RAY_INSTALL_SPEC}
+
+# Verify Ray is working
+if ! run_ray --version > /dev/null; then
+    echo -e "${RED}Error: Failed to install Ray.${NC}"
+    exit 1
+fi
+echo -e "${GREEN}Ray $(run_ray --version | cut -d' ' -f3) is installed.${NC}"
+
+RAY_ADDRESS="127.0.0.1:${RAY_HEAD_PORT}"
+if [ "${SKYPILOT_NODE_RANK}" -ne 0 ]; then
+    HEAD_IP=$(echo "${SKYPILOT_NODE_IPS}" | head -n1)
+    RAY_ADDRESS="${HEAD_IP}:${RAY_HEAD_PORT}"
+fi
+
+# Check if user-space Ray is already running
+if run_ray status --address="${RAY_ADDRESS}" &> /dev/null; then
+    echo -e "${YELLOW}Ray cluster is already running.${NC}"
+    run_ray status --address="${RAY_ADDRESS}"
+    exit 0
+fi
+
+TIMEOUT=300
+
+if [ "${SKYPILOT_NODE_RANK}" -eq 0 ]; then
+    echo -e "${GREEN}Starting Ray head node...${NC}"
+
+    RAY_START_CMD="start --head \
+        --port=${RAY_HEAD_PORT} \
+        --dashboard-port=${RAY_DASHBOARD_PORT} \
+        --dashboard-host=${RAY_DASHBOARD_HOST} \
+        --disable-usage-stats \
+        --include-dashboard=True"
+
+    # Add --num-gpus only if > 0
+    if [ "${SKYPILOT_NUM_GPUS_PER_NODE}" -gt 0 ]; then
+        RAY_START_CMD="${RAY_START_CMD} --num-gpus=${SKYPILOT_NUM_GPUS_PER_NODE}"
+    fi
+
+    # Add optional dashboard agent listen port if specified
+    if [ -n "${RAY_DASHBOARD_AGENT_LISTEN_PORT}" ]; then
+        RAY_START_CMD="${RAY_START_CMD} --dashboard-agent-listen-port=${RAY_DASHBOARD_AGENT_LISTEN_PORT}"
+    fi
+
+    # Add optional node IP address if specified
+    if [ -n "${RAY_HEAD_IP_ADDRESS}" ]; then
+        RAY_START_CMD="${RAY_START_CMD} --node-ip-address=${RAY_HEAD_IP_ADDRESS}"
+    fi
+
+    run_ray ${RAY_START_CMD}
+
+    start_time=$(date +%s)
+    while ! run_ray health-check --address="${RAY_ADDRESS}" &>/dev/null; do
+        if [ "$(( $(date +%s) - start_time ))" -ge "$TIMEOUT" ]; then
+            echo -e "${RED}Timed out waiting for head node. Exiting.${NC}" >&2
+            exit 1
+        fi
+        echo "Head node not healthy yet. Retrying in 1s..."
+        sleep 1
+    done
+
+    echo -e "${GREEN}Head node started successfully.${NC}"
+
+    # Wait for all worker nodes to join
+    if [ "${SKYPILOT_NUM_NODES}" -gt 1 ]; then
+        echo "Waiting for all ${SKYPILOT_NUM_NODES} nodes to join..."
+        start_time=$(date +%s)
+        while true; do
+            if [ "$(( $(date +%s) - start_time ))" -ge "${TIMEOUT}" ]; then
+                echo -e "${RED}Error: Timeout waiting for nodes.${NC}" >&2
+                exit 1
+            fi
+            ready_nodes=$(run_ray list nodes --format=json | python3 -c "import sys, json; print(len(json.load(sys.stdin)))")
+            if [ "${ready_nodes}" -ge "${SKYPILOT_NUM_NODES}" ]; then
+                break
+            fi
+            echo "Waiting... (${ready_nodes} / ${SKYPILOT_NUM_NODES} nodes ready)"
+            sleep 5
+        done
+        echo -e "${GREEN}All ${SKYPILOT_NUM_NODES} nodes have joined.${NC}"
+    fi
+
+    # Add sleep after `ray start` to give ray enough time to daemonize
+    sleep 5
+else
+    echo -e "${GREEN}Starting Ray worker node...${NC}"
+
+    echo "Waiting for head node at ${RAY_ADDRESS}..."
+    start_time=$(date +%s)
+    while ! run_ray health-check --address="${RAY_ADDRESS}" &>/dev/null; do
+        if [ "$(( $(date +%s) - start_time ))" -ge "$TIMEOUT" ]; then
+            echo -e "${RED}Timed out waiting for head node. Exiting.${NC}" >&2
+            exit 1
+        fi
+        echo "Head node not healthy yet. Retrying in 1s..."
+        sleep 1
+    done
+
+    echo -e "${GREEN}Head node is healthy. Starting worker node...${NC}"
+    WORKER_CMD="start --address=${RAY_ADDRESS} --disable-usage-stats"
+
+    # Add --num-gpus only if > 0
+    if [ "${SKYPILOT_NUM_GPUS_PER_NODE}" -gt 0 ]; then
+        WORKER_CMD="${WORKER_CMD} --num-gpus=${SKYPILOT_NUM_GPUS_PER_NODE}"
+    fi
+
+    run_ray ${WORKER_CMD}
+
+    echo -e "${GREEN}Worker node started successfully.${NC}"
+
+    # Add sleep after `ray start` to give ray enough time to daemonize
+    sleep 5
+fi
sky_templates/ray/stop_cluster ADDED
@@ -0,0 +1,75 @@
+#!/bin/bash
+# Stops a user Ray cluster on a SkyPilot cluster.
+#
+# This script stops a Ray cluster running on custom ports (default 6379),
+# which is separate from SkyPilot's internal Ray cluster (port 6380).
+#
+# IMPORTANT: This script uses pkill to stop Ray processes, NOT 'ray stop',
+# as 'ray stop' can interfere with SkyPilot's internal operations.
+#
+# Environment Variables:
+#   RAY_HEAD_PORT=6379 - Ray head node port to stop
+#   RAY_CMD=ray        - (Optional) Command to invoke Ray (e.g., "uv run ray")
+#
+# Usage:
+#   # Stop default Ray cluster (port 6379)
+#   ~/sky_templates/ray/stop_ray_cluster.sh
+#
+#   # Stop Ray cluster on custom port
+#   export RAY_HEAD_PORT=6385
+#   ~/sky_templates/ray/stop_ray_cluster.sh
+#
+#   # With uv
+#   export RAY_CMD="uv run ray"
+#   ~/sky_templates/ray/stop_ray_cluster.sh
+
+set -e
+
+# Color codes for output
+RED='\033[0;31m'
+GREEN='\033[0;32m'
+YELLOW='\033[1;33m'
+NC='\033[0m' # No Color
+
+RAY_HEAD_PORT=${RAY_HEAD_PORT:-6379}
+RAY_CMD=${RAY_CMD:-ray}
+# Tokenize the command string into an array so multi-word commands (e.g., "uv run ray")
+# are handled safely when expanded later.
+eval "RAY_CMD_ARR=( ${RAY_CMD} )"
+
+run_ray() {
+    "${RAY_CMD_ARR[@]}" "$@"
+}
+
+echo -e "${GREEN}Stopping Ray cluster on port ${RAY_HEAD_PORT}...${NC}"
+
+RAY_ADDRESS="127.0.0.1:${RAY_HEAD_PORT}"
+if [ "$SKYPILOT_NODE_RANK" -ne 0 ]; then
+    HEAD_IP=$(echo "$SKYPILOT_NODE_IPS" | head -n1)
+    RAY_ADDRESS="${HEAD_IP}:${RAY_HEAD_PORT}"
+fi
+
+# Check if Ray is running
+if ! run_ray status --address="${RAY_ADDRESS}" &> /dev/null; then
+    echo -e "${YELLOW}No Ray cluster found running on port ${RAY_HEAD_PORT}.${NC}"
+    exit 0
+fi
+
+# Use pkill to stop Ray processes instead of 'ray stop'
+# This prevents interfering with SkyPilot's internal Ray cluster (port 6380)
+echo -e "${YELLOW}Killing Ray processes on port ${RAY_HEAD_PORT}...${NC}"
+
+pkill -f "ray.*[=:]${RAY_HEAD_PORT}" || true
+
+echo -e "${GREEN}Ray processes killed.${NC}"
+# Wait a moment for processes to terminate
+sleep 5
+
+# Verify Ray is stopped
+if run_ray status --address="${RAY_ADDRESS}" &> /dev/null; then
+    echo -e "${RED}Warning: Ray cluster may still be running. Try manually:${NC}"
+    echo -e "${RED}  pkill -9 -f 'ray.*[=:]${RAY_HEAD_PORT}'${NC}"
+    exit 1
+else
+    echo -e "${GREEN}Ray cluster successfully stopped.${NC}"
+fi
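Both templates key off SkyPilot's task runtime variables (`SKYPILOT_NODE_RANK`, `SKYPILOT_NODE_IPS`, `SKYPILOT_NUM_NODES`, `SKYPILOT_NUM_GPUS_PER_NODE`), so they are meant to be invoked from a task's `run` commands on every node. A hedged sketch of launching them via the Python SDK; `my_ray_app.py`, the accelerator choice, and the cluster name are hypothetical, and the `~/sky_templates` path follows the Usage comments in the scripts above:

```python
# Sketch: a 2-node task that brings up the user-space Ray cluster with the
# bundled template, then runs a placeholder application script.
import sky

task = sky.Task(
    num_nodes=2,
    # start_cluster runs on every node; rank 0 becomes the Ray head.
    run=('~/sky_templates/ray/start_cluster && '
         'python my_ray_app.py'),  # hypothetical user script
)
task.set_resources(sky.Resources(accelerators='T4:1'))
# sky.launch(task, cluster_name='ray-demo')  # uncomment to actually launch
```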