skypilot-nightly 1.0.0.dev20250502__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (546)
  1. sky/__init__.py +22 -6
  2. sky/adaptors/aws.py +81 -16
  3. sky/adaptors/common.py +25 -2
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/gcp.py +11 -0
  7. sky/adaptors/hyperbolic.py +8 -0
  8. sky/adaptors/ibm.py +5 -2
  9. sky/adaptors/kubernetes.py +149 -18
  10. sky/adaptors/nebius.py +173 -30
  11. sky/adaptors/primeintellect.py +1 -0
  12. sky/adaptors/runpod.py +68 -0
  13. sky/adaptors/seeweb.py +183 -0
  14. sky/adaptors/shadeform.py +89 -0
  15. sky/admin_policy.py +187 -4
  16. sky/authentication.py +179 -225
  17. sky/backends/__init__.py +4 -2
  18. sky/backends/backend.py +22 -9
  19. sky/backends/backend_utils.py +1323 -397
  20. sky/backends/cloud_vm_ray_backend.py +1749 -1029
  21. sky/backends/docker_utils.py +1 -1
  22. sky/backends/local_docker_backend.py +11 -6
  23. sky/backends/task_codegen.py +633 -0
  24. sky/backends/wheel_utils.py +55 -9
  25. sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
  26. sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
  27. sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
  28. sky/{clouds/service_catalog → catalog}/common.py +90 -49
  29. sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
  30. sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
  31. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +116 -80
  32. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
  33. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +70 -16
  34. sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
  35. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
  36. sky/catalog/data_fetchers/fetch_nebius.py +338 -0
  37. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  38. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  39. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  40. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
  41. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
  42. sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
  43. sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
  44. sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
  45. sky/catalog/hyperbolic_catalog.py +136 -0
  46. sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
  47. sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
  48. sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
  49. sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
  50. sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
  51. sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
  52. sky/catalog/primeintellect_catalog.py +95 -0
  53. sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
  54. sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
  55. sky/catalog/seeweb_catalog.py +184 -0
  56. sky/catalog/shadeform_catalog.py +165 -0
  57. sky/catalog/ssh_catalog.py +167 -0
  58. sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
  59. sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
  60. sky/check.py +533 -185
  61. sky/cli.py +5 -5975
  62. sky/client/{cli.py → cli/command.py} +2591 -1956
  63. sky/client/cli/deprecation_utils.py +99 -0
  64. sky/client/cli/flags.py +359 -0
  65. sky/client/cli/table_utils.py +322 -0
  66. sky/client/cli/utils.py +79 -0
  67. sky/client/common.py +78 -32
  68. sky/client/oauth.py +82 -0
  69. sky/client/sdk.py +1219 -319
  70. sky/client/sdk_async.py +827 -0
  71. sky/client/service_account_auth.py +47 -0
  72. sky/cloud_stores.py +82 -3
  73. sky/clouds/__init__.py +13 -0
  74. sky/clouds/aws.py +564 -164
  75. sky/clouds/azure.py +105 -83
  76. sky/clouds/cloud.py +140 -40
  77. sky/clouds/cudo.py +68 -50
  78. sky/clouds/do.py +66 -48
  79. sky/clouds/fluidstack.py +63 -44
  80. sky/clouds/gcp.py +339 -110
  81. sky/clouds/hyperbolic.py +293 -0
  82. sky/clouds/ibm.py +70 -49
  83. sky/clouds/kubernetes.py +570 -162
  84. sky/clouds/lambda_cloud.py +74 -54
  85. sky/clouds/nebius.py +210 -81
  86. sky/clouds/oci.py +88 -66
  87. sky/clouds/paperspace.py +61 -44
  88. sky/clouds/primeintellect.py +317 -0
  89. sky/clouds/runpod.py +164 -74
  90. sky/clouds/scp.py +89 -86
  91. sky/clouds/seeweb.py +477 -0
  92. sky/clouds/shadeform.py +400 -0
  93. sky/clouds/ssh.py +263 -0
  94. sky/clouds/utils/aws_utils.py +10 -4
  95. sky/clouds/utils/gcp_utils.py +87 -11
  96. sky/clouds/utils/oci_utils.py +38 -14
  97. sky/clouds/utils/scp_utils.py +231 -167
  98. sky/clouds/vast.py +99 -77
  99. sky/clouds/vsphere.py +51 -40
  100. sky/core.py +375 -173
  101. sky/dag.py +15 -0
  102. sky/dashboard/out/404.html +1 -1
  103. sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
  104. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  105. sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
  106. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
  107. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  108. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
  109. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  110. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
  111. sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
  112. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
  113. sky/dashboard/out/_next/static/chunks/3800-7b45f9fbb6308557.js +1 -0
  114. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
  115. sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
  116. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
  117. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  118. sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
  119. sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
  120. sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
  121. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  122. sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
  123. sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
  124. sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
  125. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  126. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
  127. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
  128. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  129. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
  130. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  131. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  132. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
  133. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  134. sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
  135. sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
  136. sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
  137. sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
  138. sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
  139. sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
  140. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
  141. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
  142. sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
  143. sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
  144. sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
  145. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
  146. sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
  147. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
  148. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
  149. sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
  150. sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
  151. sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
  152. sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
  153. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-84a40f8c7c627fe4.js +1 -0
  154. sky/dashboard/out/_next/static/chunks/pages/workspaces-531b2f8c4bf89f82.js +1 -0
  155. sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
  156. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  157. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  158. sky/dashboard/out/clusters/[cluster].html +1 -1
  159. sky/dashboard/out/clusters.html +1 -1
  160. sky/dashboard/out/config.html +1 -0
  161. sky/dashboard/out/index.html +1 -1
  162. sky/dashboard/out/infra/[context].html +1 -0
  163. sky/dashboard/out/infra.html +1 -0
  164. sky/dashboard/out/jobs/[job].html +1 -1
  165. sky/dashboard/out/jobs/pools/[pool].html +1 -0
  166. sky/dashboard/out/jobs.html +1 -1
  167. sky/dashboard/out/users.html +1 -0
  168. sky/dashboard/out/volumes.html +1 -0
  169. sky/dashboard/out/workspace/new.html +1 -0
  170. sky/dashboard/out/workspaces/[name].html +1 -0
  171. sky/dashboard/out/workspaces.html +1 -0
  172. sky/data/data_utils.py +137 -1
  173. sky/data/mounting_utils.py +269 -84
  174. sky/data/storage.py +1460 -1807
  175. sky/data/storage_utils.py +43 -57
  176. sky/exceptions.py +126 -2
  177. sky/execution.py +216 -63
  178. sky/global_user_state.py +2390 -586
  179. sky/jobs/__init__.py +7 -0
  180. sky/jobs/client/sdk.py +300 -58
  181. sky/jobs/client/sdk_async.py +161 -0
  182. sky/jobs/constants.py +15 -8
  183. sky/jobs/controller.py +848 -275
  184. sky/jobs/file_content_utils.py +128 -0
  185. sky/jobs/log_gc.py +193 -0
  186. sky/jobs/recovery_strategy.py +402 -152
  187. sky/jobs/scheduler.py +314 -189
  188. sky/jobs/server/core.py +836 -255
  189. sky/jobs/server/server.py +156 -115
  190. sky/jobs/server/utils.py +136 -0
  191. sky/jobs/state.py +2109 -706
  192. sky/jobs/utils.py +1306 -215
  193. sky/logs/__init__.py +21 -0
  194. sky/logs/agent.py +108 -0
  195. sky/logs/aws.py +243 -0
  196. sky/logs/gcp.py +91 -0
  197. sky/metrics/__init__.py +0 -0
  198. sky/metrics/utils.py +453 -0
  199. sky/models.py +78 -1
  200. sky/optimizer.py +164 -70
  201. sky/provision/__init__.py +90 -4
  202. sky/provision/aws/config.py +147 -26
  203. sky/provision/aws/instance.py +136 -50
  204. sky/provision/azure/instance.py +11 -6
  205. sky/provision/common.py +13 -1
  206. sky/provision/cudo/cudo_machine_type.py +1 -1
  207. sky/provision/cudo/cudo_utils.py +14 -8
  208. sky/provision/cudo/cudo_wrapper.py +72 -71
  209. sky/provision/cudo/instance.py +10 -6
  210. sky/provision/do/instance.py +10 -6
  211. sky/provision/do/utils.py +4 -3
  212. sky/provision/docker_utils.py +140 -33
  213. sky/provision/fluidstack/instance.py +13 -8
  214. sky/provision/gcp/__init__.py +1 -0
  215. sky/provision/gcp/config.py +301 -19
  216. sky/provision/gcp/constants.py +218 -0
  217. sky/provision/gcp/instance.py +36 -8
  218. sky/provision/gcp/instance_utils.py +18 -4
  219. sky/provision/gcp/volume_utils.py +247 -0
  220. sky/provision/hyperbolic/__init__.py +12 -0
  221. sky/provision/hyperbolic/config.py +10 -0
  222. sky/provision/hyperbolic/instance.py +437 -0
  223. sky/provision/hyperbolic/utils.py +373 -0
  224. sky/provision/instance_setup.py +101 -20
  225. sky/provision/kubernetes/__init__.py +5 -0
  226. sky/provision/kubernetes/config.py +9 -52
  227. sky/provision/kubernetes/constants.py +17 -0
  228. sky/provision/kubernetes/instance.py +919 -280
  229. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  230. sky/provision/kubernetes/network.py +27 -17
  231. sky/provision/kubernetes/network_utils.py +44 -43
  232. sky/provision/kubernetes/utils.py +1221 -534
  233. sky/provision/kubernetes/volume.py +343 -0
  234. sky/provision/lambda_cloud/instance.py +22 -16
  235. sky/provision/nebius/constants.py +50 -0
  236. sky/provision/nebius/instance.py +19 -6
  237. sky/provision/nebius/utils.py +237 -137
  238. sky/provision/oci/instance.py +10 -5
  239. sky/provision/paperspace/instance.py +10 -7
  240. sky/provision/paperspace/utils.py +1 -1
  241. sky/provision/primeintellect/__init__.py +10 -0
  242. sky/provision/primeintellect/config.py +11 -0
  243. sky/provision/primeintellect/instance.py +454 -0
  244. sky/provision/primeintellect/utils.py +398 -0
  245. sky/provision/provisioner.py +117 -36
  246. sky/provision/runpod/__init__.py +5 -0
  247. sky/provision/runpod/instance.py +27 -6
  248. sky/provision/runpod/utils.py +51 -18
  249. sky/provision/runpod/volume.py +214 -0
  250. sky/provision/scp/__init__.py +15 -0
  251. sky/provision/scp/config.py +93 -0
  252. sky/provision/scp/instance.py +707 -0
  253. sky/provision/seeweb/__init__.py +11 -0
  254. sky/provision/seeweb/config.py +13 -0
  255. sky/provision/seeweb/instance.py +812 -0
  256. sky/provision/shadeform/__init__.py +11 -0
  257. sky/provision/shadeform/config.py +12 -0
  258. sky/provision/shadeform/instance.py +351 -0
  259. sky/provision/shadeform/shadeform_utils.py +83 -0
  260. sky/provision/ssh/__init__.py +18 -0
  261. sky/provision/vast/instance.py +13 -8
  262. sky/provision/vast/utils.py +10 -7
  263. sky/provision/volume.py +164 -0
  264. sky/provision/vsphere/common/ssl_helper.py +1 -1
  265. sky/provision/vsphere/common/vapiconnect.py +2 -1
  266. sky/provision/vsphere/common/vim_utils.py +4 -4
  267. sky/provision/vsphere/instance.py +15 -10
  268. sky/provision/vsphere/vsphere_utils.py +17 -20
  269. sky/py.typed +0 -0
  270. sky/resources.py +845 -119
  271. sky/schemas/__init__.py +0 -0
  272. sky/schemas/api/__init__.py +0 -0
  273. sky/schemas/api/responses.py +227 -0
  274. sky/schemas/db/README +4 -0
  275. sky/schemas/db/env.py +90 -0
  276. sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
  277. sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
  278. sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
  279. sky/schemas/db/global_user_state/004_is_managed.py +34 -0
  280. sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
  281. sky/schemas/db/global_user_state/006_provision_log.py +41 -0
  282. sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
  283. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  284. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  285. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  286. sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
  287. sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
  288. sky/schemas/db/script.py.mako +28 -0
  289. sky/schemas/db/serve_state/001_initial_schema.py +67 -0
  290. sky/schemas/db/serve_state/002_yaml_content.py +34 -0
  291. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  292. sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
  293. sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
  294. sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
  295. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  296. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  297. sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
  298. sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
  299. sky/schemas/generated/__init__.py +0 -0
  300. sky/schemas/generated/autostopv1_pb2.py +36 -0
  301. sky/schemas/generated/autostopv1_pb2.pyi +43 -0
  302. sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
  303. sky/schemas/generated/jobsv1_pb2.py +86 -0
  304. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  305. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  306. sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
  307. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  308. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  309. sky/schemas/generated/servev1_pb2.py +58 -0
  310. sky/schemas/generated/servev1_pb2.pyi +115 -0
  311. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  312. sky/serve/autoscalers.py +357 -5
  313. sky/serve/client/impl.py +310 -0
  314. sky/serve/client/sdk.py +47 -139
  315. sky/serve/client/sdk_async.py +130 -0
  316. sky/serve/constants.py +12 -9
  317. sky/serve/controller.py +68 -17
  318. sky/serve/load_balancer.py +106 -60
  319. sky/serve/load_balancing_policies.py +116 -2
  320. sky/serve/replica_managers.py +434 -249
  321. sky/serve/serve_rpc_utils.py +179 -0
  322. sky/serve/serve_state.py +569 -257
  323. sky/serve/serve_utils.py +775 -265
  324. sky/serve/server/core.py +66 -711
  325. sky/serve/server/impl.py +1093 -0
  326. sky/serve/server/server.py +21 -18
  327. sky/serve/service.py +192 -89
  328. sky/serve/service_spec.py +144 -20
  329. sky/serve/spot_placer.py +3 -0
  330. sky/server/auth/__init__.py +0 -0
  331. sky/server/auth/authn.py +50 -0
  332. sky/server/auth/loopback.py +38 -0
  333. sky/server/auth/oauth2_proxy.py +202 -0
  334. sky/server/common.py +478 -182
  335. sky/server/config.py +85 -23
  336. sky/server/constants.py +44 -6
  337. sky/server/daemons.py +295 -0
  338. sky/server/html/token_page.html +185 -0
  339. sky/server/metrics.py +160 -0
  340. sky/server/middleware_utils.py +166 -0
  341. sky/server/requests/executor.py +558 -138
  342. sky/server/requests/payloads.py +364 -24
  343. sky/server/requests/preconditions.py +21 -17
  344. sky/server/requests/process.py +112 -29
  345. sky/server/requests/request_names.py +121 -0
  346. sky/server/requests/requests.py +822 -226
  347. sky/server/requests/serializers/decoders.py +82 -31
  348. sky/server/requests/serializers/encoders.py +140 -22
  349. sky/server/requests/threads.py +117 -0
  350. sky/server/rest.py +455 -0
  351. sky/server/server.py +1309 -285
  352. sky/server/state.py +20 -0
  353. sky/server/stream_utils.py +327 -61
  354. sky/server/uvicorn.py +217 -3
  355. sky/server/versions.py +270 -0
  356. sky/setup_files/MANIFEST.in +11 -1
  357. sky/setup_files/alembic.ini +160 -0
  358. sky/setup_files/dependencies.py +139 -31
  359. sky/setup_files/setup.py +44 -42
  360. sky/sky_logging.py +114 -7
  361. sky/skylet/attempt_skylet.py +106 -24
  362. sky/skylet/autostop_lib.py +129 -8
  363. sky/skylet/configs.py +29 -20
  364. sky/skylet/constants.py +216 -25
  365. sky/skylet/events.py +101 -21
  366. sky/skylet/job_lib.py +345 -164
  367. sky/skylet/log_lib.py +297 -18
  368. sky/skylet/log_lib.pyi +44 -1
  369. sky/skylet/providers/ibm/node_provider.py +12 -8
  370. sky/skylet/providers/ibm/vpc_provider.py +13 -12
  371. sky/skylet/ray_patches/__init__.py +17 -3
  372. sky/skylet/ray_patches/autoscaler.py.diff +18 -0
  373. sky/skylet/ray_patches/cli.py.diff +19 -0
  374. sky/skylet/ray_patches/command_runner.py.diff +17 -0
  375. sky/skylet/ray_patches/log_monitor.py.diff +20 -0
  376. sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
  377. sky/skylet/ray_patches/updater.py.diff +18 -0
  378. sky/skylet/ray_patches/worker.py.diff +41 -0
  379. sky/skylet/runtime_utils.py +21 -0
  380. sky/skylet/services.py +568 -0
  381. sky/skylet/skylet.py +72 -4
  382. sky/skylet/subprocess_daemon.py +104 -29
  383. sky/skypilot_config.py +506 -99
  384. sky/ssh_node_pools/__init__.py +1 -0
  385. sky/ssh_node_pools/core.py +135 -0
  386. sky/ssh_node_pools/server.py +233 -0
  387. sky/task.py +685 -163
  388. sky/templates/aws-ray.yml.j2 +11 -3
  389. sky/templates/azure-ray.yml.j2 +2 -1
  390. sky/templates/cudo-ray.yml.j2 +1 -0
  391. sky/templates/do-ray.yml.j2 +2 -1
  392. sky/templates/fluidstack-ray.yml.j2 +1 -0
  393. sky/templates/gcp-ray.yml.j2 +62 -1
  394. sky/templates/hyperbolic-ray.yml.j2 +68 -0
  395. sky/templates/ibm-ray.yml.j2 +2 -1
  396. sky/templates/jobs-controller.yaml.j2 +27 -24
  397. sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
  398. sky/templates/kubernetes-ray.yml.j2 +611 -50
  399. sky/templates/lambda-ray.yml.j2 +2 -1
  400. sky/templates/nebius-ray.yml.j2 +34 -12
  401. sky/templates/oci-ray.yml.j2 +1 -0
  402. sky/templates/paperspace-ray.yml.j2 +2 -1
  403. sky/templates/primeintellect-ray.yml.j2 +72 -0
  404. sky/templates/runpod-ray.yml.j2 +10 -1
  405. sky/templates/scp-ray.yml.j2 +4 -50
  406. sky/templates/seeweb-ray.yml.j2 +171 -0
  407. sky/templates/shadeform-ray.yml.j2 +73 -0
  408. sky/templates/sky-serve-controller.yaml.j2 +22 -2
  409. sky/templates/vast-ray.yml.j2 +1 -0
  410. sky/templates/vsphere-ray.yml.j2 +1 -0
  411. sky/templates/websocket_proxy.py +212 -37
  412. sky/usage/usage_lib.py +31 -15
  413. sky/users/__init__.py +0 -0
  414. sky/users/model.conf +15 -0
  415. sky/users/permission.py +397 -0
  416. sky/users/rbac.py +121 -0
  417. sky/users/server.py +720 -0
  418. sky/users/token_service.py +218 -0
  419. sky/utils/accelerator_registry.py +35 -5
  420. sky/utils/admin_policy_utils.py +84 -38
  421. sky/utils/annotations.py +38 -5
  422. sky/utils/asyncio_utils.py +78 -0
  423. sky/utils/atomic.py +1 -1
  424. sky/utils/auth_utils.py +153 -0
  425. sky/utils/benchmark_utils.py +60 -0
  426. sky/utils/cli_utils/status_utils.py +159 -86
  427. sky/utils/cluster_utils.py +31 -9
  428. sky/utils/command_runner.py +354 -68
  429. sky/utils/command_runner.pyi +93 -3
  430. sky/utils/common.py +35 -8
  431. sky/utils/common_utils.py +314 -91
  432. sky/utils/config_utils.py +74 -5
  433. sky/utils/context.py +403 -0
  434. sky/utils/context_utils.py +242 -0
  435. sky/utils/controller_utils.py +383 -89
  436. sky/utils/dag_utils.py +31 -12
  437. sky/utils/db/__init__.py +0 -0
  438. sky/utils/db/db_utils.py +485 -0
  439. sky/utils/db/kv_cache.py +149 -0
  440. sky/utils/db/migration_utils.py +137 -0
  441. sky/utils/directory_utils.py +12 -0
  442. sky/utils/env_options.py +13 -0
  443. sky/utils/git.py +567 -0
  444. sky/utils/git_clone.sh +460 -0
  445. sky/utils/infra_utils.py +195 -0
  446. sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
  447. sky/utils/kubernetes/config_map_utils.py +133 -0
  448. sky/utils/kubernetes/create_cluster.sh +15 -29
  449. sky/utils/kubernetes/delete_cluster.sh +10 -7
  450. sky/utils/kubernetes/deploy_ssh_node_pools.py +1177 -0
  451. sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
  452. sky/utils/kubernetes/generate_kind_config.py +6 -66
  453. sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
  454. sky/utils/kubernetes/gpu_labeler.py +18 -8
  455. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
  456. sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
  457. sky/utils/kubernetes/kubernetes_deploy_utils.py +284 -114
  458. sky/utils/kubernetes/rsync_helper.sh +11 -3
  459. sky/utils/kubernetes/ssh-tunnel.sh +379 -0
  460. sky/utils/kubernetes/ssh_utils.py +221 -0
  461. sky/utils/kubernetes_enums.py +8 -15
  462. sky/utils/lock_events.py +94 -0
  463. sky/utils/locks.py +416 -0
  464. sky/utils/log_utils.py +82 -107
  465. sky/utils/perf_utils.py +22 -0
  466. sky/utils/resource_checker.py +298 -0
  467. sky/utils/resources_utils.py +249 -32
  468. sky/utils/rich_utils.py +217 -39
  469. sky/utils/schemas.py +955 -160
  470. sky/utils/serialize_utils.py +16 -0
  471. sky/utils/status_lib.py +10 -0
  472. sky/utils/subprocess_utils.py +29 -15
  473. sky/utils/tempstore.py +70 -0
  474. sky/utils/thread_utils.py +91 -0
  475. sky/utils/timeline.py +26 -53
  476. sky/utils/ux_utils.py +84 -15
  477. sky/utils/validator.py +11 -1
  478. sky/utils/volume.py +165 -0
  479. sky/utils/yaml_utils.py +111 -0
  480. sky/volumes/__init__.py +13 -0
  481. sky/volumes/client/__init__.py +0 -0
  482. sky/volumes/client/sdk.py +150 -0
  483. sky/volumes/server/__init__.py +0 -0
  484. sky/volumes/server/core.py +270 -0
  485. sky/volumes/server/server.py +124 -0
  486. sky/volumes/volume.py +215 -0
  487. sky/workspaces/__init__.py +0 -0
  488. sky/workspaces/core.py +655 -0
  489. sky/workspaces/server.py +101 -0
  490. sky/workspaces/utils.py +56 -0
  491. sky_templates/README.md +3 -0
  492. sky_templates/__init__.py +3 -0
  493. sky_templates/ray/__init__.py +0 -0
  494. sky_templates/ray/start_cluster +183 -0
  495. sky_templates/ray/stop_cluster +75 -0
  496. skypilot_nightly-1.0.0.dev20251203.dist-info/METADATA +676 -0
  497. skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
  498. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +1 -1
  499. skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
  500. sky/benchmark/benchmark_state.py +0 -256
  501. sky/benchmark/benchmark_utils.py +0 -641
  502. sky/clouds/service_catalog/constants.py +0 -7
  503. sky/dashboard/out/_next/static/GWvVBSCS7FmUiVmjaL1a7/_buildManifest.js +0 -1
  504. sky/dashboard/out/_next/static/chunks/236-2db3ee3fba33dd9e.js +0 -6
  505. sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
  506. sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
  507. sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
  508. sky/dashboard/out/_next/static/chunks/845-9e60713e0c441abc.js +0 -1
  509. sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
  510. sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
  511. sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
  512. sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
  513. sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
  514. sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
  515. sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
  516. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-6ac338bc2239cb45.js +0 -1
  517. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
  518. sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
  519. sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
  520. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-1c519e1afc523dc9.js +0 -1
  521. sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
  522. sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
  523. sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
  524. sky/jobs/dashboard/dashboard.py +0 -223
  525. sky/jobs/dashboard/static/favicon.ico +0 -0
  526. sky/jobs/dashboard/templates/index.html +0 -831
  527. sky/jobs/server/dashboard_utils.py +0 -69
  528. sky/skylet/providers/scp/__init__.py +0 -2
  529. sky/skylet/providers/scp/config.py +0 -149
  530. sky/skylet/providers/scp/node_provider.py +0 -578
  531. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  532. sky/utils/db_utils.py +0 -100
  533. sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
  534. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  535. skypilot_nightly-1.0.0.dev20250502.dist-info/METADATA +0 -361
  536. skypilot_nightly-1.0.0.dev20250502.dist-info/RECORD +0 -396
  537. skypilot_nightly-1.0.0.dev20250502.dist-info/top_level.txt +0 -1
  538. /sky/{clouds/service_catalog → catalog}/config.py +0 -0
  539. /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
  540. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
  541. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
  542. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
  543. /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
  544. /sky/dashboard/out/_next/static/{GWvVBSCS7FmUiVmjaL1a7 → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
  545. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
  546. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,101 @@
1
+ """REST API for workspace management."""
2
+
3
+ import fastapi
4
+
5
+ from sky.server.requests import executor
6
+ from sky.server.requests import payloads
7
+ from sky.server.requests import request_names
8
+ from sky.server.requests import requests as api_requests
9
+ from sky.workspaces import core
10
+
11
+ router = fastapi.APIRouter()
12
+
13
+
14
@router.get('')
# pylint: disable=redefined-builtin
async def get(request: fastapi.Request) -> None:
    """Schedules a request that fetches the workspace config on the server.

    A GET endpoint has no request body, so the authenticated user's
    environment variables are injected here into a freshly built body
    whenever a user is attached to the request state.
    """
    body_kwargs = {}
    current_user = request.state.auth_user
    if current_user:
        body_kwargs['env_vars'] = current_user.to_env_vars()
    request_body = payloads.RequestBody(**body_kwargs)

    request_id = request.state.request_id
    await executor.schedule_request_async(
        request_id=request_id,
        request_name=request_names.RequestName.WORKSPACES_GET,
        request_body=request_body,
        func=core.get_workspaces,
        schedule_type=api_requests.ScheduleType.SHORT,
    )
33
+
34
+
35
@router.post('/update')
async def update(request: fastapi.Request,
                 update_workspace_body: payloads.UpdateWorkspaceBody) -> None:
    """Schedules an update of a specific workspace configuration.

    The posted body is forwarded unchanged to ``core.update_workspace`` as a
    SHORT scheduled request.
    """
    request_id = request.state.request_id
    await executor.schedule_request_async(
        request_id=request_id,
        request_name=request_names.RequestName.WORKSPACES_UPDATE,
        request_body=update_workspace_body,
        func=core.update_workspace,
        schedule_type=api_requests.ScheduleType.SHORT,
    )
46
+
47
+
48
@router.post('/create')
async def create(request: fastapi.Request,
                 create_workspace_body: payloads.CreateWorkspaceBody) -> None:
    """Schedules the creation of a new workspace configuration.

    The posted body is forwarded unchanged to ``core.create_workspace`` as a
    SHORT scheduled request.
    """
    request_id = request.state.request_id
    await executor.schedule_request_async(
        request_id=request_id,
        request_name=request_names.RequestName.WORKSPACES_CREATE,
        request_body=create_workspace_body,
        func=core.create_workspace,
        schedule_type=api_requests.ScheduleType.SHORT,
    )
59
+
60
+
61
@router.post('/delete')
async def delete(request: fastapi.Request,
                 delete_workspace_body: payloads.DeleteWorkspaceBody) -> None:
    """Schedules the deletion of a workspace configuration.

    The posted body is forwarded unchanged to ``core.delete_workspace`` as a
    SHORT scheduled request.
    """
    request_id = request.state.request_id
    await executor.schedule_request_async(
        request_id=request_id,
        request_name=request_names.RequestName.WORKSPACES_DELETE,
        request_body=delete_workspace_body,
        func=core.delete_workspace,
        schedule_type=api_requests.ScheduleType.SHORT,
    )
72
+
73
+
74
@router.get('/config')
async def get_config(request: fastapi.Request) -> None:
    """Schedules a request that fetches the entire SkyPilot configuration.

    The authenticated user's environment variables are added to the request
    body whenever a user is attached to the request state.
    """
    body_kwargs = {}
    current_user = request.state.auth_user
    if current_user:
        body_kwargs['env_vars'] = current_user.to_env_vars()
    get_config_body = payloads.GetConfigBody(**body_kwargs)

    request_id = request.state.request_id
    await executor.schedule_request_async(
        request_id=request_id,
        request_name=request_names.RequestName.WORKSPACES_GET_CONFIG,
        request_body=get_config_body,
        func=core.get_config,
        schedule_type=api_requests.ScheduleType.SHORT,
    )
89
+
90
+
91
@router.post('/config')
async def update_config(request: fastapi.Request,
                        update_config_body: payloads.UpdateConfigBody) -> None:
    """Schedules an update of the entire SkyPilot configuration.

    The posted body is forwarded unchanged to ``core.update_config`` as a
    SHORT scheduled request.
    """
    request_id = request.state.request_id
    await executor.schedule_request_async(
        request_id=request_id,
        request_name=request_names.RequestName.WORKSPACES_UPDATE_CONFIG,
        request_body=update_config_body,
        func=core.update_config,
        schedule_type=api_requests.ScheduleType.SHORT,
    )
@@ -0,0 +1,56 @@
1
+ """Utils for workspaces."""
2
+ import collections
3
+ from typing import Any, Dict, List
4
+
5
+ from sky import global_user_state
6
+ from sky import sky_logging
7
+
8
+ logger = sky_logging.init_logger(__name__)
9
+
10
+
11
def get_workspace_users(workspace_config: Dict[str, Any]) -> List[str]:
    """Get the users that should have access to a workspace.

    workspace_config is a dict with the following keys:
    - private: bool
    - allowed_users: list of user names or IDs

    This function will automatically resolve the user names to IDs.

    Args:
        workspace_config: The configuration of the workspace.

    Returns:
        List of user IDs that should have access to the workspace.
        For private workspaces, returns the resolved user IDs in the order
        they were listed, without duplicates. Entries that match no known
        user are skipped with a warning.
        For public workspaces, returns ['*'] to indicate all users.

    Raises:
        ValueError: If an entry is a user name shared by more than one user
            ID, so it cannot be resolved unambiguously.
    """
    if not workspace_config.get('private', False):
        # Public workspace - return '*' to indicate all users should have access
        return ['*']

    # `allowed_users` may be present but set to null in the config; treat
    # that the same as an empty list instead of failing on iteration.
    workspace_user_name_or_ids = workspace_config.get('allowed_users') or []

    all_users = global_user_state.get_all_users()
    all_user_ids = {user.id for user in all_users}
    # Several users can share one name, so keep every ID seen per name.
    all_user_map = collections.defaultdict(list)
    for user in all_users:
        all_user_map[user.name].append(user.id)

    user_ids: List[str] = []
    seen_ids = set()
    for user_name_or_id in workspace_user_name_or_ids:
        # An exact ID match takes precedence over a name match.
        if user_name_or_id in all_user_ids:
            user_id = user_name_or_id
        elif user_name_or_id in all_user_map:
            matching_ids = all_user_map[user_name_or_id]
            if len(matching_ids) > 1:
                user_ids_str = ', '.join(matching_ids)
                raise ValueError(
                    f'User {user_name_or_id!r} has multiple IDs: '
                    f'{user_ids_str}. Please specify the user '
                    f'ID instead.')
            user_id = matching_ids[0]
        else:
            logger.warning(
                f'User {user_name_or_id!r} not found in all users')
            continue
        # The same user may be listed twice (e.g. by name and by ID);
        # report each ID only once.
        if user_id not in seen_ids:
            seen_ids.add(user_id)
            user_ids.append(user_id)
    return user_ids
@@ -0,0 +1,3 @@
1
+ # SkyPilot Templates
2
+
3
+ This package contains templates for users to use in their SkyPilot clusters, jobs, and services.
@@ -0,0 +1,3 @@
1
+ """SkyPilot templates."""
2
+
3
+ __version__ = '1.0.0-dev0'
File without changes
@@ -0,0 +1,183 @@
1
#!/bin/bash
# Brings up a user-level Ray cluster on a SkyPilot cluster.
#
# Ray is started on its default ports (6379, 8265), which differ from the
# ports used by SkyPilot's system Ray (6380, 8266). This lets users run their
# own Ray applications independently of SkyPilot's internal Ray cluster.
#
# Environment Variables:
#   RAY_HEAD_PORT=6379                 - Ray head node port
#   RAY_DASHBOARD_PORT=8265            - Ray dashboard port
#   RAY_DASHBOARD_HOST=127.0.0.1       - Dashboard host (set to 0.0.0.0 to expose externally)
#   RAY_DASHBOARD_AGENT_LISTEN_PORT=   - (Optional) Dashboard agent listen port
#   RAY_HEAD_IP_ADDRESS=               - (Optional) Node IP address
#   RAY_CMD=ray                        - (Optional) Command to invoke Ray (e.g., "uv run ray")
#
# Usage:
#   ~/sky_templates/ray/start_cluster
#
#   # With custom configurations
#   export RAY_DASHBOARD_HOST=0.0.0.0
#   export RAY_DASHBOARD_PORT=8280
#   ~/sky_templates/ray/start_cluster
#
#   # With uv
#   export RAY_CMD="uv run ray"
#   ~/sky_templates/ray/start_cluster

# Abort on the first command that fails outside of a conditional.
set -e

# ANSI escape sequences used to colorize status messages (printed via echo -e).
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m' # No Color

# Fill in defaults for every setting the caller left unset or empty.
: "${RAY_HEAD_PORT:=6379}"
: "${RAY_DASHBOARD_PORT:=8265}"
: "${RAY_DASHBOARD_HOST:=127.0.0.1}"
: "${RAY_DASHBOARD_AGENT_LISTEN_PORT:=}"
: "${RAY_HEAD_IP_ADDRESS:=}"
: "${RAY_CMD:=ray}"

# Split the command string into an array so that multi-word commands
# (e.g., "uv run ray") expand into separate arguments later on.
# NOTE: eval interprets RAY_CMD as shell syntax, so RAY_CMD must come from a
# trusted environment.
eval "RAY_CMD_ARR=( ${RAY_CMD} )"

# Runs the configured Ray command, forwarding all arguments unchanged.
run_ray() {
    "${RAY_CMD_ARR[@]}" "$@"
}
52
+
53
# Main flow: make sure Ray is installed, then start this node either as the
# Ray head (node rank 0) or as a worker that joins the head.

# Default the SkyPilot-provided variables so the integer tests below never
# receive an empty string (which makes `[` fail with "integer expression
# expected" and silently take the wrong branch). Without these variables the
# script behaves as a single-node head without GPUs.
SKYPILOT_NODE_RANK=${SKYPILOT_NODE_RANK:-0}
SKYPILOT_NUM_NODES=${SKYPILOT_NUM_NODES:-1}
SKYPILOT_NUM_GPUS_PER_NODE=${SKYPILOT_NUM_GPUS_PER_NODE:-0}

# Seconds to wait for the head node, and for workers to join, before giving up.
TIMEOUT=300

# Blocks until `ray health-check` succeeds against RAY_ADDRESS. Exits the
# script with status 1 once TIMEOUT seconds have elapsed.
wait_for_head() {
    local start_time
    start_time=$(date +%s)
    while ! run_ray health-check --address="${RAY_ADDRESS}" &>/dev/null; do
        if [ "$(( $(date +%s) - start_time ))" -ge "${TIMEOUT}" ]; then
            echo -e "${RED}Timed out waiting for head node. Exiting.${NC}" >&2
            exit 1
        fi
        echo "Head node not healthy yet. Retrying in 1s..."
        sleep 1
    done
}

# Prints the number of nodes reported by `ray list nodes`, ignoring entries
# whose state is DEAD. Returns non-zero if the listing cannot be parsed.
count_live_nodes() {
    run_ray list nodes --format=json \
        | python3 -c 'import sys, json; print(sum(1 for n in json.load(sys.stdin) if str(n.get("state", "")).upper() != "DEAD"))'
}

echo -e "${GREEN}Starting Ray cluster...${NC}"

# Ensure ray[default] is installed (we need [default] to do `ray list nodes`).
# Pin to the existing version if Ray is already installed, to avoid upgrading it.
RAY_VERSION=$(run_ray --version 2>/dev/null | cut -d' ' -f3 || echo "")
if [ -n "${RAY_VERSION}" ]; then
    # Pin to existing version.
    VERSION_SPEC="==${RAY_VERSION}"
else
    echo -e "${YELLOW}Installing ray[default]...${NC}"
    VERSION_SPEC=""
fi

# Pin click<8.3.0 to avoid incompatibility with Ray on Python 3.10
# click 8.3.0 and 8.3.1 breaks Ray CLI due to deepcopy issues with sentinel values
# See: https://github.com/ray-project/ray/issues/56747
# TODO(kevin): Remove this once the issue is fixed in a future click release
# The specs are kept in an array and expanded quoted so that `[default]` is
# never treated as a glob pattern by the shell.
RAY_INSTALL_SPEC=("ray[default]${VERSION_SPEC}" "click<8.3.0")
uv pip install "${RAY_INSTALL_SPEC[@]}" || uv pip install --system "${RAY_INSTALL_SPEC[@]}"

# Verify Ray is working
if ! run_ray --version > /dev/null; then
    echo -e "${RED}Error: Failed to install Ray.${NC}"
    exit 1
fi
echo -e "${GREEN}Ray $(run_ray --version | cut -d' ' -f3) is installed.${NC}"

# The head talks to itself over loopback; workers use the first entry of
# SKYPILOT_NODE_IPS as the head address.
RAY_ADDRESS="127.0.0.1:${RAY_HEAD_PORT}"
if [ "${SKYPILOT_NODE_RANK}" -ne 0 ]; then
    HEAD_IP=$(echo "${SKYPILOT_NODE_IPS}" | head -n1)
    if [ -z "${HEAD_IP}" ]; then
        # Fail fast instead of polling an address without a host until TIMEOUT.
        echo -e "${RED}Error: SKYPILOT_NODE_IPS is empty; cannot locate the head node.${NC}" >&2
        exit 1
    fi
    RAY_ADDRESS="${HEAD_IP}:${RAY_HEAD_PORT}"
fi

# Check if user-space Ray is already running
if run_ray status --address="${RAY_ADDRESS}" &> /dev/null; then
    echo -e "${YELLOW}Ray cluster is already running.${NC}"
    run_ray status --address="${RAY_ADDRESS}"
    exit 0
fi

if [ "${SKYPILOT_NODE_RANK}" -eq 0 ]; then
    echo -e "${GREEN}Starting Ray head node...${NC}"

    # Build the arguments as an array so each option stays a single word.
    RAY_START_ARGS=(
        start --head
        "--port=${RAY_HEAD_PORT}"
        "--dashboard-port=${RAY_DASHBOARD_PORT}"
        "--dashboard-host=${RAY_DASHBOARD_HOST}"
        --disable-usage-stats
        --include-dashboard=True
    )

    # Add --num-gpus only if > 0
    if [ "${SKYPILOT_NUM_GPUS_PER_NODE}" -gt 0 ]; then
        RAY_START_ARGS+=("--num-gpus=${SKYPILOT_NUM_GPUS_PER_NODE}")
    fi

    # Add optional dashboard agent listen port if specified
    if [ -n "${RAY_DASHBOARD_AGENT_LISTEN_PORT}" ]; then
        RAY_START_ARGS+=("--dashboard-agent-listen-port=${RAY_DASHBOARD_AGENT_LISTEN_PORT}")
    fi

    # Add optional node IP address if specified
    if [ -n "${RAY_HEAD_IP_ADDRESS}" ]; then
        RAY_START_ARGS+=("--node-ip-address=${RAY_HEAD_IP_ADDRESS}")
    fi

    run_ray "${RAY_START_ARGS[@]}"

    wait_for_head

    echo -e "${GREEN}Head node started successfully.${NC}"

    # Wait for all worker nodes to join
    if [ "${SKYPILOT_NUM_NODES}" -gt 1 ]; then
        echo "Waiting for all ${SKYPILOT_NUM_NODES} nodes to join..."
        start_time=$(date +%s)
        while true; do
            if [ "$(( $(date +%s) - start_time ))" -ge "${TIMEOUT}" ]; then
                echo -e "${RED}Error: Timeout waiting for nodes.${NC}" >&2
                exit 1
            fi
            # A failed listing counts as "no nodes yet" and is retried until
            # TIMEOUT, rather than aborting the script through `set -e`.
            ready_nodes=$(count_live_nodes) || ready_nodes=0
            ready_nodes=${ready_nodes:-0}
            if [ "${ready_nodes}" -ge "${SKYPILOT_NUM_NODES}" ]; then
                break
            fi
            echo "Waiting... (${ready_nodes} / ${SKYPILOT_NUM_NODES} nodes ready)"
            sleep 5
        done
        echo -e "${GREEN}All ${SKYPILOT_NUM_NODES} nodes have joined.${NC}"
    fi

    # Add sleep to after `ray start` to give ray enough time to daemonize
    sleep 5
else
    echo -e "${GREEN}Starting Ray worker node...${NC}"

    echo "Waiting for head node at ${RAY_ADDRESS}..."
    wait_for_head

    echo -e "${GREEN}Head node is healthy. Starting worker node...${NC}"
    WORKER_ARGS=(start "--address=${RAY_ADDRESS}" --disable-usage-stats)

    # Add --num-gpus only if > 0
    if [ "${SKYPILOT_NUM_GPUS_PER_NODE}" -gt 0 ]; then
        WORKER_ARGS+=("--num-gpus=${SKYPILOT_NUM_GPUS_PER_NODE}")
    fi

    run_ray "${WORKER_ARGS[@]}"

    echo -e "${GREEN}Worker node started successfully.${NC}"

    # Add sleep to after `ray start` to give ray enough time to daemonize
    sleep 5
fi
@@ -0,0 +1,75 @@
1
#!/bin/bash
# Shuts down a user Ray cluster on a SkyPilot cluster.
#
# The target is a Ray cluster running on custom ports (default 6379), which
# is separate from SkyPilot's internal Ray cluster (port 6380).
#
# IMPORTANT: Ray processes are stopped with pkill, NOT with 'ray stop',
# because 'ray stop' can interfere with SkyPilot's internal operations.
#
# Environment Variables:
#   RAY_HEAD_PORT=6379   - Ray head node port to stop
#   RAY_CMD=ray          - (Optional) Command to invoke Ray (e.g., "uv run ray")
#
# Usage:
#   # Stop default Ray cluster (port 6379)
#   ~/sky_templates/ray/stop_ray_cluster.sh
#
#   # Stop Ray cluster on custom port
#   export RAY_HEAD_PORT=6385
#   ~/sky_templates/ray/stop_ray_cluster.sh
#
#   # With uv
#   export RAY_CMD="uv run ray"
#   ~/sky_templates/ray/stop_ray_cluster.sh

# Abort on the first command that fails outside of a conditional.
set -e

# ANSI escape sequences used to colorize status messages (printed via echo -e).
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m' # No Color

# Fill in defaults for every setting the caller left unset or empty.
: "${RAY_HEAD_PORT:=6379}"
: "${RAY_CMD:=ray}"

# Split the command string into an array so that multi-word commands
# (e.g., "uv run ray") expand into separate arguments later on.
# NOTE: eval interprets RAY_CMD as shell syntax, so RAY_CMD must come from a
# trusted environment.
eval "RAY_CMD_ARR=( ${RAY_CMD} )"

# Runs the configured Ray command, forwarding all arguments unchanged.
run_ray() {
    "${RAY_CMD_ARR[@]}" "$@"
}
43
+
44
# Main flow: locate the user Ray cluster, kill its processes on this node,
# then confirm that the cluster no longer responds.

# Default the node rank so the integer test below never receives an empty
# string (which makes `[` print "integer expression expected"). Without a
# rank this node is treated as the head.
SKYPILOT_NODE_RANK=${SKYPILOT_NODE_RANK:-0}

echo -e "${GREEN}Stopping Ray cluster on port ${RAY_HEAD_PORT}...${NC}"

# The head is reached over loopback; other nodes use the first entry of
# SKYPILOT_NODE_IPS as the head address.
RAY_ADDRESS="127.0.0.1:${RAY_HEAD_PORT}"
if [ "${SKYPILOT_NODE_RANK}" -ne 0 ]; then
    HEAD_IP=$(echo "${SKYPILOT_NODE_IPS}" | head -n1)
    RAY_ADDRESS="${HEAD_IP}:${RAY_HEAD_PORT}"
fi

# Check if Ray is running
if ! run_ray status --address="${RAY_ADDRESS}" &> /dev/null; then
    echo -e "${YELLOW}No Ray cluster found running on port ${RAY_HEAD_PORT}.${NC}"
    exit 0
fi

# Use pkill to stop Ray processes instead of 'ray stop'
# This prevents interfering with SkyPilot's internal Ray cluster (port 6380)
echo -e "${YELLOW}Killing Ray processes on port ${RAY_HEAD_PORT}...${NC}"

# Matches any process whose command line contains "ray" followed by
# "=<port>" or ":<port>". `|| true` keeps `set -e` from aborting when pkill
# finds nothing to kill.
pkill -f "ray.*[=:]${RAY_HEAD_PORT}" || true

echo -e "${GREEN}Ray processes killed.${NC}"
# Wait a moment for processes to terminate
sleep 5

# Verify Ray is stopped
if run_ray status --address="${RAY_ADDRESS}" &> /dev/null; then
    echo -e "${RED}Warning: Ray cluster may still be running. Try manually:${NC}"
    echo -e "${RED}  pkill -9 -f 'ray.*[=:]${RAY_HEAD_PORT}'${NC}"
    exit 1
else
    echo -e "${GREEN}Ray cluster successfully stopped.${NC}"
fi