skypilot-nightly 1.0.0.dev20250502__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (546)
  1. sky/__init__.py +22 -6
  2. sky/adaptors/aws.py +81 -16
  3. sky/adaptors/common.py +25 -2
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/gcp.py +11 -0
  7. sky/adaptors/hyperbolic.py +8 -0
  8. sky/adaptors/ibm.py +5 -2
  9. sky/adaptors/kubernetes.py +149 -18
  10. sky/adaptors/nebius.py +173 -30
  11. sky/adaptors/primeintellect.py +1 -0
  12. sky/adaptors/runpod.py +68 -0
  13. sky/adaptors/seeweb.py +183 -0
  14. sky/adaptors/shadeform.py +89 -0
  15. sky/admin_policy.py +187 -4
  16. sky/authentication.py +179 -225
  17. sky/backends/__init__.py +4 -2
  18. sky/backends/backend.py +22 -9
  19. sky/backends/backend_utils.py +1323 -397
  20. sky/backends/cloud_vm_ray_backend.py +1749 -1029
  21. sky/backends/docker_utils.py +1 -1
  22. sky/backends/local_docker_backend.py +11 -6
  23. sky/backends/task_codegen.py +633 -0
  24. sky/backends/wheel_utils.py +55 -9
  25. sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
  26. sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
  27. sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
  28. sky/{clouds/service_catalog → catalog}/common.py +90 -49
  29. sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
  30. sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
  31. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +116 -80
  32. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
  33. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +70 -16
  34. sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
  35. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
  36. sky/catalog/data_fetchers/fetch_nebius.py +338 -0
  37. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  38. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  39. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  40. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
  41. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
  42. sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
  43. sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
  44. sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
  45. sky/catalog/hyperbolic_catalog.py +136 -0
  46. sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
  47. sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
  48. sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
  49. sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
  50. sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
  51. sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
  52. sky/catalog/primeintellect_catalog.py +95 -0
  53. sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
  54. sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
  55. sky/catalog/seeweb_catalog.py +184 -0
  56. sky/catalog/shadeform_catalog.py +165 -0
  57. sky/catalog/ssh_catalog.py +167 -0
  58. sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
  59. sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
  60. sky/check.py +533 -185
  61. sky/cli.py +5 -5975
  62. sky/client/{cli.py → cli/command.py} +2591 -1956
  63. sky/client/cli/deprecation_utils.py +99 -0
  64. sky/client/cli/flags.py +359 -0
  65. sky/client/cli/table_utils.py +322 -0
  66. sky/client/cli/utils.py +79 -0
  67. sky/client/common.py +78 -32
  68. sky/client/oauth.py +82 -0
  69. sky/client/sdk.py +1219 -319
  70. sky/client/sdk_async.py +827 -0
  71. sky/client/service_account_auth.py +47 -0
  72. sky/cloud_stores.py +82 -3
  73. sky/clouds/__init__.py +13 -0
  74. sky/clouds/aws.py +564 -164
  75. sky/clouds/azure.py +105 -83
  76. sky/clouds/cloud.py +140 -40
  77. sky/clouds/cudo.py +68 -50
  78. sky/clouds/do.py +66 -48
  79. sky/clouds/fluidstack.py +63 -44
  80. sky/clouds/gcp.py +339 -110
  81. sky/clouds/hyperbolic.py +293 -0
  82. sky/clouds/ibm.py +70 -49
  83. sky/clouds/kubernetes.py +570 -162
  84. sky/clouds/lambda_cloud.py +74 -54
  85. sky/clouds/nebius.py +210 -81
  86. sky/clouds/oci.py +88 -66
  87. sky/clouds/paperspace.py +61 -44
  88. sky/clouds/primeintellect.py +317 -0
  89. sky/clouds/runpod.py +164 -74
  90. sky/clouds/scp.py +89 -86
  91. sky/clouds/seeweb.py +477 -0
  92. sky/clouds/shadeform.py +400 -0
  93. sky/clouds/ssh.py +263 -0
  94. sky/clouds/utils/aws_utils.py +10 -4
  95. sky/clouds/utils/gcp_utils.py +87 -11
  96. sky/clouds/utils/oci_utils.py +38 -14
  97. sky/clouds/utils/scp_utils.py +231 -167
  98. sky/clouds/vast.py +99 -77
  99. sky/clouds/vsphere.py +51 -40
  100. sky/core.py +375 -173
  101. sky/dag.py +15 -0
  102. sky/dashboard/out/404.html +1 -1
  103. sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
  104. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  105. sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
  106. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
  107. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  108. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
  109. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  110. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
  111. sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
  112. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
  113. sky/dashboard/out/_next/static/chunks/3800-7b45f9fbb6308557.js +1 -0
  114. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
  115. sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
  116. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
  117. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  118. sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
  119. sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
  120. sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
  121. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  122. sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
  123. sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
  124. sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
  125. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  126. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
  127. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
  128. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  129. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
  130. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  131. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  132. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
  133. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  134. sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
  135. sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
  136. sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
  137. sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
  138. sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
  139. sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
  140. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
  141. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
  142. sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
  143. sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
  144. sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
  145. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
  146. sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
  147. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
  148. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
  149. sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
  150. sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
  151. sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
  152. sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
  153. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-84a40f8c7c627fe4.js +1 -0
  154. sky/dashboard/out/_next/static/chunks/pages/workspaces-531b2f8c4bf89f82.js +1 -0
  155. sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
  156. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  157. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  158. sky/dashboard/out/clusters/[cluster].html +1 -1
  159. sky/dashboard/out/clusters.html +1 -1
  160. sky/dashboard/out/config.html +1 -0
  161. sky/dashboard/out/index.html +1 -1
  162. sky/dashboard/out/infra/[context].html +1 -0
  163. sky/dashboard/out/infra.html +1 -0
  164. sky/dashboard/out/jobs/[job].html +1 -1
  165. sky/dashboard/out/jobs/pools/[pool].html +1 -0
  166. sky/dashboard/out/jobs.html +1 -1
  167. sky/dashboard/out/users.html +1 -0
  168. sky/dashboard/out/volumes.html +1 -0
  169. sky/dashboard/out/workspace/new.html +1 -0
  170. sky/dashboard/out/workspaces/[name].html +1 -0
  171. sky/dashboard/out/workspaces.html +1 -0
  172. sky/data/data_utils.py +137 -1
  173. sky/data/mounting_utils.py +269 -84
  174. sky/data/storage.py +1460 -1807
  175. sky/data/storage_utils.py +43 -57
  176. sky/exceptions.py +126 -2
  177. sky/execution.py +216 -63
  178. sky/global_user_state.py +2390 -586
  179. sky/jobs/__init__.py +7 -0
  180. sky/jobs/client/sdk.py +300 -58
  181. sky/jobs/client/sdk_async.py +161 -0
  182. sky/jobs/constants.py +15 -8
  183. sky/jobs/controller.py +848 -275
  184. sky/jobs/file_content_utils.py +128 -0
  185. sky/jobs/log_gc.py +193 -0
  186. sky/jobs/recovery_strategy.py +402 -152
  187. sky/jobs/scheduler.py +314 -189
  188. sky/jobs/server/core.py +836 -255
  189. sky/jobs/server/server.py +156 -115
  190. sky/jobs/server/utils.py +136 -0
  191. sky/jobs/state.py +2109 -706
  192. sky/jobs/utils.py +1306 -215
  193. sky/logs/__init__.py +21 -0
  194. sky/logs/agent.py +108 -0
  195. sky/logs/aws.py +243 -0
  196. sky/logs/gcp.py +91 -0
  197. sky/metrics/__init__.py +0 -0
  198. sky/metrics/utils.py +453 -0
  199. sky/models.py +78 -1
  200. sky/optimizer.py +164 -70
  201. sky/provision/__init__.py +90 -4
  202. sky/provision/aws/config.py +147 -26
  203. sky/provision/aws/instance.py +136 -50
  204. sky/provision/azure/instance.py +11 -6
  205. sky/provision/common.py +13 -1
  206. sky/provision/cudo/cudo_machine_type.py +1 -1
  207. sky/provision/cudo/cudo_utils.py +14 -8
  208. sky/provision/cudo/cudo_wrapper.py +72 -71
  209. sky/provision/cudo/instance.py +10 -6
  210. sky/provision/do/instance.py +10 -6
  211. sky/provision/do/utils.py +4 -3
  212. sky/provision/docker_utils.py +140 -33
  213. sky/provision/fluidstack/instance.py +13 -8
  214. sky/provision/gcp/__init__.py +1 -0
  215. sky/provision/gcp/config.py +301 -19
  216. sky/provision/gcp/constants.py +218 -0
  217. sky/provision/gcp/instance.py +36 -8
  218. sky/provision/gcp/instance_utils.py +18 -4
  219. sky/provision/gcp/volume_utils.py +247 -0
  220. sky/provision/hyperbolic/__init__.py +12 -0
  221. sky/provision/hyperbolic/config.py +10 -0
  222. sky/provision/hyperbolic/instance.py +437 -0
  223. sky/provision/hyperbolic/utils.py +373 -0
  224. sky/provision/instance_setup.py +101 -20
  225. sky/provision/kubernetes/__init__.py +5 -0
  226. sky/provision/kubernetes/config.py +9 -52
  227. sky/provision/kubernetes/constants.py +17 -0
  228. sky/provision/kubernetes/instance.py +919 -280
  229. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  230. sky/provision/kubernetes/network.py +27 -17
  231. sky/provision/kubernetes/network_utils.py +44 -43
  232. sky/provision/kubernetes/utils.py +1221 -534
  233. sky/provision/kubernetes/volume.py +343 -0
  234. sky/provision/lambda_cloud/instance.py +22 -16
  235. sky/provision/nebius/constants.py +50 -0
  236. sky/provision/nebius/instance.py +19 -6
  237. sky/provision/nebius/utils.py +237 -137
  238. sky/provision/oci/instance.py +10 -5
  239. sky/provision/paperspace/instance.py +10 -7
  240. sky/provision/paperspace/utils.py +1 -1
  241. sky/provision/primeintellect/__init__.py +10 -0
  242. sky/provision/primeintellect/config.py +11 -0
  243. sky/provision/primeintellect/instance.py +454 -0
  244. sky/provision/primeintellect/utils.py +398 -0
  245. sky/provision/provisioner.py +117 -36
  246. sky/provision/runpod/__init__.py +5 -0
  247. sky/provision/runpod/instance.py +27 -6
  248. sky/provision/runpod/utils.py +51 -18
  249. sky/provision/runpod/volume.py +214 -0
  250. sky/provision/scp/__init__.py +15 -0
  251. sky/provision/scp/config.py +93 -0
  252. sky/provision/scp/instance.py +707 -0
  253. sky/provision/seeweb/__init__.py +11 -0
  254. sky/provision/seeweb/config.py +13 -0
  255. sky/provision/seeweb/instance.py +812 -0
  256. sky/provision/shadeform/__init__.py +11 -0
  257. sky/provision/shadeform/config.py +12 -0
  258. sky/provision/shadeform/instance.py +351 -0
  259. sky/provision/shadeform/shadeform_utils.py +83 -0
  260. sky/provision/ssh/__init__.py +18 -0
  261. sky/provision/vast/instance.py +13 -8
  262. sky/provision/vast/utils.py +10 -7
  263. sky/provision/volume.py +164 -0
  264. sky/provision/vsphere/common/ssl_helper.py +1 -1
  265. sky/provision/vsphere/common/vapiconnect.py +2 -1
  266. sky/provision/vsphere/common/vim_utils.py +4 -4
  267. sky/provision/vsphere/instance.py +15 -10
  268. sky/provision/vsphere/vsphere_utils.py +17 -20
  269. sky/py.typed +0 -0
  270. sky/resources.py +845 -119
  271. sky/schemas/__init__.py +0 -0
  272. sky/schemas/api/__init__.py +0 -0
  273. sky/schemas/api/responses.py +227 -0
  274. sky/schemas/db/README +4 -0
  275. sky/schemas/db/env.py +90 -0
  276. sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
  277. sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
  278. sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
  279. sky/schemas/db/global_user_state/004_is_managed.py +34 -0
  280. sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
  281. sky/schemas/db/global_user_state/006_provision_log.py +41 -0
  282. sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
  283. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  284. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  285. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  286. sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
  287. sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
  288. sky/schemas/db/script.py.mako +28 -0
  289. sky/schemas/db/serve_state/001_initial_schema.py +67 -0
  290. sky/schemas/db/serve_state/002_yaml_content.py +34 -0
  291. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  292. sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
  293. sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
  294. sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
  295. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  296. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  297. sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
  298. sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
  299. sky/schemas/generated/__init__.py +0 -0
  300. sky/schemas/generated/autostopv1_pb2.py +36 -0
  301. sky/schemas/generated/autostopv1_pb2.pyi +43 -0
  302. sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
  303. sky/schemas/generated/jobsv1_pb2.py +86 -0
  304. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  305. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  306. sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
  307. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  308. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  309. sky/schemas/generated/servev1_pb2.py +58 -0
  310. sky/schemas/generated/servev1_pb2.pyi +115 -0
  311. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  312. sky/serve/autoscalers.py +357 -5
  313. sky/serve/client/impl.py +310 -0
  314. sky/serve/client/sdk.py +47 -139
  315. sky/serve/client/sdk_async.py +130 -0
  316. sky/serve/constants.py +12 -9
  317. sky/serve/controller.py +68 -17
  318. sky/serve/load_balancer.py +106 -60
  319. sky/serve/load_balancing_policies.py +116 -2
  320. sky/serve/replica_managers.py +434 -249
  321. sky/serve/serve_rpc_utils.py +179 -0
  322. sky/serve/serve_state.py +569 -257
  323. sky/serve/serve_utils.py +775 -265
  324. sky/serve/server/core.py +66 -711
  325. sky/serve/server/impl.py +1093 -0
  326. sky/serve/server/server.py +21 -18
  327. sky/serve/service.py +192 -89
  328. sky/serve/service_spec.py +144 -20
  329. sky/serve/spot_placer.py +3 -0
  330. sky/server/auth/__init__.py +0 -0
  331. sky/server/auth/authn.py +50 -0
  332. sky/server/auth/loopback.py +38 -0
  333. sky/server/auth/oauth2_proxy.py +202 -0
  334. sky/server/common.py +478 -182
  335. sky/server/config.py +85 -23
  336. sky/server/constants.py +44 -6
  337. sky/server/daemons.py +295 -0
  338. sky/server/html/token_page.html +185 -0
  339. sky/server/metrics.py +160 -0
  340. sky/server/middleware_utils.py +166 -0
  341. sky/server/requests/executor.py +558 -138
  342. sky/server/requests/payloads.py +364 -24
  343. sky/server/requests/preconditions.py +21 -17
  344. sky/server/requests/process.py +112 -29
  345. sky/server/requests/request_names.py +121 -0
  346. sky/server/requests/requests.py +822 -226
  347. sky/server/requests/serializers/decoders.py +82 -31
  348. sky/server/requests/serializers/encoders.py +140 -22
  349. sky/server/requests/threads.py +117 -0
  350. sky/server/rest.py +455 -0
  351. sky/server/server.py +1309 -285
  352. sky/server/state.py +20 -0
  353. sky/server/stream_utils.py +327 -61
  354. sky/server/uvicorn.py +217 -3
  355. sky/server/versions.py +270 -0
  356. sky/setup_files/MANIFEST.in +11 -1
  357. sky/setup_files/alembic.ini +160 -0
  358. sky/setup_files/dependencies.py +139 -31
  359. sky/setup_files/setup.py +44 -42
  360. sky/sky_logging.py +114 -7
  361. sky/skylet/attempt_skylet.py +106 -24
  362. sky/skylet/autostop_lib.py +129 -8
  363. sky/skylet/configs.py +29 -20
  364. sky/skylet/constants.py +216 -25
  365. sky/skylet/events.py +101 -21
  366. sky/skylet/job_lib.py +345 -164
  367. sky/skylet/log_lib.py +297 -18
  368. sky/skylet/log_lib.pyi +44 -1
  369. sky/skylet/providers/ibm/node_provider.py +12 -8
  370. sky/skylet/providers/ibm/vpc_provider.py +13 -12
  371. sky/skylet/ray_patches/__init__.py +17 -3
  372. sky/skylet/ray_patches/autoscaler.py.diff +18 -0
  373. sky/skylet/ray_patches/cli.py.diff +19 -0
  374. sky/skylet/ray_patches/command_runner.py.diff +17 -0
  375. sky/skylet/ray_patches/log_monitor.py.diff +20 -0
  376. sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
  377. sky/skylet/ray_patches/updater.py.diff +18 -0
  378. sky/skylet/ray_patches/worker.py.diff +41 -0
  379. sky/skylet/runtime_utils.py +21 -0
  380. sky/skylet/services.py +568 -0
  381. sky/skylet/skylet.py +72 -4
  382. sky/skylet/subprocess_daemon.py +104 -29
  383. sky/skypilot_config.py +506 -99
  384. sky/ssh_node_pools/__init__.py +1 -0
  385. sky/ssh_node_pools/core.py +135 -0
  386. sky/ssh_node_pools/server.py +233 -0
  387. sky/task.py +685 -163
  388. sky/templates/aws-ray.yml.j2 +11 -3
  389. sky/templates/azure-ray.yml.j2 +2 -1
  390. sky/templates/cudo-ray.yml.j2 +1 -0
  391. sky/templates/do-ray.yml.j2 +2 -1
  392. sky/templates/fluidstack-ray.yml.j2 +1 -0
  393. sky/templates/gcp-ray.yml.j2 +62 -1
  394. sky/templates/hyperbolic-ray.yml.j2 +68 -0
  395. sky/templates/ibm-ray.yml.j2 +2 -1
  396. sky/templates/jobs-controller.yaml.j2 +27 -24
  397. sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
  398. sky/templates/kubernetes-ray.yml.j2 +611 -50
  399. sky/templates/lambda-ray.yml.j2 +2 -1
  400. sky/templates/nebius-ray.yml.j2 +34 -12
  401. sky/templates/oci-ray.yml.j2 +1 -0
  402. sky/templates/paperspace-ray.yml.j2 +2 -1
  403. sky/templates/primeintellect-ray.yml.j2 +72 -0
  404. sky/templates/runpod-ray.yml.j2 +10 -1
  405. sky/templates/scp-ray.yml.j2 +4 -50
  406. sky/templates/seeweb-ray.yml.j2 +171 -0
  407. sky/templates/shadeform-ray.yml.j2 +73 -0
  408. sky/templates/sky-serve-controller.yaml.j2 +22 -2
  409. sky/templates/vast-ray.yml.j2 +1 -0
  410. sky/templates/vsphere-ray.yml.j2 +1 -0
  411. sky/templates/websocket_proxy.py +212 -37
  412. sky/usage/usage_lib.py +31 -15
  413. sky/users/__init__.py +0 -0
  414. sky/users/model.conf +15 -0
  415. sky/users/permission.py +397 -0
  416. sky/users/rbac.py +121 -0
  417. sky/users/server.py +720 -0
  418. sky/users/token_service.py +218 -0
  419. sky/utils/accelerator_registry.py +35 -5
  420. sky/utils/admin_policy_utils.py +84 -38
  421. sky/utils/annotations.py +38 -5
  422. sky/utils/asyncio_utils.py +78 -0
  423. sky/utils/atomic.py +1 -1
  424. sky/utils/auth_utils.py +153 -0
  425. sky/utils/benchmark_utils.py +60 -0
  426. sky/utils/cli_utils/status_utils.py +159 -86
  427. sky/utils/cluster_utils.py +31 -9
  428. sky/utils/command_runner.py +354 -68
  429. sky/utils/command_runner.pyi +93 -3
  430. sky/utils/common.py +35 -8
  431. sky/utils/common_utils.py +314 -91
  432. sky/utils/config_utils.py +74 -5
  433. sky/utils/context.py +403 -0
  434. sky/utils/context_utils.py +242 -0
  435. sky/utils/controller_utils.py +383 -89
  436. sky/utils/dag_utils.py +31 -12
  437. sky/utils/db/__init__.py +0 -0
  438. sky/utils/db/db_utils.py +485 -0
  439. sky/utils/db/kv_cache.py +149 -0
  440. sky/utils/db/migration_utils.py +137 -0
  441. sky/utils/directory_utils.py +12 -0
  442. sky/utils/env_options.py +13 -0
  443. sky/utils/git.py +567 -0
  444. sky/utils/git_clone.sh +460 -0
  445. sky/utils/infra_utils.py +195 -0
  446. sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
  447. sky/utils/kubernetes/config_map_utils.py +133 -0
  448. sky/utils/kubernetes/create_cluster.sh +15 -29
  449. sky/utils/kubernetes/delete_cluster.sh +10 -7
  450. sky/utils/kubernetes/deploy_ssh_node_pools.py +1177 -0
  451. sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
  452. sky/utils/kubernetes/generate_kind_config.py +6 -66
  453. sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
  454. sky/utils/kubernetes/gpu_labeler.py +18 -8
  455. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
  456. sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
  457. sky/utils/kubernetes/kubernetes_deploy_utils.py +284 -114
  458. sky/utils/kubernetes/rsync_helper.sh +11 -3
  459. sky/utils/kubernetes/ssh-tunnel.sh +379 -0
  460. sky/utils/kubernetes/ssh_utils.py +221 -0
  461. sky/utils/kubernetes_enums.py +8 -15
  462. sky/utils/lock_events.py +94 -0
  463. sky/utils/locks.py +416 -0
  464. sky/utils/log_utils.py +82 -107
  465. sky/utils/perf_utils.py +22 -0
  466. sky/utils/resource_checker.py +298 -0
  467. sky/utils/resources_utils.py +249 -32
  468. sky/utils/rich_utils.py +217 -39
  469. sky/utils/schemas.py +955 -160
  470. sky/utils/serialize_utils.py +16 -0
  471. sky/utils/status_lib.py +10 -0
  472. sky/utils/subprocess_utils.py +29 -15
  473. sky/utils/tempstore.py +70 -0
  474. sky/utils/thread_utils.py +91 -0
  475. sky/utils/timeline.py +26 -53
  476. sky/utils/ux_utils.py +84 -15
  477. sky/utils/validator.py +11 -1
  478. sky/utils/volume.py +165 -0
  479. sky/utils/yaml_utils.py +111 -0
  480. sky/volumes/__init__.py +13 -0
  481. sky/volumes/client/__init__.py +0 -0
  482. sky/volumes/client/sdk.py +150 -0
  483. sky/volumes/server/__init__.py +0 -0
  484. sky/volumes/server/core.py +270 -0
  485. sky/volumes/server/server.py +124 -0
  486. sky/volumes/volume.py +215 -0
  487. sky/workspaces/__init__.py +0 -0
  488. sky/workspaces/core.py +655 -0
  489. sky/workspaces/server.py +101 -0
  490. sky/workspaces/utils.py +56 -0
  491. sky_templates/README.md +3 -0
  492. sky_templates/__init__.py +3 -0
  493. sky_templates/ray/__init__.py +0 -0
  494. sky_templates/ray/start_cluster +183 -0
  495. sky_templates/ray/stop_cluster +75 -0
  496. skypilot_nightly-1.0.0.dev20251203.dist-info/METADATA +676 -0
  497. skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
  498. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +1 -1
  499. skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
  500. sky/benchmark/benchmark_state.py +0 -256
  501. sky/benchmark/benchmark_utils.py +0 -641
  502. sky/clouds/service_catalog/constants.py +0 -7
  503. sky/dashboard/out/_next/static/GWvVBSCS7FmUiVmjaL1a7/_buildManifest.js +0 -1
  504. sky/dashboard/out/_next/static/chunks/236-2db3ee3fba33dd9e.js +0 -6
  505. sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
  506. sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
  507. sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
  508. sky/dashboard/out/_next/static/chunks/845-9e60713e0c441abc.js +0 -1
  509. sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
  510. sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
  511. sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
  512. sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
  513. sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
  514. sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
  515. sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
  516. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-6ac338bc2239cb45.js +0 -1
  517. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
  518. sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
  519. sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
  520. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-1c519e1afc523dc9.js +0 -1
  521. sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
  522. sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
  523. sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
  524. sky/jobs/dashboard/dashboard.py +0 -223
  525. sky/jobs/dashboard/static/favicon.ico +0 -0
  526. sky/jobs/dashboard/templates/index.html +0 -831
  527. sky/jobs/server/dashboard_utils.py +0 -69
  528. sky/skylet/providers/scp/__init__.py +0 -2
  529. sky/skylet/providers/scp/config.py +0 -149
  530. sky/skylet/providers/scp/node_provider.py +0 -578
  531. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  532. sky/utils/db_utils.py +0 -100
  533. sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
  534. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  535. skypilot_nightly-1.0.0.dev20250502.dist-info/METADATA +0 -361
  536. skypilot_nightly-1.0.0.dev20250502.dist-info/RECORD +0 -396
  537. skypilot_nightly-1.0.0.dev20250502.dist-info/top_level.txt +0 -1
  538. /sky/{clouds/service_catalog → catalog}/config.py +0 -0
  539. /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
  540. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
  541. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
  542. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
  543. /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
  544. /sky/dashboard/out/_next/static/{GWvVBSCS7FmUiVmjaL1a7 → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
  545. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
  546. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
sky/execution.py CHANGED
@@ -3,8 +3,9 @@
3
3
  See `Stage` for a Task's life cycle.
4
4
  """
5
5
  import enum
6
+ import logging
6
7
  import typing
7
- from typing import List, Optional, Tuple, Union
8
+ from typing import Callable, List, Optional, Tuple, Union
8
9
 
9
10
  import colorama
10
11
 
@@ -14,7 +15,10 @@ from sky import clouds
14
15
  from sky import global_user_state
15
16
  from sky import optimizer
16
17
  from sky import sky_logging
18
+ from sky import task as task_lib
17
19
  from sky.backends import backend_utils
20
+ from sky.server.requests import request_names
21
+ from sky.skylet import autostop_lib
18
22
  from sky.usage import usage_lib
19
23
  from sky.utils import admin_policy_utils
20
24
  from sky.utils import common
@@ -23,11 +27,13 @@ from sky.utils import dag_utils
23
27
  from sky.utils import resources_utils
24
28
  from sky.utils import rich_utils
25
29
  from sky.utils import status_lib
30
+ from sky.utils import tempstore
26
31
  from sky.utils import timeline
27
32
  from sky.utils import ux_utils
28
33
 
29
34
  if typing.TYPE_CHECKING:
30
35
  import sky
36
+ from sky import resources as resources_lib
31
37
 
32
38
  logger = sky_logging.init_logger(__name__)
33
39
 
@@ -108,16 +114,18 @@ def _execute(
108
114
  stages: Optional[List[Stage]] = None,
109
115
  cluster_name: Optional[str] = None,
110
116
  detach_setup: bool = False,
111
- detach_run: bool = False,
112
117
  idle_minutes_to_autostop: Optional[int] = None,
113
118
  no_setup: bool = False,
114
119
  clone_disk_from: Optional[str] = None,
115
120
  skip_unnecessary_provisioning: bool = False,
121
+ *, #keyword only separator
116
122
  # Internal only:
117
123
  # pylint: disable=invalid-name
124
+ _request_name: request_names.AdminPolicyRequestName,
118
125
  _quiet_optimizer: bool = False,
119
126
  _is_launched_by_jobs_controller: bool = False,
120
127
  _is_launched_by_sky_serve_controller: bool = False,
128
+ job_logger: logging.Logger = logger,
121
129
  ) -> Tuple[Optional[int], Optional[backends.ResourceHandle]]:
122
130
  """Execute an entrypoint.
123
131
 
@@ -152,8 +160,6 @@ def _execute(
152
160
  job itself. You can safely ctrl-c to detach from logging, and it will
153
161
  not interrupt the setup process. To see the logs again after detaching,
154
162
  use `sky logs`. To cancel setup, cancel the job via `sky cancel`.
155
- detach_run: If True, as soon as a job is submitted, return from this
156
- function and do not stream execution logs.
157
163
  idle_minutes_to_autostop: int; if provided, the cluster will be set to
158
164
  autostop after this many minutes of idleness.
159
165
  no_setup: bool; whether to skip setup commands or not when (re-)launching.
@@ -170,26 +176,96 @@ def _execute(
170
176
  handle: Optional[backends.ResourceHandle]; the handle to the cluster. None
171
177
  if dryrun.
172
178
  """
173
-
179
+ if _request_name == request_names.AdminPolicyRequestName.CLUSTER_LAUNCH:
180
+ if _is_launched_by_jobs_controller:
181
+ _request_name = (
182
+ request_names.AdminPolicyRequestName.JOBS_LAUNCH_CLUSTER)
183
+ elif _is_launched_by_sky_serve_controller:
184
+ _request_name = (
185
+ request_names.AdminPolicyRequestName.SERVE_LAUNCH_REPLICA)
174
186
  dag = dag_utils.convert_entrypoint_to_dag(entrypoint)
175
187
  for task in dag.tasks:
176
- if task.storage_mounts is not None:
177
- for storage in task.storage_mounts.values():
178
- # Ensure the storage is constructed.
179
- storage.construct()
180
- dag, _ = admin_policy_utils.apply(
181
- dag,
182
- request_options=admin_policy.RequestOptions(
183
- cluster_name=cluster_name,
184
- idle_minutes_to_autostop=idle_minutes_to_autostop,
185
- down=down,
188
+ for resource in task.resources:
189
+ # For backward compatibility, we need to override the autostop
190
+ # config at server-side for legacy clients. This should be set
191
+ # before admin policy to make the admin policy get the final
192
+ # value of autostop config.
193
+ # TODO(aylei): remove this after we bump the API version.
194
+ resource.override_autostop_config(
195
+ down=down, idle_minutes=idle_minutes_to_autostop)
196
+ if resource.autostop_config is not None:
197
+ down = resource.autostop_config.down
198
+ idle_minutes_to_autostop = resource.autostop_config.idle_minutes
199
+ with admin_policy_utils.apply_and_use_config_in_current_request(
200
+ dag,
201
+ request_name=_request_name,
202
+ request_options=admin_policy.RequestOptions(
203
+ cluster_name=cluster_name,
204
+ idle_minutes_to_autostop=idle_minutes_to_autostop,
205
+ down=down,
206
+ dryrun=dryrun,
207
+ )) as dag:
208
+ dag.resolve_and_validate_volumes()
209
+ if (not _is_launched_by_jobs_controller and
210
+ not _is_launched_by_sky_serve_controller):
211
+ # Only process pre-mount operations on API server.
212
+ dag.pre_mount_volumes()
213
+ for task in dag.tasks:
214
+ if task.storage_mounts is not None:
215
+ for storage in task.storage_mounts.values():
216
+ # Ensure the storage is constructed.
217
+ storage.construct()
218
+ return _execute_dag(
219
+ dag,
186
220
  dryrun=dryrun,
187
- ))
221
+ stream_logs=stream_logs,
222
+ handle=handle,
223
+ backend=backend,
224
+ retry_until_up=retry_until_up,
225
+ optimize_target=optimize_target,
226
+ stages=stages,
227
+ cluster_name=cluster_name,
228
+ detach_setup=detach_setup,
229
+ no_setup=no_setup,
230
+ clone_disk_from=clone_disk_from,
231
+ skip_unnecessary_provisioning=skip_unnecessary_provisioning,
232
+ _quiet_optimizer=_quiet_optimizer,
233
+ _is_launched_by_jobs_controller=_is_launched_by_jobs_controller,
234
+ _is_launched_by_sky_serve_controller=
235
+ _is_launched_by_sky_serve_controller,
236
+ job_logger=job_logger)
237
+
238
+
239
+ def _execute_dag(
240
+ dag: 'sky.Dag',
241
+ dryrun: bool,
242
+ stream_logs: bool,
243
+ handle: Optional[backends.ResourceHandle],
244
+ backend: Optional[backends.Backend],
245
+ retry_until_up: bool,
246
+ optimize_target: common.OptimizeTarget,
247
+ stages: Optional[List[Stage]],
248
+ cluster_name: Optional[str],
249
+ detach_setup: bool,
250
+ no_setup: bool,
251
+ clone_disk_from: Optional[str],
252
+ skip_unnecessary_provisioning: bool,
253
+ # pylint: disable=invalid-name
254
+ _quiet_optimizer: bool,
255
+ _is_launched_by_jobs_controller: bool,
256
+ _is_launched_by_sky_serve_controller: bool,
257
+ job_logger: logging.Logger = logger,
258
+ ) -> Tuple[Optional[int], Optional[backends.ResourceHandle]]:
259
+ """Execute a DAG.
260
+
261
+ This is an internal helper function for _execute() and is expected to be
262
+ called only by _execute().
263
+ """
188
264
  assert len(dag) == 1, f'We support 1 task for now. {dag}'
189
265
  task = dag.tasks[0]
190
266
 
191
267
  if any(r.job_recovery is not None for r in task.resources):
192
- logger.warning(
268
+ job_logger.warning(
193
269
  f'{colorama.Style.DIM}The task has `job_recovery` specified, '
194
270
  'but is launched as an unmanaged job. It will be ignored.'
195
271
  'To enable job recovery, use managed jobs: sky jobs launch.'
@@ -197,8 +273,10 @@ def _execute(
197
273
 
198
274
  cluster_exists = False
199
275
  if cluster_name is not None:
200
- cluster_record = global_user_state.get_cluster_from_name(cluster_name)
201
- cluster_exists = cluster_record is not None
276
+ # We use launched_at to check if the cluster exists, because this
277
+ # db query is faster than get_cluster_from_name.
278
+ cluster_exists = global_user_state.cluster_with_name_exists(
279
+ cluster_name)
202
280
  # TODO(woosuk): If the cluster exists, print a warning that
203
281
  # `cpus` and `memory` are not used as a job scheduling constraint,
204
282
  # unlike `gpus`.
@@ -214,8 +292,7 @@ def _execute(
214
292
  if controller is not None:
215
293
  requested_features.add(
216
294
  clouds.CloudImplementationFeatures.HOST_CONTROLLERS)
217
- if controller_utils.high_availability_specified(cluster_name,
218
- skip_warning=False):
295
+ if controller_utils.high_availability_specified(cluster_name):
219
296
  requested_features.add(clouds.CloudImplementationFeatures.
220
297
  HIGH_AVAILABILITY_CONTROLLERS)
221
298
  # If we provision a cluster that supports high availability
@@ -226,11 +303,43 @@ def _execute(
226
303
  requested_features |= task.get_required_cloud_features()
227
304
 
228
305
  backend = backend if backend is not None else backends.CloudVmRayBackend()
306
+ # Figure out autostop config.
307
+ # Note: Ideally this can happen after provisioning, so we can check the
308
+ # autostop config from the launched resources. Before provisioning,
309
+ # we aren't sure which resources will be launched, and different
310
+ # resources may have different autostop configs.
229
311
  if isinstance(backend, backends.CloudVmRayBackend):
230
- if down and idle_minutes_to_autostop is None:
231
- # Use auto{stop,down} to terminate the cluster after the task is
232
- # done.
233
- idle_minutes_to_autostop = 0
312
+ # No autostop config specified on command line, use the
313
+ # config from resources.
314
+ # TODO(cooperc): This should be done after provisioning, in order to
315
+ # support different autostop configs for different resources.
316
+ # Blockers:
317
+ # - Need autostop config to set requested_features before
318
+ # provisioning.
319
+ # - Need to send info message about idle_minutes_to_autostop==0 here
320
+ # - Need to check if autostop is supported by the backend.
321
+ resources = list(task.resources)
322
+ for resource in resources:
323
+ if resource.autostop_config != resources[0].autostop_config:
324
+ raise ValueError(
325
+ 'All resources must have the same autostop config.')
326
+ resource_autostop_config = resources[0].autostop_config
327
+
328
+ idle_minutes_to_autostop: Optional[int] = None
329
+ down = False
330
+ wait_for: Optional[autostop_lib.AutostopWaitFor] = None
331
+ if resource_autostop_config is not None:
332
+ if resource_autostop_config.enabled:
333
+ idle_minutes_to_autostop = (
334
+ resource_autostop_config.idle_minutes)
335
+ down = resource_autostop_config.down
336
+ wait_for = resource_autostop_config.wait_for
337
+ else:
338
+ # Autostop is explicitly disabled, so cancel it if it's
339
+ # already set.
340
+ assert not resource_autostop_config.enabled
341
+ idle_minutes_to_autostop = -1
342
+ down = False
234
343
  if idle_minutes_to_autostop is not None:
235
344
  if idle_minutes_to_autostop == 0:
236
345
  # idle_minutes_to_autostop=0 can cause the following problem:
@@ -239,10 +348,10 @@ def _execute(
239
348
  # itself have no task running and start the auto{stop,down}
240
349
  # process, before the task is submitted in the EXEC stage.
241
350
  verb = 'torn down' if down else 'stopped'
242
- logger.info(f'{colorama.Style.DIM}The cluster will '
243
- f'be {verb} after 1 minutes of idleness '
244
- '(after all jobs finish).'
245
- f'{colorama.Style.RESET_ALL}')
351
+ job_logger.info(f'{colorama.Style.DIM}The cluster will '
352
+ f'be {verb} after 1 minutes of idleness '
353
+ '(after all jobs finish).'
354
+ f'{colorama.Style.RESET_ALL}')
246
355
  idle_minutes_to_autostop = 1
247
356
  if Stage.DOWN in stages:
248
357
  stages.remove(Stage.DOWN)
@@ -257,27 +366,21 @@ def _execute(
257
366
  # (cloud/resource) to check STOP_SPOT_INSTANCE here. This is checked in
258
367
  # the backend.
259
368
 
260
- elif idle_minutes_to_autostop is not None:
261
- # TODO(zhwu): Autostop is not supported for non-CloudVmRayBackend.
262
- with ux_utils.print_exception_no_traceback():
263
- raise ValueError(
264
- f'Backend {backend.NAME} does not support autostop, please try'
265
- f' {backends.CloudVmRayBackend.NAME}')
266
-
267
369
  if Stage.CLONE_DISK in stages:
268
370
  task = _maybe_clone_disk_from_cluster(clone_disk_from, cluster_name,
269
371
  task)
270
372
 
373
+ is_managed = (_is_launched_by_jobs_controller or
374
+ _is_launched_by_sky_serve_controller)
375
+
271
376
  if not cluster_exists:
272
377
  # If spot is launched on serve or jobs controller, we don't need to
273
378
  # print out the hint.
274
- if (Stage.PROVISION in stages and task.use_spot and
275
- not _is_launched_by_jobs_controller and
276
- not _is_launched_by_sky_serve_controller):
379
+ if (Stage.PROVISION in stages and task.use_spot and not is_managed):
277
380
  yellow = colorama.Fore.YELLOW
278
381
  bold = colorama.Style.BRIGHT
279
382
  reset = colorama.Style.RESET_ALL
280
- logger.info(
383
+ job_logger.info(
281
384
  f'{yellow}Launching a spot job that does not '
282
385
  f'automatically recover from preemptions. To '
283
386
  'get automatic recovery, use managed job instead: '
@@ -296,7 +399,7 @@ def _execute(
296
399
  controller = controller_utils.Controllers.from_name(
297
400
  cluster_name)
298
401
  if controller is not None:
299
- logger.info(
402
+ job_logger.info(
300
403
  f'Choosing resources for {controller.value.name}...'
301
404
  )
302
405
  dag = optimizer.Optimizer.optimize(dag,
@@ -305,6 +408,26 @@ def _execute(
305
408
  task = dag.tasks[0] # Keep: dag may have been deep-copied.
306
409
  assert task.best_resources is not None, task
307
410
 
411
+ # Note on race vs. lock: OPTIMIZE typically runs outside the per-cluster
412
+ # lock. After the backend acquires the lock and refreshes state, the
413
+ # original "do we need to optimize?" decision may be stale (e.g., the
414
+ # cluster just got terminated). To compensate without moving the optimizer
415
+ # into the backend, we inject a small planner the backend can call under
416
+ # the lock only when no reusable snapshot and no caller plan exist.
417
+ planner: Optional[Callable[['sky.Task'], 'resources_lib.Resources']] = None
418
+ if isinstance(backend,
419
+ backends.CloudVmRayBackend) and Stage.OPTIMIZE in stages:
420
+
421
+ def _planner(_t: 'sky.Task'):
422
+ new_dag = optimizer.Optimizer.optimize(dag,
423
+ minimize=optimize_target,
424
+ quiet=_quiet_optimizer)
425
+ new_task = new_dag.tasks[0]
426
+ assert new_task.best_resources is not None, new_task
427
+ return new_task.best_resources.assert_launchable()
428
+
429
+ planner = _planner
430
+
308
431
  backend.register_info(
309
432
  dag=dag,
310
433
  optimize_target=optimize_target,
@@ -312,7 +435,9 @@ def _execute(
312
435
  # That's because we want to do commands in task.setup and task.run again
313
436
  # after K8S pod recovers from a crash.
314
437
  # See `kubernetes-ray.yml.j2` for more details.
315
- dump_final_script=is_controller_high_availability_supported)
438
+ dump_final_script=is_controller_high_availability_supported,
439
+ is_managed=is_managed,
440
+ planner=planner)
316
441
 
317
442
  if task.storage_mounts is not None:
318
443
  # Optimizer should eventually choose where to store bucket
@@ -337,7 +462,7 @@ def _execute(
337
462
  if handle is None:
338
463
  assert dryrun, ('If not dryrun, handle must be set or '
339
464
  'Stage.PROVISION must be included in stages.')
340
- logger.info('Dryrun finished.')
465
+ job_logger.info('Dryrun finished.')
341
466
  return None, None
342
467
 
343
468
  do_workdir = (Stage.SYNC_WORKDIR in stages and not dryrun and
@@ -346,39 +471,52 @@ def _execute(
346
471
  (task.file_mounts is not None or
347
472
  task.storage_mounts is not None))
348
473
  if do_workdir or do_file_mounts:
349
- logger.info(ux_utils.starting_message('Syncing files.'))
474
+ job_logger.info(ux_utils.starting_message('Syncing files.'))
350
475
 
351
476
  if do_workdir:
352
- backend.sync_workdir(handle, task.workdir)
477
+ if cluster_name is not None:
478
+ global_user_state.add_cluster_event(
479
+ cluster_name, status_lib.ClusterStatus.INIT,
480
+ 'Syncing files to cluster',
481
+ global_user_state.ClusterEventType.STATUS_CHANGE)
482
+ envs_and_secrets = task_lib.get_plaintext_envs_and_secrets(
483
+ task.envs_and_secrets)
484
+ backend.sync_workdir(handle, task.workdir, envs_and_secrets)
353
485
 
354
486
  if do_file_mounts:
487
+ if cluster_name is not None:
488
+ global_user_state.add_cluster_event(
489
+ cluster_name, status_lib.ClusterStatus.UP,
490
+ 'Syncing file mounts',
491
+ global_user_state.ClusterEventType.STATUS_CHANGE)
355
492
  backend.sync_file_mounts(handle, task.file_mounts,
356
493
  task.storage_mounts)
357
494
 
358
495
  if no_setup:
359
- logger.info('Setup commands skipped.')
496
+ job_logger.info('Setup commands skipped.')
360
497
  elif Stage.SETUP in stages and not dryrun:
361
498
  if skip_unnecessary_provisioning and provisioning_skipped:
362
- logger.debug('Unnecessary provisioning was skipped, so '
363
- 'skipping setup as well.')
499
+ job_logger.debug('Unnecessary provisioning was skipped, so '
500
+ 'skipping setup as well.')
364
501
  else:
502
+ if cluster_name is not None:
503
+ global_user_state.add_cluster_event(
504
+ cluster_name, status_lib.ClusterStatus.UP,
505
+ 'Running setup commands to install dependencies',
506
+ global_user_state.ClusterEventType.STATUS_CHANGE)
365
507
  backend.setup(handle, task, detach_setup=detach_setup)
366
508
 
367
509
  if Stage.PRE_EXEC in stages and not dryrun:
368
510
  if idle_minutes_to_autostop is not None:
369
511
  assert isinstance(backend, backends.CloudVmRayBackend)
370
512
  assert isinstance(handle, backends.CloudVmRayResourceHandle)
371
- backend.set_autostop(handle,
372
- idle_minutes_to_autostop,
373
- down=down)
513
+ backend.set_autostop(handle, idle_minutes_to_autostop, wait_for,
514
+ down)
374
515
 
375
516
  if Stage.EXEC in stages:
376
517
  try:
377
518
  global_user_state.update_last_use(handle.get_cluster_name())
378
- job_id = backend.execute(handle,
379
- task,
380
- detach_run,
381
- dryrun=dryrun)
519
+ job_id = backend.execute(handle, task, dryrun=dryrun)
382
520
  finally:
383
521
  # Enables post_execute() to be run after KeyboardInterrupt.
384
522
  backend.post_execute(handle, down)
@@ -395,6 +533,9 @@ def _execute(
395
533
 
396
534
  @timeline.event
397
535
  @usage_lib.entrypoint
536
+ # A launch routine will share tempfiles between steps, so we init a tempdir
537
+ # for the launch routine and gc the entire dir after launch.
538
+ @tempstore.with_tempdir
398
539
  def launch(
399
540
  task: Union['sky.Task', 'sky.Dag'],
400
541
  cluster_name: Optional[str] = None,
@@ -408,12 +549,16 @@ def launch(
408
549
  no_setup: bool = False,
409
550
  clone_disk_from: Optional[str] = None,
410
551
  fast: bool = False,
552
+ *, #keyword only separator
411
553
  # Internal only:
412
554
  # pylint: disable=invalid-name
413
555
  _quiet_optimizer: bool = False,
414
556
  _is_launched_by_jobs_controller: bool = False,
415
557
  _is_launched_by_sky_serve_controller: bool = False,
416
558
  _disable_controller_check: bool = False,
559
+ _request_name: request_names.AdminPolicyRequestName = request_names.
560
+ AdminPolicyRequestName.CLUSTER_LAUNCH,
561
+ job_logger: logging.Logger = logger,
417
562
  ) -> Tuple[Optional[int], Optional[backends.ResourceHandle]]:
418
563
  # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
419
564
  """Launches a cluster or task.
@@ -432,7 +577,7 @@ def launch(
432
577
  import sky
433
578
  task = sky.Task(run='echo hello SkyPilot')
434
579
  task.set_resources(
435
- sky.Resources(cloud=sky.AWS(), accelerators='V100:4'))
580
+ sky.Resources(infra='aws', accelerators='V100:4'))
436
581
  sky.launch(task, cluster_name='my-cluster')
437
582
 
438
583
 
@@ -448,13 +593,16 @@ def launch(
448
593
  running/pending jobs are found in the job queue. Setting this
449
594
  flag is equivalent to running
450
595
  ``sky.launch(...)`` and then
451
- ``sky.autostop(idle_minutes=<minutes>)``. If not set, the cluster
452
- will not be autostopped.
596
+ ``sky.autostop(idle_minutes=<minutes>)``. If set, the autostop
597
+ config specified in the task's resources will be overridden by
598
+ this parameter.
453
599
  down: Tear down the cluster after all jobs finish (successfully or
454
600
  abnormally). If --idle-minutes-to-autostop is also set, the
455
601
  cluster will be torn down after the specified idle time.
456
602
  Note that if errors occur during provisioning/data syncing/setting
457
- up, the cluster will not be torn down for debugging purposes.
603
+ up, the cluster will not be torn down for debugging purposes. If
604
+ set, the autostop config specified in the task's resources will be
605
+ overridden by this parameter.
458
606
  dryrun: if True, do not actually launch the cluster.
459
607
  stream_logs: if True, show the logs in the terminal.
460
608
  backend: backend to use. If None, use the default backend
@@ -556,7 +704,6 @@ def launch(
556
704
  # see the setup logs when inspecting the launch process to know
557
705
  # exactly what the job is waiting for.
558
706
  detach_setup = controller_utils.Controllers.from_name(cluster_name) is None
559
-
560
707
  return _execute(
561
708
  entrypoint=entrypoint,
562
709
  dryrun=dryrun,
@@ -569,7 +716,6 @@ def launch(
569
716
  stages=stages,
570
717
  cluster_name=cluster_name,
571
718
  detach_setup=detach_setup,
572
- detach_run=True,
573
719
  idle_minutes_to_autostop=idle_minutes_to_autostop,
574
720
  no_setup=no_setup,
575
721
  clone_disk_from=clone_disk_from,
@@ -578,7 +724,12 @@ def launch(
578
724
  _is_launched_by_jobs_controller=_is_launched_by_jobs_controller,
579
725
  _is_launched_by_sky_serve_controller=
580
726
  _is_launched_by_sky_serve_controller,
581
- )
727
+ _request_name=_request_name,
728
+ job_logger=job_logger)
729
+
730
+
731
+ # needed for backward compatibility. Remove by v0.12.0
732
+ cluster_launch = launch
582
733
 
583
734
 
584
735
  @usage_lib.entrypoint
@@ -589,6 +740,7 @@ def exec( # pylint: disable=redefined-builtin
589
740
  down: bool = False,
590
741
  stream_logs: bool = True,
591
742
  backend: Optional[backends.Backend] = None,
743
+ job_logger: logging.Logger = logger,
592
744
  ) -> Tuple[Optional[int], Optional[backends.ResourceHandle]]:
593
745
  # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
594
746
  """Executes a task on an existing cluster.
@@ -663,5 +815,6 @@ def exec( # pylint: disable=redefined-builtin
663
815
  Stage.EXEC,
664
816
  ],
665
817
  cluster_name=cluster_name,
666
- detach_run=True,
818
+ job_logger=job_logger,
819
+ _request_name=request_names.AdminPolicyRequestName.CLUSTER_EXEC,
667
820
  )