skypilot-nightly 1.0.0.dev20250509__py3-none-any.whl → 1.0.0.dev20251107__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of skypilot-nightly might be problematic. See the package's registry page for more details.

Files changed (512)
  1. sky/__init__.py +22 -6
  2. sky/adaptors/aws.py +25 -7
  3. sky/adaptors/common.py +24 -1
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/hyperbolic.py +8 -0
  7. sky/adaptors/kubernetes.py +149 -18
  8. sky/adaptors/nebius.py +170 -17
  9. sky/adaptors/primeintellect.py +1 -0
  10. sky/adaptors/runpod.py +68 -0
  11. sky/adaptors/seeweb.py +167 -0
  12. sky/adaptors/shadeform.py +89 -0
  13. sky/admin_policy.py +187 -4
  14. sky/authentication.py +179 -225
  15. sky/backends/__init__.py +4 -2
  16. sky/backends/backend.py +22 -9
  17. sky/backends/backend_utils.py +1299 -380
  18. sky/backends/cloud_vm_ray_backend.py +1715 -518
  19. sky/backends/docker_utils.py +1 -1
  20. sky/backends/local_docker_backend.py +11 -6
  21. sky/backends/wheel_utils.py +37 -9
  22. sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
  23. sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
  24. sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
  25. sky/{clouds/service_catalog → catalog}/common.py +89 -48
  26. sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
  27. sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
  28. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +30 -40
  29. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
  30. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +42 -15
  31. sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
  32. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
  33. sky/catalog/data_fetchers/fetch_nebius.py +335 -0
  34. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  35. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  36. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  37. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
  38. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
  39. sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
  40. sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
  41. sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
  42. sky/catalog/hyperbolic_catalog.py +136 -0
  43. sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
  44. sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
  45. sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
  46. sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
  47. sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
  48. sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
  49. sky/catalog/primeintellect_catalog.py +95 -0
  50. sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
  51. sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
  52. sky/catalog/seeweb_catalog.py +184 -0
  53. sky/catalog/shadeform_catalog.py +165 -0
  54. sky/catalog/ssh_catalog.py +167 -0
  55. sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
  56. sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
  57. sky/check.py +491 -203
  58. sky/cli.py +5 -6005
  59. sky/client/{cli.py → cli/command.py} +2477 -1885
  60. sky/client/cli/deprecation_utils.py +99 -0
  61. sky/client/cli/flags.py +359 -0
  62. sky/client/cli/table_utils.py +320 -0
  63. sky/client/common.py +70 -32
  64. sky/client/oauth.py +82 -0
  65. sky/client/sdk.py +1203 -297
  66. sky/client/sdk_async.py +833 -0
  67. sky/client/service_account_auth.py +47 -0
  68. sky/cloud_stores.py +73 -0
  69. sky/clouds/__init__.py +13 -0
  70. sky/clouds/aws.py +358 -93
  71. sky/clouds/azure.py +105 -83
  72. sky/clouds/cloud.py +127 -36
  73. sky/clouds/cudo.py +68 -50
  74. sky/clouds/do.py +66 -48
  75. sky/clouds/fluidstack.py +63 -44
  76. sky/clouds/gcp.py +339 -110
  77. sky/clouds/hyperbolic.py +293 -0
  78. sky/clouds/ibm.py +70 -49
  79. sky/clouds/kubernetes.py +563 -162
  80. sky/clouds/lambda_cloud.py +74 -54
  81. sky/clouds/nebius.py +206 -80
  82. sky/clouds/oci.py +88 -66
  83. sky/clouds/paperspace.py +61 -44
  84. sky/clouds/primeintellect.py +317 -0
  85. sky/clouds/runpod.py +164 -74
  86. sky/clouds/scp.py +89 -83
  87. sky/clouds/seeweb.py +466 -0
  88. sky/clouds/shadeform.py +400 -0
  89. sky/clouds/ssh.py +263 -0
  90. sky/clouds/utils/aws_utils.py +10 -4
  91. sky/clouds/utils/gcp_utils.py +87 -11
  92. sky/clouds/utils/oci_utils.py +38 -14
  93. sky/clouds/utils/scp_utils.py +177 -124
  94. sky/clouds/vast.py +99 -77
  95. sky/clouds/vsphere.py +51 -40
  96. sky/core.py +349 -139
  97. sky/dag.py +15 -0
  98. sky/dashboard/out/404.html +1 -1
  99. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  100. sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
  101. sky/dashboard/out/_next/static/chunks/1871-74503c8e80fd253b.js +6 -0
  102. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  103. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
  104. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  105. sky/dashboard/out/_next/static/chunks/2755.fff53c4a3fcae910.js +26 -0
  106. sky/dashboard/out/_next/static/chunks/3294.72362fa129305b19.js +1 -0
  107. sky/dashboard/out/_next/static/chunks/3785.ad6adaa2a0fa9768.js +1 -0
  108. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
  109. sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
  110. sky/dashboard/out/_next/static/chunks/4725.a830b5c9e7867c92.js +1 -0
  111. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  112. sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
  113. sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
  114. sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
  115. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  116. sky/dashboard/out/_next/static/chunks/6601-06114c982db410b6.js +1 -0
  117. sky/dashboard/out/_next/static/chunks/6856-ef8ba11f96d8c4a3.js +1 -0
  118. sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
  119. sky/dashboard/out/_next/static/chunks/6990-32b6e2d3822301fa.js +1 -0
  120. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  121. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
  122. sky/dashboard/out/_next/static/chunks/7615-3301e838e5f25772.js +1 -0
  123. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  124. sky/dashboard/out/_next/static/chunks/8969-1e4613c651bf4051.js +1 -0
  125. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  126. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  127. sky/dashboard/out/_next/static/chunks/9360.7310982cf5a0dc79.js +31 -0
  128. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  129. sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
  130. sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
  131. sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
  132. sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
  133. sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
  134. sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
  135. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-c736ead69c2d86ec.js +16 -0
  136. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a37d2063af475a1c.js +1 -0
  137. sky/dashboard/out/_next/static/chunks/pages/clusters-d44859594e6f8064.js +1 -0
  138. sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
  139. sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
  140. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
  141. sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
  142. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-5796e8d6aea291a0.js +16 -0
  143. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-6edeb7d06032adfc.js +21 -0
  144. sky/dashboard/out/_next/static/chunks/pages/jobs-479dde13399cf270.js +1 -0
  145. sky/dashboard/out/_next/static/chunks/pages/users-5ab3b907622cf0fe.js +1 -0
  146. sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
  147. sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
  148. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-c5a3eeee1c218af1.js +1 -0
  149. sky/dashboard/out/_next/static/chunks/pages/workspaces-22b23febb3e89ce1.js +1 -0
  150. sky/dashboard/out/_next/static/chunks/webpack-2679be77fc08a2f8.js +1 -0
  151. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  152. sky/dashboard/out/_next/static/zB0ed6ge_W1MDszVHhijS/_buildManifest.js +1 -0
  153. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  154. sky/dashboard/out/clusters/[cluster].html +1 -1
  155. sky/dashboard/out/clusters.html +1 -1
  156. sky/dashboard/out/config.html +1 -0
  157. sky/dashboard/out/index.html +1 -1
  158. sky/dashboard/out/infra/[context].html +1 -0
  159. sky/dashboard/out/infra.html +1 -0
  160. sky/dashboard/out/jobs/[job].html +1 -1
  161. sky/dashboard/out/jobs/pools/[pool].html +1 -0
  162. sky/dashboard/out/jobs.html +1 -1
  163. sky/dashboard/out/users.html +1 -0
  164. sky/dashboard/out/volumes.html +1 -0
  165. sky/dashboard/out/workspace/new.html +1 -0
  166. sky/dashboard/out/workspaces/[name].html +1 -0
  167. sky/dashboard/out/workspaces.html +1 -0
  168. sky/data/data_utils.py +137 -1
  169. sky/data/mounting_utils.py +269 -84
  170. sky/data/storage.py +1451 -1807
  171. sky/data/storage_utils.py +43 -57
  172. sky/exceptions.py +132 -2
  173. sky/execution.py +206 -63
  174. sky/global_user_state.py +2374 -586
  175. sky/jobs/__init__.py +5 -0
  176. sky/jobs/client/sdk.py +242 -65
  177. sky/jobs/client/sdk_async.py +143 -0
  178. sky/jobs/constants.py +9 -8
  179. sky/jobs/controller.py +839 -277
  180. sky/jobs/file_content_utils.py +80 -0
  181. sky/jobs/log_gc.py +201 -0
  182. sky/jobs/recovery_strategy.py +398 -152
  183. sky/jobs/scheduler.py +315 -189
  184. sky/jobs/server/core.py +829 -255
  185. sky/jobs/server/server.py +156 -115
  186. sky/jobs/server/utils.py +136 -0
  187. sky/jobs/state.py +2092 -701
  188. sky/jobs/utils.py +1242 -160
  189. sky/logs/__init__.py +21 -0
  190. sky/logs/agent.py +108 -0
  191. sky/logs/aws.py +243 -0
  192. sky/logs/gcp.py +91 -0
  193. sky/metrics/__init__.py +0 -0
  194. sky/metrics/utils.py +443 -0
  195. sky/models.py +78 -1
  196. sky/optimizer.py +164 -70
  197. sky/provision/__init__.py +90 -4
  198. sky/provision/aws/config.py +147 -26
  199. sky/provision/aws/instance.py +135 -50
  200. sky/provision/azure/instance.py +10 -5
  201. sky/provision/common.py +13 -1
  202. sky/provision/cudo/cudo_machine_type.py +1 -1
  203. sky/provision/cudo/cudo_utils.py +14 -8
  204. sky/provision/cudo/cudo_wrapper.py +72 -71
  205. sky/provision/cudo/instance.py +10 -6
  206. sky/provision/do/instance.py +10 -6
  207. sky/provision/do/utils.py +4 -3
  208. sky/provision/docker_utils.py +114 -23
  209. sky/provision/fluidstack/instance.py +13 -8
  210. sky/provision/gcp/__init__.py +1 -0
  211. sky/provision/gcp/config.py +301 -19
  212. sky/provision/gcp/constants.py +218 -0
  213. sky/provision/gcp/instance.py +36 -8
  214. sky/provision/gcp/instance_utils.py +18 -4
  215. sky/provision/gcp/volume_utils.py +247 -0
  216. sky/provision/hyperbolic/__init__.py +12 -0
  217. sky/provision/hyperbolic/config.py +10 -0
  218. sky/provision/hyperbolic/instance.py +437 -0
  219. sky/provision/hyperbolic/utils.py +373 -0
  220. sky/provision/instance_setup.py +93 -14
  221. sky/provision/kubernetes/__init__.py +5 -0
  222. sky/provision/kubernetes/config.py +9 -52
  223. sky/provision/kubernetes/constants.py +17 -0
  224. sky/provision/kubernetes/instance.py +789 -247
  225. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  226. sky/provision/kubernetes/network.py +27 -17
  227. sky/provision/kubernetes/network_utils.py +40 -43
  228. sky/provision/kubernetes/utils.py +1192 -531
  229. sky/provision/kubernetes/volume.py +282 -0
  230. sky/provision/lambda_cloud/instance.py +22 -16
  231. sky/provision/nebius/constants.py +50 -0
  232. sky/provision/nebius/instance.py +19 -6
  233. sky/provision/nebius/utils.py +196 -91
  234. sky/provision/oci/instance.py +10 -5
  235. sky/provision/paperspace/instance.py +10 -7
  236. sky/provision/paperspace/utils.py +1 -1
  237. sky/provision/primeintellect/__init__.py +10 -0
  238. sky/provision/primeintellect/config.py +11 -0
  239. sky/provision/primeintellect/instance.py +454 -0
  240. sky/provision/primeintellect/utils.py +398 -0
  241. sky/provision/provisioner.py +110 -36
  242. sky/provision/runpod/__init__.py +5 -0
  243. sky/provision/runpod/instance.py +27 -6
  244. sky/provision/runpod/utils.py +51 -18
  245. sky/provision/runpod/volume.py +180 -0
  246. sky/provision/scp/__init__.py +15 -0
  247. sky/provision/scp/config.py +93 -0
  248. sky/provision/scp/instance.py +531 -0
  249. sky/provision/seeweb/__init__.py +11 -0
  250. sky/provision/seeweb/config.py +13 -0
  251. sky/provision/seeweb/instance.py +807 -0
  252. sky/provision/shadeform/__init__.py +11 -0
  253. sky/provision/shadeform/config.py +12 -0
  254. sky/provision/shadeform/instance.py +351 -0
  255. sky/provision/shadeform/shadeform_utils.py +83 -0
  256. sky/provision/ssh/__init__.py +18 -0
  257. sky/provision/vast/instance.py +13 -8
  258. sky/provision/vast/utils.py +10 -7
  259. sky/provision/vsphere/common/vim_utils.py +1 -2
  260. sky/provision/vsphere/instance.py +15 -10
  261. sky/provision/vsphere/vsphere_utils.py +9 -19
  262. sky/py.typed +0 -0
  263. sky/resources.py +844 -118
  264. sky/schemas/__init__.py +0 -0
  265. sky/schemas/api/__init__.py +0 -0
  266. sky/schemas/api/responses.py +225 -0
  267. sky/schemas/db/README +4 -0
  268. sky/schemas/db/env.py +90 -0
  269. sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
  270. sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
  271. sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
  272. sky/schemas/db/global_user_state/004_is_managed.py +34 -0
  273. sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
  274. sky/schemas/db/global_user_state/006_provision_log.py +41 -0
  275. sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
  276. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  277. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  278. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  279. sky/schemas/db/script.py.mako +28 -0
  280. sky/schemas/db/serve_state/001_initial_schema.py +67 -0
  281. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  282. sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
  283. sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
  284. sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
  285. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  286. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  287. sky/schemas/generated/__init__.py +0 -0
  288. sky/schemas/generated/autostopv1_pb2.py +36 -0
  289. sky/schemas/generated/autostopv1_pb2.pyi +43 -0
  290. sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
  291. sky/schemas/generated/jobsv1_pb2.py +86 -0
  292. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  293. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  294. sky/schemas/generated/managed_jobsv1_pb2.py +74 -0
  295. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  296. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  297. sky/schemas/generated/servev1_pb2.py +58 -0
  298. sky/schemas/generated/servev1_pb2.pyi +115 -0
  299. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  300. sky/serve/autoscalers.py +357 -5
  301. sky/serve/client/impl.py +310 -0
  302. sky/serve/client/sdk.py +47 -139
  303. sky/serve/client/sdk_async.py +130 -0
  304. sky/serve/constants.py +10 -8
  305. sky/serve/controller.py +64 -19
  306. sky/serve/load_balancer.py +106 -60
  307. sky/serve/load_balancing_policies.py +115 -1
  308. sky/serve/replica_managers.py +273 -162
  309. sky/serve/serve_rpc_utils.py +179 -0
  310. sky/serve/serve_state.py +554 -251
  311. sky/serve/serve_utils.py +733 -220
  312. sky/serve/server/core.py +66 -711
  313. sky/serve/server/impl.py +1093 -0
  314. sky/serve/server/server.py +21 -18
  315. sky/serve/service.py +133 -48
  316. sky/serve/service_spec.py +135 -16
  317. sky/serve/spot_placer.py +3 -0
  318. sky/server/auth/__init__.py +0 -0
  319. sky/server/auth/authn.py +50 -0
  320. sky/server/auth/loopback.py +38 -0
  321. sky/server/auth/oauth2_proxy.py +200 -0
  322. sky/server/common.py +475 -181
  323. sky/server/config.py +81 -23
  324. sky/server/constants.py +44 -6
  325. sky/server/daemons.py +229 -0
  326. sky/server/html/token_page.html +185 -0
  327. sky/server/metrics.py +160 -0
  328. sky/server/requests/executor.py +528 -138
  329. sky/server/requests/payloads.py +351 -17
  330. sky/server/requests/preconditions.py +21 -17
  331. sky/server/requests/process.py +112 -29
  332. sky/server/requests/request_names.py +120 -0
  333. sky/server/requests/requests.py +817 -224
  334. sky/server/requests/serializers/decoders.py +82 -31
  335. sky/server/requests/serializers/encoders.py +140 -22
  336. sky/server/requests/threads.py +106 -0
  337. sky/server/rest.py +417 -0
  338. sky/server/server.py +1290 -284
  339. sky/server/state.py +20 -0
  340. sky/server/stream_utils.py +345 -57
  341. sky/server/uvicorn.py +217 -3
  342. sky/server/versions.py +270 -0
  343. sky/setup_files/MANIFEST.in +5 -0
  344. sky/setup_files/alembic.ini +156 -0
  345. sky/setup_files/dependencies.py +136 -31
  346. sky/setup_files/setup.py +44 -42
  347. sky/sky_logging.py +102 -5
  348. sky/skylet/attempt_skylet.py +1 -0
  349. sky/skylet/autostop_lib.py +129 -8
  350. sky/skylet/configs.py +27 -20
  351. sky/skylet/constants.py +171 -19
  352. sky/skylet/events.py +105 -21
  353. sky/skylet/job_lib.py +335 -104
  354. sky/skylet/log_lib.py +297 -18
  355. sky/skylet/log_lib.pyi +44 -1
  356. sky/skylet/ray_patches/__init__.py +17 -3
  357. sky/skylet/ray_patches/autoscaler.py.diff +18 -0
  358. sky/skylet/ray_patches/cli.py.diff +19 -0
  359. sky/skylet/ray_patches/command_runner.py.diff +17 -0
  360. sky/skylet/ray_patches/log_monitor.py.diff +20 -0
  361. sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
  362. sky/skylet/ray_patches/updater.py.diff +18 -0
  363. sky/skylet/ray_patches/worker.py.diff +41 -0
  364. sky/skylet/services.py +564 -0
  365. sky/skylet/skylet.py +63 -4
  366. sky/skylet/subprocess_daemon.py +103 -29
  367. sky/skypilot_config.py +506 -99
  368. sky/ssh_node_pools/__init__.py +1 -0
  369. sky/ssh_node_pools/core.py +135 -0
  370. sky/ssh_node_pools/server.py +233 -0
  371. sky/task.py +621 -137
  372. sky/templates/aws-ray.yml.j2 +10 -3
  373. sky/templates/azure-ray.yml.j2 +1 -1
  374. sky/templates/do-ray.yml.j2 +1 -1
  375. sky/templates/gcp-ray.yml.j2 +57 -0
  376. sky/templates/hyperbolic-ray.yml.j2 +67 -0
  377. sky/templates/jobs-controller.yaml.j2 +27 -24
  378. sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
  379. sky/templates/kubernetes-ray.yml.j2 +607 -51
  380. sky/templates/lambda-ray.yml.j2 +1 -1
  381. sky/templates/nebius-ray.yml.j2 +33 -12
  382. sky/templates/paperspace-ray.yml.j2 +1 -1
  383. sky/templates/primeintellect-ray.yml.j2 +71 -0
  384. sky/templates/runpod-ray.yml.j2 +9 -1
  385. sky/templates/scp-ray.yml.j2 +3 -50
  386. sky/templates/seeweb-ray.yml.j2 +108 -0
  387. sky/templates/shadeform-ray.yml.j2 +72 -0
  388. sky/templates/sky-serve-controller.yaml.j2 +22 -2
  389. sky/templates/websocket_proxy.py +178 -18
  390. sky/usage/usage_lib.py +18 -11
  391. sky/users/__init__.py +0 -0
  392. sky/users/model.conf +15 -0
  393. sky/users/permission.py +387 -0
  394. sky/users/rbac.py +121 -0
  395. sky/users/server.py +720 -0
  396. sky/users/token_service.py +218 -0
  397. sky/utils/accelerator_registry.py +34 -5
  398. sky/utils/admin_policy_utils.py +84 -38
  399. sky/utils/annotations.py +16 -5
  400. sky/utils/asyncio_utils.py +78 -0
  401. sky/utils/auth_utils.py +153 -0
  402. sky/utils/benchmark_utils.py +60 -0
  403. sky/utils/cli_utils/status_utils.py +159 -86
  404. sky/utils/cluster_utils.py +31 -9
  405. sky/utils/command_runner.py +354 -68
  406. sky/utils/command_runner.pyi +93 -3
  407. sky/utils/common.py +35 -8
  408. sky/utils/common_utils.py +310 -87
  409. sky/utils/config_utils.py +87 -5
  410. sky/utils/context.py +402 -0
  411. sky/utils/context_utils.py +222 -0
  412. sky/utils/controller_utils.py +264 -89
  413. sky/utils/dag_utils.py +31 -12
  414. sky/utils/db/__init__.py +0 -0
  415. sky/utils/db/db_utils.py +470 -0
  416. sky/utils/db/migration_utils.py +133 -0
  417. sky/utils/directory_utils.py +12 -0
  418. sky/utils/env_options.py +13 -0
  419. sky/utils/git.py +567 -0
  420. sky/utils/git_clone.sh +460 -0
  421. sky/utils/infra_utils.py +195 -0
  422. sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
  423. sky/utils/kubernetes/config_map_utils.py +133 -0
  424. sky/utils/kubernetes/create_cluster.sh +13 -27
  425. sky/utils/kubernetes/delete_cluster.sh +10 -7
  426. sky/utils/kubernetes/deploy_remote_cluster.py +1299 -0
  427. sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
  428. sky/utils/kubernetes/generate_kind_config.py +6 -66
  429. sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
  430. sky/utils/kubernetes/gpu_labeler.py +5 -5
  431. sky/utils/kubernetes/kubernetes_deploy_utils.py +354 -47
  432. sky/utils/kubernetes/ssh-tunnel.sh +379 -0
  433. sky/utils/kubernetes/ssh_utils.py +221 -0
  434. sky/utils/kubernetes_enums.py +8 -15
  435. sky/utils/lock_events.py +94 -0
  436. sky/utils/locks.py +368 -0
  437. sky/utils/log_utils.py +300 -6
  438. sky/utils/perf_utils.py +22 -0
  439. sky/utils/resource_checker.py +298 -0
  440. sky/utils/resources_utils.py +249 -32
  441. sky/utils/rich_utils.py +213 -37
  442. sky/utils/schemas.py +905 -147
  443. sky/utils/serialize_utils.py +16 -0
  444. sky/utils/status_lib.py +10 -0
  445. sky/utils/subprocess_utils.py +38 -15
  446. sky/utils/tempstore.py +70 -0
  447. sky/utils/timeline.py +24 -52
  448. sky/utils/ux_utils.py +84 -15
  449. sky/utils/validator.py +11 -1
  450. sky/utils/volume.py +86 -0
  451. sky/utils/yaml_utils.py +111 -0
  452. sky/volumes/__init__.py +13 -0
  453. sky/volumes/client/__init__.py +0 -0
  454. sky/volumes/client/sdk.py +149 -0
  455. sky/volumes/server/__init__.py +0 -0
  456. sky/volumes/server/core.py +258 -0
  457. sky/volumes/server/server.py +122 -0
  458. sky/volumes/volume.py +212 -0
  459. sky/workspaces/__init__.py +0 -0
  460. sky/workspaces/core.py +655 -0
  461. sky/workspaces/server.py +101 -0
  462. sky/workspaces/utils.py +56 -0
  463. skypilot_nightly-1.0.0.dev20251107.dist-info/METADATA +675 -0
  464. skypilot_nightly-1.0.0.dev20251107.dist-info/RECORD +594 -0
  465. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/WHEEL +1 -1
  466. sky/benchmark/benchmark_state.py +0 -256
  467. sky/benchmark/benchmark_utils.py +0 -641
  468. sky/clouds/service_catalog/constants.py +0 -7
  469. sky/dashboard/out/_next/static/LksQgChY5izXjokL3LcEu/_buildManifest.js +0 -1
  470. sky/dashboard/out/_next/static/chunks/236-f49500b82ad5392d.js +0 -6
  471. sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
  472. sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
  473. sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
  474. sky/dashboard/out/_next/static/chunks/845-0f8017370869e269.js +0 -1
  475. sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
  476. sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
  477. sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
  478. sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
  479. sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
  480. sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
  481. sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
  482. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-e15db85d0ea1fbe1.js +0 -1
  483. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
  484. sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
  485. sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
  486. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-03f279c6741fb48b.js +0 -1
  487. sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
  488. sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
  489. sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
  490. sky/jobs/dashboard/dashboard.py +0 -223
  491. sky/jobs/dashboard/static/favicon.ico +0 -0
  492. sky/jobs/dashboard/templates/index.html +0 -831
  493. sky/jobs/server/dashboard_utils.py +0 -69
  494. sky/skylet/providers/scp/__init__.py +0 -2
  495. sky/skylet/providers/scp/config.py +0 -149
  496. sky/skylet/providers/scp/node_provider.py +0 -578
  497. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  498. sky/utils/db_utils.py +0 -100
  499. sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
  500. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  501. skypilot_nightly-1.0.0.dev20250509.dist-info/METADATA +0 -361
  502. skypilot_nightly-1.0.0.dev20250509.dist-info/RECORD +0 -396
  503. /sky/{clouds/service_catalog → catalog}/config.py +0 -0
  504. /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
  505. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
  506. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
  507. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
  508. /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
  509. /sky/dashboard/out/_next/static/{LksQgChY5izXjokL3LcEu → zB0ed6ge_W1MDszVHhijS}/_ssgManifest.js +0 -0
  510. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/entry_points.txt +0 -0
  511. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/licenses/LICENSE +0 -0
  512. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/top_level.txt +0 -0
sky/execution.py CHANGED
@@ -3,8 +3,9 @@
3
3
  See `Stage` for a Task's life cycle.
4
4
  """
5
5
  import enum
6
+ import logging
6
7
  import typing
7
- from typing import List, Optional, Tuple, Union
8
+ from typing import Callable, List, Optional, Tuple, Union
8
9
 
9
10
  import colorama
10
11
 
@@ -15,6 +16,8 @@ from sky import global_user_state
15
16
  from sky import optimizer
16
17
  from sky import sky_logging
17
18
  from sky.backends import backend_utils
19
+ from sky.server.requests import request_names
20
+ from sky.skylet import autostop_lib
18
21
  from sky.usage import usage_lib
19
22
  from sky.utils import admin_policy_utils
20
23
  from sky.utils import common
@@ -23,11 +26,13 @@ from sky.utils import dag_utils
23
26
  from sky.utils import resources_utils
24
27
  from sky.utils import rich_utils
25
28
  from sky.utils import status_lib
29
+ from sky.utils import tempstore
26
30
  from sky.utils import timeline
27
31
  from sky.utils import ux_utils
28
32
 
29
33
  if typing.TYPE_CHECKING:
30
34
  import sky
35
+ from sky import resources as resources_lib
31
36
 
32
37
  logger = sky_logging.init_logger(__name__)
33
38
 
@@ -108,16 +113,18 @@ def _execute(
108
113
  stages: Optional[List[Stage]] = None,
109
114
  cluster_name: Optional[str] = None,
110
115
  detach_setup: bool = False,
111
- detach_run: bool = False,
112
116
  idle_minutes_to_autostop: Optional[int] = None,
113
117
  no_setup: bool = False,
114
118
  clone_disk_from: Optional[str] = None,
115
119
  skip_unnecessary_provisioning: bool = False,
120
+ *, #keyword only separator
116
121
  # Internal only:
117
122
  # pylint: disable=invalid-name
123
+ _request_name: request_names.AdminPolicyRequestName,
118
124
  _quiet_optimizer: bool = False,
119
125
  _is_launched_by_jobs_controller: bool = False,
120
126
  _is_launched_by_sky_serve_controller: bool = False,
127
+ job_logger: logging.Logger = logger,
121
128
  ) -> Tuple[Optional[int], Optional[backends.ResourceHandle]]:
122
129
  """Execute an entrypoint.
123
130
 
@@ -152,8 +159,6 @@ def _execute(
152
159
  job itself. You can safely ctrl-c to detach from logging, and it will
153
160
  not interrupt the setup process. To see the logs again after detaching,
154
161
  use `sky logs`. To cancel setup, cancel the job via `sky cancel`.
155
- detach_run: If True, as soon as a job is submitted, return from this
156
- function and do not stream execution logs.
157
162
  idle_minutes_to_autostop: int; if provided, the cluster will be set to
158
163
  autostop after this many minutes of idleness.
159
164
  no_setup: bool; whether to skip setup commands or not when (re-)launching.
@@ -170,26 +175,89 @@ def _execute(
170
175
  handle: Optional[backends.ResourceHandle]; the handle to the cluster. None
171
176
  if dryrun.
172
177
  """
173
-
174
178
  dag = dag_utils.convert_entrypoint_to_dag(entrypoint)
175
179
  for task in dag.tasks:
176
- if task.storage_mounts is not None:
177
- for storage in task.storage_mounts.values():
178
- # Ensure the storage is constructed.
179
- storage.construct()
180
- dag, _ = admin_policy_utils.apply(
181
- dag,
182
- request_options=admin_policy.RequestOptions(
183
- cluster_name=cluster_name,
184
- idle_minutes_to_autostop=idle_minutes_to_autostop,
185
- down=down,
180
+ for resource in task.resources:
181
+ # For backward compatibility, we need to override the autostop
182
+ # config at server-side for legacy clients. This should be set
183
+ # before admin policy to make the admin policy get the final
184
+ # value of autostop config.
185
+ # TODO(aylei): remove this after we bump the API version.
186
+ resource.override_autostop_config(
187
+ down=down, idle_minutes=idle_minutes_to_autostop)
188
+ if resource.autostop_config is not None:
189
+ down = resource.autostop_config.down
190
+ idle_minutes_to_autostop = resource.autostop_config.idle_minutes
191
+ with admin_policy_utils.apply_and_use_config_in_current_request(
192
+ dag,
193
+ request_name=_request_name,
194
+ request_options=admin_policy.RequestOptions(
195
+ cluster_name=cluster_name,
196
+ idle_minutes_to_autostop=idle_minutes_to_autostop,
197
+ down=down,
198
+ dryrun=dryrun,
199
+ )) as dag:
200
+ dag.resolve_and_validate_volumes()
201
+ if (not _is_launched_by_jobs_controller and
202
+ not _is_launched_by_sky_serve_controller):
203
+ # Only process pre-mount operations on API server.
204
+ dag.pre_mount_volumes()
205
+ for task in dag.tasks:
206
+ if task.storage_mounts is not None:
207
+ for storage in task.storage_mounts.values():
208
+ # Ensure the storage is constructed.
209
+ storage.construct()
210
+ return _execute_dag(
211
+ dag,
186
212
  dryrun=dryrun,
187
- ))
213
+ stream_logs=stream_logs,
214
+ handle=handle,
215
+ backend=backend,
216
+ retry_until_up=retry_until_up,
217
+ optimize_target=optimize_target,
218
+ stages=stages,
219
+ cluster_name=cluster_name,
220
+ detach_setup=detach_setup,
221
+ no_setup=no_setup,
222
+ clone_disk_from=clone_disk_from,
223
+ skip_unnecessary_provisioning=skip_unnecessary_provisioning,
224
+ _quiet_optimizer=_quiet_optimizer,
225
+ _is_launched_by_jobs_controller=_is_launched_by_jobs_controller,
226
+ _is_launched_by_sky_serve_controller=
227
+ _is_launched_by_sky_serve_controller,
228
+ job_logger=job_logger)
229
+
230
+
231
+ def _execute_dag(
232
+ dag: 'sky.Dag',
233
+ dryrun: bool,
234
+ stream_logs: bool,
235
+ handle: Optional[backends.ResourceHandle],
236
+ backend: Optional[backends.Backend],
237
+ retry_until_up: bool,
238
+ optimize_target: common.OptimizeTarget,
239
+ stages: Optional[List[Stage]],
240
+ cluster_name: Optional[str],
241
+ detach_setup: bool,
242
+ no_setup: bool,
243
+ clone_disk_from: Optional[str],
244
+ skip_unnecessary_provisioning: bool,
245
+ # pylint: disable=invalid-name
246
+ _quiet_optimizer: bool,
247
+ _is_launched_by_jobs_controller: bool,
248
+ _is_launched_by_sky_serve_controller: bool,
249
+ job_logger: logging.Logger = logger,
250
+ ) -> Tuple[Optional[int], Optional[backends.ResourceHandle]]:
251
+ """Execute a DAG.
252
+
253
+ This is an internal helper function for _execute() and is expected to be
254
+ called only by _execute().
255
+ """
188
256
  assert len(dag) == 1, f'We support 1 task for now. {dag}'
189
257
  task = dag.tasks[0]
190
258
 
191
259
  if any(r.job_recovery is not None for r in task.resources):
192
- logger.warning(
260
+ job_logger.warning(
193
261
  f'{colorama.Style.DIM}The task has `job_recovery` specified, '
194
262
  'but is launched as an unmanaged job. It will be ignored.'
195
263
  'To enable job recovery, use managed jobs: sky jobs launch.'
@@ -197,8 +265,10 @@ def _execute(
197
265
 
198
266
  cluster_exists = False
199
267
  if cluster_name is not None:
200
- cluster_record = global_user_state.get_cluster_from_name(cluster_name)
201
- cluster_exists = cluster_record is not None
268
+ # We use launched_at to check if the cluster exists, because this
269
+ # db query is faster than get_cluster_from_name.
270
+ cluster_exists = global_user_state.cluster_with_name_exists(
271
+ cluster_name)
202
272
  # TODO(woosuk): If the cluster exists, print a warning that
203
273
  # `cpus` and `memory` are not used as a job scheduling constraint,
204
274
  # unlike `gpus`.
@@ -214,8 +284,7 @@ def _execute(
214
284
  if controller is not None:
215
285
  requested_features.add(
216
286
  clouds.CloudImplementationFeatures.HOST_CONTROLLERS)
217
- if controller_utils.high_availability_specified(cluster_name,
218
- skip_warning=False):
287
+ if controller_utils.high_availability_specified(cluster_name):
219
288
  requested_features.add(clouds.CloudImplementationFeatures.
220
289
  HIGH_AVAILABILITY_CONTROLLERS)
221
290
  # If we provision a cluster that supports high availability
@@ -226,11 +295,43 @@ def _execute(
226
295
  requested_features |= task.get_required_cloud_features()
227
296
 
228
297
  backend = backend if backend is not None else backends.CloudVmRayBackend()
298
+ # Figure out autostop config.
299
+ # Note: Ideally this can happen after provisioning, so we can check the
300
+ # autostop config from the launched resources. Before provisioning,
301
+ # we aren't sure which resources will be launched, and different
302
+ # resources may have different autostop configs.
229
303
  if isinstance(backend, backends.CloudVmRayBackend):
230
- if down and idle_minutes_to_autostop is None:
231
- # Use auto{stop,down} to terminate the cluster after the task is
232
- # done.
233
- idle_minutes_to_autostop = 0
304
+ # No autostop config specified on command line, use the
305
+ # config from resources.
306
+ # TODO(cooperc): This should be done after provisioning, in order to
307
+ # support different autostop configs for different resources.
308
+ # Blockers:
309
+ # - Need autostop config to set requested_features before
310
+ # provisioning.
311
+ # - Need to send info message about idle_minutes_to_autostop==0 here
312
+ # - Need to check if autostop is supported by the backend.
313
+ resources = list(task.resources)
314
+ for resource in resources:
315
+ if resource.autostop_config != resources[0].autostop_config:
316
+ raise ValueError(
317
+ 'All resources must have the same autostop config.')
318
+ resource_autostop_config = resources[0].autostop_config
319
+
320
+ idle_minutes_to_autostop: Optional[int] = None
321
+ down = False
322
+ wait_for: Optional[autostop_lib.AutostopWaitFor] = None
323
+ if resource_autostop_config is not None:
324
+ if resource_autostop_config.enabled:
325
+ idle_minutes_to_autostop = (
326
+ resource_autostop_config.idle_minutes)
327
+ down = resource_autostop_config.down
328
+ wait_for = resource_autostop_config.wait_for
329
+ else:
330
+ # Autostop is explicitly disabled, so cancel it if it's
331
+ # already set.
332
+ assert not resource_autostop_config.enabled
333
+ idle_minutes_to_autostop = -1
334
+ down = False
234
335
  if idle_minutes_to_autostop is not None:
235
336
  if idle_minutes_to_autostop == 0:
236
337
  # idle_minutes_to_autostop=0 can cause the following problem:
@@ -239,10 +340,10 @@ def _execute(
239
340
  # itself have no task running and start the auto{stop,down}
240
341
  # process, before the task is submitted in the EXEC stage.
241
342
  verb = 'torn down' if down else 'stopped'
242
- logger.info(f'{colorama.Style.DIM}The cluster will '
243
- f'be {verb} after 1 minutes of idleness '
244
- '(after all jobs finish).'
245
- f'{colorama.Style.RESET_ALL}')
343
+ job_logger.info(f'{colorama.Style.DIM}The cluster will '
344
+ f'be {verb} after 1 minutes of idleness '
345
+ '(after all jobs finish).'
346
+ f'{colorama.Style.RESET_ALL}')
246
347
  idle_minutes_to_autostop = 1
247
348
  if Stage.DOWN in stages:
248
349
  stages.remove(Stage.DOWN)
@@ -257,27 +358,21 @@ def _execute(
257
358
  # (cloud/resource) to check STOP_SPOT_INSTANCE here. This is checked in
258
359
  # the backend.
259
360
 
260
- elif idle_minutes_to_autostop is not None:
261
- # TODO(zhwu): Autostop is not supported for non-CloudVmRayBackend.
262
- with ux_utils.print_exception_no_traceback():
263
- raise ValueError(
264
- f'Backend {backend.NAME} does not support autostop, please try'
265
- f' {backends.CloudVmRayBackend.NAME}')
266
-
267
361
  if Stage.CLONE_DISK in stages:
268
362
  task = _maybe_clone_disk_from_cluster(clone_disk_from, cluster_name,
269
363
  task)
270
364
 
365
+ is_managed = (_is_launched_by_jobs_controller or
366
+ _is_launched_by_sky_serve_controller)
367
+
271
368
  if not cluster_exists:
272
369
  # If spot is launched on serve or jobs controller, we don't need to
273
370
  # print out the hint.
274
- if (Stage.PROVISION in stages and task.use_spot and
275
- not _is_launched_by_jobs_controller and
276
- not _is_launched_by_sky_serve_controller):
371
+ if (Stage.PROVISION in stages and task.use_spot and not is_managed):
277
372
  yellow = colorama.Fore.YELLOW
278
373
  bold = colorama.Style.BRIGHT
279
374
  reset = colorama.Style.RESET_ALL
280
- logger.info(
375
+ job_logger.info(
281
376
  f'{yellow}Launching a spot job that does not '
282
377
  f'automatically recover from preemptions. To '
283
378
  'get automatic recovery, use managed job instead: '
@@ -296,7 +391,7 @@ def _execute(
296
391
  controller = controller_utils.Controllers.from_name(
297
392
  cluster_name)
298
393
  if controller is not None:
299
- logger.info(
394
+ job_logger.info(
300
395
  f'Choosing resources for {controller.value.name}...'
301
396
  )
302
397
  dag = optimizer.Optimizer.optimize(dag,
@@ -305,6 +400,26 @@ def _execute(
305
400
  task = dag.tasks[0] # Keep: dag may have been deep-copied.
306
401
  assert task.best_resources is not None, task
307
402
 
403
+ # Note on race vs. lock: OPTIMIZE typically runs outside the per-cluster
404
+ # lock. After the backend acquires the lock and refreshes state, the
405
+ # original "do we need to optimize?" decision may be stale (e.g., the
406
+ # cluster just got terminated). To compensate without moving the optimizer
407
+ # into the backend, we inject a small planner the backend can call under
408
+ # the lock only when no reusable snapshot and no caller plan exist.
409
+ planner: Optional[Callable[['sky.Task'], 'resources_lib.Resources']] = None
410
+ if isinstance(backend,
411
+ backends.CloudVmRayBackend) and Stage.OPTIMIZE in stages:
412
+
413
+ def _planner(_t: 'sky.Task'):
414
+ new_dag = optimizer.Optimizer.optimize(dag,
415
+ minimize=optimize_target,
416
+ quiet=_quiet_optimizer)
417
+ new_task = new_dag.tasks[0]
418
+ assert new_task.best_resources is not None, new_task
419
+ return new_task.best_resources.assert_launchable()
420
+
421
+ planner = _planner
422
+
308
423
  backend.register_info(
309
424
  dag=dag,
310
425
  optimize_target=optimize_target,
@@ -312,7 +427,9 @@ def _execute(
312
427
  # That's because we want to do commands in task.setup and task.run again
313
428
  # after K8S pod recovers from a crash.
314
429
  # See `kubernetes-ray.yml.j2` for more details.
315
- dump_final_script=is_controller_high_availability_supported)
430
+ dump_final_script=is_controller_high_availability_supported,
431
+ is_managed=is_managed,
432
+ planner=planner)
316
433
 
317
434
  if task.storage_mounts is not None:
318
435
  # Optimizer should eventually choose where to store bucket
@@ -337,7 +454,7 @@ def _execute(
337
454
  if handle is None:
338
455
  assert dryrun, ('If not dryrun, handle must be set or '
339
456
  'Stage.PROVISION must be included in stages.')
340
- logger.info('Dryrun finished.')
457
+ job_logger.info('Dryrun finished.')
341
458
  return None, None
342
459
 
343
460
  do_workdir = (Stage.SYNC_WORKDIR in stages and not dryrun and
@@ -346,39 +463,50 @@ def _execute(
346
463
  (task.file_mounts is not None or
347
464
  task.storage_mounts is not None))
348
465
  if do_workdir or do_file_mounts:
349
- logger.info(ux_utils.starting_message('Syncing files.'))
466
+ job_logger.info(ux_utils.starting_message('Syncing files.'))
350
467
 
351
468
  if do_workdir:
352
- backend.sync_workdir(handle, task.workdir)
469
+ if cluster_name is not None:
470
+ global_user_state.add_cluster_event(
471
+ cluster_name, status_lib.ClusterStatus.INIT,
472
+ 'Syncing files to cluster',
473
+ global_user_state.ClusterEventType.STATUS_CHANGE)
474
+ backend.sync_workdir(handle, task.workdir, task.envs_and_secrets)
353
475
 
354
476
  if do_file_mounts:
477
+ if cluster_name is not None:
478
+ global_user_state.add_cluster_event(
479
+ cluster_name, status_lib.ClusterStatus.UP,
480
+ 'Syncing file mounts',
481
+ global_user_state.ClusterEventType.STATUS_CHANGE)
355
482
  backend.sync_file_mounts(handle, task.file_mounts,
356
483
  task.storage_mounts)
357
484
 
358
485
  if no_setup:
359
- logger.info('Setup commands skipped.')
486
+ job_logger.info('Setup commands skipped.')
360
487
  elif Stage.SETUP in stages and not dryrun:
361
488
  if skip_unnecessary_provisioning and provisioning_skipped:
362
- logger.debug('Unnecessary provisioning was skipped, so '
363
- 'skipping setup as well.')
489
+ job_logger.debug('Unnecessary provisioning was skipped, so '
490
+ 'skipping setup as well.')
364
491
  else:
492
+ if cluster_name is not None:
493
+ global_user_state.add_cluster_event(
494
+ cluster_name, status_lib.ClusterStatus.UP,
495
+ 'Running setup commands to install dependencies',
496
+ global_user_state.ClusterEventType.STATUS_CHANGE)
365
497
  backend.setup(handle, task, detach_setup=detach_setup)
366
498
 
367
499
  if Stage.PRE_EXEC in stages and not dryrun:
368
500
  if idle_minutes_to_autostop is not None:
369
501
  assert isinstance(backend, backends.CloudVmRayBackend)
370
502
  assert isinstance(handle, backends.CloudVmRayResourceHandle)
371
- backend.set_autostop(handle,
372
- idle_minutes_to_autostop,
373
- down=down)
503
+ backend.set_autostop(handle, idle_minutes_to_autostop, wait_for,
504
+ down)
374
505
 
375
506
  if Stage.EXEC in stages:
376
507
  try:
377
508
  global_user_state.update_last_use(handle.get_cluster_name())
378
- job_id = backend.execute(handle,
379
- task,
380
- detach_run,
381
- dryrun=dryrun)
509
+ job_id = backend.execute(handle, task, dryrun=dryrun)
382
510
  finally:
383
511
  # Enables post_execute() to be run after KeyboardInterrupt.
384
512
  backend.post_execute(handle, down)
@@ -395,6 +523,9 @@ def _execute(
395
523
 
396
524
  @timeline.event
397
525
  @usage_lib.entrypoint
526
+ # A launch routine will share tempfiles between steps, so we init a tempdir
527
+ # for the launch routine and gc the entire dir after launch.
528
+ @tempstore.with_tempdir
398
529
  def launch(
399
530
  task: Union['sky.Task', 'sky.Dag'],
400
531
  cluster_name: Optional[str] = None,
@@ -408,12 +539,16 @@ def launch(
408
539
  no_setup: bool = False,
409
540
  clone_disk_from: Optional[str] = None,
410
541
  fast: bool = False,
542
+ *, #keyword only separator
411
543
  # Internal only:
412
544
  # pylint: disable=invalid-name
413
545
  _quiet_optimizer: bool = False,
414
546
  _is_launched_by_jobs_controller: bool = False,
415
547
  _is_launched_by_sky_serve_controller: bool = False,
416
548
  _disable_controller_check: bool = False,
549
+ _request_name: request_names.AdminPolicyRequestName = request_names.
550
+ AdminPolicyRequestName.CLUSTER_LAUNCH,
551
+ job_logger: logging.Logger = logger,
417
552
  ) -> Tuple[Optional[int], Optional[backends.ResourceHandle]]:
418
553
  # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
419
554
  """Launches a cluster or task.
@@ -432,7 +567,7 @@ def launch(
432
567
  import sky
433
568
  task = sky.Task(run='echo hello SkyPilot')
434
569
  task.set_resources(
435
- sky.Resources(cloud=sky.AWS(), accelerators='V100:4'))
570
+ sky.Resources(infra='aws', accelerators='V100:4'))
436
571
  sky.launch(task, cluster_name='my-cluster')
437
572
 
438
573
 
@@ -448,13 +583,16 @@ def launch(
448
583
  running/pending jobs are found in the job queue. Setting this
449
584
  flag is equivalent to running
450
585
  ``sky.launch(...)`` and then
451
- ``sky.autostop(idle_minutes=<minutes>)``. If not set, the cluster
452
- will not be autostopped.
586
+ ``sky.autostop(idle_minutes=<minutes>)``. If set, the autostop
587
+ config specified in the task' resources will be overridden by
588
+ this parameter.
453
589
  down: Tear down the cluster after all jobs finish (successfully or
454
590
  abnormally). If --idle-minutes-to-autostop is also set, the
455
591
  cluster will be torn down after the specified idle time.
456
592
  Note that if errors occur during provisioning/data syncing/setting
457
- up, the cluster will not be torn down for debugging purposes.
593
+ up, the cluster will not be torn down for debugging purposes. If
594
+ set, the autostop config specified in the task's resources will be
595
+ overridden by this parameter.
458
596
  dryrun: if True, do not actually launch the cluster.
459
597
  stream_logs: if True, show the logs in the terminal.
460
598
  backend: backend to use. If None, use the default backend
@@ -556,7 +694,6 @@ def launch(
556
694
  # see the setup logs when inspecting the launch process to know
557
695
  # exactly what the job is waiting for.
558
696
  detach_setup = controller_utils.Controllers.from_name(cluster_name) is None
559
-
560
697
  return _execute(
561
698
  entrypoint=entrypoint,
562
699
  dryrun=dryrun,
@@ -569,7 +706,6 @@ def launch(
569
706
  stages=stages,
570
707
  cluster_name=cluster_name,
571
708
  detach_setup=detach_setup,
572
- detach_run=True,
573
709
  idle_minutes_to_autostop=idle_minutes_to_autostop,
574
710
  no_setup=no_setup,
575
711
  clone_disk_from=clone_disk_from,
@@ -578,7 +714,12 @@ def launch(
578
714
  _is_launched_by_jobs_controller=_is_launched_by_jobs_controller,
579
715
  _is_launched_by_sky_serve_controller=
580
716
  _is_launched_by_sky_serve_controller,
581
- )
717
+ _request_name=_request_name,
718
+ job_logger=job_logger)
719
+
720
+
721
+ # needed for backward compatibility. Remove by v0.10.7 or v0.11.0
722
+ cluster_launch = launch
582
723
 
583
724
 
584
725
  @usage_lib.entrypoint
@@ -589,6 +730,7 @@ def exec( # pylint: disable=redefined-builtin
589
730
  down: bool = False,
590
731
  stream_logs: bool = True,
591
732
  backend: Optional[backends.Backend] = None,
733
+ job_logger: logging.Logger = logger,
592
734
  ) -> Tuple[Optional[int], Optional[backends.ResourceHandle]]:
593
735
  # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
594
736
  """Executes a task on an existing cluster.
@@ -663,5 +805,6 @@ def exec( # pylint: disable=redefined-builtin
663
805
  Stage.EXEC,
664
806
  ],
665
807
  cluster_name=cluster_name,
666
- detach_run=True,
808
+ job_logger=job_logger,
809
+ _request_name=request_names.AdminPolicyRequestName.CLUSTER_EXEC,
667
810
  )