skypilot-nightly 1.0.0.dev20250509__py3-none-any.whl → 1.0.0.dev20251107__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of skypilot-nightly might be problematic. Click here for more details.

Files changed (512)
  1. sky/__init__.py +22 -6
  2. sky/adaptors/aws.py +25 -7
  3. sky/adaptors/common.py +24 -1
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/hyperbolic.py +8 -0
  7. sky/adaptors/kubernetes.py +149 -18
  8. sky/adaptors/nebius.py +170 -17
  9. sky/adaptors/primeintellect.py +1 -0
  10. sky/adaptors/runpod.py +68 -0
  11. sky/adaptors/seeweb.py +167 -0
  12. sky/adaptors/shadeform.py +89 -0
  13. sky/admin_policy.py +187 -4
  14. sky/authentication.py +179 -225
  15. sky/backends/__init__.py +4 -2
  16. sky/backends/backend.py +22 -9
  17. sky/backends/backend_utils.py +1299 -380
  18. sky/backends/cloud_vm_ray_backend.py +1715 -518
  19. sky/backends/docker_utils.py +1 -1
  20. sky/backends/local_docker_backend.py +11 -6
  21. sky/backends/wheel_utils.py +37 -9
  22. sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
  23. sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
  24. sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
  25. sky/{clouds/service_catalog → catalog}/common.py +89 -48
  26. sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
  27. sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
  28. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +30 -40
  29. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
  30. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +42 -15
  31. sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
  32. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
  33. sky/catalog/data_fetchers/fetch_nebius.py +335 -0
  34. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  35. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  36. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  37. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
  38. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
  39. sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
  40. sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
  41. sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
  42. sky/catalog/hyperbolic_catalog.py +136 -0
  43. sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
  44. sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
  45. sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
  46. sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
  47. sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
  48. sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
  49. sky/catalog/primeintellect_catalog.py +95 -0
  50. sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
  51. sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
  52. sky/catalog/seeweb_catalog.py +184 -0
  53. sky/catalog/shadeform_catalog.py +165 -0
  54. sky/catalog/ssh_catalog.py +167 -0
  55. sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
  56. sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
  57. sky/check.py +491 -203
  58. sky/cli.py +5 -6005
  59. sky/client/{cli.py → cli/command.py} +2477 -1885
  60. sky/client/cli/deprecation_utils.py +99 -0
  61. sky/client/cli/flags.py +359 -0
  62. sky/client/cli/table_utils.py +320 -0
  63. sky/client/common.py +70 -32
  64. sky/client/oauth.py +82 -0
  65. sky/client/sdk.py +1203 -297
  66. sky/client/sdk_async.py +833 -0
  67. sky/client/service_account_auth.py +47 -0
  68. sky/cloud_stores.py +73 -0
  69. sky/clouds/__init__.py +13 -0
  70. sky/clouds/aws.py +358 -93
  71. sky/clouds/azure.py +105 -83
  72. sky/clouds/cloud.py +127 -36
  73. sky/clouds/cudo.py +68 -50
  74. sky/clouds/do.py +66 -48
  75. sky/clouds/fluidstack.py +63 -44
  76. sky/clouds/gcp.py +339 -110
  77. sky/clouds/hyperbolic.py +293 -0
  78. sky/clouds/ibm.py +70 -49
  79. sky/clouds/kubernetes.py +563 -162
  80. sky/clouds/lambda_cloud.py +74 -54
  81. sky/clouds/nebius.py +206 -80
  82. sky/clouds/oci.py +88 -66
  83. sky/clouds/paperspace.py +61 -44
  84. sky/clouds/primeintellect.py +317 -0
  85. sky/clouds/runpod.py +164 -74
  86. sky/clouds/scp.py +89 -83
  87. sky/clouds/seeweb.py +466 -0
  88. sky/clouds/shadeform.py +400 -0
  89. sky/clouds/ssh.py +263 -0
  90. sky/clouds/utils/aws_utils.py +10 -4
  91. sky/clouds/utils/gcp_utils.py +87 -11
  92. sky/clouds/utils/oci_utils.py +38 -14
  93. sky/clouds/utils/scp_utils.py +177 -124
  94. sky/clouds/vast.py +99 -77
  95. sky/clouds/vsphere.py +51 -40
  96. sky/core.py +349 -139
  97. sky/dag.py +15 -0
  98. sky/dashboard/out/404.html +1 -1
  99. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  100. sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
  101. sky/dashboard/out/_next/static/chunks/1871-74503c8e80fd253b.js +6 -0
  102. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  103. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
  104. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  105. sky/dashboard/out/_next/static/chunks/2755.fff53c4a3fcae910.js +26 -0
  106. sky/dashboard/out/_next/static/chunks/3294.72362fa129305b19.js +1 -0
  107. sky/dashboard/out/_next/static/chunks/3785.ad6adaa2a0fa9768.js +1 -0
  108. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
  109. sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
  110. sky/dashboard/out/_next/static/chunks/4725.a830b5c9e7867c92.js +1 -0
  111. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  112. sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
  113. sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
  114. sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
  115. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  116. sky/dashboard/out/_next/static/chunks/6601-06114c982db410b6.js +1 -0
  117. sky/dashboard/out/_next/static/chunks/6856-ef8ba11f96d8c4a3.js +1 -0
  118. sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
  119. sky/dashboard/out/_next/static/chunks/6990-32b6e2d3822301fa.js +1 -0
  120. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  121. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
  122. sky/dashboard/out/_next/static/chunks/7615-3301e838e5f25772.js +1 -0
  123. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  124. sky/dashboard/out/_next/static/chunks/8969-1e4613c651bf4051.js +1 -0
  125. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  126. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  127. sky/dashboard/out/_next/static/chunks/9360.7310982cf5a0dc79.js +31 -0
  128. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  129. sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
  130. sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
  131. sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
  132. sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
  133. sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
  134. sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
  135. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-c736ead69c2d86ec.js +16 -0
  136. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a37d2063af475a1c.js +1 -0
  137. sky/dashboard/out/_next/static/chunks/pages/clusters-d44859594e6f8064.js +1 -0
  138. sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
  139. sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
  140. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
  141. sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
  142. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-5796e8d6aea291a0.js +16 -0
  143. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-6edeb7d06032adfc.js +21 -0
  144. sky/dashboard/out/_next/static/chunks/pages/jobs-479dde13399cf270.js +1 -0
  145. sky/dashboard/out/_next/static/chunks/pages/users-5ab3b907622cf0fe.js +1 -0
  146. sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
  147. sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
  148. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-c5a3eeee1c218af1.js +1 -0
  149. sky/dashboard/out/_next/static/chunks/pages/workspaces-22b23febb3e89ce1.js +1 -0
  150. sky/dashboard/out/_next/static/chunks/webpack-2679be77fc08a2f8.js +1 -0
  151. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  152. sky/dashboard/out/_next/static/zB0ed6ge_W1MDszVHhijS/_buildManifest.js +1 -0
  153. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  154. sky/dashboard/out/clusters/[cluster].html +1 -1
  155. sky/dashboard/out/clusters.html +1 -1
  156. sky/dashboard/out/config.html +1 -0
  157. sky/dashboard/out/index.html +1 -1
  158. sky/dashboard/out/infra/[context].html +1 -0
  159. sky/dashboard/out/infra.html +1 -0
  160. sky/dashboard/out/jobs/[job].html +1 -1
  161. sky/dashboard/out/jobs/pools/[pool].html +1 -0
  162. sky/dashboard/out/jobs.html +1 -1
  163. sky/dashboard/out/users.html +1 -0
  164. sky/dashboard/out/volumes.html +1 -0
  165. sky/dashboard/out/workspace/new.html +1 -0
  166. sky/dashboard/out/workspaces/[name].html +1 -0
  167. sky/dashboard/out/workspaces.html +1 -0
  168. sky/data/data_utils.py +137 -1
  169. sky/data/mounting_utils.py +269 -84
  170. sky/data/storage.py +1451 -1807
  171. sky/data/storage_utils.py +43 -57
  172. sky/exceptions.py +132 -2
  173. sky/execution.py +206 -63
  174. sky/global_user_state.py +2374 -586
  175. sky/jobs/__init__.py +5 -0
  176. sky/jobs/client/sdk.py +242 -65
  177. sky/jobs/client/sdk_async.py +143 -0
  178. sky/jobs/constants.py +9 -8
  179. sky/jobs/controller.py +839 -277
  180. sky/jobs/file_content_utils.py +80 -0
  181. sky/jobs/log_gc.py +201 -0
  182. sky/jobs/recovery_strategy.py +398 -152
  183. sky/jobs/scheduler.py +315 -189
  184. sky/jobs/server/core.py +829 -255
  185. sky/jobs/server/server.py +156 -115
  186. sky/jobs/server/utils.py +136 -0
  187. sky/jobs/state.py +2092 -701
  188. sky/jobs/utils.py +1242 -160
  189. sky/logs/__init__.py +21 -0
  190. sky/logs/agent.py +108 -0
  191. sky/logs/aws.py +243 -0
  192. sky/logs/gcp.py +91 -0
  193. sky/metrics/__init__.py +0 -0
  194. sky/metrics/utils.py +443 -0
  195. sky/models.py +78 -1
  196. sky/optimizer.py +164 -70
  197. sky/provision/__init__.py +90 -4
  198. sky/provision/aws/config.py +147 -26
  199. sky/provision/aws/instance.py +135 -50
  200. sky/provision/azure/instance.py +10 -5
  201. sky/provision/common.py +13 -1
  202. sky/provision/cudo/cudo_machine_type.py +1 -1
  203. sky/provision/cudo/cudo_utils.py +14 -8
  204. sky/provision/cudo/cudo_wrapper.py +72 -71
  205. sky/provision/cudo/instance.py +10 -6
  206. sky/provision/do/instance.py +10 -6
  207. sky/provision/do/utils.py +4 -3
  208. sky/provision/docker_utils.py +114 -23
  209. sky/provision/fluidstack/instance.py +13 -8
  210. sky/provision/gcp/__init__.py +1 -0
  211. sky/provision/gcp/config.py +301 -19
  212. sky/provision/gcp/constants.py +218 -0
  213. sky/provision/gcp/instance.py +36 -8
  214. sky/provision/gcp/instance_utils.py +18 -4
  215. sky/provision/gcp/volume_utils.py +247 -0
  216. sky/provision/hyperbolic/__init__.py +12 -0
  217. sky/provision/hyperbolic/config.py +10 -0
  218. sky/provision/hyperbolic/instance.py +437 -0
  219. sky/provision/hyperbolic/utils.py +373 -0
  220. sky/provision/instance_setup.py +93 -14
  221. sky/provision/kubernetes/__init__.py +5 -0
  222. sky/provision/kubernetes/config.py +9 -52
  223. sky/provision/kubernetes/constants.py +17 -0
  224. sky/provision/kubernetes/instance.py +789 -247
  225. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  226. sky/provision/kubernetes/network.py +27 -17
  227. sky/provision/kubernetes/network_utils.py +40 -43
  228. sky/provision/kubernetes/utils.py +1192 -531
  229. sky/provision/kubernetes/volume.py +282 -0
  230. sky/provision/lambda_cloud/instance.py +22 -16
  231. sky/provision/nebius/constants.py +50 -0
  232. sky/provision/nebius/instance.py +19 -6
  233. sky/provision/nebius/utils.py +196 -91
  234. sky/provision/oci/instance.py +10 -5
  235. sky/provision/paperspace/instance.py +10 -7
  236. sky/provision/paperspace/utils.py +1 -1
  237. sky/provision/primeintellect/__init__.py +10 -0
  238. sky/provision/primeintellect/config.py +11 -0
  239. sky/provision/primeintellect/instance.py +454 -0
  240. sky/provision/primeintellect/utils.py +398 -0
  241. sky/provision/provisioner.py +110 -36
  242. sky/provision/runpod/__init__.py +5 -0
  243. sky/provision/runpod/instance.py +27 -6
  244. sky/provision/runpod/utils.py +51 -18
  245. sky/provision/runpod/volume.py +180 -0
  246. sky/provision/scp/__init__.py +15 -0
  247. sky/provision/scp/config.py +93 -0
  248. sky/provision/scp/instance.py +531 -0
  249. sky/provision/seeweb/__init__.py +11 -0
  250. sky/provision/seeweb/config.py +13 -0
  251. sky/provision/seeweb/instance.py +807 -0
  252. sky/provision/shadeform/__init__.py +11 -0
  253. sky/provision/shadeform/config.py +12 -0
  254. sky/provision/shadeform/instance.py +351 -0
  255. sky/provision/shadeform/shadeform_utils.py +83 -0
  256. sky/provision/ssh/__init__.py +18 -0
  257. sky/provision/vast/instance.py +13 -8
  258. sky/provision/vast/utils.py +10 -7
  259. sky/provision/vsphere/common/vim_utils.py +1 -2
  260. sky/provision/vsphere/instance.py +15 -10
  261. sky/provision/vsphere/vsphere_utils.py +9 -19
  262. sky/py.typed +0 -0
  263. sky/resources.py +844 -118
  264. sky/schemas/__init__.py +0 -0
  265. sky/schemas/api/__init__.py +0 -0
  266. sky/schemas/api/responses.py +225 -0
  267. sky/schemas/db/README +4 -0
  268. sky/schemas/db/env.py +90 -0
  269. sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
  270. sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
  271. sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
  272. sky/schemas/db/global_user_state/004_is_managed.py +34 -0
  273. sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
  274. sky/schemas/db/global_user_state/006_provision_log.py +41 -0
  275. sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
  276. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  277. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  278. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  279. sky/schemas/db/script.py.mako +28 -0
  280. sky/schemas/db/serve_state/001_initial_schema.py +67 -0
  281. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  282. sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
  283. sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
  284. sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
  285. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  286. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  287. sky/schemas/generated/__init__.py +0 -0
  288. sky/schemas/generated/autostopv1_pb2.py +36 -0
  289. sky/schemas/generated/autostopv1_pb2.pyi +43 -0
  290. sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
  291. sky/schemas/generated/jobsv1_pb2.py +86 -0
  292. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  293. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  294. sky/schemas/generated/managed_jobsv1_pb2.py +74 -0
  295. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  296. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  297. sky/schemas/generated/servev1_pb2.py +58 -0
  298. sky/schemas/generated/servev1_pb2.pyi +115 -0
  299. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  300. sky/serve/autoscalers.py +357 -5
  301. sky/serve/client/impl.py +310 -0
  302. sky/serve/client/sdk.py +47 -139
  303. sky/serve/client/sdk_async.py +130 -0
  304. sky/serve/constants.py +10 -8
  305. sky/serve/controller.py +64 -19
  306. sky/serve/load_balancer.py +106 -60
  307. sky/serve/load_balancing_policies.py +115 -1
  308. sky/serve/replica_managers.py +273 -162
  309. sky/serve/serve_rpc_utils.py +179 -0
  310. sky/serve/serve_state.py +554 -251
  311. sky/serve/serve_utils.py +733 -220
  312. sky/serve/server/core.py +66 -711
  313. sky/serve/server/impl.py +1093 -0
  314. sky/serve/server/server.py +21 -18
  315. sky/serve/service.py +133 -48
  316. sky/serve/service_spec.py +135 -16
  317. sky/serve/spot_placer.py +3 -0
  318. sky/server/auth/__init__.py +0 -0
  319. sky/server/auth/authn.py +50 -0
  320. sky/server/auth/loopback.py +38 -0
  321. sky/server/auth/oauth2_proxy.py +200 -0
  322. sky/server/common.py +475 -181
  323. sky/server/config.py +81 -23
  324. sky/server/constants.py +44 -6
  325. sky/server/daemons.py +229 -0
  326. sky/server/html/token_page.html +185 -0
  327. sky/server/metrics.py +160 -0
  328. sky/server/requests/executor.py +528 -138
  329. sky/server/requests/payloads.py +351 -17
  330. sky/server/requests/preconditions.py +21 -17
  331. sky/server/requests/process.py +112 -29
  332. sky/server/requests/request_names.py +120 -0
  333. sky/server/requests/requests.py +817 -224
  334. sky/server/requests/serializers/decoders.py +82 -31
  335. sky/server/requests/serializers/encoders.py +140 -22
  336. sky/server/requests/threads.py +106 -0
  337. sky/server/rest.py +417 -0
  338. sky/server/server.py +1290 -284
  339. sky/server/state.py +20 -0
  340. sky/server/stream_utils.py +345 -57
  341. sky/server/uvicorn.py +217 -3
  342. sky/server/versions.py +270 -0
  343. sky/setup_files/MANIFEST.in +5 -0
  344. sky/setup_files/alembic.ini +156 -0
  345. sky/setup_files/dependencies.py +136 -31
  346. sky/setup_files/setup.py +44 -42
  347. sky/sky_logging.py +102 -5
  348. sky/skylet/attempt_skylet.py +1 -0
  349. sky/skylet/autostop_lib.py +129 -8
  350. sky/skylet/configs.py +27 -20
  351. sky/skylet/constants.py +171 -19
  352. sky/skylet/events.py +105 -21
  353. sky/skylet/job_lib.py +335 -104
  354. sky/skylet/log_lib.py +297 -18
  355. sky/skylet/log_lib.pyi +44 -1
  356. sky/skylet/ray_patches/__init__.py +17 -3
  357. sky/skylet/ray_patches/autoscaler.py.diff +18 -0
  358. sky/skylet/ray_patches/cli.py.diff +19 -0
  359. sky/skylet/ray_patches/command_runner.py.diff +17 -0
  360. sky/skylet/ray_patches/log_monitor.py.diff +20 -0
  361. sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
  362. sky/skylet/ray_patches/updater.py.diff +18 -0
  363. sky/skylet/ray_patches/worker.py.diff +41 -0
  364. sky/skylet/services.py +564 -0
  365. sky/skylet/skylet.py +63 -4
  366. sky/skylet/subprocess_daemon.py +103 -29
  367. sky/skypilot_config.py +506 -99
  368. sky/ssh_node_pools/__init__.py +1 -0
  369. sky/ssh_node_pools/core.py +135 -0
  370. sky/ssh_node_pools/server.py +233 -0
  371. sky/task.py +621 -137
  372. sky/templates/aws-ray.yml.j2 +10 -3
  373. sky/templates/azure-ray.yml.j2 +1 -1
  374. sky/templates/do-ray.yml.j2 +1 -1
  375. sky/templates/gcp-ray.yml.j2 +57 -0
  376. sky/templates/hyperbolic-ray.yml.j2 +67 -0
  377. sky/templates/jobs-controller.yaml.j2 +27 -24
  378. sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
  379. sky/templates/kubernetes-ray.yml.j2 +607 -51
  380. sky/templates/lambda-ray.yml.j2 +1 -1
  381. sky/templates/nebius-ray.yml.j2 +33 -12
  382. sky/templates/paperspace-ray.yml.j2 +1 -1
  383. sky/templates/primeintellect-ray.yml.j2 +71 -0
  384. sky/templates/runpod-ray.yml.j2 +9 -1
  385. sky/templates/scp-ray.yml.j2 +3 -50
  386. sky/templates/seeweb-ray.yml.j2 +108 -0
  387. sky/templates/shadeform-ray.yml.j2 +72 -0
  388. sky/templates/sky-serve-controller.yaml.j2 +22 -2
  389. sky/templates/websocket_proxy.py +178 -18
  390. sky/usage/usage_lib.py +18 -11
  391. sky/users/__init__.py +0 -0
  392. sky/users/model.conf +15 -0
  393. sky/users/permission.py +387 -0
  394. sky/users/rbac.py +121 -0
  395. sky/users/server.py +720 -0
  396. sky/users/token_service.py +218 -0
  397. sky/utils/accelerator_registry.py +34 -5
  398. sky/utils/admin_policy_utils.py +84 -38
  399. sky/utils/annotations.py +16 -5
  400. sky/utils/asyncio_utils.py +78 -0
  401. sky/utils/auth_utils.py +153 -0
  402. sky/utils/benchmark_utils.py +60 -0
  403. sky/utils/cli_utils/status_utils.py +159 -86
  404. sky/utils/cluster_utils.py +31 -9
  405. sky/utils/command_runner.py +354 -68
  406. sky/utils/command_runner.pyi +93 -3
  407. sky/utils/common.py +35 -8
  408. sky/utils/common_utils.py +310 -87
  409. sky/utils/config_utils.py +87 -5
  410. sky/utils/context.py +402 -0
  411. sky/utils/context_utils.py +222 -0
  412. sky/utils/controller_utils.py +264 -89
  413. sky/utils/dag_utils.py +31 -12
  414. sky/utils/db/__init__.py +0 -0
  415. sky/utils/db/db_utils.py +470 -0
  416. sky/utils/db/migration_utils.py +133 -0
  417. sky/utils/directory_utils.py +12 -0
  418. sky/utils/env_options.py +13 -0
  419. sky/utils/git.py +567 -0
  420. sky/utils/git_clone.sh +460 -0
  421. sky/utils/infra_utils.py +195 -0
  422. sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
  423. sky/utils/kubernetes/config_map_utils.py +133 -0
  424. sky/utils/kubernetes/create_cluster.sh +13 -27
  425. sky/utils/kubernetes/delete_cluster.sh +10 -7
  426. sky/utils/kubernetes/deploy_remote_cluster.py +1299 -0
  427. sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
  428. sky/utils/kubernetes/generate_kind_config.py +6 -66
  429. sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
  430. sky/utils/kubernetes/gpu_labeler.py +5 -5
  431. sky/utils/kubernetes/kubernetes_deploy_utils.py +354 -47
  432. sky/utils/kubernetes/ssh-tunnel.sh +379 -0
  433. sky/utils/kubernetes/ssh_utils.py +221 -0
  434. sky/utils/kubernetes_enums.py +8 -15
  435. sky/utils/lock_events.py +94 -0
  436. sky/utils/locks.py +368 -0
  437. sky/utils/log_utils.py +300 -6
  438. sky/utils/perf_utils.py +22 -0
  439. sky/utils/resource_checker.py +298 -0
  440. sky/utils/resources_utils.py +249 -32
  441. sky/utils/rich_utils.py +213 -37
  442. sky/utils/schemas.py +905 -147
  443. sky/utils/serialize_utils.py +16 -0
  444. sky/utils/status_lib.py +10 -0
  445. sky/utils/subprocess_utils.py +38 -15
  446. sky/utils/tempstore.py +70 -0
  447. sky/utils/timeline.py +24 -52
  448. sky/utils/ux_utils.py +84 -15
  449. sky/utils/validator.py +11 -1
  450. sky/utils/volume.py +86 -0
  451. sky/utils/yaml_utils.py +111 -0
  452. sky/volumes/__init__.py +13 -0
  453. sky/volumes/client/__init__.py +0 -0
  454. sky/volumes/client/sdk.py +149 -0
  455. sky/volumes/server/__init__.py +0 -0
  456. sky/volumes/server/core.py +258 -0
  457. sky/volumes/server/server.py +122 -0
  458. sky/volumes/volume.py +212 -0
  459. sky/workspaces/__init__.py +0 -0
  460. sky/workspaces/core.py +655 -0
  461. sky/workspaces/server.py +101 -0
  462. sky/workspaces/utils.py +56 -0
  463. skypilot_nightly-1.0.0.dev20251107.dist-info/METADATA +675 -0
  464. skypilot_nightly-1.0.0.dev20251107.dist-info/RECORD +594 -0
  465. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/WHEEL +1 -1
  466. sky/benchmark/benchmark_state.py +0 -256
  467. sky/benchmark/benchmark_utils.py +0 -641
  468. sky/clouds/service_catalog/constants.py +0 -7
  469. sky/dashboard/out/_next/static/LksQgChY5izXjokL3LcEu/_buildManifest.js +0 -1
  470. sky/dashboard/out/_next/static/chunks/236-f49500b82ad5392d.js +0 -6
  471. sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
  472. sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
  473. sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
  474. sky/dashboard/out/_next/static/chunks/845-0f8017370869e269.js +0 -1
  475. sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
  476. sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
  477. sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
  478. sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
  479. sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
  480. sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
  481. sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
  482. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-e15db85d0ea1fbe1.js +0 -1
  483. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
  484. sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
  485. sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
  486. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-03f279c6741fb48b.js +0 -1
  487. sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
  488. sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
  489. sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
  490. sky/jobs/dashboard/dashboard.py +0 -223
  491. sky/jobs/dashboard/static/favicon.ico +0 -0
  492. sky/jobs/dashboard/templates/index.html +0 -831
  493. sky/jobs/server/dashboard_utils.py +0 -69
  494. sky/skylet/providers/scp/__init__.py +0 -2
  495. sky/skylet/providers/scp/config.py +0 -149
  496. sky/skylet/providers/scp/node_provider.py +0 -578
  497. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  498. sky/utils/db_utils.py +0 -100
  499. sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
  500. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  501. skypilot_nightly-1.0.0.dev20250509.dist-info/METADATA +0 -361
  502. skypilot_nightly-1.0.0.dev20250509.dist-info/RECORD +0 -396
  503. /sky/{clouds/service_catalog → catalog}/config.py +0 -0
  504. /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
  505. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
  506. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
  507. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
  508. /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
  509. /sky/dashboard/out/_next/static/{LksQgChY5izXjokL3LcEu → zB0ed6ge_W1MDszVHhijS}/_ssgManifest.js +0 -0
  510. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/entry_points.txt +0 -0
  511. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/licenses/LICENSE +0 -0
  512. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/top_level.txt +0 -0
@@ -5,23 +5,31 @@ In the YAML file, the user can specify the strategy to use for managed jobs.
5
5
  resources:
6
6
  job_recovery: EAGER_NEXT_REGION
7
7
  """
8
- import time
8
+ import asyncio
9
+ import logging
10
+ import os
9
11
  import traceback
10
12
  import typing
11
- from typing import Optional
13
+ from typing import Optional, Set
12
14
 
13
- import sky
14
15
  from sky import backends
16
+ from sky import dag as dag_lib
15
17
  from sky import exceptions
16
- from sky import execution
17
18
  from sky import global_user_state
18
19
  from sky import sky_logging
20
+ from sky import skypilot_config
19
21
  from sky.backends import backend_utils
22
+ from sky.client import sdk
20
23
  from sky.jobs import scheduler
24
+ from sky.jobs import state
21
25
  from sky.jobs import utils as managed_job_utils
26
+ from sky.serve import serve_utils
27
+ from sky.skylet import constants
22
28
  from sky.skylet import job_lib
23
29
  from sky.usage import usage_lib
24
30
  from sky.utils import common_utils
31
+ from sky.utils import context_utils
32
+ from sky.utils import env_options
25
33
  from sky.utils import registry
26
34
  from sky.utils import status_lib
27
35
  from sky.utils import ux_utils
@@ -39,7 +47,14 @@ MAX_JOB_CHECKING_RETRY = 10
39
47
  # Minutes to job cluster autodown. This should be significantly larger than
40
48
  # managed_job_utils.JOB_STATUS_CHECK_GAP_SECONDS, to avoid tearing down the
41
49
  # cluster before its status can be updated by the job controller.
42
- _AUTODOWN_MINUTES = 5
50
+ _AUTODOWN_MINUTES = 10
51
+
52
+ ENV_VARS_TO_CLEAR = [
53
+ skypilot_config.ENV_VAR_SKYPILOT_CONFIG,
54
+ constants.USER_ID_ENV_VAR,
55
+ constants.USER_ENV_VAR,
56
+ env_options.Options.SHOW_DEBUG_INFO.env_key,
57
+ ]
43
58
 
44
59
 
45
60
  class StrategyExecutor:
@@ -47,29 +62,65 @@ class StrategyExecutor:
47
62
 
48
63
  RETRY_INIT_GAP_SECONDS = 60
49
64
 
50
- def __init__(self, cluster_name: str, backend: 'backends.Backend',
51
- task: 'task_lib.Task', max_restarts_on_errors: int,
52
- job_id: int) -> None:
65
+ def __init__(
66
+ self,
67
+ cluster_name: Optional[str],
68
+ backend: 'backends.Backend',
69
+ task: 'task_lib.Task',
70
+ max_restarts_on_errors: int,
71
+ job_id: int,
72
+ task_id: int,
73
+ pool: Optional[str],
74
+ starting: Set[int],
75
+ starting_lock: asyncio.Lock,
76
+ starting_signal: asyncio.Condition,
77
+ ) -> None:
53
78
  """Initialize the strategy executor.
54
79
 
55
80
  Args:
56
81
  cluster_name: The name of the cluster.
57
82
  backend: The backend to use. Only CloudVMRayBackend is supported.
58
83
  task: The task to execute.
84
+ max_restarts_on_errors: Maximum number of restarts on errors.
85
+ job_id: The ID of the job.
86
+ task_id: The ID of the task.
87
+ starting: Set of job IDs that are currently starting.
88
+ starting_lock: Lock to synchronize starting jobs.
89
+ starting_signal: Condition to signal when a job can start.
59
90
  """
60
91
  assert isinstance(backend, backends.CloudVmRayBackend), (
61
92
  'Only CloudVMRayBackend is supported.')
62
- self.dag = sky.Dag()
93
+ self.dag = dag_lib.Dag()
63
94
  self.dag.add(task)
95
+ # For jobs submitted to a pool, the cluster name might change after each
96
+ # recovery. Initially this is set to an empty string to indicate that no
97
+ # cluster is assigned yet, and in `_launch`, it will be set to one of
98
+ # the cluster names in the pool.
64
99
  self.cluster_name = cluster_name
65
100
  self.backend = backend
66
101
  self.max_restarts_on_errors = max_restarts_on_errors
67
102
  self.job_id = job_id
103
+ self.task_id = task_id
104
+ self.pool = pool
68
105
  self.restart_cnt_on_failure = 0
106
+ self.job_id_on_pool_cluster: Optional[int] = None
107
+ self.starting = starting
108
+ self.starting_lock = starting_lock
109
+ self.starting_signal = starting_signal
69
110
 
70
111
  @classmethod
71
- def make(cls, cluster_name: str, backend: 'backends.Backend',
72
- task: 'task_lib.Task', job_id: int) -> 'StrategyExecutor':
112
+ def make(
113
+ cls,
114
+ cluster_name: Optional[str],
115
+ backend: 'backends.Backend',
116
+ task: 'task_lib.Task',
117
+ job_id: int,
118
+ task_id: int,
119
+ pool: Optional[str],
120
+ starting: Set[int],
121
+ starting_lock: asyncio.Lock,
122
+ starting_signal: asyncio.Condition,
123
+ ) -> 'StrategyExecutor':
73
124
  """Create a strategy from a task."""
74
125
 
75
126
  resource_list = list(task.resources)
@@ -86,8 +137,11 @@ class StrategyExecutor:
86
137
  # original task.resources
87
138
  task.set_resources(type(task.resources)(new_resources_list))
88
139
  if isinstance(job_recovery, dict):
89
- job_recovery_name = job_recovery.pop(
140
+ name = job_recovery.pop(
90
141
  'strategy', registry.JOBS_RECOVERY_STRATEGY_REGISTRY.default)
142
+ assert name is None or isinstance(name, str), (
143
+ name, 'The job recovery strategy name must be a string or None')
144
+ job_recovery_name: Optional[str] = name
91
145
  max_restarts_on_errors = job_recovery.pop('max_restarts_on_errors',
92
146
  0)
93
147
  else:
@@ -97,9 +151,11 @@ class StrategyExecutor:
97
151
  from_str(job_recovery_name))
98
152
  assert job_recovery_strategy is not None, job_recovery_name
99
153
  return job_recovery_strategy(cluster_name, backend, task,
100
- max_restarts_on_errors, job_id)
154
+ max_restarts_on_errors, job_id, task_id,
155
+ pool, starting, starting_lock,
156
+ starting_signal)
101
157
 
102
- def launch(self) -> float:
158
+ async def launch(self) -> float:
103
159
  """Launch the cluster for the first time.
104
160
 
105
161
  It can fail if resource is not available. Need to check the cluster
@@ -111,11 +167,11 @@ class StrategyExecutor:
111
167
  Raises: Please refer to the docstring of self._launch().
112
168
  """
113
169
 
114
- job_submit_at = self._launch(max_retry=None)
170
+ job_submit_at = await self._launch(max_retry=None)
115
171
  assert job_submit_at is not None
116
172
  return job_submit_at
117
173
 
118
- def recover(self) -> float:
174
+ async def recover(self) -> float:
119
175
  """Relaunch the cluster after failure and wait until job starts.
120
176
 
121
177
  When recover() is called the cluster should be in STOPPED status (i.e.
@@ -125,12 +181,12 @@ class StrategyExecutor:
125
181
  """
126
182
  raise NotImplementedError
127
183
 
128
- def _try_cancel_all_jobs(self):
129
- from sky import core # pylint: disable=import-outside-toplevel
130
-
131
- handle = global_user_state.get_handle_from_cluster_name(
132
- self.cluster_name)
133
- if handle is None:
184
+ async def _try_cancel_jobs(self):
185
+ if self.cluster_name is None:
186
+ return
187
+ handle = await context_utils.to_thread(
188
+ global_user_state.get_handle_from_cluster_name, self.cluster_name)
189
+ if handle is None or self.pool is not None:
134
190
  return
135
191
  try:
136
192
  usage_lib.messages.usage.set_internal()
@@ -153,9 +209,22 @@ class StrategyExecutor:
153
209
  # should be functional with the `_try_cancel_if_cluster_is_init`
154
210
  # flag, i.e. it sends the cancel signal to the head node, which will
155
211
  # then kill the user process on remaining worker nodes.
156
- core.cancel(cluster_name=self.cluster_name,
157
- all=True,
158
- _try_cancel_if_cluster_is_init=True)
212
+ # Only cancel the corresponding job for worker pool.
213
+ if self.pool is None:
214
+ kwargs = dict(all=True)
215
+ else:
216
+ kwargs = dict(job_ids=[self.job_id_on_pool_cluster])
217
+ request_id = await context_utils.to_thread(
218
+ sdk.cancel,
219
+ cluster_name=self.cluster_name,
220
+ **kwargs,
221
+ _try_cancel_if_cluster_is_init=True,
222
+ )
223
+ logger.debug(f'sdk.cancel request ID: {request_id}')
224
+ await context_utils.to_thread(
225
+ sdk.get,
226
+ request_id,
227
+ )
159
228
  except Exception as e: # pylint: disable=broad-except
160
229
  logger.info('Failed to cancel the job on the cluster. The cluster '
161
230
  'might be already down or the head node is preempted.'
@@ -163,25 +232,26 @@ class StrategyExecutor:
163
232
  f'{common_utils.format_exception(e)}\n'
164
233
  'Terminating the cluster explicitly to ensure no '
165
234
  'remaining job process interferes with recovery.')
166
- managed_job_utils.terminate_cluster(self.cluster_name)
235
+ await context_utils.to_thread(self._cleanup_cluster)
167
236
 
168
- def _wait_until_job_starts_on_cluster(self) -> Optional[float]:
237
+ async def _wait_until_job_starts_on_cluster(self) -> Optional[float]:
169
238
  """Wait for MAX_JOB_CHECKING_RETRY times until job starts on the cluster
170
239
 
171
240
  Returns:
172
241
  The timestamp of when the job is submitted, or None if failed to
173
242
  submit.
174
243
  """
244
+ assert self.cluster_name is not None
175
245
  status = None
176
246
  job_checking_retry_cnt = 0
177
247
  while job_checking_retry_cnt < MAX_JOB_CHECKING_RETRY:
178
248
  # Avoid the infinite loop, if any bug happens.
179
249
  job_checking_retry_cnt += 1
180
250
  try:
181
- cluster_status, _ = (
182
- backend_utils.refresh_cluster_status_handle(
183
- self.cluster_name,
184
- force_refresh_statuses=set(status_lib.ClusterStatus)))
251
+ cluster_status, _ = (await context_utils.to_thread(
252
+ backend_utils.refresh_cluster_status_handle,
253
+ self.cluster_name,
254
+ force_refresh_statuses=set(status_lib.ClusterStatus)))
185
255
  except Exception as e: # pylint: disable=broad-except
186
256
  # If any unexpected error happens, retry the job checking
187
257
  # loop.
@@ -201,8 +271,10 @@ class StrategyExecutor:
201
271
  break
202
272
 
203
273
  try:
204
- status = managed_job_utils.get_job_status(
205
- self.backend, self.cluster_name)
274
+ status = await managed_job_utils.get_job_status(
275
+ self.backend,
276
+ self.cluster_name,
277
+ job_id=self.job_id_on_pool_cluster)
206
278
  except Exception as e: # pylint: disable=broad-except
207
279
  # If any unexpected error happens, retry the job checking
208
280
  # loop.
@@ -217,8 +289,12 @@ class StrategyExecutor:
217
289
  # Check the job status until it is not in initialized status
218
290
  if status is not None and status > job_lib.JobStatus.INIT:
219
291
  try:
220
- job_submitted_at = managed_job_utils.get_job_timestamp(
221
- self.backend, self.cluster_name, get_end_time=False)
292
+ job_submitted_at = await context_utils.to_thread(
293
+ managed_job_utils.get_job_timestamp,
294
+ self.backend,
295
+ self.cluster_name,
296
+ self.job_id_on_pool_cluster,
297
+ get_end_time=False)
222
298
  return job_submitted_at
223
299
  except Exception as e: # pylint: disable=broad-except
224
300
  # If we failed to get the job timestamp, we will retry
@@ -227,12 +303,20 @@ class StrategyExecutor:
227
303
  'the job start timestamp. Retrying.')
228
304
  continue
229
305
  # Wait for the job to be started
230
- time.sleep(managed_job_utils.JOB_STARTED_STATUS_CHECK_GAP_SECONDS)
306
+ await asyncio.sleep(
307
+ managed_job_utils.JOB_STARTED_STATUS_CHECK_GAP_SECONDS)
231
308
  return None
232
309
 
233
- def _launch(self,
234
- max_retry: Optional[int] = 3,
235
- raise_on_failure: bool = True) -> Optional[float]:
310
+ def _cleanup_cluster(self) -> None:
311
+ if self.cluster_name is None:
312
+ return
313
+ if self.pool is None:
314
+ managed_job_utils.terminate_cluster(self.cluster_name)
315
+
316
+ async def _launch(self,
317
+ max_retry: Optional[int] = 3,
318
+ raise_on_failure: bool = True,
319
+ recovery: bool = False) -> Optional[float]:
236
320
  """Implementation of launch().
237
321
 
238
322
  The function will wait until the job starts running, but will leave the
@@ -272,98 +356,234 @@ class StrategyExecutor:
272
356
  backoff = common_utils.Backoff(self.RETRY_INIT_GAP_SECONDS)
273
357
  while True:
274
358
  retry_cnt += 1
275
- with scheduler.scheduled_launch(self.job_id):
276
- try:
277
- usage_lib.messages.usage.set_internal()
278
- # Detach setup, so that the setup failure can be detected
279
- # by the controller process (job_status -> FAILED_SETUP).
280
- execution.launch(
281
- self.dag,
282
- cluster_name=self.cluster_name,
283
- # We expect to tear down the cluster as soon as the job
284
- # is finished. However, in case the controller dies, set
285
- # autodown to try and avoid a resource leak.
286
- idle_minutes_to_autostop=_AUTODOWN_MINUTES,
287
- down=True,
288
- _is_launched_by_jobs_controller=True)
289
- logger.info('Managed job cluster launched.')
290
- except (exceptions.InvalidClusterNameError,
291
- exceptions.NoCloudAccessError,
292
- exceptions.ResourcesMismatchError) as e:
293
- logger.error('Failure happened before provisioning. '
294
- f'{common_utils.format_exception(e)}')
295
- if raise_on_failure:
296
- raise exceptions.ProvisionPrechecksError(reasons=[e])
297
- return None
298
- except exceptions.ResourcesUnavailableError as e:
299
- # This is raised when the launch fails due to prechecks or
300
- # after failing over through all the candidates.
301
- # Please refer to the docstring of `sky.launch` for more
302
- # details of how the exception will be structured.
303
- if not any(
304
- isinstance(err,
305
- exceptions.ResourcesUnavailableError)
306
- for err in e.failover_history):
307
- # _launch() (this function) should fail/exit directly,
308
- # if none of the failover reasons were because of
309
- # resource unavailability or no failover was attempted
310
- # (the optimizer cannot find feasible resources for
311
- # requested resources), i.e., e.failover_history is
312
- # empty. Failing directly avoids the infinite loop of
313
- # retrying the launch when, e.g., an invalid cluster
314
- # name is used and --retry-until-up is specified.
315
- reasons = (e.failover_history
316
- if e.failover_history else [e])
317
- reasons_str = '; '.join(
318
- common_utils.format_exception(err)
319
- for err in reasons)
320
- logger.error(
321
- 'Failure happened before provisioning. Failover '
322
- f'reasons: {reasons_str}')
359
+ try:
360
+ async with scheduler.scheduled_launch(
361
+ self.job_id,
362
+ self.starting,
363
+ self.starting_lock,
364
+ self.starting_signal,
365
+ ):
366
+ # The job state may have been PENDING during backoff -
367
+ # update to STARTING or RECOVERING.
368
+ # On the first attempt (when retry_cnt is 1), we should
369
+ # already be in STARTING or RECOVERING.
370
+ if retry_cnt > 1:
371
+ await state.set_restarting_async(
372
+ self.job_id, self.task_id, recovery)
373
+ try:
374
+ usage_lib.messages.usage.set_internal()
375
+ if self.pool is None:
376
+ assert self.cluster_name is not None
377
+
378
+ # sdk.launch will implicitly start the API server,
379
+ # but then the API server will inherit the current
380
+ # env vars/user, which we may not want.
381
+ # Instead, clear env vars here and call api_start
382
+ # explicitly.
383
+ vars_to_restore = {}
384
+ try:
385
+ for env_var in ENV_VARS_TO_CLEAR:
386
+ vars_to_restore[env_var] = os.environ.pop(
387
+ env_var, None)
388
+ logger.debug('Cleared env var: '
389
+ f'{env_var}')
390
+ logger.debug('Env vars for api_start: '
391
+ f'{os.environ}')
392
+ await context_utils.to_thread(sdk.api_start)
393
+ logger.info('API server started.')
394
+ finally:
395
+ for env_var, value in vars_to_restore.items():
396
+ if value is not None:
397
+ logger.debug('Restored env var: '
398
+ f'{env_var}: {value}')
399
+ os.environ[env_var] = value
400
+
401
+ request_id = None
402
+ try:
403
+ request_id = await context_utils.to_thread(
404
+ sdk.launch,
405
+ self.dag,
406
+ cluster_name=self.cluster_name,
407
+ # We expect to tear down the cluster as soon
408
+ # as the job is finished. However, in case
409
+ # the controller dies, we may end up with a
410
+ # resource leak.
411
+ # Ideally, we should autodown to be safe,
412
+ # but it's fine to disable it for now, as
413
+ # Nebius doesn't support autodown yet.
414
+ # TODO(kevin): set down=True once Nebius
415
+ # supports autodown.
416
+ # idle_minutes_to_autostop=(
417
+ # _AUTODOWN_MINUTES),
418
+ # down=True,
419
+ _is_launched_by_jobs_controller=True,
420
+ )
421
+ logger.debug('sdk.launch request ID: '
422
+ f'{request_id}')
423
+ await context_utils.to_thread(
424
+ sdk.stream_and_get,
425
+ request_id,
426
+ )
427
+ except asyncio.CancelledError:
428
+ if request_id:
429
+ req = await context_utils.to_thread(
430
+ sdk.api_cancel, request_id)
431
+ logger.debug('sdk.api_cancel request '
432
+ f'ID: {req}')
433
+ try:
434
+ await context_utils.to_thread(
435
+ sdk.get, req)
436
+ except Exception as e: # pylint: disable=broad-except
437
+ # we must still return a CancelledError
438
+ logger.error(
439
+ f'Failed to cancel the job: {e}')
440
+ raise
441
+ logger.info('Managed job cluster launched.')
442
+ else:
443
+ self.cluster_name = await (context_utils.to_thread(
444
+ serve_utils.get_next_cluster_name, self.pool,
445
+ self.job_id))
446
+ if self.cluster_name is None:
447
+ raise exceptions.NoClusterLaunchedError(
448
+ 'No cluster name found in the pool.')
449
+ request_id = None
450
+ try:
451
+ request_id = await context_utils.to_thread(
452
+ sdk.exec,
453
+ self.dag,
454
+ cluster_name=self.cluster_name,
455
+ )
456
+ logger.debug('sdk.exec request ID: '
457
+ f'{request_id}')
458
+ job_id_on_pool_cluster, _ = (
459
+ await context_utils.to_thread(
460
+ sdk.get, request_id))
461
+ except asyncio.CancelledError:
462
+ if request_id:
463
+ req = await context_utils.to_thread(
464
+ sdk.api_cancel, request_id)
465
+ logger.debug('sdk.api_cancel request '
466
+ f'ID: {req}')
467
+ try:
468
+ await context_utils.to_thread(
469
+ sdk.get, req)
470
+ except Exception as e: # pylint: disable=broad-except
471
+ # we must still return a CancelledError
472
+ logger.error(
473
+ f'Failed to cancel the job: {e}')
474
+ raise
475
+ assert job_id_on_pool_cluster is not None, (
476
+ self.cluster_name, self.job_id)
477
+ self.job_id_on_pool_cluster = job_id_on_pool_cluster
478
+ await state.set_job_id_on_pool_cluster_async(
479
+ self.job_id, job_id_on_pool_cluster)
480
+ logger.info('Managed job cluster launched.')
481
+ except (exceptions.InvalidClusterNameError,
482
+ exceptions.NoCloudAccessError,
483
+ exceptions.ResourcesMismatchError,
484
+ exceptions.StorageSpecError,
485
+ exceptions.StorageError) as e:
486
+ logger.error('Failure happened before provisioning. '
487
+ f'{common_utils.format_exception(e)}')
323
488
  if raise_on_failure:
324
- raise exceptions.ProvisionPrechecksError(reasons)
325
- return None
326
- logger.info('Failed to launch a cluster with error: '
327
- f'{common_utils.format_exception(e)})')
328
- except Exception as e: # pylint: disable=broad-except
329
- # If the launch fails, it will be recovered by the following
330
- # code.
331
- logger.info('Failed to launch a cluster with error: '
332
- f'{common_utils.format_exception(e)})')
333
- with ux_utils.enable_traceback():
334
- logger.info(f' Traceback: {traceback.format_exc()}')
335
- else: # No exception, the launch succeeds.
336
- # At this point, a sky.launch() has succeeded. Cluster may
337
- # be UP (no preemption since) or DOWN (newly preempted).
338
- job_submitted_at = self._wait_until_job_starts_on_cluster()
339
- if job_submitted_at is not None:
340
- return job_submitted_at
341
- # The job fails to start on the cluster, retry the launch.
342
- # TODO(zhwu): log the unexpected error to usage collection
343
- # for future debugging.
344
- logger.info(
345
- 'Failed to successfully submit the job to the '
346
- 'launched cluster, due to unexpected submission errors '
347
- 'or the cluster being preempted during job submission.')
348
-
349
- # If we get here, the launch did not succeed. Tear down the
350
- # cluster and retry.
351
- managed_job_utils.terminate_cluster(self.cluster_name)
352
- if max_retry is not None and retry_cnt >= max_retry:
353
- # Retry forever if max_retry is None.
354
- if raise_on_failure:
355
- with ux_utils.print_exception_no_traceback():
356
- raise exceptions.ManagedJobReachedMaxRetriesError(
357
- 'Resources unavailable: failed to launch '
358
- f'clusters after {max_retry} retries.')
359
- else:
489
+ raise exceptions.ProvisionPrechecksError(
490
+ reasons=[e])
360
491
  return None
361
- # Exit the scheduled_launch context so that the scheulde state is
362
- # ALIVE during the backoff. This allows other jobs to launch.
363
- gap_seconds = backoff.current_backoff()
364
- logger.info('Retrying to launch the cluster in '
365
- f'{gap_seconds:.1f} seconds.')
366
- time.sleep(gap_seconds)
492
+ except exceptions.ResourcesUnavailableError as e:
493
+ # This is raised when the launch fails due to prechecks
494
+ # or after failing over through all the candidates.
495
+ # Please refer to the docstring of `sky.launch` for more
496
+ # details of how the exception will be structured.
497
+ if not any(
498
+ isinstance(err,
499
+ exceptions.ResourcesUnavailableError)
500
+ for err in e.failover_history):
501
+ # _launch() (this function) should fail/exit
502
+ # directly, if none of the failover reasons were
503
+ # because of resource unavailability or no failover
504
+ # was attempted (the optimizer cannot find feasible
505
+ # resources for requested resources), i.e.,
506
+ # e.failover_history is empty. Failing directly
507
+ # avoids the infinite loop of retrying the launch
508
+ # when, e.g., an invalid cluster name is used and
509
+ # --retry-until-up is specified.
510
+ reasons = (e.failover_history
511
+ if e.failover_history else [e])
512
+ reasons_str = '; '.join(
513
+ common_utils.format_exception(err)
514
+ for err in reasons)
515
+ logger.error(
516
+ 'Failure happened before provisioning. '
517
+ f'Failover reasons: {reasons_str}')
518
+ if raise_on_failure:
519
+ raise exceptions.ProvisionPrechecksError(
520
+ reasons)
521
+ return None
522
+ logger.info('Failed to launch a cluster with error: '
523
+ f'{common_utils.format_exception(e)})')
524
+ except Exception as e: # pylint: disable=broad-except
525
+ # If the launch fails, it will be recovered by the
526
+ # following code.
527
+ logger.info('Failed to launch a cluster with error: '
528
+ f'{common_utils.format_exception(e)})')
529
+ with ux_utils.enable_traceback():
530
+ logger.info(
531
+ f' Traceback: {traceback.format_exc()}')
532
+ else: # No exception, the launch succeeds.
533
+ # At this point, a sky.launch() has succeeded. Cluster
534
+ # may be UP (no preemption since) or DOWN (newly
535
+ # preempted).
536
+ job_submitted_at = await (
537
+ self._wait_until_job_starts_on_cluster())
538
+ if job_submitted_at is not None:
539
+ return job_submitted_at
540
+ # The job fails to start on the cluster, retry the
541
+ # launch.
542
+ # TODO(zhwu): log the unexpected error to usage
543
+ # collection for future debugging.
544
+ logger.info(
545
+ 'Failed to successfully submit the job to the '
546
+ 'launched cluster, due to unexpected submission '
547
+ 'errors or the cluster being preempted during '
548
+ 'job submission.')
549
+
550
+ # If we get here, the launch did not succeed. Tear down the
551
+ # cluster and retry.
552
+ await context_utils.to_thread(self._cleanup_cluster)
553
+ if max_retry is not None and retry_cnt >= max_retry:
554
+ # Retry forever if max_retry is None.
555
+ if raise_on_failure:
556
+ with ux_utils.print_exception_no_traceback():
557
+ raise (
558
+ exceptions.ManagedJobReachedMaxRetriesError(
559
+ 'Resources unavailable: failed to '
560
+ f'launch clusters after {max_retry} '
561
+ 'retries.'))
562
+ else:
563
+ return None
564
+
565
+ # Raise NoClusterLaunchedError to indicate that the job is
566
+ # in retry backoff. This will trigger special handling in
567
+ # scheduler.schedule_launched().
568
+ # We will exit the scheduled_launch context so that the
569
+ # schedule state is ALIVE_BACKOFF during the backoff. This
570
+ # allows other jobs to launch.
571
+ raise exceptions.NoClusterLaunchedError()
572
+
573
+ except exceptions.NoClusterLaunchedError:
574
+ # Update the status to PENDING during backoff.
575
+ await state.set_backoff_pending_async(self.job_id, self.task_id)
576
+ # Calculate the backoff time and sleep.
577
+ gap_seconds = (backoff.current_backoff()
578
+ if self.pool is None else 1)
579
+ logger.info('Retrying to launch the cluster in '
580
+ f'{gap_seconds:.1f} seconds.')
581
+ await asyncio.sleep(gap_seconds)
582
+ continue
583
+ else:
584
+ # The inner loop should either return or throw
585
+ # NoClusterLaunchedError.
586
+ assert False, 'Unreachable'
367
587
 
368
588
  def should_restart_on_failure(self) -> bool:
369
589
  """Increments counter & checks if job should be restarted on a failure.
@@ -384,24 +604,38 @@ class FailoverStrategyExecutor(StrategyExecutor):
384
604
 
385
605
  _MAX_RETRY_CNT = 240 # Retry for 4 hours.
386
606
 
387
- def __init__(self, cluster_name: str, backend: 'backends.Backend',
388
- task: 'task_lib.Task', max_restarts_on_errors: int,
389
- job_id: int) -> None:
607
+ def __init__(
608
+ self,
609
+ cluster_name: Optional[str],
610
+ backend: 'backends.Backend',
611
+ task: 'task_lib.Task',
612
+ max_restarts_on_errors: int,
613
+ job_id: int,
614
+ task_id: int,
615
+ pool: Optional[str],
616
+ starting: Set[int],
617
+ starting_lock: asyncio.Lock,
618
+ starting_signal: asyncio.Condition,
619
+ ) -> None:
390
620
  super().__init__(cluster_name, backend, task, max_restarts_on_errors,
391
- job_id)
621
+ job_id, task_id, pool, starting, starting_lock,
622
+ starting_signal)
392
623
  # Note down the cloud/region of the launched cluster, so that we can
393
624
  # first retry in the same cloud/region. (Inside recover() we may not
394
625
  # rely on cluster handle, as it can be None if the cluster is
395
626
  # preempted.)
396
627
  self._launched_resources: Optional['resources.Resources'] = None
397
628
 
398
- def _launch(self,
399
- max_retry: Optional[int] = 3,
400
- raise_on_failure: bool = True) -> Optional[float]:
401
- job_submitted_at = super()._launch(max_retry, raise_on_failure)
402
- if job_submitted_at is not None:
629
+ async def _launch(self,
630
+ max_retry: Optional[int] = 3,
631
+ raise_on_failure: bool = True,
632
+ recovery: bool = False) -> Optional[float]:
633
+ job_submitted_at = await super()._launch(max_retry, raise_on_failure,
634
+ recovery)
635
+ if job_submitted_at is not None and self.cluster_name is not None:
403
636
  # Only record the cloud/region if the launch is successful.
404
- handle = global_user_state.get_handle_from_cluster_name(
637
+ handle = await context_utils.to_thread(
638
+ global_user_state.get_handle_from_cluster_name,
405
639
  self.cluster_name)
406
640
  assert isinstance(handle, backends.CloudVmRayResourceHandle), (
407
641
  'Cluster should be launched.', handle)
@@ -411,7 +645,7 @@ class FailoverStrategyExecutor(StrategyExecutor):
411
645
  self._launched_resources = None
412
646
  return job_submitted_at
413
647
 
414
- def recover(self) -> float:
648
+ async def recover(self) -> float:
415
649
  # 1. Cancel the jobs and launch the cluster with the STOPPED status,
416
650
  # so that it will try on the current region first until timeout.
417
651
  # 2. Tear down the cluster, if the step 1 failed to launch the cluster.
@@ -419,7 +653,7 @@ class FailoverStrategyExecutor(StrategyExecutor):
419
653
  # original user specification.
420
654
 
421
655
  # Step 1
422
- self._try_cancel_all_jobs()
656
+ await self._try_cancel_jobs()
423
657
 
424
658
  while True:
425
659
  # Add region constraint to the task, to retry on the same region
@@ -433,7 +667,8 @@ class FailoverStrategyExecutor(StrategyExecutor):
433
667
  cloud=launched_cloud, region=launched_region, zone=None)
434
668
  task.set_resources({new_resources})
435
669
  # Not using self.launch to avoid the retry until up logic.
436
- job_submitted_at = self._launch(raise_on_failure=False)
670
+ job_submitted_at = await self._launch(raise_on_failure=False,
671
+ recovery=True)
437
672
  # Restore the original dag, i.e. reset the region constraint.
438
673
  task.set_resources(original_resources)
439
674
  if job_submitted_at is not None:
@@ -442,20 +677,21 @@ class FailoverStrategyExecutor(StrategyExecutor):
442
677
  # Step 2
443
678
  logger.debug('Terminating unhealthy cluster and reset cloud '
444
679
  'region.')
445
- managed_job_utils.terminate_cluster(self.cluster_name)
680
+ await context_utils.to_thread(self._cleanup_cluster)
446
681
 
447
682
  # Step 3
448
683
  logger.debug('Relaunch the cluster without constraining to prior '
449
684
  'cloud/region.')
450
685
  # Not using self.launch to avoid the retry until up logic.
451
- job_submitted_at = self._launch(max_retry=self._MAX_RETRY_CNT,
452
- raise_on_failure=False)
686
+ job_submitted_at = await self._launch(max_retry=self._MAX_RETRY_CNT,
687
+ raise_on_failure=False,
688
+ recovery=True)
453
689
  if job_submitted_at is None:
454
690
  # Failed to launch the cluster.
455
691
  gap_seconds = self.RETRY_INIT_GAP_SECONDS
456
692
  logger.info('Retrying to recover the cluster in '
457
693
  f'{gap_seconds:.1f} seconds.')
458
- time.sleep(gap_seconds)
694
+ await asyncio.sleep(gap_seconds)
459
695
  continue
460
696
 
461
697
  return job_submitted_at
@@ -487,7 +723,7 @@ class EagerFailoverStrategyExecutor(FailoverStrategyExecutor):
487
723
  -> R1Z1 (success)
488
724
  """
489
725
 
490
- def recover(self) -> float:
726
+ async def recover(self) -> float:
491
727
  # 1. Terminate the current cluster
492
728
  # 2. Launch again by explicitly blocking the previously launched region
493
729
  # (this will failover through the entire search space except the
@@ -500,7 +736,7 @@ class EagerFailoverStrategyExecutor(FailoverStrategyExecutor):
500
736
 
501
737
  # Step 1
502
738
  logger.debug('Terminating unhealthy cluster and reset cloud region.')
503
- managed_job_utils.terminate_cluster(self.cluster_name)
739
+ await context_utils.to_thread(self._cleanup_cluster)
504
740
 
505
741
  # Step 2
506
742
  logger.debug('Relaunch the cluster skipping the previously launched '
@@ -521,7 +757,8 @@ class EagerFailoverStrategyExecutor(FailoverStrategyExecutor):
521
757
  region=launched_region)
522
758
  }
523
759
  # Not using self.launch to avoid the retry until up logic.
524
- job_submitted_at = self._launch(raise_on_failure=False)
760
+ job_submitted_at = await self._launch(raise_on_failure=False,
761
+ recovery=True)
525
762
  task.blocked_resources = None
526
763
  if job_submitted_at is not None:
527
764
  return job_submitted_at
@@ -531,14 +768,23 @@ class EagerFailoverStrategyExecutor(FailoverStrategyExecutor):
531
768
  logger.debug('Relaunch the cluster without constraining to prior '
532
769
  'cloud/region.')
533
770
  # Not using self.launch to avoid the retry until up logic.
534
- job_submitted_at = self._launch(max_retry=self._MAX_RETRY_CNT,
535
- raise_on_failure=False)
771
+ job_submitted_at = await self._launch(max_retry=self._MAX_RETRY_CNT,
772
+ raise_on_failure=False,
773
+ recovery=True)
536
774
  if job_submitted_at is None:
537
775
  # Failed to launch the cluster.
538
776
  gap_seconds = self.RETRY_INIT_GAP_SECONDS
539
777
  logger.info('Retrying to recover the cluster in '
540
778
  f'{gap_seconds:.1f} seconds.')
541
- time.sleep(gap_seconds)
779
+ await asyncio.sleep(gap_seconds)
542
780
  continue
543
781
 
544
782
  return job_submitted_at
783
+
784
+
785
+ def _get_logger_file(file_logger: logging.Logger) -> Optional[str]:
786
+ """Gets the file path that the logger writes to."""
787
+ for handler in file_logger.handlers:
788
+ if isinstance(handler, logging.FileHandler):
789
+ return handler.baseFilename
790
+ return None