skypilot-nightly 1.0.0.dev20250502__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (546) hide show
  1. sky/__init__.py +22 -6
  2. sky/adaptors/aws.py +81 -16
  3. sky/adaptors/common.py +25 -2
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/gcp.py +11 -0
  7. sky/adaptors/hyperbolic.py +8 -0
  8. sky/adaptors/ibm.py +5 -2
  9. sky/adaptors/kubernetes.py +149 -18
  10. sky/adaptors/nebius.py +173 -30
  11. sky/adaptors/primeintellect.py +1 -0
  12. sky/adaptors/runpod.py +68 -0
  13. sky/adaptors/seeweb.py +183 -0
  14. sky/adaptors/shadeform.py +89 -0
  15. sky/admin_policy.py +187 -4
  16. sky/authentication.py +179 -225
  17. sky/backends/__init__.py +4 -2
  18. sky/backends/backend.py +22 -9
  19. sky/backends/backend_utils.py +1323 -397
  20. sky/backends/cloud_vm_ray_backend.py +1749 -1029
  21. sky/backends/docker_utils.py +1 -1
  22. sky/backends/local_docker_backend.py +11 -6
  23. sky/backends/task_codegen.py +633 -0
  24. sky/backends/wheel_utils.py +55 -9
  25. sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
  26. sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
  27. sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
  28. sky/{clouds/service_catalog → catalog}/common.py +90 -49
  29. sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
  30. sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
  31. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +116 -80
  32. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
  33. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +70 -16
  34. sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
  35. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
  36. sky/catalog/data_fetchers/fetch_nebius.py +338 -0
  37. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  38. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  39. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  40. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
  41. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
  42. sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
  43. sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
  44. sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
  45. sky/catalog/hyperbolic_catalog.py +136 -0
  46. sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
  47. sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
  48. sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
  49. sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
  50. sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
  51. sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
  52. sky/catalog/primeintellect_catalog.py +95 -0
  53. sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
  54. sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
  55. sky/catalog/seeweb_catalog.py +184 -0
  56. sky/catalog/shadeform_catalog.py +165 -0
  57. sky/catalog/ssh_catalog.py +167 -0
  58. sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
  59. sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
  60. sky/check.py +533 -185
  61. sky/cli.py +5 -5975
  62. sky/client/{cli.py → cli/command.py} +2591 -1956
  63. sky/client/cli/deprecation_utils.py +99 -0
  64. sky/client/cli/flags.py +359 -0
  65. sky/client/cli/table_utils.py +322 -0
  66. sky/client/cli/utils.py +79 -0
  67. sky/client/common.py +78 -32
  68. sky/client/oauth.py +82 -0
  69. sky/client/sdk.py +1219 -319
  70. sky/client/sdk_async.py +827 -0
  71. sky/client/service_account_auth.py +47 -0
  72. sky/cloud_stores.py +82 -3
  73. sky/clouds/__init__.py +13 -0
  74. sky/clouds/aws.py +564 -164
  75. sky/clouds/azure.py +105 -83
  76. sky/clouds/cloud.py +140 -40
  77. sky/clouds/cudo.py +68 -50
  78. sky/clouds/do.py +66 -48
  79. sky/clouds/fluidstack.py +63 -44
  80. sky/clouds/gcp.py +339 -110
  81. sky/clouds/hyperbolic.py +293 -0
  82. sky/clouds/ibm.py +70 -49
  83. sky/clouds/kubernetes.py +570 -162
  84. sky/clouds/lambda_cloud.py +74 -54
  85. sky/clouds/nebius.py +210 -81
  86. sky/clouds/oci.py +88 -66
  87. sky/clouds/paperspace.py +61 -44
  88. sky/clouds/primeintellect.py +317 -0
  89. sky/clouds/runpod.py +164 -74
  90. sky/clouds/scp.py +89 -86
  91. sky/clouds/seeweb.py +477 -0
  92. sky/clouds/shadeform.py +400 -0
  93. sky/clouds/ssh.py +263 -0
  94. sky/clouds/utils/aws_utils.py +10 -4
  95. sky/clouds/utils/gcp_utils.py +87 -11
  96. sky/clouds/utils/oci_utils.py +38 -14
  97. sky/clouds/utils/scp_utils.py +231 -167
  98. sky/clouds/vast.py +99 -77
  99. sky/clouds/vsphere.py +51 -40
  100. sky/core.py +375 -173
  101. sky/dag.py +15 -0
  102. sky/dashboard/out/404.html +1 -1
  103. sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
  104. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  105. sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
  106. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
  107. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  108. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
  109. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  110. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
  111. sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
  112. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
  113. sky/dashboard/out/_next/static/chunks/3800-7b45f9fbb6308557.js +1 -0
  114. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
  115. sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
  116. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
  117. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  118. sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
  119. sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
  120. sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
  121. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  122. sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
  123. sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
  124. sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
  125. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  126. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
  127. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
  128. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  129. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
  130. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  131. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  132. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
  133. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  134. sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
  135. sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
  136. sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
  137. sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
  138. sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
  139. sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
  140. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
  141. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
  142. sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
  143. sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
  144. sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
  145. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
  146. sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
  147. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
  148. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
  149. sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
  150. sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
  151. sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
  152. sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
  153. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-84a40f8c7c627fe4.js +1 -0
  154. sky/dashboard/out/_next/static/chunks/pages/workspaces-531b2f8c4bf89f82.js +1 -0
  155. sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
  156. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  157. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  158. sky/dashboard/out/clusters/[cluster].html +1 -1
  159. sky/dashboard/out/clusters.html +1 -1
  160. sky/dashboard/out/config.html +1 -0
  161. sky/dashboard/out/index.html +1 -1
  162. sky/dashboard/out/infra/[context].html +1 -0
  163. sky/dashboard/out/infra.html +1 -0
  164. sky/dashboard/out/jobs/[job].html +1 -1
  165. sky/dashboard/out/jobs/pools/[pool].html +1 -0
  166. sky/dashboard/out/jobs.html +1 -1
  167. sky/dashboard/out/users.html +1 -0
  168. sky/dashboard/out/volumes.html +1 -0
  169. sky/dashboard/out/workspace/new.html +1 -0
  170. sky/dashboard/out/workspaces/[name].html +1 -0
  171. sky/dashboard/out/workspaces.html +1 -0
  172. sky/data/data_utils.py +137 -1
  173. sky/data/mounting_utils.py +269 -84
  174. sky/data/storage.py +1460 -1807
  175. sky/data/storage_utils.py +43 -57
  176. sky/exceptions.py +126 -2
  177. sky/execution.py +216 -63
  178. sky/global_user_state.py +2390 -586
  179. sky/jobs/__init__.py +7 -0
  180. sky/jobs/client/sdk.py +300 -58
  181. sky/jobs/client/sdk_async.py +161 -0
  182. sky/jobs/constants.py +15 -8
  183. sky/jobs/controller.py +848 -275
  184. sky/jobs/file_content_utils.py +128 -0
  185. sky/jobs/log_gc.py +193 -0
  186. sky/jobs/recovery_strategy.py +402 -152
  187. sky/jobs/scheduler.py +314 -189
  188. sky/jobs/server/core.py +836 -255
  189. sky/jobs/server/server.py +156 -115
  190. sky/jobs/server/utils.py +136 -0
  191. sky/jobs/state.py +2109 -706
  192. sky/jobs/utils.py +1306 -215
  193. sky/logs/__init__.py +21 -0
  194. sky/logs/agent.py +108 -0
  195. sky/logs/aws.py +243 -0
  196. sky/logs/gcp.py +91 -0
  197. sky/metrics/__init__.py +0 -0
  198. sky/metrics/utils.py +453 -0
  199. sky/models.py +78 -1
  200. sky/optimizer.py +164 -70
  201. sky/provision/__init__.py +90 -4
  202. sky/provision/aws/config.py +147 -26
  203. sky/provision/aws/instance.py +136 -50
  204. sky/provision/azure/instance.py +11 -6
  205. sky/provision/common.py +13 -1
  206. sky/provision/cudo/cudo_machine_type.py +1 -1
  207. sky/provision/cudo/cudo_utils.py +14 -8
  208. sky/provision/cudo/cudo_wrapper.py +72 -71
  209. sky/provision/cudo/instance.py +10 -6
  210. sky/provision/do/instance.py +10 -6
  211. sky/provision/do/utils.py +4 -3
  212. sky/provision/docker_utils.py +140 -33
  213. sky/provision/fluidstack/instance.py +13 -8
  214. sky/provision/gcp/__init__.py +1 -0
  215. sky/provision/gcp/config.py +301 -19
  216. sky/provision/gcp/constants.py +218 -0
  217. sky/provision/gcp/instance.py +36 -8
  218. sky/provision/gcp/instance_utils.py +18 -4
  219. sky/provision/gcp/volume_utils.py +247 -0
  220. sky/provision/hyperbolic/__init__.py +12 -0
  221. sky/provision/hyperbolic/config.py +10 -0
  222. sky/provision/hyperbolic/instance.py +437 -0
  223. sky/provision/hyperbolic/utils.py +373 -0
  224. sky/provision/instance_setup.py +101 -20
  225. sky/provision/kubernetes/__init__.py +5 -0
  226. sky/provision/kubernetes/config.py +9 -52
  227. sky/provision/kubernetes/constants.py +17 -0
  228. sky/provision/kubernetes/instance.py +919 -280
  229. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  230. sky/provision/kubernetes/network.py +27 -17
  231. sky/provision/kubernetes/network_utils.py +44 -43
  232. sky/provision/kubernetes/utils.py +1221 -534
  233. sky/provision/kubernetes/volume.py +343 -0
  234. sky/provision/lambda_cloud/instance.py +22 -16
  235. sky/provision/nebius/constants.py +50 -0
  236. sky/provision/nebius/instance.py +19 -6
  237. sky/provision/nebius/utils.py +237 -137
  238. sky/provision/oci/instance.py +10 -5
  239. sky/provision/paperspace/instance.py +10 -7
  240. sky/provision/paperspace/utils.py +1 -1
  241. sky/provision/primeintellect/__init__.py +10 -0
  242. sky/provision/primeintellect/config.py +11 -0
  243. sky/provision/primeintellect/instance.py +454 -0
  244. sky/provision/primeintellect/utils.py +398 -0
  245. sky/provision/provisioner.py +117 -36
  246. sky/provision/runpod/__init__.py +5 -0
  247. sky/provision/runpod/instance.py +27 -6
  248. sky/provision/runpod/utils.py +51 -18
  249. sky/provision/runpod/volume.py +214 -0
  250. sky/provision/scp/__init__.py +15 -0
  251. sky/provision/scp/config.py +93 -0
  252. sky/provision/scp/instance.py +707 -0
  253. sky/provision/seeweb/__init__.py +11 -0
  254. sky/provision/seeweb/config.py +13 -0
  255. sky/provision/seeweb/instance.py +812 -0
  256. sky/provision/shadeform/__init__.py +11 -0
  257. sky/provision/shadeform/config.py +12 -0
  258. sky/provision/shadeform/instance.py +351 -0
  259. sky/provision/shadeform/shadeform_utils.py +83 -0
  260. sky/provision/ssh/__init__.py +18 -0
  261. sky/provision/vast/instance.py +13 -8
  262. sky/provision/vast/utils.py +10 -7
  263. sky/provision/volume.py +164 -0
  264. sky/provision/vsphere/common/ssl_helper.py +1 -1
  265. sky/provision/vsphere/common/vapiconnect.py +2 -1
  266. sky/provision/vsphere/common/vim_utils.py +4 -4
  267. sky/provision/vsphere/instance.py +15 -10
  268. sky/provision/vsphere/vsphere_utils.py +17 -20
  269. sky/py.typed +0 -0
  270. sky/resources.py +845 -119
  271. sky/schemas/__init__.py +0 -0
  272. sky/schemas/api/__init__.py +0 -0
  273. sky/schemas/api/responses.py +227 -0
  274. sky/schemas/db/README +4 -0
  275. sky/schemas/db/env.py +90 -0
  276. sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
  277. sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
  278. sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
  279. sky/schemas/db/global_user_state/004_is_managed.py +34 -0
  280. sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
  281. sky/schemas/db/global_user_state/006_provision_log.py +41 -0
  282. sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
  283. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  284. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  285. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  286. sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
  287. sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
  288. sky/schemas/db/script.py.mako +28 -0
  289. sky/schemas/db/serve_state/001_initial_schema.py +67 -0
  290. sky/schemas/db/serve_state/002_yaml_content.py +34 -0
  291. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  292. sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
  293. sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
  294. sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
  295. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  296. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  297. sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
  298. sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
  299. sky/schemas/generated/__init__.py +0 -0
  300. sky/schemas/generated/autostopv1_pb2.py +36 -0
  301. sky/schemas/generated/autostopv1_pb2.pyi +43 -0
  302. sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
  303. sky/schemas/generated/jobsv1_pb2.py +86 -0
  304. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  305. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  306. sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
  307. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  308. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  309. sky/schemas/generated/servev1_pb2.py +58 -0
  310. sky/schemas/generated/servev1_pb2.pyi +115 -0
  311. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  312. sky/serve/autoscalers.py +357 -5
  313. sky/serve/client/impl.py +310 -0
  314. sky/serve/client/sdk.py +47 -139
  315. sky/serve/client/sdk_async.py +130 -0
  316. sky/serve/constants.py +12 -9
  317. sky/serve/controller.py +68 -17
  318. sky/serve/load_balancer.py +106 -60
  319. sky/serve/load_balancing_policies.py +116 -2
  320. sky/serve/replica_managers.py +434 -249
  321. sky/serve/serve_rpc_utils.py +179 -0
  322. sky/serve/serve_state.py +569 -257
  323. sky/serve/serve_utils.py +775 -265
  324. sky/serve/server/core.py +66 -711
  325. sky/serve/server/impl.py +1093 -0
  326. sky/serve/server/server.py +21 -18
  327. sky/serve/service.py +192 -89
  328. sky/serve/service_spec.py +144 -20
  329. sky/serve/spot_placer.py +3 -0
  330. sky/server/auth/__init__.py +0 -0
  331. sky/server/auth/authn.py +50 -0
  332. sky/server/auth/loopback.py +38 -0
  333. sky/server/auth/oauth2_proxy.py +202 -0
  334. sky/server/common.py +478 -182
  335. sky/server/config.py +85 -23
  336. sky/server/constants.py +44 -6
  337. sky/server/daemons.py +295 -0
  338. sky/server/html/token_page.html +185 -0
  339. sky/server/metrics.py +160 -0
  340. sky/server/middleware_utils.py +166 -0
  341. sky/server/requests/executor.py +558 -138
  342. sky/server/requests/payloads.py +364 -24
  343. sky/server/requests/preconditions.py +21 -17
  344. sky/server/requests/process.py +112 -29
  345. sky/server/requests/request_names.py +121 -0
  346. sky/server/requests/requests.py +822 -226
  347. sky/server/requests/serializers/decoders.py +82 -31
  348. sky/server/requests/serializers/encoders.py +140 -22
  349. sky/server/requests/threads.py +117 -0
  350. sky/server/rest.py +455 -0
  351. sky/server/server.py +1309 -285
  352. sky/server/state.py +20 -0
  353. sky/server/stream_utils.py +327 -61
  354. sky/server/uvicorn.py +217 -3
  355. sky/server/versions.py +270 -0
  356. sky/setup_files/MANIFEST.in +11 -1
  357. sky/setup_files/alembic.ini +160 -0
  358. sky/setup_files/dependencies.py +139 -31
  359. sky/setup_files/setup.py +44 -42
  360. sky/sky_logging.py +114 -7
  361. sky/skylet/attempt_skylet.py +106 -24
  362. sky/skylet/autostop_lib.py +129 -8
  363. sky/skylet/configs.py +29 -20
  364. sky/skylet/constants.py +216 -25
  365. sky/skylet/events.py +101 -21
  366. sky/skylet/job_lib.py +345 -164
  367. sky/skylet/log_lib.py +297 -18
  368. sky/skylet/log_lib.pyi +44 -1
  369. sky/skylet/providers/ibm/node_provider.py +12 -8
  370. sky/skylet/providers/ibm/vpc_provider.py +13 -12
  371. sky/skylet/ray_patches/__init__.py +17 -3
  372. sky/skylet/ray_patches/autoscaler.py.diff +18 -0
  373. sky/skylet/ray_patches/cli.py.diff +19 -0
  374. sky/skylet/ray_patches/command_runner.py.diff +17 -0
  375. sky/skylet/ray_patches/log_monitor.py.diff +20 -0
  376. sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
  377. sky/skylet/ray_patches/updater.py.diff +18 -0
  378. sky/skylet/ray_patches/worker.py.diff +41 -0
  379. sky/skylet/runtime_utils.py +21 -0
  380. sky/skylet/services.py +568 -0
  381. sky/skylet/skylet.py +72 -4
  382. sky/skylet/subprocess_daemon.py +104 -29
  383. sky/skypilot_config.py +506 -99
  384. sky/ssh_node_pools/__init__.py +1 -0
  385. sky/ssh_node_pools/core.py +135 -0
  386. sky/ssh_node_pools/server.py +233 -0
  387. sky/task.py +685 -163
  388. sky/templates/aws-ray.yml.j2 +11 -3
  389. sky/templates/azure-ray.yml.j2 +2 -1
  390. sky/templates/cudo-ray.yml.j2 +1 -0
  391. sky/templates/do-ray.yml.j2 +2 -1
  392. sky/templates/fluidstack-ray.yml.j2 +1 -0
  393. sky/templates/gcp-ray.yml.j2 +62 -1
  394. sky/templates/hyperbolic-ray.yml.j2 +68 -0
  395. sky/templates/ibm-ray.yml.j2 +2 -1
  396. sky/templates/jobs-controller.yaml.j2 +27 -24
  397. sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
  398. sky/templates/kubernetes-ray.yml.j2 +611 -50
  399. sky/templates/lambda-ray.yml.j2 +2 -1
  400. sky/templates/nebius-ray.yml.j2 +34 -12
  401. sky/templates/oci-ray.yml.j2 +1 -0
  402. sky/templates/paperspace-ray.yml.j2 +2 -1
  403. sky/templates/primeintellect-ray.yml.j2 +72 -0
  404. sky/templates/runpod-ray.yml.j2 +10 -1
  405. sky/templates/scp-ray.yml.j2 +4 -50
  406. sky/templates/seeweb-ray.yml.j2 +171 -0
  407. sky/templates/shadeform-ray.yml.j2 +73 -0
  408. sky/templates/sky-serve-controller.yaml.j2 +22 -2
  409. sky/templates/vast-ray.yml.j2 +1 -0
  410. sky/templates/vsphere-ray.yml.j2 +1 -0
  411. sky/templates/websocket_proxy.py +212 -37
  412. sky/usage/usage_lib.py +31 -15
  413. sky/users/__init__.py +0 -0
  414. sky/users/model.conf +15 -0
  415. sky/users/permission.py +397 -0
  416. sky/users/rbac.py +121 -0
  417. sky/users/server.py +720 -0
  418. sky/users/token_service.py +218 -0
  419. sky/utils/accelerator_registry.py +35 -5
  420. sky/utils/admin_policy_utils.py +84 -38
  421. sky/utils/annotations.py +38 -5
  422. sky/utils/asyncio_utils.py +78 -0
  423. sky/utils/atomic.py +1 -1
  424. sky/utils/auth_utils.py +153 -0
  425. sky/utils/benchmark_utils.py +60 -0
  426. sky/utils/cli_utils/status_utils.py +159 -86
  427. sky/utils/cluster_utils.py +31 -9
  428. sky/utils/command_runner.py +354 -68
  429. sky/utils/command_runner.pyi +93 -3
  430. sky/utils/common.py +35 -8
  431. sky/utils/common_utils.py +314 -91
  432. sky/utils/config_utils.py +74 -5
  433. sky/utils/context.py +403 -0
  434. sky/utils/context_utils.py +242 -0
  435. sky/utils/controller_utils.py +383 -89
  436. sky/utils/dag_utils.py +31 -12
  437. sky/utils/db/__init__.py +0 -0
  438. sky/utils/db/db_utils.py +485 -0
  439. sky/utils/db/kv_cache.py +149 -0
  440. sky/utils/db/migration_utils.py +137 -0
  441. sky/utils/directory_utils.py +12 -0
  442. sky/utils/env_options.py +13 -0
  443. sky/utils/git.py +567 -0
  444. sky/utils/git_clone.sh +460 -0
  445. sky/utils/infra_utils.py +195 -0
  446. sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
  447. sky/utils/kubernetes/config_map_utils.py +133 -0
  448. sky/utils/kubernetes/create_cluster.sh +15 -29
  449. sky/utils/kubernetes/delete_cluster.sh +10 -7
  450. sky/utils/kubernetes/deploy_ssh_node_pools.py +1177 -0
  451. sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
  452. sky/utils/kubernetes/generate_kind_config.py +6 -66
  453. sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
  454. sky/utils/kubernetes/gpu_labeler.py +18 -8
  455. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
  456. sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
  457. sky/utils/kubernetes/kubernetes_deploy_utils.py +284 -114
  458. sky/utils/kubernetes/rsync_helper.sh +11 -3
  459. sky/utils/kubernetes/ssh-tunnel.sh +379 -0
  460. sky/utils/kubernetes/ssh_utils.py +221 -0
  461. sky/utils/kubernetes_enums.py +8 -15
  462. sky/utils/lock_events.py +94 -0
  463. sky/utils/locks.py +416 -0
  464. sky/utils/log_utils.py +82 -107
  465. sky/utils/perf_utils.py +22 -0
  466. sky/utils/resource_checker.py +298 -0
  467. sky/utils/resources_utils.py +249 -32
  468. sky/utils/rich_utils.py +217 -39
  469. sky/utils/schemas.py +955 -160
  470. sky/utils/serialize_utils.py +16 -0
  471. sky/utils/status_lib.py +10 -0
  472. sky/utils/subprocess_utils.py +29 -15
  473. sky/utils/tempstore.py +70 -0
  474. sky/utils/thread_utils.py +91 -0
  475. sky/utils/timeline.py +26 -53
  476. sky/utils/ux_utils.py +84 -15
  477. sky/utils/validator.py +11 -1
  478. sky/utils/volume.py +165 -0
  479. sky/utils/yaml_utils.py +111 -0
  480. sky/volumes/__init__.py +13 -0
  481. sky/volumes/client/__init__.py +0 -0
  482. sky/volumes/client/sdk.py +150 -0
  483. sky/volumes/server/__init__.py +0 -0
  484. sky/volumes/server/core.py +270 -0
  485. sky/volumes/server/server.py +124 -0
  486. sky/volumes/volume.py +215 -0
  487. sky/workspaces/__init__.py +0 -0
  488. sky/workspaces/core.py +655 -0
  489. sky/workspaces/server.py +101 -0
  490. sky/workspaces/utils.py +56 -0
  491. sky_templates/README.md +3 -0
  492. sky_templates/__init__.py +3 -0
  493. sky_templates/ray/__init__.py +0 -0
  494. sky_templates/ray/start_cluster +183 -0
  495. sky_templates/ray/stop_cluster +75 -0
  496. skypilot_nightly-1.0.0.dev20251203.dist-info/METADATA +676 -0
  497. skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
  498. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +1 -1
  499. skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
  500. sky/benchmark/benchmark_state.py +0 -256
  501. sky/benchmark/benchmark_utils.py +0 -641
  502. sky/clouds/service_catalog/constants.py +0 -7
  503. sky/dashboard/out/_next/static/GWvVBSCS7FmUiVmjaL1a7/_buildManifest.js +0 -1
  504. sky/dashboard/out/_next/static/chunks/236-2db3ee3fba33dd9e.js +0 -6
  505. sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
  506. sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
  507. sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
  508. sky/dashboard/out/_next/static/chunks/845-9e60713e0c441abc.js +0 -1
  509. sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
  510. sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
  511. sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
  512. sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
  513. sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
  514. sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
  515. sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
  516. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-6ac338bc2239cb45.js +0 -1
  517. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
  518. sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
  519. sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
  520. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-1c519e1afc523dc9.js +0 -1
  521. sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
  522. sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
  523. sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
  524. sky/jobs/dashboard/dashboard.py +0 -223
  525. sky/jobs/dashboard/static/favicon.ico +0 -0
  526. sky/jobs/dashboard/templates/index.html +0 -831
  527. sky/jobs/server/dashboard_utils.py +0 -69
  528. sky/skylet/providers/scp/__init__.py +0 -2
  529. sky/skylet/providers/scp/config.py +0 -149
  530. sky/skylet/providers/scp/node_provider.py +0 -578
  531. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  532. sky/utils/db_utils.py +0 -100
  533. sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
  534. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  535. skypilot_nightly-1.0.0.dev20250502.dist-info/METADATA +0 -361
  536. skypilot_nightly-1.0.0.dev20250502.dist-info/RECORD +0 -396
  537. skypilot_nightly-1.0.0.dev20250502.dist-info/top_level.txt +0 -1
  538. /sky/{clouds/service_catalog → catalog}/config.py +0 -0
  539. /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
  540. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
  541. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
  542. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
  543. /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
  544. /sky/dashboard/out/_next/static/{GWvVBSCS7FmUiVmjaL1a7 → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
  545. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
  546. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
@@ -5,23 +5,31 @@ In the YAML file, the user can specify the strategy to use for managed jobs.
5
5
  resources:
6
6
  job_recovery: EAGER_NEXT_REGION
7
7
  """
8
- import time
8
+ import asyncio
9
+ import logging
10
+ import os
9
11
  import traceback
10
12
  import typing
11
- from typing import Optional
13
+ from typing import Optional, Set
12
14
 
13
- import sky
14
15
  from sky import backends
16
+ from sky import dag as dag_lib
15
17
  from sky import exceptions
16
- from sky import execution
17
18
  from sky import global_user_state
18
19
  from sky import sky_logging
20
+ from sky import skypilot_config
19
21
  from sky.backends import backend_utils
22
+ from sky.client import sdk
20
23
  from sky.jobs import scheduler
24
+ from sky.jobs import state
21
25
  from sky.jobs import utils as managed_job_utils
26
+ from sky.serve import serve_utils
27
+ from sky.skylet import constants
22
28
  from sky.skylet import job_lib
23
29
  from sky.usage import usage_lib
24
30
  from sky.utils import common_utils
31
+ from sky.utils import context_utils
32
+ from sky.utils import env_options
25
33
  from sky.utils import registry
26
34
  from sky.utils import status_lib
27
35
  from sky.utils import ux_utils
@@ -39,7 +47,14 @@ MAX_JOB_CHECKING_RETRY = 10
39
47
  # Minutes to job cluster autodown. This should be significantly larger than
40
48
  # managed_job_utils.JOB_STATUS_CHECK_GAP_SECONDS, to avoid tearing down the
41
49
  # cluster before its status can be updated by the job controller.
42
- _AUTODOWN_MINUTES = 5
50
+ _AUTODOWN_MINUTES = 10
51
+
52
+ ENV_VARS_TO_CLEAR = [
53
+ skypilot_config.ENV_VAR_SKYPILOT_CONFIG,
54
+ constants.USER_ID_ENV_VAR,
55
+ constants.USER_ENV_VAR,
56
+ env_options.Options.SHOW_DEBUG_INFO.env_key,
57
+ ]
43
58
 
44
59
 
45
60
  class StrategyExecutor:
@@ -47,29 +62,65 @@ class StrategyExecutor:
47
62
 
48
63
  RETRY_INIT_GAP_SECONDS = 60
49
64
 
50
- def __init__(self, cluster_name: str, backend: 'backends.Backend',
51
- task: 'task_lib.Task', max_restarts_on_errors: int,
52
- job_id: int) -> None:
65
+ def __init__(
66
+ self,
67
+ cluster_name: Optional[str],
68
+ backend: 'backends.Backend',
69
+ task: 'task_lib.Task',
70
+ max_restarts_on_errors: int,
71
+ job_id: int,
72
+ task_id: int,
73
+ pool: Optional[str],
74
+ starting: Set[int],
75
+ starting_lock: asyncio.Lock,
76
+ starting_signal: asyncio.Condition,
77
+ ) -> None:
53
78
  """Initialize the strategy executor.
54
79
 
55
80
  Args:
56
81
  cluster_name: The name of the cluster.
57
82
  backend: The backend to use. Only CloudVMRayBackend is supported.
58
83
  task: The task to execute.
84
+ max_restarts_on_errors: Maximum number of restarts on errors.
85
+ job_id: The ID of the job.
86
+ task_id: The ID of the task.
87
+ starting: Set of job IDs that are currently starting.
88
+ starting_lock: Lock to synchronize starting jobs.
89
+ starting_signal: Condition to signal when a job can start.
59
90
  """
60
91
  assert isinstance(backend, backends.CloudVmRayBackend), (
61
92
  'Only CloudVMRayBackend is supported.')
62
- self.dag = sky.Dag()
93
+ self.dag = dag_lib.Dag()
63
94
  self.dag.add(task)
95
+ # For jobs submitted to a pool, the cluster name might change after each
96
+ # recovery. Initially this is set to an empty string to indicate that no
97
+ # cluster is assigned yet, and in `_launch`, it will be set to one of
98
+ # the cluster names in the pool.
64
99
  self.cluster_name = cluster_name
65
100
  self.backend = backend
66
101
  self.max_restarts_on_errors = max_restarts_on_errors
67
102
  self.job_id = job_id
103
+ self.task_id = task_id
104
+ self.pool = pool
68
105
  self.restart_cnt_on_failure = 0
106
+ self.job_id_on_pool_cluster: Optional[int] = None
107
+ self.starting = starting
108
+ self.starting_lock = starting_lock
109
+ self.starting_signal = starting_signal
69
110
 
70
111
  @classmethod
71
- def make(cls, cluster_name: str, backend: 'backends.Backend',
72
- task: 'task_lib.Task', job_id: int) -> 'StrategyExecutor':
112
+ def make(
113
+ cls,
114
+ cluster_name: Optional[str],
115
+ backend: 'backends.Backend',
116
+ task: 'task_lib.Task',
117
+ job_id: int,
118
+ task_id: int,
119
+ pool: Optional[str],
120
+ starting: Set[int],
121
+ starting_lock: asyncio.Lock,
122
+ starting_signal: asyncio.Condition,
123
+ ) -> 'StrategyExecutor':
73
124
  """Create a strategy from a task."""
74
125
 
75
126
  resource_list = list(task.resources)
@@ -86,8 +137,11 @@ class StrategyExecutor:
86
137
  # original task.resources
87
138
  task.set_resources(type(task.resources)(new_resources_list))
88
139
  if isinstance(job_recovery, dict):
89
- job_recovery_name = job_recovery.pop(
140
+ name = job_recovery.pop(
90
141
  'strategy', registry.JOBS_RECOVERY_STRATEGY_REGISTRY.default)
142
+ assert name is None or isinstance(name, str), (
143
+ name, 'The job recovery strategy name must be a string or None')
144
+ job_recovery_name: Optional[str] = name
91
145
  max_restarts_on_errors = job_recovery.pop('max_restarts_on_errors',
92
146
  0)
93
147
  else:
@@ -97,9 +151,11 @@ class StrategyExecutor:
97
151
  from_str(job_recovery_name))
98
152
  assert job_recovery_strategy is not None, job_recovery_name
99
153
  return job_recovery_strategy(cluster_name, backend, task,
100
- max_restarts_on_errors, job_id)
154
+ max_restarts_on_errors, job_id, task_id,
155
+ pool, starting, starting_lock,
156
+ starting_signal)
101
157
 
102
- def launch(self) -> float:
158
+ async def launch(self) -> float:
103
159
  """Launch the cluster for the first time.
104
160
 
105
161
  It can fail if resource is not available. Need to check the cluster
@@ -111,11 +167,11 @@ class StrategyExecutor:
111
167
  Raises: Please refer to the docstring of self._launch().
112
168
  """
113
169
 
114
- job_submit_at = self._launch(max_retry=None)
170
+ job_submit_at = await self._launch(max_retry=None)
115
171
  assert job_submit_at is not None
116
172
  return job_submit_at
117
173
 
118
- def recover(self) -> float:
174
+ async def recover(self) -> float:
119
175
  """Relaunch the cluster after failure and wait until job starts.
120
176
 
121
177
  When recover() is called the cluster should be in STOPPED status (i.e.
@@ -125,12 +181,12 @@ class StrategyExecutor:
125
181
  """
126
182
  raise NotImplementedError
127
183
 
128
- def _try_cancel_all_jobs(self):
129
- from sky import core # pylint: disable=import-outside-toplevel
130
-
131
- handle = global_user_state.get_handle_from_cluster_name(
132
- self.cluster_name)
133
- if handle is None:
184
+ async def _try_cancel_jobs(self):
185
+ if self.cluster_name is None:
186
+ return
187
+ handle = await context_utils.to_thread(
188
+ global_user_state.get_handle_from_cluster_name, self.cluster_name)
189
+ if handle is None or self.pool is not None:
134
190
  return
135
191
  try:
136
192
  usage_lib.messages.usage.set_internal()
@@ -153,9 +209,26 @@ class StrategyExecutor:
153
209
  # should be functional with the `_try_cancel_if_cluster_is_init`
154
210
  # flag, i.e. it sends the cancel signal to the head node, which will
155
211
  # then kill the user process on remaining worker nodes.
156
- core.cancel(cluster_name=self.cluster_name,
157
- all=True,
158
- _try_cancel_if_cluster_is_init=True)
212
+ # Only cancel the corresponding job for pool.
213
+ if self.pool is None:
214
+ request_id = await context_utils.to_thread(
215
+ sdk.cancel,
216
+ cluster_name=self.cluster_name,
217
+ all=True,
218
+ _try_cancel_if_cluster_is_init=True,
219
+ )
220
+ else:
221
+ request_id = await context_utils.to_thread(
222
+ sdk.cancel,
223
+ cluster_name=self.cluster_name,
224
+ job_ids=[self.job_id_on_pool_cluster],
225
+ _try_cancel_if_cluster_is_init=True,
226
+ )
227
+ logger.debug(f'sdk.cancel request ID: {request_id}')
228
+ await context_utils.to_thread(
229
+ sdk.get,
230
+ request_id,
231
+ )
159
232
  except Exception as e: # pylint: disable=broad-except
160
233
  logger.info('Failed to cancel the job on the cluster. The cluster '
161
234
  'might be already down or the head node is preempted.'
@@ -163,25 +236,26 @@ class StrategyExecutor:
163
236
  f'{common_utils.format_exception(e)}\n'
164
237
  'Terminating the cluster explicitly to ensure no '
165
238
  'remaining job process interferes with recovery.')
166
- managed_job_utils.terminate_cluster(self.cluster_name)
239
+ await context_utils.to_thread(self._cleanup_cluster)
167
240
 
168
- def _wait_until_job_starts_on_cluster(self) -> Optional[float]:
241
+ async def _wait_until_job_starts_on_cluster(self) -> Optional[float]:
169
242
  """Wait for MAX_JOB_CHECKING_RETRY times until job starts on the cluster
170
243
 
171
244
  Returns:
172
245
  The timestamp of when the job is submitted, or None if failed to
173
246
  submit.
174
247
  """
248
+ assert self.cluster_name is not None
175
249
  status = None
176
250
  job_checking_retry_cnt = 0
177
251
  while job_checking_retry_cnt < MAX_JOB_CHECKING_RETRY:
178
252
  # Avoid the infinite loop, if any bug happens.
179
253
  job_checking_retry_cnt += 1
180
254
  try:
181
- cluster_status, _ = (
182
- backend_utils.refresh_cluster_status_handle(
183
- self.cluster_name,
184
- force_refresh_statuses=set(status_lib.ClusterStatus)))
255
+ cluster_status, _ = (await context_utils.to_thread(
256
+ backend_utils.refresh_cluster_status_handle,
257
+ self.cluster_name,
258
+ force_refresh_statuses=set(status_lib.ClusterStatus)))
185
259
  except Exception as e: # pylint: disable=broad-except
186
260
  # If any unexpected error happens, retry the job checking
187
261
  # loop.
@@ -201,8 +275,10 @@ class StrategyExecutor:
201
275
  break
202
276
 
203
277
  try:
204
- status = managed_job_utils.get_job_status(
205
- self.backend, self.cluster_name)
278
+ status = await managed_job_utils.get_job_status(
279
+ self.backend,
280
+ self.cluster_name,
281
+ job_id=self.job_id_on_pool_cluster)
206
282
  except Exception as e: # pylint: disable=broad-except
207
283
  # If any unexpected error happens, retry the job checking
208
284
  # loop.
@@ -217,8 +293,12 @@ class StrategyExecutor:
217
293
  # Check the job status until it is not in initialized status
218
294
  if status is not None and status > job_lib.JobStatus.INIT:
219
295
  try:
220
- job_submitted_at = managed_job_utils.get_job_timestamp(
221
- self.backend, self.cluster_name, get_end_time=False)
296
+ job_submitted_at = await context_utils.to_thread(
297
+ managed_job_utils.get_job_timestamp,
298
+ self.backend,
299
+ self.cluster_name,
300
+ self.job_id_on_pool_cluster,
301
+ get_end_time=False)
222
302
  return job_submitted_at
223
303
  except Exception as e: # pylint: disable=broad-except
224
304
  # If we failed to get the job timestamp, we will retry
@@ -227,12 +307,20 @@ class StrategyExecutor:
227
307
  'the job start timestamp. Retrying.')
228
308
  continue
229
309
  # Wait for the job to be started
230
- time.sleep(managed_job_utils.JOB_STARTED_STATUS_CHECK_GAP_SECONDS)
310
+ await asyncio.sleep(
311
+ managed_job_utils.JOB_STARTED_STATUS_CHECK_GAP_SECONDS)
231
312
  return None
232
313
 
233
- def _launch(self,
234
- max_retry: Optional[int] = 3,
235
- raise_on_failure: bool = True) -> Optional[float]:
314
+ def _cleanup_cluster(self) -> None:
315
+ if self.cluster_name is None:
316
+ return
317
+ if self.pool is None:
318
+ managed_job_utils.terminate_cluster(self.cluster_name)
319
+
320
+ async def _launch(self,
321
+ max_retry: Optional[int] = 3,
322
+ raise_on_failure: bool = True,
323
+ recovery: bool = False) -> Optional[float]:
236
324
  """Implementation of launch().
237
325
 
238
326
  The function will wait until the job starts running, but will leave the
@@ -272,98 +360,234 @@ class StrategyExecutor:
272
360
  backoff = common_utils.Backoff(self.RETRY_INIT_GAP_SECONDS)
273
361
  while True:
274
362
  retry_cnt += 1
275
- with scheduler.scheduled_launch(self.job_id):
276
- try:
277
- usage_lib.messages.usage.set_internal()
278
- # Detach setup, so that the setup failure can be detected
279
- # by the controller process (job_status -> FAILED_SETUP).
280
- execution.launch(
281
- self.dag,
282
- cluster_name=self.cluster_name,
283
- # We expect to tear down the cluster as soon as the job
284
- # is finished. However, in case the controller dies, set
285
- # autodown to try and avoid a resource leak.
286
- idle_minutes_to_autostop=_AUTODOWN_MINUTES,
287
- down=True,
288
- _is_launched_by_jobs_controller=True)
289
- logger.info('Managed job cluster launched.')
290
- except (exceptions.InvalidClusterNameError,
291
- exceptions.NoCloudAccessError,
292
- exceptions.ResourcesMismatchError) as e:
293
- logger.error('Failure happened before provisioning. '
294
- f'{common_utils.format_exception(e)}')
295
- if raise_on_failure:
296
- raise exceptions.ProvisionPrechecksError(reasons=[e])
297
- return None
298
- except exceptions.ResourcesUnavailableError as e:
299
- # This is raised when the launch fails due to prechecks or
300
- # after failing over through all the candidates.
301
- # Please refer to the docstring of `sky.launch` for more
302
- # details of how the exception will be structured.
303
- if not any(
304
- isinstance(err,
305
- exceptions.ResourcesUnavailableError)
306
- for err in e.failover_history):
307
- # _launch() (this function) should fail/exit directly,
308
- # if none of the failover reasons were because of
309
- # resource unavailability or no failover was attempted
310
- # (the optimizer cannot find feasible resources for
311
- # requested resources), i.e., e.failover_history is
312
- # empty. Failing directly avoids the infinite loop of
313
- # retrying the launch when, e.g., an invalid cluster
314
- # name is used and --retry-until-up is specified.
315
- reasons = (e.failover_history
316
- if e.failover_history else [e])
317
- reasons_str = '; '.join(
318
- common_utils.format_exception(err)
319
- for err in reasons)
320
- logger.error(
321
- 'Failure happened before provisioning. Failover '
322
- f'reasons: {reasons_str}')
363
+ try:
364
+ async with scheduler.scheduled_launch(
365
+ self.job_id,
366
+ self.starting,
367
+ self.starting_lock,
368
+ self.starting_signal,
369
+ ):
370
+ # The job state may have been PENDING during backoff -
371
+ # update to STARTING or RECOVERING.
372
+ # On the first attempt (when retry_cnt is 1), we should
373
+ # already be in STARTING or RECOVERING.
374
+ if retry_cnt > 1:
375
+ await state.set_restarting_async(
376
+ self.job_id, self.task_id, recovery)
377
+ try:
378
+ usage_lib.messages.usage.set_internal()
379
+ if self.pool is None:
380
+ assert self.cluster_name is not None
381
+
382
+ # sdk.launch will implicitly start the API server,
383
+ # but then the API server will inherit the current
384
+ # env vars/user, which we may not want.
385
+ # Instead, clear env vars here and call api_start
386
+ # explicitly.
387
+ vars_to_restore = {}
388
+ try:
389
+ for env_var in ENV_VARS_TO_CLEAR:
390
+ vars_to_restore[env_var] = os.environ.pop(
391
+ env_var, None)
392
+ logger.debug('Cleared env var: '
393
+ f'{env_var}')
394
+ logger.debug('Env vars for api_start: '
395
+ f'{os.environ}')
396
+ await context_utils.to_thread(sdk.api_start)
397
+ logger.info('API server started.')
398
+ finally:
399
+ for env_var, value in vars_to_restore.items():
400
+ if value is not None:
401
+ logger.debug('Restored env var: '
402
+ f'{env_var}: {value}')
403
+ os.environ[env_var] = value
404
+
405
+ request_id = None
406
+ try:
407
+ request_id = await context_utils.to_thread(
408
+ sdk.launch,
409
+ self.dag,
410
+ cluster_name=self.cluster_name,
411
+ # We expect to tear down the cluster as soon
412
+ # as the job is finished. However, in case
413
+ # the controller dies, we may end up with a
414
+ # resource leak.
415
+ # Ideally, we should autodown to be safe,
416
+ # but it's fine to disable it for now, as
417
+ # Nebius doesn't support autodown yet.
418
+ # TODO(kevin): set down=True once Nebius
419
+ # supports autodown.
420
+ # idle_minutes_to_autostop=(
421
+ # _AUTODOWN_MINUTES),
422
+ # down=True,
423
+ _is_launched_by_jobs_controller=True,
424
+ )
425
+ logger.debug('sdk.launch request ID: '
426
+ f'{request_id}')
427
+ await context_utils.to_thread(
428
+ sdk.stream_and_get,
429
+ request_id,
430
+ )
431
+ except asyncio.CancelledError:
432
+ if request_id:
433
+ req = await context_utils.to_thread(
434
+ sdk.api_cancel, request_id)
435
+ logger.debug('sdk.api_cancel request '
436
+ f'ID: {req}')
437
+ try:
438
+ await context_utils.to_thread(
439
+ sdk.get, req)
440
+ except Exception as e: # pylint: disable=broad-except
441
+ # We must still re-raise the CancelledError.
442
+ logger.error(
443
+ f'Failed to cancel the job: {e}')
444
+ raise
445
+ logger.info('Managed job cluster launched.')
446
+ else:
447
+ self.cluster_name = await (context_utils.to_thread(
448
+ serve_utils.get_next_cluster_name, self.pool,
449
+ self.job_id))
450
+ if self.cluster_name is None:
451
+ raise exceptions.NoClusterLaunchedError(
452
+ 'No cluster name found in the pool.')
453
+ request_id = None
454
+ try:
455
+ request_id = await context_utils.to_thread(
456
+ sdk.exec,
457
+ self.dag,
458
+ cluster_name=self.cluster_name,
459
+ )
460
+ logger.debug('sdk.exec request ID: '
461
+ f'{request_id}')
462
+ job_id_on_pool_cluster, _ = (
463
+ await context_utils.to_thread(
464
+ sdk.get, request_id))
465
+ except asyncio.CancelledError:
466
+ if request_id:
467
+ req = await context_utils.to_thread(
468
+ sdk.api_cancel, request_id)
469
+ logger.debug('sdk.api_cancel request '
470
+ f'ID: {req}')
471
+ try:
472
+ await context_utils.to_thread(
473
+ sdk.get, req)
474
+ except Exception as e: # pylint: disable=broad-except
475
+ # We must still re-raise the CancelledError.
476
+ logger.error(
477
+ f'Failed to cancel the job: {e}')
478
+ raise
479
+ assert job_id_on_pool_cluster is not None, (
480
+ self.cluster_name, self.job_id)
481
+ self.job_id_on_pool_cluster = job_id_on_pool_cluster
482
+ await state.set_job_id_on_pool_cluster_async(
483
+ self.job_id, job_id_on_pool_cluster)
484
+ logger.info('Managed job cluster launched.')
485
+ except (exceptions.InvalidClusterNameError,
486
+ exceptions.NoCloudAccessError,
487
+ exceptions.ResourcesMismatchError,
488
+ exceptions.StorageSpecError,
489
+ exceptions.StorageError) as e:
490
+ logger.error('Failure happened before provisioning. '
491
+ f'{common_utils.format_exception(e)}')
323
492
  if raise_on_failure:
324
- raise exceptions.ProvisionPrechecksError(reasons)
325
- return None
326
- logger.info('Failed to launch a cluster with error: '
327
- f'{common_utils.format_exception(e)})')
328
- except Exception as e: # pylint: disable=broad-except
329
- # If the launch fails, it will be recovered by the following
330
- # code.
331
- logger.info('Failed to launch a cluster with error: '
332
- f'{common_utils.format_exception(e)})')
333
- with ux_utils.enable_traceback():
334
- logger.info(f' Traceback: {traceback.format_exc()}')
335
- else: # No exception, the launch succeeds.
336
- # At this point, a sky.launch() has succeeded. Cluster may
337
- # be UP (no preemption since) or DOWN (newly preempted).
338
- job_submitted_at = self._wait_until_job_starts_on_cluster()
339
- if job_submitted_at is not None:
340
- return job_submitted_at
341
- # The job fails to start on the cluster, retry the launch.
342
- # TODO(zhwu): log the unexpected error to usage collection
343
- # for future debugging.
344
- logger.info(
345
- 'Failed to successfully submit the job to the '
346
- 'launched cluster, due to unexpected submission errors '
347
- 'or the cluster being preempted during job submission.')
348
-
349
- # If we get here, the launch did not succeed. Tear down the
350
- # cluster and retry.
351
- managed_job_utils.terminate_cluster(self.cluster_name)
352
- if max_retry is not None and retry_cnt >= max_retry:
353
- # Retry forever if max_retry is None.
354
- if raise_on_failure:
355
- with ux_utils.print_exception_no_traceback():
356
- raise exceptions.ManagedJobReachedMaxRetriesError(
357
- 'Resources unavailable: failed to launch '
358
- f'clusters after {max_retry} retries.')
359
- else:
493
+ raise exceptions.ProvisionPrechecksError(
494
+ reasons=[e])
360
495
  return None
361
- # Exit the scheduled_launch context so that the scheulde state is
362
- # ALIVE during the backoff. This allows other jobs to launch.
363
- gap_seconds = backoff.current_backoff()
364
- logger.info('Retrying to launch the cluster in '
365
- f'{gap_seconds:.1f} seconds.')
366
- time.sleep(gap_seconds)
496
+ except exceptions.ResourcesUnavailableError as e:
497
+ # This is raised when the launch fails due to prechecks
498
+ # or after failing over through all the candidates.
499
+ # Please refer to the docstring of `sky.launch` for more
500
+ # details of how the exception will be structured.
501
+ if not any(
502
+ isinstance(err,
503
+ exceptions.ResourcesUnavailableError)
504
+ for err in e.failover_history):
505
+ # _launch() (this function) should fail/exit
506
+ # directly, if none of the failover reasons were
507
+ # because of resource unavailability or no failover
508
+ # was attempted (the optimizer cannot find feasible
509
+ # resources for requested resources), i.e.,
510
+ # e.failover_history is empty. Failing directly
511
+ # avoids the infinite loop of retrying the launch
512
+ # when, e.g., an invalid cluster name is used and
513
+ # --retry-until-up is specified.
514
+ reasons = (e.failover_history
515
+ if e.failover_history else [e])
516
+ reasons_str = '; '.join(
517
+ common_utils.format_exception(err)
518
+ for err in reasons)
519
+ logger.error(
520
+ 'Failure happened before provisioning. '
521
+ f'Failover reasons: {reasons_str}')
522
+ if raise_on_failure:
523
+ raise exceptions.ProvisionPrechecksError(
524
+ reasons)
525
+ return None
526
+ logger.info('Failed to launch a cluster with error: '
527
+ f'{common_utils.format_exception(e)})')
528
+ except Exception as e: # pylint: disable=broad-except
529
+ # If the launch fails, it will be recovered by the
530
+ # following code.
531
+ logger.info('Failed to launch a cluster with error: '
532
+ f'{common_utils.format_exception(e)})')
533
+ with ux_utils.enable_traceback():
534
+ logger.info(
535
+ f' Traceback: {traceback.format_exc()}')
536
+ else: # No exception, the launch succeeds.
537
+ # At this point, a sky.launch() has succeeded. Cluster
538
+ # may be UP (no preemption since) or DOWN (newly
539
+ # preempted).
540
+ job_submitted_at = await (
541
+ self._wait_until_job_starts_on_cluster())
542
+ if job_submitted_at is not None:
543
+ return job_submitted_at
544
+ # The job fails to start on the cluster, retry the
545
+ # launch.
546
+ # TODO(zhwu): log the unexpected error to usage
547
+ # collection for future debugging.
548
+ logger.info(
549
+ 'Failed to successfully submit the job to the '
550
+ 'launched cluster, due to unexpected submission '
551
+ 'errors or the cluster being preempted during '
552
+ 'job submission.')
553
+
554
+ # If we get here, the launch did not succeed. Tear down the
555
+ # cluster and retry.
556
+ await context_utils.to_thread(self._cleanup_cluster)
557
+ if max_retry is not None and retry_cnt >= max_retry:
558
+ # Retry forever if max_retry is None.
559
+ if raise_on_failure:
560
+ with ux_utils.print_exception_no_traceback():
561
+ raise (
562
+ exceptions.ManagedJobReachedMaxRetriesError(
563
+ 'Resources unavailable: failed to '
564
+ f'launch clusters after {max_retry} '
565
+ 'retries.'))
566
+ else:
567
+ return None
568
+
569
+ # Raise NoClusterLaunchedError to indicate that the job is
570
+ # in retry backoff. This will trigger special handling in
571
+ # scheduler.scheduled_launch().
572
+ # We will exit the scheduled_launch context so that the
573
+ # schedule state is ALIVE_BACKOFF during the backoff. This
574
+ # allows other jobs to launch.
575
+ raise exceptions.NoClusterLaunchedError()
576
+
577
+ except exceptions.NoClusterLaunchedError:
578
+ # Update the status to PENDING during backoff.
579
+ await state.set_backoff_pending_async(self.job_id, self.task_id)
580
+ # Calculate the backoff time and sleep.
581
+ gap_seconds = (backoff.current_backoff()
582
+ if self.pool is None else 1)
583
+ logger.info('Retrying to launch the cluster in '
584
+ f'{gap_seconds:.1f} seconds.')
585
+ await asyncio.sleep(gap_seconds)
586
+ continue
587
+ else:
588
+ # The inner loop should either return or throw
589
+ # NoClusterLaunchedError.
590
+ assert False, 'Unreachable'
367
591
 
368
592
  def should_restart_on_failure(self) -> bool:
369
593
  """Increments counter & checks if job should be restarted on a failure.
@@ -384,24 +608,38 @@ class FailoverStrategyExecutor(StrategyExecutor):
384
608
 
385
609
  _MAX_RETRY_CNT = 240 # Retry for 4 hours.
386
610
 
387
- def __init__(self, cluster_name: str, backend: 'backends.Backend',
388
- task: 'task_lib.Task', max_restarts_on_errors: int,
389
- job_id: int) -> None:
611
+ def __init__(
612
+ self,
613
+ cluster_name: Optional[str],
614
+ backend: 'backends.Backend',
615
+ task: 'task_lib.Task',
616
+ max_restarts_on_errors: int,
617
+ job_id: int,
618
+ task_id: int,
619
+ pool: Optional[str],
620
+ starting: Set[int],
621
+ starting_lock: asyncio.Lock,
622
+ starting_signal: asyncio.Condition,
623
+ ) -> None:
390
624
  super().__init__(cluster_name, backend, task, max_restarts_on_errors,
391
- job_id)
625
+ job_id, task_id, pool, starting, starting_lock,
626
+ starting_signal)
392
627
  # Note down the cloud/region of the launched cluster, so that we can
393
628
  # first retry in the same cloud/region. (Inside recover() we may not
394
629
  # rely on cluster handle, as it can be None if the cluster is
395
630
  # preempted.)
396
631
  self._launched_resources: Optional['resources.Resources'] = None
397
632
 
398
- def _launch(self,
399
- max_retry: Optional[int] = 3,
400
- raise_on_failure: bool = True) -> Optional[float]:
401
- job_submitted_at = super()._launch(max_retry, raise_on_failure)
402
- if job_submitted_at is not None:
633
+ async def _launch(self,
634
+ max_retry: Optional[int] = 3,
635
+ raise_on_failure: bool = True,
636
+ recovery: bool = False) -> Optional[float]:
637
+ job_submitted_at = await super()._launch(max_retry, raise_on_failure,
638
+ recovery)
639
+ if job_submitted_at is not None and self.cluster_name is not None:
403
640
  # Only record the cloud/region if the launch is successful.
404
- handle = global_user_state.get_handle_from_cluster_name(
641
+ handle = await context_utils.to_thread(
642
+ global_user_state.get_handle_from_cluster_name,
405
643
  self.cluster_name)
406
644
  assert isinstance(handle, backends.CloudVmRayResourceHandle), (
407
645
  'Cluster should be launched.', handle)
@@ -411,7 +649,7 @@ class FailoverStrategyExecutor(StrategyExecutor):
411
649
  self._launched_resources = None
412
650
  return job_submitted_at
413
651
 
414
- def recover(self) -> float:
652
+ async def recover(self) -> float:
415
653
  # 1. Cancel the jobs and launch the cluster with the STOPPED status,
416
654
  # so that it will try on the current region first until timeout.
417
655
  # 2. Tear down the cluster, if the step 1 failed to launch the cluster.
@@ -419,7 +657,7 @@ class FailoverStrategyExecutor(StrategyExecutor):
419
657
  # original user specification.
420
658
 
421
659
  # Step 1
422
- self._try_cancel_all_jobs()
660
+ await self._try_cancel_jobs()
423
661
 
424
662
  while True:
425
663
  # Add region constraint to the task, to retry on the same region
@@ -433,7 +671,8 @@ class FailoverStrategyExecutor(StrategyExecutor):
433
671
  cloud=launched_cloud, region=launched_region, zone=None)
434
672
  task.set_resources({new_resources})
435
673
  # Not using self.launch to avoid the retry until up logic.
436
- job_submitted_at = self._launch(raise_on_failure=False)
674
+ job_submitted_at = await self._launch(raise_on_failure=False,
675
+ recovery=True)
437
676
  # Restore the original dag, i.e. reset the region constraint.
438
677
  task.set_resources(original_resources)
439
678
  if job_submitted_at is not None:
@@ -442,20 +681,21 @@ class FailoverStrategyExecutor(StrategyExecutor):
442
681
  # Step 2
443
682
  logger.debug('Terminating unhealthy cluster and reset cloud '
444
683
  'region.')
445
- managed_job_utils.terminate_cluster(self.cluster_name)
684
+ await context_utils.to_thread(self._cleanup_cluster)
446
685
 
447
686
  # Step 3
448
687
  logger.debug('Relaunch the cluster without constraining to prior '
449
688
  'cloud/region.')
450
689
  # Not using self.launch to avoid the retry until up logic.
451
- job_submitted_at = self._launch(max_retry=self._MAX_RETRY_CNT,
452
- raise_on_failure=False)
690
+ job_submitted_at = await self._launch(max_retry=self._MAX_RETRY_CNT,
691
+ raise_on_failure=False,
692
+ recovery=True)
453
693
  if job_submitted_at is None:
454
694
  # Failed to launch the cluster.
455
695
  gap_seconds = self.RETRY_INIT_GAP_SECONDS
456
696
  logger.info('Retrying to recover the cluster in '
457
697
  f'{gap_seconds:.1f} seconds.')
458
- time.sleep(gap_seconds)
698
+ await asyncio.sleep(gap_seconds)
459
699
  continue
460
700
 
461
701
  return job_submitted_at
@@ -487,7 +727,7 @@ class EagerFailoverStrategyExecutor(FailoverStrategyExecutor):
487
727
  -> R1Z1 (success)
488
728
  """
489
729
 
490
- def recover(self) -> float:
730
+ async def recover(self) -> float:
491
731
  # 1. Terminate the current cluster
492
732
  # 2. Launch again by explicitly blocking the previously launched region
493
733
  # (this will failover through the entire search space except the
@@ -500,7 +740,7 @@ class EagerFailoverStrategyExecutor(FailoverStrategyExecutor):
500
740
 
501
741
  # Step 1
502
742
  logger.debug('Terminating unhealthy cluster and reset cloud region.')
503
- managed_job_utils.terminate_cluster(self.cluster_name)
743
+ await context_utils.to_thread(self._cleanup_cluster)
504
744
 
505
745
  # Step 2
506
746
  logger.debug('Relaunch the cluster skipping the previously launched '
@@ -521,7 +761,8 @@ class EagerFailoverStrategyExecutor(FailoverStrategyExecutor):
521
761
  region=launched_region)
522
762
  }
523
763
  # Not using self.launch to avoid the retry until up logic.
524
- job_submitted_at = self._launch(raise_on_failure=False)
764
+ job_submitted_at = await self._launch(raise_on_failure=False,
765
+ recovery=True)
525
766
  task.blocked_resources = None
526
767
  if job_submitted_at is not None:
527
768
  return job_submitted_at
@@ -531,14 +772,23 @@ class EagerFailoverStrategyExecutor(FailoverStrategyExecutor):
531
772
  logger.debug('Relaunch the cluster without constraining to prior '
532
773
  'cloud/region.')
533
774
  # Not using self.launch to avoid the retry until up logic.
534
- job_submitted_at = self._launch(max_retry=self._MAX_RETRY_CNT,
535
- raise_on_failure=False)
775
+ job_submitted_at = await self._launch(max_retry=self._MAX_RETRY_CNT,
776
+ raise_on_failure=False,
777
+ recovery=True)
536
778
  if job_submitted_at is None:
537
779
  # Failed to launch the cluster.
538
780
  gap_seconds = self.RETRY_INIT_GAP_SECONDS
539
781
  logger.info('Retrying to recover the cluster in '
540
782
  f'{gap_seconds:.1f} seconds.')
541
- time.sleep(gap_seconds)
783
+ await asyncio.sleep(gap_seconds)
542
784
  continue
543
785
 
544
786
  return job_submitted_at
787
+
788
+
789
+ def _get_logger_file(file_logger: logging.Logger) -> Optional[str]:
790
+ """Gets the file path that the logger writes to."""
791
+ for handler in file_logger.handlers:
792
+ if isinstance(handler, logging.FileHandler):
793
+ return handler.baseFilename
794
+ return None