skypilot-nightly 1.0.0.dev20250905__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (397)
  1. sky/__init__.py +10 -2
  2. sky/adaptors/aws.py +81 -16
  3. sky/adaptors/common.py +25 -2
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/gcp.py +11 -0
  7. sky/adaptors/ibm.py +5 -2
  8. sky/adaptors/kubernetes.py +64 -0
  9. sky/adaptors/nebius.py +3 -1
  10. sky/adaptors/primeintellect.py +1 -0
  11. sky/adaptors/seeweb.py +183 -0
  12. sky/adaptors/shadeform.py +89 -0
  13. sky/admin_policy.py +20 -0
  14. sky/authentication.py +157 -263
  15. sky/backends/__init__.py +3 -2
  16. sky/backends/backend.py +11 -3
  17. sky/backends/backend_utils.py +588 -184
  18. sky/backends/cloud_vm_ray_backend.py +1088 -904
  19. sky/backends/local_docker_backend.py +9 -5
  20. sky/backends/task_codegen.py +633 -0
  21. sky/backends/wheel_utils.py +18 -0
  22. sky/catalog/__init__.py +8 -0
  23. sky/catalog/aws_catalog.py +4 -0
  24. sky/catalog/common.py +19 -1
  25. sky/catalog/data_fetchers/fetch_aws.py +102 -80
  26. sky/catalog/data_fetchers/fetch_gcp.py +30 -3
  27. sky/catalog/data_fetchers/fetch_nebius.py +9 -6
  28. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  29. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  30. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  31. sky/catalog/kubernetes_catalog.py +24 -28
  32. sky/catalog/primeintellect_catalog.py +95 -0
  33. sky/catalog/runpod_catalog.py +5 -1
  34. sky/catalog/seeweb_catalog.py +184 -0
  35. sky/catalog/shadeform_catalog.py +165 -0
  36. sky/check.py +73 -43
  37. sky/client/cli/command.py +675 -412
  38. sky/client/cli/flags.py +4 -2
  39. sky/{volumes/utils.py → client/cli/table_utils.py} +111 -13
  40. sky/client/cli/utils.py +79 -0
  41. sky/client/common.py +12 -2
  42. sky/client/sdk.py +132 -63
  43. sky/client/sdk_async.py +34 -33
  44. sky/cloud_stores.py +82 -3
  45. sky/clouds/__init__.py +6 -0
  46. sky/clouds/aws.py +337 -129
  47. sky/clouds/azure.py +24 -18
  48. sky/clouds/cloud.py +40 -13
  49. sky/clouds/cudo.py +16 -13
  50. sky/clouds/do.py +9 -7
  51. sky/clouds/fluidstack.py +12 -5
  52. sky/clouds/gcp.py +14 -7
  53. sky/clouds/hyperbolic.py +12 -5
  54. sky/clouds/ibm.py +12 -5
  55. sky/clouds/kubernetes.py +80 -45
  56. sky/clouds/lambda_cloud.py +12 -5
  57. sky/clouds/nebius.py +23 -9
  58. sky/clouds/oci.py +19 -12
  59. sky/clouds/paperspace.py +4 -1
  60. sky/clouds/primeintellect.py +317 -0
  61. sky/clouds/runpod.py +85 -24
  62. sky/clouds/scp.py +12 -8
  63. sky/clouds/seeweb.py +477 -0
  64. sky/clouds/shadeform.py +400 -0
  65. sky/clouds/ssh.py +4 -2
  66. sky/clouds/utils/scp_utils.py +61 -50
  67. sky/clouds/vast.py +33 -27
  68. sky/clouds/vsphere.py +14 -16
  69. sky/core.py +174 -165
  70. sky/dashboard/out/404.html +1 -1
  71. sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
  72. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  73. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
  74. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  75. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  76. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
  77. sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
  78. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
  79. sky/dashboard/out/_next/static/chunks/{6601-06114c982db410b6.js → 3800-7b45f9fbb6308557.js} +1 -1
  80. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
  81. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  82. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  83. sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
  84. sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
  85. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  86. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
  87. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  88. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
  89. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  90. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  91. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
  92. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  93. sky/dashboard/out/_next/static/chunks/pages/{_app-ce361c6959bc2001.js → _app-bde01e4a2beec258.js} +1 -1
  94. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
  95. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
  96. sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
  97. sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-6563820e094f68ca.js → [context]-c0b5935149902e6f.js} +1 -1
  98. sky/dashboard/out/_next/static/chunks/pages/{infra-aabba60d57826e0f.js → infra-aed0ea19df7cf961.js} +1 -1
  99. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
  100. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
  101. sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
  102. sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
  103. sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
  104. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-af76bb06dbb3954f.js → [name]-84a40f8c7c627fe4.js} +1 -1
  105. sky/dashboard/out/_next/static/chunks/pages/{workspaces-7598c33a746cdc91.js → workspaces-531b2f8c4bf89f82.js} +1 -1
  106. sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
  107. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  108. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  109. sky/dashboard/out/clusters/[cluster].html +1 -1
  110. sky/dashboard/out/clusters.html +1 -1
  111. sky/dashboard/out/config.html +1 -1
  112. sky/dashboard/out/index.html +1 -1
  113. sky/dashboard/out/infra/[context].html +1 -1
  114. sky/dashboard/out/infra.html +1 -1
  115. sky/dashboard/out/jobs/[job].html +1 -1
  116. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  117. sky/dashboard/out/jobs.html +1 -1
  118. sky/dashboard/out/users.html +1 -1
  119. sky/dashboard/out/volumes.html +1 -1
  120. sky/dashboard/out/workspace/new.html +1 -1
  121. sky/dashboard/out/workspaces/[name].html +1 -1
  122. sky/dashboard/out/workspaces.html +1 -1
  123. sky/data/data_utils.py +92 -1
  124. sky/data/mounting_utils.py +162 -29
  125. sky/data/storage.py +200 -19
  126. sky/data/storage_utils.py +10 -45
  127. sky/exceptions.py +18 -7
  128. sky/execution.py +74 -31
  129. sky/global_user_state.py +605 -191
  130. sky/jobs/__init__.py +2 -0
  131. sky/jobs/client/sdk.py +101 -4
  132. sky/jobs/client/sdk_async.py +31 -5
  133. sky/jobs/constants.py +15 -8
  134. sky/jobs/controller.py +726 -284
  135. sky/jobs/file_content_utils.py +128 -0
  136. sky/jobs/log_gc.py +193 -0
  137. sky/jobs/recovery_strategy.py +250 -100
  138. sky/jobs/scheduler.py +271 -173
  139. sky/jobs/server/core.py +367 -114
  140. sky/jobs/server/server.py +81 -35
  141. sky/jobs/server/utils.py +89 -35
  142. sky/jobs/state.py +1498 -620
  143. sky/jobs/utils.py +771 -306
  144. sky/logs/agent.py +40 -5
  145. sky/logs/aws.py +9 -19
  146. sky/metrics/utils.py +282 -39
  147. sky/optimizer.py +1 -1
  148. sky/provision/__init__.py +37 -1
  149. sky/provision/aws/config.py +34 -13
  150. sky/provision/aws/instance.py +5 -2
  151. sky/provision/azure/instance.py +5 -3
  152. sky/provision/common.py +2 -0
  153. sky/provision/cudo/instance.py +4 -3
  154. sky/provision/do/instance.py +4 -3
  155. sky/provision/docker_utils.py +97 -26
  156. sky/provision/fluidstack/instance.py +6 -5
  157. sky/provision/gcp/config.py +6 -1
  158. sky/provision/gcp/instance.py +4 -2
  159. sky/provision/hyperbolic/instance.py +4 -2
  160. sky/provision/instance_setup.py +66 -20
  161. sky/provision/kubernetes/__init__.py +2 -0
  162. sky/provision/kubernetes/config.py +7 -44
  163. sky/provision/kubernetes/constants.py +0 -1
  164. sky/provision/kubernetes/instance.py +609 -213
  165. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  166. sky/provision/kubernetes/network.py +12 -8
  167. sky/provision/kubernetes/network_utils.py +8 -25
  168. sky/provision/kubernetes/utils.py +382 -418
  169. sky/provision/kubernetes/volume.py +150 -18
  170. sky/provision/lambda_cloud/instance.py +16 -13
  171. sky/provision/nebius/instance.py +6 -2
  172. sky/provision/nebius/utils.py +103 -86
  173. sky/provision/oci/instance.py +4 -2
  174. sky/provision/paperspace/instance.py +4 -3
  175. sky/provision/primeintellect/__init__.py +10 -0
  176. sky/provision/primeintellect/config.py +11 -0
  177. sky/provision/primeintellect/instance.py +454 -0
  178. sky/provision/primeintellect/utils.py +398 -0
  179. sky/provision/provisioner.py +30 -9
  180. sky/provision/runpod/__init__.py +2 -0
  181. sky/provision/runpod/instance.py +4 -3
  182. sky/provision/runpod/volume.py +69 -13
  183. sky/provision/scp/instance.py +307 -130
  184. sky/provision/seeweb/__init__.py +11 -0
  185. sky/provision/seeweb/config.py +13 -0
  186. sky/provision/seeweb/instance.py +812 -0
  187. sky/provision/shadeform/__init__.py +11 -0
  188. sky/provision/shadeform/config.py +12 -0
  189. sky/provision/shadeform/instance.py +351 -0
  190. sky/provision/shadeform/shadeform_utils.py +83 -0
  191. sky/provision/vast/instance.py +5 -3
  192. sky/provision/volume.py +164 -0
  193. sky/provision/vsphere/common/ssl_helper.py +1 -1
  194. sky/provision/vsphere/common/vapiconnect.py +2 -1
  195. sky/provision/vsphere/common/vim_utils.py +3 -2
  196. sky/provision/vsphere/instance.py +8 -6
  197. sky/provision/vsphere/vsphere_utils.py +8 -1
  198. sky/resources.py +11 -3
  199. sky/schemas/api/responses.py +107 -6
  200. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  201. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  202. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  203. sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
  204. sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
  205. sky/schemas/db/serve_state/002_yaml_content.py +34 -0
  206. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  207. sky/schemas/db/spot_jobs/002_cluster_pool.py +3 -3
  208. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  209. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  210. sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
  211. sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
  212. sky/schemas/generated/jobsv1_pb2.py +86 -0
  213. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  214. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  215. sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
  216. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  217. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  218. sky/schemas/generated/servev1_pb2.py +58 -0
  219. sky/schemas/generated/servev1_pb2.pyi +115 -0
  220. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  221. sky/serve/autoscalers.py +2 -0
  222. sky/serve/client/impl.py +55 -21
  223. sky/serve/constants.py +4 -3
  224. sky/serve/controller.py +17 -11
  225. sky/serve/load_balancing_policies.py +1 -1
  226. sky/serve/replica_managers.py +219 -142
  227. sky/serve/serve_rpc_utils.py +179 -0
  228. sky/serve/serve_state.py +63 -54
  229. sky/serve/serve_utils.py +145 -109
  230. sky/serve/server/core.py +46 -25
  231. sky/serve/server/impl.py +311 -162
  232. sky/serve/server/server.py +21 -19
  233. sky/serve/service.py +84 -68
  234. sky/serve/service_spec.py +45 -7
  235. sky/server/auth/loopback.py +38 -0
  236. sky/server/auth/oauth2_proxy.py +12 -7
  237. sky/server/common.py +47 -24
  238. sky/server/config.py +62 -28
  239. sky/server/constants.py +9 -1
  240. sky/server/daemons.py +109 -38
  241. sky/server/metrics.py +76 -96
  242. sky/server/middleware_utils.py +166 -0
  243. sky/server/requests/executor.py +381 -145
  244. sky/server/requests/payloads.py +71 -18
  245. sky/server/requests/preconditions.py +15 -13
  246. sky/server/requests/request_names.py +121 -0
  247. sky/server/requests/requests.py +507 -157
  248. sky/server/requests/serializers/decoders.py +48 -17
  249. sky/server/requests/serializers/encoders.py +85 -20
  250. sky/server/requests/threads.py +117 -0
  251. sky/server/rest.py +116 -24
  252. sky/server/server.py +420 -172
  253. sky/server/stream_utils.py +219 -45
  254. sky/server/uvicorn.py +30 -19
  255. sky/setup_files/MANIFEST.in +6 -1
  256. sky/setup_files/alembic.ini +8 -0
  257. sky/setup_files/dependencies.py +62 -19
  258. sky/setup_files/setup.py +44 -44
  259. sky/sky_logging.py +13 -5
  260. sky/skylet/attempt_skylet.py +106 -24
  261. sky/skylet/configs.py +3 -1
  262. sky/skylet/constants.py +111 -26
  263. sky/skylet/events.py +64 -10
  264. sky/skylet/job_lib.py +141 -104
  265. sky/skylet/log_lib.py +233 -5
  266. sky/skylet/log_lib.pyi +40 -2
  267. sky/skylet/providers/ibm/node_provider.py +12 -8
  268. sky/skylet/providers/ibm/vpc_provider.py +13 -12
  269. sky/skylet/runtime_utils.py +21 -0
  270. sky/skylet/services.py +524 -0
  271. sky/skylet/skylet.py +22 -1
  272. sky/skylet/subprocess_daemon.py +104 -29
  273. sky/skypilot_config.py +99 -79
  274. sky/ssh_node_pools/server.py +9 -8
  275. sky/task.py +221 -104
  276. sky/templates/aws-ray.yml.j2 +1 -0
  277. sky/templates/azure-ray.yml.j2 +1 -0
  278. sky/templates/cudo-ray.yml.j2 +1 -0
  279. sky/templates/do-ray.yml.j2 +1 -0
  280. sky/templates/fluidstack-ray.yml.j2 +1 -0
  281. sky/templates/gcp-ray.yml.j2 +1 -0
  282. sky/templates/hyperbolic-ray.yml.j2 +1 -0
  283. sky/templates/ibm-ray.yml.j2 +2 -1
  284. sky/templates/jobs-controller.yaml.j2 +3 -0
  285. sky/templates/kubernetes-ray.yml.j2 +196 -55
  286. sky/templates/lambda-ray.yml.j2 +1 -0
  287. sky/templates/nebius-ray.yml.j2 +3 -0
  288. sky/templates/oci-ray.yml.j2 +1 -0
  289. sky/templates/paperspace-ray.yml.j2 +1 -0
  290. sky/templates/primeintellect-ray.yml.j2 +72 -0
  291. sky/templates/runpod-ray.yml.j2 +1 -0
  292. sky/templates/scp-ray.yml.j2 +1 -0
  293. sky/templates/seeweb-ray.yml.j2 +171 -0
  294. sky/templates/shadeform-ray.yml.j2 +73 -0
  295. sky/templates/vast-ray.yml.j2 +1 -0
  296. sky/templates/vsphere-ray.yml.j2 +1 -0
  297. sky/templates/websocket_proxy.py +188 -43
  298. sky/usage/usage_lib.py +16 -4
  299. sky/users/permission.py +60 -43
  300. sky/utils/accelerator_registry.py +6 -3
  301. sky/utils/admin_policy_utils.py +18 -5
  302. sky/utils/annotations.py +22 -0
  303. sky/utils/asyncio_utils.py +78 -0
  304. sky/utils/atomic.py +1 -1
  305. sky/utils/auth_utils.py +153 -0
  306. sky/utils/cli_utils/status_utils.py +12 -7
  307. sky/utils/cluster_utils.py +28 -6
  308. sky/utils/command_runner.py +88 -27
  309. sky/utils/command_runner.pyi +36 -3
  310. sky/utils/common.py +3 -1
  311. sky/utils/common_utils.py +37 -4
  312. sky/utils/config_utils.py +1 -14
  313. sky/utils/context.py +127 -40
  314. sky/utils/context_utils.py +73 -18
  315. sky/utils/controller_utils.py +229 -70
  316. sky/utils/db/db_utils.py +95 -18
  317. sky/utils/db/kv_cache.py +149 -0
  318. sky/utils/db/migration_utils.py +24 -7
  319. sky/utils/env_options.py +4 -0
  320. sky/utils/git.py +559 -1
  321. sky/utils/kubernetes/create_cluster.sh +15 -30
  322. sky/utils/kubernetes/delete_cluster.sh +10 -7
  323. sky/utils/kubernetes/{deploy_remote_cluster.py → deploy_ssh_node_pools.py} +258 -380
  324. sky/utils/kubernetes/generate_kind_config.py +6 -66
  325. sky/utils/kubernetes/gpu_labeler.py +13 -3
  326. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
  327. sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
  328. sky/utils/kubernetes/kubernetes_deploy_utils.py +213 -194
  329. sky/utils/kubernetes/rsync_helper.sh +11 -3
  330. sky/utils/kubernetes_enums.py +7 -15
  331. sky/utils/lock_events.py +4 -4
  332. sky/utils/locks.py +128 -31
  333. sky/utils/log_utils.py +0 -319
  334. sky/utils/resource_checker.py +13 -10
  335. sky/utils/resources_utils.py +53 -29
  336. sky/utils/rich_utils.py +8 -4
  337. sky/utils/schemas.py +107 -52
  338. sky/utils/subprocess_utils.py +17 -4
  339. sky/utils/thread_utils.py +91 -0
  340. sky/utils/timeline.py +2 -1
  341. sky/utils/ux_utils.py +35 -1
  342. sky/utils/volume.py +88 -4
  343. sky/utils/yaml_utils.py +9 -0
  344. sky/volumes/client/sdk.py +48 -10
  345. sky/volumes/server/core.py +59 -22
  346. sky/volumes/server/server.py +46 -17
  347. sky/volumes/volume.py +54 -42
  348. sky/workspaces/core.py +57 -21
  349. sky/workspaces/server.py +13 -12
  350. sky_templates/README.md +3 -0
  351. sky_templates/__init__.py +3 -0
  352. sky_templates/ray/__init__.py +0 -0
  353. sky_templates/ray/start_cluster +183 -0
  354. sky_templates/ray/stop_cluster +75 -0
  355. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/METADATA +331 -65
  356. skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
  357. skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
  358. sky/client/cli/git.py +0 -549
  359. sky/dashboard/out/_next/static/chunks/1121-408ed10b2f9fce17.js +0 -1
  360. sky/dashboard/out/_next/static/chunks/1141-943efc7aff0f0c06.js +0 -1
  361. sky/dashboard/out/_next/static/chunks/1836-37fede578e2da5f8.js +0 -40
  362. sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +0 -1
  363. sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
  364. sky/dashboard/out/_next/static/chunks/3785.4872a2f3aa489880.js +0 -1
  365. sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +0 -21
  366. sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +0 -10
  367. sky/dashboard/out/_next/static/chunks/4725.10f7a9a5d3ea8208.js +0 -1
  368. sky/dashboard/out/_next/static/chunks/5339.3fda4a4010ff4e06.js +0 -51
  369. sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
  370. sky/dashboard/out/_next/static/chunks/649.b9d7f7d10c1b8c53.js +0 -45
  371. sky/dashboard/out/_next/static/chunks/6856-dca7962af4814e1b.js +0 -1
  372. sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
  373. sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +0 -6
  374. sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +0 -18
  375. sky/dashboard/out/_next/static/chunks/7669.1f5d9a402bf5cc42.js +0 -36
  376. sky/dashboard/out/_next/static/chunks/8969-0be3036bf86f8256.js +0 -1
  377. sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
  378. sky/dashboard/out/_next/static/chunks/9037-fa1737818d0a0969.js +0 -6
  379. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1cbba24bd1bd35f8.js +0 -16
  380. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-0b4b35dc1dfe046c.js +0 -16
  381. sky/dashboard/out/_next/static/chunks/pages/clusters-469814d711d63b1b.js +0 -1
  382. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-dd64309c3fe67ed2.js +0 -11
  383. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-07349868f7905d37.js +0 -16
  384. sky/dashboard/out/_next/static/chunks/pages/jobs-1f70d9faa564804f.js +0 -1
  385. sky/dashboard/out/_next/static/chunks/pages/users-018bf31cda52e11b.js +0 -1
  386. sky/dashboard/out/_next/static/chunks/pages/volumes-739726d6b823f532.js +0 -1
  387. sky/dashboard/out/_next/static/chunks/webpack-4fe903277b57b523.js +0 -1
  388. sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
  389. sky/dashboard/out/_next/static/mS-4qZPSkRuA1u-g2wQhg/_buildManifest.js +0 -1
  390. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  391. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  392. skypilot_nightly-1.0.0.dev20250905.dist-info/RECORD +0 -547
  393. skypilot_nightly-1.0.0.dev20250905.dist-info/top_level.txt +0 -1
  394. /sky/dashboard/out/_next/static/{mS-4qZPSkRuA1u-g2wQhg → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
  395. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +0 -0
  396. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
  397. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
sky/jobs/recovery_strategy.py

@@ -5,25 +5,31 @@ In the YAML file, the user can specify the strategy to use for managed jobs.
 resources:
   job_recovery: EAGER_NEXT_REGION
 """
-import time
+import asyncio
+import logging
+import os
 import traceback
 import typing
-from typing import Optional
+from typing import Optional, Set
 
 from sky import backends
 from sky import dag as dag_lib
 from sky import exceptions
-from sky import execution
 from sky import global_user_state
 from sky import sky_logging
+from sky import skypilot_config
 from sky.backends import backend_utils
+from sky.client import sdk
 from sky.jobs import scheduler
 from sky.jobs import state
 from sky.jobs import utils as managed_job_utils
 from sky.serve import serve_utils
+from sky.skylet import constants
 from sky.skylet import job_lib
 from sky.usage import usage_lib
 from sky.utils import common_utils
+from sky.utils import context_utils
+from sky.utils import env_options
 from sky.utils import registry
 from sky.utils import status_lib
 from sky.utils import ux_utils
@@ -41,7 +47,14 @@ MAX_JOB_CHECKING_RETRY = 10
 # Minutes to job cluster autodown. This should be significantly larger than
 # managed_job_utils.JOB_STATUS_CHECK_GAP_SECONDS, to avoid tearing down the
 # cluster before its status can be updated by the job controller.
-_AUTODOWN_MINUTES = 5
+_AUTODOWN_MINUTES = 10
+
+ENV_VARS_TO_CLEAR = [
+    skypilot_config.ENV_VAR_SKYPILOT_CONFIG,
+    constants.USER_ID_ENV_VAR,
+    constants.USER_ENV_VAR,
+    env_options.Options.SHOW_DEBUG_INFO.env_key,
+]
 
 
 class StrategyExecutor:
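
Note on the new `ENV_VARS_TO_CLEAR` constant: the controller later pops these variables before starting the API server and restores them afterwards (see the `_launch` hunk further down). A minimal standalone sketch of that pop-and-restore pattern, with a hypothetical `SENSITIVE_VARS` list standing in for `ENV_VARS_TO_CLEAR`:

    import os

    # Hypothetical stand-in for ENV_VARS_TO_CLEAR; not the real constant.
    SENSITIVE_VARS = ['SKYPILOT_CONFIG', 'SKYPILOT_USER_ID']

    def call_with_clean_env(fn):
        """Run fn() with SENSITIVE_VARS unset, then restore them."""
        saved = {}
        try:
            for name in SENSITIVE_VARS:
                # pop() returns None when the variable is not set.
                saved[name] = os.environ.pop(name, None)
            return fn()
        finally:
            for name, value in saved.items():
                if value is not None:
                    os.environ[name] = value

As in the diff, only variables that were actually set get restored, so a variable that was absent stays absent.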
@@ -49,15 +62,31 @@ class StrategyExecutor:
 
     RETRY_INIT_GAP_SECONDS = 60
 
-    def __init__(self, cluster_name: Optional[str], backend: 'backends.Backend',
-                 task: 'task_lib.Task', max_restarts_on_errors: int,
-                 job_id: int, task_id: int, pool: Optional[str]) -> None:
+    def __init__(
+        self,
+        cluster_name: Optional[str],
+        backend: 'backends.Backend',
+        task: 'task_lib.Task',
+        max_restarts_on_errors: int,
+        job_id: int,
+        task_id: int,
+        pool: Optional[str],
+        starting: Set[int],
+        starting_lock: asyncio.Lock,
+        starting_signal: asyncio.Condition,
+    ) -> None:
         """Initialize the strategy executor.
 
         Args:
            cluster_name: The name of the cluster.
            backend: The backend to use. Only CloudVMRayBackend is supported.
            task: The task to execute.
+           max_restarts_on_errors: Maximum number of restarts on errors.
+           job_id: The ID of the job.
+           task_id: The ID of the task.
+           starting: Set of job IDs that are currently starting.
+           starting_lock: Lock to synchronize starting jobs.
+           starting_signal: Condition to signal when a job can start.
         """
         assert isinstance(backend, backends.CloudVmRayBackend), (
             'Only CloudVMRayBackend is supported.')
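
Note on the new `starting`/`starting_lock`/`starting_signal` parameters: together they let the async scheduler bound how many jobs provision at once. The exact semantics live in `scheduler.scheduled_launch`; a minimal sketch of how an `asyncio.Condition` can gate concurrent launches (with an assumed cap `MAX_STARTING`, for illustration only):

    import asyncio
    from typing import Set

    MAX_STARTING = 4  # assumed cap, not from the diff

    async def launch_gated(job_id: int, starting: Set[int],
                           signal: asyncio.Condition) -> None:
        """Wait for a free launch slot, do the work, then release the slot."""
        async with signal:
            # wait_for releases the condition's lock while waiting.
            await signal.wait_for(lambda: len(starting) < MAX_STARTING)
            starting.add(job_id)
        try:
            await asyncio.sleep(0.1)  # stand-in for real provisioning
        finally:
            async with signal:
                starting.discard(job_id)
                signal.notify_all()  # wake jobs waiting for a slot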
@@ -75,11 +104,23 @@ class StrategyExecutor:
         self.pool = pool
         self.restart_cnt_on_failure = 0
         self.job_id_on_pool_cluster: Optional[int] = None
+        self.starting = starting
+        self.starting_lock = starting_lock
+        self.starting_signal = starting_signal
 
     @classmethod
-    def make(cls, cluster_name: Optional[str], backend: 'backends.Backend',
-             task: 'task_lib.Task', job_id: int, task_id: int,
-             pool: Optional[str]) -> 'StrategyExecutor':
+    def make(
+        cls,
+        cluster_name: Optional[str],
+        backend: 'backends.Backend',
+        task: 'task_lib.Task',
+        job_id: int,
+        task_id: int,
+        pool: Optional[str],
+        starting: Set[int],
+        starting_lock: asyncio.Lock,
+        starting_signal: asyncio.Condition,
+    ) -> 'StrategyExecutor':
         """Create a strategy from a task."""
 
         resource_list = list(task.resources)
@@ -111,9 +152,10 @@ class StrategyExecutor:
         assert job_recovery_strategy is not None, job_recovery_name
         return job_recovery_strategy(cluster_name, backend, task,
                                      max_restarts_on_errors, job_id, task_id,
-                                     pool)
+                                     pool, starting, starting_lock,
+                                     starting_signal)
 
-    def launch(self) -> float:
+    async def launch(self) -> float:
         """Launch the cluster for the first time.
 
         It can fail if resource is not available. Need to check the cluster
@@ -125,11 +167,11 @@ class StrategyExecutor:
         Raises: Please refer to the docstring of self._launch().
         """
 
-        job_submit_at = self._launch(max_retry=None)
+        job_submit_at = await self._launch(max_retry=None)
         assert job_submit_at is not None
         return job_submit_at
 
-    def recover(self) -> float:
+    async def recover(self) -> float:
         """Relaunch the cluster after failure and wait until job starts.
 
         When recover() is called the cluster should be in STOPPED status (i.e.
@@ -139,13 +181,11 @@ class StrategyExecutor:
         """
         raise NotImplementedError
 
-    def _try_cancel_jobs(self):
-        from sky import core  # pylint: disable=import-outside-toplevel
-
+    async def _try_cancel_jobs(self):
         if self.cluster_name is None:
             return
-        handle = global_user_state.get_handle_from_cluster_name(
-            self.cluster_name)
+        handle = await context_utils.to_thread(
+            global_user_state.get_handle_from_cluster_name, self.cluster_name)
         if handle is None or self.pool is not None:
             return
         try:
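
This hunk shows the dominant pattern of the refactor: blocking calls (state reads, SDK calls, cluster teardown) are pushed onto a worker thread via `context_utils.to_thread`, SkyPilot's `to_thread` wrapper (presumably context-propagating, given the module name), so the controller's event loop never stalls. The stdlib equivalent is `asyncio.to_thread`; a minimal sketch with a made-up `slow_lookup` standing in for a blocking call such as `get_handle_from_cluster_name`:

    import asyncio
    import time

    def slow_lookup(name: str) -> str:
        """Blocking stand-in: simulates a network or disk round trip."""
        time.sleep(1)
        return f'handle-for-{name}'

    async def main() -> None:
        # The blocking call runs in a worker thread; the event loop stays
        # free to serve other managed jobs in the meantime.
        handle = await asyncio.to_thread(slow_lookup, 'my-cluster')
        print(handle)

    asyncio.run(main())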
@@ -169,14 +209,26 @@ class StrategyExecutor:
             # should be functional with the `_try_cancel_if_cluster_is_init`
             # flag, i.e. it sends the cancel signal to the head node, which will
             # then kill the user process on remaining worker nodes.
-            # Only cancel the corresponding job for worker pool.
+            # Only cancel the corresponding job for pool.
             if self.pool is None:
-                kwargs = dict(all=True)
+                request_id = await context_utils.to_thread(
+                    sdk.cancel,
+                    cluster_name=self.cluster_name,
+                    all=True,
+                    _try_cancel_if_cluster_is_init=True,
+                )
             else:
-                kwargs = dict(job_ids=[self.job_id_on_pool_cluster])
-            core.cancel(cluster_name=self.cluster_name,
-                        **kwargs,
-                        _try_cancel_if_cluster_is_init=True)
+                request_id = await context_utils.to_thread(
+                    sdk.cancel,
+                    cluster_name=self.cluster_name,
+                    job_ids=[self.job_id_on_pool_cluster],
+                    _try_cancel_if_cluster_is_init=True,
+                )
+            logger.debug(f'sdk.cancel request ID: {request_id}')
+            await context_utils.to_thread(
+                sdk.get,
+                request_id,
+            )
         except Exception as e:  # pylint: disable=broad-except
             logger.info('Failed to cancel the job on the cluster. The cluster '
                         'might be already down or the head node is preempted.'
@@ -184,9 +236,9 @@ class StrategyExecutor:
                         f'{common_utils.format_exception(e)}\n'
                         'Terminating the cluster explicitly to ensure no '
                         'remaining job process interferes with recovery.')
-            self._cleanup_cluster()
+            await context_utils.to_thread(self._cleanup_cluster)
 
-    def _wait_until_job_starts_on_cluster(self) -> Optional[float]:
+    async def _wait_until_job_starts_on_cluster(self) -> Optional[float]:
         """Wait for MAX_JOB_CHECKING_RETRY times until job starts on the cluster
 
         Returns:
@@ -200,10 +252,10 @@ class StrategyExecutor:
             # Avoid the infinite loop, if any bug happens.
             job_checking_retry_cnt += 1
             try:
-                cluster_status, _ = (
-                    backend_utils.refresh_cluster_status_handle(
-                        self.cluster_name,
-                        force_refresh_statuses=set(status_lib.ClusterStatus)))
+                cluster_status, _ = (await context_utils.to_thread(
+                    backend_utils.refresh_cluster_status_handle,
+                    self.cluster_name,
+                    force_refresh_statuses=set(status_lib.ClusterStatus)))
             except Exception as e:  # pylint: disable=broad-except
                 # If any unexpected error happens, retry the job checking
                 # loop.
@@ -223,7 +275,7 @@ class StrategyExecutor:
                 break
 
             try:
-                status = managed_job_utils.get_job_status(
+                status = await managed_job_utils.get_job_status(
                     self.backend,
                     self.cluster_name,
                     job_id=self.job_id_on_pool_cluster)
@@ -241,7 +293,8 @@ class StrategyExecutor:
             # Check the job status until it is not in initialized status
             if status is not None and status > job_lib.JobStatus.INIT:
                 try:
-                    job_submitted_at = managed_job_utils.get_job_timestamp(
+                    job_submitted_at = await context_utils.to_thread(
+                        managed_job_utils.get_job_timestamp,
                         self.backend,
                         self.cluster_name,
                         self.job_id_on_pool_cluster,
@@ -254,7 +307,8 @@ class StrategyExecutor:
                                    'the job start timestamp. Retrying.')
                     continue
             # Wait for the job to be started
-            time.sleep(managed_job_utils.JOB_STARTED_STATUS_CHECK_GAP_SECONDS)
+            await asyncio.sleep(
+                managed_job_utils.JOB_STARTED_STATUS_CHECK_GAP_SECONDS)
         return None
 
     def _cleanup_cluster(self) -> None:
@@ -263,10 +317,10 @@ class StrategyExecutor:
         if self.pool is None:
             managed_job_utils.terminate_cluster(self.cluster_name)
 
-    def _launch(self,
-                max_retry: Optional[int] = 3,
-                raise_on_failure: bool = True,
-                recovery: bool = False) -> Optional[float]:
+    async def _launch(self,
+                      max_retry: Optional[int] = 3,
+                      raise_on_failure: bool = True,
+                      recovery: bool = False) -> Optional[float]:
         """Implementation of launch().
 
         The function will wait until the job starts running, but will leave the
@@ -307,54 +361,132 @@ class StrategyExecutor:
         while True:
             retry_cnt += 1
             try:
-                with scheduler.scheduled_launch(self.job_id):
+                async with scheduler.scheduled_launch(
+                        self.job_id,
+                        self.starting,
+                        self.starting_lock,
+                        self.starting_signal,
+                ):
                     # The job state may have been PENDING during backoff -
                     # update to STARTING or RECOVERING.
                     # On the first attempt (when retry_cnt is 1), we should
                     # already be in STARTING or RECOVERING.
                     if retry_cnt > 1:
-                        state.set_restarting(self.job_id, self.task_id,
-                                             recovery)
+                        await state.set_restarting_async(
+                            self.job_id, self.task_id, recovery)
                     try:
                         usage_lib.messages.usage.set_internal()
                         if self.pool is None:
                             assert self.cluster_name is not None
-                            # Detach setup, so that the setup failure can be
-                            # detected by the controller process (job_status ->
-                            # FAILED_SETUP).
-                            execution.launch(
-                                self.dag,
-                                cluster_name=self.cluster_name,
-                                # We expect to tear down the cluster as soon as
-                                # the job is finished. However, in case the
-                                # controller dies, we may end up with a
-                                # resource leak.
-                                # Ideally, we should autodown to be safe,
-                                # but it's fine to disable it for now, as
-                                # Nebius doesn't support autodown yet.
-                                # TODO(kevin): set down=True once Nebius
-                                # supports autodown.
-                                # idle_minutes_to_autostop=_AUTODOWN_MINUTES,
-                                # down=True,
-                                _is_launched_by_jobs_controller=True)
+
+                            # sdk.launch will implicitly start the API server,
+                            # but then the API server will inherit the current
+                            # env vars/user, which we may not want.
+                            # Instead, clear env vars here and call api_start
+                            # explicitly.
+                            vars_to_restore = {}
+                            try:
+                                for env_var in ENV_VARS_TO_CLEAR:
+                                    vars_to_restore[env_var] = os.environ.pop(
+                                        env_var, None)
+                                    logger.debug('Cleared env var: '
+                                                 f'{env_var}')
+                                logger.debug('Env vars for api_start: '
+                                             f'{os.environ}')
+                                await context_utils.to_thread(sdk.api_start)
+                                logger.info('API server started.')
+                            finally:
+                                for env_var, value in vars_to_restore.items():
+                                    if value is not None:
+                                        logger.debug('Restored env var: '
+                                                     f'{env_var}: {value}')
+                                        os.environ[env_var] = value
+
+                            request_id = None
+                            try:
+                                request_id = await context_utils.to_thread(
+                                    sdk.launch,
+                                    self.dag,
+                                    cluster_name=self.cluster_name,
+                                    # We expect to tear down the cluster as soon
+                                    # as the job is finished. However, in case
+                                    # the controller dies, we may end up with a
+                                    # resource leak.
+                                    # Ideally, we should autodown to be safe,
+                                    # but it's fine to disable it for now, as
+                                    # Nebius doesn't support autodown yet.
+                                    # TODO(kevin): set down=True once Nebius
+                                    # supports autodown.
+                                    # idle_minutes_to_autostop=(
+                                    #     _AUTODOWN_MINUTES),
+                                    # down=True,
+                                    _is_launched_by_jobs_controller=True,
+                                )
+                                logger.debug('sdk.launch request ID: '
+                                             f'{request_id}')
+                                await context_utils.to_thread(
+                                    sdk.stream_and_get,
+                                    request_id,
+                                )
+                            except asyncio.CancelledError:
+                                if request_id:
+                                    req = await context_utils.to_thread(
+                                        sdk.api_cancel, request_id)
+                                    logger.debug('sdk.api_cancel request '
+                                                 f'ID: {req}')
+                                    try:
+                                        await context_utils.to_thread(
+                                            sdk.get, req)
+                                    except Exception as e:  # pylint: disable=broad-except
+                                        # we must still return a CancelledError
+                                        logger.error(
+                                            f'Failed to cancel the job: {e}')
+                                raise
+                            logger.info('Managed job cluster launched.')
                         else:
-                            self.cluster_name = (
-                                serve_utils.get_next_cluster_name(
-                                    self.pool, self.job_id))
+                            self.cluster_name = await (context_utils.to_thread(
+                                serve_utils.get_next_cluster_name, self.pool,
+                                self.job_id))
                             if self.cluster_name is None:
                                 raise exceptions.NoClusterLaunchedError(
                                     'No cluster name found in the pool.')
-                            job_id_on_pool_cluster, _ = execution.exec(
-                                self.dag, cluster_name=self.cluster_name)
+                            request_id = None
+                            try:
+                                request_id = await context_utils.to_thread(
+                                    sdk.exec,
+                                    self.dag,
+                                    cluster_name=self.cluster_name,
+                                )
+                                logger.debug('sdk.exec request ID: '
+                                             f'{request_id}')
+                                job_id_on_pool_cluster, _ = (
+                                    await context_utils.to_thread(
+                                        sdk.get, request_id))
+                            except asyncio.CancelledError:
+                                if request_id:
+                                    req = await context_utils.to_thread(
+                                        sdk.api_cancel, request_id)
+                                    logger.debug('sdk.api_cancel request '
+                                                 f'ID: {req}')
+                                    try:
+                                        await context_utils.to_thread(
+                                            sdk.get, req)
+                                    except Exception as e:  # pylint: disable=broad-except
+                                        # we must still return a CancelledError
+                                        logger.error(
+                                            f'Failed to cancel the job: {e}')
+                                raise
                             assert job_id_on_pool_cluster is not None, (
                                 self.cluster_name, self.job_id)
                             self.job_id_on_pool_cluster = job_id_on_pool_cluster
-                            state.set_job_id_on_pool_cluster(
+                            await state.set_job_id_on_pool_cluster_async(
                                 self.job_id, job_id_on_pool_cluster)
                             logger.info('Managed job cluster launched.')
                     except (exceptions.InvalidClusterNameError,
                             exceptions.NoCloudAccessError,
-                            exceptions.ResourcesMismatchError) as e:
+                            exceptions.ResourcesMismatchError,
+                            exceptions.StorageSpecError,
+                            exceptions.StorageError) as e:
                         logger.error('Failure happened before provisioning. '
                                      f'{common_utils.format_exception(e)}')
                         if raise_on_failure:
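
Note on the cancellation handling added above: if the controller coroutine is cancelled mid-launch, the code first asks the API server to cancel the outstanding request, then re-raises `CancelledError`. A condensed sketch of that shape (the `sdk` object and its `launch`/`stream_and_get`/`api_cancel` calls mirror the diff; error handling trimmed):

    import asyncio

    async def launch_and_wait(sdk, dag, cluster_name: str):
        """Submit via the SDK off-thread; cancel server-side work if we die."""
        request_id = None
        try:
            request_id = await asyncio.to_thread(
                sdk.launch, dag, cluster_name=cluster_name)
            return await asyncio.to_thread(sdk.stream_and_get, request_id)
        except asyncio.CancelledError:
            if request_id is not None:
                # Best-effort: stop the server-side request too.
                await asyncio.to_thread(sdk.api_cancel, request_id)
            raise  # a cancelled coroutine must propagate CancelledError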
@@ -405,7 +537,7 @@ class StrategyExecutor:
                     # At this point, a sky.launch() has succeeded. Cluster
                     # may be UP (no preemption since) or DOWN (newly
                     # preempted).
-                    job_submitted_at = (
+                    job_submitted_at = await (
                         self._wait_until_job_starts_on_cluster())
                     if job_submitted_at is not None:
                         return job_submitted_at
@@ -421,7 +553,7 @@ class StrategyExecutor:
 
                 # If we get here, the launch did not succeed. Tear down the
                 # cluster and retry.
-                self._cleanup_cluster()
+                await context_utils.to_thread(self._cleanup_cluster)
                 if max_retry is not None and retry_cnt >= max_retry:
                     # Retry forever if max_retry is None.
                     if raise_on_failure:
@@ -444,15 +576,13 @@ class StrategyExecutor:
 
             except exceptions.NoClusterLaunchedError:
                 # Update the status to PENDING during backoff.
-                state.set_backoff_pending(self.job_id, self.task_id)
+                await state.set_backoff_pending_async(self.job_id, self.task_id)
                 # Calculate the backoff time and sleep.
-                # We retry immediately for worker pool, since no sky.launch()
-                # is called and the overhead is minimal.
                 gap_seconds = (backoff.current_backoff()
                                if self.pool is None else 1)
                 logger.info('Retrying to launch the cluster in '
                             f'{gap_seconds:.1f} seconds.')
-                time.sleep(gap_seconds)
+                await asyncio.sleep(gap_seconds)
                 continue
             else:
                 # The inner loop should either return or throw
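
Note on the backoff path: sleeping with `await asyncio.sleep` instead of `time.sleep` keeps the shared controller event loop serving other jobs during the wait. A generic sketch of the retry-with-backoff shape (the real `backoff` object comes from SkyPilot's utilities; here a simple doubling gap with an assumed 60-second cap stands in):

    import asyncio

    async def retry_with_backoff(attempt, max_retry: int = 5) -> bool:
        """Retry an async attempt() until it succeeds or retries run out."""
        gap_seconds = 1.0
        for _ in range(max_retry):
            if await attempt():
                return True
            # Non-blocking sleep: other tasks keep running meanwhile.
            await asyncio.sleep(gap_seconds)
            gap_seconds = min(gap_seconds * 2, 60.0)  # assumed cap
        return False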
@@ -478,26 +608,38 @@ class FailoverStrategyExecutor(StrategyExecutor):
 
     _MAX_RETRY_CNT = 240  # Retry for 4 hours.
 
-    def __init__(self, cluster_name: Optional[str], backend: 'backends.Backend',
-                 task: 'task_lib.Task', max_restarts_on_errors: int,
-                 job_id: int, task_id: int, pool: Optional[str]) -> None:
+    def __init__(
+        self,
+        cluster_name: Optional[str],
+        backend: 'backends.Backend',
+        task: 'task_lib.Task',
+        max_restarts_on_errors: int,
+        job_id: int,
+        task_id: int,
+        pool: Optional[str],
+        starting: Set[int],
+        starting_lock: asyncio.Lock,
+        starting_signal: asyncio.Condition,
+    ) -> None:
         super().__init__(cluster_name, backend, task, max_restarts_on_errors,
-                         job_id, task_id, pool)
+                         job_id, task_id, pool, starting, starting_lock,
+                         starting_signal)
         # Note down the cloud/region of the launched cluster, so that we can
         # first retry in the same cloud/region. (Inside recover() we may not
         # rely on cluster handle, as it can be None if the cluster is
         # preempted.)
         self._launched_resources: Optional['resources.Resources'] = None
 
-    def _launch(self,
-                max_retry: Optional[int] = 3,
-                raise_on_failure: bool = True,
-                recovery: bool = False) -> Optional[float]:
-        job_submitted_at = super()._launch(max_retry, raise_on_failure,
-                                           recovery)
+    async def _launch(self,
+                      max_retry: Optional[int] = 3,
+                      raise_on_failure: bool = True,
+                      recovery: bool = False) -> Optional[float]:
+        job_submitted_at = await super()._launch(max_retry, raise_on_failure,
+                                                 recovery)
         if job_submitted_at is not None and self.cluster_name is not None:
             # Only record the cloud/region if the launch is successful.
-            handle = global_user_state.get_handle_from_cluster_name(
+            handle = await context_utils.to_thread(
+                global_user_state.get_handle_from_cluster_name,
                 self.cluster_name)
             assert isinstance(handle, backends.CloudVmRayResourceHandle), (
                 'Cluster should be launched.', handle)
@@ -507,7 +649,7 @@ class FailoverStrategyExecutor(StrategyExecutor):
             self._launched_resources = None
         return job_submitted_at
 
-    def recover(self) -> float:
+    async def recover(self) -> float:
         # 1. Cancel the jobs and launch the cluster with the STOPPED status,
         # so that it will try on the current region first until timeout.
         # 2. Tear down the cluster, if the step 1 failed to launch the cluster.
@@ -515,7 +657,7 @@ class FailoverStrategyExecutor(StrategyExecutor):
         # original user specification.
 
         # Step 1
-        self._try_cancel_jobs()
+        await self._try_cancel_jobs()
 
         while True:
             # Add region constraint to the task, to retry on the same region
@@ -529,8 +671,8 @@ class FailoverStrategyExecutor(StrategyExecutor):
                     cloud=launched_cloud, region=launched_region, zone=None)
                 task.set_resources({new_resources})
                 # Not using self.launch to avoid the retry until up logic.
-                job_submitted_at = self._launch(raise_on_failure=False,
-                                                recovery=True)
+                job_submitted_at = await self._launch(raise_on_failure=False,
+                                                      recovery=True)
                 # Restore the original dag, i.e. reset the region constraint.
                 task.set_resources(original_resources)
                 if job_submitted_at is not None:
@@ -539,21 +681,21 @@ class FailoverStrategyExecutor(StrategyExecutor):
 
             # Step 2
             logger.debug('Terminating unhealthy cluster and reset cloud '
                          'region.')
-            self._cleanup_cluster()
+            await context_utils.to_thread(self._cleanup_cluster)
 
             # Step 3
             logger.debug('Relaunch the cluster without constraining to prior '
                          'cloud/region.')
             # Not using self.launch to avoid the retry until up logic.
-            job_submitted_at = self._launch(max_retry=self._MAX_RETRY_CNT,
-                                            raise_on_failure=False,
-                                            recovery=True)
+            job_submitted_at = await self._launch(max_retry=self._MAX_RETRY_CNT,
+                                                  raise_on_failure=False,
+                                                  recovery=True)
             if job_submitted_at is None:
                 # Failed to launch the cluster.
                 gap_seconds = self.RETRY_INIT_GAP_SECONDS
                 logger.info('Retrying to recover the cluster in '
                             f'{gap_seconds:.1f} seconds.')
-                time.sleep(gap_seconds)
+                await asyncio.sleep(gap_seconds)
                 continue
 
         return job_submitted_at
@@ -585,7 +727,7 @@ class EagerFailoverStrategyExecutor(FailoverStrategyExecutor):
           -> R1Z1 (success)
     """
 
-    def recover(self) -> float:
+    async def recover(self) -> float:
         # 1. Terminate the current cluster
         # 2. Launch again by explicitly blocking the previously launched region
         #    (this will failover through the entire search space except the
@@ -598,7 +740,7 @@ class EagerFailoverStrategyExecutor(FailoverStrategyExecutor):
 
         # Step 1
         logger.debug('Terminating unhealthy cluster and reset cloud region.')
-        self._cleanup_cluster()
+        await context_utils.to_thread(self._cleanup_cluster)
 
         # Step 2
        logger.debug('Relaunch the cluster skipping the previously launched '
@@ -619,8 +761,8 @@ class EagerFailoverStrategyExecutor(FailoverStrategyExecutor):
                                        region=launched_region)
             }
             # Not using self.launch to avoid the retry until up logic.
-            job_submitted_at = self._launch(raise_on_failure=False,
-                                            recovery=True)
+            job_submitted_at = await self._launch(raise_on_failure=False,
+                                                  recovery=True)
             task.blocked_resources = None
             if job_submitted_at is not None:
                 return job_submitted_at
@@ -630,15 +772,23 @@ class EagerFailoverStrategyExecutor(FailoverStrategyExecutor):
             logger.debug('Relaunch the cluster without constraining to prior '
                          'cloud/region.')
             # Not using self.launch to avoid the retry until up logic.
-            job_submitted_at = self._launch(max_retry=self._MAX_RETRY_CNT,
-                                            raise_on_failure=False,
-                                            recovery=True)
+            job_submitted_at = await self._launch(max_retry=self._MAX_RETRY_CNT,
+                                                  raise_on_failure=False,
+                                                  recovery=True)
             if job_submitted_at is None:
                 # Failed to launch the cluster.
                 gap_seconds = self.RETRY_INIT_GAP_SECONDS
                 logger.info('Retrying to recover the cluster in '
                             f'{gap_seconds:.1f} seconds.')
-                time.sleep(gap_seconds)
+                await asyncio.sleep(gap_seconds)
                 continue
 
         return job_submitted_at
+
+
+def _get_logger_file(file_logger: logging.Logger) -> Optional[str]:
+    """Gets the file path that the logger writes to."""
+    for handler in file_logger.handlers:
+        if isinstance(handler, logging.FileHandler):
+            return handler.baseFilename
+    return None
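
Note on the new module-level `_get_logger_file` helper: it returns the path behind a logger's first `FileHandler`, or None if the logger only writes to streams. A short usage sketch:

    import logging

    job_logger = logging.getLogger('managed_job_demo')
    job_logger.addHandler(logging.FileHandler('/tmp/managed_job_demo.log'))

    # Prints the absolute path of the log file; None for stream-only loggers.
    print(_get_logger_file(job_logger))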