skypilot-nightly 1.0.0.dev20250502__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl

This diff shows the changes between two publicly released versions of this package, as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (546)
  1. sky/__init__.py +22 -6
  2. sky/adaptors/aws.py +81 -16
  3. sky/adaptors/common.py +25 -2
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/gcp.py +11 -0
  7. sky/adaptors/hyperbolic.py +8 -0
  8. sky/adaptors/ibm.py +5 -2
  9. sky/adaptors/kubernetes.py +149 -18
  10. sky/adaptors/nebius.py +173 -30
  11. sky/adaptors/primeintellect.py +1 -0
  12. sky/adaptors/runpod.py +68 -0
  13. sky/adaptors/seeweb.py +183 -0
  14. sky/adaptors/shadeform.py +89 -0
  15. sky/admin_policy.py +187 -4
  16. sky/authentication.py +179 -225
  17. sky/backends/__init__.py +4 -2
  18. sky/backends/backend.py +22 -9
  19. sky/backends/backend_utils.py +1323 -397
  20. sky/backends/cloud_vm_ray_backend.py +1749 -1029
  21. sky/backends/docker_utils.py +1 -1
  22. sky/backends/local_docker_backend.py +11 -6
  23. sky/backends/task_codegen.py +633 -0
  24. sky/backends/wheel_utils.py +55 -9
  25. sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
  26. sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
  27. sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
  28. sky/{clouds/service_catalog → catalog}/common.py +90 -49
  29. sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
  30. sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
  31. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +116 -80
  32. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
  33. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +70 -16
  34. sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
  35. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
  36. sky/catalog/data_fetchers/fetch_nebius.py +338 -0
  37. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  38. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  39. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  40. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
  41. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
  42. sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
  43. sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
  44. sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
  45. sky/catalog/hyperbolic_catalog.py +136 -0
  46. sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
  47. sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
  48. sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
  49. sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
  50. sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
  51. sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
  52. sky/catalog/primeintellect_catalog.py +95 -0
  53. sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
  54. sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
  55. sky/catalog/seeweb_catalog.py +184 -0
  56. sky/catalog/shadeform_catalog.py +165 -0
  57. sky/catalog/ssh_catalog.py +167 -0
  58. sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
  59. sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
  60. sky/check.py +533 -185
  61. sky/cli.py +5 -5975
  62. sky/client/{cli.py → cli/command.py} +2591 -1956
  63. sky/client/cli/deprecation_utils.py +99 -0
  64. sky/client/cli/flags.py +359 -0
  65. sky/client/cli/table_utils.py +322 -0
  66. sky/client/cli/utils.py +79 -0
  67. sky/client/common.py +78 -32
  68. sky/client/oauth.py +82 -0
  69. sky/client/sdk.py +1219 -319
  70. sky/client/sdk_async.py +827 -0
  71. sky/client/service_account_auth.py +47 -0
  72. sky/cloud_stores.py +82 -3
  73. sky/clouds/__init__.py +13 -0
  74. sky/clouds/aws.py +564 -164
  75. sky/clouds/azure.py +105 -83
  76. sky/clouds/cloud.py +140 -40
  77. sky/clouds/cudo.py +68 -50
  78. sky/clouds/do.py +66 -48
  79. sky/clouds/fluidstack.py +63 -44
  80. sky/clouds/gcp.py +339 -110
  81. sky/clouds/hyperbolic.py +293 -0
  82. sky/clouds/ibm.py +70 -49
  83. sky/clouds/kubernetes.py +570 -162
  84. sky/clouds/lambda_cloud.py +74 -54
  85. sky/clouds/nebius.py +210 -81
  86. sky/clouds/oci.py +88 -66
  87. sky/clouds/paperspace.py +61 -44
  88. sky/clouds/primeintellect.py +317 -0
  89. sky/clouds/runpod.py +164 -74
  90. sky/clouds/scp.py +89 -86
  91. sky/clouds/seeweb.py +477 -0
  92. sky/clouds/shadeform.py +400 -0
  93. sky/clouds/ssh.py +263 -0
  94. sky/clouds/utils/aws_utils.py +10 -4
  95. sky/clouds/utils/gcp_utils.py +87 -11
  96. sky/clouds/utils/oci_utils.py +38 -14
  97. sky/clouds/utils/scp_utils.py +231 -167
  98. sky/clouds/vast.py +99 -77
  99. sky/clouds/vsphere.py +51 -40
  100. sky/core.py +375 -173
  101. sky/dag.py +15 -0
  102. sky/dashboard/out/404.html +1 -1
  103. sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
  104. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  105. sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
  106. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
  107. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  108. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
  109. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  110. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
  111. sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
  112. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
  113. sky/dashboard/out/_next/static/chunks/3800-7b45f9fbb6308557.js +1 -0
  114. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
  115. sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
  116. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
  117. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  118. sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
  119. sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
  120. sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
  121. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  122. sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
  123. sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
  124. sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
  125. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  126. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
  127. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
  128. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  129. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
  130. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  131. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  132. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
  133. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  134. sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
  135. sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
  136. sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
  137. sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
  138. sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
  139. sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
  140. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
  141. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
  142. sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
  143. sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
  144. sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
  145. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
  146. sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
  147. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
  148. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
  149. sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
  150. sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
  151. sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
  152. sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
  153. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-84a40f8c7c627fe4.js +1 -0
  154. sky/dashboard/out/_next/static/chunks/pages/workspaces-531b2f8c4bf89f82.js +1 -0
  155. sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
  156. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  157. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  158. sky/dashboard/out/clusters/[cluster].html +1 -1
  159. sky/dashboard/out/clusters.html +1 -1
  160. sky/dashboard/out/config.html +1 -0
  161. sky/dashboard/out/index.html +1 -1
  162. sky/dashboard/out/infra/[context].html +1 -0
  163. sky/dashboard/out/infra.html +1 -0
  164. sky/dashboard/out/jobs/[job].html +1 -1
  165. sky/dashboard/out/jobs/pools/[pool].html +1 -0
  166. sky/dashboard/out/jobs.html +1 -1
  167. sky/dashboard/out/users.html +1 -0
  168. sky/dashboard/out/volumes.html +1 -0
  169. sky/dashboard/out/workspace/new.html +1 -0
  170. sky/dashboard/out/workspaces/[name].html +1 -0
  171. sky/dashboard/out/workspaces.html +1 -0
  172. sky/data/data_utils.py +137 -1
  173. sky/data/mounting_utils.py +269 -84
  174. sky/data/storage.py +1460 -1807
  175. sky/data/storage_utils.py +43 -57
  176. sky/exceptions.py +126 -2
  177. sky/execution.py +216 -63
  178. sky/global_user_state.py +2390 -586
  179. sky/jobs/__init__.py +7 -0
  180. sky/jobs/client/sdk.py +300 -58
  181. sky/jobs/client/sdk_async.py +161 -0
  182. sky/jobs/constants.py +15 -8
  183. sky/jobs/controller.py +848 -275
  184. sky/jobs/file_content_utils.py +128 -0
  185. sky/jobs/log_gc.py +193 -0
  186. sky/jobs/recovery_strategy.py +402 -152
  187. sky/jobs/scheduler.py +314 -189
  188. sky/jobs/server/core.py +836 -255
  189. sky/jobs/server/server.py +156 -115
  190. sky/jobs/server/utils.py +136 -0
  191. sky/jobs/state.py +2109 -706
  192. sky/jobs/utils.py +1306 -215
  193. sky/logs/__init__.py +21 -0
  194. sky/logs/agent.py +108 -0
  195. sky/logs/aws.py +243 -0
  196. sky/logs/gcp.py +91 -0
  197. sky/metrics/__init__.py +0 -0
  198. sky/metrics/utils.py +453 -0
  199. sky/models.py +78 -1
  200. sky/optimizer.py +164 -70
  201. sky/provision/__init__.py +90 -4
  202. sky/provision/aws/config.py +147 -26
  203. sky/provision/aws/instance.py +136 -50
  204. sky/provision/azure/instance.py +11 -6
  205. sky/provision/common.py +13 -1
  206. sky/provision/cudo/cudo_machine_type.py +1 -1
  207. sky/provision/cudo/cudo_utils.py +14 -8
  208. sky/provision/cudo/cudo_wrapper.py +72 -71
  209. sky/provision/cudo/instance.py +10 -6
  210. sky/provision/do/instance.py +10 -6
  211. sky/provision/do/utils.py +4 -3
  212. sky/provision/docker_utils.py +140 -33
  213. sky/provision/fluidstack/instance.py +13 -8
  214. sky/provision/gcp/__init__.py +1 -0
  215. sky/provision/gcp/config.py +301 -19
  216. sky/provision/gcp/constants.py +218 -0
  217. sky/provision/gcp/instance.py +36 -8
  218. sky/provision/gcp/instance_utils.py +18 -4
  219. sky/provision/gcp/volume_utils.py +247 -0
  220. sky/provision/hyperbolic/__init__.py +12 -0
  221. sky/provision/hyperbolic/config.py +10 -0
  222. sky/provision/hyperbolic/instance.py +437 -0
  223. sky/provision/hyperbolic/utils.py +373 -0
  224. sky/provision/instance_setup.py +101 -20
  225. sky/provision/kubernetes/__init__.py +5 -0
  226. sky/provision/kubernetes/config.py +9 -52
  227. sky/provision/kubernetes/constants.py +17 -0
  228. sky/provision/kubernetes/instance.py +919 -280
  229. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  230. sky/provision/kubernetes/network.py +27 -17
  231. sky/provision/kubernetes/network_utils.py +44 -43
  232. sky/provision/kubernetes/utils.py +1221 -534
  233. sky/provision/kubernetes/volume.py +343 -0
  234. sky/provision/lambda_cloud/instance.py +22 -16
  235. sky/provision/nebius/constants.py +50 -0
  236. sky/provision/nebius/instance.py +19 -6
  237. sky/provision/nebius/utils.py +237 -137
  238. sky/provision/oci/instance.py +10 -5
  239. sky/provision/paperspace/instance.py +10 -7
  240. sky/provision/paperspace/utils.py +1 -1
  241. sky/provision/primeintellect/__init__.py +10 -0
  242. sky/provision/primeintellect/config.py +11 -0
  243. sky/provision/primeintellect/instance.py +454 -0
  244. sky/provision/primeintellect/utils.py +398 -0
  245. sky/provision/provisioner.py +117 -36
  246. sky/provision/runpod/__init__.py +5 -0
  247. sky/provision/runpod/instance.py +27 -6
  248. sky/provision/runpod/utils.py +51 -18
  249. sky/provision/runpod/volume.py +214 -0
  250. sky/provision/scp/__init__.py +15 -0
  251. sky/provision/scp/config.py +93 -0
  252. sky/provision/scp/instance.py +707 -0
  253. sky/provision/seeweb/__init__.py +11 -0
  254. sky/provision/seeweb/config.py +13 -0
  255. sky/provision/seeweb/instance.py +812 -0
  256. sky/provision/shadeform/__init__.py +11 -0
  257. sky/provision/shadeform/config.py +12 -0
  258. sky/provision/shadeform/instance.py +351 -0
  259. sky/provision/shadeform/shadeform_utils.py +83 -0
  260. sky/provision/ssh/__init__.py +18 -0
  261. sky/provision/vast/instance.py +13 -8
  262. sky/provision/vast/utils.py +10 -7
  263. sky/provision/volume.py +164 -0
  264. sky/provision/vsphere/common/ssl_helper.py +1 -1
  265. sky/provision/vsphere/common/vapiconnect.py +2 -1
  266. sky/provision/vsphere/common/vim_utils.py +4 -4
  267. sky/provision/vsphere/instance.py +15 -10
  268. sky/provision/vsphere/vsphere_utils.py +17 -20
  269. sky/py.typed +0 -0
  270. sky/resources.py +845 -119
  271. sky/schemas/__init__.py +0 -0
  272. sky/schemas/api/__init__.py +0 -0
  273. sky/schemas/api/responses.py +227 -0
  274. sky/schemas/db/README +4 -0
  275. sky/schemas/db/env.py +90 -0
  276. sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
  277. sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
  278. sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
  279. sky/schemas/db/global_user_state/004_is_managed.py +34 -0
  280. sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
  281. sky/schemas/db/global_user_state/006_provision_log.py +41 -0
  282. sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
  283. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  284. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  285. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  286. sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
  287. sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
  288. sky/schemas/db/script.py.mako +28 -0
  289. sky/schemas/db/serve_state/001_initial_schema.py +67 -0
  290. sky/schemas/db/serve_state/002_yaml_content.py +34 -0
  291. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  292. sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
  293. sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
  294. sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
  295. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  296. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  297. sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
  298. sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
  299. sky/schemas/generated/__init__.py +0 -0
  300. sky/schemas/generated/autostopv1_pb2.py +36 -0
  301. sky/schemas/generated/autostopv1_pb2.pyi +43 -0
  302. sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
  303. sky/schemas/generated/jobsv1_pb2.py +86 -0
  304. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  305. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  306. sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
  307. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  308. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  309. sky/schemas/generated/servev1_pb2.py +58 -0
  310. sky/schemas/generated/servev1_pb2.pyi +115 -0
  311. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  312. sky/serve/autoscalers.py +357 -5
  313. sky/serve/client/impl.py +310 -0
  314. sky/serve/client/sdk.py +47 -139
  315. sky/serve/client/sdk_async.py +130 -0
  316. sky/serve/constants.py +12 -9
  317. sky/serve/controller.py +68 -17
  318. sky/serve/load_balancer.py +106 -60
  319. sky/serve/load_balancing_policies.py +116 -2
  320. sky/serve/replica_managers.py +434 -249
  321. sky/serve/serve_rpc_utils.py +179 -0
  322. sky/serve/serve_state.py +569 -257
  323. sky/serve/serve_utils.py +775 -265
  324. sky/serve/server/core.py +66 -711
  325. sky/serve/server/impl.py +1093 -0
  326. sky/serve/server/server.py +21 -18
  327. sky/serve/service.py +192 -89
  328. sky/serve/service_spec.py +144 -20
  329. sky/serve/spot_placer.py +3 -0
  330. sky/server/auth/__init__.py +0 -0
  331. sky/server/auth/authn.py +50 -0
  332. sky/server/auth/loopback.py +38 -0
  333. sky/server/auth/oauth2_proxy.py +202 -0
  334. sky/server/common.py +478 -182
  335. sky/server/config.py +85 -23
  336. sky/server/constants.py +44 -6
  337. sky/server/daemons.py +295 -0
  338. sky/server/html/token_page.html +185 -0
  339. sky/server/metrics.py +160 -0
  340. sky/server/middleware_utils.py +166 -0
  341. sky/server/requests/executor.py +558 -138
  342. sky/server/requests/payloads.py +364 -24
  343. sky/server/requests/preconditions.py +21 -17
  344. sky/server/requests/process.py +112 -29
  345. sky/server/requests/request_names.py +121 -0
  346. sky/server/requests/requests.py +822 -226
  347. sky/server/requests/serializers/decoders.py +82 -31
  348. sky/server/requests/serializers/encoders.py +140 -22
  349. sky/server/requests/threads.py +117 -0
  350. sky/server/rest.py +455 -0
  351. sky/server/server.py +1309 -285
  352. sky/server/state.py +20 -0
  353. sky/server/stream_utils.py +327 -61
  354. sky/server/uvicorn.py +217 -3
  355. sky/server/versions.py +270 -0
  356. sky/setup_files/MANIFEST.in +11 -1
  357. sky/setup_files/alembic.ini +160 -0
  358. sky/setup_files/dependencies.py +139 -31
  359. sky/setup_files/setup.py +44 -42
  360. sky/sky_logging.py +114 -7
  361. sky/skylet/attempt_skylet.py +106 -24
  362. sky/skylet/autostop_lib.py +129 -8
  363. sky/skylet/configs.py +29 -20
  364. sky/skylet/constants.py +216 -25
  365. sky/skylet/events.py +101 -21
  366. sky/skylet/job_lib.py +345 -164
  367. sky/skylet/log_lib.py +297 -18
  368. sky/skylet/log_lib.pyi +44 -1
  369. sky/skylet/providers/ibm/node_provider.py +12 -8
  370. sky/skylet/providers/ibm/vpc_provider.py +13 -12
  371. sky/skylet/ray_patches/__init__.py +17 -3
  372. sky/skylet/ray_patches/autoscaler.py.diff +18 -0
  373. sky/skylet/ray_patches/cli.py.diff +19 -0
  374. sky/skylet/ray_patches/command_runner.py.diff +17 -0
  375. sky/skylet/ray_patches/log_monitor.py.diff +20 -0
  376. sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
  377. sky/skylet/ray_patches/updater.py.diff +18 -0
  378. sky/skylet/ray_patches/worker.py.diff +41 -0
  379. sky/skylet/runtime_utils.py +21 -0
  380. sky/skylet/services.py +568 -0
  381. sky/skylet/skylet.py +72 -4
  382. sky/skylet/subprocess_daemon.py +104 -29
  383. sky/skypilot_config.py +506 -99
  384. sky/ssh_node_pools/__init__.py +1 -0
  385. sky/ssh_node_pools/core.py +135 -0
  386. sky/ssh_node_pools/server.py +233 -0
  387. sky/task.py +685 -163
  388. sky/templates/aws-ray.yml.j2 +11 -3
  389. sky/templates/azure-ray.yml.j2 +2 -1
  390. sky/templates/cudo-ray.yml.j2 +1 -0
  391. sky/templates/do-ray.yml.j2 +2 -1
  392. sky/templates/fluidstack-ray.yml.j2 +1 -0
  393. sky/templates/gcp-ray.yml.j2 +62 -1
  394. sky/templates/hyperbolic-ray.yml.j2 +68 -0
  395. sky/templates/ibm-ray.yml.j2 +2 -1
  396. sky/templates/jobs-controller.yaml.j2 +27 -24
  397. sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
  398. sky/templates/kubernetes-ray.yml.j2 +611 -50
  399. sky/templates/lambda-ray.yml.j2 +2 -1
  400. sky/templates/nebius-ray.yml.j2 +34 -12
  401. sky/templates/oci-ray.yml.j2 +1 -0
  402. sky/templates/paperspace-ray.yml.j2 +2 -1
  403. sky/templates/primeintellect-ray.yml.j2 +72 -0
  404. sky/templates/runpod-ray.yml.j2 +10 -1
  405. sky/templates/scp-ray.yml.j2 +4 -50
  406. sky/templates/seeweb-ray.yml.j2 +171 -0
  407. sky/templates/shadeform-ray.yml.j2 +73 -0
  408. sky/templates/sky-serve-controller.yaml.j2 +22 -2
  409. sky/templates/vast-ray.yml.j2 +1 -0
  410. sky/templates/vsphere-ray.yml.j2 +1 -0
  411. sky/templates/websocket_proxy.py +212 -37
  412. sky/usage/usage_lib.py +31 -15
  413. sky/users/__init__.py +0 -0
  414. sky/users/model.conf +15 -0
  415. sky/users/permission.py +397 -0
  416. sky/users/rbac.py +121 -0
  417. sky/users/server.py +720 -0
  418. sky/users/token_service.py +218 -0
  419. sky/utils/accelerator_registry.py +35 -5
  420. sky/utils/admin_policy_utils.py +84 -38
  421. sky/utils/annotations.py +38 -5
  422. sky/utils/asyncio_utils.py +78 -0
  423. sky/utils/atomic.py +1 -1
  424. sky/utils/auth_utils.py +153 -0
  425. sky/utils/benchmark_utils.py +60 -0
  426. sky/utils/cli_utils/status_utils.py +159 -86
  427. sky/utils/cluster_utils.py +31 -9
  428. sky/utils/command_runner.py +354 -68
  429. sky/utils/command_runner.pyi +93 -3
  430. sky/utils/common.py +35 -8
  431. sky/utils/common_utils.py +314 -91
  432. sky/utils/config_utils.py +74 -5
  433. sky/utils/context.py +403 -0
  434. sky/utils/context_utils.py +242 -0
  435. sky/utils/controller_utils.py +383 -89
  436. sky/utils/dag_utils.py +31 -12
  437. sky/utils/db/__init__.py +0 -0
  438. sky/utils/db/db_utils.py +485 -0
  439. sky/utils/db/kv_cache.py +149 -0
  440. sky/utils/db/migration_utils.py +137 -0
  441. sky/utils/directory_utils.py +12 -0
  442. sky/utils/env_options.py +13 -0
  443. sky/utils/git.py +567 -0
  444. sky/utils/git_clone.sh +460 -0
  445. sky/utils/infra_utils.py +195 -0
  446. sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
  447. sky/utils/kubernetes/config_map_utils.py +133 -0
  448. sky/utils/kubernetes/create_cluster.sh +15 -29
  449. sky/utils/kubernetes/delete_cluster.sh +10 -7
  450. sky/utils/kubernetes/deploy_ssh_node_pools.py +1177 -0
  451. sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
  452. sky/utils/kubernetes/generate_kind_config.py +6 -66
  453. sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
  454. sky/utils/kubernetes/gpu_labeler.py +18 -8
  455. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
  456. sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
  457. sky/utils/kubernetes/kubernetes_deploy_utils.py +284 -114
  458. sky/utils/kubernetes/rsync_helper.sh +11 -3
  459. sky/utils/kubernetes/ssh-tunnel.sh +379 -0
  460. sky/utils/kubernetes/ssh_utils.py +221 -0
  461. sky/utils/kubernetes_enums.py +8 -15
  462. sky/utils/lock_events.py +94 -0
  463. sky/utils/locks.py +416 -0
  464. sky/utils/log_utils.py +82 -107
  465. sky/utils/perf_utils.py +22 -0
  466. sky/utils/resource_checker.py +298 -0
  467. sky/utils/resources_utils.py +249 -32
  468. sky/utils/rich_utils.py +217 -39
  469. sky/utils/schemas.py +955 -160
  470. sky/utils/serialize_utils.py +16 -0
  471. sky/utils/status_lib.py +10 -0
  472. sky/utils/subprocess_utils.py +29 -15
  473. sky/utils/tempstore.py +70 -0
  474. sky/utils/thread_utils.py +91 -0
  475. sky/utils/timeline.py +26 -53
  476. sky/utils/ux_utils.py +84 -15
  477. sky/utils/validator.py +11 -1
  478. sky/utils/volume.py +165 -0
  479. sky/utils/yaml_utils.py +111 -0
  480. sky/volumes/__init__.py +13 -0
  481. sky/volumes/client/__init__.py +0 -0
  482. sky/volumes/client/sdk.py +150 -0
  483. sky/volumes/server/__init__.py +0 -0
  484. sky/volumes/server/core.py +270 -0
  485. sky/volumes/server/server.py +124 -0
  486. sky/volumes/volume.py +215 -0
  487. sky/workspaces/__init__.py +0 -0
  488. sky/workspaces/core.py +655 -0
  489. sky/workspaces/server.py +101 -0
  490. sky/workspaces/utils.py +56 -0
  491. sky_templates/README.md +3 -0
  492. sky_templates/__init__.py +3 -0
  493. sky_templates/ray/__init__.py +0 -0
  494. sky_templates/ray/start_cluster +183 -0
  495. sky_templates/ray/stop_cluster +75 -0
  496. skypilot_nightly-1.0.0.dev20251203.dist-info/METADATA +676 -0
  497. skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
  498. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +1 -1
  499. skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
  500. sky/benchmark/benchmark_state.py +0 -256
  501. sky/benchmark/benchmark_utils.py +0 -641
  502. sky/clouds/service_catalog/constants.py +0 -7
  503. sky/dashboard/out/_next/static/GWvVBSCS7FmUiVmjaL1a7/_buildManifest.js +0 -1
  504. sky/dashboard/out/_next/static/chunks/236-2db3ee3fba33dd9e.js +0 -6
  505. sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
  506. sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
  507. sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
  508. sky/dashboard/out/_next/static/chunks/845-9e60713e0c441abc.js +0 -1
  509. sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
  510. sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
  511. sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
  512. sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
  513. sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
  514. sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
  515. sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
  516. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-6ac338bc2239cb45.js +0 -1
  517. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
  518. sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
  519. sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
  520. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-1c519e1afc523dc9.js +0 -1
  521. sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
  522. sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
  523. sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
  524. sky/jobs/dashboard/dashboard.py +0 -223
  525. sky/jobs/dashboard/static/favicon.ico +0 -0
  526. sky/jobs/dashboard/templates/index.html +0 -831
  527. sky/jobs/server/dashboard_utils.py +0 -69
  528. sky/skylet/providers/scp/__init__.py +0 -2
  529. sky/skylet/providers/scp/config.py +0 -149
  530. sky/skylet/providers/scp/node_provider.py +0 -578
  531. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  532. sky/utils/db_utils.py +0 -100
  533. sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
  534. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  535. skypilot_nightly-1.0.0.dev20250502.dist-info/METADATA +0 -361
  536. skypilot_nightly-1.0.0.dev20250502.dist-info/RECORD +0 -396
  537. skypilot_nightly-1.0.0.dev20250502.dist-info/top_level.txt +0 -1
  538. /sky/{clouds/service_catalog → catalog}/config.py +0 -0
  539. /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
  540. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
  541. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
  542. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
  543. /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
  544. /sky/dashboard/out/_next/static/{GWvVBSCS7FmUiVmjaL1a7 → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
  545. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
  546. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
@@ -1,15 +1,16 @@
  """Backend: runs on cloud virtual machines, managed by Ray."""
  import copy
+ import dataclasses
  import enum
- import inspect
  import json
  import math
  import os
  import pathlib
+ import random
  import re
  import shlex
- import shutil
  import signal
+ import socket
  import subprocess
  import sys
  import tempfile
@@ -17,14 +18,14 @@ import textwrap
  import threading
  import time
  import typing
- from typing import (Any, Callable, Dict, Iterable, List, Optional, Set, Tuple,
-                     Union)
+ from typing import (Any, Callable, Dict, Iterable, Iterator, List, Optional,
+                     Set, Tuple, Union)

  import colorama
- import filelock
+ import psutil

- import sky
  from sky import backends
+ from sky import catalog
  from sky import check as sky_check
  from sky import cloud_stores
  from sky import clouds
@@ -37,10 +38,11 @@ from sky import resources as resources_lib
  from sky import sky_logging
  from sky import skypilot_config
  from sky import task as task_lib
+ from sky.adaptors import common as adaptors_common
  from sky.backends import backend_utils
+ from sky.backends import task_codegen
  from sky.backends import wheel_utils
  from sky.clouds import cloud as sky_cloud
- from sky.clouds import service_catalog
  from sky.clouds.utils import gcp_utils
  from sky.data import data_utils
  from sky.data import storage as storage_lib
@@ -48,21 +50,26 @@ from sky.provision import common as provision_common
  from sky.provision import instance_setup
  from sky.provision import metadata_utils
  from sky.provision import provisioner
+ from sky.provision.kubernetes import config as config_lib
  from sky.provision.kubernetes import utils as kubernetes_utils
+ from sky.serve import constants as serve_constants
  from sky.server.requests import requests as requests_lib
  from sky.skylet import autostop_lib
  from sky.skylet import constants
  from sky.skylet import job_lib
  from sky.skylet import log_lib
  from sky.usage import usage_lib
- from sky.utils import accelerator_registry
  from sky.utils import annotations
  from sky.utils import cluster_utils
  from sky.utils import command_runner
  from sky.utils import common
  from sky.utils import common_utils
+ from sky.utils import context_utils
  from sky.utils import controller_utils
+ from sky.utils import directory_utils
  from sky.utils import env_options
+ from sky.utils import lock_events
+ from sky.utils import locks
  from sky.utils import log_utils
  from sky.utils import message_utils
  from sky.utils import registry
@@ -72,9 +79,43 @@ from sky.utils import status_lib
  from sky.utils import subprocess_utils
  from sky.utils import timeline
  from sky.utils import ux_utils
+ from sky.utils import volume as volume_lib
+ from sky.utils import yaml_utils

  if typing.TYPE_CHECKING:
+     import grpc
+
      from sky import dag
+     from sky.schemas.generated import autostopv1_pb2
+     from sky.schemas.generated import autostopv1_pb2_grpc
+     from sky.schemas.generated import jobsv1_pb2
+     from sky.schemas.generated import jobsv1_pb2_grpc
+     from sky.schemas.generated import managed_jobsv1_pb2
+     from sky.schemas.generated import managed_jobsv1_pb2_grpc
+     from sky.schemas.generated import servev1_pb2
+     from sky.schemas.generated import servev1_pb2_grpc
+ else:
+     # To avoid requiring grpcio to be installed on the client side.
+     grpc = adaptors_common.LazyImport(
+         'grpc',
+         # https://github.com/grpc/grpc/issues/37642 to avoid spam in console
+         set_loggers=lambda: os.environ.update({'GRPC_VERBOSITY': 'NONE'})
+         if not env_options.Options.SHOW_DEBUG_INFO.get() else None)
+     autostopv1_pb2 = adaptors_common.LazyImport(
+         'sky.schemas.generated.autostopv1_pb2')
+     autostopv1_pb2_grpc = adaptors_common.LazyImport(
+         'sky.schemas.generated.autostopv1_pb2_grpc')
+     jobsv1_pb2 = adaptors_common.LazyImport('sky.schemas.generated.jobsv1_pb2')
+     jobsv1_pb2_grpc = adaptors_common.LazyImport(
+         'sky.schemas.generated.jobsv1_pb2_grpc')
+     servev1_pb2 = adaptors_common.LazyImport(
+         'sky.schemas.generated.servev1_pb2')
+     servev1_pb2_grpc = adaptors_common.LazyImport(
+         'sky.schemas.generated.servev1_pb2_grpc')
+     managed_jobsv1_pb2 = adaptors_common.LazyImport(
+         'sky.schemas.generated.managed_jobsv1_pb2')
+     managed_jobsv1_pb2_grpc = adaptors_common.LazyImport(
+         'sky.schemas.generated.managed_jobsv1_pb2_grpc')

  Path = str

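Note on the hunk above: the new `else` branch swaps eager imports for lazy proxies, so importing this backend module on a client no longer requires `grpcio` or the generated protobuf stubs to be installed. Below is a minimal illustrative sketch of the lazy-import pattern only; the real helper is `sky.adaptors.common.LazyImport`, whose actual interface and behavior differ.

```python
# Illustrative sketch, not SkyPilot code: defer a module import until first use.
import importlib
from typing import Any, Callable, Optional


class LazyModule:
    """Imports `module_name` only when one of its attributes is accessed."""

    def __init__(self, module_name: str,
                 set_loggers: Optional[Callable[[], None]] = None) -> None:
        self._module_name = module_name
        self._set_loggers = set_loggers
        self._module = None

    def __getattr__(self, name: str) -> Any:
        # Only called for attributes not found on this proxy object itself.
        if self._module is None:
            if self._set_loggers is not None:
                # E.g. set GRPC_VERBOSITY=NONE before the first real import.
                self._set_loggers()
            self._module = importlib.import_module(self._module_name)
        return getattr(self._module, name)


# grpc is resolved on first attribute access, so a machine without grpcio
# installed can still import the module that declares this name.
grpc = LazyModule('grpc')
```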
@@ -96,6 +137,7 @@ _NODES_LAUNCHING_PROGRESS_TIMEOUT = {
      clouds.OCI: 300,
      clouds.Paperspace: 600,
      clouds.Kubernetes: 300,
+     clouds.Shadeform: 300,
      clouds.Vsphere: 240,
  }

@@ -141,12 +183,13 @@ _MAX_RAY_UP_RETRY = 5
  _MAX_GET_ZONE_RETRY = 3

  _JOB_ID_PATTERN = re.compile(r'Job ID: ([0-9]+)')
+ _LOG_DIR_PATTERN = re.compile(r'Log Dir: ([^ ]+)')

  # Path to the monkey-patched ray up script.
  # We don't do import then __file__ because that script needs to be filled in
  # (so import would fail).
  _RAY_UP_WITH_MONKEY_PATCHED_HASH_LAUNCH_CONF_PATH = (
-     pathlib.Path(sky.__file__).resolve().parent / 'backends' /
+     pathlib.Path(directory_utils.get_sky_dir()) / 'backends' /
      'monkey_patches' / 'monkey_patch_ray_up.py')

  # The maximum size of a command line arguments is 128 KB, i.e. the command
@@ -161,10 +204,19 @@ _RAY_UP_WITH_MONKEY_PATCHED_HASH_LAUNCH_CONF_PATH = (
  # We use 100KB as a threshold to be safe for other arguments that
  # might be added during ssh.
  _MAX_INLINE_SCRIPT_LENGTH = 100 * 1024
+ _EXCEPTION_MSG_AND_RETURNCODE_FOR_DUMP_INLINE_SCRIPT = [
+     ('too long', 255),
+     ('request-uri too large', 1),
+     ('request header fields too large', 1),
+     ('400 bad request', 1), # CloudFlare 400 error
+ ]

  _RESOURCES_UNAVAILABLE_LOG = (
      'Reasons for provision failures (for details, please check the log above):')

+ # Number of seconds to wait locking the cluster before communicating with user.
+ _CLUSTER_LOCK_TIMEOUT = 5.0
+

  def _is_command_length_over_limit(command: str) -> bool:
      """Check if the length of the command exceeds the limit.
@@ -178,6 +230,61 @@ def _is_command_length_over_limit(command: str) -> bool:
      return quoted_length > _MAX_INLINE_SCRIPT_LENGTH


+ def _is_message_too_long(returncode: int,
+                          output: Optional[str] = None,
+                          file_path: Optional[str] = None) -> bool:
+     """Check if the message sent to the remote is too long.
+
+     We use inline script to run the setup or run command, i.e. the script will
+     be part of the message sent to the remote cluster. There is a chance that
+     the command is too long, when people has very long run or setup commands, or
+     there is a cloudflare proxy in front of the remote blocking the long
+     message. Several common causes are:
+     - SSH returning: `too long` in the error message.
+     - Cloudflare proxy returning: `414 Request-URI Too Large` or
+       `431 Request Header Fields Too Large` error.
+
+     We use a general length limit check before but it could be inaccurate on
+     some systems, e.g. cloudflare proxy, so this is necessary.
+
+     Args:
+         returncode: The return code of the setup command.
+         output: The output of the setup command.
+         file_path: The path to the setup log file.
+     """
+     assert (output is None) != (file_path is None), (
+         'Either output or file_path must be provided.', output, file_path)
+     to_check = []
+     for (match_str,
+          desired_rc) in _EXCEPTION_MSG_AND_RETURNCODE_FOR_DUMP_INLINE_SCRIPT:
+         if desired_rc == returncode:
+             to_check.append(match_str)
+     if not to_check:
+         return False
+
+     def _check_output_for_match_str(output: str) -> bool:
+         for match_str in to_check:
+             if match_str.lower() in output.lower():
+                 return True
+         return False
+
+     if file_path is not None:
+         try:
+             with open(os.path.expanduser(file_path), 'r',
+                       encoding='utf-8') as f:
+                 content = f.read()
+             return _check_output_for_match_str(content)
+         except Exception as e: # pylint: disable=broad-except
+             # We don't crash the setup if we cannot read the log file.
+             # Instead, we should retry the setup with dumping the script
+             # to a file to be safe.
+             logger.debug(f'Failed to read setup log file {file_path}: {e}')
+             return True
+     else:
+         assert output is not None, (output, file_path)
+         return _check_output_for_match_str(output)
+
+
  def _get_cluster_config_template(cloud):
      cloud_to_template = {
          clouds.AWS: 'aws-ray.yml.j2',
@@ -189,13 +296,18 @@ def _get_cluster_config_template(cloud):
          clouds.SCP: 'scp-ray.yml.j2',
          clouds.OCI: 'oci-ray.yml.j2',
          clouds.Paperspace: 'paperspace-ray.yml.j2',
+         clouds.PrimeIntellect: 'primeintellect-ray.yml.j2',
          clouds.DO: 'do-ray.yml.j2',
          clouds.RunPod: 'runpod-ray.yml.j2',
          clouds.Kubernetes: 'kubernetes-ray.yml.j2',
+         clouds.SSH: 'kubernetes-ray.yml.j2',
+         clouds.Shadeform: 'shadeform-ray.yml.j2',
          clouds.Vsphere: 'vsphere-ray.yml.j2',
          clouds.Vast: 'vast-ray.yml.j2',
          clouds.Fluidstack: 'fluidstack-ray.yml.j2',
-         clouds.Nebius: 'nebius-ray.yml.j2'
+         clouds.Nebius: 'nebius-ray.yml.j2',
+         clouds.Hyperbolic: 'hyperbolic-ray.yml.j2',
+         clouds.Seeweb: 'seeweb-ray.yml.j2'
      }
      return cloud_to_template[type(cloud)]

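Context for the `_MAX_INLINE_SCRIPT_LENGTH`, `_EXCEPTION_MSG_AND_RETURNCODE_FOR_DUMP_INLINE_SCRIPT`, and `_is_message_too_long` additions a few hunks above: setup and run scripts are normally passed inline over SSH, and when the inline command is rejected as too long (SSH's `too long` error, or a proxy such as Cloudflare answering 414/431/400), the backend retries by dumping the script to a file instead. Below is a self-contained restatement of the length pre-check, for illustration only; the quoting step is inferred, since the hunk only shows the final comparison.

```python
import shlex

# Mirrors the constant in the diff: stay well under the ~128 KB command-line
# limit, leaving headroom for the arguments ssh itself adds.
_MAX_INLINE_SCRIPT_LENGTH = 100 * 1024


def is_command_length_over_limit(command: str) -> bool:
    """Approximation of _is_command_length_over_limit from the hunk above."""
    # The script is shell-quoted before being sent, which can only grow it,
    # so the quoted length is what gets compared against the limit.
    return len(shlex.quote(command)) > _MAX_INLINE_SCRIPT_LENGTH


assert not is_command_length_over_limit('echo hello')
assert is_command_length_over_limit('echo ' + 'x' * (200 * 1024))
```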
@@ -225,500 +337,6 @@ def write_ray_up_script_with_patched_launch_hash_fn(
      return f.name


- class RayCodeGen:
-     """Code generator of a Ray program that executes a sky.Task.
-
-     Usage:
-
-       >> codegen = RayCodegen()
-       >> codegen.add_prologue()
-
-       >> codegen.add_ray_task(...)
-       >> codegen.add_ray_task(...)
-
-       >> codegen.add_epilogue()
-       >> code = codegen.build()
-     """
-
-     def __init__(self):
-         # Code generated so far, to be joined via '\n'.
-         self._code = []
-         # Guard method calling order.
-         self._has_prologue = False
-         self._has_epilogue = False
-
-         # For n nodes gang scheduling.
-         self._has_gang_scheduling = False
-         self._num_nodes = 0
-
-         self._has_register_run_fn = False
-
-         # job_id
-         # Job ID is used to identify the job (also this generated code).
-         # It is a int automatically generated by the DB on the cluster
-         # and monotonically increasing starting from 1.
-         # To generate the job ID, we use the following logic:
-         #   code = job_lib.JobLibCodeGen.add_job(username,
-         #                                        run_timestamp)
-         #   job_id = get_output(run_on_cluster(code))
-         self.job_id = None
-
-     def add_prologue(self, job_id: int) -> None:
-         assert not self._has_prologue, 'add_prologue() called twice?'
-         self._has_prologue = True
-         self.job_id = job_id
-         # Should use 'auto' or 'ray://<internal_head_ip>:10001' rather than
-         # 'ray://localhost:10001', or 'ray://127.0.0.1:10001', for public cloud.
-         # Otherwise, ray will fail to get the placement group because of a bug
-         # in ray job.
-         ray_address = 'auto'
-         self._code = [
-             textwrap.dedent(f"""\
-             import getpass
-             import hashlib
-             import io
-             import os
-             import pathlib
-             import selectors
-             import shlex
-             import subprocess
-             import sys
-             import tempfile
-             import textwrap
-             import time
-             from typing import Dict, List, Optional, Tuple, Union
-
-             # Set the environment variables to avoid deduplicating logs and
-             # scheduler events. This should be set in driver code, since we are
-             # not using `ray job submit` anymore, and the environment variables
-             # from the ray cluster is not inherited.
-             os.environ['RAY_DEDUP_LOGS'] = '0'
-             os.environ['RAY_SCHEDULER_EVENTS'] = '0'
-
-             import ray
-             import ray.util as ray_util
-
-             from sky.skylet import autostop_lib
-             from sky.skylet import constants
-             from sky.skylet import job_lib
-             from sky.utils import log_utils
-             from sky.utils import subprocess_utils
-
-             SKY_REMOTE_WORKDIR = {constants.SKY_REMOTE_WORKDIR!r}
-
-             kwargs = dict()
-             # Only set the `_temp_dir` to SkyPilot's ray cluster directory when
-             # the directory exists for backward compatibility for the VM
-             # launched before #1790.
-             if os.path.exists({constants.SKY_REMOTE_RAY_TEMPDIR!r}):
-                 kwargs['_temp_dir'] = {constants.SKY_REMOTE_RAY_TEMPDIR!r}
-             ray.init(
-                 address={ray_address!r},
-                 namespace='__sky__{job_id}__',
-                 log_to_driver=True,
-                 **kwargs
-             )
-             def get_or_fail(futures, pg) -> List[int]:
-                 \"\"\"Wait for tasks, if any fails, cancel all unready.\"\"\"
-                 if not futures:
-                     return []
-                 returncodes = [1] * len(futures)
-                 # Wait for 1 task to be ready.
-                 ready = []
-                 # Keep invoking ray.wait if ready is empty. This is because
-                 # ray.wait with timeout=None will only wait for 10**6 seconds,
-                 # which will cause tasks running for more than 12 days to return
-                 # before becoming ready.
-                 # (Such tasks are common in serving jobs.)
-                 # Reference: https://github.com/ray-project/ray/blob/ray-2.9.3/python/ray/_private/worker.py#L2845-L2846
-                 while not ready:
-                     ready, unready = ray.wait(futures)
-                 idx = futures.index(ready[0])
-                 returncodes[idx] = ray.get(ready[0])
-                 while unready:
-                     if returncodes[idx] != 0:
-                         for task in unready:
-                             # ray.cancel without force fails to kill tasks.
-                             # We use force=True to kill unready tasks.
-                             ray.cancel(task, force=True)
-                             # Use SIGKILL=128+9 to indicate the task is forcely
-                             # killed.
-                             idx = futures.index(task)
-                             returncodes[idx] = 137
-                         break
-                     ready, unready = ray.wait(unready)
-                     idx = futures.index(ready[0])
-                     returncodes[idx] = ray.get(ready[0])
-                 # Remove the placement group after all tasks are done, so that
-                 # the next job can be scheduled on the released resources
-                 # immediately.
-                 ray_util.remove_placement_group(pg)
-                 sys.stdout.flush()
-                 return returncodes
-
-             run_fn = None
-             futures = []
-             """),
-             # FIXME: This is a hack to make sure that the functions can be found
-             # by ray.remote. This should be removed once we have a better way to
-             # specify dependencies for ray.
-             inspect.getsource(log_lib._ProcessingArgs), # pylint: disable=protected-access
-             inspect.getsource(log_lib._handle_io_stream), # pylint: disable=protected-access
-             inspect.getsource(log_lib.process_subprocess_stream),
-             inspect.getsource(log_lib.run_with_log),
-             inspect.getsource(log_lib.make_task_bash_script),
-             inspect.getsource(log_lib.add_ray_env_vars),
-             inspect.getsource(log_lib.run_bash_command_with_log),
-             'run_bash_command_with_log = ray.remote(run_bash_command_with_log)',
-         ]
-         # Currently, the codegen program is/can only be submitted to the head
-         # node, due to using job_lib for updating job statuses, and using
-         # autostop_lib here.
-         self._code.append(
-             # Use hasattr to handle backward compatibility.
-             # TODO(zongheng): remove in ~1-2 minor releases (currently 0.2.x).
-             textwrap.dedent("""\
-             if hasattr(autostop_lib, 'set_last_active_time_to_now'):
-                 autostop_lib.set_last_active_time_to_now()
-             """))
-         self._code += [
-             f'job_lib.set_status({job_id!r}, job_lib.JobStatus.PENDING)',
-         ]
-
-     def add_gang_scheduling_placement_group_and_setup(
-             self,
-             num_nodes: int,
-             resources_dict: Dict[str, float],
-             stable_cluster_internal_ips: List[str],
-             env_vars: Dict[str, str],
-             setup_cmd: Optional[str] = None,
-             setup_log_path: Optional[str] = None,
-     ) -> None:
-         """Create the gang scheduling placement group for a Task.
-
-         cluster_ips_sorted is used to ensure that the SKY_NODE_RANK environment
-         variable is assigned in a deterministic order whenever a new task is
-         added.
-         """
-         assert self._has_prologue, (
-             'Call add_prologue() before '
-             'add_gang_scheduling_placement_group_and_setup().')
-         self._has_gang_scheduling = True
-         self._num_nodes = num_nodes
-
-         bundles = [copy.copy(resources_dict) for _ in range(num_nodes)]
-         # Set CPU to avoid ray hanging the resources allocation
-         # for remote functions, since the task will request 1 CPU
-         # by default.
-         task_cpu_demand = resources_dict.pop('CPU')
-
-         if resources_dict:
-             assert len(resources_dict) == 1, (
-                 'There can only be one type of accelerator per instance. '
-                 f'Found: {resources_dict}.')
-             acc_name, acc_count = list(resources_dict.items())[0]
-             gpu_dict = {'GPU': acc_count}
-             # gpu_dict should be empty when the accelerator is not GPU.
-             # TODO(zongheng,zhanghao): an alternative is to start the remote
-             # cluster with custom resource 'GPU': <n> even if the accelerator(s)
-             # are not GPU. We opt for the current solution for now.
-             if accelerator_registry.is_schedulable_non_gpu_accelerator(
-                     acc_name):
-                 gpu_dict = {}
-             for bundle in bundles:
-                 bundle.update({
-                     # Set the GPU to avoid ray hanging the resources allocation
-                     **gpu_dict,
-                 })
-
-         streaming_message = (
-             f'{ux_utils.INDENT_LAST_SYMBOL}Job started. Streaming logs... '
-             f'{colorama.Style.DIM}(Ctrl-C to exit log streaming; job will not '
-             f'be killed){colorama.Style.RESET_ALL}')
-         self._code += [
-             textwrap.dedent(f"""\
-             pg = ray_util.placement_group({json.dumps(bundles)}, 'STRICT_SPREAD')
-             plural = 's' if {num_nodes} > 1 else ''
-             node_str = f'{num_nodes} node{{plural}}'
-             message = ('{ux_utils.INDENT_SYMBOL}{colorama.Style.DIM}'
-                        'Waiting for task resources on '
-                        f'{{node_str}}.{colorama.Style.RESET_ALL}')
-             print(message, flush=True)
-             # FIXME: This will print the error message from autoscaler if
-             # it is waiting for other task to finish. We should hide the
-             # error message.
-             ray.get(pg.ready())
-             print({streaming_message!r}, flush=True)
-             """)
-         ]
-
-         job_id = self.job_id
-         if setup_cmd is not None:
-             setup_envs = env_vars.copy()
-             setup_envs[constants.SKYPILOT_NUM_NODES] = str(num_nodes)
-             self._code += [
-                 textwrap.dedent(f"""\
-                 setup_cmd = {setup_cmd!r}
-                 _SETUP_CPUS = 0.0001
-                 # The setup command will be run as a ray task with num_cpus=_SETUP_CPUS as the
-                 # requirement; this means Ray will set CUDA_VISIBLE_DEVICES to an empty string.
-                 # We unset it so that user setup command may properly use this env var.
-                 setup_cmd = 'unset CUDA_VISIBLE_DEVICES; ' + setup_cmd
-                 job_lib.set_status({job_id!r}, job_lib.JobStatus.SETTING_UP)
-
-                 # The schedule_step should be called after the job status is set to non-PENDING,
-                 # otherwise, the scheduler will think the current job is not submitted yet, and
-                 # skip the scheduling step.
-                 job_lib.scheduler.schedule_step()
-
-                 total_num_nodes = len(ray.nodes())
-                 setup_bundles = [{{"CPU": _SETUP_CPUS}} for _ in range(total_num_nodes)]
-                 setup_pg = ray.util.placement_group(setup_bundles, strategy='STRICT_SPREAD')
-                 setup_workers = [run_bash_command_with_log \\
-                     .options(
-                         name='setup',
-                         num_cpus=_SETUP_CPUS,
-                         scheduling_strategy=ray.util.scheduling_strategies.PlacementGroupSchedulingStrategy(
-                             placement_group=setup_pg,
-                             placement_group_bundle_index=i)
-                     ) \\
-                     .remote(
-                         setup_cmd,
-                         os.path.expanduser({setup_log_path!r}),
-                         env_vars={setup_envs!r},
-                         stream_logs=True,
-                         with_ray=True,
-                     ) for i in range(total_num_nodes)]
-                 setup_returncodes = get_or_fail(setup_workers, setup_pg)
-                 if sum(setup_returncodes) != 0:
-                     job_lib.set_status({self.job_id!r}, job_lib.JobStatus.FAILED_SETUP)
-                     # This waits for all streaming logs to finish.
-                     time.sleep(1)
-                     print('ERROR: {colorama.Fore.RED}Job {self.job_id}\\'s setup failed with '
-                           'return code list:{colorama.Style.RESET_ALL}',
-                           setup_returncodes,
-                           flush=True)
-                     # Need this to set the job status in ray job to be FAILED.
-                     sys.exit(1)
-                 """)
-             ]
-
-         self._code.append(f'job_lib.set_job_started({self.job_id!r})')
-         if setup_cmd is None:
-             # Need to call schedule_step() to make sure the scheduler
-             # schedule the next pending job.
-             self._code.append('job_lib.scheduler.schedule_step()')
-
-         # Export IP and node rank to the environment variables.
-         self._code += [
-             textwrap.dedent(f"""\
-             @ray.remote
-             def check_ip():
-                 return ray.util.get_node_ip_address()
-             gang_scheduling_id_to_ip = ray.get([
-                 check_ip.options(
-                     num_cpus={task_cpu_demand},
-                     scheduling_strategy=ray.util.scheduling_strategies.PlacementGroupSchedulingStrategy(
-                         placement_group=pg,
-                         placement_group_bundle_index=i
-                     )).remote()
-                 for i in range(pg.bundle_count)
-             ])
-
-             cluster_ips_to_node_id = {{ip: i for i, ip in enumerate({stable_cluster_internal_ips!r})}}
-             job_ip_rank_list = sorted(gang_scheduling_id_to_ip, key=cluster_ips_to_node_id.get)
-             job_ip_rank_map = {{ip: i for i, ip in enumerate(job_ip_rank_list)}}
-             job_ip_list_str = '\\n'.join(job_ip_rank_list)
-             """),
-         ]
-
-     def register_run_fn(self, run_fn: str, run_fn_name: str) -> None:
-         """Register the run function to be run on the remote cluster.
-
-         Args:
-             run_fn: The run function to be run on the remote cluster.
-         """
-         assert self._has_gang_scheduling, (
-             'Call add_gang_scheduling_placement_group_and_setup() '
-             'before register_run_fn().')
-         assert not self._has_register_run_fn, (
-             'register_run_fn() called twice?')
-         self._has_register_run_fn = True
-
-         self._code += [
-             run_fn,
-             f'run_fn = {run_fn_name}',
-         ]
-
-     def add_ray_task(self,
-                      bash_script: Optional[str],
-                      task_name: Optional[str],
-                      ray_resources_dict: Dict[str, float],
-                      log_dir: str,
-                      env_vars: Optional[Dict[str, str]] = None,
-                      gang_scheduling_id: int = 0) -> None:
-         """Generates code for a ray remote task that runs a bash command."""
-         assert self._has_gang_scheduling, (
-             'Call add_gang_scheduling_placement_group_and_setup() before '
-             'add_ray_task().')
-         assert (not self._has_register_run_fn or
-                 bash_script is None), ('bash_script should '
-                                        'be None when run_fn is registered.')
-         task_cpu_demand = ray_resources_dict.pop('CPU')
-         # Build remote_task.options(...)
-         # resources=...
-         # num_gpus=...
-         options = []
-         options.append(f'num_cpus={task_cpu_demand}')
-
-         num_gpus = 0.0
-         if ray_resources_dict:
-             assert len(ray_resources_dict) == 1, (
-                 'There can only be one type of accelerator per instance. '
-                 f'Found: {ray_resources_dict}.')
-             num_gpus = list(ray_resources_dict.values())[0]
-             options.append(f'resources={json.dumps(ray_resources_dict)}')
-
-             resources_key = list(ray_resources_dict.keys())[0]
-             if not accelerator_registry.is_schedulable_non_gpu_accelerator(
-                     resources_key):
-                 # `num_gpus` should be empty when the accelerator is not GPU.
-                 # FIXME: use a set of GPU types, instead of 'tpu' in the key.
-
-                 # Passing this ensures that the Ray remote task gets
-                 # CUDA_VISIBLE_DEVICES set correctly. If not passed, that flag
-                 # would be force-set to empty by Ray.
-                 options.append(f'num_gpus={num_gpus}')
-         options.append(
-             'scheduling_strategy=ray.util.scheduling_strategies.PlacementGroupSchedulingStrategy(' # pylint: disable=line-too-long
-             'placement_group=pg, '
-             f'placement_group_bundle_index={gang_scheduling_id})')
-
-         sky_env_vars_dict_str = [
-             textwrap.dedent(f"""\
-             sky_env_vars_dict = {{}}
-             sky_env_vars_dict['{constants.SKYPILOT_NODE_IPS}'] = job_ip_list_str
-             sky_env_vars_dict['{constants.SKYPILOT_NUM_NODES}'] = len(job_ip_rank_list)
-             """)
-         ]
-
-         if env_vars is not None:
-             sky_env_vars_dict_str.extend(f'sky_env_vars_dict[{k!r}] = {v!r}'
-                                          for k, v in env_vars.items())
-         sky_env_vars_dict_str = '\n'.join(sky_env_vars_dict_str)
-
-         options_str = ', '.join(options)
-         logger.debug('Added Task with options: '
-                      f'{options_str}')
-         # Script to block completion of a job until all storage mounted with
-         # CACHED_MOUNT mode is uploaded to remote.
-         rclone_flush_script = textwrap.dedent(f"""\
-
-             if [ $(findmnt -t fuse.rclone --noheading | wc -l) -gt 0 ]; then
-                 flushed=0
-                 # extra second on top of --vfs-cache-poll-interval to
-                 # avoid race condition between rclone log line creation and this check.
-                 sleep 1
-                 while [ $flushed -eq 0 ]; do
-                     # sleep for the same interval as --vfs-cache-poll-interval
-                     sleep {constants.RCLONE_CACHE_REFRESH_INTERVAL}
-                     flushed=1
-                     for file in {constants.RCLONE_LOG_DIR}/*; do
-                         exitcode=0
-                         tac $file | grep "vfs cache: cleaned:" -m 1 | grep "in use 0, to upload 0, uploading 0" -q || exitcode=$?
-                         if [ $exitcode -ne 0 ]; then
-                             echo "skypilot: cached mount is still uploading to remote"
-                             flushed=0
-                             break
-                         fi
-                     done
-                 done
-                 echo "skypilot: cached mount uploaded complete"
-             fi""")
-         self._code += [
-             sky_env_vars_dict_str,
-             textwrap.dedent(f"""\
-             script = {bash_script!r}
-             rclone_flush_script = {rclone_flush_script!r}
-             if run_fn is not None:
-                 script = run_fn({gang_scheduling_id}, gang_scheduling_id_to_ip)
-
-             if script is not None:
-                 script += rclone_flush_script
-                 sky_env_vars_dict['{constants.SKYPILOT_NUM_GPUS_PER_NODE}'] = {int(math.ceil(num_gpus))!r}
-
-                 ip = gang_scheduling_id_to_ip[{gang_scheduling_id!r}]
-                 rank = job_ip_rank_map[ip]
-
-                 if len(cluster_ips_to_node_id) == 1: # Single-node task on single-node cluter
-                     name_str = '{task_name},' if {task_name!r} != None else 'task,'
-                     log_path = os.path.expanduser(os.path.join({log_dir!r}, 'run.log'))
-                 else: # Single-node or multi-node task on multi-node cluster
-                     idx_in_cluster = cluster_ips_to_node_id[ip]
-                     if cluster_ips_to_node_id[ip] == 0:
-                         node_name = 'head'
-                     else:
-                         node_name = f'worker{{idx_in_cluster}}'
-                     name_str = f'{{node_name}}, rank={{rank}},'
-                     log_path = os.path.expanduser(os.path.join({log_dir!r}, f'{{rank}}-{{node_name}}.log'))
-                 sky_env_vars_dict['{constants.SKYPILOT_NODE_RANK}'] = rank
-
-                 sky_env_vars_dict['SKYPILOT_INTERNAL_JOB_ID'] = {self.job_id}
-
-                 futures.append(run_bash_command_with_log \\
-                         .options(name=name_str, {options_str}) \\
-                         .remote(
-                             script,
-                             log_path,
-                             env_vars=sky_env_vars_dict,
-                             stream_logs=True,
-                             with_ray=True,
-                         ))""")
-         ]
-
-     def add_epilogue(self) -> None:
-         """Generates code that waits for all tasks, then exits."""
-         assert self._has_prologue, 'Call add_prologue() before add_epilogue().'
-         assert not self._has_epilogue, 'add_epilogue() called twice?'
-         self._has_epilogue = True
-
-         self._code += [
-             textwrap.dedent(f"""\
-             returncodes = get_or_fail(futures, pg)
-             if sum(returncodes) != 0:
-                 job_lib.set_status({self.job_id!r}, job_lib.JobStatus.FAILED)
-                 # Schedule the next pending job immediately to make the job
-                 # scheduling more efficient.
-                 job_lib.scheduler.schedule_step()
-                 # This waits for all streaming logs to finish.
-                 time.sleep(0.5)
-                 reason = ''
-                 # 139 is the return code of SIGSEGV, i.e. Segmentation Fault.
-                 if any(r == 139 for r in returncodes):
-                     reason = '(likely due to Segmentation Fault)'
-                 print('ERROR: {colorama.Fore.RED}Job {self.job_id} failed with '
-                       'return code list:{colorama.Style.RESET_ALL}',
-                       returncodes,
-                       reason,
-                       flush=True)
-                 # Need this to set the job status in ray job to be FAILED.
-                 sys.exit(1)
-             else:
-                 job_lib.set_status({self.job_id!r}, job_lib.JobStatus.SUCCEEDED)
-                 # Schedule the next pending job immediately to make the job
-                 # scheduling more efficient.
-                 job_lib.scheduler.schedule_step()
-                 # This waits for all streaming logs to finish.
-                 time.sleep(0.5)
-             """)
-         ]
-
-     def build(self) -> str:
-         """Returns the entire generated program."""
-         assert self._has_epilogue, 'Call add_epilogue() before build().'
-         return '\n'.join(self._code)
-
-
  class GangSchedulingStatus(enum.Enum):
      """Enum for gang scheduling status."""
      CLUSTER_READY = 0
@@ -778,34 +396,6 @@ class FailoverCloudErrorHandlerV1:
778
396
  setattr(e, 'detailed_reason', detailed_reason)
779
397
  raise e
780
398
 
781
- @staticmethod
782
- def _scp_handler(blocked_resources: Set['resources_lib.Resources'],
783
- launchable_resources: 'resources_lib.Resources',
784
- region: 'clouds.Region',
785
- zones: Optional[List['clouds.Zone']], stdout: str,
786
- stderr: str):
787
- del zones # Unused.
788
- errors = FailoverCloudErrorHandlerV1._handle_errors(
789
- stdout,
790
- stderr,
791
- is_error_str_known=lambda x: 'SCPError:' in x.strip())
792
-
793
- logger.warning(f'Got error(s) in {region.name}:')
794
- messages = '\n\t'.join(errors)
795
- style = colorama.Style
796
- logger.warning(f'{style.DIM}\t{messages}{style.RESET_ALL}')
797
- _add_to_blocked_resources(blocked_resources,
798
- launchable_resources.copy(zone=None))
799
-
800
- # Sometimes, SCPError will list available regions.
801
- for e in errors:
802
- if e.find('Regions with capacity available:') != -1:
803
- for r in service_catalog.regions('scp'):
804
- if e.find(r.name) == -1:
805
- _add_to_blocked_resources(
806
- blocked_resources,
807
- launchable_resources.copy(region=r.name, zone=None))
808
-
809
399
  @staticmethod
810
400
  def _ibm_handler(blocked_resources: Set['resources_lib.Resources'],
811
401
  launchable_resources: 'resources_lib.Resources',
@@ -1085,7 +675,7 @@ class FailoverCloudErrorHandlerV2:
1085
675
  output = str(error)
1086
676
  # Sometimes, lambda cloud error will list available regions.
1087
677
  if output.find('Regions with capacity available:') != -1:
1088
- for r in service_catalog.regions('lambda'):
678
+ for r in catalog.regions('lambda'):
1089
679
  if output.find(r.name) == -1:
1090
680
  _add_to_blocked_resources(
1091
681
  blocked_resources,
@@ -1109,6 +699,21 @@ class FailoverCloudErrorHandlerV2:
1109
699
  FailoverCloudErrorHandlerV2._default_handler(
1110
700
  blocked_resources, launchable_resources, region, zones, error)
1111
701
 
702
+ @staticmethod
703
+ def _scp_handler(blocked_resources: Set['resources_lib.Resources'],
704
+ launchable_resources: 'resources_lib.Resources',
705
+ region: 'clouds.Region',
706
+ zones: Optional[List['clouds.Zone']],
707
+ error: Exception) -> None:
708
+ logger.info(f'SCP handler error: {error}')
709
+ # Block SCP if the credential has expired.
710
+ if isinstance(error, exceptions.InvalidCloudCredentials):
711
+ _add_to_blocked_resources(
712
+ blocked_resources, resources_lib.Resources(cloud=clouds.SCP()))
713
+ else:
714
+ FailoverCloudErrorHandlerV2._default_handler(
715
+ blocked_resources, launchable_resources, region, zones, error)
716
+
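The new `_scp_handler` above blocks the entire SCP cloud when the error is an expired credential (`exceptions.InvalidCloudCredentials`) and otherwise falls back to `_default_handler`. A minimal sketch of that dispatch-by-exception-type idea, with hypothetical names (`CredentialExpiredError`, `block_whole_cloud`) standing in for the real SkyPilot types:

from typing import Callable, Dict, Set, Type


class CredentialExpiredError(Exception):
    """Hypothetical stand-in for an expired-credential error."""


def block_whole_cloud(blocked: Set[str], cloud: str, error: Exception) -> None:
    # No region can succeed if the credential itself is invalid,
    # so block the cloud as a whole.
    blocked.add(cloud)


def block_current_region(blocked: Set[str], cloud: str, error: Exception) -> None:
    # Default behaviour: only skip the location that just failed.
    blocked.add(f'{cloud}:current-region')


_HANDLERS: Dict[Type[BaseException],
                Callable[[Set[str], str, Exception], None]] = {
    CredentialExpiredError: block_whole_cloud,
}


def update_blocklist_on_error(blocked: Set[str], cloud: str,
                              error: Exception) -> None:
    for exc_type, handler in _HANDLERS.items():
        if isinstance(error, exc_type):
            handler(blocked, cloud, error)
            return
    block_current_region(blocked, cloud, error)


if __name__ == '__main__':
    blocked: Set[str] = set()
    update_blocklist_on_error(blocked, 'scp', CredentialExpiredError('expired'))
    update_blocklist_on_error(blocked, 'aws', RuntimeError('no capacity'))
    print(blocked)  # {'scp', 'aws:current-region'}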
1112
717
  @staticmethod
1113
718
  def _default_handler(blocked_resources: Set['resources_lib.Resources'],
1114
719
  launchable_resources: 'resources_lib.Resources',
@@ -1176,7 +781,8 @@ class RetryingVmProvisioner(object):
1176
781
  local_wheel_path: pathlib.Path,
1177
782
  wheel_hash: str,
1178
783
  blocked_resources: Optional[Iterable[
1179
- resources_lib.Resources]] = None):
784
+ resources_lib.Resources]] = None,
785
+ is_managed: Optional[bool] = None):
1180
786
  self._blocked_resources: Set[resources_lib.Resources] = set()
1181
787
  if blocked_resources:
1182
788
  # blocked_resources is not None and not empty.
@@ -1188,6 +794,7 @@ class RetryingVmProvisioner(object):
1188
794
  self._requested_features = requested_features
1189
795
  self._local_wheel_path = local_wheel_path
1190
796
  self._wheel_hash = wheel_hash
797
+ self._is_managed = is_managed
1191
798
 
1192
799
  def _yield_zones(
1193
800
  self, to_provision: resources_lib.Resources, num_nodes: int,
@@ -1232,7 +839,8 @@ class RetryingVmProvisioner(object):
1232
839
  assert isinstance(handle, CloudVmRayResourceHandle), (
1233
840
  'handle should be CloudVmRayResourceHandle (found: '
1234
841
  f'{type(handle)}) {cluster_name!r}')
1235
- config = common_utils.read_yaml(handle.cluster_yaml)
842
+ config = global_user_state.get_cluster_yaml_dict(
843
+ handle.cluster_yaml)
1236
844
  # This is for the case when the zone field is not set in the
1237
845
  # launched resources in a previous launch (e.g., ctrl-c during
1238
846
  # launch and multi-node cluster before PR #1700).
@@ -1316,6 +924,34 @@ class RetryingVmProvisioner(object):
1316
924
  zones = [clouds.Zone(name=to_provision.zone)]
1317
925
  yield zones
1318
926
 
927
+ def _insufficient_resources_msg(
928
+ self,
929
+ to_provision: resources_lib.Resources,
930
+ requested_resources: Set[resources_lib.Resources],
931
+ insufficient_resources: Optional[List[str]],
932
+ ) -> str:
933
+ insufficent_resource_msg = ('' if insufficient_resources is None else
934
+ f' ({", ".join(insufficient_resources)})')
935
+ message = f'Failed to acquire resources{insufficent_resource_msg} '
936
+ if to_provision.zone is not None:
937
+ message += (f'in {to_provision.zone} for {requested_resources}. ')
938
+ elif to_provision.region is not None and to_provision.cloud is not None:
939
+ # For public clouds, provision.region is always set.
940
+ if clouds.SSH().is_same_cloud(to_provision.cloud):
941
+ message += (
942
+ f'in SSH Node Pool ({to_provision.region.lstrip("ssh-")}) '
943
+ f'for {requested_resources}. The SSH Node Pool may not '
944
+ 'have enough resources.')
945
+ elif clouds.Kubernetes().is_same_cloud(to_provision.cloud):
946
+ message += (f'in context {to_provision.region} for '
947
+ f'{requested_resources}. ')
948
+ else:
949
+ message += (f'in all zones in {to_provision.region} for '
950
+ f'{requested_resources}. ')
951
+ else:
952
+ message += (f'{to_provision.cloud} for {requested_resources}. ')
953
+ return message
954
+
1319
955
  def _retry_zones(
1320
956
  self,
1321
957
  to_provision: resources_lib.Resources,
@@ -1329,6 +965,7 @@ class RetryingVmProvisioner(object):
1329
965
  prev_handle: Optional['CloudVmRayResourceHandle'],
1330
966
  prev_cluster_ever_up: bool,
1331
967
  skip_if_config_hash_matches: Optional[str],
968
+ volume_mounts: Optional[List[volume_lib.VolumeMount]],
1332
969
  ) -> Dict[str, Any]:
1333
970
  """The provision retry loop.
1334
971
 
@@ -1349,12 +986,17 @@ class RetryingVmProvisioner(object):
1349
986
  if not dryrun:
1350
987
  os.makedirs(os.path.expanduser(self.log_dir), exist_ok=True)
1351
988
  os.system(f'touch {log_path}')
989
+
1352
990
  rich_utils.force_update_status(
1353
- ux_utils.spinner_message('Launching', log_path))
991
+ ux_utils.spinner_message('Launching',
992
+ log_path,
993
+ cluster_name=cluster_name))
1354
994
 
1355
995
  # Get previous cluster status
1356
996
  cluster_exists = prev_cluster_status is not None
1357
997
 
998
+ to_provision = to_provision.assert_launchable()
999
+
1358
1000
  assert to_provision.region is not None, (
1359
1001
  to_provision, 'region should have been set by the optimizer.')
1360
1002
  region = clouds.Region(to_provision.region)
@@ -1388,6 +1030,7 @@ class RetryingVmProvisioner(object):
1388
1030
  f'To request quotas, check the instruction: '
1389
1031
  f'https://docs.skypilot.co/en/latest/cloud-setup/quota.html.')
1390
1032
 
1033
+ insufficient_resources = None
1391
1034
  for zones in self._yield_zones(to_provision, num_nodes, cluster_name,
1392
1035
  prev_cluster_status,
1393
1036
  prev_cluster_ever_up):
@@ -1432,7 +1075,9 @@ class RetryingVmProvisioner(object):
1432
1075
  region=region,
1433
1076
  zones=zones,
1434
1077
  dryrun=dryrun,
1435
- keep_launch_fields_in_existing_config=cluster_exists)
1078
+ keep_launch_fields_in_existing_config=cluster_exists,
1079
+ volume_mounts=volume_mounts,
1080
+ )
1436
1081
  except exceptions.ResourcesUnavailableError as e:
1437
1082
  # Failed due to catalog issue, e.g. image not found, or
1438
1083
  # GPUs are requested in a Kubernetes cluster but the cluster
@@ -1515,8 +1160,17 @@ class RetryingVmProvisioner(object):
1515
1160
  cluster_handle=handle,
1516
1161
  requested_resources=requested_resources,
1517
1162
  ready=False,
1163
+ is_managed=self._is_managed,
1164
+ provision_log_path=log_abs_path,
1518
1165
  )
1519
1166
 
1167
+ # Add cluster event for actual provisioning start.
1168
+ global_user_state.add_cluster_event(
1169
+ cluster_name, status_lib.ClusterStatus.INIT,
1170
+ f'Provisioning on {to_provision.cloud.display_name()} ' +
1171
+ f'in {to_provision.region}',
1172
+ global_user_state.ClusterEventType.STATUS_CHANGE)
1173
+
1520
1174
  global_user_state.set_owner_identity_for_cluster(
1521
1175
  cluster_name, cloud_user_identity)
1522
1176
 
@@ -1543,11 +1197,13 @@ class RetryingVmProvisioner(object):
1543
1197
  controller_str = ('' if controller is None else
1544
1198
  f' {controller.value.name}')
1545
1199
  if isinstance(to_provision.cloud, clouds.Kubernetes):
1546
- # Omit the region name for Kubernetes.
1200
+ suffix = '.'
1201
+ if region.name.startswith('ssh-'):
1202
+ suffix = f' ({region.name.lstrip("ssh-")})'
1547
1203
  logger.info(
1548
1204
  ux_utils.starting_message(
1549
1205
  f'Launching{controller_str} on '
1550
- f'{to_provision.cloud}.'))
1206
+ f'{to_provision.cloud}{suffix}'))
1551
1207
  else:
1552
1208
  logger.info(
1553
1209
  ux_utils.starting_message(
@@ -1587,6 +1243,24 @@ class RetryingVmProvisioner(object):
1587
1243
  # No teardown happens for this error.
1588
1244
  with ux_utils.print_exception_no_traceback():
1589
1245
  raise
1246
+ except config_lib.KubernetesError as e:
1247
+ if e.insufficent_resources:
1248
+ insufficient_resources = e.insufficent_resources
1249
+ # NOTE: We try to cleanup the cluster even if the previous
1250
+ # cluster does not exist. Also we are fast at
1251
+ # cleaning up clusters now if there is no existing node.
1252
+ CloudVmRayBackend().post_teardown_cleanup(
1253
+ handle,
1254
+ terminate=not prev_cluster_ever_up,
1255
+ remove_from_db=False,
1256
+ failover=True,
1257
+ )
1258
+ # TODO(suquark): other clouds may have different zone
1259
+ # blocking strategy. See '_update_blocklist_on_error'
1260
+ # for details.
1261
+ FailoverCloudErrorHandlerV2.update_blocklist_on_error(
1262
+ self._blocked_resources, to_provision, region, zones, e)
1263
+ continue
1590
1264
  except Exception as e: # pylint: disable=broad-except
1591
1265
  # NOTE: We try to cleanup the cluster even if the previous
1592
1266
  # cluster does not exist. Also we are fast at
@@ -1594,7 +1268,8 @@ class RetryingVmProvisioner(object):
1594
1268
  CloudVmRayBackend().post_teardown_cleanup(
1595
1269
  handle,
1596
1270
  terminate=not prev_cluster_ever_up,
1597
- remove_from_db=False)
1271
+ remove_from_db=False,
1272
+ failover=True)
1598
1273
  # TODO(suquark): other clouds may have different zone
1599
1274
  # blocking strategy. See '_update_blocklist_on_error'
1600
1275
  # for details.
@@ -1650,7 +1325,9 @@ class RetryingVmProvisioner(object):
1650
1325
  config_dict['handle'] = handle
1651
1326
  logger.info(
1652
1327
  ux_utils.finishing_message(
1653
- f'Cluster launched: {cluster_name!r}.', log_path))
1328
+ f'Cluster launched: {cluster_name!r}.',
1329
+ log_path,
1330
+ cluster_name=cluster_name))
1654
1331
  return config_dict
1655
1332
 
1656
1333
  # The cluster is not ready. We must perform error recording and/or
@@ -1714,17 +1391,9 @@ class RetryingVmProvisioner(object):
1714
1391
  terminate=terminate_or_stop,
1715
1392
  remove_from_db=False)
1716
1393
 
1717
- if to_provision.zone is not None:
1718
- message = (
1719
- f'Failed to acquire resources in {to_provision.zone} for '
1720
- f'{requested_resources}. ')
1721
- elif to_provision.region is not None:
1722
- # For public clouds, provision.region is always set.
1723
- message = ('Failed to acquire resources in all zones in '
1724
- f'{to_provision.region} for {requested_resources}. ')
1725
- else:
1726
- message = (f'Failed to acquire resources in {to_provision.cloud} '
1727
- f'for {requested_resources}. ')
1394
+ message = self._insufficient_resources_msg(to_provision,
1395
+ requested_resources,
1396
+ insufficient_resources)
1728
1397
  # Do not failover to other locations if the cluster was ever up, since
1729
1398
  # the user can have some data on the cluster.
1730
1399
  raise exceptions.ResourcesUnavailableError(
@@ -1775,7 +1444,8 @@ class RetryingVmProvisioner(object):
1775
1444
  log_abs_path,
1776
1445
  stream_logs=False,
1777
1446
  start_streaming_at='Shared connection to',
1778
- line_processor=log_utils.RayUpLineProcessor(log_abs_path),
1447
+ line_processor=log_utils.RayUpLineProcessor(
1448
+ log_abs_path, cluster_name=cluster_handle.cluster_name),
1779
1449
  # Reduce BOTO_MAX_RETRIES from 12 to 5 to avoid long hanging
1780
1450
  # time during 'ray up' if insufficient capacity occurs.
1781
1451
  env=dict(
@@ -1919,9 +1589,10 @@ class RetryingVmProvisioner(object):
1919
1589
  # ready to ensure cluster will not scale up after preemption (spot).
1920
1590
  # Skip for non-spot as this takes extra time to provision (~1min).
1921
1591
  if use_spot:
1922
- ray_config = common_utils.read_yaml(cluster_config_file)
1592
+ ray_config = global_user_state.get_cluster_yaml_dict(
1593
+ cluster_config_file)
1923
1594
  ray_config['upscaling_speed'] = 0
1924
- common_utils.dump_yaml(cluster_config_file, ray_config)
1595
+ yaml_utils.dump_yaml(cluster_config_file, ray_config)
1925
1596
  start = time.time()
1926
1597
  returncode, stdout, stderr = ray_up()
1927
1598
  logger.debug(
@@ -2030,6 +1701,7 @@ class RetryingVmProvisioner(object):
2030
1701
  f' that never expire or a service account.\033[0m')
2031
1702
  logger.warning(warnings)
2032
1703
 
1704
+ to_provision = to_provision.assert_launchable()
2033
1705
  # Retrying launchable resources.
2034
1706
  while True:
2035
1707
  try:
@@ -2068,7 +1740,9 @@ class RetryingVmProvisioner(object):
2068
1740
  prev_cluster_status=prev_cluster_status,
2069
1741
  prev_handle=prev_handle,
2070
1742
  prev_cluster_ever_up=prev_cluster_ever_up,
2071
- skip_if_config_hash_matches=skip_if_config_hash_matches)
1743
+ skip_if_config_hash_matches=skip_if_config_hash_matches,
1744
+ volume_mounts=task.volume_mounts,
1745
+ )
2072
1746
  if dryrun:
2073
1747
  return config_dict
2074
1748
  except (exceptions.InvalidClusterNameError,
@@ -2115,8 +1789,6 @@ class RetryingVmProvisioner(object):
2115
1789
  # terminated by _retry_zones().
2116
1790
  assert (prev_cluster_status == status_lib.ClusterStatus.INIT
2117
1791
  ), prev_cluster_status
2118
- assert global_user_state.get_handle_from_cluster_name(
2119
- cluster_name) is None, cluster_name
2120
1792
  logger.info(
2121
1793
  ux_utils.retry_message(
2122
1794
  f'Retrying provisioning with requested resources: '
@@ -2151,20 +1823,45 @@ class RetryingVmProvisioner(object):
2151
1823
  # possible resources or the requested resources is too
2152
1824
  # restrictive. If we reach here, our failover logic finally
2153
1825
  # ends here.
2154
- table = log_utils.create_table(['Resource', 'Reason'])
1826
+ table = log_utils.create_table(['INFRA', 'RESOURCES', 'REASON'])
2155
1827
  for (resource, exception) in resource_exceptions.items():
2156
- table.add_row(
2157
- [resources_utils.format_resource(resource), exception])
2158
- table.max_table_width = shutil.get_terminal_size().columns
1828
+ table.add_row([
1829
+ resource.infra.formatted_str(),
1830
+ resources_utils.format_resource(
1831
+ resource, simplified_only=True)[0], exception
1832
+ ])
1833
+ # Set the max width of REASON column to 80 to avoid the table
1834
+ # being wrapped in an unreadable way.
1835
+ # pylint: disable=protected-access
1836
+ table._max_width = {'REASON': 80}
2159
1837
  raise exceptions.ResourcesUnavailableError(
2160
1838
  _RESOURCES_UNAVAILABLE_LOG + '\n' + table.get_string(),
2161
1839
  failover_history=failover_history)
2162
- to_provision = task.best_resources
1840
+ best_resources = task.best_resources
2163
1841
  assert task in self._dag.tasks, 'Internal logic error.'
2164
- assert to_provision is not None, task
1842
+ assert best_resources is not None, task
1843
+ to_provision = best_resources
2165
1844
  return config_dict
2166
1845
 
2167
1846
 
1847
+ @dataclasses.dataclass
1848
+ class SSHTunnelInfo:
1849
+ port: int
1850
+ pid: int
1851
+
1852
+
1853
+ def _is_tunnel_healthy(tunnel: SSHTunnelInfo) -> bool:
1854
+ try:
1855
+ with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
1856
+ s.settimeout(0.5)
1857
+ s.connect(('localhost', tunnel.port))
1858
+ return True
1859
+ except socket.error as e:
1860
+ logger.warning(f'Failed to connect to tunnel on port {tunnel.port}: '
1861
+ f'{common_utils.format_exception(e)}')
1862
+ return False
1863
+
1864
+
2168
1865
  class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
2169
1866
  """A pickle-able handle to a cluster created by CloudVmRayBackend.
2170
1867
 
@@ -2184,10 +1881,11 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
2184
1881
  - (optional) Launched resources
2185
1882
  - (optional) Docker user name
2186
1883
  - (optional) If TPU(s) are managed, a path to a deletion script.
1884
+ - (optional) Skylet SSH tunnel info.
2187
1885
  """
2188
1886
  # Bump if any fields get added/removed/changed, and add backward
2189
- # compaitibility logic in __setstate__.
2190
- _VERSION = 10
1887
+ # compatibility logic in __setstate__ and/or __getstate__.
1888
+ _VERSION = 12
2191
1889
 
2192
1890
  def __init__(
2193
1891
  self,
@@ -2220,6 +1918,7 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
2220
1918
  self.launched_nodes = launched_nodes
2221
1919
  self.launched_resources = launched_resources
2222
1920
  self.docker_user: Optional[str] = None
1921
+ self.is_grpc_enabled = True
2223
1922
 
2224
1923
  def __repr__(self):
2225
1924
  return (f'ResourceHandle('
@@ -2235,17 +1934,21 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
2235
1934
  f'\n\tlaunched_resources={self.launched_nodes}x '
2236
1935
  f'{self.launched_resources}, '
2237
1936
  f'\n\tdocker_user={self.docker_user},'
2238
- f'\n\tssh_user={self.ssh_user}')
1937
+ f'\n\tssh_user={self.ssh_user},'
1938
+ f'\n\tis_grpc_enabled={self.is_grpc_enabled},')
2239
1939
 
2240
1940
  def get_cluster_name(self):
2241
1941
  return self.cluster_name
2242
1942
 
1943
+ def get_cluster_name_on_cloud(self):
1944
+ return self.cluster_name_on_cloud
1945
+
2243
1946
  def _use_internal_ips(self):
2244
1947
  """Returns whether to use internal IPs for SSH connections."""
2245
1948
  # Directly load the `use_internal_ips` flag from the cluster yaml
2246
1949
  # instead of `skypilot_config` as the latter can be changed after the
2247
1950
  # cluster is UP.
2248
- return common_utils.read_yaml(self.cluster_yaml).get(
1951
+ return global_user_state.get_cluster_yaml_dict(self.cluster_yaml).get(
2249
1952
  'provider', {}).get('use_internal_ips', False)
2250
1953
 
2251
1954
  def update_ssh_ports(self, max_attempts: int = 1) -> None:
@@ -2266,15 +1969,20 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
2266
1969
  def _update_cluster_info(self):
2267
1970
  # When a cluster is on a cloud that does not support the new
2268
1971
  # provisioner, we should skip updating cluster_info.
2269
- if (self.launched_resources.cloud.PROVISIONER_VERSION >=
1972
+ if (self.launched_resources.cloud is not None and
1973
+ self.launched_resources.cloud.PROVISIONER_VERSION >=
2270
1974
  clouds.ProvisionerVersion.SKYPILOT):
2271
1975
  provider_name = str(self.launched_resources.cloud).lower()
2272
1976
  config = {}
2273
- if os.path.exists(self.cluster_yaml):
2274
- # It is possible that the cluster yaml is not available when
2275
- # the handle is unpickled for service replicas from the
2276
- # controller with older version.
2277
- config = common_utils.read_yaml(self.cluster_yaml)
1977
+ # It is possible that the cluster yaml is not available when
1978
+ # the handle is unpickled for service replicas from the
1979
+ # controller with older version.
1980
+ yaml_str = global_user_state.get_cluster_yaml_str(self.cluster_yaml)
1981
+ if yaml_str is None:
1982
+ # If the cluster yaml is not available,
1983
+ # we skip updating the cluster info.
1984
+ return
1985
+ config = yaml_utils.safe_load(yaml_str)
2278
1986
  try:
2279
1987
  cluster_info = provision_lib.get_cluster_info(
2280
1988
  provider_name,
@@ -2410,12 +2118,23 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
2410
2118
  zip(cluster_internal_ips, cluster_feasible_ips))
2411
2119
 
2412
2120
  # Ensure head node is the first element, then sort based on the
2413
- # external IPs for stableness
2414
- stable_internal_external_ips = [internal_external_ips[0]] + sorted(
2415
- internal_external_ips[1:], key=lambda x: x[1])
2121
+ # external IPs for stableness. Skip for k8s nodes since pods
2122
+ # worker ids are already mapped.
2123
+ if (cluster_info is not None and
2124
+ cluster_info.provider_name == 'kubernetes'):
2125
+ stable_internal_external_ips = internal_external_ips
2126
+ else:
2127
+ stable_internal_external_ips = [internal_external_ips[0]] + sorted(
2128
+ internal_external_ips[1:], key=lambda x: x[1])
2416
2129
  self.stable_internal_external_ips = stable_internal_external_ips
2417
2130
 
2418
- @annotations.lru_cache(scope='global')
2131
+ @context_utils.cancellation_guard
2132
+ # we expect different requests to be acting on different clusters
2133
+ # (= different handles) so we have no real expectation of cache hit
2134
+ # across requests.
2135
+ # Do not change this cache to global scope
2136
+ # without understanding https://github.com/skypilot-org/skypilot/pull/6908
2137
+ @annotations.lru_cache(scope='request', maxsize=10)
2419
2138
  @timeline.event
2420
2139
  def get_command_runners(self,
2421
2140
  force_cached: bool = False,
@@ -2426,19 +2145,21 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
2426
2145
  self.cluster_yaml, self.docker_user, self.ssh_user)
2427
2146
  if avoid_ssh_control:
2428
2147
  ssh_credentials.pop('ssh_control_name', None)
2148
+
2149
+ launched_resources = self.launched_resources.assert_launchable()
2429
2150
  updated_to_skypilot_provisioner_after_provisioned = (
2430
- self.launched_resources.cloud.PROVISIONER_VERSION >=
2151
+ launched_resources.cloud.PROVISIONER_VERSION >=
2431
2152
  clouds.ProvisionerVersion.SKYPILOT and
2432
2153
  self.cached_external_ips is not None and
2433
2154
  self.cached_cluster_info is None)
2434
2155
  if updated_to_skypilot_provisioner_after_provisioned:
2435
2156
  logger.debug(
2436
- f'{self.launched_resources.cloud} has been updated to the new '
2157
+ f'{launched_resources.cloud} has been updated to the new '
2437
2158
  f'provisioner after cluster {self.cluster_name} was '
2438
2159
  f'provisioned. Cached IPs are used for connecting to the '
2439
2160
  'cluster.')
2440
2161
  if (clouds.ProvisionerVersion.RAY_PROVISIONER_SKYPILOT_TERMINATOR >=
2441
- self.launched_resources.cloud.PROVISIONER_VERSION or
2162
+ launched_resources.cloud.PROVISIONER_VERSION or
2442
2163
  updated_to_skypilot_provisioner_after_provisioned):
2443
2164
  ip_list = (self.cached_external_ips
2444
2165
  if force_cached else self.external_ips())
@@ -2464,6 +2185,21 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
2464
2185
  'Tried to use cached cluster info, but it\'s missing for '
2465
2186
  f'cluster "{self.cluster_name}"')
2466
2187
  self._update_cluster_info()
2188
+ # For Kubernetes, `KubernetesCommandRunner` wants to get the pod names
2189
+ # to run the command. But for high availability serve controller,
2190
+ # the controller pod is part of a deployment, and once the pod is
2191
+ # killed and a new one is created, the pod name changes, so we need
2192
+ # to manually update the cluster info here.
2193
+ # TODO(andyl): See if we can prevent this refresh. Like pass in
2194
+ # deployment name as identifier for KubernetesCommandRunner. Now this
2195
+ # is required for rsync as using deployment in rsync seems to cause
2196
+ # some unknown issues.
2197
+ # TODO(andyl): Should check through the real cluster info. Same as
2198
+ # the TODO in kubernetes/instance.py:terminate_instances
2199
+ if (isinstance(self.launched_resources.cloud, clouds.Kubernetes) and
2200
+ controller_utils.high_availability_specified(
2201
+ self.cluster_name)):
2202
+ self._update_cluster_info()
2467
2203
 
2468
2204
  assert self.cached_cluster_info is not None, self
2469
2205
  runners = provision_lib.get_command_runners(
@@ -2532,6 +2268,201 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
2532
2268
  cluster_config_file)
2533
2269
  self.docker_user = docker_user
2534
2270
 
2271
+ def _get_skylet_ssh_tunnel(self) -> Optional[SSHTunnelInfo]:
2272
+ metadata = global_user_state.get_cluster_skylet_ssh_tunnel_metadata(
2273
+ self.cluster_name)
2274
+ if metadata is None:
2275
+ return None
2276
+ return SSHTunnelInfo(port=metadata[0], pid=metadata[1])
2277
+
2278
+ def _set_skylet_ssh_tunnel(self, tunnel: Optional[SSHTunnelInfo]) -> None:
2279
+ global_user_state.set_cluster_skylet_ssh_tunnel_metadata(
2280
+ self.cluster_name,
2281
+ (tunnel.port, tunnel.pid) if tunnel is not None else None)
2282
+
2283
+ def close_skylet_ssh_tunnel(self) -> None:
2284
+ """Terminate the SSH tunnel process and clear its metadata."""
2285
+ tunnel = self._get_skylet_ssh_tunnel()
2286
+ if tunnel is None:
2287
+ return
2288
+ logger.debug('Closing Skylet SSH tunnel for cluster %r on port %d',
2289
+ self.cluster_name, tunnel.port)
2290
+ try:
2291
+ self._terminate_ssh_tunnel_process(tunnel)
2292
+ finally:
2293
+ self._set_skylet_ssh_tunnel(None)
2294
+
2295
+ def get_grpc_channel(self) -> 'grpc.Channel':
2296
+ grpc_options = [
2297
+ # The task YAMLs can be large, so the default
2298
+ # max_receive_message_length of 4MB might not be enough.
2299
+ ('grpc.max_receive_message_length', -1),
2300
+ ]
2301
+ # It's fine to not grab the lock here, as we're only reading,
2302
+ # and writes are very rare.
2303
+ # It's acceptable to read while another process is opening a tunnel,
2304
+ # because it will only happen on:
2305
+ # 1. A new cluster that has no tunnel yet, or
2306
+ # 2. A cluster with an unhealthy tunnel
2307
+ # For (2), a process that reads the "stale" tunnel will fail
2308
+ # and on the next retry, it will call get_grpc_channel again
2309
+ # and get the new tunnel.
2310
+ tunnel = self._get_skylet_ssh_tunnel()
2311
+ if tunnel is not None:
2312
+ if _is_tunnel_healthy(tunnel):
2313
+ return grpc.insecure_channel(f'localhost:{tunnel.port}',
2314
+ options=grpc_options)
2315
+ logger.debug('Failed to connect to SSH tunnel for cluster '
2316
+ f'{self.cluster_name!r} on port {tunnel.port}')
2317
+
2318
+ lock_id = backend_utils.cluster_tunnel_lock_id(self.cluster_name)
2319
+ remaining_timeout = backend_utils.CLUSTER_TUNNEL_LOCK_TIMEOUT_SECONDS
2320
+ start_time = time.perf_counter()
2321
+ attempt = 1
2322
+
2323
+ def _get_remaining_timeout() -> float:
2324
+ return max(0.0,
2325
+ remaining_timeout - (time.perf_counter() - start_time))
2326
+
2327
+ while remaining_timeout > 0:
2328
+ logger.debug(
2329
+ 'Attempting to acquire exclusive lock for %s (attempt %d)',
2330
+ lock_id, attempt)
2331
+ exclusive_lock = locks.get_lock(lock_id, remaining_timeout)
2332
+ try:
2333
+ with exclusive_lock.acquire(blocking=False):
2334
+ wait_elapsed = time.perf_counter() - start_time
2335
+ logger.debug(f'Acquired exclusive lock for {lock_id} after '
2336
+ f'{wait_elapsed:.2f}s')
2337
+ try:
2338
+ tunnel = self._open_and_update_skylet_tunnel()
2339
+ return grpc.insecure_channel(f'localhost:{tunnel.port}',
2340
+ options=grpc_options)
2341
+ except Exception as e: # pylint: disable=broad-except
2342
+ # Failed to open tunnel, release the lock and retry.
2343
+ logger.warning(f'Failed to open tunnel for cluster '
2344
+ f'{self.cluster_name!r}: '
2345
+ f'{common_utils.format_exception(e)}')
2346
+ remaining_timeout = _get_remaining_timeout()
2347
+ attempt += 1
2348
+ continue
2349
+ except locks.LockTimeout:
2350
+ pass
2351
+
2352
+ remaining_timeout = _get_remaining_timeout()
2353
+ logger.debug(f'Could not acquire exclusive lock for {lock_id}, '
2354
+ f'waiting on shared lock (attempt {attempt})')
2355
+ try:
2356
+ # Use shared lock so that concurrent readers can
2357
+ # proceed in parallel.
2358
+ shared_lock = locks.get_lock(lock_id,
2359
+ remaining_timeout,
2360
+ shared_lock=True)
2361
+ # Wait for the exclusive lock to be released.
2362
+ shared_lock.acquire(blocking=True)
2363
+ # We only need the lock for signalling that the new tunnel has
2364
+ # been opened, not for checking the tunnel health.
2365
+ # Same reasoning as why we don't need to grab the lock in
2366
+ # the fast path at the start of this function.
2367
+ shared_lock.release()
2368
+ wait_elapsed = time.perf_counter() - start_time
2369
+ logger.debug(f'Acquired shared lock for {lock_id} after '
2370
+ f'{wait_elapsed:.2f}s')
2371
+ except locks.LockTimeout as e:
2372
+ raise RuntimeError(
2373
+ f'Failed to get gRPC channel for cluster '
2374
+ f'{self.cluster_name!r} due to a timeout when waiting '
2375
+ 'for the SSH tunnel to be opened. Please try again or '
2376
+ f'manually remove the lock at {lock_id}. '
2377
+ f'{common_utils.format_exception(e)}') from e
2378
+
2379
+ # Add small jitter before probing to smoothen the effects
2380
+ # of many readers waking up simultaneously.
2381
+ jitter = random.uniform(0.01, 0.05)
2382
+ time.sleep(jitter)
2383
+
2384
+ # Re-read the tunnel metadata and verify it's healthy.
2385
+ tunnel = self._get_skylet_ssh_tunnel()
2386
+ if tunnel is not None:
2387
+ if _is_tunnel_healthy(tunnel):
2388
+ return grpc.insecure_channel(f'localhost:{tunnel.port}',
2389
+ options=grpc_options)
2390
+ logger.debug('Failed to connect to SSH tunnel for cluster '
2391
+ f'{self.cluster_name!r} on port {tunnel.port}')
2392
+ # Tunnel is still unhealthy or missing, try again with updated
2393
+ # timeout. This could happen in the case where the thread who
2394
+ # held the exclusive lock to open the tunnel crashed.
2395
+ remaining_timeout = _get_remaining_timeout()
2396
+ attempt += 1
2397
+ raise RuntimeError('Timeout waiting for gRPC channel for cluster '
2398
+ f'{self.cluster_name!r} to be ready.')
2399
+
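The comments in `get_grpc_channel` above describe a single writer re-opening the tunnel under an exclusive lock while other readers wait on a shared lock and then re-probe. A rough sketch of that handoff using POSIX `fcntl.flock`; SkyPilot's own `locks` module is abstracted differently, and `is_healthy`/`open_tunnel` below are hypothetical callables:

import fcntl
import os
import random
import time
from typing import Callable


def ensure_tunnel(lock_path: str,
                  is_healthy: Callable[[], bool],
                  open_tunnel: Callable[[], None],
                  timeout: float = 30.0) -> None:
    deadline = time.monotonic() + timeout
    fd = os.open(lock_path, os.O_RDWR | os.O_CREAT, 0o644)
    try:
        while time.monotonic() < deadline:
            # Fast path: a healthy tunnel already exists, no lock needed.
            if is_healthy():
                return
            try:
                # Try to become the single writer without blocking.
                fcntl.flock(fd, fcntl.LOCK_EX | fcntl.LOCK_NB)
                try:
                    if not is_healthy():
                        open_tunnel()
                    return
                finally:
                    fcntl.flock(fd, fcntl.LOCK_UN)
            except BlockingIOError:
                # Someone else is opening the tunnel: wait for them by taking
                # a shared lock (granted once the writer releases), then
                # re-probe after a little jitter, as in the code above.
                fcntl.flock(fd, fcntl.LOCK_SH)
                fcntl.flock(fd, fcntl.LOCK_UN)
                time.sleep(random.uniform(0.01, 0.05))
        raise TimeoutError('Timed out waiting for the tunnel to be opened.')
    finally:
        os.close(fd)

In the method above, the writer path corresponds to `_open_and_update_skylet_tunnel()` and the probe to `_is_tunnel_healthy`; a production version would also bound the shared-lock wait.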
2400
+ def _terminate_ssh_tunnel_process(self, tunnel_info: SSHTunnelInfo) -> None:
2401
+ """Terminate the SSH tunnel process."""
2402
+ try:
2403
+ proc = psutil.Process(tunnel_info.pid)
2404
+ if proc.is_running() and proc.status() != psutil.STATUS_ZOMBIE:
2405
+ logger.debug(
2406
+ f'Terminating SSH tunnel process {tunnel_info.pid}')
2407
+ subprocess_utils.kill_children_processes(proc.pid)
2408
+ except psutil.NoSuchProcess:
2409
+ pass
2410
+ except Exception as e: # pylint: disable=broad-except
2411
+ logger.warning(
2412
+ f'Failed to cleanup SSH tunnel process {tunnel_info.pid}: {e}')
2413
+
2414
+ def _open_and_update_skylet_tunnel(self) -> SSHTunnelInfo:
2415
+ """Opens an SSH tunnel to the Skylet on the head node,
2416
+ updates the cluster handle, and persists it to the database."""
2417
+ max_attempts = 3
2418
+ # There could be a race condition here, as multiple processes may
2419
+ # attempt to open the same port at the same time.
2420
+ for attempt in range(max_attempts):
2421
+ runners = self.get_command_runners()
2422
+ head_runner = runners[0]
2423
+ local_port = random.randint(10000, 65535)
2424
+ try:
2425
+ ssh_tunnel_proc = backend_utils.open_ssh_tunnel(
2426
+ head_runner, (local_port, constants.SKYLET_GRPC_PORT))
2427
+ except exceptions.CommandError as e:
2428
+ # Don't retry if the error is due to timeout,
2429
+ # connection refused, Kubernetes pods not found,
2430
+ # or an in-progress termination.
2431
+ if (e.detailed_reason is not None and
2432
+ (backend_utils.SSH_CONNECTION_ERROR_PATTERN.search(
2433
+ e.detailed_reason) or
2434
+ backend_utils.K8S_PODS_NOT_FOUND_PATTERN.search(
2435
+ e.detailed_reason) or attempt == max_attempts - 1)):
2436
+ raise e
2437
+ logger.warning(
2438
+ f'Failed to open SSH tunnel on port {local_port} '
2439
+ f'({attempt + 1}/{max_attempts}). '
2440
+ f'{e.error_msg}\n{e.detailed_reason}')
2441
+ continue
2442
+ tunnel_info = SSHTunnelInfo(port=local_port,
2443
+ pid=ssh_tunnel_proc.pid)
2444
+ break
2445
+
2446
+ try:
2447
+ grpc.channel_ready_future(
2448
+ grpc.insecure_channel(f'localhost:{tunnel_info.port}')).result(
2449
+ timeout=constants.SKYLET_GRPC_TIMEOUT_SECONDS)
2450
+ # Clean up existing tunnel before setting up the new one.
2451
+ old_tunnel = self._get_skylet_ssh_tunnel()
2452
+ if old_tunnel is not None:
2453
+ self._terminate_ssh_tunnel_process(old_tunnel)
2454
+ self._set_skylet_ssh_tunnel(tunnel_info)
2455
+ return tunnel_info
2456
+ except grpc.FutureTimeoutError as e:
2457
+ self._terminate_ssh_tunnel_process(tunnel_info)
2458
+ logger.warning(
2459
+ f'Skylet gRPC channel for cluster {self.cluster_name} not '
2460
+ f'ready after {constants.SKYLET_GRPC_TIMEOUT_SECONDS}s')
2461
+ raise e
2462
+ except Exception as e:
2463
+ self._terminate_ssh_tunnel_process(tunnel_info)
2464
+ raise e
2465
+
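`_open_and_update_skylet_tunnel` above picks a random local port, asks `backend_utils.open_ssh_tunnel` to forward it to the Skylet gRPC port, and only records the tunnel once the channel becomes ready. A rough standalone equivalent using plain `ssh -L`; the user/host arguments are placeholders, and the real code reuses the cluster's SSH credentials and command runners:

import socket
import subprocess
import time
from typing import Tuple


def pick_free_port() -> int:
    # Bind to port 0 and let the OS pick an unused port. Like the retry
    # loop above, this can still race with another process grabbing the
    # same port before ssh binds it.
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
        s.bind(('localhost', 0))
        return s.getsockname()[1]


def open_forward(user: str, host: str, remote_port: int,
                 wait_seconds: float = 10.0) -> Tuple[int, int]:
    local_port = pick_free_port()
    proc = subprocess.Popen(
        ['ssh', '-N', '-o', 'ExitOnForwardFailure=yes',
         '-L', f'{local_port}:localhost:{remote_port}', f'{user}@{host}'])
    deadline = time.monotonic() + wait_seconds
    while time.monotonic() < deadline:
        if proc.poll() is not None:
            raise RuntimeError('ssh exited before the forward came up.')
        # The forward is usable once the local port accepts connections.
        try:
            with socket.create_connection(('localhost', local_port),
                                          timeout=0.5):
                return local_port, proc.pid
        except OSError:
            time.sleep(0.2)
    proc.terminate()
    raise TimeoutError(f'Port-forward to {host}:{remote_port} never came up.')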
2535
2466
  @property
2536
2467
  def cluster_yaml(self) -> Optional[str]:
2537
2468
  if self._cluster_yaml is None:
@@ -2542,6 +2473,12 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
2542
2473
  def cluster_yaml(self, value: Optional[str]):
2543
2474
  self._cluster_yaml = value
2544
2475
 
2476
+ @property
2477
+ def instance_ids(self):
2478
+ if self.cached_cluster_info is not None:
2479
+ return self.cached_cluster_info.instance_ids()
2480
+ return None
2481
+
2545
2482
  @property
2546
2483
  def ssh_user(self):
2547
2484
  if self.cached_cluster_info is not None:
@@ -2576,6 +2513,18 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
2576
2513
  num_ips = 1
2577
2514
  return num_ips
2578
2515
 
2516
+ @property
2517
+ def is_grpc_enabled_with_flag(self) -> bool:
2518
+ """Returns whether this handle has gRPC enabled and gRPC flag is set."""
2519
+ return env_options.Options.ENABLE_GRPC.get() and self.is_grpc_enabled
2520
+
2521
+ def __getstate__(self):
2522
+ state = self.__dict__.copy()
2523
+ # For backwards compatibility. Refer to
2524
+ # https://github.com/skypilot-org/skypilot/pull/7133
2525
+ state.setdefault('skylet_ssh_tunnel', None)
2526
+ return state
2527
+
2579
2528
  def __setstate__(self, state):
2580
2529
  self._version = self._VERSION
2581
2530
 
@@ -2606,7 +2555,7 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
2606
2555
  # pylint: disable=import-outside-toplevel
2607
2556
  launched_resources = state['launched_resources']
2608
2557
  if isinstance(launched_resources.cloud, clouds.Kubernetes):
2609
- yaml_config = common_utils.read_yaml(
2558
+ yaml_config = global_user_state.get_cluster_yaml_dict(
2610
2559
  os.path.expanduser(state['_cluster_yaml']))
2611
2560
  context = kubernetes_utils.get_context_from_config(
2612
2561
  yaml_config['provider'])
@@ -2629,6 +2578,14 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
2629
2578
  os.path.expanduser(state['_cluster_yaml'])):
2630
2579
  state['_cluster_yaml'] = None
2631
2580
 
2581
+ if version < 11:
2582
+ state['is_grpc_enabled'] = False
2583
+ state['skylet_ssh_tunnel'] = None
2584
+
2585
+ if version >= 12:
2586
+ # DEPRECATED in favor of skylet_ssh_tunnel_metadata column in the DB
2587
+ state.pop('skylet_ssh_tunnel', None)
2588
+
2632
2589
  self.__dict__.update(state)
2633
2590
 
2634
2591
  # Because the update_cluster_ips and update_ssh_ports
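The `_VERSION` bump to 12 and the `__getstate__`/`__setstate__` changes above follow the usual versioned-pickle migration pattern: old handles gain defaults for new fields, and fields that moved elsewhere are dropped on load. A self-contained illustration with invented field names:

import pickle


class Handle:
    # Bump whenever fields are added/removed/changed, and add a matching
    # branch in __setstate__.
    _VERSION = 3

    def __init__(self, name: str):
        self._version = self._VERSION
        self.name = name
        self.is_grpc_enabled = True   # field added in version 2
        # 'tunnel' existed in version 2 but moved out of the pickle in v3.

    def __getstate__(self):
        state = self.__dict__.copy()
        # Keep older readers happy: ensure a field they expect is present.
        state.setdefault('tunnel', None)
        return state

    def __setstate__(self, state):
        version = state.pop('_version', 1)
        if version < 2:
            # The field did not exist yet; pick a conservative default.
            state['is_grpc_enabled'] = False
        if version >= 3:
            # Deprecated field, now stored outside the pickle.
            state.pop('tunnel', None)
        state['_version'] = self._VERSION
        self.__dict__.update(state)


if __name__ == '__main__':
    restored = pickle.loads(pickle.dumps(Handle('demo')))
    print(restored.is_grpc_enabled, restored._version)  # True 3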
@@ -2653,6 +2610,234 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
2653
2610
  pass
2654
2611
 
2655
2612
 
2613
+ class LocalResourcesHandle(CloudVmRayResourceHandle):
2614
+ """A handle for local resources."""
2615
+
2616
+ def __init__(
2617
+ self,
2618
+ *,
2619
+ cluster_name: str,
2620
+ cluster_name_on_cloud: str,
2621
+ cluster_yaml: Optional[str],
2622
+ launched_nodes: int,
2623
+ launched_resources: resources_lib.Resources,
2624
+ stable_internal_external_ips: Optional[List[Tuple[str,
2625
+ str]]] = None,
2626
+ stable_ssh_ports: Optional[List[int]] = None,
2627
+ cluster_info: Optional[provision_common.ClusterInfo] = None
2628
+ ) -> None:
2629
+ super().__init__(
2630
+ cluster_name=cluster_name,
2631
+ cluster_name_on_cloud=cluster_name_on_cloud,
2632
+ cluster_yaml=cluster_yaml,
2633
+ launched_nodes=launched_nodes,
2634
+ launched_resources=launched_resources,
2635
+ stable_internal_external_ips=stable_internal_external_ips,
2636
+ stable_ssh_ports=stable_ssh_ports,
2637
+ cluster_info=cluster_info)
2638
+ # TODO (kyuds): handle jobs consolidation mode. Currently,
2639
+ # jobs consolidation mode will not run a skylet, hence
2640
+ # grpc server will not run. In the future, we should
2641
+ # figure out a way to start grpc in consolidation mode.
2642
+ self.is_grpc_enabled = False
2643
+
2644
+ @context_utils.cancellation_guard
2645
+ # we expect different requests to be acting on different clusters
2646
+ # (= different handles) so we have no real expectation of cache hit
2647
+ # across requests.
2648
+ # Do not change this cache to global scope
2649
+ # without understanding https://github.com/skypilot-org/skypilot/pull/6908
2650
+ @annotations.lru_cache(scope='request', maxsize=10)
2651
+ @timeline.event
2652
+ def get_command_runners(self,
2653
+ force_cached: bool = False,
2654
+ avoid_ssh_control: bool = False
2655
+ ) -> List[command_runner.CommandRunner]:
2656
+ """Returns a list of local command runners."""
2657
+ del force_cached, avoid_ssh_control # Unused.
2658
+ return [command_runner.LocalProcessCommandRunner()]
2659
+
2660
+
2661
+ class SkyletClient:
2662
+ """The client to interact with a remote cluster through Skylet."""
2663
+
2664
+ def __init__(self, channel: 'grpc.Channel'):
2665
+ self._autostop_stub = autostopv1_pb2_grpc.AutostopServiceStub(channel)
2666
+ self._jobs_stub = jobsv1_pb2_grpc.JobsServiceStub(channel)
2667
+ self._serve_stub = servev1_pb2_grpc.ServeServiceStub(channel)
2668
+ self._managed_jobs_stub = (
2669
+ managed_jobsv1_pb2_grpc.ManagedJobsServiceStub(channel))
2670
+
2671
+ def set_autostop(
2672
+ self,
2673
+ request: 'autostopv1_pb2.SetAutostopRequest',
2674
+ timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
2675
+ ) -> 'autostopv1_pb2.SetAutostopResponse':
2676
+ return self._autostop_stub.SetAutostop(request, timeout=timeout)
2677
+
2678
+ def is_autostopping(
2679
+ self,
2680
+ request: 'autostopv1_pb2.IsAutostoppingRequest',
2681
+ timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
2682
+ ) -> 'autostopv1_pb2.IsAutostoppingResponse':
2683
+ return self._autostop_stub.IsAutostopping(request, timeout=timeout)
2684
+
2685
+ def add_job(
2686
+ self,
2687
+ request: 'jobsv1_pb2.AddJobRequest',
2688
+ timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
2689
+ ) -> 'jobsv1_pb2.AddJobResponse':
2690
+ return self._jobs_stub.AddJob(request, timeout=timeout)
2691
+
2692
+ def queue_job(
2693
+ self,
2694
+ request: 'jobsv1_pb2.QueueJobRequest',
2695
+ timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
2696
+ ) -> 'jobsv1_pb2.QueueJobResponse':
2697
+ return self._jobs_stub.QueueJob(request, timeout=timeout)
2698
+
2699
+ def update_status(
2700
+ self,
2701
+ request: 'jobsv1_pb2.UpdateStatusRequest',
2702
+ timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
2703
+ ) -> 'jobsv1_pb2.UpdateStatusResponse':
2704
+ return self._jobs_stub.UpdateStatus(request, timeout=timeout)
2705
+
2706
+ def get_job_queue(
2707
+ self,
2708
+ request: 'jobsv1_pb2.GetJobQueueRequest',
2709
+ timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
2710
+ ) -> 'jobsv1_pb2.GetJobQueueResponse':
2711
+ return self._jobs_stub.GetJobQueue(request, timeout=timeout)
2712
+
2713
+ def cancel_jobs(
2714
+ self,
2715
+ request: 'jobsv1_pb2.CancelJobsRequest',
2716
+ timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
2717
+ ) -> 'jobsv1_pb2.CancelJobsResponse':
2718
+ return self._jobs_stub.CancelJobs(request, timeout=timeout)
2719
+
2720
+ def fail_all_in_progress_jobs(
2721
+ self,
2722
+ request: 'jobsv1_pb2.FailAllInProgressJobsRequest',
2723
+ timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
2724
+ ) -> 'jobsv1_pb2.FailAllInProgressJobsResponse':
2725
+ return self._jobs_stub.FailAllInProgressJobs(request, timeout=timeout)
2726
+
2727
+ def get_job_status(
2728
+ self,
2729
+ request: 'jobsv1_pb2.GetJobStatusRequest',
2730
+ timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
2731
+ ) -> 'jobsv1_pb2.GetJobStatusResponse':
2732
+ return self._jobs_stub.GetJobStatus(request, timeout=timeout)
2733
+
2734
+ def get_job_submitted_timestamp(
2735
+ self,
2736
+ request: 'jobsv1_pb2.GetJobSubmittedTimestampRequest',
2737
+ timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
2738
+ ) -> 'jobsv1_pb2.GetJobSubmittedTimestampResponse':
2739
+ return self._jobs_stub.GetJobSubmittedTimestamp(request,
2740
+ timeout=timeout)
2741
+
2742
+ def get_job_ended_timestamp(
2743
+ self,
2744
+ request: 'jobsv1_pb2.GetJobEndedTimestampRequest',
2745
+ timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
2746
+ ) -> 'jobsv1_pb2.GetJobEndedTimestampResponse':
2747
+ return self._jobs_stub.GetJobEndedTimestamp(request, timeout=timeout)
2748
+
2749
+ def get_log_dirs_for_jobs(
2750
+ self,
2751
+ request: 'jobsv1_pb2.GetLogDirsForJobsRequest',
2752
+ timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
2753
+ ) -> 'jobsv1_pb2.GetLogDirsForJobsResponse':
2754
+ return self._jobs_stub.GetLogDirsForJobs(request, timeout=timeout)
2755
+
2756
+ def tail_logs(
2757
+ self,
2758
+ request: 'jobsv1_pb2.TailLogsRequest',
2759
+ timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
2760
+ ) -> Iterator['jobsv1_pb2.TailLogsResponse']:
2761
+ return self._jobs_stub.TailLogs(request, timeout=timeout)
2762
+
2763
+ def get_service_status(
2764
+ self,
2765
+ request: 'servev1_pb2.GetServiceStatusRequest',
2766
+ timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
2767
+ ) -> 'servev1_pb2.GetServiceStatusResponse':
2768
+ return self._serve_stub.GetServiceStatus(request, timeout=timeout)
2769
+
2770
+ def add_serve_version(
2771
+ self,
2772
+ request: 'servev1_pb2.AddVersionRequest',
2773
+ timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
2774
+ ) -> 'servev1_pb2.AddVersionResponse':
2775
+ return self._serve_stub.AddVersion(request, timeout=timeout)
2776
+
2777
+ def terminate_services(
2778
+ self,
2779
+ request: 'servev1_pb2.TerminateServicesRequest',
2780
+ timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
2781
+ ) -> 'servev1_pb2.TerminateServicesResponse':
2782
+ return self._serve_stub.TerminateServices(request, timeout=timeout)
2783
+
2784
+ def terminate_replica(
2785
+ self,
2786
+ request: 'servev1_pb2.TerminateReplicaRequest',
2787
+ timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
2788
+ ) -> 'servev1_pb2.TerminateReplicaResponse':
2789
+ return self._serve_stub.TerminateReplica(request, timeout=timeout)
2790
+
2791
+ def wait_service_registration(
2792
+ self,
2793
+ request: 'servev1_pb2.WaitServiceRegistrationRequest',
2794
+ timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
2795
+ ) -> 'servev1_pb2.WaitServiceRegistrationResponse':
2796
+ # set timeout to at least 10 seconds more than service register
2797
+ # constant to make sure that timeouts will not occur.
2798
+ if timeout is not None:
2799
+ timeout = max(timeout,
2800
+ serve_constants.SERVICE_REGISTER_TIMEOUT_SECONDS + 10)
2801
+ return self._serve_stub.WaitServiceRegistration(request,
2802
+ timeout=timeout)
2803
+
2804
+ def update_service(
2805
+ self,
2806
+ request: 'servev1_pb2.UpdateServiceRequest',
2807
+ timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
2808
+ ) -> 'servev1_pb2.UpdateServiceResponse':
2809
+ return self._serve_stub.UpdateService(request, timeout=timeout)
2810
+
2811
+ def get_managed_job_controller_version(
2812
+ self,
2813
+ request: 'managed_jobsv1_pb2.GetVersionRequest',
2814
+ timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
2815
+ ) -> 'managed_jobsv1_pb2.GetVersionResponse':
2816
+ return self._managed_jobs_stub.GetVersion(request, timeout=timeout)
2817
+
2818
+ def get_managed_job_table(
2819
+ self,
2820
+ request: 'managed_jobsv1_pb2.GetJobTableRequest',
2821
+ timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
2822
+ ) -> 'managed_jobsv1_pb2.GetJobTableResponse':
2823
+ return self._managed_jobs_stub.GetJobTable(request, timeout=timeout)
2824
+
2825
+ def get_all_managed_job_ids_by_name(
2826
+ self,
2827
+ request: 'managed_jobsv1_pb2.GetAllJobIdsByNameRequest',
2828
+ timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
2829
+ ) -> 'managed_jobsv1_pb2.GetAllJobIdsByNameResponse':
2830
+ return self._managed_jobs_stub.GetAllJobIdsByName(request,
2831
+ timeout=timeout)
2832
+
2833
+ def cancel_managed_jobs(
2834
+ self,
2835
+ request: 'managed_jobsv1_pb2.CancelJobsRequest',
2836
+ timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
2837
+ ) -> 'managed_jobsv1_pb2.CancelJobsResponse':
2838
+ return self._managed_jobs_stub.CancelJobs(request, timeout=timeout)
2839
+
2840
+
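`SkyletClient` above is a thin facade over the generated gRPC stubs whose main job is to inject a default timeout into every call. The same effect can be captured generically; a hedged sketch of that idea with a fake stub standing in for the generated classes (no gRPC needed to run it):

import functools
from typing import Any, Optional

DEFAULT_TIMEOUT_SECONDS = 30.0


class FakeStub:
    """Stand-in for a generated gRPC stub; not a real SkyPilot class."""

    def GetJobQueue(self, request: Any, timeout: Optional[float] = None):
        return f'GetJobQueue(timeout={timeout})'

    def CancelJobs(self, request: Any, timeout: Optional[float] = None):
        return f'CancelJobs(timeout={timeout})'


class WithDefaultTimeout:
    """Proxies attribute access and fills in `timeout` when omitted."""

    def __init__(self, stub: Any, timeout: float = DEFAULT_TIMEOUT_SECONDS):
        self._stub = stub
        self._timeout = timeout

    def __getattr__(self, name: str):
        method = getattr(self._stub, name)

        @functools.wraps(method)
        def call(request: Any, timeout: Optional[float] = None):
            effective = self._timeout if timeout is None else timeout
            return method(request, timeout=effective)

        return call


if __name__ == '__main__':
    jobs = WithDefaultTimeout(FakeStub())
    print(jobs.GetJobQueue({'all': True}))           # default timeout
    print(jobs.CancelJobs({'ids': [1]}, timeout=5))  # explicit override

The explicit per-method wrapper in the diff trades this genericity for type-checked request/response signatures on each RPC, which is a reasonable choice for an internal client.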
2656
2841
  @registry.BACKEND_REGISTRY.type_register(name='cloudvmray')
2657
2842
  class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
2658
2843
  """Backend: runs on cloud virtual machines, managed by Ray.
@@ -2665,7 +2850,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
2665
2850
  NAME = 'cloudvmray'
2666
2851
 
2667
2852
  # Backward compatibility, with the old name of the handle.
2668
- ResourceHandle = CloudVmRayResourceHandle # pylint: disable=invalid-name
2853
+ ResourceHandle = CloudVmRayResourceHandle # type: ignore
2669
2854
 
2670
2855
  def __init__(self):
2671
2856
  self.run_timestamp = sky_logging.get_run_timestamp()
@@ -2680,6 +2865,12 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
2680
2865
  self._dag = None
2681
2866
  self._optimize_target = None
2682
2867
  self._requested_features = set()
2868
+ self._dump_final_script = False
2869
+ self._is_managed = False
2870
+ # Optional planner (via register_info): used under the per-cluster lock
2871
+ # to produce a fresh concrete plan when neither a reusable snapshot nor
2872
+ # a caller plan is available.
2873
+ self._planner = None
2683
2874
 
2684
2875
  # Command for running the setup script. It is only set when the
2685
2876
  # setup needs to be run outside the self._setup() and as part of
@@ -2696,6 +2887,10 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
2696
2887
  self._requested_features = kwargs.pop('requested_features',
2697
2888
  self._requested_features)
2698
2889
  self._dump_final_script = kwargs.pop('dump_final_script', False)
2890
+ self._is_managed = kwargs.pop('is_managed', False)
2891
+ # Optional planner callback for a fresh plan under lock when no
2892
+ # reusable snapshot/caller plan exists. Keeps optimizer in upper layer.
2893
+ self._planner = kwargs.pop('planner', self._planner)
2699
2894
  assert not kwargs, f'Unexpected kwargs: {kwargs}'
2700
2895
 
2701
2896
  def check_resources_fit_cluster(
@@ -2722,9 +2917,9 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
2722
2917
  # Usage Collection:
2723
2918
  usage_lib.messages.usage.update_cluster_resources(
2724
2919
  handle.launched_nodes, launched_resources)
2725
- record = global_user_state.get_cluster_from_name(cluster_name)
2726
- if record is not None:
2727
- usage_lib.messages.usage.update_cluster_status(record['status'])
2920
+ status = global_user_state.get_status_from_cluster_name(cluster_name)
2921
+ if status is not None:
2922
+ usage_lib.messages.usage.update_cluster_status(status)
2728
2923
 
2729
2924
  assert launched_resources.region is not None, handle
2730
2925
 
@@ -2846,12 +3041,46 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
2846
3041
  # Check if the cluster is owned by the current user. Raise
2847
3042
  # exceptions.ClusterOwnerIdentityMismatchError
2848
3043
  backend_utils.check_owner_identity(cluster_name)
2849
- lock_path = os.path.expanduser(
2850
- backend_utils.CLUSTER_STATUS_LOCK_PATH.format(cluster_name))
2851
- with timeline.FileLockEvent(lock_path):
2852
- # Try to launch the exiting cluster first. If no existing cluster,
2853
- # this function will create a to_provision_config with required
2854
- # resources.
3044
+ lock_id = backend_utils.cluster_status_lock_id(cluster_name)
3045
+ communicated_with_user = False
3046
+
3047
+ while True:
3048
+ try:
3049
+ return self._locked_provision(lock_id, task, to_provision,
3050
+ dryrun, stream_logs, cluster_name,
3051
+ retry_until_up,
3052
+ skip_unnecessary_provisioning)
3053
+ except locks.LockTimeout:
3054
+ if not communicated_with_user:
3055
+ rich_utils.force_update_status(
3056
+ ux_utils.spinner_message('Launching - blocked by ' +
3057
+ 'other requests ' +
3058
+ colorama.Style.RESET_ALL +
3059
+ colorama.Style.DIM +
3060
+ 'Check concurrent requests: ' +
3061
+ 'sky api status -v | grep '
3062
+ f'{cluster_name}'))
3063
+
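The loop above keeps calling `_locked_provision` until the per-cluster lock is acquired, and updates the spinner only on the first `locks.LockTimeout` so the user learns the launch is blocked by other requests. A standalone sketch of that shape using the third-party `filelock` package (an assumption here; SkyPilot has its own `locks`/`lock_events` abstractions), with `do_provision` as a placeholder:

import time

from filelock import FileLock, Timeout


def do_provision() -> str:
    return 'provisioned'


def provision_with_lock(lock_path: str = '/tmp/cluster-demo.lock') -> str:
    notified = False
    lock = FileLock(lock_path)
    while True:
        try:
            # Short timeout per attempt; the surrounding loop is what
            # actually waits for the lock holder to finish.
            with lock.acquire(timeout=5):
                return do_provision()
        except Timeout:
            if not notified:
                print('Launching - blocked by other requests; still waiting.')
                notified = True
            time.sleep(1)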
3064
+ def _locked_provision(
3065
+ self,
3066
+ lock_id: str,
3067
+ task: task_lib.Task,
3068
+ to_provision: Optional[resources_lib.Resources],
3069
+ dryrun: bool,
3070
+ stream_logs: bool,
3071
+ cluster_name: str,
3072
+ retry_until_up: bool = False,
3073
+ skip_unnecessary_provisioning: bool = False,
3074
+ ) -> Tuple[Optional[CloudVmRayResourceHandle], bool]:
3075
+ with lock_events.DistributedLockEvent(lock_id, _CLUSTER_LOCK_TIMEOUT):
3076
+ # Reset spinner message to remove any mention of being blocked
3077
+ # by other requests.
3078
+ rich_utils.force_update_status(
3079
+ ux_utils.spinner_message('Launching'))
3080
+
3081
+ # Try to launch the existing cluster first. If no existing
3082
+ # cluster, this function will create a to_provision_config
3083
+ # with required resources.
2855
3084
  to_provision_config = self._check_existing_cluster(
2856
3085
  task, to_provision, cluster_name, dryrun)
2857
3086
  assert to_provision_config.resources is not None, (
@@ -2869,14 +3098,6 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
2869
3098
  # TODO(suquark): once we have sky on PyPI, we should directly
2870
3099
  # install sky from PyPI.
2871
3100
  local_wheel_path, wheel_hash = wheel_utils.build_sky_wheel()
2872
- # The most frequent reason for the failure of a provision
2873
- # request is resource unavailability instead of rate
2874
- # limiting; to make users wait shorter, we do not make
2875
- # backoffs exponential.
2876
- backoff = common_utils.Backoff(
2877
- initial_backoff=_RETRY_UNTIL_UP_INIT_GAP_SECONDS,
2878
- max_backoff_factor=1)
2879
- attempt_cnt = 1
2880
3101
  while True:
2881
3102
  # For on-demand instances, RetryingVmProvisioner will retry
2882
3103
  # within the given region first, then optionally retry on all
@@ -2895,21 +3116,25 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
2895
3116
  try:
2896
3117
  retry_provisioner = RetryingVmProvisioner(
2897
3118
  self.log_dir,
2898
- self._dag,
2899
- self._optimize_target,
3119
+ self._dag, # type: ignore[arg-type]
3120
+ self._optimize_target, # type: ignore[arg-type]
2900
3121
  self._requested_features,
2901
3122
  local_wheel_path,
2902
3123
  wheel_hash,
2903
- blocked_resources=task.blocked_resources)
3124
+ blocked_resources=task.blocked_resources,
3125
+ is_managed=self._is_managed)
2904
3126
  log_path = os.path.join(self.log_dir, 'provision.log')
2905
3127
  rich_utils.force_update_status(
2906
- ux_utils.spinner_message('Launching', log_path))
3128
+ ux_utils.spinner_message('Launching',
3129
+ log_path,
3130
+ cluster_name=cluster_name))
2907
3131
  config_dict = retry_provisioner.provision_with_retries(
2908
3132
  task, to_provision_config, dryrun, stream_logs,
2909
3133
  skip_unnecessary_provisioning)
2910
3134
  break
2911
3135
  except exceptions.ResourcesUnavailableError as e:
2912
3136
  log_path = retry_provisioner.log_dir + '/provision.log'
3137
+
2913
3138
  error_message = (
2914
3139
  f'{colorama.Fore.RED}Failed to provision all '
2915
3140
  f'possible launchable resources.'
@@ -2920,23 +3145,34 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
2920
3145
  error_message = str(e)
2921
3146
 
2922
3147
  if retry_until_up:
2923
- logger.error(error_message)
2924
- # Sleep and retry.
2925
- gap_seconds = backoff.current_backoff()
2926
- plural = 's' if attempt_cnt > 1 else ''
3148
+ gap_seconds = _RETRY_UNTIL_UP_INIT_GAP_SECONDS
2927
3149
  retry_message = ux_utils.retry_message(
2928
- f'Retry after {gap_seconds:.0f}s '
2929
- f'({attempt_cnt} attempt{plural}). ')
2930
- logger.info(f'\n{retry_message} '
2931
- f'{ux_utils.log_path_hint(log_path)}'
2932
- f'{colorama.Style.RESET_ALL}')
2933
- attempt_cnt += 1
2934
- time.sleep(gap_seconds)
2935
- continue
3150
+ f'Retry after {gap_seconds:.0f}s ')
3151
+ hint_message = (
3152
+ f'\n{retry_message} '
3153
+ f'{ux_utils.provision_hint(cluster_name)}'
3154
+ f'{colorama.Style.RESET_ALL}')
3155
+
3156
+ # Add cluster event for retry.
3157
+ global_user_state.add_cluster_event(
3158
+ cluster_name, status_lib.ClusterStatus.INIT,
3159
+ f'Retrying provisioning after {gap_seconds:.0f}s',
3160
+ global_user_state.ClusterEventType.STATUS_CHANGE)
3161
+
3162
+ raise exceptions.ExecutionRetryableError(
3163
+ error_message,
3164
+ hint=hint_message,
3165
+ retry_wait_seconds=gap_seconds)
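With `--retry-until-up`, the code above no longer sleeps and loops inside the backend; it raises `ExecutionRetryableError` carrying `retry_wait_seconds` so a higher layer can re-schedule the request. A toy sketch of that control flow, with illustrative names:

import time


class RetryableError(Exception):

    def __init__(self, message: str, retry_wait_seconds: float):
        super().__init__(message)
        self.retry_wait_seconds = retry_wait_seconds


def launch(attempt: int) -> str:
    if attempt < 3:
        # No capacity this round: report how long to wait instead of
        # sleeping inside the provisioning call itself.
        raise RetryableError('no capacity yet', retry_wait_seconds=0.1)
    return 'cluster up'


def run_with_retries() -> str:
    # In the real system a request scheduler would own this wait; here we
    # simply sleep for the hinted interval and call launch() again.
    attempt = 1
    while True:
        try:
            return launch(attempt)
        except RetryableError as e:
            time.sleep(e.retry_wait_seconds)
            attempt += 1


if __name__ == '__main__':
    print(run_with_retries())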
2936
3166
  # Clean up the cluster's entry in `sky status`.
2937
3167
  # Do not remove the stopped cluster from the global state
2938
3168
  # if failed to start.
2939
3169
  if not e.no_failover:
3170
+ global_user_state.add_cluster_event(
3171
+ cluster_name,
3172
+ None,
3173
+ 'Provision failed: ' + str(e),
3174
+ global_user_state.ClusterEventType.STATUS_CHANGE,
3175
+ nop_if_duplicate=True)
2940
3176
  global_user_state.remove_cluster(cluster_name,
2941
3177
  terminate=True)
2942
3178
  usage_lib.messages.usage.update_final_cluster_status(
@@ -2944,7 +3180,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
2944
3180
  logger.error(
2945
3181
  ux_utils.error_message(
2946
3182
  'Failed to provision resources. '
2947
- f'{ux_utils.log_path_hint(log_path)}'))
3183
+ f'{ux_utils.provision_hint(cluster_name)}'))
2948
3184
  error_message += (
2949
3185
  '\nTo keep retrying until the cluster is up, use '
2950
3186
  'the `--retry-until-up` flag.')
@@ -2953,8 +3189,9 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
2953
3189
  error_message + '\n' + str(e),
2954
3190
  failover_history=e.failover_history) from None
2955
3191
  if dryrun:
2956
- record = global_user_state.get_cluster_from_name(cluster_name)
2957
- return record['handle'] if record is not None else None, False
3192
+ handle = global_user_state.get_handle_from_cluster_name(
3193
+ cluster_name)
3194
+ return handle if handle is not None else None, False
2958
3195
 
2959
3196
  if config_dict['provisioning_skipped']:
2960
3197
  # Skip further provisioning.
@@ -2962,10 +3199,10 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
2962
3199
  # ('handle', 'provision_record', 'resources_vars')
2963
3200
  # We need to return the handle - but it should be the existing
2964
3201
  # handle for the cluster.
2965
- record = global_user_state.get_cluster_from_name(cluster_name)
2966
- assert record is not None and record['handle'] is not None, (
2967
- cluster_name, record)
2968
- return record['handle'], True
3202
+ handle = global_user_state.get_handle_from_cluster_name(
3203
+ cluster_name)
3204
+ assert handle is not None, (cluster_name, handle)
3205
+ return handle, True
2969
3206
 
2970
3207
  if 'provision_record' in config_dict:
2971
3208
  # New provisioner is used here.
@@ -2980,8 +3217,15 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
2980
3217
  # and other necessary files to the VM.
2981
3218
  # 3. Run setup commands to install dependencies.
2982
3219
  # 4. Starting ray cluster and skylet.
3220
+
3221
+ # Add cluster event for runtime setup start
3222
+ global_user_state.add_cluster_event(
3223
+ handle.cluster_name, status_lib.ClusterStatus.INIT,
3224
+ 'Setting up SkyPilot runtime on cluster',
3225
+ global_user_state.ClusterEventType.STATUS_CHANGE)
3226
+
2983
3227
  cluster_info = provisioner.post_provision_runtime_setup(
2984
- repr(handle.launched_resources.cloud),
3228
+ handle.launched_resources,
2985
3229
  resources_utils.ClusterName(handle.cluster_name,
2986
3230
  handle.cluster_name_on_cloud),
2987
3231
  handle.cluster_yaml,
@@ -2995,6 +3239,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
2995
3239
  # manually or by the cloud provider.
2996
3240
  # Optimize the case where the cluster's IPs can be retrieved
2997
3241
  # from cluster_info.
3242
+ handle.cached_cluster_info = cluster_info
2998
3243
  handle.docker_user = cluster_info.docker_user
2999
3244
  handle.update_cluster_ips(max_attempts=_FETCH_IP_MAX_ATTEMPTS,
3000
3245
  cluster_info=cluster_info)
@@ -3006,7 +3251,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3006
3251
 
3007
3252
  self._update_after_cluster_provisioned(
3008
3253
  handle, to_provision_config.prev_handle, task,
3009
- prev_cluster_status, lock_path, config_hash)
3254
+ prev_cluster_status, config_hash)
3010
3255
  return handle, False
3011
3256
 
3012
3257
  cluster_config_file = config_dict['ray']
@@ -3016,8 +3261,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3016
3261
  ssh_port_list = handle.external_ssh_ports()
3017
3262
  assert ip_list is not None, handle
3018
3263
  assert ssh_port_list is not None, handle
3019
-
3020
- config = common_utils.read_yaml(cluster_config_file)
3264
+ config = global_user_state.get_cluster_yaml_dict(
3265
+ cluster_config_file)
3021
3266
  if 'docker' in config:
3022
3267
  handle.setup_docker_user(cluster_config_file)
3023
3268
 
@@ -3078,14 +3323,14 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3078
3323
 
3079
3324
  self._update_after_cluster_provisioned(
3080
3325
  handle, to_provision_config.prev_handle, task,
3081
- prev_cluster_status, lock_path, config_hash)
3326
+ prev_cluster_status, config_hash)
3082
3327
  return handle, False
3083
3328
 
3084
3329
  def _open_ports(self, handle: CloudVmRayResourceHandle) -> None:
3085
3330
  cloud = handle.launched_resources.cloud
3086
3331
  logger.debug(
3087
3332
  f'Opening ports {handle.launched_resources.ports} for {cloud}')
3088
- config = common_utils.read_yaml(handle.cluster_yaml)
3333
+ config = global_user_state.get_cluster_yaml_dict(handle.cluster_yaml)
3089
3334
  provider_config = config['provider']
3090
3335
  provision_lib.open_ports(repr(cloud), handle.cluster_name_on_cloud,
3091
3336
  handle.launched_resources.ports,
@@ -3096,7 +3341,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3096
3341
  prev_handle: Optional[CloudVmRayResourceHandle],
3097
3342
  task: task_lib.Task,
3098
3343
  prev_cluster_status: Optional[status_lib.ClusterStatus],
3099
- lock_path: str, config_hash: str) -> None:
3344
+ config_hash: str) -> None:
3100
3345
  usage_lib.messages.usage.update_cluster_resources(
3101
3346
  handle.launched_nodes, handle.launched_resources)
3102
3347
  usage_lib.messages.usage.update_final_cluster_status(
@@ -3108,16 +3353,26 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3108
3353
  # update_status will query the ray job status for all INIT /
3109
3354
  # PENDING / RUNNING jobs for the real status, since we do not
3110
3355
  # know the actual previous status of the cluster.
3111
- cmd = job_lib.JobLibCodeGen.update_status()
3112
3356
  logger.debug('Update job queue on remote cluster.')
3113
3357
  with rich_utils.safe_status(
3114
3358
  ux_utils.spinner_message('Preparing SkyPilot runtime')):
3115
- returncode, _, stderr = self.run_on_head(handle,
3116
- cmd,
3117
- require_outputs=True)
3118
- subprocess_utils.handle_returncode(returncode, cmd,
3119
- 'Failed to update job status.',
3120
- stderr)
3359
+ use_legacy = not handle.is_grpc_enabled_with_flag
3360
+
3361
+ if not use_legacy:
3362
+ try:
3363
+ request = jobsv1_pb2.UpdateStatusRequest()
3364
+ backend_utils.invoke_skylet_with_retries(
3365
+ lambda: SkyletClient(handle.get_grpc_channel()
3366
+ ).update_status(request))
3367
+ except exceptions.SkyletMethodNotImplementedError:
3368
+ use_legacy = True
3369
+
3370
+ if use_legacy:
3371
+ cmd = job_lib.JobLibCodeGen.update_status()
3372
+ returncode, _, stderr = self.run_on_head(
3373
+ handle, cmd, require_outputs=True)
3374
+ subprocess_utils.handle_returncode(
3375
+ returncode, cmd, 'Failed to update job status.', stderr)
3121
3376
  if prev_cluster_status == status_lib.ClusterStatus.STOPPED:
3122
3377
  # Safely set all the previous jobs to FAILED since the cluster
3123
3378
  # is restarted
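This hunk introduces the fallback shape that recurs throughout the rest of the diff (fail_all_in_progress_jobs, add_job, cancel_jobs, get_job_status, log-dir lookups): prefer the Skylet gRPC endpoint when `handle.is_grpc_enabled_with_flag`, and drop back to generating Python code and running it on the head node over SSH when the remote Skylet is too old to serve the method. A condensed, self-contained sketch of that shape; `grpc_call` and `legacy_call` are placeholders rather than SkyPilot's real clients.

    class MethodNotImplemented(Exception):
        """Stand-in for exceptions.SkyletMethodNotImplementedError."""

    def update_status(grpc_enabled: bool) -> str:
        use_legacy = not grpc_enabled

        def grpc_call() -> str:
            # Pretend the remote Skylet predates this RPC.
            raise MethodNotImplemented

        def legacy_call() -> str:
            # Legacy path: ship generated code to the head node over SSH.
            return 'updated via codegen + SSH'

        if not use_legacy:
            try:
                return grpc_call()
            except MethodNotImplemented:
                # Old remote runtime: degrade gracefully to the legacy path.
                use_legacy = True
        return legacy_call()

    print(update_status(grpc_enabled=True))   # falls back to legacy
    print(update_status(grpc_enabled=False))  # goes straight to legacy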
@@ -3125,14 +3380,25 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3125
3380
  # 1. A job finishes RUNNING, but right before it update itself
3126
3381
  # to SUCCEEDED, the cluster is STOPPED by `sky stop`.
3127
3382
  # 2. On next `sky start`, it gets reset to FAILED.
3128
- cmd = job_lib.JobLibCodeGen.fail_all_jobs_in_progress()
3129
- returncode, stdout, stderr = self.run_on_head(handle,
3130
- cmd,
3131
- require_outputs=True)
3132
- subprocess_utils.handle_returncode(
3133
- returncode, cmd,
3134
- 'Failed to set previously in-progress jobs to FAILED',
3135
- stdout + stderr)
3383
+ use_legacy = not handle.is_grpc_enabled_with_flag
3384
+
3385
+ if not use_legacy:
3386
+ try:
3387
+ fail_request = jobsv1_pb2.FailAllInProgressJobsRequest()
3388
+ backend_utils.invoke_skylet_with_retries(
3389
+ lambda: SkyletClient(handle.get_grpc_channel(
3390
+ )).fail_all_in_progress_jobs(fail_request))
3391
+ except exceptions.SkyletMethodNotImplementedError:
3392
+ use_legacy = True
3393
+
3394
+ if use_legacy:
3395
+ cmd = job_lib.JobLibCodeGen.fail_all_jobs_in_progress()
3396
+ returncode, stdout, stderr = self.run_on_head(
3397
+ handle, cmd, require_outputs=True)
3398
+ subprocess_utils.handle_returncode(
3399
+ returncode, cmd,
3400
+ 'Failed to set previously in-progress jobs to FAILED',
3401
+ stdout + stderr)
3136
3402
 
3137
3403
  prev_ports = None
3138
3404
  if prev_handle is not None:
@@ -3142,14 +3408,20 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3142
3408
  resources_utils.port_ranges_to_set(current_ports) -
3143
3409
  resources_utils.port_ranges_to_set(prev_ports))
3144
3410
  if open_new_ports:
3145
- cloud = handle.launched_resources.cloud
3146
- if not (cloud.OPEN_PORTS_VERSION <=
3411
+ launched_resources = handle.launched_resources.assert_launchable()
3412
+ if not (launched_resources.cloud.OPEN_PORTS_VERSION <=
3147
3413
  clouds.OpenPortsVersion.LAUNCH_ONLY):
3148
3414
  with rich_utils.safe_status(
3149
3415
  ux_utils.spinner_message(
3150
3416
  'Launching - Opening new ports')):
3151
3417
  self._open_ports(handle)
3152
3418
 
3419
+ # Capture task YAML and command
3420
+ user_specified_task_config = None
3421
+ if task is not None:
3422
+ user_specified_task_config = task.to_yaml_config(
3423
+ use_user_specified_yaml=True)
3424
+
3153
3425
  with timeline.Event('backend.provision.post_process'):
3154
3426
  global_user_state.add_or_update_cluster(
3155
3427
  handle.cluster_name,
@@ -3157,7 +3429,16 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3157
3429
  set(task.resources),
3158
3430
  ready=True,
3159
3431
  config_hash=config_hash,
3432
+ task_config=user_specified_task_config,
3160
3433
  )
3434
+
3435
+ # Add cluster event for successful provisioning.
3436
+ global_user_state.add_cluster_event(
3437
+ handle.cluster_name, status_lib.ClusterStatus.UP,
3438
+ 'Cluster successfully provisioned with ' +
3439
+ f'{handle.launched_nodes} nodes',
3440
+ global_user_state.ClusterEventType.STATUS_CHANGE)
3441
+
3161
3442
  usage_lib.messages.usage.update_final_cluster_status(
3162
3443
  status_lib.ClusterStatus.UP)
3163
3444
  # We still add the cluster to ssh config file on API server, this
@@ -3172,13 +3453,60 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3172
3453
  handle.cached_external_ssh_ports, handle.docker_user,
3173
3454
  handle.ssh_user)
3174
3455
 
3175
- common_utils.remove_file_if_exists(lock_path)
3176
-
3177
3456
  def _sync_workdir(self, handle: CloudVmRayResourceHandle,
3178
- workdir: Path) -> None:
3457
+ workdir: Union[Path, Dict[str, Any]],
3458
+ envs_and_secrets: Dict[str, str]) -> None:
3179
3459
  # Even though provision() takes care of it, there may be cases where
3180
3460
  # this function is called in isolation, without calling provision(),
3181
3461
  # e.g., in CLI. So we should rerun rsync_up.
3462
+ if isinstance(workdir, dict):
3463
+ self._sync_git_workdir(handle, envs_and_secrets)
3464
+ else:
3465
+ self._sync_path_workdir(handle, workdir)
3466
+
3467
+ def _sync_git_workdir(self, handle: CloudVmRayResourceHandle,
3468
+ envs_and_secrets: Dict[str, str]) -> None:
3469
+ style = colorama.Style
3470
+ ip_list = handle.external_ips()
3471
+ assert ip_list is not None, 'external_ips is not cached in handle'
3472
+
3473
+ log_path = os.path.join(self.log_dir, 'workdir_sync.log')
3474
+
3475
+ # TODO(zhwu): refactor this with backend_utils.parallel_cmd_with_rsync
3476
+ runners = handle.get_command_runners()
3477
+
3478
+ def _sync_git_workdir_node(
3479
+ runner: command_runner.CommandRunner) -> None:
3480
+ # Type assertion to help mypy understand the type
3481
+ assert hasattr(
3482
+ runner, 'git_clone'
3483
+ ), f'CommandRunner should have git_clone method, ' \
3484
+ f'got {type(runner)}'
3485
+ runner.git_clone(
3486
+ target_dir=SKY_REMOTE_WORKDIR,
3487
+ log_path=log_path,
3488
+ stream_logs=False,
3489
+ max_retry=3,
3490
+ envs_and_secrets=envs_and_secrets,
3491
+ )
3492
+
3493
+ num_nodes = handle.launched_nodes
3494
+ plural = 's' if num_nodes > 1 else ''
3495
+ logger.info(
3496
+ f' {style.DIM}Syncing workdir (to {num_nodes} node{plural}): '
3497
+ f'{SKY_REMOTE_WORKDIR}{style.RESET_ALL}')
3498
+ os.makedirs(os.path.expanduser(self.log_dir), exist_ok=True)
3499
+ os.system(f'touch {log_path}')
3500
+ num_threads = subprocess_utils.get_parallel_threads(
3501
+ str(handle.launched_resources.cloud))
3502
+ with rich_utils.safe_status(
3503
+ ux_utils.spinner_message('Syncing workdir', log_path)):
3504
+ subprocess_utils.run_in_parallel(_sync_git_workdir_node, runners,
3505
+ num_threads)
3506
+ logger.info(ux_utils.finishing_message('Synced workdir.', log_path))
3507
+
3508
+ def _sync_path_workdir(self, handle: CloudVmRayResourceHandle,
3509
+ workdir: Path) -> None:
3182
3510
  fore = colorama.Fore
3183
3511
  style = colorama.Style
3184
3512
  ip_list = handle.external_ips()
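`_sync_workdir` now dispatches on the workdir type: a plain path is rsynced as before, while a dict describes a git source that each node clones via the runner's `git_clone`. A tiny sketch of that dispatch; the dict keys shown are assumed for illustration and are not necessarily SkyPilot's schema.

    from typing import Any, Dict, Union

    def sync_workdir(workdir: Union[str, Dict[str, Any]]) -> str:
        # Dict => git-based workdir; str => local directory to rsync.
        if isinstance(workdir, dict):
            url = workdir.get('url')          # key name assumed
            ref = workdir.get('ref', 'HEAD')  # key name assumed
            return f'git clone {url} @ {ref} into ~/sky_workdir'
        return f'rsync {workdir} -> ~/sky_workdir'

    print(sync_workdir('~/my_project'))
    print(sync_workdir({'url': 'https://github.com/org/repo.git', 'ref': 'main'}))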
@@ -3247,14 +3575,25 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3247
3575
  TODO: Delete COPY storage_mounts in task.sync_storage_mounts(), and
3248
3576
  assert here that all storage_mounts are MOUNT mode.
3249
3577
  """
3578
+ launched_resources = handle.launched_resources.assert_launchable()
3250
3579
  with rich_utils.safe_status(ux_utils.spinner_message('Syncing files')):
3251
3580
  controller_utils.replace_skypilot_config_path_in_file_mounts(
3252
- handle.launched_resources.cloud, all_file_mounts)
3581
+ launched_resources.cloud, all_file_mounts)
3253
3582
  self._execute_file_mounts(handle, all_file_mounts)
3254
3583
  self._execute_storage_mounts(handle, storage_mounts)
3255
3584
  self._set_storage_mounts_metadata(handle.cluster_name,
3256
3585
  storage_mounts)
3257
3586
 
3587
+ def _get_num_gpus(self, task: task_lib.Task) -> int:
3588
+ if task.resources is not None:
3589
+ for resource in task.resources:
3590
+ if (resource.accelerators is not None and
3591
+ isinstance(resource.accelerators, dict)):
3592
+ if len(resource.accelerators) > 0:
3593
+ return math.ceil(
3594
+ list(resource.accelerators.values())[0])
3595
+ return 0
3596
+
3258
3597
  def _setup(self, handle: CloudVmRayResourceHandle, task: task_lib.Task,
3259
3598
  detach_setup: bool) -> None:
3260
3599
  start = time.time()
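The new `_get_num_gpus` helper exports a whole-number GPU count to the setup environment: it takes the first accelerator entry of the first resource that declares one and rounds it up, so fractional requests such as half a GPU still yield 1. A standalone sketch of the same selection logic:

    import math
    from typing import Dict, List, Optional, Union

    Accelerators = Dict[str, Union[int, float]]

    def get_num_gpus(resources: Optional[List[Accelerators]]) -> int:
        # Each item stands in for one Resources.accelerators dict, e.g. {'A10G': 0.5}.
        if resources is not None:
            for accelerators in resources:
                if accelerators:
                    return math.ceil(list(accelerators.values())[0])
        return 0

    assert get_num_gpus([{'A10G': 0.5}]) == 1
    assert get_num_gpus([{}, {'H100': 8}]) == 8
    assert get_num_gpus(None) == 0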
@@ -3267,13 +3606,20 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3267
3606
  remote_setup_file_name = f'/tmp/sky_setup_{self.run_timestamp}'
3268
3607
  # Need this `-i` option to make sure `source ~/.bashrc` work
3269
3608
  setup_cmd = f'/bin/bash -i {remote_setup_file_name} 2>&1'
3609
+ unset_ray_env_vars = ' && '.join(
3610
+ [f'unset {var}' for var in task_codegen.UNSET_RAY_ENV_VARS])
3611
+ setup_cmd = f'{unset_ray_env_vars}; {setup_cmd}'
3270
3612
  runners = handle.get_command_runners(avoid_ssh_control=True)
3271
3613
 
3272
3614
  def _setup_node(node_id: int) -> None:
3273
- setup_envs = task.envs.copy()
3615
+ setup_envs = task_lib.get_plaintext_envs_and_secrets(
3616
+ task.envs_and_secrets)
3274
3617
  setup_envs.update(self._skypilot_predefined_env_vars(handle))
3275
3618
  setup_envs['SKYPILOT_SETUP_NODE_IPS'] = '\n'.join(internal_ips)
3276
3619
  setup_envs['SKYPILOT_SETUP_NODE_RANK'] = str(node_id)
3620
+ setup_envs[constants.SKYPILOT_SETUP_NUM_GPUS_PER_NODE] = (str(
3621
+ self._get_num_gpus(task)))
3622
+
3277
3623
  runner = runners[node_id]
3278
3624
  setup_script = log_lib.make_task_bash_script(setup,
3279
3625
  env_vars=setup_envs)
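The setup command is now prefixed with `unset` of Ray-related environment variables, and the per-node setup environment gains a GPU count alongside the existing node IPs and rank. A sketch of how such a prefix could be assembled; the variable names listed here are illustrative, and the real list lives in `task_codegen.UNSET_RAY_ENV_VARS`.

    # Illustrative only; not the actual contents of UNSET_RAY_ENV_VARS.
    UNSET_RAY_ENV_VARS = ['RAY_ADDRESS', 'RAY_JOB_ID']

    def build_setup_cmd(remote_setup_file: str) -> str:
        unset_prefix = ' && '.join(f'unset {var}' for var in UNSET_RAY_ENV_VARS)
        # `-i` keeps `source ~/.bashrc` working inside the setup script.
        return f'{unset_prefix}; /bin/bash -i {remote_setup_file} 2>&1'

    print(build_setup_cmd('/tmp/sky_setup_2024-01-01'))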
@@ -3329,33 +3675,16 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3329
3675
  return returncode
3330
3676
 
3331
3677
  returncode = _run_setup(f'{create_script_code} && {setup_cmd}',)
3332
- if returncode == 255:
3333
- is_message_too_long = False
3334
- try:
3335
- with open(os.path.expanduser(setup_log_path),
3336
- 'r',
3337
- encoding='utf-8') as f:
3338
- if 'too long' in f.read():
3339
- is_message_too_long = True
3340
- except Exception as e: # pylint: disable=broad-except
3341
- # We don't crash the setup if we cannot read the log file.
3342
- # Instead, we should retry the setup with dumping the script
3343
- # to a file to be safe.
3344
- logger.debug('Failed to read setup log file '
3345
- f'{setup_log_path}: {e}')
3346
- is_message_too_long = True
3347
-
3348
- if is_message_too_long:
3349
- # If the setup script is too long, we retry it with dumping
3350
- # the script to a file and running it with SSH. We use a
3351
- # general length limit check before but it could be
3352
- # inaccurate on some systems.
3353
- logger.debug(
3354
- 'Failed to run setup command inline due to '
3355
- 'command length limit. Dumping setup script to '
3356
- 'file and running it with SSH.')
3357
- _dump_final_script(setup_script)
3358
- returncode = _run_setup(setup_cmd)
3678
+
3679
+ if _is_message_too_long(returncode, file_path=setup_log_path):
3680
+ # If the setup script is too long, we need to retry it
3681
+ # with dumping the script to a file and running it the script
3682
+ # on remote cluster instead.
3683
+ logger.debug('Failed to run setup command inline due to '
3684
+ 'command length limit. Dumping setup script to '
3685
+ 'file and running it with SSH.')
3686
+ _dump_final_script(setup_script)
3687
+ returncode = _run_setup(setup_cmd)
3359
3688
 
3360
3689
  def error_message() -> str:
3361
3690
  # Use the function to avoid tailing the file in success case
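The inline detection of SSH's "argument list too long" failure is factored into a shared `_is_message_too_long` helper, reused for both setup and job submission. Its definition is not part of this excerpt; the sketch below is one plausible reading of the call sites (return code 255 plus a "too long" marker in either the captured output or a log file) and may differ from the real implementation.

    import os
    from typing import Optional

    def is_message_too_long(returncode: int,
                            output: Optional[str] = None,
                            file_path: Optional[str] = None) -> bool:
        """Sketch only; the real helper's signature and behavior may differ."""
        if returncode != 255:
            return False
        if output is not None:
            return 'too long' in output
        if file_path is not None:
            try:
                with open(os.path.expanduser(file_path), 'r',
                          encoding='utf-8') as f:
                    return 'too long' in f.read()
            except OSError:
                # Cannot inspect the log; treat it as too long so the caller
                # retries by uploading the script, which is the safe path.
                return True
        return False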
@@ -3414,102 +3743,180 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3414
3743
  logger.info(
3415
3744
  ux_utils.finishing_message('Setup completed.', setup_log_path))
3416
3745
 
3746
+ def _download_file(self, handle: CloudVmRayResourceHandle,
3747
+ local_file_path: str, remote_file_path: str) -> None:
3748
+ """Syncs file from remote to local."""
3749
+ runners = handle.get_command_runners()
3750
+ head_runner = runners[0]
3751
+ head_runner.rsync(
3752
+ source=local_file_path,
3753
+ target=remote_file_path,
3754
+ up=False,
3755
+ stream_logs=False,
3756
+ )
3757
+
3417
3758
  def _exec_code_on_head(
3418
3759
  self,
3419
3760
  handle: CloudVmRayResourceHandle,
3420
3761
  codegen: str,
3421
3762
  job_id: int,
3422
- detach_run: bool = False,
3423
3763
  managed_job_dag: Optional['dag.Dag'] = None,
3764
+ managed_job_user_id: Optional[str] = None,
3765
+ remote_log_dir: Optional[str] = None,
3424
3766
  ) -> None:
3425
3767
  """Executes generated code on the head node."""
3426
- script_path = os.path.join(SKY_REMOTE_APP_DIR, f'sky_job_{job_id}')
3427
- remote_log_dir = self.log_dir
3768
+ use_legacy = not handle.is_grpc_enabled_with_flag
3769
+ file_name = f'sky_job_{job_id}'
3770
+ script_path = os.path.join(SKY_REMOTE_APP_DIR, file_name)
3771
+ if remote_log_dir is None:
3772
+ remote_log_dir = self.log_dir
3428
3773
  remote_log_path = os.path.join(remote_log_dir, 'run.log')
3429
3774
 
3430
- cd = f'cd {SKY_REMOTE_WORKDIR}'
3775
+ def _dump_code_to_file(codegen: str,
3776
+ target_dir: str = SKY_REMOTE_APP_DIR) -> None:
3777
+ runners = handle.get_command_runners()
3778
+ head_runner = runners[0]
3779
+ with tempfile.NamedTemporaryFile('w', prefix='sky_app_') as fp:
3780
+ fp.write(codegen)
3781
+ fp.flush()
3782
+ script_path = os.path.join(target_dir, file_name)
3783
+ # We choose to sync code + exec, because the alternative of
3784
+ # 'ray submit' may not work as it may use system python
3785
+ # (python2) to execute the script. Happens for AWS.
3786
+ head_runner.rsync(source=fp.name,
3787
+ target=script_path,
3788
+ up=True,
3789
+ stream_logs=False)
3431
3790
 
3791
+ cd = f'cd {SKY_REMOTE_WORKDIR}'
3432
3792
  mkdir_code = (f'{cd} && mkdir -p {remote_log_dir} && '
3433
3793
  f'touch {remote_log_path}')
3434
3794
  encoded_script = shlex.quote(codegen)
3435
3795
  create_script_code = f'{{ echo {encoded_script} > {script_path}; }}'
3436
3796
  job_submit_cmd = (
3437
- # JOB_CMD_IDENTIFIER is used for identifying the process retrieved
3438
- # with pid is the same driver process.
3797
+ # JOB_CMD_IDENTIFIER is used to check that the process
3798
+ # retrieved by pid is the same driver process.
3439
3799
  f'{job_lib.JOB_CMD_IDENTIFIER.format(job_id)} && '
3440
3800
  f'{cd} && {constants.SKY_PYTHON_CMD} -u {script_path}'
3441
3801
  # Do not use &>, which is not POSIX and may not work.
3442
3802
  # Note that the order of ">filename 2>&1" matters.
3443
3803
  f'> {remote_log_path} 2>&1')
3444
-
3445
3804
  code = job_lib.JobLibCodeGen.queue_job(job_id, job_submit_cmd)
3446
3805
  job_submit_cmd = ' && '.join([mkdir_code, create_script_code, code])
3447
3806
 
3448
- def _dump_code_to_file(codegen: str,
3449
- target_dir: str = SKY_REMOTE_APP_DIR) -> None:
3450
- runners = handle.get_command_runners()
3451
- head_runner = runners[0]
3452
- with tempfile.NamedTemporaryFile('w', prefix='sky_app_') as fp:
3453
- fp.write(codegen)
3454
- fp.flush()
3455
- script_path = os.path.join(target_dir, f'sky_job_{job_id}')
3456
- # We choose to sync code + exec, because the alternative of 'ray
3457
- # submit' may not work as it may use system python (python2) to
3458
- # execute the script. Happens for AWS.
3459
- head_runner.rsync(source=fp.name,
3460
- target=script_path,
3461
- up=True,
3462
- stream_logs=False)
3463
-
3464
3807
  # Should also be ealier than _is_command_length_over_limit
3465
3808
  # Same reason as in _setup
3466
3809
  if self._dump_final_script:
3467
3810
  _dump_code_to_file(job_submit_cmd,
3468
3811
  constants.PERSISTENT_RUN_SCRIPT_DIR)
3469
3812
 
3470
- if _is_command_length_over_limit(job_submit_cmd):
3471
- _dump_code_to_file(codegen)
3472
- job_submit_cmd = f'{mkdir_code} && {code}'
3473
-
3474
- if managed_job_dag is not None:
3475
- # Add the managed job to job queue database.
3476
- managed_job_codegen = managed_jobs.ManagedJobCodeGen()
3477
- managed_job_code = managed_job_codegen.set_pending(
3478
- job_id, managed_job_dag)
3479
- # Set the managed job to PENDING state to make sure that this
3480
- # managed job appears in the `sky jobs queue`, even if it needs to
3481
- # wait to be submitted.
3482
- # We cannot set the managed job to PENDING state in the job template
3483
- # (jobs-controller.yaml.j2), as it may need to wait for the run
3484
- # commands to be scheduled on the job controller in high-load cases.
3485
- job_submit_cmd += ' && ' + managed_job_code
3813
+ if not use_legacy:
3814
+ try:
3815
+ managed_job_info: Optional[jobsv1_pb2.ManagedJobInfo] = None
3816
+ if managed_job_dag is not None:
3817
+ workspace = skypilot_config.get_active_workspace(
3818
+ force_user_workspace=True)
3819
+ entrypoint = common_utils.get_current_command()
3820
+
3821
+ managed_job_tasks: List[jobsv1_pb2.ManagedJobTask] = []
3822
+ for task_id, task in enumerate(managed_job_dag.tasks):
3823
+ resources_str = backend_utils.get_task_resources_str(
3824
+ task, is_managed_job=True)
3825
+ managed_job_tasks.append(
3826
+ jobsv1_pb2.ManagedJobTask(
3827
+ task_id=task_id,
3828
+ name=task.name,
3829
+ resources_str=resources_str,
3830
+ metadata_json=task.metadata_json))
3831
+
3832
+ managed_job_info = jobsv1_pb2.ManagedJobInfo(
3833
+ name=managed_job_dag.name,
3834
+ pool=managed_job_dag.pool,
3835
+ workspace=workspace,
3836
+ entrypoint=entrypoint,
3837
+ tasks=managed_job_tasks,
3838
+ user_id=managed_job_user_id)
3839
+
3840
+ if _is_command_length_over_limit(codegen):
3841
+ _dump_code_to_file(codegen)
3842
+ queue_job_request = jobsv1_pb2.QueueJobRequest(
3843
+ job_id=job_id,
3844
+ # codegen not set - server assumes script uploaded
3845
+ remote_log_dir=remote_log_dir,
3846
+ managed_job=managed_job_info,
3847
+ script_path=script_path)
3848
+ else:
3849
+ queue_job_request = jobsv1_pb2.QueueJobRequest(
3850
+ job_id=job_id,
3851
+ codegen=codegen,
3852
+ remote_log_dir=remote_log_dir,
3853
+ managed_job=managed_job_info,
3854
+ script_path=script_path)
3855
+
3856
+ backend_utils.invoke_skylet_with_retries(lambda: SkyletClient(
3857
+ handle.get_grpc_channel()).queue_job(queue_job_request))
3858
+ except exceptions.SkyletMethodNotImplementedError:
3859
+ use_legacy = True
3860
+
3861
+ if use_legacy:
3862
+ if _is_command_length_over_limit(job_submit_cmd):
3863
+ _dump_code_to_file(codegen)
3864
+ job_submit_cmd = f'{mkdir_code} && {code}'
3865
+
3866
+ def _maybe_add_managed_job_code(job_submit_cmd: str) -> str:
3867
+ if managed_job_dag is not None:
3868
+ # Add the managed job to job queue database.
3869
+ managed_job_codegen = managed_jobs.ManagedJobCodeGen()
3870
+ managed_job_code = managed_job_codegen.set_pending(
3871
+ job_id,
3872
+ managed_job_dag,
3873
+ skypilot_config.get_active_workspace(
3874
+ force_user_workspace=True),
3875
+ entrypoint=common_utils.get_current_command(),
3876
+ user_hash=managed_job_user_id)
3877
+ # Set the managed job to PENDING state to make sure that
3878
+ # this managed job appears in the `sky jobs queue`, even
3879
+ # if it needs to wait to be submitted.
3880
+ # We cannot set the managed job to PENDING state in the
3881
+ # job template (jobs-controller.yaml.j2), as it may need
3882
+ # to wait for the run commands to be scheduled on the job
3883
+ # controller in high-load cases.
3884
+ job_submit_cmd += ' && ' + managed_job_code
3885
+ return job_submit_cmd
3886
+
3887
+ job_submit_cmd = _maybe_add_managed_job_code(job_submit_cmd)
3486
3888
 
3487
- returncode, stdout, stderr = self.run_on_head(handle,
3488
- job_submit_cmd,
3489
- stream_logs=False,
3490
- require_outputs=True)
3491
- # Happens when someone calls `sky exec` but remote is outdated for
3492
- # running a job. Necessitating calling `sky launch`.
3493
- backend_utils.check_stale_runtime_on_remote(returncode, stderr,
3494
- handle.cluster_name)
3495
- if returncode == 255 and 'too long' in stdout + stderr:
3496
- # If the generated script is too long, we retry it with dumping
3497
- # the script to a file and running it with SSH. We use a general
3498
- # length limit check before but it could be inaccurate on some
3499
- # systems.
3500
- logger.debug('Failed to submit job due to command length limit. '
3501
- 'Dumping job to file and running it with SSH.')
3502
- _dump_code_to_file(codegen)
3503
- job_submit_cmd = f'{mkdir_code} && {code}'
3504
3889
  returncode, stdout, stderr = self.run_on_head(handle,
3505
3890
  job_submit_cmd,
3506
3891
  stream_logs=False,
3507
3892
  require_outputs=True)
3893
+ # Happens when someone calls `sky exec` but remote is outdated for
3894
+ # running a job. Necessitating calling `sky launch`.
3895
+ backend_utils.check_stale_runtime_on_remote(returncode, stderr,
3896
+ handle.cluster_name)
3897
+ output = stdout + stderr
3898
+ if _is_message_too_long(returncode, output=output):
3899
+ # If the job submit script is too long, we need to retry it
3900
+ # by dumping the script to a file and running the script
3901
+ # on the remote cluster instead.
3902
+ logger.debug(
3903
+ 'Failed to submit job due to command length limit. '
3904
+ 'Dumping job to file and running it with SSH. '
3905
+ f'Output: {output}')
3906
+ _dump_code_to_file(codegen)
3907
+ job_submit_cmd = f'{mkdir_code} && {code}'
3908
+ job_submit_cmd = _maybe_add_managed_job_code(job_submit_cmd)
3909
+ returncode, stdout, stderr = self.run_on_head(
3910
+ handle,
3911
+ job_submit_cmd,
3912
+ stream_logs=False,
3913
+ require_outputs=True)
3508
3914
 
3509
- subprocess_utils.handle_returncode(returncode,
3510
- job_submit_cmd,
3511
- f'Failed to submit job {job_id}.',
3512
- stderr=stdout + stderr)
3915
+ subprocess_utils.handle_returncode(
3916
+ returncode,
3917
+ job_submit_cmd,
3918
+ f'Failed to submit job {job_id}.',
3919
+ stderr=stdout + stderr)
3513
3920
 
3514
3921
  controller = controller_utils.Controllers.from_name(handle.cluster_name)
3515
3922
  if controller == controller_utils.Controllers.SKY_SERVE_CONTROLLER:
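Job submission now makes the inline-versus-upload decision up front: if the generated script fits within the command-length limit it is embedded in the request (or in the SSH command on the legacy path), otherwise it is rsynced to the head node and only the remote path is submitted. A rough sketch of that decision; the limit constant and the returned strings are illustrative.

    # ARG_MAX-style limits vary by system, so this threshold is illustrative.
    _COMMAND_LENGTH_LIMIT = 100_000

    def is_command_length_over_limit(command: str) -> bool:
        return len(command) > _COMMAND_LENGTH_LIMIT

    def submit(codegen: str) -> str:
        if is_command_length_over_limit(codegen):
            # Too big to pass inline: upload the script and send only its path.
            return 'queue_job(script_path=<remote path>/sky_job_1)'
        # Small enough: embed the generated code directly in the request.
        return 'queue_job(codegen=<inline script>)'

    print(submit('x' * 10))
    print(submit('x' * 200_000))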
@@ -3518,53 +3925,74 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3518
3925
  logger.info(
3519
3926
  ux_utils.starting_message(f'Job submitted, ID: {job_id}'))
3520
3927
  rich_utils.stop_safe_status()
3521
- if not detach_run:
3522
- if (handle.cluster_name == controller_utils.Controllers.
3523
- JOBS_CONTROLLER.value.cluster_name):
3524
- self.tail_managed_job_logs(handle, job_id)
3525
- else:
3526
- # Sky logs. Not using subprocess.run since it will make the
3527
- # ssh keep connected after ctrl-c.
3528
- self.tail_logs(handle, job_id)
3529
3928
 
3530
3929
  def _add_job(self, handle: CloudVmRayResourceHandle,
3531
- job_name: Optional[str], resources_str: str) -> int:
3532
- code = job_lib.JobLibCodeGen.add_job(
3533
- job_name=job_name,
3534
- username=common_utils.get_user_hash(),
3535
- run_timestamp=self.run_timestamp,
3536
- resources_str=resources_str)
3537
- returncode, job_id_str, stderr = self.run_on_head(handle,
3538
- code,
3539
- stream_logs=False,
3540
- require_outputs=True,
3541
- separate_stderr=True)
3542
- # Happens when someone calls `sky exec` but remote is outdated for
3543
- # adding a job. Necessitating calling `sky launch`.
3544
- backend_utils.check_stale_runtime_on_remote(returncode, stderr,
3545
- handle.cluster_name)
3546
- # TODO(zhwu): this sometimes will unexpectedly fail, we can add
3547
- # retry for this, after we figure out the reason.
3548
- subprocess_utils.handle_returncode(returncode, code,
3549
- 'Failed to fetch job id.', stderr)
3550
- try:
3551
- job_id_match = _JOB_ID_PATTERN.search(job_id_str)
3552
- if job_id_match is not None:
3553
- job_id = int(job_id_match.group(1))
3554
- else:
3555
- # For backward compatibility.
3556
- job_id = int(job_id_str)
3557
- except ValueError as e:
3558
- logger.error(stderr)
3559
- raise ValueError(f'Failed to parse job id: {job_id_str}; '
3560
- f'Returncode: {returncode}') from e
3561
- return job_id
3930
+ job_name: Optional[str], resources_str: str,
3931
+ metadata: str) -> Tuple[int, str]:
3932
+ use_legacy = not handle.is_grpc_enabled_with_flag
3933
+
3934
+ if not use_legacy:
3935
+ try:
3936
+ request = jobsv1_pb2.AddJobRequest(
3937
+ job_name=job_name,
3938
+ username=common_utils.get_user_hash(),
3939
+ run_timestamp=self.run_timestamp,
3940
+ resources_str=resources_str,
3941
+ metadata=metadata)
3942
+ response = backend_utils.invoke_skylet_with_retries(
3943
+ lambda: SkyletClient(handle.get_grpc_channel()).add_job(
3944
+ request))
3945
+ job_id = response.job_id
3946
+ log_dir = response.log_dir
3947
+ return job_id, log_dir
3948
+ except exceptions.SkyletMethodNotImplementedError:
3949
+ use_legacy = True
3950
+
3951
+ if use_legacy:
3952
+ code = job_lib.JobLibCodeGen.add_job(
3953
+ job_name=job_name,
3954
+ username=common_utils.get_user_hash(),
3955
+ run_timestamp=self.run_timestamp,
3956
+ resources_str=resources_str,
3957
+ metadata=metadata)
3958
+ returncode, result_str, stderr = self.run_on_head(
3959
+ handle,
3960
+ code,
3961
+ stream_logs=False,
3962
+ require_outputs=True,
3963
+ separate_stderr=True)
3964
+ # Happens when someone calls `sky exec` but remote is outdated for
3965
+ # adding a job. Necessitating calling `sky launch`.
3966
+ backend_utils.check_stale_runtime_on_remote(returncode, stderr,
3967
+ handle.cluster_name)
3968
+ # TODO(zhwu): this sometimes will unexpectedly fail, we can add
3969
+ # retry for this, after we figure out the reason.
3970
+ subprocess_utils.handle_returncode(returncode, code,
3971
+ 'Failed to fetch job id.',
3972
+ stderr)
3973
+ try:
3974
+ job_id_match = _JOB_ID_PATTERN.search(result_str)
3975
+ if job_id_match is not None:
3976
+ job_id = int(job_id_match.group(1))
3977
+ else:
3978
+ # For backward compatibility.
3979
+ job_id = int(result_str)
3980
+ log_dir_match = _LOG_DIR_PATTERN.search(result_str)
3981
+ if log_dir_match is not None:
3982
+ log_dir = log_dir_match.group(1).strip()
3983
+ else:
3984
+ # For backward compatibility, use the same log dir as local.
3985
+ log_dir = self.log_dir
3986
+ except ValueError as e:
3987
+ logger.error(stderr)
3988
+ raise ValueError(f'Failed to parse job id: {result_str}; '
3989
+ f'Returncode: {returncode}') from e
3990
+ return job_id, log_dir
3562
3991
 
3563
3992
  def _execute(
3564
3993
  self,
3565
3994
  handle: CloudVmRayResourceHandle,
3566
3995
  task: task_lib.Task,
3567
- detach_run: bool,
3568
3996
  dryrun: bool = False,
3569
3997
  ) -> Optional[int]:
3570
3998
  """Executes the task on the cluster.
@@ -3588,7 +4016,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3588
4016
  # In this case, we reset the resources for the task, so that the
3589
4017
  # detached setup does not need to wait for the task resources to be
3590
4018
  # ready (which is not used for setup anyway).
3591
- valid_resource = sky.Resources()
4019
+ valid_resource = resources_lib.Resources()
3592
4020
  else:
3593
4021
  # Check the task resources vs the cluster resources. Since
3594
4022
  # `sky exec` will not run the provision and _check_existing_cluster
@@ -3610,15 +4038,16 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3610
4038
  logger.info(f'Dryrun complete. Would have run:\n{task}')
3611
4039
  return None
3612
4040
 
3613
- job_id = self._add_job(handle, task_copy.name, resources_str)
4041
+ job_id, log_dir = self._add_job(handle, task_copy.name, resources_str,
4042
+ task.metadata_json)
3614
4043
 
3615
4044
  num_actual_nodes = task.num_nodes * handle.num_ips_per_node
3616
4045
  # Case: task_lib.Task(run, num_nodes=N) or TPU VM Pods
3617
4046
  if num_actual_nodes > 1:
3618
- self._execute_task_n_nodes(handle, task_copy, job_id, detach_run)
4047
+ self._execute_task_n_nodes(handle, task_copy, job_id, log_dir)
3619
4048
  else:
3620
4049
  # Case: task_lib.Task(run, num_nodes=1)
3621
- self._execute_task_one_node(handle, task_copy, job_id, detach_run)
4050
+ self._execute_task_one_node(handle, task_copy, job_id, log_dir)
3622
4051
 
3623
4052
  return job_id
3624
4053
 
@@ -3661,7 +4090,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3661
4090
  is_identity_mismatch_and_purge = False
3662
4091
  try:
3663
4092
  backend_utils.check_owner_identity(cluster_name)
3664
- except exceptions.ClusterOwnerIdentityMismatchError as e:
4093
+ except (exceptions.ClusterOwnerIdentityMismatchError,
4094
+ exceptions.CloudUserIdentityError) as e:
3665
4095
  if purge:
3666
4096
  logger.error(e)
3667
4097
  verbed = 'terminated' if terminate else 'stopped'
@@ -3674,16 +4104,13 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3674
4104
  is_identity_mismatch_and_purge = True
3675
4105
  else:
3676
4106
  raise
3677
- lock_path = os.path.expanduser(
3678
- backend_utils.CLUSTER_STATUS_LOCK_PATH.format(cluster_name))
4107
+ lock_id = backend_utils.cluster_status_lock_id(cluster_name)
4108
+ lock = locks.get_lock(lock_id, timeout=1)
3679
4109
  # Retry in case new cluster operation comes in and holds the lock
3680
4110
  # right after the lock is removed.
3681
4111
  n_attempts = 2
3682
4112
  while True:
3683
4113
  n_attempts -= 1
3684
- # In case other running cluster operations are still holding the
3685
- # lock.
3686
- common_utils.remove_file_if_exists(lock_path)
3687
4114
  # We have to kill the cluster requests, because `down` and `stop`
3688
4115
  # should be higher priority than the cluster requests, and we should
3689
4116
  # release the lock from other requests.
@@ -3701,10 +4128,11 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3701
4128
  'Failed to kill other launch requests for the '
3702
4129
  f'cluster {handle.cluster_name}: '
3703
4130
  f'{common_utils.format_exception(e, use_bracket=True)}')
4131
+ # In case other running cluster operations are still holding the
4132
+ # lock.
4133
+ lock.force_unlock()
3704
4134
  try:
3705
- with filelock.FileLock(
3706
- lock_path,
3707
- backend_utils.CLUSTER_STATUS_LOCK_TIMEOUT_SECONDS):
4135
+ with lock:
3708
4136
  self.teardown_no_lock(
3709
4137
  handle,
3710
4138
  terminate,
@@ -3717,14 +4145,14 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3717
4145
  refresh_cluster_status=(
3718
4146
  not is_identity_mismatch_and_purge))
3719
4147
  if terminate:
3720
- common_utils.remove_file_if_exists(lock_path)
4148
+ lock.force_unlock()
3721
4149
  break
3722
- except filelock.Timeout as e:
4150
+ except locks.LockTimeout as e:
3723
4151
  logger.debug(f'Failed to acquire lock for {cluster_name}, '
3724
4152
  f'retrying...')
3725
4153
  if n_attempts <= 0:
3726
4154
  raise RuntimeError(
3727
- f'Cluster {cluster_name!r} is locked by {lock_path}. '
4155
+ f'Cluster {cluster_name!r} is locked by {lock_id}. '
3728
4156
  'Check to see if it is still being launched') from e
3729
4157
 
3730
4158
  # --- CloudVMRayBackend Specific APIs ---
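Teardown switches from a local file lock (`filelock.FileLock` on a per-cluster path) to a named lock from `locks.get_lock`, with `force_unlock()` clearing stale holders and `locks.LockTimeout` driving the retry. Below is a stdlib-only toy with the same acquire / force-unlock / retry shape; `NamedLock` is a stand-in, not SkyPilot's lock implementation.

    import threading

    class NamedLock:
        """Toy stand-in for locks.get_lock(); not the real implementation."""
        _locks: dict = {}

        def __init__(self, lock_id: str, timeout: float):
            self._lock = self._locks.setdefault(lock_id, threading.Lock())
            self._timeout = timeout

        def __enter__(self):
            if not self._lock.acquire(timeout=self._timeout):
                raise TimeoutError('lock timeout')
            return self

        def __exit__(self, *exc):
            self._lock.release()

        def force_unlock(self):
            # Release a lock abandoned by a previous (possibly dead) holder.
            if self._lock.locked():
                self._lock.release()

    def teardown(cluster_name: str, attempts: int = 2) -> None:
        lock = NamedLock(f'cluster_status.{cluster_name}', timeout=1)
        while attempts:
            attempts -= 1
            lock.force_unlock()  # clear stale holders, mirroring the diff
            try:
                with lock:
                    print(f'tearing down {cluster_name}')
                return
            except TimeoutError:
                if attempts <= 0:
                    raise

    teardown('my-cluster')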
@@ -3735,6 +4163,20 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3735
4163
  job_ids: Optional[List[int]] = None,
3736
4164
  stream_logs: bool = True
3737
4165
  ) -> Dict[Optional[int], Optional[job_lib.JobStatus]]:
4166
+ if handle.is_grpc_enabled_with_flag:
4167
+ try:
4168
+ request = jobsv1_pb2.GetJobStatusRequest(job_ids=job_ids)
4169
+ response = backend_utils.invoke_skylet_with_retries(
4170
+ lambda: SkyletClient(handle.get_grpc_channel()
4171
+ ).get_job_status(request))
4172
+ statuses: Dict[Optional[int], Optional[job_lib.JobStatus]] = {
4173
+ job_id: job_lib.JobStatus.from_protobuf(proto_status)
4174
+ for job_id, proto_status in response.job_statuses.items()
4175
+ }
4176
+ return statuses
4177
+ except exceptions.SkyletMethodNotImplementedError:
4178
+ pass
4179
+
3738
4180
  code = job_lib.JobLibCodeGen.get_job_status(job_ids)
3739
4181
  returncode, stdout, stderr = self.run_on_head(handle,
3740
4182
  code,
@@ -3755,16 +4197,32 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3755
4197
 
3756
4198
  See `skylet.job_lib.cancel_jobs_encoded_results` for more details.
3757
4199
  """
3758
- code = job_lib.JobLibCodeGen.cancel_jobs(jobs, cancel_all, user_hash)
3759
- returncode, stdout, _ = self.run_on_head(handle,
3760
- code,
3761
- stream_logs=False,
3762
- require_outputs=True)
3763
- subprocess_utils.handle_returncode(
3764
- returncode, code,
3765
- f'Failed to cancel jobs on cluster {handle.cluster_name}.', stdout)
3766
-
3767
- cancelled_ids = message_utils.decode_payload(stdout)
4200
+ use_legacy = not handle.is_grpc_enabled_with_flag
4201
+
4202
+ if not use_legacy:
4203
+ try:
4204
+ request = jobsv1_pb2.CancelJobsRequest(job_ids=jobs,
4205
+ cancel_all=cancel_all,
4206
+ user_hash=user_hash)
4207
+ response = backend_utils.invoke_skylet_with_retries(
4208
+ lambda: SkyletClient(handle.get_grpc_channel()).cancel_jobs(
4209
+ request))
4210
+ cancelled_ids = response.cancelled_job_ids
4211
+ except exceptions.SkyletMethodNotImplementedError:
4212
+ use_legacy = True
4213
+
4214
+ if use_legacy:
4215
+ code = job_lib.JobLibCodeGen.cancel_jobs(jobs, cancel_all,
4216
+ user_hash)
4217
+ returncode, stdout, _ = self.run_on_head(handle,
4218
+ code,
4219
+ stream_logs=False,
4220
+ require_outputs=True)
4221
+ subprocess_utils.handle_returncode(
4222
+ returncode, code,
4223
+ f'Failed to cancel jobs on cluster {handle.cluster_name}.',
4224
+ stdout)
4225
+ cancelled_ids = message_utils.decode_payload(stdout)
3768
4226
  if cancelled_ids:
3769
4227
  logger.info(
3770
4228
  f'Cancelled job ID(s): {", ".join(map(str, cancelled_ids))}')
@@ -3781,32 +4239,74 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3781
4239
  Returns:
3782
4240
  A dictionary mapping job_id to log path.
3783
4241
  """
3784
- code = job_lib.JobLibCodeGen.get_run_timestamp_with_globbing(job_ids)
3785
- returncode, run_timestamps, stderr = self.run_on_head(
3786
- handle,
3787
- code,
3788
- stream_logs=False,
3789
- require_outputs=True,
3790
- separate_stderr=True)
3791
- subprocess_utils.handle_returncode(returncode, code,
3792
- 'Failed to sync logs.', stderr)
3793
- run_timestamps = message_utils.decode_payload(run_timestamps)
3794
- if not run_timestamps:
3795
- logger.info(f'{colorama.Fore.YELLOW}'
3796
- 'No matching log directories found'
3797
- f'{colorama.Style.RESET_ALL}')
3798
- return {}
4242
+ job_to_dir: Dict[str, str] = {}
4243
+ use_legacy = not handle.is_grpc_enabled_with_flag
3799
4244
 
3800
- job_ids = list(run_timestamps.keys())
3801
- run_timestamps = list(run_timestamps.values())
4245
+ if not use_legacy:
4246
+ try:
4247
+ int_job_ids = []
4248
+ if job_ids:
4249
+ for str_job_id in job_ids:
4250
+ if str_job_id.isdigit():
4251
+ int_job_ids.append(int(str_job_id))
4252
+ request = jobsv1_pb2.GetLogDirsForJobsRequest(
4253
+ job_ids=int_job_ids)
4254
+ response = backend_utils.invoke_skylet_with_retries(
4255
+ lambda: SkyletClient(handle.get_grpc_channel()
4256
+ ).get_log_dirs_for_jobs(request))
4257
+ job_log_dirs = response.job_log_dirs
4258
+ if not job_log_dirs:
4259
+ logger.info(f'{colorama.Fore.YELLOW}'
4260
+ 'No matching log directories found'
4261
+ f'{colorama.Style.RESET_ALL}')
4262
+ return {}
4263
+ for job_id, log_dir in job_log_dirs.items():
4264
+ # Convert to string for backwards compatibility
4265
+ job_to_dir[str(job_id)] = log_dir
4266
+ except exceptions.SkyletMethodNotImplementedError:
4267
+ use_legacy = True
4268
+
4269
+ if use_legacy:
4270
+ code = job_lib.JobLibCodeGen.get_log_dirs_for_jobs(job_ids)
4271
+ returncode, stdout, stderr = self.run_on_head(handle,
4272
+ code,
4273
+ stream_logs=False,
4274
+ require_outputs=True,
4275
+ separate_stderr=True)
4276
+ subprocess_utils.handle_returncode(returncode, code,
4277
+ 'Failed to sync logs.', stderr)
4278
+ job_to_dir = message_utils.decode_payload(stdout)
4279
+ if not job_to_dir:
4280
+ logger.info(f'{colorama.Fore.YELLOW}'
4281
+ 'No matching log directories found'
4282
+ f'{colorama.Style.RESET_ALL}')
4283
+ return {}
4284
+
4285
+ job_ids = list(job_to_dir.keys())
4286
+ dirs = list(job_to_dir.values())
3802
4287
  remote_log_dirs = [
3803
- os.path.join(constants.SKY_LOGS_DIRECTORY, run_timestamp)
3804
- for run_timestamp in run_timestamps
3805
- ]
3806
- local_log_dirs = [
3807
- os.path.join(local_dir, run_timestamp)
3808
- for run_timestamp in run_timestamps
4288
+ # TODO(aylei): backward compatibility for legacy runtime that
4289
+ # returns run_timestamp only, remove after 0.12.0
4290
+ (dir if constants.SKY_LOGS_DIRECTORY in dir else os.path.join(
4291
+ constants.SKY_LOGS_DIRECTORY, dir)) for dir in dirs
3809
4292
  ]
4293
+ # Include cluster name in local log directory path to avoid conflicts
4294
+ # when the same job_id exists on different clusters
4295
+ cluster_name = handle.cluster_name
4296
+ local_log_dirs = []
4297
+ for remote_log_dir in dirs:
4298
+ if constants.SKY_LOGS_DIRECTORY in remote_log_dir:
4299
+ # Extract the job-specific directory name from the full path
4300
+ # e.g., ~/sky_logs/1-job_name -> 1-job_name
4301
+ job_dir = remote_log_dir.replace(constants.SKY_LOGS_DIRECTORY,
4302
+ '').lstrip('/')
4303
+ local_log_dir = os.path.join(local_dir, cluster_name, job_dir)
4304
+ else:
4305
+ # remote_log_dir is already just the job directory name (e.g.,
4306
+ # "1-job_name")
4307
+ local_log_dir = os.path.join(local_dir, cluster_name,
4308
+ remote_log_dir)
4309
+ local_log_dirs.append(local_log_dir)
3810
4310
 
3811
4311
  runners = handle.get_command_runners()
3812
4312
 
@@ -3842,12 +4342,17 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3842
4342
  subprocess_utils.run_in_parallel(_rsync_down, parallel_args)
3843
4343
  return dict(zip(job_ids, local_log_dirs))
3844
4344
 
3845
- def tail_logs(self,
3846
- handle: CloudVmRayResourceHandle,
3847
- job_id: Optional[int],
3848
- managed_job_id: Optional[int] = None,
3849
- follow: bool = True,
3850
- tail: int = 0) -> int:
4345
+ @context_utils.cancellation_guard
4346
+ def tail_logs(
4347
+ self,
4348
+ handle: CloudVmRayResourceHandle,
4349
+ job_id: Optional[int],
4350
+ managed_job_id: Optional[int] = None,
4351
+ follow: bool = True,
4352
+ tail: int = 0,
4353
+ require_outputs: bool = False,
4354
+ stream_logs: bool = True,
4355
+ process_stream: bool = False) -> Union[int, Tuple[int, str, str]]:
3851
4356
  """Tail the logs of a job.
3852
4357
 
3853
4358
  Args:
@@ -3857,11 +4362,36 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3857
4362
  follow: Whether to follow the logs.
3858
4363
  tail: The number of lines to display from the end of the
3859
4364
  log file. If 0, print all lines.
4365
+ require_outputs: Whether to return the stdout/stderr of the command.
4366
+ stream_logs: Whether to stream the logs to stdout/stderr.
4367
+ process_stream: Whether to process the stream.
3860
4368
 
3861
4369
  Returns:
3862
4370
  The exit code of the tail command. Returns code 100 if the job has
3863
4371
  failed. See exceptions.JobExitCode for possible return codes.
3864
4372
  """
4373
+ if handle.is_grpc_enabled_with_flag:
4374
+ last_exit_code = 0
4375
+ try:
4376
+ request = jobsv1_pb2.TailLogsRequest(
4377
+ job_id=job_id,
4378
+ managed_job_id=managed_job_id,
4379
+ follow=follow,
4380
+ tail=tail)
4381
+ for resp in backend_utils.invoke_skylet_streaming_with_retries(
4382
+ lambda: SkyletClient(handle.get_grpc_channel()
4383
+ ).tail_logs(request, timeout=None)):
4384
+ if resp.log_line:
4385
+ print(resp.log_line, end='', flush=True)
4386
+ last_exit_code = resp.exit_code
4387
+ return last_exit_code
4388
+ except exceptions.SkyletMethodNotImplementedError:
4389
+ pass
4390
+ except grpc.RpcError as e:
4391
+ if e.code() == grpc.StatusCode.CANCELLED:
4392
+ return last_exit_code
4393
+ raise e
4394
+
3865
4395
  code = job_lib.JobLibCodeGen.tail_logs(job_id,
3866
4396
  managed_job_id=managed_job_id,
3867
4397
  follow=follow,
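When gRPC is enabled, `tail_logs` consumes a server-side stream: each response's `log_line` is printed as it arrives and the latest `exit_code` is remembered, with a client-side cancellation (grpc.StatusCode.CANCELLED) treated as a normal exit. The sketch below models only the consumption loop, with a plain generator standing in for the streaming RPC.

    from typing import Iterator, NamedTuple

    class LogChunk(NamedTuple):
        log_line: str
        exit_code: int

    def fake_stream() -> Iterator[LogChunk]:
        # Stand-in for the server-side streaming RPC.
        yield LogChunk('step 1/2 done\n', 0)
        yield LogChunk('step 2/2 done\n', 0)
        yield LogChunk('', 100)  # final message carries the job's exit code

    def tail_logs(stream: Iterator[LogChunk]) -> int:
        last_exit_code = 0
        for resp in stream:
            if resp.log_line:
                print(resp.log_line, end='', flush=True)
            last_exit_code = resp.exit_code
        return last_exit_code

    print('exit code:', tail_logs(fake_stream()))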
@@ -3876,29 +4406,32 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3876
4406
  signal.signal(signal.SIGINT, backend_utils.interrupt_handler)
3877
4407
  signal.signal(signal.SIGTSTP, backend_utils.stop_handler)
3878
4408
  try:
3879
- returncode = self.run_on_head(
4409
+ final = self.run_on_head(
3880
4410
  handle,
3881
4411
  code,
3882
- stream_logs=True,
3883
- process_stream=False,
4412
+ stream_logs=stream_logs,
4413
+ process_stream=process_stream,
4414
+ require_outputs=require_outputs,
3884
4415
  # Allocate a pseudo-terminal to disable output buffering.
3885
4416
  # Otherwise, there may be 5 minutes delay in logging.
3886
4417
  ssh_mode=command_runner.SshMode.INTERACTIVE,
3887
4418
  )
3888
4419
  except SystemExit as e:
3889
- returncode = e.code
3890
- return returncode
4420
+ final = e.code
4421
+ return final
3891
4422
 
3892
4423
  def tail_managed_job_logs(self,
3893
4424
  handle: CloudVmRayResourceHandle,
3894
4425
  job_id: Optional[int] = None,
3895
4426
  job_name: Optional[str] = None,
3896
4427
  controller: bool = False,
3897
- follow: bool = True) -> int:
4428
+ follow: bool = True,
4429
+ tail: Optional[int] = None) -> int:
3898
4430
  # if job_name is not None, job_id should be None
3899
4431
  assert job_name is None or job_id is None, (job_name, job_id)
4432
+ # TODO(kevin): Migrate stream_logs to gRPC
3900
4433
  code = managed_jobs.ManagedJobCodeGen.stream_logs(
3901
- job_name, job_id, follow, controller)
4434
+ job_name, job_id, follow, controller, tail)
3902
4435
 
3903
4436
  # With the stdin=subprocess.DEVNULL, the ctrl-c will not directly
3904
4437
  # kill the process, so we need to handle it manually here.
@@ -3942,20 +4475,37 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3942
4475
  assert job_name is None or job_id is None, (job_name, job_id)
3943
4476
 
3944
4477
  if job_id is None:
3945
- # generate code to get the job_id
4478
+ # get the job_id
3946
4479
  # if job_name is None, get all job_ids
3947
4480
  # TODO: Only get the latest job_id, since that's the only one we use
3948
- code = managed_jobs.ManagedJobCodeGen.get_all_job_ids_by_name(
3949
- job_name=job_name)
3950
- returncode, job_ids, stderr = self.run_on_head(handle,
3951
- code,
3952
- stream_logs=False,
3953
- require_outputs=True,
3954
- separate_stderr=True)
3955
- subprocess_utils.handle_returncode(returncode, code,
3956
- 'Failed to sync down logs.',
3957
- stderr)
3958
- job_ids = message_utils.decode_payload(job_ids)
4481
+
4482
+ use_legacy = not handle.is_grpc_enabled_with_flag
4483
+ logger.info(f'handle.is_grpc_enabled_with_flag: '
4484
+ f'{handle.is_grpc_enabled_with_flag}')
4485
+ if not use_legacy:
4486
+ try:
4487
+ request = managed_jobsv1_pb2.GetAllJobIdsByNameRequest(
4488
+ job_name=job_name)
4489
+ response = backend_utils.invoke_skylet_with_retries(
4490
+ lambda: SkyletClient(handle.get_grpc_channel(
4491
+ )).get_all_managed_job_ids_by_name(request))
4492
+ job_ids = list(response.job_ids)
4493
+ except exceptions.SkyletMethodNotImplementedError:
4494
+ use_legacy = True
4495
+
4496
+ if use_legacy:
4497
+ code = managed_jobs.ManagedJobCodeGen.get_all_job_ids_by_name(
4498
+ job_name=job_name)
4499
+ returncode, job_ids_payload, stderr = self.run_on_head(
4500
+ handle,
4501
+ code,
4502
+ stream_logs=False,
4503
+ require_outputs=True,
4504
+ separate_stderr=True)
4505
+ subprocess_utils.handle_returncode(returncode, code,
4506
+ 'Failed to sync down logs.',
4507
+ stderr)
4508
+ job_ids = message_utils.decode_payload(job_ids_payload)
3959
4509
  if not job_ids:
3960
4510
  logger.info(f'{colorama.Fore.YELLOW}'
3961
4511
  'No matching job found'
@@ -3974,20 +4524,48 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3974
4524
  # list should aready be in descending order
3975
4525
  job_id = job_ids[0]
3976
4526
 
3977
- # get the run_timestamp
3978
- # the function takes in [job_id]
3979
- code = job_lib.JobLibCodeGen.get_run_timestamp_with_globbing(
3980
- [str(job_id)])
3981
- returncode, run_timestamps, stderr = self.run_on_head(
3982
- handle,
3983
- code,
3984
- stream_logs=False,
3985
- require_outputs=True,
3986
- separate_stderr=True)
3987
- subprocess_utils.handle_returncode(returncode, code,
3988
- 'Failed to sync logs.', stderr)
3989
- # returns with a dict of {job_id: run_timestamp}
3990
- run_timestamps = message_utils.decode_payload(run_timestamps)
4527
+ if isinstance(handle, LocalResourcesHandle):
4528
+ # In consolidation mode, we don't submit a ray job, therefore no
4529
+ # run_timestamp is available. We use a dummy run_timestamp here.
4530
+ run_timestamps = {
4531
+ job_id: f'managed-jobs-consolidation-mode-{job_id}'
4532
+ }
4533
+ else:
4534
+ # get the run_timestamp
4535
+ # the function takes in [job_id]
4536
+ use_legacy = not handle.is_grpc_enabled_with_flag
4537
+ if not use_legacy:
4538
+ try:
4539
+ log_dirs_request = jobsv1_pb2.GetLogDirsForJobsRequest(
4540
+ job_ids=[job_id])
4541
+ log_dirs_response = (
4542
+ backend_utils.invoke_skylet_with_retries(
4543
+ lambda: SkyletClient(handle.get_grpc_channel(
4544
+ )).get_log_dirs_for_jobs(log_dirs_request)))
4545
+ job_log_dirs = log_dirs_response.job_log_dirs
4546
+ # Convert back to the expected format
4547
+ # {job_id: run_timestamp}
4548
+ run_timestamps = {}
4549
+ for jid, log_dir in job_log_dirs.items():
4550
+ run_timestamps[int(jid)] = log_dir
4551
+ except exceptions.SkyletMethodNotImplementedError:
4552
+ use_legacy = True
4553
+
4554
+ if use_legacy:
4555
+ code = job_lib.JobLibCodeGen.get_log_dirs_for_jobs(
4556
+ [str(job_id)])
4557
+ returncode, run_timestamps_payload, stderr = self.run_on_head(
4558
+ handle,
4559
+ code,
4560
+ stream_logs=False,
4561
+ require_outputs=True,
4562
+ separate_stderr=True)
4563
+ subprocess_utils.handle_returncode(returncode, code,
4564
+ 'Failed to sync logs.',
4565
+ stderr)
4566
+ # returns with a dict of {job_id: run_timestamp}
4567
+ run_timestamps = message_utils.decode_payload(
4568
+ run_timestamps_payload)
3991
4569
  if not run_timestamps:
3992
4570
  logger.info(f'{colorama.Fore.YELLOW}'
3993
4571
  'No matching log directories found'
@@ -3996,11 +4574,19 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3996
4574
 
3997
4575
  run_timestamp = list(run_timestamps.values())[0]
3998
4576
  job_id = list(run_timestamps.keys())[0]
4577
+
4578
+ # If run_timestamp contains the full path with SKY_LOGS_DIRECTORY,
4579
+ # strip the prefix to get just the relative part to avoid duplication
4580
+ # when constructing local paths.
4581
+ if run_timestamp.startswith(constants.SKY_LOGS_DIRECTORY):
4582
+ run_timestamp = run_timestamp[len(constants.SKY_LOGS_DIRECTORY
4583
+ ):].lstrip('/')
3999
4584
  local_log_dir = ''
4000
4585
  if controller: # download controller logs
4001
4586
  remote_log = os.path.join(managed_jobs.JOBS_CONTROLLER_LOGS_DIR,
4002
4587
  f'{job_id}.log')
4003
- local_log_dir = os.path.join(local_dir, run_timestamp)
4588
+ local_log_dir = os.path.join(local_dir, 'managed_jobs',
4589
+ run_timestamp)
4004
4590
  os.makedirs(os.path.dirname(os.path.expanduser(local_log_dir)),
4005
4591
  exist_ok=True)
4006
4592
 
@@ -4046,11 +4632,12 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
4046
4632
  exist_ok=True)
4047
4633
  log_file = os.path.join(local_log_dir, 'run.log')
4048
4634
 
4049
- code = managed_jobs.ManagedJobCodeGen.stream_logs(job_name=None,
4050
- job_id=job_id,
4051
- follow=False,
4052
- controller=False)
4053
-
4635
+ # TODO(kevin): Migrate stream_logs to gRPC
4636
+ code = managed_jobs.ManagedJobCodeGen.stream_logs(
4637
+ job_name=None,
4638
+ job_id=int(job_id),
4639
+ follow=False,
4640
+ controller=False)
4054
4641
  # With the stdin=subprocess.DEVNULL, the ctrl-c will not
4055
4642
  # kill the process, so we need to handle it manually here.
4056
4643
  if threading.current_thread() is threading.main_thread():
@@ -4091,6 +4678,15 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
4091
4678
  Raises:
4092
4679
  RuntimeError: If the cluster fails to be terminated/stopped.
4093
4680
  """
4681
+ try:
4682
+ handle.close_skylet_ssh_tunnel()
4683
+ except Exception as e: # pylint: disable=broad-except
4684
+ # Not critical to the cluster teardown, just log a warning.
4685
+ logger.warning(
4686
+ 'Failed to close Skylet SSH tunnel for cluster '
4687
+ f'{handle.cluster_name}: '
4688
+ f'{common_utils.format_exception(e, use_bracket=True)}')
4689
+
4094
4690
  exclude_request_to_kill = 'sky.down' if terminate else 'sky.stop'
4095
4691
  # We have to kill the cluster requests again within the lock, because
4096
4692
  # any pending requests on the same cluster should be cancelled after
@@ -4116,7 +4712,19 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
4116
4712
  prev_cluster_status, _ = (
4117
4713
  backend_utils.refresh_cluster_status_handle(
4118
4714
  handle.cluster_name,
4119
- acquire_per_cluster_status_lock=False))
4715
+ # There is a case where
4716
+ # 1. The cluster was interrupted during provisioning.
4717
+ # 2. The API request to create the cluster instances was
4718
+ # sent to the cloud, but hasn't been processed yet.
4719
+ # In this case, the cluster will be INIT. We should do a
4720
+ # hard status refresh to see if the instances are
4721
+ # actually there or not. Otherwise, teardown may not
4722
+ # find the instances, leading to a leak. This was
4723
+ # observed in AWS. See also
4724
+ # _LAUNCH_DOUBLE_CHECK_WINDOW in backend_utils.py.
4725
+ force_refresh_statuses={status_lib.ClusterStatus.INIT},
4726
+ cluster_lock_already_held=True,
4727
+ retry_if_missing=False))
4120
4728
  cluster_status_fetched = True
4121
4729
  except exceptions.ClusterStatusFetchingError:
4122
4730
  logger.warning(
@@ -4124,10 +4732,9 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
4124
4732
  f'{handle.cluster_name!r}. Assuming the cluster is still '
4125
4733
  'up.')
4126
4734
  if not cluster_status_fetched:
4127
- record = global_user_state.get_cluster_from_name(
4735
+ status = global_user_state.get_status_from_cluster_name(
4128
4736
  handle.cluster_name)
4129
- prev_cluster_status = record[
4130
- 'status'] if record is not None else None
4737
+ prev_cluster_status = status if status is not None else None
4131
4738
  if prev_cluster_status is None:
4132
4739
  # When the cluster is not in the cluster table, we guarantee that
4133
4740
  # all related resources / cache / config are cleaned up, i.e. it
@@ -4148,8 +4755,9 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
4148
4755
  log_path = os.path.join(os.path.expanduser(self.log_dir),
4149
4756
  'teardown.log')
4150
4757
  log_abs_path = os.path.abspath(log_path)
4151
- cloud = handle.launched_resources.cloud
4152
- config = common_utils.read_yaml(handle.cluster_yaml)
4758
+ launched_resources = handle.launched_resources.assert_launchable()
4759
+ cloud = launched_resources.cloud
4760
+ config = global_user_state.get_cluster_yaml_dict(handle.cluster_yaml)
4153
4761
  cluster_name = handle.cluster_name
4154
4762
  cluster_name_on_cloud = handle.cluster_name_on_cloud
4155
4763
 
@@ -4209,7 +4817,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
4209
4817
  from sky.adaptors import ibm
4210
4818
  from sky.skylet.providers.ibm.vpc_provider import IBMVPCProvider
4211
4819
 
4212
- config_provider = common_utils.read_yaml(
4820
+ config_provider = global_user_state.get_cluster_yaml_dict(
4213
4821
  handle.cluster_yaml)['provider']
4214
4822
  region = config_provider['region']
4215
4823
  search_client = ibm.search_client()
@@ -4238,36 +4846,13 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
  # successfully removed cluster as no exception was raised
  returncode = 0
 
- elif terminate and isinstance(cloud, clouds.SCP):
- # pylint: disable=import-outside-toplevel
- from sky.skylet.providers.scp import node_provider
- config['provider']['cache_stopped_nodes'] = not terminate
- provider = node_provider.SCPNodeProvider(config['provider'],
- cluster_name_on_cloud)
- try:
- if not os.path.exists(provider.metadata.path):
- raise node_provider.SCPError(
- 'SKYPILOT_ERROR_NO_NODES_LAUNCHED: '
- 'Metadata file does not exist.')
-
- with open(provider.metadata.path, 'r', encoding='utf-8') as f:
- metadata = json.load(f)
- node_id = next(iter(metadata.values())).get(
- 'creation', {}).get('virtualServerId', None)
- provider.terminate_node(node_id)
- returncode = 0
- except node_provider.SCPError as e:
- returncode = 1
- stdout = ''
- stderr = str(e)
-
  else:
  config['provider']['cache_stopped_nodes'] = not terminate
  with tempfile.NamedTemporaryFile('w',
  prefix='sky_',
  delete=False,
  suffix='.yml') as f:
- common_utils.dump_yaml(f.name, config)
+ yaml_utils.dump_yaml(f.name, config)
  f.flush()
 
  teardown_verb = 'Terminating' if terminate else 'Stopping'
@@ -4322,12 +4907,17 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
  handle: CloudVmRayResourceHandle,
  terminate: bool,
  purge: bool = False,
- remove_from_db: bool = True) -> None:
+ remove_from_db: bool = True,
+ failover: bool = False) -> None:
  """Cleanup local configs/caches and delete TPUs after teardown.
 
  This method will handle the following cleanup steps:
  * Deleting the TPUs;
  * Removing ssh configs for the cluster;
+ * Deleting the open ports;
+ * Deleting the custom multi network infrastructure based on the
+ failover flag (e.g. delete firewalls, subnets, and VPCs for GPU
+ Direct if failover is False, otherwise, only delete the subnets);
  * Updating the local state of the cluster;
  * Removing the terminated cluster's scripts and ray yaml files.
  """
@@ -4359,19 +4949,24 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
  # The cluster yaml does not exist when skypilot has not found
  # the right resource to provision the cluster.
  if handle.cluster_yaml is not None:
+ launched_resources = (
+ handle.launched_resources.assert_launchable())
+ cloud = launched_resources.cloud
+ config = global_user_state.get_cluster_yaml_dict(
+ handle.cluster_yaml)
+ ports_cleaned_up = False
+ custom_multi_network_cleaned_up = False
  try:
- cloud = handle.launched_resources.cloud
- config = common_utils.read_yaml(handle.cluster_yaml)
  cloud.check_features_are_supported(
- handle.launched_resources,
+ launched_resources,
  {clouds.CloudImplementationFeatures.OPEN_PORTS})
  provision_lib.cleanup_ports(repr(cloud),
  cluster_name_on_cloud,
  handle.launched_resources.ports,
  config['provider'])
- self.remove_cluster_config(handle)
+ ports_cleaned_up = True
  except exceptions.NotSupportedError:
- pass
+ ports_cleaned_up = True
  except exceptions.PortDoesNotExistError:
  logger.debug('Ports do not exist. Skipping cleanup.')
  except Exception as e: # pylint: disable=broad-except
@@ -4383,8 +4978,43 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
  else:
  raise
 
- sky.utils.cluster_utils.SSHConfigHelper.remove_cluster(
- handle.cluster_name)
+ # Clean up custom multi networks, e.g. the subnets, firewalls,
+ # and VPCs created for GCP GPUDirect TCPX
+ try:
+ cloud.check_features_are_supported(
+ handle.launched_resources, {
+ clouds.CloudImplementationFeatures.
+ CUSTOM_MULTI_NETWORK
+ })
+ provision_lib.cleanup_custom_multi_network(
+ repr(cloud), cluster_name_on_cloud, config['provider'],
+ failover)
+ custom_multi_network_cleaned_up = True
+ except exceptions.NotSupportedError:
+ custom_multi_network_cleaned_up = True
+ except Exception as e: # pylint: disable=broad-except
+ if purge:
+ msg = common_utils.format_exception(e, use_bracket=True)
+ logger.warning(
+ f'Failed to cleanup custom multi network. Skipping '
+ f'since purge is set. Details: {msg}')
+ else:
+ raise
+
+ if ports_cleaned_up and custom_multi_network_cleaned_up:
+ try:
+ self.remove_cluster_config(handle)
+ except Exception as e: # pylint: disable=broad-except
+ if purge:
+ msg = common_utils.format_exception(
+ e, use_bracket=True)
+ logger.warning(
+ f'Failed to remove cluster config. Skipping '
+ f'since purge is set. Details: {msg}')
+ else:
+ raise
+
+ cluster_utils.SSHConfigHelper.remove_cluster(handle.cluster_name)
 
  def _detect_abnormal_non_terminated_nodes(
  handle: CloudVmRayResourceHandle) -> None:
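Note: the two hunks above turn post-teardown cleanup into a best-effort sequence: each step records whether it succeeded (or was a no-op because the cloud does not support the feature), failures are only tolerated when purge is set, and the cluster config is removed only after all prior steps completed. A generic, runnable sketch of that pattern (names here are illustrative, not the SkyPilot method):

    import logging
    from typing import Callable, List

    logger = logging.getLogger(__name__)


    def run_cleanup_steps(steps: List[Callable[[], None]], purge: bool) -> bool:
        """Run cleanup steps, tolerating failures only when purge is set.

        Returns True only if every step completed (or was skipped as
        unsupported); follow-up cleanup such as removing the cluster
        config should run only in that case.
        """
        all_ok = True
        for step in steps:
            try:
                step()
            except NotImplementedError:
                # Analogous to NotSupportedError in the diff: nothing to do.
                continue
            except Exception as e:  # pylint: disable=broad-except
                if purge:
                    logger.warning('Cleanup step failed; skipping since '
                                   f'purge is set. Details: {e}')
                    all_ok = False
                else:
                    raise
        return all_ok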
@@ -4400,18 +5030,22 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
  # https://github.com/skypilot-org/skypilot/pull/4443#discussion_r1872798032
  attempts = 0
  while True:
- config = common_utils.read_yaml(handle.cluster_yaml)
+ config = global_user_state.get_cluster_yaml_dict(
+ handle.cluster_yaml)
 
  logger.debug(f'instance statuses attempt {attempts + 1}')
  node_status_dict = provision_lib.query_instances(
  repr(cloud),
+ handle.cluster_name,
  cluster_name_on_cloud,
  config['provider'],
  non_terminated_only=False)
 
  unexpected_node_state: Optional[Tuple[str, str]] = None
- for node_id, node_status in node_status_dict.items():
- logger.debug(f'{node_id} status: {node_status}')
+ for node_id, node_status_tuple in node_status_dict.items():
+ node_status, reason = node_status_tuple
+ reason = '' if reason is None else f' ({reason})'
+ logger.debug(f'{node_id} status: {node_status}{reason}')
  # FIXME(cooperc): Some clouds (e.g. GCP) do not distinguish
  # between "stopping/stopped" and "terminating/terminated",
  # so we allow for either status instead of casing on
@@ -4456,13 +5090,18 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
 
  def remove_cluster_config(self, handle: CloudVmRayResourceHandle) -> None:
  """Remove the YAML config of a cluster."""
+ cluster_yaml_path = handle.cluster_yaml
  handle.cluster_yaml = None
  global_user_state.update_cluster_handle(handle.cluster_name, handle)
- common_utils.remove_file_if_exists(handle.cluster_yaml)
+ # Removing the cluster YAML can cause some unexpected stability issues.
+ # See #5011.
+ # global_user_state.remove_cluster_yaml(handle.cluster_name)
+ common_utils.remove_file_if_exists(cluster_yaml_path)
 
  def set_autostop(self,
  handle: CloudVmRayResourceHandle,
  idle_minutes_to_autostop: Optional[int],
+ wait_for: Optional[autostop_lib.AutostopWaitFor],
  down: bool = False,
  stream_logs: bool = True) -> None:
  # The core.autostop() function should have already checked that the
@@ -4489,6 +5128,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
 
  # down = False is the default, but warn the user in case
  # they have explicitly specified it.
+ # TODO(cooperc): Fix for new autostop stuff.
  config_override_down = skypilot_config.get_nested(
  (controller.value.controller_type, 'controller',
  'autostop', 'down'), None)
@@ -4508,17 +5148,26 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
  # Check if we're stopping spot
  assert (handle.launched_resources is not None and
  handle.launched_resources.cloud is not None), handle
- code = autostop_lib.AutostopCodeGen.set_autostop(
- idle_minutes_to_autostop, self.NAME, down)
- returncode, _, stderr = self.run_on_head(handle,
- code,
- require_outputs=True,
- stream_logs=stream_logs)
- subprocess_utils.handle_returncode(returncode,
- code,
- 'Failed to set autostop',
- stderr=stderr,
- stream_logs=stream_logs)
+ if handle.is_grpc_enabled_with_flag:
+ request = autostopv1_pb2.SetAutostopRequest(
+ idle_minutes=idle_minutes_to_autostop,
+ backend=self.NAME,
+ wait_for=wait_for.to_protobuf() if wait_for is not None else
+ autostopv1_pb2.AUTOSTOP_WAIT_FOR_UNSPECIFIED,
+ down=down,
+ )
+ backend_utils.invoke_skylet_with_retries(lambda: SkyletClient(
+ handle.get_grpc_channel()).set_autostop(request))
+ else:
+ code = autostop_lib.AutostopCodeGen.set_autostop(
+ idle_minutes_to_autostop, self.NAME, wait_for, down)
+ returncode, _, stderr = self.run_on_head(
+ handle, code, require_outputs=True, stream_logs=stream_logs)
+ subprocess_utils.handle_returncode(returncode,
+ code,
+ 'Failed to set autostop',
+ stderr=stderr,
+ stream_logs=stream_logs)
  global_user_state.set_cluster_autostop_value(
  handle.cluster_name, idle_minutes_to_autostop, down)
 
@@ -4543,22 +5192,33 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
  # The head node of the cluster is not UP or in an abnormal state.
  # We cannot check if the cluster is autostopping.
  return False
- code = autostop_lib.AutostopCodeGen.is_autostopping()
- returncode, stdout, stderr = self.run_on_head(handle,
- code,
- require_outputs=True,
- stream_logs=stream_logs)
-
- if returncode == 0:
- return message_utils.decode_payload(stdout)
- logger.debug('Failed to check if cluster is autostopping with '
- f'{returncode}: {stdout+stderr}\n'
- f'Command: {code}')
- return False
+ if handle.is_grpc_enabled_with_flag:
+ try:
+ request = autostopv1_pb2.IsAutostoppingRequest()
+ response = backend_utils.invoke_skylet_with_retries(
+ lambda: SkyletClient(handle.get_grpc_channel()
+ ).is_autostopping(request))
+ return response.is_autostopping
+ except Exception as e: # pylint: disable=broad-except
+ # The cluster may have been terminated, causing the gRPC call
+ # to timeout and fail.
+ logger.debug(f'Failed to check if cluster is autostopping: {e}')
+ return False
+ else:
+ code = autostop_lib.AutostopCodeGen.is_autostopping()
+ returncode, stdout, stderr = self.run_on_head(
+ handle, code, require_outputs=True, stream_logs=stream_logs)
+ if returncode == 0:
+ return message_utils.decode_payload(stdout)
+ logger.debug('Failed to check if cluster is autostopping with '
+ f'{returncode}: {stdout+stderr}\n'
+ f'Command: {code}')
+ return False
 
  # TODO(zhwu): Refactor this to a CommandRunner class, so different backends
  # can support its own command runner.
  @timeline.event
+ @context_utils.cancellation_guard
  def run_on_head(
  self,
  handle: CloudVmRayResourceHandle,
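Note: the autostop hunks above introduce a dual path: clusters whose handle reports is_grpc_enabled_with_flag talk to the Skylet over gRPC (with invoke_skylet_with_retries wrapping transient failures), while older clusters fall back to AutostopCodeGen executed over SSH. A simplified, runnable sketch of that dispatch shape, with a toy retry helper standing in for the real backend_utils wrapper (all names here are illustrative):

    import time
    from typing import Callable, TypeVar

    T = TypeVar('T')


    def invoke_with_retries(fn: Callable[[], T],
                            max_attempts: int = 3,
                            backoff_seconds: float = 1.0) -> T:
        """Simplified analogue of a retry wrapper around a flaky RPC."""
        for attempt in range(1, max_attempts + 1):
            try:
                return fn()
            except Exception:  # pylint: disable=broad-except
                if attempt == max_attempts:
                    raise
                time.sleep(backoff_seconds * attempt)
        raise AssertionError('unreachable')


    def set_autostop(grpc_enabled: bool,
                     grpc_call: Callable[[], None],
                     codegen_call: Callable[[], None]) -> None:
        """Prefer the gRPC fast path when the cluster supports it;
        otherwise fall back to generated code run over SSH."""
        if grpc_enabled:
            invoke_with_retries(grpc_call)
        else:
            codegen_call()

Keeping a single public entry point and branching on the capability flag lets old clusters keep working while new clusters get the faster path.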
@@ -4649,7 +5309,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
  exceptions.InvalidClusterNameError: If the cluster name is invalid.
  # TODO(zhwu): complete the list of exceptions.
  """
- record = global_user_state.get_cluster_from_name(cluster_name)
+ record = global_user_state.get_cluster_from_name(
+ cluster_name, include_user_info=False, summary_response=True)
  if record is None:
  handle_before_refresh = None
  status_before_refresh = None
@@ -4657,6 +5318,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
  handle_before_refresh = record['handle']
  status_before_refresh = record['status']
 
+ handle: Optional[CloudVmRayResourceHandle]
  prev_cluster_status, handle = (status_before_refresh,
  handle_before_refresh)
 
@@ -4668,7 +5330,9 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
  record = backend_utils.refresh_cluster_record(
  cluster_name,
  force_refresh_statuses={status_lib.ClusterStatus.INIT},
- acquire_per_cluster_status_lock=False,
+ cluster_lock_already_held=True,
+ include_user_info=False,
+ summary_response=True,
  )
  if record is not None:
  prev_cluster_status = record['status']
@@ -4677,7 +5341,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
  prev_cluster_status = None
  handle = None
  # We should check the cluster_ever_up after refresh, because if the
- # cluster is terminated (through console or auto-dwon), the record will
+ # cluster is terminated (through console or auto-down), the record will
  # become None and the cluster_ever_up should be considered as False.
  cluster_ever_up = record is not None and record['cluster_ever_up']
  prev_config_hash = record['config_hash'] if record is not None else None
@@ -4690,16 +5354,22 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
  self.check_resources_fit_cluster(handle, task)
  # Use the existing cluster.
  assert handle.launched_resources is not None, (cluster_name, handle)
+ # Take a random resource in order to get resource info that applies
+ # to all resources.
+ one_task_resource = list(task.resources)[0]
+
  # Assume resources share the same ports.
  for resource in task.resources:
- assert resource.ports == list(task.resources)[0].ports
+ assert resource.ports == one_task_resource.ports
  requested_ports_set = resources_utils.port_ranges_to_set(
- list(task.resources)[0].ports)
+ one_task_resource.ports)
  current_ports_set = resources_utils.port_ranges_to_set(
  handle.launched_resources.ports)
  all_ports = resources_utils.port_set_to_ranges(current_ports_set |
  requested_ports_set)
  to_provision = handle.launched_resources
+ assert to_provision is not None
+ to_provision = to_provision.assert_launchable()
  if (to_provision.cloud.OPEN_PORTS_VERSION <=
  clouds.OpenPortsVersion.LAUNCH_ONLY):
  if not requested_ports_set <= current_ports_set:
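Note: the port handling above merges the task's requested port ranges with the ports the cluster already has open by converting both to integer sets, taking the union, and converting back to ranges. A self-contained sketch of that round trip; the two helpers below are simplified stand-ins for the resources_utils functions named in the diff, not their actual implementations:

    from typing import List, Set


    def port_ranges_to_set(ranges: List[str]) -> Set[int]:
        """Expand range strings like '8080' or '10022-10025' into a set."""
        ports: Set[int] = set()
        for r in ranges:
            if '-' in r:
                start, end = (int(x) for x in r.split('-'))
                ports.update(range(start, end + 1))
            else:
                ports.add(int(r))
        return ports


    def port_set_to_ranges(ports: Set[int]) -> List[str]:
        """Collapse a set of ports back into compact range strings."""
        ranges: List[str] = []
        sorted_ports = sorted(ports)
        i = 0
        while i < len(sorted_ports):
            j = i
            while (j + 1 < len(sorted_ports) and
                   sorted_ports[j + 1] == sorted_ports[j] + 1):
                j += 1
            if i == j:
                ranges.append(str(sorted_ports[i]))
            else:
                ranges.append(f'{sorted_ports[i]}-{sorted_ports[j]}')
            i = j + 1
        return ranges


    existing = port_ranges_to_set(['8080', '10022-10025'])
    requested = port_ranges_to_set(['8080-8082'])
    print(port_set_to_ranges(existing | requested))
    # ['8080-8082', '10022-10025']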
@@ -4713,6 +5383,57 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
  'a new cluster with the desired ports open.')
  if all_ports:
  to_provision = to_provision.copy(ports=all_ports)
+ # Docker login should always be the same for all resources, since
+ # it's set from envs.
+ for resource in task.resources:
+ assert (resource.docker_login_config ==
+ one_task_resource.docker_login_config), (
+ resource.docker_login_config,
+ one_task_resource.docker_login_config)
+ # If we have docker login config in the new task, override the
+ # existing resources to pick up new credentials. This allows the
+ # user to specify new or fixed credentials if the existing
+ # credentials are not working. If we don't do this, the credentials
+ # from the existing resources will always be reused.
+ if one_task_resource.docker_login_config is not None:
+ to_provision = to_provision.copy(
+ _docker_login_config=one_task_resource.docker_login_config)
+
+ # cluster_config_overrides should be the same for all resources.
+ for resource in task.resources:
+ assert (resource.cluster_config_overrides ==
+ one_task_resource.cluster_config_overrides)
+ if isinstance(to_provision.cloud, clouds.Kubernetes):
+ # Warn users if the Kubernetes pod config is different
+ # from the existing cluster.
+ cluster_yaml_str = global_user_state.get_cluster_yaml_str(
+ cluster_name)
+ actual_cluster_yaml_obj = yaml_utils.safe_load(cluster_yaml_str)
+ desired_cluster_yaml_obj = (
+ kubernetes_utils.combine_pod_config_fields_and_metadata(
+ actual_cluster_yaml_obj,
+ cluster_config_overrides=one_task_resource.
+ cluster_config_overrides,
+ cloud=to_provision.cloud,
+ context=to_provision.region))
+
+ def _get_pod_config(yaml_obj: Dict[str, Any]) -> Dict[str, Any]:
+ return (yaml_obj.get('available_node_types',
+ {}).get('ray_head_default',
+ {}).get('node_config', {}))
+
+ if _get_pod_config(desired_cluster_yaml_obj) != _get_pod_config(
+ actual_cluster_yaml_obj):
+ # pylint: disable=line-too-long
+ logger.warning(
+ f'{colorama.Fore.YELLOW}WARNING: Kubernetes pod config mismatch detected. Task requires different '
+ f'pod config than the existing cluster. The existing '
+ f'cluster will be used with its current pod config.'
+ f'To apply use your task\'s new pod config:\n'
+ f' • Use a new cluster'
+ f' • Or restart this cluster: sky down {cluster_name}; sky launch -c {cluster_name} ...'
+ f'{colorama.Style.RESET_ALL}')
+
  return RetryingVmProvisioner.ToProvisionConfig(
  cluster_name,
  to_provision,
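Note: the Kubernetes check above compares only the head node's pod spec, pulled out of the generated cluster YAML with a nested lookup (available_node_types -> ray_head_default -> node_config), and warns on any difference. A small standalone sketch of that comparison; the helper names are illustrative, but the YAML layout mirrors the one shown in the hunk:

    from typing import Any, Dict


    def get_pod_config(cluster_yaml_obj: Dict[str, Any]) -> Dict[str, Any]:
        """Pull the head-node pod spec out of a generated cluster YAML."""
        return (cluster_yaml_obj.get('available_node_types',
                                     {}).get('ray_head_default',
                                             {}).get('node_config', {}))


    def pod_config_mismatch(desired: Dict[str, Any],
                            actual: Dict[str, Any]) -> bool:
        """True when the task's desired pod config differs from the one
        the existing cluster was launched with (illustrative helper)."""
        return get_pod_config(desired) != get_pod_config(actual)

The comparison is deliberately coarse (whole-dict equality), which is enough to decide whether to print the warning without trying to diff individual pod fields.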
@@ -4727,33 +5448,41 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
  common_utils.check_cluster_name_is_valid(cluster_name)
 
  if to_provision is None:
- # The cluster is recently terminated either by autostop or manually
- # terminated on the cloud. We should use the previously terminated
- # resources to provision the cluster.
- #
- # FIXME(zongheng): this assert can be hit by using two terminals.
- # First, create a 'dbg' cluster. Then:
- # Terminal 1: sky down dbg -y
- # Terminal 2: sky launch -c dbg -- echo
- # Run it in order. Terminal 2 will show this error after terminal 1
- # succeeds in downing the cluster and releasing the lock.
- assert isinstance(
- handle_before_refresh, CloudVmRayResourceHandle), (
- f'Trying to launch cluster {cluster_name!r} recently '
- 'terminated on the cloud, but the handle is not a '
- f'CloudVmRayResourceHandle ({handle_before_refresh}).')
- status_before_refresh_str = None
- if status_before_refresh is not None:
- status_before_refresh_str = status_before_refresh.value
-
- logger.info(
- f'The cluster {cluster_name!r} (status: '
- f'{status_before_refresh_str}) was not found on the cloud: it '
- 'may be autodowned, manually terminated, or its launch never '
- 'succeeded. Provisioning a new cluster by using the same '
- 'resources as its original launch.')
- to_provision = handle_before_refresh.launched_resources
- self.check_resources_fit_cluster(handle_before_refresh, task)
+ # Recently terminated after refresh. OPTIMIZE usually ran outside
+ # the lock, so that decision may be stale by now. Under the lock,
+ # ensure we always have a concrete plan via the following order:
+ # 1) Reuse last placement snapshot (if available);
+ # 2) Else, call injected planner for a fresh plan.
+ # If we still have a pre-refresh handle snapshot with a concrete
+ # placement, prefer reusing it.
+ if (isinstance(handle_before_refresh, CloudVmRayResourceHandle) and
+ handle_before_refresh.launched_resources is not None):
+ to_provision = handle_before_refresh.launched_resources
+ # Ensure the requested task fits the previous placement.
+ self.check_resources_fit_cluster(handle_before_refresh, task)
+ # Mirror the original message for reuse path.
+ status_before_refresh_str = None
+ if status_before_refresh is not None:
+ status_before_refresh_str = status_before_refresh.value
+ logger.info(
+ f'The cluster {cluster_name!r} (status: '
+ f'{status_before_refresh_str}) was not found on the cloud: '
+ 'it may be autodowned, manually terminated, or its launch '
+ 'never succeeded. Provisioning a new cluster by using the '
+ 'same resources as its original launch.')
+ elif self._planner is not None:
+ to_provision = self._planner(task)
+ logger.info(
+ 'Previous placement snapshot missing; computing a fresh '
+ 'plan for provisioning.')
+ else:
+ # Without a snapshot or planner, we cannot proceed safely.
+ # Surface a user-friendly error without a long traceback.
+ with ux_utils.print_exception_no_traceback():
+ raise RuntimeError(
+ 'No concrete launch plan available after recent cloud '
+ f'termination of cluster {cluster_name!r}. Ensure the '
+ 'OPTIMIZE stage runs or provide concrete resources.')
 
  return RetryingVmProvisioner.ToProvisionConfig(
  cluster_name,
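Note: the hunk above replaces a hard assert with a three-step fallback for the case where the cluster vanished between planning and launching: reuse the last placement snapshot, else ask the injected planner for a fresh plan, else fail with a clear error. A minimal sketch of that ordering, with illustrative names rather than SkyPilot's own types:

    from typing import Callable, Optional


    def resolve_launch_plan(snapshot_resources: Optional[object],
                            planner: Optional[Callable[[], object]]) -> object:
        """Pick a concrete provisioning plan after a cluster vanished.

        Order mirrors the diff: reuse the last placement snapshot when one
        exists, otherwise ask an injected planner for a fresh plan,
        otherwise raise a clear, user-facing error.
        """
        if snapshot_resources is not None:
            return snapshot_resources
        if planner is not None:
            return planner()
        raise RuntimeError('No concrete launch plan available; ensure the '
                           'OPTIMIZE stage runs or provide concrete '
                           'resources.')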
@@ -5033,18 +5762,17 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
  # reconstruct them during cluster restart.
  continue
  storage_mounts_metadata[dst] = storage_obj.handle
- lock_path = (
- backend_utils.CLUSTER_FILE_MOUNTS_LOCK_PATH.format(cluster_name))
+ lock_id = backend_utils.cluster_file_mounts_lock_id(cluster_name)
  lock_timeout = backend_utils.CLUSTER_FILE_MOUNTS_LOCK_TIMEOUT_SECONDS
  try:
- with filelock.FileLock(lock_path, lock_timeout):
+ with locks.get_lock(lock_id, lock_timeout):
  global_user_state.set_cluster_storage_mounts_metadata(
  cluster_name, storage_mounts_metadata)
- except filelock.Timeout as e:
+ except locks.LockTimeout as e:
  raise RuntimeError(
  f'Failed to store metadata for cluster {cluster_name!r} due to '
  'a timeout when trying to access local database. Please '
- f'try again or manually remove the lock at {lock_path}. '
+ f'try again or manually remove the lock at {lock_id}. '
  f'{common_utils.format_exception(e)}') from None
 
  def get_storage_mounts_metadata(
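Note: the hunks above (and the matching ones below) swap filelock.FileLock for a locks abstraction keyed by a lock id rather than a filesystem path. A short sketch of the calling pattern, using only the names visible in the diff; the module path for locks is an assumption and the wrapper function is illustrative:

    from sky.backends import backend_utils
    from sky.utils import locks  # module path assumed


    def update_metadata_under_lock(cluster_name: str, update_fn) -> None:
        """Serialize a state update behind the shared lock abstraction."""
        lock_id = backend_utils.cluster_file_mounts_lock_id(cluster_name)
        lock_timeout = backend_utils.CLUSTER_FILE_MOUNTS_LOCK_TIMEOUT_SECONDS
        try:
            with locks.get_lock(lock_id, lock_timeout):
                update_fn()
        except locks.LockTimeout as e:
            raise RuntimeError(
                f'Timed out waiting for lock {lock_id!r}: {e}') from None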
@@ -5055,19 +5783,18 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
  After retrieving storage_mounts_metadata, it converts back the
  StorageMetadata to Storage object and restores 'storage_mounts.'
  """
- lock_path = (
- backend_utils.CLUSTER_FILE_MOUNTS_LOCK_PATH.format(cluster_name))
+ lock_id = backend_utils.cluster_file_mounts_lock_id(cluster_name)
  lock_timeout = backend_utils.CLUSTER_FILE_MOUNTS_LOCK_TIMEOUT_SECONDS
  try:
- with filelock.FileLock(lock_path, lock_timeout):
+ with locks.get_lock(lock_id, lock_timeout):
  storage_mounts_metadata = (
  global_user_state.get_cluster_storage_mounts_metadata(
  cluster_name))
- except filelock.Timeout as e:
+ except locks.LockTimeout as e:
  raise RuntimeError(
  f'Failed to retrieve metadata for cluster {cluster_name!r} '
  'due to a timeout when trying to access local database. '
- f'Please try again or manually remove the lock at {lock_path}.'
+ f'Please try again or manually remove the lock at {lock_id}.'
  f' {common_utils.format_exception(e)}') from None
 
  if storage_mounts_metadata is None:
@@ -5104,7 +5831,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
  def _get_task_env_vars(self, task: task_lib.Task, job_id: int,
  handle: CloudVmRayResourceHandle) -> Dict[str, str]:
  """Returns the environment variables for the task."""
- env_vars = task.envs.copy()
+ env_vars = task_lib.get_plaintext_envs_and_secrets(
+ task.envs_and_secrets)
  # If it is a managed job, the TASK_ID_ENV_VAR will have been already set
  # by the controller.
  if constants.TASK_ID_ENV_VAR not in env_vars:
@@ -5116,11 +5844,17 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
  env_vars.update(self._skypilot_predefined_env_vars(handle))
  return env_vars
 
+ def _get_managed_job_user_id(self, task: task_lib.Task) -> Optional[str]:
+ """Returns the user id for the managed job."""
+ if task.managed_job_dag is not None:
+ return task.envs[constants.USER_ID_ENV_VAR]
+ return None
+
  def _execute_task_one_node(self, handle: CloudVmRayResourceHandle,
  task: task_lib.Task, job_id: int,
- detach_run: bool) -> None:
+ remote_log_dir: str) -> None:
  # Launch the command as a Ray task.
- log_dir = os.path.join(self.log_dir, 'tasks')
+ log_dir = os.path.join(remote_log_dir, 'tasks')
 
  resources_dict = backend_utils.get_task_demands_dict(task)
  internal_ips = handle.internal_ips()
@@ -5128,9 +5862,9 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
 
  task_env_vars = self._get_task_env_vars(task, job_id, handle)
 
- codegen = RayCodeGen()
+ codegen = task_codegen.RayCodeGen()
  codegen.add_prologue(job_id)
- codegen.add_gang_scheduling_placement_group_and_setup(
+ codegen.add_setup(
  1,
  resources_dict,
  stable_cluster_internal_ips=internal_ips,
@@ -5139,36 +5873,32 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
  setup_log_path=os.path.join(log_dir, 'setup.log'),
  )
 
- if callable(task.run):
- run_fn_code = textwrap.dedent(inspect.getsource(task.run))
- run_fn_name = task.run.__name__
- codegen.register_run_fn(run_fn_code, run_fn_name)
-
- command_for_node = task.run if isinstance(task.run, str) else None
- codegen.add_ray_task(
- bash_script=command_for_node,
+ codegen.add_task(
+ 1,
+ bash_script=task.run,
  env_vars=task_env_vars,
  task_name=task.name,
- ray_resources_dict=backend_utils.get_task_demands_dict(task),
+ resources_dict=backend_utils.get_task_demands_dict(task),
  log_dir=log_dir)
 
  codegen.add_epilogue()
 
- self._exec_code_on_head(handle,
- codegen.build(),
- job_id,
- detach_run=detach_run,
- managed_job_dag=task.managed_job_dag)
+ self._exec_code_on_head(
+ handle,
+ codegen.build(),
+ job_id,
+ managed_job_dag=task.managed_job_dag,
+ managed_job_user_id=self._get_managed_job_user_id(task),
+ remote_log_dir=remote_log_dir)
 
  def _execute_task_n_nodes(self, handle: CloudVmRayResourceHandle,
  task: task_lib.Task, job_id: int,
- detach_run: bool) -> None:
+ remote_log_dir: str) -> None:
  # Strategy:
  # ray.init(...)
  # for node:
  # submit _run_cmd(cmd) with resource {node_i: 1}
- log_dir_base = self.log_dir
- log_dir = os.path.join(log_dir_base, 'tasks')
+ log_dir = os.path.join(remote_log_dir, 'tasks')
  resources_dict = backend_utils.get_task_demands_dict(task)
  internal_ips = handle.internal_ips()
  assert internal_ips is not None, 'internal_ips is not cached in handle'
@@ -5177,9 +5907,9 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
 
  num_actual_nodes = task.num_nodes * handle.num_ips_per_node
  task_env_vars = self._get_task_env_vars(task, job_id, handle)
 
- codegen = RayCodeGen()
+ codegen = task_codegen.RayCodeGen()
  codegen.add_prologue(job_id)
- codegen.add_gang_scheduling_placement_group_and_setup(
+ codegen.add_setup(
  num_actual_nodes,
  resources_dict,
@@ -5188,30 +5918,20 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
  setup_log_path=os.path.join(log_dir, 'setup.log'),
  )
 
- if callable(task.run):
- run_fn_code = textwrap.dedent(inspect.getsource(task.run))
- run_fn_name = task.run.__name__
- codegen.register_run_fn(run_fn_code, run_fn_name)
-
- # TODO(zhwu): The resources limitation for multi-node ray.tune and
- # horovod should be considered.
- for i in range(num_actual_nodes):
- command_for_node = task.run if isinstance(task.run, str) else None
-
- # Ray's per-node resources, to constrain scheduling each command to
- # the corresponding node, represented by private IPs.
- codegen.add_ray_task(
- bash_script=command_for_node,
- env_vars=task_env_vars,
- task_name=task.name,
- ray_resources_dict=backend_utils.get_task_demands_dict(task),
- log_dir=log_dir,
- gang_scheduling_id=i)
+ codegen.add_task(
+ num_actual_nodes,
+ bash_script=task.run,
+ env_vars=task_env_vars,
+ task_name=task.name,
+ resources_dict=backend_utils.get_task_demands_dict(task),
+ log_dir=log_dir)
 
  codegen.add_epilogue()
  # TODO(zhanghao): Add help info for downloading logs.
- self._exec_code_on_head(handle,
- codegen.build(),
- job_id,
- detach_run=detach_run,
- managed_job_dag=task.managed_job_dag)
+ self._exec_code_on_head(
+ handle,
+ codegen.build(),
+ job_id,
+ managed_job_dag=task.managed_job_dag,
+ managed_job_user_id=self._get_managed_job_user_id(task),
+ remote_log_dir=remote_log_dir)
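Note: the execution hunks above move code generation into task_codegen.RayCodeGen and collapse the per-node loop into a single add_task(num_nodes, ...) call, keeping the same builder discipline: prologue, setup, task, epilogue, then build one script. A toy, self-contained illustration of that builder pattern; this class is not the SkyPilot RayCodeGen and its argument lists are deliberately simplified:

    from typing import List


    class ToyCodeGen:
        """Toy builder mirroring the prologue/setup/task/epilogue calling
        order used by task_codegen.RayCodeGen in the hunks above."""

        def __init__(self) -> None:
            self._sections: List[str] = []

        def add_prologue(self, job_id: int) -> None:
            self._sections.append(f'# prologue for job {job_id}')

        def add_setup(self, num_nodes: int, setup_cmd: str) -> None:
            self._sections.append(
                f'# setup on {num_nodes} node(s)\n{setup_cmd}')

        def add_task(self, num_nodes: int, bash_script: str) -> None:
            self._sections.append(
                f'# run on {num_nodes} node(s)\n{bash_script}')

        def add_epilogue(self) -> None:
            self._sections.append('# epilogue: report job status')

        def build(self) -> str:
            return '\n'.join(self._sections)


    codegen = ToyCodeGen()
    codegen.add_prologue(job_id=1)
    codegen.add_setup(1, 'pip install -r requirements.txt')
    codegen.add_task(1, 'python train.py')
    codegen.add_epilogue()
    print(codegen.build())

Because add_task now takes the node count directly, single-node and multi-node execution share one code path instead of looping over per-node gang-scheduling ids.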