skypilot-nightly 1.0.0.dev20250509-py3-none-any.whl → 1.0.0.dev20251107-py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as they appear in a supported public registry. It is provided for informational purposes only.

Potentially problematic release. The registry flags this version of skypilot-nightly as potentially problematic; see the registry page for more details.

Files changed (512)
  1. sky/__init__.py +22 -6
  2. sky/adaptors/aws.py +25 -7
  3. sky/adaptors/common.py +24 -1
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/hyperbolic.py +8 -0
  7. sky/adaptors/kubernetes.py +149 -18
  8. sky/adaptors/nebius.py +170 -17
  9. sky/adaptors/primeintellect.py +1 -0
  10. sky/adaptors/runpod.py +68 -0
  11. sky/adaptors/seeweb.py +167 -0
  12. sky/adaptors/shadeform.py +89 -0
  13. sky/admin_policy.py +187 -4
  14. sky/authentication.py +179 -225
  15. sky/backends/__init__.py +4 -2
  16. sky/backends/backend.py +22 -9
  17. sky/backends/backend_utils.py +1299 -380
  18. sky/backends/cloud_vm_ray_backend.py +1715 -518
  19. sky/backends/docker_utils.py +1 -1
  20. sky/backends/local_docker_backend.py +11 -6
  21. sky/backends/wheel_utils.py +37 -9
  22. sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
  23. sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
  24. sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
  25. sky/{clouds/service_catalog → catalog}/common.py +89 -48
  26. sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
  27. sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
  28. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +30 -40
  29. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
  30. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +42 -15
  31. sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
  32. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
  33. sky/catalog/data_fetchers/fetch_nebius.py +335 -0
  34. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  35. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  36. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  37. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
  38. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
  39. sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
  40. sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
  41. sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
  42. sky/catalog/hyperbolic_catalog.py +136 -0
  43. sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
  44. sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
  45. sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
  46. sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
  47. sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
  48. sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
  49. sky/catalog/primeintellect_catalog.py +95 -0
  50. sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
  51. sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
  52. sky/catalog/seeweb_catalog.py +184 -0
  53. sky/catalog/shadeform_catalog.py +165 -0
  54. sky/catalog/ssh_catalog.py +167 -0
  55. sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
  56. sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
  57. sky/check.py +491 -203
  58. sky/cli.py +5 -6005
  59. sky/client/{cli.py → cli/command.py} +2477 -1885
  60. sky/client/cli/deprecation_utils.py +99 -0
  61. sky/client/cli/flags.py +359 -0
  62. sky/client/cli/table_utils.py +320 -0
  63. sky/client/common.py +70 -32
  64. sky/client/oauth.py +82 -0
  65. sky/client/sdk.py +1203 -297
  66. sky/client/sdk_async.py +833 -0
  67. sky/client/service_account_auth.py +47 -0
  68. sky/cloud_stores.py +73 -0
  69. sky/clouds/__init__.py +13 -0
  70. sky/clouds/aws.py +358 -93
  71. sky/clouds/azure.py +105 -83
  72. sky/clouds/cloud.py +127 -36
  73. sky/clouds/cudo.py +68 -50
  74. sky/clouds/do.py +66 -48
  75. sky/clouds/fluidstack.py +63 -44
  76. sky/clouds/gcp.py +339 -110
  77. sky/clouds/hyperbolic.py +293 -0
  78. sky/clouds/ibm.py +70 -49
  79. sky/clouds/kubernetes.py +563 -162
  80. sky/clouds/lambda_cloud.py +74 -54
  81. sky/clouds/nebius.py +206 -80
  82. sky/clouds/oci.py +88 -66
  83. sky/clouds/paperspace.py +61 -44
  84. sky/clouds/primeintellect.py +317 -0
  85. sky/clouds/runpod.py +164 -74
  86. sky/clouds/scp.py +89 -83
  87. sky/clouds/seeweb.py +466 -0
  88. sky/clouds/shadeform.py +400 -0
  89. sky/clouds/ssh.py +263 -0
  90. sky/clouds/utils/aws_utils.py +10 -4
  91. sky/clouds/utils/gcp_utils.py +87 -11
  92. sky/clouds/utils/oci_utils.py +38 -14
  93. sky/clouds/utils/scp_utils.py +177 -124
  94. sky/clouds/vast.py +99 -77
  95. sky/clouds/vsphere.py +51 -40
  96. sky/core.py +349 -139
  97. sky/dag.py +15 -0
  98. sky/dashboard/out/404.html +1 -1
  99. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  100. sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
  101. sky/dashboard/out/_next/static/chunks/1871-74503c8e80fd253b.js +6 -0
  102. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  103. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
  104. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  105. sky/dashboard/out/_next/static/chunks/2755.fff53c4a3fcae910.js +26 -0
  106. sky/dashboard/out/_next/static/chunks/3294.72362fa129305b19.js +1 -0
  107. sky/dashboard/out/_next/static/chunks/3785.ad6adaa2a0fa9768.js +1 -0
  108. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
  109. sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
  110. sky/dashboard/out/_next/static/chunks/4725.a830b5c9e7867c92.js +1 -0
  111. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  112. sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
  113. sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
  114. sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
  115. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  116. sky/dashboard/out/_next/static/chunks/6601-06114c982db410b6.js +1 -0
  117. sky/dashboard/out/_next/static/chunks/6856-ef8ba11f96d8c4a3.js +1 -0
  118. sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
  119. sky/dashboard/out/_next/static/chunks/6990-32b6e2d3822301fa.js +1 -0
  120. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  121. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
  122. sky/dashboard/out/_next/static/chunks/7615-3301e838e5f25772.js +1 -0
  123. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  124. sky/dashboard/out/_next/static/chunks/8969-1e4613c651bf4051.js +1 -0
  125. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  126. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  127. sky/dashboard/out/_next/static/chunks/9360.7310982cf5a0dc79.js +31 -0
  128. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  129. sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
  130. sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
  131. sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
  132. sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
  133. sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
  134. sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
  135. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-c736ead69c2d86ec.js +16 -0
  136. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a37d2063af475a1c.js +1 -0
  137. sky/dashboard/out/_next/static/chunks/pages/clusters-d44859594e6f8064.js +1 -0
  138. sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
  139. sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
  140. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
  141. sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
  142. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-5796e8d6aea291a0.js +16 -0
  143. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-6edeb7d06032adfc.js +21 -0
  144. sky/dashboard/out/_next/static/chunks/pages/jobs-479dde13399cf270.js +1 -0
  145. sky/dashboard/out/_next/static/chunks/pages/users-5ab3b907622cf0fe.js +1 -0
  146. sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
  147. sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
  148. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-c5a3eeee1c218af1.js +1 -0
  149. sky/dashboard/out/_next/static/chunks/pages/workspaces-22b23febb3e89ce1.js +1 -0
  150. sky/dashboard/out/_next/static/chunks/webpack-2679be77fc08a2f8.js +1 -0
  151. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  152. sky/dashboard/out/_next/static/zB0ed6ge_W1MDszVHhijS/_buildManifest.js +1 -0
  153. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  154. sky/dashboard/out/clusters/[cluster].html +1 -1
  155. sky/dashboard/out/clusters.html +1 -1
  156. sky/dashboard/out/config.html +1 -0
  157. sky/dashboard/out/index.html +1 -1
  158. sky/dashboard/out/infra/[context].html +1 -0
  159. sky/dashboard/out/infra.html +1 -0
  160. sky/dashboard/out/jobs/[job].html +1 -1
  161. sky/dashboard/out/jobs/pools/[pool].html +1 -0
  162. sky/dashboard/out/jobs.html +1 -1
  163. sky/dashboard/out/users.html +1 -0
  164. sky/dashboard/out/volumes.html +1 -0
  165. sky/dashboard/out/workspace/new.html +1 -0
  166. sky/dashboard/out/workspaces/[name].html +1 -0
  167. sky/dashboard/out/workspaces.html +1 -0
  168. sky/data/data_utils.py +137 -1
  169. sky/data/mounting_utils.py +269 -84
  170. sky/data/storage.py +1451 -1807
  171. sky/data/storage_utils.py +43 -57
  172. sky/exceptions.py +132 -2
  173. sky/execution.py +206 -63
  174. sky/global_user_state.py +2374 -586
  175. sky/jobs/__init__.py +5 -0
  176. sky/jobs/client/sdk.py +242 -65
  177. sky/jobs/client/sdk_async.py +143 -0
  178. sky/jobs/constants.py +9 -8
  179. sky/jobs/controller.py +839 -277
  180. sky/jobs/file_content_utils.py +80 -0
  181. sky/jobs/log_gc.py +201 -0
  182. sky/jobs/recovery_strategy.py +398 -152
  183. sky/jobs/scheduler.py +315 -189
  184. sky/jobs/server/core.py +829 -255
  185. sky/jobs/server/server.py +156 -115
  186. sky/jobs/server/utils.py +136 -0
  187. sky/jobs/state.py +2092 -701
  188. sky/jobs/utils.py +1242 -160
  189. sky/logs/__init__.py +21 -0
  190. sky/logs/agent.py +108 -0
  191. sky/logs/aws.py +243 -0
  192. sky/logs/gcp.py +91 -0
  193. sky/metrics/__init__.py +0 -0
  194. sky/metrics/utils.py +443 -0
  195. sky/models.py +78 -1
  196. sky/optimizer.py +164 -70
  197. sky/provision/__init__.py +90 -4
  198. sky/provision/aws/config.py +147 -26
  199. sky/provision/aws/instance.py +135 -50
  200. sky/provision/azure/instance.py +10 -5
  201. sky/provision/common.py +13 -1
  202. sky/provision/cudo/cudo_machine_type.py +1 -1
  203. sky/provision/cudo/cudo_utils.py +14 -8
  204. sky/provision/cudo/cudo_wrapper.py +72 -71
  205. sky/provision/cudo/instance.py +10 -6
  206. sky/provision/do/instance.py +10 -6
  207. sky/provision/do/utils.py +4 -3
  208. sky/provision/docker_utils.py +114 -23
  209. sky/provision/fluidstack/instance.py +13 -8
  210. sky/provision/gcp/__init__.py +1 -0
  211. sky/provision/gcp/config.py +301 -19
  212. sky/provision/gcp/constants.py +218 -0
  213. sky/provision/gcp/instance.py +36 -8
  214. sky/provision/gcp/instance_utils.py +18 -4
  215. sky/provision/gcp/volume_utils.py +247 -0
  216. sky/provision/hyperbolic/__init__.py +12 -0
  217. sky/provision/hyperbolic/config.py +10 -0
  218. sky/provision/hyperbolic/instance.py +437 -0
  219. sky/provision/hyperbolic/utils.py +373 -0
  220. sky/provision/instance_setup.py +93 -14
  221. sky/provision/kubernetes/__init__.py +5 -0
  222. sky/provision/kubernetes/config.py +9 -52
  223. sky/provision/kubernetes/constants.py +17 -0
  224. sky/provision/kubernetes/instance.py +789 -247
  225. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  226. sky/provision/kubernetes/network.py +27 -17
  227. sky/provision/kubernetes/network_utils.py +40 -43
  228. sky/provision/kubernetes/utils.py +1192 -531
  229. sky/provision/kubernetes/volume.py +282 -0
  230. sky/provision/lambda_cloud/instance.py +22 -16
  231. sky/provision/nebius/constants.py +50 -0
  232. sky/provision/nebius/instance.py +19 -6
  233. sky/provision/nebius/utils.py +196 -91
  234. sky/provision/oci/instance.py +10 -5
  235. sky/provision/paperspace/instance.py +10 -7
  236. sky/provision/paperspace/utils.py +1 -1
  237. sky/provision/primeintellect/__init__.py +10 -0
  238. sky/provision/primeintellect/config.py +11 -0
  239. sky/provision/primeintellect/instance.py +454 -0
  240. sky/provision/primeintellect/utils.py +398 -0
  241. sky/provision/provisioner.py +110 -36
  242. sky/provision/runpod/__init__.py +5 -0
  243. sky/provision/runpod/instance.py +27 -6
  244. sky/provision/runpod/utils.py +51 -18
  245. sky/provision/runpod/volume.py +180 -0
  246. sky/provision/scp/__init__.py +15 -0
  247. sky/provision/scp/config.py +93 -0
  248. sky/provision/scp/instance.py +531 -0
  249. sky/provision/seeweb/__init__.py +11 -0
  250. sky/provision/seeweb/config.py +13 -0
  251. sky/provision/seeweb/instance.py +807 -0
  252. sky/provision/shadeform/__init__.py +11 -0
  253. sky/provision/shadeform/config.py +12 -0
  254. sky/provision/shadeform/instance.py +351 -0
  255. sky/provision/shadeform/shadeform_utils.py +83 -0
  256. sky/provision/ssh/__init__.py +18 -0
  257. sky/provision/vast/instance.py +13 -8
  258. sky/provision/vast/utils.py +10 -7
  259. sky/provision/vsphere/common/vim_utils.py +1 -2
  260. sky/provision/vsphere/instance.py +15 -10
  261. sky/provision/vsphere/vsphere_utils.py +9 -19
  262. sky/py.typed +0 -0
  263. sky/resources.py +844 -118
  264. sky/schemas/__init__.py +0 -0
  265. sky/schemas/api/__init__.py +0 -0
  266. sky/schemas/api/responses.py +225 -0
  267. sky/schemas/db/README +4 -0
  268. sky/schemas/db/env.py +90 -0
  269. sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
  270. sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
  271. sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
  272. sky/schemas/db/global_user_state/004_is_managed.py +34 -0
  273. sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
  274. sky/schemas/db/global_user_state/006_provision_log.py +41 -0
  275. sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
  276. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  277. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  278. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  279. sky/schemas/db/script.py.mako +28 -0
  280. sky/schemas/db/serve_state/001_initial_schema.py +67 -0
  281. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  282. sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
  283. sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
  284. sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
  285. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  286. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  287. sky/schemas/generated/__init__.py +0 -0
  288. sky/schemas/generated/autostopv1_pb2.py +36 -0
  289. sky/schemas/generated/autostopv1_pb2.pyi +43 -0
  290. sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
  291. sky/schemas/generated/jobsv1_pb2.py +86 -0
  292. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  293. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  294. sky/schemas/generated/managed_jobsv1_pb2.py +74 -0
  295. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  296. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  297. sky/schemas/generated/servev1_pb2.py +58 -0
  298. sky/schemas/generated/servev1_pb2.pyi +115 -0
  299. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  300. sky/serve/autoscalers.py +357 -5
  301. sky/serve/client/impl.py +310 -0
  302. sky/serve/client/sdk.py +47 -139
  303. sky/serve/client/sdk_async.py +130 -0
  304. sky/serve/constants.py +10 -8
  305. sky/serve/controller.py +64 -19
  306. sky/serve/load_balancer.py +106 -60
  307. sky/serve/load_balancing_policies.py +115 -1
  308. sky/serve/replica_managers.py +273 -162
  309. sky/serve/serve_rpc_utils.py +179 -0
  310. sky/serve/serve_state.py +554 -251
  311. sky/serve/serve_utils.py +733 -220
  312. sky/serve/server/core.py +66 -711
  313. sky/serve/server/impl.py +1093 -0
  314. sky/serve/server/server.py +21 -18
  315. sky/serve/service.py +133 -48
  316. sky/serve/service_spec.py +135 -16
  317. sky/serve/spot_placer.py +3 -0
  318. sky/server/auth/__init__.py +0 -0
  319. sky/server/auth/authn.py +50 -0
  320. sky/server/auth/loopback.py +38 -0
  321. sky/server/auth/oauth2_proxy.py +200 -0
  322. sky/server/common.py +475 -181
  323. sky/server/config.py +81 -23
  324. sky/server/constants.py +44 -6
  325. sky/server/daemons.py +229 -0
  326. sky/server/html/token_page.html +185 -0
  327. sky/server/metrics.py +160 -0
  328. sky/server/requests/executor.py +528 -138
  329. sky/server/requests/payloads.py +351 -17
  330. sky/server/requests/preconditions.py +21 -17
  331. sky/server/requests/process.py +112 -29
  332. sky/server/requests/request_names.py +120 -0
  333. sky/server/requests/requests.py +817 -224
  334. sky/server/requests/serializers/decoders.py +82 -31
  335. sky/server/requests/serializers/encoders.py +140 -22
  336. sky/server/requests/threads.py +106 -0
  337. sky/server/rest.py +417 -0
  338. sky/server/server.py +1290 -284
  339. sky/server/state.py +20 -0
  340. sky/server/stream_utils.py +345 -57
  341. sky/server/uvicorn.py +217 -3
  342. sky/server/versions.py +270 -0
  343. sky/setup_files/MANIFEST.in +5 -0
  344. sky/setup_files/alembic.ini +156 -0
  345. sky/setup_files/dependencies.py +136 -31
  346. sky/setup_files/setup.py +44 -42
  347. sky/sky_logging.py +102 -5
  348. sky/skylet/attempt_skylet.py +1 -0
  349. sky/skylet/autostop_lib.py +129 -8
  350. sky/skylet/configs.py +27 -20
  351. sky/skylet/constants.py +171 -19
  352. sky/skylet/events.py +105 -21
  353. sky/skylet/job_lib.py +335 -104
  354. sky/skylet/log_lib.py +297 -18
  355. sky/skylet/log_lib.pyi +44 -1
  356. sky/skylet/ray_patches/__init__.py +17 -3
  357. sky/skylet/ray_patches/autoscaler.py.diff +18 -0
  358. sky/skylet/ray_patches/cli.py.diff +19 -0
  359. sky/skylet/ray_patches/command_runner.py.diff +17 -0
  360. sky/skylet/ray_patches/log_monitor.py.diff +20 -0
  361. sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
  362. sky/skylet/ray_patches/updater.py.diff +18 -0
  363. sky/skylet/ray_patches/worker.py.diff +41 -0
  364. sky/skylet/services.py +564 -0
  365. sky/skylet/skylet.py +63 -4
  366. sky/skylet/subprocess_daemon.py +103 -29
  367. sky/skypilot_config.py +506 -99
  368. sky/ssh_node_pools/__init__.py +1 -0
  369. sky/ssh_node_pools/core.py +135 -0
  370. sky/ssh_node_pools/server.py +233 -0
  371. sky/task.py +621 -137
  372. sky/templates/aws-ray.yml.j2 +10 -3
  373. sky/templates/azure-ray.yml.j2 +1 -1
  374. sky/templates/do-ray.yml.j2 +1 -1
  375. sky/templates/gcp-ray.yml.j2 +57 -0
  376. sky/templates/hyperbolic-ray.yml.j2 +67 -0
  377. sky/templates/jobs-controller.yaml.j2 +27 -24
  378. sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
  379. sky/templates/kubernetes-ray.yml.j2 +607 -51
  380. sky/templates/lambda-ray.yml.j2 +1 -1
  381. sky/templates/nebius-ray.yml.j2 +33 -12
  382. sky/templates/paperspace-ray.yml.j2 +1 -1
  383. sky/templates/primeintellect-ray.yml.j2 +71 -0
  384. sky/templates/runpod-ray.yml.j2 +9 -1
  385. sky/templates/scp-ray.yml.j2 +3 -50
  386. sky/templates/seeweb-ray.yml.j2 +108 -0
  387. sky/templates/shadeform-ray.yml.j2 +72 -0
  388. sky/templates/sky-serve-controller.yaml.j2 +22 -2
  389. sky/templates/websocket_proxy.py +178 -18
  390. sky/usage/usage_lib.py +18 -11
  391. sky/users/__init__.py +0 -0
  392. sky/users/model.conf +15 -0
  393. sky/users/permission.py +387 -0
  394. sky/users/rbac.py +121 -0
  395. sky/users/server.py +720 -0
  396. sky/users/token_service.py +218 -0
  397. sky/utils/accelerator_registry.py +34 -5
  398. sky/utils/admin_policy_utils.py +84 -38
  399. sky/utils/annotations.py +16 -5
  400. sky/utils/asyncio_utils.py +78 -0
  401. sky/utils/auth_utils.py +153 -0
  402. sky/utils/benchmark_utils.py +60 -0
  403. sky/utils/cli_utils/status_utils.py +159 -86
  404. sky/utils/cluster_utils.py +31 -9
  405. sky/utils/command_runner.py +354 -68
  406. sky/utils/command_runner.pyi +93 -3
  407. sky/utils/common.py +35 -8
  408. sky/utils/common_utils.py +310 -87
  409. sky/utils/config_utils.py +87 -5
  410. sky/utils/context.py +402 -0
  411. sky/utils/context_utils.py +222 -0
  412. sky/utils/controller_utils.py +264 -89
  413. sky/utils/dag_utils.py +31 -12
  414. sky/utils/db/__init__.py +0 -0
  415. sky/utils/db/db_utils.py +470 -0
  416. sky/utils/db/migration_utils.py +133 -0
  417. sky/utils/directory_utils.py +12 -0
  418. sky/utils/env_options.py +13 -0
  419. sky/utils/git.py +567 -0
  420. sky/utils/git_clone.sh +460 -0
  421. sky/utils/infra_utils.py +195 -0
  422. sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
  423. sky/utils/kubernetes/config_map_utils.py +133 -0
  424. sky/utils/kubernetes/create_cluster.sh +13 -27
  425. sky/utils/kubernetes/delete_cluster.sh +10 -7
  426. sky/utils/kubernetes/deploy_remote_cluster.py +1299 -0
  427. sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
  428. sky/utils/kubernetes/generate_kind_config.py +6 -66
  429. sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
  430. sky/utils/kubernetes/gpu_labeler.py +5 -5
  431. sky/utils/kubernetes/kubernetes_deploy_utils.py +354 -47
  432. sky/utils/kubernetes/ssh-tunnel.sh +379 -0
  433. sky/utils/kubernetes/ssh_utils.py +221 -0
  434. sky/utils/kubernetes_enums.py +8 -15
  435. sky/utils/lock_events.py +94 -0
  436. sky/utils/locks.py +368 -0
  437. sky/utils/log_utils.py +300 -6
  438. sky/utils/perf_utils.py +22 -0
  439. sky/utils/resource_checker.py +298 -0
  440. sky/utils/resources_utils.py +249 -32
  441. sky/utils/rich_utils.py +213 -37
  442. sky/utils/schemas.py +905 -147
  443. sky/utils/serialize_utils.py +16 -0
  444. sky/utils/status_lib.py +10 -0
  445. sky/utils/subprocess_utils.py +38 -15
  446. sky/utils/tempstore.py +70 -0
  447. sky/utils/timeline.py +24 -52
  448. sky/utils/ux_utils.py +84 -15
  449. sky/utils/validator.py +11 -1
  450. sky/utils/volume.py +86 -0
  451. sky/utils/yaml_utils.py +111 -0
  452. sky/volumes/__init__.py +13 -0
  453. sky/volumes/client/__init__.py +0 -0
  454. sky/volumes/client/sdk.py +149 -0
  455. sky/volumes/server/__init__.py +0 -0
  456. sky/volumes/server/core.py +258 -0
  457. sky/volumes/server/server.py +122 -0
  458. sky/volumes/volume.py +212 -0
  459. sky/workspaces/__init__.py +0 -0
  460. sky/workspaces/core.py +655 -0
  461. sky/workspaces/server.py +101 -0
  462. sky/workspaces/utils.py +56 -0
  463. skypilot_nightly-1.0.0.dev20251107.dist-info/METADATA +675 -0
  464. skypilot_nightly-1.0.0.dev20251107.dist-info/RECORD +594 -0
  465. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/WHEEL +1 -1
  466. sky/benchmark/benchmark_state.py +0 -256
  467. sky/benchmark/benchmark_utils.py +0 -641
  468. sky/clouds/service_catalog/constants.py +0 -7
  469. sky/dashboard/out/_next/static/LksQgChY5izXjokL3LcEu/_buildManifest.js +0 -1
  470. sky/dashboard/out/_next/static/chunks/236-f49500b82ad5392d.js +0 -6
  471. sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
  472. sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
  473. sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
  474. sky/dashboard/out/_next/static/chunks/845-0f8017370869e269.js +0 -1
  475. sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
  476. sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
  477. sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
  478. sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
  479. sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
  480. sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
  481. sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
  482. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-e15db85d0ea1fbe1.js +0 -1
  483. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
  484. sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
  485. sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
  486. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-03f279c6741fb48b.js +0 -1
  487. sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
  488. sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
  489. sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
  490. sky/jobs/dashboard/dashboard.py +0 -223
  491. sky/jobs/dashboard/static/favicon.ico +0 -0
  492. sky/jobs/dashboard/templates/index.html +0 -831
  493. sky/jobs/server/dashboard_utils.py +0 -69
  494. sky/skylet/providers/scp/__init__.py +0 -2
  495. sky/skylet/providers/scp/config.py +0 -149
  496. sky/skylet/providers/scp/node_provider.py +0 -578
  497. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  498. sky/utils/db_utils.py +0 -100
  499. sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
  500. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  501. skypilot_nightly-1.0.0.dev20250509.dist-info/METADATA +0 -361
  502. skypilot_nightly-1.0.0.dev20250509.dist-info/RECORD +0 -396
  503. /sky/{clouds/service_catalog → catalog}/config.py +0 -0
  504. /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
  505. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
  506. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
  507. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
  508. /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
  509. /sky/dashboard/out/_next/static/{LksQgChY5izXjokL3LcEu → zB0ed6ge_W1MDszVHhijS}/_ssgManifest.js +0 -0
  510. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/entry_points.txt +0 -0
  511. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/licenses/LICENSE +0 -0
  512. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/top_level.txt +0 -0
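
The largest structural change in this release is the relocation of sky/clouds/service_catalog to the top-level sky/catalog package (entries 22-56 above, among others). The excerpt that follows is from sky/backends/cloud_vm_ray_backend.py (entry 18) and shows the matching import change. As a minimal illustration of what the rename means for callers, the sketch below is hypothetical and only mirrors calls visible in the diff (catalog.regions('lambda') replacing service_catalog.regions(...)); it assumes the corresponding skypilot-nightly build is installed:

    # Before (1.0.0.dev20250509): the catalog lived under sky.clouds.
    # from sky.clouds import service_catalog
    # lambda_regions = service_catalog.regions('lambda')

    # After (1.0.0.dev20251107): the catalog is a top-level package.
    from sky import catalog

    # regions() is used the same way in the new module, as the diff below
    # shows for the Lambda Cloud failover handler.
    lambda_regions = catalog.regions('lambda')
    print([region.name for region in lambda_regions])
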
@@ -1,15 +1,17 @@
  """Backend: runs on cloud virtual machines, managed by Ray."""
  import copy
+ import dataclasses
  import enum
  import inspect
  import json
  import math
  import os
  import pathlib
+ import random
  import re
  import shlex
- import shutil
  import signal
+ import socket
  import subprocess
  import sys
  import tempfile
@@ -17,14 +19,14 @@ import textwrap
  import threading
  import time
  import typing
- from typing import (Any, Callable, Dict, Iterable, List, Optional, Set, Tuple,
- Union)
+ from typing import (Any, Callable, Dict, Iterable, Iterator, List, Optional,
+ Set, Tuple, Union)

  import colorama
- import filelock
+ import psutil

- import sky
  from sky import backends
+ from sky import catalog
  from sky import check as sky_check
  from sky import cloud_stores
  from sky import clouds
@@ -37,10 +39,10 @@ from sky import resources as resources_lib
  from sky import sky_logging
  from sky import skypilot_config
  from sky import task as task_lib
+ from sky.adaptors import common as adaptors_common
  from sky.backends import backend_utils
  from sky.backends import wheel_utils
  from sky.clouds import cloud as sky_cloud
- from sky.clouds import service_catalog
  from sky.clouds.utils import gcp_utils
  from sky.data import data_utils
  from sky.data import storage as storage_lib
@@ -48,7 +50,9 @@ from sky.provision import common as provision_common
  from sky.provision import instance_setup
  from sky.provision import metadata_utils
  from sky.provision import provisioner
+ from sky.provision.kubernetes import config as config_lib
  from sky.provision.kubernetes import utils as kubernetes_utils
+ from sky.serve import constants as serve_constants
  from sky.server.requests import requests as requests_lib
  from sky.skylet import autostop_lib
  from sky.skylet import constants
@@ -61,8 +65,12 @@ from sky.utils import cluster_utils
  from sky.utils import command_runner
  from sky.utils import common
  from sky.utils import common_utils
+ from sky.utils import context_utils
  from sky.utils import controller_utils
+ from sky.utils import directory_utils
  from sky.utils import env_options
+ from sky.utils import lock_events
+ from sky.utils import locks
  from sky.utils import log_utils
  from sky.utils import message_utils
  from sky.utils import registry
@@ -72,14 +80,51 @@ from sky.utils import status_lib
  from sky.utils import subprocess_utils
  from sky.utils import timeline
  from sky.utils import ux_utils
+ from sky.utils import volume as volume_lib
+ from sky.utils import yaml_utils

  if typing.TYPE_CHECKING:
+ import grpc
+
  from sky import dag
+ from sky.schemas.generated import autostopv1_pb2
+ from sky.schemas.generated import autostopv1_pb2_grpc
+ from sky.schemas.generated import jobsv1_pb2
+ from sky.schemas.generated import jobsv1_pb2_grpc
+ from sky.schemas.generated import managed_jobsv1_pb2
+ from sky.schemas.generated import managed_jobsv1_pb2_grpc
+ from sky.schemas.generated import servev1_pb2
+ from sky.schemas.generated import servev1_pb2_grpc
+ else:
+ # To avoid requiring grpcio to be installed on the client side.
+ grpc = adaptors_common.LazyImport(
+ 'grpc',
+ # https://github.com/grpc/grpc/issues/37642 to avoid spam in console
+ set_loggers=lambda: os.environ.update({'GRPC_VERBOSITY': 'NONE'})
+ if not env_options.Options.SHOW_DEBUG_INFO.get() else None)
+ autostopv1_pb2 = adaptors_common.LazyImport(
+ 'sky.schemas.generated.autostopv1_pb2')
+ autostopv1_pb2_grpc = adaptors_common.LazyImport(
+ 'sky.schemas.generated.autostopv1_pb2_grpc')
+ jobsv1_pb2 = adaptors_common.LazyImport('sky.schemas.generated.jobsv1_pb2')
+ jobsv1_pb2_grpc = adaptors_common.LazyImport(
+ 'sky.schemas.generated.jobsv1_pb2_grpc')
+ servev1_pb2 = adaptors_common.LazyImport(
+ 'sky.schemas.generated.servev1_pb2')
+ servev1_pb2_grpc = adaptors_common.LazyImport(
+ 'sky.schemas.generated.servev1_pb2_grpc')
+ managed_jobsv1_pb2 = adaptors_common.LazyImport(
+ 'sky.schemas.generated.managed_jobsv1_pb2')
+ managed_jobsv1_pb2_grpc = adaptors_common.LazyImport(
+ 'sky.schemas.generated.managed_jobsv1_pb2_grpc')

  Path = str

  SKY_REMOTE_APP_DIR = backend_utils.SKY_REMOTE_APP_DIR
  SKY_REMOTE_WORKDIR = constants.SKY_REMOTE_WORKDIR
+ # Unset RAY_RAYLET_PID to prevent the Ray cluster in the SkyPilot runtime
+ # from interfering with the Ray cluster in the user's task (if any).
+ UNSET_RAY_ENV_VARS = ['RAY_RAYLET_PID']

  logger = sky_logging.init_logger(__name__)

@@ -96,6 +141,7 @@ _NODES_LAUNCHING_PROGRESS_TIMEOUT = {
  clouds.OCI: 300,
  clouds.Paperspace: 600,
  clouds.Kubernetes: 300,
+ clouds.Shadeform: 300,
  clouds.Vsphere: 240,
  }

@@ -141,12 +187,13 @@ _MAX_RAY_UP_RETRY = 5
  _MAX_GET_ZONE_RETRY = 3

  _JOB_ID_PATTERN = re.compile(r'Job ID: ([0-9]+)')
+ _LOG_DIR_PATTERN = re.compile(r'Log Dir: ([^ ]+)')

  # Path to the monkey-patched ray up script.
  # We don't do import then __file__ because that script needs to be filled in
  # (so import would fail).
  _RAY_UP_WITH_MONKEY_PATCHED_HASH_LAUNCH_CONF_PATH = (
- pathlib.Path(sky.__file__).resolve().parent / 'backends' /
+ pathlib.Path(directory_utils.get_sky_dir()) / 'backends' /
  'monkey_patches' / 'monkey_patch_ray_up.py')

  # The maximum size of a command line arguments is 128 KB, i.e. the command
@@ -161,10 +208,19 @@ _RAY_UP_WITH_MONKEY_PATCHED_HASH_LAUNCH_CONF_PATH = (
  # We use 100KB as a threshold to be safe for other arguments that
  # might be added during ssh.
  _MAX_INLINE_SCRIPT_LENGTH = 100 * 1024
+ _EXCEPTION_MSG_AND_RETURNCODE_FOR_DUMP_INLINE_SCRIPT = [
+ ('too long', 255),
+ ('request-uri too large', 1),
+ ('request header fields too large', 1),
+ ('400 bad request', 1), # CloudFlare 400 error
+ ]

  _RESOURCES_UNAVAILABLE_LOG = (
  'Reasons for provision failures (for details, please check the log above):')

+ # Number of seconds to wait locking the cluster before communicating with user.
+ _CLUSTER_LOCK_TIMEOUT = 5.0
+

  def _is_command_length_over_limit(command: str) -> bool:
  """Check if the length of the command exceeds the limit.
@@ -178,6 +234,61 @@ def _is_command_length_over_limit(command: str) -> bool:
  return quoted_length > _MAX_INLINE_SCRIPT_LENGTH


+ def _is_message_too_long(returncode: int,
+ output: Optional[str] = None,
+ file_path: Optional[str] = None) -> bool:
+ """Check if the message sent to the remote is too long.
+
+ We use inline script to run the setup or run command, i.e. the script will
+ be part of the message sent to the remote cluster. There is a chance that
+ the command is too long, when people has very long run or setup commands, or
+ there is a cloudflare proxy in front of the remote blocking the long
+ message. Several common causes are:
+ - SSH returning: `too long` in the error message.
+ - Cloudflare proxy returning: `414 Request-URI Too Large` or
+ `431 Request Header Fields Too Large` error.
+
+ We use a general length limit check before but it could be inaccurate on
+ some systems, e.g. cloudflare proxy, so this is necessary.
+
+ Args:
+ returncode: The return code of the setup command.
+ output: The output of the setup command.
+ file_path: The path to the setup log file.
+ """
+ assert (output is None) != (file_path is None), (
+ 'Either output or file_path must be provided.', output, file_path)
+ to_check = []
+ for (match_str,
+ desired_rc) in _EXCEPTION_MSG_AND_RETURNCODE_FOR_DUMP_INLINE_SCRIPT:
+ if desired_rc == returncode:
+ to_check.append(match_str)
+ if not to_check:
+ return False
+
+ def _check_output_for_match_str(output: str) -> bool:
+ for match_str in to_check:
+ if match_str.lower() in output.lower():
+ return True
+ return False
+
+ if file_path is not None:
+ try:
+ with open(os.path.expanduser(file_path), 'r',
+ encoding='utf-8') as f:
+ content = f.read()
+ return _check_output_for_match_str(content)
+ except Exception as e: # pylint: disable=broad-except
+ # We don't crash the setup if we cannot read the log file.
+ # Instead, we should retry the setup with dumping the script
+ # to a file to be safe.
+ logger.debug(f'Failed to read setup log file {file_path}: {e}')
+ return True
+ else:
+ assert output is not None, (output, file_path)
+ return _check_output_for_match_str(output)
+
+
  def _get_cluster_config_template(cloud):
  cloud_to_template = {
  clouds.AWS: 'aws-ray.yml.j2',
@@ -189,13 +300,18 @@ def _get_cluster_config_template(cloud):
  clouds.SCP: 'scp-ray.yml.j2',
  clouds.OCI: 'oci-ray.yml.j2',
  clouds.Paperspace: 'paperspace-ray.yml.j2',
+ clouds.PrimeIntellect: 'primeintellect-ray.yml.j2',
  clouds.DO: 'do-ray.yml.j2',
  clouds.RunPod: 'runpod-ray.yml.j2',
  clouds.Kubernetes: 'kubernetes-ray.yml.j2',
+ clouds.SSH: 'kubernetes-ray.yml.j2',
+ clouds.Shadeform: 'shadeform-ray.yml.j2',
  clouds.Vsphere: 'vsphere-ray.yml.j2',
  clouds.Vast: 'vast-ray.yml.j2',
  clouds.Fluidstack: 'fluidstack-ray.yml.j2',
- clouds.Nebius: 'nebius-ray.yml.j2'
+ clouds.Nebius: 'nebius-ray.yml.j2',
+ clouds.Hyperbolic: 'hyperbolic-ray.yml.j2',
+ clouds.Seeweb: 'seeweb-ray.yml.j2'
  }
  return cloud_to_template[type(cloud)]

@@ -274,6 +390,7 @@ class RayCodeGen:
  ray_address = 'auto'
  self._code = [
  textwrap.dedent(f"""\
+ import functools
  import getpass
  import hashlib
  import io
@@ -306,6 +423,8 @@ class RayCodeGen:

  SKY_REMOTE_WORKDIR = {constants.SKY_REMOTE_WORKDIR!r}

+ CANCELLED_RETURN_CODE = 137
+
  kwargs = dict()
  # Only set the `_temp_dir` to SkyPilot's ray cluster directory when
  # the directory exists for backward compatibility for the VM
@@ -321,8 +440,10 @@ class RayCodeGen:
  def get_or_fail(futures, pg) -> List[int]:
  \"\"\"Wait for tasks, if any fails, cancel all unready.\"\"\"
  if not futures:
- return []
+ return [], []
  returncodes = [1] * len(futures)
+ pids = [None] * len(futures)
+ failed = False
  # Wait for 1 task to be ready.
  ready = []
  # Keep invoking ray.wait if ready is empty. This is because
@@ -331,12 +452,22 @@ class RayCodeGen:
  # before becoming ready.
  # (Such tasks are common in serving jobs.)
  # Reference: https://github.com/ray-project/ray/blob/ray-2.9.3/python/ray/_private/worker.py#L2845-L2846
+
+ def handle_ready_tasks(tasks: List[ray.ObjectRef]) -> None:
+ nonlocal returncodes, pids, failed
+ for task in tasks:
+ idx = futures.index(task)
+ res = ray.get(task)
+ returncodes[idx] = res['return_code']
+ pids[idx] = res['pid']
+ if res['return_code'] != 0:
+ failed = True
+
  while not ready:
  ready, unready = ray.wait(futures)
- idx = futures.index(ready[0])
- returncodes[idx] = ray.get(ready[0])
+ handle_ready_tasks(ready)
  while unready:
- if returncodes[idx] != 0:
+ if failed:
  for task in unready:
  # ray.cancel without force fails to kill tasks.
  # We use force=True to kill unready tasks.
@@ -344,17 +475,16 @@ class RayCodeGen:
  # Use SIGKILL=128+9 to indicate the task is forcely
  # killed.
  idx = futures.index(task)
- returncodes[idx] = 137
+ returncodes[idx] = CANCELLED_RETURN_CODE
  break
  ready, unready = ray.wait(unready)
- idx = futures.index(ready[0])
- returncodes[idx] = ray.get(ready[0])
+ handle_ready_tasks(ready)
  # Remove the placement group after all tasks are done, so that
  # the next job can be scheduled on the released resources
  # immediately.
  ray_util.remove_placement_group(pg)
  sys.stdout.flush()
- return returncodes
+ return returncodes, pids

  run_fn = None
  futures = []
@@ -363,13 +493,17 @@ class RayCodeGen:
  # by ray.remote. This should be removed once we have a better way to
  # specify dependencies for ray.
  inspect.getsource(log_lib._ProcessingArgs), # pylint: disable=protected-access
+ inspect.getsource(log_lib._get_context), # pylint: disable=protected-access
  inspect.getsource(log_lib._handle_io_stream), # pylint: disable=protected-access
  inspect.getsource(log_lib.process_subprocess_stream),
  inspect.getsource(log_lib.run_with_log),
  inspect.getsource(log_lib.make_task_bash_script),
  inspect.getsource(log_lib.add_ray_env_vars),
  inspect.getsource(log_lib.run_bash_command_with_log),
- 'run_bash_command_with_log = ray.remote(run_bash_command_with_log)',
+ inspect.getsource(log_lib.run_bash_command_with_log_and_return_pid),
+ 'run_bash_command_with_log = run_bash_command_with_log',
+ 'run_bash_command_with_log_and_return_pid = \
+ ray.remote(run_bash_command_with_log_and_return_pid)',
  ]
  # Currently, the codegen program is/can only be submitted to the head
  # node, due to using job_lib for updating job statuses, and using
@@ -471,10 +605,14 @@ class RayCodeGen:
  # skip the scheduling step.
  job_lib.scheduler.schedule_step()

- total_num_nodes = len(ray.nodes())
+ # If some nodes are down and then new nodes are added after launching again,
+ # the result of `ray.nodes()` will include all the nodes, so we need to get
+ # the alive nodes.
+ alive_nodes = [n for n in ray.nodes() if 'Alive' in n and n['Alive']]
+ total_num_nodes = len(alive_nodes)
  setup_bundles = [{{"CPU": _SETUP_CPUS}} for _ in range(total_num_nodes)]
  setup_pg = ray.util.placement_group(setup_bundles, strategy='STRICT_SPREAD')
- setup_workers = [run_bash_command_with_log \\
+ setup_workers = [run_bash_command_with_log_and_return_pid \\
  .options(
  name='setup',
  num_cpus=_SETUP_CPUS,
@@ -489,15 +627,25 @@ class RayCodeGen:
  stream_logs=True,
  with_ray=True,
  ) for i in range(total_num_nodes)]
- setup_returncodes = get_or_fail(setup_workers, setup_pg)
- if sum(setup_returncodes) != 0:
+ setup_returncodes, setup_pids = get_or_fail(setup_workers, setup_pg)
+ success = True
+ failed_workers_and_returncodes = []
+ for i in range(len(setup_returncodes)):
+ returncode = setup_returncodes[i]
+ pid = setup_pids[i]
+ if pid == None:
+ pid = os.getpid()
+ if returncode != 0 and returncode != CANCELLED_RETURN_CODE:
+ success = False
+ failed_workers_and_returncodes.append((pid, returncode))
+ if not success:
+ msg = f'ERROR: {colorama.Fore.RED}Job {self.job_id}\\'s setup failed. '
+ msg += f'Failed workers: ' + ', '.join([f'(pid={{pid}}, returncode={{returncode}})' for pid, returncode in failed_workers_and_returncodes])
+ msg += f'. See error logs above for more details.{colorama.Style.RESET_ALL}'
+ print(msg, flush=True)
  job_lib.set_status({self.job_id!r}, job_lib.JobStatus.FAILED_SETUP)
  # This waits for all streaming logs to finish.
  time.sleep(1)
- print('ERROR: {colorama.Fore.RED}Job {self.job_id}\\'s setup failed with '
- 'return code list:{colorama.Style.RESET_ALL}',
- setup_returncodes,
- flush=True)
  # Need this to set the job status in ray job to be FAILED.
  sys.exit(1)
  """)
@@ -614,7 +762,12 @@ class RayCodeGen:
  # CACHED_MOUNT mode is uploaded to remote.
  rclone_flush_script = textwrap.dedent(f"""\

- if [ $(findmnt -t fuse.rclone --noheading | wc -l) -gt 0 ]; then
+ # Only waits if cached mount is enabled (RCLONE_MOUNT_CACHED_LOG_DIR is not empty)
+ # findmnt alone is not enough, as some clouds (e.g. AWS on ARM64) uses
+ # rclone for normal mounts as well.
+ if [ $(findmnt -t fuse.rclone --noheading | wc -l) -gt 0 ] && \
+ [ -d {constants.RCLONE_MOUNT_CACHED_LOG_DIR} ] && \
+ [ "$(ls -A {constants.RCLONE_MOUNT_CACHED_LOG_DIR})" ]; then
  flushed=0
  # extra second on top of --vfs-cache-poll-interval to
  # avoid race condition between rclone log line creation and this check.
@@ -623,7 +776,7 @@ class RayCodeGen:
  # sleep for the same interval as --vfs-cache-poll-interval
  sleep {constants.RCLONE_CACHE_REFRESH_INTERVAL}
  flushed=1
- for file in {constants.RCLONE_LOG_DIR}/*; do
+ for file in {constants.RCLONE_MOUNT_CACHED_LOG_DIR}/*; do
  exitcode=0
  tac $file | grep "vfs cache: cleaned:" -m 1 | grep "in use 0, to upload 0, uploading 0" -q || exitcode=$?
  if [ $exitcode -ne 0 ]; then
@@ -635,6 +788,8 @@ class RayCodeGen:
  done
  echo "skypilot: cached mount uploaded complete"
  fi""")
+ unset_ray_env_vars = ' && '.join(
+ [f'unset {var}' for var in UNSET_RAY_ENV_VARS])
  self._code += [
  sky_env_vars_dict_str,
  textwrap.dedent(f"""\
@@ -644,6 +799,7 @@ class RayCodeGen:
  script = run_fn({gang_scheduling_id}, gang_scheduling_id_to_ip)

  if script is not None:
+ script=f'{unset_ray_env_vars}; {{script}}'
  script += rclone_flush_script
  sky_env_vars_dict['{constants.SKYPILOT_NUM_GPUS_PER_NODE}'] = {int(math.ceil(num_gpus))!r}

@@ -665,7 +821,7 @@ class RayCodeGen:

  sky_env_vars_dict['SKYPILOT_INTERNAL_JOB_ID'] = {self.job_id}

- futures.append(run_bash_command_with_log \\
+ futures.append(run_bash_command_with_log_and_return_pid \\
  .options(name=name_str, {options_str}) \\
  .remote(
  script,
@@ -684,7 +840,7 @@ class RayCodeGen:

  self._code += [
  textwrap.dedent(f"""\
- returncodes = get_or_fail(futures, pg)
+ returncodes, _ = get_or_fail(futures, pg)
  if sum(returncodes) != 0:
  job_lib.set_status({self.job_id!r}, job_lib.JobStatus.FAILED)
  # Schedule the next pending job immediately to make the job
@@ -696,6 +852,10 @@ class RayCodeGen:
  # 139 is the return code of SIGSEGV, i.e. Segmentation Fault.
  if any(r == 139 for r in returncodes):
  reason = '(likely due to Segmentation Fault)'
+ if any(r == 137 for r in returncodes):
+ # Find the first non-137 return code
+ non_137 = next(r for r in returncodes if r != 137)
+ reason = f'(A Worker failed with return code {{non_137}}, SkyPilot cleaned up the processes on other nodes with return code 137)'
  print('ERROR: {colorama.Fore.RED}Job {self.job_id} failed with '
  'return code list:{colorama.Style.RESET_ALL}',
  returncodes,
@@ -778,34 +938,6 @@ class FailoverCloudErrorHandlerV1:
  setattr(e, 'detailed_reason', detailed_reason)
  raise e

- @staticmethod
- def _scp_handler(blocked_resources: Set['resources_lib.Resources'],
- launchable_resources: 'resources_lib.Resources',
- region: 'clouds.Region',
- zones: Optional[List['clouds.Zone']], stdout: str,
- stderr: str):
- del zones # Unused.
- errors = FailoverCloudErrorHandlerV1._handle_errors(
- stdout,
- stderr,
- is_error_str_known=lambda x: 'SCPError:' in x.strip())
-
- logger.warning(f'Got error(s) in {region.name}:')
- messages = '\n\t'.join(errors)
- style = colorama.Style
- logger.warning(f'{style.DIM}\t{messages}{style.RESET_ALL}')
- _add_to_blocked_resources(blocked_resources,
- launchable_resources.copy(zone=None))
-
- # Sometimes, SCPError will list available regions.
- for e in errors:
- if e.find('Regions with capacity available:') != -1:
- for r in service_catalog.regions('scp'):
- if e.find(r.name) == -1:
- _add_to_blocked_resources(
- blocked_resources,
- launchable_resources.copy(region=r.name, zone=None))
-

  @staticmethod
  def _ibm_handler(blocked_resources: Set['resources_lib.Resources'],
@@ -1085,7 +1217,7 @@ class FailoverCloudErrorHandlerV2:
  output = str(error)
  # Sometimes, lambda cloud error will list available regions.
  if output.find('Regions with capacity available:') != -1:
- for r in service_catalog.regions('lambda'):
+ for r in catalog.regions('lambda'):
  if output.find(r.name) == -1:
  _add_to_blocked_resources(
  blocked_resources,
@@ -1109,6 +1241,21 @@ class FailoverCloudErrorHandlerV2:
  FailoverCloudErrorHandlerV2._default_handler(
  blocked_resources, launchable_resources, region, zones, error)

+ @staticmethod
+ def _scp_handler(blocked_resources: Set['resources_lib.Resources'],
+ launchable_resources: 'resources_lib.Resources',
+ region: 'clouds.Region',
+ zones: Optional[List['clouds.Zone']],
+ error: Exception) -> None:
+ logger.info(f'SCP handler error: {error}')
+ # Block SCP if the credential has expired.
+ if isinstance(error, exceptions.InvalidCloudCredentials):
+ _add_to_blocked_resources(
+ blocked_resources, resources_lib.Resources(cloud=clouds.SCP()))
+ else:
+ FailoverCloudErrorHandlerV2._default_handler(
+ blocked_resources, launchable_resources, region, zones, error)
+

  @staticmethod
  def _default_handler(blocked_resources: Set['resources_lib.Resources'],
@@ -1176,7 +1323,8 @@ class RetryingVmProvisioner(object):
  local_wheel_path: pathlib.Path,
  wheel_hash: str,
  blocked_resources: Optional[Iterable[
- resources_lib.Resources]] = None):
+ resources_lib.Resources]] = None,
+ is_managed: Optional[bool] = None):
  self._blocked_resources: Set[resources_lib.Resources] = set()
  if blocked_resources:
  # blocked_resources is not None and not empty.
@@ -1188,6 +1336,7 @@ class RetryingVmProvisioner(object):
  self._requested_features = requested_features
  self._local_wheel_path = local_wheel_path
  self._wheel_hash = wheel_hash
+ self._is_managed = is_managed

  def _yield_zones(
  self, to_provision: resources_lib.Resources, num_nodes: int,
@@ -1232,7 +1381,8 @@ class RetryingVmProvisioner(object):
  assert isinstance(handle, CloudVmRayResourceHandle), (
  'handle should be CloudVmRayResourceHandle (found: '
  f'{type(handle)}) {cluster_name!r}')
- config = common_utils.read_yaml(handle.cluster_yaml)
+ config = global_user_state.get_cluster_yaml_dict(
+ handle.cluster_yaml)
  # This is for the case when the zone field is not set in the
  # launched resources in a previous launch (e.g., ctrl-c during
  # launch and multi-node cluster before PR #1700).
@@ -1316,6 +1466,34 @@ class RetryingVmProvisioner(object):
  zones = [clouds.Zone(name=to_provision.zone)]
  yield zones

+ def _insufficient_resources_msg(
+ self,
+ to_provision: resources_lib.Resources,
+ requested_resources: Set[resources_lib.Resources],
+ insufficient_resources: Optional[List[str]],
+ ) -> str:
+ insufficent_resource_msg = ('' if insufficient_resources is None else
+ f' ({", ".join(insufficient_resources)})')
+ message = f'Failed to acquire resources{insufficent_resource_msg} '
+ if to_provision.zone is not None:
+ message += (f'in {to_provision.zone} for {requested_resources}. ')
+ elif to_provision.region is not None and to_provision.cloud is not None:
+ # For public clouds, provision.region is always set.
+ if clouds.SSH().is_same_cloud(to_provision.cloud):
+ message += (
+ f'in SSH Node Pool ({to_provision.region.lstrip("ssh-")}) '
+ f'for {requested_resources}. The SSH Node Pool may not '
+ 'have enough resources.')
+ elif clouds.Kubernetes().is_same_cloud(to_provision.cloud):
+ message += (f'in context {to_provision.region} for '
+ f'{requested_resources}. ')
+ else:
+ message += (f'in all zones in {to_provision.region} for '
+ f'{requested_resources}. ')
+ else:
+ message += (f'{to_provision.cloud} for {requested_resources}. ')
+ return message
+

  def _retry_zones(
  self,
@@ -1329,6 +1507,7 @@ class RetryingVmProvisioner(object):
  prev_handle: Optional['CloudVmRayResourceHandle'],
  prev_cluster_ever_up: bool,
  skip_if_config_hash_matches: Optional[str],
+ volume_mounts: Optional[List[volume_lib.VolumeMount]],
  ) -> Dict[str, Any]:
  """The provision retry loop.

@@ -1349,12 +1528,17 @@ class RetryingVmProvisioner(object):
  if not dryrun:
  os.makedirs(os.path.expanduser(self.log_dir), exist_ok=True)
  os.system(f'touch {log_path}')
+
  rich_utils.force_update_status(
- ux_utils.spinner_message('Launching', log_path))
+ ux_utils.spinner_message('Launching',
+ log_path,
+ cluster_name=cluster_name))

  # Get previous cluster status
  cluster_exists = prev_cluster_status is not None

+ to_provision = to_provision.assert_launchable()
+
  assert to_provision.region is not None, (
  to_provision, 'region should have been set by the optimizer.')
  region = clouds.Region(to_provision.region)
@@ -1388,6 +1572,7 @@ class RetryingVmProvisioner(object):
  f'To request quotas, check the instruction: '
  f'https://docs.skypilot.co/en/latest/cloud-setup/quota.html.')

+ insufficient_resources = None
  for zones in self._yield_zones(to_provision, num_nodes, cluster_name,
  prev_cluster_status,
  prev_cluster_ever_up):
@@ -1432,7 +1617,9 @@ class RetryingVmProvisioner(object):
  region=region,
  zones=zones,
  dryrun=dryrun,
- keep_launch_fields_in_existing_config=cluster_exists)
+ keep_launch_fields_in_existing_config=cluster_exists,
+ volume_mounts=volume_mounts,
+ )
  except exceptions.ResourcesUnavailableError as e:
  # Failed due to catalog issue, e.g. image not found, or
  # GPUs are requested in a Kubernetes cluster but the cluster
@@ -1515,8 +1702,17 @@ class RetryingVmProvisioner(object):
  cluster_handle=handle,
  requested_resources=requested_resources,
  ready=False,
+ is_managed=self._is_managed,
+ provision_log_path=log_abs_path,
  )

+ # Add cluster event for actual provisioning start.
+ global_user_state.add_cluster_event(
+ cluster_name, status_lib.ClusterStatus.INIT,
+ f'Provisioning on {to_provision.cloud.display_name()} ' +
+ f'in {to_provision.region}',
+ global_user_state.ClusterEventType.STATUS_CHANGE)
+
  global_user_state.set_owner_identity_for_cluster(
  cluster_name, cloud_user_identity)

@@ -1543,11 +1739,13 @@ class RetryingVmProvisioner(object):
  controller_str = ('' if controller is None else
  f' {controller.value.name}')
  if isinstance(to_provision.cloud, clouds.Kubernetes):
- # Omit the region name for Kubernetes.
+ suffix = '.'
+ if region.name.startswith('ssh-'):
+ suffix = f' ({region.name.lstrip("ssh-")})'
  logger.info(
  ux_utils.starting_message(
  f'Launching{controller_str} on '
- f'{to_provision.cloud}.'))
+ f'{to_provision.cloud}{suffix}'))
  else:
  logger.info(
  ux_utils.starting_message(
@@ -1587,6 +1785,24 @@ class RetryingVmProvisioner(object):
  # No teardown happens for this error.
  with ux_utils.print_exception_no_traceback():
  raise
+ except config_lib.KubernetesError as e:
+ if e.insufficent_resources:
+ insufficient_resources = e.insufficent_resources
+ # NOTE: We try to cleanup the cluster even if the previous
+ # cluster does not exist. Also we are fast at
+ # cleaning up clusters now if there is no existing node.
+ CloudVmRayBackend().post_teardown_cleanup(
+ handle,
+ terminate=not prev_cluster_ever_up,
+ remove_from_db=False,
+ failover=True,
+ )
+ # TODO(suquark): other clouds may have different zone
+ # blocking strategy. See '_update_blocklist_on_error'
+ # for details.
+ FailoverCloudErrorHandlerV2.update_blocklist_on_error(
+ self._blocked_resources, to_provision, region, zones, e)
+ continue
  except Exception as e: # pylint: disable=broad-except
  # NOTE: We try to cleanup the cluster even if the previous
  # cluster does not exist. Also we are fast at
@@ -1594,7 +1810,8 @@ class RetryingVmProvisioner(object):
  CloudVmRayBackend().post_teardown_cleanup(
  handle,
  terminate=not prev_cluster_ever_up,
- remove_from_db=False)
+ remove_from_db=False,
+ failover=True)
  # TODO(suquark): other clouds may have different zone
  # blocking strategy. See '_update_blocklist_on_error'
  # for details.
@@ -1650,7 +1867,9 @@ class RetryingVmProvisioner(object):
1650
1867
  config_dict['handle'] = handle
1651
1868
  logger.info(
1652
1869
  ux_utils.finishing_message(
1653
- f'Cluster launched: {cluster_name!r}.', log_path))
1870
+ f'Cluster launched: {cluster_name!r}.',
1871
+ log_path,
1872
+ cluster_name=cluster_name))
1654
1873
  return config_dict
1655
1874
 
1656
1875
  # The cluster is not ready. We must perform error recording and/or
@@ -1714,17 +1933,9 @@ class RetryingVmProvisioner(object):
1714
1933
  terminate=terminate_or_stop,
1715
1934
  remove_from_db=False)
1716
1935
 
1717
- if to_provision.zone is not None:
1718
- message = (
1719
- f'Failed to acquire resources in {to_provision.zone} for '
1720
- f'{requested_resources}. ')
1721
- elif to_provision.region is not None:
1722
- # For public clouds, provision.region is always set.
1723
- message = ('Failed to acquire resources in all zones in '
1724
- f'{to_provision.region} for {requested_resources}. ')
1725
- else:
1726
- message = (f'Failed to acquire resources in {to_provision.cloud} '
1727
- f'for {requested_resources}. ')
1936
+ message = self._insufficient_resources_msg(to_provision,
1937
+ requested_resources,
1938
+ insufficient_resources)
1728
1939
  # Do not failover to other locations if the cluster was ever up, since
1729
1940
  # the user can have some data on the cluster.
1730
1941
  raise exceptions.ResourcesUnavailableError(
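The inline zone/region/cloud message built by the removed lines above is replaced with a call to self._insufficient_resources_msg(...), whose body is not part of this hunk. A hedged reconstruction of an equivalent helper, based only on the removed branches plus the insufficient_resources details collected from config_lib.KubernetesError earlier in the retry loop (the real method's signature and wording may differ):

def _insufficient_resources_msg_sketch(to_provision, requested_resources,
                                        insufficient_resources=None):
    # Reconstructed from the removed branches; illustrative only.
    if to_provision.zone is not None:
        msg = (f'Failed to acquire resources in {to_provision.zone} for '
               f'{requested_resources}. ')
    elif to_provision.region is not None:
        # For public clouds, region is always set by the optimizer.
        msg = ('Failed to acquire resources in all zones in '
               f'{to_provision.region} for {requested_resources}. ')
    else:
        msg = (f'Failed to acquire resources in {to_provision.cloud} '
               f'for {requested_resources}. ')
    if insufficient_resources:
        # Details propagated from the Kubernetes failover handler above.
        msg += f'Insufficient resources: {insufficient_resources}. '
    return msg
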
@@ -1775,7 +1986,8 @@ class RetryingVmProvisioner(object):
1775
1986
  log_abs_path,
1776
1987
  stream_logs=False,
1777
1988
  start_streaming_at='Shared connection to',
1778
- line_processor=log_utils.RayUpLineProcessor(log_abs_path),
1989
+ line_processor=log_utils.RayUpLineProcessor(
1990
+ log_abs_path, cluster_name=cluster_handle.cluster_name),
1779
1991
  # Reduce BOTO_MAX_RETRIES from 12 to 5 to avoid long hanging
1780
1992
  # time during 'ray up' if insufficient capacity occurs.
1781
1993
  env=dict(
@@ -1919,9 +2131,10 @@ class RetryingVmProvisioner(object):
1919
2131
  # ready to ensure cluster will not scale up after preemption (spot).
1920
2132
  # Skip for non-spot as this takes extra time to provision (~1min).
1921
2133
  if use_spot:
1922
- ray_config = common_utils.read_yaml(cluster_config_file)
2134
+ ray_config = global_user_state.get_cluster_yaml_dict(
2135
+ cluster_config_file)
1923
2136
  ray_config['upscaling_speed'] = 0
1924
- common_utils.dump_yaml(cluster_config_file, ray_config)
2137
+ yaml_utils.dump_yaml(cluster_config_file, ray_config)
1925
2138
  start = time.time()
1926
2139
  returncode, stdout, stderr = ray_up()
1927
2140
  logger.debug(
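For spot clusters, the hunk above pins upscaling_speed to 0 by reading the cluster YAML through global_user_state.get_cluster_yaml_dict and writing it back with yaml_utils.dump_yaml, replacing the old common_utils helpers. A minimal stand-alone equivalent using PyYAML directly, assuming the config is a plain YAML file on disk (the SkyPilot helpers may source it from the state database instead):

import yaml  # PyYAML

def pin_upscaling_speed(cluster_yaml_path: str) -> None:
    with open(cluster_yaml_path, 'r', encoding='utf-8') as f:
        config = yaml.safe_load(f)
    # Keep the Ray autoscaler from scaling the cluster back up after a spot
    # preemption, mirroring the upscaling_speed = 0 write-back above.
    config['upscaling_speed'] = 0
    with open(cluster_yaml_path, 'w', encoding='utf-8') as f:
        yaml.safe_dump(config, f)
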
@@ -2030,6 +2243,7 @@ class RetryingVmProvisioner(object):
2030
2243
  f' that never expire or a service account.\033[0m')
2031
2244
  logger.warning(warnings)
2032
2245
 
2246
+ to_provision = to_provision.assert_launchable()
2033
2247
  # Retrying launchable resources.
2034
2248
  while True:
2035
2249
  try:
@@ -2068,7 +2282,9 @@ class RetryingVmProvisioner(object):
2068
2282
  prev_cluster_status=prev_cluster_status,
2069
2283
  prev_handle=prev_handle,
2070
2284
  prev_cluster_ever_up=prev_cluster_ever_up,
2071
- skip_if_config_hash_matches=skip_if_config_hash_matches)
2285
+ skip_if_config_hash_matches=skip_if_config_hash_matches,
2286
+ volume_mounts=task.volume_mounts,
2287
+ )
2072
2288
  if dryrun:
2073
2289
  return config_dict
2074
2290
  except (exceptions.InvalidClusterNameError,
@@ -2115,8 +2331,6 @@ class RetryingVmProvisioner(object):
2115
2331
  # terminated by _retry_zones().
2116
2332
  assert (prev_cluster_status == status_lib.ClusterStatus.INIT
2117
2333
  ), prev_cluster_status
2118
- assert global_user_state.get_handle_from_cluster_name(
2119
- cluster_name) is None, cluster_name
2120
2334
  logger.info(
2121
2335
  ux_utils.retry_message(
2122
2336
  f'Retrying provisioning with requested resources: '
@@ -2151,20 +2365,33 @@ class RetryingVmProvisioner(object):
2151
2365
  # possible resources or the requested resources is too
2152
2366
  # restrictive. If we reach here, our failover logic finally
2153
2367
  # ends here.
2154
- table = log_utils.create_table(['Resource', 'Reason'])
2368
+ table = log_utils.create_table(['INFRA', 'RESOURCES', 'REASON'])
2155
2369
  for (resource, exception) in resource_exceptions.items():
2156
- table.add_row(
2157
- [resources_utils.format_resource(resource), exception])
2158
- table.max_table_width = shutil.get_terminal_size().columns
2370
+ table.add_row([
2371
+ resource.infra.formatted_str(),
2372
+ resources_utils.format_resource(
2373
+ resource, simplified_only=True)[0], exception
2374
+ ])
2375
+ # Set the max width of REASON column to 80 to avoid the table
2376
+ # being wrapped in a unreadable way.
2377
+ # pylint: disable=protected-access
2378
+ table._max_width = {'REASON': 80}
2159
2379
  raise exceptions.ResourcesUnavailableError(
2160
2380
  _RESOURCES_UNAVAILABLE_LOG + '\n' + table.get_string(),
2161
2381
  failover_history=failover_history)
2162
- to_provision = task.best_resources
2382
+ best_resources = task.best_resources
2163
2383
  assert task in self._dag.tasks, 'Internal logic error.'
2164
- assert to_provision is not None, task
2384
+ assert best_resources is not None, task
2385
+ to_provision = best_resources
2165
2386
  return config_dict
2166
2387
 
2167
2388
 
2389
+ @dataclasses.dataclass
2390
+ class SSHTunnelInfo:
2391
+ port: int
2392
+ pid: int
2393
+
2394
+
2168
2395
  class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
2169
2396
  """A pickle-able handle to a cluster created by CloudVmRayBackend.
2170
2397
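The final failover report above switches from a two-column Resource/Reason table to INFRA / RESOURCES / REASON and caps the REASON column at 80 characters via the protected _max_width attribute; the module-level SSHTunnelInfo dataclass added in the same hunk is the (port, pid) record that the new Skylet tunnel helpers further below persist per cluster. Assuming log_utils.create_table wraps prettytable.PrettyTable (which the _max_width access suggests), a self-contained sketch of the table with made-up rows:

from prettytable import PrettyTable  # assumption: log_utils.create_table returns a PrettyTable

table = PrettyTable(['INFRA', 'RESOURCES', 'REASON'])
table.add_row(['Kubernetes/my-context', '4CPU--16GB', 'Insufficient CPU capacity'])
table.add_row(['AWS/us-east-1', '1x A100', 'Quota exceeded; request an increase'])
# Cap the REASON column so long error strings wrap instead of stretching the
# table; _max_width is a protected PrettyTable attribute, as in the hunk.
table._max_width = {'REASON': 80}  # pylint: disable=protected-access
print(table.get_string())
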
 
@@ -2184,10 +2411,11 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
2184
2411
  - (optional) Launched resources
2185
2412
  - (optional) Docker user name
2186
2413
  - (optional) If TPU(s) are managed, a path to a deletion script.
2414
+ - (optional) Skylet SSH tunnel info.
2187
2415
  """
2188
2416
  # Bump if any fields get added/removed/changed, and add backward
2189
- # compaitibility logic in __setstate__.
2190
- _VERSION = 10
2417
+ # compatibility logic in __setstate__ and/or __getstate__.
2418
+ _VERSION = 12
2191
2419
 
2192
2420
  def __init__(
2193
2421
  self,
@@ -2220,6 +2448,7 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
2220
2448
  self.launched_nodes = launched_nodes
2221
2449
  self.launched_resources = launched_resources
2222
2450
  self.docker_user: Optional[str] = None
2451
+ self.is_grpc_enabled = True
2223
2452
 
2224
2453
  def __repr__(self):
2225
2454
  return (f'ResourceHandle('
@@ -2235,17 +2464,21 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
2235
2464
  f'\n\tlaunched_resources={self.launched_nodes}x '
2236
2465
  f'{self.launched_resources}, '
2237
2466
  f'\n\tdocker_user={self.docker_user},'
2238
- f'\n\tssh_user={self.ssh_user}')
2467
+ f'\n\tssh_user={self.ssh_user},'
2468
+ f'\n\tis_grpc_enabled={self.is_grpc_enabled},')
2239
2469
 
2240
2470
  def get_cluster_name(self):
2241
2471
  return self.cluster_name
2242
2472
 
2473
+ def get_cluster_name_on_cloud(self):
2474
+ return self.cluster_name_on_cloud
2475
+
2243
2476
  def _use_internal_ips(self):
2244
2477
  """Returns whether to use internal IPs for SSH connections."""
2245
2478
  # Directly load the `use_internal_ips` flag from the cluster yaml
2246
2479
  # instead of `skypilot_config` as the latter can be changed after the
2247
2480
  # cluster is UP.
2248
- return common_utils.read_yaml(self.cluster_yaml).get(
2481
+ return global_user_state.get_cluster_yaml_dict(self.cluster_yaml).get(
2249
2482
  'provider', {}).get('use_internal_ips', False)
2250
2483
 
2251
2484
  def update_ssh_ports(self, max_attempts: int = 1) -> None:
@@ -2270,11 +2503,15 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
2270
2503
  clouds.ProvisionerVersion.SKYPILOT):
2271
2504
  provider_name = str(self.launched_resources.cloud).lower()
2272
2505
  config = {}
2273
- if os.path.exists(self.cluster_yaml):
2274
- # It is possible that the cluster yaml is not available when
2275
- # the handle is unpickled for service replicas from the
2276
- # controller with older version.
2277
- config = common_utils.read_yaml(self.cluster_yaml)
2506
+ # It is possible that the cluster yaml is not available when
2507
+ # the handle is unpickled for service replicas from the
2508
+ # controller with older version.
2509
+ yaml_str = global_user_state.get_cluster_yaml_str(self.cluster_yaml)
2510
+ if yaml_str is None:
2511
+ # If the cluster yaml is not available,
2512
+ # we skip updating the cluster info.
2513
+ return
2514
+ config = yaml_utils.safe_load(yaml_str)
2278
2515
  try:
2279
2516
  cluster_info = provision_lib.get_cluster_info(
2280
2517
  provider_name,
@@ -2410,12 +2647,23 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
2410
2647
  zip(cluster_internal_ips, cluster_feasible_ips))
2411
2648
 
2412
2649
  # Ensure head node is the first element, then sort based on the
2413
- # external IPs for stableness
2414
- stable_internal_external_ips = [internal_external_ips[0]] + sorted(
2415
- internal_external_ips[1:], key=lambda x: x[1])
2650
+ # external IPs for stableness. Skip for k8s nodes since pods
2651
+ # worker ids are already mapped.
2652
+ if (cluster_info is not None and
2653
+ cluster_info.provider_name == 'kubernetes'):
2654
+ stable_internal_external_ips = internal_external_ips
2655
+ else:
2656
+ stable_internal_external_ips = [internal_external_ips[0]] + sorted(
2657
+ internal_external_ips[1:], key=lambda x: x[1])
2416
2658
  self.stable_internal_external_ips = stable_internal_external_ips
2417
2659
 
2418
- @annotations.lru_cache(scope='global')
2660
+ @context_utils.cancellation_guard
2661
+ # we expect different request to be acting on different clusters
2662
+ # (= different handles) so we have no real expectation of cache hit
2663
+ # across requests.
2664
+ # Do not change this cache to global scope
2665
+ # without understanding https://github.com/skypilot-org/skypilot/pull/6908
2666
+ @annotations.lru_cache(scope='request', maxsize=10)
2419
2667
  @timeline.event
2420
2668
  def get_command_runners(self,
2421
2669
  force_cached: bool = False,
@@ -2426,19 +2674,21 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
2426
2674
  self.cluster_yaml, self.docker_user, self.ssh_user)
2427
2675
  if avoid_ssh_control:
2428
2676
  ssh_credentials.pop('ssh_control_name', None)
2677
+
2678
+ launched_resources = self.launched_resources.assert_launchable()
2429
2679
  updated_to_skypilot_provisioner_after_provisioned = (
2430
- self.launched_resources.cloud.PROVISIONER_VERSION >=
2680
+ launched_resources.cloud.PROVISIONER_VERSION >=
2431
2681
  clouds.ProvisionerVersion.SKYPILOT and
2432
2682
  self.cached_external_ips is not None and
2433
2683
  self.cached_cluster_info is None)
2434
2684
  if updated_to_skypilot_provisioner_after_provisioned:
2435
2685
  logger.debug(
2436
- f'{self.launched_resources.cloud} has been updated to the new '
2686
+ f'{launched_resources.cloud} has been updated to the new '
2437
2687
  f'provisioner after cluster {self.cluster_name} was '
2438
2688
  f'provisioned. Cached IPs are used for connecting to the '
2439
2689
  'cluster.')
2440
2690
  if (clouds.ProvisionerVersion.RAY_PROVISIONER_SKYPILOT_TERMINATOR >=
2441
- self.launched_resources.cloud.PROVISIONER_VERSION or
2691
+ launched_resources.cloud.PROVISIONER_VERSION or
2442
2692
  updated_to_skypilot_provisioner_after_provisioned):
2443
2693
  ip_list = (self.cached_external_ips
2444
2694
  if force_cached else self.external_ips())
@@ -2464,6 +2714,21 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
2464
2714
  'Tried to use cached cluster info, but it\'s missing for '
2465
2715
  f'cluster "{self.cluster_name}"')
2466
2716
  self._update_cluster_info()
2717
+ # For Kubernetes, `KubernetesCommandRunner` want to get the pod names
2718
+ # to run the command. But for high availability serve controller,
2719
+ # the controller pod is part of a deployment, and once the pod is
2720
+ # killed and a new one is created, the pod name changes, so we need
2721
+ # to manually update the cluster info here.
2722
+ # TODO(andyl): See if we can prevent this refresh. Like pass in
2723
+ # deployment name as identifier for KubernetesCommandRunner. Now this
2724
+ # is required for rsync as using deployment in rsync seems to cause
2725
+ # some unknown issues.
2726
+ # TODO(andyl): Should check through the real cluster info. Same as
2727
+ # the TODO in kubernetes/instance.py:terminate_instances
2728
+ if (isinstance(self.launched_resources.cloud, clouds.Kubernetes) and
2729
+ controller_utils.high_availability_specified(
2730
+ self.cluster_name)):
2731
+ self._update_cluster_info()
2467
2732
 
2468
2733
  assert self.cached_cluster_info is not None, self
2469
2734
  runners = provision_lib.get_command_runners(
@@ -2532,6 +2797,162 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
2532
2797
  cluster_config_file)
2533
2798
  self.docker_user = docker_user
2534
2799
 
2800
+ def _get_skylet_ssh_tunnel(self) -> Optional[SSHTunnelInfo]:
2801
+ metadata = global_user_state.get_cluster_skylet_ssh_tunnel_metadata(
2802
+ self.cluster_name)
2803
+ if metadata is None:
2804
+ return None
2805
+ return SSHTunnelInfo(port=metadata[0], pid=metadata[1])
2806
+
2807
+ def _set_skylet_ssh_tunnel(self, tunnel: Optional[SSHTunnelInfo]) -> None:
2808
+ global_user_state.set_cluster_skylet_ssh_tunnel_metadata(
2809
+ self.cluster_name,
2810
+ (tunnel.port, tunnel.pid) if tunnel is not None else None)
2811
+
2812
+ def close_skylet_ssh_tunnel(self) -> None:
2813
+ """Terminate the SSH tunnel process and clear its metadata."""
2814
+ tunnel = self._get_skylet_ssh_tunnel()
2815
+ if tunnel is None:
2816
+ return
2817
+ logger.debug('Closing Skylet SSH tunnel for cluster %r on port %d',
2818
+ self.cluster_name, tunnel.port)
2819
+ try:
2820
+ self._terminate_ssh_tunnel_process(tunnel)
2821
+ finally:
2822
+ self._set_skylet_ssh_tunnel(None)
2823
+
2824
+ def get_grpc_channel(self) -> 'grpc.Channel':
2825
+ grpc_options = [
2826
+ # The task YAMLs can be large, so the default
2827
+ # max_receive_message_length of 4MB might not be enough.
2828
+ ('grpc.max_receive_message_length', -1),
2829
+ ]
2830
+ # It's fine to not grab the lock here, as we're only reading,
2831
+ # and writes are very rare.
2832
+ # It's acceptable to read while another process is opening a tunnel,
2833
+ # because it will only happen on:
2834
+ # 1. A new cluster who has no tunnel yet, or
2835
+ # 2. A cluster with an unhealthy tunnel
2836
+ # For (2), for processes that read the "stale" tunnel, it will fail
2837
+ # and on the next retry, it will call get_grpc_channel again
2838
+ # and get the new tunnel.
2839
+ tunnel = self._get_skylet_ssh_tunnel()
2840
+ if tunnel is not None:
2841
+ try:
2842
+ # Check if the tunnel is open.
2843
+ with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
2844
+ s.settimeout(0.5)
2845
+ s.connect(('localhost', tunnel.port))
2846
+ return grpc.insecure_channel(f'localhost:{tunnel.port}',
2847
+ options=grpc_options)
2848
+ except socket.error as e:
2849
+ logger.debug(
2850
+ 'Failed to connect to SSH tunnel for cluster '
2851
+ f'{self.cluster_name!r} on port {tunnel.port} ({e}), '
2852
+ 'acquiring lock')
2853
+ pass
2854
+ lock_id = backend_utils.cluster_tunnel_lock_id(self.cluster_name)
2855
+ lock_timeout = backend_utils.CLUSTER_TUNNEL_LOCK_TIMEOUT_SECONDS
2856
+ lock = locks.get_lock(lock_id, lock_timeout)
2857
+ try:
2858
+ with lock.acquire(blocking=True):
2859
+ # Re-read the tunnel from the DB.
2860
+ tunnel = self._get_skylet_ssh_tunnel()
2861
+ if tunnel is None:
2862
+ logger.debug('No SSH tunnel found for cluster '
2863
+ f'{self.cluster_name!r}, '
2864
+ 'opening the tunnel')
2865
+ tunnel = self._open_and_update_skylet_tunnel()
2866
+ return grpc.insecure_channel(f'localhost:{tunnel.port}',
2867
+ options=grpc_options)
2868
+ try:
2869
+ with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
2870
+ s.settimeout(0.5)
2871
+ s.connect(('localhost', tunnel.port))
2872
+ return grpc.insecure_channel(f'localhost:{tunnel.port}',
2873
+ options=grpc_options)
2874
+ except socket.error as e:
2875
+ logger.debug(
2876
+ 'Failed to connect to SSH tunnel for cluster '
2877
+ f'{self.cluster_name!r} on port {tunnel.port} ({e}), '
2878
+ 'opening new tunnel')
2879
+ tunnel = self._open_and_update_skylet_tunnel()
2880
+ return grpc.insecure_channel(f'localhost:{tunnel.port}',
2881
+ options=grpc_options)
2882
+ except locks.LockTimeout as e:
2883
+ raise RuntimeError(
2884
+ 'Failed to get gRPC channel for cluster '
2885
+ f'{self.cluster_name!r} due to a timeout when waiting for the '
2886
+ 'SSH tunnel to be opened. Please try again or manually remove '
2887
+ f'the lock at {lock_id}. '
2888
+ f'{common_utils.format_exception(e)}') from e
2889
+
2890
+ def _terminate_ssh_tunnel_process(self, tunnel_info: SSHTunnelInfo) -> None:
2891
+ """Terminate the SSH tunnel process."""
2892
+ try:
2893
+ proc = psutil.Process(tunnel_info.pid)
2894
+ if proc.is_running() and proc.status() != psutil.STATUS_ZOMBIE:
2895
+ logger.debug(
2896
+ f'Terminating SSH tunnel process {tunnel_info.pid}')
2897
+ subprocess_utils.kill_children_processes(proc.pid)
2898
+ except psutil.NoSuchProcess:
2899
+ pass
2900
+ except Exception as e: # pylint: disable=broad-except
2901
+ logger.warning(
2902
+ f'Failed to cleanup SSH tunnel process {tunnel_info.pid}: {e}')
2903
+
2904
+ def _open_and_update_skylet_tunnel(self) -> SSHTunnelInfo:
2905
+ """Opens an SSH tunnel to the Skylet on the head node,
2906
+ updates the cluster handle, and persists it to the database."""
2907
+ max_attempts = 3
2908
+ # There could be a race condition here, as multiple processes may
2909
+ # attempt to open the same port at the same time.
2910
+ for attempt in range(max_attempts):
2911
+ runners = self.get_command_runners()
2912
+ head_runner = runners[0]
2913
+ local_port = random.randint(10000, 65535)
2914
+ try:
2915
+ ssh_tunnel_proc = backend_utils.open_ssh_tunnel(
2916
+ head_runner, (local_port, constants.SKYLET_GRPC_PORT))
2917
+ except exceptions.CommandError as e:
2918
+ # Don't retry if the error is due to timeout,
2919
+ # connection refused, Kubernetes pods not found,
2920
+ # or an in-progress termination.
2921
+ if (e.detailed_reason is not None and
2922
+ (backend_utils.SSH_CONNECTION_ERROR_PATTERN.search(
2923
+ e.detailed_reason) or
2924
+ backend_utils.K8S_PODS_NOT_FOUND_PATTERN.search(
2925
+ e.detailed_reason) or attempt == max_attempts - 1)):
2926
+ raise e
2927
+ logger.warning(
2928
+ f'Failed to open SSH tunnel on port {local_port} '
2929
+ f'({attempt + 1}/{max_attempts}). '
2930
+ f'{e.error_msg}\n{e.detailed_reason}')
2931
+ continue
2932
+ tunnel_info = SSHTunnelInfo(port=local_port,
2933
+ pid=ssh_tunnel_proc.pid)
2934
+ break
2935
+
2936
+ try:
2937
+ grpc.channel_ready_future(
2938
+ grpc.insecure_channel(f'localhost:{tunnel_info.port}')).result(
2939
+ timeout=constants.SKYLET_GRPC_TIMEOUT_SECONDS)
2940
+ # Clean up existing tunnel before setting up the new one.
2941
+ old_tunnel = self._get_skylet_ssh_tunnel()
2942
+ if old_tunnel is not None:
2943
+ self._terminate_ssh_tunnel_process(old_tunnel)
2944
+ self._set_skylet_ssh_tunnel(tunnel_info)
2945
+ return tunnel_info
2946
+ except grpc.FutureTimeoutError as e:
2947
+ self._terminate_ssh_tunnel_process(tunnel_info)
2948
+ logger.warning(
2949
+ f'Skylet gRPC channel for cluster {self.cluster_name} not '
2950
+ f'ready after {constants.SKYLET_GRPC_TIMEOUT_SECONDS}s')
2951
+ raise e
2952
+ except Exception as e:
2953
+ self._terminate_ssh_tunnel_process(tunnel_info)
2954
+ raise e
2955
+
2535
2956
  @property
2536
2957
  def cluster_yaml(self) -> Optional[str]:
2537
2958
  if self._cluster_yaml is None:
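get_grpc_channel() above reuses the persisted Skylet tunnel only if a short TCP probe of localhost:<port> succeeds, and otherwise re-opens it under a per-cluster lock, probing again once the lock is held (double-checked, since another request may have just repaired the tunnel). A stand-alone sketch of that shape using only the standard library; open_tunnel is a hypothetical callable, and the real code persists the port/PID in the cluster database and uses a distributed lock rather than a thread lock:

import socket
import threading

_lock = threading.Lock()   # stand-in for the per-cluster distributed lock
_cached_port = None        # stand-in for the tunnel metadata stored in the DB

def _port_is_open(port: int, timeout: float = 0.5) -> bool:
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
        s.settimeout(timeout)
        try:
            s.connect(('localhost', port))
            return True
        except OSError:
            return False

def get_channel_port(open_tunnel) -> int:
    """open_tunnel() is a hypothetical callable returning a fresh local port."""
    global _cached_port
    # Fast path: no lock needed while the cached tunnel is healthy.
    if _cached_port is not None and _port_is_open(_cached_port):
        return _cached_port
    with _lock:
        # Re-check after acquiring the lock: another caller may have already
        # (re)opened the tunnel while we were waiting.
        if _cached_port is not None and _port_is_open(_cached_port):
            return _cached_port
        _cached_port = open_tunnel()
        return _cached_port
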
@@ -2542,6 +2963,12 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
2542
2963
  def cluster_yaml(self, value: Optional[str]):
2543
2964
  self._cluster_yaml = value
2544
2965
 
2966
+ @property
2967
+ def instance_ids(self):
2968
+ if self.cached_cluster_info is not None:
2969
+ return self.cached_cluster_info.instance_ids()
2970
+ return None
2971
+
2545
2972
  @property
2546
2973
  def ssh_user(self):
2547
2974
  if self.cached_cluster_info is not None:
@@ -2576,6 +3003,18 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
2576
3003
  num_ips = 1
2577
3004
  return num_ips
2578
3005
 
3006
+ @property
3007
+ def is_grpc_enabled_with_flag(self) -> bool:
3008
+ """Returns whether this handle has gRPC enabled and gRPC flag is set."""
3009
+ return env_options.Options.ENABLE_GRPC.get() and self.is_grpc_enabled
3010
+
3011
+ def __getstate__(self):
3012
+ state = self.__dict__.copy()
3013
+ # For backwards compatibility. Refer to
3014
+ # https://github.com/skypilot-org/skypilot/pull/7133
3015
+ state.setdefault('skylet_ssh_tunnel', None)
3016
+ return state
3017
+
2579
3018
  def __setstate__(self, state):
2580
3019
  self._version = self._VERSION
2581
3020
 
@@ -2606,7 +3045,7 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
2606
3045
  # pylint: disable=import-outside-toplevel
2607
3046
  launched_resources = state['launched_resources']
2608
3047
  if isinstance(launched_resources.cloud, clouds.Kubernetes):
2609
- yaml_config = common_utils.read_yaml(
3048
+ yaml_config = global_user_state.get_cluster_yaml_dict(
2610
3049
  os.path.expanduser(state['_cluster_yaml']))
2611
3050
  context = kubernetes_utils.get_context_from_config(
2612
3051
  yaml_config['provider'])
@@ -2629,6 +3068,14 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
2629
3068
  os.path.expanduser(state['_cluster_yaml'])):
2630
3069
  state['_cluster_yaml'] = None
2631
3070
 
3071
+ if version < 11:
3072
+ state['is_grpc_enabled'] = False
3073
+ state['skylet_ssh_tunnel'] = None
3074
+
3075
+ if version >= 12:
3076
+ # DEPRECATED in favor of skylet_ssh_tunnel_metadata column in the DB
3077
+ state.pop('skylet_ssh_tunnel', None)
3078
+
2632
3079
  self.__dict__.update(state)
2633
3080
 
2634
3081
  # Because the update_cluster_ips and update_ssh_ports
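The handle bumps _VERSION to 12, migrates older pickles in __setstate__ (versions below 11 never had is_grpc_enabled; from 12 on, skylet_ssh_tunnel moves into a database column) and adds __getstate__ so pickles written by this version stay readable by slightly older readers that still expect the dropped field. A minimal stand-alone version of the pattern; the class and fields are illustrative, not the handle's actual schema:

import pickle

class VersionedHandle:
    _VERSION = 12  # bump whenever the pickled schema changes

    def __init__(self):
        self._version = self._VERSION
        self.is_grpc_enabled = True

    def __getstate__(self):
        state = self.__dict__.copy()
        # Keep a placeholder so older readers that still expect the field
        # can unpickle objects written by this version.
        state.setdefault('skylet_ssh_tunnel', None)
        return state

    def __setstate__(self, state):
        version = state.get('_version', 1)
        if version < 11:
            state['is_grpc_enabled'] = False      # field did not exist yet
        if version >= 12:
            state.pop('skylet_ssh_tunnel', None)  # now lives outside the pickle
        state['_version'] = self._VERSION
        self.__dict__.update(state)

# Round-trips across the migration logic:
handle = pickle.loads(pickle.dumps(VersionedHandle()))
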
@@ -2653,6 +3100,234 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
2653
3100
  pass
2654
3101
 
2655
3102
 
3103
+ class LocalResourcesHandle(CloudVmRayResourceHandle):
3104
+ """A handle for local resources."""
3105
+
3106
+ def __init__(
3107
+ self,
3108
+ *,
3109
+ cluster_name: str,
3110
+ cluster_name_on_cloud: str,
3111
+ cluster_yaml: Optional[str],
3112
+ launched_nodes: int,
3113
+ launched_resources: resources_lib.Resources,
3114
+ stable_internal_external_ips: Optional[List[Tuple[str,
3115
+ str]]] = None,
3116
+ stable_ssh_ports: Optional[List[int]] = None,
3117
+ cluster_info: Optional[provision_common.ClusterInfo] = None
3118
+ ) -> None:
3119
+ super().__init__(
3120
+ cluster_name=cluster_name,
3121
+ cluster_name_on_cloud=cluster_name_on_cloud,
3122
+ cluster_yaml=cluster_yaml,
3123
+ launched_nodes=launched_nodes,
3124
+ launched_resources=launched_resources,
3125
+ stable_internal_external_ips=stable_internal_external_ips,
3126
+ stable_ssh_ports=stable_ssh_ports,
3127
+ cluster_info=cluster_info)
3128
+ # TODO (kyuds): handle jobs consolidation mode. Currently,
3129
+ # jobs consolidation mode will not run a skylet, hence
3130
+ # grpc server will not run. In the future, we should
3131
+ # figure out a way to start grpc in consolidation mode.
3132
+ self.is_grpc_enabled = False
3133
+
3134
+ @context_utils.cancellation_guard
3135
+ # we expect different request to be acting on different clusters
3136
+ # (= different handles) so we have no real expectation of cache hit
3137
+ # across requests.
3138
+ # Do not change this cache to global scope
3139
+ # without understanding https://github.com/skypilot-org/skypilot/pull/6908
3140
+ @annotations.lru_cache(scope='request', maxsize=10)
3141
+ @timeline.event
3142
+ def get_command_runners(self,
3143
+ force_cached: bool = False,
3144
+ avoid_ssh_control: bool = False
3145
+ ) -> List[command_runner.CommandRunner]:
3146
+ """Returns a list of local command runners."""
3147
+ del force_cached, avoid_ssh_control # Unused.
3148
+ return [command_runner.LocalProcessCommandRunner()]
3149
+
3150
+
3151
+ class SkyletClient:
3152
+ """The client to interact with a remote cluster through Skylet."""
3153
+
3154
+ def __init__(self, channel: 'grpc.Channel'):
3155
+ self._autostop_stub = autostopv1_pb2_grpc.AutostopServiceStub(channel)
3156
+ self._jobs_stub = jobsv1_pb2_grpc.JobsServiceStub(channel)
3157
+ self._serve_stub = servev1_pb2_grpc.ServeServiceStub(channel)
3158
+ self._managed_jobs_stub = (
3159
+ managed_jobsv1_pb2_grpc.ManagedJobsServiceStub(channel))
3160
+
3161
+ def set_autostop(
3162
+ self,
3163
+ request: 'autostopv1_pb2.SetAutostopRequest',
3164
+ timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
3165
+ ) -> 'autostopv1_pb2.SetAutostopResponse':
3166
+ return self._autostop_stub.SetAutostop(request, timeout=timeout)
3167
+
3168
+ def is_autostopping(
3169
+ self,
3170
+ request: 'autostopv1_pb2.IsAutostoppingRequest',
3171
+ timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
3172
+ ) -> 'autostopv1_pb2.IsAutostoppingResponse':
3173
+ return self._autostop_stub.IsAutostopping(request, timeout=timeout)
3174
+
3175
+ def add_job(
3176
+ self,
3177
+ request: 'jobsv1_pb2.AddJobRequest',
3178
+ timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
3179
+ ) -> 'jobsv1_pb2.AddJobResponse':
3180
+ return self._jobs_stub.AddJob(request, timeout=timeout)
3181
+
3182
+ def queue_job(
3183
+ self,
3184
+ request: 'jobsv1_pb2.QueueJobRequest',
3185
+ timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
3186
+ ) -> 'jobsv1_pb2.QueueJobResponse':
3187
+ return self._jobs_stub.QueueJob(request, timeout=timeout)
3188
+
3189
+ def update_status(
3190
+ self,
3191
+ request: 'jobsv1_pb2.UpdateStatusRequest',
3192
+ timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
3193
+ ) -> 'jobsv1_pb2.UpdateStatusResponse':
3194
+ return self._jobs_stub.UpdateStatus(request, timeout=timeout)
3195
+
3196
+ def get_job_queue(
3197
+ self,
3198
+ request: 'jobsv1_pb2.GetJobQueueRequest',
3199
+ timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
3200
+ ) -> 'jobsv1_pb2.GetJobQueueResponse':
3201
+ return self._jobs_stub.GetJobQueue(request, timeout=timeout)
3202
+
3203
+ def cancel_jobs(
3204
+ self,
3205
+ request: 'jobsv1_pb2.CancelJobsRequest',
3206
+ timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
3207
+ ) -> 'jobsv1_pb2.CancelJobsResponse':
3208
+ return self._jobs_stub.CancelJobs(request, timeout=timeout)
3209
+
3210
+ def fail_all_in_progress_jobs(
3211
+ self,
3212
+ request: 'jobsv1_pb2.FailAllInProgressJobsRequest',
3213
+ timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
3214
+ ) -> 'jobsv1_pb2.FailAllInProgressJobsResponse':
3215
+ return self._jobs_stub.FailAllInProgressJobs(request, timeout=timeout)
3216
+
3217
+ def get_job_status(
3218
+ self,
3219
+ request: 'jobsv1_pb2.GetJobStatusRequest',
3220
+ timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
3221
+ ) -> 'jobsv1_pb2.GetJobStatusResponse':
3222
+ return self._jobs_stub.GetJobStatus(request, timeout=timeout)
3223
+
3224
+ def get_job_submitted_timestamp(
3225
+ self,
3226
+ request: 'jobsv1_pb2.GetJobSubmittedTimestampRequest',
3227
+ timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
3228
+ ) -> 'jobsv1_pb2.GetJobSubmittedTimestampResponse':
3229
+ return self._jobs_stub.GetJobSubmittedTimestamp(request,
3230
+ timeout=timeout)
3231
+
3232
+ def get_job_ended_timestamp(
3233
+ self,
3234
+ request: 'jobsv1_pb2.GetJobEndedTimestampRequest',
3235
+ timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
3236
+ ) -> 'jobsv1_pb2.GetJobEndedTimestampResponse':
3237
+ return self._jobs_stub.GetJobEndedTimestamp(request, timeout=timeout)
3238
+
3239
+ def get_log_dirs_for_jobs(
3240
+ self,
3241
+ request: 'jobsv1_pb2.GetLogDirsForJobsRequest',
3242
+ timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
3243
+ ) -> 'jobsv1_pb2.GetLogDirsForJobsResponse':
3244
+ return self._jobs_stub.GetLogDirsForJobs(request, timeout=timeout)
3245
+
3246
+ def tail_logs(
3247
+ self,
3248
+ request: 'jobsv1_pb2.TailLogsRequest',
3249
+ timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
3250
+ ) -> Iterator['jobsv1_pb2.TailLogsResponse']:
3251
+ return self._jobs_stub.TailLogs(request, timeout=timeout)
3252
+
3253
+ def get_service_status(
3254
+ self,
3255
+ request: 'servev1_pb2.GetServiceStatusRequest',
3256
+ timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
3257
+ ) -> 'servev1_pb2.GetServiceStatusResponse':
3258
+ return self._serve_stub.GetServiceStatus(request, timeout=timeout)
3259
+
3260
+ def add_serve_version(
3261
+ self,
3262
+ request: 'servev1_pb2.AddVersionRequest',
3263
+ timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
3264
+ ) -> 'servev1_pb2.AddVersionResponse':
3265
+ return self._serve_stub.AddVersion(request, timeout=timeout)
3266
+
3267
+ def terminate_services(
3268
+ self,
3269
+ request: 'servev1_pb2.TerminateServicesRequest',
3270
+ timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
3271
+ ) -> 'servev1_pb2.TerminateServicesResponse':
3272
+ return self._serve_stub.TerminateServices(request, timeout=timeout)
3273
+
3274
+ def terminate_replica(
3275
+ self,
3276
+ request: 'servev1_pb2.TerminateReplicaRequest',
3277
+ timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
3278
+ ) -> 'servev1_pb2.TerminateReplicaResponse':
3279
+ return self._serve_stub.TerminateReplica(request, timeout=timeout)
3280
+
3281
+ def wait_service_registration(
3282
+ self,
3283
+ request: 'servev1_pb2.WaitServiceRegistrationRequest',
3284
+ timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
3285
+ ) -> 'servev1_pb2.WaitServiceRegistrationResponse':
3286
+ # set timeout to at least 10 seconds more than service register
3287
+ # constant to make sure that timeouts will not occur.
3288
+ if timeout is not None:
3289
+ timeout = max(timeout,
3290
+ serve_constants.SERVICE_REGISTER_TIMEOUT_SECONDS + 10)
3291
+ return self._serve_stub.WaitServiceRegistration(request,
3292
+ timeout=timeout)
3293
+
3294
+ def update_service(
3295
+ self,
3296
+ request: 'servev1_pb2.UpdateServiceRequest',
3297
+ timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
3298
+ ) -> 'servev1_pb2.UpdateServiceResponse':
3299
+ return self._serve_stub.UpdateService(request, timeout=timeout)
3300
+
3301
+ def get_managed_job_controller_version(
3302
+ self,
3303
+ request: 'managed_jobsv1_pb2.GetVersionRequest',
3304
+ timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
3305
+ ) -> 'managed_jobsv1_pb2.GetVersionResponse':
3306
+ return self._managed_jobs_stub.GetVersion(request, timeout=timeout)
3307
+
3308
+ def get_managed_job_table(
3309
+ self,
3310
+ request: 'managed_jobsv1_pb2.GetJobTableRequest',
3311
+ timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
3312
+ ) -> 'managed_jobsv1_pb2.GetJobTableResponse':
3313
+ return self._managed_jobs_stub.GetJobTable(request, timeout=timeout)
3314
+
3315
+ def get_all_managed_job_ids_by_name(
3316
+ self,
3317
+ request: 'managed_jobsv1_pb2.GetAllJobIdsByNameRequest',
3318
+ timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
3319
+ ) -> 'managed_jobsv1_pb2.GetAllJobIdsByNameResponse':
3320
+ return self._managed_jobs_stub.GetAllJobIdsByName(request,
3321
+ timeout=timeout)
3322
+
3323
+ def cancel_managed_jobs(
3324
+ self,
3325
+ request: 'managed_jobsv1_pb2.CancelJobsRequest',
3326
+ timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
3327
+ ) -> 'managed_jobsv1_pb2.CancelJobsResponse':
3328
+ return self._managed_jobs_stub.CancelJobs(request, timeout=timeout)
3329
+
3330
+
2656
3331
  @registry.BACKEND_REGISTRY.type_register(name='cloudvmray')
2657
3332
  class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
2658
3333
  """Backend: runs on cloud virtual machines, managed by Ray.
@@ -2665,7 +3340,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
2665
3340
  NAME = 'cloudvmray'
2666
3341
 
2667
3342
  # Backward compatibility, with the old name of the handle.
2668
- ResourceHandle = CloudVmRayResourceHandle # pylint: disable=invalid-name
3343
+ ResourceHandle = CloudVmRayResourceHandle # type: ignore
2669
3344
 
2670
3345
  def __init__(self):
2671
3346
  self.run_timestamp = sky_logging.get_run_timestamp()
@@ -2680,6 +3355,12 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
2680
3355
  self._dag = None
2681
3356
  self._optimize_target = None
2682
3357
  self._requested_features = set()
3358
+ self._dump_final_script = False
3359
+ self._is_managed = False
3360
+ # Optional planner (via register_info): used under the per-cluster lock
3361
+ # to produce a fresh concrete plan when neither a reusable snapshot nor
3362
+ # a caller plan is available.
3363
+ self._planner = None
2683
3364
 
2684
3365
  # Command for running the setup script. It is only set when the
2685
3366
  # setup needs to be run outside the self._setup() and as part of
@@ -2696,6 +3377,10 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
2696
3377
  self._requested_features = kwargs.pop('requested_features',
2697
3378
  self._requested_features)
2698
3379
  self._dump_final_script = kwargs.pop('dump_final_script', False)
3380
+ self._is_managed = kwargs.pop('is_managed', False)
3381
+ # Optional planner callback for a fresh plan under lock when no
3382
+ # reusable snapshot/caller plan exists. Keeps optimizer in upper layer.
3383
+ self._planner = kwargs.pop('planner', self._planner)
2699
3384
  assert not kwargs, f'Unexpected kwargs: {kwargs}'
2700
3385
 
2701
3386
  def check_resources_fit_cluster(
@@ -2722,9 +3407,9 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
2722
3407
  # Usage Collection:
2723
3408
  usage_lib.messages.usage.update_cluster_resources(
2724
3409
  handle.launched_nodes, launched_resources)
2725
- record = global_user_state.get_cluster_from_name(cluster_name)
2726
- if record is not None:
2727
- usage_lib.messages.usage.update_cluster_status(record['status'])
3410
+ status = global_user_state.get_status_from_cluster_name(cluster_name)
3411
+ if status is not None:
3412
+ usage_lib.messages.usage.update_cluster_status(status)
2728
3413
 
2729
3414
  assert launched_resources.region is not None, handle
2730
3415
 
@@ -2846,12 +3531,46 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
2846
3531
  # Check if the cluster is owned by the current user. Raise
2847
3532
  # exceptions.ClusterOwnerIdentityMismatchError
2848
3533
  backend_utils.check_owner_identity(cluster_name)
2849
- lock_path = os.path.expanduser(
2850
- backend_utils.CLUSTER_STATUS_LOCK_PATH.format(cluster_name))
2851
- with timeline.FileLockEvent(lock_path):
2852
- # Try to launch the exiting cluster first. If no existing cluster,
2853
- # this function will create a to_provision_config with required
2854
- # resources.
3534
+ lock_id = backend_utils.cluster_status_lock_id(cluster_name)
3535
+ communicated_with_user = False
3536
+
3537
+ while True:
3538
+ try:
3539
+ return self._locked_provision(lock_id, task, to_provision,
3540
+ dryrun, stream_logs, cluster_name,
3541
+ retry_until_up,
3542
+ skip_unnecessary_provisioning)
3543
+ except locks.LockTimeout:
3544
+ if not communicated_with_user:
3545
+ rich_utils.force_update_status(
3546
+ ux_utils.spinner_message('Launching - blocked by ' +
3547
+ 'other requests ' +
3548
+ colorama.Style.RESET_ALL +
3549
+ colorama.Style.DIM +
3550
+ 'Check concurrent requests: ' +
3551
+ 'sky api status -v | grep '
3552
+ f'{cluster_name}'))
3553
+
3554
+ def _locked_provision(
3555
+ self,
3556
+ lock_id: str,
3557
+ task: task_lib.Task,
3558
+ to_provision: Optional[resources_lib.Resources],
3559
+ dryrun: bool,
3560
+ stream_logs: bool,
3561
+ cluster_name: str,
3562
+ retry_until_up: bool = False,
3563
+ skip_unnecessary_provisioning: bool = False,
3564
+ ) -> Tuple[Optional[CloudVmRayResourceHandle], bool]:
3565
+ with lock_events.DistributedLockEvent(lock_id, _CLUSTER_LOCK_TIMEOUT):
3566
+ # Reset spinner message to remove any mention of being blocked
3567
+ # by other requests.
3568
+ rich_utils.force_update_status(
3569
+ ux_utils.spinner_message('Launching'))
3570
+
3571
+ # Try to launch the exiting cluster first. If no existing
3572
+ # cluster, this function will create a to_provision_config
3573
+ # with required resources.
2855
3574
  to_provision_config = self._check_existing_cluster(
2856
3575
  task, to_provision, cluster_name, dryrun)
2857
3576
  assert to_provision_config.resources is not None, (
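provision() above now loops on _locked_provision() and, when locks.LockTimeout fires, updates the spinner once to say the launch is blocked by concurrent requests on the same cluster, then keeps waiting. The same shape with the filelock package standing in for SkyPilot's locks / DistributedLockEvent (which may be database-backed rather than file-based):

import filelock  # pip install filelock; stand-in for SkyPilot's locks module

LOCK_TIMEOUT_SECONDS = 30  # illustrative

def provision_with_lock(cluster_name: str, do_provision):
    lock = filelock.FileLock(f'/tmp/.sky_{cluster_name}_provision.lock')
    told_user = False
    while True:
        try:
            with lock.acquire(timeout=LOCK_TIMEOUT_SECONDS):
                return do_provision()
        except filelock.Timeout:
            if not told_user:
                print(f'Launching {cluster_name!r} - blocked by other '
                      'requests; still waiting for the cluster lock.')
                told_user = True
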
@@ -2869,14 +3588,6 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
2869
3588
  # TODO(suquark): once we have sky on PyPI, we should directly
2870
3589
  # install sky from PyPI.
2871
3590
  local_wheel_path, wheel_hash = wheel_utils.build_sky_wheel()
2872
- # The most frequent reason for the failure of a provision
2873
- # request is resource unavailability instead of rate
2874
- # limiting; to make users wait shorter, we do not make
2875
- # backoffs exponential.
2876
- backoff = common_utils.Backoff(
2877
- initial_backoff=_RETRY_UNTIL_UP_INIT_GAP_SECONDS,
2878
- max_backoff_factor=1)
2879
- attempt_cnt = 1
2880
3591
  while True:
2881
3592
  # For on-demand instances, RetryingVmProvisioner will retry
2882
3593
  # within the given region first, then optionally retry on all
@@ -2900,16 +3611,20 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
2900
3611
  self._requested_features,
2901
3612
  local_wheel_path,
2902
3613
  wheel_hash,
2903
- blocked_resources=task.blocked_resources)
3614
+ blocked_resources=task.blocked_resources,
3615
+ is_managed=self._is_managed)
2904
3616
  log_path = os.path.join(self.log_dir, 'provision.log')
2905
3617
  rich_utils.force_update_status(
2906
- ux_utils.spinner_message('Launching', log_path))
3618
+ ux_utils.spinner_message('Launching',
3619
+ log_path,
3620
+ cluster_name=cluster_name))
2907
3621
  config_dict = retry_provisioner.provision_with_retries(
2908
3622
  task, to_provision_config, dryrun, stream_logs,
2909
3623
  skip_unnecessary_provisioning)
2910
3624
  break
2911
3625
  except exceptions.ResourcesUnavailableError as e:
2912
3626
  log_path = retry_provisioner.log_dir + '/provision.log'
3627
+
2913
3628
  error_message = (
2914
3629
  f'{colorama.Fore.RED}Failed to provision all '
2915
3630
  f'possible launchable resources.'
@@ -2920,23 +3635,34 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
2920
3635
  error_message = str(e)
2921
3636
 
2922
3637
  if retry_until_up:
2923
- logger.error(error_message)
2924
- # Sleep and retry.
2925
- gap_seconds = backoff.current_backoff()
2926
- plural = 's' if attempt_cnt > 1 else ''
3638
+ gap_seconds = _RETRY_UNTIL_UP_INIT_GAP_SECONDS
2927
3639
  retry_message = ux_utils.retry_message(
2928
- f'Retry after {gap_seconds:.0f}s '
2929
- f'({attempt_cnt} attempt{plural}). ')
2930
- logger.info(f'\n{retry_message} '
2931
- f'{ux_utils.log_path_hint(log_path)}'
2932
- f'{colorama.Style.RESET_ALL}')
2933
- attempt_cnt += 1
2934
- time.sleep(gap_seconds)
2935
- continue
3640
+ f'Retry after {gap_seconds:.0f}s ')
3641
+ hint_message = (
3642
+ f'\n{retry_message} '
3643
+ f'{ux_utils.provision_hint(cluster_name)}'
3644
+ f'{colorama.Style.RESET_ALL}')
3645
+
3646
+ # Add cluster event for retry.
3647
+ global_user_state.add_cluster_event(
3648
+ cluster_name, status_lib.ClusterStatus.INIT,
3649
+ f'Retrying provisioning after {gap_seconds:.0f}s',
3650
+ global_user_state.ClusterEventType.STATUS_CHANGE)
3651
+
3652
+ raise exceptions.ExecutionRetryableError(
3653
+ error_message,
3654
+ hint=hint_message,
3655
+ retry_wait_seconds=gap_seconds)
2936
3656
  # Clean up the cluster's entry in `sky status`.
2937
3657
  # Do not remove the stopped cluster from the global state
2938
3658
  # if failed to start.
2939
3659
  if not e.no_failover:
3660
+ global_user_state.add_cluster_event(
3661
+ cluster_name,
3662
+ None,
3663
+ 'Provision failed: ' + str(e),
3664
+ global_user_state.ClusterEventType.STATUS_CHANGE,
3665
+ nop_if_duplicate=True)
2940
3666
  global_user_state.remove_cluster(cluster_name,
2941
3667
  terminate=True)
2942
3668
  usage_lib.messages.usage.update_final_cluster_status(
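With --retry-until-up, the removed Backoff/attempt_cnt loop is gone: rather than sleeping in-process, the backend raises exceptions.ExecutionRetryableError carrying a hint and retry_wait_seconds, leaving the wait and re-execution to a caller higher in the stack. A minimal sketch of that inversion of control; the exception and driver loop are illustrative stand-ins, not SkyPilot's real classes:

import time

class RetryableError(Exception):
    """Illustrative stand-in for exceptions.ExecutionRetryableError."""

    def __init__(self, message: str, hint: str, retry_wait_seconds: float):
        super().__init__(message)
        self.hint = hint
        self.retry_wait_seconds = retry_wait_seconds

def run_with_retries(execute) -> None:
    # The caller, not the provisioning code, owns the retry cadence.
    while True:
        try:
            execute()
            return
        except RetryableError as e:
            print(e.hint)
            time.sleep(e.retry_wait_seconds)
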
@@ -2944,7 +3670,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
2944
3670
  logger.error(
2945
3671
  ux_utils.error_message(
2946
3672
  'Failed to provision resources. '
2947
- f'{ux_utils.log_path_hint(log_path)}'))
3673
+ f'{ux_utils.provision_hint(cluster_name)}'))
2948
3674
  error_message += (
2949
3675
  '\nTo keep retrying until the cluster is up, use '
2950
3676
  'the `--retry-until-up` flag.')
@@ -2953,8 +3679,9 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
2953
3679
  error_message + '\n' + str(e),
2954
3680
  failover_history=e.failover_history) from None
2955
3681
  if dryrun:
2956
- record = global_user_state.get_cluster_from_name(cluster_name)
2957
- return record['handle'] if record is not None else None, False
3682
+ handle = global_user_state.get_handle_from_cluster_name(
3683
+ cluster_name)
3684
+ return handle if handle is not None else None, False
2958
3685
 
2959
3686
  if config_dict['provisioning_skipped']:
2960
3687
  # Skip further provisioning.
@@ -2962,10 +3689,10 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
2962
3689
  # ('handle', 'provision_record', 'resources_vars')
2963
3690
  # We need to return the handle - but it should be the existing
2964
3691
  # handle for the cluster.
2965
- record = global_user_state.get_cluster_from_name(cluster_name)
2966
- assert record is not None and record['handle'] is not None, (
2967
- cluster_name, record)
2968
- return record['handle'], True
3692
+ handle = global_user_state.get_handle_from_cluster_name(
3693
+ cluster_name)
3694
+ assert handle is not None, (cluster_name, handle)
3695
+ return handle, True
2969
3696
 
2970
3697
  if 'provision_record' in config_dict:
2971
3698
  # New provisioner is used here.
@@ -2980,8 +3707,15 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
2980
3707
  # and other necessary files to the VM.
2981
3708
  # 3. Run setup commands to install dependencies.
2982
3709
  # 4. Starting ray cluster and skylet.
3710
+
3711
+ # Add cluster event for runtime setup start
3712
+ global_user_state.add_cluster_event(
3713
+ handle.cluster_name, status_lib.ClusterStatus.INIT,
3714
+ 'Setting up SkyPilot runtime on cluster',
3715
+ global_user_state.ClusterEventType.STATUS_CHANGE)
3716
+
2983
3717
  cluster_info = provisioner.post_provision_runtime_setup(
2984
- repr(handle.launched_resources.cloud),
3718
+ handle.launched_resources,
2985
3719
  resources_utils.ClusterName(handle.cluster_name,
2986
3720
  handle.cluster_name_on_cloud),
2987
3721
  handle.cluster_yaml,
@@ -2995,6 +3729,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
2995
3729
  # manually or by the cloud provider.
2996
3730
  # Optimize the case where the cluster's IPs can be retrieved
2997
3731
  # from cluster_info.
3732
+ handle.cached_cluster_info = cluster_info
2998
3733
  handle.docker_user = cluster_info.docker_user
2999
3734
  handle.update_cluster_ips(max_attempts=_FETCH_IP_MAX_ATTEMPTS,
3000
3735
  cluster_info=cluster_info)
@@ -3006,7 +3741,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3006
3741
 
3007
3742
  self._update_after_cluster_provisioned(
3008
3743
  handle, to_provision_config.prev_handle, task,
3009
- prev_cluster_status, lock_path, config_hash)
3744
+ prev_cluster_status, config_hash)
3010
3745
  return handle, False
3011
3746
 
3012
3747
  cluster_config_file = config_dict['ray']
@@ -3016,8 +3751,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3016
3751
  ssh_port_list = handle.external_ssh_ports()
3017
3752
  assert ip_list is not None, handle
3018
3753
  assert ssh_port_list is not None, handle
3019
-
3020
- config = common_utils.read_yaml(cluster_config_file)
3754
+ config = global_user_state.get_cluster_yaml_dict(
3755
+ cluster_config_file)
3021
3756
  if 'docker' in config:
3022
3757
  handle.setup_docker_user(cluster_config_file)
3023
3758
 
@@ -3078,14 +3813,14 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3078
3813
 
3079
3814
  self._update_after_cluster_provisioned(
3080
3815
  handle, to_provision_config.prev_handle, task,
3081
- prev_cluster_status, lock_path, config_hash)
3816
+ prev_cluster_status, config_hash)
3082
3817
  return handle, False
3083
3818
 
3084
3819
  def _open_ports(self, handle: CloudVmRayResourceHandle) -> None:
3085
3820
  cloud = handle.launched_resources.cloud
3086
3821
  logger.debug(
3087
3822
  f'Opening ports {handle.launched_resources.ports} for {cloud}')
3088
- config = common_utils.read_yaml(handle.cluster_yaml)
3823
+ config = global_user_state.get_cluster_yaml_dict(handle.cluster_yaml)
3089
3824
  provider_config = config['provider']
3090
3825
  provision_lib.open_ports(repr(cloud), handle.cluster_name_on_cloud,
3091
3826
  handle.launched_resources.ports,
@@ -3096,7 +3831,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3096
3831
  prev_handle: Optional[CloudVmRayResourceHandle],
3097
3832
  task: task_lib.Task,
3098
3833
  prev_cluster_status: Optional[status_lib.ClusterStatus],
3099
- lock_path: str, config_hash: str) -> None:
3834
+ config_hash: str) -> None:
3100
3835
  usage_lib.messages.usage.update_cluster_resources(
3101
3836
  handle.launched_nodes, handle.launched_resources)
3102
3837
  usage_lib.messages.usage.update_final_cluster_status(
@@ -3108,16 +3843,26 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3108
3843
  # update_status will query the ray job status for all INIT /
3109
3844
  # PENDING / RUNNING jobs for the real status, since we do not
3110
3845
  # know the actual previous status of the cluster.
3111
- cmd = job_lib.JobLibCodeGen.update_status()
3112
3846
  logger.debug('Update job queue on remote cluster.')
3113
3847
  with rich_utils.safe_status(
3114
3848
  ux_utils.spinner_message('Preparing SkyPilot runtime')):
3115
- returncode, _, stderr = self.run_on_head(handle,
3116
- cmd,
3117
- require_outputs=True)
3118
- subprocess_utils.handle_returncode(returncode, cmd,
3119
- 'Failed to update job status.',
3120
- stderr)
3849
+ use_legacy = not handle.is_grpc_enabled_with_flag
3850
+
3851
+ if not use_legacy:
3852
+ try:
3853
+ request = jobsv1_pb2.UpdateStatusRequest()
3854
+ backend_utils.invoke_skylet_with_retries(
3855
+ lambda: SkyletClient(handle.get_grpc_channel()
3856
+ ).update_status(request))
3857
+ except exceptions.SkyletMethodNotImplementedError:
3858
+ use_legacy = True
3859
+
3860
+ if use_legacy:
3861
+ cmd = job_lib.JobLibCodeGen.update_status()
3862
+ returncode, _, stderr = self.run_on_head(
3863
+ handle, cmd, require_outputs=True)
3864
+ subprocess_utils.handle_returncode(
3865
+ returncode, cmd, 'Failed to update job status.', stderr)
3121
3866
  if prev_cluster_status == status_lib.ClusterStatus.STOPPED:
3122
3867
  # Safely set all the previous jobs to FAILED since the cluster
3123
3868
  # is restarted
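From here, post-provision bookkeeping (the job-status refresh above and the fail-in-progress-jobs step below) tries the Skylet gRPC service first and falls back to the legacy code-gen-over-SSH path only when gRPC is disabled for the handle or the remote Skylet predates the RPC. The negotiation reduces to a small helper like the following (illustrative only; the hunks inline this logic rather than defining such a helper):

def call_with_legacy_fallback(grpc_call, legacy_call, grpc_enabled: bool):
    """Prefer the gRPC path, degrade to the SSH code-gen path.

    grpc_call raises exceptions.SkyletMethodNotImplementedError when the
    remote Skylet is too old to serve the RPC.
    """
    if grpc_enabled:
        try:
            return grpc_call()
        except exceptions.SkyletMethodNotImplementedError:
            pass  # older remote Skylet; fall through to the legacy path
    return legacy_call()
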
@@ -3125,14 +3870,25 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3125
3870
  # 1. A job finishes RUNNING, but right before it update itself
3126
3871
  # to SUCCEEDED, the cluster is STOPPED by `sky stop`.
3127
3872
  # 2. On next `sky start`, it gets reset to FAILED.
3128
- cmd = job_lib.JobLibCodeGen.fail_all_jobs_in_progress()
3129
- returncode, stdout, stderr = self.run_on_head(handle,
3130
- cmd,
3131
- require_outputs=True)
3132
- subprocess_utils.handle_returncode(
3133
- returncode, cmd,
3134
- 'Failed to set previously in-progress jobs to FAILED',
3135
- stdout + stderr)
3873
+ use_legacy = not handle.is_grpc_enabled_with_flag
3874
+
3875
+ if not use_legacy:
3876
+ try:
3877
+ fail_request = jobsv1_pb2.FailAllInProgressJobsRequest()
3878
+ backend_utils.invoke_skylet_with_retries(
3879
+ lambda: SkyletClient(handle.get_grpc_channel(
3880
+ )).fail_all_in_progress_jobs(fail_request))
3881
+ except exceptions.SkyletMethodNotImplementedError:
3882
+ use_legacy = True
3883
+
3884
+ if use_legacy:
3885
+ cmd = job_lib.JobLibCodeGen.fail_all_jobs_in_progress()
3886
+ returncode, stdout, stderr = self.run_on_head(
3887
+ handle, cmd, require_outputs=True)
3888
+ subprocess_utils.handle_returncode(
3889
+ returncode, cmd,
3890
+ 'Failed to set previously in-progress jobs to FAILED',
3891
+ stdout + stderr)
3136
3892
 
3137
3893
  prev_ports = None
3138
3894
  if prev_handle is not None:
@@ -3142,14 +3898,20 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3142
3898
  resources_utils.port_ranges_to_set(current_ports) -
3143
3899
  resources_utils.port_ranges_to_set(prev_ports))
3144
3900
  if open_new_ports:
3145
- cloud = handle.launched_resources.cloud
3146
- if not (cloud.OPEN_PORTS_VERSION <=
3901
+ launched_resources = handle.launched_resources.assert_launchable()
3902
+ if not (launched_resources.cloud.OPEN_PORTS_VERSION <=
3147
3903
  clouds.OpenPortsVersion.LAUNCH_ONLY):
3148
3904
  with rich_utils.safe_status(
3149
3905
  ux_utils.spinner_message(
3150
3906
  'Launching - Opening new ports')):
3151
3907
  self._open_ports(handle)
3152
3908
 
3909
+ # Capture task YAML and command
3910
+ user_specified_task_config = None
3911
+ if task is not None:
3912
+ user_specified_task_config = task.to_yaml_config(
3913
+ use_user_specified_yaml=True)
3914
+
3153
3915
  with timeline.Event('backend.provision.post_process'):
3154
3916
  global_user_state.add_or_update_cluster(
3155
3917
  handle.cluster_name,
@@ -3157,7 +3919,16 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3157
3919
  set(task.resources),
3158
3920
  ready=True,
3159
3921
  config_hash=config_hash,
3922
+ task_config=user_specified_task_config,
3160
3923
  )
3924
+
3925
+ # Add cluster event for successful provisioning.
3926
+ global_user_state.add_cluster_event(
3927
+ handle.cluster_name, status_lib.ClusterStatus.UP,
3928
+ 'Cluster successfully provisioned with ' +
3929
+ f'{handle.launched_nodes} nodes',
3930
+ global_user_state.ClusterEventType.STATUS_CHANGE)
3931
+
3161
3932
  usage_lib.messages.usage.update_final_cluster_status(
3162
3933
  status_lib.ClusterStatus.UP)
3163
3934
  # We still add the cluster to ssh config file on API server, this
@@ -3172,13 +3943,60 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3172
3943
  handle.cached_external_ssh_ports, handle.docker_user,
3173
3944
  handle.ssh_user)
3174
3945
 
3175
- common_utils.remove_file_if_exists(lock_path)
3176
-
3177
3946
  def _sync_workdir(self, handle: CloudVmRayResourceHandle,
3178
- workdir: Path) -> None:
3947
+ workdir: Union[Path, Dict[str, Any]],
3948
+ envs_and_secrets: Dict[str, str]) -> None:
3179
3949
  # Even though provision() takes care of it, there may be cases where
3180
3950
  # this function is called in isolation, without calling provision(),
3181
3951
  # e.g., in CLI. So we should rerun rsync_up.
3952
+ if isinstance(workdir, dict):
3953
+ self._sync_git_workdir(handle, envs_and_secrets)
3954
+ else:
3955
+ self._sync_path_workdir(handle, workdir)
3956
+
3957
+ def _sync_git_workdir(self, handle: CloudVmRayResourceHandle,
3958
+ envs_and_secrets: Dict[str, str]) -> None:
3959
+ style = colorama.Style
3960
+ ip_list = handle.external_ips()
3961
+ assert ip_list is not None, 'external_ips is not cached in handle'
3962
+
3963
+ log_path = os.path.join(self.log_dir, 'workdir_sync.log')
3964
+
3965
+ # TODO(zhwu): refactor this with backend_utils.parallel_cmd_with_rsync
3966
+ runners = handle.get_command_runners()
3967
+
3968
+ def _sync_git_workdir_node(
3969
+ runner: command_runner.CommandRunner) -> None:
3970
+ # Type assertion to help mypy understand the type
3971
+ assert hasattr(
3972
+ runner, 'git_clone'
3973
+ ), f'CommandRunner should have git_clone method, ' \
3974
+ f'got {type(runner)}'
3975
+ runner.git_clone(
3976
+ target_dir=SKY_REMOTE_WORKDIR,
3977
+ log_path=log_path,
3978
+ stream_logs=False,
3979
+ max_retry=3,
3980
+ envs_and_secrets=envs_and_secrets,
3981
+ )
3982
+
3983
+ num_nodes = handle.launched_nodes
3984
+ plural = 's' if num_nodes > 1 else ''
3985
+ logger.info(
3986
+ f' {style.DIM}Syncing workdir (to {num_nodes} node{plural}): '
3987
+ f'{SKY_REMOTE_WORKDIR}{style.RESET_ALL}')
3988
+ os.makedirs(os.path.expanduser(self.log_dir), exist_ok=True)
3989
+ os.system(f'touch {log_path}')
3990
+ num_threads = subprocess_utils.get_parallel_threads(
3991
+ str(handle.launched_resources.cloud))
3992
+ with rich_utils.safe_status(
3993
+ ux_utils.spinner_message('Syncing workdir', log_path)):
3994
+ subprocess_utils.run_in_parallel(_sync_git_workdir_node, runners,
3995
+ num_threads)
3996
+ logger.info(ux_utils.finishing_message('Synced workdir.', log_path))
3997
+
3998
+ def _sync_path_workdir(self, handle: CloudVmRayResourceHandle,
3999
+ workdir: Path) -> None:
3182
4000
  fore = colorama.Fore
3183
4001
  style = colorama.Style
3184
4002
  ip_list = handle.external_ips()
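_sync_workdir now dispatches on the workdir's type: a dict means a git-backed workdir that every node clones via runner.git_clone(...), while a plain path keeps the original rsync behavior; the per-node work is fanned out with subprocess_utils.run_in_parallel. The fan-out has the same shape as this standard-library sketch (the real helper may differ in how it reports errors):

from concurrent.futures import ThreadPoolExecutor

def sync_on_all_nodes(runners, sync_one, num_threads: int) -> None:
    # Run the per-node sync callback over a bounded thread pool; consuming
    # the map() result surfaces the first failure, mirroring
    # run_in_parallel(_sync_git_workdir_node, runners, num_threads) above.
    with ThreadPoolExecutor(max_workers=num_threads) as pool:
        list(pool.map(sync_one, runners))
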
@@ -3247,9 +4065,10 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3247
4065
  TODO: Delete COPY storage_mounts in task.sync_storage_mounts(), and
3248
4066
  assert here that all storage_mounts are MOUNT mode.
3249
4067
  """
4068
+ launched_resources = handle.launched_resources.assert_launchable()
3250
4069
  with rich_utils.safe_status(ux_utils.spinner_message('Syncing files')):
3251
4070
  controller_utils.replace_skypilot_config_path_in_file_mounts(
3252
- handle.launched_resources.cloud, all_file_mounts)
4071
+ launched_resources.cloud, all_file_mounts)
3253
4072
  self._execute_file_mounts(handle, all_file_mounts)
3254
4073
  self._execute_storage_mounts(handle, storage_mounts)
3255
4074
  self._set_storage_mounts_metadata(handle.cluster_name,
@@ -3267,10 +4086,13 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3267
4086
  remote_setup_file_name = f'/tmp/sky_setup_{self.run_timestamp}'
3268
4087
  # Need this `-i` option to make sure `source ~/.bashrc` work
3269
4088
  setup_cmd = f'/bin/bash -i {remote_setup_file_name} 2>&1'
4089
+ unset_ray_env_vars = ' && '.join(
4090
+ [f'unset {var}' for var in UNSET_RAY_ENV_VARS])
4091
+ setup_cmd = f'{unset_ray_env_vars}; {setup_cmd}'
3270
4092
  runners = handle.get_command_runners(avoid_ssh_control=True)
3271
4093
 
3272
4094
  def _setup_node(node_id: int) -> None:
3273
- setup_envs = task.envs.copy()
4095
+ setup_envs = task.envs_and_secrets
3274
4096
  setup_envs.update(self._skypilot_predefined_env_vars(handle))
3275
4097
  setup_envs['SKYPILOT_SETUP_NODE_IPS'] = '\n'.join(internal_ips)
3276
4098
  setup_envs['SKYPILOT_SETUP_NODE_RANK'] = str(node_id)
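The setup command above is now prefixed with an unset statement for every variable in UNSET_RAY_ENV_VARS, presumably so Ray-injected environment variables do not leak into the user's setup shell. What the composition produces, with placeholder variable names (the real list is defined elsewhere in this file):

UNSET_RAY_ENV_VARS = ['RAY_ADDRESS', 'RAY_RUNTIME_ENV_HOOK']  # placeholders
setup_cmd = '/bin/bash -i /tmp/sky_setup_XXXX 2>&1'           # placeholder path
unset_prefix = ' && '.join(f'unset {var}' for var in UNSET_RAY_ENV_VARS)
setup_cmd = f'{unset_prefix}; {setup_cmd}'
# -> "unset RAY_ADDRESS && unset RAY_RUNTIME_ENV_HOOK; /bin/bash -i /tmp/sky_setup_XXXX 2>&1"
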
@@ -3329,33 +4151,16 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3329
4151
  return returncode
3330
4152
 
3331
4153
  returncode = _run_setup(f'{create_script_code} && {setup_cmd}',)
3332
- if returncode == 255:
3333
- is_message_too_long = False
3334
- try:
3335
- with open(os.path.expanduser(setup_log_path),
3336
- 'r',
3337
- encoding='utf-8') as f:
3338
- if 'too long' in f.read():
3339
- is_message_too_long = True
3340
- except Exception as e: # pylint: disable=broad-except
3341
- # We don't crash the setup if we cannot read the log file.
3342
- # Instead, we should retry the setup with dumping the script
3343
- # to a file to be safe.
3344
- logger.debug('Failed to read setup log file '
3345
- f'{setup_log_path}: {e}')
3346
- is_message_too_long = True
3347
-
3348
- if is_message_too_long:
3349
- # If the setup script is too long, we retry it with dumping
3350
- # the script to a file and running it with SSH. We use a
3351
- # general length limit check before but it could be
3352
- # inaccurate on some systems.
3353
- logger.debug(
3354
- 'Failed to run setup command inline due to '
3355
- 'command length limit. Dumping setup script to '
3356
- 'file and running it with SSH.')
3357
- _dump_final_script(setup_script)
3358
- returncode = _run_setup(setup_cmd)
4154
+
4155
+ if _is_message_too_long(returncode, file_path=setup_log_path):
4156
+ # If the setup script is too long, we need to retry it
4157
+ # by dumping the script to a file and running that script
4158
+ # on the remote cluster instead.
4159
+ logger.debug('Failed to run setup command inline due to '
4160
+ 'command length limit. Dumping setup script to '
4161
+ 'file and running it with SSH.')
4162
+ _dump_final_script(setup_script)
4163
+ returncode = _run_setup(setup_cmd)
3359
4164
 
3360
4165
  def error_message() -> str:
3361
4166
  # Use the function to avoid tailing the file in success case
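A rough sketch of the fallback that the refactored setup path above now routes through `_is_message_too_long`: run the command inline first, and only when the shell rejects it as too long, ship the script to the cluster and run it from a file. The helpers `run_inline` and `upload_and_run` are illustrative placeholders, not SkyPilot APIs.

from typing import Callable, Tuple

_SSH_FAILURE_CODE = 255  # ssh exits with 255 when the remote command cannot run.

def run_with_length_fallback(
        script: str,
        run_inline: Callable[[str], Tuple[int, str]],
        upload_and_run: Callable[[str], Tuple[int, str]]) -> Tuple[int, str]:
    """Run a script inline; retry from a file if the command is too long."""
    returncode, output = run_inline(script)
    # In the spirit of _is_message_too_long: exit code 255 plus an
    # "Argument list too long"-style message suggests the inline command hit
    # the shell's length limit, so retry by executing the uploaded file.
    if returncode == _SSH_FAILURE_CODE and 'too long' in output.lower():
        returncode, output = upload_and_run(script)
    return returncode, output

Factoring this check into one helper lets the setup and job-submission paths share a single retry rule instead of each re-reading the log for the error string.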
@@ -3414,102 +4219,180 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3414
4219
  logger.info(
3415
4220
  ux_utils.finishing_message('Setup completed.', setup_log_path))
3416
4221
 
4222
+ def _download_file(self, handle: CloudVmRayResourceHandle,
4223
+ local_file_path: str, remote_file_path: str) -> None:
4224
+ """Syncs file from remote to local."""
4225
+ runners = handle.get_command_runners()
4226
+ head_runner = runners[0]
4227
+ head_runner.rsync(
4228
+ source=local_file_path,
4229
+ target=remote_file_path,
4230
+ up=False,
4231
+ stream_logs=False,
4232
+ )
4233
+
3417
4234
  def _exec_code_on_head(
3418
4235
  self,
3419
4236
  handle: CloudVmRayResourceHandle,
3420
4237
  codegen: str,
3421
4238
  job_id: int,
3422
- detach_run: bool = False,
3423
4239
  managed_job_dag: Optional['dag.Dag'] = None,
4240
+ managed_job_user_id: Optional[str] = None,
4241
+ remote_log_dir: Optional[str] = None,
3424
4242
  ) -> None:
3425
4243
  """Executes generated code on the head node."""
3426
- script_path = os.path.join(SKY_REMOTE_APP_DIR, f'sky_job_{job_id}')
3427
- remote_log_dir = self.log_dir
4244
+ use_legacy = not handle.is_grpc_enabled_with_flag
4245
+ file_name = f'sky_job_{job_id}'
4246
+ script_path = os.path.join(SKY_REMOTE_APP_DIR, file_name)
4247
+ if remote_log_dir is None:
4248
+ remote_log_dir = self.log_dir
3428
4249
  remote_log_path = os.path.join(remote_log_dir, 'run.log')
3429
4250
 
3430
- cd = f'cd {SKY_REMOTE_WORKDIR}'
4251
+ def _dump_code_to_file(codegen: str,
4252
+ target_dir: str = SKY_REMOTE_APP_DIR) -> None:
4253
+ runners = handle.get_command_runners()
4254
+ head_runner = runners[0]
4255
+ with tempfile.NamedTemporaryFile('w', prefix='sky_app_') as fp:
4256
+ fp.write(codegen)
4257
+ fp.flush()
4258
+ script_path = os.path.join(target_dir, file_name)
4259
+ # We choose to sync code + exec, because the alternative of
4260
+ # 'ray submit' may not work as it may use system python
4261
+ # (python2) to execute the script. Happens for AWS.
4262
+ head_runner.rsync(source=fp.name,
4263
+ target=script_path,
4264
+ up=True,
4265
+ stream_logs=False)
3431
4266
 
4267
+ cd = f'cd {SKY_REMOTE_WORKDIR}'
3432
4268
  mkdir_code = (f'{cd} && mkdir -p {remote_log_dir} && '
3433
4269
  f'touch {remote_log_path}')
3434
4270
  encoded_script = shlex.quote(codegen)
3435
4271
  create_script_code = f'{{ echo {encoded_script} > {script_path}; }}'
3436
4272
  job_submit_cmd = (
3437
- # JOB_CMD_IDENTIFIER is used for identifying the process retrieved
3438
- # with pid is the same driver process.
4273
+ # JOB_CMD_IDENTIFIER is used to verify that the process
4274
+ # retrieved by pid is indeed the same driver process.
3439
4275
  f'{job_lib.JOB_CMD_IDENTIFIER.format(job_id)} && '
3440
4276
  f'{cd} && {constants.SKY_PYTHON_CMD} -u {script_path}'
3441
4277
  # Do not use &>, which is not POSIX and may not work.
3442
4278
  # Note that the order of ">filename 2>&1" matters.
3443
4279
  f'> {remote_log_path} 2>&1')
3444
-
3445
4280
  code = job_lib.JobLibCodeGen.queue_job(job_id, job_submit_cmd)
3446
4281
  job_submit_cmd = ' && '.join([mkdir_code, create_script_code, code])
3447
4282
 
3448
- def _dump_code_to_file(codegen: str,
3449
- target_dir: str = SKY_REMOTE_APP_DIR) -> None:
3450
- runners = handle.get_command_runners()
3451
- head_runner = runners[0]
3452
- with tempfile.NamedTemporaryFile('w', prefix='sky_app_') as fp:
3453
- fp.write(codegen)
3454
- fp.flush()
3455
- script_path = os.path.join(target_dir, f'sky_job_{job_id}')
3456
- # We choose to sync code + exec, because the alternative of 'ray
3457
- # submit' may not work as it may use system python (python2) to
3458
- # execute the script. Happens for AWS.
3459
- head_runner.rsync(source=fp.name,
3460
- target=script_path,
3461
- up=True,
3462
- stream_logs=False)
3463
-
3464
4283
 # Should also be earlier than _is_command_length_over_limit
3465
4284
  # Same reason as in _setup
3466
4285
  if self._dump_final_script:
3467
4286
  _dump_code_to_file(job_submit_cmd,
3468
4287
  constants.PERSISTENT_RUN_SCRIPT_DIR)
3469
4288
 
3470
- if _is_command_length_over_limit(job_submit_cmd):
3471
- _dump_code_to_file(codegen)
3472
- job_submit_cmd = f'{mkdir_code} && {code}'
3473
-
3474
- if managed_job_dag is not None:
3475
- # Add the managed job to job queue database.
3476
- managed_job_codegen = managed_jobs.ManagedJobCodeGen()
3477
- managed_job_code = managed_job_codegen.set_pending(
3478
- job_id, managed_job_dag)
3479
- # Set the managed job to PENDING state to make sure that this
3480
- # managed job appears in the `sky jobs queue`, even if it needs to
3481
- # wait to be submitted.
3482
- # We cannot set the managed job to PENDING state in the job template
3483
- # (jobs-controller.yaml.j2), as it may need to wait for the run
3484
- # commands to be scheduled on the job controller in high-load cases.
3485
- job_submit_cmd += ' && ' + managed_job_code
4289
+ if not use_legacy:
4290
+ try:
4291
+ managed_job_info: Optional[jobsv1_pb2.ManagedJobInfo] = None
4292
+ if managed_job_dag is not None:
4293
+ workspace = skypilot_config.get_active_workspace(
4294
+ force_user_workspace=True)
4295
+ entrypoint = common_utils.get_current_command()
4296
+
4297
+ managed_job_tasks: List[jobsv1_pb2.ManagedJobTask] = []
4298
+ for task_id, task in enumerate(managed_job_dag.tasks):
4299
+ resources_str = backend_utils.get_task_resources_str(
4300
+ task, is_managed_job=True)
4301
+ managed_job_tasks.append(
4302
+ jobsv1_pb2.ManagedJobTask(
4303
+ task_id=task_id,
4304
+ name=task.name,
4305
+ resources_str=resources_str,
4306
+ metadata_json=task.metadata_json))
4307
+
4308
+ managed_job_info = jobsv1_pb2.ManagedJobInfo(
4309
+ name=managed_job_dag.name,
4310
+ pool=managed_job_dag.pool,
4311
+ workspace=workspace,
4312
+ entrypoint=entrypoint,
4313
+ tasks=managed_job_tasks,
4314
+ user_id=managed_job_user_id)
4315
+
4316
+ if _is_command_length_over_limit(codegen):
4317
+ _dump_code_to_file(codegen)
4318
+ queue_job_request = jobsv1_pb2.QueueJobRequest(
4319
+ job_id=job_id,
4320
+ # codegen not set - server assumes script uploaded
4321
+ remote_log_dir=remote_log_dir,
4322
+ managed_job=managed_job_info,
4323
+ script_path=script_path)
4324
+ else:
4325
+ queue_job_request = jobsv1_pb2.QueueJobRequest(
4326
+ job_id=job_id,
4327
+ codegen=codegen,
4328
+ remote_log_dir=remote_log_dir,
4329
+ managed_job=managed_job_info,
4330
+ script_path=script_path)
4331
+
4332
+ backend_utils.invoke_skylet_with_retries(lambda: SkyletClient(
4333
+ handle.get_grpc_channel()).queue_job(queue_job_request))
4334
+ except exceptions.SkyletMethodNotImplementedError:
4335
+ use_legacy = True
4336
+
4337
+ if use_legacy:
4338
+ if _is_command_length_over_limit(job_submit_cmd):
4339
+ _dump_code_to_file(codegen)
4340
+ job_submit_cmd = f'{mkdir_code} && {code}'
4341
+
4342
+ def _maybe_add_managed_job_code(job_submit_cmd: str) -> str:
4343
+ if managed_job_dag is not None:
4344
+ # Add the managed job to job queue database.
4345
+ managed_job_codegen = managed_jobs.ManagedJobCodeGen()
4346
+ managed_job_code = managed_job_codegen.set_pending(
4347
+ job_id,
4348
+ managed_job_dag,
4349
+ skypilot_config.get_active_workspace(
4350
+ force_user_workspace=True),
4351
+ entrypoint=common_utils.get_current_command(),
4352
+ user_hash=managed_job_user_id)
4353
+ # Set the managed job to PENDING state to make sure that
4354
+ # this managed job appears in the `sky jobs queue`, even
4355
+ # if it needs to wait to be submitted.
4356
+ # We cannot set the managed job to PENDING state in the
4357
+ # job template (jobs-controller.yaml.j2), as it may need
4358
+ # to wait for the run commands to be scheduled on the job
4359
+ # controller in high-load cases.
4360
+ job_submit_cmd += ' && ' + managed_job_code
4361
+ return job_submit_cmd
4362
+
4363
+ job_submit_cmd = _maybe_add_managed_job_code(job_submit_cmd)
3486
4364
 
3487
- returncode, stdout, stderr = self.run_on_head(handle,
3488
- job_submit_cmd,
3489
- stream_logs=False,
3490
- require_outputs=True)
3491
- # Happens when someone calls `sky exec` but remote is outdated for
3492
- # running a job. Necessitating calling `sky launch`.
3493
- backend_utils.check_stale_runtime_on_remote(returncode, stderr,
3494
- handle.cluster_name)
3495
- if returncode == 255 and 'too long' in stdout + stderr:
3496
- # If the generated script is too long, we retry it with dumping
3497
- # the script to a file and running it with SSH. We use a general
3498
- # length limit check before but it could be inaccurate on some
3499
- # systems.
3500
- logger.debug('Failed to submit job due to command length limit. '
3501
- 'Dumping job to file and running it with SSH.')
3502
- _dump_code_to_file(codegen)
3503
- job_submit_cmd = f'{mkdir_code} && {code}'
3504
4365
  returncode, stdout, stderr = self.run_on_head(handle,
3505
4366
  job_submit_cmd,
3506
4367
  stream_logs=False,
3507
4368
  require_outputs=True)
4369
+ # Happens when someone calls `sky exec` but the remote is outdated for
4370
+ # running a job, necessitating calling `sky launch`.
4371
+ backend_utils.check_stale_runtime_on_remote(returncode, stderr,
4372
+ handle.cluster_name)
4373
+ output = stdout + stderr
4374
+ if _is_message_too_long(returncode, output=output):
4375
+ # If the job submit script is too long, we need to retry it
4376
+ # by dumping the script to a file and running that script
4377
+ # on the remote cluster instead.
4378
+ logger.debug(
4379
+ 'Failed to submit job due to command length limit. '
4380
+ 'Dumping job to file and running it with SSH. '
4381
+ f'Output: {output}')
4382
+ _dump_code_to_file(codegen)
4383
+ job_submit_cmd = f'{mkdir_code} && {code}'
4384
+ job_submit_cmd = _maybe_add_managed_job_code(job_submit_cmd)
4385
+ returncode, stdout, stderr = self.run_on_head(
4386
+ handle,
4387
+ job_submit_cmd,
4388
+ stream_logs=False,
4389
+ require_outputs=True)
3508
4390
 
3509
- subprocess_utils.handle_returncode(returncode,
3510
- job_submit_cmd,
3511
- f'Failed to submit job {job_id}.',
3512
- stderr=stdout + stderr)
4391
+ subprocess_utils.handle_returncode(
4392
+ returncode,
4393
+ job_submit_cmd,
4394
+ f'Failed to submit job {job_id}.',
4395
+ stderr=stdout + stderr)
3513
4396
 
3514
4397
  controller = controller_utils.Controllers.from_name(handle.cluster_name)
3515
4398
  if controller == controller_utils.Controllers.SKY_SERVE_CONTROLLER:
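Many of the rewritten methods in this file follow the same shape as the hunk above: attempt the Skylet gRPC endpoint first, and fall back to the legacy generate-code-and-run-over-SSH path when the remote runtime does not implement the method. A schematic of that control flow, with placeholder callables rather than SkyPilot APIs:

from typing import Callable, TypeVar

T = TypeVar('T')

class MethodNotImplementedError(Exception):
    """Raised when the remote runtime predates the requested RPC."""

def call_with_legacy_fallback(grpc_enabled: bool,
                              grpc_call: Callable[[], T],
                              legacy_call: Callable[[], T]) -> T:
    """Prefer the gRPC path; use the legacy SSH path when unavailable."""
    if grpc_enabled:
        try:
            return grpc_call()
        except MethodNotImplementedError:
            # Older remote runtime: fall through to the legacy code path.
            pass
    return legacy_call()

Keeping the legacy branch intact appears to be what lets a newer client keep driving clusters provisioned with an older Skylet.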
@@ -3518,53 +4401,74 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3518
4401
  logger.info(
3519
4402
  ux_utils.starting_message(f'Job submitted, ID: {job_id}'))
3520
4403
  rich_utils.stop_safe_status()
3521
- if not detach_run:
3522
- if (handle.cluster_name == controller_utils.Controllers.
3523
- JOBS_CONTROLLER.value.cluster_name):
3524
- self.tail_managed_job_logs(handle, job_id)
3525
- else:
3526
- # Sky logs. Not using subprocess.run since it will make the
3527
- # ssh keep connected after ctrl-c.
3528
- self.tail_logs(handle, job_id)
3529
4404
 
3530
4405
  def _add_job(self, handle: CloudVmRayResourceHandle,
3531
- job_name: Optional[str], resources_str: str) -> int:
3532
- code = job_lib.JobLibCodeGen.add_job(
3533
- job_name=job_name,
3534
- username=common_utils.get_user_hash(),
3535
- run_timestamp=self.run_timestamp,
3536
- resources_str=resources_str)
3537
- returncode, job_id_str, stderr = self.run_on_head(handle,
3538
- code,
3539
- stream_logs=False,
3540
- require_outputs=True,
3541
- separate_stderr=True)
3542
- # Happens when someone calls `sky exec` but remote is outdated for
3543
- # adding a job. Necessitating calling `sky launch`.
3544
- backend_utils.check_stale_runtime_on_remote(returncode, stderr,
3545
- handle.cluster_name)
3546
- # TODO(zhwu): this sometimes will unexpectedly fail, we can add
3547
- # retry for this, after we figure out the reason.
3548
- subprocess_utils.handle_returncode(returncode, code,
3549
- 'Failed to fetch job id.', stderr)
3550
- try:
3551
- job_id_match = _JOB_ID_PATTERN.search(job_id_str)
3552
- if job_id_match is not None:
3553
- job_id = int(job_id_match.group(1))
3554
- else:
3555
- # For backward compatibility.
3556
- job_id = int(job_id_str)
3557
- except ValueError as e:
3558
- logger.error(stderr)
3559
- raise ValueError(f'Failed to parse job id: {job_id_str}; '
3560
- f'Returncode: {returncode}') from e
3561
- return job_id
4406
+ job_name: Optional[str], resources_str: str,
4407
+ metadata: str) -> Tuple[int, str]:
4408
+ use_legacy = not handle.is_grpc_enabled_with_flag
4409
+
4410
+ if not use_legacy:
4411
+ try:
4412
+ request = jobsv1_pb2.AddJobRequest(
4413
+ job_name=job_name,
4414
+ username=common_utils.get_user_hash(),
4415
+ run_timestamp=self.run_timestamp,
4416
+ resources_str=resources_str,
4417
+ metadata=metadata)
4418
+ response = backend_utils.invoke_skylet_with_retries(
4419
+ lambda: SkyletClient(handle.get_grpc_channel()).add_job(
4420
+ request))
4421
+ job_id = response.job_id
4422
+ log_dir = response.log_dir
4423
+ return job_id, log_dir
4424
+ except exceptions.SkyletMethodNotImplementedError:
4425
+ use_legacy = True
4426
+
4427
+ if use_legacy:
4428
+ code = job_lib.JobLibCodeGen.add_job(
4429
+ job_name=job_name,
4430
+ username=common_utils.get_user_hash(),
4431
+ run_timestamp=self.run_timestamp,
4432
+ resources_str=resources_str,
4433
+ metadata=metadata)
4434
+ returncode, result_str, stderr = self.run_on_head(
4435
+ handle,
4436
+ code,
4437
+ stream_logs=False,
4438
+ require_outputs=True,
4439
+ separate_stderr=True)
4440
+ # Happens when someone calls `sky exec` but the remote is outdated for
4441
+ # adding a job, necessitating calling `sky launch`.
4442
+ backend_utils.check_stale_runtime_on_remote(returncode, stderr,
4443
+ handle.cluster_name)
4444
+ # TODO(zhwu): this sometimes will unexpectedly fail, we can add
4445
+ # retry for this, after we figure out the reason.
4446
+ subprocess_utils.handle_returncode(returncode, code,
4447
+ 'Failed to fetch job id.',
4448
+ stderr)
4449
+ try:
4450
+ job_id_match = _JOB_ID_PATTERN.search(result_str)
4451
+ if job_id_match is not None:
4452
+ job_id = int(job_id_match.group(1))
4453
+ else:
4454
+ # For backward compatibility.
4455
+ job_id = int(result_str)
4456
+ log_dir_match = _LOG_DIR_PATTERN.search(result_str)
4457
+ if log_dir_match is not None:
4458
+ log_dir = log_dir_match.group(1).strip()
4459
+ else:
4460
+ # For backward compatibility, use the same log dir as local.
4461
+ log_dir = self.log_dir
4462
+ except ValueError as e:
4463
+ logger.error(stderr)
4464
+ raise ValueError(f'Failed to parse job id: {result_str}; '
4465
+ f'Returncode: {returncode}') from e
4466
+ return job_id, log_dir
3562
4467
 
3563
4468
  def _execute(
3564
4469
  self,
3565
4470
  handle: CloudVmRayResourceHandle,
3566
4471
  task: task_lib.Task,
3567
- detach_run: bool,
3568
4472
  dryrun: bool = False,
3569
4473
  ) -> Optional[int]:
3570
4474
  """Executes the task on the cluster.
@@ -3588,7 +4492,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3588
4492
  # In this case, we reset the resources for the task, so that the
3589
4493
  # detached setup does not need to wait for the task resources to be
3590
4494
  # ready (which is not used for setup anyway).
3591
- valid_resource = sky.Resources()
4495
+ valid_resource = resources_lib.Resources()
3592
4496
  else:
3593
4497
  # Check the task resources vs the cluster resources. Since
3594
4498
  # `sky exec` will not run the provision and _check_existing_cluster
@@ -3610,15 +4514,16 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3610
4514
  logger.info(f'Dryrun complete. Would have run:\n{task}')
3611
4515
  return None
3612
4516
 
3613
- job_id = self._add_job(handle, task_copy.name, resources_str)
4517
+ job_id, log_dir = self._add_job(handle, task_copy.name, resources_str,
4518
+ task.metadata_json)
3614
4519
 
3615
4520
  num_actual_nodes = task.num_nodes * handle.num_ips_per_node
3616
4521
  # Case: task_lib.Task(run, num_nodes=N) or TPU VM Pods
3617
4522
  if num_actual_nodes > 1:
3618
- self._execute_task_n_nodes(handle, task_copy, job_id, detach_run)
4523
+ self._execute_task_n_nodes(handle, task_copy, job_id, log_dir)
3619
4524
  else:
3620
4525
  # Case: task_lib.Task(run, num_nodes=1)
3621
- self._execute_task_one_node(handle, task_copy, job_id, detach_run)
4526
+ self._execute_task_one_node(handle, task_copy, job_id, log_dir)
3622
4527
 
3623
4528
  return job_id
3624
4529
 
@@ -3674,16 +4579,13 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3674
4579
  is_identity_mismatch_and_purge = True
3675
4580
  else:
3676
4581
  raise
3677
- lock_path = os.path.expanduser(
3678
- backend_utils.CLUSTER_STATUS_LOCK_PATH.format(cluster_name))
4582
+ lock_id = backend_utils.cluster_status_lock_id(cluster_name)
4583
+ lock = locks.get_lock(lock_id, timeout=1)
3679
4584
  # Retry in case new cluster operation comes in and holds the lock
3680
4585
  # right after the lock is removed.
3681
4586
  n_attempts = 2
3682
4587
  while True:
3683
4588
  n_attempts -= 1
3684
- # In case other running cluster operations are still holding the
3685
- # lock.
3686
- common_utils.remove_file_if_exists(lock_path)
3687
4589
  # We have to kill the cluster requests, because `down` and `stop`
3688
4590
  # should be higher priority than the cluster requests, and we should
3689
4591
  # release the lock from other requests.
@@ -3701,10 +4603,11 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3701
4603
  'Failed to kill other launch requests for the '
3702
4604
  f'cluster {handle.cluster_name}: '
3703
4605
  f'{common_utils.format_exception(e, use_bracket=True)}')
4606
+ # In case other running cluster operations are still holding the
4607
+ # lock.
4608
+ lock.force_unlock()
3704
4609
  try:
3705
- with filelock.FileLock(
3706
- lock_path,
3707
- backend_utils.CLUSTER_STATUS_LOCK_TIMEOUT_SECONDS):
4610
+ with lock:
3708
4611
  self.teardown_no_lock(
3709
4612
  handle,
3710
4613
  terminate,
@@ -3717,14 +4620,14 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3717
4620
  refresh_cluster_status=(
3718
4621
  not is_identity_mismatch_and_purge))
3719
4622
  if terminate:
3720
- common_utils.remove_file_if_exists(lock_path)
4623
+ lock.force_unlock()
3721
4624
  break
3722
- except filelock.Timeout as e:
4625
+ except locks.LockTimeout as e:
3723
4626
  logger.debug(f'Failed to acquire lock for {cluster_name}, '
3724
4627
  f'retrying...')
3725
4628
  if n_attempts <= 0:
3726
4629
  raise RuntimeError(
3727
- f'Cluster {cluster_name!r} is locked by {lock_path}. '
4630
+ f'Cluster {cluster_name!r} is locked by {lock_id}. '
3728
4631
  'Check to see if it is still being launched') from e
3729
4632
 
3730
4633
  # --- CloudVMRayBackend Specific APIs ---
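The teardown hunk above swaps a local file lock for a lock obtained from a `locks` module, with a bounded retry and a force-unlock to clear stale holders. A simplified sketch of that acquisition loop; the `lock` object here is assumed to expose force_unlock() and to raise LockTimeout as a context manager, mirroring the interface used above rather than any specific library.

class LockTimeout(Exception):
    """Raised when the lock cannot be acquired within its timeout."""

def run_with_cluster_lock(lock, do_teardown, attempts: int = 2) -> None:
    """Acquire the per-cluster lock, clearing stale holders, then tear down."""
    while True:
        attempts -= 1
        # Teardown outranks pending cluster operations, so forcibly release
        # any lock left behind by a request that has just been cancelled.
        lock.force_unlock()
        try:
            with lock:
                do_teardown()
            return
        except LockTimeout:
            # Another operation grabbed the lock right after the force
            # unlock; retry a bounded number of times before giving up.
            if attempts <= 0:
                raise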
@@ -3735,6 +4638,20 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3735
4638
  job_ids: Optional[List[int]] = None,
3736
4639
  stream_logs: bool = True
3737
4640
  ) -> Dict[Optional[int], Optional[job_lib.JobStatus]]:
4641
+ if handle.is_grpc_enabled_with_flag:
4642
+ try:
4643
+ request = jobsv1_pb2.GetJobStatusRequest(job_ids=job_ids)
4644
+ response = backend_utils.invoke_skylet_with_retries(
4645
+ lambda: SkyletClient(handle.get_grpc_channel()
4646
+ ).get_job_status(request))
4647
+ statuses: Dict[Optional[int], Optional[job_lib.JobStatus]] = {
4648
+ job_id: job_lib.JobStatus.from_protobuf(proto_status)
4649
+ for job_id, proto_status in response.job_statuses.items()
4650
+ }
4651
+ return statuses
4652
+ except exceptions.SkyletMethodNotImplementedError:
4653
+ pass
4654
+
3738
4655
  code = job_lib.JobLibCodeGen.get_job_status(job_ids)
3739
4656
  returncode, stdout, stderr = self.run_on_head(handle,
3740
4657
  code,
@@ -3755,16 +4672,32 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3755
4672
 
3756
4673
  See `skylet.job_lib.cancel_jobs_encoded_results` for more details.
3757
4674
  """
3758
- code = job_lib.JobLibCodeGen.cancel_jobs(jobs, cancel_all, user_hash)
3759
- returncode, stdout, _ = self.run_on_head(handle,
3760
- code,
3761
- stream_logs=False,
3762
- require_outputs=True)
3763
- subprocess_utils.handle_returncode(
3764
- returncode, code,
3765
- f'Failed to cancel jobs on cluster {handle.cluster_name}.', stdout)
3766
-
3767
- cancelled_ids = message_utils.decode_payload(stdout)
4675
+ use_legacy = not handle.is_grpc_enabled_with_flag
4676
+
4677
+ if not use_legacy:
4678
+ try:
4679
+ request = jobsv1_pb2.CancelJobsRequest(job_ids=jobs,
4680
+ cancel_all=cancel_all,
4681
+ user_hash=user_hash)
4682
+ response = backend_utils.invoke_skylet_with_retries(
4683
+ lambda: SkyletClient(handle.get_grpc_channel()).cancel_jobs(
4684
+ request))
4685
+ cancelled_ids = response.cancelled_job_ids
4686
+ except exceptions.SkyletMethodNotImplementedError:
4687
+ use_legacy = True
4688
+
4689
+ if use_legacy:
4690
+ code = job_lib.JobLibCodeGen.cancel_jobs(jobs, cancel_all,
4691
+ user_hash)
4692
+ returncode, stdout, _ = self.run_on_head(handle,
4693
+ code,
4694
+ stream_logs=False,
4695
+ require_outputs=True)
4696
+ subprocess_utils.handle_returncode(
4697
+ returncode, code,
4698
+ f'Failed to cancel jobs on cluster {handle.cluster_name}.',
4699
+ stdout)
4700
+ cancelled_ids = message_utils.decode_payload(stdout)
3768
4701
  if cancelled_ids:
3769
4702
  logger.info(
3770
4703
  f'Cancelled job ID(s): {", ".join(map(str, cancelled_ids))}')
@@ -3781,32 +4714,60 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3781
4714
  Returns:
3782
4715
  A dictionary mapping job_id to log path.
3783
4716
  """
3784
- code = job_lib.JobLibCodeGen.get_run_timestamp_with_globbing(job_ids)
3785
- returncode, run_timestamps, stderr = self.run_on_head(
3786
- handle,
3787
- code,
3788
- stream_logs=False,
3789
- require_outputs=True,
3790
- separate_stderr=True)
3791
- subprocess_utils.handle_returncode(returncode, code,
3792
- 'Failed to sync logs.', stderr)
3793
- run_timestamps = message_utils.decode_payload(run_timestamps)
3794
- if not run_timestamps:
3795
- logger.info(f'{colorama.Fore.YELLOW}'
3796
- 'No matching log directories found'
3797
- f'{colorama.Style.RESET_ALL}')
3798
- return {}
4717
+ job_to_dir: Dict[str, str] = {}
4718
+ use_legacy = not handle.is_grpc_enabled_with_flag
3799
4719
 
3800
- job_ids = list(run_timestamps.keys())
3801
- run_timestamps = list(run_timestamps.values())
4720
+ if not use_legacy:
4721
+ try:
4722
+ int_job_ids = []
4723
+ if job_ids:
4724
+ for str_job_id in job_ids:
4725
+ if str_job_id.isdigit():
4726
+ int_job_ids.append(int(str_job_id))
4727
+ request = jobsv1_pb2.GetLogDirsForJobsRequest(
4728
+ job_ids=int_job_ids)
4729
+ response = backend_utils.invoke_skylet_with_retries(
4730
+ lambda: SkyletClient(handle.get_grpc_channel()
4731
+ ).get_log_dirs_for_jobs(request))
4732
+ job_log_dirs = response.job_log_dirs
4733
+ if not job_log_dirs:
4734
+ logger.info(f'{colorama.Fore.YELLOW}'
4735
+ 'No matching log directories found'
4736
+ f'{colorama.Style.RESET_ALL}')
4737
+ return {}
4738
+ for job_id, log_dir in job_log_dirs.items():
4739
+ # Convert to string for backwards compatibility
4740
+ job_to_dir[str(job_id)] = log_dir
4741
+ except exceptions.SkyletMethodNotImplementedError:
4742
+ use_legacy = True
4743
+
4744
+ if use_legacy:
4745
+ code = job_lib.JobLibCodeGen.get_log_dirs_for_jobs(job_ids)
4746
+ returncode, stdout, stderr = self.run_on_head(handle,
4747
+ code,
4748
+ stream_logs=False,
4749
+ require_outputs=True,
4750
+ separate_stderr=True)
4751
+ subprocess_utils.handle_returncode(returncode, code,
4752
+ 'Failed to sync logs.', stderr)
4753
+ job_to_dir = message_utils.decode_payload(stdout)
4754
+ if not job_to_dir:
4755
+ logger.info(f'{colorama.Fore.YELLOW}'
4756
+ 'No matching log directories found'
4757
+ f'{colorama.Style.RESET_ALL}')
4758
+ return {}
4759
+
4760
+ job_ids = list(job_to_dir.keys())
4761
+ dirs = list(job_to_dir.values())
3802
4762
  remote_log_dirs = [
3803
- os.path.join(constants.SKY_LOGS_DIRECTORY, run_timestamp)
3804
- for run_timestamp in run_timestamps
3805
- ]
3806
- local_log_dirs = [
3807
- os.path.join(local_dir, run_timestamp)
3808
- for run_timestamp in run_timestamps
4763
+ # TODO(aylei): backward compatibility for legacy runtime that
4764
+ # returns run_timestamp only, remove after 0.12.0
4765
+ (dir if constants.SKY_LOGS_DIRECTORY in dir else os.path.join(
4766
+ constants.SKY_LOGS_DIRECTORY, dir)) for dir in dirs
3809
4767
  ]
4768
+ local_log_dirs = [(dir.replace(constants.SKY_LOGS_DIRECTORY, local_dir)
4769
+ if constants.SKY_LOGS_DIRECTORY in dir else
4770
+ os.path.join(local_dir, dir)) for dir in dirs]
3810
4771
 
3811
4772
  runners = handle.get_command_runners()
3812
4773
 
@@ -3842,12 +4803,17 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3842
4803
  subprocess_utils.run_in_parallel(_rsync_down, parallel_args)
3843
4804
  return dict(zip(job_ids, local_log_dirs))
3844
4805
 
3845
- def tail_logs(self,
3846
- handle: CloudVmRayResourceHandle,
3847
- job_id: Optional[int],
3848
- managed_job_id: Optional[int] = None,
3849
- follow: bool = True,
3850
- tail: int = 0) -> int:
4806
+ @context_utils.cancellation_guard
4807
+ def tail_logs(
4808
+ self,
4809
+ handle: CloudVmRayResourceHandle,
4810
+ job_id: Optional[int],
4811
+ managed_job_id: Optional[int] = None,
4812
+ follow: bool = True,
4813
+ tail: int = 0,
4814
+ require_outputs: bool = False,
4815
+ stream_logs: bool = True,
4816
+ process_stream: bool = False) -> Union[int, Tuple[int, str, str]]:
3851
4817
  """Tail the logs of a job.
3852
4818
 
3853
4819
  Args:
@@ -3857,11 +4823,36 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3857
4823
  follow: Whether to follow the logs.
3858
4824
  tail: The number of lines to display from the end of the
3859
4825
  log file. If 0, print all lines.
4826
+ require_outputs: Whether to return the stdout/stderr of the command.
4827
+ stream_logs: Whether to stream the logs to stdout/stderr.
4828
+ process_stream: Whether to process the stream.
3860
4829
 
3861
4830
  Returns:
3862
4831
  The exit code of the tail command. Returns code 100 if the job has
3863
4832
  failed. See exceptions.JobExitCode for possible return codes.
3864
4833
  """
4834
+ if handle.is_grpc_enabled_with_flag:
4835
+ last_exit_code = 0
4836
+ try:
4837
+ request = jobsv1_pb2.TailLogsRequest(
4838
+ job_id=job_id,
4839
+ managed_job_id=managed_job_id,
4840
+ follow=follow,
4841
+ tail=tail)
4842
+ for resp in backend_utils.invoke_skylet_streaming_with_retries(
4843
+ lambda: SkyletClient(handle.get_grpc_channel()
4844
+ ).tail_logs(request, timeout=None)):
4845
+ if resp.log_line:
4846
+ print(resp.log_line, end='', flush=True)
4847
+ last_exit_code = resp.exit_code
4848
+ return last_exit_code
4849
+ except exceptions.SkyletMethodNotImplementedError:
4850
+ pass
4851
+ except grpc.RpcError as e:
4852
+ if e.code() == grpc.StatusCode.CANCELLED:
4853
+ return last_exit_code
4854
+ raise e
4855
+
3865
4856
  code = job_lib.JobLibCodeGen.tail_logs(job_id,
3866
4857
  managed_job_id=managed_job_id,
3867
4858
  follow=follow,
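The gRPC tail path added above consumes a server-streaming response: each message may carry a log line to print immediately and the latest exit code to report when the stream ends, and a client-side CANCELLED error (for example from Ctrl-C) is treated as a normal stop rather than a failure. A minimal stand-in for that consumption loop; LogChunk is a placeholder, not the real jobsv1 message type.

from typing import Iterable

class LogChunk:
    """Placeholder for a streamed tail-logs response message."""

    def __init__(self, log_line: str = '', exit_code: int = 0):
        self.log_line = log_line
        self.exit_code = exit_code

def consume_log_stream(stream: Iterable[LogChunk]) -> int:
    """Print streamed log lines and return the last reported exit code."""
    last_exit_code = 0
    for chunk in stream:
        if chunk.log_line:
            # Flush per line so the tail feels live instead of buffered.
            print(chunk.log_line, end='', flush=True)
        last_exit_code = chunk.exit_code
    return last_exit_code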
@@ -3876,29 +4867,32 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3876
4867
  signal.signal(signal.SIGINT, backend_utils.interrupt_handler)
3877
4868
  signal.signal(signal.SIGTSTP, backend_utils.stop_handler)
3878
4869
  try:
3879
- returncode = self.run_on_head(
4870
+ final = self.run_on_head(
3880
4871
  handle,
3881
4872
  code,
3882
- stream_logs=True,
3883
- process_stream=False,
4873
+ stream_logs=stream_logs,
4874
+ process_stream=process_stream,
4875
+ require_outputs=require_outputs,
3884
4876
  # Allocate a pseudo-terminal to disable output buffering.
3885
4877
  # Otherwise, there may be 5 minutes delay in logging.
3886
4878
  ssh_mode=command_runner.SshMode.INTERACTIVE,
3887
4879
  )
3888
4880
  except SystemExit as e:
3889
- returncode = e.code
3890
- return returncode
4881
+ final = e.code
4882
+ return final
3891
4883
 
3892
4884
  def tail_managed_job_logs(self,
3893
4885
  handle: CloudVmRayResourceHandle,
3894
4886
  job_id: Optional[int] = None,
3895
4887
  job_name: Optional[str] = None,
3896
4888
  controller: bool = False,
3897
- follow: bool = True) -> int:
4889
+ follow: bool = True,
4890
+ tail: Optional[int] = None) -> int:
3898
4891
  # if job_name is not None, job_id should be None
3899
4892
  assert job_name is None or job_id is None, (job_name, job_id)
4893
+ # TODO(kevin): Migrate stream_logs to gRPC
3900
4894
  code = managed_jobs.ManagedJobCodeGen.stream_logs(
3901
- job_name, job_id, follow, controller)
4895
+ job_name, job_id, follow, controller, tail)
3902
4896
 
3903
4897
  # With the stdin=subprocess.DEVNULL, the ctrl-c will not directly
3904
4898
  # kill the process, so we need to handle it manually here.
@@ -3942,20 +4936,37 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3942
4936
  assert job_name is None or job_id is None, (job_name, job_id)
3943
4937
 
3944
4938
  if job_id is None:
3945
- # generate code to get the job_id
4939
+ # get the job_id
3946
4940
  # if job_name is None, get all job_ids
3947
4941
  # TODO: Only get the latest job_id, since that's the only one we use
3948
- code = managed_jobs.ManagedJobCodeGen.get_all_job_ids_by_name(
3949
- job_name=job_name)
3950
- returncode, job_ids, stderr = self.run_on_head(handle,
3951
- code,
3952
- stream_logs=False,
3953
- require_outputs=True,
3954
- separate_stderr=True)
3955
- subprocess_utils.handle_returncode(returncode, code,
3956
- 'Failed to sync down logs.',
3957
- stderr)
3958
- job_ids = message_utils.decode_payload(job_ids)
4942
+
4943
+ use_legacy = not handle.is_grpc_enabled_with_flag
4944
+ logger.info(f'handle.is_grpc_enabled_with_flag: '
4945
+ f'{handle.is_grpc_enabled_with_flag}')
4946
+ if not use_legacy:
4947
+ try:
4948
+ request = managed_jobsv1_pb2.GetAllJobIdsByNameRequest(
4949
+ job_name=job_name)
4950
+ response = backend_utils.invoke_skylet_with_retries(
4951
+ lambda: SkyletClient(handle.get_grpc_channel(
4952
+ )).get_all_managed_job_ids_by_name(request))
4953
+ job_ids = list(response.job_ids)
4954
+ except exceptions.SkyletMethodNotImplementedError:
4955
+ use_legacy = True
4956
+
4957
+ if use_legacy:
4958
+ code = managed_jobs.ManagedJobCodeGen.get_all_job_ids_by_name(
4959
+ job_name=job_name)
4960
+ returncode, job_ids_payload, stderr = self.run_on_head(
4961
+ handle,
4962
+ code,
4963
+ stream_logs=False,
4964
+ require_outputs=True,
4965
+ separate_stderr=True)
4966
+ subprocess_utils.handle_returncode(returncode, code,
4967
+ 'Failed to sync down logs.',
4968
+ stderr)
4969
+ job_ids = message_utils.decode_payload(job_ids_payload)
3959
4970
  if not job_ids:
3960
4971
  logger.info(f'{colorama.Fore.YELLOW}'
3961
4972
  'No matching job found'
@@ -3974,20 +4985,48 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3974
4985
 # list should already be in descending order
3975
4986
  job_id = job_ids[0]
3976
4987
 
3977
- # get the run_timestamp
3978
- # the function takes in [job_id]
3979
- code = job_lib.JobLibCodeGen.get_run_timestamp_with_globbing(
3980
- [str(job_id)])
3981
- returncode, run_timestamps, stderr = self.run_on_head(
3982
- handle,
3983
- code,
3984
- stream_logs=False,
3985
- require_outputs=True,
3986
- separate_stderr=True)
3987
- subprocess_utils.handle_returncode(returncode, code,
3988
- 'Failed to sync logs.', stderr)
3989
- # returns with a dict of {job_id: run_timestamp}
3990
- run_timestamps = message_utils.decode_payload(run_timestamps)
4988
+ if isinstance(handle, LocalResourcesHandle):
4989
+ # In consolidation mode, we don't submit a ray job, therefore no
4990
+ # run_timestamp is available. We use a dummy run_timestamp here.
4991
+ run_timestamps = {
4992
+ job_id: f'managed-jobs-consolidation-mode-{job_id}'
4993
+ }
4994
+ else:
4995
+ # get the run_timestamp
4996
+ # the function takes in [job_id]
4997
+ use_legacy = not handle.is_grpc_enabled_with_flag
4998
+ if not use_legacy:
4999
+ try:
5000
+ log_dirs_request = jobsv1_pb2.GetLogDirsForJobsRequest(
5001
+ job_ids=[job_id])
5002
+ log_dirs_response = (
5003
+ backend_utils.invoke_skylet_with_retries(
5004
+ lambda: SkyletClient(handle.get_grpc_channel(
5005
+ )).get_log_dirs_for_jobs(log_dirs_request)))
5006
+ job_log_dirs = log_dirs_response.job_log_dirs
5007
+ # Convert back to the expected format
5008
+ # {job_id: run_timestamp}
5009
+ run_timestamps = {}
5010
+ for jid, log_dir in job_log_dirs.items():
5011
+ run_timestamps[int(jid)] = log_dir
5012
+ except exceptions.SkyletMethodNotImplementedError:
5013
+ use_legacy = True
5014
+
5015
+ if use_legacy:
5016
+ code = job_lib.JobLibCodeGen.get_log_dirs_for_jobs(
5017
+ [str(job_id)])
5018
+ returncode, run_timestamps_payload, stderr = self.run_on_head(
5019
+ handle,
5020
+ code,
5021
+ stream_logs=False,
5022
+ require_outputs=True,
5023
+ separate_stderr=True)
5024
+ subprocess_utils.handle_returncode(returncode, code,
5025
+ 'Failed to sync logs.',
5026
+ stderr)
5027
+ # returns with a dict of {job_id: run_timestamp}
5028
+ run_timestamps = message_utils.decode_payload(
5029
+ run_timestamps_payload)
3991
5030
  if not run_timestamps:
3992
5031
  logger.info(f'{colorama.Fore.YELLOW}'
3993
5032
  'No matching log directories found'
@@ -3996,11 +5035,19 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3996
5035
 
3997
5036
  run_timestamp = list(run_timestamps.values())[0]
3998
5037
  job_id = list(run_timestamps.keys())[0]
5038
+
5039
+ # If run_timestamp contains the full path with SKY_LOGS_DIRECTORY,
5040
+ # strip the prefix to get just the relative part to avoid duplication
5041
+ # when constructing local paths.
5042
+ if run_timestamp.startswith(constants.SKY_LOGS_DIRECTORY):
5043
+ run_timestamp = run_timestamp[len(constants.SKY_LOGS_DIRECTORY
5044
+ ):].lstrip('/')
3999
5045
  local_log_dir = ''
4000
5046
  if controller: # download controller logs
4001
5047
  remote_log = os.path.join(managed_jobs.JOBS_CONTROLLER_LOGS_DIR,
4002
5048
  f'{job_id}.log')
4003
- local_log_dir = os.path.join(local_dir, run_timestamp)
5049
+ local_log_dir = os.path.join(local_dir, 'managed_jobs',
5050
+ run_timestamp)
4004
5051
  os.makedirs(os.path.dirname(os.path.expanduser(local_log_dir)),
4005
5052
  exist_ok=True)
4006
5053
 
@@ -4046,11 +5093,12 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
4046
5093
  exist_ok=True)
4047
5094
  log_file = os.path.join(local_log_dir, 'run.log')
4048
5095
 
4049
- code = managed_jobs.ManagedJobCodeGen.stream_logs(job_name=None,
4050
- job_id=job_id,
4051
- follow=False,
4052
- controller=False)
4053
-
5096
+ # TODO(kevin): Migrate stream_logs to gRPC
5097
+ code = managed_jobs.ManagedJobCodeGen.stream_logs(
5098
+ job_name=None,
5099
+ job_id=int(job_id),
5100
+ follow=False,
5101
+ controller=False)
4054
5102
  # With the stdin=subprocess.DEVNULL, the ctrl-c will not
4055
5103
  # kill the process, so we need to handle it manually here.
4056
5104
  if threading.current_thread() is threading.main_thread():
@@ -4091,6 +5139,15 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
4091
5139
  Raises:
4092
5140
  RuntimeError: If the cluster fails to be terminated/stopped.
4093
5141
  """
5142
+ try:
5143
+ handle.close_skylet_ssh_tunnel()
5144
+ except Exception as e: # pylint: disable=broad-except
5145
+ # Not critical to the cluster teardown, just log a warning.
5146
+ logger.warning(
5147
+ 'Failed to close Skylet SSH tunnel for cluster '
5148
+ f'{handle.cluster_name}: '
5149
+ f'{common_utils.format_exception(e, use_bracket=True)}')
5150
+
4094
5151
  exclude_request_to_kill = 'sky.down' if terminate else 'sky.stop'
4095
5152
  # We have to kill the cluster requests again within the lock, because
4096
5153
  # any pending requests on the same cluster should be cancelled after
@@ -4116,7 +5173,19 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
4116
5173
  prev_cluster_status, _ = (
4117
5174
  backend_utils.refresh_cluster_status_handle(
4118
5175
  handle.cluster_name,
4119
- acquire_per_cluster_status_lock=False))
5176
+ # There is a case where
5177
+ # 1. The cluster was interrupted during provisioning.
5178
+ # 2. The API request to create the cluster instances was
5179
+ # sent to the cloud, but hasn't been processed yet.
5180
+ # In this case, the cluster will be INIT. We should do a
5181
+ # hard status refresh to see if the instances are
5182
+ # actually there or not. Otherwise, teardown may not
5183
+ # find the instances, leading to a leak. This was
5184
+ # observed in AWS. See also
5185
+ # _LAUNCH_DOUBLE_CHECK_WINDOW in backend_utils.py.
5186
+ force_refresh_statuses={status_lib.ClusterStatus.INIT},
5187
+ cluster_lock_already_held=True,
5188
+ retry_if_missing=False))
4120
5189
  cluster_status_fetched = True
4121
5190
  except exceptions.ClusterStatusFetchingError:
4122
5191
  logger.warning(
@@ -4124,10 +5193,9 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
4124
5193
  f'{handle.cluster_name!r}. Assuming the cluster is still '
4125
5194
  'up.')
4126
5195
  if not cluster_status_fetched:
4127
- record = global_user_state.get_cluster_from_name(
5196
+ status = global_user_state.get_status_from_cluster_name(
4128
5197
  handle.cluster_name)
4129
- prev_cluster_status = record[
4130
- 'status'] if record is not None else None
5198
+ prev_cluster_status = status if status is not None else None
4131
5199
  if prev_cluster_status is None:
4132
5200
  # When the cluster is not in the cluster table, we guarantee that
4133
5201
  # all related resources / cache / config are cleaned up, i.e. it
@@ -4148,8 +5216,9 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
4148
5216
  log_path = os.path.join(os.path.expanduser(self.log_dir),
4149
5217
  'teardown.log')
4150
5218
  log_abs_path = os.path.abspath(log_path)
4151
- cloud = handle.launched_resources.cloud
4152
- config = common_utils.read_yaml(handle.cluster_yaml)
5219
+ launched_resources = handle.launched_resources.assert_launchable()
5220
+ cloud = launched_resources.cloud
5221
+ config = global_user_state.get_cluster_yaml_dict(handle.cluster_yaml)
4153
5222
  cluster_name = handle.cluster_name
4154
5223
  cluster_name_on_cloud = handle.cluster_name_on_cloud
4155
5224
 
@@ -4209,7 +5278,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
4209
5278
  from sky.adaptors import ibm
4210
5279
  from sky.skylet.providers.ibm.vpc_provider import IBMVPCProvider
4211
5280
 
4212
- config_provider = common_utils.read_yaml(
5281
+ config_provider = global_user_state.get_cluster_yaml_dict(
4213
5282
  handle.cluster_yaml)['provider']
4214
5283
  region = config_provider['region']
4215
5284
  search_client = ibm.search_client()
@@ -4238,36 +5307,13 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
4238
5307
  # successfully removed cluster as no exception was raised
4239
5308
  returncode = 0
4240
5309
 
4241
- elif terminate and isinstance(cloud, clouds.SCP):
4242
- # pylint: disable=import-outside-toplevel
4243
- from sky.skylet.providers.scp import node_provider
4244
- config['provider']['cache_stopped_nodes'] = not terminate
4245
- provider = node_provider.SCPNodeProvider(config['provider'],
4246
- cluster_name_on_cloud)
4247
- try:
4248
- if not os.path.exists(provider.metadata.path):
4249
- raise node_provider.SCPError(
4250
- 'SKYPILOT_ERROR_NO_NODES_LAUNCHED: '
4251
- 'Metadata file does not exist.')
4252
-
4253
- with open(provider.metadata.path, 'r', encoding='utf-8') as f:
4254
- metadata = json.load(f)
4255
- node_id = next(iter(metadata.values())).get(
4256
- 'creation', {}).get('virtualServerId', None)
4257
- provider.terminate_node(node_id)
4258
- returncode = 0
4259
- except node_provider.SCPError as e:
4260
- returncode = 1
4261
- stdout = ''
4262
- stderr = str(e)
4263
-
4264
5310
  else:
4265
5311
  config['provider']['cache_stopped_nodes'] = not terminate
4266
5312
  with tempfile.NamedTemporaryFile('w',
4267
5313
  prefix='sky_',
4268
5314
  delete=False,
4269
5315
  suffix='.yml') as f:
4270
- common_utils.dump_yaml(f.name, config)
5316
+ yaml_utils.dump_yaml(f.name, config)
4271
5317
  f.flush()
4272
5318
 
4273
5319
  teardown_verb = 'Terminating' if terminate else 'Stopping'
@@ -4322,12 +5368,17 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
4322
5368
  handle: CloudVmRayResourceHandle,
4323
5369
  terminate: bool,
4324
5370
  purge: bool = False,
4325
- remove_from_db: bool = True) -> None:
5371
+ remove_from_db: bool = True,
5372
+ failover: bool = False) -> None:
4326
5373
  """Cleanup local configs/caches and delete TPUs after teardown.
4327
5374
 
4328
5375
  This method will handle the following cleanup steps:
4329
5376
  * Deleting the TPUs;
4330
5377
  * Removing ssh configs for the cluster;
5378
+ * Deleting the open ports;
5379
+ * Deleting the custom multi network infrastructure based on the
5380
+ failover flag (e.g. delete firewalls, subnets, and VPCs for GPU
5381
+ Direct if failover is False, otherwise, only delete the subnets);
4331
5382
  * Updating the local state of the cluster;
4332
5383
  * Removing the terminated cluster's scripts and ray yaml files.
4333
5384
  """
@@ -4359,19 +5410,24 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
4359
5410
  # The cluster yaml does not exist when skypilot has not found
4360
5411
  # the right resource to provision the cluster.
4361
5412
  if handle.cluster_yaml is not None:
5413
+ launched_resources = (
5414
+ handle.launched_resources.assert_launchable())
5415
+ cloud = launched_resources.cloud
5416
+ config = global_user_state.get_cluster_yaml_dict(
5417
+ handle.cluster_yaml)
5418
+ ports_cleaned_up = False
5419
+ custom_multi_network_cleaned_up = False
4362
5420
  try:
4363
- cloud = handle.launched_resources.cloud
4364
- config = common_utils.read_yaml(handle.cluster_yaml)
4365
5421
  cloud.check_features_are_supported(
4366
- handle.launched_resources,
5422
+ launched_resources,
4367
5423
  {clouds.CloudImplementationFeatures.OPEN_PORTS})
4368
5424
  provision_lib.cleanup_ports(repr(cloud),
4369
5425
  cluster_name_on_cloud,
4370
5426
  handle.launched_resources.ports,
4371
5427
  config['provider'])
4372
- self.remove_cluster_config(handle)
5428
+ ports_cleaned_up = True
4373
5429
  except exceptions.NotSupportedError:
4374
- pass
5430
+ ports_cleaned_up = True
4375
5431
  except exceptions.PortDoesNotExistError:
4376
5432
  logger.debug('Ports do not exist. Skipping cleanup.')
4377
5433
  except Exception as e: # pylint: disable=broad-except
@@ -4383,8 +5439,43 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
4383
5439
  else:
4384
5440
  raise
4385
5441
 
4386
- sky.utils.cluster_utils.SSHConfigHelper.remove_cluster(
4387
- handle.cluster_name)
5442
+ # Clean up custom multi networks, e.g. the subnets, firewalls,
5443
+ # and VPCs created for GCP GPUDirect TCPX
5444
+ try:
5445
+ cloud.check_features_are_supported(
5446
+ handle.launched_resources, {
5447
+ clouds.CloudImplementationFeatures.
5448
+ CUSTOM_MULTI_NETWORK
5449
+ })
5450
+ provision_lib.cleanup_custom_multi_network(
5451
+ repr(cloud), cluster_name_on_cloud, config['provider'],
5452
+ failover)
5453
+ custom_multi_network_cleaned_up = True
5454
+ except exceptions.NotSupportedError:
5455
+ custom_multi_network_cleaned_up = True
5456
+ except Exception as e: # pylint: disable=broad-except
5457
+ if purge:
5458
+ msg = common_utils.format_exception(e, use_bracket=True)
5459
+ logger.warning(
5460
+ f'Failed to cleanup custom multi network. Skipping '
5461
+ f'since purge is set. Details: {msg}')
5462
+ else:
5463
+ raise
5464
+
5465
+ if ports_cleaned_up and custom_multi_network_cleaned_up:
5466
+ try:
5467
+ self.remove_cluster_config(handle)
5468
+ except Exception as e: # pylint: disable=broad-except
5469
+ if purge:
5470
+ msg = common_utils.format_exception(
5471
+ e, use_bracket=True)
5472
+ logger.warning(
5473
+ f'Failed to remove cluster config. Skipping '
5474
+ f'since purge is set. Details: {msg}')
5475
+ else:
5476
+ raise
5477
+
5478
+ cluster_utils.SSHConfigHelper.remove_cluster(handle.cluster_name)
4388
5479
 
4389
5480
  def _detect_abnormal_non_terminated_nodes(
4390
5481
  handle: CloudVmRayResourceHandle) -> None:
@@ -4400,18 +5491,22 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
4400
5491
  # https://github.com/skypilot-org/skypilot/pull/4443#discussion_r1872798032
4401
5492
  attempts = 0
4402
5493
  while True:
4403
- config = common_utils.read_yaml(handle.cluster_yaml)
5494
+ config = global_user_state.get_cluster_yaml_dict(
5495
+ handle.cluster_yaml)
4404
5496
 
4405
5497
  logger.debug(f'instance statuses attempt {attempts + 1}')
4406
5498
  node_status_dict = provision_lib.query_instances(
4407
5499
  repr(cloud),
5500
+ handle.cluster_name,
4408
5501
  cluster_name_on_cloud,
4409
5502
  config['provider'],
4410
5503
  non_terminated_only=False)
4411
5504
 
4412
5505
  unexpected_node_state: Optional[Tuple[str, str]] = None
4413
- for node_id, node_status in node_status_dict.items():
4414
- logger.debug(f'{node_id} status: {node_status}')
5506
+ for node_id, node_status_tuple in node_status_dict.items():
5507
+ node_status, reason = node_status_tuple
5508
+ reason = '' if reason is None else f' ({reason})'
5509
+ logger.debug(f'{node_id} status: {node_status}{reason}')
4415
5510
  # FIXME(cooperc): Some clouds (e.g. GCP) do not distinguish
4416
5511
  # between "stopping/stopped" and "terminating/terminated",
4417
5512
  # so we allow for either status instead of casing on
@@ -4456,13 +5551,18 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
4456
5551
 
4457
5552
  def remove_cluster_config(self, handle: CloudVmRayResourceHandle) -> None:
4458
5553
  """Remove the YAML config of a cluster."""
5554
+ cluster_yaml_path = handle.cluster_yaml
4459
5555
  handle.cluster_yaml = None
4460
5556
  global_user_state.update_cluster_handle(handle.cluster_name, handle)
4461
- common_utils.remove_file_if_exists(handle.cluster_yaml)
5557
+ # Removing the cluster YAML can cause some unexpected stability issues.
5558
+ # See #5011.
5559
+ # global_user_state.remove_cluster_yaml(handle.cluster_name)
5560
+ common_utils.remove_file_if_exists(cluster_yaml_path)
4462
5561
 
4463
5562
  def set_autostop(self,
4464
5563
  handle: CloudVmRayResourceHandle,
4465
5564
  idle_minutes_to_autostop: Optional[int],
5565
+ wait_for: Optional[autostop_lib.AutostopWaitFor],
4466
5566
  down: bool = False,
4467
5567
  stream_logs: bool = True) -> None:
4468
5568
  # The core.autostop() function should have already checked that the
@@ -4489,6 +5589,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
4489
5589
 
4490
5590
  # down = False is the default, but warn the user in case
4491
5591
  # they have explicitly specified it.
5592
+ # TODO(cooperc): Fix for new autostop stuff.
4492
5593
  config_override_down = skypilot_config.get_nested(
4493
5594
  (controller.value.controller_type, 'controller',
4494
5595
  'autostop', 'down'), None)
@@ -4508,17 +5609,26 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
4508
5609
  # Check if we're stopping spot
4509
5610
  assert (handle.launched_resources is not None and
4510
5611
  handle.launched_resources.cloud is not None), handle
4511
- code = autostop_lib.AutostopCodeGen.set_autostop(
4512
- idle_minutes_to_autostop, self.NAME, down)
4513
- returncode, _, stderr = self.run_on_head(handle,
4514
- code,
4515
- require_outputs=True,
4516
- stream_logs=stream_logs)
4517
- subprocess_utils.handle_returncode(returncode,
4518
- code,
4519
- 'Failed to set autostop',
4520
- stderr=stderr,
4521
- stream_logs=stream_logs)
5612
+ if handle.is_grpc_enabled_with_flag:
5613
+ request = autostopv1_pb2.SetAutostopRequest(
5614
+ idle_minutes=idle_minutes_to_autostop,
5615
+ backend=self.NAME,
5616
+ wait_for=wait_for.to_protobuf() if wait_for is not None else
5617
+ autostopv1_pb2.AUTOSTOP_WAIT_FOR_UNSPECIFIED,
5618
+ down=down,
5619
+ )
5620
+ backend_utils.invoke_skylet_with_retries(lambda: SkyletClient(
5621
+ handle.get_grpc_channel()).set_autostop(request))
5622
+ else:
5623
+ code = autostop_lib.AutostopCodeGen.set_autostop(
5624
+ idle_minutes_to_autostop, self.NAME, wait_for, down)
5625
+ returncode, _, stderr = self.run_on_head(
5626
+ handle, code, require_outputs=True, stream_logs=stream_logs)
5627
+ subprocess_utils.handle_returncode(returncode,
5628
+ code,
5629
+ 'Failed to set autostop',
5630
+ stderr=stderr,
5631
+ stream_logs=stream_logs)
4522
5632
  global_user_state.set_cluster_autostop_value(
4523
5633
  handle.cluster_name, idle_minutes_to_autostop, down)
4524
5634
 
@@ -4543,22 +5653,33 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
4543
5653
  # The head node of the cluster is not UP or in an abnormal state.
4544
5654
  # We cannot check if the cluster is autostopping.
4545
5655
  return False
4546
- code = autostop_lib.AutostopCodeGen.is_autostopping()
4547
- returncode, stdout, stderr = self.run_on_head(handle,
4548
- code,
4549
- require_outputs=True,
4550
- stream_logs=stream_logs)
4551
-
4552
- if returncode == 0:
4553
- return message_utils.decode_payload(stdout)
4554
- logger.debug('Failed to check if cluster is autostopping with '
4555
- f'{returncode}: {stdout+stderr}\n'
4556
- f'Command: {code}')
4557
- return False
5656
+ if handle.is_grpc_enabled_with_flag:
5657
+ try:
5658
+ request = autostopv1_pb2.IsAutostoppingRequest()
5659
+ response = backend_utils.invoke_skylet_with_retries(
5660
+ lambda: SkyletClient(handle.get_grpc_channel()
5661
+ ).is_autostopping(request))
5662
+ return response.is_autostopping
5663
+ except Exception as e: # pylint: disable=broad-except
5664
+ # The cluster may have been terminated, causing the gRPC call
5665
+ # to timeout and fail.
5666
+ logger.debug(f'Failed to check if cluster is autostopping: {e}')
5667
+ return False
5668
+ else:
5669
+ code = autostop_lib.AutostopCodeGen.is_autostopping()
5670
+ returncode, stdout, stderr = self.run_on_head(
5671
+ handle, code, require_outputs=True, stream_logs=stream_logs)
5672
+ if returncode == 0:
5673
+ return message_utils.decode_payload(stdout)
5674
+ logger.debug('Failed to check if cluster is autostopping with '
5675
+ f'{returncode}: {stdout+stderr}\n'
5676
+ f'Command: {code}')
5677
+ return False
4558
5678
 
4559
5679
  # TODO(zhwu): Refactor this to a CommandRunner class, so different backends
4560
5680
  # can support its own command runner.
4561
5681
  @timeline.event
5682
+ @context_utils.cancellation_guard
4562
5683
  def run_on_head(
4563
5684
  self,
4564
5685
  handle: CloudVmRayResourceHandle,
@@ -4649,7 +5770,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
4649
5770
  exceptions.InvalidClusterNameError: If the cluster name is invalid.
4650
5771
  # TODO(zhwu): complete the list of exceptions.
4651
5772
  """
4652
- record = global_user_state.get_cluster_from_name(cluster_name)
5773
+ record = global_user_state.get_cluster_from_name(
5774
+ cluster_name, include_user_info=False, summary_response=True)
4653
5775
  if record is None:
4654
5776
  handle_before_refresh = None
4655
5777
  status_before_refresh = None
@@ -4657,6 +5779,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
4657
5779
  handle_before_refresh = record['handle']
4658
5780
  status_before_refresh = record['status']
4659
5781
 
5782
+ handle: Optional[CloudVmRayResourceHandle]
4660
5783
  prev_cluster_status, handle = (status_before_refresh,
4661
5784
  handle_before_refresh)
4662
5785
 
@@ -4668,7 +5791,9 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
4668
5791
  record = backend_utils.refresh_cluster_record(
4669
5792
  cluster_name,
4670
5793
  force_refresh_statuses={status_lib.ClusterStatus.INIT},
4671
- acquire_per_cluster_status_lock=False,
5794
+ cluster_lock_already_held=True,
5795
+ include_user_info=False,
5796
+ summary_response=True,
4672
5797
  )
4673
5798
  if record is not None:
4674
5799
  prev_cluster_status = record['status']
@@ -4677,7 +5802,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
4677
5802
  prev_cluster_status = None
4678
5803
  handle = None
4679
5804
  # We should check the cluster_ever_up after refresh, because if the
4680
- # cluster is terminated (through console or auto-dwon), the record will
5805
+ # cluster is terminated (through console or auto-down), the record will
4681
5806
  # become None and the cluster_ever_up should be considered as False.
4682
5807
  cluster_ever_up = record is not None and record['cluster_ever_up']
4683
5808
  prev_config_hash = record['config_hash'] if record is not None else None
@@ -4690,16 +5815,22 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
4690
5815
  self.check_resources_fit_cluster(handle, task)
4691
5816
  # Use the existing cluster.
4692
5817
  assert handle.launched_resources is not None, (cluster_name, handle)
5818
+ # Take a random resource in order to get resource info that applies
5819
+ # to all resources.
5820
+ one_task_resource = list(task.resources)[0]
5821
+
4693
5822
  # Assume resources share the same ports.
4694
5823
  for resource in task.resources:
4695
- assert resource.ports == list(task.resources)[0].ports
5824
+ assert resource.ports == one_task_resource.ports
4696
5825
  requested_ports_set = resources_utils.port_ranges_to_set(
4697
- list(task.resources)[0].ports)
5826
+ one_task_resource.ports)
4698
5827
  current_ports_set = resources_utils.port_ranges_to_set(
4699
5828
  handle.launched_resources.ports)
4700
5829
  all_ports = resources_utils.port_set_to_ranges(current_ports_set |
4701
5830
  requested_ports_set)
4702
5831
  to_provision = handle.launched_resources
5832
+ assert to_provision is not None
5833
+ to_provision = to_provision.assert_launchable()
4703
5834
  if (to_provision.cloud.OPEN_PORTS_VERSION <=
4704
5835
  clouds.OpenPortsVersion.LAUNCH_ONLY):
4705
5836
  if not requested_ports_set <= current_ports_set:
@@ -4713,6 +5844,57 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
  'a new cluster with the desired ports open.')
  if all_ports:
  to_provision = to_provision.copy(ports=all_ports)
+ # Docker login should always be the same for all resources, since
+ # it's set from envs.
+ for resource in task.resources:
+ assert (resource.docker_login_config ==
+ one_task_resource.docker_login_config), (
+ resource.docker_login_config,
+ one_task_resource.docker_login_config)
+ # If we have docker login config in the new task, override the
+ # existing resources to pick up new credentials. This allows the
+ # user to specify new or fixed credentials if the existing
+ # credentials are not working. If we don't do this, the credentials
+ # from the existing resources will always be reused.
+ if one_task_resource.docker_login_config is not None:
+ to_provision = to_provision.copy(
+ _docker_login_config=one_task_resource.docker_login_config)
+
+ # cluster_config_overrides should be the same for all resources.
+ for resource in task.resources:
+ assert (resource.cluster_config_overrides ==
+ one_task_resource.cluster_config_overrides)
+ if isinstance(to_provision.cloud, clouds.Kubernetes):
+ # Warn users if the Kubernetes pod config is different
+ # from the existing cluster.
+ cluster_yaml_str = global_user_state.get_cluster_yaml_str(
+ cluster_name)
+ actual_cluster_yaml_obj = yaml_utils.safe_load(cluster_yaml_str)
+ desired_cluster_yaml_obj = (
+ kubernetes_utils.combine_pod_config_fields_and_metadata(
+ actual_cluster_yaml_obj,
+ cluster_config_overrides=one_task_resource.
+ cluster_config_overrides,
+ cloud=to_provision.cloud,
+ context=to_provision.region))
+
+ def _get_pod_config(yaml_obj: Dict[str, Any]) -> Dict[str, Any]:
+ return (yaml_obj.get('available_node_types',
+ {}).get('ray_head_default',
+ {}).get('node_config', {}))
+
+ if _get_pod_config(desired_cluster_yaml_obj) != _get_pod_config(
+ actual_cluster_yaml_obj):
+ # pylint: disable=line-too-long
+ logger.warning(
+ f'{colorama.Fore.YELLOW}WARNING: Kubernetes pod config mismatch detected. Task requires different '
+ f'pod config than the existing cluster. The existing '
+ f'cluster will be used with its current pod config.'
+ f'To apply use your task\'s new pod config:\n'
+ f' • Use a new cluster'
+ f' • Or restart this cluster: sky down {cluster_name}; sky launch -c {cluster_name} ...'
+ f'{colorama.Style.RESET_ALL}')
+
  return RetryingVmProvisioner.ToProvisionConfig(
  cluster_name,
  to_provision,
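
Editor's note: the Kubernetes branch added above loads the cluster's stored YAML, applies the task's `cluster_config_overrides`, and warns when the head pod's `node_config` would differ from what is already running. A self-contained sketch of just the comparison step; the YAML key path is taken from the hunk, while the sample dicts are purely illustrative:

    from typing import Any, Dict

    def get_pod_config(yaml_obj: Dict[str, Any]) -> Dict[str, Any]:
        # Dig out the head node's pod spec from a Ray cluster YAML object.
        return (yaml_obj.get('available_node_types',
                             {}).get('ray_head_default',
                                     {}).get('node_config', {}))

    actual = {
        'available_node_types': {
            'ray_head_default': {
                'node_config': {'spec': {'priorityClassName': 'low'}}}}
    }
    desired = {
        'available_node_types': {
            'ray_head_default': {
                'node_config': {'spec': {'priorityClassName': 'high'}}}}
    }

    if get_pod_config(desired) != get_pod_config(actual):
        print('WARNING: pod config mismatch; existing cluster config is kept.')
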
@@ -4727,33 +5909,41 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
  common_utils.check_cluster_name_is_valid(cluster_name)

  if to_provision is None:
- # The cluster is recently terminated either by autostop or manually
- # terminated on the cloud. We should use the previously terminated
- # resources to provision the cluster.
- #
- # FIXME(zongheng): this assert can be hit by using two terminals.
- # First, create a 'dbg' cluster. Then:
- # Terminal 1: sky down dbg -y
- # Terminal 2: sky launch -c dbg -- echo
- # Run it in order. Terminal 2 will show this error after terminal 1
- # succeeds in downing the cluster and releasing the lock.
- assert isinstance(
- handle_before_refresh, CloudVmRayResourceHandle), (
- f'Trying to launch cluster {cluster_name!r} recently '
- 'terminated on the cloud, but the handle is not a '
- f'CloudVmRayResourceHandle ({handle_before_refresh}).')
- status_before_refresh_str = None
- if status_before_refresh is not None:
- status_before_refresh_str = status_before_refresh.value
-
- logger.info(
- f'The cluster {cluster_name!r} (status: '
- f'{status_before_refresh_str}) was not found on the cloud: it '
- 'may be autodowned, manually terminated, or its launch never '
- 'succeeded. Provisioning a new cluster by using the same '
- 'resources as its original launch.')
- to_provision = handle_before_refresh.launched_resources
- self.check_resources_fit_cluster(handle_before_refresh, task)
+ # Recently terminated after refresh. OPTIMIZE usually ran outside
+ # the lock, so that decision may be stale by now. Under the lock,
+ # ensure we always have a concrete plan via the following order:
+ # 1) Reuse last placement snapshot (if available);
+ # 2) Else, call injected planner for a fresh plan.
+ # If we still have a pre-refresh handle snapshot with a concrete
+ # placement, prefer reusing it.
+ if (isinstance(handle_before_refresh, CloudVmRayResourceHandle) and
+ handle_before_refresh.launched_resources is not None):
+ to_provision = handle_before_refresh.launched_resources
+ # Ensure the requested task fits the previous placement.
+ self.check_resources_fit_cluster(handle_before_refresh, task)
+ # Mirror the original message for reuse path.
+ status_before_refresh_str = None
+ if status_before_refresh is not None:
+ status_before_refresh_str = status_before_refresh.value
+ logger.info(
+ f'The cluster {cluster_name!r} (status: '
+ f'{status_before_refresh_str}) was not found on the cloud: '
+ 'it may be autodowned, manually terminated, or its launch '
+ 'never succeeded. Provisioning a new cluster by using the '
+ 'same resources as its original launch.')
+ elif self._planner is not None:
+ to_provision = self._planner(task)
+ logger.info(
+ 'Previous placement snapshot missing; computing a fresh '
+ 'plan for provisioning.')
+ else:
+ # Without a snapshot or planner, we cannot proceed safely.
+ # Surface a user-friendly error without a long traceback.
+ with ux_utils.print_exception_no_traceback():
+ raise RuntimeError(
+ 'No concrete launch plan available after recent cloud '
+ f'termination of cluster {cluster_name!r}. Ensure the '
+ 'OPTIMIZE stage runs or provide concrete resources.')

  return RetryingVmProvisioner.ToProvisionConfig(
  cluster_name,
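
Editor's note: the rewritten `to_provision is None` branch now resolves a launch plan in a fixed order: reuse the pre-refresh handle's `launched_resources`, otherwise ask the injected `self._planner`, otherwise raise a user-facing error. A minimal sketch of that decision ladder, with hypothetical stand-in types:

    from typing import Callable, Optional

    class Handle:
        def __init__(self, launched_resources: Optional[str]) -> None:
            self.launched_resources = launched_resources

    def pick_plan(handle_before_refresh: Optional[Handle],
                  planner: Optional[Callable[[], str]]) -> str:
        # 1) Prefer the last known placement snapshot, if it is concrete.
        if (handle_before_refresh is not None and
                handle_before_refresh.launched_resources is not None):
            return handle_before_refresh.launched_resources
        # 2) Otherwise ask the injected planner for a fresh plan.
        if planner is not None:
            return planner()
        # 3) With neither, fail loudly rather than guess.
        raise RuntimeError('No concrete launch plan available.')

    print(pick_plan(Handle('gcp:n1-standard-8'), None))  # reuse snapshot
    print(pick_plan(None, lambda: 'aws:m6i.large'))      # fresh plan
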
@@ -5033,18 +6223,17 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
  # reconstruct them during cluster restart.
  continue
  storage_mounts_metadata[dst] = storage_obj.handle
- lock_path = (
- backend_utils.CLUSTER_FILE_MOUNTS_LOCK_PATH.format(cluster_name))
+ lock_id = backend_utils.cluster_file_mounts_lock_id(cluster_name)
  lock_timeout = backend_utils.CLUSTER_FILE_MOUNTS_LOCK_TIMEOUT_SECONDS
  try:
- with filelock.FileLock(lock_path, lock_timeout):
+ with locks.get_lock(lock_id, lock_timeout):
  global_user_state.set_cluster_storage_mounts_metadata(
  cluster_name, storage_mounts_metadata)
- except filelock.Timeout as e:
+ except locks.LockTimeout as e:
  raise RuntimeError(
  f'Failed to store metadata for cluster {cluster_name!r} due to '
  'a timeout when trying to access local database. Please '
- f'try again or manually remove the lock at {lock_path}. '
+ f'try again or manually remove the lock at {lock_id}. '
  f'{common_utils.format_exception(e)}') from None

  def get_storage_mounts_metadata(
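
Editor's note: this hunk (and the matching one below) swaps a direct `filelock.FileLock` on a path for a `locks.get_lock(lock_id, timeout)` context manager and a `locks.LockTimeout` exception. One plausible shape for such a facade, sketched here with a file-lock backend; this is an assumption, not SkyPilot's actual `locks` module:

    import contextlib
    import os

    import filelock  # third-party dependency: pip install filelock

    class LockTimeout(Exception):
        """Raised when a lock cannot be acquired within the timeout."""

    @contextlib.contextmanager
    def get_lock(lock_id: str, timeout: float):
        """Acquire a named lock; the backend choice is hidden behind the id."""
        path = os.path.join('/tmp', f'{lock_id}.lock')
        try:
            with filelock.FileLock(path, timeout=timeout):
                yield
        except filelock.Timeout as e:
            raise LockTimeout(str(e)) from e

    # Usage mirrors the calling convention in the hunk:
    try:
        with get_lock('cluster_file_mounts.my-cluster', timeout=10):
            pass  # ... read/write cluster metadata here ...
    except LockTimeout:
        print('Lock is busy; please retry.')

Naming locks by an id rather than a filesystem path lets the backend swap the lock implementation (for example, to a database-backed lock) without touching callers.
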
@@ -5055,19 +6244,18 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
  After retrieving storage_mounts_metadata, it converts back the
  StorageMetadata to Storage object and restores 'storage_mounts.'
  """
- lock_path = (
- backend_utils.CLUSTER_FILE_MOUNTS_LOCK_PATH.format(cluster_name))
+ lock_id = backend_utils.cluster_file_mounts_lock_id(cluster_name)
  lock_timeout = backend_utils.CLUSTER_FILE_MOUNTS_LOCK_TIMEOUT_SECONDS
  try:
- with filelock.FileLock(lock_path, lock_timeout):
+ with locks.get_lock(lock_id, lock_timeout):
  storage_mounts_metadata = (
  global_user_state.get_cluster_storage_mounts_metadata(
  cluster_name))
- except filelock.Timeout as e:
+ except locks.LockTimeout as e:
  raise RuntimeError(
  f'Failed to retrieve metadata for cluster {cluster_name!r} '
  'due to a timeout when trying to access local database. '
- f'Please try again or manually remove the lock at {lock_path}.'
+ f'Please try again or manually remove the lock at {lock_id}.'
  f' {common_utils.format_exception(e)}') from None

  if storage_mounts_metadata is None:
@@ -5104,7 +6292,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
  def _get_task_env_vars(self, task: task_lib.Task, job_id: int,
  handle: CloudVmRayResourceHandle) -> Dict[str, str]:
  """Returns the environment variables for the task."""
- env_vars = task.envs.copy()
+ env_vars = task.envs_and_secrets
  # If it is a managed job, the TASK_ID_ENV_VAR will have been already set
  # by the controller.
  if constants.TASK_ID_ENV_VAR not in env_vars:
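
Editor's note: `task.envs.copy()` becomes `task.envs_and_secrets`, so secret variables are injected alongside regular environment variables. A guess at what such a property does (merge both maps into a fresh dict so the caller can mutate the result safely); the names and the merge order are assumptions:

    from typing import Dict

    class Task:
        def __init__(self, envs: Dict[str, str],
                     secrets: Dict[str, str]) -> None:
            self.envs = envs
            self.secrets = secrets

        @property
        def envs_and_secrets(self) -> Dict[str, str]:
            merged = dict(self.envs)     # copy: callers cannot mutate self.envs
            merged.update(self.secrets)  # secrets win on key collisions
            return merged

    task = Task(envs={'MODE': 'prod'}, secrets={'API_KEY': '***'})
    env_vars = task.envs_and_secrets
    env_vars['SKYPILOT_TASK_ID'] = 'job-42'  # safe: does not touch task.envs
    print(sorted(env_vars))
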
@@ -5116,11 +6304,17 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
  env_vars.update(self._skypilot_predefined_env_vars(handle))
  return env_vars

+ def _get_managed_job_user_id(self, task: task_lib.Task) -> Optional[str]:
+ """Returns the user id for the managed job."""
+ if task.managed_job_dag is not None:
+ return task.envs[constants.USER_ID_ENV_VAR]
+ return None
+
  def _execute_task_one_node(self, handle: CloudVmRayResourceHandle,
  task: task_lib.Task, job_id: int,
- detach_run: bool) -> None:
+ remote_log_dir: str) -> None:
  # Launch the command as a Ray task.
- log_dir = os.path.join(self.log_dir, 'tasks')
+ log_dir = os.path.join(remote_log_dir, 'tasks')

  resources_dict = backend_utils.get_task_demands_dict(task)
  internal_ips = handle.internal_ips()
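
Editor's note: the new `_get_managed_job_user_id` helper returns a user id only for managed jobs, reading it from the task's environment. A standalone sketch of that logic; the actual name and value of the constant in `constants` are assumptions here:

    from typing import Dict, Optional

    USER_ID_ENV_VAR = 'SKYPILOT_USER_ID'  # assumed constant name/value

    def get_managed_job_user_id(envs: Dict[str, str],
                                is_managed_job: bool) -> Optional[str]:
        # Managed jobs run on the jobs controller, so the submitting user's
        # id is recovered from the task's env vars rather than the process.
        if is_managed_job:
            return envs[USER_ID_ENV_VAR]
        return None

    print(get_managed_job_user_id({'SKYPILOT_USER_ID': 'u-123'}, True))  # u-123
    print(get_managed_job_user_id({}, False))                            # None
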
@@ -5154,21 +6348,22 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):

  codegen.add_epilogue()

- self._exec_code_on_head(handle,
- codegen.build(),
- job_id,
- detach_run=detach_run,
- managed_job_dag=task.managed_job_dag)
+ self._exec_code_on_head(
+ handle,
+ codegen.build(),
+ job_id,
+ managed_job_dag=task.managed_job_dag,
+ managed_job_user_id=self._get_managed_job_user_id(task),
+ remote_log_dir=remote_log_dir)

  def _execute_task_n_nodes(self, handle: CloudVmRayResourceHandle,
  task: task_lib.Task, job_id: int,
- detach_run: bool) -> None:
+ remote_log_dir: str) -> None:
  # Strategy:
  # ray.init(...)
  # for node:
  # submit _run_cmd(cmd) with resource {node_i: 1}
- log_dir_base = self.log_dir
- log_dir = os.path.join(log_dir_base, 'tasks')
+ log_dir = os.path.join(remote_log_dir, 'tasks')
  resources_dict = backend_utils.get_task_demands_dict(task)
  internal_ips = handle.internal_ips()
  assert internal_ips is not None, 'internal_ips is not cached in handle'
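
Editor's note: both task-execution paths now take an explicit `remote_log_dir` instead of reading `self.log_dir`, and forward it (together with the managed-job user id) to `_exec_code_on_head`. A small sketch of the plumbing; the directory layout is illustrative, not SkyPilot's actual scheme:

    import os

    def remote_log_dir_for_job(job_id: int, base: str = '~/sky_logs') -> str:
        """Hypothetical helper: pick a per-job log directory on the cluster."""
        return os.path.join(base, f'job-{job_id}')

    def execute_task_one_node(job_id: int, remote_log_dir: str) -> None:
        # Mirrors the hunk: task logs live under '<remote_log_dir>/tasks'.
        log_dir = os.path.join(remote_log_dir, 'tasks')
        print(f'job {job_id}: writing task logs under {log_dir}')

    execute_task_one_node(42, remote_log_dir_for_job(42))
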
@@ -5210,8 +6405,10 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):

  codegen.add_epilogue()
  # TODO(zhanghao): Add help info for downloading logs.
- self._exec_code_on_head(handle,
- codegen.build(),
- job_id,
- detach_run=detach_run,
- managed_job_dag=task.managed_job_dag)
+ self._exec_code_on_head(
+ handle,
+ codegen.build(),
+ job_id,
+ managed_job_dag=task.managed_job_dag,
+ managed_job_user_id=self._get_managed_job_user_id(task),
+ remote_log_dir=remote_log_dir)