skypilot-nightly 1.0.0.dev20250502__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (546)
  1. sky/__init__.py +22 -6
  2. sky/adaptors/aws.py +81 -16
  3. sky/adaptors/common.py +25 -2
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/gcp.py +11 -0
  7. sky/adaptors/hyperbolic.py +8 -0
  8. sky/adaptors/ibm.py +5 -2
  9. sky/adaptors/kubernetes.py +149 -18
  10. sky/adaptors/nebius.py +173 -30
  11. sky/adaptors/primeintellect.py +1 -0
  12. sky/adaptors/runpod.py +68 -0
  13. sky/adaptors/seeweb.py +183 -0
  14. sky/adaptors/shadeform.py +89 -0
  15. sky/admin_policy.py +187 -4
  16. sky/authentication.py +179 -225
  17. sky/backends/__init__.py +4 -2
  18. sky/backends/backend.py +22 -9
  19. sky/backends/backend_utils.py +1323 -397
  20. sky/backends/cloud_vm_ray_backend.py +1749 -1029
  21. sky/backends/docker_utils.py +1 -1
  22. sky/backends/local_docker_backend.py +11 -6
  23. sky/backends/task_codegen.py +633 -0
  24. sky/backends/wheel_utils.py +55 -9
  25. sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
  26. sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
  27. sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
  28. sky/{clouds/service_catalog → catalog}/common.py +90 -49
  29. sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
  30. sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
  31. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +116 -80
  32. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
  33. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +70 -16
  34. sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
  35. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
  36. sky/catalog/data_fetchers/fetch_nebius.py +338 -0
  37. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  38. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  39. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  40. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
  41. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
  42. sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
  43. sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
  44. sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
  45. sky/catalog/hyperbolic_catalog.py +136 -0
  46. sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
  47. sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
  48. sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
  49. sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
  50. sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
  51. sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
  52. sky/catalog/primeintellect_catalog.py +95 -0
  53. sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
  54. sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
  55. sky/catalog/seeweb_catalog.py +184 -0
  56. sky/catalog/shadeform_catalog.py +165 -0
  57. sky/catalog/ssh_catalog.py +167 -0
  58. sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
  59. sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
  60. sky/check.py +533 -185
  61. sky/cli.py +5 -5975
  62. sky/client/{cli.py → cli/command.py} +2591 -1956
  63. sky/client/cli/deprecation_utils.py +99 -0
  64. sky/client/cli/flags.py +359 -0
  65. sky/client/cli/table_utils.py +322 -0
  66. sky/client/cli/utils.py +79 -0
  67. sky/client/common.py +78 -32
  68. sky/client/oauth.py +82 -0
  69. sky/client/sdk.py +1219 -319
  70. sky/client/sdk_async.py +827 -0
  71. sky/client/service_account_auth.py +47 -0
  72. sky/cloud_stores.py +82 -3
  73. sky/clouds/__init__.py +13 -0
  74. sky/clouds/aws.py +564 -164
  75. sky/clouds/azure.py +105 -83
  76. sky/clouds/cloud.py +140 -40
  77. sky/clouds/cudo.py +68 -50
  78. sky/clouds/do.py +66 -48
  79. sky/clouds/fluidstack.py +63 -44
  80. sky/clouds/gcp.py +339 -110
  81. sky/clouds/hyperbolic.py +293 -0
  82. sky/clouds/ibm.py +70 -49
  83. sky/clouds/kubernetes.py +570 -162
  84. sky/clouds/lambda_cloud.py +74 -54
  85. sky/clouds/nebius.py +210 -81
  86. sky/clouds/oci.py +88 -66
  87. sky/clouds/paperspace.py +61 -44
  88. sky/clouds/primeintellect.py +317 -0
  89. sky/clouds/runpod.py +164 -74
  90. sky/clouds/scp.py +89 -86
  91. sky/clouds/seeweb.py +477 -0
  92. sky/clouds/shadeform.py +400 -0
  93. sky/clouds/ssh.py +263 -0
  94. sky/clouds/utils/aws_utils.py +10 -4
  95. sky/clouds/utils/gcp_utils.py +87 -11
  96. sky/clouds/utils/oci_utils.py +38 -14
  97. sky/clouds/utils/scp_utils.py +231 -167
  98. sky/clouds/vast.py +99 -77
  99. sky/clouds/vsphere.py +51 -40
  100. sky/core.py +375 -173
  101. sky/dag.py +15 -0
  102. sky/dashboard/out/404.html +1 -1
  103. sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
  104. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  105. sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
  106. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
  107. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  108. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
  109. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  110. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
  111. sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
  112. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
  113. sky/dashboard/out/_next/static/chunks/3800-7b45f9fbb6308557.js +1 -0
  114. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
  115. sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
  116. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
  117. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  118. sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
  119. sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
  120. sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
  121. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  122. sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
  123. sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
  124. sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
  125. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  126. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
  127. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
  128. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  129. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
  130. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  131. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  132. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
  133. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  134. sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
  135. sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
  136. sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
  137. sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
  138. sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
  139. sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
  140. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
  141. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
  142. sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
  143. sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
  144. sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
  145. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
  146. sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
  147. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
  148. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
  149. sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
  150. sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
  151. sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
  152. sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
  153. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-84a40f8c7c627fe4.js +1 -0
  154. sky/dashboard/out/_next/static/chunks/pages/workspaces-531b2f8c4bf89f82.js +1 -0
  155. sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
  156. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  157. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  158. sky/dashboard/out/clusters/[cluster].html +1 -1
  159. sky/dashboard/out/clusters.html +1 -1
  160. sky/dashboard/out/config.html +1 -0
  161. sky/dashboard/out/index.html +1 -1
  162. sky/dashboard/out/infra/[context].html +1 -0
  163. sky/dashboard/out/infra.html +1 -0
  164. sky/dashboard/out/jobs/[job].html +1 -1
  165. sky/dashboard/out/jobs/pools/[pool].html +1 -0
  166. sky/dashboard/out/jobs.html +1 -1
  167. sky/dashboard/out/users.html +1 -0
  168. sky/dashboard/out/volumes.html +1 -0
  169. sky/dashboard/out/workspace/new.html +1 -0
  170. sky/dashboard/out/workspaces/[name].html +1 -0
  171. sky/dashboard/out/workspaces.html +1 -0
  172. sky/data/data_utils.py +137 -1
  173. sky/data/mounting_utils.py +269 -84
  174. sky/data/storage.py +1460 -1807
  175. sky/data/storage_utils.py +43 -57
  176. sky/exceptions.py +126 -2
  177. sky/execution.py +216 -63
  178. sky/global_user_state.py +2390 -586
  179. sky/jobs/__init__.py +7 -0
  180. sky/jobs/client/sdk.py +300 -58
  181. sky/jobs/client/sdk_async.py +161 -0
  182. sky/jobs/constants.py +15 -8
  183. sky/jobs/controller.py +848 -275
  184. sky/jobs/file_content_utils.py +128 -0
  185. sky/jobs/log_gc.py +193 -0
  186. sky/jobs/recovery_strategy.py +402 -152
  187. sky/jobs/scheduler.py +314 -189
  188. sky/jobs/server/core.py +836 -255
  189. sky/jobs/server/server.py +156 -115
  190. sky/jobs/server/utils.py +136 -0
  191. sky/jobs/state.py +2109 -706
  192. sky/jobs/utils.py +1306 -215
  193. sky/logs/__init__.py +21 -0
  194. sky/logs/agent.py +108 -0
  195. sky/logs/aws.py +243 -0
  196. sky/logs/gcp.py +91 -0
  197. sky/metrics/__init__.py +0 -0
  198. sky/metrics/utils.py +453 -0
  199. sky/models.py +78 -1
  200. sky/optimizer.py +164 -70
  201. sky/provision/__init__.py +90 -4
  202. sky/provision/aws/config.py +147 -26
  203. sky/provision/aws/instance.py +136 -50
  204. sky/provision/azure/instance.py +11 -6
  205. sky/provision/common.py +13 -1
  206. sky/provision/cudo/cudo_machine_type.py +1 -1
  207. sky/provision/cudo/cudo_utils.py +14 -8
  208. sky/provision/cudo/cudo_wrapper.py +72 -71
  209. sky/provision/cudo/instance.py +10 -6
  210. sky/provision/do/instance.py +10 -6
  211. sky/provision/do/utils.py +4 -3
  212. sky/provision/docker_utils.py +140 -33
  213. sky/provision/fluidstack/instance.py +13 -8
  214. sky/provision/gcp/__init__.py +1 -0
  215. sky/provision/gcp/config.py +301 -19
  216. sky/provision/gcp/constants.py +218 -0
  217. sky/provision/gcp/instance.py +36 -8
  218. sky/provision/gcp/instance_utils.py +18 -4
  219. sky/provision/gcp/volume_utils.py +247 -0
  220. sky/provision/hyperbolic/__init__.py +12 -0
  221. sky/provision/hyperbolic/config.py +10 -0
  222. sky/provision/hyperbolic/instance.py +437 -0
  223. sky/provision/hyperbolic/utils.py +373 -0
  224. sky/provision/instance_setup.py +101 -20
  225. sky/provision/kubernetes/__init__.py +5 -0
  226. sky/provision/kubernetes/config.py +9 -52
  227. sky/provision/kubernetes/constants.py +17 -0
  228. sky/provision/kubernetes/instance.py +919 -280
  229. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  230. sky/provision/kubernetes/network.py +27 -17
  231. sky/provision/kubernetes/network_utils.py +44 -43
  232. sky/provision/kubernetes/utils.py +1221 -534
  233. sky/provision/kubernetes/volume.py +343 -0
  234. sky/provision/lambda_cloud/instance.py +22 -16
  235. sky/provision/nebius/constants.py +50 -0
  236. sky/provision/nebius/instance.py +19 -6
  237. sky/provision/nebius/utils.py +237 -137
  238. sky/provision/oci/instance.py +10 -5
  239. sky/provision/paperspace/instance.py +10 -7
  240. sky/provision/paperspace/utils.py +1 -1
  241. sky/provision/primeintellect/__init__.py +10 -0
  242. sky/provision/primeintellect/config.py +11 -0
  243. sky/provision/primeintellect/instance.py +454 -0
  244. sky/provision/primeintellect/utils.py +398 -0
  245. sky/provision/provisioner.py +117 -36
  246. sky/provision/runpod/__init__.py +5 -0
  247. sky/provision/runpod/instance.py +27 -6
  248. sky/provision/runpod/utils.py +51 -18
  249. sky/provision/runpod/volume.py +214 -0
  250. sky/provision/scp/__init__.py +15 -0
  251. sky/provision/scp/config.py +93 -0
  252. sky/provision/scp/instance.py +707 -0
  253. sky/provision/seeweb/__init__.py +11 -0
  254. sky/provision/seeweb/config.py +13 -0
  255. sky/provision/seeweb/instance.py +812 -0
  256. sky/provision/shadeform/__init__.py +11 -0
  257. sky/provision/shadeform/config.py +12 -0
  258. sky/provision/shadeform/instance.py +351 -0
  259. sky/provision/shadeform/shadeform_utils.py +83 -0
  260. sky/provision/ssh/__init__.py +18 -0
  261. sky/provision/vast/instance.py +13 -8
  262. sky/provision/vast/utils.py +10 -7
  263. sky/provision/volume.py +164 -0
  264. sky/provision/vsphere/common/ssl_helper.py +1 -1
  265. sky/provision/vsphere/common/vapiconnect.py +2 -1
  266. sky/provision/vsphere/common/vim_utils.py +4 -4
  267. sky/provision/vsphere/instance.py +15 -10
  268. sky/provision/vsphere/vsphere_utils.py +17 -20
  269. sky/py.typed +0 -0
  270. sky/resources.py +845 -119
  271. sky/schemas/__init__.py +0 -0
  272. sky/schemas/api/__init__.py +0 -0
  273. sky/schemas/api/responses.py +227 -0
  274. sky/schemas/db/README +4 -0
  275. sky/schemas/db/env.py +90 -0
  276. sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
  277. sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
  278. sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
  279. sky/schemas/db/global_user_state/004_is_managed.py +34 -0
  280. sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
  281. sky/schemas/db/global_user_state/006_provision_log.py +41 -0
  282. sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
  283. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  284. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  285. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  286. sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
  287. sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
  288. sky/schemas/db/script.py.mako +28 -0
  289. sky/schemas/db/serve_state/001_initial_schema.py +67 -0
  290. sky/schemas/db/serve_state/002_yaml_content.py +34 -0
  291. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  292. sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
  293. sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
  294. sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
  295. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  296. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  297. sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
  298. sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
  299. sky/schemas/generated/__init__.py +0 -0
  300. sky/schemas/generated/autostopv1_pb2.py +36 -0
  301. sky/schemas/generated/autostopv1_pb2.pyi +43 -0
  302. sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
  303. sky/schemas/generated/jobsv1_pb2.py +86 -0
  304. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  305. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  306. sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
  307. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  308. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  309. sky/schemas/generated/servev1_pb2.py +58 -0
  310. sky/schemas/generated/servev1_pb2.pyi +115 -0
  311. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  312. sky/serve/autoscalers.py +357 -5
  313. sky/serve/client/impl.py +310 -0
  314. sky/serve/client/sdk.py +47 -139
  315. sky/serve/client/sdk_async.py +130 -0
  316. sky/serve/constants.py +12 -9
  317. sky/serve/controller.py +68 -17
  318. sky/serve/load_balancer.py +106 -60
  319. sky/serve/load_balancing_policies.py +116 -2
  320. sky/serve/replica_managers.py +434 -249
  321. sky/serve/serve_rpc_utils.py +179 -0
  322. sky/serve/serve_state.py +569 -257
  323. sky/serve/serve_utils.py +775 -265
  324. sky/serve/server/core.py +66 -711
  325. sky/serve/server/impl.py +1093 -0
  326. sky/serve/server/server.py +21 -18
  327. sky/serve/service.py +192 -89
  328. sky/serve/service_spec.py +144 -20
  329. sky/serve/spot_placer.py +3 -0
  330. sky/server/auth/__init__.py +0 -0
  331. sky/server/auth/authn.py +50 -0
  332. sky/server/auth/loopback.py +38 -0
  333. sky/server/auth/oauth2_proxy.py +202 -0
  334. sky/server/common.py +478 -182
  335. sky/server/config.py +85 -23
  336. sky/server/constants.py +44 -6
  337. sky/server/daemons.py +295 -0
  338. sky/server/html/token_page.html +185 -0
  339. sky/server/metrics.py +160 -0
  340. sky/server/middleware_utils.py +166 -0
  341. sky/server/requests/executor.py +558 -138
  342. sky/server/requests/payloads.py +364 -24
  343. sky/server/requests/preconditions.py +21 -17
  344. sky/server/requests/process.py +112 -29
  345. sky/server/requests/request_names.py +121 -0
  346. sky/server/requests/requests.py +822 -226
  347. sky/server/requests/serializers/decoders.py +82 -31
  348. sky/server/requests/serializers/encoders.py +140 -22
  349. sky/server/requests/threads.py +117 -0
  350. sky/server/rest.py +455 -0
  351. sky/server/server.py +1309 -285
  352. sky/server/state.py +20 -0
  353. sky/server/stream_utils.py +327 -61
  354. sky/server/uvicorn.py +217 -3
  355. sky/server/versions.py +270 -0
  356. sky/setup_files/MANIFEST.in +11 -1
  357. sky/setup_files/alembic.ini +160 -0
  358. sky/setup_files/dependencies.py +139 -31
  359. sky/setup_files/setup.py +44 -42
  360. sky/sky_logging.py +114 -7
  361. sky/skylet/attempt_skylet.py +106 -24
  362. sky/skylet/autostop_lib.py +129 -8
  363. sky/skylet/configs.py +29 -20
  364. sky/skylet/constants.py +216 -25
  365. sky/skylet/events.py +101 -21
  366. sky/skylet/job_lib.py +345 -164
  367. sky/skylet/log_lib.py +297 -18
  368. sky/skylet/log_lib.pyi +44 -1
  369. sky/skylet/providers/ibm/node_provider.py +12 -8
  370. sky/skylet/providers/ibm/vpc_provider.py +13 -12
  371. sky/skylet/ray_patches/__init__.py +17 -3
  372. sky/skylet/ray_patches/autoscaler.py.diff +18 -0
  373. sky/skylet/ray_patches/cli.py.diff +19 -0
  374. sky/skylet/ray_patches/command_runner.py.diff +17 -0
  375. sky/skylet/ray_patches/log_monitor.py.diff +20 -0
  376. sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
  377. sky/skylet/ray_patches/updater.py.diff +18 -0
  378. sky/skylet/ray_patches/worker.py.diff +41 -0
  379. sky/skylet/runtime_utils.py +21 -0
  380. sky/skylet/services.py +568 -0
  381. sky/skylet/skylet.py +72 -4
  382. sky/skylet/subprocess_daemon.py +104 -29
  383. sky/skypilot_config.py +506 -99
  384. sky/ssh_node_pools/__init__.py +1 -0
  385. sky/ssh_node_pools/core.py +135 -0
  386. sky/ssh_node_pools/server.py +233 -0
  387. sky/task.py +685 -163
  388. sky/templates/aws-ray.yml.j2 +11 -3
  389. sky/templates/azure-ray.yml.j2 +2 -1
  390. sky/templates/cudo-ray.yml.j2 +1 -0
  391. sky/templates/do-ray.yml.j2 +2 -1
  392. sky/templates/fluidstack-ray.yml.j2 +1 -0
  393. sky/templates/gcp-ray.yml.j2 +62 -1
  394. sky/templates/hyperbolic-ray.yml.j2 +68 -0
  395. sky/templates/ibm-ray.yml.j2 +2 -1
  396. sky/templates/jobs-controller.yaml.j2 +27 -24
  397. sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
  398. sky/templates/kubernetes-ray.yml.j2 +611 -50
  399. sky/templates/lambda-ray.yml.j2 +2 -1
  400. sky/templates/nebius-ray.yml.j2 +34 -12
  401. sky/templates/oci-ray.yml.j2 +1 -0
  402. sky/templates/paperspace-ray.yml.j2 +2 -1
  403. sky/templates/primeintellect-ray.yml.j2 +72 -0
  404. sky/templates/runpod-ray.yml.j2 +10 -1
  405. sky/templates/scp-ray.yml.j2 +4 -50
  406. sky/templates/seeweb-ray.yml.j2 +171 -0
  407. sky/templates/shadeform-ray.yml.j2 +73 -0
  408. sky/templates/sky-serve-controller.yaml.j2 +22 -2
  409. sky/templates/vast-ray.yml.j2 +1 -0
  410. sky/templates/vsphere-ray.yml.j2 +1 -0
  411. sky/templates/websocket_proxy.py +212 -37
  412. sky/usage/usage_lib.py +31 -15
  413. sky/users/__init__.py +0 -0
  414. sky/users/model.conf +15 -0
  415. sky/users/permission.py +397 -0
  416. sky/users/rbac.py +121 -0
  417. sky/users/server.py +720 -0
  418. sky/users/token_service.py +218 -0
  419. sky/utils/accelerator_registry.py +35 -5
  420. sky/utils/admin_policy_utils.py +84 -38
  421. sky/utils/annotations.py +38 -5
  422. sky/utils/asyncio_utils.py +78 -0
  423. sky/utils/atomic.py +1 -1
  424. sky/utils/auth_utils.py +153 -0
  425. sky/utils/benchmark_utils.py +60 -0
  426. sky/utils/cli_utils/status_utils.py +159 -86
  427. sky/utils/cluster_utils.py +31 -9
  428. sky/utils/command_runner.py +354 -68
  429. sky/utils/command_runner.pyi +93 -3
  430. sky/utils/common.py +35 -8
  431. sky/utils/common_utils.py +314 -91
  432. sky/utils/config_utils.py +74 -5
  433. sky/utils/context.py +403 -0
  434. sky/utils/context_utils.py +242 -0
  435. sky/utils/controller_utils.py +383 -89
  436. sky/utils/dag_utils.py +31 -12
  437. sky/utils/db/__init__.py +0 -0
  438. sky/utils/db/db_utils.py +485 -0
  439. sky/utils/db/kv_cache.py +149 -0
  440. sky/utils/db/migration_utils.py +137 -0
  441. sky/utils/directory_utils.py +12 -0
  442. sky/utils/env_options.py +13 -0
  443. sky/utils/git.py +567 -0
  444. sky/utils/git_clone.sh +460 -0
  445. sky/utils/infra_utils.py +195 -0
  446. sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
  447. sky/utils/kubernetes/config_map_utils.py +133 -0
  448. sky/utils/kubernetes/create_cluster.sh +15 -29
  449. sky/utils/kubernetes/delete_cluster.sh +10 -7
  450. sky/utils/kubernetes/deploy_ssh_node_pools.py +1177 -0
  451. sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
  452. sky/utils/kubernetes/generate_kind_config.py +6 -66
  453. sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
  454. sky/utils/kubernetes/gpu_labeler.py +18 -8
  455. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
  456. sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
  457. sky/utils/kubernetes/kubernetes_deploy_utils.py +284 -114
  458. sky/utils/kubernetes/rsync_helper.sh +11 -3
  459. sky/utils/kubernetes/ssh-tunnel.sh +379 -0
  460. sky/utils/kubernetes/ssh_utils.py +221 -0
  461. sky/utils/kubernetes_enums.py +8 -15
  462. sky/utils/lock_events.py +94 -0
  463. sky/utils/locks.py +416 -0
  464. sky/utils/log_utils.py +82 -107
  465. sky/utils/perf_utils.py +22 -0
  466. sky/utils/resource_checker.py +298 -0
  467. sky/utils/resources_utils.py +249 -32
  468. sky/utils/rich_utils.py +217 -39
  469. sky/utils/schemas.py +955 -160
  470. sky/utils/serialize_utils.py +16 -0
  471. sky/utils/status_lib.py +10 -0
  472. sky/utils/subprocess_utils.py +29 -15
  473. sky/utils/tempstore.py +70 -0
  474. sky/utils/thread_utils.py +91 -0
  475. sky/utils/timeline.py +26 -53
  476. sky/utils/ux_utils.py +84 -15
  477. sky/utils/validator.py +11 -1
  478. sky/utils/volume.py +165 -0
  479. sky/utils/yaml_utils.py +111 -0
  480. sky/volumes/__init__.py +13 -0
  481. sky/volumes/client/__init__.py +0 -0
  482. sky/volumes/client/sdk.py +150 -0
  483. sky/volumes/server/__init__.py +0 -0
  484. sky/volumes/server/core.py +270 -0
  485. sky/volumes/server/server.py +124 -0
  486. sky/volumes/volume.py +215 -0
  487. sky/workspaces/__init__.py +0 -0
  488. sky/workspaces/core.py +655 -0
  489. sky/workspaces/server.py +101 -0
  490. sky/workspaces/utils.py +56 -0
  491. sky_templates/README.md +3 -0
  492. sky_templates/__init__.py +3 -0
  493. sky_templates/ray/__init__.py +0 -0
  494. sky_templates/ray/start_cluster +183 -0
  495. sky_templates/ray/stop_cluster +75 -0
  496. skypilot_nightly-1.0.0.dev20251203.dist-info/METADATA +676 -0
  497. skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
  498. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +1 -1
  499. skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
  500. sky/benchmark/benchmark_state.py +0 -256
  501. sky/benchmark/benchmark_utils.py +0 -641
  502. sky/clouds/service_catalog/constants.py +0 -7
  503. sky/dashboard/out/_next/static/GWvVBSCS7FmUiVmjaL1a7/_buildManifest.js +0 -1
  504. sky/dashboard/out/_next/static/chunks/236-2db3ee3fba33dd9e.js +0 -6
  505. sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
  506. sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
  507. sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
  508. sky/dashboard/out/_next/static/chunks/845-9e60713e0c441abc.js +0 -1
  509. sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
  510. sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
  511. sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
  512. sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
  513. sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
  514. sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
  515. sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
  516. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-6ac338bc2239cb45.js +0 -1
  517. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
  518. sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
  519. sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
  520. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-1c519e1afc523dc9.js +0 -1
  521. sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
  522. sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
  523. sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
  524. sky/jobs/dashboard/dashboard.py +0 -223
  525. sky/jobs/dashboard/static/favicon.ico +0 -0
  526. sky/jobs/dashboard/templates/index.html +0 -831
  527. sky/jobs/server/dashboard_utils.py +0 -69
  528. sky/skylet/providers/scp/__init__.py +0 -2
  529. sky/skylet/providers/scp/config.py +0 -149
  530. sky/skylet/providers/scp/node_provider.py +0 -578
  531. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  532. sky/utils/db_utils.py +0 -100
  533. sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
  534. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  535. skypilot_nightly-1.0.0.dev20250502.dist-info/METADATA +0 -361
  536. skypilot_nightly-1.0.0.dev20250502.dist-info/RECORD +0 -396
  537. skypilot_nightly-1.0.0.dev20250502.dist-info/top_level.txt +0 -1
  538. /sky/{clouds/service_catalog → catalog}/config.py +0 -0
  539. /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
  540. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
  541. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
  542. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
  543. /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
  544. /sky/dashboard/out/_next/static/{GWvVBSCS7FmUiVmjaL1a7 → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
  545. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
  546. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
@@ -1,4 +1,5 @@
  """Util constants/functions for the backends."""
+ import asyncio
  from datetime import datetime
  import enum
  import fnmatch
@@ -6,18 +7,23 @@ import hashlib
  import os
  import pathlib
  import pprint
+ import queue as queue_lib
  import re
  import shlex
  import subprocess
  import sys
  import tempfile
+ import threading
  import time
  import typing
- from typing import Any, Dict, List, Optional, Sequence, Set, Tuple, Union
+ from typing import (Any, Callable, Dict, Iterator, List, Optional, Sequence,
+ Set, Tuple, TypeVar, Union)
  import uuid

+ import aiohttp
+ from aiohttp import ClientTimeout
+ from aiohttp import TCPConnector
  import colorama
- import filelock
  from packaging import version
  from typing_extensions import Literal

@@ -28,30 +34,45 @@ from sky import check as sky_check
  from sky import clouds
  from sky import exceptions
  from sky import global_user_state
+ from sky import logs
  from sky import provision as provision_lib
  from sky import sky_logging
  from sky import skypilot_config
  from sky.adaptors import common as adaptors_common
+ from sky.jobs import utils as managed_job_utils
+ from sky.provision import common as provision_common
  from sky.provision import instance_setup
  from sky.provision.kubernetes import utils as kubernetes_utils
+ from sky.serve import serve_utils
+ from sky.server.requests import requests as requests_lib
+ from sky.skylet import autostop_lib
  from sky.skylet import constants
  from sky.usage import usage_lib
+ from sky.utils import auth_utils
  from sky.utils import cluster_utils
  from sky.utils import command_runner
  from sky.utils import common
  from sky.utils import common_utils
+ from sky.utils import context as context_lib
+ from sky.utils import context_utils
  from sky.utils import controller_utils
  from sky.utils import env_options
+ from sky.utils import locks
  from sky.utils import registry
  from sky.utils import resources_utils
  from sky.utils import rich_utils
  from sky.utils import schemas
  from sky.utils import status_lib
  from sky.utils import subprocess_utils
+ from sky.utils import tempstore
  from sky.utils import timeline
  from sky.utils import ux_utils
+ from sky.utils import volume as volume_utils
+ from sky.utils import yaml_utils
+ from sky.workspaces import core as workspaces_core

  if typing.TYPE_CHECKING:
+ import grpc
  import requests
  from requests import adapters
  from requests.packages.urllib3.util import retry as retry_lib
@@ -69,6 +90,8 @@ else:
  adapters = adaptors_common.LazyImport('requests.adapters')
  retry_lib = adaptors_common.LazyImport(
  'requests.packages.urllib3.util.retry')
+ # To avoid requiring grpcio to be installed on the client side.
+ grpc = adaptors_common.LazyImport('grpc')

  logger = sky_logging.init_logger(__name__)

@@ -91,6 +114,13 @@ _LAUNCHED_RESERVED_WORKER_PATTERN = re.compile(
  # 10.133.0.5: ray.worker.default,
  _LAUNCHING_IP_PATTERN = re.compile(
  r'({}): ray[._]worker[._](?:default|reserved)'.format(IP_ADDR_REGEX))
+ SSH_CONNECTION_ERROR_PATTERN = re.compile(
+ r'^ssh:.*(timed out|connection refused)$', re.IGNORECASE)
+ _SSH_CONNECTION_TIMED_OUT_PATTERN = re.compile(r'^ssh:.*timed out$',
+ re.IGNORECASE)
+ K8S_PODS_NOT_FOUND_PATTERN = re.compile(r'.*(NotFound|pods .* not found).*',
+ re.IGNORECASE)
+ _RAY_CLUSTER_NOT_FOUND_MESSAGE = 'Ray cluster is not found'
  WAIT_HEAD_NODE_IP_MAX_ATTEMPTS = 3

  # We check network connection by going through _TEST_IP_LIST. We may need to
@@ -98,24 +128,21 @@ WAIT_HEAD_NODE_IP_MAX_ATTEMPTS = 3
  # Fixed IP addresses are used to avoid DNS lookup blocking the check, for
  # machine with no internet connection.
  # Refer to: https://stackoverflow.com/questions/3764291/how-can-i-see-if-theres-an-available-and-active-network-connection-in-python # pylint: disable=line-too-long
- _TEST_IP_LIST = ['https://1.1.1.1', 'https://8.8.8.8']
+ _TEST_IP_LIST = ['https://8.8.8.8', 'https://1.1.1.1']

  # Allow each CPU thread take 2 tasks.
  # Note: This value cannot be too small, otherwise OOM issue may occur.
  DEFAULT_TASK_CPU_DEMAND = 0.5

- # Filelocks for the cluster status change.
- CLUSTER_STATUS_LOCK_PATH = os.path.expanduser('~/.sky/.{}.lock')
  CLUSTER_STATUS_LOCK_TIMEOUT_SECONDS = 20

  # Time that must elapse since the last status check before we should re-check if
  # the cluster has been terminated or autostopped.
  _CLUSTER_STATUS_CACHE_DURATION_SECONDS = 2

- # Filelocks for updating cluster's file_mounts.
- CLUSTER_FILE_MOUNTS_LOCK_PATH = os.path.expanduser(
- '~/.sky/.{}_file_mounts.lock')
  CLUSTER_FILE_MOUNTS_LOCK_TIMEOUT_SECONDS = 10
+ WORKSPACE_LOCK_TIMEOUT_SECONDS = 10
+ CLUSTER_TUNNEL_LOCK_TIMEOUT_SECONDS = 10.0

  # Remote dir that holds our runtime files.
  _REMOTE_RUNTIME_FILES_DIR = '~/.sky/.runtime_files'
@@ -124,7 +151,7 @@ _ENDPOINTS_RETRY_MESSAGE = ('If the cluster was recently started, '
  'please retry after a while.')

  # If a cluster is less than LAUNCH_DOUBLE_CHECK_WINDOW seconds old, and we don't
- # see any instances in the cloud, the instances might be in the proccess of
+ # see any instances in the cloud, the instances might be in the process of
  # being created. We will wait LAUNCH_DOUBLE_CHECK_DELAY seconds and then double
  # check to make sure there are still no instances. LAUNCH_DOUBLE_CHECK_DELAY
  # should be set longer than the delay between (sending the create instance
@@ -194,6 +221,9 @@ _RAY_YAML_KEYS_TO_REMOVE_FOR_HASH = [
  ('provider', 'availability_zone'),
  ]

+ _ACK_MESSAGE = 'ack'
+ _FORWARDING_FROM_MESSAGE = 'Forwarding from'
+

  def is_ip(s: str) -> bool:
  """Returns whether this string matches IP_ADDR_REGEX."""
@@ -212,7 +242,7 @@ def _get_yaml_path_from_cluster_name(cluster_name: str,
  # Add retry for the file mounts optimization, as the underlying cp command may
  # experience transient errors, #4758.
  @common_utils.retry
- def _optimize_file_mounts(yaml_path: str) -> None:
+ def _optimize_file_mounts(tmp_yaml_path: str) -> None:
  """Optimize file mounts in the given ray yaml file.

  Runtime files handling:
@@ -226,7 +256,7 @@ def _optimize_file_mounts(yaml_path: str) -> None:
  subprocess.CalledProcessError: If the file mounts are failed to be
  copied.
  """
- yaml_config = common_utils.read_yaml(yaml_path)
+ yaml_config = yaml_utils.read_yaml(tmp_yaml_path)

  file_mounts = yaml_config.get('file_mounts', {})
  # Remove the file mounts added by the newline.
@@ -242,7 +272,7 @@ def _optimize_file_mounts(yaml_path: str) -> None:
  # - use a remote command to move all runtime files to their right places.

  # Local tmp dir holding runtime files.
- local_runtime_files_dir = tempfile.mkdtemp()
+ local_runtime_files_dir = tempstore.mkdtemp()
  new_file_mounts = {_REMOTE_RUNTIME_FILES_DIR: local_runtime_files_dir}

  # Generate local_src -> unique_name.
@@ -310,7 +340,7 @@ def _optimize_file_mounts(yaml_path: str) -> None:
  shell=True,
  check=True)

- common_utils.dump_yaml(yaml_path, yaml_config)
+ yaml_utils.dump_yaml(tmp_yaml_path, yaml_config)


  def path_size_megabytes(path: str) -> int:
@@ -339,7 +369,13 @@ def path_size_megabytes(path: str) -> int:
  f'{git_exclude_filter} --dry-run {path!r}')
  rsync_output = ''
  try:
- rsync_output = str(subprocess.check_output(rsync_command, shell=True))
+ # rsync sometimes fails `--dry-run` for MacOS' rsync build, however this function is only used to display
+ # a warning message to the user if the size of a file/directory is too
+ # large, so we can safely ignore the error.
+ rsync_output = str(
+ subprocess.check_output(rsync_command,
+ shell=True,
+ stderr=subprocess.DEVNULL))
  except subprocess.CalledProcessError:
  logger.debug('Command failed, proceeding without estimating size: '
  f'{rsync_command}')
@@ -464,8 +500,8 @@ def _replace_yaml_dicts(
  if key in old_block:
  _restore_block(value, old_block[key])

- new_config = yaml.safe_load(new_yaml)
- old_config = yaml.safe_load(old_yaml)
+ new_config = yaml_utils.safe_load(new_yaml)
+ old_config = yaml_utils.safe_load(old_yaml)
  excluded_results = {}
  # Find all key values excluded from restore
  for exclude_restore_key_name_list in restore_key_names_exceptions:
@@ -489,7 +525,7 @@ def _replace_yaml_dicts(
  for key in exclude_restore_key_name[:-1]:
  curr = curr[key]
  curr[exclude_restore_key_name[-1]] = value
- return common_utils.dump_yaml_str(new_config)
+ return yaml_utils.dump_yaml_str(new_config)


  def get_expirable_clouds(
@@ -509,11 +545,55 @@ def get_expirable_clouds(
  expirable_clouds = []
  local_credentials_value = schemas.RemoteIdentityOptions.LOCAL_CREDENTIALS.value
  for cloud in enabled_clouds:
- remote_identities = skypilot_config.get_nested(
- (str(cloud).lower(), 'remote_identity'), None)
- if remote_identities is None:
- remote_identities = schemas.get_default_remote_identity(
- str(cloud).lower())
+ # Kubernetes config might have context-specific properties
+ if isinstance(cloud, clouds.Kubernetes):
+ # get all custom contexts
+ contexts = kubernetes_utils.get_custom_config_k8s_contexts()
+ # add remote_identity of each context if it exists
+ remote_identities: Optional[Union[str, List[Dict[str, str]]]] = None
+ for context in contexts:
+ context_remote_identity = skypilot_config.get_effective_region_config(
+ cloud='kubernetes',
+ region=context,
+ keys=('remote_identity',),
+ default_value=None)
+ if context_remote_identity is not None:
+ if remote_identities is None:
+ remote_identities = []
+ if isinstance(context_remote_identity, str):
+ assert isinstance(remote_identities, list)
+ remote_identities.append(
+ {context: context_remote_identity})
+ elif isinstance(context_remote_identity, list):
+ assert isinstance(remote_identities, list)
+ remote_identities.extend(context_remote_identity)
+ # add global kubernetes remote identity if it exists, if not, add default
+ global_remote_identity = skypilot_config.get_effective_region_config(
+ cloud='kubernetes',
+ region=None,
+ keys=('remote_identity',),
+ default_value=None)
+ if global_remote_identity is not None:
+ if remote_identities is None:
+ remote_identities = []
+ if isinstance(global_remote_identity, str):
+ assert isinstance(remote_identities, list)
+ remote_identities.append({'*': global_remote_identity})
+ elif isinstance(global_remote_identity, list):
+ assert isinstance(remote_identities, list)
+ remote_identities.extend(global_remote_identity)
+ if remote_identities is None:
+ remote_identities = schemas.get_default_remote_identity(
+ str(cloud).lower())
+ else:
+ remote_identities = skypilot_config.get_effective_region_config(
+ cloud=str(cloud).lower(),
+ region=None,
+ keys=('remote_identity',),
+ default_value=None)
+ if remote_identities is None:
+ remote_identities = schemas.get_default_remote_identity(
+ str(cloud).lower())

  local_credential_expiring = cloud.can_credential_expire()
  if isinstance(remote_identities, str):
@@ -528,19 +608,26 @@ def get_expirable_clouds(
  return expirable_clouds


+ def _get_volume_name(path: str, cluster_name_on_cloud: str) -> str:
+ path_hash = hashlib.md5(path.encode()).hexdigest()[:6]
+ return f'{cluster_name_on_cloud}-{path_hash}'
+
+
  # TODO: too many things happening here - leaky abstraction. Refactor.
  @timeline.event
  def write_cluster_config(
- to_provision: 'resources_lib.Resources',
- num_nodes: int,
- cluster_config_template: str,
- cluster_name: str,
- local_wheel_path: pathlib.Path,
- wheel_hash: str,
- region: clouds.Region,
- zones: Optional[List[clouds.Zone]] = None,
- dryrun: bool = False,
- keep_launch_fields_in_existing_config: bool = True) -> Dict[str, str]:
+ to_provision: 'resources_lib.Resources',
+ num_nodes: int,
+ cluster_config_template: str,
+ cluster_name: str,
+ local_wheel_path: pathlib.Path,
+ wheel_hash: str,
+ region: clouds.Region,
+ zones: Optional[List[clouds.Zone]] = None,
+ dryrun: bool = False,
+ keep_launch_fields_in_existing_config: bool = True,
+ volume_mounts: Optional[List['volume_utils.VolumeMount']] = None,
+ ) -> Dict[str, str]:
  """Fills in cluster configuration templates and writes them out.

  Returns:
@@ -588,12 +675,15 @@ def write_cluster_config(
  resources_utils.ClusterName(
  cluster_name,
  cluster_name_on_cloud,
- ), region, zones, num_nodes, dryrun)
+ ), region, zones, num_nodes, dryrun, volume_mounts)
  config_dict = {}

  specific_reservations = set(
- skypilot_config.get_nested(
- (str(to_provision.cloud).lower(), 'specific_reservations'), set()))
+ skypilot_config.get_effective_region_config(
+ cloud=str(to_provision.cloud).lower(),
+ region=to_provision.region,
+ keys=('specific_reservations',),
+ default_value=set()))

  # Remote identity handling can have 4 cases:
  # 1. LOCAL_CREDENTIALS (default for most clouds): Upload local credentials
@@ -605,9 +695,12 @@ def write_cluster_config(
  # other cases, we exclude the cloud from credential file uploads after
  # running required checks.
  assert cluster_name is not None
- excluded_clouds = set()
- remote_identity_config = skypilot_config.get_nested(
- (str(cloud).lower(), 'remote_identity'), None)
+ excluded_clouds: Set[clouds.Cloud] = set()
+ remote_identity_config = skypilot_config.get_effective_region_config(
+ cloud=str(cloud).lower(),
+ region=region.name,
+ keys=('remote_identity',),
+ default_value=None)
  remote_identity = schemas.get_default_remote_identity(str(cloud).lower())
  if isinstance(remote_identity_config, str):
  remote_identity = remote_identity_config
@@ -636,15 +729,25 @@ def write_cluster_config(
  'is not supported by this cloud. Remove the config or set: '
  '`remote_identity: LOCAL_CREDENTIALS`.')
  if isinstance(cloud, clouds.Kubernetes):
- if skypilot_config.get_nested(
- ('kubernetes', 'allowed_contexts'), None) is None:
+ allowed_contexts = skypilot_config.get_workspace_cloud(
+ 'kubernetes').get('allowed_contexts', None)
+ if allowed_contexts is None:
+ allowed_contexts = skypilot_config.get_effective_region_config(
+ cloud='kubernetes',
+ region=None,
+ keys=('allowed_contexts',),
+ default_value=None)
+ if allowed_contexts is None:
  excluded_clouds.add(cloud)
  else:
  excluded_clouds.add(cloud)

  for cloud_str, cloud_obj in registry.CLOUD_REGISTRY.items():
- remote_identity_config = skypilot_config.get_nested(
- (cloud_str.lower(), 'remote_identity'), None)
+ remote_identity_config = skypilot_config.get_effective_region_config(
+ cloud=cloud_str.lower(),
+ region=region.name,
+ keys=('remote_identity',),
+ default_value=None)
  if remote_identity_config:
  if (remote_identity_config ==
  schemas.RemoteIdentityOptions.NO_UPLOAD.value):
@@ -652,15 +755,24 @@ def write_cluster_config(

  credentials = sky_check.get_cloud_credential_file_mounts(excluded_clouds)

- private_key_path, _ = auth.get_or_generate_keys()
+ logging_agent = logs.get_logging_agent()
+ if logging_agent:
+ for k, v in logging_agent.get_credential_file_mounts().items():
+ assert k not in credentials, f'{k} already in credentials'
+ credentials[k] = v
+
+ private_key_path, _ = auth_utils.get_or_generate_keys()
  auth_config = {'ssh_private_key': private_key_path}
  region_name = resources_vars.get('region')

  yaml_path = _get_yaml_path_from_cluster_name(cluster_name)

  # Retrieve the ssh_proxy_command for the given cloud / region.
- ssh_proxy_command_config = skypilot_config.get_nested(
- (str(cloud).lower(), 'ssh_proxy_command'), None)
+ ssh_proxy_command_config = skypilot_config.get_effective_region_config(
+ cloud=str(cloud).lower(),
+ region=None,
+ keys=('ssh_proxy_command',),
+ default_value=None)
  if (isinstance(ssh_proxy_command_config, str) or
  ssh_proxy_command_config is None):
  ssh_proxy_command = ssh_proxy_command_config
@@ -683,10 +795,63 @@ def write_cluster_config(
  assert region_name in ssh_proxy_command_config, (
  region_name, ssh_proxy_command_config)
  ssh_proxy_command = ssh_proxy_command_config[region_name]
+
+ use_internal_ips = skypilot_config.get_effective_region_config(
+ cloud=str(cloud).lower(),
+ region=region.name,
+ keys=('use_internal_ips',),
+ default_value=False)
+ if isinstance(cloud, clouds.AWS):
+ # If the use_ssm flag is set to true, we use the ssm proxy command.
+ use_ssm = skypilot_config.get_effective_region_config(
+ cloud=str(cloud).lower(),
+ region=region.name,
+ keys=('use_ssm',),
+ default_value=None)
+
+ if use_ssm and ssh_proxy_command is not None:
+ raise exceptions.InvalidCloudConfigs(
+ 'use_ssm is set to true, but ssh_proxy_command '
+ f'is already set to {ssh_proxy_command!r}. Please remove '
+ 'ssh_proxy_command or set use_ssm to false.')
+
+ if use_internal_ips and ssh_proxy_command is None:
+ # Only if use_ssm is explicitly not set, we default to using SSM.
+ if use_ssm is None:
+ logger.warning(
+ f'{colorama.Fore.YELLOW}'
+ 'use_internal_ips is set to true, '
+ 'but ssh_proxy_command is not set. Defaulting to '
+ 'using SSM. Specify ssh_proxy_command to use a different '
+ 'https://docs.skypilot.co/en/latest/reference/config.html#'
+ f'aws.ssh_proxy_command.{colorama.Style.RESET_ALL}')
+ use_ssm = True
+
+ if use_ssm:
+ aws_profile = os.environ.get('AWS_PROFILE', None)
+ profile_str = f'--profile {aws_profile}' if aws_profile else ''
+ ip_address_filter = ('Name=private-ip-address,Values=%h'
+ if use_internal_ips else
+ 'Name=ip-address,Values=%h')
+ get_instance_id_command = 'aws ec2 describe-instances ' + \
+ f'--region {region_name} --filters {ip_address_filter} ' + \
+ '--query \"Reservations[].Instances[].InstanceId\" ' + \
+ f'{profile_str} --output text'
+ ssm_proxy_command = 'aws ssm start-session --target ' + \
+ f'\"$({get_instance_id_command})\" ' + \
+ f'--region {region_name} {profile_str} ' + \
+ '--document-name AWS-StartSSHSession ' + \
+ '--parameters portNumber=%p'
+ ssh_proxy_command = ssm_proxy_command
+ region_name = 'ssm-session'
  logger.debug(f'Using ssh_proxy_command: {ssh_proxy_command!r}')

  # User-supplied global instance tags from ~/.sky/config.yaml.
- labels = skypilot_config.get_nested((str(cloud).lower(), 'labels'), {})
+ labels = skypilot_config.get_effective_region_config(
+ cloud=str(cloud).lower(),
+ region=region.name,
+ keys=('labels',),
+ default_value={})
  # labels is a dict, which is guaranteed by the type check in
  # schemas.py
  assert isinstance(labels, dict), labels
@@ -695,12 +860,6 @@ def write_cluster_config(
  if to_provision.labels:
  labels.update(to_provision.labels)

- # Dump the Ray ports to a file for Ray job submission
- dump_port_command = (
- f'{constants.SKY_PYTHON_CMD} -c \'import json, os; json.dump({constants.SKY_REMOTE_RAY_PORT_DICT_STR}, '
- f'open(os.path.expanduser("{constants.SKY_REMOTE_RAY_PORT_FILE}"), "w", encoding="utf-8"))\''
- )
-
  # We disable conda auto-activation if the user has specified a docker image
  # to use, which is likely to already have a conda environment activated.
  conda_auto_activate = ('true' if to_provision.extract_docker_image() is None
@@ -715,6 +874,32 @@ def write_cluster_config(
  high_availability_specified = controller_utils.high_availability_specified(
  cluster_name)

+ volume_mount_vars = []
+ ephemeral_volume_mount_vars = []
+ if volume_mounts is not None:
+ for vol in volume_mounts:
+ if vol.is_ephemeral:
+ volume_name = _get_volume_name(vol.path, cluster_name_on_cloud)
+ vol.volume_name = volume_name
+ vol.volume_config.cloud = repr(cloud)
+ vol.volume_config.region = region.name
+ vol.volume_config.name = volume_name
+ ephemeral_volume_mount_vars.append(vol.to_yaml_config())
+ else:
+ volume_info = volume_utils.VolumeInfo(
+ name=vol.volume_name,
+ path=vol.path,
+ volume_name_on_cloud=vol.volume_config.name_on_cloud,
+ volume_id_on_cloud=vol.volume_config.id_on_cloud,
+ )
+ volume_mount_vars.append(volume_info)
+
+ runcmd = skypilot_config.get_effective_region_config(
+ cloud=str(to_provision.cloud).lower(),
+ region=to_provision.region,
+ keys=('post_provision_runcmd',),
+ default_value=None)
+
  # Use a tmp file path to avoid incomplete YAML file being re-used in the
  # future.
  tmp_yaml_path = yaml_path + '.tmp'
@@ -734,18 +919,23 @@ def write_cluster_config(
  os.environ.get(constants.USER_ENV_VAR, '')),

  # Networking configs
- 'use_internal_ips': skypilot_config.get_nested(
- (str(cloud).lower(), 'use_internal_ips'), False),
+ 'use_internal_ips': skypilot_config.get_effective_region_config(
+ cloud=str(cloud).lower(),
+ region=region.name,
+ keys=('use_internal_ips',),
+ default_value=False),
  'ssh_proxy_command': ssh_proxy_command,
- 'vpc_name': skypilot_config.get_nested(
- (str(cloud).lower(), 'vpc_name'), None),
-
+ 'vpc_name': skypilot_config.get_effective_region_config(
+ cloud=str(cloud).lower(),
+ region=region.name,
+ keys=('vpc_name',),
+ default_value=None),
  # User-supplied labels.
  'labels': labels,
  # User-supplied remote_identity
  'remote_identity': remote_identity,
  # The reservation pools that specified by the user. This is
- # currently only used by GCP.
+ # currently only used by AWS and GCP.
  'specific_reservations': specific_reservations,

  # Conda setup
@@ -766,12 +956,14 @@ def write_cluster_config(
  '{sky_wheel_hash}',
  wheel_hash).replace('{cloud}',
  str(cloud).lower()),
+ 'copy_skypilot_templates_commands':
+ constants.COPY_SKYPILOT_TEMPLATES_COMMANDS,
  # Port of Ray (GCS server).
  # Ray's default port 6379 is conflicted with Redis.
  'ray_port': constants.SKY_REMOTE_RAY_PORT,
  'ray_dashboard_port': constants.SKY_REMOTE_RAY_DASHBOARD_PORT,
  'ray_temp_dir': constants.SKY_REMOTE_RAY_TEMPDIR,
- 'dump_port_command': dump_port_command,
+ 'dump_port_command': instance_setup.DUMP_RAY_PORTS,
  # Sky-internal constants.
  'sky_ray_cmd': constants.SKY_RAY_CMD,
  # pip install needs to have python env activated to make sure
@@ -805,6 +997,14 @@ def write_cluster_config(

  # High availability
  'high_availability': high_availability_specified,
+
+ # Volume mounts
+ 'volume_mounts': volume_mount_vars,
+ 'ephemeral_volume_mounts': ephemeral_volume_mount_vars,
+
+ # runcmd to run before any of the SkyPilot runtime setup commands.
+ # This is currently only used by AWS and Kubernetes.
+ 'runcmd': runcmd,
  }),
  output_path=tmp_yaml_path)
  config_dict['cluster_name'] = cluster_name
@@ -812,14 +1012,20 @@ def write_cluster_config(

  # Add kubernetes config fields from ~/.sky/config
  if isinstance(cloud, clouds.Kubernetes):
- kubernetes_utils.combine_pod_config_fields(
- tmp_yaml_path,
- cluster_config_overrides=to_provision.cluster_config_overrides)
- kubernetes_utils.combine_metadata_fields(tmp_yaml_path)
- yaml_obj = common_utils.read_yaml(tmp_yaml_path)
- pod_config: Dict[str, Any] = yaml_obj['available_node_types'][
+ cluster_config_overrides = to_provision.cluster_config_overrides
+ with open(tmp_yaml_path, 'r', encoding='utf-8') as f:
+ tmp_yaml_str = f.read()
+ cluster_yaml_obj = yaml_utils.safe_load(tmp_yaml_str)
+ combined_yaml_obj = kubernetes_utils.combine_pod_config_fields_and_metadata(
+ cluster_yaml_obj,
+ cluster_config_overrides=cluster_config_overrides,
+ cloud=cloud,
+ context=region.name)
+ # Write the updated YAML back to the file
+ yaml_utils.dump_yaml(tmp_yaml_path, combined_yaml_obj)
+
+ pod_config: Dict[str, Any] = combined_yaml_obj['available_node_types'][
  'ray_head_default']['node_config']
-
  # Check pod spec only. For high availability controllers, we deploy pvc & deployment for the controller. Read kubernetes-ray.yml.j2 for more details.
  pod_config.pop('deployment_spec', None)
  pod_config.pop('pvc_spec', None)
@@ -841,9 +1047,8 @@ def write_cluster_config(
  _add_auth_to_cluster_config(cloud, tmp_yaml_path)

  # Restore the old yaml content for backward compatibility.
- if os.path.exists(yaml_path) and keep_launch_fields_in_existing_config:
- with open(yaml_path, 'r', encoding='utf-8') as f:
- old_yaml_content = f.read()
+ old_yaml_content = global_user_state.get_cluster_yaml_str(yaml_path)
+ if old_yaml_content is not None and keep_launch_fields_in_existing_config:
  with open(tmp_yaml_path, 'r', encoding='utf-8') as f:
  new_yaml_content = f.read()
  restored_yaml_content = _replace_yaml_dicts(
@@ -853,11 +1058,7 @@ def write_cluster_config(
  with open(tmp_yaml_path, 'w', encoding='utf-8') as f:
  f.write(restored_yaml_content)

- # Read the cluster name from the tmp yaml file, to take the backward
- # compatbility restortion above into account.
- # TODO: remove this after 2 minor releases, 0.10.0.
- yaml_config = common_utils.read_yaml(tmp_yaml_path)
- config_dict['cluster_name_on_cloud'] = yaml_config['cluster_name']
+ config_dict['cluster_name_on_cloud'] = cluster_name_on_cloud

  # Make sure to do this before we optimize file mounts. Optimization is
  # non-deterministic, but everything else before this point should be
@@ -880,18 +1081,29 @@ def write_cluster_config(
880
1081
  # compatibility should go before this call.
881
1082
  _optimize_file_mounts(tmp_yaml_path)
882
1083
 
883
- # Rename the tmp file to the final YAML path.
884
- os.rename(tmp_yaml_path, yaml_path)
885
- usage_lib.messages.usage.update_ray_yaml(yaml_path)
1084
+ # commit the final yaml to the database
1085
+ global_user_state.set_cluster_yaml(
1086
+ cluster_name,
1087
+ open(tmp_yaml_path, 'r', encoding='utf-8').read())
1088
+
1089
+ usage_lib.messages.usage.update_ray_yaml(tmp_yaml_path)
1090
+
1091
+ # Keep the tmp file as a .debug copy under debug logging; otherwise remove it.
1092
+ if sky_logging.logging_enabled(logger, sky_logging.DEBUG):
1093
+ debug_yaml_path = yaml_path + '.debug'
1094
+ os.rename(tmp_yaml_path, debug_yaml_path)
1095
+ else:
1096
+ os.remove(tmp_yaml_path)
1097
+
886
1098
  return config_dict
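With this change the final cluster YAML is committed to the database via global_user_state.set_cluster_yaml instead of being renamed into yaml_path, and the temporary file only survives as a .debug copy when debug logging is enabled. A minimal sketch of the store-YAML-text-in-a-table pattern, using a hypothetical SQLite schema keyed by cluster name (SkyPilot's real state store and keys differ):

import sqlite3

# Hypothetical schema; illustrative only.
conn = sqlite3.connect(':memory:')
conn.execute(
    'CREATE TABLE cluster_yaml (cluster_name TEXT PRIMARY KEY, yaml TEXT)')

def set_cluster_yaml(cluster_name: str, yaml_str: str) -> None:
    conn.execute(
        'INSERT INTO cluster_yaml (cluster_name, yaml) VALUES (?, ?) '
        'ON CONFLICT(cluster_name) DO UPDATE SET yaml = excluded.yaml',
        (cluster_name, yaml_str))
    conn.commit()

def get_cluster_yaml_str(cluster_name: str):
    row = conn.execute(
        'SELECT yaml FROM cluster_yaml WHERE cluster_name = ?',
        (cluster_name,)).fetchone()
    return row[0] if row else None

set_cluster_yaml('my-cluster', 'cluster_name: my-cluster\n')
print(get_cluster_yaml_str('my-cluster'))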
887
1099
 
888
1100
 
889
- def _add_auth_to_cluster_config(cloud: clouds.Cloud, cluster_config_file: str):
1101
+ def _add_auth_to_cluster_config(cloud: clouds.Cloud, tmp_yaml_path: str):
890
1102
  """Adds SSH key info to the cluster config.
891
1103
 
892
1104
  This function's output removes comments included in the jinja2 template.
893
1105
  """
894
- config = common_utils.read_yaml(cluster_config_file)
1106
+ config = yaml_utils.read_yaml(tmp_yaml_path)
895
1107
  # Check the availability of the cloud type.
896
1108
  if isinstance(cloud, (
897
1109
  clouds.AWS,
@@ -919,9 +1131,17 @@ def _add_auth_to_cluster_config(cloud: clouds.Cloud, cluster_config_file: str):
919
1131
  config = auth.setup_vast_authentication(config)
920
1132
  elif isinstance(cloud, clouds.Fluidstack):
921
1133
  config = auth.setup_fluidstack_authentication(config)
1134
+ elif isinstance(cloud, clouds.Hyperbolic):
1135
+ config = auth.setup_hyperbolic_authentication(config)
1136
+ elif isinstance(cloud, clouds.Shadeform):
1137
+ config = auth.setup_shadeform_authentication(config)
1138
+ elif isinstance(cloud, clouds.PrimeIntellect):
1139
+ config = auth.setup_primeintellect_authentication(config)
1140
+ elif isinstance(cloud, clouds.Seeweb):
1141
+ config = auth.setup_seeweb_authentication(config)
922
1142
  else:
923
1143
  assert False, cloud
924
- common_utils.dump_yaml(cluster_config_file, config)
1144
+ yaml_utils.dump_yaml(tmp_yaml_path, config)
925
1145
 
926
1146
 
927
1147
  def get_timestamp_from_run_timestamp(run_timestamp: str) -> float:
@@ -979,7 +1199,7 @@ def _count_healthy_nodes_from_ray(output: str,
979
1199
 
980
1200
 
981
1201
  @timeline.event
982
- def _deterministic_cluster_yaml_hash(yaml_path: str) -> str:
1202
+ def _deterministic_cluster_yaml_hash(tmp_yaml_path: str) -> str:
983
1203
  """Hash the cluster yaml and contents of file mounts to a unique string.
984
1204
 
985
1205
  Two invocations of this function should return the same string if and only
@@ -1021,9 +1241,8 @@ def _deterministic_cluster_yaml_hash(yaml_path: str) -> str:
1021
1241
  Rather than constructing the whole byte sequence, which may be quite large,
1022
1242
  we construct it incrementally by using hash.update() to add new bytes.
1023
1243
  """
1024
-
1025
1244
  # Load the yaml contents so that we can directly remove keys.
1026
- yaml_config = common_utils.read_yaml(yaml_path)
1245
+ yaml_config = yaml_utils.read_yaml(tmp_yaml_path)
1027
1246
  for key_list in _RAY_YAML_KEYS_TO_REMOVE_FOR_HASH:
1028
1247
  dict_to_remove_from = yaml_config
1029
1248
  found_key = True
@@ -1042,7 +1261,7 @@ def _deterministic_cluster_yaml_hash(yaml_path: str) -> str:
1042
1261
  config_hash = hashlib.sha256()
1043
1262
 
1044
1263
  yaml_hash = hashlib.sha256(
1045
- common_utils.dump_yaml_str(yaml_config).encode('utf-8'))
1264
+ yaml_utils.dump_yaml_str(yaml_config).encode('utf-8'))
1046
1265
  config_hash.update(yaml_hash.digest())
1047
1266
 
1048
1267
  file_mounts = yaml_config.get('file_mounts', {})
@@ -1052,7 +1271,7 @@ def _deterministic_cluster_yaml_hash(yaml_path: str) -> str:
1052
1271
  file_mounts.pop('')
1053
1272
 
1054
1273
  for dst, src in sorted(file_mounts.items()):
1055
- if src == yaml_path:
1274
+ if src == tmp_yaml_path:
1056
1275
  # Skip the yaml file itself. We have already hashed a modified
1057
1276
  # version of it. The file may include fields we don't want to hash.
1058
1277
  continue
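The hash above is built incrementally: the sanitized YAML is digested first, then each file mount's path and contents are fed into the same SHA-256 object with hash.update(), so large mounts never have to be concatenated in memory. A self-contained sketch of the same idea, using JSON for canonical serialization and hashing only regular files (the real function also walks directories and includes file metadata):

import hashlib
import json
import os

def config_and_mounts_hash(config: dict, file_mounts: dict) -> str:
    h = hashlib.sha256()
    # Canonical serialization of the (already sanitized) config.
    h.update(json.dumps(config, sort_keys=True).encode('utf-8'))
    for dst, src in sorted(file_mounts.items()):
        h.update(dst.encode('utf-8'))
        if os.path.isfile(src):
            with open(src, 'rb') as f:
                for chunk in iter(lambda: f.read(1 << 20), b''):
                    h.update(chunk)
    return h.hexdigest()

print(config_and_mounts_hash({'cluster_name': 'demo'}, {}))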
@@ -1147,7 +1366,7 @@ def wait_until_ray_cluster_ready(
1147
1366
  logger.error(common_utils.format_exception(e))
1148
1367
  return False, None # failed
1149
1368
 
1150
- config = common_utils.read_yaml(cluster_config_file)
1369
+ config = global_user_state.get_cluster_yaml_dict(cluster_config_file)
1151
1370
 
1152
1371
  docker_user = None
1153
1372
  if 'docker' in config:
@@ -1247,11 +1466,11 @@ def ssh_credential_from_yaml(
1247
1466
  """
1248
1467
  if cluster_yaml is None:
1249
1468
  return dict()
1250
- config = common_utils.read_yaml(cluster_yaml)
1469
+ config = global_user_state.get_cluster_yaml_dict(cluster_yaml)
1251
1470
  auth_section = config['auth']
1252
1471
  if ssh_user is None:
1253
1472
  ssh_user = auth_section['ssh_user'].strip()
1254
- ssh_private_key = auth_section.get('ssh_private_key')
1473
+ ssh_private_key_path = auth_section.get('ssh_private_key')
1255
1474
  ssh_control_name = config.get('cluster_name', '__default__')
1256
1475
  ssh_proxy_command = auth_section.get('ssh_proxy_command')
1257
1476
 
@@ -1260,9 +1479,10 @@ def ssh_credential_from_yaml(
1260
1479
  constants.SKY_SSH_USER_PLACEHOLDER in ssh_proxy_command):
1261
1480
  ssh_proxy_command = ssh_proxy_command.replace(
1262
1481
  constants.SKY_SSH_USER_PLACEHOLDER, ssh_user)
1482
+
1263
1483
  credentials = {
1264
1484
  'ssh_user': ssh_user,
1265
- 'ssh_private_key': ssh_private_key,
1485
+ 'ssh_private_key': ssh_private_key_path,
1266
1486
  'ssh_control_name': ssh_control_name,
1267
1487
  'ssh_proxy_command': ssh_proxy_command,
1268
1488
  }
@@ -1275,6 +1495,62 @@ def ssh_credential_from_yaml(
1275
1495
  return credentials
1276
1496
 
1277
1497
 
1498
+ def ssh_credentials_from_handles(
1499
+ handles: List['cloud_vm_ray_backend.CloudVmRayResourceHandle'],
1500
+ ) -> List[Dict[str, Any]]:
1501
+ """Returns ssh_user, ssh_private_key and ssh_control name.
1502
+ """
1503
+ non_empty_cluster_yaml_paths = [
1504
+ handle.cluster_yaml
1505
+ for handle in handles
1506
+ if handle.cluster_yaml is not None
1507
+ ]
1508
+ cluster_yaml_dicts = global_user_state.get_cluster_yaml_dict_multiple(
1509
+ non_empty_cluster_yaml_paths)
1510
+ cluster_yaml_dicts_to_index = {
1511
+ cluster_yaml_path: cluster_yaml_dict
1512
+ for cluster_yaml_path, cluster_yaml_dict in zip(
1513
+ non_empty_cluster_yaml_paths, cluster_yaml_dicts)
1514
+ }
1515
+
1516
+ credentials_to_return: List[Dict[str, Any]] = []
1517
+ for handle in handles:
1518
+ if handle.cluster_yaml is None:
1519
+ credentials_to_return.append(dict())
1520
+ continue
1521
+ ssh_user = handle.ssh_user
1522
+ docker_user = handle.docker_user
1523
+ config = cluster_yaml_dicts_to_index[handle.cluster_yaml]
1524
+ auth_section = config['auth']
1525
+ if ssh_user is None:
1526
+ ssh_user = auth_section['ssh_user'].strip()
1527
+ ssh_private_key_path = auth_section.get('ssh_private_key')
1528
+ ssh_control_name = config.get('cluster_name', '__default__')
1529
+ ssh_proxy_command = auth_section.get('ssh_proxy_command')
1530
+
1531
+ # Update the ssh_user placeholder in proxy command, if required
1532
+ if (ssh_proxy_command is not None and
1533
+ constants.SKY_SSH_USER_PLACEHOLDER in ssh_proxy_command):
1534
+ ssh_proxy_command = ssh_proxy_command.replace(
1535
+ constants.SKY_SSH_USER_PLACEHOLDER, ssh_user)
1536
+
1537
+ credentials = {
1538
+ 'ssh_user': ssh_user,
1539
+ 'ssh_private_key': ssh_private_key_path,
1540
+ 'ssh_control_name': ssh_control_name,
1541
+ 'ssh_proxy_command': ssh_proxy_command,
1542
+ }
1543
+ if docker_user is not None:
1544
+ credentials['docker_user'] = docker_user
1545
+ ssh_provider_module = config['provider']['module']
1546
+ # If we are running ssh command on kubernetes node.
1547
+ if 'kubernetes' in ssh_provider_module:
1548
+ credentials['disable_control_master'] = True
1549
+ credentials_to_return.append(credentials)
1550
+
1551
+ return credentials_to_return
1552
+
1553
+
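ssh_credentials_from_handles avoids one YAML lookup per handle by collecting the non-empty cluster_yaml keys, fetching them in a single batched call, and indexing the results before assembling each credentials dict. A minimal sketch of that batch-then-index pattern, with fetch_many standing in for the single database round trip:

from typing import Any, Dict, List, Optional

def fetch_many(keys: List[str]) -> List[Dict[str, Any]]:
    # Stand-in for one batched DB query (hypothetical).
    return [{'auth': {'ssh_user': 'ubuntu'}, 'cluster_name': k} for k in keys]

def credentials_for(keys: List[Optional[str]]) -> List[Dict[str, Any]]:
    non_empty = [k for k in keys if k is not None]
    by_key = dict(zip(non_empty, fetch_many(non_empty)))  # one round trip
    out: List[Dict[str, Any]] = []
    for key in keys:
        if key is None:
            out.append({})
            continue
        config = by_key[key]
        out.append({'ssh_user': config['auth']['ssh_user'],
                    'ssh_control_name': config['cluster_name']})
    return out

print(credentials_for(['c1.yaml', None, 'c2.yaml']))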
1278
1554
  def parallel_data_transfer_to_nodes(
1279
1555
  runners: List[command_runner.CommandRunner],
1280
1556
  source: Optional[str],
@@ -1435,7 +1711,7 @@ def get_node_ips(cluster_yaml: str,
1435
1711
  exceptions.FetchClusterInfoError: if we failed to get the IPs. e.reason is
1436
1712
  HEAD or WORKER.
1437
1713
  """
1438
- ray_config = common_utils.read_yaml(cluster_yaml)
1714
+ ray_config = global_user_state.get_cluster_yaml_dict(cluster_yaml)
1439
1715
  # Use the new provisioner for AWS.
1440
1716
  provider_name = cluster_utils.get_provider_name(ray_config)
1441
1717
  cloud = registry.CLOUD_REGISTRY.from_str(provider_name)
@@ -1523,18 +1799,54 @@ def get_node_ips(cluster_yaml: str,
1523
1799
 
1524
1800
  def check_network_connection():
1525
1801
  # Tolerate 3 retries as it is observed that connections can fail.
1526
- adapter = adapters.HTTPAdapter(max_retries=retry_lib.Retry(total=3))
1527
1802
  http = requests.Session()
1528
- http.mount('https://', adapter)
1529
- http.mount('http://', adapter)
1530
- for i, ip in enumerate(_TEST_IP_LIST):
1531
- try:
1532
- http.head(ip, timeout=3)
1533
- return
1534
- except (requests.Timeout, requests.exceptions.ConnectionError) as e:
1535
- if i == len(_TEST_IP_LIST) - 1:
1536
- raise exceptions.NetworkError('Could not refresh the cluster. '
1537
- 'Network seems down.') from e
1803
+ http.mount('https://', adapters.HTTPAdapter())
1804
+ http.mount('http://', adapters.HTTPAdapter())
1805
+
1806
+ # Alternate between IPs on each retry
1807
+ max_retries = 3
1808
+ timeout = 0.5
1809
+
1810
+ for _ in range(max_retries):
1811
+ for ip in _TEST_IP_LIST:
1812
+ try:
1813
+ http.head(ip, timeout=timeout)
1814
+ return
1815
+ except (requests.Timeout, requests.exceptions.ConnectionError):
1816
+ continue
1817
+
1818
+ timeout *= 2 # Double the timeout for next retry
1819
+
1820
+ # If we get here, all IPs failed
1821
+ # Assume network connection is down
1822
+ raise exceptions.NetworkError('Could not refresh the cluster. '
1823
+ 'Network seems down.')
1824
+
1825
+
1826
+ async def async_check_network_connection():
1827
+ """Check if the network connection is available.
1828
+
1829
+ Tolerates 3 retries as it is observed that connections can fail.
1830
+ Uses aiohttp for async HTTP requests.
1831
+ """
1832
+ # Create a session with retry logic
1833
+ timeout = ClientTimeout(total=15)
1834
+ connector = TCPConnector(limit=1) # Limit to 1 connection at a time
1835
+
1836
+ async with aiohttp.ClientSession(timeout=timeout,
1837
+ connector=connector) as session:
1838
+ for i, ip in enumerate(_TEST_IP_LIST):
1839
+ try:
1840
+ async with session.head(ip) as response:
1841
+ if response.status < 400: # Any 2xx or 3xx status is good
1842
+ return
1843
+ except (aiohttp.ClientError, asyncio.TimeoutError) as e:
1844
+ if i == len(_TEST_IP_LIST) - 1:
1845
+ raise exceptions.NetworkError(
1846
+ 'Could not refresh the cluster. '
1847
+ 'Network seems down.') from e
1848
+ # If not the last IP, continue to try the next one
1849
+ continue
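The rewritten check_network_connection alternates across the test IPs on every pass and doubles the per-request timeout between passes, rather than giving each IP one fixed-timeout attempt; async_check_network_connection provides an aiohttp-based equivalent for async callers. A self-contained sketch of the doubling-timeout probe (the endpoints and limits here are illustrative):

import requests

def probe_any(endpoints, max_retries=3, initial_timeout=0.5):
    """Return True as soon as any endpoint answers a HEAD request."""
    timeout = initial_timeout
    for _ in range(max_retries):
        for url in endpoints:
            try:
                requests.head(url, timeout=timeout)
                return True
            except (requests.Timeout, requests.exceptions.ConnectionError):
                continue
        timeout *= 2  # Back off before the next pass over all endpoints.
    return False

if not probe_any(['https://1.1.1.1', 'https://8.8.8.8']):
    raise RuntimeError('Network seems down.')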
1538
1850
 
1539
1851
 
1540
1852
  @timeline.event
@@ -1549,14 +1861,34 @@ def check_owner_identity(cluster_name: str) -> None:
1549
1861
  """
1550
1862
  if env_options.Options.SKIP_CLOUD_IDENTITY_CHECK.get():
1551
1863
  return
1552
- record = global_user_state.get_cluster_from_name(cluster_name)
1864
+ record = global_user_state.get_cluster_from_name(cluster_name,
1865
+ include_user_info=False,
1866
+ summary_response=True)
1553
1867
  if record is None:
1554
1868
  return
1869
+ _check_owner_identity_with_record(cluster_name, record)
1870
+
1871
+
1872
+ def _check_owner_identity_with_record(cluster_name: str,
1873
+ record: Dict[str, Any]) -> None:
1874
+ if env_options.Options.SKIP_CLOUD_IDENTITY_CHECK.get():
1875
+ return
1555
1876
  handle = record['handle']
1556
1877
  if not isinstance(handle, backends.CloudVmRayResourceHandle):
1557
1878
  return
1879
+ active_workspace = skypilot_config.get_active_workspace()
1880
+ cluster_workspace = record.get('workspace',
1881
+ constants.SKYPILOT_DEFAULT_WORKSPACE)
1882
+ if active_workspace != cluster_workspace:
1883
+ with ux_utils.print_exception_no_traceback():
1884
+ raise exceptions.ClusterOwnerIdentityMismatchError(
1885
+ f'{colorama.Fore.YELLOW}'
1886
+ f'The cluster {cluster_name!r} is in workspace '
1887
+ f'{cluster_workspace!r}, but the active workspace is '
1888
+ f'{active_workspace!r}.{colorama.Fore.RESET}')
1558
1889
 
1559
- cloud = handle.launched_resources.cloud
1890
+ launched_resources = handle.launched_resources.assert_launchable()
1891
+ cloud = launched_resources.cloud
1560
1892
  user_identities = cloud.get_user_identities()
1561
1893
  owner_identity = record['owner']
1562
1894
  if user_identities is None:
@@ -1625,22 +1957,26 @@ def tag_filter_for_cluster(cluster_name: str) -> Dict[str, str]:
1625
1957
  }
1626
1958
 
1627
1959
 
1960
+ @context_utils.cancellation_guard
1628
1961
  def _query_cluster_status_via_cloud_api(
1629
- handle: 'cloud_vm_ray_backend.CloudVmRayResourceHandle'
1630
- ) -> List[status_lib.ClusterStatus]:
1631
- """Returns the status of the cluster.
1962
+ handle: 'cloud_vm_ray_backend.CloudVmRayResourceHandle',
1963
+ retry_if_missing: bool,
1964
+ ) -> List[Tuple[status_lib.ClusterStatus, Optional[str]]]:
1965
+ """Returns the status of the cluster as a list of tuples corresponding
1966
+ to the node status and an optional reason string for said status.
1632
1967
 
1633
1968
  Raises:
1634
1969
  exceptions.ClusterStatusFetchingError: the cluster status cannot be
1635
1970
  fetched from the cloud provider.
1636
1971
  """
1972
+ cluster_name = handle.cluster_name
1637
1973
  cluster_name_on_cloud = handle.cluster_name_on_cloud
1638
1974
  cluster_name_in_hint = common_utils.cluster_name_in_hint(
1639
1975
  handle.cluster_name, cluster_name_on_cloud)
1640
1976
  # Use region and zone from the cluster config, instead of the
1641
1977
  # handle.launched_resources, because the latter may not be set
1642
1978
  # correctly yet.
1643
- ray_config = common_utils.read_yaml(handle.cluster_yaml)
1979
+ ray_config = global_user_state.get_cluster_yaml_dict(handle.cluster_yaml)
1644
1980
  provider_config = ray_config['provider']
1645
1981
 
1646
1982
  # Query the cloud provider.
@@ -1651,7 +1987,11 @@ def _query_cluster_status_via_cloud_api(
1651
1987
  cloud_name = repr(handle.launched_resources.cloud)
1652
1988
  try:
1653
1989
  node_status_dict = provision_lib.query_instances(
1654
- cloud_name, cluster_name_on_cloud, provider_config)
1990
+ cloud_name,
1991
+ cluster_name,
1992
+ cluster_name_on_cloud,
1993
+ provider_config,
1994
+ retry_if_missing=retry_if_missing)
1655
1995
  logger.debug(f'Querying {cloud_name} cluster '
1656
1996
  f'{cluster_name_in_hint} '
1657
1997
  f'status:\n{pprint.pformat(node_status_dict)}')
@@ -1667,12 +2007,55 @@ def _query_cluster_status_via_cloud_api(
1667
2007
  region = provider_config.get('region') or provider_config.get(
1668
2008
  'location')
1669
2009
  zone = ray_config['provider'].get('availability_zone')
2010
+ # TODO (kyuds): refactor cloud.query_status api to include reason.
2011
+ # Currently not refactoring as this API is actually supposed to be
2012
+ # deprecated soon.
1670
2013
  node_statuses = cloud.query_status(
1671
2014
  cluster_name_on_cloud,
1672
2015
  tag_filter_for_cluster(cluster_name_on_cloud), region, zone)
2016
+ node_statuses = [(status, None) for status in node_statuses]
1673
2017
  return node_statuses
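_query_cluster_status_via_cloud_api now returns (status, reason) tuples instead of bare statuses, with the reason set to None on the legacy query_status path. A small sketch of how a caller can consume the pairs; the enum below is a stand-in for status_lib.ClusterStatus:

import enum
from typing import List, Optional, Tuple

class ClusterStatus(enum.Enum):  # Stand-in for status_lib.ClusterStatus.
    INIT = 'INIT'
    UP = 'UP'
    STOPPED = 'STOPPED'

def summarize(node_statuses: List[Tuple[ClusterStatus, Optional[str]]],
              expected_nodes: int) -> Tuple[bool, str]:
    all_up = (len(node_statuses) == expected_nodes and
              all(status == ClusterStatus.UP for status, _ in node_statuses))
    reasons = ', '.join(reason for _, reason in node_statuses
                        if reason is not None)
    return all_up, reasons

print(summarize([(ClusterStatus.UP, None),
                 (ClusterStatus.INIT, 'node still provisioning')], 2))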
1674
2018
 
1675
2019
 
2020
+ def _query_cluster_info_via_cloud_api(
2021
+ handle: 'cloud_vm_ray_backend.CloudVmRayResourceHandle'
2022
+ ) -> provision_common.ClusterInfo:
2023
+ """Returns the cluster info.
2024
+
2025
+ Raises:
2026
+ exceptions.NotSupportedError: the cloud does not support the new provisioner.
2027
+ exceptions.FetchClusterInfoError: the cluster info cannot be
2028
+ fetched from the cloud provider.
2029
+ """
2030
+ cloud = handle.launched_resources.cloud
2031
+ assert cloud is not None, handle
2032
+ if cloud.STATUS_VERSION >= clouds.StatusVersion.SKYPILOT:
2033
+ try:
2034
+ cloud_name = repr(cloud)
2035
+ ray_config = global_user_state.get_cluster_yaml_dict(
2036
+ handle.cluster_yaml)
2037
+ provider_config = ray_config['provider']
2038
+ region = provider_config.get('region') or provider_config.get(
2039
+ 'location')
2040
+ cluster_info = provision_lib.get_cluster_info(
2041
+ cloud_name, region, handle.cluster_name_on_cloud,
2042
+ provider_config)
2043
+ logger.debug(
2044
+ f'Querying {cloud_name} cluster '
2045
+ f'{handle.cluster_name_on_cloud} '
2046
+ f'head instance:\n{cluster_info.get_head_instance()}\n'
2047
+ f'worker instances:\n{cluster_info.get_worker_instances()}')
2048
+ return cluster_info
2049
+ except Exception as e: # pylint: disable=broad-except
2050
+ with ux_utils.print_exception_no_traceback():
2051
+ raise exceptions.FetchClusterInfoError(
2052
+ reason=exceptions.FetchClusterInfoError.Reason.UNKNOWN
2053
+ ) from e
2054
+ else:
2055
+ raise exceptions.NotSupportedError(
2056
+ f'The cloud {cloud} does not support the SkyPilot provisioner.')
2057
+
2058
+
1676
2059
  def check_can_clone_disk_and_override_task(
1677
2060
  cluster_name: str, target_cluster_name: Optional[str], task: 'task_lib.Task'
1678
2061
  ) -> Tuple['task_lib.Task', 'cloud_vm_ray_backend.CloudVmRayResourceHandle']:
@@ -1720,12 +2103,12 @@ def check_can_clone_disk_and_override_task(
1720
2103
  'a new target cluster name.')
1721
2104
 
1722
2105
  new_task_resources = []
1723
- original_cloud = handle.launched_resources.cloud
2106
+ launched_resources = handle.launched_resources.assert_launchable()
2107
+ original_cloud = launched_resources.cloud
1724
2108
  original_cloud.check_features_are_supported(
1725
- handle.launched_resources,
2109
+ launched_resources,
1726
2110
  {clouds.CloudImplementationFeatures.CLONE_DISK_FROM_CLUSTER})
1727
2111
 
1728
- assert original_cloud is not None, handle.launched_resources
1729
2112
  has_override = False
1730
2113
  has_disk_size_met = False
1731
2114
  has_cloud_met = False
@@ -1739,7 +2122,7 @@ def check_can_clone_disk_and_override_task(
1739
2122
  continue
1740
2123
  has_cloud_met = True
1741
2124
 
1742
- override_param = {}
2125
+ override_param: Dict[str, Any] = {}
1743
2126
  if task_resources.cloud is None:
1744
2127
  override_param['cloud'] = original_cloud
1745
2128
  if task_resources.region is None:
@@ -1786,7 +2169,12 @@ def check_can_clone_disk_and_override_task(
1786
2169
  return task, handle
1787
2170
 
1788
2171
 
1789
- def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
2172
+ def _update_cluster_status(
2173
+ cluster_name: str,
2174
+ record: Dict[str, Any],
2175
+ retry_if_missing: bool,
2176
+ include_user_info: bool = True,
2177
+ summary_response: bool = False) -> Optional[Dict[str, Any]]:
1790
2178
  """Update the cluster status.
1791
2179
 
1792
2180
  The cluster status is updated by checking ray cluster and real status from
@@ -1813,13 +2201,16 @@ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
1813
2201
  fetched from the cloud provider or there are leaked nodes causing
1814
2202
  the node number larger than expected.
1815
2203
  """
1816
- record = global_user_state.get_cluster_from_name(cluster_name)
1817
- if record is None:
1818
- return None
1819
2204
  handle = record['handle']
1820
2205
  if handle.cluster_yaml is None:
1821
2206
  # Remove cluster from db since this cluster does not have a config file
1822
2207
  # or any other ongoing requests
2208
+ global_user_state.add_cluster_event(
2209
+ cluster_name,
2210
+ None,
2211
+ 'Cluster has no YAML file. Removing the cluster from cache.',
2212
+ global_user_state.ClusterEventType.STATUS_CHANGE,
2213
+ nop_if_duplicate=True)
1823
2214
  global_user_state.remove_cluster(cluster_name, terminate=True)
1824
2215
  logger.debug(f'Cluster {cluster_name!r} has no YAML file. '
1825
2216
  'Removing the cluster from cache.')
@@ -1828,10 +2219,11 @@ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
1828
2219
  return record
1829
2220
  cluster_name = handle.cluster_name
1830
2221
 
1831
- node_statuses = _query_cluster_status_via_cloud_api(handle)
2222
+ node_statuses = _query_cluster_status_via_cloud_api(
2223
+ handle, retry_if_missing=retry_if_missing)
1832
2224
 
1833
- all_nodes_up = (all(
1834
- status == status_lib.ClusterStatus.UP for status in node_statuses) and
2225
+ all_nodes_up = (all(status[0] == status_lib.ClusterStatus.UP
2226
+ for status in node_statuses) and
1835
2227
  len(node_statuses) == handle.launched_nodes)
1836
2228
 
1837
2229
  def get_node_counts_from_ray_status(
@@ -1842,14 +2234,17 @@ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
1842
2234
  require_outputs=True,
1843
2235
  separate_stderr=True)
1844
2236
  if rc:
1845
- raise RuntimeError(
1846
- f'Refreshing status ({cluster_name!r}): Failed to check '
1847
- f'ray cluster\'s healthiness with '
1848
- f'{instance_setup.RAY_STATUS_WITH_SKY_RAY_PORT_COMMAND}.\n'
1849
- f'-- stdout --\n{output}\n-- stderr --\n{stderr}')
2237
+ raise exceptions.CommandError(
2238
+ rc, instance_setup.RAY_STATUS_WITH_SKY_RAY_PORT_COMMAND,
2239
+ f'Failed to check ray cluster\'s healthiness.\n'
2240
+ '-- stdout --\n'
2241
+ f'{output}\n', stderr)
1850
2242
  return (*_count_healthy_nodes_from_ray(output), output, stderr)
1851
2243
 
2244
+ ray_status_details: Optional[str] = None
2245
+
1852
2246
  def run_ray_status_to_check_ray_cluster_healthy() -> bool:
2247
+ nonlocal ray_status_details
1853
2248
  try:
1854
2249
  # NOTE: fetching the IPs is very slow as it calls into
1855
2250
  # `ray get head-ip/worker-ips`. Using cached IPs is safe because
@@ -1872,9 +2267,49 @@ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
1872
2267
 
1873
2268
  total_nodes = handle.launched_nodes * handle.num_ips_per_node
1874
2269
 
2270
+ cloud_name = repr(handle.launched_resources.cloud).lower()
2271
+ # Initialize variables in case all retries fail
2272
+ ready_head = 0
2273
+ ready_workers = 0
2274
+ output = ''
2275
+ stderr = ''
1875
2276
  for i in range(5):
1876
- ready_head, ready_workers, output, stderr = (
1877
- get_node_counts_from_ray_status(head_runner))
2277
+ try:
2278
+ ready_head, ready_workers, output, stderr = (
2279
+ get_node_counts_from_ray_status(head_runner))
2280
+ except exceptions.CommandError as e:
2281
+ logger.debug(f'Refreshing status ({cluster_name!r}) attempt'
2282
+ f' {i}: {common_utils.format_exception(e)}')
2283
+ if cloud_name != 'kubernetes':
2284
+ # Non-k8s clusters can be manually restarted and:
2285
+ # 1. Get new IP addresses, or
2286
+ # 2. Not have the SkyPilot runtime setup
2287
+ #
2288
+ # So we should surface a message to the user to
2289
+ # help them recover from this inconsistent state.
2290
+ has_new_ip_addr = (
2291
+ e.detailed_reason is not None and
2292
+ _SSH_CONNECTION_TIMED_OUT_PATTERN.search(
2293
+ e.detailed_reason.strip()) is not None)
2294
+ runtime_not_setup = (_RAY_CLUSTER_NOT_FOUND_MESSAGE
2295
+ in e.error_msg)
2296
+ if has_new_ip_addr or runtime_not_setup:
2297
+ yellow = colorama.Fore.YELLOW
2298
+ bright = colorama.Style.BRIGHT
2299
+ reset = colorama.Style.RESET_ALL
2300
+ ux_utils.console_newline()
2301
+ logger.warning(
2302
+ f'{yellow}Failed getting cluster status despite all nodes '
2303
+ f'being up ({cluster_name!r}). '
2304
+ f'If the cluster was restarted manually, try running: '
2305
+ f'{reset}{bright}sky start {cluster_name}{reset} '
2306
+ f'{yellow}to recover from INIT status.{reset}')
2307
+ return False
2308
+ raise e
2309
+ # We retry for kubernetes because CoreWeave can have
2310
+ # transient network issues.
2311
+ time.sleep(1)
2312
+ continue
1878
2313
  if ready_head + ready_workers == total_nodes:
1879
2314
  return True
1880
2315
  logger.debug(f'Refreshing status ({cluster_name!r}) attempt '
@@ -1892,19 +2327,25 @@ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
1892
2327
  # showing up
1893
2328
  time.sleep(1)
1894
2329
 
2330
+ ray_status_details = (
2331
+ f'{ready_head + ready_workers}/{total_nodes} ready')
1895
2332
  raise RuntimeError(
1896
2333
  f'Refreshing status ({cluster_name!r}): ray status not showing '
1897
2334
  f'all nodes ({ready_head + ready_workers}/'
1898
2335
  f'{total_nodes});\noutput:\n{output}\nstderr:\n{stderr}')
1899
2336
 
1900
2337
  except exceptions.FetchClusterInfoError:
2338
+ ray_status_details = 'failed to get IPs'
1901
2339
  logger.debug(
1902
2340
  f'Refreshing status ({cluster_name!r}) failed to get IPs.')
1903
2341
  except RuntimeError as e:
2342
+ if ray_status_details is None:
2343
+ ray_status_details = str(e)
1904
2344
  logger.debug(common_utils.format_exception(e))
1905
2345
  except Exception as e: # pylint: disable=broad-except
1906
2346
  # This can be raised by `external_ssh_ports()`, due to the
1907
2347
  # underlying call to kubernetes API.
2348
+ ray_status_details = str(e)
1908
2349
  logger.debug(f'Refreshing status ({cluster_name!r}) failed: ',
1909
2350
  exc_info=e)
1910
2351
  return False
@@ -1925,16 +2366,28 @@ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
1925
2366
  # run_ray_status_to_check_all_nodes_up() is slow due to calling `ray get
1926
2367
  # head-ip/worker-ips`.
1927
2368
  record['status'] = status_lib.ClusterStatus.UP
1928
- global_user_state.add_or_update_cluster(cluster_name,
1929
- handle,
1930
- requested_resources=None,
1931
- ready=True,
1932
- is_launch=False)
1933
- return global_user_state.get_cluster_from_name(cluster_name)
2369
+ # Add cluster event for instance status check.
2370
+ global_user_state.add_cluster_event(
2371
+ cluster_name,
2372
+ status_lib.ClusterStatus.UP,
2373
+ 'All nodes up; SkyPilot runtime healthy.',
2374
+ global_user_state.ClusterEventType.STATUS_CHANGE,
2375
+ nop_if_duplicate=True)
2376
+ global_user_state.add_or_update_cluster(
2377
+ cluster_name,
2378
+ handle,
2379
+ requested_resources=None,
2380
+ ready=True,
2381
+ is_launch=False,
2382
+ existing_cluster_hash=record['cluster_hash'])
2383
+ return global_user_state.get_cluster_from_name(
2384
+ cluster_name,
2385
+ include_user_info=include_user_info,
2386
+ summary_response=summary_response)
1934
2387
 
1935
2388
  # All cases below are transitioning the cluster to non-UP states.
1936
-
1937
- if (not node_statuses and handle.launched_resources.cloud.STATUS_VERSION >=
2389
+ launched_resources = handle.launched_resources.assert_launchable()
2390
+ if (not node_statuses and launched_resources.cloud.STATUS_VERSION >=
1938
2391
  clouds.StatusVersion.SKYPILOT):
1939
2392
  # Note: launched_at is set during sky launch, even on an existing
1940
2393
  # cluster. This will catch the case where the cluster was terminated on
@@ -1947,7 +2400,8 @@ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
1947
2400
  # and check again. This is a best-effort leak prevention check.
1948
2401
  # See https://github.com/skypilot-org/skypilot/issues/4431.
1949
2402
  time.sleep(_LAUNCH_DOUBLE_CHECK_DELAY)
1950
- node_statuses = _query_cluster_status_via_cloud_api(handle)
2403
+ node_statuses = _query_cluster_status_via_cloud_api(
2404
+ handle, retry_if_missing=False)
1951
2405
  # Note: even if all the node_statuses are UP now, we will still
1952
2406
  # consider this cluster abnormal, and its status will be INIT.
1953
2407
 
@@ -2002,85 +2456,168 @@ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
2002
2456
  # * The cluster is partially or completely in the INIT state, which means
2003
2457
  # that provisioning was interrupted. This is considered abnormal.
2004
2458
  #
2005
- # An abnormal cluster will transition to INIT and have any autostop setting
2006
- # reset (unless it's autostopping/autodowning).
2007
- is_abnormal = ((0 < len(node_statuses) < handle.launched_nodes) or any(
2008
- status != status_lib.ClusterStatus.STOPPED for status in node_statuses))
2459
+ # An abnormal cluster will transition to INIT, and one of the following will happen:
2460
+ # (1) If the SkyPilot provisioner is used AND the head node is alive, we
2461
+ # will not reset the autostop setting, because autostop is handled by
2462
+ # the skylet through the cloud APIs and will continue to function
2463
+ # regardless of the ray cluster's health.
2464
+ # (2) Otherwise, we will reset the autostop setting, unless the cluster is
2465
+ # autostopping/autodowning.
2466
+ some_nodes_terminated = 0 < len(node_statuses) < handle.launched_nodes
2467
+ # If all nodes are up and the ray cluster is healthy, we would have returned
2468
+ # earlier. So if all_nodes_up is True and we are here, it means the ray
2469
+ # cluster must have been unhealthy.
2470
+ ray_cluster_unhealthy = all_nodes_up
2471
+ some_nodes_not_stopped = any(status[0] != status_lib.ClusterStatus.STOPPED
2472
+ for status in node_statuses)
2473
+ is_abnormal = (some_nodes_terminated or some_nodes_not_stopped)
2474
+
2009
2475
  if is_abnormal:
2476
+ status_reason = ', '.join(
2477
+ [status[1] for status in node_statuses if status[1] is not None])
2478
+
2479
+ if some_nodes_terminated:
2480
+ init_reason = 'one or more nodes terminated'
2481
+ elif ray_cluster_unhealthy:
2482
+ init_reason = f'ray cluster is unhealthy ({ray_status_details})'
2483
+ elif some_nodes_not_stopped:
2484
+ init_reason = 'some but not all nodes are stopped'
2010
2485
  logger.debug('The cluster is abnormal. Setting to INIT status. '
2011
2486
  f'node_statuses: {node_statuses}')
2012
- backend = get_backend_from_handle(handle)
2013
- if isinstance(backend,
2014
- backends.CloudVmRayBackend) and record['autostop'] >= 0:
2015
- if not backend.is_definitely_autostopping(handle,
2016
- stream_logs=False):
2017
- # Friendly hint.
2018
- autostop = record['autostop']
2019
- maybe_down_str = ' --down' if record['to_down'] else ''
2020
- noun = 'autodown' if record['to_down'] else 'autostop'
2021
-
2022
- # Reset the autostopping as the cluster is abnormal, and may
2023
- # not correctly autostop. Resetting the autostop will let
2024
- # the user know that the autostop may not happen to avoid
2025
- # leakages from the assumption that the cluster will autostop.
2026
- success = True
2027
- reset_local_autostop = True
2487
+ if record['autostop'] >= 0:
2488
+ is_head_node_alive = False
2489
+ if launched_resources.cloud.PROVISIONER_VERSION >= clouds.ProvisionerVersion.SKYPILOT:
2490
+ # Check if the head node is alive
2028
2491
  try:
2029
- backend.set_autostop(handle, -1, stream_logs=False)
2030
- except exceptions.CommandError as e:
2031
- success = False
2032
- if e.returncode == 255:
2033
- word = 'autostopped' if noun == 'autostop' else 'autodowned'
2034
- logger.debug(f'The cluster is likely {word}.')
2035
- reset_local_autostop = False
2036
- except (Exception, SystemExit) as e: # pylint: disable=broad-except
2037
- success = False
2038
- logger.debug(f'Failed to reset autostop. Due to '
2039
- f'{common_utils.format_exception(e)}')
2040
- if reset_local_autostop:
2041
- global_user_state.set_cluster_autostop_value(
2042
- handle.cluster_name, -1, to_down=False)
2043
-
2044
- if success:
2045
- operation_str = (f'Canceled {noun} on the cluster '
2046
- f'{cluster_name!r}')
2492
+ cluster_info = _query_cluster_info_via_cloud_api(handle)
2493
+ is_head_node_alive = cluster_info.get_head_instance(
2494
+ ) is not None
2495
+ except Exception as e: # pylint: disable=broad-except
2496
+ logger.debug(
2497
+ f'Failed to get cluster info for {cluster_name!r}: '
2498
+ f'{common_utils.format_exception(e)}')
2499
+
2500
+ backend = get_backend_from_handle(handle)
2501
+ if isinstance(backend, backends.CloudVmRayBackend):
2502
+ if is_head_node_alive:
2503
+ logger.debug(
2504
+ f'Skipping autostop reset for cluster {cluster_name!r} '
2505
+ 'because the head node is alive.')
2506
+ elif not backend.is_definitely_autostopping(handle,
2507
+ stream_logs=False):
2508
+ # Friendly hint.
2509
+ autostop = record['autostop']
2510
+ maybe_down_str = ' --down' if record['to_down'] else ''
2511
+ noun = 'autodown' if record['to_down'] else 'autostop'
2512
+
2513
+ # Reset the autostopping as the cluster is abnormal, and may
2514
+ # not correctly autostop. Resetting the autostop will let
2515
+ # the user know that the autostop may not happen to avoid
2516
+ # leakages from the assumption that the cluster will autostop.
2517
+ success = True
2518
+ reset_local_autostop = True
2519
+ try:
2520
+ backend.set_autostop(
2521
+ handle,
2522
+ -1,
2523
+ autostop_lib.DEFAULT_AUTOSTOP_WAIT_FOR,
2524
+ stream_logs=False)
2525
+ except (exceptions.CommandError,
2526
+ grpc.FutureTimeoutError) as e:
2527
+ success = False
2528
+ if isinstance(e, grpc.FutureTimeoutError) or (
2529
+ isinstance(e, exceptions.CommandError) and
2530
+ e.returncode == 255):
2531
+ word = 'autostopped' if noun == 'autostop' else 'autodowned'
2532
+ logger.debug(f'The cluster is likely {word}.')
2533
+ reset_local_autostop = False
2534
+ except (Exception, SystemExit) as e: # pylint: disable=broad-except
2535
+ success = False
2536
+ logger.debug(f'Failed to reset autostop. Due to '
2537
+ f'{common_utils.format_exception(e)}')
2538
+ if reset_local_autostop:
2539
+ global_user_state.set_cluster_autostop_value(
2540
+ handle.cluster_name, -1, to_down=False)
2541
+
2542
+ if success:
2543
+ operation_str = (f'Canceled {noun} on the cluster '
2544
+ f'{cluster_name!r}')
2545
+ else:
2546
+ operation_str = (
2547
+ f'Attempted to cancel {noun} on the '
2548
+ f'cluster {cluster_name!r} with best effort')
2549
+ yellow = colorama.Fore.YELLOW
2550
+ bright = colorama.Style.BRIGHT
2551
+ reset = colorama.Style.RESET_ALL
2552
+ ux_utils.console_newline()
2553
+ logger.warning(
2554
+ f'{yellow}{operation_str}, since it is found to be in an '
2555
+ f'abnormal state. To fix, try running: {reset}{bright}sky '
2556
+ f'start -f -i {autostop}{maybe_down_str} {cluster_name}'
2557
+ f'{reset}')
2047
2558
  else:
2048
- operation_str = (
2049
- f'Attempted to cancel {noun} on the '
2050
- f'cluster {cluster_name!r} with best effort')
2051
- yellow = colorama.Fore.YELLOW
2052
- bright = colorama.Style.BRIGHT
2053
- reset = colorama.Style.RESET_ALL
2054
- ux_utils.console_newline()
2055
- logger.warning(
2056
- f'{yellow}{operation_str}, since it is found to be in an '
2057
- f'abnormal state. To fix, try running: {reset}{bright}sky '
2058
- f'start -f -i {autostop}{maybe_down_str} {cluster_name}'
2059
- f'{reset}')
2060
- else:
2061
- ux_utils.console_newline()
2062
- operation_str = 'autodowning' if record[
2063
- 'to_down'] else 'autostopping'
2064
- logger.info(
2065
- f'Cluster {cluster_name!r} is {operation_str}. Setting to '
2066
- 'INIT status; try refresh again in a while.')
2559
+ ux_utils.console_newline()
2560
+ operation_str = 'autodowning' if record[
2561
+ 'to_down'] else 'autostopping'
2562
+ logger.info(
2563
+ f'Cluster {cluster_name!r} is {operation_str}. Setting to '
2564
+ 'INIT status; try refresh again in a while.')
2067
2565
 
2068
2566
  # If the user starts part of a STOPPED cluster, we still need a status
2069
2567
  # to represent the abnormal status. For spot cluster, it can also
2070
2568
  # represent that the cluster is partially preempted.
2071
2569
  # TODO(zhwu): the definition of INIT should be audited/changed.
2072
2570
  # Adding a new status UNHEALTHY for abnormal status can be a choice.
2073
- global_user_state.add_or_update_cluster(cluster_name,
2074
- handle,
2075
- requested_resources=None,
2076
- ready=False,
2077
- is_launch=False)
2078
- return global_user_state.get_cluster_from_name(cluster_name)
2571
+ init_reason_regex = None
2572
+ if not status_reason:
2573
+ # If there is no status reason, don't re-add (and overwrite) the
2574
+ # event if there is already an event with the same reason which may
2575
+ # have a status reason.
2576
+ # Some status reasons clear after a certain time (e.g. k8s events
2577
+ # are only stored for an hour by default), so it is possible that
2578
+ # the previous event has a status reason, but now it does not.
2579
+ init_reason_regex = (f'^Cluster is abnormal because '
2580
+ f'{re.escape(init_reason)}.*')
2581
+ log_message = f'Cluster is abnormal because {init_reason}'
2582
+ if status_reason:
2583
+ log_message += f' ({status_reason})'
2584
+ log_message += '. Transitioned to INIT.'
2585
+ global_user_state.add_cluster_event(
2586
+ cluster_name,
2587
+ status_lib.ClusterStatus.INIT,
2588
+ log_message,
2589
+ global_user_state.ClusterEventType.STATUS_CHANGE,
2590
+ nop_if_duplicate=True,
2591
+ duplicate_regex=init_reason_regex)
2592
+ global_user_state.add_or_update_cluster(
2593
+ cluster_name,
2594
+ handle,
2595
+ requested_resources=None,
2596
+ ready=False,
2597
+ is_launch=False,
2598
+ existing_cluster_hash=record['cluster_hash'])
2599
+ return global_user_state.get_cluster_from_name(
2600
+ cluster_name,
2601
+ include_user_info=include_user_info,
2602
+ summary_response=summary_response)
2079
2603
  # Now is_abnormal is False: either node_statuses is empty or all nodes are
2080
2604
  # STOPPED.
2605
+ verb = 'terminated' if to_terminate else 'stopped'
2081
2606
  backend = backends.CloudVmRayBackend()
2607
+ global_user_state.add_cluster_event(
2608
+ cluster_name,
2609
+ None,
2610
+ f'All nodes {verb}, cleaning up the cluster.',
2611
+ global_user_state.ClusterEventType.STATUS_CHANGE,
2612
+ # This won't do anything for a terminated cluster, but it's needed for a
2613
+ # stopped cluster.
2614
+ nop_if_duplicate=True,
2615
+ )
2082
2616
  backend.post_teardown_cleanup(handle, terminate=to_terminate, purge=False)
2083
- return global_user_state.get_cluster_from_name(cluster_name)
2617
+ return global_user_state.get_cluster_from_name(
2618
+ cluster_name,
2619
+ include_user_info=include_user_info,
2620
+ summary_response=summary_response)
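The abnormal-cluster branch above only resets autostop when the skylet can no longer be trusted to enforce it: clusters on the SkyPilot provisioner with a live head node keep their autostop setting, and clusters that are already autostopping or autodowning are left alone. A compact sketch of that decision as a pure function; the boolean inputs are stand-ins for the checks the real code performs:

def should_reset_autostop(autostop_minutes: int,
                          uses_skypilot_provisioner: bool,
                          head_node_alive: bool,
                          definitely_autostopping: bool) -> bool:
    """Decide whether an abnormal cluster should have autostop cleared."""
    if autostop_minutes < 0:
        return False  # No autostop configured; nothing to reset.
    if uses_skypilot_provisioner and head_node_alive:
        # The skylet on the head node can still enforce autostop via
        # cloud APIs, even if the ray cluster is unhealthy.
        return False
    if definitely_autostopping:
        return False  # The cluster is already stopping/downing itself.
    return True

assert should_reset_autostop(10, True, True, False) is False
assert should_reset_autostop(10, False, False, False) is True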
2084
2621
 
2085
2622
 
2086
2623
  def _must_refresh_cluster_status(
@@ -2102,12 +2639,14 @@ def _must_refresh_cluster_status(
2102
2639
 
2103
2640
 
2104
2641
  def refresh_cluster_record(
2105
- cluster_name: str,
2106
- *,
2107
- force_refresh_statuses: Optional[Set[status_lib.ClusterStatus]] = None,
2108
- acquire_per_cluster_status_lock: bool = True,
2109
- cluster_status_lock_timeout: int = CLUSTER_STATUS_LOCK_TIMEOUT_SECONDS
2110
- ) -> Optional[Dict[str, Any]]:
2642
+ cluster_name: str,
2643
+ *,
2644
+ force_refresh_statuses: Optional[Set[status_lib.ClusterStatus]] = None,
2645
+ cluster_lock_already_held: bool = False,
2646
+ cluster_status_lock_timeout: int = CLUSTER_STATUS_LOCK_TIMEOUT_SECONDS,
2647
+ include_user_info: bool = True,
2648
+ summary_response: bool = False,
2649
+ retry_if_missing: bool = True) -> Optional[Dict[str, Any]]:
2111
2650
  """Refresh the cluster, and return the possibly updated record.
2112
2651
 
2113
2652
  The function will update the cached cluster status in the global state. For
@@ -2124,14 +2663,20 @@ def refresh_cluster_record(
2124
2663
  _CLUSTER_STATUS_CACHE_DURATION_SECONDS old, and one of:
2125
2664
  1. the cluster is a spot cluster, or
2126
2665
  2. cluster autostop is set and the cluster is not STOPPED.
2127
- acquire_per_cluster_status_lock: Whether to acquire the per-cluster lock
2128
- before updating the status. Even if this is True, the lock may not be
2129
- acquired if the status does not need to be refreshed.
2666
+ cluster_lock_already_held: Whether the caller is already holding the
2667
+ per-cluster lock. You MUST NOT set this to True if the caller does not
2668
+ already hold the lock. If True, we will not acquire the lock before
2669
+ updating the status. Failing to hold the lock while updating the
2670
+ status can lead to correctness issues - e.g. an in-progress launch may
2671
+ appear to be DOWN incorrectly. Even if this is set to False, the lock
2672
+ may not be acquired if the status does not need to be refreshed.
2130
2673
  cluster_status_lock_timeout: The timeout to acquire the per-cluster
2131
2674
  lock. If timeout, the function will use the cached status. If the
2132
2675
  value is <0, do not timeout (wait for the lock indefinitely). By
2133
2676
  default, this is set to CLUSTER_STATUS_LOCK_TIMEOUT_SECONDS. Warning:
2134
2677
  if correctness is required, you must set this to -1.
2678
+ retry_if_missing: Whether to retry the call to the cloud api if the
2679
+ cluster is not found when querying the live status on the cloud.
2135
2680
 
2136
2681
  Returns:
2137
2682
  If the cluster is terminated or does not exist, return None.
@@ -2147,69 +2692,95 @@ def refresh_cluster_record(
2147
2692
  the node number larger than expected.
2148
2693
  """
2149
2694
 
2150
- record = global_user_state.get_cluster_from_name(cluster_name)
2695
+ ctx = context_lib.get()
2696
+ record = global_user_state.get_cluster_from_name(
2697
+ cluster_name,
2698
+ include_user_info=include_user_info,
2699
+ summary_response=summary_response)
2151
2700
  if record is None:
2152
2701
  return None
2153
- check_owner_identity(cluster_name)
2154
-
2155
- if not isinstance(record['handle'], backends.CloudVmRayResourceHandle):
2156
- return record
2157
-
2158
- # The loop logic allows us to notice if the status was updated in the
2159
- # global_user_state by another process and stop trying to get the lock.
2160
- # The core loop logic is adapted from FileLock's implementation.
2161
- lock = filelock.FileLock(CLUSTER_STATUS_LOCK_PATH.format(cluster_name))
2162
- start_time = time.perf_counter()
2163
-
2164
- # Loop until we have an up-to-date status or until we acquire the lock.
2165
- while True:
2166
- # Check to see if we can return the cached status.
2167
- if not _must_refresh_cluster_status(record, force_refresh_statuses):
2168
- return record
2169
-
2170
- if not acquire_per_cluster_status_lock:
2171
- return _update_cluster_status(cluster_name)
2172
-
2173
- # Try to acquire the lock so we can fetch the status.
2174
- try:
2175
- with lock.acquire(blocking=False):
2176
- # Check the cluster status again, since it could have been
2177
- # updated between our last check and acquiring the lock.
2178
- record = global_user_state.get_cluster_from_name(cluster_name)
2179
- if record is None or not _must_refresh_cluster_status(
2180
- record, force_refresh_statuses):
2181
- return record
2182
- # Update and return the cluster status.
2183
- return _update_cluster_status(cluster_name)
2184
- except filelock.Timeout:
2185
- # lock.acquire() will throw a Timeout exception if the lock is not
2186
- # available and we have blocking=False.
2187
- pass
2188
-
2189
- # Logic adapted from FileLock.acquire().
2190
- # If cluster_status_lock_time is <0, we will never hit this. No timeout.
2191
- # Otherwise, if we have timed out, return the cached status. This has
2192
- # the potential to cause correctness issues, but if so it is the
2193
- # caller's responsibility to set the timeout to -1.
2194
- if 0 <= cluster_status_lock_timeout < time.perf_counter() - start_time:
2195
- logger.debug('Refreshing status: Failed get the lock for cluster '
2196
- f'{cluster_name!r}. Using the cached status.')
2197
- return record
2198
- time.sleep(0.05)
2199
-
2200
- # Refresh for next loop iteration.
2201
- record = global_user_state.get_cluster_from_name(cluster_name)
2202
- if record is None:
2203
- return None
2702
+ # TODO(zhwu, 05/20): switch to the specific workspace to make sure we are
2703
+ # using the correct cloud credentials.
2704
+ workspace = record.get('workspace', constants.SKYPILOT_DEFAULT_WORKSPACE)
2705
+ with skypilot_config.local_active_workspace_ctx(workspace):
2706
+ # check_owner_identity returns if the record handle is
2707
+ # not a CloudVmRayResourceHandle
2708
+ _check_owner_identity_with_record(cluster_name, record)
2709
+
2710
+ # The loop logic allows us to notice if the status was updated in the
2711
+ # global_user_state by another process and stop trying to get the lock.
2712
+ lock = locks.get_lock(cluster_status_lock_id(cluster_name))
2713
+ start_time = time.perf_counter()
2714
+
2715
+ # Loop until we have an up-to-date status or until we acquire the lock.
2716
+ while True:
2717
+ # Check if the context is canceled.
2718
+ if ctx is not None and ctx.is_canceled():
2719
+ raise asyncio.CancelledError()
2720
+ # Check to see if we can return the cached status.
2721
+ if not _must_refresh_cluster_status(record, force_refresh_statuses):
2722
+ return record
2723
+
2724
+ if cluster_lock_already_held:
2725
+ return _update_cluster_status(cluster_name, record,
2726
+ retry_if_missing,
2727
+ include_user_info,
2728
+ summary_response)
2729
+
2730
+ # Try to acquire the lock so we can fetch the status.
2731
+ try:
2732
+ with lock.acquire(blocking=False):
2733
+ # Check the cluster status again, since it could have been
2734
+ # updated between our last check and acquiring the lock.
2735
+ record = global_user_state.get_cluster_from_name(
2736
+ cluster_name,
2737
+ include_user_info=include_user_info,
2738
+ summary_response=summary_response)
2739
+ if record is None or not _must_refresh_cluster_status(
2740
+ record, force_refresh_statuses):
2741
+ return record
2742
+ # Update and return the cluster status.
2743
+ return _update_cluster_status(cluster_name, record,
2744
+ retry_if_missing,
2745
+ include_user_info,
2746
+ summary_response)
2747
+
2748
+ except locks.LockTimeout:
2749
+ # lock.acquire() will throw a Timeout exception if the lock is not
2750
+ # available and we have blocking=False.
2751
+ pass
2752
+
2753
+ # Logic adapted from FileLock.acquire().
2754
+ # If cluster_status_lock_time is <0, we will never hit this. No timeout.
2755
+ # Otherwise, if we have timed out, return the cached status. This has
2756
+ # the potential to cause correctness issues, but if so it is the
2757
+ # caller's responsibility to set the timeout to -1.
2758
+ if 0 <= cluster_status_lock_timeout < time.perf_counter(
2759
+ ) - start_time:
2760
+ logger.debug(
2761
+ 'Refreshing status: Failed to get the lock for cluster '
2762
+ f'{cluster_name!r}. Using the cached status.')
2763
+ return record
2764
+ time.sleep(lock.poll_interval)
2765
+
2766
+ # Refresh for next loop iteration.
2767
+ record = global_user_state.get_cluster_from_name(
2768
+ cluster_name,
2769
+ include_user_info=include_user_info,
2770
+ summary_response=summary_response)
2771
+ if record is None:
2772
+ return None
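refresh_cluster_record keeps the same loop shape as before: check whether a refresh is still needed, try a non-blocking lock acquisition, re-read the record after acquiring (another process may already have refreshed it), and fall back to the cached record once the lock timeout elapses. A minimal sketch of that acquire-or-serve-cached pattern using filelock directly; the real code goes through the locks abstraction and also honors request cancellation:

import time
import filelock

def refresh_with_lock(lock_path: str, read_cached, needs_refresh, do_refresh,
                      lock_timeout: float = 20.0, poll_interval: float = 0.05):
    lock = filelock.FileLock(lock_path)
    start = time.perf_counter()
    record = read_cached()
    while True:
        if record is None or not needs_refresh(record):
            return record
        try:
            with lock.acquire(blocking=False):
                # Re-read: another process may have refreshed it already.
                record = read_cached()
                if record is None or not needs_refresh(record):
                    return record
                return do_refresh(record)
        except filelock.Timeout:
            pass  # Lock is held elsewhere; keep polling.
        if 0 <= lock_timeout < time.perf_counter() - start:
            return record  # Give up and serve the cached record.
        time.sleep(poll_interval)
        record = read_cached()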
2204
2773
 
2205
2774
 
2206
2775
  @timeline.event
2776
+ @context_utils.cancellation_guard
2207
2777
  def refresh_cluster_status_handle(
2208
2778
  cluster_name: str,
2209
2779
  *,
2210
2780
  force_refresh_statuses: Optional[Set[status_lib.ClusterStatus]] = None,
2211
- acquire_per_cluster_status_lock: bool = True,
2212
- cluster_status_lock_timeout: int = CLUSTER_STATUS_LOCK_TIMEOUT_SECONDS
2781
+ cluster_lock_already_held: bool = False,
2782
+ cluster_status_lock_timeout: int = CLUSTER_STATUS_LOCK_TIMEOUT_SECONDS,
2783
+ retry_if_missing: bool = True,
2213
2784
  ) -> Tuple[Optional[status_lib.ClusterStatus],
2214
2785
  Optional[backends.ResourceHandle]]:
2215
2786
  """Refresh the cluster, and return the possibly updated status and handle.
@@ -2221,8 +2792,11 @@ def refresh_cluster_status_handle(
2221
2792
  record = refresh_cluster_record(
2222
2793
  cluster_name,
2223
2794
  force_refresh_statuses=force_refresh_statuses,
2224
- acquire_per_cluster_status_lock=acquire_per_cluster_status_lock,
2225
- cluster_status_lock_timeout=cluster_status_lock_timeout)
2795
+ cluster_lock_already_held=cluster_lock_already_held,
2796
+ cluster_status_lock_timeout=cluster_status_lock_timeout,
2797
+ include_user_info=False,
2798
+ summary_response=True,
2799
+ retry_if_missing=retry_if_missing)
2226
2800
  if record is None:
2227
2801
  return None, None
2228
2802
  return record['status'], record['handle']
@@ -2253,6 +2827,7 @@ def check_cluster_available(
2253
2827
  ...
2254
2828
 
2255
2829
 
2830
+ @context_utils.cancellation_guard
2256
2831
  def check_cluster_available(
2257
2832
  cluster_name: str,
2258
2833
  *,
@@ -2272,7 +2847,9 @@ def check_cluster_available(
2272
2847
  exceptions.CloudUserIdentityError: if we fail to get the current user
2273
2848
  identity.
2274
2849
  """
2275
- record = global_user_state.get_cluster_from_name(cluster_name)
2850
+ record = global_user_state.get_cluster_from_name(cluster_name,
2851
+ include_user_info=False,
2852
+ summary_response=True)
2276
2853
  if dryrun:
2277
2854
  assert record is not None, cluster_name
2278
2855
  return record['handle']
@@ -2404,6 +2981,19 @@ def is_controller_accessible(
2404
2981
  exceptions.ClusterNotUpError: if the controller is not accessible, or
2405
2982
  failed to be connected.
2406
2983
  """
2984
+ if (managed_job_utils.is_consolidation_mode() and
2985
+ controller == controller_utils.Controllers.JOBS_CONTROLLER
2986
+ ) or (serve_utils.is_consolidation_mode() and
2987
+ controller == controller_utils.Controllers.SKY_SERVE_CONTROLLER):
2988
+ cn = 'local-controller-consolidation'
2989
+ return backends.LocalResourcesHandle(
2990
+ cluster_name=cn,
2991
+ cluster_name_on_cloud=cn,
2992
+ cluster_yaml=None,
2993
+ launched_nodes=1,
2994
+ launched_resources=sky.Resources(cloud=clouds.Cloud(),
2995
+ instance_type=cn),
2996
+ )
2407
2997
  if non_existent_message is None:
2408
2998
  non_existent_message = controller.value.default_hint_if_non_existent
2409
2999
  cluster_name = controller.value.cluster_name
@@ -2446,7 +3036,8 @@ def is_controller_accessible(
2446
3036
  f'fatal, but {controller_name} commands/calls may hang or return '
2447
3037
  'stale information, when the controller is not up.\n'
2448
3038
  f' Details: {common_utils.format_exception(e, use_bracket=True)}')
2449
- record = global_user_state.get_cluster_from_name(cluster_name)
3039
+ record = global_user_state.get_cluster_from_name(
3040
+ cluster_name, include_user_info=False, summary_response=True)
2450
3041
  if record is not None:
2451
3042
  controller_status, handle = record['status'], record['handle']
2452
3043
  # We check the connection even if the cluster has a cached status UP
@@ -2467,7 +3058,7 @@ def is_controller_accessible(
2467
3058
  need_connection_check):
2468
3059
  # Check ssh connection if (1) controller is in INIT state, or (2) we failed to fetch the
2469
3060
  # status, both of which can happen when controller's status lock is held by another `sky jobs launch` or
2470
- # `sky serve up`. If we have controller's head_ip available and it is ssh-reachable,
3061
+ # `sky serve up`. If we have controller's head_ip available and it is ssh-reachable,
2471
3062
  # we can allow access to the controller.
2472
3063
  ssh_credentials = ssh_credential_from_yaml(handle.cluster_yaml,
2473
3064
  handle.docker_user,
@@ -2503,21 +3094,99 @@ class CloudFilter(enum.Enum):
2503
3094
  LOCAL = 'local'
2504
3095
 
2505
3096
 
2506
- def _get_glob_clusters(clusters: List[str], silent: bool = False) -> List[str]:
3097
+ def _get_glob_clusters(
3098
+ clusters: List[str],
3099
+ silent: bool = False,
3100
+ workspaces_filter: Optional[Dict[str, Any]] = None) -> List[str]:
2507
3101
  """Returns a list of clusters that match the glob pattern."""
2508
3102
  glob_clusters = []
2509
3103
  for cluster in clusters:
2510
- glob_cluster = global_user_state.get_glob_cluster_names(cluster)
3104
+ glob_cluster = global_user_state.get_glob_cluster_names(
3105
+ cluster, workspaces_filter=workspaces_filter)
2511
3106
  if len(glob_cluster) == 0 and not silent:
2512
3107
  logger.info(f'Cluster {cluster} not found.')
2513
3108
  glob_clusters.extend(glob_cluster)
2514
3109
  return list(set(glob_clusters))
2515
3110
 
2516
3111
 
3112
+ def _refresh_cluster(
3113
+ cluster_name: str,
3114
+ force_refresh_statuses: Optional[Set[status_lib.ClusterStatus]],
3115
+ include_user_info: bool = True,
3116
+ summary_response: bool = False) -> Optional[Dict[str, Any]]:
3117
+ try:
3118
+ record = refresh_cluster_record(
3119
+ cluster_name,
3120
+ force_refresh_statuses=force_refresh_statuses,
3121
+ cluster_lock_already_held=False,
3122
+ include_user_info=include_user_info,
3123
+ summary_response=summary_response)
3124
+ except (exceptions.ClusterStatusFetchingError,
3125
+ exceptions.CloudUserIdentityError,
3126
+ exceptions.ClusterOwnerIdentityMismatchError) as e:
3127
+ # Do not fail the entire refresh process. The caller will
3128
+ # handle the 'UNKNOWN' status, and collect the errors into
3129
+ # a table.
3130
+ record = {'status': 'UNKNOWN', 'error': e}
3131
+ return record
3132
+
3133
+
3134
+ def refresh_cluster_records() -> None:
3135
+ """Refreshes the status of all clusters, except managed clusters.
3136
+
3137
+ Used by the background status refresh daemon.
3138
+ This function is a stripped-down version of get_clusters, with only the
3139
+ bare bones refresh logic.
3140
+
3141
+ Returns:
3142
+ None
3143
+
3144
+ Raises:
3145
+ None
3146
+ """
3147
+ # We force-exclude managed clusters to avoid multiple sources
3148
+ # manipulating them. For example, SkyServe assumes the replica manager
3149
+ # is the only source of truth for the cluster status.
3150
+ cluster_names = set(
3151
+ global_user_state.get_cluster_names(exclude_managed_clusters=True))
3152
+
3153
+ # TODO(syang): we should try not to leak
3154
+ # request info in backend_utils.py.
3155
+ # Refactor this to use some other info to
3156
+ # determine if a launch is in progress.
3157
+ cluster_names_with_launch_request = {
3158
+ request.cluster_name for request in requests_lib.get_request_tasks(
3159
+ req_filter=requests_lib.RequestTaskFilter(
3160
+ status=[requests_lib.RequestStatus.RUNNING],
3161
+ include_request_names=['sky.launch'],
3162
+ fields=['cluster_name']))
3163
+ }
3164
+ cluster_names_without_launch_request = (cluster_names -
3165
+ cluster_names_with_launch_request)
3166
+
3167
+ def _refresh_cluster_record(cluster_name):
3168
+ return _refresh_cluster(cluster_name,
3169
+ force_refresh_statuses=set(
3170
+ status_lib.ClusterStatus),
3171
+ include_user_info=False,
3172
+ summary_response=True)
3173
+
3174
+ if len(cluster_names_without_launch_request) > 0:
3175
+ # Do not refresh the clusters that have an active launch request.
3176
+ subprocess_utils.run_in_parallel(_refresh_cluster_record,
3177
+ cluster_names_without_launch_request)
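refresh_cluster_records first removes any cluster with an in-flight sky.launch request and then refreshes the remainder in parallel. A small sketch of that filter-then-fan-out shape with a thread pool; refresh_one and the in-flight set are placeholders:

from concurrent.futures import ThreadPoolExecutor

def refresh_one(cluster_name: str) -> str:
    # Placeholder for the per-cluster status refresh.
    return f'{cluster_name}: refreshed'

all_clusters = {'dev', 'train-1', 'train-2'}
launch_in_progress = {'train-2'}          # e.g. derived from the requests table
to_refresh = sorted(all_clusters - launch_in_progress)

if to_refresh:
    with ThreadPoolExecutor(max_workers=8) as pool:
        for result in pool.map(refresh_one, to_refresh):
            print(result)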
3178
+
3179
+
2517
3180
  def get_clusters(
2518
3181
  refresh: common.StatusRefreshMode,
2519
3182
  cluster_names: Optional[Union[str, List[str]]] = None,
2520
3183
  all_users: bool = True,
3184
+ include_credentials: bool = False,
3185
+ summary_response: bool = False,
3186
+ include_handle: bool = True,
3187
+ # Internal only:
3188
+ # pylint: disable=invalid-name
3189
+ _include_is_managed: bool = False,
2521
3190
  ) -> List[Dict[str, Any]]:
2522
3191
  """Returns a list of cached or optionally refreshed cluster records.
2523
3192
 
@@ -2527,114 +3196,159 @@ def get_clusters(
2527
3196
  of the clusters.
2528
3197
 
2529
3198
  Args:
2530
- include_controller: Whether to include controllers, e.g. jobs controller
2531
- or sky serve controller.
2532
3199
  refresh: Whether to refresh the status of the clusters. (Refreshing will
2533
3200
  set the status to STOPPED if the cluster cannot be pinged.)
2534
- cloud_filter: Sets which clouds to filer through from the global user
2535
- state. Supports three values, 'all' for all clouds, 'public' for
2536
- public clouds only, and 'local' for only local clouds.
2537
3201
  cluster_names: If provided, only return records for the given cluster
2538
3202
  names.
3203
+ all_users: If True, return clusters from all users. If False, only
3204
+ return clusters from the current user.
3205
+ include_credentials: If True, include cluster ssh credentials in the
3206
+ return value.
3207
+ _include_is_managed: Whether to force include clusters created by the
3208
+ controller.
2539
3209
 
2540
3210
  Returns:
2541
3211
  A list of cluster records. If the cluster does not exist or has been
2542
3212
  terminated, the record will be omitted from the returned list.
2543
3213
  """
2544
- records = global_user_state.get_clusters()
3214
+ accessible_workspaces = workspaces_core.get_workspaces()
3215
+ if cluster_names is not None:
3216
+ if isinstance(cluster_names, str):
3217
+ cluster_names = [cluster_names]
3218
+ non_glob_cluster_names = []
3219
+ glob_cluster_names = []
3220
+ for cluster_name in cluster_names:
3221
+ if ux_utils.is_glob_pattern(cluster_name):
3222
+ glob_cluster_names.append(cluster_name)
3223
+ else:
3224
+ non_glob_cluster_names.append(cluster_name)
3225
+ cluster_names = non_glob_cluster_names
3226
+ if glob_cluster_names:
3227
+ cluster_names += _get_glob_clusters(
3228
+ glob_cluster_names,
3229
+ silent=True,
3230
+ workspaces_filter=accessible_workspaces)
3231
+
3232
+ exclude_managed_clusters = False
3233
+ if not (_include_is_managed or env_options.Options.SHOW_DEBUG_INFO.get()):
3234
+ exclude_managed_clusters = True
3235
+ user_hashes_filter = None
2545
3236
  if not all_users:
2546
- current_user_hash = common_utils.get_user_hash()
2547
- records = [
2548
- record for record in records
2549
- if record['user_hash'] == current_user_hash
2550
- ]
3237
+ user_hashes_filter = {common_utils.get_current_user().id}
3238
+ records = global_user_state.get_clusters(
3239
+ exclude_managed_clusters=exclude_managed_clusters,
3240
+ user_hashes_filter=user_hashes_filter,
3241
+ workspaces_filter=accessible_workspaces,
3242
+ cluster_names=cluster_names,
3243
+ summary_response=summary_response)
2551
3244
 
2552
3245
  yellow = colorama.Fore.YELLOW
2553
3246
  bright = colorama.Style.BRIGHT
2554
3247
  reset = colorama.Style.RESET_ALL
2555
3248
 
2556
- def _update_record_with_credentials_and_resources_str(
2557
- record: Optional[Dict[str, Any]]) -> None:
3249
+ if cluster_names is not None:
3250
+ record_names = {record['name'] for record in records}
3251
+ not_found_clusters = ux_utils.get_non_matched_query(
3252
+ cluster_names, record_names)
3253
+ if not_found_clusters:
3254
+ clusters_str = ', '.join(not_found_clusters)
3255
+ logger.info(f'Cluster(s) not found: {bright}{clusters_str}{reset}.')
3256
+
3257
+ def _get_records_with_handle(
3258
+ records: List[Optional[Dict[str, Any]]]) -> List[Dict[str, Any]]:
3259
+ """Filter for records that have a handle"""
3260
+ return [
3261
+ record for record in records
3262
+ if record is not None and record['handle'] is not None
3263
+ ]
3264
+
3265
+ def _update_records_with_handle_info(
3266
+ records: List[Optional[Dict[str, Any]]]) -> None:
3267
+ """Add resource str to record"""
3268
+ for record in _get_records_with_handle(records):
3269
+ handle = record['handle']
3270
+ resource_str_simple, resource_str_full = (
3271
+ resources_utils.get_readable_resources_repr(
3272
+ handle, simplified_only=False))
3273
+ record['resources_str'] = resource_str_simple
3274
+ record['resources_str_full'] = resource_str_full
3275
+ if not summary_response:
3276
+ record['cluster_name_on_cloud'] = handle.cluster_name_on_cloud
3277
+
3278
+ def _update_records_with_credentials(
3279
+ records: List[Optional[Dict[str, Any]]]) -> None:
2558
3280
  """Add the credentials to the record.
2559
3281
 
2560
3282
  This is useful for the client side to setup the ssh config of the
2561
3283
  cluster.
2562
3284
  """
2563
- if record is None:
2564
- return
2565
- handle = record['handle']
2566
- if handle is None:
3285
+ records_with_handle = _get_records_with_handle(records)
3286
+ if len(records_with_handle) == 0:
2567
3287
  return
2568
- record['resources_str'] = resources_utils.get_readable_resources_repr(
2569
- handle)
2570
- credentials = ssh_credential_from_yaml(handle.cluster_yaml,
2571
- handle.docker_user,
2572
- handle.ssh_user)
2573
3288
 
2574
- if not credentials:
2575
- return
2576
- ssh_private_key_path = credentials.get('ssh_private_key', None)
2577
- if ssh_private_key_path is not None:
2578
- with open(os.path.expanduser(ssh_private_key_path),
2579
- 'r',
2580
- encoding='utf-8') as f:
2581
- credentials['ssh_private_key_content'] = f.read()
2582
- else:
2583
- private_key_path, _ = auth.get_or_generate_keys()
2584
- with open(os.path.expanduser(private_key_path),
2585
- 'r',
2586
- encoding='utf-8') as f:
2587
- credentials['ssh_private_key_content'] = f.read()
2588
- record['credentials'] = credentials
2589
-
2590
- if cluster_names is not None:
2591
- if isinstance(cluster_names, str):
2592
- cluster_names = [cluster_names]
2593
- cluster_names = _get_glob_clusters(cluster_names, silent=True)
2594
- new_records = []
2595
- not_exist_cluster_names = []
2596
- for cluster_name in cluster_names:
2597
- for record in records:
2598
- if record['name'] == cluster_name:
2599
- new_records.append(record)
2600
- break
3289
+ handles = [record['handle'] for record in records_with_handle]
3290
+ credentials = ssh_credentials_from_handles(handles)
3291
+ cached_private_keys: Dict[str, str] = {}
3292
+ for record, credential in zip(records_with_handle, credentials):
3293
+ if not credential:
3294
+ continue
3295
+ ssh_private_key_path = credential.get('ssh_private_key', None)
3296
+ if ssh_private_key_path is not None:
3297
+ expanded_private_key_path = os.path.expanduser(
3298
+ ssh_private_key_path)
3299
+ if not os.path.exists(expanded_private_key_path):
3300
+ success = auth_utils.create_ssh_key_files_from_db(
3301
+ ssh_private_key_path)
3302
+ if not success:
3303
+ # If the ssh key files are not found, we do not
3304
+ # update the record with credentials.
3305
+ logger.debug(
3306
+ f'SSH keys not found for cluster {record["name"]} '
3307
+ f'at key path {ssh_private_key_path}')
3308
+ continue
2601
3309
  else:
2602
- not_exist_cluster_names.append(cluster_name)
2603
- if not_exist_cluster_names:
2604
- clusters_str = ', '.join(not_exist_cluster_names)
2605
- logger.info(f'Cluster(s) not found: {bright}{clusters_str}{reset}.')
2606
- records = new_records
2607
-
2608
- def _update_record_with_resources(record: Optional[Dict[str, Any]]) -> None:
3310
+ private_key_path, _ = auth_utils.get_or_generate_keys()
3311
+ expanded_private_key_path = os.path.expanduser(private_key_path)
3312
+ if expanded_private_key_path in cached_private_keys:
3313
+ credential['ssh_private_key_content'] = cached_private_keys[
3314
+ expanded_private_key_path]
3315
+ else:
3316
+ with open(expanded_private_key_path, 'r',
3317
+ encoding='utf-8') as f:
3318
+ credential['ssh_private_key_content'] = f.read()
3319
+ cached_private_keys[expanded_private_key_path] = credential[
3320
+ 'ssh_private_key_content']
3321
+ record['credentials'] = credential
3322
+
3323
+ def _update_records_with_resources(
3324
+ records: List[Optional[Dict[str, Any]]],) -> None:
2609
3325
  """Add the resources to the record."""
2610
- if record is None:
2611
- return
2612
- handle = record['handle']
2613
- if handle is None:
2614
- return
2615
- record['nodes'] = handle.launched_nodes
2616
- if handle.launched_resources is None:
2617
- return
2618
- record['cloud'] = (f'{handle.launched_resources.cloud}'
2619
- if handle.launched_resources.cloud else None)
2620
- record['region'] = (f'{handle.launched_resources.region}'
2621
- if handle.launched_resources.region else None)
2622
- record['cpus'] = (f'{handle.launched_resources.cpus}'
2623
- if handle.launched_resources.cpus else None)
2624
- record['memory'] = (f'{handle.launched_resources.memory}'
2625
- if handle.launched_resources.memory else None)
2626
- record['accelerators'] = (f'{handle.launched_resources.accelerators}'
2627
- if handle.launched_resources.accelerators else
2628
- None)
2629
-
2630
- # Add auth_config to the records
2631
- for record in records:
2632
- _update_record_with_credentials_and_resources_str(record)
2633
-
3326
+ for record in _get_records_with_handle(records):
3327
+ handle = record['handle']
3328
+ record['nodes'] = handle.launched_nodes
3329
+ if handle.launched_resources is None:
3330
+ continue
3331
+ record['cloud'] = (f'{handle.launched_resources.cloud}'
3332
+ if handle.launched_resources.cloud else None)
3333
+ record['region'] = (f'{handle.launched_resources.region}'
3334
+ if handle.launched_resources.region else None)
3335
+ record['cpus'] = (f'{handle.launched_resources.cpus}'
3336
+ if handle.launched_resources.cpus else None)
3337
+ record['memory'] = (f'{handle.launched_resources.memory}'
3338
+ if handle.launched_resources.memory else None)
3339
+ record['accelerators'] = (
3340
+ f'{handle.launched_resources.accelerators}'
3341
+ if handle.launched_resources.accelerators else None)
3342
+ if not include_handle:
3343
+ record.pop('handle', None)
3344
+
3345
+ # Add handle info to the records
3346
+ _update_records_with_handle_info(records)
3347
+ if include_credentials:
3348
+ _update_records_with_credentials(records)
2634
3349
  if refresh == common.StatusRefreshMode.NONE:
2635
3350
  # Add resources to the records
2636
- for record in records:
2637
- _update_record_with_resources(record)
3351
+ _update_records_with_resources(records)
2638
3352
  return records
2639
3353
 
2640
3354
  plural = 's' if len(records) > 1 else ''
@@ -2650,47 +3364,76 @@ def get_clusters(
2650
3364
  else:
2651
3365
  force_refresh_statuses = None
2652
3366
 
2653
- def _refresh_cluster(cluster_name):
2654
- try:
2655
- record = refresh_cluster_record(
2656
- cluster_name,
2657
- force_refresh_statuses=force_refresh_statuses,
2658
- acquire_per_cluster_status_lock=True)
2659
- _update_record_with_credentials_and_resources_str(record)
2660
- except (exceptions.ClusterStatusFetchingError,
2661
- exceptions.CloudUserIdentityError,
2662
- exceptions.ClusterOwnerIdentityMismatchError) as e:
2663
- # Do not fail the entire refresh process. The caller will
2664
- # handle the 'UNKNOWN' status, and collect the errors into
2665
- # a table.
2666
- record = {'status': 'UNKNOWN', 'error': e}
2667
- progress.update(task, advance=1)
3367
+ def _refresh_cluster_record(cluster_name):
3368
+ record = _refresh_cluster(cluster_name,
3369
+ force_refresh_statuses=force_refresh_statuses,
3370
+ include_user_info=True,
3371
+ summary_response=summary_response)
3372
+ # record may be None if the cluster is deleted during refresh,
3373
+ # e.g. all the Pods of a cluster on Kubernetes have been
3374
+ # deleted before refresh.
3375
+ if record is not None and 'error' not in record:
3376
+ _update_records_with_handle_info([record])
3377
+ if include_credentials:
3378
+ _update_records_with_credentials([record])
3379
+ progress.update(task, advance=1)
2668
3380
  return record
2669
3381
 
2670
3382
  cluster_names = [record['name'] for record in records]
3383
+ # TODO(syang): we should try not to leak
3384
+ # request info in backend_utils.py.
3385
+ # Refactor this to use some other info to
3386
+ # determine if a launch is in progress.
3387
+ cluster_names_with_launch_request = {
3388
+ request.cluster_name for request in requests_lib.get_request_tasks(
3389
+ req_filter=requests_lib.RequestTaskFilter(
3390
+ status=[requests_lib.RequestStatus.RUNNING],
3391
+ include_request_names=['sky.launch'],
3392
+ cluster_names=cluster_names,
3393
+ fields=['cluster_name']))
3394
+ }
3395
+ # Preserve the index of the cluster name as it appears on "records"
3396
+ cluster_names_without_launch_request = [
3397
+ (i, cluster_name)
3398
+ for i, cluster_name in enumerate(cluster_names)
3399
+ if cluster_name not in cluster_names_with_launch_request
3400
+ ]
3401
+ # for clusters that have an active launch request, we do not refresh the status
2671
3402
  updated_records = []
2672
- if len(cluster_names) > 0:
3403
+ if len(cluster_names_without_launch_request) > 0:
2673
3404
  with progress:
2674
3405
  updated_records = subprocess_utils.run_in_parallel(
2675
- _refresh_cluster, cluster_names)
2676
-
3406
+ _refresh_cluster_record, [
3407
+ cluster_name
3408
+ for _, cluster_name in cluster_names_without_launch_request
3409
+ ])
3410
+ # Preserve the index of the cluster name as it appears in "records"
3411
+ # before filtering for clusters being launched.
3412
+ updated_records_dict: Dict[int, Optional[Dict[str, Any]]] = {
3413
+ cluster_names_without_launch_request[i][0]: updated_records[i]
3414
+ for i in range(len(cluster_names_without_launch_request))
3415
+ }
2677
3416
  # Show information for removed clusters.
2678
3417
  kept_records = []
2679
3418
  autodown_clusters, remaining_clusters, failed_clusters = [], [], []
2680
3419
  for i, record in enumerate(records):
2681
- if updated_records[i] is None:
3420
+ if i not in updated_records_dict:
3421
+ # record was not refreshed, keep the original record
3422
+ kept_records.append(record)
3423
+ continue
3424
+ updated_record = updated_records_dict[i]
3425
+ if updated_record is None:
2682
3426
  if record['to_down']:
2683
- autodown_clusters.append(cluster_names[i])
3427
+ autodown_clusters.append(record['name'])
2684
3428
  else:
2685
- remaining_clusters.append(cluster_names[i])
2686
- elif updated_records[i]['status'] == 'UNKNOWN':
2687
- failed_clusters.append(
2688
- (cluster_names[i], updated_records[i]['error']))
3429
+ remaining_clusters.append(record['name'])
3430
+ elif updated_record['status'] == 'UNKNOWN':
3431
+ failed_clusters.append((record['name'], updated_record['error']))
2689
3432
  # Keep the original record if the status is unknown,
2690
3433
  # so that the user can still see the cluster.
2691
3434
  kept_records.append(record)
2692
3435
  else:
2693
- kept_records.append(updated_records[i])
3436
+ kept_records.append(updated_record)
2694
3437
 
2695
3438
  if autodown_clusters:
2696
3439
  plural = 's' if len(autodown_clusters) > 1 else ''
@@ -2711,8 +3454,7 @@ def get_clusters(
2711
3454
  logger.warning(f' {bright}{cluster_name}{reset}: {e}')
2712
3455
 
2713
3456
  # Add resources to the records
2714
- for record in kept_records:
2715
- _update_record_with_resources(record)
3457
+ _update_records_with_resources(kept_records)
2716
3458
  return kept_records
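A hedged usage sketch of the updated `get_clusters` signature, using only the parameters documented above. The cluster name is a placeholder, the `StatusRefreshMode` import location is assumed, and the `FORCE` member is assumed (only `NONE` appears in the code above):

    from sky.backends import backend_utils
    from sky.utils import common  # assumed home of StatusRefreshMode

    # Cached records for the current user only, without heavyweight fields.
    records = backend_utils.get_clusters(
        refresh=common.StatusRefreshMode.NONE,
        all_users=False,
        summary_response=True)

    # Refresh a specific (placeholder) cluster and include SSH credentials so
    # a client could write an ssh config entry for it.
    detailed = backend_utils.get_clusters(
        refresh=common.StatusRefreshMode.FORCE,  # FORCE is assumed
        cluster_names=['my-cluster'],
        include_credentials=True)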
2717
3459
 
2718
3460
 
@@ -2799,6 +3541,7 @@ def get_task_resources_str(task: 'task_lib.Task',
2799
3541
  if is_managed_job:
2800
3542
  if task.best_resources.use_spot:
2801
3543
  spot_str = '[Spot]'
3544
+ assert task.best_resources.cpus is not None
2802
3545
  task_cpu_demand = task.best_resources.cpus
2803
3546
  if accelerator_dict is None:
2804
3547
  resources_str = f'CPU:{task_cpu_demand}'
@@ -2891,13 +3634,8 @@ def check_stale_runtime_on_remote(returncode: int, stderr: str,
2891
3634
  `stderr`. Typically due to the local client version just got updated, and
2892
3635
  the remote runtime is an older version.
2893
3636
  """
2894
- pattern = re.compile(r'AttributeError: module \'sky\.(.*)\' has no '
2895
- r'attribute \'(.*)\'')
2896
3637
  if returncode != 0:
2897
- # TODO(zhwu): Backward compatibility for old SkyPilot runtime version on
2898
- # the remote cluster. Remove this after 0.10.0 is released.
2899
- attribute_error = re.findall(pattern, stderr)
2900
- if attribute_error or 'SkyPilot runtime is too old' in stderr:
3638
+ if 'SkyPilot runtime is too old' in stderr:
2901
3639
  with ux_utils.print_exception_no_traceback():
2902
3640
  raise RuntimeError(
2903
3641
  f'{colorama.Fore.RED}SkyPilot runtime needs to be updated '
@@ -2943,7 +3681,8 @@ def get_endpoints(cluster: str,
2943
3681
  with ux_utils.print_exception_no_traceback():
2944
3682
  raise ValueError(f'Invalid endpoint {port!r}.') from None
2945
3683
  cluster_records = get_clusters(refresh=common.StatusRefreshMode.NONE,
2946
- cluster_names=[cluster])
3684
+ cluster_names=[cluster],
3685
+ _include_is_managed=True)
2947
3686
  if not cluster_records:
2948
3687
  with ux_utils.print_exception_no_traceback():
2949
3688
  raise exceptions.ClusterNotUpError(
@@ -2965,7 +3704,7 @@ def get_endpoints(cluster: str,
2965
3704
  f'for cluster {cluster!r} with backend '
2966
3705
  f'{get_backend_from_handle(handle).NAME}.')
2967
3706
 
2968
- launched_resources = handle.launched_resources
3707
+ launched_resources = handle.launched_resources.assert_launchable()
2969
3708
  cloud = launched_resources.cloud
2970
3709
  try:
2971
3710
  cloud.check_features_are_supported(
@@ -2975,18 +3714,18 @@ def get_endpoints(cluster: str,
2975
3714
  raise ValueError('Querying endpoints is not supported '
2976
3715
  f'for {cluster!r} on {cloud}.') from None
2977
3716
 
2978
- config = common_utils.read_yaml(handle.cluster_yaml)
3717
+ config = global_user_state.get_cluster_yaml_dict(handle.cluster_yaml)
2979
3718
  port_details = provision_lib.query_ports(repr(cloud),
2980
3719
  handle.cluster_name_on_cloud,
2981
3720
  handle.launched_resources.ports,
2982
3721
  head_ip=handle.head_ip,
2983
3722
  provider_config=config['provider'])
2984
3723
 
3724
+ launched_resources = handle.launched_resources.assert_launchable()
2985
3725
  # Validation before returning the endpoints
2986
3726
  if port is not None:
2987
3727
  # If the requested endpoint was not to be exposed
2988
- port_set = resources_utils.port_ranges_to_set(
2989
- handle.launched_resources.ports)
3728
+ port_set = resources_utils.port_ranges_to_set(launched_resources.ports)
2990
3729
  if port not in port_set:
2991
3730
  logger.warning(f'Port {port} is not exposed on '
2992
3731
  f'cluster {cluster!r}.')
@@ -2995,17 +3734,17 @@ def get_endpoints(cluster: str,
2995
3734
  if port not in port_details:
2996
3735
  error_msg = (f'Port {port} not exposed yet. '
2997
3736
  f'{_ENDPOINTS_RETRY_MESSAGE} ')
2998
- if handle.launched_resources.cloud.is_same_cloud(
2999
- clouds.Kubernetes()):
3737
+ if launched_resources.cloud.is_same_cloud(clouds.Kubernetes()):
3000
3738
  # Add Kubernetes specific debugging info
3001
- error_msg += (kubernetes_utils.get_endpoint_debug_message())
3739
+ error_msg += kubernetes_utils.get_endpoint_debug_message(
3740
+ launched_resources.region)
3002
3741
  logger.warning(error_msg)
3003
3742
  return {}
3004
3743
  return {port: port_details[port][0].url()}
3005
3744
  else:
3006
3745
  if not port_details:
3007
3746
  # If cluster had no ports to be exposed
3008
- if handle.launched_resources.ports is None:
3747
+ if launched_resources.ports is None:
3009
3748
  logger.warning(f'Cluster {cluster!r} does not have any '
3010
3749
  'ports to be exposed.')
3011
3750
  return {}
@@ -3014,13 +3753,200 @@ def get_endpoints(cluster: str,
3014
3753
  else:
3015
3754
  error_msg = (f'No endpoints exposed yet. '
3016
3755
  f'{_ENDPOINTS_RETRY_MESSAGE} ')
3017
- if handle.launched_resources.cloud.is_same_cloud(
3018
- clouds.Kubernetes()):
3756
+ if launched_resources.cloud.is_same_cloud(clouds.Kubernetes()):
3019
3757
  # Add Kubernetes specific debugging info
3020
- error_msg += \
3021
- kubernetes_utils.get_endpoint_debug_message()
3758
+ error_msg += kubernetes_utils.get_endpoint_debug_message(
3759
+ launched_resources.region)
3022
3760
  logger.warning(error_msg)
3023
3761
  return {}
3024
3762
  return {
3025
3763
  port_num: urls[0].url() for port_num, urls in port_details.items()
3026
3764
  }
3765
+
3766
+
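A minimal usage sketch for `get_endpoints`; the cluster name and port are placeholders, and the return shapes ({port: url} for a single port, {} when nothing is exposed yet) follow directly from the code above:

    from sky.backends import backend_utils

    endpoints = backend_utils.get_endpoints('my-cluster', port=8080)
    if endpoints:
        print(f'Port 8080 is reachable at {endpoints[8080]}')
    else:
        # Empty dict: the port is not exposed (yet); retry later.
        print('Endpoint not ready yet.')

    # Without a port, every exposed endpoint is returned as {port: url}.
    all_endpoints = backend_utils.get_endpoints('my-cluster')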
3767
+ def cluster_status_lock_id(cluster_name: str) -> str:
3768
+ """Get the lock ID for cluster status operations."""
3769
+ return f'{cluster_name}_status'
3770
+
3771
+
3772
+ def cluster_file_mounts_lock_id(cluster_name: str) -> str:
3773
+ """Get the lock ID for cluster file mounts operations."""
3774
+ return f'{cluster_name}_file_mounts'
3775
+
3776
+
3777
+ def workspace_lock_id(workspace_name: str) -> str:
3778
+ """Get the lock ID for workspace operations."""
3779
+ return f'{workspace_name}_workspace'
3780
+
3781
+
3782
+ def cluster_tunnel_lock_id(cluster_name: str) -> str:
3783
+ """Get the lock ID for cluster tunnel operations."""
3784
+ return f'{cluster_name}_ssh_tunnel'
3785
+
3786
+
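The helpers above only compose lock identifiers. As an illustration of how such an ID could back an actual lock, here is a sketch using the `filelock` package; the lock-file location and the choice of `filelock` itself are assumptions, not necessarily what the codebase does:

    import os
    import tempfile

    import filelock  # illustrative dependency only

    def cluster_status_lock_id(cluster_name: str) -> str:
        return f'{cluster_name}_status'

    def _lock_path(lock_id: str) -> str:
        # Hypothetical location for lock files.
        return os.path.join(tempfile.gettempdir(), f'{lock_id}.lock')

    with filelock.FileLock(_lock_path(cluster_status_lock_id('my-cluster')),
                           timeout=10):
        pass  # ... read or update the cluster's status record ...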
3787
+ def open_ssh_tunnel(head_runner: Union[command_runner.SSHCommandRunner,
3788
+ command_runner.KubernetesCommandRunner],
3789
+ port_forward: Tuple[int, int]) -> subprocess.Popen:
3790
+ local_port, remote_port = port_forward
3791
+ if isinstance(head_runner, command_runner.SSHCommandRunner):
3792
+ # Disabling ControlMaster makes things easier to reason about
3793
+ # with respect to resource management/ownership,
3794
+ # as killing the process will close the tunnel too.
3795
+ head_runner.disable_control_master = True
3796
+ head_runner.port_forward_execute_remote_command = True
3797
+
3798
+ # The default connect_timeout of 1s is too short for
3799
+ # connecting to clusters using a jump server.
3800
+ # We use NON_INTERACTIVE mode to avoid allocating a pseudo-tty,
3801
+ # which is counted towards non-idleness.
3802
+ cmd: List[str] = head_runner.port_forward_command(
3803
+ [(local_port, remote_port)],
3804
+ connect_timeout=5,
3805
+ ssh_mode=command_runner.SshMode.NON_INTERACTIVE)
3806
+ if isinstance(head_runner, command_runner.SSHCommandRunner):
3807
+ # cat so the command doesn't exit until we kill it
3808
+ cmd += [f'"echo {_ACK_MESSAGE} && cat"']
3809
+ cmd_str = ' '.join(cmd)
3810
+ logger.debug(f'Running port forward command: {cmd_str}')
3811
+ ssh_tunnel_proc = subprocess.Popen(cmd_str,
3812
+ shell=True,
3813
+ stdin=subprocess.PIPE,
3814
+ stdout=subprocess.PIPE,
3815
+ stderr=subprocess.PIPE,
3816
+ start_new_session=True,
3817
+ text=True)
3818
+ # Wait until we receive an ack from the remote cluster or
3819
+ # the SSH connection times out.
3820
+ queue: queue_lib.Queue = queue_lib.Queue()
3821
+ stdout_thread = threading.Thread(
3822
+ target=lambda queue, stdout: queue.put(stdout.readline()),
3823
+ args=(queue, ssh_tunnel_proc.stdout),
3824
+ daemon=True)
3825
+ stdout_thread.start()
3826
+ while ssh_tunnel_proc.poll() is None:
3827
+ try:
3828
+ ack = queue.get_nowait()
3829
+ except queue_lib.Empty:
3830
+ ack = None
3831
+ time.sleep(0.1)
3832
+ continue
3833
+ assert ack is not None
3834
+ if isinstance(
3835
+ head_runner,
3836
+ command_runner.SSHCommandRunner) and ack == f'{_ACK_MESSAGE}\n':
3837
+ break
3838
+ elif isinstance(head_runner, command_runner.KubernetesCommandRunner
3839
+ ) and _FORWARDING_FROM_MESSAGE in ack:
3840
+ # On kind clusters, this error occurs if we make a request
3841
+ # immediately after the port-forward is established on a new pod:
3842
+ # "Unhandled Error" err="an error occurred forwarding ... -> 46590:
3843
+ # failed to execute portforward in network namespace
3844
+ # "/var/run/netns/cni-...": failed to connect to localhost:46590
3845
+ # inside namespace "...", IPv4: dial tcp4 127.0.0.1:46590:
3846
+ # connect: connection refused
3847
+ # So we need to poll the port on the pod to check if it is open.
3848
+ # We did not observe this with real Kubernetes clusters.
3849
+ timeout = 5
3850
+ port_check_cmd = (
3851
+ # We install netcat in our ray-node container,
3852
+ # so we can use it here.
3853
+ # (See kubernetes-ray.yml.j2)
3854
+ f'end=$((SECONDS+{timeout})); '
3855
+ f'while ! nc -z -w 1 localhost {remote_port}; do '
3856
+ 'if (( SECONDS >= end )); then exit 1; fi; '
3857
+ 'sleep 0.1; '
3858
+ 'done')
3859
+ returncode, stdout, stderr = head_runner.run(port_check_cmd,
3860
+ require_outputs=True,
3861
+ stream_logs=False)
3862
+ if returncode != 0:
3863
+ try:
3864
+ ssh_tunnel_proc.terminate()
3865
+ ssh_tunnel_proc.wait(timeout=5)
3866
+ except subprocess.TimeoutExpired:
3867
+ ssh_tunnel_proc.kill()
3868
+ ssh_tunnel_proc.wait()
3869
+ finally:
3870
+ error_msg = f'Failed to check remote port {remote_port}'
3871
+ if stdout:
3872
+ error_msg += f'\n-- stdout --\n{stdout}\n'
3873
+ raise exceptions.CommandError(returncode=returncode,
3874
+ command=cmd_str,
3875
+ error_msg=error_msg,
3876
+ detailed_reason=stderr)
3877
+ break
3878
+
3879
+ if ssh_tunnel_proc.poll() is not None:
3880
+ stdout, stderr = ssh_tunnel_proc.communicate()
3881
+ error_msg = 'Port forward failed'
3882
+ if stdout:
3883
+ error_msg += f'\n-- stdout --\n{stdout}\n'
3884
+ raise exceptions.CommandError(returncode=ssh_tunnel_proc.returncode,
3885
+ command=cmd_str,
3886
+ error_msg=error_msg,
3887
+ detailed_reason=stderr)
3888
+ return ssh_tunnel_proc
3889
+
3890
+
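A sketch of how a caller might use `open_ssh_tunnel` and guarantee the tunnel process is torn down; the runner is assumed to be obtained elsewhere (e.g. from the cluster handle), and only the signature shown above is relied on:

    import subprocess

    def with_tunnel(head_runner, local_port: int, remote_port: int) -> None:
        """Open a tunnel, use it, and always clean up the child process."""
        tunnel: subprocess.Popen = open_ssh_tunnel(
            head_runner, port_forward=(local_port, remote_port))
        try:
            pass  # ... talk to localhost:<local_port> here ...
        finally:
            tunnel.terminate()
            try:
                tunnel.wait(timeout=5)
            except subprocess.TimeoutExpired:
                tunnel.kill()
                tunnel.wait()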
3891
+ T = TypeVar('T')
3892
+
3893
+
3894
+ def invoke_skylet_with_retries(func: Callable[..., T]) -> T:
3895
+ """Generic helper for making Skylet gRPC requests.
3896
+
3897
+ This helper handles the common pattern of:
3898
+ 1. Try the gRPC request
3899
+ 2. If SSH tunnel is closed, recreate it and retry
3900
+ """
3901
+ max_attempts = 5
3902
+ backoff = common_utils.Backoff(initial_backoff=0.5)
3903
+ last_exception: Optional[Exception] = None
3904
+
3905
+ for _ in range(max_attempts):
3906
+ try:
3907
+ return func()
3908
+ except grpc.RpcError as e:
3909
+ last_exception = e
3910
+ _handle_grpc_error(e, backoff.current_backoff())
3911
+
3912
+ raise RuntimeError(
3913
+ f'Failed to invoke Skylet after {max_attempts} attempts: {last_exception}'
3914
+ ) from last_exception
3915
+
3916
+
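The retry helper above expects a zero-argument callable, so a caller typically closes over a stub invocation with a lambda. A hedged sketch; the channel address, stub, and request names are hypothetical, since the real Skylet service definitions are not part of this diff:

    import grpc

    def query_skylet(address: str):
        channel = grpc.insecure_channel(address)
        stub = skylet_pb2_grpc.SkyletServiceStub(channel)   # hypothetical stub
        request = skylet_pb2.GetJobStatusRequest()          # hypothetical message
        return invoke_skylet_with_retries(
            lambda: stub.GetJobStatus(request, timeout=10))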
3917
+ def invoke_skylet_streaming_with_retries(
3918
+ stream_func: Callable[..., Iterator[T]]) -> Iterator[T]:
3919
+ """Generic helper for making Skylet streaming gRPC requests."""
3920
+ max_attempts = 3
3921
+ backoff = common_utils.Backoff(initial_backoff=0.5)
3922
+ last_exception: Optional[Exception] = None
3923
+
3924
+ for _ in range(max_attempts):
3925
+ try:
3926
+ for response in stream_func():
3927
+ yield response
3928
+ return
3929
+ except grpc.RpcError as e:
3930
+ last_exception = e
3931
+ _handle_grpc_error(e, backoff.current_backoff())
3932
+
3933
+ raise RuntimeError(
3934
+ f'Failed to stream Skylet response after {max_attempts} attempts'
3935
+ ) from last_exception
3936
+
3937
+
3938
+ def _handle_grpc_error(e: 'grpc.RpcError', current_backoff: float) -> None:
3939
+ if e.code() == grpc.StatusCode.INTERNAL:
3940
+ with ux_utils.print_exception_no_traceback():
3941
+ raise exceptions.SkyletInternalError(e.details())
3942
+ elif e.code() == grpc.StatusCode.UNAVAILABLE:
3943
+ time.sleep(current_backoff)
3944
+ elif e.code() == grpc.StatusCode.UNIMPLEMENTED or e.code(
3945
+ ) == grpc.StatusCode.UNKNOWN:
3946
+ # Handle backwards compatibility: old server doesn't implement this RPC.
3947
+ # Let the caller fall back to legacy execution.
3948
+ raise exceptions.SkyletMethodNotImplementedError(
3949
+ f'gRPC method not implemented on server, falling back to legacy execution: {e.details()}'
3950
+ )
3951
+ else:
3952
+ raise e
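The UNIMPLEMENTED/UNKNOWN branch is what allows callers to keep working against an older remote runtime. A hedged sketch of the caller-side fallback it enables; `_grpc_execute` and `_legacy_execute` are hypothetical placeholders:

    from sky import exceptions

    def _grpc_execute() -> str:  # hypothetical gRPC path (placeholder body)
        raise NotImplementedError

    def _legacy_execute() -> str:  # hypothetical legacy (SSH command) path
        return 'ran via legacy execution'

    def run_on_cluster() -> str:
        try:
            return invoke_skylet_with_retries(_grpc_execute)
        except exceptions.SkyletMethodNotImplementedError:
            # Old remote runtime without this RPC: fall back to the legacy path.
            return _legacy_execute()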