skypilot-nightly 1.0.0.dev20250509__py3-none-any.whl → 1.0.0.dev20251107__py3-none-any.whl

This diff shows the changes between publicly released versions of this package, as published to one of the supported registries. The information is provided for informational purposes only and reflects the package contents as they appear in those public registries.

Files changed (512)
  1. sky/__init__.py +22 -6
  2. sky/adaptors/aws.py +25 -7
  3. sky/adaptors/common.py +24 -1
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/hyperbolic.py +8 -0
  7. sky/adaptors/kubernetes.py +149 -18
  8. sky/adaptors/nebius.py +170 -17
  9. sky/adaptors/primeintellect.py +1 -0
  10. sky/adaptors/runpod.py +68 -0
  11. sky/adaptors/seeweb.py +167 -0
  12. sky/adaptors/shadeform.py +89 -0
  13. sky/admin_policy.py +187 -4
  14. sky/authentication.py +179 -225
  15. sky/backends/__init__.py +4 -2
  16. sky/backends/backend.py +22 -9
  17. sky/backends/backend_utils.py +1299 -380
  18. sky/backends/cloud_vm_ray_backend.py +1715 -518
  19. sky/backends/docker_utils.py +1 -1
  20. sky/backends/local_docker_backend.py +11 -6
  21. sky/backends/wheel_utils.py +37 -9
  22. sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
  23. sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
  24. sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
  25. sky/{clouds/service_catalog → catalog}/common.py +89 -48
  26. sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
  27. sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
  28. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +30 -40
  29. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
  30. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +42 -15
  31. sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
  32. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
  33. sky/catalog/data_fetchers/fetch_nebius.py +335 -0
  34. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  35. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  36. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  37. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
  38. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
  39. sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
  40. sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
  41. sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
  42. sky/catalog/hyperbolic_catalog.py +136 -0
  43. sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
  44. sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
  45. sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
  46. sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
  47. sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
  48. sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
  49. sky/catalog/primeintellect_catalog.py +95 -0
  50. sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
  51. sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
  52. sky/catalog/seeweb_catalog.py +184 -0
  53. sky/catalog/shadeform_catalog.py +165 -0
  54. sky/catalog/ssh_catalog.py +167 -0
  55. sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
  56. sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
  57. sky/check.py +491 -203
  58. sky/cli.py +5 -6005
  59. sky/client/{cli.py → cli/command.py} +2477 -1885
  60. sky/client/cli/deprecation_utils.py +99 -0
  61. sky/client/cli/flags.py +359 -0
  62. sky/client/cli/table_utils.py +320 -0
  63. sky/client/common.py +70 -32
  64. sky/client/oauth.py +82 -0
  65. sky/client/sdk.py +1203 -297
  66. sky/client/sdk_async.py +833 -0
  67. sky/client/service_account_auth.py +47 -0
  68. sky/cloud_stores.py +73 -0
  69. sky/clouds/__init__.py +13 -0
  70. sky/clouds/aws.py +358 -93
  71. sky/clouds/azure.py +105 -83
  72. sky/clouds/cloud.py +127 -36
  73. sky/clouds/cudo.py +68 -50
  74. sky/clouds/do.py +66 -48
  75. sky/clouds/fluidstack.py +63 -44
  76. sky/clouds/gcp.py +339 -110
  77. sky/clouds/hyperbolic.py +293 -0
  78. sky/clouds/ibm.py +70 -49
  79. sky/clouds/kubernetes.py +563 -162
  80. sky/clouds/lambda_cloud.py +74 -54
  81. sky/clouds/nebius.py +206 -80
  82. sky/clouds/oci.py +88 -66
  83. sky/clouds/paperspace.py +61 -44
  84. sky/clouds/primeintellect.py +317 -0
  85. sky/clouds/runpod.py +164 -74
  86. sky/clouds/scp.py +89 -83
  87. sky/clouds/seeweb.py +466 -0
  88. sky/clouds/shadeform.py +400 -0
  89. sky/clouds/ssh.py +263 -0
  90. sky/clouds/utils/aws_utils.py +10 -4
  91. sky/clouds/utils/gcp_utils.py +87 -11
  92. sky/clouds/utils/oci_utils.py +38 -14
  93. sky/clouds/utils/scp_utils.py +177 -124
  94. sky/clouds/vast.py +99 -77
  95. sky/clouds/vsphere.py +51 -40
  96. sky/core.py +349 -139
  97. sky/dag.py +15 -0
  98. sky/dashboard/out/404.html +1 -1
  99. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  100. sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
  101. sky/dashboard/out/_next/static/chunks/1871-74503c8e80fd253b.js +6 -0
  102. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  103. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
  104. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  105. sky/dashboard/out/_next/static/chunks/2755.fff53c4a3fcae910.js +26 -0
  106. sky/dashboard/out/_next/static/chunks/3294.72362fa129305b19.js +1 -0
  107. sky/dashboard/out/_next/static/chunks/3785.ad6adaa2a0fa9768.js +1 -0
  108. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
  109. sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
  110. sky/dashboard/out/_next/static/chunks/4725.a830b5c9e7867c92.js +1 -0
  111. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  112. sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
  113. sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
  114. sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
  115. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  116. sky/dashboard/out/_next/static/chunks/6601-06114c982db410b6.js +1 -0
  117. sky/dashboard/out/_next/static/chunks/6856-ef8ba11f96d8c4a3.js +1 -0
  118. sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
  119. sky/dashboard/out/_next/static/chunks/6990-32b6e2d3822301fa.js +1 -0
  120. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  121. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
  122. sky/dashboard/out/_next/static/chunks/7615-3301e838e5f25772.js +1 -0
  123. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  124. sky/dashboard/out/_next/static/chunks/8969-1e4613c651bf4051.js +1 -0
  125. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  126. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  127. sky/dashboard/out/_next/static/chunks/9360.7310982cf5a0dc79.js +31 -0
  128. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  129. sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
  130. sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
  131. sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
  132. sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
  133. sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
  134. sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
  135. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-c736ead69c2d86ec.js +16 -0
  136. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a37d2063af475a1c.js +1 -0
  137. sky/dashboard/out/_next/static/chunks/pages/clusters-d44859594e6f8064.js +1 -0
  138. sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
  139. sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
  140. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
  141. sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
  142. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-5796e8d6aea291a0.js +16 -0
  143. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-6edeb7d06032adfc.js +21 -0
  144. sky/dashboard/out/_next/static/chunks/pages/jobs-479dde13399cf270.js +1 -0
  145. sky/dashboard/out/_next/static/chunks/pages/users-5ab3b907622cf0fe.js +1 -0
  146. sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
  147. sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
  148. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-c5a3eeee1c218af1.js +1 -0
  149. sky/dashboard/out/_next/static/chunks/pages/workspaces-22b23febb3e89ce1.js +1 -0
  150. sky/dashboard/out/_next/static/chunks/webpack-2679be77fc08a2f8.js +1 -0
  151. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  152. sky/dashboard/out/_next/static/zB0ed6ge_W1MDszVHhijS/_buildManifest.js +1 -0
  153. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  154. sky/dashboard/out/clusters/[cluster].html +1 -1
  155. sky/dashboard/out/clusters.html +1 -1
  156. sky/dashboard/out/config.html +1 -0
  157. sky/dashboard/out/index.html +1 -1
  158. sky/dashboard/out/infra/[context].html +1 -0
  159. sky/dashboard/out/infra.html +1 -0
  160. sky/dashboard/out/jobs/[job].html +1 -1
  161. sky/dashboard/out/jobs/pools/[pool].html +1 -0
  162. sky/dashboard/out/jobs.html +1 -1
  163. sky/dashboard/out/users.html +1 -0
  164. sky/dashboard/out/volumes.html +1 -0
  165. sky/dashboard/out/workspace/new.html +1 -0
  166. sky/dashboard/out/workspaces/[name].html +1 -0
  167. sky/dashboard/out/workspaces.html +1 -0
  168. sky/data/data_utils.py +137 -1
  169. sky/data/mounting_utils.py +269 -84
  170. sky/data/storage.py +1451 -1807
  171. sky/data/storage_utils.py +43 -57
  172. sky/exceptions.py +132 -2
  173. sky/execution.py +206 -63
  174. sky/global_user_state.py +2374 -586
  175. sky/jobs/__init__.py +5 -0
  176. sky/jobs/client/sdk.py +242 -65
  177. sky/jobs/client/sdk_async.py +143 -0
  178. sky/jobs/constants.py +9 -8
  179. sky/jobs/controller.py +839 -277
  180. sky/jobs/file_content_utils.py +80 -0
  181. sky/jobs/log_gc.py +201 -0
  182. sky/jobs/recovery_strategy.py +398 -152
  183. sky/jobs/scheduler.py +315 -189
  184. sky/jobs/server/core.py +829 -255
  185. sky/jobs/server/server.py +156 -115
  186. sky/jobs/server/utils.py +136 -0
  187. sky/jobs/state.py +2092 -701
  188. sky/jobs/utils.py +1242 -160
  189. sky/logs/__init__.py +21 -0
  190. sky/logs/agent.py +108 -0
  191. sky/logs/aws.py +243 -0
  192. sky/logs/gcp.py +91 -0
  193. sky/metrics/__init__.py +0 -0
  194. sky/metrics/utils.py +443 -0
  195. sky/models.py +78 -1
  196. sky/optimizer.py +164 -70
  197. sky/provision/__init__.py +90 -4
  198. sky/provision/aws/config.py +147 -26
  199. sky/provision/aws/instance.py +135 -50
  200. sky/provision/azure/instance.py +10 -5
  201. sky/provision/common.py +13 -1
  202. sky/provision/cudo/cudo_machine_type.py +1 -1
  203. sky/provision/cudo/cudo_utils.py +14 -8
  204. sky/provision/cudo/cudo_wrapper.py +72 -71
  205. sky/provision/cudo/instance.py +10 -6
  206. sky/provision/do/instance.py +10 -6
  207. sky/provision/do/utils.py +4 -3
  208. sky/provision/docker_utils.py +114 -23
  209. sky/provision/fluidstack/instance.py +13 -8
  210. sky/provision/gcp/__init__.py +1 -0
  211. sky/provision/gcp/config.py +301 -19
  212. sky/provision/gcp/constants.py +218 -0
  213. sky/provision/gcp/instance.py +36 -8
  214. sky/provision/gcp/instance_utils.py +18 -4
  215. sky/provision/gcp/volume_utils.py +247 -0
  216. sky/provision/hyperbolic/__init__.py +12 -0
  217. sky/provision/hyperbolic/config.py +10 -0
  218. sky/provision/hyperbolic/instance.py +437 -0
  219. sky/provision/hyperbolic/utils.py +373 -0
  220. sky/provision/instance_setup.py +93 -14
  221. sky/provision/kubernetes/__init__.py +5 -0
  222. sky/provision/kubernetes/config.py +9 -52
  223. sky/provision/kubernetes/constants.py +17 -0
  224. sky/provision/kubernetes/instance.py +789 -247
  225. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  226. sky/provision/kubernetes/network.py +27 -17
  227. sky/provision/kubernetes/network_utils.py +40 -43
  228. sky/provision/kubernetes/utils.py +1192 -531
  229. sky/provision/kubernetes/volume.py +282 -0
  230. sky/provision/lambda_cloud/instance.py +22 -16
  231. sky/provision/nebius/constants.py +50 -0
  232. sky/provision/nebius/instance.py +19 -6
  233. sky/provision/nebius/utils.py +196 -91
  234. sky/provision/oci/instance.py +10 -5
  235. sky/provision/paperspace/instance.py +10 -7
  236. sky/provision/paperspace/utils.py +1 -1
  237. sky/provision/primeintellect/__init__.py +10 -0
  238. sky/provision/primeintellect/config.py +11 -0
  239. sky/provision/primeintellect/instance.py +454 -0
  240. sky/provision/primeintellect/utils.py +398 -0
  241. sky/provision/provisioner.py +110 -36
  242. sky/provision/runpod/__init__.py +5 -0
  243. sky/provision/runpod/instance.py +27 -6
  244. sky/provision/runpod/utils.py +51 -18
  245. sky/provision/runpod/volume.py +180 -0
  246. sky/provision/scp/__init__.py +15 -0
  247. sky/provision/scp/config.py +93 -0
  248. sky/provision/scp/instance.py +531 -0
  249. sky/provision/seeweb/__init__.py +11 -0
  250. sky/provision/seeweb/config.py +13 -0
  251. sky/provision/seeweb/instance.py +807 -0
  252. sky/provision/shadeform/__init__.py +11 -0
  253. sky/provision/shadeform/config.py +12 -0
  254. sky/provision/shadeform/instance.py +351 -0
  255. sky/provision/shadeform/shadeform_utils.py +83 -0
  256. sky/provision/ssh/__init__.py +18 -0
  257. sky/provision/vast/instance.py +13 -8
  258. sky/provision/vast/utils.py +10 -7
  259. sky/provision/vsphere/common/vim_utils.py +1 -2
  260. sky/provision/vsphere/instance.py +15 -10
  261. sky/provision/vsphere/vsphere_utils.py +9 -19
  262. sky/py.typed +0 -0
  263. sky/resources.py +844 -118
  264. sky/schemas/__init__.py +0 -0
  265. sky/schemas/api/__init__.py +0 -0
  266. sky/schemas/api/responses.py +225 -0
  267. sky/schemas/db/README +4 -0
  268. sky/schemas/db/env.py +90 -0
  269. sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
  270. sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
  271. sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
  272. sky/schemas/db/global_user_state/004_is_managed.py +34 -0
  273. sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
  274. sky/schemas/db/global_user_state/006_provision_log.py +41 -0
  275. sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
  276. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  277. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  278. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  279. sky/schemas/db/script.py.mako +28 -0
  280. sky/schemas/db/serve_state/001_initial_schema.py +67 -0
  281. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  282. sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
  283. sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
  284. sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
  285. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  286. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  287. sky/schemas/generated/__init__.py +0 -0
  288. sky/schemas/generated/autostopv1_pb2.py +36 -0
  289. sky/schemas/generated/autostopv1_pb2.pyi +43 -0
  290. sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
  291. sky/schemas/generated/jobsv1_pb2.py +86 -0
  292. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  293. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  294. sky/schemas/generated/managed_jobsv1_pb2.py +74 -0
  295. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  296. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  297. sky/schemas/generated/servev1_pb2.py +58 -0
  298. sky/schemas/generated/servev1_pb2.pyi +115 -0
  299. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  300. sky/serve/autoscalers.py +357 -5
  301. sky/serve/client/impl.py +310 -0
  302. sky/serve/client/sdk.py +47 -139
  303. sky/serve/client/sdk_async.py +130 -0
  304. sky/serve/constants.py +10 -8
  305. sky/serve/controller.py +64 -19
  306. sky/serve/load_balancer.py +106 -60
  307. sky/serve/load_balancing_policies.py +115 -1
  308. sky/serve/replica_managers.py +273 -162
  309. sky/serve/serve_rpc_utils.py +179 -0
  310. sky/serve/serve_state.py +554 -251
  311. sky/serve/serve_utils.py +733 -220
  312. sky/serve/server/core.py +66 -711
  313. sky/serve/server/impl.py +1093 -0
  314. sky/serve/server/server.py +21 -18
  315. sky/serve/service.py +133 -48
  316. sky/serve/service_spec.py +135 -16
  317. sky/serve/spot_placer.py +3 -0
  318. sky/server/auth/__init__.py +0 -0
  319. sky/server/auth/authn.py +50 -0
  320. sky/server/auth/loopback.py +38 -0
  321. sky/server/auth/oauth2_proxy.py +200 -0
  322. sky/server/common.py +475 -181
  323. sky/server/config.py +81 -23
  324. sky/server/constants.py +44 -6
  325. sky/server/daemons.py +229 -0
  326. sky/server/html/token_page.html +185 -0
  327. sky/server/metrics.py +160 -0
  328. sky/server/requests/executor.py +528 -138
  329. sky/server/requests/payloads.py +351 -17
  330. sky/server/requests/preconditions.py +21 -17
  331. sky/server/requests/process.py +112 -29
  332. sky/server/requests/request_names.py +120 -0
  333. sky/server/requests/requests.py +817 -224
  334. sky/server/requests/serializers/decoders.py +82 -31
  335. sky/server/requests/serializers/encoders.py +140 -22
  336. sky/server/requests/threads.py +106 -0
  337. sky/server/rest.py +417 -0
  338. sky/server/server.py +1290 -284
  339. sky/server/state.py +20 -0
  340. sky/server/stream_utils.py +345 -57
  341. sky/server/uvicorn.py +217 -3
  342. sky/server/versions.py +270 -0
  343. sky/setup_files/MANIFEST.in +5 -0
  344. sky/setup_files/alembic.ini +156 -0
  345. sky/setup_files/dependencies.py +136 -31
  346. sky/setup_files/setup.py +44 -42
  347. sky/sky_logging.py +102 -5
  348. sky/skylet/attempt_skylet.py +1 -0
  349. sky/skylet/autostop_lib.py +129 -8
  350. sky/skylet/configs.py +27 -20
  351. sky/skylet/constants.py +171 -19
  352. sky/skylet/events.py +105 -21
  353. sky/skylet/job_lib.py +335 -104
  354. sky/skylet/log_lib.py +297 -18
  355. sky/skylet/log_lib.pyi +44 -1
  356. sky/skylet/ray_patches/__init__.py +17 -3
  357. sky/skylet/ray_patches/autoscaler.py.diff +18 -0
  358. sky/skylet/ray_patches/cli.py.diff +19 -0
  359. sky/skylet/ray_patches/command_runner.py.diff +17 -0
  360. sky/skylet/ray_patches/log_monitor.py.diff +20 -0
  361. sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
  362. sky/skylet/ray_patches/updater.py.diff +18 -0
  363. sky/skylet/ray_patches/worker.py.diff +41 -0
  364. sky/skylet/services.py +564 -0
  365. sky/skylet/skylet.py +63 -4
  366. sky/skylet/subprocess_daemon.py +103 -29
  367. sky/skypilot_config.py +506 -99
  368. sky/ssh_node_pools/__init__.py +1 -0
  369. sky/ssh_node_pools/core.py +135 -0
  370. sky/ssh_node_pools/server.py +233 -0
  371. sky/task.py +621 -137
  372. sky/templates/aws-ray.yml.j2 +10 -3
  373. sky/templates/azure-ray.yml.j2 +1 -1
  374. sky/templates/do-ray.yml.j2 +1 -1
  375. sky/templates/gcp-ray.yml.j2 +57 -0
  376. sky/templates/hyperbolic-ray.yml.j2 +67 -0
  377. sky/templates/jobs-controller.yaml.j2 +27 -24
  378. sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
  379. sky/templates/kubernetes-ray.yml.j2 +607 -51
  380. sky/templates/lambda-ray.yml.j2 +1 -1
  381. sky/templates/nebius-ray.yml.j2 +33 -12
  382. sky/templates/paperspace-ray.yml.j2 +1 -1
  383. sky/templates/primeintellect-ray.yml.j2 +71 -0
  384. sky/templates/runpod-ray.yml.j2 +9 -1
  385. sky/templates/scp-ray.yml.j2 +3 -50
  386. sky/templates/seeweb-ray.yml.j2 +108 -0
  387. sky/templates/shadeform-ray.yml.j2 +72 -0
  388. sky/templates/sky-serve-controller.yaml.j2 +22 -2
  389. sky/templates/websocket_proxy.py +178 -18
  390. sky/usage/usage_lib.py +18 -11
  391. sky/users/__init__.py +0 -0
  392. sky/users/model.conf +15 -0
  393. sky/users/permission.py +387 -0
  394. sky/users/rbac.py +121 -0
  395. sky/users/server.py +720 -0
  396. sky/users/token_service.py +218 -0
  397. sky/utils/accelerator_registry.py +34 -5
  398. sky/utils/admin_policy_utils.py +84 -38
  399. sky/utils/annotations.py +16 -5
  400. sky/utils/asyncio_utils.py +78 -0
  401. sky/utils/auth_utils.py +153 -0
  402. sky/utils/benchmark_utils.py +60 -0
  403. sky/utils/cli_utils/status_utils.py +159 -86
  404. sky/utils/cluster_utils.py +31 -9
  405. sky/utils/command_runner.py +354 -68
  406. sky/utils/command_runner.pyi +93 -3
  407. sky/utils/common.py +35 -8
  408. sky/utils/common_utils.py +310 -87
  409. sky/utils/config_utils.py +87 -5
  410. sky/utils/context.py +402 -0
  411. sky/utils/context_utils.py +222 -0
  412. sky/utils/controller_utils.py +264 -89
  413. sky/utils/dag_utils.py +31 -12
  414. sky/utils/db/__init__.py +0 -0
  415. sky/utils/db/db_utils.py +470 -0
  416. sky/utils/db/migration_utils.py +133 -0
  417. sky/utils/directory_utils.py +12 -0
  418. sky/utils/env_options.py +13 -0
  419. sky/utils/git.py +567 -0
  420. sky/utils/git_clone.sh +460 -0
  421. sky/utils/infra_utils.py +195 -0
  422. sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
  423. sky/utils/kubernetes/config_map_utils.py +133 -0
  424. sky/utils/kubernetes/create_cluster.sh +13 -27
  425. sky/utils/kubernetes/delete_cluster.sh +10 -7
  426. sky/utils/kubernetes/deploy_remote_cluster.py +1299 -0
  427. sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
  428. sky/utils/kubernetes/generate_kind_config.py +6 -66
  429. sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
  430. sky/utils/kubernetes/gpu_labeler.py +5 -5
  431. sky/utils/kubernetes/kubernetes_deploy_utils.py +354 -47
  432. sky/utils/kubernetes/ssh-tunnel.sh +379 -0
  433. sky/utils/kubernetes/ssh_utils.py +221 -0
  434. sky/utils/kubernetes_enums.py +8 -15
  435. sky/utils/lock_events.py +94 -0
  436. sky/utils/locks.py +368 -0
  437. sky/utils/log_utils.py +300 -6
  438. sky/utils/perf_utils.py +22 -0
  439. sky/utils/resource_checker.py +298 -0
  440. sky/utils/resources_utils.py +249 -32
  441. sky/utils/rich_utils.py +213 -37
  442. sky/utils/schemas.py +905 -147
  443. sky/utils/serialize_utils.py +16 -0
  444. sky/utils/status_lib.py +10 -0
  445. sky/utils/subprocess_utils.py +38 -15
  446. sky/utils/tempstore.py +70 -0
  447. sky/utils/timeline.py +24 -52
  448. sky/utils/ux_utils.py +84 -15
  449. sky/utils/validator.py +11 -1
  450. sky/utils/volume.py +86 -0
  451. sky/utils/yaml_utils.py +111 -0
  452. sky/volumes/__init__.py +13 -0
  453. sky/volumes/client/__init__.py +0 -0
  454. sky/volumes/client/sdk.py +149 -0
  455. sky/volumes/server/__init__.py +0 -0
  456. sky/volumes/server/core.py +258 -0
  457. sky/volumes/server/server.py +122 -0
  458. sky/volumes/volume.py +212 -0
  459. sky/workspaces/__init__.py +0 -0
  460. sky/workspaces/core.py +655 -0
  461. sky/workspaces/server.py +101 -0
  462. sky/workspaces/utils.py +56 -0
  463. skypilot_nightly-1.0.0.dev20251107.dist-info/METADATA +675 -0
  464. skypilot_nightly-1.0.0.dev20251107.dist-info/RECORD +594 -0
  465. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/WHEEL +1 -1
  466. sky/benchmark/benchmark_state.py +0 -256
  467. sky/benchmark/benchmark_utils.py +0 -641
  468. sky/clouds/service_catalog/constants.py +0 -7
  469. sky/dashboard/out/_next/static/LksQgChY5izXjokL3LcEu/_buildManifest.js +0 -1
  470. sky/dashboard/out/_next/static/chunks/236-f49500b82ad5392d.js +0 -6
  471. sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
  472. sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
  473. sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
  474. sky/dashboard/out/_next/static/chunks/845-0f8017370869e269.js +0 -1
  475. sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
  476. sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
  477. sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
  478. sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
  479. sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
  480. sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
  481. sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
  482. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-e15db85d0ea1fbe1.js +0 -1
  483. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
  484. sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
  485. sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
  486. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-03f279c6741fb48b.js +0 -1
  487. sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
  488. sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
  489. sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
  490. sky/jobs/dashboard/dashboard.py +0 -223
  491. sky/jobs/dashboard/static/favicon.ico +0 -0
  492. sky/jobs/dashboard/templates/index.html +0 -831
  493. sky/jobs/server/dashboard_utils.py +0 -69
  494. sky/skylet/providers/scp/__init__.py +0 -2
  495. sky/skylet/providers/scp/config.py +0 -149
  496. sky/skylet/providers/scp/node_provider.py +0 -578
  497. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  498. sky/utils/db_utils.py +0 -100
  499. sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
  500. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  501. skypilot_nightly-1.0.0.dev20250509.dist-info/METADATA +0 -361
  502. skypilot_nightly-1.0.0.dev20250509.dist-info/RECORD +0 -396
  503. /sky/{clouds/service_catalog → catalog}/config.py +0 -0
  504. /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
  505. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
  506. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
  507. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
  508. /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
  509. /sky/dashboard/out/_next/static/{LksQgChY5izXjokL3LcEu → zB0ed6ge_W1MDszVHhijS}/_ssgManifest.js +0 -0
  510. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/entry_points.txt +0 -0
  511. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/licenses/LICENSE +0 -0
  512. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/top_level.txt +0 -0
@@ -1,4 +1,5 @@
1
1
  """Util constants/functions for the backends."""
2
+ import asyncio
2
3
  from datetime import datetime
3
4
  import enum
4
5
  import fnmatch
@@ -6,18 +7,23 @@ import hashlib
6
7
  import os
7
8
  import pathlib
8
9
  import pprint
10
+ import queue as queue_lib
9
11
  import re
10
12
  import shlex
11
13
  import subprocess
12
14
  import sys
13
15
  import tempfile
16
+ import threading
14
17
  import time
15
18
  import typing
16
- from typing import Any, Dict, List, Optional, Sequence, Set, Tuple, Union
19
+ from typing import (Any, Callable, Dict, Iterator, List, Optional, Sequence,
20
+ Set, Tuple, TypeVar, Union)
17
21
  import uuid
18
22
 
23
+ import aiohttp
24
+ from aiohttp import ClientTimeout
25
+ from aiohttp import TCPConnector
19
26
  import colorama
20
- import filelock
21
27
  from packaging import version
22
28
  from typing_extensions import Literal
23
29
 
@@ -28,30 +34,44 @@ from sky import check as sky_check
28
34
  from sky import clouds
29
35
  from sky import exceptions
30
36
  from sky import global_user_state
37
+ from sky import logs
31
38
  from sky import provision as provision_lib
32
39
  from sky import sky_logging
33
40
  from sky import skypilot_config
34
41
  from sky.adaptors import common as adaptors_common
42
+ from sky.jobs import utils as managed_job_utils
43
+ from sky.provision import common as provision_common
35
44
  from sky.provision import instance_setup
36
45
  from sky.provision.kubernetes import utils as kubernetes_utils
46
+ from sky.serve import serve_utils
47
+ from sky.server.requests import requests as requests_lib
48
+ from sky.skylet import autostop_lib
37
49
  from sky.skylet import constants
38
50
  from sky.usage import usage_lib
51
+ from sky.utils import auth_utils
39
52
  from sky.utils import cluster_utils
40
53
  from sky.utils import command_runner
41
54
  from sky.utils import common
42
55
  from sky.utils import common_utils
56
+ from sky.utils import context as context_lib
57
+ from sky.utils import context_utils
43
58
  from sky.utils import controller_utils
44
59
  from sky.utils import env_options
60
+ from sky.utils import locks
45
61
  from sky.utils import registry
46
62
  from sky.utils import resources_utils
47
63
  from sky.utils import rich_utils
48
64
  from sky.utils import schemas
49
65
  from sky.utils import status_lib
50
66
  from sky.utils import subprocess_utils
67
+ from sky.utils import tempstore
51
68
  from sky.utils import timeline
52
69
  from sky.utils import ux_utils
70
+ from sky.utils import yaml_utils
71
+ from sky.workspaces import core as workspaces_core
53
72
 
54
73
  if typing.TYPE_CHECKING:
74
+ import grpc
55
75
  import requests
56
76
  from requests import adapters
57
77
  from requests.packages.urllib3.util import retry as retry_lib
@@ -62,6 +82,7 @@ if typing.TYPE_CHECKING:
62
82
  from sky import task as task_lib
63
83
  from sky.backends import cloud_vm_ray_backend
64
84
  from sky.backends import local_docker_backend
85
+ from sky.utils import volume as volume_lib
65
86
  else:
66
87
  yaml = adaptors_common.LazyImport('yaml')
67
88
  requests = adaptors_common.LazyImport('requests')
@@ -69,6 +90,8 @@ else:
69
90
  adapters = adaptors_common.LazyImport('requests.adapters')
70
91
  retry_lib = adaptors_common.LazyImport(
71
92
  'requests.packages.urllib3.util.retry')
93
+ # To avoid requiring grpcio to be installed on the client side.
94
+ grpc = adaptors_common.LazyImport('grpc')
72
95
 
73
96
  logger = sky_logging.init_logger(__name__)
74
97
 
@@ -91,6 +114,13 @@ _LAUNCHED_RESERVED_WORKER_PATTERN = re.compile(
91
114
  # 10.133.0.5: ray.worker.default,
92
115
  _LAUNCHING_IP_PATTERN = re.compile(
93
116
  r'({}): ray[._]worker[._](?:default|reserved)'.format(IP_ADDR_REGEX))
117
+ SSH_CONNECTION_ERROR_PATTERN = re.compile(
118
+ r'^ssh:.*(timed out|connection refused)$', re.IGNORECASE)
119
+ _SSH_CONNECTION_TIMED_OUT_PATTERN = re.compile(r'^ssh:.*timed out$',
120
+ re.IGNORECASE)
121
+ K8S_PODS_NOT_FOUND_PATTERN = re.compile(r'.*(NotFound|pods .* not found).*',
122
+ re.IGNORECASE)
123
+ _RAY_CLUSTER_NOT_FOUND_MESSAGE = 'Ray cluster is not found'
94
124
  WAIT_HEAD_NODE_IP_MAX_ATTEMPTS = 3
95
125
 
96
126
  # We check network connection by going through _TEST_IP_LIST. We may need to
@@ -98,24 +128,21 @@ WAIT_HEAD_NODE_IP_MAX_ATTEMPTS = 3
98
128
  # Fixed IP addresses are used to avoid DNS lookup blocking the check, for
99
129
  # machine with no internet connection.
100
130
  # Refer to: https://stackoverflow.com/questions/3764291/how-can-i-see-if-theres-an-available-and-active-network-connection-in-python # pylint: disable=line-too-long
101
- _TEST_IP_LIST = ['https://1.1.1.1', 'https://8.8.8.8']
131
+ _TEST_IP_LIST = ['https://8.8.8.8', 'https://1.1.1.1']
102
132
 
103
133
  # Allow each CPU thread take 2 tasks.
104
134
  # Note: This value cannot be too small, otherwise OOM issue may occur.
105
135
  DEFAULT_TASK_CPU_DEMAND = 0.5
106
136
 
107
- # Filelocks for the cluster status change.
108
- CLUSTER_STATUS_LOCK_PATH = os.path.expanduser('~/.sky/.{}.lock')
109
137
  CLUSTER_STATUS_LOCK_TIMEOUT_SECONDS = 20
110
138
 
111
139
  # Time that must elapse since the last status check before we should re-check if
112
140
  # the cluster has been terminated or autostopped.
113
141
  _CLUSTER_STATUS_CACHE_DURATION_SECONDS = 2
114
142
 
115
- # Filelocks for updating cluster's file_mounts.
116
- CLUSTER_FILE_MOUNTS_LOCK_PATH = os.path.expanduser(
117
- '~/.sky/.{}_file_mounts.lock')
118
143
  CLUSTER_FILE_MOUNTS_LOCK_TIMEOUT_SECONDS = 10
144
+ WORKSPACE_LOCK_TIMEOUT_SECONDS = 10
145
+ CLUSTER_TUNNEL_LOCK_TIMEOUT_SECONDS = 10
119
146
 
120
147
  # Remote dir that holds our runtime files.
121
148
  _REMOTE_RUNTIME_FILES_DIR = '~/.sky/.runtime_files'
@@ -124,7 +151,7 @@ _ENDPOINTS_RETRY_MESSAGE = ('If the cluster was recently started, '
124
151
  'please retry after a while.')
125
152
 
126
153
  # If a cluster is less than LAUNCH_DOUBLE_CHECK_WINDOW seconds old, and we don't
127
- # see any instances in the cloud, the instances might be in the proccess of
154
+ # see any instances in the cloud, the instances might be in the process of
128
155
  # being created. We will wait LAUNCH_DOUBLE_CHECK_DELAY seconds and then double
129
156
  # check to make sure there are still no instances. LAUNCH_DOUBLE_CHECK_DELAY
130
157
  # should be set longer than the delay between (sending the create instance
@@ -194,6 +221,9 @@ _RAY_YAML_KEYS_TO_REMOVE_FOR_HASH = [
194
221
  ('provider', 'availability_zone'),
195
222
  ]
196
223
 
224
+ _ACK_MESSAGE = 'ack'
225
+ _FORWARDING_FROM_MESSAGE = 'Forwarding from'
226
+
197
227
 
198
228
  def is_ip(s: str) -> bool:
199
229
  """Returns whether this string matches IP_ADDR_REGEX."""
@@ -212,7 +242,7 @@ def _get_yaml_path_from_cluster_name(cluster_name: str,
212
242
  # Add retry for the file mounts optimization, as the underlying cp command may
213
243
  # experience transient errors, #4758.
214
244
  @common_utils.retry
215
- def _optimize_file_mounts(yaml_path: str) -> None:
245
+ def _optimize_file_mounts(tmp_yaml_path: str) -> None:
216
246
  """Optimize file mounts in the given ray yaml file.
217
247
 
218
248
  Runtime files handling:
@@ -226,7 +256,7 @@ def _optimize_file_mounts(yaml_path: str) -> None:
226
256
  subprocess.CalledProcessError: If the file mounts are failed to be
227
257
  copied.
228
258
  """
229
- yaml_config = common_utils.read_yaml(yaml_path)
259
+ yaml_config = yaml_utils.read_yaml(tmp_yaml_path)
230
260
 
231
261
  file_mounts = yaml_config.get('file_mounts', {})
232
262
  # Remove the file mounts added by the newline.
@@ -242,7 +272,7 @@ def _optimize_file_mounts(yaml_path: str) -> None:
242
272
  # - use a remote command to move all runtime files to their right places.
243
273
 
244
274
  # Local tmp dir holding runtime files.
245
- local_runtime_files_dir = tempfile.mkdtemp()
275
+ local_runtime_files_dir = tempstore.mkdtemp()
246
276
  new_file_mounts = {_REMOTE_RUNTIME_FILES_DIR: local_runtime_files_dir}
247
277
 
248
278
  # Generate local_src -> unique_name.
@@ -310,7 +340,7 @@ def _optimize_file_mounts(yaml_path: str) -> None:
310
340
  shell=True,
311
341
  check=True)
312
342
 
313
- common_utils.dump_yaml(yaml_path, yaml_config)
343
+ yaml_utils.dump_yaml(tmp_yaml_path, yaml_config)
314
344
 
315
345
 
316
346
  def path_size_megabytes(path: str) -> int:
@@ -339,7 +369,13 @@ def path_size_megabytes(path: str) -> int:
339
369
  f'{git_exclude_filter} --dry-run {path!r}')
340
370
  rsync_output = ''
341
371
  try:
342
- rsync_output = str(subprocess.check_output(rsync_command, shell=True))
372
+ # rsync sometimes fails `--dry-run` for MacOS' rsync build, however this function is only used to display
373
+ # a warning message to the user if the size of a file/directory is too
374
+ # large, so we can safely ignore the error.
375
+ rsync_output = str(
376
+ subprocess.check_output(rsync_command,
377
+ shell=True,
378
+ stderr=subprocess.DEVNULL))
343
379
  except subprocess.CalledProcessError:
344
380
  logger.debug('Command failed, proceeding without estimating size: '
345
381
  f'{rsync_command}')
@@ -464,8 +500,8 @@ def _replace_yaml_dicts(
464
500
  if key in old_block:
465
501
  _restore_block(value, old_block[key])
466
502
 
467
- new_config = yaml.safe_load(new_yaml)
468
- old_config = yaml.safe_load(old_yaml)
503
+ new_config = yaml_utils.safe_load(new_yaml)
504
+ old_config = yaml_utils.safe_load(old_yaml)
469
505
  excluded_results = {}
470
506
  # Find all key values excluded from restore
471
507
  for exclude_restore_key_name_list in restore_key_names_exceptions:
@@ -489,7 +525,7 @@ def _replace_yaml_dicts(
489
525
  for key in exclude_restore_key_name[:-1]:
490
526
  curr = curr[key]
491
527
  curr[exclude_restore_key_name[-1]] = value
492
- return common_utils.dump_yaml_str(new_config)
528
+ return yaml_utils.dump_yaml_str(new_config)
493
529
 
494
530
 
495
531
  def get_expirable_clouds(
@@ -509,11 +545,55 @@ def get_expirable_clouds(
509
545
  expirable_clouds = []
510
546
  local_credentials_value = schemas.RemoteIdentityOptions.LOCAL_CREDENTIALS.value
511
547
  for cloud in enabled_clouds:
512
- remote_identities = skypilot_config.get_nested(
513
- (str(cloud).lower(), 'remote_identity'), None)
514
- if remote_identities is None:
515
- remote_identities = schemas.get_default_remote_identity(
516
- str(cloud).lower())
548
+ # Kubernetes config might have context-specific properties
549
+ if isinstance(cloud, clouds.Kubernetes):
550
+ # get all custom contexts
551
+ contexts = kubernetes_utils.get_custom_config_k8s_contexts()
552
+ # add remote_identity of each context if it exists
553
+ remote_identities: Optional[Union[str, List[Dict[str, str]]]] = None
554
+ for context in contexts:
555
+ context_remote_identity = skypilot_config.get_effective_region_config(
556
+ cloud='kubernetes',
557
+ region=context,
558
+ keys=('remote_identity',),
559
+ default_value=None)
560
+ if context_remote_identity is not None:
561
+ if remote_identities is None:
562
+ remote_identities = []
563
+ if isinstance(context_remote_identity, str):
564
+ assert isinstance(remote_identities, list)
565
+ remote_identities.append(
566
+ {context: context_remote_identity})
567
+ elif isinstance(context_remote_identity, list):
568
+ assert isinstance(remote_identities, list)
569
+ remote_identities.extend(context_remote_identity)
570
+ # add global kubernetes remote identity if it exists, if not, add default
571
+ global_remote_identity = skypilot_config.get_effective_region_config(
572
+ cloud='kubernetes',
573
+ region=None,
574
+ keys=('remote_identity',),
575
+ default_value=None)
576
+ if global_remote_identity is not None:
577
+ if remote_identities is None:
578
+ remote_identities = []
579
+ if isinstance(global_remote_identity, str):
580
+ assert isinstance(remote_identities, list)
581
+ remote_identities.append({'*': global_remote_identity})
582
+ elif isinstance(global_remote_identity, list):
583
+ assert isinstance(remote_identities, list)
584
+ remote_identities.extend(global_remote_identity)
585
+ if remote_identities is None:
586
+ remote_identities = schemas.get_default_remote_identity(
587
+ str(cloud).lower())
588
+ else:
589
+ remote_identities = skypilot_config.get_effective_region_config(
590
+ cloud=str(cloud).lower(),
591
+ region=None,
592
+ keys=('remote_identity',),
593
+ default_value=None)
594
+ if remote_identities is None:
595
+ remote_identities = schemas.get_default_remote_identity(
596
+ str(cloud).lower())
517
597
 
518
598
  local_credential_expiring = cloud.can_credential_expire()
519
599
  if isinstance(remote_identities, str):
@@ -531,16 +611,18 @@ def get_expirable_clouds(
531
611
  # TODO: too many things happening here - leaky abstraction. Refactor.
532
612
  @timeline.event
533
613
  def write_cluster_config(
534
- to_provision: 'resources_lib.Resources',
535
- num_nodes: int,
536
- cluster_config_template: str,
537
- cluster_name: str,
538
- local_wheel_path: pathlib.Path,
539
- wheel_hash: str,
540
- region: clouds.Region,
541
- zones: Optional[List[clouds.Zone]] = None,
542
- dryrun: bool = False,
543
- keep_launch_fields_in_existing_config: bool = True) -> Dict[str, str]:
614
+ to_provision: 'resources_lib.Resources',
615
+ num_nodes: int,
616
+ cluster_config_template: str,
617
+ cluster_name: str,
618
+ local_wheel_path: pathlib.Path,
619
+ wheel_hash: str,
620
+ region: clouds.Region,
621
+ zones: Optional[List[clouds.Zone]] = None,
622
+ dryrun: bool = False,
623
+ keep_launch_fields_in_existing_config: bool = True,
624
+ volume_mounts: Optional[List['volume_lib.VolumeMount']] = None,
625
+ ) -> Dict[str, str]:
544
626
  """Fills in cluster configuration templates and writes them out.
545
627
 
546
628
  Returns:
@@ -588,12 +670,15 @@ def write_cluster_config(
588
670
  resources_utils.ClusterName(
589
671
  cluster_name,
590
672
  cluster_name_on_cloud,
591
- ), region, zones, num_nodes, dryrun)
673
+ ), region, zones, num_nodes, dryrun, volume_mounts)
592
674
  config_dict = {}
593
675
 
594
676
  specific_reservations = set(
595
- skypilot_config.get_nested(
596
- (str(to_provision.cloud).lower(), 'specific_reservations'), set()))
677
+ skypilot_config.get_effective_region_config(
678
+ cloud=str(to_provision.cloud).lower(),
679
+ region=to_provision.region,
680
+ keys=('specific_reservations',),
681
+ default_value=set()))
597
682
 
598
683
  # Remote identity handling can have 4 cases:
599
684
  # 1. LOCAL_CREDENTIALS (default for most clouds): Upload local credentials
@@ -605,9 +690,12 @@ def write_cluster_config(
605
690
  # other cases, we exclude the cloud from credential file uploads after
606
691
  # running required checks.
607
692
  assert cluster_name is not None
608
- excluded_clouds = set()
609
- remote_identity_config = skypilot_config.get_nested(
610
- (str(cloud).lower(), 'remote_identity'), None)
693
+ excluded_clouds: Set[clouds.Cloud] = set()
694
+ remote_identity_config = skypilot_config.get_effective_region_config(
695
+ cloud=str(cloud).lower(),
696
+ region=region.name,
697
+ keys=('remote_identity',),
698
+ default_value=None)
611
699
  remote_identity = schemas.get_default_remote_identity(str(cloud).lower())
612
700
  if isinstance(remote_identity_config, str):
613
701
  remote_identity = remote_identity_config
@@ -636,15 +724,25 @@ def write_cluster_config(
636
724
  'is not supported by this cloud. Remove the config or set: '
637
725
  '`remote_identity: LOCAL_CREDENTIALS`.')
638
726
  if isinstance(cloud, clouds.Kubernetes):
639
- if skypilot_config.get_nested(
640
- ('kubernetes', 'allowed_contexts'), None) is None:
727
+ allowed_contexts = skypilot_config.get_workspace_cloud(
728
+ 'kubernetes').get('allowed_contexts', None)
729
+ if allowed_contexts is None:
730
+ allowed_contexts = skypilot_config.get_effective_region_config(
731
+ cloud='kubernetes',
732
+ region=None,
733
+ keys=('allowed_contexts',),
734
+ default_value=None)
735
+ if allowed_contexts is None:
641
736
  excluded_clouds.add(cloud)
642
737
  else:
643
738
  excluded_clouds.add(cloud)
644
739
 
645
740
  for cloud_str, cloud_obj in registry.CLOUD_REGISTRY.items():
646
- remote_identity_config = skypilot_config.get_nested(
647
- (cloud_str.lower(), 'remote_identity'), None)
741
+ remote_identity_config = skypilot_config.get_effective_region_config(
742
+ cloud=cloud_str.lower(),
743
+ region=region.name,
744
+ keys=('remote_identity',),
745
+ default_value=None)
648
746
  if remote_identity_config:
649
747
  if (remote_identity_config ==
650
748
  schemas.RemoteIdentityOptions.NO_UPLOAD.value):
@@ -652,15 +750,24 @@ def write_cluster_config(
652
750
 
653
751
  credentials = sky_check.get_cloud_credential_file_mounts(excluded_clouds)
654
752
 
655
- private_key_path, _ = auth.get_or_generate_keys()
753
+ logging_agent = logs.get_logging_agent()
754
+ if logging_agent:
755
+ for k, v in logging_agent.get_credential_file_mounts().items():
756
+ assert k not in credentials, f'{k} already in credentials'
757
+ credentials[k] = v
758
+
759
+ private_key_path, _ = auth_utils.get_or_generate_keys()
656
760
  auth_config = {'ssh_private_key': private_key_path}
657
761
  region_name = resources_vars.get('region')
658
762
 
659
763
  yaml_path = _get_yaml_path_from_cluster_name(cluster_name)
660
764
 
661
765
  # Retrieve the ssh_proxy_command for the given cloud / region.
662
- ssh_proxy_command_config = skypilot_config.get_nested(
663
- (str(cloud).lower(), 'ssh_proxy_command'), None)
766
+ ssh_proxy_command_config = skypilot_config.get_effective_region_config(
767
+ cloud=str(cloud).lower(),
768
+ region=None,
769
+ keys=('ssh_proxy_command',),
770
+ default_value=None)
664
771
  if (isinstance(ssh_proxy_command_config, str) or
665
772
  ssh_proxy_command_config is None):
666
773
  ssh_proxy_command = ssh_proxy_command_config
@@ -683,10 +790,63 @@ def write_cluster_config(
683
790
  assert region_name in ssh_proxy_command_config, (
684
791
  region_name, ssh_proxy_command_config)
685
792
  ssh_proxy_command = ssh_proxy_command_config[region_name]
793
+
794
+ use_internal_ips = skypilot_config.get_effective_region_config(
795
+ cloud=str(cloud).lower(),
796
+ region=region.name,
797
+ keys=('use_internal_ips',),
798
+ default_value=False)
799
+ if isinstance(cloud, clouds.AWS):
800
+ # If the use_ssm flag is set to true, we use the ssm proxy command.
801
+ use_ssm = skypilot_config.get_effective_region_config(
802
+ cloud=str(cloud).lower(),
803
+ region=region.name,
804
+ keys=('use_ssm',),
805
+ default_value=None)
806
+
807
+ if use_ssm and ssh_proxy_command is not None:
808
+ raise exceptions.InvalidCloudConfigs(
809
+ 'use_ssm is set to true, but ssh_proxy_command '
810
+ f'is already set to {ssh_proxy_command!r}. Please remove '
811
+ 'ssh_proxy_command or set use_ssm to false.')
812
+
813
+ if use_internal_ips and ssh_proxy_command is None:
814
+ # Only if use_ssm is explicitly not set, we default to using SSM.
815
+ if use_ssm is None:
816
+ logger.warning(
817
+ f'{colorama.Fore.YELLOW}'
818
+ 'use_internal_ips is set to true, '
819
+ 'but ssh_proxy_command is not set. Defaulting to '
820
+ 'using SSM. Specify ssh_proxy_command to use a different '
821
+ 'https://docs.skypilot.co/en/latest/reference/config.html#'
822
+ f'aws.ssh_proxy_command.{colorama.Style.RESET_ALL}')
823
+ use_ssm = True
824
+
825
+ if use_ssm:
826
+ aws_profile = os.environ.get('AWS_PROFILE', None)
827
+ profile_str = f'--profile {aws_profile}' if aws_profile else ''
828
+ ip_address_filter = ('Name=private-ip-address,Values=%h'
829
+ if use_internal_ips else
830
+ 'Name=ip-address,Values=%h')
831
+ get_instance_id_command = 'aws ec2 describe-instances ' + \
832
+ f'--region {region_name} --filters {ip_address_filter} ' + \
833
+ '--query \"Reservations[].Instances[].InstanceId\" ' + \
834
+ f'{profile_str} --output text'
835
+ ssm_proxy_command = 'aws ssm start-session --target ' + \
836
+ f'\"$({get_instance_id_command})\" ' + \
837
+ f'--region {region_name} {profile_str} ' + \
838
+ '--document-name AWS-StartSSHSession ' + \
839
+ '--parameters portNumber=%p'
840
+ ssh_proxy_command = ssm_proxy_command
841
+ region_name = 'ssm-session'
686
842
  logger.debug(f'Using ssh_proxy_command: {ssh_proxy_command!r}')
687
843
 
688
844
  # User-supplied global instance tags from ~/.sky/config.yaml.
689
- labels = skypilot_config.get_nested((str(cloud).lower(), 'labels'), {})
845
+ labels = skypilot_config.get_effective_region_config(
846
+ cloud=str(cloud).lower(),
847
+ region=region.name,
848
+ keys=('labels',),
849
+ default_value={})
690
850
  # labels is a dict, which is guaranteed by the type check in
691
851
  # schemas.py
692
852
  assert isinstance(labels, dict), labels
@@ -715,6 +875,22 @@ def write_cluster_config(
715
875
  high_availability_specified = controller_utils.high_availability_specified(
716
876
  cluster_name)
717
877
 
878
+ volume_mount_vars = []
879
+ if volume_mounts is not None:
880
+ for vol in volume_mounts:
881
+ volume_mount_vars.append({
882
+ 'name': vol.volume_name,
883
+ 'path': vol.path,
884
+ 'volume_name_on_cloud': vol.volume_config.name_on_cloud,
885
+ 'volume_id_on_cloud': vol.volume_config.id_on_cloud,
886
+ })
887
+
888
+ runcmd = skypilot_config.get_effective_region_config(
889
+ cloud=str(to_provision.cloud).lower(),
890
+ region=to_provision.region,
891
+ keys=('post_provision_runcmd',),
892
+ default_value=None)
893
+
718
894
  # Use a tmp file path to avoid incomplete YAML file being re-used in the
719
895
  # future.
720
896
  tmp_yaml_path = yaml_path + '.tmp'
@@ -734,18 +910,23 @@ def write_cluster_config(
734
910
  os.environ.get(constants.USER_ENV_VAR, '')),
735
911
 
736
912
  # Networking configs
737
- 'use_internal_ips': skypilot_config.get_nested(
738
- (str(cloud).lower(), 'use_internal_ips'), False),
913
+ 'use_internal_ips': skypilot_config.get_effective_region_config(
914
+ cloud=str(cloud).lower(),
915
+ region=region.name,
916
+ keys=('use_internal_ips',),
917
+ default_value=False),
739
918
  'ssh_proxy_command': ssh_proxy_command,
740
- 'vpc_name': skypilot_config.get_nested(
741
- (str(cloud).lower(), 'vpc_name'), None),
742
-
919
+ 'vpc_name': skypilot_config.get_effective_region_config(
920
+ cloud=str(cloud).lower(),
921
+ region=region.name,
922
+ keys=('vpc_name',),
923
+ default_value=None),
743
924
  # User-supplied labels.
744
925
  'labels': labels,
745
926
  # User-supplied remote_identity
746
927
  'remote_identity': remote_identity,
747
928
  # The reservation pools that specified by the user. This is
748
- # currently only used by GCP.
929
+ # currently only used by AWS and GCP.
749
930
  'specific_reservations': specific_reservations,
750
931
 
751
932
  # Conda setup
@@ -805,6 +986,13 @@ def write_cluster_config(
805
986
 
806
987
  # High availability
807
988
  'high_availability': high_availability_specified,
989
+
990
+ # Volume mounts
991
+ 'volume_mounts': volume_mount_vars,
992
+
993
+ # runcmd to append to the cloud-init cloud config passed to the
994
+ # machine's UserData. This is currently only used by AWS.
995
+ 'runcmd': runcmd,
808
996
  }),
809
997
  output_path=tmp_yaml_path)
810
998
  config_dict['cluster_name'] = cluster_name
@@ -812,14 +1000,20 @@ def write_cluster_config(
812
1000
 
813
1001
  # Add kubernetes config fields from ~/.sky/config
814
1002
  if isinstance(cloud, clouds.Kubernetes):
815
- kubernetes_utils.combine_pod_config_fields(
816
- tmp_yaml_path,
817
- cluster_config_overrides=to_provision.cluster_config_overrides)
818
- kubernetes_utils.combine_metadata_fields(tmp_yaml_path)
819
- yaml_obj = common_utils.read_yaml(tmp_yaml_path)
820
- pod_config: Dict[str, Any] = yaml_obj['available_node_types'][
1003
+ cluster_config_overrides = to_provision.cluster_config_overrides
1004
+ with open(tmp_yaml_path, 'r', encoding='utf-8') as f:
1005
+ tmp_yaml_str = f.read()
1006
+ cluster_yaml_obj = yaml_utils.safe_load(tmp_yaml_str)
1007
+ combined_yaml_obj = kubernetes_utils.combine_pod_config_fields_and_metadata(
1008
+ cluster_yaml_obj,
1009
+ cluster_config_overrides=cluster_config_overrides,
1010
+ cloud=cloud,
1011
+ context=region.name)
1012
+ # Write the updated YAML back to the file
1013
+ yaml_utils.dump_yaml(tmp_yaml_path, combined_yaml_obj)
1014
+
1015
+ pod_config: Dict[str, Any] = combined_yaml_obj['available_node_types'][
821
1016
  'ray_head_default']['node_config']
822
-
823
1017
  # Check pod spec only. For high availability controllers, we deploy pvc & deployment for the controller. Read kubernetes-ray.yml.j2 for more details.
824
1018
  pod_config.pop('deployment_spec', None)
825
1019
  pod_config.pop('pvc_spec', None)
@@ -841,9 +1035,8 @@ def write_cluster_config(
841
1035
  _add_auth_to_cluster_config(cloud, tmp_yaml_path)
842
1036
 
843
1037
  # Restore the old yaml content for backward compatibility.
844
- if os.path.exists(yaml_path) and keep_launch_fields_in_existing_config:
845
- with open(yaml_path, 'r', encoding='utf-8') as f:
846
- old_yaml_content = f.read()
1038
+ old_yaml_content = global_user_state.get_cluster_yaml_str(yaml_path)
1039
+ if old_yaml_content is not None and keep_launch_fields_in_existing_config:
847
1040
  with open(tmp_yaml_path, 'r', encoding='utf-8') as f:
848
1041
  new_yaml_content = f.read()
849
1042
  restored_yaml_content = _replace_yaml_dicts(
@@ -856,7 +1049,7 @@ def write_cluster_config(
856
1049
  # Read the cluster name from the tmp yaml file, to take the backward
857
1050
  # compatbility restortion above into account.
858
1051
  # TODO: remove this after 2 minor releases, 0.10.0.
859
- yaml_config = common_utils.read_yaml(tmp_yaml_path)
1052
+ yaml_config = yaml_utils.read_yaml(tmp_yaml_path)
860
1053
  config_dict['cluster_name_on_cloud'] = yaml_config['cluster_name']
861
1054
 
862
1055
  # Make sure to do this before we optimize file mounts. Optimization is
@@ -880,18 +1073,29 @@ def write_cluster_config(
880
1073
  # compatibility should go before this call.
881
1074
  _optimize_file_mounts(tmp_yaml_path)
882
1075
 
883
- # Rename the tmp file to the final YAML path.
884
- os.rename(tmp_yaml_path, yaml_path)
885
- usage_lib.messages.usage.update_ray_yaml(yaml_path)
1076
+ # commit the final yaml to the database
1077
+ global_user_state.set_cluster_yaml(
1078
+ cluster_name,
1079
+ open(tmp_yaml_path, 'r', encoding='utf-8').read())
1080
+
1081
+ usage_lib.messages.usage.update_ray_yaml(tmp_yaml_path)
1082
+
1083
+ # Remove the tmp file.
1084
+ if sky_logging.logging_enabled(logger, sky_logging.DEBUG):
1085
+ debug_yaml_path = yaml_path + '.debug'
1086
+ os.rename(tmp_yaml_path, debug_yaml_path)
1087
+ else:
1088
+ os.remove(tmp_yaml_path)
1089
+
886
1090
  return config_dict
887
1091
 
888
1092
 
889
- def _add_auth_to_cluster_config(cloud: clouds.Cloud, cluster_config_file: str):
1093
+ def _add_auth_to_cluster_config(cloud: clouds.Cloud, tmp_yaml_path: str):
890
1094
  """Adds SSH key info to the cluster config.
891
1095
 
892
1096
  This function's output removes comments included in the jinja2 template.
893
1097
  """
894
- config = common_utils.read_yaml(cluster_config_file)
1098
+ config = yaml_utils.read_yaml(tmp_yaml_path)
895
1099
  # Check the availability of the cloud type.
896
1100
  if isinstance(cloud, (
897
1101
  clouds.AWS,
@@ -919,9 +1123,17 @@ def _add_auth_to_cluster_config(cloud: clouds.Cloud, cluster_config_file: str):
  config = auth.setup_vast_authentication(config)
  elif isinstance(cloud, clouds.Fluidstack):
  config = auth.setup_fluidstack_authentication(config)
+ elif isinstance(cloud, clouds.Hyperbolic):
+ config = auth.setup_hyperbolic_authentication(config)
+ elif isinstance(cloud, clouds.Shadeform):
+ config = auth.setup_shadeform_authentication(config)
+ elif isinstance(cloud, clouds.PrimeIntellect):
+ config = auth.setup_primeintellect_authentication(config)
+ elif isinstance(cloud, clouds.Seeweb):
+ config = auth.setup_seeweb_authentication(config)
  else:
  assert False, cloud
- common_utils.dump_yaml(cluster_config_file, config)
+ yaml_utils.dump_yaml(tmp_yaml_path, config)


  def get_timestamp_from_run_timestamp(run_timestamp: str) -> float:
@@ -979,7 +1191,7 @@ def _count_healthy_nodes_from_ray(output: str,


  @timeline.event
- def _deterministic_cluster_yaml_hash(yaml_path: str) -> str:
+ def _deterministic_cluster_yaml_hash(tmp_yaml_path: str) -> str:
  """Hash the cluster yaml and contents of file mounts to a unique string.

  Two invocations of this function should return the same string if and only
@@ -1021,9 +1233,8 @@ def _deterministic_cluster_yaml_hash(yaml_path: str) -> str:
  Rather than constructing the whole byte sequence, which may be quite large,
  we construct it incrementally by using hash.update() to add new bytes.
  """
-
  # Load the yaml contents so that we can directly remove keys.
- yaml_config = common_utils.read_yaml(yaml_path)
+ yaml_config = yaml_utils.read_yaml(tmp_yaml_path)
  for key_list in _RAY_YAML_KEYS_TO_REMOVE_FOR_HASH:
  dict_to_remove_from = yaml_config
  found_key = True
@@ -1042,7 +1253,7 @@ def _deterministic_cluster_yaml_hash(yaml_path: str) -> str:
  config_hash = hashlib.sha256()

  yaml_hash = hashlib.sha256(
- common_utils.dump_yaml_str(yaml_config).encode('utf-8'))
+ yaml_utils.dump_yaml_str(yaml_config).encode('utf-8'))
  config_hash.update(yaml_hash.digest())

  file_mounts = yaml_config.get('file_mounts', {})
@@ -1052,7 +1263,7 @@ def _deterministic_cluster_yaml_hash(yaml_path: str) -> str:
  file_mounts.pop('')

  for dst, src in sorted(file_mounts.items()):
- if src == yaml_path:
+ if src == tmp_yaml_path:
  # Skip the yaml file itself. We have already hashed a modified
  # version of it. The file may include fields we don't want to hash.
  continue
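Editor's note: the `_deterministic_cluster_yaml_hash` hunks above keep the function's documented approach of building the digest incrementally with `hash.update()` rather than materializing one large byte string. A standalone sketch of that idea for a config dict plus a set of mounted files (file names and the JSON canonicalization are illustrative, not SkyPilot's exact serialization):

```python
import hashlib
import json
import os
from typing import Any, Dict, Iterable


def config_and_files_hash(config: Dict[str, Any],
                          file_paths: Iterable[str]) -> str:
    """Incrementally hash a config dict and the contents of some files."""
    digest = hashlib.sha256()
    # Canonicalize the config so dict ordering cannot change the hash.
    digest.update(json.dumps(config, sort_keys=True).encode('utf-8'))
    for path in sorted(file_paths):
        digest.update(path.encode('utf-8'))  # Include the path itself.
        with open(path, 'rb') as f:
            # Stream the file in chunks so large mounts never sit in memory.
            for chunk in iter(lambda: f.read(1 << 20), b''):
                digest.update(chunk)
    return digest.hexdigest()
```

Two invocations return the same hex string exactly when the canonicalized config and every file's bytes are identical, which is the property the docstring in the hunk relies on.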
@@ -1147,7 +1358,7 @@ def wait_until_ray_cluster_ready(
  logger.error(common_utils.format_exception(e))
  return False, None # failed

- config = common_utils.read_yaml(cluster_config_file)
+ config = global_user_state.get_cluster_yaml_dict(cluster_config_file)

  docker_user = None
  if 'docker' in config:
@@ -1247,11 +1458,11 @@ def ssh_credential_from_yaml(
  """
  if cluster_yaml is None:
  return dict()
- config = common_utils.read_yaml(cluster_yaml)
+ config = global_user_state.get_cluster_yaml_dict(cluster_yaml)
  auth_section = config['auth']
  if ssh_user is None:
  ssh_user = auth_section['ssh_user'].strip()
- ssh_private_key = auth_section.get('ssh_private_key')
+ ssh_private_key_path = auth_section.get('ssh_private_key')
  ssh_control_name = config.get('cluster_name', '__default__')
  ssh_proxy_command = auth_section.get('ssh_proxy_command')

@@ -1260,9 +1471,10 @@ def ssh_credential_from_yaml(
  constants.SKY_SSH_USER_PLACEHOLDER in ssh_proxy_command):
  ssh_proxy_command = ssh_proxy_command.replace(
  constants.SKY_SSH_USER_PLACEHOLDER, ssh_user)
+
  credentials = {
  'ssh_user': ssh_user,
- 'ssh_private_key': ssh_private_key,
+ 'ssh_private_key': ssh_private_key_path,
  'ssh_control_name': ssh_control_name,
  'ssh_proxy_command': ssh_proxy_command,
  }
@@ -1275,6 +1487,62 @@ def ssh_credential_from_yaml(
  return credentials


+ def ssh_credentials_from_handles(
+ handles: List['cloud_vm_ray_backend.CloudVmRayResourceHandle'],
+ ) -> List[Dict[str, Any]]:
+ """Returns ssh_user, ssh_private_key and ssh_control name.
+ """
+ non_empty_cluster_yaml_paths = [
+ handle.cluster_yaml
+ for handle in handles
+ if handle.cluster_yaml is not None
+ ]
+ cluster_yaml_dicts = global_user_state.get_cluster_yaml_dict_multiple(
+ non_empty_cluster_yaml_paths)
+ cluster_yaml_dicts_to_index = {
+ cluster_yaml_path: cluster_yaml_dict
+ for cluster_yaml_path, cluster_yaml_dict in zip(
+ non_empty_cluster_yaml_paths, cluster_yaml_dicts)
+ }
+
+ credentials_to_return: List[Dict[str, Any]] = []
+ for handle in handles:
+ if handle.cluster_yaml is None:
+ credentials_to_return.append(dict())
+ continue
+ ssh_user = handle.ssh_user
+ docker_user = handle.docker_user
+ config = cluster_yaml_dicts_to_index[handle.cluster_yaml]
+ auth_section = config['auth']
+ if ssh_user is None:
+ ssh_user = auth_section['ssh_user'].strip()
+ ssh_private_key_path = auth_section.get('ssh_private_key')
+ ssh_control_name = config.get('cluster_name', '__default__')
+ ssh_proxy_command = auth_section.get('ssh_proxy_command')
+
+ # Update the ssh_user placeholder in proxy command, if required
+ if (ssh_proxy_command is not None and
+ constants.SKY_SSH_USER_PLACEHOLDER in ssh_proxy_command):
+ ssh_proxy_command = ssh_proxy_command.replace(
+ constants.SKY_SSH_USER_PLACEHOLDER, ssh_user)
+
+ credentials = {
+ 'ssh_user': ssh_user,
+ 'ssh_private_key': ssh_private_key_path,
+ 'ssh_control_name': ssh_control_name,
+ 'ssh_proxy_command': ssh_proxy_command,
+ }
+ if docker_user is not None:
+ credentials['docker_user'] = docker_user
+ ssh_provider_module = config['provider']['module']
+ # If we are running ssh command on kubernetes node.
+ if 'kubernetes' in ssh_provider_module:
+ credentials['disable_control_master'] = True
+ credentials_to_return.append(credentials)
+
+ return credentials_to_return
+
+
  def parallel_data_transfer_to_nodes(
  runners: List[command_runner.CommandRunner],
  source: Optional[str],
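Editor's note: the new `ssh_credentials_from_handles` above fetches all cluster YAMLs in a single database call and then assembles per-handle credentials, instead of issuing one read per handle. A rough sketch of that batch-then-zip shape, with a hypothetical `load_yaml_many` callable standing in for the real batched query:

```python
from typing import Any, Callable, Dict, List, Optional


def batch_credentials(
        yaml_keys: List[Optional[str]],
        load_yaml_many: Callable[[List[str]], List[Dict[str, Any]]]
) -> List[Dict[str, Any]]:
    """Resolve per-cluster credentials with one batched lookup."""
    present = [k for k in yaml_keys if k is not None]
    # One round trip for every distinct YAML, then an in-memory index.
    configs = dict(zip(present, load_yaml_many(present)))
    results: List[Dict[str, Any]] = []
    for key in yaml_keys:
        if key is None:
            results.append({})
            continue
        auth = configs[key].get('auth', {})
        results.append({
            'ssh_user': auth.get('ssh_user'),
            'ssh_private_key': auth.get('ssh_private_key'),
        })
    return results
```

The output stays positionally aligned with the input handles, which is what lets callers such as `get_clusters` zip credentials back onto their records.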
@@ -1435,7 +1703,7 @@ def get_node_ips(cluster_yaml: str,
  exceptions.FetchClusterInfoError: if we failed to get the IPs. e.reason is
  HEAD or WORKER.
  """
- ray_config = common_utils.read_yaml(cluster_yaml)
+ ray_config = global_user_state.get_cluster_yaml_dict(cluster_yaml)
  # Use the new provisioner for AWS.
  provider_name = cluster_utils.get_provider_name(ray_config)
  cloud = registry.CLOUD_REGISTRY.from_str(provider_name)
@@ -1523,18 +1791,54 @@ def get_node_ips(cluster_yaml: str,

  def check_network_connection():
  # Tolerate 3 retries as it is observed that connections can fail.
- adapter = adapters.HTTPAdapter(max_retries=retry_lib.Retry(total=3))
  http = requests.Session()
- http.mount('https://', adapter)
- http.mount('http://', adapter)
- for i, ip in enumerate(_TEST_IP_LIST):
- try:
- http.head(ip, timeout=3)
- return
- except (requests.Timeout, requests.exceptions.ConnectionError) as e:
- if i == len(_TEST_IP_LIST) - 1:
- raise exceptions.NetworkError('Could not refresh the cluster. '
- 'Network seems down.') from e
+ http.mount('https://', adapters.HTTPAdapter())
+ http.mount('http://', adapters.HTTPAdapter())
+
+ # Alternate between IPs on each retry
+ max_retries = 3
+ timeout = 0.5
+
+ for _ in range(max_retries):
+ for ip in _TEST_IP_LIST:
+ try:
+ http.head(ip, timeout=timeout)
+ return
+ except (requests.Timeout, requests.exceptions.ConnectionError):
+ continue
+
+ timeout *= 2 # Double the timeout for next retry
+
+ # If we get here, all IPs failed
+ # Assume network connection is down
+ raise exceptions.NetworkError('Could not refresh the cluster. '
+ 'Network seems down.')
+
+
+ async def async_check_network_connection():
+ """Check if the network connection is available.
+
+ Tolerates 3 retries as it is observed that connections can fail.
+ Uses aiohttp for async HTTP requests.
+ """
+ # Create a session with retry logic
+ timeout = ClientTimeout(total=15)
+ connector = TCPConnector(limit=1) # Limit to 1 connection at a time
+
+ async with aiohttp.ClientSession(timeout=timeout,
+ connector=connector) as session:
+ for i, ip in enumerate(_TEST_IP_LIST):
+ try:
+ async with session.head(ip) as response:
+ if response.status < 400: # Any 2xx or 3xx status is good
+ return
+ except (aiohttp.ClientError, asyncio.TimeoutError) as e:
+ if i == len(_TEST_IP_LIST) - 1:
+ raise exceptions.NetworkError(
+ 'Could not refresh the cluster. '
+ 'Network seems down.') from e
+ # If not the last IP, continue to try the next one
+ continue


  @timeline.event
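Editor's note: the rewritten `check_network_connection` above alternates across the probe IPs on every pass and doubles the per-request timeout between passes, instead of retrying a single endpoint with a fixed timeout; `async_check_network_connection` applies the same probing with aiohttp. A self-contained sketch of the synchronous pattern (the probe URLs and exception type here are placeholders, not SkyPilot's):

```python
import requests

PROBE_URLS = ['https://1.1.1.1', 'https://8.8.8.8']  # Example endpoints.


def probe_network(max_retries: int = 3, timeout: float = 0.5) -> None:
    """Raise ConnectionError if no probe URL answers within the retry budget."""
    session = requests.Session()
    for _ in range(max_retries):
        for url in PROBE_URLS:
            try:
                session.head(url, timeout=timeout)
                return  # Any response at all means the network is reachable.
            except (requests.Timeout, requests.exceptions.ConnectionError):
                continue  # Try the next URL before backing off.
        timeout *= 2  # Back off before the next pass over all URLs.
    raise ConnectionError('Network seems down.')
```

The async variant in the hunk follows the same loop shape, but awaits `session.head()` inside an `aiohttp.ClientSession` so the check can run on the API server's event loop without blocking it.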
@@ -1549,14 +1853,34 @@ def check_owner_identity(cluster_name: str) -> None:
  """
  if env_options.Options.SKIP_CLOUD_IDENTITY_CHECK.get():
  return
- record = global_user_state.get_cluster_from_name(cluster_name)
+ record = global_user_state.get_cluster_from_name(cluster_name,
+ include_user_info=False,
+ summary_response=True)
  if record is None:
  return
+ _check_owner_identity_with_record(cluster_name, record)
+
+
+ def _check_owner_identity_with_record(cluster_name: str,
+ record: Dict[str, Any]) -> None:
+ if env_options.Options.SKIP_CLOUD_IDENTITY_CHECK.get():
+ return
  handle = record['handle']
  if not isinstance(handle, backends.CloudVmRayResourceHandle):
  return
+ active_workspace = skypilot_config.get_active_workspace()
+ cluster_workspace = record.get('workspace',
+ constants.SKYPILOT_DEFAULT_WORKSPACE)
+ if active_workspace != cluster_workspace:
+ with ux_utils.print_exception_no_traceback():
+ raise exceptions.ClusterOwnerIdentityMismatchError(
+ f'{colorama.Fore.YELLOW}'
+ f'The cluster {cluster_name!r} is in workspace '
+ f'{cluster_workspace!r}, but the active workspace is '
+ f'{active_workspace!r}.{colorama.Fore.RESET}')

- cloud = handle.launched_resources.cloud
+ launched_resources = handle.launched_resources.assert_launchable()
+ cloud = launched_resources.cloud
  user_identities = cloud.get_user_identities()
  owner_identity = record['owner']
  if user_identities is None:
@@ -1625,22 +1949,26 @@ def tag_filter_for_cluster(cluster_name: str) -> Dict[str, str]:
  }


+ @context_utils.cancellation_guard
  def _query_cluster_status_via_cloud_api(
- handle: 'cloud_vm_ray_backend.CloudVmRayResourceHandle'
- ) -> List[status_lib.ClusterStatus]:
- """Returns the status of the cluster.
+ handle: 'cloud_vm_ray_backend.CloudVmRayResourceHandle',
+ retry_if_missing: bool,
+ ) -> List[Tuple[status_lib.ClusterStatus, Optional[str]]]:
+ """Returns the status of the cluster as a list of tuples corresponding
+ to the node status and an optional reason string for said status.

  Raises:
  exceptions.ClusterStatusFetchingError: the cluster status cannot be
  fetched from the cloud provider.
  """
+ cluster_name = handle.cluster_name
  cluster_name_on_cloud = handle.cluster_name_on_cloud
  cluster_name_in_hint = common_utils.cluster_name_in_hint(
  handle.cluster_name, cluster_name_on_cloud)
  # Use region and zone from the cluster config, instead of the
  # handle.launched_resources, because the latter may not be set
  # correctly yet.
- ray_config = common_utils.read_yaml(handle.cluster_yaml)
+ ray_config = global_user_state.get_cluster_yaml_dict(handle.cluster_yaml)
  provider_config = ray_config['provider']

  # Query the cloud provider.
@@ -1651,7 +1979,11 @@ def _query_cluster_status_via_cloud_api(
  cloud_name = repr(handle.launched_resources.cloud)
  try:
  node_status_dict = provision_lib.query_instances(
- cloud_name, cluster_name_on_cloud, provider_config)
+ cloud_name,
+ cluster_name,
+ cluster_name_on_cloud,
+ provider_config,
+ retry_if_missing=retry_if_missing)
  logger.debug(f'Querying {cloud_name} cluster '
  f'{cluster_name_in_hint} '
  f'status:\n{pprint.pformat(node_status_dict)}')
@@ -1667,12 +1999,55 @@ def _query_cluster_status_via_cloud_api(
  region = provider_config.get('region') or provider_config.get(
  'location')
  zone = ray_config['provider'].get('availability_zone')
+ # TODO (kyuds): refactor cloud.query_status api to include reason.
+ # Currently not refactoring as this API is actually supposed to be
+ # deprecated soon.
  node_statuses = cloud.query_status(
  cluster_name_on_cloud,
  tag_filter_for_cluster(cluster_name_on_cloud), region, zone)
+ node_statuses = [(status, None) for status in node_statuses]
  return node_statuses


+ def _query_cluster_info_via_cloud_api(
+ handle: 'cloud_vm_ray_backend.CloudVmRayResourceHandle'
+ ) -> provision_common.ClusterInfo:
+ """Returns the cluster info.
+
+ Raises:
+ exceptions.NotSupportedError: the cloud does not support the new provisioner.
+ exceptions.FetchClusterInfoError: the cluster info cannot be
+ fetched from the cloud provider.
+ """
+ cloud = handle.launched_resources.cloud
+ assert cloud is not None, handle
+ if cloud.STATUS_VERSION >= clouds.StatusVersion.SKYPILOT:
+ try:
+ cloud_name = repr(cloud)
+ ray_config = global_user_state.get_cluster_yaml_dict(
+ handle.cluster_yaml)
+ provider_config = ray_config['provider']
+ region = provider_config.get('region') or provider_config.get(
+ 'location')
+ cluster_info = provision_lib.get_cluster_info(
+ cloud_name, region, handle.cluster_name_on_cloud,
+ provider_config)
+ logger.debug(
+ f'Querying {cloud_name} cluster '
+ f'{handle.cluster_name_on_cloud} '
+ f'head instance:\n{cluster_info.get_head_instance()}\n'
+ f'worker instances:\n{cluster_info.get_worker_instances()}')
+ return cluster_info
+ except Exception as e: # pylint: disable=broad-except
+ with ux_utils.print_exception_no_traceback():
+ raise exceptions.FetchClusterInfoError(
+ reason=exceptions.FetchClusterInfoError.Reason.UNKNOWN
+ ) from e
+ else:
+ raise exceptions.NotSupportedError(
+ f'The cloud {cloud} does not support the SkyPilot provisioner.')
+
+
  def check_can_clone_disk_and_override_task(
  cluster_name: str, target_cluster_name: Optional[str], task: 'task_lib.Task'
  ) -> Tuple['task_lib.Task', 'cloud_vm_ray_backend.CloudVmRayResourceHandle']:
@@ -1720,12 +2095,12 @@ def check_can_clone_disk_and_override_task(
  'a new target cluster name.')

  new_task_resources = []
- original_cloud = handle.launched_resources.cloud
+ launched_resources = handle.launched_resources.assert_launchable()
+ original_cloud = launched_resources.cloud
  original_cloud.check_features_are_supported(
- handle.launched_resources,
+ launched_resources,
  {clouds.CloudImplementationFeatures.CLONE_DISK_FROM_CLUSTER})

- assert original_cloud is not None, handle.launched_resources
  has_override = False
  has_disk_size_met = False
  has_cloud_met = False
@@ -1739,7 +2114,7 @@ def check_can_clone_disk_and_override_task(
  continue
  has_cloud_met = True

- override_param = {}
+ override_param: Dict[str, Any] = {}
  if task_resources.cloud is None:
  override_param['cloud'] = original_cloud
  if task_resources.region is None:
@@ -1786,7 +2161,12 @@ def check_can_clone_disk_and_override_task(
  return task, handle


- def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
+ def _update_cluster_status(
+ cluster_name: str,
+ record: Dict[str, Any],
+ retry_if_missing: bool,
+ include_user_info: bool = True,
+ summary_response: bool = False) -> Optional[Dict[str, Any]]:
  """Update the cluster status.

  The cluster status is updated by checking ray cluster and real status from
@@ -1813,13 +2193,16 @@ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
  fetched from the cloud provider or there are leaked nodes causing
  the node number larger than expected.
  """
- record = global_user_state.get_cluster_from_name(cluster_name)
- if record is None:
- return None
  handle = record['handle']
  if handle.cluster_yaml is None:
  # Remove cluster from db since this cluster does not have a config file
  # or any other ongoing requests
+ global_user_state.add_cluster_event(
+ cluster_name,
+ None,
+ 'Cluster has no YAML file. Removing the cluster from cache.',
+ global_user_state.ClusterEventType.STATUS_CHANGE,
+ nop_if_duplicate=True)
  global_user_state.remove_cluster(cluster_name, terminate=True)
  logger.debug(f'Cluster {cluster_name!r} has no YAML file. '
  'Removing the cluster from cache.')
@@ -1828,10 +2211,11 @@ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
  return record
  cluster_name = handle.cluster_name

- node_statuses = _query_cluster_status_via_cloud_api(handle)
+ node_statuses = _query_cluster_status_via_cloud_api(
+ handle, retry_if_missing=retry_if_missing)

- all_nodes_up = (all(
- status == status_lib.ClusterStatus.UP for status in node_statuses) and
+ all_nodes_up = (all(status[0] == status_lib.ClusterStatus.UP
+ for status in node_statuses) and
  len(node_statuses) == handle.launched_nodes)

  def get_node_counts_from_ray_status(
@@ -1842,14 +2226,17 @@ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
  require_outputs=True,
  separate_stderr=True)
  if rc:
- raise RuntimeError(
- f'Refreshing status ({cluster_name!r}): Failed to check '
- f'ray cluster\'s healthiness with '
- f'{instance_setup.RAY_STATUS_WITH_SKY_RAY_PORT_COMMAND}.\n'
- f'-- stdout --\n{output}\n-- stderr --\n{stderr}')
+ raise exceptions.CommandError(
+ rc, instance_setup.RAY_STATUS_WITH_SKY_RAY_PORT_COMMAND,
+ f'Failed to check ray cluster\'s healthiness.\n'
+ '-- stdout --\n'
+ f'{output}\n', stderr)
  return (*_count_healthy_nodes_from_ray(output), output, stderr)

+ ray_status_details: Optional[str] = None
+
  def run_ray_status_to_check_ray_cluster_healthy() -> bool:
+ nonlocal ray_status_details
  try:
  # NOTE: fetching the IPs is very slow as it calls into
  # `ray get head-ip/worker-ips`. Using cached IPs is safe because
@@ -1872,9 +2259,44 @@ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:

  total_nodes = handle.launched_nodes * handle.num_ips_per_node

+ cloud_name = repr(handle.launched_resources.cloud).lower()
  for i in range(5):
- ready_head, ready_workers, output, stderr = (
- get_node_counts_from_ray_status(head_runner))
+ try:
+ ready_head, ready_workers, output, stderr = (
+ get_node_counts_from_ray_status(head_runner))
+ except exceptions.CommandError as e:
+ logger.debug(f'Refreshing status ({cluster_name!r}) attempt'
+ f' {i}: {common_utils.format_exception(e)}')
+ if cloud_name != 'kubernetes':
+ # Non-k8s clusters can be manually restarted and:
+ # 1. Get new IP addresses, or
+ # 2. Not have the SkyPilot runtime setup
+ #
+ # So we should surface a message to the user to
+ # help them recover from this inconsistent state.
+ has_new_ip_addr = (
+ e.detailed_reason is not None and
+ _SSH_CONNECTION_TIMED_OUT_PATTERN.search(
+ e.detailed_reason.strip()) is not None)
+ runtime_not_setup = (_RAY_CLUSTER_NOT_FOUND_MESSAGE
+ in e.error_msg)
+ if has_new_ip_addr or runtime_not_setup:
+ yellow = colorama.Fore.YELLOW
+ bright = colorama.Style.BRIGHT
+ reset = colorama.Style.RESET_ALL
+ ux_utils.console_newline()
+ logger.warning(
+ f'{yellow}Failed getting cluster status despite all nodes '
+ f'being up ({cluster_name!r}). '
+ f'If the cluster was restarted manually, try running: '
+ f'{reset}{bright}sky start {cluster_name}{reset} '
+ f'{yellow}to recover from INIT status.{reset}')
+ return False
+ raise e
+ # We retry for kubernetes because coreweave can have a
+ # transient network issue.
+ time.sleep(1)
+ continue
  if ready_head + ready_workers == total_nodes:
  return True
  logger.debug(f'Refreshing status ({cluster_name!r}) attempt '
@@ -1892,19 +2314,25 @@ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
  # showing up
  time.sleep(1)

+ ray_status_details = (
+ f'{ready_head + ready_workers}/{total_nodes} ready')
  raise RuntimeError(
  f'Refreshing status ({cluster_name!r}): ray status not showing '
  f'all nodes ({ready_head + ready_workers}/'
  f'{total_nodes});\noutput:\n{output}\nstderr:\n{stderr}')

  except exceptions.FetchClusterInfoError:
+ ray_status_details = 'failed to get IPs'
  logger.debug(
  f'Refreshing status ({cluster_name!r}) failed to get IPs.')
  except RuntimeError as e:
+ if ray_status_details is None:
+ ray_status_details = str(e)
  logger.debug(common_utils.format_exception(e))
  except Exception as e: # pylint: disable=broad-except
  # This can be raised by `external_ssh_ports()`, due to the
  # underlying call to kubernetes API.
+ ray_status_details = str(e)
  logger.debug(f'Refreshing status ({cluster_name!r}) failed: ',
  exc_info=e)
  return False
@@ -1925,16 +2353,28 @@ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
  # run_ray_status_to_check_all_nodes_up() is slow due to calling `ray get
  # head-ip/worker-ips`.
  record['status'] = status_lib.ClusterStatus.UP
- global_user_state.add_or_update_cluster(cluster_name,
- handle,
- requested_resources=None,
- ready=True,
- is_launch=False)
- return global_user_state.get_cluster_from_name(cluster_name)
+ # Add cluster event for instance status check.
+ global_user_state.add_cluster_event(
+ cluster_name,
+ status_lib.ClusterStatus.UP,
+ 'All nodes up; SkyPilot runtime healthy.',
+ global_user_state.ClusterEventType.STATUS_CHANGE,
+ nop_if_duplicate=True)
+ global_user_state.add_or_update_cluster(
+ cluster_name,
+ handle,
+ requested_resources=None,
+ ready=True,
+ is_launch=False,
+ existing_cluster_hash=record['cluster_hash'])
+ return global_user_state.get_cluster_from_name(
+ cluster_name,
+ include_user_info=include_user_info,
+ summary_response=summary_response)

  # All cases below are transitioning the cluster to non-UP states.
-
- if (not node_statuses and handle.launched_resources.cloud.STATUS_VERSION >=
+ launched_resources = handle.launched_resources.assert_launchable()
+ if (not node_statuses and launched_resources.cloud.STATUS_VERSION >=
  clouds.StatusVersion.SKYPILOT):
  # Note: launched_at is set during sky launch, even on an existing
  # cluster. This will catch the case where the cluster was terminated on
@@ -1947,7 +2387,8 @@ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
  # and check again. This is a best-effort leak prevention check.
  # See https://github.com/skypilot-org/skypilot/issues/4431.
  time.sleep(_LAUNCH_DOUBLE_CHECK_DELAY)
- node_statuses = _query_cluster_status_via_cloud_api(handle)
+ node_statuses = _query_cluster_status_via_cloud_api(
+ handle, retry_if_missing=False)
  # Note: even if all the node_statuses are UP now, we will still
  # consider this cluster abnormal, and its status will be INIT.

@@ -2002,85 +2443,168 @@ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
  # * The cluster is partially or completely in the INIT state, which means
  # that provisioning was interrupted. This is considered abnormal.
  #
- # An abnormal cluster will transition to INIT and have any autostop setting
- # reset (unless it's autostopping/autodowning).
- is_abnormal = ((0 < len(node_statuses) < handle.launched_nodes) or any(
- status != status_lib.ClusterStatus.STOPPED for status in node_statuses))
+ # An abnormal cluster will transition to INIT, and one of the following will happen:
+ # (1) If the SkyPilot provisioner is used AND the head node is alive, we
+ # will not reset the autostop setting. Because autostop is handled by
+ # the skylet through the cloud APIs, and will continue to function
+ # regardless of the ray cluster's health.
+ # (2) Otherwise, we will reset the autostop setting, unless the cluster is
+ # autostopping/autodowning.
+ some_nodes_terminated = 0 < len(node_statuses) < handle.launched_nodes
+ # If all nodes are up and the ray cluster is healthy, we would have returned
+ # earlier. So if all_nodes_up is True and we are here, it means the ray
+ # cluster must have been unhealthy.
+ ray_cluster_unhealthy = all_nodes_up
+ some_nodes_not_stopped = any(status[0] != status_lib.ClusterStatus.STOPPED
+ for status in node_statuses)
+ is_abnormal = (some_nodes_terminated or some_nodes_not_stopped)
+
  if is_abnormal:
+ status_reason = ', '.join(
+ [status[1] for status in node_statuses if status[1] is not None])
+
+ if some_nodes_terminated:
+ init_reason = 'one or more nodes terminated'
+ elif ray_cluster_unhealthy:
+ init_reason = f'ray cluster is unhealthy ({ray_status_details})'
+ elif some_nodes_not_stopped:
+ init_reason = 'some but not all nodes are stopped'
  logger.debug('The cluster is abnormal. Setting to INIT status. '
  f'node_statuses: {node_statuses}')
- backend = get_backend_from_handle(handle)
- if isinstance(backend,
- backends.CloudVmRayBackend) and record['autostop'] >= 0:
- if not backend.is_definitely_autostopping(handle,
- stream_logs=False):
- # Friendly hint.
- autostop = record['autostop']
- maybe_down_str = ' --down' if record['to_down'] else ''
- noun = 'autodown' if record['to_down'] else 'autostop'
-
- # Reset the autostopping as the cluster is abnormal, and may
- # not correctly autostop. Resetting the autostop will let
- # the user know that the autostop may not happen to avoid
- # leakages from the assumption that the cluster will autostop.
- success = True
- reset_local_autostop = True
+ if record['autostop'] >= 0:
+ is_head_node_alive = False
+ if launched_resources.cloud.PROVISIONER_VERSION >= clouds.ProvisionerVersion.SKYPILOT:
+ # Check if the head node is alive
  try:
- backend.set_autostop(handle, -1, stream_logs=False)
- except exceptions.CommandError as e:
- success = False
- if e.returncode == 255:
- word = 'autostopped' if noun == 'autostop' else 'autodowned'
- logger.debug(f'The cluster is likely {word}.')
- reset_local_autostop = False
- except (Exception, SystemExit) as e: # pylint: disable=broad-except
- success = False
- logger.debug(f'Failed to reset autostop. Due to '
- f'{common_utils.format_exception(e)}')
- if reset_local_autostop:
- global_user_state.set_cluster_autostop_value(
- handle.cluster_name, -1, to_down=False)
-
- if success:
- operation_str = (f'Canceled {noun} on the cluster '
- f'{cluster_name!r}')
+ cluster_info = _query_cluster_info_via_cloud_api(handle)
+ is_head_node_alive = cluster_info.get_head_instance(
+ ) is not None
+ except Exception as e: # pylint: disable=broad-except
+ logger.debug(
+ f'Failed to get cluster info for {cluster_name!r}: '
+ f'{common_utils.format_exception(e)}')
+
+ backend = get_backend_from_handle(handle)
+ if isinstance(backend, backends.CloudVmRayBackend):
+ if is_head_node_alive:
+ logger.debug(
+ f'Skipping autostop reset for cluster {cluster_name!r} '
+ 'because the head node is alive.')
+ elif not backend.is_definitely_autostopping(handle,
+ stream_logs=False):
+ # Friendly hint.
+ autostop = record['autostop']
+ maybe_down_str = ' --down' if record['to_down'] else ''
+ noun = 'autodown' if record['to_down'] else 'autostop'
+
+ # Reset the autostopping as the cluster is abnormal, and may
+ # not correctly autostop. Resetting the autostop will let
+ # the user know that the autostop may not happen to avoid
+ # leakages from the assumption that the cluster will autostop.
+ success = True
+ reset_local_autostop = True
+ try:
+ backend.set_autostop(
+ handle,
+ -1,
+ autostop_lib.DEFAULT_AUTOSTOP_WAIT_FOR,
+ stream_logs=False)
+ except (exceptions.CommandError,
+ grpc.FutureTimeoutError) as e:
+ success = False
+ if isinstance(e, grpc.FutureTimeoutError) or (
+ isinstance(e, exceptions.CommandError) and
+ e.returncode == 255):
+ word = 'autostopped' if noun == 'autostop' else 'autodowned'
+ logger.debug(f'The cluster is likely {word}.')
+ reset_local_autostop = False
+ except (Exception, SystemExit) as e: # pylint: disable=broad-except
+ success = False
+ logger.debug(f'Failed to reset autostop. Due to '
+ f'{common_utils.format_exception(e)}')
+ if reset_local_autostop:
+ global_user_state.set_cluster_autostop_value(
+ handle.cluster_name, -1, to_down=False)
+
+ if success:
+ operation_str = (f'Canceled {noun} on the cluster '
+ f'{cluster_name!r}')
+ else:
+ operation_str = (
+ f'Attempted to cancel {noun} on the '
+ f'cluster {cluster_name!r} with best effort')
+ yellow = colorama.Fore.YELLOW
+ bright = colorama.Style.BRIGHT
+ reset = colorama.Style.RESET_ALL
+ ux_utils.console_newline()
+ logger.warning(
+ f'{yellow}{operation_str}, since it is found to be in an '
+ f'abnormal state. To fix, try running: {reset}{bright}sky '
+ f'start -f -i {autostop}{maybe_down_str} {cluster_name}'
+ f'{reset}')
  else:
- operation_str = (
- f'Attempted to cancel {noun} on the '
- f'cluster {cluster_name!r} with best effort')
- yellow = colorama.Fore.YELLOW
- bright = colorama.Style.BRIGHT
- reset = colorama.Style.RESET_ALL
- ux_utils.console_newline()
- logger.warning(
- f'{yellow}{operation_str}, since it is found to be in an '
- f'abnormal state. To fix, try running: {reset}{bright}sky '
- f'start -f -i {autostop}{maybe_down_str} {cluster_name}'
- f'{reset}')
- else:
- ux_utils.console_newline()
- operation_str = 'autodowning' if record[
- 'to_down'] else 'autostopping'
- logger.info(
- f'Cluster {cluster_name!r} is {operation_str}. Setting to '
- 'INIT status; try refresh again in a while.')
+ ux_utils.console_newline()
+ operation_str = 'autodowning' if record[
+ 'to_down'] else 'autostopping'
+ logger.info(
+ f'Cluster {cluster_name!r} is {operation_str}. Setting to '
+ 'INIT status; try refresh again in a while.')

  # If the user starts part of a STOPPED cluster, we still need a status
  # to represent the abnormal status. For spot cluster, it can also
  # represent that the cluster is partially preempted.
  # TODO(zhwu): the definition of INIT should be audited/changed.
  # Adding a new status UNHEALTHY for abnormal status can be a choice.
- global_user_state.add_or_update_cluster(cluster_name,
- handle,
- requested_resources=None,
- ready=False,
- is_launch=False)
- return global_user_state.get_cluster_from_name(cluster_name)
+ init_reason_regex = None
+ if not status_reason:
+ # If there is not a status reason, don't re-add (and overwrite) the
+ # event if there is already an event with the same reason which may
+ # have a status reason.
+ # Some status reason clears after a certain time (e.g. k8s events
+ # are only stored for an hour by default), so it is possible that
+ # the previous event has a status reason, but now it does not.
+ init_reason_regex = (f'^Cluster is abnormal because '
+ f'{re.escape(init_reason)}.*')
+ log_message = f'Cluster is abnormal because {init_reason}'
+ if status_reason:
+ log_message += f' ({status_reason})'
+ log_message += '. Transitioned to INIT.'
+ global_user_state.add_cluster_event(
+ cluster_name,
+ status_lib.ClusterStatus.INIT,
+ log_message,
+ global_user_state.ClusterEventType.STATUS_CHANGE,
+ nop_if_duplicate=True,
+ duplicate_regex=init_reason_regex)
+ global_user_state.add_or_update_cluster(
+ cluster_name,
+ handle,
+ requested_resources=None,
+ ready=False,
+ is_launch=False,
+ existing_cluster_hash=record['cluster_hash'])
+ return global_user_state.get_cluster_from_name(
+ cluster_name,
+ include_user_info=include_user_info,
+ summary_response=summary_response)
  # Now is_abnormal is False: either node_statuses is empty or all nodes are
  # STOPPED.
+ verb = 'terminated' if to_terminate else 'stopped'
  backend = backends.CloudVmRayBackend()
+ global_user_state.add_cluster_event(
+ cluster_name,
+ None,
+ f'All nodes {verb}, cleaning up the cluster.',
+ global_user_state.ClusterEventType.STATUS_CHANGE,
+ # This won't do anything for a terminated cluster, but it's needed for a
+ # stopped cluster.
+ nop_if_duplicate=True,
+ )
  backend.post_teardown_cleanup(handle, terminate=to_terminate, purge=False)
- return global_user_state.get_cluster_from_name(cluster_name)
+ return global_user_state.get_cluster_from_name(
+ cluster_name,
+ include_user_info=include_user_info,
+ summary_response=summary_response)


  def _must_refresh_cluster_status(
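Editor's note: the reworked abnormal-state branch above first classifies why the cluster looks abnormal (some nodes terminated, the ray cluster unhealthy despite all nodes being up, or some nodes not stopped) and only then decides whether autostop may be reset. A small pure-function sketch of just the classification step, using plain strings in place of the real status enum (names here are illustrative):

```python
from typing import List, Optional, Tuple


def classify_abnormal(
        node_statuses: List[Tuple[str, Optional[str]]],
        launched_nodes: int,
        all_nodes_up: bool) -> Optional[str]:
    """Return an INIT reason string, or None if the cluster is not abnormal.

    Each node status is a (state, optional_reason) tuple, mirroring the new
    return type of the cloud status query in the hunk above.
    """
    some_terminated = 0 < len(node_statuses) < launched_nodes
    some_not_stopped = any(state != 'STOPPED' for state, _ in node_statuses)
    if not (some_terminated or some_not_stopped):
        return None  # Empty or fully stopped: handled by the cleanup path.
    if some_terminated:
        return 'one or more nodes terminated'
    if all_nodes_up:
        # All nodes up but we did not return early: ray must be unhealthy.
        return 'ray cluster is unhealthy'
    return 'some but not all nodes are stopped'
```

Keeping the classification separate from the autostop-reset side effects is what lets the diff record a precise reason in the new cluster event log.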
@@ -2102,12 +2626,14 @@ def _must_refresh_cluster_status(


  def refresh_cluster_record(
- cluster_name: str,
- *,
- force_refresh_statuses: Optional[Set[status_lib.ClusterStatus]] = None,
- acquire_per_cluster_status_lock: bool = True,
- cluster_status_lock_timeout: int = CLUSTER_STATUS_LOCK_TIMEOUT_SECONDS
- ) -> Optional[Dict[str, Any]]:
+ cluster_name: str,
+ *,
+ force_refresh_statuses: Optional[Set[status_lib.ClusterStatus]] = None,
+ cluster_lock_already_held: bool = False,
+ cluster_status_lock_timeout: int = CLUSTER_STATUS_LOCK_TIMEOUT_SECONDS,
+ include_user_info: bool = True,
+ summary_response: bool = False,
+ retry_if_missing: bool = True) -> Optional[Dict[str, Any]]:
  """Refresh the cluster, and return the possibly updated record.

  The function will update the cached cluster status in the global state. For
@@ -2124,14 +2650,20 @@ def refresh_cluster_record(
  _CLUSTER_STATUS_CACHE_DURATION_SECONDS old, and one of:
  1. the cluster is a spot cluster, or
  2. cluster autostop is set and the cluster is not STOPPED.
- acquire_per_cluster_status_lock: Whether to acquire the per-cluster lock
- before updating the status. Even if this is True, the lock may not be
- acquired if the status does not need to be refreshed.
+ cluster_lock_already_held: Whether the caller is already holding the
+ per-cluster lock. You MUST NOT set this to True if the caller does not
+ already hold the lock. If True, we will not acquire the lock before
+ updating the status. Failing to hold the lock while updating the
+ status can lead to correctness issues - e.g. a launch in progress may
+ appear to be DOWN incorrectly. Even if this is set to False, the lock
+ may not be acquired if the status does not need to be refreshed.
  cluster_status_lock_timeout: The timeout to acquire the per-cluster
  lock. If timeout, the function will use the cached status. If the
  value is <0, do not timeout (wait for the lock indefinitely). By
  default, this is set to CLUSTER_STATUS_LOCK_TIMEOUT_SECONDS. Warning:
  if correctness is required, you must set this to -1.
+ retry_if_missing: Whether to retry the call to the cloud api if the
+ cluster is not found when querying the live status on the cloud.

  Returns:
  If the cluster is terminated or does not exist, return None.
@@ -2147,69 +2679,95 @@ def refresh_cluster_record(
  the node number larger than expected.
  """

- record = global_user_state.get_cluster_from_name(cluster_name)
+ ctx = context_lib.get()
+ record = global_user_state.get_cluster_from_name(
+ cluster_name,
+ include_user_info=include_user_info,
+ summary_response=summary_response)
  if record is None:
  return None
- check_owner_identity(cluster_name)
-
- if not isinstance(record['handle'], backends.CloudVmRayResourceHandle):
- return record
-
- # The loop logic allows us to notice if the status was updated in the
- # global_user_state by another process and stop trying to get the lock.
- # The core loop logic is adapted from FileLock's implementation.
- lock = filelock.FileLock(CLUSTER_STATUS_LOCK_PATH.format(cluster_name))
- start_time = time.perf_counter()
-
- # Loop until we have an up-to-date status or until we acquire the lock.
- while True:
- # Check to see if we can return the cached status.
- if not _must_refresh_cluster_status(record, force_refresh_statuses):
- return record
-
- if not acquire_per_cluster_status_lock:
- return _update_cluster_status(cluster_name)
-
- # Try to acquire the lock so we can fetch the status.
- try:
- with lock.acquire(blocking=False):
- # Check the cluster status again, since it could have been
- # updated between our last check and acquiring the lock.
- record = global_user_state.get_cluster_from_name(cluster_name)
- if record is None or not _must_refresh_cluster_status(
- record, force_refresh_statuses):
- return record
- # Update and return the cluster status.
- return _update_cluster_status(cluster_name)
- except filelock.Timeout:
- # lock.acquire() will throw a Timeout exception if the lock is not
- # available and we have blocking=False.
- pass
-
- # Logic adapted from FileLock.acquire().
- # If cluster_status_lock_time is <0, we will never hit this. No timeout.
- # Otherwise, if we have timed out, return the cached status. This has
- # the potential to cause correctness issues, but if so it is the
- # caller's responsibility to set the timeout to -1.
- if 0 <= cluster_status_lock_timeout < time.perf_counter() - start_time:
- logger.debug('Refreshing status: Failed get the lock for cluster '
- f'{cluster_name!r}. Using the cached status.')
- return record
- time.sleep(0.05)
-
- # Refresh for next loop iteration.
- record = global_user_state.get_cluster_from_name(cluster_name)
- if record is None:
- return None
+ # TODO(zhwu, 05/20): switch to the specific workspace to make sure we are
+ # using the correct cloud credentials.
+ workspace = record.get('workspace', constants.SKYPILOT_DEFAULT_WORKSPACE)
+ with skypilot_config.local_active_workspace_ctx(workspace):
+ # check_owner_identity returns if the record handle is
+ # not a CloudVmRayResourceHandle
+ _check_owner_identity_with_record(cluster_name, record)
+
+ # The loop logic allows us to notice if the status was updated in the
+ # global_user_state by another process and stop trying to get the lock.
+ lock = locks.get_lock(cluster_status_lock_id(cluster_name))
+ start_time = time.perf_counter()
+
+ # Loop until we have an up-to-date status or until we acquire the lock.
+ while True:
+ # Check if the context is canceled.
+ if ctx is not None and ctx.is_canceled():
+ raise asyncio.CancelledError()
+ # Check to see if we can return the cached status.
+ if not _must_refresh_cluster_status(record, force_refresh_statuses):
+ return record
+
+ if cluster_lock_already_held:
+ return _update_cluster_status(cluster_name, record,
+ retry_if_missing,
+ include_user_info,
+ summary_response)
+
+ # Try to acquire the lock so we can fetch the status.
+ try:
+ with lock.acquire(blocking=False):
+ # Check the cluster status again, since it could have been
+ # updated between our last check and acquiring the lock.
+ record = global_user_state.get_cluster_from_name(
+ cluster_name,
+ include_user_info=include_user_info,
+ summary_response=summary_response)
+ if record is None or not _must_refresh_cluster_status(
+ record, force_refresh_statuses):
+ return record
+ # Update and return the cluster status.
+ return _update_cluster_status(cluster_name, record,
+ retry_if_missing,
+ include_user_info,
+ summary_response)
+
+ except locks.LockTimeout:
+ # lock.acquire() will throw a Timeout exception if the lock is not
+ # available and we have blocking=False.
+ pass
+
+ # Logic adapted from FileLock.acquire().
+ # If cluster_status_lock_time is <0, we will never hit this. No timeout.
+ # Otherwise, if we have timed out, return the cached status. This has
+ # the potential to cause correctness issues, but if so it is the
+ # caller's responsibility to set the timeout to -1.
+ if 0 <= cluster_status_lock_timeout < time.perf_counter(
+ ) - start_time:
+ logger.debug(
+ 'Refreshing status: Failed get the lock for cluster '
+ f'{cluster_name!r}. Using the cached status.')
+ return record
+ time.sleep(lock.poll_interval)
+
+ # Refresh for next loop iteration.
+ record = global_user_state.get_cluster_from_name(
+ cluster_name,
+ include_user_info=include_user_info,
+ summary_response=summary_response)
+ if record is None:
+ return None


  @timeline.event
+ @context_utils.cancellation_guard
  def refresh_cluster_status_handle(
  cluster_name: str,
  *,
  force_refresh_statuses: Optional[Set[status_lib.ClusterStatus]] = None,
- acquire_per_cluster_status_lock: bool = True,
- cluster_status_lock_timeout: int = CLUSTER_STATUS_LOCK_TIMEOUT_SECONDS
+ cluster_lock_already_held: bool = False,
+ cluster_status_lock_timeout: int = CLUSTER_STATUS_LOCK_TIMEOUT_SECONDS,
+ retry_if_missing: bool = True,
  ) -> Tuple[Optional[status_lib.ClusterStatus],
  Optional[backends.ResourceHandle]]:
  """Refresh the cluster, and return the possibly updated status and handle.
@@ -2221,8 +2779,11 @@ def refresh_cluster_status_handle(
  record = refresh_cluster_record(
  cluster_name,
  force_refresh_statuses=force_refresh_statuses,
- acquire_per_cluster_status_lock=acquire_per_cluster_status_lock,
- cluster_status_lock_timeout=cluster_status_lock_timeout)
+ cluster_lock_already_held=cluster_lock_already_held,
+ cluster_status_lock_timeout=cluster_status_lock_timeout,
+ include_user_info=False,
+ summary_response=True,
+ retry_if_missing=retry_if_missing)
  if record is None:
  return None, None
  return record['status'], record['handle']
@@ -2253,6 +2814,7 @@ def check_cluster_available(
  ...


+ @context_utils.cancellation_guard
  def check_cluster_available(
  cluster_name: str,
  *,
@@ -2272,7 +2834,9 @@ def check_cluster_available(
  exceptions.CloudUserIdentityError: if we fail to get the current user
  identity.
  """
- record = global_user_state.get_cluster_from_name(cluster_name)
+ record = global_user_state.get_cluster_from_name(cluster_name,
+ include_user_info=False,
+ summary_response=True)
  if dryrun:
  assert record is not None, cluster_name
  return record['handle']
@@ -2404,6 +2968,19 @@ def is_controller_accessible(
  exceptions.ClusterNotUpError: if the controller is not accessible, or
  failed to be connected.
  """
+ if (managed_job_utils.is_consolidation_mode() and
+ controller == controller_utils.Controllers.JOBS_CONTROLLER
+ ) or (serve_utils.is_consolidation_mode() and
+ controller == controller_utils.Controllers.SKY_SERVE_CONTROLLER):
+ cn = 'local-controller-consolidation'
+ return backends.LocalResourcesHandle(
+ cluster_name=cn,
+ cluster_name_on_cloud=cn,
+ cluster_yaml=None,
+ launched_nodes=1,
+ launched_resources=sky.Resources(cloud=clouds.Cloud(),
+ instance_type=cn),
+ )
  if non_existent_message is None:
  non_existent_message = controller.value.default_hint_if_non_existent
  cluster_name = controller.value.cluster_name
@@ -2446,7 +3023,8 @@ def is_controller_accessible(
  f'fatal, but {controller_name} commands/calls may hang or return '
  'stale information, when the controller is not up.\n'
  f' Details: {common_utils.format_exception(e, use_bracket=True)}')
- record = global_user_state.get_cluster_from_name(cluster_name)
+ record = global_user_state.get_cluster_from_name(
+ cluster_name, include_user_info=False, summary_response=True)
  if record is not None:
  controller_status, handle = record['status'], record['handle']
  # We check the connection even if the cluster has a cached status UP
@@ -2467,7 +3045,7 @@ def is_controller_accessible(
  need_connection_check):
  # Check ssh connection if (1) controller is in INIT state, or (2) we failed to fetch the
  # status, both of which can happen when controller's status lock is held by another `sky jobs launch` or
- # `sky serve up`. If we have controller's head_ip available and it is ssh-reachable,
+ # `sky serve up`. If we have controller's head_ip available and it is ssh-reachable,
  # we can allow access to the controller.
  ssh_credentials = ssh_credential_from_yaml(handle.cluster_yaml,
  handle.docker_user,
@@ -2503,21 +3081,100 @@ class CloudFilter(enum.Enum):
  LOCAL = 'local'


- def _get_glob_clusters(clusters: List[str], silent: bool = False) -> List[str]:
+ def _get_glob_clusters(
+ clusters: List[str],
+ silent: bool = False,
+ workspaces_filter: Optional[Dict[str, Any]] = None) -> List[str]:
  """Returns a list of clusters that match the glob pattern."""
  glob_clusters = []
  for cluster in clusters:
- glob_cluster = global_user_state.get_glob_cluster_names(cluster)
+ glob_cluster = global_user_state.get_glob_cluster_names(
+ cluster, workspaces_filter=workspaces_filter)
  if len(glob_cluster) == 0 and not silent:
  logger.info(f'Cluster {cluster} not found.')
  glob_clusters.extend(glob_cluster)
  return list(set(glob_clusters))


+ def _refresh_cluster(
+ cluster_name: str,
+ force_refresh_statuses: Optional[Set[status_lib.ClusterStatus]],
+ include_user_info: bool = True,
+ summary_response: bool = False) -> Optional[Dict[str, Any]]:
+ try:
+ record = refresh_cluster_record(
+ cluster_name,
+ force_refresh_statuses=force_refresh_statuses,
+ cluster_lock_already_held=False,
+ include_user_info=include_user_info,
+ summary_response=summary_response)
+ except (exceptions.ClusterStatusFetchingError,
+ exceptions.CloudUserIdentityError,
+ exceptions.ClusterOwnerIdentityMismatchError) as e:
+ # Do not fail the entire refresh process. The caller will
+ # handle the 'UNKNOWN' status, and collect the errors into
+ # a table.
+ record = {'status': 'UNKNOWN', 'error': e}
+ return record
+
+
+ def refresh_cluster_records() -> None:
+ """Refreshes the status of all clusters, except managed clusters.
+
+ Used by the background status refresh daemon.
+ This function is a stripped-down version of get_clusters, with only the
+ bare bones refresh logic.
+
+ Returns:
+ None
+
+ Raises:
+ None
+ """
+ exclude_managed_clusters = True
+ if env_options.Options.SHOW_DEBUG_INFO.get():
+ exclude_managed_clusters = False
+ cluster_names = set(
+ global_user_state.get_cluster_names(
+ exclude_managed_clusters=exclude_managed_clusters,))
+
+ # TODO(syang): we should try not to leak
+ # request info in backend_utils.py.
+ # Refactor this to use some other info to
+ # determine if a launch is in progress.
+ cluster_names_with_launch_request = {
+ request.cluster_name for request in requests_lib.get_request_tasks(
+ req_filter=requests_lib.RequestTaskFilter(
+ status=[requests_lib.RequestStatus.RUNNING],
+ include_request_names=['sky.launch'],
+ fields=['cluster_name']))
+ }
+ cluster_names_without_launch_request = (cluster_names -
+ cluster_names_with_launch_request)
+
+ def _refresh_cluster_record(cluster_name):
+ return _refresh_cluster(cluster_name,
+ force_refresh_statuses=set(
+ status_lib.ClusterStatus),
+ include_user_info=False,
+ summary_response=True)
+
+ if len(cluster_names_without_launch_request) > 0:
+ # Do not refresh the clusters that have an active launch request.
+ subprocess_utils.run_in_parallel(_refresh_cluster_record,
+ cluster_names_without_launch_request)
+
+
  def get_clusters(
  refresh: common.StatusRefreshMode,
  cluster_names: Optional[Union[str, List[str]]] = None,
  all_users: bool = True,
+ include_credentials: bool = False,
+ summary_response: bool = False,
+ include_handle: bool = True,
+ # Internal only:
+ # pylint: disable=invalid-name
+ _include_is_managed: bool = False,
  ) -> List[Dict[str, Any]]:
  """Returns a list of cached or optionally refreshed cluster records.

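Editor's note: the new `refresh_cluster_records` above skips clusters that have an in-flight `sky.launch` request and refreshes the rest in parallel via `subprocess_utils.run_in_parallel`. A minimal stand-in using only the standard library, where the refresh callable and the two name sets are placeholders supplied by the caller:

```python
from concurrent.futures import ThreadPoolExecutor
from typing import Callable, Set


def refresh_all(cluster_names: Set[str],
                names_being_launched: Set[str],
                refresh_one: Callable[[str], None],
                max_workers: int = 8) -> None:
    """Refresh every cluster that does not have an active launch request."""
    to_refresh = sorted(cluster_names - names_being_launched)
    if not to_refresh:
        return
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        # Consume the iterator so worker exceptions surface here.
        list(pool.map(refresh_one, to_refresh))
```

Excluding in-flight launches avoids contending for the per-cluster status lock with the launch itself, which is the same reason the real helper filters on running `sky.launch` requests.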
@@ -2527,114 +3184,159 @@ def get_clusters(
  of the clusters.

  Args:
- include_controller: Whether to include controllers, e.g. jobs controller
- or sky serve controller.
  refresh: Whether to refresh the status of the clusters. (Refreshing will
  set the status to STOPPED if the cluster cannot be pinged.)
- cloud_filter: Sets which clouds to filer through from the global user
- state. Supports three values, 'all' for all clouds, 'public' for
- public clouds only, and 'local' for only local clouds.
  cluster_names: If provided, only return records for the given cluster
  names.
+ all_users: If True, return clusters from all users. If False, only
+ return clusters from the current user.
+ include_credentials: If True, include cluster ssh credentials in the
+ return value.
+ _include_is_managed: Whether to force include clusters created by the
+ controller.

  Returns:
  A list of cluster records. If the cluster does not exist or has been
  terminated, the record will be omitted from the returned list.
  """
- records = global_user_state.get_clusters()
+ accessible_workspaces = workspaces_core.get_workspaces()
+ if cluster_names is not None:
+ if isinstance(cluster_names, str):
+ cluster_names = [cluster_names]
+ non_glob_cluster_names = []
+ glob_cluster_names = []
+ for cluster_name in cluster_names:
+ if ux_utils.is_glob_pattern(cluster_name):
+ glob_cluster_names.append(cluster_name)
+ else:
+ non_glob_cluster_names.append(cluster_name)
+ cluster_names = non_glob_cluster_names
+ if glob_cluster_names:
+ cluster_names += _get_glob_clusters(
+ glob_cluster_names,
+ silent=True,
+ workspaces_filter=accessible_workspaces)
+
+ exclude_managed_clusters = False
+ if not (_include_is_managed or env_options.Options.SHOW_DEBUG_INFO.get()):
+ exclude_managed_clusters = True
+ user_hashes_filter = None
  if not all_users:
- current_user_hash = common_utils.get_user_hash()
- records = [
- record for record in records
- if record['user_hash'] == current_user_hash
- ]
+ user_hashes_filter = {common_utils.get_current_user().id}
+ records = global_user_state.get_clusters(
+ exclude_managed_clusters=exclude_managed_clusters,
+ user_hashes_filter=user_hashes_filter,
+ workspaces_filter=accessible_workspaces,
+ cluster_names=cluster_names,
+ summary_response=summary_response)

  yellow = colorama.Fore.YELLOW
  bright = colorama.Style.BRIGHT
  reset = colorama.Style.RESET_ALL

- def _update_record_with_credentials_and_resources_str(
- record: Optional[Dict[str, Any]]) -> None:
+ if cluster_names is not None:
+ record_names = {record['name'] for record in records}
+ not_found_clusters = ux_utils.get_non_matched_query(
+ cluster_names, record_names)
+ if not_found_clusters:
+ clusters_str = ', '.join(not_found_clusters)
+ logger.info(f'Cluster(s) not found: {bright}{clusters_str}{reset}.')
+
+ def _get_records_with_handle(
+ records: List[Optional[Dict[str, Any]]]) -> List[Dict[str, Any]]:
+ """Filter for records that have a handle"""
+ return [
+ record for record in records
+ if record is not None and record['handle'] is not None
+ ]
+
+ def _update_records_with_handle_info(
+ records: List[Optional[Dict[str, Any]]]) -> None:
+ """Add resource str to record"""
+ for record in _get_records_with_handle(records):
+ handle = record['handle']
+ resource_str_simple, resource_str_full = (
+ resources_utils.get_readable_resources_repr(
+ handle, simplified_only=False))
+ record['resources_str'] = resource_str_simple
+ record['resources_str_full'] = resource_str_full
+ if not summary_response:
+ record['cluster_name_on_cloud'] = handle.cluster_name_on_cloud
+
+ def _update_records_with_credentials(
+ records: List[Optional[Dict[str, Any]]]) -> None:
  """Add the credentials to the record.

  This is useful for the client side to setup the ssh config of the
  cluster.
  """
- if record is None:
- return
- handle = record['handle']
- if handle is None:
+ records_with_handle = _get_records_with_handle(records)
+ if len(records_with_handle) == 0:
  return
- record['resources_str'] = resources_utils.get_readable_resources_repr(
- handle)
- credentials = ssh_credential_from_yaml(handle.cluster_yaml,
- handle.docker_user,
- handle.ssh_user)
-
- if not credentials:
- return
- ssh_private_key_path = credentials.get('ssh_private_key', None)
- if ssh_private_key_path is not None:
- with open(os.path.expanduser(ssh_private_key_path),
- 'r',
- encoding='utf-8') as f:
- credentials['ssh_private_key_content'] = f.read()
- else:
- private_key_path, _ = auth.get_or_generate_keys()
- with open(os.path.expanduser(private_key_path),
- 'r',
- encoding='utf-8') as f:
- credentials['ssh_private_key_content'] = f.read()
- record['credentials'] = credentials

- if cluster_names is not None:
- if isinstance(cluster_names, str):
- cluster_names = [cluster_names]
- cluster_names = _get_glob_clusters(cluster_names, silent=True)
- new_records = []
- not_exist_cluster_names = []
- for cluster_name in cluster_names:
- for record in records:
- if record['name'] == cluster_name:
- new_records.append(record)
- break
+ handles = [record['handle'] for record in records_with_handle]
+ credentials = ssh_credentials_from_handles(handles)
+ cached_private_keys: Dict[str, str] = {}
+ for record, credential in zip(records_with_handle, credentials):
+ if not credential:
+ continue
+ ssh_private_key_path = credential.get('ssh_private_key', None)
+ if ssh_private_key_path is not None:
+ expanded_private_key_path = os.path.expanduser(
+ ssh_private_key_path)
+ if not os.path.exists(expanded_private_key_path):
+ success = auth_utils.create_ssh_key_files_from_db(
+ ssh_private_key_path)
+ if not success:
+ # If the ssh key files are not found, we do not
+ # update the record with credentials.
+ logger.debug(
+ f'SSH keys not found for cluster {record["name"]} '
+ f'at key path {ssh_private_key_path}')
+ continue
  else:
- not_exist_cluster_names.append(cluster_name)
- if not_exist_cluster_names:
- clusters_str = ', '.join(not_exist_cluster_names)
- logger.info(f'Cluster(s) not found: {bright}{clusters_str}{reset}.')
- records = new_records
-
- def _update_record_with_resources(record: Optional[Dict[str, Any]]) -> None:
+ private_key_path, _ = auth_utils.get_or_generate_keys()
+ expanded_private_key_path = os.path.expanduser(private_key_path)
+ if expanded_private_key_path in cached_private_keys:
+ credential['ssh_private_key_content'] = cached_private_keys[
+ expanded_private_key_path]
+ else:
+ with open(expanded_private_key_path, 'r',
+ encoding='utf-8') as f:
+ credential['ssh_private_key_content'] = f.read()
+ cached_private_keys[expanded_private_key_path] = credential[
+ 'ssh_private_key_content']
+ record['credentials'] = credential
+
+ def _update_records_with_resources(
+ records: List[Optional[Dict[str, Any]]],) -> None:
  """Add the resources to the record."""
- if record is None:
- return
- handle = record['handle']
- if handle is None:
- return
- record['nodes'] = handle.launched_nodes
- if handle.launched_resources is None:
- return
- record['cloud'] = (f'{handle.launched_resources.cloud}'
- if handle.launched_resources.cloud else None)
- record['region'] = (f'{handle.launched_resources.region}'
- if handle.launched_resources.region else None)
- record['cpus'] = (f'{handle.launched_resources.cpus}'
- if handle.launched_resources.cpus else None)
- record['memory'] = (f'{handle.launched_resources.memory}'
- if handle.launched_resources.memory else None)
- record['accelerators'] = (f'{handle.launched_resources.accelerators}'
- if handle.launched_resources.accelerators else
- None)
-
- # Add auth_config to the records
- for record in records:
- _update_record_with_credentials_and_resources_str(record)
-
+ for record in _get_records_with_handle(records):
3315
+ handle = record['handle']
3316
+ record['nodes'] = handle.launched_nodes
3317
+ if handle.launched_resources is None:
3318
+ continue
3319
+ record['cloud'] = (f'{handle.launched_resources.cloud}'
3320
+ if handle.launched_resources.cloud else None)
3321
+ record['region'] = (f'{handle.launched_resources.region}'
3322
+ if handle.launched_resources.region else None)
3323
+ record['cpus'] = (f'{handle.launched_resources.cpus}'
3324
+ if handle.launched_resources.cpus else None)
3325
+ record['memory'] = (f'{handle.launched_resources.memory}'
3326
+ if handle.launched_resources.memory else None)
3327
+ record['accelerators'] = (
3328
+ f'{handle.launched_resources.accelerators}'
3329
+ if handle.launched_resources.accelerators else None)
3330
+ if not include_handle:
3331
+ record.pop('handle', None)
3332
+
3333
+ # Add handle info to the records
3334
+ _update_records_with_handle_info(records)
3335
+ if include_credentials:
3336
+ _update_records_with_credentials(records)
2634
3337
  if refresh == common.StatusRefreshMode.NONE:
2635
3338
  # Add resources to the records
2636
- for record in records:
2637
- _update_record_with_resources(record)
3339
+ _update_records_with_resources(records)
2638
3340
  return records
2639
3341
 
2640
3342
  plural = 's' if len(records) > 1 else ''
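
The batched credential path above reads each private key file at most once per call by caching its contents under the expanded path. A minimal, standalone sketch of that caching pattern (the function and variable names here are illustrative, not from the diff):

import os
from typing import Dict, List


def read_keys_once(key_paths: List[str]) -> List[str]:
    # Cache file contents by expanded path so a key shared by many
    # clusters is read from disk only once.
    cache: Dict[str, str] = {}
    contents: List[str] = []
    for path in key_paths:
        expanded = os.path.expanduser(path)
        if expanded not in cache:
            with open(expanded, 'r', encoding='utf-8') as f:
                cache[expanded] = f.read()
        contents.append(cache[expanded])
    return contents
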
@@ -2650,47 +3352,76 @@ def get_clusters(
  else:
  force_refresh_statuses = None

- def _refresh_cluster(cluster_name):
- try:
- record = refresh_cluster_record(
- cluster_name,
- force_refresh_statuses=force_refresh_statuses,
- acquire_per_cluster_status_lock=True)
- _update_record_with_credentials_and_resources_str(record)
- except (exceptions.ClusterStatusFetchingError,
- exceptions.CloudUserIdentityError,
- exceptions.ClusterOwnerIdentityMismatchError) as e:
- # Do not fail the entire refresh process. The caller will
- # handle the 'UNKNOWN' status, and collect the errors into
- # a table.
- record = {'status': 'UNKNOWN', 'error': e}
- progress.update(task, advance=1)
+ def _refresh_cluster_record(cluster_name):
+ record = _refresh_cluster(cluster_name,
+ force_refresh_statuses=force_refresh_statuses,
+ include_user_info=True,
+ summary_response=summary_response)
+ # record may be None if the cluster is deleted during refresh,
+ # e.g. all the Pods of a cluster on Kubernetes have been
+ # deleted before refresh.
+ if record is not None and 'error' not in record:
+ _update_records_with_handle_info([record])
+ if include_credentials:
+ _update_records_with_credentials([record])
+ progress.update(task, advance=1)
  return record

  cluster_names = [record['name'] for record in records]
+ # TODO(syang): we should try not to leak
+ # request info in backend_utils.py.
+ # Refactor this to use some other info to
+ # determine if a launch is in progress.
+ cluster_names_with_launch_request = {
+ request.cluster_name for request in requests_lib.get_request_tasks(
+ req_filter=requests_lib.RequestTaskFilter(
+ status=[requests_lib.RequestStatus.RUNNING],
+ include_request_names=['sky.launch'],
+ cluster_names=cluster_names,
+ fields=['cluster_name']))
+ }
+ # Preserve the index of the cluster name as it appears on "records"
+ cluster_names_without_launch_request = [
+ (i, cluster_name)
+ for i, cluster_name in enumerate(cluster_names)
+ if cluster_name not in cluster_names_with_launch_request
+ ]
+ # for clusters that have an active launch request, we do not refresh the status
  updated_records = []
- if len(cluster_names) > 0:
+ if len(cluster_names_without_launch_request) > 0:
  with progress:
  updated_records = subprocess_utils.run_in_parallel(
- _refresh_cluster, cluster_names)
-
+ _refresh_cluster_record, [
+ cluster_name
+ for _, cluster_name in cluster_names_without_launch_request
+ ])
+ # Preserve the index of the cluster name as it appears on "records"
+ # before filtering for clusters being launched.
+ updated_records_dict: Dict[int, Optional[Dict[str, Any]]] = {
+ cluster_names_without_launch_request[i][0]: updated_records[i]
+ for i in range(len(cluster_names_without_launch_request))
+ }
  # Show information for removed clusters.
  kept_records = []
  autodown_clusters, remaining_clusters, failed_clusters = [], [], []
  for i, record in enumerate(records):
- if updated_records[i] is None:
+ if i not in updated_records_dict:
+ # record was not refreshed, keep the original record
+ kept_records.append(record)
+ continue
+ updated_record = updated_records_dict[i]
+ if updated_record is None:
  if record['to_down']:
- autodown_clusters.append(cluster_names[i])
+ autodown_clusters.append(record['name'])
  else:
- remaining_clusters.append(cluster_names[i])
- elif updated_records[i]['status'] == 'UNKNOWN':
- failed_clusters.append(
- (cluster_names[i], updated_records[i]['error']))
+ remaining_clusters.append(record['name'])
+ elif updated_record['status'] == 'UNKNOWN':
+ failed_clusters.append((record['name'], updated_record['error']))
  # Keep the original record if the status is unknown,
  # so that the user can still see the cluster.
  kept_records.append(record)
  else:
- kept_records.append(updated_records[i])
+ kept_records.append(updated_record)

  if autodown_clusters:
  plural = 's' if len(autodown_clusters) > 1 else ''
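
The refresh loop above skips clusters that have a running `sky.launch` request and merges the refreshed subset back by each record's original index. A simplified sketch of that index-preserving merge (names are illustrative; the real code also drops or reports records that come back as None or UNKNOWN):

from typing import Any, Callable, Dict, List, Optional, Set

Record = Dict[str, Any]


def merge_refreshed(records: List[Record], skip_names: Set[str],
                    refresh_fn: Callable[[str], Optional[Record]]) -> List[Record]:
    # Refresh only the records not being launched, remembering positions.
    to_refresh = [(i, r['name']) for i, r in enumerate(records)
                  if r['name'] not in skip_names]
    refreshed = {i: refresh_fn(name) for i, name in to_refresh}
    merged: List[Record] = []
    for i, original in enumerate(records):
        updated = refreshed.get(i, original)
        # None means the cluster disappeared during refresh; keep the
        # original here for simplicity.
        merged.append(original if updated is None else updated)
    return merged
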
@@ -2711,8 +3442,7 @@ def get_clusters(
  logger.warning(f' {bright}{cluster_name}{reset}: {e}')

  # Add resources to the records
- for record in kept_records:
- _update_record_with_resources(record)
+ _update_records_with_resources(kept_records)
  return kept_records


@@ -2799,6 +3529,7 @@ def get_task_resources_str(task: 'task_lib.Task',
  if is_managed_job:
  if task.best_resources.use_spot:
  spot_str = '[Spot]'
+ assert task.best_resources.cpus is not None
  task_cpu_demand = task.best_resources.cpus
  if accelerator_dict is None:
  resources_str = f'CPU:{task_cpu_demand}'
@@ -2943,7 +3674,8 @@ def get_endpoints(cluster: str,
  with ux_utils.print_exception_no_traceback():
  raise ValueError(f'Invalid endpoint {port!r}.') from None
  cluster_records = get_clusters(refresh=common.StatusRefreshMode.NONE,
- cluster_names=[cluster])
+ cluster_names=[cluster],
+ _include_is_managed=True)
  if not cluster_records:
  with ux_utils.print_exception_no_traceback():
  raise exceptions.ClusterNotUpError(
@@ -2965,7 +3697,7 @@ def get_endpoints(cluster: str,
  f'for cluster {cluster!r} with backend '
  f'{get_backend_from_handle(handle).NAME}.')

- launched_resources = handle.launched_resources
+ launched_resources = handle.launched_resources.assert_launchable()
  cloud = launched_resources.cloud
  try:
  cloud.check_features_are_supported(
@@ -2975,18 +3707,18 @@ def get_endpoints(cluster: str,
  raise ValueError('Querying endpoints is not supported '
  f'for {cluster!r} on {cloud}.') from None

- config = common_utils.read_yaml(handle.cluster_yaml)
+ config = global_user_state.get_cluster_yaml_dict(handle.cluster_yaml)
  port_details = provision_lib.query_ports(repr(cloud),
  handle.cluster_name_on_cloud,
  handle.launched_resources.ports,
  head_ip=handle.head_ip,
  provider_config=config['provider'])

+ launched_resources = handle.launched_resources.assert_launchable()
  # Validation before returning the endpoints
  if port is not None:
  # If the requested endpoint was not to be exposed
- port_set = resources_utils.port_ranges_to_set(
- handle.launched_resources.ports)
+ port_set = resources_utils.port_ranges_to_set(launched_resources.ports)
  if port not in port_set:
  logger.warning(f'Port {port} is not exposed on '
  f'cluster {cluster!r}.')
@@ -2995,17 +3727,17 @@ def get_endpoints(cluster: str,
  if port not in port_details:
  error_msg = (f'Port {port} not exposed yet. '
  f'{_ENDPOINTS_RETRY_MESSAGE} ')
- if handle.launched_resources.cloud.is_same_cloud(
- clouds.Kubernetes()):
+ if launched_resources.cloud.is_same_cloud(clouds.Kubernetes()):
  # Add Kubernetes specific debugging info
- error_msg += (kubernetes_utils.get_endpoint_debug_message())
+ error_msg += kubernetes_utils.get_endpoint_debug_message(
+ launched_resources.region)
  logger.warning(error_msg)
  return {}
  return {port: port_details[port][0].url()}
  else:
  if not port_details:
  # If cluster had no ports to be exposed
- if handle.launched_resources.ports is None:
+ if launched_resources.ports is None:
  logger.warning(f'Cluster {cluster!r} does not have any '
  'ports to be exposed.')
  return {}
@@ -3014,13 +3746,200 @@ def get_endpoints(cluster: str,
  else:
  error_msg = (f'No endpoints exposed yet. '
  f'{_ENDPOINTS_RETRY_MESSAGE} ')
- if handle.launched_resources.cloud.is_same_cloud(
- clouds.Kubernetes()):
+ if launched_resources.cloud.is_same_cloud(clouds.Kubernetes()):
  # Add Kubernetes specific debugging info
- error_msg += \
- kubernetes_utils.get_endpoint_debug_message()
+ error_msg += kubernetes_utils.get_endpoint_debug_message(
+ launched_resources.region)
  logger.warning(error_msg)
  return {}
  return {
  port_num: urls[0].url() for port_num, urls in port_details.items()
  }
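
Before querying the provider, get_endpoints checks the requested port against the launched resources' port ranges via resources_utils.port_ranges_to_set. A rough sketch of that expansion, assuming ports are given as strings such as '22' or '8080-8090' (an assumption; the real helper may differ):

from typing import List, Optional, Set


def ports_to_set(port_ranges: Optional[List[str]]) -> Set[int]:
    # Expand '22' / '8080-8090' style entries into a flat set of ints.
    ports: Set[int] = set()
    for entry in port_ranges or []:
        if '-' in entry:
            start, end = entry.split('-')
            ports.update(range(int(start), int(end) + 1))
        else:
            ports.add(int(entry))
    return ports


# Example: 8085 in ports_to_set(['22', '8080-8090']) evaluates to True.
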
+
+
+ def cluster_status_lock_id(cluster_name: str) -> str:
+ """Get the lock ID for cluster status operations."""
+ return f'{cluster_name}_status'
+
+
+ def cluster_file_mounts_lock_id(cluster_name: str) -> str:
+ """Get the lock ID for cluster file mounts operations."""
+ return f'{cluster_name}_file_mounts'
+
+
+ def workspace_lock_id(workspace_name: str) -> str:
+ """Get the lock ID for workspace operations."""
+ return f'{workspace_name}_workspace'
+
+
+ def cluster_tunnel_lock_id(cluster_name: str) -> str:
+ """Get the lock ID for cluster tunnel operations."""
+ return f'{cluster_name}_ssh_tunnel'
+
+
+ def open_ssh_tunnel(head_runner: Union[command_runner.SSHCommandRunner,
+ command_runner.KubernetesCommandRunner],
+ port_forward: Tuple[int, int]) -> subprocess.Popen:
+ local_port, remote_port = port_forward
+ if isinstance(head_runner, command_runner.SSHCommandRunner):
+ # Disabling ControlMaster makes things easier to reason about
+ # with respect to resource management/ownership,
+ # as killing the process will close the tunnel too.
+ head_runner.disable_control_master = True
+ head_runner.port_forward_execute_remote_command = True
+
+ # The default connect_timeout of 1s is too short for
+ # connecting to clusters using a jump server.
+ # We use NON_INTERACTIVE mode to avoid allocating a pseudo-tty,
+ # which is counted towards non-idleness.
+ cmd: List[str] = head_runner.port_forward_command(
+ [(local_port, remote_port)],
+ connect_timeout=5,
+ ssh_mode=command_runner.SshMode.NON_INTERACTIVE)
+ if isinstance(head_runner, command_runner.SSHCommandRunner):
+ # cat so the command doesn't exit until we kill it
+ cmd += [f'"echo {_ACK_MESSAGE} && cat"']
+ cmd_str = ' '.join(cmd)
+ logger.debug(f'Running port forward command: {cmd_str}')
+ ssh_tunnel_proc = subprocess.Popen(cmd_str,
+ shell=True,
+ stdin=subprocess.PIPE,
+ stdout=subprocess.PIPE,
+ stderr=subprocess.PIPE,
+ start_new_session=True,
+ text=True)
+ # Wait until we receive an ack from the remote cluster or
+ # the SSH connection times out.
+ queue: queue_lib.Queue = queue_lib.Queue()
+ stdout_thread = threading.Thread(
+ target=lambda queue, stdout: queue.put(stdout.readline()),
+ args=(queue, ssh_tunnel_proc.stdout),
+ daemon=True)
+ stdout_thread.start()
+ while ssh_tunnel_proc.poll() is None:
+ try:
+ ack = queue.get_nowait()
+ except queue_lib.Empty:
+ ack = None
+ time.sleep(0.1)
+ continue
+ assert ack is not None
+ if isinstance(
+ head_runner,
+ command_runner.SSHCommandRunner) and ack == f'{_ACK_MESSAGE}\n':
+ break
+ elif isinstance(head_runner, command_runner.KubernetesCommandRunner
+ ) and _FORWARDING_FROM_MESSAGE in ack:
+ # On kind clusters, this error occurs if we make a request
+ # immediately after the port-forward is established on a new pod:
+ # "Unhandled Error" err="an error occurred forwarding ... -> 46590:
+ # failed to execute portforward in network namespace
+ # "/var/run/netns/cni-...": failed to connect to localhost:46590
+ # inside namespace "...", IPv4: dial tcp4 127.0.0.1:46590:
+ # connect: connection refused
+ # So we need to poll the port on the pod to check if it is open.
+ # We did not observe this with real Kubernetes clusters.
+ timeout = 5
+ port_check_cmd = (
+ # We install netcat in our ray-node container,
+ # so we can use it here.
+ # (See kubernetes-ray.yml.j2)
+ f'end=$((SECONDS+{timeout})); '
+ f'while ! nc -z -w 1 localhost {remote_port}; do '
+ 'if (( SECONDS >= end )); then exit 1; fi; '
+ 'sleep 0.1; '
+ 'done')
+ returncode, stdout, stderr = head_runner.run(port_check_cmd,
+ require_outputs=True,
+ stream_logs=False)
+ if returncode != 0:
+ try:
+ ssh_tunnel_proc.terminate()
+ ssh_tunnel_proc.wait(timeout=5)
+ except subprocess.TimeoutExpired:
+ ssh_tunnel_proc.kill()
+ ssh_tunnel_proc.wait()
+ finally:
+ error_msg = (f'Failed to check remote port {remote_port}')
+ if stdout:
+ error_msg += f'\n-- stdout --\n{stdout}\n'
+ raise exceptions.CommandError(returncode=returncode,
+ command=cmd_str,
+ error_msg=error_msg,
+ detailed_reason=stderr)
+ break
+
+ if ssh_tunnel_proc.poll() is not None:
+ stdout, stderr = ssh_tunnel_proc.communicate()
+ error_msg = 'Port forward failed'
+ if stdout:
+ error_msg += f'\n-- stdout --\n{stdout}\n'
+ raise exceptions.CommandError(returncode=ssh_tunnel_proc.returncode,
+ command=cmd_str,
+ error_msg=error_msg,
+ detailed_reason=stderr)
+ return ssh_tunnel_proc
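
open_ssh_tunnel above only returns once the child process signals readiness: an echoed ack line for SSH, or kubectl's "Forwarding from" banner for Kubernetes. A small sketch of that non-blocking readiness wait, with an explicit deadline the original does not have (illustrative only; assumes the process was started with stdout=subprocess.PIPE and text=True):

import queue
import subprocess
import threading
import time


def wait_for_line(proc: subprocess.Popen, expected: str,
                  deadline_s: float = 10.0) -> bool:
    """Return True once the child's first stdout line equals `expected`."""
    q: 'queue.Queue[str]' = queue.Queue()
    # Read on a daemon thread so the polling loop below never blocks.
    threading.Thread(target=lambda: q.put(proc.stdout.readline()),
                     daemon=True).start()
    deadline = time.time() + deadline_s
    while proc.poll() is None and time.time() < deadline:
        try:
            return q.get_nowait().strip() == expected
        except queue.Empty:
            time.sleep(0.1)
    return False
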
+
+
+ T = TypeVar('T')
+
+
+ def invoke_skylet_with_retries(func: Callable[..., T]) -> T:
+ """Generic helper for making Skylet gRPC requests.
+
+ This method handles the common pattern of:
+ 1. Try the gRPC request
+ 2. If SSH tunnel is closed, recreate it and retry
+ """
+ max_attempts = 5
+ backoff = common_utils.Backoff(initial_backoff=0.5)
+ last_exception: Optional[Exception] = None
+
+ for _ in range(max_attempts):
+ try:
+ return func()
+ except grpc.RpcError as e:
+ last_exception = e
+ _handle_grpc_error(e, backoff.current_backoff())
+
+ raise RuntimeError(
+ f'Failed to invoke Skylet after {max_attempts} attempts: {last_exception}'
+ ) from last_exception
+
+
+ def invoke_skylet_streaming_with_retries(
+ stream_func: Callable[..., Iterator[T]]) -> Iterator[T]:
+ """Generic helper for making Skylet streaming gRPC requests."""
+ max_attempts = 3
+ backoff = common_utils.Backoff(initial_backoff=0.5)
+ last_exception: Optional[Exception] = None
+
+ for _ in range(max_attempts):
+ try:
+ for response in stream_func():
+ yield response
+ return
+ except grpc.RpcError as e:
+ last_exception = e
+ _handle_grpc_error(e, backoff.current_backoff())
+
+ raise RuntimeError(
+ f'Failed to stream Skylet response after {max_attempts} attempts'
+ ) from last_exception
+
+
+ def _handle_grpc_error(e: 'grpc.RpcError', current_backoff: float) -> None:
+ if e.code() == grpc.StatusCode.INTERNAL:
+ with ux_utils.print_exception_no_traceback():
+ raise exceptions.SkyletInternalError(e.details())
+ elif e.code() == grpc.StatusCode.UNAVAILABLE:
+ time.sleep(current_backoff)
+ elif e.code() == grpc.StatusCode.UNIMPLEMENTED or e.code(
+ ) == grpc.StatusCode.UNKNOWN:
+ # Handle backwards compatibility: old server doesn't implement this RPC.
+ # Let the caller fall back to legacy execution.
+ raise exceptions.SkyletMethodNotImplementedError(
+ f'gRPC method not implemented on server, falling back to legacy execution: {e.details()}'
+ )
+ else:
+ raise e
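
The retry helpers treat UNAVAILABLE as transient (back off and retry), map INTERNAL to SkyletInternalError, and surface UNIMPLEMENTED/UNKNOWN as SkyletMethodNotImplementedError so callers can fall back to the legacy SSH path. A hedged usage sketch; the stub method and fallback below are placeholders, not SkyPilot APIs:

def query_job_status(stub, job_id: int):
    # `stub.GetJobStatus` and `run_legacy_query` are hypothetical names;
    # only the retry/fallback shape mirrors the helpers above.
    try:
        return invoke_skylet_with_retries(
            lambda: stub.GetJobStatus(job_id=job_id))
    except exceptions.SkyletMethodNotImplementedError:
        # Older skylet without this RPC: fall back to legacy execution.
        return run_legacy_query(job_id)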