skypilot-nightly 1.0.0.dev20250905__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (397)
  1. sky/__init__.py +10 -2
  2. sky/adaptors/aws.py +81 -16
  3. sky/adaptors/common.py +25 -2
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/gcp.py +11 -0
  7. sky/adaptors/ibm.py +5 -2
  8. sky/adaptors/kubernetes.py +64 -0
  9. sky/adaptors/nebius.py +3 -1
  10. sky/adaptors/primeintellect.py +1 -0
  11. sky/adaptors/seeweb.py +183 -0
  12. sky/adaptors/shadeform.py +89 -0
  13. sky/admin_policy.py +20 -0
  14. sky/authentication.py +157 -263
  15. sky/backends/__init__.py +3 -2
  16. sky/backends/backend.py +11 -3
  17. sky/backends/backend_utils.py +588 -184
  18. sky/backends/cloud_vm_ray_backend.py +1088 -904
  19. sky/backends/local_docker_backend.py +9 -5
  20. sky/backends/task_codegen.py +633 -0
  21. sky/backends/wheel_utils.py +18 -0
  22. sky/catalog/__init__.py +8 -0
  23. sky/catalog/aws_catalog.py +4 -0
  24. sky/catalog/common.py +19 -1
  25. sky/catalog/data_fetchers/fetch_aws.py +102 -80
  26. sky/catalog/data_fetchers/fetch_gcp.py +30 -3
  27. sky/catalog/data_fetchers/fetch_nebius.py +9 -6
  28. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  29. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  30. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  31. sky/catalog/kubernetes_catalog.py +24 -28
  32. sky/catalog/primeintellect_catalog.py +95 -0
  33. sky/catalog/runpod_catalog.py +5 -1
  34. sky/catalog/seeweb_catalog.py +184 -0
  35. sky/catalog/shadeform_catalog.py +165 -0
  36. sky/check.py +73 -43
  37. sky/client/cli/command.py +675 -412
  38. sky/client/cli/flags.py +4 -2
  39. sky/{volumes/utils.py → client/cli/table_utils.py} +111 -13
  40. sky/client/cli/utils.py +79 -0
  41. sky/client/common.py +12 -2
  42. sky/client/sdk.py +132 -63
  43. sky/client/sdk_async.py +34 -33
  44. sky/cloud_stores.py +82 -3
  45. sky/clouds/__init__.py +6 -0
  46. sky/clouds/aws.py +337 -129
  47. sky/clouds/azure.py +24 -18
  48. sky/clouds/cloud.py +40 -13
  49. sky/clouds/cudo.py +16 -13
  50. sky/clouds/do.py +9 -7
  51. sky/clouds/fluidstack.py +12 -5
  52. sky/clouds/gcp.py +14 -7
  53. sky/clouds/hyperbolic.py +12 -5
  54. sky/clouds/ibm.py +12 -5
  55. sky/clouds/kubernetes.py +80 -45
  56. sky/clouds/lambda_cloud.py +12 -5
  57. sky/clouds/nebius.py +23 -9
  58. sky/clouds/oci.py +19 -12
  59. sky/clouds/paperspace.py +4 -1
  60. sky/clouds/primeintellect.py +317 -0
  61. sky/clouds/runpod.py +85 -24
  62. sky/clouds/scp.py +12 -8
  63. sky/clouds/seeweb.py +477 -0
  64. sky/clouds/shadeform.py +400 -0
  65. sky/clouds/ssh.py +4 -2
  66. sky/clouds/utils/scp_utils.py +61 -50
  67. sky/clouds/vast.py +33 -27
  68. sky/clouds/vsphere.py +14 -16
  69. sky/core.py +174 -165
  70. sky/dashboard/out/404.html +1 -1
  71. sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
  72. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  73. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
  74. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  75. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  76. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
  77. sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
  78. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
  79. sky/dashboard/out/_next/static/chunks/{6601-06114c982db410b6.js → 3800-7b45f9fbb6308557.js} +1 -1
  80. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
  81. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  82. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  83. sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
  84. sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
  85. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  86. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
  87. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  88. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
  89. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  90. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  91. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
  92. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  93. sky/dashboard/out/_next/static/chunks/pages/{_app-ce361c6959bc2001.js → _app-bde01e4a2beec258.js} +1 -1
  94. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
  95. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
  96. sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
  97. sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-6563820e094f68ca.js → [context]-c0b5935149902e6f.js} +1 -1
  98. sky/dashboard/out/_next/static/chunks/pages/{infra-aabba60d57826e0f.js → infra-aed0ea19df7cf961.js} +1 -1
  99. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
  100. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
  101. sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
  102. sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
  103. sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
  104. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-af76bb06dbb3954f.js → [name]-84a40f8c7c627fe4.js} +1 -1
  105. sky/dashboard/out/_next/static/chunks/pages/{workspaces-7598c33a746cdc91.js → workspaces-531b2f8c4bf89f82.js} +1 -1
  106. sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
  107. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  108. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  109. sky/dashboard/out/clusters/[cluster].html +1 -1
  110. sky/dashboard/out/clusters.html +1 -1
  111. sky/dashboard/out/config.html +1 -1
  112. sky/dashboard/out/index.html +1 -1
  113. sky/dashboard/out/infra/[context].html +1 -1
  114. sky/dashboard/out/infra.html +1 -1
  115. sky/dashboard/out/jobs/[job].html +1 -1
  116. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  117. sky/dashboard/out/jobs.html +1 -1
  118. sky/dashboard/out/users.html +1 -1
  119. sky/dashboard/out/volumes.html +1 -1
  120. sky/dashboard/out/workspace/new.html +1 -1
  121. sky/dashboard/out/workspaces/[name].html +1 -1
  122. sky/dashboard/out/workspaces.html +1 -1
  123. sky/data/data_utils.py +92 -1
  124. sky/data/mounting_utils.py +162 -29
  125. sky/data/storage.py +200 -19
  126. sky/data/storage_utils.py +10 -45
  127. sky/exceptions.py +18 -7
  128. sky/execution.py +74 -31
  129. sky/global_user_state.py +605 -191
  130. sky/jobs/__init__.py +2 -0
  131. sky/jobs/client/sdk.py +101 -4
  132. sky/jobs/client/sdk_async.py +31 -5
  133. sky/jobs/constants.py +15 -8
  134. sky/jobs/controller.py +726 -284
  135. sky/jobs/file_content_utils.py +128 -0
  136. sky/jobs/log_gc.py +193 -0
  137. sky/jobs/recovery_strategy.py +250 -100
  138. sky/jobs/scheduler.py +271 -173
  139. sky/jobs/server/core.py +367 -114
  140. sky/jobs/server/server.py +81 -35
  141. sky/jobs/server/utils.py +89 -35
  142. sky/jobs/state.py +1498 -620
  143. sky/jobs/utils.py +771 -306
  144. sky/logs/agent.py +40 -5
  145. sky/logs/aws.py +9 -19
  146. sky/metrics/utils.py +282 -39
  147. sky/optimizer.py +1 -1
  148. sky/provision/__init__.py +37 -1
  149. sky/provision/aws/config.py +34 -13
  150. sky/provision/aws/instance.py +5 -2
  151. sky/provision/azure/instance.py +5 -3
  152. sky/provision/common.py +2 -0
  153. sky/provision/cudo/instance.py +4 -3
  154. sky/provision/do/instance.py +4 -3
  155. sky/provision/docker_utils.py +97 -26
  156. sky/provision/fluidstack/instance.py +6 -5
  157. sky/provision/gcp/config.py +6 -1
  158. sky/provision/gcp/instance.py +4 -2
  159. sky/provision/hyperbolic/instance.py +4 -2
  160. sky/provision/instance_setup.py +66 -20
  161. sky/provision/kubernetes/__init__.py +2 -0
  162. sky/provision/kubernetes/config.py +7 -44
  163. sky/provision/kubernetes/constants.py +0 -1
  164. sky/provision/kubernetes/instance.py +609 -213
  165. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  166. sky/provision/kubernetes/network.py +12 -8
  167. sky/provision/kubernetes/network_utils.py +8 -25
  168. sky/provision/kubernetes/utils.py +382 -418
  169. sky/provision/kubernetes/volume.py +150 -18
  170. sky/provision/lambda_cloud/instance.py +16 -13
  171. sky/provision/nebius/instance.py +6 -2
  172. sky/provision/nebius/utils.py +103 -86
  173. sky/provision/oci/instance.py +4 -2
  174. sky/provision/paperspace/instance.py +4 -3
  175. sky/provision/primeintellect/__init__.py +10 -0
  176. sky/provision/primeintellect/config.py +11 -0
  177. sky/provision/primeintellect/instance.py +454 -0
  178. sky/provision/primeintellect/utils.py +398 -0
  179. sky/provision/provisioner.py +30 -9
  180. sky/provision/runpod/__init__.py +2 -0
  181. sky/provision/runpod/instance.py +4 -3
  182. sky/provision/runpod/volume.py +69 -13
  183. sky/provision/scp/instance.py +307 -130
  184. sky/provision/seeweb/__init__.py +11 -0
  185. sky/provision/seeweb/config.py +13 -0
  186. sky/provision/seeweb/instance.py +812 -0
  187. sky/provision/shadeform/__init__.py +11 -0
  188. sky/provision/shadeform/config.py +12 -0
  189. sky/provision/shadeform/instance.py +351 -0
  190. sky/provision/shadeform/shadeform_utils.py +83 -0
  191. sky/provision/vast/instance.py +5 -3
  192. sky/provision/volume.py +164 -0
  193. sky/provision/vsphere/common/ssl_helper.py +1 -1
  194. sky/provision/vsphere/common/vapiconnect.py +2 -1
  195. sky/provision/vsphere/common/vim_utils.py +3 -2
  196. sky/provision/vsphere/instance.py +8 -6
  197. sky/provision/vsphere/vsphere_utils.py +8 -1
  198. sky/resources.py +11 -3
  199. sky/schemas/api/responses.py +107 -6
  200. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  201. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  202. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  203. sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
  204. sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
  205. sky/schemas/db/serve_state/002_yaml_content.py +34 -0
  206. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  207. sky/schemas/db/spot_jobs/002_cluster_pool.py +3 -3
  208. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  209. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  210. sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
  211. sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
  212. sky/schemas/generated/jobsv1_pb2.py +86 -0
  213. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  214. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  215. sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
  216. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  217. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  218. sky/schemas/generated/servev1_pb2.py +58 -0
  219. sky/schemas/generated/servev1_pb2.pyi +115 -0
  220. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  221. sky/serve/autoscalers.py +2 -0
  222. sky/serve/client/impl.py +55 -21
  223. sky/serve/constants.py +4 -3
  224. sky/serve/controller.py +17 -11
  225. sky/serve/load_balancing_policies.py +1 -1
  226. sky/serve/replica_managers.py +219 -142
  227. sky/serve/serve_rpc_utils.py +179 -0
  228. sky/serve/serve_state.py +63 -54
  229. sky/serve/serve_utils.py +145 -109
  230. sky/serve/server/core.py +46 -25
  231. sky/serve/server/impl.py +311 -162
  232. sky/serve/server/server.py +21 -19
  233. sky/serve/service.py +84 -68
  234. sky/serve/service_spec.py +45 -7
  235. sky/server/auth/loopback.py +38 -0
  236. sky/server/auth/oauth2_proxy.py +12 -7
  237. sky/server/common.py +47 -24
  238. sky/server/config.py +62 -28
  239. sky/server/constants.py +9 -1
  240. sky/server/daemons.py +109 -38
  241. sky/server/metrics.py +76 -96
  242. sky/server/middleware_utils.py +166 -0
  243. sky/server/requests/executor.py +381 -145
  244. sky/server/requests/payloads.py +71 -18
  245. sky/server/requests/preconditions.py +15 -13
  246. sky/server/requests/request_names.py +121 -0
  247. sky/server/requests/requests.py +507 -157
  248. sky/server/requests/serializers/decoders.py +48 -17
  249. sky/server/requests/serializers/encoders.py +85 -20
  250. sky/server/requests/threads.py +117 -0
  251. sky/server/rest.py +116 -24
  252. sky/server/server.py +420 -172
  253. sky/server/stream_utils.py +219 -45
  254. sky/server/uvicorn.py +30 -19
  255. sky/setup_files/MANIFEST.in +6 -1
  256. sky/setup_files/alembic.ini +8 -0
  257. sky/setup_files/dependencies.py +62 -19
  258. sky/setup_files/setup.py +44 -44
  259. sky/sky_logging.py +13 -5
  260. sky/skylet/attempt_skylet.py +106 -24
  261. sky/skylet/configs.py +3 -1
  262. sky/skylet/constants.py +111 -26
  263. sky/skylet/events.py +64 -10
  264. sky/skylet/job_lib.py +141 -104
  265. sky/skylet/log_lib.py +233 -5
  266. sky/skylet/log_lib.pyi +40 -2
  267. sky/skylet/providers/ibm/node_provider.py +12 -8
  268. sky/skylet/providers/ibm/vpc_provider.py +13 -12
  269. sky/skylet/runtime_utils.py +21 -0
  270. sky/skylet/services.py +524 -0
  271. sky/skylet/skylet.py +22 -1
  272. sky/skylet/subprocess_daemon.py +104 -29
  273. sky/skypilot_config.py +99 -79
  274. sky/ssh_node_pools/server.py +9 -8
  275. sky/task.py +221 -104
  276. sky/templates/aws-ray.yml.j2 +1 -0
  277. sky/templates/azure-ray.yml.j2 +1 -0
  278. sky/templates/cudo-ray.yml.j2 +1 -0
  279. sky/templates/do-ray.yml.j2 +1 -0
  280. sky/templates/fluidstack-ray.yml.j2 +1 -0
  281. sky/templates/gcp-ray.yml.j2 +1 -0
  282. sky/templates/hyperbolic-ray.yml.j2 +1 -0
  283. sky/templates/ibm-ray.yml.j2 +2 -1
  284. sky/templates/jobs-controller.yaml.j2 +3 -0
  285. sky/templates/kubernetes-ray.yml.j2 +196 -55
  286. sky/templates/lambda-ray.yml.j2 +1 -0
  287. sky/templates/nebius-ray.yml.j2 +3 -0
  288. sky/templates/oci-ray.yml.j2 +1 -0
  289. sky/templates/paperspace-ray.yml.j2 +1 -0
  290. sky/templates/primeintellect-ray.yml.j2 +72 -0
  291. sky/templates/runpod-ray.yml.j2 +1 -0
  292. sky/templates/scp-ray.yml.j2 +1 -0
  293. sky/templates/seeweb-ray.yml.j2 +171 -0
  294. sky/templates/shadeform-ray.yml.j2 +73 -0
  295. sky/templates/vast-ray.yml.j2 +1 -0
  296. sky/templates/vsphere-ray.yml.j2 +1 -0
  297. sky/templates/websocket_proxy.py +188 -43
  298. sky/usage/usage_lib.py +16 -4
  299. sky/users/permission.py +60 -43
  300. sky/utils/accelerator_registry.py +6 -3
  301. sky/utils/admin_policy_utils.py +18 -5
  302. sky/utils/annotations.py +22 -0
  303. sky/utils/asyncio_utils.py +78 -0
  304. sky/utils/atomic.py +1 -1
  305. sky/utils/auth_utils.py +153 -0
  306. sky/utils/cli_utils/status_utils.py +12 -7
  307. sky/utils/cluster_utils.py +28 -6
  308. sky/utils/command_runner.py +88 -27
  309. sky/utils/command_runner.pyi +36 -3
  310. sky/utils/common.py +3 -1
  311. sky/utils/common_utils.py +37 -4
  312. sky/utils/config_utils.py +1 -14
  313. sky/utils/context.py +127 -40
  314. sky/utils/context_utils.py +73 -18
  315. sky/utils/controller_utils.py +229 -70
  316. sky/utils/db/db_utils.py +95 -18
  317. sky/utils/db/kv_cache.py +149 -0
  318. sky/utils/db/migration_utils.py +24 -7
  319. sky/utils/env_options.py +4 -0
  320. sky/utils/git.py +559 -1
  321. sky/utils/kubernetes/create_cluster.sh +15 -30
  322. sky/utils/kubernetes/delete_cluster.sh +10 -7
  323. sky/utils/kubernetes/{deploy_remote_cluster.py → deploy_ssh_node_pools.py} +258 -380
  324. sky/utils/kubernetes/generate_kind_config.py +6 -66
  325. sky/utils/kubernetes/gpu_labeler.py +13 -3
  326. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
  327. sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
  328. sky/utils/kubernetes/kubernetes_deploy_utils.py +213 -194
  329. sky/utils/kubernetes/rsync_helper.sh +11 -3
  330. sky/utils/kubernetes_enums.py +7 -15
  331. sky/utils/lock_events.py +4 -4
  332. sky/utils/locks.py +128 -31
  333. sky/utils/log_utils.py +0 -319
  334. sky/utils/resource_checker.py +13 -10
  335. sky/utils/resources_utils.py +53 -29
  336. sky/utils/rich_utils.py +8 -4
  337. sky/utils/schemas.py +107 -52
  338. sky/utils/subprocess_utils.py +17 -4
  339. sky/utils/thread_utils.py +91 -0
  340. sky/utils/timeline.py +2 -1
  341. sky/utils/ux_utils.py +35 -1
  342. sky/utils/volume.py +88 -4
  343. sky/utils/yaml_utils.py +9 -0
  344. sky/volumes/client/sdk.py +48 -10
  345. sky/volumes/server/core.py +59 -22
  346. sky/volumes/server/server.py +46 -17
  347. sky/volumes/volume.py +54 -42
  348. sky/workspaces/core.py +57 -21
  349. sky/workspaces/server.py +13 -12
  350. sky_templates/README.md +3 -0
  351. sky_templates/__init__.py +3 -0
  352. sky_templates/ray/__init__.py +0 -0
  353. sky_templates/ray/start_cluster +183 -0
  354. sky_templates/ray/stop_cluster +75 -0
  355. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/METADATA +331 -65
  356. skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
  357. skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
  358. sky/client/cli/git.py +0 -549
  359. sky/dashboard/out/_next/static/chunks/1121-408ed10b2f9fce17.js +0 -1
  360. sky/dashboard/out/_next/static/chunks/1141-943efc7aff0f0c06.js +0 -1
  361. sky/dashboard/out/_next/static/chunks/1836-37fede578e2da5f8.js +0 -40
  362. sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +0 -1
  363. sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
  364. sky/dashboard/out/_next/static/chunks/3785.4872a2f3aa489880.js +0 -1
  365. sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +0 -21
  366. sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +0 -10
  367. sky/dashboard/out/_next/static/chunks/4725.10f7a9a5d3ea8208.js +0 -1
  368. sky/dashboard/out/_next/static/chunks/5339.3fda4a4010ff4e06.js +0 -51
  369. sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
  370. sky/dashboard/out/_next/static/chunks/649.b9d7f7d10c1b8c53.js +0 -45
  371. sky/dashboard/out/_next/static/chunks/6856-dca7962af4814e1b.js +0 -1
  372. sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
  373. sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +0 -6
  374. sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +0 -18
  375. sky/dashboard/out/_next/static/chunks/7669.1f5d9a402bf5cc42.js +0 -36
  376. sky/dashboard/out/_next/static/chunks/8969-0be3036bf86f8256.js +0 -1
  377. sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
  378. sky/dashboard/out/_next/static/chunks/9037-fa1737818d0a0969.js +0 -6
  379. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1cbba24bd1bd35f8.js +0 -16
  380. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-0b4b35dc1dfe046c.js +0 -16
  381. sky/dashboard/out/_next/static/chunks/pages/clusters-469814d711d63b1b.js +0 -1
  382. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-dd64309c3fe67ed2.js +0 -11
  383. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-07349868f7905d37.js +0 -16
  384. sky/dashboard/out/_next/static/chunks/pages/jobs-1f70d9faa564804f.js +0 -1
  385. sky/dashboard/out/_next/static/chunks/pages/users-018bf31cda52e11b.js +0 -1
  386. sky/dashboard/out/_next/static/chunks/pages/volumes-739726d6b823f532.js +0 -1
  387. sky/dashboard/out/_next/static/chunks/webpack-4fe903277b57b523.js +0 -1
  388. sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
  389. sky/dashboard/out/_next/static/mS-4qZPSkRuA1u-g2wQhg/_buildManifest.js +0 -1
  390. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  391. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  392. skypilot_nightly-1.0.0.dev20250905.dist-info/RECORD +0 -547
  393. skypilot_nightly-1.0.0.dev20250905.dist-info/top_level.txt +0 -1
  394. /sky/dashboard/out/_next/static/{mS-4qZPSkRuA1u-g2wQhg → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
  395. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +0 -0
  396. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
  397. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
sky/backends/backend_utils.py +588 -184
@@ -1,4 +1,5 @@
  """Util constants/functions for the backends."""
+ import asyncio
  from datetime import datetime
  import enum
  import fnmatch
@@ -6,20 +7,24 @@ import hashlib
  import os
  import pathlib
  import pprint
+ import queue as queue_lib
  import re
  import shlex
  import subprocess
  import sys
  import tempfile
+ import threading
  import time
  import typing
- from typing import (Any, Callable, Dict, List, Optional, Sequence, Set, Tuple,
- TypeVar, Union)
+ from typing import (Any, Callable, Dict, Iterator, List, Optional, Sequence,
+ Set, Tuple, TypeVar, Union)
  import uuid

+ import aiohttp
+ from aiohttp import ClientTimeout
+ from aiohttp import TCPConnector
  import colorama
  from packaging import version
- import psutil
  from typing_extensions import Literal

  import sky
@@ -43,10 +48,12 @@ from sky.server.requests import requests as requests_lib
  from sky.skylet import autostop_lib
  from sky.skylet import constants
  from sky.usage import usage_lib
+ from sky.utils import auth_utils
  from sky.utils import cluster_utils
  from sky.utils import command_runner
  from sky.utils import common
  from sky.utils import common_utils
+ from sky.utils import context as context_lib
  from sky.utils import context_utils
  from sky.utils import controller_utils
  from sky.utils import env_options
@@ -60,6 +67,7 @@ from sky.utils import subprocess_utils
  from sky.utils import tempstore
  from sky.utils import timeline
  from sky.utils import ux_utils
+ from sky.utils import volume as volume_utils
  from sky.utils import yaml_utils
  from sky.workspaces import core as workspaces_core

@@ -75,7 +83,6 @@ if typing.TYPE_CHECKING:
  from sky import task as task_lib
  from sky.backends import cloud_vm_ray_backend
  from sky.backends import local_docker_backend
- from sky.utils import volume as volume_lib
  else:
  yaml = adaptors_common.LazyImport('yaml')
  requests = adaptors_common.LazyImport('requests')
@@ -107,8 +114,12 @@ _LAUNCHED_RESERVED_WORKER_PATTERN = re.compile(
  # 10.133.0.5: ray.worker.default,
  _LAUNCHING_IP_PATTERN = re.compile(
  r'({}): ray[._]worker[._](?:default|reserved)'.format(IP_ADDR_REGEX))
+ SSH_CONNECTION_ERROR_PATTERN = re.compile(
+ r'^ssh:.*(timed out|connection refused)$', re.IGNORECASE)
  _SSH_CONNECTION_TIMED_OUT_PATTERN = re.compile(r'^ssh:.*timed out$',
  re.IGNORECASE)
+ K8S_PODS_NOT_FOUND_PATTERN = re.compile(r'.*(NotFound|pods .* not found).*',
+ re.IGNORECASE)
  _RAY_CLUSTER_NOT_FOUND_MESSAGE = 'Ray cluster is not found'
  WAIT_HEAD_NODE_IP_MAX_ATTEMPTS = 3
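Note: the two patterns added in this hunk (SSH_CONNECTION_ERROR_PATTERN and K8S_PODS_NOT_FOUND_PATTERN) broaden the existing timed-out check so that refused SSH connections and missing Kubernetes pods can be recognized from command stderr. A minimal illustration of how such patterns classify output; the classify_stderr helper and the sample strings are illustrative only, not SkyPilot code:

import re

SSH_CONNECTION_ERROR_PATTERN = re.compile(
    r'^ssh:.*(timed out|connection refused)$', re.IGNORECASE)
K8S_PODS_NOT_FOUND_PATTERN = re.compile(r'.*(NotFound|pods .* not found).*',
                                        re.IGNORECASE)

def classify_stderr(line: str) -> str:
    # Map a single stderr line to a coarse error category.
    if SSH_CONNECTION_ERROR_PATTERN.match(line):
        return 'ssh-unreachable'
    if K8S_PODS_NOT_FOUND_PATTERN.match(line):
        return 'k8s-pods-missing'
    return 'other'

assert classify_stderr(
    'ssh: connect to host 10.0.0.3 port 22: Connection refused'
) == 'ssh-unreachable'
assert classify_stderr(
    'Error from server (NotFound): pods "sky-head" not found'
) == 'k8s-pods-missing'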

@@ -131,6 +142,7 @@ _CLUSTER_STATUS_CACHE_DURATION_SECONDS = 2

  CLUSTER_FILE_MOUNTS_LOCK_TIMEOUT_SECONDS = 10
  WORKSPACE_LOCK_TIMEOUT_SECONDS = 10
+ CLUSTER_TUNNEL_LOCK_TIMEOUT_SECONDS = 10.0

  # Remote dir that holds our runtime files.
  _REMOTE_RUNTIME_FILES_DIR = '~/.sky/.runtime_files'
@@ -209,6 +221,9 @@ _RAY_YAML_KEYS_TO_REMOVE_FOR_HASH = [
  ('provider', 'availability_zone'),
  ]

+ _ACK_MESSAGE = 'ack'
+ _FORWARDING_FROM_MESSAGE = 'Forwarding from'
+

  def is_ip(s: str) -> bool:
  """Returns whether this string matches IP_ADDR_REGEX."""
@@ -535,7 +550,7 @@ def get_expirable_clouds(
  # get all custom contexts
  contexts = kubernetes_utils.get_custom_config_k8s_contexts()
  # add remote_identity of each context if it exists
- remote_identities = None
+ remote_identities: Optional[Union[str, List[Dict[str, str]]]] = None
  for context in contexts:
  context_remote_identity = skypilot_config.get_effective_region_config(
  cloud='kubernetes',
@@ -546,9 +561,11 @@ def get_expirable_clouds(
  if remote_identities is None:
  remote_identities = []
  if isinstance(context_remote_identity, str):
+ assert isinstance(remote_identities, list)
  remote_identities.append(
  {context: context_remote_identity})
  elif isinstance(context_remote_identity, list):
+ assert isinstance(remote_identities, list)
  remote_identities.extend(context_remote_identity)
  # add global kubernetes remote identity if it exists, if not, add default
  global_remote_identity = skypilot_config.get_effective_region_config(
@@ -560,8 +577,10 @@ def get_expirable_clouds(
  if remote_identities is None:
  remote_identities = []
  if isinstance(global_remote_identity, str):
+ assert isinstance(remote_identities, list)
  remote_identities.append({'*': global_remote_identity})
  elif isinstance(global_remote_identity, list):
+ assert isinstance(remote_identities, list)
  remote_identities.extend(global_remote_identity)
  if remote_identities is None:
  remote_identities = schemas.get_default_remote_identity(
@@ -589,6 +608,11 @@ def get_expirable_clouds(
  return expirable_clouds


+ def _get_volume_name(path: str, cluster_name_on_cloud: str) -> str:
+ path_hash = hashlib.md5(path.encode()).hexdigest()[:6]
+ return f'{cluster_name_on_cloud}-{path_hash}'
+
+
  # TODO: too many things happening here - leaky abstraction. Refactor.
  @timeline.event
  def write_cluster_config(
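For reference, the new _get_volume_name helper shown above derives a stable per-mount volume name from the mount path: the first six hex characters of the path's md5 digest are appended to the cluster's on-cloud name. A standalone sketch of the same derivation (volume_name_for is a hypothetical name used only here):

import hashlib

def volume_name_for(path: str, cluster_name_on_cloud: str) -> str:
    # Same derivation as _get_volume_name: md5 of the mount path, truncated
    # to 6 hex chars, suffixed to the cluster's on-cloud name.
    path_hash = hashlib.md5(path.encode()).hexdigest()[:6]
    return f'{cluster_name_on_cloud}-{path_hash}'

# volume_name_for('/data', 'sky-1234-alice') returns a name of the form
# 'sky-1234-alice-xxxxxx', where the 6-character suffix depends only on '/data'.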
@@ -602,7 +626,7 @@ def write_cluster_config(
  zones: Optional[List[clouds.Zone]] = None,
  dryrun: bool = False,
  keep_launch_fields_in_existing_config: bool = True,
- volume_mounts: Optional[List['volume_lib.VolumeMount']] = None,
+ volume_mounts: Optional[List['volume_utils.VolumeMount']] = None,
  ) -> Dict[str, str]:
  """Fills in cluster configuration templates and writes them out.

@@ -705,11 +729,15 @@ def write_cluster_config(
  'is not supported by this cloud. Remove the config or set: '
  '`remote_identity: LOCAL_CREDENTIALS`.')
  if isinstance(cloud, clouds.Kubernetes):
- if skypilot_config.get_effective_region_config(
+ allowed_contexts = skypilot_config.get_workspace_cloud(
+ 'kubernetes').get('allowed_contexts', None)
+ if allowed_contexts is None:
+ allowed_contexts = skypilot_config.get_effective_region_config(
  cloud='kubernetes',
  region=None,
  keys=('allowed_contexts',),
- default_value=None) is None:
+ default_value=None)
+ if allowed_contexts is None:
  excluded_clouds.add(cloud)
  else:
  excluded_clouds.add(cloud)
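The rewritten Kubernetes branch above gives a workspace-level allowed_contexts setting precedence over the global config, and only excludes Kubernetes from credential handling when neither source sets it. A hedged sketch of that precedence rule, using plain dicts in place of skypilot_config:

from typing import Any, Dict, List, Optional

def resolve_allowed_contexts(
        workspace_cfg: Dict[str, Any],
        global_cfg: Dict[str, Any]) -> Optional[List[str]]:
    # Workspace-level value wins; otherwise fall back to the global config.
    allowed = workspace_cfg.get('allowed_contexts')
    if allowed is None:
        allowed = global_cfg.get('allowed_contexts')
    return allowed

# The workspace pins a single context even though the global config allows two.
assert resolve_allowed_contexts(
    {'allowed_contexts': ['ctx-a']},
    {'allowed_contexts': ['ctx-a', 'ctx-b']}) == ['ctx-a']
# With neither set, the result is None and the cloud is excluded, as above.
assert resolve_allowed_contexts({}, {}) is None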
@@ -733,7 +761,7 @@ def write_cluster_config(
  assert k not in credentials, f'{k} already in credentials'
  credentials[k] = v

- private_key_path, _ = auth.get_or_generate_keys()
+ private_key_path, _ = auth_utils.get_or_generate_keys()
  auth_config = {'ssh_private_key': private_key_path}
  region_name = resources_vars.get('region')

@@ -767,6 +795,55 @@ def write_cluster_config(
  assert region_name in ssh_proxy_command_config, (
  region_name, ssh_proxy_command_config)
  ssh_proxy_command = ssh_proxy_command_config[region_name]
+
+ use_internal_ips = skypilot_config.get_effective_region_config(
+ cloud=str(cloud).lower(),
+ region=region.name,
+ keys=('use_internal_ips',),
+ default_value=False)
+ if isinstance(cloud, clouds.AWS):
+ # If the use_ssm flag is set to true, we use the ssm proxy command.
+ use_ssm = skypilot_config.get_effective_region_config(
+ cloud=str(cloud).lower(),
+ region=region.name,
+ keys=('use_ssm',),
+ default_value=None)
+
+ if use_ssm and ssh_proxy_command is not None:
+ raise exceptions.InvalidCloudConfigs(
+ 'use_ssm is set to true, but ssh_proxy_command '
+ f'is already set to {ssh_proxy_command!r}. Please remove '
+ 'ssh_proxy_command or set use_ssm to false.')
+
+ if use_internal_ips and ssh_proxy_command is None:
+ # Only if use_ssm is explicitly not set, we default to using SSM.
+ if use_ssm is None:
+ logger.warning(
+ f'{colorama.Fore.YELLOW}'
+ 'use_internal_ips is set to true, '
+ 'but ssh_proxy_command is not set. Defaulting to '
+ 'using SSM. Specify ssh_proxy_command to use a different '
+ 'https://docs.skypilot.co/en/latest/reference/config.html#'
+ f'aws.ssh_proxy_command.{colorama.Style.RESET_ALL}')
+ use_ssm = True
+
+ if use_ssm:
+ aws_profile = os.environ.get('AWS_PROFILE', None)
+ profile_str = f'--profile {aws_profile}' if aws_profile else ''
+ ip_address_filter = ('Name=private-ip-address,Values=%h'
+ if use_internal_ips else
+ 'Name=ip-address,Values=%h')
+ get_instance_id_command = 'aws ec2 describe-instances ' + \
+ f'--region {region_name} --filters {ip_address_filter} ' + \
+ '--query \"Reservations[].Instances[].InstanceId\" ' + \
+ f'{profile_str} --output text'
+ ssm_proxy_command = 'aws ssm start-session --target ' + \
+ f'\"$({get_instance_id_command})\" ' + \
+ f'--region {region_name} {profile_str} ' + \
+ '--document-name AWS-StartSSHSession ' + \
+ '--parameters portNumber=%p'
+ ssh_proxy_command = ssm_proxy_command
+ region_name = 'ssm-session'
  logger.debug(f'Using ssh_proxy_command: {ssh_proxy_command!r}')

  # User-supplied global instance tags from ~/.sky/config.yaml.
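The AWS-only block added above assembles an SSM-based ProxyCommand whenever use_ssm is enabled (or defaulted because use_internal_ips is set and no ssh_proxy_command is given). A standalone sketch of the command it produces; build_ssm_proxy_command is a hypothetical helper, and %h/%p are the usual ssh ProxyCommand placeholders:

import os

def build_ssm_proxy_command(region_name: str, use_internal_ips: bool) -> str:
    # Mirrors the string assembled in the diff above.
    aws_profile = os.environ.get('AWS_PROFILE', None)
    profile_str = f'--profile {aws_profile}' if aws_profile else ''
    ip_address_filter = ('Name=private-ip-address,Values=%h'
                         if use_internal_ips else 'Name=ip-address,Values=%h')
    get_instance_id = ('aws ec2 describe-instances '
                       f'--region {region_name} --filters {ip_address_filter} '
                       '--query "Reservations[].Instances[].InstanceId" '
                       f'{profile_str} --output text')
    return ('aws ssm start-session --target '
            f'"$({get_instance_id})" '
            f'--region {region_name} {profile_str} '
            '--document-name AWS-StartSSHSession '
            '--parameters portNumber=%p')

# build_ssm_proxy_command('us-east-1', use_internal_ips=True) resolves the
# instance id from its private IP (%h) and opens an SSH-over-SSM session to
# port %p, so no public IP or direct SSH path is required.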
@@ -783,12 +860,6 @@ def write_cluster_config(
  if to_provision.labels:
  labels.update(to_provision.labels)

- # Dump the Ray ports to a file for Ray job submission
- dump_port_command = (
- f'{constants.SKY_PYTHON_CMD} -c \'import json, os; json.dump({constants.SKY_REMOTE_RAY_PORT_DICT_STR}, '
- f'open(os.path.expanduser("{constants.SKY_REMOTE_RAY_PORT_FILE}"), "w", encoding="utf-8"))\''
- )
-
  # We disable conda auto-activation if the user has specified a docker image
  # to use, which is likely to already have a conda environment activated.
  conda_auto_activate = ('true' if to_provision.extract_docker_image() is None
@@ -804,14 +875,24 @@ def write_cluster_config(
  cluster_name)

  volume_mount_vars = []
+ ephemeral_volume_mount_vars = []
  if volume_mounts is not None:
  for vol in volume_mounts:
- volume_mount_vars.append({
- 'name': vol.volume_name,
- 'path': vol.path,
- 'volume_name_on_cloud': vol.volume_config.name_on_cloud,
- 'volume_id_on_cloud': vol.volume_config.id_on_cloud,
- })
+ if vol.is_ephemeral:
+ volume_name = _get_volume_name(vol.path, cluster_name_on_cloud)
+ vol.volume_name = volume_name
+ vol.volume_config.cloud = repr(cloud)
+ vol.volume_config.region = region.name
+ vol.volume_config.name = volume_name
+ ephemeral_volume_mount_vars.append(vol.to_yaml_config())
+ else:
+ volume_info = volume_utils.VolumeInfo(
+ name=vol.volume_name,
+ path=vol.path,
+ volume_name_on_cloud=vol.volume_config.name_on_cloud,
+ volume_id_on_cloud=vol.volume_config.id_on_cloud,
+ )
+ volume_mount_vars.append(volume_info)

  runcmd = skypilot_config.get_effective_region_config(
  cloud=str(to_provision.cloud).lower(),
@@ -875,12 +956,14 @@ def write_cluster_config(
  '{sky_wheel_hash}',
  wheel_hash).replace('{cloud}',
  str(cloud).lower()),
+ 'copy_skypilot_templates_commands':
+ constants.COPY_SKYPILOT_TEMPLATES_COMMANDS,
  # Port of Ray (GCS server).
  # Ray's default port 6379 is conflicted with Redis.
  'ray_port': constants.SKY_REMOTE_RAY_PORT,
  'ray_dashboard_port': constants.SKY_REMOTE_RAY_DASHBOARD_PORT,
  'ray_temp_dir': constants.SKY_REMOTE_RAY_TEMPDIR,
- 'dump_port_command': dump_port_command,
+ 'dump_port_command': instance_setup.DUMP_RAY_PORTS,
  # Sky-internal constants.
  'sky_ray_cmd': constants.SKY_RAY_CMD,
  # pip install needs to have python env activated to make sure
@@ -917,9 +1000,10 @@ def write_cluster_config(

  # Volume mounts
  'volume_mounts': volume_mount_vars,
+ 'ephemeral_volume_mounts': ephemeral_volume_mount_vars,

- # runcmd to append to the cloud-init cloud config passed to the
- # machine's UserData. This is currently only used by AWS.
+ # runcmd to run before any of the SkyPilot runtime setup commands.
+ # This is currently only used by AWS and Kubernetes.
  'runcmd': runcmd,
  }),
  output_path=tmp_yaml_path)
@@ -974,11 +1058,7 @@ def write_cluster_config(
  with open(tmp_yaml_path, 'w', encoding='utf-8') as f:
  f.write(restored_yaml_content)

- # Read the cluster name from the tmp yaml file, to take the backward
- # compatbility restortion above into account.
- # TODO: remove this after 2 minor releases, 0.10.0.
- yaml_config = yaml_utils.read_yaml(tmp_yaml_path)
- config_dict['cluster_name_on_cloud'] = yaml_config['cluster_name']
+ config_dict['cluster_name_on_cloud'] = cluster_name_on_cloud

  # Make sure to do this before we optimize file mounts. Optimization is
  # non-deterministic, but everything else before this point should be
@@ -1053,6 +1133,12 @@ def _add_auth_to_cluster_config(cloud: clouds.Cloud, tmp_yaml_path: str):
  config = auth.setup_fluidstack_authentication(config)
  elif isinstance(cloud, clouds.Hyperbolic):
  config = auth.setup_hyperbolic_authentication(config)
+ elif isinstance(cloud, clouds.Shadeform):
+ config = auth.setup_shadeform_authentication(config)
+ elif isinstance(cloud, clouds.PrimeIntellect):
+ config = auth.setup_primeintellect_authentication(config)
+ elif isinstance(cloud, clouds.Seeweb):
+ config = auth.setup_seeweb_authentication(config)
  else:
  assert False, cloud
  yaml_utils.dump_yaml(tmp_yaml_path, config)
@@ -1155,7 +1241,6 @@ def _deterministic_cluster_yaml_hash(tmp_yaml_path: str) -> str:
  Rather than constructing the whole byte sequence, which may be quite large,
  we construct it incrementally by using hash.update() to add new bytes.
  """
-
  # Load the yaml contents so that we can directly remove keys.
  yaml_config = yaml_utils.read_yaml(tmp_yaml_path)
  for key_list in _RAY_YAML_KEYS_TO_REMOVE_FOR_HASH:
@@ -1738,6 +1823,32 @@ def check_network_connection():
  'Network seems down.')


+ async def async_check_network_connection():
+ """Check if the network connection is available.
+
+ Tolerates 3 retries as it is observed that connections can fail.
+ Uses aiohttp for async HTTP requests.
+ """
+ # Create a session with retry logic
+ timeout = ClientTimeout(total=15)
+ connector = TCPConnector(limit=1) # Limit to 1 connection at a time
+
+ async with aiohttp.ClientSession(timeout=timeout,
+ connector=connector) as session:
+ for i, ip in enumerate(_TEST_IP_LIST):
+ try:
+ async with session.head(ip) as response:
+ if response.status < 400: # Any 2xx or 3xx status is good
+ return
+ except (aiohttp.ClientError, asyncio.TimeoutError) as e:
+ if i == len(_TEST_IP_LIST) - 1:
+ raise exceptions.NetworkError(
+ 'Could not refresh the cluster. '
+ 'Network seems down.') from e
+ # If not the last IP, continue to try the next one
+ continue
+
+
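async_check_network_connection, added above, probes the endpoints in _TEST_IP_LIST with HEAD requests and only raises NetworkError after the last one fails. A minimal usage-style sketch under the assumption of a hypothetical endpoint list (backend_utils keeps its own _TEST_IP_LIST):

import asyncio

import aiohttp
from aiohttp import ClientTimeout, TCPConnector

# Hypothetical probe targets; any reachable HTTP(S) endpoints work.
_ENDPOINTS = ['https://8.8.8.8', 'https://1.1.1.1']

async def network_is_up() -> bool:
    # Return True as soon as one endpoint answers a HEAD request.
    timeout = ClientTimeout(total=15)
    connector = TCPConnector(limit=1)
    async with aiohttp.ClientSession(timeout=timeout,
                                     connector=connector) as session:
        for url in _ENDPOINTS:
            try:
                async with session.head(url) as resp:
                    if resp.status < 400:
                        return True
            except (aiohttp.ClientError, asyncio.TimeoutError):
                continue
    return False

# asyncio.run(network_is_up()) -> True when any endpoint is reachable.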
  @timeline.event
  def check_owner_identity(cluster_name: str) -> None:
  """Check if current user is the same as the user who created the cluster.
@@ -1750,9 +1861,18 @@ def check_owner_identity(cluster_name: str) -> None:
  """
  if env_options.Options.SKIP_CLOUD_IDENTITY_CHECK.get():
  return
- record = global_user_state.get_cluster_from_name(cluster_name)
+ record = global_user_state.get_cluster_from_name(cluster_name,
+ include_user_info=False,
+ summary_response=True)
  if record is None:
  return
+ _check_owner_identity_with_record(cluster_name, record)
+
+
+ def _check_owner_identity_with_record(cluster_name: str,
+ record: Dict[str, Any]) -> None:
+ if env_options.Options.SKIP_CLOUD_IDENTITY_CHECK.get():
+ return
  handle = record['handle']
  if not isinstance(handle, backends.CloudVmRayResourceHandle):
  return
@@ -1837,8 +1957,10 @@ def tag_filter_for_cluster(cluster_name: str) -> Dict[str, str]:
  }


+ @context_utils.cancellation_guard
  def _query_cluster_status_via_cloud_api(
- handle: 'cloud_vm_ray_backend.CloudVmRayResourceHandle'
+ handle: 'cloud_vm_ray_backend.CloudVmRayResourceHandle',
+ retry_if_missing: bool,
  ) -> List[Tuple[status_lib.ClusterStatus, Optional[str]]]:
  """Returns the status of the cluster as a list of tuples corresponding
  to the node status and an optional reason string for said status.
@@ -1865,8 +1987,11 @@ def _query_cluster_status_via_cloud_api(
  cloud_name = repr(handle.launched_resources.cloud)
  try:
  node_status_dict = provision_lib.query_instances(
- cloud_name, cluster_name, cluster_name_on_cloud,
- provider_config)
+ cloud_name,
+ cluster_name,
+ cluster_name_on_cloud,
+ provider_config,
+ retry_if_missing=retry_if_missing)
  logger.debug(f'Querying {cloud_name} cluster '
  f'{cluster_name_in_hint} '
  f'status:\n{pprint.pformat(node_status_dict)}')
@@ -2044,7 +2169,12 @@ def check_can_clone_disk_and_override_task(
  return task, handle


- def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
+ def _update_cluster_status(
+ cluster_name: str,
+ record: Dict[str, Any],
+ retry_if_missing: bool,
+ include_user_info: bool = True,
+ summary_response: bool = False) -> Optional[Dict[str, Any]]:
  """Update the cluster status.

  The cluster status is updated by checking ray cluster and real status from
@@ -2071,9 +2201,6 @@ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
  fetched from the cloud provider or there are leaked nodes causing
  the node number larger than expected.
  """
- record = global_user_state.get_cluster_from_name(cluster_name)
- if record is None:
- return None
  handle = record['handle']
  if handle.cluster_yaml is None:
  # Remove cluster from db since this cluster does not have a config file
@@ -2092,7 +2219,8 @@ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
  return record
  cluster_name = handle.cluster_name

- node_statuses = _query_cluster_status_via_cloud_api(handle)
+ node_statuses = _query_cluster_status_via_cloud_api(
+ handle, retry_if_missing=retry_if_missing)

  all_nodes_up = (all(status[0] == status_lib.ClusterStatus.UP
  for status in node_statuses) and
@@ -2140,6 +2268,11 @@ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
  total_nodes = handle.launched_nodes * handle.num_ips_per_node

  cloud_name = repr(handle.launched_resources.cloud).lower()
+ # Initialize variables in case all retries fail
+ ready_head = 0
+ ready_workers = 0
+ output = ''
+ stderr = ''
  for i in range(5):
  try:
  ready_head, ready_workers, output, stderr = (
@@ -2240,12 +2373,17 @@ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
  'All nodes up; SkyPilot runtime healthy.',
  global_user_state.ClusterEventType.STATUS_CHANGE,
  nop_if_duplicate=True)
- global_user_state.add_or_update_cluster(cluster_name,
- handle,
- requested_resources=None,
- ready=True,
- is_launch=False)
- return global_user_state.get_cluster_from_name(cluster_name)
+ global_user_state.add_or_update_cluster(
+ cluster_name,
+ handle,
+ requested_resources=None,
+ ready=True,
+ is_launch=False,
+ existing_cluster_hash=record['cluster_hash'])
+ return global_user_state.get_cluster_from_name(
+ cluster_name,
+ include_user_info=include_user_info,
+ summary_response=summary_response)

  # All cases below are transitioning the cluster to non-UP states.
  launched_resources = handle.launched_resources.assert_launchable()
@@ -2262,7 +2400,8 @@ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
  # and check again. This is a best-effort leak prevention check.
  # See https://github.com/skypilot-org/skypilot/issues/4431.
  time.sleep(_LAUNCH_DOUBLE_CHECK_DELAY)
- node_statuses = _query_cluster_status_via_cloud_api(handle)
+ node_statuses = _query_cluster_status_via_cloud_api(
+ handle, retry_if_missing=False)
  # Note: even if all the node_statuses are UP now, we will still
  # consider this cluster abnormal, and its status will be INIT.

@@ -2450,12 +2589,17 @@ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
  global_user_state.ClusterEventType.STATUS_CHANGE,
  nop_if_duplicate=True,
  duplicate_regex=init_reason_regex)
- global_user_state.add_or_update_cluster(cluster_name,
- handle,
- requested_resources=None,
- ready=False,
- is_launch=False)
- return global_user_state.get_cluster_from_name(cluster_name)
+ global_user_state.add_or_update_cluster(
+ cluster_name,
+ handle,
+ requested_resources=None,
+ ready=False,
+ is_launch=False,
+ existing_cluster_hash=record['cluster_hash'])
+ return global_user_state.get_cluster_from_name(
+ cluster_name,
+ include_user_info=include_user_info,
+ summary_response=summary_response)
  # Now is_abnormal is False: either node_statuses is empty or all nodes are
  # STOPPED.
  verb = 'terminated' if to_terminate else 'stopped'
@@ -2470,7 +2614,10 @@ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
  nop_if_duplicate=True,
  )
  backend.post_teardown_cleanup(handle, terminate=to_terminate, purge=False)
- return global_user_state.get_cluster_from_name(cluster_name)
+ return global_user_state.get_cluster_from_name(
+ cluster_name,
+ include_user_info=include_user_info,
+ summary_response=summary_response)


  def _must_refresh_cluster_status(
@@ -2492,12 +2639,14 @@ def _must_refresh_cluster_status(


  def refresh_cluster_record(
- cluster_name: str,
- *,
- force_refresh_statuses: Optional[Set[status_lib.ClusterStatus]] = None,
- acquire_per_cluster_status_lock: bool = True,
- cluster_status_lock_timeout: int = CLUSTER_STATUS_LOCK_TIMEOUT_SECONDS
- ) -> Optional[Dict[str, Any]]:
+ cluster_name: str,
+ *,
+ force_refresh_statuses: Optional[Set[status_lib.ClusterStatus]] = None,
+ cluster_lock_already_held: bool = False,
+ cluster_status_lock_timeout: int = CLUSTER_STATUS_LOCK_TIMEOUT_SECONDS,
+ include_user_info: bool = True,
+ summary_response: bool = False,
+ retry_if_missing: bool = True) -> Optional[Dict[str, Any]]:
  """Refresh the cluster, and return the possibly updated record.

  The function will update the cached cluster status in the global state. For
@@ -2514,14 +2663,20 @@ def refresh_cluster_record(
  _CLUSTER_STATUS_CACHE_DURATION_SECONDS old, and one of:
  1. the cluster is a spot cluster, or
  2. cluster autostop is set and the cluster is not STOPPED.
- acquire_per_cluster_status_lock: Whether to acquire the per-cluster lock
- before updating the status. Even if this is True, the lock may not be
- acquired if the status does not need to be refreshed.
+ cluster_lock_already_held: Whether the caller is already holding the
+ per-cluster lock. You MUST NOT set this to True if the caller does not
+ already hold the lock. If True, we will not acquire the lock before
+ updating the status. Failing to hold the lock while updating the
+ status can lead to correctness issues - e.g. an launch in-progress may
+ appear to be DOWN incorrectly. Even if this is set to False, the lock
+ may not be acquired if the status does not need to be refreshed.
  cluster_status_lock_timeout: The timeout to acquire the per-cluster
  lock. If timeout, the function will use the cached status. If the
  value is <0, do not timeout (wait for the lock indefinitely). By
  default, this is set to CLUSTER_STATUS_LOCK_TIMEOUT_SECONDS. Warning:
  if correctness is required, you must set this to -1.
+ retry_if_missing: Whether to retry the call to the cloud api if the
+ cluster is not found when querying the live status on the cloud.

  Returns:
  If the cluster is terminated or does not exist, return None.
@@ -2537,17 +2692,20 @@ def refresh_cluster_record(
  the node number larger than expected.
  """

- record = global_user_state.get_cluster_from_name(cluster_name)
+ ctx = context_lib.get()
+ record = global_user_state.get_cluster_from_name(
+ cluster_name,
+ include_user_info=include_user_info,
+ summary_response=summary_response)
  if record is None:
  return None
  # TODO(zhwu, 05/20): switch to the specific workspace to make sure we are
  # using the correct cloud credentials.
  workspace = record.get('workspace', constants.SKYPILOT_DEFAULT_WORKSPACE)
  with skypilot_config.local_active_workspace_ctx(workspace):
- check_owner_identity(cluster_name)
-
- if not isinstance(record['handle'], backends.CloudVmRayResourceHandle):
- return record
+ # check_owner_identity returns if the record handle is
+ # not a CloudVmRayResourceHandle
+ _check_owner_identity_with_record(cluster_name, record)

  # The loop logic allows us to notice if the status was updated in the
  # global_user_state by another process and stop trying to get the lock.
@@ -2556,12 +2714,18 @@ def refresh_cluster_record(

  # Loop until we have an up-to-date status or until we acquire the lock.
  while True:
+ # Check if the context is canceled.
+ if ctx is not None and ctx.is_canceled():
+ raise asyncio.CancelledError()
  # Check to see if we can return the cached status.
  if not _must_refresh_cluster_status(record, force_refresh_statuses):
  return record

- if not acquire_per_cluster_status_lock:
- return _update_cluster_status(cluster_name)
+ if cluster_lock_already_held:
+ return _update_cluster_status(cluster_name, record,
+ retry_if_missing,
+ include_user_info,
+ summary_response)

  # Try to acquire the lock so we can fetch the status.
  try:
@@ -2569,12 +2733,17 @@ def refresh_cluster_record(
  # Check the cluster status again, since it could have been
  # updated between our last check and acquiring the lock.
  record = global_user_state.get_cluster_from_name(
- cluster_name)
+ cluster_name,
+ include_user_info=include_user_info,
+ summary_response=summary_response)
  if record is None or not _must_refresh_cluster_status(
  record, force_refresh_statuses):
  return record
  # Update and return the cluster status.
- return _update_cluster_status(cluster_name)
+ return _update_cluster_status(cluster_name, record,
+ retry_if_missing,
+ include_user_info,
+ summary_response)

  except locks.LockTimeout:
  # lock.acquire() will throw a Timeout exception if the lock is not
@@ -2592,10 +2761,13 @@ def refresh_cluster_record(
  'Refreshing status: Failed get the lock for cluster '
  f'{cluster_name!r}. Using the cached status.')
  return record
- time.sleep(0.05)
+ time.sleep(lock.poll_interval)

  # Refresh for next loop iteration.
- record = global_user_state.get_cluster_from_name(cluster_name)
+ record = global_user_state.get_cluster_from_name(
+ cluster_name,
+ include_user_info=include_user_info,
+ summary_response=summary_response)
  if record is None:
  return None
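The refresh loop above follows a check-then-lock pattern: return the cached record if it is fresh, otherwise take the per-cluster lock (bounded by a timeout) before querying the cloud, and fall back to the cached record or keep polling when the lock is contended. A simplified condensation of that control flow; all of the callables here are hypothetical stand-ins for the SkyPilot lock and state helpers:

import time
from typing import Any, Callable, Dict, Optional

def refresh_with_lock(
    get_record: Callable[[], Optional[Dict[str, Any]]],
    must_refresh: Callable[[Dict[str, Any]], bool],
    update_status: Callable[[Dict[str, Any]], Optional[Dict[str, Any]]],
    try_acquire_lock: Callable[[], bool],
    release_lock: Callable[[], None],
    poll_interval: float = 0.1,
) -> Optional[Dict[str, Any]]:
    record = get_record()
    while record is not None:
        if not must_refresh(record):
            return record                # Cached status is fresh enough.
        if try_acquire_lock():           # Bounded by a timeout in SkyPilot.
            try:
                record = get_record()    # Re-check under the lock.
                if record is None or not must_refresh(record):
                    return record
                return update_status(record)
            finally:
                release_lock()
        # Lock held elsewhere: wait, then see whether that holder refreshed it.
        time.sleep(poll_interval)
        record = get_record()
    return None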
2601
2773
 
@@ -2606,8 +2778,9 @@ def refresh_cluster_status_handle(
2606
2778
  cluster_name: str,
2607
2779
  *,
2608
2780
  force_refresh_statuses: Optional[Set[status_lib.ClusterStatus]] = None,
2609
- acquire_per_cluster_status_lock: bool = True,
2610
- cluster_status_lock_timeout: int = CLUSTER_STATUS_LOCK_TIMEOUT_SECONDS
2781
+ cluster_lock_already_held: bool = False,
2782
+ cluster_status_lock_timeout: int = CLUSTER_STATUS_LOCK_TIMEOUT_SECONDS,
2783
+ retry_if_missing: bool = True,
2611
2784
  ) -> Tuple[Optional[status_lib.ClusterStatus],
2612
2785
  Optional[backends.ResourceHandle]]:
2613
2786
  """Refresh the cluster, and return the possibly updated status and handle.
@@ -2619,8 +2792,11 @@ def refresh_cluster_status_handle(
2619
2792
  record = refresh_cluster_record(
2620
2793
  cluster_name,
2621
2794
  force_refresh_statuses=force_refresh_statuses,
2622
- acquire_per_cluster_status_lock=acquire_per_cluster_status_lock,
2623
- cluster_status_lock_timeout=cluster_status_lock_timeout)
2795
+ cluster_lock_already_held=cluster_lock_already_held,
2796
+ cluster_status_lock_timeout=cluster_status_lock_timeout,
2797
+ include_user_info=False,
2798
+ summary_response=True,
2799
+ retry_if_missing=retry_if_missing)
2624
2800
  if record is None:
2625
2801
  return None, None
2626
2802
  return record['status'], record['handle']
@@ -2671,7 +2847,9 @@ def check_cluster_available(
2671
2847
  exceptions.CloudUserIdentityError: if we fail to get the current user
2672
2848
  identity.
2673
2849
  """
2674
- record = global_user_state.get_cluster_from_name(cluster_name)
2850
+ record = global_user_state.get_cluster_from_name(cluster_name,
2851
+ include_user_info=False,
2852
+ summary_response=True)
2675
2853
  if dryrun:
2676
2854
  assert record is not None, cluster_name
2677
2855
  return record['handle']
@@ -2858,7 +3036,8 @@ def is_controller_accessible(
2858
3036
  f'fatal, but {controller_name} commands/calls may hang or return '
2859
3037
  'stale information, when the controller is not up.\n'
2860
3038
  f' Details: {common_utils.format_exception(e, use_bracket=True)}')
2861
- record = global_user_state.get_cluster_from_name(cluster_name)
3039
+ record = global_user_state.get_cluster_from_name(
3040
+ cluster_name, include_user_info=False, summary_response=True)
2862
3041
  if record is not None:
2863
3042
  controller_status, handle = record['status'], record['handle']
2864
3043
  # We check the connection even if the cluster has a cached status UP
@@ -2915,22 +3094,96 @@ class CloudFilter(enum.Enum):
  LOCAL = 'local'


- def _get_glob_clusters(clusters: List[str], silent: bool = False) -> List[str]:
+ def _get_glob_clusters(
+ clusters: List[str],
+ silent: bool = False,
+ workspaces_filter: Optional[Dict[str, Any]] = None) -> List[str]:
  """Returns a list of clusters that match the glob pattern."""
  glob_clusters = []
  for cluster in clusters:
- glob_cluster = global_user_state.get_glob_cluster_names(cluster)
+ glob_cluster = global_user_state.get_glob_cluster_names(
+ cluster, workspaces_filter=workspaces_filter)
  if len(glob_cluster) == 0 and not silent:
  logger.info(f'Cluster {cluster} not found.')
  glob_clusters.extend(glob_cluster)
  return list(set(glob_clusters))


+ def _refresh_cluster(
+ cluster_name: str,
+ force_refresh_statuses: Optional[Set[status_lib.ClusterStatus]],
+ include_user_info: bool = True,
+ summary_response: bool = False) -> Optional[Dict[str, Any]]:
+ try:
+ record = refresh_cluster_record(
+ cluster_name,
+ force_refresh_statuses=force_refresh_statuses,
+ cluster_lock_already_held=False,
+ include_user_info=include_user_info,
+ summary_response=summary_response)
+ except (exceptions.ClusterStatusFetchingError,
+ exceptions.CloudUserIdentityError,
+ exceptions.ClusterOwnerIdentityMismatchError) as e:
+ # Do not fail the entire refresh process. The caller will
+ # handle the 'UNKNOWN' status, and collect the errors into
+ # a table.
+ record = {'status': 'UNKNOWN', 'error': e}
+ return record
+
+
+ def refresh_cluster_records() -> None:
+ """Refreshes the status of all clusters, except managed clusters.
+
+ Used by the background status refresh daemon.
+ This function is a stripped-down version of get_clusters, with only the
+ bare bones refresh logic.
+
+ Returns:
+ None
+
+ Raises:
+ None
+ """
+ # We force to exclude managed clusters to avoid multiple sources
+ # manipulating them. For example, SkyServe assumes the replica manager
+ # is the only source of truth for the cluster status.
+ cluster_names = set(
+ global_user_state.get_cluster_names(exclude_managed_clusters=True))
+
+ # TODO(syang): we should try not to leak
+ # request info in backend_utils.py.
+ # Refactor this to use some other info to
+ # determine if a launch is in progress.
+ cluster_names_with_launch_request = {
+ request.cluster_name for request in requests_lib.get_request_tasks(
+ req_filter=requests_lib.RequestTaskFilter(
+ status=[requests_lib.RequestStatus.RUNNING],
+ include_request_names=['sky.launch'],
+ fields=['cluster_name']))
+ }
+ cluster_names_without_launch_request = (cluster_names -
+ cluster_names_with_launch_request)
+
+ def _refresh_cluster_record(cluster_name):
+ return _refresh_cluster(cluster_name,
+ force_refresh_statuses=set(
+ status_lib.ClusterStatus),
+ include_user_info=False,
+ summary_response=True)
+
+ if len(cluster_names_without_launch_request) > 0:
+ # Do not refresh the clusters that have an active launch request.
+ subprocess_utils.run_in_parallel(_refresh_cluster_record,
+ cluster_names_without_launch_request)
+
+
  def get_clusters(
  refresh: common.StatusRefreshMode,
  cluster_names: Optional[Union[str, List[str]]] = None,
  all_users: bool = True,
  include_credentials: bool = False,
+ summary_response: bool = False,
+ include_handle: bool = True,
  # Internal only:
  # pylint: disable=invalid-name
  _include_is_managed: bool = False,
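The new `refresh_cluster_records` daemon path above skips any cluster that currently has a running `sky.launch` request and refreshes the rest in parallel, so the background refresher never races a launch that is still writing the cluster record. A minimal sketch of that exclusion pattern, using a standard thread pool and hypothetical `refresh_one`/`launching_names` inputs rather than SkyPilot's actual request-tracking APIs:

```python
# Minimal sketch (assumed helpers, not SkyPilot's real APIs): refresh every
# cluster except those with an in-flight launch, in parallel.
from concurrent.futures import ThreadPoolExecutor
from typing import Callable, Iterable, Set


def refresh_all_except_launching(all_names: Iterable[str],
                                 launching_names: Set[str],
                                 refresh_one: Callable[[str], None],
                                 max_workers: int = 16) -> None:
    """Refresh clusters in parallel, skipping ones being launched.

    Skipping avoids two writers (the launch request and the refresh
    daemon) racing to update the same cluster record.
    """
    to_refresh = [name for name in all_names if name not in launching_names]
    if not to_refresh:
        return
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        # list() drains the iterator so worker exceptions surface here.
        list(pool.map(refresh_one, to_refresh))
```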
@@ -2958,6 +3211,23 @@ def get_clusters(
  A list of cluster records. If the cluster does not exist or has been
  terminated, the record will be omitted from the returned list.
  """
+ accessible_workspaces = workspaces_core.get_workspaces()
+ if cluster_names is not None:
+ if isinstance(cluster_names, str):
+ cluster_names = [cluster_names]
+ non_glob_cluster_names = []
+ glob_cluster_names = []
+ for cluster_name in cluster_names:
+ if ux_utils.is_glob_pattern(cluster_name):
+ glob_cluster_names.append(cluster_name)
+ else:
+ non_glob_cluster_names.append(cluster_name)
+ cluster_names = non_glob_cluster_names
+ if glob_cluster_names:
+ cluster_names += _get_glob_clusters(
+ glob_cluster_names,
+ silent=True,
+ workspaces_filter=accessible_workspaces)

  exclude_managed_clusters = False
  if not (_include_is_managed or env_options.Options.SHOW_DEBUG_INFO.get()):
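`get_clusters` now splits the caller-supplied names into literal names and glob patterns before hitting the database, and only the patterns are expanded via `_get_glob_clusters`. A rough sketch of that split-and-expand step with `fnmatch`; the helper names and the character-based pattern test are illustrative assumptions, not the real `ux_utils.is_glob_pattern` implementation:

```python
# Illustrative sketch: split user-supplied names into literals and glob
# patterns, then expand the patterns against the known cluster names.
import fnmatch
from typing import Iterable, List, Tuple


def split_globs(queries: Iterable[str]) -> Tuple[List[str], List[str]]:
    """Return (literal_names, glob_patterns)."""
    literals, patterns = [], []
    for query in queries:
        if any(ch in query for ch in '*?[]'):
            patterns.append(query)
        else:
            literals.append(query)
    return literals, patterns


def expand_globs(patterns: List[str], known: List[str]) -> List[str]:
    """Expand glob patterns against known cluster names, de-duplicated."""
    matched = {
        name for pattern in patterns for name in fnmatch.filter(known, pattern)
    }
    return sorted(matched)


# Example: 'dev-1' stays literal, 'train-*' expands against known names.
literals, patterns = split_globs(['dev-1', 'train-*'])
print(literals, expand_globs(patterns, ['train-a', 'train-b', 'dev-1']))
```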
@@ -2965,34 +3235,24 @@ def get_clusters(
  user_hashes_filter = None
  if not all_users:
  user_hashes_filter = {common_utils.get_current_user().id}
- accessible_workspaces = workspaces_core.get_workspaces()
-
  records = global_user_state.get_clusters(
  exclude_managed_clusters=exclude_managed_clusters,
  user_hashes_filter=user_hashes_filter,
- workspaces_filter=accessible_workspaces)
+ workspaces_filter=accessible_workspaces,
+ cluster_names=cluster_names,
+ summary_response=summary_response)

  yellow = colorama.Fore.YELLOW
  bright = colorama.Style.BRIGHT
  reset = colorama.Style.RESET_ALL

  if cluster_names is not None:
- if isinstance(cluster_names, str):
- cluster_names = [cluster_names]
- cluster_names = _get_glob_clusters(cluster_names, silent=True)
- new_records = []
- not_exist_cluster_names = []
- for cluster_name in cluster_names:
- for record in records:
- if record['name'] == cluster_name:
- new_records.append(record)
- break
- else:
- not_exist_cluster_names.append(cluster_name)
- if not_exist_cluster_names:
- clusters_str = ', '.join(not_exist_cluster_names)
+ record_names = {record['name'] for record in records}
+ not_found_clusters = ux_utils.get_non_matched_query(
+ cluster_names, record_names)
+ if not_found_clusters:
+ clusters_str = ', '.join(not_found_clusters)
  logger.info(f'Cluster(s) not found: {bright}{clusters_str}{reset}.')
- records = new_records

  def _get_records_with_handle(
  records: List[Optional[Dict[str, Any]]]) -> List[Dict[str, Any]]:
@@ -3002,17 +3262,18 @@ def get_clusters(
  if record is not None and record['handle'] is not None
  ]

- def _update_records_with_resources_str(
+ def _update_records_with_handle_info(
  records: List[Optional[Dict[str, Any]]]) -> None:
  """Add resource str to record"""
  for record in _get_records_with_handle(records):
  handle = record['handle']
- record[
- 'resources_str'] = resources_utils.get_readable_resources_repr(
- handle, simplify=True)
- record[
- 'resources_str_full'] = resources_utils.get_readable_resources_repr(
- handle, simplify=False)
+ resource_str_simple, resource_str_full = (
+ resources_utils.get_readable_resources_repr(
+ handle, simplified_only=False))
+ record['resources_str'] = resource_str_simple
+ record['resources_str_full'] = resource_str_full
+ if not summary_response:
+ record['cluster_name_on_cloud'] = handle.cluster_name_on_cloud

  def _update_records_with_credentials(
  records: List[Optional[Dict[str, Any]]]) -> None:
@@ -3036,9 +3297,17 @@ def get_clusters(
  expanded_private_key_path = os.path.expanduser(
  ssh_private_key_path)
  if not os.path.exists(expanded_private_key_path):
- auth.create_ssh_key_files_from_db(ssh_private_key_path)
+ success = auth_utils.create_ssh_key_files_from_db(
+ ssh_private_key_path)
+ if not success:
+ # If the ssh key files are not found, we do not
+ # update the record with credentials.
+ logger.debug(
+ f'SSH keys not found for cluster {record["name"]} '
+ f'at key path {ssh_private_key_path}')
+ continue
  else:
- private_key_path, _ = auth.get_or_generate_keys()
+ private_key_path, _ = auth_utils.get_or_generate_keys()
  expanded_private_key_path = os.path.expanduser(private_key_path)
  if expanded_private_key_path in cached_private_keys:
  credential['ssh_private_key_content'] = cached_private_keys[
@@ -3052,7 +3321,7 @@ def get_clusters(
  record['credentials'] = credential

  def _update_records_with_resources(
- records: List[Optional[Dict[str, Any]]]) -> None:
+ records: List[Optional[Dict[str, Any]]],) -> None:
  """Add the resources to the record."""
  for record in _get_records_with_handle(records):
  handle = record['handle']
@@ -3070,9 +3339,11 @@ def get_clusters(
  record['accelerators'] = (
  f'{handle.launched_resources.accelerators}'
  if handle.launched_resources.accelerators else None)
+ if not include_handle:
+ record.pop('handle', None)

- # Add auth_config to the records
- _update_records_with_resources_str(records)
+ # Add handle info to the records
+ _update_records_with_handle_info(records)
  if include_credentials:
  _update_records_with_credentials(records)
  if refresh == common.StatusRefreshMode.NONE:
@@ -3093,65 +3364,76 @@ def get_clusters(
  else:
  force_refresh_statuses = None

- def _refresh_cluster(cluster_name):
- # TODO(syang): we should try not to leak
- # request info in backend_utils.py.
- # Refactor this to use some other info to
- # determine if a launch is in progress.
- request = requests_lib.get_request_tasks(
- req_filter=requests_lib.RequestTaskFilter(
- status=[requests_lib.RequestStatus.RUNNING],
- cluster_names=[cluster_name],
- include_request_names=['sky.launch']))
- if len(request) > 0:
- # There is an active launch request on the cluster,
- # so we don't want to update the cluster status until
- # the request is completed.
- logger.debug(f'skipping refresh for cluster {cluster_name} '
- 'as there is an active launch request')
- return global_user_state.get_cluster_from_name(cluster_name)
- try:
- record = refresh_cluster_record(
- cluster_name,
- force_refresh_statuses=force_refresh_statuses,
- acquire_per_cluster_status_lock=True)
- _update_records_with_resources_str([record])
+ def _refresh_cluster_record(cluster_name):
+ record = _refresh_cluster(cluster_name,
+ force_refresh_statuses=force_refresh_statuses,
+ include_user_info=True,
+ summary_response=summary_response)
+ # record may be None if the cluster is deleted during refresh,
+ # e.g. all the Pods of a cluster on Kubernetes have been
+ # deleted before refresh.
+ if record is not None and 'error' not in record:
+ _update_records_with_handle_info([record])
  if include_credentials:
  _update_records_with_credentials([record])
- except (exceptions.ClusterStatusFetchingError,
- exceptions.CloudUserIdentityError,
- exceptions.ClusterOwnerIdentityMismatchError) as e:
- # Do not fail the entire refresh process. The caller will
- # handle the 'UNKNOWN' status, and collect the errors into
- # a table.
- record = {'status': 'UNKNOWN', 'error': e}
- progress.update(task, advance=1)
+ progress.update(task, advance=1)
  return record

  cluster_names = [record['name'] for record in records]
+ # TODO(syang): we should try not to leak
+ # request info in backend_utils.py.
+ # Refactor this to use some other info to
+ # determine if a launch is in progress.
+ cluster_names_with_launch_request = {
+ request.cluster_name for request in requests_lib.get_request_tasks(
+ req_filter=requests_lib.RequestTaskFilter(
+ status=[requests_lib.RequestStatus.RUNNING],
+ include_request_names=['sky.launch'],
+ cluster_names=cluster_names,
+ fields=['cluster_name']))
+ }
+ # Preserve the index of the cluster name as it appears on "records"
+ cluster_names_without_launch_request = [
+ (i, cluster_name)
+ for i, cluster_name in enumerate(cluster_names)
+ if cluster_name not in cluster_names_with_launch_request
+ ]
+ # for clusters that have an active launch request, we do not refresh the status
  updated_records = []
- if len(cluster_names) > 0:
+ if len(cluster_names_without_launch_request) > 0:
  with progress:
  updated_records = subprocess_utils.run_in_parallel(
- _refresh_cluster, cluster_names)
-
+ _refresh_cluster_record, [
+ cluster_name
+ for _, cluster_name in cluster_names_without_launch_request
+ ])
+ # Preserve the index of the cluster name as it appears on "records"
+ # before filtering for clusters being launched.
+ updated_records_dict: Dict[int, Optional[Dict[str, Any]]] = {
+ cluster_names_without_launch_request[i][0]: updated_records[i]
+ for i in range(len(cluster_names_without_launch_request))
+ }
  # Show information for removed clusters.
  kept_records = []
  autodown_clusters, remaining_clusters, failed_clusters = [], [], []
  for i, record in enumerate(records):
- if updated_records[i] is None:
+ if i not in updated_records_dict:
+ # record was not refreshed, keep the original record
+ kept_records.append(record)
+ continue
+ updated_record = updated_records_dict[i]
+ if updated_record is None:
  if record['to_down']:
- autodown_clusters.append(cluster_names[i])
+ autodown_clusters.append(record['name'])
  else:
- remaining_clusters.append(cluster_names[i])
- elif updated_records[i]['status'] == 'UNKNOWN':
- failed_clusters.append(
- (cluster_names[i], updated_records[i]['error']))
+ remaining_clusters.append(record['name'])
+ elif updated_record['status'] == 'UNKNOWN':
+ failed_clusters.append((record['name'], updated_record['error']))
  # Keep the original record if the status is unknown,
  # so that the user can still see the cluster.
  kept_records.append(record)
  else:
- kept_records.append(updated_records[i])
+ kept_records.append(updated_record)

  if autodown_clusters:
  plural = 's' if len(autodown_clusters) > 1 else ''
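Because clusters with an in-flight launch are filtered out before the parallel refresh, the hunk above keeps each refreshed cluster's original position in `records` and merges results back by index: skipped entries keep their old record, `None` results (clusters that disappeared mid-refresh) are dropped, and everything else is replaced. A compact sketch of that merge logic with generic callables (not the actual SkyPilot record schema):

```python
# Sketch: refresh only the records not being launched, remembering each one's
# index in the original list, then merge the results back positionally.
from typing import Callable, Dict, List, Optional


def refresh_and_merge(
        records: List[dict],
        skip: Callable[[dict], bool],
        refresh: Callable[[dict], Optional[dict]]) -> List[dict]:
    """Skipped records keep their old value; None results are dropped."""
    # A real implementation would run `refresh` in parallel (as the diff does
    # via subprocess_utils.run_in_parallel); sequential keeps the sketch short.
    refreshed: Dict[int, Optional[dict]] = {
        i: refresh(record)
        for i, record in enumerate(records)
        if not skip(record)
    }
    merged: List[dict] = []
    for i, record in enumerate(records):
        if i not in refreshed:
            merged.append(record)  # not refreshed: keep the original
        elif refreshed[i] is None:
            continue  # disappeared during refresh: drop it
        else:
            merged.append(refreshed[i])
    return merged
```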
@@ -3352,13 +3634,8 @@ def check_stale_runtime_on_remote(returncode: int, stderr: str,
  `stderr`. Typically due to the local client version just got updated, and
  the remote runtime is an older version.
  """
- pattern = re.compile(r'AttributeError: module \'sky\.(.*)\' has no '
- r'attribute \'(.*)\'')
  if returncode != 0:
- # TODO(zhwu): Backward compatibility for old SkyPilot runtime version on
- # the remote cluster. Remove this after 0.10.0 is released.
- attribute_error = re.findall(pattern, stderr)
- if attribute_error or 'SkyPilot runtime is too old' in stderr:
+ if 'SkyPilot runtime is too old' in stderr:
  with ux_utils.print_exception_no_traceback():
  raise RuntimeError(
  f'{colorama.Fore.RED}SkyPilot runtime needs to be updated '
@@ -3502,19 +3779,126 @@ def workspace_lock_id(workspace_name: str) -> str:
  return f'{workspace_name}_workspace'


+ def cluster_tunnel_lock_id(cluster_name: str) -> str:
+ """Get the lock ID for cluster tunnel operations."""
+ return f'{cluster_name}_ssh_tunnel'
+
+
+ def open_ssh_tunnel(head_runner: Union[command_runner.SSHCommandRunner,
+ command_runner.KubernetesCommandRunner],
+ port_forward: Tuple[int, int]) -> subprocess.Popen:
+ local_port, remote_port = port_forward
+ if isinstance(head_runner, command_runner.SSHCommandRunner):
+ # Disabling ControlMaster makes things easier to reason about
+ # with respect to resource management/ownership,
+ # as killing the process will close the tunnel too.
+ head_runner.disable_control_master = True
+ head_runner.port_forward_execute_remote_command = True
+
+ # The default connect_timeout of 1s is too short for
+ # connecting to clusters using a jump server.
+ # We use NON_INTERACTIVE mode to avoid allocating a pseudo-tty,
+ # which is counted towards non-idleness.
+ cmd: List[str] = head_runner.port_forward_command(
+ [(local_port, remote_port)],
+ connect_timeout=5,
+ ssh_mode=command_runner.SshMode.NON_INTERACTIVE)
+ if isinstance(head_runner, command_runner.SSHCommandRunner):
+ # cat so the command doesn't exit until we kill it
+ cmd += [f'"echo {_ACK_MESSAGE} && cat"']
+ cmd_str = ' '.join(cmd)
+ logger.debug(f'Running port forward command: {cmd_str}')
+ ssh_tunnel_proc = subprocess.Popen(cmd_str,
+ shell=True,
+ stdin=subprocess.PIPE,
+ stdout=subprocess.PIPE,
+ stderr=subprocess.PIPE,
+ start_new_session=True,
+ text=True)
+ # Wait until we receive an ack from the remote cluster or
+ # the SSH connection times out.
+ queue: queue_lib.Queue = queue_lib.Queue()
+ stdout_thread = threading.Thread(
+ target=lambda queue, stdout: queue.put(stdout.readline()),
+ args=(queue, ssh_tunnel_proc.stdout),
+ daemon=True)
+ stdout_thread.start()
+ while ssh_tunnel_proc.poll() is None:
+ try:
+ ack = queue.get_nowait()
+ except queue_lib.Empty:
+ ack = None
+ time.sleep(0.1)
+ continue
+ assert ack is not None
+ if isinstance(
+ head_runner,
+ command_runner.SSHCommandRunner) and ack == f'{_ACK_MESSAGE}\n':
+ break
+ elif isinstance(head_runner, command_runner.KubernetesCommandRunner
+ ) and _FORWARDING_FROM_MESSAGE in ack:
+ # On kind clusters, this error occurs if we make a request
+ # immediately after the port-forward is established on a new pod:
+ # "Unhandled Error" err="an error occurred forwarding ... -> 46590:
+ # failed to execute portforward in network namespace
+ # "/var/run/netns/cni-...": failed to connect to localhost:46590
+ # inside namespace "...", IPv4: dial tcp4 127.0.0.1:46590:
+ # connect: connection refused
+ # So we need to poll the port on the pod to check if it is open.
+ # We did not observe this with real Kubernetes clusters.
+ timeout = 5
+ port_check_cmd = (
+ # We install netcat in our ray-node container,
+ # so we can use it here.
+ # (See kubernetes-ray.yml.j2)
+ f'end=$((SECONDS+{timeout})); '
+ f'while ! nc -z -w 1 localhost {remote_port}; do '
+ 'if (( SECONDS >= end )); then exit 1; fi; '
+ 'sleep 0.1; '
+ 'done')
+ returncode, stdout, stderr = head_runner.run(port_check_cmd,
+ require_outputs=True,
+ stream_logs=False)
+ if returncode != 0:
+ try:
+ ssh_tunnel_proc.terminate()
+ ssh_tunnel_proc.wait(timeout=5)
+ except subprocess.TimeoutExpired:
+ ssh_tunnel_proc.kill()
+ ssh_tunnel_proc.wait()
+ finally:
+ error_msg = (f'Failed to check remote port {remote_port}')
+ if stdout:
+ error_msg += f'\n-- stdout --\n{stdout}\n'
+ raise exceptions.CommandError(returncode=returncode,
+ command=cmd_str,
+ error_msg=error_msg,
+ detailed_reason=stderr)
+ break
+
+ if ssh_tunnel_proc.poll() is not None:
+ stdout, stderr = ssh_tunnel_proc.communicate()
+ error_msg = 'Port forward failed'
+ if stdout:
+ error_msg += f'\n-- stdout --\n{stdout}\n'
+ raise exceptions.CommandError(returncode=ssh_tunnel_proc.returncode,
+ command=cmd_str,
+ error_msg=error_msg,
+ detailed_reason=stderr)
+ return ssh_tunnel_proc
+
+
  T = TypeVar('T')


- def invoke_skylet_with_retries(
- handle: 'cloud_vm_ray_backend.CloudVmRayResourceHandle',
- func: Callable[..., T]) -> T:
+ def invoke_skylet_with_retries(func: Callable[..., T]) -> T:
  """Generic helper for making Skylet gRPC requests.

  This method handles the common pattern of:
  1. Try the gRPC request
  2. If SSH tunnel is closed, recreate it and retry
  """
- max_attempts = 3
+ max_attempts = 5
  backoff = common_utils.Backoff(initial_backoff=0.5)
  last_exception: Optional[Exception] = None

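`open_ssh_tunnel` above treats the tunnel as ready only after an acknowledgement line arrives on the subprocess's stdout, read by a daemon thread so the main loop can also watch for an early exit. A stripped-down sketch of that readiness handshake; the `ACK` string and the echo-based demo command are placeholders for the real SSH port-forward command:

```python
# Sketch: start a long-lived subprocess and wait, with a timeout, for a
# readiness line on its stdout, read by a background thread so poll() can
# still detect an early exit.
import queue
import subprocess
import threading
import time

ACK = 'tunnel-ready'  # placeholder for the real ack message


def open_with_ack(cmd: str, timeout: float = 10.0) -> subprocess.Popen:
    proc = subprocess.Popen(cmd,
                            shell=True,
                            stdout=subprocess.PIPE,
                            stderr=subprocess.PIPE,
                            start_new_session=True,
                            text=True)
    lines: 'queue.Queue[str]' = queue.Queue()
    threading.Thread(target=lambda: lines.put(proc.stdout.readline()),
                     daemon=True).start()
    deadline = time.time() + timeout
    while proc.poll() is None and time.time() < deadline:
        try:
            line = lines.get_nowait()
        except queue.Empty:
            time.sleep(0.1)
            continue
        if ACK in line:
            # Ready: the caller owns the process and must terminate it later.
            return proc
        break  # first line was not the ack; treat as failure
    proc.terminate()
    _, err = proc.communicate()
    raise RuntimeError(f'Tunnel did not come up: {err.strip()}')


# The echo-then-sleep command stands in for an ssh -L port-forward command.
tunnel = open_with_ack(f'echo {ACK} && sleep 30')
tunnel.terminate()
```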
@@ -3523,26 +3907,46 @@ def invoke_skylet_with_retries(
  return func()
  except grpc.RpcError as e:
  last_exception = e
- if e.code() == grpc.StatusCode.INTERNAL:
- with ux_utils.print_exception_no_traceback():
- raise exceptions.SkyletInternalError(e.details())
- elif e.code() == grpc.StatusCode.UNAVAILABLE:
- recreate_tunnel = True
- try:
- if handle.skylet_ssh_tunnel is not None:
- proc = psutil.Process(handle.skylet_ssh_tunnel.pid)
- if proc.is_running(
- ) and proc.status() != psutil.STATUS_ZOMBIE:
- recreate_tunnel = False
- except psutil.NoSuchProcess:
- pass
-
- if recreate_tunnel:
- handle.open_and_update_skylet_tunnel()
-
- time.sleep(backoff.current_backoff())
- else:
- raise e
+ _handle_grpc_error(e, backoff.current_backoff())

- raise RuntimeError(f'Failed to invoke Skylet after {max_attempts} attempts'
- ) from last_exception
+ raise RuntimeError(
+ f'Failed to invoke Skylet after {max_attempts} attempts: {last_exception}'
+ ) from last_exception
+
+
+ def invoke_skylet_streaming_with_retries(
+ stream_func: Callable[..., Iterator[T]]) -> Iterator[T]:
+ """Generic helper for making Skylet streaming gRPC requests."""
+ max_attempts = 3
+ backoff = common_utils.Backoff(initial_backoff=0.5)
+ last_exception: Optional[Exception] = None
+
+ for _ in range(max_attempts):
+ try:
+ for response in stream_func():
+ yield response
+ return
+ except grpc.RpcError as e:
+ last_exception = e
+ _handle_grpc_error(e, backoff.current_backoff())
+
+ raise RuntimeError(
+ f'Failed to stream Skylet response after {max_attempts} attempts'
+ ) from last_exception
+
+
+ def _handle_grpc_error(e: 'grpc.RpcError', current_backoff: float) -> None:
+ if e.code() == grpc.StatusCode.INTERNAL:
+ with ux_utils.print_exception_no_traceback():
+ raise exceptions.SkyletInternalError(e.details())
+ elif e.code() == grpc.StatusCode.UNAVAILABLE:
+ time.sleep(current_backoff)
+ elif e.code() == grpc.StatusCode.UNIMPLEMENTED or e.code(
+ ) == grpc.StatusCode.UNKNOWN:
+ # Handle backwards compatibility: old server doesn't implement this RPC.
+ # Let the caller fall back to legacy execution.
+ raise exceptions.SkyletMethodNotImplementedError(
+ f'gRPC method not implemented on server, falling back to legacy execution: {e.details()}'
+ )
+ else:
+ raise e
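The refactor above centralizes gRPC status handling in `_handle_grpc_error`: INTERNAL surfaces immediately, UNAVAILABLE backs off and lets the retry loop continue, and UNIMPLEMENTED/UNKNOWN signals a fallback to the legacy execution path. A generic retry-with-backoff sketch of the same shape; the exception classes here are placeholders rather than SkyPilot's `exceptions` module:

```python
# Sketch: retry a unary gRPC call with exponential backoff, retrying only on
# UNAVAILABLE and translating other status codes into distinct errors.
import time
from typing import Callable, TypeVar

import grpc

T = TypeVar('T')


class ServerInternalError(Exception):
    """Placeholder for an internal server failure."""


class MethodNotSupportedError(Exception):
    """Placeholder signal to fall back to a legacy code path."""


def call_with_retries(func: Callable[[], T],
                      max_attempts: int = 5,
                      initial_backoff: float = 0.5) -> T:
    backoff = initial_backoff
    last_error: Exception = RuntimeError('no attempts made')
    for _ in range(max_attempts):
        try:
            return func()
        except grpc.RpcError as e:
            last_error = e
            code = e.code()
            if code == grpc.StatusCode.UNAVAILABLE:
                time.sleep(backoff)  # transient: back off and retry
                backoff *= 2
            elif code == grpc.StatusCode.INTERNAL:
                raise ServerInternalError(e.details()) from e
            elif code in (grpc.StatusCode.UNIMPLEMENTED,
                          grpc.StatusCode.UNKNOWN):
                raise MethodNotSupportedError(e.details()) from e
            else:
                raise
    raise RuntimeError(
        f'RPC failed after {max_attempts} attempts') from last_error
```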