skypilot-nightly 1.0.0.dev20250905__py3-none-any.whl → 1.0.0.dev20251210__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (429)
  1. sky/__init__.py +12 -2
  2. sky/adaptors/aws.py +27 -22
  3. sky/adaptors/common.py +25 -2
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/gcp.py +11 -0
  7. sky/adaptors/ibm.py +5 -2
  8. sky/adaptors/kubernetes.py +64 -0
  9. sky/adaptors/nebius.py +3 -1
  10. sky/adaptors/primeintellect.py +1 -0
  11. sky/adaptors/seeweb.py +183 -0
  12. sky/adaptors/shadeform.py +89 -0
  13. sky/adaptors/slurm.py +478 -0
  14. sky/admin_policy.py +20 -0
  15. sky/authentication.py +157 -263
  16. sky/backends/__init__.py +3 -2
  17. sky/backends/backend.py +11 -3
  18. sky/backends/backend_utils.py +630 -185
  19. sky/backends/cloud_vm_ray_backend.py +1111 -928
  20. sky/backends/local_docker_backend.py +9 -5
  21. sky/backends/task_codegen.py +971 -0
  22. sky/backends/wheel_utils.py +18 -0
  23. sky/catalog/__init__.py +8 -3
  24. sky/catalog/aws_catalog.py +4 -0
  25. sky/catalog/common.py +19 -1
  26. sky/catalog/data_fetchers/fetch_aws.py +102 -80
  27. sky/catalog/data_fetchers/fetch_gcp.py +30 -3
  28. sky/catalog/data_fetchers/fetch_nebius.py +9 -6
  29. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  30. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  31. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  32. sky/catalog/kubernetes_catalog.py +36 -32
  33. sky/catalog/primeintellect_catalog.py +95 -0
  34. sky/catalog/runpod_catalog.py +5 -1
  35. sky/catalog/seeweb_catalog.py +184 -0
  36. sky/catalog/shadeform_catalog.py +165 -0
  37. sky/catalog/slurm_catalog.py +243 -0
  38. sky/check.py +87 -46
  39. sky/client/cli/command.py +1004 -434
  40. sky/client/cli/flags.py +4 -2
  41. sky/{volumes/utils.py → client/cli/table_utils.py} +111 -13
  42. sky/client/cli/utils.py +79 -0
  43. sky/client/common.py +12 -2
  44. sky/client/sdk.py +188 -65
  45. sky/client/sdk_async.py +34 -33
  46. sky/cloud_stores.py +82 -3
  47. sky/clouds/__init__.py +8 -0
  48. sky/clouds/aws.py +337 -129
  49. sky/clouds/azure.py +24 -18
  50. sky/clouds/cloud.py +47 -13
  51. sky/clouds/cudo.py +16 -13
  52. sky/clouds/do.py +9 -7
  53. sky/clouds/fluidstack.py +12 -5
  54. sky/clouds/gcp.py +14 -7
  55. sky/clouds/hyperbolic.py +12 -5
  56. sky/clouds/ibm.py +12 -5
  57. sky/clouds/kubernetes.py +80 -45
  58. sky/clouds/lambda_cloud.py +12 -5
  59. sky/clouds/nebius.py +23 -9
  60. sky/clouds/oci.py +19 -12
  61. sky/clouds/paperspace.py +4 -1
  62. sky/clouds/primeintellect.py +317 -0
  63. sky/clouds/runpod.py +85 -24
  64. sky/clouds/scp.py +12 -8
  65. sky/clouds/seeweb.py +477 -0
  66. sky/clouds/shadeform.py +400 -0
  67. sky/clouds/slurm.py +578 -0
  68. sky/clouds/ssh.py +6 -3
  69. sky/clouds/utils/scp_utils.py +61 -50
  70. sky/clouds/vast.py +43 -27
  71. sky/clouds/vsphere.py +14 -16
  72. sky/core.py +296 -195
  73. sky/dashboard/out/404.html +1 -1
  74. sky/dashboard/out/_next/static/KYAhEFa3FTfq4JyKVgo-s/_buildManifest.js +1 -0
  75. sky/dashboard/out/_next/static/chunks/1141-9c810f01ff4f398a.js +11 -0
  76. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
  77. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  78. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  79. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
  80. sky/dashboard/out/_next/static/chunks/3294.ddda8c6c6f9f24dc.js +1 -0
  81. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
  82. sky/dashboard/out/_next/static/chunks/{6601-06114c982db410b6.js → 3800-b589397dc09c5b4e.js} +1 -1
  83. sky/dashboard/out/_next/static/chunks/3850-fd5696f3bbbaddae.js +1 -0
  84. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
  85. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  86. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  87. sky/dashboard/out/_next/static/chunks/6856-da20c5fd999f319c.js +1 -0
  88. sky/dashboard/out/_next/static/chunks/6990-09cbf02d3cd518c3.js +1 -0
  89. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  90. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
  91. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  92. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
  93. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  94. sky/dashboard/out/_next/static/chunks/9353-8369df1cf105221c.js +1 -0
  95. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
  96. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  97. sky/dashboard/out/_next/static/chunks/pages/_app-68b647e26f9d2793.js +34 -0
  98. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-33f525539665fdfd.js +16 -0
  99. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a7565f586ef86467.js +1 -0
  100. sky/dashboard/out/_next/static/chunks/pages/clusters-9e5d47818b9bdadd.js +1 -0
  101. sky/dashboard/out/_next/static/chunks/pages/{config-dfb9bf07b13045f4.js → config-718cdc365de82689.js} +1 -1
  102. sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-6563820e094f68ca.js → [context]-12c559ec4d81fdbd.js} +1 -1
  103. sky/dashboard/out/_next/static/chunks/pages/{infra-aabba60d57826e0f.js → infra-d187cd0413d72475.js} +1 -1
  104. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-895847b6cf200b04.js +16 -0
  105. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-8d0f4655400b4eb9.js +21 -0
  106. sky/dashboard/out/_next/static/chunks/pages/jobs-e5a98f17f8513a96.js +1 -0
  107. sky/dashboard/out/_next/static/chunks/pages/plugins/[...slug]-4f46050ca065d8f8.js +1 -0
  108. sky/dashboard/out/_next/static/chunks/pages/users-2f7646eb77785a2c.js +1 -0
  109. sky/dashboard/out/_next/static/chunks/pages/volumes-ef19d49c6d0e8500.js +1 -0
  110. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-af76bb06dbb3954f.js → [name]-96e0f298308da7e2.js} +1 -1
  111. sky/dashboard/out/_next/static/chunks/pages/{workspaces-7598c33a746cdc91.js → workspaces-cb4da3abe08ebf19.js} +1 -1
  112. sky/dashboard/out/_next/static/chunks/webpack-fba3de387ff6bb08.js +1 -0
  113. sky/dashboard/out/_next/static/css/c5a4cfd2600fc715.css +3 -0
  114. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  115. sky/dashboard/out/clusters/[cluster].html +1 -1
  116. sky/dashboard/out/clusters.html +1 -1
  117. sky/dashboard/out/config.html +1 -1
  118. sky/dashboard/out/index.html +1 -1
  119. sky/dashboard/out/infra/[context].html +1 -1
  120. sky/dashboard/out/infra.html +1 -1
  121. sky/dashboard/out/jobs/[job].html +1 -1
  122. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  123. sky/dashboard/out/jobs.html +1 -1
  124. sky/dashboard/out/plugins/[...slug].html +1 -0
  125. sky/dashboard/out/users.html +1 -1
  126. sky/dashboard/out/volumes.html +1 -1
  127. sky/dashboard/out/workspace/new.html +1 -1
  128. sky/dashboard/out/workspaces/[name].html +1 -1
  129. sky/dashboard/out/workspaces.html +1 -1
  130. sky/data/data_utils.py +92 -1
  131. sky/data/mounting_utils.py +177 -30
  132. sky/data/storage.py +200 -19
  133. sky/data/storage_utils.py +10 -45
  134. sky/exceptions.py +18 -7
  135. sky/execution.py +74 -31
  136. sky/global_user_state.py +605 -191
  137. sky/jobs/__init__.py +2 -0
  138. sky/jobs/client/sdk.py +101 -4
  139. sky/jobs/client/sdk_async.py +31 -5
  140. sky/jobs/constants.py +15 -8
  141. sky/jobs/controller.py +726 -284
  142. sky/jobs/file_content_utils.py +128 -0
  143. sky/jobs/log_gc.py +193 -0
  144. sky/jobs/recovery_strategy.py +250 -100
  145. sky/jobs/scheduler.py +271 -173
  146. sky/jobs/server/core.py +367 -114
  147. sky/jobs/server/server.py +81 -35
  148. sky/jobs/server/utils.py +89 -35
  149. sky/jobs/state.py +1498 -620
  150. sky/jobs/utils.py +771 -306
  151. sky/logs/agent.py +40 -5
  152. sky/logs/aws.py +9 -19
  153. sky/metrics/utils.py +282 -39
  154. sky/models.py +2 -0
  155. sky/optimizer.py +7 -6
  156. sky/provision/__init__.py +38 -1
  157. sky/provision/aws/config.py +34 -13
  158. sky/provision/aws/instance.py +5 -2
  159. sky/provision/azure/instance.py +5 -3
  160. sky/provision/common.py +22 -0
  161. sky/provision/cudo/instance.py +4 -3
  162. sky/provision/do/instance.py +4 -3
  163. sky/provision/docker_utils.py +112 -28
  164. sky/provision/fluidstack/instance.py +6 -5
  165. sky/provision/gcp/config.py +6 -1
  166. sky/provision/gcp/instance.py +4 -2
  167. sky/provision/hyperbolic/instance.py +4 -2
  168. sky/provision/instance_setup.py +66 -20
  169. sky/provision/kubernetes/__init__.py +2 -0
  170. sky/provision/kubernetes/config.py +7 -44
  171. sky/provision/kubernetes/constants.py +0 -1
  172. sky/provision/kubernetes/instance.py +609 -213
  173. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  174. sky/provision/kubernetes/network.py +12 -8
  175. sky/provision/kubernetes/network_utils.py +8 -25
  176. sky/provision/kubernetes/utils.py +422 -422
  177. sky/provision/kubernetes/volume.py +150 -18
  178. sky/provision/lambda_cloud/instance.py +16 -13
  179. sky/provision/nebius/instance.py +6 -2
  180. sky/provision/nebius/utils.py +103 -86
  181. sky/provision/oci/instance.py +4 -2
  182. sky/provision/paperspace/instance.py +4 -3
  183. sky/provision/primeintellect/__init__.py +10 -0
  184. sky/provision/primeintellect/config.py +11 -0
  185. sky/provision/primeintellect/instance.py +454 -0
  186. sky/provision/primeintellect/utils.py +398 -0
  187. sky/provision/provisioner.py +45 -15
  188. sky/provision/runpod/__init__.py +2 -0
  189. sky/provision/runpod/instance.py +4 -3
  190. sky/provision/runpod/volume.py +69 -13
  191. sky/provision/scp/instance.py +307 -130
  192. sky/provision/seeweb/__init__.py +11 -0
  193. sky/provision/seeweb/config.py +13 -0
  194. sky/provision/seeweb/instance.py +812 -0
  195. sky/provision/shadeform/__init__.py +11 -0
  196. sky/provision/shadeform/config.py +12 -0
  197. sky/provision/shadeform/instance.py +351 -0
  198. sky/provision/shadeform/shadeform_utils.py +83 -0
  199. sky/provision/slurm/__init__.py +12 -0
  200. sky/provision/slurm/config.py +13 -0
  201. sky/provision/slurm/instance.py +572 -0
  202. sky/provision/slurm/utils.py +583 -0
  203. sky/provision/vast/instance.py +9 -4
  204. sky/provision/vast/utils.py +10 -6
  205. sky/provision/volume.py +164 -0
  206. sky/provision/vsphere/common/ssl_helper.py +1 -1
  207. sky/provision/vsphere/common/vapiconnect.py +2 -1
  208. sky/provision/vsphere/common/vim_utils.py +3 -2
  209. sky/provision/vsphere/instance.py +8 -6
  210. sky/provision/vsphere/vsphere_utils.py +8 -1
  211. sky/resources.py +11 -3
  212. sky/schemas/api/responses.py +107 -6
  213. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  214. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  215. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  216. sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
  217. sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
  218. sky/schemas/db/serve_state/002_yaml_content.py +34 -0
  219. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  220. sky/schemas/db/spot_jobs/002_cluster_pool.py +3 -3
  221. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  222. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  223. sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
  224. sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
  225. sky/schemas/generated/jobsv1_pb2.py +86 -0
  226. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  227. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  228. sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
  229. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  230. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  231. sky/schemas/generated/servev1_pb2.py +58 -0
  232. sky/schemas/generated/servev1_pb2.pyi +115 -0
  233. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  234. sky/serve/autoscalers.py +2 -0
  235. sky/serve/client/impl.py +55 -21
  236. sky/serve/constants.py +4 -3
  237. sky/serve/controller.py +17 -11
  238. sky/serve/load_balancing_policies.py +1 -1
  239. sky/serve/replica_managers.py +219 -142
  240. sky/serve/serve_rpc_utils.py +179 -0
  241. sky/serve/serve_state.py +63 -54
  242. sky/serve/serve_utils.py +145 -109
  243. sky/serve/server/core.py +46 -25
  244. sky/serve/server/impl.py +311 -162
  245. sky/serve/server/server.py +21 -19
  246. sky/serve/service.py +84 -68
  247. sky/serve/service_spec.py +45 -7
  248. sky/server/auth/loopback.py +38 -0
  249. sky/server/auth/oauth2_proxy.py +12 -7
  250. sky/server/common.py +47 -24
  251. sky/server/config.py +62 -28
  252. sky/server/constants.py +9 -1
  253. sky/server/daemons.py +109 -38
  254. sky/server/metrics.py +76 -96
  255. sky/server/middleware_utils.py +166 -0
  256. sky/server/plugins.py +222 -0
  257. sky/server/requests/executor.py +384 -145
  258. sky/server/requests/payloads.py +83 -19
  259. sky/server/requests/preconditions.py +15 -13
  260. sky/server/requests/request_names.py +123 -0
  261. sky/server/requests/requests.py +511 -157
  262. sky/server/requests/serializers/decoders.py +48 -17
  263. sky/server/requests/serializers/encoders.py +102 -20
  264. sky/server/requests/serializers/return_value_serializers.py +60 -0
  265. sky/server/requests/threads.py +117 -0
  266. sky/server/rest.py +116 -24
  267. sky/server/server.py +497 -179
  268. sky/server/server_utils.py +30 -0
  269. sky/server/stream_utils.py +219 -45
  270. sky/server/uvicorn.py +30 -19
  271. sky/setup_files/MANIFEST.in +6 -1
  272. sky/setup_files/alembic.ini +8 -0
  273. sky/setup_files/dependencies.py +64 -19
  274. sky/setup_files/setup.py +44 -44
  275. sky/sky_logging.py +13 -5
  276. sky/skylet/attempt_skylet.py +116 -24
  277. sky/skylet/configs.py +3 -1
  278. sky/skylet/constants.py +139 -29
  279. sky/skylet/events.py +74 -14
  280. sky/skylet/executor/__init__.py +1 -0
  281. sky/skylet/executor/slurm.py +189 -0
  282. sky/skylet/job_lib.py +143 -105
  283. sky/skylet/log_lib.py +252 -8
  284. sky/skylet/log_lib.pyi +47 -7
  285. sky/skylet/providers/ibm/node_provider.py +12 -8
  286. sky/skylet/providers/ibm/vpc_provider.py +13 -12
  287. sky/skylet/runtime_utils.py +21 -0
  288. sky/skylet/services.py +524 -0
  289. sky/skylet/skylet.py +27 -2
  290. sky/skylet/subprocess_daemon.py +104 -28
  291. sky/skypilot_config.py +99 -79
  292. sky/ssh_node_pools/constants.py +12 -0
  293. sky/ssh_node_pools/core.py +40 -3
  294. sky/ssh_node_pools/deploy/__init__.py +4 -0
  295. sky/ssh_node_pools/deploy/deploy.py +952 -0
  296. sky/ssh_node_pools/deploy/tunnel_utils.py +199 -0
  297. sky/ssh_node_pools/deploy/utils.py +173 -0
  298. sky/ssh_node_pools/server.py +20 -21
  299. sky/{utils/kubernetes/ssh_utils.py → ssh_node_pools/utils.py} +9 -6
  300. sky/task.py +221 -104
  301. sky/templates/aws-ray.yml.j2 +1 -0
  302. sky/templates/azure-ray.yml.j2 +1 -0
  303. sky/templates/cudo-ray.yml.j2 +1 -0
  304. sky/templates/do-ray.yml.j2 +1 -0
  305. sky/templates/fluidstack-ray.yml.j2 +1 -0
  306. sky/templates/gcp-ray.yml.j2 +1 -0
  307. sky/templates/hyperbolic-ray.yml.j2 +1 -0
  308. sky/templates/ibm-ray.yml.j2 +2 -1
  309. sky/templates/jobs-controller.yaml.j2 +3 -0
  310. sky/templates/kubernetes-ray.yml.j2 +204 -55
  311. sky/templates/lambda-ray.yml.j2 +1 -0
  312. sky/templates/nebius-ray.yml.j2 +3 -0
  313. sky/templates/oci-ray.yml.j2 +1 -0
  314. sky/templates/paperspace-ray.yml.j2 +1 -0
  315. sky/templates/primeintellect-ray.yml.j2 +72 -0
  316. sky/templates/runpod-ray.yml.j2 +1 -0
  317. sky/templates/scp-ray.yml.j2 +1 -0
  318. sky/templates/seeweb-ray.yml.j2 +171 -0
  319. sky/templates/shadeform-ray.yml.j2 +73 -0
  320. sky/templates/slurm-ray.yml.j2 +85 -0
  321. sky/templates/vast-ray.yml.j2 +2 -0
  322. sky/templates/vsphere-ray.yml.j2 +1 -0
  323. sky/templates/websocket_proxy.py +188 -43
  324. sky/usage/usage_lib.py +16 -4
  325. sky/users/model.conf +1 -1
  326. sky/users/permission.py +84 -44
  327. sky/users/rbac.py +31 -3
  328. sky/utils/accelerator_registry.py +6 -3
  329. sky/utils/admin_policy_utils.py +18 -5
  330. sky/utils/annotations.py +128 -6
  331. sky/utils/asyncio_utils.py +78 -0
  332. sky/utils/atomic.py +1 -1
  333. sky/utils/auth_utils.py +153 -0
  334. sky/utils/cli_utils/status_utils.py +12 -7
  335. sky/utils/cluster_utils.py +28 -6
  336. sky/utils/command_runner.py +283 -30
  337. sky/utils/command_runner.pyi +63 -7
  338. sky/utils/common.py +3 -1
  339. sky/utils/common_utils.py +55 -7
  340. sky/utils/config_utils.py +1 -14
  341. sky/utils/context.py +127 -40
  342. sky/utils/context_utils.py +73 -18
  343. sky/utils/controller_utils.py +229 -70
  344. sky/utils/db/db_utils.py +95 -18
  345. sky/utils/db/kv_cache.py +149 -0
  346. sky/utils/db/migration_utils.py +24 -7
  347. sky/utils/env_options.py +4 -0
  348. sky/utils/git.py +559 -1
  349. sky/utils/kubernetes/create_cluster.sh +15 -30
  350. sky/utils/kubernetes/delete_cluster.sh +10 -7
  351. sky/utils/kubernetes/generate_kind_config.py +6 -66
  352. sky/utils/kubernetes/gpu_labeler.py +13 -3
  353. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
  354. sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
  355. sky/utils/kubernetes/kubernetes_deploy_utils.py +187 -260
  356. sky/utils/kubernetes/rsync_helper.sh +11 -3
  357. sky/utils/kubernetes/ssh-tunnel.sh +7 -376
  358. sky/utils/kubernetes_enums.py +7 -15
  359. sky/utils/lock_events.py +4 -4
  360. sky/utils/locks.py +128 -31
  361. sky/utils/log_utils.py +0 -319
  362. sky/utils/resource_checker.py +13 -10
  363. sky/utils/resources_utils.py +53 -29
  364. sky/utils/rich_utils.py +8 -4
  365. sky/utils/schemas.py +138 -52
  366. sky/utils/subprocess_utils.py +17 -4
  367. sky/utils/thread_utils.py +91 -0
  368. sky/utils/timeline.py +2 -1
  369. sky/utils/ux_utils.py +35 -1
  370. sky/utils/volume.py +88 -4
  371. sky/utils/yaml_utils.py +9 -0
  372. sky/volumes/client/sdk.py +48 -10
  373. sky/volumes/server/core.py +59 -22
  374. sky/volumes/server/server.py +46 -17
  375. sky/volumes/volume.py +54 -42
  376. sky/workspaces/core.py +57 -21
  377. sky/workspaces/server.py +13 -12
  378. sky_templates/README.md +3 -0
  379. sky_templates/__init__.py +3 -0
  380. sky_templates/ray/__init__.py +0 -0
  381. sky_templates/ray/start_cluster +183 -0
  382. sky_templates/ray/stop_cluster +75 -0
  383. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/METADATA +343 -65
  384. skypilot_nightly-1.0.0.dev20251210.dist-info/RECORD +629 -0
  385. skypilot_nightly-1.0.0.dev20251210.dist-info/top_level.txt +2 -0
  386. sky/client/cli/git.py +0 -549
  387. sky/dashboard/out/_next/static/chunks/1121-408ed10b2f9fce17.js +0 -1
  388. sky/dashboard/out/_next/static/chunks/1141-943efc7aff0f0c06.js +0 -1
  389. sky/dashboard/out/_next/static/chunks/1836-37fede578e2da5f8.js +0 -40
  390. sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +0 -1
  391. sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
  392. sky/dashboard/out/_next/static/chunks/3785.4872a2f3aa489880.js +0 -1
  393. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +0 -1
  394. sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +0 -21
  395. sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +0 -10
  396. sky/dashboard/out/_next/static/chunks/4725.10f7a9a5d3ea8208.js +0 -1
  397. sky/dashboard/out/_next/static/chunks/5339.3fda4a4010ff4e06.js +0 -51
  398. sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
  399. sky/dashboard/out/_next/static/chunks/649.b9d7f7d10c1b8c53.js +0 -45
  400. sky/dashboard/out/_next/static/chunks/6856-dca7962af4814e1b.js +0 -1
  401. sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
  402. sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +0 -6
  403. sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +0 -18
  404. sky/dashboard/out/_next/static/chunks/7669.1f5d9a402bf5cc42.js +0 -36
  405. sky/dashboard/out/_next/static/chunks/8969-0be3036bf86f8256.js +0 -1
  406. sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
  407. sky/dashboard/out/_next/static/chunks/9037-fa1737818d0a0969.js +0 -6
  408. sky/dashboard/out/_next/static/chunks/pages/_app-ce361c6959bc2001.js +0 -34
  409. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1cbba24bd1bd35f8.js +0 -16
  410. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-0b4b35dc1dfe046c.js +0 -16
  411. sky/dashboard/out/_next/static/chunks/pages/clusters-469814d711d63b1b.js +0 -1
  412. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-dd64309c3fe67ed2.js +0 -11
  413. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-07349868f7905d37.js +0 -16
  414. sky/dashboard/out/_next/static/chunks/pages/jobs-1f70d9faa564804f.js +0 -1
  415. sky/dashboard/out/_next/static/chunks/pages/users-018bf31cda52e11b.js +0 -1
  416. sky/dashboard/out/_next/static/chunks/pages/volumes-739726d6b823f532.js +0 -1
  417. sky/dashboard/out/_next/static/chunks/webpack-4fe903277b57b523.js +0 -1
  418. sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
  419. sky/dashboard/out/_next/static/mS-4qZPSkRuA1u-g2wQhg/_buildManifest.js +0 -1
  420. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  421. sky/utils/kubernetes/cleanup-tunnel.sh +0 -62
  422. sky/utils/kubernetes/deploy_remote_cluster.py +0 -1299
  423. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  424. skypilot_nightly-1.0.0.dev20250905.dist-info/RECORD +0 -547
  425. skypilot_nightly-1.0.0.dev20250905.dist-info/top_level.txt +0 -1
  426. /sky/dashboard/out/_next/static/{mS-4qZPSkRuA1u-g2wQhg → KYAhEFa3FTfq4JyKVgo-s}/_ssgManifest.js +0 -0
  427. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/WHEEL +0 -0
  428. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/entry_points.txt +0 -0
  429. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/licenses/LICENSE +0 -0
sky/backends/backend_utils.py
@@ -1,4 +1,5 @@
  """Util constants/functions for the backends."""
+ import asyncio
  from datetime import datetime
  import enum
  import fnmatch
@@ -6,20 +7,24 @@ import hashlib
  import os
  import pathlib
  import pprint
+ import queue as queue_lib
  import re
  import shlex
  import subprocess
  import sys
  import tempfile
+ import threading
  import time
  import typing
- from typing import (Any, Callable, Dict, List, Optional, Sequence, Set, Tuple,
- TypeVar, Union)
+ from typing import (Any, Callable, Dict, Iterator, List, Optional, Sequence,
+ Set, Tuple, TypeVar, Union)
  import uuid

+ import aiohttp
+ from aiohttp import ClientTimeout
+ from aiohttp import TCPConnector
  import colorama
  from packaging import version
- import psutil
  from typing_extensions import Literal

  import sky
@@ -43,10 +48,12 @@ from sky.server.requests import requests as requests_lib
  from sky.skylet import autostop_lib
  from sky.skylet import constants
  from sky.usage import usage_lib
+ from sky.utils import auth_utils
  from sky.utils import cluster_utils
  from sky.utils import command_runner
  from sky.utils import common
  from sky.utils import common_utils
+ from sky.utils import context as context_lib
  from sky.utils import context_utils
  from sky.utils import controller_utils
  from sky.utils import env_options
@@ -60,6 +67,7 @@ from sky.utils import subprocess_utils
  from sky.utils import tempstore
  from sky.utils import timeline
  from sky.utils import ux_utils
+ from sky.utils import volume as volume_utils
  from sky.utils import yaml_utils
  from sky.workspaces import core as workspaces_core

@@ -75,7 +83,6 @@ if typing.TYPE_CHECKING:
  from sky import task as task_lib
  from sky.backends import cloud_vm_ray_backend
  from sky.backends import local_docker_backend
- from sky.utils import volume as volume_lib
  else:
  yaml = adaptors_common.LazyImport('yaml')
  requests = adaptors_common.LazyImport('requests')
@@ -107,8 +114,12 @@ _LAUNCHED_RESERVED_WORKER_PATTERN = re.compile(
  # 10.133.0.5: ray.worker.default,
  _LAUNCHING_IP_PATTERN = re.compile(
  r'({}): ray[._]worker[._](?:default|reserved)'.format(IP_ADDR_REGEX))
+ SSH_CONNECTION_ERROR_PATTERN = re.compile(
+ r'^ssh:.*(timed out|connection refused)$', re.IGNORECASE)
  _SSH_CONNECTION_TIMED_OUT_PATTERN = re.compile(r'^ssh:.*timed out$',
  re.IGNORECASE)
+ K8S_PODS_NOT_FOUND_PATTERN = re.compile(r'.*(NotFound|pods .* not found).*',
+ re.IGNORECASE)
  _RAY_CLUSTER_NOT_FOUND_MESSAGE = 'Ray cluster is not found'
  WAIT_HEAD_NODE_IP_MAX_ATTEMPTS = 3

@@ -131,10 +142,24 @@ _CLUSTER_STATUS_CACHE_DURATION_SECONDS = 2

  CLUSTER_FILE_MOUNTS_LOCK_TIMEOUT_SECONDS = 10
  WORKSPACE_LOCK_TIMEOUT_SECONDS = 10
+ CLUSTER_TUNNEL_LOCK_TIMEOUT_SECONDS = 10.0

  # Remote dir that holds our runtime files.
  _REMOTE_RUNTIME_FILES_DIR = '~/.sky/.runtime_files'

+ # The maximum size of a command line arguments is 128 KB, i.e. the command
+ # executed with /bin/sh should be less than 128KB.
+ # https://github.com/torvalds/linux/blob/master/include/uapi/linux/binfmts.h
+ #
+ # If a user have very long run or setup commands, the generated command may
+ # exceed the limit, as we directly include scripts in job submission commands.
+ # If the command is too long, we instead write it to a file, rsync and execute
+ # it.
+ #
+ # We use 100KB as a threshold to be safe for other arguments that
+ # might be added during ssh.
+ _MAX_INLINE_SCRIPT_LENGTH = 100 * 1024
+
  _ENDPOINTS_RETRY_MESSAGE = ('If the cluster was recently started, '
  'please retry after a while.')

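Note: to make the 100 KB threshold above concrete, the diff also adds an is_command_length_over_limit() helper (next hunk) that measures the command only after quoting it twice with shlex, mirroring how the CommandRunner wraps commands before execution. A standalone sketch of that check with an illustrative oversized script:

    import shlex

    _MAX_INLINE_SCRIPT_LENGTH = 100 * 1024  # same 100 KB threshold as above

    def is_command_length_over_limit(command: str) -> bool:
        # Quote twice, as the CommandRunner will, before measuring the length.
        return len(shlex.quote(shlex.quote(command))) > _MAX_INLINE_SCRIPT_LENGTH

    long_script = "echo 'hello world'\n" * 8000  # illustrative generated run script
    print(is_command_length_over_limit(long_script))  # True: write to a file and rsync instead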
@@ -209,6 +234,21 @@ _RAY_YAML_KEYS_TO_REMOVE_FOR_HASH = [
  ('provider', 'availability_zone'),
  ]

+ _ACK_MESSAGE = 'ack'
+ _FORWARDING_FROM_MESSAGE = 'Forwarding from'
+
+
+ def is_command_length_over_limit(command: str) -> bool:
+ """Check if the length of the command exceeds the limit.
+
+ We calculate the length of the command after quoting the command twice as
+ when it is executed by the CommandRunner, the command will be quoted twice
+ to ensure the correctness, which will add significant length to the command.
+ """
+
+ quoted_length = len(shlex.quote(shlex.quote(command)))
+ return quoted_length > _MAX_INLINE_SCRIPT_LENGTH
+

  def is_ip(s: str) -> bool:
  """Returns whether this string matches IP_ADDR_REGEX."""
@@ -535,7 +575,7 @@ def get_expirable_clouds(
  # get all custom contexts
  contexts = kubernetes_utils.get_custom_config_k8s_contexts()
  # add remote_identity of each context if it exists
- remote_identities = None
+ remote_identities: Optional[Union[str, List[Dict[str, str]]]] = None
  for context in contexts:
  context_remote_identity = skypilot_config.get_effective_region_config(
  cloud='kubernetes',
@@ -546,9 +586,11 @@
  if remote_identities is None:
  remote_identities = []
  if isinstance(context_remote_identity, str):
+ assert isinstance(remote_identities, list)
  remote_identities.append(
  {context: context_remote_identity})
  elif isinstance(context_remote_identity, list):
+ assert isinstance(remote_identities, list)
  remote_identities.extend(context_remote_identity)
  # add global kubernetes remote identity if it exists, if not, add default
  global_remote_identity = skypilot_config.get_effective_region_config(
@@ -560,8 +602,10 @@
  if remote_identities is None:
  remote_identities = []
  if isinstance(global_remote_identity, str):
+ assert isinstance(remote_identities, list)
  remote_identities.append({'*': global_remote_identity})
  elif isinstance(global_remote_identity, list):
+ assert isinstance(remote_identities, list)
  remote_identities.extend(global_remote_identity)
  if remote_identities is None:
  remote_identities = schemas.get_default_remote_identity(
@@ -589,6 +633,11 @@
  return expirable_clouds


+ def _get_volume_name(path: str, cluster_name_on_cloud: str) -> str:
+ path_hash = hashlib.md5(path.encode()).hexdigest()[:6]
+ return f'{cluster_name_on_cloud}-{path_hash}'
+
+
  # TODO: too many things happening here - leaky abstraction. Refactor.
  @timeline.event
  def write_cluster_config(
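Note: _get_volume_name() above derives a stable per-mount name by combining the cluster's name on the cloud with the first six hex characters of an MD5 hash of the mount path, so different ephemeral mounts on the same cluster get distinct names. Restated with an illustrative input:

    import hashlib

    def _get_volume_name(path: str, cluster_name_on_cloud: str) -> str:
        path_hash = hashlib.md5(path.encode()).hexdigest()[:6]
        return f'{cluster_name_on_cloud}-{path_hash}'

    # Deterministic for a given path, e.g. 'my-cluster-2ea4-<6 hex chars>'.
    print(_get_volume_name('/data', 'my-cluster-2ea4'))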
@@ -602,7 +651,7 @@
  zones: Optional[List[clouds.Zone]] = None,
  dryrun: bool = False,
  keep_launch_fields_in_existing_config: bool = True,
- volume_mounts: Optional[List['volume_lib.VolumeMount']] = None,
+ volume_mounts: Optional[List['volume_utils.VolumeMount']] = None,
  ) -> Dict[str, str]:
  """Fills in cluster configuration templates and writes them out.

@@ -705,11 +754,15 @@
  'is not supported by this cloud. Remove the config or set: '
  '`remote_identity: LOCAL_CREDENTIALS`.')
  if isinstance(cloud, clouds.Kubernetes):
- if skypilot_config.get_effective_region_config(
+ allowed_contexts = skypilot_config.get_workspace_cloud(
+ 'kubernetes').get('allowed_contexts', None)
+ if allowed_contexts is None:
+ allowed_contexts = skypilot_config.get_effective_region_config(
  cloud='kubernetes',
  region=None,
  keys=('allowed_contexts',),
- default_value=None) is None:
+ default_value=None)
+ if allowed_contexts is None:
  excluded_clouds.add(cloud)
  else:
  excluded_clouds.add(cloud)
@@ -733,7 +786,7 @@
  assert k not in credentials, f'{k} already in credentials'
  credentials[k] = v

- private_key_path, _ = auth.get_or_generate_keys()
+ private_key_path, _ = auth_utils.get_or_generate_keys()
  auth_config = {'ssh_private_key': private_key_path}
  region_name = resources_vars.get('region')

@@ -767,6 +820,55 @@
  assert region_name in ssh_proxy_command_config, (
  region_name, ssh_proxy_command_config)
  ssh_proxy_command = ssh_proxy_command_config[region_name]
+
+ use_internal_ips = skypilot_config.get_effective_region_config(
+ cloud=str(cloud).lower(),
+ region=region.name,
+ keys=('use_internal_ips',),
+ default_value=False)
+ if isinstance(cloud, clouds.AWS):
+ # If the use_ssm flag is set to true, we use the ssm proxy command.
+ use_ssm = skypilot_config.get_effective_region_config(
+ cloud=str(cloud).lower(),
+ region=region.name,
+ keys=('use_ssm',),
+ default_value=None)
+
+ if use_ssm and ssh_proxy_command is not None:
+ raise exceptions.InvalidCloudConfigs(
+ 'use_ssm is set to true, but ssh_proxy_command '
+ f'is already set to {ssh_proxy_command!r}. Please remove '
+ 'ssh_proxy_command or set use_ssm to false.')
+
+ if use_internal_ips and ssh_proxy_command is None:
+ # Only if use_ssm is explicitly not set, we default to using SSM.
+ if use_ssm is None:
+ logger.warning(
+ f'{colorama.Fore.YELLOW}'
+ 'use_internal_ips is set to true, '
+ 'but ssh_proxy_command is not set. Defaulting to '
+ 'using SSM. Specify ssh_proxy_command to use a different '
+ 'https://docs.skypilot.co/en/latest/reference/config.html#'
+ f'aws.ssh_proxy_command.{colorama.Style.RESET_ALL}')
+ use_ssm = True
+
+ if use_ssm:
+ aws_profile = os.environ.get('AWS_PROFILE', None)
+ profile_str = f'--profile {aws_profile}' if aws_profile else ''
+ ip_address_filter = ('Name=private-ip-address,Values=%h'
+ if use_internal_ips else
+ 'Name=ip-address,Values=%h')
+ get_instance_id_command = 'aws ec2 describe-instances ' + \
+ f'--region {region_name} --filters {ip_address_filter} ' + \
+ '--query \"Reservations[].Instances[].InstanceId\" ' + \
+ f'{profile_str} --output text'
+ ssm_proxy_command = 'aws ssm start-session --target ' + \
+ f'\"$({get_instance_id_command})\" ' + \
+ f'--region {region_name} {profile_str} ' + \
+ '--document-name AWS-StartSSHSession ' + \
+ '--parameters portNumber=%p'
+ ssh_proxy_command = ssm_proxy_command
+ region_name = 'ssm-session'
  logger.debug(f'Using ssh_proxy_command: {ssh_proxy_command!r}')

  # User-supplied global instance tags from ~/.sky/config.yaml.
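Note: assembled, the SSM branch above produces a ProxyCommand in which ssh later substitutes %h and %p for the host and port. The sketch below only re-runs the same string construction with illustrative inputs (region us-west-2, no AWS_PROFILE, use_internal_ips enabled); it is not part of the package:

    # Mirrors the string assembly in the hunk above with example inputs.
    region_name = 'us-west-2'                                # illustrative
    profile_str = ''                                         # AWS_PROFILE not set
    ip_address_filter = 'Name=private-ip-address,Values=%h'  # use_internal_ips=True
    get_instance_id_command = ('aws ec2 describe-instances '
                               f'--region {region_name} --filters {ip_address_filter} '
                               '--query "Reservations[].Instances[].InstanceId" '
                               f'{profile_str} --output text')
    ssm_proxy_command = ('aws ssm start-session --target '
                         f'"$({get_instance_id_command})" '
                         f'--region {region_name} {profile_str} '
                         '--document-name AWS-StartSSHSession '
                         '--parameters portNumber=%p')
    print(ssm_proxy_command)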
@@ -783,12 +885,6 @@
  if to_provision.labels:
  labels.update(to_provision.labels)

- # Dump the Ray ports to a file for Ray job submission
- dump_port_command = (
- f'{constants.SKY_PYTHON_CMD} -c \'import json, os; json.dump({constants.SKY_REMOTE_RAY_PORT_DICT_STR}, '
- f'open(os.path.expanduser("{constants.SKY_REMOTE_RAY_PORT_FILE}"), "w", encoding="utf-8"))\''
- )
-
  # We disable conda auto-activation if the user has specified a docker image
  # to use, which is likely to already have a conda environment activated.
  conda_auto_activate = ('true' if to_provision.extract_docker_image() is None
@@ -804,14 +900,24 @@
  cluster_name)

  volume_mount_vars = []
+ ephemeral_volume_mount_vars = []
  if volume_mounts is not None:
  for vol in volume_mounts:
- volume_mount_vars.append({
- 'name': vol.volume_name,
- 'path': vol.path,
- 'volume_name_on_cloud': vol.volume_config.name_on_cloud,
- 'volume_id_on_cloud': vol.volume_config.id_on_cloud,
- })
+ if vol.is_ephemeral:
+ volume_name = _get_volume_name(vol.path, cluster_name_on_cloud)
+ vol.volume_name = volume_name
+ vol.volume_config.cloud = repr(cloud)
+ vol.volume_config.region = region.name
+ vol.volume_config.name = volume_name
+ ephemeral_volume_mount_vars.append(vol.to_yaml_config())
+ else:
+ volume_info = volume_utils.VolumeInfo(
+ name=vol.volume_name,
+ path=vol.path,
+ volume_name_on_cloud=vol.volume_config.name_on_cloud,
+ volume_id_on_cloud=vol.volume_config.id_on_cloud,
+ )
+ volume_mount_vars.append(volume_info)

  runcmd = skypilot_config.get_effective_region_config(
  cloud=str(to_provision.cloud).lower(),
@@ -865,6 +971,9 @@
  '{conda_auto_activate}',
  conda_auto_activate).replace('{is_custom_docker}',
  is_custom_docker),
+ # Currently only used by Slurm. For other clouds, it is
+ # already part of ray_skypilot_installation_commands
+ 'setup_sky_dirs_commands': constants.SETUP_SKY_DIRS_COMMANDS,
  'ray_skypilot_installation_commands':
  (constants.RAY_SKYPILOT_INSTALLATION_COMMANDS.replace(
  '{sky_wheel_hash}',
@@ -875,12 +984,14 @@
  '{sky_wheel_hash}',
  wheel_hash).replace('{cloud}',
  str(cloud).lower()),
+ 'copy_skypilot_templates_commands':
+ constants.COPY_SKYPILOT_TEMPLATES_COMMANDS,
  # Port of Ray (GCS server).
  # Ray's default port 6379 is conflicted with Redis.
  'ray_port': constants.SKY_REMOTE_RAY_PORT,
  'ray_dashboard_port': constants.SKY_REMOTE_RAY_DASHBOARD_PORT,
  'ray_temp_dir': constants.SKY_REMOTE_RAY_TEMPDIR,
- 'dump_port_command': dump_port_command,
+ 'dump_port_command': instance_setup.DUMP_RAY_PORTS,
  # Sky-internal constants.
  'sky_ray_cmd': constants.SKY_RAY_CMD,
  # pip install needs to have python env activated to make sure
@@ -917,9 +1028,10 @@

  # Volume mounts
  'volume_mounts': volume_mount_vars,
+ 'ephemeral_volume_mounts': ephemeral_volume_mount_vars,

- # runcmd to append to the cloud-init cloud config passed to the
- # machine's UserData. This is currently only used by AWS.
+ # runcmd to run before any of the SkyPilot runtime setup commands.
+ # This is currently only used by AWS and Kubernetes.
  'runcmd': runcmd,
  }),
  output_path=tmp_yaml_path)
@@ -974,9 +1086,9 @@
  with open(tmp_yaml_path, 'w', encoding='utf-8') as f:
  f.write(restored_yaml_content)

- # Read the cluster name from the tmp yaml file, to take the backward
- # compatbility restortion above into account.
- # TODO: remove this after 2 minor releases, 0.10.0.
+ # Read the cluster_name_on_cloud from the restored yaml. This is a hack to
+ # make sure that launching on the same cluster across multiple users works
+ # correctly. See #8232.
  yaml_config = yaml_utils.read_yaml(tmp_yaml_path)
  config_dict['cluster_name_on_cloud'] = yaml_config['cluster_name']

@@ -1025,17 +1137,21 @@ def _add_auth_to_cluster_config(cloud: clouds.Cloud, tmp_yaml_path: str):
  """
  config = yaml_utils.read_yaml(tmp_yaml_path)
  # Check the availability of the cloud type.
- if isinstance(cloud, (
+ if isinstance(
+ cloud,
+ (
  clouds.AWS,
  clouds.OCI,
  clouds.SCP,
+ # TODO(jwj): Handle Slurm-specific auth logic
+ clouds.Slurm,
  clouds.Vsphere,
  clouds.Cudo,
  clouds.Paperspace,
  clouds.Azure,
  clouds.DO,
  clouds.Nebius,
- )):
+ )):
  config = auth.configure_ssh_info(config)
  elif isinstance(cloud, clouds.GCP):
  config = auth.setup_gcp_authentication(config)
@@ -1053,6 +1169,12 @@ def _add_auth_to_cluster_config(cloud: clouds.Cloud, tmp_yaml_path: str):
  config = auth.setup_fluidstack_authentication(config)
  elif isinstance(cloud, clouds.Hyperbolic):
  config = auth.setup_hyperbolic_authentication(config)
+ elif isinstance(cloud, clouds.Shadeform):
+ config = auth.setup_shadeform_authentication(config)
+ elif isinstance(cloud, clouds.PrimeIntellect):
+ config = auth.setup_primeintellect_authentication(config)
+ elif isinstance(cloud, clouds.Seeweb):
+ config = auth.setup_seeweb_authentication(config)
  else:
  assert False, cloud
  yaml_utils.dump_yaml(tmp_yaml_path, config)
@@ -1155,7 +1277,6 @@ def _deterministic_cluster_yaml_hash(tmp_yaml_path: str) -> str:
  Rather than constructing the whole byte sequence, which may be quite large,
  we construct it incrementally by using hash.update() to add new bytes.
  """
-
  # Load the yaml contents so that we can directly remove keys.
  yaml_config = yaml_utils.read_yaml(tmp_yaml_path)
  for key_list in _RAY_YAML_KEYS_TO_REMOVE_FOR_HASH:
@@ -1738,6 +1859,32 @@ def check_network_connection():
  'Network seems down.')


+ async def async_check_network_connection():
+ """Check if the network connection is available.
+
+ Tolerates 3 retries as it is observed that connections can fail.
+ Uses aiohttp for async HTTP requests.
+ """
+ # Create a session with retry logic
+ timeout = ClientTimeout(total=15)
+ connector = TCPConnector(limit=1) # Limit to 1 connection at a time
+
+ async with aiohttp.ClientSession(timeout=timeout,
+ connector=connector) as session:
+ for i, ip in enumerate(_TEST_IP_LIST):
+ try:
+ async with session.head(ip) as response:
+ if response.status < 400: # Any 2xx or 3xx status is good
+ return
+ except (aiohttp.ClientError, asyncio.TimeoutError) as e:
+ if i == len(_TEST_IP_LIST) - 1:
+ raise exceptions.NetworkError(
+ 'Could not refresh the cluster. '
+ 'Network seems down.') from e
+ # If not the last IP, continue to try the next one
+ continue
+
+
  @timeline.event
  def check_owner_identity(cluster_name: str) -> None:
  """Check if current user is the same as the user who created the cluster.
@@ -1750,9 +1897,18 @@ def check_owner_identity(cluster_name: str) -> None:
  """
  if env_options.Options.SKIP_CLOUD_IDENTITY_CHECK.get():
  return
- record = global_user_state.get_cluster_from_name(cluster_name)
+ record = global_user_state.get_cluster_from_name(cluster_name,
+ include_user_info=False,
+ summary_response=True)
  if record is None:
  return
+ _check_owner_identity_with_record(cluster_name, record)
+
+
+ def _check_owner_identity_with_record(cluster_name: str,
+ record: Dict[str, Any]) -> None:
+ if env_options.Options.SKIP_CLOUD_IDENTITY_CHECK.get():
+ return
  handle = record['handle']
  if not isinstance(handle, backends.CloudVmRayResourceHandle):
  return
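Note: a minimal driver for the async_check_network_connection() coroutine added in an earlier hunk. The module path is assumed here (this diff appears to be sky/backends/backend_utils.py); the call raises sky.exceptions.NetworkError if none of the probe IPs respond:

    import asyncio

    from sky.backends import backend_utils  # assumed module path for this file

    async def main() -> None:
        await backend_utils.async_check_network_connection()
        print('network OK')

    asyncio.run(main())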
@@ -1837,8 +1993,10 @@ def tag_filter_for_cluster(cluster_name: str) -> Dict[str, str]:
  }


+ @context_utils.cancellation_guard
  def _query_cluster_status_via_cloud_api(
- handle: 'cloud_vm_ray_backend.CloudVmRayResourceHandle'
+ handle: 'cloud_vm_ray_backend.CloudVmRayResourceHandle',
+ retry_if_missing: bool,
  ) -> List[Tuple[status_lib.ClusterStatus, Optional[str]]]:
  """Returns the status of the cluster as a list of tuples corresponding
  to the node status and an optional reason string for said status.
@@ -1865,8 +2023,11 @@
  cloud_name = repr(handle.launched_resources.cloud)
  try:
  node_status_dict = provision_lib.query_instances(
- cloud_name, cluster_name, cluster_name_on_cloud,
- provider_config)
+ cloud_name,
+ cluster_name,
+ cluster_name_on_cloud,
+ provider_config,
+ retry_if_missing=retry_if_missing)
  logger.debug(f'Querying {cloud_name} cluster '
  f'{cluster_name_in_hint} '
  f'status:\n{pprint.pformat(node_status_dict)}')
@@ -2044,7 +2205,12 @@ def check_can_clone_disk_and_override_task(
  return task, handle


- def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
+ def _update_cluster_status(
+ cluster_name: str,
+ record: Dict[str, Any],
+ retry_if_missing: bool,
+ include_user_info: bool = True,
+ summary_response: bool = False) -> Optional[Dict[str, Any]]:
  """Update the cluster status.

  The cluster status is updated by checking ray cluster and real status from
@@ -2071,9 +2237,6 @@ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
  fetched from the cloud provider or there are leaked nodes causing
  the node number larger than expected.
  """
- record = global_user_state.get_cluster_from_name(cluster_name)
- if record is None:
- return None
  handle = record['handle']
  if handle.cluster_yaml is None:
  # Remove cluster from db since this cluster does not have a config file
@@ -2092,7 +2255,8 @@ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
  return record
  cluster_name = handle.cluster_name

- node_statuses = _query_cluster_status_via_cloud_api(handle)
+ node_statuses = _query_cluster_status_via_cloud_api(
+ handle, retry_if_missing=retry_if_missing)

  all_nodes_up = (all(status[0] == status_lib.ClusterStatus.UP
  for status in node_statuses) and
@@ -2140,6 +2304,11 @@ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
  total_nodes = handle.launched_nodes * handle.num_ips_per_node

  cloud_name = repr(handle.launched_resources.cloud).lower()
+ # Initialize variables in case all retries fail
+ ready_head = 0
+ ready_workers = 0
+ output = ''
+ stderr = ''
  for i in range(5):
  try:
  ready_head, ready_workers, output, stderr = (
@@ -2228,7 +2397,12 @@ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
  # remain healthy for a while before the cloud completely preempts the VMs.
  # We have mitigated this by again first querying the VM state from the cloud
  # provider.
- if all_nodes_up and run_ray_status_to_check_ray_cluster_healthy():
+ cloud = handle.launched_resources.cloud
+
+ # For Slurm, skip Ray health check since it doesn't use Ray.
+ should_check_ray = cloud is not None and cloud.uses_ray()
+ if all_nodes_up and (not should_check_ray or
+ run_ray_status_to_check_ray_cluster_healthy()):
  # NOTE: all_nodes_up calculation is fast due to calling cloud CLI;
  # run_ray_status_to_check_all_nodes_up() is slow due to calling `ray get
  # head-ip/worker-ips`.
@@ -2240,12 +2414,17 @@ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
  'All nodes up; SkyPilot runtime healthy.',
  global_user_state.ClusterEventType.STATUS_CHANGE,
  nop_if_duplicate=True)
- global_user_state.add_or_update_cluster(cluster_name,
- handle,
- requested_resources=None,
- ready=True,
- is_launch=False)
- return global_user_state.get_cluster_from_name(cluster_name)
+ global_user_state.add_or_update_cluster(
+ cluster_name,
+ handle,
+ requested_resources=None,
+ ready=True,
+ is_launch=False,
+ existing_cluster_hash=record['cluster_hash'])
+ return global_user_state.get_cluster_from_name(
+ cluster_name,
+ include_user_info=include_user_info,
+ summary_response=summary_response)

  # All cases below are transitioning the cluster to non-UP states.
  launched_resources = handle.launched_resources.assert_launchable()
@@ -2262,7 +2441,8 @@ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
  # and check again. This is a best-effort leak prevention check.
  # See https://github.com/skypilot-org/skypilot/issues/4431.
  time.sleep(_LAUNCH_DOUBLE_CHECK_DELAY)
- node_statuses = _query_cluster_status_via_cloud_api(handle)
+ node_statuses = _query_cluster_status_via_cloud_api(
+ handle, retry_if_missing=False)
  # Note: even if all the node_statuses are UP now, we will still
  # consider this cluster abnormal, and its status will be INIT.

@@ -2450,12 +2630,17 @@ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
  global_user_state.ClusterEventType.STATUS_CHANGE,
  nop_if_duplicate=True,
  duplicate_regex=init_reason_regex)
- global_user_state.add_or_update_cluster(cluster_name,
- handle,
- requested_resources=None,
- ready=False,
- is_launch=False)
- return global_user_state.get_cluster_from_name(cluster_name)
+ global_user_state.add_or_update_cluster(
+ cluster_name,
+ handle,
+ requested_resources=None,
+ ready=False,
+ is_launch=False,
+ existing_cluster_hash=record['cluster_hash'])
+ return global_user_state.get_cluster_from_name(
+ cluster_name,
+ include_user_info=include_user_info,
+ summary_response=summary_response)
  # Now is_abnormal is False: either node_statuses is empty or all nodes are
  # STOPPED.
  verb = 'terminated' if to_terminate else 'stopped'
@@ -2470,7 +2655,10 @@ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
  nop_if_duplicate=True,
  )
  backend.post_teardown_cleanup(handle, terminate=to_terminate, purge=False)
- return global_user_state.get_cluster_from_name(cluster_name)
+ return global_user_state.get_cluster_from_name(
+ cluster_name,
+ include_user_info=include_user_info,
+ summary_response=summary_response)


  def _must_refresh_cluster_status(
@@ -2492,12 +2680,14 @@


  def refresh_cluster_record(
- cluster_name: str,
- *,
- force_refresh_statuses: Optional[Set[status_lib.ClusterStatus]] = None,
- acquire_per_cluster_status_lock: bool = True,
- cluster_status_lock_timeout: int = CLUSTER_STATUS_LOCK_TIMEOUT_SECONDS
- ) -> Optional[Dict[str, Any]]:
+ cluster_name: str,
+ *,
+ force_refresh_statuses: Optional[Set[status_lib.ClusterStatus]] = None,
+ cluster_lock_already_held: bool = False,
+ cluster_status_lock_timeout: int = CLUSTER_STATUS_LOCK_TIMEOUT_SECONDS,
+ include_user_info: bool = True,
+ summary_response: bool = False,
+ retry_if_missing: bool = True) -> Optional[Dict[str, Any]]:
  """Refresh the cluster, and return the possibly updated record.

  The function will update the cached cluster status in the global state. For
@@ -2514,14 +2704,20 @@ def refresh_cluster_record(
  _CLUSTER_STATUS_CACHE_DURATION_SECONDS old, and one of:
  1. the cluster is a spot cluster, or
  2. cluster autostop is set and the cluster is not STOPPED.
- acquire_per_cluster_status_lock: Whether to acquire the per-cluster lock
- before updating the status. Even if this is True, the lock may not be
- acquired if the status does not need to be refreshed.
+ cluster_lock_already_held: Whether the caller is already holding the
+ per-cluster lock. You MUST NOT set this to True if the caller does not
+ already hold the lock. If True, we will not acquire the lock before
+ updating the status. Failing to hold the lock while updating the
+ status can lead to correctness issues - e.g. an launch in-progress may
+ appear to be DOWN incorrectly. Even if this is set to False, the lock
+ may not be acquired if the status does not need to be refreshed.
  cluster_status_lock_timeout: The timeout to acquire the per-cluster
  lock. If timeout, the function will use the cached status. If the
  value is <0, do not timeout (wait for the lock indefinitely). By
  default, this is set to CLUSTER_STATUS_LOCK_TIMEOUT_SECONDS. Warning:
  if correctness is required, you must set this to -1.
+ retry_if_missing: Whether to retry the call to the cloud api if the
+ cluster is not found when querying the live status on the cloud.

  Returns:
  If the cluster is terminated or does not exist, return None.
@@ -2537,17 +2733,20 @@
  the node number larger than expected.
  """

- record = global_user_state.get_cluster_from_name(cluster_name)
+ ctx = context_lib.get()
+ record = global_user_state.get_cluster_from_name(
+ cluster_name,
+ include_user_info=include_user_info,
+ summary_response=summary_response)
  if record is None:
  return None
  # TODO(zhwu, 05/20): switch to the specific workspace to make sure we are
  # using the correct cloud credentials.
  workspace = record.get('workspace', constants.SKYPILOT_DEFAULT_WORKSPACE)
  with skypilot_config.local_active_workspace_ctx(workspace):
- check_owner_identity(cluster_name)
-
- if not isinstance(record['handle'], backends.CloudVmRayResourceHandle):
- return record
+ # check_owner_identity returns if the record handle is
+ # not a CloudVmRayResourceHandle
+ _check_owner_identity_with_record(cluster_name, record)

  # The loop logic allows us to notice if the status was updated in the
  # global_user_state by another process and stop trying to get the lock.
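Note: given the widened signature above, a typical status query that does not already hold the per-cluster lock would look roughly like the sketch below (values illustrative; module path assumed; not part of the package):

    from sky.backends import backend_utils  # assumed module path

    record = backend_utils.refresh_cluster_record(
        'my-cluster',
        force_refresh_statuses=None,      # reuse the cached status when fresh enough
        cluster_lock_already_held=False,  # let the function take the per-cluster lock
        include_user_info=False,          # trimmed record, as refresh_cluster_status_handle uses below
        summary_response=True,
        retry_if_missing=True)
    if record is not None:
        print(record['status'], record['handle'])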
@@ -2556,12 +2755,18 @@

  # Loop until we have an up-to-date status or until we acquire the lock.
  while True:
+ # Check if the context is canceled.
+ if ctx is not None and ctx.is_canceled():
+ raise asyncio.CancelledError()
  # Check to see if we can return the cached status.
  if not _must_refresh_cluster_status(record, force_refresh_statuses):
  return record

- if not acquire_per_cluster_status_lock:
- return _update_cluster_status(cluster_name)
+ if cluster_lock_already_held:
+ return _update_cluster_status(cluster_name, record,
+ retry_if_missing,
+ include_user_info,
+ summary_response)

  # Try to acquire the lock so we can fetch the status.
  try:
@@ -2569,12 +2774,17 @@
  # Check the cluster status again, since it could have been
  # updated between our last check and acquiring the lock.
  record = global_user_state.get_cluster_from_name(
- cluster_name)
+ cluster_name,
+ include_user_info=include_user_info,
+ summary_response=summary_response)
  if record is None or not _must_refresh_cluster_status(
  record, force_refresh_statuses):
  return record
  # Update and return the cluster status.
- return _update_cluster_status(cluster_name)
+ return _update_cluster_status(cluster_name, record,
+ retry_if_missing,
+ include_user_info,
+ summary_response)

  except locks.LockTimeout:
  # lock.acquire() will throw a Timeout exception if the lock is not
@@ -2592,10 +2802,13 @@
  'Refreshing status: Failed get the lock for cluster '
  f'{cluster_name!r}. Using the cached status.')
  return record
- time.sleep(0.05)
+ time.sleep(lock.poll_interval)

  # Refresh for next loop iteration.
- record = global_user_state.get_cluster_from_name(cluster_name)
+ record = global_user_state.get_cluster_from_name(
+ cluster_name,
+ include_user_info=include_user_info,
+ summary_response=summary_response)
  if record is None:
  return None

@@ -2606,8 +2819,9 @@
  cluster_name: str,
  *,
  force_refresh_statuses: Optional[Set[status_lib.ClusterStatus]] = None,
- acquire_per_cluster_status_lock: bool = True,
- cluster_status_lock_timeout: int = CLUSTER_STATUS_LOCK_TIMEOUT_SECONDS
+ cluster_lock_already_held: bool = False,
+ cluster_status_lock_timeout: int = CLUSTER_STATUS_LOCK_TIMEOUT_SECONDS,
+ retry_if_missing: bool = True,
  ) -> Tuple[Optional[status_lib.ClusterStatus],
  Optional[backends.ResourceHandle]]:
  """Refresh the cluster, and return the possibly updated status and handle.
@@ -2619,8 +2833,11 @@ def refresh_cluster_status_handle(
  record = refresh_cluster_record(
  cluster_name,
  force_refresh_statuses=force_refresh_statuses,
- acquire_per_cluster_status_lock=acquire_per_cluster_status_lock,
- cluster_status_lock_timeout=cluster_status_lock_timeout)
+ cluster_lock_already_held=cluster_lock_already_held,
+ cluster_status_lock_timeout=cluster_status_lock_timeout,
+ include_user_info=False,
+ summary_response=True,
+ retry_if_missing=retry_if_missing)
  if record is None:
  return None, None
  return record['status'], record['handle']
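For readers following the hunks above, the reworked control flow of the record refresh reduces to the pattern below. This is a minimal standalone sketch, not SkyPilot code: `load_record`, `needs_refresh`, `do_refresh`, and the lock callbacks are hypothetical stand-ins for `global_user_state.get_cluster_from_name`, `_must_refresh_cluster_status`, `_update_cluster_status`, and the per-cluster lock with its `poll_interval`.

    import time
    from typing import Any, Callable, Dict, Optional


    def refresh_with_lock(
            load_record: Callable[[], Optional[Dict[str, Any]]],
            needs_refresh: Callable[[Dict[str, Any]], bool],
            do_refresh: Callable[[Dict[str, Any]], Optional[Dict[str, Any]]],
            try_acquire_lock: Callable[[], bool],
            release_lock: Callable[[], None],
            lock_already_held: bool = False,
            poll_interval: float = 0.05,
            timeout: float = 10.0) -> Optional[Dict[str, Any]]:
        """Return an up-to-date record, refreshing under a per-cluster lock."""
        deadline = time.monotonic() + timeout
        record = load_record()
        while record is not None:
            if not needs_refresh(record):
                return record                  # Cached status is good enough.
            if lock_already_held:
                return do_refresh(record)      # Caller already owns the lock.
            if try_acquire_lock():
                try:
                    # Re-read: another process may have refreshed meanwhile.
                    record = load_record()
                    if record is None or not needs_refresh(record):
                        return record
                    return do_refresh(record)
                finally:
                    release_lock()
            if time.monotonic() > deadline:
                return record                  # Give up; use the cached status.
            time.sleep(poll_interval)
            record = load_record()             # Refresh for the next iteration.
        return None

The key property mirrored here is that a lock timeout degrades to returning the cached record rather than failing the call.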
@@ -2671,7 +2888,9 @@ def check_cluster_available(
  exceptions.CloudUserIdentityError: if we fail to get the current user
  identity.
  """
- record = global_user_state.get_cluster_from_name(cluster_name)
+ record = global_user_state.get_cluster_from_name(cluster_name,
+ include_user_info=False,
+ summary_response=True)
  if dryrun:
  assert record is not None, cluster_name
  return record['handle']
@@ -2858,7 +3077,8 @@ def is_controller_accessible(
  f'fatal, but {controller_name} commands/calls may hang or return '
  'stale information, when the controller is not up.\n'
  f' Details: {common_utils.format_exception(e, use_bracket=True)}')
- record = global_user_state.get_cluster_from_name(cluster_name)
+ record = global_user_state.get_cluster_from_name(
+ cluster_name, include_user_info=False, summary_response=True)
  if record is not None:
  controller_status, handle = record['status'], record['handle']
  # We check the connection even if the cluster has a cached status UP
@@ -2915,22 +3135,96 @@ class CloudFilter(enum.Enum):
  LOCAL = 'local'


- def _get_glob_clusters(clusters: List[str], silent: bool = False) -> List[str]:
+ def _get_glob_clusters(
+ clusters: List[str],
+ silent: bool = False,
+ workspaces_filter: Optional[Dict[str, Any]] = None) -> List[str]:
  """Returns a list of clusters that match the glob pattern."""
  glob_clusters = []
  for cluster in clusters:
- glob_cluster = global_user_state.get_glob_cluster_names(cluster)
+ glob_cluster = global_user_state.get_glob_cluster_names(
+ cluster, workspaces_filter=workspaces_filter)
  if len(glob_cluster) == 0 and not silent:
  logger.info(f'Cluster {cluster} not found.')
  glob_clusters.extend(glob_cluster)
  return list(set(glob_clusters))


+ def _refresh_cluster(
+ cluster_name: str,
+ force_refresh_statuses: Optional[Set[status_lib.ClusterStatus]],
+ include_user_info: bool = True,
+ summary_response: bool = False) -> Optional[Dict[str, Any]]:
+ try:
+ record = refresh_cluster_record(
+ cluster_name,
+ force_refresh_statuses=force_refresh_statuses,
+ cluster_lock_already_held=False,
+ include_user_info=include_user_info,
+ summary_response=summary_response)
+ except (exceptions.ClusterStatusFetchingError,
+ exceptions.CloudUserIdentityError,
+ exceptions.ClusterOwnerIdentityMismatchError) as e:
+ # Do not fail the entire refresh process. The caller will
+ # handle the 'UNKNOWN' status, and collect the errors into
+ # a table.
+ record = {'status': 'UNKNOWN', 'error': e}
+ return record
+
+
+ def refresh_cluster_records() -> None:
+ """Refreshes the status of all clusters, except managed clusters.
+
+ Used by the background status refresh daemon.
+ This function is a stripped-down version of get_clusters, with only the
+ bare bones refresh logic.
+
+ Returns:
+ None
+
+ Raises:
+ None
+ """
+ # We force to exclude managed clusters to avoid multiple sources
+ # manipulating them. For example, SkyServe assumes the replica manager
+ # is the only source of truth for the cluster status.
+ cluster_names = set(
+ global_user_state.get_cluster_names(exclude_managed_clusters=True))
+
+ # TODO(syang): we should try not to leak
+ # request info in backend_utils.py.
+ # Refactor this to use some other info to
+ # determine if a launch is in progress.
+ cluster_names_with_launch_request = {
+ request.cluster_name for request in requests_lib.get_request_tasks(
+ req_filter=requests_lib.RequestTaskFilter(
+ status=[requests_lib.RequestStatus.RUNNING],
+ include_request_names=['sky.launch'],
+ fields=['cluster_name']))
+ }
+ cluster_names_without_launch_request = (cluster_names -
+ cluster_names_with_launch_request)
+
+ def _refresh_cluster_record(cluster_name):
+ return _refresh_cluster(cluster_name,
+ force_refresh_statuses=set(
+ status_lib.ClusterStatus),
+ include_user_info=False,
+ summary_response=True)
+
+ if len(cluster_names_without_launch_request) > 0:
+ # Do not refresh the clusters that have an active launch request.
+ subprocess_utils.run_in_parallel(_refresh_cluster_record,
+ cluster_names_without_launch_request)
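The daemon-side filtering introduced here is essentially a set difference followed by a parallel map. A rough sketch under the assumption of a generic thread pool (the helper names are illustrative, not SkyPilot APIs):

    from concurrent.futures import ThreadPoolExecutor
    from typing import Callable, Set


    def refresh_idle_clusters(all_clusters: Set[str],
                              clusters_being_launched: Set[str],
                              refresh_one: Callable[[str], None],
                              max_workers: int = 8) -> None:
        """Refresh every cluster that has no in-flight launch request."""
        # Clusters with an active launch request are left alone so that the
        # launch path remains the single writer of their status.
        idle = all_clusters - clusters_being_launched
        if not idle:
            return
        with ThreadPoolExecutor(max_workers=max_workers) as pool:
            # list() forces evaluation so worker exceptions surface here.
            list(pool.map(refresh_one, idle))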
+
+
  def get_clusters(
  refresh: common.StatusRefreshMode,
  cluster_names: Optional[Union[str, List[str]]] = None,
  all_users: bool = True,
  include_credentials: bool = False,
+ summary_response: bool = False,
+ include_handle: bool = True,
  # Internal only:
  # pylint: disable=invalid-name
  _include_is_managed: bool = False,
@@ -2958,6 +3252,23 @@ def get_clusters(
  A list of cluster records. If the cluster does not exist or has been
  terminated, the record will be omitted from the returned list.
  """
+ accessible_workspaces = workspaces_core.get_workspaces()
+ if cluster_names is not None:
+ if isinstance(cluster_names, str):
+ cluster_names = [cluster_names]
+ non_glob_cluster_names = []
+ glob_cluster_names = []
+ for cluster_name in cluster_names:
+ if ux_utils.is_glob_pattern(cluster_name):
+ glob_cluster_names.append(cluster_name)
+ else:
+ non_glob_cluster_names.append(cluster_name)
+ cluster_names = non_glob_cluster_names
+ if glob_cluster_names:
+ cluster_names += _get_glob_clusters(
+ glob_cluster_names,
+ silent=True,
+ workspaces_filter=accessible_workspaces)

  exclude_managed_clusters = False
  if not (_include_is_managed or env_options.Options.SHOW_DEBUG_INFO.get()):
@@ -2965,34 +3276,24 @@ def get_clusters(
  user_hashes_filter = None
  if not all_users:
  user_hashes_filter = {common_utils.get_current_user().id}
- accessible_workspaces = workspaces_core.get_workspaces()
-
  records = global_user_state.get_clusters(
  exclude_managed_clusters=exclude_managed_clusters,
  user_hashes_filter=user_hashes_filter,
- workspaces_filter=accessible_workspaces)
+ workspaces_filter=accessible_workspaces,
+ cluster_names=cluster_names,
+ summary_response=summary_response)

  yellow = colorama.Fore.YELLOW
  bright = colorama.Style.BRIGHT
  reset = colorama.Style.RESET_ALL

  if cluster_names is not None:
- if isinstance(cluster_names, str):
- cluster_names = [cluster_names]
- cluster_names = _get_glob_clusters(cluster_names, silent=True)
- new_records = []
- not_exist_cluster_names = []
- for cluster_name in cluster_names:
- for record in records:
- if record['name'] == cluster_name:
- new_records.append(record)
- break
- else:
- not_exist_cluster_names.append(cluster_name)
- if not_exist_cluster_names:
- clusters_str = ', '.join(not_exist_cluster_names)
+ record_names = {record['name'] for record in records}
+ not_found_clusters = ux_utils.get_non_matched_query(
+ cluster_names, record_names)
+ if not_found_clusters:
+ clusters_str = ', '.join(not_found_clusters)
  logger.info(f'Cluster(s) not found: {bright}{clusters_str}{reset}.')
- records = new_records

  def _get_records_with_handle(
  records: List[Optional[Dict[str, Any]]]) -> List[Dict[str, Any]]:
@@ -3002,17 +3303,18 @@ def get_clusters(
  if record is not None and record['handle'] is not None
  ]

- def _update_records_with_resources_str(
+ def _update_records_with_handle_info(
  records: List[Optional[Dict[str, Any]]]) -> None:
  """Add resource str to record"""
  for record in _get_records_with_handle(records):
  handle = record['handle']
- record[
- 'resources_str'] = resources_utils.get_readable_resources_repr(
- handle, simplify=True)
- record[
- 'resources_str_full'] = resources_utils.get_readable_resources_repr(
- handle, simplify=False)
+ resource_str_simple, resource_str_full = (
+ resources_utils.get_readable_resources_repr(
+ handle, simplified_only=False))
+ record['resources_str'] = resource_str_simple
+ record['resources_str_full'] = resource_str_full
+ if not summary_response:
+ record['cluster_name_on_cloud'] = handle.cluster_name_on_cloud
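The hunk above replaces two formatting calls with a single call that returns both representations, then attaches the derived fields to the record. The shape of that augmentation is roughly the following sketch (illustrative only; the function and field names beyond those shown in the hunk are assumptions):

    from typing import Any, Dict, Tuple


    def add_handle_info(record: Dict[str, Any],
                        resources_repr: Tuple[str, str],
                        cluster_name_on_cloud: str,
                        summary_response: bool) -> None:
        """Attach display-oriented fields derived from the handle to a record."""
        short, full = resources_repr
        record['resources_str'] = short
        record['resources_str_full'] = full
        if not summary_response:
            # Full (non-summary) responses also expose the cloud-side name.
            record['cluster_name_on_cloud'] = cluster_name_on_cloud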

  def _update_records_with_credentials(
  records: List[Optional[Dict[str, Any]]]) -> None:
@@ -3036,9 +3338,17 @@ def get_clusters(
  expanded_private_key_path = os.path.expanduser(
  ssh_private_key_path)
  if not os.path.exists(expanded_private_key_path):
- auth.create_ssh_key_files_from_db(ssh_private_key_path)
+ success = auth_utils.create_ssh_key_files_from_db(
+ ssh_private_key_path)
+ if not success:
+ # If the ssh key files are not found, we do not
+ # update the record with credentials.
+ logger.debug(
+ f'SSH keys not found for cluster {record["name"]} '
+ f'at key path {ssh_private_key_path}')
+ continue
  else:
- private_key_path, _ = auth.get_or_generate_keys()
+ private_key_path, _ = auth_utils.get_or_generate_keys()
  expanded_private_key_path = os.path.expanduser(private_key_path)
  if expanded_private_key_path in cached_private_keys:
  credential['ssh_private_key_content'] = cached_private_keys[
@@ -3052,7 +3362,7 @@ def get_clusters(
  record['credentials'] = credential

  def _update_records_with_resources(
- records: List[Optional[Dict[str, Any]]]) -> None:
+ records: List[Optional[Dict[str, Any]]],) -> None:
  """Add the resources to the record."""
  for record in _get_records_with_handle(records):
  handle = record['handle']
@@ -3070,9 +3380,11 @@ def get_clusters(
  record['accelerators'] = (
  f'{handle.launched_resources.accelerators}'
  if handle.launched_resources.accelerators else None)
+ if not include_handle:
+ record.pop('handle', None)

- # Add auth_config to the records
- _update_records_with_resources_str(records)
+ # Add handle info to the records
+ _update_records_with_handle_info(records)
  if include_credentials:
  _update_records_with_credentials(records)
  if refresh == common.StatusRefreshMode.NONE:
@@ -3093,65 +3405,76 @@ def get_clusters(
  else:
  force_refresh_statuses = None

- def _refresh_cluster(cluster_name):
- # TODO(syang): we should try not to leak
- # request info in backend_utils.py.
- # Refactor this to use some other info to
- # determine if a launch is in progress.
- request = requests_lib.get_request_tasks(
- req_filter=requests_lib.RequestTaskFilter(
- status=[requests_lib.RequestStatus.RUNNING],
- cluster_names=[cluster_name],
- include_request_names=['sky.launch']))
- if len(request) > 0:
- # There is an active launch request on the cluster,
- # so we don't want to update the cluster status until
- # the request is completed.
- logger.debug(f'skipping refresh for cluster {cluster_name} '
- 'as there is an active launch request')
- return global_user_state.get_cluster_from_name(cluster_name)
- try:
- record = refresh_cluster_record(
- cluster_name,
- force_refresh_statuses=force_refresh_statuses,
- acquire_per_cluster_status_lock=True)
- _update_records_with_resources_str([record])
+ def _refresh_cluster_record(cluster_name):
+ record = _refresh_cluster(cluster_name,
+ force_refresh_statuses=force_refresh_statuses,
+ include_user_info=True,
+ summary_response=summary_response)
+ # record may be None if the cluster is deleted during refresh,
+ # e.g. all the Pods of a cluster on Kubernetes have been
+ # deleted before refresh.
+ if record is not None and 'error' not in record:
+ _update_records_with_handle_info([record])
  if include_credentials:
  _update_records_with_credentials([record])
- except (exceptions.ClusterStatusFetchingError,
- exceptions.CloudUserIdentityError,
- exceptions.ClusterOwnerIdentityMismatchError) as e:
- # Do not fail the entire refresh process. The caller will
- # handle the 'UNKNOWN' status, and collect the errors into
- # a table.
- record = {'status': 'UNKNOWN', 'error': e}
- progress.update(task, advance=1)
+ progress.update(task, advance=1)
  return record

  cluster_names = [record['name'] for record in records]
+ # TODO(syang): we should try not to leak
+ # request info in backend_utils.py.
+ # Refactor this to use some other info to
+ # determine if a launch is in progress.
+ cluster_names_with_launch_request = {
+ request.cluster_name for request in requests_lib.get_request_tasks(
+ req_filter=requests_lib.RequestTaskFilter(
+ status=[requests_lib.RequestStatus.RUNNING],
+ include_request_names=['sky.launch'],
+ cluster_names=cluster_names,
+ fields=['cluster_name']))
+ }
+ # Preserve the index of the cluster name as it appears on "records"
+ cluster_names_without_launch_request = [
+ (i, cluster_name)
+ for i, cluster_name in enumerate(cluster_names)
+ if cluster_name not in cluster_names_with_launch_request
+ ]
+ # for clusters that have an active launch request, we do not refresh the status
  updated_records = []
- if len(cluster_names) > 0:
+ if len(cluster_names_without_launch_request) > 0:
  with progress:
  updated_records = subprocess_utils.run_in_parallel(
- _refresh_cluster, cluster_names)
-
+ _refresh_cluster_record, [
+ cluster_name
+ for _, cluster_name in cluster_names_without_launch_request
+ ])
+ # Preserve the index of the cluster name as it appears on "records"
+ # before filtering for clusters being launched.
+ updated_records_dict: Dict[int, Optional[Dict[str, Any]]] = {
+ cluster_names_without_launch_request[i][0]: updated_records[i]
+ for i in range(len(cluster_names_without_launch_request))
+ }
  # Show information for removed clusters.
  kept_records = []
  autodown_clusters, remaining_clusters, failed_clusters = [], [], []
  for i, record in enumerate(records):
- if updated_records[i] is None:
+ if i not in updated_records_dict:
+ # record was not refreshed, keep the original record
+ kept_records.append(record)
+ continue
+ updated_record = updated_records_dict[i]
+ if updated_record is None:
  if record['to_down']:
- autodown_clusters.append(cluster_names[i])
+ autodown_clusters.append(record['name'])
  else:
- remaining_clusters.append(cluster_names[i])
- elif updated_records[i]['status'] == 'UNKNOWN':
- failed_clusters.append(
- (cluster_names[i], updated_records[i]['error']))
+ remaining_clusters.append(record['name'])
+ elif updated_record['status'] == 'UNKNOWN':
+ failed_clusters.append((record['name'], updated_record['error']))
  # Keep the original record if the status is unknown,
  # so that the user can still see the cluster.
  kept_records.append(record)
  else:
- kept_records.append(updated_records[i])

  if autodown_clusters:
  plural = 's' if len(autodown_clusters) > 1 else ''
+ kept_records.append(updated_record)
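The index bookkeeping in this hunk (refresh only a filtered subset in parallel, then merge results back in the original order) can be summarized by the following sketch. It is illustrative only; `skip`, `refresh_one`, and the use of ThreadPoolExecutor are assumptions standing in for the launch-request filter and `subprocess_utils.run_in_parallel`.

    from concurrent.futures import ThreadPoolExecutor
    from typing import Any, Callable, Dict, List, Optional

    Record = Dict[str, Any]


    def merge_parallel_refresh(records: List[Record],
                               skip: Callable[[Record], bool],
                               refresh_one: Callable[[str], Optional[Record]],
                               max_workers: int = 8) -> List[Record]:
        """Refresh a subset of records in parallel, keeping the original order."""
        # Records that should not be refreshed (e.g. a launch is in flight)
        # are kept as-is; everything else is refreshed concurrently.
        targets = [(i, r['name']) for i, r in enumerate(records) if not skip(r)]
        refreshed: Dict[int, Optional[Record]] = {}
        if targets:
            with ThreadPoolExecutor(max_workers=max_workers) as pool:
                results = list(
                    pool.map(refresh_one, [name for _, name in targets]))
            refreshed = {idx: res for (idx, _), res in zip(targets, results)}
        merged: List[Record] = []
        for i, record in enumerate(records):
            if i not in refreshed:
                merged.append(record)          # Not refreshed: keep original.
            elif refreshed[i] is not None:
                merged.append(refreshed[i])    # Refreshed successfully.
            # refreshed[i] is None: the cluster no longer exists; drop it.
        return merged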
@@ -3352,13 +3675,8 @@ def check_stale_runtime_on_remote(returncode: int, stderr: str,
  `stderr`. Typically due to the local client version just got updated, and
  the remote runtime is an older version.
  """
- pattern = re.compile(r'AttributeError: module \'sky\.(.*)\' has no '
- r'attribute \'(.*)\'')
  if returncode != 0:
- # TODO(zhwu): Backward compatibility for old SkyPilot runtime version on
- # the remote cluster. Remove this after 0.10.0 is released.
- attribute_error = re.findall(pattern, stderr)
- if attribute_error or 'SkyPilot runtime is too old' in stderr:
+ if 'SkyPilot runtime is too old' in stderr:
  with ux_utils.print_exception_no_traceback():
  raise RuntimeError(
  f'{colorama.Fore.RED}SkyPilot runtime needs to be updated '
@@ -3502,19 +3820,126 @@ def workspace_lock_id(workspace_name: str) -> str:
  return f'{workspace_name}_workspace'


+ def cluster_tunnel_lock_id(cluster_name: str) -> str:
+ """Get the lock ID for cluster tunnel operations."""
+ return f'{cluster_name}_ssh_tunnel'
+
+
+ def open_ssh_tunnel(head_runner: Union[command_runner.SSHCommandRunner,
+ command_runner.KubernetesCommandRunner],
+ port_forward: Tuple[int, int]) -> subprocess.Popen:
+ local_port, remote_port = port_forward
+ if isinstance(head_runner, command_runner.SSHCommandRunner):
+ # Disabling ControlMaster makes things easier to reason about
+ # with respect to resource management/ownership,
+ # as killing the process will close the tunnel too.
+ head_runner.disable_control_master = True
+ head_runner.port_forward_execute_remote_command = True
+
+ # The default connect_timeout of 1s is too short for
+ # connecting to clusters using a jump server.
+ # We use NON_INTERACTIVE mode to avoid allocating a pseudo-tty,
+ # which is counted towards non-idleness.
+ cmd: List[str] = head_runner.port_forward_command(
+ [(local_port, remote_port)],
+ connect_timeout=5,
+ ssh_mode=command_runner.SshMode.NON_INTERACTIVE)
+ if isinstance(head_runner, command_runner.SSHCommandRunner):
+ # cat so the command doesn't exit until we kill it
+ cmd += [f'"echo {_ACK_MESSAGE} && cat"']
+ cmd_str = ' '.join(cmd)
+ logger.debug(f'Running port forward command: {cmd_str}')
+ ssh_tunnel_proc = subprocess.Popen(cmd_str,
+ shell=True,
+ stdin=subprocess.PIPE,
+ stdout=subprocess.PIPE,
+ stderr=subprocess.PIPE,
+ start_new_session=True,
+ text=True)
+ # Wait until we receive an ack from the remote cluster or
+ # the SSH connection times out.
+ queue: queue_lib.Queue = queue_lib.Queue()
+ stdout_thread = threading.Thread(
+ target=lambda queue, stdout: queue.put(stdout.readline()),
+ args=(queue, ssh_tunnel_proc.stdout),
+ daemon=True)
+ stdout_thread.start()
+ while ssh_tunnel_proc.poll() is None:
+ try:
+ ack = queue.get_nowait()
+ except queue_lib.Empty:
+ ack = None
+ time.sleep(0.1)
+ continue
+ assert ack is not None
+ if isinstance(
+ head_runner,
+ command_runner.SSHCommandRunner) and ack == f'{_ACK_MESSAGE}\n':
+ break
+ elif isinstance(head_runner, command_runner.KubernetesCommandRunner
+ ) and _FORWARDING_FROM_MESSAGE in ack:
+ # On kind clusters, this error occurs if we make a request
+ # immediately after the port-forward is established on a new pod:
+ # "Unhandled Error" err="an error occurred forwarding ... -> 46590:
+ # failed to execute portforward in network namespace
+ # "/var/run/netns/cni-...": failed to connect to localhost:46590
+ # inside namespace "...", IPv4: dial tcp4 127.0.0.1:46590:
+ # connect: connection refused
+ # So we need to poll the port on the pod to check if it is open.
+ # We did not observe this with real Kubernetes clusters.
+ timeout = 5
+ port_check_cmd = (
+ # We install netcat in our ray-node container,
+ # so we can use it here.
+ # (See kubernetes-ray.yml.j2)
+ f'end=$((SECONDS+{timeout})); '
+ f'while ! nc -z -w 1 localhost {remote_port}; do '
+ 'if (( SECONDS >= end )); then exit 1; fi; '
+ 'sleep 0.1; '
+ 'done')
+ returncode, stdout, stderr = head_runner.run(port_check_cmd,
+ require_outputs=True,
+ stream_logs=False)
+ if returncode != 0:
+ try:
+ ssh_tunnel_proc.terminate()
+ ssh_tunnel_proc.wait(timeout=5)
+ except subprocess.TimeoutExpired:
+ ssh_tunnel_proc.kill()
+ ssh_tunnel_proc.wait()
+ finally:
+ error_msg = (f'Failed to check remote port {remote_port}')
+ if stdout:
+ error_msg += f'\n-- stdout --\n{stdout}\n'
+ raise exceptions.CommandError(returncode=returncode,
+ command=cmd_str,
+ error_msg=error_msg,
+ detailed_reason=stderr)
+ break
+
+ if ssh_tunnel_proc.poll() is not None:
+ stdout, stderr = ssh_tunnel_proc.communicate()
+ error_msg = 'Port forward failed'
+ if stdout:
+ error_msg += f'\n-- stdout --\n{stdout}\n'
+ raise exceptions.CommandError(returncode=ssh_tunnel_proc.returncode,
+ command=cmd_str,
+ error_msg=error_msg,
+ detailed_reason=stderr)
+ return ssh_tunnel_proc
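The readiness check in `open_ssh_tunnel` follows a general pattern: spawn the tunnel process, read its stdout on a helper thread, and wait for an acknowledgment line while also watching for early exit. A minimal sketch of that pattern, detached from SkyPilot's runners (the process is assumed to be created with `stdout=subprocess.PIPE` and `text=True`, and only the first stdout line is inspected, mirroring the hunk above):

    import queue
    import subprocess
    import threading
    import time


    def wait_for_ack(proc: subprocess.Popen,
                     ack_message: str,
                     timeout: float = 30.0) -> bool:
        """Block until `proc` prints `ack_message`, exits, or times out.

        A reader thread is used because stdout.readline() blocks and cannot
        be polled together with proc.poll() from a single thread.
        """
        lines: queue.Queue = queue.Queue()
        reader = threading.Thread(
            target=lambda q, out: q.put(out.readline()),
            args=(lines, proc.stdout),
            daemon=True)
        reader.start()
        deadline = time.monotonic() + timeout
        while proc.poll() is None and time.monotonic() < deadline:
            try:
                line = lines.get_nowait()
            except queue.Empty:
                time.sleep(0.1)
                continue
            if ack_message in line:
                return True
        # The process exited or the deadline passed without an ack.
        return False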
+
+
  T = TypeVar('T')


- def invoke_skylet_with_retries(
- handle: 'cloud_vm_ray_backend.CloudVmRayResourceHandle',
- func: Callable[..., T]) -> T:
+ def invoke_skylet_with_retries(func: Callable[..., T]) -> T:
  """Generic helper for making Skylet gRPC requests.

  This method handles the common pattern of:
  1. Try the gRPC request
  2. If SSH tunnel is closed, recreate it and retry
  """
- max_attempts = 3
+ max_attempts = 5
  backoff = common_utils.Backoff(initial_backoff=0.5)
  last_exception: Optional[Exception] = None

@@ -3523,26 +3948,46 @@ def invoke_skylet_with_retries(
  return func()
  except grpc.RpcError as e:
  last_exception = e
- if e.code() == grpc.StatusCode.INTERNAL:
- with ux_utils.print_exception_no_traceback():
- raise exceptions.SkyletInternalError(e.details())
- elif e.code() == grpc.StatusCode.UNAVAILABLE:
- recreate_tunnel = True
- try:
- if handle.skylet_ssh_tunnel is not None:
- proc = psutil.Process(handle.skylet_ssh_tunnel.pid)
- if proc.is_running(
- ) and proc.status() != psutil.STATUS_ZOMBIE:
- recreate_tunnel = False
- except psutil.NoSuchProcess:
- pass
-
- if recreate_tunnel:
- handle.open_and_update_skylet_tunnel()
-
- time.sleep(backoff.current_backoff())
- else:
- raise e
+ _handle_grpc_error(e, backoff.current_backoff())
+
+ raise RuntimeError(
+ f'Failed to invoke Skylet after {max_attempts} attempts: {last_exception}'
+ ) from last_exception
+

- raise RuntimeError(f'Failed to invoke Skylet after {max_attempts} attempts'
- ) from last_exception
+ def invoke_skylet_streaming_with_retries(
+ stream_func: Callable[..., Iterator[T]]) -> Iterator[T]:
+ """Generic helper for making Skylet streaming gRPC requests."""
+ max_attempts = 3
+ backoff = common_utils.Backoff(initial_backoff=0.5)
+ last_exception: Optional[Exception] = None
+
+ for _ in range(max_attempts):
+ try:
+ for response in stream_func():
+ yield response
+ return
+ except grpc.RpcError as e:
+ last_exception = e
+ _handle_grpc_error(e, backoff.current_backoff())
+
+ raise RuntimeError(
+ f'Failed to stream Skylet response after {max_attempts} attempts'
+ ) from last_exception
+
+
+ def _handle_grpc_error(e: 'grpc.RpcError', current_backoff: float) -> None:
+ if e.code() == grpc.StatusCode.INTERNAL:
+ with ux_utils.print_exception_no_traceback():
+ raise exceptions.SkyletInternalError(e.details())
+ elif e.code() == grpc.StatusCode.UNAVAILABLE:
+ time.sleep(current_backoff)
+ elif e.code() == grpc.StatusCode.UNIMPLEMENTED or e.code(
+ ) == grpc.StatusCode.UNKNOWN:
+ # Handle backwards compatibility: old server doesn't implement this RPC.
+ # Let the caller fall back to legacy execution.
+ raise exceptions.SkyletMethodNotImplementedError(
+ f'gRPC method not implemented on server, falling back to legacy execution: {e.details()}'
+ )
+ else:
+ raise e
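The retry helpers above share a single structure: call, classify the error, back off on transient failures, and give up after a bounded number of attempts. A generic sketch of that structure follows; it is illustrative only, and `call_with_retries`/`is_retryable` are hypothetical names, not SkyPilot APIs.

    import time
    from typing import Callable, Optional, TypeVar

    T = TypeVar('T')


    def call_with_retries(func: Callable[[], T],
                          is_retryable: Callable[[Exception], bool],
                          max_attempts: int = 5,
                          initial_backoff: float = 0.5) -> T:
        """Retry `func` on retryable errors with exponential backoff."""
        backoff = initial_backoff
        last_exception: Optional[Exception] = None
        for _ in range(max_attempts):
            try:
                return func()
            except Exception as e:  # Classified below; non-retryable re-raised.
                last_exception = e
                if not is_retryable(e):
                    raise
                time.sleep(backoff)
                backoff *= 2
        raise RuntimeError(
            f'Failed after {max_attempts} attempts') from last_exception

With gRPC, `is_retryable` would typically treat only `grpc.StatusCode.UNAVAILABLE` as transient, matching the diff, where INTERNAL is surfaced immediately and UNIMPLEMENTED/UNKNOWN trigger a fallback to the legacy execution path instead of a retry.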