skypilot-nightly 1.0.0.dev20250905__py3-none-any.whl → 1.0.0.dev20251210__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (429)
  1. sky/__init__.py +12 -2
  2. sky/adaptors/aws.py +27 -22
  3. sky/adaptors/common.py +25 -2
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/gcp.py +11 -0
  7. sky/adaptors/ibm.py +5 -2
  8. sky/adaptors/kubernetes.py +64 -0
  9. sky/adaptors/nebius.py +3 -1
  10. sky/adaptors/primeintellect.py +1 -0
  11. sky/adaptors/seeweb.py +183 -0
  12. sky/adaptors/shadeform.py +89 -0
  13. sky/adaptors/slurm.py +478 -0
  14. sky/admin_policy.py +20 -0
  15. sky/authentication.py +157 -263
  16. sky/backends/__init__.py +3 -2
  17. sky/backends/backend.py +11 -3
  18. sky/backends/backend_utils.py +630 -185
  19. sky/backends/cloud_vm_ray_backend.py +1111 -928
  20. sky/backends/local_docker_backend.py +9 -5
  21. sky/backends/task_codegen.py +971 -0
  22. sky/backends/wheel_utils.py +18 -0
  23. sky/catalog/__init__.py +8 -3
  24. sky/catalog/aws_catalog.py +4 -0
  25. sky/catalog/common.py +19 -1
  26. sky/catalog/data_fetchers/fetch_aws.py +102 -80
  27. sky/catalog/data_fetchers/fetch_gcp.py +30 -3
  28. sky/catalog/data_fetchers/fetch_nebius.py +9 -6
  29. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  30. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  31. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  32. sky/catalog/kubernetes_catalog.py +36 -32
  33. sky/catalog/primeintellect_catalog.py +95 -0
  34. sky/catalog/runpod_catalog.py +5 -1
  35. sky/catalog/seeweb_catalog.py +184 -0
  36. sky/catalog/shadeform_catalog.py +165 -0
  37. sky/catalog/slurm_catalog.py +243 -0
  38. sky/check.py +87 -46
  39. sky/client/cli/command.py +1004 -434
  40. sky/client/cli/flags.py +4 -2
  41. sky/{volumes/utils.py → client/cli/table_utils.py} +111 -13
  42. sky/client/cli/utils.py +79 -0
  43. sky/client/common.py +12 -2
  44. sky/client/sdk.py +188 -65
  45. sky/client/sdk_async.py +34 -33
  46. sky/cloud_stores.py +82 -3
  47. sky/clouds/__init__.py +8 -0
  48. sky/clouds/aws.py +337 -129
  49. sky/clouds/azure.py +24 -18
  50. sky/clouds/cloud.py +47 -13
  51. sky/clouds/cudo.py +16 -13
  52. sky/clouds/do.py +9 -7
  53. sky/clouds/fluidstack.py +12 -5
  54. sky/clouds/gcp.py +14 -7
  55. sky/clouds/hyperbolic.py +12 -5
  56. sky/clouds/ibm.py +12 -5
  57. sky/clouds/kubernetes.py +80 -45
  58. sky/clouds/lambda_cloud.py +12 -5
  59. sky/clouds/nebius.py +23 -9
  60. sky/clouds/oci.py +19 -12
  61. sky/clouds/paperspace.py +4 -1
  62. sky/clouds/primeintellect.py +317 -0
  63. sky/clouds/runpod.py +85 -24
  64. sky/clouds/scp.py +12 -8
  65. sky/clouds/seeweb.py +477 -0
  66. sky/clouds/shadeform.py +400 -0
  67. sky/clouds/slurm.py +578 -0
  68. sky/clouds/ssh.py +6 -3
  69. sky/clouds/utils/scp_utils.py +61 -50
  70. sky/clouds/vast.py +43 -27
  71. sky/clouds/vsphere.py +14 -16
  72. sky/core.py +296 -195
  73. sky/dashboard/out/404.html +1 -1
  74. sky/dashboard/out/_next/static/KYAhEFa3FTfq4JyKVgo-s/_buildManifest.js +1 -0
  75. sky/dashboard/out/_next/static/chunks/1141-9c810f01ff4f398a.js +11 -0
  76. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
  77. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  78. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  79. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
  80. sky/dashboard/out/_next/static/chunks/3294.ddda8c6c6f9f24dc.js +1 -0
  81. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
  82. sky/dashboard/out/_next/static/chunks/{6601-06114c982db410b6.js → 3800-b589397dc09c5b4e.js} +1 -1
  83. sky/dashboard/out/_next/static/chunks/3850-fd5696f3bbbaddae.js +1 -0
  84. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
  85. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  86. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  87. sky/dashboard/out/_next/static/chunks/6856-da20c5fd999f319c.js +1 -0
  88. sky/dashboard/out/_next/static/chunks/6990-09cbf02d3cd518c3.js +1 -0
  89. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  90. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
  91. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  92. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
  93. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  94. sky/dashboard/out/_next/static/chunks/9353-8369df1cf105221c.js +1 -0
  95. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
  96. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  97. sky/dashboard/out/_next/static/chunks/pages/_app-68b647e26f9d2793.js +34 -0
  98. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-33f525539665fdfd.js +16 -0
  99. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a7565f586ef86467.js +1 -0
  100. sky/dashboard/out/_next/static/chunks/pages/clusters-9e5d47818b9bdadd.js +1 -0
  101. sky/dashboard/out/_next/static/chunks/pages/{config-dfb9bf07b13045f4.js → config-718cdc365de82689.js} +1 -1
  102. sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-6563820e094f68ca.js → [context]-12c559ec4d81fdbd.js} +1 -1
  103. sky/dashboard/out/_next/static/chunks/pages/{infra-aabba60d57826e0f.js → infra-d187cd0413d72475.js} +1 -1
  104. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-895847b6cf200b04.js +16 -0
  105. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-8d0f4655400b4eb9.js +21 -0
  106. sky/dashboard/out/_next/static/chunks/pages/jobs-e5a98f17f8513a96.js +1 -0
  107. sky/dashboard/out/_next/static/chunks/pages/plugins/[...slug]-4f46050ca065d8f8.js +1 -0
  108. sky/dashboard/out/_next/static/chunks/pages/users-2f7646eb77785a2c.js +1 -0
  109. sky/dashboard/out/_next/static/chunks/pages/volumes-ef19d49c6d0e8500.js +1 -0
  110. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-af76bb06dbb3954f.js → [name]-96e0f298308da7e2.js} +1 -1
  111. sky/dashboard/out/_next/static/chunks/pages/{workspaces-7598c33a746cdc91.js → workspaces-cb4da3abe08ebf19.js} +1 -1
  112. sky/dashboard/out/_next/static/chunks/webpack-fba3de387ff6bb08.js +1 -0
  113. sky/dashboard/out/_next/static/css/c5a4cfd2600fc715.css +3 -0
  114. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  115. sky/dashboard/out/clusters/[cluster].html +1 -1
  116. sky/dashboard/out/clusters.html +1 -1
  117. sky/dashboard/out/config.html +1 -1
  118. sky/dashboard/out/index.html +1 -1
  119. sky/dashboard/out/infra/[context].html +1 -1
  120. sky/dashboard/out/infra.html +1 -1
  121. sky/dashboard/out/jobs/[job].html +1 -1
  122. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  123. sky/dashboard/out/jobs.html +1 -1
  124. sky/dashboard/out/plugins/[...slug].html +1 -0
  125. sky/dashboard/out/users.html +1 -1
  126. sky/dashboard/out/volumes.html +1 -1
  127. sky/dashboard/out/workspace/new.html +1 -1
  128. sky/dashboard/out/workspaces/[name].html +1 -1
  129. sky/dashboard/out/workspaces.html +1 -1
  130. sky/data/data_utils.py +92 -1
  131. sky/data/mounting_utils.py +177 -30
  132. sky/data/storage.py +200 -19
  133. sky/data/storage_utils.py +10 -45
  134. sky/exceptions.py +18 -7
  135. sky/execution.py +74 -31
  136. sky/global_user_state.py +605 -191
  137. sky/jobs/__init__.py +2 -0
  138. sky/jobs/client/sdk.py +101 -4
  139. sky/jobs/client/sdk_async.py +31 -5
  140. sky/jobs/constants.py +15 -8
  141. sky/jobs/controller.py +726 -284
  142. sky/jobs/file_content_utils.py +128 -0
  143. sky/jobs/log_gc.py +193 -0
  144. sky/jobs/recovery_strategy.py +250 -100
  145. sky/jobs/scheduler.py +271 -173
  146. sky/jobs/server/core.py +367 -114
  147. sky/jobs/server/server.py +81 -35
  148. sky/jobs/server/utils.py +89 -35
  149. sky/jobs/state.py +1498 -620
  150. sky/jobs/utils.py +771 -306
  151. sky/logs/agent.py +40 -5
  152. sky/logs/aws.py +9 -19
  153. sky/metrics/utils.py +282 -39
  154. sky/models.py +2 -0
  155. sky/optimizer.py +7 -6
  156. sky/provision/__init__.py +38 -1
  157. sky/provision/aws/config.py +34 -13
  158. sky/provision/aws/instance.py +5 -2
  159. sky/provision/azure/instance.py +5 -3
  160. sky/provision/common.py +22 -0
  161. sky/provision/cudo/instance.py +4 -3
  162. sky/provision/do/instance.py +4 -3
  163. sky/provision/docker_utils.py +112 -28
  164. sky/provision/fluidstack/instance.py +6 -5
  165. sky/provision/gcp/config.py +6 -1
  166. sky/provision/gcp/instance.py +4 -2
  167. sky/provision/hyperbolic/instance.py +4 -2
  168. sky/provision/instance_setup.py +66 -20
  169. sky/provision/kubernetes/__init__.py +2 -0
  170. sky/provision/kubernetes/config.py +7 -44
  171. sky/provision/kubernetes/constants.py +0 -1
  172. sky/provision/kubernetes/instance.py +609 -213
  173. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  174. sky/provision/kubernetes/network.py +12 -8
  175. sky/provision/kubernetes/network_utils.py +8 -25
  176. sky/provision/kubernetes/utils.py +422 -422
  177. sky/provision/kubernetes/volume.py +150 -18
  178. sky/provision/lambda_cloud/instance.py +16 -13
  179. sky/provision/nebius/instance.py +6 -2
  180. sky/provision/nebius/utils.py +103 -86
  181. sky/provision/oci/instance.py +4 -2
  182. sky/provision/paperspace/instance.py +4 -3
  183. sky/provision/primeintellect/__init__.py +10 -0
  184. sky/provision/primeintellect/config.py +11 -0
  185. sky/provision/primeintellect/instance.py +454 -0
  186. sky/provision/primeintellect/utils.py +398 -0
  187. sky/provision/provisioner.py +45 -15
  188. sky/provision/runpod/__init__.py +2 -0
  189. sky/provision/runpod/instance.py +4 -3
  190. sky/provision/runpod/volume.py +69 -13
  191. sky/provision/scp/instance.py +307 -130
  192. sky/provision/seeweb/__init__.py +11 -0
  193. sky/provision/seeweb/config.py +13 -0
  194. sky/provision/seeweb/instance.py +812 -0
  195. sky/provision/shadeform/__init__.py +11 -0
  196. sky/provision/shadeform/config.py +12 -0
  197. sky/provision/shadeform/instance.py +351 -0
  198. sky/provision/shadeform/shadeform_utils.py +83 -0
  199. sky/provision/slurm/__init__.py +12 -0
  200. sky/provision/slurm/config.py +13 -0
  201. sky/provision/slurm/instance.py +572 -0
  202. sky/provision/slurm/utils.py +583 -0
  203. sky/provision/vast/instance.py +9 -4
  204. sky/provision/vast/utils.py +10 -6
  205. sky/provision/volume.py +164 -0
  206. sky/provision/vsphere/common/ssl_helper.py +1 -1
  207. sky/provision/vsphere/common/vapiconnect.py +2 -1
  208. sky/provision/vsphere/common/vim_utils.py +3 -2
  209. sky/provision/vsphere/instance.py +8 -6
  210. sky/provision/vsphere/vsphere_utils.py +8 -1
  211. sky/resources.py +11 -3
  212. sky/schemas/api/responses.py +107 -6
  213. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  214. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  215. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  216. sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
  217. sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
  218. sky/schemas/db/serve_state/002_yaml_content.py +34 -0
  219. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  220. sky/schemas/db/spot_jobs/002_cluster_pool.py +3 -3
  221. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  222. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  223. sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
  224. sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
  225. sky/schemas/generated/jobsv1_pb2.py +86 -0
  226. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  227. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  228. sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
  229. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  230. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  231. sky/schemas/generated/servev1_pb2.py +58 -0
  232. sky/schemas/generated/servev1_pb2.pyi +115 -0
  233. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  234. sky/serve/autoscalers.py +2 -0
  235. sky/serve/client/impl.py +55 -21
  236. sky/serve/constants.py +4 -3
  237. sky/serve/controller.py +17 -11
  238. sky/serve/load_balancing_policies.py +1 -1
  239. sky/serve/replica_managers.py +219 -142
  240. sky/serve/serve_rpc_utils.py +179 -0
  241. sky/serve/serve_state.py +63 -54
  242. sky/serve/serve_utils.py +145 -109
  243. sky/serve/server/core.py +46 -25
  244. sky/serve/server/impl.py +311 -162
  245. sky/serve/server/server.py +21 -19
  246. sky/serve/service.py +84 -68
  247. sky/serve/service_spec.py +45 -7
  248. sky/server/auth/loopback.py +38 -0
  249. sky/server/auth/oauth2_proxy.py +12 -7
  250. sky/server/common.py +47 -24
  251. sky/server/config.py +62 -28
  252. sky/server/constants.py +9 -1
  253. sky/server/daemons.py +109 -38
  254. sky/server/metrics.py +76 -96
  255. sky/server/middleware_utils.py +166 -0
  256. sky/server/plugins.py +222 -0
  257. sky/server/requests/executor.py +384 -145
  258. sky/server/requests/payloads.py +83 -19
  259. sky/server/requests/preconditions.py +15 -13
  260. sky/server/requests/request_names.py +123 -0
  261. sky/server/requests/requests.py +511 -157
  262. sky/server/requests/serializers/decoders.py +48 -17
  263. sky/server/requests/serializers/encoders.py +102 -20
  264. sky/server/requests/serializers/return_value_serializers.py +60 -0
  265. sky/server/requests/threads.py +117 -0
  266. sky/server/rest.py +116 -24
  267. sky/server/server.py +497 -179
  268. sky/server/server_utils.py +30 -0
  269. sky/server/stream_utils.py +219 -45
  270. sky/server/uvicorn.py +30 -19
  271. sky/setup_files/MANIFEST.in +6 -1
  272. sky/setup_files/alembic.ini +8 -0
  273. sky/setup_files/dependencies.py +64 -19
  274. sky/setup_files/setup.py +44 -44
  275. sky/sky_logging.py +13 -5
  276. sky/skylet/attempt_skylet.py +116 -24
  277. sky/skylet/configs.py +3 -1
  278. sky/skylet/constants.py +139 -29
  279. sky/skylet/events.py +74 -14
  280. sky/skylet/executor/__init__.py +1 -0
  281. sky/skylet/executor/slurm.py +189 -0
  282. sky/skylet/job_lib.py +143 -105
  283. sky/skylet/log_lib.py +252 -8
  284. sky/skylet/log_lib.pyi +47 -7
  285. sky/skylet/providers/ibm/node_provider.py +12 -8
  286. sky/skylet/providers/ibm/vpc_provider.py +13 -12
  287. sky/skylet/runtime_utils.py +21 -0
  288. sky/skylet/services.py +524 -0
  289. sky/skylet/skylet.py +27 -2
  290. sky/skylet/subprocess_daemon.py +104 -28
  291. sky/skypilot_config.py +99 -79
  292. sky/ssh_node_pools/constants.py +12 -0
  293. sky/ssh_node_pools/core.py +40 -3
  294. sky/ssh_node_pools/deploy/__init__.py +4 -0
  295. sky/ssh_node_pools/deploy/deploy.py +952 -0
  296. sky/ssh_node_pools/deploy/tunnel_utils.py +199 -0
  297. sky/ssh_node_pools/deploy/utils.py +173 -0
  298. sky/ssh_node_pools/server.py +20 -21
  299. sky/{utils/kubernetes/ssh_utils.py → ssh_node_pools/utils.py} +9 -6
  300. sky/task.py +221 -104
  301. sky/templates/aws-ray.yml.j2 +1 -0
  302. sky/templates/azure-ray.yml.j2 +1 -0
  303. sky/templates/cudo-ray.yml.j2 +1 -0
  304. sky/templates/do-ray.yml.j2 +1 -0
  305. sky/templates/fluidstack-ray.yml.j2 +1 -0
  306. sky/templates/gcp-ray.yml.j2 +1 -0
  307. sky/templates/hyperbolic-ray.yml.j2 +1 -0
  308. sky/templates/ibm-ray.yml.j2 +2 -1
  309. sky/templates/jobs-controller.yaml.j2 +3 -0
  310. sky/templates/kubernetes-ray.yml.j2 +204 -55
  311. sky/templates/lambda-ray.yml.j2 +1 -0
  312. sky/templates/nebius-ray.yml.j2 +3 -0
  313. sky/templates/oci-ray.yml.j2 +1 -0
  314. sky/templates/paperspace-ray.yml.j2 +1 -0
  315. sky/templates/primeintellect-ray.yml.j2 +72 -0
  316. sky/templates/runpod-ray.yml.j2 +1 -0
  317. sky/templates/scp-ray.yml.j2 +1 -0
  318. sky/templates/seeweb-ray.yml.j2 +171 -0
  319. sky/templates/shadeform-ray.yml.j2 +73 -0
  320. sky/templates/slurm-ray.yml.j2 +85 -0
  321. sky/templates/vast-ray.yml.j2 +2 -0
  322. sky/templates/vsphere-ray.yml.j2 +1 -0
  323. sky/templates/websocket_proxy.py +188 -43
  324. sky/usage/usage_lib.py +16 -4
  325. sky/users/model.conf +1 -1
  326. sky/users/permission.py +84 -44
  327. sky/users/rbac.py +31 -3
  328. sky/utils/accelerator_registry.py +6 -3
  329. sky/utils/admin_policy_utils.py +18 -5
  330. sky/utils/annotations.py +128 -6
  331. sky/utils/asyncio_utils.py +78 -0
  332. sky/utils/atomic.py +1 -1
  333. sky/utils/auth_utils.py +153 -0
  334. sky/utils/cli_utils/status_utils.py +12 -7
  335. sky/utils/cluster_utils.py +28 -6
  336. sky/utils/command_runner.py +283 -30
  337. sky/utils/command_runner.pyi +63 -7
  338. sky/utils/common.py +3 -1
  339. sky/utils/common_utils.py +55 -7
  340. sky/utils/config_utils.py +1 -14
  341. sky/utils/context.py +127 -40
  342. sky/utils/context_utils.py +73 -18
  343. sky/utils/controller_utils.py +229 -70
  344. sky/utils/db/db_utils.py +95 -18
  345. sky/utils/db/kv_cache.py +149 -0
  346. sky/utils/db/migration_utils.py +24 -7
  347. sky/utils/env_options.py +4 -0
  348. sky/utils/git.py +559 -1
  349. sky/utils/kubernetes/create_cluster.sh +15 -30
  350. sky/utils/kubernetes/delete_cluster.sh +10 -7
  351. sky/utils/kubernetes/generate_kind_config.py +6 -66
  352. sky/utils/kubernetes/gpu_labeler.py +13 -3
  353. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
  354. sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
  355. sky/utils/kubernetes/kubernetes_deploy_utils.py +187 -260
  356. sky/utils/kubernetes/rsync_helper.sh +11 -3
  357. sky/utils/kubernetes/ssh-tunnel.sh +7 -376
  358. sky/utils/kubernetes_enums.py +7 -15
  359. sky/utils/lock_events.py +4 -4
  360. sky/utils/locks.py +128 -31
  361. sky/utils/log_utils.py +0 -319
  362. sky/utils/resource_checker.py +13 -10
  363. sky/utils/resources_utils.py +53 -29
  364. sky/utils/rich_utils.py +8 -4
  365. sky/utils/schemas.py +138 -52
  366. sky/utils/subprocess_utils.py +17 -4
  367. sky/utils/thread_utils.py +91 -0
  368. sky/utils/timeline.py +2 -1
  369. sky/utils/ux_utils.py +35 -1
  370. sky/utils/volume.py +88 -4
  371. sky/utils/yaml_utils.py +9 -0
  372. sky/volumes/client/sdk.py +48 -10
  373. sky/volumes/server/core.py +59 -22
  374. sky/volumes/server/server.py +46 -17
  375. sky/volumes/volume.py +54 -42
  376. sky/workspaces/core.py +57 -21
  377. sky/workspaces/server.py +13 -12
  378. sky_templates/README.md +3 -0
  379. sky_templates/__init__.py +3 -0
  380. sky_templates/ray/__init__.py +0 -0
  381. sky_templates/ray/start_cluster +183 -0
  382. sky_templates/ray/stop_cluster +75 -0
  383. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/METADATA +343 -65
  384. skypilot_nightly-1.0.0.dev20251210.dist-info/RECORD +629 -0
  385. skypilot_nightly-1.0.0.dev20251210.dist-info/top_level.txt +2 -0
  386. sky/client/cli/git.py +0 -549
  387. sky/dashboard/out/_next/static/chunks/1121-408ed10b2f9fce17.js +0 -1
  388. sky/dashboard/out/_next/static/chunks/1141-943efc7aff0f0c06.js +0 -1
  389. sky/dashboard/out/_next/static/chunks/1836-37fede578e2da5f8.js +0 -40
  390. sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +0 -1
  391. sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
  392. sky/dashboard/out/_next/static/chunks/3785.4872a2f3aa489880.js +0 -1
  393. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +0 -1
  394. sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +0 -21
  395. sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +0 -10
  396. sky/dashboard/out/_next/static/chunks/4725.10f7a9a5d3ea8208.js +0 -1
  397. sky/dashboard/out/_next/static/chunks/5339.3fda4a4010ff4e06.js +0 -51
  398. sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
  399. sky/dashboard/out/_next/static/chunks/649.b9d7f7d10c1b8c53.js +0 -45
  400. sky/dashboard/out/_next/static/chunks/6856-dca7962af4814e1b.js +0 -1
  401. sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
  402. sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +0 -6
  403. sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +0 -18
  404. sky/dashboard/out/_next/static/chunks/7669.1f5d9a402bf5cc42.js +0 -36
  405. sky/dashboard/out/_next/static/chunks/8969-0be3036bf86f8256.js +0 -1
  406. sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
  407. sky/dashboard/out/_next/static/chunks/9037-fa1737818d0a0969.js +0 -6
  408. sky/dashboard/out/_next/static/chunks/pages/_app-ce361c6959bc2001.js +0 -34
  409. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1cbba24bd1bd35f8.js +0 -16
  410. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-0b4b35dc1dfe046c.js +0 -16
  411. sky/dashboard/out/_next/static/chunks/pages/clusters-469814d711d63b1b.js +0 -1
  412. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-dd64309c3fe67ed2.js +0 -11
  413. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-07349868f7905d37.js +0 -16
  414. sky/dashboard/out/_next/static/chunks/pages/jobs-1f70d9faa564804f.js +0 -1
  415. sky/dashboard/out/_next/static/chunks/pages/users-018bf31cda52e11b.js +0 -1
  416. sky/dashboard/out/_next/static/chunks/pages/volumes-739726d6b823f532.js +0 -1
  417. sky/dashboard/out/_next/static/chunks/webpack-4fe903277b57b523.js +0 -1
  418. sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
  419. sky/dashboard/out/_next/static/mS-4qZPSkRuA1u-g2wQhg/_buildManifest.js +0 -1
  420. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  421. sky/utils/kubernetes/cleanup-tunnel.sh +0 -62
  422. sky/utils/kubernetes/deploy_remote_cluster.py +0 -1299
  423. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  424. skypilot_nightly-1.0.0.dev20250905.dist-info/RECORD +0 -547
  425. skypilot_nightly-1.0.0.dev20250905.dist-info/top_level.txt +0 -1
  426. /sky/dashboard/out/_next/static/{mS-4qZPSkRuA1u-g2wQhg → KYAhEFa3FTfq4JyKVgo-s}/_ssgManifest.js +0 -0
  427. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/WHEEL +0 -0
  428. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/entry_points.txt +0 -0
  429. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/licenses/LICENSE +0 -0
sky/client/cli/command.py CHANGED
@@ -32,6 +32,7 @@ import shlex
  import shutil
  import subprocess
  import sys
+ import time
  import traceback
  import typing
  from typing import (Any, Callable, Dict, Generator, List, Optional, Set, Tuple,
@@ -59,8 +60,9 @@ from sky import task as task_lib
  from sky.adaptors import common as adaptors_common
  from sky.client import sdk
  from sky.client.cli import flags
- from sky.client.cli import git
- from sky.data import storage_utils
+ from sky.client.cli import table_utils
+ from sky.client.cli import utils as cli_utils
+ from sky.jobs.state import ManagedJobStatus
  from sky.provision.kubernetes import constants as kubernetes_constants
  from sky.provision.kubernetes import utils as kubernetes_utils
  from sky.schemas.api import responses
@@ -79,7 +81,6 @@ from sky.utils import controller_utils
  from sky.utils import dag_utils
  from sky.utils import directory_utils
  from sky.utils import env_options
- from sky.utils import git as git_utils
  from sky.utils import infra_utils
  from sky.utils import log_utils
  from sky.utils import registry
@@ -89,9 +90,9 @@ from sky.utils import status_lib
  from sky.utils import subprocess_utils
  from sky.utils import timeline
  from sky.utils import ux_utils
+ from sky.utils import volume as volume_utils
  from sky.utils import yaml_utils
  from sky.utils.cli_utils import status_utils
- from sky.volumes import utils as volumes_utils
  from sky.volumes.client import sdk as volumes_sdk

  if typing.TYPE_CHECKING:
@@ -113,6 +114,24 @@ an autogenerated name."""
  # command.
  _NUM_MANAGED_JOBS_TO_SHOW_IN_STATUS = 5
  _NUM_MANAGED_JOBS_TO_SHOW = 50
+ _NUM_REQUESTS_TO_SHOW = 50
+ _DEFAULT_REQUEST_FIELDS_TO_SHOW = [
+ 'request_id', 'name', 'user_id', 'status', 'created_at'
+ ]
+ _VERBOSE_REQUEST_FIELDS_TO_SHOW = _DEFAULT_REQUEST_FIELDS_TO_SHOW + [
+ 'cluster_name'
+ ]
+ _DEFAULT_MANAGED_JOB_FIELDS_TO_GET = [
+ 'job_id', 'task_id', 'workspace', 'job_name', 'task_name', 'resources',
+ 'submitted_at', 'end_at', 'job_duration', 'recovery_count', 'status', 'pool'
+ ]
+ _VERBOSE_MANAGED_JOB_FIELDS_TO_GET = _DEFAULT_MANAGED_JOB_FIELDS_TO_GET + [
+ 'current_cluster_name', 'job_id_on_pool_cluster', 'start_at', 'infra',
+ 'cloud', 'region', 'zone', 'cluster_resources', 'schedule_state', 'details',
+ 'failure_reason', 'metadata'
+ ]
+ _USER_NAME_FIELD = ['user_name']
+ _USER_HASH_FIELD = ['user_hash']

  _STATUS_PROPERTY_CLUSTER_NUM_ERROR_MESSAGE = (
  '{cluster_num} cluster{plural} {verb}. Please specify {cause} '
@@ -129,6 +148,7 @@ def _get_cluster_records_and_set_ssh_config(
  clusters: Optional[List[str]],
  refresh: common.StatusRefreshMode = common.StatusRefreshMode.NONE,
  all_users: bool = False,
+ verbose: bool = False,
  ) -> List[responses.StatusResponse]:
  """Returns a list of clusters that match the glob pattern.

@@ -146,23 +166,30 @@ def _get_cluster_records_and_set_ssh_config(
  request_id = sdk.status(clusters,
  refresh=refresh,
  all_users=all_users,
- _include_credentials=True)
+ _include_credentials=True,
+ _summary_response=not verbose)
  cluster_records = sdk.stream_and_get(request_id)
  # Update the SSH config for all clusters
  for record in cluster_records:
  handle = record['handle']
-
+ name = record['name']
  if not (handle is not None and handle.cached_external_ips is not None
  and 'credentials' in record):
  # If the cluster is not UP or does not have credentials available,
  # we need to remove the cluster from the SSH config.
- cluster_utils.SSHConfigHelper.remove_cluster(record['name'])
+ cluster_utils.SSHConfigHelper.remove_cluster(name)
+ continue
+ if not record['credentials']:
+ # The credential is missing for some reason, continue.
+ logger.debug(
+ f'Client did not receive SSH credential for cluster {name}')
  continue

  # During the failover, even though a cluster does not exist, the handle
  # can still exist in the record, and we check for credentials to avoid
  # updating the SSH config for non-existent clusters.
  credentials = record['credentials']
+ ips = handle.cached_external_ips
  if isinstance(handle.launched_resources.cloud, clouds.Kubernetes):
  # Replace the proxy command to proxy through the SkyPilot API
  # server with websocket.
@@ -191,10 +218,44 @@ def _get_cluster_records_and_set_ssh_config(
  f'{server_common.get_server_url()} '
  f'{handle.cluster_name}\"')
  credentials['ssh_proxy_command'] = proxy_command
+ elif isinstance(handle.launched_resources.cloud, clouds.Slurm):
+ # TODO(kevin): This is a temporary workaround, ideally we want to
+ # get a shell through srun --pty bash on the existing sbatch job.
+
+ # Proxy through the controller/login node to reach the worker node.
+ if (handle.cached_internal_ips is None or
+ not handle.cached_internal_ips):
+ logger.debug(
+ f'Cluster {name} does not have cached internal IPs. '
+ 'Skipping SSH config update.')
+ cluster_utils.SSHConfigHelper.remove_cluster(name)
+ continue
+
+ escaped_key_path = shlex.quote(
+ cluster_utils.SSHConfigHelper.generate_local_key_file(
+ handle.cluster_name, credentials))
+ controller_host = handle.cached_external_ips[0]
+
+ # Build jump proxy: ssh to worker via controller/login node
+ proxy_command = (f'ssh -tt -i {escaped_key_path} '
+ '-o StrictHostKeyChecking=no '
+ '-o UserKnownHostsFile=/dev/null '
+ '-o IdentitiesOnly=yes '
+ '-W %h:%p '
+ f'{handle.ssh_user}@{controller_host}')
+ original_proxy = credentials.get('ssh_proxy_command')
+ if original_proxy:
+ proxy_command += (
+ f' -o ProxyCommand={shlex.quote(original_proxy)}')
+
+ credentials['ssh_proxy_command'] = proxy_command
+
+ # For Slurm, use the worker's internal IP as the SSH target
+ ips = handle.cached_internal_ips

  cluster_utils.SSHConfigHelper.add_cluster(
  handle.cluster_name,
- handle.cached_external_ips,
+ ips,
  credentials,
  handle.cached_external_ssh_ports,
  handle.docker_user,
@@ -783,8 +844,8 @@ def _make_task_or_dag_from_entrypoint_with_overrides(

  # Update the workdir config from the command line parameters.
  # And update the envs and secrets from the workdir.
- _update_task_workdir(task, workdir, git_url, git_ref)
- _update_task_workdir_and_secrets_from_workdir(task)
+ task.update_workdir(workdir, git_url, git_ref)
+ task.update_envs_and_secrets_from_workdir()

  # job launch specific.
  if job_recovery is not None:
@@ -799,73 +860,6 @@ def _make_task_or_dag_from_entrypoint_with_overrides(
  return task


- def _update_task_workdir(task: task_lib.Task, workdir: Optional[str],
- git_url: Optional[str], git_ref: Optional[str]):
- """Updates the task workdir.
-
- Args:
- task: The task to update.
- workdir: The workdir to update.
- git_url: The git url to update.
- git_ref: The git ref to update.
- """
- if task.workdir is None or isinstance(task.workdir, str):
- if workdir is not None:
- task.workdir = workdir
- return
- if git_url is not None:
- task.workdir = {}
- task.workdir['url'] = git_url
- if git_ref is not None:
- task.workdir['ref'] = git_ref
- return
- return
- if git_url is not None:
- task.workdir['url'] = git_url
- if git_ref is not None:
- task.workdir['ref'] = git_ref
- return
-
-
- def _update_task_workdir_and_secrets_from_workdir(task: task_lib.Task):
- """Updates the task secrets from the workdir.
-
- Args:
- task: The task to update.
- """
- if task.workdir is None:
- return
- if not isinstance(task.workdir, dict):
- return
- url = task.workdir['url']
- ref = task.workdir.get('ref', '')
- token = os.environ.get(git_utils.GIT_TOKEN_ENV_VAR)
- ssh_key_path = os.environ.get(git_utils.GIT_SSH_KEY_PATH_ENV_VAR)
- try:
- git_repo = git.GitRepo(url, ref, token, ssh_key_path)
- clone_info = git_repo.get_repo_clone_info()
- if clone_info is None:
- return
- task.envs[git_utils.GIT_URL_ENV_VAR] = clone_info.url
- if ref:
- ref_type = git_repo.get_ref_type()
- if ref_type == git.GitRefType.COMMIT:
- task.envs[git_utils.GIT_COMMIT_HASH_ENV_VAR] = ref
- elif ref_type == git.GitRefType.BRANCH:
- task.envs[git_utils.GIT_BRANCH_ENV_VAR] = ref
- elif ref_type == git.GitRefType.TAG:
- task.envs[git_utils.GIT_TAG_ENV_VAR] = ref
- if clone_info.token is None and clone_info.ssh_key is None:
- return
- if clone_info.token is not None:
- task.secrets[git_utils.GIT_TOKEN_ENV_VAR] = clone_info.token
- if clone_info.ssh_key is not None:
- task.secrets[git_utils.GIT_SSH_KEY_ENV_VAR] = clone_info.ssh_key
- except exceptions.GitError as e:
- with ux_utils.print_exception_no_traceback():
- raise ValueError(f'{str(e)}') from None
-
-

  class _NaturalOrderGroup(click.Group):
  """Lists commands in the order defined in this script.
@@ -873,7 +867,19 @@ class _NaturalOrderGroup(click.Group):
  """

  def list_commands(self, ctx): # pylint: disable=unused-argument
- return self.commands.keys()
+ # Preserve definition order but hide aliases (same command object) and
+ # commands explicitly marked as hidden.
+ seen_commands = set()
+ names = []
+ for name, command in self.commands.items():
+ if getattr(command, 'hidden', False):
+ continue
+ command_id = id(command)
+ if command_id in seen_commands:
+ continue
+ seen_commands.add(command_id)
+ names.append(name)
+ return names

  @usage_lib.entrypoint('sky.cli', fallback=True)
  def invoke(self, ctx):
@@ -1160,7 +1166,7 @@ def launch(
  if task.service is not None:
  noun = 'pool' if task.service.pool else 'service'
  capnoun = noun.capitalize()
- sysname = 'Jobs Worker Pool' if task.service.pool else 'SkyServe'
+ sysname = 'Pool' if task.service.pool else 'SkyServe'
  cmd = 'sky jobs pool apply' if task.service.pool else 'sky serve up'
  logger.info(
  f'{colorama.Fore.YELLOW}{capnoun} section will be ignored when '
@@ -1388,14 +1394,24 @@ def exec(


  def _handle_jobs_queue_request(
- request_id: server_common.RequestId[List[Dict[str, Any]]],
- show_all: bool,
- show_user: bool,
- max_num_jobs_to_show: Optional[int],
- is_called_by_user: bool = False) -> Tuple[Optional[int], str]:
+ request_id: server_common.RequestId[Union[
+ List[responses.ManagedJobRecord],
+ Tuple[List[responses.ManagedJobRecord], int, Dict[str, int], int]]],
+ show_all: bool,
+ show_user: bool,
+ max_num_jobs_to_show: Optional[int],
+ pool_status_request_id: Optional[server_common.RequestId[List[Dict[
+ str, Any]]]] = None,
+ is_called_by_user: bool = False,
+ only_in_progress: bool = False,
+ queue_result_version: cli_utils.QueueResultVersion = cli_utils.
+ QueueResultVersion.V1,
+ ) -> Tuple[Optional[int], str]:
  """Get the in-progress managed jobs.

  Args:
+ request_id: The request ID for managed jobs.
+ pool_status_request_id: The request ID for pool status, or None.
  show_all: Show all information of each job (e.g., region, price).
  show_user: Show the user who submitted the job.
  max_num_jobs_to_show: If not None, limit the number of jobs to show to
@@ -1403,6 +1419,8 @@ def _handle_jobs_queue_request(
  and `sky jobs queue`.
  is_called_by_user: If this function is called by user directly, or an
  internal call.
+ only_in_progress: If True, only return the number of in-progress jobs.
+ queue_result_version: The version of the queue result.

  Returns:
  A tuple of (num_in_progress_jobs, msg). If num_in_progress_jobs is None,
@@ -1413,11 +1431,47 @@ def _handle_jobs_queue_request(
  # TODO(SKY-980): remove unnecessary fallbacks on the client side.
  num_in_progress_jobs = None
  msg = ''
+ status_counts: Optional[Dict[str, int]] = None
+ pool_status_result = None
  try:
  if not is_called_by_user:
  usage_lib.messages.usage.set_internal()
- managed_jobs_ = sdk.stream_and_get(request_id)
- num_in_progress_jobs = len(set(job['job_id'] for job in managed_jobs_))
+ # Call both stream_and_get functions in parallel
+ def get_jobs_queue_result():
+ return sdk.stream_and_get(request_id)
+
+ def get_pool_status_result():
+ if pool_status_request_id is not None:
+ try:
+ return sdk.stream_and_get(pool_status_request_id)
+ except Exception: # pylint: disable=broad-except
+ # If getting pool status fails, just continue without it
+ return None
+ return None
+
+ with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
+ jobs_future = executor.submit(get_jobs_queue_result)
+ pool_status_future = executor.submit(get_pool_status_result)
+
+ result = jobs_future.result()
+ pool_status_result = pool_status_future.result()
+
+ if queue_result_version.v2():
+ managed_jobs_, total, status_counts, _ = result
+ if only_in_progress:
+ num_in_progress_jobs = 0
+ if status_counts:
+ for status_value, count in status_counts.items():
+ status_enum = managed_jobs.ManagedJobStatus(
+ status_value)
+ if not status_enum.is_terminal():
+ num_in_progress_jobs += count
+ else:
+ num_in_progress_jobs = total
+ else:
+ managed_jobs_ = result
+ num_in_progress_jobs = len(
+ set(job['job_id'] for job in managed_jobs_))
  except exceptions.ClusterNotUpError as e:
  controller_status = e.cluster_status
  msg = str(e)
@@ -1461,10 +1515,14 @@ def _handle_jobs_queue_request(
  msg += ('Failed to query managed jobs: '
  f'{common_utils.format_exception(e, use_bracket=True)}')
  else:
- msg = managed_jobs.format_job_table(managed_jobs_,
- show_all=show_all,
- show_user=show_user,
- max_jobs=max_num_jobs_to_show)
+ msg = table_utils.format_job_table(
+ managed_jobs_,
+ pool_status=pool_status_result,
+ show_all=show_all,
+ show_user=show_user,
+ max_jobs=max_num_jobs_to_show,
+ status_counts=status_counts,
+ )
  return num_in_progress_jobs, msg


@@ -1562,35 +1620,6 @@ def _handle_services_request(
  return num_services, msg


- def _status_kubernetes(show_all: bool):
- """Show all SkyPilot resources in the current Kubernetes context.
-
- Args:
- show_all (bool): Show all job information (e.g., start time, failures).
- """
- all_clusters, unmanaged_clusters, all_jobs, context = (sdk.stream_and_get(
- sdk.status_kubernetes()))
- click.echo(f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
- f'Kubernetes cluster state (context: {context})'
- f'{colorama.Style.RESET_ALL}')
- status_utils.show_kubernetes_cluster_status_table(unmanaged_clusters,
- show_all)
- if all_jobs:
- click.echo(f'\n{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
- f'Managed jobs'
- f'{colorama.Style.RESET_ALL}')
- msg = managed_jobs.format_job_table(all_jobs,
- show_all=show_all,
- show_user=False)
- click.echo(msg)
- if any(['sky-serve-controller' in c.cluster_name for c in all_clusters]):
- # TODO: Parse serve controllers and show services separately.
- # Currently we show a hint that services are shown as clusters.
- click.echo(f'\n{colorama.Style.DIM}Hint: SkyServe replica pods are '
- 'shown in the "SkyPilot clusters" section.'
- f'{colorama.Style.RESET_ALL}')
-
-

  def _show_endpoint(query_clusters: Optional[List[str]],
  cluster_records: List[responses.StatusResponse], ip: bool,
@@ -1717,15 +1746,7 @@ def _show_enabled_infra(
  default=True,
  is_flag=True,
  required=False,
- help='Also show cluster pools, if any.')
- @click.option(
- '--kubernetes',
- '--k8s',
- default=False,
- is_flag=True,
- required=False,
- help='[Experimental] Show all SkyPilot resources (including from other '
- 'users) in the current Kubernetes context.')
+ help='Also show pools, if any.')
  @click.argument('clusters',
  required=False,
  type=str,
@@ -1737,8 +1758,8 @@ def _show_enabled_infra(
  # pylint: disable=redefined-builtin
  def status(verbose: bool, refresh: bool, ip: bool, endpoints: bool,
  endpoint: Optional[int], show_managed_jobs: bool,
- show_services: bool, show_pools: bool, kubernetes: bool,
- clusters: List[str], all_users: bool):
+ show_services: bool, show_pools: bool, clusters: List[str],
+ all_users: bool):
  # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
  """Show clusters.

@@ -1801,9 +1822,6 @@ def status(verbose: bool, refresh: bool, ip: bool, endpoints: bool,
  or for autostop-enabled clusters, use ``--refresh`` to query the latest
  cluster statuses from the cloud providers.
  """
- if kubernetes:
- _status_kubernetes(verbose)
- return
  # Do not show job queue if user specifies clusters, and if user
  # specifies --ip or --endpoint(s).
  show_managed_jobs = show_managed_jobs and not any([clusters, ip, endpoints])
@@ -1853,9 +1871,16 @@ def status(verbose: bool, refresh: bool, ip: bool, endpoints: bool,

  # Phase 2: Parallel submission of all API requests
  def submit_managed_jobs():
- return managed_jobs.queue(refresh=False,
- skip_finished=True,
- all_users=all_users)
+ fields = _DEFAULT_MANAGED_JOB_FIELDS_TO_GET
+ if all_users:
+ fields = fields + _USER_NAME_FIELD
+ return cli_utils.get_managed_job_queue(
+ refresh=False,
+ skip_finished=True,
+ all_users=all_users,
+ fields=fields,
+ limit=_NUM_MANAGED_JOBS_TO_SHOW_IN_STATUS,
+ )

  def submit_services(
  ) -> Optional[server_common.RequestId[List[Dict[str, Any]]]]:
@@ -1870,17 +1895,7 @@ def status(verbose: bool, refresh: bool, ip: bool, endpoints: bool,
  return None

  def submit_workspace() -> Optional[server_common.RequestId[Dict[str, Any]]]:
- try:
- return sdk.workspaces()
- except RuntimeError:
- # Backward compatibility for API server before #5660.
- # TODO(zhwu): remove this after 0.10.0.
- logger.warning(f'{colorama.Style.DIM}SkyPilot API server is '
- 'in an old version, and may miss feature: '
- 'workspaces. Update with: sky api stop; '
- 'sky api start'
- f'{colorama.Style.RESET_ALL}')
- return None
+ return sdk.workspaces()

  active_workspace = skypilot_config.get_active_workspace()

@@ -1888,6 +1903,7 @@ def status(verbose: bool, refresh: bool, ip: bool, endpoints: bool,
  return sdk.enabled_clouds(workspace=active_workspace, expand=True)

  managed_jobs_queue_request_id = None
+ queue_result_version = cli_utils.QueueResultVersion.V1
  service_status_request_id = None
  workspace_request_id = None
  pool_status_request_id = None
@@ -1906,7 +1922,8 @@ def status(verbose: bool, refresh: bool, ip: bool, endpoints: bool,

  # Get the request IDs
  if show_managed_jobs:
- managed_jobs_queue_request_id = managed_jobs_request_future.result()
+ (managed_jobs_queue_request_id,
+ queue_result_version) = managed_jobs_request_future.result()
  if show_services:
  service_status_request_id = services_request_future.result()
  if show_pools:
@@ -1927,7 +1944,7 @@ def status(verbose: bool, refresh: bool, ip: bool, endpoints: bool,

  # Phase 3: Get cluster records and handle special cases
  cluster_records = _get_cluster_records_and_set_ssh_config(
- query_clusters, refresh_mode, all_users)
+ query_clusters, refresh_mode, all_users, verbose)

  # TOOD(zhwu): setup the ssh config for status
  if ip or show_endpoints:
@@ -1938,7 +1955,8 @@ def status(verbose: bool, refresh: bool, ip: bool, endpoints: bool,
  controllers = []
  for cluster_record in cluster_records:
  cluster_name = cluster_record['name']
- controller = controller_utils.Controllers.from_name(cluster_name)
+ controller = controller_utils.Controllers.from_name(
+ cluster_name, expect_exact_match=False)
  if controller is not None:
  controllers.append(cluster_record)
  else:
@@ -1967,10 +1985,14 @@ def status(verbose: bool, refresh: bool, ip: bool, endpoints: bool,
  try:
  num_in_progress_jobs, msg = _handle_jobs_queue_request(
  managed_jobs_queue_request_id,
+ pool_status_request_id=pool_status_request_id,
  show_all=False,
  show_user=all_users,
  max_num_jobs_to_show=_NUM_MANAGED_JOBS_TO_SHOW_IN_STATUS,
- is_called_by_user=False)
+ is_called_by_user=False,
+ only_in_progress=True,
+ queue_result_version=queue_result_version,
+ )
  except KeyboardInterrupt:
  sdk.api_cancel(managed_jobs_queue_request_id, silent=True)
  managed_jobs_query_interrupted = True
@@ -2066,6 +2088,35 @@ def status(verbose: bool, refresh: bool, ip: bool, endpoints: bool,
  click.echo('\n' + '\n'.join(hints))


+ @cli.command(hidden=True)
+ @flags.config_option(expose_value=False)
+ @flags.verbose_option()
+ def status_kubernetes(verbose: bool):
+ """[Experimental] Show all SkyPilot resources (including from other '
+ 'users) in the current Kubernetes context."""
+ all_clusters, unmanaged_clusters, all_jobs, context = (sdk.stream_and_get(
+ sdk.status_kubernetes()))
+ click.echo(f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
+ f'Kubernetes cluster state (context: {context})'
+ f'{colorama.Style.RESET_ALL}')
+ status_utils.show_kubernetes_cluster_status_table(unmanaged_clusters,
+ show_all=verbose)
+ if all_jobs:
+ click.echo(f'\n{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
+ f'Managed jobs'
+ f'{colorama.Style.RESET_ALL}')
+ msg = table_utils.format_job_table(all_jobs,
+ show_all=verbose,
+ show_user=False)
+ click.echo(msg)
+ if any(['sky-serve-controller' in c.cluster_name for c in all_clusters]):
+ # TODO: Parse serve controllers and show services separately.
+ # Currently we show a hint that services are shown as clusters.
+ click.echo(f'\n{colorama.Style.DIM}Hint: SkyServe replica pods are '
+ 'shown in the "SkyPilot clusters" section.'
+ f'{colorama.Style.RESET_ALL}')
+
+
  @cli.command()
  @flags.config_option(expose_value=False)
  @flags.all_option('Show all cluster information.')
@@ -2104,7 +2155,8 @@ def cost_report(all: bool, days: int): # pylint: disable=redefined-builtin
  for cluster_record in cluster_records:
  cluster_name = cluster_record['name']
  try:
- controller = controller_utils.Controllers.from_name(cluster_name)
+ controller = controller_utils.Controllers.from_name(
+ cluster_name, expect_exact_match=False)
  except AssertionError:
  # There could be some old controller clusters from previous
  # versions that we should not show in the cost report.
@@ -2192,7 +2244,7 @@ def queue(clusters: List[str], skip_finished: bool, all_users: bool):
  f'cluster {cluster!r}.{colorama.Style.RESET_ALL}\n'
  f' {common_utils.format_exception(e)}')
  return
- job_tables[cluster] = job_lib.format_job_queue(job_table)
+ job_tables[cluster] = table_utils.format_job_queue(job_table)

  subprocess_utils.run_in_parallel(_get_job_queue, clusters)
  user_str = 'all users' if all_users else 'current user'
@@ -2213,6 +2265,12 @@ def queue(clusters: List[str], skip_finished: bool, all_users: bool):
  is_flag=True,
  default=False,
  help='Stream the cluster provisioning logs (provision.log).')
+ @click.option('--worker',
+ '-w',
+ default=None,
+ type=int,
+ help='The worker ID to stream the logs from. '
+ 'If not set, stream the logs of the head node.')
  @click.option(
  '--sync-down',
  '-s',
@@ -2250,6 +2308,7 @@ def logs(
  cluster: str,
  job_ids: Tuple[str, ...],
  provision: bool,
+ worker: Optional[int],
  sync_down: bool,
  status: bool, # pylint: disable=redefined-outer-name
  follow: bool,
@@ -2279,6 +2338,13 @@ def logs(
  4. If the job fails or fetching the logs fails, the command will exit with
  a non-zero return code.
  """
+ if worker is not None:
+ if not provision:
+ raise click.UsageError(
+ '--worker can only be used with --provision.')
+ if worker < 1:
+ raise click.UsageError('--worker must be a positive integer.')
+
  if provision and (sync_down or status or job_ids):
  raise click.UsageError(
  '--provision cannot be combined with job log options '
@@ -2298,7 +2364,11 @@ def logs(

  if provision:
  # Stream provision logs
- sys.exit(sdk.tail_provision_logs(cluster, follow=follow, tail=tail))
+ sys.exit(
+ sdk.tail_provision_logs(cluster_name=cluster,
+ worker=worker,
+ follow=follow,
+ tail=tail))

  if sync_down:
  with rich_utils.client_status(
@@ -2476,7 +2546,8 @@ def cancel(
  job_ids=job_ids_to_cancel)
  _async_call_or_wait(request_id, async_call, 'sky.cancel')
  except exceptions.NotSupportedError as e:
- controller = controller_utils.Controllers.from_name(cluster)
+ controller = controller_utils.Controllers.from_name(
+ cluster, expect_exact_match=False)
  assert controller is not None, cluster
  with ux_utils.print_exception_no_traceback():
  raise click.UsageError(
@@ -2777,7 +2848,8 @@ def start(
  # Get all clusters that are not controllers.
  cluster_records = [
  cluster for cluster in all_clusters
- if controller_utils.Controllers.from_name(cluster['name']) is None
+ if controller_utils.Controllers.from_name(
+ cluster['name'], expect_exact_match=False) is None
  ]
  if cluster_records is None:
  # Get GLOB cluster names
@@ -2839,7 +2911,8 @@ def start(
  # Checks for controller clusters (jobs controller / sky serve controller).
  controllers, normal_clusters = [], []
  for name in to_start:
- if controller_utils.Controllers.from_name(name) is not None:
+ if controller_utils.Controllers.from_name(
+ name, expect_exact_match=False) is not None:
  controllers.append(name)
  else:
  normal_clusters.append(name)
@@ -2975,16 +3048,28 @@ def _hint_or_raise_for_down_jobs_controller(controller_name: str,
  to be torn down (e.g., because it has jobs running or
  it is in init state)
  """
- controller = controller_utils.Controllers.from_name(controller_name)
+ controller = controller_utils.Controllers.from_name(
+ controller_name, expect_exact_match=False)
  assert controller is not None, controller_name

+ status_counts: Optional[Dict[str, int]] = None
+ managed_jobs_: List[responses.ManagedJobRecord] = []
  with rich_utils.client_status(
  '[bold cyan]Checking for in-progress managed jobs and pools[/]'):
  try:
- request_id = managed_jobs.queue(refresh=False,
- skip_finished=True,
- all_users=True)
- managed_jobs_ = sdk.stream_and_get(request_id)
+ fields = _DEFAULT_MANAGED_JOB_FIELDS_TO_GET + _USER_NAME_FIELD
+ request_id, queue_result_version = cli_utils.get_managed_job_queue(
+ refresh=False,
+ skip_finished=True,
+ all_users=True,
+ fields=fields,
+ )
+ result = sdk.stream_and_get(request_id)
+ if queue_result_version.v2():
+ managed_jobs_, _, status_counts, _ = result
+ else:
+ managed_jobs_ = typing.cast(List[responses.ManagedJobRecord],
+ result)
  request_id_pools = managed_jobs.pool_status(pool_names=None)
  pools_ = sdk.stream_and_get(request_id_pools)
  except exceptions.ClusterNotUpError as e:
@@ -3002,25 +3087,6 @@ def _hint_or_raise_for_down_jobs_controller(controller_name: str,
  # there is no in-prgress managed jobs.
  managed_jobs_ = []
  pools_ = []
- except exceptions.InconsistentConsolidationModeError:
- # If this error is raised, it means the user switched to the
- # consolidation mode but the previous controller cluster is still
- # running. We should allow the user to tear down the controller
- # cluster in this case.
- with skypilot_config.override_skypilot_config(
- {'jobs': {
- 'controller': {
- 'consolidation_mode': False
- }
- }}):
- # Check again with the consolidation mode disabled. This is to
- # make sure there is no in-progress managed jobs.
- request_id = managed_jobs.queue(refresh=False,
- skip_finished=True,
- all_users=True)
- managed_jobs_ = sdk.stream_and_get(request_id)
- request_id_pools = managed_jobs.pool_status(pool_names=None)
- pools_ = sdk.stream_and_get(request_id_pools)

  msg = (f'{colorama.Fore.YELLOW}WARNING: Tearing down the managed '
  'jobs controller. Please be aware of the following:'
@@ -3029,9 +3095,12 @@ def _hint_or_raise_for_down_jobs_controller(controller_name: str,
  'jobs (output of `sky jobs queue`) will be lost.')
  click.echo(msg)
  if managed_jobs_:
- job_table = managed_jobs.format_job_table(managed_jobs_,
- show_all=False,
- show_user=True)
+ job_table = table_utils.format_job_table(
+ managed_jobs_,
+ show_all=False,
+ show_user=True,
+ status_counts=status_counts,
+ )
  msg = controller.value.decline_down_for_dirty_controller_hint
  # Add prefix to each line to align with the bullet point.
  msg += '\n'.join(
@@ -3074,7 +3143,8 @@ def _hint_or_raise_for_down_sky_serve_controller(controller_name: str,
  to be torn down (e.g., because it has services running or
  it is in init state)
  """
- controller = controller_utils.Controllers.from_name(controller_name)
+ controller = controller_utils.Controllers.from_name(
+ controller_name, expect_exact_match=False)
  assert controller is not None, controller_name
  with rich_utils.client_status('[bold cyan]Checking for live services[/]'):
  try:
@@ -3093,21 +3163,6 @@ def _hint_or_raise_for_down_sky_serve_controller(controller_name: str,
  # controller being STOPPED or being firstly launched, i.e., there is
  # no in-prgress services.
  services = []
- except exceptions.InconsistentConsolidationModeError:
- # If this error is raised, it means the user switched to the
- # consolidation mode but the previous controller cluster is still
- # running. We should allow the user to tear down the controller
- # cluster in this case.
- with skypilot_config.override_skypilot_config(
- {'serve': {
- 'controller': {
- 'consolidation_mode': False
- }
- }}):
- # Check again with the consolidation mode disabled. This is to
- # make sure there is no in-progress services.
- request_id = serve_lib.status(service_names=None)
- services = sdk.stream_and_get(request_id)

  if services:
  service_names = [service['name'] for service in services]
@@ -3185,14 +3240,15 @@ def _down_or_stop_clusters(
3185
3240
  names = list(names)
3186
3241
  if names:
3187
3242
  controllers = [
3188
- name for name in names
3189
- if controller_utils.Controllers.from_name(name) is not None
3243
+ name for name in names if controller_utils.Controllers.from_name(
3244
+ name, expect_exact_match=False) is not None
3190
3245
  ]
3191
3246
  controllers_str = ', '.join(map(repr, controllers))
3192
3247
  names = [
3193
3248
  cluster['name']
3194
3249
  for cluster in _get_cluster_records_and_set_ssh_config(names)
3195
- if controller_utils.Controllers.from_name(cluster['name']) is None
3250
+ if controller_utils.Controllers.from_name(
3251
+ cluster['name'], expect_exact_match=False) is None
3196
3252
  ]
3197
3253
 
3198
3254
  # Make sure the controllers are explicitly specified without other
@@ -3217,7 +3273,7 @@ def _down_or_stop_clusters(
3217
3273
  f'{controllers_str} is currently not supported.')
3218
3274
  else:
3219
3275
  controller = controller_utils.Controllers.from_name(
3220
- controller_name)
3276
+ controller_name, expect_exact_match=False)
3221
3277
  assert controller is not None
3222
3278
  hint_or_raise = _controller_to_hint_or_raise(controller)
3223
3279
  try:
@@ -3265,9 +3321,10 @@ def _down_or_stop_clusters(
3265
3321
  names = [
3266
3322
  record['name']
3267
3323
  for record in all_clusters
3268
- if controller_utils.Controllers.from_name(record['name']) is None
3269
- and (down or idle_minutes_to_autostop is not None or
3270
- record['status'] != status_lib.ClusterStatus.STOPPED)
3324
+ if controller_utils.Controllers.from_name(
3325
+ record['name'], expect_exact_match=False) is None and
3326
+ (down or idle_minutes_to_autostop is not None or
3327
+ record['status'] != status_lib.ClusterStatus.STOPPED)
3271
3328
  ]
3272
3329
 
3273
3330
  clusters = names
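The hunks above thread a new `expect_exact_match=False` argument through every `controller_utils.Controllers.from_name(...)` call that splits user-supplied names into controllers and ordinary clusters; the matching semantics themselves are not shown in this diff. The sketch below only illustrates the partitioning pattern, with a hypothetical `is_controller_name` predicate standing in for the real lookup.

from typing import Callable, Iterable, List, Tuple

def partition_names(
    names: Iterable[str],
    is_controller_name: Callable[[str], bool],
) -> Tuple[List[str], List[str]]:
    """Split names into (controllers, regular clusters), preserving order."""
    controllers: List[str] = []
    clusters: List[str] = []
    for name in names:
        (controllers if is_controller_name(name) else clusters).append(name)
    return controllers, clusters

# Example with a made-up predicate; the real CLI consults
# controller_utils.Controllers.from_name() instead.
ctrls, normal = partition_names(
    ['sky-jobs-controller-abcd', 'my-dev-box'],
    lambda n: n.startswith('sky-jobs-controller'))
assert ctrls == ['sky-jobs-controller-abcd'] and normal == ['my-dev-box']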
@@ -3297,6 +3354,9 @@ def _down_or_stop_clusters(
3297
3354
 
3298
3355
  request_ids = []
3299
3356
 
3357
+ successes: List[str] = []
3358
+ failures: List[Tuple[str, str]] = []
3359
+
3300
3360
  def _down_or_stop(name: str):
3301
3361
  success_progress = False
3302
3362
  if idle_minutes_to_autostop is not None:
@@ -3304,16 +3364,20 @@ def _down_or_stop_clusters(
3304
3364
  request_id = sdk.autostop(name, idle_minutes_to_autostop,
3305
3365
  wait_for, down)
3306
3366
  request_ids.append(request_id)
3367
+ progress.stop()
3307
3368
  _async_call_or_wait(
3308
3369
  request_id, async_call,
3309
3370
  server_constants.REQUEST_NAME_PREFIX + operation)
3310
- except (exceptions.NotSupportedError,
3311
- exceptions.ClusterNotUpError) as e:
3371
+ progress.start()
3372
+ except (exceptions.NotSupportedError, exceptions.ClusterNotUpError,
3373
+ exceptions.CloudError) as e:
3312
3374
  message = str(e)
3375
+ failures.append((name, str(e)))
3313
3376
  else: # no exception raised
3314
3377
  success_progress = True
3315
3378
  message = (f'{colorama.Fore.GREEN}{operation} '
3316
3379
  f'cluster {name!r}...done{colorama.Style.RESET_ALL}')
3380
+ successes.append(name)
3317
3381
  if idle_minutes_to_autostop >= 0:
3318
3382
  option_str = 'down' if down else 'stop'
3319
3383
  passive_str = 'downed' if down else 'stopped'
@@ -3333,9 +3397,11 @@ def _down_or_stop_clusters(
3333
3397
  else:
3334
3398
  request_id = sdk.stop(name, purge=purge)
3335
3399
  request_ids.append(request_id)
3400
+ progress.stop()
3336
3401
  _async_call_or_wait(
3337
3402
  request_id, async_call,
3338
3403
  server_constants.REQUEST_NAME_PREFIX + operation)
3404
+ progress.start()
3339
3405
  if not async_call:
3340
3406
  # Remove the cluster from the SSH config file as soon as it
3341
3407
  # is stopped or downed.
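The `progress.stop()` / `progress.start()` pairs added above pause the live progress display while `_async_call_or_wait` blocks, so server-side "waiting for request" output does not interleave with the progress bar. A minimal stand-alone sketch of that pattern, assuming a `rich.progress.Progress`-style object (the CLI's actual `progress` wrapper may differ):

import time
from rich.progress import Progress

def blocking_call() -> None:
    # Placeholder for something like _async_call_or_wait(request_id, ...).
    time.sleep(1)

with Progress(transient=True) as progress:
    task = progress.add_task('Stopping clusters...', total=None)
    progress.stop()   # pause rendering so other output prints on clean lines
    blocking_call()
    progress.start()  # resume the live display afterwards
    progress.update(task, completed=1)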
@@ -3345,13 +3411,17 @@ def _down_or_stop_clusters(
3345
3411
  f'{colorama.Fore.RED}{operation} cluster {name}...failed. '
3346
3412
  f'{colorama.Style.RESET_ALL}'
3347
3413
  f'\nReason: {common_utils.format_exception(e)}.')
3414
+ failures.append((name, str(e)))
3348
3415
  except (exceptions.NotSupportedError,
3349
- exceptions.ClusterOwnerIdentityMismatchError) as e:
3416
+ exceptions.ClusterOwnerIdentityMismatchError,
3417
+ exceptions.CloudError) as e:
3350
3418
  message = str(e)
3419
+ failures.append((name, str(e)))
3351
3420
  else: # no exception raised
3352
3421
  message = (
3353
3422
  f'{colorama.Fore.GREEN}{operation} cluster {name}...done.'
3354
3423
  f'{colorama.Style.RESET_ALL}')
3424
+ successes.append(name)
3355
3425
  if not down:
3356
3426
  message += ('\n To restart the cluster, run: '
3357
3427
  f'{colorama.Style.BRIGHT}sky start {name}'
@@ -3365,6 +3435,10 @@ def _down_or_stop_clusters(
3365
3435
  progress.start()
3366
3436
 
3367
3437
  with progress:
3438
+ # we write a new line here to avoid the "Waiting for 'sky.down'
3439
+ # request to be scheduled" message from being printed on the same line
3440
+ # as the "Terminating <num> clusters..." message
3441
+ click.echo('')
3368
3442
  subprocess_utils.run_in_parallel(_down_or_stop, clusters)
3369
3443
  progress.live.transient = False
3370
3444
  # Make sure the progress bar not mess up the terminal.
@@ -3374,6 +3448,31 @@ def _down_or_stop_clusters(
3374
3448
  click.secho(f'{operation} requests are sent. Check the requests\' '
3375
3449
  'status with `sky request get <request_id>`.')
3376
3450
 
3451
+ show_summary = len(clusters) > 1
3452
+
3453
+ if show_summary:
3454
+ click.echo('\nSummary:')
3455
+ if successes:
3456
+ # Preserve the original order of clusters as provided by user.
3457
+ click.echo(' ✓ Succeeded: ' + ', '.join(successes))
3458
+ if failures:
3459
+ # Format failures: if one failure, keep on same line. If multiple,
3460
+ # indent each failed cluster on its own line for readability.
3461
+ if len(failures) == 1:
3462
+ name, reason = failures[0]
3463
+ first = reason.strip().splitlines()[0]
3464
+ first = first if len(first) <= 120 else first[:120] + '…'
3465
+ click.echo(f' ✗ Failed: {name} ({first})')
3466
+ else:
3467
+ click.echo(' ✗ Failed:')
3468
+ for name, reason in failures:
3469
+ first = reason.strip().splitlines()[0]
3470
+ first = first if len(first) <= 120 else first[:120] + '…'
3471
+ click.echo(f' {name} ({first})')
3472
+
3473
+ if failures:
3474
+ click.echo('Cluster(s) failed. See details above.')
3475
+
3377
3476
 
3378
3477
  @cli.command(cls=_DocumentedCodeCommand)
3379
3478
  @flags.config_option(expose_value=False)
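The new summary block above records each cluster into `successes` or `failures` and, for failures, prints only the first line of the reason, truncated to 120 characters. A small self-contained sketch of that formatting rule (the helper names here are illustrative, not part of the SkyPilot API):

from typing import List, Tuple

def _first_line(reason: str, limit: int = 120) -> str:
    first = reason.strip().splitlines()[0] if reason.strip() else ''
    return first if len(first) <= limit else first[:limit] + '…'

def format_summary(successes: List[str],
                   failures: List[Tuple[str, str]]) -> str:
    lines = ['Summary:']
    if successes:
        lines.append('  ✓ Succeeded: ' + ', '.join(successes))
    if len(failures) == 1:
        name, reason = failures[0]
        lines.append(f'  ✗ Failed: {name} ({_first_line(reason)})')
    elif failures:
        lines.append('  ✗ Failed:')
        lines.extend(f'    {name} ({_first_line(reason)})'
                     for name, reason in failures)
    return '\n'.join(lines)

print(format_summary(['dev-1'], [('gpu-8', 'Cloud error\nstack trace...')]))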
@@ -3483,6 +3582,10 @@ def show_gpus(
3483
3582
  maximum quantities of the GPU available on a single node and the real-time
3484
3583
  availability of the GPU across all nodes in the Kubernetes cluster.
3485
3584
 
3585
+ If ``--cloud slurm`` is specified, it will show the maximum quantities of
3586
+ the GPU available on a single node and the real-time availability of the
3587
+ GPU across all nodes in the Slurm cluster.
3588
+
3486
3589
  Definitions of certain fields:
3487
3590
 
3488
3591
  * ``DEVICE_MEM``: Memory of a single device; does not depend on the device
@@ -3538,6 +3641,8 @@ def show_gpus(
3538
3641
  cloud_is_kubernetes = isinstance(
3539
3642
  cloud_obj, clouds.Kubernetes) and not isinstance(cloud_obj, clouds.SSH)
3540
3643
  cloud_is_ssh = isinstance(cloud_obj, clouds.SSH)
3644
+ cloud_is_slurm = isinstance(cloud_obj, clouds.Slurm)
3645
+
3541
3646
  # TODO(romilb): We should move this to the backend.
3542
3647
  kubernetes_autoscaling = skypilot_config.get_effective_region_config(
3543
3648
  cloud='kubernetes',
@@ -3546,6 +3651,7 @@ def show_gpus(
3546
3651
  default_value=None) is not None
3547
3652
  kubernetes_is_enabled = clouds.Kubernetes.canonical_name() in enabled_clouds
3548
3653
  ssh_is_enabled = clouds.SSH.canonical_name() in enabled_clouds
3654
+ slurm_is_enabled = clouds.Slurm.canonical_name() in enabled_clouds
3549
3655
  query_k8s_realtime_gpu = (kubernetes_is_enabled and
3550
3656
  (cloud_name is None or cloud_is_kubernetes))
3551
3657
  query_ssh_realtime_gpu = (ssh_is_enabled and
@@ -3605,8 +3711,9 @@ def show_gpus(
3605
3711
  raise ValueError(full_err_msg)
3606
3712
  no_permissions_str = '<no permissions>'
3607
3713
  realtime_gpu_infos = []
3714
+ # Stores per-GPU totals as [ready_capacity, available, not_ready].
3608
3715
  total_gpu_info: Dict[str, List[int]] = collections.defaultdict(
3609
- lambda: [0, 0])
3716
+ lambda: [0, 0, 0])
3610
3717
  all_nodes_info = []
3611
3718
 
3612
3719
  # display an aggregated table for all contexts
@@ -3617,6 +3724,33 @@ def show_gpus(
3617
3724
 
3618
3725
  num_filtered_contexts = 0
3619
3726
 
3727
+ def _count_not_ready_gpus(
3728
+ nodes_info: Optional['models.KubernetesNodesInfo']
3729
+ ) -> Dict[str, int]:
3730
+ """Return counts of GPUs on not ready nodes keyed by GPU type."""
3731
+ not_ready_counts: Dict[str, int] = collections.defaultdict(int)
3732
+ if nodes_info is None:
3733
+ return not_ready_counts
3734
+
3735
+ node_info_dict = getattr(nodes_info, 'node_info_dict', {}) or {}
3736
+ for node_info in node_info_dict.values():
3737
+ accelerator_type = getattr(node_info, 'accelerator_type', None)
3738
+ if not accelerator_type:
3739
+ continue
3740
+
3741
+ total_info = getattr(node_info, 'total', {})
3742
+ accelerator_count = 0
3743
+ if isinstance(total_info, dict):
3744
+ accelerator_count = int(
3745
+ total_info.get('accelerator_count', 0))
3746
+ if accelerator_count <= 0:
3747
+ continue
3748
+
3749
+ node_is_ready = getattr(node_info, 'is_ready', True)
3750
+ if not node_is_ready:
3751
+ not_ready_counts[accelerator_type] += accelerator_count
3752
+ return not_ready_counts
3753
+
3620
3754
  if realtime_gpu_availability_lists:
3621
3755
  for (ctx, availability_list) in realtime_gpu_availability_lists:
3622
3756
  if not _filter_ctx(ctx):
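`_count_not_ready_gpus` above tallies accelerators that sit on Kubernetes nodes reporting NotReady, keyed by GPU type, so they can later be excluded from capacity. The standalone sketch below mirrors that accounting over plain dictionaries instead of the `models.KubernetesNodesInfo` objects used by the CLI:

import collections
from typing import Dict, Iterable

def count_not_ready_gpus(nodes: Iterable[Dict]) -> Dict[str, int]:
    """Sum accelerator counts of nodes that are not ready, per GPU type."""
    not_ready: Dict[str, int] = collections.defaultdict(int)
    for node in nodes:
        gpu_type = node.get('accelerator_type')
        count = int(node.get('accelerator_count', 0))
        if gpu_type and count > 0 and not node.get('is_ready', True):
            not_ready[gpu_type] += count
    return dict(not_ready)

nodes = [
    {'accelerator_type': 'H100', 'accelerator_count': 8, 'is_ready': False},
    {'accelerator_type': 'H100', 'accelerator_count': 8, 'is_ready': True},
    {'accelerator_type': None, 'accelerator_count': 0, 'is_ready': False},
]
assert count_not_ready_gpus(nodes) == {'H100': 8}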
@@ -3626,6 +3760,12 @@ def show_gpus(
3626
3760
  else:
3627
3761
  display_ctx = ctx
3628
3762
  num_filtered_contexts += 1
3763
+ # Collect node info for this context before building tables so
3764
+ # we can exclude GPUs on not ready nodes from the totals.
3765
+ nodes_info = sdk.stream_and_get(
3766
+ sdk.kubernetes_node_info(context=ctx))
3767
+ context_not_ready_counts = _count_not_ready_gpus(nodes_info)
3768
+
3629
3769
  realtime_gpu_table = log_utils.create_table(
3630
3770
  ['GPU', qty_header, 'UTILIZATION'])
3631
3771
  for realtime_gpu_availability in sorted(availability_list):
@@ -3634,24 +3774,116 @@ def show_gpus(
3634
3774
  available_qty = (gpu_availability.available
3635
3775
  if gpu_availability.available != -1 else
3636
3776
  no_permissions_str)
3777
+ # Exclude GPUs on not ready nodes from capacity counts.
3778
+ not_ready_count = min(
3779
+ context_not_ready_counts.get(gpu_availability.gpu, 0),
3780
+ gpu_availability.capacity)
3781
+ # Ensure capacity is never below the reported available
3782
+ # quantity (if available is unknown, treat as 0 for totals).
3783
+ available_for_totals = max(
3784
+ gpu_availability.available
3785
+ if gpu_availability.available != -1 else 0, 0)
3786
+ effective_capacity = max(
3787
+ gpu_availability.capacity - not_ready_count,
3788
+ available_for_totals)
3789
+ utilization = (
3790
+ f'{available_qty} of {effective_capacity} free')
3791
+ if not_ready_count > 0:
3792
+ utilization += f' ({not_ready_count} not ready)'
3637
3793
  realtime_gpu_table.add_row([
3638
3794
  gpu_availability.gpu,
3639
3795
  _list_to_str(gpu_availability.counts),
3640
- f'{available_qty} of {gpu_availability.capacity} free',
3796
+ utilization,
3641
3797
  ])
3642
3798
  gpu = gpu_availability.gpu
3643
- capacity = gpu_availability.capacity
3644
3799
  # we want total, so skip permission denied.
3645
- available = max(gpu_availability.available, 0)
3646
- if capacity > 0:
3647
- total_gpu_info[gpu][0] += capacity
3648
- total_gpu_info[gpu][1] += available
3800
+ if effective_capacity > 0 or not_ready_count > 0:
3801
+ total_gpu_info[gpu][0] += effective_capacity
3802
+ total_gpu_info[gpu][1] += available_for_totals
3803
+ total_gpu_info[gpu][2] += not_ready_count
3649
3804
  realtime_gpu_infos.append((display_ctx, realtime_gpu_table))
3650
- # Collect node info for this context
3651
- nodes_info = sdk.stream_and_get(
3652
- sdk.kubernetes_node_info(context=ctx))
3653
3805
  all_nodes_info.append((display_ctx, nodes_info))
3654
3806
  if num_filtered_contexts > 1:
3807
+ total_realtime_gpu_table = log_utils.create_table(
3808
+ ['GPU', 'UTILIZATION'])
3809
+ for gpu, stats in total_gpu_info.items():
3810
+ not_ready = stats[2]
3811
+ utilization = f'{stats[1]} of {stats[0]} free'
3812
+ if not_ready > 0:
3813
+ utilization += f' ({not_ready} not ready)'
3814
+ total_realtime_gpu_table.add_row([gpu, utilization])
3815
+ else:
3816
+ total_realtime_gpu_table = None
3817
+
3818
+ return realtime_gpu_infos, total_realtime_gpu_table, all_nodes_info
3819
+
3820
+ def _get_slurm_realtime_gpu_tables(
3821
+ name_filter: Optional[str] = None,
3822
+ quantity_filter: Optional[int] = None
3823
+ ) -> Tuple[List[Tuple[str, 'prettytable.PrettyTable']],
3824
+ Optional['prettytable.PrettyTable']]:
3825
+ """Get Slurm GPU availability tables.
3826
+
3827
+ Args:
3828
+ name_filter: Filter GPUs by name.
3829
+ quantity_filter: Filter GPUs by quantity.
3830
+
3831
+ Returns:
3832
+ A tuple of (realtime_gpu_infos, total_realtime_gpu_table).
3833
+ """
3834
+ if quantity_filter:
3835
+ qty_header = 'QTY_FILTER'
3836
+ else:
3837
+ qty_header = 'REQUESTABLE_QTY_PER_NODE'
3838
+
3839
+ realtime_gpu_availability_lists = sdk.stream_and_get(
3840
+ sdk.realtime_slurm_gpu_availability(
3841
+ name_filter=name_filter, quantity_filter=quantity_filter))
3842
+ if not realtime_gpu_availability_lists:
3843
+ err_msg = 'No GPUs found in any Slurm partition. '
3844
+ debug_msg = 'To further debug, run: sky check slurm '
3845
+ if name_filter is not None:
3846
+ gpu_info_msg = f' {name_filter!r}'
3847
+ if quantity_filter is not None:
3848
+ gpu_info_msg += (' with requested quantity'
3849
+ f' {quantity_filter}')
3850
+ err_msg = (f'Resources{gpu_info_msg} not found '
3851
+ 'in any Slurm partition. ')
3852
+ debug_msg = ('To show available accelerators on Slurm,'
3853
+ ' run: sky show-gpus --cloud slurm ')
3854
+ raise ValueError(err_msg + debug_msg)
3855
+
3856
+ realtime_gpu_infos = []
3857
+ total_gpu_info: Dict[str, List[int]] = collections.defaultdict(
3858
+ lambda: [0, 0])
3859
+
3860
+ for (slurm_cluster,
3861
+ availability_list) in realtime_gpu_availability_lists:
3862
+ realtime_gpu_table = log_utils.create_table(
3863
+ ['GPU', qty_header, 'UTILIZATION'])
3864
+ for realtime_gpu_availability in sorted(availability_list):
3865
+ gpu_availability = models.RealtimeGpuAvailability(
3866
+ *realtime_gpu_availability)
3867
+ # Use the counts directly from the backend, which are already
3868
+ # generated in powers of 2 (plus any actual maximums)
3869
+ requestable_quantities = gpu_availability.counts
3870
+ realtime_gpu_table.add_row([
3871
+ gpu_availability.gpu,
3872
+ _list_to_str(requestable_quantities),
3873
+ (f'{gpu_availability.available} of '
3874
+ f'{gpu_availability.capacity} free'),
3875
+ ])
3876
+ gpu = gpu_availability.gpu
3877
+ capacity = gpu_availability.capacity
3878
+ available = gpu_availability.available
3879
+ if capacity > 0:
3880
+ total_gpu_info[gpu][0] += capacity
3881
+ total_gpu_info[gpu][1] += available
3882
+ realtime_gpu_infos.append((slurm_cluster, realtime_gpu_table))
3883
+
3884
+ # display an aggregated table for all partitions
3885
+ # if there are more than one partitions with GPUs
3886
+ if len(realtime_gpu_infos) > 1:
3655
3887
  total_realtime_gpu_table = log_utils.create_table(
3656
3888
  ['GPU', 'UTILIZATION'])
3657
3889
  for gpu, stats in total_gpu_info.items():
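The UTILIZATION accounting in the hunk above reports an effective capacity: GPUs on NotReady nodes are subtracted from the raw capacity, but never below the reported available count, and unknown availability (-1, rendered as '<no permissions>') counts as 0 in the totals. A worked sketch of that arithmetic, as a free-standing function:

def effective_capacity(capacity: int, available: int, not_ready: int) -> int:
    """Capacity minus NotReady GPUs, floored at the usable (available) count."""
    not_ready = min(not_ready, capacity)
    available_for_totals = max(available if available != -1 else 0, 0)
    return max(capacity - not_ready, available_for_totals)

# 16 H100s in total, 4 free, 8 of them on a NotReady node:
assert effective_capacity(16, 4, 8) == 8    # shown as '4 of 8 free (8 not ready)'
# Availability unknown (permission denied) is treated as 0 for the totals:
assert effective_capacity(16, -1, 8) == 8
# If more GPUs are free than 'capacity - not_ready', the floor keeps the
# table self-consistent:
assert effective_capacity(16, 12, 8) == 12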
@@ -3660,7 +3892,7 @@ def show_gpus(
3660
3892
  else:
3661
3893
  total_realtime_gpu_table = None
3662
3894
 
3663
- return realtime_gpu_infos, total_realtime_gpu_table, all_nodes_info
3895
+ return realtime_gpu_infos, total_realtime_gpu_table
3664
3896
 
3665
3897
  def _format_kubernetes_node_info_combined(
3666
3898
  contexts_info: List[Tuple[str, 'models.KubernetesNodesInfo']],
@@ -3684,11 +3916,16 @@ def show_gpus(
3684
3916
  acc_type = node_info.accelerator_type
3685
3917
  if acc_type is None:
3686
3918
  acc_type = '-'
3687
- node_table.add_row([
3688
- context_name, node_name, acc_type,
3689
- f'{available} of {node_info.total["accelerator_count"]} '
3690
- 'free'
3691
- ])
3919
+ utilization_str = (
3920
+ f'{available} of '
3921
+ f'{node_info.total["accelerator_count"]} free')
3922
+ # Check if node is ready (defaults to True for backward
3923
+ # compatibility with older server versions)
3924
+ node_is_ready = getattr(node_info, 'is_ready', True)
3925
+ if not node_is_ready:
3926
+ utilization_str += ' (Node NotReady)'
3927
+ node_table.add_row(
3928
+ [context_name, node_name, acc_type, utilization_str])
3692
3929
 
3693
3930
  k8s_per_node_acc_message = (f'{cloud_str} per-node GPU availability')
3694
3931
  if hints:
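Both the totals helper and the per-node table above read readiness with `getattr(node_info, 'is_ready', True)`, so a newer CLI talking to an older API server that omits the field simply assumes nodes are ready. A tiny sketch of that backward-compatibility idiom (the `OldNodeInfo`/`NewNodeInfo` types are illustrative only):

from dataclasses import dataclass

@dataclass
class OldNodeInfo:      # e.g. a record from an older server: no is_ready field
    name: str

@dataclass
class NewNodeInfo:      # newer servers include readiness
    name: str
    is_ready: bool

def utilization_suffix(node_info) -> str:
    # Default to True so records from older servers render unchanged.
    return '' if getattr(node_info, 'is_ready', True) else ' (Node NotReady)'

assert utilization_suffix(OldNodeInfo('gpu-a')) == ''
assert utilization_suffix(NewNodeInfo('gpu-b', is_ready=False)) == ' (Node NotReady)'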
@@ -3699,6 +3936,43 @@ def show_gpus(
3699
3936
  f'{colorama.Style.RESET_ALL}\n'
3700
3937
  f'{node_table.get_string()}')
3701
3938
 
3939
+ def _format_slurm_node_info() -> str:
3940
+ node_table = log_utils.create_table([
3941
+ 'CLUSTER',
3942
+ 'NODE',
3943
+ 'PARTITION',
3944
+ 'STATE',
3945
+ 'GPU',
3946
+ 'UTILIZATION',
3947
+ ])
3948
+
3949
+ # Get all cluster names
3950
+ slurm_cluster_names = clouds.Slurm.existing_allowed_clusters()
3951
+
3952
+ # Query each cluster
3953
+ for cluster_name in slurm_cluster_names:
3954
+ nodes_info = sdk.stream_and_get(
3955
+ sdk.slurm_node_info(slurm_cluster_name=cluster_name))
3956
+
3957
+ for node_info in nodes_info:
3958
+ node_table.add_row([
3959
+ cluster_name,
3960
+ node_info.get('node_name'),
3961
+ node_info.get('partition', '-'),
3962
+ node_info.get('node_state'),
3963
+ node_info.get('gpu_type') or '',
3964
+ (f'{node_info.get("free_gpus", 0)} of '
3965
+ f'{node_info.get("total_gpus", 0)} free'),
3966
+ ])
3967
+
3968
+ slurm_per_node_msg = 'Slurm per node accelerator availability'
3969
+ # Optional: Add hint message if needed, similar to k8s
3970
+
3971
+ return (f'{colorama.Fore.LIGHTMAGENTA_EX}{colorama.Style.NORMAL}'
3972
+ f'{slurm_per_node_msg}'
3973
+ f'{colorama.Style.RESET_ALL}\n'
3974
+ f'{node_table.get_string()}')
3975
+
3702
3976
  def _format_kubernetes_realtime_gpu(
3703
3977
  total_table: Optional['prettytable.PrettyTable'],
3704
3978
  k8s_realtime_infos: List[Tuple[str, 'prettytable.PrettyTable']],
@@ -3828,6 +4102,28 @@ def show_gpus(
3828
4102
  return True, print_section_titles
3829
4103
  return False, print_section_titles
3830
4104
 
4105
+ def _format_slurm_realtime_gpu(
4106
+ total_table, slurm_realtime_infos,
4107
+ show_node_info: bool) -> Generator[str, None, None]:
4108
+ # print total table
4109
+ yield (f'{colorama.Fore.GREEN}{colorama.Style.BRIGHT}'
4110
+ 'Slurm GPUs'
4111
+ f'{colorama.Style.RESET_ALL}\n')
4112
+ if total_table is not None:
4113
+ yield from total_table.get_string()
4114
+ yield '\n'
4115
+
4116
+ # print individual infos.
4117
+ for (partition, slurm_realtime_table) in slurm_realtime_infos:
4118
+ partition_str = f'Slurm Cluster: {partition}'
4119
+ yield (f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
4120
+ f'{partition_str}'
4121
+ f'{colorama.Style.RESET_ALL}\n')
4122
+ yield from slurm_realtime_table.get_string()
4123
+ yield '\n'
4124
+ if show_node_info:
4125
+ yield _format_slurm_node_info()
4126
+
3831
4127
  def _output() -> Generator[str, None, None]:
3832
4128
  gpu_table = log_utils.create_table(
3833
4129
  ['COMMON_GPU', 'AVAILABLE_QUANTITIES'])
@@ -3845,10 +4141,12 @@ def show_gpus(
3845
4141
  if cloud_name is None:
3846
4142
  clouds_to_list = [
3847
4143
  c for c in constants.ALL_CLOUDS
3848
- if c != 'kubernetes' and c != 'ssh'
4144
+ if c != 'kubernetes' and c != 'ssh' and c != 'slurm'
3849
4145
  ]
3850
4146
 
3851
4147
  k8s_messages = ''
4148
+ slurm_messages = ''
4149
+ k8s_printed = False
3852
4150
  if accelerator_str is None:
3853
4151
  # Collect k8s related messages in k8s_messages and print them at end
3854
4152
  print_section_titles = False
@@ -3860,6 +4158,7 @@ def show_gpus(
3860
4158
  yield '\n\n'
3861
4159
  stop_iter_one, print_section_titles_one, k8s_messages_one = (
3862
4160
  yield from _possibly_show_k8s_like_realtime(is_ssh))
4161
+ k8s_printed = True
3863
4162
  stop_iter = stop_iter or stop_iter_one
3864
4163
  print_section_titles = (print_section_titles or
3865
4164
  print_section_titles_one)
@@ -3867,11 +4166,45 @@ def show_gpus(
3867
4166
  prev_print_section_titles = print_section_titles_one
3868
4167
  if stop_iter:
3869
4168
  return
4169
+ # If cloud is slurm, we want to show real-time capacity
4170
+ if slurm_is_enabled and (cloud_name is None or cloud_is_slurm):
4171
+ try:
4172
+ # If --cloud slurm is not specified, we want to catch
4173
+ # the case where no GPUs are available on the cluster and
4174
+ # print the warning at the end.
4175
+ slurm_realtime_infos, total_table = (
4176
+ _get_slurm_realtime_gpu_tables())
4177
+ except ValueError as e:
4178
+ if not cloud_is_slurm:
4179
+ # Make it a note if cloud is not slurm
4180
+ slurm_messages += 'Note: '
4181
+ slurm_messages += str(e)
4182
+ else:
4183
+ print_section_titles = True
4184
+ if k8s_printed:
4185
+ yield '\n'
4186
+
4187
+ yield from _format_slurm_realtime_gpu(total_table,
4188
+ slurm_realtime_infos,
4189
+ show_node_info=True)
4190
+
4191
+ if cloud_is_slurm:
4192
+ # Do not show clouds if --cloud slurm is specified
4193
+ if not slurm_is_enabled:
4194
+ yield ('Slurm is not enabled. To fix, run: '
4195
+ 'sky check slurm ')
4196
+ yield slurm_messages
4197
+ return
3870
4198
 
3871
4199
  # For show_all, show the k8s message at the start since output is
3872
4200
  # long and the user may not scroll to the end.
3873
- if show_all and k8s_messages:
3874
- yield k8s_messages
4201
+ if show_all and (k8s_messages or slurm_messages):
4202
+ if k8s_messages:
4203
+ yield k8s_messages
4204
+ if slurm_messages:
4205
+ if k8s_messages:
4206
+ yield '\n'
4207
+ yield slurm_messages
3875
4208
  yield '\n\n'
3876
4209
 
3877
4210
  list_accelerator_counts_result = sdk.stream_and_get(
@@ -3919,9 +4252,10 @@ def show_gpus(
3919
4252
  else:
3920
4253
  yield ('\n\nHint: use -a/--all to see all accelerators '
3921
4254
  '(including non-common ones) and pricing.')
3922
- if k8s_messages:
4255
+ if k8s_messages or slurm_messages:
3923
4256
  yield '\n'
3924
4257
  yield k8s_messages
4258
+ yield slurm_messages
3925
4259
  return
3926
4260
  else:
3927
4261
  # Parse accelerator string
@@ -3961,6 +4295,31 @@ def show_gpus(
3961
4295
  if stop_iter:
3962
4296
  return
3963
4297
 
4298
+ # Handle Slurm filtering by name and quantity
4299
+ if (slurm_is_enabled and (cloud_name is None or cloud_is_slurm) and
4300
+ not show_all):
4301
+ # Print section title if not showing all and instead a specific
4302
+ # accelerator is requested
4303
+ print_section_titles = True
4304
+ try:
4305
+ slurm_realtime_infos, total_table = (
4306
+ _get_slurm_realtime_gpu_tables(name_filter=name,
4307
+ quantity_filter=quantity))
4308
+
4309
+ yield from _format_slurm_realtime_gpu(total_table,
4310
+ slurm_realtime_infos,
4311
+ show_node_info=False)
4312
+ except ValueError as e:
4313
+ # In the case of a specific accelerator, show the error message
4314
+ # immediately (e.g., "Resources A10G not found ...")
4315
+ yield str(e)
4316
+ yield slurm_messages
4317
+ if cloud_is_slurm:
4318
+ # Do not show clouds if --cloud slurm is specified
4319
+ if not slurm_is_enabled:
4320
+ yield ('Slurm is not enabled. To fix, run: '
4321
+ 'sky check slurm ')
4322
+ return
3964
4323
  # For clouds other than Kubernetes, get the accelerator details
3965
4324
  # Case-sensitive
3966
4325
  list_accelerators_result = sdk.stream_and_get(
@@ -4093,8 +4452,7 @@ def storage_ls(verbose: bool):
4093
4452
  """List storage objects managed by SkyPilot."""
4094
4453
  request_id = sdk.storage_ls()
4095
4454
  storages = sdk.stream_and_get(request_id)
4096
- storage_table = storage_utils.format_storage_table(storages,
4097
- show_all=verbose)
4455
+ storage_table = table_utils.format_storage_table(storages, show_all=verbose)
4098
4456
  click.echo(storage_table)
4099
4457
 
4100
4458
 
@@ -4174,6 +4532,10 @@ def volumes():
4174
4532
  pass
4175
4533
 
4176
4534
 
4535
+ # Add 'volume' as an alias for 'volumes'
4536
+ cli.add_command(volumes, name='volume')
4537
+
4538
+
4177
4539
  @volumes.command('apply', cls=_DocumentedCodeCommand)
4178
4540
  @flags.config_option(expose_value=False)
4179
4541
  @click.argument('entrypoint',
@@ -4189,17 +4551,25 @@ def volumes():
4189
4551
  @click.option('--infra',
4190
4552
  required=False,
4191
4553
  type=str,
4192
- help='Infra. Format: k8s, k8s/context-name. '
4554
+ help='Infrastructure to use. '
4555
+ 'Format: cloud, cloud/region, cloud/region/zone, or '
4556
+ 'k8s/context-name.'
4557
+ 'Examples: k8s, k8s/my-context, runpod/US/US-CA-2. '
4193
4558
  'Override the infra defined in the YAML.')
4194
- @click.option(
4195
- '--type',
4196
- required=False,
4197
- type=str,
4198
- help='Volume type. Format: pvc. Override the type defined in the YAML.')
4559
+ @click.option('--type',
4560
+ required=False,
4561
+ type=click.Choice(volume_utils.VolumeType.supported_types()),
4562
+ help='Volume type. Override the type defined in the YAML.')
4199
4563
  @click.option('--size',
4200
4564
  required=False,
4201
4565
  type=str,
4202
4566
  help='Volume size. Override the size defined in the YAML.')
4567
+ @click.option(
4568
+ '--use-existing/--no-use-existing',
4569
+ required=False,
4570
+ default=None,
4571
+ help='Whether to use an existing volume. Override the use_existing '
4572
+ 'defined in the YAML.')
4203
4573
  @click.option('--yes',
4204
4574
  '-y',
4205
4575
  is_flag=True,
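The `--use-existing/--no-use-existing` option above is declared with `default=None`, giving a tri-state flag: explicitly on, explicitly off, or unset (in which case the YAML value wins). A minimal, self-contained illustration of that click pattern; the command and messages below are made up for the example:

import click

@click.command()
@click.option('--use-existing/--no-use-existing', default=None,
              help='Tri-state: True, False, or None when not given.')
def apply(use_existing):
    if use_existing is None:
        click.echo('not specified -> keep the value from the YAML')
    else:
        click.echo(f'override use_existing={use_existing}')

if __name__ == '__main__':
    apply()
# $ apply                   -> not specified -> keep the value from the YAML
# $ apply --use-existing    -> override use_existing=True
# $ apply --no-use-existing -> override use_existing=False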
@@ -4214,6 +4584,7 @@ def volumes_apply(
4214
4584
  infra: Optional[str],
4215
4585
  type: Optional[str], # pylint: disable=redefined-builtin
4216
4586
  size: Optional[str],
4587
+ use_existing: Optional[bool],
4217
4588
  yes: bool,
4218
4589
  async_call: bool):
4219
4590
  """Apply a volume.
@@ -4226,7 +4597,11 @@ def volumes_apply(
4226
4597
  sky volumes apply volume.yaml
4227
4598
  \b
4228
4599
  # Apply a volume from a command.
4229
- sky volumes apply --name pvc1 --infra k8s --type pvc --size 100Gi
4600
+ sky volumes apply --name pvc1 --infra k8s --type k8s-pvc --size 100Gi
4601
+ \b
4602
+ # Apply a volume with existing PVC `pvc2` from a command.
4603
+ sky volumes apply --name pvc2 --infra k8s --type k8s-pvc --size 100Gi
4604
+ --use-existing
4230
4605
  """
4231
4606
  # pylint: disable=import-outside-toplevel
4232
4607
  from sky.volumes import volume as volume_lib
@@ -4245,7 +4620,8 @@ def volumes_apply(
4245
4620
  f'{entrypoint_str!r} needs to be a YAML file')
4246
4621
  if yaml_config is not None:
4247
4622
  volume_config_dict = yaml_config.copy()
4248
- override_config = _build_volume_override_config(name, infra, type, size)
4623
+ override_config = _build_volume_override_config(name, infra, type, size,
4624
+ use_existing)
4249
4625
  volume_config_dict.update(override_config)
4250
4626
 
4251
4627
  # Create Volume instance
@@ -4253,6 +4629,13 @@ def volumes_apply(
4253
4629
 
4254
4630
  logger.debug(f'Volume config: {volume.to_yaml_config()}')
4255
4631
 
4632
+ # TODO(kevin): remove the try block in v0.13.0
4633
+ try:
4634
+ volumes_sdk.validate(volume)
4635
+ except exceptions.APINotSupportedError:
4636
+ # Do best-effort client-side validation.
4637
+ volume.validate(skip_cloud_compatibility=True)
4638
+
4256
4639
  if not yes:
4257
4640
  click.confirm(f'Proceed to create volume {volume.name!r}?',
4258
4641
  default=True,
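The new `volumes_sdk.validate(volume)` call above prefers server-side validation and falls back to best-effort client-side validation when the API server is too old to support the endpoint. The generic shape of that fallback, sketched with placeholder functions rather than the real SDK:

class APINotSupportedError(Exception):
    """Raised when the connected server does not implement an endpoint."""

def server_side_validate(config: dict) -> None:
    # Stand-in for volumes_sdk.validate(); pretend the server is too old.
    raise APINotSupportedError('server does not support volume validation')

def client_side_validate(config: dict, skip_cloud_compatibility: bool) -> None:
    # Stand-in for volume.validate(skip_cloud_compatibility=True).
    assert config.get('name'), 'volume needs a name'

def validate_with_fallback(config: dict) -> str:
    try:
        server_side_validate(config)
        return 'validated on server'
    except APINotSupportedError:
        # Older server: do what we can locally and let the server reject
        # anything cloud-specific later.
        client_side_validate(config, skip_cloud_compatibility=True)
        return 'validated on client (best effort)'

print(validate_with_fallback({'name': 'pvc1', 'type': 'k8s-pvc'}))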
@@ -4269,11 +4652,15 @@ def volumes_apply(
4269
4652
  f'{colorama.Style.RESET_ALL}')
4270
4653
 
4271
4654
 
4272
- def _build_volume_override_config(name: Optional[str], infra: Optional[str],
4273
- volume_type: Optional[str],
4274
- size: Optional[str]) -> Dict[str, str]:
4655
+ def _build_volume_override_config(
4656
+ name: Optional[str],
4657
+ infra: Optional[str],
4658
+ volume_type: Optional[str],
4659
+ size: Optional[str],
4660
+ use_existing: Optional[bool],
4661
+ ) -> Dict[str, Any]:
4275
4662
  """Parse the volume override config."""
4276
- override_config = {}
4663
+ override_config: Dict[str, Any] = {}
4277
4664
  if name is not None:
4278
4665
  override_config['name'] = name
4279
4666
  if infra is not None:
@@ -4282,6 +4669,8 @@ def _build_volume_override_config(name: Optional[str], infra: Optional[str],
4282
4669
  override_config['type'] = volume_type
4283
4670
  if size is not None:
4284
4671
  override_config['size'] = size
4672
+ if use_existing is not None:
4673
+ override_config['use_existing'] = use_existing
4285
4674
  return override_config
4286
4675
 
4287
4676
 
@@ -4298,8 +4687,8 @@ def volumes_ls(verbose: bool):
4298
4687
  """List volumes managed by SkyPilot."""
4299
4688
  request_id = volumes_sdk.ls()
4300
4689
  all_volumes = sdk.stream_and_get(request_id)
4301
- volume_table = volumes_utils.format_volume_table(all_volumes,
4302
- show_all=verbose)
4690
+ volume_table = table_utils.format_volume_table(all_volumes,
4691
+ show_all=verbose)
4303
4692
  click.echo(volume_table)
4304
4693
 
4305
4694
 
@@ -4537,10 +4926,11 @@ def jobs_launch(
4537
4926
  break
4538
4927
  if print_setup_fm_warning:
4539
4928
  click.secho(
4540
- f'{colorama.Fore.YELLOW}setup/file_mounts/storage_mounts'
4541
- ' will be ignored when submit jobs to pool. To update a pool, '
4542
- f'please use `sky jobs pool apply {pool} new-pool.yaml`. '
4929
+ f'{colorama.Fore.YELLOW}Setup, file mounts, and storage mounts'
4930
+ ' will be ignored when submitting jobs to pool. To update a '
4931
+ f'pool, please use `sky jobs pool apply {pool} new-pool.yaml`. '
4543
4932
  f'{colorama.Style.RESET_ALL}')
4933
+ print_setup_fm_warning = False
4544
4934
 
4545
4935
  # Optimize info is only show if _need_confirmation.
4546
4936
  if not yes:
@@ -4556,10 +4946,15 @@ def jobs_launch(
4556
4946
  job_id_handle = _async_call_or_wait(request_id, async_call,
4557
4947
  'sky.jobs.launch')
4558
4948
 
4559
- if not async_call and not detach_run:
4560
- job_ids = job_id_handle[0]
4561
- if isinstance(job_ids, int) or len(job_ids) == 1:
4562
- job_id = job_ids if isinstance(job_ids, int) else job_ids[0]
4949
+ if async_call:
4950
+ return
4951
+
4952
+ job_ids = [job_id_handle[0]] if isinstance(job_id_handle[0],
4953
+ int) else job_id_handle[0]
4954
+
4955
+ if not detach_run:
4956
+ if len(job_ids) == 1:
4957
+ job_id = job_ids[0]
4563
4958
  returncode = managed_jobs.tail_logs(name=None,
4564
4959
  job_id=job_id,
4565
4960
  follow=True,
@@ -4568,7 +4963,8 @@ def jobs_launch(
4568
4963
  else:
4569
4964
  # TODO(tian): This can be very long. Considering have a "group id"
4570
4965
  # and query all job ids with the same group id.
4571
- job_ids_str = ','.join(map(str, job_ids))
4966
+ # Sort job ids to ensure consistent ordering.
4967
+ job_ids_str = ','.join(map(str, sorted(job_ids)))
4572
4968
  click.secho(
4573
4969
  f'Jobs submitted with IDs: {colorama.Fore.CYAN}'
4574
4970
  f'{job_ids_str}{colorama.Style.RESET_ALL}.'
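The rewritten tail of `jobs_launch` above normalizes `job_id_handle[0]`, which may be a single int or a list of ints, into a list, and sorts the ids before printing them. A compact sketch of that normalization (function names are illustrative):

from typing import List, Union

def normalize_job_ids(raw: Union[int, List[int]]) -> List[int]:
    """Accept either a single job id or a list of them."""
    return [raw] if isinstance(raw, int) else list(raw)

def format_job_ids(raw: Union[int, List[int]]) -> str:
    # Sort so the printed list is stable regardless of submission order.
    return ','.join(map(str, sorted(normalize_job_ids(raw))))

assert normalize_job_ids(7) == [7]
assert format_job_ids([12, 3, 5]) == '3,5,12'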
@@ -4587,6 +4983,14 @@ def jobs_launch(
4587
4983
  @jobs.command('queue', cls=_DocumentedCodeCommand)
4588
4984
  @flags.config_option(expose_value=False)
4589
4985
  @flags.verbose_option()
4986
+ @click.option(
4987
+ '--limit',
4988
+ '-l',
4989
+ default=_NUM_MANAGED_JOBS_TO_SHOW,
4990
+ type=int,
4991
+ required=False,
4992
+ help=(f'Number of jobs to show, default is {_NUM_MANAGED_JOBS_TO_SHOW},'
4993
+ f' use "-a/--all" to show all jobs.'))
4590
4994
  @click.option(
4591
4995
  '--refresh',
4592
4996
  '-r',
@@ -4606,7 +5010,7 @@ def jobs_launch(
4606
5010
  @usage_lib.entrypoint
4607
5011
  # pylint: disable=redefined-builtin
4608
5012
  def jobs_queue(verbose: bool, refresh: bool, skip_finished: bool,
4609
- all_users: bool, all: bool):
5013
+ all_users: bool, all: bool, limit: int):
4610
5014
  """Show statuses of managed jobs.
4611
5015
 
4612
5016
  Each managed jobs can have one of the following statuses:
@@ -4657,18 +5061,56 @@ def jobs_queue(verbose: bool, refresh: bool, skip_finished: bool,
4657
5061
 
4658
5062
  watch -n60 sky jobs queue
4659
5063
 
5064
+ (Tip) To show only the latest 10 jobs, use ``-l/--limit 10``:
5065
+
5066
+ .. code-block:: bash
5067
+
5068
+ sky jobs queue -l 10
5069
+
4660
5070
  """
4661
5071
  click.secho('Fetching managed job statuses...', fg='cyan')
4662
5072
  with rich_utils.client_status('[cyan]Checking managed jobs[/]'):
4663
- managed_jobs_request_id = managed_jobs.queue(
4664
- refresh=refresh, skip_finished=skip_finished, all_users=all_users)
4665
- max_num_jobs_to_show = (_NUM_MANAGED_JOBS_TO_SHOW if not all else None)
5073
+ max_num_jobs_to_show = (limit if not all else None)
5074
+ fields = _DEFAULT_MANAGED_JOB_FIELDS_TO_GET
5075
+ if verbose:
5076
+ fields = _VERBOSE_MANAGED_JOB_FIELDS_TO_GET
5077
+ if all_users:
5078
+ fields = fields + _USER_NAME_FIELD
5079
+ if verbose:
5080
+ fields = fields + _USER_HASH_FIELD
5081
+ # Call both cli_utils.get_managed_job_queue and managed_jobs.pool_status
5082
+ # in parallel
5083
+ def get_managed_jobs_queue():
5084
+ return cli_utils.get_managed_job_queue(refresh=refresh,
5085
+ skip_finished=skip_finished,
5086
+ all_users=all_users,
5087
+ limit=max_num_jobs_to_show,
5088
+ fields=fields)
5089
+
5090
+ def get_pool_status():
5091
+ try:
5092
+ return managed_jobs.pool_status(pool_names=None)
5093
+ except Exception: # pylint: disable=broad-except
5094
+ # If pool_status fails, we'll just skip the worker information
5095
+ return None
5096
+
5097
+ with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
5098
+ managed_jobs_future = executor.submit(get_managed_jobs_queue)
5099
+ pool_status_future = executor.submit(get_pool_status)
5100
+
5101
+ (managed_jobs_request_id,
5102
+ queue_result_version) = managed_jobs_future.result()
5103
+ pool_status_request_id = pool_status_future.result()
5104
+
4666
5105
  num_jobs, msg = _handle_jobs_queue_request(
4667
5106
  managed_jobs_request_id,
5107
+ pool_status_request_id=pool_status_request_id,
4668
5108
  show_all=verbose,
4669
5109
  show_user=all_users,
4670
5110
  max_num_jobs_to_show=max_num_jobs_to_show,
4671
- is_called_by_user=True)
5111
+ is_called_by_user=True,
5112
+ queue_result_version=queue_result_version,
5113
+ )
4672
5114
  if not skip_finished:
4673
5115
  in_progress_only_hint = ''
4674
5116
  else:
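`jobs queue` above now submits the job-queue request and the pool-status request concurrently with a two-worker `ThreadPoolExecutor`, and the pool-status helper swallows its own failures so the queue can still render without worker information. A self-contained sketch of that fan-out; the two fetchers below are stand-ins for the SDK calls:

import concurrent.futures
import time
from typing import List, Optional

def fetch_job_queue() -> List[dict]:
    time.sleep(0.1)                       # pretend to hit the API server
    return [{'job_id': 1, 'status': 'RUNNING'}]

def fetch_pool_status() -> Optional[List[dict]]:
    try:
        time.sleep(0.1)
        return [{'pool': 'train-pool', 'workers': 3}]
    except Exception:                     # pylint: disable=broad-except
        return None                       # degrade gracefully: hide worker info

with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
    jobs_future = executor.submit(fetch_job_queue)
    pools_future = executor.submit(fetch_pool_status)
    jobs, pools = jobs_future.result(), pools_future.result()

print(len(jobs), 'jobs;', 'pools unavailable' if pools is None else pools)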
@@ -4681,7 +5123,8 @@ def jobs_queue(verbose: bool, refresh: bool, skip_finished: bool,
4681
5123
  f'{colorama.Fore.CYAN}'
4682
5124
  f'Only showing the latest {max_num_jobs_to_show} '
4683
5125
  f'managed jobs'
4684
- f'(use --all to show all managed jobs) {colorama.Style.RESET_ALL} ')
5126
+ f'(use --limit to show more managed jobs or '
5127
+ f'--all to show all managed jobs) {colorama.Style.RESET_ALL} ')
4685
5128
 
4686
5129
 
4687
5130
  @jobs.command('cancel', cls=_DocumentedCodeCommand)
@@ -4849,7 +5292,7 @@ def pool():
4849
5292
  @pool.command('apply', cls=_DocumentedCodeCommand)
4850
5293
  @flags.config_option(expose_value=False)
4851
5294
  @click.argument('pool_yaml',
4852
- required=True,
5295
+ required=False,
4853
5296
  type=str,
4854
5297
  nargs=-1,
4855
5298
  **_get_shell_complete_args(_complete_file_name))
@@ -4864,17 +5307,22 @@ def pool():
4864
5307
  type=click.Choice([m.value for m in serve_lib.UpdateMode],
4865
5308
  case_sensitive=False),
4866
5309
  required=False,
4867
- help=('Update mode. If "rolling", cluster pool will be updated '
4868
- 'with rolling update. If "blue_green", cluster pool will '
5310
+ help=('Update mode. If "rolling", pool will be updated '
5311
+ 'with rolling update. If "blue_green", pool will '
4869
5312
  'be updated with blue-green update. This option is only '
4870
5313
  'valid when the pool is already running.'))
5314
+ @click.option('--workers',
5315
+ default=None,
5316
+ type=int,
5317
+ required=False,
5318
+ help='Can be used to update the number of workers in the pool.')
4871
5319
  @_add_click_options(flags.TASK_OPTIONS + flags.EXTRA_RESOURCES_OPTIONS +
4872
5320
  flags.COMMON_OPTIONS)
4873
5321
  @flags.yes_option()
4874
5322
  @timeline.event
4875
5323
  @usage_lib.entrypoint
4876
5324
  def jobs_pool_apply(
4877
- pool_yaml: Tuple[str, ...],
5325
+ pool_yaml: Optional[Tuple[str, ...]],
4878
5326
  pool: Optional[str], # pylint: disable=redefined-outer-name
4879
5327
  workdir: Optional[str],
4880
5328
  infra: Optional[str],
@@ -4896,60 +5344,80 @@ def jobs_pool_apply(
4896
5344
  disk_tier: Optional[str],
4897
5345
  network_tier: Optional[str],
4898
5346
  mode: str,
5347
+ workers: Optional[int],
4899
5348
  yes: bool,
4900
5349
  async_call: bool,
4901
5350
  ):
4902
- """Apply a config to a cluster pool for managed jobs submission.
4903
-
4904
- If the pool is already running, the config will be applied to the pool.
4905
- Otherwise, a new pool will be created.
4906
-
4907
- POOL_YAML must point to a valid YAML file.
5351
+ """Either apply a config to a pool for managed jobs submission
5352
+ or update the number of workers in the pool. One of POOL_YAML or --workers
5353
+ must be provided.
5354
+ Config:
5355
+ If the pool is already running, the config will be applied to the pool.
5356
+ Otherwise, a new pool will be created.
5357
+ Workers:
5358
+ The --workers option can be used to override the number of workers
5359
+ specified in the YAML file, or to update workers without a YAML file.
5360
+ Example:
5361
+ sky jobs pool apply -p my-pool --workers 5
4908
5362
  """
4909
5363
  cloud, region, zone = _handle_infra_cloud_region_zone_options(
4910
5364
  infra, cloud, region, zone)
4911
- if pool is None:
4912
- pool = serve_lib.generate_service_name(pool=True)
5365
+ if workers is not None and pool_yaml is not None and len(pool_yaml) > 0:
5366
+ raise click.UsageError(
5367
+ 'Cannot specify both --workers and POOL_YAML. Please use one of '
5368
+ 'them.')
4913
5369
 
4914
- task = _generate_task_with_service(
4915
- service_name=pool,
4916
- service_yaml_args=pool_yaml,
4917
- workdir=workdir,
4918
- cloud=cloud,
4919
- region=region,
4920
- zone=zone,
4921
- gpus=gpus,
4922
- cpus=cpus,
4923
- memory=memory,
4924
- instance_type=instance_type,
4925
- num_nodes=num_nodes,
4926
- use_spot=use_spot,
4927
- image_id=image_id,
4928
- env_file=env_file,
4929
- env=env,
4930
- secret=secret,
4931
- disk_size=disk_size,
4932
- disk_tier=disk_tier,
4933
- network_tier=network_tier,
4934
- ports=ports,
4935
- not_supported_cmd='sky jobs pool up',
4936
- pool=True,
4937
- )
4938
- assert task.service is not None
4939
- if not task.service.pool:
4940
- raise click.UsageError('The YAML file needs a `pool` section.')
4941
- click.secho('Pool spec:', fg='cyan')
4942
- click.echo(task.service)
4943
- serve_lib.validate_service_task(task, pool=True)
5370
+ if pool_yaml is None or len(pool_yaml) == 0:
5371
+ if pool is None:
5372
+ raise click.UsageError(
5373
+ 'A pool name must be provided to update the number of workers.')
5374
+ task = None
5375
+ click.secho(f'Attempting to update {pool} to have {workers} workers',
5376
+ fg='cyan')
5377
+ else:
5378
+ if pool is None:
5379
+ pool = serve_lib.generate_service_name(pool=True)
5380
+
5381
+ task = _generate_task_with_service(
5382
+ service_name=pool,
5383
+ service_yaml_args=pool_yaml,
5384
+ workdir=workdir,
5385
+ cloud=cloud,
5386
+ region=region,
5387
+ zone=zone,
5388
+ gpus=gpus,
5389
+ cpus=cpus,
5390
+ memory=memory,
5391
+ instance_type=instance_type,
5392
+ num_nodes=num_nodes,
5393
+ use_spot=use_spot,
5394
+ image_id=image_id,
5395
+ env_file=env_file,
5396
+ env=env,
5397
+ secret=secret,
5398
+ disk_size=disk_size,
5399
+ disk_tier=disk_tier,
5400
+ network_tier=network_tier,
5401
+ ports=ports,
5402
+ not_supported_cmd='sky jobs pool up',
5403
+ pool=True,
5404
+ )
5405
+ assert task.service is not None
5406
+ if not task.service.pool:
5407
+ raise click.UsageError('The YAML file needs a `pool` section.')
5408
+ click.secho('Pool spec:', fg='cyan')
5409
+ click.echo(task.service)
5410
+ serve_lib.validate_service_task(task, pool=True)
4944
5411
 
4945
- click.secho(
4946
- 'Each pool worker will use the following resources (estimated):',
4947
- fg='cyan')
4948
- with dag_lib.Dag() as dag:
4949
- dag.add(task)
5412
+ click.secho(
5413
+ 'Each pool worker will use the following resources (estimated):',
5414
+ fg='cyan')
5415
+ with dag_lib.Dag() as dag:
5416
+ dag.add(task)
4950
5417
 
4951
5418
  request_id = managed_jobs.pool_apply(task,
4952
5419
  pool,
5420
+ workers=workers,
4953
5421
  mode=serve_lib.UpdateMode(mode),
4954
5422
  _need_confirmation=not yes)
4955
5423
  _async_call_or_wait(request_id, async_call, 'sky.jobs.pool_apply')
@@ -4962,7 +5430,7 @@ def jobs_pool_apply(
4962
5430
  @usage_lib.entrypoint
4963
5431
  # pylint: disable=redefined-builtin
4964
5432
  def jobs_pool_status(verbose: bool, pool_names: List[str]):
4965
- """Show statuses of cluster pools.
5433
+ """Show statuses of pools.
4966
5434
 
4967
5435
  Show detailed statuses of one or more pools. If POOL_NAME is not
4968
5436
  provided, show all pools' status.
@@ -5018,12 +5486,108 @@ def jobs_pool_down(
5018
5486
  raise click.UsageError('Can only specify one of POOL_NAMES or --all. '
5019
5487
  f'Provided {argument_str!r}.')
5020
5488
 
5021
- if not yes:
5022
- quoted_pool_names = [f'{name!r}' for name in pool_names]
5023
- list_pool_str = ', '.join(quoted_pool_names)
5024
- pool_identity_str = f'pool(s) {list_pool_str}'
5025
- if all:
5026
- pool_identity_str = 'all pools'
5489
+ def _get_nonterminal_jobs(pool_names: List[str],
5490
+ all: bool) -> List[responses.ManagedJobRecord]:
5491
+ # Get nonterminal jobs for this pool using managed_jobs.queue
5492
+ request_id, queue_result_version = cli_utils.get_managed_job_queue(
5493
+ refresh=False,
5494
+ skip_finished=True,
5495
+ all_users=True,
5496
+ limit=None,
5497
+ fields=['job_id', 'status', 'pool'],
5498
+ )
5499
+ jobs_result = sdk.stream_and_get(request_id)
5500
+
5501
+ # Handle both tuple and list responses
5502
+ jobs_list: List[responses.ManagedJobRecord]
5503
+ if queue_result_version.v2():
5504
+ jobs_list = jobs_result[0]
5505
+ else:
5506
+ jobs_list = typing.cast(List[responses.ManagedJobRecord],
5507
+ jobs_result)
5508
+
5509
+ def _should_include_job(job: responses.ManagedJobRecord) -> bool:
5510
+ # Job must not be terminal.
5511
+ if job.get('status', ManagedJobStatus.SUCCEEDED).is_terminal():
5512
+ return False
5513
+ # If len is 0 then we are using -a option, so we include all jobs
5514
+ # if they're associated with a pool.
5515
+ if all:
5516
+ return job.get('pool') is not None
5517
+ # Otherwise we are using specific pool names, so we include the job
5518
+ # if it's associated with one of the specified pools.
5519
+ return job.get('pool') in pool_names
5520
+
5521
+ # Filter jobs by pool name and ensure nonterminal
5522
+ pool_jobs = [job for job in jobs_list if _should_include_job(job)]
5523
+ return pool_jobs
5524
+
5525
+ quoted_pool_names = [f'{name!r}' for name in pool_names]
5526
+ list_pool_str = ', '.join(quoted_pool_names)
5527
+ pool_identity_str = f'pool(s) {list_pool_str}'
5528
+ if all:
5529
+ pool_identity_str = 'all pools'
5530
+
5531
+ already_confirmed = False
5532
+ try:
5533
+ pool_jobs = _get_nonterminal_jobs(pool_names, all)
5534
+ if pool_jobs:
5535
+ num_jobs = len(pool_jobs)
5536
+ job_ids = [job['job_id'] for job in pool_jobs]
5537
+ job_ids_str = ','.join(str(job_id) for job_id in job_ids)
5538
+ click.echo(
5539
+ f'{colorama.Fore.YELLOW}Pool(s) has {num_jobs} '
5540
+ f'nonterminal jobs: {job_ids_str} so it is not yet safe to down'
5541
+ f'.{colorama.Style.RESET_ALL}')
5542
+ if not yes:
5543
+ should_cancel = click.confirm(
5544
+ 'Would you like to cancel all jobs and down the pool(s)?',
5545
+ default=False,
5546
+ abort=False,
5547
+ show_default=True)
5548
+ if not should_cancel:
5549
+ raise click.Abort()
5550
+ already_confirmed = True
5551
+
5552
+ # Cancel all jobs in the pool
5553
+ with rich_utils.client_status(
5554
+ ux_utils.spinner_message(
5555
+ f'Cancelling {num_jobs} jobs in {pool_identity_str}...')
5556
+ ):
5557
+ try:
5558
+ sdk.get(managed_jobs.cancel(job_ids=job_ids))
5559
+ except Exception as e:
5560
+ logger.warning(f'Failed to cancel jobs: {e}.')
5561
+ raise e
5562
+
5563
+ max_wait_time = 300 # 5 minutes max wait
5564
+ check_interval = 2 # Check every 2 seconds
5565
+ start_time = time.time()
5566
+ remaining_pool_jobs = _get_nonterminal_jobs(pool_names, all)
5567
+ while (remaining_pool_jobs and
5568
+ time.time() - start_time < max_wait_time):
5569
+ # Check remaining jobs via API
5570
+ time.sleep(check_interval)
5571
+ remaining_pool_jobs = _get_nonterminal_jobs(pool_names, all)
5572
+ ux_utils.spinner_message(
5573
+ f'Waiting for {len(remaining_pool_jobs)} '
5574
+ 'jobs to be cancelled...')
5575
+
5576
+ click.echo('\r' + ' ' * 80 + '\r', nl=False)
5577
+ if time.time() - start_time >= max_wait_time:
5578
+ click.echo(
5579
+ f'{colorama.Fore.YELLOW}Warning: Timeout waiting '
5580
+ f'for jobs to finish. Proceeding with pool down '
5581
+ f'anyway.{colorama.Style.RESET_ALL}')
5582
+ else:
5583
+ click.echo('All jobs cancelled.')
5584
+ except Exception as e: # pylint: disable=broad-except
5585
+ # If API call fails, log warning but continue with pool down
5586
+ logger.warning(
5587
+ f'Failed to check for running jobs in pool(s): {pool_names!r}: {e}.'
5588
+ ' Proceeding with pool down.')
5589
+
5590
+ if not yes and not already_confirmed:
5027
5591
  click.confirm(f'Terminating {pool_identity_str}. Proceed?',
5028
5592
  default=True,
5029
5593
  abort=True,
@@ -5205,22 +5769,22 @@ def jobs_pool_logs(
5205
5769
  .. code-block:: bash
5206
5770
 
5207
5771
  # Tail the controller logs of a pool
5208
- sky pool logs --controller [POOL_NAME]
5772
+ sky jobs pool logs --controller [POOL_NAME]
5209
5773
  \b
5210
5774
  # Print the worker logs so far and exit
5211
- sky pool logs --no-follow [POOL_NAME]
5775
+ sky jobs pool logs --no-follow [POOL_NAME] 1
5212
5776
  \b
5213
5777
  # Tail the logs of worker 1
5214
- sky pool logs [POOL_NAME] 1
5778
+ sky jobs pool logs [POOL_NAME] 1
5215
5779
  \b
5216
5780
  # Show the last 100 lines of the controller logs
5217
- sky pool logs --controller --tail 100 [POOL_NAME]
5781
+ sky jobs pool logs --controller --tail 100 [POOL_NAME]
5218
5782
  \b
5219
5783
  # Sync down all logs of the pool (controller, all workers)
5220
- sky pool logs [POOL_NAME] --sync-down
5784
+ sky jobs pool logs [POOL_NAME] --sync-down
5221
5785
  \b
5222
5786
  # Sync down controller logs and logs for workers 1 and 3
5223
- sky pool logs [POOL_NAME] 1 3 --controller --sync-down
5787
+ sky jobs pool logs [POOL_NAME] 1 3 --controller --sync-down
5224
5788
  """
5225
5789
  _handle_serve_logs(pool_name,
5226
5790
  follow=follow,
@@ -5236,7 +5800,15 @@ def jobs_pool_logs(
5236
5800
  @flags.config_option(expose_value=False)
5237
5801
  @usage_lib.entrypoint
5238
5802
  def dashboard() -> None:
5239
- """Starts the dashboard for skypilot."""
5803
+ """Opens the SkyPilot dashboard."""
5804
+ sdk.dashboard()
5805
+
5806
+
5807
+ @cli.command(cls=_DocumentedCodeCommand, hidden=True)
5808
+ @flags.config_option(expose_value=False)
5809
+ @usage_lib.entrypoint
5810
+ def ui() -> None:
5811
+ """Opens the SkyPilot dashboard."""
5240
5812
  sdk.dashboard()
5241
5813
 
5242
5814
 
@@ -5247,28 +5819,30 @@ def serve():
5247
5819
 
5248
5820
 
5249
5821
  def _generate_task_with_service(
5250
- service_name: str,
5251
- service_yaml_args: Tuple[str, ...],
5252
- workdir: Optional[str],
5253
- cloud: Optional[str],
5254
- region: Optional[str],
5255
- zone: Optional[str],
5256
- num_nodes: Optional[int],
5257
- use_spot: Optional[bool],
5258
- image_id: Optional[str],
5259
- env_file: Optional[Dict[str, str]],
5260
- env: List[Tuple[str, str]],
5261
- secret: Optional[List[Tuple[str, str]]],
5262
- gpus: Optional[str],
5263
- instance_type: Optional[str],
5264
- ports: Optional[Tuple[str]],
5265
- cpus: Optional[str],
5266
- memory: Optional[str],
5267
- disk_size: Optional[int],
5268
- disk_tier: Optional[str],
5269
- network_tier: Optional[str],
5270
- not_supported_cmd: str,
5271
- pool: bool, # pylint: disable=redefined-outer-name
5822
+ service_name: str,
5823
+ service_yaml_args: Tuple[str, ...],
5824
+ workdir: Optional[str],
5825
+ cloud: Optional[str],
5826
+ region: Optional[str],
5827
+ zone: Optional[str],
5828
+ num_nodes: Optional[int],
5829
+ use_spot: Optional[bool],
5830
+ image_id: Optional[str],
5831
+ env_file: Optional[Dict[str, str]],
5832
+ env: List[Tuple[str, str]],
5833
+ secret: Optional[List[Tuple[str, str]]],
5834
+ gpus: Optional[str],
5835
+ instance_type: Optional[str],
5836
+ ports: Optional[Tuple[str]],
5837
+ cpus: Optional[str],
5838
+ memory: Optional[str],
5839
+ disk_size: Optional[int],
5840
+ disk_tier: Optional[str],
5841
+ network_tier: Optional[str],
5842
+ not_supported_cmd: str,
5843
+ pool: bool, # pylint: disable=redefined-outer-name
5844
+ git_url: Optional[str] = None,
5845
+ git_ref: Optional[str] = None,
5272
5846
  ) -> task_lib.Task:
5273
5847
  """Generate a task with service section from a service YAML file."""
5274
5848
  is_yaml, _ = _check_yaml(''.join(service_yaml_args))
@@ -5298,6 +5872,8 @@ def _generate_task_with_service(
5298
5872
  disk_tier=disk_tier,
5299
5873
  network_tier=network_tier,
5300
5874
  ports=ports,
5875
+ git_url=git_url,
5876
+ git_ref=git_ref,
5301
5877
  )
5302
5878
  if isinstance(task, dag_lib.Dag):
5303
5879
  raise click.UsageError(
@@ -5313,7 +5889,7 @@ def _generate_task_with_service(
5313
5889
  if task.service.pool:
5314
5890
  if task.service.ports is not None or ports:
5315
5891
  with ux_utils.print_exception_no_traceback():
5316
- raise ValueError('Cannot specify ports in a cluster pool.')
5892
+ raise ValueError('Cannot specify ports in a pool.')
5317
5893
  return task
5318
5894
 
5319
5895
  # NOTE(yi): we only allow one service port now.
@@ -5389,6 +5965,10 @@ def _generate_task_with_service(
5389
5965
  type=str,
5390
5966
  help='A service name. Unique for each service. If not provided, '
5391
5967
  'a unique name is autogenerated.')
5968
+ @click.option('--git-url', type=str, help='Git repository URL.')
5969
+ @click.option('--git-ref',
5970
+ type=str,
5971
+ help='Git reference (branch, tag, or commit hash) to use.')
5392
5972
  @_add_click_options(flags.TASK_OPTIONS + flags.EXTRA_RESOURCES_OPTIONS +
5393
5973
  flags.COMMON_OPTIONS)
5394
5974
  @flags.yes_option()
@@ -5418,6 +5998,8 @@ def serve_up(
5418
5998
  network_tier: Optional[str],
5419
5999
  yes: bool,
5420
6000
  async_call: bool,
6001
+ git_url: Optional[str] = None,
6002
+ git_ref: Optional[str] = None,
5421
6003
  ):
5422
6004
  """Launch a SkyServe service.
5423
6005
 
@@ -5475,6 +6057,8 @@ def serve_up(
5475
6057
  ports=ports,
5476
6058
  not_supported_cmd='sky serve up',
5477
6059
  pool=False,
6060
+ git_url=git_url,
6061
+ git_ref=git_ref,
5478
6062
  )
5479
6063
  assert task.service is not None
5480
6064
  if task.service.pool:
@@ -5556,6 +6140,8 @@ def serve_update(
5556
6140
  sky serve update --mode blue_green sky-service-16aa new_service.yaml
5557
6141
 
5558
6142
  """
6143
+ # TODO(lloyd-brown): Add a way to update number of replicas for serve
6144
+ # the way we did for pools.
5559
6145
  cloud, region, zone = _handle_infra_cloud_region_zone_options(
5560
6146
  infra, cloud, region, zone)
5561
6147
  task = _generate_task_with_service(
@@ -5918,94 +6504,39 @@ def local():
5918
6504
  help='Launch cluster without GPU support even '
5919
6505
  'if GPUs are detected on the host.')
5920
6506
  @click.option(
5921
- '--ips',
6507
+ '--name',
5922
6508
  type=str,
5923
6509
  required=False,
5924
- help='Path to the file containing IP addresses of remote machines.')
5925
- @click.option('--ssh-user',
5926
- type=str,
5927
- required=False,
5928
- help='SSH username for accessing remote machines.')
5929
- @click.option('--ssh-key-path',
5930
- type=str,
5931
- required=False,
5932
- help='Path to the SSH private key.')
5933
- @click.option('--cleanup',
5934
- is_flag=True,
5935
- help='Clean up the remote cluster instead of deploying it.')
6510
+ help='Name of the cluster. Defaults to "skypilot". Used without ip list.')
5936
6511
  @click.option(
5937
- '--context-name',
5938
- type=str,
6512
+ '--port-start',
6513
+ type=int,
5939
6514
  required=False,
5940
- help='Name to use for the kubeconfig context. Defaults to "default".')
5941
- @click.option('--password',
5942
- type=str,
5943
- required=False,
5944
- help='Password for the ssh-user to execute sudo commands. '
5945
- 'Required only if passwordless sudo is not setup.')
6515
+ help='Starting port range for the local kind cluster. Needs to be a '
6516
+ 'multiple of 100. If not given, a random range will be used. '
6517
+ 'Used without ip list.')
5946
6518
  @local.command('up', cls=_DocumentedCodeCommand)
5947
6519
  @flags.config_option(expose_value=False)
5948
6520
  @_add_click_options(flags.COMMON_OPTIONS)
5949
6521
  @usage_lib.entrypoint
5950
- def local_up(gpus: bool, ips: str, ssh_user: str, ssh_key_path: str,
5951
- cleanup: bool, context_name: Optional[str],
5952
- password: Optional[str], async_call: bool):
5953
- """Creates a local or remote cluster."""
5954
-
5955
- def _validate_args(ips, ssh_user, ssh_key_path, cleanup):
5956
- # If any of --ips, --ssh-user, or --ssh-key-path is specified,
5957
- # all must be specified
5958
- if bool(ips) or bool(ssh_user) or bool(ssh_key_path):
5959
- if not (ips and ssh_user and ssh_key_path):
5960
- raise click.BadParameter(
5961
- 'All --ips, --ssh-user, and --ssh-key-path '
5962
- 'must be specified together.')
5963
-
5964
- # --cleanup can only be used if --ips, --ssh-user and --ssh-key-path
5965
- # are all provided
5966
- if cleanup and not (ips and ssh_user and ssh_key_path):
5967
- raise click.BadParameter('--cleanup can only be used with '
5968
- '--ips, --ssh-user and --ssh-key-path.')
5969
-
5970
- _validate_args(ips, ssh_user, ssh_key_path, cleanup)
5971
-
5972
- # If remote deployment arguments are specified, run remote up script
5973
- ip_list = None
5974
- ssh_key = None
5975
- if ips and ssh_user and ssh_key_path:
5976
- # Read and validate IP file
5977
- try:
5978
- with open(os.path.expanduser(ips), 'r', encoding='utf-8') as f:
5979
- ip_list = f.read().strip().splitlines()
5980
- if not ip_list:
5981
- raise click.BadParameter(f'IP file is empty: {ips}')
5982
- except (IOError, OSError) as e:
5983
- raise click.BadParameter(f'Failed to read IP file {ips}: {str(e)}')
5984
-
5985
- # Read and validate SSH key file
5986
- try:
5987
- with open(os.path.expanduser(ssh_key_path), 'r',
5988
- encoding='utf-8') as f:
5989
- ssh_key = f.read()
5990
- if not ssh_key:
5991
- raise click.BadParameter(
5992
- f'SSH key file is empty: {ssh_key_path}')
5993
- except (IOError, OSError) as e:
5994
- raise click.BadParameter(
5995
- f'Failed to read SSH key file {ssh_key_path}: {str(e)}')
5996
-
5997
- request_id = sdk.local_up(gpus, ip_list, ssh_user, ssh_key, cleanup,
5998
- context_name, password)
6522
+ def local_up(gpus: bool, name: Optional[str], port_start: Optional[int],
6523
+ async_call: bool):
6524
+ """Creates a local cluster."""
6525
+ request_id = sdk.local_up(gpus, name, port_start)
5999
6526
  _async_call_or_wait(request_id, async_call, request_name='local up')
6000
6527
 
6001
6528
 
6529
+ @click.option('--name',
6530
+ type=str,
6531
+ required=False,
6532
+ help='Name of the cluster to down. Defaults to "skypilot".')
6002
6533
  @local.command('down', cls=_DocumentedCodeCommand)
6003
6534
  @flags.config_option(expose_value=False)
6004
6535
  @_add_click_options(flags.COMMON_OPTIONS)
6005
6536
  @usage_lib.entrypoint
6006
- def local_down(async_call: bool):
6537
+ def local_down(name: Optional[str], async_call: bool):
6007
6538
  """Deletes a local cluster."""
6008
- request_id = sdk.local_down()
6539
+ request_id = sdk.local_down(name)
6009
6540
  _async_call_or_wait(request_id, async_call, request_name='sky.local.down')
6010
6541
 
6011
6542
 
@@ -6119,20 +6650,22 @@ def api_logs(request_id: Optional[str], server_logs: bool,
6119
6650
  **_get_shell_complete_args(_complete_api_request))
6120
6651
  @flags.all_option('Cancel all your requests.')
6121
6652
  @flags.all_users_option('Cancel all requests from all users.')
6653
+ @flags.yes_option()
6122
6654
  @usage_lib.entrypoint
6123
6655
  # pylint: disable=redefined-builtin
6124
- def api_cancel(request_ids: Optional[List[str]], all: bool, all_users: bool):
6656
+ def api_cancel(request_ids: Optional[List[str]], all: bool, all_users: bool,
6657
+ yes: bool):
6125
6658
  """Cancel a request running on SkyPilot API server."""
6126
6659
  if all or all_users:
6127
- keyword = 'ALL USERS\'' if all_users else 'YOUR'
6128
- user_input = click.prompt(
6129
- f'This will cancel all {keyword} requests.\n'
6130
- f'To proceed, please type {colorama.Style.BRIGHT}'
6131
- f'\'cancel all requests\'{colorama.Style.RESET_ALL}',
6132
- type=str)
6133
- if user_input != 'cancel all requests':
6134
- raise click.Abort()
6135
- if all:
6660
+ if not yes:
6661
+ keyword = 'ALL USERS\'' if all_users else 'YOUR'
6662
+ user_input = click.prompt(
6663
+ f'This will cancel all {keyword} requests.\n'
6664
+ f'To proceed, please type {colorama.Style.BRIGHT}'
6665
+ f'\'cancel all requests\'{colorama.Style.RESET_ALL}',
6666
+ type=str)
6667
+ if user_input != 'cancel all requests':
6668
+ raise click.Abort()
6136
6669
  request_ids = None
6137
6670
  cancelled_request_ids = sdk.get(
6138
6671
  sdk.api_cancel(request_ids=request_ids, all_users=all_users))
@@ -6146,9 +6679,28 @@ def api_cancel(request_ids: Optional[List[str]], all: bool, all_users: bool):
6146
6679
  fg='green')
6147
6680
 
6148
6681
 
6682
+ class IntOrNone(click.ParamType):
6683
+ """Int or None"""
6684
+ name = 'int-or-none'
6685
+
6686
+ def convert(self, value, param, ctx):
6687
+ if isinstance(value, int):
6688
+ return value
6689
+ if isinstance(value, str) and value.lower() in ('none', 'all'):
6690
+ return None
6691
+ try:
6692
+ return int(value)
6693
+ except ValueError:
6694
+ self.fail(f'{value!r} is not a valid integer or "none" or "all"',
6695
+ param, ctx)
6696
+
6697
+
6698
+ INT_OR_NONE = IntOrNone()
6699
+
6700
+
6149
6701
  @api.command('status', cls=_DocumentedCodeCommand)
6150
6702
  @flags.config_option(expose_value=False)
6151
- @click.argument('request_ids',
6703
+ @click.argument('request_id_prefixes',
6152
6704
  required=False,
6153
6705
  type=str,
6154
6706
  nargs=-1,
@@ -6158,16 +6710,30 @@ def api_cancel(request_ids: Optional[List[str]], all: bool, all_users: bool):
6158
6710
  is_flag=True,
6159
6711
  default=False,
6160
6712
  required=False,
6161
- help='Show requests of all statuses.')
6713
+ help=('Show requests of all statuses, including finished ones '
6714
+ '(SUCCEEDED, FAILED, CANCELLED). By default, only active '
6715
+ 'requests (PENDING, RUNNING) are shown.'))
6716
+ @click.option(
6717
+ '--limit',
6718
+ '-l',
6719
+ default=_NUM_REQUESTS_TO_SHOW,
6720
+ type=INT_OR_NONE,
6721
+ required=False,
6722
+ help=(f'Number of requests to show, default is {_NUM_REQUESTS_TO_SHOW},'
6723
+ f' set to "none" or "all" to show all requests.'))
6162
6724
  @flags.verbose_option('Show more details.')
6163
6725
  @usage_lib.entrypoint
6164
6726
  # pylint: disable=redefined-builtin
6165
- def api_status(request_ids: Optional[List[str]], all_status: bool,
6166
- verbose: bool):
6727
+ def api_status(request_id_prefixes: Optional[List[str]], all_status: bool,
6728
+ verbose: bool, limit: Optional[int]):
6167
6729
  """List requests on SkyPilot API server."""
6168
- if not request_ids:
6169
- request_ids = None
6170
- request_list = sdk.api_status(request_ids, all_status)
6730
+ if not request_id_prefixes:
6731
+ request_id_prefixes = None
6732
+ fields = _DEFAULT_REQUEST_FIELDS_TO_SHOW
6733
+ if verbose:
6734
+ fields = _VERBOSE_REQUEST_FIELDS_TO_SHOW
6735
+ request_list = sdk.api_status(request_id_prefixes, all_status, limit,
6736
+ fields)
6171
6737
  columns = ['ID', 'User', 'Name']
6172
6738
  if verbose:
6173
6739
  columns.append('Cluster')
@@ -6193,8 +6759,12 @@ def api_status(request_ids: Optional[List[str]], all_status: bool,
6193
6759
  if verbose:
6194
6760
  dummy_row.append('-')
6195
6761
  table.add_row(dummy_row)
6196
- click.echo()
6197
6762
  click.echo(table)
6763
+ if limit and len(request_list) >= limit:
6764
+ click.echo()
6765
+ click.echo(
6766
+ f'Showing {limit} requests. Use "-l none" or "-l all" to show'
6767
+ f' all requests.')
6198
6768
 
6199
6769
 
6200
6770
  @api.command('login', cls=_DocumentedCodeCommand)