skypilot-nightly 1.0.0.dev20250905__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl

This diff compares two publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their respective public registries.
Files changed (397)
  1. sky/__init__.py +10 -2
  2. sky/adaptors/aws.py +81 -16
  3. sky/adaptors/common.py +25 -2
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/gcp.py +11 -0
  7. sky/adaptors/ibm.py +5 -2
  8. sky/adaptors/kubernetes.py +64 -0
  9. sky/adaptors/nebius.py +3 -1
  10. sky/adaptors/primeintellect.py +1 -0
  11. sky/adaptors/seeweb.py +183 -0
  12. sky/adaptors/shadeform.py +89 -0
  13. sky/admin_policy.py +20 -0
  14. sky/authentication.py +157 -263
  15. sky/backends/__init__.py +3 -2
  16. sky/backends/backend.py +11 -3
  17. sky/backends/backend_utils.py +588 -184
  18. sky/backends/cloud_vm_ray_backend.py +1088 -904
  19. sky/backends/local_docker_backend.py +9 -5
  20. sky/backends/task_codegen.py +633 -0
  21. sky/backends/wheel_utils.py +18 -0
  22. sky/catalog/__init__.py +8 -0
  23. sky/catalog/aws_catalog.py +4 -0
  24. sky/catalog/common.py +19 -1
  25. sky/catalog/data_fetchers/fetch_aws.py +102 -80
  26. sky/catalog/data_fetchers/fetch_gcp.py +30 -3
  27. sky/catalog/data_fetchers/fetch_nebius.py +9 -6
  28. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  29. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  30. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  31. sky/catalog/kubernetes_catalog.py +24 -28
  32. sky/catalog/primeintellect_catalog.py +95 -0
  33. sky/catalog/runpod_catalog.py +5 -1
  34. sky/catalog/seeweb_catalog.py +184 -0
  35. sky/catalog/shadeform_catalog.py +165 -0
  36. sky/check.py +73 -43
  37. sky/client/cli/command.py +675 -412
  38. sky/client/cli/flags.py +4 -2
  39. sky/{volumes/utils.py → client/cli/table_utils.py} +111 -13
  40. sky/client/cli/utils.py +79 -0
  41. sky/client/common.py +12 -2
  42. sky/client/sdk.py +132 -63
  43. sky/client/sdk_async.py +34 -33
  44. sky/cloud_stores.py +82 -3
  45. sky/clouds/__init__.py +6 -0
  46. sky/clouds/aws.py +337 -129
  47. sky/clouds/azure.py +24 -18
  48. sky/clouds/cloud.py +40 -13
  49. sky/clouds/cudo.py +16 -13
  50. sky/clouds/do.py +9 -7
  51. sky/clouds/fluidstack.py +12 -5
  52. sky/clouds/gcp.py +14 -7
  53. sky/clouds/hyperbolic.py +12 -5
  54. sky/clouds/ibm.py +12 -5
  55. sky/clouds/kubernetes.py +80 -45
  56. sky/clouds/lambda_cloud.py +12 -5
  57. sky/clouds/nebius.py +23 -9
  58. sky/clouds/oci.py +19 -12
  59. sky/clouds/paperspace.py +4 -1
  60. sky/clouds/primeintellect.py +317 -0
  61. sky/clouds/runpod.py +85 -24
  62. sky/clouds/scp.py +12 -8
  63. sky/clouds/seeweb.py +477 -0
  64. sky/clouds/shadeform.py +400 -0
  65. sky/clouds/ssh.py +4 -2
  66. sky/clouds/utils/scp_utils.py +61 -50
  67. sky/clouds/vast.py +33 -27
  68. sky/clouds/vsphere.py +14 -16
  69. sky/core.py +174 -165
  70. sky/dashboard/out/404.html +1 -1
  71. sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
  72. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  73. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
  74. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  75. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  76. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
  77. sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
  78. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
  79. sky/dashboard/out/_next/static/chunks/{6601-06114c982db410b6.js → 3800-7b45f9fbb6308557.js} +1 -1
  80. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
  81. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  82. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  83. sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
  84. sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
  85. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  86. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
  87. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  88. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
  89. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  90. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  91. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
  92. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  93. sky/dashboard/out/_next/static/chunks/pages/{_app-ce361c6959bc2001.js → _app-bde01e4a2beec258.js} +1 -1
  94. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
  95. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
  96. sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
  97. sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-6563820e094f68ca.js → [context]-c0b5935149902e6f.js} +1 -1
  98. sky/dashboard/out/_next/static/chunks/pages/{infra-aabba60d57826e0f.js → infra-aed0ea19df7cf961.js} +1 -1
  99. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
  100. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
  101. sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
  102. sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
  103. sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
  104. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-af76bb06dbb3954f.js → [name]-84a40f8c7c627fe4.js} +1 -1
  105. sky/dashboard/out/_next/static/chunks/pages/{workspaces-7598c33a746cdc91.js → workspaces-531b2f8c4bf89f82.js} +1 -1
  106. sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
  107. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  108. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  109. sky/dashboard/out/clusters/[cluster].html +1 -1
  110. sky/dashboard/out/clusters.html +1 -1
  111. sky/dashboard/out/config.html +1 -1
  112. sky/dashboard/out/index.html +1 -1
  113. sky/dashboard/out/infra/[context].html +1 -1
  114. sky/dashboard/out/infra.html +1 -1
  115. sky/dashboard/out/jobs/[job].html +1 -1
  116. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  117. sky/dashboard/out/jobs.html +1 -1
  118. sky/dashboard/out/users.html +1 -1
  119. sky/dashboard/out/volumes.html +1 -1
  120. sky/dashboard/out/workspace/new.html +1 -1
  121. sky/dashboard/out/workspaces/[name].html +1 -1
  122. sky/dashboard/out/workspaces.html +1 -1
  123. sky/data/data_utils.py +92 -1
  124. sky/data/mounting_utils.py +162 -29
  125. sky/data/storage.py +200 -19
  126. sky/data/storage_utils.py +10 -45
  127. sky/exceptions.py +18 -7
  128. sky/execution.py +74 -31
  129. sky/global_user_state.py +605 -191
  130. sky/jobs/__init__.py +2 -0
  131. sky/jobs/client/sdk.py +101 -4
  132. sky/jobs/client/sdk_async.py +31 -5
  133. sky/jobs/constants.py +15 -8
  134. sky/jobs/controller.py +726 -284
  135. sky/jobs/file_content_utils.py +128 -0
  136. sky/jobs/log_gc.py +193 -0
  137. sky/jobs/recovery_strategy.py +250 -100
  138. sky/jobs/scheduler.py +271 -173
  139. sky/jobs/server/core.py +367 -114
  140. sky/jobs/server/server.py +81 -35
  141. sky/jobs/server/utils.py +89 -35
  142. sky/jobs/state.py +1498 -620
  143. sky/jobs/utils.py +771 -306
  144. sky/logs/agent.py +40 -5
  145. sky/logs/aws.py +9 -19
  146. sky/metrics/utils.py +282 -39
  147. sky/optimizer.py +1 -1
  148. sky/provision/__init__.py +37 -1
  149. sky/provision/aws/config.py +34 -13
  150. sky/provision/aws/instance.py +5 -2
  151. sky/provision/azure/instance.py +5 -3
  152. sky/provision/common.py +2 -0
  153. sky/provision/cudo/instance.py +4 -3
  154. sky/provision/do/instance.py +4 -3
  155. sky/provision/docker_utils.py +97 -26
  156. sky/provision/fluidstack/instance.py +6 -5
  157. sky/provision/gcp/config.py +6 -1
  158. sky/provision/gcp/instance.py +4 -2
  159. sky/provision/hyperbolic/instance.py +4 -2
  160. sky/provision/instance_setup.py +66 -20
  161. sky/provision/kubernetes/__init__.py +2 -0
  162. sky/provision/kubernetes/config.py +7 -44
  163. sky/provision/kubernetes/constants.py +0 -1
  164. sky/provision/kubernetes/instance.py +609 -213
  165. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  166. sky/provision/kubernetes/network.py +12 -8
  167. sky/provision/kubernetes/network_utils.py +8 -25
  168. sky/provision/kubernetes/utils.py +382 -418
  169. sky/provision/kubernetes/volume.py +150 -18
  170. sky/provision/lambda_cloud/instance.py +16 -13
  171. sky/provision/nebius/instance.py +6 -2
  172. sky/provision/nebius/utils.py +103 -86
  173. sky/provision/oci/instance.py +4 -2
  174. sky/provision/paperspace/instance.py +4 -3
  175. sky/provision/primeintellect/__init__.py +10 -0
  176. sky/provision/primeintellect/config.py +11 -0
  177. sky/provision/primeintellect/instance.py +454 -0
  178. sky/provision/primeintellect/utils.py +398 -0
  179. sky/provision/provisioner.py +30 -9
  180. sky/provision/runpod/__init__.py +2 -0
  181. sky/provision/runpod/instance.py +4 -3
  182. sky/provision/runpod/volume.py +69 -13
  183. sky/provision/scp/instance.py +307 -130
  184. sky/provision/seeweb/__init__.py +11 -0
  185. sky/provision/seeweb/config.py +13 -0
  186. sky/provision/seeweb/instance.py +812 -0
  187. sky/provision/shadeform/__init__.py +11 -0
  188. sky/provision/shadeform/config.py +12 -0
  189. sky/provision/shadeform/instance.py +351 -0
  190. sky/provision/shadeform/shadeform_utils.py +83 -0
  191. sky/provision/vast/instance.py +5 -3
  192. sky/provision/volume.py +164 -0
  193. sky/provision/vsphere/common/ssl_helper.py +1 -1
  194. sky/provision/vsphere/common/vapiconnect.py +2 -1
  195. sky/provision/vsphere/common/vim_utils.py +3 -2
  196. sky/provision/vsphere/instance.py +8 -6
  197. sky/provision/vsphere/vsphere_utils.py +8 -1
  198. sky/resources.py +11 -3
  199. sky/schemas/api/responses.py +107 -6
  200. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  201. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  202. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  203. sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
  204. sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
  205. sky/schemas/db/serve_state/002_yaml_content.py +34 -0
  206. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  207. sky/schemas/db/spot_jobs/002_cluster_pool.py +3 -3
  208. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  209. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  210. sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
  211. sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
  212. sky/schemas/generated/jobsv1_pb2.py +86 -0
  213. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  214. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  215. sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
  216. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  217. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  218. sky/schemas/generated/servev1_pb2.py +58 -0
  219. sky/schemas/generated/servev1_pb2.pyi +115 -0
  220. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  221. sky/serve/autoscalers.py +2 -0
  222. sky/serve/client/impl.py +55 -21
  223. sky/serve/constants.py +4 -3
  224. sky/serve/controller.py +17 -11
  225. sky/serve/load_balancing_policies.py +1 -1
  226. sky/serve/replica_managers.py +219 -142
  227. sky/serve/serve_rpc_utils.py +179 -0
  228. sky/serve/serve_state.py +63 -54
  229. sky/serve/serve_utils.py +145 -109
  230. sky/serve/server/core.py +46 -25
  231. sky/serve/server/impl.py +311 -162
  232. sky/serve/server/server.py +21 -19
  233. sky/serve/service.py +84 -68
  234. sky/serve/service_spec.py +45 -7
  235. sky/server/auth/loopback.py +38 -0
  236. sky/server/auth/oauth2_proxy.py +12 -7
  237. sky/server/common.py +47 -24
  238. sky/server/config.py +62 -28
  239. sky/server/constants.py +9 -1
  240. sky/server/daemons.py +109 -38
  241. sky/server/metrics.py +76 -96
  242. sky/server/middleware_utils.py +166 -0
  243. sky/server/requests/executor.py +381 -145
  244. sky/server/requests/payloads.py +71 -18
  245. sky/server/requests/preconditions.py +15 -13
  246. sky/server/requests/request_names.py +121 -0
  247. sky/server/requests/requests.py +507 -157
  248. sky/server/requests/serializers/decoders.py +48 -17
  249. sky/server/requests/serializers/encoders.py +85 -20
  250. sky/server/requests/threads.py +117 -0
  251. sky/server/rest.py +116 -24
  252. sky/server/server.py +420 -172
  253. sky/server/stream_utils.py +219 -45
  254. sky/server/uvicorn.py +30 -19
  255. sky/setup_files/MANIFEST.in +6 -1
  256. sky/setup_files/alembic.ini +8 -0
  257. sky/setup_files/dependencies.py +62 -19
  258. sky/setup_files/setup.py +44 -44
  259. sky/sky_logging.py +13 -5
  260. sky/skylet/attempt_skylet.py +106 -24
  261. sky/skylet/configs.py +3 -1
  262. sky/skylet/constants.py +111 -26
  263. sky/skylet/events.py +64 -10
  264. sky/skylet/job_lib.py +141 -104
  265. sky/skylet/log_lib.py +233 -5
  266. sky/skylet/log_lib.pyi +40 -2
  267. sky/skylet/providers/ibm/node_provider.py +12 -8
  268. sky/skylet/providers/ibm/vpc_provider.py +13 -12
  269. sky/skylet/runtime_utils.py +21 -0
  270. sky/skylet/services.py +524 -0
  271. sky/skylet/skylet.py +22 -1
  272. sky/skylet/subprocess_daemon.py +104 -29
  273. sky/skypilot_config.py +99 -79
  274. sky/ssh_node_pools/server.py +9 -8
  275. sky/task.py +221 -104
  276. sky/templates/aws-ray.yml.j2 +1 -0
  277. sky/templates/azure-ray.yml.j2 +1 -0
  278. sky/templates/cudo-ray.yml.j2 +1 -0
  279. sky/templates/do-ray.yml.j2 +1 -0
  280. sky/templates/fluidstack-ray.yml.j2 +1 -0
  281. sky/templates/gcp-ray.yml.j2 +1 -0
  282. sky/templates/hyperbolic-ray.yml.j2 +1 -0
  283. sky/templates/ibm-ray.yml.j2 +2 -1
  284. sky/templates/jobs-controller.yaml.j2 +3 -0
  285. sky/templates/kubernetes-ray.yml.j2 +196 -55
  286. sky/templates/lambda-ray.yml.j2 +1 -0
  287. sky/templates/nebius-ray.yml.j2 +3 -0
  288. sky/templates/oci-ray.yml.j2 +1 -0
  289. sky/templates/paperspace-ray.yml.j2 +1 -0
  290. sky/templates/primeintellect-ray.yml.j2 +72 -0
  291. sky/templates/runpod-ray.yml.j2 +1 -0
  292. sky/templates/scp-ray.yml.j2 +1 -0
  293. sky/templates/seeweb-ray.yml.j2 +171 -0
  294. sky/templates/shadeform-ray.yml.j2 +73 -0
  295. sky/templates/vast-ray.yml.j2 +1 -0
  296. sky/templates/vsphere-ray.yml.j2 +1 -0
  297. sky/templates/websocket_proxy.py +188 -43
  298. sky/usage/usage_lib.py +16 -4
  299. sky/users/permission.py +60 -43
  300. sky/utils/accelerator_registry.py +6 -3
  301. sky/utils/admin_policy_utils.py +18 -5
  302. sky/utils/annotations.py +22 -0
  303. sky/utils/asyncio_utils.py +78 -0
  304. sky/utils/atomic.py +1 -1
  305. sky/utils/auth_utils.py +153 -0
  306. sky/utils/cli_utils/status_utils.py +12 -7
  307. sky/utils/cluster_utils.py +28 -6
  308. sky/utils/command_runner.py +88 -27
  309. sky/utils/command_runner.pyi +36 -3
  310. sky/utils/common.py +3 -1
  311. sky/utils/common_utils.py +37 -4
  312. sky/utils/config_utils.py +1 -14
  313. sky/utils/context.py +127 -40
  314. sky/utils/context_utils.py +73 -18
  315. sky/utils/controller_utils.py +229 -70
  316. sky/utils/db/db_utils.py +95 -18
  317. sky/utils/db/kv_cache.py +149 -0
  318. sky/utils/db/migration_utils.py +24 -7
  319. sky/utils/env_options.py +4 -0
  320. sky/utils/git.py +559 -1
  321. sky/utils/kubernetes/create_cluster.sh +15 -30
  322. sky/utils/kubernetes/delete_cluster.sh +10 -7
  323. sky/utils/kubernetes/{deploy_remote_cluster.py → deploy_ssh_node_pools.py} +258 -380
  324. sky/utils/kubernetes/generate_kind_config.py +6 -66
  325. sky/utils/kubernetes/gpu_labeler.py +13 -3
  326. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
  327. sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
  328. sky/utils/kubernetes/kubernetes_deploy_utils.py +213 -194
  329. sky/utils/kubernetes/rsync_helper.sh +11 -3
  330. sky/utils/kubernetes_enums.py +7 -15
  331. sky/utils/lock_events.py +4 -4
  332. sky/utils/locks.py +128 -31
  333. sky/utils/log_utils.py +0 -319
  334. sky/utils/resource_checker.py +13 -10
  335. sky/utils/resources_utils.py +53 -29
  336. sky/utils/rich_utils.py +8 -4
  337. sky/utils/schemas.py +107 -52
  338. sky/utils/subprocess_utils.py +17 -4
  339. sky/utils/thread_utils.py +91 -0
  340. sky/utils/timeline.py +2 -1
  341. sky/utils/ux_utils.py +35 -1
  342. sky/utils/volume.py +88 -4
  343. sky/utils/yaml_utils.py +9 -0
  344. sky/volumes/client/sdk.py +48 -10
  345. sky/volumes/server/core.py +59 -22
  346. sky/volumes/server/server.py +46 -17
  347. sky/volumes/volume.py +54 -42
  348. sky/workspaces/core.py +57 -21
  349. sky/workspaces/server.py +13 -12
  350. sky_templates/README.md +3 -0
  351. sky_templates/__init__.py +3 -0
  352. sky_templates/ray/__init__.py +0 -0
  353. sky_templates/ray/start_cluster +183 -0
  354. sky_templates/ray/stop_cluster +75 -0
  355. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/METADATA +331 -65
  356. skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
  357. skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
  358. sky/client/cli/git.py +0 -549
  359. sky/dashboard/out/_next/static/chunks/1121-408ed10b2f9fce17.js +0 -1
  360. sky/dashboard/out/_next/static/chunks/1141-943efc7aff0f0c06.js +0 -1
  361. sky/dashboard/out/_next/static/chunks/1836-37fede578e2da5f8.js +0 -40
  362. sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +0 -1
  363. sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
  364. sky/dashboard/out/_next/static/chunks/3785.4872a2f3aa489880.js +0 -1
  365. sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +0 -21
  366. sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +0 -10
  367. sky/dashboard/out/_next/static/chunks/4725.10f7a9a5d3ea8208.js +0 -1
  368. sky/dashboard/out/_next/static/chunks/5339.3fda4a4010ff4e06.js +0 -51
  369. sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
  370. sky/dashboard/out/_next/static/chunks/649.b9d7f7d10c1b8c53.js +0 -45
  371. sky/dashboard/out/_next/static/chunks/6856-dca7962af4814e1b.js +0 -1
  372. sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
  373. sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +0 -6
  374. sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +0 -18
  375. sky/dashboard/out/_next/static/chunks/7669.1f5d9a402bf5cc42.js +0 -36
  376. sky/dashboard/out/_next/static/chunks/8969-0be3036bf86f8256.js +0 -1
  377. sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
  378. sky/dashboard/out/_next/static/chunks/9037-fa1737818d0a0969.js +0 -6
  379. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1cbba24bd1bd35f8.js +0 -16
  380. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-0b4b35dc1dfe046c.js +0 -16
  381. sky/dashboard/out/_next/static/chunks/pages/clusters-469814d711d63b1b.js +0 -1
  382. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-dd64309c3fe67ed2.js +0 -11
  383. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-07349868f7905d37.js +0 -16
  384. sky/dashboard/out/_next/static/chunks/pages/jobs-1f70d9faa564804f.js +0 -1
  385. sky/dashboard/out/_next/static/chunks/pages/users-018bf31cda52e11b.js +0 -1
  386. sky/dashboard/out/_next/static/chunks/pages/volumes-739726d6b823f532.js +0 -1
  387. sky/dashboard/out/_next/static/chunks/webpack-4fe903277b57b523.js +0 -1
  388. sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
  389. sky/dashboard/out/_next/static/mS-4qZPSkRuA1u-g2wQhg/_buildManifest.js +0 -1
  390. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  391. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  392. skypilot_nightly-1.0.0.dev20250905.dist-info/RECORD +0 -547
  393. skypilot_nightly-1.0.0.dev20250905.dist-info/top_level.txt +0 -1
  394. /sky/dashboard/out/_next/static/{mS-4qZPSkRuA1u-g2wQhg → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
  395. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +0 -0
  396. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
  397. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
sky/client/cli/command.py CHANGED
@@ -32,6 +32,7 @@ import shlex
32
32
  import shutil
33
33
  import subprocess
34
34
  import sys
35
+ import time
35
36
  import traceback
36
37
  import typing
37
38
  from typing import (Any, Callable, Dict, Generator, List, Optional, Set, Tuple,
@@ -59,8 +60,9 @@ from sky import task as task_lib
59
60
  from sky.adaptors import common as adaptors_common
60
61
  from sky.client import sdk
61
62
  from sky.client.cli import flags
62
- from sky.client.cli import git
63
- from sky.data import storage_utils
63
+ from sky.client.cli import table_utils
64
+ from sky.client.cli import utils as cli_utils
65
+ from sky.jobs.state import ManagedJobStatus
64
66
  from sky.provision.kubernetes import constants as kubernetes_constants
65
67
  from sky.provision.kubernetes import utils as kubernetes_utils
66
68
  from sky.schemas.api import responses
@@ -79,7 +81,6 @@ from sky.utils import controller_utils
79
81
  from sky.utils import dag_utils
80
82
  from sky.utils import directory_utils
81
83
  from sky.utils import env_options
82
- from sky.utils import git as git_utils
83
84
  from sky.utils import infra_utils
84
85
  from sky.utils import log_utils
85
86
  from sky.utils import registry
@@ -89,9 +90,9 @@ from sky.utils import status_lib
89
90
  from sky.utils import subprocess_utils
90
91
  from sky.utils import timeline
91
92
  from sky.utils import ux_utils
93
+ from sky.utils import volume as volume_utils
92
94
  from sky.utils import yaml_utils
93
95
  from sky.utils.cli_utils import status_utils
94
- from sky.volumes import utils as volumes_utils
95
96
  from sky.volumes.client import sdk as volumes_sdk
96
97
 
97
98
  if typing.TYPE_CHECKING:
@@ -113,6 +114,24 @@ an autogenerated name."""
113
114
  # command.
114
115
  _NUM_MANAGED_JOBS_TO_SHOW_IN_STATUS = 5
115
116
  _NUM_MANAGED_JOBS_TO_SHOW = 50
117
+ _NUM_REQUESTS_TO_SHOW = 50
118
+ _DEFAULT_REQUEST_FIELDS_TO_SHOW = [
119
+ 'request_id', 'name', 'user_id', 'status', 'created_at'
120
+ ]
121
+ _VERBOSE_REQUEST_FIELDS_TO_SHOW = _DEFAULT_REQUEST_FIELDS_TO_SHOW + [
122
+ 'cluster_name'
123
+ ]
124
+ _DEFAULT_MANAGED_JOB_FIELDS_TO_GET = [
125
+ 'job_id', 'task_id', 'workspace', 'job_name', 'task_name', 'resources',
126
+ 'submitted_at', 'end_at', 'job_duration', 'recovery_count', 'status', 'pool'
127
+ ]
128
+ _VERBOSE_MANAGED_JOB_FIELDS_TO_GET = _DEFAULT_MANAGED_JOB_FIELDS_TO_GET + [
129
+ 'current_cluster_name', 'job_id_on_pool_cluster', 'start_at', 'infra',
130
+ 'cloud', 'region', 'zone', 'cluster_resources', 'schedule_state', 'details',
131
+ 'failure_reason', 'metadata'
132
+ ]
133
+ _USER_NAME_FIELD = ['user_name']
134
+ _USER_HASH_FIELD = ['user_hash']
116
135
 
117
136
  _STATUS_PROPERTY_CLUSTER_NUM_ERROR_MESSAGE = (
118
137
  '{cluster_num} cluster{plural} {verb}. Please specify {cause} '
@@ -129,6 +148,7 @@ def _get_cluster_records_and_set_ssh_config(
129
148
  clusters: Optional[List[str]],
130
149
  refresh: common.StatusRefreshMode = common.StatusRefreshMode.NONE,
131
150
  all_users: bool = False,
151
+ verbose: bool = False,
132
152
  ) -> List[responses.StatusResponse]:
133
153
  """Returns a list of clusters that match the glob pattern.
134
154
 
@@ -146,17 +166,23 @@ def _get_cluster_records_and_set_ssh_config(
146
166
  request_id = sdk.status(clusters,
147
167
  refresh=refresh,
148
168
  all_users=all_users,
149
- _include_credentials=True)
169
+ _include_credentials=True,
170
+ _summary_response=not verbose)
150
171
  cluster_records = sdk.stream_and_get(request_id)
151
172
  # Update the SSH config for all clusters
152
173
  for record in cluster_records:
153
174
  handle = record['handle']
154
-
175
+ name = record['name']
155
176
  if not (handle is not None and handle.cached_external_ips is not None
156
177
  and 'credentials' in record):
157
178
  # If the cluster is not UP or does not have credentials available,
158
179
  # we need to remove the cluster from the SSH config.
159
- cluster_utils.SSHConfigHelper.remove_cluster(record['name'])
180
+ cluster_utils.SSHConfigHelper.remove_cluster(name)
181
+ continue
182
+ if not record['credentials']:
183
+ # The credential is missing for some reason, continue.
184
+ logger.debug(
185
+ f'Client did not receive SSH credential for cluster {name}')
160
186
  continue
161
187
 
162
188
  # During the failover, even though a cluster does not exist, the handle
@@ -783,8 +809,8 @@ def _make_task_or_dag_from_entrypoint_with_overrides(
783
809
 
784
810
  # Update the workdir config from the command line parameters.
785
811
  # And update the envs and secrets from the workdir.
786
- _update_task_workdir(task, workdir, git_url, git_ref)
787
- _update_task_workdir_and_secrets_from_workdir(task)
812
+ task.update_workdir(workdir, git_url, git_ref)
813
+ task.update_envs_and_secrets_from_workdir()
788
814
 
789
815
  # job launch specific.
790
816
  if job_recovery is not None:
@@ -799,73 +825,6 @@ def _make_task_or_dag_from_entrypoint_with_overrides(
799
825
  return task
800
826
 
801
827
 
802
- def _update_task_workdir(task: task_lib.Task, workdir: Optional[str],
803
- git_url: Optional[str], git_ref: Optional[str]):
804
- """Updates the task workdir.
805
-
806
- Args:
807
- task: The task to update.
808
- workdir: The workdir to update.
809
- git_url: The git url to update.
810
- git_ref: The git ref to update.
811
- """
812
- if task.workdir is None or isinstance(task.workdir, str):
813
- if workdir is not None:
814
- task.workdir = workdir
815
- return
816
- if git_url is not None:
817
- task.workdir = {}
818
- task.workdir['url'] = git_url
819
- if git_ref is not None:
820
- task.workdir['ref'] = git_ref
821
- return
822
- return
823
- if git_url is not None:
824
- task.workdir['url'] = git_url
825
- if git_ref is not None:
826
- task.workdir['ref'] = git_ref
827
- return
828
-
829
-
830
- def _update_task_workdir_and_secrets_from_workdir(task: task_lib.Task):
831
- """Updates the task secrets from the workdir.
832
-
833
- Args:
834
- task: The task to update.
835
- """
836
- if task.workdir is None:
837
- return
838
- if not isinstance(task.workdir, dict):
839
- return
840
- url = task.workdir['url']
841
- ref = task.workdir.get('ref', '')
842
- token = os.environ.get(git_utils.GIT_TOKEN_ENV_VAR)
843
- ssh_key_path = os.environ.get(git_utils.GIT_SSH_KEY_PATH_ENV_VAR)
844
- try:
845
- git_repo = git.GitRepo(url, ref, token, ssh_key_path)
846
- clone_info = git_repo.get_repo_clone_info()
847
- if clone_info is None:
848
- return
849
- task.envs[git_utils.GIT_URL_ENV_VAR] = clone_info.url
850
- if ref:
851
- ref_type = git_repo.get_ref_type()
852
- if ref_type == git.GitRefType.COMMIT:
853
- task.envs[git_utils.GIT_COMMIT_HASH_ENV_VAR] = ref
854
- elif ref_type == git.GitRefType.BRANCH:
855
- task.envs[git_utils.GIT_BRANCH_ENV_VAR] = ref
856
- elif ref_type == git.GitRefType.TAG:
857
- task.envs[git_utils.GIT_TAG_ENV_VAR] = ref
858
- if clone_info.token is None and clone_info.ssh_key is None:
859
- return
860
- if clone_info.token is not None:
861
- task.secrets[git_utils.GIT_TOKEN_ENV_VAR] = clone_info.token
862
- if clone_info.ssh_key is not None:
863
- task.secrets[git_utils.GIT_SSH_KEY_ENV_VAR] = clone_info.ssh_key
864
- except exceptions.GitError as e:
865
- with ux_utils.print_exception_no_traceback():
866
- raise ValueError(f'{str(e)}') from None
867
-
868
-
869
828
  class _NaturalOrderGroup(click.Group):
870
829
  """Lists commands in the order defined in this script.
871
830
 
@@ -1160,7 +1119,7 @@ def launch(
1160
1119
  if task.service is not None:
1161
1120
  noun = 'pool' if task.service.pool else 'service'
1162
1121
  capnoun = noun.capitalize()
1163
- sysname = 'Jobs Worker Pool' if task.service.pool else 'SkyServe'
1122
+ sysname = 'Pool' if task.service.pool else 'SkyServe'
1164
1123
  cmd = 'sky jobs pool apply' if task.service.pool else 'sky serve up'
1165
1124
  logger.info(
1166
1125
  f'{colorama.Fore.YELLOW}{capnoun} section will be ignored when '
@@ -1388,14 +1347,24 @@ def exec(
1388
1347
 
1389
1348
 
1390
1349
  def _handle_jobs_queue_request(
1391
- request_id: server_common.RequestId[List[Dict[str, Any]]],
1392
- show_all: bool,
1393
- show_user: bool,
1394
- max_num_jobs_to_show: Optional[int],
1395
- is_called_by_user: bool = False) -> Tuple[Optional[int], str]:
1350
+ request_id: server_common.RequestId[Union[
1351
+ List[responses.ManagedJobRecord],
1352
+ Tuple[List[responses.ManagedJobRecord], int, Dict[str, int], int]]],
1353
+ show_all: bool,
1354
+ show_user: bool,
1355
+ max_num_jobs_to_show: Optional[int],
1356
+ pool_status_request_id: Optional[server_common.RequestId[List[Dict[
1357
+ str, Any]]]] = None,
1358
+ is_called_by_user: bool = False,
1359
+ only_in_progress: bool = False,
1360
+ queue_result_version: cli_utils.QueueResultVersion = cli_utils.
1361
+ QueueResultVersion.V1,
1362
+ ) -> Tuple[Optional[int], str]:
1396
1363
  """Get the in-progress managed jobs.
1397
1364
 
1398
1365
  Args:
1366
+ request_id: The request ID for managed jobs.
1367
+ pool_status_request_id: The request ID for pool status, or None.
1399
1368
  show_all: Show all information of each job (e.g., region, price).
1400
1369
  show_user: Show the user who submitted the job.
1401
1370
  max_num_jobs_to_show: If not None, limit the number of jobs to show to
@@ -1403,6 +1372,8 @@ def _handle_jobs_queue_request(
1403
1372
  and `sky jobs queue`.
1404
1373
  is_called_by_user: If this function is called by user directly, or an
1405
1374
  internal call.
1375
+ only_in_progress: If True, only return the number of in-progress jobs.
1376
+ queue_result_version: The version of the queue result.
1406
1377
 
1407
1378
  Returns:
1408
1379
  A tuple of (num_in_progress_jobs, msg). If num_in_progress_jobs is None,
@@ -1413,11 +1384,47 @@ def _handle_jobs_queue_request(
1413
1384
  # TODO(SKY-980): remove unnecessary fallbacks on the client side.
1414
1385
  num_in_progress_jobs = None
1415
1386
  msg = ''
1387
+ status_counts: Optional[Dict[str, int]] = None
1388
+ pool_status_result = None
1416
1389
  try:
1417
1390
  if not is_called_by_user:
1418
1391
  usage_lib.messages.usage.set_internal()
1419
- managed_jobs_ = sdk.stream_and_get(request_id)
1420
- num_in_progress_jobs = len(set(job['job_id'] for job in managed_jobs_))
1392
+ # Call both stream_and_get functions in parallel
1393
+ def get_jobs_queue_result():
1394
+ return sdk.stream_and_get(request_id)
1395
+
1396
+ def get_pool_status_result():
1397
+ if pool_status_request_id is not None:
1398
+ try:
1399
+ return sdk.stream_and_get(pool_status_request_id)
1400
+ except Exception: # pylint: disable=broad-except
1401
+ # If getting pool status fails, just continue without it
1402
+ return None
1403
+ return None
1404
+
1405
+ with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
1406
+ jobs_future = executor.submit(get_jobs_queue_result)
1407
+ pool_status_future = executor.submit(get_pool_status_result)
1408
+
1409
+ result = jobs_future.result()
1410
+ pool_status_result = pool_status_future.result()
1411
+
1412
+ if queue_result_version.v2():
1413
+ managed_jobs_, total, status_counts, _ = result
1414
+ if only_in_progress:
1415
+ num_in_progress_jobs = 0
1416
+ if status_counts:
1417
+ for status_value, count in status_counts.items():
1418
+ status_enum = managed_jobs.ManagedJobStatus(
1419
+ status_value)
1420
+ if not status_enum.is_terminal():
1421
+ num_in_progress_jobs += count
1422
+ else:
1423
+ num_in_progress_jobs = total
1424
+ else:
1425
+ managed_jobs_ = result
1426
+ num_in_progress_jobs = len(
1427
+ set(job['job_id'] for job in managed_jobs_))
1421
1428
  except exceptions.ClusterNotUpError as e:
1422
1429
  controller_status = e.cluster_status
1423
1430
  msg = str(e)
@@ -1461,10 +1468,14 @@ def _handle_jobs_queue_request(
1461
1468
  msg += ('Failed to query managed jobs: '
1462
1469
  f'{common_utils.format_exception(e, use_bracket=True)}')
1463
1470
  else:
1464
- msg = managed_jobs.format_job_table(managed_jobs_,
1465
- show_all=show_all,
1466
- show_user=show_user,
1467
- max_jobs=max_num_jobs_to_show)
1471
+ msg = table_utils.format_job_table(
1472
+ managed_jobs_,
1473
+ pool_status=pool_status_result,
1474
+ show_all=show_all,
1475
+ show_user=show_user,
1476
+ max_jobs=max_num_jobs_to_show,
1477
+ status_counts=status_counts,
1478
+ )
1468
1479
  return num_in_progress_jobs, msg
1469
1480
 
1470
1481
 
@@ -1562,35 +1573,6 @@ def _handle_services_request(
1562
1573
  return num_services, msg
1563
1574
 
1564
1575
 
1565
- def _status_kubernetes(show_all: bool):
1566
- """Show all SkyPilot resources in the current Kubernetes context.
1567
-
1568
- Args:
1569
- show_all (bool): Show all job information (e.g., start time, failures).
1570
- """
1571
- all_clusters, unmanaged_clusters, all_jobs, context = (sdk.stream_and_get(
1572
- sdk.status_kubernetes()))
1573
- click.echo(f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
1574
- f'Kubernetes cluster state (context: {context})'
1575
- f'{colorama.Style.RESET_ALL}')
1576
- status_utils.show_kubernetes_cluster_status_table(unmanaged_clusters,
1577
- show_all)
1578
- if all_jobs:
1579
- click.echo(f'\n{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
1580
- f'Managed jobs'
1581
- f'{colorama.Style.RESET_ALL}')
1582
- msg = managed_jobs.format_job_table(all_jobs,
1583
- show_all=show_all,
1584
- show_user=False)
1585
- click.echo(msg)
1586
- if any(['sky-serve-controller' in c.cluster_name for c in all_clusters]):
1587
- # TODO: Parse serve controllers and show services separately.
1588
- # Currently we show a hint that services are shown as clusters.
1589
- click.echo(f'\n{colorama.Style.DIM}Hint: SkyServe replica pods are '
1590
- 'shown in the "SkyPilot clusters" section.'
1591
- f'{colorama.Style.RESET_ALL}')
1592
-
1593
-
1594
1576
  def _show_endpoint(query_clusters: Optional[List[str]],
1595
1577
  cluster_records: List[responses.StatusResponse], ip: bool,
1596
1578
  endpoints: bool, endpoint: Optional[int]) -> None:
@@ -1717,15 +1699,7 @@ def _show_enabled_infra(
1717
1699
  default=True,
1718
1700
  is_flag=True,
1719
1701
  required=False,
1720
- help='Also show cluster pools, if any.')
1721
- @click.option(
1722
- '--kubernetes',
1723
- '--k8s',
1724
- default=False,
1725
- is_flag=True,
1726
- required=False,
1727
- help='[Experimental] Show all SkyPilot resources (including from other '
1728
- 'users) in the current Kubernetes context.')
1702
+ help='Also show pools, if any.')
1729
1703
  @click.argument('clusters',
1730
1704
  required=False,
1731
1705
  type=str,
@@ -1737,8 +1711,8 @@ def _show_enabled_infra(
1737
1711
  # pylint: disable=redefined-builtin
1738
1712
  def status(verbose: bool, refresh: bool, ip: bool, endpoints: bool,
1739
1713
  endpoint: Optional[int], show_managed_jobs: bool,
1740
- show_services: bool, show_pools: bool, kubernetes: bool,
1741
- clusters: List[str], all_users: bool):
1714
+ show_services: bool, show_pools: bool, clusters: List[str],
1715
+ all_users: bool):
1742
1716
  # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
1743
1717
  """Show clusters.
1744
1718
 
@@ -1801,9 +1775,6 @@ def status(verbose: bool, refresh: bool, ip: bool, endpoints: bool,
1801
1775
  or for autostop-enabled clusters, use ``--refresh`` to query the latest
1802
1776
  cluster statuses from the cloud providers.
1803
1777
  """
1804
- if kubernetes:
1805
- _status_kubernetes(verbose)
1806
- return
1807
1778
  # Do not show job queue if user specifies clusters, and if user
1808
1779
  # specifies --ip or --endpoint(s).
1809
1780
  show_managed_jobs = show_managed_jobs and not any([clusters, ip, endpoints])
@@ -1853,9 +1824,16 @@ def status(verbose: bool, refresh: bool, ip: bool, endpoints: bool,
1853
1824
 
1854
1825
  # Phase 2: Parallel submission of all API requests
1855
1826
  def submit_managed_jobs():
1856
- return managed_jobs.queue(refresh=False,
1857
- skip_finished=True,
1858
- all_users=all_users)
1827
+ fields = _DEFAULT_MANAGED_JOB_FIELDS_TO_GET
1828
+ if all_users:
1829
+ fields = fields + _USER_NAME_FIELD
1830
+ return cli_utils.get_managed_job_queue(
1831
+ refresh=False,
1832
+ skip_finished=True,
1833
+ all_users=all_users,
1834
+ fields=fields,
1835
+ limit=_NUM_MANAGED_JOBS_TO_SHOW_IN_STATUS,
1836
+ )
1859
1837
 
1860
1838
  def submit_services(
1861
1839
  ) -> Optional[server_common.RequestId[List[Dict[str, Any]]]]:
@@ -1870,17 +1848,7 @@ def status(verbose: bool, refresh: bool, ip: bool, endpoints: bool,
1870
1848
  return None
1871
1849
 
1872
1850
  def submit_workspace() -> Optional[server_common.RequestId[Dict[str, Any]]]:
1873
- try:
1874
- return sdk.workspaces()
1875
- except RuntimeError:
1876
- # Backward compatibility for API server before #5660.
1877
- # TODO(zhwu): remove this after 0.10.0.
1878
- logger.warning(f'{colorama.Style.DIM}SkyPilot API server is '
1879
- 'in an old version, and may miss feature: '
1880
- 'workspaces. Update with: sky api stop; '
1881
- 'sky api start'
1882
- f'{colorama.Style.RESET_ALL}')
1883
- return None
1851
+ return sdk.workspaces()
1884
1852
 
1885
1853
  active_workspace = skypilot_config.get_active_workspace()
1886
1854
 
@@ -1888,6 +1856,7 @@ def status(verbose: bool, refresh: bool, ip: bool, endpoints: bool,
1888
1856
  return sdk.enabled_clouds(workspace=active_workspace, expand=True)
1889
1857
 
1890
1858
  managed_jobs_queue_request_id = None
1859
+ queue_result_version = cli_utils.QueueResultVersion.V1
1891
1860
  service_status_request_id = None
1892
1861
  workspace_request_id = None
1893
1862
  pool_status_request_id = None
@@ -1906,7 +1875,8 @@ def status(verbose: bool, refresh: bool, ip: bool, endpoints: bool,
1906
1875
 
1907
1876
  # Get the request IDs
1908
1877
  if show_managed_jobs:
1909
- managed_jobs_queue_request_id = managed_jobs_request_future.result()
1878
+ (managed_jobs_queue_request_id,
1879
+ queue_result_version) = managed_jobs_request_future.result()
1910
1880
  if show_services:
1911
1881
  service_status_request_id = services_request_future.result()
1912
1882
  if show_pools:
@@ -1927,7 +1897,7 @@ def status(verbose: bool, refresh: bool, ip: bool, endpoints: bool,
1927
1897
 
1928
1898
  # Phase 3: Get cluster records and handle special cases
1929
1899
  cluster_records = _get_cluster_records_and_set_ssh_config(
1930
- query_clusters, refresh_mode, all_users)
1900
+ query_clusters, refresh_mode, all_users, verbose)
1931
1901
 
1932
1902
  # TOOD(zhwu): setup the ssh config for status
1933
1903
  if ip or show_endpoints:
@@ -1938,7 +1908,8 @@ def status(verbose: bool, refresh: bool, ip: bool, endpoints: bool,
1938
1908
  controllers = []
1939
1909
  for cluster_record in cluster_records:
1940
1910
  cluster_name = cluster_record['name']
1941
- controller = controller_utils.Controllers.from_name(cluster_name)
1911
+ controller = controller_utils.Controllers.from_name(
1912
+ cluster_name, expect_exact_match=False)
1942
1913
  if controller is not None:
1943
1914
  controllers.append(cluster_record)
1944
1915
  else:
@@ -1967,10 +1938,14 @@ def status(verbose: bool, refresh: bool, ip: bool, endpoints: bool,
1967
1938
  try:
1968
1939
  num_in_progress_jobs, msg = _handle_jobs_queue_request(
1969
1940
  managed_jobs_queue_request_id,
1941
+ pool_status_request_id=pool_status_request_id,
1970
1942
  show_all=False,
1971
1943
  show_user=all_users,
1972
1944
  max_num_jobs_to_show=_NUM_MANAGED_JOBS_TO_SHOW_IN_STATUS,
1973
- is_called_by_user=False)
1945
+ is_called_by_user=False,
1946
+ only_in_progress=True,
1947
+ queue_result_version=queue_result_version,
1948
+ )
1974
1949
  except KeyboardInterrupt:
1975
1950
  sdk.api_cancel(managed_jobs_queue_request_id, silent=True)
1976
1951
  managed_jobs_query_interrupted = True
@@ -2066,6 +2041,35 @@ def status(verbose: bool, refresh: bool, ip: bool, endpoints: bool,
2066
2041
  click.echo('\n' + '\n'.join(hints))
2067
2042
 
2068
2043
 
2044
+ @cli.command(hidden=True)
2045
+ @flags.config_option(expose_value=False)
2046
+ @flags.verbose_option()
2047
+ def status_kubernetes(verbose: bool):
2048
+ """[Experimental] Show all SkyPilot resources (including from other '
2049
+ 'users) in the current Kubernetes context."""
2050
+ all_clusters, unmanaged_clusters, all_jobs, context = (sdk.stream_and_get(
2051
+ sdk.status_kubernetes()))
2052
+ click.echo(f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
2053
+ f'Kubernetes cluster state (context: {context})'
2054
+ f'{colorama.Style.RESET_ALL}')
2055
+ status_utils.show_kubernetes_cluster_status_table(unmanaged_clusters,
2056
+ show_all=verbose)
2057
+ if all_jobs:
2058
+ click.echo(f'\n{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
2059
+ f'Managed jobs'
2060
+ f'{colorama.Style.RESET_ALL}')
2061
+ msg = table_utils.format_job_table(all_jobs,
2062
+ show_all=verbose,
2063
+ show_user=False)
2064
+ click.echo(msg)
2065
+ if any(['sky-serve-controller' in c.cluster_name for c in all_clusters]):
2066
+ # TODO: Parse serve controllers and show services separately.
2067
+ # Currently we show a hint that services are shown as clusters.
2068
+ click.echo(f'\n{colorama.Style.DIM}Hint: SkyServe replica pods are '
2069
+ 'shown in the "SkyPilot clusters" section.'
2070
+ f'{colorama.Style.RESET_ALL}')
2071
+
2072
+
2069
2073
  @cli.command()
2070
2074
  @flags.config_option(expose_value=False)
2071
2075
  @flags.all_option('Show all cluster information.')
@@ -2104,7 +2108,8 @@ def cost_report(all: bool, days: int): # pylint: disable=redefined-builtin
2104
2108
  for cluster_record in cluster_records:
2105
2109
  cluster_name = cluster_record['name']
2106
2110
  try:
2107
- controller = controller_utils.Controllers.from_name(cluster_name)
2111
+ controller = controller_utils.Controllers.from_name(
2112
+ cluster_name, expect_exact_match=False)
2108
2113
  except AssertionError:
2109
2114
  # There could be some old controller clusters from previous
2110
2115
  # versions that we should not show in the cost report.
@@ -2192,7 +2197,7 @@ def queue(clusters: List[str], skip_finished: bool, all_users: bool):
2192
2197
  f'cluster {cluster!r}.{colorama.Style.RESET_ALL}\n'
2193
2198
  f' {common_utils.format_exception(e)}')
2194
2199
  return
2195
- job_tables[cluster] = job_lib.format_job_queue(job_table)
2200
+ job_tables[cluster] = table_utils.format_job_queue(job_table)
2196
2201
 
2197
2202
  subprocess_utils.run_in_parallel(_get_job_queue, clusters)
2198
2203
  user_str = 'all users' if all_users else 'current user'
@@ -2213,6 +2218,12 @@ def queue(clusters: List[str], skip_finished: bool, all_users: bool):
2213
2218
  is_flag=True,
2214
2219
  default=False,
2215
2220
  help='Stream the cluster provisioning logs (provision.log).')
2221
+ @click.option('--worker',
2222
+ '-w',
2223
+ default=None,
2224
+ type=int,
2225
+ help='The worker ID to stream the logs from. '
2226
+ 'If not set, stream the logs of the head node.')
2216
2227
  @click.option(
2217
2228
  '--sync-down',
2218
2229
  '-s',
@@ -2250,6 +2261,7 @@ def logs(
2250
2261
  cluster: str,
2251
2262
  job_ids: Tuple[str, ...],
2252
2263
  provision: bool,
2264
+ worker: Optional[int],
2253
2265
  sync_down: bool,
2254
2266
  status: bool, # pylint: disable=redefined-outer-name
2255
2267
  follow: bool,
@@ -2279,6 +2291,13 @@ def logs(
2279
2291
  4. If the job fails or fetching the logs fails, the command will exit with
2280
2292
  a non-zero return code.
2281
2293
  """
2294
+ if worker is not None:
2295
+ if not provision:
2296
+ raise click.UsageError(
2297
+ '--worker can only be used with --provision.')
2298
+ if worker < 1:
2299
+ raise click.UsageError('--worker must be a positive integer.')
2300
+
2282
2301
  if provision and (sync_down or status or job_ids):
2283
2302
  raise click.UsageError(
2284
2303
  '--provision cannot be combined with job log options '
@@ -2298,7 +2317,11 @@ def logs(
2298
2317
 
2299
2318
  if provision:
2300
2319
  # Stream provision logs
2301
- sys.exit(sdk.tail_provision_logs(cluster, follow=follow, tail=tail))
2320
+ sys.exit(
2321
+ sdk.tail_provision_logs(cluster_name=cluster,
2322
+ worker=worker,
2323
+ follow=follow,
2324
+ tail=tail))
2302
2325
 
2303
2326
  if sync_down:
2304
2327
  with rich_utils.client_status(
@@ -2476,7 +2499,8 @@ def cancel(
2476
2499
  job_ids=job_ids_to_cancel)
2477
2500
  _async_call_or_wait(request_id, async_call, 'sky.cancel')
2478
2501
  except exceptions.NotSupportedError as e:
2479
- controller = controller_utils.Controllers.from_name(cluster)
2502
+ controller = controller_utils.Controllers.from_name(
2503
+ cluster, expect_exact_match=False)
2480
2504
  assert controller is not None, cluster
2481
2505
  with ux_utils.print_exception_no_traceback():
2482
2506
  raise click.UsageError(
@@ -2777,7 +2801,8 @@ def start(
2777
2801
  # Get all clusters that are not controllers.
2778
2802
  cluster_records = [
2779
2803
  cluster for cluster in all_clusters
2780
- if controller_utils.Controllers.from_name(cluster['name']) is None
2804
+ if controller_utils.Controllers.from_name(
2805
+ cluster['name'], expect_exact_match=False) is None
2781
2806
  ]
2782
2807
  if cluster_records is None:
2783
2808
  # Get GLOB cluster names
@@ -2839,7 +2864,8 @@ def start(
2839
2864
  # Checks for controller clusters (jobs controller / sky serve controller).
2840
2865
  controllers, normal_clusters = [], []
2841
2866
  for name in to_start:
2842
- if controller_utils.Controllers.from_name(name) is not None:
2867
+ if controller_utils.Controllers.from_name(
2868
+ name, expect_exact_match=False) is not None:
2843
2869
  controllers.append(name)
2844
2870
  else:
2845
2871
  normal_clusters.append(name)
@@ -2975,16 +3001,28 @@ def _hint_or_raise_for_down_jobs_controller(controller_name: str,
2975
3001
  to be torn down (e.g., because it has jobs running or
2976
3002
  it is in init state)
2977
3003
  """
2978
- controller = controller_utils.Controllers.from_name(controller_name)
3004
+ controller = controller_utils.Controllers.from_name(
3005
+ controller_name, expect_exact_match=False)
2979
3006
  assert controller is not None, controller_name
2980
3007
 
3008
+ status_counts: Optional[Dict[str, int]] = None
3009
+ managed_jobs_: List[responses.ManagedJobRecord] = []
2981
3010
  with rich_utils.client_status(
2982
3011
  '[bold cyan]Checking for in-progress managed jobs and pools[/]'):
2983
3012
  try:
2984
- request_id = managed_jobs.queue(refresh=False,
2985
- skip_finished=True,
2986
- all_users=True)
2987
- managed_jobs_ = sdk.stream_and_get(request_id)
3013
+ fields = _DEFAULT_MANAGED_JOB_FIELDS_TO_GET + _USER_NAME_FIELD
3014
+ request_id, queue_result_version = cli_utils.get_managed_job_queue(
3015
+ refresh=False,
3016
+ skip_finished=True,
3017
+ all_users=True,
3018
+ fields=fields,
3019
+ )
3020
+ result = sdk.stream_and_get(request_id)
3021
+ if queue_result_version.v2():
3022
+ managed_jobs_, _, status_counts, _ = result
3023
+ else:
3024
+ managed_jobs_ = typing.cast(List[responses.ManagedJobRecord],
3025
+ result)
2988
3026
  request_id_pools = managed_jobs.pool_status(pool_names=None)
2989
3027
  pools_ = sdk.stream_and_get(request_id_pools)
2990
3028
  except exceptions.ClusterNotUpError as e:
@@ -3002,25 +3040,6 @@ def _hint_or_raise_for_down_jobs_controller(controller_name: str,
3002
3040
  # there is no in-prgress managed jobs.
3003
3041
  managed_jobs_ = []
3004
3042
  pools_ = []
3005
- except exceptions.InconsistentConsolidationModeError:
3006
- # If this error is raised, it means the user switched to the
3007
- # consolidation mode but the previous controller cluster is still
3008
- # running. We should allow the user to tear down the controller
3009
- # cluster in this case.
3010
- with skypilot_config.override_skypilot_config(
3011
- {'jobs': {
3012
- 'controller': {
3013
- 'consolidation_mode': False
3014
- }
3015
- }}):
3016
- # Check again with the consolidation mode disabled. This is to
3017
- # make sure there is no in-progress managed jobs.
3018
- request_id = managed_jobs.queue(refresh=False,
3019
- skip_finished=True,
3020
- all_users=True)
3021
- managed_jobs_ = sdk.stream_and_get(request_id)
3022
- request_id_pools = managed_jobs.pool_status(pool_names=None)
3023
- pools_ = sdk.stream_and_get(request_id_pools)
3024
3043
 
3025
3044
  msg = (f'{colorama.Fore.YELLOW}WARNING: Tearing down the managed '
3026
3045
  'jobs controller. Please be aware of the following:'
@@ -3029,9 +3048,12 @@ def _hint_or_raise_for_down_jobs_controller(controller_name: str,
3029
3048
  'jobs (output of `sky jobs queue`) will be lost.')
3030
3049
  click.echo(msg)
3031
3050
  if managed_jobs_:
3032
- job_table = managed_jobs.format_job_table(managed_jobs_,
3033
- show_all=False,
3034
- show_user=True)
3051
+ job_table = table_utils.format_job_table(
3052
+ managed_jobs_,
3053
+ show_all=False,
3054
+ show_user=True,
3055
+ status_counts=status_counts,
3056
+ )
3035
3057
  msg = controller.value.decline_down_for_dirty_controller_hint
3036
3058
  # Add prefix to each line to align with the bullet point.
3037
3059
  msg += '\n'.join(
@@ -3074,7 +3096,8 @@ def _hint_or_raise_for_down_sky_serve_controller(controller_name: str,
3074
3096
  to be torn down (e.g., because it has services running or
3075
3097
  it is in init state)
3076
3098
  """
3077
- controller = controller_utils.Controllers.from_name(controller_name)
3099
+ controller = controller_utils.Controllers.from_name(
3100
+ controller_name, expect_exact_match=False)
3078
3101
  assert controller is not None, controller_name
3079
3102
  with rich_utils.client_status('[bold cyan]Checking for live services[/]'):
3080
3103
  try:
@@ -3093,21 +3116,6 @@ def _hint_or_raise_for_down_sky_serve_controller(controller_name: str,
3093
3116
  # controller being STOPPED or being firstly launched, i.e., there is
3094
3117
  # no in-prgress services.
3095
3118
  services = []
3096
- except exceptions.InconsistentConsolidationModeError:
3097
- # If this error is raised, it means the user switched to the
3098
- # consolidation mode but the previous controller cluster is still
3099
- # running. We should allow the user to tear down the controller
3100
- # cluster in this case.
3101
- with skypilot_config.override_skypilot_config(
3102
- {'serve': {
3103
- 'controller': {
3104
- 'consolidation_mode': False
3105
- }
3106
- }}):
3107
- # Check again with the consolidation mode disabled. This is to
3108
- # make sure there is no in-progress services.
3109
- request_id = serve_lib.status(service_names=None)
3110
- services = sdk.stream_and_get(request_id)
3111
3119
 
3112
3120
  if services:
3113
3121
  service_names = [service['name'] for service in services]
@@ -3185,14 +3193,15 @@ def _down_or_stop_clusters(
3185
3193
  names = list(names)
3186
3194
  if names:
3187
3195
  controllers = [
3188
- name for name in names
3189
- if controller_utils.Controllers.from_name(name) is not None
3196
+ name for name in names if controller_utils.Controllers.from_name(
3197
+ name, expect_exact_match=False) is not None
3190
3198
  ]
3191
3199
  controllers_str = ', '.join(map(repr, controllers))
3192
3200
  names = [
3193
3201
  cluster['name']
3194
3202
  for cluster in _get_cluster_records_and_set_ssh_config(names)
3195
- if controller_utils.Controllers.from_name(cluster['name']) is None
3203
+ if controller_utils.Controllers.from_name(
3204
+ cluster['name'], expect_exact_match=False) is None
3196
3205
  ]
3197
3206
 
3198
3207
  # Make sure the controllers are explicitly specified without other
@@ -3217,7 +3226,7 @@ def _down_or_stop_clusters(
3217
3226
  f'{controllers_str} is currently not supported.')
3218
3227
  else:
3219
3228
  controller = controller_utils.Controllers.from_name(
3220
- controller_name)
3229
+ controller_name, expect_exact_match=False)
3221
3230
  assert controller is not None
3222
3231
  hint_or_raise = _controller_to_hint_or_raise(controller)
3223
3232
  try:
@@ -3265,9 +3274,10 @@ def _down_or_stop_clusters(
3265
3274
  names = [
3266
3275
  record['name']
3267
3276
  for record in all_clusters
3268
- if controller_utils.Controllers.from_name(record['name']) is None
3269
- and (down or idle_minutes_to_autostop is not None or
3270
- record['status'] != status_lib.ClusterStatus.STOPPED)
3277
+ if controller_utils.Controllers.from_name(
3278
+ record['name'], expect_exact_match=False) is None and
3279
+ (down or idle_minutes_to_autostop is not None or
3280
+ record['status'] != status_lib.ClusterStatus.STOPPED)
3271
3281
  ]
3272
3282
 
3273
3283
  clusters = names
@@ -3297,6 +3307,9 @@ def _down_or_stop_clusters(
3297
3307
 
3298
3308
  request_ids = []
3299
3309
 
3310
+ successes: List[str] = []
3311
+ failures: List[Tuple[str, str]] = []
3312
+
3300
3313
  def _down_or_stop(name: str):
3301
3314
  success_progress = False
3302
3315
  if idle_minutes_to_autostop is not None:
@@ -3304,16 +3317,20 @@ def _down_or_stop_clusters(
3304
3317
  request_id = sdk.autostop(name, idle_minutes_to_autostop,
3305
3318
  wait_for, down)
3306
3319
  request_ids.append(request_id)
3320
+ progress.stop()
3307
3321
  _async_call_or_wait(
3308
3322
  request_id, async_call,
3309
3323
  server_constants.REQUEST_NAME_PREFIX + operation)
3310
- except (exceptions.NotSupportedError,
3311
- exceptions.ClusterNotUpError) as e:
3324
+ progress.start()
3325
+ except (exceptions.NotSupportedError, exceptions.ClusterNotUpError,
3326
+ exceptions.CloudError) as e:
3312
3327
  message = str(e)
3328
+ failures.append((name, str(e)))
3313
3329
  else: # no exception raised
3314
3330
  success_progress = True
3315
3331
  message = (f'{colorama.Fore.GREEN}{operation} '
3316
3332
  f'cluster {name!r}...done{colorama.Style.RESET_ALL}')
3333
+ successes.append(name)
3317
3334
  if idle_minutes_to_autostop >= 0:
3318
3335
  option_str = 'down' if down else 'stop'
3319
3336
  passive_str = 'downed' if down else 'stopped'
@@ -3333,9 +3350,11 @@ def _down_or_stop_clusters(
3333
3350
  else:
3334
3351
  request_id = sdk.stop(name, purge=purge)
3335
3352
  request_ids.append(request_id)
3353
+ progress.stop()
3336
3354
  _async_call_or_wait(
3337
3355
  request_id, async_call,
3338
3356
  server_constants.REQUEST_NAME_PREFIX + operation)
3357
+ progress.start()
3339
3358
  if not async_call:
3340
3359
  # Remove the cluster from the SSH config file as soon as it
3341
3360
  # is stopped or downed.
@@ -3345,13 +3364,17 @@ def _down_or_stop_clusters(
3345
3364
  f'{colorama.Fore.RED}{operation} cluster {name}...failed. '
3346
3365
  f'{colorama.Style.RESET_ALL}'
3347
3366
  f'\nReason: {common_utils.format_exception(e)}.')
3367
+ failures.append((name, str(e)))
3348
3368
  except (exceptions.NotSupportedError,
3349
- exceptions.ClusterOwnerIdentityMismatchError) as e:
3369
+ exceptions.ClusterOwnerIdentityMismatchError,
3370
+ exceptions.CloudError) as e:
3350
3371
  message = str(e)
3372
+ failures.append((name, str(e)))
3351
3373
  else: # no exception raised
3352
3374
  message = (
3353
3375
  f'{colorama.Fore.GREEN}{operation} cluster {name}...done.'
3354
3376
  f'{colorama.Style.RESET_ALL}')
3377
+ successes.append(name)
3355
3378
  if not down:
3356
3379
  message += ('\n To restart the cluster, run: '
3357
3380
  f'{colorama.Style.BRIGHT}sky start {name}'
@@ -3365,6 +3388,10 @@ def _down_or_stop_clusters(
3365
3388
  progress.start()
3366
3389
 
3367
3390
  with progress:
3391
+ # we write a new line here to avoid the "Waiting for 'sky.down'
3392
+ # request to be scheduled" message from being printed on the same line
3393
+ # as the "Terminating <num> clusters..." message
3394
+ click.echo('')
3368
3395
  subprocess_utils.run_in_parallel(_down_or_stop, clusters)
3369
3396
  progress.live.transient = False
3370
3397
  # Make sure the progress bar not mess up the terminal.
@@ -3374,6 +3401,31 @@ def _down_or_stop_clusters(
3374
3401
  click.secho(f'{operation} requests are sent. Check the requests\' '
3375
3402
  'status with `sky request get <request_id>`.')
3376
3403
 
3404
+ show_summary = len(clusters) > 1
3405
+
3406
+ if show_summary:
3407
+ click.echo('\nSummary:')
3408
+ if successes:
3409
+ # Clusters are listed in the order their requests completed.
3410
+ click.echo(' ✓ Succeeded: ' + ', '.join(successes))
3411
+ if failures:
3412
+ # Format failures: if one failure, keep on same line. If multiple,
3413
+ # indent each failed cluster on its own line for readability.
3414
+ if len(failures) == 1:
3415
+ name, reason = failures[0]
3416
+ first = reason.strip().splitlines()[0]
3417
+ first = first if len(first) <= 120 else first[:120] + '…'
3418
+ click.echo(f' ✗ Failed: {name} ({first})')
3419
+ else:
3420
+ click.echo(' ✗ Failed:')
3421
+ for name, reason in failures:
3422
+ first = reason.strip().splitlines()[0]
3423
+ first = first if len(first) <= 120 else first[:120] + '…'
3424
+ click.echo(f' {name} ({first})')
3425
+
3426
+ if failures:
3427
+ click.echo('Cluster(s) failed. See details above.')
3428
+
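For context on the summary block added above: it reduces each failure reason to its first line and caps it at 120 characters before echoing, with single failures kept on one line and multiple failures indented. A minimal standalone sketch of that formatting logic (the cluster names and error strings below are invented for illustration):

    failures = [
        ('gpu-cluster', 'CloudError: quota exceeded\nDetails: request rejected'),
        ('dev-box', 'NotSupportedError: ' + 'x' * 200),
    ]

    def first_line(reason: str, max_len: int = 120) -> str:
        # Keep only the first line of a possibly multi-line error message,
        # truncating it to max_len characters.
        line = reason.strip().splitlines()[0]
        return line if len(line) <= max_len else line[:max_len] + '…'

    if len(failures) == 1:
        name, reason = failures[0]
        print(f'  ✗ Failed: {name} ({first_line(reason)})')
    else:
        print('  ✗ Failed:')
        for name, reason in failures:
            print(f'    {name} ({first_line(reason)})')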
3377
3429
 
3378
3430
  @cli.command(cls=_DocumentedCodeCommand)
3379
3431
  @flags.config_option(expose_value=False)
@@ -4093,8 +4145,7 @@ def storage_ls(verbose: bool):
4093
4145
  """List storage objects managed by SkyPilot."""
4094
4146
  request_id = sdk.storage_ls()
4095
4147
  storages = sdk.stream_and_get(request_id)
4096
- storage_table = storage_utils.format_storage_table(storages,
4097
- show_all=verbose)
4148
+ storage_table = table_utils.format_storage_table(storages, show_all=verbose)
4098
4149
  click.echo(storage_table)
4099
4150
 
4100
4151
 
@@ -4174,6 +4225,10 @@ def volumes():
4174
4225
  pass
4175
4226
 
4176
4227
 
4228
+ # Add 'volume' as an alias for 'volumes'
4229
+ cli.add_command(volumes, name='volume')
4230
+
4231
+
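The alias works by registering the existing `volumes` group object under a second name. A small self-contained click sketch of the same pattern (the toy group and command names are illustrative, not SkyPilot's):

    import click

    @click.group()
    def cli():
        """Toy top-level CLI, used only to illustrate the alias pattern."""

    @cli.group()
    def volumes():
        """Volume-related commands."""

    @volumes.command('ls')
    def volumes_ls():
        click.echo('listing volumes...')

    # Registering the same Group object under a second name makes
    # `<prog> volume ls` and `<prog> volumes ls` resolve to the same commands.
    cli.add_command(volumes, name='volume')

    if __name__ == '__main__':
        cli()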
4177
4232
  @volumes.command('apply', cls=_DocumentedCodeCommand)
4178
4233
  @flags.config_option(expose_value=False)
4179
4234
  @click.argument('entrypoint',
@@ -4189,17 +4244,25 @@ def volumes():
4189
4244
  @click.option('--infra',
4190
4245
  required=False,
4191
4246
  type=str,
4192
- help='Infra. Format: k8s, k8s/context-name. '
4247
+ help='Infrastructure to use. '
4248
+ 'Format: cloud, cloud/region, cloud/region/zone, or '
4249
+ 'k8s/context-name. '
4250
+ 'Examples: k8s, k8s/my-context, runpod/US/US-CA-2. '
4193
4251
  'Override the infra defined in the YAML.')
4194
- @click.option(
4195
- '--type',
4196
- required=False,
4197
- type=str,
4198
- help='Volume type. Format: pvc. Override the type defined in the YAML.')
4252
+ @click.option('--type',
4253
+ required=False,
4254
+ type=click.Choice(volume_utils.VolumeType.supported_types()),
4255
+ help='Volume type. Override the type defined in the YAML.')
4199
4256
  @click.option('--size',
4200
4257
  required=False,
4201
4258
  type=str,
4202
4259
  help='Volume size. Override the size defined in the YAML.')
4260
+ @click.option(
4261
+ '--use-existing/--no-use-existing',
4262
+ required=False,
4263
+ default=None,
4264
+ help='Whether to use an existing volume. Override the use_existing '
4265
+ 'defined in the YAML.')
4203
4266
  @click.option('--yes',
4204
4267
  '-y',
4205
4268
  is_flag=True,
@@ -4214,6 +4277,7 @@ def volumes_apply(
4214
4277
  infra: Optional[str],
4215
4278
  type: Optional[str], # pylint: disable=redefined-builtin
4216
4279
  size: Optional[str],
4280
+ use_existing: Optional[bool],
4217
4281
  yes: bool,
4218
4282
  async_call: bool):
4219
4283
  """Apply a volume.
@@ -4226,7 +4290,11 @@ def volumes_apply(
4226
4290
  sky volumes apply volume.yaml
4227
4291
  \b
4228
4292
  # Apply a volume from a command.
4229
- sky volumes apply --name pvc1 --infra k8s --type pvc --size 100Gi
4293
+ sky volumes apply --name pvc1 --infra k8s --type k8s-pvc --size 100Gi
4294
+ \b
4295
+ # Apply a volume that uses the existing PVC `pvc2` from a command.
4296
+ sky volumes apply --name pvc2 --infra k8s --type k8s-pvc --size 100Gi
4297
+ --use-existing
4230
4298
  """
4231
4299
  # pylint: disable=import-outside-toplevel
4232
4300
  from sky.volumes import volume as volume_lib
@@ -4245,7 +4313,8 @@ def volumes_apply(
4245
4313
  f'{entrypoint_str!r} needs to be a YAML file')
4246
4314
  if yaml_config is not None:
4247
4315
  volume_config_dict = yaml_config.copy()
4248
- override_config = _build_volume_override_config(name, infra, type, size)
4316
+ override_config = _build_volume_override_config(name, infra, type, size,
4317
+ use_existing)
4249
4318
  volume_config_dict.update(override_config)
4250
4319
 
4251
4320
  # Create Volume instance
@@ -4253,6 +4322,13 @@ def volumes_apply(
4253
4322
 
4254
4323
  logger.debug(f'Volume config: {volume.to_yaml_config()}')
4255
4324
 
4325
+ # TODO(kevin): remove the try block in v0.13.0
4326
+ try:
4327
+ volumes_sdk.validate(volume)
4328
+ except exceptions.APINotSupportedError:
4329
+ # Do best-effort client-side validation.
4330
+ volume.validate(skip_cloud_compatibility=True)
4331
+
4256
4332
  if not yes:
4257
4333
  click.confirm(f'Proceed to create volume {volume.name!r}?',
4258
4334
  default=True,
@@ -4269,11 +4345,15 @@ def volumes_apply(
4269
4345
  f'{colorama.Style.RESET_ALL}')
4270
4346
 
4271
4347
 
4272
- def _build_volume_override_config(name: Optional[str], infra: Optional[str],
4273
- volume_type: Optional[str],
4274
- size: Optional[str]) -> Dict[str, str]:
4348
+ def _build_volume_override_config(
4349
+ name: Optional[str],
4350
+ infra: Optional[str],
4351
+ volume_type: Optional[str],
4352
+ size: Optional[str],
4353
+ use_existing: Optional[bool],
4354
+ ) -> Dict[str, Any]:
4275
4355
  """Parse the volume override config."""
4276
- override_config = {}
4356
+ override_config: Dict[str, Any] = {}
4277
4357
  if name is not None:
4278
4358
  override_config['name'] = name
4279
4359
  if infra is not None:
@@ -4282,6 +4362,8 @@ def _build_volume_override_config(name: Optional[str], infra: Optional[str],
4282
4362
  override_config['type'] = volume_type
4283
4363
  if size is not None:
4284
4364
  override_config['size'] = size
4365
+ if use_existing is not None:
4366
+ override_config['use_existing'] = use_existing
4285
4367
  return override_config
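The override dict built by `_build_volume_override_config` is merged into the YAML config with `dict.update`, so only the flags the user actually passed replace YAML values. A minimal sketch of that merge, with invented YAML values and a trimmed-down set of fields:

    from typing import Any, Dict, Optional

    def build_override(name: Optional[str], size: Optional[str],
                       use_existing: Optional[bool]) -> Dict[str, Any]:
        # Only include keys the user explicitly set on the command line.
        override: Dict[str, Any] = {}
        if name is not None:
            override['name'] = name
        if size is not None:
            override['size'] = size
        if use_existing is not None:
            override['use_existing'] = use_existing
        return override

    yaml_config = {'name': 'pvc1', 'type': 'k8s-pvc', 'size': '50Gi'}
    yaml_config.update(build_override(name=None, size='100Gi', use_existing=True))
    # -> {'name': 'pvc1', 'type': 'k8s-pvc', 'size': '100Gi', 'use_existing': True}
    print(yaml_config)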
4286
4368
 
4287
4369
 
@@ -4298,8 +4380,8 @@ def volumes_ls(verbose: bool):
4298
4380
  """List volumes managed by SkyPilot."""
4299
4381
  request_id = volumes_sdk.ls()
4300
4382
  all_volumes = sdk.stream_and_get(request_id)
4301
- volume_table = volumes_utils.format_volume_table(all_volumes,
4302
- show_all=verbose)
4383
+ volume_table = table_utils.format_volume_table(all_volumes,
4384
+ show_all=verbose)
4303
4385
  click.echo(volume_table)
4304
4386
 
4305
4387
 
@@ -4537,10 +4619,11 @@ def jobs_launch(
4537
4619
  break
4538
4620
  if print_setup_fm_warning:
4539
4621
  click.secho(
4540
- f'{colorama.Fore.YELLOW}setup/file_mounts/storage_mounts'
4541
- ' will be ignored when submit jobs to pool. To update a pool, '
4542
- f'please use `sky jobs pool apply {pool} new-pool.yaml`. '
4622
+ f'{colorama.Fore.YELLOW}Setup, file mounts, and storage mounts'
4623
+ ' will be ignored when submitting jobs to pool. To update a '
4624
+ f'pool, please use `sky jobs pool apply {pool} new-pool.yaml`. '
4543
4625
  f'{colorama.Style.RESET_ALL}')
4626
+ print_setup_fm_warning = False
4544
4627
 
4545
4628
  # Optimize info is only show if _need_confirmation.
4546
4629
  if not yes:
@@ -4556,10 +4639,15 @@ def jobs_launch(
4556
4639
  job_id_handle = _async_call_or_wait(request_id, async_call,
4557
4640
  'sky.jobs.launch')
4558
4641
 
4559
- if not async_call and not detach_run:
4560
- job_ids = job_id_handle[0]
4561
- if isinstance(job_ids, int) or len(job_ids) == 1:
4562
- job_id = job_ids if isinstance(job_ids, int) else job_ids[0]
4642
+ if async_call:
4643
+ return
4644
+
4645
+ job_ids = [job_id_handle[0]] if isinstance(job_id_handle[0],
4646
+ int) else job_id_handle[0]
4647
+
4648
+ if not detach_run:
4649
+ if len(job_ids) == 1:
4650
+ job_id = job_ids[0]
4563
4651
  returncode = managed_jobs.tail_logs(name=None,
4564
4652
  job_id=job_id,
4565
4653
  follow=True,
@@ -4568,7 +4656,8 @@ def jobs_launch(
4568
4656
  else:
4569
4657
  # TODO(tian): This can be very long. Considering have a "group id"
4570
4658
  # and query all job ids with the same group id.
4571
- job_ids_str = ','.join(map(str, job_ids))
4659
+ # Sort job ids to ensure consistent ordering.
4660
+ job_ids_str = ','.join(map(str, sorted(job_ids)))
4572
4661
  click.secho(
4573
4662
  f'Jobs submitted with IDs: {colorama.Fore.CYAN}'
4574
4663
  f'{job_ids_str}{colorama.Style.RESET_ALL}.'
@@ -4587,6 +4676,14 @@ def jobs_launch(
4587
4676
  @jobs.command('queue', cls=_DocumentedCodeCommand)
4588
4677
  @flags.config_option(expose_value=False)
4589
4678
  @flags.verbose_option()
4679
+ @click.option(
4680
+ '--limit',
4681
+ '-l',
4682
+ default=_NUM_MANAGED_JOBS_TO_SHOW,
4683
+ type=int,
4684
+ required=False,
4685
+ help=(f'Number of jobs to show (default: {_NUM_MANAGED_JOBS_TO_SHOW}). '
4686
+ 'Use "-a/--all" to show all jobs.'))
4590
4687
  @click.option(
4591
4688
  '--refresh',
4592
4689
  '-r',
@@ -4606,7 +4703,7 @@ def jobs_launch(
4606
4703
  @usage_lib.entrypoint
4607
4704
  # pylint: disable=redefined-builtin
4608
4705
  def jobs_queue(verbose: bool, refresh: bool, skip_finished: bool,
4609
- all_users: bool, all: bool):
4706
+ all_users: bool, all: bool, limit: int):
4610
4707
  """Show statuses of managed jobs.
4611
4708
 
4612
4709
  Each managed jobs can have one of the following statuses:
@@ -4657,18 +4754,56 @@ def jobs_queue(verbose: bool, refresh: bool, skip_finished: bool,
4657
4754
 
4658
4755
  watch -n60 sky jobs queue
4659
4756
 
4757
+ (Tip) To show only the latest 10 jobs, use ``-l/--limit 10``:
4758
+
4759
+ .. code-block:: bash
4760
+
4761
+ sky jobs queue -l 10
4762
+
4660
4763
  """
4661
4764
  click.secho('Fetching managed job statuses...', fg='cyan')
4662
4765
  with rich_utils.client_status('[cyan]Checking managed jobs[/]'):
4663
- managed_jobs_request_id = managed_jobs.queue(
4664
- refresh=refresh, skip_finished=skip_finished, all_users=all_users)
4665
- max_num_jobs_to_show = (_NUM_MANAGED_JOBS_TO_SHOW if not all else None)
4766
+ max_num_jobs_to_show = (limit if not all else None)
4767
+ fields = _DEFAULT_MANAGED_JOB_FIELDS_TO_GET
4768
+ if verbose:
4769
+ fields = _VERBOSE_MANAGED_JOB_FIELDS_TO_GET
4770
+ if all_users:
4771
+ fields = fields + _USER_NAME_FIELD
4772
+ if verbose:
4773
+ fields = fields + _USER_HASH_FIELD
4774
+ # Call both cli_utils.get_managed_job_queue and managed_jobs.pool_status
4775
+ # in parallel
4776
+ def get_managed_jobs_queue():
4777
+ return cli_utils.get_managed_job_queue(refresh=refresh,
4778
+ skip_finished=skip_finished,
4779
+ all_users=all_users,
4780
+ limit=max_num_jobs_to_show,
4781
+ fields=fields)
4782
+
4783
+ def get_pool_status():
4784
+ try:
4785
+ return managed_jobs.pool_status(pool_names=None)
4786
+ except Exception: # pylint: disable=broad-except
4787
+ # If pool_status fails, we'll just skip the worker information
4788
+ return None
4789
+
4790
+ with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
4791
+ managed_jobs_future = executor.submit(get_managed_jobs_queue)
4792
+ pool_status_future = executor.submit(get_pool_status)
4793
+
4794
+ (managed_jobs_request_id,
4795
+ queue_result_version) = managed_jobs_future.result()
4796
+ pool_status_request_id = pool_status_future.result()
4797
+
4666
4798
  num_jobs, msg = _handle_jobs_queue_request(
4667
4799
  managed_jobs_request_id,
4800
+ pool_status_request_id=pool_status_request_id,
4668
4801
  show_all=verbose,
4669
4802
  show_user=all_users,
4670
4803
  max_num_jobs_to_show=max_num_jobs_to_show,
4671
- is_called_by_user=True)
4804
+ is_called_by_user=True,
4805
+ queue_result_version=queue_result_version,
4806
+ )
4672
4807
  if not skip_finished:
4673
4808
  in_progress_only_hint = ''
4674
4809
  else:
@@ -4681,7 +4816,8 @@ def jobs_queue(verbose: bool, refresh: bool, skip_finished: bool,
4681
4816
  f'{colorama.Fore.CYAN}'
4682
4817
  f'Only showing the latest {max_num_jobs_to_show} '
4683
4818
  f'managed jobs'
4684
- f'(use --all to show all managed jobs) {colorama.Style.RESET_ALL} ')
4819
+ f' (use --limit to show more managed jobs or '
4820
+ f'--all to show all managed jobs) {colorama.Style.RESET_ALL} ')
4685
4821
 
4686
4822
 
4687
4823
  @jobs.command('cancel', cls=_DocumentedCodeCommand)
@@ -4849,7 +4985,7 @@ def pool():
4849
4985
  @pool.command('apply', cls=_DocumentedCodeCommand)
4850
4986
  @flags.config_option(expose_value=False)
4851
4987
  @click.argument('pool_yaml',
4852
- required=True,
4988
+ required=False,
4853
4989
  type=str,
4854
4990
  nargs=-1,
4855
4991
  **_get_shell_complete_args(_complete_file_name))
@@ -4864,17 +5000,22 @@ def pool():
4864
5000
  type=click.Choice([m.value for m in serve_lib.UpdateMode],
4865
5001
  case_sensitive=False),
4866
5002
  required=False,
4867
- help=('Update mode. If "rolling", cluster pool will be updated '
4868
- 'with rolling update. If "blue_green", cluster pool will '
5003
+ help=('Update mode. If "rolling", pool will be updated '
5004
+ 'with rolling update. If "blue_green", pool will '
4869
5005
  'be updated with blue-green update. This option is only '
4870
5006
  'valid when the pool is already running.'))
5007
+ @click.option('--workers',
5008
+ default=None,
5009
+ type=int,
5010
+ required=False,
5011
+ help='Can be used to update the number of workers in the pool.')
4871
5012
  @_add_click_options(flags.TASK_OPTIONS + flags.EXTRA_RESOURCES_OPTIONS +
4872
5013
  flags.COMMON_OPTIONS)
4873
5014
  @flags.yes_option()
4874
5015
  @timeline.event
4875
5016
  @usage_lib.entrypoint
4876
5017
  def jobs_pool_apply(
4877
- pool_yaml: Tuple[str, ...],
5018
+ pool_yaml: Optional[Tuple[str, ...]],
4878
5019
  pool: Optional[str], # pylint: disable=redefined-outer-name
4879
5020
  workdir: Optional[str],
4880
5021
  infra: Optional[str],
@@ -4896,60 +5037,80 @@ def jobs_pool_apply(
4896
5037
  disk_tier: Optional[str],
4897
5038
  network_tier: Optional[str],
4898
5039
  mode: str,
5040
+ workers: Optional[int],
4899
5041
  yes: bool,
4900
5042
  async_call: bool,
4901
5043
  ):
4902
- """Apply a config to a cluster pool for managed jobs submission.
4903
-
4904
- If the pool is already running, the config will be applied to the pool.
4905
- Otherwise, a new pool will be created.
4906
-
4907
- POOL_YAML must point to a valid YAML file.
5044
+ """Either apply a config to a pool for managed jobs submission
5045
+ or update the number of workers in the pool. One of POOL_YAML or --workers
5046
+ must be provided.
5047
+ Config:
5048
+ If the pool is already running, the config will be applied to the pool.
5049
+ Otherwise, a new pool will be created.
5050
+ Workers:
5051
+ The --workers option updates the number of workers of an existing
5052
+ pool without providing a YAML file.
5053
+ Example:
5054
+ sky jobs pool apply -p my-pool --workers 5
4908
5055
  """
4909
5056
  cloud, region, zone = _handle_infra_cloud_region_zone_options(
4910
5057
  infra, cloud, region, zone)
4911
- if pool is None:
4912
- pool = serve_lib.generate_service_name(pool=True)
5058
+ if workers is not None and pool_yaml is not None and len(pool_yaml) > 0:
5059
+ raise click.UsageError(
5060
+ 'Cannot specify both --workers and POOL_YAML. Please use one of '
5061
+ 'them.')
4913
5062
 
4914
- task = _generate_task_with_service(
4915
- service_name=pool,
4916
- service_yaml_args=pool_yaml,
4917
- workdir=workdir,
4918
- cloud=cloud,
4919
- region=region,
4920
- zone=zone,
4921
- gpus=gpus,
4922
- cpus=cpus,
4923
- memory=memory,
4924
- instance_type=instance_type,
4925
- num_nodes=num_nodes,
4926
- use_spot=use_spot,
4927
- image_id=image_id,
4928
- env_file=env_file,
4929
- env=env,
4930
- secret=secret,
4931
- disk_size=disk_size,
4932
- disk_tier=disk_tier,
4933
- network_tier=network_tier,
4934
- ports=ports,
4935
- not_supported_cmd='sky jobs pool up',
4936
- pool=True,
4937
- )
4938
- assert task.service is not None
4939
- if not task.service.pool:
4940
- raise click.UsageError('The YAML file needs a `pool` section.')
4941
- click.secho('Pool spec:', fg='cyan')
4942
- click.echo(task.service)
4943
- serve_lib.validate_service_task(task, pool=True)
5063
+ if pool_yaml is None or len(pool_yaml) == 0:
5064
+ if pool is None:
5065
+ raise click.UsageError(
5066
+ 'A pool name must be provided to update the number of workers.')
5067
+ task = None
5068
+ click.secho(f'Attempting to update {pool} to have {workers} workers',
5069
+ fg='cyan')
5070
+ else:
5071
+ if pool is None:
5072
+ pool = serve_lib.generate_service_name(pool=True)
5073
+
5074
+ task = _generate_task_with_service(
5075
+ service_name=pool,
5076
+ service_yaml_args=pool_yaml,
5077
+ workdir=workdir,
5078
+ cloud=cloud,
5079
+ region=region,
5080
+ zone=zone,
5081
+ gpus=gpus,
5082
+ cpus=cpus,
5083
+ memory=memory,
5084
+ instance_type=instance_type,
5085
+ num_nodes=num_nodes,
5086
+ use_spot=use_spot,
5087
+ image_id=image_id,
5088
+ env_file=env_file,
5089
+ env=env,
5090
+ secret=secret,
5091
+ disk_size=disk_size,
5092
+ disk_tier=disk_tier,
5093
+ network_tier=network_tier,
5094
+ ports=ports,
5095
+ not_supported_cmd='sky jobs pool up',
5096
+ pool=True,
5097
+ )
5098
+ assert task.service is not None
5099
+ if not task.service.pool:
5100
+ raise click.UsageError('The YAML file needs a `pool` section.')
5101
+ click.secho('Pool spec:', fg='cyan')
5102
+ click.echo(task.service)
5103
+ serve_lib.validate_service_task(task, pool=True)
4944
5104
 
4945
- click.secho(
4946
- 'Each pool worker will use the following resources (estimated):',
4947
- fg='cyan')
4948
- with dag_lib.Dag() as dag:
4949
- dag.add(task)
5105
+ click.secho(
5106
+ 'Each pool worker will use the following resources (estimated):',
5107
+ fg='cyan')
5108
+ with dag_lib.Dag() as dag:
5109
+ dag.add(task)
4950
5110
 
4951
5111
  request_id = managed_jobs.pool_apply(task,
4952
5112
  pool,
5113
+ workers=workers,
4953
5114
  mode=serve_lib.UpdateMode(mode),
4954
5115
  _need_confirmation=not yes)
4955
5116
  _async_call_or_wait(request_id, async_call, 'sky.jobs.pool_apply')
@@ -4962,7 +5123,7 @@ def jobs_pool_apply(
4962
5123
  @usage_lib.entrypoint
4963
5124
  # pylint: disable=redefined-builtin
4964
5125
  def jobs_pool_status(verbose: bool, pool_names: List[str]):
4965
- """Show statuses of cluster pools.
5126
+ """Show statuses of pools.
4966
5127
 
4967
5128
  Show detailed statuses of one or more pools. If POOL_NAME is not
4968
5129
  provided, show all pools' status.
@@ -5018,12 +5179,108 @@ def jobs_pool_down(
5018
5179
  raise click.UsageError('Can only specify one of POOL_NAMES or --all. '
5019
5180
  f'Provided {argument_str!r}.')
5020
5181
 
5021
- if not yes:
5022
- quoted_pool_names = [f'{name!r}' for name in pool_names]
5023
- list_pool_str = ', '.join(quoted_pool_names)
5024
- pool_identity_str = f'pool(s) {list_pool_str}'
5025
- if all:
5026
- pool_identity_str = 'all pools'
5182
+ def _get_nonterminal_jobs(pool_names: List[str],
5183
+ all: bool) -> List[responses.ManagedJobRecord]:
5184
+ # Get nonterminal jobs for this pool using managed_jobs.queue
5185
+ request_id, queue_result_version = cli_utils.get_managed_job_queue(
5186
+ refresh=False,
5187
+ skip_finished=True,
5188
+ all_users=True,
5189
+ limit=None,
5190
+ fields=['job_id', 'status', 'pool'],
5191
+ )
5192
+ jobs_result = sdk.stream_and_get(request_id)
5193
+
5194
+ # Handle both tuple and list responses
5195
+ jobs_list: List[responses.ManagedJobRecord]
5196
+ if queue_result_version.v2():
5197
+ jobs_list = jobs_result[0]
5198
+ else:
5199
+ jobs_list = typing.cast(List[responses.ManagedJobRecord],
5200
+ jobs_result)
5201
+
5202
+ def _should_include_job(job: responses.ManagedJobRecord) -> bool:
5203
+ # Job must not be terminal.
5204
+ if job.get('status', ManagedJobStatus.SUCCEEDED).is_terminal():
5205
+ return False
5206
+ # With the -a/--all option, include every job that is associated
5208
+ # with any pool.
5208
+ if all:
5209
+ return job.get('pool') is not None
5210
+ # Otherwise we are using specific pool names, so we include the job
5211
+ # if it's associated with one of the specified pools.
5212
+ return job.get('pool') in pool_names
5213
+
5214
+ # Filter jobs by pool name and ensure nonterminal
5215
+ pool_jobs = [job for job in jobs_list if _should_include_job(job)]
5216
+ return pool_jobs
5217
+
5218
+ quoted_pool_names = [f'{name!r}' for name in pool_names]
5219
+ list_pool_str = ', '.join(quoted_pool_names)
5220
+ pool_identity_str = f'pool(s) {list_pool_str}'
5221
+ if all:
5222
+ pool_identity_str = 'all pools'
5223
+
5224
+ already_confirmed = False
5225
+ try:
5226
+ pool_jobs = _get_nonterminal_jobs(pool_names, all)
5227
+ if pool_jobs:
5228
+ num_jobs = len(pool_jobs)
5229
+ job_ids = [job['job_id'] for job in pool_jobs]
5230
+ job_ids_str = ','.join(str(job_id) for job_id in job_ids)
5231
+ click.echo(
5232
+ f'{colorama.Fore.YELLOW}Pool(s) still have {num_jobs} '
5233
+ f'non-terminal jobs ({job_ids_str}), so it is not yet safe to tear '
5234
+ f'them down.{colorama.Style.RESET_ALL}')
5235
+ if not yes:
5236
+ should_cancel = click.confirm(
5237
+ 'Would you like to cancel all jobs and down the pool(s)?',
5238
+ default=False,
5239
+ abort=False,
5240
+ show_default=True)
5241
+ if not should_cancel:
5242
+ raise click.Abort()
5243
+ already_confirmed = True
5244
+
5245
+ # Cancel all jobs in the pool
5246
+ with rich_utils.client_status(
5247
+ ux_utils.spinner_message(
5248
+ f'Cancelling {num_jobs} jobs in {pool_identity_str}...')
5249
+ ):
5250
+ try:
5251
+ sdk.get(managed_jobs.cancel(job_ids=job_ids))
5252
+ except Exception as e:
5253
+ logger.warning(f'Failed to cancel jobs: {e}.')
5254
+ raise e
5255
+
5256
+ max_wait_time = 300 # 5 minutes max wait
5257
+ check_interval = 2 # Check every 2 seconds
5258
+ start_time = time.time()
5259
+ remaining_pool_jobs = _get_nonterminal_jobs(pool_names, all)
5260
+ while (remaining_pool_jobs and
5261
+ time.time() - start_time < max_wait_time):
5262
+ # Check remaining jobs via API
5263
+ time.sleep(check_interval)
5264
+ remaining_pool_jobs = _get_nonterminal_jobs(pool_names, all)
5265
+ ux_utils.spinner_message(
5266
+ f'Waiting for {len(remaining_pool_jobs)} '
5267
+ 'jobs to be cancelled...')
5268
+
5269
+ click.echo('\r' + ' ' * 80 + '\r', nl=False)
5270
+ if time.time() - start_time >= max_wait_time:
5271
+ click.echo(
5272
+ f'{colorama.Fore.YELLOW}Warning: Timeout waiting '
5273
+ f'for jobs to finish. Proceeding with pool down '
5274
+ f'anyway.{colorama.Style.RESET_ALL}')
5275
+ else:
5276
+ click.echo('All jobs cancelled.')
5277
+ except Exception as e: # pylint: disable=broad-except
5278
+ # If API call fails, log warning but continue with pool down
5279
+ logger.warning(
5280
+ f'Failed to check for running jobs in pool(s): {pool_names!r}: {e}.'
5281
+ ' Proceeding with pool down.')
5282
+
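After cancellation is requested, the command polls until no non-terminal jobs remain or a five-minute cap elapses. A minimal sketch of that poll-with-timeout loop, using a fake `remaining_jobs()` probe in place of the real queue query:

    import time

    _pending = ['job-7', 'job-9']

    def remaining_jobs():
        # Fake probe standing in for the real "non-terminal jobs in the pool"
        # query; it drains one job per call so the loop terminates quickly.
        if _pending:
            _pending.pop()
        return list(_pending)

    max_wait_time = 300   # give up after 5 minutes
    check_interval = 2    # poll every 2 seconds
    start_time = time.time()

    remaining = remaining_jobs()
    while remaining and time.time() - start_time < max_wait_time:
        time.sleep(check_interval)
        remaining = remaining_jobs()

    if remaining:
        print(f'Timed out with {len(remaining)} jobs still pending; proceeding anyway.')
    else:
        print('All jobs cancelled.')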
5283
+ if not yes and not already_confirmed:
5027
5284
  click.confirm(f'Terminating {pool_identity_str}. Proceed?',
5028
5285
  default=True,
5029
5286
  abort=True,
@@ -5205,22 +5462,22 @@ def jobs_pool_logs(
5205
5462
  .. code-block:: bash
5206
5463
 
5207
5464
  # Tail the controller logs of a pool
5208
- sky pool logs --controller [POOL_NAME]
5465
+ sky jobs pool logs --controller [POOL_NAME]
5209
5466
  \b
5210
5467
  # Print the worker logs so far and exit
5211
- sky pool logs --no-follow [POOL_NAME]
5468
+ sky jobs pool logs --no-follow [POOL_NAME] 1
5212
5469
  \b
5213
5470
  # Tail the logs of worker 1
5214
- sky pool logs [POOL_NAME] 1
5471
+ sky jobs pool logs [POOL_NAME] 1
5215
5472
  \b
5216
5473
  # Show the last 100 lines of the controller logs
5217
- sky pool logs --controller --tail 100 [POOL_NAME]
5474
+ sky jobs pool logs --controller --tail 100 [POOL_NAME]
5218
5475
  \b
5219
5476
  # Sync down all logs of the pool (controller, all workers)
5220
- sky pool logs [POOL_NAME] --sync-down
5477
+ sky jobs pool logs [POOL_NAME] --sync-down
5221
5478
  \b
5222
5479
  # Sync down controller logs and logs for workers 1 and 3
5223
- sky pool logs [POOL_NAME] 1 3 --controller --sync-down
5480
+ sky jobs pool logs [POOL_NAME] 1 3 --controller --sync-down
5224
5481
  """
5225
5482
  _handle_serve_logs(pool_name,
5226
5483
  follow=follow,
@@ -5236,7 +5493,15 @@ def jobs_pool_logs(
5236
5493
  @flags.config_option(expose_value=False)
5237
5494
  @usage_lib.entrypoint
5238
5495
  def dashboard() -> None:
5239
- """Starts the dashboard for skypilot."""
5496
+ """Opens the SkyPilot dashboard."""
5497
+ sdk.dashboard()
5498
+
5499
+
5500
+ @cli.command(cls=_DocumentedCodeCommand, hidden=True)
5501
+ @flags.config_option(expose_value=False)
5502
+ @usage_lib.entrypoint
5503
+ def ui() -> None:
5504
+ """Opens the SkyPilot dashboard."""
5240
5505
  sdk.dashboard()
5241
5506
 
5242
5507
 
@@ -5247,28 +5512,30 @@ def serve():
5247
5512
 
5248
5513
 
5249
5514
  def _generate_task_with_service(
5250
- service_name: str,
5251
- service_yaml_args: Tuple[str, ...],
5252
- workdir: Optional[str],
5253
- cloud: Optional[str],
5254
- region: Optional[str],
5255
- zone: Optional[str],
5256
- num_nodes: Optional[int],
5257
- use_spot: Optional[bool],
5258
- image_id: Optional[str],
5259
- env_file: Optional[Dict[str, str]],
5260
- env: List[Tuple[str, str]],
5261
- secret: Optional[List[Tuple[str, str]]],
5262
- gpus: Optional[str],
5263
- instance_type: Optional[str],
5264
- ports: Optional[Tuple[str]],
5265
- cpus: Optional[str],
5266
- memory: Optional[str],
5267
- disk_size: Optional[int],
5268
- disk_tier: Optional[str],
5269
- network_tier: Optional[str],
5270
- not_supported_cmd: str,
5271
- pool: bool, # pylint: disable=redefined-outer-name
5515
+ service_name: str,
5516
+ service_yaml_args: Tuple[str, ...],
5517
+ workdir: Optional[str],
5518
+ cloud: Optional[str],
5519
+ region: Optional[str],
5520
+ zone: Optional[str],
5521
+ num_nodes: Optional[int],
5522
+ use_spot: Optional[bool],
5523
+ image_id: Optional[str],
5524
+ env_file: Optional[Dict[str, str]],
5525
+ env: List[Tuple[str, str]],
5526
+ secret: Optional[List[Tuple[str, str]]],
5527
+ gpus: Optional[str],
5528
+ instance_type: Optional[str],
5529
+ ports: Optional[Tuple[str]],
5530
+ cpus: Optional[str],
5531
+ memory: Optional[str],
5532
+ disk_size: Optional[int],
5533
+ disk_tier: Optional[str],
5534
+ network_tier: Optional[str],
5535
+ not_supported_cmd: str,
5536
+ pool: bool, # pylint: disable=redefined-outer-name
5537
+ git_url: Optional[str] = None,
5538
+ git_ref: Optional[str] = None,
5272
5539
  ) -> task_lib.Task:
5273
5540
  """Generate a task with service section from a service YAML file."""
5274
5541
  is_yaml, _ = _check_yaml(''.join(service_yaml_args))
@@ -5298,6 +5565,8 @@ def _generate_task_with_service(
5298
5565
  disk_tier=disk_tier,
5299
5566
  network_tier=network_tier,
5300
5567
  ports=ports,
5568
+ git_url=git_url,
5569
+ git_ref=git_ref,
5301
5570
  )
5302
5571
  if isinstance(task, dag_lib.Dag):
5303
5572
  raise click.UsageError(
@@ -5313,7 +5582,7 @@ def _generate_task_with_service(
5313
5582
  if task.service.pool:
5314
5583
  if task.service.ports is not None or ports:
5315
5584
  with ux_utils.print_exception_no_traceback():
5316
- raise ValueError('Cannot specify ports in a cluster pool.')
5585
+ raise ValueError('Cannot specify ports in a pool.')
5317
5586
  return task
5318
5587
 
5319
5588
  # NOTE(yi): we only allow one service port now.
@@ -5389,6 +5658,10 @@ def _generate_task_with_service(
5389
5658
  type=str,
5390
5659
  help='A service name. Unique for each service. If not provided, '
5391
5660
  'a unique name is autogenerated.')
5661
+ @click.option('--git-url', type=str, help='Git repository URL.')
5662
+ @click.option('--git-ref',
5663
+ type=str,
5664
+ help='Git reference (branch, tag, or commit hash) to use.')
5392
5665
  @_add_click_options(flags.TASK_OPTIONS + flags.EXTRA_RESOURCES_OPTIONS +
5393
5666
  flags.COMMON_OPTIONS)
5394
5667
  @flags.yes_option()
@@ -5418,6 +5691,8 @@ def serve_up(
5418
5691
  network_tier: Optional[str],
5419
5692
  yes: bool,
5420
5693
  async_call: bool,
5694
+ git_url: Optional[str] = None,
5695
+ git_ref: Optional[str] = None,
5421
5696
  ):
5422
5697
  """Launch a SkyServe service.
5423
5698
 
@@ -5475,6 +5750,8 @@ def serve_up(
5475
5750
  ports=ports,
5476
5751
  not_supported_cmd='sky serve up',
5477
5752
  pool=False,
5753
+ git_url=git_url,
5754
+ git_ref=git_ref,
5478
5755
  )
5479
5756
  assert task.service is not None
5480
5757
  if task.service.pool:
@@ -5556,6 +5833,8 @@ def serve_update(
5556
5833
  sky serve update --mode blue_green sky-service-16aa new_service.yaml
5557
5834
 
5558
5835
  """
5836
+ # TODO(lloyd-brown): Add a way to update number of replicas for serve
5837
+ # the way we did for pools.
5559
5838
  cloud, region, zone = _handle_infra_cloud_region_zone_options(
5560
5839
  infra, cloud, region, zone)
5561
5840
  task = _generate_task_with_service(
@@ -5918,94 +6197,39 @@ def local():
5918
6197
  help='Launch cluster without GPU support even '
5919
6198
  'if GPUs are detected on the host.')
5920
6199
  @click.option(
5921
- '--ips',
6200
+ '--name',
5922
6201
  type=str,
5923
6202
  required=False,
5924
- help='Path to the file containing IP addresses of remote machines.')
5925
- @click.option('--ssh-user',
5926
- type=str,
5927
- required=False,
5928
- help='SSH username for accessing remote machines.')
5929
- @click.option('--ssh-key-path',
5930
- type=str,
5931
- required=False,
5932
- help='Path to the SSH private key.')
5933
- @click.option('--cleanup',
5934
- is_flag=True,
5935
- help='Clean up the remote cluster instead of deploying it.')
6203
+ help='Name of the cluster. Defaults to "skypilot". Used without ip list.')
5936
6204
  @click.option(
5937
- '--context-name',
5938
- type=str,
6205
+ '--port-start',
6206
+ type=int,
5939
6207
  required=False,
5940
- help='Name to use for the kubeconfig context. Defaults to "default".')
5941
- @click.option('--password',
5942
- type=str,
5943
- required=False,
5944
- help='Password for the ssh-user to execute sudo commands. '
5945
- 'Required only if passwordless sudo is not setup.')
6208
+ help='Starting port range for the local kind cluster. Needs to be a '
6209
+ 'multiple of 100. If not given, a random range will be used. '
6210
+ 'Used without ip list.')
5946
6211
  @local.command('up', cls=_DocumentedCodeCommand)
5947
6212
  @flags.config_option(expose_value=False)
5948
6213
  @_add_click_options(flags.COMMON_OPTIONS)
5949
6214
  @usage_lib.entrypoint
5950
- def local_up(gpus: bool, ips: str, ssh_user: str, ssh_key_path: str,
5951
- cleanup: bool, context_name: Optional[str],
5952
- password: Optional[str], async_call: bool):
5953
- """Creates a local or remote cluster."""
5954
-
5955
- def _validate_args(ips, ssh_user, ssh_key_path, cleanup):
5956
- # If any of --ips, --ssh-user, or --ssh-key-path is specified,
5957
- # all must be specified
5958
- if bool(ips) or bool(ssh_user) or bool(ssh_key_path):
5959
- if not (ips and ssh_user and ssh_key_path):
5960
- raise click.BadParameter(
5961
- 'All --ips, --ssh-user, and --ssh-key-path '
5962
- 'must be specified together.')
5963
-
5964
- # --cleanup can only be used if --ips, --ssh-user and --ssh-key-path
5965
- # are all provided
5966
- if cleanup and not (ips and ssh_user and ssh_key_path):
5967
- raise click.BadParameter('--cleanup can only be used with '
5968
- '--ips, --ssh-user and --ssh-key-path.')
5969
-
5970
- _validate_args(ips, ssh_user, ssh_key_path, cleanup)
5971
-
5972
- # If remote deployment arguments are specified, run remote up script
5973
- ip_list = None
5974
- ssh_key = None
5975
- if ips and ssh_user and ssh_key_path:
5976
- # Read and validate IP file
5977
- try:
5978
- with open(os.path.expanduser(ips), 'r', encoding='utf-8') as f:
5979
- ip_list = f.read().strip().splitlines()
5980
- if not ip_list:
5981
- raise click.BadParameter(f'IP file is empty: {ips}')
5982
- except (IOError, OSError) as e:
5983
- raise click.BadParameter(f'Failed to read IP file {ips}: {str(e)}')
5984
-
5985
- # Read and validate SSH key file
5986
- try:
5987
- with open(os.path.expanduser(ssh_key_path), 'r',
5988
- encoding='utf-8') as f:
5989
- ssh_key = f.read()
5990
- if not ssh_key:
5991
- raise click.BadParameter(
5992
- f'SSH key file is empty: {ssh_key_path}')
5993
- except (IOError, OSError) as e:
5994
- raise click.BadParameter(
5995
- f'Failed to read SSH key file {ssh_key_path}: {str(e)}')
5996
-
5997
- request_id = sdk.local_up(gpus, ip_list, ssh_user, ssh_key, cleanup,
5998
- context_name, password)
6215
+ def local_up(gpus: bool, name: Optional[str], port_start: Optional[int],
6216
+ async_call: bool):
6217
+ """Creates a local cluster."""
6218
+ request_id = sdk.local_up(gpus, name, port_start)
5999
6219
  _async_call_or_wait(request_id, async_call, request_name='local up')
6000
6220
 
6001
6221
 
6222
+ @click.option('--name',
6223
+ type=str,
6224
+ required=False,
6225
+ help='Name of the cluster to down. Defaults to "skypilot".')
6002
6226
  @local.command('down', cls=_DocumentedCodeCommand)
6003
6227
  @flags.config_option(expose_value=False)
6004
6228
  @_add_click_options(flags.COMMON_OPTIONS)
6005
6229
  @usage_lib.entrypoint
6006
- def local_down(async_call: bool):
6230
+ def local_down(name: Optional[str], async_call: bool):
6007
6231
  """Deletes a local cluster."""
6008
- request_id = sdk.local_down()
6232
+ request_id = sdk.local_down(name)
6009
6233
  _async_call_or_wait(request_id, async_call, request_name='sky.local.down')
6010
6234
 
6011
6235
 
@@ -6119,20 +6343,22 @@ def api_logs(request_id: Optional[str], server_logs: bool,
6119
6343
  **_get_shell_complete_args(_complete_api_request))
6120
6344
  @flags.all_option('Cancel all your requests.')
6121
6345
  @flags.all_users_option('Cancel all requests from all users.')
6346
+ @flags.yes_option()
6122
6347
  @usage_lib.entrypoint
6123
6348
  # pylint: disable=redefined-builtin
6124
- def api_cancel(request_ids: Optional[List[str]], all: bool, all_users: bool):
6349
+ def api_cancel(request_ids: Optional[List[str]], all: bool, all_users: bool,
6350
+ yes: bool):
6125
6351
  """Cancel a request running on SkyPilot API server."""
6126
6352
  if all or all_users:
6127
- keyword = 'ALL USERS\'' if all_users else 'YOUR'
6128
- user_input = click.prompt(
6129
- f'This will cancel all {keyword} requests.\n'
6130
- f'To proceed, please type {colorama.Style.BRIGHT}'
6131
- f'\'cancel all requests\'{colorama.Style.RESET_ALL}',
6132
- type=str)
6133
- if user_input != 'cancel all requests':
6134
- raise click.Abort()
6135
- if all:
6353
+ if not yes:
6354
+ keyword = 'ALL USERS\'' if all_users else 'YOUR'
6355
+ user_input = click.prompt(
6356
+ f'This will cancel all {keyword} requests.\n'
6357
+ f'To proceed, please type {colorama.Style.BRIGHT}'
6358
+ f'\'cancel all requests\'{colorama.Style.RESET_ALL}',
6359
+ type=str)
6360
+ if user_input != 'cancel all requests':
6361
+ raise click.Abort()
6136
6362
  request_ids = None
6137
6363
  cancelled_request_ids = sdk.get(
6138
6364
  sdk.api_cancel(request_ids=request_ids, all_users=all_users))
@@ -6146,9 +6372,28 @@ def api_cancel(request_ids: Optional[List[str]], all: bool, all_users: bool):
6146
6372
  fg='green')
6147
6373
 
6148
6374
 
6375
+ class IntOrNone(click.ParamType):
6376
+ """Int or None"""
6377
+ name = 'int-or-none'
6378
+
6379
+ def convert(self, value, param, ctx):
6380
+ if isinstance(value, int):
6381
+ return value
6382
+ if isinstance(value, str) and value.lower() in ('none', 'all'):
6383
+ return None
6384
+ try:
6385
+ return int(value)
6386
+ except ValueError:
6387
+ self.fail(f'{value!r} is not a valid integer or "none" or "all"',
6388
+ param, ctx)
6389
+
6390
+
6391
+ INT_OR_NONE = IntOrNone()
6392
+
6393
+
6149
6394
  @api.command('status', cls=_DocumentedCodeCommand)
6150
6395
  @flags.config_option(expose_value=False)
6151
- @click.argument('request_ids',
6396
+ @click.argument('request_id_prefixes',
6152
6397
  required=False,
6153
6398
  type=str,
6154
6399
  nargs=-1,
@@ -6158,16 +6403,30 @@ def api_cancel(request_ids: Optional[List[str]], all: bool, all_users: bool):
6158
6403
  is_flag=True,
6159
6404
  default=False,
6160
6405
  required=False,
6161
- help='Show requests of all statuses.')
6406
+ help=('Show requests of all statuses, including finished ones '
6407
+ '(SUCCEEDED, FAILED, CANCELLED). By default, only active '
6408
+ 'requests (PENDING, RUNNING) are shown.'))
6409
+ @click.option(
6410
+ '--limit',
6411
+ '-l',
6412
+ default=_NUM_REQUESTS_TO_SHOW,
6413
+ type=INT_OR_NONE,
6414
+ required=False,
6415
+ help=(f'Number of requests to show (default: {_NUM_REQUESTS_TO_SHOW}). '
6416
+ 'Set to "none" or "all" to show all requests.'))
6162
6417
  @flags.verbose_option('Show more details.')
6163
6418
  @usage_lib.entrypoint
6164
6419
  # pylint: disable=redefined-builtin
6165
- def api_status(request_ids: Optional[List[str]], all_status: bool,
6166
- verbose: bool):
6420
+ def api_status(request_id_prefixes: Optional[List[str]], all_status: bool,
6421
+ verbose: bool, limit: Optional[int]):
6167
6422
  """List requests on SkyPilot API server."""
6168
- if not request_ids:
6169
- request_ids = None
6170
- request_list = sdk.api_status(request_ids, all_status)
6423
+ if not request_id_prefixes:
6424
+ request_id_prefixes = None
6425
+ fields = _DEFAULT_REQUEST_FIELDS_TO_SHOW
6426
+ if verbose:
6427
+ fields = _VERBOSE_REQUEST_FIELDS_TO_SHOW
6428
+ request_list = sdk.api_status(request_id_prefixes, all_status, limit,
6429
+ fields)
6171
6430
  columns = ['ID', 'User', 'Name']
6172
6431
  if verbose:
6173
6432
  columns.append('Cluster')
@@ -6193,8 +6452,12 @@ def api_status(request_ids: Optional[List[str]], all_status: bool,
6193
6452
  if verbose:
6194
6453
  dummy_row.append('-')
6195
6454
  table.add_row(dummy_row)
6196
- click.echo()
6197
6455
  click.echo(table)
6456
+ if limit and len(request_list) >= limit:
6457
+ click.echo()
6458
+ click.echo(
6459
+ f'Showing {limit} requests. Use "-l none" or "-l all" to show'
6460
+ f' all requests.')
6198
6461
 
6199
6462
 
6200
6463
  @api.command('login', cls=_DocumentedCodeCommand)