skypilot-nightly 1.0.0.dev20250905__py3-none-any.whl → 1.0.0.dev20251210__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +12 -2
- sky/adaptors/aws.py +27 -22
- sky/adaptors/common.py +25 -2
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/do.py +8 -2
- sky/adaptors/gcp.py +11 -0
- sky/adaptors/ibm.py +5 -2
- sky/adaptors/kubernetes.py +64 -0
- sky/adaptors/nebius.py +3 -1
- sky/adaptors/primeintellect.py +1 -0
- sky/adaptors/seeweb.py +183 -0
- sky/adaptors/shadeform.py +89 -0
- sky/adaptors/slurm.py +478 -0
- sky/admin_policy.py +20 -0
- sky/authentication.py +157 -263
- sky/backends/__init__.py +3 -2
- sky/backends/backend.py +11 -3
- sky/backends/backend_utils.py +630 -185
- sky/backends/cloud_vm_ray_backend.py +1111 -928
- sky/backends/local_docker_backend.py +9 -5
- sky/backends/task_codegen.py +971 -0
- sky/backends/wheel_utils.py +18 -0
- sky/catalog/__init__.py +8 -3
- sky/catalog/aws_catalog.py +4 -0
- sky/catalog/common.py +19 -1
- sky/catalog/data_fetchers/fetch_aws.py +102 -80
- sky/catalog/data_fetchers/fetch_gcp.py +30 -3
- sky/catalog/data_fetchers/fetch_nebius.py +9 -6
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/catalog/kubernetes_catalog.py +36 -32
- sky/catalog/primeintellect_catalog.py +95 -0
- sky/catalog/runpod_catalog.py +5 -1
- sky/catalog/seeweb_catalog.py +184 -0
- sky/catalog/shadeform_catalog.py +165 -0
- sky/catalog/slurm_catalog.py +243 -0
- sky/check.py +87 -46
- sky/client/cli/command.py +1004 -434
- sky/client/cli/flags.py +4 -2
- sky/{volumes/utils.py → client/cli/table_utils.py} +111 -13
- sky/client/cli/utils.py +79 -0
- sky/client/common.py +12 -2
- sky/client/sdk.py +188 -65
- sky/client/sdk_async.py +34 -33
- sky/cloud_stores.py +82 -3
- sky/clouds/__init__.py +8 -0
- sky/clouds/aws.py +337 -129
- sky/clouds/azure.py +24 -18
- sky/clouds/cloud.py +47 -13
- sky/clouds/cudo.py +16 -13
- sky/clouds/do.py +9 -7
- sky/clouds/fluidstack.py +12 -5
- sky/clouds/gcp.py +14 -7
- sky/clouds/hyperbolic.py +12 -5
- sky/clouds/ibm.py +12 -5
- sky/clouds/kubernetes.py +80 -45
- sky/clouds/lambda_cloud.py +12 -5
- sky/clouds/nebius.py +23 -9
- sky/clouds/oci.py +19 -12
- sky/clouds/paperspace.py +4 -1
- sky/clouds/primeintellect.py +317 -0
- sky/clouds/runpod.py +85 -24
- sky/clouds/scp.py +12 -8
- sky/clouds/seeweb.py +477 -0
- sky/clouds/shadeform.py +400 -0
- sky/clouds/slurm.py +578 -0
- sky/clouds/ssh.py +6 -3
- sky/clouds/utils/scp_utils.py +61 -50
- sky/clouds/vast.py +43 -27
- sky/clouds/vsphere.py +14 -16
- sky/core.py +296 -195
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/KYAhEFa3FTfq4JyKVgo-s/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1141-9c810f01ff4f398a.js +11 -0
- sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
- sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.ddda8c6c6f9f24dc.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
- sky/dashboard/out/_next/static/chunks/{6601-06114c982db410b6.js → 3800-b589397dc09c5b4e.js} +1 -1
- sky/dashboard/out/_next/static/chunks/3850-fd5696f3bbbaddae.js +1 -0
- sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
- sky/dashboard/out/_next/static/chunks/6856-da20c5fd999f319c.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-09cbf02d3cd518c3.js +1 -0
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
- sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
- sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
- sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-8369df1cf105221c.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-68b647e26f9d2793.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-33f525539665fdfd.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a7565f586ef86467.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-9e5d47818b9bdadd.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/{config-dfb9bf07b13045f4.js → config-718cdc365de82689.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-6563820e094f68ca.js → [context]-12c559ec4d81fdbd.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{infra-aabba60d57826e0f.js → infra-d187cd0413d72475.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-895847b6cf200b04.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-8d0f4655400b4eb9.js +21 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-e5a98f17f8513a96.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/plugins/[...slug]-4f46050ca065d8f8.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-2f7646eb77785a2c.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-ef19d49c6d0e8500.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-af76bb06dbb3954f.js → [name]-96e0f298308da7e2.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{workspaces-7598c33a746cdc91.js → workspaces-cb4da3abe08ebf19.js} +1 -1
- sky/dashboard/out/_next/static/chunks/webpack-fba3de387ff6bb08.js +1 -0
- sky/dashboard/out/_next/static/css/c5a4cfd2600fc715.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/plugins/[...slug].html +1 -0
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/data_utils.py +92 -1
- sky/data/mounting_utils.py +177 -30
- sky/data/storage.py +200 -19
- sky/data/storage_utils.py +10 -45
- sky/exceptions.py +18 -7
- sky/execution.py +74 -31
- sky/global_user_state.py +605 -191
- sky/jobs/__init__.py +2 -0
- sky/jobs/client/sdk.py +101 -4
- sky/jobs/client/sdk_async.py +31 -5
- sky/jobs/constants.py +15 -8
- sky/jobs/controller.py +726 -284
- sky/jobs/file_content_utils.py +128 -0
- sky/jobs/log_gc.py +193 -0
- sky/jobs/recovery_strategy.py +250 -100
- sky/jobs/scheduler.py +271 -173
- sky/jobs/server/core.py +367 -114
- sky/jobs/server/server.py +81 -35
- sky/jobs/server/utils.py +89 -35
- sky/jobs/state.py +1498 -620
- sky/jobs/utils.py +771 -306
- sky/logs/agent.py +40 -5
- sky/logs/aws.py +9 -19
- sky/metrics/utils.py +282 -39
- sky/models.py +2 -0
- sky/optimizer.py +7 -6
- sky/provision/__init__.py +38 -1
- sky/provision/aws/config.py +34 -13
- sky/provision/aws/instance.py +5 -2
- sky/provision/azure/instance.py +5 -3
- sky/provision/common.py +22 -0
- sky/provision/cudo/instance.py +4 -3
- sky/provision/do/instance.py +4 -3
- sky/provision/docker_utils.py +112 -28
- sky/provision/fluidstack/instance.py +6 -5
- sky/provision/gcp/config.py +6 -1
- sky/provision/gcp/instance.py +4 -2
- sky/provision/hyperbolic/instance.py +4 -2
- sky/provision/instance_setup.py +66 -20
- sky/provision/kubernetes/__init__.py +2 -0
- sky/provision/kubernetes/config.py +7 -44
- sky/provision/kubernetes/constants.py +0 -1
- sky/provision/kubernetes/instance.py +609 -213
- sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
- sky/provision/kubernetes/network.py +12 -8
- sky/provision/kubernetes/network_utils.py +8 -25
- sky/provision/kubernetes/utils.py +422 -422
- sky/provision/kubernetes/volume.py +150 -18
- sky/provision/lambda_cloud/instance.py +16 -13
- sky/provision/nebius/instance.py +6 -2
- sky/provision/nebius/utils.py +103 -86
- sky/provision/oci/instance.py +4 -2
- sky/provision/paperspace/instance.py +4 -3
- sky/provision/primeintellect/__init__.py +10 -0
- sky/provision/primeintellect/config.py +11 -0
- sky/provision/primeintellect/instance.py +454 -0
- sky/provision/primeintellect/utils.py +398 -0
- sky/provision/provisioner.py +45 -15
- sky/provision/runpod/__init__.py +2 -0
- sky/provision/runpod/instance.py +4 -3
- sky/provision/runpod/volume.py +69 -13
- sky/provision/scp/instance.py +307 -130
- sky/provision/seeweb/__init__.py +11 -0
- sky/provision/seeweb/config.py +13 -0
- sky/provision/seeweb/instance.py +812 -0
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/slurm/__init__.py +12 -0
- sky/provision/slurm/config.py +13 -0
- sky/provision/slurm/instance.py +572 -0
- sky/provision/slurm/utils.py +583 -0
- sky/provision/vast/instance.py +9 -4
- sky/provision/vast/utils.py +10 -6
- sky/provision/volume.py +164 -0
- sky/provision/vsphere/common/ssl_helper.py +1 -1
- sky/provision/vsphere/common/vapiconnect.py +2 -1
- sky/provision/vsphere/common/vim_utils.py +3 -2
- sky/provision/vsphere/instance.py +8 -6
- sky/provision/vsphere/vsphere_utils.py +8 -1
- sky/resources.py +11 -3
- sky/schemas/api/responses.py +107 -6
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
- sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
- sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
- sky/schemas/db/serve_state/002_yaml_content.py +34 -0
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/002_cluster_pool.py +3 -3
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
- sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
- sky/schemas/generated/jobsv1_pb2.py +86 -0
- sky/schemas/generated/jobsv1_pb2.pyi +254 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
- sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
- sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
- sky/schemas/generated/servev1_pb2.py +58 -0
- sky/schemas/generated/servev1_pb2.pyi +115 -0
- sky/schemas/generated/servev1_pb2_grpc.py +322 -0
- sky/serve/autoscalers.py +2 -0
- sky/serve/client/impl.py +55 -21
- sky/serve/constants.py +4 -3
- sky/serve/controller.py +17 -11
- sky/serve/load_balancing_policies.py +1 -1
- sky/serve/replica_managers.py +219 -142
- sky/serve/serve_rpc_utils.py +179 -0
- sky/serve/serve_state.py +63 -54
- sky/serve/serve_utils.py +145 -109
- sky/serve/server/core.py +46 -25
- sky/serve/server/impl.py +311 -162
- sky/serve/server/server.py +21 -19
- sky/serve/service.py +84 -68
- sky/serve/service_spec.py +45 -7
- sky/server/auth/loopback.py +38 -0
- sky/server/auth/oauth2_proxy.py +12 -7
- sky/server/common.py +47 -24
- sky/server/config.py +62 -28
- sky/server/constants.py +9 -1
- sky/server/daemons.py +109 -38
- sky/server/metrics.py +76 -96
- sky/server/middleware_utils.py +166 -0
- sky/server/plugins.py +222 -0
- sky/server/requests/executor.py +384 -145
- sky/server/requests/payloads.py +83 -19
- sky/server/requests/preconditions.py +15 -13
- sky/server/requests/request_names.py +123 -0
- sky/server/requests/requests.py +511 -157
- sky/server/requests/serializers/decoders.py +48 -17
- sky/server/requests/serializers/encoders.py +102 -20
- sky/server/requests/serializers/return_value_serializers.py +60 -0
- sky/server/requests/threads.py +117 -0
- sky/server/rest.py +116 -24
- sky/server/server.py +497 -179
- sky/server/server_utils.py +30 -0
- sky/server/stream_utils.py +219 -45
- sky/server/uvicorn.py +30 -19
- sky/setup_files/MANIFEST.in +6 -1
- sky/setup_files/alembic.ini +8 -0
- sky/setup_files/dependencies.py +64 -19
- sky/setup_files/setup.py +44 -44
- sky/sky_logging.py +13 -5
- sky/skylet/attempt_skylet.py +116 -24
- sky/skylet/configs.py +3 -1
- sky/skylet/constants.py +139 -29
- sky/skylet/events.py +74 -14
- sky/skylet/executor/__init__.py +1 -0
- sky/skylet/executor/slurm.py +189 -0
- sky/skylet/job_lib.py +143 -105
- sky/skylet/log_lib.py +252 -8
- sky/skylet/log_lib.pyi +47 -7
- sky/skylet/providers/ibm/node_provider.py +12 -8
- sky/skylet/providers/ibm/vpc_provider.py +13 -12
- sky/skylet/runtime_utils.py +21 -0
- sky/skylet/services.py +524 -0
- sky/skylet/skylet.py +27 -2
- sky/skylet/subprocess_daemon.py +104 -28
- sky/skypilot_config.py +99 -79
- sky/ssh_node_pools/constants.py +12 -0
- sky/ssh_node_pools/core.py +40 -3
- sky/ssh_node_pools/deploy/__init__.py +4 -0
- sky/ssh_node_pools/deploy/deploy.py +952 -0
- sky/ssh_node_pools/deploy/tunnel_utils.py +199 -0
- sky/ssh_node_pools/deploy/utils.py +173 -0
- sky/ssh_node_pools/server.py +20 -21
- sky/{utils/kubernetes/ssh_utils.py → ssh_node_pools/utils.py} +9 -6
- sky/task.py +221 -104
- sky/templates/aws-ray.yml.j2 +1 -0
- sky/templates/azure-ray.yml.j2 +1 -0
- sky/templates/cudo-ray.yml.j2 +1 -0
- sky/templates/do-ray.yml.j2 +1 -0
- sky/templates/fluidstack-ray.yml.j2 +1 -0
- sky/templates/gcp-ray.yml.j2 +1 -0
- sky/templates/hyperbolic-ray.yml.j2 +1 -0
- sky/templates/ibm-ray.yml.j2 +2 -1
- sky/templates/jobs-controller.yaml.j2 +3 -0
- sky/templates/kubernetes-ray.yml.j2 +204 -55
- sky/templates/lambda-ray.yml.j2 +1 -0
- sky/templates/nebius-ray.yml.j2 +3 -0
- sky/templates/oci-ray.yml.j2 +1 -0
- sky/templates/paperspace-ray.yml.j2 +1 -0
- sky/templates/primeintellect-ray.yml.j2 +72 -0
- sky/templates/runpod-ray.yml.j2 +1 -0
- sky/templates/scp-ray.yml.j2 +1 -0
- sky/templates/seeweb-ray.yml.j2 +171 -0
- sky/templates/shadeform-ray.yml.j2 +73 -0
- sky/templates/slurm-ray.yml.j2 +85 -0
- sky/templates/vast-ray.yml.j2 +2 -0
- sky/templates/vsphere-ray.yml.j2 +1 -0
- sky/templates/websocket_proxy.py +188 -43
- sky/usage/usage_lib.py +16 -4
- sky/users/model.conf +1 -1
- sky/users/permission.py +84 -44
- sky/users/rbac.py +31 -3
- sky/utils/accelerator_registry.py +6 -3
- sky/utils/admin_policy_utils.py +18 -5
- sky/utils/annotations.py +128 -6
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/atomic.py +1 -1
- sky/utils/auth_utils.py +153 -0
- sky/utils/cli_utils/status_utils.py +12 -7
- sky/utils/cluster_utils.py +28 -6
- sky/utils/command_runner.py +283 -30
- sky/utils/command_runner.pyi +63 -7
- sky/utils/common.py +3 -1
- sky/utils/common_utils.py +55 -7
- sky/utils/config_utils.py +1 -14
- sky/utils/context.py +127 -40
- sky/utils/context_utils.py +73 -18
- sky/utils/controller_utils.py +229 -70
- sky/utils/db/db_utils.py +95 -18
- sky/utils/db/kv_cache.py +149 -0
- sky/utils/db/migration_utils.py +24 -7
- sky/utils/env_options.py +4 -0
- sky/utils/git.py +559 -1
- sky/utils/kubernetes/create_cluster.sh +15 -30
- sky/utils/kubernetes/delete_cluster.sh +10 -7
- sky/utils/kubernetes/generate_kind_config.py +6 -66
- sky/utils/kubernetes/gpu_labeler.py +13 -3
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
- sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
- sky/utils/kubernetes/kubernetes_deploy_utils.py +187 -260
- sky/utils/kubernetes/rsync_helper.sh +11 -3
- sky/utils/kubernetes/ssh-tunnel.sh +7 -376
- sky/utils/kubernetes_enums.py +7 -15
- sky/utils/lock_events.py +4 -4
- sky/utils/locks.py +128 -31
- sky/utils/log_utils.py +0 -319
- sky/utils/resource_checker.py +13 -10
- sky/utils/resources_utils.py +53 -29
- sky/utils/rich_utils.py +8 -4
- sky/utils/schemas.py +138 -52
- sky/utils/subprocess_utils.py +17 -4
- sky/utils/thread_utils.py +91 -0
- sky/utils/timeline.py +2 -1
- sky/utils/ux_utils.py +35 -1
- sky/utils/volume.py +88 -4
- sky/utils/yaml_utils.py +9 -0
- sky/volumes/client/sdk.py +48 -10
- sky/volumes/server/core.py +59 -22
- sky/volumes/server/server.py +46 -17
- sky/volumes/volume.py +54 -42
- sky/workspaces/core.py +57 -21
- sky/workspaces/server.py +13 -12
- sky_templates/README.md +3 -0
- sky_templates/__init__.py +3 -0
- sky_templates/ray/__init__.py +0 -0
- sky_templates/ray/start_cluster +183 -0
- sky_templates/ray/stop_cluster +75 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/METADATA +343 -65
- skypilot_nightly-1.0.0.dev20251210.dist-info/RECORD +629 -0
- skypilot_nightly-1.0.0.dev20251210.dist-info/top_level.txt +2 -0
- sky/client/cli/git.py +0 -549
- sky/dashboard/out/_next/static/chunks/1121-408ed10b2f9fce17.js +0 -1
- sky/dashboard/out/_next/static/chunks/1141-943efc7aff0f0c06.js +0 -1
- sky/dashboard/out/_next/static/chunks/1836-37fede578e2da5f8.js +0 -40
- sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +0 -1
- sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
- sky/dashboard/out/_next/static/chunks/3785.4872a2f3aa489880.js +0 -1
- sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +0 -1
- sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +0 -21
- sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +0 -10
- sky/dashboard/out/_next/static/chunks/4725.10f7a9a5d3ea8208.js +0 -1
- sky/dashboard/out/_next/static/chunks/5339.3fda4a4010ff4e06.js +0 -51
- sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
- sky/dashboard/out/_next/static/chunks/649.b9d7f7d10c1b8c53.js +0 -45
- sky/dashboard/out/_next/static/chunks/6856-dca7962af4814e1b.js +0 -1
- sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
- sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +0 -6
- sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +0 -18
- sky/dashboard/out/_next/static/chunks/7669.1f5d9a402bf5cc42.js +0 -36
- sky/dashboard/out/_next/static/chunks/8969-0be3036bf86f8256.js +0 -1
- sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
- sky/dashboard/out/_next/static/chunks/9037-fa1737818d0a0969.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/_app-ce361c6959bc2001.js +0 -34
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1cbba24bd1bd35f8.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-0b4b35dc1dfe046c.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters-469814d711d63b1b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-dd64309c3fe67ed2.js +0 -11
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-07349868f7905d37.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/jobs-1f70d9faa564804f.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/users-018bf31cda52e11b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/volumes-739726d6b823f532.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-4fe903277b57b523.js +0 -1
- sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
- sky/dashboard/out/_next/static/mS-4qZPSkRuA1u-g2wQhg/_buildManifest.js +0 -1
- sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
- sky/utils/kubernetes/cleanup-tunnel.sh +0 -62
- sky/utils/kubernetes/deploy_remote_cluster.py +0 -1299
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
- skypilot_nightly-1.0.0.dev20250905.dist-info/RECORD +0 -547
- skypilot_nightly-1.0.0.dev20250905.dist-info/top_level.txt +0 -1
- /sky/dashboard/out/_next/static/{mS-4qZPSkRuA1u-g2wQhg → KYAhEFa3FTfq4JyKVgo-s}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/licenses/LICENSE +0 -0
sky/client/cli/command.py
CHANGED
|
@@ -32,6 +32,7 @@ import shlex
|
|
|
32
32
|
import shutil
|
|
33
33
|
import subprocess
|
|
34
34
|
import sys
|
|
35
|
+
import time
|
|
35
36
|
import traceback
|
|
36
37
|
import typing
|
|
37
38
|
from typing import (Any, Callable, Dict, Generator, List, Optional, Set, Tuple,
|
|
@@ -59,8 +60,9 @@ from sky import task as task_lib
|
|
|
59
60
|
from sky.adaptors import common as adaptors_common
|
|
60
61
|
from sky.client import sdk
|
|
61
62
|
from sky.client.cli import flags
|
|
62
|
-
from sky.client.cli import
|
|
63
|
-
from sky.
|
|
63
|
+
from sky.client.cli import table_utils
|
|
64
|
+
from sky.client.cli import utils as cli_utils
|
|
65
|
+
from sky.jobs.state import ManagedJobStatus
|
|
64
66
|
from sky.provision.kubernetes import constants as kubernetes_constants
|
|
65
67
|
from sky.provision.kubernetes import utils as kubernetes_utils
|
|
66
68
|
from sky.schemas.api import responses
|
|
@@ -79,7 +81,6 @@ from sky.utils import controller_utils
|
|
|
79
81
|
from sky.utils import dag_utils
|
|
80
82
|
from sky.utils import directory_utils
|
|
81
83
|
from sky.utils import env_options
|
|
82
|
-
from sky.utils import git as git_utils
|
|
83
84
|
from sky.utils import infra_utils
|
|
84
85
|
from sky.utils import log_utils
|
|
85
86
|
from sky.utils import registry
|
|
@@ -89,9 +90,9 @@ from sky.utils import status_lib
|
|
|
89
90
|
from sky.utils import subprocess_utils
|
|
90
91
|
from sky.utils import timeline
|
|
91
92
|
from sky.utils import ux_utils
|
|
93
|
+
from sky.utils import volume as volume_utils
|
|
92
94
|
from sky.utils import yaml_utils
|
|
93
95
|
from sky.utils.cli_utils import status_utils
|
|
94
|
-
from sky.volumes import utils as volumes_utils
|
|
95
96
|
from sky.volumes.client import sdk as volumes_sdk
|
|
96
97
|
|
|
97
98
|
if typing.TYPE_CHECKING:
|
|
@@ -113,6 +114,24 @@ an autogenerated name."""
|
|
|
113
114
|
# command.
|
|
114
115
|
_NUM_MANAGED_JOBS_TO_SHOW_IN_STATUS = 5
|
|
115
116
|
_NUM_MANAGED_JOBS_TO_SHOW = 50
|
|
117
|
+
_NUM_REQUESTS_TO_SHOW = 50
|
|
118
|
+
_DEFAULT_REQUEST_FIELDS_TO_SHOW = [
|
|
119
|
+
'request_id', 'name', 'user_id', 'status', 'created_at'
|
|
120
|
+
]
|
|
121
|
+
_VERBOSE_REQUEST_FIELDS_TO_SHOW = _DEFAULT_REQUEST_FIELDS_TO_SHOW + [
|
|
122
|
+
'cluster_name'
|
|
123
|
+
]
|
|
124
|
+
_DEFAULT_MANAGED_JOB_FIELDS_TO_GET = [
|
|
125
|
+
'job_id', 'task_id', 'workspace', 'job_name', 'task_name', 'resources',
|
|
126
|
+
'submitted_at', 'end_at', 'job_duration', 'recovery_count', 'status', 'pool'
|
|
127
|
+
]
|
|
128
|
+
_VERBOSE_MANAGED_JOB_FIELDS_TO_GET = _DEFAULT_MANAGED_JOB_FIELDS_TO_GET + [
|
|
129
|
+
'current_cluster_name', 'job_id_on_pool_cluster', 'start_at', 'infra',
|
|
130
|
+
'cloud', 'region', 'zone', 'cluster_resources', 'schedule_state', 'details',
|
|
131
|
+
'failure_reason', 'metadata'
|
|
132
|
+
]
|
|
133
|
+
_USER_NAME_FIELD = ['user_name']
|
|
134
|
+
_USER_HASH_FIELD = ['user_hash']
|
|
116
135
|
|
|
117
136
|
_STATUS_PROPERTY_CLUSTER_NUM_ERROR_MESSAGE = (
|
|
118
137
|
'{cluster_num} cluster{plural} {verb}. Please specify {cause} '
|
|
@@ -129,6 +148,7 @@ def _get_cluster_records_and_set_ssh_config(
|
|
|
129
148
|
clusters: Optional[List[str]],
|
|
130
149
|
refresh: common.StatusRefreshMode = common.StatusRefreshMode.NONE,
|
|
131
150
|
all_users: bool = False,
|
|
151
|
+
verbose: bool = False,
|
|
132
152
|
) -> List[responses.StatusResponse]:
|
|
133
153
|
"""Returns a list of clusters that match the glob pattern.
|
|
134
154
|
|
|
@@ -146,23 +166,30 @@ def _get_cluster_records_and_set_ssh_config(
|
|
|
146
166
|
request_id = sdk.status(clusters,
|
|
147
167
|
refresh=refresh,
|
|
148
168
|
all_users=all_users,
|
|
149
|
-
_include_credentials=True
|
|
169
|
+
_include_credentials=True,
|
|
170
|
+
_summary_response=not verbose)
|
|
150
171
|
cluster_records = sdk.stream_and_get(request_id)
|
|
151
172
|
# Update the SSH config for all clusters
|
|
152
173
|
for record in cluster_records:
|
|
153
174
|
handle = record['handle']
|
|
154
|
-
|
|
175
|
+
name = record['name']
|
|
155
176
|
if not (handle is not None and handle.cached_external_ips is not None
|
|
156
177
|
and 'credentials' in record):
|
|
157
178
|
# If the cluster is not UP or does not have credentials available,
|
|
158
179
|
# we need to remove the cluster from the SSH config.
|
|
159
|
-
cluster_utils.SSHConfigHelper.remove_cluster(
|
|
180
|
+
cluster_utils.SSHConfigHelper.remove_cluster(name)
|
|
181
|
+
continue
|
|
182
|
+
if not record['credentials']:
|
|
183
|
+
# The credential is missing for some reason, continue.
|
|
184
|
+
logger.debug(
|
|
185
|
+
f'Client did not receive SSH credential for cluster {name}')
|
|
160
186
|
continue
|
|
161
187
|
|
|
162
188
|
# During the failover, even though a cluster does not exist, the handle
|
|
163
189
|
# can still exist in the record, and we check for credentials to avoid
|
|
164
190
|
# updating the SSH config for non-existent clusters.
|
|
165
191
|
credentials = record['credentials']
|
|
192
|
+
ips = handle.cached_external_ips
|
|
166
193
|
if isinstance(handle.launched_resources.cloud, clouds.Kubernetes):
|
|
167
194
|
# Replace the proxy command to proxy through the SkyPilot API
|
|
168
195
|
# server with websocket.
|
|
@@ -191,10 +218,44 @@ def _get_cluster_records_and_set_ssh_config(
|
|
|
191
218
|
f'{server_common.get_server_url()} '
|
|
192
219
|
f'{handle.cluster_name}\"')
|
|
193
220
|
credentials['ssh_proxy_command'] = proxy_command
|
|
221
|
+
elif isinstance(handle.launched_resources.cloud, clouds.Slurm):
|
|
222
|
+
# TODO(kevin): This is a temporary workaround, ideally we want to
|
|
223
|
+
# get a shell through srun --pty bash on the existing sbatch job.
|
|
224
|
+
|
|
225
|
+
# Proxy through the controller/login node to reach the worker node.
|
|
226
|
+
if (handle.cached_internal_ips is None or
|
|
227
|
+
not handle.cached_internal_ips):
|
|
228
|
+
logger.debug(
|
|
229
|
+
f'Cluster {name} does not have cached internal IPs. '
|
|
230
|
+
'Skipping SSH config update.')
|
|
231
|
+
cluster_utils.SSHConfigHelper.remove_cluster(name)
|
|
232
|
+
continue
|
|
233
|
+
|
|
234
|
+
escaped_key_path = shlex.quote(
|
|
235
|
+
cluster_utils.SSHConfigHelper.generate_local_key_file(
|
|
236
|
+
handle.cluster_name, credentials))
|
|
237
|
+
controller_host = handle.cached_external_ips[0]
|
|
238
|
+
|
|
239
|
+
# Build jump proxy: ssh to worker via controller/login node
|
|
240
|
+
proxy_command = (f'ssh -tt -i {escaped_key_path} '
|
|
241
|
+
'-o StrictHostKeyChecking=no '
|
|
242
|
+
'-o UserKnownHostsFile=/dev/null '
|
|
243
|
+
'-o IdentitiesOnly=yes '
|
|
244
|
+
'-W %h:%p '
|
|
245
|
+
f'{handle.ssh_user}@{controller_host}')
|
|
246
|
+
original_proxy = credentials.get('ssh_proxy_command')
|
|
247
|
+
if original_proxy:
|
|
248
|
+
proxy_command += (
|
|
249
|
+
f' -o ProxyCommand={shlex.quote(original_proxy)}')
|
|
250
|
+
|
|
251
|
+
credentials['ssh_proxy_command'] = proxy_command
|
|
252
|
+
|
|
253
|
+
# For Slurm, use the worker's internal IP as the SSH target
|
|
254
|
+
ips = handle.cached_internal_ips
|
|
194
255
|
|
|
195
256
|
cluster_utils.SSHConfigHelper.add_cluster(
|
|
196
257
|
handle.cluster_name,
|
|
197
|
-
|
|
258
|
+
ips,
|
|
198
259
|
credentials,
|
|
199
260
|
handle.cached_external_ssh_ports,
|
|
200
261
|
handle.docker_user,
|
|
@@ -783,8 +844,8 @@ def _make_task_or_dag_from_entrypoint_with_overrides(
|
|
|
783
844
|
|
|
784
845
|
# Update the workdir config from the command line parameters.
|
|
785
846
|
# And update the envs and secrets from the workdir.
|
|
786
|
-
|
|
787
|
-
|
|
847
|
+
task.update_workdir(workdir, git_url, git_ref)
|
|
848
|
+
task.update_envs_and_secrets_from_workdir()
|
|
788
849
|
|
|
789
850
|
# job launch specific.
|
|
790
851
|
if job_recovery is not None:
|
|
@@ -799,73 +860,6 @@ def _make_task_or_dag_from_entrypoint_with_overrides(
|
|
|
799
860
|
return task
|
|
800
861
|
|
|
801
862
|
|
|
802
|
-
def _update_task_workdir(task: task_lib.Task, workdir: Optional[str],
|
|
803
|
-
git_url: Optional[str], git_ref: Optional[str]):
|
|
804
|
-
"""Updates the task workdir.
|
|
805
|
-
|
|
806
|
-
Args:
|
|
807
|
-
task: The task to update.
|
|
808
|
-
workdir: The workdir to update.
|
|
809
|
-
git_url: The git url to update.
|
|
810
|
-
git_ref: The git ref to update.
|
|
811
|
-
"""
|
|
812
|
-
if task.workdir is None or isinstance(task.workdir, str):
|
|
813
|
-
if workdir is not None:
|
|
814
|
-
task.workdir = workdir
|
|
815
|
-
return
|
|
816
|
-
if git_url is not None:
|
|
817
|
-
task.workdir = {}
|
|
818
|
-
task.workdir['url'] = git_url
|
|
819
|
-
if git_ref is not None:
|
|
820
|
-
task.workdir['ref'] = git_ref
|
|
821
|
-
return
|
|
822
|
-
return
|
|
823
|
-
if git_url is not None:
|
|
824
|
-
task.workdir['url'] = git_url
|
|
825
|
-
if git_ref is not None:
|
|
826
|
-
task.workdir['ref'] = git_ref
|
|
827
|
-
return
|
|
828
|
-
|
|
829
|
-
|
|
830
|
-
def _update_task_workdir_and_secrets_from_workdir(task: task_lib.Task):
|
|
831
|
-
"""Updates the task secrets from the workdir.
|
|
832
|
-
|
|
833
|
-
Args:
|
|
834
|
-
task: The task to update.
|
|
835
|
-
"""
|
|
836
|
-
if task.workdir is None:
|
|
837
|
-
return
|
|
838
|
-
if not isinstance(task.workdir, dict):
|
|
839
|
-
return
|
|
840
|
-
url = task.workdir['url']
|
|
841
|
-
ref = task.workdir.get('ref', '')
|
|
842
|
-
token = os.environ.get(git_utils.GIT_TOKEN_ENV_VAR)
|
|
843
|
-
ssh_key_path = os.environ.get(git_utils.GIT_SSH_KEY_PATH_ENV_VAR)
|
|
844
|
-
try:
|
|
845
|
-
git_repo = git.GitRepo(url, ref, token, ssh_key_path)
|
|
846
|
-
clone_info = git_repo.get_repo_clone_info()
|
|
847
|
-
if clone_info is None:
|
|
848
|
-
return
|
|
849
|
-
task.envs[git_utils.GIT_URL_ENV_VAR] = clone_info.url
|
|
850
|
-
if ref:
|
|
851
|
-
ref_type = git_repo.get_ref_type()
|
|
852
|
-
if ref_type == git.GitRefType.COMMIT:
|
|
853
|
-
task.envs[git_utils.GIT_COMMIT_HASH_ENV_VAR] = ref
|
|
854
|
-
elif ref_type == git.GitRefType.BRANCH:
|
|
855
|
-
task.envs[git_utils.GIT_BRANCH_ENV_VAR] = ref
|
|
856
|
-
elif ref_type == git.GitRefType.TAG:
|
|
857
|
-
task.envs[git_utils.GIT_TAG_ENV_VAR] = ref
|
|
858
|
-
if clone_info.token is None and clone_info.ssh_key is None:
|
|
859
|
-
return
|
|
860
|
-
if clone_info.token is not None:
|
|
861
|
-
task.secrets[git_utils.GIT_TOKEN_ENV_VAR] = clone_info.token
|
|
862
|
-
if clone_info.ssh_key is not None:
|
|
863
|
-
task.secrets[git_utils.GIT_SSH_KEY_ENV_VAR] = clone_info.ssh_key
|
|
864
|
-
except exceptions.GitError as e:
|
|
865
|
-
with ux_utils.print_exception_no_traceback():
|
|
866
|
-
raise ValueError(f'{str(e)}') from None
|
|
867
|
-
|
|
868
|
-
|
|
869
863
|
class _NaturalOrderGroup(click.Group):
|
|
870
864
|
"""Lists commands in the order defined in this script.
|
|
871
865
|
|
|
@@ -873,7 +867,19 @@ class _NaturalOrderGroup(click.Group):
|
|
|
873
867
|
"""
|
|
874
868
|
|
|
875
869
|
def list_commands(self, ctx): # pylint: disable=unused-argument
|
|
876
|
-
|
|
870
|
+
# Preserve definition order but hide aliases (same command object) and
|
|
871
|
+
# commands explicitly marked as hidden.
|
|
872
|
+
seen_commands = set()
|
|
873
|
+
names = []
|
|
874
|
+
for name, command in self.commands.items():
|
|
875
|
+
if getattr(command, 'hidden', False):
|
|
876
|
+
continue
|
|
877
|
+
command_id = id(command)
|
|
878
|
+
if command_id in seen_commands:
|
|
879
|
+
continue
|
|
880
|
+
seen_commands.add(command_id)
|
|
881
|
+
names.append(name)
|
|
882
|
+
return names
|
|
877
883
|
|
|
878
884
|
@usage_lib.entrypoint('sky.cli', fallback=True)
|
|
879
885
|
def invoke(self, ctx):
|
|
@@ -1160,7 +1166,7 @@ def launch(
|
|
|
1160
1166
|
if task.service is not None:
|
|
1161
1167
|
noun = 'pool' if task.service.pool else 'service'
|
|
1162
1168
|
capnoun = noun.capitalize()
|
|
1163
|
-
sysname = '
|
|
1169
|
+
sysname = 'Pool' if task.service.pool else 'SkyServe'
|
|
1164
1170
|
cmd = 'sky jobs pool apply' if task.service.pool else 'sky serve up'
|
|
1165
1171
|
logger.info(
|
|
1166
1172
|
f'{colorama.Fore.YELLOW}{capnoun} section will be ignored when '
|
|
@@ -1388,14 +1394,24 @@ def exec(
|
|
|
1388
1394
|
|
|
1389
1395
|
|
|
1390
1396
|
def _handle_jobs_queue_request(
|
|
1391
|
-
|
|
1392
|
-
|
|
1393
|
-
|
|
1394
|
-
|
|
1395
|
-
|
|
1397
|
+
request_id: server_common.RequestId[Union[
|
|
1398
|
+
List[responses.ManagedJobRecord],
|
|
1399
|
+
Tuple[List[responses.ManagedJobRecord], int, Dict[str, int], int]]],
|
|
1400
|
+
show_all: bool,
|
|
1401
|
+
show_user: bool,
|
|
1402
|
+
max_num_jobs_to_show: Optional[int],
|
|
1403
|
+
pool_status_request_id: Optional[server_common.RequestId[List[Dict[
|
|
1404
|
+
str, Any]]]] = None,
|
|
1405
|
+
is_called_by_user: bool = False,
|
|
1406
|
+
only_in_progress: bool = False,
|
|
1407
|
+
queue_result_version: cli_utils.QueueResultVersion = cli_utils.
|
|
1408
|
+
QueueResultVersion.V1,
|
|
1409
|
+
) -> Tuple[Optional[int], str]:
|
|
1396
1410
|
"""Get the in-progress managed jobs.
|
|
1397
1411
|
|
|
1398
1412
|
Args:
|
|
1413
|
+
request_id: The request ID for managed jobs.
|
|
1414
|
+
pool_status_request_id: The request ID for pool status, or None.
|
|
1399
1415
|
show_all: Show all information of each job (e.g., region, price).
|
|
1400
1416
|
show_user: Show the user who submitted the job.
|
|
1401
1417
|
max_num_jobs_to_show: If not None, limit the number of jobs to show to
|
|
@@ -1403,6 +1419,8 @@ def _handle_jobs_queue_request(
|
|
|
1403
1419
|
and `sky jobs queue`.
|
|
1404
1420
|
is_called_by_user: If this function is called by user directly, or an
|
|
1405
1421
|
internal call.
|
|
1422
|
+
only_in_progress: If True, only return the number of in-progress jobs.
|
|
1423
|
+
queue_result_version: The version of the queue result.
|
|
1406
1424
|
|
|
1407
1425
|
Returns:
|
|
1408
1426
|
A tuple of (num_in_progress_jobs, msg). If num_in_progress_jobs is None,
|
|
@@ -1413,11 +1431,47 @@ def _handle_jobs_queue_request(
|
|
|
1413
1431
|
# TODO(SKY-980): remove unnecessary fallbacks on the client side.
|
|
1414
1432
|
num_in_progress_jobs = None
|
|
1415
1433
|
msg = ''
|
|
1434
|
+
status_counts: Optional[Dict[str, int]] = None
|
|
1435
|
+
pool_status_result = None
|
|
1416
1436
|
try:
|
|
1417
1437
|
if not is_called_by_user:
|
|
1418
1438
|
usage_lib.messages.usage.set_internal()
|
|
1419
|
-
|
|
1420
|
-
|
|
1439
|
+
# Call both stream_and_get functions in parallel
|
|
1440
|
+
def get_jobs_queue_result():
|
|
1441
|
+
return sdk.stream_and_get(request_id)
|
|
1442
|
+
|
|
1443
|
+
def get_pool_status_result():
|
|
1444
|
+
if pool_status_request_id is not None:
|
|
1445
|
+
try:
|
|
1446
|
+
return sdk.stream_and_get(pool_status_request_id)
|
|
1447
|
+
except Exception: # pylint: disable=broad-except
|
|
1448
|
+
# If getting pool status fails, just continue without it
|
|
1449
|
+
return None
|
|
1450
|
+
return None
|
|
1451
|
+
|
|
1452
|
+
with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
|
|
1453
|
+
jobs_future = executor.submit(get_jobs_queue_result)
|
|
1454
|
+
pool_status_future = executor.submit(get_pool_status_result)
|
|
1455
|
+
|
|
1456
|
+
result = jobs_future.result()
|
|
1457
|
+
pool_status_result = pool_status_future.result()
|
|
1458
|
+
|
|
1459
|
+
if queue_result_version.v2():
|
|
1460
|
+
managed_jobs_, total, status_counts, _ = result
|
|
1461
|
+
if only_in_progress:
|
|
1462
|
+
num_in_progress_jobs = 0
|
|
1463
|
+
if status_counts:
|
|
1464
|
+
for status_value, count in status_counts.items():
|
|
1465
|
+
status_enum = managed_jobs.ManagedJobStatus(
|
|
1466
|
+
status_value)
|
|
1467
|
+
if not status_enum.is_terminal():
|
|
1468
|
+
num_in_progress_jobs += count
|
|
1469
|
+
else:
|
|
1470
|
+
num_in_progress_jobs = total
|
|
1471
|
+
else:
|
|
1472
|
+
managed_jobs_ = result
|
|
1473
|
+
num_in_progress_jobs = len(
|
|
1474
|
+
set(job['job_id'] for job in managed_jobs_))
|
|
1421
1475
|
except exceptions.ClusterNotUpError as e:
|
|
1422
1476
|
controller_status = e.cluster_status
|
|
1423
1477
|
msg = str(e)
|
|
@@ -1461,10 +1515,14 @@ def _handle_jobs_queue_request(
|
|
|
1461
1515
|
msg += ('Failed to query managed jobs: '
|
|
1462
1516
|
f'{common_utils.format_exception(e, use_bracket=True)}')
|
|
1463
1517
|
else:
|
|
1464
|
-
msg =
|
|
1465
|
-
|
|
1466
|
-
|
|
1467
|
-
|
|
1518
|
+
msg = table_utils.format_job_table(
|
|
1519
|
+
managed_jobs_,
|
|
1520
|
+
pool_status=pool_status_result,
|
|
1521
|
+
show_all=show_all,
|
|
1522
|
+
show_user=show_user,
|
|
1523
|
+
max_jobs=max_num_jobs_to_show,
|
|
1524
|
+
status_counts=status_counts,
|
|
1525
|
+
)
|
|
1468
1526
|
return num_in_progress_jobs, msg
|
|
1469
1527
|
|
|
1470
1528
|
|
|
@@ -1562,35 +1620,6 @@ def _handle_services_request(
|
|
|
1562
1620
|
return num_services, msg
|
|
1563
1621
|
|
|
1564
1622
|
|
|
1565
|
-
def _status_kubernetes(show_all: bool):
|
|
1566
|
-
"""Show all SkyPilot resources in the current Kubernetes context.
|
|
1567
|
-
|
|
1568
|
-
Args:
|
|
1569
|
-
show_all (bool): Show all job information (e.g., start time, failures).
|
|
1570
|
-
"""
|
|
1571
|
-
all_clusters, unmanaged_clusters, all_jobs, context = (sdk.stream_and_get(
|
|
1572
|
-
sdk.status_kubernetes()))
|
|
1573
|
-
click.echo(f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
|
|
1574
|
-
f'Kubernetes cluster state (context: {context})'
|
|
1575
|
-
f'{colorama.Style.RESET_ALL}')
|
|
1576
|
-
status_utils.show_kubernetes_cluster_status_table(unmanaged_clusters,
|
|
1577
|
-
show_all)
|
|
1578
|
-
if all_jobs:
|
|
1579
|
-
click.echo(f'\n{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
|
|
1580
|
-
f'Managed jobs'
|
|
1581
|
-
f'{colorama.Style.RESET_ALL}')
|
|
1582
|
-
msg = managed_jobs.format_job_table(all_jobs,
|
|
1583
|
-
show_all=show_all,
|
|
1584
|
-
show_user=False)
|
|
1585
|
-
click.echo(msg)
|
|
1586
|
-
if any(['sky-serve-controller' in c.cluster_name for c in all_clusters]):
|
|
1587
|
-
# TODO: Parse serve controllers and show services separately.
|
|
1588
|
-
# Currently we show a hint that services are shown as clusters.
|
|
1589
|
-
click.echo(f'\n{colorama.Style.DIM}Hint: SkyServe replica pods are '
|
|
1590
|
-
'shown in the "SkyPilot clusters" section.'
|
|
1591
|
-
f'{colorama.Style.RESET_ALL}')
|
|
1592
|
-
|
|
1593
|
-
|
|
1594
1623
|
def _show_endpoint(query_clusters: Optional[List[str]],
|
|
1595
1624
|
cluster_records: List[responses.StatusResponse], ip: bool,
|
|
1596
1625
|
endpoints: bool, endpoint: Optional[int]) -> None:
|
|
@@ -1717,15 +1746,7 @@ def _show_enabled_infra(
|
|
|
1717
1746
|
default=True,
|
|
1718
1747
|
is_flag=True,
|
|
1719
1748
|
required=False,
|
|
1720
|
-
help='Also show
|
|
1721
|
-
@click.option(
|
|
1722
|
-
'--kubernetes',
|
|
1723
|
-
'--k8s',
|
|
1724
|
-
default=False,
|
|
1725
|
-
is_flag=True,
|
|
1726
|
-
required=False,
|
|
1727
|
-
help='[Experimental] Show all SkyPilot resources (including from other '
|
|
1728
|
-
'users) in the current Kubernetes context.')
|
|
1749
|
+
help='Also show pools, if any.')
|
|
1729
1750
|
@click.argument('clusters',
|
|
1730
1751
|
required=False,
|
|
1731
1752
|
type=str,
|
|
@@ -1737,8 +1758,8 @@ def _show_enabled_infra(
|
|
|
1737
1758
|
# pylint: disable=redefined-builtin
|
|
1738
1759
|
def status(verbose: bool, refresh: bool, ip: bool, endpoints: bool,
|
|
1739
1760
|
endpoint: Optional[int], show_managed_jobs: bool,
|
|
1740
|
-
show_services: bool, show_pools: bool,
|
|
1741
|
-
|
|
1761
|
+
show_services: bool, show_pools: bool, clusters: List[str],
|
|
1762
|
+
all_users: bool):
|
|
1742
1763
|
# NOTE(dev): Keep the docstring consistent between the Python API and CLI.
|
|
1743
1764
|
"""Show clusters.
|
|
1744
1765
|
|
|
@@ -1801,9 +1822,6 @@ def status(verbose: bool, refresh: bool, ip: bool, endpoints: bool,
|
|
|
1801
1822
|
or for autostop-enabled clusters, use ``--refresh`` to query the latest
|
|
1802
1823
|
cluster statuses from the cloud providers.
|
|
1803
1824
|
"""
|
|
1804
|
-
if kubernetes:
|
|
1805
|
-
_status_kubernetes(verbose)
|
|
1806
|
-
return
|
|
1807
1825
|
# Do not show job queue if user specifies clusters, and if user
|
|
1808
1826
|
# specifies --ip or --endpoint(s).
|
|
1809
1827
|
show_managed_jobs = show_managed_jobs and not any([clusters, ip, endpoints])
|
|
@@ -1853,9 +1871,16 @@ def status(verbose: bool, refresh: bool, ip: bool, endpoints: bool,
|
|
|
1853
1871
|
|
|
1854
1872
|
# Phase 2: Parallel submission of all API requests
|
|
1855
1873
|
def submit_managed_jobs():
|
|
1856
|
-
|
|
1857
|
-
|
|
1858
|
-
|
|
1874
|
+
fields = _DEFAULT_MANAGED_JOB_FIELDS_TO_GET
|
|
1875
|
+
if all_users:
|
|
1876
|
+
fields = fields + _USER_NAME_FIELD
|
|
1877
|
+
return cli_utils.get_managed_job_queue(
|
|
1878
|
+
refresh=False,
|
|
1879
|
+
skip_finished=True,
|
|
1880
|
+
all_users=all_users,
|
|
1881
|
+
fields=fields,
|
|
1882
|
+
limit=_NUM_MANAGED_JOBS_TO_SHOW_IN_STATUS,
|
|
1883
|
+
)
|
|
1859
1884
|
|
|
1860
1885
|
def submit_services(
|
|
1861
1886
|
) -> Optional[server_common.RequestId[List[Dict[str, Any]]]]:
|
|
@@ -1870,17 +1895,7 @@ def status(verbose: bool, refresh: bool, ip: bool, endpoints: bool,
|
|
|
1870
1895
|
return None
|
|
1871
1896
|
|
|
1872
1897
|
def submit_workspace() -> Optional[server_common.RequestId[Dict[str, Any]]]:
|
|
1873
|
-
|
|
1874
|
-
return sdk.workspaces()
|
|
1875
|
-
except RuntimeError:
|
|
1876
|
-
# Backward compatibility for API server before #5660.
|
|
1877
|
-
# TODO(zhwu): remove this after 0.10.0.
|
|
1878
|
-
logger.warning(f'{colorama.Style.DIM}SkyPilot API server is '
|
|
1879
|
-
'in an old version, and may miss feature: '
|
|
1880
|
-
'workspaces. Update with: sky api stop; '
|
|
1881
|
-
'sky api start'
|
|
1882
|
-
f'{colorama.Style.RESET_ALL}')
|
|
1883
|
-
return None
|
|
1898
|
+
return sdk.workspaces()
|
|
1884
1899
|
|
|
1885
1900
|
active_workspace = skypilot_config.get_active_workspace()
|
|
1886
1901
|
|
|
@@ -1888,6 +1903,7 @@ def status(verbose: bool, refresh: bool, ip: bool, endpoints: bool,
|
|
|
1888
1903
|
return sdk.enabled_clouds(workspace=active_workspace, expand=True)
|
|
1889
1904
|
|
|
1890
1905
|
managed_jobs_queue_request_id = None
|
|
1906
|
+
queue_result_version = cli_utils.QueueResultVersion.V1
|
|
1891
1907
|
service_status_request_id = None
|
|
1892
1908
|
workspace_request_id = None
|
|
1893
1909
|
pool_status_request_id = None
|
|
@@ -1906,7 +1922,8 @@ def status(verbose: bool, refresh: bool, ip: bool, endpoints: bool,
|
|
|
1906
1922
|
|
|
1907
1923
|
# Get the request IDs
|
|
1908
1924
|
if show_managed_jobs:
|
|
1909
|
-
managed_jobs_queue_request_id
|
|
1925
|
+
(managed_jobs_queue_request_id,
|
|
1926
|
+
queue_result_version) = managed_jobs_request_future.result()
|
|
1910
1927
|
if show_services:
|
|
1911
1928
|
service_status_request_id = services_request_future.result()
|
|
1912
1929
|
if show_pools:
|
|
@@ -1927,7 +1944,7 @@ def status(verbose: bool, refresh: bool, ip: bool, endpoints: bool,
|
|
|
1927
1944
|
|
|
1928
1945
|
# Phase 3: Get cluster records and handle special cases
|
|
1929
1946
|
cluster_records = _get_cluster_records_and_set_ssh_config(
|
|
1930
|
-
query_clusters, refresh_mode, all_users)
|
|
1947
|
+
query_clusters, refresh_mode, all_users, verbose)
|
|
1931
1948
|
|
|
1932
1949
|
# TOOD(zhwu): setup the ssh config for status
|
|
1933
1950
|
if ip or show_endpoints:
|
|
@@ -1938,7 +1955,8 @@ def status(verbose: bool, refresh: bool, ip: bool, endpoints: bool,
|
|
|
1938
1955
|
controllers = []
|
|
1939
1956
|
for cluster_record in cluster_records:
|
|
1940
1957
|
cluster_name = cluster_record['name']
|
|
1941
|
-
controller = controller_utils.Controllers.from_name(
|
|
1958
|
+
controller = controller_utils.Controllers.from_name(
|
|
1959
|
+
cluster_name, expect_exact_match=False)
|
|
1942
1960
|
if controller is not None:
|
|
1943
1961
|
controllers.append(cluster_record)
|
|
1944
1962
|
else:
|
|
@@ -1967,10 +1985,14 @@ def status(verbose: bool, refresh: bool, ip: bool, endpoints: bool,
|
|
|
1967
1985
|
try:
|
|
1968
1986
|
num_in_progress_jobs, msg = _handle_jobs_queue_request(
|
|
1969
1987
|
managed_jobs_queue_request_id,
|
|
1988
|
+
pool_status_request_id=pool_status_request_id,
|
|
1970
1989
|
show_all=False,
|
|
1971
1990
|
show_user=all_users,
|
|
1972
1991
|
max_num_jobs_to_show=_NUM_MANAGED_JOBS_TO_SHOW_IN_STATUS,
|
|
1973
|
-
is_called_by_user=False
|
|
1992
|
+
is_called_by_user=False,
|
|
1993
|
+
only_in_progress=True,
|
|
1994
|
+
queue_result_version=queue_result_version,
|
|
1995
|
+
)
|
|
1974
1996
|
except KeyboardInterrupt:
|
|
1975
1997
|
sdk.api_cancel(managed_jobs_queue_request_id, silent=True)
|
|
1976
1998
|
managed_jobs_query_interrupted = True
|
|
@@ -2066,6 +2088,35 @@ def status(verbose: bool, refresh: bool, ip: bool, endpoints: bool,
|
|
|
2066
2088
|
click.echo('\n' + '\n'.join(hints))
|
|
2067
2089
|
|
|
2068
2090
|
|
|
2091
|
+
@cli.command(hidden=True)
|
|
2092
|
+
@flags.config_option(expose_value=False)
|
|
2093
|
+
@flags.verbose_option()
|
|
2094
|
+
def status_kubernetes(verbose: bool):
|
|
2095
|
+
"""[Experimental] Show all SkyPilot resources (including from other '
|
|
2096
|
+
'users) in the current Kubernetes context."""
|
|
2097
|
+
all_clusters, unmanaged_clusters, all_jobs, context = (sdk.stream_and_get(
|
|
2098
|
+
sdk.status_kubernetes()))
|
|
2099
|
+
click.echo(f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
|
|
2100
|
+
f'Kubernetes cluster state (context: {context})'
|
|
2101
|
+
f'{colorama.Style.RESET_ALL}')
|
|
2102
|
+
status_utils.show_kubernetes_cluster_status_table(unmanaged_clusters,
|
|
2103
|
+
show_all=verbose)
|
|
2104
|
+
if all_jobs:
|
|
2105
|
+
click.echo(f'\n{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
|
|
2106
|
+
f'Managed jobs'
|
|
2107
|
+
f'{colorama.Style.RESET_ALL}')
|
|
2108
|
+
msg = table_utils.format_job_table(all_jobs,
|
|
2109
|
+
show_all=verbose,
|
|
2110
|
+
show_user=False)
|
|
2111
|
+
click.echo(msg)
|
|
2112
|
+
if any(['sky-serve-controller' in c.cluster_name for c in all_clusters]):
|
|
2113
|
+
# TODO: Parse serve controllers and show services separately.
|
|
2114
|
+
# Currently we show a hint that services are shown as clusters.
|
|
2115
|
+
click.echo(f'\n{colorama.Style.DIM}Hint: SkyServe replica pods are '
|
|
2116
|
+
'shown in the "SkyPilot clusters" section.'
|
|
2117
|
+
f'{colorama.Style.RESET_ALL}')
|
|
2118
|
+
|
|
2119
|
+
|
|
2069
2120
|
@cli.command()
|
|
2070
2121
|
@flags.config_option(expose_value=False)
|
|
2071
2122
|
@flags.all_option('Show all cluster information.')
|
|
@@ -2104,7 +2155,8 @@ def cost_report(all: bool, days: int): # pylint: disable=redefined-builtin
|
|
|
2104
2155
|
for cluster_record in cluster_records:
|
|
2105
2156
|
cluster_name = cluster_record['name']
|
|
2106
2157
|
try:
|
|
2107
|
-
controller = controller_utils.Controllers.from_name(
|
|
2158
|
+
controller = controller_utils.Controllers.from_name(
|
|
2159
|
+
cluster_name, expect_exact_match=False)
|
|
2108
2160
|
except AssertionError:
|
|
2109
2161
|
# There could be some old controller clusters from previous
|
|
2110
2162
|
# versions that we should not show in the cost report.
|
|
@@ -2192,7 +2244,7 @@ def queue(clusters: List[str], skip_finished: bool, all_users: bool):
|
|
|
2192
2244
|
f'cluster {cluster!r}.{colorama.Style.RESET_ALL}\n'
|
|
2193
2245
|
f' {common_utils.format_exception(e)}')
|
|
2194
2246
|
return
|
|
2195
|
-
job_tables[cluster] =
|
|
2247
|
+
job_tables[cluster] = table_utils.format_job_queue(job_table)
|
|
2196
2248
|
|
|
2197
2249
|
subprocess_utils.run_in_parallel(_get_job_queue, clusters)
|
|
2198
2250
|
user_str = 'all users' if all_users else 'current user'
|
|
@@ -2213,6 +2265,12 @@ def queue(clusters: List[str], skip_finished: bool, all_users: bool):
|
|
|
2213
2265
|
is_flag=True,
|
|
2214
2266
|
default=False,
|
|
2215
2267
|
help='Stream the cluster provisioning logs (provision.log).')
|
|
2268
|
+
@click.option('--worker',
|
|
2269
|
+
'-w',
|
|
2270
|
+
default=None,
|
|
2271
|
+
type=int,
|
|
2272
|
+
help='The worker ID to stream the logs from. '
|
|
2273
|
+
'If not set, stream the logs of the head node.')
|
|
2216
2274
|
@click.option(
|
|
2217
2275
|
'--sync-down',
|
|
2218
2276
|
'-s',
|
|
@@ -2250,6 +2308,7 @@ def logs(
|
|
|
2250
2308
|
cluster: str,
|
|
2251
2309
|
job_ids: Tuple[str, ...],
|
|
2252
2310
|
provision: bool,
|
|
2311
|
+
worker: Optional[int],
|
|
2253
2312
|
sync_down: bool,
|
|
2254
2313
|
status: bool, # pylint: disable=redefined-outer-name
|
|
2255
2314
|
follow: bool,
|
|
@@ -2279,6 +2338,13 @@ def logs(
|
|
|
2279
2338
|
4. If the job fails or fetching the logs fails, the command will exit with
|
|
2280
2339
|
a non-zero return code.
|
|
2281
2340
|
"""
|
|
2341
|
+
if worker is not None:
|
|
2342
|
+
if not provision:
|
|
2343
|
+
raise click.UsageError(
|
|
2344
|
+
'--worker can only be used with --provision.')
|
|
2345
|
+
if worker < 1:
|
|
2346
|
+
raise click.UsageError('--worker must be a positive integer.')
|
|
2347
|
+
|
|
2282
2348
|
if provision and (sync_down or status or job_ids):
|
|
2283
2349
|
raise click.UsageError(
|
|
2284
2350
|
'--provision cannot be combined with job log options '
|
|
@@ -2298,7 +2364,11 @@ def logs(
|
|
|
2298
2364
|
|
|
2299
2365
|
if provision:
|
|
2300
2366
|
# Stream provision logs
|
|
2301
|
-
sys.exit(
|
|
2367
|
+
sys.exit(
|
|
2368
|
+
sdk.tail_provision_logs(cluster_name=cluster,
|
|
2369
|
+
worker=worker,
|
|
2370
|
+
follow=follow,
|
|
2371
|
+
tail=tail))
|
|
2302
2372
|
|
|
2303
2373
|
if sync_down:
|
|
2304
2374
|
with rich_utils.client_status(
|
|
@@ -2476,7 +2546,8 @@ def cancel(
|
|
|
2476
2546
|
job_ids=job_ids_to_cancel)
|
|
2477
2547
|
_async_call_or_wait(request_id, async_call, 'sky.cancel')
|
|
2478
2548
|
except exceptions.NotSupportedError as e:
|
|
2479
|
-
controller = controller_utils.Controllers.from_name(
|
|
2549
|
+
controller = controller_utils.Controllers.from_name(
|
|
2550
|
+
cluster, expect_exact_match=False)
|
|
2480
2551
|
assert controller is not None, cluster
|
|
2481
2552
|
with ux_utils.print_exception_no_traceback():
|
|
2482
2553
|
raise click.UsageError(
|
|
@@ -2777,7 +2848,8 @@ def start(
|
|
|
2777
2848
|
# Get all clusters that are not controllers.
|
|
2778
2849
|
cluster_records = [
|
|
2779
2850
|
cluster for cluster in all_clusters
|
|
2780
|
-
if controller_utils.Controllers.from_name(
|
|
2851
|
+
if controller_utils.Controllers.from_name(
|
|
2852
|
+
cluster['name'], expect_exact_match=False) is None
|
|
2781
2853
|
]
|
|
2782
2854
|
if cluster_records is None:
|
|
2783
2855
|
# Get GLOB cluster names
|
|
@@ -2839,7 +2911,8 @@ def start(
|
|
|
2839
2911
|
# Checks for controller clusters (jobs controller / sky serve controller).
|
|
2840
2912
|
controllers, normal_clusters = [], []
|
|
2841
2913
|
for name in to_start:
|
|
2842
|
-
if controller_utils.Controllers.from_name(
|
|
2914
|
+
if controller_utils.Controllers.from_name(
|
|
2915
|
+
name, expect_exact_match=False) is not None:
|
|
2843
2916
|
controllers.append(name)
|
|
2844
2917
|
else:
|
|
2845
2918
|
normal_clusters.append(name)
|
|
@@ -2975,16 +3048,28 @@ def _hint_or_raise_for_down_jobs_controller(controller_name: str,
|
|
|
2975
3048
|
to be torn down (e.g., because it has jobs running or
|
|
2976
3049
|
it is in init state)
|
|
2977
3050
|
"""
|
|
2978
|
-
controller = controller_utils.Controllers.from_name(
|
|
3051
|
+
controller = controller_utils.Controllers.from_name(
|
|
3052
|
+
controller_name, expect_exact_match=False)
|
|
2979
3053
|
assert controller is not None, controller_name
|
|
2980
3054
|
|
|
3055
|
+
status_counts: Optional[Dict[str, int]] = None
|
|
3056
|
+
managed_jobs_: List[responses.ManagedJobRecord] = []
|
|
2981
3057
|
with rich_utils.client_status(
|
|
2982
3058
|
'[bold cyan]Checking for in-progress managed jobs and pools[/]'):
|
|
2983
3059
|
try:
|
|
2984
|
-
|
|
2985
|
-
|
|
2986
|
-
|
|
2987
|
-
|
|
3060
|
+
fields = _DEFAULT_MANAGED_JOB_FIELDS_TO_GET + _USER_NAME_FIELD
|
|
3061
|
+
request_id, queue_result_version = cli_utils.get_managed_job_queue(
|
|
3062
|
+
refresh=False,
|
|
3063
|
+
skip_finished=True,
|
|
3064
|
+
all_users=True,
|
|
3065
|
+
fields=fields,
|
|
3066
|
+
)
|
|
3067
|
+
result = sdk.stream_and_get(request_id)
|
|
3068
|
+
if queue_result_version.v2():
|
|
3069
|
+
managed_jobs_, _, status_counts, _ = result
|
|
3070
|
+
else:
|
|
3071
|
+
managed_jobs_ = typing.cast(List[responses.ManagedJobRecord],
|
|
3072
|
+
result)
|
|
2988
3073
|
request_id_pools = managed_jobs.pool_status(pool_names=None)
|
|
2989
3074
|
pools_ = sdk.stream_and_get(request_id_pools)
|
|
2990
3075
|
except exceptions.ClusterNotUpError as e:
|
|
@@ -3002,25 +3087,6 @@ def _hint_or_raise_for_down_jobs_controller(controller_name: str,
|
|
|
3002
3087
|
# there is no in-prgress managed jobs.
|
|
3003
3088
|
managed_jobs_ = []
|
|
3004
3089
|
pools_ = []
|
|
3005
|
-
except exceptions.InconsistentConsolidationModeError:
|
|
3006
|
-
# If this error is raised, it means the user switched to the
|
|
3007
|
-
# consolidation mode but the previous controller cluster is still
|
|
3008
|
-
# running. We should allow the user to tear down the controller
|
|
3009
|
-
# cluster in this case.
|
|
3010
|
-
with skypilot_config.override_skypilot_config(
|
|
3011
|
-
{'jobs': {
|
|
3012
|
-
'controller': {
|
|
3013
|
-
'consolidation_mode': False
|
|
3014
|
-
}
|
|
3015
|
-
}}):
|
|
3016
|
-
# Check again with the consolidation mode disabled. This is to
|
|
3017
|
-
# make sure there is no in-progress managed jobs.
|
|
3018
|
-
request_id = managed_jobs.queue(refresh=False,
|
|
3019
|
-
skip_finished=True,
|
|
3020
|
-
all_users=True)
|
|
3021
|
-
managed_jobs_ = sdk.stream_and_get(request_id)
|
|
3022
|
-
request_id_pools = managed_jobs.pool_status(pool_names=None)
|
|
3023
|
-
pools_ = sdk.stream_and_get(request_id_pools)
|
|
3024
3090
|
|
|
3025
3091
|
msg = (f'{colorama.Fore.YELLOW}WARNING: Tearing down the managed '
|
|
3026
3092
|
'jobs controller. Please be aware of the following:'
|
|
@@ -3029,9 +3095,12 @@ def _hint_or_raise_for_down_jobs_controller(controller_name: str,
|
|
|
3029
3095
|
'jobs (output of `sky jobs queue`) will be lost.')
|
|
3030
3096
|
click.echo(msg)
|
|
3031
3097
|
if managed_jobs_:
|
|
3032
|
-
job_table =
|
|
3033
|
-
|
|
3034
|
-
|
|
3098
|
+
job_table = table_utils.format_job_table(
|
|
3099
|
+
managed_jobs_,
|
|
3100
|
+
show_all=False,
|
|
3101
|
+
show_user=True,
|
|
3102
|
+
status_counts=status_counts,
|
|
3103
|
+
)
|
|
3035
3104
|
msg = controller.value.decline_down_for_dirty_controller_hint
|
|
3036
3105
|
# Add prefix to each line to align with the bullet point.
|
|
3037
3106
|
msg += '\n'.join(
|
|
@@ -3074,7 +3143,8 @@ def _hint_or_raise_for_down_sky_serve_controller(controller_name: str,
|
|
|
3074
3143
|
to be torn down (e.g., because it has services running or
|
|
3075
3144
|
it is in init state)
|
|
3076
3145
|
"""
|
|
3077
|
-
controller = controller_utils.Controllers.from_name(
|
|
3146
|
+
controller = controller_utils.Controllers.from_name(
|
|
3147
|
+
controller_name, expect_exact_match=False)
|
|
3078
3148
|
assert controller is not None, controller_name
|
|
3079
3149
|
with rich_utils.client_status('[bold cyan]Checking for live services[/]'):
|
|
3080
3150
|
try:
|
|
@@ -3093,21 +3163,6 @@ def _hint_or_raise_for_down_sky_serve_controller(controller_name: str,
             # controller being STOPPED or being firstly launched, i.e., there is
             # no in-prgress services.
             services = []
-        except exceptions.InconsistentConsolidationModeError:
-            # If this error is raised, it means the user switched to the
-            # consolidation mode but the previous controller cluster is still
-            # running. We should allow the user to tear down the controller
-            # cluster in this case.
-            with skypilot_config.override_skypilot_config(
-                    {'serve': {
-                        'controller': {
-                            'consolidation_mode': False
-                        }
-                    }}):
-                # Check again with the consolidation mode disabled. This is to
-                # make sure there is no in-progress services.
-                request_id = serve_lib.status(service_names=None)
-                services = sdk.stream_and_get(request_id)
 
     if services:
         service_names = [service['name'] for service in services]
@@ -3185,14 +3240,15 @@ def _down_or_stop_clusters(
     names = list(names)
     if names:
         controllers = [
-            name for name in names
-
+            name for name in names if controller_utils.Controllers.from_name(
+                name, expect_exact_match=False) is not None
         ]
         controllers_str = ', '.join(map(repr, controllers))
         names = [
             cluster['name']
             for cluster in _get_cluster_records_and_set_ssh_config(names)
-            if controller_utils.Controllers.from_name(
+            if controller_utils.Controllers.from_name(
+                cluster['name'], expect_exact_match=False) is None
         ]
 
         # Make sure the controllers are explicitly specified without other
@@ -3217,7 +3273,7 @@ def _down_or_stop_clusters(
                 f'{controllers_str} is currently not supported.')
         else:
             controller = controller_utils.Controllers.from_name(
-                controller_name)
+                controller_name, expect_exact_match=False)
             assert controller is not None
             hint_or_raise = _controller_to_hint_or_raise(controller)
             try:
@@ -3265,9 +3321,10 @@ def _down_or_stop_clusters(
         names = [
             record['name']
             for record in all_clusters
-            if controller_utils.Controllers.from_name(
-
-
+            if controller_utils.Controllers.from_name(
+                record['name'], expect_exact_match=False) is None and
+            (down or idle_minutes_to_autostop is not None or
+             record['status'] != status_lib.ClusterStatus.STOPPED)
         ]
 
         clusters = names
@@ -3297,6 +3354,9 @@ def _down_or_stop_clusters(
 
     request_ids = []
 
+    successes: List[str] = []
+    failures: List[Tuple[str, str]] = []
+
     def _down_or_stop(name: str):
         success_progress = False
         if idle_minutes_to_autostop is not None:
@@ -3304,16 +3364,20 @@ def _down_or_stop_clusters(
                 request_id = sdk.autostop(name, idle_minutes_to_autostop,
                                           wait_for, down)
                 request_ids.append(request_id)
+                progress.stop()
                 _async_call_or_wait(
                     request_id, async_call,
                     server_constants.REQUEST_NAME_PREFIX + operation)
-
-
+                progress.start()
+            except (exceptions.NotSupportedError, exceptions.ClusterNotUpError,
+                    exceptions.CloudError) as e:
                 message = str(e)
+                failures.append((name, str(e)))
             else: # no exception raised
                 success_progress = True
                 message = (f'{colorama.Fore.GREEN}{operation} '
                            f'cluster {name!r}...done{colorama.Style.RESET_ALL}')
+                successes.append(name)
                 if idle_minutes_to_autostop >= 0:
                     option_str = 'down' if down else 'stop'
                     passive_str = 'downed' if down else 'stopped'
@@ -3333,9 +3397,11 @@ def _down_or_stop_clusters(
             else:
                 request_id = sdk.stop(name, purge=purge)
                 request_ids.append(request_id)
+                progress.stop()
                 _async_call_or_wait(
                     request_id, async_call,
                     server_constants.REQUEST_NAME_PREFIX + operation)
+                progress.start()
             if not async_call:
                 # Remove the cluster from the SSH config file as soon as it
                 # is stopped or downed.
@@ -3345,13 +3411,17 @@ def _down_or_stop_clusters(
                     f'{colorama.Fore.RED}{operation} cluster {name}...failed. '
                     f'{colorama.Style.RESET_ALL}'
                     f'\nReason: {common_utils.format_exception(e)}.')
+                failures.append((name, str(e)))
             except (exceptions.NotSupportedError,
-                    exceptions.ClusterOwnerIdentityMismatchError
+                    exceptions.ClusterOwnerIdentityMismatchError,
+                    exceptions.CloudError) as e:
                 message = str(e)
+                failures.append((name, str(e)))
             else: # no exception raised
                 message = (
                     f'{colorama.Fore.GREEN}{operation} cluster {name}...done.'
                     f'{colorama.Style.RESET_ALL}')
+                successes.append(name)
                 if not down:
                     message += ('\n To restart the cluster, run: '
                                 f'{colorama.Style.BRIGHT}sky start {name}'
@@ -3365,6 +3435,10 @@ def _down_or_stop_clusters(
     progress.start()
 
     with progress:
+        # we write a new line here to avoid the "Waiting for 'sky.down'
+        # request to be scheduled" message from being printed on the same line
+        # as the "Terminating <num> clusters..." message
+        click.echo('')
         subprocess_utils.run_in_parallel(_down_or_stop, clusters)
         progress.live.transient = False
         # Make sure the progress bar not mess up the terminal.
@@ -3374,6 +3448,31 @@ def _down_or_stop_clusters(
         click.secho(f'{operation} requests are sent. Check the requests\' '
                     'status with `sky request get <request_id>`.')
 
+    show_summary = len(clusters) > 1
+
+    if show_summary:
+        click.echo('\nSummary:')
+        if successes:
+            # Preserve the original order of clusters as provided by user.
+            click.echo(' ✓ Succeeded: ' + ', '.join(successes))
+        if failures:
+            # Format failures: if one failure, keep on same line. If multiple,
+            # indent each failed cluster on its own line for readability.
+            if len(failures) == 1:
+                name, reason = failures[0]
+                first = reason.strip().splitlines()[0]
+                first = first if len(first) <= 120 else first[:120] + '…'
+                click.echo(f' ✗ Failed: {name} ({first})')
+            else:
+                click.echo(' ✗ Failed:')
+                for name, reason in failures:
+                    first = reason.strip().splitlines()[0]
+                    first = first if len(first) <= 120 else first[:120] + '…'
+                    click.echo(f' {name} ({first})')
+
+    if failures:
+        click.echo('Cluster(s) failed. See details above.')
+
 
 @cli.command(cls=_DocumentedCodeCommand)
 @flags.config_option(expose_value=False)
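A note on the summary block in the hunk above (this note and the snippet are editorial illustration, not part of the package diff): the summary only prints when more than one cluster was targeted, and each failure is reduced to the first line of its error message, truncated at 120 characters. A minimal, self-contained sketch of the same formatting, using made-up cluster names and error text:

```python
# Illustrative sketch of the summary formatting above; the names and the
# error string are hypothetical sample data, not SkyPilot output.
successes = ['dev-a', 'dev-b']
failures = [('train-c',
             'NotSupportedError: stopping this cluster is not supported.\nmore details...')]

if len(successes) + len(failures) > 1:  # the diff keys this off len(clusters) > 1
    print('\nSummary:')
    if successes:
        print(' ✓ Succeeded: ' + ', '.join(successes))
    for name, reason in failures:
        first = reason.strip().splitlines()[0]
        first = first if len(first) <= 120 else first[:120] + '…'
        print(f' ✗ Failed: {name} ({first})')
```

For this sample the output is a `Summary:` header, one `✓ Succeeded` line, and one `✗ Failed` line carrying only the first line of the error.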
@@ -3483,6 +3582,10 @@ def show_gpus(
     maximum quantities of the GPU available on a single node and the real-time
     availability of the GPU across all nodes in the Kubernetes cluster.
 
+    If ``--cloud slurm`` is specified, it will show the maximum quantities of
+    the GPU available on a single node and the real-time availability of the
+    GPU across all nodes in the Slurm cluster.
+
     Definitions of certain fields:
 
     * ``DEVICE_MEM``: Memory of a single device; does not depend on the device
@@ -3538,6 +3641,8 @@ def show_gpus(
     cloud_is_kubernetes = isinstance(
         cloud_obj, clouds.Kubernetes) and not isinstance(cloud_obj, clouds.SSH)
     cloud_is_ssh = isinstance(cloud_obj, clouds.SSH)
+    cloud_is_slurm = isinstance(cloud_obj, clouds.Slurm)
+
     # TODO(romilb): We should move this to the backend.
     kubernetes_autoscaling = skypilot_config.get_effective_region_config(
         cloud='kubernetes',
@@ -3546,6 +3651,7 @@ def show_gpus(
         default_value=None) is not None
     kubernetes_is_enabled = clouds.Kubernetes.canonical_name() in enabled_clouds
     ssh_is_enabled = clouds.SSH.canonical_name() in enabled_clouds
+    slurm_is_enabled = clouds.Slurm.canonical_name() in enabled_clouds
     query_k8s_realtime_gpu = (kubernetes_is_enabled and
                               (cloud_name is None or cloud_is_kubernetes))
     query_ssh_realtime_gpu = (ssh_is_enabled and
@@ -3605,8 +3711,9 @@ def show_gpus(
             raise ValueError(full_err_msg)
         no_permissions_str = '<no permissions>'
         realtime_gpu_infos = []
+        # Stores per-GPU totals as [ready_capacity, available, not_ready].
         total_gpu_info: Dict[str, List[int]] = collections.defaultdict(
-            lambda: [0, 0])
+            lambda: [0, 0, 0])
         all_nodes_info = []
 
         # display an aggregated table for all contexts
@@ -3617,6 +3724,33 @@ def show_gpus(
 
         num_filtered_contexts = 0
 
+        def _count_not_ready_gpus(
+            nodes_info: Optional['models.KubernetesNodesInfo']
+        ) -> Dict[str, int]:
+            """Return counts of GPUs on not ready nodes keyed by GPU type."""
+            not_ready_counts: Dict[str, int] = collections.defaultdict(int)
+            if nodes_info is None:
+                return not_ready_counts
+
+            node_info_dict = getattr(nodes_info, 'node_info_dict', {}) or {}
+            for node_info in node_info_dict.values():
+                accelerator_type = getattr(node_info, 'accelerator_type', None)
+                if not accelerator_type:
+                    continue
+
+                total_info = getattr(node_info, 'total', {})
+                accelerator_count = 0
+                if isinstance(total_info, dict):
+                    accelerator_count = int(
+                        total_info.get('accelerator_count', 0))
+                if accelerator_count <= 0:
+                    continue
+
+                node_is_ready = getattr(node_info, 'is_ready', True)
+                if not node_is_ready:
+                    not_ready_counts[accelerator_type] += accelerator_count
+            return not_ready_counts
+
         if realtime_gpu_availability_lists:
             for (ctx, availability_list) in realtime_gpu_availability_lists:
                 if not _filter_ctx(ctx):
@@ -3626,6 +3760,12 @@ def show_gpus(
                 else:
                     display_ctx = ctx
                 num_filtered_contexts += 1
+                # Collect node info for this context before building tables so
+                # we can exclude GPUs on not ready nodes from the totals.
+                nodes_info = sdk.stream_and_get(
+                    sdk.kubernetes_node_info(context=ctx))
+                context_not_ready_counts = _count_not_ready_gpus(nodes_info)
+
                 realtime_gpu_table = log_utils.create_table(
                     ['GPU', qty_header, 'UTILIZATION'])
                 for realtime_gpu_availability in sorted(availability_list):
@@ -3634,24 +3774,116 @@ def show_gpus(
                     available_qty = (gpu_availability.available
                                      if gpu_availability.available != -1 else
                                      no_permissions_str)
+                    # Exclude GPUs on not ready nodes from capacity counts.
+                    not_ready_count = min(
+                        context_not_ready_counts.get(gpu_availability.gpu, 0),
+                        gpu_availability.capacity)
+                    # Ensure capacity is never below the reported available
+                    # quantity (if available is unknown, treat as 0 for totals).
+                    available_for_totals = max(
+                        gpu_availability.available
+                        if gpu_availability.available != -1 else 0, 0)
+                    effective_capacity = max(
+                        gpu_availability.capacity - not_ready_count,
+                        available_for_totals)
+                    utilization = (
+                        f'{available_qty} of {effective_capacity} free')
+                    if not_ready_count > 0:
+                        utilization += f' ({not_ready_count} not ready)'
                     realtime_gpu_table.add_row([
                         gpu_availability.gpu,
                         _list_to_str(gpu_availability.counts),
-
+                        utilization,
                     ])
                     gpu = gpu_availability.gpu
-                    capacity = gpu_availability.capacity
                     # we want total, so skip permission denied.
-
-
-                    total_gpu_info[gpu][
-                    total_gpu_info[gpu][
+                    if effective_capacity > 0 or not_ready_count > 0:
+                        total_gpu_info[gpu][0] += effective_capacity
+                        total_gpu_info[gpu][1] += available_for_totals
+                        total_gpu_info[gpu][2] += not_ready_count
                 realtime_gpu_infos.append((display_ctx, realtime_gpu_table))
-                # Collect node info for this context
-                nodes_info = sdk.stream_and_get(
-                    sdk.kubernetes_node_info(context=ctx))
                 all_nodes_info.append((display_ctx, nodes_info))
         if num_filtered_contexts > 1:
+            total_realtime_gpu_table = log_utils.create_table(
+                ['GPU', 'UTILIZATION'])
+            for gpu, stats in total_gpu_info.items():
+                not_ready = stats[2]
+                utilization = f'{stats[1]} of {stats[0]} free'
+                if not_ready > 0:
+                    utilization += f' ({not_ready} not ready)'
+                total_realtime_gpu_table.add_row([gpu, utilization])
+        else:
+            total_realtime_gpu_table = None
+
+        return realtime_gpu_infos, total_realtime_gpu_table, all_nodes_info
+
+    def _get_slurm_realtime_gpu_tables(
+        name_filter: Optional[str] = None,
+        quantity_filter: Optional[int] = None
+    ) -> Tuple[List[Tuple[str, 'prettytable.PrettyTable']],
+               Optional['prettytable.PrettyTable']]:
+        """Get Slurm GPU availability tables.
+
+        Args:
+            name_filter: Filter GPUs by name.
+            quantity_filter: Filter GPUs by quantity.
+
+        Returns:
+            A tuple of (realtime_gpu_infos, total_realtime_gpu_table).
+        """
+        if quantity_filter:
+            qty_header = 'QTY_FILTER'
+        else:
+            qty_header = 'REQUESTABLE_QTY_PER_NODE'
+
+        realtime_gpu_availability_lists = sdk.stream_and_get(
+            sdk.realtime_slurm_gpu_availability(
+                name_filter=name_filter, quantity_filter=quantity_filter))
+        if not realtime_gpu_availability_lists:
+            err_msg = 'No GPUs found in any Slurm partition. '
+            debug_msg = 'To further debug, run: sky check slurm '
+            if name_filter is not None:
+                gpu_info_msg = f' {name_filter!r}'
+                if quantity_filter is not None:
+                    gpu_info_msg += (' with requested quantity'
+                                     f' {quantity_filter}')
+                err_msg = (f'Resources{gpu_info_msg} not found '
+                           'in any Slurm partition. ')
+                debug_msg = ('To show available accelerators on Slurm,'
+                             ' run: sky show-gpus --cloud slurm ')
+            raise ValueError(err_msg + debug_msg)
+
+        realtime_gpu_infos = []
+        total_gpu_info: Dict[str, List[int]] = collections.defaultdict(
+            lambda: [0, 0])
+
+        for (slurm_cluster,
+             availability_list) in realtime_gpu_availability_lists:
+            realtime_gpu_table = log_utils.create_table(
+                ['GPU', qty_header, 'UTILIZATION'])
+            for realtime_gpu_availability in sorted(availability_list):
+                gpu_availability = models.RealtimeGpuAvailability(
+                    *realtime_gpu_availability)
+                # Use the counts directly from the backend, which are already
+                # generated in powers of 2 (plus any actual maximums)
+                requestable_quantities = gpu_availability.counts
+                realtime_gpu_table.add_row([
+                    gpu_availability.gpu,
+                    _list_to_str(requestable_quantities),
+                    (f'{gpu_availability.available} of '
+                     f'{gpu_availability.capacity} free'),
+                ])
+                gpu = gpu_availability.gpu
+                capacity = gpu_availability.capacity
+                available = gpu_availability.available
+                if capacity > 0:
+                    total_gpu_info[gpu][0] += capacity
+                    total_gpu_info[gpu][1] += available
+            realtime_gpu_infos.append((slurm_cluster, realtime_gpu_table))
+
+        # display an aggregated table for all partitions
+        # if there are more than one partitions with GPUs
+        if len(realtime_gpu_infos) > 1:
             total_realtime_gpu_table = log_utils.create_table(
                 ['GPU', 'UTILIZATION'])
             for gpu, stats in total_gpu_info.items():
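For orientation on the capacity arithmetic introduced in the hunk above (editorial note, not part of the package diff): per GPU type the totals now track effective capacity, availability, and GPUs sitting on NotReady nodes, and the effective capacity is clamped so it never drops below the reported availability. A small worked example with made-up numbers:

```python
# Hypothetical context: 8 GPUs of one type reported in total, 3 currently
# free, and 2 of the 8 live on a node that is NotReady.
capacity, available, not_ready_on_nodes = 8, 3, 2

not_ready_count = min(not_ready_on_nodes, capacity)                          # 2
available_for_totals = max(available if available != -1 else 0, 0)           # 3
effective_capacity = max(capacity - not_ready_count, available_for_totals)   # 6

utilization = f'{available} of {effective_capacity} free'
if not_ready_count > 0:
    utilization += f' ({not_ready_count} not ready)'
print(utilization)  # -> 3 of 6 free (2 not ready)
```

The same three quantities are what the aggregated table sums per GPU type across contexts.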
@@ -3660,7 +3892,7 @@ def show_gpus(
         else:
             total_realtime_gpu_table = None
 
-        return realtime_gpu_infos, total_realtime_gpu_table
+        return realtime_gpu_infos, total_realtime_gpu_table
 
     def _format_kubernetes_node_info_combined(
             contexts_info: List[Tuple[str, 'models.KubernetesNodesInfo']],
@@ -3684,11 +3916,16 @@ def show_gpus(
                 acc_type = node_info.accelerator_type
                 if acc_type is None:
                     acc_type = '-'
-
-
-                    f'{
-
-
+                utilization_str = (
+                    f'{available} of '
+                    f'{node_info.total["accelerator_count"]} free')
+                # Check if node is ready (defaults to True for backward
+                # compatibility with older server versions)
+                node_is_ready = getattr(node_info, 'is_ready', True)
+                if not node_is_ready:
+                    utilization_str += ' (Node NotReady)'
+                node_table.add_row(
+                    [context_name, node_name, acc_type, utilization_str])
 
         k8s_per_node_acc_message = (f'{cloud_str} per-node GPU availability')
         if hints:
@@ -3699,6 +3936,43 @@ def show_gpus(
                 f'{colorama.Style.RESET_ALL}\n'
                 f'{node_table.get_string()}')
 
+    def _format_slurm_node_info() -> str:
+        node_table = log_utils.create_table([
+            'CLUSTER',
+            'NODE',
+            'PARTITION',
+            'STATE',
+            'GPU',
+            'UTILIZATION',
+        ])
+
+        # Get all cluster names
+        slurm_cluster_names = clouds.Slurm.existing_allowed_clusters()
+
+        # Query each cluster
+        for cluster_name in slurm_cluster_names:
+            nodes_info = sdk.stream_and_get(
+                sdk.slurm_node_info(slurm_cluster_name=cluster_name))
+
+            for node_info in nodes_info:
+                node_table.add_row([
+                    cluster_name,
+                    node_info.get('node_name'),
+                    node_info.get('partition', '-'),
+                    node_info.get('node_state'),
+                    node_info.get('gpu_type') or '',
+                    (f'{node_info.get("free_gpus", 0)} of '
+                     f'{node_info.get("total_gpus", 0)} free'),
+                ])
+
+        slurm_per_node_msg = 'Slurm per node accelerator availability'
+        # Optional: Add hint message if needed, similar to k8s
+
+        return (f'{colorama.Fore.LIGHTMAGENTA_EX}{colorama.Style.NORMAL}'
+                f'{slurm_per_node_msg}'
+                f'{colorama.Style.RESET_ALL}\n'
+                f'{node_table.get_string()}')
+
     def _format_kubernetes_realtime_gpu(
             total_table: Optional['prettytable.PrettyTable'],
             k8s_realtime_infos: List[Tuple[str, 'prettytable.PrettyTable']],
@@ -3828,6 +4102,28 @@ def show_gpus(
                 return True, print_section_titles
         return False, print_section_titles
 
+    def _format_slurm_realtime_gpu(
+            total_table, slurm_realtime_infos,
+            show_node_info: bool) -> Generator[str, None, None]:
+        # print total table
+        yield (f'{colorama.Fore.GREEN}{colorama.Style.BRIGHT}'
+               'Slurm GPUs'
+               f'{colorama.Style.RESET_ALL}\n')
+        if total_table is not None:
+            yield from total_table.get_string()
+            yield '\n'
+
+        # print individual infos.
+        for (partition, slurm_realtime_table) in slurm_realtime_infos:
+            partition_str = f'Slurm Cluster: {partition}'
+            yield (f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
+                   f'{partition_str}'
+                   f'{colorama.Style.RESET_ALL}\n')
+            yield from slurm_realtime_table.get_string()
+            yield '\n'
+        if show_node_info:
+            yield _format_slurm_node_info()
+
     def _output() -> Generator[str, None, None]:
         gpu_table = log_utils.create_table(
             ['COMMON_GPU', 'AVAILABLE_QUANTITIES'])
@@ -3845,10 +4141,12 @@ def show_gpus(
         if cloud_name is None:
             clouds_to_list = [
                 c for c in constants.ALL_CLOUDS
-                if c != 'kubernetes' and c != 'ssh'
+                if c != 'kubernetes' and c != 'ssh' and c != 'slurm'
             ]
 
         k8s_messages = ''
+        slurm_messages = ''
+        k8s_printed = False
         if accelerator_str is None:
             # Collect k8s related messages in k8s_messages and print them at end
             print_section_titles = False
@@ -3860,6 +4158,7 @@ def show_gpus(
                     yield '\n\n'
                 stop_iter_one, print_section_titles_one, k8s_messages_one = (
                     yield from _possibly_show_k8s_like_realtime(is_ssh))
+                k8s_printed = True
                 stop_iter = stop_iter or stop_iter_one
                 print_section_titles = (print_section_titles or
                                         print_section_titles_one)
@@ -3867,11 +4166,45 @@ def show_gpus(
                 prev_print_section_titles = print_section_titles_one
             if stop_iter:
                 return
+            # If cloud is slurm, we want to show real-time capacity
+            if slurm_is_enabled and (cloud_name is None or cloud_is_slurm):
+                try:
+                    # If --cloud slurm is not specified, we want to catch
+                    # the case where no GPUs are available on the cluster and
+                    # print the warning at the end.
+                    slurm_realtime_infos, total_table = (
+                        _get_slurm_realtime_gpu_tables())
+                except ValueError as e:
+                    if not cloud_is_slurm:
+                        # Make it a note if cloud is not slurm
+                        slurm_messages += 'Note: '
+                    slurm_messages += str(e)
+                else:
+                    print_section_titles = True
+                    if k8s_printed:
+                        yield '\n'
+
+                    yield from _format_slurm_realtime_gpu(total_table,
+                                                          slurm_realtime_infos,
+                                                          show_node_info=True)
+
+            if cloud_is_slurm:
+                # Do not show clouds if --cloud slurm is specified
+                if not slurm_is_enabled:
+                    yield ('Slurm is not enabled. To fix, run: '
+                           'sky check slurm ')
+                yield slurm_messages
+                return
 
             # For show_all, show the k8s message at the start since output is
             # long and the user may not scroll to the end.
-            if show_all and k8s_messages:
-
+            if show_all and (k8s_messages or slurm_messages):
+                if k8s_messages:
+                    yield k8s_messages
+                if slurm_messages:
+                    if k8s_messages:
+                        yield '\n'
+                    yield slurm_messages
                 yield '\n\n'
 
             list_accelerator_counts_result = sdk.stream_and_get(
@@ -3919,9 +4252,10 @@ def show_gpus(
             else:
                 yield ('\n\nHint: use -a/--all to see all accelerators '
                        '(including non-common ones) and pricing.')
-                if k8s_messages:
+                if k8s_messages or slurm_messages:
                     yield '\n'
                     yield k8s_messages
+                    yield slurm_messages
                 return
         else:
             # Parse accelerator string
@@ -3961,6 +4295,31 @@ def show_gpus(
             if stop_iter:
                 return
 
+            # Handle Slurm filtering by name and quantity
+            if (slurm_is_enabled and (cloud_name is None or cloud_is_slurm) and
+                    not show_all):
+                # Print section title if not showing all and instead a specific
+                # accelerator is requested
+                print_section_titles = True
+                try:
+                    slurm_realtime_infos, total_table = (
+                        _get_slurm_realtime_gpu_tables(name_filter=name,
+                                                       quantity_filter=quantity))
+
+                    yield from _format_slurm_realtime_gpu(total_table,
+                                                          slurm_realtime_infos,
+                                                          show_node_info=False)
+                except ValueError as e:
+                    # In the case of a specific accelerator, show the error message
+                    # immediately (e.g., "Resources A10G not found ...")
+                    yield str(e)
+                yield slurm_messages
+                if cloud_is_slurm:
+                    # Do not show clouds if --cloud slurm is specified
+                    if not slurm_is_enabled:
+                        yield ('Slurm is not enabled. To fix, run: '
+                               'sky check slurm ')
+                    return
+
             # For clouds other than Kubernetes, get the accelerator details
             # Case-sensitive
             list_accelerators_result = sdk.stream_and_get(
@@ -4093,8 +4452,7 @@ def storage_ls(verbose: bool):
     """List storage objects managed by SkyPilot."""
     request_id = sdk.storage_ls()
     storages = sdk.stream_and_get(request_id)
-    storage_table =
-        show_all=verbose)
+    storage_table = table_utils.format_storage_table(storages, show_all=verbose)
     click.echo(storage_table)
 
 
@@ -4174,6 +4532,10 @@ def volumes():
     pass
 
 
+# Add 'volume' as an alias for 'volumes'
+cli.add_command(volumes, name='volume')
+
+
 @volumes.command('apply', cls=_DocumentedCodeCommand)
 @flags.config_option(expose_value=False)
 @click.argument('entrypoint',
@@ -4189,17 +4551,25 @@ def volumes():
 @click.option('--infra',
               required=False,
               type=str,
-              help='
+              help='Infrastructure to use. '
+              'Format: cloud, cloud/region, cloud/region/zone, or '
+              'k8s/context-name.'
+              'Examples: k8s, k8s/my-context, runpod/US/US-CA-2. '
               'Override the infra defined in the YAML.')
-@click.option(
-
-
-
-              help='Volume type. Format: pvc. Override the type defined in the YAML.')
+@click.option('--type',
+              required=False,
+              type=click.Choice(volume_utils.VolumeType.supported_types()),
+              help='Volume type. Override the type defined in the YAML.')
 @click.option('--size',
               required=False,
               type=str,
               help='Volume size. Override the size defined in the YAML.')
+@click.option(
+    '--use-existing/--no-use-existing',
+    required=False,
+    default=None,
+    help='Whether to use an existing volume. Override the use_existing '
+    'defined in the YAML.')
 @click.option('--yes',
               '-y',
               is_flag=True,
@@ -4214,6 +4584,7 @@ def volumes_apply(
         infra: Optional[str],
         type: Optional[str], # pylint: disable=redefined-builtin
         size: Optional[str],
+        use_existing: Optional[bool],
         yes: bool,
         async_call: bool):
     """Apply a volume.
@@ -4226,7 +4597,11 @@ def volumes_apply(
       sky volumes apply volume.yaml
     \b
       # Apply a volume from a command.
-      sky volumes apply --name pvc1 --infra k8s --type pvc --size 100Gi
+      sky volumes apply --name pvc1 --infra k8s --type k8s-pvc --size 100Gi
+    \b
+      # Apply a volume with existing PVC `pvc2` from a command.
+      sky volumes apply --name pvc2 --infra k8s --type k8s-pvc --size 100Gi
+      --use-existing
     """
     # pylint: disable=import-outside-toplevel
     from sky.volumes import volume as volume_lib
@@ -4245,7 +4620,8 @@ def volumes_apply(
                 f'{entrypoint_str!r} needs to be a YAML file')
     if yaml_config is not None:
         volume_config_dict = yaml_config.copy()
-    override_config = _build_volume_override_config(name, infra, type, size
+    override_config = _build_volume_override_config(name, infra, type, size,
+                                                    use_existing)
     volume_config_dict.update(override_config)
 
     # Create Volume instance
@@ -4253,6 +4629,13 @@ def volumes_apply(
 
     logger.debug(f'Volume config: {volume.to_yaml_config()}')
 
+    # TODO(kevin): remove the try block in v0.13.0
+    try:
+        volumes_sdk.validate(volume)
+    except exceptions.APINotSupportedError:
+        # Do best-effort client-side validation.
+        volume.validate(skip_cloud_compatibility=True)
+
     if not yes:
         click.confirm(f'Proceed to create volume {volume.name!r}?',
                       default=True,
@@ -4269,11 +4652,15 @@ def volumes_apply(
                 f'{colorama.Style.RESET_ALL}')
 
 
-def _build_volume_override_config(
-
-
+def _build_volume_override_config(
+    name: Optional[str],
+    infra: Optional[str],
+    volume_type: Optional[str],
+    size: Optional[str],
+    use_existing: Optional[bool],
+) -> Dict[str, Any]:
     """Parse the volume override config."""
-    override_config = {}
+    override_config: Dict[str, Any] = {}
     if name is not None:
         override_config['name'] = name
     if infra is not None:
@@ -4282,6 +4669,8 @@ def _build_volume_override_config(name: Optional[str], infra: Optional[str],
         override_config['type'] = volume_type
     if size is not None:
         override_config['size'] = size
+    if use_existing is not None:
+        override_config['use_existing'] = use_existing
     return override_config
 
 
@@ -4298,8 +4687,8 @@ def volumes_ls(verbose: bool):
     """List volumes managed by SkyPilot."""
     request_id = volumes_sdk.ls()
     all_volumes = sdk.stream_and_get(request_id)
-    volume_table =
-
+    volume_table = table_utils.format_volume_table(all_volumes,
+                                                   show_all=verbose)
     click.echo(volume_table)
@@ -4537,10 +4926,11 @@ def jobs_launch(
                 break
         if print_setup_fm_warning:
             click.secho(
-                f'{colorama.Fore.YELLOW}
-                ' will be ignored when
-                f'please use `sky jobs pool apply {pool} new-pool.yaml`. '
+                f'{colorama.Fore.YELLOW}Setup, file mounts, and storage mounts'
+                ' will be ignored when submitting jobs to pool. To update a '
+                f'pool, please use `sky jobs pool apply {pool} new-pool.yaml`. '
                 f'{colorama.Style.RESET_ALL}')
+            print_setup_fm_warning = False
 
     # Optimize info is only show if _need_confirmation.
     if not yes:
@@ -4556,10 +4946,15 @@ def jobs_launch(
     job_id_handle = _async_call_or_wait(request_id, async_call,
                                         'sky.jobs.launch')
 
-    if
-
-
-
+    if async_call:
+        return
+
+    job_ids = [job_id_handle[0]] if isinstance(job_id_handle[0],
+                                               int) else job_id_handle[0]
+
+    if not detach_run:
+        if len(job_ids) == 1:
+            job_id = job_ids[0]
             returncode = managed_jobs.tail_logs(name=None,
                                                 job_id=job_id,
                                                 follow=True,
@@ -4568,7 +4963,8 @@ def jobs_launch(
         else:
             # TODO(tian): This can be very long. Considering have a "group id"
             # and query all job ids with the same group id.
-
+            # Sort job ids to ensure consistent ordering.
+            job_ids_str = ','.join(map(str, sorted(job_ids)))
             click.secho(
                 f'Jobs submitted with IDs: {colorama.Fore.CYAN}'
                 f'{job_ids_str}{colorama.Style.RESET_ALL}.'
@@ -4587,6 +4983,14 @@ def jobs_launch(
 @jobs.command('queue', cls=_DocumentedCodeCommand)
 @flags.config_option(expose_value=False)
 @flags.verbose_option()
+@click.option(
+    '--limit',
+    '-l',
+    default=_NUM_MANAGED_JOBS_TO_SHOW,
+    type=int,
+    required=False,
+    help=(f'Number of jobs to show, default is {_NUM_MANAGED_JOBS_TO_SHOW},'
+          f' use "-a/--all" to show all jobs.'))
 @click.option(
     '--refresh',
     '-r',
@@ -4606,7 +5010,7 @@ def jobs_launch(
 @usage_lib.entrypoint
 # pylint: disable=redefined-builtin
 def jobs_queue(verbose: bool, refresh: bool, skip_finished: bool,
-               all_users: bool, all: bool):
+               all_users: bool, all: bool, limit: int):
     """Show statuses of managed jobs.
 
     Each managed jobs can have one of the following statuses:
@@ -4657,18 +5061,56 @@ def jobs_queue(verbose: bool, refresh: bool, skip_finished: bool,
 
       watch -n60 sky jobs queue
 
+    (Tip) To show only the latest 10 jobs, use ``-l/--limit 10``:
+
+    .. code-block:: bash
+
+      sky jobs queue -l 10
+
     """
     click.secho('Fetching managed job statuses...', fg='cyan')
     with rich_utils.client_status('[cyan]Checking managed jobs[/]'):
-
-
-
+        max_num_jobs_to_show = (limit if not all else None)
+        fields = _DEFAULT_MANAGED_JOB_FIELDS_TO_GET
+        if verbose:
+            fields = _VERBOSE_MANAGED_JOB_FIELDS_TO_GET
+        if all_users:
+            fields = fields + _USER_NAME_FIELD
+            if verbose:
+                fields = fields + _USER_HASH_FIELD
+        # Call both cli_utils.get_managed_job_queue and managed_jobs.pool_status
+        # in parallel
+        def get_managed_jobs_queue():
+            return cli_utils.get_managed_job_queue(refresh=refresh,
+                                                   skip_finished=skip_finished,
+                                                   all_users=all_users,
+                                                   limit=max_num_jobs_to_show,
+                                                   fields=fields)
+
+        def get_pool_status():
+            try:
+                return managed_jobs.pool_status(pool_names=None)
+            except Exception: # pylint: disable=broad-except
+                # If pool_status fails, we'll just skip the worker information
+                return None
+
+        with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
+            managed_jobs_future = executor.submit(get_managed_jobs_queue)
+            pool_status_future = executor.submit(get_pool_status)
+
+            (managed_jobs_request_id,
+             queue_result_version) = managed_jobs_future.result()
+            pool_status_request_id = pool_status_future.result()
+
         num_jobs, msg = _handle_jobs_queue_request(
             managed_jobs_request_id,
+            pool_status_request_id=pool_status_request_id,
             show_all=verbose,
             show_user=all_users,
             max_num_jobs_to_show=max_num_jobs_to_show,
-            is_called_by_user=True
+            is_called_by_user=True,
+            queue_result_version=queue_result_version,
+        )
     if not skip_finished:
         in_progress_only_hint = ''
     else:
@@ -4681,7 +5123,8 @@ def jobs_queue(verbose: bool, refresh: bool, skip_finished: bool,
             f'{colorama.Fore.CYAN}'
             f'Only showing the latest {max_num_jobs_to_show} '
             f'managed jobs'
-            f'(use --
+            f'(use --limit to show more managed jobs or '
+            f'--all to show all managed jobs) {colorama.Style.RESET_ALL} ')
 
 
 @jobs.command('cancel', cls=_DocumentedCodeCommand)
@@ -4849,7 +5292,7 @@ def pool():
 @pool.command('apply', cls=_DocumentedCodeCommand)
 @flags.config_option(expose_value=False)
 @click.argument('pool_yaml',
-                required=
+                required=False,
                 type=str,
                 nargs=-1,
                 **_get_shell_complete_args(_complete_file_name))
@@ -4864,17 +5307,22 @@ def pool():
               type=click.Choice([m.value for m in serve_lib.UpdateMode],
                                 case_sensitive=False),
               required=False,
-              help=('Update mode. If "rolling",
-              'with rolling update. If "blue_green",
+              help=('Update mode. If "rolling", pool will be updated '
+                    'with rolling update. If "blue_green", pool will '
                     'be updated with blue-green update. This option is only '
                     'valid when the pool is already running.'))
+@click.option('--workers',
+              default=None,
+              type=int,
+              required=False,
+              help='Can be used to update the number of workers in the pool.')
 @_add_click_options(flags.TASK_OPTIONS + flags.EXTRA_RESOURCES_OPTIONS +
                     flags.COMMON_OPTIONS)
 @flags.yes_option()
 @timeline.event
 @usage_lib.entrypoint
 def jobs_pool_apply(
-        pool_yaml: Tuple[str, ...],
+        pool_yaml: Optional[Tuple[str, ...]],
         pool: Optional[str], # pylint: disable=redefined-outer-name
         workdir: Optional[str],
         infra: Optional[str],
@@ -4896,60 +5344,80 @@ def jobs_pool_apply(
         disk_tier: Optional[str],
         network_tier: Optional[str],
         mode: str,
+        workers: Optional[int],
         yes: bool,
         async_call: bool,
 ):
-    """
-
-
-
-
-
+    """Either apply a config to a pool for managed jobs submission
+    or update the number of workers in the pool. One of POOL_YAML or --workers
+    must be provided.
+    Config:
+      If the pool is already running, the config will be applied to the pool.
+      Otherwise, a new pool will be created.
+    Workers:
+      The --workers option can be used to override the number of workers
+      specified in the YAML file, or to update workers without a YAML file.
+      Example:
+        sky jobs pool apply -p my-pool --workers 5
     """
     cloud, region, zone = _handle_infra_cloud_region_zone_options(
         infra, cloud, region, zone)
-    if
-
+    if workers is not None and pool_yaml is not None and len(pool_yaml) > 0:
+        raise click.UsageError(
+            'Cannot specify both --workers and POOL_YAML. Please use one of '
+            'them.')
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    if pool_yaml is None or len(pool_yaml) == 0:
+        if pool is None:
+            raise click.UsageError(
+                'A pool name must be provided to update the number of workers.')
+        task = None
+        click.secho(f'Attempting to update {pool} to have {workers} workers',
+                    fg='cyan')
+    else:
+        if pool is None:
+            pool = serve_lib.generate_service_name(pool=True)
+
+        task = _generate_task_with_service(
+            service_name=pool,
+            service_yaml_args=pool_yaml,
+            workdir=workdir,
+            cloud=cloud,
+            region=region,
+            zone=zone,
+            gpus=gpus,
+            cpus=cpus,
+            memory=memory,
+            instance_type=instance_type,
+            num_nodes=num_nodes,
+            use_spot=use_spot,
+            image_id=image_id,
+            env_file=env_file,
+            env=env,
+            secret=secret,
+            disk_size=disk_size,
+            disk_tier=disk_tier,
+            network_tier=network_tier,
+            ports=ports,
+            not_supported_cmd='sky jobs pool up',
+            pool=True,
+        )
+        assert task.service is not None
+        if not task.service.pool:
+            raise click.UsageError('The YAML file needs a `pool` section.')
+        click.secho('Pool spec:', fg='cyan')
+        click.echo(task.service)
+        serve_lib.validate_service_task(task, pool=True)
 
-
-
-
-
-
+        click.secho(
+            'Each pool worker will use the following resources (estimated):',
+            fg='cyan')
+        with dag_lib.Dag() as dag:
+            dag.add(task)
 
     request_id = managed_jobs.pool_apply(task,
                                          pool,
+                                         workers=workers,
                                          mode=serve_lib.UpdateMode(mode),
                                          _need_confirmation=not yes)
     _async_call_or_wait(request_id, async_call, 'sky.jobs.pool_apply')
@@ -4962,7 +5430,7 @@ def jobs_pool_apply(
 @usage_lib.entrypoint
 # pylint: disable=redefined-builtin
 def jobs_pool_status(verbose: bool, pool_names: List[str]):
-    """Show statuses of
+    """Show statuses of pools.
 
     Show detailed statuses of one or more pools. If POOL_NAME is not
     provided, show all pools' status.
@@ -5018,12 +5486,108 @@ def jobs_pool_down(
         raise click.UsageError('Can only specify one of POOL_NAMES or --all. '
                                f'Provided {argument_str!r}.')
 
-
-
-
-
-
-
+    def _get_nonterminal_jobs(pool_names: List[str],
+                              all: bool) -> List[responses.ManagedJobRecord]:
+        # Get nonterminal jobs for this pool using managed_jobs.queue
+        request_id, queue_result_version = cli_utils.get_managed_job_queue(
+            refresh=False,
+            skip_finished=True,
+            all_users=True,
+            limit=None,
+            fields=['job_id', 'status', 'pool'],
+        )
+        jobs_result = sdk.stream_and_get(request_id)
+
+        # Handle both tuple and list responses
+        jobs_list: List[responses.ManagedJobRecord]
+        if queue_result_version.v2():
+            jobs_list = jobs_result[0]
+        else:
+            jobs_list = typing.cast(List[responses.ManagedJobRecord],
+                                    jobs_result)
+
+        def _should_include_job(job: responses.ManagedJobRecord) -> bool:
+            # Job must not be terminal.
+            if job.get('status', ManagedJobStatus.SUCCEEDED).is_terminal():
+                return False
+            # If len is 0 then we are using -a option, so we include all jobs
+            # if they're associated with a pool.
+            if all:
+                return job.get('pool') is not None
+            # Otherwise we are using specific pool names, so we include the job
+            # if it's associated with one of the specified pools.
+            return job.get('pool') in pool_names
+
+        # Filter jobs by pool name and ensure nonterminal
+        pool_jobs = [job for job in jobs_list if _should_include_job(job)]
+        return pool_jobs
+
+    quoted_pool_names = [f'{name!r}' for name in pool_names]
+    list_pool_str = ', '.join(quoted_pool_names)
+    pool_identity_str = f'pool(s) {list_pool_str}'
+    if all:
+        pool_identity_str = 'all pools'
+
+    already_confirmed = False
+    try:
+        pool_jobs = _get_nonterminal_jobs(pool_names, all)
+        if pool_jobs:
+            num_jobs = len(pool_jobs)
+            job_ids = [job['job_id'] for job in pool_jobs]
+            job_ids_str = ','.join(str(job_id) for job_id in job_ids)
+            click.echo(
+                f'{colorama.Fore.YELLOW}Pool(s) has {num_jobs} '
+                f'nonterminal jobs: {job_ids_str} so it is not yet safe to down'
+                f'.{colorama.Style.RESET_ALL}')
+            if not yes:
+                should_cancel = click.confirm(
+                    'Would you like to cancel all jobs and down the pool(s)?',
+                    default=False,
+                    abort=False,
+                    show_default=True)
+                if not should_cancel:
+                    raise click.Abort()
+                already_confirmed = True
+
+            # Cancel all jobs in the pool
+            with rich_utils.client_status(
+                    ux_utils.spinner_message(
+                        f'Cancelling {num_jobs} jobs in {pool_identity_str}...')
+            ):
+                try:
+                    sdk.get(managed_jobs.cancel(job_ids=job_ids))
+                except Exception as e:
+                    logger.warning(f'Failed to cancel jobs: {e}.')
+                    raise e
+
+                max_wait_time = 300 # 5 minutes max wait
+                check_interval = 2 # Check every 2 seconds
+                start_time = time.time()
+                remaining_pool_jobs = _get_nonterminal_jobs(pool_names, all)
+                while (remaining_pool_jobs and
+                       time.time() - start_time < max_wait_time):
+                    # Check remaining jobs via API
+                    time.sleep(check_interval)
+                    remaining_pool_jobs = _get_nonterminal_jobs(pool_names, all)
+                    ux_utils.spinner_message(
+                        f'Waiting for {len(remaining_pool_jobs)} '
+                        'jobs to be cancelled...')
+
+                click.echo('\r' + ' ' * 80 + '\r', nl=False)
+                if time.time() - start_time >= max_wait_time:
+                    click.echo(
+                        f'{colorama.Fore.YELLOW}Warning: Timeout waiting '
+                        f'for jobs to finish. Proceeding with pool down '
+                        f'anyway.{colorama.Style.RESET_ALL}')
+                else:
+                    click.echo('All jobs cancelled.')
+    except Exception as e: # pylint: disable=broad-except
+        # If API call fails, log warning but continue with pool down
+        logger.warning(
+            f'Failed to check for running jobs in pool(s): {pool_names!r}: {e}.'
+            ' Proceeding with pool down.')
+
+    if not yes and not already_confirmed:
         click.confirm(f'Terminating {pool_identity_str}. Proceed?',
                       default=True,
                       abort=True,
@@ -5205,22 +5769,22 @@ def jobs_pool_logs(
     .. code-block:: bash
 
       # Tail the controller logs of a pool
-      sky pool logs --controller [POOL_NAME]
+      sky jobs pool logs --controller [POOL_NAME]
     \b
       # Print the worker logs so far and exit
-      sky pool logs --no-follow [POOL_NAME]
+      sky jobs pool logs --no-follow [POOL_NAME] 1
     \b
       # Tail the logs of worker 1
-      sky pool logs [POOL_NAME] 1
+      sky jobs pool logs [POOL_NAME] 1
     \b
       # Show the last 100 lines of the controller logs
-      sky pool logs --controller --tail 100 [POOL_NAME]
+      sky jobs pool logs --controller --tail 100 [POOL_NAME]
     \b
       # Sync down all logs of the pool (controller, all workers)
-      sky pool logs [POOL_NAME] --sync-down
+      sky jobs pool logs [POOL_NAME] --sync-down
     \b
       # Sync down controller logs and logs for workers 1 and 3
-      sky pool logs [POOL_NAME] 1 3 --controller --sync-down
+      sky jobs pool logs [POOL_NAME] 1 3 --controller --sync-down
     """
     _handle_serve_logs(pool_name,
                        follow=follow,
@@ -5236,7 +5800,15 @@ def jobs_pool_logs(
 @flags.config_option(expose_value=False)
 @usage_lib.entrypoint
 def dashboard() -> None:
-    """
+    """Opens the SkyPilot dashboard."""
+    sdk.dashboard()
+
+
+@cli.command(cls=_DocumentedCodeCommand, hidden=True)
+@flags.config_option(expose_value=False)
+@usage_lib.entrypoint
+def ui() -> None:
+    """Opens the SkyPilot dashboard."""
     sdk.dashboard()
 
 
@@ -5247,28 +5819,30 @@ def serve():
 
 
 def _generate_task_with_service(
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        service_name: str,
+        service_yaml_args: Tuple[str, ...],
+        workdir: Optional[str],
+        cloud: Optional[str],
+        region: Optional[str],
+        zone: Optional[str],
+        num_nodes: Optional[int],
+        use_spot: Optional[bool],
+        image_id: Optional[str],
+        env_file: Optional[Dict[str, str]],
+        env: List[Tuple[str, str]],
+        secret: Optional[List[Tuple[str, str]]],
+        gpus: Optional[str],
+        instance_type: Optional[str],
+        ports: Optional[Tuple[str]],
+        cpus: Optional[str],
+        memory: Optional[str],
+        disk_size: Optional[int],
+        disk_tier: Optional[str],
+        network_tier: Optional[str],
+        not_supported_cmd: str,
+        pool: bool, # pylint: disable=redefined-outer-name
+        git_url: Optional[str] = None,
+        git_ref: Optional[str] = None,
 ) -> task_lib.Task:
     """Generate a task with service section from a service YAML file."""
     is_yaml, _ = _check_yaml(''.join(service_yaml_args))
@@ -5298,6 +5872,8 @@ def _generate_task_with_service(
         disk_tier=disk_tier,
         network_tier=network_tier,
         ports=ports,
+        git_url=git_url,
+        git_ref=git_ref,
     )
     if isinstance(task, dag_lib.Dag):
         raise click.UsageError(
@@ -5313,7 +5889,7 @@ def _generate_task_with_service(
     if task.service.pool:
         if task.service.ports is not None or ports:
             with ux_utils.print_exception_no_traceback():
-                raise ValueError('Cannot specify ports in a
+                raise ValueError('Cannot specify ports in a pool.')
         return task
 
     # NOTE(yi): we only allow one service port now.
@@ -5389,6 +5965,10 @@ def _generate_task_with_service(
               type=str,
               help='A service name. Unique for each service. If not provided, '
               'a unique name is autogenerated.')
+@click.option('--git-url', type=str, help='Git repository URL.')
+@click.option('--git-ref',
+              type=str,
+              help='Git reference (branch, tag, or commit hash) to use.')
 @_add_click_options(flags.TASK_OPTIONS + flags.EXTRA_RESOURCES_OPTIONS +
                     flags.COMMON_OPTIONS)
 @flags.yes_option()
@@ -5418,6 +5998,8 @@ def serve_up(
         network_tier: Optional[str],
         yes: bool,
         async_call: bool,
+        git_url: Optional[str] = None,
+        git_ref: Optional[str] = None,
 ):
     """Launch a SkyServe service.
 
@@ -5475,6 +6057,8 @@ def serve_up(
         ports=ports,
         not_supported_cmd='sky serve up',
         pool=False,
+        git_url=git_url,
+        git_ref=git_ref,
     )
     assert task.service is not None
     if task.service.pool:
@@ -5556,6 +6140,8 @@ def serve_update(
       sky serve update --mode blue_green sky-service-16aa new_service.yaml
 
     """
+    # TODO(lloyd-brown): Add a way to update number of replicas for serve
+    # the way we did for pools.
     cloud, region, zone = _handle_infra_cloud_region_zone_options(
         infra, cloud, region, zone)
     task = _generate_task_with_service(
@@ -5918,94 +6504,39 @@ def local():
     help='Launch cluster without GPU support even '
     'if GPUs are detected on the host.')
 @click.option(
-    '--
+    '--name',
     type=str,
     required=False,
-    help='
-@click.option('--ssh-user',
-              type=str,
-              required=False,
-              help='SSH username for accessing remote machines.')
-@click.option('--ssh-key-path',
-              type=str,
-              required=False,
-              help='Path to the SSH private key.')
-@click.option('--cleanup',
-              is_flag=True,
-              help='Clean up the remote cluster instead of deploying it.')
+    help='Name of the cluster. Defaults to "skypilot". Used without ip list.')
 @click.option(
-    '--
-    type=
+    '--port-start',
+    type=int,
     required=False,
-    help='
-
-
-    required=False,
-    help='Password for the ssh-user to execute sudo commands. '
-    'Required only if passwordless sudo is not setup.')
+    help='Starting port range for the local kind cluster. Needs to be a '
+    'multiple of 100. If not given, a random range will be used. '
+    'Used without ip list.')
 @local.command('up', cls=_DocumentedCodeCommand)
 @flags.config_option(expose_value=False)
 @_add_click_options(flags.COMMON_OPTIONS)
 @usage_lib.entrypoint
-def local_up(gpus: bool,
-
-
-
-
-    def _validate_args(ips, ssh_user, ssh_key_path, cleanup):
-        # If any of --ips, --ssh-user, or --ssh-key-path is specified,
-        # all must be specified
-        if bool(ips) or bool(ssh_user) or bool(ssh_key_path):
-            if not (ips and ssh_user and ssh_key_path):
-                raise click.BadParameter(
-                    'All --ips, --ssh-user, and --ssh-key-path '
-                    'must be specified together.')
-
-        # --cleanup can only be used if --ips, --ssh-user and --ssh-key-path
-        # are all provided
-        if cleanup and not (ips and ssh_user and ssh_key_path):
-            raise click.BadParameter('--cleanup can only be used with '
-                                     '--ips, --ssh-user and --ssh-key-path.')
-
-    _validate_args(ips, ssh_user, ssh_key_path, cleanup)
-
-    # If remote deployment arguments are specified, run remote up script
-    ip_list = None
-    ssh_key = None
-    if ips and ssh_user and ssh_key_path:
-        # Read and validate IP file
-        try:
-            with open(os.path.expanduser(ips), 'r', encoding='utf-8') as f:
-                ip_list = f.read().strip().splitlines()
-                if not ip_list:
-                    raise click.BadParameter(f'IP file is empty: {ips}')
-        except (IOError, OSError) as e:
-            raise click.BadParameter(f'Failed to read IP file {ips}: {str(e)}')
-
-        # Read and validate SSH key file
-        try:
-            with open(os.path.expanduser(ssh_key_path), 'r',
-                      encoding='utf-8') as f:
-                ssh_key = f.read()
-                if not ssh_key:
-                    raise click.BadParameter(
-                        f'SSH key file is empty: {ssh_key_path}')
-        except (IOError, OSError) as e:
-            raise click.BadParameter(
-                f'Failed to read SSH key file {ssh_key_path}: {str(e)}')
-
-    request_id = sdk.local_up(gpus, ip_list, ssh_user, ssh_key, cleanup,
-                              context_name, password)
+def local_up(gpus: bool, name: Optional[str], port_start: Optional[int],
+             async_call: bool):
+    """Creates a local cluster."""
+    request_id = sdk.local_up(gpus, name, port_start)
     _async_call_or_wait(request_id, async_call, request_name='local up')


+@click.option('--name',
+              type=str,
+              required=False,
+              help='Name of the cluster to down. Defaults to "skypilot".')
 @local.command('down', cls=_DocumentedCodeCommand)
 @flags.config_option(expose_value=False)
 @_add_click_options(flags.COMMON_OPTIONS)
 @usage_lib.entrypoint
-def local_down(async_call: bool):
+def local_down(name: Optional[str], async_call: bool):
     """Deletes a local cluster."""
-    request_id = sdk.local_down()
+    request_id = sdk.local_down(name)
     _async_call_or_wait(request_id, async_call, request_name='sky.local.down')


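The `sky local` hunk above replaces the remote IP/SSH deployment path with a named local (kind) cluster: `local_up` now forwards `gpus`, `name`, and `port_start` directly to the SDK, and `local_down` takes the cluster name. A hedged sketch of driving the same flow from Python, assuming the client SDK is importable as `sky.client.sdk` and that `sdk.get()` blocks on the returned request id the way the CLI's `_async_call_or_wait` does:

```python
from sky.client import sdk

# Bring up a local cluster named 'dev' without GPU support. Per the help
# text above, the starting port must be a multiple of 100; 46100 is an
# arbitrary example value.
request_id = sdk.local_up(False, 'dev', 46100)
sdk.get(request_id)

# Tear the same cluster down by name.
sdk.get(sdk.local_down('dev'))
```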
@@ -6119,20 +6650,22 @@ def api_logs(request_id: Optional[str], server_logs: bool,
                 **_get_shell_complete_args(_complete_api_request))
 @flags.all_option('Cancel all your requests.')
 @flags.all_users_option('Cancel all requests from all users.')
+@flags.yes_option()
 @usage_lib.entrypoint
 # pylint: disable=redefined-builtin
-def api_cancel(request_ids: Optional[List[str]], all: bool, all_users: bool):
+def api_cancel(request_ids: Optional[List[str]], all: bool, all_users: bool,
+               yes: bool):
     """Cancel a request running on SkyPilot API server."""
     if all or all_users:
-
-
-
-
-
-
-
-
-
+        if not yes:
+            keyword = 'ALL USERS\'' if all_users else 'YOUR'
+            user_input = click.prompt(
+                f'This will cancel all {keyword} requests.\n'
+                f'To proceed, please type {colorama.Style.BRIGHT}'
+                f'\'cancel all requests\'{colorama.Style.RESET_ALL}',
+                type=str)
+            if user_input != 'cancel all requests':
+                raise click.Abort()
         request_ids = None
     cancelled_request_ids = sdk.get(
         sdk.api_cancel(request_ids=request_ids, all_users=all_users))
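With the hunk above, `sky api cancel --all` (or `--all-users`) now refuses to proceed unless the user either passes `--yes` or types the literal phrase `cancel all requests` at the prompt. A standalone reproduction of that guard is below; the colorama styling from the hunk is dropped so the snippet needs only click.

```python
import click


def confirm_cancel_all(all_users: bool, yes: bool) -> None:
    """Abort unless the caller explicitly types 'cancel all requests'."""
    if yes:
        return
    keyword = 'ALL USERS\'' if all_users else 'YOUR'
    user_input = click.prompt(
        f'This will cancel all {keyword} requests.\n'
        'To proceed, please type \'cancel all requests\'',
        type=str)
    if user_input != 'cancel all requests':
        raise click.Abort()
```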
@@ -6146,9 +6679,28 @@ def api_cancel(request_ids: Optional[List[str]], all: bool, all_users: bool):
                 fg='green')


+class IntOrNone(click.ParamType):
+    """Int or None"""
+    name = 'int-or-none'
+
+    def convert(self, value, param, ctx):
+        if isinstance(value, int):
+            return value
+        if isinstance(value, str) and value.lower() in ('none', 'all'):
+            return None
+        try:
+            return int(value)
+        except ValueError:
+            self.fail(f'{value!r} is not a valid integer or "none" or "all"',
+                      param, ctx)
+
+
+INT_OR_NONE = IntOrNone()
+
+
 @api.command('status', cls=_DocumentedCodeCommand)
 @flags.config_option(expose_value=False)
-@click.argument('
+@click.argument('request_id_prefixes',
                 required=False,
                 type=str,
                 nargs=-1,
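The new `IntOrNone` parameter type lets an option accept either an integer or the case-insensitive strings `none`/`all`, which it maps to `None` (no limit). A quick check of that conversion behavior, using a local copy of the class so the snippet runs without SkyPilot installed:

```python
import click


class IntOrNone(click.ParamType):
    """Accept an int, or 'none'/'all' (case-insensitive) meaning None."""
    name = 'int-or-none'

    def convert(self, value, param, ctx):
        if isinstance(value, int):
            return value
        if isinstance(value, str) and value.lower() in ('none', 'all'):
            return None
        try:
            return int(value)
        except ValueError:
            # Raises click.UsageError for inputs like 'abc'.
            self.fail(f'{value!r} is not a valid integer or "none" or "all"',
                      param, ctx)


INT_OR_NONE = IntOrNone()
assert INT_OR_NONE.convert('25', None, None) == 25
assert INT_OR_NONE.convert('all', None, None) is None
assert INT_OR_NONE.convert('NONE', None, None) is None
```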
@@ -6158,16 +6710,30 @@ def api_cancel(request_ids: Optional[List[str]], all: bool, all_users: bool):
     is_flag=True,
     default=False,
     required=False,
-    help='Show requests of all statuses
+    help=('Show requests of all statuses, including finished ones '
+          '(SUCCEEDED, FAILED, CANCELLED). By default, only active '
+          'requests (PENDING, RUNNING) are shown.'))
+@click.option(
+    '--limit',
+    '-l',
+    default=_NUM_REQUESTS_TO_SHOW,
+    type=INT_OR_NONE,
+    required=False,
+    help=(f'Number of requests to show, default is {_NUM_REQUESTS_TO_SHOW},'
+          f' set to "none" or "all" to show all requests.'))
 @flags.verbose_option('Show more details.')
 @usage_lib.entrypoint
 # pylint: disable=redefined-builtin
-def api_status(request_ids: Optional[List[str]], all_status: bool,
-               verbose: bool):
+def api_status(request_id_prefixes: Optional[List[str]], all_status: bool,
+               verbose: bool, limit: Optional[int]):
     """List requests on SkyPilot API server."""
-    if not
-
-
+    if not request_id_prefixes:
+        request_id_prefixes = None
+    fields = _DEFAULT_REQUEST_FIELDS_TO_SHOW
+    if verbose:
+        fields = _VERBOSE_REQUEST_FIELDS_TO_SHOW
+    request_list = sdk.api_status(request_id_prefixes, all_status, limit,
+                                  fields)
     columns = ['ID', 'User', 'Name']
     if verbose:
         columns.append('Cluster')
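`sky api status` now takes optional request-id prefixes as positional arguments plus a `--limit`/`-l` option typed as `INT_OR_NONE` (default `_NUM_REQUESTS_TO_SHOW`), so `-l 100` shows at most 100 requests while `-l all` or `-l none` shows everything. A hedged sketch of the equivalent SDK call, assuming the import path `sky.client.sdk`, the positional signature used in the hunk (`request_id_prefixes, all_status, limit, fields`), and that passing `None` for `fields` falls back to default columns:

```python
from sky.client import sdk

# List up to 10 requests of any status (finished ones included). None for
# request_id_prefixes means "no prefix filter"; None for fields is an
# assumption standing in for the CLI's default field list.
for request in sdk.api_status(None, True, 10, None):
    print(request)
```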
@@ -6193,8 +6759,12 @@ def api_status(request_ids: Optional[List[str]], all_status: bool,
         if verbose:
             dummy_row.append('-')
         table.add_row(dummy_row)
-    click.echo()
     click.echo(table)
+    if limit and len(request_list) >= limit:
+        click.echo()
+        click.echo(
+            f'Showing {limit} requests. Use "-l none" or "-l all" to show'
+            f' all requests.')


 @api.command('login', cls=_DocumentedCodeCommand)