skypilot-nightly 1.0.0.dev20250905__py3-none-any.whl → 1.0.0.dev20251210__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +12 -2
- sky/adaptors/aws.py +27 -22
- sky/adaptors/common.py +25 -2
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/do.py +8 -2
- sky/adaptors/gcp.py +11 -0
- sky/adaptors/ibm.py +5 -2
- sky/adaptors/kubernetes.py +64 -0
- sky/adaptors/nebius.py +3 -1
- sky/adaptors/primeintellect.py +1 -0
- sky/adaptors/seeweb.py +183 -0
- sky/adaptors/shadeform.py +89 -0
- sky/adaptors/slurm.py +478 -0
- sky/admin_policy.py +20 -0
- sky/authentication.py +157 -263
- sky/backends/__init__.py +3 -2
- sky/backends/backend.py +11 -3
- sky/backends/backend_utils.py +630 -185
- sky/backends/cloud_vm_ray_backend.py +1111 -928
- sky/backends/local_docker_backend.py +9 -5
- sky/backends/task_codegen.py +971 -0
- sky/backends/wheel_utils.py +18 -0
- sky/catalog/__init__.py +8 -3
- sky/catalog/aws_catalog.py +4 -0
- sky/catalog/common.py +19 -1
- sky/catalog/data_fetchers/fetch_aws.py +102 -80
- sky/catalog/data_fetchers/fetch_gcp.py +30 -3
- sky/catalog/data_fetchers/fetch_nebius.py +9 -6
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/catalog/kubernetes_catalog.py +36 -32
- sky/catalog/primeintellect_catalog.py +95 -0
- sky/catalog/runpod_catalog.py +5 -1
- sky/catalog/seeweb_catalog.py +184 -0
- sky/catalog/shadeform_catalog.py +165 -0
- sky/catalog/slurm_catalog.py +243 -0
- sky/check.py +87 -46
- sky/client/cli/command.py +1004 -434
- sky/client/cli/flags.py +4 -2
- sky/{volumes/utils.py → client/cli/table_utils.py} +111 -13
- sky/client/cli/utils.py +79 -0
- sky/client/common.py +12 -2
- sky/client/sdk.py +188 -65
- sky/client/sdk_async.py +34 -33
- sky/cloud_stores.py +82 -3
- sky/clouds/__init__.py +8 -0
- sky/clouds/aws.py +337 -129
- sky/clouds/azure.py +24 -18
- sky/clouds/cloud.py +47 -13
- sky/clouds/cudo.py +16 -13
- sky/clouds/do.py +9 -7
- sky/clouds/fluidstack.py +12 -5
- sky/clouds/gcp.py +14 -7
- sky/clouds/hyperbolic.py +12 -5
- sky/clouds/ibm.py +12 -5
- sky/clouds/kubernetes.py +80 -45
- sky/clouds/lambda_cloud.py +12 -5
- sky/clouds/nebius.py +23 -9
- sky/clouds/oci.py +19 -12
- sky/clouds/paperspace.py +4 -1
- sky/clouds/primeintellect.py +317 -0
- sky/clouds/runpod.py +85 -24
- sky/clouds/scp.py +12 -8
- sky/clouds/seeweb.py +477 -0
- sky/clouds/shadeform.py +400 -0
- sky/clouds/slurm.py +578 -0
- sky/clouds/ssh.py +6 -3
- sky/clouds/utils/scp_utils.py +61 -50
- sky/clouds/vast.py +43 -27
- sky/clouds/vsphere.py +14 -16
- sky/core.py +296 -195
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/KYAhEFa3FTfq4JyKVgo-s/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1141-9c810f01ff4f398a.js +11 -0
- sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
- sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.ddda8c6c6f9f24dc.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
- sky/dashboard/out/_next/static/chunks/{6601-06114c982db410b6.js → 3800-b589397dc09c5b4e.js} +1 -1
- sky/dashboard/out/_next/static/chunks/3850-fd5696f3bbbaddae.js +1 -0
- sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
- sky/dashboard/out/_next/static/chunks/6856-da20c5fd999f319c.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-09cbf02d3cd518c3.js +1 -0
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
- sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
- sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
- sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-8369df1cf105221c.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-68b647e26f9d2793.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-33f525539665fdfd.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a7565f586ef86467.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-9e5d47818b9bdadd.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/{config-dfb9bf07b13045f4.js → config-718cdc365de82689.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-6563820e094f68ca.js → [context]-12c559ec4d81fdbd.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{infra-aabba60d57826e0f.js → infra-d187cd0413d72475.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-895847b6cf200b04.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-8d0f4655400b4eb9.js +21 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-e5a98f17f8513a96.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/plugins/[...slug]-4f46050ca065d8f8.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-2f7646eb77785a2c.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-ef19d49c6d0e8500.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-af76bb06dbb3954f.js → [name]-96e0f298308da7e2.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{workspaces-7598c33a746cdc91.js → workspaces-cb4da3abe08ebf19.js} +1 -1
- sky/dashboard/out/_next/static/chunks/webpack-fba3de387ff6bb08.js +1 -0
- sky/dashboard/out/_next/static/css/c5a4cfd2600fc715.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/plugins/[...slug].html +1 -0
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/data_utils.py +92 -1
- sky/data/mounting_utils.py +177 -30
- sky/data/storage.py +200 -19
- sky/data/storage_utils.py +10 -45
- sky/exceptions.py +18 -7
- sky/execution.py +74 -31
- sky/global_user_state.py +605 -191
- sky/jobs/__init__.py +2 -0
- sky/jobs/client/sdk.py +101 -4
- sky/jobs/client/sdk_async.py +31 -5
- sky/jobs/constants.py +15 -8
- sky/jobs/controller.py +726 -284
- sky/jobs/file_content_utils.py +128 -0
- sky/jobs/log_gc.py +193 -0
- sky/jobs/recovery_strategy.py +250 -100
- sky/jobs/scheduler.py +271 -173
- sky/jobs/server/core.py +367 -114
- sky/jobs/server/server.py +81 -35
- sky/jobs/server/utils.py +89 -35
- sky/jobs/state.py +1498 -620
- sky/jobs/utils.py +771 -306
- sky/logs/agent.py +40 -5
- sky/logs/aws.py +9 -19
- sky/metrics/utils.py +282 -39
- sky/models.py +2 -0
- sky/optimizer.py +7 -6
- sky/provision/__init__.py +38 -1
- sky/provision/aws/config.py +34 -13
- sky/provision/aws/instance.py +5 -2
- sky/provision/azure/instance.py +5 -3
- sky/provision/common.py +22 -0
- sky/provision/cudo/instance.py +4 -3
- sky/provision/do/instance.py +4 -3
- sky/provision/docker_utils.py +112 -28
- sky/provision/fluidstack/instance.py +6 -5
- sky/provision/gcp/config.py +6 -1
- sky/provision/gcp/instance.py +4 -2
- sky/provision/hyperbolic/instance.py +4 -2
- sky/provision/instance_setup.py +66 -20
- sky/provision/kubernetes/__init__.py +2 -0
- sky/provision/kubernetes/config.py +7 -44
- sky/provision/kubernetes/constants.py +0 -1
- sky/provision/kubernetes/instance.py +609 -213
- sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
- sky/provision/kubernetes/network.py +12 -8
- sky/provision/kubernetes/network_utils.py +8 -25
- sky/provision/kubernetes/utils.py +422 -422
- sky/provision/kubernetes/volume.py +150 -18
- sky/provision/lambda_cloud/instance.py +16 -13
- sky/provision/nebius/instance.py +6 -2
- sky/provision/nebius/utils.py +103 -86
- sky/provision/oci/instance.py +4 -2
- sky/provision/paperspace/instance.py +4 -3
- sky/provision/primeintellect/__init__.py +10 -0
- sky/provision/primeintellect/config.py +11 -0
- sky/provision/primeintellect/instance.py +454 -0
- sky/provision/primeintellect/utils.py +398 -0
- sky/provision/provisioner.py +45 -15
- sky/provision/runpod/__init__.py +2 -0
- sky/provision/runpod/instance.py +4 -3
- sky/provision/runpod/volume.py +69 -13
- sky/provision/scp/instance.py +307 -130
- sky/provision/seeweb/__init__.py +11 -0
- sky/provision/seeweb/config.py +13 -0
- sky/provision/seeweb/instance.py +812 -0
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/slurm/__init__.py +12 -0
- sky/provision/slurm/config.py +13 -0
- sky/provision/slurm/instance.py +572 -0
- sky/provision/slurm/utils.py +583 -0
- sky/provision/vast/instance.py +9 -4
- sky/provision/vast/utils.py +10 -6
- sky/provision/volume.py +164 -0
- sky/provision/vsphere/common/ssl_helper.py +1 -1
- sky/provision/vsphere/common/vapiconnect.py +2 -1
- sky/provision/vsphere/common/vim_utils.py +3 -2
- sky/provision/vsphere/instance.py +8 -6
- sky/provision/vsphere/vsphere_utils.py +8 -1
- sky/resources.py +11 -3
- sky/schemas/api/responses.py +107 -6
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
- sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
- sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
- sky/schemas/db/serve_state/002_yaml_content.py +34 -0
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/002_cluster_pool.py +3 -3
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
- sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
- sky/schemas/generated/jobsv1_pb2.py +86 -0
- sky/schemas/generated/jobsv1_pb2.pyi +254 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
- sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
- sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
- sky/schemas/generated/servev1_pb2.py +58 -0
- sky/schemas/generated/servev1_pb2.pyi +115 -0
- sky/schemas/generated/servev1_pb2_grpc.py +322 -0
- sky/serve/autoscalers.py +2 -0
- sky/serve/client/impl.py +55 -21
- sky/serve/constants.py +4 -3
- sky/serve/controller.py +17 -11
- sky/serve/load_balancing_policies.py +1 -1
- sky/serve/replica_managers.py +219 -142
- sky/serve/serve_rpc_utils.py +179 -0
- sky/serve/serve_state.py +63 -54
- sky/serve/serve_utils.py +145 -109
- sky/serve/server/core.py +46 -25
- sky/serve/server/impl.py +311 -162
- sky/serve/server/server.py +21 -19
- sky/serve/service.py +84 -68
- sky/serve/service_spec.py +45 -7
- sky/server/auth/loopback.py +38 -0
- sky/server/auth/oauth2_proxy.py +12 -7
- sky/server/common.py +47 -24
- sky/server/config.py +62 -28
- sky/server/constants.py +9 -1
- sky/server/daemons.py +109 -38
- sky/server/metrics.py +76 -96
- sky/server/middleware_utils.py +166 -0
- sky/server/plugins.py +222 -0
- sky/server/requests/executor.py +384 -145
- sky/server/requests/payloads.py +83 -19
- sky/server/requests/preconditions.py +15 -13
- sky/server/requests/request_names.py +123 -0
- sky/server/requests/requests.py +511 -157
- sky/server/requests/serializers/decoders.py +48 -17
- sky/server/requests/serializers/encoders.py +102 -20
- sky/server/requests/serializers/return_value_serializers.py +60 -0
- sky/server/requests/threads.py +117 -0
- sky/server/rest.py +116 -24
- sky/server/server.py +497 -179
- sky/server/server_utils.py +30 -0
- sky/server/stream_utils.py +219 -45
- sky/server/uvicorn.py +30 -19
- sky/setup_files/MANIFEST.in +6 -1
- sky/setup_files/alembic.ini +8 -0
- sky/setup_files/dependencies.py +64 -19
- sky/setup_files/setup.py +44 -44
- sky/sky_logging.py +13 -5
- sky/skylet/attempt_skylet.py +116 -24
- sky/skylet/configs.py +3 -1
- sky/skylet/constants.py +139 -29
- sky/skylet/events.py +74 -14
- sky/skylet/executor/__init__.py +1 -0
- sky/skylet/executor/slurm.py +189 -0
- sky/skylet/job_lib.py +143 -105
- sky/skylet/log_lib.py +252 -8
- sky/skylet/log_lib.pyi +47 -7
- sky/skylet/providers/ibm/node_provider.py +12 -8
- sky/skylet/providers/ibm/vpc_provider.py +13 -12
- sky/skylet/runtime_utils.py +21 -0
- sky/skylet/services.py +524 -0
- sky/skylet/skylet.py +27 -2
- sky/skylet/subprocess_daemon.py +104 -28
- sky/skypilot_config.py +99 -79
- sky/ssh_node_pools/constants.py +12 -0
- sky/ssh_node_pools/core.py +40 -3
- sky/ssh_node_pools/deploy/__init__.py +4 -0
- sky/ssh_node_pools/deploy/deploy.py +952 -0
- sky/ssh_node_pools/deploy/tunnel_utils.py +199 -0
- sky/ssh_node_pools/deploy/utils.py +173 -0
- sky/ssh_node_pools/server.py +20 -21
- sky/{utils/kubernetes/ssh_utils.py → ssh_node_pools/utils.py} +9 -6
- sky/task.py +221 -104
- sky/templates/aws-ray.yml.j2 +1 -0
- sky/templates/azure-ray.yml.j2 +1 -0
- sky/templates/cudo-ray.yml.j2 +1 -0
- sky/templates/do-ray.yml.j2 +1 -0
- sky/templates/fluidstack-ray.yml.j2 +1 -0
- sky/templates/gcp-ray.yml.j2 +1 -0
- sky/templates/hyperbolic-ray.yml.j2 +1 -0
- sky/templates/ibm-ray.yml.j2 +2 -1
- sky/templates/jobs-controller.yaml.j2 +3 -0
- sky/templates/kubernetes-ray.yml.j2 +204 -55
- sky/templates/lambda-ray.yml.j2 +1 -0
- sky/templates/nebius-ray.yml.j2 +3 -0
- sky/templates/oci-ray.yml.j2 +1 -0
- sky/templates/paperspace-ray.yml.j2 +1 -0
- sky/templates/primeintellect-ray.yml.j2 +72 -0
- sky/templates/runpod-ray.yml.j2 +1 -0
- sky/templates/scp-ray.yml.j2 +1 -0
- sky/templates/seeweb-ray.yml.j2 +171 -0
- sky/templates/shadeform-ray.yml.j2 +73 -0
- sky/templates/slurm-ray.yml.j2 +85 -0
- sky/templates/vast-ray.yml.j2 +2 -0
- sky/templates/vsphere-ray.yml.j2 +1 -0
- sky/templates/websocket_proxy.py +188 -43
- sky/usage/usage_lib.py +16 -4
- sky/users/model.conf +1 -1
- sky/users/permission.py +84 -44
- sky/users/rbac.py +31 -3
- sky/utils/accelerator_registry.py +6 -3
- sky/utils/admin_policy_utils.py +18 -5
- sky/utils/annotations.py +128 -6
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/atomic.py +1 -1
- sky/utils/auth_utils.py +153 -0
- sky/utils/cli_utils/status_utils.py +12 -7
- sky/utils/cluster_utils.py +28 -6
- sky/utils/command_runner.py +283 -30
- sky/utils/command_runner.pyi +63 -7
- sky/utils/common.py +3 -1
- sky/utils/common_utils.py +55 -7
- sky/utils/config_utils.py +1 -14
- sky/utils/context.py +127 -40
- sky/utils/context_utils.py +73 -18
- sky/utils/controller_utils.py +229 -70
- sky/utils/db/db_utils.py +95 -18
- sky/utils/db/kv_cache.py +149 -0
- sky/utils/db/migration_utils.py +24 -7
- sky/utils/env_options.py +4 -0
- sky/utils/git.py +559 -1
- sky/utils/kubernetes/create_cluster.sh +15 -30
- sky/utils/kubernetes/delete_cluster.sh +10 -7
- sky/utils/kubernetes/generate_kind_config.py +6 -66
- sky/utils/kubernetes/gpu_labeler.py +13 -3
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
- sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
- sky/utils/kubernetes/kubernetes_deploy_utils.py +187 -260
- sky/utils/kubernetes/rsync_helper.sh +11 -3
- sky/utils/kubernetes/ssh-tunnel.sh +7 -376
- sky/utils/kubernetes_enums.py +7 -15
- sky/utils/lock_events.py +4 -4
- sky/utils/locks.py +128 -31
- sky/utils/log_utils.py +0 -319
- sky/utils/resource_checker.py +13 -10
- sky/utils/resources_utils.py +53 -29
- sky/utils/rich_utils.py +8 -4
- sky/utils/schemas.py +138 -52
- sky/utils/subprocess_utils.py +17 -4
- sky/utils/thread_utils.py +91 -0
- sky/utils/timeline.py +2 -1
- sky/utils/ux_utils.py +35 -1
- sky/utils/volume.py +88 -4
- sky/utils/yaml_utils.py +9 -0
- sky/volumes/client/sdk.py +48 -10
- sky/volumes/server/core.py +59 -22
- sky/volumes/server/server.py +46 -17
- sky/volumes/volume.py +54 -42
- sky/workspaces/core.py +57 -21
- sky/workspaces/server.py +13 -12
- sky_templates/README.md +3 -0
- sky_templates/__init__.py +3 -0
- sky_templates/ray/__init__.py +0 -0
- sky_templates/ray/start_cluster +183 -0
- sky_templates/ray/stop_cluster +75 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/METADATA +343 -65
- skypilot_nightly-1.0.0.dev20251210.dist-info/RECORD +629 -0
- skypilot_nightly-1.0.0.dev20251210.dist-info/top_level.txt +2 -0
- sky/client/cli/git.py +0 -549
- sky/dashboard/out/_next/static/chunks/1121-408ed10b2f9fce17.js +0 -1
- sky/dashboard/out/_next/static/chunks/1141-943efc7aff0f0c06.js +0 -1
- sky/dashboard/out/_next/static/chunks/1836-37fede578e2da5f8.js +0 -40
- sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +0 -1
- sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
- sky/dashboard/out/_next/static/chunks/3785.4872a2f3aa489880.js +0 -1
- sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +0 -1
- sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +0 -21
- sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +0 -10
- sky/dashboard/out/_next/static/chunks/4725.10f7a9a5d3ea8208.js +0 -1
- sky/dashboard/out/_next/static/chunks/5339.3fda4a4010ff4e06.js +0 -51
- sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
- sky/dashboard/out/_next/static/chunks/649.b9d7f7d10c1b8c53.js +0 -45
- sky/dashboard/out/_next/static/chunks/6856-dca7962af4814e1b.js +0 -1
- sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
- sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +0 -6
- sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +0 -18
- sky/dashboard/out/_next/static/chunks/7669.1f5d9a402bf5cc42.js +0 -36
- sky/dashboard/out/_next/static/chunks/8969-0be3036bf86f8256.js +0 -1
- sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
- sky/dashboard/out/_next/static/chunks/9037-fa1737818d0a0969.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/_app-ce361c6959bc2001.js +0 -34
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1cbba24bd1bd35f8.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-0b4b35dc1dfe046c.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters-469814d711d63b1b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-dd64309c3fe67ed2.js +0 -11
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-07349868f7905d37.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/jobs-1f70d9faa564804f.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/users-018bf31cda52e11b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/volumes-739726d6b823f532.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-4fe903277b57b523.js +0 -1
- sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
- sky/dashboard/out/_next/static/mS-4qZPSkRuA1u-g2wQhg/_buildManifest.js +0 -1
- sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
- sky/utils/kubernetes/cleanup-tunnel.sh +0 -62
- sky/utils/kubernetes/deploy_remote_cluster.py +0 -1299
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
- skypilot_nightly-1.0.0.dev20250905.dist-info/RECORD +0 -547
- skypilot_nightly-1.0.0.dev20250905.dist-info/top_level.txt +0 -1
- /sky/dashboard/out/_next/static/{mS-4qZPSkRuA1u-g2wQhg → KYAhEFa3FTfq4JyKVgo-s}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/licenses/LICENSE +0 -0
sky/client/cli/command.py
CHANGED
|
@@ -32,6 +32,7 @@ import shlex
|
|
|
32
32
|
import shutil
|
|
33
33
|
import subprocess
|
|
34
34
|
import sys
|
|
35
|
+
import time
|
|
35
36
|
import traceback
|
|
36
37
|
import typing
|
|
37
38
|
from typing import (Any, Callable, Dict, Generator, List, Optional, Set, Tuple,
|
|
@@ -59,8 +60,9 @@ from sky import task as task_lib
|
|
|
59
60
|
from sky.adaptors import common as adaptors_common
|
|
60
61
|
from sky.client import sdk
|
|
61
62
|
from sky.client.cli import flags
|
|
62
|
-
from sky.client.cli import
|
|
63
|
-
from sky.
|
|
63
|
+
from sky.client.cli import table_utils
|
|
64
|
+
from sky.client.cli import utils as cli_utils
|
|
65
|
+
from sky.jobs.state import ManagedJobStatus
|
|
64
66
|
from sky.provision.kubernetes import constants as kubernetes_constants
|
|
65
67
|
from sky.provision.kubernetes import utils as kubernetes_utils
|
|
66
68
|
from sky.schemas.api import responses
|
|
@@ -79,7 +81,6 @@ from sky.utils import controller_utils
|
|
|
79
81
|
from sky.utils import dag_utils
|
|
80
82
|
from sky.utils import directory_utils
|
|
81
83
|
from sky.utils import env_options
|
|
82
|
-
from sky.utils import git as git_utils
|
|
83
84
|
from sky.utils import infra_utils
|
|
84
85
|
from sky.utils import log_utils
|
|
85
86
|
from sky.utils import registry
|
|
@@ -89,9 +90,9 @@ from sky.utils import status_lib
|
|
|
89
90
|
from sky.utils import subprocess_utils
|
|
90
91
|
from sky.utils import timeline
|
|
91
92
|
from sky.utils import ux_utils
|
|
93
|
+
from sky.utils import volume as volume_utils
|
|
92
94
|
from sky.utils import yaml_utils
|
|
93
95
|
from sky.utils.cli_utils import status_utils
|
|
94
|
-
from sky.volumes import utils as volumes_utils
|
|
95
96
|
from sky.volumes.client import sdk as volumes_sdk
|
|
96
97
|
|
|
97
98
|
if typing.TYPE_CHECKING:
|
|
@@ -113,6 +114,24 @@ an autogenerated name."""
|
|
|
113
114
|
# command.
|
|
114
115
|
_NUM_MANAGED_JOBS_TO_SHOW_IN_STATUS = 5
|
|
115
116
|
_NUM_MANAGED_JOBS_TO_SHOW = 50
|
|
117
|
+
_NUM_REQUESTS_TO_SHOW = 50
|
|
118
|
+
_DEFAULT_REQUEST_FIELDS_TO_SHOW = [
|
|
119
|
+
'request_id', 'name', 'user_id', 'status', 'created_at'
|
|
120
|
+
]
|
|
121
|
+
_VERBOSE_REQUEST_FIELDS_TO_SHOW = _DEFAULT_REQUEST_FIELDS_TO_SHOW + [
|
|
122
|
+
'cluster_name'
|
|
123
|
+
]
|
|
124
|
+
_DEFAULT_MANAGED_JOB_FIELDS_TO_GET = [
|
|
125
|
+
'job_id', 'task_id', 'workspace', 'job_name', 'task_name', 'resources',
|
|
126
|
+
'submitted_at', 'end_at', 'job_duration', 'recovery_count', 'status', 'pool'
|
|
127
|
+
]
|
|
128
|
+
_VERBOSE_MANAGED_JOB_FIELDS_TO_GET = _DEFAULT_MANAGED_JOB_FIELDS_TO_GET + [
|
|
129
|
+
'current_cluster_name', 'job_id_on_pool_cluster', 'start_at', 'infra',
|
|
130
|
+
'cloud', 'region', 'zone', 'cluster_resources', 'schedule_state', 'details',
|
|
131
|
+
'failure_reason', 'metadata'
|
|
132
|
+
]
|
|
133
|
+
_USER_NAME_FIELD = ['user_name']
|
|
134
|
+
_USER_HASH_FIELD = ['user_hash']
|
|
116
135
|
|
|
117
136
|
_STATUS_PROPERTY_CLUSTER_NUM_ERROR_MESSAGE = (
|
|
118
137
|
'{cluster_num} cluster{plural} {verb}. Please specify {cause} '
|
|
@@ -129,6 +148,7 @@ def _get_cluster_records_and_set_ssh_config(
|
|
|
129
148
|
clusters: Optional[List[str]],
|
|
130
149
|
refresh: common.StatusRefreshMode = common.StatusRefreshMode.NONE,
|
|
131
150
|
all_users: bool = False,
|
|
151
|
+
verbose: bool = False,
|
|
132
152
|
) -> List[responses.StatusResponse]:
|
|
133
153
|
"""Returns a list of clusters that match the glob pattern.
|
|
134
154
|
|
|
@@ -146,23 +166,30 @@ def _get_cluster_records_and_set_ssh_config(
|
|
|
146
166
|
request_id = sdk.status(clusters,
|
|
147
167
|
refresh=refresh,
|
|
148
168
|
all_users=all_users,
|
|
149
|
-
_include_credentials=True
|
|
169
|
+
_include_credentials=True,
|
|
170
|
+
_summary_response=not verbose)
|
|
150
171
|
cluster_records = sdk.stream_and_get(request_id)
|
|
151
172
|
# Update the SSH config for all clusters
|
|
152
173
|
for record in cluster_records:
|
|
153
174
|
handle = record['handle']
|
|
154
|
-
|
|
175
|
+
name = record['name']
|
|
155
176
|
if not (handle is not None and handle.cached_external_ips is not None
|
|
156
177
|
and 'credentials' in record):
|
|
157
178
|
# If the cluster is not UP or does not have credentials available,
|
|
158
179
|
# we need to remove the cluster from the SSH config.
|
|
159
|
-
cluster_utils.SSHConfigHelper.remove_cluster(
|
|
180
|
+
cluster_utils.SSHConfigHelper.remove_cluster(name)
|
|
181
|
+
continue
|
|
182
|
+
if not record['credentials']:
|
|
183
|
+
# The credential is missing for some reason, continue.
|
|
184
|
+
logger.debug(
|
|
185
|
+
f'Client did not receive SSH credential for cluster {name}')
|
|
160
186
|
continue
|
|
161
187
|
|
|
162
188
|
# During the failover, even though a cluster does not exist, the handle
|
|
163
189
|
# can still exist in the record, and we check for credentials to avoid
|
|
164
190
|
# updating the SSH config for non-existent clusters.
|
|
165
191
|
credentials = record['credentials']
|
|
192
|
+
ips = handle.cached_external_ips
|
|
166
193
|
if isinstance(handle.launched_resources.cloud, clouds.Kubernetes):
|
|
167
194
|
# Replace the proxy command to proxy through the SkyPilot API
|
|
168
195
|
# server with websocket.
|
|
@@ -191,10 +218,44 @@ def _get_cluster_records_and_set_ssh_config(
|
|
|
191
218
|
f'{server_common.get_server_url()} '
|
|
192
219
|
f'{handle.cluster_name}\"')
|
|
193
220
|
credentials['ssh_proxy_command'] = proxy_command
|
|
221
|
+
elif isinstance(handle.launched_resources.cloud, clouds.Slurm):
|
|
222
|
+
# TODO(kevin): This is a temporary workaround, ideally we want to
|
|
223
|
+
# get a shell through srun --pty bash on the existing sbatch job.
|
|
224
|
+
|
|
225
|
+
# Proxy through the controller/login node to reach the worker node.
|
|
226
|
+
if (handle.cached_internal_ips is None or
|
|
227
|
+
not handle.cached_internal_ips):
|
|
228
|
+
logger.debug(
|
|
229
|
+
f'Cluster {name} does not have cached internal IPs. '
|
|
230
|
+
'Skipping SSH config update.')
|
|
231
|
+
cluster_utils.SSHConfigHelper.remove_cluster(name)
|
|
232
|
+
continue
|
|
233
|
+
|
|
234
|
+
escaped_key_path = shlex.quote(
|
|
235
|
+
cluster_utils.SSHConfigHelper.generate_local_key_file(
|
|
236
|
+
handle.cluster_name, credentials))
|
|
237
|
+
controller_host = handle.cached_external_ips[0]
|
|
238
|
+
|
|
239
|
+
# Build jump proxy: ssh to worker via controller/login node
|
|
240
|
+
proxy_command = (f'ssh -tt -i {escaped_key_path} '
|
|
241
|
+
'-o StrictHostKeyChecking=no '
|
|
242
|
+
'-o UserKnownHostsFile=/dev/null '
|
|
243
|
+
'-o IdentitiesOnly=yes '
|
|
244
|
+
'-W %h:%p '
|
|
245
|
+
f'{handle.ssh_user}@{controller_host}')
|
|
246
|
+
original_proxy = credentials.get('ssh_proxy_command')
|
|
247
|
+
if original_proxy:
|
|
248
|
+
proxy_command += (
|
|
249
|
+
f' -o ProxyCommand={shlex.quote(original_proxy)}')
|
|
250
|
+
|
|
251
|
+
credentials['ssh_proxy_command'] = proxy_command
|
|
252
|
+
|
|
253
|
+
# For Slurm, use the worker's internal IP as the SSH target
|
|
254
|
+
ips = handle.cached_internal_ips
|
|
194
255
|
|
|
195
256
|
cluster_utils.SSHConfigHelper.add_cluster(
|
|
196
257
|
handle.cluster_name,
|
|
197
|
-
|
|
258
|
+
ips,
|
|
198
259
|
credentials,
|
|
199
260
|
handle.cached_external_ssh_ports,
|
|
200
261
|
handle.docker_user,
|
|
@@ -783,8 +844,8 @@ def _make_task_or_dag_from_entrypoint_with_overrides(
|
|
|
783
844
|
|
|
784
845
|
# Update the workdir config from the command line parameters.
|
|
785
846
|
# And update the envs and secrets from the workdir.
|
|
786
|
-
|
|
787
|
-
|
|
847
|
+
task.update_workdir(workdir, git_url, git_ref)
|
|
848
|
+
task.update_envs_and_secrets_from_workdir()
|
|
788
849
|
|
|
789
850
|
# job launch specific.
|
|
790
851
|
if job_recovery is not None:
|
|
@@ -799,73 +860,6 @@ def _make_task_or_dag_from_entrypoint_with_overrides(
|
|
|
799
860
|
return task
|
|
800
861
|
|
|
801
862
|
|
|
802
|
-
def _update_task_workdir(task: task_lib.Task, workdir: Optional[str],
|
|
803
|
-
git_url: Optional[str], git_ref: Optional[str]):
|
|
804
|
-
"""Updates the task workdir.
|
|
805
|
-
|
|
806
|
-
Args:
|
|
807
|
-
task: The task to update.
|
|
808
|
-
workdir: The workdir to update.
|
|
809
|
-
git_url: The git url to update.
|
|
810
|
-
git_ref: The git ref to update.
|
|
811
|
-
"""
|
|
812
|
-
if task.workdir is None or isinstance(task.workdir, str):
|
|
813
|
-
if workdir is not None:
|
|
814
|
-
task.workdir = workdir
|
|
815
|
-
return
|
|
816
|
-
if git_url is not None:
|
|
817
|
-
task.workdir = {}
|
|
818
|
-
task.workdir['url'] = git_url
|
|
819
|
-
if git_ref is not None:
|
|
820
|
-
task.workdir['ref'] = git_ref
|
|
821
|
-
return
|
|
822
|
-
return
|
|
823
|
-
if git_url is not None:
|
|
824
|
-
task.workdir['url'] = git_url
|
|
825
|
-
if git_ref is not None:
|
|
826
|
-
task.workdir['ref'] = git_ref
|
|
827
|
-
return
|
|
828
|
-
|
|
829
|
-
|
|
830
|
-
def _update_task_workdir_and_secrets_from_workdir(task: task_lib.Task):
|
|
831
|
-
"""Updates the task secrets from the workdir.
|
|
832
|
-
|
|
833
|
-
Args:
|
|
834
|
-
task: The task to update.
|
|
835
|
-
"""
|
|
836
|
-
if task.workdir is None:
|
|
837
|
-
return
|
|
838
|
-
if not isinstance(task.workdir, dict):
|
|
839
|
-
return
|
|
840
|
-
url = task.workdir['url']
|
|
841
|
-
ref = task.workdir.get('ref', '')
|
|
842
|
-
token = os.environ.get(git_utils.GIT_TOKEN_ENV_VAR)
|
|
843
|
-
ssh_key_path = os.environ.get(git_utils.GIT_SSH_KEY_PATH_ENV_VAR)
|
|
844
|
-
try:
|
|
845
|
-
git_repo = git.GitRepo(url, ref, token, ssh_key_path)
|
|
846
|
-
clone_info = git_repo.get_repo_clone_info()
|
|
847
|
-
if clone_info is None:
|
|
848
|
-
return
|
|
849
|
-
task.envs[git_utils.GIT_URL_ENV_VAR] = clone_info.url
|
|
850
|
-
if ref:
|
|
851
|
-
ref_type = git_repo.get_ref_type()
|
|
852
|
-
if ref_type == git.GitRefType.COMMIT:
|
|
853
|
-
task.envs[git_utils.GIT_COMMIT_HASH_ENV_VAR] = ref
|
|
854
|
-
elif ref_type == git.GitRefType.BRANCH:
|
|
855
|
-
task.envs[git_utils.GIT_BRANCH_ENV_VAR] = ref
|
|
856
|
-
elif ref_type == git.GitRefType.TAG:
|
|
857
|
-
task.envs[git_utils.GIT_TAG_ENV_VAR] = ref
|
|
858
|
-
if clone_info.token is None and clone_info.ssh_key is None:
|
|
859
|
-
return
|
|
860
|
-
if clone_info.token is not None:
|
|
861
|
-
task.secrets[git_utils.GIT_TOKEN_ENV_VAR] = clone_info.token
|
|
862
|
-
if clone_info.ssh_key is not None:
|
|
863
|
-
task.secrets[git_utils.GIT_SSH_KEY_ENV_VAR] = clone_info.ssh_key
|
|
864
|
-
except exceptions.GitError as e:
|
|
865
|
-
with ux_utils.print_exception_no_traceback():
|
|
866
|
-
raise ValueError(f'{str(e)}') from None
|
|
867
|
-
|
|
868
|
-
|
|
869
863
|
class _NaturalOrderGroup(click.Group):
|
|
870
864
|
"""Lists commands in the order defined in this script.
|
|
871
865
|
|
|
@@ -873,7 +867,19 @@ class _NaturalOrderGroup(click.Group):
|
|
|
873
867
|
"""
|
|
874
868
|
|
|
875
869
|
def list_commands(self, ctx): # pylint: disable=unused-argument
|
|
876
|
-
|
|
870
|
+
# Preserve definition order but hide aliases (same command object) and
|
|
871
|
+
# commands explicitly marked as hidden.
|
|
872
|
+
seen_commands = set()
|
|
873
|
+
names = []
|
|
874
|
+
for name, command in self.commands.items():
|
|
875
|
+
if getattr(command, 'hidden', False):
|
|
876
|
+
continue
|
|
877
|
+
command_id = id(command)
|
|
878
|
+
if command_id in seen_commands:
|
|
879
|
+
continue
|
|
880
|
+
seen_commands.add(command_id)
|
|
881
|
+
names.append(name)
|
|
882
|
+
return names
|
|
877
883
|
|
|
878
884
|
@usage_lib.entrypoint('sky.cli', fallback=True)
|
|
879
885
|
def invoke(self, ctx):
|
|
@@ -1160,7 +1166,7 @@ def launch(
|
|
|
1160
1166
|
if task.service is not None:
|
|
1161
1167
|
noun = 'pool' if task.service.pool else 'service'
|
|
1162
1168
|
capnoun = noun.capitalize()
|
|
1163
|
-
sysname = '
|
|
1169
|
+
sysname = 'Pool' if task.service.pool else 'SkyServe'
|
|
1164
1170
|
cmd = 'sky jobs pool apply' if task.service.pool else 'sky serve up'
|
|
1165
1171
|
logger.info(
|
|
1166
1172
|
f'{colorama.Fore.YELLOW}{capnoun} section will be ignored when '
|
|
@@ -1388,14 +1394,24 @@ def exec(
|
|
|
1388
1394
|
|
|
1389
1395
|
|
|
1390
1396
|
def _handle_jobs_queue_request(
|
|
1391
|
-
|
|
1392
|
-
|
|
1393
|
-
|
|
1394
|
-
|
|
1395
|
-
|
|
1397
|
+
request_id: server_common.RequestId[Union[
|
|
1398
|
+
List[responses.ManagedJobRecord],
|
|
1399
|
+
Tuple[List[responses.ManagedJobRecord], int, Dict[str, int], int]]],
|
|
1400
|
+
show_all: bool,
|
|
1401
|
+
show_user: bool,
|
|
1402
|
+
max_num_jobs_to_show: Optional[int],
|
|
1403
|
+
pool_status_request_id: Optional[server_common.RequestId[List[Dict[
|
|
1404
|
+
str, Any]]]] = None,
|
|
1405
|
+
is_called_by_user: bool = False,
|
|
1406
|
+
only_in_progress: bool = False,
|
|
1407
|
+
queue_result_version: cli_utils.QueueResultVersion = cli_utils.
|
|
1408
|
+
QueueResultVersion.V1,
|
|
1409
|
+
) -> Tuple[Optional[int], str]:
|
|
1396
1410
|
"""Get the in-progress managed jobs.
|
|
1397
1411
|
|
|
1398
1412
|
Args:
|
|
1413
|
+
request_id: The request ID for managed jobs.
|
|
1414
|
+
pool_status_request_id: The request ID for pool status, or None.
|
|
1399
1415
|
show_all: Show all information of each job (e.g., region, price).
|
|
1400
1416
|
show_user: Show the user who submitted the job.
|
|
1401
1417
|
max_num_jobs_to_show: If not None, limit the number of jobs to show to
|
|
@@ -1403,6 +1419,8 @@ def _handle_jobs_queue_request(
|
|
|
1403
1419
|
and `sky jobs queue`.
|
|
1404
1420
|
is_called_by_user: If this function is called by user directly, or an
|
|
1405
1421
|
internal call.
|
|
1422
|
+
only_in_progress: If True, only return the number of in-progress jobs.
|
|
1423
|
+
queue_result_version: The version of the queue result.
|
|
1406
1424
|
|
|
1407
1425
|
Returns:
|
|
1408
1426
|
A tuple of (num_in_progress_jobs, msg). If num_in_progress_jobs is None,
|
|
@@ -1413,11 +1431,47 @@ def _handle_jobs_queue_request(
|
|
|
1413
1431
|
# TODO(SKY-980): remove unnecessary fallbacks on the client side.
|
|
1414
1432
|
num_in_progress_jobs = None
|
|
1415
1433
|
msg = ''
|
|
1434
|
+
status_counts: Optional[Dict[str, int]] = None
|
|
1435
|
+
pool_status_result = None
|
|
1416
1436
|
try:
|
|
1417
1437
|
if not is_called_by_user:
|
|
1418
1438
|
usage_lib.messages.usage.set_internal()
|
|
1419
|
-
|
|
1420
|
-
|
|
1439
|
+
# Call both stream_and_get functions in parallel
|
|
1440
|
+
def get_jobs_queue_result():
|
|
1441
|
+
return sdk.stream_and_get(request_id)
|
|
1442
|
+
|
|
1443
|
+
def get_pool_status_result():
|
|
1444
|
+
if pool_status_request_id is not None:
|
|
1445
|
+
try:
|
|
1446
|
+
return sdk.stream_and_get(pool_status_request_id)
|
|
1447
|
+
except Exception: # pylint: disable=broad-except
|
|
1448
|
+
# If getting pool status fails, just continue without it
|
|
1449
|
+
return None
|
|
1450
|
+
return None
|
|
1451
|
+
|
|
1452
|
+
with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
|
|
1453
|
+
jobs_future = executor.submit(get_jobs_queue_result)
|
|
1454
|
+
pool_status_future = executor.submit(get_pool_status_result)
|
|
1455
|
+
|
|
1456
|
+
result = jobs_future.result()
|
|
1457
|
+
pool_status_result = pool_status_future.result()
|
|
1458
|
+
|
|
1459
|
+
if queue_result_version.v2():
|
|
1460
|
+
managed_jobs_, total, status_counts, _ = result
|
|
1461
|
+
if only_in_progress:
|
|
1462
|
+
num_in_progress_jobs = 0
|
|
1463
|
+
if status_counts:
|
|
1464
|
+
for status_value, count in status_counts.items():
|
|
1465
|
+
status_enum = managed_jobs.ManagedJobStatus(
|
|
1466
|
+
status_value)
|
|
1467
|
+
if not status_enum.is_terminal():
|
|
1468
|
+
num_in_progress_jobs += count
|
|
1469
|
+
else:
|
|
1470
|
+
num_in_progress_jobs = total
|
|
1471
|
+
else:
|
|
1472
|
+
managed_jobs_ = result
|
|
1473
|
+
num_in_progress_jobs = len(
|
|
1474
|
+
set(job['job_id'] for job in managed_jobs_))
|
|
1421
1475
|
except exceptions.ClusterNotUpError as e:
|
|
1422
1476
|
controller_status = e.cluster_status
|
|
1423
1477
|
msg = str(e)
|
|
@@ -1461,10 +1515,14 @@ def _handle_jobs_queue_request(
|
|
|
1461
1515
|
msg += ('Failed to query managed jobs: '
|
|
1462
1516
|
f'{common_utils.format_exception(e, use_bracket=True)}')
|
|
1463
1517
|
else:
|
|
1464
|
-
msg =
|
|
1465
|
-
|
|
1466
|
-
|
|
1467
|
-
|
|
1518
|
+
msg = table_utils.format_job_table(
|
|
1519
|
+
managed_jobs_,
|
|
1520
|
+
pool_status=pool_status_result,
|
|
1521
|
+
show_all=show_all,
|
|
1522
|
+
show_user=show_user,
|
|
1523
|
+
max_jobs=max_num_jobs_to_show,
|
|
1524
|
+
status_counts=status_counts,
|
|
1525
|
+
)
|
|
1468
1526
|
return num_in_progress_jobs, msg
|
|
1469
1527
|
|
|
1470
1528
|
|
|
@@ -1562,35 +1620,6 @@ def _handle_services_request(
|
|
|
1562
1620
|
return num_services, msg
|
|
1563
1621
|
|
|
1564
1622
|
|
|
1565
|
-
def _status_kubernetes(show_all: bool):
|
|
1566
|
-
"""Show all SkyPilot resources in the current Kubernetes context.
|
|
1567
|
-
|
|
1568
|
-
Args:
|
|
1569
|
-
show_all (bool): Show all job information (e.g., start time, failures).
|
|
1570
|
-
"""
|
|
1571
|
-
all_clusters, unmanaged_clusters, all_jobs, context = (sdk.stream_and_get(
|
|
1572
|
-
sdk.status_kubernetes()))
|
|
1573
|
-
click.echo(f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
|
|
1574
|
-
f'Kubernetes cluster state (context: {context})'
|
|
1575
|
-
f'{colorama.Style.RESET_ALL}')
|
|
1576
|
-
status_utils.show_kubernetes_cluster_status_table(unmanaged_clusters,
|
|
1577
|
-
show_all)
|
|
1578
|
-
if all_jobs:
|
|
1579
|
-
click.echo(f'\n{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
|
|
1580
|
-
f'Managed jobs'
|
|
1581
|
-
f'{colorama.Style.RESET_ALL}')
|
|
1582
|
-
msg = managed_jobs.format_job_table(all_jobs,
|
|
1583
|
-
show_all=show_all,
|
|
1584
|
-
show_user=False)
|
|
1585
|
-
click.echo(msg)
|
|
1586
|
-
if any(['sky-serve-controller' in c.cluster_name for c in all_clusters]):
|
|
1587
|
-
# TODO: Parse serve controllers and show services separately.
|
|
1588
|
-
# Currently we show a hint that services are shown as clusters.
|
|
1589
|
-
click.echo(f'\n{colorama.Style.DIM}Hint: SkyServe replica pods are '
|
|
1590
|
-
'shown in the "SkyPilot clusters" section.'
|
|
1591
|
-
f'{colorama.Style.RESET_ALL}')
|
|
1592
|
-
|
|
1593
|
-
|
|
1594
1623
|
def _show_endpoint(query_clusters: Optional[List[str]],
|
|
1595
1624
|
cluster_records: List[responses.StatusResponse], ip: bool,
|
|
1596
1625
|
endpoints: bool, endpoint: Optional[int]) -> None:
|
|
@@ -1717,15 +1746,7 @@ def _show_enabled_infra(
|
|
|
1717
1746
|
default=True,
|
|
1718
1747
|
is_flag=True,
|
|
1719
1748
|
required=False,
|
|
1720
|
-
help='Also show
|
|
1721
|
-
@click.option(
|
|
1722
|
-
'--kubernetes',
|
|
1723
|
-
'--k8s',
|
|
1724
|
-
default=False,
|
|
1725
|
-
is_flag=True,
|
|
1726
|
-
required=False,
|
|
1727
|
-
help='[Experimental] Show all SkyPilot resources (including from other '
|
|
1728
|
-
'users) in the current Kubernetes context.')
|
|
1749
|
+
help='Also show pools, if any.')
|
|
1729
1750
|
@click.argument('clusters',
|
|
1730
1751
|
required=False,
|
|
1731
1752
|
type=str,
|
|
@@ -1737,8 +1758,8 @@ def _show_enabled_infra(
|
|
|
1737
1758
|
# pylint: disable=redefined-builtin
|
|
1738
1759
|
def status(verbose: bool, refresh: bool, ip: bool, endpoints: bool,
|
|
1739
1760
|
endpoint: Optional[int], show_managed_jobs: bool,
|
|
1740
|
-
show_services: bool, show_pools: bool,
|
|
1741
|
-
|
|
1761
|
+
show_services: bool, show_pools: bool, clusters: List[str],
|
|
1762
|
+
all_users: bool):
|
|
1742
1763
|
# NOTE(dev): Keep the docstring consistent between the Python API and CLI.
|
|
1743
1764
|
"""Show clusters.
|
|
1744
1765
|
|
|
@@ -1801,9 +1822,6 @@ def status(verbose: bool, refresh: bool, ip: bool, endpoints: bool,
|
|
|
1801
1822
|
or for autostop-enabled clusters, use ``--refresh`` to query the latest
|
|
1802
1823
|
cluster statuses from the cloud providers.
|
|
1803
1824
|
"""
|
|
1804
|
-
if kubernetes:
|
|
1805
|
-
_status_kubernetes(verbose)
|
|
1806
|
-
return
|
|
1807
1825
|
# Do not show job queue if user specifies clusters, and if user
|
|
1808
1826
|
# specifies --ip or --endpoint(s).
|
|
1809
1827
|
show_managed_jobs = show_managed_jobs and not any([clusters, ip, endpoints])
|
|
@@ -1853,9 +1871,16 @@ def status(verbose: bool, refresh: bool, ip: bool, endpoints: bool,
|
|
|
1853
1871
|
|
|
1854
1872
|
# Phase 2: Parallel submission of all API requests
|
|
1855
1873
|
def submit_managed_jobs():
|
|
1856
|
-
|
|
1857
|
-
|
|
1858
|
-
|
|
1874
|
+
fields = _DEFAULT_MANAGED_JOB_FIELDS_TO_GET
|
|
1875
|
+
if all_users:
|
|
1876
|
+
fields = fields + _USER_NAME_FIELD
|
|
1877
|
+
return cli_utils.get_managed_job_queue(
|
|
1878
|
+
refresh=False,
|
|
1879
|
+
skip_finished=True,
|
|
1880
|
+
all_users=all_users,
|
|
1881
|
+
fields=fields,
|
|
1882
|
+
limit=_NUM_MANAGED_JOBS_TO_SHOW_IN_STATUS,
|
|
1883
|
+
)
|
|
1859
1884
|
|
|
1860
1885
|
def submit_services(
|
|
1861
1886
|
) -> Optional[server_common.RequestId[List[Dict[str, Any]]]]:
|
|
@@ -1870,17 +1895,7 @@ def status(verbose: bool, refresh: bool, ip: bool, endpoints: bool,
|
|
|
1870
1895
|
return None
|
|
1871
1896
|
|
|
1872
1897
|
def submit_workspace() -> Optional[server_common.RequestId[Dict[str, Any]]]:
|
|
1873
|
-
|
|
1874
|
-
return sdk.workspaces()
|
|
1875
|
-
except RuntimeError:
|
|
1876
|
-
# Backward compatibility for API server before #5660.
|
|
1877
|
-
# TODO(zhwu): remove this after 0.10.0.
|
|
1878
|
-
logger.warning(f'{colorama.Style.DIM}SkyPilot API server is '
|
|
1879
|
-
'in an old version, and may miss feature: '
|
|
1880
|
-
'workspaces. Update with: sky api stop; '
|
|
1881
|
-
'sky api start'
|
|
1882
|
-
f'{colorama.Style.RESET_ALL}')
|
|
1883
|
-
return None
|
|
1898
|
+
return sdk.workspaces()
|
|
1884
1899
|
|
|
1885
1900
|
active_workspace = skypilot_config.get_active_workspace()
|
|
1886
1901
|
|
|
@@ -1888,6 +1903,7 @@ def status(verbose: bool, refresh: bool, ip: bool, endpoints: bool,
|
|
|
1888
1903
|
return sdk.enabled_clouds(workspace=active_workspace, expand=True)
|
|
1889
1904
|
|
|
1890
1905
|
managed_jobs_queue_request_id = None
|
|
1906
|
+
queue_result_version = cli_utils.QueueResultVersion.V1
|
|
1891
1907
|
service_status_request_id = None
|
|
1892
1908
|
workspace_request_id = None
|
|
1893
1909
|
pool_status_request_id = None
|
|
@@ -1906,7 +1922,8 @@ def status(verbose: bool, refresh: bool, ip: bool, endpoints: bool,
|
|
|
1906
1922
|
|
|
1907
1923
|
# Get the request IDs
|
|
1908
1924
|
if show_managed_jobs:
|
|
1909
|
-
managed_jobs_queue_request_id
|
|
1925
|
+
(managed_jobs_queue_request_id,
|
|
1926
|
+
queue_result_version) = managed_jobs_request_future.result()
|
|
1910
1927
|
if show_services:
|
|
1911
1928
|
service_status_request_id = services_request_future.result()
|
|
1912
1929
|
if show_pools:
|
|
@@ -1927,7 +1944,7 @@ def status(verbose: bool, refresh: bool, ip: bool, endpoints: bool,
|
|
|
1927
1944
|
|
|
1928
1945
|
# Phase 3: Get cluster records and handle special cases
|
|
1929
1946
|
cluster_records = _get_cluster_records_and_set_ssh_config(
|
|
1930
|
-
query_clusters, refresh_mode, all_users)
|
|
1947
|
+
query_clusters, refresh_mode, all_users, verbose)
|
|
1931
1948
|
|
|
1932
1949
|
# TOOD(zhwu): setup the ssh config for status
|
|
1933
1950
|
if ip or show_endpoints:
|
|
@@ -1938,7 +1955,8 @@ def status(verbose: bool, refresh: bool, ip: bool, endpoints: bool,
|
|
|
1938
1955
|
controllers = []
|
|
1939
1956
|
for cluster_record in cluster_records:
|
|
1940
1957
|
cluster_name = cluster_record['name']
|
|
1941
|
-
controller = controller_utils.Controllers.from_name(
|
|
1958
|
+
controller = controller_utils.Controllers.from_name(
|
|
1959
|
+
cluster_name, expect_exact_match=False)
|
|
1942
1960
|
if controller is not None:
|
|
1943
1961
|
controllers.append(cluster_record)
|
|
1944
1962
|
else:
|
|
@@ -1967,10 +1985,14 @@ def status(verbose: bool, refresh: bool, ip: bool, endpoints: bool,
|
|
|
1967
1985
|
try:
|
|
1968
1986
|
num_in_progress_jobs, msg = _handle_jobs_queue_request(
|
|
1969
1987
|
managed_jobs_queue_request_id,
|
|
1988
|
+
pool_status_request_id=pool_status_request_id,
|
|
1970
1989
|
show_all=False,
|
|
1971
1990
|
show_user=all_users,
|
|
1972
1991
|
max_num_jobs_to_show=_NUM_MANAGED_JOBS_TO_SHOW_IN_STATUS,
|
|
1973
|
-
is_called_by_user=False
|
|
1992
|
+
is_called_by_user=False,
|
|
1993
|
+
only_in_progress=True,
|
|
1994
|
+
queue_result_version=queue_result_version,
|
|
1995
|
+
)
|
|
1974
1996
|
except KeyboardInterrupt:
|
|
1975
1997
|
sdk.api_cancel(managed_jobs_queue_request_id, silent=True)
|
|
1976
1998
|
managed_jobs_query_interrupted = True
|
|
@@ -2066,6 +2088,35 @@ def status(verbose: bool, refresh: bool, ip: bool, endpoints: bool,
|
|
|
2066
2088
|
click.echo('\n' + '\n'.join(hints))
|
|
2067
2089
|
|
|
2068
2090
|
|
|
2091
|
+
@cli.command(hidden=True)
|
|
2092
|
+
@flags.config_option(expose_value=False)
|
|
2093
|
+
@flags.verbose_option()
|
|
2094
|
+
def status_kubernetes(verbose: bool):
|
|
2095
|
+
"""[Experimental] Show all SkyPilot resources (including from other '
|
|
2096
|
+
'users) in the current Kubernetes context."""
|
|
2097
|
+
all_clusters, unmanaged_clusters, all_jobs, context = (sdk.stream_and_get(
|
|
2098
|
+
sdk.status_kubernetes()))
|
|
2099
|
+
click.echo(f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
|
|
2100
|
+
f'Kubernetes cluster state (context: {context})'
|
|
2101
|
+
f'{colorama.Style.RESET_ALL}')
|
|
2102
|
+
status_utils.show_kubernetes_cluster_status_table(unmanaged_clusters,
|
|
2103
|
+
show_all=verbose)
|
|
2104
|
+
if all_jobs:
|
|
2105
|
+
click.echo(f'\n{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
|
|
2106
|
+
f'Managed jobs'
|
|
2107
|
+
f'{colorama.Style.RESET_ALL}')
|
|
2108
|
+
msg = table_utils.format_job_table(all_jobs,
|
|
2109
|
+
show_all=verbose,
|
|
2110
|
+
show_user=False)
|
|
2111
|
+
click.echo(msg)
|
|
2112
|
+
if any(['sky-serve-controller' in c.cluster_name for c in all_clusters]):
|
|
2113
|
+
# TODO: Parse serve controllers and show services separately.
|
|
2114
|
+
# Currently we show a hint that services are shown as clusters.
|
|
2115
|
+
click.echo(f'\n{colorama.Style.DIM}Hint: SkyServe replica pods are '
|
|
2116
|
+
'shown in the "SkyPilot clusters" section.'
|
|
2117
|
+
f'{colorama.Style.RESET_ALL}')
|
|
2118
|
+
|
|
2119
|
+
|
|
2069
2120
|
@cli.command()
|
|
2070
2121
|
@flags.config_option(expose_value=False)
|
|
2071
2122
|
@flags.all_option('Show all cluster information.')
|
|
@@ -2104,7 +2155,8 @@ def cost_report(all: bool, days: int): # pylint: disable=redefined-builtin
|
|
|
2104
2155
|
for cluster_record in cluster_records:
|
|
2105
2156
|
cluster_name = cluster_record['name']
|
|
2106
2157
|
try:
|
|
2107
|
-
controller = controller_utils.Controllers.from_name(
|
|
2158
|
+
controller = controller_utils.Controllers.from_name(
|
|
2159
|
+
cluster_name, expect_exact_match=False)
|
|
2108
2160
|
except AssertionError:
|
|
2109
2161
|
# There could be some old controller clusters from previous
|
|
2110
2162
|
# versions that we should not show in the cost report.
|
|
@@ -2192,7 +2244,7 @@ def queue(clusters: List[str], skip_finished: bool, all_users: bool):
|
|
|
2192
2244
|
f'cluster {cluster!r}.{colorama.Style.RESET_ALL}\n'
|
|
2193
2245
|
f' {common_utils.format_exception(e)}')
|
|
2194
2246
|
return
|
|
2195
|
-
job_tables[cluster] =
|
|
2247
|
+
job_tables[cluster] = table_utils.format_job_queue(job_table)
|
|
2196
2248
|
|
|
2197
2249
|
subprocess_utils.run_in_parallel(_get_job_queue, clusters)
|
|
2198
2250
|
user_str = 'all users' if all_users else 'current user'
|
|
@@ -2213,6 +2265,12 @@ def queue(clusters: List[str], skip_finished: bool, all_users: bool):
|
|
|
2213
2265
|
is_flag=True,
|
|
2214
2266
|
default=False,
|
|
2215
2267
|
help='Stream the cluster provisioning logs (provision.log).')
|
|
2268
|
+
@click.option('--worker',
|
|
2269
|
+
'-w',
|
|
2270
|
+
default=None,
|
|
2271
|
+
type=int,
|
|
2272
|
+
help='The worker ID to stream the logs from. '
|
|
2273
|
+
'If not set, stream the logs of the head node.')
|
|
2216
2274
|
@click.option(
|
|
2217
2275
|
'--sync-down',
|
|
2218
2276
|
'-s',
|
|
@@ -2250,6 +2308,7 @@ def logs(
|
|
|
2250
2308
|
cluster: str,
|
|
2251
2309
|
job_ids: Tuple[str, ...],
|
|
2252
2310
|
provision: bool,
|
|
2311
|
+
worker: Optional[int],
|
|
2253
2312
|
sync_down: bool,
|
|
2254
2313
|
status: bool, # pylint: disable=redefined-outer-name
|
|
2255
2314
|
follow: bool,
|
|
@@ -2279,6 +2338,13 @@ def logs(
|
|
|
2279
2338
|
4. If the job fails or fetching the logs fails, the command will exit with
|
|
2280
2339
|
a non-zero return code.
|
|
2281
2340
|
"""
|
|
2341
|
+
if worker is not None:
|
|
2342
|
+
if not provision:
|
|
2343
|
+
raise click.UsageError(
|
|
2344
|
+
'--worker can only be used with --provision.')
|
|
2345
|
+
if worker < 1:
|
|
2346
|
+
raise click.UsageError('--worker must be a positive integer.')
|
|
2347
|
+
|
|
2282
2348
|
if provision and (sync_down or status or job_ids):
|
|
2283
2349
|
raise click.UsageError(
|
|
2284
2350
|
'--provision cannot be combined with job log options '
|
|
@@ -2298,7 +2364,11 @@ def logs(
|
|
|
2298
2364
|
|
|
2299
2365
|
if provision:
|
|
2300
2366
|
# Stream provision logs
|
|
2301
|
-
sys.exit(
|
|
2367
|
+
sys.exit(
|
|
2368
|
+
sdk.tail_provision_logs(cluster_name=cluster,
|
|
2369
|
+
worker=worker,
|
|
2370
|
+
follow=follow,
|
|
2371
|
+
tail=tail))
|
|
2302
2372
|
|
|
2303
2373
|
if sync_down:
|
|
2304
2374
|
with rich_utils.client_status(
|
|
@@ -2476,7 +2546,8 @@ def cancel(
|
|
|
2476
2546
|
job_ids=job_ids_to_cancel)
|
|
2477
2547
|
_async_call_or_wait(request_id, async_call, 'sky.cancel')
|
|
2478
2548
|
except exceptions.NotSupportedError as e:
|
|
2479
|
-
controller = controller_utils.Controllers.from_name(
|
|
2549
|
+
controller = controller_utils.Controllers.from_name(
|
|
2550
|
+
cluster, expect_exact_match=False)
|
|
2480
2551
|
assert controller is not None, cluster
|
|
2481
2552
|
with ux_utils.print_exception_no_traceback():
|
|
2482
2553
|
raise click.UsageError(
|
|
@@ -2777,7 +2848,8 @@ def start(
|
|
|
2777
2848
|
# Get all clusters that are not controllers.
|
|
2778
2849
|
cluster_records = [
|
|
2779
2850
|
cluster for cluster in all_clusters
|
|
2780
|
-
if controller_utils.Controllers.from_name(
|
|
2851
|
+
if controller_utils.Controllers.from_name(
|
|
2852
|
+
cluster['name'], expect_exact_match=False) is None
|
|
2781
2853
|
]
|
|
2782
2854
|
if cluster_records is None:
|
|
2783
2855
|
# Get GLOB cluster names
|
|
@@ -2839,7 +2911,8 @@ def start(
|
|
|
2839
2911
|
# Checks for controller clusters (jobs controller / sky serve controller).
|
|
2840
2912
|
controllers, normal_clusters = [], []
|
|
2841
2913
|
for name in to_start:
|
|
2842
|
-
if controller_utils.Controllers.from_name(
|
|
2914
|
+
if controller_utils.Controllers.from_name(
|
|
2915
|
+
name, expect_exact_match=False) is not None:
|
|
2843
2916
|
controllers.append(name)
|
|
2844
2917
|
else:
|
|
2845
2918
|
normal_clusters.append(name)
|
|
@@ -2975,16 +3048,28 @@ def _hint_or_raise_for_down_jobs_controller(controller_name: str,
|
|
|
2975
3048
|
to be torn down (e.g., because it has jobs running or
|
|
2976
3049
|
it is in init state)
|
|
2977
3050
|
"""
|
|
2978
|
-
controller = controller_utils.Controllers.from_name(
|
|
3051
|
+
controller = controller_utils.Controllers.from_name(
|
|
3052
|
+
controller_name, expect_exact_match=False)
|
|
2979
3053
|
assert controller is not None, controller_name
|
|
2980
3054
|
|
|
3055
|
+
status_counts: Optional[Dict[str, int]] = None
|
|
3056
|
+
managed_jobs_: List[responses.ManagedJobRecord] = []
|
|
2981
3057
|
with rich_utils.client_status(
|
|
2982
3058
|
'[bold cyan]Checking for in-progress managed jobs and pools[/]'):
|
|
2983
3059
|
try:
|
|
2984
|
-
|
|
2985
|
-
|
|
2986
|
-
|
|
2987
|
-
|
|
3060
|
+
fields = _DEFAULT_MANAGED_JOB_FIELDS_TO_GET + _USER_NAME_FIELD
|
|
3061
|
+
request_id, queue_result_version = cli_utils.get_managed_job_queue(
|
|
3062
|
+
refresh=False,
|
|
3063
|
+
skip_finished=True,
|
|
3064
|
+
all_users=True,
|
|
3065
|
+
fields=fields,
|
|
3066
|
+
)
|
|
3067
|
+
result = sdk.stream_and_get(request_id)
|
|
3068
|
+
if queue_result_version.v2():
|
|
3069
|
+
managed_jobs_, _, status_counts, _ = result
|
|
3070
|
+
else:
|
|
3071
|
+
managed_jobs_ = typing.cast(List[responses.ManagedJobRecord],
|
|
3072
|
+
result)
|
|
2988
3073
|
request_id_pools = managed_jobs.pool_status(pool_names=None)
|
|
2989
3074
|
pools_ = sdk.stream_and_get(request_id_pools)
|
|
2990
3075
|
except exceptions.ClusterNotUpError as e:
|
|
@@ -3002,25 +3087,6 @@ def _hint_or_raise_for_down_jobs_controller(controller_name: str,
|
|
|
3002
3087
|
# there is no in-prgress managed jobs.
|
|
3003
3088
|
managed_jobs_ = []
|
|
3004
3089
|
pools_ = []
|
|
3005
|
-
except exceptions.InconsistentConsolidationModeError:
|
|
3006
|
-
# If this error is raised, it means the user switched to the
|
|
3007
|
-
# consolidation mode but the previous controller cluster is still
|
|
3008
|
-
# running. We should allow the user to tear down the controller
|
|
3009
|
-
# cluster in this case.
|
|
3010
|
-
with skypilot_config.override_skypilot_config(
|
|
3011
|
-
{'jobs': {
|
|
3012
|
-
'controller': {
|
|
3013
|
-
'consolidation_mode': False
|
|
3014
|
-
}
|
|
3015
|
-
}}):
|
|
3016
|
-
# Check again with the consolidation mode disabled. This is to
|
|
3017
|
-
# make sure there is no in-progress managed jobs.
|
|
3018
|
-
request_id = managed_jobs.queue(refresh=False,
|
|
3019
|
-
skip_finished=True,
|
|
3020
|
-
all_users=True)
|
|
3021
|
-
managed_jobs_ = sdk.stream_and_get(request_id)
|
|
3022
|
-
request_id_pools = managed_jobs.pool_status(pool_names=None)
|
|
3023
|
-
pools_ = sdk.stream_and_get(request_id_pools)
|
|
3024
3090
|
|
|
3025
3091
|
msg = (f'{colorama.Fore.YELLOW}WARNING: Tearing down the managed '
|
|
3026
3092
|
'jobs controller. Please be aware of the following:'
|
|
@@ -3029,9 +3095,12 @@ def _hint_or_raise_for_down_jobs_controller(controller_name: str,
|
|
|
3029
3095
|
'jobs (output of `sky jobs queue`) will be lost.')
|
|
3030
3096
|
click.echo(msg)
|
|
3031
3097
|
if managed_jobs_:
|
|
3032
|
-
job_table =
|
|
3033
|
-
|
|
3034
|
-
|
|
3098
|
+
job_table = table_utils.format_job_table(
|
|
3099
|
+
managed_jobs_,
|
|
3100
|
+
show_all=False,
|
|
3101
|
+
show_user=True,
|
|
3102
|
+
status_counts=status_counts,
|
|
3103
|
+
)
|
|
3035
3104
|
msg = controller.value.decline_down_for_dirty_controller_hint
|
|
3036
3105
|
# Add prefix to each line to align with the bullet point.
|
|
3037
3106
|
msg += '\n'.join(
|
|
@@ -3074,7 +3143,8 @@ def _hint_or_raise_for_down_sky_serve_controller(controller_name: str,
|
|
|
3074
3143
|
to be torn down (e.g., because it has services running or
|
|
3075
3144
|
it is in init state)
|
|
3076
3145
|
"""
|
|
3077
|
-
controller = controller_utils.Controllers.from_name(
|
|
3146
|
+
controller = controller_utils.Controllers.from_name(
|
|
3147
|
+
controller_name, expect_exact_match=False)
|
|
3078
3148
|
assert controller is not None, controller_name
|
|
3079
3149
|
with rich_utils.client_status('[bold cyan]Checking for live services[/]'):
|
|
3080
3150
|
try:
|
|
@@ -3093,21 +3163,6 @@ def _hint_or_raise_for_down_sky_serve_controller(controller_name: str,
             # controller being STOPPED or being firstly launched, i.e., there is
             # no in-prgress services.
             services = []
-        except exceptions.InconsistentConsolidationModeError:
-            # If this error is raised, it means the user switched to the
-            # consolidation mode but the previous controller cluster is still
-            # running. We should allow the user to tear down the controller
-            # cluster in this case.
-            with skypilot_config.override_skypilot_config(
-                    {'serve': {
-                        'controller': {
-                            'consolidation_mode': False
-                        }
-                    }}):
-                # Check again with the consolidation mode disabled. This is to
-                # make sure there is no in-progress services.
-                request_id = serve_lib.status(service_names=None)
-                services = sdk.stream_and_get(request_id)
 
     if services:
         service_names = [service['name'] for service in services]
@@ -3185,14 +3240,15 @@ def _down_or_stop_clusters(
     names = list(names)
     if names:
         controllers = [
-            name for name in names
-
+            name for name in names if controller_utils.Controllers.from_name(
+                name, expect_exact_match=False) is not None
         ]
         controllers_str = ', '.join(map(repr, controllers))
         names = [
             cluster['name']
             for cluster in _get_cluster_records_and_set_ssh_config(names)
-            if controller_utils.Controllers.from_name(
+            if controller_utils.Controllers.from_name(
+                cluster['name'], expect_exact_match=False) is None
         ]
 
         # Make sure the controllers are explicitly specified without other
@@ -3217,7 +3273,7 @@ def _down_or_stop_clusters(
                 f'{controllers_str} is currently not supported.')
         else:
             controller = controller_utils.Controllers.from_name(
-                controller_name)
+                controller_name, expect_exact_match=False)
             assert controller is not None
             hint_or_raise = _controller_to_hint_or_raise(controller)
             try:
@@ -3265,9 +3321,10 @@ def _down_or_stop_clusters(
         names = [
             record['name']
             for record in all_clusters
-            if controller_utils.Controllers.from_name(
-
-
+            if controller_utils.Controllers.from_name(
+                record['name'], expect_exact_match=False) is None and
+            (down or idle_minutes_to_autostop is not None or
+             record['status'] != status_lib.ClusterStatus.STOPPED)
         ]
 
         clusters = names
@@ -3297,6 +3354,9 @@ def _down_or_stop_clusters(
 
     request_ids = []
 
+    successes: List[str] = []
+    failures: List[Tuple[str, str]] = []
+
     def _down_or_stop(name: str):
         success_progress = False
         if idle_minutes_to_autostop is not None:
@@ -3304,16 +3364,20 @@ def _down_or_stop_clusters(
                 request_id = sdk.autostop(name, idle_minutes_to_autostop,
                                           wait_for, down)
                 request_ids.append(request_id)
+                progress.stop()
                 _async_call_or_wait(
                     request_id, async_call,
                     server_constants.REQUEST_NAME_PREFIX + operation)
-
-
+                progress.start()
+            except (exceptions.NotSupportedError, exceptions.ClusterNotUpError,
+                    exceptions.CloudError) as e:
                 message = str(e)
+                failures.append((name, str(e)))
             else: # no exception raised
                 success_progress = True
                 message = (f'{colorama.Fore.GREEN}{operation} '
                            f'cluster {name!r}...done{colorama.Style.RESET_ALL}')
+                successes.append(name)
                 if idle_minutes_to_autostop >= 0:
                     option_str = 'down' if down else 'stop'
                     passive_str = 'downed' if down else 'stopped'
@@ -3333,9 +3397,11 @@ def _down_or_stop_clusters(
             else:
                 request_id = sdk.stop(name, purge=purge)
                 request_ids.append(request_id)
+                progress.stop()
                 _async_call_or_wait(
                     request_id, async_call,
                     server_constants.REQUEST_NAME_PREFIX + operation)
+                progress.start()
             if not async_call:
                 # Remove the cluster from the SSH config file as soon as it
                 # is stopped or downed.
@@ -3345,13 +3411,17 @@ def _down_or_stop_clusters(
                     f'{colorama.Fore.RED}{operation} cluster {name}...failed. '
                     f'{colorama.Style.RESET_ALL}'
                     f'\nReason: {common_utils.format_exception(e)}.')
+                failures.append((name, str(e)))
             except (exceptions.NotSupportedError,
-                    exceptions.ClusterOwnerIdentityMismatchError
+                    exceptions.ClusterOwnerIdentityMismatchError,
+                    exceptions.CloudError) as e:
                 message = str(e)
+                failures.append((name, str(e)))
             else: # no exception raised
                 message = (
                     f'{colorama.Fore.GREEN}{operation} cluster {name}...done.'
                     f'{colorama.Style.RESET_ALL}')
+                successes.append(name)
                 if not down:
                     message += ('\n To restart the cluster, run: '
                                 f'{colorama.Style.BRIGHT}sky start {name}'
@@ -3365,6 +3435,10 @@ def _down_or_stop_clusters(
     progress.start()
 
     with progress:
+        # we write a new line here to avoid the "Waiting for 'sky.down'
+        # request to be scheduled" message from being printed on the same line
+        # as the "Terminating <num> clusters..." message
+        click.echo('')
         subprocess_utils.run_in_parallel(_down_or_stop, clusters)
         progress.live.transient = False
         # Make sure the progress bar not mess up the terminal.
@@ -3374,6 +3448,31 @@ def _down_or_stop_clusters(
         click.secho(f'{operation} requests are sent. Check the requests\' '
                     'status with `sky request get <request_id>`.')
 
+    show_summary = len(clusters) > 1
+
+    if show_summary:
+        click.echo('\nSummary:')
+        if successes:
+            # Preserve the original order of clusters as provided by user.
+            click.echo(' ✓ Succeeded: ' + ', '.join(successes))
+        if failures:
+            # Format failures: if one failure, keep on same line. If multiple,
+            # indent each failed cluster on its own line for readability.
+            if len(failures) == 1:
+                name, reason = failures[0]
+                first = reason.strip().splitlines()[0]
+                first = first if len(first) <= 120 else first[:120] + '…'
+                click.echo(f' ✗ Failed: {name} ({first})')
+            else:
+                click.echo(' ✗ Failed:')
+                for name, reason in failures:
+                    first = reason.strip().splitlines()[0]
+                    first = first if len(first) <= 120 else first[:120] + '…'
+                    click.echo(f' {name} ({first})')
+
+    if failures:
+        click.echo('Cluster(s) failed. See details above.')
+
 
 @cli.command(cls=_DocumentedCodeCommand)
 @flags.config_option(expose_value=False)
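A note on the summary block in the hunk above (this note and the snippet are editorial illustration, not part of the package diff): the summary only prints when more than one cluster was targeted, and each failure is reduced to the first line of its error message, truncated at 120 characters. A minimal, self-contained sketch of the same formatting, using made-up cluster names and error text:

```python
# Illustrative sketch of the summary formatting above; the names and the
# error string are hypothetical sample data, not SkyPilot output.
successes = ['dev-a', 'dev-b']
failures = [('train-c',
             'NotSupportedError: stopping this cluster is not supported.\nmore details...')]

if len(successes) + len(failures) > 1:  # the diff keys this off len(clusters) > 1
    print('\nSummary:')
    if successes:
        print(' ✓ Succeeded: ' + ', '.join(successes))
    for name, reason in failures:
        first = reason.strip().splitlines()[0]
        first = first if len(first) <= 120 else first[:120] + '…'
        print(f' ✗ Failed: {name} ({first})')
```

For this sample the output is a `Summary:` header, one `✓ Succeeded` line, and one `✗ Failed` line carrying only the first line of the error.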
@@ -3483,6 +3582,10 @@ def show_gpus(
     maximum quantities of the GPU available on a single node and the real-time
     availability of the GPU across all nodes in the Kubernetes cluster.
 
+    If ``--cloud slurm`` is specified, it will show the maximum quantities of
+    the GPU available on a single node and the real-time availability of the
+    GPU across all nodes in the Slurm cluster.
+
     Definitions of certain fields:
 
     * ``DEVICE_MEM``: Memory of a single device; does not depend on the device
@@ -3538,6 +3641,8 @@ def show_gpus(
     cloud_is_kubernetes = isinstance(
         cloud_obj, clouds.Kubernetes) and not isinstance(cloud_obj, clouds.SSH)
     cloud_is_ssh = isinstance(cloud_obj, clouds.SSH)
+    cloud_is_slurm = isinstance(cloud_obj, clouds.Slurm)
+
     # TODO(romilb): We should move this to the backend.
     kubernetes_autoscaling = skypilot_config.get_effective_region_config(
         cloud='kubernetes',
@@ -3546,6 +3651,7 @@ def show_gpus(
         default_value=None) is not None
     kubernetes_is_enabled = clouds.Kubernetes.canonical_name() in enabled_clouds
     ssh_is_enabled = clouds.SSH.canonical_name() in enabled_clouds
+    slurm_is_enabled = clouds.Slurm.canonical_name() in enabled_clouds
     query_k8s_realtime_gpu = (kubernetes_is_enabled and
                               (cloud_name is None or cloud_is_kubernetes))
     query_ssh_realtime_gpu = (ssh_is_enabled and
@@ -3605,8 +3711,9 @@ def show_gpus(
             raise ValueError(full_err_msg)
         no_permissions_str = '<no permissions>'
         realtime_gpu_infos = []
+        # Stores per-GPU totals as [ready_capacity, available, not_ready].
         total_gpu_info: Dict[str, List[int]] = collections.defaultdict(
-            lambda: [0, 0])
+            lambda: [0, 0, 0])
         all_nodes_info = []
 
         # display an aggregated table for all contexts
@@ -3617,6 +3724,33 @@ def show_gpus(
 
         num_filtered_contexts = 0
 
+        def _count_not_ready_gpus(
+            nodes_info: Optional['models.KubernetesNodesInfo']
+        ) -> Dict[str, int]:
+            """Return counts of GPUs on not ready nodes keyed by GPU type."""
+            not_ready_counts: Dict[str, int] = collections.defaultdict(int)
+            if nodes_info is None:
+                return not_ready_counts
+
+            node_info_dict = getattr(nodes_info, 'node_info_dict', {}) or {}
+            for node_info in node_info_dict.values():
+                accelerator_type = getattr(node_info, 'accelerator_type', None)
+                if not accelerator_type:
+                    continue
+
+                total_info = getattr(node_info, 'total', {})
+                accelerator_count = 0
+                if isinstance(total_info, dict):
+                    accelerator_count = int(
+                        total_info.get('accelerator_count', 0))
+                if accelerator_count <= 0:
+                    continue
+
+                node_is_ready = getattr(node_info, 'is_ready', True)
+                if not node_is_ready:
+                    not_ready_counts[accelerator_type] += accelerator_count
+            return not_ready_counts
+
         if realtime_gpu_availability_lists:
             for (ctx, availability_list) in realtime_gpu_availability_lists:
                 if not _filter_ctx(ctx):
@@ -3626,6 +3760,12 @@ def show_gpus(
                 else:
                     display_ctx = ctx
                 num_filtered_contexts += 1
+                # Collect node info for this context before building tables so
+                # we can exclude GPUs on not ready nodes from the totals.
+                nodes_info = sdk.stream_and_get(
+                    sdk.kubernetes_node_info(context=ctx))
+                context_not_ready_counts = _count_not_ready_gpus(nodes_info)
+
                 realtime_gpu_table = log_utils.create_table(
                     ['GPU', qty_header, 'UTILIZATION'])
                 for realtime_gpu_availability in sorted(availability_list):
@@ -3634,24 +3774,116 @@ def show_gpus(
                     available_qty = (gpu_availability.available
                                      if gpu_availability.available != -1 else
                                      no_permissions_str)
+                    # Exclude GPUs on not ready nodes from capacity counts.
+                    not_ready_count = min(
+                        context_not_ready_counts.get(gpu_availability.gpu, 0),
+                        gpu_availability.capacity)
+                    # Ensure capacity is never below the reported available
+                    # quantity (if available is unknown, treat as 0 for totals).
+                    available_for_totals = max(
+                        gpu_availability.available
+                        if gpu_availability.available != -1 else 0, 0)
+                    effective_capacity = max(
+                        gpu_availability.capacity - not_ready_count,
+                        available_for_totals)
+                    utilization = (
+                        f'{available_qty} of {effective_capacity} free')
+                    if not_ready_count > 0:
+                        utilization += f' ({not_ready_count} not ready)'
                     realtime_gpu_table.add_row([
                         gpu_availability.gpu,
                         _list_to_str(gpu_availability.counts),
-
+                        utilization,
                     ])
                     gpu = gpu_availability.gpu
-                    capacity = gpu_availability.capacity
                     # we want total, so skip permission denied.
-
-
-                    total_gpu_info[gpu][
-                    total_gpu_info[gpu][
+                    if effective_capacity > 0 or not_ready_count > 0:
+                        total_gpu_info[gpu][0] += effective_capacity
+                        total_gpu_info[gpu][1] += available_for_totals
+                        total_gpu_info[gpu][2] += not_ready_count
                 realtime_gpu_infos.append((display_ctx, realtime_gpu_table))
-                # Collect node info for this context
-                nodes_info = sdk.stream_and_get(
-                    sdk.kubernetes_node_info(context=ctx))
                 all_nodes_info.append((display_ctx, nodes_info))
         if num_filtered_contexts > 1:
+            total_realtime_gpu_table = log_utils.create_table(
+                ['GPU', 'UTILIZATION'])
+            for gpu, stats in total_gpu_info.items():
+                not_ready = stats[2]
+                utilization = f'{stats[1]} of {stats[0]} free'
+                if not_ready > 0:
+                    utilization += f' ({not_ready} not ready)'
+                total_realtime_gpu_table.add_row([gpu, utilization])
+        else:
+            total_realtime_gpu_table = None
+
+        return realtime_gpu_infos, total_realtime_gpu_table, all_nodes_info
+
+    def _get_slurm_realtime_gpu_tables(
+        name_filter: Optional[str] = None,
+        quantity_filter: Optional[int] = None
+    ) -> Tuple[List[Tuple[str, 'prettytable.PrettyTable']],
+               Optional['prettytable.PrettyTable']]:
+        """Get Slurm GPU availability tables.
+
+        Args:
+            name_filter: Filter GPUs by name.
+            quantity_filter: Filter GPUs by quantity.
+
+        Returns:
+            A tuple of (realtime_gpu_infos, total_realtime_gpu_table).
+        """
+        if quantity_filter:
+            qty_header = 'QTY_FILTER'
+        else:
+            qty_header = 'REQUESTABLE_QTY_PER_NODE'
+
+        realtime_gpu_availability_lists = sdk.stream_and_get(
+            sdk.realtime_slurm_gpu_availability(
+                name_filter=name_filter, quantity_filter=quantity_filter))
+        if not realtime_gpu_availability_lists:
+            err_msg = 'No GPUs found in any Slurm partition. '
+            debug_msg = 'To further debug, run: sky check slurm '
+            if name_filter is not None:
+                gpu_info_msg = f' {name_filter!r}'
+                if quantity_filter is not None:
+                    gpu_info_msg += (' with requested quantity'
+                                     f' {quantity_filter}')
+                err_msg = (f'Resources{gpu_info_msg} not found '
+                           'in any Slurm partition. ')
+                debug_msg = ('To show available accelerators on Slurm,'
+                             ' run: sky show-gpus --cloud slurm ')
+            raise ValueError(err_msg + debug_msg)
+
+        realtime_gpu_infos = []
+        total_gpu_info: Dict[str, List[int]] = collections.defaultdict(
+            lambda: [0, 0])
+
+        for (slurm_cluster,
+             availability_list) in realtime_gpu_availability_lists:
+            realtime_gpu_table = log_utils.create_table(
+                ['GPU', qty_header, 'UTILIZATION'])
+            for realtime_gpu_availability in sorted(availability_list):
+                gpu_availability = models.RealtimeGpuAvailability(
+                    *realtime_gpu_availability)
+                # Use the counts directly from the backend, which are already
+                # generated in powers of 2 (plus any actual maximums)
+                requestable_quantities = gpu_availability.counts
+                realtime_gpu_table.add_row([
+                    gpu_availability.gpu,
+                    _list_to_str(requestable_quantities),
+                    (f'{gpu_availability.available} of '
+                     f'{gpu_availability.capacity} free'),
+                ])
+                gpu = gpu_availability.gpu
+                capacity = gpu_availability.capacity
+                available = gpu_availability.available
+                if capacity > 0:
+                    total_gpu_info[gpu][0] += capacity
+                    total_gpu_info[gpu][1] += available
+            realtime_gpu_infos.append((slurm_cluster, realtime_gpu_table))
+
+        # display an aggregated table for all partitions
+        # if there are more than one partitions with GPUs
+        if len(realtime_gpu_infos) > 1:
             total_realtime_gpu_table = log_utils.create_table(
                 ['GPU', 'UTILIZATION'])
             for gpu, stats in total_gpu_info.items():
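For orientation on the capacity arithmetic introduced in the hunk above (editorial note, not part of the package diff): per GPU type the totals now track effective capacity, availability, and GPUs sitting on NotReady nodes, and the effective capacity is clamped so it never drops below the reported availability. A small worked example with made-up numbers:

```python
# Hypothetical context: 8 GPUs of one type reported in total, 3 currently
# free, and 2 of the 8 live on a node that is NotReady.
capacity, available, not_ready_on_nodes = 8, 3, 2

not_ready_count = min(not_ready_on_nodes, capacity)                          # 2
available_for_totals = max(available if available != -1 else 0, 0)           # 3
effective_capacity = max(capacity - not_ready_count, available_for_totals)   # 6

utilization = f'{available} of {effective_capacity} free'
if not_ready_count > 0:
    utilization += f' ({not_ready_count} not ready)'
print(utilization)  # -> 3 of 6 free (2 not ready)
```

The same three quantities are what the aggregated table sums per GPU type across contexts.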
@@ -3660,7 +3892,7 @@ def show_gpus(
         else:
             total_realtime_gpu_table = None
 
-        return realtime_gpu_infos, total_realtime_gpu_table
+        return realtime_gpu_infos, total_realtime_gpu_table
 
     def _format_kubernetes_node_info_combined(
             contexts_info: List[Tuple[str, 'models.KubernetesNodesInfo']],
@@ -3684,11 +3916,16 @@ def show_gpus(
                 acc_type = node_info.accelerator_type
                 if acc_type is None:
                     acc_type = '-'
-
-
-                    f'{
-
-
+                utilization_str = (
+                    f'{available} of '
+                    f'{node_info.total["accelerator_count"]} free')
+                # Check if node is ready (defaults to True for backward
+                # compatibility with older server versions)
+                node_is_ready = getattr(node_info, 'is_ready', True)
+                if not node_is_ready:
+                    utilization_str += ' (Node NotReady)'
+                node_table.add_row(
+                    [context_name, node_name, acc_type, utilization_str])
 
         k8s_per_node_acc_message = (f'{cloud_str} per-node GPU availability')
         if hints:
@@ -3699,6 +3936,43 @@ def show_gpus(
                 f'{colorama.Style.RESET_ALL}\n'
                 f'{node_table.get_string()}')
 
+    def _format_slurm_node_info() -> str:
+        node_table = log_utils.create_table([
+            'CLUSTER',
+            'NODE',
+            'PARTITION',
+            'STATE',
+            'GPU',
+            'UTILIZATION',
+        ])
+
+        # Get all cluster names
+        slurm_cluster_names = clouds.Slurm.existing_allowed_clusters()
+
+        # Query each cluster
+        for cluster_name in slurm_cluster_names:
+            nodes_info = sdk.stream_and_get(
+                sdk.slurm_node_info(slurm_cluster_name=cluster_name))
+
+            for node_info in nodes_info:
+                node_table.add_row([
+                    cluster_name,
+                    node_info.get('node_name'),
+                    node_info.get('partition', '-'),
+                    node_info.get('node_state'),
+                    node_info.get('gpu_type') or '',
+                    (f'{node_info.get("free_gpus", 0)} of '
+                     f'{node_info.get("total_gpus", 0)} free'),
+                ])
+
+        slurm_per_node_msg = 'Slurm per node accelerator availability'
+        # Optional: Add hint message if needed, similar to k8s
+
+        return (f'{colorama.Fore.LIGHTMAGENTA_EX}{colorama.Style.NORMAL}'
+                f'{slurm_per_node_msg}'
+                f'{colorama.Style.RESET_ALL}\n'
+                f'{node_table.get_string()}')
+
     def _format_kubernetes_realtime_gpu(
             total_table: Optional['prettytable.PrettyTable'],
             k8s_realtime_infos: List[Tuple[str, 'prettytable.PrettyTable']],
@@ -3828,6 +4102,28 @@ def show_gpus(
                 return True, print_section_titles
         return False, print_section_titles
 
+    def _format_slurm_realtime_gpu(
+            total_table, slurm_realtime_infos,
+            show_node_info: bool) -> Generator[str, None, None]:
+        # print total table
+        yield (f'{colorama.Fore.GREEN}{colorama.Style.BRIGHT}'
+               'Slurm GPUs'
+               f'{colorama.Style.RESET_ALL}\n')
+        if total_table is not None:
+            yield from total_table.get_string()
+            yield '\n'
+
+        # print individual infos.
+        for (partition, slurm_realtime_table) in slurm_realtime_infos:
+            partition_str = f'Slurm Cluster: {partition}'
+            yield (f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
+                   f'{partition_str}'
+                   f'{colorama.Style.RESET_ALL}\n')
+            yield from slurm_realtime_table.get_string()
+            yield '\n'
+        if show_node_info:
+            yield _format_slurm_node_info()
+
     def _output() -> Generator[str, None, None]:
         gpu_table = log_utils.create_table(
             ['COMMON_GPU', 'AVAILABLE_QUANTITIES'])
@@ -3845,10 +4141,12 @@ def show_gpus(
         if cloud_name is None:
             clouds_to_list = [
                 c for c in constants.ALL_CLOUDS
-                if c != 'kubernetes' and c != 'ssh'
+                if c != 'kubernetes' and c != 'ssh' and c != 'slurm'
             ]
 
         k8s_messages = ''
+        slurm_messages = ''
+        k8s_printed = False
         if accelerator_str is None:
             # Collect k8s related messages in k8s_messages and print them at end
             print_section_titles = False
@@ -3860,6 +4158,7 @@ def show_gpus(
                     yield '\n\n'
                 stop_iter_one, print_section_titles_one, k8s_messages_one = (
                     yield from _possibly_show_k8s_like_realtime(is_ssh))
+                k8s_printed = True
                 stop_iter = stop_iter or stop_iter_one
                 print_section_titles = (print_section_titles or
                                         print_section_titles_one)
@@ -3867,11 +4166,45 @@ def show_gpus(
                 prev_print_section_titles = print_section_titles_one
             if stop_iter:
                 return
+            # If cloud is slurm, we want to show real-time capacity
+            if slurm_is_enabled and (cloud_name is None or cloud_is_slurm):
+                try:
+                    # If --cloud slurm is not specified, we want to catch
+                    # the case where no GPUs are available on the cluster and
+                    # print the warning at the end.
+                    slurm_realtime_infos, total_table = (
+                        _get_slurm_realtime_gpu_tables())
+                except ValueError as e:
+                    if not cloud_is_slurm:
+                        # Make it a note if cloud is not slurm
+                        slurm_messages += 'Note: '
+                    slurm_messages += str(e)
+                else:
+                    print_section_titles = True
+                    if k8s_printed:
+                        yield '\n'
+
+                    yield from _format_slurm_realtime_gpu(total_table,
+                                                          slurm_realtime_infos,
+                                                          show_node_info=True)
+
+            if cloud_is_slurm:
+                # Do not show clouds if --cloud slurm is specified
+                if not slurm_is_enabled:
+                    yield ('Slurm is not enabled. To fix, run: '
+                           'sky check slurm ')
+                yield slurm_messages
+                return
 
             # For show_all, show the k8s message at the start since output is
             # long and the user may not scroll to the end.
-            if show_all and k8s_messages:
-
+            if show_all and (k8s_messages or slurm_messages):
+                if k8s_messages:
+                    yield k8s_messages
+                if slurm_messages:
+                    if k8s_messages:
+                        yield '\n'
+                    yield slurm_messages
                 yield '\n\n'
 
             list_accelerator_counts_result = sdk.stream_and_get(
@@ -3919,9 +4252,10 @@ def show_gpus(
             else:
                 yield ('\n\nHint: use -a/--all to see all accelerators '
                        '(including non-common ones) and pricing.')
-                if k8s_messages:
+                if k8s_messages or slurm_messages:
                     yield '\n'
                     yield k8s_messages
+                    yield slurm_messages
                 return
         else:
             # Parse accelerator string
@@ -3961,6 +4295,31 @@ def show_gpus(
             if stop_iter:
                 return
 
+            # Handle Slurm filtering by name and quantity
+            if (slurm_is_enabled and (cloud_name is None or cloud_is_slurm) and
+                    not show_all):
+                # Print section title if not showing all and instead a specific
+                # accelerator is requested
+                print_section_titles = True
+                try:
+                    slurm_realtime_infos, total_table = (
+                        _get_slurm_realtime_gpu_tables(name_filter=name,
+                                                       quantity_filter=quantity))
+
+                    yield from _format_slurm_realtime_gpu(total_table,
+                                                          slurm_realtime_infos,
+                                                          show_node_info=False)
+                except ValueError as e:
+                    # In the case of a specific accelerator, show the error message
+                    # immediately (e.g., "Resources A10G not found ...")
+                    yield str(e)
+                yield slurm_messages
+                if cloud_is_slurm:
+                    # Do not show clouds if --cloud slurm is specified
+                    if not slurm_is_enabled:
+                        yield ('Slurm is not enabled. To fix, run: '
+                               'sky check slurm ')
+                    return
+
             # For clouds other than Kubernetes, get the accelerator details
             # Case-sensitive
             list_accelerators_result = sdk.stream_and_get(
@@ -4093,8 +4452,7 @@ def storage_ls(verbose: bool):
     """List storage objects managed by SkyPilot."""
     request_id = sdk.storage_ls()
     storages = sdk.stream_and_get(request_id)
-    storage_table =
-        show_all=verbose)
+    storage_table = table_utils.format_storage_table(storages, show_all=verbose)
     click.echo(storage_table)
 
 
@@ -4174,6 +4532,10 @@ def volumes():
     pass
 
 
+# Add 'volume' as an alias for 'volumes'
+cli.add_command(volumes, name='volume')
+
+
 @volumes.command('apply', cls=_DocumentedCodeCommand)
 @flags.config_option(expose_value=False)
 @click.argument('entrypoint',
@@ -4189,17 +4551,25 @@ def volumes():
 @click.option('--infra',
               required=False,
               type=str,
-              help='
+              help='Infrastructure to use. '
+              'Format: cloud, cloud/region, cloud/region/zone, or '
+              'k8s/context-name.'
+              'Examples: k8s, k8s/my-context, runpod/US/US-CA-2. '
               'Override the infra defined in the YAML.')
-@click.option(
-
-
-
-              help='Volume type. Format: pvc. Override the type defined in the YAML.')
+@click.option('--type',
+              required=False,
+              type=click.Choice(volume_utils.VolumeType.supported_types()),
+              help='Volume type. Override the type defined in the YAML.')
 @click.option('--size',
               required=False,
               type=str,
               help='Volume size. Override the size defined in the YAML.')
+@click.option(
+    '--use-existing/--no-use-existing',
+    required=False,
+    default=None,
+    help='Whether to use an existing volume. Override the use_existing '
+    'defined in the YAML.')
 @click.option('--yes',
               '-y',
               is_flag=True,
@@ -4214,6 +4584,7 @@ def volumes_apply(
         infra: Optional[str],
         type: Optional[str], # pylint: disable=redefined-builtin
         size: Optional[str],
+        use_existing: Optional[bool],
         yes: bool,
         async_call: bool):
     """Apply a volume.
@@ -4226,7 +4597,11 @@ def volumes_apply(
       sky volumes apply volume.yaml
     \b
       # Apply a volume from a command.
-      sky volumes apply --name pvc1 --infra k8s --type pvc --size 100Gi
+      sky volumes apply --name pvc1 --infra k8s --type k8s-pvc --size 100Gi
+    \b
+      # Apply a volume with existing PVC `pvc2` from a command.
+      sky volumes apply --name pvc2 --infra k8s --type k8s-pvc --size 100Gi
+      --use-existing
     """
     # pylint: disable=import-outside-toplevel
     from sky.volumes import volume as volume_lib
@@ -4245,7 +4620,8 @@ def volumes_apply(
                 f'{entrypoint_str!r} needs to be a YAML file')
     if yaml_config is not None:
         volume_config_dict = yaml_config.copy()
-    override_config = _build_volume_override_config(name, infra, type, size
+    override_config = _build_volume_override_config(name, infra, type, size,
+                                                    use_existing)
     volume_config_dict.update(override_config)
 
     # Create Volume instance
@@ -4253,6 +4629,13 @@ def volumes_apply(
 
     logger.debug(f'Volume config: {volume.to_yaml_config()}')
 
+    # TODO(kevin): remove the try block in v0.13.0
+    try:
+        volumes_sdk.validate(volume)
+    except exceptions.APINotSupportedError:
+        # Do best-effort client-side validation.
+        volume.validate(skip_cloud_compatibility=True)
+
     if not yes:
         click.confirm(f'Proceed to create volume {volume.name!r}?',
                       default=True,
@@ -4269,11 +4652,15 @@ def volumes_apply(
                 f'{colorama.Style.RESET_ALL}')
 
 
-def _build_volume_override_config(
-
-
+def _build_volume_override_config(
+    name: Optional[str],
+    infra: Optional[str],
+    volume_type: Optional[str],
+    size: Optional[str],
+    use_existing: Optional[bool],
+) -> Dict[str, Any]:
     """Parse the volume override config."""
-    override_config = {}
+    override_config: Dict[str, Any] = {}
     if name is not None:
         override_config['name'] = name
     if infra is not None:
@@ -4282,6 +4669,8 @@ def _build_volume_override_config(name: Optional[str], infra: Optional[str],
         override_config['type'] = volume_type
     if size is not None:
         override_config['size'] = size
+    if use_existing is not None:
+        override_config['use_existing'] = use_existing
     return override_config
 
 
@@ -4298,8 +4687,8 @@ def volumes_ls(verbose: bool):
     """List volumes managed by SkyPilot."""
     request_id = volumes_sdk.ls()
     all_volumes = sdk.stream_and_get(request_id)
-    volume_table =
-
+    volume_table = table_utils.format_volume_table(all_volumes,
+                                                   show_all=verbose)
     click.echo(volume_table)
@@ -4537,10 +4926,11 @@ def jobs_launch(
                 break
         if print_setup_fm_warning:
             click.secho(
-                f'{colorama.Fore.YELLOW}
-                ' will be ignored when
-                f'please use `sky jobs pool apply {pool} new-pool.yaml`. '
+                f'{colorama.Fore.YELLOW}Setup, file mounts, and storage mounts'
+                ' will be ignored when submitting jobs to pool. To update a '
+                f'pool, please use `sky jobs pool apply {pool} new-pool.yaml`. '
                 f'{colorama.Style.RESET_ALL}')
+            print_setup_fm_warning = False
 
     # Optimize info is only show if _need_confirmation.
     if not yes:
@@ -4556,10 +4946,15 @@ def jobs_launch(
     job_id_handle = _async_call_or_wait(request_id, async_call,
                                         'sky.jobs.launch')
 
-    if
-
-
-
+    if async_call:
+        return
+
+    job_ids = [job_id_handle[0]] if isinstance(job_id_handle[0],
+                                               int) else job_id_handle[0]
+
+    if not detach_run:
+        if len(job_ids) == 1:
+            job_id = job_ids[0]
             returncode = managed_jobs.tail_logs(name=None,
                                                 job_id=job_id,
                                                 follow=True,
@@ -4568,7 +4963,8 @@ def jobs_launch(
         else:
             # TODO(tian): This can be very long. Considering have a "group id"
             # and query all job ids with the same group id.
-
+            # Sort job ids to ensure consistent ordering.
+            job_ids_str = ','.join(map(str, sorted(job_ids)))
             click.secho(
                 f'Jobs submitted with IDs: {colorama.Fore.CYAN}'
                 f'{job_ids_str}{colorama.Style.RESET_ALL}.'
@@ -4587,6 +4983,14 @@ def jobs_launch(
 @jobs.command('queue', cls=_DocumentedCodeCommand)
 @flags.config_option(expose_value=False)
 @flags.verbose_option()
+@click.option(
+    '--limit',
+    '-l',
+    default=_NUM_MANAGED_JOBS_TO_SHOW,
+    type=int,
+    required=False,
+    help=(f'Number of jobs to show, default is {_NUM_MANAGED_JOBS_TO_SHOW},'
+          f' use "-a/--all" to show all jobs.'))
 @click.option(
     '--refresh',
     '-r',
@@ -4606,7 +5010,7 @@ def jobs_launch(
 @usage_lib.entrypoint
 # pylint: disable=redefined-builtin
 def jobs_queue(verbose: bool, refresh: bool, skip_finished: bool,
-               all_users: bool, all: bool):
+               all_users: bool, all: bool, limit: int):
     """Show statuses of managed jobs.
 
     Each managed jobs can have one of the following statuses:
@@ -4657,18 +5061,56 @@ def jobs_queue(verbose: bool, refresh: bool, skip_finished: bool,
 
       watch -n60 sky jobs queue
 
+    (Tip) To show only the latest 10 jobs, use ``-l/--limit 10``:
+
+    .. code-block:: bash
+
+      sky jobs queue -l 10
+
     """
     click.secho('Fetching managed job statuses...', fg='cyan')
     with rich_utils.client_status('[cyan]Checking managed jobs[/]'):
-
-
-
+        max_num_jobs_to_show = (limit if not all else None)
+        fields = _DEFAULT_MANAGED_JOB_FIELDS_TO_GET
+        if verbose:
+            fields = _VERBOSE_MANAGED_JOB_FIELDS_TO_GET
+        if all_users:
+            fields = fields + _USER_NAME_FIELD
+            if verbose:
+                fields = fields + _USER_HASH_FIELD
+        # Call both cli_utils.get_managed_job_queue and managed_jobs.pool_status
+        # in parallel
+        def get_managed_jobs_queue():
+            return cli_utils.get_managed_job_queue(refresh=refresh,
+                                                   skip_finished=skip_finished,
+                                                   all_users=all_users,
+                                                   limit=max_num_jobs_to_show,
+                                                   fields=fields)
+
+        def get_pool_status():
+            try:
+                return managed_jobs.pool_status(pool_names=None)
+            except Exception: # pylint: disable=broad-except
+                # If pool_status fails, we'll just skip the worker information
+                return None
+
+        with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
+            managed_jobs_future = executor.submit(get_managed_jobs_queue)
+            pool_status_future = executor.submit(get_pool_status)
+
+            (managed_jobs_request_id,
+             queue_result_version) = managed_jobs_future.result()
+            pool_status_request_id = pool_status_future.result()
+
         num_jobs, msg = _handle_jobs_queue_request(
             managed_jobs_request_id,
+            pool_status_request_id=pool_status_request_id,
             show_all=verbose,
             show_user=all_users,
             max_num_jobs_to_show=max_num_jobs_to_show,
-            is_called_by_user=True
+            is_called_by_user=True,
+            queue_result_version=queue_result_version,
+        )
     if not skip_finished:
         in_progress_only_hint = ''
     else:
@@ -4681,7 +5123,8 @@ def jobs_queue(verbose: bool, refresh: bool, skip_finished: bool,
             f'{colorama.Fore.CYAN}'
             f'Only showing the latest {max_num_jobs_to_show} '
             f'managed jobs'
-            f'(use --
+            f'(use --limit to show more managed jobs or '
+            f'--all to show all managed jobs) {colorama.Style.RESET_ALL} ')
 
 
 @jobs.command('cancel', cls=_DocumentedCodeCommand)
@@ -4849,7 +5292,7 @@ def pool():
 @pool.command('apply', cls=_DocumentedCodeCommand)
 @flags.config_option(expose_value=False)
 @click.argument('pool_yaml',
-                required=
+                required=False,
                 type=str,
                 nargs=-1,
                 **_get_shell_complete_args(_complete_file_name))
@@ -4864,17 +5307,22 @@ def pool():
               type=click.Choice([m.value for m in serve_lib.UpdateMode],
                                 case_sensitive=False),
               required=False,
-              help=('Update mode. If "rolling",
-              'with rolling update. If "blue_green",
+              help=('Update mode. If "rolling", pool will be updated '
+                    'with rolling update. If "blue_green", pool will '
                     'be updated with blue-green update. This option is only '
                     'valid when the pool is already running.'))
+@click.option('--workers',
+              default=None,
+              type=int,
+              required=False,
+              help='Can be used to update the number of workers in the pool.')
 @_add_click_options(flags.TASK_OPTIONS + flags.EXTRA_RESOURCES_OPTIONS +
                     flags.COMMON_OPTIONS)
 @flags.yes_option()
 @timeline.event
 @usage_lib.entrypoint
 def jobs_pool_apply(
-        pool_yaml: Tuple[str, ...],
+        pool_yaml: Optional[Tuple[str, ...]],
         pool: Optional[str], # pylint: disable=redefined-outer-name
         workdir: Optional[str],
         infra: Optional[str],
@@ -4896,60 +5344,80 @@ def jobs_pool_apply(
         disk_tier: Optional[str],
         network_tier: Optional[str],
         mode: str,
+        workers: Optional[int],
         yes: bool,
         async_call: bool,
 ):
-    """
-
-
-
-
-
+    """Either apply a config to a pool for managed jobs submission
+    or update the number of workers in the pool. One of POOL_YAML or --workers
+    must be provided.
+    Config:
+      If the pool is already running, the config will be applied to the pool.
+      Otherwise, a new pool will be created.
+    Workers:
+      The --workers option can be used to override the number of workers
+      specified in the YAML file, or to update workers without a YAML file.
+      Example:
+        sky jobs pool apply -p my-pool --workers 5
     """
     cloud, region, zone = _handle_infra_cloud_region_zone_options(
         infra, cloud, region, zone)
-    if
-
+    if workers is not None and pool_yaml is not None and len(pool_yaml) > 0:
+        raise click.UsageError(
+            'Cannot specify both --workers and POOL_YAML. Please use one of '
+            'them.')
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    if pool_yaml is None or len(pool_yaml) == 0:
+        if pool is None:
+            raise click.UsageError(
+                'A pool name must be provided to update the number of workers.')
+        task = None
+        click.secho(f'Attempting to update {pool} to have {workers} workers',
+                    fg='cyan')
+    else:
+        if pool is None:
+            pool = serve_lib.generate_service_name(pool=True)
+
+        task = _generate_task_with_service(
+            service_name=pool,
+            service_yaml_args=pool_yaml,
+            workdir=workdir,
+            cloud=cloud,
+            region=region,
+            zone=zone,
+            gpus=gpus,
+            cpus=cpus,
+            memory=memory,
+            instance_type=instance_type,
+            num_nodes=num_nodes,
+            use_spot=use_spot,
+            image_id=image_id,
+            env_file=env_file,
+            env=env,
+            secret=secret,
+            disk_size=disk_size,
+            disk_tier=disk_tier,
+            network_tier=network_tier,
+            ports=ports,
+            not_supported_cmd='sky jobs pool up',
+            pool=True,
+        )
+        assert task.service is not None
+        if not task.service.pool:
+            raise click.UsageError('The YAML file needs a `pool` section.')
+        click.secho('Pool spec:', fg='cyan')
+        click.echo(task.service)
+        serve_lib.validate_service_task(task, pool=True)
 
-
-
-
-
-
+        click.secho(
+            'Each pool worker will use the following resources (estimated):',
+            fg='cyan')
+        with dag_lib.Dag() as dag:
+            dag.add(task)
 
     request_id = managed_jobs.pool_apply(task,
                                          pool,
+                                         workers=workers,
                                          mode=serve_lib.UpdateMode(mode),
                                          _need_confirmation=not yes)
     _async_call_or_wait(request_id, async_call, 'sky.jobs.pool_apply')
@@ -4962,7 +5430,7 @@ def jobs_pool_apply(
 @usage_lib.entrypoint
 # pylint: disable=redefined-builtin
 def jobs_pool_status(verbose: bool, pool_names: List[str]):
-    """Show statuses of
+    """Show statuses of pools.
 
     Show detailed statuses of one or more pools. If POOL_NAME is not
     provided, show all pools' status.
@@ -5018,12 +5486,108 @@ def jobs_pool_down(
         raise click.UsageError('Can only specify one of POOL_NAMES or --all. '
                                f'Provided {argument_str!r}.')
 
-
-
-
-
-
-
+    def _get_nonterminal_jobs(pool_names: List[str],
+                              all: bool) -> List[responses.ManagedJobRecord]:
+        # Get nonterminal jobs for this pool using managed_jobs.queue
+        request_id, queue_result_version = cli_utils.get_managed_job_queue(
+            refresh=False,
+            skip_finished=True,
+            all_users=True,
+            limit=None,
+            fields=['job_id', 'status', 'pool'],
+        )
+        jobs_result = sdk.stream_and_get(request_id)
+
+        # Handle both tuple and list responses
+        jobs_list: List[responses.ManagedJobRecord]
+        if queue_result_version.v2():
+            jobs_list = jobs_result[0]
+        else:
+            jobs_list = typing.cast(List[responses.ManagedJobRecord],
+                                    jobs_result)
+
+        def _should_include_job(job: responses.ManagedJobRecord) -> bool:
+            # Job must not be terminal.
+            if job.get('status', ManagedJobStatus.SUCCEEDED).is_terminal():
+                return False
+            # If len is 0 then we are using -a option, so we include all jobs
+            # if they're associated with a pool.
+            if all:
+                return job.get('pool') is not None
+            # Otherwise we are using specific pool names, so we include the job
+            # if it's associated with one of the specified pools.
+            return job.get('pool') in pool_names
+
+        # Filter jobs by pool name and ensure nonterminal
+        pool_jobs = [job for job in jobs_list if _should_include_job(job)]
+        return pool_jobs
+
+    quoted_pool_names = [f'{name!r}' for name in pool_names]
+    list_pool_str = ', '.join(quoted_pool_names)
+    pool_identity_str = f'pool(s) {list_pool_str}'
+    if all:
+        pool_identity_str = 'all pools'
+
+    already_confirmed = False
+    try:
+        pool_jobs = _get_nonterminal_jobs(pool_names, all)
+        if pool_jobs:
+            num_jobs = len(pool_jobs)
+            job_ids = [job['job_id'] for job in pool_jobs]
+            job_ids_str = ','.join(str(job_id) for job_id in job_ids)
+            click.echo(
+                f'{colorama.Fore.YELLOW}Pool(s) has {num_jobs} '
+                f'nonterminal jobs: {job_ids_str} so it is not yet safe to down'
+                f'.{colorama.Style.RESET_ALL}')
+            if not yes:
+                should_cancel = click.confirm(
+                    'Would you like to cancel all jobs and down the pool(s)?',
+                    default=False,
+                    abort=False,
+                    show_default=True)
+                if not should_cancel:
+                    raise click.Abort()
+                already_confirmed = True
+
+            # Cancel all jobs in the pool
+            with rich_utils.client_status(
+                    ux_utils.spinner_message(
+                        f'Cancelling {num_jobs} jobs in {pool_identity_str}...')
+            ):
+                try:
+                    sdk.get(managed_jobs.cancel(job_ids=job_ids))
+                except Exception as e:
+                    logger.warning(f'Failed to cancel jobs: {e}.')
+                    raise e
+
+                max_wait_time = 300 # 5 minutes max wait
+                check_interval = 2 # Check every 2 seconds
+                start_time = time.time()
+                remaining_pool_jobs = _get_nonterminal_jobs(pool_names, all)
+                while (remaining_pool_jobs and
+                       time.time() - start_time < max_wait_time):
+                    # Check remaining jobs via API
+                    time.sleep(check_interval)
+                    remaining_pool_jobs = _get_nonterminal_jobs(pool_names, all)
+                    ux_utils.spinner_message(
+                        f'Waiting for {len(remaining_pool_jobs)} '
+                        'jobs to be cancelled...')
+
+                click.echo('\r' + ' ' * 80 + '\r', nl=False)
+                if time.time() - start_time >= max_wait_time:
+                    click.echo(
+                        f'{colorama.Fore.YELLOW}Warning: Timeout waiting '
+                        f'for jobs to finish. Proceeding with pool down '
+                        f'anyway.{colorama.Style.RESET_ALL}')
+                else:
+                    click.echo('All jobs cancelled.')
+    except Exception as e: # pylint: disable=broad-except
+        # If API call fails, log warning but continue with pool down
+        logger.warning(
+            f'Failed to check for running jobs in pool(s): {pool_names!r}: {e}.'
+            ' Proceeding with pool down.')
+
+    if not yes and not already_confirmed:
         click.confirm(f'Terminating {pool_identity_str}. Proceed?',
                       default=True,
                       abort=True,
@@ -5205,22 +5769,22 @@ def jobs_pool_logs(
     .. code-block:: bash
 
       # Tail the controller logs of a pool
-      sky pool logs --controller [POOL_NAME]
+      sky jobs pool logs --controller [POOL_NAME]
     \b
       # Print the worker logs so far and exit
-      sky pool logs --no-follow [POOL_NAME]
+      sky jobs pool logs --no-follow [POOL_NAME] 1
     \b
       # Tail the logs of worker 1
-      sky pool logs [POOL_NAME] 1
+      sky jobs pool logs [POOL_NAME] 1
     \b
       # Show the last 100 lines of the controller logs
-      sky pool logs --controller --tail 100 [POOL_NAME]
+      sky jobs pool logs --controller --tail 100 [POOL_NAME]
     \b
       # Sync down all logs of the pool (controller, all workers)
-      sky pool logs [POOL_NAME] --sync-down
+      sky jobs pool logs [POOL_NAME] --sync-down
     \b
       # Sync down controller logs and logs for workers 1 and 3
-      sky pool logs [POOL_NAME] 1 3 --controller --sync-down
+      sky jobs pool logs [POOL_NAME] 1 3 --controller --sync-down
     """
     _handle_serve_logs(pool_name,
                        follow=follow,
@@ -5236,7 +5800,15 @@ def jobs_pool_logs(
 @flags.config_option(expose_value=False)
 @usage_lib.entrypoint
 def dashboard() -> None:
-    """
+    """Opens the SkyPilot dashboard."""
+    sdk.dashboard()
+
+
+@cli.command(cls=_DocumentedCodeCommand, hidden=True)
+@flags.config_option(expose_value=False)
+@usage_lib.entrypoint
+def ui() -> None:
+    """Opens the SkyPilot dashboard."""
     sdk.dashboard()
 
 
@@ -5247,28 +5819,30 @@ def serve():
 
 
 def _generate_task_with_service(
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        service_name: str,
+        service_yaml_args: Tuple[str, ...],
+        workdir: Optional[str],
+        cloud: Optional[str],
+        region: Optional[str],
+        zone: Optional[str],
+        num_nodes: Optional[int],
+        use_spot: Optional[bool],
+        image_id: Optional[str],
+        env_file: Optional[Dict[str, str]],
+        env: List[Tuple[str, str]],
+        secret: Optional[List[Tuple[str, str]]],
+        gpus: Optional[str],
+        instance_type: Optional[str],
+        ports: Optional[Tuple[str]],
+        cpus: Optional[str],
+        memory: Optional[str],
+        disk_size: Optional[int],
+        disk_tier: Optional[str],
+        network_tier: Optional[str],
+        not_supported_cmd: str,
+        pool: bool, # pylint: disable=redefined-outer-name
+        git_url: Optional[str] = None,
+        git_ref: Optional[str] = None,
 ) -> task_lib.Task:
     """Generate a task with service section from a service YAML file."""
     is_yaml, _ = _check_yaml(''.join(service_yaml_args))
@@ -5298,6 +5872,8 @@ def _generate_task_with_service(
         disk_tier=disk_tier,
         network_tier=network_tier,
         ports=ports,
+        git_url=git_url,
+        git_ref=git_ref,
     )
     if isinstance(task, dag_lib.Dag):
         raise click.UsageError(
@@ -5313,7 +5889,7 @@ def _generate_task_with_service(
     if task.service.pool:
         if task.service.ports is not None or ports:
             with ux_utils.print_exception_no_traceback():
-                raise ValueError('Cannot specify ports in a
+                raise ValueError('Cannot specify ports in a pool.')
         return task
 
     # NOTE(yi): we only allow one service port now.
@@ -5389,6 +5965,10 @@ def _generate_task_with_service(
               type=str,
               help='A service name. Unique for each service. If not provided, '
               'a unique name is autogenerated.')
+@click.option('--git-url', type=str, help='Git repository URL.')
+@click.option('--git-ref',
+              type=str,
+              help='Git reference (branch, tag, or commit hash) to use.')
 @_add_click_options(flags.TASK_OPTIONS + flags.EXTRA_RESOURCES_OPTIONS +
                     flags.COMMON_OPTIONS)
 @flags.yes_option()
@@ -5418,6 +5998,8 @@ def serve_up(
         network_tier: Optional[str],
         yes: bool,
         async_call: bool,
+        git_url: Optional[str] = None,
+        git_ref: Optional[str] = None,
 ):
     """Launch a SkyServe service.
 
@@ -5475,6 +6057,8 @@ def serve_up(
         ports=ports,
         not_supported_cmd='sky serve up',
         pool=False,
+        git_url=git_url,
+        git_ref=git_ref,
     )
     assert task.service is not None
     if task.service.pool:
@@ -5556,6 +6140,8 @@ def serve_update(
       sky serve update --mode blue_green sky-service-16aa new_service.yaml
 
     """
+    # TODO(lloyd-brown): Add a way to update number of replicas for serve
+    # the way we did for pools.
     cloud, region, zone = _handle_infra_cloud_region_zone_options(
         infra, cloud, region, zone)
     task = _generate_task_with_service(
@@ -5918,94 +6504,39 @@ def local():
     help='Launch cluster without GPU support even '
     'if GPUs are detected on the host.')
 @click.option(
-    '--
+    '--name',
     type=str,
     required=False,
-    help='
-@click.option('--ssh-user',
-              type=str,
-              required=False,
-              help='SSH username for accessing remote machines.')
-@click.option('--ssh-key-path',
-              type=str,
-              required=False,
-              help='Path to the SSH private key.')
-@click.option('--cleanup',
-              is_flag=True,
-              help='Clean up the remote cluster instead of deploying it.')
+    help='Name of the cluster. Defaults to "skypilot". Used without ip list.')
 @click.option(
-    '--
-    type=
+    '--port-start',
+    type=int,
     required=False,
-    help='
-
-
-    required=False,
-    help='Password for the ssh-user to execute sudo commands. '
-    'Required only if passwordless sudo is not setup.')
+    help='Starting port range for the local kind cluster. Needs to be a '
+    'multiple of 100. If not given, a random range will be used. '
+    'Used without ip list.')
 @local.command('up', cls=_DocumentedCodeCommand)
 @flags.config_option(expose_value=False)
 @_add_click_options(flags.COMMON_OPTIONS)
 @usage_lib.entrypoint
-def local_up(gpus: bool,
-
-
-
-
-    def _validate_args(ips, ssh_user, ssh_key_path, cleanup):
-        # If any of --ips, --ssh-user, or --ssh-key-path is specified,
-        # all must be specified
-        if bool(ips) or bool(ssh_user) or bool(ssh_key_path):
-            if not (ips and ssh_user and ssh_key_path):
-                raise click.BadParameter(
-                    'All --ips, --ssh-user, and --ssh-key-path '
-                    'must be specified together.')
-
-        # --cleanup can only be used if --ips, --ssh-user and --ssh-key-path
-        # are all provided
-        if cleanup and not (ips and ssh_user and ssh_key_path):
-            raise click.BadParameter('--cleanup can only be used with '
-                                     '--ips, --ssh-user and --ssh-key-path.')
-
-    _validate_args(ips, ssh_user, ssh_key_path, cleanup)
-
-    # If remote deployment arguments are specified, run remote up script
-    ip_list = None
-    ssh_key = None
-    if ips and ssh_user and ssh_key_path:
-        # Read and validate IP file
-        try:
-            with open(os.path.expanduser(ips), 'r', encoding='utf-8') as f:
-                ip_list = f.read().strip().splitlines()
-                if not ip_list:
-                    raise click.BadParameter(f'IP file is empty: {ips}')
-        except (IOError, OSError) as e:
-            raise click.BadParameter(f'Failed to read IP file {ips}: {str(e)}')
-
-        # Read and validate SSH key file
-        try:
-            with open(os.path.expanduser(ssh_key_path), 'r',
-                      encoding='utf-8') as f:
-                ssh_key = f.read()
-                if not ssh_key:
-                    raise click.BadParameter(
-                        f'SSH key file is empty: {ssh_key_path}')
-        except (IOError, OSError) as e:
-            raise click.BadParameter(
-                f'Failed to read SSH key file {ssh_key_path}: {str(e)}')
-
-    request_id = sdk.local_up(gpus, ip_list, ssh_user, ssh_key, cleanup,
-                              context_name, password)
+def local_up(gpus: bool, name: Optional[str], port_start: Optional[int],
+             async_call: bool):
+    """Creates a local cluster."""
+    request_id = sdk.local_up(gpus, name, port_start)
     _async_call_or_wait(request_id, async_call, request_name='local up')


+@click.option('--name',
+              type=str,
+              required=False,
+              help='Name of the cluster to down. Defaults to "skypilot".')
 @local.command('down', cls=_DocumentedCodeCommand)
 @flags.config_option(expose_value=False)
 @_add_click_options(flags.COMMON_OPTIONS)
 @usage_lib.entrypoint
-def local_down(async_call: bool):
+def local_down(name: Optional[str], async_call: bool):
     """Deletes a local cluster."""
-    request_id = sdk.local_down()
+    request_id = sdk.local_down(name)
     _async_call_or_wait(request_id, async_call, request_name='sky.local.down')


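The `sky local` hunk above replaces the remote IP/SSH deployment path with a named local (kind) cluster: `local_up` now forwards `gpus`, `name`, and `port_start` directly to the SDK, and `local_down` takes the cluster name. A hedged sketch of driving the same flow from Python, assuming the client SDK is importable as `sky.client.sdk` and that `sdk.get()` blocks on the returned request id the way the CLI's `_async_call_or_wait` does:

```python
from sky.client import sdk

# Bring up a local cluster named 'dev' without GPU support. Per the help
# text above, the starting port must be a multiple of 100; 46100 is an
# arbitrary example value.
request_id = sdk.local_up(False, 'dev', 46100)
sdk.get(request_id)

# Tear the same cluster down by name.
sdk.get(sdk.local_down('dev'))
```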
@@ -6119,20 +6650,22 @@ def api_logs(request_id: Optional[str], server_logs: bool,
                 **_get_shell_complete_args(_complete_api_request))
 @flags.all_option('Cancel all your requests.')
 @flags.all_users_option('Cancel all requests from all users.')
+@flags.yes_option()
 @usage_lib.entrypoint
 # pylint: disable=redefined-builtin
-def api_cancel(request_ids: Optional[List[str]], all: bool, all_users: bool):
+def api_cancel(request_ids: Optional[List[str]], all: bool, all_users: bool,
+               yes: bool):
     """Cancel a request running on SkyPilot API server."""
     if all or all_users:
-
-
-
-
-
-
-
-
-
+        if not yes:
+            keyword = 'ALL USERS\'' if all_users else 'YOUR'
+            user_input = click.prompt(
+                f'This will cancel all {keyword} requests.\n'
+                f'To proceed, please type {colorama.Style.BRIGHT}'
+                f'\'cancel all requests\'{colorama.Style.RESET_ALL}',
+                type=str)
+            if user_input != 'cancel all requests':
+                raise click.Abort()
         request_ids = None
     cancelled_request_ids = sdk.get(
         sdk.api_cancel(request_ids=request_ids, all_users=all_users))
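With the hunk above, `sky api cancel --all` (or `--all-users`) now refuses to proceed unless the user either passes `--yes` or types the literal phrase `cancel all requests` at the prompt. A standalone reproduction of that guard is below; the colorama styling from the hunk is dropped so the snippet needs only click.

```python
import click


def confirm_cancel_all(all_users: bool, yes: bool) -> None:
    """Abort unless the caller explicitly types 'cancel all requests'."""
    if yes:
        return
    keyword = 'ALL USERS\'' if all_users else 'YOUR'
    user_input = click.prompt(
        f'This will cancel all {keyword} requests.\n'
        'To proceed, please type \'cancel all requests\'',
        type=str)
    if user_input != 'cancel all requests':
        raise click.Abort()
```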
@@ -6146,9 +6679,28 @@ def api_cancel(request_ids: Optional[List[str]], all: bool, all_users: bool):
                 fg='green')


+class IntOrNone(click.ParamType):
+    """Int or None"""
+    name = 'int-or-none'
+
+    def convert(self, value, param, ctx):
+        if isinstance(value, int):
+            return value
+        if isinstance(value, str) and value.lower() in ('none', 'all'):
+            return None
+        try:
+            return int(value)
+        except ValueError:
+            self.fail(f'{value!r} is not a valid integer or "none" or "all"',
+                      param, ctx)
+
+
+INT_OR_NONE = IntOrNone()
+
+
 @api.command('status', cls=_DocumentedCodeCommand)
 @flags.config_option(expose_value=False)
-@click.argument('
+@click.argument('request_id_prefixes',
                 required=False,
                 type=str,
                 nargs=-1,
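The new `IntOrNone` parameter type lets an option accept either an integer or the case-insensitive strings `none`/`all`, which it maps to `None` (no limit). A quick check of that conversion behavior, using a local copy of the class so the snippet runs without SkyPilot installed:

```python
import click


class IntOrNone(click.ParamType):
    """Accept an int, or 'none'/'all' (case-insensitive) meaning None."""
    name = 'int-or-none'

    def convert(self, value, param, ctx):
        if isinstance(value, int):
            return value
        if isinstance(value, str) and value.lower() in ('none', 'all'):
            return None
        try:
            return int(value)
        except ValueError:
            # Raises click.UsageError for inputs like 'abc'.
            self.fail(f'{value!r} is not a valid integer or "none" or "all"',
                      param, ctx)


INT_OR_NONE = IntOrNone()
assert INT_OR_NONE.convert('25', None, None) == 25
assert INT_OR_NONE.convert('all', None, None) is None
assert INT_OR_NONE.convert('NONE', None, None) is None
```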
@@ -6158,16 +6710,30 @@ def api_cancel(request_ids: Optional[List[str]], all: bool, all_users: bool):
     is_flag=True,
     default=False,
     required=False,
-    help='Show requests of all statuses
+    help=('Show requests of all statuses, including finished ones '
+          '(SUCCEEDED, FAILED, CANCELLED). By default, only active '
+          'requests (PENDING, RUNNING) are shown.'))
+@click.option(
+    '--limit',
+    '-l',
+    default=_NUM_REQUESTS_TO_SHOW,
+    type=INT_OR_NONE,
+    required=False,
+    help=(f'Number of requests to show, default is {_NUM_REQUESTS_TO_SHOW},'
+          f' set to "none" or "all" to show all requests.'))
 @flags.verbose_option('Show more details.')
 @usage_lib.entrypoint
 # pylint: disable=redefined-builtin
-def api_status(request_ids: Optional[List[str]], all_status: bool,
-               verbose: bool):
+def api_status(request_id_prefixes: Optional[List[str]], all_status: bool,
+               verbose: bool, limit: Optional[int]):
     """List requests on SkyPilot API server."""
-    if not
-
-
+    if not request_id_prefixes:
+        request_id_prefixes = None
+    fields = _DEFAULT_REQUEST_FIELDS_TO_SHOW
+    if verbose:
+        fields = _VERBOSE_REQUEST_FIELDS_TO_SHOW
+    request_list = sdk.api_status(request_id_prefixes, all_status, limit,
+                                  fields)
     columns = ['ID', 'User', 'Name']
     if verbose:
         columns.append('Cluster')
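`sky api status` now takes optional request-id prefixes as positional arguments plus a `--limit`/`-l` option typed as `INT_OR_NONE` (default `_NUM_REQUESTS_TO_SHOW`), so `-l 100` shows at most 100 requests while `-l all` or `-l none` shows everything. A hedged sketch of the equivalent SDK call, assuming the import path `sky.client.sdk`, the positional signature used in the hunk (`request_id_prefixes, all_status, limit, fields`), and that passing `None` for `fields` falls back to default columns:

```python
from sky.client import sdk

# List up to 10 requests of any status (finished ones included). None for
# request_id_prefixes means "no prefix filter"; None for fields is an
# assumption standing in for the CLI's default field list.
for request in sdk.api_status(None, True, 10, None):
    print(request)
```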
@@ -6193,8 +6759,12 @@ def api_status(request_ids: Optional[List[str]], all_status: bool,
         if verbose:
             dummy_row.append('-')
         table.add_row(dummy_row)
-    click.echo()
     click.echo(table)
+    if limit and len(request_list) >= limit:
+        click.echo()
+        click.echo(
+            f'Showing {limit} requests. Use "-l none" or "-l all" to show'
+            f' all requests.')


 @api.command('login', cls=_DocumentedCodeCommand)