skypilot-nightly 1.0.0.dev20250905__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +10 -2
- sky/adaptors/aws.py +81 -16
- sky/adaptors/common.py +25 -2
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/do.py +8 -2
- sky/adaptors/gcp.py +11 -0
- sky/adaptors/ibm.py +5 -2
- sky/adaptors/kubernetes.py +64 -0
- sky/adaptors/nebius.py +3 -1
- sky/adaptors/primeintellect.py +1 -0
- sky/adaptors/seeweb.py +183 -0
- sky/adaptors/shadeform.py +89 -0
- sky/admin_policy.py +20 -0
- sky/authentication.py +157 -263
- sky/backends/__init__.py +3 -2
- sky/backends/backend.py +11 -3
- sky/backends/backend_utils.py +588 -184
- sky/backends/cloud_vm_ray_backend.py +1088 -904
- sky/backends/local_docker_backend.py +9 -5
- sky/backends/task_codegen.py +633 -0
- sky/backends/wheel_utils.py +18 -0
- sky/catalog/__init__.py +8 -0
- sky/catalog/aws_catalog.py +4 -0
- sky/catalog/common.py +19 -1
- sky/catalog/data_fetchers/fetch_aws.py +102 -80
- sky/catalog/data_fetchers/fetch_gcp.py +30 -3
- sky/catalog/data_fetchers/fetch_nebius.py +9 -6
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/catalog/kubernetes_catalog.py +24 -28
- sky/catalog/primeintellect_catalog.py +95 -0
- sky/catalog/runpod_catalog.py +5 -1
- sky/catalog/seeweb_catalog.py +184 -0
- sky/catalog/shadeform_catalog.py +165 -0
- sky/check.py +73 -43
- sky/client/cli/command.py +675 -412
- sky/client/cli/flags.py +4 -2
- sky/{volumes/utils.py → client/cli/table_utils.py} +111 -13
- sky/client/cli/utils.py +79 -0
- sky/client/common.py +12 -2
- sky/client/sdk.py +132 -63
- sky/client/sdk_async.py +34 -33
- sky/cloud_stores.py +82 -3
- sky/clouds/__init__.py +6 -0
- sky/clouds/aws.py +337 -129
- sky/clouds/azure.py +24 -18
- sky/clouds/cloud.py +40 -13
- sky/clouds/cudo.py +16 -13
- sky/clouds/do.py +9 -7
- sky/clouds/fluidstack.py +12 -5
- sky/clouds/gcp.py +14 -7
- sky/clouds/hyperbolic.py +12 -5
- sky/clouds/ibm.py +12 -5
- sky/clouds/kubernetes.py +80 -45
- sky/clouds/lambda_cloud.py +12 -5
- sky/clouds/nebius.py +23 -9
- sky/clouds/oci.py +19 -12
- sky/clouds/paperspace.py +4 -1
- sky/clouds/primeintellect.py +317 -0
- sky/clouds/runpod.py +85 -24
- sky/clouds/scp.py +12 -8
- sky/clouds/seeweb.py +477 -0
- sky/clouds/shadeform.py +400 -0
- sky/clouds/ssh.py +4 -2
- sky/clouds/utils/scp_utils.py +61 -50
- sky/clouds/vast.py +33 -27
- sky/clouds/vsphere.py +14 -16
- sky/core.py +174 -165
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
- sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
- sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
- sky/dashboard/out/_next/static/chunks/{6601-06114c982db410b6.js → 3800-7b45f9fbb6308557.js} +1 -1
- sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
- sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
- sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
- sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
- sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
- sky/dashboard/out/_next/static/chunks/pages/{_app-ce361c6959bc2001.js → _app-bde01e4a2beec258.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-6563820e094f68ca.js → [context]-c0b5935149902e6f.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{infra-aabba60d57826e0f.js → infra-aed0ea19df7cf961.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-af76bb06dbb3954f.js → [name]-84a40f8c7c627fe4.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{workspaces-7598c33a746cdc91.js → workspaces-531b2f8c4bf89f82.js} +1 -1
- sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
- sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/data_utils.py +92 -1
- sky/data/mounting_utils.py +162 -29
- sky/data/storage.py +200 -19
- sky/data/storage_utils.py +10 -45
- sky/exceptions.py +18 -7
- sky/execution.py +74 -31
- sky/global_user_state.py +605 -191
- sky/jobs/__init__.py +2 -0
- sky/jobs/client/sdk.py +101 -4
- sky/jobs/client/sdk_async.py +31 -5
- sky/jobs/constants.py +15 -8
- sky/jobs/controller.py +726 -284
- sky/jobs/file_content_utils.py +128 -0
- sky/jobs/log_gc.py +193 -0
- sky/jobs/recovery_strategy.py +250 -100
- sky/jobs/scheduler.py +271 -173
- sky/jobs/server/core.py +367 -114
- sky/jobs/server/server.py +81 -35
- sky/jobs/server/utils.py +89 -35
- sky/jobs/state.py +1498 -620
- sky/jobs/utils.py +771 -306
- sky/logs/agent.py +40 -5
- sky/logs/aws.py +9 -19
- sky/metrics/utils.py +282 -39
- sky/optimizer.py +1 -1
- sky/provision/__init__.py +37 -1
- sky/provision/aws/config.py +34 -13
- sky/provision/aws/instance.py +5 -2
- sky/provision/azure/instance.py +5 -3
- sky/provision/common.py +2 -0
- sky/provision/cudo/instance.py +4 -3
- sky/provision/do/instance.py +4 -3
- sky/provision/docker_utils.py +97 -26
- sky/provision/fluidstack/instance.py +6 -5
- sky/provision/gcp/config.py +6 -1
- sky/provision/gcp/instance.py +4 -2
- sky/provision/hyperbolic/instance.py +4 -2
- sky/provision/instance_setup.py +66 -20
- sky/provision/kubernetes/__init__.py +2 -0
- sky/provision/kubernetes/config.py +7 -44
- sky/provision/kubernetes/constants.py +0 -1
- sky/provision/kubernetes/instance.py +609 -213
- sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
- sky/provision/kubernetes/network.py +12 -8
- sky/provision/kubernetes/network_utils.py +8 -25
- sky/provision/kubernetes/utils.py +382 -418
- sky/provision/kubernetes/volume.py +150 -18
- sky/provision/lambda_cloud/instance.py +16 -13
- sky/provision/nebius/instance.py +6 -2
- sky/provision/nebius/utils.py +103 -86
- sky/provision/oci/instance.py +4 -2
- sky/provision/paperspace/instance.py +4 -3
- sky/provision/primeintellect/__init__.py +10 -0
- sky/provision/primeintellect/config.py +11 -0
- sky/provision/primeintellect/instance.py +454 -0
- sky/provision/primeintellect/utils.py +398 -0
- sky/provision/provisioner.py +30 -9
- sky/provision/runpod/__init__.py +2 -0
- sky/provision/runpod/instance.py +4 -3
- sky/provision/runpod/volume.py +69 -13
- sky/provision/scp/instance.py +307 -130
- sky/provision/seeweb/__init__.py +11 -0
- sky/provision/seeweb/config.py +13 -0
- sky/provision/seeweb/instance.py +812 -0
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/vast/instance.py +5 -3
- sky/provision/volume.py +164 -0
- sky/provision/vsphere/common/ssl_helper.py +1 -1
- sky/provision/vsphere/common/vapiconnect.py +2 -1
- sky/provision/vsphere/common/vim_utils.py +3 -2
- sky/provision/vsphere/instance.py +8 -6
- sky/provision/vsphere/vsphere_utils.py +8 -1
- sky/resources.py +11 -3
- sky/schemas/api/responses.py +107 -6
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
- sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
- sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
- sky/schemas/db/serve_state/002_yaml_content.py +34 -0
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/002_cluster_pool.py +3 -3
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
- sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
- sky/schemas/generated/jobsv1_pb2.py +86 -0
- sky/schemas/generated/jobsv1_pb2.pyi +254 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
- sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
- sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
- sky/schemas/generated/servev1_pb2.py +58 -0
- sky/schemas/generated/servev1_pb2.pyi +115 -0
- sky/schemas/generated/servev1_pb2_grpc.py +322 -0
- sky/serve/autoscalers.py +2 -0
- sky/serve/client/impl.py +55 -21
- sky/serve/constants.py +4 -3
- sky/serve/controller.py +17 -11
- sky/serve/load_balancing_policies.py +1 -1
- sky/serve/replica_managers.py +219 -142
- sky/serve/serve_rpc_utils.py +179 -0
- sky/serve/serve_state.py +63 -54
- sky/serve/serve_utils.py +145 -109
- sky/serve/server/core.py +46 -25
- sky/serve/server/impl.py +311 -162
- sky/serve/server/server.py +21 -19
- sky/serve/service.py +84 -68
- sky/serve/service_spec.py +45 -7
- sky/server/auth/loopback.py +38 -0
- sky/server/auth/oauth2_proxy.py +12 -7
- sky/server/common.py +47 -24
- sky/server/config.py +62 -28
- sky/server/constants.py +9 -1
- sky/server/daemons.py +109 -38
- sky/server/metrics.py +76 -96
- sky/server/middleware_utils.py +166 -0
- sky/server/requests/executor.py +381 -145
- sky/server/requests/payloads.py +71 -18
- sky/server/requests/preconditions.py +15 -13
- sky/server/requests/request_names.py +121 -0
- sky/server/requests/requests.py +507 -157
- sky/server/requests/serializers/decoders.py +48 -17
- sky/server/requests/serializers/encoders.py +85 -20
- sky/server/requests/threads.py +117 -0
- sky/server/rest.py +116 -24
- sky/server/server.py +420 -172
- sky/server/stream_utils.py +219 -45
- sky/server/uvicorn.py +30 -19
- sky/setup_files/MANIFEST.in +6 -1
- sky/setup_files/alembic.ini +8 -0
- sky/setup_files/dependencies.py +62 -19
- sky/setup_files/setup.py +44 -44
- sky/sky_logging.py +13 -5
- sky/skylet/attempt_skylet.py +106 -24
- sky/skylet/configs.py +3 -1
- sky/skylet/constants.py +111 -26
- sky/skylet/events.py +64 -10
- sky/skylet/job_lib.py +141 -104
- sky/skylet/log_lib.py +233 -5
- sky/skylet/log_lib.pyi +40 -2
- sky/skylet/providers/ibm/node_provider.py +12 -8
- sky/skylet/providers/ibm/vpc_provider.py +13 -12
- sky/skylet/runtime_utils.py +21 -0
- sky/skylet/services.py +524 -0
- sky/skylet/skylet.py +22 -1
- sky/skylet/subprocess_daemon.py +104 -29
- sky/skypilot_config.py +99 -79
- sky/ssh_node_pools/server.py +9 -8
- sky/task.py +221 -104
- sky/templates/aws-ray.yml.j2 +1 -0
- sky/templates/azure-ray.yml.j2 +1 -0
- sky/templates/cudo-ray.yml.j2 +1 -0
- sky/templates/do-ray.yml.j2 +1 -0
- sky/templates/fluidstack-ray.yml.j2 +1 -0
- sky/templates/gcp-ray.yml.j2 +1 -0
- sky/templates/hyperbolic-ray.yml.j2 +1 -0
- sky/templates/ibm-ray.yml.j2 +2 -1
- sky/templates/jobs-controller.yaml.j2 +3 -0
- sky/templates/kubernetes-ray.yml.j2 +196 -55
- sky/templates/lambda-ray.yml.j2 +1 -0
- sky/templates/nebius-ray.yml.j2 +3 -0
- sky/templates/oci-ray.yml.j2 +1 -0
- sky/templates/paperspace-ray.yml.j2 +1 -0
- sky/templates/primeintellect-ray.yml.j2 +72 -0
- sky/templates/runpod-ray.yml.j2 +1 -0
- sky/templates/scp-ray.yml.j2 +1 -0
- sky/templates/seeweb-ray.yml.j2 +171 -0
- sky/templates/shadeform-ray.yml.j2 +73 -0
- sky/templates/vast-ray.yml.j2 +1 -0
- sky/templates/vsphere-ray.yml.j2 +1 -0
- sky/templates/websocket_proxy.py +188 -43
- sky/usage/usage_lib.py +16 -4
- sky/users/permission.py +60 -43
- sky/utils/accelerator_registry.py +6 -3
- sky/utils/admin_policy_utils.py +18 -5
- sky/utils/annotations.py +22 -0
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/atomic.py +1 -1
- sky/utils/auth_utils.py +153 -0
- sky/utils/cli_utils/status_utils.py +12 -7
- sky/utils/cluster_utils.py +28 -6
- sky/utils/command_runner.py +88 -27
- sky/utils/command_runner.pyi +36 -3
- sky/utils/common.py +3 -1
- sky/utils/common_utils.py +37 -4
- sky/utils/config_utils.py +1 -14
- sky/utils/context.py +127 -40
- sky/utils/context_utils.py +73 -18
- sky/utils/controller_utils.py +229 -70
- sky/utils/db/db_utils.py +95 -18
- sky/utils/db/kv_cache.py +149 -0
- sky/utils/db/migration_utils.py +24 -7
- sky/utils/env_options.py +4 -0
- sky/utils/git.py +559 -1
- sky/utils/kubernetes/create_cluster.sh +15 -30
- sky/utils/kubernetes/delete_cluster.sh +10 -7
- sky/utils/kubernetes/{deploy_remote_cluster.py → deploy_ssh_node_pools.py} +258 -380
- sky/utils/kubernetes/generate_kind_config.py +6 -66
- sky/utils/kubernetes/gpu_labeler.py +13 -3
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
- sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
- sky/utils/kubernetes/kubernetes_deploy_utils.py +213 -194
- sky/utils/kubernetes/rsync_helper.sh +11 -3
- sky/utils/kubernetes_enums.py +7 -15
- sky/utils/lock_events.py +4 -4
- sky/utils/locks.py +128 -31
- sky/utils/log_utils.py +0 -319
- sky/utils/resource_checker.py +13 -10
- sky/utils/resources_utils.py +53 -29
- sky/utils/rich_utils.py +8 -4
- sky/utils/schemas.py +107 -52
- sky/utils/subprocess_utils.py +17 -4
- sky/utils/thread_utils.py +91 -0
- sky/utils/timeline.py +2 -1
- sky/utils/ux_utils.py +35 -1
- sky/utils/volume.py +88 -4
- sky/utils/yaml_utils.py +9 -0
- sky/volumes/client/sdk.py +48 -10
- sky/volumes/server/core.py +59 -22
- sky/volumes/server/server.py +46 -17
- sky/volumes/volume.py +54 -42
- sky/workspaces/core.py +57 -21
- sky/workspaces/server.py +13 -12
- sky_templates/README.md +3 -0
- sky_templates/__init__.py +3 -0
- sky_templates/ray/__init__.py +0 -0
- sky_templates/ray/start_cluster +183 -0
- sky_templates/ray/stop_cluster +75 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/METADATA +331 -65
- skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
- skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
- sky/client/cli/git.py +0 -549
- sky/dashboard/out/_next/static/chunks/1121-408ed10b2f9fce17.js +0 -1
- sky/dashboard/out/_next/static/chunks/1141-943efc7aff0f0c06.js +0 -1
- sky/dashboard/out/_next/static/chunks/1836-37fede578e2da5f8.js +0 -40
- sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +0 -1
- sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
- sky/dashboard/out/_next/static/chunks/3785.4872a2f3aa489880.js +0 -1
- sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +0 -21
- sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +0 -10
- sky/dashboard/out/_next/static/chunks/4725.10f7a9a5d3ea8208.js +0 -1
- sky/dashboard/out/_next/static/chunks/5339.3fda4a4010ff4e06.js +0 -51
- sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
- sky/dashboard/out/_next/static/chunks/649.b9d7f7d10c1b8c53.js +0 -45
- sky/dashboard/out/_next/static/chunks/6856-dca7962af4814e1b.js +0 -1
- sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
- sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +0 -6
- sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +0 -18
- sky/dashboard/out/_next/static/chunks/7669.1f5d9a402bf5cc42.js +0 -36
- sky/dashboard/out/_next/static/chunks/8969-0be3036bf86f8256.js +0 -1
- sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
- sky/dashboard/out/_next/static/chunks/9037-fa1737818d0a0969.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1cbba24bd1bd35f8.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-0b4b35dc1dfe046c.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters-469814d711d63b1b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-dd64309c3fe67ed2.js +0 -11
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-07349868f7905d37.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/jobs-1f70d9faa564804f.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/users-018bf31cda52e11b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/volumes-739726d6b823f532.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-4fe903277b57b523.js +0 -1
- sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
- sky/dashboard/out/_next/static/mS-4qZPSkRuA1u-g2wQhg/_buildManifest.js +0 -1
- sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
- skypilot_nightly-1.0.0.dev20250905.dist-info/RECORD +0 -547
- skypilot_nightly-1.0.0.dev20250905.dist-info/top_level.txt +0 -1
- /sky/dashboard/out/_next/static/{mS-4qZPSkRuA1u-g2wQhg → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
sky/client/cli/command.py
CHANGED
@@ -32,6 +32,7 @@ import shlex
 import shutil
 import subprocess
 import sys
+import time
 import traceback
 import typing
 from typing import (Any, Callable, Dict, Generator, List, Optional, Set, Tuple,
@@ -59,8 +60,9 @@ from sky import task as task_lib
 from sky.adaptors import common as adaptors_common
 from sky.client import sdk
 from sky.client.cli import flags
-from sky.client.cli import
-from sky.
+from sky.client.cli import table_utils
+from sky.client.cli import utils as cli_utils
+from sky.jobs.state import ManagedJobStatus
 from sky.provision.kubernetes import constants as kubernetes_constants
 from sky.provision.kubernetes import utils as kubernetes_utils
 from sky.schemas.api import responses
@@ -79,7 +81,6 @@ from sky.utils import controller_utils
 from sky.utils import dag_utils
 from sky.utils import directory_utils
 from sky.utils import env_options
-from sky.utils import git as git_utils
 from sky.utils import infra_utils
 from sky.utils import log_utils
 from sky.utils import registry
@@ -89,9 +90,9 @@ from sky.utils import status_lib
 from sky.utils import subprocess_utils
 from sky.utils import timeline
 from sky.utils import ux_utils
+from sky.utils import volume as volume_utils
 from sky.utils import yaml_utils
 from sky.utils.cli_utils import status_utils
-from sky.volumes import utils as volumes_utils
 from sky.volumes.client import sdk as volumes_sdk

 if typing.TYPE_CHECKING:
@@ -113,6 +114,24 @@ an autogenerated name."""
 # command.
 _NUM_MANAGED_JOBS_TO_SHOW_IN_STATUS = 5
 _NUM_MANAGED_JOBS_TO_SHOW = 50
+_NUM_REQUESTS_TO_SHOW = 50
+_DEFAULT_REQUEST_FIELDS_TO_SHOW = [
+    'request_id', 'name', 'user_id', 'status', 'created_at'
+]
+_VERBOSE_REQUEST_FIELDS_TO_SHOW = _DEFAULT_REQUEST_FIELDS_TO_SHOW + [
+    'cluster_name'
+]
+_DEFAULT_MANAGED_JOB_FIELDS_TO_GET = [
+    'job_id', 'task_id', 'workspace', 'job_name', 'task_name', 'resources',
+    'submitted_at', 'end_at', 'job_duration', 'recovery_count', 'status', 'pool'
+]
+_VERBOSE_MANAGED_JOB_FIELDS_TO_GET = _DEFAULT_MANAGED_JOB_FIELDS_TO_GET + [
+    'current_cluster_name', 'job_id_on_pool_cluster', 'start_at', 'infra',
+    'cloud', 'region', 'zone', 'cluster_resources', 'schedule_state', 'details',
+    'failure_reason', 'metadata'
+]
+_USER_NAME_FIELD = ['user_name']
+_USER_HASH_FIELD = ['user_hash']

 _STATUS_PROPERTY_CLUSTER_NUM_ERROR_MESSAGE = (
     '{cluster_num} cluster{plural} {verb}. Please specify {cause} '
@@ -129,6 +148,7 @@ def _get_cluster_records_and_set_ssh_config(
     clusters: Optional[List[str]],
     refresh: common.StatusRefreshMode = common.StatusRefreshMode.NONE,
     all_users: bool = False,
+    verbose: bool = False,
 ) -> List[responses.StatusResponse]:
     """Returns a list of clusters that match the glob pattern.

@@ -146,17 +166,23 @@ def _get_cluster_records_and_set_ssh_config(
     request_id = sdk.status(clusters,
                             refresh=refresh,
                             all_users=all_users,
-                            _include_credentials=True
+                            _include_credentials=True,
+                            _summary_response=not verbose)
     cluster_records = sdk.stream_and_get(request_id)
     # Update the SSH config for all clusters
     for record in cluster_records:
         handle = record['handle']
-
+        name = record['name']
         if not (handle is not None and handle.cached_external_ips is not None
                 and 'credentials' in record):
             # If the cluster is not UP or does not have credentials available,
             # we need to remove the cluster from the SSH config.
-            cluster_utils.SSHConfigHelper.remove_cluster(
+            cluster_utils.SSHConfigHelper.remove_cluster(name)
+            continue
+        if not record['credentials']:
+            # The credential is missing for some reason, continue.
+            logger.debug(
+                f'Client did not receive SSH credential for cluster {name}')
             continue

         # During the failover, even though a cluster does not exist, the handle
@@ -783,8 +809,8 @@ def _make_task_or_dag_from_entrypoint_with_overrides(

     # Update the workdir config from the command line parameters.
     # And update the envs and secrets from the workdir.
-
-
+    task.update_workdir(workdir, git_url, git_ref)
+    task.update_envs_and_secrets_from_workdir()

     # job launch specific.
     if job_recovery is not None:
@@ -799,73 +825,6 @@ def _make_task_or_dag_from_entrypoint_with_overrides(
     return task


-def _update_task_workdir(task: task_lib.Task, workdir: Optional[str],
-                         git_url: Optional[str], git_ref: Optional[str]):
-    """Updates the task workdir.
-
-    Args:
-        task: The task to update.
-        workdir: The workdir to update.
-        git_url: The git url to update.
-        git_ref: The git ref to update.
-    """
-    if task.workdir is None or isinstance(task.workdir, str):
-        if workdir is not None:
-            task.workdir = workdir
-            return
-        if git_url is not None:
-            task.workdir = {}
-            task.workdir['url'] = git_url
-            if git_ref is not None:
-                task.workdir['ref'] = git_ref
-            return
-        return
-    if git_url is not None:
-        task.workdir['url'] = git_url
-    if git_ref is not None:
-        task.workdir['ref'] = git_ref
-    return
-
-
-def _update_task_workdir_and_secrets_from_workdir(task: task_lib.Task):
-    """Updates the task secrets from the workdir.
-
-    Args:
-        task: The task to update.
-    """
-    if task.workdir is None:
-        return
-    if not isinstance(task.workdir, dict):
-        return
-    url = task.workdir['url']
-    ref = task.workdir.get('ref', '')
-    token = os.environ.get(git_utils.GIT_TOKEN_ENV_VAR)
-    ssh_key_path = os.environ.get(git_utils.GIT_SSH_KEY_PATH_ENV_VAR)
-    try:
-        git_repo = git.GitRepo(url, ref, token, ssh_key_path)
-        clone_info = git_repo.get_repo_clone_info()
-        if clone_info is None:
-            return
-        task.envs[git_utils.GIT_URL_ENV_VAR] = clone_info.url
-        if ref:
-            ref_type = git_repo.get_ref_type()
-            if ref_type == git.GitRefType.COMMIT:
-                task.envs[git_utils.GIT_COMMIT_HASH_ENV_VAR] = ref
-            elif ref_type == git.GitRefType.BRANCH:
-                task.envs[git_utils.GIT_BRANCH_ENV_VAR] = ref
-            elif ref_type == git.GitRefType.TAG:
-                task.envs[git_utils.GIT_TAG_ENV_VAR] = ref
-        if clone_info.token is None and clone_info.ssh_key is None:
-            return
-        if clone_info.token is not None:
-            task.secrets[git_utils.GIT_TOKEN_ENV_VAR] = clone_info.token
-        if clone_info.ssh_key is not None:
-            task.secrets[git_utils.GIT_SSH_KEY_ENV_VAR] = clone_info.ssh_key
-    except exceptions.GitError as e:
-        with ux_utils.print_exception_no_traceback():
-            raise ValueError(f'{str(e)}') from None
-
-
 class _NaturalOrderGroup(click.Group):
     """Lists commands in the order defined in this script.

@@ -1160,7 +1119,7 @@ def launch(
     if task.service is not None:
         noun = 'pool' if task.service.pool else 'service'
         capnoun = noun.capitalize()
-        sysname = '
+        sysname = 'Pool' if task.service.pool else 'SkyServe'
         cmd = 'sky jobs pool apply' if task.service.pool else 'sky serve up'
         logger.info(
             f'{colorama.Fore.YELLOW}{capnoun} section will be ignored when '
@@ -1388,14 +1347,24 @@ def exec(


 def _handle_jobs_queue_request(
-
-
-
-
-
+    request_id: server_common.RequestId[Union[
+        List[responses.ManagedJobRecord],
+        Tuple[List[responses.ManagedJobRecord], int, Dict[str, int], int]]],
+    show_all: bool,
+    show_user: bool,
+    max_num_jobs_to_show: Optional[int],
+    pool_status_request_id: Optional[server_common.RequestId[List[Dict[
+        str, Any]]]] = None,
+    is_called_by_user: bool = False,
+    only_in_progress: bool = False,
+    queue_result_version: cli_utils.QueueResultVersion = cli_utils.
+    QueueResultVersion.V1,
+) -> Tuple[Optional[int], str]:
     """Get the in-progress managed jobs.

     Args:
+        request_id: The request ID for managed jobs.
+        pool_status_request_id: The request ID for pool status, or None.
         show_all: Show all information of each job (e.g., region, price).
         show_user: Show the user who submitted the job.
         max_num_jobs_to_show: If not None, limit the number of jobs to show to
@@ -1403,6 +1372,8 @@ def _handle_jobs_queue_request(
             and `sky jobs queue`.
         is_called_by_user: If this function is called by user directly, or an
             internal call.
+        only_in_progress: If True, only return the number of in-progress jobs.
+        queue_result_version: The version of the queue result.

     Returns:
         A tuple of (num_in_progress_jobs, msg). If num_in_progress_jobs is None,
@@ -1413,11 +1384,47 @@ def _handle_jobs_queue_request(
     # TODO(SKY-980): remove unnecessary fallbacks on the client side.
     num_in_progress_jobs = None
     msg = ''
+    status_counts: Optional[Dict[str, int]] = None
+    pool_status_result = None
     try:
         if not is_called_by_user:
             usage_lib.messages.usage.set_internal()
-
-
+        # Call both stream_and_get functions in parallel
+        def get_jobs_queue_result():
+            return sdk.stream_and_get(request_id)
+
+        def get_pool_status_result():
+            if pool_status_request_id is not None:
+                try:
+                    return sdk.stream_and_get(pool_status_request_id)
+                except Exception:  # pylint: disable=broad-except
+                    # If getting pool status fails, just continue without it
+                    return None
+            return None
+
+        with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
+            jobs_future = executor.submit(get_jobs_queue_result)
+            pool_status_future = executor.submit(get_pool_status_result)
+
+            result = jobs_future.result()
+            pool_status_result = pool_status_future.result()
+
+        if queue_result_version.v2():
+            managed_jobs_, total, status_counts, _ = result
+            if only_in_progress:
+                num_in_progress_jobs = 0
+                if status_counts:
+                    for status_value, count in status_counts.items():
+                        status_enum = managed_jobs.ManagedJobStatus(
+                            status_value)
+                        if not status_enum.is_terminal():
+                            num_in_progress_jobs += count
+            else:
+                num_in_progress_jobs = total
+        else:
+            managed_jobs_ = result
+            num_in_progress_jobs = len(
+                set(job['job_id'] for job in managed_jobs_))
     except exceptions.ClusterNotUpError as e:
         controller_status = e.cluster_status
         msg = str(e)
@@ -1461,10 +1468,14 @@ def _handle_jobs_queue_request(
         msg += ('Failed to query managed jobs: '
                 f'{common_utils.format_exception(e, use_bracket=True)}')
     else:
-        msg =
-
-
-
+        msg = table_utils.format_job_table(
+            managed_jobs_,
+            pool_status=pool_status_result,
+            show_all=show_all,
+            show_user=show_user,
+            max_jobs=max_num_jobs_to_show,
+            status_counts=status_counts,
+        )
     return num_in_progress_jobs, msg


@@ -1562,35 +1573,6 @@ def _handle_services_request(
     return num_services, msg


-def _status_kubernetes(show_all: bool):
-    """Show all SkyPilot resources in the current Kubernetes context.
-
-    Args:
-        show_all (bool): Show all job information (e.g., start time, failures).
-    """
-    all_clusters, unmanaged_clusters, all_jobs, context = (sdk.stream_and_get(
-        sdk.status_kubernetes()))
-    click.echo(f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
-               f'Kubernetes cluster state (context: {context})'
-               f'{colorama.Style.RESET_ALL}')
-    status_utils.show_kubernetes_cluster_status_table(unmanaged_clusters,
-                                                      show_all)
-    if all_jobs:
-        click.echo(f'\n{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
-                   f'Managed jobs'
-                   f'{colorama.Style.RESET_ALL}')
-        msg = managed_jobs.format_job_table(all_jobs,
-                                            show_all=show_all,
-                                            show_user=False)
-        click.echo(msg)
-    if any(['sky-serve-controller' in c.cluster_name for c in all_clusters]):
-        # TODO: Parse serve controllers and show services separately.
-        # Currently we show a hint that services are shown as clusters.
-        click.echo(f'\n{colorama.Style.DIM}Hint: SkyServe replica pods are '
-                   'shown in the "SkyPilot clusters" section.'
-                   f'{colorama.Style.RESET_ALL}')
-
-
 def _show_endpoint(query_clusters: Optional[List[str]],
                    cluster_records: List[responses.StatusResponse], ip: bool,
                    endpoints: bool, endpoint: Optional[int]) -> None:
@@ -1717,15 +1699,7 @@ def _show_enabled_infra(
     default=True,
     is_flag=True,
     required=False,
-    help='Also show
-@click.option(
-    '--kubernetes',
-    '--k8s',
-    default=False,
-    is_flag=True,
-    required=False,
-    help='[Experimental] Show all SkyPilot resources (including from other '
-    'users) in the current Kubernetes context.')
+    help='Also show pools, if any.')
 @click.argument('clusters',
                 required=False,
                 type=str,
@@ -1737,8 +1711,8 @@ def _show_enabled_infra(
 # pylint: disable=redefined-builtin
 def status(verbose: bool, refresh: bool, ip: bool, endpoints: bool,
            endpoint: Optional[int], show_managed_jobs: bool,
-           show_services: bool, show_pools: bool,
-
+           show_services: bool, show_pools: bool, clusters: List[str],
+           all_users: bool):
     # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
     """Show clusters.

@@ -1801,9 +1775,6 @@ def status(verbose: bool, refresh: bool, ip: bool, endpoints: bool,
     or for autostop-enabled clusters, use ``--refresh`` to query the latest
     cluster statuses from the cloud providers.
     """
-    if kubernetes:
-        _status_kubernetes(verbose)
-        return
     # Do not show job queue if user specifies clusters, and if user
     # specifies --ip or --endpoint(s).
     show_managed_jobs = show_managed_jobs and not any([clusters, ip, endpoints])
@@ -1853,9 +1824,16 @@

     # Phase 2: Parallel submission of all API requests
     def submit_managed_jobs():
-
-
-
+        fields = _DEFAULT_MANAGED_JOB_FIELDS_TO_GET
+        if all_users:
+            fields = fields + _USER_NAME_FIELD
+        return cli_utils.get_managed_job_queue(
+            refresh=False,
+            skip_finished=True,
+            all_users=all_users,
+            fields=fields,
+            limit=_NUM_MANAGED_JOBS_TO_SHOW_IN_STATUS,
+        )

     def submit_services(
     ) -> Optional[server_common.RequestId[List[Dict[str, Any]]]]:
@@ -1870,17 +1848,7 @@
             return None

     def submit_workspace() -> Optional[server_common.RequestId[Dict[str, Any]]]:
-
-            return sdk.workspaces()
-        except RuntimeError:
-            # Backward compatibility for API server before #5660.
-            # TODO(zhwu): remove this after 0.10.0.
-            logger.warning(f'{colorama.Style.DIM}SkyPilot API server is '
-                           'in an old version, and may miss feature: '
-                           'workspaces. Update with: sky api stop; '
-                           'sky api start'
-                           f'{colorama.Style.RESET_ALL}')
-            return None
+        return sdk.workspaces()

     active_workspace = skypilot_config.get_active_workspace()

@@ -1888,6 +1856,7 @@
         return sdk.enabled_clouds(workspace=active_workspace, expand=True)

     managed_jobs_queue_request_id = None
+    queue_result_version = cli_utils.QueueResultVersion.V1
     service_status_request_id = None
     workspace_request_id = None
     pool_status_request_id = None
@@ -1906,7 +1875,8 @@

         # Get the request IDs
         if show_managed_jobs:
-            managed_jobs_queue_request_id
+            (managed_jobs_queue_request_id,
+             queue_result_version) = managed_jobs_request_future.result()
         if show_services:
             service_status_request_id = services_request_future.result()
         if show_pools:
@@ -1927,7 +1897,7 @@

     # Phase 3: Get cluster records and handle special cases
     cluster_records = _get_cluster_records_and_set_ssh_config(
-        query_clusters, refresh_mode, all_users)
+        query_clusters, refresh_mode, all_users, verbose)

     # TOOD(zhwu): setup the ssh config for status
     if ip or show_endpoints:
@@ -1938,7 +1908,8 @@
         controllers = []
         for cluster_record in cluster_records:
             cluster_name = cluster_record['name']
-            controller = controller_utils.Controllers.from_name(
+            controller = controller_utils.Controllers.from_name(
+                cluster_name, expect_exact_match=False)
             if controller is not None:
                 controllers.append(cluster_record)
             else:
@@ -1967,10 +1938,14 @@
             try:
                 num_in_progress_jobs, msg = _handle_jobs_queue_request(
                     managed_jobs_queue_request_id,
+                    pool_status_request_id=pool_status_request_id,
                     show_all=False,
                     show_user=all_users,
                     max_num_jobs_to_show=_NUM_MANAGED_JOBS_TO_SHOW_IN_STATUS,
-                    is_called_by_user=False
+                    is_called_by_user=False,
+                    only_in_progress=True,
+                    queue_result_version=queue_result_version,
+                )
             except KeyboardInterrupt:
                 sdk.api_cancel(managed_jobs_queue_request_id, silent=True)
                 managed_jobs_query_interrupted = True
@@ -2066,6 +2041,35 @@
         click.echo('\n' + '\n'.join(hints))


+@cli.command(hidden=True)
+@flags.config_option(expose_value=False)
+@flags.verbose_option()
+def status_kubernetes(verbose: bool):
+    """[Experimental] Show all SkyPilot resources (including from other '
+    'users) in the current Kubernetes context."""
+    all_clusters, unmanaged_clusters, all_jobs, context = (sdk.stream_and_get(
+        sdk.status_kubernetes()))
+    click.echo(f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
+               f'Kubernetes cluster state (context: {context})'
+               f'{colorama.Style.RESET_ALL}')
+    status_utils.show_kubernetes_cluster_status_table(unmanaged_clusters,
+                                                      show_all=verbose)
+    if all_jobs:
+        click.echo(f'\n{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
+                   f'Managed jobs'
+                   f'{colorama.Style.RESET_ALL}')
+        msg = table_utils.format_job_table(all_jobs,
+                                           show_all=verbose,
+                                           show_user=False)
+        click.echo(msg)
+    if any(['sky-serve-controller' in c.cluster_name for c in all_clusters]):
+        # TODO: Parse serve controllers and show services separately.
+        # Currently we show a hint that services are shown as clusters.
+        click.echo(f'\n{colorama.Style.DIM}Hint: SkyServe replica pods are '
+                   'shown in the "SkyPilot clusters" section.'
+                   f'{colorama.Style.RESET_ALL}')
+
+
 @cli.command()
 @flags.config_option(expose_value=False)
 @flags.all_option('Show all cluster information.')
@@ -2104,7 +2108,8 @@ def cost_report(all: bool, days: int):  # pylint: disable=redefined-builtin
     for cluster_record in cluster_records:
         cluster_name = cluster_record['name']
         try:
-            controller = controller_utils.Controllers.from_name(
+            controller = controller_utils.Controllers.from_name(
+                cluster_name, expect_exact_match=False)
         except AssertionError:
             # There could be some old controller clusters from previous
             # versions that we should not show in the cost report.
@@ -2192,7 +2197,7 @@ def queue(clusters: List[str], skip_finished: bool, all_users: bool):
                 f'cluster {cluster!r}.{colorama.Style.RESET_ALL}\n'
                 f'  {common_utils.format_exception(e)}')
             return
-        job_tables[cluster] =
+        job_tables[cluster] = table_utils.format_job_queue(job_table)

     subprocess_utils.run_in_parallel(_get_job_queue, clusters)
     user_str = 'all users' if all_users else 'current user'
@@ -2213,6 +2218,12 @@
               is_flag=True,
               default=False,
               help='Stream the cluster provisioning logs (provision.log).')
+@click.option('--worker',
+              '-w',
+              default=None,
+              type=int,
+              help='The worker ID to stream the logs from. '
+              'If not set, stream the logs of the head node.')
 @click.option(
     '--sync-down',
     '-s',
@@ -2250,6 +2261,7 @@ def logs(
     cluster: str,
     job_ids: Tuple[str, ...],
     provision: bool,
+    worker: Optional[int],
     sync_down: bool,
     status: bool,  # pylint: disable=redefined-outer-name
     follow: bool,
@@ -2279,6 +2291,13 @@
     4. If the job fails or fetching the logs fails, the command will exit with
     a non-zero return code.
     """
+    if worker is not None:
+        if not provision:
+            raise click.UsageError(
+                '--worker can only be used with --provision.')
+        if worker < 1:
+            raise click.UsageError('--worker must be a positive integer.')
+
     if provision and (sync_down or status or job_ids):
         raise click.UsageError(
             '--provision cannot be combined with job log options '
@@ -2298,7 +2317,11 @@

     if provision:
         # Stream provision logs
-        sys.exit(
+        sys.exit(
+            sdk.tail_provision_logs(cluster_name=cluster,
+                                    worker=worker,
+                                    follow=follow,
+                                    tail=tail))

     if sync_down:
         with rich_utils.client_status(
@@ -2476,7 +2499,8 @@ def cancel(
                                 job_ids=job_ids_to_cancel)
         _async_call_or_wait(request_id, async_call, 'sky.cancel')
     except exceptions.NotSupportedError as e:
-        controller = controller_utils.Controllers.from_name(
+        controller = controller_utils.Controllers.from_name(
+            cluster, expect_exact_match=False)
         assert controller is not None, cluster
         with ux_utils.print_exception_no_traceback():
             raise click.UsageError(
@@ -2777,7 +2801,8 @@ def start(
         # Get all clusters that are not controllers.
         cluster_records = [
             cluster for cluster in all_clusters
-            if controller_utils.Controllers.from_name(
+            if controller_utils.Controllers.from_name(
+                cluster['name'], expect_exact_match=False) is None
         ]
     if cluster_records is None:
         # Get GLOB cluster names
@@ -2839,7 +2864,8 @@ def start(
     # Checks for controller clusters (jobs controller / sky serve controller).
     controllers, normal_clusters = [], []
     for name in to_start:
-        if controller_utils.Controllers.from_name(
+        if controller_utils.Controllers.from_name(
+                name, expect_exact_match=False) is not None:
             controllers.append(name)
         else:
             normal_clusters.append(name)
@@ -2975,16 +3001,28 @@ def _hint_or_raise_for_down_jobs_controller(controller_name: str,
             to be torn down (e.g., because it has jobs running or
             it is in init state)
     """
-    controller = controller_utils.Controllers.from_name(
+    controller = controller_utils.Controllers.from_name(
+        controller_name, expect_exact_match=False)
     assert controller is not None, controller_name

+    status_counts: Optional[Dict[str, int]] = None
+    managed_jobs_: List[responses.ManagedJobRecord] = []
     with rich_utils.client_status(
             '[bold cyan]Checking for in-progress managed jobs and pools[/]'):
         try:
-
-
-
-
+            fields = _DEFAULT_MANAGED_JOB_FIELDS_TO_GET + _USER_NAME_FIELD
+            request_id, queue_result_version = cli_utils.get_managed_job_queue(
+                refresh=False,
+                skip_finished=True,
+                all_users=True,
+                fields=fields,
+            )
+            result = sdk.stream_and_get(request_id)
+            if queue_result_version.v2():
+                managed_jobs_, _, status_counts, _ = result
+            else:
+                managed_jobs_ = typing.cast(List[responses.ManagedJobRecord],
+                                            result)
             request_id_pools = managed_jobs.pool_status(pool_names=None)
             pools_ = sdk.stream_and_get(request_id_pools)
         except exceptions.ClusterNotUpError as e:
@@ -3002,25 +3040,6 @@
             # there is no in-prgress managed jobs.
             managed_jobs_ = []
             pools_ = []
-        except exceptions.InconsistentConsolidationModeError:
-            # If this error is raised, it means the user switched to the
-            # consolidation mode but the previous controller cluster is still
-            # running. We should allow the user to tear down the controller
-            # cluster in this case.
-            with skypilot_config.override_skypilot_config(
-                {'jobs': {
-                    'controller': {
-                        'consolidation_mode': False
-                    }
-                }}):
-                # Check again with the consolidation mode disabled. This is to
-                # make sure there is no in-progress managed jobs.
-                request_id = managed_jobs.queue(refresh=False,
-                                                skip_finished=True,
-                                                all_users=True)
-                managed_jobs_ = sdk.stream_and_get(request_id)
-                request_id_pools = managed_jobs.pool_status(pool_names=None)
-                pools_ = sdk.stream_and_get(request_id_pools)

     msg = (f'{colorama.Fore.YELLOW}WARNING: Tearing down the managed '
            'jobs controller. Please be aware of the following:'
@@ -3029,9 +3048,12 @@
            'jobs (output of `sky jobs queue`) will be lost.')
     click.echo(msg)
     if managed_jobs_:
-        job_table =
-
-
+        job_table = table_utils.format_job_table(
+            managed_jobs_,
+            show_all=False,
+            show_user=True,
+            status_counts=status_counts,
+        )
         msg = controller.value.decline_down_for_dirty_controller_hint
         # Add prefix to each line to align with the bullet point.
         msg += '\n'.join(
@@ -3074,7 +3096,8 @@ def _hint_or_raise_for_down_sky_serve_controller(controller_name: str,
             to be torn down (e.g., because it has services running or
             it is in init state)
     """
-    controller = controller_utils.Controllers.from_name(
+    controller = controller_utils.Controllers.from_name(
+        controller_name, expect_exact_match=False)
     assert controller is not None, controller_name
     with rich_utils.client_status('[bold cyan]Checking for live services[/]'):
         try:
@@ -3093,21 +3116,6 @@
             # controller being STOPPED or being firstly launched, i.e., there is
             # no in-prgress services.
             services = []
-        except exceptions.InconsistentConsolidationModeError:
-            # If this error is raised, it means the user switched to the
-            # consolidation mode but the previous controller cluster is still
-            # running. We should allow the user to tear down the controller
-            # cluster in this case.
-            with skypilot_config.override_skypilot_config(
-                {'serve': {
-                    'controller': {
-                        'consolidation_mode': False
-                    }
-                }}):
-                # Check again with the consolidation mode disabled. This is to
-                # make sure there is no in-progress services.
-                request_id = serve_lib.status(service_names=None)
-                services = sdk.stream_and_get(request_id)

     if services:
         service_names = [service['name'] for service in services]
@@ -3185,14 +3193,15 @@ def _down_or_stop_clusters(
     names = list(names)
     if names:
         controllers = [
-            name for name in names
-
+            name for name in names if controller_utils.Controllers.from_name(
+                name, expect_exact_match=False) is not None
         ]
         controllers_str = ', '.join(map(repr, controllers))
         names = [
             cluster['name']
             for cluster in _get_cluster_records_and_set_ssh_config(names)
-            if controller_utils.Controllers.from_name(
+            if controller_utils.Controllers.from_name(
+                cluster['name'], expect_exact_match=False) is None
         ]

         # Make sure the controllers are explicitly specified without other
@@ -3217,7 +3226,7 @@
                     f'{controllers_str} is currently not supported.')
             else:
                 controller = controller_utils.Controllers.from_name(
-                    controller_name)
+                    controller_name, expect_exact_match=False)
                 assert controller is not None
                 hint_or_raise = _controller_to_hint_or_raise(controller)
                 try:
@@ -3265,9 +3274,10 @@
             names = [
                 record['name']
                 for record in all_clusters
-                if controller_utils.Controllers.from_name(
-
-
+                if controller_utils.Controllers.from_name(
+                    record['name'], expect_exact_match=False) is None and
+                (down or idle_minutes_to_autostop is not None or
+                 record['status'] != status_lib.ClusterStatus.STOPPED)
             ]

         clusters = names
@@ -3297,6 +3307,9 @@

     request_ids = []

+    successes: List[str] = []
+    failures: List[Tuple[str, str]] = []
+
     def _down_or_stop(name: str):
         success_progress = False
         if idle_minutes_to_autostop is not None:
@@ -3304,16 +3317,20 @@
                 request_id = sdk.autostop(name, idle_minutes_to_autostop,
                                           wait_for, down)
                 request_ids.append(request_id)
+                progress.stop()
                 _async_call_or_wait(
                     request_id, async_call,
                     server_constants.REQUEST_NAME_PREFIX + operation)
-
-
+                progress.start()
+            except (exceptions.NotSupportedError, exceptions.ClusterNotUpError,
+                    exceptions.CloudError) as e:
                 message = str(e)
+                failures.append((name, str(e)))
             else:  # no exception raised
                 success_progress = True
                 message = (f'{colorama.Fore.GREEN}{operation} '
                            f'cluster {name!r}...done{colorama.Style.RESET_ALL}')
+                successes.append(name)
                 if idle_minutes_to_autostop >= 0:
                     option_str = 'down' if down else 'stop'
                     passive_str = 'downed' if down else 'stopped'
@@ -3333,9 +3350,11 @@
             else:
                 request_id = sdk.stop(name, purge=purge)
                 request_ids.append(request_id)
+            progress.stop()
             _async_call_or_wait(
                 request_id, async_call,
                 server_constants.REQUEST_NAME_PREFIX + operation)
+            progress.start()
             if not async_call:
                 # Remove the cluster from the SSH config file as soon as it
                 # is stopped or downed.
@@ -3345,13 +3364,17 @@
                 f'{colorama.Fore.RED}{operation} cluster {name}...failed. '
                 f'{colorama.Style.RESET_ALL}'
                 f'\nReason: {common_utils.format_exception(e)}.')
+            failures.append((name, str(e)))
         except (exceptions.NotSupportedError,
-                exceptions.ClusterOwnerIdentityMismatchError
+                exceptions.ClusterOwnerIdentityMismatchError,
+                exceptions.CloudError) as e:
             message = str(e)
+            failures.append((name, str(e)))
         else:  # no exception raised
             message = (
                 f'{colorama.Fore.GREEN}{operation} cluster {name}...done.'
                 f'{colorama.Style.RESET_ALL}')
+            successes.append(name)
             if not down:
                 message += ('\n  To restart the cluster, run: '
                             f'{colorama.Style.BRIGHT}sky start {name}'
@@ -3365,6 +3388,10 @@
         progress.start()

     with progress:
+        # we write a new line here to avoid the "Waiting for 'sky.down'
+        # request to be scheduled" message from being printed on the same line
+        # as the "Terminating <num> clusters..." message
+        click.echo('')
         subprocess_utils.run_in_parallel(_down_or_stop, clusters)
         progress.live.transient = False
         # Make sure the progress bar not mess up the terminal.
@@ -3374,6 +3401,31 @@
         click.secho(f'{operation} requests are sent. Check the requests\' '
                     'status with `sky request get <request_id>`.')

+    show_summary = len(clusters) > 1
+
+    if show_summary:
+        click.echo('\nSummary:')
+        if successes:
+            # Preserve the original order of clusters as provided by user.
+            click.echo('  ✓ Succeeded: ' + ', '.join(successes))
+        if failures:
+            # Format failures: if one failure, keep on same line. If multiple,
+            # indent each failed cluster on its own line for readability.
+            if len(failures) == 1:
+                name, reason = failures[0]
+                first = reason.strip().splitlines()[0]
+                first = first if len(first) <= 120 else first[:120] + '…'
+                click.echo(f'  ✗ Failed: {name} ({first})')
+            else:
+                click.echo('  ✗ Failed:')
+                for name, reason in failures:
+                    first = reason.strip().splitlines()[0]
+                    first = first if len(first) <= 120 else first[:120] + '…'
+                    click.echo(f'    {name} ({first})')
+
+    if failures:
+        click.echo('Cluster(s) failed. See details above.')
+

 @cli.command(cls=_DocumentedCodeCommand)
 @flags.config_option(expose_value=False)
@@ -4093,8 +4145,7 @@ def storage_ls(verbose: bool):
 """List storage objects managed by SkyPilot."""
 request_id = sdk.storage_ls()
 storages = sdk.stream_and_get(request_id)
-storage_table =
-show_all=verbose)
+storage_table = table_utils.format_storage_table(storages, show_all=verbose)
 click.echo(storage_table)


@@ -4174,6 +4225,10 @@ def volumes():
 pass


+# Add 'volume' as an alias for 'volumes'
+cli.add_command(volumes, name='volume')
+
+
 @volumes.command('apply', cls=_DocumentedCodeCommand)
 @flags.config_option(expose_value=False)
 @click.argument('entrypoint',
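The `cli.add_command(volumes, name='volume')` call added above registers the same click group under a second name. A minimal standalone sketch of this aliasing pattern, using toy command names rather than the real SkyPilot CLI:

    import click

    @click.group()
    def cli():
        """Toy CLI used only to illustrate the aliasing pattern."""

    @cli.group()
    def volumes():
        """Volume commands."""

    @volumes.command('ls')
    def volumes_ls():
        click.echo('listing volumes')

    # Register the same group object under an alias, so both
    # `cli volumes ls` and `cli volume ls` invoke the same code.
    cli.add_command(volumes, name='volume')

    if __name__ == '__main__':
        cli()
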
@@ -4189,17 +4244,25 @@ def volumes():
 @click.option('--infra',
 required=False,
 type=str,
-help='
+help='Infrastructure to use. '
+'Format: cloud, cloud/region, cloud/region/zone, or '
+'k8s/context-name.'
+'Examples: k8s, k8s/my-context, runpod/US/US-CA-2. '
 'Override the infra defined in the YAML.')
-@click.option(
-
-
-
-help='Volume type. Format: pvc. Override the type defined in the YAML.')
+@click.option('--type',
+required=False,
+type=click.Choice(volume_utils.VolumeType.supported_types()),
+help='Volume type. Override the type defined in the YAML.')
 @click.option('--size',
 required=False,
 type=str,
 help='Volume size. Override the size defined in the YAML.')
+@click.option(
+'--use-existing/--no-use-existing',
+required=False,
+default=None,
+help='Whether to use an existing volume. Override the use_existing '
+'defined in the YAML.')
 @click.option('--yes',
 '-y',
 is_flag=True,
@@ -4214,6 +4277,7 @@ def volumes_apply(
 infra: Optional[str],
 type: Optional[str], # pylint: disable=redefined-builtin
 size: Optional[str],
+use_existing: Optional[bool],
 yes: bool,
 async_call: bool):
 """Apply a volume.
@@ -4226,7 +4290,11 @@ def volumes_apply(
 sky volumes apply volume.yaml
 \b
 # Apply a volume from a command.
-sky volumes apply --name pvc1 --infra k8s --type pvc --size 100Gi
+sky volumes apply --name pvc1 --infra k8s --type k8s-pvc --size 100Gi
+\b
+# Apply a volume with existing PVC `pvc2` from a command.
+sky volumes apply --name pvc2 --infra k8s --type k8s-pvc --size 100Gi
+--use-existing
 """
 # pylint: disable=import-outside-toplevel
 from sky.volumes import volume as volume_lib
@@ -4245,7 +4313,8 @@ def volumes_apply(
 f'{entrypoint_str!r} needs to be a YAML file')
 if yaml_config is not None:
 volume_config_dict = yaml_config.copy()
-override_config = _build_volume_override_config(name, infra, type, size
+override_config = _build_volume_override_config(name, infra, type, size,
+use_existing)
 volume_config_dict.update(override_config)

 # Create Volume instance
@@ -4253,6 +4322,13 @@ def volumes_apply(

 logger.debug(f'Volume config: {volume.to_yaml_config()}')

+# TODO(kevin): remove the try block in v0.13.0
+try:
+volumes_sdk.validate(volume)
+except exceptions.APINotSupportedError:
+# Do best-effort client-side validation.
+volume.validate(skip_cloud_compatibility=True)
+
 if not yes:
 click.confirm(f'Proceed to create volume {volume.name!r}?',
 default=True,
@@ -4269,11 +4345,15 @@ def volumes_apply(
 f'{colorama.Style.RESET_ALL}')


-def _build_volume_override_config(
-
-
+def _build_volume_override_config(
+name: Optional[str],
+infra: Optional[str],
+volume_type: Optional[str],
+size: Optional[str],
+use_existing: Optional[bool],
+) -> Dict[str, Any]:
 """Parse the volume override config."""
-override_config = {}
+override_config: Dict[str, Any] = {}
 if name is not None:
 override_config['name'] = name
 if infra is not None:
@@ -4282,6 +4362,8 @@ def _build_volume_override_config(name: Optional[str], infra: Optional[str],
 override_config['type'] = volume_type
 if size is not None:
 override_config['size'] = size
+if use_existing is not None:
+override_config['use_existing'] = use_existing
 return override_config


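The `--use-existing/--no-use-existing` option added above uses click's paired-flag syntax with `default=None`, so the override dict only picks up a value when the user actually passed the flag. A minimal sketch of that tri-state pattern and the YAML-override merge (names are illustrative, not the SkyPilot implementation):

    from typing import Any, Dict, Optional

    import click

    @click.command()
    @click.option('--use-existing/--no-use-existing', default=None)
    @click.option('--size', type=str, default=None)
    def apply(use_existing: Optional[bool], size: Optional[str]) -> None:
        # Pretend this came from a YAML file.
        config: Dict[str, Any] = {'name': 'pvc1', 'size': '50Gi'}
        override: Dict[str, Any] = {}
        if size is not None:
            override['size'] = size
        # None means "flag not given"; only an explicit flag overrides YAML.
        if use_existing is not None:
            override['use_existing'] = use_existing
        config.update(override)
        click.echo(config)

    if __name__ == '__main__':
        apply()
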
@@ -4298,8 +4380,8 @@ def volumes_ls(verbose: bool):
 """List volumes managed by SkyPilot."""
 request_id = volumes_sdk.ls()
 all_volumes = sdk.stream_and_get(request_id)
-volume_table =
-
+volume_table = table_utils.format_volume_table(all_volumes,
+show_all=verbose)
 click.echo(volume_table)


@@ -4537,10 +4619,11 @@ def jobs_launch(
 break
 if print_setup_fm_warning:
 click.secho(
-f'{colorama.Fore.YELLOW}
-' will be ignored when
-f'please use `sky jobs pool apply {pool} new-pool.yaml`. '
+f'{colorama.Fore.YELLOW}Setup, file mounts, and storage mounts'
+' will be ignored when submitting jobs to pool. To update a '
+f'pool, please use `sky jobs pool apply {pool} new-pool.yaml`. '
 f'{colorama.Style.RESET_ALL}')
+print_setup_fm_warning = False

 # Optimize info is only show if _need_confirmation.
 if not yes:
@@ -4556,10 +4639,15 @@ def jobs_launch(
 job_id_handle = _async_call_or_wait(request_id, async_call,
 'sky.jobs.launch')

-if
-
-
-
+if async_call:
+return
+
+job_ids = [job_id_handle[0]] if isinstance(job_id_handle[0],
+int) else job_id_handle[0]
+
+if not detach_run:
+if len(job_ids) == 1:
+job_id = job_ids[0]
 returncode = managed_jobs.tail_logs(name=None,
 job_id=job_id,
 follow=True,
@@ -4568,7 +4656,8 @@ def jobs_launch(
 else:
 # TODO(tian): This can be very long. Considering have a "group id"
 # and query all job ids with the same group id.
-
+# Sort job ids to ensure consistent ordering.
+job_ids_str = ','.join(map(str, sorted(job_ids)))
 click.secho(
 f'Jobs submitted with IDs: {colorama.Fore.CYAN}'
 f'{job_ids_str}{colorama.Style.RESET_ALL}.'
@@ -4587,6 +4676,14 @@ def jobs_launch(
 @jobs.command('queue', cls=_DocumentedCodeCommand)
 @flags.config_option(expose_value=False)
 @flags.verbose_option()
+@click.option(
+'--limit',
+'-l',
+default=_NUM_MANAGED_JOBS_TO_SHOW,
+type=int,
+required=False,
+help=(f'Number of jobs to show, default is {_NUM_MANAGED_JOBS_TO_SHOW},'
+f' use "-a/--all" to show all jobs.'))
 @click.option(
 '--refresh',
 '-r',
@@ -4606,7 +4703,7 @@ def jobs_launch(
 @usage_lib.entrypoint
 # pylint: disable=redefined-builtin
 def jobs_queue(verbose: bool, refresh: bool, skip_finished: bool,
-all_users: bool, all: bool):
+all_users: bool, all: bool, limit: int):
 """Show statuses of managed jobs.

 Each managed jobs can have one of the following statuses:
@@ -4657,18 +4754,56 @@ def jobs_queue(verbose: bool, refresh: bool, skip_finished: bool,

 watch -n60 sky jobs queue

+(Tip) To show only the latest 10 jobs, use ``-l/--limit 10``:
+
+.. code-block:: bash
+
+sky jobs queue -l 10
+
 """
 click.secho('Fetching managed job statuses...', fg='cyan')
 with rich_utils.client_status('[cyan]Checking managed jobs[/]'):
-
-
-
+max_num_jobs_to_show = (limit if not all else None)
+fields = _DEFAULT_MANAGED_JOB_FIELDS_TO_GET
+if verbose:
+fields = _VERBOSE_MANAGED_JOB_FIELDS_TO_GET
+if all_users:
+fields = fields + _USER_NAME_FIELD
+if verbose:
+fields = fields + _USER_HASH_FIELD
+# Call both cli_utils.get_managed_job_queue and managed_jobs.pool_status
+# in parallel
+def get_managed_jobs_queue():
+return cli_utils.get_managed_job_queue(refresh=refresh,
+skip_finished=skip_finished,
+all_users=all_users,
+limit=max_num_jobs_to_show,
+fields=fields)
+
+def get_pool_status():
+try:
+return managed_jobs.pool_status(pool_names=None)
+except Exception: # pylint: disable=broad-except
+# If pool_status fails, we'll just skip the worker information
+return None
+
+with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
+managed_jobs_future = executor.submit(get_managed_jobs_queue)
+pool_status_future = executor.submit(get_pool_status)
+
+(managed_jobs_request_id,
+queue_result_version) = managed_jobs_future.result()
+pool_status_request_id = pool_status_future.result()
+
 num_jobs, msg = _handle_jobs_queue_request(
 managed_jobs_request_id,
+pool_status_request_id=pool_status_request_id,
 show_all=verbose,
 show_user=all_users,
 max_num_jobs_to_show=max_num_jobs_to_show,
-is_called_by_user=True
+is_called_by_user=True,
+queue_result_version=queue_result_version,
+)
 if not skip_finished:
 in_progress_only_hint = ''
 else:
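The queue command above now issues the managed-job query and the pool-status query concurrently. A standalone sketch of the two-future fan-out it uses, with stand-in fetch functions in place of the SkyPilot SDK calls:

    import concurrent.futures
    import time

    def fetch_job_queue():
        time.sleep(0.1)  # stand-in for the managed-job queue request
        return ['job-1', 'job-2']

    def fetch_pool_status():
        try:
            time.sleep(0.1)  # stand-in for the pool status request
            return {'my-pool': 'READY'}
        except Exception:  # best effort: missing pool info is not fatal
            return None

    with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
        jobs_future = executor.submit(fetch_job_queue)
        pools_future = executor.submit(fetch_pool_status)
        jobs = jobs_future.result()
        pools = pools_future.result()

    print(jobs, pools)
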
@@ -4681,7 +4816,8 @@ def jobs_queue(verbose: bool, refresh: bool, skip_finished: bool,
 f'{colorama.Fore.CYAN}'
 f'Only showing the latest {max_num_jobs_to_show} '
 f'managed jobs'
-f'(use --
+f'(use --limit to show more managed jobs or '
+f'--all to show all managed jobs) {colorama.Style.RESET_ALL} ')


 @jobs.command('cancel', cls=_DocumentedCodeCommand)
@@ -4849,7 +4985,7 @@ def pool():
 @pool.command('apply', cls=_DocumentedCodeCommand)
 @flags.config_option(expose_value=False)
 @click.argument('pool_yaml',
-required=
+required=False,
 type=str,
 nargs=-1,
 **_get_shell_complete_args(_complete_file_name))
@@ -4864,17 +5000,22 @@ def pool():
 type=click.Choice([m.value for m in serve_lib.UpdateMode],
 case_sensitive=False),
 required=False,
-help=('Update mode. If "rolling",
-'with rolling update. If "blue_green",
+help=('Update mode. If "rolling", pool will be updated '
+'with rolling update. If "blue_green", pool will '
 'be updated with blue-green update. This option is only '
 'valid when the pool is already running.'))
+@click.option('--workers',
+default=None,
+type=int,
+required=False,
+help='Can be used to update the number of workers in the pool.')
 @_add_click_options(flags.TASK_OPTIONS + flags.EXTRA_RESOURCES_OPTIONS +
 flags.COMMON_OPTIONS)
 @flags.yes_option()
 @timeline.event
 @usage_lib.entrypoint
 def jobs_pool_apply(
-pool_yaml: Tuple[str, ...],
+pool_yaml: Optional[Tuple[str, ...]],
 pool: Optional[str], # pylint: disable=redefined-outer-name
 workdir: Optional[str],
 infra: Optional[str],
@@ -4896,60 +5037,80 @@ def jobs_pool_apply(
 disk_tier: Optional[str],
 network_tier: Optional[str],
 mode: str,
+workers: Optional[int],
 yes: bool,
 async_call: bool,
 ):
-"""
-
-
-
-
-
+"""Either apply a config to a pool for managed jobs submission
+or update the number of workers in the pool. One of POOL_YAML or --workers
+must be provided.
+Config:
+If the pool is already running, the config will be applied to the pool.
+Otherwise, a new pool will be created.
+Workers:
+The --workers option can be used to override the number of workers
+specified in the YAML file, or to update workers without a YAML file.
+Example:
+sky jobs pool apply -p my-pool --workers 5
 """
 cloud, region, zone = _handle_infra_cloud_region_zone_options(
 infra, cloud, region, zone)
-if
-
+if workers is not None and pool_yaml is not None and len(pool_yaml) > 0:
+raise click.UsageError(
+'Cannot specify both --workers and POOL_YAML. Please use one of '
+'them.')

- (30 removed lines; content not shown in this diff view)
+if pool_yaml is None or len(pool_yaml) == 0:
+if pool is None:
+raise click.UsageError(
+'A pool name must be provided to update the number of workers.')
+task = None
+click.secho(f'Attempting to update {pool} to have {workers} workers',
+fg='cyan')
+else:
+if pool is None:
+pool = serve_lib.generate_service_name(pool=True)
+
+task = _generate_task_with_service(
+service_name=pool,
+service_yaml_args=pool_yaml,
+workdir=workdir,
+cloud=cloud,
+region=region,
+zone=zone,
+gpus=gpus,
+cpus=cpus,
+memory=memory,
+instance_type=instance_type,
+num_nodes=num_nodes,
+use_spot=use_spot,
+image_id=image_id,
+env_file=env_file,
+env=env,
+secret=secret,
+disk_size=disk_size,
+disk_tier=disk_tier,
+network_tier=network_tier,
+ports=ports,
+not_supported_cmd='sky jobs pool up',
+pool=True,
+)
+assert task.service is not None
+if not task.service.pool:
+raise click.UsageError('The YAML file needs a `pool` section.')
+click.secho('Pool spec:', fg='cyan')
+click.echo(task.service)
+serve_lib.validate_service_task(task, pool=True)

-
-
-
-
-
+click.secho(
+'Each pool worker will use the following resources (estimated):',
+fg='cyan')
+with dag_lib.Dag() as dag:
+dag.add(task)

 request_id = managed_jobs.pool_apply(task,
 pool,
+workers=workers,
 mode=serve_lib.UpdateMode(mode),
 _need_confirmation=not yes)
 _async_call_or_wait(request_id, async_call, 'sky.jobs.pool_apply')
@@ -4962,7 +5123,7 @@ def jobs_pool_apply(
 @usage_lib.entrypoint
 # pylint: disable=redefined-builtin
 def jobs_pool_status(verbose: bool, pool_names: List[str]):
-"""Show statuses of
+"""Show statuses of pools.

 Show detailed statuses of one or more pools. If POOL_NAME is not
 provided, show all pools' status.
@@ -5018,12 +5179,108 @@ def jobs_pool_down(
 raise click.UsageError('Can only specify one of POOL_NAMES or --all. '
 f'Provided {argument_str!r}.')

-
-
-
-
-
-
+def _get_nonterminal_jobs(pool_names: List[str],
+all: bool) -> List[responses.ManagedJobRecord]:
+# Get nonterminal jobs for this pool using managed_jobs.queue
+request_id, queue_result_version = cli_utils.get_managed_job_queue(
+refresh=False,
+skip_finished=True,
+all_users=True,
+limit=None,
+fields=['job_id', 'status', 'pool'],
+)
+jobs_result = sdk.stream_and_get(request_id)
+
+# Handle both tuple and list responses
+jobs_list: List[responses.ManagedJobRecord]
+if queue_result_version.v2():
+jobs_list = jobs_result[0]
+else:
+jobs_list = typing.cast(List[responses.ManagedJobRecord],
+jobs_result)
+
+def _should_include_job(job: responses.ManagedJobRecord) -> bool:
+# Job must not be terminal.
+if job.get('status', ManagedJobStatus.SUCCEEDED).is_terminal():
+return False
+# If len is 0 then we are using -a option, so we include all jobs
+# if they're associated with a pool.
+if all:
+return job.get('pool') is not None
+# Otherwise we are using specific pool names, so we include the job
+# if it's associated with one of the specified pools.
+return job.get('pool') in pool_names
+
+# Filter jobs by pool name and ensure nonterminal
+pool_jobs = [job for job in jobs_list if _should_include_job(job)]
+return pool_jobs
+
+quoted_pool_names = [f'{name!r}' for name in pool_names]
+list_pool_str = ', '.join(quoted_pool_names)
+pool_identity_str = f'pool(s) {list_pool_str}'
+if all:
+pool_identity_str = 'all pools'
+
+already_confirmed = False
+try:
+pool_jobs = _get_nonterminal_jobs(pool_names, all)
+if pool_jobs:
+num_jobs = len(pool_jobs)
+job_ids = [job['job_id'] for job in pool_jobs]
+job_ids_str = ','.join(str(job_id) for job_id in job_ids)
+click.echo(
+f'{colorama.Fore.YELLOW}Pool(s) has {num_jobs} '
+f'nonterminal jobs: {job_ids_str} so it is not yet safe to down'
+f'.{colorama.Style.RESET_ALL}')
+if not yes:
+should_cancel = click.confirm(
+'Would you like to cancel all jobs and down the pool(s)?',
+default=False,
+abort=False,
+show_default=True)
+if not should_cancel:
+raise click.Abort()
+already_confirmed = True
+
+# Cancel all jobs in the pool
+with rich_utils.client_status(
+ux_utils.spinner_message(
+f'Cancelling {num_jobs} jobs in {pool_identity_str}...')
+):
+try:
+sdk.get(managed_jobs.cancel(job_ids=job_ids))
+except Exception as e:
+logger.warning(f'Failed to cancel jobs: {e}.')
+raise e
+
+max_wait_time = 300 # 5 minutes max wait
+check_interval = 2 # Check every 2 seconds
+start_time = time.time()
+remaining_pool_jobs = _get_nonterminal_jobs(pool_names, all)
+while (remaining_pool_jobs and
+time.time() - start_time < max_wait_time):
+# Check remaining jobs via API
+time.sleep(check_interval)
+remaining_pool_jobs = _get_nonterminal_jobs(pool_names, all)
+ux_utils.spinner_message(
+f'Waiting for {len(remaining_pool_jobs)} '
+'jobs to be cancelled...')
+
+click.echo('\r' + ' ' * 80 + '\r', nl=False)
+if time.time() - start_time >= max_wait_time:
+click.echo(
+f'{colorama.Fore.YELLOW}Warning: Timeout waiting '
+f'for jobs to finish. Proceeding with pool down '
+f'anyway.{colorama.Style.RESET_ALL}')
+else:
+click.echo('All jobs cancelled.')
+except Exception as e: # pylint: disable=broad-except
+# If API call fails, log warning but continue with pool down
+logger.warning(
+f'Failed to check for running jobs in pool(s): {pool_names!r}: {e}.'
+' Proceeding with pool down.')
+
+if not yes and not already_confirmed:
 click.confirm(f'Terminating {pool_identity_str}. Proceed?',
 default=True,
 abort=True,
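`jobs pool down` above now cancels outstanding jobs and then polls until they reach a terminal state or a deadline expires. A minimal sketch of that poll-with-timeout loop; `list_nonterminal_jobs` is a hypothetical stand-in for the real query:

    import time

    def list_nonterminal_jobs():
        # Hypothetical stand-in; pretend everything already finished.
        return []

    max_wait_time = 300   # 5 minutes max wait
    check_interval = 2    # check every 2 seconds
    start_time = time.time()

    remaining = list_nonterminal_jobs()
    while remaining and time.time() - start_time < max_wait_time:
        time.sleep(check_interval)
        remaining = list_nonterminal_jobs()

    if time.time() - start_time >= max_wait_time:
        print('Timed out waiting for jobs to finish; proceeding anyway.')
    else:
        print('All jobs cancelled.')
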
@@ -5205,22 +5462,22 @@ def jobs_pool_logs(
 .. code-block:: bash

 # Tail the controller logs of a pool
-sky pool logs --controller [POOL_NAME]
+sky jobs pool logs --controller [POOL_NAME]
 \b
 # Print the worker logs so far and exit
-sky pool logs --no-follow [POOL_NAME]
+sky jobs pool logs --no-follow [POOL_NAME] 1
 \b
 # Tail the logs of worker 1
-sky pool logs [POOL_NAME] 1
+sky jobs pool logs [POOL_NAME] 1
 \b
 # Show the last 100 lines of the controller logs
-sky pool logs --controller --tail 100 [POOL_NAME]
+sky jobs pool logs --controller --tail 100 [POOL_NAME]
 \b
 # Sync down all logs of the pool (controller, all workers)
-sky pool logs [POOL_NAME] --sync-down
+sky jobs pool logs [POOL_NAME] --sync-down
 \b
 # Sync down controller logs and logs for workers 1 and 3
-sky pool logs [POOL_NAME] 1 3 --controller --sync-down
+sky jobs pool logs [POOL_NAME] 1 3 --controller --sync-down
 """
 _handle_serve_logs(pool_name,
 follow=follow,
@@ -5236,7 +5493,15 @@ def jobs_pool_logs(
 @flags.config_option(expose_value=False)
 @usage_lib.entrypoint
 def dashboard() -> None:
-"""
+"""Opens the SkyPilot dashboard."""
+sdk.dashboard()
+
+
+@cli.command(cls=_DocumentedCodeCommand, hidden=True)
+@flags.config_option(expose_value=False)
+@usage_lib.entrypoint
+def ui() -> None:
+"""Opens the SkyPilot dashboard."""
 sdk.dashboard()


@@ -5247,28 +5512,30 @@ def serve():


 def _generate_task_with_service(
- (22 removed lines; content not shown in this diff view)
+service_name: str,
+service_yaml_args: Tuple[str, ...],
+workdir: Optional[str],
+cloud: Optional[str],
+region: Optional[str],
+zone: Optional[str],
+num_nodes: Optional[int],
+use_spot: Optional[bool],
+image_id: Optional[str],
+env_file: Optional[Dict[str, str]],
+env: List[Tuple[str, str]],
+secret: Optional[List[Tuple[str, str]]],
+gpus: Optional[str],
+instance_type: Optional[str],
+ports: Optional[Tuple[str]],
+cpus: Optional[str],
+memory: Optional[str],
+disk_size: Optional[int],
+disk_tier: Optional[str],
+network_tier: Optional[str],
+not_supported_cmd: str,
+pool: bool, # pylint: disable=redefined-outer-name
+git_url: Optional[str] = None,
+git_ref: Optional[str] = None,
 ) -> task_lib.Task:
 """Generate a task with service section from a service YAML file."""
 is_yaml, _ = _check_yaml(''.join(service_yaml_args))
@@ -5298,6 +5565,8 @@ def _generate_task_with_service(
 disk_tier=disk_tier,
 network_tier=network_tier,
 ports=ports,
+git_url=git_url,
+git_ref=git_ref,
 )
 if isinstance(task, dag_lib.Dag):
 raise click.UsageError(
@@ -5313,7 +5582,7 @@ def _generate_task_with_service(
 if task.service.pool:
 if task.service.ports is not None or ports:
 with ux_utils.print_exception_no_traceback():
-raise ValueError('Cannot specify ports in a
+raise ValueError('Cannot specify ports in a pool.')
 return task

 # NOTE(yi): we only allow one service port now.
@@ -5389,6 +5658,10 @@ def _generate_task_with_service(
 type=str,
 help='A service name. Unique for each service. If not provided, '
 'a unique name is autogenerated.')
+@click.option('--git-url', type=str, help='Git repository URL.')
+@click.option('--git-ref',
+type=str,
+help='Git reference (branch, tag, or commit hash) to use.')
 @_add_click_options(flags.TASK_OPTIONS + flags.EXTRA_RESOURCES_OPTIONS +
 flags.COMMON_OPTIONS)
 @flags.yes_option()
@@ -5418,6 +5691,8 @@ def serve_up(
 network_tier: Optional[str],
 yes: bool,
 async_call: bool,
+git_url: Optional[str] = None,
+git_ref: Optional[str] = None,
 ):
 """Launch a SkyServe service.

@@ -5475,6 +5750,8 @@ def serve_up(
 ports=ports,
 not_supported_cmd='sky serve up',
 pool=False,
+git_url=git_url,
+git_ref=git_ref,
 )
 assert task.service is not None
 if task.service.pool:
@@ -5556,6 +5833,8 @@ def serve_update(
 sky serve update --mode blue_green sky-service-16aa new_service.yaml

 """
+# TODO(lloyd-brown): Add a way to update number of replicas for serve
+# the way we did for pools.
 cloud, region, zone = _handle_infra_cloud_region_zone_options(
 infra, cloud, region, zone)
 task = _generate_task_with_service(
@@ -5918,94 +6197,39 @@ def local():
 help='Launch cluster without GPU support even '
 'if GPUs are detected on the host.')
 @click.option(
-'--
+'--name',
 type=str,
 required=False,
-help='
-@click.option('--ssh-user',
-type=str,
-required=False,
-help='SSH username for accessing remote machines.')
-@click.option('--ssh-key-path',
-type=str,
-required=False,
-help='Path to the SSH private key.')
-@click.option('--cleanup',
-is_flag=True,
-help='Clean up the remote cluster instead of deploying it.')
+help='Name of the cluster. Defaults to "skypilot". Used without ip list.')
 @click.option(
-'--
-type=
+'--port-start',
+type=int,
 required=False,
-help='
-
-
-required=False,
-help='Password for the ssh-user to execute sudo commands. '
-'Required only if passwordless sudo is not setup.')
+help='Starting port range for the local kind cluster. Needs to be a '
+'multiple of 100. If not given, a random range will be used. '
+'Used without ip list.')
 @local.command('up', cls=_DocumentedCodeCommand)
 @flags.config_option(expose_value=False)
 @_add_click_options(flags.COMMON_OPTIONS)
 @usage_lib.entrypoint
-def local_up(gpus: bool,
-
-
-
-
-def _validate_args(ips, ssh_user, ssh_key_path, cleanup):
-# If any of --ips, --ssh-user, or --ssh-key-path is specified,
-# all must be specified
-if bool(ips) or bool(ssh_user) or bool(ssh_key_path):
-if not (ips and ssh_user and ssh_key_path):
-raise click.BadParameter(
-'All --ips, --ssh-user, and --ssh-key-path '
-'must be specified together.')
-
-# --cleanup can only be used if --ips, --ssh-user and --ssh-key-path
-# are all provided
-if cleanup and not (ips and ssh_user and ssh_key_path):
-raise click.BadParameter('--cleanup can only be used with '
-'--ips, --ssh-user and --ssh-key-path.')
-
-_validate_args(ips, ssh_user, ssh_key_path, cleanup)
-
-# If remote deployment arguments are specified, run remote up script
-ip_list = None
-ssh_key = None
-if ips and ssh_user and ssh_key_path:
-# Read and validate IP file
-try:
-with open(os.path.expanduser(ips), 'r', encoding='utf-8') as f:
-ip_list = f.read().strip().splitlines()
-if not ip_list:
-raise click.BadParameter(f'IP file is empty: {ips}')
-except (IOError, OSError) as e:
-raise click.BadParameter(f'Failed to read IP file {ips}: {str(e)}')
-
-# Read and validate SSH key file
-try:
-with open(os.path.expanduser(ssh_key_path), 'r',
-encoding='utf-8') as f:
-ssh_key = f.read()
-if not ssh_key:
-raise click.BadParameter(
-f'SSH key file is empty: {ssh_key_path}')
-except (IOError, OSError) as e:
-raise click.BadParameter(
-f'Failed to read SSH key file {ssh_key_path}: {str(e)}')
-
-request_id = sdk.local_up(gpus, ip_list, ssh_user, ssh_key, cleanup,
-context_name, password)
+def local_up(gpus: bool, name: Optional[str], port_start: Optional[int],
+async_call: bool):
+"""Creates a local cluster."""
+request_id = sdk.local_up(gpus, name, port_start)
 _async_call_or_wait(request_id, async_call, request_name='local up')


+@click.option('--name',
+type=str,
+required=False,
+help='Name of the cluster to down. Defaults to "skypilot".')
 @local.command('down', cls=_DocumentedCodeCommand)
 @flags.config_option(expose_value=False)
 @_add_click_options(flags.COMMON_OPTIONS)
 @usage_lib.entrypoint
-def local_down(async_call: bool):
+def local_down(name: Optional[str], async_call: bool):
 """Deletes a local cluster."""
-request_id = sdk.local_down()
+request_id = sdk.local_down(name)
 _async_call_or_wait(request_id, async_call, request_name='sky.local.down')


@@ -6119,20 +6343,22 @@ def api_logs(request_id: Optional[str], server_logs: bool,
 **_get_shell_complete_args(_complete_api_request))
 @flags.all_option('Cancel all your requests.')
 @flags.all_users_option('Cancel all requests from all users.')
+@flags.yes_option()
 @usage_lib.entrypoint
 # pylint: disable=redefined-builtin
-def api_cancel(request_ids: Optional[List[str]], all: bool, all_users: bool
+def api_cancel(request_ids: Optional[List[str]], all: bool, all_users: bool,
+yes: bool):
 """Cancel a request running on SkyPilot API server."""
 if all or all_users:
-
-
-
-
-
-
-
-
-
+if not yes:
+keyword = 'ALL USERS\'' if all_users else 'YOUR'
+user_input = click.prompt(
+f'This will cancel all {keyword} requests.\n'
+f'To proceed, please type {colorama.Style.BRIGHT}'
+f'\'cancel all requests\'{colorama.Style.RESET_ALL}',
+type=str)
+if user_input != 'cancel all requests':
+raise click.Abort()
 request_ids = None
 cancelled_request_ids = sdk.get(
 sdk.api_cancel(request_ids=request_ids, all_users=all_users))
@@ -6146,9 +6372,28 @@ def api_cancel(request_ids: Optional[List[str]], all: bool, all_users: bool):
 fg='green')


+class IntOrNone(click.ParamType):
+"""Int or None"""
+name = 'int-or-none'
+
+def convert(self, value, param, ctx):
+if isinstance(value, int):
+return value
+if isinstance(value, str) and value.lower() in ('none', 'all'):
+return None
+try:
+return int(value)
+except ValueError:
+self.fail(f'{value!r} is not a valid integer or "none" or "all"',
+param, ctx)
+
+
+INT_OR_NONE = IntOrNone()
+
+
 @api.command('status', cls=_DocumentedCodeCommand)
 @flags.config_option(expose_value=False)
-@click.argument('
+@click.argument('request_id_prefixes',
 required=False,
 type=str,
 nargs=-1,
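The `IntOrNone` param type added above lets `--limit` accept either an integer or the literals "none"/"all" (meaning no limit). A standalone sketch of how such a `click.ParamType` is wired into a toy command (the command itself is illustrative, not the SkyPilot CLI):

    from typing import Optional

    import click

    class IntOrNone(click.ParamType):
        """Accept an int, or 'none'/'all' meaning no limit."""
        name = 'int-or-none'

        def convert(self, value, param, ctx):
            if isinstance(value, int):
                return value
            if isinstance(value, str) and value.lower() in ('none', 'all'):
                return None
            try:
                return int(value)
            except ValueError:
                self.fail(f'{value!r} is not an integer, "none" or "all"',
                          param, ctx)

    @click.command()
    @click.option('--limit', '-l', type=IntOrNone(), default=50)
    def status(limit: Optional[int]) -> None:
        click.echo('showing all' if limit is None else f'showing {limit}')

    if __name__ == '__main__':
        status()
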
@@ -6158,16 +6403,30 @@ def api_cancel(request_ids: Optional[List[str]], all: bool, all_users: bool):
 is_flag=True,
 default=False,
 required=False,
-help='Show requests of all statuses
+help=('Show requests of all statuses, including finished ones '
+'(SUCCEEDED, FAILED, CANCELLED). By default, only active '
+'requests (PENDING, RUNNING) are shown.'))
+@click.option(
+'--limit',
+'-l',
+default=_NUM_REQUESTS_TO_SHOW,
+type=INT_OR_NONE,
+required=False,
+help=(f'Number of requests to show, default is {_NUM_REQUESTS_TO_SHOW},'
+f' set to "none" or "all" to show all requests.'))
 @flags.verbose_option('Show more details.')
 @usage_lib.entrypoint
 # pylint: disable=redefined-builtin
-def api_status(
-verbose: bool):
+def api_status(request_id_prefixes: Optional[List[str]], all_status: bool,
+verbose: bool, limit: Optional[int]):
 """List requests on SkyPilot API server."""
-if not
-
-
+if not request_id_prefixes:
+request_id_prefixes = None
+fields = _DEFAULT_REQUEST_FIELDS_TO_SHOW
+if verbose:
+fields = _VERBOSE_REQUEST_FIELDS_TO_SHOW
+request_list = sdk.api_status(request_id_prefixes, all_status, limit,
+fields)
 columns = ['ID', 'User', 'Name']
 if verbose:
 columns.append('Cluster')
@@ -6193,8 +6452,12 @@ def api_status(request_ids: Optional[List[str]], all_status: bool,
 if verbose:
 dummy_row.append('-')
 table.add_row(dummy_row)
-click.echo()
 click.echo(table)
+if limit and len(request_list) >= limit:
+click.echo()
+click.echo(
+f'Showing {limit} requests. Use "-l none" or "-l all" to show'
+f' all requests.')


 @api.command('login', cls=_DocumentedCodeCommand)