skypilot-nightly 1.0.0.dev20250905__py3-none-any.whl → 1.0.0.dev20251210__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +12 -2
- sky/adaptors/aws.py +27 -22
- sky/adaptors/common.py +25 -2
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/do.py +8 -2
- sky/adaptors/gcp.py +11 -0
- sky/adaptors/ibm.py +5 -2
- sky/adaptors/kubernetes.py +64 -0
- sky/adaptors/nebius.py +3 -1
- sky/adaptors/primeintellect.py +1 -0
- sky/adaptors/seeweb.py +183 -0
- sky/adaptors/shadeform.py +89 -0
- sky/adaptors/slurm.py +478 -0
- sky/admin_policy.py +20 -0
- sky/authentication.py +157 -263
- sky/backends/__init__.py +3 -2
- sky/backends/backend.py +11 -3
- sky/backends/backend_utils.py +630 -185
- sky/backends/cloud_vm_ray_backend.py +1111 -928
- sky/backends/local_docker_backend.py +9 -5
- sky/backends/task_codegen.py +971 -0
- sky/backends/wheel_utils.py +18 -0
- sky/catalog/__init__.py +8 -3
- sky/catalog/aws_catalog.py +4 -0
- sky/catalog/common.py +19 -1
- sky/catalog/data_fetchers/fetch_aws.py +102 -80
- sky/catalog/data_fetchers/fetch_gcp.py +30 -3
- sky/catalog/data_fetchers/fetch_nebius.py +9 -6
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/catalog/kubernetes_catalog.py +36 -32
- sky/catalog/primeintellect_catalog.py +95 -0
- sky/catalog/runpod_catalog.py +5 -1
- sky/catalog/seeweb_catalog.py +184 -0
- sky/catalog/shadeform_catalog.py +165 -0
- sky/catalog/slurm_catalog.py +243 -0
- sky/check.py +87 -46
- sky/client/cli/command.py +1004 -434
- sky/client/cli/flags.py +4 -2
- sky/{volumes/utils.py → client/cli/table_utils.py} +111 -13
- sky/client/cli/utils.py +79 -0
- sky/client/common.py +12 -2
- sky/client/sdk.py +188 -65
- sky/client/sdk_async.py +34 -33
- sky/cloud_stores.py +82 -3
- sky/clouds/__init__.py +8 -0
- sky/clouds/aws.py +337 -129
- sky/clouds/azure.py +24 -18
- sky/clouds/cloud.py +47 -13
- sky/clouds/cudo.py +16 -13
- sky/clouds/do.py +9 -7
- sky/clouds/fluidstack.py +12 -5
- sky/clouds/gcp.py +14 -7
- sky/clouds/hyperbolic.py +12 -5
- sky/clouds/ibm.py +12 -5
- sky/clouds/kubernetes.py +80 -45
- sky/clouds/lambda_cloud.py +12 -5
- sky/clouds/nebius.py +23 -9
- sky/clouds/oci.py +19 -12
- sky/clouds/paperspace.py +4 -1
- sky/clouds/primeintellect.py +317 -0
- sky/clouds/runpod.py +85 -24
- sky/clouds/scp.py +12 -8
- sky/clouds/seeweb.py +477 -0
- sky/clouds/shadeform.py +400 -0
- sky/clouds/slurm.py +578 -0
- sky/clouds/ssh.py +6 -3
- sky/clouds/utils/scp_utils.py +61 -50
- sky/clouds/vast.py +43 -27
- sky/clouds/vsphere.py +14 -16
- sky/core.py +296 -195
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/KYAhEFa3FTfq4JyKVgo-s/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1141-9c810f01ff4f398a.js +11 -0
- sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
- sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.ddda8c6c6f9f24dc.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
- sky/dashboard/out/_next/static/chunks/{6601-06114c982db410b6.js → 3800-b589397dc09c5b4e.js} +1 -1
- sky/dashboard/out/_next/static/chunks/3850-fd5696f3bbbaddae.js +1 -0
- sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
- sky/dashboard/out/_next/static/chunks/6856-da20c5fd999f319c.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-09cbf02d3cd518c3.js +1 -0
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
- sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
- sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
- sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-8369df1cf105221c.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-68b647e26f9d2793.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-33f525539665fdfd.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a7565f586ef86467.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-9e5d47818b9bdadd.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/{config-dfb9bf07b13045f4.js → config-718cdc365de82689.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-6563820e094f68ca.js → [context]-12c559ec4d81fdbd.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{infra-aabba60d57826e0f.js → infra-d187cd0413d72475.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-895847b6cf200b04.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-8d0f4655400b4eb9.js +21 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-e5a98f17f8513a96.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/plugins/[...slug]-4f46050ca065d8f8.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-2f7646eb77785a2c.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-ef19d49c6d0e8500.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-af76bb06dbb3954f.js → [name]-96e0f298308da7e2.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{workspaces-7598c33a746cdc91.js → workspaces-cb4da3abe08ebf19.js} +1 -1
- sky/dashboard/out/_next/static/chunks/webpack-fba3de387ff6bb08.js +1 -0
- sky/dashboard/out/_next/static/css/c5a4cfd2600fc715.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/plugins/[...slug].html +1 -0
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/data_utils.py +92 -1
- sky/data/mounting_utils.py +177 -30
- sky/data/storage.py +200 -19
- sky/data/storage_utils.py +10 -45
- sky/exceptions.py +18 -7
- sky/execution.py +74 -31
- sky/global_user_state.py +605 -191
- sky/jobs/__init__.py +2 -0
- sky/jobs/client/sdk.py +101 -4
- sky/jobs/client/sdk_async.py +31 -5
- sky/jobs/constants.py +15 -8
- sky/jobs/controller.py +726 -284
- sky/jobs/file_content_utils.py +128 -0
- sky/jobs/log_gc.py +193 -0
- sky/jobs/recovery_strategy.py +250 -100
- sky/jobs/scheduler.py +271 -173
- sky/jobs/server/core.py +367 -114
- sky/jobs/server/server.py +81 -35
- sky/jobs/server/utils.py +89 -35
- sky/jobs/state.py +1498 -620
- sky/jobs/utils.py +771 -306
- sky/logs/agent.py +40 -5
- sky/logs/aws.py +9 -19
- sky/metrics/utils.py +282 -39
- sky/models.py +2 -0
- sky/optimizer.py +7 -6
- sky/provision/__init__.py +38 -1
- sky/provision/aws/config.py +34 -13
- sky/provision/aws/instance.py +5 -2
- sky/provision/azure/instance.py +5 -3
- sky/provision/common.py +22 -0
- sky/provision/cudo/instance.py +4 -3
- sky/provision/do/instance.py +4 -3
- sky/provision/docker_utils.py +112 -28
- sky/provision/fluidstack/instance.py +6 -5
- sky/provision/gcp/config.py +6 -1
- sky/provision/gcp/instance.py +4 -2
- sky/provision/hyperbolic/instance.py +4 -2
- sky/provision/instance_setup.py +66 -20
- sky/provision/kubernetes/__init__.py +2 -0
- sky/provision/kubernetes/config.py +7 -44
- sky/provision/kubernetes/constants.py +0 -1
- sky/provision/kubernetes/instance.py +609 -213
- sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
- sky/provision/kubernetes/network.py +12 -8
- sky/provision/kubernetes/network_utils.py +8 -25
- sky/provision/kubernetes/utils.py +422 -422
- sky/provision/kubernetes/volume.py +150 -18
- sky/provision/lambda_cloud/instance.py +16 -13
- sky/provision/nebius/instance.py +6 -2
- sky/provision/nebius/utils.py +103 -86
- sky/provision/oci/instance.py +4 -2
- sky/provision/paperspace/instance.py +4 -3
- sky/provision/primeintellect/__init__.py +10 -0
- sky/provision/primeintellect/config.py +11 -0
- sky/provision/primeintellect/instance.py +454 -0
- sky/provision/primeintellect/utils.py +398 -0
- sky/provision/provisioner.py +45 -15
- sky/provision/runpod/__init__.py +2 -0
- sky/provision/runpod/instance.py +4 -3
- sky/provision/runpod/volume.py +69 -13
- sky/provision/scp/instance.py +307 -130
- sky/provision/seeweb/__init__.py +11 -0
- sky/provision/seeweb/config.py +13 -0
- sky/provision/seeweb/instance.py +812 -0
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/slurm/__init__.py +12 -0
- sky/provision/slurm/config.py +13 -0
- sky/provision/slurm/instance.py +572 -0
- sky/provision/slurm/utils.py +583 -0
- sky/provision/vast/instance.py +9 -4
- sky/provision/vast/utils.py +10 -6
- sky/provision/volume.py +164 -0
- sky/provision/vsphere/common/ssl_helper.py +1 -1
- sky/provision/vsphere/common/vapiconnect.py +2 -1
- sky/provision/vsphere/common/vim_utils.py +3 -2
- sky/provision/vsphere/instance.py +8 -6
- sky/provision/vsphere/vsphere_utils.py +8 -1
- sky/resources.py +11 -3
- sky/schemas/api/responses.py +107 -6
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
- sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
- sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
- sky/schemas/db/serve_state/002_yaml_content.py +34 -0
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/002_cluster_pool.py +3 -3
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
- sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
- sky/schemas/generated/jobsv1_pb2.py +86 -0
- sky/schemas/generated/jobsv1_pb2.pyi +254 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
- sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
- sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
- sky/schemas/generated/servev1_pb2.py +58 -0
- sky/schemas/generated/servev1_pb2.pyi +115 -0
- sky/schemas/generated/servev1_pb2_grpc.py +322 -0
- sky/serve/autoscalers.py +2 -0
- sky/serve/client/impl.py +55 -21
- sky/serve/constants.py +4 -3
- sky/serve/controller.py +17 -11
- sky/serve/load_balancing_policies.py +1 -1
- sky/serve/replica_managers.py +219 -142
- sky/serve/serve_rpc_utils.py +179 -0
- sky/serve/serve_state.py +63 -54
- sky/serve/serve_utils.py +145 -109
- sky/serve/server/core.py +46 -25
- sky/serve/server/impl.py +311 -162
- sky/serve/server/server.py +21 -19
- sky/serve/service.py +84 -68
- sky/serve/service_spec.py +45 -7
- sky/server/auth/loopback.py +38 -0
- sky/server/auth/oauth2_proxy.py +12 -7
- sky/server/common.py +47 -24
- sky/server/config.py +62 -28
- sky/server/constants.py +9 -1
- sky/server/daemons.py +109 -38
- sky/server/metrics.py +76 -96
- sky/server/middleware_utils.py +166 -0
- sky/server/plugins.py +222 -0
- sky/server/requests/executor.py +384 -145
- sky/server/requests/payloads.py +83 -19
- sky/server/requests/preconditions.py +15 -13
- sky/server/requests/request_names.py +123 -0
- sky/server/requests/requests.py +511 -157
- sky/server/requests/serializers/decoders.py +48 -17
- sky/server/requests/serializers/encoders.py +102 -20
- sky/server/requests/serializers/return_value_serializers.py +60 -0
- sky/server/requests/threads.py +117 -0
- sky/server/rest.py +116 -24
- sky/server/server.py +497 -179
- sky/server/server_utils.py +30 -0
- sky/server/stream_utils.py +219 -45
- sky/server/uvicorn.py +30 -19
- sky/setup_files/MANIFEST.in +6 -1
- sky/setup_files/alembic.ini +8 -0
- sky/setup_files/dependencies.py +64 -19
- sky/setup_files/setup.py +44 -44
- sky/sky_logging.py +13 -5
- sky/skylet/attempt_skylet.py +116 -24
- sky/skylet/configs.py +3 -1
- sky/skylet/constants.py +139 -29
- sky/skylet/events.py +74 -14
- sky/skylet/executor/__init__.py +1 -0
- sky/skylet/executor/slurm.py +189 -0
- sky/skylet/job_lib.py +143 -105
- sky/skylet/log_lib.py +252 -8
- sky/skylet/log_lib.pyi +47 -7
- sky/skylet/providers/ibm/node_provider.py +12 -8
- sky/skylet/providers/ibm/vpc_provider.py +13 -12
- sky/skylet/runtime_utils.py +21 -0
- sky/skylet/services.py +524 -0
- sky/skylet/skylet.py +27 -2
- sky/skylet/subprocess_daemon.py +104 -28
- sky/skypilot_config.py +99 -79
- sky/ssh_node_pools/constants.py +12 -0
- sky/ssh_node_pools/core.py +40 -3
- sky/ssh_node_pools/deploy/__init__.py +4 -0
- sky/ssh_node_pools/deploy/deploy.py +952 -0
- sky/ssh_node_pools/deploy/tunnel_utils.py +199 -0
- sky/ssh_node_pools/deploy/utils.py +173 -0
- sky/ssh_node_pools/server.py +20 -21
- sky/{utils/kubernetes/ssh_utils.py → ssh_node_pools/utils.py} +9 -6
- sky/task.py +221 -104
- sky/templates/aws-ray.yml.j2 +1 -0
- sky/templates/azure-ray.yml.j2 +1 -0
- sky/templates/cudo-ray.yml.j2 +1 -0
- sky/templates/do-ray.yml.j2 +1 -0
- sky/templates/fluidstack-ray.yml.j2 +1 -0
- sky/templates/gcp-ray.yml.j2 +1 -0
- sky/templates/hyperbolic-ray.yml.j2 +1 -0
- sky/templates/ibm-ray.yml.j2 +2 -1
- sky/templates/jobs-controller.yaml.j2 +3 -0
- sky/templates/kubernetes-ray.yml.j2 +204 -55
- sky/templates/lambda-ray.yml.j2 +1 -0
- sky/templates/nebius-ray.yml.j2 +3 -0
- sky/templates/oci-ray.yml.j2 +1 -0
- sky/templates/paperspace-ray.yml.j2 +1 -0
- sky/templates/primeintellect-ray.yml.j2 +72 -0
- sky/templates/runpod-ray.yml.j2 +1 -0
- sky/templates/scp-ray.yml.j2 +1 -0
- sky/templates/seeweb-ray.yml.j2 +171 -0
- sky/templates/shadeform-ray.yml.j2 +73 -0
- sky/templates/slurm-ray.yml.j2 +85 -0
- sky/templates/vast-ray.yml.j2 +2 -0
- sky/templates/vsphere-ray.yml.j2 +1 -0
- sky/templates/websocket_proxy.py +188 -43
- sky/usage/usage_lib.py +16 -4
- sky/users/model.conf +1 -1
- sky/users/permission.py +84 -44
- sky/users/rbac.py +31 -3
- sky/utils/accelerator_registry.py +6 -3
- sky/utils/admin_policy_utils.py +18 -5
- sky/utils/annotations.py +128 -6
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/atomic.py +1 -1
- sky/utils/auth_utils.py +153 -0
- sky/utils/cli_utils/status_utils.py +12 -7
- sky/utils/cluster_utils.py +28 -6
- sky/utils/command_runner.py +283 -30
- sky/utils/command_runner.pyi +63 -7
- sky/utils/common.py +3 -1
- sky/utils/common_utils.py +55 -7
- sky/utils/config_utils.py +1 -14
- sky/utils/context.py +127 -40
- sky/utils/context_utils.py +73 -18
- sky/utils/controller_utils.py +229 -70
- sky/utils/db/db_utils.py +95 -18
- sky/utils/db/kv_cache.py +149 -0
- sky/utils/db/migration_utils.py +24 -7
- sky/utils/env_options.py +4 -0
- sky/utils/git.py +559 -1
- sky/utils/kubernetes/create_cluster.sh +15 -30
- sky/utils/kubernetes/delete_cluster.sh +10 -7
- sky/utils/kubernetes/generate_kind_config.py +6 -66
- sky/utils/kubernetes/gpu_labeler.py +13 -3
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
- sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
- sky/utils/kubernetes/kubernetes_deploy_utils.py +187 -260
- sky/utils/kubernetes/rsync_helper.sh +11 -3
- sky/utils/kubernetes/ssh-tunnel.sh +7 -376
- sky/utils/kubernetes_enums.py +7 -15
- sky/utils/lock_events.py +4 -4
- sky/utils/locks.py +128 -31
- sky/utils/log_utils.py +0 -319
- sky/utils/resource_checker.py +13 -10
- sky/utils/resources_utils.py +53 -29
- sky/utils/rich_utils.py +8 -4
- sky/utils/schemas.py +138 -52
- sky/utils/subprocess_utils.py +17 -4
- sky/utils/thread_utils.py +91 -0
- sky/utils/timeline.py +2 -1
- sky/utils/ux_utils.py +35 -1
- sky/utils/volume.py +88 -4
- sky/utils/yaml_utils.py +9 -0
- sky/volumes/client/sdk.py +48 -10
- sky/volumes/server/core.py +59 -22
- sky/volumes/server/server.py +46 -17
- sky/volumes/volume.py +54 -42
- sky/workspaces/core.py +57 -21
- sky/workspaces/server.py +13 -12
- sky_templates/README.md +3 -0
- sky_templates/__init__.py +3 -0
- sky_templates/ray/__init__.py +0 -0
- sky_templates/ray/start_cluster +183 -0
- sky_templates/ray/stop_cluster +75 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/METADATA +343 -65
- skypilot_nightly-1.0.0.dev20251210.dist-info/RECORD +629 -0
- skypilot_nightly-1.0.0.dev20251210.dist-info/top_level.txt +2 -0
- sky/client/cli/git.py +0 -549
- sky/dashboard/out/_next/static/chunks/1121-408ed10b2f9fce17.js +0 -1
- sky/dashboard/out/_next/static/chunks/1141-943efc7aff0f0c06.js +0 -1
- sky/dashboard/out/_next/static/chunks/1836-37fede578e2da5f8.js +0 -40
- sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +0 -1
- sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
- sky/dashboard/out/_next/static/chunks/3785.4872a2f3aa489880.js +0 -1
- sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +0 -1
- sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +0 -21
- sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +0 -10
- sky/dashboard/out/_next/static/chunks/4725.10f7a9a5d3ea8208.js +0 -1
- sky/dashboard/out/_next/static/chunks/5339.3fda4a4010ff4e06.js +0 -51
- sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
- sky/dashboard/out/_next/static/chunks/649.b9d7f7d10c1b8c53.js +0 -45
- sky/dashboard/out/_next/static/chunks/6856-dca7962af4814e1b.js +0 -1
- sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
- sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +0 -6
- sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +0 -18
- sky/dashboard/out/_next/static/chunks/7669.1f5d9a402bf5cc42.js +0 -36
- sky/dashboard/out/_next/static/chunks/8969-0be3036bf86f8256.js +0 -1
- sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
- sky/dashboard/out/_next/static/chunks/9037-fa1737818d0a0969.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/_app-ce361c6959bc2001.js +0 -34
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1cbba24bd1bd35f8.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-0b4b35dc1dfe046c.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters-469814d711d63b1b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-dd64309c3fe67ed2.js +0 -11
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-07349868f7905d37.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/jobs-1f70d9faa564804f.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/users-018bf31cda52e11b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/volumes-739726d6b823f532.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-4fe903277b57b523.js +0 -1
- sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
- sky/dashboard/out/_next/static/mS-4qZPSkRuA1u-g2wQhg/_buildManifest.js +0 -1
- sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
- sky/utils/kubernetes/cleanup-tunnel.sh +0 -62
- sky/utils/kubernetes/deploy_remote_cluster.py +0 -1299
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
- skypilot_nightly-1.0.0.dev20250905.dist-info/RECORD +0 -547
- skypilot_nightly-1.0.0.dev20250905.dist-info/top_level.txt +0 -1
- /sky/dashboard/out/_next/static/{mS-4qZPSkRuA1u-g2wQhg → KYAhEFa3FTfq4JyKVgo-s}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/licenses/LICENSE +0 -0
sky/client/sdk.py
CHANGED
|
@@ -31,14 +31,18 @@ from sky import skypilot_config
|
|
|
31
31
|
from sky.adaptors import common as adaptors_common
|
|
32
32
|
from sky.client import common as client_common
|
|
33
33
|
from sky.client import oauth as oauth_lib
|
|
34
|
+
from sky.jobs import scheduler
|
|
35
|
+
from sky.jobs import utils as managed_job_utils
|
|
34
36
|
from sky.schemas.api import responses
|
|
35
37
|
from sky.server import common as server_common
|
|
36
38
|
from sky.server import rest
|
|
37
39
|
from sky.server import versions
|
|
38
40
|
from sky.server.requests import payloads
|
|
41
|
+
from sky.server.requests import request_names
|
|
39
42
|
from sky.server.requests import requests as requests_lib
|
|
40
43
|
from sky.skylet import autostop_lib
|
|
41
44
|
from sky.skylet import constants
|
|
45
|
+
from sky.ssh_node_pools import utils as ssh_utils
|
|
42
46
|
from sky.usage import usage_lib
|
|
43
47
|
from sky.utils import admin_policy_utils
|
|
44
48
|
from sky.utils import annotations
|
|
@@ -54,7 +58,6 @@ from sky.utils import status_lib
|
|
|
54
58
|
from sky.utils import subprocess_utils
|
|
55
59
|
from sky.utils import ux_utils
|
|
56
60
|
from sky.utils import yaml_utils
|
|
57
|
-
from sky.utils.kubernetes import ssh_utils
|
|
58
61
|
|
|
59
62
|
if typing.TYPE_CHECKING:
|
|
60
63
|
import base64
|
|
@@ -97,6 +100,9 @@ def reload_config() -> None:
|
|
|
97
100
|
skypilot_config.safe_reload_config()
|
|
98
101
|
|
|
99
102
|
|
|
103
|
+
# The overloads are not comprehensive - e.g. get_result Literal[False] could be
|
|
104
|
+
# specified to return None. We can add more overloads if needed. To do that see
|
|
105
|
+
# https://github.com/python/mypy/issues/8634#issuecomment-609411104
|
|
100
106
|
@typing.overload
|
|
101
107
|
def stream_response(request_id: None,
|
|
102
108
|
response: 'requests.Response',
|
|
@@ -111,7 +117,16 @@ def stream_response(request_id: server_common.RequestId[T],
|
|
|
111
117
|
response: 'requests.Response',
|
|
112
118
|
output_stream: Optional['io.TextIOBase'] = None,
|
|
113
119
|
resumable: bool = False,
|
|
114
|
-
get_result:
|
|
120
|
+
get_result: Literal[True] = True) -> T:
|
|
121
|
+
...
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
@typing.overload
|
|
125
|
+
def stream_response(request_id: server_common.RequestId[T],
|
|
126
|
+
response: 'requests.Response',
|
|
127
|
+
output_stream: Optional['io.TextIOBase'] = None,
|
|
128
|
+
resumable: bool = False,
|
|
129
|
+
get_result: bool = True) -> Optional[T]:
|
|
115
130
|
...
|
|
116
131
|
|
|
117
132
|
|
|
@@ -367,6 +382,16 @@ def workspaces() -> server_common.RequestId[Dict[str, Any]]:
|
|
|
367
382
|
return server_common.get_request_id(response)
|
|
368
383
|
|
|
369
384
|
|
|
385
|
+
def _raise_exception_object_on_client(e: BaseException) -> None:
|
|
386
|
+
"""Raise the exception object on the client."""
|
|
387
|
+
if env_options.Options.SHOW_DEBUG_INFO.get():
|
|
388
|
+
stacktrace = getattr(e, 'stacktrace', str(e))
|
|
389
|
+
logger.error('=== Traceback on SkyPilot API Server ===\n'
|
|
390
|
+
f'{stacktrace}')
|
|
391
|
+
with ux_utils.print_exception_no_traceback():
|
|
392
|
+
raise e
|
|
393
|
+
|
|
394
|
+
|
|
370
395
|
@usage_lib.entrypoint
|
|
371
396
|
@server_common.check_server_healthy_or_start
|
|
372
397
|
@annotations.client_api
|
|
@@ -407,9 +432,8 @@ def validate(
|
|
|
407
432
|
response = server_common.make_authenticated_request(
|
|
408
433
|
'POST', '/validate', json=json.loads(body.model_dump_json()))
|
|
409
434
|
if response.status_code == 400:
|
|
410
|
-
|
|
411
|
-
|
|
412
|
-
response.json().get('detail'))
|
|
435
|
+
_raise_exception_object_on_client(
|
|
436
|
+
exceptions.deserialize_exception(response.json().get('detail')))
|
|
413
437
|
|
|
414
438
|
|
|
415
439
|
@usage_lib.entrypoint
|
|
@@ -590,7 +614,10 @@ def launch(
|
|
|
590
614
|
down=down,
|
|
591
615
|
dryrun=dryrun)
|
|
592
616
|
with admin_policy_utils.apply_and_use_config_in_current_request(
|
|
593
|
-
dag,
|
|
617
|
+
dag,
|
|
618
|
+
request_name=request_names.AdminPolicyRequestName.CLUSTER_LAUNCH,
|
|
619
|
+
request_options=request_options,
|
|
620
|
+
at_client_side=True) as dag:
|
|
594
621
|
return _launch(
|
|
595
622
|
dag,
|
|
596
623
|
cluster_name,
|
|
@@ -648,7 +675,7 @@ def _launch(
|
|
|
648
675
|
clusters = get(status_request_id)
|
|
649
676
|
cluster_user_hash = common_utils.get_user_hash()
|
|
650
677
|
cluster_user_hash_str = ''
|
|
651
|
-
current_user = common_utils.
|
|
678
|
+
current_user = common_utils.get_local_user_name()
|
|
652
679
|
cluster_user_name = current_user
|
|
653
680
|
if not clusters:
|
|
654
681
|
# Show the optimize log before the prompt if the cluster does not
|
|
@@ -912,6 +939,7 @@ def tail_logs(
|
|
|
912
939
|
@annotations.client_api
|
|
913
940
|
@rest.retry_transient_errors()
|
|
914
941
|
def tail_provision_logs(cluster_name: str,
|
|
942
|
+
worker: Optional[int] = None,
|
|
915
943
|
follow: bool = True,
|
|
916
944
|
tail: int = 0,
|
|
917
945
|
output_stream: Optional['io.TextIOBase'] = None) -> int:
|
|
@@ -919,17 +947,31 @@ def tail_provision_logs(cluster_name: str,
|
|
|
919
947
|
|
|
920
948
|
Args:
|
|
921
949
|
cluster_name: name of the cluster.
|
|
950
|
+
worker: worker id in multi-node cluster.
|
|
951
|
+
If None, stream the logs of the head node.
|
|
922
952
|
follow: follow the logs.
|
|
923
953
|
tail: lines from end to tail.
|
|
924
954
|
output_stream: optional stream to write logs.
|
|
925
955
|
Returns:
|
|
926
956
|
Exit code 0 on streaming success; raises on HTTP error.
|
|
927
957
|
"""
|
|
928
|
-
body = payloads.
|
|
958
|
+
body = payloads.ProvisionLogsBody(cluster_name=cluster_name)
|
|
959
|
+
|
|
960
|
+
if worker is not None:
|
|
961
|
+
remote_api_version = versions.get_remote_api_version()
|
|
962
|
+
if remote_api_version is not None and remote_api_version >= 21:
|
|
963
|
+
if worker < 1:
|
|
964
|
+
raise ValueError('Worker must be a positive integer.')
|
|
965
|
+
body.worker = worker
|
|
966
|
+
else:
|
|
967
|
+
raise exceptions.APINotSupportedError(
|
|
968
|
+
'Worker node provision logs are not supported in your API '
|
|
969
|
+
'server. Please upgrade to a newer API server to use it.')
|
|
929
970
|
params = {
|
|
930
971
|
'follow': str(follow).lower(),
|
|
931
972
|
'tail': tail,
|
|
932
973
|
}
|
|
974
|
+
|
|
933
975
|
response = server_common.make_authenticated_request(
|
|
934
976
|
'POST',
|
|
935
977
|
'/provision_logs',
|
|
@@ -938,13 +980,21 @@ def tail_provision_logs(cluster_name: str,
|
|
|
938
980
|
stream=True,
|
|
939
981
|
timeout=(client_common.API_SERVER_REQUEST_CONNECTION_TIMEOUT_SECONDS,
|
|
940
982
|
None))
|
|
983
|
+
# Check for HTTP errors before streaming the response
|
|
984
|
+
if response.status_code != 200:
|
|
985
|
+
with ux_utils.print_exception_no_traceback():
|
|
986
|
+
raise exceptions.CommandError(response.status_code,
|
|
987
|
+
'tail_provision_logs',
|
|
988
|
+
'Failed to stream provision logs',
|
|
989
|
+
response.text)
|
|
990
|
+
|
|
941
991
|
# Log request is idempotent when tail is 0, thus can resume previous
|
|
942
992
|
# streaming point on retry.
|
|
943
993
|
# request_id=None here because /provision_logs does not create an async
|
|
944
994
|
# request. Instead, it streams a plain file from the server. This does NOT
|
|
945
995
|
# violate the stream_response doc warning about None in multi-user
|
|
946
|
-
# environments: we are not asking stream_response to select
|
|
947
|
-
# request
|
|
996
|
+
# environments: we are not asking stream_response to select "the latest
|
|
997
|
+
# request". We already have the HTTP response to stream; request_id=None
|
|
948
998
|
# merely disables the follow-up GET. It is also necessary for --no-follow
|
|
949
999
|
# to return cleanly after printing the tailed lines. If we provided a
|
|
950
1000
|
# non-None request_id here, the get(request_id) in stream_response(
|
|
@@ -1266,9 +1316,11 @@ def autostop(
|
|
|
1266
1316
|
@usage_lib.entrypoint
|
|
1267
1317
|
@server_common.check_server_healthy_or_start
|
|
1268
1318
|
@annotations.client_api
|
|
1269
|
-
def queue(
|
|
1270
|
-
|
|
1271
|
-
|
|
1319
|
+
def queue(
|
|
1320
|
+
cluster_name: str,
|
|
1321
|
+
skip_finished: bool = False,
|
|
1322
|
+
all_users: bool = False
|
|
1323
|
+
) -> server_common.RequestId[List[responses.ClusterJobRecord]]:
|
|
1272
1324
|
"""Gets the job queue of a cluster.
|
|
1273
1325
|
|
|
1274
1326
|
Args:
|
|
@@ -1281,8 +1333,8 @@ def queue(cluster_name: str,
|
|
|
1281
1333
|
The request ID of the queue request.
|
|
1282
1334
|
|
|
1283
1335
|
Request Returns:
|
|
1284
|
-
job_records (List[
|
|
1285
|
-
queue.
|
|
1336
|
+
job_records (List[responses.ClusterJobRecord]): A list of job records
|
|
1337
|
+
for each job in the queue.
|
|
1286
1338
|
|
|
1287
1339
|
.. code-block:: python
|
|
1288
1340
|
|
|
@@ -1428,6 +1480,7 @@ def status(
|
|
|
1428
1480
|
all_users: bool = False,
|
|
1429
1481
|
*,
|
|
1430
1482
|
_include_credentials: bool = False,
|
|
1483
|
+
_summary_response: bool = False,
|
|
1431
1484
|
) -> server_common.RequestId[List[responses.StatusResponse]]:
|
|
1432
1485
|
"""Gets cluster statuses.
|
|
1433
1486
|
|
|
@@ -1513,6 +1566,7 @@ def status(
|
|
|
1513
1566
|
refresh=refresh,
|
|
1514
1567
|
all_users=all_users,
|
|
1515
1568
|
include_credentials=_include_credentials,
|
|
1569
|
+
summary_response=_summary_response,
|
|
1516
1570
|
)
|
|
1517
1571
|
response = server_common.make_authenticated_request(
|
|
1518
1572
|
'POST', '/status', json=json.loads(body.model_dump_json()))
|
|
@@ -1613,26 +1667,15 @@ def cost_report(
|
|
|
1613
1667
|
@usage_lib.entrypoint
|
|
1614
1668
|
@server_common.check_server_healthy_or_start
|
|
1615
1669
|
@annotations.client_api
|
|
1616
|
-
def storage_ls() -> server_common.RequestId[List[
|
|
1670
|
+
def storage_ls() -> server_common.RequestId[List[responses.StorageRecord]]:
|
|
1617
1671
|
"""Gets the storages.
|
|
1618
1672
|
|
|
1619
1673
|
Returns:
|
|
1620
1674
|
The request ID of the storage list request.
|
|
1621
1675
|
|
|
1622
1676
|
Request Returns:
|
|
1623
|
-
storage_records (List[
|
|
1624
|
-
|
|
1625
|
-
|
|
1626
|
-
.. code-block:: python
|
|
1627
|
-
|
|
1628
|
-
{
|
|
1629
|
-
'name': (str) storage name,
|
|
1630
|
-
'launched_at': (int) timestamp of creation,
|
|
1631
|
-
'store': (List[sky.StoreType]) storage type,
|
|
1632
|
-
'last_use': (int) timestamp of last use,
|
|
1633
|
-
'status': (sky.StorageStatus) storage status,
|
|
1634
|
-
}
|
|
1635
|
-
]
|
|
1677
|
+
storage_records (List[responses.StorageRecord]):
|
|
1678
|
+
A list of storage records.
|
|
1636
1679
|
"""
|
|
1637
1680
|
response = server_common.make_authenticated_request('GET', '/storage/ls')
|
|
1638
1681
|
return server_common.get_request_id(response)
|
|
@@ -1669,12 +1712,8 @@ def storage_delete(name: str) -> server_common.RequestId[None]:
|
|
|
1669
1712
|
@server_common.check_server_healthy_or_start
|
|
1670
1713
|
@annotations.client_api
|
|
1671
1714
|
def local_up(gpus: bool,
|
|
1672
|
-
|
|
1673
|
-
|
|
1674
|
-
ssh_key: Optional[str],
|
|
1675
|
-
cleanup: bool,
|
|
1676
|
-
context_name: Optional[str] = None,
|
|
1677
|
-
password: Optional[str] = None) -> server_common.RequestId[None]:
|
|
1715
|
+
name: Optional[str] = None,
|
|
1716
|
+
port_start: Optional[int] = None) -> server_common.RequestId[None]:
|
|
1678
1717
|
"""Launches a Kubernetes cluster on local machines.
|
|
1679
1718
|
|
|
1680
1719
|
Returns:
|
|
@@ -1685,16 +1724,10 @@ def local_up(gpus: bool,
|
|
|
1685
1724
|
# TODO: move this check to server.
|
|
1686
1725
|
if not server_common.is_api_server_local():
|
|
1687
1726
|
with ux_utils.print_exception_no_traceback():
|
|
1688
|
-
raise ValueError(
|
|
1689
|
-
|
|
1690
|
-
|
|
1691
|
-
body = payloads.LocalUpBody(gpus=gpus,
|
|
1692
|
-
ips=ips,
|
|
1693
|
-
ssh_user=ssh_user,
|
|
1694
|
-
ssh_key=ssh_key,
|
|
1695
|
-
cleanup=cleanup,
|
|
1696
|
-
context_name=context_name,
|
|
1697
|
-
password=password)
|
|
1727
|
+
raise ValueError('`sky local up` is only supported when '
|
|
1728
|
+
'running SkyPilot locally.')
|
|
1729
|
+
|
|
1730
|
+
body = payloads.LocalUpBody(gpus=gpus, name=name, port_start=port_start)
|
|
1698
1731
|
response = server_common.make_authenticated_request(
|
|
1699
1732
|
'POST', '/local_up', json=json.loads(body.model_dump_json()))
|
|
1700
1733
|
return server_common.get_request_id(response)
|
|
@@ -1703,16 +1736,19 @@ def local_up(gpus: bool,
|
|
|
1703
1736
|
@usage_lib.entrypoint
|
|
1704
1737
|
@server_common.check_server_healthy_or_start
|
|
1705
1738
|
@annotations.client_api
|
|
1706
|
-
def local_down() -> server_common.RequestId[None]:
|
|
1739
|
+
def local_down(name: Optional[str]) -> server_common.RequestId[None]:
|
|
1707
1740
|
"""Tears down the Kubernetes cluster started by local_up."""
|
|
1708
1741
|
# We do not allow local up when the API server is running remotely since it
|
|
1709
1742
|
# will modify the kubeconfig.
|
|
1710
1743
|
# TODO: move this check to remote server.
|
|
1711
1744
|
if not server_common.is_api_server_local():
|
|
1712
1745
|
with ux_utils.print_exception_no_traceback():
|
|
1713
|
-
raise ValueError('sky local down is only supported when running '
|
|
1746
|
+
raise ValueError('`sky local down` is only supported when running '
|
|
1714
1747
|
'SkyPilot locally.')
|
|
1715
|
-
|
|
1748
|
+
|
|
1749
|
+
body = payloads.LocalDownBody(name=name)
|
|
1750
|
+
response = server_common.make_authenticated_request(
|
|
1751
|
+
'POST', '/local_down', json=json.loads(body.model_dump_json()))
|
|
1716
1752
|
return server_common.get_request_id(response)
|
|
1717
1753
|
|
|
1718
1754
|
|
|
@@ -1900,11 +1936,12 @@ def kubernetes_node_info(
|
|
|
1900
1936
|
@usage_lib.entrypoint
|
|
1901
1937
|
@server_common.check_server_healthy_or_start
|
|
1902
1938
|
@annotations.client_api
|
|
1903
|
-
def status_kubernetes() -> server_common.RequestId[
|
|
1904
|
-
List['kubernetes_utils.KubernetesSkyPilotClusterInfoPayload'],
|
|
1905
|
-
|
|
1906
|
-
|
|
1907
|
-
"""Gets all SkyPilot clusters and jobs
|
|
1939
|
+
def status_kubernetes() -> server_common.RequestId[
|
|
1940
|
+
Tuple[List['kubernetes_utils.KubernetesSkyPilotClusterInfoPayload'],
|
|
1941
|
+
List['kubernetes_utils.KubernetesSkyPilotClusterInfoPayload'],
|
|
1942
|
+
List[responses.ManagedJobRecord], Optional[str]]]:
|
|
1943
|
+
"""[Experimental] Gets all SkyPilot clusters and jobs
|
|
1944
|
+
in the Kubernetes cluster.
|
|
1908
1945
|
|
|
1909
1946
|
Managed jobs and services are also included in the clusters returned.
|
|
1910
1947
|
The caller must parse the controllers to identify which clusters are run
|
|
@@ -1976,12 +2013,7 @@ def get(request_id: server_common.RequestId[T]) -> T:
|
|
|
1976
2013
|
error = request_task.get_error()
|
|
1977
2014
|
if error is not None:
|
|
1978
2015
|
error_obj = error['object']
|
|
1979
|
-
|
|
1980
|
-
stacktrace = getattr(error_obj, 'stacktrace', str(error_obj))
|
|
1981
|
-
logger.error('=== Traceback on SkyPilot API Server ===\n'
|
|
1982
|
-
f'{stacktrace}')
|
|
1983
|
-
with ux_utils.print_exception_no_traceback():
|
|
1984
|
-
raise error_obj
|
|
2016
|
+
_raise_exception_object_on_client(error_obj)
|
|
1985
2017
|
if request_task.status == requests_lib.RequestStatus.CANCELLED:
|
|
1986
2018
|
with ux_utils.print_exception_no_traceback():
|
|
1987
2019
|
raise exceptions.RequestCancelled(
|
|
@@ -2067,6 +2099,12 @@ def stream_and_get(
|
|
|
2067
2099
|
detail = response.json().get('detail')
|
|
2068
2100
|
with ux_utils.print_exception_no_traceback():
|
|
2069
2101
|
raise exceptions.ClientError(f'Failed to stream logs: {detail}')
|
|
2102
|
+
stream_request_id: Optional[server_common.RequestId[
|
|
2103
|
+
T]] = server_common.get_stream_request_id(response)
|
|
2104
|
+
if request_id is not None and stream_request_id is not None:
|
|
2105
|
+
assert request_id == stream_request_id
|
|
2106
|
+
if request_id is None:
|
|
2107
|
+
request_id = stream_request_id
|
|
2070
2108
|
elif response.status_code != 200:
|
|
2071
2109
|
# TODO(syang): handle the case where the requestID is not provided
|
|
2072
2110
|
# see https://github.com/skypilot-org/skypilot/issues/6549
|
|
@@ -2076,6 +2114,7 @@ def stream_and_get(
|
|
|
2076
2114
|
return stream_response(request_id,
|
|
2077
2115
|
response,
|
|
2078
2116
|
output_stream,
|
|
2117
|
+
resumable=True,
|
|
2079
2118
|
get_result=follow)
|
|
2080
2119
|
|
|
2081
2120
|
|
|
@@ -2150,7 +2189,9 @@ def _local_api_server_running(kill: bool = False) -> bool:
|
|
|
2150
2189
|
def api_status(
|
|
2151
2190
|
request_ids: Optional[List[Union[server_common.RequestId[T], str]]] = None,
|
|
2152
2191
|
# pylint: disable=redefined-builtin
|
|
2153
|
-
all_status: bool = False
|
|
2192
|
+
all_status: bool = False,
|
|
2193
|
+
limit: Optional[int] = None,
|
|
2194
|
+
fields: Optional[List[str]] = None,
|
|
2154
2195
|
) -> List[payloads.RequestPayload]:
|
|
2155
2196
|
"""Lists all requests.
|
|
2156
2197
|
|
|
@@ -2159,6 +2200,8 @@ def api_status(
|
|
|
2159
2200
|
If None, all requests are queried.
|
|
2160
2201
|
all_status: Whether to list all finished requests as well. This argument
|
|
2161
2202
|
is ignored if request_ids is not None.
|
|
2203
|
+
limit: The number of requests to show. If None, show all requests.
|
|
2204
|
+
fields: The fields to get. If None, get all fields.
|
|
2162
2205
|
|
|
2163
2206
|
Returns:
|
|
2164
2207
|
A list of request payloads.
|
|
@@ -2167,8 +2210,12 @@ def api_status(
|
|
|
2167
2210
|
logger.info('SkyPilot API server is not running.')
|
|
2168
2211
|
return []
|
|
2169
2212
|
|
|
2170
|
-
body = payloads.RequestStatusBody(
|
|
2171
|
-
|
|
2213
|
+
body = payloads.RequestStatusBody(
|
|
2214
|
+
request_ids=request_ids,
|
|
2215
|
+
all_status=all_status,
|
|
2216
|
+
limit=limit,
|
|
2217
|
+
fields=fields,
|
|
2218
|
+
)
|
|
2172
2219
|
response = server_common.make_authenticated_request(
|
|
2173
2220
|
'GET',
|
|
2174
2221
|
'/api/status',
|
|
@@ -2287,10 +2334,32 @@ def api_stop() -> None:
|
|
|
2287
2334
|
f'Cannot kill the API server at {server_url} because it is not '
|
|
2288
2335
|
f'the default SkyPilot API server started locally.')
|
|
2289
2336
|
|
|
2290
|
-
|
|
2337
|
+
# Acquire the api server creation lock to prevent multiple processes from
|
|
2338
|
+
# stopping and starting the API server at the same time.
|
|
2339
|
+
with filelock.FileLock(
|
|
2340
|
+
os.path.expanduser(constants.API_SERVER_CREATION_LOCK_PATH)):
|
|
2341
|
+
try:
|
|
2342
|
+
records = scheduler.get_controller_process_records()
|
|
2343
|
+
if records is not None:
|
|
2344
|
+
for record in records:
|
|
2345
|
+
try:
|
|
2346
|
+
if managed_job_utils.controller_process_alive(
|
|
2347
|
+
record, quiet=False):
|
|
2348
|
+
subprocess_utils.kill_children_processes(
|
|
2349
|
+
parent_pids=[record.pid], force=True)
|
|
2350
|
+
except (psutil.NoSuchProcess, psutil.ZombieProcess):
|
|
2351
|
+
continue
|
|
2352
|
+
os.remove(os.path.expanduser(scheduler.JOB_CONTROLLER_PID_PATH))
|
|
2353
|
+
except FileNotFoundError:
|
|
2354
|
+
# its fine we will create it
|
|
2355
|
+
pass
|
|
2356
|
+
except Exception as e: # pylint: disable=broad-except
|
|
2357
|
+
# in case we get perm issues or something is messed up, just ignore
|
|
2358
|
+
# it and assume the process is dead
|
|
2359
|
+
logger.error(f'Error looking at job controller pid file: {e}')
|
|
2360
|
+
pass
|
|
2291
2361
|
|
|
2292
|
-
|
|
2293
|
-
server_common.clear_local_api_server_database()
|
|
2362
|
+
found = _local_api_server_running(kill=True)
|
|
2294
2363
|
|
|
2295
2364
|
if found:
|
|
2296
2365
|
logger.info(f'{colorama.Fore.GREEN}SkyPilot API server stopped.'
|
|
@@ -2675,3 +2744,57 @@ def api_logout() -> None:
|
|
|
2675
2744
|
_clear_api_server_config()
|
|
2676
2745
|
logger.info(f'{colorama.Fore.GREEN}Logged out of SkyPilot API server.'
|
|
2677
2746
|
f'{colorama.Style.RESET_ALL}')
|
|
2747
|
+
|
|
2748
|
+
|
|
2749
|
+
@usage_lib.entrypoint
|
|
2750
|
+
@server_common.check_server_healthy_or_start
|
|
2751
|
+
@versions.minimal_api_version(24)
|
|
2752
|
+
@annotations.client_api
|
|
2753
|
+
def realtime_slurm_gpu_availability(
|
|
2754
|
+
name_filter: Optional[str] = None,
|
|
2755
|
+
quantity_filter: Optional[int] = None) -> server_common.RequestId:
|
|
2756
|
+
"""Gets the real-time Slurm GPU availability.
|
|
2757
|
+
|
|
2758
|
+
Args:
|
|
2759
|
+
name_filter: Optional name filter for GPUs.
|
|
2760
|
+
quantity_filter: Optional quantity filter for GPUs.
|
|
2761
|
+
|
|
2762
|
+
Returns:
|
|
2763
|
+
The request ID of the Slurm GPU availability request.
|
|
2764
|
+
"""
|
|
2765
|
+
body = payloads.SlurmGpuAvailabilityRequestBody(
|
|
2766
|
+
name_filter=name_filter,
|
|
2767
|
+
quantity_filter=quantity_filter,
|
|
2768
|
+
)
|
|
2769
|
+
response = server_common.make_authenticated_request(
|
|
2770
|
+
'POST',
|
|
2771
|
+
'/slurm_gpu_availability',
|
|
2772
|
+
json=json.loads(body.model_dump_json()),
|
|
2773
|
+
)
|
|
2774
|
+
return server_common.get_request_id(response)
|
|
2775
|
+
|
|
2776
|
+
|
|
2777
|
+
@usage_lib.entrypoint
|
|
2778
|
+
@server_common.check_server_healthy_or_start
|
|
2779
|
+
@versions.minimal_api_version(24)
|
|
2780
|
+
@annotations.client_api
|
|
2781
|
+
def slurm_node_info(
|
|
2782
|
+
slurm_cluster_name: Optional[str] = None) -> server_common.RequestId:
|
|
2783
|
+
"""Gets the resource information for all nodes in the Slurm cluster.
|
|
2784
|
+
|
|
2785
|
+
Returns:
|
|
2786
|
+
The request ID of the Slurm node info request.
|
|
2787
|
+
|
|
2788
|
+
Request Returns:
|
|
2789
|
+
List[Dict[str, Any]]: A list of dictionaries, each containing info
|
|
2790
|
+
for a single Slurm node (node_name, partition, node_state,
|
|
2791
|
+
gpu_type, total_gpus, free_gpus, vcpu_count, memory_gb).
|
|
2792
|
+
"""
|
|
2793
|
+
body = payloads.SlurmNodeInfoRequestBody(
|
|
2794
|
+
slurm_cluster_name=slurm_cluster_name)
|
|
2795
|
+
response = server_common.make_authenticated_request(
|
|
2796
|
+
'GET',
|
|
2797
|
+
'/slurm_node_info',
|
|
2798
|
+
json=json.loads(body.model_dump_json()),
|
|
2799
|
+
)
|
|
2800
|
+
return server_common.get_request_id(response)
|
sky/client/sdk_async.py
CHANGED
|
@@ -19,20 +19,16 @@ import aiohttp
|
|
|
19
19
|
import colorama
|
|
20
20
|
|
|
21
21
|
from sky import admin_policy
|
|
22
|
-
from sky import backends
|
|
23
22
|
from sky import catalog
|
|
24
23
|
from sky import exceptions
|
|
25
|
-
from sky import models
|
|
26
24
|
from sky import sky_logging
|
|
27
25
|
from sky.client import common as client_common
|
|
28
26
|
from sky.client import sdk
|
|
29
|
-
from sky.provision.kubernetes import utils as kubernetes_utils
|
|
30
27
|
from sky.schemas.api import responses
|
|
31
28
|
from sky.server import common as server_common
|
|
32
29
|
from sky.server import rest
|
|
33
30
|
from sky.server.requests import payloads
|
|
34
31
|
from sky.server.requests import requests as requests_lib
|
|
35
|
-
from sky.skylet import job_lib
|
|
36
32
|
from sky.usage import usage_lib
|
|
37
33
|
from sky.utils import annotations
|
|
38
34
|
from sky.utils import common
|
|
@@ -45,6 +41,11 @@ if typing.TYPE_CHECKING:
|
|
|
45
41
|
import io
|
|
46
42
|
|
|
47
43
|
import sky
|
|
44
|
+
from sky import backends
|
|
45
|
+
from sky import models
|
|
46
|
+
from sky.provision.kubernetes import utils as kubernetes_utils
|
|
47
|
+
from sky.skylet import autostop_lib
|
|
48
|
+
from sky.skylet import job_lib
|
|
48
49
|
|
|
49
50
|
logger = sky_logging.init_logger(__name__)
|
|
50
51
|
logging.getLogger('httpx').setLevel(logging.CRITICAL)
|
|
@@ -381,9 +382,10 @@ async def launch(
|
|
|
381
382
|
cluster_name: Optional[str] = None,
|
|
382
383
|
retry_until_up: bool = False,
|
|
383
384
|
idle_minutes_to_autostop: Optional[int] = None,
|
|
385
|
+
wait_for: Optional['autostop_lib.AutostopWaitFor'] = None,
|
|
384
386
|
dryrun: bool = False,
|
|
385
387
|
down: bool = False, # pylint: disable=redefined-outer-name
|
|
386
|
-
backend: Optional[backends.Backend] = None,
|
|
388
|
+
backend: Optional['backends.Backend'] = None,
|
|
387
389
|
optimize_target: common.OptimizeTarget = common.OptimizeTarget.COST,
|
|
388
390
|
no_setup: bool = False,
|
|
389
391
|
clone_disk_from: Optional[str] = None,
|
|
@@ -395,12 +397,12 @@ async def launch(
|
|
|
395
397
|
_is_launched_by_sky_serve_controller: bool = False,
|
|
396
398
|
_disable_controller_check: bool = False,
|
|
397
399
|
stream_logs: Optional[StreamConfig] = DEFAULT_STREAM_CONFIG,
|
|
398
|
-
) -> Tuple[Optional[int], Optional[backends.ResourceHandle]]:
|
|
400
|
+
) -> Tuple[Optional[int], Optional['backends.ResourceHandle']]:
|
|
399
401
|
"""Async version of launch() that launches a cluster or task."""
|
|
400
402
|
request_id = await context_utils.to_thread(
|
|
401
403
|
sdk.launch, task, cluster_name, retry_until_up,
|
|
402
|
-
idle_minutes_to_autostop, dryrun, down, backend,
|
|
403
|
-
no_setup, clone_disk_from, fast, _need_confirmation,
|
|
404
|
+
idle_minutes_to_autostop, wait_for, dryrun, down, backend,
|
|
405
|
+
optimize_target, no_setup, clone_disk_from, fast, _need_confirmation,
|
|
404
406
|
_is_launched_by_jobs_controller, _is_launched_by_sky_serve_controller,
|
|
405
407
|
_disable_controller_check)
|
|
406
408
|
if stream_logs is not None:
|
|
@@ -416,9 +418,9 @@ async def exec( # pylint: disable=redefined-builtin
|
|
|
416
418
|
cluster_name: Optional[str] = None,
|
|
417
419
|
dryrun: bool = False,
|
|
418
420
|
down: bool = False, # pylint: disable=redefined-outer-name
|
|
419
|
-
backend: Optional[backends.Backend] = None,
|
|
421
|
+
backend: Optional['backends.Backend'] = None,
|
|
420
422
|
stream_logs: Optional[StreamConfig] = DEFAULT_STREAM_CONFIG,
|
|
421
|
-
) -> Tuple[Optional[int], Optional[backends.ResourceHandle]]:
|
|
423
|
+
) -> Tuple[Optional[int], Optional['backends.ResourceHandle']]:
|
|
422
424
|
"""Async version of exec() that executes a task on an existing cluster."""
|
|
423
425
|
request_id = await context_utils.to_thread(sdk.exec, task, cluster_name,
|
|
424
426
|
dryrun, down, backend)
|
|
@@ -454,15 +456,17 @@ async def download_logs(cluster_name: str,
|
|
|
454
456
|
async def start(
|
|
455
457
|
cluster_name: str,
|
|
456
458
|
idle_minutes_to_autostop: Optional[int] = None,
|
|
459
|
+
wait_for: Optional['autostop_lib.AutostopWaitFor'] = None,
|
|
457
460
|
retry_until_up: bool = False,
|
|
458
461
|
down: bool = False, # pylint: disable=redefined-outer-name
|
|
459
462
|
force: bool = False,
|
|
460
463
|
stream_logs: Optional[StreamConfig] = DEFAULT_STREAM_CONFIG,
|
|
461
|
-
) -> backends.CloudVmRayResourceHandle:
|
|
464
|
+
) -> 'backends.CloudVmRayResourceHandle':
|
|
462
465
|
"""Async version of start() that restarts a cluster."""
|
|
463
466
|
request_id = await context_utils.to_thread(sdk.start, cluster_name,
|
|
464
467
|
idle_minutes_to_autostop,
|
|
465
|
-
retry_until_up, down,
|
|
468
|
+
wait_for, retry_until_up, down,
|
|
469
|
+
force)
|
|
466
470
|
if stream_logs is not None:
|
|
467
471
|
return await _stream_and_get(request_id, stream_logs)
|
|
468
472
|
else:
|
|
@@ -502,13 +506,14 @@ async def stop(
|
|
|
502
506
|
async def autostop(
|
|
503
507
|
cluster_name: str,
|
|
504
508
|
idle_minutes: int,
|
|
509
|
+
wait_for: Optional['autostop_lib.AutostopWaitFor'] = None,
|
|
505
510
|
down: bool = False, # pylint: disable=redefined-outer-name
|
|
506
511
|
stream_logs: Optional[StreamConfig] = DEFAULT_STREAM_CONFIG
|
|
507
512
|
) -> None:
|
|
508
513
|
"""Async version of autostop() that schedules an autostop/autodown for a
|
|
509
514
|
cluster."""
|
|
510
515
|
request_id = await context_utils.to_thread(sdk.autostop, cluster_name,
|
|
511
|
-
idle_minutes, down)
|
|
516
|
+
idle_minutes, wait_for, down)
|
|
512
517
|
if stream_logs is not None:
|
|
513
518
|
return await _stream_and_get(request_id, stream_logs)
|
|
514
519
|
else:
|
|
@@ -518,11 +523,11 @@ async def autostop(
|
|
|
518
523
|
@usage_lib.entrypoint
|
|
519
524
|
@annotations.client_api
|
|
520
525
|
async def queue(
|
|
521
|
-
|
|
522
|
-
|
|
523
|
-
|
|
524
|
-
|
|
525
|
-
) -> List[
|
|
526
|
+
cluster_name: str,
|
|
527
|
+
skip_finished: bool = False,
|
|
528
|
+
all_users: bool = False,
|
|
529
|
+
stream_logs: Optional[StreamConfig] = DEFAULT_STREAM_CONFIG
|
|
530
|
+
) -> List[responses.ClusterJobRecord]:
|
|
526
531
|
"""Async version of queue() that gets the job queue of a cluster."""
|
|
527
532
|
request_id = await context_utils.to_thread(sdk.queue, cluster_name,
|
|
528
533
|
skip_finished, all_users)
|
|
@@ -538,7 +543,7 @@ async def job_status(
|
|
|
538
543
|
cluster_name: str,
|
|
539
544
|
job_ids: Optional[List[int]] = None,
|
|
540
545
|
stream_logs: Optional[StreamConfig] = DEFAULT_STREAM_CONFIG
|
|
541
|
-
) -> Dict[Optional[int], Optional[job_lib.JobStatus]]:
|
|
546
|
+
) -> Dict[Optional[int], Optional['job_lib.JobStatus']]:
|
|
542
547
|
"""Async version of job_status() that gets the status of jobs on a
|
|
543
548
|
cluster."""
|
|
544
549
|
request_id = await context_utils.to_thread(sdk.job_status, cluster_name,
|
|
@@ -651,18 +656,13 @@ async def storage_delete(
|
|
|
651
656
|
@annotations.client_api
|
|
652
657
|
async def local_up(
|
|
653
658
|
gpus: bool,
|
|
654
|
-
|
|
655
|
-
|
|
656
|
-
ssh_key: Optional[str],
|
|
657
|
-
cleanup: bool,
|
|
658
|
-
context_name: Optional[str] = None,
|
|
659
|
-
password: Optional[str] = None,
|
|
659
|
+
name: Optional[str] = None,
|
|
660
|
+
port_start: Optional[int] = None,
|
|
660
661
|
stream_logs: Optional[StreamConfig] = DEFAULT_STREAM_CONFIG) -> None:
|
|
661
662
|
"""Async version of local_up() that launches a Kubernetes cluster on
|
|
662
663
|
local machines."""
|
|
663
|
-
request_id = await context_utils.to_thread(sdk.local_up, gpus,
|
|
664
|
-
|
|
665
|
-
context_name, password)
|
|
664
|
+
request_id = await context_utils.to_thread(sdk.local_up, gpus, name,
|
|
665
|
+
port_start)
|
|
666
666
|
if stream_logs is not None:
|
|
667
667
|
return await _stream_and_get(request_id, stream_logs)
|
|
668
668
|
else:
|
|
@@ -672,10 +672,11 @@ async def local_up(
|
|
|
672
672
|
@usage_lib.entrypoint
|
|
673
673
|
@annotations.client_api
|
|
674
674
|
async def local_down(
|
|
675
|
+
name: Optional[str] = None,
|
|
675
676
|
stream_logs: Optional[StreamConfig] = DEFAULT_STREAM_CONFIG) -> None:
|
|
676
677
|
"""Async version of local_down() that tears down the Kubernetes cluster
|
|
677
678
|
started by local_up."""
|
|
678
|
-
request_id = await context_utils.to_thread(sdk.local_down)
|
|
679
|
+
request_id = await context_utils.to_thread(sdk.local_down, name)
|
|
679
680
|
if stream_logs is not None:
|
|
680
681
|
return await _stream_and_get(request_id, stream_logs)
|
|
681
682
|
else:
|
|
@@ -718,7 +719,7 @@ async def realtime_kubernetes_gpu_availability(
|
|
|
718
719
|
quantity_filter: Optional[int] = None,
|
|
719
720
|
is_ssh: Optional[bool] = None,
|
|
720
721
|
stream_logs: Optional[StreamConfig] = DEFAULT_STREAM_CONFIG
|
|
721
|
-
) -> List[Tuple[str, List[models.RealtimeGpuAvailability]]]:
|
|
722
|
+
) -> List[Tuple[str, List['models.RealtimeGpuAvailability']]]:
|
|
722
723
|
"""Async version of realtime_kubernetes_gpu_availability() that gets the
|
|
723
724
|
real-time Kubernetes GPU availability."""
|
|
724
725
|
request_id = await context_utils.to_thread(
|
|
@@ -735,7 +736,7 @@ async def realtime_kubernetes_gpu_availability(
|
|
|
735
736
|
async def kubernetes_node_info(
|
|
736
737
|
context: Optional[str] = None,
|
|
737
738
|
stream_logs: Optional[StreamConfig] = DEFAULT_STREAM_CONFIG
|
|
738
|
-
) -> models.KubernetesNodesInfo:
|
|
739
|
+
) -> 'models.KubernetesNodesInfo':
|
|
739
740
|
"""Async version of kubernetes_node_info() that gets the resource
|
|
740
741
|
information for all the nodes in the cluster."""
|
|
741
742
|
request_id = await context_utils.to_thread(sdk.kubernetes_node_info,
|
|
@@ -750,8 +751,8 @@ async def kubernetes_node_info(
|
|
|
750
751
|
@annotations.client_api
|
|
751
752
|
async def status_kubernetes(
|
|
752
753
|
stream_logs: Optional[StreamConfig] = DEFAULT_STREAM_CONFIG
|
|
753
|
-
) -> Tuple[List[kubernetes_utils.KubernetesSkyPilotClusterInfoPayload],
|
|
754
|
-
List[kubernetes_utils.KubernetesSkyPilotClusterInfoPayload],
|
|
754
|
+
) -> Tuple[List['kubernetes_utils.KubernetesSkyPilotClusterInfoPayload'],
|
|
755
|
+
List['kubernetes_utils.KubernetesSkyPilotClusterInfoPayload'],
|
|
755
756
|
List[Dict[str, Any]], Optional[str]]:
|
|
756
757
|
"""Async version of status_kubernetes() that gets all SkyPilot clusters
|
|
757
758
|
and jobs in the Kubernetes cluster."""
|