skypilot-nightly 1.0.0.dev20250905__py3-none-any.whl → 1.0.0.dev20251210__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +12 -2
- sky/adaptors/aws.py +27 -22
- sky/adaptors/common.py +25 -2
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/do.py +8 -2
- sky/adaptors/gcp.py +11 -0
- sky/adaptors/ibm.py +5 -2
- sky/adaptors/kubernetes.py +64 -0
- sky/adaptors/nebius.py +3 -1
- sky/adaptors/primeintellect.py +1 -0
- sky/adaptors/seeweb.py +183 -0
- sky/adaptors/shadeform.py +89 -0
- sky/adaptors/slurm.py +478 -0
- sky/admin_policy.py +20 -0
- sky/authentication.py +157 -263
- sky/backends/__init__.py +3 -2
- sky/backends/backend.py +11 -3
- sky/backends/backend_utils.py +630 -185
- sky/backends/cloud_vm_ray_backend.py +1111 -928
- sky/backends/local_docker_backend.py +9 -5
- sky/backends/task_codegen.py +971 -0
- sky/backends/wheel_utils.py +18 -0
- sky/catalog/__init__.py +8 -3
- sky/catalog/aws_catalog.py +4 -0
- sky/catalog/common.py +19 -1
- sky/catalog/data_fetchers/fetch_aws.py +102 -80
- sky/catalog/data_fetchers/fetch_gcp.py +30 -3
- sky/catalog/data_fetchers/fetch_nebius.py +9 -6
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/catalog/kubernetes_catalog.py +36 -32
- sky/catalog/primeintellect_catalog.py +95 -0
- sky/catalog/runpod_catalog.py +5 -1
- sky/catalog/seeweb_catalog.py +184 -0
- sky/catalog/shadeform_catalog.py +165 -0
- sky/catalog/slurm_catalog.py +243 -0
- sky/check.py +87 -46
- sky/client/cli/command.py +1004 -434
- sky/client/cli/flags.py +4 -2
- sky/{volumes/utils.py → client/cli/table_utils.py} +111 -13
- sky/client/cli/utils.py +79 -0
- sky/client/common.py +12 -2
- sky/client/sdk.py +188 -65
- sky/client/sdk_async.py +34 -33
- sky/cloud_stores.py +82 -3
- sky/clouds/__init__.py +8 -0
- sky/clouds/aws.py +337 -129
- sky/clouds/azure.py +24 -18
- sky/clouds/cloud.py +47 -13
- sky/clouds/cudo.py +16 -13
- sky/clouds/do.py +9 -7
- sky/clouds/fluidstack.py +12 -5
- sky/clouds/gcp.py +14 -7
- sky/clouds/hyperbolic.py +12 -5
- sky/clouds/ibm.py +12 -5
- sky/clouds/kubernetes.py +80 -45
- sky/clouds/lambda_cloud.py +12 -5
- sky/clouds/nebius.py +23 -9
- sky/clouds/oci.py +19 -12
- sky/clouds/paperspace.py +4 -1
- sky/clouds/primeintellect.py +317 -0
- sky/clouds/runpod.py +85 -24
- sky/clouds/scp.py +12 -8
- sky/clouds/seeweb.py +477 -0
- sky/clouds/shadeform.py +400 -0
- sky/clouds/slurm.py +578 -0
- sky/clouds/ssh.py +6 -3
- sky/clouds/utils/scp_utils.py +61 -50
- sky/clouds/vast.py +43 -27
- sky/clouds/vsphere.py +14 -16
- sky/core.py +296 -195
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/KYAhEFa3FTfq4JyKVgo-s/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1141-9c810f01ff4f398a.js +11 -0
- sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
- sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.ddda8c6c6f9f24dc.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
- sky/dashboard/out/_next/static/chunks/{6601-06114c982db410b6.js → 3800-b589397dc09c5b4e.js} +1 -1
- sky/dashboard/out/_next/static/chunks/3850-fd5696f3bbbaddae.js +1 -0
- sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
- sky/dashboard/out/_next/static/chunks/6856-da20c5fd999f319c.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-09cbf02d3cd518c3.js +1 -0
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
- sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
- sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
- sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-8369df1cf105221c.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-68b647e26f9d2793.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-33f525539665fdfd.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a7565f586ef86467.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-9e5d47818b9bdadd.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/{config-dfb9bf07b13045f4.js → config-718cdc365de82689.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-6563820e094f68ca.js → [context]-12c559ec4d81fdbd.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{infra-aabba60d57826e0f.js → infra-d187cd0413d72475.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-895847b6cf200b04.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-8d0f4655400b4eb9.js +21 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-e5a98f17f8513a96.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/plugins/[...slug]-4f46050ca065d8f8.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-2f7646eb77785a2c.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-ef19d49c6d0e8500.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-af76bb06dbb3954f.js → [name]-96e0f298308da7e2.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{workspaces-7598c33a746cdc91.js → workspaces-cb4da3abe08ebf19.js} +1 -1
- sky/dashboard/out/_next/static/chunks/webpack-fba3de387ff6bb08.js +1 -0
- sky/dashboard/out/_next/static/css/c5a4cfd2600fc715.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/plugins/[...slug].html +1 -0
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/data_utils.py +92 -1
- sky/data/mounting_utils.py +177 -30
- sky/data/storage.py +200 -19
- sky/data/storage_utils.py +10 -45
- sky/exceptions.py +18 -7
- sky/execution.py +74 -31
- sky/global_user_state.py +605 -191
- sky/jobs/__init__.py +2 -0
- sky/jobs/client/sdk.py +101 -4
- sky/jobs/client/sdk_async.py +31 -5
- sky/jobs/constants.py +15 -8
- sky/jobs/controller.py +726 -284
- sky/jobs/file_content_utils.py +128 -0
- sky/jobs/log_gc.py +193 -0
- sky/jobs/recovery_strategy.py +250 -100
- sky/jobs/scheduler.py +271 -173
- sky/jobs/server/core.py +367 -114
- sky/jobs/server/server.py +81 -35
- sky/jobs/server/utils.py +89 -35
- sky/jobs/state.py +1498 -620
- sky/jobs/utils.py +771 -306
- sky/logs/agent.py +40 -5
- sky/logs/aws.py +9 -19
- sky/metrics/utils.py +282 -39
- sky/models.py +2 -0
- sky/optimizer.py +7 -6
- sky/provision/__init__.py +38 -1
- sky/provision/aws/config.py +34 -13
- sky/provision/aws/instance.py +5 -2
- sky/provision/azure/instance.py +5 -3
- sky/provision/common.py +22 -0
- sky/provision/cudo/instance.py +4 -3
- sky/provision/do/instance.py +4 -3
- sky/provision/docker_utils.py +112 -28
- sky/provision/fluidstack/instance.py +6 -5
- sky/provision/gcp/config.py +6 -1
- sky/provision/gcp/instance.py +4 -2
- sky/provision/hyperbolic/instance.py +4 -2
- sky/provision/instance_setup.py +66 -20
- sky/provision/kubernetes/__init__.py +2 -0
- sky/provision/kubernetes/config.py +7 -44
- sky/provision/kubernetes/constants.py +0 -1
- sky/provision/kubernetes/instance.py +609 -213
- sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
- sky/provision/kubernetes/network.py +12 -8
- sky/provision/kubernetes/network_utils.py +8 -25
- sky/provision/kubernetes/utils.py +422 -422
- sky/provision/kubernetes/volume.py +150 -18
- sky/provision/lambda_cloud/instance.py +16 -13
- sky/provision/nebius/instance.py +6 -2
- sky/provision/nebius/utils.py +103 -86
- sky/provision/oci/instance.py +4 -2
- sky/provision/paperspace/instance.py +4 -3
- sky/provision/primeintellect/__init__.py +10 -0
- sky/provision/primeintellect/config.py +11 -0
- sky/provision/primeintellect/instance.py +454 -0
- sky/provision/primeintellect/utils.py +398 -0
- sky/provision/provisioner.py +45 -15
- sky/provision/runpod/__init__.py +2 -0
- sky/provision/runpod/instance.py +4 -3
- sky/provision/runpod/volume.py +69 -13
- sky/provision/scp/instance.py +307 -130
- sky/provision/seeweb/__init__.py +11 -0
- sky/provision/seeweb/config.py +13 -0
- sky/provision/seeweb/instance.py +812 -0
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/slurm/__init__.py +12 -0
- sky/provision/slurm/config.py +13 -0
- sky/provision/slurm/instance.py +572 -0
- sky/provision/slurm/utils.py +583 -0
- sky/provision/vast/instance.py +9 -4
- sky/provision/vast/utils.py +10 -6
- sky/provision/volume.py +164 -0
- sky/provision/vsphere/common/ssl_helper.py +1 -1
- sky/provision/vsphere/common/vapiconnect.py +2 -1
- sky/provision/vsphere/common/vim_utils.py +3 -2
- sky/provision/vsphere/instance.py +8 -6
- sky/provision/vsphere/vsphere_utils.py +8 -1
- sky/resources.py +11 -3
- sky/schemas/api/responses.py +107 -6
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
- sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
- sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
- sky/schemas/db/serve_state/002_yaml_content.py +34 -0
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/002_cluster_pool.py +3 -3
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
- sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
- sky/schemas/generated/jobsv1_pb2.py +86 -0
- sky/schemas/generated/jobsv1_pb2.pyi +254 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
- sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
- sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
- sky/schemas/generated/servev1_pb2.py +58 -0
- sky/schemas/generated/servev1_pb2.pyi +115 -0
- sky/schemas/generated/servev1_pb2_grpc.py +322 -0
- sky/serve/autoscalers.py +2 -0
- sky/serve/client/impl.py +55 -21
- sky/serve/constants.py +4 -3
- sky/serve/controller.py +17 -11
- sky/serve/load_balancing_policies.py +1 -1
- sky/serve/replica_managers.py +219 -142
- sky/serve/serve_rpc_utils.py +179 -0
- sky/serve/serve_state.py +63 -54
- sky/serve/serve_utils.py +145 -109
- sky/serve/server/core.py +46 -25
- sky/serve/server/impl.py +311 -162
- sky/serve/server/server.py +21 -19
- sky/serve/service.py +84 -68
- sky/serve/service_spec.py +45 -7
- sky/server/auth/loopback.py +38 -0
- sky/server/auth/oauth2_proxy.py +12 -7
- sky/server/common.py +47 -24
- sky/server/config.py +62 -28
- sky/server/constants.py +9 -1
- sky/server/daemons.py +109 -38
- sky/server/metrics.py +76 -96
- sky/server/middleware_utils.py +166 -0
- sky/server/plugins.py +222 -0
- sky/server/requests/executor.py +384 -145
- sky/server/requests/payloads.py +83 -19
- sky/server/requests/preconditions.py +15 -13
- sky/server/requests/request_names.py +123 -0
- sky/server/requests/requests.py +511 -157
- sky/server/requests/serializers/decoders.py +48 -17
- sky/server/requests/serializers/encoders.py +102 -20
- sky/server/requests/serializers/return_value_serializers.py +60 -0
- sky/server/requests/threads.py +117 -0
- sky/server/rest.py +116 -24
- sky/server/server.py +497 -179
- sky/server/server_utils.py +30 -0
- sky/server/stream_utils.py +219 -45
- sky/server/uvicorn.py +30 -19
- sky/setup_files/MANIFEST.in +6 -1
- sky/setup_files/alembic.ini +8 -0
- sky/setup_files/dependencies.py +64 -19
- sky/setup_files/setup.py +44 -44
- sky/sky_logging.py +13 -5
- sky/skylet/attempt_skylet.py +116 -24
- sky/skylet/configs.py +3 -1
- sky/skylet/constants.py +139 -29
- sky/skylet/events.py +74 -14
- sky/skylet/executor/__init__.py +1 -0
- sky/skylet/executor/slurm.py +189 -0
- sky/skylet/job_lib.py +143 -105
- sky/skylet/log_lib.py +252 -8
- sky/skylet/log_lib.pyi +47 -7
- sky/skylet/providers/ibm/node_provider.py +12 -8
- sky/skylet/providers/ibm/vpc_provider.py +13 -12
- sky/skylet/runtime_utils.py +21 -0
- sky/skylet/services.py +524 -0
- sky/skylet/skylet.py +27 -2
- sky/skylet/subprocess_daemon.py +104 -28
- sky/skypilot_config.py +99 -79
- sky/ssh_node_pools/constants.py +12 -0
- sky/ssh_node_pools/core.py +40 -3
- sky/ssh_node_pools/deploy/__init__.py +4 -0
- sky/ssh_node_pools/deploy/deploy.py +952 -0
- sky/ssh_node_pools/deploy/tunnel_utils.py +199 -0
- sky/ssh_node_pools/deploy/utils.py +173 -0
- sky/ssh_node_pools/server.py +20 -21
- sky/{utils/kubernetes/ssh_utils.py → ssh_node_pools/utils.py} +9 -6
- sky/task.py +221 -104
- sky/templates/aws-ray.yml.j2 +1 -0
- sky/templates/azure-ray.yml.j2 +1 -0
- sky/templates/cudo-ray.yml.j2 +1 -0
- sky/templates/do-ray.yml.j2 +1 -0
- sky/templates/fluidstack-ray.yml.j2 +1 -0
- sky/templates/gcp-ray.yml.j2 +1 -0
- sky/templates/hyperbolic-ray.yml.j2 +1 -0
- sky/templates/ibm-ray.yml.j2 +2 -1
- sky/templates/jobs-controller.yaml.j2 +3 -0
- sky/templates/kubernetes-ray.yml.j2 +204 -55
- sky/templates/lambda-ray.yml.j2 +1 -0
- sky/templates/nebius-ray.yml.j2 +3 -0
- sky/templates/oci-ray.yml.j2 +1 -0
- sky/templates/paperspace-ray.yml.j2 +1 -0
- sky/templates/primeintellect-ray.yml.j2 +72 -0
- sky/templates/runpod-ray.yml.j2 +1 -0
- sky/templates/scp-ray.yml.j2 +1 -0
- sky/templates/seeweb-ray.yml.j2 +171 -0
- sky/templates/shadeform-ray.yml.j2 +73 -0
- sky/templates/slurm-ray.yml.j2 +85 -0
- sky/templates/vast-ray.yml.j2 +2 -0
- sky/templates/vsphere-ray.yml.j2 +1 -0
- sky/templates/websocket_proxy.py +188 -43
- sky/usage/usage_lib.py +16 -4
- sky/users/model.conf +1 -1
- sky/users/permission.py +84 -44
- sky/users/rbac.py +31 -3
- sky/utils/accelerator_registry.py +6 -3
- sky/utils/admin_policy_utils.py +18 -5
- sky/utils/annotations.py +128 -6
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/atomic.py +1 -1
- sky/utils/auth_utils.py +153 -0
- sky/utils/cli_utils/status_utils.py +12 -7
- sky/utils/cluster_utils.py +28 -6
- sky/utils/command_runner.py +283 -30
- sky/utils/command_runner.pyi +63 -7
- sky/utils/common.py +3 -1
- sky/utils/common_utils.py +55 -7
- sky/utils/config_utils.py +1 -14
- sky/utils/context.py +127 -40
- sky/utils/context_utils.py +73 -18
- sky/utils/controller_utils.py +229 -70
- sky/utils/db/db_utils.py +95 -18
- sky/utils/db/kv_cache.py +149 -0
- sky/utils/db/migration_utils.py +24 -7
- sky/utils/env_options.py +4 -0
- sky/utils/git.py +559 -1
- sky/utils/kubernetes/create_cluster.sh +15 -30
- sky/utils/kubernetes/delete_cluster.sh +10 -7
- sky/utils/kubernetes/generate_kind_config.py +6 -66
- sky/utils/kubernetes/gpu_labeler.py +13 -3
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
- sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
- sky/utils/kubernetes/kubernetes_deploy_utils.py +187 -260
- sky/utils/kubernetes/rsync_helper.sh +11 -3
- sky/utils/kubernetes/ssh-tunnel.sh +7 -376
- sky/utils/kubernetes_enums.py +7 -15
- sky/utils/lock_events.py +4 -4
- sky/utils/locks.py +128 -31
- sky/utils/log_utils.py +0 -319
- sky/utils/resource_checker.py +13 -10
- sky/utils/resources_utils.py +53 -29
- sky/utils/rich_utils.py +8 -4
- sky/utils/schemas.py +138 -52
- sky/utils/subprocess_utils.py +17 -4
- sky/utils/thread_utils.py +91 -0
- sky/utils/timeline.py +2 -1
- sky/utils/ux_utils.py +35 -1
- sky/utils/volume.py +88 -4
- sky/utils/yaml_utils.py +9 -0
- sky/volumes/client/sdk.py +48 -10
- sky/volumes/server/core.py +59 -22
- sky/volumes/server/server.py +46 -17
- sky/volumes/volume.py +54 -42
- sky/workspaces/core.py +57 -21
- sky/workspaces/server.py +13 -12
- sky_templates/README.md +3 -0
- sky_templates/__init__.py +3 -0
- sky_templates/ray/__init__.py +0 -0
- sky_templates/ray/start_cluster +183 -0
- sky_templates/ray/stop_cluster +75 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/METADATA +343 -65
- skypilot_nightly-1.0.0.dev20251210.dist-info/RECORD +629 -0
- skypilot_nightly-1.0.0.dev20251210.dist-info/top_level.txt +2 -0
- sky/client/cli/git.py +0 -549
- sky/dashboard/out/_next/static/chunks/1121-408ed10b2f9fce17.js +0 -1
- sky/dashboard/out/_next/static/chunks/1141-943efc7aff0f0c06.js +0 -1
- sky/dashboard/out/_next/static/chunks/1836-37fede578e2da5f8.js +0 -40
- sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +0 -1
- sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
- sky/dashboard/out/_next/static/chunks/3785.4872a2f3aa489880.js +0 -1
- sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +0 -1
- sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +0 -21
- sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +0 -10
- sky/dashboard/out/_next/static/chunks/4725.10f7a9a5d3ea8208.js +0 -1
- sky/dashboard/out/_next/static/chunks/5339.3fda4a4010ff4e06.js +0 -51
- sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
- sky/dashboard/out/_next/static/chunks/649.b9d7f7d10c1b8c53.js +0 -45
- sky/dashboard/out/_next/static/chunks/6856-dca7962af4814e1b.js +0 -1
- sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
- sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +0 -6
- sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +0 -18
- sky/dashboard/out/_next/static/chunks/7669.1f5d9a402bf5cc42.js +0 -36
- sky/dashboard/out/_next/static/chunks/8969-0be3036bf86f8256.js +0 -1
- sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
- sky/dashboard/out/_next/static/chunks/9037-fa1737818d0a0969.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/_app-ce361c6959bc2001.js +0 -34
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1cbba24bd1bd35f8.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-0b4b35dc1dfe046c.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters-469814d711d63b1b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-dd64309c3fe67ed2.js +0 -11
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-07349868f7905d37.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/jobs-1f70d9faa564804f.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/users-018bf31cda52e11b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/volumes-739726d6b823f532.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-4fe903277b57b523.js +0 -1
- sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
- sky/dashboard/out/_next/static/mS-4qZPSkRuA1u-g2wQhg/_buildManifest.js +0 -1
- sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
- sky/utils/kubernetes/cleanup-tunnel.sh +0 -62
- sky/utils/kubernetes/deploy_remote_cluster.py +0 -1299
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
- skypilot_nightly-1.0.0.dev20250905.dist-info/RECORD +0 -547
- skypilot_nightly-1.0.0.dev20250905.dist-info/top_level.txt +0 -1
- /sky/dashboard/out/_next/static/{mS-4qZPSkRuA1u-g2wQhg → KYAhEFa3FTfq4JyKVgo-s}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/licenses/LICENSE +0 -0
sky/server/server.py
CHANGED
@@ -3,8 +3,10 @@
 import argparse
 import asyncio
 import base64
+from concurrent.futures import ThreadPoolExecutor
 import contextlib
 import datetime
+from enum import IntEnum
 import hashlib
 import json
 import multiprocessing
@@ -14,15 +16,18 @@ import posixpath
 import re
 import resource
 import shutil
+import struct
 import sys
 import threading
-
+import traceback
+from typing import Any, Dict, List, Literal, Optional, Set, Tuple
 import uuid
 import zipfile

 import aiofiles
 import anyio
 import fastapi
+from fastapi import responses as fastapi_responses
 from fastapi.middleware import cors
 import starlette.middleware.base
 import uvloop
@@ -38,9 +43,12 @@ from sky import global_user_state
 from sky import models
 from sky import sky_logging
 from sky.data import storage_utils
+from sky.jobs import utils as managed_job_utils
 from sky.jobs.server import server as jobs_rest
 from sky.metrics import utils as metrics_utils
+from sky.provision import metadata_utils
 from sky.provision.kubernetes import utils as kubernetes_utils
+from sky.provision.slurm import utils as slurm_utils
 from sky.schemas.api import responses
 from sky.serve.server import server as serve_rest
 from sky.server import common
@@ -48,14 +56,19 @@ from sky.server import config as server_config
 from sky.server import constants as server_constants
 from sky.server import daemons
 from sky.server import metrics
+from sky.server import middleware_utils
+from sky.server import plugins
+from sky.server import server_utils
 from sky.server import state
 from sky.server import stream_utils
 from sky.server import versions
 from sky.server.auth import authn
+from sky.server.auth import loopback
 from sky.server.auth import oauth2_proxy
 from sky.server.requests import executor
 from sky.server.requests import payloads
 from sky.server.requests import preconditions
+from sky.server.requests import request_names
 from sky.server.requests import requests as requests_lib
 from sky.skylet import constants
 from sky.ssh_node_pools import server as ssh_node_pools_rest
@@ -67,10 +80,13 @@ from sky.utils import common as common_lib
 from sky.utils import common_utils
 from sky.utils import context
 from sky.utils import context_utils
+from sky.utils import controller_utils
 from sky.utils import dag_utils
+from sky.utils import env_options
 from sky.utils import perf_utils
 from sky.utils import status_lib
 from sky.utils import subprocess_utils
+from sky.utils import ux_utils
 from sky.utils.db import db_utils
 from sky.volumes.server import server as volumes_rest
 from sky.workspaces import server as workspaces_rest
@@ -128,6 +144,7 @@ def _try_set_basic_auth_user(request: fastapi.Request):
             break


+@middleware_utils.websocket_aware
 class RBACMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
     """Middleware to handle RBAC."""

@@ -157,11 +174,9 @@ class RequestIDMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
     """Middleware to add a request ID to each request."""

     async def dispatch(self, request: fastapi.Request, call_next):
-        request_id =
+        request_id = requests_lib.get_new_request_id()
         request.state.request_id = request_id
         response = await call_next(request)
-        # TODO(syang): remove X-Request-ID when v0.10.0 is released.
-        response.headers['X-Request-ID'] = request_id
         response.headers['X-Skypilot-Request-ID'] = request_id
         return response

@@ -177,6 +192,7 @@ def _get_auth_user_header(request: fastapi.Request) -> Optional[models.User]:
     return models.User(id=user_hash, name=user_name)


+@middleware_utils.websocket_aware
 class InitializeRequestAuthUserMiddleware(
         starlette.middleware.base.BaseHTTPMiddleware):

@@ -187,10 +203,15 @@ class InitializeRequestAuthUserMiddleware(
         return await call_next(request)


+@middleware_utils.websocket_aware
 class BasicAuthMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
     """Middleware to handle HTTP Basic Auth."""

     async def dispatch(self, request: fastapi.Request, call_next):
+        if managed_job_utils.is_consolidation_mode(
+        ) and loopback.is_loopback_request(request):
+            return await call_next(request)
+
         if request.url.path.startswith('/api/health'):
             # Try to set the auth user from basic auth
             _try_set_basic_auth_user(request)
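The BasicAuthMiddleware hunk above short-circuits HTTP Basic Auth when the server runs in consolidation mode and the request comes from the local host, via the new sky/server/auth/loopback.py module (+38 lines, not shown in this diff). As a rough illustration of what a loopback test generally looks like, here is a hypothetical standalone check; the name _is_loopback and its behavior are assumptions, not the actual SkyPilot implementation:

# Hypothetical sketch only; the real logic lives in sky/server/auth/loopback.py,
# whose contents are not included in this diff.
import ipaddress

import fastapi


def _is_loopback(request: fastapi.Request) -> bool:
    """Returns True when the client address is a loopback address."""
    client = request.client
    if client is None:
        return False
    try:
        return ipaddress.ip_address(client.host).is_loopback
    except ValueError:
        # Unix sockets and test clients may report a non-IP host string.
        return client.host in ('localhost', 'testclient')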
@@ -234,6 +255,7 @@ class BasicAuthMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
         return await call_next(request)


+@middleware_utils.websocket_aware
 class BearerTokenMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
     """Middleware to handle Bearer Token Auth (Service Accounts)."""

@@ -361,6 +383,7 @@ class BearerTokenMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
         return await call_next(request)


+@middleware_utils.websocket_aware
 class AuthProxyMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
     """Middleware to handle auth proxy."""

@@ -437,7 +460,7 @@ async def loop_lag_monitor(loop: asyncio.AbstractEventLoop,
         if lag_threshold is not None and lag > lag_threshold:
             logger.warning(f'Event loop lag {lag} seconds exceeds threshold '
                            f'{lag_threshold} seconds.')
-
+        metrics_utils.SKY_APISERVER_EVENT_LOOP_LAG_SECONDS.labels(
             pid=pid).observe(lag)
         target = now + interval
         loop.call_at(target, tick)
@@ -445,6 +468,23 @@ async def loop_lag_monitor(loop: asyncio.AbstractEventLoop,
     loop.call_at(target, tick)


+async def schedule_on_boot_check_async():
+    try:
+        await executor.schedule_request_async(
+            request_id='skypilot-server-on-boot-check',
+            request_name=request_names.RequestName.CHECK,
+            request_body=server_utils.build_body_at_server(
+                request=None, body_type=payloads.CheckBody),
+            func=sky_check.check,
+            schedule_type=requests_lib.ScheduleType.SHORT,
+            is_skypilot_system=True,
+        )
+    except exceptions.RequestAlreadyExistsError:
+        # Lifespan will be executed in each uvicorn worker process, we
+        # can safely ignore the error if the task is already scheduled.
+        logger.debug('Request skypilot-server-on-boot-check already exists.')
+
+
 @contextlib.asynccontextmanager
 async def lifespan(app: fastapi.FastAPI):  # pylint: disable=redefined-outer-name
     """FastAPI lifespan context manager."""
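For context on the loop_lag_monitor lines touched above: the monitor schedules a callback at a known deadline with loop.call_at and treats how late the callback actually fires as the event-loop lag, which the new code also exports as a Prometheus histogram. A self-contained sketch of just the measurement idea (illustrative only; not the SkyPilot implementation):

import asyncio


def watch_loop_lag(loop: asyncio.AbstractEventLoop,
                   interval: float = 1.0,
                   lag_threshold: float = 0.5) -> None:
    """Prints a warning whenever periodic callbacks on `loop` fire late."""
    target = loop.time() + interval

    def tick() -> None:
        nonlocal target
        lag = loop.time() - target  # how late this callback actually ran
        if lag > lag_threshold:
            print(f'event loop lag: {lag:.3f}s')
        target = loop.time() + interval
        loop.call_at(target, tick)

    loop.call_at(target, tick)

Calling watch_loop_lag(asyncio.get_event_loop()) from startup code mirrors the asyncio.create_task(loop_lag_monitor(...)) call made in the lifespan hunk below.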
@@ -454,10 +494,11 @@ async def lifespan(app: fastapi.FastAPI):  # pylint: disable=redefined-outer-name
         if event.should_skip():
             continue
         try:
-            executor.
+            await executor.schedule_request_async(
                 request_id=event.id,
                 request_name=event.name,
-                request_body=
+                request_body=server_utils.build_body_at_server(
+                    request=None, body_type=payloads.RequestBody),
                 func=event.run_event,
                 schedule_type=requests_lib.ScheduleType.SHORT,
                 is_skypilot_system=True,
@@ -469,8 +510,9 @@ async def lifespan(app: fastapi.FastAPI):  # pylint: disable=redefined-outer-name
             # Lifespan will be executed in each uvicorn worker process, we
             # can safely ignore the error if the task is already scheduled.
             logger.debug(f'Request {event.id} already exists.')
+    await schedule_on_boot_check_async()
     asyncio.create_task(cleanup_upload_ids())
-    if
+    if metrics_utils.METRICS_ENABLED:
         # Start monitoring the event loop lag in each server worker
         # event loop (process).
         asyncio.create_task(loop_lag_monitor(asyncio.get_event_loop()))
@@ -518,6 +560,7 @@ class PathCleanMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
         return await call_next(request)


+@middleware_utils.websocket_aware
 class GracefulShutdownMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
     """Middleware to control requests when server is shutting down."""

@@ -537,6 +580,7 @@ class GracefulShutdownMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
         return await call_next(request)


+@middleware_utils.websocket_aware
 class APIVersionMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
     """Middleware to add API version to the request."""

@@ -579,6 +623,9 @@ app = fastapi.FastAPI(prefix='/api/v1', debug=True, lifespan=lifespan)
 if os.environ.get(constants.ENV_VAR_SERVER_METRICS_ENABLED):
     app.add_middleware(metrics.PrometheusMiddleware)
 app.add_middleware(APIVersionMiddleware)
+# The order of all the authentication-related middleware is important.
+# RBACMiddleware must precede all the auth middleware, so it can access
+# request.state.auth_user.
 app.add_middleware(RBACMiddleware)
 app.add_middleware(InternalDashboardPrefixMiddleware)
 app.add_middleware(GracefulShutdownMiddleware)
@@ -592,12 +639,7 @@ app.add_middleware(
     allow_credentials=True,
     allow_methods=['*'],
     allow_headers=['*'],
-
-    expose_headers=['X-Request-ID', 'X-Skypilot-Request-ID'])
-# The order of all the authentication-related middleware is important.
-# RBACMiddleware must precede all the auth middleware, so it can access
-# request.state.auth_user.
-app.add_middleware(RBACMiddleware)
+    expose_headers=['X-Skypilot-Request-ID'])
 # Authentication based on oauth2-proxy.
 app.add_middleware(oauth2_proxy.OAuth2ProxyMiddleware)
 # AuthProxyMiddleware should precede BasicAuthMiddleware and
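The two hunks above move the ordering comment next to the RBACMiddleware registration and drop the deprecated X-Request-ID header from the CORS expose list. The ordering matters because of how Starlette builds its middleware stack: each add_middleware call wraps the existing stack, so a middleware added later runs earlier (outermost) on the way in, and in current Starlette versions request.state is stored on the shared ASGI scope, which is why one layer can read what another set. A small standalone demonstration (illustrative; unrelated to the SkyPilot classes):

import fastapi
import starlette.middleware.base

app = fastapi.FastAPI()


class Tag(starlette.middleware.base.BaseHTTPMiddleware):
    """Appends its name to request.state.order when its layer runs."""

    def __init__(self, app, name: str):
        super().__init__(app)
        self.name = name

    async def dispatch(self, request, call_next):
        request.state.order = getattr(request.state, 'order', []) + [self.name]
        return await call_next(request)


app.add_middleware(Tag, name='inner')  # added first -> runs last (innermost)
app.add_middleware(Tag, name='outer')  # added last  -> runs first (outermost)


@app.get('/order')
async def order(request: fastapi.Request):
    return {'order': request.state.order}  # -> ['outer', 'inner']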
@@ -615,6 +657,17 @@ app.add_middleware(BearerTokenMiddleware)
 # middleware above.
 app.add_middleware(InitializeRequestAuthUserMiddleware)
 app.add_middleware(RequestIDMiddleware)
+
+# Load plugins after all the middlewares are added, to keep the core
+# middleware stack intact if a plugin adds new middlewares.
+# Note: server.py will be imported twice in server process, once as
+# the top-level entrypoint module and once imported by uvicorn, we only
+# load the plugin when imported by uvicorn for server process.
+# TODO(aylei): move uvicorn app out of the top-level module to avoid
+# duplicate app initialization.
+if __name__ == 'sky.server.server':
+    plugins.load_plugins(plugins.ExtensionContext(app=app))
+
 app.include_router(jobs_rest.router, prefix='/jobs', tags=['jobs'])
 app.include_router(serve_rest.router, prefix='/serve', tags=['serve'])
 app.include_router(users_rest.router, prefix='/users', tags=['users'])
@@ -625,16 +678,28 @@ app.include_router(volumes_rest.router, prefix='/volumes', tags=['volumes'])
 app.include_router(ssh_node_pools_rest.router,
                    prefix='/ssh_node_pools',
                    tags=['ssh_node_pools'])
-
-
-
-
-
-
-
-
-
-
+# increase the resource limit for the server
+soft, hard = resource.getrlimit(resource.RLIMIT_NOFILE)
+resource.setrlimit(resource.RLIMIT_NOFILE, (hard, hard))
+
+
+@app.exception_handler(exceptions.ConcurrentWorkerExhaustedError)
+def handle_concurrent_worker_exhausted_error(
+        request: fastapi.Request, e: exceptions.ConcurrentWorkerExhaustedError):
+    del request  # request is not used
+    # Print detailed error message to server log
+    logger.error('Concurrent worker exhausted: '
+                 f'{common_utils.format_exception(e)}')
+    with ux_utils.enable_traceback():
+        logger.error(f' Traceback: {traceback.format_exc()}')
+    # Return human readable error message to client
+    return fastapi.responses.JSONResponse(
+        status_code=503,
+        content={
+            'detail':
+                ('The server has exhausted its concurrent worker limit. '
+                 'Please try again or scale the server if the load persists.')
+        })


 @app.get('/token')
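The resource-limit lines added above follow the standard library pattern of raising the soft RLIMIT_NOFILE limit to the hard limit so the API server can keep more sockets and log files open. A standalone sketch of the same idea; the try/except guard is an extra illustration for environments that refuse the change, not part of the diff:

import resource


def raise_open_file_limit() -> int:
    """Raises the soft open-file limit to the hard limit; returns the value in effect."""
    soft, hard = resource.getrlimit(resource.RLIMIT_NOFILE)
    try:
        resource.setrlimit(resource.RLIMIT_NOFILE, (hard, hard))
        return hard
    except (ValueError, OSError):
        # Some containers/sandboxes reject the change; keep the current limit.
        return soft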
@@ -680,9 +745,9 @@ async def token(request: fastapi.Request,
 async def check(request: fastapi.Request,
                 check_body: payloads.CheckBody) -> None:
     """Checks enabled clouds."""
-    executor.
+    await executor.schedule_request_async(
         request_id=request.state.request_id,
-        request_name=
+        request_name=request_names.RequestName.CHECK,
         request_body=check_body,
         func=sky_check.check,
         schedule_type=requests_lib.ScheduleType.SHORT,
@@ -694,11 +759,14 @@ async def enabled_clouds(request: fastapi.Request,
                          workspace: Optional[str] = None,
                          expand: bool = False) -> None:
     """Gets enabled clouds on the server."""
-    executor.
+    await executor.schedule_request_async(
         request_id=request.state.request_id,
-        request_name=
-        request_body=
-
+        request_name=request_names.RequestName.ENABLED_CLOUDS,
+        request_body=server_utils.build_body_at_server(
+            request=request,
+            body_type=payloads.EnabledCloudsBody,
+            workspace=workspace,
+            expand=expand),
         func=core.enabled_clouds,
         schedule_type=requests_lib.ScheduleType.SHORT,
     )
@@ -710,9 +778,10 @@ async def realtime_kubernetes_gpu_availability(
     realtime_gpu_availability_body: payloads.RealtimeGpuAvailabilityRequestBody
 ) -> None:
     """Gets real-time Kubernetes GPU availability."""
-    executor.
+    await executor.schedule_request_async(
         request_id=request.state.request_id,
-        request_name=
+        request_name=request_names.RequestName.
+        REALTIME_KUBERNETES_GPU_AVAILABILITY,
         request_body=realtime_gpu_availability_body,
         func=core.realtime_kubernetes_gpu_availability,
         schedule_type=requests_lib.ScheduleType.SHORT,
@@ -725,22 +794,53 @@ async def kubernetes_node_info(
     kubernetes_node_info_body: payloads.KubernetesNodeInfoRequestBody
 ) -> None:
     """Gets Kubernetes nodes information and hints."""
-    executor.
+    await executor.schedule_request_async(
         request_id=request.state.request_id,
-        request_name=
+        request_name=request_names.RequestName.KUBERNETES_NODE_INFO,
         request_body=kubernetes_node_info_body,
         func=kubernetes_utils.get_kubernetes_node_info,
         schedule_type=requests_lib.ScheduleType.SHORT,
     )


+@app.post('/slurm_gpu_availability')
+async def slurm_gpu_availability(
+    request: fastapi.Request,
+    slurm_gpu_availability_body: payloads.SlurmGpuAvailabilityRequestBody
+) -> None:
+    """Gets real-time Slurm GPU availability."""
+    await executor.schedule_request_async(
+        request_id=request.state.request_id,
+        request_name=request_names.RequestName.REALTIME_SLURM_GPU_AVAILABILITY,
+        request_body=slurm_gpu_availability_body,
+        func=core.realtime_slurm_gpu_availability,
+        schedule_type=requests_lib.ScheduleType.SHORT,
+    )
+
+
+@app.get('/slurm_node_info')
+async def slurm_node_info(
+        request: fastapi.Request,
+        slurm_node_info_body: payloads.SlurmNodeInfoRequestBody) -> None:
+    """Gets detailed information for each node in the Slurm cluster."""
+    await executor.schedule_request_async(
+        request_id=request.state.request_id,
+        request_name=request_names.RequestName.SLURM_NODE_INFO,
+        request_body=slurm_node_info_body,
+        func=slurm_utils.slurm_node_info,
+        schedule_type=requests_lib.ScheduleType.SHORT,
+    )
+
+
 @app.get('/status_kubernetes')
 async def status_kubernetes(request: fastapi.Request) -> None:
-    """
-
+    """[Experimental] Get all SkyPilot resources (including from other '
+    'users) in the current Kubernetes context."""
+    await executor.schedule_request_async(
         request_id=request.state.request_id,
-        request_name=
-        request_body=
+        request_name=request_names.RequestName.STATUS_KUBERNETES,
+        request_body=server_utils.build_body_at_server(
+            request=request, body_type=payloads.RequestBody),
         func=core.status_kubernetes,
         schedule_type=requests_lib.ScheduleType.SHORT,
     )
@@ -751,9 +851,9 @@ async def list_accelerators(
     request: fastapi.Request,
     list_accelerator_counts_body: payloads.ListAcceleratorsBody) -> None:
     """Gets list of accelerators from cloud catalog."""
-    executor.
+    await executor.schedule_request_async(
         request_id=request.state.request_id,
-        request_name=
+        request_name=request_names.RequestName.LIST_ACCELERATORS,
         request_body=list_accelerator_counts_body,
         func=catalog.list_accelerators,
         schedule_type=requests_lib.ScheduleType.SHORT,
@@ -766,9 +866,9 @@ async def list_accelerator_counts(
     list_accelerator_counts_body: payloads.ListAcceleratorCountsBody
 ) -> None:
     """Gets list of accelerator counts from cloud catalog."""
-    executor.
+    await executor.schedule_request_async(
         request_id=request.state.request_id,
-        request_name=
+        request_name=request_names.RequestName.LIST_ACCELERATOR_COUNTS,
         request_body=list_accelerator_counts_body,
         func=catalog.list_accelerator_counts,
         schedule_type=requests_lib.ScheduleType.SHORT,
@@ -802,6 +902,7 @@ async def validate(validate_body: payloads.ValidateBody) -> None:
     # server thread.
     with admin_policy_utils.apply_and_use_config_in_current_request(
             dag,
+            request_name=request_names.AdminPolicyRequestName.VALIDATE,
             request_options=validate_body.get_request_options()) as dag:
         dag.resolve_and_validate_volumes()
         # Skip validating workdir and file_mounts, as those need to be
@@ -815,6 +916,11 @@ async def validate(validate_body: payloads.ValidateBody) -> None:
         # thread executor to avoid blocking the uvicorn event loop.
         await context_utils.to_thread(validate_dag, dag)
     except Exception as e:  # pylint: disable=broad-except
+        # Print the exception to the API server log.
+        if env_options.Options.SHOW_DEBUG_INFO.get():
+            logger.info('/validate exception:', exc_info=True)
+        # Set the exception stacktrace for the serialized exception.
+        requests_lib.set_exception_stacktrace(e)
         raise fastapi.HTTPException(
             status_code=400, detail=exceptions.serialize_exception(e)) from e

@@ -823,9 +929,9 @@ async def validate(validate_body: payloads.ValidateBody) -> None:
 async def optimize(optimize_body: payloads.OptimizeBody,
                    request: fastapi.Request) -> None:
     """Optimizes the user's DAG."""
-    executor.
+    await executor.schedule_request_async(
         request_id=request.state.request_id,
-        request_name=
+        request_name=request_names.RequestName.OPTIMIZE,
         request_body=optimize_body,
         ignore_return_value=True,
         func=core.optimize,
@@ -1033,9 +1139,9 @@ async def launch(launch_body: payloads.LaunchBody,
     """Launches a cluster or task."""
     request_id = request.state.request_id
     logger.info(f'Launching request: {request_id}')
-    executor.
+    await executor.schedule_request_async(
         request_id,
-        request_name=
+        request_name=request_names.RequestName.CLUSTER_LAUNCH,
         request_body=launch_body,
         func=execution.launch,
         schedule_type=requests_lib.ScheduleType.LONG,
@@ -1049,9 +1155,9 @@ async def launch(launch_body: payloads.LaunchBody,
 async def exec(request: fastapi.Request, exec_body: payloads.ExecBody) -> None:
     """Executes a task on an existing cluster."""
     cluster_name = exec_body.cluster_name
-    executor.
+    await executor.schedule_request_async(
         request_id=request.state.request_id,
-        request_name=
+        request_name=request_names.RequestName.CLUSTER_EXEC,
         request_body=exec_body,
         func=execution.exec,
         precondition=preconditions.ClusterStartCompletePrecondition(
@@ -1067,9 +1173,9 @@ async def exec(request: fastapi.Request, exec_body: payloads.ExecBody) -> None:
 async def stop(request: fastapi.Request,
                stop_body: payloads.StopOrDownBody) -> None:
     """Stops a cluster."""
-    executor.
+    await executor.schedule_request_async(
         request_id=request.state.request_id,
-        request_name=
+        request_name=request_names.RequestName.CLUSTER_STOP,
         request_body=stop_body,
         func=core.stop,
         schedule_type=requests_lib.ScheduleType.SHORT,
@@ -1087,9 +1193,9 @@ async def status(
         raise fastapi.HTTPException(
             status_code=503,
             detail='Server is shutting down, please try again later.')
-    executor.
+    await executor.schedule_request_async(
         request_id=request.state.request_id,
-        request_name=
+        request_name=request_names.RequestName.CLUSTER_STATUS,
         request_body=status_body,
         func=core.status,
         schedule_type=(requests_lib.ScheduleType.LONG if
@@ -1102,9 +1208,9 @@ async def status(
 async def endpoints(request: fastapi.Request,
                     endpoint_body: payloads.EndpointsBody) -> None:
     """Gets the endpoint for a given cluster and port number (endpoint)."""
-    executor.
+    await executor.schedule_request_async(
         request_id=request.state.request_id,
-        request_name=
+        request_name=request_names.RequestName.CLUSTER_ENDPOINTS,
         request_body=endpoint_body,
         func=core.endpoints,
         schedule_type=requests_lib.ScheduleType.SHORT,
@@ -1116,9 +1222,9 @@ async def endpoints(request: fastapi.Request,
 async def down(request: fastapi.Request,
                down_body: payloads.StopOrDownBody) -> None:
     """Tears down a cluster."""
-    executor.
+    await executor.schedule_request_async(
         request_id=request.state.request_id,
-        request_name=
+        request_name=request_names.RequestName.CLUSTER_DOWN,
         request_body=down_body,
         func=core.down,
         schedule_type=requests_lib.ScheduleType.SHORT,
@@ -1130,9 +1236,9 @@ async def down(request: fastapi.Request,
 async def start(request: fastapi.Request,
                 start_body: payloads.StartBody) -> None:
     """Restarts a cluster."""
-    executor.
+    await executor.schedule_request_async(
         request_id=request.state.request_id,
-        request_name=
+        request_name=request_names.RequestName.CLUSTER_START,
         request_body=start_body,
         func=core.start,
         schedule_type=requests_lib.ScheduleType.LONG,
@@ -1144,9 +1250,9 @@ async def start(request: fastapi.Request,
 async def autostop(request: fastapi.Request,
                    autostop_body: payloads.AutostopBody) -> None:
     """Schedules an autostop/autodown for a cluster."""
-    executor.
+    await executor.schedule_request_async(
         request_id=request.state.request_id,
-        request_name=
+        request_name=request_names.RequestName.CLUSTER_AUTOSTOP,
         request_body=autostop_body,
         func=core.autostop,
         schedule_type=requests_lib.ScheduleType.SHORT,
@@ -1158,9 +1264,9 @@ async def autostop(request: fastapi.Request,
 async def queue(request: fastapi.Request,
                 queue_body: payloads.QueueBody) -> None:
     """Gets the job queue of a cluster."""
-    executor.
+    await executor.schedule_request_async(
         request_id=request.state.request_id,
-        request_name=
+        request_name=request_names.RequestName.CLUSTER_QUEUE,
         request_body=queue_body,
         func=core.queue,
         schedule_type=requests_lib.ScheduleType.SHORT,
@@ -1172,9 +1278,9 @@ async def queue(request: fastapi.Request,
 async def job_status(request: fastapi.Request,
                      job_status_body: payloads.JobStatusBody) -> None:
     """Gets the status of a job."""
-    executor.
+    await executor.schedule_request_async(
         request_id=request.state.request_id,
-        request_name=
+        request_name=request_names.RequestName.CLUSTER_JOB_STATUS,
         request_body=job_status_body,
         func=core.job_status,
         schedule_type=requests_lib.ScheduleType.SHORT,
@@ -1186,9 +1292,9 @@ async def job_status(request: fastapi.Request,
 async def cancel(request: fastapi.Request,
                  cancel_body: payloads.CancelBody) -> None:
     """Cancels jobs on a cluster."""
-    executor.
+    await executor.schedule_request_async(
         request_id=request.state.request_id,
-        request_name=
+        request_name=request_names.RequestName.CLUSTER_JOB_CANCEL,
         request_body=cancel_body,
         func=core.cancel,
         schedule_type=requests_lib.ScheduleType.SHORT,
@@ -1205,32 +1311,24 @@ async def logs(
     # TODO(zhwu): This should wait for the request on the cluster, e.g., async
     # launch, to finish, so that a user does not need to manually pull the
    # request status.
-
+    executor.check_request_thread_executor_available()
+    request_task = await executor.prepare_request_async(
         request_id=request.state.request_id,
-        request_name=
+        request_name=request_names.RequestName.CLUSTER_JOB_LOGS,
         request_body=cluster_job_body,
         func=core.tail_logs,
         schedule_type=requests_lib.ScheduleType.SHORT,
+        request_cluster_name=cluster_job_body.cluster_name,
     )
-    task =
-
-    async def cancel_task():
-        try:
-            logger.info('Client disconnected for request: '
-                        f'{request.state.request_id}')
-            task.cancel()
-            await task
-        except asyncio.CancelledError:
-            pass
-
-    # Cancel the task after the request is done or client disconnects
-    background_tasks.add_task(cancel_task)
+    task = executor.execute_request_in_coroutine(request_task)
+    background_tasks.add_task(task.cancel)
     # TODO(zhwu): This makes viewing logs in browser impossible. We should adopt
     # the same approach as /stream.
-    return stream_utils.
+    return stream_utils.stream_response_for_long_request(
         request_id=request.state.request_id,
         logs_path=request_task.log_path,
         background_tasks=background_tasks,
+        kill_request_on_disconnect=False,
     )

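In the /logs hunk above, the request now runs as a coroutine returned by executor.execute_request_in_coroutine, and its cancel handle is registered with fastapi.BackgroundTasks, which FastAPI runs only after the streaming response has finished or the client has disconnected. A minimal sketch of that cleanup pattern with a generic asyncio task (illustrative; the SkyPilot executor and stream_utils wrappers are not reproduced here):

import asyncio

import fastapi
from fastapi import responses

app = fastapi.FastAPI()


async def _produce(queue: 'asyncio.Queue[str]') -> None:
    """Stand-in for the long-running request; feeds lines to the stream."""
    for i in range(1000):
        await queue.put(f'line {i}\n')
        await asyncio.sleep(0.1)


@app.get('/demo_stream')
async def demo_stream(background_tasks: fastapi.BackgroundTasks):
    queue: 'asyncio.Queue[str]' = asyncio.Queue()
    task = asyncio.create_task(_produce(queue))
    # Runs after the response completes (or the client goes away), so the
    # producer task never outlives the stream it feeds.
    background_tasks.add_task(task.cancel)

    async def stream():
        while not task.done():
            yield await queue.get()

    return responses.StreamingResponse(stream(), media_type='text/plain')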
@@ -1245,9 +1343,9 @@ async def download_logs(
     # We should reuse the original request body, so that the env vars, such as
     # user hash, are kept the same.
     cluster_jobs_body.local_dir = str(logs_dir_on_api_server)
-    executor.
+    await executor.schedule_request_async(
         request_id=request.state.request_id,
-        request_name=
+        request_name=request_names.RequestName.CLUSTER_JOB_DOWNLOAD_LOGS,
         request_body=cluster_jobs_body,
         func=core.download_logs,
         schedule_type=requests_lib.ScheduleType.SHORT,
@@ -1324,27 +1422,55 @@ async def download(download_body: payloads.DownloadBody,

 # TODO(aylei): run it asynchronously after global_user_state support async op
 @app.post('/provision_logs')
-def provision_logs(
+def provision_logs(provision_logs_body: payloads.ProvisionLogsBody,
                    follow: bool = True,
                    tail: int = 0) -> fastapi.responses.StreamingResponse:
     """Streams the provision.log for the latest launch request of a cluster."""
-
-
-
-
-
-
-
-
-
-
-
+    log_path = None
+    cluster_name = provision_logs_body.cluster_name
+    worker = provision_logs_body.worker
+    # stream head node logs
+    if worker is None:
+        # Prefer clusters table first, then cluster_history as fallback.
+        log_path_str = global_user_state.get_cluster_provision_log_path(
+            cluster_name)
+        if not log_path_str:
+            log_path_str = (
+                global_user_state.get_cluster_history_provision_log_path(
+                    cluster_name))
+        if not log_path_str:
+            raise fastapi.HTTPException(
+                status_code=404,
+                detail=('Provision log path is not recorded for this cluster. '
+                        'Please relaunch to generate provisioning logs.'))
+        log_path = pathlib.Path(log_path_str).expanduser().resolve()
+        if not log_path.exists():
+            raise fastapi.HTTPException(
+                status_code=404,
+                detail=f'Provision log path does not exist: {str(log_path)}')

-
-
-
-
-
+    # stream worker node logs
+    else:
+        handle = global_user_state.get_handle_from_cluster_name(cluster_name)
+        if handle is None:
+            raise fastapi.HTTPException(
+                status_code=404,
+                detail=('Cluster handle is not recorded for this cluster. '
+                        'Please relaunch to generate provisioning logs.'))
+        # instance_ids includes head node
+        instance_ids = handle.instance_ids
+        if instance_ids is None:
+            raise fastapi.HTTPException(
+                status_code=400,
+                detail='Instance IDs are not recorded for this cluster. '
+                'Please relaunch to generate provisioning logs.')
+        if worker > len(instance_ids) - 1:
+            raise fastapi.HTTPException(
+                status_code=400,
+                detail=f'Worker {worker} is out of range. '
+                f'The cluster has {len(instance_ids)} nodes.')
+        log_path = metadata_utils.get_instance_log_dir(
+            handle.get_cluster_name_on_cloud(), instance_ids[worker])

     # Tail semantics: 0 means print all lines. Convert 0 -> None for streamer.
     effective_tail = None if tail is None or tail <= 0 else tail
@@ -1353,7 +1479,8 @@ def provision_logs(cluster_body: payloads.ClusterNameBody,
         content=stream_utils.log_streamer(None,
                                           log_path,
                                           tail=effective_tail,
-                                          follow=follow
+                                          follow=follow,
+                                          cluster_name=cluster_name),
         media_type='text/plain',
         headers={
             'Cache-Control': 'no-cache, no-transform',
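The provision_logs handler above wraps stream_utils.log_streamer(...) in a StreamingResponse. For reference, a follow-style file streamer can be shaped as a simple async generator like the sketch below; this is a simplified illustration only (blocking file reads kept inline for brevity), not the real sky.server.stream_utils.log_streamer, which takes more parameters including the cluster_name added in this hunk:

import asyncio
import pathlib
from typing import AsyncIterator, Optional


async def tail_file(log_path: pathlib.Path,
                    tail: Optional[int] = None,
                    follow: bool = True) -> AsyncIterator[str]:
    """Yields lines from log_path, optionally starting at the last `tail` lines."""
    with log_path.open('r', encoding='utf-8', errors='replace') as f:
        lines = f.readlines()
        if tail is not None:
            lines = lines[-tail:]
        for line in lines:
            yield line
        while follow:
            line = f.readline()
            if line:
                yield line
            else:
                # No new data yet; poll again shortly.
                await asyncio.sleep(0.5)

Such a generator can be passed directly to fastapi.responses.StreamingResponse(..., media_type='text/plain'), as the handler above does with the real streamer.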
@@ -1367,9 +1494,9 @@ def provision_logs(cluster_body: payloads.ClusterNameBody,
|
|
|
1367
1494
|
async def cost_report(request: fastapi.Request,
|
|
1368
1495
|
cost_report_body: payloads.CostReportBody) -> None:
|
|
1369
1496
|
"""Gets the cost report of a cluster."""
|
|
1370
|
-
executor.
|
|
1497
|
+
await executor.schedule_request_async(
|
|
1371
1498
|
request_id=request.state.request_id,
|
|
1372
|
-
request_name=
|
|
1499
|
+
request_name=request_names.RequestName.CLUSTER_COST_REPORT,
|
|
1373
1500
|
request_body=cost_report_body,
|
|
1374
1501
|
func=core.cost_report,
|
|
1375
1502
|
schedule_type=requests_lib.ScheduleType.SHORT,
|
|
@@ -1379,10 +1506,11 @@ async def cost_report(request: fastapi.Request,
1379 1506 @app.get('/storage/ls')
1380 1507 async def storage_ls(request: fastapi.Request) -> None:
1381 1508 """Gets the storages."""
1382 - executor.
1509 + await executor.schedule_request_async(
1383 1510 request_id=request.state.request_id,
1384 - request_name=
1385 - request_body=
1511 + request_name=request_names.RequestName.STORAGE_LS,
1512 + request_body=server_utils.build_body_at_server(
1513 + request=request, body_type=payloads.RequestBody),
1386 1514 func=core.storage_ls,
1387 1515 schedule_type=requests_lib.ScheduleType.SHORT,
1388 1516 )
@@ -1392,9 +1520,9 @@ async def storage_ls(request: fastapi.Request) -> None:
1392 1520 async def storage_delete(request: fastapi.Request,
1393 1521 storage_body: payloads.StorageBody) -> None:
1394 1522 """Deletes a storage."""
1395 - executor.
1523 + await executor.schedule_request_async(
1396 1524 request_id=request.state.request_id,
1397 - request_name=
1525 + request_name=request_names.RequestName.STORAGE_DELETE,
1398 1526 request_body=storage_body,
1399 1527 func=core.storage_delete,
1400 1528 schedule_type=requests_lib.ScheduleType.LONG,
@@ -1405,9 +1533,9 @@ async def storage_delete(request: fastapi.Request,
1405 1533 async def local_up(request: fastapi.Request,
1406 1534 local_up_body: payloads.LocalUpBody) -> None:
1407 1535 """Launches a Kubernetes cluster on API server."""
1408 - executor.
1536 + await executor.schedule_request_async(
1409 1537 request_id=request.state.request_id,
1410 - request_name=
1538 + request_name=request_names.RequestName.LOCAL_UP,
1411 1539 request_body=local_up_body,
1412 1540 func=core.local_up,
1413 1541 schedule_type=requests_lib.ScheduleType.LONG,
@@ -1415,21 +1543,39 @@ async def local_up(request: fastapi.Request,
1415 1543
1416 1544
1417 1545 @app.post('/local_down')
1418 - async def local_down(request: fastapi.Request
1546 + async def local_down(request: fastapi.Request,
1547 + local_down_body: payloads.LocalDownBody) -> None:
1419 1548 """Tears down the Kubernetes cluster started by local_up."""
1420 - executor.
1549 + await executor.schedule_request_async(
1421 1550 request_id=request.state.request_id,
1422 - request_name=
1423 - request_body=
1551 + request_name=request_names.RequestName.LOCAL_DOWN,
1552 + request_body=local_down_body,
1424 1553 func=core.local_down,
1425 1554 schedule_type=requests_lib.ScheduleType.LONG,
1426 1555 )
1427 1556
1428 1557
1558 + async def get_expanded_request_id(request_id: str) -> str:
1559 + """Gets the expanded request ID for a given request ID prefix."""
1560 + request_tasks = await requests_lib.get_requests_async_with_prefix(
1561 + request_id, fields=['request_id'])
1562 + if request_tasks is None:
1563 + raise fastapi.HTTPException(status_code=404,
1564 + detail=f'Request {request_id!r} not found')
1565 + if len(request_tasks) > 1:
1566 + raise fastapi.HTTPException(status_code=400,
1567 + detail=('Multiple requests found for '
1568 + f'request ID prefix: {request_id}'))
1569 + return request_tasks[0].request_id
1570 +
1571 +
1429 1572 # === API server related APIs ===
1430 - @app.get('/api/get')
1573 + @app.get('/api/get', response_class=fastapi_responses.ORJSONResponse)
1431 1574 async def api_get(request_id: str) -> payloads.RequestPayload:
1432 1575 """Gets a request with a given request ID prefix."""
1576 + # Validate request_id prefix matches a single request.
1577 + request_id = await get_expanded_request_id(request_id)
1578 +
1433 1579 while True:
1434 1580 req_status = await requests_lib.get_request_status_async(request_id)
1435 1581 if req_status is None:
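The new `get_expanded_request_id` helper resolves a request-ID prefix to exactly one full request ID, rejecting unknown prefixes (404) and ambiguous ones (400). A hedged, self-contained sketch of the same resolution rule over a plain list of IDs, with no database or FastAPI involved:

```python
from typing import List


def expand_request_id(prefix: str, known_ids: List[str]) -> str:
    """Resolve a request-ID prefix to exactly one full request ID."""
    matches = [rid for rid in known_ids if rid.startswith(prefix)]
    if not matches:
        raise LookupError(f'Request {prefix!r} not found')
    if len(matches) > 1:
        raise ValueError(
            f'Multiple requests found for request ID prefix: {prefix}')
    return matches[0]


ids = ['a1b2c3-launch', 'a1f9e8-status', 'd4e5f6-logs']
print(expand_request_id('a1b', ids))  # a1b2c3-launch
print(expand_request_id('d4', ids))   # d4e5f6-logs
# expand_request_id('a1', ids) would raise ValueError: ambiguous prefix
```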
@@ -1446,6 +1592,8 @@ async def api_get(request_id: str) -> payloads.RequestPayload:
1446 1592 # to avoid storming the DB and CPU in the meantime
1447 1593 await asyncio.sleep(0.1)
1448 1594 request_task = await requests_lib.get_request_async(request_id)
1595 + # TODO(aylei): refine this, /api/get will not be retried and this is
1596 + # meaningless to retry. It is the original request that should be retried.
1449 1597 if request_task.should_retry:
1450 1598 raise fastapi.HTTPException(
1451 1599 status_code=503, detail=f'Request {request_id!r} should be retried')
@@ -1487,13 +1635,18 @@ async def stream(
1487 1635 clients, console for CLI/API clients), 'plain' (force plain text),
1488 1636 'html' (force HTML), or 'console' (force console)
1489 1637 """
1638 + # We need to save the user-supplied request ID for the response header.
1639 + user_supplied_request_id = request_id
1490 1640 if request_id is not None and log_path is not None:
1491 1641 raise fastapi.HTTPException(
1492 1642 status_code=400,
1493 1643 detail='Only one of request_id and log_path can be provided')
1494 1644
1645 + if request_id is not None:
1646 + request_id = await get_expanded_request_id(request_id)
1647 +
1495 1648 if request_id is None and log_path is None:
1496 - request_id = requests_lib.
1649 + request_id = await requests_lib.get_latest_request_id_async()
1497 1650 if request_id is None:
1498 1651 raise fastapi.HTTPException(status_code=404,
1499 1652 detail='No request found')
@@ -1520,13 +1673,17 @@ async def stream(
1520 1673 'X-Accel-Buffering': 'no'
1521 1674 })
1522 1675
1676 + polling_interval = stream_utils.DEFAULT_POLL_INTERVAL
1523 1677 # Original plain text streaming logic
1524 1678 if request_id is not None:
1525 - request_task = await requests_lib.get_request_async(
1679 + request_task = await requests_lib.get_request_async(
1680 + request_id, fields=['request_id', 'schedule_type'])
1526 1681 if request_task is None:
1527 1682 print(f'No task with request ID {request_id}')
1528 1683 raise fastapi.HTTPException(
1529 1684 status_code=404, detail=f'Request {request_id!r} not found')
1685 + # req.log_path is derived from request_id,
1686 + # so it's ok to just grab the request_id in the above query.
1530 1687 log_path_to_stream = request_task.log_path
1531 1688 if not log_path_to_stream.exists():
1532 1689 # The log file might be deleted by the request GC daemon but the
@@ -1534,6 +1691,9 @@ async def stream(
1534 1691 raise fastapi.HTTPException(
1535 1692 status_code=404,
1536 1693 detail=f'Log of request {request_id!r} has been deleted')
1694 + if request_task.schedule_type == requests_lib.ScheduleType.LONG:
1695 + polling_interval = stream_utils.LONG_REQUEST_POLL_INTERVAL
1696 + del request_task
1537 1697 else:
1538 1698 assert log_path is not None, (request_id, log_path)
1539 1699 if log_path == constants.API_SERVER_LOGS:
@@ -1567,18 +1727,26 @@ async def stream(
1567 1727 detail=f'Log path {log_path!r} does not exist')
1568 1728
1569 1729 log_path_to_stream = resolved_log_path
1730 +
1731 + headers = {
1732 + 'Cache-Control': 'no-cache, no-transform',
1733 + 'X-Accel-Buffering': 'no',
1734 + 'Transfer-Encoding': 'chunked'
1735 + }
1736 + if request_id is not None:
1737 + headers[server_constants.STREAM_REQUEST_HEADER] = (
1738 + user_supplied_request_id
1739 + if user_supplied_request_id else request_id)
1740 +
1570 1741 return fastapi.responses.StreamingResponse(
1571 1742 content=stream_utils.log_streamer(request_id,
1572 1743 log_path_to_stream,
1573 1744 plain_logs=format == 'plain',
1574 1745 tail=tail,
1575 - follow=follow
1746 + follow=follow,
1747 + polling_interval=polling_interval),
1576 1748 media_type='text/plain',
1577 - headers=
1578 - 'Cache-Control': 'no-cache, no-transform',
1579 - 'X-Accel-Buffering': 'no',
1580 - 'Transfer-Encoding': 'chunked'
1581 - },
1749 + headers=headers,
1582 1750 )
1583 1751
1584 1752
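The stream handler now builds its headers up front, widens the log-streamer polling interval for LONG-scheduled requests, and echoes the caller's (possibly prefix-form) request ID back in a dedicated response header. A hedged sketch of how a client might consume such a chunked text/plain stream with the `requests` library; the endpoint path, port, and the header name `X-Skypilot-Stream-Request-Id` are assumptions, since the diff only references the header as `server_constants.STREAM_REQUEST_HEADER`:

```python
import requests

# Assumed local API server address and stream endpoint path.
STREAM_HEADER = 'X-Skypilot-Stream-Request-Id'  # placeholder header name

with requests.get('http://127.0.0.1:46580/api/stream',
                  params={'request_id': 'a1b', 'follow': 'false'},
                  stream=True,
                  timeout=30) as resp:
    resp.raise_for_status()
    # The server echoes back the request ID the caller supplied (possibly a
    # prefix), so a CLI can correlate the stream with its original request.
    print('streaming request:', resp.headers.get(STREAM_HEADER))
    for line in resp.iter_lines(decode_unicode=True):
        if line:
            print(line)
```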
@@ -1586,11 +1754,11 @@ async def stream(
1586 1754 async def api_cancel(request: fastapi.Request,
1587 1755 request_cancel_body: payloads.RequestCancelBody) -> None:
1588 1756 """Cancels requests."""
1589 - executor.
1757 + await executor.schedule_request_async(
1590 1758 request_id=request.state.request_id,
1591 - request_name=
1759 + request_name=request_names.RequestName.API_CANCEL,
1592 1760 request_body=request_cancel_body,
1593 - func=requests_lib.
1761 + func=requests_lib.kill_requests_with_prefix,
1594 1762 schedule_type=requests_lib.ScheduleType.SHORT,
1595 1763 )
1596 1764
@@ -1598,9 +1766,13 @@ async def api_cancel(request: fastapi.Request,
1598 1766 @app.get('/api/status')
1599 1767 async def api_status(
1600 1768 request_ids: Optional[List[str]] = fastapi.Query(
1601 - None, description='Request
1769 + None, description='Request ID prefixes to get status for.'),
1602 1770 all_status: bool = fastapi.Query(
1603 1771 False, description='Get finished requests as well.'),
1772 + limit: Optional[int] = fastapi.Query(
1773 + None, description='Number of requests to show.'),
1774 + fields: Optional[List[str]] = fastapi.Query(
1775 + None, description='Fields to get. If None, get all fields.'),
1604 1776 ) -> List[payloads.RequestPayload]:
1605 1777 """Gets the list of requests."""
1606 1778 if request_ids is None:
@@ -1611,18 +1783,34 @@ async def api_status(
1611 1783 requests_lib.RequestStatus.RUNNING,
1612 1784 ]
1613 1785 request_tasks = await requests_lib.get_request_tasks_async(
1614 - req_filter=requests_lib.RequestTaskFilter(
1615 -
1786 + req_filter=requests_lib.RequestTaskFilter(
1787 + status=statuses,
1788 + limit=limit,
1789 + fields=fields,
1790 + sort=True,
1791 + ))
1792 + return requests_lib.encode_requests(request_tasks)
1616 1793 else:
1617 1794 encoded_request_tasks = []
1618 1795 for request_id in request_ids:
1619 -
1620 -
1796 + request_tasks = await requests_lib.get_requests_async_with_prefix(
1797 + request_id)
1798 + if request_tasks is None:
1621 1799 continue
1622 -
1800 + for request_task in request_tasks:
1801 + encoded_request_tasks.append(request_task.readable_encode())
1623 1802 return encoded_request_tasks
1624 1803
1625 1804
1805 + @app.get('/api/plugins', response_class=fastapi_responses.ORJSONResponse)
1806 + async def list_plugins() -> Dict[str, List[Dict[str, Any]]]:
1807 + """Return metadata about loaded backend plugins."""
1808 + plugin_info = [{
1809 + 'js_extension_path': plugin.js_extension_path,
1810 + } for plugin in plugins.get_plugins()]
1811 + return {'plugins': plugin_info}
1812 +
1813 +
1626 1814 @app.get(
1627 1815 '/api/health',
1628 1816 # response_model_exclude_unset omits unset fields
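`/api/status` gains `limit` and `fields` query parameters, so clients can cap the number of returned requests and fetch only the columns they need instead of full rows. A hedged usage sketch with the `requests` library; the host/port and the exact response field names are assumptions, and repeated `fields` parameters mirror how FastAPI parses `List[str]` query values:

```python
import requests

# Assumed local API server address; parameters mirror the handler signature
# shown in the diff (request_ids, all_status, limit, fields).
resp = requests.get(
    'http://127.0.0.1:46580/api/status',
    params={
        'all_status': 'true',
        'limit': 20,
        'fields': ['request_id', 'status', 'name'],
    },
    timeout=10)
resp.raise_for_status()
for req in resp.json():
    print(req.get('request_id'), req.get('status'))
```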
@@ -1679,23 +1867,44 @@ async def health(request: fastapi.Request) -> responses.APIHealthResponse:
1679 1867 version=sky.__version__,
1680 1868 version_on_disk=common.get_skypilot_version_on_disk(),
1681 1869 commit=sky.__commit__,
1870 + # Whether basic auth on api server is enabled
1682 1871 basic_auth_enabled=os.environ.get(constants.ENV_VAR_ENABLE_BASIC_AUTH,
1683 1872 'false').lower() == 'true',
1684 1873 user=user if user is not None else None,
1874 + # Whether service account token is enabled
1875 + service_account_token_enabled=(os.environ.get(
1876 + constants.ENV_VAR_ENABLE_SERVICE_ACCOUNTS,
1877 + 'false').lower() == 'true'),
1878 + # Whether basic auth on ingress is enabled
1879 + ingress_basic_auth_enabled=os.environ.get(
1880 + constants.SKYPILOT_INGRESS_BASIC_AUTH_ENABLED,
1881 + 'false').lower() == 'true',
1685 1882 )
1686 1883
1687 1884
1885 + class KubernetesSSHMessageType(IntEnum):
1886 + REGULAR_DATA = 0
1887 + PINGPONG = 1
1888 + LATENCY_MEASUREMENT = 2
1889 +
1890 +
1688 1891 @app.websocket('/kubernetes-pod-ssh-proxy')
1689 - async def kubernetes_pod_ssh_proxy(
1690 -
1892 + async def kubernetes_pod_ssh_proxy(
1893 + websocket: fastapi.WebSocket,
1894 + cluster_name: str,
1895 + client_version: Optional[int] = None) -> None:
1691 1896 """Proxies SSH to the Kubernetes pod with websocket."""
1692 1897 await websocket.accept()
1693 1898 logger.info(f'WebSocket connection accepted for cluster: {cluster_name}')
1694 1899
1900 + timestamps_supported = client_version is not None and client_version > 21
1901 + logger.info(f'Websocket timestamps supported: {timestamps_supported}, \
1902 + client_version = {client_version}')
1903 +
1695 1904 # Run core.status in another thread to avoid blocking the event loop.
1696 -
1697 -
1698 -
1905 + with ThreadPoolExecutor(max_workers=1) as thread_pool_executor:
1906 + cluster_records = await context_utils.to_thread_with_executor(
1907 + thread_pool_executor, core.status, cluster_name, all_users=True)
1699 1908 cluster_record = cluster_records[0]
1700 1909 if cluster_record['status'] != status_lib.ClusterStatus.UP:
1701 1910 raise fastapi.HTTPException(
@@ -1734,17 +1943,70 @@ async def kubernetes_pod_ssh_proxy(websocket: fastapi.WebSocket,
1734 1943 return
1735 1944
1736 1945 logger.info(f'Starting port-forward to local port: {local_port}')
1946 + conn_gauge = metrics_utils.SKY_APISERVER_WEBSOCKET_CONNECTIONS.labels(
1947 + pid=os.getpid())
1948 + ssh_failed = False
1949 + websocket_closed = False
1737 1950 try:
1951 + conn_gauge.inc()
1738 1952 # Connect to the local port
1739 1953 reader, writer = await asyncio.open_connection('127.0.0.1', local_port)
1740 1954
1741 1955 async def websocket_to_ssh():
1742 1956 try:
1743 1957 async for message in websocket.iter_bytes():
1958 + if timestamps_supported:
1959 + type_size = struct.calcsize('!B')
1960 + message_type = struct.unpack('!B',
1961 + message[:type_size])[0]
1962 + if (message_type ==
1963 + KubernetesSSHMessageType.REGULAR_DATA):
1964 + # Regular data - strip type byte and forward to SSH
1965 + message = message[type_size:]
1966 + elif message_type == KubernetesSSHMessageType.PINGPONG:
1967 + # PING message - respond with PONG (type 1)
1968 + ping_id_size = struct.calcsize('!I')
1969 + if len(message) != type_size + ping_id_size:
1970 + raise ValueError('Invalid PING message '
1971 + f'length: {len(message)}')
1972 + # Return the same PING message, so that the client
1973 + # can measure the latency.
1974 + await websocket.send_bytes(message)
1975 + continue
1976 + elif (message_type ==
1977 + KubernetesSSHMessageType.LATENCY_MEASUREMENT):
1978 + # Latency measurement from client
1979 + latency_size = struct.calcsize('!Q')
1980 + if len(message) != type_size + latency_size:
1981 + raise ValueError(
1982 + 'Invalid latency measurement '
1983 + f'message length: {len(message)}')
1984 + avg_latency_ms = struct.unpack(
1985 + '!Q',
1986 + message[type_size:type_size + latency_size])[0]
1987 + latency_seconds = avg_latency_ms / 1000
1988 + metrics_utils.SKY_APISERVER_WEBSOCKET_SSH_LATENCY_SECONDS.labels(pid=os.getpid()).observe(latency_seconds) # pylint: disable=line-too-long
1989 + continue
1990 + else:
1991 + # Unknown message type.
1992 + raise ValueError(
1993 + f'Unknown message type: {message_type}')
1744 1994 writer.write(message)
1745 -
1995 + try:
1996 + await writer.drain()
1997 + except Exception as e: # pylint: disable=broad-except
1998 + # Typically we will not reach here, if the ssh to pod
1999 + # is disconnected, ssh_to_websocket will exit first.
2000 + # But just in case.
2001 + logger.error('Failed to write to pod through '
2002 + f'port-forward connection: {e}')
2003 + nonlocal ssh_failed
2004 + ssh_failed = True
2005 + break
1746 2006 except fastapi.WebSocketDisconnect:
1747 2007 pass
2008 + nonlocal websocket_closed
2009 + websocket_closed = True
1748 2010 writer.close()
1749 2011
1750 2012 async def ssh_to_websocket():
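When the client is new enough (`client_version > 21`), every websocket frame now carries a one-byte type header: 0 for raw SSH payload, 1 for a PING that the server echoes back so the client can measure round-trip time, and 2 for a latency report packed as an unsigned 64-bit millisecond value. A hedged client-side sketch of that framing, using only the struct formats visible in the diff (`!B`, `!I`, `!Q`); the enum values are copied from `KubernetesSSHMessageType`, everything else is illustrative:

```python
import struct

REGULAR_DATA = 0
PINGPONG = 1
LATENCY_MEASUREMENT = 2


def frame_ssh_data(payload: bytes) -> bytes:
    """Prefix raw SSH bytes with the REGULAR_DATA type byte."""
    return struct.pack('!B', REGULAR_DATA) + payload


def frame_ping(ping_id: int) -> bytes:
    """Type byte + 32-bit ping ID; the server echoes this frame unchanged."""
    return struct.pack('!B', PINGPONG) + struct.pack('!I', ping_id)


def frame_latency_report(avg_latency_ms: int) -> bytes:
    """Type byte + 64-bit average latency in milliseconds."""
    return struct.pack('!B', LATENCY_MEASUREMENT) + struct.pack(
        '!Q', avg_latency_ms)


def parse_frame(frame: bytes):
    """Split an incoming frame into (message_type, body)."""
    (message_type,) = struct.unpack('!B', frame[:1])
    return message_type, frame[1:]


ping = frame_ping(7)
assert parse_frame(ping) == (PINGPONG, struct.pack('!I', 7))
assert parse_frame(frame_ssh_data(b'ls\n')) == (REGULAR_DATA, b'ls\n')
```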
@@ -1752,62 +2014,65 @@ async def kubernetes_pod_ssh_proxy(websocket: fastapi.WebSocket,
1752 2014 while True:
1753 2015 data = await reader.read(1024)
1754 2016 if not data:
2017 + if not websocket_closed:
2018 + logger.warning('SSH connection to pod is '
2019 + 'disconnected before websocket '
2020 + 'connection is closed')
2021 + nonlocal ssh_failed
2022 + ssh_failed = True
1755 2023 break
2024 + if timestamps_supported:
2025 + # Prepend message type byte (0 = regular data)
2026 + message_type_bytes = struct.pack(
2027 + '!B', KubernetesSSHMessageType.REGULAR_DATA.value)
2028 + data = message_type_bytes + data
1756 2029 await websocket.send_bytes(data)
1757 2030 except Exception: # pylint: disable=broad-except
1758 2031 pass
1759 -
2032 + try:
2033 + await websocket.close()
2034 + except Exception: # pylint: disable=broad-except
2035 + # The websocket might has been closed by the client.
2036 + pass
1760 2037
1761 2038 await asyncio.gather(websocket_to_ssh(), ssh_to_websocket())
1762 2039 finally:
1763 -
2040 + conn_gauge.dec()
2041 + reason = ''
2042 + try:
2043 + logger.info('Terminating kubectl port-forward process')
2044 + proc.terminate()
2045 + except ProcessLookupError:
2046 + stdout = await proc.stdout.read()
2047 + logger.error('kubectl port-forward was terminated before the '
2048 + 'ssh websocket connection was closed. Remaining '
2049 + f'output: {str(stdout)}')
2050 + reason = 'KubectlPortForwardExit'
2051 + metrics_utils.SKY_APISERVER_WEBSOCKET_CLOSED_TOTAL.labels(
2052 + pid=os.getpid(), reason='KubectlPortForwardExit').inc()
2053 + else:
2054 + if ssh_failed:
2055 + reason = 'SSHToPodDisconnected'
2056 + else:
2057 + reason = 'ClientClosed'
2058 + metrics_utils.SKY_APISERVER_WEBSOCKET_CLOSED_TOTAL.labels(
2059 + pid=os.getpid(), reason=reason).inc()
1764 2060
1765 2061
1766 2062 @app.get('/all_contexts')
1767 2063 async def all_contexts(request: fastapi.Request) -> None:
1768 2064 """Gets all Kubernetes and SSH node pool contexts."""
1769 2065
1770 - executor.
2066 + await executor.schedule_request_async(
1771 2067 request_id=request.state.request_id,
1772 - request_name=
1773 - request_body=
2068 + request_name=request_names.RequestName.ALL_CONTEXTS,
2069 + request_body=server_utils.build_body_at_server(
2070 + request=request, body_type=payloads.RequestBody),
1774 2071 func=core.get_all_contexts,
1775 2072 schedule_type=requests_lib.ScheduleType.SHORT,
1776 2073 )
1777 2074
1778 2075
1779 - @app.get('/gpu-metrics')
1780 - async def gpu_metrics() -> fastapi.Response:
1781 - """Gets the GPU metrics from multiple external k8s clusters"""
1782 - contexts = core.get_all_contexts()
1783 - all_metrics = []
1784 - successful_contexts = 0
1785 -
1786 - tasks = [
1787 - asyncio.create_task(metrics_utils.get_metrics_for_context(context))
1788 - for context in contexts
1789 - if context != 'in-cluster'
1790 - ]
1791 -
1792 - results = await asyncio.gather(*tasks, return_exceptions=True)
1793 -
1794 - for i, result in enumerate(results):
1795 - if isinstance(result, Exception):
1796 - logger.error(
1797 - f'Failed to get metrics for context {contexts[i]}: {result}')
1798 - else:
1799 - metrics_text = result
1800 - all_metrics.append(metrics_text)
1801 - successful_contexts += 1
1802 -
1803 - combined_metrics = '\n\n'.join(all_metrics)
1804 -
1805 - # Return as plain text for Prometheus compatibility
1806 - return fastapi.Response(
1807 - content=combined_metrics,
1808 - media_type='text/plain; version=0.0.4; charset=utf-8')
1809 -
1810 -
1811 2076 # === Internal APIs ===
1812 2077 @app.get('/api/completion/cluster_name')
1813 2078 async def complete_cluster_name(incomplete: str,) -> List[str]:
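The SSH proxy now tracks its connection lifecycle with Prometheus-style metrics: a gauge incremented on connect and decremented in the `finally` block, a counter labelled with a close reason (`KubectlPortForwardExit`, `SSHToPodDisconnected`, `ClientClosed`), and a latency metric fed by the client's reports. A hedged sketch of how such metrics could be declared with `prometheus_client`; the real definitions live in `metrics_utils` and are not shown in this diff, so the metric names and types below are assumptions:

```python
import os

from prometheus_client import Counter, Gauge, Histogram

# Assumed definitions; the actual ones live in the server's metrics_utils.
WEBSOCKET_CONNECTIONS = Gauge(
    'sky_apiserver_websocket_connections',
    'Open SSH-proxy websocket connections.', ['pid'])
WEBSOCKET_CLOSED_TOTAL = Counter(
    'sky_apiserver_websocket_closed_total',
    'Closed SSH-proxy websocket connections, by reason.', ['pid', 'reason'])
WEBSOCKET_SSH_LATENCY_SECONDS = Histogram(
    'sky_apiserver_websocket_ssh_latency_seconds',
    'Client-reported SSH round-trip latency.', ['pid'])

conn_gauge = WEBSOCKET_CONNECTIONS.labels(pid=os.getpid())
conn_gauge.inc()
try:
    # ... proxy traffic; record client latency reports as they arrive ...
    WEBSOCKET_SSH_LATENCY_SECONDS.labels(pid=os.getpid()).observe(0.042)
finally:
    conn_gauge.dec()
    WEBSOCKET_CLOSED_TOTAL.labels(pid=os.getpid(), reason='ClientClosed').inc()
```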
@@ -1852,6 +2117,14 @@ async def serve_dashboard(full_path: str):
1852 2117 if os.path.isfile(file_path):
1853 2118 return fastapi.responses.FileResponse(file_path)
1854 2119
2120 + # Serve plugin catch-all page for any /plugins/* paths so client-side
2121 + # routing can bootstrap correctly.
2122 + if full_path == 'plugins' or full_path.startswith('plugins/'):
2123 + plugin_catchall = os.path.join(server_constants.DASHBOARD_DIR,
2124 + 'plugins', '[...slug].html')
2125 + if os.path.isfile(plugin_catchall):
2126 + return fastapi.responses.FileResponse(plugin_catchall)
2127 +
1855 2128 # Serve index.html for client-side routing
1856 2129 # e.g. /clusters, /jobs
1857 2130 index_path = os.path.join(server_constants.DASHBOARD_DIR, 'index.html')
@@ -1905,6 +2178,7 @@ if __name__ == '__main__':
1905 2178
1906 2179 from sky.server import uvicorn as skyuvicorn
1907 2180
2181 + logger.info('Initializing SkyPilot API server')
1908 2182 skyuvicorn.add_timestamp_prefix_for_server_logs()
1909 2183
1910 2184 parser = argparse.ArgumentParser()
@@ -1916,22 +2190,63 @@ if __name__ == '__main__':
1916 2190 parser.add_argument('--metrics-port', default=9090, type=int)
1917 2191 cmd_args = parser.parse_args()
1918 2192 if cmd_args.port == cmd_args.metrics_port:
2193 + logger.error('port and metrics-port cannot be the same, exiting.')
1919 2194 raise ValueError('port and metrics-port cannot be the same')
1920 2195
2196 + # Fail fast if the port is not available to avoid corrupt the state
2197 + # of potential running server instance.
2198 + # We might reach here because the running server is currently not
2199 + # responding, thus the healthz check fails and `sky api start` think
2200 + # we should start a new server instance.
2201 + if not common_utils.is_port_available(cmd_args.port):
2202 + logger.error(f'Port {cmd_args.port} is not available, exiting.')
2203 + raise RuntimeError(f'Port {cmd_args.port} is not available')
2204 +
2205 + # Maybe touch the signal file on API server startup. Do it again here even
2206 + # if we already touched it in the sky/server/common.py::_start_api_server.
2207 + # This is because the sky/server/common.py::_start_api_server function call
2208 + # is running outside the skypilot API server process tree. The process tree
2209 + # starts within that function (see the `subprocess.Popen` call in
2210 + # sky/server/common.py::_start_api_server). When pg is used, the
2211 + # _start_api_server function will not load the config file from db, which
2212 + # will ignore the consolidation mode config. Here, inside the process tree,
2213 + # we already reload the config as a server (with env var _start_api_server),
2214 + # so we will respect the consolidation mode config.
2215 + # Refers to #7717 for more details.
2216 + managed_job_utils.is_consolidation_mode(on_api_restart=True)
2217 +
1921 2218 # Show the privacy policy if it is not already shown. We place it here so
1922 2219 # that it is shown only when the API server is started.
1923 2220 usage_lib.maybe_show_privacy_policy()
1924 2221
1925 2222 # Initialize global user state db
1926 2223 db_utils.set_max_connections(1)
2224 + logger.info('Initializing database engine')
1927 2225 global_user_state.initialize_and_get_db()
2226 + logger.info('Database engine initialized')
1928 2227 # Initialize request db
1929 2228 requests_lib.reset_db_and_logs()
1930 2229 # Restore the server user hash
2230 + logger.info('Initializing server user hash')
1931 2231 _init_or_restore_server_user_hash()
2232 +
1932 2233 max_db_connections = global_user_state.get_max_db_connections()
1933 -
1934 -
2234 + logger.info(f'Max db connections: {max_db_connections}')
2235 +
2236 + # Reserve memory for jobs and serve/pool controller in consolidation mode.
2237 + reserved_memory_mb = (
2238 + controller_utils.compute_memory_reserved_for_controllers(
2239 + reserve_for_controllers=os.environ.get(
2240 + constants.OVERRIDE_CONSOLIDATION_MODE) is not None,
2241 + # For jobs controller, we need to reserve for both jobs and
2242 + # pool controller.
2243 + reserve_extra_for_pool=not os.environ.get(
2244 + constants.IS_SKYPILOT_SERVE_CONTROLLER)))
2245 +
2246 + config = server_config.compute_server_config(
2247 + cmd_args.deploy,
2248 + max_db_connections,
2249 + reserved_memory_mb=reserved_memory_mb)
1935 2250
1936 2251 num_workers = config.num_server_workers
1937 2252
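The startup path now refuses to boot when the configured port is already bound, so a half-alive server instance is never clobbered by a second `sky api start`. The diff calls `common_utils.is_port_available`; the sketch below is a hedged, generic illustration of such a check using only the standard library, not SkyPilot's implementation:

```python
import socket


def is_port_available(port: int, host: str = '0.0.0.0') -> bool:
    """Return True if the TCP port can be bound, i.e. nothing is listening."""
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
        sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
        try:
            sock.bind((host, port))
        except OSError:
            return False
        return True


if not is_port_available(46580):  # assumed default API server port
    raise RuntimeError('Port 46580 is not available')
```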
@@ -1960,7 +2275,8 @@ if __name__ == '__main__':
1960 2275 uvicorn_config = uvicorn.Config('sky.server.server:app',
1961 2276 host=cmd_args.host,
1962 2277 port=cmd_args.port,
1963 - workers=num_workers
2278 + workers=num_workers,
2279 + ws_per_message_deflate=False)
1964 2280 skyuvicorn.run(uvicorn_config,
1965 2281 max_db_connections=config.num_db_connections_per_worker)
1966 2282 except Exception as exc: # pylint: disable=broad-except
@@ -1972,6 +2288,8 @@ if __name__ == '__main__':
1972 2288
1973 2289 for gt in global_tasks:
1974 2290 gt.cancel()
2291 + for plugin in plugins.get_plugins():
2292 + plugin.shutdown()
1975 2293 subprocess_utils.run_in_parallel(lambda worker: worker.cancel(),
1976 2294 workers,
1977 2295 num_threads=len(workers))