PyPI - skypilot-nightly - Versions diffs - 1.0.0.dev20250509__py3-none-any.whl → 1.0.0.dev20251107__py3-none-any.whl - Mend

skypilot-nightly 1.0.0.dev20250509-py3-none-any.whl → 1.0.0.dev20251107-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of skypilot-nightly might be problematic. Click here for more details.

Files changed (512)

sky/__init__.py +22 -6
sky/adaptors/aws.py +25 -7
sky/adaptors/common.py +24 -1
sky/adaptors/coreweave.py +278 -0
sky/adaptors/do.py +8 -2
sky/adaptors/hyperbolic.py +8 -0
sky/adaptors/kubernetes.py +149 -18
sky/adaptors/nebius.py +170 -17
sky/adaptors/primeintellect.py +1 -0
sky/adaptors/runpod.py +68 -0
sky/adaptors/seeweb.py +167 -0
sky/adaptors/shadeform.py +89 -0
sky/admin_policy.py +187 -4
sky/authentication.py +179 -225
sky/backends/__init__.py +4 -2
sky/backends/backend.py +22 -9
sky/backends/backend_utils.py +1299 -380
sky/backends/cloud_vm_ray_backend.py +1715 -518
sky/backends/docker_utils.py +1 -1
sky/backends/local_docker_backend.py +11 -6
sky/backends/wheel_utils.py +37 -9
sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
sky/{clouds/service_catalog → catalog}/common.py +89 -48
sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +30 -40
sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +42 -15
sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
sky/catalog/data_fetchers/fetch_nebius.py +335 -0
sky/catalog/data_fetchers/fetch_runpod.py +698 -0
sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
sky/catalog/hyperbolic_catalog.py +136 -0
sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
sky/catalog/primeintellect_catalog.py +95 -0
sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
sky/catalog/seeweb_catalog.py +184 -0
sky/catalog/shadeform_catalog.py +165 -0
sky/catalog/ssh_catalog.py +167 -0
sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
sky/check.py +491 -203
sky/cli.py +5 -6005
sky/client/{cli.py → cli/command.py} +2477 -1885
sky/client/cli/deprecation_utils.py +99 -0
sky/client/cli/flags.py +359 -0
sky/client/cli/table_utils.py +320 -0
sky/client/common.py +70 -32
sky/client/oauth.py +82 -0
sky/client/sdk.py +1203 -297
sky/client/sdk_async.py +833 -0
sky/client/service_account_auth.py +47 -0
sky/cloud_stores.py +73 -0
sky/clouds/__init__.py +13 -0
sky/clouds/aws.py +358 -93
sky/clouds/azure.py +105 -83
sky/clouds/cloud.py +127 -36
sky/clouds/cudo.py +68 -50
sky/clouds/do.py +66 -48
sky/clouds/fluidstack.py +63 -44
sky/clouds/gcp.py +339 -110
sky/clouds/hyperbolic.py +293 -0
sky/clouds/ibm.py +70 -49
sky/clouds/kubernetes.py +563 -162
sky/clouds/lambda_cloud.py +74 -54
sky/clouds/nebius.py +206 -80
sky/clouds/oci.py +88 -66
sky/clouds/paperspace.py +61 -44
sky/clouds/primeintellect.py +317 -0
sky/clouds/runpod.py +164 -74
sky/clouds/scp.py +89 -83
sky/clouds/seeweb.py +466 -0
sky/clouds/shadeform.py +400 -0
sky/clouds/ssh.py +263 -0
sky/clouds/utils/aws_utils.py +10 -4
sky/clouds/utils/gcp_utils.py +87 -11
sky/clouds/utils/oci_utils.py +38 -14
sky/clouds/utils/scp_utils.py +177 -124
sky/clouds/vast.py +99 -77
sky/clouds/vsphere.py +51 -40
sky/core.py +349 -139
sky/dag.py +15 -0
sky/dashboard/out/404.html +1 -1
sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
sky/dashboard/out/_next/static/chunks/1871-74503c8e80fd253b.js +6 -0
sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
sky/dashboard/out/_next/static/chunks/2755.fff53c4a3fcae910.js +26 -0
sky/dashboard/out/_next/static/chunks/3294.72362fa129305b19.js +1 -0
sky/dashboard/out/_next/static/chunks/3785.ad6adaa2a0fa9768.js +1 -0
sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
sky/dashboard/out/_next/static/chunks/4725.a830b5c9e7867c92.js +1 -0
sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
sky/dashboard/out/_next/static/chunks/6601-06114c982db410b6.js +1 -0
sky/dashboard/out/_next/static/chunks/6856-ef8ba11f96d8c4a3.js +1 -0
sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
sky/dashboard/out/_next/static/chunks/6990-32b6e2d3822301fa.js +1 -0
sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
sky/dashboard/out/_next/static/chunks/7615-3301e838e5f25772.js +1 -0
sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
sky/dashboard/out/_next/static/chunks/8969-1e4613c651bf4051.js +1 -0
sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
sky/dashboard/out/_next/static/chunks/9360.7310982cf5a0dc79.js +31 -0
sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-c736ead69c2d86ec.js +16 -0
sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a37d2063af475a1c.js +1 -0
sky/dashboard/out/_next/static/chunks/pages/clusters-d44859594e6f8064.js +1 -0
sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-5796e8d6aea291a0.js +16 -0
sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-6edeb7d06032adfc.js +21 -0
sky/dashboard/out/_next/static/chunks/pages/jobs-479dde13399cf270.js +1 -0
sky/dashboard/out/_next/static/chunks/pages/users-5ab3b907622cf0fe.js +1 -0
sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-c5a3eeee1c218af1.js +1 -0
sky/dashboard/out/_next/static/chunks/pages/workspaces-22b23febb3e89ce1.js +1 -0
sky/dashboard/out/_next/static/chunks/webpack-2679be77fc08a2f8.js +1 -0
sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
sky/dashboard/out/_next/static/zB0ed6ge_W1MDszVHhijS/_buildManifest.js +1 -0
sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
sky/dashboard/out/clusters/[cluster].html +1 -1
sky/dashboard/out/clusters.html +1 -1
sky/dashboard/out/config.html +1 -0
sky/dashboard/out/index.html +1 -1
sky/dashboard/out/infra/[context].html +1 -0
sky/dashboard/out/infra.html +1 -0
sky/dashboard/out/jobs/[job].html +1 -1
sky/dashboard/out/jobs/pools/[pool].html +1 -0
sky/dashboard/out/jobs.html +1 -1
sky/dashboard/out/users.html +1 -0
sky/dashboard/out/volumes.html +1 -0
sky/dashboard/out/workspace/new.html +1 -0
sky/dashboard/out/workspaces/[name].html +1 -0
sky/dashboard/out/workspaces.html +1 -0
sky/data/data_utils.py +137 -1
sky/data/mounting_utils.py +269 -84
sky/data/storage.py +1451 -1807
sky/data/storage_utils.py +43 -57
sky/exceptions.py +132 -2
sky/execution.py +206 -63
sky/global_user_state.py +2374 -586
sky/jobs/__init__.py +5 -0
sky/jobs/client/sdk.py +242 -65
sky/jobs/client/sdk_async.py +143 -0
sky/jobs/constants.py +9 -8
sky/jobs/controller.py +839 -277
sky/jobs/file_content_utils.py +80 -0
sky/jobs/log_gc.py +201 -0
sky/jobs/recovery_strategy.py +398 -152
sky/jobs/scheduler.py +315 -189
sky/jobs/server/core.py +829 -255
sky/jobs/server/server.py +156 -115
sky/jobs/server/utils.py +136 -0
sky/jobs/state.py +2092 -701
sky/jobs/utils.py +1242 -160
sky/logs/__init__.py +21 -0
sky/logs/agent.py +108 -0
sky/logs/aws.py +243 -0
sky/logs/gcp.py +91 -0
sky/metrics/__init__.py +0 -0
sky/metrics/utils.py +443 -0
sky/models.py +78 -1
sky/optimizer.py +164 -70
sky/provision/__init__.py +90 -4
sky/provision/aws/config.py +147 -26
sky/provision/aws/instance.py +135 -50
sky/provision/azure/instance.py +10 -5
sky/provision/common.py +13 -1
sky/provision/cudo/cudo_machine_type.py +1 -1
sky/provision/cudo/cudo_utils.py +14 -8
sky/provision/cudo/cudo_wrapper.py +72 -71
sky/provision/cudo/instance.py +10 -6
sky/provision/do/instance.py +10 -6
sky/provision/do/utils.py +4 -3
sky/provision/docker_utils.py +114 -23
sky/provision/fluidstack/instance.py +13 -8
sky/provision/gcp/__init__.py +1 -0
sky/provision/gcp/config.py +301 -19
sky/provision/gcp/constants.py +218 -0
sky/provision/gcp/instance.py +36 -8
sky/provision/gcp/instance_utils.py +18 -4
sky/provision/gcp/volume_utils.py +247 -0
sky/provision/hyperbolic/__init__.py +12 -0
sky/provision/hyperbolic/config.py +10 -0
sky/provision/hyperbolic/instance.py +437 -0
sky/provision/hyperbolic/utils.py +373 -0
sky/provision/instance_setup.py +93 -14
sky/provision/kubernetes/__init__.py +5 -0
sky/provision/kubernetes/config.py +9 -52
sky/provision/kubernetes/constants.py +17 -0
sky/provision/kubernetes/instance.py +789 -247
sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
sky/provision/kubernetes/network.py +27 -17
sky/provision/kubernetes/network_utils.py +40 -43
sky/provision/kubernetes/utils.py +1192 -531
sky/provision/kubernetes/volume.py +282 -0
sky/provision/lambda_cloud/instance.py +22 -16
sky/provision/nebius/constants.py +50 -0
sky/provision/nebius/instance.py +19 -6
sky/provision/nebius/utils.py +196 -91
sky/provision/oci/instance.py +10 -5
sky/provision/paperspace/instance.py +10 -7
sky/provision/paperspace/utils.py +1 -1
sky/provision/primeintellect/__init__.py +10 -0
sky/provision/primeintellect/config.py +11 -0
sky/provision/primeintellect/instance.py +454 -0
sky/provision/primeintellect/utils.py +398 -0
sky/provision/provisioner.py +110 -36
sky/provision/runpod/__init__.py +5 -0
sky/provision/runpod/instance.py +27 -6
sky/provision/runpod/utils.py +51 -18
sky/provision/runpod/volume.py +180 -0
sky/provision/scp/__init__.py +15 -0
sky/provision/scp/config.py +93 -0
sky/provision/scp/instance.py +531 -0
sky/provision/seeweb/__init__.py +11 -0
sky/provision/seeweb/config.py +13 -0
sky/provision/seeweb/instance.py +807 -0
sky/provision/shadeform/__init__.py +11 -0
sky/provision/shadeform/config.py +12 -0
sky/provision/shadeform/instance.py +351 -0
sky/provision/shadeform/shadeform_utils.py +83 -0
sky/provision/ssh/__init__.py +18 -0
sky/provision/vast/instance.py +13 -8
sky/provision/vast/utils.py +10 -7
sky/provision/vsphere/common/vim_utils.py +1 -2
sky/provision/vsphere/instance.py +15 -10
sky/provision/vsphere/vsphere_utils.py +9 -19
sky/py.typed +0 -0
sky/resources.py +844 -118
sky/schemas/__init__.py +0 -0
sky/schemas/api/__init__.py +0 -0
sky/schemas/api/responses.py +225 -0
sky/schemas/db/README +4 -0
sky/schemas/db/env.py +90 -0
sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
sky/schemas/db/global_user_state/004_is_managed.py +34 -0
sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
sky/schemas/db/global_user_state/006_provision_log.py +41 -0
sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
sky/schemas/db/script.py.mako +28 -0
sky/schemas/db/serve_state/001_initial_schema.py +67 -0
sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
sky/schemas/generated/__init__.py +0 -0
sky/schemas/generated/autostopv1_pb2.py +36 -0
sky/schemas/generated/autostopv1_pb2.pyi +43 -0
sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
sky/schemas/generated/jobsv1_pb2.py +86 -0
sky/schemas/generated/jobsv1_pb2.pyi +254 -0
sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
sky/schemas/generated/managed_jobsv1_pb2.py +74 -0
sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
sky/schemas/generated/servev1_pb2.py +58 -0
sky/schemas/generated/servev1_pb2.pyi +115 -0
sky/schemas/generated/servev1_pb2_grpc.py +322 -0
sky/serve/autoscalers.py +357 -5
sky/serve/client/impl.py +310 -0
sky/serve/client/sdk.py +47 -139
sky/serve/client/sdk_async.py +130 -0
sky/serve/constants.py +10 -8
sky/serve/controller.py +64 -19
sky/serve/load_balancer.py +106 -60
sky/serve/load_balancing_policies.py +115 -1
sky/serve/replica_managers.py +273 -162
sky/serve/serve_rpc_utils.py +179 -0
sky/serve/serve_state.py +554 -251
sky/serve/serve_utils.py +733 -220
sky/serve/server/core.py +66 -711
sky/serve/server/impl.py +1093 -0
sky/serve/server/server.py +21 -18
sky/serve/service.py +133 -48
sky/serve/service_spec.py +135 -16
sky/serve/spot_placer.py +3 -0
sky/server/auth/__init__.py +0 -0
sky/server/auth/authn.py +50 -0
sky/server/auth/loopback.py +38 -0
sky/server/auth/oauth2_proxy.py +200 -0
sky/server/common.py +475 -181
sky/server/config.py +81 -23
sky/server/constants.py +44 -6
sky/server/daemons.py +229 -0
sky/server/html/token_page.html +185 -0
sky/server/metrics.py +160 -0
sky/server/requests/executor.py +528 -138
sky/server/requests/payloads.py +351 -17
sky/server/requests/preconditions.py +21 -17
sky/server/requests/process.py +112 -29
sky/server/requests/request_names.py +120 -0
sky/server/requests/requests.py +817 -224
sky/server/requests/serializers/decoders.py +82 -31
sky/server/requests/serializers/encoders.py +140 -22
sky/server/requests/threads.py +106 -0
sky/server/rest.py +417 -0
sky/server/server.py +1290 -284
sky/server/state.py +20 -0
sky/server/stream_utils.py +345 -57
sky/server/uvicorn.py +217 -3
sky/server/versions.py +270 -0
sky/setup_files/MANIFEST.in +5 -0
sky/setup_files/alembic.ini +156 -0
sky/setup_files/dependencies.py +136 -31
sky/setup_files/setup.py +44 -42
sky/sky_logging.py +102 -5
sky/skylet/attempt_skylet.py +1 -0
sky/skylet/autostop_lib.py +129 -8
sky/skylet/configs.py +27 -20
sky/skylet/constants.py +171 -19
sky/skylet/events.py +105 -21
sky/skylet/job_lib.py +335 -104
sky/skylet/log_lib.py +297 -18
sky/skylet/log_lib.pyi +44 -1
sky/skylet/ray_patches/__init__.py +17 -3
sky/skylet/ray_patches/autoscaler.py.diff +18 -0
sky/skylet/ray_patches/cli.py.diff +19 -0
sky/skylet/ray_patches/command_runner.py.diff +17 -0
sky/skylet/ray_patches/log_monitor.py.diff +20 -0
sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
sky/skylet/ray_patches/updater.py.diff +18 -0
sky/skylet/ray_patches/worker.py.diff +41 -0
sky/skylet/services.py +564 -0
sky/skylet/skylet.py +63 -4
sky/skylet/subprocess_daemon.py +103 -29
sky/skypilot_config.py +506 -99
sky/ssh_node_pools/__init__.py +1 -0
sky/ssh_node_pools/core.py +135 -0
sky/ssh_node_pools/server.py +233 -0
sky/task.py +621 -137
sky/templates/aws-ray.yml.j2 +10 -3
sky/templates/azure-ray.yml.j2 +1 -1
sky/templates/do-ray.yml.j2 +1 -1
sky/templates/gcp-ray.yml.j2 +57 -0
sky/templates/hyperbolic-ray.yml.j2 +67 -0
sky/templates/jobs-controller.yaml.j2 +27 -24
sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
sky/templates/kubernetes-ray.yml.j2 +607 -51
sky/templates/lambda-ray.yml.j2 +1 -1
sky/templates/nebius-ray.yml.j2 +33 -12
sky/templates/paperspace-ray.yml.j2 +1 -1
sky/templates/primeintellect-ray.yml.j2 +71 -0
sky/templates/runpod-ray.yml.j2 +9 -1
sky/templates/scp-ray.yml.j2 +3 -50
sky/templates/seeweb-ray.yml.j2 +108 -0
sky/templates/shadeform-ray.yml.j2 +72 -0
sky/templates/sky-serve-controller.yaml.j2 +22 -2
sky/templates/websocket_proxy.py +178 -18
sky/usage/usage_lib.py +18 -11
sky/users/__init__.py +0 -0
sky/users/model.conf +15 -0
sky/users/permission.py +387 -0
sky/users/rbac.py +121 -0
sky/users/server.py +720 -0
sky/users/token_service.py +218 -0
sky/utils/accelerator_registry.py +34 -5
sky/utils/admin_policy_utils.py +84 -38
sky/utils/annotations.py +16 -5
sky/utils/asyncio_utils.py +78 -0
sky/utils/auth_utils.py +153 -0
sky/utils/benchmark_utils.py +60 -0
sky/utils/cli_utils/status_utils.py +159 -86
sky/utils/cluster_utils.py +31 -9
sky/utils/command_runner.py +354 -68
sky/utils/command_runner.pyi +93 -3
sky/utils/common.py +35 -8
sky/utils/common_utils.py +310 -87
sky/utils/config_utils.py +87 -5
sky/utils/context.py +402 -0
sky/utils/context_utils.py +222 -0
sky/utils/controller_utils.py +264 -89
sky/utils/dag_utils.py +31 -12
sky/utils/db/__init__.py +0 -0
sky/utils/db/db_utils.py +470 -0
sky/utils/db/migration_utils.py +133 -0
sky/utils/directory_utils.py +12 -0
sky/utils/env_options.py +13 -0
sky/utils/git.py +567 -0
sky/utils/git_clone.sh +460 -0
sky/utils/infra_utils.py +195 -0
sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
sky/utils/kubernetes/config_map_utils.py +133 -0
sky/utils/kubernetes/create_cluster.sh +13 -27
sky/utils/kubernetes/delete_cluster.sh +10 -7
sky/utils/kubernetes/deploy_remote_cluster.py +1299 -0
sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
sky/utils/kubernetes/generate_kind_config.py +6 -66
sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
sky/utils/kubernetes/gpu_labeler.py +5 -5
sky/utils/kubernetes/kubernetes_deploy_utils.py +354 -47
sky/utils/kubernetes/ssh-tunnel.sh +379 -0
sky/utils/kubernetes/ssh_utils.py +221 -0
sky/utils/kubernetes_enums.py +8 -15
sky/utils/lock_events.py +94 -0
sky/utils/locks.py +368 -0
sky/utils/log_utils.py +300 -6
sky/utils/perf_utils.py +22 -0
sky/utils/resource_checker.py +298 -0
sky/utils/resources_utils.py +249 -32
sky/utils/rich_utils.py +213 -37
sky/utils/schemas.py +905 -147
sky/utils/serialize_utils.py +16 -0
sky/utils/status_lib.py +10 -0
sky/utils/subprocess_utils.py +38 -15
sky/utils/tempstore.py +70 -0
sky/utils/timeline.py +24 -52
sky/utils/ux_utils.py +84 -15
sky/utils/validator.py +11 -1
sky/utils/volume.py +86 -0
sky/utils/yaml_utils.py +111 -0
sky/volumes/__init__.py +13 -0
sky/volumes/client/__init__.py +0 -0
sky/volumes/client/sdk.py +149 -0
sky/volumes/server/__init__.py +0 -0
sky/volumes/server/core.py +258 -0
sky/volumes/server/server.py +122 -0
sky/volumes/volume.py +212 -0
sky/workspaces/__init__.py +0 -0
sky/workspaces/core.py +655 -0
sky/workspaces/server.py +101 -0
sky/workspaces/utils.py +56 -0
skypilot_nightly-1.0.0.dev20251107.dist-info/METADATA +675 -0
skypilot_nightly-1.0.0.dev20251107.dist-info/RECORD +594 -0
{skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/WHEEL +1 -1
sky/benchmark/benchmark_state.py +0 -256
sky/benchmark/benchmark_utils.py +0 -641
sky/clouds/service_catalog/constants.py +0 -7
sky/dashboard/out/_next/static/LksQgChY5izXjokL3LcEu/_buildManifest.js +0 -1
sky/dashboard/out/_next/static/chunks/236-f49500b82ad5392d.js +0 -6
sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
sky/dashboard/out/_next/static/chunks/845-0f8017370869e269.js +0 -1
sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-e15db85d0ea1fbe1.js +0 -1
sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-03f279c6741fb48b.js +0 -1
sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
sky/jobs/dashboard/dashboard.py +0 -223
sky/jobs/dashboard/static/favicon.ico +0 -0
sky/jobs/dashboard/templates/index.html +0 -831
sky/jobs/server/dashboard_utils.py +0 -69
sky/skylet/providers/scp/__init__.py +0 -2
sky/skylet/providers/scp/config.py +0 -149
sky/skylet/providers/scp/node_provider.py +0 -578
sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
sky/utils/db_utils.py +0 -100
sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
skypilot_nightly-1.0.0.dev20250509.dist-info/METADATA +0 -361
skypilot_nightly-1.0.0.dev20250509.dist-info/RECORD +0 -396
/sky/{clouds/service_catalog → catalog}/config.py +0 -0
/sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
/sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
/sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
/sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
/sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
/sky/dashboard/out/_next/static/{LksQgChY5izXjokL3LcEu → zB0ed6ge_W1MDszVHhijS}/_ssgManifest.js +0 -0
{skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/entry_points.txt +0 -0
{skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/licenses/LICENSE +0 -0
{skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/top_level.txt +0 -0

sky/server/server.py CHANGED Viewed

@@ -2,55 +2,88 @@
 import argparse
 import asyncio
+import base64
+from concurrent.futures import ThreadPoolExecutor
 import contextlib
-import dataclasses
 import datetime
-import logging
+from enum import IntEnum
+import hashlib
+import json
 import multiprocessing
 import os
 import pathlib
+import posixpath
 import re
+import resource
 import shutil
+import struct
 import sys
-from typing import Any, Dict, List, Literal, Optional, Set, Tuple
+import threading
+import traceback
+from typing import Dict, List, Literal, Optional, Set, Tuple
 import uuid
 import zipfile
 import aiofiles
+import anyio
 import fastapi
+from fastapi import responses as fastapi_responses
 from fastapi.middleware import cors
 import starlette.middleware.base
+import uvloop
 import sky
+from sky import catalog
 from sky import check as sky_check
 from sky import clouds
 from sky import core
 from sky import exceptions
 from sky import execution
 from sky import global_user_state
+from sky import models
 from sky import sky_logging
-from sky.clouds import service_catalog
 from sky.data import storage_utils
+from sky.jobs import utils as managed_job_utils
 from sky.jobs.server import server as jobs_rest
+from sky.metrics import utils as metrics_utils
+from sky.provision import metadata_utils
 from sky.provision.kubernetes import utils as kubernetes_utils
+from sky.schemas.api import responses
 from sky.serve.server import server as serve_rest
 from sky.server import common
 from sky.server import config as server_config
 from sky.server import constants as server_constants
+from sky.server import daemons
+from sky.server import metrics
+from sky.server import state
 from sky.server import stream_utils
+from sky.server import versions
+from sky.server.auth import authn
+from sky.server.auth import loopback
+from sky.server.auth import oauth2_proxy
 from sky.server.requests import executor
 from sky.server.requests import payloads
 from sky.server.requests import preconditions
+from sky.server.requests import request_names
 from sky.server.requests import requests as requests_lib
 from sky.skylet import constants
+from sky.ssh_node_pools import server as ssh_node_pools_rest
 from sky.usage import usage_lib
+from sky.users import permission
+from sky.users import server as users_rest
 from sky.utils import admin_policy_utils
 from sky.utils import common as common_lib
 from sky.utils import common_utils
+from sky.utils import context
+from sky.utils import context_utils
 from sky.utils import dag_utils
-from sky.utils import env_options
+from sky.utils import perf_utils
 from sky.utils import status_lib
 from sky.utils import subprocess_utils
+from sky.utils import ux_utils
+from sky.utils.db import db_utils
+from sky.volumes.server import server as volumes_rest
+from sky.workspaces import server as workspaces_rest
 # pylint: disable=ungrouped-imports
 if sys.version_info >= (3, 10):
@@ -60,31 +93,8 @@ else:
 P = ParamSpec('P')
+_SERVER_USER_HASH_KEY = 'server_user_hash'
-def _add_timestamp_prefix_for_server_logs() -> None:
-    server_logger = sky_logging.init_logger('sky.server')
-    # Clear existing handlers first to prevent duplicates
-    server_logger.handlers.clear()
-    # Disable propagation to avoid the root logger of SkyPilot being affected
-    server_logger.propagate = False
-    # Add date prefix to the log message printed by loggers under
-    # server.
-    stream_handler = logging.StreamHandler(sys.stdout)
-    if env_options.Options.SHOW_DEBUG_INFO.get():
-        stream_handler.setLevel(logging.DEBUG)
-    else:
-        stream_handler.setLevel(logging.INFO)
-    stream_handler.flush = sys.stdout.flush  # type: ignore
-    stream_handler.setFormatter(sky_logging.FORMATTER)
-    server_logger.addHandler(stream_handler)
-    # Add date prefix to the log message printed by uvicorn.
-    for name in ['uvicorn', 'uvicorn.access']:
-        uvicorn_logger = logging.getLogger(name)
-        uvicorn_logger.handlers.clear()
-        uvicorn_logger.addHandler(stream_handler)
-_add_timestamp_prefix_for_server_logs()
 logger = sky_logging.init_logger(__name__)
 # TODO(zhwu): Streaming requests, such log tailing after sky launch or sky logs,
@@ -92,11 +102,72 @@ logger = sky_logging.init_logger(__name__)
 # response will block other requests from being processed.
+def _basic_auth_401_response(content: str):
+    """Return a 401 response with basic auth realm."""
+    return fastapi.responses.JSONResponse(
+        status_code=401,
+        headers={'WWW-Authenticate': 'Basic realm=\"SkyPilot\"'},
+        content=content)
+def _try_set_basic_auth_user(request: fastapi.Request):
+    auth_header = request.headers.get('authorization')
+    if not auth_header or not auth_header.lower().startswith('basic '):
+        return
+    # Check username and password
+    encoded = auth_header.split(' ', 1)[1]
+    try:
+        decoded = base64.b64decode(encoded).decode()
+        username, password = decoded.split(':', 1)
+    except Exception:  # pylint: disable=broad-except
+        return
+    users = global_user_state.get_user_by_name(username)
+    if not users:
+        return
+    for user in users:
+        if not user.name or not user.password:
+            continue
+        username_encoded = username.encode('utf8')
+        db_username_encoded = user.name.encode('utf8')
+        if (username_encoded == db_username_encoded and
+                common.crypt_ctx.verify(password, user.password)):
+            request.state.auth_user = user
+            break
+class RBACMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
+    """Middleware to handle RBAC."""
+    async def dispatch(self, request: fastapi.Request, call_next):
+        # TODO(hailong): should have a list of paths
+        # that are not checked for RBAC
+        if (request.url.path.startswith('/dashboard/') or
+                request.url.path.startswith('/api/')):
+            return await call_next(request)
+        auth_user = request.state.auth_user
+        if auth_user is None:
+            return await call_next(request)
+        permission_service = permission.permission_service
+        # Check the role permission
+        if permission_service.check_endpoint_permission(auth_user.id,
+                                                        request.url.path,
+                                                        request.method):
+            return fastapi.responses.JSONResponse(
+                status_code=403, content={'detail': 'Forbidden'})
+        return await call_next(request)
 class RequestIDMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
     """Middleware to add a request ID to each request."""
     async def dispatch(self, request: fastapi.Request, call_next):
-        request_id = str(uuid.uuid4())
+        request_id = requests_lib.get_new_request_id()
         request.state.request_id = request_id
         response = await call_next(request)
         # TODO(syang): remove X-Request-ID when v0.10.0 is released.
@@ -105,6 +176,238 @@ class RequestIDMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
         return response
+def _get_auth_user_header(request: fastapi.Request) -> Optional[models.User]:
+    # Builds a User from the auth-user request header, or returns None when
+    # the header is absent. The header name comes from the environment
+    # variable named by constants.ENV_VAR_SERVER_AUTH_USER_HEADER and defaults
+    # to 'X-Auth-Request-Email'.
+    header_name = os.environ.get(constants.ENV_VAR_SERVER_AUTH_USER_HEADER,
+                                 'X-Auth-Request-Email')
+    if header_name not in request.headers:
+        return None
+    user_name = request.headers[header_name]
+    # The user id is the MD5 hex digest of the header value, truncated to
+    # common_utils.USER_HASH_LENGTH characters, so the same header value
+    # always maps to the same id.
+    user_hash = hashlib.md5(
+        user_name.encode()).hexdigest()[:common_utils.USER_HASH_LENGTH]
+    return models.User(id=user_hash, name=user_name)
+class InitializeRequestAuthUserMiddleware(
+        starlette.middleware.base.BaseHTTPMiddleware):
+    # Sets request.state.auth_user to None before handing the request on, so
+    # later readers can always access the attribute.
+    async def dispatch(self, request: fastapi.Request, call_next):
+        # Make sure that request.state.auth_user is set. Otherwise, we may get
+        # an error while trying to read it (Starlette's State raises
+        # AttributeError for attributes that were never assigned).
+        request.state.auth_user = None
+        return await call_next(request)
+class BasicAuthMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
+    """Middleware to handle HTTP Basic Auth."""
+    # Every request except the two bypass cases below must carry a valid
+    # 'Authorization: Basic ...' header; otherwise a 401 response built by
+    # _basic_auth_401_response() is returned.
+    async def dispatch(self, request: fastapi.Request, call_next):
+        # Bypass 1: loopback requests while in consolidation mode are passed
+        # through without authentication.
+        if managed_job_utils.is_consolidation_mode(
+        ) and loopback.is_loopback_request(request):
+            return await call_next(request)
+        # Bypass 2: health checks are always served; credentials, if present,
+        # are only used to populate the auth user.
+        if request.url.path.startswith('/api/health'):
+            # Try to set the auth user from basic auth
+            _try_set_basic_auth_user(request)
+            return await call_next(request)
+        auth_header = request.headers.get('authorization')
+        if not auth_header:
+            return _basic_auth_401_response('Authentication required')
+        # Only handle basic auth
+        if not auth_header.lower().startswith('basic '):
+            return _basic_auth_401_response('Invalid authentication method')
+        # Check username and password
+        encoded = auth_header.split(' ', 1)[1]
+        try:
+            decoded = base64.b64decode(encoded).decode()
+            # Split on the first ':' only, so the password may contain ':'.
+            username, password = decoded.split(':', 1)
+        except Exception:  # pylint: disable=broad-except
+            return _basic_auth_401_response('Invalid basic auth')
+        users = global_user_state.get_user_by_name(username)
+        if not users:
+            return _basic_auth_401_response('Invalid credentials')
+        valid_user = False
+        # The lookup may return several users; accept the first one whose
+        # name matches byte-for-byte and whose stored password hash verifies.
+        for user in users:
+            if not user.name or not user.password:
+                continue
+            username_encoded = username.encode('utf8')
+            db_username_encoded = user.name.encode('utf8')
+            if (username_encoded == db_username_encoded and
+                    common.crypt_ctx.verify(password, user.password)):
+                valid_user = True
+                request.state.auth_user = user
+                await authn.override_user_info_in_request_body(request, user)
+                break
+        if not valid_user:
+            return _basic_auth_401_response('Invalid credentials')
+        return await call_next(request)
+class BearerTokenMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
+    """Middleware to handle Bearer Token Auth (Service Accounts)."""
+    async def dispatch(self, request: fastapi.Request, call_next):
+        """Make sure correct bearer token auth is present.
+        1. If the request has the X-Skypilot-Auth-Mode: token header, it must
+           have a valid bearer token.
+        2. For backwards compatibility, if the request has a Bearer token
+           beginning with "sky_" (even if X-Skypilot-Auth-Mode is not present),
+           it must be a valid token.
+        3. If X-Skypilot-Auth-Mode is not set to "token", and there is no Bearer
+           token beginning with "sky_", allow the request to continue.
+        In conjunction with an auth proxy, the idea is to make the auth proxy
+        bypass requests with bearer tokens, instead setting the
+        X-Skypilot-Auth-Mode header. The auth proxy should either validate the
+        auth or set the header X-Skypilot-Auth-Mode: token.
+        """
+        has_skypilot_auth_header = (
+            request.headers.get('X-Skypilot-Auth-Mode') == 'token')
+        auth_header = request.headers.get('authorization')
+        has_bearer_token_starting_with_sky = (
+            auth_header and auth_header.lower().startswith('bearer ') and
+            auth_header.split(' ', 1)[1].startswith('sky_'))
+        if (not has_skypilot_auth_header and
+                not has_bearer_token_starting_with_sky):
+            # This is case #3 above. We do not need to validate the request.
+            # No Bearer token, continue with normal processing (OAuth2 cookies,
+            # etc.)
+            return await call_next(request)
+        # After this point, all requests must be validated.
+        if auth_header is None:
+            return fastapi.responses.JSONResponse(
+                status_code=401, content={'detail': 'Authentication required'})
+        # Extract token
+        split_header = auth_header.split(' ', 1)
+        if split_header[0].lower() != 'bearer':
+            return fastapi.responses.JSONResponse(
+                status_code=401,
+                content={'detail': 'Invalid authentication method'})
+        if len(split_header) < 2:
+            # The header is a bare 'Bearer' with no token part. Reject it
+            # explicitly instead of failing with an IndexError (HTTP 500) on
+            # the token lookup below.
+            return fastapi.responses.JSONResponse(
+                status_code=401, content={'detail': 'Missing bearer token'})
+        sa_token = split_header[1]
+        # Handle SkyPilot service account tokens
+        return await self._handle_service_account_token(request, sa_token,
+                                                        call_next)
+    async def _handle_service_account_token(self, request: fastapi.Request,
+                                            sa_token: str, call_next):
+        """Handle SkyPilot service account tokens.
+
+        Verifies the token, checks that the user it names still exists, sets
+        request.state.auth_user, and forwards the request. Every failure,
+        including an unexpected exception, produces a 401 JSON response.
+        """
+        # Check if service account tokens are enabled
+        sa_enabled = os.environ.get(constants.ENV_VAR_ENABLE_SERVICE_ACCOUNTS,
+                                    'false').lower()
+        if sa_enabled != 'true':
+            return fastapi.responses.JSONResponse(
+                status_code=401,
+                content={'detail': 'Service account authentication disabled'})
+        try:
+            # Import here to avoid circular imports
+            # pylint: disable=import-outside-toplevel
+            from sky.users.token_service import token_service
+            # Verify and decode JWT token
+            payload = token_service.verify_token(sa_token)
+            if payload is None:
+                logger.warning('Service account token verification failed')
+                return fastapi.responses.JSONResponse(
+                    status_code=401,
+                    content={
+                        'detail': 'Invalid or expired service account token'
+                    })
+            # Extract user information from JWT payload
+            user_id = payload.get('sub')
+            user_name = payload.get('name')
+            token_id = payload.get('token_id')
+            if not user_id or not token_id:
+                logger.warning(
+                    'Invalid token payload: missing user_id or token_id')
+                return fastapi.responses.JSONResponse(
+                    status_code=401,
+                    content={'detail': 'Invalid token payload'})
+            # Verify user still exists in database
+            user_info = global_user_state.get_user(user_id)
+            if user_info is None:
+                logger.warning(
+                    f'Service account user {user_id} no longer exists')
+                return fastapi.responses.JSONResponse(
+                    status_code=401,
+                    content={'detail': 'Service account user no longer exists'})
+            # Update last used timestamp for token tracking. This is
+            # best-effort bookkeeping: a failure is logged and does not fail
+            # the request.
+            try:
+                global_user_state.update_service_account_token_last_used(
+                    token_id)
+            except Exception as e:  # pylint: disable=broad-except
+                logger.debug(f'Failed to update token last used time: {e}')
+            # Set the authenticated user. The name from the token payload is
+            # preferred; the database name is the fallback.
+            auth_user = models.User(id=user_id,
+                                    name=user_name or user_info.name)
+            request.state.auth_user = auth_user
+            # Override user info in request body for service account requests
+            await authn.override_user_info_in_request_body(request, auth_user)
+            logger.debug(f'Authenticated service account: {user_id}')
+        except Exception as e:  # pylint: disable=broad-except
+            logger.error(f'Service account authentication failed: {e}',
+                         exc_info=True)
+            # NOTE(review): the exception text is returned to the client in
+            # the response body - confirm this cannot expose internal details.
+            return fastapi.responses.JSONResponse(
+                status_code=401,
+                content={
+                    'detail': f'Service account authentication failed: {str(e)}'
+                })
+        return await call_next(request)
+class AuthProxyMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
+    """Middleware to handle auth proxy."""
+    # Identifies the user from the auth proxy header (see
+    # _get_auth_user_header), unless request.state.auth_user was already set
+    # before this middleware ran.
+    async def dispatch(self, request: fastapi.Request, call_next):
+        auth_user = _get_auth_user_header(request)
+        if request.state.auth_user is not None:
+            # Previous middleware is trusted more than this middleware.  For
+            # instance, a client could set the Authorization and the
+            # X-Auth-Request-Email header. In that case, the auth proxy will be
+            # skipped and we should rely on the Bearer token to authenticate the
+            # user - but that means the user could set X-Auth-Request-Email to
+            # whatever the user wants. We should thus ignore it.
+            if auth_user is not None:
+                logger.debug('Warning: ignoring auth proxy header since the '
+                             'auth user was already set.')
+            return await call_next(request)
+        # Add user to database if auth_user is present
+        if auth_user is not None:
+            newly_added = global_user_state.add_or_update_user(auth_user)
+            # Only users reported as newly added are registered with the
+            # permission service.
+            if newly_added:
+                permission.permission_service.add_user_if_not_exists(
+                    auth_user.id)
+        # Store user info in request.state for access by GET endpoints
+        if auth_user is not None:
+            request.state.auth_user = auth_user
+        # NOTE(review): this is called even when auth_user is None;
+        # presumably the helper handles None - confirm.
+        await authn.override_user_info_in_request_body(request, auth_user)
+        return await call_next(request)
 # Default expiration time for upload ids before cleanup.
 _DEFAULT_UPLOAD_EXPIRATION_TIME = datetime.timedelta(hours=1)
 # Key: (upload_id, user_hash), Value: the time when the upload id needs to be
@@ -134,21 +437,74 @@ async def cleanup_upload_ids():
                 upload_ids_to_cleanup.pop((upload_id, user_hash))
+async def loop_lag_monitor(loop: asyncio.AbstractEventLoop,
+                           interval: float = 0.1) -> None:
+    # Starts a self-rescheduling callback on `loop` that fires every
+    # `interval` seconds and records how late each firing was. The coroutine
+    # itself returns right after scheduling the first callback.
+    target = loop.time() + interval
+    pid = str(os.getpid())
+    lag_threshold = perf_utils.get_loop_lag_threshold()
+    def tick():
+        nonlocal target
+        now = loop.time()
+        # Lag is how much later than its scheduled time this callback ran;
+        # it is clamped at zero.
+        lag = max(0.0, now - target)
+        if lag_threshold is not None and lag > lag_threshold:
+            logger.warning(f'Event loop lag {lag} seconds exceeds threshold '
+                           f'{lag_threshold} seconds.')
+        # Every observation is recorded under this process's pid label.
+        metrics_utils.SKY_APISERVER_EVENT_LOOP_LAG_SECONDS.labels(
+            pid=pid).observe(lag)
+        # The next deadline is measured from now, not from the previous
+        # target, so lag does not accumulate across ticks.
+        target = now + interval
+        loop.call_at(target, tick)
+    loop.call_at(target, tick)
+async def schedule_on_boot_check_async():
+    # Schedules sky_check.check as a system request under the fixed request id
+    # 'skypilot-server-on-boot-check'. A request that already exists under
+    # that id is not an error.
+    try:
+        await executor.schedule_request_async(
+            request_id='skypilot-server-on-boot-check',
+            request_name=request_names.RequestName.CHECK,
+            request_body=payloads.CheckBody(),
+            func=sky_check.check,
+            schedule_type=requests_lib.ScheduleType.SHORT,
+            is_skypilot_system=True,
+        )
+    except exceptions.RequestAlreadyExistsError:
+        # Lifespan will be executed in each uvicorn worker process, we
+        # can safely ignore the error if the task is already scheduled.
+        logger.debug('Request skypilot-server-on-boot-check already exists.')
 @contextlib.asynccontextmanager
 async def lifespan(app: fastapi.FastAPI):  # pylint: disable=redefined-outer-name
     """FastAPI lifespan context manager."""
     del app  # unused
     # Startup: Run background tasks
-    for event in requests_lib.INTERNAL_REQUEST_DAEMONS:
-        executor.schedule_request(
-            request_id=event.id,
-            request_name=event.name,
-            request_body=payloads.RequestBody(),
-            func=event.event_fn,
-            schedule_type=requests_lib.ScheduleType.SHORT,
-            is_skypilot_system=True,
-        )
+    for event in daemons.INTERNAL_REQUEST_DAEMONS:
+        if event.should_skip():
+            continue
+        try:
+            await executor.schedule_request_async(
+                request_id=event.id,
+                request_name=event.name,
+                request_body=payloads.RequestBody(),
+                func=event.run_event,
+                schedule_type=requests_lib.ScheduleType.SHORT,
+                is_skypilot_system=True,
+                # Request daemon should be retried if the process pool is
+                # broken.
+                retryable=True,
+            )
+        except exceptions.RequestAlreadyExistsError:
+            # Lifespan will be executed in each uvicorn worker process, we
+            # can safely ignore the error if the task is already scheduled.
+            logger.debug(f'Request {event.id} already exists.')
+    await schedule_on_boot_check_async()
     asyncio.create_task(cleanup_upload_ids())
+    if metrics_utils.METRICS_ENABLED:
+        # Start monitoring the event loop lag in each server worker
+        # event loop (process).
+        asyncio.create_task(loop_lag_monitor(asyncio.get_event_loop()))
     yield
     # Shutdown: Add any cleanup code here if needed
@@ -166,8 +522,99 @@ class InternalDashboardPrefixMiddleware(
         return await call_next(request)
+class CacheControlStaticMiddleware(starlette.middleware.base.BaseHTTPMiddleware
+                                  ):
+    """Middleware to add cache control headers to static files."""
+    async def dispatch(self, request: fastapi.Request, call_next):
+        # Only responses for paths under /dashboard/_next get the header;
+        # they are marked cacheable for 3600 seconds. All other responses
+        # are returned untouched.
+        if request.url.path.startswith('/dashboard/_next'):
+            response = await call_next(request)
+            response.headers['Cache-Control'] = 'max-age=3600'
+            return response
+        return await call_next(request)
+class PathCleanMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
+    """Middleware to check the path of requests."""
+    async def dispatch(self, request: fastapi.Request, call_next):
+        # Only paths that start with /dashboard/ are checked.
+        if request.url.path.startswith('/dashboard/'):
+            # If the requested path is not relative to the expected directory,
+            # then the user is attempting path traversal, so deny the request.
+            parent = pathlib.Path('/dashboard')
+            # normpath collapses '.' and '..' segments, so a path such as
+            # /dashboard/../x normalizes to /x and fails the check below.
+            request_path = pathlib.Path(posixpath.normpath(request.url.path))
+            if not _is_relative_to(request_path, parent):
+                return fastapi.responses.JSONResponse(
+                    status_code=403, content={'detail': 'Forbidden'})
+        return await call_next(request)
+class GracefulShutdownMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
+    """Middleware to control requests when server is shutting down."""
+    async def dispatch(self, request: fastapi.Request, call_next):
+        # While state.get_block_requests() is truthy, every path outside
+        # /api/ is answered with 503 instead of being forwarded.
+        if state.get_block_requests():
+            # Allow /api/ paths to continue, which are critical to operate
+            # on-going requests but will not submit new requests.
+            if not request.url.path.startswith('/api/'):
+                # Client will retry on 503 error.
+                return fastapi.responses.JSONResponse(
+                    status_code=503,
+                    content={
+                        'detail': 'Server is shutting down, '
+                                  'please try again later.'
+                    })
+        return await call_next(request)
+class APIVersionMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
+    """Middleware to add API version to the request."""
+    # Checks the version information carried in the request headers. A
+    # compatible request is forwarded; an incompatible one gets a 400
+    # response. In both of those cases the server's API version and readable
+    # version are added to the response headers.
+    async def dispatch(self, request: fastapi.Request, call_next):
+        version_info = versions.check_compatibility_at_server(request.headers)
+        # Bypass version handling for backward compatibility with clients prior
+        # to v0.11.0, the client will check the version in the body of
+        # /api/health response and hint an upgrade.
+        # TODO(aylei): remove this after v0.13.0 is released.
+        # Note that on this bypass path no version headers are added to the
+        # response.
+        if version_info is None:
+            return await call_next(request)
+        if version_info.error is None:
+            # Record the client's versions before the request is handled.
+            versions.set_remote_api_version(version_info.api_version)
+            versions.set_remote_version(version_info.version)
+            response = await call_next(request)
+        else:
+            response = fastapi.responses.JSONResponse(
+                status_code=400,
+                content={
+                    'error': common.ApiServerStatus.VERSION_MISMATCH.value,
+                    'message': version_info.error,
+                })
+        response.headers[server_constants.API_VERSION_HEADER] = str(
+            server_constants.API_VERSION)
+        response.headers[server_constants.VERSION_HEADER] = \
+            versions.get_local_readable_version()
+        return response
 app = fastapi.FastAPI(prefix='/api/v1', debug=True, lifespan=lifespan)
+# Middleware wraps in the order defined here. E.g., given
+#   app.add_middleware(Middleware1)
+#   app.add_middleware(Middleware2)
+#   app.add_middleware(Middleware3)
+# The effect will be like:
+#   Middleware3(Middleware2(Middleware1(request)))
+# If MiddlewareN does something like print(n); call_next(); print(n), you'll get
+#   3; 2; 1; <request>; 1; 2; 3
+# Use environment variable to make the metrics middleware optional.
+if os.environ.get(constants.ENV_VAR_SERVER_METRICS_ENABLED):
+    app.add_middleware(metrics.PrometheusMiddleware)
+app.add_middleware(APIVersionMiddleware)
+# RBACMiddleware is intentionally not registered here: it is added exactly
+# once further down, together with the authentication middleware. Adding it
+# in both places would wrap every request in two RBAC checks.
 app.add_middleware(InternalDashboardPrefixMiddleware)
+app.add_middleware(GracefulShutdownMiddleware)
+app.add_middleware(PathCleanMiddleware)
+app.add_middleware(CacheControlStaticMiddleware)
 app.add_middleware(
     cors.CORSMiddleware,
     # TODO(zhwu): in production deployment, we should restrict the allowed
@@ -176,20 +623,119 @@ app.add_middleware(
     allow_credentials=True,
     allow_methods=['*'],
     allow_headers=['*'],
-    # TODO(syang): remove X-Request-ID when v0.10.0 is released.
+    # TODO(syang): remove X-Request-ID when v0.10.0 is released.
     expose_headers=['X-Request-ID', 'X-Skypilot-Request-ID'])
+# The order of all the authentication-related middleware is important.
+# RBACMiddleware must precede all the auth middleware, so it can access
+# request.state.auth_user.
+app.add_middleware(RBACMiddleware)
+# Authentication based on oauth2-proxy.
+app.add_middleware(oauth2_proxy.OAuth2ProxyMiddleware)
+# AuthProxyMiddleware should precede BasicAuthMiddleware and
+# BearerTokenMiddleware, since it should be skipped if either of those set the
+# auth user.
+app.add_middleware(AuthProxyMiddleware)
+enable_basic_auth = os.environ.get(constants.ENV_VAR_ENABLE_BASIC_AUTH, 'false')
+if str(enable_basic_auth).lower() == 'true':
+    app.add_middleware(BasicAuthMiddleware)
+# Bearer token middleware should always be present to handle service account
+# authentication
+app.add_middleware(BearerTokenMiddleware)
+# InitializeRequestAuthUserMiddleware must be the last added middleware so that
+# request.state.auth_user is always set, but can be overridden by the auth
+# middleware above.
+app.add_middleware(InitializeRequestAuthUserMiddleware)
 app.add_middleware(RequestIDMiddleware)
 app.include_router(jobs_rest.router, prefix='/jobs', tags=['jobs'])
 app.include_router(serve_rest.router, prefix='/serve', tags=['serve'])
+app.include_router(users_rest.router, prefix='/users', tags=['users'])
+app.include_router(workspaces_rest.router,
+                   prefix='/workspaces',
+                   tags=['workspaces'])
+app.include_router(volumes_rest.router, prefix='/volumes', tags=['volumes'])
+app.include_router(ssh_node_pools_rest.router,
+                   prefix='/ssh_node_pools',
+                   tags=['ssh_node_pools'])
+# Increase the limit of files we can open to our hard limit. This fixes bugs
+# where we can not acquire file locks or open enough logs and the API server
+# crashes. On Mac, the hard limit is 9,223,372,036,854,775,807.
+# TODO(luca) figure out what to do if we need to open more than 2^63 files.
+# The adjustment is best-effort and must only happen inside the try block: an
+# unguarded setrlimit() call at import time would abort server startup on
+# failure and defeat the fallback below.
+try:
+    soft, hard = resource.getrlimit(resource.RLIMIT_NOFILE)
+    resource.setrlimit(resource.RLIMIT_NOFILE, (hard, hard))
+except Exception:  # pylint: disable=broad-except
+    pass  # no issue, we will warn the user later if it's too low
+@app.exception_handler(exceptions.ConcurrentWorkerExhaustedError)
+def handle_concurrent_worker_exhausted_error(
+        request: fastapi.Request, e: exceptions.ConcurrentWorkerExhaustedError):
+    # Exception handler for ConcurrentWorkerExhaustedError: logs the error and
+    # the traceback on the server, and answers the client with a 503 and a
+    # generic message.
+    del request  # request is not used
+    # Print detailed error message to server log
+    logger.error('Concurrent worker exhausted: '
+                 f'{common_utils.format_exception(e)}')
+    with ux_utils.enable_traceback():
+        logger.error(f'  Traceback: {traceback.format_exc()}')
+    # Return human readable error message to client
+    return fastapi.responses.JSONResponse(
+        status_code=503,
+        content={
+            'detail':
+                ('The server has exhausted its concurrent worker limit. '
+                 'Please try again or scale the server if the load persists.')
+        })
+@app.get('/token')
+async def token(request: fastapi.Request,
+                local_port: Optional[int] = None) -> fastapi.responses.Response:
+    # Serves the token HTML page. The page template is read from
+    # html/token_page.html next to this module, and two placeholders in it are
+    # replaced: one with a base64-encoded JSON token, one with a "logged in
+    # as" string. A missing template results in a 500 HTTPException.
+    del local_port  # local_port is used by the served js, but ignored by server
+    user = _get_auth_user_header(request)
+    # The token carries the user id (None without an auth header) and all
+    # cookies sent with this request.
+    token_data = {
+        'v': 1,  # Token version number, bump for backwards incompatible.
+        'user': user.id if user is not None else None,
+        'cookies': request.cookies,
+    }
+    # Use base64 encoding to avoid having to escape anything in the HTML.
+    json_bytes = json.dumps(token_data).encode('utf-8')
+    base64_str = base64.b64encode(json_bytes).decode('utf-8')
+    html_dir = pathlib.Path(__file__).parent / 'html'
+    token_page_path = html_dir / 'token_page.html'
+    try:
+        with open(token_page_path, 'r', encoding='utf-8') as f:
+            html_content = f.read()
+    except FileNotFoundError as e:
+        raise fastapi.HTTPException(
+            status_code=500, detail='Token page template not found.') from e
+    # NOTE(review): user.name comes from a request header and is inserted
+    # into the HTML below without escaping - confirm the header can only be
+    # set by a trusted proxy.
+    user_info_string = f'Logged in as {user.name}' if user is not None else ''
+    html_content = html_content.replace(
+        'SKYPILOT_API_SERVER_USER_TOKEN_PLACEHOLDER',
+        base64_str).replace('USER_PLACEHOLDER', user_info_string)
+    return fastapi.responses.HTMLResponse(
+        content=html_content,
+        headers={
+            'Cache-Control': 'no-cache, no-transform',
+            # X-Accel-Buffering: no is useful for preventing buffering issues
+            # with some reverse proxies.
+            'X-Accel-Buffering': 'no'
+        })
 @app.post('/check')
 async def check(request: fastapi.Request,
                 check_body: payloads.CheckBody) -> None:
     """Checks enabled clouds."""
-    executor.schedule_request(
+    await executor.schedule_request_async(
         request_id=request.state.request_id,
-        request_name='check',
+        request_name=request_names.RequestName.CHECK,
         request_body=check_body,
         func=sky_check.check,
         schedule_type=requests_lib.ScheduleType.SHORT,
@@ -197,12 +743,15 @@ async def check(request: fastapi.Request,
 @app.get('/enabled_clouds')
-async def enabled_clouds(request: fastapi.Request) -> None:
+async def enabled_clouds(request: fastapi.Request,
+                         workspace: Optional[str] = None,
+                         expand: bool = False) -> None:
     """Gets enabled clouds on the server."""
-    executor.schedule_request(
+    await executor.schedule_request_async(
         request_id=request.state.request_id,
-        request_name='enabled_clouds',
-        request_body=payloads.RequestBody(),
+        request_name=request_names.RequestName.ENABLED_CLOUDS,
+        request_body=payloads.EnabledCloudsBody(workspace=workspace,
+                                                expand=expand),
         func=core.enabled_clouds,
         schedule_type=requests_lib.ScheduleType.SHORT,
     )
@@ -214,9 +763,10 @@ async def realtime_kubernetes_gpu_availability(
     realtime_gpu_availability_body: payloads.RealtimeGpuAvailabilityRequestBody
 ) -> None:
     """Gets real-time Kubernetes GPU availability."""
-    executor.schedule_request(
+    await executor.schedule_request_async(
         request_id=request.state.request_id,
-        request_name='realtime_kubernetes_gpu_availability',
+        request_name=request_names.RequestName.
+        REALTIME_KUBERNETES_GPU_AVAILABILITY,
         request_body=realtime_gpu_availability_body,
         func=core.realtime_kubernetes_gpu_availability,
         schedule_type=requests_lib.ScheduleType.SHORT,
@@ -229,9 +779,9 @@ async def kubernetes_node_info(
         kubernetes_node_info_body: payloads.KubernetesNodeInfoRequestBody
 ) -> None:
     """Gets Kubernetes nodes information and hints."""
-    executor.schedule_request(
+    await executor.schedule_request_async(
         request_id=request.state.request_id,
-        request_name='kubernetes_node_info',
+        request_name=request_names.RequestName.KUBERNETES_NODE_INFO,
         request_body=kubernetes_node_info_body,
         func=kubernetes_utils.get_kubernetes_node_info,
         schedule_type=requests_lib.ScheduleType.SHORT,
@@ -241,9 +791,9 @@ async def kubernetes_node_info(
 @app.get('/status_kubernetes')
 async def status_kubernetes(request: fastapi.Request) -> None:
     """Gets Kubernetes status."""
-    executor.schedule_request(
+    await executor.schedule_request_async(
         request_id=request.state.request_id,
-        request_name='status_kubernetes',
+        request_name=request_names.RequestName.STATUS_KUBERNETES,
         request_body=payloads.RequestBody(),
         func=core.status_kubernetes,
         schedule_type=requests_lib.ScheduleType.SHORT,
@@ -255,11 +805,11 @@ async def list_accelerators(
         request: fastapi.Request,
         list_accelerator_counts_body: payloads.ListAcceleratorsBody) -> None:
     """Gets list of accelerators from cloud catalog."""
-    executor.schedule_request(
+    await executor.schedule_request_async(
         request_id=request.state.request_id,
-        request_name='list_accelerators',
+        request_name=request_names.RequestName.LIST_ACCELERATORS,
         request_body=list_accelerator_counts_body,
-        func=service_catalog.list_accelerators,
+        func=catalog.list_accelerators,
         schedule_type=requests_lib.ScheduleType.SHORT,
     )
@@ -270,11 +820,11 @@ async def list_accelerator_counts(
         list_accelerator_counts_body: payloads.ListAcceleratorCountsBody
 ) -> None:
     """Gets list of accelerator counts from cloud catalog."""
-    executor.schedule_request(
+    await executor.schedule_request_async(
         request_id=request.state.request_id,
-        request_name='list_accelerator_counts',
+        request_name=request_names.RequestName.LIST_ACCELERATOR_COUNTS,
         request_body=list_accelerator_counts_body,
-        func=service_catalog.list_accelerator_counts,
+        func=catalog.list_accelerator_counts,
         schedule_type=requests_lib.ScheduleType.SHORT,
     )
@@ -292,25 +842,33 @@ async def validate(validate_body: payloads.ValidateBody) -> None:
     # pairs.
     logger.debug(f'Validating tasks: {validate_body.dag}')
+    context.initialize()
+    ctx = context.get()
+    assert ctx is not None
+    # TODO(aylei): generalize this to all requests without a db record.
+    ctx.override_envs(validate_body.env_vars)
     def validate_dag(dag: dag_utils.dag_lib.Dag):
         # TODO: Admin policy may contain arbitrary code, which may be expensive
         # to run and may block the server thread. However, moving it into the
         # executor adds a ~150ms penalty on the local API server because of
         # added RTTs. For now, we stick to doing the validation inline in the
         # server thread.
-        dag, _ = admin_policy_utils.apply(
-            dag, request_options=validate_body.request_options)
-        # Skip validating workdir and file_mounts, as those need to be
-        # validated after the files are uploaded to the SkyPilot API server
-        # with `upload_mounts_to_api_server`.
-        dag.validate(skip_file_mounts=True, skip_workdir=True)
+        with admin_policy_utils.apply_and_use_config_in_current_request(
+                dag,
+                request_name=request_names.AdminPolicyRequestName.VALIDATE,
+                request_options=validate_body.get_request_options()) as dag:
+            dag.resolve_and_validate_volumes()
+            # Skip validating workdir and file_mounts, as those need to be
+            # validated after the files are uploaded to the SkyPilot API server
+            # with `upload_mounts_to_api_server`.
+            dag.validate(skip_file_mounts=True, skip_workdir=True)
     try:
         dag = dag_utils.load_chain_dag_from_yaml_str(validate_body.dag)
-        loop = asyncio.get_running_loop()
         # Apply admin policy and validate DAG is blocking, run it in a separate
         # thread executor to avoid blocking the uvicorn event loop.
-        await loop.run_in_executor(None, validate_dag, dag)
+        await context_utils.to_thread(validate_dag, dag)
     except Exception as e:  # pylint: disable=broad-except
         raise fastapi.HTTPException(
             status_code=400, detail=exceptions.serialize_exception(e)) from e
@@ -320,9 +878,9 @@ async def validate(validate_body: payloads.ValidateBody) -> None:
 async def optimize(optimize_body: payloads.OptimizeBody,
                    request: fastapi.Request) -> None:
     """Optimizes the user's DAG."""
-    executor.schedule_request(
+    await executor.schedule_request_async(
         request_id=request.state.request_id,
-        request_name='optimize',
+        request_name=request_names.RequestName.OPTIMIZE,
         request_body=optimize_body,
         ignore_return_value=True,
         func=core.optimize,
@@ -350,16 +908,30 @@ async def upload_zip_file(request: fastapi.Request, user_hash: str,
         chunk_index: The chunk index, starting from 0.
         total_chunks: The total number of chunks.
     """
+    # Field _body would be set if the request body has been received, fail fast
+    # to surface potential memory issues, i.e. catch the issue in our smoke
+    # test.
+    # pylint: disable=protected-access
+    if hasattr(request, '_body'):
+        raise fastapi.HTTPException(
+            status_code=500,
+            detail='Upload request body should not be received before streaming'
+        )
     # Add the upload id to the cleanup list.
     upload_ids_to_cleanup[(upload_id,
                            user_hash)] = (datetime.datetime.now() +
                                           _DEFAULT_UPLOAD_EXPIRATION_TIME)
+    # For anonymous access, use the user hash from client
+    user_id = user_hash
+    if request.state.auth_user is not None:
+        # Otherwise, the authenticated identity should be used.
+        user_id = request.state.auth_user.id
     # TODO(SKY-1271): We need to double check security of uploading zip file.
     client_file_mounts_dir = (
-        common.API_SERVER_CLIENT_DIR.expanduser().resolve() / user_hash /
+        common.API_SERVER_CLIENT_DIR.expanduser().resolve() / user_id /
         'file_mounts')
-    client_file_mounts_dir.mkdir(parents=True, exist_ok=True)
+    await anyio.Path(client_file_mounts_dir).mkdir(parents=True, exist_ok=True)
     # Check upload_id to be a valid SkyPilot run_timestamp appended with 8 hex
     # characters, e.g. 'sky-2025-01-17-09-10-13-933602-35d31c22'.
@@ -382,7 +954,7 @@ async def upload_zip_file(request: fastapi.Request, user_hash: str,
         zip_file_path = client_file_mounts_dir / f'{upload_id}.zip'
     else:
         chunk_dir = client_file_mounts_dir / upload_id
-        chunk_dir.mkdir(parents=True, exist_ok=True)
+        await anyio.Path(chunk_dir).mkdir(parents=True, exist_ok=True)
         zip_file_path = chunk_dir / f'part{chunk_index}.incomplete'
     try:
@@ -412,8 +984,9 @@ async def upload_zip_file(request: fastapi.Request, user_hash: str,
         zip_file_path.rename(zip_file_path.with_suffix(''))
         missing_chunks = get_missing_chunks(total_chunks)
         if missing_chunks:
-            return payloads.UploadZipFileResponse(status='uploading',
-                                                  missing_chunks=missing_chunks)
+            return payloads.UploadZipFileResponse(
+                status=responses.UploadStatus.UPLOADING.value,
+                missing_chunks=missing_chunks)
         zip_file_path = client_file_mounts_dir / f'{upload_id}.zip'
         async with aiofiles.open(zip_file_path, 'wb') as zip_file:
             for chunk in range(total_chunks):
@@ -427,10 +1000,11 @@ async def upload_zip_file(request: fastapi.Request, user_hash: str,
                         await zip_file.write(data)
     logger.info(f'Uploaded zip file: {zip_file_path}')
-    unzip_file(zip_file_path, client_file_mounts_dir)
+    await unzip_file(zip_file_path, client_file_mounts_dir)
     if total_chunks > 1:
-        shutil.rmtree(chunk_dir)
-    return payloads.UploadZipFileResponse(status='completed')
+        await context_utils.to_thread(shutil.rmtree, chunk_dir)
+    return payloads.UploadZipFileResponse(
+        status=responses.UploadStatus.COMPLETED.value)
 def _is_relative_to(path: pathlib.Path, parent: pathlib.Path) -> bool:
@@ -443,61 +1017,69 @@ def _is_relative_to(path: pathlib.Path, parent: pathlib.Path) -> bool:
         return False
-def unzip_file(zip_file_path: pathlib.Path,
-               client_file_mounts_dir: pathlib.Path) -> None:
-    """Unzips a zip file."""
-    try:
-        with zipfile.ZipFile(zip_file_path, 'r') as zipf:
-            for member in zipf.infolist():
-                # Determine the new path
-                original_path = os.path.normpath(member.filename)
-                new_path = client_file_mounts_dir / original_path.lstrip('/')
-                if (member.external_attr >> 28) == 0xA:
-                    # Symlink. Read the target path and create a symlink.
+async def unzip_file(zip_file_path: pathlib.Path,
+                     client_file_mounts_dir: pathlib.Path) -> None:
+    """Unzips a zip file without blocking the event loop."""
+    def _do_unzip() -> None:
+        try:
+            with zipfile.ZipFile(zip_file_path, 'r') as zipf:
+                for member in zipf.infolist():
+                    # Determine the new path
+                    original_path = os.path.normpath(member.filename)
+                    new_path = client_file_mounts_dir / original_path.lstrip(
+                        '/')
+                    if (member.external_attr >> 28) == 0xA:
+                        # Symlink. Read the target path and create a symlink.
+                        new_path.parent.mkdir(parents=True, exist_ok=True)
+                        target = zipf.read(member).decode()
+                        assert not os.path.isabs(target), target
+                        # Since target is a relative path, we need to check that
+                        # it is under `client_file_mounts_dir` for security.
+                        full_target_path = (new_path.parent / target).resolve()
+                        if not _is_relative_to(full_target_path,
+                                               client_file_mounts_dir):
+                            raise ValueError(
+                                f'Symlink target {target} leads to a '
+                                'file not in userspace. Aborted.')
+                        if new_path.exists() or new_path.is_symlink():
+                            new_path.unlink(missing_ok=True)
+                        new_path.symlink_to(
+                            target,
+                            target_is_directory=member.filename.endswith('/'))
+                        continue
+                    # Handle directories
+                    if member.filename.endswith('/'):
+                        new_path.mkdir(parents=True, exist_ok=True)
+                        continue
+                    # Handle files
                     new_path.parent.mkdir(parents=True, exist_ok=True)
-                    target = zipf.read(member).decode()
-                    assert not os.path.isabs(target), target
-                    # Since target is a relative path, we need to check that it
-                    # is under `client_file_mounts_dir` for security.
-                    full_target_path = (new_path.parent / target).resolve()
-                    if not _is_relative_to(full_target_path,
-                                           client_file_mounts_dir):
-                        raise ValueError(f'Symlink target {target} leads to a '
-                                         'file not in userspace. Aborted.')
-                    if new_path.exists() or new_path.is_symlink():
-                        new_path.unlink(missing_ok=True)
-                    new_path.symlink_to(
-                        target,
-                        target_is_directory=member.filename.endswith('/'))
-                    continue
-                # Handle directories
-                if member.filename.endswith('/'):
-                    new_path.mkdir(parents=True, exist_ok=True)
-                    continue
-                # Handle files
-                new_path.parent.mkdir(parents=True, exist_ok=True)
-                with zipf.open(member) as member_file, new_path.open('wb') as f:
-                    # Use shutil.copyfileobj to copy files in chunks, so it does
-                    # not load the entire file into memory.
-                    shutil.copyfileobj(member_file, f)
-    except zipfile.BadZipFile as e:
-        logger.error(f'Bad zip file: {zip_file_path}')
-        raise fastapi.HTTPException(
-            status_code=400,
-            detail=f'Invalid zip file: {common_utils.format_exception(e)}')
-    except Exception as e:
-        logger.error(f'Error unzipping file: {zip_file_path}')
-        raise fastapi.HTTPException(
-            status_code=500,
-            detail=(f'Error unzipping file: '
-                    f'{common_utils.format_exception(e)}'))
+                    with zipf.open(member) as member_file, new_path.open(
+                            'wb') as f:
+                        # Use shutil.copyfileobj to copy files in chunks,
+                        # so it does not load the entire file into memory.
+                        shutil.copyfileobj(member_file, f)
+        except zipfile.BadZipFile as e:
+            logger.error(f'Bad zip file: {zip_file_path}')
+            raise fastapi.HTTPException(
+                status_code=400,
+                detail=f'Invalid zip file: {common_utils.format_exception(e)}')
+        except Exception as e:
+            logger.error(f'Error unzipping file: {zip_file_path}')
+            raise fastapi.HTTPException(
+                status_code=500,
+                detail=(f'Error unzipping file: '
+                        f'{common_utils.format_exception(e)}'))
+        finally:
+            # Cleanup the temporary file regardless of
+            # success/failure handling above
+            zip_file_path.unlink(missing_ok=True)
-    # Cleanup the temporary file
-    zip_file_path.unlink()
+    await context_utils.to_thread(_do_unzip)
 @app.post('/launch')
@@ -506,13 +1088,14 @@ async def launch(launch_body: payloads.LaunchBody,
     """Launches a cluster or task."""
     request_id = request.state.request_id
     logger.info(f'Launching request: {request_id}')
-    executor.schedule_request(
+    await executor.schedule_request_async(
         request_id,
-        request_name='launch',
+        request_name=request_names.RequestName.CLUSTER_LAUNCH,
         request_body=launch_body,
         func=execution.launch,
         schedule_type=requests_lib.ScheduleType.LONG,
         request_cluster_name=launch_body.cluster_name,
+        retryable=launch_body.retry_until_up,
     )
@@ -521,9 +1104,9 @@ async def launch(launch_body: payloads.LaunchBody,
 async def exec(request: fastapi.Request, exec_body: payloads.ExecBody) -> None:
     """Executes a task on an existing cluster."""
     cluster_name = exec_body.cluster_name
-    executor.schedule_request(
+    await executor.schedule_request_async(
         request_id=request.state.request_id,
-        request_name='exec',
+        request_name=request_names.RequestName.CLUSTER_EXEC,
         request_body=exec_body,
         func=execution.exec,
         precondition=preconditions.ClusterStartCompletePrecondition(
@@ -539,9 +1122,9 @@ async def exec(request: fastapi.Request, exec_body: payloads.ExecBody) -> None:
 async def stop(request: fastapi.Request,
                stop_body: payloads.StopOrDownBody) -> None:
     """Stops a cluster."""
-    executor.schedule_request(
+    await executor.schedule_request_async(
         request_id=request.state.request_id,
-        request_name='stop',
+        request_name=request_names.RequestName.CLUSTER_STOP,
         request_body=stop_body,
         func=core.stop,
         schedule_type=requests_lib.ScheduleType.SHORT,
@@ -555,9 +1138,13 @@ async def status(
     status_body: payloads.StatusBody = payloads.StatusBody()
 ) -> None:
     """Gets cluster statuses."""
-    executor.schedule_request(
+    if state.get_block_requests():
+        raise fastapi.HTTPException(
+            status_code=503,
+            detail='Server is shutting down, please try again later.')
+    await executor.schedule_request_async(
         request_id=request.state.request_id,
-        request_name='status',
+        request_name=request_names.RequestName.CLUSTER_STATUS,
         request_body=status_body,
         func=core.status,
         schedule_type=(requests_lib.ScheduleType.LONG if
@@ -570,9 +1157,9 @@ async def status(
 async def endpoints(request: fastapi.Request,
                     endpoint_body: payloads.EndpointsBody) -> None:
     """Gets the endpoint for a given cluster and port number (endpoint)."""
-    executor.schedule_request(
+    await executor.schedule_request_async(
         request_id=request.state.request_id,
-        request_name='endpoints',
+        request_name=request_names.RequestName.CLUSTER_ENDPOINTS,
         request_body=endpoint_body,
         func=core.endpoints,
         schedule_type=requests_lib.ScheduleType.SHORT,
@@ -584,9 +1171,9 @@ async def endpoints(request: fastapi.Request,
 async def down(request: fastapi.Request,
                down_body: payloads.StopOrDownBody) -> None:
     """Tears down a cluster."""
-    executor.schedule_request(
+    await executor.schedule_request_async(
         request_id=request.state.request_id,
-        request_name='down',
+        request_name=request_names.RequestName.CLUSTER_DOWN,
         request_body=down_body,
         func=core.down,
         schedule_type=requests_lib.ScheduleType.SHORT,
@@ -598,9 +1185,9 @@ async def down(request: fastapi.Request,
 async def start(request: fastapi.Request,
                 start_body: payloads.StartBody) -> None:
     """Restarts a cluster."""
-    executor.schedule_request(
+    await executor.schedule_request_async(
         request_id=request.state.request_id,
-        request_name='start',
+        request_name=request_names.RequestName.CLUSTER_START,
         request_body=start_body,
         func=core.start,
         schedule_type=requests_lib.ScheduleType.LONG,
@@ -612,9 +1199,9 @@ async def start(request: fastapi.Request,
 async def autostop(request: fastapi.Request,
                    autostop_body: payloads.AutostopBody) -> None:
     """Schedules an autostop/autodown for a cluster."""
-    executor.schedule_request(
+    await executor.schedule_request_async(
         request_id=request.state.request_id,
-        request_name='autostop',
+        request_name=request_names.RequestName.CLUSTER_AUTOSTOP,
         request_body=autostop_body,
         func=core.autostop,
         schedule_type=requests_lib.ScheduleType.SHORT,
@@ -626,9 +1213,9 @@ async def autostop(request: fastapi.Request,
 async def queue(request: fastapi.Request,
                 queue_body: payloads.QueueBody) -> None:
     """Gets the job queue of a cluster."""
-    executor.schedule_request(
+    await executor.schedule_request_async(
         request_id=request.state.request_id,
-        request_name='queue',
+        request_name=request_names.RequestName.CLUSTER_QUEUE,
         request_body=queue_body,
         func=core.queue,
         schedule_type=requests_lib.ScheduleType.SHORT,
@@ -640,9 +1227,9 @@ async def queue(request: fastapi.Request,
 async def job_status(request: fastapi.Request,
                      job_status_body: payloads.JobStatusBody) -> None:
     """Gets the status of a job."""
-    executor.schedule_request(
+    await executor.schedule_request_async(
         request_id=request.state.request_id,
-        request_name='job_status',
+        request_name=request_names.RequestName.CLUSTER_JOB_STATUS,
         request_body=job_status_body,
         func=core.job_status,
         schedule_type=requests_lib.ScheduleType.SHORT,
@@ -654,9 +1241,9 @@ async def job_status(request: fastapi.Request,
 async def cancel(request: fastapi.Request,
                  cancel_body: payloads.CancelBody) -> None:
     """Cancels jobs on a cluster."""
-    executor.schedule_request(
+    await executor.schedule_request_async(
         request_id=request.state.request_id,
-        request_name='cancel',
+        request_name=request_names.RequestName.CLUSTER_JOB_CANCEL,
         request_body=cancel_body,
         func=core.cancel,
         schedule_type=requests_lib.ScheduleType.SHORT,
@@ -673,36 +1260,27 @@ async def logs(
     # TODO(zhwu): This should wait for the request on the cluster, e.g., async
     # launch, to finish, so that a user does not need to manually pull the
     # request status.
-    executor.schedule_request(
+    executor.check_request_thread_executor_available()
+    request_task = await executor.prepare_request_async(
         request_id=request.state.request_id,
-        request_name='logs',
+        request_name=request_names.RequestName.CLUSTER_JOB_LOGS,
         request_body=cluster_job_body,
         func=core.tail_logs,
-        # TODO(aylei): We have tail logs scheduled as SHORT request, because it
-        # should be responsive. However, it can be long running if the user's
-        # job keeps running, and we should avoid it taking the SHORT worker.
         schedule_type=requests_lib.ScheduleType.SHORT,
         request_cluster_name=cluster_job_body.cluster_name,
     )
-    request_task = requests_lib.get_request(request.state.request_id)
+    task = executor.execute_request_in_coroutine(request_task)
+    background_tasks.add_task(task.cancel)
     # TODO(zhwu): This makes viewing logs in browser impossible. We should adopt
     # the same approach as /stream.
-    return stream_utils.stream_response(
-        request_id=request_task.request_id,
+    return stream_utils.stream_response_for_long_request(
+        request_id=request.state.request_id,
         logs_path=request_task.log_path,
         background_tasks=background_tasks,
+        kill_request_on_disconnect=False,
     )
-@app.get('/users')
-async def users() -> List[Dict[str, Any]]:
-    """Gets all users."""
-    user_list = global_user_state.get_all_users()
-    return [user.to_dict() for user in user_list]
 @app.post('/download_logs')
 async def download_logs(
         request: fastapi.Request,
@@ -714,9 +1292,9 @@ async def download_logs(
     # We should reuse the original request body, so that the env vars, such as
     # user hash, are kept the same.
     cluster_jobs_body.local_dir = str(logs_dir_on_api_server)
-    executor.schedule_request(
+    await executor.schedule_request_async(
         request_id=request.state.request_id,
-        request_name='download_logs',
+        request_name=request_names.RequestName.CLUSTER_JOB_DOWNLOAD_LOGS,
         request_body=cluster_jobs_body,
         func=core.download_logs,
         schedule_type=requests_lib.ScheduleType.SHORT,
@@ -725,7 +1303,8 @@ async def download_logs(
 @app.post('/download')
-async def download(download_body: payloads.DownloadBody) -> None:
+async def download(download_body: payloads.DownloadBody,
+                   request: fastapi.Request) -> None:
     """Downloads a folder from the cluster to the local machine."""
     folder_paths = [
         pathlib.Path(folder_path) for folder_path in download_body.folder_paths
@@ -750,11 +1329,25 @@ async def download(download_body: payloads.DownloadBody) -> None:
         logs_dir_on_api_server).expanduser().resolve() / zip_filename
     try:
-        folders = [
-            str(folder_path.expanduser().resolve())
-            for folder_path in folder_paths
-        ]
-        storage_utils.zip_files_and_folders(folders, zip_path)
+        def _zip_files_and_folders(folder_paths, zip_path):
+            folders = [
+                str(folder_path.expanduser().resolve())
+                for folder_path in folder_paths
+            ]
+            # Check for optional query parameter to control zip entry structure
+            relative = request.query_params.get('relative', 'home')
+            if relative == 'items':
+                # Dashboard-friendly: entries relative to selected folders
+                storage_utils.zip_files_and_folders(folders,
+                                                    zip_path,
+                                                    relative_to_items=True)
+            else:
+                # CLI-friendly (default): entries with full paths for mapping
+                storage_utils.zip_files_and_folders(folders, zip_path)
+        await context_utils.to_thread(_zip_files_and_folders, folder_paths,
+                                      zip_path)
         # Add home path to the response headers, so that the client can replace
         # the remote path in the zip file to the local path.
@@ -776,13 +1369,84 @@ async def download(download_body: payloads.DownloadBody) -> None:
                                     detail=f'Error creating zip file: {str(e)}')
-@app.get('/cost_report')
-async def cost_report(request: fastapi.Request) -> None:
+# TODO(aylei): run it asynchronously after global_user_state support async op
+@app.post('/provision_logs')
+def provision_logs(provision_logs_body: payloads.ProvisionLogsBody,
+                   follow: bool = True,
+                   tail: int = 0) -> fastapi.responses.StreamingResponse:
+    """Streams the provision.log for the latest launch request of a cluster.
+
+    Args:
+        provision_logs_body: Request body carrying `cluster_name` and an
+            optional `worker` index. When `worker` is None the head node
+            log is streamed; otherwise `worker` indexes into the cluster
+            handle's `instance_ids` (which includes the head node).
+        follow: Forwarded to the log streamer as its `follow` argument.
+        tail: Number of trailing lines to stream. 0 (or any non-positive
+            value) means all lines.
+
+    Returns:
+        A plain-text streaming response over the selected log path.
+
+    Raises:
+        fastapi.HTTPException: 404 if no provision log path or cluster
+            handle is recorded, or if the recorded head node log path does
+            not exist; 400 if instance IDs are not recorded or `worker` is
+            out of range.
+    """
+    log_path = None
+    cluster_name = provision_logs_body.cluster_name
+    worker = provision_logs_body.worker
+    # stream head node logs
+    if worker is None:
+        # Prefer clusters table first, then cluster_history as fallback.
+        log_path_str = global_user_state.get_cluster_provision_log_path(
+            cluster_name)
+        if not log_path_str:
+            log_path_str = (
+                global_user_state.get_cluster_history_provision_log_path(
+                    cluster_name))
+        if not log_path_str:
+            raise fastapi.HTTPException(
+                status_code=404,
+                detail=('Provision log path is not recorded for this cluster. '
+                        'Please relaunch to generate provisioning logs.'))
+        log_path = pathlib.Path(log_path_str).expanduser().resolve()
+        if not log_path.exists():
+            raise fastapi.HTTPException(
+                status_code=404,
+                detail=f'Provision log path does not exist: {str(log_path)}')
+    # stream worker node logs
+    else:
+        handle = global_user_state.get_handle_from_cluster_name(cluster_name)
+        if handle is None:
+            raise fastapi.HTTPException(
+                status_code=404,
+                detail=('Cluster handle is not recorded for this cluster. '
+                        'Please relaunch to generate provisioning logs.'))
+        # instance_ids includes head node
+        instance_ids = handle.instance_ids
+        if instance_ids is None:
+            raise fastapi.HTTPException(
+                status_code=400,
+                detail='Instance IDs are not recorded for this cluster. '
+                'Please relaunch to generate provisioning logs.')
+        # Reject negative indices explicitly: Python would otherwise index
+        # from the end of the list and silently stream another node's log.
+        if not 0 <= worker < len(instance_ids):
+            raise fastapi.HTTPException(
+                status_code=400,
+                detail=f'Worker {worker} is out of range. '
+                f'The cluster has {len(instance_ids)} nodes.')
+        # NOTE(review): unlike the head node branch, the existence of this
+        # path is not checked here - confirm the streamer handles a missing
+        # path gracefully.
+        log_path = metadata_utils.get_instance_log_dir(
+            handle.get_cluster_name_on_cloud(), instance_ids[worker])
+    # Tail semantics: 0 means print all lines. Convert 0 -> None for streamer.
+    effective_tail = None if tail is None or tail <= 0 else tail
+    return fastapi.responses.StreamingResponse(
+        content=stream_utils.log_streamer(None,
+                                          log_path,
+                                          tail=effective_tail,
+                                          follow=follow,
+                                          cluster_name=cluster_name),
+        media_type='text/plain',
+        headers={
+            # Disable caching/buffering in intermediaries so that log lines
+            # reach the client as they are produced.
+            'Cache-Control': 'no-cache, no-transform',
+            'X-Accel-Buffering': 'no',
+            'Transfer-Encoding': 'chunked',
+        },
+    )
+@app.post('/cost_report')
+async def cost_report(request: fastapi.Request,
+                      cost_report_body: payloads.CostReportBody) -> None:
     """Gets the cost report of a cluster."""
-    executor.schedule_request(
+    await executor.schedule_request_async(
         request_id=request.state.request_id,
-        request_name='cost_report',
-        request_body=payloads.RequestBody(),
+        request_name=request_names.RequestName.CLUSTER_COST_REPORT,
+        request_body=cost_report_body,
         func=core.cost_report,
         schedule_type=requests_lib.ScheduleType.SHORT,
     )
@@ -791,9 +1455,9 @@ async def cost_report(request: fastapi.Request) -> None:
 @app.get('/storage/ls')
 async def storage_ls(request: fastapi.Request) -> None:
     """Gets the storages."""
-    executor.schedule_request(
+    await executor.schedule_request_async(
         request_id=request.state.request_id,
-        request_name='storage_ls',
+        request_name=request_names.RequestName.STORAGE_LS,
         request_body=payloads.RequestBody(),
         func=core.storage_ls,
         schedule_type=requests_lib.ScheduleType.SHORT,
@@ -804,9 +1468,9 @@ async def storage_ls(request: fastapi.Request) -> None:
 async def storage_delete(request: fastapi.Request,
                          storage_body: payloads.StorageBody) -> None:
     """Deletes a storage."""
-    executor.schedule_request(
+    await executor.schedule_request_async(
         request_id=request.state.request_id,
-        request_name='storage_delete',
+        request_name=request_names.RequestName.STORAGE_DELETE,
         request_body=storage_body,
         func=core.storage_delete,
         schedule_type=requests_lib.ScheduleType.LONG,
@@ -817,9 +1481,9 @@ async def storage_delete(request: fastapi.Request,
 async def local_up(request: fastapi.Request,
                    local_up_body: payloads.LocalUpBody) -> None:
     """Launches a Kubernetes cluster on API server."""
-    executor.schedule_request(
+    await executor.schedule_request_async(
         request_id=request.state.request_id,
-        request_name='local_up',
+        request_name=request_names.RequestName.LOCAL_UP,
         request_body=local_up_body,
         func=core.local_up,
         schedule_type=requests_lib.ScheduleType.LONG,
@@ -827,37 +1491,65 @@ async def local_up(request: fastapi.Request,
 @app.post('/local_down')
-async def local_down(request: fastapi.Request) -> None:
+async def local_down(request: fastapi.Request,
+                     local_down_body: payloads.LocalDownBody) -> None:
     """Tears down the Kubernetes cluster started by local_up."""
-    executor.schedule_request(
+    await executor.schedule_request_async(
         request_id=request.state.request_id,
-        request_name='local_down',
-        request_body=payloads.RequestBody(),
+        request_name=request_names.RequestName.LOCAL_DOWN,
+        request_body=local_down_body,
         func=core.local_down,
         schedule_type=requests_lib.ScheduleType.LONG,
     )
+async def get_expanded_request_id(request_id: str) -> str:
+    """Gets the expanded request ID for a given request ID prefix.
+
+    Args:
+        request_id: A request ID prefix to look up.
+
+    Returns:
+        The `request_id` of the single request matching the prefix.
+
+    Raises:
+        fastapi.HTTPException: 404 if no request matches the prefix; 400 if
+            the prefix matches more than one request.
+    """
+    request_tasks = await requests_lib.get_requests_async_with_prefix(
+        request_id, fields=['request_id'])
+    # Treat an empty result the same as None so that the indexing below can
+    # never raise IndexError (which would surface as an opaque 500).
+    if not request_tasks:
+        raise fastapi.HTTPException(status_code=404,
+                                    detail=f'Request {request_id!r} not found')
+    if len(request_tasks) > 1:
+        raise fastapi.HTTPException(status_code=400,
+                                    detail=('Multiple requests found for '
+                                            f'request ID prefix: {request_id}'))
+    return request_tasks[0].request_id
 # === API server related APIs ===
-@app.get('/api/get')
-async def api_get(request_id: str) -> requests_lib.RequestPayload:
+@app.get('/api/get', response_class=fastapi_responses.ORJSONResponse)
+async def api_get(request_id: str) -> payloads.RequestPayload:
     """Gets a request with a given request ID prefix."""
+    # Validate request_id prefix matches a single request.
+    request_id = await get_expanded_request_id(request_id)
     while True:
-        request_task = requests_lib.get_request(request_id)
-        if request_task is None:
+        req_status = await requests_lib.get_request_status_async(request_id)
+        if req_status is None:
             print(f'No task with request ID {request_id}', flush=True)
             raise fastapi.HTTPException(
                 status_code=404, detail=f'Request {request_id!r} not found')
-        if request_task.status > requests_lib.RequestStatus.RUNNING:
-            request_error = request_task.get_error()
-            if request_error is not None:
-                raise fastapi.HTTPException(status_code=500,
-                                            detail=dataclasses.asdict(
-                                                request_task.encode()))
-            return request_task.encode()
+        if (req_status.status == requests_lib.RequestStatus.RUNNING and
+                daemons.is_daemon_request_id(request_id)):
+            # Daemon requests run forever, break without waiting for complete.
+            break
+        if req_status.status > requests_lib.RequestStatus.RUNNING:
+            break
         # yield control to allow other coroutines to run, sleep shortly
         # to avoid storming the DB and CPU in the meantime
         await asyncio.sleep(0.1)
+    request_task = await requests_lib.get_request_async(request_id)
+    # TODO(aylei): refine this, /api/get will not be retried and this is
+    # meaningless to retry. It is the original request that should be retried.
+    if request_task.should_retry:
+        raise fastapi.HTTPException(
+            status_code=503, detail=f'Request {request_id!r} should be retried')
+    request_error = request_task.get_error()
+    if request_error is not None:
+        raise fastapi.HTTPException(status_code=500,
+                                    detail=request_task.encode().model_dump())
+    return request_task.encode()
 @app.get('/api/stream')
@@ -891,13 +1583,18 @@ async def stream(
             clients, console for CLI/API clients), 'plain' (force plain text),
             'html' (force HTML), or 'console' (force console)
     """
+    # We need to save the user-supplied request ID for the response header.
+    user_supplied_request_id = request_id
     if request_id is not None and log_path is not None:
         raise fastapi.HTTPException(
             status_code=400,
             detail='Only one of request_id and log_path can be provided')
+    if request_id is not None:
+        request_id = await get_expanded_request_id(request_id)
     if request_id is None and log_path is None:
-        request_id = requests_lib.get_latest_request_id()
+        request_id = await requests_lib.get_latest_request_id_async()
         if request_id is None:
             raise fastapi.HTTPException(status_code=404,
                                         detail='No request found')
@@ -924,19 +1621,40 @@ async def stream(
                 'X-Accel-Buffering': 'no'
             })
+    polling_interval = stream_utils.DEFAULT_POLL_INTERVAL
     # Original plain text streaming logic
     if request_id is not None:
-        request_task = requests_lib.get_request(request_id)
+        request_task = await requests_lib.get_request_async(
+            request_id, fields=['request_id', 'schedule_type'])
         if request_task is None:
             print(f'No task with request ID {request_id}')
             raise fastapi.HTTPException(
                 status_code=404, detail=f'Request {request_id!r} not found')
+        # req.log_path is derived from request_id,
+        # so it's ok to just grab the request_id in the above query.
         log_path_to_stream = request_task.log_path
+        if not log_path_to_stream.exists():
+            # The log file might be deleted by the request GC daemon but the
+            # request task is still in the database.
+            raise fastapi.HTTPException(
+                status_code=404,
+                detail=f'Log of request {request_id!r} has been deleted')
+        if request_task.schedule_type == requests_lib.ScheduleType.LONG:
+            polling_interval = stream_utils.LONG_REQUEST_POLL_INTERVAL
+        del request_task
     else:
         assert log_path is not None, (request_id, log_path)
         if log_path == constants.API_SERVER_LOGS:
             resolved_log_path = pathlib.Path(
                 constants.API_SERVER_LOGS).expanduser()
+            if not resolved_log_path.exists():
+                raise fastapi.HTTPException(
+                    status_code=404,
+                    detail='Server log file does not exist. The API server may '
+                    'have been started with `--foreground` - check the '
+                    'stdout of API server process, such as: '
+                    '`kubectl logs -n api-server-namespace '
+                    'api-server-pod-name`')
         else:
             # This should be a log path under ~/sky_logs.
             resolved_logs_directory = pathlib.Path(
@@ -957,18 +1675,26 @@ async def stream(
                     detail=f'Log path {log_path!r} does not exist')
         log_path_to_stream = resolved_log_path
+    headers = {
+        'Cache-Control': 'no-cache, no-transform',
+        'X-Accel-Buffering': 'no',
+        'Transfer-Encoding': 'chunked'
+    }
+    if request_id is not None:
+        headers[server_constants.STREAM_REQUEST_HEADER] = (
+            user_supplied_request_id
+            if user_supplied_request_id else request_id)
     return fastapi.responses.StreamingResponse(
         content=stream_utils.log_streamer(request_id,
                                           log_path_to_stream,
                                           plain_logs=format == 'plain',
                                           tail=tail,
-                                          follow=follow),
+                                          follow=follow,
+                                          polling_interval=polling_interval),
         media_type='text/plain',
-        headers={
-            'Cache-Control': 'no-cache, no-transform',
-            'X-Accel-Buffering': 'no',
-            'Transfer-Encoding': 'chunked'
-        },
+        headers=headers,
     )
@@ -976,11 +1702,11 @@ async def stream(
 async def api_cancel(request: fastapi.Request,
                      request_cancel_body: payloads.RequestCancelBody) -> None:
     """Cancels requests."""
-    executor.schedule_request(
+    await executor.schedule_request_async(
         request_id=request.state.request_id,
-        request_name='api_cancel',
+        request_name=request_names.RequestName.API_CANCEL,
         request_body=request_cancel_body,
-        func=requests_lib.kill_requests,
+        func=requests_lib.kill_requests_with_prefix,
         schedule_type=requests_lib.ScheduleType.SHORT,
     )
@@ -988,10 +1714,14 @@ async def api_cancel(request: fastapi.Request,
 @app.get('/api/status')
 async def api_status(
     request_ids: Optional[List[str]] = fastapi.Query(
-        None, description='Request IDs to get status for.'),
+        None, description='Request ID prefixes to get status for.'),
     all_status: bool = fastapi.Query(
         False, description='Get finished requests as well.'),
-) -> List[requests_lib.RequestPayload]:
+    limit: Optional[int] = fastapi.Query(
+        None, description='Number of requests to show.'),
+    fields: Optional[List[str]] = fastapi.Query(
+        None, description='Fields to get. If None, get all fields.'),
+) -> List[payloads.RequestPayload]:
     """Gets the list of requests."""
     if request_ids is None:
         statuses = None
@@ -1000,53 +1730,120 @@ async def api_status(
                 requests_lib.RequestStatus.PENDING,
                 requests_lib.RequestStatus.RUNNING,
             ]
-        return [
-            request_task.readable_encode()
-            for request_task in requests_lib.get_request_tasks(status=statuses)
-        ]
+        request_tasks = await requests_lib.get_request_tasks_async(
+            req_filter=requests_lib.RequestTaskFilter(
+                status=statuses,
+                limit=limit,
+                fields=fields,
+                sort=True,
+            ))
+        return requests_lib.encode_requests(request_tasks)
     else:
         encoded_request_tasks = []
         for request_id in request_ids:
-            request_task = requests_lib.get_request(request_id)
-            if request_task is None:
+            request_tasks = await requests_lib.get_requests_async_with_prefix(
+                request_id)
+            if request_tasks is None:
                 continue
-            encoded_request_tasks.append(request_task.readable_encode())
+            for request_task in request_tasks:
+                encoded_request_tasks.append(request_task.readable_encode())
         return encoded_request_tasks
-@app.get('/api/health')
-async def health() -> Dict[str, str]:
+@app.get(
+    '/api/health',
+    # response_model_exclude_unset omits unset fields
+    # in the response JSON.
+    response_model_exclude_unset=True)
+async def health(request: fastapi.Request) -> responses.APIHealthResponse:
     """Checks the health of the API server.
     Returns:
-        A dictionary with the following keys:
-        - status: str; The status of the API server.
-        - api_version: str; The API version of the API server.
-        - version: str; The version of SkyPilot used for API server.
-        - version_on_disk: str; The version of the SkyPilot installation on
-          disk, which can be used to warn about restarting the API server
-        - commit: str; The commit hash of SkyPilot used for API server.
+        responses.APIHealthResponse: The health response.
     """
-    return {
-        'status': common.ApiServerStatus.HEALTHY.value,
-        'api_version': server_constants.API_VERSION,
-        'version': sky.__version__,
-        'version_on_disk': common.get_skypilot_version_on_disk(),
-        'commit': sky.__commit__,
-    }
+    user = request.state.auth_user
+    server_status = common.ApiServerStatus.HEALTHY
+    if getattr(request.state, 'anonymous_user', False):
+        # API server authentication is enabled, but the request is not
+        # authenticated. We still have to serve the request because the
+        # /api/health endpoint has two different usages:
+        # 1. For health check from `api start` and external orchestration
+        #    tools (k8s), which does not require authentication and user info.
+        # 2. Return server info to client and hint client to login if required.
+        # Separating these two usages into different APIs will break backward
+        # compatibility for existing orchestration solutions (e.g. helm chart).
+        # So we serve these two usages in a backward compatible manner below.
+        client_version = versions.get_remote_api_version()
+        # - For Client with API version >= 14, we return 200 response with
+        #   status=NEEDS_AUTH, new client will handle the login process.
+        # - For health check from `sky api start`, the client code always uses
+        #   the same API version with the server, thus there is no compatibility
+        #   issue.
+        server_status = common.ApiServerStatus.NEEDS_AUTH
+        if client_version is None:
+            # - For health check from orchestration tools (e.g. k8s), we also
+            #   return 200 with status=NEEDS_AUTH, which passes HTTP probe
+            #   check.
+            # - There is no harm when a malicious client calls /api/health
+            #   without authentication since no sensitive information is
+            #   returned.
+            return responses.APIHealthResponse(
+                status=common.ApiServerStatus.HEALTHY,)
+        # TODO(aylei): remove this after min_compatible_api_version >= 14.
+        if client_version < 14:
+            # For Client with API version < 14, the NEEDS_AUTH status is not
+            # honored. Return 401 to trigger the login process.
+            raise fastapi.HTTPException(status_code=401,
+                                        detail='Authentication required')
+    logger.debug(f'Health endpoint: request.state.auth_user = {user}')
+    return responses.APIHealthResponse(
+        status=server_status,
+        # Kept for backward compatibility, clients before 0.11.0 will read this
+        # field to check compatibility and hint the user to upgrade the CLI.
+        # TODO(aylei): remove this field after 0.13.0
+        api_version=str(server_constants.API_VERSION),
+        version=sky.__version__,
+        version_on_disk=common.get_skypilot_version_on_disk(),
+        commit=sky.__commit__,
+        # Whether basic auth on api server is enabled
+        basic_auth_enabled=os.environ.get(constants.ENV_VAR_ENABLE_BASIC_AUTH,
+                                          'false').lower() == 'true',
+        user=user if user is not None else None,
+        # Whether service account token is enabled
+        service_account_token_enabled=(os.environ.get(
+            constants.ENV_VAR_ENABLE_SERVICE_ACCOUNTS,
+            'false').lower() == 'true'),
+        # Whether basic auth on ingress is enabled
+        ingress_basic_auth_enabled=os.environ.get(
+            constants.SKYPILOT_INGRESS_BASIC_AUTH_ENABLED,
+            'false').lower() == 'true',
+    )
+class KubernetesSSHMessageType(IntEnum):
+    REGULAR_DATA = 0
+    PINGPONG = 1
+    LATENCY_MEASUREMENT = 2
 @app.websocket('/kubernetes-pod-ssh-proxy')
 async def kubernetes_pod_ssh_proxy(
-    websocket: fastapi.WebSocket,
-    cluster_name_body: payloads.ClusterNameBody = fastapi.Depends()
-) -> None:
+        websocket: fastapi.WebSocket,
+        cluster_name: str,
+        client_version: Optional[int] = None) -> None:
     """Proxies SSH to the Kubernetes pod with websocket."""
     await websocket.accept()
-    cluster_name = cluster_name_body.cluster_name
     logger.info(f'WebSocket connection accepted for cluster: {cluster_name}')
-    cluster_records = core.status(cluster_name, all_users=True)
+    timestamps_supported = client_version is not None and client_version > 21
+    logger.info(f'Websocket timestamps supported: {timestamps_supported}, \
+        client_version = {client_version}')
+    # Run core.status in another thread to avoid blocking the event loop.
+    with ThreadPoolExecutor(max_workers=1) as thread_pool_executor:
+        cluster_records = await context_utils.to_thread_with_executor(
+            thread_pool_executor, core.status, cluster_name, all_users=True)
     cluster_record = cluster_records[0]
     if cluster_record['status'] != status_lib.ClusterStatus.UP:
         raise fastapi.HTTPException(
@@ -1085,17 +1882,70 @@ async def kubernetes_pod_ssh_proxy(
             return
     logger.info(f'Starting port-forward to local port: {local_port}')
+    conn_gauge = metrics_utils.SKY_APISERVER_WEBSOCKET_CONNECTIONS.labels(
+        pid=os.getpid())
+    ssh_failed = False
+    websocket_closed = False
     try:
+        conn_gauge.inc()
         # Connect to the local port
         reader, writer = await asyncio.open_connection('127.0.0.1', local_port)
         async def websocket_to_ssh():
             try:
                 async for message in websocket.iter_bytes():
+                    if timestamps_supported:
+                        type_size = struct.calcsize('!B')
+                        message_type = struct.unpack('!B',
+                                                     message[:type_size])[0]
+                        if (message_type ==
+                                KubernetesSSHMessageType.REGULAR_DATA):
+                            # Regular data - strip type byte and forward to SSH
+                            message = message[type_size:]
+                        elif message_type == KubernetesSSHMessageType.PINGPONG:
+                            # PING message - respond with PONG (type 1)
+                            ping_id_size = struct.calcsize('!I')
+                            if len(message) != type_size + ping_id_size:
+                                raise ValueError('Invalid PING message '
+                                                 f'length: {len(message)}')
+                            # Return the same PING message, so that the client
+                            # can measure the latency.
+                            await websocket.send_bytes(message)
+                            continue
+                        elif (message_type ==
+                              KubernetesSSHMessageType.LATENCY_MEASUREMENT):
+                            # Latency measurement from client
+                            latency_size = struct.calcsize('!Q')
+                            if len(message) != type_size + latency_size:
+                                raise ValueError(
+                                    'Invalid latency measurement '
+                                    f'message length: {len(message)}')
+                            avg_latency_ms = struct.unpack(
+                                '!Q',
+                                message[type_size:type_size + latency_size])[0]
+                            latency_seconds = avg_latency_ms / 1000
+                            metrics_utils.SKY_APISERVER_WEBSOCKET_SSH_LATENCY_SECONDS.labels(pid=os.getpid()).observe(latency_seconds)  # pylint: disable=line-too-long
+                            continue
+                        else:
+                            # Unknown message type.
+                            raise ValueError(
+                                f'Unknown message type: {message_type}')
                     writer.write(message)
-                    await writer.drain()
+                    try:
+                        await writer.drain()
+                    except Exception as e:  # pylint: disable=broad-except
+                        # Typically we will not reach here, if the ssh to pod
+                        # is disconnected, ssh_to_websocket will exit first.
+                        # But just in case.
+                        logger.error('Failed to write to pod through '
+                                     f'port-forward connection: {e}')
+                        nonlocal ssh_failed
+                        ssh_failed = True
+                        break
             except fastapi.WebSocketDisconnect:
                 pass
+            nonlocal websocket_closed
+            websocket_closed = True
             writer.close()
         async def ssh_to_websocket():
@@ -1103,87 +1953,249 @@ async def kubernetes_pod_ssh_proxy(
                 while True:
                     data = await reader.read(1024)
                     if not data:
+                        if not websocket_closed:
+                            logger.warning('SSH connection to pod is '
+                                           'disconnected before websocket '
+                                           'connection is closed')
+                            nonlocal ssh_failed
+                            ssh_failed = True
                         break
+                    if timestamps_supported:
+                        # Prepend message type byte (0 = regular data)
+                        message_type_bytes = struct.pack(
+                            '!B', KubernetesSSHMessageType.REGULAR_DATA.value)
+                        data = message_type_bytes + data
                     await websocket.send_bytes(data)
             except Exception:  # pylint: disable=broad-except
                 pass
-            await websocket.close()
+            try:
+                await websocket.close()
+            except Exception:  # pylint: disable=broad-except
+                # The websocket might have been closed by the client.
+                pass
         await asyncio.gather(websocket_to_ssh(), ssh_to_websocket())
     finally:
-        proc.terminate()
+        conn_gauge.dec()
+        reason = ''
+        try:
+            logger.info('Terminating kubectl port-forward process')
+            proc.terminate()
+        except ProcessLookupError:
+            stdout = await proc.stdout.read()
+            logger.error('kubectl port-forward was terminated before the '
+                         'ssh websocket connection was closed. Remaining '
+                         f'output: {str(stdout)}')
+            reason = 'KubectlPortForwardExit'
+            metrics_utils.SKY_APISERVER_WEBSOCKET_CLOSED_TOTAL.labels(
+                pid=os.getpid(), reason='KubectlPortForwardExit').inc()
+        else:
+            if ssh_failed:
+                reason = 'SSHToPodDisconnected'
+            else:
+                reason = 'ClientClosed'
+        metrics_utils.SKY_APISERVER_WEBSOCKET_CLOSED_TOTAL.labels(
+            pid=os.getpid(), reason=reason).inc()
+@app.get('/all_contexts')
+async def all_contexts(request: fastapi.Request) -> None:
+    """Gets all Kubernetes and SSH node pool contexts."""
+    await executor.schedule_request_async(
+        request_id=request.state.request_id,
+        request_name=request_names.RequestName.ALL_CONTEXTS,
+        request_body=payloads.RequestBody(),
+        func=core.get_all_contexts,
+        schedule_type=requests_lib.ScheduleType.SHORT,
+    )
 # === Internal APIs ===
 @app.get('/api/completion/cluster_name')
 async def complete_cluster_name(incomplete: str,) -> List[str]:
-    return global_user_state.get_cluster_names_start_with(incomplete)
+    return await context_utils.to_thread(
+        global_user_state.get_cluster_names_start_with, incomplete)
 @app.get('/api/completion/storage_name')
 async def complete_storage_name(incomplete: str,) -> List[str]:
-    return global_user_state.get_storage_names_start_with(incomplete)
+    return await context_utils.to_thread(
+        global_user_state.get_storage_names_start_with, incomplete)
-# Add a route to serve static files
-@app.get('/{full_path:path}')
-async def serve_static_or_dashboard(full_path: str):
-    """Serves static files for any unmatched routes.
+@app.get('/api/completion/volume_name')
+async def complete_volume_name(incomplete: str,) -> List[str]:
+    return await context_utils.to_thread(
+        global_user_state.get_volume_names_start_with, incomplete)
-    Handles the /dashboard prefix from Next.js configuration.
-    """
-    # Check if the path starts with 'dashboard/' and remove it if it does
-    if full_path.startswith('dashboard/'):
-        full_path = full_path[len('dashboard/'):]
-    # Try to serve the file directly from the out directory first
+@app.get('/api/completion/api_request')
+async def complete_api_request(incomplete: str,) -> List[str]:
+    return await requests_lib.get_api_request_ids_start_with(incomplete)
+@app.get('/dashboard/{full_path:path}')
+async def serve_dashboard(full_path: str):
+    """Serves the Next.js dashboard application.
+    Args:
+        full_path: The path requested by the client.
+        e.g. /clusters, /jobs
+    Returns:
+        FileResponse for static files or index.html for client-side routing.
+    Raises:
+        HTTPException: If the path is invalid or file not found.
+    """
+    # Try to serve the static file directly e.g. /skypilot.svg,
+    # /favicon.ico, and /_next/, etc.
     file_path = os.path.join(server_constants.DASHBOARD_DIR, full_path)
     if os.path.isfile(file_path):
         return fastapi.responses.FileResponse(file_path)
-    # If file not found, serve the index.html for client-side routing.
-    # For example, the non-matched arbitrary route (/ or /test) from
-    # client will be redirected to the index.html.
+    # Serve index.html for client-side routing
+    # e.g. /clusters, /jobs
     index_path = os.path.join(server_constants.DASHBOARD_DIR, 'index.html')
     try:
         with open(index_path, 'r', encoding='utf-8') as f:
             content = f.read()
         return fastapi.responses.HTMLResponse(content=content)
     except Exception as e:
         logger.error(f'Error serving dashboard: {e}')
         raise fastapi.HTTPException(status_code=500, detail=str(e))
+# Redirect the root path to dashboard
+@app.get('/')
+async def root():
+    return fastapi.responses.RedirectResponse(url='/dashboard/')
+def _init_or_restore_server_user_hash():
+    """Restores the server user hash from the global user state db.
+    The API server must have a stable user hash across restarts and potential
+    multiple replicas. Thus we persist the user hash in db and restore it on
+    startup. When upgrading from old version, the user hash will be read from
+    the local file (if any) to keep the user hash consistent.
+    """
+    def apply_user_hash(user_hash: str) -> None:
+        # For local API server, the user hash in db and local file should be
+        # same so there is no harm to override here.
+        common_utils.set_user_hash_locally(user_hash)
+        # Refresh the server user hash for current process after restore or
+        # initialize the user hash in db, child processes will get the correct
+        # server id from the local cache file.
+        common_lib.refresh_server_id()
+    user_hash = global_user_state.get_system_config(_SERVER_USER_HASH_KEY)
+    if user_hash is not None:
+        apply_user_hash(user_hash)
+        return
+    # Initial deployment, generate a user hash and save it to the db.
+    user_hash = common_utils.get_user_hash()
+    global_user_state.set_system_config(_SERVER_USER_HASH_KEY, user_hash)
+    apply_user_hash(user_hash)
 if __name__ == '__main__':
     import uvicorn
     from sky.server import uvicorn as skyuvicorn
-    requests_lib.reset_db_and_logs()
+    logger.info('Initializing SkyPilot API server')
+    skyuvicorn.add_timestamp_prefix_for_server_logs()
     parser = argparse.ArgumentParser()
     parser.add_argument('--host', default='127.0.0.1')
     parser.add_argument('--port', default=46580, type=int)
     parser.add_argument('--deploy', action='store_true')
+    # Serve metrics on a separate port to isolate it from the application APIs:
+    # metrics port will not be exposed to the public network typically.
+    parser.add_argument('--metrics-port', default=9090, type=int)
     cmd_args = parser.parse_args()
+    if cmd_args.port == cmd_args.metrics_port:
+        logger.error('port and metrics-port cannot be the same, exiting.')
+        raise ValueError('port and metrics-port cannot be the same')
+    # Fail fast if the port is not available to avoid corrupting the state
+    # of a potentially running server instance.
+    # We might reach here because the running server is currently not
+    # responding, thus the healthz check fails and `sky api start` thinks
+    # we should start a new server instance.
+    if not common_utils.is_port_available(cmd_args.port):
+        logger.error(f'Port {cmd_args.port} is not available, exiting.')
+        raise RuntimeError(f'Port {cmd_args.port} is not available')
+    # Maybe touch the signal file on API server startup. Do it again here even
+    # if we already touched it in the sky/server/common.py::_start_api_server.
+    # This is because the sky/server/common.py::_start_api_server function call
+    # is running outside the skypilot API server process tree. The process tree
+    # starts within that function (see the `subprocess.Popen` call in
+    # sky/server/common.py::_start_api_server). When pg is used, the
+    # _start_api_server function will not load the config file from db, which
+    # will ignore the consolidation mode config. Here, inside the process tree,
+    # we already reload the config as a server (with env var _start_api_server),
+    # so we will respect the consolidation mode config.
+    # Refer to #7717 for more details.
+    managed_job_utils.is_consolidation_mode(on_api_restart=True)
     # Show the privacy policy if it is not already shown. We place it here so
     # that it is shown only when the API server is started.
     usage_lib.maybe_show_privacy_policy()
-    config = server_config.compute_server_config(cmd_args.deploy)
+    # Initialize global user state db
+    db_utils.set_max_connections(1)
+    logger.info('Initializing database engine')
+    global_user_state.initialize_and_get_db()
+    logger.info('Database engine initialized')
+    # Initialize request db
+    requests_lib.reset_db_and_logs()
+    # Restore the server user hash
+    logger.info('Initializing server user hash')
+    _init_or_restore_server_user_hash()
+    max_db_connections = global_user_state.get_max_db_connections()
+    logger.info(f'Max db connections: {max_db_connections}')
+    config = server_config.compute_server_config(cmd_args.deploy,
+                                                 max_db_connections)
     num_workers = config.num_server_workers
-    sub_procs = []
+    queue_server: Optional[multiprocessing.Process] = None
+    workers: List[executor.RequestWorker] = []
+    # Global background tasks that will be scheduled in a separate event loop.
+    global_tasks: List[asyncio.Task] = []
     try:
-        sub_procs = executor.start(config)
+        background = uvloop.new_event_loop()
+        if os.environ.get(constants.ENV_VAR_SERVER_METRICS_ENABLED):
+            metrics_server = metrics.build_metrics_server(
+                cmd_args.host, cmd_args.metrics_port)
+            global_tasks.append(background.create_task(metrics_server.serve()))
+        global_tasks.append(
+            background.create_task(requests_lib.requests_gc_daemon()))
+        global_tasks.append(
+            background.create_task(
+                global_user_state.cluster_event_retention_daemon()))
+        threading.Thread(target=background.run_forever, daemon=True).start()
+        queue_server, workers = executor.start(config)
         logger.info(f'Starting SkyPilot API server, workers={num_workers}')
         # We don't support reload for now, since it may cause leakage of request
         # workers or interrupt running requests.
-        config = uvicorn.Config('sky.server.server:app',
-                                host=cmd_args.host,
-                                port=cmd_args.port,
-                                workers=num_workers)
-        skyuvicorn.run(config)
+        uvicorn_config = uvicorn.Config('sky.server.server:app',
+                                        host=cmd_args.host,
+                                        port=cmd_args.port,
+                                        workers=num_workers,
+                                        ws_per_message_deflate=False)
+        skyuvicorn.run(uvicorn_config,
+                       max_db_connections=config.num_db_connections_per_worker)
     except Exception as exc:  # pylint: disable=broad-except
         logger.error(f'Failed to start SkyPilot API server: '
                      f'{common_utils.format_exception(exc, use_bracket=True)}')
@@ -1191,17 +2203,11 @@ if __name__ == '__main__':
     finally:
         logger.info('Shutting down SkyPilot API server...')
-        def cleanup(proc: multiprocessing.Process) -> None:
-            try:
-                proc.terminate()
-                proc.join()
-            finally:
-                # The process may not be started yet, close it anyway.
-                proc.close()
-        # Terminate processes in reverse order in case dependency, especially
-        # queue server. Terminate queue server first does not affect the
-        # correctness of cleanup but introduce redundant error messages.
-        subprocess_utils.run_in_parallel(cleanup,
-                                         list(reversed(sub_procs)),
-                                         num_threads=len(sub_procs))
+        for gt in global_tasks:
+            gt.cancel()
+        subprocess_utils.run_in_parallel(lambda worker: worker.cancel(),
+                                         workers,
+                                         num_threads=len(workers))
+        if queue_server is not None:
+            queue_server.kill()
+            queue_server.join()

skypilot-nightly 1.0.0.dev20250509__py3-none-any.whl → 1.0.0.dev20251107__py3-none-any.whl

Potentially problematic release.

skypilot-nightly 1.0.0.dev20250509__py3-none-any.whl → 1.0.0.dev20251107__py3-none-any.whl