skypilot-nightly 1.0.0.dev20250905__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +10 -2
- sky/adaptors/aws.py +81 -16
- sky/adaptors/common.py +25 -2
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/do.py +8 -2
- sky/adaptors/gcp.py +11 -0
- sky/adaptors/ibm.py +5 -2
- sky/adaptors/kubernetes.py +64 -0
- sky/adaptors/nebius.py +3 -1
- sky/adaptors/primeintellect.py +1 -0
- sky/adaptors/seeweb.py +183 -0
- sky/adaptors/shadeform.py +89 -0
- sky/admin_policy.py +20 -0
- sky/authentication.py +157 -263
- sky/backends/__init__.py +3 -2
- sky/backends/backend.py +11 -3
- sky/backends/backend_utils.py +588 -184
- sky/backends/cloud_vm_ray_backend.py +1088 -904
- sky/backends/local_docker_backend.py +9 -5
- sky/backends/task_codegen.py +633 -0
- sky/backends/wheel_utils.py +18 -0
- sky/catalog/__init__.py +8 -0
- sky/catalog/aws_catalog.py +4 -0
- sky/catalog/common.py +19 -1
- sky/catalog/data_fetchers/fetch_aws.py +102 -80
- sky/catalog/data_fetchers/fetch_gcp.py +30 -3
- sky/catalog/data_fetchers/fetch_nebius.py +9 -6
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/catalog/kubernetes_catalog.py +24 -28
- sky/catalog/primeintellect_catalog.py +95 -0
- sky/catalog/runpod_catalog.py +5 -1
- sky/catalog/seeweb_catalog.py +184 -0
- sky/catalog/shadeform_catalog.py +165 -0
- sky/check.py +73 -43
- sky/client/cli/command.py +675 -412
- sky/client/cli/flags.py +4 -2
- sky/{volumes/utils.py → client/cli/table_utils.py} +111 -13
- sky/client/cli/utils.py +79 -0
- sky/client/common.py +12 -2
- sky/client/sdk.py +132 -63
- sky/client/sdk_async.py +34 -33
- sky/cloud_stores.py +82 -3
- sky/clouds/__init__.py +6 -0
- sky/clouds/aws.py +337 -129
- sky/clouds/azure.py +24 -18
- sky/clouds/cloud.py +40 -13
- sky/clouds/cudo.py +16 -13
- sky/clouds/do.py +9 -7
- sky/clouds/fluidstack.py +12 -5
- sky/clouds/gcp.py +14 -7
- sky/clouds/hyperbolic.py +12 -5
- sky/clouds/ibm.py +12 -5
- sky/clouds/kubernetes.py +80 -45
- sky/clouds/lambda_cloud.py +12 -5
- sky/clouds/nebius.py +23 -9
- sky/clouds/oci.py +19 -12
- sky/clouds/paperspace.py +4 -1
- sky/clouds/primeintellect.py +317 -0
- sky/clouds/runpod.py +85 -24
- sky/clouds/scp.py +12 -8
- sky/clouds/seeweb.py +477 -0
- sky/clouds/shadeform.py +400 -0
- sky/clouds/ssh.py +4 -2
- sky/clouds/utils/scp_utils.py +61 -50
- sky/clouds/vast.py +33 -27
- sky/clouds/vsphere.py +14 -16
- sky/core.py +174 -165
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
- sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
- sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
- sky/dashboard/out/_next/static/chunks/{6601-06114c982db410b6.js → 3800-7b45f9fbb6308557.js} +1 -1
- sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
- sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
- sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
- sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
- sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
- sky/dashboard/out/_next/static/chunks/pages/{_app-ce361c6959bc2001.js → _app-bde01e4a2beec258.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-6563820e094f68ca.js → [context]-c0b5935149902e6f.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{infra-aabba60d57826e0f.js → infra-aed0ea19df7cf961.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-af76bb06dbb3954f.js → [name]-84a40f8c7c627fe4.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{workspaces-7598c33a746cdc91.js → workspaces-531b2f8c4bf89f82.js} +1 -1
- sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
- sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/data_utils.py +92 -1
- sky/data/mounting_utils.py +162 -29
- sky/data/storage.py +200 -19
- sky/data/storage_utils.py +10 -45
- sky/exceptions.py +18 -7
- sky/execution.py +74 -31
- sky/global_user_state.py +605 -191
- sky/jobs/__init__.py +2 -0
- sky/jobs/client/sdk.py +101 -4
- sky/jobs/client/sdk_async.py +31 -5
- sky/jobs/constants.py +15 -8
- sky/jobs/controller.py +726 -284
- sky/jobs/file_content_utils.py +128 -0
- sky/jobs/log_gc.py +193 -0
- sky/jobs/recovery_strategy.py +250 -100
- sky/jobs/scheduler.py +271 -173
- sky/jobs/server/core.py +367 -114
- sky/jobs/server/server.py +81 -35
- sky/jobs/server/utils.py +89 -35
- sky/jobs/state.py +1498 -620
- sky/jobs/utils.py +771 -306
- sky/logs/agent.py +40 -5
- sky/logs/aws.py +9 -19
- sky/metrics/utils.py +282 -39
- sky/optimizer.py +1 -1
- sky/provision/__init__.py +37 -1
- sky/provision/aws/config.py +34 -13
- sky/provision/aws/instance.py +5 -2
- sky/provision/azure/instance.py +5 -3
- sky/provision/common.py +2 -0
- sky/provision/cudo/instance.py +4 -3
- sky/provision/do/instance.py +4 -3
- sky/provision/docker_utils.py +97 -26
- sky/provision/fluidstack/instance.py +6 -5
- sky/provision/gcp/config.py +6 -1
- sky/provision/gcp/instance.py +4 -2
- sky/provision/hyperbolic/instance.py +4 -2
- sky/provision/instance_setup.py +66 -20
- sky/provision/kubernetes/__init__.py +2 -0
- sky/provision/kubernetes/config.py +7 -44
- sky/provision/kubernetes/constants.py +0 -1
- sky/provision/kubernetes/instance.py +609 -213
- sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
- sky/provision/kubernetes/network.py +12 -8
- sky/provision/kubernetes/network_utils.py +8 -25
- sky/provision/kubernetes/utils.py +382 -418
- sky/provision/kubernetes/volume.py +150 -18
- sky/provision/lambda_cloud/instance.py +16 -13
- sky/provision/nebius/instance.py +6 -2
- sky/provision/nebius/utils.py +103 -86
- sky/provision/oci/instance.py +4 -2
- sky/provision/paperspace/instance.py +4 -3
- sky/provision/primeintellect/__init__.py +10 -0
- sky/provision/primeintellect/config.py +11 -0
- sky/provision/primeintellect/instance.py +454 -0
- sky/provision/primeintellect/utils.py +398 -0
- sky/provision/provisioner.py +30 -9
- sky/provision/runpod/__init__.py +2 -0
- sky/provision/runpod/instance.py +4 -3
- sky/provision/runpod/volume.py +69 -13
- sky/provision/scp/instance.py +307 -130
- sky/provision/seeweb/__init__.py +11 -0
- sky/provision/seeweb/config.py +13 -0
- sky/provision/seeweb/instance.py +812 -0
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/vast/instance.py +5 -3
- sky/provision/volume.py +164 -0
- sky/provision/vsphere/common/ssl_helper.py +1 -1
- sky/provision/vsphere/common/vapiconnect.py +2 -1
- sky/provision/vsphere/common/vim_utils.py +3 -2
- sky/provision/vsphere/instance.py +8 -6
- sky/provision/vsphere/vsphere_utils.py +8 -1
- sky/resources.py +11 -3
- sky/schemas/api/responses.py +107 -6
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
- sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
- sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
- sky/schemas/db/serve_state/002_yaml_content.py +34 -0
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/002_cluster_pool.py +3 -3
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
- sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
- sky/schemas/generated/jobsv1_pb2.py +86 -0
- sky/schemas/generated/jobsv1_pb2.pyi +254 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
- sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
- sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
- sky/schemas/generated/servev1_pb2.py +58 -0
- sky/schemas/generated/servev1_pb2.pyi +115 -0
- sky/schemas/generated/servev1_pb2_grpc.py +322 -0
- sky/serve/autoscalers.py +2 -0
- sky/serve/client/impl.py +55 -21
- sky/serve/constants.py +4 -3
- sky/serve/controller.py +17 -11
- sky/serve/load_balancing_policies.py +1 -1
- sky/serve/replica_managers.py +219 -142
- sky/serve/serve_rpc_utils.py +179 -0
- sky/serve/serve_state.py +63 -54
- sky/serve/serve_utils.py +145 -109
- sky/serve/server/core.py +46 -25
- sky/serve/server/impl.py +311 -162
- sky/serve/server/server.py +21 -19
- sky/serve/service.py +84 -68
- sky/serve/service_spec.py +45 -7
- sky/server/auth/loopback.py +38 -0
- sky/server/auth/oauth2_proxy.py +12 -7
- sky/server/common.py +47 -24
- sky/server/config.py +62 -28
- sky/server/constants.py +9 -1
- sky/server/daemons.py +109 -38
- sky/server/metrics.py +76 -96
- sky/server/middleware_utils.py +166 -0
- sky/server/requests/executor.py +381 -145
- sky/server/requests/payloads.py +71 -18
- sky/server/requests/preconditions.py +15 -13
- sky/server/requests/request_names.py +121 -0
- sky/server/requests/requests.py +507 -157
- sky/server/requests/serializers/decoders.py +48 -17
- sky/server/requests/serializers/encoders.py +85 -20
- sky/server/requests/threads.py +117 -0
- sky/server/rest.py +116 -24
- sky/server/server.py +420 -172
- sky/server/stream_utils.py +219 -45
- sky/server/uvicorn.py +30 -19
- sky/setup_files/MANIFEST.in +6 -1
- sky/setup_files/alembic.ini +8 -0
- sky/setup_files/dependencies.py +62 -19
- sky/setup_files/setup.py +44 -44
- sky/sky_logging.py +13 -5
- sky/skylet/attempt_skylet.py +106 -24
- sky/skylet/configs.py +3 -1
- sky/skylet/constants.py +111 -26
- sky/skylet/events.py +64 -10
- sky/skylet/job_lib.py +141 -104
- sky/skylet/log_lib.py +233 -5
- sky/skylet/log_lib.pyi +40 -2
- sky/skylet/providers/ibm/node_provider.py +12 -8
- sky/skylet/providers/ibm/vpc_provider.py +13 -12
- sky/skylet/runtime_utils.py +21 -0
- sky/skylet/services.py +524 -0
- sky/skylet/skylet.py +22 -1
- sky/skylet/subprocess_daemon.py +104 -29
- sky/skypilot_config.py +99 -79
- sky/ssh_node_pools/server.py +9 -8
- sky/task.py +221 -104
- sky/templates/aws-ray.yml.j2 +1 -0
- sky/templates/azure-ray.yml.j2 +1 -0
- sky/templates/cudo-ray.yml.j2 +1 -0
- sky/templates/do-ray.yml.j2 +1 -0
- sky/templates/fluidstack-ray.yml.j2 +1 -0
- sky/templates/gcp-ray.yml.j2 +1 -0
- sky/templates/hyperbolic-ray.yml.j2 +1 -0
- sky/templates/ibm-ray.yml.j2 +2 -1
- sky/templates/jobs-controller.yaml.j2 +3 -0
- sky/templates/kubernetes-ray.yml.j2 +196 -55
- sky/templates/lambda-ray.yml.j2 +1 -0
- sky/templates/nebius-ray.yml.j2 +3 -0
- sky/templates/oci-ray.yml.j2 +1 -0
- sky/templates/paperspace-ray.yml.j2 +1 -0
- sky/templates/primeintellect-ray.yml.j2 +72 -0
- sky/templates/runpod-ray.yml.j2 +1 -0
- sky/templates/scp-ray.yml.j2 +1 -0
- sky/templates/seeweb-ray.yml.j2 +171 -0
- sky/templates/shadeform-ray.yml.j2 +73 -0
- sky/templates/vast-ray.yml.j2 +1 -0
- sky/templates/vsphere-ray.yml.j2 +1 -0
- sky/templates/websocket_proxy.py +188 -43
- sky/usage/usage_lib.py +16 -4
- sky/users/permission.py +60 -43
- sky/utils/accelerator_registry.py +6 -3
- sky/utils/admin_policy_utils.py +18 -5
- sky/utils/annotations.py +22 -0
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/atomic.py +1 -1
- sky/utils/auth_utils.py +153 -0
- sky/utils/cli_utils/status_utils.py +12 -7
- sky/utils/cluster_utils.py +28 -6
- sky/utils/command_runner.py +88 -27
- sky/utils/command_runner.pyi +36 -3
- sky/utils/common.py +3 -1
- sky/utils/common_utils.py +37 -4
- sky/utils/config_utils.py +1 -14
- sky/utils/context.py +127 -40
- sky/utils/context_utils.py +73 -18
- sky/utils/controller_utils.py +229 -70
- sky/utils/db/db_utils.py +95 -18
- sky/utils/db/kv_cache.py +149 -0
- sky/utils/db/migration_utils.py +24 -7
- sky/utils/env_options.py +4 -0
- sky/utils/git.py +559 -1
- sky/utils/kubernetes/create_cluster.sh +15 -30
- sky/utils/kubernetes/delete_cluster.sh +10 -7
- sky/utils/kubernetes/{deploy_remote_cluster.py → deploy_ssh_node_pools.py} +258 -380
- sky/utils/kubernetes/generate_kind_config.py +6 -66
- sky/utils/kubernetes/gpu_labeler.py +13 -3
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
- sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
- sky/utils/kubernetes/kubernetes_deploy_utils.py +213 -194
- sky/utils/kubernetes/rsync_helper.sh +11 -3
- sky/utils/kubernetes_enums.py +7 -15
- sky/utils/lock_events.py +4 -4
- sky/utils/locks.py +128 -31
- sky/utils/log_utils.py +0 -319
- sky/utils/resource_checker.py +13 -10
- sky/utils/resources_utils.py +53 -29
- sky/utils/rich_utils.py +8 -4
- sky/utils/schemas.py +107 -52
- sky/utils/subprocess_utils.py +17 -4
- sky/utils/thread_utils.py +91 -0
- sky/utils/timeline.py +2 -1
- sky/utils/ux_utils.py +35 -1
- sky/utils/volume.py +88 -4
- sky/utils/yaml_utils.py +9 -0
- sky/volumes/client/sdk.py +48 -10
- sky/volumes/server/core.py +59 -22
- sky/volumes/server/server.py +46 -17
- sky/volumes/volume.py +54 -42
- sky/workspaces/core.py +57 -21
- sky/workspaces/server.py +13 -12
- sky_templates/README.md +3 -0
- sky_templates/__init__.py +3 -0
- sky_templates/ray/__init__.py +0 -0
- sky_templates/ray/start_cluster +183 -0
- sky_templates/ray/stop_cluster +75 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/METADATA +331 -65
- skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
- skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
- sky/client/cli/git.py +0 -549
- sky/dashboard/out/_next/static/chunks/1121-408ed10b2f9fce17.js +0 -1
- sky/dashboard/out/_next/static/chunks/1141-943efc7aff0f0c06.js +0 -1
- sky/dashboard/out/_next/static/chunks/1836-37fede578e2da5f8.js +0 -40
- sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +0 -1
- sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
- sky/dashboard/out/_next/static/chunks/3785.4872a2f3aa489880.js +0 -1
- sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +0 -21
- sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +0 -10
- sky/dashboard/out/_next/static/chunks/4725.10f7a9a5d3ea8208.js +0 -1
- sky/dashboard/out/_next/static/chunks/5339.3fda4a4010ff4e06.js +0 -51
- sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
- sky/dashboard/out/_next/static/chunks/649.b9d7f7d10c1b8c53.js +0 -45
- sky/dashboard/out/_next/static/chunks/6856-dca7962af4814e1b.js +0 -1
- sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
- sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +0 -6
- sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +0 -18
- sky/dashboard/out/_next/static/chunks/7669.1f5d9a402bf5cc42.js +0 -36
- sky/dashboard/out/_next/static/chunks/8969-0be3036bf86f8256.js +0 -1
- sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
- sky/dashboard/out/_next/static/chunks/9037-fa1737818d0a0969.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1cbba24bd1bd35f8.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-0b4b35dc1dfe046c.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters-469814d711d63b1b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-dd64309c3fe67ed2.js +0 -11
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-07349868f7905d37.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/jobs-1f70d9faa564804f.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/users-018bf31cda52e11b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/volumes-739726d6b823f532.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-4fe903277b57b523.js +0 -1
- sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
- sky/dashboard/out/_next/static/mS-4qZPSkRuA1u-g2wQhg/_buildManifest.js +0 -1
- sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
- skypilot_nightly-1.0.0.dev20250905.dist-info/RECORD +0 -547
- skypilot_nightly-1.0.0.dev20250905.dist-info/top_level.txt +0 -1
- /sky/dashboard/out/_next/static/{mS-4qZPSkRuA1u-g2wQhg → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
sky/server/server.py
CHANGED
|
@@ -3,8 +3,10 @@
|
|
|
3
3
|
import argparse
|
|
4
4
|
import asyncio
|
|
5
5
|
import base64
|
|
6
|
+
from concurrent.futures import ThreadPoolExecutor
|
|
6
7
|
import contextlib
|
|
7
8
|
import datetime
|
|
9
|
+
from enum import IntEnum
|
|
8
10
|
import hashlib
|
|
9
11
|
import json
|
|
10
12
|
import multiprocessing
|
|
@@ -14,8 +16,10 @@ import posixpath
|
|
|
14
16
|
import re
|
|
15
17
|
import resource
|
|
16
18
|
import shutil
|
|
19
|
+
import struct
|
|
17
20
|
import sys
|
|
18
21
|
import threading
|
|
22
|
+
import traceback
|
|
19
23
|
from typing import Dict, List, Literal, Optional, Set, Tuple
|
|
20
24
|
import uuid
|
|
21
25
|
import zipfile
|
|
@@ -23,6 +27,7 @@ import zipfile
|
|
|
23
27
|
import aiofiles
|
|
24
28
|
import anyio
|
|
25
29
|
import fastapi
|
|
30
|
+
from fastapi import responses as fastapi_responses
|
|
26
31
|
from fastapi.middleware import cors
|
|
27
32
|
import starlette.middleware.base
|
|
28
33
|
import uvloop
|
|
@@ -38,8 +43,10 @@ from sky import global_user_state
|
|
|
38
43
|
from sky import models
|
|
39
44
|
from sky import sky_logging
|
|
40
45
|
from sky.data import storage_utils
|
|
46
|
+
from sky.jobs import utils as managed_job_utils
|
|
41
47
|
from sky.jobs.server import server as jobs_rest
|
|
42
48
|
from sky.metrics import utils as metrics_utils
|
|
49
|
+
from sky.provision import metadata_utils
|
|
43
50
|
from sky.provision.kubernetes import utils as kubernetes_utils
|
|
44
51
|
from sky.schemas.api import responses
|
|
45
52
|
from sky.serve.server import server as serve_rest
|
|
@@ -48,14 +55,17 @@ from sky.server import config as server_config
|
|
|
48
55
|
from sky.server import constants as server_constants
|
|
49
56
|
from sky.server import daemons
|
|
50
57
|
from sky.server import metrics
|
|
58
|
+
from sky.server import middleware_utils
|
|
51
59
|
from sky.server import state
|
|
52
60
|
from sky.server import stream_utils
|
|
53
61
|
from sky.server import versions
|
|
54
62
|
from sky.server.auth import authn
|
|
63
|
+
from sky.server.auth import loopback
|
|
55
64
|
from sky.server.auth import oauth2_proxy
|
|
56
65
|
from sky.server.requests import executor
|
|
57
66
|
from sky.server.requests import payloads
|
|
58
67
|
from sky.server.requests import preconditions
|
|
68
|
+
from sky.server.requests import request_names
|
|
59
69
|
from sky.server.requests import requests as requests_lib
|
|
60
70
|
from sky.skylet import constants
|
|
61
71
|
from sky.ssh_node_pools import server as ssh_node_pools_rest
|
|
@@ -67,10 +77,13 @@ from sky.utils import common as common_lib
|
|
|
67
77
|
from sky.utils import common_utils
|
|
68
78
|
from sky.utils import context
|
|
69
79
|
from sky.utils import context_utils
|
|
80
|
+
from sky.utils import controller_utils
|
|
70
81
|
from sky.utils import dag_utils
|
|
82
|
+
from sky.utils import env_options
|
|
71
83
|
from sky.utils import perf_utils
|
|
72
84
|
from sky.utils import status_lib
|
|
73
85
|
from sky.utils import subprocess_utils
|
|
86
|
+
from sky.utils import ux_utils
|
|
74
87
|
from sky.utils.db import db_utils
|
|
75
88
|
from sky.volumes.server import server as volumes_rest
|
|
76
89
|
from sky.workspaces import server as workspaces_rest
|
|
@@ -128,6 +141,7 @@ def _try_set_basic_auth_user(request: fastapi.Request):
|
|
|
128
141
|
break
|
|
129
142
|
|
|
130
143
|
|
|
144
|
+
@middleware_utils.websocket_aware
|
|
131
145
|
class RBACMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
|
|
132
146
|
"""Middleware to handle RBAC."""
|
|
133
147
|
|
|
@@ -157,11 +171,9 @@ class RequestIDMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
|
|
|
157
171
|
"""Middleware to add a request ID to each request."""
|
|
158
172
|
|
|
159
173
|
async def dispatch(self, request: fastapi.Request, call_next):
|
|
160
|
-
request_id =
|
|
174
|
+
request_id = requests_lib.get_new_request_id()
|
|
161
175
|
request.state.request_id = request_id
|
|
162
176
|
response = await call_next(request)
|
|
163
|
-
# TODO(syang): remove X-Request-ID when v0.10.0 is released.
|
|
164
|
-
response.headers['X-Request-ID'] = request_id
|
|
165
177
|
response.headers['X-Skypilot-Request-ID'] = request_id
|
|
166
178
|
return response
|
|
167
179
|
|
|
@@ -177,6 +189,7 @@ def _get_auth_user_header(request: fastapi.Request) -> Optional[models.User]:
|
|
|
177
189
|
return models.User(id=user_hash, name=user_name)
|
|
178
190
|
|
|
179
191
|
|
|
192
|
+
@middleware_utils.websocket_aware
|
|
180
193
|
class InitializeRequestAuthUserMiddleware(
|
|
181
194
|
starlette.middleware.base.BaseHTTPMiddleware):
|
|
182
195
|
|
|
@@ -187,10 +200,15 @@ class InitializeRequestAuthUserMiddleware(
|
|
|
187
200
|
return await call_next(request)
|
|
188
201
|
|
|
189
202
|
|
|
203
|
+
@middleware_utils.websocket_aware
|
|
190
204
|
class BasicAuthMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
|
|
191
205
|
"""Middleware to handle HTTP Basic Auth."""
|
|
192
206
|
|
|
193
207
|
async def dispatch(self, request: fastapi.Request, call_next):
|
|
208
|
+
if managed_job_utils.is_consolidation_mode(
|
|
209
|
+
) and loopback.is_loopback_request(request):
|
|
210
|
+
return await call_next(request)
|
|
211
|
+
|
|
194
212
|
if request.url.path.startswith('/api/health'):
|
|
195
213
|
# Try to set the auth user from basic auth
|
|
196
214
|
_try_set_basic_auth_user(request)
|
|
@@ -234,6 +252,7 @@ class BasicAuthMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
|
|
|
234
252
|
return await call_next(request)
|
|
235
253
|
|
|
236
254
|
|
|
255
|
+
@middleware_utils.websocket_aware
|
|
237
256
|
class BearerTokenMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
|
|
238
257
|
"""Middleware to handle Bearer Token Auth (Service Accounts)."""
|
|
239
258
|
|
|
@@ -361,6 +380,7 @@ class BearerTokenMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
|
|
|
361
380
|
return await call_next(request)
|
|
362
381
|
|
|
363
382
|
|
|
383
|
+
@middleware_utils.websocket_aware
|
|
364
384
|
class AuthProxyMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
|
|
365
385
|
"""Middleware to handle auth proxy."""
|
|
366
386
|
|
|
@@ -437,7 +457,7 @@ async def loop_lag_monitor(loop: asyncio.AbstractEventLoop,
|
|
|
437
457
|
if lag_threshold is not None and lag > lag_threshold:
|
|
438
458
|
logger.warning(f'Event loop lag {lag} seconds exceeds threshold '
|
|
439
459
|
f'{lag_threshold} seconds.')
|
|
440
|
-
|
|
460
|
+
metrics_utils.SKY_APISERVER_EVENT_LOOP_LAG_SECONDS.labels(
|
|
441
461
|
pid=pid).observe(lag)
|
|
442
462
|
target = now + interval
|
|
443
463
|
loop.call_at(target, tick)
|
|
@@ -445,6 +465,22 @@ async def loop_lag_monitor(loop: asyncio.AbstractEventLoop,
|
|
|
445
465
|
loop.call_at(target, tick)
|
|
446
466
|
|
|
447
467
|
|
|
468
|
+
async def schedule_on_boot_check_async():
|
|
469
|
+
try:
|
|
470
|
+
await executor.schedule_request_async(
|
|
471
|
+
request_id='skypilot-server-on-boot-check',
|
|
472
|
+
request_name=request_names.RequestName.CHECK,
|
|
473
|
+
request_body=payloads.CheckBody(),
|
|
474
|
+
func=sky_check.check,
|
|
475
|
+
schedule_type=requests_lib.ScheduleType.SHORT,
|
|
476
|
+
is_skypilot_system=True,
|
|
477
|
+
)
|
|
478
|
+
except exceptions.RequestAlreadyExistsError:
|
|
479
|
+
# Lifespan will be executed in each uvicorn worker process, we
|
|
480
|
+
# can safely ignore the error if the task is already scheduled.
|
|
481
|
+
logger.debug('Request skypilot-server-on-boot-check already exists.')
|
|
482
|
+
|
|
483
|
+
|
|
448
484
|
@contextlib.asynccontextmanager
|
|
449
485
|
async def lifespan(app: fastapi.FastAPI): # pylint: disable=redefined-outer-name
|
|
450
486
|
"""FastAPI lifespan context manager."""
|
|
@@ -454,7 +490,7 @@ async def lifespan(app: fastapi.FastAPI): # pylint: disable=redefined-outer-nam
|
|
|
454
490
|
if event.should_skip():
|
|
455
491
|
continue
|
|
456
492
|
try:
|
|
457
|
-
executor.
|
|
493
|
+
await executor.schedule_request_async(
|
|
458
494
|
request_id=event.id,
|
|
459
495
|
request_name=event.name,
|
|
460
496
|
request_body=payloads.RequestBody(),
|
|
@@ -469,8 +505,9 @@ async def lifespan(app: fastapi.FastAPI): # pylint: disable=redefined-outer-nam
|
|
|
469
505
|
# Lifespan will be executed in each uvicorn worker process, we
|
|
470
506
|
# can safely ignore the error if the task is already scheduled.
|
|
471
507
|
logger.debug(f'Request {event.id} already exists.')
|
|
508
|
+
await schedule_on_boot_check_async()
|
|
472
509
|
asyncio.create_task(cleanup_upload_ids())
|
|
473
|
-
if
|
|
510
|
+
if metrics_utils.METRICS_ENABLED:
|
|
474
511
|
# Start monitoring the event loop lag in each server worker
|
|
475
512
|
# event loop (process).
|
|
476
513
|
asyncio.create_task(loop_lag_monitor(asyncio.get_event_loop()))
|
|
@@ -518,6 +555,7 @@ class PathCleanMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
|
|
|
518
555
|
return await call_next(request)
|
|
519
556
|
|
|
520
557
|
|
|
558
|
+
@middleware_utils.websocket_aware
|
|
521
559
|
class GracefulShutdownMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
|
|
522
560
|
"""Middleware to control requests when server is shutting down."""
|
|
523
561
|
|
|
@@ -537,6 +575,7 @@ class GracefulShutdownMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
|
|
|
537
575
|
return await call_next(request)
|
|
538
576
|
|
|
539
577
|
|
|
578
|
+
@middleware_utils.websocket_aware
|
|
540
579
|
class APIVersionMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
|
|
541
580
|
"""Middleware to add API version to the request."""
|
|
542
581
|
|
|
@@ -579,6 +618,9 @@ app = fastapi.FastAPI(prefix='/api/v1', debug=True, lifespan=lifespan)
|
|
|
579
618
|
if os.environ.get(constants.ENV_VAR_SERVER_METRICS_ENABLED):
|
|
580
619
|
app.add_middleware(metrics.PrometheusMiddleware)
|
|
581
620
|
app.add_middleware(APIVersionMiddleware)
|
|
621
|
+
# The order of all the authentication-related middleware is important.
|
|
622
|
+
# RBACMiddleware must precede all the auth middleware, so it can access
|
|
623
|
+
# request.state.auth_user.
|
|
582
624
|
app.add_middleware(RBACMiddleware)
|
|
583
625
|
app.add_middleware(InternalDashboardPrefixMiddleware)
|
|
584
626
|
app.add_middleware(GracefulShutdownMiddleware)
|
|
@@ -592,12 +634,7 @@ app.add_middleware(
|
|
|
592
634
|
allow_credentials=True,
|
|
593
635
|
allow_methods=['*'],
|
|
594
636
|
allow_headers=['*'],
|
|
595
|
-
|
|
596
|
-
expose_headers=['X-Request-ID', 'X-Skypilot-Request-ID'])
|
|
597
|
-
# The order of all the authentication-related middleware is important.
|
|
598
|
-
# RBACMiddleware must precede all the auth middleware, so it can access
|
|
599
|
-
# request.state.auth_user.
|
|
600
|
-
app.add_middleware(RBACMiddleware)
|
|
637
|
+
expose_headers=['X-Skypilot-Request-ID'])
|
|
601
638
|
# Authentication based on oauth2-proxy.
|
|
602
639
|
app.add_middleware(oauth2_proxy.OAuth2ProxyMiddleware)
|
|
603
640
|
# AuthProxyMiddleware should precede BasicAuthMiddleware and
|
|
@@ -625,16 +662,28 @@ app.include_router(volumes_rest.router, prefix='/volumes', tags=['volumes'])
|
|
|
625
662
|
app.include_router(ssh_node_pools_rest.router,
|
|
626
663
|
prefix='/ssh_node_pools',
|
|
627
664
|
tags=['ssh_node_pools'])
|
|
628
|
-
|
|
629
|
-
|
|
630
|
-
|
|
631
|
-
|
|
632
|
-
|
|
633
|
-
|
|
634
|
-
|
|
635
|
-
|
|
636
|
-
|
|
637
|
-
|
|
665
|
+
# increase the resource limit for the server
|
|
666
|
+
soft, hard = resource.getrlimit(resource.RLIMIT_NOFILE)
|
|
667
|
+
resource.setrlimit(resource.RLIMIT_NOFILE, (hard, hard))
|
|
668
|
+
|
|
669
|
+
|
|
670
|
+
@app.exception_handler(exceptions.ConcurrentWorkerExhaustedError)
|
|
671
|
+
def handle_concurrent_worker_exhausted_error(
|
|
672
|
+
request: fastapi.Request, e: exceptions.ConcurrentWorkerExhaustedError):
|
|
673
|
+
del request # request is not used
|
|
674
|
+
# Print detailed error message to server log
|
|
675
|
+
logger.error('Concurrent worker exhausted: '
|
|
676
|
+
f'{common_utils.format_exception(e)}')
|
|
677
|
+
with ux_utils.enable_traceback():
|
|
678
|
+
logger.error(f' Traceback: {traceback.format_exc()}')
|
|
679
|
+
# Return human readable error message to client
|
|
680
|
+
return fastapi.responses.JSONResponse(
|
|
681
|
+
status_code=503,
|
|
682
|
+
content={
|
|
683
|
+
'detail':
|
|
684
|
+
('The server has exhausted its concurrent worker limit. '
|
|
685
|
+
'Please try again or scale the server if the load persists.')
|
|
686
|
+
})
|
|
638
687
|
|
|
639
688
|
|
|
640
689
|
@app.get('/token')
|
|
@@ -680,9 +729,9 @@ async def token(request: fastapi.Request,
|
|
|
680
729
|
async def check(request: fastapi.Request,
|
|
681
730
|
check_body: payloads.CheckBody) -> None:
|
|
682
731
|
"""Checks enabled clouds."""
|
|
683
|
-
executor.
|
|
732
|
+
await executor.schedule_request_async(
|
|
684
733
|
request_id=request.state.request_id,
|
|
685
|
-
request_name=
|
|
734
|
+
request_name=request_names.RequestName.CHECK,
|
|
686
735
|
request_body=check_body,
|
|
687
736
|
func=sky_check.check,
|
|
688
737
|
schedule_type=requests_lib.ScheduleType.SHORT,
|
|
@@ -694,9 +743,9 @@ async def enabled_clouds(request: fastapi.Request,
|
|
|
694
743
|
workspace: Optional[str] = None,
|
|
695
744
|
expand: bool = False) -> None:
|
|
696
745
|
"""Gets enabled clouds on the server."""
|
|
697
|
-
executor.
|
|
746
|
+
await executor.schedule_request_async(
|
|
698
747
|
request_id=request.state.request_id,
|
|
699
|
-
request_name=
|
|
748
|
+
request_name=request_names.RequestName.ENABLED_CLOUDS,
|
|
700
749
|
request_body=payloads.EnabledCloudsBody(workspace=workspace,
|
|
701
750
|
expand=expand),
|
|
702
751
|
func=core.enabled_clouds,
|
|
@@ -710,9 +759,10 @@ async def realtime_kubernetes_gpu_availability(
|
|
|
710
759
|
realtime_gpu_availability_body: payloads.RealtimeGpuAvailabilityRequestBody
|
|
711
760
|
) -> None:
|
|
712
761
|
"""Gets real-time Kubernetes GPU availability."""
|
|
713
|
-
executor.
|
|
762
|
+
await executor.schedule_request_async(
|
|
714
763
|
request_id=request.state.request_id,
|
|
715
|
-
request_name=
|
|
764
|
+
request_name=request_names.RequestName.
|
|
765
|
+
REALTIME_KUBERNETES_GPU_AVAILABILITY,
|
|
716
766
|
request_body=realtime_gpu_availability_body,
|
|
717
767
|
func=core.realtime_kubernetes_gpu_availability,
|
|
718
768
|
schedule_type=requests_lib.ScheduleType.SHORT,
|
|
@@ -725,9 +775,9 @@ async def kubernetes_node_info(
|
|
|
725
775
|
kubernetes_node_info_body: payloads.KubernetesNodeInfoRequestBody
|
|
726
776
|
) -> None:
|
|
727
777
|
"""Gets Kubernetes nodes information and hints."""
|
|
728
|
-
executor.
|
|
778
|
+
await executor.schedule_request_async(
|
|
729
779
|
request_id=request.state.request_id,
|
|
730
|
-
request_name=
|
|
780
|
+
request_name=request_names.RequestName.KUBERNETES_NODE_INFO,
|
|
731
781
|
request_body=kubernetes_node_info_body,
|
|
732
782
|
func=kubernetes_utils.get_kubernetes_node_info,
|
|
733
783
|
schedule_type=requests_lib.ScheduleType.SHORT,
|
|
@@ -736,10 +786,11 @@ async def kubernetes_node_info(
|
|
|
736
786
|
|
|
737
787
|
@app.get('/status_kubernetes')
|
|
738
788
|
async def status_kubernetes(request: fastapi.Request) -> None:
|
|
739
|
-
"""
|
|
740
|
-
|
|
789
|
+
"""[Experimental] Get all SkyPilot resources (including from other '
|
|
790
|
+
'users) in the current Kubernetes context."""
|
|
791
|
+
await executor.schedule_request_async(
|
|
741
792
|
request_id=request.state.request_id,
|
|
742
|
-
request_name=
|
|
793
|
+
request_name=request_names.RequestName.STATUS_KUBERNETES,
|
|
743
794
|
request_body=payloads.RequestBody(),
|
|
744
795
|
func=core.status_kubernetes,
|
|
745
796
|
schedule_type=requests_lib.ScheduleType.SHORT,
|
|
@@ -751,9 +802,9 @@ async def list_accelerators(
|
|
|
751
802
|
request: fastapi.Request,
|
|
752
803
|
list_accelerator_counts_body: payloads.ListAcceleratorsBody) -> None:
|
|
753
804
|
"""Gets list of accelerators from cloud catalog."""
|
|
754
|
-
executor.
|
|
805
|
+
await executor.schedule_request_async(
|
|
755
806
|
request_id=request.state.request_id,
|
|
756
|
-
request_name=
|
|
807
|
+
request_name=request_names.RequestName.LIST_ACCELERATORS,
|
|
757
808
|
request_body=list_accelerator_counts_body,
|
|
758
809
|
func=catalog.list_accelerators,
|
|
759
810
|
schedule_type=requests_lib.ScheduleType.SHORT,
|
|
@@ -766,9 +817,9 @@ async def list_accelerator_counts(
|
|
|
766
817
|
list_accelerator_counts_body: payloads.ListAcceleratorCountsBody
|
|
767
818
|
) -> None:
|
|
768
819
|
"""Gets list of accelerator counts from cloud catalog."""
|
|
769
|
-
executor.
|
|
820
|
+
await executor.schedule_request_async(
|
|
770
821
|
request_id=request.state.request_id,
|
|
771
|
-
request_name=
|
|
822
|
+
request_name=request_names.RequestName.LIST_ACCELERATOR_COUNTS,
|
|
772
823
|
request_body=list_accelerator_counts_body,
|
|
773
824
|
func=catalog.list_accelerator_counts,
|
|
774
825
|
schedule_type=requests_lib.ScheduleType.SHORT,
|
|
@@ -802,6 +853,7 @@ async def validate(validate_body: payloads.ValidateBody) -> None:
|
|
|
802
853
|
# server thread.
|
|
803
854
|
with admin_policy_utils.apply_and_use_config_in_current_request(
|
|
804
855
|
dag,
|
|
856
|
+
request_name=request_names.AdminPolicyRequestName.VALIDATE,
|
|
805
857
|
request_options=validate_body.get_request_options()) as dag:
|
|
806
858
|
dag.resolve_and_validate_volumes()
|
|
807
859
|
# Skip validating workdir and file_mounts, as those need to be
|
|
@@ -815,6 +867,11 @@ async def validate(validate_body: payloads.ValidateBody) -> None:
|
|
|
815
867
|
# thread executor to avoid blocking the uvicorn event loop.
|
|
816
868
|
await context_utils.to_thread(validate_dag, dag)
|
|
817
869
|
except Exception as e: # pylint: disable=broad-except
|
|
870
|
+
# Print the exception to the API server log.
|
|
871
|
+
if env_options.Options.SHOW_DEBUG_INFO.get():
|
|
872
|
+
logger.info('/validate exception:', exc_info=True)
|
|
873
|
+
# Set the exception stacktrace for the serialized exception.
|
|
874
|
+
requests_lib.set_exception_stacktrace(e)
|
|
818
875
|
raise fastapi.HTTPException(
|
|
819
876
|
status_code=400, detail=exceptions.serialize_exception(e)) from e
|
|
820
877
|
|
|
@@ -823,9 +880,9 @@ async def validate(validate_body: payloads.ValidateBody) -> None:
|
|
|
823
880
|
async def optimize(optimize_body: payloads.OptimizeBody,
|
|
824
881
|
request: fastapi.Request) -> None:
|
|
825
882
|
"""Optimizes the user's DAG."""
|
|
826
|
-
executor.
|
|
883
|
+
await executor.schedule_request_async(
|
|
827
884
|
request_id=request.state.request_id,
|
|
828
|
-
request_name=
|
|
885
|
+
request_name=request_names.RequestName.OPTIMIZE,
|
|
829
886
|
request_body=optimize_body,
|
|
830
887
|
ignore_return_value=True,
|
|
831
888
|
func=core.optimize,
|
|
@@ -1033,9 +1090,9 @@ async def launch(launch_body: payloads.LaunchBody,
|
|
|
1033
1090
|
"""Launches a cluster or task."""
|
|
1034
1091
|
request_id = request.state.request_id
|
|
1035
1092
|
logger.info(f'Launching request: {request_id}')
|
|
1036
|
-
executor.
|
|
1093
|
+
await executor.schedule_request_async(
|
|
1037
1094
|
request_id,
|
|
1038
|
-
request_name=
|
|
1095
|
+
request_name=request_names.RequestName.CLUSTER_LAUNCH,
|
|
1039
1096
|
request_body=launch_body,
|
|
1040
1097
|
func=execution.launch,
|
|
1041
1098
|
schedule_type=requests_lib.ScheduleType.LONG,
|
|
@@ -1049,9 +1106,9 @@ async def launch(launch_body: payloads.LaunchBody,
|
|
|
1049
1106
|
async def exec(request: fastapi.Request, exec_body: payloads.ExecBody) -> None:
|
|
1050
1107
|
"""Executes a task on an existing cluster."""
|
|
1051
1108
|
cluster_name = exec_body.cluster_name
|
|
1052
|
-
executor.
|
|
1109
|
+
await executor.schedule_request_async(
|
|
1053
1110
|
request_id=request.state.request_id,
|
|
1054
|
-
request_name=
|
|
1111
|
+
request_name=request_names.RequestName.CLUSTER_EXEC,
|
|
1055
1112
|
request_body=exec_body,
|
|
1056
1113
|
func=execution.exec,
|
|
1057
1114
|
precondition=preconditions.ClusterStartCompletePrecondition(
|
|
@@ -1067,9 +1124,9 @@ async def exec(request: fastapi.Request, exec_body: payloads.ExecBody) -> None:
|
|
|
1067
1124
|
async def stop(request: fastapi.Request,
|
|
1068
1125
|
stop_body: payloads.StopOrDownBody) -> None:
|
|
1069
1126
|
"""Stops a cluster."""
|
|
1070
|
-
executor.
|
|
1127
|
+
await executor.schedule_request_async(
|
|
1071
1128
|
request_id=request.state.request_id,
|
|
1072
|
-
request_name=
|
|
1129
|
+
request_name=request_names.RequestName.CLUSTER_STOP,
|
|
1073
1130
|
request_body=stop_body,
|
|
1074
1131
|
func=core.stop,
|
|
1075
1132
|
schedule_type=requests_lib.ScheduleType.SHORT,
|
|
@@ -1087,9 +1144,9 @@ async def status(
|
|
|
1087
1144
|
raise fastapi.HTTPException(
|
|
1088
1145
|
status_code=503,
|
|
1089
1146
|
detail='Server is shutting down, please try again later.')
|
|
1090
|
-
executor.
|
|
1147
|
+
await executor.schedule_request_async(
|
|
1091
1148
|
request_id=request.state.request_id,
|
|
1092
|
-
request_name=
|
|
1149
|
+
request_name=request_names.RequestName.CLUSTER_STATUS,
|
|
1093
1150
|
request_body=status_body,
|
|
1094
1151
|
func=core.status,
|
|
1095
1152
|
schedule_type=(requests_lib.ScheduleType.LONG if
|
|
@@ -1102,9 +1159,9 @@ async def status(
|
|
|
1102
1159
|
async def endpoints(request: fastapi.Request,
|
|
1103
1160
|
endpoint_body: payloads.EndpointsBody) -> None:
|
|
1104
1161
|
"""Gets the endpoint for a given cluster and port number (endpoint)."""
|
|
1105
|
-
executor.
|
|
1162
|
+
await executor.schedule_request_async(
|
|
1106
1163
|
request_id=request.state.request_id,
|
|
1107
|
-
request_name=
|
|
1164
|
+
request_name=request_names.RequestName.CLUSTER_ENDPOINTS,
|
|
1108
1165
|
request_body=endpoint_body,
|
|
1109
1166
|
func=core.endpoints,
|
|
1110
1167
|
schedule_type=requests_lib.ScheduleType.SHORT,
|
|
@@ -1116,9 +1173,9 @@ async def endpoints(request: fastapi.Request,
|
|
|
1116
1173
|
async def down(request: fastapi.Request,
|
|
1117
1174
|
down_body: payloads.StopOrDownBody) -> None:
|
|
1118
1175
|
"""Tears down a cluster."""
|
|
1119
|
-
executor.
|
|
1176
|
+
await executor.schedule_request_async(
|
|
1120
1177
|
request_id=request.state.request_id,
|
|
1121
|
-
request_name=
|
|
1178
|
+
request_name=request_names.RequestName.CLUSTER_DOWN,
|
|
1122
1179
|
request_body=down_body,
|
|
1123
1180
|
func=core.down,
|
|
1124
1181
|
schedule_type=requests_lib.ScheduleType.SHORT,
|
|
@@ -1130,9 +1187,9 @@ async def down(request: fastapi.Request,
|
|
|
1130
1187
|
async def start(request: fastapi.Request,
|
|
1131
1188
|
start_body: payloads.StartBody) -> None:
|
|
1132
1189
|
"""Restarts a cluster."""
|
|
1133
|
-
executor.
|
|
1190
|
+
await executor.schedule_request_async(
|
|
1134
1191
|
request_id=request.state.request_id,
|
|
1135
|
-
request_name=
|
|
1192
|
+
request_name=request_names.RequestName.CLUSTER_START,
|
|
1136
1193
|
request_body=start_body,
|
|
1137
1194
|
func=core.start,
|
|
1138
1195
|
schedule_type=requests_lib.ScheduleType.LONG,
|
|
@@ -1144,9 +1201,9 @@ async def start(request: fastapi.Request,
|
|
|
1144
1201
|
async def autostop(request: fastapi.Request,
|
|
1145
1202
|
autostop_body: payloads.AutostopBody) -> None:
|
|
1146
1203
|
"""Schedules an autostop/autodown for a cluster."""
|
|
1147
|
-
executor.
|
|
1204
|
+
await executor.schedule_request_async(
|
|
1148
1205
|
request_id=request.state.request_id,
|
|
1149
|
-
request_name=
|
|
1206
|
+
request_name=request_names.RequestName.CLUSTER_AUTOSTOP,
|
|
1150
1207
|
request_body=autostop_body,
|
|
1151
1208
|
func=core.autostop,
|
|
1152
1209
|
schedule_type=requests_lib.ScheduleType.SHORT,
|
|
@@ -1158,9 +1215,9 @@ async def autostop(request: fastapi.Request,
|
|
|
1158
1215
|
async def queue(request: fastapi.Request,
|
|
1159
1216
|
queue_body: payloads.QueueBody) -> None:
|
|
1160
1217
|
"""Gets the job queue of a cluster."""
|
|
1161
|
-
executor.
|
|
1218
|
+
await executor.schedule_request_async(
|
|
1162
1219
|
request_id=request.state.request_id,
|
|
1163
|
-
request_name=
|
|
1220
|
+
request_name=request_names.RequestName.CLUSTER_QUEUE,
|
|
1164
1221
|
request_body=queue_body,
|
|
1165
1222
|
func=core.queue,
|
|
1166
1223
|
schedule_type=requests_lib.ScheduleType.SHORT,
|
|
@@ -1172,9 +1229,9 @@ async def queue(request: fastapi.Request,
|
|
|
1172
1229
|
async def job_status(request: fastapi.Request,
|
|
1173
1230
|
job_status_body: payloads.JobStatusBody) -> None:
|
|
1174
1231
|
"""Gets the status of a job."""
|
|
1175
|
-
executor.
|
|
1232
|
+
await executor.schedule_request_async(
|
|
1176
1233
|
request_id=request.state.request_id,
|
|
1177
|
-
request_name=
|
|
1234
|
+
request_name=request_names.RequestName.CLUSTER_JOB_STATUS,
|
|
1178
1235
|
request_body=job_status_body,
|
|
1179
1236
|
func=core.job_status,
|
|
1180
1237
|
schedule_type=requests_lib.ScheduleType.SHORT,
|
|
@@ -1186,9 +1243,9 @@ async def job_status(request: fastapi.Request,
|
|
|
1186
1243
|
async def cancel(request: fastapi.Request,
|
|
1187
1244
|
cancel_body: payloads.CancelBody) -> None:
|
|
1188
1245
|
"""Cancels jobs on a cluster."""
|
|
1189
|
-
executor.
|
|
1246
|
+
await executor.schedule_request_async(
|
|
1190
1247
|
request_id=request.state.request_id,
|
|
1191
|
-
request_name=
|
|
1248
|
+
request_name=request_names.RequestName.CLUSTER_JOB_CANCEL,
|
|
1192
1249
|
request_body=cancel_body,
|
|
1193
1250
|
func=core.cancel,
|
|
1194
1251
|
schedule_type=requests_lib.ScheduleType.SHORT,
|
|
@@ -1205,32 +1262,24 @@ async def logs(
|
|
|
1205
1262
|
# TODO(zhwu): This should wait for the request on the cluster, e.g., async
|
|
1206
1263
|
# launch, to finish, so that a user does not need to manually pull the
|
|
1207
1264
|
# request status.
|
|
1208
|
-
|
|
1265
|
+
executor.check_request_thread_executor_available()
|
|
1266
|
+
request_task = await executor.prepare_request_async(
|
|
1209
1267
|
request_id=request.state.request_id,
|
|
1210
|
-
request_name=
|
|
1268
|
+
request_name=request_names.RequestName.CLUSTER_JOB_LOGS,
|
|
1211
1269
|
request_body=cluster_job_body,
|
|
1212
1270
|
func=core.tail_logs,
|
|
1213
1271
|
schedule_type=requests_lib.ScheduleType.SHORT,
|
|
1272
|
+
request_cluster_name=cluster_job_body.cluster_name,
|
|
1214
1273
|
)
|
|
1215
|
-
task =
|
|
1216
|
-
|
|
1217
|
-
async def cancel_task():
|
|
1218
|
-
try:
|
|
1219
|
-
logger.info('Client disconnected for request: '
|
|
1220
|
-
f'{request.state.request_id}')
|
|
1221
|
-
task.cancel()
|
|
1222
|
-
await task
|
|
1223
|
-
except asyncio.CancelledError:
|
|
1224
|
-
pass
|
|
1225
|
-
|
|
1226
|
-
# Cancel the task after the request is done or client disconnects
|
|
1227
|
-
background_tasks.add_task(cancel_task)
|
|
1274
|
+
task = executor.execute_request_in_coroutine(request_task)
|
|
1275
|
+
background_tasks.add_task(task.cancel)
|
|
1228
1276
|
# TODO(zhwu): This makes viewing logs in browser impossible. We should adopt
|
|
1229
1277
|
# the same approach as /stream.
|
|
1230
|
-
return stream_utils.
|
|
1278
|
+
return stream_utils.stream_response_for_long_request(
|
|
1231
1279
|
request_id=request.state.request_id,
|
|
1232
1280
|
logs_path=request_task.log_path,
|
|
1233
1281
|
background_tasks=background_tasks,
|
|
1282
|
+
kill_request_on_disconnect=False,
|
|
1234
1283
|
)
|
|
1235
1284
|
|
|
1236
1285
|
|
|
@@ -1245,9 +1294,9 @@ async def download_logs(
|
|
|
1245
1294
|
# We should reuse the original request body, so that the env vars, such as
|
|
1246
1295
|
# user hash, are kept the same.
|
|
1247
1296
|
cluster_jobs_body.local_dir = str(logs_dir_on_api_server)
|
|
1248
|
-
executor.
|
|
1297
|
+
await executor.schedule_request_async(
|
|
1249
1298
|
request_id=request.state.request_id,
|
|
1250
|
-
request_name=
|
|
1299
|
+
request_name=request_names.RequestName.CLUSTER_JOB_DOWNLOAD_LOGS,
|
|
1251
1300
|
request_body=cluster_jobs_body,
|
|
1252
1301
|
func=core.download_logs,
|
|
1253
1302
|
schedule_type=requests_lib.ScheduleType.SHORT,
|
|
@@ -1324,27 +1373,55 @@ async def download(download_body: payloads.DownloadBody,
|
|
|
1324
1373
|
|
|
1325
1374
|
# TODO(aylei): run it asynchronously after global_user_state support async op
|
|
1326
1375
|
@app.post('/provision_logs')
|
|
1327
|
-
def provision_logs(
|
|
1376
|
+
def provision_logs(provision_logs_body: payloads.ProvisionLogsBody,
|
|
1328
1377
|
follow: bool = True,
|
|
1329
1378
|
tail: int = 0) -> fastapi.responses.StreamingResponse:
|
|
1330
1379
|
"""Streams the provision.log for the latest launch request of a cluster."""
|
|
1331
|
-
|
|
1332
|
-
|
|
1333
|
-
|
|
1334
|
-
|
|
1335
|
-
|
|
1336
|
-
|
|
1337
|
-
|
|
1338
|
-
|
|
1339
|
-
|
|
1340
|
-
|
|
1341
|
-
|
|
1380
|
+
log_path = None
|
|
1381
|
+
cluster_name = provision_logs_body.cluster_name
|
|
1382
|
+
worker = provision_logs_body.worker
|
|
1383
|
+
# stream head node logs
|
|
1384
|
+
if worker is None:
|
|
1385
|
+
# Prefer clusters table first, then cluster_history as fallback.
|
|
1386
|
+
log_path_str = global_user_state.get_cluster_provision_log_path(
|
|
1387
|
+
cluster_name)
|
|
1388
|
+
if not log_path_str:
|
|
1389
|
+
log_path_str = (
|
|
1390
|
+
global_user_state.get_cluster_history_provision_log_path(
|
|
1391
|
+
cluster_name))
|
|
1392
|
+
if not log_path_str:
|
|
1393
|
+
raise fastapi.HTTPException(
|
|
1394
|
+
status_code=404,
|
|
1395
|
+
detail=('Provision log path is not recorded for this cluster. '
|
|
1396
|
+
'Please relaunch to generate provisioning logs.'))
|
|
1397
|
+
log_path = pathlib.Path(log_path_str).expanduser().resolve()
|
|
1398
|
+
if not log_path.exists():
|
|
1399
|
+
raise fastapi.HTTPException(
|
|
1400
|
+
status_code=404,
|
|
1401
|
+
detail=f'Provision log path does not exist: {str(log_path)}')
|
|
1342
1402
|
|
|
1343
|
-
|
|
1344
|
-
|
|
1345
|
-
|
|
1346
|
-
|
|
1347
|
-
|
|
1403
|
+
# stream worker node logs
|
|
1404
|
+
else:
|
|
1405
|
+
handle = global_user_state.get_handle_from_cluster_name(cluster_name)
|
|
1406
|
+
if handle is None:
|
|
1407
|
+
raise fastapi.HTTPException(
|
|
1408
|
+
status_code=404,
|
|
1409
|
+
detail=('Cluster handle is not recorded for this cluster. '
|
|
1410
|
+
'Please relaunch to generate provisioning logs.'))
|
|
1411
|
+
# instance_ids includes head node
|
|
1412
|
+
instance_ids = handle.instance_ids
|
|
1413
|
+
if instance_ids is None:
|
|
1414
|
+
raise fastapi.HTTPException(
|
|
1415
|
+
status_code=400,
|
|
1416
|
+
detail='Instance IDs are not recorded for this cluster. '
|
|
1417
|
+
'Please relaunch to generate provisioning logs.')
|
|
1418
|
+
if worker > len(instance_ids) - 1:
|
|
1419
|
+
raise fastapi.HTTPException(
|
|
1420
|
+
status_code=400,
|
|
1421
|
+
detail=f'Worker {worker} is out of range. '
|
|
1422
|
+
f'The cluster has {len(instance_ids)} nodes.')
|
|
1423
|
+
log_path = metadata_utils.get_instance_log_dir(
|
|
1424
|
+
handle.get_cluster_name_on_cloud(), instance_ids[worker])
|
|
1348
1425
|
|
|
1349
1426
|
# Tail semantics: 0 means print all lines. Convert 0 -> None for streamer.
|
|
1350
1427
|
effective_tail = None if tail is None or tail <= 0 else tail
|
|
@@ -1353,7 +1430,8 @@ def provision_logs(cluster_body: payloads.ClusterNameBody,
|
|
|
1353
1430
|
content=stream_utils.log_streamer(None,
|
|
1354
1431
|
log_path,
|
|
1355
1432
|
tail=effective_tail,
|
|
1356
|
-
follow=follow
|
|
1433
|
+
follow=follow,
|
|
1434
|
+
cluster_name=cluster_name),
|
|
1357
1435
|
media_type='text/plain',
|
|
1358
1436
|
headers={
|
|
1359
1437
|
'Cache-Control': 'no-cache, no-transform',
|
|
@@ -1367,9 +1445,9 @@ def provision_logs(cluster_body: payloads.ClusterNameBody,
|
|
|
1367
1445
|
async def cost_report(request: fastapi.Request,
|
|
1368
1446
|
cost_report_body: payloads.CostReportBody) -> None:
|
|
1369
1447
|
"""Gets the cost report of a cluster."""
|
|
1370
|
-
executor.
|
|
1448
|
+
await executor.schedule_request_async(
|
|
1371
1449
|
request_id=request.state.request_id,
|
|
1372
|
-
request_name=
|
|
1450
|
+
request_name=request_names.RequestName.CLUSTER_COST_REPORT,
|
|
1373
1451
|
request_body=cost_report_body,
|
|
1374
1452
|
func=core.cost_report,
|
|
1375
1453
|
schedule_type=requests_lib.ScheduleType.SHORT,
|
|
@@ -1379,9 +1457,9 @@ async def cost_report(request: fastapi.Request,
|
|
|
1379
1457
|
@app.get('/storage/ls')
|
|
1380
1458
|
async def storage_ls(request: fastapi.Request) -> None:
|
|
1381
1459
|
"""Gets the storages."""
|
|
1382
|
-
executor.
|
|
1460
|
+
await executor.schedule_request_async(
|
|
1383
1461
|
request_id=request.state.request_id,
|
|
1384
|
-
request_name=
|
|
1462
|
+
request_name=request_names.RequestName.STORAGE_LS,
|
|
1385
1463
|
request_body=payloads.RequestBody(),
|
|
1386
1464
|
func=core.storage_ls,
|
|
1387
1465
|
schedule_type=requests_lib.ScheduleType.SHORT,
|
|
@@ -1392,9 +1470,9 @@ async def storage_ls(request: fastapi.Request) -> None:
|
|
|
1392
1470
|
async def storage_delete(request: fastapi.Request,
|
|
1393
1471
|
storage_body: payloads.StorageBody) -> None:
|
|
1394
1472
|
"""Deletes a storage."""
|
|
1395
|
-
executor.
|
|
1473
|
+
await executor.schedule_request_async(
|
|
1396
1474
|
request_id=request.state.request_id,
|
|
1397
|
-
request_name=
|
|
1475
|
+
request_name=request_names.RequestName.STORAGE_DELETE,
|
|
1398
1476
|
request_body=storage_body,
|
|
1399
1477
|
func=core.storage_delete,
|
|
1400
1478
|
schedule_type=requests_lib.ScheduleType.LONG,
|
|
@@ -1405,9 +1483,9 @@ async def storage_delete(request: fastapi.Request,
|
|
|
1405
1483
|
async def local_up(request: fastapi.Request,
|
|
1406
1484
|
local_up_body: payloads.LocalUpBody) -> None:
|
|
1407
1485
|
"""Launches a Kubernetes cluster on API server."""
|
|
1408
|
-
executor.
|
|
1486
|
+
await executor.schedule_request_async(
|
|
1409
1487
|
request_id=request.state.request_id,
|
|
1410
|
-
request_name=
|
|
1488
|
+
request_name=request_names.RequestName.LOCAL_UP,
|
|
1411
1489
|
request_body=local_up_body,
|
|
1412
1490
|
func=core.local_up,
|
|
1413
1491
|
schedule_type=requests_lib.ScheduleType.LONG,
|
|
@@ -1415,21 +1493,39 @@ async def local_up(request: fastapi.Request,
|
|
|
1415
1493
|
|
|
1416
1494
|
|
|
1417
1495
|
@app.post('/local_down')
|
|
1418
|
-
async def local_down(request: fastapi.Request
|
|
1496
|
+
async def local_down(request: fastapi.Request,
|
|
1497
|
+
local_down_body: payloads.LocalDownBody) -> None:
|
|
1419
1498
|
"""Tears down the Kubernetes cluster started by local_up."""
|
|
1420
|
-
executor.
|
|
1499
|
+
await executor.schedule_request_async(
|
|
1421
1500
|
request_id=request.state.request_id,
|
|
1422
|
-
request_name=
|
|
1423
|
-
request_body=
|
|
1501
|
+
request_name=request_names.RequestName.LOCAL_DOWN,
|
|
1502
|
+
request_body=local_down_body,
|
|
1424
1503
|
func=core.local_down,
|
|
1425
1504
|
schedule_type=requests_lib.ScheduleType.LONG,
|
|
1426
1505
|
)
|
|
1427
1506
|
|
|
1428
1507
|
|
|
1508
|
+
async def get_expanded_request_id(request_id: str) -> str:
|
|
1509
|
+
"""Gets the expanded request ID for a given request ID prefix."""
|
|
1510
|
+
request_tasks = await requests_lib.get_requests_async_with_prefix(
|
|
1511
|
+
request_id, fields=['request_id'])
|
|
1512
|
+
if request_tasks is None:
|
|
1513
|
+
raise fastapi.HTTPException(status_code=404,
|
|
1514
|
+
detail=f'Request {request_id!r} not found')
|
|
1515
|
+
if len(request_tasks) > 1:
|
|
1516
|
+
raise fastapi.HTTPException(status_code=400,
|
|
1517
|
+
detail=('Multiple requests found for '
|
|
1518
|
+
f'request ID prefix: {request_id}'))
|
|
1519
|
+
return request_tasks[0].request_id
|
|
1520
|
+
|
|
1521
|
+
|
|
1429
1522
|
# === API server related APIs ===
|
|
1430
|
-
@app.get('/api/get')
|
|
1523
|
+
@app.get('/api/get', response_class=fastapi_responses.ORJSONResponse)
|
|
1431
1524
|
async def api_get(request_id: str) -> payloads.RequestPayload:
|
|
1432
1525
|
"""Gets a request with a given request ID prefix."""
|
|
1526
|
+
# Validate request_id prefix matches a single request.
|
|
1527
|
+
request_id = await get_expanded_request_id(request_id)
|
|
1528
|
+
|
|
1433
1529
|
while True:
|
|
1434
1530
|
req_status = await requests_lib.get_request_status_async(request_id)
|
|
1435
1531
|
if req_status is None:
|
|
@@ -1446,6 +1542,8 @@ async def api_get(request_id: str) -> payloads.RequestPayload:
|
|
|
1446
1542
|
# to avoid storming the DB and CPU in the meantime
|
|
1447
1543
|
await asyncio.sleep(0.1)
|
|
1448
1544
|
request_task = await requests_lib.get_request_async(request_id)
|
|
1545
|
+
# TODO(aylei): refine this, /api/get will not be retried and this is
|
|
1546
|
+
# meaningless to retry. It is the original request that should be retried.
|
|
1449
1547
|
if request_task.should_retry:
|
|
1450
1548
|
raise fastapi.HTTPException(
|
|
1451
1549
|
status_code=503, detail=f'Request {request_id!r} should be retried')
|
|
@@ -1487,13 +1585,18 @@ async def stream(
|
|
|
1487
1585
|
clients, console for CLI/API clients), 'plain' (force plain text),
|
|
1488
1586
|
'html' (force HTML), or 'console' (force console)
|
|
1489
1587
|
"""
|
|
1588
|
+
# We need to save the user-supplied request ID for the response header.
|
|
1589
|
+
user_supplied_request_id = request_id
|
|
1490
1590
|
if request_id is not None and log_path is not None:
|
|
1491
1591
|
raise fastapi.HTTPException(
|
|
1492
1592
|
status_code=400,
|
|
1493
1593
|
detail='Only one of request_id and log_path can be provided')
|
|
1494
1594
|
|
|
1595
|
+
if request_id is not None:
|
|
1596
|
+
request_id = await get_expanded_request_id(request_id)
|
|
1597
|
+
|
|
1495
1598
|
if request_id is None and log_path is None:
|
|
1496
|
-
request_id = requests_lib.
|
|
1599
|
+
request_id = await requests_lib.get_latest_request_id_async()
|
|
1497
1600
|
if request_id is None:
|
|
1498
1601
|
raise fastapi.HTTPException(status_code=404,
|
|
1499
1602
|
detail='No request found')
|
|
@@ -1520,13 +1623,17 @@ async def stream(
|
|
|
1520
1623
|
'X-Accel-Buffering': 'no'
|
|
1521
1624
|
})
|
|
1522
1625
|
|
|
1626
|
+
polling_interval = stream_utils.DEFAULT_POLL_INTERVAL
|
|
1523
1627
|
# Original plain text streaming logic
|
|
1524
1628
|
if request_id is not None:
|
|
1525
|
-
request_task = await requests_lib.get_request_async(
|
|
1629
|
+
request_task = await requests_lib.get_request_async(
|
|
1630
|
+
request_id, fields=['request_id', 'schedule_type'])
|
|
1526
1631
|
if request_task is None:
|
|
1527
1632
|
print(f'No task with request ID {request_id}')
|
|
1528
1633
|
raise fastapi.HTTPException(
|
|
1529
1634
|
status_code=404, detail=f'Request {request_id!r} not found')
|
|
1635
|
+
# req.log_path is derived from request_id,
|
|
1636
|
+
# so it's ok to just grab the request_id in the above query.
|
|
1530
1637
|
log_path_to_stream = request_task.log_path
|
|
1531
1638
|
if not log_path_to_stream.exists():
|
|
1532
1639
|
# The log file might be deleted by the request GC daemon but the
|
|
@@ -1534,6 +1641,9 @@ async def stream(
|
|
|
1534
1641
|
raise fastapi.HTTPException(
|
|
1535
1642
|
status_code=404,
|
|
1536
1643
|
detail=f'Log of request {request_id!r} has been deleted')
|
|
1644
|
+
if request_task.schedule_type == requests_lib.ScheduleType.LONG:
|
|
1645
|
+
polling_interval = stream_utils.LONG_REQUEST_POLL_INTERVAL
|
|
1646
|
+
del request_task
|
|
1537
1647
|
else:
|
|
1538
1648
|
assert log_path is not None, (request_id, log_path)
|
|
1539
1649
|
if log_path == constants.API_SERVER_LOGS:
|
|
@@ -1567,18 +1677,26 @@ async def stream(
|
|
|
1567
1677
|
detail=f'Log path {log_path!r} does not exist')
|
|
1568
1678
|
|
|
1569
1679
|
log_path_to_stream = resolved_log_path
|
|
1680
|
+
|
|
1681
|
+
headers = {
|
|
1682
|
+
'Cache-Control': 'no-cache, no-transform',
|
|
1683
|
+
'X-Accel-Buffering': 'no',
|
|
1684
|
+
'Transfer-Encoding': 'chunked'
|
|
1685
|
+
}
|
|
1686
|
+
if request_id is not None:
|
|
1687
|
+
headers[server_constants.STREAM_REQUEST_HEADER] = (
|
|
1688
|
+
user_supplied_request_id
|
|
1689
|
+
if user_supplied_request_id else request_id)
|
|
1690
|
+
|
|
1570
1691
|
return fastapi.responses.StreamingResponse(
|
|
1571
1692
|
content=stream_utils.log_streamer(request_id,
|
|
1572
1693
|
log_path_to_stream,
|
|
1573
1694
|
plain_logs=format == 'plain',
|
|
1574
1695
|
tail=tail,
|
|
1575
|
-
follow=follow
|
|
1696
|
+
follow=follow,
|
|
1697
|
+
polling_interval=polling_interval),
|
|
1576
1698
|
media_type='text/plain',
|
|
1577
|
-
headers=
|
|
1578
|
-
'Cache-Control': 'no-cache, no-transform',
|
|
1579
|
-
'X-Accel-Buffering': 'no',
|
|
1580
|
-
'Transfer-Encoding': 'chunked'
|
|
1581
|
-
},
|
|
1699
|
+
headers=headers,
|
|
1582
1700
|
)
|
|
1583
1701
|
|
|
1584
1702
|
|
|
@@ -1586,11 +1704,11 @@ async def stream(
|
|
|
1586
1704
|
async def api_cancel(request: fastapi.Request,
|
|
1587
1705
|
request_cancel_body: payloads.RequestCancelBody) -> None:
|
|
1588
1706
|
"""Cancels requests."""
|
|
1589
|
-
executor.
|
|
1707
|
+
await executor.schedule_request_async(
|
|
1590
1708
|
request_id=request.state.request_id,
|
|
1591
|
-
request_name=
|
|
1709
|
+
request_name=request_names.RequestName.API_CANCEL,
|
|
1592
1710
|
request_body=request_cancel_body,
|
|
1593
|
-
func=requests_lib.
|
|
1711
|
+
func=requests_lib.kill_requests_with_prefix,
|
|
1594
1712
|
schedule_type=requests_lib.ScheduleType.SHORT,
|
|
1595
1713
|
)
|
|
1596
1714
|
|
|
@@ -1598,9 +1716,13 @@ async def api_cancel(request: fastapi.Request,
|
|
|
1598
1716
|
@app.get('/api/status')
|
|
1599
1717
|
async def api_status(
|
|
1600
1718
|
request_ids: Optional[List[str]] = fastapi.Query(
|
|
1601
|
-
None, description='Request
|
|
1719
|
+
None, description='Request ID prefixes to get status for.'),
|
|
1602
1720
|
all_status: bool = fastapi.Query(
|
|
1603
1721
|
False, description='Get finished requests as well.'),
|
|
1722
|
+
limit: Optional[int] = fastapi.Query(
|
|
1723
|
+
None, description='Number of requests to show.'),
|
|
1724
|
+
fields: Optional[List[str]] = fastapi.Query(
|
|
1725
|
+
None, description='Fields to get. If None, get all fields.'),
|
|
1604
1726
|
) -> List[payloads.RequestPayload]:
|
|
1605
1727
|
"""Gets the list of requests."""
|
|
1606
1728
|
if request_ids is None:
|
|
@@ -1611,15 +1733,22 @@ async def api_status(
|
|
|
1611
1733
|
requests_lib.RequestStatus.RUNNING,
|
|
1612
1734
|
]
|
|
1613
1735
|
request_tasks = await requests_lib.get_request_tasks_async(
|
|
1614
|
-
req_filter=requests_lib.RequestTaskFilter(
|
|
1615
|
-
|
|
1736
|
+
req_filter=requests_lib.RequestTaskFilter(
|
|
1737
|
+
status=statuses,
|
|
1738
|
+
limit=limit,
|
|
1739
|
+
fields=fields,
|
|
1740
|
+
sort=True,
|
|
1741
|
+
))
|
|
1742
|
+
return requests_lib.encode_requests(request_tasks)
|
|
1616
1743
|
else:
|
|
1617
1744
|
encoded_request_tasks = []
|
|
1618
1745
|
for request_id in request_ids:
|
|
1619
|
-
|
|
1620
|
-
|
|
1746
|
+
request_tasks = await requests_lib.get_requests_async_with_prefix(
|
|
1747
|
+
request_id)
|
|
1748
|
+
if request_tasks is None:
|
|
1621
1749
|
continue
|
|
1622
|
-
|
|
1750
|
+
for request_task in request_tasks:
|
|
1751
|
+
encoded_request_tasks.append(request_task.readable_encode())
|
|
1623
1752
|
return encoded_request_tasks
|
|
1624
1753
|
|
|
1625
1754
|
|
|
@@ -1679,23 +1808,44 @@ async def health(request: fastapi.Request) -> responses.APIHealthResponse:
|
|
|
1679
1808
|
version=sky.__version__,
|
|
1680
1809
|
version_on_disk=common.get_skypilot_version_on_disk(),
|
|
1681
1810
|
commit=sky.__commit__,
|
|
1811
|
+
# Whether basic auth on api server is enabled
|
|
1682
1812
|
basic_auth_enabled=os.environ.get(constants.ENV_VAR_ENABLE_BASIC_AUTH,
|
|
1683
1813
|
'false').lower() == 'true',
|
|
1684
1814
|
user=user if user is not None else None,
|
|
1815
|
+
# Whether service account token is enabled
|
|
1816
|
+
service_account_token_enabled=(os.environ.get(
|
|
1817
|
+
constants.ENV_VAR_ENABLE_SERVICE_ACCOUNTS,
|
|
1818
|
+
'false').lower() == 'true'),
|
|
1819
|
+
# Whether basic auth on ingress is enabled
|
|
1820
|
+
ingress_basic_auth_enabled=os.environ.get(
|
|
1821
|
+
constants.SKYPILOT_INGRESS_BASIC_AUTH_ENABLED,
|
|
1822
|
+
'false').lower() == 'true',
|
|
1685
1823
|
)
|
|
1686
1824
|
|
|
1687
1825
|
|
|
1826
|
+
class KubernetesSSHMessageType(IntEnum):
|
|
1827
|
+
REGULAR_DATA = 0
|
|
1828
|
+
PINGPONG = 1
|
|
1829
|
+
LATENCY_MEASUREMENT = 2
|
|
1830
|
+
|
|
1831
|
+
|
|
1688
1832
|
@app.websocket('/kubernetes-pod-ssh-proxy')
|
|
1689
|
-
async def kubernetes_pod_ssh_proxy(
|
|
1690
|
-
|
|
1833
|
+
async def kubernetes_pod_ssh_proxy(
|
|
1834
|
+
websocket: fastapi.WebSocket,
|
|
1835
|
+
cluster_name: str,
|
|
1836
|
+
client_version: Optional[int] = None) -> None:
|
|
1691
1837
|
"""Proxies SSH to the Kubernetes pod with websocket."""
|
|
1692
1838
|
await websocket.accept()
|
|
1693
1839
|
logger.info(f'WebSocket connection accepted for cluster: {cluster_name}')
|
|
1694
1840
|
|
|
1841
|
+
timestamps_supported = client_version is not None and client_version > 21
|
|
1842
|
+
logger.info(f'Websocket timestamps supported: {timestamps_supported}, \
|
|
1843
|
+
client_version = {client_version}')
|
|
1844
|
+
|
|
1695
1845
|
# Run core.status in another thread to avoid blocking the event loop.
|
|
1696
|
-
|
|
1697
|
-
|
|
1698
|
-
|
|
1846
|
+
with ThreadPoolExecutor(max_workers=1) as thread_pool_executor:
|
|
1847
|
+
cluster_records = await context_utils.to_thread_with_executor(
|
|
1848
|
+
thread_pool_executor, core.status, cluster_name, all_users=True)
|
|
1699
1849
|
cluster_record = cluster_records[0]
|
|
1700
1850
|
if cluster_record['status'] != status_lib.ClusterStatus.UP:
|
|
1701
1851
|
raise fastapi.HTTPException(
|
|
@@ -1734,17 +1884,70 @@ async def kubernetes_pod_ssh_proxy(websocket: fastapi.WebSocket,
|
|
|
1734
1884
|
return
|
|
1735
1885
|
|
|
1736
1886
|
logger.info(f'Starting port-forward to local port: {local_port}')
|
|
1887
|
+
conn_gauge = metrics_utils.SKY_APISERVER_WEBSOCKET_CONNECTIONS.labels(
|
|
1888
|
+
pid=os.getpid())
|
|
1889
|
+
ssh_failed = False
|
|
1890
|
+
websocket_closed = False
|
|
1737
1891
|
try:
|
|
1892
|
+
conn_gauge.inc()
|
|
1738
1893
|
# Connect to the local port
|
|
1739
1894
|
reader, writer = await asyncio.open_connection('127.0.0.1', local_port)
|
|
1740
1895
|
|
|
1741
1896
|
async def websocket_to_ssh():
|
|
1742
1897
|
try:
|
|
1743
1898
|
async for message in websocket.iter_bytes():
|
|
1899
|
+
if timestamps_supported:
|
|
1900
|
+
type_size = struct.calcsize('!B')
|
|
1901
|
+
message_type = struct.unpack('!B',
|
|
1902
|
+
message[:type_size])[0]
|
|
1903
|
+
if (message_type ==
|
|
1904
|
+
KubernetesSSHMessageType.REGULAR_DATA):
|
|
1905
|
+
# Regular data - strip type byte and forward to SSH
|
|
1906
|
+
message = message[type_size:]
|
|
1907
|
+
elif message_type == KubernetesSSHMessageType.PINGPONG:
|
|
1908
|
+
# PING message - respond with PONG (type 1)
|
|
1909
|
+
ping_id_size = struct.calcsize('!I')
|
|
1910
|
+
if len(message) != type_size + ping_id_size:
|
|
1911
|
+
raise ValueError('Invalid PING message '
|
|
1912
|
+
f'length: {len(message)}')
|
|
1913
|
+
# Return the same PING message, so that the client
|
|
1914
|
+
# can measure the latency.
|
|
1915
|
+
await websocket.send_bytes(message)
|
|
1916
|
+
continue
|
|
1917
|
+
elif (message_type ==
|
|
1918
|
+
KubernetesSSHMessageType.LATENCY_MEASUREMENT):
|
|
1919
|
+
# Latency measurement from client
|
|
1920
|
+
latency_size = struct.calcsize('!Q')
|
|
1921
|
+
if len(message) != type_size + latency_size:
|
|
1922
|
+
raise ValueError(
|
|
1923
|
+
'Invalid latency measurement '
|
|
1924
|
+
f'message length: {len(message)}')
|
|
1925
|
+
avg_latency_ms = struct.unpack(
|
|
1926
|
+
'!Q',
|
|
1927
|
+
message[type_size:type_size + latency_size])[0]
|
|
1928
|
+
latency_seconds = avg_latency_ms / 1000
|
|
1929
|
+
metrics_utils.SKY_APISERVER_WEBSOCKET_SSH_LATENCY_SECONDS.labels(pid=os.getpid()).observe(latency_seconds) # pylint: disable=line-too-long
|
|
1930
|
+
continue
|
|
1931
|
+
else:
|
|
1932
|
+
# Unknown message type.
|
|
1933
|
+
raise ValueError(
|
|
1934
|
+
f'Unknown message type: {message_type}')
|
|
1744
1935
|
writer.write(message)
|
|
1745
|
-
|
|
1936
|
+
try:
|
|
1937
|
+
await writer.drain()
|
|
1938
|
+
except Exception as e: # pylint: disable=broad-except
|
|
1939
|
+
# Typically we will not reach here, if the ssh to pod
|
|
1940
|
+
# is disconnected, ssh_to_websocket will exit first.
|
|
1941
|
+
# But just in case.
|
|
1942
|
+
logger.error('Failed to write to pod through '
|
|
1943
|
+
f'port-forward connection: {e}')
|
|
1944
|
+
nonlocal ssh_failed
|
|
1945
|
+
ssh_failed = True
|
|
1946
|
+
break
|
|
1746
1947
|
except fastapi.WebSocketDisconnect:
|
|
1747
1948
|
pass
|
|
1949
|
+
nonlocal websocket_closed
|
|
1950
|
+
websocket_closed = True
|
|
1748
1951
|
writer.close()
|
|
1749
1952
|
|
|
1750
1953
|
async def ssh_to_websocket():
|
|
@@ -1752,62 +1955,64 @@ async def kubernetes_pod_ssh_proxy(websocket: fastapi.WebSocket,
|
|
|
1752
1955
|
while True:
|
|
1753
1956
|
data = await reader.read(1024)
|
|
1754
1957
|
if not data:
|
|
1958
|
+
if not websocket_closed:
|
|
1959
|
+
logger.warning('SSH connection to pod is '
|
|
1960
|
+
'disconnected before websocket '
|
|
1961
|
+
'connection is closed')
|
|
1962
|
+
nonlocal ssh_failed
|
|
1963
|
+
ssh_failed = True
|
|
1755
1964
|
break
|
|
1965
|
+
if timestamps_supported:
|
|
1966
|
+
# Prepend message type byte (0 = regular data)
|
|
1967
|
+
message_type_bytes = struct.pack(
|
|
1968
|
+
'!B', KubernetesSSHMessageType.REGULAR_DATA.value)
|
|
1969
|
+
data = message_type_bytes + data
|
|
1756
1970
|
await websocket.send_bytes(data)
|
|
1757
1971
|
except Exception: # pylint: disable=broad-except
|
|
1758
1972
|
pass
|
|
1759
|
-
|
|
1973
|
+
try:
|
|
1974
|
+
await websocket.close()
|
|
1975
|
+
except Exception: # pylint: disable=broad-except
|
|
1976
|
+
# The websocket might have been closed by the client.
|
|
1977
|
+
pass
|
|
1760
1978
|
|
|
1761
1979
|
await asyncio.gather(websocket_to_ssh(), ssh_to_websocket())
|
|
1762
1980
|
finally:
|
|
1763
|
-
|
|
1981
|
+
conn_gauge.dec()
|
|
1982
|
+
reason = ''
|
|
1983
|
+
try:
|
|
1984
|
+
logger.info('Terminating kubectl port-forward process')
|
|
1985
|
+
proc.terminate()
|
|
1986
|
+
except ProcessLookupError:
|
|
1987
|
+
stdout = await proc.stdout.read()
|
|
1988
|
+
logger.error('kubectl port-forward was terminated before the '
|
|
1989
|
+
'ssh websocket connection was closed. Remaining '
|
|
1990
|
+
f'output: {str(stdout)}')
|
|
1991
|
+
reason = 'KubectlPortForwardExit'
|
|
1992
|
+
metrics_utils.SKY_APISERVER_WEBSOCKET_CLOSED_TOTAL.labels(
|
|
1993
|
+
pid=os.getpid(), reason='KubectlPortForwardExit').inc()
|
|
1994
|
+
else:
|
|
1995
|
+
if ssh_failed:
|
|
1996
|
+
reason = 'SSHToPodDisconnected'
|
|
1997
|
+
else:
|
|
1998
|
+
reason = 'ClientClosed'
|
|
1999
|
+
metrics_utils.SKY_APISERVER_WEBSOCKET_CLOSED_TOTAL.labels(
|
|
2000
|
+
pid=os.getpid(), reason=reason).inc()
|
|
1764
2001
|
|
|
1765
2002
|
|
|
1766
2003
|
@app.get('/all_contexts')
|
|
1767
2004
|
async def all_contexts(request: fastapi.Request) -> None:
|
|
1768
2005
|
"""Gets all Kubernetes and SSH node pool contexts."""
|
|
1769
2006
|
|
|
1770
|
-
executor.
|
|
2007
|
+
await executor.schedule_request_async(
|
|
1771
2008
|
request_id=request.state.request_id,
|
|
1772
|
-
request_name=
|
|
2009
|
+
request_name=request_names.RequestName.ALL_CONTEXTS,
|
|
1773
2010
|
request_body=payloads.RequestBody(),
|
|
1774
2011
|
func=core.get_all_contexts,
|
|
1775
2012
|
schedule_type=requests_lib.ScheduleType.SHORT,
|
|
1776
2013
|
)
|
|
1777
2014
|
|
|
1778
2015
|
|
|
1779
|
-
@app.get('/gpu-metrics')
|
|
1780
|
-
async def gpu_metrics() -> fastapi.Response:
|
|
1781
|
-
"""Gets the GPU metrics from multiple external k8s clusters"""
|
|
1782
|
-
contexts = core.get_all_contexts()
|
|
1783
|
-
all_metrics = []
|
|
1784
|
-
successful_contexts = 0
|
|
1785
|
-
|
|
1786
|
-
tasks = [
|
|
1787
|
-
asyncio.create_task(metrics_utils.get_metrics_for_context(context))
|
|
1788
|
-
for context in contexts
|
|
1789
|
-
if context != 'in-cluster'
|
|
1790
|
-
]
|
|
1791
|
-
|
|
1792
|
-
results = await asyncio.gather(*tasks, return_exceptions=True)
|
|
1793
|
-
|
|
1794
|
-
for i, result in enumerate(results):
|
|
1795
|
-
if isinstance(result, Exception):
|
|
1796
|
-
logger.error(
|
|
1797
|
-
f'Failed to get metrics for context {contexts[i]}: {result}')
|
|
1798
|
-
else:
|
|
1799
|
-
metrics_text = result
|
|
1800
|
-
all_metrics.append(metrics_text)
|
|
1801
|
-
successful_contexts += 1
|
|
1802
|
-
|
|
1803
|
-
combined_metrics = '\n\n'.join(all_metrics)
|
|
1804
|
-
|
|
1805
|
-
# Return as plain text for Prometheus compatibility
|
|
1806
|
-
return fastapi.Response(
|
|
1807
|
-
content=combined_metrics,
|
|
1808
|
-
media_type='text/plain; version=0.0.4; charset=utf-8')
|
|
1809
|
-
|
|
1810
|
-
|
|
1811
2016
|
# === Internal APIs ===
|
|
1812
2017
|
@app.get('/api/completion/cluster_name')
|
|
1813
2018
|
async def complete_cluster_name(incomplete: str,) -> List[str]:
|
|
@@ -1905,6 +2110,7 @@ if __name__ == '__main__':
|
|
|
1905
2110
|
|
|
1906
2111
|
from sky.server import uvicorn as skyuvicorn
|
|
1907
2112
|
|
|
2113
|
+
logger.info('Initializing SkyPilot API server')
|
|
1908
2114
|
skyuvicorn.add_timestamp_prefix_for_server_logs()
|
|
1909
2115
|
|
|
1910
2116
|
parser = argparse.ArgumentParser()
|
|
@@ -1916,22 +2122,63 @@ if __name__ == '__main__':
|
|
|
1916
2122
|
parser.add_argument('--metrics-port', default=9090, type=int)
|
|
1917
2123
|
cmd_args = parser.parse_args()
|
|
1918
2124
|
if cmd_args.port == cmd_args.metrics_port:
|
|
2125
|
+
logger.error('port and metrics-port cannot be the same, exiting.')
|
|
1919
2126
|
raise ValueError('port and metrics-port cannot be the same')
|
|
1920
2127
|
|
|
2128
|
+
# Fail fast if the port is not available to avoid corrupting the state
|
|
2129
|
+
# of potential running server instance.
|
|
2130
|
+
# We might reach here because the running server is currently not
|
|
2131
|
+
# responding, thus the healthz check fails and `sky api start` thinks
|
|
2132
|
+
# we should start a new server instance.
|
|
2133
|
+
if not common_utils.is_port_available(cmd_args.port):
|
|
2134
|
+
logger.error(f'Port {cmd_args.port} is not available, exiting.')
|
|
2135
|
+
raise RuntimeError(f'Port {cmd_args.port} is not available')
|
|
2136
|
+
|
|
2137
|
+
# Maybe touch the signal file on API server startup. Do it again here even
|
|
2138
|
+
# if we already touched it in the sky/server/common.py::_start_api_server.
|
|
2139
|
+
# This is because the sky/server/common.py::_start_api_server function call
|
|
2140
|
+
# is running outside the skypilot API server process tree. The process tree
|
|
2141
|
+
# starts within that function (see the `subprocess.Popen` call in
|
|
2142
|
+
# sky/server/common.py::_start_api_server). When pg is used, the
|
|
2143
|
+
# _start_api_server function will not load the config file from db, which
|
|
2144
|
+
# will ignore the consolidation mode config. Here, inside the process tree,
|
|
2145
|
+
# we already reload the config as a server (with env var _start_api_server),
|
|
2146
|
+
# so we will respect the consolidation mode config.
|
|
2147
|
+
# Refer to #7717 for more details.
|
|
2148
|
+
managed_job_utils.is_consolidation_mode(on_api_restart=True)
|
|
2149
|
+
|
|
1921
2150
|
# Show the privacy policy if it is not already shown. We place it here so
|
|
1922
2151
|
# that it is shown only when the API server is started.
|
|
1923
2152
|
usage_lib.maybe_show_privacy_policy()
|
|
1924
2153
|
|
|
1925
2154
|
# Initialize global user state db
|
|
1926
2155
|
db_utils.set_max_connections(1)
|
|
2156
|
+
logger.info('Initializing database engine')
|
|
1927
2157
|
global_user_state.initialize_and_get_db()
|
|
2158
|
+
logger.info('Database engine initialized')
|
|
1928
2159
|
# Initialize request db
|
|
1929
2160
|
requests_lib.reset_db_and_logs()
|
|
1930
2161
|
# Restore the server user hash
|
|
2162
|
+
logger.info('Initializing server user hash')
|
|
1931
2163
|
_init_or_restore_server_user_hash()
|
|
2164
|
+
|
|
1932
2165
|
max_db_connections = global_user_state.get_max_db_connections()
|
|
1933
|
-
|
|
1934
|
-
|
|
2166
|
+
logger.info(f'Max db connections: {max_db_connections}')
|
|
2167
|
+
|
|
2168
|
+
# Reserve memory for jobs and serve/pool controller in consolidation mode.
|
|
2169
|
+
reserved_memory_mb = (
|
|
2170
|
+
controller_utils.compute_memory_reserved_for_controllers(
|
|
2171
|
+
reserve_for_controllers=os.environ.get(
|
|
2172
|
+
constants.OVERRIDE_CONSOLIDATION_MODE) is not None,
|
|
2173
|
+
# For jobs controller, we need to reserve for both jobs and
|
|
2174
|
+
# pool controller.
|
|
2175
|
+
reserve_extra_for_pool=not os.environ.get(
|
|
2176
|
+
constants.IS_SKYPILOT_SERVE_CONTROLLER)))
|
|
2177
|
+
|
|
2178
|
+
config = server_config.compute_server_config(
|
|
2179
|
+
cmd_args.deploy,
|
|
2180
|
+
max_db_connections,
|
|
2181
|
+
reserved_memory_mb=reserved_memory_mb)
|
|
1935
2182
|
|
|
1936
2183
|
num_workers = config.num_server_workers
|
|
1937
2184
|
|
|
@@ -1960,7 +2207,8 @@ if __name__ == '__main__':
|
|
|
1960
2207
|
uvicorn_config = uvicorn.Config('sky.server.server:app',
|
|
1961
2208
|
host=cmd_args.host,
|
|
1962
2209
|
port=cmd_args.port,
|
|
1963
|
-
workers=num_workers
|
|
2210
|
+
workers=num_workers,
|
|
2211
|
+
ws_per_message_deflate=False)
|
|
1964
2212
|
skyuvicorn.run(uvicorn_config,
|
|
1965
2213
|
max_db_connections=config.num_db_connections_per_worker)
|
|
1966
2214
|
except Exception as exc: # pylint: disable=broad-except
|