skypilot-nightly 1.0.0.dev20250905__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +10 -2
- sky/adaptors/aws.py +81 -16
- sky/adaptors/common.py +25 -2
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/do.py +8 -2
- sky/adaptors/gcp.py +11 -0
- sky/adaptors/ibm.py +5 -2
- sky/adaptors/kubernetes.py +64 -0
- sky/adaptors/nebius.py +3 -1
- sky/adaptors/primeintellect.py +1 -0
- sky/adaptors/seeweb.py +183 -0
- sky/adaptors/shadeform.py +89 -0
- sky/admin_policy.py +20 -0
- sky/authentication.py +157 -263
- sky/backends/__init__.py +3 -2
- sky/backends/backend.py +11 -3
- sky/backends/backend_utils.py +588 -184
- sky/backends/cloud_vm_ray_backend.py +1088 -904
- sky/backends/local_docker_backend.py +9 -5
- sky/backends/task_codegen.py +633 -0
- sky/backends/wheel_utils.py +18 -0
- sky/catalog/__init__.py +8 -0
- sky/catalog/aws_catalog.py +4 -0
- sky/catalog/common.py +19 -1
- sky/catalog/data_fetchers/fetch_aws.py +102 -80
- sky/catalog/data_fetchers/fetch_gcp.py +30 -3
- sky/catalog/data_fetchers/fetch_nebius.py +9 -6
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/catalog/kubernetes_catalog.py +24 -28
- sky/catalog/primeintellect_catalog.py +95 -0
- sky/catalog/runpod_catalog.py +5 -1
- sky/catalog/seeweb_catalog.py +184 -0
- sky/catalog/shadeform_catalog.py +165 -0
- sky/check.py +73 -43
- sky/client/cli/command.py +675 -412
- sky/client/cli/flags.py +4 -2
- sky/{volumes/utils.py → client/cli/table_utils.py} +111 -13
- sky/client/cli/utils.py +79 -0
- sky/client/common.py +12 -2
- sky/client/sdk.py +132 -63
- sky/client/sdk_async.py +34 -33
- sky/cloud_stores.py +82 -3
- sky/clouds/__init__.py +6 -0
- sky/clouds/aws.py +337 -129
- sky/clouds/azure.py +24 -18
- sky/clouds/cloud.py +40 -13
- sky/clouds/cudo.py +16 -13
- sky/clouds/do.py +9 -7
- sky/clouds/fluidstack.py +12 -5
- sky/clouds/gcp.py +14 -7
- sky/clouds/hyperbolic.py +12 -5
- sky/clouds/ibm.py +12 -5
- sky/clouds/kubernetes.py +80 -45
- sky/clouds/lambda_cloud.py +12 -5
- sky/clouds/nebius.py +23 -9
- sky/clouds/oci.py +19 -12
- sky/clouds/paperspace.py +4 -1
- sky/clouds/primeintellect.py +317 -0
- sky/clouds/runpod.py +85 -24
- sky/clouds/scp.py +12 -8
- sky/clouds/seeweb.py +477 -0
- sky/clouds/shadeform.py +400 -0
- sky/clouds/ssh.py +4 -2
- sky/clouds/utils/scp_utils.py +61 -50
- sky/clouds/vast.py +33 -27
- sky/clouds/vsphere.py +14 -16
- sky/core.py +174 -165
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
- sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
- sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
- sky/dashboard/out/_next/static/chunks/{6601-06114c982db410b6.js → 3800-7b45f9fbb6308557.js} +1 -1
- sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
- sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
- sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
- sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
- sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
- sky/dashboard/out/_next/static/chunks/pages/{_app-ce361c6959bc2001.js → _app-bde01e4a2beec258.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-6563820e094f68ca.js → [context]-c0b5935149902e6f.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{infra-aabba60d57826e0f.js → infra-aed0ea19df7cf961.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-af76bb06dbb3954f.js → [name]-84a40f8c7c627fe4.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{workspaces-7598c33a746cdc91.js → workspaces-531b2f8c4bf89f82.js} +1 -1
- sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
- sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/data_utils.py +92 -1
- sky/data/mounting_utils.py +162 -29
- sky/data/storage.py +200 -19
- sky/data/storage_utils.py +10 -45
- sky/exceptions.py +18 -7
- sky/execution.py +74 -31
- sky/global_user_state.py +605 -191
- sky/jobs/__init__.py +2 -0
- sky/jobs/client/sdk.py +101 -4
- sky/jobs/client/sdk_async.py +31 -5
- sky/jobs/constants.py +15 -8
- sky/jobs/controller.py +726 -284
- sky/jobs/file_content_utils.py +128 -0
- sky/jobs/log_gc.py +193 -0
- sky/jobs/recovery_strategy.py +250 -100
- sky/jobs/scheduler.py +271 -173
- sky/jobs/server/core.py +367 -114
- sky/jobs/server/server.py +81 -35
- sky/jobs/server/utils.py +89 -35
- sky/jobs/state.py +1498 -620
- sky/jobs/utils.py +771 -306
- sky/logs/agent.py +40 -5
- sky/logs/aws.py +9 -19
- sky/metrics/utils.py +282 -39
- sky/optimizer.py +1 -1
- sky/provision/__init__.py +37 -1
- sky/provision/aws/config.py +34 -13
- sky/provision/aws/instance.py +5 -2
- sky/provision/azure/instance.py +5 -3
- sky/provision/common.py +2 -0
- sky/provision/cudo/instance.py +4 -3
- sky/provision/do/instance.py +4 -3
- sky/provision/docker_utils.py +97 -26
- sky/provision/fluidstack/instance.py +6 -5
- sky/provision/gcp/config.py +6 -1
- sky/provision/gcp/instance.py +4 -2
- sky/provision/hyperbolic/instance.py +4 -2
- sky/provision/instance_setup.py +66 -20
- sky/provision/kubernetes/__init__.py +2 -0
- sky/provision/kubernetes/config.py +7 -44
- sky/provision/kubernetes/constants.py +0 -1
- sky/provision/kubernetes/instance.py +609 -213
- sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
- sky/provision/kubernetes/network.py +12 -8
- sky/provision/kubernetes/network_utils.py +8 -25
- sky/provision/kubernetes/utils.py +382 -418
- sky/provision/kubernetes/volume.py +150 -18
- sky/provision/lambda_cloud/instance.py +16 -13
- sky/provision/nebius/instance.py +6 -2
- sky/provision/nebius/utils.py +103 -86
- sky/provision/oci/instance.py +4 -2
- sky/provision/paperspace/instance.py +4 -3
- sky/provision/primeintellect/__init__.py +10 -0
- sky/provision/primeintellect/config.py +11 -0
- sky/provision/primeintellect/instance.py +454 -0
- sky/provision/primeintellect/utils.py +398 -0
- sky/provision/provisioner.py +30 -9
- sky/provision/runpod/__init__.py +2 -0
- sky/provision/runpod/instance.py +4 -3
- sky/provision/runpod/volume.py +69 -13
- sky/provision/scp/instance.py +307 -130
- sky/provision/seeweb/__init__.py +11 -0
- sky/provision/seeweb/config.py +13 -0
- sky/provision/seeweb/instance.py +812 -0
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/vast/instance.py +5 -3
- sky/provision/volume.py +164 -0
- sky/provision/vsphere/common/ssl_helper.py +1 -1
- sky/provision/vsphere/common/vapiconnect.py +2 -1
- sky/provision/vsphere/common/vim_utils.py +3 -2
- sky/provision/vsphere/instance.py +8 -6
- sky/provision/vsphere/vsphere_utils.py +8 -1
- sky/resources.py +11 -3
- sky/schemas/api/responses.py +107 -6
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
- sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
- sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
- sky/schemas/db/serve_state/002_yaml_content.py +34 -0
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/002_cluster_pool.py +3 -3
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
- sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
- sky/schemas/generated/jobsv1_pb2.py +86 -0
- sky/schemas/generated/jobsv1_pb2.pyi +254 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
- sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
- sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
- sky/schemas/generated/servev1_pb2.py +58 -0
- sky/schemas/generated/servev1_pb2.pyi +115 -0
- sky/schemas/generated/servev1_pb2_grpc.py +322 -0
- sky/serve/autoscalers.py +2 -0
- sky/serve/client/impl.py +55 -21
- sky/serve/constants.py +4 -3
- sky/serve/controller.py +17 -11
- sky/serve/load_balancing_policies.py +1 -1
- sky/serve/replica_managers.py +219 -142
- sky/serve/serve_rpc_utils.py +179 -0
- sky/serve/serve_state.py +63 -54
- sky/serve/serve_utils.py +145 -109
- sky/serve/server/core.py +46 -25
- sky/serve/server/impl.py +311 -162
- sky/serve/server/server.py +21 -19
- sky/serve/service.py +84 -68
- sky/serve/service_spec.py +45 -7
- sky/server/auth/loopback.py +38 -0
- sky/server/auth/oauth2_proxy.py +12 -7
- sky/server/common.py +47 -24
- sky/server/config.py +62 -28
- sky/server/constants.py +9 -1
- sky/server/daemons.py +109 -38
- sky/server/metrics.py +76 -96
- sky/server/middleware_utils.py +166 -0
- sky/server/requests/executor.py +381 -145
- sky/server/requests/payloads.py +71 -18
- sky/server/requests/preconditions.py +15 -13
- sky/server/requests/request_names.py +121 -0
- sky/server/requests/requests.py +507 -157
- sky/server/requests/serializers/decoders.py +48 -17
- sky/server/requests/serializers/encoders.py +85 -20
- sky/server/requests/threads.py +117 -0
- sky/server/rest.py +116 -24
- sky/server/server.py +420 -172
- sky/server/stream_utils.py +219 -45
- sky/server/uvicorn.py +30 -19
- sky/setup_files/MANIFEST.in +6 -1
- sky/setup_files/alembic.ini +8 -0
- sky/setup_files/dependencies.py +62 -19
- sky/setup_files/setup.py +44 -44
- sky/sky_logging.py +13 -5
- sky/skylet/attempt_skylet.py +106 -24
- sky/skylet/configs.py +3 -1
- sky/skylet/constants.py +111 -26
- sky/skylet/events.py +64 -10
- sky/skylet/job_lib.py +141 -104
- sky/skylet/log_lib.py +233 -5
- sky/skylet/log_lib.pyi +40 -2
- sky/skylet/providers/ibm/node_provider.py +12 -8
- sky/skylet/providers/ibm/vpc_provider.py +13 -12
- sky/skylet/runtime_utils.py +21 -0
- sky/skylet/services.py +524 -0
- sky/skylet/skylet.py +22 -1
- sky/skylet/subprocess_daemon.py +104 -29
- sky/skypilot_config.py +99 -79
- sky/ssh_node_pools/server.py +9 -8
- sky/task.py +221 -104
- sky/templates/aws-ray.yml.j2 +1 -0
- sky/templates/azure-ray.yml.j2 +1 -0
- sky/templates/cudo-ray.yml.j2 +1 -0
- sky/templates/do-ray.yml.j2 +1 -0
- sky/templates/fluidstack-ray.yml.j2 +1 -0
- sky/templates/gcp-ray.yml.j2 +1 -0
- sky/templates/hyperbolic-ray.yml.j2 +1 -0
- sky/templates/ibm-ray.yml.j2 +2 -1
- sky/templates/jobs-controller.yaml.j2 +3 -0
- sky/templates/kubernetes-ray.yml.j2 +196 -55
- sky/templates/lambda-ray.yml.j2 +1 -0
- sky/templates/nebius-ray.yml.j2 +3 -0
- sky/templates/oci-ray.yml.j2 +1 -0
- sky/templates/paperspace-ray.yml.j2 +1 -0
- sky/templates/primeintellect-ray.yml.j2 +72 -0
- sky/templates/runpod-ray.yml.j2 +1 -0
- sky/templates/scp-ray.yml.j2 +1 -0
- sky/templates/seeweb-ray.yml.j2 +171 -0
- sky/templates/shadeform-ray.yml.j2 +73 -0
- sky/templates/vast-ray.yml.j2 +1 -0
- sky/templates/vsphere-ray.yml.j2 +1 -0
- sky/templates/websocket_proxy.py +188 -43
- sky/usage/usage_lib.py +16 -4
- sky/users/permission.py +60 -43
- sky/utils/accelerator_registry.py +6 -3
- sky/utils/admin_policy_utils.py +18 -5
- sky/utils/annotations.py +22 -0
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/atomic.py +1 -1
- sky/utils/auth_utils.py +153 -0
- sky/utils/cli_utils/status_utils.py +12 -7
- sky/utils/cluster_utils.py +28 -6
- sky/utils/command_runner.py +88 -27
- sky/utils/command_runner.pyi +36 -3
- sky/utils/common.py +3 -1
- sky/utils/common_utils.py +37 -4
- sky/utils/config_utils.py +1 -14
- sky/utils/context.py +127 -40
- sky/utils/context_utils.py +73 -18
- sky/utils/controller_utils.py +229 -70
- sky/utils/db/db_utils.py +95 -18
- sky/utils/db/kv_cache.py +149 -0
- sky/utils/db/migration_utils.py +24 -7
- sky/utils/env_options.py +4 -0
- sky/utils/git.py +559 -1
- sky/utils/kubernetes/create_cluster.sh +15 -30
- sky/utils/kubernetes/delete_cluster.sh +10 -7
- sky/utils/kubernetes/{deploy_remote_cluster.py → deploy_ssh_node_pools.py} +258 -380
- sky/utils/kubernetes/generate_kind_config.py +6 -66
- sky/utils/kubernetes/gpu_labeler.py +13 -3
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
- sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
- sky/utils/kubernetes/kubernetes_deploy_utils.py +213 -194
- sky/utils/kubernetes/rsync_helper.sh +11 -3
- sky/utils/kubernetes_enums.py +7 -15
- sky/utils/lock_events.py +4 -4
- sky/utils/locks.py +128 -31
- sky/utils/log_utils.py +0 -319
- sky/utils/resource_checker.py +13 -10
- sky/utils/resources_utils.py +53 -29
- sky/utils/rich_utils.py +8 -4
- sky/utils/schemas.py +107 -52
- sky/utils/subprocess_utils.py +17 -4
- sky/utils/thread_utils.py +91 -0
- sky/utils/timeline.py +2 -1
- sky/utils/ux_utils.py +35 -1
- sky/utils/volume.py +88 -4
- sky/utils/yaml_utils.py +9 -0
- sky/volumes/client/sdk.py +48 -10
- sky/volumes/server/core.py +59 -22
- sky/volumes/server/server.py +46 -17
- sky/volumes/volume.py +54 -42
- sky/workspaces/core.py +57 -21
- sky/workspaces/server.py +13 -12
- sky_templates/README.md +3 -0
- sky_templates/__init__.py +3 -0
- sky_templates/ray/__init__.py +0 -0
- sky_templates/ray/start_cluster +183 -0
- sky_templates/ray/stop_cluster +75 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/METADATA +331 -65
- skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
- skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
- sky/client/cli/git.py +0 -549
- sky/dashboard/out/_next/static/chunks/1121-408ed10b2f9fce17.js +0 -1
- sky/dashboard/out/_next/static/chunks/1141-943efc7aff0f0c06.js +0 -1
- sky/dashboard/out/_next/static/chunks/1836-37fede578e2da5f8.js +0 -40
- sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +0 -1
- sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
- sky/dashboard/out/_next/static/chunks/3785.4872a2f3aa489880.js +0 -1
- sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +0 -21
- sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +0 -10
- sky/dashboard/out/_next/static/chunks/4725.10f7a9a5d3ea8208.js +0 -1
- sky/dashboard/out/_next/static/chunks/5339.3fda4a4010ff4e06.js +0 -51
- sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
- sky/dashboard/out/_next/static/chunks/649.b9d7f7d10c1b8c53.js +0 -45
- sky/dashboard/out/_next/static/chunks/6856-dca7962af4814e1b.js +0 -1
- sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
- sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +0 -6
- sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +0 -18
- sky/dashboard/out/_next/static/chunks/7669.1f5d9a402bf5cc42.js +0 -36
- sky/dashboard/out/_next/static/chunks/8969-0be3036bf86f8256.js +0 -1
- sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
- sky/dashboard/out/_next/static/chunks/9037-fa1737818d0a0969.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1cbba24bd1bd35f8.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-0b4b35dc1dfe046c.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters-469814d711d63b1b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-dd64309c3fe67ed2.js +0 -11
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-07349868f7905d37.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/jobs-1f70d9faa564804f.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/users-018bf31cda52e11b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/volumes-739726d6b823f532.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-4fe903277b57b523.js +0 -1
- sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
- sky/dashboard/out/_next/static/mS-4qZPSkRuA1u-g2wQhg/_buildManifest.js +0 -1
- sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
- skypilot_nightly-1.0.0.dev20250905.dist-info/RECORD +0 -547
- skypilot_nightly-1.0.0.dev20250905.dist-info/top_level.txt +0 -1
- /sky/dashboard/out/_next/static/{mS-4qZPSkRuA1u-g2wQhg → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
sky/serve/server/server.py
CHANGED
@@ -10,6 +10,7 @@ from sky.server import common as server_common
 from sky.server import stream_utils
 from sky.server.requests import executor
 from sky.server.requests import payloads
+from sky.server.requests import request_names
 from sky.server.requests import requests as api_requests
 from sky.skylet import constants
 from sky.utils import common
@@ -23,9 +24,9 @@ async def up(
     request: fastapi.Request,
     up_body: payloads.ServeUpBody,
 ) -> None:
-    executor.
+    await executor.schedule_request_async(
         request_id=request.state.request_id,
-        request_name=
+        request_name=request_names.RequestName.SERVE_UP,
         request_body=up_body,
         func=core.up,
         schedule_type=api_requests.ScheduleType.LONG,
@@ -38,9 +39,9 @@ async def update(
     request: fastapi.Request,
     update_body: payloads.ServeUpdateBody,
 ) -> None:
-    executor.
+    await executor.schedule_request_async(
         request_id=request.state.request_id,
-        request_name=
+        request_name=request_names.RequestName.SERVE_UPDATE,
         request_body=update_body,
         func=core.update,
         schedule_type=api_requests.ScheduleType.SHORT,
@@ -53,9 +54,9 @@ async def down(
     request: fastapi.Request,
     down_body: payloads.ServeDownBody,
 ) -> None:
-    executor.
+    await executor.schedule_request_async(
         request_id=request.state.request_id,
-        request_name=
+        request_name=request_names.RequestName.SERVE_DOWN,
         request_body=down_body,
         func=core.down,
         schedule_type=api_requests.ScheduleType.SHORT,
@@ -68,9 +69,9 @@ async def terminate_replica(
     request: fastapi.Request,
     terminate_replica_body: payloads.ServeTerminateReplicaBody,
 ) -> None:
-    executor.
+    await executor.schedule_request_async(
         request_id=request.state.request_id,
-        request_name=
+        request_name=request_names.RequestName.SERVE_TERMINATE_REPLICA,
         request_body=terminate_replica_body,
         func=core.terminate_replica,
         schedule_type=api_requests.ScheduleType.SHORT,
@@ -83,9 +84,9 @@ async def status(
     request: fastapi.Request,
     status_body: payloads.ServeStatusBody,
 ) -> None:
-    executor.
+    await executor.schedule_request_async(
         request_id=request.state.request_id,
-        request_name=
+        request_name=request_names.RequestName.SERVE_STATUS,
         request_body=status_body,
         func=core.status,
         schedule_type=api_requests.ScheduleType.SHORT,
@@ -98,22 +99,23 @@ async def tail_logs(
     request: fastapi.Request, log_body: payloads.ServeLogsBody,
     background_tasks: fastapi.BackgroundTasks
 ) -> fastapi.responses.StreamingResponse:
-    executor.
+    executor.check_request_thread_executor_available()
+    request_task = await executor.prepare_request_async(
         request_id=request.state.request_id,
-        request_name=
+        request_name=request_names.RequestName.SERVE_LOGS,
         request_body=log_body,
         func=core.tail_logs,
         schedule_type=api_requests.ScheduleType.SHORT,
         request_cluster_name=common.SKY_SERVE_CONTROLLER_NAME,
     )
-
-
-
-
-    return stream_utils.stream_response(
+    task = executor.execute_request_in_coroutine(request_task)
+    # Cancel the coroutine after the request is done or client disconnects
+    background_tasks.add_task(task.cancel)
+    return stream_utils.stream_response_for_long_request(
        request_id=request_task.request_id,
        logs_path=request_task.log_path,
        background_tasks=background_tasks,
+       kill_request_on_disconnect=False,
    )
@@ -131,9 +133,9 @@ async def download_logs(
     # We should reuse the original request body, so that the env vars, such as
     # user hash, are kept the same.
     download_logs_body.local_dir = str(logs_dir_on_api_server)
-    executor.
+    await executor.schedule_request_async(
         request_id=request.state.request_id,
-        request_name=
+        request_name=request_names.RequestName.SERVE_SYNC_DOWN_LOGS,
         request_body=download_logs_body,
         func=core.sync_down_logs,
         schedule_type=api_requests.ScheduleType.SHORT,
sky/serve/service.py
CHANGED
@@ -13,7 +13,6 @@ from typing import Dict

 import filelock

-from sky import authentication
 from sky import exceptions
 from sky import global_user_state
 from sky import sky_logging
@@ -21,7 +20,6 @@ from sky import task as task_lib
 from sky.backends import backend_utils
 from sky.backends import cloud_vm_ray_backend
 from sky.data import data_utils
-from sky.jobs import scheduler as jobs_scheduler
 from sky.serve import constants
 from sky.serve import controller
 from sky.serve import load_balancer
@@ -29,9 +27,11 @@ from sky.serve import replica_managers
 from sky.serve import serve_state
 from sky.serve import serve_utils
 from sky.skylet import constants as skylet_constants
+from sky.utils import auth_utils
 from sky.utils import common_utils
 from sky.utils import controller_utils
 from sky.utils import subprocess_utils
+from sky.utils import thread_utils
 from sky.utils import ux_utils

 # Use the explicit logger name so that the logger is under the
@@ -66,11 +66,11 @@ def _handle_signal(service_name: str) -> None:
         raise error_type(f'User signal received: {user_signal.value}')


-def cleanup_storage(task_yaml: str) -> bool:
+def cleanup_storage(yaml_content: str) -> bool:
     """Clean up the storage for the service.

     Args:
-
+        yaml_content: The yaml content of the service.

     Returns:
         True if the storage is cleaned up successfully, False otherwise.
@@ -78,7 +78,7 @@ def cleanup_storage(task_yaml: str) -> bool:
     failed = False

     try:
-        task = task_lib.Task.
+        task = task_lib.Task.from_yaml_str(yaml_content)
         backend = cloud_vm_ray_backend.CloudVmRayBackend()
         # Need to re-construct storage object in the controller process
         # because when SkyPilot API server machine sends the yaml config to the
@@ -116,7 +116,7 @@ def cleanup_storage(task_yaml: str) -> bool:
 # NOTE(dev): We don't need to acquire the `with_lock` in replica manager here
 # because we killed all the processes (controller & replica manager) before
 # calling this function.
-def _cleanup(service_name: str) -> bool:
+def _cleanup(service_name: str, pool: bool) -> bool:
     """Clean up all service related resources, i.e. replicas and storage."""
     # Cleanup the HA recovery script first as it is possible that some error
     # was raised when we construct the task object (e.g.,
@@ -124,8 +124,8 @@ def _cleanup(service_name: str) -> bool:
     serve_state.remove_ha_recovery_script(service_name)
     failed = False
     replica_infos = serve_state.get_replica_infos(service_name)
-
-
+    info2thr: Dict[replica_managers.ReplicaInfo,
+                   thread_utils.SafeThread] = dict()
     # NOTE(dev): This relies on `sky/serve/serve_utils.py::
     # generate_replica_cluster_name`. Change it if you change the function.
     existing_cluster_names = global_user_state.get_cluster_names_start_with(
@@ -136,9 +136,12 @@ def _cleanup(service_name: str) -> bool:
                 f'{info.replica_id} not found. Might be a failed '
                 'cluster. Skipping.')
             continue
-
-
-
+
+        log_file_name = serve_utils.generate_replica_log_file_name(
+            service_name, info.replica_id)
+        t = thread_utils.SafeThread(target=replica_managers.terminate_cluster,
+                                    args=(info.cluster_name, log_file_name))
+        info2thr[info] = t
         # Set replica status to `SHUTTING_DOWN`
         info.status_property.sky_launch_status = (
             replica_managers.common_utils.ProcessStatus.SUCCEEDED)
@@ -158,32 +161,32 @@ def _cleanup(service_name: str) -> bool:

     # Please reference to sky/serve/replica_managers.py::_refresh_process_pool.
     # TODO(tian): Refactor to use the same logic and code.
-    while
-        snapshot = list(
-        for info,
-            if
+    while info2thr:
+        snapshot = list(info2thr.items())
+        for info, t in snapshot:
+            if t.is_alive():
                 continue
             if (info.status_property.sky_down_status ==
                     replica_managers.common_utils.ProcessStatus.SCHEDULED):
-                if controller_utils.can_terminate():
+                if controller_utils.can_terminate(pool):
                     try:
-
+                        t.start()
                     except Exception as e:  # pylint: disable=broad-except
                         _set_to_failed_cleanup(info)
-                        logger.error(f'Failed to start
+                        logger.error(f'Failed to start thread for replica '
                                      f'{info.replica_id}: {e}')
-                        del
+                        del info2thr[info]
                 else:
                     info.status_property.sky_down_status = (
                         common_utils.ProcessStatus.RUNNING)
                     serve_state.add_or_update_replica(
                         service_name, info.replica_id, info)
             else:
-                logger.info('Terminate
+                logger.info('Terminate thread for replica '
                             f'{info.replica_id} finished.')
-
-                del
-                if
+                t.join()
+                del info2thr[info]
+                if t.format_exc is None:
                     serve_state.remove_replica(service_name, info.replica_id)
                     logger.info(
                         f'Replica {info.replica_id} terminated successfully.')
@@ -191,19 +194,23 @@ def _cleanup(service_name: str) -> bool:
                 _set_to_failed_cleanup(info)
         time.sleep(3)

-    versions = serve_state.get_service_versions(service_name)
-    serve_state.remove_service_versions(service_name)
-
     def cleanup_version_storage(version: int) -> bool:
-
-
+        yaml_content = serve_state.get_yaml_content(service_name, version)
+        if yaml_content is None:
+            logger.warning(f'No yaml content found for version {version}')
+            return True
         logger.info(f'Cleaning up storage for version {version}, '
-                    f'
-        return cleanup_storage(
+                    f'yaml_content: {yaml_content}')
+        return cleanup_storage(yaml_content)

+    versions = serve_state.get_service_versions(service_name)
     if not all(map(cleanup_version_storage, versions)):
         failed = True

+    # Cleanup version metadata after all storages are cleaned up, otherwise
+    # the get_yaml_content will return None as all versions are deleted.
+    serve_state.delete_all_versions(service_name)
+
     return failed
@@ -228,41 +235,39 @@ def _start(service_name: str, tmp_task_yaml: str, job_id: int, entrypoint: str):
     """
     # Generate ssh key pair to avoid race condition when multiple sky.launch
     # are executed at the same time.
-
+    auth_utils.get_or_generate_keys()

-
-
-    # Already checked before submit to controller.
-    assert task.service is not None, task
-    service_spec = task.service
-
-    def is_recovery_mode(service_name: str) -> bool:
-        """Check if service exists in database to determine recovery mode.
-        """
-        service = serve_state.get_service_from_name(service_name)
-        return service is not None
-
-    is_recovery = is_recovery_mode(service_name)
+    service = serve_state.get_service_from_name(service_name)
+    is_recovery = service is not None
     logger.info(f'It is a {"first" if not is_recovery else "recovery"} run')

+    def _read_yaml_content(yaml_path: str) -> str:
+        with open(os.path.expanduser(yaml_path), 'r', encoding='utf-8') as f:
+            return f.read()
+
     if is_recovery:
-
-
-
+        yaml_content = service['yaml_content']
+        # Backward compatibility for old service records that
+        # does not dump the yaml content to version database.
+        # TODO(tian): Remove this after 2 minor releases, i.e. 0.13.0.
+        if yaml_content is None:
+            yaml_content = _read_yaml_content(tmp_task_yaml)
     else:
-
-
-
+        yaml_content = _read_yaml_content(tmp_task_yaml)
+
+    # Initialize database record for the service.
+    task = task_lib.Task.from_yaml_str(yaml_content)
+    # Already checked before submit to controller.
+    assert task.service is not None, task
+    service_spec = task.service

     service_dir = os.path.expanduser(
         serve_utils.generate_remote_service_dir_name(service_name))
-    service_task_yaml = serve_utils.generate_task_yaml_file_name(
-        service_name, version)

     if not is_recovery:
         with filelock.FileLock(controller_utils.get_resources_lock_path()):
-            if not controller_utils.can_start_new_process():
-                cleanup_storage(
+            if not controller_utils.can_start_new_process(task.service.pool):
+                cleanup_storage(yaml_content)
                 with ux_utils.print_exception_no_traceback():
                     raise RuntimeError(
                         constants.MAX_NUMBER_OF_SERVICES_REACHED_ERROR)
@@ -278,25 +283,24 @@ def _start(service_name: str, tmp_task_yaml: str, job_id: int, entrypoint: str):
                 pool=service_spec.pool,
                 controller_pid=os.getpid(),
                 entrypoint=entrypoint)
-            jobs_scheduler.maybe_schedule_next_jobs()
             # Directly throw an error here. See sky/serve/api.py::up
             # for more details.
             if not success:
-                cleanup_storage(
+                cleanup_storage(yaml_content)
                 with ux_utils.print_exception_no_traceback():
                     raise ValueError(f'Service {service_name} already exists.')

         # Create the service working directory.
         os.makedirs(service_dir, exist_ok=True)

-
-        #
-
-
-        # sync to a tmp file first and then copy it to the final name
-        # if there is no name conflict.
-        shutil.copy(tmp_task_yaml, service_task_yaml)
+        version = constants.INITIAL_VERSION
+        # Add initial version information to the service state.
+        serve_state.add_or_update_version(service_name, version, service_spec,
+                                          yaml_content)
     else:
+        version = serve_state.get_latest_version(service_name)
+        if version is None:
+            raise ValueError(f'No version found for service {service_name}')
         serve_state.update_service_controller_pid(service_name, os.getpid())

     controller_process = None
@@ -328,8 +332,8 @@ def _start(service_name: str, tmp_task_yaml: str, job_id: int, entrypoint: str):
         controller_host = _get_controller_host()
         controller_process = multiprocessing.Process(
             target=controller.run_controller,
-            args=(service_name, service_spec,
-
+            args=(service_name, service_spec, version, controller_host,
+                  controller_port))
         controller_process.start()

         if not is_recovery:
@@ -350,8 +354,8 @@ def _start(service_name: str, tmp_task_yaml: str, job_id: int, entrypoint: str):
         # TODO(tian): Probably we could enable multiple ports specified in
         # service spec and we could start multiple load balancers.
         # After that, we will have a mapping from replica port to endpoint.
-        # NOTE(tian): We don't need the load balancer for
-        # Skip the load balancer process for
+        # NOTE(tian): We don't need the load balancer for pool.
+        # Skip the load balancer process for pool.
         if not service_spec.pool:
             load_balancer_process = multiprocessing.Process(
                 target=ux_utils.RedirectOutputForProcess(
@@ -386,7 +390,19 @@ def _start(service_name: str, tmp_task_yaml: str, job_id: int, entrypoint: str):
         for process in process_to_kill:
             process.join()

-
+    # Catch any exception here to avoid it kill the service monitoring
+    # process. In which case, the service will not only fail to clean
+    # up, but also cannot be terminated in the future as no process
+    # will handle the user signal anymore. Instead, we catch any error
+    # and set it to FAILED_CLEANUP instead.
+    try:
+        failed = _cleanup(service_name, service_spec.pool)
+    except Exception as e:  # pylint: disable=broad-except
+        logger.error(f'Failed to clean up service {service_name}: {e}')
+        with ux_utils.enable_traceback():
+            logger.error(f'  Traceback: {traceback.format_exc()}')
+        failed = True
+
     if failed:
         serve_state.set_service_status_and_active_versions(
             service_name, serve_state.ServiceStatus.FAILED_CLEANUP)
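
The rewritten cleanup loop drives replica termination through thread_utils.SafeThread objects and inspects t.format_exc after join() to decide whether a replica terminated cleanly. A stripped-down sketch of what such an exception-capturing thread wrapper could look like, as an assumption inferred from the usage above rather than the actual sky/utils/thread_utils.py source:

# Assumed shape of a SafeThread-like wrapper: run() captures the traceback
# of any uncaught exception into format_exc instead of letting the thread
# die silently; callers check format_exc after join().
import threading
import traceback
from typing import Optional


class SafeThread(threading.Thread):

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.format_exc: Optional[str] = None

    def run(self) -> None:
        try:
            super().run()
        except Exception:  # pylint: disable=broad-except
            self.format_exc = traceback.format_exc()


def terminate() -> None:
    raise RuntimeError('cluster termination failed')


t = SafeThread(target=terminate)
t.start()
t.join()
print('ok' if t.format_exc is None else f'failed:\n{t.format_exc}')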
sky/serve/service_spec.py
CHANGED
@@ -188,7 +188,7 @@ class SkyServiceSpec:
             with ux_utils.print_exception_no_traceback():
                 raise ValueError('Cannot specify `replica_policy` for cluster '
                                  'pool. Only `workers: <num>` is supported '
-                                 'for
+                                 'for pool now.')

         simplified_policy_section = config.get('replicas', None)
         workers_config = config.get('workers', None)
@@ -198,7 +198,7 @@ class SkyServiceSpec:
                 ' Please use one of them.')
         if simplified_policy_section is not None and pool_config:
             with ux_utils.print_exception_no_traceback():
-                raise ValueError('Cannot specify `replicas` for
+                raise ValueError('Cannot specify `replicas` for pool. '
                                  'Please use `workers` instead.')
         if simplified_policy_section is None:
             simplified_policy_section = workers_config
@@ -266,14 +266,13 @@ class SkyServiceSpec:
         return SkyServiceSpec(**service_config)

     @staticmethod
-    def
-
-        config = yaml_utils.safe_load(f)
+    def from_yaml_str(yaml_str: str) -> 'SkyServiceSpec':
+        config = yaml_utils.safe_load(yaml_str)

         if isinstance(config, str):
             with ux_utils.print_exception_no_traceback():
                 raise ValueError('YAML loaded as str, not as dict. '
-                                 f'Is it correct?
+                                 f'Is it correct? content:\n{yaml_str}')

         if config is None:
             config = {}
@@ -281,10 +280,16 @@ class SkyServiceSpec:
         if 'service' not in config:
             with ux_utils.print_exception_no_traceback():
                 raise ValueError('Service YAML must have a "service" section. '
-                                 f'Is it correct?
+                                 f'Is it correct? content:\n{yaml_str}')

         return SkyServiceSpec.from_yaml_config(config['service'])

+    @staticmethod
+    def from_yaml(yaml_path: str) -> 'SkyServiceSpec':
+        with open(os.path.expanduser(yaml_path), 'r', encoding='utf-8') as f:
+            yaml_content = f.read()
+        return SkyServiceSpec.from_yaml_str(yaml_content)
+
     def to_yaml_config(self) -> Dict[str, Any]:
         config: Dict[str, Any] = {}

@@ -506,3 +511,36 @@ class SkyServiceSpec:
         if not hasattr(self, '_pool'):
             return False
         return bool(self._pool)
+
+    def copy(self, **override) -> 'SkyServiceSpec':
+        return SkyServiceSpec(
+            readiness_path=override.pop('readiness_path', self._readiness_path),
+            initial_delay_seconds=override.pop('initial_delay_seconds',
+                                               self._initial_delay_seconds),
+            readiness_timeout_seconds=override.pop(
+                'readiness_timeout_seconds', self._readiness_timeout_seconds),
+            min_replicas=override.pop('min_replicas', self._min_replicas),
+            max_replicas=override.pop('max_replicas', self._max_replicas),
+            num_overprovision=override.pop('num_overprovision',
+                                           self._num_overprovision),
+            ports=override.pop('ports', self._ports),
+            target_qps_per_replica=override.pop('target_qps_per_replica',
+                                                self._target_qps_per_replica),
+            post_data=override.pop('post_data', self._post_data),
+            tls_credential=override.pop('tls_credential', self._tls_credential),
+            readiness_headers=override.pop('readiness_headers',
+                                           self._readiness_headers),
+            dynamic_ondemand_fallback=override.pop(
+                'dynamic_ondemand_fallback', self._dynamic_ondemand_fallback),
+            base_ondemand_fallback_replicas=override.pop(
+                'base_ondemand_fallback_replicas',
+                self._base_ondemand_fallback_replicas),
+            spot_placer=override.pop('spot_placer', self._spot_placer),
+            upscale_delay_seconds=override.pop('upscale_delay_seconds',
+                                               self._upscale_delay_seconds),
+            downscale_delay_seconds=override.pop('downscale_delay_seconds',
+                                                 self._downscale_delay_seconds),
+            load_balancing_policy=override.pop('load_balancing_policy',
+                                               self._load_balancing_policy),
+            pool=override.pop('pool', self._pool),
+        )
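
The new copy(**override) uses a pop-with-default idiom: each constructor argument is taken from the caller's overrides when present, and from the current instance otherwise. A tiny illustration of the idiom with a hypothetical Spec class, not SkyServiceSpec itself:

# Hypothetical Spec class demonstrating the override-or-keep copy idiom.
class Spec:

    def __init__(self, min_replicas: int = 1, pool: bool = False):
        self._min_replicas = min_replicas
        self._pool = pool

    def copy(self, **override) -> 'Spec':
        # Each field: take the override if given, else keep the current value.
        return Spec(
            min_replicas=override.pop('min_replicas', self._min_replicas),
            pool=override.pop('pool', self._pool),
        )


base = Spec(min_replicas=2)
pooled = base.copy(pool=True)
print(pooled._min_replicas, pooled._pool)  # 2 True

Note that keys left in override after all the pops are silently ignored rather than rejected, so a misspelled field name falls back to the current value without warning.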
sky/server/auth/loopback.py
ADDED
@@ -0,0 +1,38 @@
+"""Shared loopback detection utilities for auth middlewares."""
+
+import ipaddress
+
+import fastapi
+
+from sky import sky_logging
+
+logger = sky_logging.init_logger(__name__)
+
+COMMON_PROXY_HEADERS = [
+    'X-Forwarded-For', 'Forwarded', 'X-Real-IP', 'X-Client-IP',
+    'X-Forwarded-Host', 'X-Forwarded-Proto'
+]
+
+
+def _is_loopback_ip(ip_str: str) -> bool:
+    """Check if an IP address is a loopback address."""
+    try:
+        ip = ipaddress.ip_address(ip_str)
+        return ip.is_loopback
+    except ValueError:
+        return False
+
+
+def is_loopback_request(request: fastapi.Request) -> bool:
+    """Determine if a request is coming from localhost."""
+    if request.client is None:
+        return False
+
+    client_host = request.client.host
+    if client_host == 'localhost' or _is_loopback_ip(client_host):
+        # Additional checks: ensure no forwarding headers are present.
+        # If there are any, assume this traffic went through a proxy.
+        return not any(
+            request.headers.get(header) for header in COMMON_PROXY_HEADERS)
+
+    return False
sky/server/auth/oauth2_proxy.py
CHANGED
@@ -15,7 +15,10 @@ import starlette.middleware.base
 from sky import global_user_state
 from sky import models
 from sky import sky_logging
+from sky.jobs import utils as managed_job_utils
+from sky.server import middleware_utils
 from sky.server.auth import authn
+from sky.server.auth import loopback
 from sky.users import permission
 from sky.utils import common_utils

@@ -34,11 +37,12 @@ OAUTH2_PROXY_BASE_URL_ENV_VAR = 'SKYPILOT_AUTH_OAUTH2_PROXY_BASE_URL'
 OAUTH2_PROXY_ENABLED_ENV_VAR = 'SKYPILOT_AUTH_OAUTH2_PROXY_ENABLED'


+@middleware_utils.websocket_aware
 class OAuth2ProxyMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
     """Middleware to handle authentication by delegating to OAuth2 Proxy."""

-    def __init__(self,
-        super().__init__(
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
         self.enabled: bool = (os.getenv(OAUTH2_PROXY_ENABLED_ENV_VAR,
                                         'false') == 'true')
         self.proxy_base: str = ''
@@ -108,6 +112,10 @@ class OAuth2ProxyMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
             # Already authenticated
             return await call_next(request)

+        if managed_job_utils.is_consolidation_mode(
+        ) and loopback.is_loopback_request(request):
+            return await call_next(request)
+
         async with aiohttp.ClientSession() as session:
             try:
                 return await self._authenticate(request, call_next, session)
@@ -120,13 +128,10 @@ class OAuth2ProxyMiddleware(starlette.middleware.base.BaseHTTPMiddleware):

     async def _authenticate(self, request: fastapi.Request, call_next,
                             session: aiohttp.ClientSession):
-        forwarded_headers =
+        forwarded_headers = {}
         auth_url = f'{self.proxy_base}/oauth2/auth'
         forwarded_headers['X-Forwarded-Uri'] = str(request.url).rstrip('/')
-
-        # to reduce the auth overhead.
-        forwarded_headers.pop('content-length', None)
-        forwarded_headers.pop('content-type', None)
+        forwarded_headers['Host'] = request.url.hostname
         logger.debug(f'authenticate request: {auth_url}, '
                      f'headers: {forwarded_headers}')
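
The _authenticate change builds the subrequest header set from scratch rather than mutating the incoming request's headers, forwarding only X-Forwarded-Uri and an explicit Host to oauth2-proxy's /oauth2/auth endpoint. A hedged sketch of that auth-subrequest flow; the function and its parameters are illustrative, the session cookie (presumably forwarded by the real middleware) is passed explicitly here, and the 202-for-authenticated convention follows oauth2-proxy's documented behavior:

# Illustrative sketch of delegating auth to oauth2-proxy's /oauth2/auth
# endpoint. Names and wiring are assumptions; oauth2-proxy answers 202 for
# a valid session and 401 otherwise.
import aiohttp


async def is_authenticated(proxy_base: str, original_url: str, host: str,
                           cookie_header: str) -> bool:
    headers = {
        'X-Forwarded-Uri': original_url.rstrip('/'),
        'Host': host,
        'Cookie': cookie_header,
    }
    async with aiohttp.ClientSession() as session:
        async with session.get(f'{proxy_base}/oauth2/auth',
                               headers=headers) as resp:
            return resp.status == 202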
|