skypilot-nightly 1.0.0.dev20250502__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +22 -6
- sky/adaptors/aws.py +81 -16
- sky/adaptors/common.py +25 -2
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/do.py +8 -2
- sky/adaptors/gcp.py +11 -0
- sky/adaptors/hyperbolic.py +8 -0
- sky/adaptors/ibm.py +5 -2
- sky/adaptors/kubernetes.py +149 -18
- sky/adaptors/nebius.py +173 -30
- sky/adaptors/primeintellect.py +1 -0
- sky/adaptors/runpod.py +68 -0
- sky/adaptors/seeweb.py +183 -0
- sky/adaptors/shadeform.py +89 -0
- sky/admin_policy.py +187 -4
- sky/authentication.py +179 -225
- sky/backends/__init__.py +4 -2
- sky/backends/backend.py +22 -9
- sky/backends/backend_utils.py +1323 -397
- sky/backends/cloud_vm_ray_backend.py +1749 -1029
- sky/backends/docker_utils.py +1 -1
- sky/backends/local_docker_backend.py +11 -6
- sky/backends/task_codegen.py +633 -0
- sky/backends/wheel_utils.py +55 -9
- sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
- sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
- sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
- sky/{clouds/service_catalog → catalog}/common.py +90 -49
- sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
- sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +116 -80
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +70 -16
- sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
- sky/catalog/data_fetchers/fetch_nebius.py +338 -0
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
- sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
- sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
- sky/catalog/hyperbolic_catalog.py +136 -0
- sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
- sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
- sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
- sky/catalog/primeintellect_catalog.py +95 -0
- sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
- sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
- sky/catalog/seeweb_catalog.py +184 -0
- sky/catalog/shadeform_catalog.py +165 -0
- sky/catalog/ssh_catalog.py +167 -0
- sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
- sky/check.py +533 -185
- sky/cli.py +5 -5975
- sky/client/{cli.py → cli/command.py} +2591 -1956
- sky/client/cli/deprecation_utils.py +99 -0
- sky/client/cli/flags.py +359 -0
- sky/client/cli/table_utils.py +322 -0
- sky/client/cli/utils.py +79 -0
- sky/client/common.py +78 -32
- sky/client/oauth.py +82 -0
- sky/client/sdk.py +1219 -319
- sky/client/sdk_async.py +827 -0
- sky/client/service_account_auth.py +47 -0
- sky/cloud_stores.py +82 -3
- sky/clouds/__init__.py +13 -0
- sky/clouds/aws.py +564 -164
- sky/clouds/azure.py +105 -83
- sky/clouds/cloud.py +140 -40
- sky/clouds/cudo.py +68 -50
- sky/clouds/do.py +66 -48
- sky/clouds/fluidstack.py +63 -44
- sky/clouds/gcp.py +339 -110
- sky/clouds/hyperbolic.py +293 -0
- sky/clouds/ibm.py +70 -49
- sky/clouds/kubernetes.py +570 -162
- sky/clouds/lambda_cloud.py +74 -54
- sky/clouds/nebius.py +210 -81
- sky/clouds/oci.py +88 -66
- sky/clouds/paperspace.py +61 -44
- sky/clouds/primeintellect.py +317 -0
- sky/clouds/runpod.py +164 -74
- sky/clouds/scp.py +89 -86
- sky/clouds/seeweb.py +477 -0
- sky/clouds/shadeform.py +400 -0
- sky/clouds/ssh.py +263 -0
- sky/clouds/utils/aws_utils.py +10 -4
- sky/clouds/utils/gcp_utils.py +87 -11
- sky/clouds/utils/oci_utils.py +38 -14
- sky/clouds/utils/scp_utils.py +231 -167
- sky/clouds/vast.py +99 -77
- sky/clouds/vsphere.py +51 -40
- sky/core.py +375 -173
- sky/dag.py +15 -0
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
- sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
- sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
- sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
- sky/dashboard/out/_next/static/chunks/3800-7b45f9fbb6308557.js +1 -0
- sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
- sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
- sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
- sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
- sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
- sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
- sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
- sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
- sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
- sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
- sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
- sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
- sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
- sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
- sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
- sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-84a40f8c7c627fe4.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces-531b2f8c4bf89f82.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
- sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -0
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -0
- sky/dashboard/out/infra.html +1 -0
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -0
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -0
- sky/dashboard/out/volumes.html +1 -0
- sky/dashboard/out/workspace/new.html +1 -0
- sky/dashboard/out/workspaces/[name].html +1 -0
- sky/dashboard/out/workspaces.html +1 -0
- sky/data/data_utils.py +137 -1
- sky/data/mounting_utils.py +269 -84
- sky/data/storage.py +1460 -1807
- sky/data/storage_utils.py +43 -57
- sky/exceptions.py +126 -2
- sky/execution.py +216 -63
- sky/global_user_state.py +2390 -586
- sky/jobs/__init__.py +7 -0
- sky/jobs/client/sdk.py +300 -58
- sky/jobs/client/sdk_async.py +161 -0
- sky/jobs/constants.py +15 -8
- sky/jobs/controller.py +848 -275
- sky/jobs/file_content_utils.py +128 -0
- sky/jobs/log_gc.py +193 -0
- sky/jobs/recovery_strategy.py +402 -152
- sky/jobs/scheduler.py +314 -189
- sky/jobs/server/core.py +836 -255
- sky/jobs/server/server.py +156 -115
- sky/jobs/server/utils.py +136 -0
- sky/jobs/state.py +2109 -706
- sky/jobs/utils.py +1306 -215
- sky/logs/__init__.py +21 -0
- sky/logs/agent.py +108 -0
- sky/logs/aws.py +243 -0
- sky/logs/gcp.py +91 -0
- sky/metrics/__init__.py +0 -0
- sky/metrics/utils.py +453 -0
- sky/models.py +78 -1
- sky/optimizer.py +164 -70
- sky/provision/__init__.py +90 -4
- sky/provision/aws/config.py +147 -26
- sky/provision/aws/instance.py +136 -50
- sky/provision/azure/instance.py +11 -6
- sky/provision/common.py +13 -1
- sky/provision/cudo/cudo_machine_type.py +1 -1
- sky/provision/cudo/cudo_utils.py +14 -8
- sky/provision/cudo/cudo_wrapper.py +72 -71
- sky/provision/cudo/instance.py +10 -6
- sky/provision/do/instance.py +10 -6
- sky/provision/do/utils.py +4 -3
- sky/provision/docker_utils.py +140 -33
- sky/provision/fluidstack/instance.py +13 -8
- sky/provision/gcp/__init__.py +1 -0
- sky/provision/gcp/config.py +301 -19
- sky/provision/gcp/constants.py +218 -0
- sky/provision/gcp/instance.py +36 -8
- sky/provision/gcp/instance_utils.py +18 -4
- sky/provision/gcp/volume_utils.py +247 -0
- sky/provision/hyperbolic/__init__.py +12 -0
- sky/provision/hyperbolic/config.py +10 -0
- sky/provision/hyperbolic/instance.py +437 -0
- sky/provision/hyperbolic/utils.py +373 -0
- sky/provision/instance_setup.py +101 -20
- sky/provision/kubernetes/__init__.py +5 -0
- sky/provision/kubernetes/config.py +9 -52
- sky/provision/kubernetes/constants.py +17 -0
- sky/provision/kubernetes/instance.py +919 -280
- sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
- sky/provision/kubernetes/network.py +27 -17
- sky/provision/kubernetes/network_utils.py +44 -43
- sky/provision/kubernetes/utils.py +1221 -534
- sky/provision/kubernetes/volume.py +343 -0
- sky/provision/lambda_cloud/instance.py +22 -16
- sky/provision/nebius/constants.py +50 -0
- sky/provision/nebius/instance.py +19 -6
- sky/provision/nebius/utils.py +237 -137
- sky/provision/oci/instance.py +10 -5
- sky/provision/paperspace/instance.py +10 -7
- sky/provision/paperspace/utils.py +1 -1
- sky/provision/primeintellect/__init__.py +10 -0
- sky/provision/primeintellect/config.py +11 -0
- sky/provision/primeintellect/instance.py +454 -0
- sky/provision/primeintellect/utils.py +398 -0
- sky/provision/provisioner.py +117 -36
- sky/provision/runpod/__init__.py +5 -0
- sky/provision/runpod/instance.py +27 -6
- sky/provision/runpod/utils.py +51 -18
- sky/provision/runpod/volume.py +214 -0
- sky/provision/scp/__init__.py +15 -0
- sky/provision/scp/config.py +93 -0
- sky/provision/scp/instance.py +707 -0
- sky/provision/seeweb/__init__.py +11 -0
- sky/provision/seeweb/config.py +13 -0
- sky/provision/seeweb/instance.py +812 -0
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/ssh/__init__.py +18 -0
- sky/provision/vast/instance.py +13 -8
- sky/provision/vast/utils.py +10 -7
- sky/provision/volume.py +164 -0
- sky/provision/vsphere/common/ssl_helper.py +1 -1
- sky/provision/vsphere/common/vapiconnect.py +2 -1
- sky/provision/vsphere/common/vim_utils.py +4 -4
- sky/provision/vsphere/instance.py +15 -10
- sky/provision/vsphere/vsphere_utils.py +17 -20
- sky/py.typed +0 -0
- sky/resources.py +845 -119
- sky/schemas/__init__.py +0 -0
- sky/schemas/api/__init__.py +0 -0
- sky/schemas/api/responses.py +227 -0
- sky/schemas/db/README +4 -0
- sky/schemas/db/env.py +90 -0
- sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
- sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
- sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
- sky/schemas/db/global_user_state/004_is_managed.py +34 -0
- sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
- sky/schemas/db/global_user_state/006_provision_log.py +41 -0
- sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
- sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
- sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
- sky/schemas/db/script.py.mako +28 -0
- sky/schemas/db/serve_state/001_initial_schema.py +67 -0
- sky/schemas/db/serve_state/002_yaml_content.py +34 -0
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
- sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
- sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
- sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
- sky/schemas/generated/__init__.py +0 -0
- sky/schemas/generated/autostopv1_pb2.py +36 -0
- sky/schemas/generated/autostopv1_pb2.pyi +43 -0
- sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
- sky/schemas/generated/jobsv1_pb2.py +86 -0
- sky/schemas/generated/jobsv1_pb2.pyi +254 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
- sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
- sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
- sky/schemas/generated/servev1_pb2.py +58 -0
- sky/schemas/generated/servev1_pb2.pyi +115 -0
- sky/schemas/generated/servev1_pb2_grpc.py +322 -0
- sky/serve/autoscalers.py +357 -5
- sky/serve/client/impl.py +310 -0
- sky/serve/client/sdk.py +47 -139
- sky/serve/client/sdk_async.py +130 -0
- sky/serve/constants.py +12 -9
- sky/serve/controller.py +68 -17
- sky/serve/load_balancer.py +106 -60
- sky/serve/load_balancing_policies.py +116 -2
- sky/serve/replica_managers.py +434 -249
- sky/serve/serve_rpc_utils.py +179 -0
- sky/serve/serve_state.py +569 -257
- sky/serve/serve_utils.py +775 -265
- sky/serve/server/core.py +66 -711
- sky/serve/server/impl.py +1093 -0
- sky/serve/server/server.py +21 -18
- sky/serve/service.py +192 -89
- sky/serve/service_spec.py +144 -20
- sky/serve/spot_placer.py +3 -0
- sky/server/auth/__init__.py +0 -0
- sky/server/auth/authn.py +50 -0
- sky/server/auth/loopback.py +38 -0
- sky/server/auth/oauth2_proxy.py +202 -0
- sky/server/common.py +478 -182
- sky/server/config.py +85 -23
- sky/server/constants.py +44 -6
- sky/server/daemons.py +295 -0
- sky/server/html/token_page.html +185 -0
- sky/server/metrics.py +160 -0
- sky/server/middleware_utils.py +166 -0
- sky/server/requests/executor.py +558 -138
- sky/server/requests/payloads.py +364 -24
- sky/server/requests/preconditions.py +21 -17
- sky/server/requests/process.py +112 -29
- sky/server/requests/request_names.py +121 -0
- sky/server/requests/requests.py +822 -226
- sky/server/requests/serializers/decoders.py +82 -31
- sky/server/requests/serializers/encoders.py +140 -22
- sky/server/requests/threads.py +117 -0
- sky/server/rest.py +455 -0
- sky/server/server.py +1309 -285
- sky/server/state.py +20 -0
- sky/server/stream_utils.py +327 -61
- sky/server/uvicorn.py +217 -3
- sky/server/versions.py +270 -0
- sky/setup_files/MANIFEST.in +11 -1
- sky/setup_files/alembic.ini +160 -0
- sky/setup_files/dependencies.py +139 -31
- sky/setup_files/setup.py +44 -42
- sky/sky_logging.py +114 -7
- sky/skylet/attempt_skylet.py +106 -24
- sky/skylet/autostop_lib.py +129 -8
- sky/skylet/configs.py +29 -20
- sky/skylet/constants.py +216 -25
- sky/skylet/events.py +101 -21
- sky/skylet/job_lib.py +345 -164
- sky/skylet/log_lib.py +297 -18
- sky/skylet/log_lib.pyi +44 -1
- sky/skylet/providers/ibm/node_provider.py +12 -8
- sky/skylet/providers/ibm/vpc_provider.py +13 -12
- sky/skylet/ray_patches/__init__.py +17 -3
- sky/skylet/ray_patches/autoscaler.py.diff +18 -0
- sky/skylet/ray_patches/cli.py.diff +19 -0
- sky/skylet/ray_patches/command_runner.py.diff +17 -0
- sky/skylet/ray_patches/log_monitor.py.diff +20 -0
- sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
- sky/skylet/ray_patches/updater.py.diff +18 -0
- sky/skylet/ray_patches/worker.py.diff +41 -0
- sky/skylet/runtime_utils.py +21 -0
- sky/skylet/services.py +568 -0
- sky/skylet/skylet.py +72 -4
- sky/skylet/subprocess_daemon.py +104 -29
- sky/skypilot_config.py +506 -99
- sky/ssh_node_pools/__init__.py +1 -0
- sky/ssh_node_pools/core.py +135 -0
- sky/ssh_node_pools/server.py +233 -0
- sky/task.py +685 -163
- sky/templates/aws-ray.yml.j2 +11 -3
- sky/templates/azure-ray.yml.j2 +2 -1
- sky/templates/cudo-ray.yml.j2 +1 -0
- sky/templates/do-ray.yml.j2 +2 -1
- sky/templates/fluidstack-ray.yml.j2 +1 -0
- sky/templates/gcp-ray.yml.j2 +62 -1
- sky/templates/hyperbolic-ray.yml.j2 +68 -0
- sky/templates/ibm-ray.yml.j2 +2 -1
- sky/templates/jobs-controller.yaml.j2 +27 -24
- sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
- sky/templates/kubernetes-ray.yml.j2 +611 -50
- sky/templates/lambda-ray.yml.j2 +2 -1
- sky/templates/nebius-ray.yml.j2 +34 -12
- sky/templates/oci-ray.yml.j2 +1 -0
- sky/templates/paperspace-ray.yml.j2 +2 -1
- sky/templates/primeintellect-ray.yml.j2 +72 -0
- sky/templates/runpod-ray.yml.j2 +10 -1
- sky/templates/scp-ray.yml.j2 +4 -50
- sky/templates/seeweb-ray.yml.j2 +171 -0
- sky/templates/shadeform-ray.yml.j2 +73 -0
- sky/templates/sky-serve-controller.yaml.j2 +22 -2
- sky/templates/vast-ray.yml.j2 +1 -0
- sky/templates/vsphere-ray.yml.j2 +1 -0
- sky/templates/websocket_proxy.py +212 -37
- sky/usage/usage_lib.py +31 -15
- sky/users/__init__.py +0 -0
- sky/users/model.conf +15 -0
- sky/users/permission.py +397 -0
- sky/users/rbac.py +121 -0
- sky/users/server.py +720 -0
- sky/users/token_service.py +218 -0
- sky/utils/accelerator_registry.py +35 -5
- sky/utils/admin_policy_utils.py +84 -38
- sky/utils/annotations.py +38 -5
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/atomic.py +1 -1
- sky/utils/auth_utils.py +153 -0
- sky/utils/benchmark_utils.py +60 -0
- sky/utils/cli_utils/status_utils.py +159 -86
- sky/utils/cluster_utils.py +31 -9
- sky/utils/command_runner.py +354 -68
- sky/utils/command_runner.pyi +93 -3
- sky/utils/common.py +35 -8
- sky/utils/common_utils.py +314 -91
- sky/utils/config_utils.py +74 -5
- sky/utils/context.py +403 -0
- sky/utils/context_utils.py +242 -0
- sky/utils/controller_utils.py +383 -89
- sky/utils/dag_utils.py +31 -12
- sky/utils/db/__init__.py +0 -0
- sky/utils/db/db_utils.py +485 -0
- sky/utils/db/kv_cache.py +149 -0
- sky/utils/db/migration_utils.py +137 -0
- sky/utils/directory_utils.py +12 -0
- sky/utils/env_options.py +13 -0
- sky/utils/git.py +567 -0
- sky/utils/git_clone.sh +460 -0
- sky/utils/infra_utils.py +195 -0
- sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
- sky/utils/kubernetes/config_map_utils.py +133 -0
- sky/utils/kubernetes/create_cluster.sh +15 -29
- sky/utils/kubernetes/delete_cluster.sh +10 -7
- sky/utils/kubernetes/deploy_ssh_node_pools.py +1177 -0
- sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
- sky/utils/kubernetes/generate_kind_config.py +6 -66
- sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
- sky/utils/kubernetes/gpu_labeler.py +18 -8
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
- sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
- sky/utils/kubernetes/kubernetes_deploy_utils.py +284 -114
- sky/utils/kubernetes/rsync_helper.sh +11 -3
- sky/utils/kubernetes/ssh-tunnel.sh +379 -0
- sky/utils/kubernetes/ssh_utils.py +221 -0
- sky/utils/kubernetes_enums.py +8 -15
- sky/utils/lock_events.py +94 -0
- sky/utils/locks.py +416 -0
- sky/utils/log_utils.py +82 -107
- sky/utils/perf_utils.py +22 -0
- sky/utils/resource_checker.py +298 -0
- sky/utils/resources_utils.py +249 -32
- sky/utils/rich_utils.py +217 -39
- sky/utils/schemas.py +955 -160
- sky/utils/serialize_utils.py +16 -0
- sky/utils/status_lib.py +10 -0
- sky/utils/subprocess_utils.py +29 -15
- sky/utils/tempstore.py +70 -0
- sky/utils/thread_utils.py +91 -0
- sky/utils/timeline.py +26 -53
- sky/utils/ux_utils.py +84 -15
- sky/utils/validator.py +11 -1
- sky/utils/volume.py +165 -0
- sky/utils/yaml_utils.py +111 -0
- sky/volumes/__init__.py +13 -0
- sky/volumes/client/__init__.py +0 -0
- sky/volumes/client/sdk.py +150 -0
- sky/volumes/server/__init__.py +0 -0
- sky/volumes/server/core.py +270 -0
- sky/volumes/server/server.py +124 -0
- sky/volumes/volume.py +215 -0
- sky/workspaces/__init__.py +0 -0
- sky/workspaces/core.py +655 -0
- sky/workspaces/server.py +101 -0
- sky/workspaces/utils.py +56 -0
- sky_templates/README.md +3 -0
- sky_templates/__init__.py +3 -0
- sky_templates/ray/__init__.py +0 -0
- sky_templates/ray/start_cluster +183 -0
- sky_templates/ray/stop_cluster +75 -0
- skypilot_nightly-1.0.0.dev20251203.dist-info/METADATA +676 -0
- skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
- {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +1 -1
- skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
- sky/benchmark/benchmark_state.py +0 -256
- sky/benchmark/benchmark_utils.py +0 -641
- sky/clouds/service_catalog/constants.py +0 -7
- sky/dashboard/out/_next/static/GWvVBSCS7FmUiVmjaL1a7/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/236-2db3ee3fba33dd9e.js +0 -6
- sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
- sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
- sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
- sky/dashboard/out/_next/static/chunks/845-9e60713e0c441abc.js +0 -1
- sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
- sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
- sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
- sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
- sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-6ac338bc2239cb45.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-1c519e1afc523dc9.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
- sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
- sky/jobs/dashboard/dashboard.py +0 -223
- sky/jobs/dashboard/static/favicon.ico +0 -0
- sky/jobs/dashboard/templates/index.html +0 -831
- sky/jobs/server/dashboard_utils.py +0 -69
- sky/skylet/providers/scp/__init__.py +0 -2
- sky/skylet/providers/scp/config.py +0 -149
- sky/skylet/providers/scp/node_provider.py +0 -578
- sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
- sky/utils/db_utils.py +0 -100
- sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
- skypilot_nightly-1.0.0.dev20250502.dist-info/METADATA +0 -361
- skypilot_nightly-1.0.0.dev20250502.dist-info/RECORD +0 -396
- skypilot_nightly-1.0.0.dev20250502.dist-info/top_level.txt +0 -1
- /sky/{clouds/service_catalog → catalog}/config.py +0 -0
- /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
- /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
- /sky/dashboard/out/_next/static/{GWvVBSCS7FmUiVmjaL1a7 → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
sky/serve/service_spec.py
CHANGED
|
@@ -2,11 +2,9 @@
|
|
|
2
2
|
import json
|
|
3
3
|
import os
|
|
4
4
|
import textwrap
|
|
5
|
-
import
|
|
6
|
-
from typing import Any, Dict, List, Optional
|
|
5
|
+
from typing import Any, Dict, List, Optional, Union
|
|
7
6
|
|
|
8
7
|
from sky import serve
|
|
9
|
-
from sky.adaptors import common as adaptors_common
|
|
10
8
|
from sky.serve import constants
|
|
11
9
|
from sky.serve import load_balancing_policies as lb_policies
|
|
12
10
|
from sky.serve import serve_utils
|
|
@@ -14,11 +12,7 @@ from sky.serve import spot_placer as spot_placer_lib
|
|
|
14
12
|
from sky.utils import common_utils
|
|
15
13
|
from sky.utils import schemas
|
|
16
14
|
from sky.utils import ux_utils
|
|
17
|
-
|
|
18
|
-
if typing.TYPE_CHECKING:
|
|
19
|
-
import yaml
|
|
20
|
-
else:
|
|
21
|
-
yaml = adaptors_common.LazyImport('yaml')
|
|
15
|
+
from sky.utils import yaml_utils
|
|
22
16
|
|
|
23
17
|
|
|
24
18
|
class SkyServiceSpec:
|
|
@@ -33,7 +27,7 @@ class SkyServiceSpec:
|
|
|
33
27
|
max_replicas: Optional[int] = None,
|
|
34
28
|
num_overprovision: Optional[int] = None,
|
|
35
29
|
ports: Optional[str] = None,
|
|
36
|
-
target_qps_per_replica: Optional[float] = None,
|
|
30
|
+
target_qps_per_replica: Optional[Union[float, Dict[str, float]]] = None,
|
|
37
31
|
post_data: Optional[Dict[str, Any]] = None,
|
|
38
32
|
tls_credential: Optional[serve_utils.TLSCredential] = None,
|
|
39
33
|
readiness_headers: Optional[Dict[str, str]] = None,
|
|
@@ -43,7 +37,33 @@ class SkyServiceSpec:
|
|
|
43
37
|
upscale_delay_seconds: Optional[int] = None,
|
|
44
38
|
downscale_delay_seconds: Optional[int] = None,
|
|
45
39
|
load_balancing_policy: Optional[str] = None,
|
|
40
|
+
pool: Optional[bool] = None,
|
|
46
41
|
) -> None:
|
|
42
|
+
if pool:
|
|
43
|
+
for unsupported_field in [
|
|
44
|
+
'max_replicas',
|
|
45
|
+
'num_overprovision',
|
|
46
|
+
'target_qps_per_replica',
|
|
47
|
+
'upscale_delay_seconds',
|
|
48
|
+
'downscale_delay_seconds',
|
|
49
|
+
'base_ondemand_fallback_replicas',
|
|
50
|
+
'dynamic_ondemand_fallback',
|
|
51
|
+
'spot_placer',
|
|
52
|
+
'load_balancing_policy',
|
|
53
|
+
'ports',
|
|
54
|
+
'post_data',
|
|
55
|
+
'tls_credential',
|
|
56
|
+
'readiness_headers',
|
|
57
|
+
]:
|
|
58
|
+
if locals()[unsupported_field] is not None:
|
|
59
|
+
with ux_utils.print_exception_no_traceback():
|
|
60
|
+
raise ValueError(
|
|
61
|
+
f'{unsupported_field} is not supported for pool.')
|
|
62
|
+
if max_replicas is not None and max_replicas != min_replicas:
|
|
63
|
+
with ux_utils.print_exception_no_traceback():
|
|
64
|
+
raise ValueError('Autoscaling is not supported for pool '
|
|
65
|
+
'for now.')
|
|
66
|
+
|
|
47
67
|
if max_replicas is not None and max_replicas < min_replicas:
|
|
48
68
|
with ux_utils.print_exception_no_traceback():
|
|
49
69
|
raise ValueError('max_replicas must be greater than or '
|
|
@@ -83,7 +103,8 @@ class SkyServiceSpec:
|
|
|
83
103
|
self._max_replicas: Optional[int] = max_replicas
|
|
84
104
|
self._num_overprovision: Optional[int] = num_overprovision
|
|
85
105
|
self._ports: Optional[str] = ports
|
|
86
|
-
self._target_qps_per_replica: Optional[float
|
|
106
|
+
self._target_qps_per_replica: Optional[Union[float, Dict[
|
|
107
|
+
str, float]]] = target_qps_per_replica
|
|
87
108
|
self._post_data: Optional[Dict[str, Any]] = post_data
|
|
88
109
|
self._tls_credential: Optional[serve_utils.TLSCredential] = (
|
|
89
110
|
tls_credential)
|
|
@@ -96,6 +117,7 @@ class SkyServiceSpec:
|
|
|
96
117
|
self._upscale_delay_seconds: Optional[int] = upscale_delay_seconds
|
|
97
118
|
self._downscale_delay_seconds: Optional[int] = downscale_delay_seconds
|
|
98
119
|
self._load_balancing_policy: Optional[str] = load_balancing_policy
|
|
120
|
+
self._pool: Optional[bool] = pool
|
|
99
121
|
|
|
100
122
|
self._use_ondemand_fallback: bool = (
|
|
101
123
|
self.dynamic_ondemand_fallback is not None and
|
|
@@ -115,7 +137,7 @@ class SkyServiceSpec:
|
|
|
115
137
|
|
|
116
138
|
service_config: Dict[str, Any] = {}
|
|
117
139
|
|
|
118
|
-
readiness_section = config
|
|
140
|
+
readiness_section = config.get('readiness_probe', '/')
|
|
119
141
|
if isinstance(readiness_section, str):
|
|
120
142
|
service_config['readiness_path'] = readiness_section
|
|
121
143
|
initial_delay_seconds = None
|
|
@@ -157,8 +179,29 @@ class SkyServiceSpec:
|
|
|
157
179
|
raise ValueError('Port must be between 1 and 65535.')
|
|
158
180
|
service_config['ports'] = str(ports) if ports is not None else None
|
|
159
181
|
|
|
182
|
+
pool_config = config.get('pool', None)
|
|
183
|
+
if pool_config is not None:
|
|
184
|
+
service_config['pool'] = pool_config
|
|
185
|
+
|
|
160
186
|
policy_section = config.get('replica_policy', None)
|
|
187
|
+
if policy_section is not None and pool_config:
|
|
188
|
+
with ux_utils.print_exception_no_traceback():
|
|
189
|
+
raise ValueError('Cannot specify `replica_policy` for cluster '
|
|
190
|
+
'pool. Only `workers: <num>` is supported '
|
|
191
|
+
'for pool now.')
|
|
192
|
+
|
|
161
193
|
simplified_policy_section = config.get('replicas', None)
|
|
194
|
+
workers_config = config.get('workers', None)
|
|
195
|
+
if simplified_policy_section is not None and workers_config is not None:
|
|
196
|
+
with ux_utils.print_exception_no_traceback():
|
|
197
|
+
raise ValueError('Cannot specify both `replicas` and `workers`.'
|
|
198
|
+
' Please use one of them.')
|
|
199
|
+
if simplified_policy_section is not None and pool_config:
|
|
200
|
+
with ux_utils.print_exception_no_traceback():
|
|
201
|
+
raise ValueError('Cannot specify `replicas` for pool. '
|
|
202
|
+
'Please use `workers` instead.')
|
|
203
|
+
if simplified_policy_section is None:
|
|
204
|
+
simplified_policy_section = workers_config
|
|
162
205
|
if policy_section is None or simplified_policy_section is not None:
|
|
163
206
|
if simplified_policy_section is not None:
|
|
164
207
|
min_replicas = simplified_policy_section
|
|
@@ -193,6 +236,26 @@ class SkyServiceSpec:
|
|
|
193
236
|
service_config['load_balancing_policy'] = config.get(
|
|
194
237
|
'load_balancing_policy', None)
|
|
195
238
|
|
|
239
|
+
# Validate instance-aware settings
|
|
240
|
+
target_qps_per_replica = service_config['target_qps_per_replica']
|
|
241
|
+
load_balancing_policy = service_config['load_balancing_policy']
|
|
242
|
+
|
|
243
|
+
if isinstance(target_qps_per_replica, dict):
|
|
244
|
+
if load_balancing_policy != 'instance_aware_least_load':
|
|
245
|
+
with ux_utils.print_exception_no_traceback():
|
|
246
|
+
raise ValueError(
|
|
247
|
+
'When using dict type target_qps_per_replica, '
|
|
248
|
+
'load_balancing_policy must be '
|
|
249
|
+
'"instance_aware_least_load".')
|
|
250
|
+
|
|
251
|
+
if load_balancing_policy == 'instance_aware_least_load':
|
|
252
|
+
if not isinstance(target_qps_per_replica, dict):
|
|
253
|
+
with ux_utils.print_exception_no_traceback():
|
|
254
|
+
raise ValueError(
|
|
255
|
+
'When using "instance_aware_least_load" policy, '
|
|
256
|
+
'target_qps_per_replica must be a '
|
|
257
|
+
'dict mapping GPU types to QPS values.')
|
|
258
|
+
|
|
196
259
|
tls_section = config.get('tls', None)
|
|
197
260
|
if tls_section is not None:
|
|
198
261
|
service_config['tls_credential'] = serve_utils.TLSCredential(
|
|
@@ -203,14 +266,13 @@ class SkyServiceSpec:
|
|
|
203
266
|
return SkyServiceSpec(**service_config)
|
|
204
267
|
|
|
205
268
|
@staticmethod
|
|
206
|
-
def
|
|
207
|
-
|
|
208
|
-
config = yaml.safe_load(f)
|
|
269
|
+
def from_yaml_str(yaml_str: str) -> 'SkyServiceSpec':
|
|
270
|
+
config = yaml_utils.safe_load(yaml_str)
|
|
209
271
|
|
|
210
272
|
if isinstance(config, str):
|
|
211
273
|
with ux_utils.print_exception_no_traceback():
|
|
212
274
|
raise ValueError('YAML loaded as str, not as dict. '
|
|
213
|
-
f'Is it correct?
|
|
275
|
+
f'Is it correct? content:\n{yaml_str}')
|
|
214
276
|
|
|
215
277
|
if config is None:
|
|
216
278
|
config = {}
|
|
@@ -218,10 +280,16 @@ class SkyServiceSpec:
|
|
|
218
280
|
if 'service' not in config:
|
|
219
281
|
with ux_utils.print_exception_no_traceback():
|
|
220
282
|
raise ValueError('Service YAML must have a "service" section. '
|
|
221
|
-
f'Is it correct?
|
|
283
|
+
f'Is it correct? content:\n{yaml_str}')
|
|
222
284
|
|
|
223
285
|
return SkyServiceSpec.from_yaml_config(config['service'])
|
|
224
286
|
|
|
287
|
+
@staticmethod
def from_yaml(yaml_path: str) -> 'SkyServiceSpec':
    """Build a service spec from a YAML file on disk.

    Args:
        yaml_path: Path to the YAML file; a leading ``~`` is expanded.

    Returns:
        The spec produced by ``SkyServiceSpec.from_yaml_str`` for the
        content of the file.
    """
    resolved_path = os.path.expanduser(yaml_path)
    with open(resolved_path, 'r', encoding='utf-8') as yaml_file:
        raw_yaml = yaml_file.read()
    return SkyServiceSpec.from_yaml_str(raw_yaml)
|
|
292
|
+
|
|
225
293
|
def to_yaml_config(self) -> Dict[str, Any]:
|
|
226
294
|
config: Dict[str, Any] = {}
|
|
227
295
|
|
|
@@ -239,6 +307,13 @@ class SkyServiceSpec:
|
|
|
239
307
|
config[section] = dict()
|
|
240
308
|
config[section][key] = value
|
|
241
309
|
|
|
310
|
+
add_if_not_none('pool', None, self._pool)
|
|
311
|
+
|
|
312
|
+
if self.pool:
|
|
313
|
+
# For pool, currently only `workers: <num>` is supported.
|
|
314
|
+
add_if_not_none('workers', None, self.min_replicas)
|
|
315
|
+
return config
|
|
316
|
+
|
|
242
317
|
add_if_not_none('readiness_probe', 'path', self.readiness_path)
|
|
243
318
|
add_if_not_none('readiness_probe', 'initial_delay_seconds',
|
|
244
319
|
self.initial_delay_seconds)
|
|
@@ -306,10 +381,14 @@ class SkyServiceSpec:
|
|
|
306
381
|
return ' '.join(policy_strs)
|
|
307
382
|
|
|
308
383
|
def autoscaling_policy_str(self):
|
|
384
|
+
if self.pool:
|
|
385
|
+
# We only support fixed-size pool for now.
|
|
386
|
+
return f'Fixed-size ({self.min_replicas} workers)'
|
|
309
387
|
# TODO(MaoZiming): Update policy_str
|
|
388
|
+
noun = 'worker' if self.pool else 'replica'
|
|
310
389
|
min_plural = '' if self.min_replicas == 1 else 's'
|
|
311
390
|
if self.max_replicas == self.min_replicas or self.max_replicas is None:
|
|
312
|
-
return f'Fixed {self.min_replicas}
|
|
391
|
+
return f'Fixed {self.min_replicas} {noun}{min_plural}'
|
|
313
392
|
# Already checked in __init__.
|
|
314
393
|
assert self.target_qps_per_replica is not None
|
|
315
394
|
# TODO(tian): Refactor to contain more information
|
|
@@ -319,8 +398,8 @@ class SkyServiceSpec:
|
|
|
319
398
|
overprovision_str = (
|
|
320
399
|
f' with {self.num_overprovision} overprovisioned replicas')
|
|
321
400
|
return (f'Autoscaling from {self.min_replicas} to {self.max_replicas} '
|
|
322
|
-
f'
|
|
323
|
-
f'
|
|
401
|
+
f'{noun}{max_plural}{overprovision_str} (target QPS per '
|
|
402
|
+
f'{noun}: {self.target_qps_per_replica})')
|
|
324
403
|
|
|
325
404
|
def set_ports(self, ports: str) -> None:
|
|
326
405
|
self._ports = ports
|
|
@@ -332,6 +411,10 @@ class SkyServiceSpec:
|
|
|
332
411
|
f'Certfile: {self.tls_credential.certfile}')
|
|
333
412
|
|
|
334
413
|
def __repr__(self) -> str:
|
|
414
|
+
if self.pool:
|
|
415
|
+
return textwrap.dedent(f"""\
|
|
416
|
+
Worker policy: {self.autoscaling_policy_str()}
|
|
417
|
+
""")
|
|
335
418
|
return textwrap.dedent(f"""\
|
|
336
419
|
Readiness probe method: {self.probe_str()}
|
|
337
420
|
Readiness initial delay seconds: {self.initial_delay_seconds}
|
|
@@ -372,7 +455,8 @@ class SkyServiceSpec:
|
|
|
372
455
|
return self._ports
|
|
373
456
|
|
|
374
457
|
@property
def target_qps_per_replica(
        self) -> Optional[Union[float, Dict[str, float]]]:
    """Target QPS per replica as stored on this spec.

    The value is either a single number, a dict (validated elsewhere in
    this class as a mapping from GPU types to QPS values), or None when
    unset.
    """
    return self._target_qps_per_replica
|
|
377
461
|
|
|
378
462
|
@property
|
|
@@ -420,3 +504,43 @@ class SkyServiceSpec:
|
|
|
420
504
|
def load_balancing_policy(self) -> str:
|
|
421
505
|
return lb_policies.LoadBalancingPolicy.make_policy_name(
|
|
422
506
|
self._load_balancing_policy)
|
|
507
|
+
|
|
508
|
+
@property
def pool(self) -> bool:
    """Whether this spec is flagged as a pool (``_pool`` is truthy)."""
    # Instances may lack `_pool` for backward compatibility; those are
    # reported as not being a pool.
    return bool(getattr(self, '_pool', False))
|
|
514
|
+
|
|
515
|
+
def copy(self, **override) -> 'SkyServiceSpec':
    """Return a new spec with the same settings, optionally overridden.

    Args:
        **override: Constructor keyword arguments (for example
            ``min_replicas``) whose values replace the ones stored on this
            instance. Keys other than the ones popped below are ignored.

    Returns:
        A newly constructed ``SkyServiceSpec``.
    """
    return SkyServiceSpec(
        readiness_path=override.pop('readiness_path', self._readiness_path),
        initial_delay_seconds=override.pop('initial_delay_seconds',
                                           self._initial_delay_seconds),
        readiness_timeout_seconds=override.pop(
            'readiness_timeout_seconds', self._readiness_timeout_seconds),
        min_replicas=override.pop('min_replicas', self._min_replicas),
        max_replicas=override.pop('max_replicas', self._max_replicas),
        num_overprovision=override.pop('num_overprovision',
                                       self._num_overprovision),
        ports=override.pop('ports', self._ports),
        target_qps_per_replica=override.pop('target_qps_per_replica',
                                            self._target_qps_per_replica),
        post_data=override.pop('post_data', self._post_data),
        tls_credential=override.pop('tls_credential', self._tls_credential),
        readiness_headers=override.pop('readiness_headers',
                                       self._readiness_headers),
        dynamic_ondemand_fallback=override.pop(
            'dynamic_ondemand_fallback', self._dynamic_ondemand_fallback),
        base_ondemand_fallback_replicas=override.pop(
            'base_ondemand_fallback_replicas',
            self._base_ondemand_fallback_replicas),
        spot_placer=override.pop('spot_placer', self._spot_placer),
        upscale_delay_seconds=override.pop('upscale_delay_seconds',
                                           self._upscale_delay_seconds),
        downscale_delay_seconds=override.pop('downscale_delay_seconds',
                                             self._downscale_delay_seconds),
        load_balancing_policy=override.pop('load_balancing_policy',
                                           self._load_balancing_policy),
        # `_pool` can be missing on instances kept for backward
        # compatibility (the `pool` property guards for the same case), so
        # fall back to None instead of raising AttributeError.
        # NOTE(review): assumes the constructor accepts `pool=None` -
        # confirm against `__init__`.
        pool=override.pop('pool', getattr(self, '_pool', None)),
    )
|
sky/serve/spot_placer.py
CHANGED
|
@@ -46,6 +46,8 @@ class Location:
|
|
|
46
46
|
|
|
47
47
|
@classmethod
def from_resources(cls, resources: 'resources_lib.Resources') -> 'Location':
    """Create a location from the cloud, region and zone of ``resources``.

    The cloud and the region must already be set on ``resources``; the zone
    is passed through unchanged.
    """
    cloud = resources.cloud
    assert cloud is not None, 'Cloud must be specified'
    region = resources.region
    assert region is not None, 'Region must be specified'
    return cls(cloud, region, resources.zone)
|
|
50
52
|
|
|
51
53
|
def to_dict(self) -> Dict[str, Any]:
|
|
@@ -147,6 +149,7 @@ def _get_possible_location_from_task(task: 'task_lib.Task') -> List[Location]:
|
|
|
147
149
|
cloud_str = str(launchable.cloud)
|
|
148
150
|
region = launchable.region
|
|
149
151
|
zone = launchable.zone
|
|
152
|
+
assert region is not None, 'Region must be specified'
|
|
150
153
|
if (cloud_str not in location_requirements and
|
|
151
154
|
location_requirements):
|
|
152
155
|
continue
|
|
File without changes
|
sky/server/auth/authn.py
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
"""Authentication module."""
|
|
2
|
+
import json
|
|
3
|
+
from typing import Optional
|
|
4
|
+
|
|
5
|
+
import fastapi
|
|
6
|
+
|
|
7
|
+
from sky import models
|
|
8
|
+
from sky import sky_logging
|
|
9
|
+
from sky.skylet import constants
|
|
10
|
+
|
|
11
|
+
logger = sky_logging.init_logger(__name__)
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
# TODO(hailong): Remove this function and use request.state.auth_user instead.
|
|
15
|
+
async def override_user_info_in_request_body(request: fastapi.Request,
                                             auth_user: Optional[models.User]):
    """Inject the authenticated user's id and name into a JSON request body.

    The user id and name are written into the ``env_vars`` mapping of the
    JSON body (created when absent) and the request's stored body bytes are
    replaced with the updated JSON. Nothing is changed when the request is an
    upload, when ``auth_user`` is None, when the body is empty, or when the
    body is not a JSON object.

    Args:
        request: The incoming request whose body may be rewritten.
        auth_user: The authenticated user, or None if there is none.
    """
    # Skip for upload requests to avoid consuming the body prematurely, which
    # will break the streaming upload.
    if request.url.path.startswith('/upload'):
        return
    if auth_user is None:
        return

    body = await request.body()
    if not body:
        return

    try:
        original_json = await request.json()
    except (json.JSONDecodeError, UnicodeDecodeError) as e:
        logger.error(f'Error parsing request JSON: {e}')
        return

    # Valid JSON is not necessarily an object: a list, string, number or null
    # has no `env_vars` key to set, and indexing it would raise here.
    if not isinstance(original_json, dict):
        logger.warning(
            f'Request body is not a JSON object '
            f'for request {request.state.request_id}. '
            'Skipping user info injection into body.')
        return

    logger.debug(f'Overriding user for {request.state.request_id}: '
                 f'{auth_user.name}, {auth_user.id}')
    # Creates an empty mapping when the key is absent; an existing value is
    # returned untouched so a non-dict value can be detected below.
    env_vars = original_json.setdefault('env_vars', {})
    if isinstance(env_vars, dict):
        env_vars[constants.USER_ID_ENV_VAR] = auth_user.id
        env_vars[constants.USER_ENV_VAR] = auth_user.name
    else:
        logger.warning(
            f'"env_vars" in request body is not a dictionary '
            f'for request {request.state.request_id}. '
            'Skipping user info injection into body.')
    # Overwrite the request's stored body bytes with the updated JSON.
    request._body = json.dumps(original_json).encode('utf-8')  # pylint: disable=protected-access
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
"""Shared loopback detection utilities for auth middlewares."""
|
|
2
|
+
|
|
3
|
+
import ipaddress
|
|
4
|
+
|
|
5
|
+
import fastapi
|
|
6
|
+
|
|
7
|
+
from sky import sky_logging
|
|
8
|
+
|
|
9
|
+
logger = sky_logging.init_logger(__name__)
|
|
10
|
+
|
|
11
|
+
COMMON_PROXY_HEADERS = [
|
|
12
|
+
'X-Forwarded-For', 'Forwarded', 'X-Real-IP', 'X-Client-IP',
|
|
13
|
+
'X-Forwarded-Host', 'X-Forwarded-Proto'
|
|
14
|
+
]
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def _is_loopback_ip(ip_str: str) -> bool:
    """Return True when ``ip_str`` parses as a loopback IP address.

    Strings that are not valid IP addresses yield False.
    """
    try:
        parsed = ipaddress.ip_address(ip_str)
    except ValueError:
        return False
    return parsed.is_loopback
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def is_loopback_request(request: fastapi.Request) -> bool:
    """Determine if a request is coming from localhost.

    Returns True only when the peer host is ``localhost`` or a loopback IP
    and none of ``COMMON_PROXY_HEADERS`` carries a value.
    """
    client = request.client
    if client is None:
        return False

    host = client.host
    if host != 'localhost' and not _is_loopback_ip(host):
        return False

    # Additional checks: ensure no forwarding headers are present.
    # If there are any, assume this traffic went through a proxy.
    for header in COMMON_PROXY_HEADERS:
        if request.headers.get(header):
            return False
    return True
|
|
@@ -0,0 +1,202 @@
|
|
|
1
|
+
"""Authentication based on oauth2-proxy."""
|
|
2
|
+
|
|
3
|
+
import asyncio
|
|
4
|
+
import hashlib
|
|
5
|
+
import http
|
|
6
|
+
import os
|
|
7
|
+
import traceback
|
|
8
|
+
from typing import Optional
|
|
9
|
+
import urllib
|
|
10
|
+
|
|
11
|
+
import aiohttp
|
|
12
|
+
import fastapi
|
|
13
|
+
import starlette.middleware.base
|
|
14
|
+
|
|
15
|
+
from sky import global_user_state
|
|
16
|
+
from sky import models
|
|
17
|
+
from sky import sky_logging
|
|
18
|
+
from sky.jobs import utils as managed_job_utils
|
|
19
|
+
from sky.server import middleware_utils
|
|
20
|
+
from sky.server.auth import authn
|
|
21
|
+
from sky.server.auth import loopback
|
|
22
|
+
from sky.users import permission
|
|
23
|
+
from sky.utils import common_utils
|
|
24
|
+
|
|
25
|
+
logger = sky_logging.init_logger(__name__)
|
|
26
|
+
|
|
27
|
+
# We do not support setting these in config.yaml because:
|
|
28
|
+
# 1. config.yaml can be updated dynamically, but auth middleware does not
|
|
29
|
+
# support hot reload yet.
|
|
30
|
+
# 2. If we introduce hot reload for auth middleware, bad config might
|
|
31
|
+
# invalidate all authenticated sessions and thus cannot be rolled back
|
|
32
|
+
# by API users.
|
|
33
|
+
# TODO(aylei): we should introduce server.yaml for static server admin config,
|
|
34
|
+
# which is more structured than multiple environment variables and can be less
|
|
35
|
+
# confusing to users.
|
|
36
|
+
OAUTH2_PROXY_BASE_URL_ENV_VAR = 'SKYPILOT_AUTH_OAUTH2_PROXY_BASE_URL'
|
|
37
|
+
OAUTH2_PROXY_ENABLED_ENV_VAR = 'SKYPILOT_AUTH_OAUTH2_PROXY_ENABLED'
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
@middleware_utils.websocket_aware
class OAuth2ProxyMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
    """Middleware to handle authentication by delegating to OAuth2 Proxy.

    The middleware is active only when ``OAUTH2_PROXY_ENABLED_ENV_VAR`` is
    set to ``'true'``; it then requires ``OAUTH2_PROXY_BASE_URL_ENV_VAR`` to
    hold the base URL of the oauth2-proxy service.
    """

    def __init__(self, *args, **kwargs):
        """Read the enable flag and proxy base URL from the environment.

        Raises:
            ValueError: If the proxy is enabled but no base URL is set.
        """
        super().__init__(*args, **kwargs)
        self.enabled: bool = (os.getenv(OAUTH2_PROXY_ENABLED_ENV_VAR,
                                        'false') == 'true')
        self.proxy_base: str = ''
        if self.enabled:
            proxy_base = os.getenv(OAUTH2_PROXY_BASE_URL_ENV_VAR)
            if not proxy_base:
                raise ValueError('OAuth2 Proxy is enabled but base_url is not '
                                 'set')
            # Drop trailing slashes so URLs can be built as `{base}/{path}`.
            self.proxy_base = proxy_base.rstrip('/')

    async def dispatch(self, request: fastapi.Request, call_next):
        """Route the request: pass through, forward to the proxy, or auth."""
        if not self.enabled:
            return await call_next(request)

        # Forward /oauth2/* to oauth2-proxy, including /oauth2/start and
        # /oauth2/callback.
        if request.url.path.startswith('/oauth2'):
            return await self.forward_to_oauth2_proxy(request)

        return await self.authenticate(request, call_next)

    async def forward_to_oauth2_proxy(self, request: fastapi.Request):
        """Forward requests to oauth2-proxy service.

        Returns the proxy's response (status, headers, body and cookies), or
        a 502 JSON response when the proxy cannot be reached.
        """
        logger.debug(f'forwarding to oauth2-proxy: {request.url.path}')
        path = request.url.path.lstrip('/')
        target_url = f'{self.proxy_base}/{path}'
        body = await request.body()
        async with aiohttp.ClientSession() as session:
            try:
                forwarded_headers = dict(request.headers)
                async with session.request(
                        method=request.method,
                        url=target_url,
                        headers=forwarded_headers,
                        data=body,
                        cookies=request.cookies,
                        params=request.query_params,
                        # Redirects are returned to the client unfollowed.
                        allow_redirects=False,
                ) as response:
                    response_body = await response.read()
                    fastapi_response = fastapi.responses.Response(
                        content=response_body,
                        status_code=response.status,
                        headers=dict(response.headers),
                    )
                    # Forward cookies from OAuth2 proxy response to client
                    for cookie_name, cookie in response.cookies.items():
                        fastapi_response.set_cookie(
                            key=cookie_name,
                            value=cookie.value,
                            max_age=cookie.get('max-age'),
                            expires=cookie.get('expires'),
                            path=cookie.get('path', '/'),
                            domain=cookie.get('domain'),
                            secure=cookie.get('secure', False),
                            httponly=cookie.get('httponly', False),
                        )
                    return fastapi_response
            except (aiohttp.ClientError, asyncio.TimeoutError) as e:
                logger.error(f'Error forwarding to OAuth2 proxy: {e}')
                return fastapi.responses.JSONResponse(
                    status_code=http.HTTPStatus.BAD_GATEWAY,
                    content={'detail': 'oauth2-proxy service unavailable'})

    async def authenticate(self, request: fastapi.Request, call_next):
        """Authenticate the request via oauth2-proxy unless it is exempt.

        Requests that already carry ``request.state.auth_user``, and
        loopback requests in consolidation mode, are passed through
        unchanged. Communication failures with the proxy yield a 502.
        """
        if request.state.auth_user is not None:
            # Already authenticated
            return await call_next(request)

        if managed_job_utils.is_consolidation_mode(
        ) and loopback.is_loopback_request(request):
            return await call_next(request)

        async with aiohttp.ClientSession() as session:
            try:
                return await self._authenticate(request, call_next, session)
            except (aiohttp.ClientError, asyncio.TimeoutError) as e:
                logger.error(f'Error communicating with OAuth2 proxy: {e}'
                             f'{traceback.format_exc()}')
                return fastapi.responses.JSONResponse(
                    status_code=http.HTTPStatus.BAD_GATEWAY,
                    content={'detail': 'oauth2-proxy service unavailable'})

    async def _authenticate(self, request: fastapi.Request, call_next,
                            session: aiohttp.ClientSession):
        """Ask oauth2-proxy's auth endpoint whether the request is signed in.

        * 202: the user is recorded, attached to ``request.state`` and
          injected into the request body, then the request proceeds.
        * 401: ``/api/health`` proceeds anonymously; anything else is
          redirected to ``/oauth2/start``.
        * Other statuses are logged and relayed as a JSON error.
        """
        forwarded_headers = {}
        auth_url = f'{self.proxy_base}/oauth2/auth'
        forwarded_headers['X-Forwarded-Uri'] = str(request.url).rstrip('/')
        forwarded_headers['Host'] = request.url.hostname
        logger.debug(f'authenticate request: {auth_url}, '
                     f'headers: {forwarded_headers}')

        async with session.request(
                method='GET',
                url=auth_url,
                headers=forwarded_headers,
                cookies=request.cookies,
                timeout=aiohttp.ClientTimeout(total=10),
                allow_redirects=False,
        ) as auth_response:

            if auth_response.status == http.HTTPStatus.ACCEPTED:
                # User is authenticated, extract user info from headers
                auth_user = self.get_auth_user(auth_response)
                if not auth_user:
                    return fastapi.responses.JSONResponse(
                        status_code=http.HTTPStatus.INTERNAL_SERVER_ERROR,
                        content={
                            'detail':
                                'oauth2-proxy is enabled but did not '
                                'return user info, check your oauth2-proxy '
                                'setup.'
                        })
                newly_added = global_user_state.add_or_update_user(auth_user)
                if newly_added:
                    permission.permission_service.add_user_if_not_exists(
                        auth_user.id)
                request.state.auth_user = auth_user
                await authn.override_user_info_in_request_body(
                    request, auth_user)
                return await call_next(request)
            elif auth_response.status == http.HTTPStatus.UNAUTHORIZED:
                # For /api/health, we should allow unauthenticated requests to
                # not break healthz check.
                # TODO(aylei): remove this to an aggregated login middleware
                # in favor of the unified authentication.
                if request.url.path.startswith('/api/health'):
                    request.state.anonymous_user = True
                    return await call_next(request)

                # TODO(aylei): in unified authentication, the redirection
                # or rejection should be done after all the authentication
                # methods are performed.
                # Not authenticated, redirect to sign-in
                redirect_path = request.url.path
                if request.url.query:
                    redirect_path += f'?{request.url.query}'
                rd = urllib.parse.quote(redirect_path)
                signin_url = (f'{request.base_url}oauth2/start?'
                              f'rd={rd}')
                return fastapi.responses.RedirectResponse(url=signin_url)
            else:
                # `text` is a coroutine method on aiohttp responses, so it has
                # to be awaited to log the body rather than the method repr.
                # A failure to read the body must not mask the original
                # status, hence the fallback.
                try:
                    error_body = await auth_response.text(errors='replace')
                except (aiohttp.ClientError, asyncio.TimeoutError):
                    error_body = '<failed to read response body>'
                logger.error('oauth2-proxy returned unexpected status '
                             f'{auth_response.status}: {error_body}')
                return fastapi.responses.JSONResponse(
                    status_code=auth_response.status,
                    content={'detail': 'oauth2-proxy error'})

    def get_auth_user(
            self, response: aiohttp.ClientResponse) -> Optional[models.User]:
        """Extract user info from OAuth2 proxy response headers.

        Returns a user named after the ``X-Auth-Request-Email`` header whose
        id is the truncated MD5 hex digest of that email, or None when the
        header is missing or empty.
        """
        email_header = response.headers.get('X-Auth-Request-Email')
        if email_header:
            user_hash = hashlib.md5(email_header.encode()).hexdigest(
            )[:common_utils.USER_HASH_LENGTH]
            return models.User(id=user_hash, name=email_header)
        return None
|