skypilot-nightly 1.0.0.dev20250509__py3-none-any.whl → 1.0.0.dev20251107__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of skypilot-nightly might be problematic. Click here for more details.
- sky/__init__.py +22 -6
- sky/adaptors/aws.py +25 -7
- sky/adaptors/common.py +24 -1
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/do.py +8 -2
- sky/adaptors/hyperbolic.py +8 -0
- sky/adaptors/kubernetes.py +149 -18
- sky/adaptors/nebius.py +170 -17
- sky/adaptors/primeintellect.py +1 -0
- sky/adaptors/runpod.py +68 -0
- sky/adaptors/seeweb.py +167 -0
- sky/adaptors/shadeform.py +89 -0
- sky/admin_policy.py +187 -4
- sky/authentication.py +179 -225
- sky/backends/__init__.py +4 -2
- sky/backends/backend.py +22 -9
- sky/backends/backend_utils.py +1299 -380
- sky/backends/cloud_vm_ray_backend.py +1715 -518
- sky/backends/docker_utils.py +1 -1
- sky/backends/local_docker_backend.py +11 -6
- sky/backends/wheel_utils.py +37 -9
- sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
- sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
- sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
- sky/{clouds/service_catalog → catalog}/common.py +89 -48
- sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
- sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +30 -40
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +42 -15
- sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
- sky/catalog/data_fetchers/fetch_nebius.py +335 -0
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
- sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
- sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
- sky/catalog/hyperbolic_catalog.py +136 -0
- sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
- sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
- sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
- sky/catalog/primeintellect_catalog.py +95 -0
- sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
- sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
- sky/catalog/seeweb_catalog.py +184 -0
- sky/catalog/shadeform_catalog.py +165 -0
- sky/catalog/ssh_catalog.py +167 -0
- sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
- sky/check.py +491 -203
- sky/cli.py +5 -6005
- sky/client/{cli.py → cli/command.py} +2477 -1885
- sky/client/cli/deprecation_utils.py +99 -0
- sky/client/cli/flags.py +359 -0
- sky/client/cli/table_utils.py +320 -0
- sky/client/common.py +70 -32
- sky/client/oauth.py +82 -0
- sky/client/sdk.py +1203 -297
- sky/client/sdk_async.py +833 -0
- sky/client/service_account_auth.py +47 -0
- sky/cloud_stores.py +73 -0
- sky/clouds/__init__.py +13 -0
- sky/clouds/aws.py +358 -93
- sky/clouds/azure.py +105 -83
- sky/clouds/cloud.py +127 -36
- sky/clouds/cudo.py +68 -50
- sky/clouds/do.py +66 -48
- sky/clouds/fluidstack.py +63 -44
- sky/clouds/gcp.py +339 -110
- sky/clouds/hyperbolic.py +293 -0
- sky/clouds/ibm.py +70 -49
- sky/clouds/kubernetes.py +563 -162
- sky/clouds/lambda_cloud.py +74 -54
- sky/clouds/nebius.py +206 -80
- sky/clouds/oci.py +88 -66
- sky/clouds/paperspace.py +61 -44
- sky/clouds/primeintellect.py +317 -0
- sky/clouds/runpod.py +164 -74
- sky/clouds/scp.py +89 -83
- sky/clouds/seeweb.py +466 -0
- sky/clouds/shadeform.py +400 -0
- sky/clouds/ssh.py +263 -0
- sky/clouds/utils/aws_utils.py +10 -4
- sky/clouds/utils/gcp_utils.py +87 -11
- sky/clouds/utils/oci_utils.py +38 -14
- sky/clouds/utils/scp_utils.py +177 -124
- sky/clouds/vast.py +99 -77
- sky/clouds/vsphere.py +51 -40
- sky/core.py +349 -139
- sky/dag.py +15 -0
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
- sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
- sky/dashboard/out/_next/static/chunks/1871-74503c8e80fd253b.js +6 -0
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
- sky/dashboard/out/_next/static/chunks/2755.fff53c4a3fcae910.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.72362fa129305b19.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.ad6adaa2a0fa9768.js +1 -0
- sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
- sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
- sky/dashboard/out/_next/static/chunks/4725.a830b5c9e7867c92.js +1 -0
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
- sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
- sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
- sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
- sky/dashboard/out/_next/static/chunks/6601-06114c982db410b6.js +1 -0
- sky/dashboard/out/_next/static/chunks/6856-ef8ba11f96d8c4a3.js +1 -0
- sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-32b6e2d3822301fa.js +1 -0
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
- sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
- sky/dashboard/out/_next/static/chunks/7615-3301e838e5f25772.js +1 -0
- sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
- sky/dashboard/out/_next/static/chunks/8969-1e4613c651bf4051.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.7310982cf5a0dc79.js +31 -0
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
- sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
- sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
- sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
- sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-c736ead69c2d86ec.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a37d2063af475a1c.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-d44859594e6f8064.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-5796e8d6aea291a0.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-6edeb7d06032adfc.js +21 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-479dde13399cf270.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-5ab3b907622cf0fe.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-c5a3eeee1c218af1.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces-22b23febb3e89ce1.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-2679be77fc08a2f8.js +1 -0
- sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
- sky/dashboard/out/_next/static/zB0ed6ge_W1MDszVHhijS/_buildManifest.js +1 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -0
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -0
- sky/dashboard/out/infra.html +1 -0
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -0
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -0
- sky/dashboard/out/volumes.html +1 -0
- sky/dashboard/out/workspace/new.html +1 -0
- sky/dashboard/out/workspaces/[name].html +1 -0
- sky/dashboard/out/workspaces.html +1 -0
- sky/data/data_utils.py +137 -1
- sky/data/mounting_utils.py +269 -84
- sky/data/storage.py +1451 -1807
- sky/data/storage_utils.py +43 -57
- sky/exceptions.py +132 -2
- sky/execution.py +206 -63
- sky/global_user_state.py +2374 -586
- sky/jobs/__init__.py +5 -0
- sky/jobs/client/sdk.py +242 -65
- sky/jobs/client/sdk_async.py +143 -0
- sky/jobs/constants.py +9 -8
- sky/jobs/controller.py +839 -277
- sky/jobs/file_content_utils.py +80 -0
- sky/jobs/log_gc.py +201 -0
- sky/jobs/recovery_strategy.py +398 -152
- sky/jobs/scheduler.py +315 -189
- sky/jobs/server/core.py +829 -255
- sky/jobs/server/server.py +156 -115
- sky/jobs/server/utils.py +136 -0
- sky/jobs/state.py +2092 -701
- sky/jobs/utils.py +1242 -160
- sky/logs/__init__.py +21 -0
- sky/logs/agent.py +108 -0
- sky/logs/aws.py +243 -0
- sky/logs/gcp.py +91 -0
- sky/metrics/__init__.py +0 -0
- sky/metrics/utils.py +443 -0
- sky/models.py +78 -1
- sky/optimizer.py +164 -70
- sky/provision/__init__.py +90 -4
- sky/provision/aws/config.py +147 -26
- sky/provision/aws/instance.py +135 -50
- sky/provision/azure/instance.py +10 -5
- sky/provision/common.py +13 -1
- sky/provision/cudo/cudo_machine_type.py +1 -1
- sky/provision/cudo/cudo_utils.py +14 -8
- sky/provision/cudo/cudo_wrapper.py +72 -71
- sky/provision/cudo/instance.py +10 -6
- sky/provision/do/instance.py +10 -6
- sky/provision/do/utils.py +4 -3
- sky/provision/docker_utils.py +114 -23
- sky/provision/fluidstack/instance.py +13 -8
- sky/provision/gcp/__init__.py +1 -0
- sky/provision/gcp/config.py +301 -19
- sky/provision/gcp/constants.py +218 -0
- sky/provision/gcp/instance.py +36 -8
- sky/provision/gcp/instance_utils.py +18 -4
- sky/provision/gcp/volume_utils.py +247 -0
- sky/provision/hyperbolic/__init__.py +12 -0
- sky/provision/hyperbolic/config.py +10 -0
- sky/provision/hyperbolic/instance.py +437 -0
- sky/provision/hyperbolic/utils.py +373 -0
- sky/provision/instance_setup.py +93 -14
- sky/provision/kubernetes/__init__.py +5 -0
- sky/provision/kubernetes/config.py +9 -52
- sky/provision/kubernetes/constants.py +17 -0
- sky/provision/kubernetes/instance.py +789 -247
- sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
- sky/provision/kubernetes/network.py +27 -17
- sky/provision/kubernetes/network_utils.py +40 -43
- sky/provision/kubernetes/utils.py +1192 -531
- sky/provision/kubernetes/volume.py +282 -0
- sky/provision/lambda_cloud/instance.py +22 -16
- sky/provision/nebius/constants.py +50 -0
- sky/provision/nebius/instance.py +19 -6
- sky/provision/nebius/utils.py +196 -91
- sky/provision/oci/instance.py +10 -5
- sky/provision/paperspace/instance.py +10 -7
- sky/provision/paperspace/utils.py +1 -1
- sky/provision/primeintellect/__init__.py +10 -0
- sky/provision/primeintellect/config.py +11 -0
- sky/provision/primeintellect/instance.py +454 -0
- sky/provision/primeintellect/utils.py +398 -0
- sky/provision/provisioner.py +110 -36
- sky/provision/runpod/__init__.py +5 -0
- sky/provision/runpod/instance.py +27 -6
- sky/provision/runpod/utils.py +51 -18
- sky/provision/runpod/volume.py +180 -0
- sky/provision/scp/__init__.py +15 -0
- sky/provision/scp/config.py +93 -0
- sky/provision/scp/instance.py +531 -0
- sky/provision/seeweb/__init__.py +11 -0
- sky/provision/seeweb/config.py +13 -0
- sky/provision/seeweb/instance.py +807 -0
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/ssh/__init__.py +18 -0
- sky/provision/vast/instance.py +13 -8
- sky/provision/vast/utils.py +10 -7
- sky/provision/vsphere/common/vim_utils.py +1 -2
- sky/provision/vsphere/instance.py +15 -10
- sky/provision/vsphere/vsphere_utils.py +9 -19
- sky/py.typed +0 -0
- sky/resources.py +844 -118
- sky/schemas/__init__.py +0 -0
- sky/schemas/api/__init__.py +0 -0
- sky/schemas/api/responses.py +225 -0
- sky/schemas/db/README +4 -0
- sky/schemas/db/env.py +90 -0
- sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
- sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
- sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
- sky/schemas/db/global_user_state/004_is_managed.py +34 -0
- sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
- sky/schemas/db/global_user_state/006_provision_log.py +41 -0
- sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
- sky/schemas/db/script.py.mako +28 -0
- sky/schemas/db/serve_state/001_initial_schema.py +67 -0
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
- sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
- sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/generated/__init__.py +0 -0
- sky/schemas/generated/autostopv1_pb2.py +36 -0
- sky/schemas/generated/autostopv1_pb2.pyi +43 -0
- sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
- sky/schemas/generated/jobsv1_pb2.py +86 -0
- sky/schemas/generated/jobsv1_pb2.pyi +254 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +74 -0
- sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
- sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
- sky/schemas/generated/servev1_pb2.py +58 -0
- sky/schemas/generated/servev1_pb2.pyi +115 -0
- sky/schemas/generated/servev1_pb2_grpc.py +322 -0
- sky/serve/autoscalers.py +357 -5
- sky/serve/client/impl.py +310 -0
- sky/serve/client/sdk.py +47 -139
- sky/serve/client/sdk_async.py +130 -0
- sky/serve/constants.py +10 -8
- sky/serve/controller.py +64 -19
- sky/serve/load_balancer.py +106 -60
- sky/serve/load_balancing_policies.py +115 -1
- sky/serve/replica_managers.py +273 -162
- sky/serve/serve_rpc_utils.py +179 -0
- sky/serve/serve_state.py +554 -251
- sky/serve/serve_utils.py +733 -220
- sky/serve/server/core.py +66 -711
- sky/serve/server/impl.py +1093 -0
- sky/serve/server/server.py +21 -18
- sky/serve/service.py +133 -48
- sky/serve/service_spec.py +135 -16
- sky/serve/spot_placer.py +3 -0
- sky/server/auth/__init__.py +0 -0
- sky/server/auth/authn.py +50 -0
- sky/server/auth/loopback.py +38 -0
- sky/server/auth/oauth2_proxy.py +200 -0
- sky/server/common.py +475 -181
- sky/server/config.py +81 -23
- sky/server/constants.py +44 -6
- sky/server/daemons.py +229 -0
- sky/server/html/token_page.html +185 -0
- sky/server/metrics.py +160 -0
- sky/server/requests/executor.py +528 -138
- sky/server/requests/payloads.py +351 -17
- sky/server/requests/preconditions.py +21 -17
- sky/server/requests/process.py +112 -29
- sky/server/requests/request_names.py +120 -0
- sky/server/requests/requests.py +817 -224
- sky/server/requests/serializers/decoders.py +82 -31
- sky/server/requests/serializers/encoders.py +140 -22
- sky/server/requests/threads.py +106 -0
- sky/server/rest.py +417 -0
- sky/server/server.py +1290 -284
- sky/server/state.py +20 -0
- sky/server/stream_utils.py +345 -57
- sky/server/uvicorn.py +217 -3
- sky/server/versions.py +270 -0
- sky/setup_files/MANIFEST.in +5 -0
- sky/setup_files/alembic.ini +156 -0
- sky/setup_files/dependencies.py +136 -31
- sky/setup_files/setup.py +44 -42
- sky/sky_logging.py +102 -5
- sky/skylet/attempt_skylet.py +1 -0
- sky/skylet/autostop_lib.py +129 -8
- sky/skylet/configs.py +27 -20
- sky/skylet/constants.py +171 -19
- sky/skylet/events.py +105 -21
- sky/skylet/job_lib.py +335 -104
- sky/skylet/log_lib.py +297 -18
- sky/skylet/log_lib.pyi +44 -1
- sky/skylet/ray_patches/__init__.py +17 -3
- sky/skylet/ray_patches/autoscaler.py.diff +18 -0
- sky/skylet/ray_patches/cli.py.diff +19 -0
- sky/skylet/ray_patches/command_runner.py.diff +17 -0
- sky/skylet/ray_patches/log_monitor.py.diff +20 -0
- sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
- sky/skylet/ray_patches/updater.py.diff +18 -0
- sky/skylet/ray_patches/worker.py.diff +41 -0
- sky/skylet/services.py +564 -0
- sky/skylet/skylet.py +63 -4
- sky/skylet/subprocess_daemon.py +103 -29
- sky/skypilot_config.py +506 -99
- sky/ssh_node_pools/__init__.py +1 -0
- sky/ssh_node_pools/core.py +135 -0
- sky/ssh_node_pools/server.py +233 -0
- sky/task.py +621 -137
- sky/templates/aws-ray.yml.j2 +10 -3
- sky/templates/azure-ray.yml.j2 +1 -1
- sky/templates/do-ray.yml.j2 +1 -1
- sky/templates/gcp-ray.yml.j2 +57 -0
- sky/templates/hyperbolic-ray.yml.j2 +67 -0
- sky/templates/jobs-controller.yaml.j2 +27 -24
- sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
- sky/templates/kubernetes-ray.yml.j2 +607 -51
- sky/templates/lambda-ray.yml.j2 +1 -1
- sky/templates/nebius-ray.yml.j2 +33 -12
- sky/templates/paperspace-ray.yml.j2 +1 -1
- sky/templates/primeintellect-ray.yml.j2 +71 -0
- sky/templates/runpod-ray.yml.j2 +9 -1
- sky/templates/scp-ray.yml.j2 +3 -50
- sky/templates/seeweb-ray.yml.j2 +108 -0
- sky/templates/shadeform-ray.yml.j2 +72 -0
- sky/templates/sky-serve-controller.yaml.j2 +22 -2
- sky/templates/websocket_proxy.py +178 -18
- sky/usage/usage_lib.py +18 -11
- sky/users/__init__.py +0 -0
- sky/users/model.conf +15 -0
- sky/users/permission.py +387 -0
- sky/users/rbac.py +121 -0
- sky/users/server.py +720 -0
- sky/users/token_service.py +218 -0
- sky/utils/accelerator_registry.py +34 -5
- sky/utils/admin_policy_utils.py +84 -38
- sky/utils/annotations.py +16 -5
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/auth_utils.py +153 -0
- sky/utils/benchmark_utils.py +60 -0
- sky/utils/cli_utils/status_utils.py +159 -86
- sky/utils/cluster_utils.py +31 -9
- sky/utils/command_runner.py +354 -68
- sky/utils/command_runner.pyi +93 -3
- sky/utils/common.py +35 -8
- sky/utils/common_utils.py +310 -87
- sky/utils/config_utils.py +87 -5
- sky/utils/context.py +402 -0
- sky/utils/context_utils.py +222 -0
- sky/utils/controller_utils.py +264 -89
- sky/utils/dag_utils.py +31 -12
- sky/utils/db/__init__.py +0 -0
- sky/utils/db/db_utils.py +470 -0
- sky/utils/db/migration_utils.py +133 -0
- sky/utils/directory_utils.py +12 -0
- sky/utils/env_options.py +13 -0
- sky/utils/git.py +567 -0
- sky/utils/git_clone.sh +460 -0
- sky/utils/infra_utils.py +195 -0
- sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
- sky/utils/kubernetes/config_map_utils.py +133 -0
- sky/utils/kubernetes/create_cluster.sh +13 -27
- sky/utils/kubernetes/delete_cluster.sh +10 -7
- sky/utils/kubernetes/deploy_remote_cluster.py +1299 -0
- sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
- sky/utils/kubernetes/generate_kind_config.py +6 -66
- sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
- sky/utils/kubernetes/gpu_labeler.py +5 -5
- sky/utils/kubernetes/kubernetes_deploy_utils.py +354 -47
- sky/utils/kubernetes/ssh-tunnel.sh +379 -0
- sky/utils/kubernetes/ssh_utils.py +221 -0
- sky/utils/kubernetes_enums.py +8 -15
- sky/utils/lock_events.py +94 -0
- sky/utils/locks.py +368 -0
- sky/utils/log_utils.py +300 -6
- sky/utils/perf_utils.py +22 -0
- sky/utils/resource_checker.py +298 -0
- sky/utils/resources_utils.py +249 -32
- sky/utils/rich_utils.py +213 -37
- sky/utils/schemas.py +905 -147
- sky/utils/serialize_utils.py +16 -0
- sky/utils/status_lib.py +10 -0
- sky/utils/subprocess_utils.py +38 -15
- sky/utils/tempstore.py +70 -0
- sky/utils/timeline.py +24 -52
- sky/utils/ux_utils.py +84 -15
- sky/utils/validator.py +11 -1
- sky/utils/volume.py +86 -0
- sky/utils/yaml_utils.py +111 -0
- sky/volumes/__init__.py +13 -0
- sky/volumes/client/__init__.py +0 -0
- sky/volumes/client/sdk.py +149 -0
- sky/volumes/server/__init__.py +0 -0
- sky/volumes/server/core.py +258 -0
- sky/volumes/server/server.py +122 -0
- sky/volumes/volume.py +212 -0
- sky/workspaces/__init__.py +0 -0
- sky/workspaces/core.py +655 -0
- sky/workspaces/server.py +101 -0
- sky/workspaces/utils.py +56 -0
- skypilot_nightly-1.0.0.dev20251107.dist-info/METADATA +675 -0
- skypilot_nightly-1.0.0.dev20251107.dist-info/RECORD +594 -0
- {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/WHEEL +1 -1
- sky/benchmark/benchmark_state.py +0 -256
- sky/benchmark/benchmark_utils.py +0 -641
- sky/clouds/service_catalog/constants.py +0 -7
- sky/dashboard/out/_next/static/LksQgChY5izXjokL3LcEu/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/236-f49500b82ad5392d.js +0 -6
- sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
- sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
- sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
- sky/dashboard/out/_next/static/chunks/845-0f8017370869e269.js +0 -1
- sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
- sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
- sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
- sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
- sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-e15db85d0ea1fbe1.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-03f279c6741fb48b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
- sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
- sky/jobs/dashboard/dashboard.py +0 -223
- sky/jobs/dashboard/static/favicon.ico +0 -0
- sky/jobs/dashboard/templates/index.html +0 -831
- sky/jobs/server/dashboard_utils.py +0 -69
- sky/skylet/providers/scp/__init__.py +0 -2
- sky/skylet/providers/scp/config.py +0 -149
- sky/skylet/providers/scp/node_provider.py +0 -578
- sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
- sky/utils/db_utils.py +0 -100
- sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
- skypilot_nightly-1.0.0.dev20250509.dist-info/METADATA +0 -361
- skypilot_nightly-1.0.0.dev20250509.dist-info/RECORD +0 -396
- /sky/{clouds/service_catalog → catalog}/config.py +0 -0
- /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
- /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
- /sky/dashboard/out/_next/static/{LksQgChY5izXjokL3LcEu → zB0ed6ge_W1MDszVHhijS}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,130 @@
|
|
|
1
|
+
"""Async SDK for SkyServe."""
|
|
2
|
+
import typing
|
|
3
|
+
from typing import Any, Dict, List, Optional, Tuple, Union
|
|
4
|
+
|
|
5
|
+
from sky.client import sdk_async
|
|
6
|
+
from sky.serve.client import sdk
|
|
7
|
+
from sky.usage import usage_lib
|
|
8
|
+
from sky.utils import context_utils
|
|
9
|
+
|
|
10
|
+
if typing.TYPE_CHECKING:
|
|
11
|
+
import io
|
|
12
|
+
|
|
13
|
+
import sky
|
|
14
|
+
from sky.serve import serve_utils
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
@usage_lib.entrypoint
|
|
18
|
+
async def up(
|
|
19
|
+
task: Union['sky.Task', 'sky.Dag'],
|
|
20
|
+
service_name: str,
|
|
21
|
+
# Internal only:
|
|
22
|
+
# pylint: disable=invalid-name
|
|
23
|
+
_need_confirmation: bool = False,
|
|
24
|
+
stream_logs: Optional[
|
|
25
|
+
sdk_async.StreamConfig] = sdk_async.DEFAULT_STREAM_CONFIG
|
|
26
|
+
) -> Tuple[str, str]:
|
|
27
|
+
"""Async version of up() that spins up a service."""
|
|
28
|
+
request_id = await context_utils.to_thread(sdk.up, task, service_name,
|
|
29
|
+
_need_confirmation)
|
|
30
|
+
if stream_logs is not None:
|
|
31
|
+
return await sdk_async._stream_and_get(request_id, stream_logs) # pylint: disable=protected-access
|
|
32
|
+
else:
|
|
33
|
+
return await sdk_async.get(request_id)
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
@usage_lib.entrypoint
|
|
37
|
+
async def update(
|
|
38
|
+
task: Union['sky.Task', 'sky.Dag'],
|
|
39
|
+
service_name: str,
|
|
40
|
+
mode: 'serve_utils.UpdateMode',
|
|
41
|
+
# Internal only:
|
|
42
|
+
# pylint: disable=invalid-name
|
|
43
|
+
_need_confirmation: bool = False,
|
|
44
|
+
stream_logs: Optional[
|
|
45
|
+
sdk_async.StreamConfig] = sdk_async.DEFAULT_STREAM_CONFIG
|
|
46
|
+
) -> None:
|
|
47
|
+
"""Async version of update() that updates an existing service."""
|
|
48
|
+
request_id = await context_utils.to_thread(sdk.update, task, service_name,
|
|
49
|
+
mode, _need_confirmation)
|
|
50
|
+
if stream_logs is not None:
|
|
51
|
+
return await sdk_async._stream_and_get(request_id, stream_logs) # pylint: disable=protected-access
|
|
52
|
+
else:
|
|
53
|
+
return await sdk_async.get(request_id)
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
@usage_lib.entrypoint
|
|
57
|
+
async def down(
|
|
58
|
+
service_names: Optional[Union[str, List[str]]],
|
|
59
|
+
all: bool = False, # pylint: disable=redefined-builtin
|
|
60
|
+
purge: bool = False,
|
|
61
|
+
stream_logs: Optional[
|
|
62
|
+
sdk_async.StreamConfig] = sdk_async.DEFAULT_STREAM_CONFIG
|
|
63
|
+
) -> None:
|
|
64
|
+
"""Async version of down() that tears down a service."""
|
|
65
|
+
request_id = await context_utils.to_thread(sdk.down, service_names, all,
|
|
66
|
+
purge)
|
|
67
|
+
if stream_logs is not None:
|
|
68
|
+
return await sdk_async._stream_and_get(request_id, stream_logs) # pylint: disable=protected-access
|
|
69
|
+
else:
|
|
70
|
+
return await sdk_async.get(request_id)
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
@usage_lib.entrypoint
|
|
74
|
+
async def terminate_replica(
|
|
75
|
+
service_name: str,
|
|
76
|
+
replica_id: int,
|
|
77
|
+
purge: bool,
|
|
78
|
+
stream_logs: Optional[
|
|
79
|
+
sdk_async.StreamConfig] = sdk_async.DEFAULT_STREAM_CONFIG
|
|
80
|
+
) -> None:
|
|
81
|
+
"""Async version of terminate_replica() that tears down a specific
|
|
82
|
+
replica."""
|
|
83
|
+
request_id = await context_utils.to_thread(sdk.terminate_replica,
|
|
84
|
+
service_name, replica_id, purge)
|
|
85
|
+
if stream_logs is not None:
|
|
86
|
+
return await sdk_async._stream_and_get(request_id, stream_logs) # pylint: disable=protected-access
|
|
87
|
+
else:
|
|
88
|
+
return await sdk_async.get(request_id)
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
@usage_lib.entrypoint
|
|
92
|
+
async def status(
|
|
93
|
+
service_names: Optional[Union[str, List[str]]],
|
|
94
|
+
stream_logs: Optional[
|
|
95
|
+
sdk_async.StreamConfig] = sdk_async.DEFAULT_STREAM_CONFIG
|
|
96
|
+
) -> List[Dict[str, Any]]:
|
|
97
|
+
"""Async version of status() that sdk_async.gets service statuses."""
|
|
98
|
+
request_id = await context_utils.to_thread(sdk.status, service_names)
|
|
99
|
+
if stream_logs is not None:
|
|
100
|
+
return await sdk_async._stream_and_get(request_id, stream_logs) # pylint: disable=protected-access
|
|
101
|
+
else:
|
|
102
|
+
return await sdk_async.get(request_id)
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
@usage_lib.entrypoint
|
|
106
|
+
async def tail_logs(service_name: str,
|
|
107
|
+
target: Union[str, 'serve_utils.ServiceComponent'],
|
|
108
|
+
replica_id: Optional[int] = None,
|
|
109
|
+
follow: bool = True,
|
|
110
|
+
output_stream: Optional['io.TextIOBase'] = None) -> None:
|
|
111
|
+
"""Async version of tail_logs() that tails logs for a service."""
|
|
112
|
+
return await context_utils.to_thread(sdk.tail_logs, service_name, target,
|
|
113
|
+
replica_id, follow, output_stream)
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
@usage_lib.entrypoint
|
|
117
|
+
async def sync_down_logs(service_name: str,
|
|
118
|
+
local_dir: str,
|
|
119
|
+
*,
|
|
120
|
+
targets: Optional[Union[
|
|
121
|
+
str, 'serve_utils.ServiceComponent', List[Union[
|
|
122
|
+
str, 'serve_utils.ServiceComponent']]]] = None,
|
|
123
|
+
replica_ids: Optional[List[int]] = None) -> None:
|
|
124
|
+
"""Async version of sync_down_logs() that syncs down logs from service
|
|
125
|
+
components."""
|
|
126
|
+
return await context_utils.to_thread(sdk.sync_down_logs,
|
|
127
|
+
service_name,
|
|
128
|
+
local_dir,
|
|
129
|
+
targets=targets,
|
|
130
|
+
replica_ids=replica_ids)
|
sky/serve/constants.py
CHANGED
|
@@ -73,13 +73,6 @@ CONTROLLER_AUTOSTOP = {
|
|
|
73
73
|
'down': False,
|
|
74
74
|
}
|
|
75
75
|
|
|
76
|
-
# Due to the CPU/memory usage of the controller process launched with a job on
|
|
77
|
-
# controller VM (use ray job under the hood), we need to reserve some CPU/memory
|
|
78
|
-
# for each serve controller process.
|
|
79
|
-
# Serve: A default controller with 4 vCPU and 16 GB memory can run up to 16
|
|
80
|
-
# services.
|
|
81
|
-
CONTROLLER_MEMORY_USAGE_GB = 1.0
|
|
82
|
-
|
|
83
76
|
# A period of time to initialize your service. Any readiness probe failures
|
|
84
77
|
# during this period will be ignored.
|
|
85
78
|
DEFAULT_INITIAL_DELAY_SECONDS = 1200
|
|
@@ -104,8 +97,17 @@ REPLICA_ID_ENV_VAR = 'SKYPILOT_SERVE_REPLICA_ID'
|
|
|
104
97
|
# Changelog:
|
|
105
98
|
# v1.0 - Introduce rolling update.
|
|
106
99
|
# v2.0 - Added template-replica feature.
|
|
107
|
-
|
|
100
|
+
# v3.0 - Added cluster pool.
|
|
101
|
+
# v4.0 - Added pool argument to wait_service_registration.
|
|
102
|
+
# v5.0 - Added pool argument to stream_serve_process_logs & stream_replica_logs.
|
|
103
|
+
SERVE_VERSION = 5
|
|
108
104
|
|
|
109
105
|
TERMINATE_REPLICA_VERSION_MISMATCH_ERROR = (
|
|
110
106
|
'The version of service is outdated and does not support manually '
|
|
111
107
|
'terminating replicas. Please terminate the service and spin up again.')
|
|
108
|
+
|
|
109
|
+
# Dummy run command for cluster pool.
|
|
110
|
+
POOL_DUMMY_RUN_COMMAND = 'echo "setup done"'
|
|
111
|
+
|
|
112
|
+
# Error message for max number of services reached.
|
|
113
|
+
MAX_NUMBER_OF_SERVICES_REACHED_ERROR = 'Max number of services reached.'
|
sky/serve/controller.py
CHANGED
|
@@ -4,6 +4,7 @@ Responsible for autoscaling and replica management.
|
|
|
4
4
|
"""
|
|
5
5
|
import contextlib
|
|
6
6
|
import logging
|
|
7
|
+
import os
|
|
7
8
|
import threading
|
|
8
9
|
import time
|
|
9
10
|
import traceback
|
|
@@ -26,11 +27,12 @@ from sky.utils import ux_utils
|
|
|
26
27
|
logger = sky_logging.init_logger(__name__)
|
|
27
28
|
|
|
28
29
|
|
|
29
|
-
class
|
|
30
|
+
class AutoscalerInfoFilter(logging.Filter):
|
|
30
31
|
|
|
31
32
|
def filter(self, record: logging.LogRecord) -> bool:
|
|
32
33
|
message = record.getMessage()
|
|
33
|
-
return not ('GET' in message and '200' in message
|
|
34
|
+
return not ('GET' in message and '200' in message and
|
|
35
|
+
'/autoscaler/info' in message)
|
|
34
36
|
|
|
35
37
|
|
|
36
38
|
class SkyServeController:
|
|
@@ -42,12 +44,13 @@ class SkyServeController:
|
|
|
42
44
|
"""
|
|
43
45
|
|
|
44
46
|
def __init__(self, service_name: str, service_spec: serve.SkyServiceSpec,
|
|
45
|
-
|
|
47
|
+
service_task_yaml: str, host: str, port: int) -> None:
|
|
46
48
|
self._service_name = service_name
|
|
47
49
|
self._replica_manager: replica_managers.ReplicaManager = (
|
|
48
|
-
replica_managers.SkyPilotReplicaManager(
|
|
49
|
-
|
|
50
|
-
|
|
50
|
+
replica_managers.SkyPilotReplicaManager(
|
|
51
|
+
service_name=service_name,
|
|
52
|
+
spec=service_spec,
|
|
53
|
+
service_task_yaml_path=service_task_yaml))
|
|
51
54
|
self._autoscaler: autoscalers.Autoscaler = (
|
|
52
55
|
autoscalers.Autoscaler.from_spec(service_name, service_spec))
|
|
53
56
|
self._host = host
|
|
@@ -59,6 +62,7 @@ class SkyServeController:
|
|
|
59
62
|
uvicorn_access_logger = logging.getLogger('uvicorn.access')
|
|
60
63
|
for handler in uvicorn_access_logger.handlers:
|
|
61
64
|
handler.setFormatter(sky_logging.FORMATTER)
|
|
65
|
+
handler.addFilter(AutoscalerInfoFilter())
|
|
62
66
|
yield
|
|
63
67
|
|
|
64
68
|
def _run_autoscaler(self):
|
|
@@ -74,7 +78,11 @@ class SkyServeController:
|
|
|
74
78
|
assert record is not None, ('No service record found for '
|
|
75
79
|
f'{self._service_name}')
|
|
76
80
|
active_versions = record['active_versions']
|
|
77
|
-
logger.info(f'All replica info: {replica_infos}')
|
|
81
|
+
logger.info(f'All replica info for autoscaler: {replica_infos}')
|
|
82
|
+
|
|
83
|
+
# Autoscaler now extracts GPU type info directly from
|
|
84
|
+
# replica_infos in generate_scaling_decisions method
|
|
85
|
+
# for better decoupling.
|
|
78
86
|
scaling_options = self._autoscaler.generate_scaling_decisions(
|
|
79
87
|
replica_infos, active_versions)
|
|
80
88
|
for scaling_option in scaling_options:
|
|
@@ -99,6 +107,11 @@ class SkyServeController:
|
|
|
99
107
|
|
|
100
108
|
def run(self) -> None:
|
|
101
109
|
|
|
110
|
+
@self._app.get('/autoscaler/info')
|
|
111
|
+
async def get_autoscaler_info() -> fastapi.Response:
|
|
112
|
+
return responses.JSONResponse(content=self._autoscaler.info(),
|
|
113
|
+
status_code=200)
|
|
114
|
+
|
|
102
115
|
@self._app.post('/controller/load_balancer_sync')
|
|
103
116
|
async def load_balancer_sync(
|
|
104
117
|
request: fastapi.Request) -> fastapi.Response:
|
|
@@ -109,11 +122,37 @@ class SkyServeController:
|
|
|
109
122
|
timestamps: List[int] = request_aggregator.get('timestamps', [])
|
|
110
123
|
logger.info(f'Received {len(timestamps)} inflight requests.')
|
|
111
124
|
self._autoscaler.collect_request_information(request_aggregator)
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
125
|
+
|
|
126
|
+
# Get replica information for instance-aware load balancing
|
|
127
|
+
replica_infos = serve_state.get_replica_infos(self._service_name)
|
|
128
|
+
ready_replica_urls = self._replica_manager.get_active_replica_urls()
|
|
129
|
+
|
|
130
|
+
# Use URL-to-info mapping to avoid duplication
|
|
131
|
+
replica_info = {}
|
|
132
|
+
for info in replica_infos:
|
|
133
|
+
if info.url in ready_replica_urls:
|
|
134
|
+
# Get GPU type from handle.launched_resources.accelerators
|
|
135
|
+
gpu_type = 'unknown'
|
|
136
|
+
handle = info.handle()
|
|
137
|
+
if handle is not None:
|
|
138
|
+
accelerators = handle.launched_resources.accelerators
|
|
139
|
+
if accelerators and len(accelerators) > 0:
|
|
140
|
+
# Get the first accelerator type
|
|
141
|
+
gpu_type = list(accelerators.keys())[0]
|
|
142
|
+
|
|
143
|
+
replica_info[info.url] = {'gpu_type': gpu_type}
|
|
144
|
+
|
|
145
|
+
# Check that all ready replica URLs are included in replica_info
|
|
146
|
+
missing_urls = set(ready_replica_urls) - set(replica_info.keys())
|
|
147
|
+
if missing_urls:
|
|
148
|
+
logger.warning(f'Ready replica URLs missing from replica_info: '
|
|
149
|
+
f'{missing_urls}')
|
|
150
|
+
# fallback: add missing URLs with unknown GPU type
|
|
151
|
+
for url in missing_urls:
|
|
152
|
+
replica_info[url] = {'gpu_type': 'unknown'}
|
|
153
|
+
|
|
154
|
+
return responses.JSONResponse(
|
|
155
|
+
content={'replica_info': replica_info}, status_code=200)
|
|
117
156
|
|
|
118
157
|
@self._app.post('/controller/update_service')
|
|
119
158
|
async def update_service(request: fastapi.Request) -> fastapi.Response:
|
|
@@ -155,9 +194,13 @@ class SkyServeController:
|
|
|
155
194
|
return responses.JSONResponse(content={'message': 'Success'},
|
|
156
195
|
status_code=200)
|
|
157
196
|
except Exception as e: # pylint: disable=broad-except
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
return responses.JSONResponse(content={
|
|
197
|
+
exception_str = common_utils.format_exception(e)
|
|
198
|
+
logger.error(f'Error in update_service: {exception_str}')
|
|
199
|
+
return responses.JSONResponse(content={
|
|
200
|
+
'message': 'Error',
|
|
201
|
+
'exception': exception_str,
|
|
202
|
+
'traceback': traceback.format_exc()
|
|
203
|
+
},
|
|
161
204
|
status_code=500)
|
|
162
205
|
|
|
163
206
|
@self._app.post('/controller/terminate_replica')
|
|
@@ -232,7 +275,7 @@ class SkyServeController:
|
|
|
232
275
|
threading.Thread(target=self._run_autoscaler).start()
|
|
233
276
|
|
|
234
277
|
logger.info('SkyServe Controller started on '
|
|
235
|
-
f'http://{self._host}:{self._port}')
|
|
278
|
+
f'http://{self._host}:{self._port}. PID: {os.getpid()}')
|
|
236
279
|
|
|
237
280
|
uvicorn.run(self._app, host=self._host, port=self._port)
|
|
238
281
|
|
|
@@ -240,7 +283,9 @@ class SkyServeController:
|
|
|
240
283
|
# TODO(tian): Probably we should support service that will stop the VM in
|
|
241
284
|
# specific time period.
|
|
242
285
|
def run_controller(service_name: str, service_spec: serve.SkyServiceSpec,
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
|
|
286
|
+
service_task_yaml: str, controller_host: str,
|
|
287
|
+
controller_port: int):
|
|
288
|
+
controller = SkyServeController(service_name, service_spec,
|
|
289
|
+
service_task_yaml, controller_host,
|
|
290
|
+
controller_port)
|
|
246
291
|
controller.run()
|
sky/serve/load_balancer.py
CHANGED
|
@@ -1,8 +1,10 @@
|
|
|
1
1
|
"""LoadBalancer: Distribute any incoming request to all ready replicas."""
|
|
2
2
|
import asyncio
|
|
3
3
|
import logging
|
|
4
|
+
import os
|
|
4
5
|
import threading
|
|
5
|
-
|
|
6
|
+
import traceback
|
|
7
|
+
from typing import Dict, List, Optional, Union
|
|
6
8
|
|
|
7
9
|
import aiohttp
|
|
8
10
|
import fastapi
|
|
@@ -28,11 +30,13 @@ class SkyServeLoadBalancer:
|
|
|
28
30
|
"""
|
|
29
31
|
|
|
30
32
|
def __init__(
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
33
|
+
self,
|
|
34
|
+
controller_url: str,
|
|
35
|
+
load_balancer_port: int,
|
|
36
|
+
load_balancing_policy_name: Optional[str] = None,
|
|
37
|
+
tls_credential: Optional[serve_utils.TLSCredential] = None,
|
|
38
|
+
target_qps_per_replica: Optional[Union[float, Dict[str, float]]] = None
|
|
39
|
+
) -> None:
|
|
36
40
|
"""Initialize the load balancer.
|
|
37
41
|
|
|
38
42
|
Args:
|
|
@@ -42,6 +46,9 @@ class SkyServeLoadBalancer:
|
|
|
42
46
|
to use. Defaults to None.
|
|
43
47
|
tls_credential: The TLS credentials for HTTPS endpoint. Defaults
|
|
44
48
|
to None.
|
|
49
|
+
target_qps_per_replica: Target QPS per replica for instance-aware
|
|
50
|
+
load balancing. Can be a float or dict mapping GPU types to QPS.
|
|
51
|
+
Defaults to None.
|
|
45
52
|
"""
|
|
46
53
|
self._app = fastapi.FastAPI()
|
|
47
54
|
self._controller_url: str = controller_url
|
|
@@ -49,6 +56,15 @@ class SkyServeLoadBalancer:
|
|
|
49
56
|
# Use the registry to create the load balancing policy
|
|
50
57
|
self._load_balancing_policy = lb_policies.LoadBalancingPolicy.make(
|
|
51
58
|
load_balancing_policy_name)
|
|
59
|
+
|
|
60
|
+
# Set accelerator QPS for instance-aware policies
|
|
61
|
+
if (target_qps_per_replica and
|
|
62
|
+
isinstance(target_qps_per_replica, dict) and
|
|
63
|
+
isinstance(self._load_balancing_policy,
|
|
64
|
+
lb_policies.InstanceAwareLeastLoadPolicy)):
|
|
65
|
+
self._load_balancing_policy.set_target_qps_per_accelerator(
|
|
66
|
+
target_qps_per_replica)
|
|
67
|
+
|
|
52
68
|
logger.info('Starting load balancer with policy '
|
|
53
69
|
f'{load_balancing_policy_name}.')
|
|
54
70
|
self._request_aggregator: serve_utils.RequestsAggregator = (
|
|
@@ -69,6 +85,56 @@ class SkyServeLoadBalancer:
|
|
|
69
85
|
# updating it from _sync_with_controller.
|
|
70
86
|
self._client_pool_lock: threading.Lock = threading.Lock()
|
|
71
87
|
|
|
88
|
+
async def _sync_with_controller_once(self) -> List[asyncio.Task]:
|
|
89
|
+
close_client_tasks = []
|
|
90
|
+
ready_replica_urls = []
|
|
91
|
+
replica_info = {}
|
|
92
|
+
|
|
93
|
+
async with aiohttp.ClientSession() as session:
|
|
94
|
+
try:
|
|
95
|
+
# Send request information
|
|
96
|
+
async with session.post(
|
|
97
|
+
self._controller_url + '/controller/load_balancer_sync',
|
|
98
|
+
json={
|
|
99
|
+
'request_aggregator':
|
|
100
|
+
self._request_aggregator.to_dict()
|
|
101
|
+
},
|
|
102
|
+
timeout=aiohttp.ClientTimeout(5),
|
|
103
|
+
) as response:
|
|
104
|
+
# Clean up after reporting request info to avoid OOM.
|
|
105
|
+
self._request_aggregator.clear()
|
|
106
|
+
response.raise_for_status()
|
|
107
|
+
response_json = await response.json()
|
|
108
|
+
replica_info = response_json.get('replica_info', {})
|
|
109
|
+
ready_replica_urls = list(replica_info.keys())
|
|
110
|
+
except (aiohttp.ClientError, asyncio.TimeoutError) as e:
|
|
111
|
+
logger.error(f'An error occurred when syncing with '
|
|
112
|
+
f'the controller: {e}'
|
|
113
|
+
f'\nTraceback: {traceback.format_exc()}')
|
|
114
|
+
else:
|
|
115
|
+
logger.info(f'Available Replica URLs: {ready_replica_urls}')
|
|
116
|
+
with self._client_pool_lock:
|
|
117
|
+
self._load_balancing_policy.set_ready_replicas(
|
|
118
|
+
ready_replica_urls)
|
|
119
|
+
# Set replica info for instance-aware policies
|
|
120
|
+
if isinstance(self._load_balancing_policy,
|
|
121
|
+
lb_policies.InstanceAwareLeastLoadPolicy):
|
|
122
|
+
self._load_balancing_policy.set_replica_info(
|
|
123
|
+
replica_info)
|
|
124
|
+
for replica_url in ready_replica_urls:
|
|
125
|
+
if replica_url not in self._client_pool:
|
|
126
|
+
self._client_pool[replica_url] = httpx.AsyncClient(
|
|
127
|
+
base_url=replica_url)
|
|
128
|
+
urls_to_close = set(
|
|
129
|
+
self._client_pool.keys()) - set(ready_replica_urls)
|
|
130
|
+
client_to_close = []
|
|
131
|
+
for replica_url in urls_to_close:
|
|
132
|
+
client_to_close.append(
|
|
133
|
+
self._client_pool.pop(replica_url))
|
|
134
|
+
for client in client_to_close:
|
|
135
|
+
close_client_tasks.append(client.aclose())
|
|
136
|
+
return close_client_tasks
|
|
137
|
+
|
|
72
138
|
async def _sync_with_controller(self):
|
|
73
139
|
"""Sync with controller periodically.
|
|
74
140
|
|
|
@@ -82,49 +148,16 @@ class SkyServeLoadBalancer:
|
|
|
82
148
|
await asyncio.sleep(5)
|
|
83
149
|
|
|
84
150
|
while True:
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
},
|
|
96
|
-
timeout=aiohttp.ClientTimeout(5),
|
|
97
|
-
) as response:
|
|
98
|
-
# Clean up after reporting request info to avoid OOM.
|
|
99
|
-
self._request_aggregator.clear()
|
|
100
|
-
response.raise_for_status()
|
|
101
|
-
response_json = await response.json()
|
|
102
|
-
ready_replica_urls = response_json.get(
|
|
103
|
-
'ready_replica_urls', [])
|
|
104
|
-
except aiohttp.ClientError as e:
|
|
105
|
-
logger.error('An error occurred when syncing with '
|
|
106
|
-
f'the controller: {e}')
|
|
107
|
-
else:
|
|
108
|
-
logger.info(f'Available Replica URLs: {ready_replica_urls}')
|
|
109
|
-
with self._client_pool_lock:
|
|
110
|
-
self._load_balancing_policy.set_ready_replicas(
|
|
111
|
-
ready_replica_urls)
|
|
112
|
-
for replica_url in ready_replica_urls:
|
|
113
|
-
if replica_url not in self._client_pool:
|
|
114
|
-
self._client_pool[replica_url] = (
|
|
115
|
-
httpx.AsyncClient(base_url=replica_url))
|
|
116
|
-
urls_to_close = set(
|
|
117
|
-
self._client_pool.keys()) - set(ready_replica_urls)
|
|
118
|
-
client_to_close = []
|
|
119
|
-
for replica_url in urls_to_close:
|
|
120
|
-
client_to_close.append(
|
|
121
|
-
self._client_pool.pop(replica_url))
|
|
122
|
-
for client in client_to_close:
|
|
123
|
-
close_client_tasks.append(client.aclose())
|
|
124
|
-
|
|
125
|
-
await asyncio.sleep(constants.LB_CONTROLLER_SYNC_INTERVAL_SECONDS)
|
|
126
|
-
# Await those tasks after the interval to avoid blocking.
|
|
127
|
-
await asyncio.gather(*close_client_tasks)
|
|
151
|
+
try:
|
|
152
|
+
close_client_tasks = await self._sync_with_controller_once()
|
|
153
|
+
await asyncio.sleep(
|
|
154
|
+
constants.LB_CONTROLLER_SYNC_INTERVAL_SECONDS)
|
|
155
|
+
# Await those tasks after the interval to avoid blocking.
|
|
156
|
+
await asyncio.gather(*close_client_tasks)
|
|
157
|
+
except Exception as e: # pylint: disable=broad-except
|
|
158
|
+
logger.error(f'An error occurred when syncing with '
|
|
159
|
+
f'the controller: {e}'
|
|
160
|
+
f'\nTraceback: {traceback.format_exc()}')
|
|
128
161
|
|
|
129
162
|
async def _proxy_request_to(
|
|
130
163
|
self, url: str, request: fastapi.Request
|
|
@@ -168,7 +201,8 @@ class SkyServeLoadBalancer:
|
|
|
168
201
|
background=background.BackgroundTask(background_func))
|
|
169
202
|
except (httpx.RequestError, httpx.HTTPStatusError) as e:
|
|
170
203
|
logger.error(f'Error when proxy request to {url}: '
|
|
171
|
-
f'{common_utils.format_exception(e)}'
|
|
204
|
+
f'{common_utils.format_exception(e)}'
|
|
205
|
+
f'\nTraceback: {traceback.format_exc()}')
|
|
172
206
|
return e
|
|
173
207
|
|
|
174
208
|
async def _proxy_with_retries(
|
|
@@ -243,7 +277,8 @@ class SkyServeLoadBalancer:
|
|
|
243
277
|
protocol = 'https' if self._tls_credential is not None else 'http'
|
|
244
278
|
|
|
245
279
|
logger.info('SkyServe Load Balancer started on '
|
|
246
|
-
f'{protocol}://0.0.0.0:{self._load_balancer_port}'
|
|
280
|
+
f'{protocol}://0.0.0.0:{self._load_balancer_port}. '
|
|
281
|
+
f'PID: {os.getpid()}')
|
|
247
282
|
|
|
248
283
|
uvicorn.run(self._app,
|
|
249
284
|
host='0.0.0.0',
|
|
@@ -252,23 +287,31 @@ class SkyServeLoadBalancer:
|
|
|
252
287
|
|
|
253
288
|
|
|
254
289
|
def run_load_balancer(
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
290
|
+
controller_addr: str,
|
|
291
|
+
load_balancer_port: int,
|
|
292
|
+
load_balancing_policy_name: Optional[str] = None,
|
|
293
|
+
tls_credential: Optional[serve_utils.TLSCredential] = None,
|
|
294
|
+
target_qps_per_replica: Optional[Union[float, Dict[str, float]]] = None
|
|
295
|
+
) -> None:
|
|
259
296
|
"""Run the load balancer.
|
|
260
297
|
|
|
261
298
|
Args:
|
|
262
299
|
controller_addr: The address of the controller.
|
|
263
300
|
load_balancer_port: The port where the load balancer listens to.
|
|
264
|
-
policy_name: The name of the load balancing policy to use.
|
|
265
|
-
|
|
301
|
+
load_balancing_policy_name: The name of the load balancing policy to use.
|
|
302
|
+
Defaults to None.
|
|
303
|
+
tls_credential:
|
|
304
|
+
The TLS credentials for HTTPS endpoint. Defaults to None.
|
|
305
|
+
target_qps_per_replica: Target QPS per replica for instance-aware
|
|
306
|
+
load balancing. Can be a float or dict mapping GPU types to QPS.
|
|
307
|
+
Defaults to None.
|
|
266
308
|
"""
|
|
267
309
|
load_balancer = SkyServeLoadBalancer(
|
|
268
310
|
controller_url=controller_addr,
|
|
269
311
|
load_balancer_port=load_balancer_port,
|
|
270
312
|
load_balancing_policy_name=load_balancing_policy_name,
|
|
271
|
-
tls_credential=tls_credential
|
|
313
|
+
tls_credential=tls_credential,
|
|
314
|
+
target_qps_per_replica=target_qps_per_replica)
|
|
272
315
|
load_balancer.run()
|
|
273
316
|
|
|
274
317
|
|
|
@@ -292,5 +335,8 @@ if __name__ == '__main__':
|
|
|
292
335
|
help=f'The load balancing policy to use. Available policies: '
|
|
293
336
|
f'{", ".join(available_policies)}.')
|
|
294
337
|
args = parser.parse_args()
|
|
295
|
-
run_load_balancer(args.controller_addr,
|
|
296
|
-
args.
|
|
338
|
+
run_load_balancer(args.controller_addr,
|
|
339
|
+
args.load_balancer_port,
|
|
340
|
+
args.load_balancing_policy,
|
|
341
|
+
tls_credential=None,
|
|
342
|
+
target_qps_per_replica=None)
|