skypilot-nightly 1.0.0.dev20250509__py3-none-any.whl → 1.0.0.dev20251107__py3-none-any.whl
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of skypilot-nightly might be problematic. Click here for more details.
- sky/__init__.py +22 -6
- sky/adaptors/aws.py +25 -7
- sky/adaptors/common.py +24 -1
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/do.py +8 -2
- sky/adaptors/hyperbolic.py +8 -0
- sky/adaptors/kubernetes.py +149 -18
- sky/adaptors/nebius.py +170 -17
- sky/adaptors/primeintellect.py +1 -0
- sky/adaptors/runpod.py +68 -0
- sky/adaptors/seeweb.py +167 -0
- sky/adaptors/shadeform.py +89 -0
- sky/admin_policy.py +187 -4
- sky/authentication.py +179 -225
- sky/backends/__init__.py +4 -2
- sky/backends/backend.py +22 -9
- sky/backends/backend_utils.py +1299 -380
- sky/backends/cloud_vm_ray_backend.py +1715 -518
- sky/backends/docker_utils.py +1 -1
- sky/backends/local_docker_backend.py +11 -6
- sky/backends/wheel_utils.py +37 -9
- sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
- sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
- sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
- sky/{clouds/service_catalog → catalog}/common.py +89 -48
- sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
- sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +30 -40
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +42 -15
- sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
- sky/catalog/data_fetchers/fetch_nebius.py +335 -0
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
- sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
- sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
- sky/catalog/hyperbolic_catalog.py +136 -0
- sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
- sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
- sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
- sky/catalog/primeintellect_catalog.py +95 -0
- sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
- sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
- sky/catalog/seeweb_catalog.py +184 -0
- sky/catalog/shadeform_catalog.py +165 -0
- sky/catalog/ssh_catalog.py +167 -0
- sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
- sky/check.py +491 -203
- sky/cli.py +5 -6005
- sky/client/{cli.py → cli/command.py} +2477 -1885
- sky/client/cli/deprecation_utils.py +99 -0
- sky/client/cli/flags.py +359 -0
- sky/client/cli/table_utils.py +320 -0
- sky/client/common.py +70 -32
- sky/client/oauth.py +82 -0
- sky/client/sdk.py +1203 -297
- sky/client/sdk_async.py +833 -0
- sky/client/service_account_auth.py +47 -0
- sky/cloud_stores.py +73 -0
- sky/clouds/__init__.py +13 -0
- sky/clouds/aws.py +358 -93
- sky/clouds/azure.py +105 -83
- sky/clouds/cloud.py +127 -36
- sky/clouds/cudo.py +68 -50
- sky/clouds/do.py +66 -48
- sky/clouds/fluidstack.py +63 -44
- sky/clouds/gcp.py +339 -110
- sky/clouds/hyperbolic.py +293 -0
- sky/clouds/ibm.py +70 -49
- sky/clouds/kubernetes.py +563 -162
- sky/clouds/lambda_cloud.py +74 -54
- sky/clouds/nebius.py +206 -80
- sky/clouds/oci.py +88 -66
- sky/clouds/paperspace.py +61 -44
- sky/clouds/primeintellect.py +317 -0
- sky/clouds/runpod.py +164 -74
- sky/clouds/scp.py +89 -83
- sky/clouds/seeweb.py +466 -0
- sky/clouds/shadeform.py +400 -0
- sky/clouds/ssh.py +263 -0
- sky/clouds/utils/aws_utils.py +10 -4
- sky/clouds/utils/gcp_utils.py +87 -11
- sky/clouds/utils/oci_utils.py +38 -14
- sky/clouds/utils/scp_utils.py +177 -124
- sky/clouds/vast.py +99 -77
- sky/clouds/vsphere.py +51 -40
- sky/core.py +349 -139
- sky/dag.py +15 -0
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
- sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
- sky/dashboard/out/_next/static/chunks/1871-74503c8e80fd253b.js +6 -0
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
- sky/dashboard/out/_next/static/chunks/2755.fff53c4a3fcae910.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.72362fa129305b19.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.ad6adaa2a0fa9768.js +1 -0
- sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
- sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
- sky/dashboard/out/_next/static/chunks/4725.a830b5c9e7867c92.js +1 -0
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
- sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
- sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
- sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
- sky/dashboard/out/_next/static/chunks/6601-06114c982db410b6.js +1 -0
- sky/dashboard/out/_next/static/chunks/6856-ef8ba11f96d8c4a3.js +1 -0
- sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-32b6e2d3822301fa.js +1 -0
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
- sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
- sky/dashboard/out/_next/static/chunks/7615-3301e838e5f25772.js +1 -0
- sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
- sky/dashboard/out/_next/static/chunks/8969-1e4613c651bf4051.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.7310982cf5a0dc79.js +31 -0
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
- sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
- sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
- sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
- sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-c736ead69c2d86ec.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a37d2063af475a1c.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-d44859594e6f8064.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-5796e8d6aea291a0.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-6edeb7d06032adfc.js +21 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-479dde13399cf270.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-5ab3b907622cf0fe.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-c5a3eeee1c218af1.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces-22b23febb3e89ce1.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-2679be77fc08a2f8.js +1 -0
- sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
- sky/dashboard/out/_next/static/zB0ed6ge_W1MDszVHhijS/_buildManifest.js +1 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -0
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -0
- sky/dashboard/out/infra.html +1 -0
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -0
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -0
- sky/dashboard/out/volumes.html +1 -0
- sky/dashboard/out/workspace/new.html +1 -0
- sky/dashboard/out/workspaces/[name].html +1 -0
- sky/dashboard/out/workspaces.html +1 -0
- sky/data/data_utils.py +137 -1
- sky/data/mounting_utils.py +269 -84
- sky/data/storage.py +1451 -1807
- sky/data/storage_utils.py +43 -57
- sky/exceptions.py +132 -2
- sky/execution.py +206 -63
- sky/global_user_state.py +2374 -586
- sky/jobs/__init__.py +5 -0
- sky/jobs/client/sdk.py +242 -65
- sky/jobs/client/sdk_async.py +143 -0
- sky/jobs/constants.py +9 -8
- sky/jobs/controller.py +839 -277
- sky/jobs/file_content_utils.py +80 -0
- sky/jobs/log_gc.py +201 -0
- sky/jobs/recovery_strategy.py +398 -152
- sky/jobs/scheduler.py +315 -189
- sky/jobs/server/core.py +829 -255
- sky/jobs/server/server.py +156 -115
- sky/jobs/server/utils.py +136 -0
- sky/jobs/state.py +2092 -701
- sky/jobs/utils.py +1242 -160
- sky/logs/__init__.py +21 -0
- sky/logs/agent.py +108 -0
- sky/logs/aws.py +243 -0
- sky/logs/gcp.py +91 -0
- sky/metrics/__init__.py +0 -0
- sky/metrics/utils.py +443 -0
- sky/models.py +78 -1
- sky/optimizer.py +164 -70
- sky/provision/__init__.py +90 -4
- sky/provision/aws/config.py +147 -26
- sky/provision/aws/instance.py +135 -50
- sky/provision/azure/instance.py +10 -5
- sky/provision/common.py +13 -1
- sky/provision/cudo/cudo_machine_type.py +1 -1
- sky/provision/cudo/cudo_utils.py +14 -8
- sky/provision/cudo/cudo_wrapper.py +72 -71
- sky/provision/cudo/instance.py +10 -6
- sky/provision/do/instance.py +10 -6
- sky/provision/do/utils.py +4 -3
- sky/provision/docker_utils.py +114 -23
- sky/provision/fluidstack/instance.py +13 -8
- sky/provision/gcp/__init__.py +1 -0
- sky/provision/gcp/config.py +301 -19
- sky/provision/gcp/constants.py +218 -0
- sky/provision/gcp/instance.py +36 -8
- sky/provision/gcp/instance_utils.py +18 -4
- sky/provision/gcp/volume_utils.py +247 -0
- sky/provision/hyperbolic/__init__.py +12 -0
- sky/provision/hyperbolic/config.py +10 -0
- sky/provision/hyperbolic/instance.py +437 -0
- sky/provision/hyperbolic/utils.py +373 -0
- sky/provision/instance_setup.py +93 -14
- sky/provision/kubernetes/__init__.py +5 -0
- sky/provision/kubernetes/config.py +9 -52
- sky/provision/kubernetes/constants.py +17 -0
- sky/provision/kubernetes/instance.py +789 -247
- sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
- sky/provision/kubernetes/network.py +27 -17
- sky/provision/kubernetes/network_utils.py +40 -43
- sky/provision/kubernetes/utils.py +1192 -531
- sky/provision/kubernetes/volume.py +282 -0
- sky/provision/lambda_cloud/instance.py +22 -16
- sky/provision/nebius/constants.py +50 -0
- sky/provision/nebius/instance.py +19 -6
- sky/provision/nebius/utils.py +196 -91
- sky/provision/oci/instance.py +10 -5
- sky/provision/paperspace/instance.py +10 -7
- sky/provision/paperspace/utils.py +1 -1
- sky/provision/primeintellect/__init__.py +10 -0
- sky/provision/primeintellect/config.py +11 -0
- sky/provision/primeintellect/instance.py +454 -0
- sky/provision/primeintellect/utils.py +398 -0
- sky/provision/provisioner.py +110 -36
- sky/provision/runpod/__init__.py +5 -0
- sky/provision/runpod/instance.py +27 -6
- sky/provision/runpod/utils.py +51 -18
- sky/provision/runpod/volume.py +180 -0
- sky/provision/scp/__init__.py +15 -0
- sky/provision/scp/config.py +93 -0
- sky/provision/scp/instance.py +531 -0
- sky/provision/seeweb/__init__.py +11 -0
- sky/provision/seeweb/config.py +13 -0
- sky/provision/seeweb/instance.py +807 -0
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/ssh/__init__.py +18 -0
- sky/provision/vast/instance.py +13 -8
- sky/provision/vast/utils.py +10 -7
- sky/provision/vsphere/common/vim_utils.py +1 -2
- sky/provision/vsphere/instance.py +15 -10
- sky/provision/vsphere/vsphere_utils.py +9 -19
- sky/py.typed +0 -0
- sky/resources.py +844 -118
- sky/schemas/__init__.py +0 -0
- sky/schemas/api/__init__.py +0 -0
- sky/schemas/api/responses.py +225 -0
- sky/schemas/db/README +4 -0
- sky/schemas/db/env.py +90 -0
- sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
- sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
- sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
- sky/schemas/db/global_user_state/004_is_managed.py +34 -0
- sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
- sky/schemas/db/global_user_state/006_provision_log.py +41 -0
- sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
- sky/schemas/db/script.py.mako +28 -0
- sky/schemas/db/serve_state/001_initial_schema.py +67 -0
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
- sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
- sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/generated/__init__.py +0 -0
- sky/schemas/generated/autostopv1_pb2.py +36 -0
- sky/schemas/generated/autostopv1_pb2.pyi +43 -0
- sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
- sky/schemas/generated/jobsv1_pb2.py +86 -0
- sky/schemas/generated/jobsv1_pb2.pyi +254 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +74 -0
- sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
- sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
- sky/schemas/generated/servev1_pb2.py +58 -0
- sky/schemas/generated/servev1_pb2.pyi +115 -0
- sky/schemas/generated/servev1_pb2_grpc.py +322 -0
- sky/serve/autoscalers.py +357 -5
- sky/serve/client/impl.py +310 -0
- sky/serve/client/sdk.py +47 -139
- sky/serve/client/sdk_async.py +130 -0
- sky/serve/constants.py +10 -8
- sky/serve/controller.py +64 -19
- sky/serve/load_balancer.py +106 -60
- sky/serve/load_balancing_policies.py +115 -1
- sky/serve/replica_managers.py +273 -162
- sky/serve/serve_rpc_utils.py +179 -0
- sky/serve/serve_state.py +554 -251
- sky/serve/serve_utils.py +733 -220
- sky/serve/server/core.py +66 -711
- sky/serve/server/impl.py +1093 -0
- sky/serve/server/server.py +21 -18
- sky/serve/service.py +133 -48
- sky/serve/service_spec.py +135 -16
- sky/serve/spot_placer.py +3 -0
- sky/server/auth/__init__.py +0 -0
- sky/server/auth/authn.py +50 -0
- sky/server/auth/loopback.py +38 -0
- sky/server/auth/oauth2_proxy.py +200 -0
- sky/server/common.py +475 -181
- sky/server/config.py +81 -23
- sky/server/constants.py +44 -6
- sky/server/daemons.py +229 -0
- sky/server/html/token_page.html +185 -0
- sky/server/metrics.py +160 -0
- sky/server/requests/executor.py +528 -138
- sky/server/requests/payloads.py +351 -17
- sky/server/requests/preconditions.py +21 -17
- sky/server/requests/process.py +112 -29
- sky/server/requests/request_names.py +120 -0
- sky/server/requests/requests.py +817 -224
- sky/server/requests/serializers/decoders.py +82 -31
- sky/server/requests/serializers/encoders.py +140 -22
- sky/server/requests/threads.py +106 -0
- sky/server/rest.py +417 -0
- sky/server/server.py +1290 -284
- sky/server/state.py +20 -0
- sky/server/stream_utils.py +345 -57
- sky/server/uvicorn.py +217 -3
- sky/server/versions.py +270 -0
- sky/setup_files/MANIFEST.in +5 -0
- sky/setup_files/alembic.ini +156 -0
- sky/setup_files/dependencies.py +136 -31
- sky/setup_files/setup.py +44 -42
- sky/sky_logging.py +102 -5
- sky/skylet/attempt_skylet.py +1 -0
- sky/skylet/autostop_lib.py +129 -8
- sky/skylet/configs.py +27 -20
- sky/skylet/constants.py +171 -19
- sky/skylet/events.py +105 -21
- sky/skylet/job_lib.py +335 -104
- sky/skylet/log_lib.py +297 -18
- sky/skylet/log_lib.pyi +44 -1
- sky/skylet/ray_patches/__init__.py +17 -3
- sky/skylet/ray_patches/autoscaler.py.diff +18 -0
- sky/skylet/ray_patches/cli.py.diff +19 -0
- sky/skylet/ray_patches/command_runner.py.diff +17 -0
- sky/skylet/ray_patches/log_monitor.py.diff +20 -0
- sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
- sky/skylet/ray_patches/updater.py.diff +18 -0
- sky/skylet/ray_patches/worker.py.diff +41 -0
- sky/skylet/services.py +564 -0
- sky/skylet/skylet.py +63 -4
- sky/skylet/subprocess_daemon.py +103 -29
- sky/skypilot_config.py +506 -99
- sky/ssh_node_pools/__init__.py +1 -0
- sky/ssh_node_pools/core.py +135 -0
- sky/ssh_node_pools/server.py +233 -0
- sky/task.py +621 -137
- sky/templates/aws-ray.yml.j2 +10 -3
- sky/templates/azure-ray.yml.j2 +1 -1
- sky/templates/do-ray.yml.j2 +1 -1
- sky/templates/gcp-ray.yml.j2 +57 -0
- sky/templates/hyperbolic-ray.yml.j2 +67 -0
- sky/templates/jobs-controller.yaml.j2 +27 -24
- sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
- sky/templates/kubernetes-ray.yml.j2 +607 -51
- sky/templates/lambda-ray.yml.j2 +1 -1
- sky/templates/nebius-ray.yml.j2 +33 -12
- sky/templates/paperspace-ray.yml.j2 +1 -1
- sky/templates/primeintellect-ray.yml.j2 +71 -0
- sky/templates/runpod-ray.yml.j2 +9 -1
- sky/templates/scp-ray.yml.j2 +3 -50
- sky/templates/seeweb-ray.yml.j2 +108 -0
- sky/templates/shadeform-ray.yml.j2 +72 -0
- sky/templates/sky-serve-controller.yaml.j2 +22 -2
- sky/templates/websocket_proxy.py +178 -18
- sky/usage/usage_lib.py +18 -11
- sky/users/__init__.py +0 -0
- sky/users/model.conf +15 -0
- sky/users/permission.py +387 -0
- sky/users/rbac.py +121 -0
- sky/users/server.py +720 -0
- sky/users/token_service.py +218 -0
- sky/utils/accelerator_registry.py +34 -5
- sky/utils/admin_policy_utils.py +84 -38
- sky/utils/annotations.py +16 -5
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/auth_utils.py +153 -0
- sky/utils/benchmark_utils.py +60 -0
- sky/utils/cli_utils/status_utils.py +159 -86
- sky/utils/cluster_utils.py +31 -9
- sky/utils/command_runner.py +354 -68
- sky/utils/command_runner.pyi +93 -3
- sky/utils/common.py +35 -8
- sky/utils/common_utils.py +310 -87
- sky/utils/config_utils.py +87 -5
- sky/utils/context.py +402 -0
- sky/utils/context_utils.py +222 -0
- sky/utils/controller_utils.py +264 -89
- sky/utils/dag_utils.py +31 -12
- sky/utils/db/__init__.py +0 -0
- sky/utils/db/db_utils.py +470 -0
- sky/utils/db/migration_utils.py +133 -0
- sky/utils/directory_utils.py +12 -0
- sky/utils/env_options.py +13 -0
- sky/utils/git.py +567 -0
- sky/utils/git_clone.sh +460 -0
- sky/utils/infra_utils.py +195 -0
- sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
- sky/utils/kubernetes/config_map_utils.py +133 -0
- sky/utils/kubernetes/create_cluster.sh +13 -27
- sky/utils/kubernetes/delete_cluster.sh +10 -7
- sky/utils/kubernetes/deploy_remote_cluster.py +1299 -0
- sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
- sky/utils/kubernetes/generate_kind_config.py +6 -66
- sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
- sky/utils/kubernetes/gpu_labeler.py +5 -5
- sky/utils/kubernetes/kubernetes_deploy_utils.py +354 -47
- sky/utils/kubernetes/ssh-tunnel.sh +379 -0
- sky/utils/kubernetes/ssh_utils.py +221 -0
- sky/utils/kubernetes_enums.py +8 -15
- sky/utils/lock_events.py +94 -0
- sky/utils/locks.py +368 -0
- sky/utils/log_utils.py +300 -6
- sky/utils/perf_utils.py +22 -0
- sky/utils/resource_checker.py +298 -0
- sky/utils/resources_utils.py +249 -32
- sky/utils/rich_utils.py +213 -37
- sky/utils/schemas.py +905 -147
- sky/utils/serialize_utils.py +16 -0
- sky/utils/status_lib.py +10 -0
- sky/utils/subprocess_utils.py +38 -15
- sky/utils/tempstore.py +70 -0
- sky/utils/timeline.py +24 -52
- sky/utils/ux_utils.py +84 -15
- sky/utils/validator.py +11 -1
- sky/utils/volume.py +86 -0
- sky/utils/yaml_utils.py +111 -0
- sky/volumes/__init__.py +13 -0
- sky/volumes/client/__init__.py +0 -0
- sky/volumes/client/sdk.py +149 -0
- sky/volumes/server/__init__.py +0 -0
- sky/volumes/server/core.py +258 -0
- sky/volumes/server/server.py +122 -0
- sky/volumes/volume.py +212 -0
- sky/workspaces/__init__.py +0 -0
- sky/workspaces/core.py +655 -0
- sky/workspaces/server.py +101 -0
- sky/workspaces/utils.py +56 -0
- skypilot_nightly-1.0.0.dev20251107.dist-info/METADATA +675 -0
- skypilot_nightly-1.0.0.dev20251107.dist-info/RECORD +594 -0
- {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/WHEEL +1 -1
- sky/benchmark/benchmark_state.py +0 -256
- sky/benchmark/benchmark_utils.py +0 -641
- sky/clouds/service_catalog/constants.py +0 -7
- sky/dashboard/out/_next/static/LksQgChY5izXjokL3LcEu/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/236-f49500b82ad5392d.js +0 -6
- sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
- sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
- sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
- sky/dashboard/out/_next/static/chunks/845-0f8017370869e269.js +0 -1
- sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
- sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
- sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
- sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
- sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-e15db85d0ea1fbe1.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-03f279c6741fb48b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
- sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
- sky/jobs/dashboard/dashboard.py +0 -223
- sky/jobs/dashboard/static/favicon.ico +0 -0
- sky/jobs/dashboard/templates/index.html +0 -831
- sky/jobs/server/dashboard_utils.py +0 -69
- sky/skylet/providers/scp/__init__.py +0 -2
- sky/skylet/providers/scp/config.py +0 -149
- sky/skylet/providers/scp/node_provider.py +0 -578
- sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
- sky/utils/db_utils.py +0 -100
- sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
- skypilot_nightly-1.0.0.dev20250509.dist-info/METADATA +0 -361
- skypilot_nightly-1.0.0.dev20250509.dist-info/RECORD +0 -396
- /sky/{clouds/service_catalog → catalog}/config.py +0 -0
- /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
- /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
- /sky/dashboard/out/_next/static/{LksQgChY5izXjokL3LcEu → zB0ed6ge_W1MDszVHhijS}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/top_level.txt +0 -0
sky/serve/server/server.py
CHANGED
|
@@ -10,6 +10,7 @@ from sky.server import common as server_common
|
|
|
10
10
|
from sky.server import stream_utils
|
|
11
11
|
from sky.server.requests import executor
|
|
12
12
|
from sky.server.requests import payloads
|
|
13
|
+
from sky.server.requests import request_names
|
|
13
14
|
from sky.server.requests import requests as api_requests
|
|
14
15
|
from sky.skylet import constants
|
|
15
16
|
from sky.utils import common
|
|
@@ -23,9 +24,9 @@ async def up(
|
|
|
23
24
|
request: fastapi.Request,
|
|
24
25
|
up_body: payloads.ServeUpBody,
|
|
25
26
|
) -> None:
|
|
26
|
-
executor.
|
|
27
|
+
await executor.schedule_request_async(
|
|
27
28
|
request_id=request.state.request_id,
|
|
28
|
-
request_name=
|
|
29
|
+
request_name=request_names.RequestName.SERVE_UP,
|
|
29
30
|
request_body=up_body,
|
|
30
31
|
func=core.up,
|
|
31
32
|
schedule_type=api_requests.ScheduleType.LONG,
|
|
@@ -38,9 +39,9 @@ async def update(
|
|
|
38
39
|
request: fastapi.Request,
|
|
39
40
|
update_body: payloads.ServeUpdateBody,
|
|
40
41
|
) -> None:
|
|
41
|
-
executor.
|
|
42
|
+
await executor.schedule_request_async(
|
|
42
43
|
request_id=request.state.request_id,
|
|
43
|
-
request_name=
|
|
44
|
+
request_name=request_names.RequestName.SERVE_UPDATE,
|
|
44
45
|
request_body=update_body,
|
|
45
46
|
func=core.update,
|
|
46
47
|
schedule_type=api_requests.ScheduleType.SHORT,
|
|
@@ -53,9 +54,9 @@ async def down(
|
|
|
53
54
|
request: fastapi.Request,
|
|
54
55
|
down_body: payloads.ServeDownBody,
|
|
55
56
|
) -> None:
|
|
56
|
-
executor.
|
|
57
|
+
await executor.schedule_request_async(
|
|
57
58
|
request_id=request.state.request_id,
|
|
58
|
-
request_name=
|
|
59
|
+
request_name=request_names.RequestName.SERVE_DOWN,
|
|
59
60
|
request_body=down_body,
|
|
60
61
|
func=core.down,
|
|
61
62
|
schedule_type=api_requests.ScheduleType.SHORT,
|
|
@@ -68,9 +69,9 @@ async def terminate_replica(
|
|
|
68
69
|
request: fastapi.Request,
|
|
69
70
|
terminate_replica_body: payloads.ServeTerminateReplicaBody,
|
|
70
71
|
) -> None:
|
|
71
|
-
executor.
|
|
72
|
+
await executor.schedule_request_async(
|
|
72
73
|
request_id=request.state.request_id,
|
|
73
|
-
request_name=
|
|
74
|
+
request_name=request_names.RequestName.SERVE_TERMINATE_REPLICA,
|
|
74
75
|
request_body=terminate_replica_body,
|
|
75
76
|
func=core.terminate_replica,
|
|
76
77
|
schedule_type=api_requests.ScheduleType.SHORT,
|
|
@@ -83,9 +84,9 @@ async def status(
|
|
|
83
84
|
request: fastapi.Request,
|
|
84
85
|
status_body: payloads.ServeStatusBody,
|
|
85
86
|
) -> None:
|
|
86
|
-
executor.
|
|
87
|
+
await executor.schedule_request_async(
|
|
87
88
|
request_id=request.state.request_id,
|
|
88
|
-
request_name=
|
|
89
|
+
request_name=request_names.RequestName.SERVE_STATUS,
|
|
89
90
|
request_body=status_body,
|
|
90
91
|
func=core.status,
|
|
91
92
|
schedule_type=api_requests.ScheduleType.SHORT,
|
|
@@ -98,21 +99,23 @@ async def tail_logs(
|
|
|
98
99
|
request: fastapi.Request, log_body: payloads.ServeLogsBody,
|
|
99
100
|
background_tasks: fastapi.BackgroundTasks
|
|
100
101
|
) -> fastapi.responses.StreamingResponse:
|
|
101
|
-
executor.
|
|
102
|
+
executor.check_request_thread_executor_available()
|
|
103
|
+
request_task = await executor.prepare_request_async(
|
|
102
104
|
request_id=request.state.request_id,
|
|
103
|
-
request_name=
|
|
105
|
+
request_name=request_names.RequestName.SERVE_LOGS,
|
|
104
106
|
request_body=log_body,
|
|
105
107
|
func=core.tail_logs,
|
|
106
108
|
schedule_type=api_requests.ScheduleType.SHORT,
|
|
107
109
|
request_cluster_name=common.SKY_SERVE_CONTROLLER_NAME,
|
|
108
110
|
)
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
return stream_utils.
|
|
111
|
+
task = executor.execute_request_in_coroutine(request_task)
|
|
112
|
+
# Cancel the coroutine after the request is done or client disconnects
|
|
113
|
+
background_tasks.add_task(task.cancel)
|
|
114
|
+
return stream_utils.stream_response_for_long_request(
|
|
113
115
|
request_id=request_task.request_id,
|
|
114
116
|
logs_path=request_task.log_path,
|
|
115
117
|
background_tasks=background_tasks,
|
|
118
|
+
kill_request_on_disconnect=False,
|
|
116
119
|
)
|
|
117
120
|
|
|
118
121
|
|
|
@@ -130,9 +133,9 @@ async def download_logs(
|
|
|
130
133
|
# We should reuse the original request body, so that the env vars, such as
|
|
131
134
|
# user hash, are kept the same.
|
|
132
135
|
download_logs_body.local_dir = str(logs_dir_on_api_server)
|
|
133
|
-
executor.
|
|
136
|
+
await executor.schedule_request_async(
|
|
134
137
|
request_id=request.state.request_id,
|
|
135
|
-
request_name=
|
|
138
|
+
request_name=request_names.RequestName.SERVE_SYNC_DOWN_LOGS,
|
|
136
139
|
request_body=download_logs_body,
|
|
137
140
|
func=core.sync_down_logs,
|
|
138
141
|
schedule_type=api_requests.ScheduleType.SHORT,
|
sky/serve/service.py
CHANGED
|
@@ -13,12 +13,13 @@ from typing import Dict
|
|
|
13
13
|
|
|
14
14
|
import filelock
|
|
15
15
|
|
|
16
|
-
from sky import authentication
|
|
17
16
|
from sky import exceptions
|
|
17
|
+
from sky import global_user_state
|
|
18
18
|
from sky import sky_logging
|
|
19
19
|
from sky import task as task_lib
|
|
20
20
|
from sky.backends import backend_utils
|
|
21
21
|
from sky.backends import cloud_vm_ray_backend
|
|
22
|
+
from sky.data import data_utils
|
|
22
23
|
from sky.serve import constants
|
|
23
24
|
from sky.serve import controller
|
|
24
25
|
from sky.serve import load_balancer
|
|
@@ -26,7 +27,9 @@ from sky.serve import replica_managers
|
|
|
26
27
|
from sky.serve import serve_state
|
|
27
28
|
from sky.serve import serve_utils
|
|
28
29
|
from sky.skylet import constants as skylet_constants
|
|
30
|
+
from sky.utils import auth_utils
|
|
29
31
|
from sky.utils import common_utils
|
|
32
|
+
from sky.utils import controller_utils
|
|
30
33
|
from sky.utils import subprocess_utils
|
|
31
34
|
from sky.utils import ux_utils
|
|
32
35
|
|
|
@@ -71,6 +74,8 @@ def cleanup_storage(task_yaml: str) -> bool:
|
|
|
71
74
|
Returns:
|
|
72
75
|
True if the storage is cleaned up successfully, False otherwise.
|
|
73
76
|
"""
|
|
77
|
+
failed = False
|
|
78
|
+
|
|
74
79
|
try:
|
|
75
80
|
task = task_lib.Task.from_yaml(task_yaml)
|
|
76
81
|
backend = cloud_vm_ray_backend.CloudVmRayBackend()
|
|
@@ -86,41 +91,105 @@ def cleanup_storage(task_yaml: str) -> bool:
|
|
|
86
91
|
f'{common_utils.format_exception(e)}')
|
|
87
92
|
with ux_utils.enable_traceback():
|
|
88
93
|
logger.error(f' Traceback: {traceback.format_exc()}')
|
|
89
|
-
|
|
90
|
-
|
|
94
|
+
failed = True
|
|
95
|
+
|
|
96
|
+
# Clean up any files mounted from the local disk, such as two-hop file
|
|
97
|
+
# mounts.
|
|
98
|
+
for file_mount in (task.file_mounts or {}).values():
|
|
99
|
+
try:
|
|
100
|
+
if not data_utils.is_cloud_store_url(file_mount):
|
|
101
|
+
path = os.path.expanduser(file_mount)
|
|
102
|
+
if os.path.isdir(path):
|
|
103
|
+
shutil.rmtree(path)
|
|
104
|
+
else:
|
|
105
|
+
os.remove(path)
|
|
106
|
+
except Exception as e: # pylint: disable=broad-except
|
|
107
|
+
logger.error(f'Failed to clean up file mount {file_mount}: {e}')
|
|
108
|
+
with ux_utils.enable_traceback():
|
|
109
|
+
logger.error(f' Traceback: {traceback.format_exc()}')
|
|
110
|
+
failed = True
|
|
91
111
|
|
|
112
|
+
return not failed
|
|
92
113
|
|
|
114
|
+
|
|
115
|
+
# NOTE(dev): We don't need to acquire the `with_lock` in replica manager here
|
|
116
|
+
# because we killed all the processes (controller & replica manager) before
|
|
117
|
+
# calling this function.
|
|
93
118
|
def _cleanup(service_name: str) -> bool:
|
|
94
119
|
"""Clean up all service related resources, i.e. replicas and storage."""
|
|
120
|
+
# Cleanup the HA recovery script first as it is possible that some error
|
|
121
|
+
# was raised when we construct the task object (e.g.,
|
|
122
|
+
# sky.exceptions.ResourcesUnavailableError).
|
|
123
|
+
serve_state.remove_ha_recovery_script(service_name)
|
|
95
124
|
failed = False
|
|
96
125
|
replica_infos = serve_state.get_replica_infos(service_name)
|
|
97
126
|
info2proc: Dict[replica_managers.ReplicaInfo,
|
|
98
127
|
multiprocessing.Process] = dict()
|
|
128
|
+
# NOTE(dev): This relies on `sky/serve/serve_utils.py::
|
|
129
|
+
# generate_replica_cluster_name`. Change it if you change the function.
|
|
130
|
+
existing_cluster_names = global_user_state.get_cluster_names_start_with(
|
|
131
|
+
service_name)
|
|
99
132
|
for info in replica_infos:
|
|
133
|
+
if info.cluster_name not in existing_cluster_names:
|
|
134
|
+
logger.info(f'Cluster {info.cluster_name} for replica '
|
|
135
|
+
f'{info.replica_id} not found. Might be a failed '
|
|
136
|
+
'cluster. Skipping.')
|
|
137
|
+
continue
|
|
100
138
|
p = multiprocessing.Process(target=replica_managers.terminate_cluster,
|
|
101
139
|
args=(info.cluster_name,))
|
|
102
|
-
p.start()
|
|
103
140
|
info2proc[info] = p
|
|
104
141
|
# Set replica status to `SHUTTING_DOWN`
|
|
105
142
|
info.status_property.sky_launch_status = (
|
|
106
|
-
replica_managers.ProcessStatus.SUCCEEDED)
|
|
143
|
+
replica_managers.common_utils.ProcessStatus.SUCCEEDED)
|
|
107
144
|
info.status_property.sky_down_status = (
|
|
108
|
-
replica_managers.ProcessStatus.
|
|
145
|
+
replica_managers.common_utils.ProcessStatus.SCHEDULED)
|
|
109
146
|
serve_state.add_or_update_replica(service_name, info.replica_id, info)
|
|
110
|
-
logger.info(f'
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
147
|
+
logger.info(f'Scheduling to terminate replica {info.replica_id} ...')
|
|
148
|
+
|
|
149
|
+
def _set_to_failed_cleanup(info: replica_managers.ReplicaInfo) -> None:
|
|
150
|
+
nonlocal failed
|
|
151
|
+
# Set replica status to `FAILED_CLEANUP`
|
|
152
|
+
info.status_property.sky_down_status = (
|
|
153
|
+
replica_managers.common_utils.ProcessStatus.FAILED)
|
|
154
|
+
serve_state.add_or_update_replica(service_name, info.replica_id, info)
|
|
155
|
+
failed = True
|
|
156
|
+
logger.error(f'Replica {info.replica_id} failed to terminate.')
|
|
157
|
+
|
|
158
|
+
# Please reference to sky/serve/replica_managers.py::_refresh_process_pool.
|
|
159
|
+
# TODO(tian): Refactor to use the same logic and code.
|
|
160
|
+
while info2proc:
|
|
161
|
+
snapshot = list(info2proc.items())
|
|
162
|
+
for info, p in snapshot:
|
|
163
|
+
if p.is_alive():
|
|
164
|
+
continue
|
|
165
|
+
if (info.status_property.sky_down_status ==
|
|
166
|
+
replica_managers.common_utils.ProcessStatus.SCHEDULED):
|
|
167
|
+
if controller_utils.can_terminate():
|
|
168
|
+
try:
|
|
169
|
+
p.start()
|
|
170
|
+
except Exception as e: # pylint: disable=broad-except
|
|
171
|
+
_set_to_failed_cleanup(info)
|
|
172
|
+
logger.error(f'Failed to start process for replica '
|
|
173
|
+
f'{info.replica_id}: {e}')
|
|
174
|
+
del info2proc[info]
|
|
175
|
+
else:
|
|
176
|
+
info.status_property.sky_down_status = (
|
|
177
|
+
common_utils.ProcessStatus.RUNNING)
|
|
178
|
+
serve_state.add_or_update_replica(
|
|
179
|
+
service_name, info.replica_id, info)
|
|
180
|
+
else:
|
|
181
|
+
logger.info('Terminate process for replica '
|
|
182
|
+
f'{info.replica_id} finished.')
|
|
183
|
+
p.join()
|
|
184
|
+
del info2proc[info]
|
|
185
|
+
if p.exitcode == 0:
|
|
186
|
+
serve_state.remove_replica(service_name, info.replica_id)
|
|
187
|
+
logger.info(
|
|
188
|
+
f'Replica {info.replica_id} terminated successfully.')
|
|
189
|
+
else:
|
|
190
|
+
_set_to_failed_cleanup(info)
|
|
191
|
+
time.sleep(3)
|
|
192
|
+
|
|
124
193
|
versions = serve_state.get_service_versions(service_name)
|
|
125
194
|
serve_state.remove_service_versions(service_name)
|
|
126
195
|
|
|
@@ -152,13 +221,13 @@ def _cleanup_task_run_script(job_id: int) -> None:
|
|
|
152
221
|
logger.warning(f'Task run script {this_task_run_script} not found')
|
|
153
222
|
|
|
154
223
|
|
|
155
|
-
def _start(service_name: str, tmp_task_yaml: str, job_id: int):
|
|
224
|
+
def _start(service_name: str, tmp_task_yaml: str, job_id: int, entrypoint: str):
|
|
156
225
|
"""Starts the service.
|
|
157
226
|
This including the controller and load balancer.
|
|
158
227
|
"""
|
|
159
228
|
# Generate ssh key pair to avoid race condition when multiple sky.launch
|
|
160
229
|
# are executed at the same time.
|
|
161
|
-
|
|
230
|
+
auth_utils.get_or_generate_keys()
|
|
162
231
|
|
|
163
232
|
# Initialize database record for the service.
|
|
164
233
|
task = task_lib.Task.from_yaml(tmp_task_yaml)
|
|
@@ -186,22 +255,28 @@ def _start(service_name: str, tmp_task_yaml: str, job_id: int):
|
|
|
186
255
|
|
|
187
256
|
service_dir = os.path.expanduser(
|
|
188
257
|
serve_utils.generate_remote_service_dir_name(service_name))
|
|
189
|
-
|
|
258
|
+
service_task_yaml = serve_utils.generate_task_yaml_file_name(
|
|
259
|
+
service_name, version)
|
|
190
260
|
|
|
191
261
|
if not is_recovery:
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
262
|
+
with filelock.FileLock(controller_utils.get_resources_lock_path()):
|
|
263
|
+
if not controller_utils.can_start_new_process():
|
|
264
|
+
cleanup_storage(tmp_task_yaml)
|
|
265
|
+
with ux_utils.print_exception_no_traceback():
|
|
266
|
+
raise RuntimeError(
|
|
267
|
+
constants.MAX_NUMBER_OF_SERVICES_REACHED_ERROR)
|
|
268
|
+
success = serve_state.add_service(
|
|
269
|
+
service_name,
|
|
270
|
+
controller_job_id=job_id,
|
|
271
|
+
policy=service_spec.autoscaling_policy_str(),
|
|
272
|
+
requested_resources_str=backend_utils.get_task_resources_str(
|
|
273
|
+
task),
|
|
274
|
+
load_balancing_policy=service_spec.load_balancing_policy,
|
|
275
|
+
status=serve_state.ServiceStatus.CONTROLLER_INIT,
|
|
276
|
+
tls_encrypted=service_spec.tls_credential is not None,
|
|
277
|
+
pool=service_spec.pool,
|
|
278
|
+
controller_pid=os.getpid(),
|
|
279
|
+
entrypoint=entrypoint)
|
|
205
280
|
# Directly throw an error here. See sky/serve/api.py::up
|
|
206
281
|
# for more details.
|
|
207
282
|
if not success:
|
|
@@ -218,7 +293,9 @@ def _start(service_name: str, tmp_task_yaml: str, job_id: int):
|
|
|
218
293
|
# don't want the new file mounts to overwrite the old one, so we
|
|
219
294
|
# sync to a tmp file first and then copy it to the final name
|
|
220
295
|
# if there is no name conflict.
|
|
221
|
-
shutil.copy(tmp_task_yaml,
|
|
296
|
+
shutil.copy(tmp_task_yaml, service_task_yaml)
|
|
297
|
+
else:
|
|
298
|
+
serve_state.update_service_controller_pid(service_name, os.getpid())
|
|
222
299
|
|
|
223
300
|
controller_process = None
|
|
224
301
|
load_balancer_process = None
|
|
@@ -249,8 +326,8 @@ def _start(service_name: str, tmp_task_yaml: str, job_id: int):
|
|
|
249
326
|
controller_host = _get_controller_host()
|
|
250
327
|
controller_process = multiprocessing.Process(
|
|
251
328
|
target=controller.run_controller,
|
|
252
|
-
args=(service_name, service_spec,
|
|
253
|
-
controller_port))
|
|
329
|
+
args=(service_name, service_spec, service_task_yaml,
|
|
330
|
+
controller_host, controller_port))
|
|
254
331
|
controller_process.start()
|
|
255
332
|
|
|
256
333
|
if not is_recovery:
|
|
@@ -271,14 +348,18 @@ def _start(service_name: str, tmp_task_yaml: str, job_id: int):
|
|
|
271
348
|
# TODO(tian): Probably we could enable multiple ports specified in
|
|
272
349
|
# service spec and we could start multiple load balancers.
|
|
273
350
|
# After that, we will have a mapping from replica port to endpoint.
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
|
|
351
|
+
# NOTE(tian): We don't need the load balancer for cluster pool.
|
|
352
|
+
# Skip the load balancer process for cluster pool.
|
|
353
|
+
if not service_spec.pool:
|
|
354
|
+
load_balancer_process = multiprocessing.Process(
|
|
355
|
+
target=ux_utils.RedirectOutputForProcess(
|
|
356
|
+
load_balancer.run_load_balancer,
|
|
357
|
+
load_balancer_log_file).run,
|
|
358
|
+
args=(controller_addr, load_balancer_port,
|
|
359
|
+
service_spec.load_balancing_policy,
|
|
360
|
+
service_spec.tls_credential,
|
|
361
|
+
service_spec.target_qps_per_replica))
|
|
362
|
+
load_balancer_process.start()
|
|
282
363
|
|
|
283
364
|
if not is_recovery:
|
|
284
365
|
serve_state.set_service_load_balancer_port(
|
|
@@ -333,8 +414,12 @@ if __name__ == '__main__':
|
|
|
333
414
|
required=True,
|
|
334
415
|
type=int,
|
|
335
416
|
help='Job id for the service job.')
|
|
417
|
+
parser.add_argument('--entrypoint',
|
|
418
|
+
type=str,
|
|
419
|
+
help='Entrypoint to launch the service',
|
|
420
|
+
required=True)
|
|
336
421
|
args = parser.parse_args()
|
|
337
422
|
# We start process with 'spawn', because 'fork' could result in weird
|
|
338
423
|
# behaviors; 'spawn' is also cross-platform.
|
|
339
424
|
multiprocessing.set_start_method('spawn', force=True)
|
|
340
|
-
_start(args.service_name, args.task_yaml, args.job_id)
|
|
425
|
+
_start(args.service_name, args.task_yaml, args.job_id, args.entrypoint)
|
sky/serve/service_spec.py
CHANGED
|
@@ -2,11 +2,9 @@
|
|
|
2
2
|
import json
|
|
3
3
|
import os
|
|
4
4
|
import textwrap
|
|
5
|
-
import
|
|
6
|
-
from typing import Any, Dict, List, Optional
|
|
5
|
+
from typing import Any, Dict, List, Optional, Union
|
|
7
6
|
|
|
8
7
|
from sky import serve
|
|
9
|
-
from sky.adaptors import common as adaptors_common
|
|
10
8
|
from sky.serve import constants
|
|
11
9
|
from sky.serve import load_balancing_policies as lb_policies
|
|
12
10
|
from sky.serve import serve_utils
|
|
@@ -14,11 +12,7 @@ from sky.serve import spot_placer as spot_placer_lib
|
|
|
14
12
|
from sky.utils import common_utils
|
|
15
13
|
from sky.utils import schemas
|
|
16
14
|
from sky.utils import ux_utils
|
|
17
|
-
|
|
18
|
-
if typing.TYPE_CHECKING:
|
|
19
|
-
import yaml
|
|
20
|
-
else:
|
|
21
|
-
yaml = adaptors_common.LazyImport('yaml')
|
|
15
|
+
from sky.utils import yaml_utils
|
|
22
16
|
|
|
23
17
|
|
|
24
18
|
class SkyServiceSpec:
|
|
@@ -33,7 +27,7 @@ class SkyServiceSpec:
|
|
|
33
27
|
max_replicas: Optional[int] = None,
|
|
34
28
|
num_overprovision: Optional[int] = None,
|
|
35
29
|
ports: Optional[str] = None,
|
|
36
|
-
target_qps_per_replica: Optional[float] = None,
|
|
30
|
+
target_qps_per_replica: Optional[Union[float, Dict[str, float]]] = None,
|
|
37
31
|
post_data: Optional[Dict[str, Any]] = None,
|
|
38
32
|
tls_credential: Optional[serve_utils.TLSCredential] = None,
|
|
39
33
|
readiness_headers: Optional[Dict[str, str]] = None,
|
|
@@ -43,7 +37,33 @@ class SkyServiceSpec:
|
|
|
43
37
|
upscale_delay_seconds: Optional[int] = None,
|
|
44
38
|
downscale_delay_seconds: Optional[int] = None,
|
|
45
39
|
load_balancing_policy: Optional[str] = None,
|
|
40
|
+
pool: Optional[bool] = None,
|
|
46
41
|
) -> None:
|
|
42
|
+
if pool:
|
|
43
|
+
for unsupported_field in [
|
|
44
|
+
'max_replicas',
|
|
45
|
+
'num_overprovision',
|
|
46
|
+
'target_qps_per_replica',
|
|
47
|
+
'upscale_delay_seconds',
|
|
48
|
+
'downscale_delay_seconds',
|
|
49
|
+
'base_ondemand_fallback_replicas',
|
|
50
|
+
'dynamic_ondemand_fallback',
|
|
51
|
+
'spot_placer',
|
|
52
|
+
'load_balancing_policy',
|
|
53
|
+
'ports',
|
|
54
|
+
'post_data',
|
|
55
|
+
'tls_credential',
|
|
56
|
+
'readiness_headers',
|
|
57
|
+
]:
|
|
58
|
+
if locals()[unsupported_field] is not None:
|
|
59
|
+
with ux_utils.print_exception_no_traceback():
|
|
60
|
+
raise ValueError(
|
|
61
|
+
f'{unsupported_field} is not supported for pool.')
|
|
62
|
+
if max_replicas is not None and max_replicas != min_replicas:
|
|
63
|
+
with ux_utils.print_exception_no_traceback():
|
|
64
|
+
raise ValueError('Autoscaling is not supported for pool '
|
|
65
|
+
'for now.')
|
|
66
|
+
|
|
47
67
|
if max_replicas is not None and max_replicas < min_replicas:
|
|
48
68
|
with ux_utils.print_exception_no_traceback():
|
|
49
69
|
raise ValueError('max_replicas must be greater than or '
|
|
@@ -83,7 +103,8 @@ class SkyServiceSpec:
|
|
|
83
103
|
self._max_replicas: Optional[int] = max_replicas
|
|
84
104
|
self._num_overprovision: Optional[int] = num_overprovision
|
|
85
105
|
self._ports: Optional[str] = ports
|
|
86
|
-
self._target_qps_per_replica: Optional[float
|
|
106
|
+
self._target_qps_per_replica: Optional[Union[float, Dict[
|
|
107
|
+
str, float]]] = target_qps_per_replica
|
|
87
108
|
self._post_data: Optional[Dict[str, Any]] = post_data
|
|
88
109
|
self._tls_credential: Optional[serve_utils.TLSCredential] = (
|
|
89
110
|
tls_credential)
|
|
@@ -96,6 +117,7 @@ class SkyServiceSpec:
|
|
|
96
117
|
self._upscale_delay_seconds: Optional[int] = upscale_delay_seconds
|
|
97
118
|
self._downscale_delay_seconds: Optional[int] = downscale_delay_seconds
|
|
98
119
|
self._load_balancing_policy: Optional[str] = load_balancing_policy
|
|
120
|
+
self._pool: Optional[bool] = pool
|
|
99
121
|
|
|
100
122
|
self._use_ondemand_fallback: bool = (
|
|
101
123
|
self.dynamic_ondemand_fallback is not None and
|
|
@@ -115,7 +137,7 @@ class SkyServiceSpec:
|
|
|
115
137
|
|
|
116
138
|
service_config: Dict[str, Any] = {}
|
|
117
139
|
|
|
118
|
-
readiness_section = config
|
|
140
|
+
readiness_section = config.get('readiness_probe', '/')
|
|
119
141
|
if isinstance(readiness_section, str):
|
|
120
142
|
service_config['readiness_path'] = readiness_section
|
|
121
143
|
initial_delay_seconds = None
|
|
@@ -157,8 +179,29 @@ class SkyServiceSpec:
|
|
|
157
179
|
raise ValueError('Port must be between 1 and 65535.')
|
|
158
180
|
service_config['ports'] = str(ports) if ports is not None else None
|
|
159
181
|
|
|
182
|
+
pool_config = config.get('pool', None)
|
|
183
|
+
if pool_config is not None:
|
|
184
|
+
service_config['pool'] = pool_config
|
|
185
|
+
|
|
160
186
|
policy_section = config.get('replica_policy', None)
|
|
187
|
+
if policy_section is not None and pool_config:
|
|
188
|
+
with ux_utils.print_exception_no_traceback():
|
|
189
|
+
raise ValueError('Cannot specify `replica_policy` for cluster '
|
|
190
|
+
'pool. Only `workers: <num>` is supported '
|
|
191
|
+
'for cluster pool now.')
|
|
192
|
+
|
|
161
193
|
simplified_policy_section = config.get('replicas', None)
|
|
194
|
+
workers_config = config.get('workers', None)
|
|
195
|
+
if simplified_policy_section is not None and workers_config is not None:
|
|
196
|
+
with ux_utils.print_exception_no_traceback():
|
|
197
|
+
raise ValueError('Cannot specify both `replicas` and `workers`.'
|
|
198
|
+
' Please use one of them.')
|
|
199
|
+
if simplified_policy_section is not None and pool_config:
|
|
200
|
+
with ux_utils.print_exception_no_traceback():
|
|
201
|
+
raise ValueError('Cannot specify `replicas` for cluster pool. '
|
|
202
|
+
'Please use `workers` instead.')
|
|
203
|
+
if simplified_policy_section is None:
|
|
204
|
+
simplified_policy_section = workers_config
|
|
162
205
|
if policy_section is None or simplified_policy_section is not None:
|
|
163
206
|
if simplified_policy_section is not None:
|
|
164
207
|
min_replicas = simplified_policy_section
|
|
@@ -193,6 +236,26 @@ class SkyServiceSpec:
|
|
|
193
236
|
service_config['load_balancing_policy'] = config.get(
|
|
194
237
|
'load_balancing_policy', None)
|
|
195
238
|
|
|
239
|
+
# Validate instance-aware settings
|
|
240
|
+
target_qps_per_replica = service_config['target_qps_per_replica']
|
|
241
|
+
load_balancing_policy = service_config['load_balancing_policy']
|
|
242
|
+
|
|
243
|
+
if isinstance(target_qps_per_replica, dict):
|
|
244
|
+
if load_balancing_policy != 'instance_aware_least_load':
|
|
245
|
+
with ux_utils.print_exception_no_traceback():
|
|
246
|
+
raise ValueError(
|
|
247
|
+
'When using dict type target_qps_per_replica, '
|
|
248
|
+
'load_balancing_policy must be '
|
|
249
|
+
'"instance_aware_least_load".')
|
|
250
|
+
|
|
251
|
+
if load_balancing_policy == 'instance_aware_least_load':
|
|
252
|
+
if not isinstance(target_qps_per_replica, dict):
|
|
253
|
+
with ux_utils.print_exception_no_traceback():
|
|
254
|
+
raise ValueError(
|
|
255
|
+
'When using "instance_aware_least_load" policy, '
|
|
256
|
+
'target_qps_per_replica must be a '
|
|
257
|
+
'dict mapping GPU types to QPS values.')
|
|
258
|
+
|
|
196
259
|
tls_section = config.get('tls', None)
|
|
197
260
|
if tls_section is not None:
|
|
198
261
|
service_config['tls_credential'] = serve_utils.TLSCredential(
|
|
@@ -205,7 +268,7 @@ class SkyServiceSpec:
|
|
|
205
268
|
@staticmethod
|
|
206
269
|
def from_yaml(yaml_path: str) -> 'SkyServiceSpec':
|
|
207
270
|
with open(os.path.expanduser(yaml_path), 'r', encoding='utf-8') as f:
|
|
208
|
-
config =
|
|
271
|
+
config = yaml_utils.safe_load(f)
|
|
209
272
|
|
|
210
273
|
if isinstance(config, str):
|
|
211
274
|
with ux_utils.print_exception_no_traceback():
|
|
@@ -239,6 +302,13 @@ class SkyServiceSpec:
|
|
|
239
302
|
config[section] = dict()
|
|
240
303
|
config[section][key] = value
|
|
241
304
|
|
|
305
|
+
add_if_not_none('pool', None, self._pool)
|
|
306
|
+
|
|
307
|
+
if self.pool:
|
|
308
|
+
# For pool, currently only `workers: <num>` is supported.
|
|
309
|
+
add_if_not_none('workers', None, self.min_replicas)
|
|
310
|
+
return config
|
|
311
|
+
|
|
242
312
|
add_if_not_none('readiness_probe', 'path', self.readiness_path)
|
|
243
313
|
add_if_not_none('readiness_probe', 'initial_delay_seconds',
|
|
244
314
|
self.initial_delay_seconds)
|
|
@@ -306,10 +376,14 @@ class SkyServiceSpec:
|
|
|
306
376
|
return ' '.join(policy_strs)
|
|
307
377
|
|
|
308
378
|
def autoscaling_policy_str(self):
|
|
379
|
+
if self.pool:
|
|
380
|
+
# We only support fixed-size pool for now.
|
|
381
|
+
return f'Fixed-size ({self.min_replicas} workers)'
|
|
309
382
|
# TODO(MaoZiming): Update policy_str
|
|
383
|
+
noun = 'worker' if self.pool else 'replica'
|
|
310
384
|
min_plural = '' if self.min_replicas == 1 else 's'
|
|
311
385
|
if self.max_replicas == self.min_replicas or self.max_replicas is None:
|
|
312
|
-
return f'Fixed {self.min_replicas}
|
|
386
|
+
return f'Fixed {self.min_replicas} {noun}{min_plural}'
|
|
313
387
|
# Already checked in __init__.
|
|
314
388
|
assert self.target_qps_per_replica is not None
|
|
315
389
|
# TODO(tian): Refactor to contain more information
|
|
@@ -319,8 +393,8 @@ class SkyServiceSpec:
|
|
|
319
393
|
overprovision_str = (
|
|
320
394
|
f' with {self.num_overprovision} overprovisioned replicas')
|
|
321
395
|
return (f'Autoscaling from {self.min_replicas} to {self.max_replicas} '
|
|
322
|
-
f'
|
|
323
|
-
f'
|
|
396
|
+
f'{noun}{max_plural}{overprovision_str} (target QPS per '
|
|
397
|
+
f'{noun}: {self.target_qps_per_replica})')
|
|
324
398
|
|
|
325
399
|
def set_ports(self, ports: str) -> None:
|
|
326
400
|
self._ports = ports
|
|
@@ -332,6 +406,10 @@ class SkyServiceSpec:
|
|
|
332
406
|
f'Certfile: {self.tls_credential.certfile}')
|
|
333
407
|
|
|
334
408
|
def __repr__(self) -> str:
|
|
409
|
+
if self.pool:
|
|
410
|
+
return textwrap.dedent(f"""\
|
|
411
|
+
Worker policy: {self.autoscaling_policy_str()}
|
|
412
|
+
""")
|
|
335
413
|
return textwrap.dedent(f"""\
|
|
336
414
|
Readiness probe method: {self.probe_str()}
|
|
337
415
|
Readiness initial delay seconds: {self.initial_delay_seconds}
|
|
@@ -372,7 +450,8 @@ class SkyServiceSpec:
|
|
|
372
450
|
return self._ports
|
|
373
451
|
|
|
374
452
|
@property
|
|
375
|
-
def target_qps_per_replica(
|
|
453
|
+
def target_qps_per_replica(
|
|
454
|
+
self) -> Optional[Union[float, Dict[str, float]]]:
|
|
376
455
|
return self._target_qps_per_replica
|
|
377
456
|
|
|
378
457
|
@property
|
|
@@ -420,3 +499,43 @@ class SkyServiceSpec:
|
|
|
420
499
|
def load_balancing_policy(self) -> str:
|
|
421
500
|
return lb_policies.LoadBalancingPolicy.make_policy_name(
|
|
422
501
|
self._load_balancing_policy)
|
|
502
|
+
|
|
503
|
+
@property
|
|
504
|
+
def pool(self) -> bool:
|
|
505
|
+
# This can happen for backward compatibility.
|
|
506
|
+
if not hasattr(self, '_pool'):
|
|
507
|
+
return False
|
|
508
|
+
return bool(self._pool)
|
|
509
|
+
|
|
510
|
+
def copy(self, **override) -> 'SkyServiceSpec':
|
|
511
|
+
return SkyServiceSpec(
|
|
512
|
+
readiness_path=override.pop('readiness_path', self._readiness_path),
|
|
513
|
+
initial_delay_seconds=override.pop('initial_delay_seconds',
|
|
514
|
+
self._initial_delay_seconds),
|
|
515
|
+
readiness_timeout_seconds=override.pop(
|
|
516
|
+
'readiness_timeout_seconds', self._readiness_timeout_seconds),
|
|
517
|
+
min_replicas=override.pop('min_replicas', self._min_replicas),
|
|
518
|
+
max_replicas=override.pop('max_replicas', self._max_replicas),
|
|
519
|
+
num_overprovision=override.pop('num_overprovision',
|
|
520
|
+
self._num_overprovision),
|
|
521
|
+
ports=override.pop('ports', self._ports),
|
|
522
|
+
target_qps_per_replica=override.pop('target_qps_per_replica',
|
|
523
|
+
self._target_qps_per_replica),
|
|
524
|
+
post_data=override.pop('post_data', self._post_data),
|
|
525
|
+
tls_credential=override.pop('tls_credential', self._tls_credential),
|
|
526
|
+
readiness_headers=override.pop('readiness_headers',
|
|
527
|
+
self._readiness_headers),
|
|
528
|
+
dynamic_ondemand_fallback=override.pop(
|
|
529
|
+
'dynamic_ondemand_fallback', self._dynamic_ondemand_fallback),
|
|
530
|
+
base_ondemand_fallback_replicas=override.pop(
|
|
531
|
+
'base_ondemand_fallback_replicas',
|
|
532
|
+
self._base_ondemand_fallback_replicas),
|
|
533
|
+
spot_placer=override.pop('spot_placer', self._spot_placer),
|
|
534
|
+
upscale_delay_seconds=override.pop('upscale_delay_seconds',
|
|
535
|
+
self._upscale_delay_seconds),
|
|
536
|
+
downscale_delay_seconds=override.pop('downscale_delay_seconds',
|
|
537
|
+
self._downscale_delay_seconds),
|
|
538
|
+
load_balancing_policy=override.pop('load_balancing_policy',
|
|
539
|
+
self._load_balancing_policy),
|
|
540
|
+
pool=override.pop('pool', self._pool),
|
|
541
|
+
)
|
sky/serve/spot_placer.py
CHANGED
|
@@ -46,6 +46,8 @@ class Location:
|
|
|
46
46
|
|
|
47
47
|
@classmethod
|
|
48
48
|
def from_resources(cls, resources: 'resources_lib.Resources') -> 'Location':
|
|
49
|
+
assert resources.cloud is not None, 'Cloud must be specified'
|
|
50
|
+
assert resources.region is not None, 'Region must be specified'
|
|
49
51
|
return cls(resources.cloud, resources.region, resources.zone)
|
|
50
52
|
|
|
51
53
|
def to_dict(self) -> Dict[str, Any]:
|
|
@@ -147,6 +149,7 @@ def _get_possible_location_from_task(task: 'task_lib.Task') -> List[Location]:
|
|
|
147
149
|
cloud_str = str(launchable.cloud)
|
|
148
150
|
region = launchable.region
|
|
149
151
|
zone = launchable.zone
|
|
152
|
+
assert region is not None, 'Region must be specified'
|
|
150
153
|
if (cloud_str not in location_requirements and
|
|
151
154
|
location_requirements):
|
|
152
155
|
continue
|
|
File without changes
|