skypilot-nightly 1.0.0.dev20250509__py3-none-any.whl → 1.0.0.dev20251107__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of skypilot-nightly might be problematic.
- sky/__init__.py +22 -6
- sky/adaptors/aws.py +25 -7
- sky/adaptors/common.py +24 -1
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/do.py +8 -2
- sky/adaptors/hyperbolic.py +8 -0
- sky/adaptors/kubernetes.py +149 -18
- sky/adaptors/nebius.py +170 -17
- sky/adaptors/primeintellect.py +1 -0
- sky/adaptors/runpod.py +68 -0
- sky/adaptors/seeweb.py +167 -0
- sky/adaptors/shadeform.py +89 -0
- sky/admin_policy.py +187 -4
- sky/authentication.py +179 -225
- sky/backends/__init__.py +4 -2
- sky/backends/backend.py +22 -9
- sky/backends/backend_utils.py +1299 -380
- sky/backends/cloud_vm_ray_backend.py +1715 -518
- sky/backends/docker_utils.py +1 -1
- sky/backends/local_docker_backend.py +11 -6
- sky/backends/wheel_utils.py +37 -9
- sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
- sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
- sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
- sky/{clouds/service_catalog → catalog}/common.py +89 -48
- sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
- sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +30 -40
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +42 -15
- sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
- sky/catalog/data_fetchers/fetch_nebius.py +335 -0
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
- sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
- sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
- sky/catalog/hyperbolic_catalog.py +136 -0
- sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
- sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
- sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
- sky/catalog/primeintellect_catalog.py +95 -0
- sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
- sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
- sky/catalog/seeweb_catalog.py +184 -0
- sky/catalog/shadeform_catalog.py +165 -0
- sky/catalog/ssh_catalog.py +167 -0
- sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
- sky/check.py +491 -203
- sky/cli.py +5 -6005
- sky/client/{cli.py → cli/command.py} +2477 -1885
- sky/client/cli/deprecation_utils.py +99 -0
- sky/client/cli/flags.py +359 -0
- sky/client/cli/table_utils.py +320 -0
- sky/client/common.py +70 -32
- sky/client/oauth.py +82 -0
- sky/client/sdk.py +1203 -297
- sky/client/sdk_async.py +833 -0
- sky/client/service_account_auth.py +47 -0
- sky/cloud_stores.py +73 -0
- sky/clouds/__init__.py +13 -0
- sky/clouds/aws.py +358 -93
- sky/clouds/azure.py +105 -83
- sky/clouds/cloud.py +127 -36
- sky/clouds/cudo.py +68 -50
- sky/clouds/do.py +66 -48
- sky/clouds/fluidstack.py +63 -44
- sky/clouds/gcp.py +339 -110
- sky/clouds/hyperbolic.py +293 -0
- sky/clouds/ibm.py +70 -49
- sky/clouds/kubernetes.py +563 -162
- sky/clouds/lambda_cloud.py +74 -54
- sky/clouds/nebius.py +206 -80
- sky/clouds/oci.py +88 -66
- sky/clouds/paperspace.py +61 -44
- sky/clouds/primeintellect.py +317 -0
- sky/clouds/runpod.py +164 -74
- sky/clouds/scp.py +89 -83
- sky/clouds/seeweb.py +466 -0
- sky/clouds/shadeform.py +400 -0
- sky/clouds/ssh.py +263 -0
- sky/clouds/utils/aws_utils.py +10 -4
- sky/clouds/utils/gcp_utils.py +87 -11
- sky/clouds/utils/oci_utils.py +38 -14
- sky/clouds/utils/scp_utils.py +177 -124
- sky/clouds/vast.py +99 -77
- sky/clouds/vsphere.py +51 -40
- sky/core.py +349 -139
- sky/dag.py +15 -0
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
- sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
- sky/dashboard/out/_next/static/chunks/1871-74503c8e80fd253b.js +6 -0
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
- sky/dashboard/out/_next/static/chunks/2755.fff53c4a3fcae910.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.72362fa129305b19.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.ad6adaa2a0fa9768.js +1 -0
- sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
- sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
- sky/dashboard/out/_next/static/chunks/4725.a830b5c9e7867c92.js +1 -0
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
- sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
- sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
- sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
- sky/dashboard/out/_next/static/chunks/6601-06114c982db410b6.js +1 -0
- sky/dashboard/out/_next/static/chunks/6856-ef8ba11f96d8c4a3.js +1 -0
- sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-32b6e2d3822301fa.js +1 -0
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
- sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
- sky/dashboard/out/_next/static/chunks/7615-3301e838e5f25772.js +1 -0
- sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
- sky/dashboard/out/_next/static/chunks/8969-1e4613c651bf4051.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.7310982cf5a0dc79.js +31 -0
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
- sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
- sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
- sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
- sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-c736ead69c2d86ec.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a37d2063af475a1c.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-d44859594e6f8064.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-5796e8d6aea291a0.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-6edeb7d06032adfc.js +21 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-479dde13399cf270.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-5ab3b907622cf0fe.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-c5a3eeee1c218af1.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces-22b23febb3e89ce1.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-2679be77fc08a2f8.js +1 -0
- sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
- sky/dashboard/out/_next/static/zB0ed6ge_W1MDszVHhijS/_buildManifest.js +1 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -0
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -0
- sky/dashboard/out/infra.html +1 -0
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -0
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -0
- sky/dashboard/out/volumes.html +1 -0
- sky/dashboard/out/workspace/new.html +1 -0
- sky/dashboard/out/workspaces/[name].html +1 -0
- sky/dashboard/out/workspaces.html +1 -0
- sky/data/data_utils.py +137 -1
- sky/data/mounting_utils.py +269 -84
- sky/data/storage.py +1451 -1807
- sky/data/storage_utils.py +43 -57
- sky/exceptions.py +132 -2
- sky/execution.py +206 -63
- sky/global_user_state.py +2374 -586
- sky/jobs/__init__.py +5 -0
- sky/jobs/client/sdk.py +242 -65
- sky/jobs/client/sdk_async.py +143 -0
- sky/jobs/constants.py +9 -8
- sky/jobs/controller.py +839 -277
- sky/jobs/file_content_utils.py +80 -0
- sky/jobs/log_gc.py +201 -0
- sky/jobs/recovery_strategy.py +398 -152
- sky/jobs/scheduler.py +315 -189
- sky/jobs/server/core.py +829 -255
- sky/jobs/server/server.py +156 -115
- sky/jobs/server/utils.py +136 -0
- sky/jobs/state.py +2092 -701
- sky/jobs/utils.py +1242 -160
- sky/logs/__init__.py +21 -0
- sky/logs/agent.py +108 -0
- sky/logs/aws.py +243 -0
- sky/logs/gcp.py +91 -0
- sky/metrics/__init__.py +0 -0
- sky/metrics/utils.py +443 -0
- sky/models.py +78 -1
- sky/optimizer.py +164 -70
- sky/provision/__init__.py +90 -4
- sky/provision/aws/config.py +147 -26
- sky/provision/aws/instance.py +135 -50
- sky/provision/azure/instance.py +10 -5
- sky/provision/common.py +13 -1
- sky/provision/cudo/cudo_machine_type.py +1 -1
- sky/provision/cudo/cudo_utils.py +14 -8
- sky/provision/cudo/cudo_wrapper.py +72 -71
- sky/provision/cudo/instance.py +10 -6
- sky/provision/do/instance.py +10 -6
- sky/provision/do/utils.py +4 -3
- sky/provision/docker_utils.py +114 -23
- sky/provision/fluidstack/instance.py +13 -8
- sky/provision/gcp/__init__.py +1 -0
- sky/provision/gcp/config.py +301 -19
- sky/provision/gcp/constants.py +218 -0
- sky/provision/gcp/instance.py +36 -8
- sky/provision/gcp/instance_utils.py +18 -4
- sky/provision/gcp/volume_utils.py +247 -0
- sky/provision/hyperbolic/__init__.py +12 -0
- sky/provision/hyperbolic/config.py +10 -0
- sky/provision/hyperbolic/instance.py +437 -0
- sky/provision/hyperbolic/utils.py +373 -0
- sky/provision/instance_setup.py +93 -14
- sky/provision/kubernetes/__init__.py +5 -0
- sky/provision/kubernetes/config.py +9 -52
- sky/provision/kubernetes/constants.py +17 -0
- sky/provision/kubernetes/instance.py +789 -247
- sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
- sky/provision/kubernetes/network.py +27 -17
- sky/provision/kubernetes/network_utils.py +40 -43
- sky/provision/kubernetes/utils.py +1192 -531
- sky/provision/kubernetes/volume.py +282 -0
- sky/provision/lambda_cloud/instance.py +22 -16
- sky/provision/nebius/constants.py +50 -0
- sky/provision/nebius/instance.py +19 -6
- sky/provision/nebius/utils.py +196 -91
- sky/provision/oci/instance.py +10 -5
- sky/provision/paperspace/instance.py +10 -7
- sky/provision/paperspace/utils.py +1 -1
- sky/provision/primeintellect/__init__.py +10 -0
- sky/provision/primeintellect/config.py +11 -0
- sky/provision/primeintellect/instance.py +454 -0
- sky/provision/primeintellect/utils.py +398 -0
- sky/provision/provisioner.py +110 -36
- sky/provision/runpod/__init__.py +5 -0
- sky/provision/runpod/instance.py +27 -6
- sky/provision/runpod/utils.py +51 -18
- sky/provision/runpod/volume.py +180 -0
- sky/provision/scp/__init__.py +15 -0
- sky/provision/scp/config.py +93 -0
- sky/provision/scp/instance.py +531 -0
- sky/provision/seeweb/__init__.py +11 -0
- sky/provision/seeweb/config.py +13 -0
- sky/provision/seeweb/instance.py +807 -0
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/ssh/__init__.py +18 -0
- sky/provision/vast/instance.py +13 -8
- sky/provision/vast/utils.py +10 -7
- sky/provision/vsphere/common/vim_utils.py +1 -2
- sky/provision/vsphere/instance.py +15 -10
- sky/provision/vsphere/vsphere_utils.py +9 -19
- sky/py.typed +0 -0
- sky/resources.py +844 -118
- sky/schemas/__init__.py +0 -0
- sky/schemas/api/__init__.py +0 -0
- sky/schemas/api/responses.py +225 -0
- sky/schemas/db/README +4 -0
- sky/schemas/db/env.py +90 -0
- sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
- sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
- sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
- sky/schemas/db/global_user_state/004_is_managed.py +34 -0
- sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
- sky/schemas/db/global_user_state/006_provision_log.py +41 -0
- sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
- sky/schemas/db/script.py.mako +28 -0
- sky/schemas/db/serve_state/001_initial_schema.py +67 -0
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
- sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
- sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/generated/__init__.py +0 -0
- sky/schemas/generated/autostopv1_pb2.py +36 -0
- sky/schemas/generated/autostopv1_pb2.pyi +43 -0
- sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
- sky/schemas/generated/jobsv1_pb2.py +86 -0
- sky/schemas/generated/jobsv1_pb2.pyi +254 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +74 -0
- sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
- sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
- sky/schemas/generated/servev1_pb2.py +58 -0
- sky/schemas/generated/servev1_pb2.pyi +115 -0
- sky/schemas/generated/servev1_pb2_grpc.py +322 -0
- sky/serve/autoscalers.py +357 -5
- sky/serve/client/impl.py +310 -0
- sky/serve/client/sdk.py +47 -139
- sky/serve/client/sdk_async.py +130 -0
- sky/serve/constants.py +10 -8
- sky/serve/controller.py +64 -19
- sky/serve/load_balancer.py +106 -60
- sky/serve/load_balancing_policies.py +115 -1
- sky/serve/replica_managers.py +273 -162
- sky/serve/serve_rpc_utils.py +179 -0
- sky/serve/serve_state.py +554 -251
- sky/serve/serve_utils.py +733 -220
- sky/serve/server/core.py +66 -711
- sky/serve/server/impl.py +1093 -0
- sky/serve/server/server.py +21 -18
- sky/serve/service.py +133 -48
- sky/serve/service_spec.py +135 -16
- sky/serve/spot_placer.py +3 -0
- sky/server/auth/__init__.py +0 -0
- sky/server/auth/authn.py +50 -0
- sky/server/auth/loopback.py +38 -0
- sky/server/auth/oauth2_proxy.py +200 -0
- sky/server/common.py +475 -181
- sky/server/config.py +81 -23
- sky/server/constants.py +44 -6
- sky/server/daemons.py +229 -0
- sky/server/html/token_page.html +185 -0
- sky/server/metrics.py +160 -0
- sky/server/requests/executor.py +528 -138
- sky/server/requests/payloads.py +351 -17
- sky/server/requests/preconditions.py +21 -17
- sky/server/requests/process.py +112 -29
- sky/server/requests/request_names.py +120 -0
- sky/server/requests/requests.py +817 -224
- sky/server/requests/serializers/decoders.py +82 -31
- sky/server/requests/serializers/encoders.py +140 -22
- sky/server/requests/threads.py +106 -0
- sky/server/rest.py +417 -0
- sky/server/server.py +1290 -284
- sky/server/state.py +20 -0
- sky/server/stream_utils.py +345 -57
- sky/server/uvicorn.py +217 -3
- sky/server/versions.py +270 -0
- sky/setup_files/MANIFEST.in +5 -0
- sky/setup_files/alembic.ini +156 -0
- sky/setup_files/dependencies.py +136 -31
- sky/setup_files/setup.py +44 -42
- sky/sky_logging.py +102 -5
- sky/skylet/attempt_skylet.py +1 -0
- sky/skylet/autostop_lib.py +129 -8
- sky/skylet/configs.py +27 -20
- sky/skylet/constants.py +171 -19
- sky/skylet/events.py +105 -21
- sky/skylet/job_lib.py +335 -104
- sky/skylet/log_lib.py +297 -18
- sky/skylet/log_lib.pyi +44 -1
- sky/skylet/ray_patches/__init__.py +17 -3
- sky/skylet/ray_patches/autoscaler.py.diff +18 -0
- sky/skylet/ray_patches/cli.py.diff +19 -0
- sky/skylet/ray_patches/command_runner.py.diff +17 -0
- sky/skylet/ray_patches/log_monitor.py.diff +20 -0
- sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
- sky/skylet/ray_patches/updater.py.diff +18 -0
- sky/skylet/ray_patches/worker.py.diff +41 -0
- sky/skylet/services.py +564 -0
- sky/skylet/skylet.py +63 -4
- sky/skylet/subprocess_daemon.py +103 -29
- sky/skypilot_config.py +506 -99
- sky/ssh_node_pools/__init__.py +1 -0
- sky/ssh_node_pools/core.py +135 -0
- sky/ssh_node_pools/server.py +233 -0
- sky/task.py +621 -137
- sky/templates/aws-ray.yml.j2 +10 -3
- sky/templates/azure-ray.yml.j2 +1 -1
- sky/templates/do-ray.yml.j2 +1 -1
- sky/templates/gcp-ray.yml.j2 +57 -0
- sky/templates/hyperbolic-ray.yml.j2 +67 -0
- sky/templates/jobs-controller.yaml.j2 +27 -24
- sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
- sky/templates/kubernetes-ray.yml.j2 +607 -51
- sky/templates/lambda-ray.yml.j2 +1 -1
- sky/templates/nebius-ray.yml.j2 +33 -12
- sky/templates/paperspace-ray.yml.j2 +1 -1
- sky/templates/primeintellect-ray.yml.j2 +71 -0
- sky/templates/runpod-ray.yml.j2 +9 -1
- sky/templates/scp-ray.yml.j2 +3 -50
- sky/templates/seeweb-ray.yml.j2 +108 -0
- sky/templates/shadeform-ray.yml.j2 +72 -0
- sky/templates/sky-serve-controller.yaml.j2 +22 -2
- sky/templates/websocket_proxy.py +178 -18
- sky/usage/usage_lib.py +18 -11
- sky/users/__init__.py +0 -0
- sky/users/model.conf +15 -0
- sky/users/permission.py +387 -0
- sky/users/rbac.py +121 -0
- sky/users/server.py +720 -0
- sky/users/token_service.py +218 -0
- sky/utils/accelerator_registry.py +34 -5
- sky/utils/admin_policy_utils.py +84 -38
- sky/utils/annotations.py +16 -5
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/auth_utils.py +153 -0
- sky/utils/benchmark_utils.py +60 -0
- sky/utils/cli_utils/status_utils.py +159 -86
- sky/utils/cluster_utils.py +31 -9
- sky/utils/command_runner.py +354 -68
- sky/utils/command_runner.pyi +93 -3
- sky/utils/common.py +35 -8
- sky/utils/common_utils.py +310 -87
- sky/utils/config_utils.py +87 -5
- sky/utils/context.py +402 -0
- sky/utils/context_utils.py +222 -0
- sky/utils/controller_utils.py +264 -89
- sky/utils/dag_utils.py +31 -12
- sky/utils/db/__init__.py +0 -0
- sky/utils/db/db_utils.py +470 -0
- sky/utils/db/migration_utils.py +133 -0
- sky/utils/directory_utils.py +12 -0
- sky/utils/env_options.py +13 -0
- sky/utils/git.py +567 -0
- sky/utils/git_clone.sh +460 -0
- sky/utils/infra_utils.py +195 -0
- sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
- sky/utils/kubernetes/config_map_utils.py +133 -0
- sky/utils/kubernetes/create_cluster.sh +13 -27
- sky/utils/kubernetes/delete_cluster.sh +10 -7
- sky/utils/kubernetes/deploy_remote_cluster.py +1299 -0
- sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
- sky/utils/kubernetes/generate_kind_config.py +6 -66
- sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
- sky/utils/kubernetes/gpu_labeler.py +5 -5
- sky/utils/kubernetes/kubernetes_deploy_utils.py +354 -47
- sky/utils/kubernetes/ssh-tunnel.sh +379 -0
- sky/utils/kubernetes/ssh_utils.py +221 -0
- sky/utils/kubernetes_enums.py +8 -15
- sky/utils/lock_events.py +94 -0
- sky/utils/locks.py +368 -0
- sky/utils/log_utils.py +300 -6
- sky/utils/perf_utils.py +22 -0
- sky/utils/resource_checker.py +298 -0
- sky/utils/resources_utils.py +249 -32
- sky/utils/rich_utils.py +213 -37
- sky/utils/schemas.py +905 -147
- sky/utils/serialize_utils.py +16 -0
- sky/utils/status_lib.py +10 -0
- sky/utils/subprocess_utils.py +38 -15
- sky/utils/tempstore.py +70 -0
- sky/utils/timeline.py +24 -52
- sky/utils/ux_utils.py +84 -15
- sky/utils/validator.py +11 -1
- sky/utils/volume.py +86 -0
- sky/utils/yaml_utils.py +111 -0
- sky/volumes/__init__.py +13 -0
- sky/volumes/client/__init__.py +0 -0
- sky/volumes/client/sdk.py +149 -0
- sky/volumes/server/__init__.py +0 -0
- sky/volumes/server/core.py +258 -0
- sky/volumes/server/server.py +122 -0
- sky/volumes/volume.py +212 -0
- sky/workspaces/__init__.py +0 -0
- sky/workspaces/core.py +655 -0
- sky/workspaces/server.py +101 -0
- sky/workspaces/utils.py +56 -0
- skypilot_nightly-1.0.0.dev20251107.dist-info/METADATA +675 -0
- skypilot_nightly-1.0.0.dev20251107.dist-info/RECORD +594 -0
- {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/WHEEL +1 -1
- sky/benchmark/benchmark_state.py +0 -256
- sky/benchmark/benchmark_utils.py +0 -641
- sky/clouds/service_catalog/constants.py +0 -7
- sky/dashboard/out/_next/static/LksQgChY5izXjokL3LcEu/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/236-f49500b82ad5392d.js +0 -6
- sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
- sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
- sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
- sky/dashboard/out/_next/static/chunks/845-0f8017370869e269.js +0 -1
- sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
- sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
- sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
- sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
- sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-e15db85d0ea1fbe1.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-03f279c6741fb48b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
- sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
- sky/jobs/dashboard/dashboard.py +0 -223
- sky/jobs/dashboard/static/favicon.ico +0 -0
- sky/jobs/dashboard/templates/index.html +0 -831
- sky/jobs/server/dashboard_utils.py +0 -69
- sky/skylet/providers/scp/__init__.py +0 -2
- sky/skylet/providers/scp/config.py +0 -149
- sky/skylet/providers/scp/node_provider.py +0 -578
- sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
- sky/utils/db_utils.py +0 -100
- sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
- skypilot_nightly-1.0.0.dev20250509.dist-info/METADATA +0 -361
- skypilot_nightly-1.0.0.dev20250509.dist-info/RECORD +0 -396
- /sky/{clouds/service_catalog → catalog}/config.py +0 -0
- /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
- /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
- /sky/dashboard/out/_next/static/{LksQgChY5izXjokL3LcEu → zB0ed6ge_W1MDszVHhijS}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/top_level.txt +0 -0
sky/serve/replica_managers.py
CHANGED
|
@@ -1,6 +1,5 @@
|
|
|
1
1
|
"""ReplicaManager: handles the creation and deletion of endpoint replicas."""
|
|
2
2
|
import dataclasses
|
|
3
|
-
import enum
|
|
4
3
|
import functools
|
|
5
4
|
import multiprocessing
|
|
6
5
|
from multiprocessing import pool as mp_pool
|
|
@@ -12,53 +11,56 @@ import typing
|
|
|
12
11
|
from typing import Any, Dict, List, Optional, Tuple
|
|
13
12
|
|
|
14
13
|
import colorama
|
|
15
|
-
import
|
|
14
|
+
import filelock
|
|
16
15
|
import requests
|
|
17
16
|
|
|
18
|
-
import sky
|
|
19
17
|
from sky import backends
|
|
20
18
|
from sky import core
|
|
21
19
|
from sky import exceptions
|
|
22
20
|
from sky import execution
|
|
23
21
|
from sky import global_user_state
|
|
24
22
|
from sky import sky_logging
|
|
23
|
+
from sky import task as task_lib
|
|
25
24
|
from sky.backends import backend_utils
|
|
26
25
|
from sky.serve import constants as serve_constants
|
|
27
26
|
from sky.serve import serve_state
|
|
28
27
|
from sky.serve import serve_utils
|
|
29
28
|
from sky.serve import service
|
|
30
29
|
from sky.serve import spot_placer
|
|
30
|
+
from sky.server.requests import request_names
|
|
31
31
|
from sky.skylet import constants
|
|
32
32
|
from sky.skylet import job_lib
|
|
33
33
|
from sky.usage import usage_lib
|
|
34
34
|
from sky.utils import common_utils
|
|
35
35
|
from sky.utils import controller_utils
|
|
36
36
|
from sky.utils import env_options
|
|
37
|
+
from sky.utils import resources_utils
|
|
37
38
|
from sky.utils import status_lib
|
|
38
39
|
from sky.utils import ux_utils
|
|
40
|
+
from sky.utils import yaml_utils
|
|
39
41
|
|
|
40
42
|
if typing.TYPE_CHECKING:
|
|
41
|
-
from sky import resources
|
|
42
43
|
from sky.serve import service_spec
|
|
43
44
|
|
|
44
45
|
logger = sky_logging.init_logger(__name__)
|
|
45
46
|
|
|
46
47
|
_JOB_STATUS_FETCH_INTERVAL = 30
|
|
47
48
|
_PROCESS_POOL_REFRESH_INTERVAL = 20
|
|
48
|
-
# TODO(tian): Maybe let user determine this threshold
|
|
49
|
-
_CONSECUTIVE_FAILURE_THRESHOLD_TIMEOUT = 180
|
|
50
49
|
_RETRY_INIT_GAP_SECONDS = 60
|
|
51
50
|
_DEFAULT_DRAIN_SECONDS = 120
|
|
52
51
|
|
|
53
|
-
#
|
|
54
|
-
#
|
|
55
|
-
|
|
52
|
+
# TODO(tian): Backward compatibility. Remove this after 3 minor release, i.e.
|
|
53
|
+
# 0.13.0. We move the ProcessStatus to common_utils.ProcessStatus in #6666, but
|
|
54
|
+
# old ReplicaInfo in database will still tries to unpickle using ProcessStatus
|
|
55
|
+
# in replica_managers. We set this alias to avoid breaking changes. See #6729
|
|
56
|
+
# for more details.
|
|
57
|
+
ProcessStatus = common_utils.ProcessStatus
|
|
56
58
|
|
|
57
59
|
|
|
58
60
|
# TODO(tian): Combine this with
|
|
59
61
|
# sky/spot/recovery_strategy.py::StrategyExecutor::launch
|
|
60
62
|
def launch_cluster(replica_id: int,
|
|
61
|
-
|
|
63
|
+
service_task_yaml_path: str,
|
|
62
64
|
cluster_name: str,
|
|
63
65
|
resources_override: Optional[Dict[str, Any]] = None,
|
|
64
66
|
retry_until_up: bool = True,
|
|
@@ -78,8 +80,9 @@ def launch_cluster(replica_id: int,
|
|
|
78
80
|
f'{cluster_name} with resources override: '
|
|
79
81
|
f'{resources_override}')
|
|
80
82
|
try:
|
|
81
|
-
config =
|
|
82
|
-
|
|
83
|
+
config = yaml_utils.read_yaml(
|
|
84
|
+
os.path.expanduser(service_task_yaml_path))
|
|
85
|
+
task = task_lib.Task.from_yaml_config(config)
|
|
83
86
|
if resources_override is not None:
|
|
84
87
|
resources = task.resources
|
|
85
88
|
overrided_resources = [
|
|
@@ -105,6 +108,8 @@ def launch_cluster(replica_id: int,
|
|
|
105
108
|
execution.launch(task,
|
|
106
109
|
cluster_name,
|
|
107
110
|
retry_until_up=retry_until_up,
|
|
111
|
+
_request_name=request_names.AdminPolicyRequestName.
|
|
112
|
+
SERVE_LAUNCH_REPLICA,
|
|
108
113
|
_is_launched_by_sky_serve_controller=True)
|
|
109
114
|
logger.info(f'Replica cluster {cluster_name} launched.')
|
|
110
115
|
except (exceptions.InvalidClusterNameError,
|
|
@@ -173,17 +178,19 @@ def terminate_cluster(cluster_name: str,
|
|
|
173
178
|
time.sleep(gap_seconds)
|
|
174
179
|
|
|
175
180
|
|
|
176
|
-
def _get_resources_ports(
|
|
181
|
+
def _get_resources_ports(service_task_yaml_path: str) -> str:
|
|
177
182
|
"""Get the resources ports used by the task."""
|
|
178
|
-
task =
|
|
183
|
+
task = task_lib.Task.from_yaml(service_task_yaml_path)
|
|
179
184
|
# Already checked all ports are valid in sky.serve.core.up
|
|
180
185
|
assert task.resources, task
|
|
181
186
|
assert task.service is not None, task
|
|
187
|
+
if task.service.pool:
|
|
188
|
+
return '-'
|
|
182
189
|
assert task.service.ports is not None, task
|
|
183
190
|
return task.service.ports
|
|
184
191
|
|
|
185
192
|
|
|
186
|
-
def _should_use_spot(
|
|
193
|
+
def _should_use_spot(service_task_yaml_path: str,
|
|
187
194
|
resource_override: Optional[Dict[str, Any]]) -> bool:
|
|
188
195
|
"""Get whether the task should use spot."""
|
|
189
196
|
if resource_override is not None:
|
|
@@ -191,7 +198,7 @@ def _should_use_spot(task_yaml: str,
|
|
|
191
198
|
if use_spot_override is not None:
|
|
192
199
|
assert isinstance(use_spot_override, bool)
|
|
193
200
|
return use_spot_override
|
|
194
|
-
task =
|
|
201
|
+
task = task_lib.Task.from_yaml(service_task_yaml_path)
|
|
195
202
|
spot_use_resources = [
|
|
196
203
|
resources for resources in task.resources if resources.use_spot
|
|
197
204
|
]
|
|
@@ -200,6 +207,12 @@ def _should_use_spot(task_yaml: str,
|
|
|
200
207
|
return len(spot_use_resources) == len(task.resources)
|
|
201
208
|
|
|
202
209
|
|
|
210
|
+
# Every function that calls serve_state.add_or_update_replica should acquire
|
|
211
|
+
# this lock. It is to prevent race condition when the replica status is updated
|
|
212
|
+
# by multiple threads at the same time. The modification of replica info is
|
|
213
|
+
# 2 database calls: read the whole replica info object, unpickle it, and modify
|
|
214
|
+
# corresponding fields. Then it is write back to the database. We need to ensure
|
|
215
|
+
# the read-modify-write operation is atomic.
|
|
203
216
|
def with_lock(func):
|
|
204
217
|
|
|
205
218
|
@functools.wraps(func)
|
|
@@ -210,22 +223,6 @@ def with_lock(func):
|
|
|
210
223
|
return wrapper
|
|
211
224
|
|
|
212
225
|
|
|
213
|
-
class ProcessStatus(enum.Enum):
|
|
214
|
-
"""Process status."""
|
|
215
|
-
|
|
216
|
-
# The process is running
|
|
217
|
-
RUNNING = 'RUNNING'
|
|
218
|
-
|
|
219
|
-
# The process is finished and succeeded
|
|
220
|
-
SUCCEEDED = 'SUCCEEDED'
|
|
221
|
-
|
|
222
|
-
# The process is interrupted
|
|
223
|
-
INTERRUPTED = 'INTERRUPTED'
|
|
224
|
-
|
|
225
|
-
# The process failed
|
|
226
|
-
FAILED = 'FAILED'
|
|
227
|
-
|
|
228
|
-
|
|
229
226
|
@dataclasses.dataclass
|
|
230
227
|
class ReplicaStatusProperty:
|
|
231
228
|
"""Some properties that determine replica status.
|
|
@@ -237,15 +234,16 @@ class ReplicaStatusProperty:
|
|
|
237
234
|
first_ready_time: The first time the service is ready.
|
|
238
235
|
sky_down_status: Process status of sky.down.
|
|
239
236
|
"""
|
|
240
|
-
#
|
|
241
|
-
sky_launch_status:
|
|
237
|
+
# sky.launch will always be scheduled on creation of ReplicaStatusProperty.
|
|
238
|
+
sky_launch_status: common_utils.ProcessStatus = (
|
|
239
|
+
common_utils.ProcessStatus.SCHEDULED)
|
|
242
240
|
user_app_failed: bool = False
|
|
243
241
|
service_ready_now: bool = False
|
|
244
242
|
# None means readiness probe is not succeeded yet;
|
|
245
243
|
# -1 means the initial delay seconds is exceeded.
|
|
246
244
|
first_ready_time: Optional[float] = None
|
|
247
245
|
# None means sky.down is not called yet.
|
|
248
|
-
sky_down_status: Optional[ProcessStatus] = None
|
|
246
|
+
sky_down_status: Optional[common_utils.ProcessStatus] = None
|
|
249
247
|
# Whether the termination is caused by autoscaler's decision
|
|
250
248
|
is_scale_down: bool = False
|
|
251
249
|
# The replica's spot instance was preempted.
|
|
@@ -300,7 +298,7 @@ class ReplicaStatusProperty:
|
|
|
300
298
|
(1) Job status;
|
|
301
299
|
(2) Readiness probe.
|
|
302
300
|
"""
|
|
303
|
-
if self.sky_launch_status != ProcessStatus.SUCCEEDED:
|
|
301
|
+
if self.sky_launch_status != common_utils.ProcessStatus.SUCCEEDED:
|
|
304
302
|
return False
|
|
305
303
|
if self.sky_down_status is not None:
|
|
306
304
|
return False
|
|
@@ -314,37 +312,43 @@ class ReplicaStatusProperty:
|
|
|
314
312
|
|
|
315
313
|
def to_replica_status(self) -> serve_state.ReplicaStatus:
|
|
316
314
|
"""Convert status property to human-readable replica status."""
|
|
317
|
-
|
|
315
|
+
# Backward compatibility. Before we introduce ProcessStatus.SCHEDULED,
|
|
316
|
+
# we use None to represent sky.launch is not called yet.
|
|
317
|
+
if (self.sky_launch_status is None or
|
|
318
|
+
self.sky_launch_status == common_utils.ProcessStatus.SCHEDULED):
|
|
318
319
|
# Pending to launch
|
|
319
320
|
return serve_state.ReplicaStatus.PENDING
|
|
320
|
-
if self.sky_launch_status == ProcessStatus.RUNNING:
|
|
321
|
-
if self.sky_down_status == ProcessStatus.FAILED:
|
|
321
|
+
if self.sky_launch_status == common_utils.ProcessStatus.RUNNING:
|
|
322
|
+
if self.sky_down_status == common_utils.ProcessStatus.FAILED:
|
|
322
323
|
return serve_state.ReplicaStatus.FAILED_CLEANUP
|
|
323
|
-
if self.sky_down_status == ProcessStatus.SUCCEEDED:
|
|
324
|
+
if self.sky_down_status == common_utils.ProcessStatus.SUCCEEDED:
|
|
324
325
|
# This indicate it is a scale_down with correct teardown.
|
|
325
326
|
# Should have been cleaned from the replica table.
|
|
326
327
|
return serve_state.ReplicaStatus.UNKNOWN
|
|
327
328
|
# Still launching
|
|
328
329
|
return serve_state.ReplicaStatus.PROVISIONING
|
|
329
|
-
if self.sky_launch_status == ProcessStatus.INTERRUPTED:
|
|
330
|
+
if self.sky_launch_status == common_utils.ProcessStatus.INTERRUPTED:
|
|
330
331
|
# sky.down is running and a scale down interrupted sky.launch
|
|
331
332
|
return serve_state.ReplicaStatus.SHUTTING_DOWN
|
|
332
333
|
if self.sky_down_status is not None:
|
|
333
334
|
if self.preempted:
|
|
334
335
|
# Replica (spot) is preempted
|
|
335
336
|
return serve_state.ReplicaStatus.PREEMPTED
|
|
336
|
-
if self.sky_down_status == ProcessStatus.
|
|
337
|
+
if self.sky_down_status == common_utils.ProcessStatus.SCHEDULED:
|
|
338
|
+
# sky.down is scheduled to run, but not started yet.
|
|
339
|
+
return serve_state.ReplicaStatus.SHUTTING_DOWN
|
|
340
|
+
if self.sky_down_status == common_utils.ProcessStatus.RUNNING:
|
|
337
341
|
# sky.down is running
|
|
338
342
|
return serve_state.ReplicaStatus.SHUTTING_DOWN
|
|
339
|
-
if self.sky_launch_status == ProcessStatus.INTERRUPTED:
|
|
343
|
+
if self.sky_launch_status == common_utils.ProcessStatus.INTERRUPTED:
|
|
340
344
|
return serve_state.ReplicaStatus.SHUTTING_DOWN
|
|
341
|
-
if self.sky_down_status == ProcessStatus.FAILED:
|
|
345
|
+
if self.sky_down_status == common_utils.ProcessStatus.FAILED:
|
|
342
346
|
# sky.down failed
|
|
343
347
|
return serve_state.ReplicaStatus.FAILED_CLEANUP
|
|
344
348
|
if self.user_app_failed:
|
|
345
349
|
# Failed on user setup/run
|
|
346
350
|
return serve_state.ReplicaStatus.FAILED
|
|
347
|
-
if self.sky_launch_status == ProcessStatus.FAILED:
|
|
351
|
+
if self.sky_launch_status == common_utils.ProcessStatus.FAILED:
|
|
348
352
|
# sky.launch failed
|
|
349
353
|
return serve_state.ReplicaStatus.FAILED_PROVISION
|
|
350
354
|
if self.first_ready_time is None:
|
|
@@ -360,7 +364,7 @@ class ReplicaStatusProperty:
|
|
|
360
364
|
# This indicate it is a scale_down with correct teardown.
|
|
361
365
|
# Should have been cleaned from the replica table.
|
|
362
366
|
return serve_state.ReplicaStatus.UNKNOWN
|
|
363
|
-
if self.sky_launch_status == ProcessStatus.FAILED:
|
|
367
|
+
if self.sky_launch_status == common_utils.ProcessStatus.FAILED:
|
|
364
368
|
# sky.launch failed
|
|
365
369
|
# The down process has not been started if it reaches here,
|
|
366
370
|
# due to the `if self.sky_down_status is not None`` check above.
|
|
@@ -421,11 +425,12 @@ class ReplicaInfo:
|
|
|
421
425
|
based on the cluster name.
|
|
422
426
|
"""
|
|
423
427
|
if cluster_record is None:
|
|
424
|
-
|
|
428
|
+
handle = global_user_state.get_handle_from_cluster_name(
|
|
425
429
|
self.cluster_name)
|
|
426
|
-
|
|
430
|
+
else:
|
|
431
|
+
handle = cluster_record['handle']
|
|
432
|
+
if handle is None:
|
|
427
433
|
return None
|
|
428
|
-
handle = cluster_record['handle']
|
|
429
434
|
assert isinstance(handle, backends.CloudVmRayResourceHandle)
|
|
430
435
|
return handle
|
|
431
436
|
|
|
@@ -442,10 +447,16 @@ class ReplicaInfo:
|
|
|
442
447
|
handle = self.handle()
|
|
443
448
|
if handle is None:
|
|
444
449
|
return None
|
|
450
|
+
if self.replica_port == '-':
|
|
451
|
+
# This is a pool replica so there is no endpoint and it's filled
|
|
452
|
+
# with this dummy value. We return None here so that we can
|
|
453
|
+
# get the active ready replicas and perform autoscaling. Otherwise,
|
|
454
|
+
# would error out when trying to get the endpoint.
|
|
455
|
+
return None
|
|
445
456
|
replica_port_int = int(self.replica_port)
|
|
446
457
|
try:
|
|
447
|
-
endpoint_dict =
|
|
448
|
-
|
|
458
|
+
endpoint_dict = backend_utils.get_endpoints(handle.cluster_name,
|
|
459
|
+
replica_port_int)
|
|
449
460
|
except exceptions.ClusterNotUpError:
|
|
450
461
|
return None
|
|
451
462
|
endpoint = endpoint_dict.get(replica_port_int, None)
|
|
@@ -465,26 +476,36 @@ class ReplicaInfo:
|
|
|
465
476
|
f'replica {self.replica_id}.')
|
|
466
477
|
return replica_status
|
|
467
478
|
|
|
468
|
-
def to_info_dict(self,
|
|
479
|
+
def to_info_dict(self,
|
|
480
|
+
with_handle: bool,
|
|
481
|
+
with_url: bool = True) -> Dict[str, Any]:
|
|
469
482
|
cluster_record = global_user_state.get_cluster_from_name(
|
|
470
|
-
self.cluster_name)
|
|
483
|
+
self.cluster_name, include_user_info=False, summary_response=True)
|
|
471
484
|
info_dict = {
|
|
472
485
|
'replica_id': self.replica_id,
|
|
473
486
|
'name': self.cluster_name,
|
|
474
487
|
'status': self.status,
|
|
475
488
|
'version': self.version,
|
|
476
|
-
'endpoint': self.url,
|
|
489
|
+
'endpoint': self.url if with_url else None,
|
|
477
490
|
'is_spot': self.is_spot,
|
|
478
491
|
'launched_at': (cluster_record['launched_at']
|
|
479
492
|
if cluster_record is not None else None),
|
|
480
493
|
}
|
|
481
494
|
if with_handle:
|
|
482
|
-
|
|
495
|
+
handle = self.handle(cluster_record)
|
|
496
|
+
info_dict['handle'] = handle
|
|
497
|
+
if handle is not None:
|
|
498
|
+
info_dict['cloud'] = repr(handle.launched_resources.cloud)
|
|
499
|
+
info_dict['region'] = handle.launched_resources.region
|
|
500
|
+
info_dict['resources_str'] = (
|
|
501
|
+
resources_utils.get_readable_resources_repr(
|
|
502
|
+
handle, simplified_only=True)[0])
|
|
483
503
|
return info_dict
|
|
484
504
|
|
|
485
505
|
def __repr__(self) -> str:
|
|
486
|
-
|
|
487
|
-
|
|
506
|
+
show_details = env_options.Options.SHOW_DEBUG_INFO.get()
|
|
507
|
+
info_dict = self.to_info_dict(with_handle=show_details,
|
|
508
|
+
with_url=show_details)
|
|
488
509
|
handle_str = ''
|
|
489
510
|
if 'handle' in info_dict:
|
|
490
511
|
handle_str = f', handle={info_dict["handle"]}'
|
|
@@ -498,6 +519,33 @@ class ReplicaInfo:
|
|
|
498
519
|
f'launched_at={info_dict["launched_at"]}{handle_str})')
|
|
499
520
|
return info
|
|
500
521
|
|
|
522
|
+
def probe_pool(self) -> Tuple['ReplicaInfo', bool, float]:
|
|
523
|
+
"""Probe the replica for pool management.
|
|
524
|
+
|
|
525
|
+
This function will check the first job status of the cluster, which is a
|
|
526
|
+
dummy job that only echoes "setup done". The success of this job means
|
|
527
|
+
the setup command is done and the replica is ready to be used. Check
|
|
528
|
+
sky/serve/server/core.py::up for more details.
|
|
529
|
+
|
|
530
|
+
Returns:
|
|
531
|
+
Tuple of (self, is_ready, probe_time).
|
|
532
|
+
"""
|
|
533
|
+
probe_time = time.time()
|
|
534
|
+
try:
|
|
535
|
+
handle = backend_utils.check_cluster_available(
|
|
536
|
+
self.cluster_name, operation='probing pool')
|
|
537
|
+
if handle is None:
|
|
538
|
+
return self, False, probe_time
|
|
539
|
+
backend = backend_utils.get_backend_from_handle(handle)
|
|
540
|
+
statuses = backend.get_job_status(handle, [1], stream_logs=False)
|
|
541
|
+
if statuses[1] == job_lib.JobStatus.SUCCEEDED:
|
|
542
|
+
return self, True, probe_time
|
|
543
|
+
return self, False, probe_time
|
|
544
|
+
except Exception as e: # pylint: disable=broad-except
|
|
545
|
+
logger.error(f'Error when probing pool of {self.cluster_name}: '
|
|
546
|
+
f'{common_utils.format_exception(e)}.')
|
|
547
|
+
return self, False, probe_time
|
|
548
|
+
|
|
501
549
|
def probe(
|
|
502
550
|
self,
|
|
503
551
|
readiness_path: str,
|
|
@@ -587,6 +635,7 @@ class ReplicaManager:
|
|
|
587
635
|
self._service_name: str = service_name
|
|
588
636
|
self._uptime: Optional[float] = None
|
|
589
637
|
self._update_mode = serve_utils.DEFAULT_UPDATE_MODE
|
|
638
|
+
self._is_pool: bool = spec.pool
|
|
590
639
|
header_keys = None
|
|
591
640
|
if spec.readiness_headers is not None:
|
|
592
641
|
header_keys = list(spec.readiness_headers.keys())
|
|
@@ -600,6 +649,15 @@ class ReplicaManager:
|
|
|
600
649
|
# Oldest version among the currently provisioned and launched replicas
|
|
601
650
|
self.least_recent_version: int = serve_constants.INITIAL_VERSION
|
|
602
651
|
|
|
652
|
+
def _consecutive_failure_threshold_timeout(self) -> int:
|
|
653
|
+
"""The timeout for the consecutive failure threshold in seconds.
|
|
654
|
+
|
|
655
|
+
We reduce the timeout for pool to 10 seconds to make the pool more
|
|
656
|
+
responsive to the failure.
|
|
657
|
+
"""
|
|
658
|
+
# TODO(tian): Maybe let user determine this threshold
|
|
659
|
+
return 10 if self._is_pool else 180
|
|
660
|
+
|
|
603
661
|
def scale_up(self,
|
|
604
662
|
resources_override: Optional[Dict[str, Any]] = None) -> None:
|
|
605
663
|
"""Scale up the service by 1 replica with resources_override.
|
|
@@ -634,10 +692,10 @@ class SkyPilotReplicaManager(ReplicaManager):
|
|
|
634
692
|
"""
|
|
635
693
|
|
|
636
694
|
def __init__(self, service_name: str, spec: 'service_spec.SkyServiceSpec',
|
|
637
|
-
|
|
695
|
+
service_task_yaml_path: str) -> None:
|
|
638
696
|
super().__init__(service_name, spec)
|
|
639
|
-
self.
|
|
640
|
-
task =
|
|
697
|
+
self.service_task_yaml_path = service_task_yaml_path
|
|
698
|
+
task = task_lib.Task.from_yaml(service_task_yaml_path)
|
|
641
699
|
self._spot_placer: Optional[spot_placer.SpotPlacer] = (
|
|
642
700
|
spot_placer.SpotPlacer.from_task(spec, task))
|
|
643
701
|
# TODO(tian): Store launch/down pid in the replica table, to make the
|
|
@@ -657,6 +715,7 @@ class SkyPilotReplicaManager(ReplicaManager):
|
|
|
657
715
|
|
|
658
716
|
self._recover_replica_operations()
|
|
659
717
|
|
|
718
|
+
@with_lock
|
|
660
719
|
def _recover_replica_operations(self):
|
|
661
720
|
"""Let's see are there something to do for ReplicaManager in a
|
|
662
721
|
recovery run"""
|
|
@@ -697,9 +756,8 @@ class SkyPilotReplicaManager(ReplicaManager):
|
|
|
697
756
|
# Replica management functions #
|
|
698
757
|
################################
|
|
699
758
|
|
|
700
|
-
#
|
|
701
|
-
#
|
|
702
|
-
@with_lock
|
|
759
|
+
# We don't need to add lock here since every caller of this function
|
|
760
|
+
# will acquire the lock.
|
|
703
761
|
def _launch_replica(
|
|
704
762
|
self,
|
|
705
763
|
replica_id: int,
|
|
@@ -714,7 +772,8 @@ class SkyPilotReplicaManager(ReplicaManager):
|
|
|
714
772
|
self._service_name, replica_id)
|
|
715
773
|
log_file_name = serve_utils.generate_replica_launch_log_file_name(
|
|
716
774
|
self._service_name, replica_id)
|
|
717
|
-
use_spot = _should_use_spot(self.
|
|
775
|
+
use_spot = _should_use_spot(self.service_task_yaml_path,
|
|
776
|
+
resources_override)
|
|
718
777
|
retry_until_up = True
|
|
719
778
|
location = None
|
|
720
779
|
if use_spot and self._spot_placer is not None:
|
|
@@ -742,10 +801,10 @@ class SkyPilotReplicaManager(ReplicaManager):
|
|
|
742
801
|
launch_cluster,
|
|
743
802
|
log_file_name,
|
|
744
803
|
).run,
|
|
745
|
-
args=(replica_id, self.
|
|
804
|
+
args=(replica_id, self.service_task_yaml_path, cluster_name,
|
|
746
805
|
resources_override, retry_until_up),
|
|
747
806
|
)
|
|
748
|
-
replica_port = _get_resources_ports(self.
|
|
807
|
+
replica_port = _get_resources_ports(self.service_task_yaml_path)
|
|
749
808
|
|
|
750
809
|
info = ReplicaInfo(replica_id, cluster_name, replica_port, use_spot,
|
|
751
810
|
location, self.latest_version, resources_override)
|
|
@@ -754,11 +813,61 @@ class SkyPilotReplicaManager(ReplicaManager):
|
|
|
754
813
|
# to avoid too many sky.launch running at the same time.
|
|
755
814
|
self._launch_process_pool[replica_id] = p
|
|
756
815
|
|
|
816
|
+
@with_lock
|
|
757
817
|
def scale_up(self,
|
|
758
818
|
resources_override: Optional[Dict[str, Any]] = None) -> None:
|
|
759
819
|
self._launch_replica(self._next_replica_id, resources_override)
|
|
760
820
|
self._next_replica_id += 1
|
|
761
821
|
|
|
822
|
+
def _handle_sky_down_finish(self, info: ReplicaInfo, exitcode: int) -> None:
|
|
823
|
+
if exitcode != 0:
|
|
824
|
+
logger.error(f'Down process for replica {info.replica_id} '
|
|
825
|
+
f'exited abnormally with code {exitcode}.')
|
|
826
|
+
info.status_property.sky_down_status = (
|
|
827
|
+
common_utils.ProcessStatus.FAILED)
|
|
828
|
+
else:
|
|
829
|
+
info.status_property.sky_down_status = (
|
|
830
|
+
common_utils.ProcessStatus.SUCCEEDED)
|
|
831
|
+
# Failed replica still count as a replica. In our current design, we
|
|
832
|
+
# want to fail early if user code have any error. This will prevent
|
|
833
|
+
# infinite loop of teardown and re-provision. However, there is a
|
|
834
|
+
# special case that if the replica is UP for longer than
|
|
835
|
+
# initial_delay_seconds, we assume it is just some random failure and
|
|
836
|
+
# we should restart the replica. Please refer to the implementation of
|
|
837
|
+
# `is_scale_down_succeeded` for more details.
|
|
838
|
+
# TODO(tian): Currently, restart replicas that failed within
|
|
839
|
+
# initial_delay_seconds is not supported. We should add it
|
|
840
|
+
# later when we support `sky serve update`.
|
|
841
|
+
removal_reason = None
|
|
842
|
+
if info.status_property.is_scale_down:
|
|
843
|
+
# This means the cluster is deleted due to an autoscaler
|
|
844
|
+
# decision or the cluster is recovering from preemption.
|
|
845
|
+
# Delete the replica info so it won't count as a replica.
|
|
846
|
+
if info.status_property.preempted:
|
|
847
|
+
removal_reason = 'for preemption recovery'
|
|
848
|
+
else:
|
|
849
|
+
removal_reason = 'normally'
|
|
850
|
+
# Don't keep failed record for version mismatch replicas,
|
|
851
|
+
# since user should fixed the error before update.
|
|
852
|
+
elif info.version != self.latest_version:
|
|
853
|
+
removal_reason = 'for version outdated'
|
|
854
|
+
elif info.status_property.purged:
|
|
855
|
+
removal_reason = 'for purge'
|
|
856
|
+
elif info.status_property.failed_spot_availability:
|
|
857
|
+
removal_reason = 'for spot availability failure'
|
|
858
|
+
else:
|
|
859
|
+
logger.info(f'Termination of replica {info.replica_id} '
|
|
860
|
+
'finished. Replica info is kept since some '
|
|
861
|
+
'failure detected.')
|
|
862
|
+
serve_state.add_or_update_replica(self._service_name,
|
|
863
|
+
info.replica_id, info)
|
|
864
|
+
if removal_reason is not None:
|
|
865
|
+
serve_state.remove_replica(self._service_name, info.replica_id)
|
|
866
|
+
logger.info(f'Replica {info.replica_id} removed from the '
|
|
867
|
+
f'replica table {removal_reason}.')
|
|
868
|
+
|
|
869
|
+
# We don't need to add lock here since every caller of this function
|
|
870
|
+
# will acquire the lock.
|
|
762
871
|
def _terminate_replica(self,
|
|
763
872
|
replica_id: int,
|
|
764
873
|
sync_down_logs: bool,
|
|
@@ -776,7 +885,8 @@ class SkyPilotReplicaManager(ReplicaManager):
|
|
|
776
885
|
info = serve_state.get_replica_info_from_id(self._service_name,
|
|
777
886
|
replica_id)
|
|
778
887
|
assert info is not None
|
|
779
|
-
info.status_property.sky_launch_status =
|
|
888
|
+
info.status_property.sky_launch_status = (
|
|
889
|
+
common_utils.ProcessStatus.INTERRUPTED)
|
|
780
890
|
serve_state.add_or_update_replica(self._service_name, replica_id,
|
|
781
891
|
info)
|
|
782
892
|
launch_process = self._launch_process_pool[replica_id]
|
|
@@ -820,9 +930,9 @@ class SkyPilotReplicaManager(ReplicaManager):
|
|
|
820
930
|
assert isinstance(handle, backends.CloudVmRayResourceHandle)
|
|
821
931
|
replica_job_logs_dir = os.path.join(constants.SKY_LOGS_DIRECTORY,
|
|
822
932
|
'replica_jobs')
|
|
823
|
-
|
|
824
|
-
|
|
825
|
-
|
|
933
|
+
job_ids = ['1'] if self._is_pool else None
|
|
934
|
+
job_log_file_name = controller_utils.download_and_stream_job_log(
|
|
935
|
+
backend, handle, replica_job_logs_dir, job_ids)
|
|
826
936
|
if job_log_file_name is not None:
|
|
827
937
|
logger.info(f'\n== End of logs (Replica: {replica_id}) ==')
|
|
828
938
|
with open(log_file_name, 'a',
|
|
@@ -848,18 +958,30 @@ class SkyPilotReplicaManager(ReplicaManager):
|
|
|
848
958
|
|
|
849
959
|
logger.info(f'preempted: {info.status_property.preempted}, '
|
|
850
960
|
f'replica_id: {replica_id}')
|
|
961
|
+
info.status_property.is_scale_down = is_scale_down
|
|
962
|
+
info.status_property.purged = purge
|
|
963
|
+
|
|
964
|
+
# If the cluster does not exist, it means either the cluster never
|
|
965
|
+
# exists (e.g., the cluster is scaled down before it gets a chance to
|
|
966
|
+
# provision) or the cluster is preempted and cleaned up by the status
|
|
967
|
+
# refresh. In this case, we skip spawning a new down process to save
|
|
968
|
+
# controller resources.
|
|
969
|
+
if not global_user_state.cluster_with_name_exists(info.cluster_name):
|
|
970
|
+
self._handle_sky_down_finish(info, exitcode=0)
|
|
971
|
+
return
|
|
972
|
+
|
|
973
|
+
# Otherwise, start the process to terminate the cluster.
|
|
851
974
|
p = multiprocessing.Process(
|
|
852
975
|
target=ux_utils.RedirectOutputForProcess(terminate_cluster,
|
|
853
976
|
log_file_name, 'a').run,
|
|
854
977
|
args=(info.cluster_name, replica_drain_delay_seconds),
|
|
855
978
|
)
|
|
856
|
-
info.status_property.sky_down_status =
|
|
857
|
-
|
|
858
|
-
info.status_property.purged = purge
|
|
979
|
+
info.status_property.sky_down_status = (
|
|
980
|
+
common_utils.ProcessStatus.SCHEDULED)
|
|
859
981
|
serve_state.add_or_update_replica(self._service_name, replica_id, info)
|
|
860
|
-
p.start()
|
|
861
982
|
self._down_process_pool[replica_id] = p
|
|
862
983
|
|
|
984
|
+
@with_lock
|
|
863
985
|
def scale_down(self, replica_id: int, purge: bool = False) -> None:
|
|
864
986
|
self._terminate_replica(
|
|
865
987
|
replica_id,
|
|
@@ -868,6 +990,8 @@ class SkyPilotReplicaManager(ReplicaManager):
|
|
|
868
990
|
is_scale_down=True,
|
|
869
991
|
purge=purge)
|
|
870
992
|
|
|
993
|
+
# We don't need to add lock here since every caller of this function
|
|
994
|
+
# will acquire the lock.
|
|
871
995
|
def _handle_preemption(self, info: ReplicaInfo) -> bool:
|
|
872
996
|
"""Handle preemption of the replica if any error happened.
|
|
873
997
|
|
|
@@ -930,18 +1054,19 @@ class SkyPilotReplicaManager(ReplicaManager):
|
|
|
930
1054
|
# To avoid `dictionary changed size during iteration` error.
|
|
931
1055
|
launch_process_pool_snapshot = list(self._launch_process_pool.items())
|
|
932
1056
|
for replica_id, p in launch_process_pool_snapshot:
|
|
933
|
-
if
|
|
1057
|
+
if p.is_alive():
|
|
1058
|
+
continue
|
|
1059
|
+
with filelock.FileLock(controller_utils.get_resources_lock_path()):
|
|
934
1060
|
info = serve_state.get_replica_info_from_id(
|
|
935
1061
|
self._service_name, replica_id)
|
|
936
1062
|
assert info is not None, replica_id
|
|
937
1063
|
error_in_sky_launch = False
|
|
938
1064
|
if info.status == serve_state.ReplicaStatus.PENDING:
|
|
939
1065
|
# sky.launch not started yet
|
|
940
|
-
if
|
|
941
|
-
_MAX_NUM_LAUNCH):
|
|
1066
|
+
if controller_utils.can_provision():
|
|
942
1067
|
p.start()
|
|
943
1068
|
info.status_property.sky_launch_status = (
|
|
944
|
-
ProcessStatus.RUNNING)
|
|
1069
|
+
common_utils.ProcessStatus.RUNNING)
|
|
945
1070
|
else:
|
|
946
1071
|
# sky.launch finished
|
|
947
1072
|
# TODO(tian): Try-catch in process, and have an enum return
|
|
@@ -958,11 +1083,11 @@ class SkyPilotReplicaManager(ReplicaManager):
|
|
|
958
1083
|
f'exited abnormally with code {p.exitcode}.'
|
|
959
1084
|
' Terminating...')
|
|
960
1085
|
info.status_property.sky_launch_status = (
|
|
961
|
-
ProcessStatus.FAILED)
|
|
1086
|
+
common_utils.ProcessStatus.FAILED)
|
|
962
1087
|
error_in_sky_launch = True
|
|
963
1088
|
else:
|
|
964
1089
|
info.status_property.sky_launch_status = (
|
|
965
|
-
ProcessStatus.SUCCEEDED)
|
|
1090
|
+
common_utils.ProcessStatus.SUCCEEDED)
|
|
966
1091
|
if self._spot_placer is not None and info.is_spot:
|
|
967
1092
|
# TODO(tian): Currently, we set the location to
|
|
968
1093
|
# preemptive if the launch process failed. This is
|
|
@@ -990,59 +1115,25 @@ class SkyPilotReplicaManager(ReplicaManager):
                 replica_drain_delay_seconds=0)
         down_process_pool_snapshot = list(self._down_process_pool.items())
         for replica_id, p in down_process_pool_snapshot:
-            if
-
-
-
-
-
-
-
-
-                info.status_property.sky_down_status = (
-                    ProcessStatus.FAILED)
-            else:
+            if p.is_alive():
+                continue
+            info = serve_state.get_replica_info_from_id(self._service_name,
+                                                        replica_id)
+            assert info is not None, replica_id
+            if (info.status_property.sky_down_status ==
+                    common_utils.ProcessStatus.SCHEDULED):
+                # sky.down not started yet
+                if controller_utils.can_terminate():
+                    p.start()
                 info.status_property.sky_down_status = (
-                    ProcessStatus.
-                # Failed replica still count as a replica. In our current
-                # design, we want to fail early if user code have any error.
-                # This will prevent infinite loop of teardown and
-                # re-provision. However, there is a special case that if the
-                # replica is UP for longer than initial_delay_seconds, we
-                # assume it is just some random failure and we should restart
-                # the replica. Please refer to the implementation of
-                # `is_scale_down_succeeded` for more details.
-                # TODO(tian): Currently, restart replicas that failed within
-                # initial_delay_seconds is not supported. We should add it
-                # later when we support `sky serve update`.
-                removal_reason = None
-                if info.status_property.is_scale_down:
-                    # This means the cluster is deleted due to an autoscaler
-                    # decision or the cluster is recovering from preemption.
-                    # Delete the replica info so it won't count as a replica.
-                    if info.status_property.preempted:
-                        removal_reason = 'for preemption recovery'
-                    else:
-                        removal_reason = 'normally'
-                # Don't keep failed record for version mismatch replicas,
-                # since user should fixed the error before update.
-                elif info.version != self.latest_version:
-                    removal_reason = 'for version outdated'
-                elif info.status_property.purged:
-                    removal_reason = 'for purge'
-                elif info.status_property.failed_spot_availability:
-                    removal_reason = 'for spot availability failure'
-                else:
-                    logger.info(f'Termination of replica {replica_id} '
-                                'finished. Replica info is kept since some '
-                                'failure detected.')
+                    common_utils.ProcessStatus.RUNNING)
                 serve_state.add_or_update_replica(self._service_name,
                                                   replica_id, info)
-
-
-
-
+            else:
+                logger.info(
+                    f'Terminate process for replica {replica_id} finished.')
+                del self._down_process_pool[replica_id]
+                self._handle_sky_down_finish(info, exitcode=p.exitcode)

         # Clean old version
         replica_infos = serve_state.get_replica_infos(self._service_name)
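The rewritten termination loop above splits the down-process lifecycle into three cases: a process that is still alive is skipped, a SCHEDULED one is started only when the controller can take another termination, and a finished one is removed from the pool and handed to _handle_sky_down_finish. Below is a minimal, self-contained sketch of that control flow, assuming plain string states and a have_slot callable standing in for the scheduler check (controller_utils.can_terminate() in the hunk); the helper names here are hypothetical, not SkyPilot APIs. Creating the Process object up front but deferring start() is what lets the controller cap how many terminations run concurrently.

import multiprocessing
import time
from typing import Callable, Dict

# Hypothetical stand-ins for the states used in the hunk above.
SCHEDULED, RUNNING, DONE = 'SCHEDULED', 'RUNNING', 'DONE'


def _noop_down(replica_id: int) -> None:
    # Placeholder for the real sky.down work.
    time.sleep(0.1)


def refresh_down_pool(pool: Dict[int, multiprocessing.Process],
                      status: Dict[int, str],
                      have_slot: Callable[[], bool]) -> None:
    """One pass over the termination pool, mirroring the new control flow."""
    for replica_id, proc in list(pool.items()):
        if proc.is_alive():
            continue  # still terminating; check again on the next tick
        if status[replica_id] == SCHEDULED:
            # Process object exists but was never started; start it only
            # when the controller has capacity for another termination.
            if have_slot():
                proc.start()
                status[replica_id] = RUNNING
        else:
            # Process finished: drop it from the pool and record the result.
            del pool[replica_id]
            status[replica_id] = DONE
            print(f'replica {replica_id} torn down, exitcode={proc.exitcode}')


if __name__ == '__main__':
    procs = {7: multiprocessing.Process(target=_noop_down, args=(7,))}
    states = {7: SCHEDULED}
    refresh_down_pool(procs, states, have_slot=lambda: True)  # starts it
    procs[7].join()
    refresh_down_pool(procs, states, have_slot=lambda: True)  # reaps it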
@@ -1098,9 +1189,10 @@ class SkyPilotReplicaManager(ReplicaManager):
             handle = info.handle()
             assert handle is not None, info
             # Use None to fetch latest job, which stands for user task job
+            job_ids = [1] if self._is_pool else None
             try:
                 job_statuses = backend.get_job_status(handle,
-
+                                                      job_ids,
                                                       stream_logs=False)
             except exceptions.CommandError:
                 # If the job status fetch failed, it is likely that the
@@ -1110,7 +1202,8 @@ class SkyPilotReplicaManager(ReplicaManager):
                     continue
                 # Re-raise the exception if it is not preempted.
                 raise
-            job_status =
+            job_status = job_statuses[1] if self._is_pool else list(
+                job_statuses.values())[0]
             if job_status in job_lib.JobStatus.user_code_failure_states():
                 info.status_property.user_app_failed = True
                 serve_state.add_or_update_replica(self._service_name,
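For pools, the controller now queries job id 1 (the single worker job on each replica cluster) instead of passing None for the latest job, and indexes the returned statuses accordingly. A small sketch of that selection logic, assuming get_job_status returns a mapping from job id to status (inferred from how the hunk indexes job_statuses, not from the backend's documentation):

from typing import Dict, List, Optional


def job_ids_to_query(is_pool: bool) -> Optional[List[int]]:
    # Mirrors: job_ids = [1] if self._is_pool else None
    return [1] if is_pool else None


def pick_job_status(job_statuses: Dict[int, str], is_pool: bool) -> str:
    """Select the job status the way the hunk does.

    For pools the status of job id 1 is used; for regular services the
    single entry for the latest job (queried with job_ids=None) is used.
    """
    if is_pool:
        return job_statuses[1]
    return list(job_statuses.values())[0]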
@@ -1154,18 +1247,24 @@ class SkyPilotReplicaManager(ReplicaManager):
             for info in infos:
                 if not info.status_property.should_track_service_status():
                     continue
-
-                    f'replica_{info.replica_id}(
-
-                pool.apply_async(
-
-
-
-
-
-
-
+                if self._is_pool:
+                    replica_to_probe.append(f'replica_{info.replica_id}(cluster'
+                                            f'_name={info.cluster_name})')
+                    probe_futures.append(pool.apply_async(info.probe_pool))
+                else:
+                    replica_to_probe.append(
+                        f'replica_{info.replica_id}(url={info.url})')
+                    probe_futures.append(
+                        pool.apply_async(
+                            info.probe,
+                            (
+                                self._get_readiness_path(info.version),
+                                self._get_post_data(info.version),
+                                self._get_readiness_timeout_seconds(
+                                    info.version),
+                                self._get_readiness_headers(info.version),
+                            ),
+                        ),)
             logger.info(f'Replicas to probe: {", ".join(replica_to_probe)}')

             # Since futures.as_completed will return futures in the order of
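This hunk makes the readiness check pool-aware: worker-pool replicas are probed through info.probe_pool, while regular service replicas keep the HTTP readiness probe parameterized by path, POST data, timeout, and headers. The sketch below shows only the dispatch pattern (apply_async fan-out, then gather); the probe callables are hypothetical placeholders, not the ReplicaInfo methods:

from multiprocessing import pool as mp_pool
from typing import List, Tuple


def http_probe(path: str, timeout: int) -> Tuple[int, bool]:
    # Hypothetical HTTP readiness probe; the real code issues a request to
    # the replica URL and checks the response.
    return (0, True)


def cluster_probe() -> Tuple[int, bool]:
    # Hypothetical pool probe; the real code inspects the worker cluster.
    return (0, True)


def dispatch_probes(is_pool: bool, n_replicas: int) -> List[Tuple[int, bool]]:
    """Fan out probes asynchronously and gather the results."""
    with mp_pool.ThreadPool(processes=4) as tp:
        futures = []
        for _ in range(n_replicas):
            if is_pool:
                futures.append(tp.apply_async(cluster_probe))
            else:
                futures.append(tp.apply_async(http_probe, ('/health', 10)))
        return [f.get() for f in futures]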
@@ -1202,8 +1301,9 @@ class SkyPilotReplicaManager(ReplicaManager):
                     consecutive_failure_time = (
                         info.consecutive_failure_times[-1] -
                         info.consecutive_failure_times[0])
-
-
+                    failure_threshold = (
+                        self._consecutive_failure_threshold_timeout())
+                    if consecutive_failure_time >= failure_threshold:
                         logger.info(
                             f'Replica {info.replica_id} is not ready for '
                             'too long and exceeding consecutive failure '
@@ -1214,8 +1314,7 @@ class SkyPilotReplicaManager(ReplicaManager):
                             f'Replica {info.replica_id} is not ready '
                             'but within consecutive failure threshold '
                             f'({consecutive_failure_time}s / '
-                            f'{
-                            'Skipping.')
+                            f'{failure_threshold}s). Skipping.')
                     else:
                         initial_delay_seconds = self._get_initial_delay_seconds(
                             info.version)
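The probe-failure handling now names the threshold explicitly: the failure window is the span between the first and last consecutive failure timestamps, and the replica is only torn down once that window reaches the value returned by _consecutive_failure_threshold_timeout(). A minimal sketch of the comparison with hypothetical numbers:

from typing import List


def should_terminate(consecutive_failure_times: List[float],
                     failure_threshold: float) -> bool:
    """Decide whether a failing replica's streak is long enough (seconds)
    to terminate it, as in the hunk above."""
    if not consecutive_failure_times:
        return False
    window = consecutive_failure_times[-1] - consecutive_failure_times[0]
    return window >= failure_threshold


# Example: failures recorded at t=0s, 60s, 130s with a 120s threshold.
assert should_terminate([0.0, 60.0, 130.0], failure_threshold=120.0)
assert not should_terminate([0.0, 60.0], failure_threshold=120.0)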
@@ -1290,11 +1389,11 @@ class SkyPilotReplicaManager(ReplicaManager):
             logger.error(f'Invalid version: {version}, '
                          f'latest version: {self.latest_version}')
             return
-
+        service_task_yaml_path = serve_utils.generate_task_yaml_file_name(
             self._service_name, version)
         serve_state.add_or_update_version(self._service_name, version, spec)
         self.latest_version = version
-        self.
+        self.service_task_yaml_path = service_task_yaml_path
         self._update_mode = update_mode

         # Reuse all replicas that have the same config as the new version
@@ -1302,32 +1401,40 @@ class SkyPilotReplicaManager(ReplicaManager):
         # the latest version. This can significantly improve the speed
         # for updating an existing service with only config changes to the
         # service specs, e.g. scale down the service.
-        new_config =
+        new_config = yaml_utils.read_yaml(
+            os.path.expanduser(service_task_yaml_path))
         # Always create new replicas and scale down old ones when file_mounts
         # are not empty.
         if new_config.get('file_mounts', None) != {}:
             return
-        for key in ['service']:
-            new_config.pop(key)
+        for key in ['service', 'pool', '_user_specified_yaml']:
+            new_config.pop(key, None)
+        new_config_any_of = new_config.get('resources', {}).pop('any_of', [])
+
         replica_infos = serve_state.get_replica_infos(self._service_name)
         for info in replica_infos:
             if info.version < version and not info.is_terminal:
                 # Assume user does not change the yaml file on the controller.
-
-
-
-
-
-
+                old_service_task_yaml_path = (
+                    serve_utils.generate_task_yaml_file_name(
+                        self._service_name, info.version))
+                old_config = yaml_utils.read_yaml(
+                    os.path.expanduser(old_service_task_yaml_path))
+                for key in ['service', 'pool', '_user_specified_yaml']:
+                    old_config.pop(key, None)
                 # Bump replica version if all fields except for service are
                 # the same.
                 # Here, we manually convert the any_of field to a set to avoid
                 # only the difference in the random order of the any_of fields.
                 old_config_any_of = old_config.get('resources',
                                                    {}).pop('any_of', [])
-
-
-
+
+                if (resources_utils.normalize_any_of_resources_config(
+                        old_config_any_of) != resources_utils.
+                        normalize_any_of_resources_config(new_config_any_of)):
+                    logger.info('Replica config changed (any_of), skipping. '
+                                f'old: {old_config_any_of}, '
+                                f'new: {new_config_any_of}')
                     continue
                 # File mounts should both be empty, as update always
                 # create new buckets if they are not empty.
@@ -1341,6 +1448,10 @@ class SkyPilotReplicaManager(ReplicaManager):
                 info.version = version
                 serve_state.add_or_update_replica(self._service_name,
                                                   info.replica_id, info)
+            else:
+                logger.info('Replica config changed (rest), skipping. '
+                            f'old: {old_config}, '
+                            f'new: {new_config}')

     def _get_version_spec(self, version: int) -> 'service_spec.SkyServiceSpec':
         spec = serve_state.get_spec(self._service_name, version)
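The version-reuse check above decides whether an existing replica can simply be bumped to the new version: it compares the old and new task configs after dropping the controller-only keys ('service', 'pool', '_user_specified_yaml') and after normalizing resources.any_of so that ordering differences alone do not force a new replica. A minimal sketch of that comparison on plain dicts; _normalize_any_of here is a hypothetical stand-in for resources_utils.normalize_any_of_resources_config, whose real behavior is not shown in this diff:

import copy
from typing import Any, Dict, List

_CONTROLLER_ONLY_KEYS = ('service', 'pool', '_user_specified_yaml')


def _normalize_any_of(any_of: List[Dict[str, Any]]) -> List[tuple]:
    # Hypothetical normalization: make the comparison order-insensitive by
    # sorting a canonical representation of each entry.
    return sorted(tuple(sorted(entry.items())) for entry in any_of)


def can_reuse_replica(old_task: Dict[str, Any],
                      new_task: Dict[str, Any]) -> bool:
    old = copy.deepcopy(old_task)
    new = copy.deepcopy(new_task)
    for key in _CONTROLLER_ONLY_KEYS:
        old.pop(key, None)
        new.pop(key, None)
    old_any_of = old.get('resources', {}).pop('any_of', [])
    new_any_of = new.get('resources', {}).pop('any_of', [])
    if _normalize_any_of(old_any_of) != _normalize_any_of(new_any_of):
        return False  # any_of changed -> launch a fresh replica
    return old == new  # everything else must match exactly


# Example: the same resources listed in a different any_of order still match.
old = {'resources': {'any_of': [{'cloud': 'aws'}, {'cloud': 'gcp'}]},
       'service': {'replicas': 2}}
new = {'resources': {'any_of': [{'cloud': 'gcp'}, {'cloud': 'aws'}]},
       'pool': True}
assert can_reuse_replica(old, new)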