skypilot-nightly 1.0.0.dev20250502__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +22 -6
- sky/adaptors/aws.py +81 -16
- sky/adaptors/common.py +25 -2
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/do.py +8 -2
- sky/adaptors/gcp.py +11 -0
- sky/adaptors/hyperbolic.py +8 -0
- sky/adaptors/ibm.py +5 -2
- sky/adaptors/kubernetes.py +149 -18
- sky/adaptors/nebius.py +173 -30
- sky/adaptors/primeintellect.py +1 -0
- sky/adaptors/runpod.py +68 -0
- sky/adaptors/seeweb.py +183 -0
- sky/adaptors/shadeform.py +89 -0
- sky/admin_policy.py +187 -4
- sky/authentication.py +179 -225
- sky/backends/__init__.py +4 -2
- sky/backends/backend.py +22 -9
- sky/backends/backend_utils.py +1323 -397
- sky/backends/cloud_vm_ray_backend.py +1749 -1029
- sky/backends/docker_utils.py +1 -1
- sky/backends/local_docker_backend.py +11 -6
- sky/backends/task_codegen.py +633 -0
- sky/backends/wheel_utils.py +55 -9
- sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
- sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
- sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
- sky/{clouds/service_catalog → catalog}/common.py +90 -49
- sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
- sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +116 -80
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +70 -16
- sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
- sky/catalog/data_fetchers/fetch_nebius.py +338 -0
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
- sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
- sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
- sky/catalog/hyperbolic_catalog.py +136 -0
- sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
- sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
- sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
- sky/catalog/primeintellect_catalog.py +95 -0
- sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
- sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
- sky/catalog/seeweb_catalog.py +184 -0
- sky/catalog/shadeform_catalog.py +165 -0
- sky/catalog/ssh_catalog.py +167 -0
- sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
- sky/check.py +533 -185
- sky/cli.py +5 -5975
- sky/client/{cli.py → cli/command.py} +2591 -1956
- sky/client/cli/deprecation_utils.py +99 -0
- sky/client/cli/flags.py +359 -0
- sky/client/cli/table_utils.py +322 -0
- sky/client/cli/utils.py +79 -0
- sky/client/common.py +78 -32
- sky/client/oauth.py +82 -0
- sky/client/sdk.py +1219 -319
- sky/client/sdk_async.py +827 -0
- sky/client/service_account_auth.py +47 -0
- sky/cloud_stores.py +82 -3
- sky/clouds/__init__.py +13 -0
- sky/clouds/aws.py +564 -164
- sky/clouds/azure.py +105 -83
- sky/clouds/cloud.py +140 -40
- sky/clouds/cudo.py +68 -50
- sky/clouds/do.py +66 -48
- sky/clouds/fluidstack.py +63 -44
- sky/clouds/gcp.py +339 -110
- sky/clouds/hyperbolic.py +293 -0
- sky/clouds/ibm.py +70 -49
- sky/clouds/kubernetes.py +570 -162
- sky/clouds/lambda_cloud.py +74 -54
- sky/clouds/nebius.py +210 -81
- sky/clouds/oci.py +88 -66
- sky/clouds/paperspace.py +61 -44
- sky/clouds/primeintellect.py +317 -0
- sky/clouds/runpod.py +164 -74
- sky/clouds/scp.py +89 -86
- sky/clouds/seeweb.py +477 -0
- sky/clouds/shadeform.py +400 -0
- sky/clouds/ssh.py +263 -0
- sky/clouds/utils/aws_utils.py +10 -4
- sky/clouds/utils/gcp_utils.py +87 -11
- sky/clouds/utils/oci_utils.py +38 -14
- sky/clouds/utils/scp_utils.py +231 -167
- sky/clouds/vast.py +99 -77
- sky/clouds/vsphere.py +51 -40
- sky/core.py +375 -173
- sky/dag.py +15 -0
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
- sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
- sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
- sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
- sky/dashboard/out/_next/static/chunks/3800-7b45f9fbb6308557.js +1 -0
- sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
- sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
- sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
- sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
- sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
- sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
- sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
- sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
- sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
- sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
- sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
- sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
- sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
- sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
- sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
- sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-84a40f8c7c627fe4.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces-531b2f8c4bf89f82.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
- sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -0
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -0
- sky/dashboard/out/infra.html +1 -0
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -0
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -0
- sky/dashboard/out/volumes.html +1 -0
- sky/dashboard/out/workspace/new.html +1 -0
- sky/dashboard/out/workspaces/[name].html +1 -0
- sky/dashboard/out/workspaces.html +1 -0
- sky/data/data_utils.py +137 -1
- sky/data/mounting_utils.py +269 -84
- sky/data/storage.py +1460 -1807
- sky/data/storage_utils.py +43 -57
- sky/exceptions.py +126 -2
- sky/execution.py +216 -63
- sky/global_user_state.py +2390 -586
- sky/jobs/__init__.py +7 -0
- sky/jobs/client/sdk.py +300 -58
- sky/jobs/client/sdk_async.py +161 -0
- sky/jobs/constants.py +15 -8
- sky/jobs/controller.py +848 -275
- sky/jobs/file_content_utils.py +128 -0
- sky/jobs/log_gc.py +193 -0
- sky/jobs/recovery_strategy.py +402 -152
- sky/jobs/scheduler.py +314 -189
- sky/jobs/server/core.py +836 -255
- sky/jobs/server/server.py +156 -115
- sky/jobs/server/utils.py +136 -0
- sky/jobs/state.py +2109 -706
- sky/jobs/utils.py +1306 -215
- sky/logs/__init__.py +21 -0
- sky/logs/agent.py +108 -0
- sky/logs/aws.py +243 -0
- sky/logs/gcp.py +91 -0
- sky/metrics/__init__.py +0 -0
- sky/metrics/utils.py +453 -0
- sky/models.py +78 -1
- sky/optimizer.py +164 -70
- sky/provision/__init__.py +90 -4
- sky/provision/aws/config.py +147 -26
- sky/provision/aws/instance.py +136 -50
- sky/provision/azure/instance.py +11 -6
- sky/provision/common.py +13 -1
- sky/provision/cudo/cudo_machine_type.py +1 -1
- sky/provision/cudo/cudo_utils.py +14 -8
- sky/provision/cudo/cudo_wrapper.py +72 -71
- sky/provision/cudo/instance.py +10 -6
- sky/provision/do/instance.py +10 -6
- sky/provision/do/utils.py +4 -3
- sky/provision/docker_utils.py +140 -33
- sky/provision/fluidstack/instance.py +13 -8
- sky/provision/gcp/__init__.py +1 -0
- sky/provision/gcp/config.py +301 -19
- sky/provision/gcp/constants.py +218 -0
- sky/provision/gcp/instance.py +36 -8
- sky/provision/gcp/instance_utils.py +18 -4
- sky/provision/gcp/volume_utils.py +247 -0
- sky/provision/hyperbolic/__init__.py +12 -0
- sky/provision/hyperbolic/config.py +10 -0
- sky/provision/hyperbolic/instance.py +437 -0
- sky/provision/hyperbolic/utils.py +373 -0
- sky/provision/instance_setup.py +101 -20
- sky/provision/kubernetes/__init__.py +5 -0
- sky/provision/kubernetes/config.py +9 -52
- sky/provision/kubernetes/constants.py +17 -0
- sky/provision/kubernetes/instance.py +919 -280
- sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
- sky/provision/kubernetes/network.py +27 -17
- sky/provision/kubernetes/network_utils.py +44 -43
- sky/provision/kubernetes/utils.py +1221 -534
- sky/provision/kubernetes/volume.py +343 -0
- sky/provision/lambda_cloud/instance.py +22 -16
- sky/provision/nebius/constants.py +50 -0
- sky/provision/nebius/instance.py +19 -6
- sky/provision/nebius/utils.py +237 -137
- sky/provision/oci/instance.py +10 -5
- sky/provision/paperspace/instance.py +10 -7
- sky/provision/paperspace/utils.py +1 -1
- sky/provision/primeintellect/__init__.py +10 -0
- sky/provision/primeintellect/config.py +11 -0
- sky/provision/primeintellect/instance.py +454 -0
- sky/provision/primeintellect/utils.py +398 -0
- sky/provision/provisioner.py +117 -36
- sky/provision/runpod/__init__.py +5 -0
- sky/provision/runpod/instance.py +27 -6
- sky/provision/runpod/utils.py +51 -18
- sky/provision/runpod/volume.py +214 -0
- sky/provision/scp/__init__.py +15 -0
- sky/provision/scp/config.py +93 -0
- sky/provision/scp/instance.py +707 -0
- sky/provision/seeweb/__init__.py +11 -0
- sky/provision/seeweb/config.py +13 -0
- sky/provision/seeweb/instance.py +812 -0
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/ssh/__init__.py +18 -0
- sky/provision/vast/instance.py +13 -8
- sky/provision/vast/utils.py +10 -7
- sky/provision/volume.py +164 -0
- sky/provision/vsphere/common/ssl_helper.py +1 -1
- sky/provision/vsphere/common/vapiconnect.py +2 -1
- sky/provision/vsphere/common/vim_utils.py +4 -4
- sky/provision/vsphere/instance.py +15 -10
- sky/provision/vsphere/vsphere_utils.py +17 -20
- sky/py.typed +0 -0
- sky/resources.py +845 -119
- sky/schemas/__init__.py +0 -0
- sky/schemas/api/__init__.py +0 -0
- sky/schemas/api/responses.py +227 -0
- sky/schemas/db/README +4 -0
- sky/schemas/db/env.py +90 -0
- sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
- sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
- sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
- sky/schemas/db/global_user_state/004_is_managed.py +34 -0
- sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
- sky/schemas/db/global_user_state/006_provision_log.py +41 -0
- sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
- sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
- sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
- sky/schemas/db/script.py.mako +28 -0
- sky/schemas/db/serve_state/001_initial_schema.py +67 -0
- sky/schemas/db/serve_state/002_yaml_content.py +34 -0
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
- sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
- sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
- sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
- sky/schemas/generated/__init__.py +0 -0
- sky/schemas/generated/autostopv1_pb2.py +36 -0
- sky/schemas/generated/autostopv1_pb2.pyi +43 -0
- sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
- sky/schemas/generated/jobsv1_pb2.py +86 -0
- sky/schemas/generated/jobsv1_pb2.pyi +254 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
- sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
- sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
- sky/schemas/generated/servev1_pb2.py +58 -0
- sky/schemas/generated/servev1_pb2.pyi +115 -0
- sky/schemas/generated/servev1_pb2_grpc.py +322 -0
- sky/serve/autoscalers.py +357 -5
- sky/serve/client/impl.py +310 -0
- sky/serve/client/sdk.py +47 -139
- sky/serve/client/sdk_async.py +130 -0
- sky/serve/constants.py +12 -9
- sky/serve/controller.py +68 -17
- sky/serve/load_balancer.py +106 -60
- sky/serve/load_balancing_policies.py +116 -2
- sky/serve/replica_managers.py +434 -249
- sky/serve/serve_rpc_utils.py +179 -0
- sky/serve/serve_state.py +569 -257
- sky/serve/serve_utils.py +775 -265
- sky/serve/server/core.py +66 -711
- sky/serve/server/impl.py +1093 -0
- sky/serve/server/server.py +21 -18
- sky/serve/service.py +192 -89
- sky/serve/service_spec.py +144 -20
- sky/serve/spot_placer.py +3 -0
- sky/server/auth/__init__.py +0 -0
- sky/server/auth/authn.py +50 -0
- sky/server/auth/loopback.py +38 -0
- sky/server/auth/oauth2_proxy.py +202 -0
- sky/server/common.py +478 -182
- sky/server/config.py +85 -23
- sky/server/constants.py +44 -6
- sky/server/daemons.py +295 -0
- sky/server/html/token_page.html +185 -0
- sky/server/metrics.py +160 -0
- sky/server/middleware_utils.py +166 -0
- sky/server/requests/executor.py +558 -138
- sky/server/requests/payloads.py +364 -24
- sky/server/requests/preconditions.py +21 -17
- sky/server/requests/process.py +112 -29
- sky/server/requests/request_names.py +121 -0
- sky/server/requests/requests.py +822 -226
- sky/server/requests/serializers/decoders.py +82 -31
- sky/server/requests/serializers/encoders.py +140 -22
- sky/server/requests/threads.py +117 -0
- sky/server/rest.py +455 -0
- sky/server/server.py +1309 -285
- sky/server/state.py +20 -0
- sky/server/stream_utils.py +327 -61
- sky/server/uvicorn.py +217 -3
- sky/server/versions.py +270 -0
- sky/setup_files/MANIFEST.in +11 -1
- sky/setup_files/alembic.ini +160 -0
- sky/setup_files/dependencies.py +139 -31
- sky/setup_files/setup.py +44 -42
- sky/sky_logging.py +114 -7
- sky/skylet/attempt_skylet.py +106 -24
- sky/skylet/autostop_lib.py +129 -8
- sky/skylet/configs.py +29 -20
- sky/skylet/constants.py +216 -25
- sky/skylet/events.py +101 -21
- sky/skylet/job_lib.py +345 -164
- sky/skylet/log_lib.py +297 -18
- sky/skylet/log_lib.pyi +44 -1
- sky/skylet/providers/ibm/node_provider.py +12 -8
- sky/skylet/providers/ibm/vpc_provider.py +13 -12
- sky/skylet/ray_patches/__init__.py +17 -3
- sky/skylet/ray_patches/autoscaler.py.diff +18 -0
- sky/skylet/ray_patches/cli.py.diff +19 -0
- sky/skylet/ray_patches/command_runner.py.diff +17 -0
- sky/skylet/ray_patches/log_monitor.py.diff +20 -0
- sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
- sky/skylet/ray_patches/updater.py.diff +18 -0
- sky/skylet/ray_patches/worker.py.diff +41 -0
- sky/skylet/runtime_utils.py +21 -0
- sky/skylet/services.py +568 -0
- sky/skylet/skylet.py +72 -4
- sky/skylet/subprocess_daemon.py +104 -29
- sky/skypilot_config.py +506 -99
- sky/ssh_node_pools/__init__.py +1 -0
- sky/ssh_node_pools/core.py +135 -0
- sky/ssh_node_pools/server.py +233 -0
- sky/task.py +685 -163
- sky/templates/aws-ray.yml.j2 +11 -3
- sky/templates/azure-ray.yml.j2 +2 -1
- sky/templates/cudo-ray.yml.j2 +1 -0
- sky/templates/do-ray.yml.j2 +2 -1
- sky/templates/fluidstack-ray.yml.j2 +1 -0
- sky/templates/gcp-ray.yml.j2 +62 -1
- sky/templates/hyperbolic-ray.yml.j2 +68 -0
- sky/templates/ibm-ray.yml.j2 +2 -1
- sky/templates/jobs-controller.yaml.j2 +27 -24
- sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
- sky/templates/kubernetes-ray.yml.j2 +611 -50
- sky/templates/lambda-ray.yml.j2 +2 -1
- sky/templates/nebius-ray.yml.j2 +34 -12
- sky/templates/oci-ray.yml.j2 +1 -0
- sky/templates/paperspace-ray.yml.j2 +2 -1
- sky/templates/primeintellect-ray.yml.j2 +72 -0
- sky/templates/runpod-ray.yml.j2 +10 -1
- sky/templates/scp-ray.yml.j2 +4 -50
- sky/templates/seeweb-ray.yml.j2 +171 -0
- sky/templates/shadeform-ray.yml.j2 +73 -0
- sky/templates/sky-serve-controller.yaml.j2 +22 -2
- sky/templates/vast-ray.yml.j2 +1 -0
- sky/templates/vsphere-ray.yml.j2 +1 -0
- sky/templates/websocket_proxy.py +212 -37
- sky/usage/usage_lib.py +31 -15
- sky/users/__init__.py +0 -0
- sky/users/model.conf +15 -0
- sky/users/permission.py +397 -0
- sky/users/rbac.py +121 -0
- sky/users/server.py +720 -0
- sky/users/token_service.py +218 -0
- sky/utils/accelerator_registry.py +35 -5
- sky/utils/admin_policy_utils.py +84 -38
- sky/utils/annotations.py +38 -5
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/atomic.py +1 -1
- sky/utils/auth_utils.py +153 -0
- sky/utils/benchmark_utils.py +60 -0
- sky/utils/cli_utils/status_utils.py +159 -86
- sky/utils/cluster_utils.py +31 -9
- sky/utils/command_runner.py +354 -68
- sky/utils/command_runner.pyi +93 -3
- sky/utils/common.py +35 -8
- sky/utils/common_utils.py +314 -91
- sky/utils/config_utils.py +74 -5
- sky/utils/context.py +403 -0
- sky/utils/context_utils.py +242 -0
- sky/utils/controller_utils.py +383 -89
- sky/utils/dag_utils.py +31 -12
- sky/utils/db/__init__.py +0 -0
- sky/utils/db/db_utils.py +485 -0
- sky/utils/db/kv_cache.py +149 -0
- sky/utils/db/migration_utils.py +137 -0
- sky/utils/directory_utils.py +12 -0
- sky/utils/env_options.py +13 -0
- sky/utils/git.py +567 -0
- sky/utils/git_clone.sh +460 -0
- sky/utils/infra_utils.py +195 -0
- sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
- sky/utils/kubernetes/config_map_utils.py +133 -0
- sky/utils/kubernetes/create_cluster.sh +15 -29
- sky/utils/kubernetes/delete_cluster.sh +10 -7
- sky/utils/kubernetes/deploy_ssh_node_pools.py +1177 -0
- sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
- sky/utils/kubernetes/generate_kind_config.py +6 -66
- sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
- sky/utils/kubernetes/gpu_labeler.py +18 -8
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
- sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
- sky/utils/kubernetes/kubernetes_deploy_utils.py +284 -114
- sky/utils/kubernetes/rsync_helper.sh +11 -3
- sky/utils/kubernetes/ssh-tunnel.sh +379 -0
- sky/utils/kubernetes/ssh_utils.py +221 -0
- sky/utils/kubernetes_enums.py +8 -15
- sky/utils/lock_events.py +94 -0
- sky/utils/locks.py +416 -0
- sky/utils/log_utils.py +82 -107
- sky/utils/perf_utils.py +22 -0
- sky/utils/resource_checker.py +298 -0
- sky/utils/resources_utils.py +249 -32
- sky/utils/rich_utils.py +217 -39
- sky/utils/schemas.py +955 -160
- sky/utils/serialize_utils.py +16 -0
- sky/utils/status_lib.py +10 -0
- sky/utils/subprocess_utils.py +29 -15
- sky/utils/tempstore.py +70 -0
- sky/utils/thread_utils.py +91 -0
- sky/utils/timeline.py +26 -53
- sky/utils/ux_utils.py +84 -15
- sky/utils/validator.py +11 -1
- sky/utils/volume.py +165 -0
- sky/utils/yaml_utils.py +111 -0
- sky/volumes/__init__.py +13 -0
- sky/volumes/client/__init__.py +0 -0
- sky/volumes/client/sdk.py +150 -0
- sky/volumes/server/__init__.py +0 -0
- sky/volumes/server/core.py +270 -0
- sky/volumes/server/server.py +124 -0
- sky/volumes/volume.py +215 -0
- sky/workspaces/__init__.py +0 -0
- sky/workspaces/core.py +655 -0
- sky/workspaces/server.py +101 -0
- sky/workspaces/utils.py +56 -0
- sky_templates/README.md +3 -0
- sky_templates/__init__.py +3 -0
- sky_templates/ray/__init__.py +0 -0
- sky_templates/ray/start_cluster +183 -0
- sky_templates/ray/stop_cluster +75 -0
- skypilot_nightly-1.0.0.dev20251203.dist-info/METADATA +676 -0
- skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
- {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +1 -1
- skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
- sky/benchmark/benchmark_state.py +0 -256
- sky/benchmark/benchmark_utils.py +0 -641
- sky/clouds/service_catalog/constants.py +0 -7
- sky/dashboard/out/_next/static/GWvVBSCS7FmUiVmjaL1a7/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/236-2db3ee3fba33dd9e.js +0 -6
- sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
- sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
- sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
- sky/dashboard/out/_next/static/chunks/845-9e60713e0c441abc.js +0 -1
- sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
- sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
- sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
- sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
- sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-6ac338bc2239cb45.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-1c519e1afc523dc9.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
- sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
- sky/jobs/dashboard/dashboard.py +0 -223
- sky/jobs/dashboard/static/favicon.ico +0 -0
- sky/jobs/dashboard/templates/index.html +0 -831
- sky/jobs/server/dashboard_utils.py +0 -69
- sky/skylet/providers/scp/__init__.py +0 -2
- sky/skylet/providers/scp/config.py +0 -149
- sky/skylet/providers/scp/node_provider.py +0 -578
- sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
- sky/utils/db_utils.py +0 -100
- sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
- skypilot_nightly-1.0.0.dev20250502.dist-info/METADATA +0 -361
- skypilot_nightly-1.0.0.dev20250502.dist-info/RECORD +0 -396
- skypilot_nightly-1.0.0.dev20250502.dist-info/top_level.txt +0 -1
- /sky/{clouds/service_catalog → catalog}/config.py +0 -0
- /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
- /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
- /sky/dashboard/out/_next/static/{GWvVBSCS7FmUiVmjaL1a7 → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
sky/serve/replica_managers.py
CHANGED
```diff
@@ -1,10 +1,9 @@
 """ReplicaManager: handles the creation and deletion of endpoint replicas."""
 import dataclasses
-import enum
 import functools
-import multiprocessing
 from multiprocessing import pool as mp_pool
 import os
+import pathlib
 import threading
 import time
 import traceback
@@ -12,17 +11,16 @@ import typing
 from typing import Any, Dict, List, Optional, Tuple
 
 import colorama
-import
+import filelock
 import requests
 
-import sky
 from sky import backends
-from sky import core
 from sky import exceptions
-from sky import execution
 from sky import global_user_state
 from sky import sky_logging
+from sky import task as task_lib
 from sky.backends import backend_utils
+from sky.client import sdk
 from sky.serve import constants as serve_constants
 from sky.serve import serve_state
 from sky.serve import serve_utils
@@ -32,34 +30,47 @@ from sky.skylet import constants
 from sky.skylet import job_lib
 from sky.usage import usage_lib
 from sky.utils import common_utils
+from sky.utils import context
 from sky.utils import controller_utils
 from sky.utils import env_options
+from sky.utils import resources_utils
 from sky.utils import status_lib
+from sky.utils import thread_utils
 from sky.utils import ux_utils
+from sky.utils import yaml_utils
 
 if typing.TYPE_CHECKING:
-
+    import logging
+
     from sky.serve import service_spec
 
 logger = sky_logging.init_logger(__name__)
 
 _JOB_STATUS_FETCH_INTERVAL = 30
 _PROCESS_POOL_REFRESH_INTERVAL = 20
-# TODO(tian): Maybe let user determine this threshold
-_CONSECUTIVE_FAILURE_THRESHOLD_TIMEOUT = 180
 _RETRY_INIT_GAP_SECONDS = 60
 _DEFAULT_DRAIN_SECONDS = 120
+_WAIT_LAUNCH_THREAD_TIMEOUT_SECONDS = 15
 
-#
-#
-
+# TODO(tian): Backward compatibility. Remove this after 3 minor release, i.e.
+# 0.13.0. We move the ProcessStatus to common_utils.ProcessStatus in #6666, but
+# old ReplicaInfo in database will still tries to unpickle using ProcessStatus
+# in replica_managers. We set this alias to avoid breaking changes. See #6729
+# for more details.
+ProcessStatus = common_utils.ProcessStatus
 
 
 # TODO(tian): Combine this with
 # sky/spot/recovery_strategy.py::StrategyExecutor::launch
+# Use context.contextual to enable per-launch output redirection.
+@context.contextual
 def launch_cluster(replica_id: int,
-
+                   yaml_content: str,
                    cluster_name: str,
+                   log_file: str,
+                   replica_to_request_id: thread_utils.ThreadSafeDict[int, str],
+                   replica_to_launch_cancelled: thread_utils.ThreadSafeDict[
+                       int, bool],
                    resources_override: Optional[Dict[str, Any]] = None,
                    retry_until_up: bool = True,
                    max_retry: int = 3) -> None:
```
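The `ProcessStatus = common_utils.ProcessStatus` alias added above exists because pickled `ReplicaInfo` rows reference the enum class by module path and attribute name; as long as `replica_managers.ProcessStatus` still resolves to a compatible enum, old rows keep loading. A self-contained sketch of that mechanism, using in-process stand-in modules rather than SkyPilot's real ones:

```python
import enum
import pickle
import sys
import types

# Stand-in for the *old* layout: the enum lives directly in 'replica_managers'.
old_mod = types.ModuleType('replica_managers')


class ProcessStatus(enum.Enum):
    RUNNING = 'RUNNING'
    SUCCEEDED = 'SUCCEEDED'


ProcessStatus.__module__ = 'replica_managers'
old_mod.ProcessStatus = ProcessStatus
sys.modules['replica_managers'] = old_mod

# What an old database row effectively holds: the member is pickled *by
# reference* to 'replica_managers.ProcessStatus', not by value.
blob = pickle.dumps(ProcessStatus.RUNNING)

# Stand-in for the *new* layout: the enum moved to 'common_utils', and
# 'replica_managers' only keeps a module-level alias to it.
common_utils = types.ModuleType('common_utils')


class _NewProcessStatus(enum.Enum):
    SCHEDULED = 'SCHEDULED'
    RUNNING = 'RUNNING'
    SUCCEEDED = 'SUCCEEDED'
    INTERRUPTED = 'INTERRUPTED'
    FAILED = 'FAILED'


common_utils.ProcessStatus = _NewProcessStatus
sys.modules['common_utils'] = common_utils

new_mod = types.ModuleType('replica_managers')
new_mod.ProcessStatus = common_utils.ProcessStatus  # The compatibility alias.
sys.modules['replica_managers'] = new_mod

# Unpickling looks up 'replica_managers.ProcessStatus' again; the alias points
# it at the relocated enum, so the old blob still deserializes.
print(pickle.loads(blob))  # _NewProcessStatus.RUNNING
```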
@@ -73,13 +84,16 @@ def launch_cluster(replica_id: int,
|
|
|
73
84
|
or some error happened before provisioning and will happen again
|
|
74
85
|
if retry.
|
|
75
86
|
"""
|
|
87
|
+
ctx = context.get()
|
|
88
|
+
assert ctx is not None, 'Context is not initialized'
|
|
89
|
+
ctx.redirect_log(pathlib.Path(log_file))
|
|
90
|
+
|
|
76
91
|
if resources_override is not None:
|
|
77
92
|
logger.info(f'Scaling up replica (id: {replica_id}) cluster '
|
|
78
93
|
f'{cluster_name} with resources override: '
|
|
79
94
|
f'{resources_override}')
|
|
80
95
|
try:
|
|
81
|
-
|
|
82
|
-
task = sky.Task.from_yaml_config(config)
|
|
96
|
+
task = task_lib.Task.from_yaml_str(yaml_content)
|
|
83
97
|
if resources_override is not None:
|
|
84
98
|
resources = task.resources
|
|
85
99
|
overrided_resources = [
|
|
@@ -96,16 +110,31 @@ def launch_cluster(replica_id: int,
|
|
|
96
110
|
raise RuntimeError(
|
|
97
111
|
f'Failed to launch the sky serve replica cluster {cluster_name} '
|
|
98
112
|
'due to failing to initialize sky.Task from yaml file.') from e
|
|
113
|
+
|
|
114
|
+
def _check_is_cancelled() -> bool:
|
|
115
|
+
is_cancelled = replica_to_launch_cancelled.get(replica_id, False)
|
|
116
|
+
if is_cancelled:
|
|
117
|
+
logger.info(f'Replica {replica_id} launch cancelled.')
|
|
118
|
+
# Pop the value to indicate that the signal was received.
|
|
119
|
+
replica_to_launch_cancelled.pop(replica_id)
|
|
120
|
+
return is_cancelled
|
|
121
|
+
|
|
99
122
|
retry_cnt = 0
|
|
100
123
|
backoff = common_utils.Backoff(_RETRY_INIT_GAP_SECONDS)
|
|
101
124
|
while True:
|
|
102
125
|
retry_cnt += 1
|
|
103
126
|
try:
|
|
127
|
+
if _check_is_cancelled():
|
|
128
|
+
return
|
|
104
129
|
usage_lib.messages.usage.set_internal()
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
130
|
+
request_id = sdk.launch(task,
|
|
131
|
+
cluster_name,
|
|
132
|
+
retry_until_up=retry_until_up,
|
|
133
|
+
_is_launched_by_sky_serve_controller=True)
|
|
134
|
+
logger.info(f'Replica cluster {cluster_name} launch requested '
|
|
135
|
+
f'with request_id: {request_id}.')
|
|
136
|
+
replica_to_request_id[replica_id] = request_id
|
|
137
|
+
sdk.stream_and_get(request_id)
|
|
109
138
|
logger.info(f'Replica cluster {cluster_name} launched.')
|
|
110
139
|
except (exceptions.InvalidClusterNameError,
|
|
111
140
|
exceptions.NoCloudAccessError,
|
|
@@ -130,22 +159,44 @@ def launch_cluster(replica_id: int,
|
|
|
130
159
|
else: # No exception, the launch succeeds.
|
|
131
160
|
return
|
|
132
161
|
|
|
133
|
-
|
|
162
|
+
# Cleanup the request id and the failed cluster.
|
|
163
|
+
replica_to_request_id.pop(replica_id)
|
|
164
|
+
# If it is cancelled, no need to terminate the cluster. It will be
|
|
165
|
+
# handled by the termination thread.
|
|
166
|
+
if _check_is_cancelled():
|
|
167
|
+
return
|
|
168
|
+
terminate_cluster(cluster_name, log_file=log_file)
|
|
169
|
+
|
|
134
170
|
if retry_cnt >= max_retry:
|
|
135
171
|
raise RuntimeError('Failed to launch the sky serve replica cluster '
|
|
136
172
|
f'{cluster_name} after {max_retry} retries.')
|
|
173
|
+
|
|
137
174
|
gap_seconds = backoff.current_backoff()
|
|
138
175
|
logger.info('Retrying to launch the sky serve replica cluster '
|
|
139
176
|
f'in {gap_seconds:.1f} seconds.')
|
|
140
|
-
time.
|
|
177
|
+
start_backoff = time.time()
|
|
178
|
+
# Check if it is cancelled every 0.1 seconds.
|
|
179
|
+
while time.time() - start_backoff < gap_seconds:
|
|
180
|
+
if _check_is_cancelled():
|
|
181
|
+
return
|
|
182
|
+
time.sleep(0.1)
|
|
141
183
|
|
|
142
184
|
|
|
143
185
|
# TODO(tian): Combine this with
|
|
144
186
|
# sky/spot/recovery_strategy.py::terminate_cluster
|
|
187
|
+
@context.contextual
|
|
145
188
|
def terminate_cluster(cluster_name: str,
|
|
189
|
+
log_file: str,
|
|
146
190
|
replica_drain_delay_seconds: int = 0,
|
|
147
191
|
max_retry: int = 3) -> None:
|
|
148
192
|
"""Terminate the sky serve replica cluster."""
|
|
193
|
+
# Setup logging redirection.
|
|
194
|
+
ctx = context.get()
|
|
195
|
+
assert ctx is not None, 'Context is not initialized'
|
|
196
|
+
ctx.redirect_log(pathlib.Path(log_file))
|
|
197
|
+
|
|
198
|
+
logger.info(f'Terminating replica cluster {cluster_name} with '
|
|
199
|
+
f'replica_drain_delay_seconds: {replica_drain_delay_seconds}')
|
|
149
200
|
time.sleep(replica_drain_delay_seconds)
|
|
150
201
|
retry_cnt = 0
|
|
151
202
|
backoff = common_utils.Backoff()
|
|
@@ -153,7 +204,10 @@ def terminate_cluster(cluster_name: str,
|
|
|
153
204
|
retry_cnt += 1
|
|
154
205
|
try:
|
|
155
206
|
usage_lib.messages.usage.set_internal()
|
|
156
|
-
|
|
207
|
+
logger.info(f'Sending down request to cluster {cluster_name}')
|
|
208
|
+
request_id = sdk.down(cluster_name)
|
|
209
|
+
sdk.stream_and_get(request_id)
|
|
210
|
+
logger.info(f'Replica cluster {cluster_name} terminated.')
|
|
157
211
|
return
|
|
158
212
|
except ValueError:
|
|
159
213
|
# The cluster is already terminated.
|
|
@@ -173,17 +227,19 @@ def terminate_cluster(cluster_name: str,
|
|
|
173
227
|
time.sleep(gap_seconds)
|
|
174
228
|
|
|
175
229
|
|
|
176
|
-
def _get_resources_ports(
|
|
230
|
+
def _get_resources_ports(yaml_content: str) -> str:
|
|
177
231
|
"""Get the resources ports used by the task."""
|
|
178
|
-
task =
|
|
232
|
+
task = task_lib.Task.from_yaml_str(yaml_content)
|
|
179
233
|
# Already checked all ports are valid in sky.serve.core.up
|
|
180
234
|
assert task.resources, task
|
|
181
235
|
assert task.service is not None, task
|
|
236
|
+
if task.service.pool:
|
|
237
|
+
return '-'
|
|
182
238
|
assert task.service.ports is not None, task
|
|
183
239
|
return task.service.ports
|
|
184
240
|
|
|
185
241
|
|
|
186
|
-
def _should_use_spot(
|
|
242
|
+
def _should_use_spot(yaml_content: str,
|
|
187
243
|
resource_override: Optional[Dict[str, Any]]) -> bool:
|
|
188
244
|
"""Get whether the task should use spot."""
|
|
189
245
|
if resource_override is not None:
|
|
@@ -191,7 +247,7 @@ def _should_use_spot(task_yaml: str,
|
|
|
191
247
|
if use_spot_override is not None:
|
|
192
248
|
assert isinstance(use_spot_override, bool)
|
|
193
249
|
return use_spot_override
|
|
194
|
-
task =
|
|
250
|
+
task = task_lib.Task.from_yaml_str(yaml_content)
|
|
195
251
|
spot_use_resources = [
|
|
196
252
|
resources for resources in task.resources if resources.use_spot
|
|
197
253
|
]
|
|
@@ -200,6 +256,12 @@ def _should_use_spot(task_yaml: str,
|
|
|
200
256
|
return len(spot_use_resources) == len(task.resources)
|
|
201
257
|
|
|
202
258
|
|
|
259
|
+
# Every function that calls serve_state.add_or_update_replica should acquire
|
|
260
|
+
# this lock. It is to prevent race condition when the replica status is updated
|
|
261
|
+
# by multiple threads at the same time. The modification of replica info is
|
|
262
|
+
# 2 database calls: read the whole replica info object, unpickle it, and modify
|
|
263
|
+
# corresponding fields. Then it is write back to the database. We need to ensure
|
|
264
|
+
# the read-modify-write operation is atomic.
|
|
203
265
|
def with_lock(func):
|
|
204
266
|
|
|
205
267
|
@functools.wraps(func)
|
|
@@ -210,22 +272,6 @@ def with_lock(func):
|
|
|
210
272
|
return wrapper
|
|
211
273
|
|
|
212
274
|
|
|
213
|
-
class ProcessStatus(enum.Enum):
|
|
214
|
-
"""Process status."""
|
|
215
|
-
|
|
216
|
-
# The process is running
|
|
217
|
-
RUNNING = 'RUNNING'
|
|
218
|
-
|
|
219
|
-
# The process is finished and succeeded
|
|
220
|
-
SUCCEEDED = 'SUCCEEDED'
|
|
221
|
-
|
|
222
|
-
# The process is interrupted
|
|
223
|
-
INTERRUPTED = 'INTERRUPTED'
|
|
224
|
-
|
|
225
|
-
# The process failed
|
|
226
|
-
FAILED = 'FAILED'
|
|
227
|
-
|
|
228
|
-
|
|
229
275
|
@dataclasses.dataclass
|
|
230
276
|
class ReplicaStatusProperty:
|
|
231
277
|
"""Some properties that determine replica status.
|
|
@@ -237,15 +283,16 @@ class ReplicaStatusProperty:
|
|
|
237
283
|
first_ready_time: The first time the service is ready.
|
|
238
284
|
sky_down_status: Process status of sky.down.
|
|
239
285
|
"""
|
|
240
|
-
#
|
|
241
|
-
sky_launch_status:
|
|
286
|
+
# sky.launch will always be scheduled on creation of ReplicaStatusProperty.
|
|
287
|
+
sky_launch_status: common_utils.ProcessStatus = (
|
|
288
|
+
common_utils.ProcessStatus.SCHEDULED)
|
|
242
289
|
user_app_failed: bool = False
|
|
243
290
|
service_ready_now: bool = False
|
|
244
291
|
# None means readiness probe is not succeeded yet;
|
|
245
292
|
# -1 means the initial delay seconds is exceeded.
|
|
246
293
|
first_ready_time: Optional[float] = None
|
|
247
294
|
# None means sky.down is not called yet.
|
|
248
|
-
sky_down_status: Optional[ProcessStatus] = None
|
|
295
|
+
sky_down_status: Optional[common_utils.ProcessStatus] = None
|
|
249
296
|
# Whether the termination is caused by autoscaler's decision
|
|
250
297
|
is_scale_down: bool = False
|
|
251
298
|
# The replica's spot instance was preempted.
|
|
@@ -300,7 +347,7 @@ class ReplicaStatusProperty:
|
|
|
300
347
|
(1) Job status;
|
|
301
348
|
(2) Readiness probe.
|
|
302
349
|
"""
|
|
303
|
-
if self.sky_launch_status != ProcessStatus.SUCCEEDED:
|
|
350
|
+
if self.sky_launch_status != common_utils.ProcessStatus.SUCCEEDED:
|
|
304
351
|
return False
|
|
305
352
|
if self.sky_down_status is not None:
|
|
306
353
|
return False
|
|
@@ -314,37 +361,43 @@ class ReplicaStatusProperty:
|
|
|
314
361
|
|
|
315
362
|
def to_replica_status(self) -> serve_state.ReplicaStatus:
|
|
316
363
|
"""Convert status property to human-readable replica status."""
|
|
317
|
-
|
|
364
|
+
# Backward compatibility. Before we introduce ProcessStatus.SCHEDULED,
|
|
365
|
+
# we use None to represent sky.launch is not called yet.
|
|
366
|
+
if (self.sky_launch_status is None or
|
|
367
|
+
self.sky_launch_status == common_utils.ProcessStatus.SCHEDULED):
|
|
318
368
|
# Pending to launch
|
|
319
369
|
return serve_state.ReplicaStatus.PENDING
|
|
320
|
-
if self.sky_launch_status == ProcessStatus.RUNNING:
|
|
321
|
-
if self.sky_down_status == ProcessStatus.FAILED:
|
|
370
|
+
if self.sky_launch_status == common_utils.ProcessStatus.RUNNING:
|
|
371
|
+
if self.sky_down_status == common_utils.ProcessStatus.FAILED:
|
|
322
372
|
return serve_state.ReplicaStatus.FAILED_CLEANUP
|
|
323
|
-
if self.sky_down_status == ProcessStatus.SUCCEEDED:
|
|
373
|
+
if self.sky_down_status == common_utils.ProcessStatus.SUCCEEDED:
|
|
324
374
|
# This indicate it is a scale_down with correct teardown.
|
|
325
375
|
# Should have been cleaned from the replica table.
|
|
326
376
|
return serve_state.ReplicaStatus.UNKNOWN
|
|
327
377
|
# Still launching
|
|
328
378
|
return serve_state.ReplicaStatus.PROVISIONING
|
|
329
|
-
if self.sky_launch_status == ProcessStatus.INTERRUPTED:
|
|
379
|
+
if self.sky_launch_status == common_utils.ProcessStatus.INTERRUPTED:
|
|
330
380
|
# sky.down is running and a scale down interrupted sky.launch
|
|
331
381
|
return serve_state.ReplicaStatus.SHUTTING_DOWN
|
|
332
382
|
if self.sky_down_status is not None:
|
|
333
383
|
if self.preempted:
|
|
334
384
|
# Replica (spot) is preempted
|
|
335
385
|
return serve_state.ReplicaStatus.PREEMPTED
|
|
336
|
-
if self.sky_down_status == ProcessStatus.
|
|
386
|
+
if self.sky_down_status == common_utils.ProcessStatus.SCHEDULED:
|
|
387
|
+
# sky.down is scheduled to run, but not started yet.
|
|
388
|
+
return serve_state.ReplicaStatus.SHUTTING_DOWN
|
|
389
|
+
if self.sky_down_status == common_utils.ProcessStatus.RUNNING:
|
|
337
390
|
# sky.down is running
|
|
338
391
|
return serve_state.ReplicaStatus.SHUTTING_DOWN
|
|
339
|
-
if self.sky_launch_status == ProcessStatus.INTERRUPTED:
|
|
392
|
+
if self.sky_launch_status == common_utils.ProcessStatus.INTERRUPTED:
|
|
340
393
|
return serve_state.ReplicaStatus.SHUTTING_DOWN
|
|
341
|
-
if self.sky_down_status == ProcessStatus.FAILED:
|
|
394
|
+
if self.sky_down_status == common_utils.ProcessStatus.FAILED:
|
|
342
395
|
# sky.down failed
|
|
343
396
|
return serve_state.ReplicaStatus.FAILED_CLEANUP
|
|
344
397
|
if self.user_app_failed:
|
|
345
398
|
# Failed on user setup/run
|
|
346
399
|
return serve_state.ReplicaStatus.FAILED
|
|
347
|
-
if self.sky_launch_status == ProcessStatus.FAILED:
|
|
400
|
+
if self.sky_launch_status == common_utils.ProcessStatus.FAILED:
|
|
348
401
|
# sky.launch failed
|
|
349
402
|
return serve_state.ReplicaStatus.FAILED_PROVISION
|
|
350
403
|
if self.first_ready_time is None:
|
|
@@ -360,18 +413,18 @@ class ReplicaStatusProperty:
|
|
|
360
413
|
# This indicate it is a scale_down with correct teardown.
|
|
361
414
|
# Should have been cleaned from the replica table.
|
|
362
415
|
return serve_state.ReplicaStatus.UNKNOWN
|
|
363
|
-
if self.sky_launch_status == ProcessStatus.FAILED:
|
|
416
|
+
if self.sky_launch_status == common_utils.ProcessStatus.FAILED:
|
|
364
417
|
# sky.launch failed
|
|
365
|
-
# The down
|
|
418
|
+
# The down thread has not been started if it reaches here,
|
|
366
419
|
# due to the `if self.sky_down_status is not None`` check above.
|
|
367
|
-
# However, it should have been started by
|
|
420
|
+
# However, it should have been started by _refresh_thread_pool.
|
|
368
421
|
# If not started, this means some bug prevent sky.down from
|
|
369
422
|
# executing. It is also a potential resource leak, so we mark
|
|
370
423
|
# it as FAILED_CLEANUP.
|
|
371
424
|
return serve_state.ReplicaStatus.FAILED_CLEANUP
|
|
372
425
|
if self.user_app_failed:
|
|
373
426
|
# Failed on user setup/run
|
|
374
|
-
# Same as above, the down
|
|
427
|
+
# Same as above, the down thread should have been started.
|
|
375
428
|
return serve_state.ReplicaStatus.FAILED_CLEANUP
|
|
376
429
|
if self.service_ready_now:
|
|
377
430
|
# Service is ready
|
|
@@ -421,11 +474,12 @@ class ReplicaInfo:
|
|
|
421
474
|
based on the cluster name.
|
|
422
475
|
"""
|
|
423
476
|
if cluster_record is None:
|
|
424
|
-
|
|
477
|
+
handle = global_user_state.get_handle_from_cluster_name(
|
|
425
478
|
self.cluster_name)
|
|
426
|
-
|
|
479
|
+
else:
|
|
480
|
+
handle = cluster_record['handle']
|
|
481
|
+
if handle is None:
|
|
427
482
|
return None
|
|
428
|
-
handle = cluster_record['handle']
|
|
429
483
|
assert isinstance(handle, backends.CloudVmRayResourceHandle)
|
|
430
484
|
return handle
|
|
431
485
|
|
|
@@ -442,10 +496,16 @@ class ReplicaInfo:
|
|
|
442
496
|
handle = self.handle()
|
|
443
497
|
if handle is None:
|
|
444
498
|
return None
|
|
499
|
+
if self.replica_port == '-':
|
|
500
|
+
# This is a pool replica so there is no endpoint and it's filled
|
|
501
|
+
# with this dummy value. We return None here so that we can
|
|
502
|
+
# get the active ready replicas and perform autoscaling. Otherwise,
|
|
503
|
+
# would error out when trying to get the endpoint.
|
|
504
|
+
return None
|
|
445
505
|
replica_port_int = int(self.replica_port)
|
|
446
506
|
try:
|
|
447
|
-
endpoint_dict =
|
|
448
|
-
|
|
507
|
+
endpoint_dict = backend_utils.get_endpoints(handle.cluster_name,
|
|
508
|
+
replica_port_int)
|
|
449
509
|
except exceptions.ClusterNotUpError:
|
|
450
510
|
return None
|
|
451
511
|
endpoint = endpoint_dict.get(replica_port_int, None)
|
|
@@ -465,26 +525,36 @@ class ReplicaInfo:
|
|
|
465
525
|
f'replica {self.replica_id}.')
|
|
466
526
|
return replica_status
|
|
467
527
|
|
|
468
|
-
def to_info_dict(self,
|
|
528
|
+
def to_info_dict(self,
|
|
529
|
+
with_handle: bool,
|
|
530
|
+
with_url: bool = True) -> Dict[str, Any]:
|
|
469
531
|
cluster_record = global_user_state.get_cluster_from_name(
|
|
470
|
-
self.cluster_name)
|
|
532
|
+
self.cluster_name, include_user_info=False, summary_response=True)
|
|
471
533
|
info_dict = {
|
|
472
534
|
'replica_id': self.replica_id,
|
|
473
535
|
'name': self.cluster_name,
|
|
474
536
|
'status': self.status,
|
|
475
537
|
'version': self.version,
|
|
476
|
-
'endpoint': self.url,
|
|
538
|
+
'endpoint': self.url if with_url else None,
|
|
477
539
|
'is_spot': self.is_spot,
|
|
478
540
|
'launched_at': (cluster_record['launched_at']
|
|
479
541
|
if cluster_record is not None else None),
|
|
480
542
|
}
|
|
481
543
|
if with_handle:
|
|
482
|
-
|
|
544
|
+
handle = self.handle(cluster_record)
|
|
545
|
+
info_dict['handle'] = handle
|
|
546
|
+
if handle is not None:
|
|
547
|
+
info_dict['cloud'] = repr(handle.launched_resources.cloud)
|
|
548
|
+
info_dict['region'] = handle.launched_resources.region
|
|
549
|
+
info_dict['resources_str'] = (
|
|
550
|
+
resources_utils.get_readable_resources_repr(
|
|
551
|
+
handle, simplified_only=True)[0])
|
|
483
552
|
return info_dict
|
|
484
553
|
|
|
485
554
|
def __repr__(self) -> str:
|
|
486
|
-
|
|
487
|
-
|
|
555
|
+
show_details = env_options.Options.SHOW_DEBUG_INFO.get()
|
|
556
|
+
info_dict = self.to_info_dict(with_handle=show_details,
|
|
557
|
+
with_url=show_details)
|
|
488
558
|
handle_str = ''
|
|
489
559
|
if 'handle' in info_dict:
|
|
490
560
|
handle_str = f', handle={info_dict["handle"]}'
|
|
@@ -498,6 +568,33 @@ class ReplicaInfo:
|
|
|
498
568
|
f'launched_at={info_dict["launched_at"]}{handle_str})')
|
|
499
569
|
return info
|
|
500
570
|
|
|
571
|
+
def probe_pool(self) -> Tuple['ReplicaInfo', bool, float]:
|
|
572
|
+
"""Probe the replica for pool management.
|
|
573
|
+
|
|
574
|
+
This function will check the first job status of the cluster, which is a
|
|
575
|
+
dummy job that only echoes "setup done". The success of this job means
|
|
576
|
+
the setup command is done and the replica is ready to be used. Check
|
|
577
|
+
sky/serve/server/core.py::up for more details.
|
|
578
|
+
|
|
579
|
+
Returns:
|
|
580
|
+
Tuple of (self, is_ready, probe_time).
|
|
581
|
+
"""
|
|
582
|
+
probe_time = time.time()
|
|
583
|
+
try:
|
|
584
|
+
handle = backend_utils.check_cluster_available(
|
|
585
|
+
self.cluster_name, operation='probing pool')
|
|
586
|
+
if handle is None:
|
|
587
|
+
return self, False, probe_time
|
|
588
|
+
backend = backend_utils.get_backend_from_handle(handle)
|
|
589
|
+
statuses = backend.get_job_status(handle, [1], stream_logs=False)
|
|
590
|
+
if statuses[1] == job_lib.JobStatus.SUCCEEDED:
|
|
591
|
+
return self, True, probe_time
|
|
592
|
+
return self, False, probe_time
|
|
593
|
+
except Exception as e: # pylint: disable=broad-except
|
|
594
|
+
logger.error(f'Error when probing pool of {self.cluster_name}: '
|
|
595
|
+
f'{common_utils.format_exception(e)}.')
|
|
596
|
+
return self, False, probe_time
|
|
597
|
+
|
|
501
598
|
def probe(
|
|
502
599
|
self,
|
|
503
600
|
readiness_path: str,
|
|
@@ -580,13 +677,14 @@ class ReplicaInfo:
|
|
|
580
677
|
class ReplicaManager:
|
|
581
678
|
"""Each replica manager monitors one service."""
|
|
582
679
|
|
|
583
|
-
def __init__(self, service_name: str,
|
|
584
|
-
|
|
680
|
+
def __init__(self, service_name: str, spec: 'service_spec.SkyServiceSpec',
|
|
681
|
+
version: int) -> None:
|
|
585
682
|
self.lock = threading.Lock()
|
|
586
683
|
self._next_replica_id: int = 1
|
|
587
684
|
self._service_name: str = service_name
|
|
588
685
|
self._uptime: Optional[float] = None
|
|
589
686
|
self._update_mode = serve_utils.DEFAULT_UPDATE_MODE
|
|
687
|
+
self._is_pool: bool = spec.pool
|
|
590
688
|
header_keys = None
|
|
591
689
|
if spec.readiness_headers is not None:
|
|
592
690
|
header_keys = list(spec.readiness_headers.keys())
|
|
@@ -596,9 +694,18 @@ class ReplicaManager:
|
|
|
596
694
|
f'Readiness header keys: {header_keys}')
|
|
597
695
|
|
|
598
696
|
# Newest version among the currently provisioned and launched replicas
|
|
599
|
-
self.latest_version: int =
|
|
697
|
+
self.latest_version: int = version
|
|
600
698
|
# Oldest version among the currently provisioned and launched replicas
|
|
601
|
-
self.least_recent_version: int =
|
|
699
|
+
self.least_recent_version: int = version
|
|
700
|
+
|
|
701
|
+
def _consecutive_failure_threshold_timeout(self) -> int:
|
|
702
|
+
"""The timeout for the consecutive failure threshold in seconds.
|
|
703
|
+
|
|
704
|
+
We reduce the timeout for pool to 10 seconds to make the pool more
|
|
705
|
+
responsive to the failure.
|
|
706
|
+
"""
|
|
707
|
+
# TODO(tian): Maybe let user determine this threshold
|
|
708
|
+
return 10 if self._is_pool else 180
|
|
602
709
|
|
|
603
710
|
def scale_up(self,
|
|
604
711
|
resources_override: Optional[Dict[str, Any]] = None) -> None:
|
|
@@ -625,8 +732,8 @@ class SkyPilotReplicaManager(ReplicaManager):
|
|
|
625
732
|
"""Replica Manager for SkyPilot clusters.
|
|
626
733
|
|
|
627
734
|
It will run three daemon to monitor the status of the replicas:
|
|
628
|
-
(1)
|
|
629
|
-
to monitor the progress of the launch/down
|
|
735
|
+
(1) _thread_pool_refresher: Refresh the launch/down thread pool
|
|
736
|
+
to monitor the progress of the launch/down thread.
|
|
630
737
|
(2) _job_status_fetcher: Fetch the job status of the service to
|
|
631
738
|
monitor the status of the service jobs.
|
|
632
739
|
(3) _replica_prober: Do readiness probe to the replicas to monitor
|
|
@@ -634,40 +741,41 @@ class SkyPilotReplicaManager(ReplicaManager):
|
|
|
634
741
|
"""
|
|
635
742
|
|
|
636
743
|
def __init__(self, service_name: str, spec: 'service_spec.SkyServiceSpec',
|
|
637
|
-
|
|
638
|
-
super().__init__(service_name, spec)
|
|
639
|
-
self.
|
|
640
|
-
task =
|
|
744
|
+
version: int) -> None:
|
|
745
|
+
super().__init__(service_name, spec, version)
|
|
746
|
+
self.yaml_content = serve_state.get_yaml_content(service_name, version)
|
|
747
|
+
task = task_lib.Task.from_yaml_str(self.yaml_content)
|
|
641
748
|
self._spot_placer: Optional[spot_placer.SpotPlacer] = (
|
|
642
749
|
spot_placer.SpotPlacer.from_task(spec, task))
|
|
643
|
-
# TODO(tian): Store launch/down
|
|
644
|
-
# manager more persistent.
|
|
645
|
-
|
|
646
|
-
|
|
647
|
-
|
|
648
|
-
|
|
649
|
-
self.
|
|
650
|
-
int,
|
|
651
|
-
self.
|
|
652
|
-
int,
|
|
653
|
-
|
|
654
|
-
threading.Thread(target=self.
|
|
750
|
+
# TODO(tian): Store launch/down request id in the replica table, to make
|
|
751
|
+
# the manager more persistent.
|
|
752
|
+
self._launch_thread_pool: thread_utils.ThreadSafeDict[
|
|
753
|
+
int, thread_utils.SafeThread] = thread_utils.ThreadSafeDict()
|
|
754
|
+
self._replica_to_request_id: thread_utils.ThreadSafeDict[
|
|
755
|
+
int, str] = thread_utils.ThreadSafeDict()
|
|
756
|
+
self._replica_to_launch_cancelled: thread_utils.ThreadSafeDict[
|
|
757
|
+
int, bool] = thread_utils.ThreadSafeDict()
|
|
758
|
+
self._down_thread_pool: thread_utils.ThreadSafeDict[
|
|
759
|
+
int, thread_utils.SafeThread] = thread_utils.ThreadSafeDict()
|
|
760
|
+
|
|
761
|
+
threading.Thread(target=self._thread_pool_refresher).start()
|
|
655
762
|
threading.Thread(target=self._job_status_fetcher).start()
|
|
656
763
|
threading.Thread(target=self._replica_prober).start()
|
|
657
764
|
|
|
658
765
|
self._recover_replica_operations()
|
|
659
766
|
|
|
767
|
+
@with_lock
|
|
660
768
|
def _recover_replica_operations(self):
|
|
661
769
|
"""Let's see are there something to do for ReplicaManager in a
|
|
662
770
|
recovery run"""
|
|
663
|
-
assert (not self.
|
|
664
|
-
), 'We should not have any running
|
|
771
|
+
assert (not self._launch_thread_pool and not self._down_thread_pool
|
|
772
|
+
), 'We should not have any running threads in a recovery run'
|
|
665
773
|
|
|
666
774
|
# There is a FIFO queue with capacity _MAX_NUM_LAUNCH for
|
|
667
775
|
# _launch_replica.
|
|
668
776
|
# We prioritize PROVISIONING replicas since they were previously
|
|
669
777
|
# launched but may have been interrupted and need to be restarted.
|
|
670
|
-
# This is why we
|
|
778
|
+
# This is why we handle PENDING replicas only after PROVISIONING
|
|
671
779
|
# replicas.
|
|
672
780
|
to_up_replicas = serve_state.get_replicas_at_status(
|
|
673
781
|
self._service_name, serve_state.ReplicaStatus.PROVISIONING)
|
|
@@ -697,16 +805,15 @@ class SkyPilotReplicaManager(ReplicaManager):
|
|
|
697
805
|
# Replica management functions #
|
|
698
806
|
################################
|
|
699
807
|
|
|
700
|
-
#
|
|
701
|
-
#
|
|
702
|
-
@with_lock
|
|
808
|
+
# We don't need to add lock here since every caller of this function
|
|
809
|
+
# will acquire the lock.
|
|
703
810
|
def _launch_replica(
|
|
704
811
|
self,
|
|
705
812
|
replica_id: int,
|
|
706
813
|
resources_override: Optional[Dict[str, Any]] = None,
|
|
707
814
|
) -> None:
|
|
708
|
-
if replica_id in self.
|
|
709
|
-
logger.warning(f'Launch
|
|
815
|
+
if replica_id in self._launch_thread_pool:
|
|
816
|
+
logger.warning(f'Launch thread for replica {replica_id} '
|
|
710
817
|
'already exists. Skipping.')
|
|
711
818
|
return
|
|
712
819
|
logger.info(f'Launching replica {replica_id}...')
|
|
@@ -714,7 +821,7 @@ class SkyPilotReplicaManager(ReplicaManager):
|
|
|
714
821
|
self._service_name, replica_id)
|
|
715
822
|
log_file_name = serve_utils.generate_replica_launch_log_file_name(
|
|
716
823
|
self._service_name, replica_id)
|
|
717
|
-
use_spot = _should_use_spot(self.
|
|
824
|
+
use_spot = _should_use_spot(self.yaml_content, resources_override)
|
|
718
825
|
retry_until_up = True
|
|
719
826
|
location = None
|
|
720
827
|
if use_spot and self._spot_placer is not None:
|
|
@@ -737,28 +844,78 @@ class SkyPilotReplicaManager(ReplicaManager):
                 location = self._spot_placer.select_next_location(
                     current_spot_locations)
                 resources_override.update(location.to_dict())
-
-        target=
-
-
-
-
-        resources_override, retry_until_up),
+        t = thread_utils.SafeThread(
+            target=launch_cluster,
+            args=(replica_id, self.yaml_content, cluster_name, log_file_name,
+                  self._replica_to_request_id,
+                  self._replica_to_launch_cancelled, resources_override,
+                  retry_until_up),
         )
-        replica_port = _get_resources_ports(self.
+        replica_port = _get_resources_ports(self.yaml_content)
 
         info = ReplicaInfo(replica_id, cluster_name, replica_port, use_spot,
                            location, self.latest_version, resources_override)
         serve_state.add_or_update_replica(self._service_name, replica_id, info)
-        # Don't start right now; we will start it later in
+        # Don't start right now; we will start it later in _refresh_thread_pool
         # to avoid too many sky.launch running at the same time.
-        self.
+        self._launch_thread_pool[replica_id] = t
 
+    @with_lock
     def scale_up(self,
                  resources_override: Optional[Dict[str, Any]] = None) -> None:
         self._launch_replica(self._next_replica_id, resources_override)
         self._next_replica_id += 1
 
+    def _handle_sky_down_finish(self, info: ReplicaInfo,
+                                format_exc: Optional[str]) -> None:
+        if format_exc is not None:
+            logger.error(f'Down thread for replica {info.replica_id} '
+                         f'exited abnormally with exception {format_exc}.')
+            info.status_property.sky_down_status = (
+                common_utils.ProcessStatus.FAILED)
+        else:
+            info.status_property.sky_down_status = (
+                common_utils.ProcessStatus.SUCCEEDED)
+        # Failed replica still count as a replica. In our current design, we
+        # want to fail early if user code have any error. This will prevent
+        # infinite loop of teardown and re-provision. However, there is a
+        # special case that if the replica is UP for longer than
+        # initial_delay_seconds, we assume it is just some random failure and
+        # we should restart the replica. Please refer to the implementation of
+        # `is_scale_down_succeeded` for more details.
+        # TODO(tian): Currently, restart replicas that failed within
+        # initial_delay_seconds is not supported. We should add it
+        # later when we support `sky serve update`.
+        removal_reason = None
+        if info.status_property.is_scale_down:
+            # This means the cluster is deleted due to an autoscaler
+            # decision or the cluster is recovering from preemption.
+            # Delete the replica info so it won't count as a replica.
+            if info.status_property.preempted:
+                removal_reason = 'for preemption recovery'
+            else:
+                removal_reason = 'normally'
+        # Don't keep failed record for version mismatch replicas,
+        # since user should fixed the error before update.
+        elif info.version != self.latest_version:
+            removal_reason = 'for version outdated'
+        elif info.status_property.purged:
+            removal_reason = 'for purge'
+        elif info.status_property.failed_spot_availability:
+            removal_reason = 'for spot availability failure'
+        else:
+            logger.info(f'Termination of replica {info.replica_id} '
+                        'finished. Replica info is kept since some '
+                        'failure detected.')
+            serve_state.add_or_update_replica(self._service_name,
+                                              info.replica_id, info)
+        if removal_reason is not None:
+            serve_state.remove_replica(self._service_name, info.replica_id)
+            logger.info(f'Replica {info.replica_id} removed from the '
+                        f'replica table {removal_reason}.')
+
+    # We don't need to add lock here since every caller of this function
+    # will acquire the lock.
     def _terminate_replica(self,
                            replica_id: int,
                            sync_down_logs: bool,
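The launch path in this hunk swaps the old `multiprocessing.Process` for a `thread_utils.SafeThread` that is created immediately, stored in `_launch_thread_pool`, started later by the refresher, and inspected afterwards via `format_exc`. A hedged sketch of such an exception-capturing, deferred-start thread (class and attribute names below are illustrative, not SkyPilot's API):

```python
import threading
import traceback
from typing import Any, Callable, Optional, Tuple


class ExceptionCapturingThread(threading.Thread):
    """Record the traceback of any uncaught exception in the target.

    Minimal sketch of the pattern used above: the thread object is created
    now, start() is deferred until a poller decides there is capacity, and
    format_exc is read back after the thread finishes.
    """

    def __init__(self, target: Callable[..., Any], args: Tuple = ()):
        super().__init__(daemon=True)
        self._target_fn = target
        self._target_args = args
        self.format_exc: Optional[str] = None

    def run(self) -> None:
        try:
            self._target_fn(*self._target_args)
        except Exception:  # pylint: disable=broad-except
            self.format_exc = traceback.format_exc()


if __name__ == '__main__':
    t = ExceptionCapturingThread(target=lambda: 1 / 0)
    # Created now, started later -- mirroring how the replica manager stores
    # the thread and lets the refresher call start() when allowed.
    t.start()
    t.join()
    print('failed:', t.format_exc is not None)  # failed: True
```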
@@ -772,24 +929,55 @@ class SkyPilotReplicaManager(ReplicaManager):
                 'the logs should always be synced down. '
                 'So that the user can see the logs to debug.')
 
-        if replica_id in self.
+        if replica_id in self._launch_thread_pool:
             info = serve_state.get_replica_info_from_id(self._service_name,
                                                         replica_id)
             assert info is not None
-            info.status_property.sky_launch_status =
+            info.status_property.sky_launch_status = (
+                common_utils.ProcessStatus.INTERRUPTED)
             serve_state.add_or_update_replica(self._service_name, replica_id,
                                               info)
-
-            if
-
-
-
-
-
-
-
-
+            launch_thread = self._launch_thread_pool[replica_id]
+            if launch_thread.is_alive():
+                self._replica_to_launch_cancelled[replica_id] = True
+                start_wait_time = time.time()
+                timeout_reached = False
+                while True:
+                    # Launch request id found. cancel it.
+                    if replica_id in self._replica_to_request_id:
+                        request_id = self._replica_to_request_id[replica_id]
+                        sdk.api_cancel(request_id)
+                        break
+                    if replica_id not in self._replica_to_launch_cancelled:
+                        # Indicates that the cancellation was received.
+                        break
+                    if not launch_thread.is_alive():
+                        # It's possible that the launch thread immediately
+                        # finished after we check. Exit the loop now.
+                        break
+                    if (time.time() - start_wait_time >
+                            _WAIT_LAUNCH_THREAD_TIMEOUT_SECONDS):
+                        timeout_reached = True
+                        break
+                    time.sleep(0.1)
+                if timeout_reached:
+                    logger.warning(
+                        'Failed to cancel launch request for replica '
+                        f'{replica_id} after '
+                        f'{_WAIT_LAUNCH_THREAD_TIMEOUT_SECONDS} seconds. '
+                        'Force waiting the launch thread to finish.')
+                else:
+                    logger.info('Interrupted launch thread for replica '
+                                f'{replica_id} and deleted the cluster.')
+                launch_thread.join()
+            else:
+                logger.info(f'Launch thread for replica {replica_id} '
+                            'already finished. Delete the cluster now.')
+            self._launch_thread_pool.pop(replica_id)
+            self._replica_to_request_id.pop(replica_id)
+
+        if replica_id in self._down_thread_pool:
+            logger.warning(f'Terminate thread for replica {replica_id} '
                           'already exists. Skipping.')
             return
 
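The block added here polls until an in-flight launch can be cancelled: it sets a cancellation flag, waits for the launch thread to publish its request id (and cancels it), and otherwise exits on acknowledgement, thread completion, or timeout. A self-contained sketch of the same four exit conditions, with illustrative names (`WAIT_TIMEOUT_SECONDS` stands in for `_WAIT_LAUNCH_THREAD_TIMEOUT_SECONDS`):

```python
import time
from typing import Callable, Dict

# Illustrative timeout; the real module constant is
# _WAIT_LAUNCH_THREAD_TIMEOUT_SECONDS.
WAIT_TIMEOUT_SECONDS = 60


def wait_and_cancel_launch(replica_id: int,
                           launch_thread,
                           request_ids: Dict[int, str],
                           cancelled_flags: Dict[int, bool],
                           cancel_fn: Callable[[str], None],
                           poll_interval: float = 0.1) -> str:
    """Poll until the in-flight launch can be cancelled.

    Returns 'cancelled', 'acknowledged', 'finished', or 'timeout',
    mirroring the exit conditions of the loop in _terminate_replica.
    """
    cancelled_flags[replica_id] = True
    start = time.time()
    while True:
        if replica_id in request_ids:
            # The launch thread published its request id; cancel it.
            cancel_fn(request_ids[replica_id])
            return 'cancelled'
        if replica_id not in cancelled_flags:
            # The launch thread consumed the flag before submitting anything.
            return 'acknowledged'
        if not launch_thread.is_alive():
            # The launch finished right after the previous checks.
            return 'finished'
        if time.time() - start > WAIT_TIMEOUT_SECONDS:
            return 'timeout'
        time.sleep(poll_interval)
```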
@@ -820,9 +1008,9 @@ class SkyPilotReplicaManager(ReplicaManager):
             assert isinstance(handle, backends.CloudVmRayResourceHandle)
             replica_job_logs_dir = os.path.join(constants.SKY_LOGS_DIRECTORY,
                                                 'replica_jobs')
-
-
-
+            job_ids = ['1'] if self._is_pool else None
+            job_log_file_name = controller_utils.download_and_stream_job_log(
+                backend, handle, replica_job_logs_dir, job_ids)
             if job_log_file_name is not None:
                 logger.info(f'\n== End of logs (Replica: {replica_id}) ==')
                 with open(log_file_name, 'a',
@@ -848,18 +1036,30 @@ class SkyPilotReplicaManager(ReplicaManager):
 
         logger.info(f'preempted: {info.status_property.preempted}, '
                     f'replica_id: {replica_id}')
-        p = multiprocessing.Process(
-            target=ux_utils.RedirectOutputForProcess(terminate_cluster,
-                                                     log_file_name, 'a').run,
-            args=(info.cluster_name, replica_drain_delay_seconds),
-        )
-        info.status_property.sky_down_status = ProcessStatus.RUNNING
         info.status_property.is_scale_down = is_scale_down
         info.status_property.purged = purge
+
+        # If the cluster does not exist, it means either the cluster never
+        # exists (e.g., the cluster is scaled down before it gets a chance to
+        # provision) or the cluster is preempted and cleaned up by the status
+        # refresh. In this case, we skip spawning a new down thread to save
+        # controller resources.
+        if not global_user_state.cluster_with_name_exists(info.cluster_name):
+            self._handle_sky_down_finish(info, format_exc=None)
+            return
+
+        # Otherwise, start the thread to terminate the cluster.
+        t = thread_utils.SafeThread(
+            target=terminate_cluster,
+            args=(info.cluster_name, log_file_name,
+                  replica_drain_delay_seconds),
+        )
+        info.status_property.sky_down_status = (
+            common_utils.ProcessStatus.SCHEDULED)
         serve_state.add_or_update_replica(self._service_name, replica_id, info)
-
-        self._down_process_pool[replica_id] = p
+        self._down_thread_pool[replica_id] = t
 
+    @with_lock
     def scale_down(self, replica_id: int, purge: bool = False) -> None:
         self._terminate_replica(
             replica_id,
@@ -868,6 +1068,8 @@ class SkyPilotReplicaManager(ReplicaManager):
             is_scale_down=True,
             purge=purge)
 
+    # We don't need to add lock here since every caller of this function
+    # will acquire the lock.
     def _handle_preemption(self, info: ReplicaInfo) -> bool:
         """Handle preemption of the replica if any error happened.
 
@@ -920,52 +1122,54 @@ class SkyPilotReplicaManager(ReplicaManager):
     #################################
 
     @with_lock
-    def
-        """Refresh the launch/down
+    def _refresh_thread_pool(self) -> None:
+        """Refresh the launch/down thread pool.
 
-        This function will checks all sky.launch and sky.down
+        This function will checks all sky.launch and sky.down thread on
         the fly. If any of them finished, it will update the status of the
         corresponding replica.
         """
         # To avoid `dictionary changed size during iteration` error.
-
-        for replica_id,
-            if
+        launch_thread_pool_snapshot = list(self._launch_thread_pool.items())
+        for replica_id, t in launch_thread_pool_snapshot:
+            if t.is_alive():
+                continue
+            with filelock.FileLock(controller_utils.get_resources_lock_path()):
                 info = serve_state.get_replica_info_from_id(
                     self._service_name, replica_id)
                 assert info is not None, replica_id
                 error_in_sky_launch = False
                 if info.status == serve_state.ReplicaStatus.PENDING:
                     # sky.launch not started yet
-                    if (
-
-                        p.start()
+                    if controller_utils.can_provision(self._is_pool):
+                        t.start()
                         info.status_property.sky_launch_status = (
-                            ProcessStatus.RUNNING)
+                            common_utils.ProcessStatus.RUNNING)
                 else:
                     # sky.launch finished
-                    # TODO(tian): Try-catch in
+                    # TODO(tian): Try-catch in thread, and have an enum return
                     # value to indicate which type of failure happened.
                     # Currently we only have user code failure since the
                     # retry_until_up flag is set to True, but it will be helpful
                     # when we enable user choose whether to retry or not.
                     logger.info(
-                        f'Launch
-
-
+                        f'Launch thread for replica {replica_id} finished.')
+                    self._launch_thread_pool.pop(replica_id)
+                    self._replica_to_request_id.pop(replica_id)
+                    if t.format_exc is not None:
                         logger.warning(
-                            f'Launch
-                            f'exited abnormally with
-                            ' Terminating...')
+                            f'Launch thread for replica {replica_id} '
+                            f'exited abnormally with exception '
+                            f'{t.format_exc}. Terminating...')
                         info.status_property.sky_launch_status = (
-                            ProcessStatus.FAILED)
+                            common_utils.ProcessStatus.FAILED)
                         error_in_sky_launch = True
                     else:
                         info.status_property.sky_launch_status = (
-                            ProcessStatus.SUCCEEDED)
+                            common_utils.ProcessStatus.SUCCEEDED)
                     if self._spot_placer is not None and info.is_spot:
                         # TODO(tian): Currently, we set the location to
-                        # preemptive if the launch
+                        # preemptive if the launch thread failed. This is
                         # because if the error is not related to the
                         # availability of the location, then all locations
                         # should failed for same reason. So it does not matter
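The refresher in this hunk snapshots the thread pool before iterating (so entries can be added or removed concurrently), starts PENDING launches only when the controller has capacity, and reaps finished threads. A minimal sketch of one refresh pass under those assumptions; the `is_pending` and `can_provision` callbacks are illustrative stand-ins for the replica-status and `controller_utils.can_provision` checks:

```python
import threading
from typing import Callable, Dict


def refresh_launch_threads(launch_threads: Dict[int, threading.Thread],
                           is_pending: Callable[[int], bool],
                           can_provision: Callable[[], bool],
                           on_finished: Callable[[int], None]) -> None:
    """One refresh pass over a replica-id -> thread mapping."""
    # Snapshot first to avoid `dictionary changed size during iteration`.
    for replica_id, thread in list(launch_threads.items()):
        if thread.is_alive():
            # Still launching; check again on the next pass.
            continue
        if is_pending(replica_id):
            # Not started yet; defer start() until there is headroom.
            if can_provision():
                thread.start()
            continue
        # Started earlier and now finished: remove it and post-process.
        launch_threads.pop(replica_id)
        on_finished(replica_id)
```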
@@ -975,7 +1179,7 @@ class SkyPilotReplicaManager(ReplicaManager):
                         # availability of the location later.
                         location = info.get_spot_location()
                         assert location is not None
-                        if
+                        if t.format_exc is not None:
                             self._spot_placer.set_preemptive(location)
                             info.status_property.failed_spot_availability = True
                         else:
@@ -988,61 +1192,27 @@ class SkyPilotReplicaManager(ReplicaManager):
                 self._terminate_replica(replica_id,
                                         sync_down_logs=True,
                                         replica_drain_delay_seconds=0)
-
-        for replica_id,
-            if
-
-
-
-
-
-
-
-
-
-                info.status_property.sky_down_status = (
-                    ProcessStatus.FAILED)
-            else:
+        down_thread_pool_snapshot = list(self._down_thread_pool.items())
+        for replica_id, t in down_thread_pool_snapshot:
+            if t.is_alive():
+                continue
+            info = serve_state.get_replica_info_from_id(self._service_name,
+                                                        replica_id)
+            assert info is not None, replica_id
+            if (info.status_property.sky_down_status ==
+                    common_utils.ProcessStatus.SCHEDULED):
+                # sky.down not started yet
+                if controller_utils.can_terminate(self._is_pool):
+                    t.start()
                 info.status_property.sky_down_status = (
-                    ProcessStatus.
-                # Failed replica still count as a replica. In our current
-                # design, we want to fail early if user code have any error.
-                # This will prevent infinite loop of teardown and
-                # re-provision. However, there is a special case that if the
-                # replica is UP for longer than initial_delay_seconds, we
-                # assume it is just some random failure and we should restart
-                # the replica. Please refer to the implementation of
-                # `is_scale_down_succeeded` for more details.
-                # TODO(tian): Currently, restart replicas that failed within
-                # initial_delay_seconds is not supported. We should add it
-                # later when we support `sky serve update`.
-                removal_reason = None
-                if info.status_property.is_scale_down:
-                    # This means the cluster is deleted due to an autoscaler
-                    # decision or the cluster is recovering from preemption.
-                    # Delete the replica info so it won't count as a replica.
-                    if info.status_property.preempted:
-                        removal_reason = 'for preemption recovery'
-                    else:
-                        removal_reason = 'normally'
-                # Don't keep failed record for version mismatch replicas,
-                # since user should fixed the error before update.
-                elif info.version != self.latest_version:
-                    removal_reason = 'for version outdated'
-                elif info.status_property.purged:
-                    removal_reason = 'for purge'
-                elif info.status_property.failed_spot_availability:
-                    removal_reason = 'for spot availability failure'
-                else:
-                    logger.info(f'Termination of replica {replica_id} '
-                                'finished. Replica info is kept since some '
-                                'failure detected.')
+                    common_utils.ProcessStatus.RUNNING)
                 serve_state.add_or_update_replica(self._service_name,
                                                   replica_id, info)
-
-
-
-
+            else:
+                logger.info(
+                    f'Terminate thread for replica {replica_id} finished.')
+                self._down_thread_pool.pop(replica_id)
+                self._handle_sky_down_finish(info, format_exc=t.format_exc)
 
         # Clean old version
         replica_infos = serve_state.get_replica_infos(self._service_name)
@@ -1052,25 +1222,25 @@ class SkyPilotReplicaManager(ReplicaManager):
         if self.least_recent_version < current_least_recent_version:
             for version in range(self.least_recent_version,
                                  current_least_recent_version):
-
+                yaml_content = serve_utils.get_yaml_content(
                     self._service_name, version)
                 # Delete old version metadata.
                 serve_state.delete_version(self._service_name, version)
                 # Delete storage buckets of older versions.
-                service.cleanup_storage(
+                service.cleanup_storage(yaml_content)
             # newest version will be cleaned in serve down
             self.least_recent_version = current_least_recent_version
 
-    def
-        """Periodically refresh the launch/down
+    def _thread_pool_refresher(self) -> None:
+        """Periodically refresh the launch/down thread pool."""
         while True:
-            logger.debug('Refreshing
+            logger.debug('Refreshing thread pool.')
             try:
-                self.
+                self._refresh_thread_pool()
             except Exception as e: # pylint: disable=broad-except
                 # No matter what error happens, we should keep the
-                #
-                logger.error('Error in
+                # thread pool refresher running.
+                logger.error('Error in thread pool refresher: '
                              f'{common_utils.format_exception(e)}')
                 with ux_utils.enable_traceback():
                     logger.error(f' Traceback: {traceback.format_exc()}')
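`_thread_pool_refresher` above is a keep-alive loop: a single failed refresh is logged with its traceback and the loop simply moves on to the next iteration. A small, self-contained sketch of that pattern:

```python
import logging
import time
import traceback
from typing import Callable

logger = logging.getLogger(__name__)


def run_refresher_forever(refresh_once: Callable[[], None],
                          interval_seconds: float) -> None:
    """Call refresh_once periodically; never let one failure kill the loop."""
    while True:
        try:
            refresh_once()
        except Exception:  # pylint: disable=broad-except
            # No matter what error happens, keep the refresher running.
            logger.error('Refresh failed, will retry:\n%s',
                         traceback.format_exc())
        time.sleep(interval_seconds)
```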
@@ -1098,9 +1268,10 @@ class SkyPilotReplicaManager(ReplicaManager):
             handle = info.handle()
             assert handle is not None, info
             # Use None to fetch latest job, which stands for user task job
+            job_ids = [1] if self._is_pool else None
             try:
                 job_statuses = backend.get_job_status(handle,
-
+                                                      job_ids,
                                                       stream_logs=False)
             except exceptions.CommandError:
                 # If the job status fetch failed, it is likely that the
@@ -1110,7 +1281,8 @@ class SkyPilotReplicaManager(ReplicaManager):
                     continue
                 # Re-raise the exception if it is not preempted.
                 raise
-            job_status =
+            job_status = job_statuses[1] if self._is_pool else list(
+                job_statuses.values())[0]
             if job_status in job_lib.JobStatus.user_code_failure_states():
                 info.status_property.user_app_failed = True
                 serve_state.add_or_update_replica(self._service_name,
@@ -1154,18 +1326,24 @@ class SkyPilotReplicaManager(ReplicaManager):
             for info in infos:
                 if not info.status_property.should_track_service_status():
                     continue
-
-                f'replica_{info.replica_id}(
-
-                pool.apply_async(
-
-
-
-
-
-
-
-
+                if self._is_pool:
+                    replica_to_probe.append(f'replica_{info.replica_id}(cluster'
+                                            f'_name={info.cluster_name})')
+                    probe_futures.append(pool.apply_async(info.probe_pool))
+                else:
+                    replica_to_probe.append(
+                        f'replica_{info.replica_id}(url={info.url})')
+                    probe_futures.append(
+                        pool.apply_async(
+                            info.probe,
+                            (
+                                self._get_readiness_path(info.version),
+                                self._get_post_data(info.version),
+                                self._get_readiness_timeout_seconds(
+                                    info.version),
+                                self._get_readiness_headers(info.version),
+                            ),
+                        ),)
             logger.info(f'Replicas to probe: {", ".join(replica_to_probe)}')
 
             # Since futures.as_completed will return futures in the order of
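The probing branch above dispatches one asynchronous probe per replica via `pool.apply_async`, probing pools by cluster name and plain services by readiness URL. A hedged, self-contained sketch of that dispatch using `multiprocessing.pool.ThreadPool` with placeholder probe functions (the real probes live on `ReplicaInfo`, so every name below is illustrative):

```python
from multiprocessing import pool as mp_pool


def probe_http(url: str) -> bool:
    # Placeholder readiness check; the real probe issues an HTTP request
    # against the configured readiness path.
    return bool(url)


def probe_cluster(cluster_name: str) -> bool:
    # Placeholder pool-mode probe; the real probe checks the replica's job.
    return bool(cluster_name)


def dispatch_probes(replicas, is_pool: bool, workers: int = 8):
    """Submit one async probe per replica and collect the results."""
    with mp_pool.ThreadPool(processes=workers) as tp:
        if is_pool:
            futures = [
                tp.apply_async(probe_cluster, (r['cluster_name'],))
                for r in replicas
            ]
        else:
            futures = [
                tp.apply_async(probe_http, (r['url'],)) for r in replicas
            ]
        return [f.get() for f in futures]


if __name__ == '__main__':
    print(dispatch_probes([{'url': 'http://10.0.0.1:8080'}], is_pool=False))
```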
@@ -1202,8 +1380,9 @@ class SkyPilotReplicaManager(ReplicaManager):
                     consecutive_failure_time = (
                         info.consecutive_failure_times[-1] -
                         info.consecutive_failure_times[0])
-
-
+                    failure_threshold = (
+                        self._consecutive_failure_threshold_timeout())
+                    if consecutive_failure_time >= failure_threshold:
                         logger.info(
                             f'Replica {info.replica_id} is not ready for '
                             'too long and exceeding consecutive failure '
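The check above measures the window between the first and the most recent consecutive probe failures, and only tears the replica down once that window reaches the configured threshold. A small worked example with illustrative numbers:

```python
# Illustrative numbers only: four failed probes, threshold of 180 seconds.
consecutive_failure_times = [100.0, 130.0, 170.0, 260.0]  # probe timestamps
failure_threshold = 180.0

consecutive_failure_time = (consecutive_failure_times[-1] -
                            consecutive_failure_times[0])
print(consecutive_failure_time)                       # 160.0
print(consecutive_failure_time >= failure_threshold)  # False -> keep waiting
```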
@@ -1214,8 +1393,7 @@ class SkyPilotReplicaManager(ReplicaManager):
                             f'Replica {info.replica_id} is not ready '
                             'but within consecutive failure threshold '
                             f'({consecutive_failure_time}s / '
-                            f'{
-                            'Skipping.')
+                            f'{failure_threshold}s). Skipping.')
                     else:
                         initial_delay_seconds = self._get_initial_delay_seconds(
                             info.version)
@@ -1290,11 +1468,9 @@ class SkyPilotReplicaManager(ReplicaManager):
             logger.error(f'Invalid version: {version}, '
                          f'latest version: {self.latest_version}')
             return
-
-            self._service_name, version)
-        serve_state.add_or_update_version(self._service_name, version, spec)
+        yaml_content = serve_state.get_yaml_content(self._service_name, version)
         self.latest_version = version
-        self.
+        self.yaml_content = yaml_content
         self._update_mode = update_mode
 
         # Reuse all replicas that have the same config as the new version
@@ -1302,32 +1478,37 @@ class SkyPilotReplicaManager(ReplicaManager):
         # the latest version. This can significantly improve the speed
         # for updating an existing service with only config changes to the
         # service specs, e.g. scale down the service.
-        new_config =
+        new_config = yaml_utils.safe_load(yaml_content)
         # Always create new replicas and scale down old ones when file_mounts
         # are not empty.
         if new_config.get('file_mounts', None) != {}:
             return
-        for key in ['service']:
-            new_config.pop(key)
+        for key in ['service', 'pool', '_user_specified_yaml']:
+            new_config.pop(key, None)
+        new_config_any_of = new_config.get('resources', {}).pop('any_of', [])
+
         replica_infos = serve_state.get_replica_infos(self._service_name)
         for info in replica_infos:
             if info.version < version and not info.is_terminal:
                 # Assume user does not change the yaml file on the controller.
-
+                old_yaml_content = serve_state.get_yaml_content(
                     self._service_name, info.version)
-                old_config =
-
-
-                    old_config.pop(key)
+                old_config = yaml_utils.safe_load(old_yaml_content)
+                for key in ['service', 'pool', '_user_specified_yaml']:
+                    old_config.pop(key, None)
                 # Bump replica version if all fields except for service are
                 # the same.
                 # Here, we manually convert the any_of field to a set to avoid
                 # only the difference in the random order of the any_of fields.
                 old_config_any_of = old_config.get('resources',
                                                    {}).pop('any_of', [])
-
-
-
+
+                if (resources_utils.normalize_any_of_resources_config(
+                        old_config_any_of) != resources_utils.
+                        normalize_any_of_resources_config(new_config_any_of)):
+                    logger.info('Replica config changed (any_of), skipping. '
+                                f'old: {old_config_any_of}, '
+                                f'new: {new_config_any_of}')
                     continue
                 # File mounts should both be empty, as update always
                 # create new buckets if they are not empty.
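The reuse logic in this hunk strips service-level keys (`service`, `pool`, `_user_specified_yaml`), compares `any_of` after normalization so ordering differences are ignored, and only then compares the remaining config. A hedged sketch of that comparison; the normalization helper below is an illustrative stand-in for `resources_utils.normalize_any_of_resources_config`:

```python
import copy


def _normalize_any_of(entries):
    # Order-insensitive view of an any_of list of flat dicts (illustrative
    # stand-in for resources_utils.normalize_any_of_resources_config).
    return sorted(tuple(sorted(e.items())) for e in entries)


def config_matches(old_config: dict, new_config: dict) -> bool:
    """Return True if an old replica can be reused for the new version."""
    old_cfg, new_cfg = copy.deepcopy(old_config), copy.deepcopy(new_config)
    for key in ('service', 'pool', '_user_specified_yaml'):
        old_cfg.pop(key, None)
        new_cfg.pop(key, None)
    old_any_of = old_cfg.get('resources', {}).pop('any_of', [])
    new_any_of = new_cfg.get('resources', {}).pop('any_of', [])
    if _normalize_any_of(old_any_of) != _normalize_any_of(new_any_of):
        return False
    return old_cfg == new_cfg


if __name__ == '__main__':
    old = {'resources': {'cpus': 2,
                         'any_of': [{'cloud': 'aws'}, {'cloud': 'gcp'}]}}
    new = {'resources': {'cpus': 2,
                         'any_of': [{'cloud': 'gcp'}, {'cloud': 'aws'}]}}
    print(config_matches(old, new))  # True: only the any_of order differs.
```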
@@ -1341,6 +1522,10 @@ class SkyPilotReplicaManager(ReplicaManager):
                 info.version = version
                 serve_state.add_or_update_replica(self._service_name,
                                                   info.replica_id, info)
+            else:
+                logger.info('Replica config changed (rest), skipping. '
+                            f'old: {old_config}, '
+                            f'new: {new_config}')
 
     def _get_version_spec(self, version: int) -> 'service_spec.SkyServiceSpec':
         spec = serve_state.get_spec(self._service_name, version)