skypilot-nightly 1.0.0.dev20250905__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +10 -2
- sky/adaptors/aws.py +81 -16
- sky/adaptors/common.py +25 -2
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/do.py +8 -2
- sky/adaptors/gcp.py +11 -0
- sky/adaptors/ibm.py +5 -2
- sky/adaptors/kubernetes.py +64 -0
- sky/adaptors/nebius.py +3 -1
- sky/adaptors/primeintellect.py +1 -0
- sky/adaptors/seeweb.py +183 -0
- sky/adaptors/shadeform.py +89 -0
- sky/admin_policy.py +20 -0
- sky/authentication.py +157 -263
- sky/backends/__init__.py +3 -2
- sky/backends/backend.py +11 -3
- sky/backends/backend_utils.py +588 -184
- sky/backends/cloud_vm_ray_backend.py +1088 -904
- sky/backends/local_docker_backend.py +9 -5
- sky/backends/task_codegen.py +633 -0
- sky/backends/wheel_utils.py +18 -0
- sky/catalog/__init__.py +8 -0
- sky/catalog/aws_catalog.py +4 -0
- sky/catalog/common.py +19 -1
- sky/catalog/data_fetchers/fetch_aws.py +102 -80
- sky/catalog/data_fetchers/fetch_gcp.py +30 -3
- sky/catalog/data_fetchers/fetch_nebius.py +9 -6
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/catalog/kubernetes_catalog.py +24 -28
- sky/catalog/primeintellect_catalog.py +95 -0
- sky/catalog/runpod_catalog.py +5 -1
- sky/catalog/seeweb_catalog.py +184 -0
- sky/catalog/shadeform_catalog.py +165 -0
- sky/check.py +73 -43
- sky/client/cli/command.py +675 -412
- sky/client/cli/flags.py +4 -2
- sky/{volumes/utils.py → client/cli/table_utils.py} +111 -13
- sky/client/cli/utils.py +79 -0
- sky/client/common.py +12 -2
- sky/client/sdk.py +132 -63
- sky/client/sdk_async.py +34 -33
- sky/cloud_stores.py +82 -3
- sky/clouds/__init__.py +6 -0
- sky/clouds/aws.py +337 -129
- sky/clouds/azure.py +24 -18
- sky/clouds/cloud.py +40 -13
- sky/clouds/cudo.py +16 -13
- sky/clouds/do.py +9 -7
- sky/clouds/fluidstack.py +12 -5
- sky/clouds/gcp.py +14 -7
- sky/clouds/hyperbolic.py +12 -5
- sky/clouds/ibm.py +12 -5
- sky/clouds/kubernetes.py +80 -45
- sky/clouds/lambda_cloud.py +12 -5
- sky/clouds/nebius.py +23 -9
- sky/clouds/oci.py +19 -12
- sky/clouds/paperspace.py +4 -1
- sky/clouds/primeintellect.py +317 -0
- sky/clouds/runpod.py +85 -24
- sky/clouds/scp.py +12 -8
- sky/clouds/seeweb.py +477 -0
- sky/clouds/shadeform.py +400 -0
- sky/clouds/ssh.py +4 -2
- sky/clouds/utils/scp_utils.py +61 -50
- sky/clouds/vast.py +33 -27
- sky/clouds/vsphere.py +14 -16
- sky/core.py +174 -165
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
- sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
- sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
- sky/dashboard/out/_next/static/chunks/{6601-06114c982db410b6.js → 3800-7b45f9fbb6308557.js} +1 -1
- sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
- sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
- sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
- sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
- sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
- sky/dashboard/out/_next/static/chunks/pages/{_app-ce361c6959bc2001.js → _app-bde01e4a2beec258.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-6563820e094f68ca.js → [context]-c0b5935149902e6f.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{infra-aabba60d57826e0f.js → infra-aed0ea19df7cf961.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-af76bb06dbb3954f.js → [name]-84a40f8c7c627fe4.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{workspaces-7598c33a746cdc91.js → workspaces-531b2f8c4bf89f82.js} +1 -1
- sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
- sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/data_utils.py +92 -1
- sky/data/mounting_utils.py +162 -29
- sky/data/storage.py +200 -19
- sky/data/storage_utils.py +10 -45
- sky/exceptions.py +18 -7
- sky/execution.py +74 -31
- sky/global_user_state.py +605 -191
- sky/jobs/__init__.py +2 -0
- sky/jobs/client/sdk.py +101 -4
- sky/jobs/client/sdk_async.py +31 -5
- sky/jobs/constants.py +15 -8
- sky/jobs/controller.py +726 -284
- sky/jobs/file_content_utils.py +128 -0
- sky/jobs/log_gc.py +193 -0
- sky/jobs/recovery_strategy.py +250 -100
- sky/jobs/scheduler.py +271 -173
- sky/jobs/server/core.py +367 -114
- sky/jobs/server/server.py +81 -35
- sky/jobs/server/utils.py +89 -35
- sky/jobs/state.py +1498 -620
- sky/jobs/utils.py +771 -306
- sky/logs/agent.py +40 -5
- sky/logs/aws.py +9 -19
- sky/metrics/utils.py +282 -39
- sky/optimizer.py +1 -1
- sky/provision/__init__.py +37 -1
- sky/provision/aws/config.py +34 -13
- sky/provision/aws/instance.py +5 -2
- sky/provision/azure/instance.py +5 -3
- sky/provision/common.py +2 -0
- sky/provision/cudo/instance.py +4 -3
- sky/provision/do/instance.py +4 -3
- sky/provision/docker_utils.py +97 -26
- sky/provision/fluidstack/instance.py +6 -5
- sky/provision/gcp/config.py +6 -1
- sky/provision/gcp/instance.py +4 -2
- sky/provision/hyperbolic/instance.py +4 -2
- sky/provision/instance_setup.py +66 -20
- sky/provision/kubernetes/__init__.py +2 -0
- sky/provision/kubernetes/config.py +7 -44
- sky/provision/kubernetes/constants.py +0 -1
- sky/provision/kubernetes/instance.py +609 -213
- sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
- sky/provision/kubernetes/network.py +12 -8
- sky/provision/kubernetes/network_utils.py +8 -25
- sky/provision/kubernetes/utils.py +382 -418
- sky/provision/kubernetes/volume.py +150 -18
- sky/provision/lambda_cloud/instance.py +16 -13
- sky/provision/nebius/instance.py +6 -2
- sky/provision/nebius/utils.py +103 -86
- sky/provision/oci/instance.py +4 -2
- sky/provision/paperspace/instance.py +4 -3
- sky/provision/primeintellect/__init__.py +10 -0
- sky/provision/primeintellect/config.py +11 -0
- sky/provision/primeintellect/instance.py +454 -0
- sky/provision/primeintellect/utils.py +398 -0
- sky/provision/provisioner.py +30 -9
- sky/provision/runpod/__init__.py +2 -0
- sky/provision/runpod/instance.py +4 -3
- sky/provision/runpod/volume.py +69 -13
- sky/provision/scp/instance.py +307 -130
- sky/provision/seeweb/__init__.py +11 -0
- sky/provision/seeweb/config.py +13 -0
- sky/provision/seeweb/instance.py +812 -0
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/vast/instance.py +5 -3
- sky/provision/volume.py +164 -0
- sky/provision/vsphere/common/ssl_helper.py +1 -1
- sky/provision/vsphere/common/vapiconnect.py +2 -1
- sky/provision/vsphere/common/vim_utils.py +3 -2
- sky/provision/vsphere/instance.py +8 -6
- sky/provision/vsphere/vsphere_utils.py +8 -1
- sky/resources.py +11 -3
- sky/schemas/api/responses.py +107 -6
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
- sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
- sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
- sky/schemas/db/serve_state/002_yaml_content.py +34 -0
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/002_cluster_pool.py +3 -3
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
- sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
- sky/schemas/generated/jobsv1_pb2.py +86 -0
- sky/schemas/generated/jobsv1_pb2.pyi +254 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
- sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
- sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
- sky/schemas/generated/servev1_pb2.py +58 -0
- sky/schemas/generated/servev1_pb2.pyi +115 -0
- sky/schemas/generated/servev1_pb2_grpc.py +322 -0
- sky/serve/autoscalers.py +2 -0
- sky/serve/client/impl.py +55 -21
- sky/serve/constants.py +4 -3
- sky/serve/controller.py +17 -11
- sky/serve/load_balancing_policies.py +1 -1
- sky/serve/replica_managers.py +219 -142
- sky/serve/serve_rpc_utils.py +179 -0
- sky/serve/serve_state.py +63 -54
- sky/serve/serve_utils.py +145 -109
- sky/serve/server/core.py +46 -25
- sky/serve/server/impl.py +311 -162
- sky/serve/server/server.py +21 -19
- sky/serve/service.py +84 -68
- sky/serve/service_spec.py +45 -7
- sky/server/auth/loopback.py +38 -0
- sky/server/auth/oauth2_proxy.py +12 -7
- sky/server/common.py +47 -24
- sky/server/config.py +62 -28
- sky/server/constants.py +9 -1
- sky/server/daemons.py +109 -38
- sky/server/metrics.py +76 -96
- sky/server/middleware_utils.py +166 -0
- sky/server/requests/executor.py +381 -145
- sky/server/requests/payloads.py +71 -18
- sky/server/requests/preconditions.py +15 -13
- sky/server/requests/request_names.py +121 -0
- sky/server/requests/requests.py +507 -157
- sky/server/requests/serializers/decoders.py +48 -17
- sky/server/requests/serializers/encoders.py +85 -20
- sky/server/requests/threads.py +117 -0
- sky/server/rest.py +116 -24
- sky/server/server.py +420 -172
- sky/server/stream_utils.py +219 -45
- sky/server/uvicorn.py +30 -19
- sky/setup_files/MANIFEST.in +6 -1
- sky/setup_files/alembic.ini +8 -0
- sky/setup_files/dependencies.py +62 -19
- sky/setup_files/setup.py +44 -44
- sky/sky_logging.py +13 -5
- sky/skylet/attempt_skylet.py +106 -24
- sky/skylet/configs.py +3 -1
- sky/skylet/constants.py +111 -26
- sky/skylet/events.py +64 -10
- sky/skylet/job_lib.py +141 -104
- sky/skylet/log_lib.py +233 -5
- sky/skylet/log_lib.pyi +40 -2
- sky/skylet/providers/ibm/node_provider.py +12 -8
- sky/skylet/providers/ibm/vpc_provider.py +13 -12
- sky/skylet/runtime_utils.py +21 -0
- sky/skylet/services.py +524 -0
- sky/skylet/skylet.py +22 -1
- sky/skylet/subprocess_daemon.py +104 -29
- sky/skypilot_config.py +99 -79
- sky/ssh_node_pools/server.py +9 -8
- sky/task.py +221 -104
- sky/templates/aws-ray.yml.j2 +1 -0
- sky/templates/azure-ray.yml.j2 +1 -0
- sky/templates/cudo-ray.yml.j2 +1 -0
- sky/templates/do-ray.yml.j2 +1 -0
- sky/templates/fluidstack-ray.yml.j2 +1 -0
- sky/templates/gcp-ray.yml.j2 +1 -0
- sky/templates/hyperbolic-ray.yml.j2 +1 -0
- sky/templates/ibm-ray.yml.j2 +2 -1
- sky/templates/jobs-controller.yaml.j2 +3 -0
- sky/templates/kubernetes-ray.yml.j2 +196 -55
- sky/templates/lambda-ray.yml.j2 +1 -0
- sky/templates/nebius-ray.yml.j2 +3 -0
- sky/templates/oci-ray.yml.j2 +1 -0
- sky/templates/paperspace-ray.yml.j2 +1 -0
- sky/templates/primeintellect-ray.yml.j2 +72 -0
- sky/templates/runpod-ray.yml.j2 +1 -0
- sky/templates/scp-ray.yml.j2 +1 -0
- sky/templates/seeweb-ray.yml.j2 +171 -0
- sky/templates/shadeform-ray.yml.j2 +73 -0
- sky/templates/vast-ray.yml.j2 +1 -0
- sky/templates/vsphere-ray.yml.j2 +1 -0
- sky/templates/websocket_proxy.py +188 -43
- sky/usage/usage_lib.py +16 -4
- sky/users/permission.py +60 -43
- sky/utils/accelerator_registry.py +6 -3
- sky/utils/admin_policy_utils.py +18 -5
- sky/utils/annotations.py +22 -0
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/atomic.py +1 -1
- sky/utils/auth_utils.py +153 -0
- sky/utils/cli_utils/status_utils.py +12 -7
- sky/utils/cluster_utils.py +28 -6
- sky/utils/command_runner.py +88 -27
- sky/utils/command_runner.pyi +36 -3
- sky/utils/common.py +3 -1
- sky/utils/common_utils.py +37 -4
- sky/utils/config_utils.py +1 -14
- sky/utils/context.py +127 -40
- sky/utils/context_utils.py +73 -18
- sky/utils/controller_utils.py +229 -70
- sky/utils/db/db_utils.py +95 -18
- sky/utils/db/kv_cache.py +149 -0
- sky/utils/db/migration_utils.py +24 -7
- sky/utils/env_options.py +4 -0
- sky/utils/git.py +559 -1
- sky/utils/kubernetes/create_cluster.sh +15 -30
- sky/utils/kubernetes/delete_cluster.sh +10 -7
- sky/utils/kubernetes/{deploy_remote_cluster.py → deploy_ssh_node_pools.py} +258 -380
- sky/utils/kubernetes/generate_kind_config.py +6 -66
- sky/utils/kubernetes/gpu_labeler.py +13 -3
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
- sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
- sky/utils/kubernetes/kubernetes_deploy_utils.py +213 -194
- sky/utils/kubernetes/rsync_helper.sh +11 -3
- sky/utils/kubernetes_enums.py +7 -15
- sky/utils/lock_events.py +4 -4
- sky/utils/locks.py +128 -31
- sky/utils/log_utils.py +0 -319
- sky/utils/resource_checker.py +13 -10
- sky/utils/resources_utils.py +53 -29
- sky/utils/rich_utils.py +8 -4
- sky/utils/schemas.py +107 -52
- sky/utils/subprocess_utils.py +17 -4
- sky/utils/thread_utils.py +91 -0
- sky/utils/timeline.py +2 -1
- sky/utils/ux_utils.py +35 -1
- sky/utils/volume.py +88 -4
- sky/utils/yaml_utils.py +9 -0
- sky/volumes/client/sdk.py +48 -10
- sky/volumes/server/core.py +59 -22
- sky/volumes/server/server.py +46 -17
- sky/volumes/volume.py +54 -42
- sky/workspaces/core.py +57 -21
- sky/workspaces/server.py +13 -12
- sky_templates/README.md +3 -0
- sky_templates/__init__.py +3 -0
- sky_templates/ray/__init__.py +0 -0
- sky_templates/ray/start_cluster +183 -0
- sky_templates/ray/stop_cluster +75 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/METADATA +331 -65
- skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
- skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
- sky/client/cli/git.py +0 -549
- sky/dashboard/out/_next/static/chunks/1121-408ed10b2f9fce17.js +0 -1
- sky/dashboard/out/_next/static/chunks/1141-943efc7aff0f0c06.js +0 -1
- sky/dashboard/out/_next/static/chunks/1836-37fede578e2da5f8.js +0 -40
- sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +0 -1
- sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
- sky/dashboard/out/_next/static/chunks/3785.4872a2f3aa489880.js +0 -1
- sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +0 -21
- sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +0 -10
- sky/dashboard/out/_next/static/chunks/4725.10f7a9a5d3ea8208.js +0 -1
- sky/dashboard/out/_next/static/chunks/5339.3fda4a4010ff4e06.js +0 -51
- sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
- sky/dashboard/out/_next/static/chunks/649.b9d7f7d10c1b8c53.js +0 -45
- sky/dashboard/out/_next/static/chunks/6856-dca7962af4814e1b.js +0 -1
- sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
- sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +0 -6
- sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +0 -18
- sky/dashboard/out/_next/static/chunks/7669.1f5d9a402bf5cc42.js +0 -36
- sky/dashboard/out/_next/static/chunks/8969-0be3036bf86f8256.js +0 -1
- sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
- sky/dashboard/out/_next/static/chunks/9037-fa1737818d0a0969.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1cbba24bd1bd35f8.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-0b4b35dc1dfe046c.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters-469814d711d63b1b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-dd64309c3fe67ed2.js +0 -11
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-07349868f7905d37.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/jobs-1f70d9faa564804f.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/users-018bf31cda52e11b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/volumes-739726d6b823f532.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-4fe903277b57b523.js +0 -1
- sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
- sky/dashboard/out/_next/static/mS-4qZPSkRuA1u-g2wQhg/_buildManifest.js +0 -1
- sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
- skypilot_nightly-1.0.0.dev20250905.dist-info/RECORD +0 -547
- skypilot_nightly-1.0.0.dev20250905.dist-info/top_level.txt +0 -1
- /sky/dashboard/out/_next/static/{mS-4qZPSkRuA1u-g2wQhg → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
sky/serve/replica_managers.py
CHANGED
@@ -1,9 +1,9 @@
 """ReplicaManager: handles the creation and deletion of endpoint replicas."""
 import dataclasses
 import functools
-import multiprocessing
 from multiprocessing import pool as mp_pool
 import os
+import pathlib
 import threading
 import time
 import traceback
@@ -15,14 +15,12 @@ import filelock
 import requests
 
 from sky import backends
-from sky import core
 from sky import exceptions
-from sky import execution
 from sky import global_user_state
 from sky import sky_logging
 from sky import task as task_lib
 from sky.backends import backend_utils
-from sky.…
+from sky.client import sdk
 from sky.serve import constants as serve_constants
 from sky.serve import serve_state
 from sky.serve import serve_utils
@@ -32,14 +30,18 @@ from sky.skylet import constants
 from sky.skylet import job_lib
 from sky.usage import usage_lib
 from sky.utils import common_utils
+from sky.utils import context
 from sky.utils import controller_utils
 from sky.utils import env_options
 from sky.utils import resources_utils
 from sky.utils import status_lib
+from sky.utils import thread_utils
 from sky.utils import ux_utils
 from sky.utils import yaml_utils
 
 if typing.TYPE_CHECKING:
+    import logging
+
     from sky.serve import service_spec
 
 logger = sky_logging.init_logger(__name__)
@@ -48,6 +50,7 @@ _JOB_STATUS_FETCH_INTERVAL = 30
 _PROCESS_POOL_REFRESH_INTERVAL = 20
 _RETRY_INIT_GAP_SECONDS = 60
 _DEFAULT_DRAIN_SECONDS = 120
+_WAIT_LAUNCH_THREAD_TIMEOUT_SECONDS = 15
 
 # TODO(tian): Backward compatibility. Remove this after 3 minor release, i.e.
 # 0.13.0. We move the ProcessStatus to common_utils.ProcessStatus in #6666, but
@@ -59,9 +62,15 @@ ProcessStatus = common_utils.ProcessStatus
 
 # TODO(tian): Combine this with
 # sky/spot/recovery_strategy.py::StrategyExecutor::launch
+# Use context.contextual to enable per-launch output redirection.
+@context.contextual
 def launch_cluster(replica_id: int,
-                   service_task_yaml_path: str,
+                   yaml_content: str,
                    cluster_name: str,
+                   log_file: str,
+                   replica_to_request_id: thread_utils.ThreadSafeDict[int, str],
+                   replica_to_launch_cancelled: thread_utils.ThreadSafeDict[
+                       int, bool],
                    resources_override: Optional[Dict[str, Any]] = None,
                    retry_until_up: bool = True,
                    max_retry: int = 3) -> None:
@@ -75,14 +84,16 @@ def launch_cluster(replica_id: int,
         or some error happened before provisioning and will happen again
         if retry.
     """
+    ctx = context.get()
+    assert ctx is not None, 'Context is not initialized'
+    ctx.redirect_log(pathlib.Path(log_file))
+
     if resources_override is not None:
         logger.info(f'Scaling up replica (id: {replica_id}) cluster '
                     f'{cluster_name} with resources override: '
                     f'{resources_override}')
     try:
-        config = yaml_utils.read_yaml(
-            os.path.expanduser(service_task_yaml_path))
-        task = task_lib.Task.from_yaml_config(config)
+        task = task_lib.Task.from_yaml_str(yaml_content)
         if resources_override is not None:
             resources = task.resources
             overrided_resources = [
@@ -99,16 +110,31 @@ def launch_cluster(replica_id: int,
         raise RuntimeError(
             f'Failed to launch the sky serve replica cluster {cluster_name} '
             'due to failing to initialize sky.Task from yaml file.') from e
+
+    def _check_is_cancelled() -> bool:
+        is_cancelled = replica_to_launch_cancelled.get(replica_id, False)
+        if is_cancelled:
+            logger.info(f'Replica {replica_id} launch cancelled.')
+            # Pop the value to indicate that the signal was received.
+            replica_to_launch_cancelled.pop(replica_id)
+        return is_cancelled
+
     retry_cnt = 0
     backoff = common_utils.Backoff(_RETRY_INIT_GAP_SECONDS)
     while True:
         retry_cnt += 1
         try:
+            if _check_is_cancelled():
+                return
             usage_lib.messages.usage.set_internal()
-            execution.launch(task,
-                             cluster_name,
-                             retry_until_up=retry_until_up,
-                             _is_launched_by_sky_serve_controller=True)
+            request_id = sdk.launch(task,
+                                    cluster_name,
+                                    retry_until_up=retry_until_up,
+                                    _is_launched_by_sky_serve_controller=True)
+            logger.info(f'Replica cluster {cluster_name} launch requested '
+                        f'with request_id: {request_id}.')
+            replica_to_request_id[replica_id] = request_id
+            sdk.stream_and_get(request_id)
             logger.info(f'Replica cluster {cluster_name} launched.')
         except (exceptions.InvalidClusterNameError,
                 exceptions.NoCloudAccessError,
@@ -133,22 +159,44 @@ def launch_cluster(replica_id: int,
         else:  # No exception, the launch succeeds.
             return
 
-        terminate_cluster(cluster_name)
+        # Cleanup the request id and the failed cluster.
+        replica_to_request_id.pop(replica_id)
+        # If it is cancelled, no need to terminate the cluster. It will be
+        # handled by the termination thread.
+        if _check_is_cancelled():
+            return
+        terminate_cluster(cluster_name, log_file=log_file)
+
         if retry_cnt >= max_retry:
             raise RuntimeError('Failed to launch the sky serve replica cluster '
                                f'{cluster_name} after {max_retry} retries.')
+
         gap_seconds = backoff.current_backoff()
         logger.info('Retrying to launch the sky serve replica cluster '
                     f'in {gap_seconds:.1f} seconds.')
-        time.sleep(gap_seconds)
+        start_backoff = time.time()
+        # Check if it is cancelled every 0.1 seconds.
+        while time.time() - start_backoff < gap_seconds:
+            if _check_is_cancelled():
+                return
+            time.sleep(0.1)
 
 
 # TODO(tian): Combine this with
 # sky/spot/recovery_strategy.py::terminate_cluster
+@context.contextual
 def terminate_cluster(cluster_name: str,
+                      log_file: str,
                       replica_drain_delay_seconds: int = 0,
                       max_retry: int = 3) -> None:
     """Terminate the sky serve replica cluster."""
+    # Setup logging redirection.
+    ctx = context.get()
+    assert ctx is not None, 'Context is not initialized'
+    ctx.redirect_log(pathlib.Path(log_file))
+
+    logger.info(f'Terminating replica cluster {cluster_name} with '
+                f'replica_drain_delay_seconds: {replica_drain_delay_seconds}')
     time.sleep(replica_drain_delay_seconds)
     retry_cnt = 0
     backoff = common_utils.Backoff()
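Both launch_cluster and terminate_cluster now run as threads inside the controller process rather than as separate processes, so the old per-process output redirection is replaced with per-context log redirection. Below is a minimal sketch of that pattern, using only the calls visible in this diff (the actual implementation lives in sky/utils/context.py, which also changed in this release); replica_operation is a hypothetical stand-in for the two decorated functions above.

import pathlib

from sky.utils import context


@context.contextual  # Gives the decorated call its own context.
def replica_operation(log_file: str) -> None:
    ctx = context.get()
    assert ctx is not None, 'Context is not initialized'
    # From here on, log output for this context goes to log_file, so
    # concurrent launches in the same process do not interleave logs.
    ctx.redirect_log(pathlib.Path(log_file))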
@@ -156,7 +204,10 @@ def terminate_cluster(cluster_name: str,
         retry_cnt += 1
         try:
             usage_lib.messages.usage.set_internal()
-            core.down(cluster_name)
+            logger.info(f'Sending down request to cluster {cluster_name}')
+            request_id = sdk.down(cluster_name)
+            sdk.stream_and_get(request_id)
+            logger.info(f'Replica cluster {cluster_name} terminated.')
             return
         except ValueError:
             # The cluster is already terminated.
@@ -176,9 +227,9 @@ def terminate_cluster(cluster_name: str,
             time.sleep(gap_seconds)
 
 
-def _get_resources_ports(service_task_yaml_path: str) -> str:
+def _get_resources_ports(yaml_content: str) -> str:
     """Get the resources ports used by the task."""
-    task = task_lib.Task.from_yaml(service_task_yaml_path)
+    task = task_lib.Task.from_yaml_str(yaml_content)
     # Already checked all ports are valid in sky.serve.core.up
     assert task.resources, task
     assert task.service is not None, task
@@ -188,7 +239,7 @@ def _get_resources_ports(service_task_yaml_path: str) -> str:
     return task.service.ports
 
 
-def _should_use_spot(service_task_yaml_path: str,
+def _should_use_spot(yaml_content: str,
                      resource_override: Optional[Dict[str, Any]]) -> bool:
     """Get whether the task should use spot."""
     if resource_override is not None:
@@ -196,7 +247,7 @@ def _should_use_spot(service_task_yaml_path: str,
         if use_spot_override is not None:
             assert isinstance(use_spot_override, bool)
             return use_spot_override
-    task = task_lib.Task.from_yaml(service_task_yaml_path)
+    task = task_lib.Task.from_yaml_str(yaml_content)
    spot_use_resources = [
        resources for resources in task.resources if resources.use_spot
    ]
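Launching and tearing down replicas now goes through the client SDK's asynchronous request model instead of the removed in-process execution.launch/core.down calls. A minimal sketch of that request-id pattern, assuming a reachable SkyPilot API server and using only the SDK calls that appear in this diff (the cluster name is illustrative; the controller derives real replica cluster names from the service name and replica id):

from sky import task as task_lib
from sky.client import sdk

task = task_lib.Task(run='echo hello')
# Submission returns immediately with a request id...
request_id = sdk.launch(task, cluster_name='sky-serve-replica-1')
# ...which can be waited on (streaming logs), or cancelled from another
# thread via sdk.api_cancel(request_id).
sdk.stream_and_get(request_id)
# Teardown follows the same pattern.
sdk.stream_and_get(sdk.down('sky-serve-replica-1'))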
@@ -364,16 +415,16 @@ class ReplicaStatusProperty:
             return serve_state.ReplicaStatus.UNKNOWN
         if self.sky_launch_status == common_utils.ProcessStatus.FAILED:
             # sky.launch failed
-            # The down process has not been started if it reaches here,
+            # The down thread has not been started if it reaches here,
             # due to the `if self.sky_down_status is not None`` check above.
-            # However, it should have been started by _refresh_process_pool.
+            # However, it should have been started by _refresh_thread_pool.
             # If not started, this means some bug prevent sky.down from
             # executing. It is also a potential resource leak, so we mark
             # it as FAILED_CLEANUP.
             return serve_state.ReplicaStatus.FAILED_CLEANUP
         if self.user_app_failed:
             # Failed on user setup/run
-            # Same as above, the down process should have been started.
+            # Same as above, the down thread should have been started.
             return serve_state.ReplicaStatus.FAILED_CLEANUP
         if self.service_ready_now:
             # Service is ready
@@ -423,11 +474,12 @@ class ReplicaInfo:
         based on the cluster name.
         """
         if cluster_record is None:
-            cluster_record = global_user_state.get_cluster_from_name(
+            handle = global_user_state.get_handle_from_cluster_name(
                 self.cluster_name)
-        if cluster_record is None:
+        else:
+            handle = cluster_record['handle']
+        if handle is None:
             return None
-        handle = cluster_record['handle']
         assert isinstance(handle, backends.CloudVmRayResourceHandle)
         return handle
 
@@ -444,6 +496,12 @@ class ReplicaInfo:
         handle = self.handle()
         if handle is None:
             return None
+        if self.replica_port == '-':
+            # This is a pool replica so there is no endpoint and it's filled
+            # with this dummy value. We return None here so that we can
+            # get the active ready replicas and perform autoscaling. Otherwise,
+            # would error out when trying to get the endpoint.
+            return None
         replica_port_int = int(self.replica_port)
         try:
             endpoint_dict = backend_utils.get_endpoints(handle.cluster_name,
@@ -471,7 +529,7 @@ class ReplicaInfo:
                 with_handle: bool,
                 with_url: bool = True) -> Dict[str, Any]:
         cluster_record = global_user_state.get_cluster_from_name(
-            self.cluster_name)
+            self.cluster_name, include_user_info=False, summary_response=True)
         info_dict = {
             'replica_id': self.replica_id,
             'name': self.cluster_name,
@@ -489,8 +547,8 @@ class ReplicaInfo:
             info_dict['cloud'] = repr(handle.launched_resources.cloud)
             info_dict['region'] = handle.launched_resources.region
             info_dict['resources_str'] = (
-                resources_utils.get_readable_resources_repr(
-…
+                resources_utils.get_readable_resources_repr(
+                    handle, simplified_only=True)[0])
         return info_dict
 
     def __repr__(self) -> str:
@@ -619,8 +677,8 @@ class ReplicaInfo:
 class ReplicaManager:
     """Each replica manager monitors one service."""
 
-    def __init__(self, service_name: str,
-                 spec: 'service_spec.SkyServiceSpec') -> None:
+    def __init__(self, service_name: str, spec: 'service_spec.SkyServiceSpec',
+                 version: int) -> None:
         self.lock = threading.Lock()
         self._next_replica_id: int = 1
         self._service_name: str = service_name
@@ -636,9 +694,9 @@ class ReplicaManager:
                     f'Readiness header keys: {header_keys}')
 
         # Newest version among the currently provisioned and launched replicas
-        self.latest_version: int = serve_constants.INITIAL_VERSION
+        self.latest_version: int = version
         # Oldest version among the currently provisioned and launched replicas
-        self.least_recent_version: int = serve_constants.INITIAL_VERSION
+        self.least_recent_version: int = version
 
     def _consecutive_failure_threshold_timeout(self) -> int:
         """The timeout for the consecutive failure threshold in seconds.
@@ -674,8 +732,8 @@ class SkyPilotReplicaManager(ReplicaManager):
     """Replica Manager for SkyPilot clusters.
 
     It will run three daemon to monitor the status of the replicas:
-        (1) _process_pool_refresher: Refresh the launch/down process pool
-            to monitor the progress of the launch/down process.
+        (1) _thread_pool_refresher: Refresh the launch/down thread pool
+            to monitor the progress of the launch/down thread.
         (2) _job_status_fetcher: Fetch the job status of the service to
             monitor the status of the service jobs.
        (3) _replica_prober: Do readiness probe to the replicas to monitor
@@ -683,24 +741,24 @@ class SkyPilotReplicaManager(ReplicaManager):
     """
 
     def __init__(self, service_name: str, spec: 'service_spec.SkyServiceSpec',
-                 service_task_yaml_path: str) -> None:
-        super().__init__(service_name, spec)
-        self._service_task_yaml_path = service_task_yaml_path
-        task = task_lib.Task.from_yaml(service_task_yaml_path)
+                 version: int) -> None:
+        super().__init__(service_name, spec, version)
+        self.yaml_content = serve_state.get_yaml_content(service_name, version)
+        task = task_lib.Task.from_yaml_str(self.yaml_content)
         self._spot_placer: Optional[spot_placer.SpotPlacer] = (
             spot_placer.SpotPlacer.from_task(spec, task))
-        # TODO(tian): Store launch/down pid in the replica table, to make the
-        # manager more persistent.
-…
-…
-…
-…
-        self._launch_process_pool: serve_utils.ThreadSafeDict[
-            int, multiprocessing.Process] = serve_utils.ThreadSafeDict()
-        self._down_process_pool: serve_utils.ThreadSafeDict[
-            int, multiprocessing.Process] = serve_utils.ThreadSafeDict()
-
-        threading.Thread(target=self._process_pool_refresher).start()
+        # TODO(tian): Store launch/down request id in the replica table, to make
+        # the manager more persistent.
+        self._launch_thread_pool: thread_utils.ThreadSafeDict[
+            int, thread_utils.SafeThread] = thread_utils.ThreadSafeDict()
+        self._replica_to_request_id: thread_utils.ThreadSafeDict[
+            int, str] = thread_utils.ThreadSafeDict()
+        self._replica_to_launch_cancelled: thread_utils.ThreadSafeDict[
+            int, bool] = thread_utils.ThreadSafeDict()
+        self._down_thread_pool: thread_utils.ThreadSafeDict[
+            int, thread_utils.SafeThread] = thread_utils.ThreadSafeDict()
+
+        threading.Thread(target=self._thread_pool_refresher).start()
         threading.Thread(target=self._job_status_fetcher).start()
         threading.Thread(target=self._replica_prober).start()
 
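ThreadSafeDict and SafeThread come from the new sky/utils/thread_utils.py (added in this release; see the +91 lines in the file list above). Their implementation is not part of this diff; the following is a hypothetical minimal version consistent with how they are used here: a dict whose operations hold a mutex, and a thread that records the target's traceback in format_exc instead of raising.

# Hypothetical minimal versions, inferred from usage in this diff; the
# real implementations live in sky/utils/thread_utils.py.
import threading
import traceback
from typing import Any, Dict, Generic, Iterable, Optional, Tuple, TypeVar

_K = TypeVar('_K')
_V = TypeVar('_V')


class ThreadSafeDict(Generic[_K, _V]):
    """Dict wrapper whose operations are guarded by a mutex."""

    def __init__(self) -> None:
        self._dict: Dict[_K, _V] = {}
        self._lock = threading.Lock()

    def __setitem__(self, key: _K, value: _V) -> None:
        with self._lock:
            self._dict[key] = value

    def __getitem__(self, key: _K) -> _V:
        with self._lock:
            return self._dict[key]

    def __contains__(self, key: _K) -> bool:
        with self._lock:
            return key in self._dict

    def __bool__(self) -> bool:
        with self._lock:
            return bool(self._dict)

    def get(self, key: _K, default: Optional[_V] = None) -> Optional[_V]:
        with self._lock:
            return self._dict.get(key, default)

    def pop(self, key: _K, default: Optional[_V] = None) -> Optional[_V]:
        # Tolerates missing keys: launch_cluster and _refresh_thread_pool
        # may both pop the same request id.
        with self._lock:
            return self._dict.pop(key, default)

    def items(self) -> Iterable[Tuple[_K, _V]]:
        # Returns a snapshot, so callers can iterate without holding the lock.
        with self._lock:
            return list(self._dict.items())


class SafeThread(threading.Thread):
    """Thread that captures the target's exception instead of raising it."""

    def __init__(self, *args: Any, **kwargs: Any) -> None:
        super().__init__(*args, **kwargs)
        self.format_exc: Optional[str] = None

    def run(self) -> None:
        try:
            super().run()
        except Exception:  # pylint: disable=broad-except
            # Read by the manager, which logs it and marks the replica's
            # launch/down status as FAILED.
            self.format_exc = traceback.format_exc()

This mirrors how the manager uses them below: the pools are polled via items()/is_alive(), and a non-None format_exc is treated the way a non-zero process exit code was in the old multiprocessing-based code.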
@@ -710,14 +768,14 @@ class SkyPilotReplicaManager(ReplicaManager):
     def _recover_replica_operations(self):
         """Let's see are there something to do for ReplicaManager in a
         recovery run"""
-        assert (not self._launch_process_pool and not self._down_process_pool
-               ), 'We should not have any running …
+        assert (not self._launch_thread_pool and not self._down_thread_pool
+               ), 'We should not have any running threads in a recovery run'
 
         # There is a FIFO queue with capacity _MAX_NUM_LAUNCH for
         # _launch_replica.
         # We prioritize PROVISIONING replicas since they were previously
         # launched but may have been interrupted and need to be restarted.
-        # This is why we …
+        # This is why we handle PENDING replicas only after PROVISIONING
         # replicas.
         to_up_replicas = serve_state.get_replicas_at_status(
             self._service_name, serve_state.ReplicaStatus.PROVISIONING)
@@ -754,8 +812,8 @@ class SkyPilotReplicaManager(ReplicaManager):
         replica_id: int,
         resources_override: Optional[Dict[str, Any]] = None,
     ) -> None:
-        if replica_id in self._launch_process_pool:
-            logger.warning(f'Launch process for replica {replica_id} '
+        if replica_id in self._launch_thread_pool:
+            logger.warning(f'Launch thread for replica {replica_id} '
                            'already exists. Skipping.')
             return
         logger.info(f'Launching replica {replica_id}...')
@@ -763,8 +821,7 @@ class SkyPilotReplicaManager(ReplicaManager):
             self._service_name, replica_id)
         log_file_name = serve_utils.generate_replica_launch_log_file_name(
             self._service_name, replica_id)
-        use_spot = _should_use_spot(self._service_task_yaml_path,
-                                    resources_override)
+        use_spot = _should_use_spot(self.yaml_content, resources_override)
         retry_until_up = True
         location = None
         if use_spot and self._spot_placer is not None:
@@ -787,22 +844,21 @@ class SkyPilotReplicaManager(ReplicaManager):
             location = self._spot_placer.select_next_location(
                 current_spot_locations)
             resources_override.update(location.to_dict())
-        p = multiprocessing.Process(
-            target=…
-…
-…
-…
-…
-                  resources_override, retry_until_up),
+        t = thread_utils.SafeThread(
+            target=launch_cluster,
+            args=(replica_id, self.yaml_content, cluster_name, log_file_name,
+                  self._replica_to_request_id,
+                  self._replica_to_launch_cancelled, resources_override,
+                  retry_until_up),
         )
-        replica_port = _get_resources_ports(self._service_task_yaml_path)
+        replica_port = _get_resources_ports(self.yaml_content)
 
         info = ReplicaInfo(replica_id, cluster_name, replica_port, use_spot,
                            location, self.latest_version, resources_override)
         serve_state.add_or_update_replica(self._service_name, replica_id, info)
-        # Don't start right now; we will start it later in _refresh_process_pool
+        # Don't start right now; we will start it later in _refresh_thread_pool
         # to avoid too many sky.launch running at the same time.
-        self._launch_process_pool[replica_id] = p
+        self._launch_thread_pool[replica_id] = t
 
     @with_lock
     def scale_up(self,
@@ -810,10 +866,11 @@ class SkyPilotReplicaManager(ReplicaManager):
         self._launch_replica(self._next_replica_id, resources_override)
         self._next_replica_id += 1
 
-    def _handle_sky_down_finish(self, info: ReplicaInfo,
-…
-…
-…
+    def _handle_sky_down_finish(self, info: ReplicaInfo,
+                                format_exc: Optional[str]) -> None:
+        if format_exc is not None:
+            logger.error(f'Down thread for replica {info.replica_id} '
+                         f'exited abnormally with exception {format_exc}.')
             info.status_property.sky_down_status = (
                 common_utils.ProcessStatus.FAILED)
         else:
@@ -872,7 +929,7 @@ class SkyPilotReplicaManager(ReplicaManager):
                     'the logs should always be synced down. '
                     'So that the user can see the logs to debug.')
 
-        if replica_id in self._launch_process_pool:
+        if replica_id in self._launch_thread_pool:
             info = serve_state.get_replica_info_from_id(self._service_name,
                                                         replica_id)
             assert info is not None
@@ -880,17 +937,47 @@ class SkyPilotReplicaManager(ReplicaManager):
                 common_utils.ProcessStatus.INTERRUPTED)
             serve_state.add_or_update_replica(self._service_name, replica_id,
                                               info)
-…
-            if …
-…
-…
-…
-…
-…
-…
-…
-…
-…
+            launch_thread = self._launch_thread_pool[replica_id]
+            if launch_thread.is_alive():
+                self._replica_to_launch_cancelled[replica_id] = True
+                start_wait_time = time.time()
+                timeout_reached = False
+                while True:
+                    # Launch request id found. cancel it.
+                    if replica_id in self._replica_to_request_id:
+                        request_id = self._replica_to_request_id[replica_id]
+                        sdk.api_cancel(request_id)
+                        break
+                    if replica_id not in self._replica_to_launch_cancelled:
+                        # Indicates that the cancellation was received.
+                        break
+                    if not launch_thread.is_alive():
+                        # It's possible that the launch thread immediately
+                        # finished after we check. Exit the loop now.
+                        break
+                    if (time.time() - start_wait_time >
+                            _WAIT_LAUNCH_THREAD_TIMEOUT_SECONDS):
+                        timeout_reached = True
+                        break
+                    time.sleep(0.1)
+                if timeout_reached:
+                    logger.warning(
+                        'Failed to cancel launch request for replica '
+                        f'{replica_id} after '
+                        f'{_WAIT_LAUNCH_THREAD_TIMEOUT_SECONDS} seconds. '
+                        'Force waiting the launch thread to finish.')
+                else:
+                    logger.info('Interrupted launch thread for replica '
+                                f'{replica_id} and deleted the cluster.')
+                launch_thread.join()
+            else:
+                logger.info(f'Launch thread for replica {replica_id} '
+                            'already finished. Delete the cluster now.')
+            self._launch_thread_pool.pop(replica_id)
+            self._replica_to_request_id.pop(replica_id)
+
+        if replica_id in self._down_thread_pool:
+            logger.warning(f'Terminate thread for replica {replica_id} '
                            'already exists. Skipping.')
             return
 
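The cancellation handshake above needs no direct signal to the launch thread: _terminate_replica sets a flag in _replica_to_launch_cancelled, and the launch thread acknowledges by popping it (or the terminator finds a request id to sdk.api_cancel, or gives up after _WAIT_LAUNCH_THREAD_TIMEOUT_SECONDS). A self-contained sketch of the set-flag/pop-to-acknowledge part, with a plain dict and lock standing in for ThreadSafeDict:

import threading
import time

cancelled = {}  # replica_id -> bool
lock = threading.Lock()


def launch_worker(replica_id: int) -> None:
    """Simulated launch loop: polls the flag, pops it to acknowledge."""
    while True:
        with lock:
            if cancelled.get(replica_id, False):
                cancelled.pop(replica_id)  # Popping signals receipt.
                print(f'replica {replica_id}: launch cancelled')
                return
        time.sleep(0.1)  # One unit of simulated launch work.


t = threading.Thread(target=launch_worker, args=(1,))
t.start()
time.sleep(0.3)
with lock:
    cancelled[1] = True  # Terminator requests cancellation.
while True:  # Terminator waits for the acknowledgement (or a timeout).
    with lock:
        if 1 not in cancelled:
            break
    time.sleep(0.1)
t.join()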
@@ -955,22 +1042,22 @@ class SkyPilotReplicaManager(ReplicaManager):
         # If the cluster does not exist, it means either the cluster never
         # exists (e.g., the cluster is scaled down before it gets a chance to
         # provision) or the cluster is preempted and cleaned up by the status
-        # refresh. In this case, we skip spawning a new down process to save
+        # refresh. In this case, we skip spawning a new down thread to save
         # controller resources.
-        if global_user_state.…
-            self._handle_sky_down_finish(info, …
+        if not global_user_state.cluster_with_name_exists(info.cluster_name):
+            self._handle_sky_down_finish(info, format_exc=None)
             return
 
-        # Otherwise, start the process to terminate the cluster.
-        p = multiprocessing.Process(
-            target=…
-…
-…
+        # Otherwise, start the thread to terminate the cluster.
+        t = thread_utils.SafeThread(
+            target=terminate_cluster,
+            args=(info.cluster_name, log_file_name,
+                  replica_drain_delay_seconds),
         )
         info.status_property.sky_down_status = (
             common_utils.ProcessStatus.SCHEDULED)
         serve_state.add_or_update_replica(self._service_name, replica_id, info)
-        self._down_process_pool[replica_id] = p
+        self._down_thread_pool[replica_id] = t
 
     @with_lock
     def scale_down(self, replica_id: int, purge: bool = False) -> None:
@@ -1035,55 +1122,54 @@ class SkyPilotReplicaManager(ReplicaManager):
     #################################
 
     @with_lock
-    def _refresh_process_pool(self) -> None:
-        """Refresh the launch/down process pool.
+    def _refresh_thread_pool(self) -> None:
+        """Refresh the launch/down thread pool.
 
-        This function will checks all sky.launch and sky.down process on
+        This function will checks all sky.launch and sky.down thread on
         the fly. If any of them finished, it will update the status of the
         corresponding replica.
         """
         # To avoid `dictionary changed size during iteration` error.
-        launch_process_pool_snapshot = list(self._launch_process_pool.items())
-        for replica_id, p in launch_process_pool_snapshot:
-            if p.is_alive():
+        launch_thread_pool_snapshot = list(self._launch_thread_pool.items())
+        for replica_id, t in launch_thread_pool_snapshot:
+            if t.is_alive():
                 continue
             with filelock.FileLock(controller_utils.get_resources_lock_path()):
                 info = serve_state.get_replica_info_from_id(
                     self._service_name, replica_id)
                 assert info is not None, replica_id
                 error_in_sky_launch = False
-                schedule_next_jobs = False
                 if info.status == serve_state.ReplicaStatus.PENDING:
                     # sky.launch not started yet
-                    if controller_utils.can_provision():
-                        p.start()
+                    if controller_utils.can_provision(self._is_pool):
+                        t.start()
                         info.status_property.sky_launch_status = (
                             common_utils.ProcessStatus.RUNNING)
                 else:
                     # sky.launch finished
-                    # TODO(tian): Try-catch in process, and have an enum return
+                    # TODO(tian): Try-catch in thread, and have an enum return
                     # value to indicate which type of failure happened.
                     # Currently we only have user code failure since the
                     # retry_until_up flag is set to True, but it will be helpful
                     # when we enable user choose whether to retry or not.
                     logger.info(
-                        f'Launch process for replica {replica_id} finished.')
-                    self._launch_process_pool.pop(replica_id)
-                    if p.exitcode != 0:
+                        f'Launch thread for replica {replica_id} finished.')
+                    self._launch_thread_pool.pop(replica_id)
+                    self._replica_to_request_id.pop(replica_id)
+                    if t.format_exc is not None:
                         logger.warning(
-                            f'Launch process for replica {replica_id} '
-                            f'exited abnormally with …
-                            ' Terminating...')
+                            f'Launch thread for replica {replica_id} '
+                            f'exited abnormally with exception '
+                            f'{t.format_exc}. Terminating...')
                         info.status_property.sky_launch_status = (
                             common_utils.ProcessStatus.FAILED)
                         error_in_sky_launch = True
                     else:
                         info.status_property.sky_launch_status = (
                             common_utils.ProcessStatus.SUCCEEDED)
-                        schedule_next_jobs = True
                     if self._spot_placer is not None and info.is_spot:
                         # TODO(tian): Currently, we set the location to
-                        # preemptive if the launch process failed. This is
+                        # preemptive if the launch thread failed. This is
                         # because if the error is not related to the
                        # availability of the location, then all locations
                        # should failed for same reason. So it does not matter
@@ -1093,26 +1179,22 @@ class SkyPilotReplicaManager(ReplicaManager):
                         # availability of the location later.
                         location = info.get_spot_location()
                         assert location is not None
-                        if p.exitcode != 0:
+                        if t.format_exc is not None:
                             self._spot_placer.set_preemptive(location)
                             info.status_property.failed_spot_availability = True
                         else:
                             self._spot_placer.set_active(location)
                 serve_state.add_or_update_replica(self._service_name,
                                                   replica_id, info)
-                if schedule_next_jobs and self._is_pool:
-                    jobs_scheduler.maybe_schedule_next_jobs()
                 if error_in_sky_launch:
                     # Teardown after update replica info since
                     # _terminate_replica will update the replica info too.
                     self._terminate_replica(replica_id,
                                             sync_down_logs=True,
                                             replica_drain_delay_seconds=0)
-…
-…
-…
-        for replica_id, p in down_process_pool_snapshot:
-            if p.is_alive():
+        down_thread_pool_snapshot = list(self._down_thread_pool.items())
+        for replica_id, t in down_thread_pool_snapshot:
+            if t.is_alive():
                 continue
             info = serve_state.get_replica_info_from_id(self._service_name,
                                                         replica_id)
@@ -1120,17 +1202,17 @@ class SkyPilotReplicaManager(ReplicaManager):
             if (info.status_property.sky_down_status ==
                     common_utils.ProcessStatus.SCHEDULED):
                 # sky.down not started yet
-                if controller_utils.can_terminate():
-                    p.start()
+                if controller_utils.can_terminate(self._is_pool):
+                    t.start()
                     info.status_property.sky_down_status = (
                         common_utils.ProcessStatus.RUNNING)
                     serve_state.add_or_update_replica(self._service_name,
                                                       replica_id, info)
             else:
                 logger.info(
-                    f'Terminate process for replica {replica_id} finished.')
-                self._down_process_pool.pop(replica_id)
-                self._handle_sky_down_finish(info, …
+                    f'Terminate thread for replica {replica_id} finished.')
+                self._down_thread_pool.pop(replica_id)
+                self._handle_sky_down_finish(info, format_exc=t.format_exc)
 
         # Clean old version
         replica_infos = serve_state.get_replica_infos(self._service_name)
@@ -1140,25 +1222,25 @@ class SkyPilotReplicaManager(ReplicaManager):
         if self.least_recent_version < current_least_recent_version:
             for version in range(self.least_recent_version,
                                  current_least_recent_version):
-…
+                yaml_content = serve_utils.get_yaml_content(
                     self._service_name, version)
                 # Delete old version metadata.
                 serve_state.delete_version(self._service_name, version)
                 # Delete storage buckets of older versions.
-                service.cleanup_storage(…
+                service.cleanup_storage(yaml_content)
                 # newest version will be cleaned in serve down
             self.least_recent_version = current_least_recent_version
 
-    def _process_pool_refresher(self) -> None:
-        """Periodically refresh the launch/down process pool."""
+    def _thread_pool_refresher(self) -> None:
+        """Periodically refresh the launch/down thread pool."""
         while True:
-            logger.debug('Refreshing process pool.')
+            logger.debug('Refreshing thread pool.')
             try:
-                self._refresh_process_pool()
+                self._refresh_thread_pool()
             except Exception as e:  # pylint: disable=broad-except
                 # No matter what error happens, we should keep the
-                # process pool refresher running.
-                logger.error('Error in process pool refresher: '
+                # thread pool refresher running.
+                logger.error('Error in thread pool refresher: '
                              f'{common_utils.format_exception(e)}')
                 with ux_utils.enable_traceback():
                     logger.error(f'  Traceback: {traceback.format_exc()}')
@@ -1386,11 +1468,9 @@ class SkyPilotReplicaManager(ReplicaManager):
             logger.error(f'Invalid version: {version}, '
                          f'latest version: {self.latest_version}')
             return
-…
-            self._service_name, version)
-        serve_state.add_or_update_version(self._service_name, version, spec)
+        yaml_content = serve_state.get_yaml_content(self._service_name, version)
         self.latest_version = version
-        self.…
+        self.yaml_content = yaml_content
         self._update_mode = update_mode
 
         # Reuse all replicas that have the same config as the new version
@@ -1398,8 +1478,7 @@ class SkyPilotReplicaManager(ReplicaManager):
         # the latest version. This can significantly improve the speed
         # for updating an existing service with only config changes to the
         # service specs, e.g. scale down the service.
-        new_config = yaml_utils.read_yaml(
-            os.path.expanduser(service_task_yaml_path))
+        new_config = yaml_utils.safe_load(yaml_content)
         # Always create new replicas and scale down old ones when file_mounts
         # are not empty.
         if new_config.get('file_mounts', None) != {}:
@@ -1412,11 +1491,9 @@ class SkyPilotReplicaManager(ReplicaManager):
         for info in replica_infos:
             if info.version < version and not info.is_terminal:
                 # Assume user does not change the yaml file on the controller.
-…
-…
-…
-                old_config = yaml_utils.read_yaml(
-                    os.path.expanduser(old_service_task_yaml_path))
+                old_yaml_content = serve_state.get_yaml_content(
+                    self._service_name, info.version)
+                old_config = yaml_utils.safe_load(old_yaml_content)
                 for key in ['service', 'pool', '_user_specified_yaml']:
                     old_config.pop(key, None)
                 # Bump replica version if all fields except for service are
|