skypilot-nightly 1.0.0.dev20250905__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +10 -2
- sky/adaptors/aws.py +81 -16
- sky/adaptors/common.py +25 -2
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/do.py +8 -2
- sky/adaptors/gcp.py +11 -0
- sky/adaptors/ibm.py +5 -2
- sky/adaptors/kubernetes.py +64 -0
- sky/adaptors/nebius.py +3 -1
- sky/adaptors/primeintellect.py +1 -0
- sky/adaptors/seeweb.py +183 -0
- sky/adaptors/shadeform.py +89 -0
- sky/admin_policy.py +20 -0
- sky/authentication.py +157 -263
- sky/backends/__init__.py +3 -2
- sky/backends/backend.py +11 -3
- sky/backends/backend_utils.py +588 -184
- sky/backends/cloud_vm_ray_backend.py +1088 -904
- sky/backends/local_docker_backend.py +9 -5
- sky/backends/task_codegen.py +633 -0
- sky/backends/wheel_utils.py +18 -0
- sky/catalog/__init__.py +8 -0
- sky/catalog/aws_catalog.py +4 -0
- sky/catalog/common.py +19 -1
- sky/catalog/data_fetchers/fetch_aws.py +102 -80
- sky/catalog/data_fetchers/fetch_gcp.py +30 -3
- sky/catalog/data_fetchers/fetch_nebius.py +9 -6
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/catalog/kubernetes_catalog.py +24 -28
- sky/catalog/primeintellect_catalog.py +95 -0
- sky/catalog/runpod_catalog.py +5 -1
- sky/catalog/seeweb_catalog.py +184 -0
- sky/catalog/shadeform_catalog.py +165 -0
- sky/check.py +73 -43
- sky/client/cli/command.py +675 -412
- sky/client/cli/flags.py +4 -2
- sky/{volumes/utils.py → client/cli/table_utils.py} +111 -13
- sky/client/cli/utils.py +79 -0
- sky/client/common.py +12 -2
- sky/client/sdk.py +132 -63
- sky/client/sdk_async.py +34 -33
- sky/cloud_stores.py +82 -3
- sky/clouds/__init__.py +6 -0
- sky/clouds/aws.py +337 -129
- sky/clouds/azure.py +24 -18
- sky/clouds/cloud.py +40 -13
- sky/clouds/cudo.py +16 -13
- sky/clouds/do.py +9 -7
- sky/clouds/fluidstack.py +12 -5
- sky/clouds/gcp.py +14 -7
- sky/clouds/hyperbolic.py +12 -5
- sky/clouds/ibm.py +12 -5
- sky/clouds/kubernetes.py +80 -45
- sky/clouds/lambda_cloud.py +12 -5
- sky/clouds/nebius.py +23 -9
- sky/clouds/oci.py +19 -12
- sky/clouds/paperspace.py +4 -1
- sky/clouds/primeintellect.py +317 -0
- sky/clouds/runpod.py +85 -24
- sky/clouds/scp.py +12 -8
- sky/clouds/seeweb.py +477 -0
- sky/clouds/shadeform.py +400 -0
- sky/clouds/ssh.py +4 -2
- sky/clouds/utils/scp_utils.py +61 -50
- sky/clouds/vast.py +33 -27
- sky/clouds/vsphere.py +14 -16
- sky/core.py +174 -165
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
- sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
- sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
- sky/dashboard/out/_next/static/chunks/{6601-06114c982db410b6.js → 3800-7b45f9fbb6308557.js} +1 -1
- sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
- sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
- sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
- sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
- sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
- sky/dashboard/out/_next/static/chunks/pages/{_app-ce361c6959bc2001.js → _app-bde01e4a2beec258.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-6563820e094f68ca.js → [context]-c0b5935149902e6f.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{infra-aabba60d57826e0f.js → infra-aed0ea19df7cf961.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-af76bb06dbb3954f.js → [name]-84a40f8c7c627fe4.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{workspaces-7598c33a746cdc91.js → workspaces-531b2f8c4bf89f82.js} +1 -1
- sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
- sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/data_utils.py +92 -1
- sky/data/mounting_utils.py +162 -29
- sky/data/storage.py +200 -19
- sky/data/storage_utils.py +10 -45
- sky/exceptions.py +18 -7
- sky/execution.py +74 -31
- sky/global_user_state.py +605 -191
- sky/jobs/__init__.py +2 -0
- sky/jobs/client/sdk.py +101 -4
- sky/jobs/client/sdk_async.py +31 -5
- sky/jobs/constants.py +15 -8
- sky/jobs/controller.py +726 -284
- sky/jobs/file_content_utils.py +128 -0
- sky/jobs/log_gc.py +193 -0
- sky/jobs/recovery_strategy.py +250 -100
- sky/jobs/scheduler.py +271 -173
- sky/jobs/server/core.py +367 -114
- sky/jobs/server/server.py +81 -35
- sky/jobs/server/utils.py +89 -35
- sky/jobs/state.py +1498 -620
- sky/jobs/utils.py +771 -306
- sky/logs/agent.py +40 -5
- sky/logs/aws.py +9 -19
- sky/metrics/utils.py +282 -39
- sky/optimizer.py +1 -1
- sky/provision/__init__.py +37 -1
- sky/provision/aws/config.py +34 -13
- sky/provision/aws/instance.py +5 -2
- sky/provision/azure/instance.py +5 -3
- sky/provision/common.py +2 -0
- sky/provision/cudo/instance.py +4 -3
- sky/provision/do/instance.py +4 -3
- sky/provision/docker_utils.py +97 -26
- sky/provision/fluidstack/instance.py +6 -5
- sky/provision/gcp/config.py +6 -1
- sky/provision/gcp/instance.py +4 -2
- sky/provision/hyperbolic/instance.py +4 -2
- sky/provision/instance_setup.py +66 -20
- sky/provision/kubernetes/__init__.py +2 -0
- sky/provision/kubernetes/config.py +7 -44
- sky/provision/kubernetes/constants.py +0 -1
- sky/provision/kubernetes/instance.py +609 -213
- sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
- sky/provision/kubernetes/network.py +12 -8
- sky/provision/kubernetes/network_utils.py +8 -25
- sky/provision/kubernetes/utils.py +382 -418
- sky/provision/kubernetes/volume.py +150 -18
- sky/provision/lambda_cloud/instance.py +16 -13
- sky/provision/nebius/instance.py +6 -2
- sky/provision/nebius/utils.py +103 -86
- sky/provision/oci/instance.py +4 -2
- sky/provision/paperspace/instance.py +4 -3
- sky/provision/primeintellect/__init__.py +10 -0
- sky/provision/primeintellect/config.py +11 -0
- sky/provision/primeintellect/instance.py +454 -0
- sky/provision/primeintellect/utils.py +398 -0
- sky/provision/provisioner.py +30 -9
- sky/provision/runpod/__init__.py +2 -0
- sky/provision/runpod/instance.py +4 -3
- sky/provision/runpod/volume.py +69 -13
- sky/provision/scp/instance.py +307 -130
- sky/provision/seeweb/__init__.py +11 -0
- sky/provision/seeweb/config.py +13 -0
- sky/provision/seeweb/instance.py +812 -0
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/vast/instance.py +5 -3
- sky/provision/volume.py +164 -0
- sky/provision/vsphere/common/ssl_helper.py +1 -1
- sky/provision/vsphere/common/vapiconnect.py +2 -1
- sky/provision/vsphere/common/vim_utils.py +3 -2
- sky/provision/vsphere/instance.py +8 -6
- sky/provision/vsphere/vsphere_utils.py +8 -1
- sky/resources.py +11 -3
- sky/schemas/api/responses.py +107 -6
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
- sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
- sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
- sky/schemas/db/serve_state/002_yaml_content.py +34 -0
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/002_cluster_pool.py +3 -3
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
- sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
- sky/schemas/generated/jobsv1_pb2.py +86 -0
- sky/schemas/generated/jobsv1_pb2.pyi +254 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
- sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
- sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
- sky/schemas/generated/servev1_pb2.py +58 -0
- sky/schemas/generated/servev1_pb2.pyi +115 -0
- sky/schemas/generated/servev1_pb2_grpc.py +322 -0
- sky/serve/autoscalers.py +2 -0
- sky/serve/client/impl.py +55 -21
- sky/serve/constants.py +4 -3
- sky/serve/controller.py +17 -11
- sky/serve/load_balancing_policies.py +1 -1
- sky/serve/replica_managers.py +219 -142
- sky/serve/serve_rpc_utils.py +179 -0
- sky/serve/serve_state.py +63 -54
- sky/serve/serve_utils.py +145 -109
- sky/serve/server/core.py +46 -25
- sky/serve/server/impl.py +311 -162
- sky/serve/server/server.py +21 -19
- sky/serve/service.py +84 -68
- sky/serve/service_spec.py +45 -7
- sky/server/auth/loopback.py +38 -0
- sky/server/auth/oauth2_proxy.py +12 -7
- sky/server/common.py +47 -24
- sky/server/config.py +62 -28
- sky/server/constants.py +9 -1
- sky/server/daemons.py +109 -38
- sky/server/metrics.py +76 -96
- sky/server/middleware_utils.py +166 -0
- sky/server/requests/executor.py +381 -145
- sky/server/requests/payloads.py +71 -18
- sky/server/requests/preconditions.py +15 -13
- sky/server/requests/request_names.py +121 -0
- sky/server/requests/requests.py +507 -157
- sky/server/requests/serializers/decoders.py +48 -17
- sky/server/requests/serializers/encoders.py +85 -20
- sky/server/requests/threads.py +117 -0
- sky/server/rest.py +116 -24
- sky/server/server.py +420 -172
- sky/server/stream_utils.py +219 -45
- sky/server/uvicorn.py +30 -19
- sky/setup_files/MANIFEST.in +6 -1
- sky/setup_files/alembic.ini +8 -0
- sky/setup_files/dependencies.py +62 -19
- sky/setup_files/setup.py +44 -44
- sky/sky_logging.py +13 -5
- sky/skylet/attempt_skylet.py +106 -24
- sky/skylet/configs.py +3 -1
- sky/skylet/constants.py +111 -26
- sky/skylet/events.py +64 -10
- sky/skylet/job_lib.py +141 -104
- sky/skylet/log_lib.py +233 -5
- sky/skylet/log_lib.pyi +40 -2
- sky/skylet/providers/ibm/node_provider.py +12 -8
- sky/skylet/providers/ibm/vpc_provider.py +13 -12
- sky/skylet/runtime_utils.py +21 -0
- sky/skylet/services.py +524 -0
- sky/skylet/skylet.py +22 -1
- sky/skylet/subprocess_daemon.py +104 -29
- sky/skypilot_config.py +99 -79
- sky/ssh_node_pools/server.py +9 -8
- sky/task.py +221 -104
- sky/templates/aws-ray.yml.j2 +1 -0
- sky/templates/azure-ray.yml.j2 +1 -0
- sky/templates/cudo-ray.yml.j2 +1 -0
- sky/templates/do-ray.yml.j2 +1 -0
- sky/templates/fluidstack-ray.yml.j2 +1 -0
- sky/templates/gcp-ray.yml.j2 +1 -0
- sky/templates/hyperbolic-ray.yml.j2 +1 -0
- sky/templates/ibm-ray.yml.j2 +2 -1
- sky/templates/jobs-controller.yaml.j2 +3 -0
- sky/templates/kubernetes-ray.yml.j2 +196 -55
- sky/templates/lambda-ray.yml.j2 +1 -0
- sky/templates/nebius-ray.yml.j2 +3 -0
- sky/templates/oci-ray.yml.j2 +1 -0
- sky/templates/paperspace-ray.yml.j2 +1 -0
- sky/templates/primeintellect-ray.yml.j2 +72 -0
- sky/templates/runpod-ray.yml.j2 +1 -0
- sky/templates/scp-ray.yml.j2 +1 -0
- sky/templates/seeweb-ray.yml.j2 +171 -0
- sky/templates/shadeform-ray.yml.j2 +73 -0
- sky/templates/vast-ray.yml.j2 +1 -0
- sky/templates/vsphere-ray.yml.j2 +1 -0
- sky/templates/websocket_proxy.py +188 -43
- sky/usage/usage_lib.py +16 -4
- sky/users/permission.py +60 -43
- sky/utils/accelerator_registry.py +6 -3
- sky/utils/admin_policy_utils.py +18 -5
- sky/utils/annotations.py +22 -0
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/atomic.py +1 -1
- sky/utils/auth_utils.py +153 -0
- sky/utils/cli_utils/status_utils.py +12 -7
- sky/utils/cluster_utils.py +28 -6
- sky/utils/command_runner.py +88 -27
- sky/utils/command_runner.pyi +36 -3
- sky/utils/common.py +3 -1
- sky/utils/common_utils.py +37 -4
- sky/utils/config_utils.py +1 -14
- sky/utils/context.py +127 -40
- sky/utils/context_utils.py +73 -18
- sky/utils/controller_utils.py +229 -70
- sky/utils/db/db_utils.py +95 -18
- sky/utils/db/kv_cache.py +149 -0
- sky/utils/db/migration_utils.py +24 -7
- sky/utils/env_options.py +4 -0
- sky/utils/git.py +559 -1
- sky/utils/kubernetes/create_cluster.sh +15 -30
- sky/utils/kubernetes/delete_cluster.sh +10 -7
- sky/utils/kubernetes/{deploy_remote_cluster.py → deploy_ssh_node_pools.py} +258 -380
- sky/utils/kubernetes/generate_kind_config.py +6 -66
- sky/utils/kubernetes/gpu_labeler.py +13 -3
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
- sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
- sky/utils/kubernetes/kubernetes_deploy_utils.py +213 -194
- sky/utils/kubernetes/rsync_helper.sh +11 -3
- sky/utils/kubernetes_enums.py +7 -15
- sky/utils/lock_events.py +4 -4
- sky/utils/locks.py +128 -31
- sky/utils/log_utils.py +0 -319
- sky/utils/resource_checker.py +13 -10
- sky/utils/resources_utils.py +53 -29
- sky/utils/rich_utils.py +8 -4
- sky/utils/schemas.py +107 -52
- sky/utils/subprocess_utils.py +17 -4
- sky/utils/thread_utils.py +91 -0
- sky/utils/timeline.py +2 -1
- sky/utils/ux_utils.py +35 -1
- sky/utils/volume.py +88 -4
- sky/utils/yaml_utils.py +9 -0
- sky/volumes/client/sdk.py +48 -10
- sky/volumes/server/core.py +59 -22
- sky/volumes/server/server.py +46 -17
- sky/volumes/volume.py +54 -42
- sky/workspaces/core.py +57 -21
- sky/workspaces/server.py +13 -12
- sky_templates/README.md +3 -0
- sky_templates/__init__.py +3 -0
- sky_templates/ray/__init__.py +0 -0
- sky_templates/ray/start_cluster +183 -0
- sky_templates/ray/stop_cluster +75 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/METADATA +331 -65
- skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
- skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
- sky/client/cli/git.py +0 -549
- sky/dashboard/out/_next/static/chunks/1121-408ed10b2f9fce17.js +0 -1
- sky/dashboard/out/_next/static/chunks/1141-943efc7aff0f0c06.js +0 -1
- sky/dashboard/out/_next/static/chunks/1836-37fede578e2da5f8.js +0 -40
- sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +0 -1
- sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
- sky/dashboard/out/_next/static/chunks/3785.4872a2f3aa489880.js +0 -1
- sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +0 -21
- sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +0 -10
- sky/dashboard/out/_next/static/chunks/4725.10f7a9a5d3ea8208.js +0 -1
- sky/dashboard/out/_next/static/chunks/5339.3fda4a4010ff4e06.js +0 -51
- sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
- sky/dashboard/out/_next/static/chunks/649.b9d7f7d10c1b8c53.js +0 -45
- sky/dashboard/out/_next/static/chunks/6856-dca7962af4814e1b.js +0 -1
- sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
- sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +0 -6
- sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +0 -18
- sky/dashboard/out/_next/static/chunks/7669.1f5d9a402bf5cc42.js +0 -36
- sky/dashboard/out/_next/static/chunks/8969-0be3036bf86f8256.js +0 -1
- sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
- sky/dashboard/out/_next/static/chunks/9037-fa1737818d0a0969.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1cbba24bd1bd35f8.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-0b4b35dc1dfe046c.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters-469814d711d63b1b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-dd64309c3fe67ed2.js +0 -11
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-07349868f7905d37.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/jobs-1f70d9faa564804f.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/users-018bf31cda52e11b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/volumes-739726d6b823f532.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-4fe903277b57b523.js +0 -1
- sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
- sky/dashboard/out/_next/static/mS-4qZPSkRuA1u-g2wQhg/_buildManifest.js +0 -1
- sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
- skypilot_nightly-1.0.0.dev20250905.dist-info/RECORD +0 -547
- skypilot_nightly-1.0.0.dev20250905.dist-info/top_level.txt +0 -1
- /sky/dashboard/out/_next/static/{mS-4qZPSkRuA1u-g2wQhg → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
sky/jobs/recovery_strategy.py
CHANGED
@@ -5,25 +5,31 @@ In the YAML file, the user can specify the strategy to use for managed jobs.
     resources:
         job_recovery: EAGER_NEXT_REGION
 """
-import
+import asyncio
+import logging
+import os
 import traceback
 import typing
-from typing import Optional
+from typing import Optional, Set
 
 from sky import backends
 from sky import dag as dag_lib
 from sky import exceptions
-from sky import execution
 from sky import global_user_state
 from sky import sky_logging
+from sky import skypilot_config
 from sky.backends import backend_utils
+from sky.client import sdk
 from sky.jobs import scheduler
 from sky.jobs import state
 from sky.jobs import utils as managed_job_utils
 from sky.serve import serve_utils
+from sky.skylet import constants
 from sky.skylet import job_lib
 from sky.usage import usage_lib
 from sky.utils import common_utils
+from sky.utils import context_utils
+from sky.utils import env_options
 from sky.utils import registry
 from sky.utils import status_lib
 from sky.utils import ux_utils
@@ -41,7 +47,14 @@ MAX_JOB_CHECKING_RETRY = 10
 # Minutes to job cluster autodown. This should be significantly larger than
 # managed_job_utils.JOB_STATUS_CHECK_GAP_SECONDS, to avoid tearing down the
 # cluster before its status can be updated by the job controller.
-_AUTODOWN_MINUTES =
+_AUTODOWN_MINUTES = 10
+
+ENV_VARS_TO_CLEAR = [
+    skypilot_config.ENV_VAR_SKYPILOT_CONFIG,
+    constants.USER_ID_ENV_VAR,
+    constants.USER_ENV_VAR,
+    env_options.Options.SHOW_DEBUG_INFO.env_key,
+]
 
 
 class StrategyExecutor:
@@ -49,15 +62,31 @@ class StrategyExecutor:
 
     RETRY_INIT_GAP_SECONDS = 60
 
-    def __init__(
-
-
+    def __init__(
+        self,
+        cluster_name: Optional[str],
+        backend: 'backends.Backend',
+        task: 'task_lib.Task',
+        max_restarts_on_errors: int,
+        job_id: int,
+        task_id: int,
+        pool: Optional[str],
+        starting: Set[int],
+        starting_lock: asyncio.Lock,
+        starting_signal: asyncio.Condition,
+    ) -> None:
         """Initialize the strategy executor.
 
         Args:
            cluster_name: The name of the cluster.
            backend: The backend to use. Only CloudVMRayBackend is supported.
            task: The task to execute.
+           max_restarts_on_errors: Maximum number of restarts on errors.
+           job_id: The ID of the job.
+           task_id: The ID of the task.
+           starting: Set of job IDs that are currently starting.
+           starting_lock: Lock to synchronize starting jobs.
+           starting_signal: Condition to signal when a job can start.
         """
        assert isinstance(backend, backends.CloudVmRayBackend), (
            'Only CloudVMRayBackend is supported.')
@@ -75,11 +104,23 @@ class StrategyExecutor:
         self.pool = pool
         self.restart_cnt_on_failure = 0
         self.job_id_on_pool_cluster: Optional[int] = None
+        self.starting = starting
+        self.starting_lock = starting_lock
+        self.starting_signal = starting_signal
 
     @classmethod
-    def make(
-
-
+    def make(
+        cls,
+        cluster_name: Optional[str],
+        backend: 'backends.Backend',
+        task: 'task_lib.Task',
+        job_id: int,
+        task_id: int,
+        pool: Optional[str],
+        starting: Set[int],
+        starting_lock: asyncio.Lock,
+        starting_signal: asyncio.Condition,
+    ) -> 'StrategyExecutor':
         """Create a strategy from a task."""
 
         resource_list = list(task.resources)
@@ -111,9 +152,10 @@ class StrategyExecutor:
         assert job_recovery_strategy is not None, job_recovery_name
         return job_recovery_strategy(cluster_name, backend, task,
                                      max_restarts_on_errors, job_id, task_id,
-                                     pool
+                                     pool, starting, starting_lock,
+                                     starting_signal)
 
-    def launch(self) -> float:
+    async def launch(self) -> float:
         """Launch the cluster for the first time.
 
         It can fail if resource is not available. Need to check the cluster
@@ -125,11 +167,11 @@ class StrategyExecutor:
         Raises: Please refer to the docstring of self._launch().
         """
 
-        job_submit_at = self._launch(max_retry=None)
+        job_submit_at = await self._launch(max_retry=None)
         assert job_submit_at is not None
         return job_submit_at
 
-    def recover(self) -> float:
+    async def recover(self) -> float:
         """Relaunch the cluster after failure and wait until job starts.
 
         When recover() is called the cluster should be in STOPPED status (i.e.
@@ -139,13 +181,11 @@ class StrategyExecutor:
         """
         raise NotImplementedError
 
-    def _try_cancel_jobs(self):
-        from sky import core  # pylint: disable=import-outside-toplevel
-
+    async def _try_cancel_jobs(self):
         if self.cluster_name is None:
             return
-        handle =
-            self.cluster_name)
+        handle = await context_utils.to_thread(
+            global_user_state.get_handle_from_cluster_name, self.cluster_name)
         if handle is None or self.pool is not None:
             return
         try:
@@ -169,14 +209,26 @@ class StrategyExecutor:
             # should be functional with the `_try_cancel_if_cluster_is_init`
             # flag, i.e. it sends the cancel signal to the head node, which will
             # then kill the user process on remaining worker nodes.
-            # Only cancel the corresponding job for
+            # Only cancel the corresponding job for pool.
            if self.pool is None:
-
+                request_id = await context_utils.to_thread(
+                    sdk.cancel,
+                    cluster_name=self.cluster_name,
+                    all=True,
+                    _try_cancel_if_cluster_is_init=True,
+                )
            else:
-
-
-
-
+                request_id = await context_utils.to_thread(
+                    sdk.cancel,
+                    cluster_name=self.cluster_name,
+                    job_ids=[self.job_id_on_pool_cluster],
+                    _try_cancel_if_cluster_is_init=True,
+                )
+            logger.debug(f'sdk.cancel request ID: {request_id}')
+            await context_utils.to_thread(
+                sdk.get,
+                request_id,
+            )
         except Exception as e:  # pylint: disable=broad-except
             logger.info('Failed to cancel the job on the cluster. The cluster '
                         'might be already down or the head node is preempted.'
@@ -184,9 +236,9 @@ class StrategyExecutor:
                         f'{common_utils.format_exception(e)}\n'
                         'Terminating the cluster explicitly to ensure no '
                         'remaining job process interferes with recovery.')
-            self._cleanup_cluster
+            await context_utils.to_thread(self._cleanup_cluster)
 
-    def _wait_until_job_starts_on_cluster(self) -> Optional[float]:
+    async def _wait_until_job_starts_on_cluster(self) -> Optional[float]:
         """Wait for MAX_JOB_CHECKING_RETRY times until job starts on the cluster
 
         Returns:
@@ -200,10 +252,10 @@ class StrategyExecutor:
             # Avoid the infinite loop, if any bug happens.
             job_checking_retry_cnt += 1
             try:
-                cluster_status, _ = (
-                    backend_utils.refresh_cluster_status_handle
-
-
+                cluster_status, _ = (await context_utils.to_thread(
+                    backend_utils.refresh_cluster_status_handle,
+                    self.cluster_name,
+                    force_refresh_statuses=set(status_lib.ClusterStatus)))
             except Exception as e:  # pylint: disable=broad-except
                 # If any unexpected error happens, retry the job checking
                 # loop.
@@ -223,7 +275,7 @@ class StrategyExecutor:
                 break
 
             try:
-                status = managed_job_utils.get_job_status(
+                status = await managed_job_utils.get_job_status(
                     self.backend,
                     self.cluster_name,
                     job_id=self.job_id_on_pool_cluster)
@@ -241,7 +293,8 @@ class StrategyExecutor:
             # Check the job status until it is not in initialized status
             if status is not None and status > job_lib.JobStatus.INIT:
                 try:
-                    job_submitted_at =
+                    job_submitted_at = await context_utils.to_thread(
+                        managed_job_utils.get_job_timestamp,
                         self.backend,
                         self.cluster_name,
                         self.job_id_on_pool_cluster,
@@ -254,7 +307,8 @@ class StrategyExecutor:
                        'the job start timestamp. Retrying.')
                    continue
             # Wait for the job to be started
-
+            await asyncio.sleep(
+                managed_job_utils.JOB_STARTED_STATUS_CHECK_GAP_SECONDS)
         return None
 
     def _cleanup_cluster(self) -> None:
@@ -263,10 +317,10 @@ class StrategyExecutor:
         if self.pool is None:
             managed_job_utils.terminate_cluster(self.cluster_name)
 
-    def _launch(self,
-
-
-
+    async def _launch(self,
+                      max_retry: Optional[int] = 3,
+                      raise_on_failure: bool = True,
+                      recovery: bool = False) -> Optional[float]:
         """Implementation of launch().
 
         The function will wait until the job starts running, but will leave the
@@ -307,54 +361,132 @@ class StrategyExecutor:
         while True:
             retry_cnt += 1
             try:
-                with scheduler.scheduled_launch(
+                async with scheduler.scheduled_launch(
+                        self.job_id,
+                        self.starting,
+                        self.starting_lock,
+                        self.starting_signal,
+                ):
                     # The job state may have been PENDING during backoff -
                     # update to STARTING or RECOVERING.
                     # On the first attempt (when retry_cnt is 1), we should
                     # already be in STARTING or RECOVERING.
                     if retry_cnt > 1:
-                        state.
-
+                        await state.set_restarting_async(
+                            self.job_id, self.task_id, recovery)
                     try:
                         usage_lib.messages.usage.set_internal()
                         if self.pool is None:
                             assert self.cluster_name is not None
-
-                            #
-                            #
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+                            # sdk.launch will implicitly start the API server,
+                            # but then the API server will inherit the current
+                            # env vars/user, which we may not want.
+                            # Instead, clear env vars here and call api_start
+                            # explicitly.
+                            vars_to_restore = {}
+                            try:
+                                for env_var in ENV_VARS_TO_CLEAR:
+                                    vars_to_restore[env_var] = os.environ.pop(
+                                        env_var, None)
+                                    logger.debug('Cleared env var: '
+                                                 f'{env_var}')
+                                logger.debug('Env vars for api_start: '
+                                             f'{os.environ}')
+                                await context_utils.to_thread(sdk.api_start)
+                                logger.info('API server started.')
+                            finally:
+                                for env_var, value in vars_to_restore.items():
+                                    if value is not None:
+                                        logger.debug('Restored env var: '
+                                                     f'{env_var}: {value}')
+                                        os.environ[env_var] = value
+
+                            request_id = None
+                            try:
+                                request_id = await context_utils.to_thread(
+                                    sdk.launch,
+                                    self.dag,
+                                    cluster_name=self.cluster_name,
+                                    # We expect to tear down the cluster as soon
+                                    # as the job is finished. However, in case
+                                    # the controller dies, we may end up with a
+                                    # resource leak.
+                                    # Ideally, we should autodown to be safe,
+                                    # but it's fine to disable it for now, as
+                                    # Nebius doesn't support autodown yet.
+                                    # TODO(kevin): set down=True once Nebius
+                                    # supports autodown.
+                                    # idle_minutes_to_autostop=(
+                                    #     _AUTODOWN_MINUTES),
+                                    # down=True,
+                                    _is_launched_by_jobs_controller=True,
+                                )
+                                logger.debug('sdk.launch request ID: '
+                                             f'{request_id}')
+                                await context_utils.to_thread(
+                                    sdk.stream_and_get,
+                                    request_id,
+                                )
+                            except asyncio.CancelledError:
+                                if request_id:
+                                    req = await context_utils.to_thread(
+                                        sdk.api_cancel, request_id)
+                                    logger.debug('sdk.api_cancel request '
+                                                 f'ID: {req}')
+                                    try:
+                                        await context_utils.to_thread(
+                                            sdk.get, req)
+                                    except Exception as e:  # pylint: disable=broad-except
+                                        # we must still return a CancelledError
+                                        logger.error(
+                                            f'Failed to cancel the job: {e}')
+                                raise
+                            logger.info('Managed job cluster launched.')
                         else:
-                            self.cluster_name = (
-                                serve_utils.get_next_cluster_name
-
+                            self.cluster_name = await (context_utils.to_thread(
+                                serve_utils.get_next_cluster_name, self.pool,
+                                self.job_id))
                             if self.cluster_name is None:
                                 raise exceptions.NoClusterLaunchedError(
                                     'No cluster name found in the pool.')
-
-
+                            request_id = None
+                            try:
+                                request_id = await context_utils.to_thread(
+                                    sdk.exec,
+                                    self.dag,
+                                    cluster_name=self.cluster_name,
+                                )
+                                logger.debug('sdk.exec request ID: '
+                                             f'{request_id}')
+                                job_id_on_pool_cluster, _ = (
+                                    await context_utils.to_thread(
+                                        sdk.get, request_id))
+                            except asyncio.CancelledError:
+                                if request_id:
+                                    req = await context_utils.to_thread(
+                                        sdk.api_cancel, request_id)
+                                    logger.debug('sdk.api_cancel request '
+                                                 f'ID: {req}')
+                                    try:
+                                        await context_utils.to_thread(
+                                            sdk.get, req)
+                                    except Exception as e:  # pylint: disable=broad-except
+                                        # we must still return a CancelledError
+                                        logger.error(
+                                            f'Failed to cancel the job: {e}')
+                                raise
                             assert job_id_on_pool_cluster is not None, (
                                 self.cluster_name, self.job_id)
                             self.job_id_on_pool_cluster = job_id_on_pool_cluster
-                            state.
+                            await state.set_job_id_on_pool_cluster_async(
                                 self.job_id, job_id_on_pool_cluster)
                             logger.info('Managed job cluster launched.')
                     except (exceptions.InvalidClusterNameError,
                             exceptions.NoCloudAccessError,
-                            exceptions.ResourcesMismatchError
+                            exceptions.ResourcesMismatchError,
+                            exceptions.StorageSpecError,
+                            exceptions.StorageError) as e:
                         logger.error('Failure happened before provisioning. '
                                      f'{common_utils.format_exception(e)}')
                         if raise_on_failure:
@@ -405,7 +537,7 @@ class StrategyExecutor:
                 # At this point, a sky.launch() has succeeded. Cluster
                 # may be UP (no preemption since) or DOWN (newly
                 # preempted).
-                job_submitted_at = (
+                job_submitted_at = await (
                     self._wait_until_job_starts_on_cluster())
                 if job_submitted_at is not None:
                     return job_submitted_at
@@ -421,7 +553,7 @@ class StrategyExecutor:
 
                 # If we get here, the launch did not succeed. Tear down the
                 # cluster and retry.
-                self._cleanup_cluster
+                await context_utils.to_thread(self._cleanup_cluster)
                 if max_retry is not None and retry_cnt >= max_retry:
                     # Retry forever if max_retry is None.
                     if raise_on_failure:
@@ -444,15 +576,13 @@ class StrategyExecutor:
 
             except exceptions.NoClusterLaunchedError:
                 # Update the status to PENDING during backoff.
-                state.
+                await state.set_backoff_pending_async(self.job_id, self.task_id)
                 # Calculate the backoff time and sleep.
-                # We retry immediately for worker pool, since no sky.launch()
-                # is called and the overhead is minimal.
                 gap_seconds = (backoff.current_backoff()
                                if self.pool is None else 1)
                 logger.info('Retrying to launch the cluster in '
                             f'{gap_seconds:.1f} seconds.')
-
+                await asyncio.sleep(gap_seconds)
                 continue
             else:
                 # The inner loop should either return or throw
@@ -478,26 +608,38 @@ class FailoverStrategyExecutor(StrategyExecutor):
 
     _MAX_RETRY_CNT = 240  # Retry for 4 hours.
 
-    def __init__(
-
-
+    def __init__(
+        self,
+        cluster_name: Optional[str],
+        backend: 'backends.Backend',
+        task: 'task_lib.Task',
+        max_restarts_on_errors: int,
+        job_id: int,
+        task_id: int,
+        pool: Optional[str],
+        starting: Set[int],
+        starting_lock: asyncio.Lock,
+        starting_signal: asyncio.Condition,
+    ) -> None:
         super().__init__(cluster_name, backend, task, max_restarts_on_errors,
-                         job_id, task_id, pool
+                         job_id, task_id, pool, starting, starting_lock,
+                         starting_signal)
         # Note down the cloud/region of the launched cluster, so that we can
         # first retry in the same cloud/region. (Inside recover() we may not
         # rely on cluster handle, as it can be None if the cluster is
         # preempted.)
         self._launched_resources: Optional['resources.Resources'] = None
 
-    def _launch(self,
-
-
-
-        job_submitted_at = super()._launch(max_retry, raise_on_failure,
-
+    async def _launch(self,
+                      max_retry: Optional[int] = 3,
+                      raise_on_failure: bool = True,
+                      recovery: bool = False) -> Optional[float]:
+        job_submitted_at = await super()._launch(max_retry, raise_on_failure,
+                                                 recovery)
         if job_submitted_at is not None and self.cluster_name is not None:
             # Only record the cloud/region if the launch is successful.
-            handle =
+            handle = await context_utils.to_thread(
+                global_user_state.get_handle_from_cluster_name,
                 self.cluster_name)
             assert isinstance(handle, backends.CloudVmRayResourceHandle), (
                 'Cluster should be launched.', handle)
@@ -507,7 +649,7 @@ class FailoverStrategyExecutor(StrategyExecutor):
             self._launched_resources = None
         return job_submitted_at
 
-    def recover(self) -> float:
+    async def recover(self) -> float:
         # 1. Cancel the jobs and launch the cluster with the STOPPED status,
         # so that it will try on the current region first until timeout.
         # 2. Tear down the cluster, if the step 1 failed to launch the cluster.
@@ -515,7 +657,7 @@ class FailoverStrategyExecutor(StrategyExecutor):
         # original user specification.
 
         # Step 1
-        self._try_cancel_jobs()
+        await self._try_cancel_jobs()
 
         while True:
             # Add region constraint to the task, to retry on the same region
@@ -529,8 +671,8 @@ class FailoverStrategyExecutor(StrategyExecutor):
                     cloud=launched_cloud, region=launched_region, zone=None)
                 task.set_resources({new_resources})
                 # Not using self.launch to avoid the retry until up logic.
-                job_submitted_at = self._launch(raise_on_failure=False,
-
+                job_submitted_at = await self._launch(raise_on_failure=False,
+                                                      recovery=True)
                 # Restore the original dag, i.e. reset the region constraint.
                 task.set_resources(original_resources)
                 if job_submitted_at is not None:
@@ -539,21 +681,21 @@ class FailoverStrategyExecutor(StrategyExecutor):
             # Step 2
             logger.debug('Terminating unhealthy cluster and reset cloud '
                          'region.')
-            self._cleanup_cluster
+            await context_utils.to_thread(self._cleanup_cluster)
 
             # Step 3
             logger.debug('Relaunch the cluster without constraining to prior '
                          'cloud/region.')
             # Not using self.launch to avoid the retry until up logic.
-            job_submitted_at = self._launch(max_retry=self._MAX_RETRY_CNT,
-
-
+            job_submitted_at = await self._launch(max_retry=self._MAX_RETRY_CNT,
+                                                  raise_on_failure=False,
+                                                  recovery=True)
             if job_submitted_at is None:
                 # Failed to launch the cluster.
                 gap_seconds = self.RETRY_INIT_GAP_SECONDS
                 logger.info('Retrying to recover the cluster in '
                             f'{gap_seconds:.1f} seconds.')
-
+                await asyncio.sleep(gap_seconds)
                 continue
 
             return job_submitted_at
@@ -585,7 +727,7 @@ class EagerFailoverStrategyExecutor(FailoverStrategyExecutor):
              -> R1Z1 (success)
     """
 
-    def recover(self) -> float:
+    async def recover(self) -> float:
         # 1. Terminate the current cluster
         # 2. Launch again by explicitly blocking the previously launched region
         # (this will failover through the entire search space except the
@@ -598,7 +740,7 @@ class EagerFailoverStrategyExecutor(FailoverStrategyExecutor):
 
         # Step 1
         logger.debug('Terminating unhealthy cluster and reset cloud region.')
-        self._cleanup_cluster
+        await context_utils.to_thread(self._cleanup_cluster)
 
         # Step 2
         logger.debug('Relaunch the cluster skipping the previously launched '
@@ -619,8 +761,8 @@ class EagerFailoverStrategyExecutor(FailoverStrategyExecutor):
                     region=launched_region)
             }
             # Not using self.launch to avoid the retry until up logic.
-            job_submitted_at = self._launch(raise_on_failure=False,
-
+            job_submitted_at = await self._launch(raise_on_failure=False,
+                                                  recovery=True)
             task.blocked_resources = None
             if job_submitted_at is not None:
                 return job_submitted_at
@@ -630,15 +772,23 @@ class EagerFailoverStrategyExecutor(FailoverStrategyExecutor):
             logger.debug('Relaunch the cluster without constraining to prior '
                          'cloud/region.')
             # Not using self.launch to avoid the retry until up logic.
-            job_submitted_at = self._launch(max_retry=self._MAX_RETRY_CNT,
-
-
+            job_submitted_at = await self._launch(max_retry=self._MAX_RETRY_CNT,
+                                                  raise_on_failure=False,
+                                                  recovery=True)
            if job_submitted_at is None:
                # Failed to launch the cluster.
                gap_seconds = self.RETRY_INIT_GAP_SECONDS
                logger.info('Retrying to recover the cluster in '
                            f'{gap_seconds:.1f} seconds.')
-
+                await asyncio.sleep(gap_seconds)
                continue
 
            return job_submitted_at
+
+
+def _get_logger_file(file_logger: logging.Logger) -> Optional[str]:
+    """Gets the file path that the logger writes to."""
+    for handler in file_logger.handlers:
+        if isinstance(handler, logging.FileHandler):
+            return handler.baseFilename
+    return None