skypilot-nightly 1.0.0.dev20250502__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl
This diff compares the contents of two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their respective public registries.
- sky/__init__.py +22 -6
- sky/adaptors/aws.py +81 -16
- sky/adaptors/common.py +25 -2
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/do.py +8 -2
- sky/adaptors/gcp.py +11 -0
- sky/adaptors/hyperbolic.py +8 -0
- sky/adaptors/ibm.py +5 -2
- sky/adaptors/kubernetes.py +149 -18
- sky/adaptors/nebius.py +173 -30
- sky/adaptors/primeintellect.py +1 -0
- sky/adaptors/runpod.py +68 -0
- sky/adaptors/seeweb.py +183 -0
- sky/adaptors/shadeform.py +89 -0
- sky/admin_policy.py +187 -4
- sky/authentication.py +179 -225
- sky/backends/__init__.py +4 -2
- sky/backends/backend.py +22 -9
- sky/backends/backend_utils.py +1323 -397
- sky/backends/cloud_vm_ray_backend.py +1749 -1029
- sky/backends/docker_utils.py +1 -1
- sky/backends/local_docker_backend.py +11 -6
- sky/backends/task_codegen.py +633 -0
- sky/backends/wheel_utils.py +55 -9
- sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
- sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
- sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
- sky/{clouds/service_catalog → catalog}/common.py +90 -49
- sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
- sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +116 -80
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +70 -16
- sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
- sky/catalog/data_fetchers/fetch_nebius.py +338 -0
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
- sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
- sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
- sky/catalog/hyperbolic_catalog.py +136 -0
- sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
- sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
- sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
- sky/catalog/primeintellect_catalog.py +95 -0
- sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
- sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
- sky/catalog/seeweb_catalog.py +184 -0
- sky/catalog/shadeform_catalog.py +165 -0
- sky/catalog/ssh_catalog.py +167 -0
- sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
- sky/check.py +533 -185
- sky/cli.py +5 -5975
- sky/client/{cli.py → cli/command.py} +2591 -1956
- sky/client/cli/deprecation_utils.py +99 -0
- sky/client/cli/flags.py +359 -0
- sky/client/cli/table_utils.py +322 -0
- sky/client/cli/utils.py +79 -0
- sky/client/common.py +78 -32
- sky/client/oauth.py +82 -0
- sky/client/sdk.py +1219 -319
- sky/client/sdk_async.py +827 -0
- sky/client/service_account_auth.py +47 -0
- sky/cloud_stores.py +82 -3
- sky/clouds/__init__.py +13 -0
- sky/clouds/aws.py +564 -164
- sky/clouds/azure.py +105 -83
- sky/clouds/cloud.py +140 -40
- sky/clouds/cudo.py +68 -50
- sky/clouds/do.py +66 -48
- sky/clouds/fluidstack.py +63 -44
- sky/clouds/gcp.py +339 -110
- sky/clouds/hyperbolic.py +293 -0
- sky/clouds/ibm.py +70 -49
- sky/clouds/kubernetes.py +570 -162
- sky/clouds/lambda_cloud.py +74 -54
- sky/clouds/nebius.py +210 -81
- sky/clouds/oci.py +88 -66
- sky/clouds/paperspace.py +61 -44
- sky/clouds/primeintellect.py +317 -0
- sky/clouds/runpod.py +164 -74
- sky/clouds/scp.py +89 -86
- sky/clouds/seeweb.py +477 -0
- sky/clouds/shadeform.py +400 -0
- sky/clouds/ssh.py +263 -0
- sky/clouds/utils/aws_utils.py +10 -4
- sky/clouds/utils/gcp_utils.py +87 -11
- sky/clouds/utils/oci_utils.py +38 -14
- sky/clouds/utils/scp_utils.py +231 -167
- sky/clouds/vast.py +99 -77
- sky/clouds/vsphere.py +51 -40
- sky/core.py +375 -173
- sky/dag.py +15 -0
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
- sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
- sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
- sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
- sky/dashboard/out/_next/static/chunks/3800-7b45f9fbb6308557.js +1 -0
- sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
- sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
- sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
- sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
- sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
- sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
- sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
- sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
- sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
- sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
- sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
- sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
- sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
- sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
- sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
- sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-84a40f8c7c627fe4.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces-531b2f8c4bf89f82.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
- sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -0
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -0
- sky/dashboard/out/infra.html +1 -0
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -0
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -0
- sky/dashboard/out/volumes.html +1 -0
- sky/dashboard/out/workspace/new.html +1 -0
- sky/dashboard/out/workspaces/[name].html +1 -0
- sky/dashboard/out/workspaces.html +1 -0
- sky/data/data_utils.py +137 -1
- sky/data/mounting_utils.py +269 -84
- sky/data/storage.py +1460 -1807
- sky/data/storage_utils.py +43 -57
- sky/exceptions.py +126 -2
- sky/execution.py +216 -63
- sky/global_user_state.py +2390 -586
- sky/jobs/__init__.py +7 -0
- sky/jobs/client/sdk.py +300 -58
- sky/jobs/client/sdk_async.py +161 -0
- sky/jobs/constants.py +15 -8
- sky/jobs/controller.py +848 -275
- sky/jobs/file_content_utils.py +128 -0
- sky/jobs/log_gc.py +193 -0
- sky/jobs/recovery_strategy.py +402 -152
- sky/jobs/scheduler.py +314 -189
- sky/jobs/server/core.py +836 -255
- sky/jobs/server/server.py +156 -115
- sky/jobs/server/utils.py +136 -0
- sky/jobs/state.py +2109 -706
- sky/jobs/utils.py +1306 -215
- sky/logs/__init__.py +21 -0
- sky/logs/agent.py +108 -0
- sky/logs/aws.py +243 -0
- sky/logs/gcp.py +91 -0
- sky/metrics/__init__.py +0 -0
- sky/metrics/utils.py +453 -0
- sky/models.py +78 -1
- sky/optimizer.py +164 -70
- sky/provision/__init__.py +90 -4
- sky/provision/aws/config.py +147 -26
- sky/provision/aws/instance.py +136 -50
- sky/provision/azure/instance.py +11 -6
- sky/provision/common.py +13 -1
- sky/provision/cudo/cudo_machine_type.py +1 -1
- sky/provision/cudo/cudo_utils.py +14 -8
- sky/provision/cudo/cudo_wrapper.py +72 -71
- sky/provision/cudo/instance.py +10 -6
- sky/provision/do/instance.py +10 -6
- sky/provision/do/utils.py +4 -3
- sky/provision/docker_utils.py +140 -33
- sky/provision/fluidstack/instance.py +13 -8
- sky/provision/gcp/__init__.py +1 -0
- sky/provision/gcp/config.py +301 -19
- sky/provision/gcp/constants.py +218 -0
- sky/provision/gcp/instance.py +36 -8
- sky/provision/gcp/instance_utils.py +18 -4
- sky/provision/gcp/volume_utils.py +247 -0
- sky/provision/hyperbolic/__init__.py +12 -0
- sky/provision/hyperbolic/config.py +10 -0
- sky/provision/hyperbolic/instance.py +437 -0
- sky/provision/hyperbolic/utils.py +373 -0
- sky/provision/instance_setup.py +101 -20
- sky/provision/kubernetes/__init__.py +5 -0
- sky/provision/kubernetes/config.py +9 -52
- sky/provision/kubernetes/constants.py +17 -0
- sky/provision/kubernetes/instance.py +919 -280
- sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
- sky/provision/kubernetes/network.py +27 -17
- sky/provision/kubernetes/network_utils.py +44 -43
- sky/provision/kubernetes/utils.py +1221 -534
- sky/provision/kubernetes/volume.py +343 -0
- sky/provision/lambda_cloud/instance.py +22 -16
- sky/provision/nebius/constants.py +50 -0
- sky/provision/nebius/instance.py +19 -6
- sky/provision/nebius/utils.py +237 -137
- sky/provision/oci/instance.py +10 -5
- sky/provision/paperspace/instance.py +10 -7
- sky/provision/paperspace/utils.py +1 -1
- sky/provision/primeintellect/__init__.py +10 -0
- sky/provision/primeintellect/config.py +11 -0
- sky/provision/primeintellect/instance.py +454 -0
- sky/provision/primeintellect/utils.py +398 -0
- sky/provision/provisioner.py +117 -36
- sky/provision/runpod/__init__.py +5 -0
- sky/provision/runpod/instance.py +27 -6
- sky/provision/runpod/utils.py +51 -18
- sky/provision/runpod/volume.py +214 -0
- sky/provision/scp/__init__.py +15 -0
- sky/provision/scp/config.py +93 -0
- sky/provision/scp/instance.py +707 -0
- sky/provision/seeweb/__init__.py +11 -0
- sky/provision/seeweb/config.py +13 -0
- sky/provision/seeweb/instance.py +812 -0
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/ssh/__init__.py +18 -0
- sky/provision/vast/instance.py +13 -8
- sky/provision/vast/utils.py +10 -7
- sky/provision/volume.py +164 -0
- sky/provision/vsphere/common/ssl_helper.py +1 -1
- sky/provision/vsphere/common/vapiconnect.py +2 -1
- sky/provision/vsphere/common/vim_utils.py +4 -4
- sky/provision/vsphere/instance.py +15 -10
- sky/provision/vsphere/vsphere_utils.py +17 -20
- sky/py.typed +0 -0
- sky/resources.py +845 -119
- sky/schemas/__init__.py +0 -0
- sky/schemas/api/__init__.py +0 -0
- sky/schemas/api/responses.py +227 -0
- sky/schemas/db/README +4 -0
- sky/schemas/db/env.py +90 -0
- sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
- sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
- sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
- sky/schemas/db/global_user_state/004_is_managed.py +34 -0
- sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
- sky/schemas/db/global_user_state/006_provision_log.py +41 -0
- sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
- sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
- sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
- sky/schemas/db/script.py.mako +28 -0
- sky/schemas/db/serve_state/001_initial_schema.py +67 -0
- sky/schemas/db/serve_state/002_yaml_content.py +34 -0
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
- sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
- sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
- sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
- sky/schemas/generated/__init__.py +0 -0
- sky/schemas/generated/autostopv1_pb2.py +36 -0
- sky/schemas/generated/autostopv1_pb2.pyi +43 -0
- sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
- sky/schemas/generated/jobsv1_pb2.py +86 -0
- sky/schemas/generated/jobsv1_pb2.pyi +254 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
- sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
- sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
- sky/schemas/generated/servev1_pb2.py +58 -0
- sky/schemas/generated/servev1_pb2.pyi +115 -0
- sky/schemas/generated/servev1_pb2_grpc.py +322 -0
- sky/serve/autoscalers.py +357 -5
- sky/serve/client/impl.py +310 -0
- sky/serve/client/sdk.py +47 -139
- sky/serve/client/sdk_async.py +130 -0
- sky/serve/constants.py +12 -9
- sky/serve/controller.py +68 -17
- sky/serve/load_balancer.py +106 -60
- sky/serve/load_balancing_policies.py +116 -2
- sky/serve/replica_managers.py +434 -249
- sky/serve/serve_rpc_utils.py +179 -0
- sky/serve/serve_state.py +569 -257
- sky/serve/serve_utils.py +775 -265
- sky/serve/server/core.py +66 -711
- sky/serve/server/impl.py +1093 -0
- sky/serve/server/server.py +21 -18
- sky/serve/service.py +192 -89
- sky/serve/service_spec.py +144 -20
- sky/serve/spot_placer.py +3 -0
- sky/server/auth/__init__.py +0 -0
- sky/server/auth/authn.py +50 -0
- sky/server/auth/loopback.py +38 -0
- sky/server/auth/oauth2_proxy.py +202 -0
- sky/server/common.py +478 -182
- sky/server/config.py +85 -23
- sky/server/constants.py +44 -6
- sky/server/daemons.py +295 -0
- sky/server/html/token_page.html +185 -0
- sky/server/metrics.py +160 -0
- sky/server/middleware_utils.py +166 -0
- sky/server/requests/executor.py +558 -138
- sky/server/requests/payloads.py +364 -24
- sky/server/requests/preconditions.py +21 -17
- sky/server/requests/process.py +112 -29
- sky/server/requests/request_names.py +121 -0
- sky/server/requests/requests.py +822 -226
- sky/server/requests/serializers/decoders.py +82 -31
- sky/server/requests/serializers/encoders.py +140 -22
- sky/server/requests/threads.py +117 -0
- sky/server/rest.py +455 -0
- sky/server/server.py +1309 -285
- sky/server/state.py +20 -0
- sky/server/stream_utils.py +327 -61
- sky/server/uvicorn.py +217 -3
- sky/server/versions.py +270 -0
- sky/setup_files/MANIFEST.in +11 -1
- sky/setup_files/alembic.ini +160 -0
- sky/setup_files/dependencies.py +139 -31
- sky/setup_files/setup.py +44 -42
- sky/sky_logging.py +114 -7
- sky/skylet/attempt_skylet.py +106 -24
- sky/skylet/autostop_lib.py +129 -8
- sky/skylet/configs.py +29 -20
- sky/skylet/constants.py +216 -25
- sky/skylet/events.py +101 -21
- sky/skylet/job_lib.py +345 -164
- sky/skylet/log_lib.py +297 -18
- sky/skylet/log_lib.pyi +44 -1
- sky/skylet/providers/ibm/node_provider.py +12 -8
- sky/skylet/providers/ibm/vpc_provider.py +13 -12
- sky/skylet/ray_patches/__init__.py +17 -3
- sky/skylet/ray_patches/autoscaler.py.diff +18 -0
- sky/skylet/ray_patches/cli.py.diff +19 -0
- sky/skylet/ray_patches/command_runner.py.diff +17 -0
- sky/skylet/ray_patches/log_monitor.py.diff +20 -0
- sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
- sky/skylet/ray_patches/updater.py.diff +18 -0
- sky/skylet/ray_patches/worker.py.diff +41 -0
- sky/skylet/runtime_utils.py +21 -0
- sky/skylet/services.py +568 -0
- sky/skylet/skylet.py +72 -4
- sky/skylet/subprocess_daemon.py +104 -29
- sky/skypilot_config.py +506 -99
- sky/ssh_node_pools/__init__.py +1 -0
- sky/ssh_node_pools/core.py +135 -0
- sky/ssh_node_pools/server.py +233 -0
- sky/task.py +685 -163
- sky/templates/aws-ray.yml.j2 +11 -3
- sky/templates/azure-ray.yml.j2 +2 -1
- sky/templates/cudo-ray.yml.j2 +1 -0
- sky/templates/do-ray.yml.j2 +2 -1
- sky/templates/fluidstack-ray.yml.j2 +1 -0
- sky/templates/gcp-ray.yml.j2 +62 -1
- sky/templates/hyperbolic-ray.yml.j2 +68 -0
- sky/templates/ibm-ray.yml.j2 +2 -1
- sky/templates/jobs-controller.yaml.j2 +27 -24
- sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
- sky/templates/kubernetes-ray.yml.j2 +611 -50
- sky/templates/lambda-ray.yml.j2 +2 -1
- sky/templates/nebius-ray.yml.j2 +34 -12
- sky/templates/oci-ray.yml.j2 +1 -0
- sky/templates/paperspace-ray.yml.j2 +2 -1
- sky/templates/primeintellect-ray.yml.j2 +72 -0
- sky/templates/runpod-ray.yml.j2 +10 -1
- sky/templates/scp-ray.yml.j2 +4 -50
- sky/templates/seeweb-ray.yml.j2 +171 -0
- sky/templates/shadeform-ray.yml.j2 +73 -0
- sky/templates/sky-serve-controller.yaml.j2 +22 -2
- sky/templates/vast-ray.yml.j2 +1 -0
- sky/templates/vsphere-ray.yml.j2 +1 -0
- sky/templates/websocket_proxy.py +212 -37
- sky/usage/usage_lib.py +31 -15
- sky/users/__init__.py +0 -0
- sky/users/model.conf +15 -0
- sky/users/permission.py +397 -0
- sky/users/rbac.py +121 -0
- sky/users/server.py +720 -0
- sky/users/token_service.py +218 -0
- sky/utils/accelerator_registry.py +35 -5
- sky/utils/admin_policy_utils.py +84 -38
- sky/utils/annotations.py +38 -5
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/atomic.py +1 -1
- sky/utils/auth_utils.py +153 -0
- sky/utils/benchmark_utils.py +60 -0
- sky/utils/cli_utils/status_utils.py +159 -86
- sky/utils/cluster_utils.py +31 -9
- sky/utils/command_runner.py +354 -68
- sky/utils/command_runner.pyi +93 -3
- sky/utils/common.py +35 -8
- sky/utils/common_utils.py +314 -91
- sky/utils/config_utils.py +74 -5
- sky/utils/context.py +403 -0
- sky/utils/context_utils.py +242 -0
- sky/utils/controller_utils.py +383 -89
- sky/utils/dag_utils.py +31 -12
- sky/utils/db/__init__.py +0 -0
- sky/utils/db/db_utils.py +485 -0
- sky/utils/db/kv_cache.py +149 -0
- sky/utils/db/migration_utils.py +137 -0
- sky/utils/directory_utils.py +12 -0
- sky/utils/env_options.py +13 -0
- sky/utils/git.py +567 -0
- sky/utils/git_clone.sh +460 -0
- sky/utils/infra_utils.py +195 -0
- sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
- sky/utils/kubernetes/config_map_utils.py +133 -0
- sky/utils/kubernetes/create_cluster.sh +15 -29
- sky/utils/kubernetes/delete_cluster.sh +10 -7
- sky/utils/kubernetes/deploy_ssh_node_pools.py +1177 -0
- sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
- sky/utils/kubernetes/generate_kind_config.py +6 -66
- sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
- sky/utils/kubernetes/gpu_labeler.py +18 -8
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
- sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
- sky/utils/kubernetes/kubernetes_deploy_utils.py +284 -114
- sky/utils/kubernetes/rsync_helper.sh +11 -3
- sky/utils/kubernetes/ssh-tunnel.sh +379 -0
- sky/utils/kubernetes/ssh_utils.py +221 -0
- sky/utils/kubernetes_enums.py +8 -15
- sky/utils/lock_events.py +94 -0
- sky/utils/locks.py +416 -0
- sky/utils/log_utils.py +82 -107
- sky/utils/perf_utils.py +22 -0
- sky/utils/resource_checker.py +298 -0
- sky/utils/resources_utils.py +249 -32
- sky/utils/rich_utils.py +217 -39
- sky/utils/schemas.py +955 -160
- sky/utils/serialize_utils.py +16 -0
- sky/utils/status_lib.py +10 -0
- sky/utils/subprocess_utils.py +29 -15
- sky/utils/tempstore.py +70 -0
- sky/utils/thread_utils.py +91 -0
- sky/utils/timeline.py +26 -53
- sky/utils/ux_utils.py +84 -15
- sky/utils/validator.py +11 -1
- sky/utils/volume.py +165 -0
- sky/utils/yaml_utils.py +111 -0
- sky/volumes/__init__.py +13 -0
- sky/volumes/client/__init__.py +0 -0
- sky/volumes/client/sdk.py +150 -0
- sky/volumes/server/__init__.py +0 -0
- sky/volumes/server/core.py +270 -0
- sky/volumes/server/server.py +124 -0
- sky/volumes/volume.py +215 -0
- sky/workspaces/__init__.py +0 -0
- sky/workspaces/core.py +655 -0
- sky/workspaces/server.py +101 -0
- sky/workspaces/utils.py +56 -0
- sky_templates/README.md +3 -0
- sky_templates/__init__.py +3 -0
- sky_templates/ray/__init__.py +0 -0
- sky_templates/ray/start_cluster +183 -0
- sky_templates/ray/stop_cluster +75 -0
- skypilot_nightly-1.0.0.dev20251203.dist-info/METADATA +676 -0
- skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
- {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +1 -1
- skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
- sky/benchmark/benchmark_state.py +0 -256
- sky/benchmark/benchmark_utils.py +0 -641
- sky/clouds/service_catalog/constants.py +0 -7
- sky/dashboard/out/_next/static/GWvVBSCS7FmUiVmjaL1a7/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/236-2db3ee3fba33dd9e.js +0 -6
- sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
- sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
- sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
- sky/dashboard/out/_next/static/chunks/845-9e60713e0c441abc.js +0 -1
- sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
- sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
- sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
- sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
- sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-6ac338bc2239cb45.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-1c519e1afc523dc9.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
- sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
- sky/jobs/dashboard/dashboard.py +0 -223
- sky/jobs/dashboard/static/favicon.ico +0 -0
- sky/jobs/dashboard/templates/index.html +0 -831
- sky/jobs/server/dashboard_utils.py +0 -69
- sky/skylet/providers/scp/__init__.py +0 -2
- sky/skylet/providers/scp/config.py +0 -149
- sky/skylet/providers/scp/node_provider.py +0 -578
- sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
- sky/utils/db_utils.py +0 -100
- sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
- skypilot_nightly-1.0.0.dev20250502.dist-info/METADATA +0 -361
- skypilot_nightly-1.0.0.dev20250502.dist-info/RECORD +0 -396
- skypilot_nightly-1.0.0.dev20250502.dist-info/top_level.txt +0 -1
- /sky/{clouds/service_catalog → catalog}/config.py +0 -0
- /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
- /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
- /sky/dashboard/out/_next/static/{GWvVBSCS7FmUiVmjaL1a7 → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
sky/jobs/recovery_strategy.py
CHANGED
@@ -5,23 +5,31 @@ In the YAML file, the user can specify the strategy to use for managed jobs.
 resources:
     job_recovery: EAGER_NEXT_REGION
 """
-import
+import asyncio
+import logging
+import os
 import traceback
 import typing
-from typing import Optional
+from typing import Optional, Set
 
-import sky
 from sky import backends
+from sky import dag as dag_lib
 from sky import exceptions
-from sky import execution
 from sky import global_user_state
 from sky import sky_logging
+from sky import skypilot_config
 from sky.backends import backend_utils
+from sky.client import sdk
 from sky.jobs import scheduler
+from sky.jobs import state
 from sky.jobs import utils as managed_job_utils
+from sky.serve import serve_utils
+from sky.skylet import constants
 from sky.skylet import job_lib
 from sky.usage import usage_lib
 from sky.utils import common_utils
+from sky.utils import context_utils
+from sky.utils import env_options
 from sky.utils import registry
 from sky.utils import status_lib
 from sky.utils import ux_utils
@@ -39,7 +47,14 @@ MAX_JOB_CHECKING_RETRY = 10
 # Minutes to job cluster autodown. This should be significantly larger than
 # managed_job_utils.JOB_STATUS_CHECK_GAP_SECONDS, to avoid tearing down the
 # cluster before its status can be updated by the job controller.
-_AUTODOWN_MINUTES =
+_AUTODOWN_MINUTES = 10
+
+ENV_VARS_TO_CLEAR = [
+    skypilot_config.ENV_VAR_SKYPILOT_CONFIG,
+    constants.USER_ID_ENV_VAR,
+    constants.USER_ENV_VAR,
+    env_options.Options.SHOW_DEBUG_INFO.env_key,
+]
 
 
 class StrategyExecutor:
@@ -47,29 +62,65 @@ class StrategyExecutor:
 
     RETRY_INIT_GAP_SECONDS = 60
 
-    def __init__(
-
-
+    def __init__(
+        self,
+        cluster_name: Optional[str],
+        backend: 'backends.Backend',
+        task: 'task_lib.Task',
+        max_restarts_on_errors: int,
+        job_id: int,
+        task_id: int,
+        pool: Optional[str],
+        starting: Set[int],
+        starting_lock: asyncio.Lock,
+        starting_signal: asyncio.Condition,
+    ) -> None:
         """Initialize the strategy executor.
 
         Args:
             cluster_name: The name of the cluster.
             backend: The backend to use. Only CloudVMRayBackend is supported.
            task: The task to execute.
+            max_restarts_on_errors: Maximum number of restarts on errors.
+            job_id: The ID of the job.
+            task_id: The ID of the task.
+            starting: Set of job IDs that are currently starting.
+            starting_lock: Lock to synchronize starting jobs.
+            starting_signal: Condition to signal when a job can start.
         """
         assert isinstance(backend, backends.CloudVmRayBackend), (
             'Only CloudVMRayBackend is supported.')
-        self.dag =
+        self.dag = dag_lib.Dag()
         self.dag.add(task)
+        # For jobs submitted to a pool, the cluster name might change after each
+        # recovery. Initially this is set to an empty string to indicate that no
+        # cluster is assigned yet, and in `_launch`, it will be set to one of
+        # the cluster names in the pool.
         self.cluster_name = cluster_name
         self.backend = backend
         self.max_restarts_on_errors = max_restarts_on_errors
         self.job_id = job_id
+        self.task_id = task_id
+        self.pool = pool
         self.restart_cnt_on_failure = 0
+        self.job_id_on_pool_cluster: Optional[int] = None
+        self.starting = starting
+        self.starting_lock = starting_lock
+        self.starting_signal = starting_signal
 
     @classmethod
-    def make(
-
+    def make(
+        cls,
+        cluster_name: Optional[str],
+        backend: 'backends.Backend',
+        task: 'task_lib.Task',
+        job_id: int,
+        task_id: int,
+        pool: Optional[str],
+        starting: Set[int],
+        starting_lock: asyncio.Lock,
+        starting_signal: asyncio.Condition,
+    ) -> 'StrategyExecutor':
         """Create a strategy from a task."""
 
         resource_list = list(task.resources)
@@ -86,8 +137,11 @@ class StrategyExecutor:
             # original task.resources
             task.set_resources(type(task.resources)(new_resources_list))
         if isinstance(job_recovery, dict):
-
+            name = job_recovery.pop(
                 'strategy', registry.JOBS_RECOVERY_STRATEGY_REGISTRY.default)
+            assert name is None or isinstance(name, str), (
+                name, 'The job recovery strategy name must be a string or None')
+            job_recovery_name: Optional[str] = name
             max_restarts_on_errors = job_recovery.pop('max_restarts_on_errors',
                                                       0)
         else:
@@ -97,9 +151,11 @@ class StrategyExecutor:
                                  from_str(job_recovery_name))
         assert job_recovery_strategy is not None, job_recovery_name
         return job_recovery_strategy(cluster_name, backend, task,
-                                     max_restarts_on_errors, job_id
+                                     max_restarts_on_errors, job_id, task_id,
+                                     pool, starting, starting_lock,
+                                     starting_signal)
 
-    def launch(self) -> float:
+    async def launch(self) -> float:
         """Launch the cluster for the first time.
 
         It can fail if resource is not available. Need to check the cluster
@@ -111,11 +167,11 @@ class StrategyExecutor:
         Raises: Please refer to the docstring of self._launch().
         """
 
-        job_submit_at = self._launch(max_retry=None)
+        job_submit_at = await self._launch(max_retry=None)
         assert job_submit_at is not None
         return job_submit_at
 
-    def recover(self) -> float:
+    async def recover(self) -> float:
         """Relaunch the cluster after failure and wait until job starts.
 
         When recover() is called the cluster should be in STOPPED status (i.e.
@@ -125,12 +181,12 @@ class StrategyExecutor:
         """
         raise NotImplementedError
 
-    def
-
-
-        handle =
-            self.cluster_name)
-        if handle is None:
+    async def _try_cancel_jobs(self):
+        if self.cluster_name is None:
+            return
+        handle = await context_utils.to_thread(
+            global_user_state.get_handle_from_cluster_name, self.cluster_name)
+        if handle is None or self.pool is not None:
             return
         try:
             usage_lib.messages.usage.set_internal()
@@ -153,9 +209,26 @@ class StrategyExecutor:
             # should be functional with the `_try_cancel_if_cluster_is_init`
             # flag, i.e. it sends the cancel signal to the head node, which will
             # then kill the user process on remaining worker nodes.
-
-
-
+            # Only cancel the corresponding job for pool.
+            if self.pool is None:
+                request_id = await context_utils.to_thread(
+                    sdk.cancel,
+                    cluster_name=self.cluster_name,
+                    all=True,
+                    _try_cancel_if_cluster_is_init=True,
+                )
+            else:
+                request_id = await context_utils.to_thread(
+                    sdk.cancel,
+                    cluster_name=self.cluster_name,
+                    job_ids=[self.job_id_on_pool_cluster],
+                    _try_cancel_if_cluster_is_init=True,
+                )
+            logger.debug(f'sdk.cancel request ID: {request_id}')
+            await context_utils.to_thread(
+                sdk.get,
+                request_id,
+            )
         except Exception as e:  # pylint: disable=broad-except
             logger.info('Failed to cancel the job on the cluster. The cluster '
                         'might be already down or the head node is preempted.'
@@ -163,25 +236,26 @@ class StrategyExecutor:
                         f'{common_utils.format_exception(e)}\n'
                         'Terminating the cluster explicitly to ensure no '
                         'remaining job process interferes with recovery.')
-
+            await context_utils.to_thread(self._cleanup_cluster)
 
-    def _wait_until_job_starts_on_cluster(self) -> Optional[float]:
+    async def _wait_until_job_starts_on_cluster(self) -> Optional[float]:
         """Wait for MAX_JOB_CHECKING_RETRY times until job starts on the cluster
 
         Returns:
             The timestamp of when the job is submitted, or None if failed to
             submit.
         """
+        assert self.cluster_name is not None
         status = None
         job_checking_retry_cnt = 0
         while job_checking_retry_cnt < MAX_JOB_CHECKING_RETRY:
             # Avoid the infinite loop, if any bug happens.
             job_checking_retry_cnt += 1
             try:
-                cluster_status, _ = (
-                    backend_utils.refresh_cluster_status_handle
-
-
+                cluster_status, _ = (await context_utils.to_thread(
+                    backend_utils.refresh_cluster_status_handle,
+                    self.cluster_name,
+                    force_refresh_statuses=set(status_lib.ClusterStatus)))
             except Exception as e:  # pylint: disable=broad-except
                 # If any unexpected error happens, retry the job checking
                 # loop.
@@ -201,8 +275,10 @@ class StrategyExecutor:
                 break
 
             try:
-                status = managed_job_utils.get_job_status(
-                    self.backend,
+                status = await managed_job_utils.get_job_status(
+                    self.backend,
+                    self.cluster_name,
+                    job_id=self.job_id_on_pool_cluster)
             except Exception as e:  # pylint: disable=broad-except
                 # If any unexpected error happens, retry the job checking
                 # loop.
@@ -217,8 +293,12 @@ class StrategyExecutor:
             # Check the job status until it is not in initialized status
             if status is not None and status > job_lib.JobStatus.INIT:
                 try:
-                    job_submitted_at =
-
+                    job_submitted_at = await context_utils.to_thread(
+                        managed_job_utils.get_job_timestamp,
+                        self.backend,
+                        self.cluster_name,
+                        self.job_id_on_pool_cluster,
+                        get_end_time=False)
                     return job_submitted_at
                 except Exception as e:  # pylint: disable=broad-except
                     # If we failed to get the job timestamp, we will retry
@@ -227,12 +307,20 @@ class StrategyExecutor:
                         'the job start timestamp. Retrying.')
                     continue
             # Wait for the job to be started
-
+            await asyncio.sleep(
+                managed_job_utils.JOB_STARTED_STATUS_CHECK_GAP_SECONDS)
         return None
 
-    def
-
-
+    def _cleanup_cluster(self) -> None:
+        if self.cluster_name is None:
+            return
+        if self.pool is None:
+            managed_job_utils.terminate_cluster(self.cluster_name)
+
+    async def _launch(self,
+                      max_retry: Optional[int] = 3,
+                      raise_on_failure: bool = True,
+                      recovery: bool = False) -> Optional[float]:
         """Implementation of launch().
 
         The function will wait until the job starts running, but will leave the
@@ -272,98 +360,234 @@ class StrategyExecutor:
         backoff = common_utils.Backoff(self.RETRY_INIT_GAP_SECONDS)
         while True:
             retry_cnt += 1
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+            try:
+                async with scheduler.scheduled_launch(
+                        self.job_id,
+                        self.starting,
+                        self.starting_lock,
+                        self.starting_signal,
+                ):
+                    # The job state may have been PENDING during backoff -
+                    # update to STARTING or RECOVERING.
+                    # On the first attempt (when retry_cnt is 1), we should
+                    # already be in STARTING or RECOVERING.
+                    if retry_cnt > 1:
+                        await state.set_restarting_async(
+                            self.job_id, self.task_id, recovery)
+                    try:
+                        usage_lib.messages.usage.set_internal()
+                        if self.pool is None:
+                            assert self.cluster_name is not None
+
+                            # sdk.launch will implicitly start the API server,
+                            # but then the API server will inherit the current
+                            # env vars/user, which we may not want.
+                            # Instead, clear env vars here and call api_start
+                            # explicitly.
+                            vars_to_restore = {}
+                            try:
+                                for env_var in ENV_VARS_TO_CLEAR:
+                                    vars_to_restore[env_var] = os.environ.pop(
+                                        env_var, None)
+                                    logger.debug('Cleared env var: '
+                                                 f'{env_var}')
+                                logger.debug('Env vars for api_start: '
+                                             f'{os.environ}')
+                                await context_utils.to_thread(sdk.api_start)
+                                logger.info('API server started.')
+                            finally:
+                                for env_var, value in vars_to_restore.items():
+                                    if value is not None:
+                                        logger.debug('Restored env var: '
+                                                     f'{env_var}: {value}')
+                                        os.environ[env_var] = value
+
+                            request_id = None
+                            try:
+                                request_id = await context_utils.to_thread(
+                                    sdk.launch,
+                                    self.dag,
+                                    cluster_name=self.cluster_name,
+                                    # We expect to tear down the cluster as soon
+                                    # as the job is finished. However, in case
+                                    # the controller dies, we may end up with a
+                                    # resource leak.
+                                    # Ideally, we should autodown to be safe,
+                                    # but it's fine to disable it for now, as
+                                    # Nebius doesn't support autodown yet.
+                                    # TODO(kevin): set down=True once Nebius
+                                    # supports autodown.
+                                    # idle_minutes_to_autostop=(
+                                    #     _AUTODOWN_MINUTES),
+                                    # down=True,
+                                    _is_launched_by_jobs_controller=True,
+                                )
+                                logger.debug('sdk.launch request ID: '
+                                             f'{request_id}')
+                                await context_utils.to_thread(
+                                    sdk.stream_and_get,
+                                    request_id,
+                                )
+                            except asyncio.CancelledError:
+                                if request_id:
+                                    req = await context_utils.to_thread(
+                                        sdk.api_cancel, request_id)
+                                    logger.debug('sdk.api_cancel request '
+                                                 f'ID: {req}')
+                                    try:
+                                        await context_utils.to_thread(
+                                            sdk.get, req)
+                                    except Exception as e:  # pylint: disable=broad-except
+                                        # we must still return a CancelledError
+                                        logger.error(
+                                            f'Failed to cancel the job: {e}')
+                                raise
+                            logger.info('Managed job cluster launched.')
+                        else:
+                            self.cluster_name = await (context_utils.to_thread(
+                                serve_utils.get_next_cluster_name, self.pool,
+                                self.job_id))
+                            if self.cluster_name is None:
+                                raise exceptions.NoClusterLaunchedError(
+                                    'No cluster name found in the pool.')
+                            request_id = None
+                            try:
+                                request_id = await context_utils.to_thread(
+                                    sdk.exec,
+                                    self.dag,
+                                    cluster_name=self.cluster_name,
+                                )
+                                logger.debug('sdk.exec request ID: '
+                                             f'{request_id}')
+                                job_id_on_pool_cluster, _ = (
+                                    await context_utils.to_thread(
+                                        sdk.get, request_id))
+                            except asyncio.CancelledError:
+                                if request_id:
+                                    req = await context_utils.to_thread(
+                                        sdk.api_cancel, request_id)
+                                    logger.debug('sdk.api_cancel request '
+                                                 f'ID: {req}')
+                                    try:
+                                        await context_utils.to_thread(
+                                            sdk.get, req)
+                                    except Exception as e:  # pylint: disable=broad-except
+                                        # we must still return a CancelledError
+                                        logger.error(
+                                            f'Failed to cancel the job: {e}')
+                                raise
+                            assert job_id_on_pool_cluster is not None, (
+                                self.cluster_name, self.job_id)
+                            self.job_id_on_pool_cluster = job_id_on_pool_cluster
+                            await state.set_job_id_on_pool_cluster_async(
+                                self.job_id, job_id_on_pool_cluster)
+                            logger.info('Managed job cluster launched.')
+                    except (exceptions.InvalidClusterNameError,
+                            exceptions.NoCloudAccessError,
+                            exceptions.ResourcesMismatchError,
+                            exceptions.StorageSpecError,
+                            exceptions.StorageError) as e:
+                        logger.error('Failure happened before provisioning. '
+                                     f'{common_utils.format_exception(e)}')
                         if raise_on_failure:
-                    raise exceptions.ProvisionPrechecksError(
-
-                logger.info('Failed to launch a cluster with error: '
-                            f'{common_utils.format_exception(e)})')
-            except Exception as e:  # pylint: disable=broad-except
-                # If the launch fails, it will be recovered by the following
-                # code.
-                logger.info('Failed to launch a cluster with error: '
-                            f'{common_utils.format_exception(e)})')
-                with ux_utils.enable_traceback():
-                    logger.info(f' Traceback: {traceback.format_exc()}')
-            else:  # No exception, the launch succeeds.
-                # At this point, a sky.launch() has succeeded. Cluster may
-                # be UP (no preemption since) or DOWN (newly preempted).
-                job_submitted_at = self._wait_until_job_starts_on_cluster()
-                if job_submitted_at is not None:
-                    return job_submitted_at
-                # The job fails to start on the cluster, retry the launch.
-                # TODO(zhwu): log the unexpected error to usage collection
-                # for future debugging.
-                logger.info(
-                    'Failed to successfully submit the job to the '
-                    'launched cluster, due to unexpected submission errors '
-                    'or the cluster being preempted during job submission.')
-
-            # If we get here, the launch did not succeed. Tear down the
-            # cluster and retry.
-            managed_job_utils.terminate_cluster(self.cluster_name)
-            if max_retry is not None and retry_cnt >= max_retry:
-                # Retry forever if max_retry is None.
-                if raise_on_failure:
-                    with ux_utils.print_exception_no_traceback():
-                        raise exceptions.ManagedJobReachedMaxRetriesError(
-                            'Resources unavailable: failed to launch '
-                            f'clusters after {max_retry} retries.')
-                else:
+                            raise exceptions.ProvisionPrechecksError(
+                                reasons=[e])
                         return None
-
-
-
-
-
-
+                    except exceptions.ResourcesUnavailableError as e:
+                        # This is raised when the launch fails due to prechecks
+                        # or after failing over through all the candidates.
+                        # Please refer to the docstring of `sky.launch` for more
+                        # details of how the exception will be structured.
+                        if not any(
+                                isinstance(err,
+                                           exceptions.ResourcesUnavailableError)
+                                for err in e.failover_history):
+                            # _launch() (this function) should fail/exit
+                            # directly, if none of the failover reasons were
+                            # because of resource unavailability or no failover
+                            # was attempted (the optimizer cannot find feasible
+                            # resources for requested resources), i.e.,
+                            # e.failover_history is empty. Failing directly
+                            # avoids the infinite loop of retrying the launch
+                            # when, e.g., an invalid cluster name is used and
+                            # --retry-until-up is specified.
+                            reasons = (e.failover_history
+                                       if e.failover_history else [e])
+                            reasons_str = '; '.join(
+                                common_utils.format_exception(err)
+                                for err in reasons)
+                            logger.error(
+                                'Failure happened before provisioning. '
+                                f'Failover reasons: {reasons_str}')
+                            if raise_on_failure:
+                                raise exceptions.ProvisionPrechecksError(
+                                    reasons)
+                            return None
+                        logger.info('Failed to launch a cluster with error: '
+                                    f'{common_utils.format_exception(e)})')
+                    except Exception as e:  # pylint: disable=broad-except
+                        # If the launch fails, it will be recovered by the
+                        # following code.
+                        logger.info('Failed to launch a cluster with error: '
+                                    f'{common_utils.format_exception(e)})')
+                        with ux_utils.enable_traceback():
+                            logger.info(
+                                f' Traceback: {traceback.format_exc()}')
+                    else:  # No exception, the launch succeeds.
+                        # At this point, a sky.launch() has succeeded. Cluster
+                        # may be UP (no preemption since) or DOWN (newly
+                        # preempted).
+                        job_submitted_at = await (
+                            self._wait_until_job_starts_on_cluster())
+                        if job_submitted_at is not None:
+                            return job_submitted_at
+                        # The job fails to start on the cluster, retry the
+                        # launch.
+                        # TODO(zhwu): log the unexpected error to usage
+                        # collection for future debugging.
+                        logger.info(
+                            'Failed to successfully submit the job to the '
+                            'launched cluster, due to unexpected submission '
+                            'errors or the cluster being preempted during '
+                            'job submission.')
+
+                    # If we get here, the launch did not succeed. Tear down the
+                    # cluster and retry.
+                    await context_utils.to_thread(self._cleanup_cluster)
+                    if max_retry is not None and retry_cnt >= max_retry:
+                        # Retry forever if max_retry is None.
+                        if raise_on_failure:
+                            with ux_utils.print_exception_no_traceback():
+                                raise (
+                                    exceptions.ManagedJobReachedMaxRetriesError(
+                                        'Resources unavailable: failed to '
+                                        f'launch clusters after {max_retry} '
+                                        'retries.'))
+                        else:
+                            return None
+
+                    # Raise NoClusterLaunchedError to indicate that the job is
+                    # in retry backoff. This will trigger special handling in
+                    # scheduler.schedule_launched().
+                    # We will exit the scheduled_launch context so that the
+                    # schedule state is ALIVE_BACKOFF during the backoff. This
+                    # allows other jobs to launch.
+                    raise exceptions.NoClusterLaunchedError()
+
+            except exceptions.NoClusterLaunchedError:
+                # Update the status to PENDING during backoff.
+                await state.set_backoff_pending_async(self.job_id, self.task_id)
+                # Calculate the backoff time and sleep.
+                gap_seconds = (backoff.current_backoff()
+                               if self.pool is None else 1)
+                logger.info('Retrying to launch the cluster in '
+                            f'{gap_seconds:.1f} seconds.')
+                await asyncio.sleep(gap_seconds)
+                continue
+            else:
+                # The inner loop should either return or throw
+                # NoClusterLaunchedError.
+                assert False, 'Unreachable'
 
     def should_restart_on_failure(self) -> bool:
         """Increments counter & checks if job should be restarted on a failure.
@@ -384,24 +608,38 @@ class FailoverStrategyExecutor(StrategyExecutor):
 
     _MAX_RETRY_CNT = 240  # Retry for 4 hours.
 
-    def __init__(
-
-
+    def __init__(
+        self,
+        cluster_name: Optional[str],
+        backend: 'backends.Backend',
+        task: 'task_lib.Task',
+        max_restarts_on_errors: int,
+        job_id: int,
+        task_id: int,
+        pool: Optional[str],
+        starting: Set[int],
+        starting_lock: asyncio.Lock,
+        starting_signal: asyncio.Condition,
+    ) -> None:
         super().__init__(cluster_name, backend, task, max_restarts_on_errors,
-                         job_id
+                         job_id, task_id, pool, starting, starting_lock,
+                         starting_signal)
         # Note down the cloud/region of the launched cluster, so that we can
         # first retry in the same cloud/region. (Inside recover() we may not
        # rely on cluster handle, as it can be None if the cluster is
         # preempted.)
         self._launched_resources: Optional['resources.Resources'] = None
 
-    def _launch(self,
-
-
-
-
+    async def _launch(self,
+                      max_retry: Optional[int] = 3,
+                      raise_on_failure: bool = True,
+                      recovery: bool = False) -> Optional[float]:
+        job_submitted_at = await super()._launch(max_retry, raise_on_failure,
+                                                 recovery)
+        if job_submitted_at is not None and self.cluster_name is not None:
            # Only record the cloud/region if the launch is successful.
-            handle =
+            handle = await context_utils.to_thread(
+                global_user_state.get_handle_from_cluster_name,
                 self.cluster_name)
             assert isinstance(handle, backends.CloudVmRayResourceHandle), (
                 'Cluster should be launched.', handle)
@@ -411,7 +649,7 @@ class FailoverStrategyExecutor(StrategyExecutor):
             self._launched_resources = None
         return job_submitted_at
 
-    def recover(self) -> float:
+    async def recover(self) -> float:
         # 1. Cancel the jobs and launch the cluster with the STOPPED status,
         #    so that it will try on the current region first until timeout.
         # 2. Tear down the cluster, if the step 1 failed to launch the cluster.
@@ -419,7 +657,7 @@ class FailoverStrategyExecutor(StrategyExecutor):
         #    original user specification.
 
         # Step 1
-        self.
+        await self._try_cancel_jobs()
 
         while True:
             # Add region constraint to the task, to retry on the same region
@@ -433,7 +671,8 @@ class FailoverStrategyExecutor(StrategyExecutor):
                     cloud=launched_cloud, region=launched_region, zone=None)
                 task.set_resources({new_resources})
                 # Not using self.launch to avoid the retry until up logic.
-                job_submitted_at = self._launch(raise_on_failure=False
+                job_submitted_at = await self._launch(raise_on_failure=False,
+                                                      recovery=True)
                 # Restore the original dag, i.e. reset the region constraint.
                 task.set_resources(original_resources)
                 if job_submitted_at is not None:
@@ -442,20 +681,21 @@ class FailoverStrategyExecutor(StrategyExecutor):
             # Step 2
             logger.debug('Terminating unhealthy cluster and reset cloud '
                          'region.')
-
+            await context_utils.to_thread(self._cleanup_cluster)
 
             # Step 3
             logger.debug('Relaunch the cluster without constraining to prior '
                          'cloud/region.')
             # Not using self.launch to avoid the retry until up logic.
-            job_submitted_at = self._launch(max_retry=self._MAX_RETRY_CNT,
-
+            job_submitted_at = await self._launch(max_retry=self._MAX_RETRY_CNT,
+                                                  raise_on_failure=False,
+                                                  recovery=True)
             if job_submitted_at is None:
                 # Failed to launch the cluster.
                 gap_seconds = self.RETRY_INIT_GAP_SECONDS
                 logger.info('Retrying to recover the cluster in '
                             f'{gap_seconds:.1f} seconds.')
-
+                await asyncio.sleep(gap_seconds)
                 continue
 
             return job_submitted_at
@@ -487,7 +727,7 @@ class EagerFailoverStrategyExecutor(FailoverStrategyExecutor):
            -> R1Z1 (success)
     """
 
-    def recover(self) -> float:
+    async def recover(self) -> float:
         # 1. Terminate the current cluster
         # 2. Launch again by explicitly blocking the previously launched region
         #    (this will failover through the entire search space except the
@@ -500,7 +740,7 @@ class EagerFailoverStrategyExecutor(FailoverStrategyExecutor):
 
         # Step 1
         logger.debug('Terminating unhealthy cluster and reset cloud region.')
-
+        await context_utils.to_thread(self._cleanup_cluster)
 
         # Step 2
         logger.debug('Relaunch the cluster skipping the previously launched '
@@ -521,7 +761,8 @@ class EagerFailoverStrategyExecutor(FailoverStrategyExecutor):
                                        region=launched_region)
         }
         # Not using self.launch to avoid the retry until up logic.
-        job_submitted_at = self._launch(raise_on_failure=False
+        job_submitted_at = await self._launch(raise_on_failure=False,
+                                              recovery=True)
         task.blocked_resources = None
         if job_submitted_at is not None:
             return job_submitted_at
@@ -531,14 +772,23 @@ class EagerFailoverStrategyExecutor(FailoverStrategyExecutor):
             logger.debug('Relaunch the cluster without constraining to prior '
                          'cloud/region.')
             # Not using self.launch to avoid the retry until up logic.
-            job_submitted_at = self._launch(max_retry=self._MAX_RETRY_CNT,
-
+            job_submitted_at = await self._launch(max_retry=self._MAX_RETRY_CNT,
+                                                  raise_on_failure=False,
+                                                  recovery=True)
            if job_submitted_at is None:
                 # Failed to launch the cluster.
                 gap_seconds = self.RETRY_INIT_GAP_SECONDS
                 logger.info('Retrying to recover the cluster in '
                             f'{gap_seconds:.1f} seconds.')
-
+                await asyncio.sleep(gap_seconds)
                 continue
 
             return job_submitted_at
+
+
+def _get_logger_file(file_logger: logging.Logger) -> Optional[str]:
+    """Gets the file path that the logger writes to."""
+    for handler in file_logger.handlers:
+        if isinstance(handler, logging.FileHandler):
+            return handler.baseFilename
+    return None