skypilot-nightly 1.0.0.dev20250502__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +22 -6
- sky/adaptors/aws.py +81 -16
- sky/adaptors/common.py +25 -2
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/do.py +8 -2
- sky/adaptors/gcp.py +11 -0
- sky/adaptors/hyperbolic.py +8 -0
- sky/adaptors/ibm.py +5 -2
- sky/adaptors/kubernetes.py +149 -18
- sky/adaptors/nebius.py +173 -30
- sky/adaptors/primeintellect.py +1 -0
- sky/adaptors/runpod.py +68 -0
- sky/adaptors/seeweb.py +183 -0
- sky/adaptors/shadeform.py +89 -0
- sky/admin_policy.py +187 -4
- sky/authentication.py +179 -225
- sky/backends/__init__.py +4 -2
- sky/backends/backend.py +22 -9
- sky/backends/backend_utils.py +1323 -397
- sky/backends/cloud_vm_ray_backend.py +1749 -1029
- sky/backends/docker_utils.py +1 -1
- sky/backends/local_docker_backend.py +11 -6
- sky/backends/task_codegen.py +633 -0
- sky/backends/wheel_utils.py +55 -9
- sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
- sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
- sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
- sky/{clouds/service_catalog → catalog}/common.py +90 -49
- sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
- sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +116 -80
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +70 -16
- sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
- sky/catalog/data_fetchers/fetch_nebius.py +338 -0
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
- sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
- sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
- sky/catalog/hyperbolic_catalog.py +136 -0
- sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
- sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
- sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
- sky/catalog/primeintellect_catalog.py +95 -0
- sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
- sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
- sky/catalog/seeweb_catalog.py +184 -0
- sky/catalog/shadeform_catalog.py +165 -0
- sky/catalog/ssh_catalog.py +167 -0
- sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
- sky/check.py +533 -185
- sky/cli.py +5 -5975
- sky/client/{cli.py → cli/command.py} +2591 -1956
- sky/client/cli/deprecation_utils.py +99 -0
- sky/client/cli/flags.py +359 -0
- sky/client/cli/table_utils.py +322 -0
- sky/client/cli/utils.py +79 -0
- sky/client/common.py +78 -32
- sky/client/oauth.py +82 -0
- sky/client/sdk.py +1219 -319
- sky/client/sdk_async.py +827 -0
- sky/client/service_account_auth.py +47 -0
- sky/cloud_stores.py +82 -3
- sky/clouds/__init__.py +13 -0
- sky/clouds/aws.py +564 -164
- sky/clouds/azure.py +105 -83
- sky/clouds/cloud.py +140 -40
- sky/clouds/cudo.py +68 -50
- sky/clouds/do.py +66 -48
- sky/clouds/fluidstack.py +63 -44
- sky/clouds/gcp.py +339 -110
- sky/clouds/hyperbolic.py +293 -0
- sky/clouds/ibm.py +70 -49
- sky/clouds/kubernetes.py +570 -162
- sky/clouds/lambda_cloud.py +74 -54
- sky/clouds/nebius.py +210 -81
- sky/clouds/oci.py +88 -66
- sky/clouds/paperspace.py +61 -44
- sky/clouds/primeintellect.py +317 -0
- sky/clouds/runpod.py +164 -74
- sky/clouds/scp.py +89 -86
- sky/clouds/seeweb.py +477 -0
- sky/clouds/shadeform.py +400 -0
- sky/clouds/ssh.py +263 -0
- sky/clouds/utils/aws_utils.py +10 -4
- sky/clouds/utils/gcp_utils.py +87 -11
- sky/clouds/utils/oci_utils.py +38 -14
- sky/clouds/utils/scp_utils.py +231 -167
- sky/clouds/vast.py +99 -77
- sky/clouds/vsphere.py +51 -40
- sky/core.py +375 -173
- sky/dag.py +15 -0
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
- sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
- sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
- sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
- sky/dashboard/out/_next/static/chunks/3800-7b45f9fbb6308557.js +1 -0
- sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
- sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
- sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
- sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
- sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
- sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
- sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
- sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
- sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
- sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
- sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
- sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
- sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
- sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
- sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
- sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-84a40f8c7c627fe4.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces-531b2f8c4bf89f82.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
- sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -0
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -0
- sky/dashboard/out/infra.html +1 -0
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -0
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -0
- sky/dashboard/out/volumes.html +1 -0
- sky/dashboard/out/workspace/new.html +1 -0
- sky/dashboard/out/workspaces/[name].html +1 -0
- sky/dashboard/out/workspaces.html +1 -0
- sky/data/data_utils.py +137 -1
- sky/data/mounting_utils.py +269 -84
- sky/data/storage.py +1460 -1807
- sky/data/storage_utils.py +43 -57
- sky/exceptions.py +126 -2
- sky/execution.py +216 -63
- sky/global_user_state.py +2390 -586
- sky/jobs/__init__.py +7 -0
- sky/jobs/client/sdk.py +300 -58
- sky/jobs/client/sdk_async.py +161 -0
- sky/jobs/constants.py +15 -8
- sky/jobs/controller.py +848 -275
- sky/jobs/file_content_utils.py +128 -0
- sky/jobs/log_gc.py +193 -0
- sky/jobs/recovery_strategy.py +402 -152
- sky/jobs/scheduler.py +314 -189
- sky/jobs/server/core.py +836 -255
- sky/jobs/server/server.py +156 -115
- sky/jobs/server/utils.py +136 -0
- sky/jobs/state.py +2109 -706
- sky/jobs/utils.py +1306 -215
- sky/logs/__init__.py +21 -0
- sky/logs/agent.py +108 -0
- sky/logs/aws.py +243 -0
- sky/logs/gcp.py +91 -0
- sky/metrics/__init__.py +0 -0
- sky/metrics/utils.py +453 -0
- sky/models.py +78 -1
- sky/optimizer.py +164 -70
- sky/provision/__init__.py +90 -4
- sky/provision/aws/config.py +147 -26
- sky/provision/aws/instance.py +136 -50
- sky/provision/azure/instance.py +11 -6
- sky/provision/common.py +13 -1
- sky/provision/cudo/cudo_machine_type.py +1 -1
- sky/provision/cudo/cudo_utils.py +14 -8
- sky/provision/cudo/cudo_wrapper.py +72 -71
- sky/provision/cudo/instance.py +10 -6
- sky/provision/do/instance.py +10 -6
- sky/provision/do/utils.py +4 -3
- sky/provision/docker_utils.py +140 -33
- sky/provision/fluidstack/instance.py +13 -8
- sky/provision/gcp/__init__.py +1 -0
- sky/provision/gcp/config.py +301 -19
- sky/provision/gcp/constants.py +218 -0
- sky/provision/gcp/instance.py +36 -8
- sky/provision/gcp/instance_utils.py +18 -4
- sky/provision/gcp/volume_utils.py +247 -0
- sky/provision/hyperbolic/__init__.py +12 -0
- sky/provision/hyperbolic/config.py +10 -0
- sky/provision/hyperbolic/instance.py +437 -0
- sky/provision/hyperbolic/utils.py +373 -0
- sky/provision/instance_setup.py +101 -20
- sky/provision/kubernetes/__init__.py +5 -0
- sky/provision/kubernetes/config.py +9 -52
- sky/provision/kubernetes/constants.py +17 -0
- sky/provision/kubernetes/instance.py +919 -280
- sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
- sky/provision/kubernetes/network.py +27 -17
- sky/provision/kubernetes/network_utils.py +44 -43
- sky/provision/kubernetes/utils.py +1221 -534
- sky/provision/kubernetes/volume.py +343 -0
- sky/provision/lambda_cloud/instance.py +22 -16
- sky/provision/nebius/constants.py +50 -0
- sky/provision/nebius/instance.py +19 -6
- sky/provision/nebius/utils.py +237 -137
- sky/provision/oci/instance.py +10 -5
- sky/provision/paperspace/instance.py +10 -7
- sky/provision/paperspace/utils.py +1 -1
- sky/provision/primeintellect/__init__.py +10 -0
- sky/provision/primeintellect/config.py +11 -0
- sky/provision/primeintellect/instance.py +454 -0
- sky/provision/primeintellect/utils.py +398 -0
- sky/provision/provisioner.py +117 -36
- sky/provision/runpod/__init__.py +5 -0
- sky/provision/runpod/instance.py +27 -6
- sky/provision/runpod/utils.py +51 -18
- sky/provision/runpod/volume.py +214 -0
- sky/provision/scp/__init__.py +15 -0
- sky/provision/scp/config.py +93 -0
- sky/provision/scp/instance.py +707 -0
- sky/provision/seeweb/__init__.py +11 -0
- sky/provision/seeweb/config.py +13 -0
- sky/provision/seeweb/instance.py +812 -0
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/ssh/__init__.py +18 -0
- sky/provision/vast/instance.py +13 -8
- sky/provision/vast/utils.py +10 -7
- sky/provision/volume.py +164 -0
- sky/provision/vsphere/common/ssl_helper.py +1 -1
- sky/provision/vsphere/common/vapiconnect.py +2 -1
- sky/provision/vsphere/common/vim_utils.py +4 -4
- sky/provision/vsphere/instance.py +15 -10
- sky/provision/vsphere/vsphere_utils.py +17 -20
- sky/py.typed +0 -0
- sky/resources.py +845 -119
- sky/schemas/__init__.py +0 -0
- sky/schemas/api/__init__.py +0 -0
- sky/schemas/api/responses.py +227 -0
- sky/schemas/db/README +4 -0
- sky/schemas/db/env.py +90 -0
- sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
- sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
- sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
- sky/schemas/db/global_user_state/004_is_managed.py +34 -0
- sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
- sky/schemas/db/global_user_state/006_provision_log.py +41 -0
- sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
- sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
- sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
- sky/schemas/db/script.py.mako +28 -0
- sky/schemas/db/serve_state/001_initial_schema.py +67 -0
- sky/schemas/db/serve_state/002_yaml_content.py +34 -0
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
- sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
- sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
- sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
- sky/schemas/generated/__init__.py +0 -0
- sky/schemas/generated/autostopv1_pb2.py +36 -0
- sky/schemas/generated/autostopv1_pb2.pyi +43 -0
- sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
- sky/schemas/generated/jobsv1_pb2.py +86 -0
- sky/schemas/generated/jobsv1_pb2.pyi +254 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
- sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
- sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
- sky/schemas/generated/servev1_pb2.py +58 -0
- sky/schemas/generated/servev1_pb2.pyi +115 -0
- sky/schemas/generated/servev1_pb2_grpc.py +322 -0
- sky/serve/autoscalers.py +357 -5
- sky/serve/client/impl.py +310 -0
- sky/serve/client/sdk.py +47 -139
- sky/serve/client/sdk_async.py +130 -0
- sky/serve/constants.py +12 -9
- sky/serve/controller.py +68 -17
- sky/serve/load_balancer.py +106 -60
- sky/serve/load_balancing_policies.py +116 -2
- sky/serve/replica_managers.py +434 -249
- sky/serve/serve_rpc_utils.py +179 -0
- sky/serve/serve_state.py +569 -257
- sky/serve/serve_utils.py +775 -265
- sky/serve/server/core.py +66 -711
- sky/serve/server/impl.py +1093 -0
- sky/serve/server/server.py +21 -18
- sky/serve/service.py +192 -89
- sky/serve/service_spec.py +144 -20
- sky/serve/spot_placer.py +3 -0
- sky/server/auth/__init__.py +0 -0
- sky/server/auth/authn.py +50 -0
- sky/server/auth/loopback.py +38 -0
- sky/server/auth/oauth2_proxy.py +202 -0
- sky/server/common.py +478 -182
- sky/server/config.py +85 -23
- sky/server/constants.py +44 -6
- sky/server/daemons.py +295 -0
- sky/server/html/token_page.html +185 -0
- sky/server/metrics.py +160 -0
- sky/server/middleware_utils.py +166 -0
- sky/server/requests/executor.py +558 -138
- sky/server/requests/payloads.py +364 -24
- sky/server/requests/preconditions.py +21 -17
- sky/server/requests/process.py +112 -29
- sky/server/requests/request_names.py +121 -0
- sky/server/requests/requests.py +822 -226
- sky/server/requests/serializers/decoders.py +82 -31
- sky/server/requests/serializers/encoders.py +140 -22
- sky/server/requests/threads.py +117 -0
- sky/server/rest.py +455 -0
- sky/server/server.py +1309 -285
- sky/server/state.py +20 -0
- sky/server/stream_utils.py +327 -61
- sky/server/uvicorn.py +217 -3
- sky/server/versions.py +270 -0
- sky/setup_files/MANIFEST.in +11 -1
- sky/setup_files/alembic.ini +160 -0
- sky/setup_files/dependencies.py +139 -31
- sky/setup_files/setup.py +44 -42
- sky/sky_logging.py +114 -7
- sky/skylet/attempt_skylet.py +106 -24
- sky/skylet/autostop_lib.py +129 -8
- sky/skylet/configs.py +29 -20
- sky/skylet/constants.py +216 -25
- sky/skylet/events.py +101 -21
- sky/skylet/job_lib.py +345 -164
- sky/skylet/log_lib.py +297 -18
- sky/skylet/log_lib.pyi +44 -1
- sky/skylet/providers/ibm/node_provider.py +12 -8
- sky/skylet/providers/ibm/vpc_provider.py +13 -12
- sky/skylet/ray_patches/__init__.py +17 -3
- sky/skylet/ray_patches/autoscaler.py.diff +18 -0
- sky/skylet/ray_patches/cli.py.diff +19 -0
- sky/skylet/ray_patches/command_runner.py.diff +17 -0
- sky/skylet/ray_patches/log_monitor.py.diff +20 -0
- sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
- sky/skylet/ray_patches/updater.py.diff +18 -0
- sky/skylet/ray_patches/worker.py.diff +41 -0
- sky/skylet/runtime_utils.py +21 -0
- sky/skylet/services.py +568 -0
- sky/skylet/skylet.py +72 -4
- sky/skylet/subprocess_daemon.py +104 -29
- sky/skypilot_config.py +506 -99
- sky/ssh_node_pools/__init__.py +1 -0
- sky/ssh_node_pools/core.py +135 -0
- sky/ssh_node_pools/server.py +233 -0
- sky/task.py +685 -163
- sky/templates/aws-ray.yml.j2 +11 -3
- sky/templates/azure-ray.yml.j2 +2 -1
- sky/templates/cudo-ray.yml.j2 +1 -0
- sky/templates/do-ray.yml.j2 +2 -1
- sky/templates/fluidstack-ray.yml.j2 +1 -0
- sky/templates/gcp-ray.yml.j2 +62 -1
- sky/templates/hyperbolic-ray.yml.j2 +68 -0
- sky/templates/ibm-ray.yml.j2 +2 -1
- sky/templates/jobs-controller.yaml.j2 +27 -24
- sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
- sky/templates/kubernetes-ray.yml.j2 +611 -50
- sky/templates/lambda-ray.yml.j2 +2 -1
- sky/templates/nebius-ray.yml.j2 +34 -12
- sky/templates/oci-ray.yml.j2 +1 -0
- sky/templates/paperspace-ray.yml.j2 +2 -1
- sky/templates/primeintellect-ray.yml.j2 +72 -0
- sky/templates/runpod-ray.yml.j2 +10 -1
- sky/templates/scp-ray.yml.j2 +4 -50
- sky/templates/seeweb-ray.yml.j2 +171 -0
- sky/templates/shadeform-ray.yml.j2 +73 -0
- sky/templates/sky-serve-controller.yaml.j2 +22 -2
- sky/templates/vast-ray.yml.j2 +1 -0
- sky/templates/vsphere-ray.yml.j2 +1 -0
- sky/templates/websocket_proxy.py +212 -37
- sky/usage/usage_lib.py +31 -15
- sky/users/__init__.py +0 -0
- sky/users/model.conf +15 -0
- sky/users/permission.py +397 -0
- sky/users/rbac.py +121 -0
- sky/users/server.py +720 -0
- sky/users/token_service.py +218 -0
- sky/utils/accelerator_registry.py +35 -5
- sky/utils/admin_policy_utils.py +84 -38
- sky/utils/annotations.py +38 -5
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/atomic.py +1 -1
- sky/utils/auth_utils.py +153 -0
- sky/utils/benchmark_utils.py +60 -0
- sky/utils/cli_utils/status_utils.py +159 -86
- sky/utils/cluster_utils.py +31 -9
- sky/utils/command_runner.py +354 -68
- sky/utils/command_runner.pyi +93 -3
- sky/utils/common.py +35 -8
- sky/utils/common_utils.py +314 -91
- sky/utils/config_utils.py +74 -5
- sky/utils/context.py +403 -0
- sky/utils/context_utils.py +242 -0
- sky/utils/controller_utils.py +383 -89
- sky/utils/dag_utils.py +31 -12
- sky/utils/db/__init__.py +0 -0
- sky/utils/db/db_utils.py +485 -0
- sky/utils/db/kv_cache.py +149 -0
- sky/utils/db/migration_utils.py +137 -0
- sky/utils/directory_utils.py +12 -0
- sky/utils/env_options.py +13 -0
- sky/utils/git.py +567 -0
- sky/utils/git_clone.sh +460 -0
- sky/utils/infra_utils.py +195 -0
- sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
- sky/utils/kubernetes/config_map_utils.py +133 -0
- sky/utils/kubernetes/create_cluster.sh +15 -29
- sky/utils/kubernetes/delete_cluster.sh +10 -7
- sky/utils/kubernetes/deploy_ssh_node_pools.py +1177 -0
- sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
- sky/utils/kubernetes/generate_kind_config.py +6 -66
- sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
- sky/utils/kubernetes/gpu_labeler.py +18 -8
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
- sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
- sky/utils/kubernetes/kubernetes_deploy_utils.py +284 -114
- sky/utils/kubernetes/rsync_helper.sh +11 -3
- sky/utils/kubernetes/ssh-tunnel.sh +379 -0
- sky/utils/kubernetes/ssh_utils.py +221 -0
- sky/utils/kubernetes_enums.py +8 -15
- sky/utils/lock_events.py +94 -0
- sky/utils/locks.py +416 -0
- sky/utils/log_utils.py +82 -107
- sky/utils/perf_utils.py +22 -0
- sky/utils/resource_checker.py +298 -0
- sky/utils/resources_utils.py +249 -32
- sky/utils/rich_utils.py +217 -39
- sky/utils/schemas.py +955 -160
- sky/utils/serialize_utils.py +16 -0
- sky/utils/status_lib.py +10 -0
- sky/utils/subprocess_utils.py +29 -15
- sky/utils/tempstore.py +70 -0
- sky/utils/thread_utils.py +91 -0
- sky/utils/timeline.py +26 -53
- sky/utils/ux_utils.py +84 -15
- sky/utils/validator.py +11 -1
- sky/utils/volume.py +165 -0
- sky/utils/yaml_utils.py +111 -0
- sky/volumes/__init__.py +13 -0
- sky/volumes/client/__init__.py +0 -0
- sky/volumes/client/sdk.py +150 -0
- sky/volumes/server/__init__.py +0 -0
- sky/volumes/server/core.py +270 -0
- sky/volumes/server/server.py +124 -0
- sky/volumes/volume.py +215 -0
- sky/workspaces/__init__.py +0 -0
- sky/workspaces/core.py +655 -0
- sky/workspaces/server.py +101 -0
- sky/workspaces/utils.py +56 -0
- sky_templates/README.md +3 -0
- sky_templates/__init__.py +3 -0
- sky_templates/ray/__init__.py +0 -0
- sky_templates/ray/start_cluster +183 -0
- sky_templates/ray/stop_cluster +75 -0
- skypilot_nightly-1.0.0.dev20251203.dist-info/METADATA +676 -0
- skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
- {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +1 -1
- skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
- sky/benchmark/benchmark_state.py +0 -256
- sky/benchmark/benchmark_utils.py +0 -641
- sky/clouds/service_catalog/constants.py +0 -7
- sky/dashboard/out/_next/static/GWvVBSCS7FmUiVmjaL1a7/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/236-2db3ee3fba33dd9e.js +0 -6
- sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
- sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
- sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
- sky/dashboard/out/_next/static/chunks/845-9e60713e0c441abc.js +0 -1
- sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
- sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
- sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
- sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
- sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-6ac338bc2239cb45.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-1c519e1afc523dc9.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
- sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
- sky/jobs/dashboard/dashboard.py +0 -223
- sky/jobs/dashboard/static/favicon.ico +0 -0
- sky/jobs/dashboard/templates/index.html +0 -831
- sky/jobs/server/dashboard_utils.py +0 -69
- sky/skylet/providers/scp/__init__.py +0 -2
- sky/skylet/providers/scp/config.py +0 -149
- sky/skylet/providers/scp/node_provider.py +0 -578
- sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
- sky/utils/db_utils.py +0 -100
- sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
- skypilot_nightly-1.0.0.dev20250502.dist-info/METADATA +0 -361
- skypilot_nightly-1.0.0.dev20250502.dist-info/RECORD +0 -396
- skypilot_nightly-1.0.0.dev20250502.dist-info/top_level.txt +0 -1
- /sky/{clouds/service_catalog → catalog}/config.py +0 -0
- /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
- /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
- /sky/dashboard/out/_next/static/{GWvVBSCS7FmUiVmjaL1a7 → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
sky/backends/cloud_vm_ray_backend.py

@@ -1,15 +1,16 @@
 """Backend: runs on cloud virtual machines, managed by Ray."""
 import copy
+import dataclasses
 import enum
-import inspect
 import json
 import math
 import os
 import pathlib
+import random
 import re
 import shlex
-import shutil
 import signal
+import socket
 import subprocess
 import sys
 import tempfile

@@ -17,14 +18,14 @@ import textwrap
 import threading
 import time
 import typing
-from typing import (Any, Callable, Dict, Iterable, List, Optional,
-                    Union)
+from typing import (Any, Callable, Dict, Iterable, Iterator, List, Optional,
+                    Set, Tuple, Union)
 
 import colorama
-import filelock
+import psutil
 
-import sky
 from sky import backends
+from sky import catalog
 from sky import check as sky_check
 from sky import cloud_stores
 from sky import clouds

@@ -37,10 +38,11 @@ from sky import resources as resources_lib
 from sky import sky_logging
 from sky import skypilot_config
 from sky import task as task_lib
+from sky.adaptors import common as adaptors_common
 from sky.backends import backend_utils
+from sky.backends import task_codegen
 from sky.backends import wheel_utils
 from sky.clouds import cloud as sky_cloud
-from sky.clouds import service_catalog
 from sky.clouds.utils import gcp_utils
 from sky.data import data_utils
 from sky.data import storage as storage_lib

@@ -48,21 +50,26 @@ from sky.provision import common as provision_common
 from sky.provision import instance_setup
 from sky.provision import metadata_utils
 from sky.provision import provisioner
+from sky.provision.kubernetes import config as config_lib
 from sky.provision.kubernetes import utils as kubernetes_utils
+from sky.serve import constants as serve_constants
 from sky.server.requests import requests as requests_lib
 from sky.skylet import autostop_lib
 from sky.skylet import constants
 from sky.skylet import job_lib
 from sky.skylet import log_lib
 from sky.usage import usage_lib
-from sky.utils import accelerator_registry
 from sky.utils import annotations
 from sky.utils import cluster_utils
 from sky.utils import command_runner
 from sky.utils import common
 from sky.utils import common_utils
+from sky.utils import context_utils
 from sky.utils import controller_utils
+from sky.utils import directory_utils
 from sky.utils import env_options
+from sky.utils import lock_events
+from sky.utils import locks
 from sky.utils import log_utils
 from sky.utils import message_utils
 from sky.utils import registry

@@ -72,9 +79,43 @@ from sky.utils import status_lib
 from sky.utils import subprocess_utils
 from sky.utils import timeline
 from sky.utils import ux_utils
+from sky.utils import volume as volume_lib
+from sky.utils import yaml_utils
 
 if typing.TYPE_CHECKING:
+    import grpc
+
     from sky import dag
+    from sky.schemas.generated import autostopv1_pb2
+    from sky.schemas.generated import autostopv1_pb2_grpc
+    from sky.schemas.generated import jobsv1_pb2
+    from sky.schemas.generated import jobsv1_pb2_grpc
+    from sky.schemas.generated import managed_jobsv1_pb2
+    from sky.schemas.generated import managed_jobsv1_pb2_grpc
+    from sky.schemas.generated import servev1_pb2
+    from sky.schemas.generated import servev1_pb2_grpc
+else:
+    # To avoid requiring grpcio to be installed on the client side.
+    grpc = adaptors_common.LazyImport(
+        'grpc',
+        # https://github.com/grpc/grpc/issues/37642 to avoid spam in console
+        set_loggers=lambda: os.environ.update({'GRPC_VERBOSITY': 'NONE'})
+        if not env_options.Options.SHOW_DEBUG_INFO.get() else None)
+    autostopv1_pb2 = adaptors_common.LazyImport(
+        'sky.schemas.generated.autostopv1_pb2')
+    autostopv1_pb2_grpc = adaptors_common.LazyImport(
+        'sky.schemas.generated.autostopv1_pb2_grpc')
+    jobsv1_pb2 = adaptors_common.LazyImport('sky.schemas.generated.jobsv1_pb2')
+    jobsv1_pb2_grpc = adaptors_common.LazyImport(
+        'sky.schemas.generated.jobsv1_pb2_grpc')
+    servev1_pb2 = adaptors_common.LazyImport(
+        'sky.schemas.generated.servev1_pb2')
+    servev1_pb2_grpc = adaptors_common.LazyImport(
+        'sky.schemas.generated.servev1_pb2_grpc')
+    managed_jobsv1_pb2 = adaptors_common.LazyImport(
+        'sky.schemas.generated.managed_jobsv1_pb2')
+    managed_jobsv1_pb2_grpc = adaptors_common.LazyImport(
+        'sky.schemas.generated.managed_jobsv1_pb2_grpc')
 
 Path = str
 
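The `else:` branch above replaces eager imports with `adaptors_common.LazyImport`, so a client-side install can run without `grpcio` or the generated protobuf stubs: both are only imported on first attribute access, and the `set_loggers` hook runs just before the real import (here, to silence gRPC console spam). A minimal standalone sketch of the pattern; this `LazyImport` is an illustrative re-implementation, not the actual `sky.adaptors.common` class:

    import importlib
    import types
    from typing import Any, Callable, Optional


    class LazyImport(types.ModuleType):
        """Defers a module import until an attribute is first accessed."""

        def __init__(self, module_name: str,
                     set_loggers: Optional[Callable[[], None]] = None):
            super().__init__(module_name)
            self._module_name = module_name
            self._set_loggers = set_loggers
            self._module = None

        def __getattr__(self, name: str) -> Any:
            if self._module is None:
                if self._set_loggers is not None:
                    # Runs once, before the real import (e.g. setting
                    # GRPC_VERBOSITY so grpcio initializes quietly).
                    self._set_loggers()
                self._module = importlib.import_module(self._module_name)
            return getattr(self._module, name)


    # Importing this module is now free; grpcio only loads on first use,
    # e.g. on a call like grpc.insecure_channel(...).
    grpc = LazyImport('grpc')
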
@@ -96,6 +137,7 @@ _NODES_LAUNCHING_PROGRESS_TIMEOUT = {
     clouds.OCI: 300,
     clouds.Paperspace: 600,
     clouds.Kubernetes: 300,
+    clouds.Shadeform: 300,
     clouds.Vsphere: 240,
 }
 
@@ -141,12 +183,13 @@ _MAX_RAY_UP_RETRY = 5
 _MAX_GET_ZONE_RETRY = 3
 
 _JOB_ID_PATTERN = re.compile(r'Job ID: ([0-9]+)')
+_LOG_DIR_PATTERN = re.compile(r'Log Dir: ([^ ]+)')
 
 # Path to the monkey-patched ray up script.
 # We don't do import then __file__ because that script needs to be filled in
 # (so import would fail).
 _RAY_UP_WITH_MONKEY_PATCHED_HASH_LAUNCH_CONF_PATH = (
-    pathlib.Path(sky.__file__).resolve().parent / 'backends' /
+    pathlib.Path(directory_utils.get_sky_dir()) / 'backends' /
     'monkey_patches' / 'monkey_patch_ray_up.py')
 
 # The maximum size of a command line arguments is 128 KB, i.e. the command
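`_LOG_DIR_PATTERN` joins the existing `_JOB_ID_PATTERN`; both scrape fields from the job-submission output returned by the remote cluster. A small sketch of the parsing they enable, with an illustrative sample string (the real lines are produced by the remote job codegen):

    import re

    _JOB_ID_PATTERN = re.compile(r'Job ID: ([0-9]+)')
    _LOG_DIR_PATTERN = re.compile(r'Log Dir: ([^ ]+)')

    # Illustrative output only, not the exact remote format.
    output = 'Job ID: 42\nLog Dir: ~/sky_logs/sky-2025-12-03-10-00-00-000000'

    job_id_match = _JOB_ID_PATTERN.search(output)
    log_dir_match = _LOG_DIR_PATTERN.search(output)
    job_id = int(job_id_match.group(1)) if job_id_match else None
    log_dir = log_dir_match.group(1) if log_dir_match else None
    print(job_id, log_dir)  # -> 42 ~/sky_logs/sky-2025-12-03-10-00-00-000000
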
@@ -161,10 +204,19 @@ _RAY_UP_WITH_MONKEY_PATCHED_HASH_LAUNCH_CONF_PATH = (
 # We use 100KB as a threshold to be safe for other arguments that
 # might be added during ssh.
 _MAX_INLINE_SCRIPT_LENGTH = 100 * 1024
+_EXCEPTION_MSG_AND_RETURNCODE_FOR_DUMP_INLINE_SCRIPT = [
+    ('too long', 255),
+    ('request-uri too large', 1),
+    ('request header fields too large', 1),
+    ('400 bad request', 1),  # CloudFlare 400 error
+]
 
 _RESOURCES_UNAVAILABLE_LOG = (
     'Reasons for provision failures (for details, please check the log above):')
 
+# Number of seconds to wait locking the cluster before communicating with user.
+_CLUSTER_LOCK_TIMEOUT = 5.0
+
 
 def _is_command_length_over_limit(command: str) -> bool:
     """Check if the length of the command exceeds the limit.
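The new `_EXCEPTION_MSG_AND_RETURNCODE_FOR_DUMP_INLINE_SCRIPT` table pairs an error substring with the return code under which it is trusted: `ssh` exits 255 with a 'too long' message when the argv limit is hit, while an HTTP proxy in front of the cluster (e.g. Cloudflare) surfaces 4xx errors with return code 1. It complements the pre-flight size check `_is_command_length_over_limit`, whose body appears as context in the next hunk; a self-contained sketch of that check, assuming the script travels as a single shell-quoted argument:

    import shlex

    # Conservative threshold under the ~128 KB argv limit, leaving headroom
    # for the other arguments ssh adds.
    _MAX_INLINE_SCRIPT_LENGTH = 100 * 1024


    def is_command_length_over_limit(command: str) -> bool:
        """Pre-flight check: would the quoted inline script exceed the limit?"""
        # Measure the quoted form, since that is what is actually sent.
        return len(shlex.quote(command)) > _MAX_INLINE_SCRIPT_LENGTH


    print(is_command_length_over_limit('echo hello'))   # False
    print(is_command_length_over_limit('x' * 200_000))  # True
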
@@ -178,6 +230,61 @@ def _is_command_length_over_limit(command: str) -> bool:
     return quoted_length > _MAX_INLINE_SCRIPT_LENGTH
 
 
+def _is_message_too_long(returncode: int,
+                         output: Optional[str] = None,
+                         file_path: Optional[str] = None) -> bool:
+    """Check if the message sent to the remote is too long.
+
+    We use inline script to run the setup or run command, i.e. the script will
+    be part of the message sent to the remote cluster. There is a chance that
+    the command is too long, when people has very long run or setup commands, or
+    there is a cloudflare proxy in front of the remote blocking the long
+    message. Several common causes are:
+    - SSH returning: `too long` in the error message.
+    - Cloudflare proxy returning: `414 Request-URI Too Large` or
+      `431 Request Header Fields Too Large` error.
+
+    We use a general length limit check before but it could be inaccurate on
+    some systems, e.g. cloudflare proxy, so this is necessary.
+
+    Args:
+        returncode: The return code of the setup command.
+        output: The output of the setup command.
+        file_path: The path to the setup log file.
+    """
+    assert (output is None) != (file_path is None), (
+        'Either output or file_path must be provided.', output, file_path)
+    to_check = []
+    for (match_str,
+         desired_rc) in _EXCEPTION_MSG_AND_RETURNCODE_FOR_DUMP_INLINE_SCRIPT:
+        if desired_rc == returncode:
+            to_check.append(match_str)
+    if not to_check:
+        return False
+
+    def _check_output_for_match_str(output: str) -> bool:
+        for match_str in to_check:
+            if match_str.lower() in output.lower():
+                return True
+        return False
+
+    if file_path is not None:
+        try:
+            with open(os.path.expanduser(file_path), 'r',
+                      encoding='utf-8') as f:
+                content = f.read()
+            return _check_output_for_match_str(content)
+        except Exception as e:  # pylint: disable=broad-except
+            # We don't crash the setup if we cannot read the log file.
+            # Instead, we should retry the setup with dumping the script
+            # to a file to be safe.
+            logger.debug(f'Failed to read setup log file {file_path}: {e}')
+            return True
+    else:
+        assert output is not None, (output, file_path)
+        return _check_output_for_match_str(output)
+
+
 def _get_cluster_config_template(cloud):
     cloud_to_template = {
         clouds.AWS: 'aws-ray.yml.j2',
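`_is_message_too_long` is the post-hoc companion to that pre-flight check: when an inline setup or run command fails, the caller can consult it and retry with the script dumped to a file instead. A hedged sketch of that retry shape, reusing `_is_message_too_long` from the hunk above; both runner helpers are hypothetical stand-ins for the backend's real ssh code paths:

    import subprocess
    import tempfile
    from typing import Tuple


    def _run_inline(command: str) -> Tuple[int, str]:
        # Hypothetical stand-in: run the script as one inline argument.
        proc = subprocess.run(['bash', '-c', command],
                              capture_output=True, text=True, check=False)
        return proc.returncode, proc.stdout + proc.stderr


    def _run_via_file(command: str) -> int:
        # Hypothetical stand-in: dump the script to a file and execute that.
        with tempfile.NamedTemporaryFile('w', suffix='.sh',
                                         delete=False) as f:
            f.write(command)
            path = f.name
        return subprocess.run(['bash', path], check=False).returncode


    def run_setup_with_fallback(command: str) -> int:
        returncode, output = _run_inline(command)
        if returncode != 0 and _is_message_too_long(returncode,
                                                    output=output):
            # Message rejected for its size (ssh 'too long' or a proxy 4xx):
            # retry from a file, sidestepping the argv/URI limits.
            returncode = _run_via_file(command)
        return returncode
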
@@ -189,13 +296,18 @@ def _get_cluster_config_template(cloud):
         clouds.SCP: 'scp-ray.yml.j2',
         clouds.OCI: 'oci-ray.yml.j2',
         clouds.Paperspace: 'paperspace-ray.yml.j2',
+        clouds.PrimeIntellect: 'primeintellect-ray.yml.j2',
         clouds.DO: 'do-ray.yml.j2',
         clouds.RunPod: 'runpod-ray.yml.j2',
         clouds.Kubernetes: 'kubernetes-ray.yml.j2',
+        clouds.SSH: 'kubernetes-ray.yml.j2',
+        clouds.Shadeform: 'shadeform-ray.yml.j2',
         clouds.Vsphere: 'vsphere-ray.yml.j2',
         clouds.Vast: 'vast-ray.yml.j2',
         clouds.Fluidstack: 'fluidstack-ray.yml.j2',
-        clouds.Nebius: 'nebius-ray.yml.j2'
+        clouds.Nebius: 'nebius-ray.yml.j2',
+        clouds.Hyperbolic: 'hyperbolic-ray.yml.j2',
+        clouds.Seeweb: 'seeweb-ray.yml.j2'
     }
     return cloud_to_template[type(cloud)]
 
@@ -225,500 +337,6 @@ def write_ray_up_script_with_patched_launch_hash_fn(
|
|
|
225
337
|
return f.name
|
|
226
338
|
|
|
227
339
|
|
|
228
|
-
class RayCodeGen:
|
|
229
|
-
"""Code generator of a Ray program that executes a sky.Task.
|
|
230
|
-
|
|
231
|
-
Usage:
|
|
232
|
-
|
|
233
|
-
>> codegen = RayCodegen()
|
|
234
|
-
>> codegen.add_prologue()
|
|
235
|
-
|
|
236
|
-
>> codegen.add_ray_task(...)
|
|
237
|
-
>> codegen.add_ray_task(...)
|
|
238
|
-
|
|
239
|
-
>> codegen.add_epilogue()
|
|
240
|
-
>> code = codegen.build()
|
|
241
|
-
"""
|
|
242
|
-
|
|
243
|
-
def __init__(self):
|
|
244
|
-
# Code generated so far, to be joined via '\n'.
|
|
245
|
-
self._code = []
|
|
246
|
-
# Guard method calling order.
|
|
247
|
-
self._has_prologue = False
|
|
248
|
-
self._has_epilogue = False
|
|
249
|
-
|
|
250
|
-
# For n nodes gang scheduling.
|
|
251
|
-
self._has_gang_scheduling = False
|
|
252
|
-
self._num_nodes = 0
|
|
253
|
-
|
|
254
|
-
self._has_register_run_fn = False
|
|
255
|
-
|
|
256
|
-
# job_id
|
|
257
|
-
# Job ID is used to identify the job (also this generated code).
|
|
258
|
-
# It is a int automatically generated by the DB on the cluster
|
|
259
|
-
# and monotonically increasing starting from 1.
|
|
260
|
-
# To generate the job ID, we use the following logic:
|
|
261
|
-
# code = job_lib.JobLibCodeGen.add_job(username,
|
|
262
|
-
# run_timestamp)
|
|
263
|
-
# job_id = get_output(run_on_cluster(code))
|
|
264
|
-
self.job_id = None
|
|
265
|
-
|
|
266
|
-
def add_prologue(self, job_id: int) -> None:
|
|
267
|
-
assert not self._has_prologue, 'add_prologue() called twice?'
|
|
268
|
-
self._has_prologue = True
|
|
269
|
-
self.job_id = job_id
|
|
270
|
-
# Should use 'auto' or 'ray://<internal_head_ip>:10001' rather than
|
|
271
|
-
# 'ray://localhost:10001', or 'ray://127.0.0.1:10001', for public cloud.
|
|
272
|
-
# Otherwise, ray will fail to get the placement group because of a bug
|
|
273
|
-
# in ray job.
|
|
274
|
-
ray_address = 'auto'
|
|
275
|
-
self._code = [
|
|
276
|
-
textwrap.dedent(f"""\
|
|
277
|
-
import getpass
|
|
278
|
-
import hashlib
|
|
279
|
-
import io
|
|
280
|
-
import os
|
|
281
|
-
import pathlib
|
|
282
|
-
import selectors
|
|
283
|
-
import shlex
|
|
284
|
-
import subprocess
|
|
285
|
-
import sys
|
|
286
|
-
import tempfile
|
|
287
|
-
import textwrap
|
|
288
|
-
import time
|
|
289
|
-
from typing import Dict, List, Optional, Tuple, Union
|
|
290
|
-
|
|
291
|
-
# Set the environment variables to avoid deduplicating logs and
|
|
292
|
-
# scheduler events. This should be set in driver code, since we are
|
|
293
|
-
# not using `ray job submit` anymore, and the environment variables
|
|
294
|
-
# from the ray cluster is not inherited.
|
|
295
|
-
os.environ['RAY_DEDUP_LOGS'] = '0'
|
|
296
|
-
os.environ['RAY_SCHEDULER_EVENTS'] = '0'
|
|
297
|
-
|
|
298
|
-
import ray
|
|
299
|
-
import ray.util as ray_util
|
|
300
|
-
|
|
301
|
-
from sky.skylet import autostop_lib
|
|
302
|
-
from sky.skylet import constants
|
|
303
|
-
from sky.skylet import job_lib
|
|
304
|
-
from sky.utils import log_utils
|
|
305
|
-
from sky.utils import subprocess_utils
|
|
306
|
-
|
|
307
|
-
SKY_REMOTE_WORKDIR = {constants.SKY_REMOTE_WORKDIR!r}
|
|
308
|
-
|
|
309
|
-
kwargs = dict()
|
|
310
|
-
# Only set the `_temp_dir` to SkyPilot's ray cluster directory when
|
|
311
|
-
# the directory exists for backward compatibility for the VM
|
|
312
|
-
# launched before #1790.
|
|
313
|
-
if os.path.exists({constants.SKY_REMOTE_RAY_TEMPDIR!r}):
|
|
314
|
-
kwargs['_temp_dir'] = {constants.SKY_REMOTE_RAY_TEMPDIR!r}
|
|
315
|
-
ray.init(
|
|
316
|
-
address={ray_address!r},
|
|
317
|
-
namespace='__sky__{job_id}__',
|
|
318
|
-
log_to_driver=True,
|
|
319
|
-
**kwargs
|
|
320
|
-
)
|
|
321
|
-
def get_or_fail(futures, pg) -> List[int]:
|
|
322
|
-
\"\"\"Wait for tasks, if any fails, cancel all unready.\"\"\"
|
|
323
|
-
if not futures:
|
|
324
|
-
return []
|
|
325
|
-
returncodes = [1] * len(futures)
|
|
326
|
-
# Wait for 1 task to be ready.
|
|
327
|
-
ready = []
|
|
328
|
-
# Keep invoking ray.wait if ready is empty. This is because
|
|
329
|
-
# ray.wait with timeout=None will only wait for 10**6 seconds,
|
|
330
|
-
# which will cause tasks running for more than 12 days to return
|
|
331
|
-
# before becoming ready.
|
|
332
|
-
# (Such tasks are common in serving jobs.)
|
|
333
|
-
# Reference: https://github.com/ray-project/ray/blob/ray-2.9.3/python/ray/_private/worker.py#L2845-L2846
|
|
334
|
-
while not ready:
|
|
335
|
-
ready, unready = ray.wait(futures)
|
|
336
|
-
idx = futures.index(ready[0])
|
|
337
|
-
returncodes[idx] = ray.get(ready[0])
|
|
338
|
-
while unready:
|
|
339
|
-
if returncodes[idx] != 0:
|
|
340
|
-
for task in unready:
|
|
341
|
-
# ray.cancel without force fails to kill tasks.
|
|
342
|
-
# We use force=True to kill unready tasks.
|
|
343
|
-
ray.cancel(task, force=True)
|
|
344
|
-
# Use SIGKILL=128+9 to indicate the task is forcely
|
|
345
|
-
# killed.
|
|
346
|
-
idx = futures.index(task)
|
|
347
|
-
returncodes[idx] = 137
|
|
348
|
-
break
|
|
349
|
-
ready, unready = ray.wait(unready)
|
|
350
|
-
idx = futures.index(ready[0])
|
|
351
|
-
returncodes[idx] = ray.get(ready[0])
|
|
352
|
-
# Remove the placement group after all tasks are done, so that
|
|
353
|
-
# the next job can be scheduled on the released resources
|
|
354
|
-
# immediately.
|
|
355
|
-
ray_util.remove_placement_group(pg)
|
|
356
|
-
sys.stdout.flush()
|
|
357
|
-
return returncodes
|
|
358
|
-
|
|
359
|
-
run_fn = None
|
|
360
|
-
futures = []
|
|
361
|
-
"""),
|
|
362
|
-
# FIXME: This is a hack to make sure that the functions can be found
|
|
363
|
-
# by ray.remote. This should be removed once we have a better way to
|
|
364
|
-
# specify dependencies for ray.
|
|
365
|
-
inspect.getsource(log_lib._ProcessingArgs), # pylint: disable=protected-access
|
|
366
|
-
inspect.getsource(log_lib._handle_io_stream), # pylint: disable=protected-access
|
|
367
|
-
inspect.getsource(log_lib.process_subprocess_stream),
|
|
368
|
-
inspect.getsource(log_lib.run_with_log),
|
|
369
|
-
inspect.getsource(log_lib.make_task_bash_script),
|
|
370
|
-
inspect.getsource(log_lib.add_ray_env_vars),
|
|
371
|
-
inspect.getsource(log_lib.run_bash_command_with_log),
|
|
372
|
-
'run_bash_command_with_log = ray.remote(run_bash_command_with_log)',
|
|
373
|
-
]
|
|
374
|
-
# Currently, the codegen program is/can only be submitted to the head
|
|
375
|
-
# node, due to using job_lib for updating job statuses, and using
|
|
376
|
-
# autostop_lib here.
|
|
377
|
-
self._code.append(
|
|
378
|
-
# Use hasattr to handle backward compatibility.
|
|
379
|
-
# TODO(zongheng): remove in ~1-2 minor releases (currently 0.2.x).
|
|
380
|
-
textwrap.dedent("""\
|
|
381
|
-
if hasattr(autostop_lib, 'set_last_active_time_to_now'):
|
|
382
|
-
autostop_lib.set_last_active_time_to_now()
|
|
383
|
-
"""))
|
|
384
|
-
self._code += [
|
|
385
|
-
f'job_lib.set_status({job_id!r}, job_lib.JobStatus.PENDING)',
|
|
386
|
-
]
|
|
387
|
-
|
|
388
|
-
def add_gang_scheduling_placement_group_and_setup(
|
|
389
|
-
self,
|
|
390
|
-
num_nodes: int,
|
|
391
|
-
resources_dict: Dict[str, float],
|
|
392
|
-
stable_cluster_internal_ips: List[str],
|
|
393
|
-
env_vars: Dict[str, str],
|
|
394
|
-
setup_cmd: Optional[str] = None,
|
|
395
|
-
setup_log_path: Optional[str] = None,
|
|
396
|
-
) -> None:
|
|
397
|
-
"""Create the gang scheduling placement group for a Task.
|
|
398
|
-
|
|
399
|
-
cluster_ips_sorted is used to ensure that the SKY_NODE_RANK environment
|
|
400
|
-
variable is assigned in a deterministic order whenever a new task is
|
|
401
|
-
added.
|
|
402
|
-
"""
|
|
403
|
-
assert self._has_prologue, (
|
|
404
|
-
'Call add_prologue() before '
|
|
405
|
-
'add_gang_scheduling_placement_group_and_setup().')
|
|
406
|
-
self._has_gang_scheduling = True
|
|
407
|
-
self._num_nodes = num_nodes
|
|
408
|
-
|
|
409
|
-
bundles = [copy.copy(resources_dict) for _ in range(num_nodes)]
|
|
410
|
-
# Set CPU to avoid ray hanging the resources allocation
|
|
411
|
-
# for remote functions, since the task will request 1 CPU
|
|
412
|
-
# by default.
|
|
413
|
-
task_cpu_demand = resources_dict.pop('CPU')
|
|
414
|
-
|
|
415
|
-
if resources_dict:
|
|
416
|
-
assert len(resources_dict) == 1, (
|
|
417
|
-
'There can only be one type of accelerator per instance. '
|
|
418
|
-
f'Found: {resources_dict}.')
|
|
419
|
-
acc_name, acc_count = list(resources_dict.items())[0]
|
|
420
|
-
gpu_dict = {'GPU': acc_count}
|
|
421
|
-
# gpu_dict should be empty when the accelerator is not GPU.
|
|
422
|
-
# TODO(zongheng,zhanghao): an alternative is to start the remote
|
|
423
|
-
# cluster with custom resource 'GPU': <n> even if the accelerator(s)
|
|
424
|
-
# are not GPU. We opt for the current solution for now.
|
|
425
|
-
if accelerator_registry.is_schedulable_non_gpu_accelerator(
|
|
426
|
-
acc_name):
|
|
427
|
-
gpu_dict = {}
|
|
428
|
-
for bundle in bundles:
|
|
429
|
-
bundle.update({
|
|
430
|
-
# Set the GPU to avoid ray hanging the resources allocation
|
|
431
|
-
**gpu_dict,
|
|
432
|
-
})
|
|
433
|
-
|
|
434
|
-
streaming_message = (
|
|
435
|
-
f'{ux_utils.INDENT_LAST_SYMBOL}Job started. Streaming logs... '
|
|
436
|
-
f'{colorama.Style.DIM}(Ctrl-C to exit log streaming; job will not '
|
|
437
|
-
f'be killed){colorama.Style.RESET_ALL}')
|
|
438
|
-
self._code += [
|
|
439
|
-
textwrap.dedent(f"""\
|
|
440
|
-
pg = ray_util.placement_group({json.dumps(bundles)}, 'STRICT_SPREAD')
|
|
441
|
-
plural = 's' if {num_nodes} > 1 else ''
|
|
442
|
-
node_str = f'{num_nodes} node{{plural}}'
|
|
443
|
-
message = ('{ux_utils.INDENT_SYMBOL}{colorama.Style.DIM}'
|
|
444
|
-
'Waiting for task resources on '
|
|
445
|
-
f'{{node_str}}.{colorama.Style.RESET_ALL}')
|
|
446
|
-
print(message, flush=True)
|
|
447
|
-
# FIXME: This will print the error message from autoscaler if
|
|
448
|
-
# it is waiting for other task to finish. We should hide the
|
|
449
|
-
# error message.
|
|
450
|
-
ray.get(pg.ready())
|
|
451
|
-
print({streaming_message!r}, flush=True)
|
|
452
|
-
""")
|
|
453
|
-
]
|
|
454
|
-
|
|
455
|
-
job_id = self.job_id
|
|
456
|
-
if setup_cmd is not None:
|
|
457
|
-
setup_envs = env_vars.copy()
|
|
458
|
-
setup_envs[constants.SKYPILOT_NUM_NODES] = str(num_nodes)
|
|
459
|
-
self._code += [
|
|
460
|
-
textwrap.dedent(f"""\
|
|
461
|
-
setup_cmd = {setup_cmd!r}
|
|
462
|
-
_SETUP_CPUS = 0.0001
|
|
463
|
-
# The setup command will be run as a ray task with num_cpus=_SETUP_CPUS as the
|
|
464
|
-
# requirement; this means Ray will set CUDA_VISIBLE_DEVICES to an empty string.
|
|
465
|
-
# We unset it so that user setup command may properly use this env var.
|
|
466
|
-
setup_cmd = 'unset CUDA_VISIBLE_DEVICES; ' + setup_cmd
|
|
467
|
-
job_lib.set_status({job_id!r}, job_lib.JobStatus.SETTING_UP)
|
|
468
|
-
|
|
469
|
-
# The schedule_step should be called after the job status is set to non-PENDING,
|
|
470
|
-
# otherwise, the scheduler will think the current job is not submitted yet, and
|
|
471
|
-
# skip the scheduling step.
|
|
472
|
-
job_lib.scheduler.schedule_step()
|
|
473
|
-
|
|
474
|
-
total_num_nodes = len(ray.nodes())
|
|
475
|
-
setup_bundles = [{{"CPU": _SETUP_CPUS}} for _ in range(total_num_nodes)]
|
|
476
|
-
setup_pg = ray.util.placement_group(setup_bundles, strategy='STRICT_SPREAD')
|
|
477
|
-
setup_workers = [run_bash_command_with_log \\
|
|
478
|
-
.options(
|
|
479
|
-
name='setup',
|
|
480
|
-
num_cpus=_SETUP_CPUS,
|
|
481
|
-
scheduling_strategy=ray.util.scheduling_strategies.PlacementGroupSchedulingStrategy(
|
|
482
|
-
placement_group=setup_pg,
|
|
483
|
-
placement_group_bundle_index=i)
|
|
484
|
-
) \\
|
|
485
|
-
.remote(
|
|
486
|
-
setup_cmd,
|
|
487
|
-
os.path.expanduser({setup_log_path!r}),
|
|
488
|
-
env_vars={setup_envs!r},
|
|
489
|
-
stream_logs=True,
|
|
490
|
-
with_ray=True,
|
|
491
|
-
) for i in range(total_num_nodes)]
|
|
492
|
-
setup_returncodes = get_or_fail(setup_workers, setup_pg)
|
|
493
|
-
if sum(setup_returncodes) != 0:
|
|
494
|
-
job_lib.set_status({self.job_id!r}, job_lib.JobStatus.FAILED_SETUP)
|
|
495
|
-
# This waits for all streaming logs to finish.
|
|
496
|
-
time.sleep(1)
|
|
497
|
-
print('ERROR: {colorama.Fore.RED}Job {self.job_id}\\'s setup failed with '
|
|
498
|
-
'return code list:{colorama.Style.RESET_ALL}',
|
|
499
|
-
setup_returncodes,
|
|
500
|
-
flush=True)
|
|
501
|
-
# Need this to set the job status in ray job to be FAILED.
|
|
502
|
-
sys.exit(1)
|
|
503
|
-
""")
|
|
504
|
-
]
|
|
505
|
-
|
|
506
|
-
self._code.append(f'job_lib.set_job_started({self.job_id!r})')
|
|
507
|
-
if setup_cmd is None:
|
|
508
|
-
# Need to call schedule_step() to make sure the scheduler
|
|
509
|
-
# schedule the next pending job.
|
|
510
|
-
self._code.append('job_lib.scheduler.schedule_step()')
|
|
511
|
-
|
|
512
|
-
# Export IP and node rank to the environment variables.
|
|
513
|
-
self._code += [
|
|
514
|
-
textwrap.dedent(f"""\
|
|
515
|
-
@ray.remote
|
|
516
|
-
def check_ip():
|
|
517
|
-
return ray.util.get_node_ip_address()
|
|
518
|
-
gang_scheduling_id_to_ip = ray.get([
|
|
519
|
-
check_ip.options(
|
|
520
|
-
num_cpus={task_cpu_demand},
|
|
521
|
-
scheduling_strategy=ray.util.scheduling_strategies.PlacementGroupSchedulingStrategy(
|
|
522
|
-
placement_group=pg,
|
|
523
|
-
placement_group_bundle_index=i
|
|
524
|
-
)).remote()
|
|
525
|
-
for i in range(pg.bundle_count)
|
|
526
|
-
])
|
|
527
|
-
|
|
528
|
-
cluster_ips_to_node_id = {{ip: i for i, ip in enumerate({stable_cluster_internal_ips!r})}}
|
|
529
|
-
job_ip_rank_list = sorted(gang_scheduling_id_to_ip, key=cluster_ips_to_node_id.get)
|
|
530
|
-
job_ip_rank_map = {{ip: i for i, ip in enumerate(job_ip_rank_list)}}
|
|
531
|
-
job_ip_list_str = '\\n'.join(job_ip_rank_list)
|
|
532
|
-
"""),
|
|
533
|
-
]
|
|
534
|
-
|
|
535
|
-
def register_run_fn(self, run_fn: str, run_fn_name: str) -> None:
|
|
536
|
-
"""Register the run function to be run on the remote cluster.
|
|
537
|
-
|
|
538
|
-
Args:
|
|
539
|
-
run_fn: The run function to be run on the remote cluster.
|
|
540
|
-
"""
|
|
541
|
-
assert self._has_gang_scheduling, (
|
|
542
|
-
'Call add_gang_scheduling_placement_group_and_setup() '
|
|
543
|
-
'before register_run_fn().')
|
|
544
|
-
assert not self._has_register_run_fn, (
|
|
545
|
-
'register_run_fn() called twice?')
|
|
546
|
-
self._has_register_run_fn = True
|
|
547
|
-
|
|
548
|
-
self._code += [
|
|
549
|
-
run_fn,
|
|
550
|
-
f'run_fn = {run_fn_name}',
|
|
551
|
-
]
|
|
552
|
-
|
|
553
|
-
def add_ray_task(self,
|
|
554
|
-
bash_script: Optional[str],
|
|
555
|
-
task_name: Optional[str],
|
|
556
|
-
ray_resources_dict: Dict[str, float],
|
|
557
|
-
log_dir: str,
|
|
558
|
-
env_vars: Optional[Dict[str, str]] = None,
|
|
559
|
-
gang_scheduling_id: int = 0) -> None:
|
|
560
|
-
"""Generates code for a ray remote task that runs a bash command."""
|
|
561
|
-
assert self._has_gang_scheduling, (
|
|
562
|
-
'Call add_gang_scheduling_placement_group_and_setup() before '
|
|
563
|
-
'add_ray_task().')
|
|
564
|
-
assert (not self._has_register_run_fn or
|
|
565
|
-
bash_script is None), ('bash_script should '
|
|
566
|
-
'be None when run_fn is registered.')
|
|
567
|
-
task_cpu_demand = ray_resources_dict.pop('CPU')
|
|
568
|
-
# Build remote_task.options(...)
|
|
569
|
-
# resources=...
|
|
570
|
-
# num_gpus=...
|
|
571
|
-
options = []
|
|
572
|
-
options.append(f'num_cpus={task_cpu_demand}')
|
|
573
|
-
|
|
574
|
-
num_gpus = 0.0
|
|
575
|
-
if ray_resources_dict:
|
-            assert len(ray_resources_dict) == 1, (
-                'There can only be one type of accelerator per instance. '
-                f'Found: {ray_resources_dict}.')
-            num_gpus = list(ray_resources_dict.values())[0]
-            options.append(f'resources={json.dumps(ray_resources_dict)}')
-
-            resources_key = list(ray_resources_dict.keys())[0]
-            if not accelerator_registry.is_schedulable_non_gpu_accelerator(
-                    resources_key):
-                # `num_gpus` should be empty when the accelerator is not GPU.
-                # FIXME: use a set of GPU types, instead of 'tpu' in the key.
-
-                # Passing this ensures that the Ray remote task gets
-                # CUDA_VISIBLE_DEVICES set correctly. If not passed, that flag
-                # would be force-set to empty by Ray.
-                options.append(f'num_gpus={num_gpus}')
-        options.append(
-            'scheduling_strategy=ray.util.scheduling_strategies.PlacementGroupSchedulingStrategy('  # pylint: disable=line-too-long
-            'placement_group=pg, '
-            f'placement_group_bundle_index={gang_scheduling_id})')
-
-        sky_env_vars_dict_str = [
-            textwrap.dedent(f"""\
-            sky_env_vars_dict = {{}}
-            sky_env_vars_dict['{constants.SKYPILOT_NODE_IPS}'] = job_ip_list_str
-            sky_env_vars_dict['{constants.SKYPILOT_NUM_NODES}'] = len(job_ip_rank_list)
-            """)
-        ]
-
-        if env_vars is not None:
-            sky_env_vars_dict_str.extend(f'sky_env_vars_dict[{k!r}] = {v!r}'
-                                         for k, v in env_vars.items())
-        sky_env_vars_dict_str = '\n'.join(sky_env_vars_dict_str)
-
-        options_str = ', '.join(options)
-        logger.debug('Added Task with options: '
-                     f'{options_str}')
-        # Script to block completion of a job until all storage mounted with
-        # CACHED_MOUNT mode is uploaded to remote.
-        rclone_flush_script = textwrap.dedent(f"""\
-
-            if [ $(findmnt -t fuse.rclone --noheading | wc -l) -gt 0 ]; then
-                flushed=0
-                # extra second on top of --vfs-cache-poll-interval to
-                # avoid race condition between rclone log line creation and this check.
-                sleep 1
-                while [ $flushed -eq 0 ]; do
-                    # sleep for the same interval as --vfs-cache-poll-interval
-                    sleep {constants.RCLONE_CACHE_REFRESH_INTERVAL}
-                    flushed=1
-                    for file in {constants.RCLONE_LOG_DIR}/*; do
-                        exitcode=0
-                        tac $file | grep "vfs cache: cleaned:" -m 1 | grep "in use 0, to upload 0, uploading 0" -q || exitcode=$?
-                        if [ $exitcode -ne 0 ]; then
-                            echo "skypilot: cached mount is still uploading to remote"
-                            flushed=0
-                            break
-                        fi
-                    done
-                done
-                echo "skypilot: cached mount uploaded complete"
-            fi""")
-        self._code += [
-            sky_env_vars_dict_str,
-            textwrap.dedent(f"""\
-        script = {bash_script!r}
-        rclone_flush_script = {rclone_flush_script!r}
-        if run_fn is not None:
-            script = run_fn({gang_scheduling_id}, gang_scheduling_id_to_ip)
-
-        if script is not None:
-            script += rclone_flush_script
-            sky_env_vars_dict['{constants.SKYPILOT_NUM_GPUS_PER_NODE}'] = {int(math.ceil(num_gpus))!r}
-
-            ip = gang_scheduling_id_to_ip[{gang_scheduling_id!r}]
-            rank = job_ip_rank_map[ip]
-
-            if len(cluster_ips_to_node_id) == 1: # Single-node task on single-node cluter
-                name_str = '{task_name},' if {task_name!r} != None else 'task,'
-                log_path = os.path.expanduser(os.path.join({log_dir!r}, 'run.log'))
-            else: # Single-node or multi-node task on multi-node cluster
-                idx_in_cluster = cluster_ips_to_node_id[ip]
-                if cluster_ips_to_node_id[ip] == 0:
-                    node_name = 'head'
-                else:
-                    node_name = f'worker{{idx_in_cluster}}'
-                name_str = f'{{node_name}}, rank={{rank}},'
-                log_path = os.path.expanduser(os.path.join({log_dir!r}, f'{{rank}}-{{node_name}}.log'))
-            sky_env_vars_dict['{constants.SKYPILOT_NODE_RANK}'] = rank
-
-            sky_env_vars_dict['SKYPILOT_INTERNAL_JOB_ID'] = {self.job_id}
-
-            futures.append(run_bash_command_with_log \\
-                    .options(name=name_str, {options_str}) \\
-                    .remote(
-                        script,
-                        log_path,
-                        env_vars=sky_env_vars_dict,
-                        stream_logs=True,
-                        with_ray=True,
-                    ))""")
-        ]
-
-    def add_epilogue(self) -> None:
-        """Generates code that waits for all tasks, then exits."""
-        assert self._has_prologue, 'Call add_prologue() before add_epilogue().'
-        assert not self._has_epilogue, 'add_epilogue() called twice?'
-        self._has_epilogue = True
-
-        self._code += [
-            textwrap.dedent(f"""\
-            returncodes = get_or_fail(futures, pg)
-            if sum(returncodes) != 0:
-                job_lib.set_status({self.job_id!r}, job_lib.JobStatus.FAILED)
-                # Schedule the next pending job immediately to make the job
-                # scheduling more efficient.
-                job_lib.scheduler.schedule_step()
-                # This waits for all streaming logs to finish.
-                time.sleep(0.5)
-                reason = ''
-                # 139 is the return code of SIGSEGV, i.e. Segmentation Fault.
-                if any(r == 139 for r in returncodes):
-                    reason = '(likely due to Segmentation Fault)'
-                print('ERROR: {colorama.Fore.RED}Job {self.job_id} failed with '
-                      'return code list:{colorama.Style.RESET_ALL}',
-                      returncodes,
-                      reason,
-                      flush=True)
-                # Need this to set the job status in ray job to be FAILED.
-                sys.exit(1)
-            else:
-                job_lib.set_status({self.job_id!r}, job_lib.JobStatus.SUCCEEDED)
-                # Schedule the next pending job immediately to make the job
-                # scheduling more efficient.
-                job_lib.scheduler.schedule_step()
-                # This waits for all streaming logs to finish.
-                time.sleep(0.5)
-            """)
-        ]
-
-    def build(self) -> str:
-        """Returns the entire generated program."""
-        assert self._has_epilogue, 'Call add_epilogue() before build().'
-        return '\n'.join(self._code)
-
-
 class GangSchedulingStatus(enum.Enum):
     """Enum for gang scheduling status."""
     CLUSTER_READY = 0
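The removed `add_epilogue` above emits driver-side Python that aggregates the return codes of all Ray task futures and records the job's final status. The following standalone sketch distills that logic; the names and prints are illustrative stand-ins, and the real driver calls `job_lib.set_status()` on the remote cluster rather than printing:

```python
# Distilled from the generated epilogue above: sum the per-node return
# codes, special-case SIGSEGV (139 = 128 + signal 11), and exit non-zero
# so the surrounding Ray job is also marked FAILED.
import sys
from typing import List

SIGSEGV_CODE = 139

def finalize_job(returncodes: List[int]) -> None:
    if sum(returncodes) != 0:
        reason = ''
        if any(r == SIGSEGV_CODE for r in returncodes):
            reason = '(likely due to Segmentation Fault)'
        print(f'ERROR: job failed with return code list: {returncodes} '
              f'{reason}', flush=True)
        sys.exit(1)  # marks the Ray job itself as FAILED
    print('Job succeeded.', flush=True)

if __name__ == '__main__':
    finalize_job([0, 0])  # prints 'Job succeeded.'
```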
@@ -778,34 +396,6 @@ class FailoverCloudErrorHandlerV1:
         setattr(e, 'detailed_reason', detailed_reason)
         raise e
 
-    @staticmethod
-    def _scp_handler(blocked_resources: Set['resources_lib.Resources'],
-                     launchable_resources: 'resources_lib.Resources',
-                     region: 'clouds.Region',
-                     zones: Optional[List['clouds.Zone']], stdout: str,
-                     stderr: str):
-        del zones  # Unused.
-        errors = FailoverCloudErrorHandlerV1._handle_errors(
-            stdout,
-            stderr,
-            is_error_str_known=lambda x: 'SCPError:' in x.strip())
-
-        logger.warning(f'Got error(s) in {region.name}:')
-        messages = '\n\t'.join(errors)
-        style = colorama.Style
-        logger.warning(f'{style.DIM}\t{messages}{style.RESET_ALL}')
-        _add_to_blocked_resources(blocked_resources,
-                                  launchable_resources.copy(zone=None))
-
-        # Sometimes, SCPError will list available regions.
-        for e in errors:
-            if e.find('Regions with capacity available:') != -1:
-                for r in service_catalog.regions('scp'):
-                    if e.find(r.name) == -1:
-                        _add_to_blocked_resources(
-                            blocked_resources,
-                            launchable_resources.copy(region=r.name, zone=None))
-
     @staticmethod
     def _ibm_handler(blocked_resources: Set['resources_lib.Resources'],
                      launchable_resources: 'resources_lib.Resources',
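The V1 handler removed above works purely on text: it scans the provisioner's stdout/stderr for a known marker (here `SCPError:`) and then blocks progressively wider scopes based on what the message says. A minimal sketch of that scanning step, with an illustrative helper in place of `FailoverCloudErrorHandlerV1._handle_errors`:

```python
from typing import Callable, List

def handle_errors(stdout: str, stderr: str,
                  is_error_str_known: Callable[[str], bool]) -> List[str]:
    # Collect lines that match the cloud-specific marker; anything else
    # is an unknown failure and should be surfaced instead of swallowed.
    errors = [line for line in (stdout + '\n' + stderr).splitlines()
              if is_error_str_known(line)]
    if not errors:
        raise RuntimeError('Unknown provisioning failure; see full logs.')
    return errors

print(handle_errors('', 'SCPError: no capacity in region X',
                    lambda x: 'SCPError:' in x.strip()))
# ['SCPError: no capacity in region X']
```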
@@ -1085,7 +675,7 @@ class FailoverCloudErrorHandlerV2:
         output = str(error)
         # Sometimes, lambda cloud error will list available regions.
         if output.find('Regions with capacity available:') != -1:
-            for r in service_catalog.regions('lambda'):
+            for r in catalog.regions('lambda'):
                 if output.find(r.name) == -1:
                     _add_to_blocked_resources(
                         blocked_resources,
@@ -1109,6 +699,21 @@ class FailoverCloudErrorHandlerV2:
         FailoverCloudErrorHandlerV2._default_handler(
             blocked_resources, launchable_resources, region, zones, error)
 
+    @staticmethod
+    def _scp_handler(blocked_resources: Set['resources_lib.Resources'],
+                     launchable_resources: 'resources_lib.Resources',
+                     region: 'clouds.Region',
+                     zones: Optional[List['clouds.Zone']],
+                     error: Exception) -> None:
+        logger.info(f'SCP handler error: {error}')
+        # Block SCP if the credential has expired.
+        if isinstance(error, exceptions.InvalidCloudCredentials):
+            _add_to_blocked_resources(
+                blocked_resources, resources_lib.Resources(cloud=clouds.SCP()))
+        else:
+            FailoverCloudErrorHandlerV2._default_handler(
+                blocked_resources, launchable_resources, region, zones, error)
+
     @staticmethod
     def _default_handler(blocked_resources: Set['resources_lib.Resources'],
                          launchable_resources: 'resources_lib.Resources',
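V2 handlers, by contrast, receive a typed exception: the new `_scp_handler` blocks the entire cloud only for `InvalidCloudCredentials` and otherwise defers to `_default_handler`. A plausible shape for the dispatch that selects these handlers; the `getattr` lookup is an assumption about the surrounding class, not code shown in this hunk:

```python
class ErrorHandlerV2:
    """Toy version of the per-cloud handler dispatch."""

    @staticmethod
    def _scp_handler(error: Exception) -> str:
        # PermissionError stands in for exceptions.InvalidCloudCredentials.
        if isinstance(error, PermissionError):
            return 'blocked the entire SCP cloud'
        return ErrorHandlerV2._default_handler(error)

    @staticmethod
    def _default_handler(error: Exception) -> str:
        return f'blocked the current region/zone: {error!r}'

    @classmethod
    def update_blocklist_on_error(cls, cloud: str, error: Exception) -> str:
        handler = getattr(cls, f'_{cloud.lower()}_handler',
                          cls._default_handler)
        return handler(error)

print(ErrorHandlerV2.update_blocklist_on_error('scp', PermissionError()))
# blocked the entire SCP cloud
```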
@@ -1176,7 +781,8 @@ class RetryingVmProvisioner(object):
                  local_wheel_path: pathlib.Path,
                  wheel_hash: str,
                  blocked_resources: Optional[Iterable[
-                     resources_lib.Resources]] = None):
+                     resources_lib.Resources]] = None,
+                 is_managed: Optional[bool] = None):
         self._blocked_resources: Set[resources_lib.Resources] = set()
         if blocked_resources:
             # blocked_resources is not None and not empty.
@@ -1188,6 +794,7 @@ class RetryingVmProvisioner(object):
         self._requested_features = requested_features
         self._local_wheel_path = local_wheel_path
         self._wheel_hash = wheel_hash
+        self._is_managed = is_managed
 
     def _yield_zones(
             self, to_provision: resources_lib.Resources, num_nodes: int,
@@ -1232,7 +839,8 @@ class RetryingVmProvisioner(object):
             assert isinstance(handle, CloudVmRayResourceHandle), (
                 'handle should be CloudVmRayResourceHandle (found: '
                 f'{type(handle)}) {cluster_name!r}')
-            config = common_utils.read_yaml(handle.cluster_yaml)
+            config = global_user_state.get_cluster_yaml_dict(
+                handle.cluster_yaml)
             # This is for the case when the zone field is not set in the
             # launched resources in a previous launch (e.g., ctrl-c during
             # launch and multi-node cluster before PR #1700).
@@ -1316,6 +924,34 @@ class RetryingVmProvisioner(object):
             zones = [clouds.Zone(name=to_provision.zone)]
             yield zones
 
+    def _insufficient_resources_msg(
+        self,
+        to_provision: resources_lib.Resources,
+        requested_resources: Set[resources_lib.Resources],
+        insufficient_resources: Optional[List[str]],
+    ) -> str:
+        insufficent_resource_msg = ('' if insufficient_resources is None else
+                                    f' ({", ".join(insufficient_resources)})')
+        message = f'Failed to acquire resources{insufficent_resource_msg} '
+        if to_provision.zone is not None:
+            message += (f'in {to_provision.zone} for {requested_resources}. ')
+        elif to_provision.region is not None and to_provision.cloud is not None:
+            # For public clouds, provision.region is always set.
+            if clouds.SSH().is_same_cloud(to_provision.cloud):
+                message += (
+                    f'in SSH Node Pool ({to_provision.region.lstrip("ssh-")}) '
+                    f'for {requested_resources}. The SSH Node Pool may not '
+                    'have enough resources.')
+            elif clouds.Kubernetes().is_same_cloud(to_provision.cloud):
+                message += (f'in context {to_provision.region} for '
+                            f'{requested_resources}. ')
+            else:
+                message += (f'in all zones in {to_provision.region} for '
+                            f'{requested_resources}. ')
+        else:
+            message += (f'{to_provision.cloud} for {requested_resources}. ')
+        return message
+
     def _retry_zones(
         self,
         to_provision: resources_lib.Resources,
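To see how `_insufficient_resources_msg` composes its output, here is a toy re-implementation over plain strings; real callers pass SkyPilot `Resources` objects, and the SSH Node Pool branch is elided for brevity:

```python
from typing import List, Optional

def insufficient_msg(zone: Optional[str], region: Optional[str],
                     cloud: Optional[str], requested: str,
                     details: Optional[List[str]] = None) -> str:
    suffix = '' if details is None else f' ({", ".join(details)})'
    msg = f'Failed to acquire resources{suffix} '
    if zone is not None:
        msg += f'in {zone} for {requested}. '
    elif region is not None and cloud is not None:
        if cloud == 'kubernetes':
            msg += f'in context {region} for {requested}. '
        else:
            msg += f'in all zones in {region} for {requested}. '
    else:
        msg += f'{cloud} for {requested}. '
    return msg

print(insufficient_msg('us-east-1a', 'us-east-1', 'aws', '1x A100'))
# Failed to acquire resources in us-east-1a for 1x A100.
print(insufficient_msg(None, 'my-ctx', 'kubernetes', '1x A100', ['GPU']))
# Failed to acquire resources (GPU) in context my-ctx for 1x A100.
```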
@@ -1329,6 +965,7 @@ class RetryingVmProvisioner(object):
         prev_handle: Optional['CloudVmRayResourceHandle'],
         prev_cluster_ever_up: bool,
         skip_if_config_hash_matches: Optional[str],
+        volume_mounts: Optional[List[volume_lib.VolumeMount]],
     ) -> Dict[str, Any]:
         """The provision retry loop.
 
@@ -1349,12 +986,17 @@ class RetryingVmProvisioner(object):
         if not dryrun:
             os.makedirs(os.path.expanduser(self.log_dir), exist_ok=True)
             os.system(f'touch {log_path}')
+
         rich_utils.force_update_status(
-            ux_utils.spinner_message('Launching', log_path))
+            ux_utils.spinner_message('Launching',
+                                     log_path,
+                                     cluster_name=cluster_name))
 
         # Get previous cluster status
         cluster_exists = prev_cluster_status is not None
 
+        to_provision = to_provision.assert_launchable()
+
         assert to_provision.region is not None, (
             to_provision, 'region should have been set by the optimizer.')
         region = clouds.Region(to_provision.region)
@@ -1388,6 +1030,7 @@ class RetryingVmProvisioner(object):
                 f'To request quotas, check the instruction: '
                 f'https://docs.skypilot.co/en/latest/cloud-setup/quota.html.')
 
+        insufficient_resources = None
         for zones in self._yield_zones(to_provision, num_nodes, cluster_name,
                                        prev_cluster_status,
                                        prev_cluster_ever_up):
@@ -1432,7 +1075,9 @@ class RetryingVmProvisioner(object):
                     region=region,
                     zones=zones,
                     dryrun=dryrun,
-                    keep_launch_fields_in_existing_config=cluster_exists)
+                    keep_launch_fields_in_existing_config=cluster_exists,
+                    volume_mounts=volume_mounts,
+                )
             except exceptions.ResourcesUnavailableError as e:
                 # Failed due to catalog issue, e.g. image not found, or
                 # GPUs are requested in a Kubernetes cluster but the cluster
@@ -1515,8 +1160,17 @@ class RetryingVmProvisioner(object):
                 cluster_handle=handle,
                 requested_resources=requested_resources,
                 ready=False,
+                is_managed=self._is_managed,
+                provision_log_path=log_abs_path,
             )
 
+            # Add cluster event for actual provisioning start.
+            global_user_state.add_cluster_event(
+                cluster_name, status_lib.ClusterStatus.INIT,
+                f'Provisioning on {to_provision.cloud.display_name()} ' +
+                f'in {to_provision.region}',
+                global_user_state.ClusterEventType.STATUS_CHANGE)
+
             global_user_state.set_owner_identity_for_cluster(
                 cluster_name, cloud_user_identity)
 
@@ -1543,11 +1197,13 @@ class RetryingVmProvisioner(object):
             controller_str = ('' if controller is None else
                               f' {controller.value.name}')
             if isinstance(to_provision.cloud, clouds.Kubernetes):
-
+                suffix = '.'
+                if region.name.startswith('ssh-'):
+                    suffix = f' ({region.name.lstrip("ssh-")})'
                 logger.info(
                     ux_utils.starting_message(
                         f'Launching{controller_str} on '
-                        f'{to_provision.cloud}.'))
+                        f'{to_provision.cloud}{suffix}'))
             else:
                 logger.info(
                     ux_utils.starting_message(
@@ -1587,6 +1243,24 @@ class RetryingVmProvisioner(object):
                 # No teardown happens for this error.
                 with ux_utils.print_exception_no_traceback():
                     raise
+            except config_lib.KubernetesError as e:
+                if e.insufficent_resources:
+                    insufficient_resources = e.insufficent_resources
+                # NOTE: We try to cleanup the cluster even if the previous
+                # cluster does not exist. Also we are fast at
+                # cleaning up clusters now if there is no existing node.
+                CloudVmRayBackend().post_teardown_cleanup(
+                    handle,
+                    terminate=not prev_cluster_ever_up,
+                    remove_from_db=False,
+                    failover=True,
+                )
+                # TODO(suquark): other clouds may have different zone
+                # blocking strategy. See '_update_blocklist_on_error'
+                # for details.
+                FailoverCloudErrorHandlerV2.update_blocklist_on_error(
+                    self._blocked_resources, to_provision, region, zones, e)
+                continue
             except Exception as e:  # pylint: disable=broad-except
                 # NOTE: We try to cleanup the cluster even if the previous
                 # cluster does not exist. Also we are fast at
|
|
|
1594
1268
|
CloudVmRayBackend().post_teardown_cleanup(
|
|
1595
1269
|
handle,
|
|
1596
1270
|
terminate=not prev_cluster_ever_up,
|
|
1597
|
-
remove_from_db=False
|
|
1271
|
+
remove_from_db=False,
|
|
1272
|
+
failover=True)
|
|
1598
1273
|
# TODO(suquark): other clouds may have different zone
|
|
1599
1274
|
# blocking strategy. See '_update_blocklist_on_error'
|
|
1600
1275
|
# for details.
|
|
@@ -1650,7 +1325,9 @@ class RetryingVmProvisioner(object):
             config_dict['handle'] = handle
             logger.info(
                 ux_utils.finishing_message(
-                    f'Cluster launched: {cluster_name!r}.', log_path))
+                    f'Cluster launched: {cluster_name!r}.',
+                    log_path,
+                    cluster_name=cluster_name))
             return config_dict
 
         # The cluster is not ready. We must perform error recording and/or
@@ -1714,17 +1391,9 @@ class RetryingVmProvisioner(object):
                 terminate=terminate_or_stop,
                 remove_from_db=False)
 
-        if to_provision.zone is not None:
-            message = ('Failed to acquire resources in '
-                       f'{to_provision.zone} for '
-                       f'{requested_resources}. ')
-        elif to_provision.region is not None:
-            # For public clouds, provision.region is always set.
-            message = ('Failed to acquire resources in all zones in '
-                       f'{to_provision.region} for {requested_resources}. ')
-        else:
-            message = (f'Failed to acquire resources in {to_provision.cloud} '
-                       f'for {requested_resources}. ')
+        message = self._insufficient_resources_msg(to_provision,
+                                                   requested_resources,
+                                                   insufficient_resources)
         # Do not failover to other locations if the cluster was ever up, since
         # the user can have some data on the cluster.
         raise exceptions.ResourcesUnavailableError(
@@ -1775,7 +1444,8 @@ class RetryingVmProvisioner(object):
             log_abs_path,
             stream_logs=False,
             start_streaming_at='Shared connection to',
-            line_processor=log_utils.RayUpLineProcessor(log_abs_path),
+            line_processor=log_utils.RayUpLineProcessor(
+                log_abs_path, cluster_name=cluster_handle.cluster_name),
             # Reduce BOTO_MAX_RETRIES from 12 to 5 to avoid long hanging
             # time during 'ray up' if insufficient capacity occurs.
             env=dict(
@@ -1919,9 +1589,10 @@ class RetryingVmProvisioner(object):
         # ready to ensure cluster will not scale up after preemption (spot).
         # Skip for non-spot as this takes extra time to provision (~1min).
         if use_spot:
-            ray_config = common_utils.read_yaml(cluster_config_file)
+            ray_config = global_user_state.get_cluster_yaml_dict(
+                cluster_config_file)
             ray_config['upscaling_speed'] = 0
-            common_utils.dump_yaml(cluster_config_file, ray_config)
+            yaml_utils.dump_yaml(cluster_config_file, ray_config)
         start = time.time()
         returncode, stdout, stderr = ray_up()
         logger.debug(
@@ -2030,6 +1701,7 @@ class RetryingVmProvisioner(object):
                 f' that never expire or a service account.\033[0m')
             logger.warning(warnings)
 
+        to_provision = to_provision.assert_launchable()
         # Retrying launchable resources.
         while True:
             try:
@@ -2068,7 +1740,9 @@ class RetryingVmProvisioner(object):
                     prev_cluster_status=prev_cluster_status,
                     prev_handle=prev_handle,
                     prev_cluster_ever_up=prev_cluster_ever_up,
-                    skip_if_config_hash_matches=skip_if_config_hash_matches)
+                    skip_if_config_hash_matches=skip_if_config_hash_matches,
+                    volume_mounts=task.volume_mounts,
+                )
                 if dryrun:
                     return config_dict
             except (exceptions.InvalidClusterNameError,
@@ -2115,8 +1789,6 @@ class RetryingVmProvisioner(object):
                 # terminated by _retry_zones().
                 assert (prev_cluster_status == status_lib.ClusterStatus.INIT
                         ), prev_cluster_status
-                assert global_user_state.get_handle_from_cluster_name(
-                    cluster_name) is None, cluster_name
                 logger.info(
                     ux_utils.retry_message(
                         f'Retrying provisioning with requested resources: '
@@ -2151,20 +1823,45 @@ class RetryingVmProvisioner(object):
                 # possible resources or the requested resources is too
                 # restrictive. If we reach here, our failover logic finally
                 # ends here.
-                table = log_utils.create_table(['
+                table = log_utils.create_table(['INFRA', 'RESOURCES', 'REASON'])
                 for (resource, exception) in resource_exceptions.items():
-                    table.add_row(
-
-
+                    table.add_row([
+                        resource.infra.formatted_str(),
+                        resources_utils.format_resource(
+                            resource, simplified_only=True)[0], exception
+                    ])
+                # Set the max width of REASON column to 80 to avoid the table
+                # being wrapped in a unreadable way.
+                # pylint: disable=protected-access
+                table._max_width = {'REASON': 80}
                 raise exceptions.ResourcesUnavailableError(
                     _RESOURCES_UNAVAILABLE_LOG + '\n' + table.get_string(),
                     failover_history=failover_history)
-            to_provision = task.best_resources
+            best_resources = task.best_resources
             assert task in self._dag.tasks, 'Internal logic error.'
-            assert to_provision is not None, task
+            assert best_resources is not None, task
+            to_provision = best_resources
         return config_dict
 
 
+@dataclasses.dataclass
+class SSHTunnelInfo:
+    port: int
+    pid: int
+
+
+def _is_tunnel_healthy(tunnel: SSHTunnelInfo) -> bool:
+    try:
+        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
+            s.settimeout(0.5)
+            s.connect(('localhost', tunnel.port))
+            return True
+    except socket.error as e:
+        logger.warning(f'Failed to connect to tunnel on port {tunnel.port}: '
+                       f'{common_utils.format_exception(e)}')
+        return False
+
+
 class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
     """A pickle-able handle to a cluster created by CloudVmRayBackend.
 
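`_is_tunnel_healthy` is just a cheap TCP connect probe against the tunnel's local port. A self-contained demonstration of the same probe, with a plain listener standing in for the SSH tunnel process:

```python
import socket
from dataclasses import dataclass

@dataclass
class TunnelInfo:  # stand-in for SSHTunnelInfo
    port: int
    pid: int

def is_tunnel_healthy(tunnel: TunnelInfo) -> bool:
    try:
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
            s.settimeout(0.5)
            s.connect(('localhost', tunnel.port))
            return True
    except OSError:  # socket.error is an alias of OSError
        return False

listener = socket.socket()
listener.bind(('localhost', 0))  # ephemeral port
listener.listen(1)
port = listener.getsockname()[1]
print(is_tunnel_healthy(TunnelInfo(port=port, pid=0)))  # True
listener.close()
print(is_tunnel_healthy(TunnelInfo(port=port, pid=0)))  # False (refused)
```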
@@ -2184,10 +1881,11 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
     - (optional) Launched resources
     - (optional) Docker user name
     - (optional) If TPU(s) are managed, a path to a deletion script.
+    - (optional) Skylet SSH tunnel info.
     """
     # Bump if any fields get added/removed/changed, and add backward
-    # compatibility logic in __setstate__.
-    _VERSION = 10
+    # compatibility logic in __setstate__ and/or __getstate__.
+    _VERSION = 12
 
     def __init__(
         self,
@@ -2220,6 +1918,7 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
         self.launched_nodes = launched_nodes
         self.launched_resources = launched_resources
         self.docker_user: Optional[str] = None
+        self.is_grpc_enabled = True
 
     def __repr__(self):
         return (f'ResourceHandle('
|
@@ -2235,17 +1934,21 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
|
|
|
2235
1934
|
f'\n\tlaunched_resources={self.launched_nodes}x '
|
|
2236
1935
|
f'{self.launched_resources}, '
|
|
2237
1936
|
f'\n\tdocker_user={self.docker_user},'
|
|
2238
|
-
f'\n\tssh_user={self.ssh_user}'
|
|
1937
|
+
f'\n\tssh_user={self.ssh_user},'
|
|
1938
|
+
f'\n\tis_grpc_enabled={self.is_grpc_enabled},')
|
|
2239
1939
|
|
|
2240
1940
|
def get_cluster_name(self):
|
|
2241
1941
|
return self.cluster_name
|
|
2242
1942
|
|
|
1943
|
+
def get_cluster_name_on_cloud(self):
|
|
1944
|
+
return self.cluster_name_on_cloud
|
|
1945
|
+
|
|
2243
1946
|
def _use_internal_ips(self):
|
|
2244
1947
|
"""Returns whether to use internal IPs for SSH connections."""
|
|
2245
1948
|
# Directly load the `use_internal_ips` flag from the cluster yaml
|
|
2246
1949
|
# instead of `skypilot_config` as the latter can be changed after the
|
|
2247
1950
|
# cluster is UP.
|
|
2248
|
-
return
|
|
1951
|
+
return global_user_state.get_cluster_yaml_dict(self.cluster_yaml).get(
|
|
2249
1952
|
'provider', {}).get('use_internal_ips', False)
|
|
2250
1953
|
|
|
2251
1954
|
def update_ssh_ports(self, max_attempts: int = 1) -> None:
|
|
@@ -2266,15 +1969,20 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
|
|
|
2266
1969
|
def _update_cluster_info(self):
|
|
2267
1970
|
# When a cluster is on a cloud that does not support the new
|
|
2268
1971
|
# provisioner, we should skip updating cluster_info.
|
|
2269
|
-
if (self.launched_resources.cloud
|
|
1972
|
+
if (self.launched_resources.cloud is not None and
|
|
1973
|
+
self.launched_resources.cloud.PROVISIONER_VERSION >=
|
|
2270
1974
|
clouds.ProvisionerVersion.SKYPILOT):
|
|
2271
1975
|
provider_name = str(self.launched_resources.cloud).lower()
|
|
2272
1976
|
config = {}
|
|
2273
|
-
|
|
2274
|
-
|
|
2275
|
-
|
|
2276
|
-
|
|
2277
|
-
|
|
1977
|
+
# It is possible that the cluster yaml is not available when
|
|
1978
|
+
# the handle is unpickled for service replicas from the
|
|
1979
|
+
# controller with older version.
|
|
1980
|
+
yaml_str = global_user_state.get_cluster_yaml_str(self.cluster_yaml)
|
|
1981
|
+
if yaml_str is None:
|
|
1982
|
+
# If the cluster yaml is not available,
|
|
1983
|
+
# we skip updating the cluster info.
|
|
1984
|
+
return
|
|
1985
|
+
config = yaml_utils.safe_load(yaml_str)
|
|
2278
1986
|
try:
|
|
2279
1987
|
cluster_info = provision_lib.get_cluster_info(
|
|
2280
1988
|
provider_name,
|
|
@@ -2410,12 +2118,23 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
|
|
|
2410
2118
|
zip(cluster_internal_ips, cluster_feasible_ips))
|
|
2411
2119
|
|
|
2412
2120
|
# Ensure head node is the first element, then sort based on the
|
|
2413
|
-
# external IPs for stableness
|
|
2414
|
-
|
|
2415
|
-
|
|
2121
|
+
# external IPs for stableness. Skip for k8s nodes since pods
|
|
2122
|
+
# worker ids are already mapped.
|
|
2123
|
+
if (cluster_info is not None and
|
|
2124
|
+
cluster_info.provider_name == 'kubernetes'):
|
|
2125
|
+
stable_internal_external_ips = internal_external_ips
|
|
2126
|
+
else:
|
|
2127
|
+
stable_internal_external_ips = [internal_external_ips[0]] + sorted(
|
|
2128
|
+
internal_external_ips[1:], key=lambda x: x[1])
|
|
2416
2129
|
self.stable_internal_external_ips = stable_internal_external_ips
|
|
2417
2130
|
|
|
2418
|
-
@
|
|
2131
|
+
@context_utils.cancellation_guard
|
|
2132
|
+
# we expect different request to be acting on different clusters
|
|
2133
|
+
# (= different handles) so we have no real expectation of cache hit
|
|
2134
|
+
# across requests.
|
|
2135
|
+
# Do not change this cache to global scope
|
|
2136
|
+
# without understanding https://github.com/skypilot-org/skypilot/pull/6908
|
|
2137
|
+
@annotations.lru_cache(scope='request', maxsize=10)
|
|
2419
2138
|
@timeline.event
|
|
2420
2139
|
def get_command_runners(self,
|
|
2421
2140
|
force_cached: bool = False,
|
|
@@ -2426,19 +2145,21 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
             self.cluster_yaml, self.docker_user, self.ssh_user)
         if avoid_ssh_control:
             ssh_credentials.pop('ssh_control_name', None)
+
+        launched_resources = self.launched_resources.assert_launchable()
         updated_to_skypilot_provisioner_after_provisioned = (
-            self.launched_resources.cloud.PROVISIONER_VERSION >=
+            launched_resources.cloud.PROVISIONER_VERSION >=
             clouds.ProvisionerVersion.SKYPILOT and
             self.cached_external_ips is not None and
             self.cached_cluster_info is None)
         if updated_to_skypilot_provisioner_after_provisioned:
             logger.debug(
-                f'{self.launched_resources.cloud} has been updated to the new '
+                f'{launched_resources.cloud} has been updated to the new '
                 f'provisioner after cluster {self.cluster_name} was '
                 f'provisioned. Cached IPs are used for connecting to the '
                 'cluster.')
         if (clouds.ProvisionerVersion.RAY_PROVISIONER_SKYPILOT_TERMINATOR >=
-                self.launched_resources.cloud.PROVISIONER_VERSION or
+                launched_resources.cloud.PROVISIONER_VERSION or
                 updated_to_skypilot_provisioner_after_provisioned):
             ip_list = (self.cached_external_ips
                        if force_cached else self.external_ips())
@@ -2464,6 +2185,21 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
                     'Tried to use cached cluster info, but it\'s missing for '
                     f'cluster "{self.cluster_name}"')
             self._update_cluster_info()
+        # For Kubernetes, `KubernetesCommandRunner` want to get the pod names
+        # to run the command. But for high availability serve controller,
+        # the controller pod is part of a deployment, and once the pod is
+        # killed and a new one is created, the pod name changes, so we need
+        # to manually update the cluster info here.
+        # TODO(andyl): See if we can prevent this refresh. Like pass in
+        # deployment name as identifier for KubernetesCommandRunner. Now this
+        # is required for rsync as using deployment in rsync seems to cause
+        # some unknown issues.
+        # TODO(andyl): Should check through the real cluster info. Same as
+        # the TODO in kubernetes/instance.py:terminate_instances
+        if (isinstance(self.launched_resources.cloud, clouds.Kubernetes) and
+                controller_utils.high_availability_specified(
+                    self.cluster_name)):
+            self._update_cluster_info()
 
         assert self.cached_cluster_info is not None, self
         runners = provision_lib.get_command_runners(
@@ -2532,6 +2268,201 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
             cluster_config_file)
         self.docker_user = docker_user
 
+    def _get_skylet_ssh_tunnel(self) -> Optional[SSHTunnelInfo]:
+        metadata = global_user_state.get_cluster_skylet_ssh_tunnel_metadata(
+            self.cluster_name)
+        if metadata is None:
+            return None
+        return SSHTunnelInfo(port=metadata[0], pid=metadata[1])
+
+    def _set_skylet_ssh_tunnel(self, tunnel: Optional[SSHTunnelInfo]) -> None:
+        global_user_state.set_cluster_skylet_ssh_tunnel_metadata(
+            self.cluster_name,
+            (tunnel.port, tunnel.pid) if tunnel is not None else None)
+
+    def close_skylet_ssh_tunnel(self) -> None:
+        """Terminate the SSH tunnel process and clear its metadata."""
+        tunnel = self._get_skylet_ssh_tunnel()
+        if tunnel is None:
+            return
+        logger.debug('Closing Skylet SSH tunnel for cluster %r on port %d',
+                     self.cluster_name, tunnel.port)
+        try:
+            self._terminate_ssh_tunnel_process(tunnel)
+        finally:
+            self._set_skylet_ssh_tunnel(None)
+
+    def get_grpc_channel(self) -> 'grpc.Channel':
+        grpc_options = [
+            # The task YAMLs can be large, so the default
+            # max_receive_message_length of 4MB might not be enough.
+            ('grpc.max_receive_message_length', -1),
+        ]
+        # It's fine to not grab the lock here, as we're only reading,
+        # and writes are very rare.
+        # It's acceptable to read while another process is opening a tunnel,
+        # because it will only happen on:
+        # 1. A new cluster who has no tunnel yet, or
+        # 2. A cluster with an unhealthy tunnel
+        # For (2), for processes that read the "stale" tunnel, it will fail
+        # and on the next retry, it will call get_grpc_channel again
+        # and get the new tunnel.
+        tunnel = self._get_skylet_ssh_tunnel()
+        if tunnel is not None:
+            if _is_tunnel_healthy(tunnel):
+                return grpc.insecure_channel(f'localhost:{tunnel.port}',
+                                             options=grpc_options)
+            logger.debug('Failed to connect to SSH tunnel for cluster '
+                         f'{self.cluster_name!r} on port {tunnel.port}')
+
+        lock_id = backend_utils.cluster_tunnel_lock_id(self.cluster_name)
+        remaining_timeout = backend_utils.CLUSTER_TUNNEL_LOCK_TIMEOUT_SECONDS
+        start_time = time.perf_counter()
+        attempt = 1
+
+        def _get_remaining_timeout() -> float:
+            return max(0.0,
+                       remaining_timeout - (time.perf_counter() - start_time))
+
+        while remaining_timeout > 0:
+            logger.debug(
+                'Attempting to acquire exclusive lock for %s (attempt %d)',
+                lock_id, attempt)
+            exclusive_lock = locks.get_lock(lock_id, remaining_timeout)
+            try:
+                with exclusive_lock.acquire(blocking=False):
+                    wait_elapsed = time.perf_counter() - start_time
+                    logger.debug(f'Acquired exclusive lock for {lock_id} after '
+                                 f'{wait_elapsed:.2f}s')
+                    try:
+                        tunnel = self._open_and_update_skylet_tunnel()
+                        return grpc.insecure_channel(f'localhost:{tunnel.port}',
+                                                     options=grpc_options)
+                    except Exception as e:  # pylint: disable=broad-except
+                        # Failed to open tunnel, release the lock and retry.
+                        logger.warning(f'Failed to open tunnel for cluster '
+                                       f'{self.cluster_name!r}: '
+                                       f'{common_utils.format_exception(e)}')
+                        remaining_timeout = _get_remaining_timeout()
+                        attempt += 1
+                        continue
+            except locks.LockTimeout:
+                pass
+
+            remaining_timeout = _get_remaining_timeout()
+            logger.debug(f'Could not acquire exclusive lock for {lock_id}, '
+                         f'waiting on shared lock (attempt {attempt})')
+            try:
+                # Use shared lock so that concurrent readers can
+                # proceed in parallel.
+                shared_lock = locks.get_lock(lock_id,
+                                             remaining_timeout,
+                                             shared_lock=True)
+                # Wait for the exclusive lock to be released.
+                shared_lock.acquire(blocking=True)
+                # We only need the lock for signalling that the new tunnel has
+                # been opened, not for checking the tunnel health.
+                # Same reasoning as why we don't need to grab the lock in
+                # the fast path at the start of this function.
+                shared_lock.release()
+                wait_elapsed = time.perf_counter() - start_time
+                logger.debug(f'Acquired shared lock for {lock_id} after '
+                             f'{wait_elapsed:.2f}s')
+            except locks.LockTimeout as e:
+                raise RuntimeError(
+                    f'Failed to get gRPC channel for cluster '
+                    f'{self.cluster_name!r} due to a timeout when waiting '
+                    'for the SSH tunnel to be opened. Please try again or '
+                    f'manually remove the lock at {lock_id}. '
+                    f'{common_utils.format_exception(e)}') from e
+
+            # Add small jitter before probing to smoothen the effects
+            # of many readers waking up simultaneously.
+            jitter = random.uniform(0.01, 0.05)
+            time.sleep(jitter)
+
+            # Re-read the tunnel metadata and verify it's healthy.
+            tunnel = self._get_skylet_ssh_tunnel()
+            if tunnel is not None:
+                if _is_tunnel_healthy(tunnel):
+                    return grpc.insecure_channel(f'localhost:{tunnel.port}',
+                                                 options=grpc_options)
+                logger.debug('Failed to connect to SSH tunnel for cluster '
+                             f'{self.cluster_name!r} on port {tunnel.port}')
+            # Tunnel is still unhealthy or missing, try again with updated
+            # timeout. This could happen in the case where the thread who
+            # held the exclusive lock to open the tunnel crashed.
+            remaining_timeout = _get_remaining_timeout()
+            attempt += 1
+        raise RuntimeError('Timeout waiting for gRPC channel for cluster '
+                           f'{self.cluster_name!r} to be ready.')
+
+    def _terminate_ssh_tunnel_process(self, tunnel_info: SSHTunnelInfo) -> None:
+        """Terminate the SSH tunnel process."""
+        try:
+            proc = psutil.Process(tunnel_info.pid)
+            if proc.is_running() and proc.status() != psutil.STATUS_ZOMBIE:
+                logger.debug(
+                    f'Terminating SSH tunnel process {tunnel_info.pid}')
+                subprocess_utils.kill_children_processes(proc.pid)
+        except psutil.NoSuchProcess:
+            pass
+        except Exception as e:  # pylint: disable=broad-except
+            logger.warning(
+                f'Failed to cleanup SSH tunnel process {tunnel_info.pid}: {e}')
+
+    def _open_and_update_skylet_tunnel(self) -> SSHTunnelInfo:
+        """Opens an SSH tunnel to the Skylet on the head node,
+        updates the cluster handle, and persists it to the database."""
+        max_attempts = 3
+        # There could be a race condition here, as multiple processes may
+        # attempt to open the same port at the same time.
+        for attempt in range(max_attempts):
+            runners = self.get_command_runners()
+            head_runner = runners[0]
+            local_port = random.randint(10000, 65535)
+            try:
+                ssh_tunnel_proc = backend_utils.open_ssh_tunnel(
+                    head_runner, (local_port, constants.SKYLET_GRPC_PORT))
+            except exceptions.CommandError as e:
+                # Don't retry if the error is due to timeout,
+                # connection refused, Kubernetes pods not found,
+                # or an in-progress termination.
+                if (e.detailed_reason is not None and
+                    (backend_utils.SSH_CONNECTION_ERROR_PATTERN.search(
+                        e.detailed_reason) or
+                     backend_utils.K8S_PODS_NOT_FOUND_PATTERN.search(
+                         e.detailed_reason) or attempt == max_attempts - 1)):
+                    raise e
+                logger.warning(
+                    f'Failed to open SSH tunnel on port {local_port} '
+                    f'({attempt + 1}/{max_attempts}). '
+                    f'{e.error_msg}\n{e.detailed_reason}')
+                continue
+            tunnel_info = SSHTunnelInfo(port=local_port,
+                                        pid=ssh_tunnel_proc.pid)
+            break
+
+        try:
+            grpc.channel_ready_future(
+                grpc.insecure_channel(f'localhost:{tunnel_info.port}')).result(
+                    timeout=constants.SKYLET_GRPC_TIMEOUT_SECONDS)
+            # Clean up existing tunnel before setting up the new one.
+            old_tunnel = self._get_skylet_ssh_tunnel()
+            if old_tunnel is not None:
+                self._terminate_ssh_tunnel_process(old_tunnel)
+            self._set_skylet_ssh_tunnel(tunnel_info)
+            return tunnel_info
+        except grpc.FutureTimeoutError as e:
+            self._terminate_ssh_tunnel_process(tunnel_info)
+            logger.warning(
+                f'Skylet gRPC channel for cluster {self.cluster_name} not '
+                f'ready after {constants.SKYLET_GRPC_TIMEOUT_SECONDS}s')
+            raise e
+        except Exception as e:
+            self._terminate_ssh_tunnel_process(tunnel_info)
+            raise e
+
     @property
     def cluster_yaml(self) -> Optional[str]:
         if self._cluster_yaml is None:
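`get_grpc_channel` coordinates tunnel creation with an exclusive/shared lock pair: one writer opens the tunnel under the exclusive lock, while readers wait on the shared lock purely as a "tunnel has been opened" signal, then re-probe. A distilled sketch of that protocol using POSIX `flock(2)` in place of SkyPilot's `locks` module (an assumption; the real primitive also supports timeouts and distributed backends):

```python
import fcntl
import os

LOCK_PATH = '/tmp/demo-tunnel.lock'  # illustrative lock file

def get_channel_once(probe, open_tunnel):
    fd = os.open(LOCK_PATH, os.O_CREAT | os.O_RDWR)
    try:
        try:
            # Writer path: try the exclusive lock without blocking.
            fcntl.flock(fd, fcntl.LOCK_EX | fcntl.LOCK_NB)
            return open_tunnel()
        except BlockingIOError:
            # Reader path: block on a shared lock until the writer is
            # done, release immediately, then re-check the tunnel.
            fcntl.flock(fd, fcntl.LOCK_SH)
            fcntl.flock(fd, fcntl.LOCK_UN)
            if probe():
                return 'reused freshly opened tunnel'
            raise TimeoutError('tunnel still unhealthy; caller retries')
    finally:
        os.close(fd)  # also drops any lock still held on this fd

print(get_channel_once(probe=lambda: True,
                       open_tunnel=lambda: 'opened new tunnel'))
# opened new tunnel
```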
@@ -2542,6 +2473,12 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
     def cluster_yaml(self, value: Optional[str]):
         self._cluster_yaml = value
 
+    @property
+    def instance_ids(self):
+        if self.cached_cluster_info is not None:
+            return self.cached_cluster_info.instance_ids()
+        return None
+
     @property
     def ssh_user(self):
         if self.cached_cluster_info is not None:
@@ -2576,6 +2513,18 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
             num_ips = 1
         return num_ips
 
+    @property
+    def is_grpc_enabled_with_flag(self) -> bool:
+        """Returns whether this handle has gRPC enabled and gRPC flag is set."""
+        return env_options.Options.ENABLE_GRPC.get() and self.is_grpc_enabled
+
+    def __getstate__(self):
+        state = self.__dict__.copy()
+        # For backwards compatibility. Refer to
+        # https://github.com/skypilot-org/skypilot/pull/7133
+        state.setdefault('skylet_ssh_tunnel', None)
+        return state
+
     def __setstate__(self, state):
         self._version = self._VERSION
 
@@ -2606,7 +2555,7 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
         # pylint: disable=import-outside-toplevel
         launched_resources = state['launched_resources']
         if isinstance(launched_resources.cloud, clouds.Kubernetes):
-            yaml_config = common_utils.read_yaml(
+            yaml_config = global_user_state.get_cluster_yaml_dict(
                 os.path.expanduser(state['_cluster_yaml']))
             context = kubernetes_utils.get_context_from_config(
                 yaml_config['provider'])
@@ -2629,6 +2578,14 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
                 os.path.expanduser(state['_cluster_yaml'])):
             state['_cluster_yaml'] = None
 
+        if version < 11:
+            state['is_grpc_enabled'] = False
+            state['skylet_ssh_tunnel'] = None
+
+        if version >= 12:
+            # DEPRECATED in favor of skylet_ssh_tunnel_metadata column in the DB
+            state.pop('skylet_ssh_tunnel', None)
+
         self.__dict__.update(state)
 
         # Because the update_cluster_ips and update_ssh_ports
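The `__getstate__`/`__setstate__` pair above follows a common pickle versioning pattern: bump `_VERSION` when fields change, keep deprecated fields present so older readers can still unpickle, and migrate old states forward on load. A minimal self-contained sketch of the same idea:

```python
import pickle

class Handle:
    _VERSION = 12

    def __init__(self):
        self._version = self._VERSION
        self.is_grpc_enabled = True

    def __getstate__(self):
        state = self.__dict__.copy()
        # Keep a deprecated field present so *older* readers can unpickle.
        state.setdefault('skylet_ssh_tunnel', None)
        return state

    def __setstate__(self, state):
        version = state.get('_version', 0)
        self._version = self._VERSION
        if version < 11:
            # Field did not exist before v11: default it.
            state['is_grpc_enabled'] = False
        if version >= 12:
            # Deprecated in favor of a DB column: drop on load.
            state.pop('skylet_ssh_tunnel', None)
        self.__dict__.update(state)

h = pickle.loads(pickle.dumps(Handle()))
print(h.is_grpc_enabled, hasattr(h, 'skylet_ssh_tunnel'))  # True False
```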
@@ -2653,6 +2610,234 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
             pass
 
 
+class LocalResourcesHandle(CloudVmRayResourceHandle):
+    """A handle for local resources."""
+
+    def __init__(
+            self,
+            *,
+            cluster_name: str,
+            cluster_name_on_cloud: str,
+            cluster_yaml: Optional[str],
+            launched_nodes: int,
+            launched_resources: resources_lib.Resources,
+            stable_internal_external_ips: Optional[List[Tuple[str,
+                                                              str]]] = None,
+            stable_ssh_ports: Optional[List[int]] = None,
+            cluster_info: Optional[provision_common.ClusterInfo] = None
+    ) -> None:
+        super().__init__(
+            cluster_name=cluster_name,
+            cluster_name_on_cloud=cluster_name_on_cloud,
+            cluster_yaml=cluster_yaml,
+            launched_nodes=launched_nodes,
+            launched_resources=launched_resources,
+            stable_internal_external_ips=stable_internal_external_ips,
+            stable_ssh_ports=stable_ssh_ports,
+            cluster_info=cluster_info)
+        # TODO (kyuds): handle jobs consolidation mode. Currently,
+        # jobs consolidation mode will not run a skylet, hence
+        # grpc server will not run. In the future, we should
+        # figure out a way to start grpc in consolidation mode.
+        self.is_grpc_enabled = False
+
+    @context_utils.cancellation_guard
+    # we expect different request to be acting on different clusters
+    # (= different handles) so we have no real expectation of cache hit
+    # across requests.
+    # Do not change this cache to global scope
+    # without understanding https://github.com/skypilot-org/skypilot/pull/6908
+    @annotations.lru_cache(scope='request', maxsize=10)
+    @timeline.event
+    def get_command_runners(self,
+                            force_cached: bool = False,
+                            avoid_ssh_control: bool = False
+                           ) -> List[command_runner.CommandRunner]:
+        """Returns a list of local command runners."""
+        del force_cached, avoid_ssh_control  # Unused.
+        return [command_runner.LocalProcessCommandRunner()]
+
+
+class SkyletClient:
+    """The client to interact with a remote cluster through Skylet."""
+
+    def __init__(self, channel: 'grpc.Channel'):
+        self._autostop_stub = autostopv1_pb2_grpc.AutostopServiceStub(channel)
+        self._jobs_stub = jobsv1_pb2_grpc.JobsServiceStub(channel)
+        self._serve_stub = servev1_pb2_grpc.ServeServiceStub(channel)
+        self._managed_jobs_stub = (
+            managed_jobsv1_pb2_grpc.ManagedJobsServiceStub(channel))
+
+    def set_autostop(
+        self,
+        request: 'autostopv1_pb2.SetAutostopRequest',
+        timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
+    ) -> 'autostopv1_pb2.SetAutostopResponse':
+        return self._autostop_stub.SetAutostop(request, timeout=timeout)
+
+    def is_autostopping(
+        self,
+        request: 'autostopv1_pb2.IsAutostoppingRequest',
+        timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
+    ) -> 'autostopv1_pb2.IsAutostoppingResponse':
+        return self._autostop_stub.IsAutostopping(request, timeout=timeout)
+
+    def add_job(
+        self,
+        request: 'jobsv1_pb2.AddJobRequest',
+        timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
+    ) -> 'jobsv1_pb2.AddJobResponse':
+        return self._jobs_stub.AddJob(request, timeout=timeout)
+
+    def queue_job(
+        self,
+        request: 'jobsv1_pb2.QueueJobRequest',
+        timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
+    ) -> 'jobsv1_pb2.QueueJobResponse':
+        return self._jobs_stub.QueueJob(request, timeout=timeout)
+
+    def update_status(
+        self,
+        request: 'jobsv1_pb2.UpdateStatusRequest',
+        timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
+    ) -> 'jobsv1_pb2.UpdateStatusResponse':
+        return self._jobs_stub.UpdateStatus(request, timeout=timeout)
+
+    def get_job_queue(
+        self,
+        request: 'jobsv1_pb2.GetJobQueueRequest',
+        timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
+    ) -> 'jobsv1_pb2.GetJobQueueResponse':
+        return self._jobs_stub.GetJobQueue(request, timeout=timeout)
+
+    def cancel_jobs(
+        self,
+        request: 'jobsv1_pb2.CancelJobsRequest',
+        timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
+    ) -> 'jobsv1_pb2.CancelJobsResponse':
+        return self._jobs_stub.CancelJobs(request, timeout=timeout)
+
+    def fail_all_in_progress_jobs(
+        self,
+        request: 'jobsv1_pb2.FailAllInProgressJobsRequest',
+        timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
+    ) -> 'jobsv1_pb2.FailAllInProgressJobsResponse':
+        return self._jobs_stub.FailAllInProgressJobs(request, timeout=timeout)
+
+    def get_job_status(
+        self,
+        request: 'jobsv1_pb2.GetJobStatusRequest',
+        timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
+    ) -> 'jobsv1_pb2.GetJobStatusResponse':
+        return self._jobs_stub.GetJobStatus(request, timeout=timeout)
+
+    def get_job_submitted_timestamp(
+        self,
+        request: 'jobsv1_pb2.GetJobSubmittedTimestampRequest',
+        timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
+    ) -> 'jobsv1_pb2.GetJobSubmittedTimestampResponse':
+        return self._jobs_stub.GetJobSubmittedTimestamp(request,
+                                                        timeout=timeout)
+
+    def get_job_ended_timestamp(
+        self,
+        request: 'jobsv1_pb2.GetJobEndedTimestampRequest',
+        timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
+    ) -> 'jobsv1_pb2.GetJobEndedTimestampResponse':
+        return self._jobs_stub.GetJobEndedTimestamp(request, timeout=timeout)
+
+    def get_log_dirs_for_jobs(
+        self,
+        request: 'jobsv1_pb2.GetLogDirsForJobsRequest',
+        timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
+    ) -> 'jobsv1_pb2.GetLogDirsForJobsResponse':
+        return self._jobs_stub.GetLogDirsForJobs(request, timeout=timeout)
+
+    def tail_logs(
+        self,
+        request: 'jobsv1_pb2.TailLogsRequest',
+        timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
+    ) -> Iterator['jobsv1_pb2.TailLogsResponse']:
+        return self._jobs_stub.TailLogs(request, timeout=timeout)
+
+    def get_service_status(
+        self,
+        request: 'servev1_pb2.GetServiceStatusRequest',
+        timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
+    ) -> 'servev1_pb2.GetServiceStatusResponse':
+        return self._serve_stub.GetServiceStatus(request, timeout=timeout)
+
+    def add_serve_version(
+        self,
+        request: 'servev1_pb2.AddVersionRequest',
+        timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
+    ) -> 'servev1_pb2.AddVersionResponse':
+        return self._serve_stub.AddVersion(request, timeout=timeout)
+
+    def terminate_services(
+        self,
+        request: 'servev1_pb2.TerminateServicesRequest',
+        timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
+    ) -> 'servev1_pb2.TerminateServicesResponse':
+        return self._serve_stub.TerminateServices(request, timeout=timeout)
+
+    def terminate_replica(
+        self,
+        request: 'servev1_pb2.TerminateReplicaRequest',
+        timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
+    ) -> 'servev1_pb2.TerminateReplicaResponse':
+        return self._serve_stub.TerminateReplica(request, timeout=timeout)
+
+    def wait_service_registration(
+        self,
+        request: 'servev1_pb2.WaitServiceRegistrationRequest',
+        timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
+    ) -> 'servev1_pb2.WaitServiceRegistrationResponse':
+        # set timeout to at least 10 seconds more than service register
+        # constant to make sure that timeouts will not occur.
+        if timeout is not None:
+            timeout = max(timeout,
+                          serve_constants.SERVICE_REGISTER_TIMEOUT_SECONDS + 10)
+        return self._serve_stub.WaitServiceRegistration(request,
+                                                        timeout=timeout)
+
+    def update_service(
+        self,
+        request: 'servev1_pb2.UpdateServiceRequest',
+        timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
+    ) -> 'servev1_pb2.UpdateServiceResponse':
+        return self._serve_stub.UpdateService(request, timeout=timeout)
+
+    def get_managed_job_controller_version(
+        self,
+        request: 'managed_jobsv1_pb2.GetVersionRequest',
+        timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
+    ) -> 'managed_jobsv1_pb2.GetVersionResponse':
+        return self._managed_jobs_stub.GetVersion(request, timeout=timeout)
+
+    def get_managed_job_table(
+        self,
+        request: 'managed_jobsv1_pb2.GetJobTableRequest',
+        timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
+    ) -> 'managed_jobsv1_pb2.GetJobTableResponse':
+        return self._managed_jobs_stub.GetJobTable(request, timeout=timeout)
+
+    def get_all_managed_job_ids_by_name(
+        self,
+        request: 'managed_jobsv1_pb2.GetAllJobIdsByNameRequest',
+        timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
+    ) -> 'managed_jobsv1_pb2.GetAllJobIdsByNameResponse':
+        return self._managed_jobs_stub.GetAllJobIdsByName(request,
+                                                          timeout=timeout)
+
+    def cancel_managed_jobs(
+        self,
+        request: 'managed_jobsv1_pb2.CancelJobsRequest',
+        timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
+    ) -> 'managed_jobsv1_pb2.CancelJobsResponse':
+        return self._managed_jobs_stub.CancelJobs(request, timeout=timeout)
+
+
 @registry.BACKEND_REGISTRY.type_register(name='cloudvmray')
 class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
     """Backend: runs on cloud virtual machines, managed by Ray.
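`SkyletClient` is a thin facade: one method per RPC, each pinning a default deadline so call sites never touch stub internals. The same shape in miniature, with a dummy stub standing in for the generated `*_pb2_grpc` classes (which are SkyPilot-internal):

```python
from typing import Optional

DEFAULT_TIMEOUT = 30.0  # stand-in for constants.SKYLET_GRPC_TIMEOUT_SECONDS

class FakeJobsStub:
    """Mimics a generated gRPC stub's call signature."""

    def AddJob(self, request, timeout=None):
        return f'AddJob({request!r}, timeout={timeout})'

class Client:

    def __init__(self, jobs_stub) -> None:
        self._jobs_stub = jobs_stub

    def add_job(self, request,
                timeout: Optional[float] = DEFAULT_TIMEOUT) -> str:
        # Every wrapper forwards the request and applies a sane default
        # deadline.
        return self._jobs_stub.AddJob(request, timeout=timeout)

print(Client(FakeJobsStub()).add_job({'job_name': 'train'}))
# AddJob({'job_name': 'train'}, timeout=30.0)
```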
@@ -2665,7 +2850,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
     NAME = 'cloudvmray'
 
     # Backward compatibility, with the old name of the handle.
-    ResourceHandle = CloudVmRayResourceHandle  # pylint: disable=invalid-name
+    ResourceHandle = CloudVmRayResourceHandle  # type: ignore
 
     def __init__(self):
         self.run_timestamp = sky_logging.get_run_timestamp()
@@ -2680,6 +2865,12 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         self._dag = None
         self._optimize_target = None
         self._requested_features = set()
+        self._dump_final_script = False
+        self._is_managed = False
+        # Optional planner (via register_info): used under the per-cluster lock
+        # to produce a fresh concrete plan when neither a reusable snapshot nor
+        # a caller plan is available.
+        self._planner = None
 
         # Command for running the setup script. It is only set when the
         # setup needs to be run outside the self._setup() and as part of
@@ -2696,6 +2887,10 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         self._requested_features = kwargs.pop('requested_features',
                                               self._requested_features)
         self._dump_final_script = kwargs.pop('dump_final_script', False)
+        self._is_managed = kwargs.pop('is_managed', False)
+        # Optional planner callback for a fresh plan under lock when no
+        # reusable snapshot/caller plan exists. Keeps optimizer in upper layer.
+        self._planner = kwargs.pop('planner', self._planner)
         assert not kwargs, f'Unexpected kwargs: {kwargs}'
 
     def check_resources_fit_cluster(
@@ -2722,9 +2917,9 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         # Usage Collection:
         usage_lib.messages.usage.update_cluster_resources(
             handle.launched_nodes, launched_resources)
-        record = global_user_state.get_cluster_from_name(cluster_name)
-        if record is not None:
-            usage_lib.messages.usage.update_cluster_status(record['status'])
+        status = global_user_state.get_status_from_cluster_name(cluster_name)
+        if status is not None:
+            usage_lib.messages.usage.update_cluster_status(status)
 
         assert launched_resources.region is not None, handle
 
@@ -2846,12 +3041,46 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         # Check if the cluster is owned by the current user. Raise
         # exceptions.ClusterOwnerIdentityMismatchError
         backend_utils.check_owner_identity(cluster_name)
-
-
-
-
-
-
+        lock_id = backend_utils.cluster_status_lock_id(cluster_name)
+        communicated_with_user = False
+
+        while True:
+            try:
+                return self._locked_provision(lock_id, task, to_provision,
+                                              dryrun, stream_logs, cluster_name,
+                                              retry_until_up,
+                                              skip_unnecessary_provisioning)
+            except locks.LockTimeout:
+                if not communicated_with_user:
+                    rich_utils.force_update_status(
+                        ux_utils.spinner_message('Launching - blocked by ' +
+                                                 'other requests ' +
+                                                 colorama.Style.RESET_ALL +
+                                                 colorama.Style.DIM +
+                                                 'Check concurrent requests: ' +
+                                                 'sky api status -v | grep '
+                                                 f'{cluster_name}'))
+
+    def _locked_provision(
+            self,
+            lock_id: str,
+            task: task_lib.Task,
+            to_provision: Optional[resources_lib.Resources],
+            dryrun: bool,
+            stream_logs: bool,
+            cluster_name: str,
+            retry_until_up: bool = False,
+            skip_unnecessary_provisioning: bool = False,
+    ) -> Tuple[Optional[CloudVmRayResourceHandle], bool]:
+        with lock_events.DistributedLockEvent(lock_id, _CLUSTER_LOCK_TIMEOUT):
+            # Reset spinner message to remove any mention of being blocked
+            # by other requests.
+            rich_utils.force_update_status(
+                ux_utils.spinner_message('Launching'))
+
+            # Try to launch the existing cluster first. If no existing
+            # cluster, this function will create a to_provision_config
+            # with required resources.
             to_provision_config = self._check_existing_cluster(
                 task, to_provision, cluster_name, dryrun)
             assert to_provision_config.resources is not None, (
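The hunk above replaces the old file-lock handling with a named per-cluster lock and a retry loop that surfaces a one-time hint when another request holds the lock. Below is a minimal, self-contained sketch of that retry shape; `DistributedLock` and `LockTimeout` are hypothetical stand-ins for SkyPilot's `locks` module, not its actual API.

```python
import time

class LockTimeout(Exception):
    """Raised when the lock cannot be acquired within the timeout."""

class DistributedLock:
    """Stand-in for a per-cluster distributed lock (hypothetical)."""

    def __init__(self, lock_id: str, timeout: float) -> None:
        self.lock_id = lock_id
        self.timeout = timeout

    def __enter__(self) -> 'DistributedLock':
        # A real implementation would block for up to `timeout` seconds
        # and raise LockTimeout if another request still holds the lock.
        return self

    def __exit__(self, *exc) -> bool:
        return False

def provision_with_lock(cluster_name: str) -> str:
    lock_id = f'cluster_status.{cluster_name}'
    notified = False
    while True:
        try:
            with DistributedLock(lock_id, timeout=20):
                return f'provisioned {cluster_name}'
        except LockTimeout:
            if not notified:
                # Tell the user once, then keep retrying quietly.
                print(f'Launching - blocked by other requests: {cluster_name}')
                notified = True
            time.sleep(1)

print(provision_with_lock('my-cluster'))
```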
@@ -2869,14 +3098,6 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             # TODO(suquark): once we have sky on PyPI, we should directly
             # install sky from PyPI.
             local_wheel_path, wheel_hash = wheel_utils.build_sky_wheel()
-            # The most frequent reason for the failure of a provision
-            # request is resource unavailability instead of rate
-            # limiting; to make users wait shorter, we do not make
-            # backoffs exponential.
-            backoff = common_utils.Backoff(
-                initial_backoff=_RETRY_UNTIL_UP_INIT_GAP_SECONDS,
-                max_backoff_factor=1)
-            attempt_cnt = 1
             while True:
                 # For on-demand instances, RetryingVmProvisioner will retry
                 # within the given region first, then optionally retry on all
@@ -2895,21 +3116,25 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                 try:
                     retry_provisioner = RetryingVmProvisioner(
                         self.log_dir,
-                        self._dag,
-                        self._optimize_target,
+                        self._dag,  # type: ignore[arg-type]
+                        self._optimize_target,  # type: ignore[arg-type]
                         self._requested_features,
                         local_wheel_path,
                         wheel_hash,
-                        blocked_resources=task.blocked_resources
+                        blocked_resources=task.blocked_resources,
+                        is_managed=self._is_managed)
                     log_path = os.path.join(self.log_dir, 'provision.log')
                     rich_utils.force_update_status(
-                        ux_utils.spinner_message('Launching',
+                        ux_utils.spinner_message('Launching',
+                                                 log_path,
+                                                 cluster_name=cluster_name))
                     config_dict = retry_provisioner.provision_with_retries(
                         task, to_provision_config, dryrun, stream_logs,
                         skip_unnecessary_provisioning)
                     break
                 except exceptions.ResourcesUnavailableError as e:
                     log_path = retry_provisioner.log_dir + '/provision.log'
+
                     error_message = (
                         f'{colorama.Fore.RED}Failed to provision all '
                         f'possible launchable resources.'
@@ -2920,23 +3145,34 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                     error_message = str(e)
 
                     if retry_until_up:
-
-                        # Sleep and retry.
-                        gap_seconds = backoff.current_backoff()
-                        plural = 's' if attempt_cnt > 1 else ''
+                        gap_seconds = _RETRY_UNTIL_UP_INIT_GAP_SECONDS
                         retry_message = ux_utils.retry_message(
-                            f'Retry after {gap_seconds:.0f}s '
-
-
-
-
-
-
+                            f'Retry after {gap_seconds:.0f}s ')
+                        hint_message = (
+                            f'\n{retry_message} '
+                            f'{ux_utils.provision_hint(cluster_name)}'
+                            f'{colorama.Style.RESET_ALL}')
+
+                        # Add cluster event for retry.
+                        global_user_state.add_cluster_event(
+                            cluster_name, status_lib.ClusterStatus.INIT,
+                            f'Retrying provisioning after {gap_seconds:.0f}s',
+                            global_user_state.ClusterEventType.STATUS_CHANGE)
+
+                        raise exceptions.ExecutionRetryableError(
+                            error_message,
+                            hint=hint_message,
+                            retry_wait_seconds=gap_seconds)
                     # Clean up the cluster's entry in `sky status`.
                     # Do not remove the stopped cluster from the global state
                     # if failed to start.
                     if not e.no_failover:
+                        global_user_state.add_cluster_event(
+                            cluster_name,
+                            None,
+                            'Provision failed: ' + str(e),
+                            global_user_state.ClusterEventType.STATUS_CHANGE,
+                            nop_if_duplicate=True)
                         global_user_state.remove_cluster(cluster_name,
                                                          terminate=True)
                         usage_lib.messages.usage.update_final_cluster_status(
@@ -2944,7 +3180,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                     logger.error(
                         ux_utils.error_message(
                             'Failed to provision resources. '
-                            f'{ux_utils.
+                            f'{ux_utils.provision_hint(cluster_name)}'))
                     error_message += (
                         '\nTo keep retrying until the cluster is up, use '
                         'the `--retry-until-up` flag.')
@@ -2953,8 +3189,9 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                         error_message + '\n' + str(e),
                         failover_history=e.failover_history) from None
             if dryrun:
-
-
+                handle = global_user_state.get_handle_from_cluster_name(
+                    cluster_name)
+                return handle if handle is not None else None, False
 
             if config_dict['provisioning_skipped']:
                 # Skip further provisioning.
@@ -2962,10 +3199,10 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                 # ('handle', 'provision_record', 'resources_vars')
                 # We need to return the handle - but it should be the existing
                 # handle for the cluster.
-
-
-
-                return
+                handle = global_user_state.get_handle_from_cluster_name(
+                    cluster_name)
+                assert handle is not None, (cluster_name, handle)
+                return handle, True
 
             if 'provision_record' in config_dict:
                 # New provisioner is used here.
@@ -2980,8 +3217,15 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                 # and other necessary files to the VM.
                 # 3. Run setup commands to install dependencies.
                 # 4. Starting ray cluster and skylet.
+
+                # Add cluster event for runtime setup start.
+                global_user_state.add_cluster_event(
+                    handle.cluster_name, status_lib.ClusterStatus.INIT,
+                    'Setting up SkyPilot runtime on cluster',
+                    global_user_state.ClusterEventType.STATUS_CHANGE)
+
                 cluster_info = provisioner.post_provision_runtime_setup(
-
+                    handle.launched_resources,
                     resources_utils.ClusterName(handle.cluster_name,
                                                 handle.cluster_name_on_cloud),
                     handle.cluster_yaml,
@@ -2995,6 +3239,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                 # manually or by the cloud provider.
                 # Optimize the case where the cluster's IPs can be retrieved
                 # from cluster_info.
+                handle.cached_cluster_info = cluster_info
                 handle.docker_user = cluster_info.docker_user
                 handle.update_cluster_ips(max_attempts=_FETCH_IP_MAX_ATTEMPTS,
                                           cluster_info=cluster_info)
@@ -3006,7 +3251,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
 
                 self._update_after_cluster_provisioned(
                     handle, to_provision_config.prev_handle, task,
-                    prev_cluster_status,
+                    prev_cluster_status, config_hash)
                 return handle, False
 
             cluster_config_file = config_dict['ray']
@@ -3016,8 +3261,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             ssh_port_list = handle.external_ssh_ports()
             assert ip_list is not None, handle
             assert ssh_port_list is not None, handle
-
-
+            config = global_user_state.get_cluster_yaml_dict(
+                cluster_config_file)
             if 'docker' in config:
                 handle.setup_docker_user(cluster_config_file)
 
@@ -3078,14 +3323,14 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
 
             self._update_after_cluster_provisioned(
                 handle, to_provision_config.prev_handle, task,
-                prev_cluster_status,
+                prev_cluster_status, config_hash)
             return handle, False
 
     def _open_ports(self, handle: CloudVmRayResourceHandle) -> None:
         cloud = handle.launched_resources.cloud
         logger.debug(
             f'Opening ports {handle.launched_resources.ports} for {cloud}')
-        config =
+        config = global_user_state.get_cluster_yaml_dict(handle.cluster_yaml)
         provider_config = config['provider']
         provision_lib.open_ports(repr(cloud), handle.cluster_name_on_cloud,
                                  handle.launched_resources.ports,
@@ -3096,7 +3341,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             prev_handle: Optional[CloudVmRayResourceHandle],
             task: task_lib.Task,
             prev_cluster_status: Optional[status_lib.ClusterStatus],
-
+            config_hash: str) -> None:
         usage_lib.messages.usage.update_cluster_resources(
             handle.launched_nodes, handle.launched_resources)
         usage_lib.messages.usage.update_final_cluster_status(
@@ -3108,16 +3353,26 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         # update_status will query the ray job status for all INIT /
        # PENDING / RUNNING jobs for the real status, since we do not
         # know the actual previous status of the cluster.
-        cmd = job_lib.JobLibCodeGen.update_status()
         logger.debug('Update job queue on remote cluster.')
         with rich_utils.safe_status(
                 ux_utils.spinner_message('Preparing SkyPilot runtime')):
-
-
-
-
-
-
+            use_legacy = not handle.is_grpc_enabled_with_flag
+
+            if not use_legacy:
+                try:
+                    request = jobsv1_pb2.UpdateStatusRequest()
+                    backend_utils.invoke_skylet_with_retries(
+                        lambda: SkyletClient(handle.get_grpc_channel()
+                                            ).update_status(request))
+                except exceptions.SkyletMethodNotImplementedError:
+                    use_legacy = True
+
+            if use_legacy:
+                cmd = job_lib.JobLibCodeGen.update_status()
+                returncode, _, stderr = self.run_on_head(
+                    handle, cmd, require_outputs=True)
+                subprocess_utils.handle_returncode(
+                    returncode, cmd, 'Failed to update job status.', stderr)
        if prev_cluster_status == status_lib.ClusterStatus.STOPPED:
             # Safely set all the previous jobs to FAILED since the cluster
             # is restarted
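This `update_status` change introduces the control-flow pattern that recurs throughout the rest of this diff: try the new skylet gRPC method first, and fall back to the legacy SSH codegen path when the remote skylet predates it. A generic, runnable sketch of that pattern (all names below are illustrative stand-ins, not SkyPilot's API):

```python
class MethodNotImplemented(Exception):
    """Raised when the remote runtime predates the requested gRPC method."""

def update_status_grpc() -> None:
    # Stand-in for the gRPC call; an old remote runtime rejects it.
    raise MethodNotImplemented

def update_status_ssh() -> None:
    print('ran legacy codegen over SSH')

def update_status(grpc_enabled: bool) -> None:
    use_legacy = not grpc_enabled
    if not use_legacy:
        try:
            update_status_grpc()
        except MethodNotImplemented:
            use_legacy = True  # degrade gracefully for older clusters
    if use_legacy:
        update_status_ssh()

update_status(grpc_enabled=True)   # falls back: prints the SSH message
update_status(grpc_enabled=False)  # goes straight to the legacy path
```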
@@ -3125,14 +3380,25 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             # 1. A job finishes RUNNING, but right before it update itself
             # to SUCCEEDED, the cluster is STOPPED by `sky stop`.
             # 2. On next `sky start`, it gets reset to FAILED.
-
-
-
-
-
-
-
+            use_legacy = not handle.is_grpc_enabled_with_flag
+
+            if not use_legacy:
+                try:
+                    fail_request = jobsv1_pb2.FailAllInProgressJobsRequest()
+                    backend_utils.invoke_skylet_with_retries(
+                        lambda: SkyletClient(handle.get_grpc_channel(
+                        )).fail_all_in_progress_jobs(fail_request))
+                except exceptions.SkyletMethodNotImplementedError:
+                    use_legacy = True
+
+            if use_legacy:
+                cmd = job_lib.JobLibCodeGen.fail_all_jobs_in_progress()
+                returncode, stdout, stderr = self.run_on_head(
+                    handle, cmd, require_outputs=True)
+                subprocess_utils.handle_returncode(
+                    returncode, cmd,
+                    'Failed to set previously in-progress jobs to FAILED',
+                    stdout + stderr)
 
         prev_ports = None
         if prev_handle is not None:
@@ -3142,14 +3408,20 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                 resources_utils.port_ranges_to_set(current_ports) -
                 resources_utils.port_ranges_to_set(prev_ports))
             if open_new_ports:
-
-                if not (cloud.OPEN_PORTS_VERSION <=
+                launched_resources = handle.launched_resources.assert_launchable()
+                if not (launched_resources.cloud.OPEN_PORTS_VERSION <=
                         clouds.OpenPortsVersion.LAUNCH_ONLY):
                     with rich_utils.safe_status(
                             ux_utils.spinner_message(
                                 'Launching - Opening new ports')):
                         self._open_ports(handle)
 
+        # Capture task YAML and command
+        user_specified_task_config = None
+        if task is not None:
+            user_specified_task_config = task.to_yaml_config(
+                use_user_specified_yaml=True)
+
         with timeline.Event('backend.provision.post_process'):
             global_user_state.add_or_update_cluster(
                 handle.cluster_name,
@@ -3157,7 +3429,16 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                 set(task.resources),
                 ready=True,
                 config_hash=config_hash,
+                task_config=user_specified_task_config,
             )
+
+            # Add cluster event for successful provisioning.
+            global_user_state.add_cluster_event(
+                handle.cluster_name, status_lib.ClusterStatus.UP,
+                'Cluster successfully provisioned with ' +
+                f'{handle.launched_nodes} nodes',
+                global_user_state.ClusterEventType.STATUS_CHANGE)
+
             usage_lib.messages.usage.update_final_cluster_status(
                 status_lib.ClusterStatus.UP)
             # We still add the cluster to ssh config file on API server, this
@@ -3172,13 +3453,60 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                 handle.cached_external_ssh_ports, handle.docker_user,
                 handle.ssh_user)
 
-        common_utils.remove_file_if_exists(lock_path)
-
     def _sync_workdir(self, handle: CloudVmRayResourceHandle,
-                      workdir: Path
+                      workdir: Union[Path, Dict[str, Any]],
+                      envs_and_secrets: Dict[str, str]) -> None:
         # Even though provision() takes care of it, there may be cases where
         # this function is called in isolation, without calling provision(),
         # e.g., in CLI. So we should rerun rsync_up.
+        if isinstance(workdir, dict):
+            self._sync_git_workdir(handle, envs_and_secrets)
+        else:
+            self._sync_path_workdir(handle, workdir)
+
+    def _sync_git_workdir(self, handle: CloudVmRayResourceHandle,
+                          envs_and_secrets: Dict[str, str]) -> None:
+        style = colorama.Style
+        ip_list = handle.external_ips()
+        assert ip_list is not None, 'external_ips is not cached in handle'
+
+        log_path = os.path.join(self.log_dir, 'workdir_sync.log')
+
+        # TODO(zhwu): refactor this with backend_utils.parallel_cmd_with_rsync
+        runners = handle.get_command_runners()
+
+        def _sync_git_workdir_node(
+                runner: command_runner.CommandRunner) -> None:
+            # Type assertion to help mypy understand the type
+            assert hasattr(
+                runner, 'git_clone'
+            ), f'CommandRunner should have git_clone method, ' \
+               f'got {type(runner)}'
+            runner.git_clone(
+                target_dir=SKY_REMOTE_WORKDIR,
+                log_path=log_path,
+                stream_logs=False,
+                max_retry=3,
+                envs_and_secrets=envs_and_secrets,
+            )
+
+        num_nodes = handle.launched_nodes
+        plural = 's' if num_nodes > 1 else ''
+        logger.info(
+            f'  {style.DIM}Syncing workdir (to {num_nodes} node{plural}): '
+            f'{SKY_REMOTE_WORKDIR}{style.RESET_ALL}')
+        os.makedirs(os.path.expanduser(self.log_dir), exist_ok=True)
+        os.system(f'touch {log_path}')
+        num_threads = subprocess_utils.get_parallel_threads(
+            str(handle.launched_resources.cloud))
+        with rich_utils.safe_status(
+                ux_utils.spinner_message('Syncing workdir', log_path)):
+            subprocess_utils.run_in_parallel(_sync_git_workdir_node, runners,
+                                             num_threads)
+        logger.info(ux_utils.finishing_message('Synced workdir.', log_path))
+
+    def _sync_path_workdir(self, handle: CloudVmRayResourceHandle,
+                           workdir: Path) -> None:
        fore = colorama.Fore
         style = colorama.Style
         ip_list = handle.external_ips()
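The new `_sync_git_workdir` fans a per-node operation out across all command runners with a bounded thread pool. A compact sketch of that fan-out using only the standard library (`sync_node` is a stand-in for the per-runner `git_clone` call):

```python
from concurrent.futures import ThreadPoolExecutor

def sync_node(node_addr: str) -> str:
    # A real runner would git-clone/fetch into the workdir over SSH.
    return f'synced {node_addr}'

def sync_workdir(node_addrs: list, num_threads: int = 4) -> list:
    # Bounded parallelism: one task per node, at most num_threads at once.
    with ThreadPoolExecutor(max_workers=num_threads) as pool:
        return list(pool.map(sync_node, node_addrs))

print(sync_workdir(['10.0.0.1', '10.0.0.2', '10.0.0.3']))
```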
@@ -3247,14 +3575,25 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         TODO: Delete COPY storage_mounts in task.sync_storage_mounts(), and
         assert here that all storage_mounts are MOUNT mode.
         """
+        launched_resources = handle.launched_resources.assert_launchable()
         with rich_utils.safe_status(ux_utils.spinner_message('Syncing files')):
             controller_utils.replace_skypilot_config_path_in_file_mounts(
-
+                launched_resources.cloud, all_file_mounts)
             self._execute_file_mounts(handle, all_file_mounts)
             self._execute_storage_mounts(handle, storage_mounts)
             self._set_storage_mounts_metadata(handle.cluster_name,
                                               storage_mounts)
 
+    def _get_num_gpus(self, task: task_lib.Task) -> int:
+        if task.resources is not None:
+            for resource in task.resources:
+                if (resource.accelerators is not None and
+                        isinstance(resource.accelerators, dict)):
+                    if len(resource.accelerators) > 0:
+                        return math.ceil(
+                            list(resource.accelerators.values())[0])
+        return 0
+
     def _setup(self, handle: CloudVmRayResourceHandle, task: task_lib.Task,
                detach_setup: bool) -> None:
         start = time.time()
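The added `_get_num_gpus` helper reads the first accelerator count from a task's resources and rounds fractional requests up to whole devices. A standalone sketch of the same lookup on a plain dict, assuming `{'A100': 0.5}`-style accelerator mappings:

```python
import math
from typing import Dict, Optional

def get_num_gpus(accelerators: Optional[Dict[str, float]]) -> int:
    if accelerators:
        # Fractional GPU requests (e.g. {'A100': 0.5}) round up to a
        # whole device, matching the math.ceil call above.
        return math.ceil(list(accelerators.values())[0])
    return 0

assert get_num_gpus({'A100': 0.5}) == 1
assert get_num_gpus({'H100': 8}) == 8
assert get_num_gpus(None) == 0
```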
@@ -3267,13 +3606,20 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         remote_setup_file_name = f'/tmp/sky_setup_{self.run_timestamp}'
         # Need this `-i` option to make sure `source ~/.bashrc` work
         setup_cmd = f'/bin/bash -i {remote_setup_file_name} 2>&1'
+        unset_ray_env_vars = ' && '.join(
+            [f'unset {var}' for var in task_codegen.UNSET_RAY_ENV_VARS])
+        setup_cmd = f'{unset_ray_env_vars}; {setup_cmd}'
         runners = handle.get_command_runners(avoid_ssh_control=True)
 
         def _setup_node(node_id: int) -> None:
-            setup_envs =
+            setup_envs = task_lib.get_plaintext_envs_and_secrets(
+                task.envs_and_secrets)
             setup_envs.update(self._skypilot_predefined_env_vars(handle))
             setup_envs['SKYPILOT_SETUP_NODE_IPS'] = '\n'.join(internal_ips)
             setup_envs['SKYPILOT_SETUP_NODE_RANK'] = str(node_id)
+            setup_envs[constants.SKYPILOT_SETUP_NUM_GPUS_PER_NODE] = (str(
+                self._get_num_gpus(task)))
+
             runner = runners[node_id]
             setup_script = log_lib.make_task_bash_script(setup,
                                                          env_vars=setup_envs)
@@ -3329,33 +3675,16 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             return returncode
 
         returncode = _run_setup(f'{create_script_code} && {setup_cmd}',)
-
-
-
-
-
-
-
-
-
-                # Instead, we should retry the setup with dumping the script
-                # to a file to be safe.
-                logger.debug('Failed to read setup log file '
-                             f'{setup_log_path}: {e}')
-                is_message_too_long = True
-
-        if is_message_too_long:
-            # If the setup script is too long, we retry it with dumping
-            # the script to a file and running it with SSH. We use a
-            # general length limit check before but it could be
-            # inaccurate on some systems.
-            logger.debug(
-                'Failed to run setup command inline due to '
-                'command length limit. Dumping setup script to '
-                'file and running it with SSH.')
-            _dump_final_script(setup_script)
-            returncode = _run_setup(setup_cmd)
+
+        if _is_message_too_long(returncode, file_path=setup_log_path):
+            # If the setup script is too long, we need to retry it by
+            # dumping the script to a file and running that script on the
+            # remote cluster instead.
+            logger.debug('Failed to run setup command inline due to '
+                         'command length limit. Dumping setup script to '
+                         'file and running it with SSH.')
+            _dump_final_script(setup_script)
+            returncode = _run_setup(setup_cmd)
 
         def error_message() -> str:
             # Use the function to avoid tailing the file in success case
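The setup path above retries with a script file whenever the inline command exceeds the transport's length limit. A self-contained sketch of that fallback; the 120 KB threshold and helper names are assumptions for illustration, not SkyPilot's actual values:

```python
MAX_INLINE_LEN = 120 * 1024  # assumed threshold, not SkyPilot's actual limit

def run_over_ssh(cmd: str) -> int:
    # Stand-in: pretend the transport rejects overly long inline commands.
    return 255 if len(cmd) > MAX_INLINE_LEN else 0

def upload_script(script: str) -> str:
    # A real implementation rsyncs a temp file to the remote node.
    return '/tmp/sky_setup_script.sh'

def run_setup(script: str) -> int:
    returncode = run_over_ssh(f'bash -c {script!r}')
    if returncode == 255:
        # Too long to pass inline: dump to a file and execute the file.
        path = upload_script(script)
        returncode = run_over_ssh(f'bash {path}')
    return returncode

print(run_setup('echo ok'))           # short: runs inline
print(run_setup('x' * (130 * 1024)))  # long: falls back to the file path
```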
@@ -3414,102 +3743,180 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         logger.info(
             ux_utils.finishing_message('Setup completed.', setup_log_path))
 
+    def _download_file(self, handle: CloudVmRayResourceHandle,
+                       local_file_path: str, remote_file_path: str) -> None:
+        """Syncs file from remote to local."""
+        runners = handle.get_command_runners()
+        head_runner = runners[0]
+        head_runner.rsync(
+            source=local_file_path,
+            target=remote_file_path,
+            up=False,
+            stream_logs=False,
+        )
+
     def _exec_code_on_head(
         self,
         handle: CloudVmRayResourceHandle,
         codegen: str,
         job_id: int,
-        detach_run: bool = False,
         managed_job_dag: Optional['dag.Dag'] = None,
+        managed_job_user_id: Optional[str] = None,
+        remote_log_dir: Optional[str] = None,
     ) -> None:
         """Executes generated code on the head node."""
-
-
+        use_legacy = not handle.is_grpc_enabled_with_flag
+        file_name = f'sky_job_{job_id}'
+        script_path = os.path.join(SKY_REMOTE_APP_DIR, file_name)
+        if remote_log_dir is None:
+            remote_log_dir = self.log_dir
         remote_log_path = os.path.join(remote_log_dir, 'run.log')
 
-
+        def _dump_code_to_file(codegen: str,
+                               target_dir: str = SKY_REMOTE_APP_DIR) -> None:
+            runners = handle.get_command_runners()
+            head_runner = runners[0]
+            with tempfile.NamedTemporaryFile('w', prefix='sky_app_') as fp:
+                fp.write(codegen)
+                fp.flush()
+                script_path = os.path.join(target_dir, file_name)
+                # We choose to sync code + exec, because the alternative of
+                # 'ray submit' may not work as it may use system python
+                # (python2) to execute the script. Happens for AWS.
+                head_runner.rsync(source=fp.name,
+                                  target=script_path,
+                                  up=True,
+                                  stream_logs=False)
 
+        cd = f'cd {SKY_REMOTE_WORKDIR}'
         mkdir_code = (f'{cd} && mkdir -p {remote_log_dir} && '
                       f'touch {remote_log_path}')
         encoded_script = shlex.quote(codegen)
         create_script_code = f'{{ echo {encoded_script} > {script_path}; }}'
         job_submit_cmd = (
-            # JOB_CMD_IDENTIFIER is used for identifying the process
-            # with pid is the same driver process.
+            # JOB_CMD_IDENTIFIER is used for identifying the process
+            # retrieved with pid is the same driver process.
             f'{job_lib.JOB_CMD_IDENTIFIER.format(job_id)} && '
             f'{cd} && {constants.SKY_PYTHON_CMD} -u {script_path}'
             # Do not use &>, which is not POSIX and may not work.
             # Note that the order of ">filename 2>&1" matters.
             f'> {remote_log_path} 2>&1')
-
         code = job_lib.JobLibCodeGen.queue_job(job_id, job_submit_cmd)
         job_submit_cmd = ' && '.join([mkdir_code, create_script_code, code])
 
-        def _dump_code_to_file(codegen: str,
-                               target_dir: str = SKY_REMOTE_APP_DIR) -> None:
-            runners = handle.get_command_runners()
-            head_runner = runners[0]
-            with tempfile.NamedTemporaryFile('w', prefix='sky_app_') as fp:
-                fp.write(codegen)
-                fp.flush()
-                script_path = os.path.join(target_dir, f'sky_job_{job_id}')
-                # We choose to sync code + exec, because the alternative of 'ray
-                # submit' may not work as it may use system python (python2) to
-                # execute the script. Happens for AWS.
-                head_runner.rsync(source=fp.name,
-                                  target=script_path,
-                                  up=True,
-                                  stream_logs=False)
-
         # Should also be earlier than _is_command_length_over_limit
         # Same reason as in _setup
         if self._dump_final_script:
             _dump_code_to_file(job_submit_cmd,
                                constants.PERSISTENT_RUN_SCRIPT_DIR)
 
-        if
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        if not use_legacy:
+            try:
+                managed_job_info: Optional[jobsv1_pb2.ManagedJobInfo] = None
+                if managed_job_dag is not None:
+                    workspace = skypilot_config.get_active_workspace(
+                        force_user_workspace=True)
+                    entrypoint = common_utils.get_current_command()
+
+                    managed_job_tasks: List[jobsv1_pb2.ManagedJobTask] = []
+                    for task_id, task in enumerate(managed_job_dag.tasks):
+                        resources_str = backend_utils.get_task_resources_str(
+                            task, is_managed_job=True)
+                        managed_job_tasks.append(
+                            jobsv1_pb2.ManagedJobTask(
+                                task_id=task_id,
+                                name=task.name,
+                                resources_str=resources_str,
+                                metadata_json=task.metadata_json))
+
+                    managed_job_info = jobsv1_pb2.ManagedJobInfo(
+                        name=managed_job_dag.name,
+                        pool=managed_job_dag.pool,
+                        workspace=workspace,
+                        entrypoint=entrypoint,
+                        tasks=managed_job_tasks,
+                        user_id=managed_job_user_id)
+
+                if _is_command_length_over_limit(codegen):
+                    _dump_code_to_file(codegen)
+                    queue_job_request = jobsv1_pb2.QueueJobRequest(
+                        job_id=job_id,
+                        # codegen not set - server assumes script uploaded
+                        remote_log_dir=remote_log_dir,
+                        managed_job=managed_job_info,
+                        script_path=script_path)
+                else:
+                    queue_job_request = jobsv1_pb2.QueueJobRequest(
+                        job_id=job_id,
+                        codegen=codegen,
+                        remote_log_dir=remote_log_dir,
+                        managed_job=managed_job_info,
+                        script_path=script_path)
+
+                backend_utils.invoke_skylet_with_retries(lambda: SkyletClient(
+                    handle.get_grpc_channel()).queue_job(queue_job_request))
+            except exceptions.SkyletMethodNotImplementedError:
+                use_legacy = True
+
+        if use_legacy:
+            if _is_command_length_over_limit(job_submit_cmd):
+                _dump_code_to_file(codegen)
+                job_submit_cmd = f'{mkdir_code} && {code}'
+
+            def _maybe_add_managed_job_code(job_submit_cmd: str) -> str:
+                if managed_job_dag is not None:
+                    # Add the managed job to job queue database.
+                    managed_job_codegen = managed_jobs.ManagedJobCodeGen()
+                    managed_job_code = managed_job_codegen.set_pending(
+                        job_id,
+                        managed_job_dag,
+                        skypilot_config.get_active_workspace(
+                            force_user_workspace=True),
+                        entrypoint=common_utils.get_current_command(),
+                        user_hash=managed_job_user_id)
+                    # Set the managed job to PENDING state to make sure that
+                    # this managed job appears in the `sky jobs queue`, even
+                    # if it needs to wait to be submitted.
+                    # We cannot set the managed job to PENDING state in the
+                    # job template (jobs-controller.yaml.j2), as it may need
+                    # to wait for the run commands to be scheduled on the job
+                    # controller in high-load cases.
+                    job_submit_cmd += ' && ' + managed_job_code
+                return job_submit_cmd
+
+            job_submit_cmd = _maybe_add_managed_job_code(job_submit_cmd)
 
-            returncode, stdout, stderr = self.run_on_head(handle,
-                                                          job_submit_cmd,
-                                                          stream_logs=False,
-                                                          require_outputs=True)
-            # Happens when someone calls `sky exec` but remote is outdated for
-            # running a job. Necessitating calling `sky launch`.
-            backend_utils.check_stale_runtime_on_remote(returncode, stderr,
-                                                        handle.cluster_name)
-            if returncode == 255 and 'too long' in stdout + stderr:
-                # If the generated script is too long, we retry it with dumping
-                # the script to a file and running it with SSH. We use a general
-                # length limit check before but it could be inaccurate on some
-                # systems.
-                logger.debug('Failed to submit job due to command length limit. '
-                             'Dumping job to file and running it with SSH.')
-                _dump_code_to_file(codegen)
-                job_submit_cmd = f'{mkdir_code} && {code}'
             returncode, stdout, stderr = self.run_on_head(handle,
                                                           job_submit_cmd,
                                                           stream_logs=False,
                                                           require_outputs=True)
+            # Happens when someone calls `sky exec` but remote is outdated for
+            # running a job. Necessitating calling `sky launch`.
+            backend_utils.check_stale_runtime_on_remote(returncode, stderr,
+                                                        handle.cluster_name)
+            output = stdout + stderr
+            if _is_message_too_long(returncode, output=output):
+                # If the job submit script is too long, we need to retry it
+                # by dumping the script to a file and running that script on
+                # the remote cluster instead.
+                logger.debug(
+                    'Failed to submit job due to command length limit. '
+                    'Dumping job to file and running it with SSH. '
+                    f'Output: {output}')
+                _dump_code_to_file(codegen)
+                job_submit_cmd = f'{mkdir_code} && {code}'
+                job_submit_cmd = _maybe_add_managed_job_code(job_submit_cmd)
+                returncode, stdout, stderr = self.run_on_head(
+                    handle,
+                    job_submit_cmd,
+                    stream_logs=False,
+                    require_outputs=True)
 
-
-
-
-
+        subprocess_utils.handle_returncode(
+            returncode,
+            job_submit_cmd,
+            f'Failed to submit job {job_id}.',
+            stderr=stdout + stderr)
 
         controller = controller_utils.Controllers.from_name(handle.cluster_name)
         if controller == controller_utils.Controllers.SKY_SERVE_CONTROLLER:
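Both submission paths above lean on `_dump_code_to_file`: write the generated driver code to a local temp file and rsync it to the head node, then execute the file rather than an inline command. A minimal sketch of that step; `REMOTE_APP_DIR` and `rsync_up` are illustrative stand-ins:

```python
import os
import tempfile

REMOTE_APP_DIR = '~/.sky/sky_app'  # assumed location, for illustration only

def rsync_up(source: str, target: str) -> None:
    # Stand-in for the command runner's rsync to the head node.
    print(f'rsync {source} -> {target}')

def dump_code_to_file(codegen: str, job_id: int) -> str:
    with tempfile.NamedTemporaryFile('w', prefix='sky_app_') as fp:
        fp.write(codegen)
        fp.flush()  # make sure bytes hit disk before rsync reads the file
        script_path = os.path.join(REMOTE_APP_DIR, f'sky_job_{job_id}')
        rsync_up(fp.name, script_path)
    return script_path

print(dump_code_to_file("print('hello')", job_id=42))
```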
@@ -3518,53 +3925,74 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         logger.info(
             ux_utils.starting_message(f'Job submitted, ID: {job_id}'))
         rich_utils.stop_safe_status()
-        if not detach_run:
-            if (handle.cluster_name == controller_utils.Controllers.
-                    JOBS_CONTROLLER.value.cluster_name):
-                self.tail_managed_job_logs(handle, job_id)
-            else:
-                # Sky logs. Not using subprocess.run since it will make the
-                # ssh keep connected after ctrl-c.
-                self.tail_logs(handle, job_id)
 
     def _add_job(self, handle: CloudVmRayResourceHandle,
-                 job_name: Optional[str], resources_str: str
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+                 job_name: Optional[str], resources_str: str,
+                 metadata: str) -> Tuple[int, str]:
+        use_legacy = not handle.is_grpc_enabled_with_flag
+
+        if not use_legacy:
+            try:
+                request = jobsv1_pb2.AddJobRequest(
+                    job_name=job_name,
+                    username=common_utils.get_user_hash(),
+                    run_timestamp=self.run_timestamp,
+                    resources_str=resources_str,
+                    metadata=metadata)
+                response = backend_utils.invoke_skylet_with_retries(
+                    lambda: SkyletClient(handle.get_grpc_channel()).add_job(
+                        request))
+                job_id = response.job_id
+                log_dir = response.log_dir
+                return job_id, log_dir
+            except exceptions.SkyletMethodNotImplementedError:
+                use_legacy = True
+
+        if use_legacy:
+            code = job_lib.JobLibCodeGen.add_job(
+                job_name=job_name,
+                username=common_utils.get_user_hash(),
+                run_timestamp=self.run_timestamp,
+                resources_str=resources_str,
+                metadata=metadata)
+            returncode, result_str, stderr = self.run_on_head(
+                handle,
+                code,
+                stream_logs=False,
+                require_outputs=True,
+                separate_stderr=True)
+            # Happens when someone calls `sky exec` but remote is outdated for
+            # adding a job. Necessitating calling `sky launch`.
+            backend_utils.check_stale_runtime_on_remote(returncode, stderr,
+                                                        handle.cluster_name)
+            # TODO(zhwu): this sometimes will unexpectedly fail, we can add
+            # retry for this, after we figure out the reason.
+            subprocess_utils.handle_returncode(returncode, code,
+                                               'Failed to fetch job id.',
+                                               stderr)
+            try:
+                job_id_match = _JOB_ID_PATTERN.search(result_str)
+                if job_id_match is not None:
+                    job_id = int(job_id_match.group(1))
+                else:
+                    # For backward compatibility.
+                    job_id = int(result_str)
+                log_dir_match = _LOG_DIR_PATTERN.search(result_str)
+                if log_dir_match is not None:
+                    log_dir = log_dir_match.group(1).strip()
+                else:
+                    # For backward compatibility, use the same log dir as local.
+                    log_dir = self.log_dir
+            except ValueError as e:
+                logger.error(stderr)
+                raise ValueError(f'Failed to parse job id: {result_str}; '
+                                 f'Returncode: {returncode}') from e
+            return job_id, log_dir
 
     def _execute(
         self,
         handle: CloudVmRayResourceHandle,
         task: task_lib.Task,
-        detach_run: bool,
         dryrun: bool = False,
     ) -> Optional[int]:
         """Executes the task on the cluster.
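The legacy branch of `_add_job` parses the job id and log dir out of the remote command's stdout, with fallbacks for older runtimes that printed only a bare id. A sketch of that parsing; the regex patterns below are assumptions modeled on `_JOB_ID_PATTERN`/`_LOG_DIR_PATTERN`, whose real definitions live elsewhere in this file:

```python
import re
from typing import Tuple

# Assumed patterns for illustration; not the module's actual regexes.
JOB_ID_PATTERN = re.compile(r'Job ID: (\d+)')
LOG_DIR_PATTERN = re.compile(r'Log Dir: (.+)')

def parse_add_job_output(result_str: str,
                         default_log_dir: str) -> Tuple[int, str]:
    match = JOB_ID_PATTERN.search(result_str)
    if match is not None:
        job_id = int(match.group(1))
    else:
        # Old runtimes printed only the bare job id.
        job_id = int(result_str)
    dir_match = LOG_DIR_PATTERN.search(result_str)
    log_dir = dir_match.group(1).strip() if dir_match else default_log_dir
    return job_id, log_dir

print(parse_add_job_output('Job ID: 7\nLog Dir: ~/sky_logs/7-train',
                           '~/sky_logs'))
print(parse_add_job_output('7', '~/sky_logs'))  # backward-compatible path
```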
@@ -3588,7 +4016,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         # In this case, we reset the resources for the task, so that the
         # detached setup does not need to wait for the task resources to be
         # ready (which is not used for setup anyway).
-            valid_resource =
+            valid_resource = resources_lib.Resources()
         else:
             # Check the task resources vs the cluster resources. Since
             # `sky exec` will not run the provision and _check_existing_cluster
@@ -3610,15 +4038,16 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             logger.info(f'Dryrun complete. Would have run:\n{task}')
             return None
 
-        job_id = self._add_job(handle, task_copy.name, resources_str
+        job_id, log_dir = self._add_job(handle, task_copy.name, resources_str,
+                                        task.metadata_json)
 
         num_actual_nodes = task.num_nodes * handle.num_ips_per_node
         # Case: task_lib.Task(run, num_nodes=N) or TPU VM Pods
         if num_actual_nodes > 1:
-            self._execute_task_n_nodes(handle, task_copy, job_id,
+            self._execute_task_n_nodes(handle, task_copy, job_id, log_dir)
         else:
             # Case: task_lib.Task(run, num_nodes=1)
-            self._execute_task_one_node(handle, task_copy, job_id,
+            self._execute_task_one_node(handle, task_copy, job_id, log_dir)
 
         return job_id
 
@@ -3661,7 +4090,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         is_identity_mismatch_and_purge = False
         try:
             backend_utils.check_owner_identity(cluster_name)
-        except exceptions.ClusterOwnerIdentityMismatchError
+        except (exceptions.ClusterOwnerIdentityMismatchError,
+                exceptions.CloudUserIdentityError) as e:
             if purge:
                 logger.error(e)
                 verbed = 'terminated' if terminate else 'stopped'
@@ -3674,16 +4104,13 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                 is_identity_mismatch_and_purge = True
             else:
                 raise
-
-
+        lock_id = backend_utils.cluster_status_lock_id(cluster_name)
+        lock = locks.get_lock(lock_id, timeout=1)
         # Retry in case new cluster operation comes in and holds the lock
         # right after the lock is removed.
         n_attempts = 2
         while True:
             n_attempts -= 1
-            # In case other running cluster operations are still holding the
-            # lock.
-            common_utils.remove_file_if_exists(lock_path)
             # We have to kill the cluster requests, because `down` and `stop`
             # should be higher priority than the cluster requests, and we should
             # release the lock from other requests.
@@ -3701,10 +4128,11 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                     'Failed to kill other launch requests for the '
                     f'cluster {handle.cluster_name}: '
                     f'{common_utils.format_exception(e, use_bracket=True)}')
+            # In case other running cluster operations are still holding the
+            # lock.
+            lock.force_unlock()
             try:
-                with
-                        lock_path,
-                        backend_utils.CLUSTER_STATUS_LOCK_TIMEOUT_SECONDS):
+                with lock:
                     self.teardown_no_lock(
                         handle,
                         terminate,
@@ -3717,14 +4145,14 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                         refresh_cluster_status=(
                             not is_identity_mismatch_and_purge))
                 if terminate:
-
+                    lock.force_unlock()
                 break
-            except
+            except locks.LockTimeout as e:
                 logger.debug(f'Failed to acquire lock for {cluster_name}, '
                              f'retrying...')
                 if n_attempts <= 0:
                     raise RuntimeError(
-                        f'Cluster {cluster_name!r} is locked by {
+                        f'Cluster {cluster_name!r} is locked by {lock_id}. '
                         'Check to see if it is still being launched') from e
 
     # --- CloudVMRayBackend Specific APIs ---
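The teardown change above swaps file-lock removal for `force_unlock` on a named lock, with a bounded retry before giving up. A runnable sketch of that loop under stated assumptions (the lock class is a stand-in, not SkyPilot's `locks` API):

```python
class LockTimeout(Exception):
    pass

class FakeLock:
    """Stand-in lock exposing the two operations the loop needs."""

    def force_unlock(self) -> None:
        pass  # a real lock would evict whichever request holds it

    def __enter__(self) -> 'FakeLock':
        return self

    def __exit__(self, *exc) -> bool:
        return False

def teardown_with_retry(lock, teardown) -> None:
    n_attempts = 2
    while True:
        n_attempts -= 1
        # Teardown outranks pending launches: evict any holder first.
        lock.force_unlock()
        try:
            with lock:
                teardown()
            return
        except LockTimeout as e:
            if n_attempts <= 0:
                raise RuntimeError(
                    'cluster is locked; is it still being launched?') from e

teardown_with_retry(FakeLock(), lambda: print('tore down cluster'))
```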
@@ -3735,6 +4163,20 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         job_ids: Optional[List[int]] = None,
         stream_logs: bool = True
     ) -> Dict[Optional[int], Optional[job_lib.JobStatus]]:
+        if handle.is_grpc_enabled_with_flag:
+            try:
+                request = jobsv1_pb2.GetJobStatusRequest(job_ids=job_ids)
+                response = backend_utils.invoke_skylet_with_retries(
+                    lambda: SkyletClient(handle.get_grpc_channel()
+                                        ).get_job_status(request))
+                statuses: Dict[Optional[int], Optional[job_lib.JobStatus]] = {
+                    job_id: job_lib.JobStatus.from_protobuf(proto_status)
+                    for job_id, proto_status in response.job_statuses.items()
+                }
+                return statuses
+            except exceptions.SkyletMethodNotImplementedError:
+                pass
+
         code = job_lib.JobLibCodeGen.get_job_status(job_ids)
         returncode, stdout, stderr = self.run_on_head(handle,
                                                       code,
@@ -3755,16 +4197,32 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
 
         See `skylet.job_lib.cancel_jobs_encoded_results` for more details.
         """
-
-
-
-
-
-
-
-
-
-
+        use_legacy = not handle.is_grpc_enabled_with_flag
+
+        if not use_legacy:
+            try:
+                request = jobsv1_pb2.CancelJobsRequest(job_ids=jobs,
+                                                       cancel_all=cancel_all,
+                                                       user_hash=user_hash)
+                response = backend_utils.invoke_skylet_with_retries(
+                    lambda: SkyletClient(handle.get_grpc_channel()).cancel_jobs(
+                        request))
+                cancelled_ids = response.cancelled_job_ids
+            except exceptions.SkyletMethodNotImplementedError:
+                use_legacy = True
+
+        if use_legacy:
+            code = job_lib.JobLibCodeGen.cancel_jobs(jobs, cancel_all,
+                                                     user_hash)
+            returncode, stdout, _ = self.run_on_head(handle,
+                                                     code,
+                                                     stream_logs=False,
+                                                     require_outputs=True)
+            subprocess_utils.handle_returncode(
+                returncode, code,
+                f'Failed to cancel jobs on cluster {handle.cluster_name}.',
+                stdout)
+            cancelled_ids = message_utils.decode_payload(stdout)
         if cancelled_ids:
             logger.info(
                 f'Cancelled job ID(s): {", ".join(map(str, cancelled_ids))}')
@@ -3781,32 +4239,74 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         Returns:
             A dictionary mapping job_id to log path.
         """
-
-
-            handle,
-            code,
-            stream_logs=False,
-            require_outputs=True,
-            separate_stderr=True)
-        subprocess_utils.handle_returncode(returncode, code,
-                                           'Failed to sync logs.', stderr)
-        run_timestamps = message_utils.decode_payload(run_timestamps)
-        if not run_timestamps:
-            logger.info(f'{colorama.Fore.YELLOW}'
-                        'No matching log directories found'
-                        f'{colorama.Style.RESET_ALL}')
-            return {}
+        job_to_dir: Dict[str, str] = {}
+        use_legacy = not handle.is_grpc_enabled_with_flag
 
-
-
+        if not use_legacy:
+            try:
+                int_job_ids = []
+                if job_ids:
+                    for str_job_id in job_ids:
+                        if str_job_id.isdigit():
+                            int_job_ids.append(int(str_job_id))
+                request = jobsv1_pb2.GetLogDirsForJobsRequest(
+                    job_ids=int_job_ids)
+                response = backend_utils.invoke_skylet_with_retries(
+                    lambda: SkyletClient(handle.get_grpc_channel()
+                                        ).get_log_dirs_for_jobs(request))
+                job_log_dirs = response.job_log_dirs
+                if not job_log_dirs:
+                    logger.info(f'{colorama.Fore.YELLOW}'
+                                'No matching log directories found'
+                                f'{colorama.Style.RESET_ALL}')
+                    return {}
+                for job_id, log_dir in job_log_dirs.items():
+                    # Convert to string for backwards compatibility
+                    job_to_dir[str(job_id)] = log_dir
+            except exceptions.SkyletMethodNotImplementedError:
+                use_legacy = True
+
+        if use_legacy:
+            code = job_lib.JobLibCodeGen.get_log_dirs_for_jobs(job_ids)
+            returncode, stdout, stderr = self.run_on_head(handle,
+                                                          code,
+                                                          stream_logs=False,
+                                                          require_outputs=True,
+                                                          separate_stderr=True)
+            subprocess_utils.handle_returncode(returncode, code,
+                                               'Failed to sync logs.', stderr)
+            job_to_dir = message_utils.decode_payload(stdout)
+            if not job_to_dir:
+                logger.info(f'{colorama.Fore.YELLOW}'
+                            'No matching log directories found'
+                            f'{colorama.Style.RESET_ALL}')
+                return {}
+
+        job_ids = list(job_to_dir.keys())
+        dirs = list(job_to_dir.values())
         remote_log_dirs = [
-
-
-
-
-            os.path.join(local_dir, run_timestamp)
-            for run_timestamp in run_timestamps
+            # TODO(aylei): backward compatibility for legacy runtime that
+            # returns run_timestamp only, remove after 0.12.0
+            (dir if constants.SKY_LOGS_DIRECTORY in dir else os.path.join(
+                constants.SKY_LOGS_DIRECTORY, dir)) for dir in dirs
         ]
+        # Include cluster name in local log directory path to avoid conflicts
+        # when the same job_id exists on different clusters
+        cluster_name = handle.cluster_name
+        local_log_dirs = []
+        for remote_log_dir in dirs:
+            if constants.SKY_LOGS_DIRECTORY in remote_log_dir:
+                # Extract the job-specific directory name from the full path
+                # e.g., ~/sky_logs/1-job_name -> 1-job_name
+                job_dir = remote_log_dir.replace(constants.SKY_LOGS_DIRECTORY,
+                                                 '').lstrip('/')
+                local_log_dir = os.path.join(local_dir, cluster_name, job_dir)
+            else:
+                # remote_log_dir is already just the job directory name (e.g.,
+                # "1-job_name")
+                local_log_dir = os.path.join(local_dir, cluster_name,
+                                             remote_log_dir)
+            local_log_dirs.append(local_log_dir)
 
         runners = handle.get_command_runners()
 
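The log-sync change above namespaces downloaded logs by cluster so that the same job id on two clusters never collides, while still accepting both path shapes returned by old and new runtimes. A small sketch of that mapping; the `SKY_LOGS_DIRECTORY` value here is an assumed placeholder for the real constant:

```python
import os

SKY_LOGS_DIRECTORY = '~/sky_logs'  # assumed value of the constant

def local_log_dir(local_dir: str, cluster: str, remote_log_dir: str) -> str:
    if SKY_LOGS_DIRECTORY in remote_log_dir:
        # Full remote path: keep only the job-specific suffix.
        job_dir = remote_log_dir.replace(SKY_LOGS_DIRECTORY, '').lstrip('/')
    else:
        # Legacy runtimes return just the directory name, e.g. '1-job_name'.
        job_dir = remote_log_dir
    return os.path.join(local_dir, cluster, job_dir)

print(local_log_dir('logs', 'cluster-a', '~/sky_logs/1-train'))
print(local_log_dir('logs', 'cluster-b', '1-train'))
```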
@@ -3842,12 +4342,17 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         subprocess_utils.run_in_parallel(_rsync_down, parallel_args)
         return dict(zip(job_ids, local_log_dirs))
 
-
-
-
-
-
-
+    @context_utils.cancellation_guard
+    def tail_logs(
+            self,
+            handle: CloudVmRayResourceHandle,
+            job_id: Optional[int],
+            managed_job_id: Optional[int] = None,
+            follow: bool = True,
+            tail: int = 0,
+            require_outputs: bool = False,
+            stream_logs: bool = True,
+            process_stream: bool = False) -> Union[int, Tuple[int, str, str]]:
         """Tail the logs of a job.
 
         Args:
@@ -3857,11 +4362,36 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             follow: Whether to follow the logs.
             tail: The number of lines to display from the end of the
                 log file. If 0, print all lines.
+            require_outputs: Whether to return the stdout/stderr of the command.
+            stream_logs: Whether to stream the logs to stdout/stderr.
+            process_stream: Whether to process the stream.
 
         Returns:
             The exit code of the tail command. Returns code 100 if the job has
             failed. See exceptions.JobExitCode for possible return codes.
         """
+        if handle.is_grpc_enabled_with_flag:
+            last_exit_code = 0
+            try:
+                request = jobsv1_pb2.TailLogsRequest(
+                    job_id=job_id,
+                    managed_job_id=managed_job_id,
+                    follow=follow,
+                    tail=tail)
+                for resp in backend_utils.invoke_skylet_streaming_with_retries(
+                        lambda: SkyletClient(handle.get_grpc_channel()
+                                            ).tail_logs(request, timeout=None)):
+                    if resp.log_line:
+                        print(resp.log_line, end='', flush=True)
+                    last_exit_code = resp.exit_code
+                return last_exit_code
+            except exceptions.SkyletMethodNotImplementedError:
+                pass
+            except grpc.RpcError as e:
+                if e.code() == grpc.StatusCode.CANCELLED:
+                    return last_exit_code
+                raise e
+
         code = job_lib.JobLibCodeGen.tail_logs(job_id,
                                                managed_job_id=managed_job_id,
                                                follow=follow,
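The gRPC branch of `tail_logs` consumes a server-streaming RPC: every response carries an optional log line plus the latest exit code, and client-side cancellation is treated as a clean exit. A dependency-free sketch of that consumption loop, with a dataclass standing in for the protobuf response type:

```python
from dataclasses import dataclass
from typing import Iterable

@dataclass
class LogChunk:
    """Shape of one streamed response: a log line plus the latest exit code."""
    log_line: str
    exit_code: int

def consume_tail_logs(stream: Iterable[LogChunk]) -> int:
    last_exit_code = 0
    for resp in stream:
        if resp.log_line:
            print(resp.log_line, end='', flush=True)
        last_exit_code = resp.exit_code  # final chunk carries the job's code
    return last_exit_code

code = consume_tail_logs([LogChunk('hello\n', 0), LogChunk('', 100)])
print(f'exit code: {code}')  # 100 marks a failed job per JobExitCode
```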
@@ -3876,29 +4406,32 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         signal.signal(signal.SIGINT, backend_utils.interrupt_handler)
         signal.signal(signal.SIGTSTP, backend_utils.stop_handler)
         try:
-
+            final = self.run_on_head(
                 handle,
                 code,
-                stream_logs=
-                process_stream=
+                stream_logs=stream_logs,
+                process_stream=process_stream,
+                require_outputs=require_outputs,
                 # Allocate a pseudo-terminal to disable output buffering.
                 # Otherwise, there may be 5 minutes delay in logging.
                 ssh_mode=command_runner.SshMode.INTERACTIVE,
             )
         except SystemExit as e:
-
-            return
+            final = e.code
+        return final
 
     def tail_managed_job_logs(self,
                               handle: CloudVmRayResourceHandle,
                               job_id: Optional[int] = None,
                               job_name: Optional[str] = None,
                               controller: bool = False,
-                              follow: bool = True
+                              follow: bool = True,
+                              tail: Optional[int] = None) -> int:
         # if job_name is not None, job_id should be None
         assert job_name is None or job_id is None, (job_name, job_id)
+        # TODO(kevin): Migrate stream_logs to gRPC
         code = managed_jobs.ManagedJobCodeGen.stream_logs(
-            job_name, job_id, follow, controller)
+            job_name, job_id, follow, controller, tail)
 
         # With the stdin=subprocess.DEVNULL, the ctrl-c will not directly
         # kill the process, so we need to handle it manually here.
@@ -3942,20 +4475,37 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         assert job_name is None or job_id is None, (job_name, job_id)
 
         if job_id is None:
-            #
+            # get the job_id
             # if job_name is None, get all job_ids
             # TODO: Only get the latest job_id, since that's the only one we use
-
-
-
-
-
-
-
-
-
-
-
+
+            use_legacy = not handle.is_grpc_enabled_with_flag
+            logger.info(f'handle.is_grpc_enabled_with_flag: '
+                        f'{handle.is_grpc_enabled_with_flag}')
+            if not use_legacy:
+                try:
+                    request = managed_jobsv1_pb2.GetAllJobIdsByNameRequest(
+                        job_name=job_name)
+                    response = backend_utils.invoke_skylet_with_retries(
+                        lambda: SkyletClient(handle.get_grpc_channel(
+                        )).get_all_managed_job_ids_by_name(request))
+                    job_ids = list(response.job_ids)
+                except exceptions.SkyletMethodNotImplementedError:
+                    use_legacy = True
+
+            if use_legacy:
+                code = managed_jobs.ManagedJobCodeGen.get_all_job_ids_by_name(
+                    job_name=job_name)
+                returncode, job_ids_payload, stderr = self.run_on_head(
+                    handle,
+                    code,
+                    stream_logs=False,
+                    require_outputs=True,
+                    separate_stderr=True)
+                subprocess_utils.handle_returncode(returncode, code,
+                                                   'Failed to sync down logs.',
+                                                   stderr)
+                job_ids = message_utils.decode_payload(job_ids_payload)
             if not job_ids:
                 logger.info(f'{colorama.Fore.YELLOW}'
                             'No matching job found'
@@ -3942,20 +4475,37 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
|
|
|
3942
4475
|
assert job_name is None or job_id is None, (job_name, job_id)
|
|
3943
4476
|
|
|
3944
4477
|
if job_id is None:
|
|
3945
|
-
#
|
|
4478
|
+
# get the job_id
|
|
3946
4479
|
# if job_name is None, get all job_ids
|
|
3947
4480
|
# TODO: Only get the latest job_id, since that's the only one we use
|
|
3948
|
-
|
|
3949
|
-
|
|
3950
|
-
|
|
3951
|
-
|
|
3952
|
-
|
|
3953
|
-
|
|
3954
|
-
|
|
3955
|
-
|
|
3956
|
-
|
|
3957
|
-
|
|
3958
|
-
|
|
4481
|
+
|
|
4482
|
+
use_legacy = not handle.is_grpc_enabled_with_flag
|
|
4483
|
+
logger.info(f'handle.is_grpc_enabled_with_flag: '
|
|
4484
|
+
f'{handle.is_grpc_enabled_with_flag}')
|
|
4485
|
+
if not use_legacy:
|
|
4486
|
+
try:
|
|
4487
|
+
request = managed_jobsv1_pb2.GetAllJobIdsByNameRequest(
|
|
4488
|
+
job_name=job_name)
|
|
4489
|
+
response = backend_utils.invoke_skylet_with_retries(
|
|
4490
|
+
lambda: SkyletClient(handle.get_grpc_channel(
|
|
4491
|
+
)).get_all_managed_job_ids_by_name(request))
|
|
4492
|
+
job_ids = list(response.job_ids)
|
|
4493
|
+
except exceptions.SkyletMethodNotImplementedError:
|
|
4494
|
+
use_legacy = True
|
|
4495
|
+
|
|
4496
|
+
if use_legacy:
|
|
4497
|
+
code = managed_jobs.ManagedJobCodeGen.get_all_job_ids_by_name(
|
|
4498
|
+
job_name=job_name)
|
|
4499
|
+
returncode, job_ids_payload, stderr = self.run_on_head(
|
|
4500
|
+
handle,
|
|
4501
|
+
code,
|
|
4502
|
+
stream_logs=False,
|
|
4503
|
+
require_outputs=True,
|
|
4504
|
+
separate_stderr=True)
|
|
4505
|
+
subprocess_utils.handle_returncode(returncode, code,
|
|
4506
|
+
'Failed to sync down logs.',
|
|
4507
|
+
stderr)
|
|
4508
|
+
job_ids = message_utils.decode_payload(job_ids_payload)
|
|
3959
4509
|
if not job_ids:
|
|
3960
4510
|
logger.info(f'{colorama.Fore.YELLOW}'
|
|
3961
4511
|
'No matching job found'
|
|
@@ -3974,20 +4524,48 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
        # list should aready be in descending order
        job_id = job_ids[0]

-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        if isinstance(handle, LocalResourcesHandle):
+            # In consolidation mode, we don't submit a ray job, therefore no
+            # run_timestamp is available. We use a dummy run_timestamp here.
+            run_timestamps = {
+                job_id: f'managed-jobs-consolidation-mode-{job_id}'
+            }
+        else:
+            # get the run_timestamp
+            # the function takes in [job_id]
+            use_legacy = not handle.is_grpc_enabled_with_flag
+            if not use_legacy:
+                try:
+                    log_dirs_request = jobsv1_pb2.GetLogDirsForJobsRequest(
+                        job_ids=[job_id])
+                    log_dirs_response = (
+                        backend_utils.invoke_skylet_with_retries(
+                            lambda: SkyletClient(handle.get_grpc_channel(
+                            )).get_log_dirs_for_jobs(log_dirs_request)))
+                    job_log_dirs = log_dirs_response.job_log_dirs
+                    # Convert back to the expected format
+                    # {job_id: run_timestamp}
+                    run_timestamps = {}
+                    for jid, log_dir in job_log_dirs.items():
+                        run_timestamps[int(jid)] = log_dir
+                except exceptions.SkyletMethodNotImplementedError:
+                    use_legacy = True
+
+            if use_legacy:
+                code = job_lib.JobLibCodeGen.get_log_dirs_for_jobs(
+                    [str(job_id)])
+                returncode, run_timestamps_payload, stderr = self.run_on_head(
+                    handle,
+                    code,
+                    stream_logs=False,
+                    require_outputs=True,
+                    separate_stderr=True)
+                subprocess_utils.handle_returncode(returncode, code,
+                                                   'Failed to sync logs.',
+                                                   stderr)
+                # returns with a dict of {job_id: run_timestamp}
+                run_timestamps = message_utils.decode_payload(
+                    run_timestamps_payload)
        if not run_timestamps:
            logger.info(f'{colorama.Fore.YELLOW}'
                        'No matching log directories found'
@@ -3996,11 +4574,19 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):

        run_timestamp = list(run_timestamps.values())[0]
        job_id = list(run_timestamps.keys())[0]
+
+        # If run_timestamp contains the full path with SKY_LOGS_DIRECTORY,
+        # strip the prefix to get just the relative part to avoid duplication
+        # when constructing local paths.
+        if run_timestamp.startswith(constants.SKY_LOGS_DIRECTORY):
+            run_timestamp = run_timestamp[len(constants.SKY_LOGS_DIRECTORY
+                                             ):].lstrip('/')
        local_log_dir = ''
        if controller:  # download controller logs
            remote_log = os.path.join(managed_jobs.JOBS_CONTROLLER_LOGS_DIR,
                                      f'{job_id}.log')
-            local_log_dir = os.path.join(local_dir,
+            local_log_dir = os.path.join(local_dir, 'managed_jobs',
+                                         run_timestamp)
            os.makedirs(os.path.dirname(os.path.expanduser(local_log_dir)),
                        exist_ok=True)

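
For reference, the prefix-stripping added above behaves as in this small sketch; the directory value is a hypothetical example, not the actual `constants.SKY_LOGS_DIRECTORY`.

SKY_LOGS_DIRECTORY = '~/sky_logs'  # hypothetical value for illustration

run_timestamp = '~/sky_logs/sky-2024-01-01-00-00-00-000000'
if run_timestamp.startswith(SKY_LOGS_DIRECTORY):
    # Keep only the relative part so local paths are not duplicated.
    run_timestamp = run_timestamp[len(SKY_LOGS_DIRECTORY):].lstrip('/')
assert run_timestamp == 'sky-2024-01-01-00-00-00-000000'
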
@@ -4046,11 +4632,12 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                        exist_ok=True)
        log_file = os.path.join(local_log_dir, 'run.log')

-
-
-
-
-
+        # TODO(kevin): Migrate stream_logs to gRPC
+        code = managed_jobs.ManagedJobCodeGen.stream_logs(
+            job_name=None,
+            job_id=int(job_id),
+            follow=False,
+            controller=False)
        # With the stdin=subprocess.DEVNULL, the ctrl-c will not
        # kill the process, so we need to handle it manually here.
        if threading.current_thread() is threading.main_thread():
@@ -4091,6 +4678,15 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
        Raises:
            RuntimeError: If the cluster fails to be terminated/stopped.
        """
+        try:
+            handle.close_skylet_ssh_tunnel()
+        except Exception as e:  # pylint: disable=broad-except
+            # Not critical to the cluster teardown, just log a warning.
+            logger.warning(
+                'Failed to close Skylet SSH tunnel for cluster '
+                f'{handle.cluster_name}: '
+                f'{common_utils.format_exception(e, use_bracket=True)}')
+
        exclude_request_to_kill = 'sky.down' if terminate else 'sky.stop'
        # We have to kill the cluster requests again within the lock, because
        # any pending requests on the same cluster should be cancelled after
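
The new tunnel cleanup is deliberately best-effort: a failure to close the Skylet SSH tunnel must never abort teardown. A sketch of the idiom, with `close_tunnel` as a hypothetical stand-in for the handle method:

import logging
from typing import Callable

logger = logging.getLogger(__name__)


def close_tunnel_best_effort(close_tunnel: Callable[[], None],
                             cluster_name: str) -> None:
    try:
        close_tunnel()
    except Exception as e:  # pylint: disable=broad-except
        # Not critical to teardown; log and continue.
        logger.warning('Failed to close Skylet SSH tunnel for cluster %s: %s',
                       cluster_name, e)
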
@@ -4116,7 +4712,19 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                prev_cluster_status, _ = (
                    backend_utils.refresh_cluster_status_handle(
                        handle.cluster_name,
-
+                        # There is a case where
+                        # 1. The cluster was interrupted during provisioning.
+                        # 2. The API request to create the cluster instances was
+                        #    sent to the cloud, but hasn't been processed yet.
+                        # In this case, the cluster will be INIT. We should do a
+                        # hard status refresh to see if the instances are
+                        # actually there or not. Otherwise, teardown may not
+                        # find the instances, leading to a leak. This was
+                        # observed in AWS. See also
+                        # _LAUNCH_DOUBLE_CHECK_WINDOW in backend_utils.py.
+                        force_refresh_statuses={status_lib.ClusterStatus.INIT},
+                        cluster_lock_already_held=True,
+                        retry_if_missing=False))
                cluster_status_fetched = True
            except exceptions.ClusterStatusFetchingError:
                logger.warning(
@@ -4124,10 +4732,9 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                    f'{handle.cluster_name!r}. Assuming the cluster is still '
                    'up.')
        if not cluster_status_fetched:
-
+            status = global_user_state.get_status_from_cluster_name(
                handle.cluster_name)
-            prev_cluster_status =
-                'status'] if record is not None else None
+            prev_cluster_status = status if status is not None else None
        if prev_cluster_status is None:
            # When the cluster is not in the cluster table, we guarantee that
            # all related resources / cache / config are cleaned up, i.e. it
@@ -4148,8 +4755,9 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
        log_path = os.path.join(os.path.expanduser(self.log_dir),
                                'teardown.log')
        log_abs_path = os.path.abspath(log_path)
-
-
+        launched_resources = handle.launched_resources.assert_launchable()
+        cloud = launched_resources.cloud
+        config = global_user_state.get_cluster_yaml_dict(handle.cluster_yaml)
        cluster_name = handle.cluster_name
        cluster_name_on_cloud = handle.cluster_name_on_cloud

@@ -4209,7 +4817,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
            from sky.adaptors import ibm
            from sky.skylet.providers.ibm.vpc_provider import IBMVPCProvider

-            config_provider =
+            config_provider = global_user_state.get_cluster_yaml_dict(
                handle.cluster_yaml)['provider']
            region = config_provider['region']
            search_client = ibm.search_client()
@@ -4238,36 +4846,13 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
            # successfully removed cluster as no exception was raised
            returncode = 0

-        elif terminate and isinstance(cloud, clouds.SCP):
-            # pylint: disable=import-outside-toplevel
-            from sky.skylet.providers.scp import node_provider
-            config['provider']['cache_stopped_nodes'] = not terminate
-            provider = node_provider.SCPNodeProvider(config['provider'],
-                                                     cluster_name_on_cloud)
-            try:
-                if not os.path.exists(provider.metadata.path):
-                    raise node_provider.SCPError(
-                        'SKYPILOT_ERROR_NO_NODES_LAUNCHED: '
-                        'Metadata file does not exist.')
-
-                with open(provider.metadata.path, 'r', encoding='utf-8') as f:
-                    metadata = json.load(f)
-                    node_id = next(iter(metadata.values())).get(
-                        'creation', {}).get('virtualServerId', None)
-                    provider.terminate_node(node_id)
-                returncode = 0
-            except node_provider.SCPError as e:
-                returncode = 1
-                stdout = ''
-                stderr = str(e)
-
        else:
            config['provider']['cache_stopped_nodes'] = not terminate
            with tempfile.NamedTemporaryFile('w',
                                             prefix='sky_',
                                             delete=False,
                                             suffix='.yml') as f:
-
+                yaml_utils.dump_yaml(f.name, config)
                f.flush()

                teardown_verb = 'Terminating' if terminate else 'Stopping'
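
With the SCP special case removed, the remaining clouds funnel through the generic path above: flip `cache_stopped_nodes` and hand the provider a temporary YAML. A sketch of that write, using plain PyYAML in place of SkyPilot's `yaml_utils` helper (an assumption made for self-containedness):

import tempfile

import yaml  # PyYAML, standing in for sky's yaml_utils

config = {'provider': {'cache_stopped_nodes': False}}  # terminate=True
with tempfile.NamedTemporaryFile('w', prefix='sky_', delete=False,
                                 suffix='.yml') as f:
    yaml.safe_dump(config, f)
    f.flush()
    print(f'provider config written to {f.name}')
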
@@ -4322,12 +4907,17 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                               handle: CloudVmRayResourceHandle,
                               terminate: bool,
                               purge: bool = False,
-                              remove_from_db: bool = True
+                              remove_from_db: bool = True,
+                              failover: bool = False) -> None:
        """Cleanup local configs/caches and delete TPUs after teardown.

        This method will handle the following cleanup steps:
        * Deleting the TPUs;
        * Removing ssh configs for the cluster;
+        * Deleting the open ports;
+        * Deleting the custom multi network infrastructure based on the
+          failover flag (e.g. delete firewalls, subnets, and VPCs for GPU
+          Direct if failover is False, otherwise, only delete the subnets);
        * Updating the local state of the cluster;
        * Removing the terminated cluster's scripts and ray yaml files.
        """
@@ -4359,19 +4949,24 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
            # The cluster yaml does not exist when skypilot has not found
            # the right resource to provision the cluster.
            if handle.cluster_yaml is not None:
+                launched_resources = (
+                    handle.launched_resources.assert_launchable())
+                cloud = launched_resources.cloud
+                config = global_user_state.get_cluster_yaml_dict(
+                    handle.cluster_yaml)
+                ports_cleaned_up = False
+                custom_multi_network_cleaned_up = False
                try:
-                    cloud = handle.launched_resources.cloud
-                    config = common_utils.read_yaml(handle.cluster_yaml)
                    cloud.check_features_are_supported(
-
+                        launched_resources,
                        {clouds.CloudImplementationFeatures.OPEN_PORTS})
                    provision_lib.cleanup_ports(repr(cloud),
                                                cluster_name_on_cloud,
                                                handle.launched_resources.ports,
                                                config['provider'])
-
+                    ports_cleaned_up = True
                except exceptions.NotSupportedError:
-
+                    ports_cleaned_up = True
                except exceptions.PortDoesNotExistError:
                    logger.debug('Ports do not exist. Skipping cleanup.')
                except Exception as e:  # pylint: disable=broad-except
@@ -4383,8 +4978,43 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                    else:
                        raise

-
-
+                # Clean up custom multi networks, e.g. the subnets, firewalls,
+                # and VPCs created for GCP GPUDirect TCPX
+                try:
+                    cloud.check_features_are_supported(
+                        handle.launched_resources, {
+                            clouds.CloudImplementationFeatures.
+                            CUSTOM_MULTI_NETWORK
+                        })
+                    provision_lib.cleanup_custom_multi_network(
+                        repr(cloud), cluster_name_on_cloud, config['provider'],
+                        failover)
+                    custom_multi_network_cleaned_up = True
+                except exceptions.NotSupportedError:
+                    custom_multi_network_cleaned_up = True
+                except Exception as e:  # pylint: disable=broad-except
+                    if purge:
+                        msg = common_utils.format_exception(e, use_bracket=True)
+                        logger.warning(
+                            f'Failed to cleanup custom multi network. Skipping '
+                            f'since purge is set. Details: {msg}')
+                    else:
+                        raise
+
+                if ports_cleaned_up and custom_multi_network_cleaned_up:
+                    try:
+                        self.remove_cluster_config(handle)
+                    except Exception as e:  # pylint: disable=broad-except
+                        if purge:
+                            msg = common_utils.format_exception(
+                                e, use_bracket=True)
+                            logger.warning(
+                                f'Failed to remove cluster config. Skipping '
+                                f'since purge is set. Details: {msg}')
+                        else:
+                            raise
+
+        cluster_utils.SSHConfigHelper.remove_cluster(handle.cluster_name)

        def _detect_abnormal_non_terminated_nodes(
                handle: CloudVmRayResourceHandle) -> None:
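
The two `*_cleaned_up` flags introduced above gate the final config removal: the cluster YAML is only dropped once both port and multi-network cleanup have succeeded or are inapplicable on that cloud. A condensed sketch of the gating (all callables hypothetical):

from typing import Callable


class NotSupported(Exception):
    """Stand-in for exceptions.NotSupportedError."""


def cleanup(cleanup_ports: Callable[[], None],
            cleanup_network: Callable[[], None],
            remove_config: Callable[[], None]) -> None:
    ports_done = network_done = False
    try:
        cleanup_ports()
        ports_done = True
    except NotSupported:
        ports_done = True  # Nothing to clean on this cloud.
    try:
        cleanup_network()
        network_done = True
    except NotSupported:
        network_done = True
    # Only drop the local cluster config once both steps are settled.
    if ports_done and network_done:
        remove_config()
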
@@ -4400,18 +5030,22 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
            # https://github.com/skypilot-org/skypilot/pull/4443#discussion_r1872798032
            attempts = 0
            while True:
-                config =
+                config = global_user_state.get_cluster_yaml_dict(
+                    handle.cluster_yaml)

                logger.debug(f'instance statuses attempt {attempts + 1}')
                node_status_dict = provision_lib.query_instances(
                    repr(cloud),
+                    handle.cluster_name,
                    cluster_name_on_cloud,
                    config['provider'],
                    non_terminated_only=False)

                unexpected_node_state: Optional[Tuple[str, str]] = None
-                for node_id,
-
+                for node_id, node_status_tuple in node_status_dict.items():
+                    node_status, reason = node_status_tuple
+                    reason = '' if reason is None else f' ({reason})'
+                    logger.debug(f'{node_id} status: {node_status}{reason}')
                    # FIXME(cooperc): Some clouds (e.g. GCP) do not distinguish
                    # between "stopping/stopped" and "terminating/terminated",
                    # so we allow for either status instead of casing on
@@ -4456,13 +5090,18 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):

    def remove_cluster_config(self, handle: CloudVmRayResourceHandle) -> None:
        """Remove the YAML config of a cluster."""
+        cluster_yaml_path = handle.cluster_yaml
        handle.cluster_yaml = None
        global_user_state.update_cluster_handle(handle.cluster_name, handle)
-
+        # Removing the cluster YAML can cause some unexpected stability issues.
+        # See #5011.
+        # global_user_state.remove_cluster_yaml(handle.cluster_name)
+        common_utils.remove_file_if_exists(cluster_yaml_path)

    def set_autostop(self,
                     handle: CloudVmRayResourceHandle,
                     idle_minutes_to_autostop: Optional[int],
+                     wait_for: Optional[autostop_lib.AutostopWaitFor],
                     down: bool = False,
                     stream_logs: bool = True) -> None:
        # The core.autostop() function should have already checked that the
@@ -4489,6 +5128,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):

            # down = False is the default, but warn the user in case
            # they have explicitly specified it.
+            # TODO(cooperc): Fix for new autostop stuff.
            config_override_down = skypilot_config.get_nested(
                (controller.value.controller_type, 'controller',
                 'autostop', 'down'), None)
@@ -4508,17 +5148,26 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
        # Check if we're stopping spot
        assert (handle.launched_resources is not None and
                handle.launched_resources.cloud is not None), handle
-
-
-
-
-
-
-
-
-
-
-
+        if handle.is_grpc_enabled_with_flag:
+            request = autostopv1_pb2.SetAutostopRequest(
+                idle_minutes=idle_minutes_to_autostop,
+                backend=self.NAME,
+                wait_for=wait_for.to_protobuf() if wait_for is not None else
+                autostopv1_pb2.AUTOSTOP_WAIT_FOR_UNSPECIFIED,
+                down=down,
+            )
+            backend_utils.invoke_skylet_with_retries(lambda: SkyletClient(
+                handle.get_grpc_channel()).set_autostop(request))
+        else:
+            code = autostop_lib.AutostopCodeGen.set_autostop(
+                idle_minutes_to_autostop, self.NAME, wait_for, down)
+            returncode, _, stderr = self.run_on_head(
+                handle, code, require_outputs=True, stream_logs=stream_logs)
+            subprocess_utils.handle_returncode(returncode,
+                                               code,
+                                               'Failed to set autostop',
+                                               stderr=stderr,
+                                               stream_logs=stream_logs)
        global_user_state.set_cluster_autostop_value(
            handle.cluster_name, idle_minutes_to_autostop, down)

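
`set_autostop` now maps an optional `AutostopWaitFor` onto the protobuf enum, defaulting to UNSPECIFIED when unset. The shape of that mapping, with hypothetical placeholder values rather than the generated `autostopv1_pb2` constants:

import enum
from typing import Optional

AUTOSTOP_WAIT_FOR_UNSPECIFIED = 0  # placeholder for the pb2 constant


class AutostopWaitFor(enum.Enum):
    """Hypothetical mirror of autostop_lib.AutostopWaitFor."""
    LAUNCHED_JOBS = 1
    JOBS_AND_SSH = 2

    def to_protobuf(self) -> int:
        return self.value


def wait_for_field(wait_for: Optional[AutostopWaitFor]) -> int:
    return (wait_for.to_protobuf()
            if wait_for is not None else AUTOSTOP_WAIT_FOR_UNSPECIFIED)


assert wait_for_field(None) == AUTOSTOP_WAIT_FOR_UNSPECIFIED
assert wait_for_field(AutostopWaitFor.JOBS_AND_SSH) == 2
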
@@ -4543,22 +5192,33 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
            # The head node of the cluster is not UP or in an abnormal state.
            # We cannot check if the cluster is autostopping.
            return False
-
-
-
-
-
-
-
-
-
-
-
-
+        if handle.is_grpc_enabled_with_flag:
+            try:
+                request = autostopv1_pb2.IsAutostoppingRequest()
+                response = backend_utils.invoke_skylet_with_retries(
+                    lambda: SkyletClient(handle.get_grpc_channel()
+                                        ).is_autostopping(request))
+                return response.is_autostopping
+            except Exception as e:  # pylint: disable=broad-except
+                # The cluster may have been terminated, causing the gRPC call
+                # to timeout and fail.
+                logger.debug(f'Failed to check if cluster is autostopping: {e}')
+                return False
+        else:
+            code = autostop_lib.AutostopCodeGen.is_autostopping()
+            returncode, stdout, stderr = self.run_on_head(
+                handle, code, require_outputs=True, stream_logs=stream_logs)
+            if returncode == 0:
+                return message_utils.decode_payload(stdout)
+            logger.debug('Failed to check if cluster is autostopping with '
+                         f'{returncode}: {stdout+stderr}\n'
+                         f'Command: {code}')
+            return False

    # TODO(zhwu): Refactor this to a CommandRunner class, so different backends
    # can support its own command runner.
    @timeline.event
+    @context_utils.cancellation_guard
    def run_on_head(
        self,
        handle: CloudVmRayResourceHandle,
@@ -4649,7 +5309,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
            exceptions.InvalidClusterNameError: If the cluster name is invalid.
            # TODO(zhwu): complete the list of exceptions.
        """
-        record = global_user_state.get_cluster_from_name(
+        record = global_user_state.get_cluster_from_name(
+            cluster_name, include_user_info=False, summary_response=True)
        if record is None:
            handle_before_refresh = None
            status_before_refresh = None
@@ -4657,6 +5318,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
            handle_before_refresh = record['handle']
            status_before_refresh = record['status']

+        handle: Optional[CloudVmRayResourceHandle]
        prev_cluster_status, handle = (status_before_refresh,
                                       handle_before_refresh)

@@ -4668,7 +5330,9 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
            record = backend_utils.refresh_cluster_record(
                cluster_name,
                force_refresh_statuses={status_lib.ClusterStatus.INIT},
-
+                cluster_lock_already_held=True,
+                include_user_info=False,
+                summary_response=True,
            )
            if record is not None:
                prev_cluster_status = record['status']
@@ -4677,7 +5341,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
            prev_cluster_status = None
            handle = None
        # We should check the cluster_ever_up after refresh, because if the
-        # cluster is terminated (through console or auto-
+        # cluster is terminated (through console or auto-down), the record will
        # become None and the cluster_ever_up should be considered as False.
        cluster_ever_up = record is not None and record['cluster_ever_up']
        prev_config_hash = record['config_hash'] if record is not None else None
@@ -4690,16 +5354,22 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
        self.check_resources_fit_cluster(handle, task)
        # Use the existing cluster.
        assert handle.launched_resources is not None, (cluster_name, handle)
+        # Take a random resource in order to get resource info that applies
+        # to all resources.
+        one_task_resource = list(task.resources)[0]
+
        # Assume resources share the same ports.
        for resource in task.resources:
-            assert resource.ports ==
+            assert resource.ports == one_task_resource.ports
        requested_ports_set = resources_utils.port_ranges_to_set(
-
+            one_task_resource.ports)
        current_ports_set = resources_utils.port_ranges_to_set(
            handle.launched_resources.ports)
        all_ports = resources_utils.port_set_to_ranges(current_ports_set |
                                                       requested_ports_set)
        to_provision = handle.launched_resources
+        assert to_provision is not None
+        to_provision = to_provision.assert_launchable()
        if (to_provision.cloud.OPEN_PORTS_VERSION <=
                clouds.OpenPortsVersion.LAUNCH_ONLY):
            if not requested_ports_set <= current_ports_set:
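
The port reconciliation above unions the currently open ports with the newly requested ones, and rejects launch-only clouds whose open ports cannot be mutated after launch. A simplified stand-in for `resources_utils.port_ranges_to_set` makes the set logic concrete:

from typing import List, Set


def port_ranges_to_set(ports: List[str]) -> Set[int]:
    """Simplified stand-in for resources_utils.port_ranges_to_set."""
    result: Set[int] = set()
    for rng in ports:
        if '-' in rng:
            start, end = rng.split('-')
            result.update(range(int(start), int(end) + 1))
        else:
            result.add(int(rng))
    return result


current_ports_set = port_ranges_to_set(['8080'])
requested_ports_set = port_ranges_to_set(['8080-8082'])
all_ports = current_ports_set | requested_ports_set
# On LAUNCH_ONLY clouds, a failed subset check raises an error above.
print(requested_ports_set <= current_ports_set)  # False
print(sorted(all_ports))  # [8080, 8081, 8082]
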
@@ -4713,6 +5383,57 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                    'a new cluster with the desired ports open.')
            if all_ports:
                to_provision = to_provision.copy(ports=all_ports)
+        # Docker login should always be the same for all resources, since
+        # it's set from envs.
+        for resource in task.resources:
+            assert (resource.docker_login_config ==
+                    one_task_resource.docker_login_config), (
+                        resource.docker_login_config,
+                        one_task_resource.docker_login_config)
+        # If we have docker login config in the new task, override the
+        # existing resources to pick up new credentials. This allows the
+        # user to specify new or fixed credentials if the existing
+        # credentials are not working. If we don't do this, the credentials
+        # from the existing resources will always be reused.
+        if one_task_resource.docker_login_config is not None:
+            to_provision = to_provision.copy(
+                _docker_login_config=one_task_resource.docker_login_config)
+
+        # cluster_config_overrides should be the same for all resources.
+        for resource in task.resources:
+            assert (resource.cluster_config_overrides ==
+                    one_task_resource.cluster_config_overrides)
+        if isinstance(to_provision.cloud, clouds.Kubernetes):
+            # Warn users if the Kubernetes pod config is different
+            # from the existing cluster.
+            cluster_yaml_str = global_user_state.get_cluster_yaml_str(
+                cluster_name)
+            actual_cluster_yaml_obj = yaml_utils.safe_load(cluster_yaml_str)
+            desired_cluster_yaml_obj = (
+                kubernetes_utils.combine_pod_config_fields_and_metadata(
+                    actual_cluster_yaml_obj,
+                    cluster_config_overrides=one_task_resource.
+                    cluster_config_overrides,
+                    cloud=to_provision.cloud,
+                    context=to_provision.region))
+
+            def _get_pod_config(yaml_obj: Dict[str, Any]) -> Dict[str, Any]:
+                return (yaml_obj.get('available_node_types',
+                                     {}).get('ray_head_default',
+                                             {}).get('node_config', {}))
+
+            if _get_pod_config(desired_cluster_yaml_obj) != _get_pod_config(
+                    actual_cluster_yaml_obj):
+                # pylint: disable=line-too-long
+                logger.warning(
+                    f'{colorama.Fore.YELLOW}WARNING: Kubernetes pod config mismatch detected. Task requires different '
+                    f'pod config than the existing cluster. The existing '
+                    f'cluster will be used with its current pod config.'
+                    f'To apply use your task\'s new pod config:\n'
+                    f'  • Use a new cluster'
+                    f'  • Or restart this cluster: sky down {cluster_name}; sky launch -c {cluster_name} ...'
+                    f'{colorama.Style.RESET_ALL}')
+
        return RetryingVmProvisioner.ToProvisionConfig(
            cluster_name,
            to_provision,
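
The Kubernetes mismatch warning hinges on comparing only the head pod's `node_config` between the desired and actual cluster YAMLs. The comparison in isolation, on two hypothetical YAML objects:

from typing import Any, Dict


def _get_pod_config(yaml_obj: Dict[str, Any]) -> Dict[str, Any]:
    # Drill into available_node_types.ray_head_default.node_config,
    # defaulting to {} at every level.
    return (yaml_obj.get('available_node_types',
                         {}).get('ray_head_default',
                                 {}).get('node_config', {}))


actual = {
    'available_node_types': {
        'ray_head_default': {
            'node_config': {'spec': {'priorityClassName': 'low'}}
        }
    }
}
desired = {
    'available_node_types': {
        'ray_head_default': {
            'node_config': {'spec': {'priorityClassName': 'high'}}
        }
    }
}
# A difference here triggers the pod-config mismatch warning above.
print(_get_pod_config(desired) != _get_pod_config(actual))  # True
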
@@ -4727,33 +5448,41 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
        common_utils.check_cluster_name_is_valid(cluster_name)

        if to_provision is None:
-            #
-            #
-            #
-            #
-            #
-            #
-            #
-
-
-
-
-                handle_before_refresh,
-
-
-
-
-
-
-
-
-
-
-
-
+            # Recently terminated after refresh. OPTIMIZE usually ran outside
+            # the lock, so that decision may be stale by now. Under the lock,
+            # ensure we always have a concrete plan via the following order:
+            # 1) Reuse last placement snapshot (if available);
+            # 2) Else, call injected planner for a fresh plan.
+            # If we still have a pre-refresh handle snapshot with a concrete
+            # placement, prefer reusing it.
+            if (isinstance(handle_before_refresh, CloudVmRayResourceHandle) and
+                    handle_before_refresh.launched_resources is not None):
+                to_provision = handle_before_refresh.launched_resources
+                # Ensure the requested task fits the previous placement.
+                self.check_resources_fit_cluster(handle_before_refresh, task)
+                # Mirror the original message for reuse path.
+                status_before_refresh_str = None
+                if status_before_refresh is not None:
+                    status_before_refresh_str = status_before_refresh.value
+                logger.info(
+                    f'The cluster {cluster_name!r} (status: '
+                    f'{status_before_refresh_str}) was not found on the cloud: '
+                    'it may be autodowned, manually terminated, or its launch '
+                    'never succeeded. Provisioning a new cluster by using the '
+                    'same resources as its original launch.')
+            elif self._planner is not None:
+                to_provision = self._planner(task)
+                logger.info(
+                    'Previous placement snapshot missing; computing a fresh '
+                    'plan for provisioning.')
+            else:
+                # Without a snapshot or planner, we cannot proceed safely.
+                # Surface a user-friendly error without a long traceback.
+                with ux_utils.print_exception_no_traceback():
+                    raise RuntimeError(
+                        'No concrete launch plan available after recent cloud '
+                        f'termination of cluster {cluster_name!r}. Ensure the '
+                        'OPTIMIZE stage runs or provide concrete resources.')

        return RetryingVmProvisioner.ToProvisionConfig(
            cluster_name,
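
When a cluster vanished between refresh and launch, the rewritten branch above walks a three-step ladder: reuse the pre-refresh placement snapshot, else consult the injected planner, else fail without a traceback. Schematically (all names hypothetical):

from typing import Callable, Optional


def recover_plan(snapshot: Optional[str],
                 planner: Optional[Callable[[], str]]) -> str:
    if snapshot is not None:
        return snapshot  # 1) Reuse the last concrete placement.
    if planner is not None:
        return planner()  # 2) Compute a fresh plan.
    # 3) Nothing to go on: surface a clear error.
    raise RuntimeError('No concrete launch plan available.')


assert recover_plan('aws/us-east-1', None) == 'aws/us-east-1'
assert recover_plan(None, lambda: 'gcp/us-central1') == 'gcp/us-central1'
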
@@ -5033,18 +5762,17 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                # reconstruct them during cluster restart.
                continue
            storage_mounts_metadata[dst] = storage_obj.handle
-
-            backend_utils.CLUSTER_FILE_MOUNTS_LOCK_PATH.format(cluster_name))
+        lock_id = backend_utils.cluster_file_mounts_lock_id(cluster_name)
        lock_timeout = backend_utils.CLUSTER_FILE_MOUNTS_LOCK_TIMEOUT_SECONDS
        try:
-            with
+            with locks.get_lock(lock_id, lock_timeout):
                global_user_state.set_cluster_storage_mounts_metadata(
                    cluster_name, storage_mounts_metadata)
-        except
+        except locks.LockTimeout as e:
            raise RuntimeError(
                f'Failed to store metadata for cluster {cluster_name!r} due to '
                'a timeout when trying to access local database. Please '
-                f'try again or manually remove the lock at {
+                f'try again or manually remove the lock at {lock_id}. '
                f'{common_utils.format_exception(e)}') from None

@@ -5055,19 +5783,18 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
        After retrieving storage_mounts_metadata, it converts back the
        StorageMetadata to Storage object and restores 'storage_mounts.'
        """
-
-            backend_utils.CLUSTER_FILE_MOUNTS_LOCK_PATH.format(cluster_name))
+        lock_id = backend_utils.cluster_file_mounts_lock_id(cluster_name)
        lock_timeout = backend_utils.CLUSTER_FILE_MOUNTS_LOCK_TIMEOUT_SECONDS
        try:
-            with
+            with locks.get_lock(lock_id, lock_timeout):
                storage_mounts_metadata = (
                    global_user_state.get_cluster_storage_mounts_metadata(
                        cluster_name))
-        except
+        except locks.LockTimeout as e:
            raise RuntimeError(
                f'Failed to retrieve metadata for cluster {cluster_name!r} '
                'due to a timeout when trying to access local database. '
-                f'Please try again or manually remove the lock at {
+                f'Please try again or manually remove the lock at {lock_id}.'
                f' {common_utils.format_exception(e)}') from None

        if storage_mounts_metadata is None:
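
Both metadata helpers now take a lock by id through the new `locks` module instead of formatting a hard-coded lock-file path. The shape of the guard, approximated here with the `filelock` package since the `locks` module itself is not shown in this diff:

import filelock

lock_id = '/tmp/.sky.file_mounts.mycluster.lock'  # hypothetical lock id
lock_timeout = 10  # seconds

try:
    with filelock.FileLock(lock_id, timeout=lock_timeout):
        pass  # read or write the cluster's storage-mount metadata here
except filelock.Timeout as e:
    raise RuntimeError(
        f'Timed out acquiring {lock_id}; try again or remove the lock. '
        f'{e}') from None
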
@@ -5104,7 +5831,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
    def _get_task_env_vars(self, task: task_lib.Task, job_id: int,
                           handle: CloudVmRayResourceHandle) -> Dict[str, str]:
        """Returns the environment variables for the task."""
-        env_vars =
+        env_vars = task_lib.get_plaintext_envs_and_secrets(
+            task.envs_and_secrets)
        # If it is a managed job, the TASK_ID_ENV_VAR will have been already set
        # by the controller.
        if constants.TASK_ID_ENV_VAR not in env_vars:
@@ -5116,11 +5844,17 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
        env_vars.update(self._skypilot_predefined_env_vars(handle))
        return env_vars

+    def _get_managed_job_user_id(self, task: task_lib.Task) -> Optional[str]:
+        """Returns the user id for the managed job."""
+        if task.managed_job_dag is not None:
+            return task.envs[constants.USER_ID_ENV_VAR]
+        return None
+
    def _execute_task_one_node(self, handle: CloudVmRayResourceHandle,
                               task: task_lib.Task, job_id: int,
-
+                               remote_log_dir: str) -> None:
        # Launch the command as a Ray task.
-        log_dir = os.path.join(
+        log_dir = os.path.join(remote_log_dir, 'tasks')

        resources_dict = backend_utils.get_task_demands_dict(task)
        internal_ips = handle.internal_ips()
@@ -5128,9 +5862,9 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):

        task_env_vars = self._get_task_env_vars(task, job_id, handle)

-        codegen = RayCodeGen()
+        codegen = task_codegen.RayCodeGen()
        codegen.add_prologue(job_id)
-        codegen.
+        codegen.add_setup(
            1,
            resources_dict,
            stable_cluster_internal_ips=internal_ips,
@@ -5139,36 +5873,32 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
            setup_log_path=os.path.join(log_dir, 'setup.log'),
        )

-
-
-
-        codegen.register_run_fn(run_fn_code, run_fn_name)
-
-        command_for_node = task.run if isinstance(task.run, str) else None
-        codegen.add_ray_task(
-            bash_script=command_for_node,
+        codegen.add_task(
+            1,
+            bash_script=task.run,
            env_vars=task_env_vars,
            task_name=task.name,
-
+            resources_dict=backend_utils.get_task_demands_dict(task),
            log_dir=log_dir)

        codegen.add_epilogue()

-        self._exec_code_on_head(
-
-
-
-
+        self._exec_code_on_head(
+            handle,
+            codegen.build(),
+            job_id,
+            managed_job_dag=task.managed_job_dag,
+            managed_job_user_id=self._get_managed_job_user_id(task),
+            remote_log_dir=remote_log_dir)

    def _execute_task_n_nodes(self, handle: CloudVmRayResourceHandle,
                              task: task_lib.Task, job_id: int,
-
+                              remote_log_dir: str) -> None:
        # Strategy:
        #   ray.init(...)
        #   for node:
        #       submit _run_cmd(cmd) with resource {node_i: 1}
-
-        log_dir = os.path.join(log_dir_base, 'tasks')
+        log_dir = os.path.join(remote_log_dir, 'tasks')
        resources_dict = backend_utils.get_task_demands_dict(task)
        internal_ips = handle.internal_ips()
        assert internal_ips is not None, 'internal_ips is not cached in handle'
@@ -5177,9 +5907,9 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
        num_actual_nodes = task.num_nodes * handle.num_ips_per_node
        task_env_vars = self._get_task_env_vars(task, job_id, handle)

-        codegen = RayCodeGen()
+        codegen = task_codegen.RayCodeGen()
        codegen.add_prologue(job_id)
-        codegen.
+        codegen.add_setup(
            num_actual_nodes,
            resources_dict,
            stable_cluster_internal_ips=internal_ips,
@@ -5188,30 +5918,20 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
            setup_log_path=os.path.join(log_dir, 'setup.log'),
        )

-
-
-
-
-
-
-
-        for i in range(num_actual_nodes):
-            command_for_node = task.run if isinstance(task.run, str) else None
-
-            # Ray's per-node resources, to constrain scheduling each command to
-            # the corresponding node, represented by private IPs.
-            codegen.add_ray_task(
-                bash_script=command_for_node,
-                env_vars=task_env_vars,
-                task_name=task.name,
-                ray_resources_dict=backend_utils.get_task_demands_dict(task),
-                log_dir=log_dir,
-                gang_scheduling_id=i)
+        codegen.add_task(
+            num_actual_nodes,
+            bash_script=task.run,
+            env_vars=task_env_vars,
+            task_name=task.name,
+            resources_dict=backend_utils.get_task_demands_dict(task),
+            log_dir=log_dir)

        codegen.add_epilogue()
        # TODO(zhanghao): Add help info for downloading logs.
-        self._exec_code_on_head(
-
-
-
-
+        self._exec_code_on_head(
+            handle,
+            codegen.build(),
+            job_id,
+            managed_job_dag=task.managed_job_dag,
+            managed_job_user_id=self._get_managed_job_user_id(task),
+            remote_log_dir=remote_log_dir)
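
Both executors now drive the extracted `task_codegen.RayCodeGen` through the same sequence: prologue, setup, a single `add_task` covering all nodes (replacing the old per-node `add_ray_task` loop), epilogue, then `build()`. A toy builder with the same call shape (illustrative only, not SkyPilot's implementation):

from typing import Dict, List, Optional


class ToyCodeGen:
    """Toy builder mirroring the prologue/setup/task/epilogue call shape."""

    def __init__(self) -> None:
        self._parts: List[str] = []

    def add_prologue(self, job_id: int) -> None:
        self._parts.append(f'# prologue for job {job_id}')

    def add_setup(self, num_nodes: int, resources: Dict[str, float]) -> None:
        self._parts.append(f'# reserve {num_nodes} node(s): {resources}')

    def add_task(self, num_nodes: int, bash_script: Optional[str],
                 env_vars: Dict[str, str], task_name: Optional[str]) -> None:
        # One call schedules the command across all nodes.
        self._parts.append(
            f'# run {task_name!r} on {num_nodes} node(s): {bash_script}')

    def add_epilogue(self) -> None:
        self._parts.append('# epilogue: gather return codes')

    def build(self) -> str:
        return '\n'.join(self._parts)


codegen = ToyCodeGen()
codegen.add_prologue(job_id=1)
codegen.add_setup(num_nodes=2, resources={'CPU': 0.5})
codegen.add_task(2, 'echo hello', {'FOO': 'bar'}, task_name='demo')
codegen.add_epilogue()
print(codegen.build())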