skypilot-nightly 1.0.0.dev20250509__py3-none-any.whl → 1.0.0.dev20251107__py3-none-any.whl
This diff compares the contents of two publicly available package versions as released to their respective public registries. It is provided for informational purposes only.
- sky/__init__.py +22 -6
- sky/adaptors/aws.py +25 -7
- sky/adaptors/common.py +24 -1
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/do.py +8 -2
- sky/adaptors/hyperbolic.py +8 -0
- sky/adaptors/kubernetes.py +149 -18
- sky/adaptors/nebius.py +170 -17
- sky/adaptors/primeintellect.py +1 -0
- sky/adaptors/runpod.py +68 -0
- sky/adaptors/seeweb.py +167 -0
- sky/adaptors/shadeform.py +89 -0
- sky/admin_policy.py +187 -4
- sky/authentication.py +179 -225
- sky/backends/__init__.py +4 -2
- sky/backends/backend.py +22 -9
- sky/backends/backend_utils.py +1299 -380
- sky/backends/cloud_vm_ray_backend.py +1715 -518
- sky/backends/docker_utils.py +1 -1
- sky/backends/local_docker_backend.py +11 -6
- sky/backends/wheel_utils.py +37 -9
- sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
- sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
- sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
- sky/{clouds/service_catalog → catalog}/common.py +89 -48
- sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
- sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +30 -40
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +42 -15
- sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
- sky/catalog/data_fetchers/fetch_nebius.py +335 -0
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
- sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
- sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
- sky/catalog/hyperbolic_catalog.py +136 -0
- sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
- sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
- sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
- sky/catalog/primeintellect_catalog.py +95 -0
- sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
- sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
- sky/catalog/seeweb_catalog.py +184 -0
- sky/catalog/shadeform_catalog.py +165 -0
- sky/catalog/ssh_catalog.py +167 -0
- sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
- sky/check.py +491 -203
- sky/cli.py +5 -6005
- sky/client/{cli.py → cli/command.py} +2477 -1885
- sky/client/cli/deprecation_utils.py +99 -0
- sky/client/cli/flags.py +359 -0
- sky/client/cli/table_utils.py +320 -0
- sky/client/common.py +70 -32
- sky/client/oauth.py +82 -0
- sky/client/sdk.py +1203 -297
- sky/client/sdk_async.py +833 -0
- sky/client/service_account_auth.py +47 -0
- sky/cloud_stores.py +73 -0
- sky/clouds/__init__.py +13 -0
- sky/clouds/aws.py +358 -93
- sky/clouds/azure.py +105 -83
- sky/clouds/cloud.py +127 -36
- sky/clouds/cudo.py +68 -50
- sky/clouds/do.py +66 -48
- sky/clouds/fluidstack.py +63 -44
- sky/clouds/gcp.py +339 -110
- sky/clouds/hyperbolic.py +293 -0
- sky/clouds/ibm.py +70 -49
- sky/clouds/kubernetes.py +563 -162
- sky/clouds/lambda_cloud.py +74 -54
- sky/clouds/nebius.py +206 -80
- sky/clouds/oci.py +88 -66
- sky/clouds/paperspace.py +61 -44
- sky/clouds/primeintellect.py +317 -0
- sky/clouds/runpod.py +164 -74
- sky/clouds/scp.py +89 -83
- sky/clouds/seeweb.py +466 -0
- sky/clouds/shadeform.py +400 -0
- sky/clouds/ssh.py +263 -0
- sky/clouds/utils/aws_utils.py +10 -4
- sky/clouds/utils/gcp_utils.py +87 -11
- sky/clouds/utils/oci_utils.py +38 -14
- sky/clouds/utils/scp_utils.py +177 -124
- sky/clouds/vast.py +99 -77
- sky/clouds/vsphere.py +51 -40
- sky/core.py +349 -139
- sky/dag.py +15 -0
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
- sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
- sky/dashboard/out/_next/static/chunks/1871-74503c8e80fd253b.js +6 -0
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
- sky/dashboard/out/_next/static/chunks/2755.fff53c4a3fcae910.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.72362fa129305b19.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.ad6adaa2a0fa9768.js +1 -0
- sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
- sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
- sky/dashboard/out/_next/static/chunks/4725.a830b5c9e7867c92.js +1 -0
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
- sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
- sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
- sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
- sky/dashboard/out/_next/static/chunks/6601-06114c982db410b6.js +1 -0
- sky/dashboard/out/_next/static/chunks/6856-ef8ba11f96d8c4a3.js +1 -0
- sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-32b6e2d3822301fa.js +1 -0
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
- sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
- sky/dashboard/out/_next/static/chunks/7615-3301e838e5f25772.js +1 -0
- sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
- sky/dashboard/out/_next/static/chunks/8969-1e4613c651bf4051.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.7310982cf5a0dc79.js +31 -0
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
- sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
- sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
- sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
- sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-c736ead69c2d86ec.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a37d2063af475a1c.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-d44859594e6f8064.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-5796e8d6aea291a0.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-6edeb7d06032adfc.js +21 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-479dde13399cf270.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-5ab3b907622cf0fe.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-c5a3eeee1c218af1.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces-22b23febb3e89ce1.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-2679be77fc08a2f8.js +1 -0
- sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
- sky/dashboard/out/_next/static/zB0ed6ge_W1MDszVHhijS/_buildManifest.js +1 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -0
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -0
- sky/dashboard/out/infra.html +1 -0
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -0
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -0
- sky/dashboard/out/volumes.html +1 -0
- sky/dashboard/out/workspace/new.html +1 -0
- sky/dashboard/out/workspaces/[name].html +1 -0
- sky/dashboard/out/workspaces.html +1 -0
- sky/data/data_utils.py +137 -1
- sky/data/mounting_utils.py +269 -84
- sky/data/storage.py +1451 -1807
- sky/data/storage_utils.py +43 -57
- sky/exceptions.py +132 -2
- sky/execution.py +206 -63
- sky/global_user_state.py +2374 -586
- sky/jobs/__init__.py +5 -0
- sky/jobs/client/sdk.py +242 -65
- sky/jobs/client/sdk_async.py +143 -0
- sky/jobs/constants.py +9 -8
- sky/jobs/controller.py +839 -277
- sky/jobs/file_content_utils.py +80 -0
- sky/jobs/log_gc.py +201 -0
- sky/jobs/recovery_strategy.py +398 -152
- sky/jobs/scheduler.py +315 -189
- sky/jobs/server/core.py +829 -255
- sky/jobs/server/server.py +156 -115
- sky/jobs/server/utils.py +136 -0
- sky/jobs/state.py +2092 -701
- sky/jobs/utils.py +1242 -160
- sky/logs/__init__.py +21 -0
- sky/logs/agent.py +108 -0
- sky/logs/aws.py +243 -0
- sky/logs/gcp.py +91 -0
- sky/metrics/__init__.py +0 -0
- sky/metrics/utils.py +443 -0
- sky/models.py +78 -1
- sky/optimizer.py +164 -70
- sky/provision/__init__.py +90 -4
- sky/provision/aws/config.py +147 -26
- sky/provision/aws/instance.py +135 -50
- sky/provision/azure/instance.py +10 -5
- sky/provision/common.py +13 -1
- sky/provision/cudo/cudo_machine_type.py +1 -1
- sky/provision/cudo/cudo_utils.py +14 -8
- sky/provision/cudo/cudo_wrapper.py +72 -71
- sky/provision/cudo/instance.py +10 -6
- sky/provision/do/instance.py +10 -6
- sky/provision/do/utils.py +4 -3
- sky/provision/docker_utils.py +114 -23
- sky/provision/fluidstack/instance.py +13 -8
- sky/provision/gcp/__init__.py +1 -0
- sky/provision/gcp/config.py +301 -19
- sky/provision/gcp/constants.py +218 -0
- sky/provision/gcp/instance.py +36 -8
- sky/provision/gcp/instance_utils.py +18 -4
- sky/provision/gcp/volume_utils.py +247 -0
- sky/provision/hyperbolic/__init__.py +12 -0
- sky/provision/hyperbolic/config.py +10 -0
- sky/provision/hyperbolic/instance.py +437 -0
- sky/provision/hyperbolic/utils.py +373 -0
- sky/provision/instance_setup.py +93 -14
- sky/provision/kubernetes/__init__.py +5 -0
- sky/provision/kubernetes/config.py +9 -52
- sky/provision/kubernetes/constants.py +17 -0
- sky/provision/kubernetes/instance.py +789 -247
- sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
- sky/provision/kubernetes/network.py +27 -17
- sky/provision/kubernetes/network_utils.py +40 -43
- sky/provision/kubernetes/utils.py +1192 -531
- sky/provision/kubernetes/volume.py +282 -0
- sky/provision/lambda_cloud/instance.py +22 -16
- sky/provision/nebius/constants.py +50 -0
- sky/provision/nebius/instance.py +19 -6
- sky/provision/nebius/utils.py +196 -91
- sky/provision/oci/instance.py +10 -5
- sky/provision/paperspace/instance.py +10 -7
- sky/provision/paperspace/utils.py +1 -1
- sky/provision/primeintellect/__init__.py +10 -0
- sky/provision/primeintellect/config.py +11 -0
- sky/provision/primeintellect/instance.py +454 -0
- sky/provision/primeintellect/utils.py +398 -0
- sky/provision/provisioner.py +110 -36
- sky/provision/runpod/__init__.py +5 -0
- sky/provision/runpod/instance.py +27 -6
- sky/provision/runpod/utils.py +51 -18
- sky/provision/runpod/volume.py +180 -0
- sky/provision/scp/__init__.py +15 -0
- sky/provision/scp/config.py +93 -0
- sky/provision/scp/instance.py +531 -0
- sky/provision/seeweb/__init__.py +11 -0
- sky/provision/seeweb/config.py +13 -0
- sky/provision/seeweb/instance.py +807 -0
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/ssh/__init__.py +18 -0
- sky/provision/vast/instance.py +13 -8
- sky/provision/vast/utils.py +10 -7
- sky/provision/vsphere/common/vim_utils.py +1 -2
- sky/provision/vsphere/instance.py +15 -10
- sky/provision/vsphere/vsphere_utils.py +9 -19
- sky/py.typed +0 -0
- sky/resources.py +844 -118
- sky/schemas/__init__.py +0 -0
- sky/schemas/api/__init__.py +0 -0
- sky/schemas/api/responses.py +225 -0
- sky/schemas/db/README +4 -0
- sky/schemas/db/env.py +90 -0
- sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
- sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
- sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
- sky/schemas/db/global_user_state/004_is_managed.py +34 -0
- sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
- sky/schemas/db/global_user_state/006_provision_log.py +41 -0
- sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
- sky/schemas/db/script.py.mako +28 -0
- sky/schemas/db/serve_state/001_initial_schema.py +67 -0
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
- sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
- sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/generated/__init__.py +0 -0
- sky/schemas/generated/autostopv1_pb2.py +36 -0
- sky/schemas/generated/autostopv1_pb2.pyi +43 -0
- sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
- sky/schemas/generated/jobsv1_pb2.py +86 -0
- sky/schemas/generated/jobsv1_pb2.pyi +254 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +74 -0
- sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
- sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
- sky/schemas/generated/servev1_pb2.py +58 -0
- sky/schemas/generated/servev1_pb2.pyi +115 -0
- sky/schemas/generated/servev1_pb2_grpc.py +322 -0
- sky/serve/autoscalers.py +357 -5
- sky/serve/client/impl.py +310 -0
- sky/serve/client/sdk.py +47 -139
- sky/serve/client/sdk_async.py +130 -0
- sky/serve/constants.py +10 -8
- sky/serve/controller.py +64 -19
- sky/serve/load_balancer.py +106 -60
- sky/serve/load_balancing_policies.py +115 -1
- sky/serve/replica_managers.py +273 -162
- sky/serve/serve_rpc_utils.py +179 -0
- sky/serve/serve_state.py +554 -251
- sky/serve/serve_utils.py +733 -220
- sky/serve/server/core.py +66 -711
- sky/serve/server/impl.py +1093 -0
- sky/serve/server/server.py +21 -18
- sky/serve/service.py +133 -48
- sky/serve/service_spec.py +135 -16
- sky/serve/spot_placer.py +3 -0
- sky/server/auth/__init__.py +0 -0
- sky/server/auth/authn.py +50 -0
- sky/server/auth/loopback.py +38 -0
- sky/server/auth/oauth2_proxy.py +200 -0
- sky/server/common.py +475 -181
- sky/server/config.py +81 -23
- sky/server/constants.py +44 -6
- sky/server/daemons.py +229 -0
- sky/server/html/token_page.html +185 -0
- sky/server/metrics.py +160 -0
- sky/server/requests/executor.py +528 -138
- sky/server/requests/payloads.py +351 -17
- sky/server/requests/preconditions.py +21 -17
- sky/server/requests/process.py +112 -29
- sky/server/requests/request_names.py +120 -0
- sky/server/requests/requests.py +817 -224
- sky/server/requests/serializers/decoders.py +82 -31
- sky/server/requests/serializers/encoders.py +140 -22
- sky/server/requests/threads.py +106 -0
- sky/server/rest.py +417 -0
- sky/server/server.py +1290 -284
- sky/server/state.py +20 -0
- sky/server/stream_utils.py +345 -57
- sky/server/uvicorn.py +217 -3
- sky/server/versions.py +270 -0
- sky/setup_files/MANIFEST.in +5 -0
- sky/setup_files/alembic.ini +156 -0
- sky/setup_files/dependencies.py +136 -31
- sky/setup_files/setup.py +44 -42
- sky/sky_logging.py +102 -5
- sky/skylet/attempt_skylet.py +1 -0
- sky/skylet/autostop_lib.py +129 -8
- sky/skylet/configs.py +27 -20
- sky/skylet/constants.py +171 -19
- sky/skylet/events.py +105 -21
- sky/skylet/job_lib.py +335 -104
- sky/skylet/log_lib.py +297 -18
- sky/skylet/log_lib.pyi +44 -1
- sky/skylet/ray_patches/__init__.py +17 -3
- sky/skylet/ray_patches/autoscaler.py.diff +18 -0
- sky/skylet/ray_patches/cli.py.diff +19 -0
- sky/skylet/ray_patches/command_runner.py.diff +17 -0
- sky/skylet/ray_patches/log_monitor.py.diff +20 -0
- sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
- sky/skylet/ray_patches/updater.py.diff +18 -0
- sky/skylet/ray_patches/worker.py.diff +41 -0
- sky/skylet/services.py +564 -0
- sky/skylet/skylet.py +63 -4
- sky/skylet/subprocess_daemon.py +103 -29
- sky/skypilot_config.py +506 -99
- sky/ssh_node_pools/__init__.py +1 -0
- sky/ssh_node_pools/core.py +135 -0
- sky/ssh_node_pools/server.py +233 -0
- sky/task.py +621 -137
- sky/templates/aws-ray.yml.j2 +10 -3
- sky/templates/azure-ray.yml.j2 +1 -1
- sky/templates/do-ray.yml.j2 +1 -1
- sky/templates/gcp-ray.yml.j2 +57 -0
- sky/templates/hyperbolic-ray.yml.j2 +67 -0
- sky/templates/jobs-controller.yaml.j2 +27 -24
- sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
- sky/templates/kubernetes-ray.yml.j2 +607 -51
- sky/templates/lambda-ray.yml.j2 +1 -1
- sky/templates/nebius-ray.yml.j2 +33 -12
- sky/templates/paperspace-ray.yml.j2 +1 -1
- sky/templates/primeintellect-ray.yml.j2 +71 -0
- sky/templates/runpod-ray.yml.j2 +9 -1
- sky/templates/scp-ray.yml.j2 +3 -50
- sky/templates/seeweb-ray.yml.j2 +108 -0
- sky/templates/shadeform-ray.yml.j2 +72 -0
- sky/templates/sky-serve-controller.yaml.j2 +22 -2
- sky/templates/websocket_proxy.py +178 -18
- sky/usage/usage_lib.py +18 -11
- sky/users/__init__.py +0 -0
- sky/users/model.conf +15 -0
- sky/users/permission.py +387 -0
- sky/users/rbac.py +121 -0
- sky/users/server.py +720 -0
- sky/users/token_service.py +218 -0
- sky/utils/accelerator_registry.py +34 -5
- sky/utils/admin_policy_utils.py +84 -38
- sky/utils/annotations.py +16 -5
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/auth_utils.py +153 -0
- sky/utils/benchmark_utils.py +60 -0
- sky/utils/cli_utils/status_utils.py +159 -86
- sky/utils/cluster_utils.py +31 -9
- sky/utils/command_runner.py +354 -68
- sky/utils/command_runner.pyi +93 -3
- sky/utils/common.py +35 -8
- sky/utils/common_utils.py +310 -87
- sky/utils/config_utils.py +87 -5
- sky/utils/context.py +402 -0
- sky/utils/context_utils.py +222 -0
- sky/utils/controller_utils.py +264 -89
- sky/utils/dag_utils.py +31 -12
- sky/utils/db/__init__.py +0 -0
- sky/utils/db/db_utils.py +470 -0
- sky/utils/db/migration_utils.py +133 -0
- sky/utils/directory_utils.py +12 -0
- sky/utils/env_options.py +13 -0
- sky/utils/git.py +567 -0
- sky/utils/git_clone.sh +460 -0
- sky/utils/infra_utils.py +195 -0
- sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
- sky/utils/kubernetes/config_map_utils.py +133 -0
- sky/utils/kubernetes/create_cluster.sh +13 -27
- sky/utils/kubernetes/delete_cluster.sh +10 -7
- sky/utils/kubernetes/deploy_remote_cluster.py +1299 -0
- sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
- sky/utils/kubernetes/generate_kind_config.py +6 -66
- sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
- sky/utils/kubernetes/gpu_labeler.py +5 -5
- sky/utils/kubernetes/kubernetes_deploy_utils.py +354 -47
- sky/utils/kubernetes/ssh-tunnel.sh +379 -0
- sky/utils/kubernetes/ssh_utils.py +221 -0
- sky/utils/kubernetes_enums.py +8 -15
- sky/utils/lock_events.py +94 -0
- sky/utils/locks.py +368 -0
- sky/utils/log_utils.py +300 -6
- sky/utils/perf_utils.py +22 -0
- sky/utils/resource_checker.py +298 -0
- sky/utils/resources_utils.py +249 -32
- sky/utils/rich_utils.py +213 -37
- sky/utils/schemas.py +905 -147
- sky/utils/serialize_utils.py +16 -0
- sky/utils/status_lib.py +10 -0
- sky/utils/subprocess_utils.py +38 -15
- sky/utils/tempstore.py +70 -0
- sky/utils/timeline.py +24 -52
- sky/utils/ux_utils.py +84 -15
- sky/utils/validator.py +11 -1
- sky/utils/volume.py +86 -0
- sky/utils/yaml_utils.py +111 -0
- sky/volumes/__init__.py +13 -0
- sky/volumes/client/__init__.py +0 -0
- sky/volumes/client/sdk.py +149 -0
- sky/volumes/server/__init__.py +0 -0
- sky/volumes/server/core.py +258 -0
- sky/volumes/server/server.py +122 -0
- sky/volumes/volume.py +212 -0
- sky/workspaces/__init__.py +0 -0
- sky/workspaces/core.py +655 -0
- sky/workspaces/server.py +101 -0
- sky/workspaces/utils.py +56 -0
- skypilot_nightly-1.0.0.dev20251107.dist-info/METADATA +675 -0
- skypilot_nightly-1.0.0.dev20251107.dist-info/RECORD +594 -0
- {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/WHEEL +1 -1
- sky/benchmark/benchmark_state.py +0 -256
- sky/benchmark/benchmark_utils.py +0 -641
- sky/clouds/service_catalog/constants.py +0 -7
- sky/dashboard/out/_next/static/LksQgChY5izXjokL3LcEu/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/236-f49500b82ad5392d.js +0 -6
- sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
- sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
- sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
- sky/dashboard/out/_next/static/chunks/845-0f8017370869e269.js +0 -1
- sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
- sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
- sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
- sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
- sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-e15db85d0ea1fbe1.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-03f279c6741fb48b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
- sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
- sky/jobs/dashboard/dashboard.py +0 -223
- sky/jobs/dashboard/static/favicon.ico +0 -0
- sky/jobs/dashboard/templates/index.html +0 -831
- sky/jobs/server/dashboard_utils.py +0 -69
- sky/skylet/providers/scp/__init__.py +0 -2
- sky/skylet/providers/scp/config.py +0 -149
- sky/skylet/providers/scp/node_provider.py +0 -578
- sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
- sky/utils/db_utils.py +0 -100
- sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
- skypilot_nightly-1.0.0.dev20250509.dist-info/METADATA +0 -361
- skypilot_nightly-1.0.0.dev20250509.dist-info/RECORD +0 -396
- /sky/{clouds/service_catalog → catalog}/config.py +0 -0
- /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
- /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
- /sky/dashboard/out/_next/static/{LksQgChY5izXjokL3LcEu → zB0ed6ge_W1MDszVHhijS}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/top_level.txt +0 -0
|
@@ -1,15 +1,17 @@
|
|
|
1
1
|
"""Backend: runs on cloud virtual machines, managed by Ray."""
|
|
2
2
|
import copy
|
|
3
|
+
import dataclasses
|
|
3
4
|
import enum
|
|
4
5
|
import inspect
|
|
5
6
|
import json
|
|
6
7
|
import math
|
|
7
8
|
import os
|
|
8
9
|
import pathlib
|
|
10
|
+
import random
|
|
9
11
|
import re
|
|
10
12
|
import shlex
|
|
11
|
-
import shutil
|
|
12
13
|
import signal
|
|
14
|
+
import socket
|
|
13
15
|
import subprocess
|
|
14
16
|
import sys
|
|
15
17
|
import tempfile
|
|
@@ -17,14 +19,14 @@ import textwrap
|
|
|
17
19
|
import threading
|
|
18
20
|
import time
|
|
19
21
|
import typing
|
|
20
|
-
from typing import (Any, Callable, Dict, Iterable, List, Optional,
|
|
21
|
-
Union)
|
|
22
|
+
from typing import (Any, Callable, Dict, Iterable, Iterator, List, Optional,
|
|
23
|
+
Set, Tuple, Union)
|
|
22
24
|
|
|
23
25
|
import colorama
|
|
24
|
-
import
|
|
26
|
+
import psutil
|
|
25
27
|
|
|
26
|
-
import sky
|
|
27
28
|
from sky import backends
|
|
29
|
+
from sky import catalog
|
|
28
30
|
from sky import check as sky_check
|
|
29
31
|
from sky import cloud_stores
|
|
30
32
|
from sky import clouds
|
|
@@ -37,10 +39,10 @@ from sky import resources as resources_lib
|
|
|
37
39
|
from sky import sky_logging
|
|
38
40
|
from sky import skypilot_config
|
|
39
41
|
from sky import task as task_lib
|
|
42
|
+
from sky.adaptors import common as adaptors_common
|
|
40
43
|
from sky.backends import backend_utils
|
|
41
44
|
from sky.backends import wheel_utils
|
|
42
45
|
from sky.clouds import cloud as sky_cloud
|
|
43
|
-
from sky.clouds import service_catalog
|
|
44
46
|
from sky.clouds.utils import gcp_utils
|
|
45
47
|
from sky.data import data_utils
|
|
46
48
|
from sky.data import storage as storage_lib
|
|
@@ -48,7 +50,9 @@ from sky.provision import common as provision_common
|
|
|
48
50
|
from sky.provision import instance_setup
|
|
49
51
|
from sky.provision import metadata_utils
|
|
50
52
|
from sky.provision import provisioner
|
|
53
|
+
from sky.provision.kubernetes import config as config_lib
|
|
51
54
|
from sky.provision.kubernetes import utils as kubernetes_utils
|
|
55
|
+
from sky.serve import constants as serve_constants
|
|
52
56
|
from sky.server.requests import requests as requests_lib
|
|
53
57
|
from sky.skylet import autostop_lib
|
|
54
58
|
from sky.skylet import constants
|
|
@@ -61,8 +65,12 @@ from sky.utils import cluster_utils
|
|
|
61
65
|
from sky.utils import command_runner
|
|
62
66
|
from sky.utils import common
|
|
63
67
|
from sky.utils import common_utils
|
|
68
|
+
from sky.utils import context_utils
|
|
64
69
|
from sky.utils import controller_utils
|
|
70
|
+
from sky.utils import directory_utils
|
|
65
71
|
from sky.utils import env_options
|
|
72
|
+
from sky.utils import lock_events
|
|
73
|
+
from sky.utils import locks
|
|
66
74
|
from sky.utils import log_utils
|
|
67
75
|
from sky.utils import message_utils
|
|
68
76
|
from sky.utils import registry
|
|
@@ -72,14 +80,51 @@ from sky.utils import status_lib
|
|
|
72
80
|
from sky.utils import subprocess_utils
|
|
73
81
|
from sky.utils import timeline
|
|
74
82
|
from sky.utils import ux_utils
|
|
83
|
+
from sky.utils import volume as volume_lib
|
|
84
|
+
from sky.utils import yaml_utils
|
|
75
85
|
|
|
76
86
|
if typing.TYPE_CHECKING:
|
|
87
|
+
import grpc
|
|
88
|
+
|
|
77
89
|
from sky import dag
|
|
90
|
+
from sky.schemas.generated import autostopv1_pb2
|
|
91
|
+
from sky.schemas.generated import autostopv1_pb2_grpc
|
|
92
|
+
from sky.schemas.generated import jobsv1_pb2
|
|
93
|
+
from sky.schemas.generated import jobsv1_pb2_grpc
|
|
94
|
+
from sky.schemas.generated import managed_jobsv1_pb2
|
|
95
|
+
from sky.schemas.generated import managed_jobsv1_pb2_grpc
|
|
96
|
+
from sky.schemas.generated import servev1_pb2
|
|
97
|
+
from sky.schemas.generated import servev1_pb2_grpc
|
|
98
|
+
else:
|
|
99
|
+
# To avoid requiring grpcio to be installed on the client side.
|
|
100
|
+
grpc = adaptors_common.LazyImport(
|
|
101
|
+
'grpc',
|
|
102
|
+
# https://github.com/grpc/grpc/issues/37642 to avoid spam in console
|
|
103
|
+
set_loggers=lambda: os.environ.update({'GRPC_VERBOSITY': 'NONE'})
|
|
104
|
+
if not env_options.Options.SHOW_DEBUG_INFO.get() else None)
|
|
105
|
+
autostopv1_pb2 = adaptors_common.LazyImport(
|
|
106
|
+
'sky.schemas.generated.autostopv1_pb2')
|
|
107
|
+
autostopv1_pb2_grpc = adaptors_common.LazyImport(
|
|
108
|
+
'sky.schemas.generated.autostopv1_pb2_grpc')
|
|
109
|
+
jobsv1_pb2 = adaptors_common.LazyImport('sky.schemas.generated.jobsv1_pb2')
|
|
110
|
+
jobsv1_pb2_grpc = adaptors_common.LazyImport(
|
|
111
|
+
'sky.schemas.generated.jobsv1_pb2_grpc')
|
|
112
|
+
servev1_pb2 = adaptors_common.LazyImport(
|
|
113
|
+
'sky.schemas.generated.servev1_pb2')
|
|
114
|
+
servev1_pb2_grpc = adaptors_common.LazyImport(
|
|
115
|
+
'sky.schemas.generated.servev1_pb2_grpc')
|
|
116
|
+
managed_jobsv1_pb2 = adaptors_common.LazyImport(
|
|
117
|
+
'sky.schemas.generated.managed_jobsv1_pb2')
|
|
118
|
+
managed_jobsv1_pb2_grpc = adaptors_common.LazyImport(
|
|
119
|
+
'sky.schemas.generated.managed_jobsv1_pb2_grpc')
|
|
78
120
|
|
|
79
121
|
Path = str
|
|
80
122
|
|
|
81
123
|
SKY_REMOTE_APP_DIR = backend_utils.SKY_REMOTE_APP_DIR
|
|
82
124
|
SKY_REMOTE_WORKDIR = constants.SKY_REMOTE_WORKDIR
|
|
125
|
+
# Unset RAY_RAYLET_PID to prevent the Ray cluster in the SkyPilot runtime
|
|
126
|
+
# from interfering with the Ray cluster in the user's task (if any).
|
|
127
|
+
UNSET_RAY_ENV_VARS = ['RAY_RAYLET_PID']
|
|
83
128
|
|
|
84
129
|
logger = sky_logging.init_logger(__name__)
|
|
85
130
|
|
|
@@ -96,6 +141,7 @@ _NODES_LAUNCHING_PROGRESS_TIMEOUT = {
|
|
|
96
141
|
clouds.OCI: 300,
|
|
97
142
|
clouds.Paperspace: 600,
|
|
98
143
|
clouds.Kubernetes: 300,
|
|
144
|
+
clouds.Shadeform: 300,
|
|
99
145
|
clouds.Vsphere: 240,
|
|
100
146
|
}
|
|
101
147
|
|
|
@@ -141,12 +187,13 @@ _MAX_RAY_UP_RETRY = 5
|
|
|
141
187
|
_MAX_GET_ZONE_RETRY = 3
|
|
142
188
|
|
|
143
189
|
_JOB_ID_PATTERN = re.compile(r'Job ID: ([0-9]+)')
|
|
190
|
+
_LOG_DIR_PATTERN = re.compile(r'Log Dir: ([^ ]+)')
|
|
144
191
|
|
|
145
192
|
# Path to the monkey-patched ray up script.
|
|
146
193
|
# We don't do import then __file__ because that script needs to be filled in
|
|
147
194
|
# (so import would fail).
|
|
148
195
|
_RAY_UP_WITH_MONKEY_PATCHED_HASH_LAUNCH_CONF_PATH = (
|
|
149
|
-
pathlib.Path(
|
|
196
|
+
pathlib.Path(directory_utils.get_sky_dir()) / 'backends' /
|
|
150
197
|
'monkey_patches' / 'monkey_patch_ray_up.py')
|
|
151
198
|
|
|
152
199
|
# The maximum size of a command line arguments is 128 KB, i.e. the command
|
|
@@ -161,10 +208,19 @@ _RAY_UP_WITH_MONKEY_PATCHED_HASH_LAUNCH_CONF_PATH = (
|
|
|
161
208
|
# We use 100KB as a threshold to be safe for other arguments that
|
|
162
209
|
# might be added during ssh.
|
|
163
210
|
_MAX_INLINE_SCRIPT_LENGTH = 100 * 1024
|
|
211
|
+
_EXCEPTION_MSG_AND_RETURNCODE_FOR_DUMP_INLINE_SCRIPT = [
|
|
212
|
+
('too long', 255),
|
|
213
|
+
('request-uri too large', 1),
|
|
214
|
+
('request header fields too large', 1),
|
|
215
|
+
('400 bad request', 1), # CloudFlare 400 error
|
|
216
|
+
]
|
|
164
217
|
|
|
165
218
|
_RESOURCES_UNAVAILABLE_LOG = (
|
|
166
219
|
'Reasons for provision failures (for details, please check the log above):')
|
|
167
220
|
|
|
221
|
+
# Number of seconds to wait locking the cluster before communicating with user.
|
|
222
|
+
_CLUSTER_LOCK_TIMEOUT = 5.0
|
|
223
|
+
|
|
168
224
|
|
|
169
225
|
def _is_command_length_over_limit(command: str) -> bool:
|
|
170
226
|
"""Check if the length of the command exceeds the limit.
|
|
@@ -178,6 +234,61 @@ def _is_command_length_over_limit(command: str) -> bool:
|
|
|
178
234
|
return quoted_length > _MAX_INLINE_SCRIPT_LENGTH
|
|
179
235
|
|
|
180
236
|
|
|
237
|
+
def _is_message_too_long(returncode: int,
|
|
238
|
+
output: Optional[str] = None,
|
|
239
|
+
file_path: Optional[str] = None) -> bool:
|
|
240
|
+
"""Check if the message sent to the remote is too long.
|
|
241
|
+
|
|
242
|
+
We use inline script to run the setup or run command, i.e. the script will
|
|
243
|
+
be part of the message sent to the remote cluster. There is a chance that
|
|
244
|
+
the command is too long, when people has very long run or setup commands, or
|
|
245
|
+
there is a cloudflare proxy in front of the remote blocking the long
|
|
246
|
+
message. Several common causes are:
|
|
247
|
+
- SSH returning: `too long` in the error message.
|
|
248
|
+
- Cloudflare proxy returning: `414 Request-URI Too Large` or
|
|
249
|
+
`431 Request Header Fields Too Large` error.
|
|
250
|
+
|
|
251
|
+
We use a general length limit check before but it could be inaccurate on
|
|
252
|
+
some systems, e.g. cloudflare proxy, so this is necessary.
|
|
253
|
+
|
|
254
|
+
Args:
|
|
255
|
+
returncode: The return code of the setup command.
|
|
256
|
+
output: The output of the setup command.
|
|
257
|
+
file_path: The path to the setup log file.
|
|
258
|
+
"""
|
|
259
|
+
assert (output is None) != (file_path is None), (
|
|
260
|
+
'Either output or file_path must be provided.', output, file_path)
|
|
261
|
+
to_check = []
|
|
262
|
+
for (match_str,
|
|
263
|
+
desired_rc) in _EXCEPTION_MSG_AND_RETURNCODE_FOR_DUMP_INLINE_SCRIPT:
|
|
264
|
+
if desired_rc == returncode:
|
|
265
|
+
to_check.append(match_str)
|
|
266
|
+
if not to_check:
|
|
267
|
+
return False
|
|
268
|
+
|
|
269
|
+
def _check_output_for_match_str(output: str) -> bool:
|
|
270
|
+
for match_str in to_check:
|
|
271
|
+
if match_str.lower() in output.lower():
|
|
272
|
+
return True
|
|
273
|
+
return False
|
|
274
|
+
|
|
275
|
+
if file_path is not None:
|
|
276
|
+
try:
|
|
277
|
+
with open(os.path.expanduser(file_path), 'r',
|
|
278
|
+
encoding='utf-8') as f:
|
|
279
|
+
content = f.read()
|
|
280
|
+
return _check_output_for_match_str(content)
|
|
281
|
+
except Exception as e: # pylint: disable=broad-except
|
|
282
|
+
# We don't crash the setup if we cannot read the log file.
|
|
283
|
+
# Instead, we should retry the setup with dumping the script
|
|
284
|
+
# to a file to be safe.
|
|
285
|
+
logger.debug(f'Failed to read setup log file {file_path}: {e}')
|
|
286
|
+
return True
|
|
287
|
+
else:
|
|
288
|
+
assert output is not None, (output, file_path)
|
|
289
|
+
return _check_output_for_match_str(output)
|
|
290
|
+
|
|
291
|
+
|
|
181
292
|
def _get_cluster_config_template(cloud):
|
|
182
293
|
cloud_to_template = {
|
|
183
294
|
clouds.AWS: 'aws-ray.yml.j2',
|
|
@@ -189,13 +300,18 @@ def _get_cluster_config_template(cloud):
|
|
|
189
300
|
clouds.SCP: 'scp-ray.yml.j2',
|
|
190
301
|
clouds.OCI: 'oci-ray.yml.j2',
|
|
191
302
|
clouds.Paperspace: 'paperspace-ray.yml.j2',
|
|
303
|
+
clouds.PrimeIntellect: 'primeintellect-ray.yml.j2',
|
|
192
304
|
clouds.DO: 'do-ray.yml.j2',
|
|
193
305
|
clouds.RunPod: 'runpod-ray.yml.j2',
|
|
194
306
|
clouds.Kubernetes: 'kubernetes-ray.yml.j2',
|
|
307
|
+
clouds.SSH: 'kubernetes-ray.yml.j2',
|
|
308
|
+
clouds.Shadeform: 'shadeform-ray.yml.j2',
|
|
195
309
|
clouds.Vsphere: 'vsphere-ray.yml.j2',
|
|
196
310
|
clouds.Vast: 'vast-ray.yml.j2',
|
|
197
311
|
clouds.Fluidstack: 'fluidstack-ray.yml.j2',
|
|
198
|
-
clouds.Nebius: 'nebius-ray.yml.j2'
|
|
312
|
+
clouds.Nebius: 'nebius-ray.yml.j2',
|
|
313
|
+
clouds.Hyperbolic: 'hyperbolic-ray.yml.j2',
|
|
314
|
+
clouds.Seeweb: 'seeweb-ray.yml.j2'
|
|
199
315
|
}
|
|
200
316
|
return cloud_to_template[type(cloud)]
|
|
201
317
|
|
|
@@ -274,6 +390,7 @@ class RayCodeGen:
|
|
|
274
390
|
ray_address = 'auto'
|
|
275
391
|
self._code = [
|
|
276
392
|
textwrap.dedent(f"""\
|
|
393
|
+
import functools
|
|
277
394
|
import getpass
|
|
278
395
|
import hashlib
|
|
279
396
|
import io
|
|
@@ -306,6 +423,8 @@ class RayCodeGen:
|
|
|
306
423
|
|
|
307
424
|
SKY_REMOTE_WORKDIR = {constants.SKY_REMOTE_WORKDIR!r}
|
|
308
425
|
|
|
426
|
+
CANCELLED_RETURN_CODE = 137
|
|
427
|
+
|
|
309
428
|
kwargs = dict()
|
|
310
429
|
# Only set the `_temp_dir` to SkyPilot's ray cluster directory when
|
|
311
430
|
# the directory exists for backward compatibility for the VM
|
|
@@ -321,8 +440,10 @@ class RayCodeGen:
|
|
|
321
440
|
def get_or_fail(futures, pg) -> List[int]:
|
|
322
441
|
\"\"\"Wait for tasks, if any fails, cancel all unready.\"\"\"
|
|
323
442
|
if not futures:
|
|
324
|
-
return []
|
|
443
|
+
return [], []
|
|
325
444
|
returncodes = [1] * len(futures)
|
|
445
|
+
pids = [None] * len(futures)
|
|
446
|
+
failed = False
|
|
326
447
|
# Wait for 1 task to be ready.
|
|
327
448
|
ready = []
|
|
328
449
|
# Keep invoking ray.wait if ready is empty. This is because
|
|
@@ -331,12 +452,22 @@ class RayCodeGen:
|
|
|
331
452
|
# before becoming ready.
|
|
332
453
|
# (Such tasks are common in serving jobs.)
|
|
333
454
|
# Reference: https://github.com/ray-project/ray/blob/ray-2.9.3/python/ray/_private/worker.py#L2845-L2846
|
|
455
|
+
|
|
456
|
+
def handle_ready_tasks(tasks: List[ray.ObjectRef]) -> None:
|
|
457
|
+
nonlocal returncodes, pids, failed
|
|
458
|
+
for task in tasks:
|
|
459
|
+
idx = futures.index(task)
|
|
460
|
+
res = ray.get(task)
|
|
461
|
+
returncodes[idx] = res['return_code']
|
|
462
|
+
pids[idx] = res['pid']
|
|
463
|
+
if res['return_code'] != 0:
|
|
464
|
+
failed = True
|
|
465
|
+
|
|
334
466
|
while not ready:
|
|
335
467
|
ready, unready = ray.wait(futures)
|
|
336
|
-
|
|
337
|
-
returncodes[idx] = ray.get(ready[0])
|
|
468
|
+
handle_ready_tasks(ready)
|
|
338
469
|
while unready:
|
|
339
|
-
if
|
|
470
|
+
if failed:
|
|
340
471
|
for task in unready:
|
|
341
472
|
# ray.cancel without force fails to kill tasks.
|
|
342
473
|
# We use force=True to kill unready tasks.
|
|
@@ -344,17 +475,16 @@ class RayCodeGen:
|
|
|
344
475
|
# Use SIGKILL=128+9 to indicate the task is forcely
|
|
345
476
|
# killed.
|
|
346
477
|
idx = futures.index(task)
|
|
347
|
-
returncodes[idx] =
|
|
478
|
+
returncodes[idx] = CANCELLED_RETURN_CODE
|
|
348
479
|
break
|
|
349
480
|
ready, unready = ray.wait(unready)
|
|
350
|
-
|
|
351
|
-
returncodes[idx] = ray.get(ready[0])
|
|
481
|
+
handle_ready_tasks(ready)
|
|
352
482
|
# Remove the placement group after all tasks are done, so that
|
|
353
483
|
# the next job can be scheduled on the released resources
|
|
354
484
|
# immediately.
|
|
355
485
|
ray_util.remove_placement_group(pg)
|
|
356
486
|
sys.stdout.flush()
|
|
357
|
-
return returncodes
|
|
487
|
+
return returncodes, pids
|
|
358
488
|
|
|
359
489
|
run_fn = None
|
|
360
490
|
futures = []
|
|
@@ -363,13 +493,17 @@ class RayCodeGen:
|
|
|
363
493
|
# by ray.remote. This should be removed once we have a better way to
|
|
364
494
|
# specify dependencies for ray.
|
|
365
495
|
inspect.getsource(log_lib._ProcessingArgs), # pylint: disable=protected-access
|
|
496
|
+
inspect.getsource(log_lib._get_context), # pylint: disable=protected-access
|
|
366
497
|
inspect.getsource(log_lib._handle_io_stream), # pylint: disable=protected-access
|
|
367
498
|
inspect.getsource(log_lib.process_subprocess_stream),
|
|
368
499
|
inspect.getsource(log_lib.run_with_log),
|
|
369
500
|
inspect.getsource(log_lib.make_task_bash_script),
|
|
370
501
|
inspect.getsource(log_lib.add_ray_env_vars),
|
|
371
502
|
inspect.getsource(log_lib.run_bash_command_with_log),
|
|
372
|
-
|
|
503
|
+
inspect.getsource(log_lib.run_bash_command_with_log_and_return_pid),
|
|
504
|
+
'run_bash_command_with_log = run_bash_command_with_log',
|
|
505
|
+
'run_bash_command_with_log_and_return_pid = \
|
|
506
|
+
ray.remote(run_bash_command_with_log_and_return_pid)',
|
|
373
507
|
]
|
|
374
508
|
# Currently, the codegen program is/can only be submitted to the head
|
|
375
509
|
# node, due to using job_lib for updating job statuses, and using
|
|
@@ -471,10 +605,14 @@ class RayCodeGen:
|
|
|
471
605
|
# skip the scheduling step.
|
|
472
606
|
job_lib.scheduler.schedule_step()
|
|
473
607
|
|
|
474
|
-
|
|
608
|
+
# If some nodes are down and then new nodes are added after launching again,
|
|
609
|
+
# the result of `ray.nodes()` will include all the nodes, so we need to get
|
|
610
|
+
# the alive nodes.
|
|
611
|
+
alive_nodes = [n for n in ray.nodes() if 'Alive' in n and n['Alive']]
|
|
612
|
+
total_num_nodes = len(alive_nodes)
|
|
475
613
|
setup_bundles = [{{"CPU": _SETUP_CPUS}} for _ in range(total_num_nodes)]
|
|
476
614
|
setup_pg = ray.util.placement_group(setup_bundles, strategy='STRICT_SPREAD')
|
|
477
|
-
setup_workers = [
|
|
615
|
+
setup_workers = [run_bash_command_with_log_and_return_pid \\
|
|
478
616
|
.options(
|
|
479
617
|
name='setup',
|
|
480
618
|
num_cpus=_SETUP_CPUS,
|
|
@@ -489,15 +627,25 @@ class RayCodeGen:
|
|
|
489
627
|
stream_logs=True,
|
|
490
628
|
with_ray=True,
|
|
491
629
|
) for i in range(total_num_nodes)]
|
|
492
|
-
setup_returncodes = get_or_fail(setup_workers, setup_pg)
|
|
493
|
-
|
|
630
|
+
setup_returncodes, setup_pids = get_or_fail(setup_workers, setup_pg)
|
|
631
|
+
success = True
|
|
632
|
+
failed_workers_and_returncodes = []
|
|
633
|
+
for i in range(len(setup_returncodes)):
|
|
634
|
+
returncode = setup_returncodes[i]
|
|
635
|
+
pid = setup_pids[i]
|
|
636
|
+
if pid == None:
|
|
637
|
+
pid = os.getpid()
|
|
638
|
+
if returncode != 0 and returncode != CANCELLED_RETURN_CODE:
|
|
639
|
+
success = False
|
|
640
|
+
failed_workers_and_returncodes.append((pid, returncode))
|
|
641
|
+
if not success:
|
|
642
|
+
msg = f'ERROR: {colorama.Fore.RED}Job {self.job_id}\\'s setup failed. '
|
|
643
|
+
msg += f'Failed workers: ' + ', '.join([f'(pid={{pid}}, returncode={{returncode}})' for pid, returncode in failed_workers_and_returncodes])
|
|
644
|
+
msg += f'. See error logs above for more details.{colorama.Style.RESET_ALL}'
|
|
645
|
+
print(msg, flush=True)
|
|
494
646
|
job_lib.set_status({self.job_id!r}, job_lib.JobStatus.FAILED_SETUP)
|
|
495
647
|
# This waits for all streaming logs to finish.
|
|
496
648
|
time.sleep(1)
|
|
497
|
-
print('ERROR: {colorama.Fore.RED}Job {self.job_id}\\'s setup failed with '
|
|
498
|
-
'return code list:{colorama.Style.RESET_ALL}',
|
|
499
|
-
setup_returncodes,
|
|
500
|
-
flush=True)
|
|
501
649
|
# Need this to set the job status in ray job to be FAILED.
|
|
502
650
|
sys.exit(1)
|
|
503
651
|
""")
|
|
@@ -614,7 +762,12 @@ class RayCodeGen:
|
|
|
614
762
|
# CACHED_MOUNT mode is uploaded to remote.
|
|
615
763
|
rclone_flush_script = textwrap.dedent(f"""\
|
|
616
764
|
|
|
617
|
-
|
|
765
|
+
# Only waits if cached mount is enabled (RCLONE_MOUNT_CACHED_LOG_DIR is not empty)
|
|
766
|
+
# findmnt alone is not enough, as some clouds (e.g. AWS on ARM64) uses
|
|
767
|
+
# rclone for normal mounts as well.
|
|
768
|
+
if [ $(findmnt -t fuse.rclone --noheading | wc -l) -gt 0 ] && \
|
|
769
|
+
[ -d {constants.RCLONE_MOUNT_CACHED_LOG_DIR} ] && \
|
|
770
|
+
[ "$(ls -A {constants.RCLONE_MOUNT_CACHED_LOG_DIR})" ]; then
|
|
618
771
|
flushed=0
|
|
619
772
|
# extra second on top of --vfs-cache-poll-interval to
|
|
620
773
|
# avoid race condition between rclone log line creation and this check.
|
|
@@ -623,7 +776,7 @@ class RayCodeGen:
|
|
|
623
776
|
# sleep for the same interval as --vfs-cache-poll-interval
|
|
624
777
|
sleep {constants.RCLONE_CACHE_REFRESH_INTERVAL}
|
|
625
778
|
flushed=1
|
|
626
|
-
for file in {constants.
|
|
779
|
+
for file in {constants.RCLONE_MOUNT_CACHED_LOG_DIR}/*; do
|
|
627
780
|
exitcode=0
|
|
628
781
|
tac $file | grep "vfs cache: cleaned:" -m 1 | grep "in use 0, to upload 0, uploading 0" -q || exitcode=$?
|
|
629
782
|
if [ $exitcode -ne 0 ]; then
|
|
@@ -635,6 +788,8 @@ class RayCodeGen:
|
|
|
635
788
|
done
|
|
636
789
|
echo "skypilot: cached mount uploaded complete"
|
|
637
790
|
fi""")
|
|
791
|
+
unset_ray_env_vars = ' && '.join(
|
|
792
|
+
[f'unset {var}' for var in UNSET_RAY_ENV_VARS])
|
|
638
793
|
self._code += [
|
|
639
794
|
sky_env_vars_dict_str,
|
|
640
795
|
textwrap.dedent(f"""\
|
|
@@ -644,6 +799,7 @@ class RayCodeGen:
|
|
|
644
799
|
script = run_fn({gang_scheduling_id}, gang_scheduling_id_to_ip)
|
|
645
800
|
|
|
646
801
|
if script is not None:
|
|
802
|
+
script=f'{unset_ray_env_vars}; {{script}}'
|
|
647
803
|
script += rclone_flush_script
|
|
648
804
|
sky_env_vars_dict['{constants.SKYPILOT_NUM_GPUS_PER_NODE}'] = {int(math.ceil(num_gpus))!r}
|
|
649
805
|
|
|
@@ -665,7 +821,7 @@ class RayCodeGen:
|
|
|
665
821
|
|
|
666
822
|
sky_env_vars_dict['SKYPILOT_INTERNAL_JOB_ID'] = {self.job_id}
|
|
667
823
|
|
|
668
|
-
futures.append(
|
|
824
|
+
futures.append(run_bash_command_with_log_and_return_pid \\
|
|
669
825
|
.options(name=name_str, {options_str}) \\
|
|
670
826
|
.remote(
|
|
671
827
|
script,
|
|
@@ -684,7 +840,7 @@ class RayCodeGen:
|
|
|
684
840
|
|
|
685
841
|
self._code += [
|
|
686
842
|
textwrap.dedent(f"""\
|
|
687
|
-
returncodes = get_or_fail(futures, pg)
|
|
843
|
+
returncodes, _ = get_or_fail(futures, pg)
|
|
688
844
|
if sum(returncodes) != 0:
|
|
689
845
|
job_lib.set_status({self.job_id!r}, job_lib.JobStatus.FAILED)
|
|
690
846
|
# Schedule the next pending job immediately to make the job
|
|
@@ -696,6 +852,10 @@ class RayCodeGen:
|
|
|
696
852
|
# 139 is the return code of SIGSEGV, i.e. Segmentation Fault.
|
|
697
853
|
if any(r == 139 for r in returncodes):
|
|
698
854
|
reason = '(likely due to Segmentation Fault)'
|
|
855
|
+
if any(r == 137 for r in returncodes):
|
|
856
|
+
# Find the first non-137 return code
|
|
857
|
+
non_137 = next(r for r in returncodes if r != 137)
|
|
858
|
+
reason = f'(A Worker failed with return code {{non_137}}, SkyPilot cleaned up the processes on other nodes with return code 137)'
|
|
699
859
|
print('ERROR: {colorama.Fore.RED}Job {self.job_id} failed with '
|
|
700
860
|
'return code list:{colorama.Style.RESET_ALL}',
|
|
701
861
|
returncodes,
|
|
@@ -778,34 +938,6 @@ class FailoverCloudErrorHandlerV1:
|
|
|
778
938
|
setattr(e, 'detailed_reason', detailed_reason)
|
|
779
939
|
raise e
|
|
780
940
|
|
|
781
|
-
@staticmethod
|
|
782
|
-
def _scp_handler(blocked_resources: Set['resources_lib.Resources'],
|
|
783
|
-
launchable_resources: 'resources_lib.Resources',
|
|
784
|
-
region: 'clouds.Region',
|
|
785
|
-
zones: Optional[List['clouds.Zone']], stdout: str,
|
|
786
|
-
stderr: str):
|
|
787
|
-
del zones # Unused.
|
|
788
|
-
errors = FailoverCloudErrorHandlerV1._handle_errors(
|
|
789
|
-
stdout,
|
|
790
|
-
stderr,
|
|
791
|
-
is_error_str_known=lambda x: 'SCPError:' in x.strip())
|
|
792
|
-
|
|
793
|
-
logger.warning(f'Got error(s) in {region.name}:')
|
|
794
|
-
messages = '\n\t'.join(errors)
|
|
795
|
-
style = colorama.Style
|
|
796
|
-
logger.warning(f'{style.DIM}\t{messages}{style.RESET_ALL}')
|
|
797
|
-
_add_to_blocked_resources(blocked_resources,
|
|
798
|
-
launchable_resources.copy(zone=None))
|
|
799
|
-
|
|
800
|
-
# Sometimes, SCPError will list available regions.
|
|
801
|
-
for e in errors:
|
|
802
|
-
if e.find('Regions with capacity available:') != -1:
|
|
803
|
-
for r in service_catalog.regions('scp'):
|
|
804
|
-
if e.find(r.name) == -1:
|
|
805
|
-
_add_to_blocked_resources(
|
|
806
|
-
blocked_resources,
|
|
807
|
-
launchable_resources.copy(region=r.name, zone=None))
|
|
808
|
-
|
|
809
941
|
@staticmethod
|
|
810
942
|
def _ibm_handler(blocked_resources: Set['resources_lib.Resources'],
|
|
811
943
|
launchable_resources: 'resources_lib.Resources',
|
|
@@ -1085,7 +1217,7 @@ class FailoverCloudErrorHandlerV2:
|
|
|
1085
1217
|
output = str(error)
|
|
1086
1218
|
# Sometimes, lambda cloud error will list available regions.
|
|
1087
1219
|
if output.find('Regions with capacity available:') != -1:
|
|
1088
|
-
for r in
|
|
1220
|
+
for r in catalog.regions('lambda'):
|
|
1089
1221
|
if output.find(r.name) == -1:
|
|
1090
1222
|
_add_to_blocked_resources(
|
|
1091
1223
|
blocked_resources,
|
|
@@ -1109,6 +1241,21 @@ class FailoverCloudErrorHandlerV2:
|
|
|
1109
1241
|
FailoverCloudErrorHandlerV2._default_handler(
|
|
1110
1242
|
blocked_resources, launchable_resources, region, zones, error)
|
|
1111
1243
|
|
|
1244
|
+
@staticmethod
|
|
1245
|
+
def _scp_handler(blocked_resources: Set['resources_lib.Resources'],
|
|
1246
|
+
launchable_resources: 'resources_lib.Resources',
|
|
1247
|
+
region: 'clouds.Region',
|
|
1248
|
+
zones: Optional[List['clouds.Zone']],
|
|
1249
|
+
error: Exception) -> None:
|
|
1250
|
+
logger.info(f'SCP handler error: {error}')
|
|
1251
|
+
# Block SCP if the credential has expired.
|
|
1252
|
+
if isinstance(error, exceptions.InvalidCloudCredentials):
|
|
1253
|
+
_add_to_blocked_resources(
|
|
1254
|
+
blocked_resources, resources_lib.Resources(cloud=clouds.SCP()))
|
|
1255
|
+
else:
|
|
1256
|
+
FailoverCloudErrorHandlerV2._default_handler(
|
|
1257
|
+
blocked_resources, launchable_resources, region, zones, error)
|
|
1258
|
+
|
|
1112
1259
|
@staticmethod
|
|
1113
1260
|
def _default_handler(blocked_resources: Set['resources_lib.Resources'],
|
|
1114
1261
|
launchable_resources: 'resources_lib.Resources',
|
|
@@ -1176,7 +1323,8 @@ class RetryingVmProvisioner(object):
|
|
|
1176
1323
|
local_wheel_path: pathlib.Path,
|
|
1177
1324
|
wheel_hash: str,
|
|
1178
1325
|
blocked_resources: Optional[Iterable[
|
|
1179
|
-
resources_lib.Resources]] = None
|
|
1326
|
+
resources_lib.Resources]] = None,
|
|
1327
|
+
is_managed: Optional[bool] = None):
|
|
1180
1328
|
self._blocked_resources: Set[resources_lib.Resources] = set()
|
|
1181
1329
|
if blocked_resources:
|
|
1182
1330
|
# blocked_resources is not None and not empty.
|
|
@@ -1188,6 +1336,7 @@ class RetryingVmProvisioner(object):
|
|
|
1188
1336
|
self._requested_features = requested_features
|
|
1189
1337
|
self._local_wheel_path = local_wheel_path
|
|
1190
1338
|
self._wheel_hash = wheel_hash
|
|
1339
|
+
self._is_managed = is_managed
|
|
1191
1340
|
|
|
1192
1341
|
def _yield_zones(
|
|
1193
1342
|
self, to_provision: resources_lib.Resources, num_nodes: int,
|
|
@@ -1232,7 +1381,8 @@ class RetryingVmProvisioner(object):
|
|
|
1232
1381
|
assert isinstance(handle, CloudVmRayResourceHandle), (
|
|
1233
1382
|
'handle should be CloudVmRayResourceHandle (found: '
|
|
1234
1383
|
f'{type(handle)}) {cluster_name!r}')
|
|
1235
|
-
config =
|
|
1384
|
+
config = global_user_state.get_cluster_yaml_dict(
|
|
1385
|
+
handle.cluster_yaml)
|
|
1236
1386
|
# This is for the case when the zone field is not set in the
|
|
1237
1387
|
# launched resources in a previous launch (e.g., ctrl-c during
|
|
1238
1388
|
# launch and multi-node cluster before PR #1700).
|
|
@@ -1316,6 +1466,34 @@ class RetryingVmProvisioner(object):
|
|
|
1316
1466
|
zones = [clouds.Zone(name=to_provision.zone)]
|
|
1317
1467
|
yield zones
|
|
1318
1468
|
|
|
1469
|
+
def _insufficient_resources_msg(
|
|
1470
|
+
self,
|
|
1471
|
+
to_provision: resources_lib.Resources,
|
|
1472
|
+
requested_resources: Set[resources_lib.Resources],
|
|
1473
|
+
insufficient_resources: Optional[List[str]],
|
|
1474
|
+
) -> str:
|
|
1475
|
+
insufficent_resource_msg = ('' if insufficient_resources is None else
|
|
1476
|
+
f' ({", ".join(insufficient_resources)})')
|
|
1477
|
+
message = f'Failed to acquire resources{insufficent_resource_msg} '
|
|
1478
|
+
if to_provision.zone is not None:
|
|
1479
|
+
message += (f'in {to_provision.zone} for {requested_resources}. ')
|
|
1480
|
+
elif to_provision.region is not None and to_provision.cloud is not None:
|
|
1481
|
+
# For public clouds, provision.region is always set.
|
|
1482
|
+
if clouds.SSH().is_same_cloud(to_provision.cloud):
|
|
1483
|
+
message += (
|
|
1484
|
+
f'in SSH Node Pool ({to_provision.region.lstrip("ssh-")}) '
|
|
1485
|
+
f'for {requested_resources}. The SSH Node Pool may not '
|
|
1486
|
+
'have enough resources.')
|
|
1487
|
+
elif clouds.Kubernetes().is_same_cloud(to_provision.cloud):
|
|
1488
|
+
message += (f'in context {to_provision.region} for '
|
|
1489
|
+
f'{requested_resources}. ')
|
|
1490
|
+
else:
|
|
1491
|
+
message += (f'in all zones in {to_provision.region} for '
|
|
1492
|
+
f'{requested_resources}. ')
|
|
1493
|
+
else:
|
|
1494
|
+
message += (f'{to_provision.cloud} for {requested_resources}. ')
|
|
1495
|
+
return message
|
|
1496
|
+
|
|
1319
1497
|
def _retry_zones(
|
|
1320
1498
|
self,
|
|
1321
1499
|
to_provision: resources_lib.Resources,
|
|
@@ -1329,6 +1507,7 @@ class RetryingVmProvisioner(object):
|
|
|
1329
1507
|
prev_handle: Optional['CloudVmRayResourceHandle'],
|
|
1330
1508
|
prev_cluster_ever_up: bool,
|
|
1331
1509
|
skip_if_config_hash_matches: Optional[str],
|
|
1510
|
+
volume_mounts: Optional[List[volume_lib.VolumeMount]],
|
|
1332
1511
|
) -> Dict[str, Any]:
|
|
1333
1512
|
"""The provision retry loop.
|
|
1334
1513
|
|
|
@@ -1349,12 +1528,17 @@ class RetryingVmProvisioner(object):
|
|
|
1349
1528
|
if not dryrun:
|
|
1350
1529
|
os.makedirs(os.path.expanduser(self.log_dir), exist_ok=True)
|
|
1351
1530
|
os.system(f'touch {log_path}')
|
|
1531
|
+
|
|
1352
1532
|
rich_utils.force_update_status(
|
|
1353
|
-
ux_utils.spinner_message('Launching',
|
|
1533
|
+
ux_utils.spinner_message('Launching',
|
|
1534
|
+
log_path,
|
|
1535
|
+
cluster_name=cluster_name))
|
|
1354
1536
|
|
|
1355
1537
|
# Get previous cluster status
|
|
1356
1538
|
cluster_exists = prev_cluster_status is not None
|
|
1357
1539
|
|
|
1540
|
+
to_provision = to_provision.assert_launchable()
|
|
1541
|
+
|
|
1358
1542
|
assert to_provision.region is not None, (
|
|
1359
1543
|
to_provision, 'region should have been set by the optimizer.')
|
|
1360
1544
|
region = clouds.Region(to_provision.region)
|
|
@@ -1388,6 +1572,7 @@ class RetryingVmProvisioner(object):
|
|
|
1388
1572
|
f'To request quotas, check the instruction: '
|
|
1389
1573
|
f'https://docs.skypilot.co/en/latest/cloud-setup/quota.html.')
|
|
1390
1574
|
|
|
1575
|
+
insufficient_resources = None
|
|
1391
1576
|
for zones in self._yield_zones(to_provision, num_nodes, cluster_name,
|
|
1392
1577
|
prev_cluster_status,
|
|
1393
1578
|
prev_cluster_ever_up):
|
|
@@ -1432,7 +1617,9 @@ class RetryingVmProvisioner(object):
|
|
|
1432
1617
|
region=region,
|
|
1433
1618
|
zones=zones,
|
|
1434
1619
|
dryrun=dryrun,
|
|
1435
|
-
keep_launch_fields_in_existing_config=cluster_exists
|
|
1620
|
+
keep_launch_fields_in_existing_config=cluster_exists,
|
|
1621
|
+
volume_mounts=volume_mounts,
|
|
1622
|
+
)
|
|
1436
1623
|
except exceptions.ResourcesUnavailableError as e:
|
|
1437
1624
|
# Failed due to catalog issue, e.g. image not found, or
|
|
1438
1625
|
# GPUs are requested in a Kubernetes cluster but the cluster
|
|
@@ -1515,8 +1702,17 @@ class RetryingVmProvisioner(object):
|
|
|
1515
1702
|
cluster_handle=handle,
|
|
1516
1703
|
requested_resources=requested_resources,
|
|
1517
1704
|
ready=False,
|
|
1705
|
+
is_managed=self._is_managed,
|
|
1706
|
+
provision_log_path=log_abs_path,
|
|
1518
1707
|
)
|
|
1519
1708
|
|
|
1709
|
+
# Add cluster event for actual provisioning start.
|
|
1710
|
+
global_user_state.add_cluster_event(
|
|
1711
|
+
cluster_name, status_lib.ClusterStatus.INIT,
|
|
1712
|
+
f'Provisioning on {to_provision.cloud.display_name()} ' +
|
|
1713
|
+
f'in {to_provision.region}',
|
|
1714
|
+
global_user_state.ClusterEventType.STATUS_CHANGE)
|
|
1715
|
+
|
|
1520
1716
|
global_user_state.set_owner_identity_for_cluster(
|
|
1521
1717
|
cluster_name, cloud_user_identity)
|
|
1522
1718
|
|
|
@@ -1543,11 +1739,13 @@ class RetryingVmProvisioner(object):
             controller_str = ('' if controller is None else
                               f' {controller.value.name}')
             if isinstance(to_provision.cloud, clouds.Kubernetes):
-
+                suffix = '.'
+                if region.name.startswith('ssh-'):
+                    suffix = f' ({region.name.lstrip("ssh-")})'
                 logger.info(
                     ux_utils.starting_message(
                         f'Launching{controller_str} on '
-                        f'{to_provision.cloud}
+                        f'{to_provision.cloud}{suffix}'))
             else:
                 logger.info(
                     ux_utils.starting_message(
@@ -1587,6 +1785,24 @@ class RetryingVmProvisioner(object):
                 # No teardown happens for this error.
                 with ux_utils.print_exception_no_traceback():
                     raise
+            except config_lib.KubernetesError as e:
+                if e.insufficent_resources:
+                    insufficient_resources = e.insufficent_resources
+                # NOTE: We try to cleanup the cluster even if the previous
+                # cluster does not exist. Also we are fast at
+                # cleaning up clusters now if there is no existing node.
+                CloudVmRayBackend().post_teardown_cleanup(
+                    handle,
+                    terminate=not prev_cluster_ever_up,
+                    remove_from_db=False,
+                    failover=True,
+                )
+                # TODO(suquark): other clouds may have different zone
+                # blocking strategy. See '_update_blocklist_on_error'
+                # for details.
+                FailoverCloudErrorHandlerV2.update_blocklist_on_error(
+                    self._blocked_resources, to_provision, region, zones, e)
+                continue
             except Exception as e:  # pylint: disable=broad-except
                 # NOTE: We try to cleanup the cluster even if the previous
                 # cluster does not exist. Also we are fast at
@@ -1594,7 +1810,8 @@ class RetryingVmProvisioner(object):
                 CloudVmRayBackend().post_teardown_cleanup(
                     handle,
                     terminate=not prev_cluster_ever_up,
-                    remove_from_db=False
+                    remove_from_db=False,
+                    failover=True)
                 # TODO(suquark): other clouds may have different zone
                 # blocking strategy. See '_update_blocklist_on_error'
                 # for details.
@@ -1650,7 +1867,9 @@ class RetryingVmProvisioner(object):
                 config_dict['handle'] = handle
                 logger.info(
                     ux_utils.finishing_message(
-                        f'Cluster launched: {cluster_name!r}.',
+                        f'Cluster launched: {cluster_name!r}.',
+                        log_path,
+                        cluster_name=cluster_name))
                 return config_dict

             # The cluster is not ready. We must perform error recording and/or
@@ -1714,17 +1933,9 @@ class RetryingVmProvisioner(object):
                 terminate=terminate_or_stop,
                 remove_from_db=False)

-
-
-
-                       f'{requested_resources}. ')
-        elif to_provision.region is not None:
-            # For public clouds, provision.region is always set.
-            message = ('Failed to acquire resources in all zones in '
-                       f'{to_provision.region} for {requested_resources}. ')
-        else:
-            message = (f'Failed to acquire resources in {to_provision.cloud} '
-                       f'for {requested_resources}. ')
+        message = self._insufficient_resources_msg(to_provision,
+                                                   requested_resources,
+                                                   insufficient_resources)
         # Do not failover to other locations if the cluster was ever up, since
         # the user can have some data on the cluster.
         raise exceptions.ResourcesUnavailableError(
@@ -1775,7 +1986,8 @@ class RetryingVmProvisioner(object):
             log_abs_path,
             stream_logs=False,
             start_streaming_at='Shared connection to',
-            line_processor=log_utils.RayUpLineProcessor(
+            line_processor=log_utils.RayUpLineProcessor(
+                log_abs_path, cluster_name=cluster_handle.cluster_name),
             # Reduce BOTO_MAX_RETRIES from 12 to 5 to avoid long hanging
             # time during 'ray up' if insufficient capacity occurs.
             env=dict(
@@ -1919,9 +2131,10 @@ class RetryingVmProvisioner(object):
         # ready to ensure cluster will not scale up after preemption (spot).
         # Skip for non-spot as this takes extra time to provision (~1min).
         if use_spot:
-            ray_config =
+            ray_config = global_user_state.get_cluster_yaml_dict(
+                cluster_config_file)
             ray_config['upscaling_speed'] = 0
-
+            yaml_utils.dump_yaml(cluster_config_file, ray_config)
         start = time.time()
         returncode, stdout, stderr = ray_up()
         logger.debug(
@@ -2030,6 +2243,7 @@ class RetryingVmProvisioner(object):
                     f' that never expire or a service account.\033[0m')
                 logger.warning(warnings)

+        to_provision = to_provision.assert_launchable()
         # Retrying launchable resources.
         while True:
             try:
@@ -2068,7 +2282,9 @@ class RetryingVmProvisioner(object):
                     prev_cluster_status=prev_cluster_status,
                     prev_handle=prev_handle,
                     prev_cluster_ever_up=prev_cluster_ever_up,
-                    skip_if_config_hash_matches=skip_if_config_hash_matches
+                    skip_if_config_hash_matches=skip_if_config_hash_matches,
+                    volume_mounts=task.volume_mounts,
+                )
                 if dryrun:
                     return config_dict
             except (exceptions.InvalidClusterNameError,
@@ -2115,8 +2331,6 @@ class RetryingVmProvisioner(object):
                 # terminated by _retry_zones().
                 assert (prev_cluster_status == status_lib.ClusterStatus.INIT
                         ), prev_cluster_status
-                assert global_user_state.get_handle_from_cluster_name(
-                    cluster_name) is None, cluster_name
                 logger.info(
                     ux_utils.retry_message(
                         f'Retrying provisioning with requested resources: '
@@ -2151,20 +2365,33 @@ class RetryingVmProvisioner(object):
                 # possible resources or the requested resources is too
                 # restrictive. If we reach here, our failover logic finally
                 # ends here.
-                table = log_utils.create_table(['
+                table = log_utils.create_table(['INFRA', 'RESOURCES', 'REASON'])
                 for (resource, exception) in resource_exceptions.items():
-                    table.add_row(
-
-
+                    table.add_row([
+                        resource.infra.formatted_str(),
+                        resources_utils.format_resource(
+                            resource, simplified_only=True)[0], exception
+                    ])
+                # Set the max width of REASON column to 80 to avoid the table
+                # being wrapped in a unreadable way.
+                # pylint: disable=protected-access
+                table._max_width = {'REASON': 80}
                 raise exceptions.ResourcesUnavailableError(
                     _RESOURCES_UNAVAILABLE_LOG + '\n' + table.get_string(),
                     failover_history=failover_history)
-
+            best_resources = task.best_resources
             assert task in self._dag.tasks, 'Internal logic error.'
-            assert
+            assert best_resources is not None, task
+            to_provision = best_resources
         return config_dict


+@dataclasses.dataclass
+class SSHTunnelInfo:
+    port: int
+    pid: int
+
+
 class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
     """A pickle-able handle to a cluster created by CloudVmRayBackend.

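Note: `log_utils.create_table` presumably wraps `prettytable`, and the REASON cap above relies on PrettyTable's private `_max_width` dict. A minimal standalone sketch of the same table, with invented row values:

```python
import prettytable

# Columns match the failover summary table built in the diff above.
table = prettytable.PrettyTable(['INFRA', 'RESOURCES', 'REASON'])
table.add_row(['AWS/us-east-1', '1x m6i.large', 'Insufficient capacity'])
# Same private-attribute trick as the diff: cap REASON at 80 characters so
# long error strings wrap instead of stretching the whole table.
table._max_width = {'REASON': 80}  # pylint: disable=protected-access
print(table.get_string())
```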
@@ -2184,10 +2411,11 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
     - (optional) Launched resources
     - (optional) Docker user name
     - (optional) If TPU(s) are managed, a path to a deletion script.
+    - (optional) Skylet SSH tunnel info.
     """
     # Bump if any fields get added/removed/changed, and add backward
-    #
-    _VERSION =
+    # compatibility logic in __setstate__ and/or __getstate__.
+    _VERSION = 12

     def __init__(
             self,
@@ -2220,6 +2448,7 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
         self.launched_nodes = launched_nodes
         self.launched_resources = launched_resources
         self.docker_user: Optional[str] = None
+        self.is_grpc_enabled = True

     def __repr__(self):
         return (f'ResourceHandle('
@@ -2235,17 +2464,21 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
                 f'\n\tlaunched_resources={self.launched_nodes}x '
                 f'{self.launched_resources}, '
                 f'\n\tdocker_user={self.docker_user},'
-                f'\n\tssh_user={self.ssh_user}'
+                f'\n\tssh_user={self.ssh_user},'
+                f'\n\tis_grpc_enabled={self.is_grpc_enabled},')

     def get_cluster_name(self):
         return self.cluster_name

+    def get_cluster_name_on_cloud(self):
+        return self.cluster_name_on_cloud
+
     def _use_internal_ips(self):
         """Returns whether to use internal IPs for SSH connections."""
         # Directly load the `use_internal_ips` flag from the cluster yaml
         # instead of `skypilot_config` as the latter can be changed after the
         # cluster is UP.
-        return
+        return global_user_state.get_cluster_yaml_dict(self.cluster_yaml).get(
             'provider', {}).get('use_internal_ips', False)

     def update_ssh_ports(self, max_attempts: int = 1) -> None:
@@ -2270,11 +2503,15 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
                 clouds.ProvisionerVersion.SKYPILOT):
             provider_name = str(self.launched_resources.cloud).lower()
             config = {}
-
-
-
-
-
+            # It is possible that the cluster yaml is not available when
+            # the handle is unpickled for service replicas from the
+            # controller with older version.
+            yaml_str = global_user_state.get_cluster_yaml_str(self.cluster_yaml)
+            if yaml_str is None:
+                # If the cluster yaml is not available,
+                # we skip updating the cluster info.
+                return
+            config = yaml_utils.safe_load(yaml_str)
             try:
                 cluster_info = provision_lib.get_cluster_info(
                     provider_name,
@@ -2410,12 +2647,23 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
             zip(cluster_internal_ips, cluster_feasible_ips))

         # Ensure head node is the first element, then sort based on the
-        # external IPs for stableness
-
-
+        # external IPs for stableness. Skip for k8s nodes since pods
+        # worker ids are already mapped.
+        if (cluster_info is not None and
+                cluster_info.provider_name == 'kubernetes'):
+            stable_internal_external_ips = internal_external_ips
+        else:
+            stable_internal_external_ips = [internal_external_ips[0]] + sorted(
+                internal_external_ips[1:], key=lambda x: x[1])
         self.stable_internal_external_ips = stable_internal_external_ips

-    @
+    @context_utils.cancellation_guard
+    # we expect different request to be acting on different clusters
+    # (= different handles) so we have no real expectation of cache hit
+    # across requests.
+    # Do not change this cache to global scope
+    # without understanding https://github.com/skypilot-org/skypilot/pull/6908
+    @annotations.lru_cache(scope='request', maxsize=10)
     @timeline.event
     def get_command_runners(self,
                             force_cached: bool = False,
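Note: the new ordering logic keeps Kubernetes pods in their already-mapped order and otherwise pins the head node first while sorting workers by external IP. A toy illustration of the non-Kubernetes branch, with invented IPs:

```python
# (internal_ip, external_ip) pairs; the head node is the first entry.
ips = [('10.0.0.3', '54.1.1.3'),  # head node: always kept first
       ('10.0.0.1', '54.1.1.9'),
       ('10.0.0.2', '54.1.1.2')]
# Head stays at index 0; workers are sorted by external IP for stability.
stable = [ips[0]] + sorted(ips[1:], key=lambda x: x[1])
assert stable == [('10.0.0.3', '54.1.1.3'),
                  ('10.0.0.2', '54.1.1.2'),
                  ('10.0.0.1', '54.1.1.9')]
```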
@@ -2426,19 +2674,21 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
             self.cluster_yaml, self.docker_user, self.ssh_user)
         if avoid_ssh_control:
             ssh_credentials.pop('ssh_control_name', None)
+
+        launched_resources = self.launched_resources.assert_launchable()
         updated_to_skypilot_provisioner_after_provisioned = (
-
+            launched_resources.cloud.PROVISIONER_VERSION >=
             clouds.ProvisionerVersion.SKYPILOT and
             self.cached_external_ips is not None and
             self.cached_cluster_info is None)
         if updated_to_skypilot_provisioner_after_provisioned:
             logger.debug(
-                f'{
+                f'{launched_resources.cloud} has been updated to the new '
                 f'provisioner after cluster {self.cluster_name} was '
                 f'provisioned. Cached IPs are used for connecting to the '
                 'cluster.')
         if (clouds.ProvisionerVersion.RAY_PROVISIONER_SKYPILOT_TERMINATOR >=
-
+                launched_resources.cloud.PROVISIONER_VERSION or
                 updated_to_skypilot_provisioner_after_provisioned):
             ip_list = (self.cached_external_ips
                        if force_cached else self.external_ips())
@@ -2464,6 +2714,21 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
                     'Tried to use cached cluster info, but it\'s missing for '
                     f'cluster "{self.cluster_name}"')
             self._update_cluster_info()
+        # For Kubernetes, `KubernetesCommandRunner` want to get the pod names
+        # to run the command. But for high availability serve controller,
+        # the controller pod is part of a deployment, and once the pod is
+        # killed and a new one is created, the pod name changes, so we need
+        # to manually update the cluster info here.
+        # TODO(andyl): See if we can prevent this refresh. Like pass in
+        # deployment name as identifier for KubernetesCommandRunner. Now this
+        # is required for rsync as using deployment in rsync seems to cause
+        # some unknown issues.
+        # TODO(andyl): Should check through the real cluster info. Same as
+        # the TODO in kubernetes/instance.py:terminate_instances
+        if (isinstance(self.launched_resources.cloud, clouds.Kubernetes) and
+                controller_utils.high_availability_specified(
+                    self.cluster_name)):
+            self._update_cluster_info()

         assert self.cached_cluster_info is not None, self
         runners = provision_lib.get_command_runners(
@@ -2532,6 +2797,162 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
             cluster_config_file)
         self.docker_user = docker_user

+    def _get_skylet_ssh_tunnel(self) -> Optional[SSHTunnelInfo]:
+        metadata = global_user_state.get_cluster_skylet_ssh_tunnel_metadata(
+            self.cluster_name)
+        if metadata is None:
+            return None
+        return SSHTunnelInfo(port=metadata[0], pid=metadata[1])
+
+    def _set_skylet_ssh_tunnel(self, tunnel: Optional[SSHTunnelInfo]) -> None:
+        global_user_state.set_cluster_skylet_ssh_tunnel_metadata(
+            self.cluster_name,
+            (tunnel.port, tunnel.pid) if tunnel is not None else None)
+
+    def close_skylet_ssh_tunnel(self) -> None:
+        """Terminate the SSH tunnel process and clear its metadata."""
+        tunnel = self._get_skylet_ssh_tunnel()
+        if tunnel is None:
+            return
+        logger.debug('Closing Skylet SSH tunnel for cluster %r on port %d',
+                     self.cluster_name, tunnel.port)
+        try:
+            self._terminate_ssh_tunnel_process(tunnel)
+        finally:
+            self._set_skylet_ssh_tunnel(None)
+
+    def get_grpc_channel(self) -> 'grpc.Channel':
+        grpc_options = [
+            # The task YAMLs can be large, so the default
+            # max_receive_message_length of 4MB might not be enough.
+            ('grpc.max_receive_message_length', -1),
+        ]
+        # It's fine to not grab the lock here, as we're only reading,
+        # and writes are very rare.
+        # It's acceptable to read while another process is opening a tunnel,
+        # because it will only happen on:
+        # 1. A new cluster who has no tunnel yet, or
+        # 2. A cluster with an unhealthy tunnel
+        # For (2), for processes that read the "stale" tunnel, it will fail
+        # and on the next retry, it will call get_grpc_channel again
+        # and get the new tunnel.
+        tunnel = self._get_skylet_ssh_tunnel()
+        if tunnel is not None:
+            try:
+                # Check if the tunnel is open.
+                with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
+                    s.settimeout(0.5)
+                    s.connect(('localhost', tunnel.port))
+                return grpc.insecure_channel(f'localhost:{tunnel.port}',
+                                             options=grpc_options)
+            except socket.error as e:
+                logger.debug(
+                    'Failed to connect to SSH tunnel for cluster '
+                    f'{self.cluster_name!r} on port {tunnel.port} ({e}), '
+                    'acquiring lock')
+                pass
+        lock_id = backend_utils.cluster_tunnel_lock_id(self.cluster_name)
+        lock_timeout = backend_utils.CLUSTER_TUNNEL_LOCK_TIMEOUT_SECONDS
+        lock = locks.get_lock(lock_id, lock_timeout)
+        try:
+            with lock.acquire(blocking=True):
+                # Re-read the tunnel from the DB.
+                tunnel = self._get_skylet_ssh_tunnel()
+                if tunnel is None:
+                    logger.debug('No SSH tunnel found for cluster '
+                                 f'{self.cluster_name!r}, '
+                                 'opening the tunnel')
+                    tunnel = self._open_and_update_skylet_tunnel()
+                    return grpc.insecure_channel(f'localhost:{tunnel.port}',
+                                                 options=grpc_options)
+                try:
+                    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
+                        s.settimeout(0.5)
+                        s.connect(('localhost', tunnel.port))
+                    return grpc.insecure_channel(f'localhost:{tunnel.port}',
+                                                 options=grpc_options)
+                except socket.error as e:
+                    logger.debug(
+                        'Failed to connect to SSH tunnel for cluster '
+                        f'{self.cluster_name!r} on port {tunnel.port} ({e}), '
+                        'opening new tunnel')
+                    tunnel = self._open_and_update_skylet_tunnel()
+                    return grpc.insecure_channel(f'localhost:{tunnel.port}',
+                                                 options=grpc_options)
+        except locks.LockTimeout as e:
+            raise RuntimeError(
+                'Failed to get gRPC channel for cluster '
+                f'{self.cluster_name!r} due to a timeout when waiting for the '
+                'SSH tunnel to be opened. Please try again or manually remove '
+                f'the lock at {lock_id}. '
+                f'{common_utils.format_exception(e)}') from e
+
+    def _terminate_ssh_tunnel_process(self, tunnel_info: SSHTunnelInfo) -> None:
+        """Terminate the SSH tunnel process."""
+        try:
+            proc = psutil.Process(tunnel_info.pid)
+            if proc.is_running() and proc.status() != psutil.STATUS_ZOMBIE:
+                logger.debug(
+                    f'Terminating SSH tunnel process {tunnel_info.pid}')
+                subprocess_utils.kill_children_processes(proc.pid)
+        except psutil.NoSuchProcess:
+            pass
+        except Exception as e:  # pylint: disable=broad-except
+            logger.warning(
+                f'Failed to cleanup SSH tunnel process {tunnel_info.pid}: {e}')
+
+    def _open_and_update_skylet_tunnel(self) -> SSHTunnelInfo:
+        """Opens an SSH tunnel to the Skylet on the head node,
+        updates the cluster handle, and persists it to the database."""
+        max_attempts = 3
+        # There could be a race condition here, as multiple processes may
+        # attempt to open the same port at the same time.
+        for attempt in range(max_attempts):
+            runners = self.get_command_runners()
+            head_runner = runners[0]
+            local_port = random.randint(10000, 65535)
+            try:
+                ssh_tunnel_proc = backend_utils.open_ssh_tunnel(
+                    head_runner, (local_port, constants.SKYLET_GRPC_PORT))
+            except exceptions.CommandError as e:
+                # Don't retry if the error is due to timeout,
+                # connection refused, Kubernetes pods not found,
+                # or an in-progress termination.
+                if (e.detailed_reason is not None and
+                        (backend_utils.SSH_CONNECTION_ERROR_PATTERN.search(
+                            e.detailed_reason) or
+                         backend_utils.K8S_PODS_NOT_FOUND_PATTERN.search(
+                             e.detailed_reason) or
+                         attempt == max_attempts - 1)):
+                    raise e
+                logger.warning(
+                    f'Failed to open SSH tunnel on port {local_port} '
+                    f'({attempt + 1}/{max_attempts}). '
+                    f'{e.error_msg}\n{e.detailed_reason}')
+                continue
+            tunnel_info = SSHTunnelInfo(port=local_port,
+                                        pid=ssh_tunnel_proc.pid)
+            break
+
+        try:
+            grpc.channel_ready_future(
+                grpc.insecure_channel(f'localhost:{tunnel_info.port}')).result(
+                    timeout=constants.SKYLET_GRPC_TIMEOUT_SECONDS)
+            # Clean up existing tunnel before setting up the new one.
+            old_tunnel = self._get_skylet_ssh_tunnel()
+            if old_tunnel is not None:
+                self._terminate_ssh_tunnel_process(old_tunnel)
+            self._set_skylet_ssh_tunnel(tunnel_info)
+            return tunnel_info
+        except grpc.FutureTimeoutError as e:
+            self._terminate_ssh_tunnel_process(tunnel_info)
+            logger.warning(
+                f'Skylet gRPC channel for cluster {self.cluster_name} not '
+                f'ready after {constants.SKYLET_GRPC_TIMEOUT_SECONDS}s')
+            raise e
+        except Exception as e:
+            self._terminate_ssh_tunnel_process(tunnel_info)
+            raise e
+
     @property
     def cluster_yaml(self) -> Optional[str]:
         if self._cluster_yaml is None:
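Note: `get_grpc_channel` is double-checked locking: probe the cached tunnel lock-free, and only serialize on the distributed lock when the probe fails, re-checking after acquisition. A condensed sketch of that control flow, with `read_tunnel`, `open_tunnel`, and `lock` as hypothetical stand-ins for the helpers above:

```python
import socket

def _probe(port: int, timeout: float = 0.5) -> bool:
    """True iff something is listening on localhost:port."""
    try:
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
            s.settimeout(timeout)
            s.connect(('localhost', port))
        return True
    except OSError:
        return False

def get_channel_port(read_tunnel, open_tunnel, lock) -> int:
    tunnel = read_tunnel()            # Fast path: lock-free read.
    if tunnel is not None and _probe(tunnel.port):
        return tunnel.port
    with lock:                        # Slow path: re-check under the lock.
        tunnel = read_tunnel()
        if tunnel is None or not _probe(tunnel.port):
            tunnel = open_tunnel()    # Replaces the stale tunnel metadata.
        return tunnel.port
```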
@@ -2542,6 +2963,12 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
     def cluster_yaml(self, value: Optional[str]):
         self._cluster_yaml = value

+    @property
+    def instance_ids(self):
+        if self.cached_cluster_info is not None:
+            return self.cached_cluster_info.instance_ids()
+        return None
+
     @property
     def ssh_user(self):
         if self.cached_cluster_info is not None:
@@ -2576,6 +3003,18 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
             num_ips = 1
         return num_ips

+    @property
+    def is_grpc_enabled_with_flag(self) -> bool:
+        """Returns whether this handle has gRPC enabled and gRPC flag is set."""
+        return env_options.Options.ENABLE_GRPC.get() and self.is_grpc_enabled
+
+    def __getstate__(self):
+        state = self.__dict__.copy()
+        # For backwards compatibility. Refer to
+        # https://github.com/skypilot-org/skypilot/pull/7133
+        state.setdefault('skylet_ssh_tunnel', None)
+        return state
+
     def __setstate__(self, state):
         self._version = self._VERSION
@@ -2606,7 +3045,7 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
             # pylint: disable=import-outside-toplevel
             launched_resources = state['launched_resources']
             if isinstance(launched_resources.cloud, clouds.Kubernetes):
-                yaml_config =
+                yaml_config = global_user_state.get_cluster_yaml_dict(
                     os.path.expanduser(state['_cluster_yaml']))
                 context = kubernetes_utils.get_context_from_config(
                     yaml_config['provider'])
@@ -2629,6 +3068,14 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
                     os.path.expanduser(state['_cluster_yaml'])):
                 state['_cluster_yaml'] = None

+        if version < 11:
+            state['is_grpc_enabled'] = False
+            state['skylet_ssh_tunnel'] = None
+
+        if version >= 12:
+            # DEPRECATED in favor of skylet_ssh_tunnel_metadata column in the DB
+            state.pop('skylet_ssh_tunnel', None)
+
         self.__dict__.update(state)

         # Because the update_cluster_ips and update_ssh_ports
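Note: `_VERSION`, `__getstate__`, and `__setstate__` together form a pickle-migration pattern: old pickles gain defaults for fields added later, and retired fields are dropped on load. A minimal sketch of the same idea (the class here is illustrative, not the real handle):

```python
import pickle

class VersionedHandle:
    _VERSION = 12

    def __init__(self):
        self._version = self._VERSION
        self.is_grpc_enabled = True

    def __setstate__(self, state):
        version = state.get('_version', 0)
        if version < 11:
            # Field added in v11; older pickles never had it.
            state['is_grpc_enabled'] = False
        if version >= 12:
            # Moved elsewhere in v12; discard if the pickle still carries it.
            state.pop('skylet_ssh_tunnel', None)
        state['_version'] = self._VERSION
        self.__dict__.update(state)

h = pickle.loads(pickle.dumps(VersionedHandle()))
assert h.is_grpc_enabled and h._version == 12
```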
@@ -2653,6 +3100,234 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
             pass


+class LocalResourcesHandle(CloudVmRayResourceHandle):
+    """A handle for local resources."""
+
+    def __init__(
+            self,
+            *,
+            cluster_name: str,
+            cluster_name_on_cloud: str,
+            cluster_yaml: Optional[str],
+            launched_nodes: int,
+            launched_resources: resources_lib.Resources,
+            stable_internal_external_ips: Optional[List[Tuple[str,
+                                                               str]]] = None,
+            stable_ssh_ports: Optional[List[int]] = None,
+            cluster_info: Optional[provision_common.ClusterInfo] = None
+    ) -> None:
+        super().__init__(
+            cluster_name=cluster_name,
+            cluster_name_on_cloud=cluster_name_on_cloud,
+            cluster_yaml=cluster_yaml,
+            launched_nodes=launched_nodes,
+            launched_resources=launched_resources,
+            stable_internal_external_ips=stable_internal_external_ips,
+            stable_ssh_ports=stable_ssh_ports,
+            cluster_info=cluster_info)
+        # TODO (kyuds): handle jobs consolidation mode. Currently,
+        # jobs consolidation mode will not run a skylet, hence
+        # grpc server will not run. In the future, we should
+        # figure out a way to start grpc in consolidation mode.
+        self.is_grpc_enabled = False
+
+    @context_utils.cancellation_guard
+    # we expect different request to be acting on different clusters
+    # (= different handles) so we have no real expectation of cache hit
+    # across requests.
+    # Do not change this cache to global scope
+    # without understanding https://github.com/skypilot-org/skypilot/pull/6908
+    @annotations.lru_cache(scope='request', maxsize=10)
+    @timeline.event
+    def get_command_runners(self,
+                            force_cached: bool = False,
+                            avoid_ssh_control: bool = False
+                           ) -> List[command_runner.CommandRunner]:
+        """Returns a list of local command runners."""
+        del force_cached, avoid_ssh_control  # Unused.
+        return [command_runner.LocalProcessCommandRunner()]
+
+
+class SkyletClient:
+    """The client to interact with a remote cluster through Skylet."""
+
+    def __init__(self, channel: 'grpc.Channel'):
+        self._autostop_stub = autostopv1_pb2_grpc.AutostopServiceStub(channel)
+        self._jobs_stub = jobsv1_pb2_grpc.JobsServiceStub(channel)
+        self._serve_stub = servev1_pb2_grpc.ServeServiceStub(channel)
+        self._managed_jobs_stub = (
+            managed_jobsv1_pb2_grpc.ManagedJobsServiceStub(channel))
+
+    def set_autostop(
+        self,
+        request: 'autostopv1_pb2.SetAutostopRequest',
+        timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
+    ) -> 'autostopv1_pb2.SetAutostopResponse':
+        return self._autostop_stub.SetAutostop(request, timeout=timeout)
+
+    def is_autostopping(
+        self,
+        request: 'autostopv1_pb2.IsAutostoppingRequest',
+        timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
+    ) -> 'autostopv1_pb2.IsAutostoppingResponse':
+        return self._autostop_stub.IsAutostopping(request, timeout=timeout)
+
+    def add_job(
+        self,
+        request: 'jobsv1_pb2.AddJobRequest',
+        timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
+    ) -> 'jobsv1_pb2.AddJobResponse':
+        return self._jobs_stub.AddJob(request, timeout=timeout)
+
+    def queue_job(
+        self,
+        request: 'jobsv1_pb2.QueueJobRequest',
+        timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
+    ) -> 'jobsv1_pb2.QueueJobResponse':
+        return self._jobs_stub.QueueJob(request, timeout=timeout)
+
+    def update_status(
+        self,
+        request: 'jobsv1_pb2.UpdateStatusRequest',
+        timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
+    ) -> 'jobsv1_pb2.UpdateStatusResponse':
+        return self._jobs_stub.UpdateStatus(request, timeout=timeout)
+
+    def get_job_queue(
+        self,
+        request: 'jobsv1_pb2.GetJobQueueRequest',
+        timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
+    ) -> 'jobsv1_pb2.GetJobQueueResponse':
+        return self._jobs_stub.GetJobQueue(request, timeout=timeout)
+
+    def cancel_jobs(
+        self,
+        request: 'jobsv1_pb2.CancelJobsRequest',
+        timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
+    ) -> 'jobsv1_pb2.CancelJobsResponse':
+        return self._jobs_stub.CancelJobs(request, timeout=timeout)
+
+    def fail_all_in_progress_jobs(
+        self,
+        request: 'jobsv1_pb2.FailAllInProgressJobsRequest',
+        timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
+    ) -> 'jobsv1_pb2.FailAllInProgressJobsResponse':
+        return self._jobs_stub.FailAllInProgressJobs(request, timeout=timeout)
+
+    def get_job_status(
+        self,
+        request: 'jobsv1_pb2.GetJobStatusRequest',
+        timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
+    ) -> 'jobsv1_pb2.GetJobStatusResponse':
+        return self._jobs_stub.GetJobStatus(request, timeout=timeout)
+
+    def get_job_submitted_timestamp(
+        self,
+        request: 'jobsv1_pb2.GetJobSubmittedTimestampRequest',
+        timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
+    ) -> 'jobsv1_pb2.GetJobSubmittedTimestampResponse':
+        return self._jobs_stub.GetJobSubmittedTimestamp(request,
+                                                        timeout=timeout)
+
+    def get_job_ended_timestamp(
+        self,
+        request: 'jobsv1_pb2.GetJobEndedTimestampRequest',
+        timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
+    ) -> 'jobsv1_pb2.GetJobEndedTimestampResponse':
+        return self._jobs_stub.GetJobEndedTimestamp(request, timeout=timeout)
+
+    def get_log_dirs_for_jobs(
+        self,
+        request: 'jobsv1_pb2.GetLogDirsForJobsRequest',
+        timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
+    ) -> 'jobsv1_pb2.GetLogDirsForJobsResponse':
+        return self._jobs_stub.GetLogDirsForJobs(request, timeout=timeout)
+
+    def tail_logs(
+        self,
+        request: 'jobsv1_pb2.TailLogsRequest',
+        timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
+    ) -> Iterator['jobsv1_pb2.TailLogsResponse']:
+        return self._jobs_stub.TailLogs(request, timeout=timeout)
+
+    def get_service_status(
+        self,
+        request: 'servev1_pb2.GetServiceStatusRequest',
+        timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
+    ) -> 'servev1_pb2.GetServiceStatusResponse':
+        return self._serve_stub.GetServiceStatus(request, timeout=timeout)
+
+    def add_serve_version(
+        self,
+        request: 'servev1_pb2.AddVersionRequest',
+        timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
+    ) -> 'servev1_pb2.AddVersionResponse':
+        return self._serve_stub.AddVersion(request, timeout=timeout)
+
+    def terminate_services(
+        self,
+        request: 'servev1_pb2.TerminateServicesRequest',
+        timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
+    ) -> 'servev1_pb2.TerminateServicesResponse':
+        return self._serve_stub.TerminateServices(request, timeout=timeout)
+
+    def terminate_replica(
+        self,
+        request: 'servev1_pb2.TerminateReplicaRequest',
+        timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
+    ) -> 'servev1_pb2.TerminateReplicaResponse':
+        return self._serve_stub.TerminateReplica(request, timeout=timeout)
+
+    def wait_service_registration(
+        self,
+        request: 'servev1_pb2.WaitServiceRegistrationRequest',
+        timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
+    ) -> 'servev1_pb2.WaitServiceRegistrationResponse':
+        # set timeout to at least 10 seconds more than service register
+        # constant to make sure that timeouts will not occur.
+        if timeout is not None:
+            timeout = max(timeout,
+                          serve_constants.SERVICE_REGISTER_TIMEOUT_SECONDS + 10)
+        return self._serve_stub.WaitServiceRegistration(request,
+                                                        timeout=timeout)
+
+    def update_service(
+        self,
+        request: 'servev1_pb2.UpdateServiceRequest',
+        timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
+    ) -> 'servev1_pb2.UpdateServiceResponse':
+        return self._serve_stub.UpdateService(request, timeout=timeout)
+
+    def get_managed_job_controller_version(
+        self,
+        request: 'managed_jobsv1_pb2.GetVersionRequest',
+        timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
+    ) -> 'managed_jobsv1_pb2.GetVersionResponse':
+        return self._managed_jobs_stub.GetVersion(request, timeout=timeout)
+
+    def get_managed_job_table(
+        self,
+        request: 'managed_jobsv1_pb2.GetJobTableRequest',
+        timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
+    ) -> 'managed_jobsv1_pb2.GetJobTableResponse':
+        return self._managed_jobs_stub.GetJobTable(request, timeout=timeout)
+
+    def get_all_managed_job_ids_by_name(
+        self,
+        request: 'managed_jobsv1_pb2.GetAllJobIdsByNameRequest',
+        timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
+    ) -> 'managed_jobsv1_pb2.GetAllJobIdsByNameResponse':
+        return self._managed_jobs_stub.GetAllJobIdsByName(request,
+                                                          timeout=timeout)
+
+    def cancel_managed_jobs(
+        self,
+        request: 'managed_jobsv1_pb2.CancelJobsRequest',
+        timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
+    ) -> 'managed_jobsv1_pb2.CancelJobsResponse':
+        return self._managed_jobs_stub.CancelJobs(request, timeout=timeout)
+
+
 @registry.BACKEND_REGISTRY.type_register(name='cloudvmray')
 class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
     """Backend: runs on cloud virtual machines, managed by Ray.
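Note: `SkyletClient` is a thin facade that pairs each generated stub with a default deadline. A hedged usage sketch, assuming an existing `handle: CloudVmRayResourceHandle` with gRPC enabled and the request shapes shown above (not verified against the generated protos):

```python
channel = handle.get_grpc_channel()
client = SkyletClient(channel)
# Refresh job states on the cluster, then read back the queue.
client.update_status(jobsv1_pb2.UpdateStatusRequest())
queue = client.get_job_queue(jobsv1_pb2.GetJobQueueRequest())
```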
@@ -2665,7 +3340,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
     NAME = 'cloudvmray'

     # Backward compatibility, with the old name of the handle.
-    ResourceHandle = CloudVmRayResourceHandle  #
+    ResourceHandle = CloudVmRayResourceHandle  # type: ignore

     def __init__(self):
         self.run_timestamp = sky_logging.get_run_timestamp()
@@ -2680,6 +3355,12 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         self._dag = None
         self._optimize_target = None
         self._requested_features = set()
+        self._dump_final_script = False
+        self._is_managed = False
+        # Optional planner (via register_info): used under the per-cluster lock
+        # to produce a fresh concrete plan when neither a reusable snapshot nor
+        # a caller plan is available.
+        self._planner = None

         # Command for running the setup script. It is only set when the
         # setup needs to be run outside the self._setup() and as part of
@@ -2696,6 +3377,10 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         self._requested_features = kwargs.pop('requested_features',
                                               self._requested_features)
         self._dump_final_script = kwargs.pop('dump_final_script', False)
+        self._is_managed = kwargs.pop('is_managed', False)
+        # Optional planner callback for a fresh plan under lock when no
+        # reusable snapshot/caller plan exists. Keeps optimizer in upper layer.
+        self._planner = kwargs.pop('planner', self._planner)
         assert not kwargs, f'Unexpected kwargs: {kwargs}'

     def check_resources_fit_cluster(
@@ -2722,9 +3407,9 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         # Usage Collection:
         usage_lib.messages.usage.update_cluster_resources(
             handle.launched_nodes, launched_resources)
-
-        if
-        usage_lib.messages.usage.update_cluster_status(
+        status = global_user_state.get_status_from_cluster_name(cluster_name)
+        if status is not None:
+            usage_lib.messages.usage.update_cluster_status(status)

         assert launched_resources.region is not None, handle

@@ -2846,12 +3531,46 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         # Check if the cluster is owned by the current user. Raise
         # exceptions.ClusterOwnerIdentityMismatchError
        backend_utils.check_owner_identity(cluster_name)
-
-
-
-
-
-
+        lock_id = backend_utils.cluster_status_lock_id(cluster_name)
+        communicated_with_user = False
+
+        while True:
+            try:
+                return self._locked_provision(lock_id, task, to_provision,
+                                              dryrun, stream_logs, cluster_name,
+                                              retry_until_up,
+                                              skip_unnecessary_provisioning)
+            except locks.LockTimeout:
+                if not communicated_with_user:
+                    rich_utils.force_update_status(
+                        ux_utils.spinner_message('Launching - blocked by ' +
+                                                 'other requests ' +
+                                                 colorama.Style.RESET_ALL +
+                                                 colorama.Style.DIM +
+                                                 'Check concurrent requests: ' +
+                                                 'sky api status -v | grep '
+                                                 f'{cluster_name}'))
+
+    def _locked_provision(
+        self,
+        lock_id: str,
+        task: task_lib.Task,
+        to_provision: Optional[resources_lib.Resources],
+        dryrun: bool,
+        stream_logs: bool,
+        cluster_name: str,
+        retry_until_up: bool = False,
+        skip_unnecessary_provisioning: bool = False,
+    ) -> Tuple[Optional[CloudVmRayResourceHandle], bool]:
+        with lock_events.DistributedLockEvent(lock_id, _CLUSTER_LOCK_TIMEOUT):
+            # Reset spinner message to remove any mention of being blocked
+            # by other requests.
+            rich_utils.force_update_status(
+                ux_utils.spinner_message('Launching'))
+
+            # Try to launch the exiting cluster first. If no existing
+            # cluster, this function will create a to_provision_config
+            # with required resources.
             to_provision_config = self._check_existing_cluster(
                 task, to_provision, cluster_name, dryrun)
             assert to_provision_config.resources is not None, (
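Note: the rewritten entry point no longer blocks silently on the per-cluster lock; it loops on `locks.LockTimeout` and tells the user once what is blocking. Reduced to its shape, with stand-in names for the lock and the UX helpers:

```python
def provision_with_feedback(acquire_lock, do_provision, notify_blocked):
    """Retry on lock timeout, warning the user exactly once."""
    warned = False
    while True:
        try:
            with acquire_lock():      # Raises TimeoutError if held too long.
                return do_provision()
        except TimeoutError:
            if not warned:
                notify_blocked()      # e.g. point at `sky api status -v`.
                warned = True
```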
@@ -2869,14 +3588,6 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             # TODO(suquark): once we have sky on PyPI, we should directly
             # install sky from PyPI.
             local_wheel_path, wheel_hash = wheel_utils.build_sky_wheel()
-            # The most frequent reason for the failure of a provision
-            # request is resource unavailability instead of rate
-            # limiting; to make users wait shorter, we do not make
-            # backoffs exponential.
-            backoff = common_utils.Backoff(
-                initial_backoff=_RETRY_UNTIL_UP_INIT_GAP_SECONDS,
-                max_backoff_factor=1)
-            attempt_cnt = 1
             while True:
                 # For on-demand instances, RetryingVmProvisioner will retry
                 # within the given region first, then optionally retry on all
@@ -2900,16 +3611,20 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                         self._requested_features,
                         local_wheel_path,
                         wheel_hash,
-                        blocked_resources=task.blocked_resources
+                        blocked_resources=task.blocked_resources,
+                        is_managed=self._is_managed)
                     log_path = os.path.join(self.log_dir, 'provision.log')
                     rich_utils.force_update_status(
-                        ux_utils.spinner_message('Launching',
+                        ux_utils.spinner_message('Launching',
+                                                 log_path,
+                                                 cluster_name=cluster_name))
                     config_dict = retry_provisioner.provision_with_retries(
                         task, to_provision_config, dryrun, stream_logs,
                         skip_unnecessary_provisioning)
                     break
                 except exceptions.ResourcesUnavailableError as e:
                     log_path = retry_provisioner.log_dir + '/provision.log'
+
                     error_message = (
                         f'{colorama.Fore.RED}Failed to provision all '
                         f'possible launchable resources.'
@@ -2920,23 +3635,34 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                         error_message = str(e)

                     if retry_until_up:
-
-                        # Sleep and retry.
-                        gap_seconds = backoff.current_backoff()
-                        plural = 's' if attempt_cnt > 1 else ''
+                        gap_seconds = _RETRY_UNTIL_UP_INIT_GAP_SECONDS
                         retry_message = ux_utils.retry_message(
-                            f'Retry after {gap_seconds:.0f}s '
-
-
-
-
-
-
+                            f'Retry after {gap_seconds:.0f}s ')
+                        hint_message = (
+                            f'\n{retry_message} '
+                            f'{ux_utils.provision_hint(cluster_name)}'
+                            f'{colorama.Style.RESET_ALL}')
+
+                        # Add cluster event for retry.
+                        global_user_state.add_cluster_event(
+                            cluster_name, status_lib.ClusterStatus.INIT,
+                            f'Retrying provisioning after {gap_seconds:.0f}s',
+                            global_user_state.ClusterEventType.STATUS_CHANGE)
+
+                        raise exceptions.ExecutionRetryableError(
+                            error_message,
+                            hint=hint_message,
+                            retry_wait_seconds=gap_seconds)
                     # Clean up the cluster's entry in `sky status`.
                     # Do not remove the stopped cluster from the global state
                     # if failed to start.
                     if not e.no_failover:
+                        global_user_state.add_cluster_event(
+                            cluster_name,
+                            None,
+                            'Provision failed: ' + str(e),
+                            global_user_state.ClusterEventType.STATUS_CHANGE,
+                            nop_if_duplicate=True)
                         global_user_state.remove_cluster(cluster_name,
                                                          terminate=True)
                         usage_lib.messages.usage.update_final_cluster_status(
@@ -2944,7 +3670,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                     logger.error(
                         ux_utils.error_message(
                             'Failed to provision resources. '
-                            f'{ux_utils.
+                            f'{ux_utils.provision_hint(cluster_name)}'))
                     error_message += (
                         '\nTo keep retrying until the cluster is up, use '
                         'the `--retry-until-up` flag.')
@@ -2953,8 +3679,9 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                         error_message + '\n' + str(e),
                         failover_history=e.failover_history) from None
             if dryrun:
-
-
+                handle = global_user_state.get_handle_from_cluster_name(
+                    cluster_name)
+                return handle if handle is not None else None, False

             if config_dict['provisioning_skipped']:
                 # Skip further provisioning.
@@ -2962,10 +3689,10 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                 # ('handle', 'provision_record', 'resources_vars')
                 # We need to return the handle - but it should be the existing
                 # handle for the cluster.
-
-
-
-                return
+                handle = global_user_state.get_handle_from_cluster_name(
+                    cluster_name)
+                assert handle is not None, (cluster_name, handle)
+                return handle, True

             if 'provision_record' in config_dict:
                 # New provisioner is used here.
@@ -2980,8 +3707,15 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                 # and other necessary files to the VM.
                 # 3. Run setup commands to install dependencies.
                 # 4. Starting ray cluster and skylet.
+
+                # Add cluster event for runtime setup start
+                global_user_state.add_cluster_event(
+                    handle.cluster_name, status_lib.ClusterStatus.INIT,
+                    'Setting up SkyPilot runtime on cluster',
+                    global_user_state.ClusterEventType.STATUS_CHANGE)
+
                 cluster_info = provisioner.post_provision_runtime_setup(
-
+                    handle.launched_resources,
                     resources_utils.ClusterName(handle.cluster_name,
                                                 handle.cluster_name_on_cloud),
                     handle.cluster_yaml,
@@ -2995,6 +3729,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                 # manually or by the cloud provider.
                 # Optimize the case where the cluster's IPs can be retrieved
                 # from cluster_info.
+                handle.cached_cluster_info = cluster_info
                 handle.docker_user = cluster_info.docker_user
                 handle.update_cluster_ips(max_attempts=_FETCH_IP_MAX_ATTEMPTS,
                                           cluster_info=cluster_info)
@@ -3006,7 +3741,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):

                 self._update_after_cluster_provisioned(
                     handle, to_provision_config.prev_handle, task,
-                    prev_cluster_status,
+                    prev_cluster_status, config_hash)
                 return handle, False

             cluster_config_file = config_dict['ray']
@@ -3016,8 +3751,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             ssh_port_list = handle.external_ssh_ports()
             assert ip_list is not None, handle
             assert ssh_port_list is not None, handle
-
-
+            config = global_user_state.get_cluster_yaml_dict(
+                cluster_config_file)
             if 'docker' in config:
                 handle.setup_docker_user(cluster_config_file)

@@ -3078,14 +3813,14 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):

             self._update_after_cluster_provisioned(
                 handle, to_provision_config.prev_handle, task,
-                prev_cluster_status,
+                prev_cluster_status, config_hash)
             return handle, False

     def _open_ports(self, handle: CloudVmRayResourceHandle) -> None:
         cloud = handle.launched_resources.cloud
         logger.debug(
             f'Opening ports {handle.launched_resources.ports} for {cloud}')
-        config =
+        config = global_user_state.get_cluster_yaml_dict(handle.cluster_yaml)
         provider_config = config['provider']
         provision_lib.open_ports(repr(cloud), handle.cluster_name_on_cloud,
                                  handle.launched_resources.ports,
@@ -3096,7 +3831,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             prev_handle: Optional[CloudVmRayResourceHandle],
             task: task_lib.Task,
             prev_cluster_status: Optional[status_lib.ClusterStatus],
-
+            config_hash: str) -> None:
         usage_lib.messages.usage.update_cluster_resources(
             handle.launched_nodes, handle.launched_resources)
         usage_lib.messages.usage.update_final_cluster_status(
@@ -3108,16 +3843,26 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             # update_status will query the ray job status for all INIT /
             # PENDING / RUNNING jobs for the real status, since we do not
             # know the actual previous status of the cluster.
-            cmd = job_lib.JobLibCodeGen.update_status()
             logger.debug('Update job queue on remote cluster.')
             with rich_utils.safe_status(
                     ux_utils.spinner_message('Preparing SkyPilot runtime')):
-
-
-
-
-
-
+                use_legacy = not handle.is_grpc_enabled_with_flag
+
+                if not use_legacy:
+                    try:
+                        request = jobsv1_pb2.UpdateStatusRequest()
+                        backend_utils.invoke_skylet_with_retries(
+                            lambda: SkyletClient(handle.get_grpc_channel()
+                                                ).update_status(request))
+                    except exceptions.SkyletMethodNotImplementedError:
+                        use_legacy = True
+
+                if use_legacy:
+                    cmd = job_lib.JobLibCodeGen.update_status()
+                    returncode, _, stderr = self.run_on_head(
+                        handle, cmd, require_outputs=True)
+                    subprocess_utils.handle_returncode(
+                        returncode, cmd, 'Failed to update job status.', stderr)
         if prev_cluster_status == status_lib.ClusterStatus.STOPPED:
             # Safely set all the previous jobs to FAILED since the cluster
             # is restarted
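Note: this hunk and the STOPPED-restart branch that follows use the same gRPC-first call shape with a codegen fallback for clusters whose skylet predates the RPC. Factored out, the pattern is roughly (a sketch; exception and property names follow the diff):

```python
def run_skylet_op(handle, grpc_call, legacy_call):
    """Prefer the gRPC path; fall back to remote codegen on old skylets."""
    if handle.is_grpc_enabled_with_flag:
        try:
            return grpc_call()
        except exceptions.SkyletMethodNotImplementedError:
            pass  # Old skylet on the cluster: use the legacy path.
    return legacy_call()
```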
@@ -3125,14 +3870,25 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             # 1. A job finishes RUNNING, but right before it update itself
             # to SUCCEEDED, the cluster is STOPPED by `sky stop`.
             # 2. On next `sky start`, it gets reset to FAILED.
-
-
-
-
-
-
-
-
+            use_legacy = not handle.is_grpc_enabled_with_flag
+
+            if not use_legacy:
+                try:
+                    fail_request = jobsv1_pb2.FailAllInProgressJobsRequest()
+                    backend_utils.invoke_skylet_with_retries(
+                        lambda: SkyletClient(handle.get_grpc_channel(
+                        )).fail_all_in_progress_jobs(fail_request))
+                except exceptions.SkyletMethodNotImplementedError:
+                    use_legacy = True
+
+            if use_legacy:
+                cmd = job_lib.JobLibCodeGen.fail_all_jobs_in_progress()
+                returncode, stdout, stderr = self.run_on_head(
+                    handle, cmd, require_outputs=True)
+                subprocess_utils.handle_returncode(
+                    returncode, cmd,
+                    'Failed to set previously in-progress jobs to FAILED',
+                    stdout + stderr)

         prev_ports = None
         if prev_handle is not None:
@@ -3142,14 +3898,20 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                 resources_utils.port_ranges_to_set(current_ports) -
                 resources_utils.port_ranges_to_set(prev_ports))
             if open_new_ports:
-
-                if not (cloud.OPEN_PORTS_VERSION <=
+                launched_resources = handle.launched_resources.assert_launchable()
+                if not (launched_resources.cloud.OPEN_PORTS_VERSION <=
                         clouds.OpenPortsVersion.LAUNCH_ONLY):
                     with rich_utils.safe_status(
                             ux_utils.spinner_message(
                                 'Launching - Opening new ports')):
                         self._open_ports(handle)

+        # Capture task YAML and command
+        user_specified_task_config = None
+        if task is not None:
+            user_specified_task_config = task.to_yaml_config(
+                use_user_specified_yaml=True)
+
         with timeline.Event('backend.provision.post_process'):
             global_user_state.add_or_update_cluster(
                 handle.cluster_name,
@@ -3157,7 +3919,16 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                 set(task.resources),
                 ready=True,
                 config_hash=config_hash,
+                task_config=user_specified_task_config,
             )
+
+            # Add cluster event for successful provisioning.
+            global_user_state.add_cluster_event(
+                handle.cluster_name, status_lib.ClusterStatus.UP,
+                'Cluster successfully provisioned with ' +
+                f'{handle.launched_nodes} nodes',
+                global_user_state.ClusterEventType.STATUS_CHANGE)
+
             usage_lib.messages.usage.update_final_cluster_status(
                 status_lib.ClusterStatus.UP)
             # We still add the cluster to ssh config file on API server, this
@@ -3172,13 +3943,60 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             handle.cached_external_ssh_ports, handle.docker_user,
             handle.ssh_user)
 
-        common_utils.remove_file_if_exists(lock_path)
-
     def _sync_workdir(self, handle: CloudVmRayResourceHandle,
-                      workdir: Path
+                      workdir: Union[Path, Dict[str, Any]],
+                      envs_and_secrets: Dict[str, str]) -> None:
         # Even though provision() takes care of it, there may be cases where
         # this function is called in isolation, without calling provision(),
         # e.g., in CLI. So we should rerun rsync_up.
+        if isinstance(workdir, dict):
+            self._sync_git_workdir(handle, envs_and_secrets)
+        else:
+            self._sync_path_workdir(handle, workdir)
+
+    def _sync_git_workdir(self, handle: CloudVmRayResourceHandle,
+                          envs_and_secrets: Dict[str, str]) -> None:
+        style = colorama.Style
+        ip_list = handle.external_ips()
+        assert ip_list is not None, 'external_ips is not cached in handle'
+
+        log_path = os.path.join(self.log_dir, 'workdir_sync.log')
+
+        # TODO(zhwu): refactor this with backend_utils.parallel_cmd_with_rsync
+        runners = handle.get_command_runners()
+
+        def _sync_git_workdir_node(
+                runner: command_runner.CommandRunner) -> None:
+            # Type assertion to help mypy understand the type
+            assert hasattr(
+                runner, 'git_clone'
+            ), f'CommandRunner should have git_clone method, ' \
+                f'got {type(runner)}'
+            runner.git_clone(
+                target_dir=SKY_REMOTE_WORKDIR,
+                log_path=log_path,
+                stream_logs=False,
+                max_retry=3,
+                envs_and_secrets=envs_and_secrets,
+            )
+
+        num_nodes = handle.launched_nodes
+        plural = 's' if num_nodes > 1 else ''
+        logger.info(
+            f'  {style.DIM}Syncing workdir (to {num_nodes} node{plural}): '
+            f'{SKY_REMOTE_WORKDIR}{style.RESET_ALL}')
+        os.makedirs(os.path.expanduser(self.log_dir), exist_ok=True)
+        os.system(f'touch {log_path}')
+        num_threads = subprocess_utils.get_parallel_threads(
+            str(handle.launched_resources.cloud))
+        with rich_utils.safe_status(
+                ux_utils.spinner_message('Syncing workdir', log_path)):
+            subprocess_utils.run_in_parallel(_sync_git_workdir_node, runners,
+                                             num_threads)
+        logger.info(ux_utils.finishing_message('Synced workdir.', log_path))
+
+    def _sync_path_workdir(self, handle: CloudVmRayResourceHandle,
+                           workdir: Path) -> None:
         fore = colorama.Fore
         style = colorama.Style
         ip_list = handle.external_ips()
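Note: the new `_sync_git_workdir` above fans a per-node callable out across a bounded worker pool via `subprocess_utils.run_in_parallel`. A minimal sketch of that fan-out shape, using a standard-library thread pool as a stand-in; `fake_runners` and `git_clone` are placeholders, not SkyPilot's command runners:

    # Sketch of the per-node parallel sync, under assumed stand-in names.
    from concurrent.futures import ThreadPoolExecutor

    def git_clone(node: str) -> None:
        # Stand-in for runner.git_clone(target_dir=..., max_retry=3, ...)
        print(f'cloning workdir on {node}')

    fake_runners = ['node-0', 'node-1', 'node-2']
    num_threads = min(4, len(fake_runners))  # bounded, like get_parallel_threads
    with ThreadPoolExecutor(max_workers=num_threads) as pool:
        list(pool.map(git_clone, fake_runners))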
@@ -3247,9 +4065,10 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         TODO: Delete COPY storage_mounts in task.sync_storage_mounts(), and
         assert here that all storage_mounts are MOUNT mode.
         """
+        launched_resources = handle.launched_resources.assert_launchable()
         with rich_utils.safe_status(ux_utils.spinner_message('Syncing files')):
             controller_utils.replace_skypilot_config_path_in_file_mounts(
-
+                launched_resources.cloud, all_file_mounts)
             self._execute_file_mounts(handle, all_file_mounts)
             self._execute_storage_mounts(handle, storage_mounts)
             self._set_storage_mounts_metadata(handle.cluster_name,
@@ -3267,10 +4086,13 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         remote_setup_file_name = f'/tmp/sky_setup_{self.run_timestamp}'
         # Need this `-i` option to make sure `source ~/.bashrc` work
         setup_cmd = f'/bin/bash -i {remote_setup_file_name} 2>&1'
+        unset_ray_env_vars = ' && '.join(
+            [f'unset {var}' for var in UNSET_RAY_ENV_VARS])
+        setup_cmd = f'{unset_ray_env_vars}; {setup_cmd}'
         runners = handle.get_command_runners(avoid_ssh_control=True)
 
         def _setup_node(node_id: int) -> None:
-            setup_envs = task.
+            setup_envs = task.envs_and_secrets
             setup_envs.update(self._skypilot_predefined_env_vars(handle))
             setup_envs['SKYPILOT_SETUP_NODE_IPS'] = '\n'.join(internal_ips)
             setup_envs['SKYPILOT_SETUP_NODE_RANK'] = str(node_id)
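Note: the hunk above prepends a run of `unset VAR` statements to the setup command so Ray-related environment variables do not leak into the user's setup shell. A minimal sketch of that string construction; the variable names in UNSET_RAY_ENV_VARS here are assumed for illustration:

    # Sketch of building the unset prefix; the listed names are assumptions.
    UNSET_RAY_ENV_VARS = ['RAY_ADDRESS', 'RAY_RUNTIME_ENV']

    setup_cmd = '/bin/bash -i /tmp/sky_setup_2024 2>&1'
    unset_ray_env_vars = ' && '.join(f'unset {var}' for var in UNSET_RAY_ENV_VARS)
    # ';' (not '&&') so the setup script runs even if an unset is a no-op.
    setup_cmd = f'{unset_ray_env_vars}; {setup_cmd}'
    print(setup_cmd)
    # unset RAY_ADDRESS && unset RAY_RUNTIME_ENV; /bin/bash -i /tmp/sky_setup_2024 2>&1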
@@ -3329,33 +4151,16 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             return returncode
 
         returncode = _run_setup(f'{create_script_code} && {setup_cmd}',)
-
-
-
-
-
-
-
-
-
-
-                # Instead, we should retry the setup with dumping the script
-                # to a file to be safe.
-                logger.debug('Failed to read setup log file '
-                             f'{setup_log_path}: {e}')
-                is_message_too_long = True
-
-        if is_message_too_long:
-            # If the setup script is too long, we retry it with dumping
-            # the script to a file and running it with SSH. We use a
-            # general length limit check before but it could be
-            # inaccurate on some systems.
-            logger.debug(
-                'Failed to run setup command inline due to '
-                'command length limit. Dumping setup script to '
-                'file and running it with SSH.')
-            _dump_final_script(setup_script)
-            returncode = _run_setup(setup_cmd)
+
+        if _is_message_too_long(returncode, file_path=setup_log_path):
+            # If the setup script is too long, we need to retry it
+            # with dumping the script to a file and running it the script
+            # on remote cluster instead.
+            logger.debug('Failed to run setup command inline due to '
+                         'command length limit. Dumping setup script to '
+                         'file and running it with SSH.')
+            _dump_final_script(setup_script)
+            returncode = _run_setup(setup_cmd)
 
         def error_message() -> str:
             # Use the function to avoid tailing the file in success case
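Note: the hunk above collapses the old ad-hoc "command too long" detection into a single `_is_message_too_long` check followed by a dump-to-file retry. A minimal sketch of that detect-and-retry flow; the threshold, the 255 heuristic, and the helper names are stand-ins inferred from the diff, not the exact implementation:

    # Sketch of the too-long-command retry, under assumed names and limits.
    def is_message_too_long(returncode: int, output: str) -> bool:
        # SSH exits 255 with an "Argument list too long"-style message when
        # an inline command exceeds the remote limit.
        return returncode == 255 and 'too long' in output

    def run_inline(cmd: str):  # stand-in for run_on_head
        if len(cmd) > 15:  # pretend remote ARG_MAX limit
            return 255, 'bash: Argument list too long'
        return 0, ''

    def submit(cmd: str) -> int:
        returncode, output = run_inline(cmd)
        if is_message_too_long(returncode, output):
            # Fall back: upload the script, then run it by path, not inline.
            returncode, _ = run_inline('/tmp/script.sh')
        return returncode

    print(submit('a' * 20))  # first attempt rejected, retried via file -> 0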
@@ -3414,102 +4219,180 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         logger.info(
             ux_utils.finishing_message('Setup completed.', setup_log_path))
 
+    def _download_file(self, handle: CloudVmRayResourceHandle,
+                       local_file_path: str, remote_file_path: str) -> None:
+        """Syncs file from remote to local."""
+        runners = handle.get_command_runners()
+        head_runner = runners[0]
+        head_runner.rsync(
+            source=local_file_path,
+            target=remote_file_path,
+            up=False,
+            stream_logs=False,
+        )
+
     def _exec_code_on_head(
         self,
         handle: CloudVmRayResourceHandle,
         codegen: str,
         job_id: int,
-        detach_run: bool = False,
         managed_job_dag: Optional['dag.Dag'] = None,
+        managed_job_user_id: Optional[str] = None,
+        remote_log_dir: Optional[str] = None,
     ) -> None:
         """Executes generated code on the head node."""
-
-
+        use_legacy = not handle.is_grpc_enabled_with_flag
+        file_name = f'sky_job_{job_id}'
+        script_path = os.path.join(SKY_REMOTE_APP_DIR, file_name)
+        if remote_log_dir is None:
+            remote_log_dir = self.log_dir
         remote_log_path = os.path.join(remote_log_dir, 'run.log')
 
-
+        def _dump_code_to_file(codegen: str,
+                               target_dir: str = SKY_REMOTE_APP_DIR) -> None:
+            runners = handle.get_command_runners()
+            head_runner = runners[0]
+            with tempfile.NamedTemporaryFile('w', prefix='sky_app_') as fp:
+                fp.write(codegen)
+                fp.flush()
+                script_path = os.path.join(target_dir, file_name)
+                # We choose to sync code + exec, because the alternative of
+                # 'ray submit' may not work as it may use system python
+                # (python2) to execute the script. Happens for AWS.
+                head_runner.rsync(source=fp.name,
+                                  target=script_path,
+                                  up=True,
+                                  stream_logs=False)
 
+        cd = f'cd {SKY_REMOTE_WORKDIR}'
         mkdir_code = (f'{cd} && mkdir -p {remote_log_dir} && '
                       f'touch {remote_log_path}')
         encoded_script = shlex.quote(codegen)
         create_script_code = f'{{ echo {encoded_script} > {script_path}; }}'
         job_submit_cmd = (
-            # JOB_CMD_IDENTIFIER is used for identifying the process
-            # with pid is the same driver process.
+            # JOB_CMD_IDENTIFIER is used for identifying the process
+            # retrieved with pid is the same driver process.
             f'{job_lib.JOB_CMD_IDENTIFIER.format(job_id)} && '
             f'{cd} && {constants.SKY_PYTHON_CMD} -u {script_path}'
             # Do not use &>, which is not POSIX and may not work.
             # Note that the order of ">filename 2>&1" matters.
             f'> {remote_log_path} 2>&1')
-
         code = job_lib.JobLibCodeGen.queue_job(job_id, job_submit_cmd)
         job_submit_cmd = ' && '.join([mkdir_code, create_script_code, code])
 
-        def _dump_code_to_file(codegen: str,
-                               target_dir: str = SKY_REMOTE_APP_DIR) -> None:
-            runners = handle.get_command_runners()
-            head_runner = runners[0]
-            with tempfile.NamedTemporaryFile('w', prefix='sky_app_') as fp:
-                fp.write(codegen)
-                fp.flush()
-                script_path = os.path.join(target_dir, f'sky_job_{job_id}')
-                # We choose to sync code + exec, because the alternative of 'ray
-                # submit' may not work as it may use system python (python2) to
-                # execute the script. Happens for AWS.
-                head_runner.rsync(source=fp.name,
-                                  target=script_path,
-                                  up=True,
-                                  stream_logs=False)
-
         # Should also be ealier than _is_command_length_over_limit
         # Same reason as in _setup
         if self._dump_final_script:
             _dump_code_to_file(job_submit_cmd,
                                constants.PERSISTENT_RUN_SCRIPT_DIR)
 
-        if
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        if not use_legacy:
+            try:
+                managed_job_info: Optional[jobsv1_pb2.ManagedJobInfo] = None
+                if managed_job_dag is not None:
+                    workspace = skypilot_config.get_active_workspace(
+                        force_user_workspace=True)
+                    entrypoint = common_utils.get_current_command()
+
+                    managed_job_tasks: List[jobsv1_pb2.ManagedJobTask] = []
+                    for task_id, task in enumerate(managed_job_dag.tasks):
+                        resources_str = backend_utils.get_task_resources_str(
+                            task, is_managed_job=True)
+                        managed_job_tasks.append(
+                            jobsv1_pb2.ManagedJobTask(
+                                task_id=task_id,
+                                name=task.name,
+                                resources_str=resources_str,
+                                metadata_json=task.metadata_json))
+
+                    managed_job_info = jobsv1_pb2.ManagedJobInfo(
+                        name=managed_job_dag.name,
+                        pool=managed_job_dag.pool,
+                        workspace=workspace,
+                        entrypoint=entrypoint,
+                        tasks=managed_job_tasks,
+                        user_id=managed_job_user_id)
+
+                if _is_command_length_over_limit(codegen):
+                    _dump_code_to_file(codegen)
+                    queue_job_request = jobsv1_pb2.QueueJobRequest(
+                        job_id=job_id,
+                        # codegen not set - server assumes script uploaded
+                        remote_log_dir=remote_log_dir,
+                        managed_job=managed_job_info,
+                        script_path=script_path)
+                else:
+                    queue_job_request = jobsv1_pb2.QueueJobRequest(
+                        job_id=job_id,
+                        codegen=codegen,
+                        remote_log_dir=remote_log_dir,
+                        managed_job=managed_job_info,
+                        script_path=script_path)
+
+                backend_utils.invoke_skylet_with_retries(lambda: SkyletClient(
+                    handle.get_grpc_channel()).queue_job(queue_job_request))
+            except exceptions.SkyletMethodNotImplementedError:
+                use_legacy = True
+
+        if use_legacy:
+            if _is_command_length_over_limit(job_submit_cmd):
+                _dump_code_to_file(codegen)
+                job_submit_cmd = f'{mkdir_code} && {code}'
+
+            def _maybe_add_managed_job_code(job_submit_cmd: str) -> str:
+                if managed_job_dag is not None:
+                    # Add the managed job to job queue database.
+                    managed_job_codegen = managed_jobs.ManagedJobCodeGen()
+                    managed_job_code = managed_job_codegen.set_pending(
+                        job_id,
+                        managed_job_dag,
+                        skypilot_config.get_active_workspace(
+                            force_user_workspace=True),
+                        entrypoint=common_utils.get_current_command(),
+                        user_hash=managed_job_user_id)
+                    # Set the managed job to PENDING state to make sure that
+                    # this managed job appears in the `sky jobs queue`, even
+                    # if it needs to wait to be submitted.
+                    # We cannot set the managed job to PENDING state in the
+                    # job template (jobs-controller.yaml.j2), as it may need
+                    # to wait for the run commands to be scheduled on the job
+                    # controller in high-load cases.
+                    job_submit_cmd += ' && ' + managed_job_code
+                return job_submit_cmd
+
+            job_submit_cmd = _maybe_add_managed_job_code(job_submit_cmd)
 
-        returncode, stdout, stderr = self.run_on_head(handle,
-                                                      job_submit_cmd,
-                                                      stream_logs=False,
-                                                      require_outputs=True)
-        # Happens when someone calls `sky exec` but remote is outdated for
-        # running a job. Necessitating calling `sky launch`.
-        backend_utils.check_stale_runtime_on_remote(returncode, stderr,
-                                                    handle.cluster_name)
-        if returncode == 255 and 'too long' in stdout + stderr:
-            # If the generated script is too long, we retry it with dumping
-            # the script to a file and running it with SSH. We use a general
-            # length limit check before but it could be inaccurate on some
-            # systems.
-            logger.debug('Failed to submit job due to command length limit. '
-                         'Dumping job to file and running it with SSH.')
-            _dump_code_to_file(codegen)
-            job_submit_cmd = f'{mkdir_code} && {code}'
             returncode, stdout, stderr = self.run_on_head(handle,
                                                           job_submit_cmd,
                                                           stream_logs=False,
                                                           require_outputs=True)
+            # Happens when someone calls `sky exec` but remote is outdated for
+            # running a job. Necessitating calling `sky launch`.
+            backend_utils.check_stale_runtime_on_remote(returncode, stderr,
+                                                        handle.cluster_name)
+            output = stdout + stderr
+            if _is_message_too_long(returncode, output=output):
+                # If the job submit script is too long, we need to retry it
+                # with dumping the script to a file and running it the script
+                # on remote cluster instead.
+                logger.debug(
+                    'Failed to submit job due to command length limit. '
+                    'Dumping job to file and running it with SSH. '
+                    f'Output: {output}')
+                _dump_code_to_file(codegen)
+                job_submit_cmd = f'{mkdir_code} && {code}'
+                job_submit_cmd = _maybe_add_managed_job_code(job_submit_cmd)
+                returncode, stdout, stderr = self.run_on_head(
+                    handle,
+                    job_submit_cmd,
+                    stream_logs=False,
+                    require_outputs=True)
 
-
-
-
-
+            subprocess_utils.handle_returncode(
+                returncode,
+                job_submit_cmd,
+                f'Failed to submit job {job_id}.',
+                stderr=stdout + stderr)
 
         controller = controller_utils.Controllers.from_name(handle.cluster_name)
         if controller == controller_utils.Controllers.SKY_SERVE_CONTROLLER:
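Note: in the hunk above the gRPC path chooses between two shapes of QueueJobRequest: small job scripts travel inline in the request, while oversized ones are rsynced to the head node first and only the remote path is sent. A minimal sketch of that choice; a dataclass stands in for the jobsv1_pb2 message and the limit is an assumed value:

    # Sketch of the inline-vs-upload request choice, with stand-in types.
    from dataclasses import dataclass
    from typing import Optional

    LIMIT = 120_000  # assumed inline-codegen size limit

    @dataclass
    class QueueJobRequest:
        job_id: int
        script_path: str
        remote_log_dir: str
        codegen: Optional[str] = None  # unset => server expects uploaded script

    def upload(codegen: str, path: str) -> None:
        print(f'uploading {len(codegen)} bytes to {path}')  # stand-in for rsync

    def build_request(job_id: int, codegen: str) -> QueueJobRequest:
        script_path = f'/tmp/sky_app/sky_job_{job_id}'
        if len(codegen) > LIMIT:
            upload(codegen, script_path)  # push the script first
            return QueueJobRequest(job_id, script_path, '~/sky_logs/run-1')
        return QueueJobRequest(job_id, script_path, '~/sky_logs/run-1',
                               codegen=codegen)

    print(build_request(7, 'echo hi'))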
@@ -3518,53 +4401,74 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             logger.info(
                 ux_utils.starting_message(f'Job submitted, ID: {job_id}'))
             rich_utils.stop_safe_status()
-        if not detach_run:
-            if (handle.cluster_name == controller_utils.Controllers.
-                    JOBS_CONTROLLER.value.cluster_name):
-                self.tail_managed_job_logs(handle, job_id)
-            else:
-                # Sky logs. Not using subprocess.run since it will make the
-                # ssh keep connected after ctrl-c.
-                self.tail_logs(handle, job_id)
 
     def _add_job(self, handle: CloudVmRayResourceHandle,
-                 job_name: Optional[str], resources_str: str
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+                 job_name: Optional[str], resources_str: str,
+                 metadata: str) -> Tuple[int, str]:
+        use_legacy = not handle.is_grpc_enabled_with_flag
+
+        if not use_legacy:
+            try:
+                request = jobsv1_pb2.AddJobRequest(
+                    job_name=job_name,
+                    username=common_utils.get_user_hash(),
+                    run_timestamp=self.run_timestamp,
+                    resources_str=resources_str,
+                    metadata=metadata)
+                response = backend_utils.invoke_skylet_with_retries(
+                    lambda: SkyletClient(handle.get_grpc_channel()).add_job(
+                        request))
+                job_id = response.job_id
+                log_dir = response.log_dir
+                return job_id, log_dir
+            except exceptions.SkyletMethodNotImplementedError:
+                use_legacy = True
+
+        if use_legacy:
+            code = job_lib.JobLibCodeGen.add_job(
+                job_name=job_name,
+                username=common_utils.get_user_hash(),
+                run_timestamp=self.run_timestamp,
+                resources_str=resources_str,
+                metadata=metadata)
+            returncode, result_str, stderr = self.run_on_head(
+                handle,
+                code,
+                stream_logs=False,
+                require_outputs=True,
+                separate_stderr=True)
+            # Happens when someone calls `sky exec` but remote is outdated for
+            # adding a job. Necessitating calling `sky launch`.
+            backend_utils.check_stale_runtime_on_remote(returncode, stderr,
+                                                        handle.cluster_name)
+            # TODO(zhwu): this sometimes will unexpectedly fail, we can add
+            # retry for this, after we figure out the reason.
+            subprocess_utils.handle_returncode(returncode, code,
+                                               'Failed to fetch job id.',
+                                               stderr)
+            try:
+                job_id_match = _JOB_ID_PATTERN.search(result_str)
+                if job_id_match is not None:
+                    job_id = int(job_id_match.group(1))
+                else:
+                    # For backward compatibility.
+                    job_id = int(result_str)
+                log_dir_match = _LOG_DIR_PATTERN.search(result_str)
+                if log_dir_match is not None:
+                    log_dir = log_dir_match.group(1).strip()
+                else:
+                    # For backward compatibility, use the same log dir as local.
+                    log_dir = self.log_dir
+            except ValueError as e:
+                logger.error(stderr)
+                raise ValueError(f'Failed to parse job id: {result_str}; '
+                                 f'Returncode: {returncode}') from e
+            return job_id, log_dir
 
     def _execute(
         self,
         handle: CloudVmRayResourceHandle,
         task: task_lib.Task,
-        detach_run: bool,
         dryrun: bool = False,
     ) -> Optional[int]:
         """Executes the task on the cluster.
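Note: the legacy branch of `_add_job` above extracts the job id and log dir from the remote command's stdout with the module-level `_JOB_ID_PATTERN` and `_LOG_DIR_PATTERN` regexes, with integer-parse fallbacks for older runtimes. A minimal sketch of that parsing; the marker format in `result_str` is assumed for illustration, not taken from the diff:

    # Sketch of the legacy-output parsing, with an assumed marker format.
    import re

    _JOB_ID_PATTERN = re.compile(r'Job ID: (\d+)')
    _LOG_DIR_PATTERN = re.compile(r'Log Dir: (\S+)')

    result_str = 'Job ID: 42\nLog Dir: sky-2024-01-01-00-00-00-000000'

    job_id_match = _JOB_ID_PATTERN.search(result_str)
    # Older runtimes printed only the bare job id; fall back to int().
    job_id = int(job_id_match.group(1)) if job_id_match else int(result_str)

    log_dir_match = _LOG_DIR_PATTERN.search(result_str)
    log_dir = log_dir_match.group(1).strip() if log_dir_match else '~/sky_logs'

    print(job_id, log_dir)  # 42 sky-2024-01-01-00-00-00-000000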
@@ -3588,7 +4492,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             # In this case, we reset the resources for the task, so that the
             # detached setup does not need to wait for the task resources to be
             # ready (which is not used for setup anyway).
-            valid_resource =
+            valid_resource = resources_lib.Resources()
         else:
             # Check the task resources vs the cluster resources. Since
             # `sky exec` will not run the provision and _check_existing_cluster
@@ -3610,15 +4514,16 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             logger.info(f'Dryrun complete. Would have run:\n{task}')
             return None
 
-        job_id = self._add_job(handle, task_copy.name, resources_str
+        job_id, log_dir = self._add_job(handle, task_copy.name, resources_str,
+                                        task.metadata_json)
 
         num_actual_nodes = task.num_nodes * handle.num_ips_per_node
         # Case: task_lib.Task(run, num_nodes=N) or TPU VM Pods
         if num_actual_nodes > 1:
-            self._execute_task_n_nodes(handle, task_copy, job_id,
+            self._execute_task_n_nodes(handle, task_copy, job_id, log_dir)
         else:
             # Case: task_lib.Task(run, num_nodes=1)
-            self._execute_task_one_node(handle, task_copy, job_id,
+            self._execute_task_one_node(handle, task_copy, job_id, log_dir)
 
         return job_id
 
@@ -3674,16 +4579,13 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                 is_identity_mismatch_and_purge = True
             else:
                 raise
-
-
+        lock_id = backend_utils.cluster_status_lock_id(cluster_name)
+        lock = locks.get_lock(lock_id, timeout=1)
         # Retry in case new cluster operation comes in and holds the lock
         # right after the lock is removed.
         n_attempts = 2
         while True:
             n_attempts -= 1
-            # In case other running cluster operations are still holding the
-            # lock.
-            common_utils.remove_file_if_exists(lock_path)
             # We have to kill the cluster requests, because `down` and `stop`
             # should be higher priority than the cluster requests, and we should
             # release the lock from other requests.
@@ -3701,10 +4603,11 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                     'Failed to kill other launch requests for the '
                     f'cluster {handle.cluster_name}: '
                     f'{common_utils.format_exception(e, use_bracket=True)}')
+            # In case other running cluster operations are still holding the
+            # lock.
+            lock.force_unlock()
             try:
-                with
-                    lock_path,
-                    backend_utils.CLUSTER_STATUS_LOCK_TIMEOUT_SECONDS):
+                with lock:
                     self.teardown_no_lock(
                         handle,
                         terminate,
@@ -3717,14 +4620,14 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                         refresh_cluster_status=(
                             not is_identity_mismatch_and_purge))
                 if terminate:
-
+                    lock.force_unlock()
                 break
-            except
+            except locks.LockTimeout as e:
                 logger.debug(f'Failed to acquire lock for {cluster_name}, '
                              f'retrying...')
                 if n_attempts <= 0:
                     raise RuntimeError(
-                        f'Cluster {cluster_name!r} is locked by {
+                        f'Cluster {cluster_name!r} is locked by {lock_id}. '
                         'Check to see if it is still being launched') from e
 
     # --- CloudVMRayBackend Specific APIs ---
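Note: the hunks above replace raw lock-file deletion with a named lock object that supports `force_unlock()` and raises `LockTimeout`, retried a bounded number of times. A minimal runnable sketch of that acquire/force-unlock/retry shape; `DemoLock` approximates the `locks.get_lock` abstraction and its API is illustrative:

    # Sketch of the bounded lock-retry loop, with a stand-in lock type.
    import threading

    class LockTimeout(Exception):
        pass

    class DemoLock:
        def __init__(self, timeout: float):
            self._lock = threading.Lock()
            self._timeout = timeout

        def __enter__(self):
            if not self._lock.acquire(timeout=self._timeout):
                raise LockTimeout
            return self

        def __exit__(self, *exc):
            self._lock.release()

        def force_unlock(self):
            # Clear a holder left behind by a dead operation.
            if self._lock.locked():
                self._lock.release()

    lock = DemoLock(timeout=1)
    n_attempts = 2
    while True:
        n_attempts -= 1
        lock.force_unlock()
        try:
            with lock:
                print('teardown under lock')
            break
        except LockTimeout:
            if n_attempts <= 0:
                raise RuntimeError('cluster is locked; is it still launching?')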
@@ -3735,6 +4638,20 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         job_ids: Optional[List[int]] = None,
         stream_logs: bool = True
     ) -> Dict[Optional[int], Optional[job_lib.JobStatus]]:
+        if handle.is_grpc_enabled_with_flag:
+            try:
+                request = jobsv1_pb2.GetJobStatusRequest(job_ids=job_ids)
+                response = backend_utils.invoke_skylet_with_retries(
+                    lambda: SkyletClient(handle.get_grpc_channel()
+                                        ).get_job_status(request))
+                statuses: Dict[Optional[int], Optional[job_lib.JobStatus]] = {
+                    job_id: job_lib.JobStatus.from_protobuf(proto_status)
+                    for job_id, proto_status in response.job_statuses.items()
+                }
+                return statuses
+            except exceptions.SkyletMethodNotImplementedError:
+                pass
+
         code = job_lib.JobLibCodeGen.get_job_status(job_ids)
         returncode, stdout, stderr = self.run_on_head(handle,
                                                       code,
@@ -3755,16 +4672,32 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
 
         See `skylet.job_lib.cancel_jobs_encoded_results` for more details.
         """
-
-
-
-
-
-
-
-
-
-
+        use_legacy = not handle.is_grpc_enabled_with_flag
+
+        if not use_legacy:
+            try:
+                request = jobsv1_pb2.CancelJobsRequest(job_ids=jobs,
+                                                       cancel_all=cancel_all,
+                                                       user_hash=user_hash)
+                response = backend_utils.invoke_skylet_with_retries(
+                    lambda: SkyletClient(handle.get_grpc_channel()).cancel_jobs(
+                        request))
+                cancelled_ids = response.cancelled_job_ids
+            except exceptions.SkyletMethodNotImplementedError:
+                use_legacy = True
+
+        if use_legacy:
+            code = job_lib.JobLibCodeGen.cancel_jobs(jobs, cancel_all,
+                                                     user_hash)
+            returncode, stdout, _ = self.run_on_head(handle,
                                                      code,
                                                      stream_logs=False,
                                                      require_outputs=True)
+            subprocess_utils.handle_returncode(
+                returncode, code,
+                f'Failed to cancel jobs on cluster {handle.cluster_name}.',
+                stdout)
+            cancelled_ids = message_utils.decode_payload(stdout)
         if cancelled_ids:
             logger.info(
                 f'Cancelled job ID(s): {", ".join(map(str, cancelled_ids))}')
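Note: the gRPC path of `_query_job_status` above re-keys the RPC's `{job_id: proto_enum}` map into the client-side `job_lib.JobStatus` enum via `from_protobuf`. A minimal sketch of that conversion; the enum values are illustrative, not the real SkyPilot statuses:

    # Sketch of mapping protobuf status integers back to a client enum.
    import enum

    class JobStatus(enum.Enum):
        PENDING = 0
        RUNNING = 1
        SUCCEEDED = 2

        @classmethod
        def from_protobuf(cls, proto_status: int) -> 'JobStatus':
            return cls(proto_status)  # Enum lookup by value

    response_job_statuses = {1: 1, 2: 2}  # stand-in for response.job_statuses
    statuses = {
        job_id: JobStatus.from_protobuf(proto)
        for job_id, proto in response_job_statuses.items()
    }
    print(statuses)  # {1: JobStatus.RUNNING, 2: JobStatus.SUCCEEDED}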
@@ -3781,32 +4714,60 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         Returns:
             A dictionary mapping job_id to log path.
         """
-
-
-            handle,
-            code,
-            stream_logs=False,
-            require_outputs=True,
-            separate_stderr=True)
-        subprocess_utils.handle_returncode(returncode, code,
-                                           'Failed to sync logs.', stderr)
-        run_timestamps = message_utils.decode_payload(run_timestamps)
-        if not run_timestamps:
-            logger.info(f'{colorama.Fore.YELLOW}'
-                        'No matching log directories found'
-                        f'{colorama.Style.RESET_ALL}')
-            return {}
+        job_to_dir: Dict[str, str] = {}
+        use_legacy = not handle.is_grpc_enabled_with_flag
 
-
-
+        if not use_legacy:
+            try:
+                int_job_ids = []
+                if job_ids:
+                    for str_job_id in job_ids:
+                        if str_job_id.isdigit():
+                            int_job_ids.append(int(str_job_id))
+                request = jobsv1_pb2.GetLogDirsForJobsRequest(
+                    job_ids=int_job_ids)
+                response = backend_utils.invoke_skylet_with_retries(
+                    lambda: SkyletClient(handle.get_grpc_channel()
+                                        ).get_log_dirs_for_jobs(request))
+                job_log_dirs = response.job_log_dirs
+                if not job_log_dirs:
+                    logger.info(f'{colorama.Fore.YELLOW}'
+                                'No matching log directories found'
+                                f'{colorama.Style.RESET_ALL}')
+                    return {}
+                for job_id, log_dir in job_log_dirs.items():
+                    # Convert to string for backwards compatibility
+                    job_to_dir[str(job_id)] = log_dir
+            except exceptions.SkyletMethodNotImplementedError:
+                use_legacy = True
+
+        if use_legacy:
+            code = job_lib.JobLibCodeGen.get_log_dirs_for_jobs(job_ids)
+            returncode, stdout, stderr = self.run_on_head(handle,
+                                                          code,
+                                                          stream_logs=False,
+                                                          require_outputs=True,
+                                                          separate_stderr=True)
+            subprocess_utils.handle_returncode(returncode, code,
+                                               'Failed to sync logs.', stderr)
+            job_to_dir = message_utils.decode_payload(stdout)
+            if not job_to_dir:
+                logger.info(f'{colorama.Fore.YELLOW}'
+                            'No matching log directories found'
+                            f'{colorama.Style.RESET_ALL}')
+                return {}
+
+        job_ids = list(job_to_dir.keys())
+        dirs = list(job_to_dir.values())
         remote_log_dirs = [
-
-
-
-
-            os.path.join(local_dir, run_timestamp)
-            for run_timestamp in run_timestamps
+            # TODO(aylei): backward compatibility for legacy runtime that
+            # returns run_timestamp only, remove after 0.12.0
+            (dir if constants.SKY_LOGS_DIRECTORY in dir else os.path.join(
+                constants.SKY_LOGS_DIRECTORY, dir)) for dir in dirs
         ]
+        local_log_dirs = [(dir.replace(constants.SKY_LOGS_DIRECTORY, local_dir)
+                           if constants.SKY_LOGS_DIRECTORY in dir else
+                           os.path.join(local_dir, dir)) for dir in dirs]
 
         runners = handle.get_command_runners()
 
@@ -3842,12 +4803,17 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         subprocess_utils.run_in_parallel(_rsync_down, parallel_args)
         return dict(zip(job_ids, local_log_dirs))
 
-
-
-
-
-
-
+    @context_utils.cancellation_guard
+    def tail_logs(
+            self,
+            handle: CloudVmRayResourceHandle,
+            job_id: Optional[int],
+            managed_job_id: Optional[int] = None,
+            follow: bool = True,
+            tail: int = 0,
+            require_outputs: bool = False,
+            stream_logs: bool = True,
+            process_stream: bool = False) -> Union[int, Tuple[int, str, str]]:
         """Tail the logs of a job.
 
         Args:
@@ -3857,11 +4823,36 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             follow: Whether to follow the logs.
             tail: The number of lines to display from the end of the
                 log file. If 0, print all lines.
+            require_outputs: Whether to return the stdout/stderr of the command.
+            stream_logs: Whether to stream the logs to stdout/stderr.
+            process_stream: Whether to process the stream.
 
         Returns:
             The exit code of the tail command. Returns code 100 if the job has
             failed. See exceptions.JobExitCode for possible return codes.
         """
+        if handle.is_grpc_enabled_with_flag:
+            last_exit_code = 0
+            try:
+                request = jobsv1_pb2.TailLogsRequest(
+                    job_id=job_id,
+                    managed_job_id=managed_job_id,
+                    follow=follow,
+                    tail=tail)
+                for resp in backend_utils.invoke_skylet_streaming_with_retries(
+                        lambda: SkyletClient(handle.get_grpc_channel()
+                                            ).tail_logs(request, timeout=None)):
+                    if resp.log_line:
+                        print(resp.log_line, end='', flush=True)
+                    last_exit_code = resp.exit_code
+                return last_exit_code
+            except exceptions.SkyletMethodNotImplementedError:
+                pass
+            except grpc.RpcError as e:
+                if e.code() == grpc.StatusCode.CANCELLED:
+                    return last_exit_code
+                raise e
+
         code = job_lib.JobLibCodeGen.tail_logs(job_id,
                                                managed_job_id=managed_job_id,
                                                follow=follow,
@@ -3876,29 +4867,32 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         signal.signal(signal.SIGINT, backend_utils.interrupt_handler)
         signal.signal(signal.SIGTSTP, backend_utils.stop_handler)
         try:
-
+            final = self.run_on_head(
                 handle,
                 code,
-                stream_logs=
-                process_stream=
+                stream_logs=stream_logs,
+                process_stream=process_stream,
+                require_outputs=require_outputs,
                 # Allocate a pseudo-terminal to disable output buffering.
                 # Otherwise, there may be 5 minutes delay in logging.
                 ssh_mode=command_runner.SshMode.INTERACTIVE,
             )
         except SystemExit as e:
-
-            return
+            final = e.code
+        return final
 
     def tail_managed_job_logs(self,
                               handle: CloudVmRayResourceHandle,
                               job_id: Optional[int] = None,
                               job_name: Optional[str] = None,
                               controller: bool = False,
-                              follow: bool = True
+                              follow: bool = True,
+                              tail: Optional[int] = None) -> int:
         # if job_name is not None, job_id should be None
         assert job_name is None or job_id is None, (job_name, job_id)
+        # TODO(kevin): Migrate stream_logs to gRPC
         code = managed_jobs.ManagedJobCodeGen.stream_logs(
-            job_name, job_id, follow, controller)
+            job_name, job_id, follow, controller, tail)
 
         # With the stdin=subprocess.DEVNULL, the ctrl-c will not directly
         # kill the process, so we need to handle it manually here.
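Note: the gRPC branch of `tail_logs` above consumes a server-side stream, printing each log line as it arrives, remembering the latest exit code, and treating a client-side CANCELLED status (e.g. ctrl-c) as a clean stop rather than an error. A minimal sketch of that consumer loop; the response type and generator are stand-ins for the gRPC stream:

    # Sketch of the streaming log consumer, with stand-in stream types.
    from dataclasses import dataclass
    from typing import Iterator

    class StreamCancelled(Exception):  # stands in for the CANCELLED status
        pass

    @dataclass
    class TailResp:
        log_line: str
        exit_code: int

    def stream() -> Iterator[TailResp]:
        yield TailResp('job started\n', 0)
        yield TailResp('job finished\n', 0)

    def tail_logs() -> int:
        last_exit_code = 0
        try:
            for resp in stream():
                if resp.log_line:
                    print(resp.log_line, end='', flush=True)
                last_exit_code = resp.exit_code
        except StreamCancelled:
            pass  # ctrl-c on the client cancels the stream; not an error
        return last_exit_code

    print('exit:', tail_logs())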
@@ -3942,20 +4936,37 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         assert job_name is None or job_id is None, (job_name, job_id)
 
         if job_id is None:
-            #
+            # get the job_id
             # if job_name is None, get all job_ids
             # TODO: Only get the latest job_id, since that's the only one we use
-
-
-
-
-
-
-
-
-
-
-
+
+            use_legacy = not handle.is_grpc_enabled_with_flag
+            logger.info(f'handle.is_grpc_enabled_with_flag: '
+                        f'{handle.is_grpc_enabled_with_flag}')
+            if not use_legacy:
+                try:
+                    request = managed_jobsv1_pb2.GetAllJobIdsByNameRequest(
+                        job_name=job_name)
+                    response = backend_utils.invoke_skylet_with_retries(
+                        lambda: SkyletClient(handle.get_grpc_channel(
+                        )).get_all_managed_job_ids_by_name(request))
+                    job_ids = list(response.job_ids)
+                except exceptions.SkyletMethodNotImplementedError:
+                    use_legacy = True
+
+            if use_legacy:
+                code = managed_jobs.ManagedJobCodeGen.get_all_job_ids_by_name(
+                    job_name=job_name)
+                returncode, job_ids_payload, stderr = self.run_on_head(
+                    handle,
+                    code,
+                    stream_logs=False,
+                    require_outputs=True,
+                    separate_stderr=True)
+                subprocess_utils.handle_returncode(returncode, code,
+                                                   'Failed to sync down logs.',
+                                                   stderr)
+                job_ids = message_utils.decode_payload(job_ids_payload)
             if not job_ids:
                 logger.info(f'{colorama.Fore.YELLOW}'
                             'No matching job found'
@@ -3974,20 +4985,48 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             # list should aready be in descending order
             job_id = job_ids[0]
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        if isinstance(handle, LocalResourcesHandle):
+            # In consolidation mode, we don't submit a ray job, therefore no
+            # run_timestamp is available. We use a dummy run_timestamp here.
+            run_timestamps = {
+                job_id: f'managed-jobs-consolidation-mode-{job_id}'
+            }
+        else:
+            # get the run_timestamp
+            # the function takes in [job_id]
+            use_legacy = not handle.is_grpc_enabled_with_flag
+            if not use_legacy:
+                try:
+                    log_dirs_request = jobsv1_pb2.GetLogDirsForJobsRequest(
+                        job_ids=[job_id])
+                    log_dirs_response = (
+                        backend_utils.invoke_skylet_with_retries(
+                            lambda: SkyletClient(handle.get_grpc_channel(
+                            )).get_log_dirs_for_jobs(log_dirs_request)))
+                    job_log_dirs = log_dirs_response.job_log_dirs
+                    # Convert back to the expected format
+                    # {job_id: run_timestamp}
+                    run_timestamps = {}
+                    for jid, log_dir in job_log_dirs.items():
+                        run_timestamps[int(jid)] = log_dir
+                except exceptions.SkyletMethodNotImplementedError:
+                    use_legacy = True
+
+            if use_legacy:
+                code = job_lib.JobLibCodeGen.get_log_dirs_for_jobs(
+                    [str(job_id)])
+                returncode, run_timestamps_payload, stderr = self.run_on_head(
+                    handle,
+                    code,
+                    stream_logs=False,
+                    require_outputs=True,
+                    separate_stderr=True)
+                subprocess_utils.handle_returncode(returncode, code,
+                                                   'Failed to sync logs.',
+                                                   stderr)
+                # returns with a dict of {job_id: run_timestamp}
+                run_timestamps = message_utils.decode_payload(
+                    run_timestamps_payload)
         if not run_timestamps:
             logger.info(f'{colorama.Fore.YELLOW}'
                         'No matching log directories found'
@@ -3996,11 +5035,19 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
 
         run_timestamp = list(run_timestamps.values())[0]
         job_id = list(run_timestamps.keys())[0]
+
+        # If run_timestamp contains the full path with SKY_LOGS_DIRECTORY,
+        # strip the prefix to get just the relative part to avoid duplication
+        # when constructing local paths.
+        if run_timestamp.startswith(constants.SKY_LOGS_DIRECTORY):
+            run_timestamp = run_timestamp[len(constants.SKY_LOGS_DIRECTORY
+                                             ):].lstrip('/')
         local_log_dir = ''
         if controller:  # download controller logs
             remote_log = os.path.join(managed_jobs.JOBS_CONTROLLER_LOGS_DIR,
                                       f'{job_id}.log')
-            local_log_dir = os.path.join(local_dir,
+            local_log_dir = os.path.join(local_dir, 'managed_jobs',
+                                         run_timestamp)
             os.makedirs(os.path.dirname(os.path.expanduser(local_log_dir)),
                         exist_ok=True)
 
@@ -4046,11 +5093,12 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                                 exist_ok=True)
         log_file = os.path.join(local_log_dir, 'run.log')
 
-
-
-
-
-
+        # TODO(kevin): Migrate stream_logs to gRPC
+        code = managed_jobs.ManagedJobCodeGen.stream_logs(
+            job_name=None,
+            job_id=int(job_id),
+            follow=False,
+            controller=False)
         # With the stdin=subprocess.DEVNULL, the ctrl-c will not
         # kill the process, so we need to handle it manually here.
         if threading.current_thread() is threading.main_thread():
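Note: the first hunk above normalizes the returned log directory: newer runtimes return the full remote path (including the SKY_LOGS_DIRECTORY prefix), while older ones return just the run_timestamp, so the prefix is stripped before joining with the local download dir. A minimal sketch; the directory constant's value is illustrative:

    # Sketch of the run_timestamp prefix stripping, with an assumed constant.
    SKY_LOGS_DIRECTORY = '~/sky_logs'

    def normalize(run_timestamp: str) -> str:
        if run_timestamp.startswith(SKY_LOGS_DIRECTORY):
            run_timestamp = run_timestamp[len(SKY_LOGS_DIRECTORY):].lstrip('/')
        return run_timestamp

    print(normalize('~/sky_logs/sky-2024-01-01'))  # sky-2024-01-01
    print(normalize('sky-2024-01-01'))             # unchanged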
@@ -4091,6 +5139,15 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         Raises:
             RuntimeError: If the cluster fails to be terminated/stopped.
         """
+        try:
+            handle.close_skylet_ssh_tunnel()
+        except Exception as e:  # pylint: disable=broad-except
+            # Not critical to the cluster teardown, just log a warning.
+            logger.warning(
+                'Failed to close Skylet SSH tunnel for cluster '
+                f'{handle.cluster_name}: '
+                f'{common_utils.format_exception(e, use_bracket=True)}')
+
         exclude_request_to_kill = 'sky.down' if terminate else 'sky.stop'
         # We have to kill the cluster requests again within the lock, because
         # any pending requests on the same cluster should be cancelled after
@@ -4116,7 +5173,19 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                 prev_cluster_status, _ = (
                     backend_utils.refresh_cluster_status_handle(
                         handle.cluster_name,
-
+                        # There is a case where
+                        # 1. The cluster was interrupted during provisioning.
+                        # 2. The API request to create the cluster instances was
+                        #    sent to the cloud, but hasn't been processed yet.
+                        # In this case, the cluster will be INIT. We should do a
+                        # hard status refresh to see if the instances are
+                        # actually there or not. Otherwise, teardown may not
+                        # find the instances, leading to a leak. This was
+                        # observed in AWS. See also
+                        # _LAUNCH_DOUBLE_CHECK_WINDOW in backend_utils.py.
+                        force_refresh_statuses={status_lib.ClusterStatus.INIT},
+                        cluster_lock_already_held=True,
+                        retry_if_missing=False))
                 cluster_status_fetched = True
             except exceptions.ClusterStatusFetchingError:
                 logger.warning(
@@ -4124,10 +5193,9 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                     f'{handle.cluster_name!r}. Assuming the cluster is still '
                     'up.')
             if not cluster_status_fetched:
-
+                status = global_user_state.get_status_from_cluster_name(
                     handle.cluster_name)
-                prev_cluster_status =
-                'status'] if record is not None else None
+                prev_cluster_status = status if status is not None else None
             if prev_cluster_status is None:
                 # When the cluster is not in the cluster table, we guarantee that
                 # all related resources / cache / config are cleaned up, i.e. it
@@ -4148,8 +5216,9 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         log_path = os.path.join(os.path.expanduser(self.log_dir),
                                 'teardown.log')
         log_abs_path = os.path.abspath(log_path)
-
-
+        launched_resources = handle.launched_resources.assert_launchable()
+        cloud = launched_resources.cloud
+        config = global_user_state.get_cluster_yaml_dict(handle.cluster_yaml)
         cluster_name = handle.cluster_name
         cluster_name_on_cloud = handle.cluster_name_on_cloud
 
@@ -4209,7 +5278,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             from sky.adaptors import ibm
             from sky.skylet.providers.ibm.vpc_provider import IBMVPCProvider
 
-            config_provider =
+            config_provider = global_user_state.get_cluster_yaml_dict(
                 handle.cluster_yaml)['provider']
             region = config_provider['region']
             search_client = ibm.search_client()
@@ -4238,36 +5307,13 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             # successfully removed cluster as no exception was raised
             returncode = 0
 
-        elif terminate and isinstance(cloud, clouds.SCP):
-            # pylint: disable=import-outside-toplevel
-            from sky.skylet.providers.scp import node_provider
-            config['provider']['cache_stopped_nodes'] = not terminate
-            provider = node_provider.SCPNodeProvider(config['provider'],
-                                                     cluster_name_on_cloud)
-            try:
-                if not os.path.exists(provider.metadata.path):
-                    raise node_provider.SCPError(
-                        'SKYPILOT_ERROR_NO_NODES_LAUNCHED: '
-                        'Metadata file does not exist.')
-
-                with open(provider.metadata.path, 'r', encoding='utf-8') as f:
-                    metadata = json.load(f)
-                    node_id = next(iter(metadata.values())).get(
-                        'creation', {}).get('virtualServerId', None)
-                    provider.terminate_node(node_id)
-                returncode = 0
-            except node_provider.SCPError as e:
-                returncode = 1
-                stdout = ''
-                stderr = str(e)
-
         else:
             config['provider']['cache_stopped_nodes'] = not terminate
             with tempfile.NamedTemporaryFile('w',
                                              prefix='sky_',
                                              delete=False,
                                              suffix='.yml') as f:
-
+                yaml_utils.dump_yaml(f.name, config)
                 f.flush()
 
             teardown_verb = 'Terminating' if terminate else 'Stopping'
@@ -4322,12 +5368,17 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                               handle: CloudVmRayResourceHandle,
                               terminate: bool,
                               purge: bool = False,
-                              remove_from_db: bool = True
+                              remove_from_db: bool = True,
+                              failover: bool = False) -> None:
         """Cleanup local configs/caches and delete TPUs after teardown.
 
         This method will handle the following cleanup steps:
         * Deleting the TPUs;
         * Removing ssh configs for the cluster;
+        * Deleting the open ports;
+        * Deleting the custom multi network infrastructure based on the
+          failover flag (e.g. delete firewalls, subnets, and VPCs for GPU
+          Direct if failover is False, otherwise, only delete the subnets);
         * Updating the local state of the cluster;
         * Removing the terminated cluster's scripts and ray yaml files.
         """
@@ -4359,19 +5410,24 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         # The cluster yaml does not exist when skypilot has not found
         # the right resource to provision the cluster.
         if handle.cluster_yaml is not None:
+            launched_resources = (
+                handle.launched_resources.assert_launchable())
+            cloud = launched_resources.cloud
+            config = global_user_state.get_cluster_yaml_dict(
+                handle.cluster_yaml)
+            ports_cleaned_up = False
+            custom_multi_network_cleaned_up = False
             try:
-                cloud = handle.launched_resources.cloud
-                config = common_utils.read_yaml(handle.cluster_yaml)
                 cloud.check_features_are_supported(
-
+                    launched_resources,
                     {clouds.CloudImplementationFeatures.OPEN_PORTS})
                 provision_lib.cleanup_ports(repr(cloud),
                                             cluster_name_on_cloud,
                                             handle.launched_resources.ports,
                                             config['provider'])
-
+                ports_cleaned_up = True
             except exceptions.NotSupportedError:
-
+                ports_cleaned_up = True
             except exceptions.PortDoesNotExistError:
                 logger.debug('Ports do not exist. Skipping cleanup.')
             except Exception as e:  # pylint: disable=broad-except
@@ -4383,8 +5439,43 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                 else:
                     raise
 
-
-
+            # Clean up custom multi networks, e.g. the subnets, firewalls,
+            # and VPCs created for GCP GPUDirect TCPX
+            try:
+                cloud.check_features_are_supported(
+                    handle.launched_resources, {
+                        clouds.CloudImplementationFeatures.
+                        CUSTOM_MULTI_NETWORK
+                    })
+                provision_lib.cleanup_custom_multi_network(
+                    repr(cloud), cluster_name_on_cloud, config['provider'],
+                    failover)
+                custom_multi_network_cleaned_up = True
+            except exceptions.NotSupportedError:
+                custom_multi_network_cleaned_up = True
+            except Exception as e:  # pylint: disable=broad-except
+                if purge:
+                    msg = common_utils.format_exception(e, use_bracket=True)
+                    logger.warning(
+                        f'Failed to cleanup custom multi network. Skipping '
+                        f'since purge is set. Details: {msg}')
+                else:
+                    raise
+
+            if ports_cleaned_up and custom_multi_network_cleaned_up:
+                try:
+                    self.remove_cluster_config(handle)
+                except Exception as e:  # pylint: disable=broad-except
+                    if purge:
+                        msg = common_utils.format_exception(
+                            e, use_bracket=True)
+                        logger.warning(
+                            f'Failed to remove cluster config. Skipping '
+                            f'since purge is set. Details: {msg}')
+                    else:
+                        raise
+
+        cluster_utils.SSHConfigHelper.remove_cluster(handle.cluster_name)
 
     def _detect_abnormal_non_terminated_nodes(
             handle: CloudVmRayResourceHandle) -> None:
@@ -4400,18 +5491,22 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         # https://github.com/skypilot-org/skypilot/pull/4443#discussion_r1872798032
         attempts = 0
         while True:
-            config =
+            config = global_user_state.get_cluster_yaml_dict(
+                handle.cluster_yaml)
 
             logger.debug(f'instance statuses attempt {attempts + 1}')
             node_status_dict = provision_lib.query_instances(
                 repr(cloud),
+                handle.cluster_name,
                 cluster_name_on_cloud,
                 config['provider'],
                 non_terminated_only=False)
 
             unexpected_node_state: Optional[Tuple[str, str]] = None
-            for node_id,
-
+            for node_id, node_status_tuple in node_status_dict.items():
+                node_status, reason = node_status_tuple
+                reason = '' if reason is None else f' ({reason})'
+                logger.debug(f'{node_id} status: {node_status}{reason}')
                 # FIXME(cooperc): Some clouds (e.g. GCP) do not distinguish
                 # between "stopping/stopped" and "terminating/terminated",
                 # so we allow for either status instead of casing on
@@ -4456,13 +5551,18 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
 
     def remove_cluster_config(self, handle: CloudVmRayResourceHandle) -> None:
         """Remove the YAML config of a cluster."""
+        cluster_yaml_path = handle.cluster_yaml
         handle.cluster_yaml = None
         global_user_state.update_cluster_handle(handle.cluster_name, handle)
-
+        # Removing the cluster YAML can cause some unexpected stability issues.
+        # See #5011.
+        # global_user_state.remove_cluster_yaml(handle.cluster_name)
+        common_utils.remove_file_if_exists(cluster_yaml_path)
 
     def set_autostop(self,
                      handle: CloudVmRayResourceHandle,
                      idle_minutes_to_autostop: Optional[int],
+                     wait_for: Optional[autostop_lib.AutostopWaitFor],
                      down: bool = False,
                      stream_logs: bool = True) -> None:
         # The core.autostop() function should have already checked that the
@@ -4489,6 +5589,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
 
             # down = False is the default, but warn the user in case
             # they have explicitly specified it.
+            # TODO(cooperc): Fix for new autostop stuff.
             config_override_down = skypilot_config.get_nested(
                 (controller.value.controller_type, 'controller',
                  'autostop', 'down'), None)
@@ -4508,17 +5609,26 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
|
|
|
4508
5609
|
# Check if we're stopping spot
|
|
4509
5610
|
assert (handle.launched_resources is not None and
|
|
4510
5611
|
handle.launched_resources.cloud is not None), handle
|
|
4511
|
-
|
|
4512
|
-
|
|
4513
|
-
|
|
4514
|
-
|
|
4515
|
-
|
|
4516
|
-
|
|
4517
|
-
|
|
4518
|
-
|
|
4519
|
-
|
|
4520
|
-
|
|
4521
|
-
|
|
5612
|
+
if handle.is_grpc_enabled_with_flag:
|
|
5613
|
+
request = autostopv1_pb2.SetAutostopRequest(
|
|
5614
|
+
idle_minutes=idle_minutes_to_autostop,
|
|
5615
|
+
backend=self.NAME,
|
|
5616
|
+
wait_for=wait_for.to_protobuf() if wait_for is not None else
|
|
5617
|
+
autostopv1_pb2.AUTOSTOP_WAIT_FOR_UNSPECIFIED,
|
|
5618
|
+
down=down,
|
|
5619
|
+
)
|
|
5620
|
+
backend_utils.invoke_skylet_with_retries(lambda: SkyletClient(
|
|
5621
|
+
handle.get_grpc_channel()).set_autostop(request))
|
|
5622
|
+
else:
|
|
5623
|
+
code = autostop_lib.AutostopCodeGen.set_autostop(
|
|
5624
|
+
idle_minutes_to_autostop, self.NAME, wait_for, down)
|
|
5625
|
+
returncode, _, stderr = self.run_on_head(
|
|
5626
|
+
handle, code, require_outputs=True, stream_logs=stream_logs)
|
|
5627
|
+
subprocess_utils.handle_returncode(returncode,
|
|
5628
|
+
code,
|
|
5629
|
+
'Failed to set autostop',
|
|
5630
|
+
stderr=stderr,
|
|
5631
|
+
stream_logs=stream_logs)
|
|
4522
5632
|
global_user_state.set_cluster_autostop_value(
|
|
4523
5633
|
handle.cluster_name, idle_minutes_to_autostop, down)
|
|
4524
5634
|
|
|
@@ -4543,22 +5653,33 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             # The head node of the cluster is not UP or in an abnormal state.
             # We cannot check if the cluster is autostopping.
             return False
-
-
-
-
-
-
-
-
-
-
-
-
+        if handle.is_grpc_enabled_with_flag:
+            try:
+                request = autostopv1_pb2.IsAutostoppingRequest()
+                response = backend_utils.invoke_skylet_with_retries(
+                    lambda: SkyletClient(handle.get_grpc_channel()
+                                         ).is_autostopping(request))
+                return response.is_autostopping
+            except Exception as e:  # pylint: disable=broad-except
+                # The cluster may have been terminated, causing the gRPC call
+                # to timeout and fail.
+                logger.debug(f'Failed to check if cluster is autostopping: {e}')
+                return False
+        else:
+            code = autostop_lib.AutostopCodeGen.is_autostopping()
+            returncode, stdout, stderr = self.run_on_head(
+                handle, code, require_outputs=True, stream_logs=stream_logs)
+            if returncode == 0:
+                return message_utils.decode_payload(stdout)
+            logger.debug('Failed to check if cluster is autostopping with '
+                         f'{returncode}: {stdout+stderr}\n'
+                         f'Command: {code}')
+            return False

     # TODO(zhwu): Refactor this to a CommandRunner class, so different backends
     # can support its own command runner.
     @timeline.event
+    @context_utils.cancellation_guard
     def run_on_head(
             self,
             handle: CloudVmRayResourceHandle,
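Both gRPC calls above go through backend_utils.invoke_skylet_with_retries, and is_autostopping deliberately swallows transport errors (a terminated cluster makes the RPC time out) and reports False. A hypothetical sketch of such a retry helper, not the real backend_utils implementation:

import time
from typing import Callable, TypeVar

T = TypeVar('T')


def invoke_with_retries(fn: Callable[[], T],
                        max_attempts: int = 3,
                        backoff_seconds: float = 0.5) -> T:
    # Retry a zero-arg callable with exponential backoff, re-raising the
    # last error once attempts are exhausted.
    last_exc: Exception = RuntimeError('unreachable')
    for attempt in range(max_attempts):
        try:
            return fn()
        except Exception as exc:  # pylint: disable=broad-except
            last_exc = exc
            time.sleep(backoff_seconds * (2**attempt))
    raise last_exc


# Usage mirrors the diff: the RPC is deferred behind a lambda.
print(invoke_with_retries(lambda: 'ok'))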
@@ -4649,7 +5770,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             exceptions.InvalidClusterNameError: If the cluster name is invalid.
             # TODO(zhwu): complete the list of exceptions.
         """
-        record = global_user_state.get_cluster_from_name(
+        record = global_user_state.get_cluster_from_name(
+            cluster_name, include_user_info=False, summary_response=True)
         if record is None:
             handle_before_refresh = None
             status_before_refresh = None
@@ -4657,6 +5779,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             handle_before_refresh = record['handle']
             status_before_refresh = record['status']

+        handle: Optional[CloudVmRayResourceHandle]
         prev_cluster_status, handle = (status_before_refresh,
                                        handle_before_refresh)

@@ -4668,7 +5791,9 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             record = backend_utils.refresh_cluster_record(
                 cluster_name,
                 force_refresh_statuses={status_lib.ClusterStatus.INIT},
-
+                cluster_lock_already_held=True,
+                include_user_info=False,
+                summary_response=True,
             )
             if record is not None:
                 prev_cluster_status = record['status']
@@ -4677,7 +5802,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                 prev_cluster_status = None
                 handle = None
         # We should check the cluster_ever_up after refresh, because if the
-        # cluster is terminated (through console or auto-
+        # cluster is terminated (through console or auto-down), the record will
         # become None and the cluster_ever_up should be considered as False.
         cluster_ever_up = record is not None and record['cluster_ever_up']
         prev_config_hash = record['config_hash'] if record is not None else None
@@ -4690,16 +5815,22 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             self.check_resources_fit_cluster(handle, task)
             # Use the existing cluster.
             assert handle.launched_resources is not None, (cluster_name, handle)
+            # Take a random resource in order to get resource info that applies
+            # to all resources.
+            one_task_resource = list(task.resources)[0]
+
             # Assume resources share the same ports.
             for resource in task.resources:
-                assert resource.ports ==
+                assert resource.ports == one_task_resource.ports
             requested_ports_set = resources_utils.port_ranges_to_set(
-
+                one_task_resource.ports)
             current_ports_set = resources_utils.port_ranges_to_set(
                 handle.launched_resources.ports)
             all_ports = resources_utils.port_set_to_ranges(current_ports_set |
                                                            requested_ports_set)
             to_provision = handle.launched_resources
+            assert to_provision is not None
+            to_provision = to_provision.assert_launchable()
             if (to_provision.cloud.OPEN_PORTS_VERSION <=
                     clouds.OpenPortsVersion.LAUNCH_ONLY):
                 if not requested_ports_set <= current_ports_set:
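The ports logic above round-trips port ranges through sets so the newly requested ports can be unioned with those already open on the cluster. An illustrative re-implementation of that round-trip (the real helpers live in resources_utils and may differ):

from typing import List, Set


def port_ranges_to_set(ranges: List[str]) -> Set[int]:
    # Expand range strings like '9000-9002' into individual port numbers.
    ports: Set[int] = set()
    for r in ranges:
        if '-' in r:
            lo, hi = r.split('-')
            ports.update(range(int(lo), int(hi) + 1))
        else:
            ports.add(int(r))
    return ports


def port_set_to_ranges(ports: Set[int]) -> List[str]:
    # Collapse consecutive ports back into compact range strings.
    ranges: List[str] = []
    for p in sorted(ports):
        if ranges and int(ranges[-1].split('-')[-1]) == p - 1:
            lo = ranges[-1].split('-')[0]
            ranges[-1] = f'{lo}-{p}'
        else:
            ranges.append(str(p))
    return ranges


current = port_ranges_to_set(['8080', '9000-9002'])
requested = port_ranges_to_set(['8081'])
print(port_set_to_ranges(current | requested))  # ['8080-8081', '9000-9002']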
@@ -4713,6 +5844,57 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                         'a new cluster with the desired ports open.')
             if all_ports:
                 to_provision = to_provision.copy(ports=all_ports)
+            # Docker login should always be the same for all resources, since
+            # it's set from envs.
+            for resource in task.resources:
+                assert (resource.docker_login_config ==
+                        one_task_resource.docker_login_config), (
+                            resource.docker_login_config,
+                            one_task_resource.docker_login_config)
+            # If we have docker login config in the new task, override the
+            # existing resources to pick up new credentials. This allows the
+            # user to specify new or fixed credentials if the existing
+            # credentials are not working. If we don't do this, the credentials
+            # from the existing resources will always be reused.
+            if one_task_resource.docker_login_config is not None:
+                to_provision = to_provision.copy(
+                    _docker_login_config=one_task_resource.docker_login_config)
+
+            # cluster_config_overrides should be the same for all resources.
+            for resource in task.resources:
+                assert (resource.cluster_config_overrides ==
+                        one_task_resource.cluster_config_overrides)
+            if isinstance(to_provision.cloud, clouds.Kubernetes):
+                # Warn users if the Kubernetes pod config is different
+                # from the existing cluster.
+                cluster_yaml_str = global_user_state.get_cluster_yaml_str(
+                    cluster_name)
+                actual_cluster_yaml_obj = yaml_utils.safe_load(cluster_yaml_str)
+                desired_cluster_yaml_obj = (
+                    kubernetes_utils.combine_pod_config_fields_and_metadata(
+                        actual_cluster_yaml_obj,
+                        cluster_config_overrides=one_task_resource.
+                        cluster_config_overrides,
+                        cloud=to_provision.cloud,
+                        context=to_provision.region))
+
+                def _get_pod_config(yaml_obj: Dict[str, Any]) -> Dict[str, Any]:
+                    return (yaml_obj.get('available_node_types',
+                                         {}).get('ray_head_default',
+                                                 {}).get('node_config', {}))
+
+                if _get_pod_config(desired_cluster_yaml_obj) != _get_pod_config(
+                        actual_cluster_yaml_obj):
+                    # pylint: disable=line-too-long
+                    logger.warning(
+                        f'{colorama.Fore.YELLOW}WARNING: Kubernetes pod config mismatch detected. Task requires different '
+                        f'pod config than the existing cluster. The existing '
+                        f'cluster will be used with its current pod config. '
+                        f'To apply your task\'s new pod config:\n'
+                        f'  • Use a new cluster\n'
+                        f'  • Or restart this cluster: sky down {cluster_name}; sky launch -c {cluster_name} ...'
+                        f'{colorama.Style.RESET_ALL}')
+
             return RetryingVmProvisioner.ToProvisionConfig(
                 cluster_name,
                 to_provision,
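The Kubernetes warning above compares only the available_node_types.ray_head_default.node_config subtree of the desired and actual cluster YAMLs. A toy demonstration of that nested lookup, using made-up YAML content:

from typing import Any, Dict


def get_pod_config(yaml_obj: Dict[str, Any]) -> Dict[str, Any]:
    # Same chained .get() walk as _get_pod_config in the diff.
    return (yaml_obj.get('available_node_types',
                         {}).get('ray_head_default',
                                 {}).get('node_config', {}))


actual = {
    'available_node_types': {
        'ray_head_default': {
            'node_config': {'metadata': {'labels': {'team': 'infra'}}}
        }
    }
}
desired = {'available_node_types': {'ray_head_default': {'node_config': {}}}}

if get_pod_config(desired) != get_pod_config(actual):
    print('pod config mismatch: existing cluster config will be kept')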
@@ -4727,33 +5909,41 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         common_utils.check_cluster_name_is_valid(cluster_name)

         if to_provision is None:
-            #
-            #
-            #
-            #
-            #
-            #
-            #
-
-
-
-
-                handle_before_refresh,
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+            # Recently terminated after refresh. OPTIMIZE usually ran outside
+            # the lock, so that decision may be stale by now. Under the lock,
+            # ensure we always have a concrete plan via the following order:
+            # 1) Reuse last placement snapshot (if available);
+            # 2) Else, call injected planner for a fresh plan.
+            # If we still have a pre-refresh handle snapshot with a concrete
+            # placement, prefer reusing it.
+            if (isinstance(handle_before_refresh, CloudVmRayResourceHandle) and
+                    handle_before_refresh.launched_resources is not None):
+                to_provision = handle_before_refresh.launched_resources
+                # Ensure the requested task fits the previous placement.
+                self.check_resources_fit_cluster(handle_before_refresh, task)
+                # Mirror the original message for reuse path.
+                status_before_refresh_str = None
+                if status_before_refresh is not None:
+                    status_before_refresh_str = status_before_refresh.value
+                logger.info(
+                    f'The cluster {cluster_name!r} (status: '
+                    f'{status_before_refresh_str}) was not found on the cloud: '
+                    'it may be autodowned, manually terminated, or its launch '
+                    'never succeeded. Provisioning a new cluster by using the '
+                    'same resources as its original launch.')
+            elif self._planner is not None:
+                to_provision = self._planner(task)
+                logger.info(
+                    'Previous placement snapshot missing; computing a fresh '
+                    'plan for provisioning.')
+            else:
+                # Without a snapshot or planner, we cannot proceed safely.
+                # Surface a user-friendly error without a long traceback.
+                with ux_utils.print_exception_no_traceback():
+                    raise RuntimeError(
+                        'No concrete launch plan available after recent cloud '
+                        f'termination of cluster {cluster_name!r}. Ensure the '
+                        'OPTIMIZE stage runs or provide concrete resources.')

         return RetryingVmProvisioner.ToProvisionConfig(
             cluster_name,
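When to_provision is None after a refresh, the new code resolves a concrete launch plan in strict priority order: reuse the pre-refresh placement snapshot, else ask the injected planner, else raise. A condensed sketch of that fallback, with placeholder strings standing in for the handle and resources objects:

from typing import Callable, Optional


def resolve_plan(snapshot: Optional[str],
                 planner: Optional[Callable[[], str]]) -> str:
    if snapshot is not None:
        return snapshot  # 1) Reuse the pre-refresh placement snapshot.
    if planner is not None:
        return planner()  # 2) Compute a fresh plan.
    # 3) Nothing to fall back on: fail loudly.
    raise RuntimeError('No concrete launch plan available.')


print(resolve_plan('reuse-last-placement', None))
print(resolve_plan(None, lambda: 'fresh-plan'))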
@@ -5033,18 +6223,17 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                 # reconstruct them during cluster restart.
                 continue
             storage_mounts_metadata[dst] = storage_obj.handle
-
-            backend_utils.CLUSTER_FILE_MOUNTS_LOCK_PATH.format(cluster_name))
+        lock_id = backend_utils.cluster_file_mounts_lock_id(cluster_name)
         lock_timeout = backend_utils.CLUSTER_FILE_MOUNTS_LOCK_TIMEOUT_SECONDS
         try:
-            with
+            with locks.get_lock(lock_id, lock_timeout):
                 global_user_state.set_cluster_storage_mounts_metadata(
                     cluster_name, storage_mounts_metadata)
-        except
+        except locks.LockTimeout as e:
             raise RuntimeError(
                 f'Failed to store metadata for cluster {cluster_name!r} due to '
                 'a timeout when trying to access local database. Please '
-                f'try again or manually remove the lock at {
+                f'try again or manually remove the lock at {lock_id}. '
                 f'{common_utils.format_exception(e)}') from None

     def get_storage_mounts_metadata(
@@ -5055,19 +6244,18 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         After retrieving storage_mounts_metadata, it converts back the
         StorageMetadata to Storage object and restores 'storage_mounts.'
         """
-
-            backend_utils.CLUSTER_FILE_MOUNTS_LOCK_PATH.format(cluster_name))
+        lock_id = backend_utils.cluster_file_mounts_lock_id(cluster_name)
         lock_timeout = backend_utils.CLUSTER_FILE_MOUNTS_LOCK_TIMEOUT_SECONDS
         try:
-            with
+            with locks.get_lock(lock_id, lock_timeout):
                 storage_mounts_metadata = (
                     global_user_state.get_cluster_storage_mounts_metadata(
                         cluster_name))
-        except
+        except locks.LockTimeout as e:
             raise RuntimeError(
                 f'Failed to retrieve metadata for cluster {cluster_name!r} '
                 'due to a timeout when trying to access local database. '
-                f'Please try again or manually remove the lock at {
+                f'Please try again or manually remove the lock at {lock_id}.'
                 f' {common_utils.format_exception(e)}') from None

         if storage_mounts_metadata is None:
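Both storage-metadata methods now acquire a named lock via locks.get_lock(lock_id, lock_timeout) instead of formatting a file path into CLUSTER_FILE_MOUNTS_LOCK_PATH. A hypothetical in-process stand-in for that pattern; the real locks module may be file- or database-backed:

import contextlib
import threading
from typing import Dict, Iterator

_REGISTRY: Dict[str, threading.Lock] = {}
_REGISTRY_GUARD = threading.Lock()


@contextlib.contextmanager
def get_lock(lock_id: str, timeout: float) -> Iterator[None]:
    # Look up (or create) the lock for this id, then acquire with a timeout,
    # mirroring how the diff uses lock ids instead of lock file paths.
    with _REGISTRY_GUARD:
        lock = _REGISTRY.setdefault(lock_id, threading.Lock())
    if not lock.acquire(timeout=timeout):
        raise TimeoutError(f'Timed out waiting for lock {lock_id!r}')
    try:
        yield
    finally:
        lock.release()


with get_lock('cluster_file_mounts.my-cluster', timeout=10.0):
    print('metadata update would happen here')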
@@ -5104,7 +6292,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
     def _get_task_env_vars(self, task: task_lib.Task, job_id: int,
                            handle: CloudVmRayResourceHandle) -> Dict[str, str]:
         """Returns the environment variables for the task."""
-        env_vars = task.
+        env_vars = task.envs_and_secrets
         # If it is a managed job, the TASK_ID_ENV_VAR will have been already set
         # by the controller.
         if constants.TASK_ID_ENV_VAR not in env_vars:
@@ -5116,11 +6304,17 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             env_vars.update(self._skypilot_predefined_env_vars(handle))
         return env_vars

+    def _get_managed_job_user_id(self, task: task_lib.Task) -> Optional[str]:
+        """Returns the user id for the managed job."""
+        if task.managed_job_dag is not None:
+            return task.envs[constants.USER_ID_ENV_VAR]
+        return None
+
     def _execute_task_one_node(self, handle: CloudVmRayResourceHandle,
                                task: task_lib.Task, job_id: int,
-
+                               remote_log_dir: str) -> None:
         # Launch the command as a Ray task.
-        log_dir = os.path.join(
+        log_dir = os.path.join(remote_log_dir, 'tasks')

         resources_dict = backend_utils.get_task_demands_dict(task)
         internal_ips = handle.internal_ips()
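_execute_task_one_node and _execute_task_n_nodes now take remote_log_dir explicitly and thread the managed-job user id through to _exec_code_on_head. A tiny sketch of the log-dir derivation and the env-var lookup; the 'SKYPILOT_USER_ID' name below is an assumption, since constants.USER_ID_ENV_VAR's value is not shown in this diff:

import os
from typing import Dict, Optional

USER_ID_ENV_VAR = 'SKYPILOT_USER_ID'  # Assumed name, for illustration only.


def task_log_dir(remote_log_dir: str) -> str:
    # Per-task logs live under a 'tasks' subdirectory of the job's log dir.
    return os.path.join(remote_log_dir, 'tasks')


def managed_job_user_id(envs: Dict[str, str],
                        is_managed_job: bool) -> Optional[str]:
    # Mirrors _get_managed_job_user_id: only managed jobs carry a user id.
    if is_managed_job:
        return envs[USER_ID_ENV_VAR]
    return None


print(task_log_dir('~/sky_logs/sky-job-1'))
print(managed_job_user_id({USER_ID_ENV_VAR: 'abc123'}, is_managed_job=True))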
@@ -5154,21 +6348,22 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):

         codegen.add_epilogue()

-        self._exec_code_on_head(
-
-
-
-
+        self._exec_code_on_head(
+            handle,
+            codegen.build(),
+            job_id,
+            managed_job_dag=task.managed_job_dag,
+            managed_job_user_id=self._get_managed_job_user_id(task),
+            remote_log_dir=remote_log_dir)

     def _execute_task_n_nodes(self, handle: CloudVmRayResourceHandle,
                               task: task_lib.Task, job_id: int,
-
+                              remote_log_dir: str) -> None:
         # Strategy:
         #   ray.init(...)
         #   for node:
         #     submit _run_cmd(cmd) with resource {node_i: 1}
-
-        log_dir = os.path.join(log_dir_base, 'tasks')
+        log_dir = os.path.join(remote_log_dir, 'tasks')
         resources_dict = backend_utils.get_task_demands_dict(task)
         internal_ips = handle.internal_ips()
         assert internal_ips is not None, 'internal_ips is not cached in handle'
@@ -5210,8 +6405,10 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):

         codegen.add_epilogue()
         # TODO(zhanghao): Add help info for downloading logs.
-        self._exec_code_on_head(
-
-
-
-
+        self._exec_code_on_head(
+            handle,
+            codegen.build(),
+            job_id,
+            managed_job_dag=task.managed_job_dag,
+            managed_job_user_id=self._get_managed_job_user_id(task),
+            remote_log_dir=remote_log_dir)