skypilot-nightly 1.0.0.dev2024053101__py3-none-any.whl → 1.0.0.dev2025022801__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +64 -32
- sky/adaptors/aws.py +23 -6
- sky/adaptors/azure.py +432 -15
- sky/adaptors/cloudflare.py +5 -5
- sky/adaptors/common.py +19 -9
- sky/adaptors/do.py +20 -0
- sky/adaptors/gcp.py +3 -2
- sky/adaptors/kubernetes.py +122 -88
- sky/adaptors/nebius.py +100 -0
- sky/adaptors/oci.py +39 -1
- sky/adaptors/vast.py +29 -0
- sky/admin_policy.py +101 -0
- sky/authentication.py +117 -98
- sky/backends/backend.py +52 -20
- sky/backends/backend_utils.py +669 -557
- sky/backends/cloud_vm_ray_backend.py +1099 -808
- sky/backends/local_docker_backend.py +14 -8
- sky/backends/wheel_utils.py +38 -20
- sky/benchmark/benchmark_utils.py +22 -23
- sky/check.py +76 -27
- sky/cli.py +1586 -1139
- sky/client/__init__.py +1 -0
- sky/client/cli.py +5683 -0
- sky/client/common.py +345 -0
- sky/client/sdk.py +1765 -0
- sky/cloud_stores.py +283 -19
- sky/clouds/__init__.py +7 -2
- sky/clouds/aws.py +303 -112
- sky/clouds/azure.py +185 -179
- sky/clouds/cloud.py +115 -37
- sky/clouds/cudo.py +29 -22
- sky/clouds/do.py +313 -0
- sky/clouds/fluidstack.py +44 -54
- sky/clouds/gcp.py +206 -65
- sky/clouds/ibm.py +26 -21
- sky/clouds/kubernetes.py +345 -91
- sky/clouds/lambda_cloud.py +40 -29
- sky/clouds/nebius.py +297 -0
- sky/clouds/oci.py +129 -90
- sky/clouds/paperspace.py +22 -18
- sky/clouds/runpod.py +53 -34
- sky/clouds/scp.py +28 -24
- sky/clouds/service_catalog/__init__.py +19 -13
- sky/clouds/service_catalog/aws_catalog.py +29 -12
- sky/clouds/service_catalog/azure_catalog.py +33 -6
- sky/clouds/service_catalog/common.py +95 -75
- sky/clouds/service_catalog/constants.py +3 -3
- sky/clouds/service_catalog/cudo_catalog.py +13 -3
- sky/clouds/service_catalog/data_fetchers/fetch_aws.py +36 -21
- sky/clouds/service_catalog/data_fetchers/fetch_azure.py +31 -4
- sky/clouds/service_catalog/data_fetchers/fetch_cudo.py +8 -117
- sky/clouds/service_catalog/data_fetchers/fetch_fluidstack.py +197 -44
- sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +224 -36
- sky/clouds/service_catalog/data_fetchers/fetch_lambda_cloud.py +44 -24
- sky/clouds/service_catalog/data_fetchers/fetch_vast.py +147 -0
- sky/clouds/service_catalog/data_fetchers/fetch_vsphere.py +1 -1
- sky/clouds/service_catalog/do_catalog.py +111 -0
- sky/clouds/service_catalog/fluidstack_catalog.py +2 -2
- sky/clouds/service_catalog/gcp_catalog.py +16 -2
- sky/clouds/service_catalog/ibm_catalog.py +2 -2
- sky/clouds/service_catalog/kubernetes_catalog.py +192 -70
- sky/clouds/service_catalog/lambda_catalog.py +8 -3
- sky/clouds/service_catalog/nebius_catalog.py +116 -0
- sky/clouds/service_catalog/oci_catalog.py +31 -4
- sky/clouds/service_catalog/paperspace_catalog.py +2 -2
- sky/clouds/service_catalog/runpod_catalog.py +2 -2
- sky/clouds/service_catalog/scp_catalog.py +2 -2
- sky/clouds/service_catalog/vast_catalog.py +104 -0
- sky/clouds/service_catalog/vsphere_catalog.py +2 -2
- sky/clouds/utils/aws_utils.py +65 -0
- sky/clouds/utils/azure_utils.py +91 -0
- sky/clouds/utils/gcp_utils.py +5 -9
- sky/clouds/utils/oci_utils.py +47 -5
- sky/clouds/utils/scp_utils.py +4 -3
- sky/clouds/vast.py +280 -0
- sky/clouds/vsphere.py +22 -18
- sky/core.py +361 -107
- sky/dag.py +41 -28
- sky/data/data_transfer.py +37 -0
- sky/data/data_utils.py +211 -32
- sky/data/mounting_utils.py +182 -30
- sky/data/storage.py +2118 -270
- sky/data/storage_utils.py +126 -5
- sky/exceptions.py +179 -8
- sky/execution.py +158 -85
- sky/global_user_state.py +150 -34
- sky/jobs/__init__.py +12 -10
- sky/jobs/client/__init__.py +0 -0
- sky/jobs/client/sdk.py +302 -0
- sky/jobs/constants.py +49 -11
- sky/jobs/controller.py +161 -99
- sky/jobs/dashboard/dashboard.py +171 -25
- sky/jobs/dashboard/templates/index.html +572 -60
- sky/jobs/recovery_strategy.py +157 -156
- sky/jobs/scheduler.py +307 -0
- sky/jobs/server/__init__.py +1 -0
- sky/jobs/server/core.py +598 -0
- sky/jobs/server/dashboard_utils.py +69 -0
- sky/jobs/server/server.py +190 -0
- sky/jobs/state.py +627 -122
- sky/jobs/utils.py +615 -206
- sky/models.py +27 -0
- sky/optimizer.py +142 -83
- sky/provision/__init__.py +20 -5
- sky/provision/aws/config.py +124 -42
- sky/provision/aws/instance.py +130 -53
- sky/provision/azure/__init__.py +7 -0
- sky/{skylet/providers → provision}/azure/azure-config-template.json +19 -7
- sky/provision/azure/config.py +220 -0
- sky/provision/azure/instance.py +1012 -37
- sky/provision/common.py +31 -3
- sky/provision/constants.py +25 -0
- sky/provision/cudo/__init__.py +2 -1
- sky/provision/cudo/cudo_utils.py +112 -0
- sky/provision/cudo/cudo_wrapper.py +37 -16
- sky/provision/cudo/instance.py +28 -12
- sky/provision/do/__init__.py +11 -0
- sky/provision/do/config.py +14 -0
- sky/provision/do/constants.py +10 -0
- sky/provision/do/instance.py +287 -0
- sky/provision/do/utils.py +301 -0
- sky/provision/docker_utils.py +82 -46
- sky/provision/fluidstack/fluidstack_utils.py +57 -125
- sky/provision/fluidstack/instance.py +15 -43
- sky/provision/gcp/config.py +19 -9
- sky/provision/gcp/constants.py +7 -1
- sky/provision/gcp/instance.py +55 -34
- sky/provision/gcp/instance_utils.py +339 -80
- sky/provision/gcp/mig_utils.py +210 -0
- sky/provision/instance_setup.py +172 -133
- sky/provision/kubernetes/__init__.py +1 -0
- sky/provision/kubernetes/config.py +104 -90
- sky/provision/kubernetes/constants.py +8 -0
- sky/provision/kubernetes/instance.py +680 -325
- sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml +3 -0
- sky/provision/kubernetes/network.py +54 -20
- sky/provision/kubernetes/network_utils.py +70 -21
- sky/provision/kubernetes/utils.py +1370 -251
- sky/provision/lambda_cloud/__init__.py +11 -0
- sky/provision/lambda_cloud/config.py +10 -0
- sky/provision/lambda_cloud/instance.py +265 -0
- sky/{clouds/utils → provision/lambda_cloud}/lambda_utils.py +24 -23
- sky/provision/logging.py +1 -1
- sky/provision/nebius/__init__.py +11 -0
- sky/provision/nebius/config.py +11 -0
- sky/provision/nebius/instance.py +285 -0
- sky/provision/nebius/utils.py +318 -0
- sky/provision/oci/__init__.py +15 -0
- sky/provision/oci/config.py +51 -0
- sky/provision/oci/instance.py +436 -0
- sky/provision/oci/query_utils.py +681 -0
- sky/provision/paperspace/constants.py +6 -0
- sky/provision/paperspace/instance.py +4 -3
- sky/provision/paperspace/utils.py +2 -0
- sky/provision/provisioner.py +207 -130
- sky/provision/runpod/__init__.py +1 -0
- sky/provision/runpod/api/__init__.py +3 -0
- sky/provision/runpod/api/commands.py +119 -0
- sky/provision/runpod/api/pods.py +142 -0
- sky/provision/runpod/instance.py +64 -8
- sky/provision/runpod/utils.py +239 -23
- sky/provision/vast/__init__.py +10 -0
- sky/provision/vast/config.py +11 -0
- sky/provision/vast/instance.py +247 -0
- sky/provision/vast/utils.py +162 -0
- sky/provision/vsphere/common/vim_utils.py +1 -1
- sky/provision/vsphere/instance.py +8 -18
- sky/provision/vsphere/vsphere_utils.py +1 -1
- sky/resources.py +247 -102
- sky/serve/__init__.py +9 -9
- sky/serve/autoscalers.py +361 -299
- sky/serve/client/__init__.py +0 -0
- sky/serve/client/sdk.py +366 -0
- sky/serve/constants.py +12 -3
- sky/serve/controller.py +106 -36
- sky/serve/load_balancer.py +63 -12
- sky/serve/load_balancing_policies.py +84 -2
- sky/serve/replica_managers.py +42 -34
- sky/serve/serve_state.py +62 -32
- sky/serve/serve_utils.py +271 -160
- sky/serve/server/__init__.py +0 -0
- sky/serve/{core.py → server/core.py} +271 -90
- sky/serve/server/server.py +112 -0
- sky/serve/service.py +52 -16
- sky/serve/service_spec.py +95 -32
- sky/server/__init__.py +1 -0
- sky/server/common.py +430 -0
- sky/server/constants.py +21 -0
- sky/server/html/log.html +174 -0
- sky/server/requests/__init__.py +0 -0
- sky/server/requests/executor.py +472 -0
- sky/server/requests/payloads.py +487 -0
- sky/server/requests/queues/__init__.py +0 -0
- sky/server/requests/queues/mp_queue.py +76 -0
- sky/server/requests/requests.py +567 -0
- sky/server/requests/serializers/__init__.py +0 -0
- sky/server/requests/serializers/decoders.py +192 -0
- sky/server/requests/serializers/encoders.py +166 -0
- sky/server/server.py +1106 -0
- sky/server/stream_utils.py +141 -0
- sky/setup_files/MANIFEST.in +2 -5
- sky/setup_files/dependencies.py +159 -0
- sky/setup_files/setup.py +14 -125
- sky/sky_logging.py +59 -14
- sky/skylet/autostop_lib.py +2 -2
- sky/skylet/constants.py +183 -50
- sky/skylet/events.py +22 -10
- sky/skylet/job_lib.py +403 -258
- sky/skylet/log_lib.py +111 -71
- sky/skylet/log_lib.pyi +6 -0
- sky/skylet/providers/command_runner.py +6 -8
- sky/skylet/providers/ibm/node_provider.py +2 -2
- sky/skylet/providers/scp/config.py +11 -3
- sky/skylet/providers/scp/node_provider.py +8 -8
- sky/skylet/skylet.py +3 -1
- sky/skylet/subprocess_daemon.py +69 -17
- sky/skypilot_config.py +119 -57
- sky/task.py +205 -64
- sky/templates/aws-ray.yml.j2 +37 -7
- sky/templates/azure-ray.yml.j2 +27 -82
- sky/templates/cudo-ray.yml.j2 +7 -3
- sky/templates/do-ray.yml.j2 +98 -0
- sky/templates/fluidstack-ray.yml.j2 +7 -4
- sky/templates/gcp-ray.yml.j2 +26 -6
- sky/templates/ibm-ray.yml.j2 +3 -2
- sky/templates/jobs-controller.yaml.j2 +46 -11
- sky/templates/kubernetes-ingress.yml.j2 +7 -0
- sky/templates/kubernetes-loadbalancer.yml.j2 +7 -0
- sky/templates/{kubernetes-port-forward-proxy-command.sh.j2 → kubernetes-port-forward-proxy-command.sh} +51 -7
- sky/templates/kubernetes-ray.yml.j2 +292 -25
- sky/templates/lambda-ray.yml.j2 +30 -40
- sky/templates/nebius-ray.yml.j2 +79 -0
- sky/templates/oci-ray.yml.j2 +18 -57
- sky/templates/paperspace-ray.yml.j2 +10 -6
- sky/templates/runpod-ray.yml.j2 +26 -4
- sky/templates/scp-ray.yml.j2 +3 -2
- sky/templates/sky-serve-controller.yaml.j2 +12 -1
- sky/templates/skypilot-server-kubernetes-proxy.sh +36 -0
- sky/templates/vast-ray.yml.j2 +70 -0
- sky/templates/vsphere-ray.yml.j2 +8 -3
- sky/templates/websocket_proxy.py +64 -0
- sky/usage/constants.py +10 -1
- sky/usage/usage_lib.py +130 -37
- sky/utils/accelerator_registry.py +35 -51
- sky/utils/admin_policy_utils.py +147 -0
- sky/utils/annotations.py +51 -0
- sky/utils/cli_utils/status_utils.py +81 -23
- sky/utils/cluster_utils.py +356 -0
- sky/utils/command_runner.py +452 -89
- sky/utils/command_runner.pyi +77 -3
- sky/utils/common.py +54 -0
- sky/utils/common_utils.py +319 -108
- sky/utils/config_utils.py +204 -0
- sky/utils/control_master_utils.py +48 -0
- sky/utils/controller_utils.py +548 -266
- sky/utils/dag_utils.py +93 -32
- sky/utils/db_utils.py +18 -4
- sky/utils/env_options.py +29 -7
- sky/utils/kubernetes/create_cluster.sh +8 -60
- sky/utils/kubernetes/deploy_remote_cluster.sh +243 -0
- sky/utils/kubernetes/exec_kubeconfig_converter.py +73 -0
- sky/utils/kubernetes/generate_kubeconfig.sh +336 -0
- sky/utils/kubernetes/gpu_labeler.py +4 -4
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +4 -3
- sky/utils/kubernetes/kubernetes_deploy_utils.py +228 -0
- sky/utils/kubernetes/rsync_helper.sh +24 -0
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +1 -1
- sky/utils/log_utils.py +240 -33
- sky/utils/message_utils.py +81 -0
- sky/utils/registry.py +127 -0
- sky/utils/resources_utils.py +94 -22
- sky/utils/rich_utils.py +247 -18
- sky/utils/schemas.py +284 -64
- sky/{status_lib.py → utils/status_lib.py} +12 -7
- sky/utils/subprocess_utils.py +212 -46
- sky/utils/timeline.py +12 -7
- sky/utils/ux_utils.py +168 -15
- skypilot_nightly-1.0.0.dev2025022801.dist-info/METADATA +363 -0
- skypilot_nightly-1.0.0.dev2025022801.dist-info/RECORD +352 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/WHEEL +1 -1
- sky/clouds/cloud_registry.py +0 -31
- sky/jobs/core.py +0 -330
- sky/skylet/providers/azure/__init__.py +0 -2
- sky/skylet/providers/azure/azure-vm-template.json +0 -301
- sky/skylet/providers/azure/config.py +0 -170
- sky/skylet/providers/azure/node_provider.py +0 -466
- sky/skylet/providers/lambda_cloud/__init__.py +0 -2
- sky/skylet/providers/lambda_cloud/node_provider.py +0 -320
- sky/skylet/providers/oci/__init__.py +0 -2
- sky/skylet/providers/oci/node_provider.py +0 -488
- sky/skylet/providers/oci/query_helper.py +0 -383
- sky/skylet/providers/oci/utils.py +0 -21
- sky/utils/cluster_yaml_utils.py +0 -24
- sky/utils/kubernetes/generate_static_kubeconfig.sh +0 -137
- skypilot_nightly-1.0.0.dev2024053101.dist-info/METADATA +0 -315
- skypilot_nightly-1.0.0.dev2024053101.dist-info/RECORD +0 -275
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/top_level.txt +0 -0
sky/backends/cloud_vm_ray_backend.py

@@ -1,8 +1,6 @@
 """Backend: runs on cloud virtual machines, managed by Ray."""
 import copy
 import enum
-import functools
-import getpass
 import inspect
 import json
 import math
@@ -10,6 +8,7 @@ import os
 import pathlib
 import re
 import shlex
+import shutil
 import signal
 import subprocess
 import sys
@@ -18,13 +17,15 @@ import textwrap
 import threading
 import time
 import typing
-from typing import Any, Dict, Iterable, List, Optional, Set, Tuple, Union
+from typing import (Any, Callable, Dict, Iterable, List, Optional, Set, Tuple,
+                    Union)

 import colorama
 import filelock

 import sky
 from sky import backends
+from sky import check as sky_check
 from sky import cloud_stores
 from sky import clouds
 from sky import exceptions
@@ -33,9 +34,7 @@ from sky import jobs as managed_jobs
 from sky import optimizer
 from sky import provision as provision_lib
 from sky import resources as resources_lib
-from sky import serve as serve_lib
 from sky import sky_logging
-from sky import status_lib
 from sky import task as task_lib
 from sky.backends import backend_utils
 from sky.backends import wheel_utils
@@ -47,18 +46,26 @@ from sky.provision import common as provision_common
 from sky.provision import instance_setup
 from sky.provision import metadata_utils
 from sky.provision import provisioner
+from sky.provision.kubernetes import utils as kubernetes_utils
+from sky.server.requests import requests as requests_lib
 from sky.skylet import autostop_lib
 from sky.skylet import constants
 from sky.skylet import job_lib
 from sky.skylet import log_lib
 from sky.usage import usage_lib
 from sky.utils import accelerator_registry
+from sky.utils import annotations
+from sky.utils import cluster_utils
 from sky.utils import command_runner
+from sky.utils import common
 from sky.utils import common_utils
 from sky.utils import controller_utils
 from sky.utils import log_utils
+from sky.utils import message_utils
+from sky.utils import registry
 from sky.utils import resources_utils
 from sky.utils import rich_utils
+from sky.utils import status_lib
 from sky.utils import subprocess_utils
 from sky.utils import timeline
 from sky.utils import ux_utils
@@ -81,9 +88,10 @@ _NODES_LAUNCHING_PROGRESS_TIMEOUT = {
     clouds.AWS: 90,
     clouds.Azure: 90,
     clouds.GCP: 240,
-    clouds.Lambda:
+    clouds.Lambda: 300,
     clouds.IBM: 160,
     clouds.OCI: 300,
+    clouds.Paperspace: 600,
     clouds.Kubernetes: 300,
     clouds.Vsphere: 240,
 }
@@ -95,6 +103,11 @@ _RETRY_UNTIL_UP_INIT_GAP_SECONDS = 30
 # The maximum retry count for fetching IP address.
 _FETCH_IP_MAX_ATTEMPTS = 3

+# How many times to query the cloud provider to make sure instances are
+# stopping/terminating, and how long to wait between each query.
+_TEARDOWN_WAIT_MAX_ATTEMPTS = 10
+_TEARDOWN_WAIT_BETWEEN_ATTEMPS_SECONDS = 1
+
 _TEARDOWN_FAILURE_MESSAGE = (
     f'\n{colorama.Fore.RED}Failed to terminate '
     '{cluster_name}. {extra_reason}'
@@ -119,9 +132,6 @@ _RSYNC_NOT_FOUND_MESSAGE = (

 _TPU_NOT_FOUND_ERROR = 'ERROR: (gcloud.compute.tpus.delete) NOT_FOUND'

-_CTRL_C_TIP_MESSAGE = ('INFO: Tip: use Ctrl-C to exit log streaming '
-                       '(task will not be killed).')
-
 _MAX_RAY_UP_RETRY = 5

 # Number of retries for getting zones.
@@ -145,9 +155,24 @@ _RAY_UP_WITH_MONKEY_PATCHED_HASH_LAUNCH_CONF_PATH = (
 # If the command is too long, we instead write it to a file, rsync and execute
 # it.
 #
-# We use
+# We use 100KB as a threshold to be safe for other arguments that
 # might be added during ssh.
-_MAX_INLINE_SCRIPT_LENGTH =
+_MAX_INLINE_SCRIPT_LENGTH = 100 * 1024
+
+_RESOURCES_UNAVAILABLE_LOG = (
+    'Reasons for provision failures (for details, please check the log above):')
+
+
+def _is_command_length_over_limit(command: str) -> bool:
+    """Check if the length of the command exceeds the limit.
+
+    We calculate the length of the command after quoting the command twice as
+    when it is executed by the CommandRunner, the command will be quoted twice
+    to ensure the correctness, which will add significant length to the command.
+    """
+
+    quoted_length = len(shlex.quote(shlex.quote(command)))
+    return quoted_length > _MAX_INLINE_SCRIPT_LENGTH


 def _get_cluster_config_template(cloud):
@@ -161,16 +186,19 @@ def _get_cluster_config_template(cloud):
         clouds.SCP: 'scp-ray.yml.j2',
         clouds.OCI: 'oci-ray.yml.j2',
         clouds.Paperspace: 'paperspace-ray.yml.j2',
+        clouds.DO: 'do-ray.yml.j2',
         clouds.RunPod: 'runpod-ray.yml.j2',
         clouds.Kubernetes: 'kubernetes-ray.yml.j2',
         clouds.Vsphere: 'vsphere-ray.yml.j2',
-        clouds.
+        clouds.Vast: 'vast-ray.yml.j2',
+        clouds.Fluidstack: 'fluidstack-ray.yml.j2',
+        clouds.Nebius: 'nebius-ray.yml.j2'
     }
     return cloud_to_template[type(cloud)]


 def write_ray_up_script_with_patched_launch_hash_fn(
-        cluster_config_path: str,
+        cluster_config_path: Optional[str],
         ray_up_kwargs: Dict[str, bool],
 ) -> str:
     """Writes a Python script that runs `ray up` with our launch hash func.
@@ -257,6 +285,13 @@ class RayCodeGen:
             import time
             from typing import Dict, List, Optional, Tuple, Union

+            # Set the environment variables to avoid deduplicating logs and
+            # scheduler events. This should be set in driver code, since we are
+            # not using `ray job submit` anymore, and the environment variables
+            # from the ray cluster is not inherited.
+            os.environ['RAY_DEDUP_LOGS'] = '0'
+            os.environ['RAY_SCHEDULER_EVENTS'] = '0'
+
             import ray
             import ray.util as ray_util

@@ -264,12 +299,14 @@ class RayCodeGen:
             from sky.skylet import constants
             from sky.skylet import job_lib
             from sky.utils import log_utils
+            from sky.utils import subprocess_utils

             SKY_REMOTE_WORKDIR = {constants.SKY_REMOTE_WORKDIR!r}

             kwargs = dict()
-            # Only set the `_temp_dir` to SkyPilot's ray cluster directory when
-            # exists for backward compatibility for the VM
+            # Only set the `_temp_dir` to SkyPilot's ray cluster directory when
+            # the directory exists for backward compatibility for the VM
+            # launched before #1790.
             if os.path.exists({constants.SKY_REMOTE_RAY_TEMPDIR!r}):
                 kwargs['_temp_dir'] = {constants.SKY_REMOTE_RAY_TEMPDIR!r}
             ray.init(
@@ -280,6 +317,8 @@ class RayCodeGen:
             )
             def get_or_fail(futures, pg) -> List[int]:
                 \"\"\"Wait for tasks, if any fails, cancel all unready.\"\"\"
+                if not futures:
+                    return []
                 returncodes = [1] * len(futures)
                 # Wait for 1 task to be ready.
                 ready = []
@@ -307,8 +346,9 @@ class RayCodeGen:
                     ready, unready = ray.wait(unready)
                     idx = futures.index(ready[0])
                     returncodes[idx] = ray.get(ready[0])
-                # Remove the placement group after all tasks are done, so that
-                # next job can be scheduled on the released resources
+                # Remove the placement group after all tasks are done, so that
+                # the next job can be scheduled on the released resources
+                # immediately.
                 ray_util.remove_placement_group(pg)
                 sys.stdout.flush()
                 return returncodes
@@ -347,9 +387,9 @@ class RayCodeGen:
             num_nodes: int,
             resources_dict: Dict[str, float],
             stable_cluster_internal_ips: List[str],
+            env_vars: Dict[str, str],
             setup_cmd: Optional[str] = None,
             setup_log_path: Optional[str] = None,
-            env_vars: Optional[Dict[str, str]] = None,
     ) -> None:
         """Create the gang scheduling placement group for a Task.

@@ -388,27 +428,42 @@ class RayCodeGen:
             **gpu_dict,
         })

+        streaming_message = (
+            f'{ux_utils.INDENT_LAST_SYMBOL}Job started. Streaming logs... '
+            f'{colorama.Style.DIM}(Ctrl-C to exit log streaming; job will not '
+            f'be killed){colorama.Style.RESET_ALL}')
         self._code += [
             textwrap.dedent(f"""\
                 pg = ray_util.placement_group({json.dumps(bundles)}, 'STRICT_SPREAD')
                 plural = 's' if {num_nodes} > 1 else ''
                 node_str = f'{num_nodes} node{{plural}}'

-                message
-
-
-
+                # We have this `INFO: Tip:` message only for backward
+                # compatibility, because if a cluster has the old SkyPilot version,
+                # it relies on this message to start log streaming.
+                # This message will be skipped for new clusters, because we use
+                # start_streaming_at for the `Waiting for task resources on`
+                # message.
+                # TODO: Remove this message in v0.9.0.
+                message = ('{ux_utils.INDENT_SYMBOL}{colorama.Style.DIM}INFO: '
+                           'Tip: use Ctrl-C to exit log streaming, not kill '
+                           'the job.{colorama.Style.RESET_ALL}\\n')
+                message += ('{ux_utils.INDENT_SYMBOL}{colorama.Style.DIM}'
+                            'Waiting for task resources on '
+                            f'{{node_str}}.{colorama.Style.RESET_ALL}')
+                print(message, flush=True)
                 # FIXME: This will print the error message from autoscaler if
                 # it is waiting for other task to finish. We should hide the
                 # error message.
                 ray.get(pg.ready())
-                print(
-                      flush=True)
+                print({streaming_message!r}, flush=True)
                 """)
         ]

         job_id = self.job_id
         if setup_cmd is not None:
+            setup_envs = env_vars.copy()
+            setup_envs[constants.SKYPILOT_NUM_NODES] = str(num_nodes)
             self._code += [
                 textwrap.dedent(f"""\
                     setup_cmd = {setup_cmd!r}
@@ -438,7 +493,7 @@ class RayCodeGen:
                     .remote(
                         setup_cmd,
                         os.path.expanduser({setup_log_path!r}),
-                        env_vars={
+                        env_vars={setup_envs!r},
                         stream_logs=True,
                         with_ray=True,
                     ) for i in range(total_num_nodes)]
@@ -477,7 +532,6 @@ class RayCodeGen:
                 )).remote()
                 for i in range(pg.bundle_count)
             ])
-        print('INFO: Reserved IPs:', gang_scheduling_id_to_ip)

         cluster_ips_to_node_id = {{ip: i for i, ip in enumerate({stable_cluster_internal_ips!r})}}
         job_ip_rank_list = sorted(gang_scheduling_id_to_ip, key=cluster_ips_to_node_id.get)
@@ -549,11 +603,13 @@ class RayCodeGen:
                 f'placement_group_bundle_index={gang_scheduling_id})')

         sky_env_vars_dict_str = [
-            textwrap.dedent("""\
-                sky_env_vars_dict = {}
-                sky_env_vars_dict['SKYPILOT_NODE_IPS'] = job_ip_list_str
-                # Environment starting with `SKY_` is
+            textwrap.dedent(f"""\
+                sky_env_vars_dict = {{}}
+                sky_env_vars_dict['{constants.SKYPILOT_NODE_IPS}'] = job_ip_list_str
+                # Backward compatibility: Environment starting with `SKY_` is
+                # deprecated. Remove it in v0.9.0.
                 sky_env_vars_dict['SKY_NODE_IPS'] = job_ip_list_str
+                sky_env_vars_dict['{constants.SKYPILOT_NUM_NODES}'] = len(job_ip_rank_list)
                 """)
         ]

@@ -574,8 +630,9 @@ class RayCodeGen:


         if script is not None:
-            sky_env_vars_dict['SKYPILOT_NUM_GPUS_PER_NODE'] = {int(math.ceil(num_gpus))!r}
-            # Environment starting with `SKY_` is
+            sky_env_vars_dict['{constants.SKYPILOT_NUM_GPUS_PER_NODE}'] = {int(math.ceil(num_gpus))!r}
+            # Backward compatibility: Environment starting with `SKY_` is
+            # deprecated. Remove it in v0.9.0.
             sky_env_vars_dict['SKY_NUM_GPUS_PER_NODE'] = {int(math.ceil(num_gpus))!r}

             ip = gang_scheduling_id_to_ip[{gang_scheduling_id!r}]
@@ -592,12 +649,14 @@ class RayCodeGen:
             node_name = f'worker{{idx_in_cluster}}'
             name_str = f'{{node_name}}, rank={{rank}},'
             log_path = os.path.expanduser(os.path.join({log_dir!r}, f'{{rank}}-{{node_name}}.log'))
-            sky_env_vars_dict['SKYPILOT_NODE_RANK'] = rank
-            # Environment starting with `SKY_` is
+            sky_env_vars_dict['{constants.SKYPILOT_NODE_RANK}'] = rank
+            # Backward compatibility: Environment starting with `SKY_` is
+            # deprecated. Remove it in v0.9.0.
             sky_env_vars_dict['SKY_NODE_RANK'] = rank

             sky_env_vars_dict['SKYPILOT_INTERNAL_JOB_ID'] = {self.job_id}
-            # Environment starting with `SKY_` is
+            # Backward compatibility: Environment starting with `SKY_` is
+            # deprecated. Remove it in v0.9.0.
             sky_env_vars_dict['SKY_INTERNAL_JOB_ID'] = {self.job_id}

             futures.append(run_bash_command_with_log \\
@@ -680,56 +739,38 @@ class FailoverCloudErrorHandlerV1:
     """

     @staticmethod
-    def
-
-                       region: 'clouds.Region',
-                       zones: Optional[List['clouds.Zone']], stdout: str,
-                       stderr: str):
-        del zones  # Unused.
-        # The underlying ray autoscaler will try all zones of a region at once.
-        style = colorama.Style
+    def _handle_errors(stdout: str, stderr: str,
+                       is_error_str_known: Callable[[str], bool]) -> List[str]:
         stdout_splits = stdout.split('\n')
         stderr_splits = stderr.split('\n')
         errors = [
             s.strip()
             for s in stdout_splits + stderr_splits
-            if (
-                in s.strip() or '(ReadOnlyDisabledSubscription)' in s.strip())
+            if is_error_str_known(s.strip())
         ]
-        if
-
-
-            # timed out. Failed to create head node.
-            # This is a transient error, but we have retried in need_ray_up
-            # and failed. So we skip this region.
-            logger.info('Got \'Head node fetch timed out\' in '
-                        f'{region.name}.')
-            _add_to_blocked_resources(
-                blocked_resources,
-                launchable_resources.copy(region=region.name))
-        elif 'rsync: command not found' in stderr:
-            with ux_utils.print_exception_no_traceback():
-                raise RuntimeError(_RSYNC_NOT_FOUND_MESSAGE)
-        logger.info('====== stdout ======')
-        for s in stdout_splits:
-            print(s)
-        logger.info('====== stderr ======')
-        for s in stderr_splits:
-            print(s)
+        if errors:
+            return errors
+        if 'rsync: command not found' in stderr:
             with ux_utils.print_exception_no_traceback():
-
-
-
-
-
-
-
-
-
-
-
-
-
+                e = RuntimeError(_RSYNC_NOT_FOUND_MESSAGE)
+                setattr(e, 'detailed_reason',
+                        f'stdout: {stdout}\nstderr: {stderr}')
+                raise e
+        detailed_reason = textwrap.dedent(f"""\
+            ====== stdout ======
+            {stdout}
+            ====== stderr ======
+            {stderr}
+            """)
+        logger.info('====== stdout ======')
+        print(stdout)
+        logger.info('====== stderr ======')
+        print(stderr)
+        with ux_utils.print_exception_no_traceback():
+            e = RuntimeError('Errors occurred during provision; '
+                             'check logs above.')
+            setattr(e, 'detailed_reason', detailed_reason)
+            raise e

     @staticmethod
     def _lambda_handler(blocked_resources: Set['resources_lib.Resources'],
@@ -737,32 +778,14 @@ class FailoverCloudErrorHandlerV1:
                         region: 'clouds.Region',
                         zones: Optional[List['clouds.Zone']], stdout: str,
                         stderr: str):
-        del zones  # Unused.
+        del region, zones  # Unused.
+        errors = FailoverCloudErrorHandlerV1._handle_errors(
+            stdout,
+            stderr,
+            is_error_str_known=lambda x: 'LambdaCloudError:' in x.strip())
+        messages = '\n '.join(errors)
         style = colorama.Style
-
-        stderr_splits = stderr.split('\n')
-        errors = [
-            s.strip()
-            for s in stdout_splits + stderr_splits
-            if 'LambdaCloudError:' in s.strip()
-        ]
-        if not errors:
-            if 'rsync: command not found' in stderr:
-                with ux_utils.print_exception_no_traceback():
-                    raise RuntimeError(_RSYNC_NOT_FOUND_MESSAGE)
-            logger.info('====== stdout ======')
-            for s in stdout_splits:
-                print(s)
-            logger.info('====== stderr ======')
-            for s in stderr_splits:
-                print(s)
-            with ux_utils.print_exception_no_traceback():
-                raise RuntimeError('Errors occurred during provision; '
-                                   'check logs above.')
-
-        logger.warning(f'Got error(s) in {region.name}:')
-        messages = '\n\t'.join(errors)
-        logger.warning(f'{style.DIM}\t{messages}{style.RESET_ALL}')
+        logger.warning(f' {style.DIM}{messages}{style.RESET_ALL}')
         _add_to_blocked_resources(blocked_resources,
                                   launchable_resources.copy(zone=None))

@@ -775,65 +798,21 @@ class FailoverCloudErrorHandlerV1:
             blocked_resources,
             launchable_resources.copy(region=r.name, zone=None))

-    @staticmethod
-    def _kubernetes_handler(blocked_resources: Set['resources_lib.Resources'],
-                            launchable_resources: 'resources_lib.Resources',
-                            region, zones, stdout, stderr):
-        del zones  # Unused.
-        style = colorama.Style
-        stdout_splits = stdout.split('\n')
-        stderr_splits = stderr.split('\n')
-        errors = [
-            s.strip()
-            for s in stdout_splits + stderr_splits
-            if 'KubernetesError:' in s.strip()
-        ]
-        if not errors:
-            logger.info('====== stdout ======')
-            for s in stdout_splits:
-                print(s)
-            logger.info('====== stderr ======')
-            for s in stderr_splits:
-                print(s)
-            with ux_utils.print_exception_no_traceback():
-                raise RuntimeError('Errors occurred during provisioning; '
-                                   'check logs above.')
-
-        logger.warning(f'Got error(s) in {region.name}:')
-        messages = '\n\t'.join(errors)
-        logger.warning(f'{style.DIM}\t{messages}{style.RESET_ALL}')
-        _add_to_blocked_resources(blocked_resources,
-                                  launchable_resources.copy(zone=None))
-
     @staticmethod
     def _scp_handler(blocked_resources: Set['resources_lib.Resources'],
-                     launchable_resources: 'resources_lib.Resources',
-
+                     launchable_resources: 'resources_lib.Resources',
+                     region: 'clouds.Region',
+                     zones: Optional[List['clouds.Zone']], stdout: str,
+                     stderr: str):
         del zones  # Unused.
-
-
-
-
-            s.strip()
-            for s in stdout_splits + stderr_splits
-            if 'SCPError:' in s.strip()
-        ]
-        if not errors:
-            if 'rsync: command not found' in stderr:
-                with ux_utils.print_exception_no_traceback():
-                    raise RuntimeError(_RSYNC_NOT_FOUND_MESSAGE)
-            logger.info('====== stdout ======')
-            for s in stdout_splits:
-                print(s)
-            logger.info('====== stderr ======')
-            for s in stderr_splits:
-                print(s)
-            with ux_utils.print_exception_no_traceback():
-                raise RuntimeError('Errors occurred during provision; '
-                                   'check logs above.')
+        errors = FailoverCloudErrorHandlerV1._handle_errors(
+            stdout,
+            stderr,
+            is_error_str_known=lambda x: 'SCPError:' in x.strip())

         logger.warning(f'Got error(s) in {region.name}:')
         messages = '\n\t'.join(errors)
+        style = colorama.Style
         logger.warning(f'{style.DIM}\t{messages}{style.RESET_ALL}')
         _add_to_blocked_resources(blocked_resources,
                                   launchable_resources.copy(zone=None))
@@ -854,29 +833,13 @@ class FailoverCloudErrorHandlerV1:
                      zones: Optional[List['clouds.Zone']], stdout: str,
                      stderr: str):

-
-
-
-
-            s.strip()
-            for s in stdout_splits + stderr_splits
-            if 'ERR' in s.strip() or 'PANIC' in s.strip()
-        ]
-        if not errors:
-            if 'rsync: command not found' in stderr:
-                with ux_utils.print_exception_no_traceback():
-                    raise RuntimeError(_RSYNC_NOT_FOUND_MESSAGE)
-            logger.info('====== stdout ======')
-            for s in stdout_splits:
-                print(s)
-            logger.info('====== stderr ======')
-            for s in stderr_splits:
-                print(s)
-            with ux_utils.print_exception_no_traceback():
-                raise RuntimeError('Errors occurred during provision; '
-                                   'check logs above.')
+        errors = FailoverCloudErrorHandlerV1._handle_errors(
+            stdout, stderr,
+            lambda x: 'ERR' in x.strip() or 'PANIC' in x.strip())
+
         logger.warning(f'Got error(s) on IBM cluster, in {region.name}:')
         messages = '\n\t'.join(errors)
+        style = colorama.Style
         logger.warning(f'{style.DIM}\t{messages}{style.RESET_ALL}')

         for zone in zones:  # type: ignore[union-attr]
@@ -890,35 +853,17 @@ class FailoverCloudErrorHandlerV1:
                      region: 'clouds.Region',
                      zones: Optional[List['clouds.Zone']], stdout: str,
                      stderr: str):
-
-
-
-        stderr_splits = stderr.split('\n')
-        errors = [
-            s.strip()
-            for s in stdout_splits + stderr_splits
-            if ('VcnSubnetNotFound' in s.strip()) or
-            ('oci.exceptions.ServiceError' in s.strip() and
-             ('NotAuthorizedOrNotFound' in s.strip() or 'CannotParseRequest' in
-              s.strip() or 'InternalError' in s.strip() or
-              'LimitExceeded' in s.strip() or 'NotAuthenticated' in s.strip()))
+        known_service_errors = [
+            'NotAuthorizedOrNotFound', 'CannotParseRequest', 'InternalError',
+            'LimitExceeded', 'NotAuthenticated'
         ]
-
-
-
-
-        logger.info('====== stdout ======')
-        for s in stdout_splits:
-            print(s)
-        logger.info('====== stderr ======')
-        for s in stderr_splits:
-            print(s)
-        with ux_utils.print_exception_no_traceback():
-            raise RuntimeError('Errors occurred during provision; '
-                               'check logs above.')
-
+        errors = FailoverCloudErrorHandlerV1._handle_errors(
+            stdout, stderr, lambda x: 'VcnSubnetNotFound' in x.strip() or
+            ('oci.exceptions.ServiceError' in x.strip() and any(
+                known_err in x.strip() for known_err in known_service_errors)))
         logger.warning(f'Got error(s) in {region.name}:')
         messages = '\n\t'.join(errors)
+        style = colorama.Style
         logger.warning(f'{style.DIM}\t{messages}{style.RESET_ALL}')

         if zones is not None:
@@ -1000,6 +945,29 @@ class FailoverCloudErrorHandlerV2:
     stdout and stderr.
     """

+    @staticmethod
+    def _azure_handler(blocked_resources: Set['resources_lib.Resources'],
+                       launchable_resources: 'resources_lib.Resources',
+                       region: 'clouds.Region', zones: List['clouds.Zone'],
+                       err: Exception):
+        del region, zones  # Unused.
+        if '(ReadOnlyDisabledSubscription)' in str(err):
+            logger.info(
+                f'{colorama.Style.DIM}Azure subscription is read-only. '
+                'Skip provisioning on Azure. Please check the subscription set '
+                'with az account set -s <subscription_id>.'
+                f'{colorama.Style.RESET_ALL}')
+            _add_to_blocked_resources(
+                blocked_resources,
+                resources_lib.Resources(cloud=clouds.Azure()))
+        elif 'ClientAuthenticationError' in str(err):
+            _add_to_blocked_resources(
+                blocked_resources,
+                resources_lib.Resources(cloud=clouds.Azure()))
+        else:
+            _add_to_blocked_resources(blocked_resources,
+                                      launchable_resources.copy(zone=None))
+
     @staticmethod
     def _gcp_handler(blocked_resources: Set['resources_lib.Resources'],
                      launchable_resources: 'resources_lib.Resources',
@@ -1135,7 +1103,7 @@ class FailoverCloudErrorHandlerV2:
                            'having the required permissions and the user '
                            'account does not have enough permission to '
                            'update it. Please contact your administrator and '
-                           'check out: https://skypilot.
+                           'check out: https://docs.skypilot.co/en/latest/cloud-setup/cloud-permissions/gcp.html\n'  # pylint: disable=line-too-long
                            f'Details: {message}')
                 _add_to_blocked_resources(
                     blocked_resources,
@@ -1203,6 +1171,7 @@ class RetryingVmProvisioner(object):
             prev_cluster_status: Optional[status_lib.ClusterStatus],
             prev_handle: Optional['CloudVmRayResourceHandle'],
             prev_cluster_ever_up: bool,
+            prev_config_hash: Optional[str],
         ) -> None:
             assert cluster_name is not None, 'cluster_name must be specified.'
             self.cluster_name = cluster_name
@@ -1211,11 +1180,12 @@ class RetryingVmProvisioner(object):
             self.prev_cluster_status = prev_cluster_status
             self.prev_handle = prev_handle
             self.prev_cluster_ever_up = prev_cluster_ever_up
+            self.prev_config_hash = prev_config_hash

     def __init__(self,
                  log_dir: str,
                  dag: 'dag.Dag',
-                 optimize_target: '
+                 optimize_target: 'common.OptimizeTarget',
                  requested_features: Set[clouds.CloudImplementationFeatures],
                  local_wheel_path: pathlib.Path,
                  wheel_hash: str,
@@ -1294,9 +1264,10 @@ class RetryingVmProvisioner(object):

             if prev_cluster_status != status_lib.ClusterStatus.UP:
                 logger.info(
-                    f'Cluster {cluster_name!r} (status: '
-                    f'{prev_cluster_status.value}) was previously
-                    f'
+                    f'{colorama.Style.DIM}Cluster {cluster_name!r} (status: '
+                    f'{prev_cluster_status.value}) was previously in '
+                    f'{cloud} ({region.name}). Restarting.'
+                    f'{colorama.Style.RESET_ALL}')
                 yield zones

             # If it reaches here: the cluster status in the database gets
@@ -1371,19 +1342,29 @@ class RetryingVmProvisioner(object):
         prev_cluster_status: Optional[status_lib.ClusterStatus],
         prev_handle: Optional['CloudVmRayResourceHandle'],
         prev_cluster_ever_up: bool,
+        skip_if_config_hash_matches: Optional[str],
     ) -> Dict[str, Any]:
-        """The provision retry loop.
-
-
+        """The provision retry loop.
+
+        Returns a config_dict with the following fields:
+        All fields from backend_utils.write_cluster_config(). See its
+        docstring.
+        - 'provisioning_skipped': True if provisioning was short-circuited
+          by skip_if_config_hash_matches, False otherwise.
+        - 'handle': The provisioned cluster handle.
+        - 'provision_record': (Only if using the new skypilot provisioner) The
+          record returned by provisioner.bulk_provision().
+        - 'resources_vars': (Only if using the new skypilot provisioner) The
+          resources variables given by make_deploy_resources_variables().
+        """
         # Get log_path name
         log_path = os.path.join(self.log_dir, 'provision.log')
         log_abs_path = os.path.abspath(log_path)
         if not dryrun:
             os.makedirs(os.path.expanduser(self.log_dir), exist_ok=True)
             os.system(f'touch {log_path}')
-
-
-                    f'{style.BRIGHT}{tail_cmd}{style.RESET_ALL}')
+        rich_utils.force_update_status(
+            ux_utils.spinner_message('Launching', log_path))

         # Get previous cluster status
         cluster_exists = prev_cluster_status is not None
@@ -1419,8 +1400,7 @@ class RetryingVmProvisioner(object):
                     f'in {to_provision.cloud}. '
                     f'{colorama.Style.RESET_ALL}'
                     f'To request quotas, check the instruction: '
-                    f'https://skypilot.
-                )
+                    f'https://docs.skypilot.co/en/latest/cloud-setup/quota.html.')

         for zones in self._yield_zones(to_provision, num_nodes, cluster_name,
                                        prev_cluster_status,
@@ -1484,8 +1464,18 @@ class RetryingVmProvisioner(object):
                 raise exceptions.ResourcesUnavailableError(
                     f'Failed to provision on cloud {to_provision.cloud} due to '
                     f'invalid cloud config: {common_utils.format_exception(e)}')
+
+            if ('config_hash' in config_dict and
+                    skip_if_config_hash_matches == config_dict['config_hash']):
+                logger.debug('Skipping provisioning of cluster with matching '
+                             'config hash.')
+                config_dict['provisioning_skipped'] = True
+                return config_dict
+            config_dict['provisioning_skipped'] = False
+
             if dryrun:
                 return config_dict
+
             cluster_config_file = config_dict['ray']

             launched_resources = to_provision.copy(region=region.name)
@@ -1540,24 +1530,55 @@ class RetryingVmProvisioner(object):
             assert to_provision.region == region.name, (to_provision,
                                                         region)
             num_nodes = handle.launched_nodes
+            # Some clouds, like RunPod, only support exposing ports during
+            # launch. For those clouds, we pass the ports to open in the
+            # `bulk_provision` to expose the ports during provisioning.
+            # If the `bulk_provision` is to apply on an existing cluster,
+            # it should be ignored by the underlying provisioner impl
+            # as it will only apply to newly-created instances.
+            ports_to_open_on_launch = (
+                list(resources_utils.port_ranges_to_set(to_provision.ports))
+                if to_provision.cloud.OPEN_PORTS_VERSION <=
+                clouds.OpenPortsVersion.LAUNCH_ONLY else None)
             try:
+                controller = controller_utils.Controllers.from_name(
+                    cluster_name)
+                controller_str = ('' if controller is None else
+                                  f' {controller.value.name}')
+                if isinstance(to_provision.cloud, clouds.Kubernetes):
+                    # Omit the region name for Kubernetes.
+                    logger.info(
+                        ux_utils.starting_message(
+                            f'Launching{controller_str} on '
+                            f'{to_provision.cloud}.'))
+                else:
+                    logger.info(
+                        ux_utils.starting_message(
+                            f'Launching{controller_str} on '
+                            f'{to_provision.cloud} '
+                            f'{region.name}{colorama.Style.RESET_ALL}'
+                            f'{zone_str}.'))
+                assert handle.cluster_yaml is not None
                 provision_record = provisioner.bulk_provision(
                     to_provision.cloud,
                     region,
                     zones,
-
-
+                    resources_utils.ClusterName(
+                        cluster_name, handle.cluster_name_on_cloud),
                     num_nodes=num_nodes,
                     cluster_yaml=handle.cluster_yaml,
                     prev_cluster_ever_up=prev_cluster_ever_up,
-                    log_dir=self.log_dir
+                    log_dir=self.log_dir,
+                    ports_to_open_on_launch=ports_to_open_on_launch)
                 # NOTE: We will handle the logic of '_ensure_cluster_ray_started' #pylint: disable=line-too-long
                 # in 'provision_utils.post_provision_runtime_setup()' in the
                 # caller.
                 resources_vars = (
                     to_provision.cloud.make_deploy_resources_variables(
-                        to_provision,
-
+                        to_provision,
+                        resources_utils.ClusterName(
+                            cluster_name, handle.cluster_name_on_cloud),
+                        region, zones, num_nodes))
                 config_dict['provision_record'] = provision_record
                 config_dict['resources_vars'] = resources_vars
                 config_dict['handle'] = handle
@@ -1570,7 +1591,9 @@ class RetryingVmProvisioner(object):
                 # cluster does not exist. Also we are fast at
                 # cleaning up clusters now if there is no existing node..
                 CloudVmRayBackend().post_teardown_cleanup(
-                    handle,
+                    handle,
+                    terminate=not prev_cluster_ever_up,
+                    remove_from_db=False)
                 # TODO(suquark): other clouds may have different zone
                 # blocking strategy. See '_update_blocklist_on_error'
                 # for details.
@@ -1585,6 +1608,7 @@ class RetryingVmProvisioner(object):
                 'region_name': region.name,
                 'zone_str': zone_str,
             }
+
             status, stdout, stderr, head_internal_ip, head_external_ip = (
                 self._gang_schedule_ray_up(to_provision.cloud,
                                            cluster_config_file, handle,
@@ -1623,9 +1647,9 @@ class RetryingVmProvisioner(object):
                     self._ensure_cluster_ray_started(handle, log_abs_path)

                 config_dict['handle'] = handle
-
-
-
+                logger.info(
+                    ux_utils.finishing_message(
+                        f'Cluster launched: {cluster_name!r}.', log_path))
                 return config_dict

             # The cluster is not ready. We must perform error recording and/or
@@ -1686,21 +1710,20 @@ class RetryingVmProvisioner(object):
             # autoscaler proceeds to setup commands, which may fail:
             # ERR updater.py:138 -- New status: update-failed
             CloudVmRayBackend().teardown_no_lock(handle,
-                                                 terminate=terminate_or_stop
+                                                 terminate=terminate_or_stop,
+                                                 remove_from_db=False)

         if to_provision.zone is not None:
             message = (
-                f'Failed to acquire resources in {to_provision.zone}
-                '
+                f'Failed to acquire resources in {to_provision.zone} for '
+                f'{requested_resources}. ')
         elif to_provision.region is not None:
             # For public clouds, provision.region is always set.
             message = ('Failed to acquire resources in all zones in '
-                       f'{to_provision.region}
-                       'requirements or use another region.')
+                       f'{to_provision.region} for {requested_resources}. ')
         else:
-            message = (f'Failed to acquire resources in {to_provision.cloud}
-                       '
-                       'cloud provider.')
+            message = (f'Failed to acquire resources in {to_provision.cloud} '
+                       f'for {requested_resources}. ')
         # Do not failover to other locations if the cluster was ever up, since
         # the user can have some data on the cluster.
         raise exceptions.ResourcesUnavailableError(
@@ -1751,7 +1774,7 @@ class RetryingVmProvisioner(object):
             log_abs_path,
             stream_logs=False,
             start_streaming_at='Shared connection to',
-            line_processor=log_utils.RayUpLineProcessor(),
+            line_processor=log_utils.RayUpLineProcessor(log_abs_path),
             # Reduce BOTO_MAX_RETRIES from 12 to 5 to avoid long hanging
             # time during 'ray up' if insufficient capacity occurs.
             env=dict(
@@ -1771,13 +1794,14 @@ class RetryingVmProvisioner(object):

         region_name = logging_info['region_name']
         zone_str = logging_info['zone_str']
-        style = colorama.Style
         if isinstance(to_provision_cloud, clouds.Kubernetes):
-            logger.info(
-
+            logger.info(
+                ux_utils.starting_message(
+                    f'Launching on {to_provision_cloud}.'))
         else:
-            logger.info(
-
+            logger.info(
+                ux_utils.starting_message(f'Launching on {to_provision_cloud} '
+                                          f'{region_name}{zone_str}.'))
         start = time.time()

         # Edge case: /tmp/ray does not exist, so autoscaler can't create/store
@@ -1802,19 +1826,6 @@ class RetryingVmProvisioner(object):
             if returncode == 0:
                 return False

-            if isinstance(to_provision_cloud, clouds.Azure):
-                if 'Failed to invoke the Azure CLI' in stderr:
-                    logger.info(
-                        'Retrying head node provisioning due to Azure CLI '
-                        'issues.')
-                    return True
-                if ('Head node fetch timed out. Failed to create head node.'
-                    in stderr):
-                    logger.info(
-                        'Retrying head node provisioning due to head fetching '
-                        'timeout.')
-                    return True
-
             if isinstance(to_provision_cloud, clouds.Lambda):
                 if 'Your API requests are being rate limited.' in stderr:
                     logger.info(
@@ -1892,11 +1903,6 @@ class RetryingVmProvisioner(object):
                     head_internal_ip, head_external_ip)

         # All code below is handling num_nodes > 1.
-        provision_str = ('Successfully provisioned or found existing head '
-                         'instance.')
-        logger.info(f'{style.BRIGHT}{provision_str} '
-                    f'Waiting for workers.{style.RESET_ALL}')
-
         # FIXME(zongheng): the below requires ray processes are up on head. To
         # repro it failing: launch a 2-node cluster, log into head and ray
         # stop, then launch again.
@@ -1985,8 +1991,13 @@ class RetryingVmProvisioner(object):
|
|
1985
1991
|
to_provision_config: ToProvisionConfig,
|
1986
1992
|
dryrun: bool,
|
1987
1993
|
stream_logs: bool,
|
1994
|
+
skip_unnecessary_provisioning: bool,
|
1988
1995
|
) -> Dict[str, Any]:
|
1989
|
-
"""Provision with retries for all launchable resources.
|
1996
|
+
"""Provision with retries for all launchable resources.
|
1997
|
+
|
1998
|
+
Returns the config_dict from _retry_zones() - see its docstring for
|
1999
|
+
details.
|
2000
|
+
"""
|
1990
2001
|
cluster_name = to_provision_config.cluster_name
|
1991
2002
|
to_provision = to_provision_config.resources
|
1992
2003
|
num_nodes = to_provision_config.num_nodes
|
@@ -1995,10 +2006,28 @@ class RetryingVmProvisioner(object):
|
|
1995
2006
|
prev_cluster_ever_up = to_provision_config.prev_cluster_ever_up
|
1996
2007
|
launchable_retries_disabled = (self._dag is None or
|
1997
2008
|
self._optimize_target is None)
|
2009
|
+
skip_if_config_hash_matches = (to_provision_config.prev_config_hash if
|
2010
|
+
skip_unnecessary_provisioning else None)
|
1998
2011
|
|
1999
2012
|
failover_history: List[Exception] = list()
|
2013
|
+
resource_exceptions: Dict[resources_lib.Resources, Exception] = dict()
|
2014
|
+
# If the user is using local credentials which may expire, the
|
2015
|
+
# controller may leak resources if the credentials expire while a job
|
2016
|
+
# is running. Here we check the enabled clouds and expiring credentials
|
2017
|
+
# and raise a warning to the user.
|
2018
|
+
if task.is_controller_task():
|
2019
|
+
enabled_clouds = sky_check.get_cached_enabled_clouds_or_refresh()
|
2020
|
+
expirable_clouds = backend_utils.get_expirable_clouds(
|
2021
|
+
enabled_clouds)
|
2022
|
+
|
2023
|
+
if len(expirable_clouds) > 0:
|
2024
|
+
warnings = (f'\033[93mWarning: Credentials used for '
|
2025
|
+
f'{expirable_clouds} may expire. Clusters may be '
|
2026
|
+
f'leaked if the credentials expire while jobs '
|
2027
|
+
f'are running. It is recommended to use credentials'
|
2028
|
+
f' that never expire or a service account.\033[0m')
|
2029
|
+
logger.warning(warnings)
|
2000
2030
|
|
2001
|
-
style = colorama.Style
|
2002
2031
|
# Retrying launchable resources.
|
2003
2032
|
while True:
|
2004
2033
|
try:
|
@@ -2008,11 +2037,12 @@ class RetryingVmProvisioner(object):
|
|
2008
2037
|
if dryrun:
|
2009
2038
|
cloud_user = None
|
2010
2039
|
else:
|
2011
|
-
cloud_user = to_provision.cloud.
|
2040
|
+
cloud_user = to_provision.cloud.get_active_user_identity()
|
2012
2041
|
|
2013
2042
|
requested_features = self._requested_features.copy()
|
2014
|
-
# Skip stop feature for Kubernetes controllers.
|
2015
|
-
if (isinstance(to_provision.cloud,
|
2043
|
+
# Skip stop feature for Kubernetes and RunPod controllers.
|
2044
|
+
if (isinstance(to_provision.cloud,
|
2045
|
+
(clouds.Kubernetes, clouds.RunPod)) and
|
2016
2046
|
controller_utils.Controllers.from_name(cluster_name)
|
2017
2047
|
is not None):
|
2018
2048
|
assert (clouds.CloudImplementationFeatures.STOP
|
@@ -2034,7 +2064,8 @@ class RetryingVmProvisioner(object):
|
|
2034
2064
|
cloud_user_identity=cloud_user,
|
2035
2065
|
prev_cluster_status=prev_cluster_status,
|
2036
2066
|
prev_handle=prev_handle,
|
2037
|
-
prev_cluster_ever_up=prev_cluster_ever_up
|
2067
|
+
prev_cluster_ever_up=prev_cluster_ever_up,
|
2068
|
+
skip_if_config_hash_matches=skip_if_config_hash_matches)
|
2038
2069
|
if dryrun:
|
2039
2070
|
return config_dict
|
2040
2071
|
except (exceptions.InvalidClusterNameError,
|
@@ -2067,17 +2098,12 @@ class RetryingVmProvisioner(object):
                 # Provisioning succeeded.
                 break
 
-            if to_provision.zone is None:
-                region_or_zone_str = str(to_provision.region)
-            else:
-                region_or_zone_str = str(to_provision.zone)
-            logger.warning(f'\n{style.BRIGHT}Provision failed for {num_nodes}x '
-                           f'{to_provision} in {region_or_zone_str}. '
-                           f'Trying other locations (if any).{style.RESET_ALL}')
             if prev_cluster_status is None:
                 # Add failed resources to the blocklist, only when it
                 # is in fallback mode.
                 _add_to_blocked_resources(self._blocked_resources, to_provision)
+                assert len(failover_history) > 0
+                resource_exceptions[to_provision] = failover_history[-1]
             else:
                 # If we reach here, it means that the existing cluster must have
                 # a previous status of INIT, because other statuses (UP,
@@ -2088,8 +2114,10 @@ class RetryingVmProvisioner(object):
                 ), prev_cluster_status
                 assert global_user_state.get_handle_from_cluster_name(
                     cluster_name) is None, cluster_name
-                logger.info(
-
+                logger.info(
+                    ux_utils.retry_message(
+                        f'Retrying provisioning with requested resources: '
+                        f'{task.num_nodes}x {task.resources}'))
                 # Retry with the current, potentially "smaller" resources:
                 # to_provision == the current new resources (e.g., V100:1),
                 # which may be "smaller" than the original (V100:8).
@@ -2099,12 +2127,18 @@ class RetryingVmProvisioner(object):
                 prev_cluster_status = None
                 prev_handle = None
 
+            retry_message = ux_utils.retry_message(
+                'Trying other potential resources.')
+            logger.warning(f'\n{retry_message}')
+            log_path = os.path.join(self.log_dir, 'provision.log')
+            rich_utils.force_update_status(
+                ux_utils.spinner_message('Looking for resources', log_path))
             # Set to None so that sky.optimize() will assign a new one
             # (otherwise will skip re-optimizing this task).
             # TODO: set all remaining tasks' best_resources to None.
             task.best_resources = None
             try:
-                self._dag =
+                self._dag = optimizer.Optimizer.optimize(
                     self._dag,
                     minimize=self._optimize_target,
                     blocked_resources=self._blocked_resources)
@@ -2114,7 +2148,14 @@ class RetryingVmProvisioner(object):
                 # possible resources or the requested resources is too
                 # restrictive. If we reach here, our failover logic finally
                 # ends here.
-
+                table = log_utils.create_table(['Resource', 'Reason'])
+                for (resource, exception) in resource_exceptions.items():
+                    table.add_row(
+                        [resources_utils.format_resource(resource), exception])
+                table.max_table_width = shutil.get_terminal_size().columns
+                raise exceptions.ResourcesUnavailableError(
+                    _RESOURCES_UNAVAILABLE_LOG + '\n' + table.get_string(),
+                    failover_history=failover_history)
             to_provision = task.best_resources
             assert task in self._dag.tasks, 'Internal logic error.'
             assert to_provision is not None, task
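Note: the failover path above now records the last exception per attempted resource and, when every candidate is exhausted, renders them as a table. A minimal sketch of that pattern, assuming the `prettytable` package (SkyPilot's `log_utils.create_table` is a thin wrapper around it; the helper name below is ours):

```python
import shutil

import prettytable


def render_failure_table(resource_exceptions: dict) -> str:
    """Render per-resource failure reasons, mirroring the diff's pattern.

    `resource_exceptions` maps a resource description (a str here, for
    simplicity) to the last Exception seen for it during failover.
    """
    table = prettytable.PrettyTable(['Resource', 'Reason'])
    table.align = 'l'
    for resource, exception in resource_exceptions.items():
        table.add_row([resource, str(exception)])
    # Wrap to the current terminal width so long reasons stay readable.
    table.max_table_width = shutil.get_terminal_size().columns
    return table.get_string()


if __name__ == '__main__':
    failures = {
        '1x AWS(p3.2xlarge)': Exception('InsufficientInstanceCapacity'),
        '1x GCP(n1-standard-8, V100:1)': Exception('QUOTA_EXCEEDED'),
    }
    print(render_failure_table(failures))
```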
@@ -2143,31 +2184,30 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
     """
     # Bump if any fields get added/removed/changed, and add backward
    # compatibility logic in __setstate__.
-    _VERSION =
+    _VERSION = 10
 
     def __init__(
             self,
             *,
             cluster_name: str,
             cluster_name_on_cloud: str,
-            cluster_yaml: str,
+            cluster_yaml: Optional[str],
             launched_nodes: int,
             launched_resources: resources_lib.Resources,
             stable_internal_external_ips: Optional[List[Tuple[str,
                                                               str]]] = None,
             stable_ssh_ports: Optional[List[int]] = None,
-            cluster_info: Optional[provision_common.ClusterInfo] = None
-
-            # API handles the TPU node creation/deletion.
-            # Backward compatibility for TPU nodes created before #2943.
-            # TODO (zhwu): Remove this after 0.6.0.
-            tpu_create_script: Optional[str] = None,
-            tpu_delete_script: Optional[str] = None) -> None:
+            cluster_info: Optional[provision_common.ClusterInfo] = None
+    ) -> None:
         self._version = self._VERSION
         self.cluster_name = cluster_name
         self.cluster_name_on_cloud = cluster_name_on_cloud
-
-
+        # Replace the home directory with ~ for better robustness across
+        # systems with different home directories.
+        if cluster_yaml is not None and cluster_yaml.startswith(
+                os.path.expanduser('~')):
+            cluster_yaml = cluster_yaml.replace(os.path.expanduser('~'), '~', 1)
+        self._cluster_yaml = cluster_yaml
         # List of (internal_ip, feasible_ip) tuples for all the nodes in the
         # cluster, sorted by the feasible ips. The feasible ips can be either
         # internal or external ips, depending on the use_internal_ips flag.
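Note: storing the YAML path with a leading `~` keeps the handle portable when it is unpickled under a different home directory. A minimal sketch of the round-trip, with `normalize`/`denormalize` as hypothetical helper names (not SkyPilot's):

```python
import os


def normalize(path: str) -> str:
    """Store paths with a leading '~' so they survive a home-dir change."""
    home = os.path.expanduser('~')
    if path.startswith(home):
        return path.replace(home, '~', 1)
    return path


def denormalize(path: str) -> str:
    """Expand '~' back to the current user's home directory on access."""
    return os.path.expanduser(path)


# Example: the stored form is home-independent; expansion happens on read.
home = os.path.expanduser('~')
stored = normalize(os.path.join(home, '.sky', 'clusters', 'c.yml'))
assert stored == '~/.sky/clusters/c.yml'
print(denormalize(stored))  # resolves under whatever the current home is
```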
@@ -2177,12 +2217,6 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
         self.launched_nodes = launched_nodes
         self.launched_resources = launched_resources
         self.docker_user: Optional[str] = None
-        # Deprecated. SkyPilot new provisioner API handles the TPU node
-        # creation/deletion.
-        # Backward compatibility for TPU nodes created before #2943.
-        # TODO (zhwu): Remove this after 0.6.0.
-        self.tpu_create_script = tpu_create_script
-        self.tpu_delete_script = tpu_delete_script
 
     def __repr__(self):
         return (f'ResourceHandle('
@@ -2198,10 +2232,7 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
                 f'\n\tlaunched_resources={self.launched_nodes}x '
                 f'{self.launched_resources}, '
                 f'\n\tdocker_user={self.docker_user},'
-                f'\n\tssh_user={self.ssh_user}
-                # TODO (zhwu): Remove this after 0.6.0.
-                f'\n\ttpu_create_script={self.tpu_create_script}, '
-                f'\n\ttpu_delete_script={self.tpu_delete_script})')
+                f'\n\tssh_user={self.ssh_user}')
 
     def get_cluster_name(self):
         return self.cluster_name
@@ -2214,26 +2245,6 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
         return common_utils.read_yaml(self.cluster_yaml).get(
             'provider', {}).get('use_internal_ips', False)
 
-    def _update_cluster_region(self):
-        """Update the region in handle.launched_resources.
-
-        This is for backward compatibility to handle the clusters launched
-        long before. We should remove this after 0.6.0.
-        """
-        if self.launched_resources.region is not None:
-            return
-
-        config = common_utils.read_yaml(self.cluster_yaml)
-        provider = config['provider']
-        cloud = self.launched_resources.cloud
-        if cloud.is_same_cloud(clouds.Azure()):
-            region = provider['location']
-        elif cloud.is_same_cloud(clouds.GCP()) or cloud.is_same_cloud(
-                clouds.AWS()):
-            region = provider['region']
-
-        self.launched_resources = self.launched_resources.copy(region=region)
-
     def update_ssh_ports(self, max_attempts: int = 1) -> None:
         """Fetches and sets the SSH ports for the cluster nodes.
 
@@ -2322,9 +2333,7 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
         """
         if cluster_info is not None:
             self.cached_cluster_info = cluster_info
-
-            cluster_feasible_ips = self.cached_cluster_info.get_feasible_ips(
-                use_internal_ips)
+            cluster_feasible_ips = self.cached_cluster_info.get_feasible_ips()
             cluster_internal_ips = self.cached_cluster_info.get_feasible_ips(
                 force_internal_ips=True)
         else:
@@ -2403,7 +2412,7 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
             internal_external_ips[1:], key=lambda x: x[1])
         self.stable_internal_external_ips = stable_internal_external_ips
 
-    @
+    @annotations.lru_cache(scope='global')
     @timeline.event
     def get_command_runners(self,
                             force_cached: bool = False,
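Note: `annotations.lru_cache(scope='global')` is a SkyPilot-internal wrapper whose exact semantics are not shown in this diff. One plausible reading, sketched here purely as an assumption, is a `functools.lru_cache` variant whose `scope` tag decides which caches a bulk-clear helper resets:

```python
import functools
from typing import Callable, Dict, List

_SCOPED_CACHES: Dict[str, List] = {'global': [], 'request': []}


def lru_cache(scope: str, maxsize: int = 128) -> Callable:
    """functools.lru_cache with a named scope (a sketch, not SkyPilot's code)."""

    def decorator(func: Callable) -> Callable:
        cached = functools.lru_cache(maxsize=maxsize)(func)
        _SCOPED_CACHES[scope].append(cached)
        return cached

    return decorator


def clear_caches(scope: str) -> None:
    """Reset every cache registered under `scope`."""
    for cached in _SCOPED_CACHES[scope]:
        cached.cache_clear()


@lru_cache(scope='global')
def expensive(x: int) -> int:
    print(f'computing {x}')
    return x * x


expensive(3)  # prints 'computing 3'
expensive(3)  # served from cache, no print
clear_caches('global')
expensive(3)  # recomputed after the scope-wide clear
```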
@@ -2414,8 +2423,20 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
             self.cluster_yaml, self.docker_user, self.ssh_user)
         if avoid_ssh_control:
             ssh_credentials.pop('ssh_control_name', None)
+        updated_to_skypilot_provisioner_after_provisioned = (
+            self.launched_resources.cloud.PROVISIONER_VERSION >=
+            clouds.ProvisionerVersion.SKYPILOT and
+            self.cached_external_ips is not None and
+            self.cached_cluster_info is None)
+        if updated_to_skypilot_provisioner_after_provisioned:
+            logger.debug(
+                f'{self.launched_resources.cloud} has been updated to the new '
+                f'provisioner after cluster {self.cluster_name} was '
+                f'provisioned. Cached IPs are used for connecting to the '
+                'cluster.')
         if (clouds.ProvisionerVersion.RAY_PROVISIONER_SKYPILOT_TERMINATOR >=
-                self.launched_resources.cloud.PROVISIONER_VERSION
+                self.launched_resources.cloud.PROVISIONER_VERSION or
+                updated_to_skypilot_provisioner_after_provisioned):
             ip_list = (self.cached_external_ips
                        if force_cached else self.external_ips())
             if ip_list is None:
@@ -2428,7 +2449,17 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
                 zip(ip_list, port_list), **ssh_credentials)
             return runners
         if self.cached_cluster_info is None:
-
+            # We have `and self.cached_external_ips is None` here, because
+            # when a cluster's cloud is just upgraded to the new provisioner,
+            # although it has the cached_external_ips, the cached_cluster_info
+            # can be None. We need to update it here, even when force_cached is
+            # set to True.
+            # TODO: We can remove `self.cached_external_ips is None` after
+            # all clouds moved to new provisioner.
+            if force_cached and self.cached_external_ips is None:
+                raise RuntimeError(
+                    'Tried to use cached cluster info, but it\'s missing for '
+                    f'cluster "{self.cluster_name}"')
             self._update_cluster_info()
         assert self.cached_cluster_info is not None, self
         runners = provision_lib.get_command_runners(
@@ -2498,9 +2529,15 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
         self.docker_user = docker_user
 
     @property
-    def cluster_yaml(self):
+    def cluster_yaml(self) -> Optional[str]:
+        if self._cluster_yaml is None:
+            return None
         return os.path.expanduser(self._cluster_yaml)
 
+    @cluster_yaml.setter
+    def cluster_yaml(self, value: Optional[str]):
+        self._cluster_yaml = value
+
     @property
     def ssh_user(self):
         if self.cached_cluster_info is not None:
@@ -2530,7 +2567,7 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
         """Returns number of IPs per node in the cluster, handling TPU Pod."""
         is_tpu_vm_pod = gcp_utils.is_tpu_vm_pod(self.launched_resources)
         if is_tpu_vm_pod:
-            num_ips =
+            num_ips = len(self.internal_ips())
         else:
             num_ips = 1
         return num_ips
@@ -2559,6 +2596,35 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
         if version < 8:
             self.cached_cluster_info = None
 
+        if version < 9:
+            # For backward compatibility, we should update the region of a
+            # SkyPilot cluster on Kubernetes to the actual context it is using.
+            # pylint: disable=import-outside-toplevel
+            launched_resources = state['launched_resources']
+            if isinstance(launched_resources.cloud, clouds.Kubernetes):
+                yaml_config = common_utils.read_yaml(
+                    os.path.expanduser(state['_cluster_yaml']))
+                context = kubernetes_utils.get_context_from_config(
+                    yaml_config['provider'])
+                state['launched_resources'] = launched_resources.copy(
+                    region=context)
+
+        if version < 10:
+            # In #4660, we keep the cluster entry in the database even when it
+            # is in the transition from one region to another during the
+            # failover. We allow `handle.cluster_yaml` to be None to indicate
+            # that the cluster yaml is intentionally removed. Before that PR,
+            # the `handle.cluster_yaml` is always not None, even if it is
+            # intentionally removed.
+            #
+            # For backward compatibility, we set the `_cluster_yaml` to None
+            # if the file does not exist, assuming all the removals of the
+            # _cluster_yaml for existing clusters are intentional by SkyPilot.
+            if state['_cluster_yaml'] is not None and not os.path.exists(
+                    os.path.expanduser(state['_cluster_yaml'])):
+                state['_cluster_yaml'] = None
+
         self.__dict__.update(state)
 
         # Because the update_cluster_ips and update_ssh_ports
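Note: the two new migration branches follow the versioned-`__setstate__` pattern: every pickled handle records the `_VERSION` it was written with, and on unpickle each `if version < N` block upgrades the raw state dict in order. A minimal, self-contained sketch of the pattern (field names here are illustrative, not SkyPilot's):

```python
import pickle


class Handle:
    _VERSION = 2

    def __init__(self, name: str, region: str = 'unknown'):
        self._version = self._VERSION
        self.name = name
        self.region = region

    def __setstate__(self, state: dict) -> None:
        # Default to 0 for objects pickled before versioning existed.
        version = state.pop('_version', 0)
        if version < 1:
            # v1 renamed 'cluster' -> 'name'.
            state['name'] = state.pop('cluster', 'unnamed')
        if version < 2:
            # v2 added 'region'; backfill a safe default.
            state.setdefault('region', 'unknown')
        state['_version'] = self._VERSION
        self.__dict__.update(state)


# Round-trip: new pickles pass through; old ones are upgraded in order.
h = pickle.loads(pickle.dumps(Handle('c1', 'us-east-1')))
assert (h.name, h.region) == ('c1', 'us-east-1')
```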
@@ -2574,8 +2640,6 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
         if version < 4:
             self.update_ssh_ports()
 
-        self._update_cluster_region()
-
         if version < 8:
             try:
                 self._update_cluster_info()
@@ -2585,6 +2649,7 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
             pass
 
 
+@registry.BACKEND_REGISTRY.type_register(name='cloudvmray')
 class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
     """Backend: runs on cloud virtual machines, managed by Ray.
 
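Note: the new `@registry.BACKEND_REGISTRY.type_register(name='cloudvmray')` line registers the backend class under a string key at import time. The `registry` module itself is not shown in this diff; a hedged sketch of what such a class-decorator registry typically looks like:

```python
from typing import Callable, Dict, Type


class TypeRegistry:
    """Maps string names to classes, populated via a class decorator."""

    def __init__(self) -> None:
        self._types: Dict[str, Type] = {}

    def type_register(self, name: str) -> Callable[[Type], Type]:
        def decorator(cls: Type) -> Type:
            self._types[name] = cls
            return cls  # Return the class unchanged.

        return decorator

    def from_name(self, name: str) -> Type:
        return self._types[name]


BACKEND_REGISTRY = TypeRegistry()


@BACKEND_REGISTRY.type_register(name='cloudvmray')
class CloudVmRayBackendStub:
    pass


assert BACKEND_REGISTRY.from_name('cloudvmray') is CloudVmRayBackendStub
```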
@@ -2599,7 +2664,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
     ResourceHandle = CloudVmRayResourceHandle  # pylint: disable=invalid-name
 
     def __init__(self):
-        self.run_timestamp =
+        self.run_timestamp = sky_logging.get_run_timestamp()
         # NOTE: do not expanduser() here, as this '~/...' path is used for
         # remote as well to be expanded on the remote side.
         self.log_dir = os.path.join(constants.SKY_LOGS_DIRECTORY,
@@ -2614,7 +2679,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
 
         # Command for running the setup script. It is only set when the
         # setup needs to be run outside the self._setup() and as part of
-        # a job (
+        # a job (detach_setup, default).
         self._setup_cmd = None
 
     # --- Implementation of Backend APIs ---
@@ -2623,10 +2688,10 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         self._dag = kwargs.pop('dag', self._dag)
         self._optimize_target = kwargs.pop(
             'optimize_target',
-            self._optimize_target) or
+            self._optimize_target) or common.OptimizeTarget.COST
         self._requested_features = kwargs.pop('requested_features',
                                               self._requested_features)
-        assert
+        assert not kwargs, f'Unexpected kwargs: {kwargs}'
 
     def check_resources_fit_cluster(
         self,
@@ -2656,8 +2721,6 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         if record is not None:
             usage_lib.messages.usage.update_cluster_status(record['status'])
 
-        # Backward compatibility: the old launched_resources without region info
-        # was handled by ResourceHandle._update_cluster_region.
         assert launched_resources.region is not None, handle
 
         mismatch_str = (f'To fix: specify a new cluster name, or down the '
@@ -2720,17 +2783,39 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                 f' Existing:\t{handle.launched_nodes}x '
                 f'{handle.launched_resources}\n'
                 f'{mismatch_str}')
+        else:
+            # For fractional acc count clusters, we round up the number of accs
+            # to 1 (sky/utils/resources_utils.py::make_ray_custom_resources_str)
+            # Here we scale the required acc count to (required / launched) * 1
+            # so the total number of accs is the same as the requested number.
+            launched_accs = launched_resources.accelerators
+            if (launched_accs is not None and
+                    valid_resource.accelerators is not None):
+                for _, count in launched_accs.items():
+                    if isinstance(count, float) and not count.is_integer():
+                        valid_resource = valid_resource.copy(
+                            accelerators={
+                                k: v / count
+                                for k, v in valid_resource.accelerators.items()
+                            })
         return valid_resource
 
     def _provision(
-
-
-
-
-
-
-
+            self,
+            task: task_lib.Task,
+            to_provision: Optional[resources_lib.Resources],
+            dryrun: bool,
+            stream_logs: bool,
+            cluster_name: str,
+            retry_until_up: bool = False,
+            skip_unnecessary_provisioning: bool = False,
+    ) -> Optional[CloudVmRayResourceHandle]:
+        """Provisions the cluster, or re-provisions an existing cluster.
+
+        Use the SKYPILOT provisioner if it's supported by the cloud, otherwise
+        use 'ray up'.
+
+        See also docstring for Backend.provision().
 
         Raises:
             exceptions.ClusterOwnerIdentityMismatchError: if the cluster
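Note: the fractional-accelerator branch above rescales the requested count so Ray's custom-resource bookkeeping (which rounds a fractional launch up to 1 unit) still adds up. For example, on a cluster launched with `A10:0.5`, Ray advertises 1 unit; a task asking for `A10:0.5` must therefore request 0.5 / 0.5 = 1 unit. A small sketch of that arithmetic, with an illustrative helper name:

```python
from typing import Dict, Union

Count = Union[int, float]


def scale_fractional_accs(requested: Dict[str, Count],
                          launched: Dict[str, Count]) -> Dict[str, Count]:
    """Rescale requested accelerator counts against a fractional launch.

    Mirrors the diff's logic: if the cluster was launched with a
    non-integer count, Ray rounds it up to 1 custom-resource unit, so a
    request is divided by the launched count to land in [0, 1].
    """
    for _, count in launched.items():
        if isinstance(count, float) and not count.is_integer():
            return {k: v / count for k, v in requested.items()}
    return requested


# Cluster launched with half a GPU; a task asking for that same half
# consumes the whole advertised unit.
assert scale_fractional_accs({'A10': 0.5}, {'A10': 0.5}) == {'A10': 1.0}
# A quarter-GPU request on a half-GPU cluster consumes 0.5 units.
assert scale_fractional_accs({'A10': 0.25}, {'A10': 0.5}) == {'A10': 0.5}
```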
@@ -2744,7 +2829,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             (e.g., cluster name invalid) or a region/zone throwing
             resource unavailability.
             exceptions.CommandError: any ssh command error.
-
+            RuntimeError: raised when 'rsync' is not installed.
             # TODO(zhwu): complete the list of exceptions.
         """
         # FIXME: ray up for Azure with different cluster_names will overwrite
@@ -2811,55 +2896,78 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                     local_wheel_path,
                     wheel_hash,
                     blocked_resources=task.blocked_resources)
+                log_path = os.path.join(self.log_dir, 'provision.log')
+                rich_utils.force_update_status(
+                    ux_utils.spinner_message('Launching', log_path))
                 config_dict = retry_provisioner.provision_with_retries(
-                    task, to_provision_config, dryrun, stream_logs
+                    task, to_provision_config, dryrun, stream_logs,
+                    skip_unnecessary_provisioning)
                 break
             except exceptions.ResourcesUnavailableError as e:
-
-
+                log_path = retry_provisioner.log_dir + '/provision.log'
+                error_message = (
+                    f'{colorama.Fore.RED}Failed to provision all '
+                    f'possible launchable resources.'
+                    f'{colorama.Style.RESET_ALL}'
+                    ' Relax the task\'s resource requirements: '
+                    f'{task.num_nodes}x {list(task.resources)[0]}')
                 if e.no_failover:
                     error_message = str(e)
-
-                # Clean up the cluster's entry in `sky status`.
-                global_user_state.remove_cluster(cluster_name,
-                                                 terminate=True)
-                usage_lib.messages.usage.update_final_cluster_status(
-                    None)
-                error_message = (
-                    'Failed to provision all possible launchable '
-                    'resources.'
-                    f' Relax the task\'s resource requirements: '
-                    f'{task.num_nodes}x {list(task.resources)[0]}')
+
                 if retry_until_up:
                     logger.error(error_message)
                     # Sleep and retry.
                     gap_seconds = backoff.current_backoff()
                     plural = 's' if attempt_cnt > 1 else ''
-
-                    f'
-                    f'{
-
-
-
+                    retry_message = ux_utils.retry_message(
+                        f'Retry after {gap_seconds:.0f}s '
+                        f'({attempt_cnt} attempt{plural}). ')
+                    logger.info(f'\n{retry_message} '
+                                f'{ux_utils.log_path_hint(log_path)}'
+                                f'{colorama.Style.RESET_ALL}')
                     attempt_cnt += 1
                     time.sleep(gap_seconds)
                     continue
+                # Clean up the cluster's entry in `sky status`.
+                # Do not remove the stopped cluster from the global state
+                # if failed to start.
+                if not e.no_failover:
+                    global_user_state.remove_cluster(cluster_name,
+                                                     terminate=True)
+                    usage_lib.messages.usage.update_final_cluster_status(
+                        None)
+                logger.error(
+                    ux_utils.error_message(
+                        'Failed to provision resources. '
+                        f'{ux_utils.log_path_hint(log_path)}'))
                 error_message += (
-                    '\nTo keep retrying until the cluster is up, use
-                    '`--retry-until-up` flag.')
+                    '\nTo keep retrying until the cluster is up, use '
+                    'the `--retry-until-up` flag.')
                 with ux_utils.print_exception_no_traceback():
                     raise exceptions.ResourcesUnavailableError(
-                        error_message,
+                        error_message + '\n' + str(e),
                         failover_history=e.failover_history) from None
         if dryrun:
             record = global_user_state.get_cluster_from_name(cluster_name)
             return record['handle'] if record is not None else None
+
+        if config_dict['provisioning_skipped']:
+            # Skip further provisioning.
+            # In this case, we won't have certain fields in the config_dict
+            # ('handle', 'provision_record', 'resources_vars').
+            # We need to return the handle - but it should be the existing
+            # handle for the cluster.
+            record = global_user_state.get_cluster_from_name(cluster_name)
+            assert record is not None and record['handle'] is not None, (
+                cluster_name, record)
+            return record['handle']
+
         if 'provision_record' in config_dict:
             # New provisioner is used here.
             handle = config_dict['handle']
             provision_record = config_dict['provision_record']
             resources_vars = config_dict['resources_vars']
+            config_hash = config_dict.get('config_hash', None)
 
             # Setup SkyPilot runtime after the cluster is provisioned
             # 1. Wait for SSH to be ready.
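Note: `provisioning_skipped` short-circuits a relaunch when the freshly rendered cluster config hashes to the same value recorded at the last launch (`skip_if_config_hash_matches` earlier in this diff). A minimal sketch of the idea, with `stable_hash` as a hypothetical helper (SkyPilot's actual hashing of the cluster YAML may differ):

```python
import hashlib
import json
from typing import Optional


def stable_hash(config: dict) -> str:
    """Hash a config dict deterministically (sorted keys, canonical JSON)."""
    blob = json.dumps(config, sort_keys=True, separators=(',', ':'))
    return hashlib.sha256(blob.encode('utf-8')).hexdigest()


def should_skip_provisioning(new_config: dict,
                             prev_config_hash: Optional[str]) -> bool:
    """True when the cluster is already up with an identical config."""
    return (prev_config_hash is not None and
            stable_hash(new_config) == prev_config_hash)


prev = stable_hash({'num_nodes': 2, 'instance_type': 'm5.large'})
assert should_skip_provisioning(
    {'num_nodes': 2, 'instance_type': 'm5.large'}, prev)
assert not should_skip_provisioning(
    {'num_nodes': 4, 'instance_type': 'm5.large'}, prev)
```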
@@ -2869,8 +2977,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             # 4. Starting ray cluster and skylet.
             cluster_info = provisioner.post_provision_runtime_setup(
                 repr(handle.launched_resources.cloud),
-
-
+                resources_utils.ClusterName(handle.cluster_name,
+                                            handle.cluster_name_on_cloud),
                 handle.cluster_yaml,
                 provision_record=provision_record,
                 custom_resource=resources_vars.get('custom_resources'),
@@ -2893,8 +3001,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
 
             self._update_after_cluster_provisioned(
                 handle, to_provision_config.prev_handle, task,
-                prev_cluster_status,
-                handle.external_ssh_ports(), lock_path)
+                prev_cluster_status, lock_path, config_hash)
             return handle
 
         cluster_config_file = config_dict['ray']
@@ -2957,7 +3064,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         # and restarted if necessary.
         logger.debug('Checking if skylet is running on the head node.')
         with rich_utils.safe_status(
-                '
+                ux_utils.spinner_message('Preparing SkyPilot runtime')):
             # We need to source bashrc for skylet to make sure the autostop
             # event can access the path to the cloud CLIs.
             self.run_on_head(handle,
@@ -2966,7 +3073,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
 
         self._update_after_cluster_provisioned(
             handle, to_provision_config.prev_handle, task,
-            prev_cluster_status,
+            prev_cluster_status, lock_path, config_hash)
         return handle
 
     def _open_ports(self, handle: CloudVmRayResourceHandle) -> None:
@@ -2984,8 +3091,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             prev_handle: Optional[CloudVmRayResourceHandle],
             task: task_lib.Task,
             prev_cluster_status: Optional[status_lib.ClusterStatus],
-
-            lock_path: str) -> None:
+            lock_path: str, config_hash: str) -> None:
         usage_lib.messages.usage.update_cluster_resources(
             handle.launched_nodes, handle.launched_resources)
         usage_lib.messages.usage.update_final_cluster_status(
@@ -3000,7 +3106,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             cmd = job_lib.JobLibCodeGen.update_status()
             logger.debug('Update job queue on remote cluster.')
             with rich_utils.safe_status(
-                    '
+                    ux_utils.spinner_message('Preparing SkyPilot runtime')):
                 returncode, _, stderr = self.run_on_head(handle,
                                                          cmd,
                                                          require_outputs=True)
@@ -3031,9 +3137,13 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                 resources_utils.port_ranges_to_set(current_ports) -
                 resources_utils.port_ranges_to_set(prev_ports))
             if open_new_ports:
-
-
-
+                cloud = handle.launched_resources.cloud
+                if not (cloud.OPEN_PORTS_VERSION <=
+                        clouds.OpenPortsVersion.LAUNCH_ONLY):
+                    with rich_utils.safe_status(
+                            ux_utils.spinner_message(
+                                'Launching - Opening new ports')):
+                        self._open_ports(handle)
 
         with timeline.Event('backend.provision.post_process'):
             global_user_state.add_or_update_cluster(
@@ -3041,15 +3151,21 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                 handle,
                 set(task.resources),
                 ready=True,
+                config_hash=config_hash,
             )
             usage_lib.messages.usage.update_final_cluster_status(
                 status_lib.ClusterStatus.UP)
-
-
-
-
-
+            # We still add the cluster to ssh config file on API server, this
+            # is helpful for people trying to use `sky launch`'ed cluster for
+            # ssh proxy jump.
+            auth_config = backend_utils.ssh_credential_from_yaml(
+                handle.cluster_yaml,
+                ssh_user=handle.ssh_user,
+                docker_user=handle.docker_user)
+            cluster_utils.SSHConfigHelper.add_cluster(
+                handle.cluster_name, handle.cached_external_ips, auth_config,
+                handle.cached_external_ssh_ports, handle.docker_user,
+                handle.ssh_user)
 
         common_utils.remove_file_if_exists(lock_path)
 
@@ -3078,9 +3194,9 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         dir_size = backend_utils.path_size_megabytes(full_workdir)
         if dir_size >= _PATH_SIZE_MEGABYTES_WARN_THRESHOLD:
             logger.warning(
-                f'{fore.YELLOW}The size of workdir {workdir!r} '
+                f'  {fore.YELLOW}The size of workdir {workdir!r} '
                 f'is {dir_size} MB. Try to keep workdir small or use '
-                '.
+                '.skyignore to exclude large files, as large sizes will slow '
                 f'down rsync.{style.RESET_ALL}')
 
         log_path = os.path.join(self.log_dir, 'workdir_sync.log')
@@ -3100,17 +3216,17 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         num_nodes = handle.launched_nodes
         plural = 's' if num_nodes > 1 else ''
         logger.info(
-            f'{
-            f'{
-            f' -> '
-            f'{style.BRIGHT}{SKY_REMOTE_WORKDIR}{style.RESET_ALL}')
+            f'  {style.DIM}Syncing workdir (to {num_nodes} node{plural}): '
+            f'{workdir} -> {SKY_REMOTE_WORKDIR}{style.RESET_ALL}')
         os.makedirs(os.path.expanduser(self.log_dir), exist_ok=True)
         os.system(f'touch {log_path}')
-
-
-
-
-        subprocess_utils.run_in_parallel(_sync_workdir_node, runners
+        num_threads = subprocess_utils.get_parallel_threads(
+            str(handle.launched_resources.cloud))
+        with rich_utils.safe_status(
+                ux_utils.spinner_message('Syncing workdir', log_path)):
+            subprocess_utils.run_in_parallel(_sync_workdir_node, runners,
+                                             num_threads)
+        logger.info(ux_utils.finishing_message('Synced workdir.', log_path))
 
     def _sync_file_mounts(
         self,
@@ -3118,18 +3234,25 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         all_file_mounts: Optional[Dict[Path, Path]],
         storage_mounts: Optional[Dict[Path, storage_lib.Storage]],
     ) -> None:
-        """Mounts all user files to the remote nodes.
-
-
-
-
+        """Mounts all user files to the remote nodes.
+
+        Note: This does not handle COPY storage_mounts. These should have
+        already been translated into file_mounts by task.sync_storage_mounts().
+
+        TODO: Delete COPY storage_mounts in task.sync_storage_mounts(), and
+        assert here that all storage_mounts are MOUNT mode.
+        """
+        with rich_utils.safe_status(ux_utils.spinner_message('Syncing files')):
+            controller_utils.replace_skypilot_config_path_in_file_mounts(
+                handle.launched_resources.cloud, all_file_mounts)
+            self._execute_file_mounts(handle, all_file_mounts)
+            self._execute_storage_mounts(handle, storage_mounts)
+            self._set_storage_mounts_metadata(handle.cluster_name,
+                                              storage_mounts)
 
     def _setup(self, handle: CloudVmRayResourceHandle, task: task_lib.Task,
                detach_setup: bool) -> None:
         start = time.time()
-        style = colorama.Style
-        fore = colorama.Fore
 
         if task.setup is None:
             return
@@ -3150,8 +3273,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             setup_script = log_lib.make_task_bash_script(setup,
                                                          env_vars=setup_envs)
             encoded_script = shlex.quote(setup_script)
-
-
+
+            def _dump_setup_script(setup_script: str) -> None:
                 with tempfile.NamedTemporaryFile('w', prefix='sky_setup_') as f:
                     f.write(setup_script)
                     f.flush()
@@ -3160,6 +3283,9 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                         target=remote_setup_file_name,
                         up=True,
                         stream_logs=False)
+
+            if detach_setup or _is_command_length_over_limit(encoded_script):
+                _dump_setup_script(setup_script)
                 create_script_code = 'true'
             else:
                 create_script_code = (f'{{ echo {encoded_script} > '
@@ -3167,20 +3293,52 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
 
             if detach_setup:
                 return
+
             setup_log_path = os.path.join(self.log_dir,
                                           f'setup-{runner.node_id}.log')
-
-
-
-
-
-
-
-
-
-
-
+
+            def _run_setup(setup_cmd: str) -> int:
+                returncode = runner.run(
+                    setup_cmd,
+                    log_path=setup_log_path,
+                    process_stream=False,
+                    # We do not source bashrc for setup, since bashrc is sourced
+                    # in the script already.
+                    # Skip an empty line and two lines due to the /bin/bash -i
+                    # and source ~/.bashrc in the setup_cmd.
+                    #   bash: cannot set terminal process group (7398): Inappropriate ioctl for device  # pylint: disable=line-too-long
+                    #   bash: no job control in this shell
                    skip_num_lines=3)
+                return returncode
+
+            returncode = _run_setup(f'{create_script_code} && {setup_cmd}',)
+            if returncode == 255:
+                is_message_too_long = False
+                try:
+                    with open(os.path.expanduser(setup_log_path),
+                              'r',
+                              encoding='utf-8') as f:
+                        if 'too long' in f.read():
+                            is_message_too_long = True
+                except Exception as e:  # pylint: disable=broad-except
+                    # We don't crash the setup if we cannot read the log file.
+                    # Instead, we should retry the setup with dumping the script
+                    # to a file to be safe.
+                    logger.debug('Failed to read setup log file '
+                                 f'{setup_log_path}: {e}')
+                    is_message_too_long = True
+
+                if is_message_too_long:
+                    # If the setup script is too long, we retry it with dumping
+                    # the script to a file and running it with SSH. We use a
+                    # general length limit check before but it could be
+                    # inaccurate on some systems.
+                    logger.debug(
+                        'Failed to run setup command inline due to '
+                        'command length limit. Dumping setup script to '
+                        'file and running it with SSH.')
+                    _dump_setup_script(setup_script)
+                    returncode = _run_setup(setup_cmd)
 
         def error_message() -> str:
             # Use the function to avoid tailing the file in success case
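Note: both the setup and job-submission paths in this diff now share the same fallback: try to inline the script in the SSH command, and if the remote shell rejects it as too long (exit 255 with an "Argument list too long"-style error), write the script to a file and execute the file instead. A hedged sketch of that control flow, with `run_inline`/`upload_and_run` as stand-in helpers (the real code runs over SSH and rsyncs the file to the node):

```python
import subprocess
import tempfile


def run_inline(script: str) -> int:
    """Attempt to run the script inline via the shell (stand-in for SSH)."""
    return subprocess.run(['bash', '-c', script], check=False).returncode


def upload_and_run(script: str) -> int:
    """Fallback: persist the script and execute the file instead."""
    with tempfile.NamedTemporaryFile('w', suffix='.sh', delete=False) as f:
        f.write(script)
        path = f.name  # In the real flow this would be rsynced to the node.
    return subprocess.run(['bash', path], check=False).returncode


def run_with_length_fallback(script: str, soft_limit: int = 100_000) -> int:
    # Cheap local pre-check, mirroring _is_command_length_over_limit();
    # the true limit varies by system (see `getconf ARG_MAX`), so this is
    # only a heuristic and the exit-255 path below is the real safety net.
    if len(script) > soft_limit:
        return upload_and_run(script)
    rc = run_inline(script)
    if rc == 255:  # SSH-side failure; may be the length limit.
        rc = upload_and_run(script)
    return rc


print(run_with_length_fallback('echo hello'))
```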
@@ -3211,23 +3369,33 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
 
         num_nodes = len(runners)
         plural = 's' if num_nodes > 1 else ''
+        node_str = f'{num_nodes} VM{plural}'
+        if isinstance(handle.launched_resources.cloud, clouds.Kubernetes):
+            node_str = f'{num_nodes} pod{plural}'
+        controller = controller_utils.Controllers.from_name(handle.cluster_name)
+        if controller is not None:
+            node_str = controller.value.name
         if not detach_setup:
-            logger.info(
-
+            logger.info(
+                ux_utils.starting_message(f'Running setup on {node_str}.'))
         # TODO(zhwu): run_in_parallel uses multi-thread to run the commands,
         # which can cause the program waiting for all the threads to finish,
         # even if some of them raise exceptions. We should replace it with
         # multi-process.
+        rich_utils.stop_safe_status()
         subprocess_utils.run_in_parallel(_setup_node, list(range(num_nodes)))
 
         if detach_setup:
             # Only set this when setup needs to be run outside the self._setup()
-            # as part of a job (
+            # as part of a job (detach_setup, default).
             self._setup_cmd = setup_cmd
+            logger.info(ux_utils.finishing_message('Setup detached.'))
             return
-        logger.info(f'{fore.GREEN}Setup completed.{style.RESET_ALL}')
         end = time.time()
         logger.debug(f'Setup took {end - start} seconds.')
+        setup_log_path = os.path.join(self.log_dir, 'setup-*.log')
+        logger.info(
+            ux_utils.finishing_message('Setup completed.', setup_log_path))
 
     def _exec_code_on_head(
         self,
@@ -3238,9 +3406,6 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         managed_job_dag: Optional['dag.Dag'] = None,
     ) -> None:
         """Executes generated code on the head node."""
-        style = colorama.Style
-        fore = colorama.Fore
-
         script_path = os.path.join(SKY_REMOTE_APP_DIR, f'sky_job_{job_id}')
         remote_log_dir = self.log_dir
         remote_log_path = os.path.join(remote_log_dir, 'run.log')
@@ -3252,17 +3417,18 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         encoded_script = shlex.quote(codegen)
         create_script_code = (f'{{ echo {encoded_script} > {script_path}; }}')
         job_submit_cmd = (
-
-
-            '
-            f'
-            #
-
-
+            # JOB_CMD_IDENTIFIER is used for identifying the process retrieved
+            # with pid is the same driver process.
+            f'{job_lib.JOB_CMD_IDENTIFIER.format(job_id)} && '
+            f'{cd} && {constants.SKY_PYTHON_CMD} -u {script_path}'
+            # Do not use &>, which is not POSIX and may not work.
+            # Note that the order of ">filename 2>&1" matters.
+            f'> {remote_log_path} 2>&1')
 
         code = job_lib.JobLibCodeGen.queue_job(job_id, job_submit_cmd)
         job_submit_cmd = ' && '.join([mkdir_code, create_script_code, code])
-
+
+        def _dump_code_to_file(codegen: str) -> None:
             runners = handle.get_command_runners()
             head_runner = runners[0]
             with tempfile.NamedTemporaryFile('w', prefix='sky_app_') as fp:
@@ -3277,6 +3443,9 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                     target=script_path,
                     up=True,
                     stream_logs=False)
+
+        if _is_command_length_over_limit(job_submit_cmd):
+            _dump_code_to_file(codegen)
             job_submit_cmd = f'{mkdir_code} && {code}'
 
         if managed_job_dag is not None:
@@ -3285,90 +3454,72 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             managed_job_code = managed_job_codegen.set_pending(
                 job_id, managed_job_dag)
             # Set the managed job to PENDING state to make sure that this
-            # managed job appears in the `sky jobs queue`,
-            #
-            #
-            #
-            #
-            # controller process job has been queued, as our skylet on spot
-            # controller will set the managed job in FAILED state if the
-            # controller process job does not exist.
-            # We cannot set the managed job to PENDING state in the codegen for
-            # the controller process job, as it will stay in the job pending
-            # table and not be executed until there is an empty slot.
+            # managed job appears in the `sky jobs queue`, even if it needs to
+            # wait to be submitted.
+            # We cannot set the managed job to PENDING state in the job template
+            # (jobs-controller.yaml.j2), as it may need to wait for the run
+            # commands to be scheduled on the job controller in high-load cases.
             job_submit_cmd = job_submit_cmd + ' && ' + managed_job_code
 
         returncode, stdout, stderr = self.run_on_head(handle,
                                                       job_submit_cmd,
                                                       stream_logs=False,
                                                       require_outputs=True)
-
-        #
-
-        backend_utils.check_stale_runtime_on_remote(returncode, stdout,
+        # Happens when someone calls `sky exec` but remote is outdated for
+        # running a job. Necessitating calling `sky launch`.
+        backend_utils.check_stale_runtime_on_remote(returncode, stderr,
                                                     handle.cluster_name)
+        if returncode == 255 and 'too long' in stdout + stderr:
+            # If the generated script is too long, we retry it with dumping
+            # the script to a file and running it with SSH. We use a general
+            # length limit check before but it could be inaccurate on some
+            # systems.
+            logger.debug('Failed to submit job due to command length limit. '
+                         'Dumping job to file and running it with SSH.')
+            _dump_code_to_file(codegen)
+            job_submit_cmd = f'{mkdir_code} && {code}'
+            returncode, stdout, stderr = self.run_on_head(handle,
+                                                          job_submit_cmd,
+                                                          stream_logs=False,
+                                                          require_outputs=True)
+
         subprocess_utils.handle_returncode(returncode,
                                            job_submit_cmd,
                                            f'Failed to submit job {job_id}.',
                                            stderr=stdout + stderr)
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        if controller == controller_utils.Controllers.JOBS_CONTROLLER:
-            logger.info(
-                f'{fore.CYAN}Managed Job ID: '
-                f'{style.BRIGHT}{job_id}{style.RESET_ALL}'
-                '\nTo cancel the job:\t\t'
-                f'{backend_utils.BOLD}sky jobs cancel {job_id}'
-                f'{backend_utils.RESET_BOLD}'
-                '\nTo stream job logs:\t\t'
-                f'{backend_utils.BOLD}sky jobs logs {job_id}'
-                f'{backend_utils.RESET_BOLD}'
-                f'\nTo stream controller logs:\t'
-                f'{backend_utils.BOLD}sky jobs logs --controller {job_id}'
-                f'{backend_utils.RESET_BOLD}'
-                '\nTo view all managed jobs:\t'
-                f'{backend_utils.BOLD}sky jobs queue'
-                f'{backend_utils.RESET_BOLD}'
-                '\nTo view managed job dashboard:\t'
-                f'{backend_utils.BOLD}sky jobs dashboard'
-                f'{backend_utils.RESET_BOLD}')
-        elif controller is None:
-            logger.info(f'{fore.CYAN}Job ID: '
-                        f'{style.BRIGHT}{job_id}{style.RESET_ALL}'
-                        '\nTo cancel the job:\t'
-                        f'{backend_utils.BOLD}sky cancel {name} {job_id}'
-                        f'{backend_utils.RESET_BOLD}'
-                        '\nTo stream job logs:\t'
-                        f'{backend_utils.BOLD}sky logs {name} {job_id}'
-                        f'{backend_utils.RESET_BOLD}'
-                        '\nTo view the job queue:\t'
-                        f'{backend_utils.BOLD}sky queue {name}'
-                        f'{backend_utils.RESET_BOLD}')
+        controller = controller_utils.Controllers.from_name(handle.cluster_name)
+        if controller == controller_utils.Controllers.SKY_SERVE_CONTROLLER:
+            logger.info(ux_utils.starting_message('Service registered.'))
+        else:
+            logger.info(
+                ux_utils.starting_message(f'Job submitted, ID: {job_id}'))
+        rich_utils.stop_safe_status()
+        if not detach_run:
+            if (handle.cluster_name == controller_utils.Controllers.
+                    JOBS_CONTROLLER.value.cluster_name):
+                self.tail_managed_job_logs(handle, job_id)
+            else:
+                # Sky logs. Not using subprocess.run since it will make the
+                # ssh keep connected after ctrl-c.
+                self.tail_logs(handle, job_id)
 
     def _add_job(self, handle: CloudVmRayResourceHandle,
                  job_name: Optional[str], resources_str: str) -> int:
-
-
-
+        code = job_lib.JobLibCodeGen.add_job(
+            job_name=job_name,
+            username=common_utils.get_user_hash(),
+            run_timestamp=self.run_timestamp,
+            resources_str=resources_str)
         returncode, job_id_str, stderr = self.run_on_head(handle,
                                                           code,
                                                           stream_logs=False,
                                                           require_outputs=True,
                                                           separate_stderr=True)
+        # Happens when someone calls `sky exec` but remote is outdated for
+        # adding a job. Necessitating calling `sky launch`.
+        backend_utils.check_stale_runtime_on_remote(returncode, stderr,
+                                                    handle.cluster_name)
         # TODO(zhwu): this sometimes will unexpectedly fail, we can add
         # retry for this, after we figure out the reason.
         subprocess_utils.handle_returncode(returncode, code,
@@ -3398,15 +3549,31 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         Returns:
             Job id if the task is submitted to the cluster, None otherwise.
         """
-        if task.run is None:
+        if task.run is None and self._setup_cmd is None:
+            # This message is fine without mentioning setup, as there are two
+            # cases when run section is empty:
+            # 1. setup specified: setup is executed in detached mode and this
+            #    message will not be shown.
+            # 2. no setup specified: this message is fine as a user is likely
+            #    creating a cluster only, and ok with the empty run command.
             logger.info('Run commands not specified or empty.')
             return None
-
-
-
-
-
+        if task.run is None:
+            # If the task has no run command, we still need to execute the
+            # generated ray driver program to run the setup command in detached
+            # mode.
+            # In this case, we reset the resources for the task, so that the
+            # detached setup does not need to wait for the task resources to be
+            # ready (which is not used for setup anyway).
+            valid_resource = sky.Resources()
+        else:
+            # Check the task resources vs the cluster resources. Since
+            # `sky exec` will not run the provision and _check_existing_cluster
+            # We need to check ports here since sky.exec shouldn't change
+            # resources.
+            valid_resource = self.check_resources_fit_cluster(handle,
+                                                              task,
+                                                              check_ports=True)
         task_copy = copy.copy(task)
         # Handle multiple resources exec case.
         task_copy.set_resources(valid_resource)
@@ -3434,30 +3601,9 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
 
     def _post_execute(self, handle: CloudVmRayResourceHandle,
                       down: bool) -> None:
-
-
-
-        controller = controller_utils.Controllers.from_name(name)
-        if controller is not None or down:
-            return
-        stop_str = ('\nTo stop the cluster:'
-                    f'\t{backend_utils.BOLD}sky stop {name}'
-                    f'{backend_utils.RESET_BOLD}')
-        logger.info(f'\n{fore.CYAN}Cluster name: '
-                    f'{style.BRIGHT}{name}{style.RESET_ALL}'
-                    '\nTo log into the head VM:\t'
-                    f'{backend_utils.BOLD}ssh {name}'
-                    f'{backend_utils.RESET_BOLD}'
-                    '\nTo submit a job:'
-                    f'\t\t{backend_utils.BOLD}sky exec {name} yaml_file'
-                    f'{backend_utils.RESET_BOLD}'
-                    f'{stop_str}'
-                    '\nTo teardown the cluster:'
-                    f'\t{backend_utils.BOLD}sky down {name}'
-                    f'{backend_utils.RESET_BOLD}')
-        if (gcp_utils.is_tpu(handle.launched_resources) and
-                not gcp_utils.is_tpu_vm(handle.launched_resources)):
-            logger.info('Tip: `sky down` will delete launched TPU(s) too.')
+        """Post-execute cleanup."""
+        del handle, down  # Unused.
+        # All logic is handled in previous stages, no-op.
 
     def _teardown_ephemeral_storage(self, task: task_lib.Task) -> None:
         storage_mounts = task.storage_mounts
@@ -3505,33 +3651,47 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                 is_identity_mismatch_and_purge = True
             else:
                 raise
-
         lock_path = os.path.expanduser(
             backend_utils.CLUSTER_STATUS_LOCK_PATH.format(cluster_name))
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        # Retry in case new cluster operation comes in and holds the lock
+        # right after the lock is removed.
+        n_attempts = 2
+        while True:
+            n_attempts -= 1
+            # In case other running cluster operations are still holding the
+            # lock.
+            common_utils.remove_file_if_exists(lock_path)
+            # We have to kill the cluster requests, because `down` and `stop`
+            # should be higher priority than the cluster requests, and we should
+            # release the lock from other requests.
+            exclude_request_to_kill = 'sky.down' if terminate else 'sky.stop'
+            requests_lib.kill_cluster_requests(handle.cluster_name,
+                                               exclude_request_to_kill)
+            try:
+                with filelock.FileLock(
+                        lock_path,
+                        backend_utils.CLUSTER_STATUS_LOCK_TIMEOUT_SECONDS):
+                    self.teardown_no_lock(
+                        handle,
+                        terminate,
+                        purge,
+                        # When --purge is set and we already see an ID mismatch
+                        # error, we skip the refresh codepath. This is because
+                        # refresh checks current user identity can throw
+                        # ClusterOwnerIdentityMismatchError. The argument/flag
+                        # `purge` should bypass such ID mismatch errors.
+                        refresh_cluster_status=(
+                            not is_identity_mismatch_and_purge))
+                if terminate:
+                    common_utils.remove_file_if_exists(lock_path)
+                break
+            except filelock.Timeout as e:
+                logger.debug(f'Failed to acquire lock for {cluster_name}, '
+                             'retrying...')
+                if n_attempts <= 0:
+                    raise RuntimeError(
+                        f'Cluster {cluster_name!r} is locked by {lock_path}. '
+                        'Check to see if it is still being launched') from e
 
     # --- CloudVMRayBackend Specific APIs ---
 
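Note: teardown now pre-empts other cluster operations: it deletes the lock file, kills competing requests, then takes the lock with a timeout, retrying once before giving up. A compact sketch of the acquire-with-retry skeleton, assuming the `filelock` package (which the diff uses):

```python
import filelock

LOCK_TIMEOUT_SECONDS = 20


def teardown_with_lock(lock_path: str, do_teardown) -> None:
    """Acquire `lock_path` (retrying once) and run the teardown callback."""
    n_attempts = 2
    while True:
        n_attempts -= 1
        try:
            with filelock.FileLock(lock_path, timeout=LOCK_TIMEOUT_SECONDS):
                do_teardown()
            break
        except filelock.Timeout as e:
            if n_attempts <= 0:
                raise RuntimeError(
                    f'Could not acquire {lock_path}; another operation '
                    'may still be running.') from e


teardown_with_lock('/tmp/demo-cluster.lock', lambda: print('torn down'))
```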
@@ -3555,24 +3715,13 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
     def cancel_jobs(self,
                     handle: CloudVmRayResourceHandle,
                     jobs: Optional[List[int]],
-                    cancel_all: bool = False
+                    cancel_all: bool = False,
+                    user_hash: Optional[str] = None) -> None:
         """Cancels jobs.
 
-
-
-        Args:
-            handle: The cluster handle.
-            jobs: Job IDs to cancel. (See `cancel_all` for special semantics.)
-            cancel_all: Whether to cancel all jobs. If True, asserts `jobs` is
-                set to None. If False and `jobs` is None, cancel the latest
-                running job.
+        See `skylet.job_lib.cancel_jobs_encoded_results` for more details.
         """
-
-        assert jobs is None, (
-            'If cancel_all=True, usage is to set jobs=None')
-        code = job_lib.JobLibCodeGen.cancel_jobs(jobs, cancel_all)
-
-        # All error messages should have been redirected to stdout.
+        code = job_lib.JobLibCodeGen.cancel_jobs(jobs, cancel_all, user_hash)
         returncode, stdout, _ = self.run_on_head(handle,
                                                  code,
                                                  stream_logs=False,
@@ -3581,13 +3730,12 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             returncode, code,
             f'Failed to cancel jobs on cluster {handle.cluster_name}.', stdout)
 
-        cancelled_ids =
+        cancelled_ids = message_utils.decode_payload(stdout)
         if cancelled_ids:
             logger.info(
                 f'Cancelled job ID(s): {", ".join(map(str, cancelled_ids))}')
         else:
-            logger.info(
-                'No jobs cancelled. They may already be in terminal states.')
+            logger.info('No jobs cancelled. They may be in terminal states.')
 
     def sync_down_logs(
         self,
@@ -3608,7 +3756,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             separate_stderr=True)
         subprocess_utils.handle_returncode(returncode, code,
                                            'Failed to sync logs.', stderr)
-        run_timestamps =
+        run_timestamps = message_utils.decode_payload(run_timestamps)
        if not run_timestamps:
            logger.info(f'{colorama.Fore.YELLOW}'
                        'No matching log directories found'
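Note: `message_utils.decode_payload` extracts a structured result that remotely executed codegen printed to stdout amid other output. A hedged sketch of such a marker-framed payload codec (the marker strings and helper names here are assumptions, not SkyPilot's actual constants):

```python
import json
from typing import Any

_PAYLOAD_PREFIX = '<sky-payload>'
_PAYLOAD_SUFFIX = '</sky-payload>'


def encode_payload(value: Any) -> str:
    """Frame a JSON-serializable value so it survives mixed stdout."""
    return f'{_PAYLOAD_PREFIX}{json.dumps(value)}{_PAYLOAD_SUFFIX}'


def decode_payload(stdout: str) -> Any:
    """Pull the framed value back out of arbitrary surrounding output."""
    start = stdout.index(_PAYLOAD_PREFIX) + len(_PAYLOAD_PREFIX)
    end = stdout.index(_PAYLOAD_SUFFIX, start)
    return json.loads(stdout[start:end])


remote_stdout = 'some log line\n' + encode_payload([1, 7, 9]) + '\nbye\n'
assert decode_payload(remote_stdout) == [1, 7, 9]
```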
@@ -3622,16 +3770,10 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             for run_timestamp in run_timestamps
         ]
         local_log_dirs = [
-            os.path.
+            os.path.join(local_dir, run_timestamp)
             for run_timestamp in run_timestamps
         ]
 
-        style = colorama.Style
-        fore = colorama.Fore
-        for job_id, log_dir in zip(job_ids, local_log_dirs):
-            logger.info(f'{fore.CYAN}Job {job_id} logs: {log_dir}'
-                        f'{style.RESET_ALL}')
-
         runners = handle.get_command_runners()
 
         def _rsync_down(args) -> None:
@@ -3642,10 +3784,13 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             """
             (runner, local_log_dir, remote_log_dir) = args
             try:
-                os.makedirs(local_log_dir, exist_ok=True)
+                os.makedirs(os.path.expanduser(local_log_dir), exist_ok=True)
                 runner.rsync(
-
-
+                    # Require a `/` at the end to make sure the parent dir
+                    # is not created locally. We do not add an additional '*'
+                    # as kubernetes's rsync does not work with an ending '*'.
+                    source=f'{remote_log_dir}/',
+                    target=os.path.expanduser(local_log_dir),
                     up=False,
                     stream_logs=False,
                 )
@@ -3653,7 +3798,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             if e.returncode == exceptions.RSYNC_FILE_NOT_FOUND_CODE:
                 # Raised by rsync_down. Remote log dir may not exist, since
                 # the job can be run on some part of the nodes.
-                logger.debug(f'{runner.
+                logger.debug(f'{runner.node_id} does not have the tasks/*.')
             else:
                 raise
 
@@ -3667,7 +3812,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                   handle: CloudVmRayResourceHandle,
                   job_id: Optional[int],
                   managed_job_id: Optional[int] = None,
-                  follow: bool = True
+                  follow: bool = True,
+                  tail: int = 0) -> int:
         """Tail the logs of a job.
 
         Args:
@@ -3675,10 +3821,13 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             job_id: The job ID to tail the logs of.
             managed_job_id: The managed job ID for display purpose only.
             follow: Whether to follow the logs.
+            tail: The number of lines to display from the end of the
+                log file. If 0, print all lines.
         """
         code = job_lib.JobLibCodeGen.tail_logs(job_id,
                                                managed_job_id=managed_job_id,
-                                               follow=follow
+                                               follow=follow,
+                                               tail=tail)
         if job_id is None and managed_job_id is None:
             logger.info(
                 'Job ID not provided. Streaming the logs of the latest job.')
@@ -3697,10 +3846,6 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             # Allocate a pseudo-terminal to disable output buffering.
             # Otherwise, there may be 5 minutes delay in logging.
             ssh_mode=command_runner.SshMode.INTERACTIVE,
-            # Disable stdin to avoid ray outputs mess up the terminal with
-            # misaligned output in multithreading/multiprocessing.
-            # Refer to: https://github.com/ray-project/ray/blob/d462172be7c5779abf37609aed08af112a533e1e/python/ray/autoscaler/_private/subprocess_output_util.py#L264  # pylint: disable=line-too-long
-            stdin=subprocess.DEVNULL,
         )
     except SystemExit as e:
         returncode = e.code
@@ -3730,52 +3875,169 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             stream_logs=True,
             process_stream=False,
             ssh_mode=command_runner.SshMode.INTERACTIVE,
-            stdin=subprocess.DEVNULL,
         )

-    def
-
-
-
+    def sync_down_managed_job_logs(
+            self,
+            handle: CloudVmRayResourceHandle,
+            job_id: Optional[int] = None,
+            job_name: Optional[str] = None,
+            controller: bool = False,
+            local_dir: str = constants.SKY_LOGS_DIRECTORY) -> Dict[str, str]:
+        """Sync down logs for a managed job.

         Args:
-            handle: The handle to the
-
-
-
-            target is replica.
-            follow: Whether to follow the logs.
-        """
-        if target != serve_lib.ServiceComponent.REPLICA:
-            code = serve_lib.ServeCodeGen.stream_serve_process_logs(
-                service_name,
-                stream_controller=(
-                    target == serve_lib.ServiceComponent.CONTROLLER),
-                follow=follow)
-        else:
-            assert replica_id is not None, service_name
-            code = serve_lib.ServeCodeGen.stream_replica_logs(
-                service_name, replica_id, follow)
+            handle: The handle to the cluster.
+            job_id: The job ID to sync down logs for.
+            job_name: The job name to sync down logs for.
+            controller: Whether to sync down logs for the controller.
+            local_dir: The local directory to sync down logs to.

-
-
+        Returns:
+            A dictionary mapping job_id to log path.
+        """
+        # if job_name and job_id should not both be specified
+        assert job_name is None or job_id is None, (job_name, job_id)

-
+        if job_id is None:
+            # generate code to get the job_id
+            # if job_name is None, get all job_ids
+            # TODO: Only get the latest job_id, since that's the only one we use
+            code = managed_jobs.ManagedJobCodeGen.get_all_job_ids_by_name(
+                job_name=job_name)
+            returncode, job_ids, stderr = self.run_on_head(handle,
+                                                           code,
+                                                           stream_logs=False,
+                                                           require_outputs=True,
+                                                           separate_stderr=True)
+            subprocess_utils.handle_returncode(returncode, code,
+                                               'Failed to sync down logs.',
+                                               stderr)
+            job_ids = message_utils.decode_payload(job_ids)
+            if not job_ids:
+                logger.info(f'{colorama.Fore.YELLOW}'
+                            'No matching job found'
+                            f'{colorama.Style.RESET_ALL}')
+                return {}
+            elif len(job_ids) > 1:
+                name_str = ''
+                if job_name is not None:
+                    name_str = ('Multiple jobs IDs found under the name '
+                                f'{job_name}. ')
+                controller_str = ' (controller)' if controller else ''
+                logger.info(f'{colorama.Fore.YELLOW}'
+                            f'{name_str}'
+                            f'Downloading the latest job logs{controller_str}.'
+                            f'{colorama.Style.RESET_ALL}')
+                # list should aready be in descending order
+                job_id = job_ids[0]
+
+        # get the run_timestamp
+        # the function takes in [job_id]
+        code = job_lib.JobLibCodeGen.get_run_timestamp_with_globbing(
+            [str(job_id)])
+        returncode, run_timestamps, stderr = self.run_on_head(
             handle,
             code,
-            stream_logs=
-
-
-
-
+            stream_logs=False,
+            require_outputs=True,
+            separate_stderr=True)
+        subprocess_utils.handle_returncode(returncode, code,
+                                           'Failed to sync logs.', stderr)
+        # returns with a dict of {job_id: run_timestamp}
+        run_timestamps = message_utils.decode_payload(run_timestamps)
+        if not run_timestamps:
+            logger.info(f'{colorama.Fore.YELLOW}'
+                        'No matching log directories found'
+                        f'{colorama.Style.RESET_ALL}')
+            return {}
+
+        run_timestamp = list(run_timestamps.values())[0]
+        job_id = list(run_timestamps.keys())[0]
+        local_log_dir = ''
+        if controller:  # download controller logs
+            remote_log = os.path.join(managed_jobs.JOBS_CONTROLLER_LOGS_DIR,
+                                      f'{job_id}.log')
+            local_log_dir = os.path.join(local_dir, run_timestamp)
+            os.makedirs(os.path.dirname(os.path.expanduser(local_log_dir)),
+                        exist_ok=True)
+
+            logger.debug(f'{colorama.Fore.CYAN}'
+                         f'Job {job_id} local logs: {local_log_dir}'
+                         f'{colorama.Style.RESET_ALL}')
+
+            runners = handle.get_command_runners()
+
+            def _rsync_down(args) -> None:
+                """Rsync down logs from remote nodes.
+
+                Args:
+                    args: A tuple of (runner, local_log_dir, remote_log_dir)
+                """
+                (runner, local_log_dir, remote_log) = args
+                try:
+                    os.makedirs(os.path.expanduser(local_log_dir),
+                                exist_ok=True)
+                    runner.rsync(
+                        source=remote_log,
+                        target=f'{local_log_dir}/controller.log',
+                        up=False,
+                        stream_logs=False,
+                    )
+                except exceptions.CommandError as e:
+                    if e.returncode == exceptions.RSYNC_FILE_NOT_FOUND_CODE:
+                        # Raised by rsync_down. Remote log dir may not exist
+                        # since the job can be run on some part of the nodes.
+                        logger.debug(
+                            f'{runner.node_id} does not have the tasks/*.')
+                    else:
+                        raise
+
+            parallel_args = [
+                (runner, local_log_dir, remote_log) for runner in runners
+            ]
+            subprocess_utils.run_in_parallel(_rsync_down, parallel_args)
+        else:  # download job logs
+            local_log_dir = os.path.join(local_dir, 'managed_jobs',
+                                         run_timestamp)
+            os.makedirs(os.path.dirname(os.path.expanduser(local_log_dir)),
+                        exist_ok=True)
+            log_file = os.path.join(local_log_dir, 'run.log')
+
+            code = managed_jobs.ManagedJobCodeGen.stream_logs(job_name=None,
+                                                              job_id=job_id,
+                                                              follow=False,
+                                                              controller=False)
+
+            # With the stdin=subprocess.DEVNULL, the ctrl-c will not
+            # kill the process, so we need to handle it manually here.
+            if threading.current_thread() is threading.main_thread():
+                signal.signal(signal.SIGINT, backend_utils.interrupt_handler)
+                signal.signal(signal.SIGTSTP, backend_utils.stop_handler)
+
+            # We redirect the output to the log file
+            # and disable the STDOUT and STDERR
+            self.run_on_head(
+                handle,
+                code,
+                log_path=os.path.expanduser(log_file),
+                stream_logs=False,
+                process_stream=False,
+                ssh_mode=command_runner.SshMode.INTERACTIVE,
+            )
+
+        logger.debug(f'{colorama.Fore.CYAN}'
+                     f'Job {job_id} logs: {local_log_dir}'
+                     f'{colorama.Style.RESET_ALL}')
+        return {str(job_id): local_log_dir}

     def teardown_no_lock(self,
                          handle: CloudVmRayResourceHandle,
                          terminate: bool,
                          purge: bool = False,
                          post_teardown_cleanup: bool = True,
-                         refresh_cluster_status: bool = True
+                         refresh_cluster_status: bool = True,
+                         remove_from_db: bool = True) -> None:
         """Teardown the cluster without acquiring the cluster status lock.

         NOTE: This method should not be called without holding the cluster
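Reviewer note: the removed serve-log streaming body is superseded by the new `sync_down_managed_job_logs`, which resolves a job by ID or name, fetches its run timestamp from the head node, then either rsyncs the controller log or streams the job log into `local_dir`, returning a `{job_id: log_path}` mapping. A hedged usage sketch (assumes an already-provisioned `backend` and `handle`; the job name and ID are illustrative):

```python
# Download logs of the latest managed job named 'train'.
log_dirs = backend.sync_down_managed_job_logs(handle, job_name='train')
for job_id, log_dir in log_dirs.items():
    print(f'Managed job {job_id} logs synced to: {log_dir}')

# Or fetch the controller-side log for a specific job id.
ctrl_dirs = backend.sync_down_managed_job_logs(handle,
                                               job_id=42,
                                               controller=True)
```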
@@ -3787,11 +4049,28 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         Raises:
             RuntimeError: If the cluster fails to be terminated/stopped.
         """
+        exclude_request_to_kill = 'sky.down' if terminate else 'sky.stop'
+        # We have to kill the cluster requests again within the lock, because
+        # any pending requests on the same cluster should be cancelled after
+        # the cluster is terminated/stopped. Otherwise, it will be quite
+        # confusing to see the cluster restarted immediately after it is
+        # terminated/stopped, when there is a pending launch request.
+        requests_lib.kill_cluster_requests(handle.cluster_name,
+                                           exclude_request_to_kill)
+        cluster_status_fetched = False
         if refresh_cluster_status:
-
-
-
-
+            try:
+                prev_cluster_status, _ = (
+                    backend_utils.refresh_cluster_status_handle(
+                        handle.cluster_name,
+                        acquire_per_cluster_status_lock=False))
+                cluster_status_fetched = True
+            except exceptions.ClusterStatusFetchingError:
+                logger.warning(
+                    'Failed to fetch cluster status for '
+                    f'{handle.cluster_name!r}. Assuming the cluster is still '
+                    'up.')
+        if not cluster_status_fetched:
             record = global_user_state.get_cluster_from_name(
                 handle.cluster_name)
             prev_cluster_status = record[
@@ -3805,6 +4084,14 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                 f'Cluster {handle.cluster_name!r} is already terminated. '
                 'Skipped.')
             return
+
+        if handle.cluster_yaml is None:
+            logger.warning(f'Cluster {handle.cluster_name!r} has no '
+                           f'provision yaml so it '
+                           'has not been provisioned. Skipped.')
+            global_user_state.remove_cluster(handle.cluster_name,
+                                             terminate=terminate)
+            return
         log_path = os.path.join(os.path.expanduser(self.log_dir),
                                 'teardown.log')
         log_abs_path = os.path.abspath(log_path)
@@ -3843,7 +4130,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):

             try:
                 provisioner.teardown_cluster(repr(cloud),
-
+                                             resources_utils.ClusterName(
                                                  cluster_name,
                                                  cluster_name_on_cloud),
                                              terminate=terminate,
@@ -3859,25 +4146,12 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                 raise

         if post_teardown_cleanup:
-            self.post_teardown_cleanup(handle, terminate, purge
+            self.post_teardown_cleanup(handle, terminate, purge,
+                                       remove_from_db)
            return

-        if
-
-            # autoscaler.
-            resource_group = config['provider']['resource_group']
-            terminate_cmd = f'az group delete -y --name {resource_group}'
-            with rich_utils.safe_status(f'[bold cyan]Terminating '
-                                        f'[green]{cluster_name}'):
-                returncode, stdout, stderr = log_lib.run_with_log(
-                    terminate_cmd,
-                    log_abs_path,
-                    shell=True,
-                    stream_logs=False,
-                    require_outputs=True)
-
-        elif (isinstance(cloud, clouds.IBM) and terminate and
-              prev_cluster_status == status_lib.ClusterStatus.STOPPED):
+        if (isinstance(cloud, clouds.IBM) and terminate and
+                prev_cluster_status == status_lib.ClusterStatus.STOPPED):
             # pylint: disable= W0622 W0703 C0415
             from sky.adaptors import ibm
             from sky.skylet.providers.ibm.vpc_provider import IBMVPCProvider
@@ -3895,7 +4169,6 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                 limit=1000).get_result()['items']
             vpc_id = None
             try:
-                # pylint: disable=line-too-long
                 vpc_id = vpcs_filtered_by_tags_and_region[0]['crn'].rsplit(
                     ':', 1)[-1]
                 vpc_found = True
@@ -3904,7 +4177,6 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                 returncode = -1

             if vpc_found:
-                # pylint: disable=line-too-long E1136
                 # Delete VPC and it's associated resources
                 vpc_provider = IBMVPCProvider(
                     config_provider['resource_group_id'], region,
@@ -3936,25 +4208,6 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                 stdout = ''
                 stderr = str(e)

-        # Apr, 2023 by Hysun(hysun.he@oracle.com): Added support for OCI
-        # May, 2023 by Hysun: Allow terminate INIT cluster which may have
-        # some instances provisioning in background but not completed.
-        elif (isinstance(cloud, clouds.OCI) and terminate and
-              prev_cluster_status in (status_lib.ClusterStatus.STOPPED,
-                                      status_lib.ClusterStatus.INIT)):
-            region = config['provider']['region']
-
-            # pylint: disable=import-outside-toplevel
-            from ray.autoscaler.tags import TAG_RAY_CLUSTER_NAME
-
-            from sky.skylet.providers.oci.query_helper import oci_query_helper
-
-            # 0: All terminated successfully, failed count otherwise
-            returncode = oci_query_helper.terminate_instances_by_tags(
-                {TAG_RAY_CLUSTER_NAME: cluster_name_on_cloud}, region)
-
-            # To avoid undefined local variables error.
-            stdout = stderr = ''
         else:
             config['provider']['cache_stopped_nodes'] = not terminate
             with tempfile.NamedTemporaryFile('w',
@@ -3965,8 +4218,9 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                 f.flush()

                 teardown_verb = 'Terminating' if terminate else 'Stopping'
-                with rich_utils.safe_status(
-
+                with rich_utils.safe_status(
+                        ux_utils.spinner_message(
+                            f'{teardown_verb}: {cluster_name}', log_path)):
                     # FIXME(zongheng): support retries. This call can fail for
                     # example due to GCP returning list requests per limit
                     # exceeded.
@@ -3995,14 +4249,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         # never launched and the errors are related to pre-launch
         # configurations (such as VPC not found). So it's safe & good UX
         # to not print a failure message.
-        #
-        # '(ResourceGroupNotFound)': this indicates the resource group on
-        # Azure is not found. That means the cluster is already deleted
-        # on the cloud. So it's safe & good UX to not print a failure
-        # message.
         elif ('TPU must be specified.' not in stderr and
-              'SKYPILOT_ERROR_NO_NODES_LAUNCHED: ' not in stderr
-              '(ResourceGroupNotFound)' not in stderr):
+              'SKYPILOT_ERROR_NO_NODES_LAUNCHED: ' not in stderr):
             raise RuntimeError(
                 _TEARDOWN_FAILURE_MESSAGE.format(
                     extra_reason='',
@@ -4020,7 +4268,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
     def post_teardown_cleanup(self,
                               handle: CloudVmRayResourceHandle,
                               terminate: bool,
-                              purge: bool = False
+                              purge: bool = False,
+                              remove_from_db: bool = True) -> None:
         """Cleanup local configs/caches and delete TPUs after teardown.

         This method will handle the following cleanup steps:
@@ -4028,53 +4277,9 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         * Removing ssh configs for the cluster;
         * Updating the local state of the cluster;
         * Removing the terminated cluster's scripts and ray yaml files.
-
-        Raises:
-            RuntimeError: If it fails to delete the TPU.
         """
-        log_path = os.path.join(os.path.expanduser(self.log_dir),
-                                'teardown.log')
-        log_abs_path = os.path.abspath(log_path)
         cluster_name_on_cloud = handle.cluster_name_on_cloud
-
-        # Backward compatibility for TPU nodes created before #2943. Any TPU
-        # node launched before that PR have the delete script generated (and do
-        # not have the tpu_node config set in its cluster yaml), so we have to
-        # call the deletion script to clean up the TPU node.
-        # For TPU nodes launched after the PR, deletion is done in SkyPilot's
-        # new GCP provisioner API.
-        # TODO (zhwu): Remove this after 0.6.0.
-        if (handle.tpu_delete_script is not None and
-                os.path.exists(handle.tpu_delete_script)):
-            # Only call the deletion script if the cluster config does not
-            # contain TPU node config. Otherwise, the deletion should
-            # already be handled by the new provisioner.
-            config = common_utils.read_yaml(handle.cluster_yaml)
-            tpu_node_config = config['provider'].get('tpu_node')
-            if tpu_node_config is None:
-                with rich_utils.safe_status('[bold cyan]Terminating TPU...'):
-                    tpu_rc, tpu_stdout, tpu_stderr = log_lib.run_with_log(
-                        ['bash', handle.tpu_delete_script],
-                        log_abs_path,
-                        stream_logs=False,
-                        require_outputs=True)
-                if tpu_rc != 0:
-                    if _TPU_NOT_FOUND_ERROR in tpu_stderr:
-                        logger.info('TPU not found. '
-                                    'It should have been deleted already.')
-                    elif purge:
-                        logger.warning(
-                            _TEARDOWN_PURGE_WARNING.format(
-                                reason='stopping/terminating TPU',
-                                details=tpu_stderr))
-                    else:
-                        raise RuntimeError(
-                            _TEARDOWN_FAILURE_MESSAGE.format(
-                                extra_reason='It is caused by TPU failure.',
-                                cluster_name=common_utils.cluster_name_in_hint(
-                                    handle.cluster_name, cluster_name_on_cloud),
-                                stdout=tpu_stdout,
-                                stderr=tpu_stderr))
+        cloud = handle.launched_resources.cloud

         if (terminate and handle.launched_resources.is_image_managed is True):
             # Delete the image when terminating a "cloned" cluster, i.e.,
@@ -4095,56 +4300,100 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                 'remove it manually to avoid image leakage. Details: '
                 f'{common_utils.format_exception(e, use_bracket=True)}')
         if terminate:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+            # This function could be directly called from status refresh,
+            # where we need to cleanup the cluster profile.
+            metadata_utils.remove_cluster_metadata(handle.cluster_name)
+            # The cluster yaml does not exist when skypilot has not found
+            # the right resource to provision the cluster.
+            if handle.cluster_yaml is not None:
+                try:
+                    cloud = handle.launched_resources.cloud
+                    config = common_utils.read_yaml(handle.cluster_yaml)
+                    cloud.check_features_are_supported(
+                        handle.launched_resources,
+                        {clouds.CloudImplementationFeatures.OPEN_PORTS})
+                    provision_lib.cleanup_ports(repr(cloud),
+                                                cluster_name_on_cloud,
+                                                handle.launched_resources.ports,
+                                                config['provider'])
+                    self.remove_cluster_config(handle)
+                except exceptions.NotSupportedError:
+                    pass
+                except exceptions.PortDoesNotExistError:
+                    logger.debug('Ports do not exist. Skipping cleanup.')
+                except Exception as e:  # pylint: disable=broad-except
+                    if purge:
+                        msg = common_utils.format_exception(e, use_bracket=True)
+                        logger.warning(
+                            f'Failed to cleanup ports. Skipping since purge is '
+                            f'set. Details: {msg}')
+                    else:
+                        raise
+
+        sky.utils.cluster_utils.SSHConfigHelper.remove_cluster(
+            handle.cluster_name)
+
+        def _detect_abnormal_non_terminated_nodes(
+                handle: CloudVmRayResourceHandle) -> None:
+            # Confirm that instances have actually transitioned state before
+            # updating the state database. We do this immediately before
+            # removing the state from the database, so that we can guarantee
+            # that this is always called before the state is removed. We
+            # considered running this check as part of
+            # provisioner.teardown_cluster or provision.terminate_instances, but
+            # it would open the door to code paths that successfully call this
+            # function but do not first call teardown_cluster or
+            # terminate_instances. See
+            # https://github.com/skypilot-org/skypilot/pull/4443#discussion_r1872798032
+            attempts = 0
+            while True:
+                config = common_utils.read_yaml(handle.cluster_yaml)
+
+                logger.debug(f'instance statuses attempt {attempts + 1}')
+                node_status_dict = provision_lib.query_instances(
+                    repr(cloud),
+                    cluster_name_on_cloud,
+                    config['provider'],
+                    non_terminated_only=False)
+
+                unexpected_node_state: Optional[Tuple[str, str]] = None
+                for node_id, node_status in node_status_dict.items():
+                    logger.debug(f'{node_id} status: {node_status}')
+                    # FIXME(cooperc): Some clouds (e.g. GCP) do not distinguish
+                    # between "stopping/stopped" and "terminating/terminated",
+                    # so we allow for either status instead of casing on
+                    # `terminate`.
+                    if node_status not in [
+                            None, status_lib.ClusterStatus.STOPPED
+                    ]:
+                        unexpected_node_state = (node_id, node_status)
+                        break
+
+                if unexpected_node_state is None:
+                    break
+
+                attempts += 1
+                if attempts < _TEARDOWN_WAIT_MAX_ATTEMPTS:
+                    time.sleep(_TEARDOWN_WAIT_BETWEEN_ATTEMPS_SECONDS)
                 else:
-
+                    (node_id, node_status) = unexpected_node_state
+                    raise RuntimeError(f'Instance {node_id} in unexpected '
+                                       f'state {node_status}.')

-        #
-        #
-
-
-        backend_utils.SSHConfigHelper.remove_cluster(handle.cluster_name,
-                                                     handle.head_ip,
-                                                     auth_config,
-                                                     handle.docker_user)
+        # If cluster_yaml is None, the cluster should ensured to be terminated,
+        # so we don't need to do the double check.
+        if handle.cluster_yaml is not None:
+            _detect_abnormal_non_terminated_nodes(handle)

-
-
+        if not terminate or remove_from_db:
+            global_user_state.remove_cluster(handle.cluster_name,
+                                             terminate=terminate)

-
-
-
-
-
-        # Backward compatibility for TPU nodes created before #2943.
-        # TODO (zhwu): Remove this after 0.6.0.
-        if handle.tpu_delete_script is not None:
-            assert handle.tpu_create_script is not None
-            common_utils.remove_file_if_exists(handle.tpu_create_script)
-            common_utils.remove_file_if_exists(handle.tpu_delete_script)
-
-        # Clean up generated config
-        # No try-except is needed since Ray will fail to teardown the
-        # cluster if the cluster_yaml is missing.
-        common_utils.remove_file_if_exists(handle.cluster_yaml)
+    def remove_cluster_config(self, handle: CloudVmRayResourceHandle) -> None:
+        """Remove the YAML config of a cluster."""
+        handle.cluster_yaml = None
+        global_user_state.update_cluster_handle(handle.cluster_name, handle)
+        common_utils.remove_file_if_exists(handle.cluster_yaml)

     def set_autostop(self,
                      handle: CloudVmRayResourceHandle,
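Reviewer note: `_detect_abnormal_non_terminated_nodes` polls provider-side instance status until every node reports `None` (gone) or `STOPPED`, with a bounded number of attempts, and only then is the cluster record removed. The retry pattern in isolation (a sketch; `query_statuses`, `MAX_ATTEMPTS`, and `WAIT_SECONDS` are illustrative stand-ins for the diff's helpers and constants):

```python
import time
from typing import Callable, Dict, Optional

MAX_ATTEMPTS = 10   # stand-in for _TEARDOWN_WAIT_MAX_ATTEMPTS
WAIT_SECONDS = 1.0  # stand-in for _TEARDOWN_WAIT_BETWEEN_ATTEMPS_SECONDS


def wait_until_quiescent(
        query_statuses: Callable[[], Dict[str, Optional[str]]]) -> None:
    """Poll until no instance is in an unexpected state, or raise."""
    attempts = 0
    while True:
        bad = [(node, s) for node, s in query_statuses().items()
               if s not in (None, 'STOPPED')]
        if not bad:
            return
        attempts += 1
        if attempts < MAX_ATTEMPTS:
            time.sleep(WAIT_SECONDS)
        else:
            node, state = bad[0]
            raise RuntimeError(
                f'Instance {node} in unexpected state {state}.')
```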
@@ -4154,16 +4403,27 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         # The core.autostop() function should have already checked that the
         # cloud and resources support requested autostop.
         if idle_minutes_to_autostop is not None:
-            # Skip auto-stop for Kubernetes clusters.
-            if (isinstance(handle.launched_resources.cloud,
-
+            # Skip auto-stop for Kubernetes and RunPod clusters.
+            if (isinstance(handle.launched_resources.cloud,
+                           (clouds.Kubernetes, clouds.RunPod)) and not down and
+                    idle_minutes_to_autostop >= 0):
                 # We should hit this code path only for the controllers on
-                # Kubernetes clusters.
-
-                    handle.cluster_name)
-
-
-
+                # Kubernetes and RunPod clusters.
+                controller = controller_utils.Controllers.from_name(
+                    handle.cluster_name)
+                assert (controller is not None), handle.cluster_name
+                if (controller
+                        == controller_utils.Controllers.SKY_SERVE_CONTROLLER and
+                        isinstance(handle.launched_resources.cloud,
+                                   clouds.Kubernetes)):
+                    # For SkyServe controllers on Kubernetes: override autostop
+                    # behavior to force autodown (instead of no-op)
+                    # to avoid dangling controllers.
+                    down = True
+                else:
+                    logger.info('Auto-stop is not supported for Kubernetes '
+                                'and RunPod clusters. Skipping.')
+                    return

         # Check if we're stopping spot
         assert (handle.launched_resources is not None and
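Reviewer note: autostop is now skipped on Kubernetes and RunPod, except that a SkyServe controller on Kubernetes gets its request coerced into autodown so the controller cannot dangle. The decision condensed into a sketch (illustrative; `is_sky_serve_controller` stands in for the `controller_utils.Controllers.from_name` lookup above):

```python
from typing import Tuple


def effective_autostop(cloud: str, down: bool, idle_minutes: int,
                       is_sky_serve_controller: bool) -> Tuple[bool, bool]:
    """Return (apply, down): set autostop at all, and whether as autodown."""
    if cloud in ('kubernetes', 'runpod') and not down and idle_minutes >= 0:
        if is_sky_serve_controller and cloud == 'kubernetes':
            # Force autodown to avoid a dangling SkyServe controller.
            return True, True
        # Plain autostop is unsupported on these clouds: skip.
        return False, down
    return True, down
```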
@@ -4182,6 +4442,13 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             global_user_state.set_cluster_autostop_value(
                 handle.cluster_name, idle_minutes_to_autostop, down)

+        # Add/Remove autodown annotations to/from Kubernetes pods.
+        if isinstance(handle.launched_resources.cloud, clouds.Kubernetes):
+            kubernetes_utils.set_autodown_annotations(
+                handle=handle,
+                idle_minutes_to_autostop=idle_minutes_to_autostop,
+                down=down)
+
     def is_definitely_autostopping(self,
                                    handle: CloudVmRayResourceHandle,
                                    stream_logs: bool = True) -> bool:
@@ -4203,7 +4470,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                                                  stream_logs=stream_logs)

         if returncode == 0:
-            return
+            return message_utils.decode_payload(stdout)
         logger.debug('Failed to check if cluster is autostopping with '
                      f'{returncode}: {stdout+stderr}\n'
                      f'Command: {code}')
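Reviewer note: `is_definitely_autostopping` now returns the remote answer by decoding a payload embedded in stdout instead of returning nothing on success. A sketch of a marker-based encode/decode round trip (illustrative; the real `message_utils` wire format may differ):

```python
import json

_PAYLOAD_PREFIX = '<sky-payload>'
_PAYLOAD_SUFFIX = '</sky-payload>'


def encode_payload(obj) -> str:
    """Wrap a JSON value in markers so it survives interleaved stdout."""
    return f'{_PAYLOAD_PREFIX}{json.dumps(obj)}{_PAYLOAD_SUFFIX}'


def decode_payload(stdout: str):
    """Extract and parse the marked JSON value from raw stdout."""
    start = stdout.index(_PAYLOAD_PREFIX) + len(_PAYLOAD_PREFIX)
    end = stdout.index(_PAYLOAD_SUFFIX, start)
    return json.loads(stdout[start:end])


assert decode_payload('ssh noise\n' + encode_payload(True)) is True
```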
@@ -4333,6 +4600,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         # cluster is terminated (through console or auto-dwon), the record will
         # become None and the cluster_ever_up should be considered as False.
         cluster_ever_up = record is not None and record['cluster_ever_up']
+        prev_config_hash = record['config_hash'] if record is not None else None
         logger.debug(f'cluster_ever_up: {cluster_ever_up}')
         logger.debug(f'record: {record}')
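Reviewer note: the cluster record now carries a `config_hash`, read here as `prev_config_hash` and threaded into `ToProvisionConfig` in the hunks below, so a relaunch can tell whether the rendered cluster config changed. A generic sketch of such a digest (illustrative only; the diff does not show how the hash is actually computed):

```python
import hashlib


def config_hash(cluster_yaml_text: str) -> str:
    """Stable digest of a rendered cluster config, for change detection."""
    return hashlib.sha256(cluster_yaml_text.encode('utf-8')).hexdigest()
```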
@@ -4345,12 +4613,24 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             # Assume resources share the same ports.
             for resource in task.resources:
                 assert resource.ports == list(task.resources)[0].ports
-
-
-
-
+            requested_ports_set = resources_utils.port_ranges_to_set(
+                list(task.resources)[0].ports)
+            current_ports_set = resources_utils.port_ranges_to_set(
+                handle.launched_resources.ports)
+            all_ports = resources_utils.port_set_to_ranges(current_ports_set |
+                                                           requested_ports_set)
             to_provision = handle.launched_resources
+            if (to_provision.cloud.OPEN_PORTS_VERSION <=
+                    clouds.OpenPortsVersion.LAUNCH_ONLY):
+                if not requested_ports_set <= current_ports_set:
+                    current_cloud = to_provision.cloud
+                    with ux_utils.print_exception_no_traceback():
+                        raise exceptions.NotSupportedError(
+                            'Failed to open new ports on an existing cluster '
+                            f'with the current cloud {current_cloud} as it only'
+                            ' supports opening ports on launch of the cluster. '
+                            'Please terminate the existing cluster and launch '
+                            'a new cluster with the desired ports open.')
             if all_ports:
                 to_provision = to_provision.copy(ports=all_ports)
             return RetryingVmProvisioner.ToProvisionConfig(
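Reviewer note: on relaunch, requested ports are merged with the already-open ports via set union, and clouds whose `OPEN_PORTS_VERSION` is launch-only reject any genuinely new port. A self-contained sketch of the range/set round trip (hypothetical helpers mirroring how `resources_utils.port_ranges_to_set` / `port_set_to_ranges` are used above):

```python
from typing import List, Set


def port_ranges_to_set(ranges: List[str]) -> Set[int]:
    """Expand ['80', '8000-8002'] into {80, 8000, 8001, 8002}."""
    ports: Set[int] = set()
    for r in ranges:
        if '-' in r:
            lo, hi = map(int, r.split('-'))
            ports.update(range(lo, hi + 1))
        else:
            ports.add(int(r))
    return ports


def port_set_to_ranges(ports: Set[int]) -> List[str]:
    """Collapse a port set back into sorted, compact range strings."""
    ranges: List[str] = []
    run: List[int] = []
    for p in sorted(ports):
        if run and p == run[-1] + 1:
            run.append(p)
        else:
            if run:
                ranges.append(
                    str(run[0]) if len(run) == 1 else f'{run[0]}-{run[-1]}')
            run = [p]
    if run:
        ranges.append(str(run[0]) if len(run) == 1 else f'{run[0]}-{run[-1]}')
    return ranges


# Union as in the hunk above: already-open ports | newly requested ports.
all_ports = port_set_to_ranges(port_ranges_to_set(['80', '443']) |
                               port_ranges_to_set(['443', '8000-8001']))
assert all_ports == ['80', '443', '8000-8001']
```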
@@ -4359,7 +4639,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                 handle.launched_nodes,
                 prev_cluster_status=prev_cluster_status,
                 prev_handle=handle,
-                prev_cluster_ever_up=cluster_ever_up
+                prev_cluster_ever_up=cluster_ever_up,
+                prev_config_hash=prev_config_hash)
         usage_lib.messages.usage.set_new_cluster()
         # Use the task_cloud, because the cloud in `to_provision` can be changed
         # later during the retry.
@@ -4394,20 +4675,14 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             to_provision = handle_before_refresh.launched_resources
             self.check_resources_fit_cluster(handle_before_refresh, task)

-        logger.info(
-            f'{colorama.Fore.CYAN}Creating a new cluster: {cluster_name!r} '
-            f'[{task.num_nodes}x {to_provision}].'
-            f'{colorama.Style.RESET_ALL}\n'
-            'Tip: to reuse an existing cluster, '
-            'specify --cluster (-c). '
-            'Run `sky status` to see existing clusters.')
         return RetryingVmProvisioner.ToProvisionConfig(
             cluster_name,
             to_provision,
             task.num_nodes,
             prev_cluster_status=None,
             prev_handle=None,
-            prev_cluster_ever_up=False
+            prev_cluster_ever_up=False,
+            prev_config_hash=prev_config_hash)

     def _execute_file_mounts(self, handle: CloudVmRayResourceHandle,
                              file_mounts: Optional[Dict[Path, Path]]):
@@ -4423,34 +4698,36 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         symlink_commands = []
         fore = colorama.Fore
         style = colorama.Style
-        logger.info(f'{fore.CYAN}Processing file mounts.{style.RESET_ALL}')
         start = time.time()
         runners = handle.get_command_runners()
         log_path = os.path.join(self.log_dir, 'file_mounts.log')
+        num_threads = subprocess_utils.get_max_workers_for_file_mounts(
+            file_mounts, str(handle.launched_resources.cloud))

         # Check the files and warn
         for dst, src in file_mounts.items():
             if not data_utils.is_cloud_store_url(src):
                 full_src = os.path.abspath(os.path.expanduser(src))
                 # Checked during Task.set_file_mounts().
-                assert os.path.exists(
+                assert os.path.exists(
+                    full_src), f'{full_src} does not exist. {file_mounts}'
                 src_size = backend_utils.path_size_megabytes(full_src)
                 if src_size >= _PATH_SIZE_MEGABYTES_WARN_THRESHOLD:
                     logger.warning(
-                        f'{fore.YELLOW}The size of file mount src {src!r} '
+                        f' {fore.YELLOW}The size of file mount src {src!r} '
                         f'is {src_size} MB. Try to keep src small or use '
-                        '.
+                        '.skyignore to exclude large files, as large sizes '
                         f'will slow down rsync. {style.RESET_ALL}')
                 if os.path.islink(full_src):
                     logger.warning(
-                        f'{fore.YELLOW}Source path {src!r} is a symlink. '
+                        f' {fore.YELLOW}Source path {src!r} is a symlink. '
                         f'Symlink contents are not uploaded.{style.RESET_ALL}')

         os.makedirs(os.path.expanduser(self.log_dir), exist_ok=True)
         os.system(f'touch {log_path}')
-
-
-
+
+        rich_utils.force_update_status(
+            ux_utils.spinner_message('Syncing file mounts', log_path))

         for dst, src in file_mounts.items():
             # TODO: room for improvement. Here there are many moving parts
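Reviewer note: file-mount syncing now derives a per-cloud worker cap (`get_max_workers_for_file_mounts`) and passes it as `num_threads` to each rsync and to `run_in_parallel` in the following hunks. The bounded fan-out pattern in isolation (a standard-library sketch, not SkyPilot's own `subprocess_utils`):

```python
from concurrent.futures import ThreadPoolExecutor
from typing import Callable, Iterable, List


def run_in_parallel(fn: Callable, args_list: Iterable,
                    num_threads: int) -> List:
    """Apply fn to each item with at most num_threads concurrent workers."""
    with ThreadPoolExecutor(max_workers=num_threads) as pool:
        # list() drains the iterator, so worker exceptions re-raise here.
        return list(pool.map(fn, args_list))
```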
@@ -4488,18 +4765,19 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                     action_message='Syncing',
                     log_path=log_path,
                     stream_logs=False,
+                    num_threads=num_threads,
                 )
                 continue

             storage = cloud_stores.get_storage_from_path(src)
             if storage.is_directory(src):
-
-
+                sync_cmd = (storage.make_sync_dir_command(
+                    source=src, destination=wrapped_dst))
                 # It is a directory so make sure it exists.
                 mkdir_for_wrapped_dst = f'mkdir -p {wrapped_dst}'
             else:
-
-
+                sync_cmd = (storage.make_sync_file_command(
+                    source=src, destination=wrapped_dst))
                 # It is a file so make sure *its parent dir* exists.
                 mkdir_for_wrapped_dst = (
                     f'mkdir -p {os.path.dirname(wrapped_dst)}')
@@ -4508,7 +4786,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                 # Ensure sync can write to wrapped_dst (e.g., '/data/').
                 mkdir_for_wrapped_dst,
                 # Both the wrapped and the symlink dir exist; sync.
-
+                sync_cmd,
             ]
             command = ' && '.join(download_target_commands)
             # dst is only used for message printing.
@@ -4524,6 +4802,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                 # Need to source bashrc, as the cloud specific CLI or SDK may
                 # require PATH in bashrc.
                 source_bashrc=True,
+                num_threads=num_threads,
             )
             # (2) Run the commands to create symlinks on all the nodes.
             symlink_command = ' && '.join(symlink_commands)
@@ -4542,9 +4821,11 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                     'Failed to create symlinks. The target destination '
                     f'may already exist. Log: {log_path}')

-        subprocess_utils.run_in_parallel(_symlink_node, runners
+        subprocess_utils.run_in_parallel(_symlink_node, runners,
+                                         num_threads)
         end = time.time()
         logger.debug(f'File mount sync took {end - start} seconds.')
+        logger.info(ux_utils.finishing_message('Synced file_mounts.', log_path))

     def _execute_storage_mounts(
             self, handle: CloudVmRayResourceHandle,
@@ -4568,17 +4849,19 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         # Handle cases when there aren't any Storages with MOUNT mode.
         if not storage_mounts:
             return
-
-        fore = colorama.Fore
-        style = colorama.Style
-        plural = 's' if len(storage_mounts) > 1 else ''
-        logger.info(f'{fore.CYAN}Processing {len(storage_mounts)} '
-                    f'storage mount{plural}.{style.RESET_ALL}')
         start = time.time()
         runners = handle.get_command_runners()
+        num_threads = subprocess_utils.get_parallel_threads(
+            str(handle.launched_resources.cloud))
         log_path = os.path.join(self.log_dir, 'storage_mounts.log')

+        plural = 's' if len(storage_mounts) > 1 else ''
+        rich_utils.force_update_status(
+            ux_utils.spinner_message(
+                f'Mounting {len(storage_mounts)} storage{plural}', log_path))
+
         for dst, storage_obj in storage_mounts.items():
+            storage_obj.construct()
             if not os.path.isabs(dst) and not dst.startswith('~/'):
                 dst = f'{SKY_REMOTE_WORKDIR}/{dst}'
             # Raised when the bucket is externall removed before re-mounting
@@ -4592,6 +4875,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                     'successfully without mounting the bucket.')
             # Get the first store and use it to mount
             store = list(storage_obj.stores.values())[0]
+            assert store is not None, storage_obj
             mount_cmd = store.mount_command(dst)
             src_print = (storage_obj.source
                          if storage_obj.source else storage_obj.name)
@@ -4609,6 +4893,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                     # Need to source bashrc, as the cloud specific CLI or SDK
                     # may require PATH in bashrc.
                     source_bashrc=True,
+                    num_threads=num_threads,
                 )
             except exceptions.CommandError as e:
                 if e.returncode == exceptions.MOUNT_PATH_NON_EMPTY_CODE:
@@ -4631,6 +4916,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):

         end = time.time()
         logger.debug(f'Storage mount sync took {end - start} seconds.')
+        logger.info(ux_utils.finishing_message('Storage mounted.', log_path))

     def _set_storage_mounts_metadata(
             self, cluster_name: str,
@@ -4644,6 +4930,10 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             return
         storage_mounts_metadata = {}
         for dst, storage_obj in storage_mounts.items():
+            if storage_obj.mode != storage_lib.StorageMode.MOUNT:
+                # Skip non-mount storage objects, as there is no need to
+                # reconstruct them during cluster restart.
+                continue
             storage_mounts_metadata[dst] = storage_obj.handle
         lock_path = (
             backend_utils.CLUSTER_FILE_MOUNTS_LOCK_PATH.format(cluster_name))
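Reviewer note: only MOUNT-mode storage objects are persisted into the cluster's storage-mount metadata now; COPY-mode data is already materialized on the nodes, so there is nothing to re-mount after a restart. The filter as a sketch (assumes a `StorageMode` enum shaped like the `storage_lib` one referenced above):

```python
import enum
from typing import Dict


class StorageMode(enum.Enum):
    MOUNT = 'MOUNT'
    COPY = 'COPY'


def metadata_to_persist(storage_mounts: Dict[str, object]) -> Dict[str, object]:
    """Keep only MOUNT-mode handles; COPY-mode needs no reconstruction."""
    return {
        dst: obj.handle
        for dst, obj in storage_mounts.items()
        if obj.mode == StorageMode.MOUNT
    }
```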
@@ -4746,9 +5036,9 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             1,
             resources_dict,
             stable_cluster_internal_ips=internal_ips,
+            env_vars=task_env_vars,
             setup_cmd=self._setup_cmd,
             setup_log_path=os.path.join(log_dir, 'setup.log'),
-            env_vars=task_env_vars,
         )

         if callable(task.run):
@@ -4795,9 +5085,10 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             num_actual_nodes,
             resources_dict,
             stable_cluster_internal_ips=internal_ips,
+            env_vars=task_env_vars,
             setup_cmd=self._setup_cmd,
             setup_log_path=os.path.join(log_dir, 'setup.log'),
-
+        )

         if callable(task.run):
             run_fn_code = textwrap.dedent(inspect.getsource(task.run))
|