skypilot-nightly 1.0.0.dev2024053101__py3-none-any.whl → 1.0.0.dev2025022801__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +64 -32
- sky/adaptors/aws.py +23 -6
- sky/adaptors/azure.py +432 -15
- sky/adaptors/cloudflare.py +5 -5
- sky/adaptors/common.py +19 -9
- sky/adaptors/do.py +20 -0
- sky/adaptors/gcp.py +3 -2
- sky/adaptors/kubernetes.py +122 -88
- sky/adaptors/nebius.py +100 -0
- sky/adaptors/oci.py +39 -1
- sky/adaptors/vast.py +29 -0
- sky/admin_policy.py +101 -0
- sky/authentication.py +117 -98
- sky/backends/backend.py +52 -20
- sky/backends/backend_utils.py +669 -557
- sky/backends/cloud_vm_ray_backend.py +1099 -808
- sky/backends/local_docker_backend.py +14 -8
- sky/backends/wheel_utils.py +38 -20
- sky/benchmark/benchmark_utils.py +22 -23
- sky/check.py +76 -27
- sky/cli.py +1586 -1139
- sky/client/__init__.py +1 -0
- sky/client/cli.py +5683 -0
- sky/client/common.py +345 -0
- sky/client/sdk.py +1765 -0
- sky/cloud_stores.py +283 -19
- sky/clouds/__init__.py +7 -2
- sky/clouds/aws.py +303 -112
- sky/clouds/azure.py +185 -179
- sky/clouds/cloud.py +115 -37
- sky/clouds/cudo.py +29 -22
- sky/clouds/do.py +313 -0
- sky/clouds/fluidstack.py +44 -54
- sky/clouds/gcp.py +206 -65
- sky/clouds/ibm.py +26 -21
- sky/clouds/kubernetes.py +345 -91
- sky/clouds/lambda_cloud.py +40 -29
- sky/clouds/nebius.py +297 -0
- sky/clouds/oci.py +129 -90
- sky/clouds/paperspace.py +22 -18
- sky/clouds/runpod.py +53 -34
- sky/clouds/scp.py +28 -24
- sky/clouds/service_catalog/__init__.py +19 -13
- sky/clouds/service_catalog/aws_catalog.py +29 -12
- sky/clouds/service_catalog/azure_catalog.py +33 -6
- sky/clouds/service_catalog/common.py +95 -75
- sky/clouds/service_catalog/constants.py +3 -3
- sky/clouds/service_catalog/cudo_catalog.py +13 -3
- sky/clouds/service_catalog/data_fetchers/fetch_aws.py +36 -21
- sky/clouds/service_catalog/data_fetchers/fetch_azure.py +31 -4
- sky/clouds/service_catalog/data_fetchers/fetch_cudo.py +8 -117
- sky/clouds/service_catalog/data_fetchers/fetch_fluidstack.py +197 -44
- sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +224 -36
- sky/clouds/service_catalog/data_fetchers/fetch_lambda_cloud.py +44 -24
- sky/clouds/service_catalog/data_fetchers/fetch_vast.py +147 -0
- sky/clouds/service_catalog/data_fetchers/fetch_vsphere.py +1 -1
- sky/clouds/service_catalog/do_catalog.py +111 -0
- sky/clouds/service_catalog/fluidstack_catalog.py +2 -2
- sky/clouds/service_catalog/gcp_catalog.py +16 -2
- sky/clouds/service_catalog/ibm_catalog.py +2 -2
- sky/clouds/service_catalog/kubernetes_catalog.py +192 -70
- sky/clouds/service_catalog/lambda_catalog.py +8 -3
- sky/clouds/service_catalog/nebius_catalog.py +116 -0
- sky/clouds/service_catalog/oci_catalog.py +31 -4
- sky/clouds/service_catalog/paperspace_catalog.py +2 -2
- sky/clouds/service_catalog/runpod_catalog.py +2 -2
- sky/clouds/service_catalog/scp_catalog.py +2 -2
- sky/clouds/service_catalog/vast_catalog.py +104 -0
- sky/clouds/service_catalog/vsphere_catalog.py +2 -2
- sky/clouds/utils/aws_utils.py +65 -0
- sky/clouds/utils/azure_utils.py +91 -0
- sky/clouds/utils/gcp_utils.py +5 -9
- sky/clouds/utils/oci_utils.py +47 -5
- sky/clouds/utils/scp_utils.py +4 -3
- sky/clouds/vast.py +280 -0
- sky/clouds/vsphere.py +22 -18
- sky/core.py +361 -107
- sky/dag.py +41 -28
- sky/data/data_transfer.py +37 -0
- sky/data/data_utils.py +211 -32
- sky/data/mounting_utils.py +182 -30
- sky/data/storage.py +2118 -270
- sky/data/storage_utils.py +126 -5
- sky/exceptions.py +179 -8
- sky/execution.py +158 -85
- sky/global_user_state.py +150 -34
- sky/jobs/__init__.py +12 -10
- sky/jobs/client/__init__.py +0 -0
- sky/jobs/client/sdk.py +302 -0
- sky/jobs/constants.py +49 -11
- sky/jobs/controller.py +161 -99
- sky/jobs/dashboard/dashboard.py +171 -25
- sky/jobs/dashboard/templates/index.html +572 -60
- sky/jobs/recovery_strategy.py +157 -156
- sky/jobs/scheduler.py +307 -0
- sky/jobs/server/__init__.py +1 -0
- sky/jobs/server/core.py +598 -0
- sky/jobs/server/dashboard_utils.py +69 -0
- sky/jobs/server/server.py +190 -0
- sky/jobs/state.py +627 -122
- sky/jobs/utils.py +615 -206
- sky/models.py +27 -0
- sky/optimizer.py +142 -83
- sky/provision/__init__.py +20 -5
- sky/provision/aws/config.py +124 -42
- sky/provision/aws/instance.py +130 -53
- sky/provision/azure/__init__.py +7 -0
- sky/{skylet/providers → provision}/azure/azure-config-template.json +19 -7
- sky/provision/azure/config.py +220 -0
- sky/provision/azure/instance.py +1012 -37
- sky/provision/common.py +31 -3
- sky/provision/constants.py +25 -0
- sky/provision/cudo/__init__.py +2 -1
- sky/provision/cudo/cudo_utils.py +112 -0
- sky/provision/cudo/cudo_wrapper.py +37 -16
- sky/provision/cudo/instance.py +28 -12
- sky/provision/do/__init__.py +11 -0
- sky/provision/do/config.py +14 -0
- sky/provision/do/constants.py +10 -0
- sky/provision/do/instance.py +287 -0
- sky/provision/do/utils.py +301 -0
- sky/provision/docker_utils.py +82 -46
- sky/provision/fluidstack/fluidstack_utils.py +57 -125
- sky/provision/fluidstack/instance.py +15 -43
- sky/provision/gcp/config.py +19 -9
- sky/provision/gcp/constants.py +7 -1
- sky/provision/gcp/instance.py +55 -34
- sky/provision/gcp/instance_utils.py +339 -80
- sky/provision/gcp/mig_utils.py +210 -0
- sky/provision/instance_setup.py +172 -133
- sky/provision/kubernetes/__init__.py +1 -0
- sky/provision/kubernetes/config.py +104 -90
- sky/provision/kubernetes/constants.py +8 -0
- sky/provision/kubernetes/instance.py +680 -325
- sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml +3 -0
- sky/provision/kubernetes/network.py +54 -20
- sky/provision/kubernetes/network_utils.py +70 -21
- sky/provision/kubernetes/utils.py +1370 -251
- sky/provision/lambda_cloud/__init__.py +11 -0
- sky/provision/lambda_cloud/config.py +10 -0
- sky/provision/lambda_cloud/instance.py +265 -0
- sky/{clouds/utils → provision/lambda_cloud}/lambda_utils.py +24 -23
- sky/provision/logging.py +1 -1
- sky/provision/nebius/__init__.py +11 -0
- sky/provision/nebius/config.py +11 -0
- sky/provision/nebius/instance.py +285 -0
- sky/provision/nebius/utils.py +318 -0
- sky/provision/oci/__init__.py +15 -0
- sky/provision/oci/config.py +51 -0
- sky/provision/oci/instance.py +436 -0
- sky/provision/oci/query_utils.py +681 -0
- sky/provision/paperspace/constants.py +6 -0
- sky/provision/paperspace/instance.py +4 -3
- sky/provision/paperspace/utils.py +2 -0
- sky/provision/provisioner.py +207 -130
- sky/provision/runpod/__init__.py +1 -0
- sky/provision/runpod/api/__init__.py +3 -0
- sky/provision/runpod/api/commands.py +119 -0
- sky/provision/runpod/api/pods.py +142 -0
- sky/provision/runpod/instance.py +64 -8
- sky/provision/runpod/utils.py +239 -23
- sky/provision/vast/__init__.py +10 -0
- sky/provision/vast/config.py +11 -0
- sky/provision/vast/instance.py +247 -0
- sky/provision/vast/utils.py +162 -0
- sky/provision/vsphere/common/vim_utils.py +1 -1
- sky/provision/vsphere/instance.py +8 -18
- sky/provision/vsphere/vsphere_utils.py +1 -1
- sky/resources.py +247 -102
- sky/serve/__init__.py +9 -9
- sky/serve/autoscalers.py +361 -299
- sky/serve/client/__init__.py +0 -0
- sky/serve/client/sdk.py +366 -0
- sky/serve/constants.py +12 -3
- sky/serve/controller.py +106 -36
- sky/serve/load_balancer.py +63 -12
- sky/serve/load_balancing_policies.py +84 -2
- sky/serve/replica_managers.py +42 -34
- sky/serve/serve_state.py +62 -32
- sky/serve/serve_utils.py +271 -160
- sky/serve/server/__init__.py +0 -0
- sky/serve/{core.py → server/core.py} +271 -90
- sky/serve/server/server.py +112 -0
- sky/serve/service.py +52 -16
- sky/serve/service_spec.py +95 -32
- sky/server/__init__.py +1 -0
- sky/server/common.py +430 -0
- sky/server/constants.py +21 -0
- sky/server/html/log.html +174 -0
- sky/server/requests/__init__.py +0 -0
- sky/server/requests/executor.py +472 -0
- sky/server/requests/payloads.py +487 -0
- sky/server/requests/queues/__init__.py +0 -0
- sky/server/requests/queues/mp_queue.py +76 -0
- sky/server/requests/requests.py +567 -0
- sky/server/requests/serializers/__init__.py +0 -0
- sky/server/requests/serializers/decoders.py +192 -0
- sky/server/requests/serializers/encoders.py +166 -0
- sky/server/server.py +1106 -0
- sky/server/stream_utils.py +141 -0
- sky/setup_files/MANIFEST.in +2 -5
- sky/setup_files/dependencies.py +159 -0
- sky/setup_files/setup.py +14 -125
- sky/sky_logging.py +59 -14
- sky/skylet/autostop_lib.py +2 -2
- sky/skylet/constants.py +183 -50
- sky/skylet/events.py +22 -10
- sky/skylet/job_lib.py +403 -258
- sky/skylet/log_lib.py +111 -71
- sky/skylet/log_lib.pyi +6 -0
- sky/skylet/providers/command_runner.py +6 -8
- sky/skylet/providers/ibm/node_provider.py +2 -2
- sky/skylet/providers/scp/config.py +11 -3
- sky/skylet/providers/scp/node_provider.py +8 -8
- sky/skylet/skylet.py +3 -1
- sky/skylet/subprocess_daemon.py +69 -17
- sky/skypilot_config.py +119 -57
- sky/task.py +205 -64
- sky/templates/aws-ray.yml.j2 +37 -7
- sky/templates/azure-ray.yml.j2 +27 -82
- sky/templates/cudo-ray.yml.j2 +7 -3
- sky/templates/do-ray.yml.j2 +98 -0
- sky/templates/fluidstack-ray.yml.j2 +7 -4
- sky/templates/gcp-ray.yml.j2 +26 -6
- sky/templates/ibm-ray.yml.j2 +3 -2
- sky/templates/jobs-controller.yaml.j2 +46 -11
- sky/templates/kubernetes-ingress.yml.j2 +7 -0
- sky/templates/kubernetes-loadbalancer.yml.j2 +7 -0
- sky/templates/{kubernetes-port-forward-proxy-command.sh.j2 → kubernetes-port-forward-proxy-command.sh} +51 -7
- sky/templates/kubernetes-ray.yml.j2 +292 -25
- sky/templates/lambda-ray.yml.j2 +30 -40
- sky/templates/nebius-ray.yml.j2 +79 -0
- sky/templates/oci-ray.yml.j2 +18 -57
- sky/templates/paperspace-ray.yml.j2 +10 -6
- sky/templates/runpod-ray.yml.j2 +26 -4
- sky/templates/scp-ray.yml.j2 +3 -2
- sky/templates/sky-serve-controller.yaml.j2 +12 -1
- sky/templates/skypilot-server-kubernetes-proxy.sh +36 -0
- sky/templates/vast-ray.yml.j2 +70 -0
- sky/templates/vsphere-ray.yml.j2 +8 -3
- sky/templates/websocket_proxy.py +64 -0
- sky/usage/constants.py +10 -1
- sky/usage/usage_lib.py +130 -37
- sky/utils/accelerator_registry.py +35 -51
- sky/utils/admin_policy_utils.py +147 -0
- sky/utils/annotations.py +51 -0
- sky/utils/cli_utils/status_utils.py +81 -23
- sky/utils/cluster_utils.py +356 -0
- sky/utils/command_runner.py +452 -89
- sky/utils/command_runner.pyi +77 -3
- sky/utils/common.py +54 -0
- sky/utils/common_utils.py +319 -108
- sky/utils/config_utils.py +204 -0
- sky/utils/control_master_utils.py +48 -0
- sky/utils/controller_utils.py +548 -266
- sky/utils/dag_utils.py +93 -32
- sky/utils/db_utils.py +18 -4
- sky/utils/env_options.py +29 -7
- sky/utils/kubernetes/create_cluster.sh +8 -60
- sky/utils/kubernetes/deploy_remote_cluster.sh +243 -0
- sky/utils/kubernetes/exec_kubeconfig_converter.py +73 -0
- sky/utils/kubernetes/generate_kubeconfig.sh +336 -0
- sky/utils/kubernetes/gpu_labeler.py +4 -4
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +4 -3
- sky/utils/kubernetes/kubernetes_deploy_utils.py +228 -0
- sky/utils/kubernetes/rsync_helper.sh +24 -0
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +1 -1
- sky/utils/log_utils.py +240 -33
- sky/utils/message_utils.py +81 -0
- sky/utils/registry.py +127 -0
- sky/utils/resources_utils.py +94 -22
- sky/utils/rich_utils.py +247 -18
- sky/utils/schemas.py +284 -64
- sky/{status_lib.py → utils/status_lib.py} +12 -7
- sky/utils/subprocess_utils.py +212 -46
- sky/utils/timeline.py +12 -7
- sky/utils/ux_utils.py +168 -15
- skypilot_nightly-1.0.0.dev2025022801.dist-info/METADATA +363 -0
- skypilot_nightly-1.0.0.dev2025022801.dist-info/RECORD +352 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/WHEEL +1 -1
- sky/clouds/cloud_registry.py +0 -31
- sky/jobs/core.py +0 -330
- sky/skylet/providers/azure/__init__.py +0 -2
- sky/skylet/providers/azure/azure-vm-template.json +0 -301
- sky/skylet/providers/azure/config.py +0 -170
- sky/skylet/providers/azure/node_provider.py +0 -466
- sky/skylet/providers/lambda_cloud/__init__.py +0 -2
- sky/skylet/providers/lambda_cloud/node_provider.py +0 -320
- sky/skylet/providers/oci/__init__.py +0 -2
- sky/skylet/providers/oci/node_provider.py +0 -488
- sky/skylet/providers/oci/query_helper.py +0 -383
- sky/skylet/providers/oci/utils.py +0 -21
- sky/utils/cluster_yaml_utils.py +0 -24
- sky/utils/kubernetes/generate_static_kubeconfig.sh +0 -137
- skypilot_nightly-1.0.0.dev2024053101.dist-info/METADATA +0 -315
- skypilot_nightly-1.0.0.dev2024053101.dist-info/RECORD +0 -275
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/top_level.txt +0 -0
sky/backends/backend_utils.py
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
from datetime import datetime
|
3
3
|
import enum
|
4
4
|
import fnmatch
|
5
|
-
import
|
5
|
+
import hashlib
|
6
6
|
import os
|
7
7
|
import pathlib
|
8
8
|
import pprint
|
@@ -11,7 +11,6 @@ import shlex
|
|
11
11
|
import subprocess
|
12
12
|
import sys
|
13
13
|
import tempfile
|
14
|
-
import textwrap
|
15
14
|
import time
|
16
15
|
import typing
|
17
16
|
from typing import Any, Dict, List, Optional, Sequence, Set, Tuple, Union
|
@@ -37,26 +36,27 @@ from sky import global_user_state
|
|
37
36
|
from sky import provision as provision_lib
|
38
37
|
from sky import sky_logging
|
39
38
|
from sky import skypilot_config
|
40
|
-
from sky import status_lib
|
41
|
-
from sky.clouds import cloud_registry
|
42
39
|
from sky.provision import instance_setup
|
43
40
|
from sky.provision.kubernetes import utils as kubernetes_utils
|
44
41
|
from sky.skylet import constants
|
45
42
|
from sky.usage import usage_lib
|
46
|
-
from sky.utils import
|
43
|
+
from sky.utils import cluster_utils
|
47
44
|
from sky.utils import command_runner
|
45
|
+
from sky.utils import common
|
48
46
|
from sky.utils import common_utils
|
49
47
|
from sky.utils import controller_utils
|
50
48
|
from sky.utils import env_options
|
49
|
+
from sky.utils import registry
|
51
50
|
from sky.utils import resources_utils
|
52
51
|
from sky.utils import rich_utils
|
53
52
|
from sky.utils import schemas
|
53
|
+
from sky.utils import status_lib
|
54
54
|
from sky.utils import subprocess_utils
|
55
55
|
from sky.utils import timeline
|
56
56
|
from sky.utils import ux_utils
|
57
57
|
|
58
58
|
if typing.TYPE_CHECKING:
|
59
|
-
from sky import resources
|
59
|
+
from sky import resources as resources_lib
|
60
60
|
from sky import task as task_lib
|
61
61
|
from sky.backends import cloud_vm_ray_backend
|
62
62
|
from sky.backends import local_docker_backend
|
@@ -68,10 +68,6 @@ SKY_REMOTE_APP_DIR = '~/.sky/sky_app'
|
|
68
68
|
# Exclude subnet mask from IP address regex.
|
69
69
|
IP_ADDR_REGEX = r'\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}(?!/\d{1,2})\b'
|
70
70
|
SKY_REMOTE_PATH = '~/.sky/wheels'
|
71
|
-
SKY_USER_FILE_PATH = '~/.sky/generated'
|
72
|
-
|
73
|
-
BOLD = '\033[1m'
|
74
|
-
RESET_BOLD = '\033[0m'
|
75
71
|
|
76
72
|
# Do not use /tmp because it gets cleared on VM restart.
|
77
73
|
_SKY_REMOTE_FILE_MOUNTS_DIR = '~/.sky/file_mounts/'
|
@@ -103,6 +99,10 @@ DEFAULT_TASK_CPU_DEMAND = 0.5
|
|
103
99
|
CLUSTER_STATUS_LOCK_PATH = os.path.expanduser('~/.sky/.{}.lock')
|
104
100
|
CLUSTER_STATUS_LOCK_TIMEOUT_SECONDS = 20
|
105
101
|
|
102
|
+
# Time that must elapse since the last status check before we should re-check if
|
103
|
+
# the cluster has been terminated or autostopped.
|
104
|
+
_CLUSTER_STATUS_CACHE_DURATION_SECONDS = 2
|
105
|
+
|
106
106
|
# Filelocks for updating cluster's file_mounts.
|
107
107
|
CLUSTER_FILE_MOUNTS_LOCK_PATH = os.path.expanduser(
|
108
108
|
'~/.sky/.{}_file_mounts.lock')
|
@@ -114,6 +114,16 @@ _REMOTE_RUNTIME_FILES_DIR = '~/.sky/.runtime_files'
|
|
114
114
|
_ENDPOINTS_RETRY_MESSAGE = ('If the cluster was recently started, '
|
115
115
|
'please retry after a while.')
|
116
116
|
|
117
|
+
# If a cluster is less than LAUNCH_DOUBLE_CHECK_WINDOW seconds old, and we don't
|
118
|
+
# see any instances in the cloud, the instances might be in the proccess of
|
119
|
+
# being created. We will wait LAUNCH_DOUBLE_CHECK_DELAY seconds and then double
|
120
|
+
# check to make sure there are still no instances. LAUNCH_DOUBLE_CHECK_DELAY
|
121
|
+
# should be set longer than the delay between (sending the create instance
|
122
|
+
# request) and (the instances appearing on the cloud).
|
123
|
+
# See https://github.com/skypilot-org/skypilot/issues/4431.
|
124
|
+
_LAUNCH_DOUBLE_CHECK_WINDOW = 60
|
125
|
+
_LAUNCH_DOUBLE_CHECK_DELAY = 1
|
126
|
+
|
117
127
|
# Include the fields that will be used for generating tags that distinguishes
|
118
128
|
# the cluster in ray, to avoid the stopped cluster being discarded due to
|
119
129
|
# updates in the yaml template.
|
@@ -146,6 +156,7 @@ _RAY_YAML_KEYS_TO_RESTORE_EXCEPTIONS = [
|
|
146
156
|
# Clouds with new provisioner has docker_login_config in the
|
147
157
|
# docker field, instead of the provider field.
|
148
158
|
('docker', 'docker_login_config'),
|
159
|
+
('docker', 'run_options'),
|
149
160
|
# Other clouds
|
150
161
|
('provider', 'docker_login_config'),
|
151
162
|
('provider', 'firewall_rule'),
|
@@ -154,8 +165,21 @@ _RAY_YAML_KEYS_TO_RESTORE_EXCEPTIONS = [
|
|
154
165
|
# we need to take this field from the new yaml.
|
155
166
|
('provider', 'tpu_node'),
|
156
167
|
('provider', 'security_group', 'GroupName'),
|
168
|
+
('available_node_types', 'ray.head.default', 'node_config',
|
169
|
+
'IamInstanceProfile'),
|
157
170
|
('available_node_types', 'ray.head.default', 'node_config', 'UserData'),
|
158
|
-
('available_node_types', 'ray.
|
171
|
+
('available_node_types', 'ray.head.default', 'node_config',
|
172
|
+
'azure_arm_parameters', 'cloudInitSetupCommands'),
|
173
|
+
]
|
174
|
+
# These keys are expected to change when provisioning on an existing cluster,
|
175
|
+
# but they don't actually represent a change that requires re-provisioning the
|
176
|
+
# cluster. If the cluster yaml is the same except for these keys, we can safely
|
177
|
+
# skip reprovisioning. See _deterministic_cluster_yaml_hash.
|
178
|
+
_RAY_YAML_KEYS_TO_REMOVE_FOR_HASH = [
|
179
|
+
# On first launch, availability_zones will include all possible zones. Once
|
180
|
+
# the cluster exists, it will only include the zone that the cluster is
|
181
|
+
# actually in.
|
182
|
+
('provider', 'availability_zone'),
|
159
183
|
]
|
160
184
|
|
161
185
|
|
@@ -165,13 +189,17 @@ def is_ip(s: str) -> bool:
|
|
165
189
|
|
166
190
|
|
167
191
|
def _get_yaml_path_from_cluster_name(cluster_name: str,
|
168
|
-
prefix: str = SKY_USER_FILE_PATH
|
192
|
+
prefix: str = constants.SKY_USER_FILE_PATH
|
193
|
+
) -> str:
|
169
194
|
output_path = pathlib.Path(
|
170
195
|
prefix).expanduser().resolve() / f'{cluster_name}.yml'
|
171
196
|
os.makedirs(output_path.parents[0], exist_ok=True)
|
172
197
|
return str(output_path)
|
173
198
|
|
174
199
|
|
200
|
+
# Add retry for the file mounts optimization, as the underlying cp command may
|
201
|
+
# experience transient errors, #4758.
|
202
|
+
@common_utils.retry
|
175
203
|
def _optimize_file_mounts(yaml_path: str) -> None:
|
176
204
|
"""Optimize file mounts in the given ray yaml file.
|
177
205
|
|
@@ -181,6 +209,10 @@ def _optimize_file_mounts(yaml_path: str) -> None:
|
|
181
209
|
- wheel
|
182
210
|
- credentials
|
183
211
|
Format is {dst: src}.
|
212
|
+
|
213
|
+
Raises:
|
214
|
+
subprocess.CalledProcessError: If the file mounts are failed to be
|
215
|
+
copied.
|
184
216
|
"""
|
185
217
|
yaml_config = common_utils.read_yaml(yaml_path)
|
186
218
|
|
@@ -276,18 +308,22 @@ def path_size_megabytes(path: str) -> int:
|
|
276
308
|
If successful: the size of 'path' in megabytes, rounded down. Otherwise,
|
277
309
|
-1.
|
278
310
|
"""
|
279
|
-
resolved_path = pathlib.Path(path).expanduser().resolve()
|
280
311
|
git_exclude_filter = ''
|
281
|
-
|
282
|
-
|
283
|
-
|
284
|
-
|
285
|
-
|
286
|
-
|
287
|
-
|
288
|
-
|
312
|
+
resolved_path = pathlib.Path(path).expanduser().resolve()
|
313
|
+
if (resolved_path / constants.SKY_IGNORE_FILE).exists():
|
314
|
+
rsync_filter = command_runner.RSYNC_FILTER_SKYIGNORE
|
315
|
+
else:
|
316
|
+
rsync_filter = command_runner.RSYNC_FILTER_GITIGNORE
|
317
|
+
if (resolved_path / command_runner.GIT_EXCLUDE).exists():
|
318
|
+
# Ensure file exists; otherwise, rsync will error out.
|
319
|
+
#
|
320
|
+
# We shlex.quote() because the path may contain spaces:
|
321
|
+
# 'my dir/.git/info/exclude'
|
322
|
+
# Without quoting rsync fails.
|
323
|
+
git_exclude_filter = command_runner.RSYNC_EXCLUDE_OPTION.format(
|
324
|
+
shlex.quote(str(resolved_path / command_runner.GIT_EXCLUDE)))
|
289
325
|
rsync_command = (f'rsync {command_runner.RSYNC_DISPLAY_OPTION} '
|
290
|
-
f'{
|
326
|
+
f'{rsync_filter} '
|
291
327
|
f'{git_exclude_filter} --dry-run {path!r}')
|
292
328
|
rsync_output = ''
|
293
329
|
try:
|
@@ -391,304 +427,6 @@ class FileMountHelper(object):
|
|
391
427
|
return ' && '.join(commands)
|
392
428
|
|
393
429
|
|
394
|
-
class SSHConfigHelper(object):
|
395
|
-
"""Helper for handling local SSH configuration."""
|
396
|
-
|
397
|
-
ssh_conf_path = '~/.ssh/config'
|
398
|
-
ssh_conf_lock_path = os.path.expanduser('~/.sky/ssh_config.lock')
|
399
|
-
ssh_cluster_path = SKY_USER_FILE_PATH + '/ssh/{}'
|
400
|
-
|
401
|
-
@classmethod
|
402
|
-
def _get_generated_config(cls, autogen_comment: str, host_name: str,
|
403
|
-
ip: str, username: str, ssh_key_path: str,
|
404
|
-
proxy_command: Optional[str], port: int,
|
405
|
-
docker_proxy_command: Optional[str]):
|
406
|
-
if proxy_command is not None:
|
407
|
-
# Already checked in resources
|
408
|
-
assert docker_proxy_command is None, (
|
409
|
-
'Cannot specify both proxy_command and docker_proxy_command.')
|
410
|
-
proxy = f'ProxyCommand {proxy_command}'
|
411
|
-
elif docker_proxy_command is not None:
|
412
|
-
proxy = f'ProxyCommand {docker_proxy_command}'
|
413
|
-
else:
|
414
|
-
proxy = ''
|
415
|
-
# StrictHostKeyChecking=no skips the host key check for the first
|
416
|
-
# time. UserKnownHostsFile=/dev/null and GlobalKnownHostsFile/dev/null
|
417
|
-
# prevent the host key from being added to the known_hosts file and
|
418
|
-
# always return an empty file for known hosts, making the ssh think
|
419
|
-
# this is a first-time connection, and thus skipping the host key
|
420
|
-
# check.
|
421
|
-
codegen = textwrap.dedent(f"""\
|
422
|
-
{autogen_comment}
|
423
|
-
Host {host_name}
|
424
|
-
HostName {ip}
|
425
|
-
User {username}
|
426
|
-
IdentityFile {ssh_key_path}
|
427
|
-
IdentitiesOnly yes
|
428
|
-
ForwardAgent yes
|
429
|
-
StrictHostKeyChecking no
|
430
|
-
UserKnownHostsFile=/dev/null
|
431
|
-
GlobalKnownHostsFile=/dev/null
|
432
|
-
Port {port}
|
433
|
-
{proxy}
|
434
|
-
""".rstrip())
|
435
|
-
codegen = codegen + '\n'
|
436
|
-
return codegen
|
437
|
-
|
438
|
-
@classmethod
|
439
|
-
@timeline.FileLockEvent(ssh_conf_lock_path)
|
440
|
-
def add_cluster(
|
441
|
-
cls,
|
442
|
-
cluster_name: str,
|
443
|
-
ips: List[str],
|
444
|
-
auth_config: Dict[str, str],
|
445
|
-
ports: List[int],
|
446
|
-
docker_user: Optional[str] = None,
|
447
|
-
ssh_user: Optional[str] = None,
|
448
|
-
):
|
449
|
-
"""Add authentication information for cluster to local SSH config file.
|
450
|
-
|
451
|
-
If a host with `cluster_name` already exists and the configuration was
|
452
|
-
not added by sky, then `ip` is used to identify the host instead in the
|
453
|
-
file.
|
454
|
-
|
455
|
-
If a host with `cluster_name` already exists and the configuration was
|
456
|
-
added by sky (e.g. a spot instance), then the configuration is
|
457
|
-
overwritten.
|
458
|
-
|
459
|
-
Args:
|
460
|
-
cluster_name: Cluster name (see `sky status`)
|
461
|
-
ips: List of public IP addresses in the cluster. First IP is head
|
462
|
-
node.
|
463
|
-
auth_config: read_yaml(handle.cluster_yaml)['auth']
|
464
|
-
ports: List of port numbers for SSH corresponding to ips
|
465
|
-
docker_user: If not None, use this user to ssh into the docker
|
466
|
-
ssh_user: Override the ssh_user in auth_config
|
467
|
-
"""
|
468
|
-
if ssh_user is None:
|
469
|
-
username = auth_config['ssh_user']
|
470
|
-
else:
|
471
|
-
username = ssh_user
|
472
|
-
if docker_user is not None:
|
473
|
-
username = docker_user
|
474
|
-
key_path = os.path.expanduser(auth_config['ssh_private_key'])
|
475
|
-
sky_autogen_comment = ('# Added by sky (use `sky stop/down '
|
476
|
-
f'{cluster_name}` to remove)')
|
477
|
-
ip = ips[0]
|
478
|
-
if docker_user is not None:
|
479
|
-
ip = 'localhost'
|
480
|
-
|
481
|
-
config_path = os.path.expanduser(cls.ssh_conf_path)
|
482
|
-
|
483
|
-
# For backward compatibility: before #2706, we wrote the config of SkyPilot clusters
|
484
|
-
# directly in ~/.ssh/config. For these clusters, we remove the config in ~/.ssh/config
|
485
|
-
# and write/overwrite the config in ~/.sky/ssh/<cluster_name> instead.
|
486
|
-
cls._remove_stale_cluster_config_for_backward_compatibility(
|
487
|
-
cluster_name, ip, auth_config, docker_user)
|
488
|
-
|
489
|
-
if not os.path.exists(config_path):
|
490
|
-
config = ['\n']
|
491
|
-
with open(config_path,
|
492
|
-
'w',
|
493
|
-
encoding='utf-8',
|
494
|
-
opener=functools.partial(os.open, mode=0o644)) as f:
|
495
|
-
f.writelines(config)
|
496
|
-
|
497
|
-
with open(config_path, 'r', encoding='utf-8') as f:
|
498
|
-
config = f.readlines()
|
499
|
-
|
500
|
-
ssh_dir = cls.ssh_cluster_path.format('')
|
501
|
-
os.makedirs(os.path.expanduser(ssh_dir), exist_ok=True, mode=0o700)
|
502
|
-
|
503
|
-
# Handle Include on top of Config file
|
504
|
-
include_str = f'Include {cls.ssh_cluster_path.format("*")}'
|
505
|
-
found = False
|
506
|
-
for i, line in enumerate(config):
|
507
|
-
config_str = line.strip()
|
508
|
-
if config_str == include_str:
|
509
|
-
found = True
|
510
|
-
break
|
511
|
-
if 'Host' in config_str:
|
512
|
-
break
|
513
|
-
if not found:
|
514
|
-
# Did not find Include string. Insert `Include` lines.
|
515
|
-
with open(config_path, 'w', encoding='utf-8') as f:
|
516
|
-
config.insert(
|
517
|
-
0,
|
518
|
-
f'# Added by SkyPilot for ssh config of all clusters\n{include_str}\n'
|
519
|
-
)
|
520
|
-
f.write(''.join(config).strip())
|
521
|
-
f.write('\n' * 2)
|
522
|
-
|
523
|
-
proxy_command = auth_config.get('ssh_proxy_command', None)
|
524
|
-
|
525
|
-
docker_proxy_command_generator = None
|
526
|
-
if docker_user is not None:
|
527
|
-
docker_proxy_command_generator = lambda ip, port: ' '.join(
|
528
|
-
['ssh'] + command_runner.ssh_options_list(
|
529
|
-
key_path, ssh_control_name=None, port=port) +
|
530
|
-
['-W', '%h:%p', f'{auth_config["ssh_user"]}@{ip}'])
|
531
|
-
|
532
|
-
codegen = ''
|
533
|
-
# Add the nodes to the codegen
|
534
|
-
for i, ip in enumerate(ips):
|
535
|
-
docker_proxy_command = None
|
536
|
-
port = ports[i]
|
537
|
-
if docker_proxy_command_generator is not None:
|
538
|
-
docker_proxy_command = docker_proxy_command_generator(ip, port)
|
539
|
-
ip = 'localhost'
|
540
|
-
port = constants.DEFAULT_DOCKER_PORT
|
541
|
-
node_name = cluster_name if i == 0 else cluster_name + f'-worker{i}'
|
542
|
-
# TODO(romilb): Update port number when k8s supports multinode
|
543
|
-
codegen += cls._get_generated_config(
|
544
|
-
sky_autogen_comment, node_name, ip, username, key_path,
|
545
|
-
proxy_command, port, docker_proxy_command) + '\n'
|
546
|
-
|
547
|
-
cluster_config_path = os.path.expanduser(
|
548
|
-
cls.ssh_cluster_path.format(cluster_name))
|
549
|
-
|
550
|
-
with open(cluster_config_path,
|
551
|
-
'w',
|
552
|
-
encoding='utf-8',
|
553
|
-
opener=functools.partial(os.open, mode=0o644)) as f:
|
554
|
-
f.write(codegen)
|
555
|
-
|
556
|
-
@classmethod
|
557
|
-
def _remove_stale_cluster_config_for_backward_compatibility(
|
558
|
-
cls,
|
559
|
-
cluster_name: str,
|
560
|
-
ip: str,
|
561
|
-
auth_config: Dict[str, str],
|
562
|
-
docker_user: Optional[str] = None,
|
563
|
-
):
|
564
|
-
"""Remove authentication information for cluster from local SSH config.
|
565
|
-
|
566
|
-
If no existing host matching the provided specification is found, then
|
567
|
-
nothing is removed.
|
568
|
-
|
569
|
-
Args:
|
570
|
-
ip: Head node's IP address.
|
571
|
-
auth_config: read_yaml(handle.cluster_yaml)['auth']
|
572
|
-
docker_user: If not None, use this user to ssh into the docker
|
573
|
-
"""
|
574
|
-
username = auth_config['ssh_user']
|
575
|
-
config_path = os.path.expanduser(cls.ssh_conf_path)
|
576
|
-
cluster_config_path = os.path.expanduser(
|
577
|
-
cls.ssh_cluster_path.format(cluster_name))
|
578
|
-
if not os.path.exists(config_path):
|
579
|
-
return
|
580
|
-
|
581
|
-
with open(config_path, 'r', encoding='utf-8') as f:
|
582
|
-
config = f.readlines()
|
583
|
-
|
584
|
-
start_line_idx = None
|
585
|
-
|
586
|
-
# Scan the config for the cluster name.
|
587
|
-
for i, line in enumerate(config):
|
588
|
-
next_line = config[i + 1] if i + 1 < len(config) else ''
|
589
|
-
if docker_user is None:
|
590
|
-
found = (line.strip() == f'HostName {ip}' and
|
591
|
-
next_line.strip() == f'User {username}')
|
592
|
-
else:
|
593
|
-
found = (line.strip() == 'HostName localhost' and
|
594
|
-
next_line.strip() == f'User {docker_user}')
|
595
|
-
if found:
|
596
|
-
# Find the line starting with ProxyCommand and contains the ip
|
597
|
-
found = False
|
598
|
-
for idx in range(i, len(config)):
|
599
|
-
# Stop if we reach an empty line, which means a new host
|
600
|
-
if not config[idx].strip():
|
601
|
-
break
|
602
|
-
if config[idx].strip().startswith('ProxyCommand'):
|
603
|
-
proxy_command_line = config[idx].strip()
|
604
|
-
if proxy_command_line.endswith(f'@{ip}'):
|
605
|
-
found = True
|
606
|
-
break
|
607
|
-
if found:
|
608
|
-
start_line_idx = i - 1
|
609
|
-
break
|
610
|
-
|
611
|
-
if start_line_idx is not None:
|
612
|
-
# Scan for end of previous config.
|
613
|
-
cursor = start_line_idx
|
614
|
-
while cursor > 0 and len(config[cursor].strip()) > 0:
|
615
|
-
cursor -= 1
|
616
|
-
prev_end_line_idx = cursor
|
617
|
-
|
618
|
-
# Scan for end of the cluster config.
|
619
|
-
end_line_idx = None
|
620
|
-
cursor = start_line_idx + 1
|
621
|
-
start_line_idx -= 1 # remove auto-generated comment
|
622
|
-
while cursor < len(config):
|
623
|
-
if config[cursor].strip().startswith(
|
624
|
-
'# ') or config[cursor].strip().startswith('Host '):
|
625
|
-
end_line_idx = cursor
|
626
|
-
break
|
627
|
-
cursor += 1
|
628
|
-
|
629
|
-
# Remove sky-generated config and update the file.
|
630
|
-
config[prev_end_line_idx:end_line_idx] = [
|
631
|
-
'\n'
|
632
|
-
] if end_line_idx is not None else []
|
633
|
-
with open(config_path, 'w', encoding='utf-8') as f:
|
634
|
-
f.write(''.join(config).strip())
|
635
|
-
f.write('\n' * 2)
|
636
|
-
|
637
|
-
# Delete include statement if it exists in the config.
|
638
|
-
sky_autogen_comment = ('# Added by sky (use `sky stop/down '
|
639
|
-
f'{cluster_name}` to remove)')
|
640
|
-
with open(config_path, 'r', encoding='utf-8') as f:
|
641
|
-
config = f.readlines()
|
642
|
-
|
643
|
-
for i, line in enumerate(config):
|
644
|
-
config_str = line.strip()
|
645
|
-
if f'Include {cluster_config_path}' in config_str:
|
646
|
-
with open(config_path, 'w', encoding='utf-8') as f:
|
647
|
-
if i < len(config) - 1 and config[i + 1] == '\n':
|
648
|
-
del config[i + 1]
|
649
|
-
# Delete Include string
|
650
|
-
del config[i]
|
651
|
-
# Delete Sky Autogen Comment
|
652
|
-
if i > 0 and sky_autogen_comment in config[i - 1].strip():
|
653
|
-
del config[i - 1]
|
654
|
-
f.write(''.join(config))
|
655
|
-
break
|
656
|
-
if 'Host' in config_str:
|
657
|
-
break
|
658
|
-
|
659
|
-
@classmethod
|
660
|
-
# TODO: We can remove this after 0.6.0 and have a lock only per cluster.
|
661
|
-
@timeline.FileLockEvent(ssh_conf_lock_path)
|
662
|
-
def remove_cluster(
|
663
|
-
cls,
|
664
|
-
cluster_name: str,
|
665
|
-
ip: str,
|
666
|
-
auth_config: Dict[str, str],
|
667
|
-
docker_user: Optional[str] = None,
|
668
|
-
):
|
669
|
-
"""Remove authentication information for cluster from ~/.sky/ssh/<cluster_name>.
|
670
|
-
|
671
|
-
For backward compatibility also remove the config from ~/.ssh/config if it exists.
|
672
|
-
|
673
|
-
If no existing host matching the provided specification is found, then
|
674
|
-
nothing is removed.
|
675
|
-
|
676
|
-
Args:
|
677
|
-
ip: Head node's IP address.
|
678
|
-
auth_config: read_yaml(handle.cluster_yaml)['auth']
|
679
|
-
docker_user: If not None, use this user to ssh into the docker
|
680
|
-
"""
|
681
|
-
cluster_config_path = os.path.expanduser(
|
682
|
-
cls.ssh_cluster_path.format(cluster_name))
|
683
|
-
common_utils.remove_file_if_exists(cluster_config_path)
|
684
|
-
|
685
|
-
# Ensures backward compatibility: before #2706, we wrote the config of SkyPilot clusters
|
686
|
-
# directly in ~/.ssh/config. For these clusters, we should clean up the config.
|
687
|
-
# TODO: Remove this after 0.6.0
|
688
|
-
cls._remove_stale_cluster_config_for_backward_compatibility(
|
689
|
-
cluster_name, ip, auth_config, docker_user)
|
690
|
-
|
691
|
-
|
692
430
|
def _replace_yaml_dicts(
|
693
431
|
new_yaml: str, old_yaml: str, restore_key_names: Set[str],
|
694
432
|
restore_key_names_exceptions: Sequence[Tuple[str, ...]]) -> str:
|
@@ -742,10 +480,46 @@ def _replace_yaml_dicts(
|
|
742
480
|
return common_utils.dump_yaml_str(new_config)
|
743
481
|
|
744
482
|
|
483
|
+
def get_expirable_clouds(
|
484
|
+
enabled_clouds: Sequence[clouds.Cloud]) -> List[clouds.Cloud]:
|
485
|
+
"""Returns a list of clouds that use local credentials and whose credentials can expire.
|
486
|
+
|
487
|
+
This function checks each cloud in the provided sequence to determine if it uses local credentials
|
488
|
+
and if its credentials can expire. If both conditions are met, the cloud is added to the list of
|
489
|
+
expirable clouds.
|
490
|
+
|
491
|
+
Args:
|
492
|
+
enabled_clouds (Sequence[clouds.Cloud]): A sequence of cloud objects to check.
|
493
|
+
|
494
|
+
Returns:
|
495
|
+
list[clouds.Cloud]: A list of cloud objects that use local credentials and whose credentials can expire.
|
496
|
+
"""
|
497
|
+
expirable_clouds = []
|
498
|
+
local_credentials_value = schemas.RemoteIdentityOptions.LOCAL_CREDENTIALS.value
|
499
|
+
for cloud in enabled_clouds:
|
500
|
+
remote_identities = skypilot_config.get_nested(
|
501
|
+
(str(cloud).lower(), 'remote_identity'), None)
|
502
|
+
if remote_identities is None:
|
503
|
+
remote_identities = schemas.get_default_remote_identity(
|
504
|
+
str(cloud).lower())
|
505
|
+
|
506
|
+
local_credential_expiring = cloud.can_credential_expire()
|
507
|
+
if isinstance(remote_identities, str):
|
508
|
+
if remote_identities == local_credentials_value and local_credential_expiring:
|
509
|
+
expirable_clouds.append(cloud)
|
510
|
+
elif isinstance(remote_identities, list):
|
511
|
+
for profile in remote_identities:
|
512
|
+
if list(profile.values(
|
513
|
+
))[0] == local_credentials_value and local_credential_expiring:
|
514
|
+
expirable_clouds.append(cloud)
|
515
|
+
break
|
516
|
+
return expirable_clouds
|
517
|
+
|
518
|
+
|
745
519
|
# TODO: too many things happening here - leaky abstraction. Refactor.
|
746
520
|
@timeline.event
|
747
521
|
def write_cluster_config(
|
748
|
-
to_provision: '
|
522
|
+
to_provision: 'resources_lib.Resources',
|
749
523
|
num_nodes: int,
|
750
524
|
cluster_config_template: str,
|
751
525
|
cluster_name: str,
|
@@ -757,11 +531,17 @@ def write_cluster_config(
|
|
757
531
|
keep_launch_fields_in_existing_config: bool = True) -> Dict[str, str]:
|
758
532
|
"""Fills in cluster configuration templates and writes them out.
|
759
533
|
|
760
|
-
Returns:
|
761
|
-
|
762
|
-
- 'ray'
|
763
|
-
- '
|
764
|
-
- '
|
534
|
+
Returns:
|
535
|
+
Dict with the following keys:
|
536
|
+
- 'ray': Path to the generated Ray yaml config file
|
537
|
+
- 'cluster_name': Name of the cluster
|
538
|
+
- 'cluster_name_on_cloud': Name of the cluster as it appears in the
|
539
|
+
cloud provider
|
540
|
+
- 'config_hash': Hash of the cluster config and file mounts contents.
|
541
|
+
Can be missing if we unexpectedly failed to calculate the hash for
|
542
|
+
some reason. In that case we will continue without the optimization to
|
543
|
+
skip provisioning.
|
544
|
+
|
765
545
|
Raises:
|
766
546
|
exceptions.ResourcesUnavailableError: if the region/zones requested does
|
767
547
|
not appear in the catalog, or an ssh_proxy_command is specified but
|
@@ -792,35 +572,76 @@ def write_cluster_config(
|
|
792
572
|
# move the check out of this function, i.e. the caller should be responsible
|
793
573
|
# for the validation.
|
794
574
|
# TODO(tian): Move more cloud agnostic vars to resources.py.
|
795
|
-
resources_vars = to_provision.make_deploy_variables(
|
796
|
-
|
575
|
+
resources_vars = to_provision.make_deploy_variables(
|
576
|
+
resources_utils.ClusterName(
|
577
|
+
cluster_name,
|
578
|
+
cluster_name_on_cloud,
|
579
|
+
), region, zones, num_nodes, dryrun)
|
797
580
|
config_dict = {}
|
798
581
|
|
799
582
|
specific_reservations = set(
|
800
583
|
skypilot_config.get_nested(
|
801
584
|
(str(to_provision.cloud).lower(), 'specific_reservations'), set()))
|
802
585
|
|
586
|
+
# Remote identity handling can have 4 cases:
|
587
|
+
# 1. LOCAL_CREDENTIALS (default for most clouds): Upload local credentials
|
588
|
+
# 2. SERVICE_ACCOUNT: SkyPilot creates and manages a service account
|
589
|
+
# 3. Custom service account: Use specified service account
|
590
|
+
# 4. NO_UPLOAD: Do not upload any credentials
|
591
|
+
#
|
592
|
+
# We need to upload credentials only if LOCAL_CREDENTIALS is specified. In
|
593
|
+
# other cases, we exclude the cloud from credential file uploads after
|
594
|
+
# running required checks.
|
803
595
|
assert cluster_name is not None
|
804
|
-
excluded_clouds =
|
805
|
-
|
806
|
-
(str(cloud).lower(), 'remote_identity'),
|
807
|
-
|
808
|
-
if
|
809
|
-
|
596
|
+
excluded_clouds = set()
|
597
|
+
remote_identity_config = skypilot_config.get_nested(
|
598
|
+
(str(cloud).lower(), 'remote_identity'), None)
|
599
|
+
remote_identity = schemas.get_default_remote_identity(str(cloud).lower())
|
600
|
+
if isinstance(remote_identity_config, str):
|
601
|
+
remote_identity = remote_identity_config
|
602
|
+
if isinstance(remote_identity_config, list):
|
603
|
+
# Some clouds (e.g., AWS) support specifying multiple service accounts
|
604
|
+
# chosen based on the cluster name. Do the matching here to pick the
|
605
|
+
# correct one.
|
606
|
+
for profile in remote_identity_config:
|
810
607
|
if fnmatch.fnmatchcase(cluster_name, list(profile.keys())[0]):
|
811
608
|
remote_identity = list(profile.values())[0]
|
812
609
|
break
|
813
610
|
if remote_identity != schemas.RemoteIdentityOptions.LOCAL_CREDENTIALS.value:
|
814
|
-
|
611
|
+
# If LOCAL_CREDENTIALS is not specified, we add the cloud to the
|
612
|
+
# excluded_clouds set, but we must also check if the cloud supports
|
613
|
+
# service accounts.
|
614
|
+
if remote_identity == schemas.RemoteIdentityOptions.NO_UPLOAD.value:
|
615
|
+
# If NO_UPLOAD is specified, fall back to default remote identity
|
616
|
+
# for downstream logic but add it to excluded_clouds to skip
|
617
|
+
# credential file uploads.
|
618
|
+
remote_identity = schemas.get_default_remote_identity(
|
619
|
+
str(cloud).lower())
|
620
|
+
elif not cloud.supports_service_account_on_remote():
|
815
621
|
raise exceptions.InvalidCloudConfigs(
|
816
622
|
'remote_identity: SERVICE_ACCOUNT is specified in '
|
817
623
|
f'{skypilot_config.loaded_config_path!r} for {cloud}, but it '
|
818
624
|
'is not supported by this cloud. Remove the config or set: '
|
819
625
|
'`remote_identity: LOCAL_CREDENTIALS`.')
|
820
|
-
|
626
|
+
if isinstance(cloud, clouds.Kubernetes):
|
627
|
+
if skypilot_config.get_nested(
|
628
|
+
('kubernetes', 'allowed_contexts'), None) is None:
|
629
|
+
excluded_clouds.add(cloud)
|
630
|
+
else:
|
631
|
+
excluded_clouds.add(cloud)
|
632
|
+
|
633
|
+
for cloud_str, cloud_obj in registry.CLOUD_REGISTRY.items():
|
634
|
+
remote_identity_config = skypilot_config.get_nested(
|
635
|
+
(cloud_str.lower(), 'remote_identity'), None)
|
636
|
+
if remote_identity_config:
|
637
|
+
if (remote_identity_config ==
|
638
|
+
schemas.RemoteIdentityOptions.NO_UPLOAD.value):
|
639
|
+
excluded_clouds.add(cloud_obj)
|
640
|
+
|
821
641
|
credentials = sky_check.get_cloud_credential_file_mounts(excluded_clouds)
|
822
642
|
|
823
|
-
|
643
|
+
private_key_path, _ = auth.get_or_generate_keys()
|
644
|
+
auth_config = {'ssh_private_key': private_key_path}
|
824
645
|
region_name = resources_vars.get('region')
|
825
646
|
|
826
647
|
yaml_path = _get_yaml_path_from_cluster_name(cluster_name)
|
@@ -854,11 +675,6 @@ def write_cluster_config(
|
|
854
675
|
|
855
676
|
# User-supplied global instance tags from ~/.sky/config.yaml.
|
856
677
|
labels = skypilot_config.get_nested((str(cloud).lower(), 'labels'), {})
|
857
|
-
# Deprecated: instance_tags have been replaced by labels. For backward
|
858
|
-
# compatibility, we support them and the schema allows them only if
|
859
|
-
# `labels` are not specified. This should be removed after 0.7.0.
|
860
|
-
labels = skypilot_config.get_nested((str(cloud).lower(), 'instance_tags'),
|
861
|
-
labels)
|
862
678
|
# labels is a dict, which is guaranteed by the type check in
|
863
679
|
# schemas.py
|
864
680
|
assert isinstance(labels, dict), labels
|
@@ -873,6 +689,11 @@ def write_cluster_config(
|
|
873
689
|
f'open(os.path.expanduser("{constants.SKY_REMOTE_RAY_PORT_FILE}"), "w", encoding="utf-8"))\''
|
874
690
|
)
|
875
691
|
|
692
|
+
# We disable conda auto-activation if the user has specified a docker image
|
693
|
+
# to use, which is likely to already have a conda environment activated.
|
694
|
+
conda_auto_activate = ('true' if to_provision.extract_docker_image() is None
|
695
|
+
else 'false')
|
696
|
+
|
876
697
|
# Use a tmp file path to avoid incomplete YAML file being re-used in the
|
877
698
|
# future.
|
878
699
|
tmp_yaml_path = yaml_path + '.tmp'
|
@@ -907,16 +728,21 @@ def write_cluster_config(
|
|
907
728
|
'specific_reservations': specific_reservations,
|
908
729
|
|
909
730
|
# Conda setup
|
910
|
-
'conda_installation_commands':
|
911
|
-
constants.CONDA_INSTALLATION_COMMANDS,
|
912
731
|
# We should not use `.format`, as it contains '{}' as the bash
|
913
732
|
# syntax.
|
733
|
+
'conda_installation_commands':
|
734
|
+
constants.CONDA_INSTALLATION_COMMANDS.replace(
|
735
|
+
'{conda_auto_activate}', conda_auto_activate),
|
914
736
|
'ray_skypilot_installation_commands':
|
915
737
|
(constants.RAY_SKYPILOT_INSTALLATION_COMMANDS.replace(
|
916
738
|
'{sky_wheel_hash}',
|
917
739
|
wheel_hash).replace('{cloud}',
|
918
740
|
str(cloud).lower())),
|
919
|
-
|
741
|
+
'skypilot_wheel_installation_commands':
|
742
|
+
constants.SKYPILOT_WHEEL_INSTALLATION_COMMANDS.replace(
|
743
|
+
'{sky_wheel_hash}',
|
744
|
+
wheel_hash).replace('{cloud}',
|
745
|
+
str(cloud).lower()),
|
920
746
|
# Port of Ray (GCS server).
|
921
747
|
# Ray's default port 6379 is conflicted with Redis.
|
922
748
|
'ray_port': constants.SKY_REMOTE_RAY_PORT,
|
@@ -945,7 +771,7 @@ def write_cluster_config(
|
|
945
771
|
'sky_local_path': str(local_wheel_path),
|
946
772
|
# Add yaml file path to the template variables.
|
947
773
|
'sky_ray_yaml_remote_path':
|
948
|
-
|
774
|
+
cluster_utils.SKY_CLUSTER_YAML_REMOTE_PATH,
|
949
775
|
'sky_ray_yaml_local_path': tmp_yaml_path,
|
950
776
|
'sky_version': str(version.parse(sky.__version__)),
|
951
777
|
'sky_wheel_hash': wheel_hash,
|
@@ -955,17 +781,33 @@ def write_cluster_config(
|
|
955
781
|
output_path=tmp_yaml_path)
|
956
782
|
config_dict['cluster_name'] = cluster_name
|
957
783
|
config_dict['ray'] = yaml_path
|
784
|
+
|
785
|
+
# Add kubernetes config fields from ~/.sky/config
|
786
|
+
if isinstance(cloud, clouds.Kubernetes):
|
787
|
+
kubernetes_utils.combine_pod_config_fields(
|
788
|
+
tmp_yaml_path,
|
789
|
+
cluster_config_overrides=to_provision.cluster_config_overrides)
|
790
|
+
kubernetes_utils.combine_metadata_fields(tmp_yaml_path)
|
791
|
+
yaml_obj = common_utils.read_yaml(tmp_yaml_path)
|
792
|
+
pod_config = yaml_obj['available_node_types']['ray_head_default'][
|
793
|
+
'node_config']
|
794
|
+
valid, message = kubernetes_utils.check_pod_config(pod_config)
|
795
|
+
if not valid:
|
796
|
+
raise exceptions.InvalidCloudConfigs(
|
797
|
+
f'Invalid pod_config. Details: {message}')
|
798
|
+
|
958
799
|
if dryrun:
|
959
800
|
# If dryrun, return the unfinished tmp yaml path.
|
960
801
|
config_dict['ray'] = tmp_yaml_path
|
802
|
+
try:
|
803
|
+
config_dict['config_hash'] = _deterministic_cluster_yaml_hash(
|
804
|
+
tmp_yaml_path)
|
805
|
+
except Exception as e: # pylint: disable=broad-except
|
806
|
+
logger.warning(f'Failed to calculate config_hash: {e}')
|
807
|
+
logger.debug('Full exception:', exc_info=e)
|
961
808
|
return config_dict
|
962
809
|
_add_auth_to_cluster_config(cloud, tmp_yaml_path)
|
963
810
|
|
964
|
-
# Add kubernetes config fields from ~/.sky/config
|
965
|
-
if isinstance(cloud, clouds.Kubernetes):
|
966
|
-
kubernetes_utils.combine_pod_config_fields(tmp_yaml_path)
|
967
|
-
kubernetes_utils.combine_metadata_fields(tmp_yaml_path)
|
968
|
-
|
969
811
|
# Restore the old yaml content for backward compatibility.
|
970
812
|
if os.path.exists(yaml_path) and keep_launch_fields_in_existing_config:
|
971
813
|
with open(yaml_path, 'r', encoding='utf-8') as f:
|
@@ -979,7 +821,22 @@ def write_cluster_config(
|
|
979
821
|
with open(tmp_yaml_path, 'w', encoding='utf-8') as f:
|
980
822
|
f.write(restored_yaml_content)
|
981
823
|
|
982
|
-
|
824
|
+
# Read the cluster name from the tmp yaml file, to take the backward
|
825
|
+
# compatbility restortion above into account.
|
826
|
+
# TODO: remove this after 2 minor releases, 0.10.0.
|
827
|
+
yaml_config = common_utils.read_yaml(tmp_yaml_path)
|
828
|
+
config_dict['cluster_name_on_cloud'] = yaml_config['cluster_name']
|
829
|
+
|
830
|
+
# Make sure to do this before we optimize file mounts. Optimization is
|
831
|
+
# non-deterministic, but everything else before this point should be
|
832
|
+
# deterministic.
|
833
|
+
try:
|
834
|
+
config_dict['config_hash'] = _deterministic_cluster_yaml_hash(
|
835
|
+
tmp_yaml_path)
|
836
|
+
except Exception as e: # pylint: disable=broad-except
|
837
|
+
logger.warning('Failed to calculate config_hash: '
|
838
|
+
f'{common_utils.format_exception(e)}')
|
839
|
+
logger.debug('Full exception:', exc_info=e)
|
983
840
|
|
984
841
|
# Optimization: copy the contents of source files in file_mounts to a
|
985
842
|
# special dir, and upload that as the only file_mount instead. Delay
|
@@ -1004,13 +861,20 @@ def _add_auth_to_cluster_config(cloud: clouds.Cloud, cluster_config_file: str):
|
|
1004
861
|
"""
|
1005
862
|
config = common_utils.read_yaml(cluster_config_file)
|
1006
863
|
# Check the availability of the cloud type.
|
1007
|
-
if isinstance(cloud, (
|
1008
|
-
|
864
|
+
if isinstance(cloud, (
|
865
|
+
clouds.AWS,
|
866
|
+
clouds.OCI,
|
867
|
+
clouds.SCP,
|
868
|
+
clouds.Vsphere,
|
869
|
+
clouds.Cudo,
|
870
|
+
clouds.Paperspace,
|
871
|
+
clouds.Azure,
|
872
|
+
clouds.DO,
|
873
|
+
clouds.Nebius,
|
874
|
+
)):
|
1009
875
|
config = auth.configure_ssh_info(config)
|
1010
876
|
elif isinstance(cloud, clouds.GCP):
|
1011
877
|
config = auth.setup_gcp_authentication(config)
|
1012
|
-
elif isinstance(cloud, clouds.Azure):
|
1013
|
-
config = auth.setup_azure_authentication(config)
|
1014
878
|
elif isinstance(cloud, clouds.Lambda):
|
1015
879
|
config = auth.setup_lambda_authentication(config)
|
1016
880
|
elif isinstance(cloud, clouds.Kubernetes):
|
@@ -1019,6 +883,8 @@ def _add_auth_to_cluster_config(cloud: clouds.Cloud, cluster_config_file: str):
|
|
1019
883
|
config = auth.setup_ibm_authentication(config)
|
1020
884
|
elif isinstance(cloud, clouds.RunPod):
|
1021
885
|
config = auth.setup_runpod_authentication(config)
|
886
|
+
elif isinstance(cloud, clouds.Vast):
|
887
|
+
config = auth.setup_vast_authentication(config)
|
1022
888
|
elif isinstance(cloud, clouds.Fluidstack):
|
1023
889
|
config = auth.setup_fluidstack_authentication(config)
|
1024
890
|
else:
|
@@ -1026,10 +892,6 @@ def _add_auth_to_cluster_config(cloud: clouds.Cloud, cluster_config_file: str):
|
|
1026
892
|
common_utils.dump_yaml(cluster_config_file, config)
|
1027
893
|
|
1028
894
|
|
1029
|
-
def get_run_timestamp() -> str:
|
1030
|
-
return 'sky-' + datetime.now().strftime('%Y-%m-%d-%H-%M-%S-%f')
|
1031
|
-
|
1032
|
-
|
1033
895
|
def get_timestamp_from_run_timestamp(run_timestamp: str) -> float:
|
1034
896
|
return datetime.strptime(
|
1035
897
|
run_timestamp.partition('-')[2], '%Y-%m-%d-%H-%M-%S-%f').timestamp()
|
@@ -1084,6 +946,135 @@ def _count_healthy_nodes_from_ray(output: str,
|
|
1084
946
|
return ready_head, ready_workers
|
1085
947
|
|
1086
948
|
|
949
|
+
@timeline.event
|
950
|
+
def _deterministic_cluster_yaml_hash(yaml_path: str) -> str:
|
951
|
+
"""Hash the cluster yaml and contents of file mounts to a unique string.
|
952
|
+
|
953
|
+
Two invocations of this function should return the same string if and only
|
954
|
+
if the contents of the yaml are the same and the file contents of all the
|
955
|
+
file_mounts specified in the yaml are the same.
|
956
|
+
|
957
|
+
Limitations:
|
958
|
+
- This function can be expensive if the file mounts are large. (E.g. a few
|
959
|
+
seconds for ~1GB.) This should be okay since we expect that the
|
960
|
+
file_mounts in the cluster yaml (the wheel and cloud credentials) will be
|
961
|
+
small.
|
962
|
+
- Symbolic links are not explicitly handled. Some symbolic link changes may
|
963
|
+
not be detected.
|
964
|
+
|
965
|
+
Implementation: We create a byte sequence that captures the state of the
|
966
|
+
yaml file and all the files in the file mounts, then hash the byte sequence.
|
967
|
+
|
968
|
+
The format of the byte sequence is:
|
969
|
+
32 bytes - sha256 hash of the yaml
|
970
|
+
for each file mount:
|
971
|
+
file mount remote destination (UTF-8), \0
|
972
|
+
if the file mount source is a file:
|
973
|
+
'file' encoded to UTF-8
|
974
|
+
32 byte sha256 hash of the file contents
|
975
|
+
if the file mount source is a directory:
|
976
|
+
'dir' encoded to UTF-8
|
977
|
+
for each directory and subdirectory withinin the file mount (starting from
|
978
|
+
the root and descending recursively):
|
979
|
+
name of the directory (UTF-8), \0
|
980
|
+
name of each subdirectory within the directory (UTF-8) terminated by \0
|
981
|
+
\0
|
982
|
+
for each file in the directory:
|
983
|
+
name of the file (UTF-8), \0
|
984
|
+
32 bytes - sha256 hash of the file contents
|
985
|
+
\0
|
986
|
+
if the file mount source is something else or does not exist, nothing
|
987
|
+
\0\0
|
988
|
+
|
989
|
+
Rather than constructing the whole byte sequence, which may be quite large,
|
990
|
+
we construct it incrementally by using hash.update() to add new bytes.
|
991
|
+
"""
|
992
|
+
|
993
|
+
# Load the yaml contents so that we can directly remove keys.
|
994
|
+
yaml_config = common_utils.read_yaml(yaml_path)
|
995
|
+
for key_list in _RAY_YAML_KEYS_TO_REMOVE_FOR_HASH:
|
996
|
+
dict_to_remove_from = yaml_config
|
997
|
+
found_key = True
|
998
|
+
for key in key_list[:-1]:
|
999
|
+
if (not isinstance(dict_to_remove_from, dict) or
|
1000
|
+
key not in dict_to_remove_from):
|
1001
|
+
found_key = False
|
1002
|
+
break
|
1003
|
+
dict_to_remove_from = dict_to_remove_from[key]
|
1004
|
+
if found_key and key_list[-1] in dict_to_remove_from:
|
1005
|
+
dict_to_remove_from.pop(key_list[-1])
|
1006
|
+
|
1007
|
+
def _hash_file(path: str) -> bytes:
|
1008
|
+
return common_utils.hash_file(path, 'sha256').digest()
|
1009
|
+
|
1010
|
+
config_hash = hashlib.sha256()
|
1011
|
+
|
1012
|
+
yaml_hash = hashlib.sha256(
|
1013
|
+
common_utils.dump_yaml_str(yaml_config).encode('utf-8'))
|
1014
|
+
config_hash.update(yaml_hash.digest())
|
1015
|
+
|
1016
|
+
file_mounts = yaml_config.get('file_mounts', {})
|
1017
|
+
# Remove the file mounts added by the newline.
|
1018
|
+
if '' in file_mounts:
|
1019
|
+
assert file_mounts[''] == '', file_mounts['']
|
1020
|
+
file_mounts.pop('')
|
1021
|
+
|
1022
|
+
for dst, src in sorted(file_mounts.items()):
|
1023
|
+
if src == yaml_path:
|
1024
|
+
# Skip the yaml file itself. We have already hashed a modified
|
1025
|
+
# version of it. The file may include fields we don't want to hash.
|
1026
|
+
continue
|
1027
|
+
|
1028
|
+
expanded_src = os.path.expanduser(src)
|
1029
|
+
config_hash.update(dst.encode('utf-8') + b'\0')
|
1030
|
+
|
1031
|
+
# If the file mount source is a symlink, this should be true. In that
|
1032
|
+
# case we hash the contents of the symlink destination.
|
1033
|
+
if os.path.isfile(expanded_src):
|
1034
|
+
config_hash.update('file'.encode('utf-8'))
|
1035
|
+
config_hash.update(_hash_file(expanded_src))
|
1036
|
+
|
1037
|
+
# This can also be a symlink to a directory. os.walk will treat it as a
|
1038
|
+
# normal directory and list the contents of the symlink destination.
|
1039
|
+
elif os.path.isdir(expanded_src):
|
1040
|
+
config_hash.update('dir'.encode('utf-8'))
|
1041
|
+
|
1042
|
+
# Aside from expanded_src, os.walk will list symlinks to directories
|
1043
|
+
# but will not recurse into them.
|
1044
|
+
for (dirpath, dirnames, filenames) in os.walk(expanded_src):
|
1045
|
+
config_hash.update(dirpath.encode('utf-8') + b'\0')
|
1046
|
+
|
1047
|
+
# Note: inplace sort will also affect the traversal order of
|
1048
|
+
# os.walk. We need it so that the os.walk order is
|
1049
|
+
# deterministic.
|
1050
|
+
dirnames.sort()
|
1051
|
+
# This includes symlinks to directories. os.walk will recurse
|
1052
|
+
# into all the directories but not the symlinks. We don't hash
|
1053
|
+
# the link destination, so if a symlink to a directory changes,
|
1054
|
+
# we won't notice.
|
1055
|
+
for dirname in dirnames:
|
1056
|
+
config_hash.update(dirname.encode('utf-8') + b'\0')
|
1057
|
+
config_hash.update(b'\0')
|
1058
|
+
|
1059
|
+
filenames.sort()
|
1060
|
+
# This includes symlinks to files. We could hash the symlink
|
1061
|
+
# destination itself but instead just hash the destination
|
1062
|
+
# contents.
|
1063
|
+
for filename in filenames:
|
1064
|
+
config_hash.update(filename.encode('utf-8') + b'\0')
|
1065
|
+
config_hash.update(
|
1066
|
+
_hash_file(os.path.join(dirpath, filename)))
|
1067
|
+
config_hash.update(b'\0')
|
1068
|
+
|
1069
|
+
else:
|
1070
|
+
logger.debug(
|
1071
|
+
f'Unexpected file_mount that is not a file or dir: {src}')
|
1072
|
+
|
1073
|
+
config_hash.update(b'\0\0')
|
1074
|
+
|
1075
|
+
return config_hash.hexdigest()
|
1076
|
+
|
1077
|
+
|
1087
1078
|
def get_docker_user(ip: str, cluster_config_file: str) -> str:
|
1088
1079
|
"""Find docker container username."""
|
1089
1080
|
ssh_credentials = ssh_credential_from_yaml(cluster_config_file)
|
@@ -1139,7 +1130,8 @@ def wait_until_ray_cluster_ready(
|
|
1139
1130
|
runner = command_runner.SSHCommandRunner(node=(head_ip, 22),
|
1140
1131
|
**ssh_credentials)
|
1141
1132
|
with rich_utils.safe_status(
|
1142
|
-
'
|
1133
|
+
ux_utils.spinner_message('Waiting for workers',
|
1134
|
+
log_path=log_path)) as worker_status:
|
1143
1135
|
while True:
|
1144
1136
|
rc, output, stderr = runner.run(
|
1145
1137
|
instance_setup.RAY_STATUS_WITH_SKY_RAY_PORT_COMMAND,
|
@@ -1155,9 +1147,11 @@ def wait_until_ray_cluster_ready(
|
|
1155
1147
|
ready_head, ready_workers = _count_healthy_nodes_from_ray(
|
1156
1148
|
output, is_local_cloud=is_local_cloud)
|
1157
1149
|
|
1158
|
-
worker_status.update(
|
1159
|
-
|
1160
|
-
|
1150
|
+
worker_status.update(
|
1151
|
+
ux_utils.spinner_message(
|
1152
|
+
f'{ready_workers} out of {num_nodes - 1} '
|
1153
|
+
'workers ready',
|
1154
|
+
log_path=log_path))
|
1161
1155
|
|
1162
1156
|
# In the local case, ready_head=0 and ready_workers=num_nodes. This
|
1163
1157
|
# is because there is no matching regex for _LAUNCHED_HEAD_PATTERN.
|
```diff
@@ -1207,7 +1201,7 @@ def wait_until_ray_cluster_ready(
 
 
 def ssh_credential_from_yaml(
-    cluster_yaml: str,
+    cluster_yaml: Optional[str],
     docker_user: Optional[str] = None,
     ssh_user: Optional[str] = None,
 ) -> Dict[str, Any]:
@@ -1219,6 +1213,8 @@ def ssh_credential_from_yaml(
         the docker container.
         ssh_user: override the ssh_user in the cluster yaml.
     """
+    if cluster_yaml is None:
+        return dict()
     config = common_utils.read_yaml(cluster_yaml)
     auth_section = config['auth']
     if ssh_user is None:
@@ -1226,6 +1222,12 @@ def ssh_credential_from_yaml(
     ssh_private_key = auth_section.get('ssh_private_key')
     ssh_control_name = config.get('cluster_name', '__default__')
     ssh_proxy_command = auth_section.get('ssh_proxy_command')
+
+    # Update the ssh_user placeholder in proxy command, if required
+    if (ssh_proxy_command is not None and
+            constants.SKY_SSH_USER_PLACEHOLDER in ssh_proxy_command):
+        ssh_proxy_command = ssh_proxy_command.replace(
+            constants.SKY_SSH_USER_PLACEHOLDER, ssh_user)
     credentials = {
         'ssh_user': ssh_user,
         'ssh_private_key': ssh_private_key,
```
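The new placeholder substitution lets a cluster YAML ship a proxy command whose login user is resolved only when the credentials are assembled. A toy illustration, with an assumed value for the placeholder token (the real one is `constants.SKY_SSH_USER_PLACEHOLDER`; its exact string is not shown in this diff):

```python
SKY_SSH_USER_PLACEHOLDER = 'skypilot:ssh_user'  # assumed token value

proxy_command = ('ssh -W %h:%p -o StrictHostKeyChecking=no '
                 f'{SKY_SSH_USER_PLACEHOLDER}@jump.example.com')
ssh_user = 'ubuntu'

# Mirror of the added logic: substitute the placeholder if present.
if SKY_SSH_USER_PLACEHOLDER in proxy_command:
    proxy_command = proxy_command.replace(SKY_SSH_USER_PLACEHOLDER, ssh_user)

print(proxy_command)
# ssh -W %h:%p -o StrictHostKeyChecking=no ubuntu@jump.example.com
```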
```diff
@@ -1242,18 +1244,18 @@ def ssh_credential_from_yaml(
 
 
 def parallel_data_transfer_to_nodes(
-
-
-
-
-
-
-
-
-
-
-
-):
+        runners: List[command_runner.CommandRunner],
+        source: Optional[str],
+        target: str,
+        cmd: Optional[str],
+        run_rsync: bool,
+        *,
+        action_message: str,
+        # Advanced options.
+        log_path: str = os.devnull,
+        stream_logs: bool = False,
+        source_bashrc: bool = False,
+        num_threads: Optional[int] = None):
     """Runs a command on all nodes and optionally runs rsync from src->dst.
 
     Args:
@@ -1265,8 +1267,8 @@ def parallel_data_transfer_to_nodes(
         log_path: str; Path to the log file
         stream_logs: bool; Whether to stream logs to stdout
         source_bashrc: bool; Source bashrc before running the command.
+        num_threads: Optional[int]; Number of threads to use.
     """
-    fore = colorama.Fore
     style = colorama.Style
 
     origin_source = source
@@ -1303,12 +1305,10 @@ def parallel_data_transfer_to_nodes(
 
     num_nodes = len(runners)
     plural = 's' if num_nodes > 1 else ''
-    message = (f'{
-        f': {
-               f'{style.BRIGHT}{target}{style.RESET_ALL}')
+    message = (f' {style.DIM}{action_message} (to {num_nodes} node{plural})'
+               f': {origin_source} -> {target}{style.RESET_ALL}')
     logger.info(message)
-
-    subprocess_utils.run_in_parallel(_sync_node, runners)
+    subprocess_utils.run_in_parallel(_sync_node, runners, num_threads)
 
 
 def check_local_gpus() -> bool:
@@ -1335,12 +1335,6 @@ def check_local_gpus() -> bool:
     return is_functional
 
 
-def generate_cluster_name():
-    # TODO: change this ID formatting to something more pleasant.
-    # User name is helpful in non-isolated accounts, e.g., GCP, Azure.
-    return f'sky-{uuid.uuid4().hex[:4]}-{common_utils.get_cleaned_username()}'
-
-
 def _query_head_ip_with_retries(cluster_yaml: str,
                                 max_attempts: int = 1) -> str:
     """Returns the IP of the head node by querying the cloud.
```
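`subprocess_utils.run_in_parallel` now accepts an explicit `num_threads`, capping the pool used for the per-node syncs. A plausible stdlib equivalent, assuming the helper simply maps a function over its inputs (the real SkyPilot helper may differ in details):

```python
from concurrent.futures import ThreadPoolExecutor
from typing import Callable, Iterable, List, Optional, TypeVar

T = TypeVar('T')
R = TypeVar('R')


def run_in_parallel(fn: Callable[[T], R],
                    items: Iterable[T],
                    num_threads: Optional[int] = None) -> List[R]:
    """Map fn over items on a thread pool with an optional size cap."""
    items = list(items)
    # Default to one thread per item when no cap is given.
    max_workers = num_threads if num_threads is not None else max(len(items), 1)
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        return list(pool.map(fn, items))
```

Capping the pool matters here because each worker opens an SSH/rsync connection; an unbounded pool on a large cluster can exhaust file descriptors or trip remote connection limits.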
```diff
@@ -1406,8 +1400,8 @@ def get_node_ips(cluster_yaml: str,
     """
     ray_config = common_utils.read_yaml(cluster_yaml)
     # Use the new provisioner for AWS.
-    provider_name =
-    cloud =
+    provider_name = cluster_utils.get_provider_name(ray_config)
+    cloud = registry.CLOUD_REGISTRY.from_str(provider_name)
     assert cloud is not None, provider_name
 
     if cloud.PROVISIONER_VERSION >= clouds.ProvisionerVersion.SKYPILOT:
@@ -1506,6 +1500,7 @@ def check_network_connection():
             'Network seems down.') from e
 
 
+@timeline.event
 def check_owner_identity(cluster_name: str) -> None:
     """Check if current user is the same as the user who created the cluster.
 
@@ -1525,58 +1520,65 @@ def check_owner_identity(cluster_name: str) -> None:
         return
 
     cloud = handle.launched_resources.cloud
-
+    user_identities = cloud.get_user_identities()
     owner_identity = record['owner']
-    if
+    if user_identities is None:
         # Skip the check if the cloud does not support user identity.
         return
     # The user identity can be None, if the cluster is created by an older
     # version of SkyPilot. In that case, we set the user identity to the
-    # current one.
+    # current active one.
     # NOTE: a user who upgrades SkyPilot and switches to a new cloud identity
     # immediately without `sky status --refresh` first, will cause a leakage
     # of the existing cluster. We deem this an acceptable tradeoff mainly
     # because multi-identity is not common (at least at the moment).
     if owner_identity is None:
         global_user_state.set_owner_identity_for_cluster(
-            cluster_name,
+            cluster_name, user_identities[0])
     else:
         assert isinstance(owner_identity, list)
         # It is OK if the owner identity is shorter, which will happen when
         # the cluster is launched before #1808. In that case, we only check
         # the same length (zip will stop at the shorter one).
-        for
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        for identity in user_identities:
+            for i, (owner, current) in enumerate(zip(owner_identity, identity)):
+                # Clean up the owner identity for the backslash and newlines,
+                # caused by the cloud CLI output, e.g. gcloud.
+                owner = owner.replace('\n', '').replace('\\', '')
+                if owner == current:
+                    if i != 0:
+                        logger.warning(
+                            f'The cluster was owned by {owner_identity}, but '
+                            f'a new identity {identity} is activated. We still '
+                            'allow the operation as the two identities are '
+                            'likely to have the same access to the cluster. '
+                            'Please be aware that this can cause unexpected '
+                            'cluster leakage if the two identities are not '
+                            'actually equivalent (e.g., belong to the same '
+                            'person).')
+                    if i != 0 or len(owner_identity) != len(identity):
+                        # We update the owner of a cluster, when:
+                        # 1. The strictest identity (i.e. the first one) does
+                        #    not match, but the latter ones match.
+                        # 2. The length of the two identities are different,
+                        #    which will only happen when the cluster is launched
+                        #    before #1808. Update the user identity to avoid
+                        #    showing the warning above again.
+                        global_user_state.set_owner_identity_for_cluster(
+                            cluster_name, identity)
+                    return  # The user identity matches.
+        # Generate error message if no match found
+        if len(user_identities) == 1:
+            err_msg = f'the activated identity is {user_identities[0]!r}.'
+        else:
+            err_msg = (f'available identities are {user_identities!r}.')
+        if cloud.is_same_cloud(clouds.Kubernetes()):
+            err_msg += (' Check your kubeconfig file and make sure the '
+                        'correct context is available.')
         with ux_utils.print_exception_no_traceback():
             raise exceptions.ClusterOwnerIdentityMismatchError(
                 f'{cluster_name!r} ({cloud}) is owned by account '
-                f'{owner_identity!r}, but
-                f'is {current_user_identity!r}.')
+                f'{owner_identity!r}, but ' + err_msg)
 
 
 def tag_filter_for_cluster(cluster_name: str) -> Dict[str, str]:
```
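The rewritten ownership check compares the recorded owner identity against every currently active identity, strictest first. A condensed sketch of just the matching step (`find_matching_identity` is a hypothetical helper written for illustration, not SkyPilot API):

```python
from typing import List, Optional


def find_matching_identity(
        owner_identity: List[str],
        user_identities: List[List[str]]) -> Optional[List[str]]:
    """Return the first active identity matching the recorded owner."""
    for identity in user_identities:
        # zip() stops at the shorter list, so records written before the
        # identity format was extended still compare cleanly.
        for owner, current in zip(owner_identity, identity):
            # Strip CLI artifacts (gcloud may wrap long emails with '\').
            owner = owner.replace('\n', '').replace('\\', '')
            if owner == current:
                return identity
    return None
```

As in the diff, a match on any component is accepted; the real code additionally warns and rewrites the stored owner when only a weaker (non-first) component matched.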
```diff
@@ -1648,14 +1650,14 @@ def check_can_clone_disk_and_override_task(
         The task to use and the resource handle of the source cluster.
 
     Raises:
-
+        exceptions.ClusterDoesNotExist: If the source cluster does not exist.
         exceptions.NotSupportedError: If the source cluster is not valid or the
             task is not compatible to clone disk from the source cluster.
     """
     source_cluster_status, handle = refresh_cluster_status_handle(cluster_name)
     if source_cluster_status is None:
         with ux_utils.print_exception_no_traceback():
-            raise
+            raise exceptions.ClusterDoesNotExist(
                 f'Cannot find cluster {cluster_name!r} to clone disk from.')
 
     if not isinstance(handle, backends.CloudVmRayResourceHandle):
@@ -1667,7 +1669,7 @@ def check_can_clone_disk_and_override_task(
         with ux_utils.print_exception_no_traceback():
             raise exceptions.NotSupportedError(
                 f'Cannot clone disk from cluster {cluster_name!r} '
-                f'({source_cluster_status!r}). Please stop the '
+                f'({source_cluster_status.value!r}). Please stop the '
                 f'cluster first: sky stop {cluster_name}')
 
     if target_cluster_name is not None:
@@ -1747,18 +1749,44 @@ def check_can_clone_disk_and_override_task(
     return task, handle
 
 
-def
-
-
+def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
+    """Update the cluster status.
+
+    The cluster status is updated by checking ray cluster and real status from
+    cloud.
+
+    The function will update the cached cluster status in the global state. For
+    the design of the cluster status and transition, please refer to the
+    sky/design_docs/cluster_status.md
+
+    Note: this function is only safe to be called when the caller process is
+    holding the cluster lock, which means no other processes are modifying the
+    cluster.
+
+    Returns:
+        If the cluster is terminated or does not exist, return None. Otherwise
+        returns the input record with status and handle potentially updated.
 
     Raises:
+        exceptions.ClusterOwnerIdentityMismatchError: if the current user is
+            not the same as the user who created the cluster.
+        exceptions.CloudUserIdentityError: if we fail to get the current user
+            identity.
         exceptions.ClusterStatusFetchingError: the cluster status cannot be
-            fetched from the cloud provider
+            fetched from the cloud provider or there are leaked nodes causing
+            the node number larger than expected.
     """
     record = global_user_state.get_cluster_from_name(cluster_name)
     if record is None:
         return None
     handle = record['handle']
+    if handle.cluster_yaml is None:
+        # Remove cluster from db since this cluster does not have a config file
+        # or any other ongoing requests
+        global_user_state.remove_cluster(cluster_name, terminate=True)
+        logger.debug(f'Cluster {cluster_name!r} has no YAML file. '
+                     'Removing the cluster from cache.')
+        return None
     if not isinstance(handle, backends.CloudVmRayResourceHandle):
         return record
     cluster_name = handle.cluster_name
@@ -1813,13 +1841,12 @@ def _update_cluster_status_no_lock(
         logger.debug(
             f'Refreshing status ({cluster_name!r}) failed to get IPs.')
     except RuntimeError as e:
-        logger.debug(
+        logger.debug(common_utils.format_exception(e))
     except Exception as e:  # pylint: disable=broad-except
         # This can be raised by `external_ssh_ports()`, due to the
         # underlying call to kubernetes API.
-        logger.debug(
-
-            f'{common_utils.format_exception(e, use_bracket=True)}')
+        logger.debug(f'Refreshing status ({cluster_name!r}) failed: ',
+                     exc_info=e)
         return False
 
     # Determining if the cluster is healthy (UP):
@@ -1843,9 +1870,27 @@ def _update_cluster_status_no_lock(
             requested_resources=None,
             ready=True,
             is_launch=False)
-        return
+        return global_user_state.get_cluster_from_name(cluster_name)
 
     # All cases below are transitioning the cluster to non-UP states.
+
+    if (not node_statuses and handle.launched_resources.cloud.STATUS_VERSION >=
+            clouds.StatusVersion.SKYPILOT):
+        # Note: launched_at is set during sky launch, even on an existing
+        # cluster. This will catch the case where the cluster was terminated on
+        # the cloud and restarted by sky launch.
+        time_since_launch = time.time() - record['launched_at']
+        if (record['status'] == status_lib.ClusterStatus.INIT and
+                time_since_launch < _LAUNCH_DOUBLE_CHECK_WINDOW):
+            # It's possible the instances for this cluster were just created,
+            # and haven't appeared yet in the cloud API/console. Wait for a bit
+            # and check again. This is a best-effort leak prevention check.
+            # See https://github.com/skypilot-org/skypilot/issues/4431.
+            time.sleep(_LAUNCH_DOUBLE_CHECK_DELAY)
+            node_statuses = _query_cluster_status_via_cloud_api(handle)
+            # Note: even if all the node_statuses are UP now, we will still
+            # consider this cluster abnormal, and its status will be INIT.
+
     if len(node_statuses) > handle.launched_nodes:
         # Unexpected: in the queried region more than 1 cluster with the same
         # constructed name tag returned. This will typically not happen unless
@@ -1874,13 +1919,15 @@ def _update_cluster_status_no_lock(
             f'{colorama.Style.RESET_ALL}')
     assert len(node_statuses) <= handle.launched_nodes
 
-    # If the node_statuses is empty, all the nodes are
-    #
-    # where the cluster is terminated by the user manually through
+    # If the node_statuses is empty, it should mean that all the nodes are
+    # terminated and we can set the cluster status to TERMINATED. This handles
+    # the edge case where the cluster is terminated by the user manually through
+    # the UI.
     to_terminate = not node_statuses
 
-    # A cluster is considered "abnormal", if not all nodes are
-    # not all nodes are STOPPED. We check that with the following
+    # A cluster is considered "abnormal", if some (but not all) nodes are
+    # TERMINATED, or not all nodes are STOPPED. We check that with the following
+    # logic:
     # * Not all nodes are terminated and there's at least one node
     #   terminated; or
     # * Any of the non-TERMINATED nodes is in a non-STOPPED status.
```
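The new empty-status double check guards against declaring a just-launched cluster dead before its instances show up in the cloud API. A self-contained sketch of the pattern, with assumed constant values (the real `_LAUNCH_DOUBLE_CHECK_WINDOW` and `_LAUNCH_DOUBLE_CHECK_DELAY` are module-level constants whose values are not shown in this diff):

```python
import time
from typing import Callable, List

# Assumed values for illustration only.
_LAUNCH_DOUBLE_CHECK_WINDOW = 60  # distrust "no nodes" this soon after launch
_LAUNCH_DOUBLE_CHECK_DELAY = 1


def query_with_double_check(query_fn: Callable[[], List[str]],
                            launched_at: float, status: str) -> List[str]:
    """Re-query once if an INIT cluster looks empty right after launch."""
    node_statuses = query_fn()
    just_launched = time.time() - launched_at < _LAUNCH_DOUBLE_CHECK_WINDOW
    if not node_statuses and status == 'INIT' and just_launched:
        # Freshly created instances may not be listed by the cloud API yet.
        time.sleep(_LAUNCH_DOUBLE_CHECK_DELAY)
        node_statuses = query_fn()
    return node_statuses
```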
```diff
@@ -1892,6 +1939,8 @@ def _update_cluster_status_no_lock(
     #   cluster is probably down.
     # * The cluster is partially terminated or stopped should be considered
     #   abnormal.
+    # * The cluster is partially or completely in the INIT state, which means
+    #   that provisioning was interrupted. This is considered abnormal.
     #
     # An abnormal cluster will transition to INIT and have any autostop setting
     # reset (unless it's autostopping/autodowning).
@@ -1921,7 +1970,8 @@ def _update_cluster_status_no_lock(
     except exceptions.CommandError as e:
         success = False
         if e.returncode == 255:
-
+            word = 'autostopped' if noun == 'autostop' else 'autodowned'
+            logger.debug(f'The cluster is likely {word}.')
             reset_local_autostop = False
     except (Exception, SystemExit) as e:  # pylint: disable=broad-except
         success = False
@@ -1973,52 +2023,22 @@ def _update_cluster_status_no_lock(
     return global_user_state.get_cluster_from_name(cluster_name)
 
 
-def
-
-
-
-
-
-
-    The cluster status is updated by checking ray cluster and real status from
-    cloud.
-
-    The function will update the cached cluster status in the global state. For
-    the design of the cluster status and transition, please refer to the
-    sky/design_docs/cluster_status.md
-
-    Args:
-        cluster_name: The name of the cluster.
-        acquire_per_cluster_status_lock: Whether to acquire the per-cluster lock
-            before updating the status.
-        cluster_status_lock_timeout: The timeout to acquire the per-cluster
-            lock.
-
-    Returns:
-        If the cluster is terminated or does not exist, return None. Otherwise
-        returns the input record with status and handle potentially updated.
+def _must_refresh_cluster_status(
+        record: Dict[str, Any],
+        force_refresh_statuses: Optional[Set[status_lib.ClusterStatus]]
+) -> bool:
+    force_refresh_for_cluster = (force_refresh_statuses is not None and
+                                 record['status'] in force_refresh_statuses)
 
-
-
-
-
-
-
-
-            the node number larger than expected.
-    """
-    if not acquire_per_cluster_status_lock:
-        return _update_cluster_status_no_lock(cluster_name)
+    use_spot = record['handle'].launched_resources.use_spot
+    has_autostop = (record['status'] != status_lib.ClusterStatus.STOPPED and
+                    record['autostop'] >= 0)
+    recently_refreshed = (record['status_updated_at'] is not None and
+                          time.time() - record['status_updated_at'] <
+                          _CLUSTER_STATUS_CACHE_DURATION_SECONDS)
+    is_stale = (use_spot or has_autostop) and not recently_refreshed
 
-
-    with filelock.FileLock(CLUSTER_STATUS_LOCK_PATH.format(cluster_name),
-                           timeout=cluster_status_lock_timeout):
-        return _update_cluster_status_no_lock(cluster_name)
-    except filelock.Timeout:
-        logger.debug('Refreshing status: Failed get the lock for cluster '
-                     f'{cluster_name!r}. Using the cached status.')
-        record = global_user_state.get_cluster_from_name(cluster_name)
-        return record
+    return force_refresh_for_cluster or is_stale
 
 
 def refresh_cluster_record(
```
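`_must_refresh_cluster_status` folds two triggers into one predicate: membership in a caller-supplied force set, and a staleness test that only applies to clusters whose status can change behind SkyPilot's back (spot or autostop). A worked toy evaluation, with an assumed cache TTL (the real `_CLUSTER_STATUS_CACHE_DURATION_SECONDS` value is not shown in this diff):

```python
import time

_CLUSTER_STATUS_CACHE_DURATION_SECONDS = 2  # assumed TTL

record = {
    'status': 'UP',
    'autostop': 10,                        # autostop is set
    'use_spot': True,                      # preemptible instances
    'status_updated_at': time.time() - 5,  # older than the TTL
}

recently_refreshed = (record['status_updated_at'] is not None and
                      time.time() - record['status_updated_at'] <
                      _CLUSTER_STATUS_CACHE_DURATION_SECONDS)
is_stale = ((record['use_spot'] or record['autostop'] >= 0) and
            not recently_refreshed)
print(is_stale)  # True: the cached status is too old to trust
```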
```diff
@@ -2030,22 +2050,28 @@ def refresh_cluster_record(
 ) -> Optional[Dict[str, Any]]:
     """Refresh the cluster, and return the possibly updated record.
 
-
-
+    The function will update the cached cluster status in the global state. For
+    the design of the cluster status and transition, please refer to the
+    sky/design_docs/cluster_status.md
 
     Args:
         cluster_name: The name of the cluster.
-        force_refresh_statuses: if specified, refresh the cluster if it has one
-            the specified statuses. Additionally, clusters satisfying the
-            following conditions will
-
-
+        force_refresh_statuses: if specified, refresh the cluster if it has one
+            of the specified statuses. Additionally, clusters satisfying the
+            following conditions will be refreshed whether or not the argument
+            is specified:
+            - the latest available status update is more than
+              _CLUSTER_STATUS_CACHE_DURATION_SECONDS old, and one of:
+              1. the cluster is a spot cluster, or
+              2. cluster autostop is set and the cluster is not STOPPED.
         acquire_per_cluster_status_lock: Whether to acquire the per-cluster lock
-            before updating the status.
+            before updating the status. Even if this is True, the lock may not
+            be acquired if the status does not need to be refreshed.
         cluster_status_lock_timeout: The timeout to acquire the per-cluster
-            lock. If timeout, the function will use the cached status.
+            lock. If timeout, the function will use the cached status. If the
+            value is <0, do not timeout (wait for the lock indefinitely). By
+            default, this is set to CLUSTER_STATUS_LOCK_TIMEOUT_SECONDS.
+            Warning: if correctness is required, you must set this to -1.
 
     Returns:
         If the cluster is terminated or does not exist, return None.
@@ -2066,19 +2092,55 @@ def refresh_cluster_record(
         return None
     check_owner_identity(cluster_name)
 
-
-
-
-
-
-
-
-
-
-
-
-
-
+    if not isinstance(record['handle'], backends.CloudVmRayResourceHandle):
+        return record
+
+    # The loop logic allows us to notice if the status was updated in the
+    # global_user_state by another process and stop trying to get the lock.
+    # The core loop logic is adapted from FileLock's implementation.
+    lock = filelock.FileLock(CLUSTER_STATUS_LOCK_PATH.format(cluster_name))
+    start_time = time.perf_counter()
+
+    # Loop until we have an up-to-date status or until we acquire the lock.
+    while True:
+        # Check to see if we can return the cached status.
+        if not _must_refresh_cluster_status(record, force_refresh_statuses):
+            return record
+
+        if not acquire_per_cluster_status_lock:
+            return _update_cluster_status(cluster_name)
+
+        # Try to acquire the lock so we can fetch the status.
+        try:
+            with lock.acquire(blocking=False):
+                # Check the cluster status again, since it could have been
+                # updated between our last check and acquiring the lock.
+                record = global_user_state.get_cluster_from_name(cluster_name)
+                if record is None or not _must_refresh_cluster_status(
+                        record, force_refresh_statuses):
+                    return record
+                # Update and return the cluster status.
+                return _update_cluster_status(cluster_name)
+        except filelock.Timeout:
+            # lock.acquire() will throw a Timeout exception if the lock is not
+            # available and we have blocking=False.
+            pass
+
+        # Logic adapted from FileLock.acquire().
+        # If cluster_status_lock_time is <0, we will never hit this. No timeout.
+        # Otherwise, if we have timed out, return the cached status. This has
+        # the potential to cause correctness issues, but if so it is the
+        # caller's responsibility to set the timeout to -1.
+        if 0 <= cluster_status_lock_timeout < time.perf_counter() - start_time:
+            logger.debug('Refreshing status: Failed get the lock for cluster '
+                         f'{cluster_name!r}. Using the cached status.')
+            return record
+        time.sleep(0.05)
+
+        # Refresh for next loop iteration.
+        record = global_user_state.get_cluster_from_name(cluster_name)
+        if record is None:
+            return None
 
 
 @timeline.event
```
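The loop above replaces a single blocking `FileLock` acquire with a poll that can exit early when another process has already refreshed the record. The same pattern in isolation, using the `filelock` package (`is_fresh` and `do_update` are hypothetical callables standing in for the cache check and the status query):

```python
import time

import filelock  # pip install filelock


def refresh_with_lock(lock_path: str, is_fresh, do_update,
                      timeout: float = 10.0) -> None:
    """Poll for a lock, bailing out early if the data became fresh."""
    lock = filelock.FileLock(lock_path)
    start = time.perf_counter()
    while True:
        if is_fresh():
            return  # another process refreshed it; no lock needed
        try:
            # blocking=False raises filelock.Timeout immediately if held.
            with lock.acquire(blocking=False):
                if not is_fresh():  # re-check after winning the race
                    do_update()
                return
        except filelock.Timeout:
            pass  # lock held elsewhere; fall through to the timeout check
        if 0 <= timeout < time.perf_counter() - start:
            return  # give up; the caller falls back to cached data
        time.sleep(0.05)
```

The re-check after acquiring the lock is the essential step: between the staleness test and the acquire, the lock holder may have written a fresh status, in which case a second cloud query would be wasted work.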
```diff
@@ -2141,7 +2203,7 @@ def check_cluster_available(
     """Check if the cluster is available.
 
     Raises:
-
+        exceptions.ClusterDoesNotExist: if the cluster does not exist.
         exceptions.ClusterNotUpError: if the cluster is not UP.
         exceptions.NotSupportedError: if the cluster is not based on
             CloudVmRayBackend.
@@ -2206,7 +2268,8 @@ def check_cluster_available(
         error_msg += message
 
     with ux_utils.print_exception_no_traceback():
-        raise
+        raise exceptions.ClusterDoesNotExist(
+            f'{colorama.Fore.YELLOW}{error_msg}{reset}')
     assert cluster_status is not None, 'handle is not None but status is None'
     backend = get_backend_from_handle(handle)
     if check_cloud_vm_ray_backend and not isinstance(
@@ -2380,10 +2443,21 @@ class CloudFilter(enum.Enum):
     LOCAL = 'local'
 
 
+def _get_glob_clusters(clusters: List[str], silent: bool = False) -> List[str]:
+    """Returns a list of clusters that match the glob pattern."""
+    glob_clusters = []
+    for cluster in clusters:
+        glob_cluster = global_user_state.get_glob_cluster_names(cluster)
+        if len(glob_cluster) == 0 and not silent:
+            logger.info(f'Cluster {cluster} not found.')
+        glob_clusters.extend(glob_cluster)
+    return list(set(glob_clusters))
+
+
 def get_clusters(
-
-    refresh: bool,
+    refresh: common.StatusRefreshMode,
     cluster_names: Optional[Union[str, List[str]]] = None,
+    all_users: bool = True,
 ) -> List[Dict[str, Any]]:
     """Returns a list of cached or optionally refreshed cluster records.
 
```
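`_get_glob_clusters` delegates the pattern matching to `global_user_state.get_glob_cluster_names`; behaviorally this is shell-style globbing over stored cluster names, which the stdlib `fnmatch` module can stand in for (a sketch, not the actual state-backed implementation):

```python
import fnmatch
from typing import List


def glob_cluster_names(pattern: str, all_names: List[str]) -> List[str]:
    """Stand-in for the state-backed lookup: glob over known names."""
    return [name for name in all_names if fnmatch.fnmatchcase(name, pattern)]


names = ['sky-dev-1', 'sky-dev-2', 'train-gpu']
print(glob_cluster_names('sky-dev-*', names))  # ['sky-dev-1', 'sky-dev-2']
```

Deduplicating with `list(set(...))`, as the real helper does, matters because two patterns passed by the user can match the same cluster.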
```diff
@@ -2408,20 +2482,55 @@ def get_clusters(
         terminated, the record will be omitted from the returned list.
     """
     records = global_user_state.get_clusters()
-
-
+    if not all_users:
+        current_user_hash = common_utils.get_user_hash()
     records = [
         record for record in records
-        if
+        if record['user_hash'] == current_user_hash
     ]
 
     yellow = colorama.Fore.YELLOW
     bright = colorama.Style.BRIGHT
     reset = colorama.Style.RESET_ALL
 
+    def _update_record_with_credentials_and_resources_str(
+            record: Optional[Dict[str, Any]]) -> None:
+        """Add the credentials to the record.
+
+        This is useful for the client side to setup the ssh config of the
+        cluster.
+        """
+        if record is None:
+            return
+        handle = record['handle']
+        if handle is None:
+            return
+        record['resources_str'] = resources_utils.get_readable_resources_repr(
+            handle)
+        credentials = ssh_credential_from_yaml(handle.cluster_yaml,
+                                               handle.docker_user,
+                                               handle.ssh_user)
+
+        if not credentials:
+            return
+        ssh_private_key_path = credentials.get('ssh_private_key', None)
+        if ssh_private_key_path is not None:
+            with open(os.path.expanduser(ssh_private_key_path),
+                      'r',
+                      encoding='utf-8') as f:
+                credentials['ssh_private_key_content'] = f.read()
+        else:
+            private_key_path, _ = auth.get_or_generate_keys()
+            with open(os.path.expanduser(private_key_path),
+                      'r',
+                      encoding='utf-8') as f:
+                credentials['ssh_private_key_content'] = f.read()
+        record['credentials'] = credentials
+
     if cluster_names is not None:
         if isinstance(cluster_names, str):
             cluster_names = [cluster_names]
+        cluster_names = _get_glob_clusters(cluster_names, silent=True)
         new_records = []
         not_exist_cluster_names = []
         for cluster_name in cluster_names:
```
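The new `_update_record_with_credentials_and_resources_str` inlines the SSH private key contents so a remote client can reconstruct an SSH config without filesystem access to the server. A trimmed sketch of just the key-inlining step (`attach_key_content` is hypothetical, and `fallback_key_path` stands in for `auth.get_or_generate_keys()`):

```python
import os
from typing import Any, Dict


def attach_key_content(credentials: Dict[str, Any],
                       fallback_key_path: str = '~/.ssh/sky-key') -> None:
    """Inline the private key so the record is usable off-host."""
    key_path = credentials.get('ssh_private_key') or fallback_key_path
    with open(os.path.expanduser(key_path), 'r', encoding='utf-8') as f:
        credentials['ssh_private_key_content'] = f.read()
```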
```diff
@@ -2436,23 +2545,33 @@ def get_clusters(
         logger.info(f'Cluster(s) not found: {bright}{clusters_str}{reset}.')
         records = new_records
 
-
+    # Add auth_config to the records
+    for record in records:
+        _update_record_with_credentials_and_resources_str(record)
+
+    if refresh == common.StatusRefreshMode.NONE:
         return records
 
     plural = 's' if len(records) > 1 else ''
     progress = rich_progress.Progress(transient=True,
                                       redirect_stdout=False,
                                       redirect_stderr=False)
-    task = progress.add_task(
-        f'
-
+    task = progress.add_task(ux_utils.spinner_message(
+        f'Refreshing status for {len(records)} cluster{plural}'),
+                             total=len(records))
+
+    if refresh == common.StatusRefreshMode.FORCE:
+        force_refresh_statuses = set(status_lib.ClusterStatus)
+    else:
+        force_refresh_statuses = None
 
     def _refresh_cluster(cluster_name):
         try:
             record = refresh_cluster_record(
                 cluster_name,
-                force_refresh_statuses=
+                force_refresh_statuses=force_refresh_statuses,
                 acquire_per_cluster_status_lock=True)
+            _update_record_with_credentials_and_resources_str(record)
         except (exceptions.ClusterStatusFetchingError,
                 exceptions.CloudUserIdentityError,
                 exceptions.ClusterOwnerIdentityMismatchError) as e:
@@ -2464,9 +2583,11 @@ def get_clusters(
         return record
 
     cluster_names = [record['name'] for record in records]
-
-
-
+    updated_records = []
+    if len(cluster_names) > 0:
+        with progress:
+            updated_records = subprocess_utils.run_in_parallel(
+                _refresh_cluster, cluster_names)
 
     # Show information for removed clusters.
     kept_records = []
@@ -2503,6 +2624,7 @@ def get_clusters(
         f'{len(failed_clusters)} cluster{plural}:{reset}')
     for cluster_name, e in failed_clusters:
         logger.warning(f'  {bright}{cluster_name}{reset}: {e}')
+
     return kept_records
 
 
@@ -2579,10 +2701,12 @@ def get_task_resources_str(task: 'task_lib.Task',
         the accelerator demands (if any). Otherwise, the CPU demand is shown.
     """
     spot_str = ''
+    is_controller_task = task.is_controller_task()
     task_cpu_demand = (str(constants.CONTROLLER_PROCESS_CPU_DEMAND)
-                       if
-
-
+                       if is_controller_task else str(DEFAULT_TASK_CPU_DEMAND))
+    if is_controller_task:
+        resources_str = f'CPU:{task_cpu_demand}'
+    elif task.best_resources is not None:
         accelerator_dict = task.best_resources.accelerators
         if is_managed_job:
             if task.best_resources.use_spot:
@@ -2650,27 +2774,6 @@ def stop_handler(signum, frame):
     raise KeyboardInterrupt(exceptions.SIGTSTP_CODE)
 
 
-def run_command_and_handle_ssh_failure(runner: command_runner.SSHCommandRunner,
-                                       command: str,
-                                       failure_message: str) -> str:
-    """Runs command remotely and returns output with proper error handling."""
-    rc, stdout, stderr = runner.run(command,
-                                    require_outputs=True,
-                                    stream_logs=False)
-    if rc == 255:
-        # SSH failed
-        raise RuntimeError(
-            f'SSH with user {runner.ssh_user} and key {runner.ssh_private_key} '
-            f'to {runner.ip} failed. This is most likely due to incorrect '
-            'credentials or incorrect permissions for the key file. Check '
-            'your credentials and try again.')
-    subprocess_utils.handle_returncode(rc,
-                                       command,
-                                       failure_message,
-                                       stderr=stderr)
-    return stdout
-
-
 def check_rsync_installed() -> None:
     """Checks if rsync is installed.
 
@@ -2703,15 +2806,18 @@ def check_stale_runtime_on_remote(returncode: int, stderr: str,
     pattern = re.compile(r'AttributeError: module \'sky\.(.*)\' has no '
                          r'attribute \'(.*)\'')
     if returncode != 0:
+        # TODO(zhwu): Backward compatibility for old SkyPilot runtime version on
+        # the remote cluster. Remove this after 0.10.0 is released.
         attribute_error = re.findall(pattern, stderr)
-        if attribute_error:
+        if attribute_error or 'SkyPilot runtime is too old' in stderr:
             with ux_utils.print_exception_no_traceback():
                 raise RuntimeError(
                     f'{colorama.Fore.RED}SkyPilot runtime needs to be updated '
-                    'on the remote cluster. To update, run
-
+                    f'on the remote cluster: {cluster_name}. To update, run '
+                    '(existing jobs will not be interrupted): '
+                    f'{colorama.Style.BRIGHT}sky start -f -y '
                     f'{cluster_name}{colorama.Style.RESET_ALL}'
-                    f'\n--- Details ---\n{stderr.strip()}\n')
+                    f'\n--- Details ---\n{stderr.strip()}\n') from None
 
 
 def get_endpoints(cluster: str,
```
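The stale-runtime detection keys off a characteristic `AttributeError` in the remote stderr, plus, after this change, an explicit marker string. A self-contained demo of the same regex against a fabricated traceback (the stderr text here is made up for illustration):

```python
import re

pattern = re.compile(r'AttributeError: module \'sky\.(.*)\' has no '
                     r'attribute \'(.*)\'')

stderr = ("Traceback (most recent call last):\n"
          "AttributeError: module 'sky.utils' has no attribute 'new_helper'")

if re.findall(pattern, stderr) or 'SkyPilot runtime is too old' in stderr:
    print('Remote runtime is stale; `sky start -f -y <cluster>` updates it.')
```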
```diff
@@ -2748,16 +2854,22 @@ def get_endpoints(cluster: str,
     except ValueError:
         with ux_utils.print_exception_no_traceback():
             raise ValueError(f'Invalid endpoint {port!r}.') from None
-    cluster_records = get_clusters(
-        refresh=False,
+    cluster_records = get_clusters(refresh=common.StatusRefreshMode.NONE,
                                    cluster_names=[cluster])
+    if not cluster_records:
+        with ux_utils.print_exception_no_traceback():
+            raise exceptions.ClusterNotUpError(
+                f'Cluster {cluster!r} not found.', cluster_status=None)
+
     assert len(cluster_records) == 1, cluster_records
     cluster_record = cluster_records[0]
     if (not skip_status_check and
             cluster_record['status'] != status_lib.ClusterStatus.UP):
         with ux_utils.print_exception_no_traceback():
             raise exceptions.ClusterNotUpError(
                 f'Cluster {cluster_record["name"]!r} '
-                'is not in UP status.',
+                'is not in UP status.',
+                cluster_status=cluster_record['status'],
+                handle=cluster_record['handle'])
     handle = cluster_record['handle']
     if not isinstance(handle, backends.CloudVmRayResourceHandle):
         with ux_utils.print_exception_no_traceback():
@@ -2773,7 +2885,7 @@ def get_endpoints(cluster: str,
     except exceptions.NotSupportedError:
         with ux_utils.print_exception_no_traceback():
             raise ValueError('Querying endpoints is not supported '
-                             f'for
+                             f'for {cluster!r} on {cloud}.') from None
 
     config = common_utils.read_yaml(handle.cluster_yaml)
     port_details = provision_lib.query_ports(repr(cloud),
```