skypilot-nightly 1.0.0.dev2024053101__py3-none-any.whl → 1.0.0.dev2025022801__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +64 -32
- sky/adaptors/aws.py +23 -6
- sky/adaptors/azure.py +432 -15
- sky/adaptors/cloudflare.py +5 -5
- sky/adaptors/common.py +19 -9
- sky/adaptors/do.py +20 -0
- sky/adaptors/gcp.py +3 -2
- sky/adaptors/kubernetes.py +122 -88
- sky/adaptors/nebius.py +100 -0
- sky/adaptors/oci.py +39 -1
- sky/adaptors/vast.py +29 -0
- sky/admin_policy.py +101 -0
- sky/authentication.py +117 -98
- sky/backends/backend.py +52 -20
- sky/backends/backend_utils.py +669 -557
- sky/backends/cloud_vm_ray_backend.py +1099 -808
- sky/backends/local_docker_backend.py +14 -8
- sky/backends/wheel_utils.py +38 -20
- sky/benchmark/benchmark_utils.py +22 -23
- sky/check.py +76 -27
- sky/cli.py +1586 -1139
- sky/client/__init__.py +1 -0
- sky/client/cli.py +5683 -0
- sky/client/common.py +345 -0
- sky/client/sdk.py +1765 -0
- sky/cloud_stores.py +283 -19
- sky/clouds/__init__.py +7 -2
- sky/clouds/aws.py +303 -112
- sky/clouds/azure.py +185 -179
- sky/clouds/cloud.py +115 -37
- sky/clouds/cudo.py +29 -22
- sky/clouds/do.py +313 -0
- sky/clouds/fluidstack.py +44 -54
- sky/clouds/gcp.py +206 -65
- sky/clouds/ibm.py +26 -21
- sky/clouds/kubernetes.py +345 -91
- sky/clouds/lambda_cloud.py +40 -29
- sky/clouds/nebius.py +297 -0
- sky/clouds/oci.py +129 -90
- sky/clouds/paperspace.py +22 -18
- sky/clouds/runpod.py +53 -34
- sky/clouds/scp.py +28 -24
- sky/clouds/service_catalog/__init__.py +19 -13
- sky/clouds/service_catalog/aws_catalog.py +29 -12
- sky/clouds/service_catalog/azure_catalog.py +33 -6
- sky/clouds/service_catalog/common.py +95 -75
- sky/clouds/service_catalog/constants.py +3 -3
- sky/clouds/service_catalog/cudo_catalog.py +13 -3
- sky/clouds/service_catalog/data_fetchers/fetch_aws.py +36 -21
- sky/clouds/service_catalog/data_fetchers/fetch_azure.py +31 -4
- sky/clouds/service_catalog/data_fetchers/fetch_cudo.py +8 -117
- sky/clouds/service_catalog/data_fetchers/fetch_fluidstack.py +197 -44
- sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +224 -36
- sky/clouds/service_catalog/data_fetchers/fetch_lambda_cloud.py +44 -24
- sky/clouds/service_catalog/data_fetchers/fetch_vast.py +147 -0
- sky/clouds/service_catalog/data_fetchers/fetch_vsphere.py +1 -1
- sky/clouds/service_catalog/do_catalog.py +111 -0
- sky/clouds/service_catalog/fluidstack_catalog.py +2 -2
- sky/clouds/service_catalog/gcp_catalog.py +16 -2
- sky/clouds/service_catalog/ibm_catalog.py +2 -2
- sky/clouds/service_catalog/kubernetes_catalog.py +192 -70
- sky/clouds/service_catalog/lambda_catalog.py +8 -3
- sky/clouds/service_catalog/nebius_catalog.py +116 -0
- sky/clouds/service_catalog/oci_catalog.py +31 -4
- sky/clouds/service_catalog/paperspace_catalog.py +2 -2
- sky/clouds/service_catalog/runpod_catalog.py +2 -2
- sky/clouds/service_catalog/scp_catalog.py +2 -2
- sky/clouds/service_catalog/vast_catalog.py +104 -0
- sky/clouds/service_catalog/vsphere_catalog.py +2 -2
- sky/clouds/utils/aws_utils.py +65 -0
- sky/clouds/utils/azure_utils.py +91 -0
- sky/clouds/utils/gcp_utils.py +5 -9
- sky/clouds/utils/oci_utils.py +47 -5
- sky/clouds/utils/scp_utils.py +4 -3
- sky/clouds/vast.py +280 -0
- sky/clouds/vsphere.py +22 -18
- sky/core.py +361 -107
- sky/dag.py +41 -28
- sky/data/data_transfer.py +37 -0
- sky/data/data_utils.py +211 -32
- sky/data/mounting_utils.py +182 -30
- sky/data/storage.py +2118 -270
- sky/data/storage_utils.py +126 -5
- sky/exceptions.py +179 -8
- sky/execution.py +158 -85
- sky/global_user_state.py +150 -34
- sky/jobs/__init__.py +12 -10
- sky/jobs/client/__init__.py +0 -0
- sky/jobs/client/sdk.py +302 -0
- sky/jobs/constants.py +49 -11
- sky/jobs/controller.py +161 -99
- sky/jobs/dashboard/dashboard.py +171 -25
- sky/jobs/dashboard/templates/index.html +572 -60
- sky/jobs/recovery_strategy.py +157 -156
- sky/jobs/scheduler.py +307 -0
- sky/jobs/server/__init__.py +1 -0
- sky/jobs/server/core.py +598 -0
- sky/jobs/server/dashboard_utils.py +69 -0
- sky/jobs/server/server.py +190 -0
- sky/jobs/state.py +627 -122
- sky/jobs/utils.py +615 -206
- sky/models.py +27 -0
- sky/optimizer.py +142 -83
- sky/provision/__init__.py +20 -5
- sky/provision/aws/config.py +124 -42
- sky/provision/aws/instance.py +130 -53
- sky/provision/azure/__init__.py +7 -0
- sky/{skylet/providers → provision}/azure/azure-config-template.json +19 -7
- sky/provision/azure/config.py +220 -0
- sky/provision/azure/instance.py +1012 -37
- sky/provision/common.py +31 -3
- sky/provision/constants.py +25 -0
- sky/provision/cudo/__init__.py +2 -1
- sky/provision/cudo/cudo_utils.py +112 -0
- sky/provision/cudo/cudo_wrapper.py +37 -16
- sky/provision/cudo/instance.py +28 -12
- sky/provision/do/__init__.py +11 -0
- sky/provision/do/config.py +14 -0
- sky/provision/do/constants.py +10 -0
- sky/provision/do/instance.py +287 -0
- sky/provision/do/utils.py +301 -0
- sky/provision/docker_utils.py +82 -46
- sky/provision/fluidstack/fluidstack_utils.py +57 -125
- sky/provision/fluidstack/instance.py +15 -43
- sky/provision/gcp/config.py +19 -9
- sky/provision/gcp/constants.py +7 -1
- sky/provision/gcp/instance.py +55 -34
- sky/provision/gcp/instance_utils.py +339 -80
- sky/provision/gcp/mig_utils.py +210 -0
- sky/provision/instance_setup.py +172 -133
- sky/provision/kubernetes/__init__.py +1 -0
- sky/provision/kubernetes/config.py +104 -90
- sky/provision/kubernetes/constants.py +8 -0
- sky/provision/kubernetes/instance.py +680 -325
- sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml +3 -0
- sky/provision/kubernetes/network.py +54 -20
- sky/provision/kubernetes/network_utils.py +70 -21
- sky/provision/kubernetes/utils.py +1370 -251
- sky/provision/lambda_cloud/__init__.py +11 -0
- sky/provision/lambda_cloud/config.py +10 -0
- sky/provision/lambda_cloud/instance.py +265 -0
- sky/{clouds/utils → provision/lambda_cloud}/lambda_utils.py +24 -23
- sky/provision/logging.py +1 -1
- sky/provision/nebius/__init__.py +11 -0
- sky/provision/nebius/config.py +11 -0
- sky/provision/nebius/instance.py +285 -0
- sky/provision/nebius/utils.py +318 -0
- sky/provision/oci/__init__.py +15 -0
- sky/provision/oci/config.py +51 -0
- sky/provision/oci/instance.py +436 -0
- sky/provision/oci/query_utils.py +681 -0
- sky/provision/paperspace/constants.py +6 -0
- sky/provision/paperspace/instance.py +4 -3
- sky/provision/paperspace/utils.py +2 -0
- sky/provision/provisioner.py +207 -130
- sky/provision/runpod/__init__.py +1 -0
- sky/provision/runpod/api/__init__.py +3 -0
- sky/provision/runpod/api/commands.py +119 -0
- sky/provision/runpod/api/pods.py +142 -0
- sky/provision/runpod/instance.py +64 -8
- sky/provision/runpod/utils.py +239 -23
- sky/provision/vast/__init__.py +10 -0
- sky/provision/vast/config.py +11 -0
- sky/provision/vast/instance.py +247 -0
- sky/provision/vast/utils.py +162 -0
- sky/provision/vsphere/common/vim_utils.py +1 -1
- sky/provision/vsphere/instance.py +8 -18
- sky/provision/vsphere/vsphere_utils.py +1 -1
- sky/resources.py +247 -102
- sky/serve/__init__.py +9 -9
- sky/serve/autoscalers.py +361 -299
- sky/serve/client/__init__.py +0 -0
- sky/serve/client/sdk.py +366 -0
- sky/serve/constants.py +12 -3
- sky/serve/controller.py +106 -36
- sky/serve/load_balancer.py +63 -12
- sky/serve/load_balancing_policies.py +84 -2
- sky/serve/replica_managers.py +42 -34
- sky/serve/serve_state.py +62 -32
- sky/serve/serve_utils.py +271 -160
- sky/serve/server/__init__.py +0 -0
- sky/serve/{core.py → server/core.py} +271 -90
- sky/serve/server/server.py +112 -0
- sky/serve/service.py +52 -16
- sky/serve/service_spec.py +95 -32
- sky/server/__init__.py +1 -0
- sky/server/common.py +430 -0
- sky/server/constants.py +21 -0
- sky/server/html/log.html +174 -0
- sky/server/requests/__init__.py +0 -0
- sky/server/requests/executor.py +472 -0
- sky/server/requests/payloads.py +487 -0
- sky/server/requests/queues/__init__.py +0 -0
- sky/server/requests/queues/mp_queue.py +76 -0
- sky/server/requests/requests.py +567 -0
- sky/server/requests/serializers/__init__.py +0 -0
- sky/server/requests/serializers/decoders.py +192 -0
- sky/server/requests/serializers/encoders.py +166 -0
- sky/server/server.py +1106 -0
- sky/server/stream_utils.py +141 -0
- sky/setup_files/MANIFEST.in +2 -5
- sky/setup_files/dependencies.py +159 -0
- sky/setup_files/setup.py +14 -125
- sky/sky_logging.py +59 -14
- sky/skylet/autostop_lib.py +2 -2
- sky/skylet/constants.py +183 -50
- sky/skylet/events.py +22 -10
- sky/skylet/job_lib.py +403 -258
- sky/skylet/log_lib.py +111 -71
- sky/skylet/log_lib.pyi +6 -0
- sky/skylet/providers/command_runner.py +6 -8
- sky/skylet/providers/ibm/node_provider.py +2 -2
- sky/skylet/providers/scp/config.py +11 -3
- sky/skylet/providers/scp/node_provider.py +8 -8
- sky/skylet/skylet.py +3 -1
- sky/skylet/subprocess_daemon.py +69 -17
- sky/skypilot_config.py +119 -57
- sky/task.py +205 -64
- sky/templates/aws-ray.yml.j2 +37 -7
- sky/templates/azure-ray.yml.j2 +27 -82
- sky/templates/cudo-ray.yml.j2 +7 -3
- sky/templates/do-ray.yml.j2 +98 -0
- sky/templates/fluidstack-ray.yml.j2 +7 -4
- sky/templates/gcp-ray.yml.j2 +26 -6
- sky/templates/ibm-ray.yml.j2 +3 -2
- sky/templates/jobs-controller.yaml.j2 +46 -11
- sky/templates/kubernetes-ingress.yml.j2 +7 -0
- sky/templates/kubernetes-loadbalancer.yml.j2 +7 -0
- sky/templates/{kubernetes-port-forward-proxy-command.sh.j2 → kubernetes-port-forward-proxy-command.sh} +51 -7
- sky/templates/kubernetes-ray.yml.j2 +292 -25
- sky/templates/lambda-ray.yml.j2 +30 -40
- sky/templates/nebius-ray.yml.j2 +79 -0
- sky/templates/oci-ray.yml.j2 +18 -57
- sky/templates/paperspace-ray.yml.j2 +10 -6
- sky/templates/runpod-ray.yml.j2 +26 -4
- sky/templates/scp-ray.yml.j2 +3 -2
- sky/templates/sky-serve-controller.yaml.j2 +12 -1
- sky/templates/skypilot-server-kubernetes-proxy.sh +36 -0
- sky/templates/vast-ray.yml.j2 +70 -0
- sky/templates/vsphere-ray.yml.j2 +8 -3
- sky/templates/websocket_proxy.py +64 -0
- sky/usage/constants.py +10 -1
- sky/usage/usage_lib.py +130 -37
- sky/utils/accelerator_registry.py +35 -51
- sky/utils/admin_policy_utils.py +147 -0
- sky/utils/annotations.py +51 -0
- sky/utils/cli_utils/status_utils.py +81 -23
- sky/utils/cluster_utils.py +356 -0
- sky/utils/command_runner.py +452 -89
- sky/utils/command_runner.pyi +77 -3
- sky/utils/common.py +54 -0
- sky/utils/common_utils.py +319 -108
- sky/utils/config_utils.py +204 -0
- sky/utils/control_master_utils.py +48 -0
- sky/utils/controller_utils.py +548 -266
- sky/utils/dag_utils.py +93 -32
- sky/utils/db_utils.py +18 -4
- sky/utils/env_options.py +29 -7
- sky/utils/kubernetes/create_cluster.sh +8 -60
- sky/utils/kubernetes/deploy_remote_cluster.sh +243 -0
- sky/utils/kubernetes/exec_kubeconfig_converter.py +73 -0
- sky/utils/kubernetes/generate_kubeconfig.sh +336 -0
- sky/utils/kubernetes/gpu_labeler.py +4 -4
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +4 -3
- sky/utils/kubernetes/kubernetes_deploy_utils.py +228 -0
- sky/utils/kubernetes/rsync_helper.sh +24 -0
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +1 -1
- sky/utils/log_utils.py +240 -33
- sky/utils/message_utils.py +81 -0
- sky/utils/registry.py +127 -0
- sky/utils/resources_utils.py +94 -22
- sky/utils/rich_utils.py +247 -18
- sky/utils/schemas.py +284 -64
- sky/{status_lib.py → utils/status_lib.py} +12 -7
- sky/utils/subprocess_utils.py +212 -46
- sky/utils/timeline.py +12 -7
- sky/utils/ux_utils.py +168 -15
- skypilot_nightly-1.0.0.dev2025022801.dist-info/METADATA +363 -0
- skypilot_nightly-1.0.0.dev2025022801.dist-info/RECORD +352 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/WHEEL +1 -1
- sky/clouds/cloud_registry.py +0 -31
- sky/jobs/core.py +0 -330
- sky/skylet/providers/azure/__init__.py +0 -2
- sky/skylet/providers/azure/azure-vm-template.json +0 -301
- sky/skylet/providers/azure/config.py +0 -170
- sky/skylet/providers/azure/node_provider.py +0 -466
- sky/skylet/providers/lambda_cloud/__init__.py +0 -2
- sky/skylet/providers/lambda_cloud/node_provider.py +0 -320
- sky/skylet/providers/oci/__init__.py +0 -2
- sky/skylet/providers/oci/node_provider.py +0 -488
- sky/skylet/providers/oci/query_helper.py +0 -383
- sky/skylet/providers/oci/utils.py +0 -21
- sky/utils/cluster_yaml_utils.py +0 -24
- sky/utils/kubernetes/generate_static_kubeconfig.sh +0 -137
- skypilot_nightly-1.0.0.dev2024053101.dist-info/METADATA +0 -315
- skypilot_nightly-1.0.0.dev2024053101.dist-info/RECORD +0 -275
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/top_level.txt +0 -0
sky/utils/command_runner.py
CHANGED
@@ -5,20 +5,23 @@ import os
|
|
5
5
|
import pathlib
|
6
6
|
import shlex
|
7
7
|
import time
|
8
|
-
from typing import Any, Iterable, List, Optional, Tuple, Type, Union
|
8
|
+
from typing import Any, Callable, Iterable, List, Optional, Tuple, Type, Union
|
9
9
|
|
10
10
|
from sky import sky_logging
|
11
11
|
from sky.skylet import constants
|
12
12
|
from sky.skylet import log_lib
|
13
13
|
from sky.utils import common_utils
|
14
|
+
from sky.utils import control_master_utils
|
14
15
|
from sky.utils import subprocess_utils
|
15
16
|
from sky.utils import timeline
|
16
17
|
|
17
18
|
logger = sky_logging.init_logger(__name__)
|
18
19
|
|
19
|
-
# The git exclude file to support.
|
20
|
-
GIT_EXCLUDE = '.git/info/exclude'
|
21
20
|
# Rsync options
|
21
|
+
# TODO(zhwu): This will print a per-file progress bar (with -P),
|
22
|
+
# shooting a lot of messages to the output. --info=progress2 is used
|
23
|
+
# to get a total progress bar, but it requires rsync>=3.1.0 and Mac
|
24
|
+
# OS has a default rsync==2.6.9 (16 years old).
|
22
25
|
RSYNC_DISPLAY_OPTION = '-Pavz'
|
23
26
|
# Legend
|
24
27
|
# dir-merge: ignore file can appear in any subdir, applies to that
|
@@ -26,10 +29,14 @@ RSYNC_DISPLAY_OPTION = '-Pavz'
|
|
26
29
|
# Note that "-" is mandatory for rsync and means all patterns in the ignore
|
27
30
|
# files are treated as *exclude* patterns. Non-exclude patterns, e.g., "!
|
28
31
|
# do_not_exclude" doesn't work, even though git allows it.
|
29
|
-
|
32
|
+
RSYNC_FILTER_SKYIGNORE = f'--filter=\'dir-merge,- {constants.SKY_IGNORE_FILE}\''
|
33
|
+
RSYNC_FILTER_GITIGNORE = f'--filter=\'dir-merge,- {constants.GIT_IGNORE_FILE}\''
|
34
|
+
# The git exclude file to support.
|
35
|
+
GIT_EXCLUDE = '.git/info/exclude'
|
30
36
|
RSYNC_EXCLUDE_OPTION = '--exclude-from={}'
|
31
37
|
|
32
38
|
_HASH_MAX_LENGTH = 10
|
39
|
+
_DEFAULT_CONNECT_TIMEOUT = 30
|
33
40
|
|
34
41
|
|
35
42
|
def _ssh_control_path(ssh_control_filename: Optional[str]) -> Optional[str]:
|
@@ -60,9 +67,12 @@ def ssh_options_list(
|
|
60
67
|
) -> List[str]:
|
61
68
|
"""Returns a list of sane options for 'ssh'."""
|
62
69
|
if connect_timeout is None:
|
63
|
-
connect_timeout =
|
70
|
+
connect_timeout = _DEFAULT_CONNECT_TIMEOUT
|
64
71
|
# Forked from Ray SSHOptions:
|
65
72
|
# https://github.com/ray-project/ray/blob/master/python/ray/autoscaler/_private/command_runner.py
|
73
|
+
# Do not allow agent forwarding because SkyPilot API server has access to
|
74
|
+
# all user cluster private keys, which should not be all forwarded to
|
75
|
+
# individual user clusters.
|
66
76
|
arg_dict = {
|
67
77
|
# SSH port
|
68
78
|
'Port': port,
|
@@ -75,7 +85,7 @@ def ssh_options_list(
|
|
75
85
|
# that case.
|
76
86
|
'UserKnownHostsFile': os.devnull,
|
77
87
|
# Suppresses the warning messages, such as:
|
78
|
-
# Warning: Permanently added '
|
88
|
+
# Warning: Permanently added 'xx.xx.xx.xx' (EDxxx) to the list of
|
79
89
|
# known hosts.
|
80
90
|
'LogLevel': 'ERROR',
|
81
91
|
# Try fewer extraneous key pairs.
|
@@ -89,18 +99,25 @@ def ssh_options_list(
|
|
89
99
|
'ServerAliveCountMax': 3,
|
90
100
|
# ConnectTimeout.
|
91
101
|
'ConnectTimeout': f'{connect_timeout}s',
|
92
|
-
# Agent forwarding for git.
|
93
|
-
'ForwardAgent': 'yes',
|
94
102
|
}
|
95
103
|
# SSH Control will have a severe delay when using docker_ssh_proxy_command.
|
96
104
|
# TODO(tian): Investigate why.
|
105
|
+
#
|
106
|
+
# We disable ControlMaster when ssh_proxy_command is used, because the
|
107
|
+
# master connection will be idle although the connection might be shared
|
108
|
+
# by other ssh commands that are not idle. In that case, user's custom proxy
|
109
|
+
# command may drop the connection due to idle timeout, since it will only
|
110
|
+
# see the idle master connection. It is an issue even with the
|
111
|
+
# ServerAliveInterval set, since the keepalive message may not be recognized
|
112
|
+
# by the custom proxy command, such as AWS SSM Session Manager.
|
113
|
+
#
|
97
114
|
# We also do not use ControlMaster when we use `kubectl port-forward`
|
98
115
|
# to access Kubernetes pods over SSH+Proxycommand. This is because the
|
99
116
|
# process running ProxyCommand is kept running as long as the ssh session
|
100
117
|
# is running and the ControlMaster keeps the session, which results in
|
101
118
|
# 'ControlPersist' number of seconds delay per ssh commands ran.
|
102
119
|
if (ssh_control_name is not None and docker_ssh_proxy_command is None and
|
103
|
-
not disable_control_master):
|
120
|
+
ssh_proxy_command is None and not disable_control_master):
|
104
121
|
arg_dict.update({
|
105
122
|
# Control path: important optimization as we do multiple ssh in one
|
106
123
|
# sky.launch().
|
@@ -161,7 +178,7 @@ class CommandRunner:
|
|
161
178
|
cmd: Union[str, List[str]],
|
162
179
|
process_stream: bool,
|
163
180
|
separate_stderr: bool,
|
164
|
-
|
181
|
+
skip_num_lines: int,
|
165
182
|
source_bashrc: bool = False,
|
166
183
|
) -> str:
|
167
184
|
"""Returns the command to run."""
|
@@ -170,7 +187,7 @@ class CommandRunner:
|
|
170
187
|
|
171
188
|
# We need this to correctly run the cmd, and get the output.
|
172
189
|
command = [
|
173
|
-
'bash',
|
190
|
+
'/bin/bash',
|
174
191
|
'--login',
|
175
192
|
'-c',
|
176
193
|
]
|
@@ -193,12 +210,12 @@ class CommandRunner:
|
|
193
210
|
]
|
194
211
|
if not separate_stderr:
|
195
212
|
command.append('2>&1')
|
196
|
-
if not process_stream and
|
213
|
+
if not process_stream and skip_num_lines:
|
197
214
|
command += [
|
198
215
|
# A hack to remove the following bash warnings (twice):
|
199
216
|
# bash: cannot set terminal process group
|
200
217
|
# bash: no job control in this shell
|
201
|
-
f'| stdbuf -o0 tail -n +{
|
218
|
+
f'| stdbuf -o0 tail -n +{skip_num_lines}',
|
202
219
|
# This is required to make sure the executor of command can get
|
203
220
|
# correct returncode, since linux pipe is used.
|
204
221
|
'; exit ${PIPESTATUS[0]}'
|
@@ -207,6 +224,111 @@ class CommandRunner:
|
|
207
224
|
command_str = ' '.join(command)
|
208
225
|
return command_str
|
209
226
|
|
227
|
+
def _rsync(
|
228
|
+
self,
|
229
|
+
source: str,
|
230
|
+
target: str,
|
231
|
+
node_destination: str,
|
232
|
+
up: bool,
|
233
|
+
rsh_option: str,
|
234
|
+
# Advanced options.
|
235
|
+
log_path: str = os.devnull,
|
236
|
+
stream_logs: bool = True,
|
237
|
+
max_retry: int = 1,
|
238
|
+
prefix_command: Optional[str] = None,
|
239
|
+
get_remote_home_dir: Callable[[], str] = lambda: '~') -> None:
|
240
|
+
"""Builds the rsync command."""
|
241
|
+
# Build command.
|
242
|
+
rsync_command = []
|
243
|
+
if prefix_command is not None:
|
244
|
+
rsync_command.append(prefix_command)
|
245
|
+
rsync_command += ['rsync', RSYNC_DISPLAY_OPTION]
|
246
|
+
|
247
|
+
def _get_remote_home_dir_with_retry():
|
248
|
+
backoff = common_utils.Backoff(initial_backoff=1,
|
249
|
+
max_backoff_factor=5)
|
250
|
+
retries_left = max_retry
|
251
|
+
assert retries_left > 0, f'max_retry {max_retry} must be positive.'
|
252
|
+
while retries_left >= 0:
|
253
|
+
try:
|
254
|
+
return get_remote_home_dir()
|
255
|
+
except Exception: # pylint: disable=broad-except
|
256
|
+
if retries_left == 0:
|
257
|
+
raise
|
258
|
+
sleep_time = backoff.current_backoff()
|
259
|
+
logger.warning(f'Failed to get remote home dir '
|
260
|
+
f'- retrying in {sleep_time} seconds.')
|
261
|
+
retries_left -= 1
|
262
|
+
time.sleep(sleep_time)
|
263
|
+
|
264
|
+
# --filter
|
265
|
+
# The source is a local path, so we need to resolve it.
|
266
|
+
resolved_source = pathlib.Path(source).expanduser().resolve()
|
267
|
+
if (resolved_source / constants.SKY_IGNORE_FILE).exists():
|
268
|
+
rsync_command.append(RSYNC_FILTER_SKYIGNORE)
|
269
|
+
else:
|
270
|
+
rsync_command.append(RSYNC_FILTER_GITIGNORE)
|
271
|
+
if up:
|
272
|
+
# Build --exclude-from argument.
|
273
|
+
if (resolved_source / GIT_EXCLUDE).exists():
|
274
|
+
# Ensure file exists; otherwise, rsync will error out.
|
275
|
+
#
|
276
|
+
# We shlex.quote() because the path may contain spaces:
|
277
|
+
# 'my dir/.git/info/exclude'
|
278
|
+
# Without quoting rsync fails.
|
279
|
+
rsync_command.append(
|
280
|
+
RSYNC_EXCLUDE_OPTION.format(
|
281
|
+
shlex.quote(str(resolved_source / GIT_EXCLUDE))))
|
282
|
+
|
283
|
+
rsync_command.append(f'-e {shlex.quote(rsh_option)}')
|
284
|
+
|
285
|
+
if up:
|
286
|
+
resolved_target = target
|
287
|
+
if target.startswith('~'):
|
288
|
+
remote_home_dir = _get_remote_home_dir_with_retry()
|
289
|
+
resolved_target = target.replace('~', remote_home_dir)
|
290
|
+
full_source_str = str(resolved_source)
|
291
|
+
if resolved_source.is_dir():
|
292
|
+
full_source_str = os.path.join(full_source_str, '')
|
293
|
+
rsync_command.extend([
|
294
|
+
f'{full_source_str!r}',
|
295
|
+
f'{node_destination}:{resolved_target!r}',
|
296
|
+
])
|
297
|
+
else:
|
298
|
+
resolved_source = source
|
299
|
+
if source.startswith('~'):
|
300
|
+
remote_home_dir = _get_remote_home_dir_with_retry()
|
301
|
+
resolved_source = source.replace('~', remote_home_dir)
|
302
|
+
rsync_command.extend([
|
303
|
+
f'{node_destination}:{resolved_source!r}',
|
304
|
+
f'{os.path.expanduser(target)!r}',
|
305
|
+
])
|
306
|
+
command = ' '.join(rsync_command)
|
307
|
+
logger.debug(f'Running rsync command: {command}')
|
308
|
+
|
309
|
+
backoff = common_utils.Backoff(initial_backoff=5, max_backoff_factor=5)
|
310
|
+
assert max_retry > 0, f'max_retry {max_retry} must be positive.'
|
311
|
+
while max_retry >= 0:
|
312
|
+
returncode, stdout, stderr = log_lib.run_with_log(
|
313
|
+
command,
|
314
|
+
log_path=log_path,
|
315
|
+
stream_logs=stream_logs,
|
316
|
+
shell=True,
|
317
|
+
require_outputs=True)
|
318
|
+
if returncode == 0:
|
319
|
+
break
|
320
|
+
max_retry -= 1
|
321
|
+
time.sleep(backoff.current_backoff())
|
322
|
+
|
323
|
+
direction = 'up' if up else 'down'
|
324
|
+
error_msg = (f'Failed to rsync {direction}: {source} -> {target}. '
|
325
|
+
'Ensure that the network is stable, then retry.')
|
326
|
+
subprocess_utils.handle_returncode(returncode,
|
327
|
+
command,
|
328
|
+
error_msg,
|
329
|
+
stderr=stdout + stderr,
|
330
|
+
stream_logs=stream_logs)
|
331
|
+
|
210
332
|
@timeline.event
|
211
333
|
def run(
|
212
334
|
self,
|
@@ -222,7 +344,7 @@ class CommandRunner:
|
|
222
344
|
separate_stderr: bool = False,
|
223
345
|
connect_timeout: Optional[int] = None,
|
224
346
|
source_bashrc: bool = False,
|
225
|
-
|
347
|
+
skip_num_lines: int = 0,
|
226
348
|
**kwargs) -> Union[int, Tuple[int, str, str]]:
|
227
349
|
"""Runs the command on the cluster.
|
228
350
|
|
@@ -237,7 +359,7 @@ class CommandRunner:
|
|
237
359
|
connect_timeout: timeout in seconds for the ssh connection.
|
238
360
|
source_bashrc: Whether to source the ~/.bashrc before running the
|
239
361
|
command.
|
240
|
-
|
362
|
+
skip_num_lines: The number of lines to skip at the beginning of the
|
241
363
|
output. This is used when the output is not processed by
|
242
364
|
SkyPilot but we still want to get rid of some warning messages,
|
243
365
|
such as SSH warnings.
|
@@ -293,6 +415,22 @@ class CommandRunner:
|
|
293
415
|
returncode = self.run('true', connect_timeout=5, stream_logs=False)
|
294
416
|
return returncode == 0
|
295
417
|
|
418
|
+
def close_cached_connection(self) -> None:
|
419
|
+
"""Close the cached connection to the remote machine."""
|
420
|
+
pass
|
421
|
+
|
422
|
+
def port_forward_command(self,
|
423
|
+
port_forward: List[Tuple[int, int]],
|
424
|
+
connect_timeout: int = 1) -> List[str]:
|
425
|
+
"""Command for forwarding ports from localhost to the remote machine.
|
426
|
+
|
427
|
+
Args:
|
428
|
+
port_forward: A list of ports to forward from the localhost to the
|
429
|
+
remote host.
|
430
|
+
connect_timeout: The timeout for the connection.
|
431
|
+
"""
|
432
|
+
raise NotImplementedError
|
433
|
+
|
296
434
|
|
297
435
|
class SSHCommandRunner(CommandRunner):
|
298
436
|
"""Runner for SSH commands."""
|
@@ -340,7 +478,9 @@ class SSHCommandRunner(CommandRunner):
|
|
340
478
|
None if ssh_control_name is None else hashlib.md5(
|
341
479
|
ssh_control_name.encode()).hexdigest()[:_HASH_MAX_LENGTH])
|
342
480
|
self._ssh_proxy_command = ssh_proxy_command
|
343
|
-
self.disable_control_master =
|
481
|
+
self.disable_control_master = (
|
482
|
+
disable_control_master or
|
483
|
+
control_master_utils.should_disable_control_master())
|
344
484
|
if docker_user is not None:
|
345
485
|
assert port is None or port == 22, (
|
346
486
|
f'port must be None or 22 for docker_user, got {port}.')
|
@@ -359,9 +499,27 @@ class SSHCommandRunner(CommandRunner):
|
|
359
499
|
self.port = port
|
360
500
|
self._docker_ssh_proxy_command = None
|
361
501
|
|
362
|
-
def
|
363
|
-
|
364
|
-
|
502
|
+
def port_forward_command(self,
|
503
|
+
port_forward: List[Tuple[int, int]],
|
504
|
+
connect_timeout: int = 1) -> List[str]:
|
505
|
+
"""Command for forwarding ports from localhost to the remote machine.
|
506
|
+
|
507
|
+
Args:
|
508
|
+
port_forward: A list of ports to forward from the local port to the
|
509
|
+
remote port.
|
510
|
+
connect_timeout: The timeout for the ssh connection.
|
511
|
+
|
512
|
+
Returns:
|
513
|
+
The command for forwarding ports from localhost to the remote
|
514
|
+
machine.
|
515
|
+
"""
|
516
|
+
return self.ssh_base_command(ssh_mode=SshMode.INTERACTIVE,
|
517
|
+
port_forward=port_forward,
|
518
|
+
connect_timeout=connect_timeout)
|
519
|
+
|
520
|
+
def ssh_base_command(self, *, ssh_mode: SshMode,
|
521
|
+
port_forward: Optional[List[Tuple[int, int]]],
|
522
|
+
connect_timeout: Optional[int]) -> List[str]:
|
365
523
|
ssh = ['ssh']
|
366
524
|
if ssh_mode == SshMode.NON_INTERACTIVE:
|
367
525
|
# Disable pseudo-terminal allocation. Otherwise, the output of
|
@@ -371,11 +529,10 @@ class SSHCommandRunner(CommandRunner):
|
|
371
529
|
# Force pseudo-terminal allocation for interactive/login mode.
|
372
530
|
ssh += ['-tt']
|
373
531
|
if port_forward is not None:
|
374
|
-
for
|
375
|
-
local = remote = port
|
532
|
+
for local, remote in port_forward:
|
376
533
|
logger.info(
|
377
534
|
f'Forwarding port {local} to port {remote} on localhost.')
|
378
|
-
ssh += ['-
|
535
|
+
ssh += ['-NL', f'{remote}:localhost:{local}']
|
379
536
|
if self._docker_ssh_proxy_command is not None:
|
380
537
|
docker_ssh_proxy_command = self._docker_ssh_proxy_command(ssh)
|
381
538
|
else:
|
@@ -391,13 +548,35 @@ class SSHCommandRunner(CommandRunner):
|
|
391
548
|
f'{self.ssh_user}@{self.ip}'
|
392
549
|
]
|
393
550
|
|
551
|
+
def close_cached_connection(self) -> None:
|
552
|
+
"""Close the cached connection to the remote machine.
|
553
|
+
|
554
|
+
This is useful when we need to make the permission update effective of a
|
555
|
+
ssh user, e.g. usermod -aG docker $USER.
|
556
|
+
"""
|
557
|
+
if self.ssh_control_name is not None:
|
558
|
+
control_path = _ssh_control_path(self.ssh_control_name)
|
559
|
+
if control_path is not None:
|
560
|
+
# Suppress the `Exit request sent.` output for this command
|
561
|
+
# which would interrupt the CLI spinner.
|
562
|
+
cmd = (f'ssh -O exit -S {control_path}/%C '
|
563
|
+
f'{self.ssh_user}@{self.ip} > /dev/null 2>&1')
|
564
|
+
logger.debug(f'Closing cached connection {control_path!r} with '
|
565
|
+
f'cmd: {cmd}')
|
566
|
+
log_lib.run_with_log(cmd,
|
567
|
+
log_path=os.devnull,
|
568
|
+
require_outputs=False,
|
569
|
+
stream_logs=False,
|
570
|
+
process_stream=False,
|
571
|
+
shell=True)
|
572
|
+
|
394
573
|
@timeline.event
|
395
574
|
def run(
|
396
575
|
self,
|
397
576
|
cmd: Union[str, List[str]],
|
398
577
|
*,
|
399
578
|
require_outputs: bool = False,
|
400
|
-
port_forward: Optional[List[int]] = None,
|
579
|
+
port_forward: Optional[List[Tuple[int, int]]] = None,
|
401
580
|
# Advanced options.
|
402
581
|
log_path: str = os.devnull,
|
403
582
|
# If False, do not redirect stdout/stderr to optimize performance.
|
@@ -407,7 +586,7 @@ class SSHCommandRunner(CommandRunner):
|
|
407
586
|
separate_stderr: bool = False,
|
408
587
|
connect_timeout: Optional[int] = None,
|
409
588
|
source_bashrc: bool = False,
|
410
|
-
|
589
|
+
skip_num_lines: int = 0,
|
411
590
|
**kwargs) -> Union[int, Tuple[int, str, str]]:
|
412
591
|
"""Uses 'ssh' to run 'cmd' on a node with ip.
|
413
592
|
|
@@ -428,7 +607,7 @@ class SSHCommandRunner(CommandRunner):
|
|
428
607
|
connect_timeout: timeout in seconds for the ssh connection.
|
429
608
|
source_bashrc: Whether to source the bashrc before running the
|
430
609
|
command.
|
431
|
-
|
610
|
+
skip_num_lines: The number of lines to skip at the beginning of the
|
432
611
|
output. This is used when the output is not processed by
|
433
612
|
SkyPilot but we still want to get rid of some warning messages,
|
434
613
|
such as SSH warnings.
|
@@ -438,7 +617,7 @@ class SSHCommandRunner(CommandRunner):
|
|
438
617
|
or
|
439
618
|
A tuple of (returncode, stdout, stderr).
|
440
619
|
"""
|
441
|
-
base_ssh_command = self.
|
620
|
+
base_ssh_command = self.ssh_base_command(
|
442
621
|
ssh_mode=ssh_mode,
|
443
622
|
port_forward=port_forward,
|
444
623
|
connect_timeout=connect_timeout)
|
@@ -451,7 +630,7 @@ class SSHCommandRunner(CommandRunner):
|
|
451
630
|
command_str = self._get_command_to_run(cmd,
|
452
631
|
process_stream,
|
453
632
|
separate_stderr,
|
454
|
-
|
633
|
+
skip_num_lines=skip_num_lines,
|
455
634
|
source_bashrc=source_bashrc)
|
456
635
|
command = base_ssh_command + [shlex.quote(command_str)]
|
457
636
|
|
@@ -506,30 +685,6 @@ class SSHCommandRunner(CommandRunner):
|
|
506
685
|
Raises:
|
507
686
|
exceptions.CommandError: rsync command failed.
|
508
687
|
"""
|
509
|
-
# Build command.
|
510
|
-
# TODO(zhwu): This will print a per-file progress bar (with -P),
|
511
|
-
# shooting a lot of messages to the output. --info=progress2 is used
|
512
|
-
# to get a total progress bar, but it requires rsync>=3.1.0 and Mac
|
513
|
-
# OS has a default rsync==2.6.9 (16 years old).
|
514
|
-
rsync_command = ['rsync', RSYNC_DISPLAY_OPTION]
|
515
|
-
|
516
|
-
# --filter
|
517
|
-
rsync_command.append(RSYNC_FILTER_OPTION)
|
518
|
-
|
519
|
-
if up:
|
520
|
-
# The source is a local path, so we need to resolve it.
|
521
|
-
# --exclude-from
|
522
|
-
resolved_source = pathlib.Path(source).expanduser().resolve()
|
523
|
-
if (resolved_source / GIT_EXCLUDE).exists():
|
524
|
-
# Ensure file exists; otherwise, rsync will error out.
|
525
|
-
#
|
526
|
-
# We shlex.quote() because the path may contain spaces:
|
527
|
-
# 'my dir/.git/info/exclude'
|
528
|
-
# Without quoting rsync fails.
|
529
|
-
rsync_command.append(
|
530
|
-
RSYNC_EXCLUDE_OPTION.format(
|
531
|
-
shlex.quote(str(resolved_source / GIT_EXCLUDE))))
|
532
|
-
|
533
688
|
if self._docker_ssh_proxy_command is not None:
|
534
689
|
docker_ssh_proxy_command = self._docker_ssh_proxy_command(['ssh'])
|
535
690
|
else:
|
@@ -542,43 +697,251 @@ class SSHCommandRunner(CommandRunner):
|
|
542
697
|
docker_ssh_proxy_command=docker_ssh_proxy_command,
|
543
698
|
port=self.port,
|
544
699
|
disable_control_master=self.disable_control_master))
|
545
|
-
|
546
|
-
|
547
|
-
|
548
|
-
|
549
|
-
|
550
|
-
|
551
|
-
|
552
|
-
|
553
|
-
|
554
|
-
f'{full_source_str!r}',
|
555
|
-
f'{self.ssh_user}@{self.ip}:{target!r}',
|
556
|
-
])
|
557
|
-
else:
|
558
|
-
rsync_command.extend([
|
559
|
-
f'{self.ssh_user}@{self.ip}:{source!r}',
|
560
|
-
f'{os.path.expanduser(target)!r}',
|
561
|
-
])
|
562
|
-
command = ' '.join(rsync_command)
|
700
|
+
rsh_option = f'ssh {ssh_options}'
|
701
|
+
self._rsync(source,
|
702
|
+
target,
|
703
|
+
node_destination=f'{self.ssh_user}@{self.ip}',
|
704
|
+
up=up,
|
705
|
+
rsh_option=rsh_option,
|
706
|
+
log_path=log_path,
|
707
|
+
stream_logs=stream_logs,
|
708
|
+
max_retry=max_retry)
|
563
709
|
|
564
|
-
backoff = common_utils.Backoff(initial_backoff=5, max_backoff_factor=5)
|
565
|
-
while max_retry >= 0:
|
566
|
-
returncode, stdout, stderr = log_lib.run_with_log(
|
567
|
-
command,
|
568
|
-
log_path=log_path,
|
569
|
-
stream_logs=stream_logs,
|
570
|
-
shell=True,
|
571
|
-
require_outputs=True)
|
572
|
-
if returncode == 0:
|
573
|
-
break
|
574
|
-
max_retry -= 1
|
575
|
-
time.sleep(backoff.current_backoff())
|
576
710
|
|
577
|
-
|
578
|
-
|
579
|
-
|
580
|
-
|
581
|
-
|
582
|
-
|
583
|
-
|
584
|
-
|
711
|
+
class KubernetesCommandRunner(CommandRunner):
    """Runner that executes commands and syncs files on a Kubernetes pod.

    Commands are executed with `kubectl exec`. File sync is delegated to
    the inherited `_rsync` helper, with `kubernetes/rsync_helper.sh` passed
    as its `rsh_option`.
    """

    _MAX_RETRIES_FOR_RSYNC = 3

    def __init__(
        self,
        node: Tuple[Tuple[str, Optional[str]], str],
        **kwargs,
    ):
        """Initialize KubernetesCommandRunner.

        Example Usage:
            runner = KubernetesCommandRunner(((namespace, context), pod_name))
            runner.run('ls -l')
            runner.rsync(source, target, up=True)

        Args:
            node: A ((namespace, context), pod_name) tuple identifying the
                pod. `context` may be None, in which case kubectl is invoked
                with `--kubeconfig /dev/null` (in-cluster auth).
            **kwargs: Accepted and discarded.
        """
        del kwargs
        super().__init__(node)
        (self.namespace, self.context), self.pod_name = node

    @property
    def node_id(self) -> str:
        """Identifier of the form '<context>-<namespace>-<pod_name>'."""
        return f'{self.context}-{self.namespace}-{self.pod_name}'

    def _kubectl_connection_args(self, connect_timeout: int) -> List[str]:
        """Returns the kubectl flags shared by exec and port-forward.

        Args:
            connect_timeout: Seconds passed to `--pod-running-timeout`.
        """
        kubectl_args = [
            '--pod-running-timeout', f'{connect_timeout}s', '-n',
            self.namespace
        ]
        if self.context:
            kubectl_args += ['--context', self.context]
        elif self.context is None:
            # If context is none, it means we are using incluster auth. In
            # this case, need to set KUBECONFIG to /dev/null to avoid using
            # kubeconfig.
            kubectl_args += ['--kubeconfig', '/dev/null']
        return kubectl_args

    def port_forward_command(self,
                             port_forward: List[Tuple[Optional[int], int]],
                             connect_timeout: int = 1) -> List[str]:
        """Command for forwarding ports from localhost to the remote machine.

        Args:
            port_forward: A list of (local_port, remote_port) pairs to
                forward. Currently, only one port is supported, i.e. the
                list should have only one element. A local_port of None
                leaves the local side of the mapping empty.
            connect_timeout: Seconds passed to kubectl's
                `--pod-running-timeout`.

        Returns:
            The `kubectl ... port-forward` command as an argument list.
        """
        assert port_forward and len(port_forward) == 1, (
            'Only one port is supported for Kubernetes port-forward.')
        # Use the same context/kubeconfig selection as run() so that
        # in-cluster auth (context is None) behaves consistently.
        kubectl_args = self._kubectl_connection_args(connect_timeout)
        local_port, remote_port = port_forward[0]
        local_port_str = f'{local_port}' if local_port is not None else ''
        kubectl_cmd = [
            'kubectl',
            *kubectl_args,
            'port-forward',
            f'pod/{self.pod_name}',
            f'{local_port_str}:{remote_port}',
        ]
        return kubectl_cmd

    @timeline.event
    def run(
            self,
            cmd: Union[str, List[str]],
            *,
            port_forward: Optional[List[int]] = None,
            require_outputs: bool = False,
            # Advanced options.
            log_path: str = os.devnull,
            # If False, do not redirect stdout/stderr to optimize performance.
            process_stream: bool = True,
            stream_logs: bool = True,
            ssh_mode: SshMode = SshMode.NON_INTERACTIVE,
            separate_stderr: bool = False,
            connect_timeout: Optional[int] = None,
            source_bashrc: bool = False,
            skip_num_lines: int = 0,
            **kwargs) -> Union[int, Tuple[int, str, str]]:
        """Uses 'kubectl exec' to run 'cmd' on a pod by its name and namespace.

        Args:
            cmd: The command to run. Must be a list when ssh_mode is
                SshMode.LOGIN.
            port_forward: This should be None for k8s.

            Advanced options:

            require_outputs: Whether to return the stdout/stderr of the command.
            log_path: Redirect stdout/stderr to the log_path.
            process_stream: If False, the output is not processed; it is
                redirected to log_path by the local shell instead.
            stream_logs: Stream logs to the stdout/stderr.
            ssh_mode: The mode to use for ssh.
                See SshMode for more details.
            separate_stderr: Whether to separate stderr from stdout.
            connect_timeout: timeout in seconds for the pod connection.
                Defaults to _DEFAULT_CONNECT_TIMEOUT when None.
            source_bashrc: Whether to source the bashrc before running the
                command.
            skip_num_lines: The number of lines to skip at the beginning of the
                output. This is used when the output is not processed by
                SkyPilot but we still want to get rid of some warning messages,
                such as SSH warnings.
            **kwargs: Forwarded to log_lib.run_with_log.

        Returns:
            returncode
            or
            A tuple of (returncode, stdout, stderr). In SshMode.LOGIN the
            result is always (returncode, '', '').
        """
        # TODO(zhwu): implement port_forward for k8s.
        assert port_forward is None, ('port_forward is not supported for k8s '
                                      f'for now, but got: {port_forward}')
        if connect_timeout is None:
            connect_timeout = _DEFAULT_CONNECT_TIMEOUT
        kubectl_args = self._kubectl_connection_args(connect_timeout)
        kubectl_args += [self.pod_name]

        if ssh_mode == SshMode.LOGIN:
            assert isinstance(cmd, list), 'cmd must be a list for login mode.'
            base_cmd = ['kubectl', 'exec', '-it', *kubectl_args, '--']
            command = base_cmd + cmd
            proc = subprocess_utils.run(command, shell=False, check=False)
            return proc.returncode, '', ''

        kubectl_base_command = ['kubectl', 'exec']
        if ssh_mode == SshMode.INTERACTIVE:
            kubectl_base_command.append('-i')
        kubectl_base_command += [*kubectl_args, '--']

        command_str = self._get_command_to_run(cmd,
                                               process_stream,
                                               separate_stderr,
                                               skip_num_lines=skip_num_lines,
                                               source_bashrc=source_bashrc)
        command = kubectl_base_command + [
            # It is important to use /bin/bash -c here to make sure we quote the
            # command to be run properly. Otherwise, directly appending commands
            # after '--' will not work for some commands, such as '&&', '>' etc.
            '/bin/bash',
            '-c',
            shlex.quote(command_str)
        ]

        log_dir = os.path.expanduser(os.path.dirname(log_path))
        # A bare file name has no directory component; os.makedirs('') would
        # raise, so only create the directory when there is one.
        if log_dir:
            os.makedirs(log_dir, exist_ok=True)

        executable = None
        if not process_stream:
            # Expand '~' ourselves and quote the result so that a log path
            # containing spaces or shell metacharacters stays one argument.
            quoted_log_path = shlex.quote(os.path.expanduser(log_path))
            if stream_logs:
                command += [
                    f'| tee {quoted_log_path}',
                    # This also requires the executor to be '/bin/bash' instead
                    # of the default '/bin/sh'.
                    '; exit ${PIPESTATUS[0]}'
                ]
            else:
                command += [f'> {quoted_log_path}']
            executable = '/bin/bash'
        return log_lib.run_with_log(' '.join(command),
                                    log_path,
                                    require_outputs=require_outputs,
                                    stream_logs=stream_logs,
                                    process_stream=process_stream,
                                    shell=True,
                                    executable=executable,
                                    **kwargs)

    @timeline.event
    def rsync(
        self,
        source: str,
        target: str,
        *,
        up: bool,
        # Advanced options.
        log_path: str = os.devnull,
        stream_logs: bool = True,
        max_retry: int = _MAX_RETRIES_FOR_RSYNC,
    ) -> None:
        """Uses 'rsync' to sync 'source' to 'target'.

        Args:
            source: The source path.
            target: The target path.
            up: The direction of the sync, True for local to cluster, False
              for cluster to local.
            log_path: Redirect stdout/stderr to the log_path.
            stream_logs: Stream logs to the stdout/stderr.
            max_retry: The maximum number of retries for the rsync command.
              This value should be non-negative.

        Raises:
            exceptions.CommandError: rsync command failed.
        """

        def get_remote_home_dir() -> str:
            # Use `echo ~` to get the remote home directory, instead of pwd or
            # echo $HOME, because pwd can be `/` when the remote user is root
            # and $HOME is not always set.
            rc, remote_home_dir, stderr = self.run('echo ~',
                                                   require_outputs=True,
                                                   separate_stderr=True,
                                                   stream_logs=False)
            if rc != 0:
                raise ValueError('Failed to get remote home directory: '
                                 f'{remote_home_dir + stderr}')
            remote_home_dir = remote_home_dir.strip()
            return remote_home_dir

        # Build command.
        helper_path = os.path.join(os.path.abspath(os.path.dirname(__file__)),
                                   'kubernetes', 'rsync_helper.sh')
        namespace_context = f'{self.namespace}+{self.context}'
        # Avoid rsync interpreting @, :, /, and + in namespace_context as the
        # default delimiter for options and arguments.
        # rsync_helper.sh will parse the namespace_context by reverting the
        # encoding and pass it to kubectl exec.
        percent_encoding = str.maketrans({
            '@': '%40',
            ':': '%3A',
            '/': '%2F',
            '+': '%2B',
        })
        encoded_namespace_context = namespace_context.translate(
            percent_encoding)
        self._rsync(
            source,
            target,
            node_destination=f'{self.pod_name}@{encoded_namespace_context}',
            up=up,
            rsh_option=helper_path,
            log_path=log_path,
            stream_logs=stream_logs,
            max_retry=max_retry,
            # Quote the helper path so an install location containing spaces
            # does not split the chmod argument.
            prefix_command=f'chmod +x {shlex.quote(helper_path)} && ',
            # rsync with `kubectl` as the rsh command will cause ~/xx parsed as
            # /~/xx, so we need to replace ~ with the remote home directory. We
            # only need to do this when ~ is at the beginning of the path.
            get_remote_home_dir=get_remote_home_dir)
|