skypilot-nightly 1.0.0.dev2024053101__py3-none-any.whl → 1.0.0.dev2025022801__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (299) hide show
  1. sky/__init__.py +64 -32
  2. sky/adaptors/aws.py +23 -6
  3. sky/adaptors/azure.py +432 -15
  4. sky/adaptors/cloudflare.py +5 -5
  5. sky/adaptors/common.py +19 -9
  6. sky/adaptors/do.py +20 -0
  7. sky/adaptors/gcp.py +3 -2
  8. sky/adaptors/kubernetes.py +122 -88
  9. sky/adaptors/nebius.py +100 -0
  10. sky/adaptors/oci.py +39 -1
  11. sky/adaptors/vast.py +29 -0
  12. sky/admin_policy.py +101 -0
  13. sky/authentication.py +117 -98
  14. sky/backends/backend.py +52 -20
  15. sky/backends/backend_utils.py +669 -557
  16. sky/backends/cloud_vm_ray_backend.py +1099 -808
  17. sky/backends/local_docker_backend.py +14 -8
  18. sky/backends/wheel_utils.py +38 -20
  19. sky/benchmark/benchmark_utils.py +22 -23
  20. sky/check.py +76 -27
  21. sky/cli.py +1586 -1139
  22. sky/client/__init__.py +1 -0
  23. sky/client/cli.py +5683 -0
  24. sky/client/common.py +345 -0
  25. sky/client/sdk.py +1765 -0
  26. sky/cloud_stores.py +283 -19
  27. sky/clouds/__init__.py +7 -2
  28. sky/clouds/aws.py +303 -112
  29. sky/clouds/azure.py +185 -179
  30. sky/clouds/cloud.py +115 -37
  31. sky/clouds/cudo.py +29 -22
  32. sky/clouds/do.py +313 -0
  33. sky/clouds/fluidstack.py +44 -54
  34. sky/clouds/gcp.py +206 -65
  35. sky/clouds/ibm.py +26 -21
  36. sky/clouds/kubernetes.py +345 -91
  37. sky/clouds/lambda_cloud.py +40 -29
  38. sky/clouds/nebius.py +297 -0
  39. sky/clouds/oci.py +129 -90
  40. sky/clouds/paperspace.py +22 -18
  41. sky/clouds/runpod.py +53 -34
  42. sky/clouds/scp.py +28 -24
  43. sky/clouds/service_catalog/__init__.py +19 -13
  44. sky/clouds/service_catalog/aws_catalog.py +29 -12
  45. sky/clouds/service_catalog/azure_catalog.py +33 -6
  46. sky/clouds/service_catalog/common.py +95 -75
  47. sky/clouds/service_catalog/constants.py +3 -3
  48. sky/clouds/service_catalog/cudo_catalog.py +13 -3
  49. sky/clouds/service_catalog/data_fetchers/fetch_aws.py +36 -21
  50. sky/clouds/service_catalog/data_fetchers/fetch_azure.py +31 -4
  51. sky/clouds/service_catalog/data_fetchers/fetch_cudo.py +8 -117
  52. sky/clouds/service_catalog/data_fetchers/fetch_fluidstack.py +197 -44
  53. sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +224 -36
  54. sky/clouds/service_catalog/data_fetchers/fetch_lambda_cloud.py +44 -24
  55. sky/clouds/service_catalog/data_fetchers/fetch_vast.py +147 -0
  56. sky/clouds/service_catalog/data_fetchers/fetch_vsphere.py +1 -1
  57. sky/clouds/service_catalog/do_catalog.py +111 -0
  58. sky/clouds/service_catalog/fluidstack_catalog.py +2 -2
  59. sky/clouds/service_catalog/gcp_catalog.py +16 -2
  60. sky/clouds/service_catalog/ibm_catalog.py +2 -2
  61. sky/clouds/service_catalog/kubernetes_catalog.py +192 -70
  62. sky/clouds/service_catalog/lambda_catalog.py +8 -3
  63. sky/clouds/service_catalog/nebius_catalog.py +116 -0
  64. sky/clouds/service_catalog/oci_catalog.py +31 -4
  65. sky/clouds/service_catalog/paperspace_catalog.py +2 -2
  66. sky/clouds/service_catalog/runpod_catalog.py +2 -2
  67. sky/clouds/service_catalog/scp_catalog.py +2 -2
  68. sky/clouds/service_catalog/vast_catalog.py +104 -0
  69. sky/clouds/service_catalog/vsphere_catalog.py +2 -2
  70. sky/clouds/utils/aws_utils.py +65 -0
  71. sky/clouds/utils/azure_utils.py +91 -0
  72. sky/clouds/utils/gcp_utils.py +5 -9
  73. sky/clouds/utils/oci_utils.py +47 -5
  74. sky/clouds/utils/scp_utils.py +4 -3
  75. sky/clouds/vast.py +280 -0
  76. sky/clouds/vsphere.py +22 -18
  77. sky/core.py +361 -107
  78. sky/dag.py +41 -28
  79. sky/data/data_transfer.py +37 -0
  80. sky/data/data_utils.py +211 -32
  81. sky/data/mounting_utils.py +182 -30
  82. sky/data/storage.py +2118 -270
  83. sky/data/storage_utils.py +126 -5
  84. sky/exceptions.py +179 -8
  85. sky/execution.py +158 -85
  86. sky/global_user_state.py +150 -34
  87. sky/jobs/__init__.py +12 -10
  88. sky/jobs/client/__init__.py +0 -0
  89. sky/jobs/client/sdk.py +302 -0
  90. sky/jobs/constants.py +49 -11
  91. sky/jobs/controller.py +161 -99
  92. sky/jobs/dashboard/dashboard.py +171 -25
  93. sky/jobs/dashboard/templates/index.html +572 -60
  94. sky/jobs/recovery_strategy.py +157 -156
  95. sky/jobs/scheduler.py +307 -0
  96. sky/jobs/server/__init__.py +1 -0
  97. sky/jobs/server/core.py +598 -0
  98. sky/jobs/server/dashboard_utils.py +69 -0
  99. sky/jobs/server/server.py +190 -0
  100. sky/jobs/state.py +627 -122
  101. sky/jobs/utils.py +615 -206
  102. sky/models.py +27 -0
  103. sky/optimizer.py +142 -83
  104. sky/provision/__init__.py +20 -5
  105. sky/provision/aws/config.py +124 -42
  106. sky/provision/aws/instance.py +130 -53
  107. sky/provision/azure/__init__.py +7 -0
  108. sky/{skylet/providers → provision}/azure/azure-config-template.json +19 -7
  109. sky/provision/azure/config.py +220 -0
  110. sky/provision/azure/instance.py +1012 -37
  111. sky/provision/common.py +31 -3
  112. sky/provision/constants.py +25 -0
  113. sky/provision/cudo/__init__.py +2 -1
  114. sky/provision/cudo/cudo_utils.py +112 -0
  115. sky/provision/cudo/cudo_wrapper.py +37 -16
  116. sky/provision/cudo/instance.py +28 -12
  117. sky/provision/do/__init__.py +11 -0
  118. sky/provision/do/config.py +14 -0
  119. sky/provision/do/constants.py +10 -0
  120. sky/provision/do/instance.py +287 -0
  121. sky/provision/do/utils.py +301 -0
  122. sky/provision/docker_utils.py +82 -46
  123. sky/provision/fluidstack/fluidstack_utils.py +57 -125
  124. sky/provision/fluidstack/instance.py +15 -43
  125. sky/provision/gcp/config.py +19 -9
  126. sky/provision/gcp/constants.py +7 -1
  127. sky/provision/gcp/instance.py +55 -34
  128. sky/provision/gcp/instance_utils.py +339 -80
  129. sky/provision/gcp/mig_utils.py +210 -0
  130. sky/provision/instance_setup.py +172 -133
  131. sky/provision/kubernetes/__init__.py +1 -0
  132. sky/provision/kubernetes/config.py +104 -90
  133. sky/provision/kubernetes/constants.py +8 -0
  134. sky/provision/kubernetes/instance.py +680 -325
  135. sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml +3 -0
  136. sky/provision/kubernetes/network.py +54 -20
  137. sky/provision/kubernetes/network_utils.py +70 -21
  138. sky/provision/kubernetes/utils.py +1370 -251
  139. sky/provision/lambda_cloud/__init__.py +11 -0
  140. sky/provision/lambda_cloud/config.py +10 -0
  141. sky/provision/lambda_cloud/instance.py +265 -0
  142. sky/{clouds/utils → provision/lambda_cloud}/lambda_utils.py +24 -23
  143. sky/provision/logging.py +1 -1
  144. sky/provision/nebius/__init__.py +11 -0
  145. sky/provision/nebius/config.py +11 -0
  146. sky/provision/nebius/instance.py +285 -0
  147. sky/provision/nebius/utils.py +318 -0
  148. sky/provision/oci/__init__.py +15 -0
  149. sky/provision/oci/config.py +51 -0
  150. sky/provision/oci/instance.py +436 -0
  151. sky/provision/oci/query_utils.py +681 -0
  152. sky/provision/paperspace/constants.py +6 -0
  153. sky/provision/paperspace/instance.py +4 -3
  154. sky/provision/paperspace/utils.py +2 -0
  155. sky/provision/provisioner.py +207 -130
  156. sky/provision/runpod/__init__.py +1 -0
  157. sky/provision/runpod/api/__init__.py +3 -0
  158. sky/provision/runpod/api/commands.py +119 -0
  159. sky/provision/runpod/api/pods.py +142 -0
  160. sky/provision/runpod/instance.py +64 -8
  161. sky/provision/runpod/utils.py +239 -23
  162. sky/provision/vast/__init__.py +10 -0
  163. sky/provision/vast/config.py +11 -0
  164. sky/provision/vast/instance.py +247 -0
  165. sky/provision/vast/utils.py +162 -0
  166. sky/provision/vsphere/common/vim_utils.py +1 -1
  167. sky/provision/vsphere/instance.py +8 -18
  168. sky/provision/vsphere/vsphere_utils.py +1 -1
  169. sky/resources.py +247 -102
  170. sky/serve/__init__.py +9 -9
  171. sky/serve/autoscalers.py +361 -299
  172. sky/serve/client/__init__.py +0 -0
  173. sky/serve/client/sdk.py +366 -0
  174. sky/serve/constants.py +12 -3
  175. sky/serve/controller.py +106 -36
  176. sky/serve/load_balancer.py +63 -12
  177. sky/serve/load_balancing_policies.py +84 -2
  178. sky/serve/replica_managers.py +42 -34
  179. sky/serve/serve_state.py +62 -32
  180. sky/serve/serve_utils.py +271 -160
  181. sky/serve/server/__init__.py +0 -0
  182. sky/serve/{core.py → server/core.py} +271 -90
  183. sky/serve/server/server.py +112 -0
  184. sky/serve/service.py +52 -16
  185. sky/serve/service_spec.py +95 -32
  186. sky/server/__init__.py +1 -0
  187. sky/server/common.py +430 -0
  188. sky/server/constants.py +21 -0
  189. sky/server/html/log.html +174 -0
  190. sky/server/requests/__init__.py +0 -0
  191. sky/server/requests/executor.py +472 -0
  192. sky/server/requests/payloads.py +487 -0
  193. sky/server/requests/queues/__init__.py +0 -0
  194. sky/server/requests/queues/mp_queue.py +76 -0
  195. sky/server/requests/requests.py +567 -0
  196. sky/server/requests/serializers/__init__.py +0 -0
  197. sky/server/requests/serializers/decoders.py +192 -0
  198. sky/server/requests/serializers/encoders.py +166 -0
  199. sky/server/server.py +1106 -0
  200. sky/server/stream_utils.py +141 -0
  201. sky/setup_files/MANIFEST.in +2 -5
  202. sky/setup_files/dependencies.py +159 -0
  203. sky/setup_files/setup.py +14 -125
  204. sky/sky_logging.py +59 -14
  205. sky/skylet/autostop_lib.py +2 -2
  206. sky/skylet/constants.py +183 -50
  207. sky/skylet/events.py +22 -10
  208. sky/skylet/job_lib.py +403 -258
  209. sky/skylet/log_lib.py +111 -71
  210. sky/skylet/log_lib.pyi +6 -0
  211. sky/skylet/providers/command_runner.py +6 -8
  212. sky/skylet/providers/ibm/node_provider.py +2 -2
  213. sky/skylet/providers/scp/config.py +11 -3
  214. sky/skylet/providers/scp/node_provider.py +8 -8
  215. sky/skylet/skylet.py +3 -1
  216. sky/skylet/subprocess_daemon.py +69 -17
  217. sky/skypilot_config.py +119 -57
  218. sky/task.py +205 -64
  219. sky/templates/aws-ray.yml.j2 +37 -7
  220. sky/templates/azure-ray.yml.j2 +27 -82
  221. sky/templates/cudo-ray.yml.j2 +7 -3
  222. sky/templates/do-ray.yml.j2 +98 -0
  223. sky/templates/fluidstack-ray.yml.j2 +7 -4
  224. sky/templates/gcp-ray.yml.j2 +26 -6
  225. sky/templates/ibm-ray.yml.j2 +3 -2
  226. sky/templates/jobs-controller.yaml.j2 +46 -11
  227. sky/templates/kubernetes-ingress.yml.j2 +7 -0
  228. sky/templates/kubernetes-loadbalancer.yml.j2 +7 -0
  229. sky/templates/{kubernetes-port-forward-proxy-command.sh.j2 → kubernetes-port-forward-proxy-command.sh} +51 -7
  230. sky/templates/kubernetes-ray.yml.j2 +292 -25
  231. sky/templates/lambda-ray.yml.j2 +30 -40
  232. sky/templates/nebius-ray.yml.j2 +79 -0
  233. sky/templates/oci-ray.yml.j2 +18 -57
  234. sky/templates/paperspace-ray.yml.j2 +10 -6
  235. sky/templates/runpod-ray.yml.j2 +26 -4
  236. sky/templates/scp-ray.yml.j2 +3 -2
  237. sky/templates/sky-serve-controller.yaml.j2 +12 -1
  238. sky/templates/skypilot-server-kubernetes-proxy.sh +36 -0
  239. sky/templates/vast-ray.yml.j2 +70 -0
  240. sky/templates/vsphere-ray.yml.j2 +8 -3
  241. sky/templates/websocket_proxy.py +64 -0
  242. sky/usage/constants.py +10 -1
  243. sky/usage/usage_lib.py +130 -37
  244. sky/utils/accelerator_registry.py +35 -51
  245. sky/utils/admin_policy_utils.py +147 -0
  246. sky/utils/annotations.py +51 -0
  247. sky/utils/cli_utils/status_utils.py +81 -23
  248. sky/utils/cluster_utils.py +356 -0
  249. sky/utils/command_runner.py +452 -89
  250. sky/utils/command_runner.pyi +77 -3
  251. sky/utils/common.py +54 -0
  252. sky/utils/common_utils.py +319 -108
  253. sky/utils/config_utils.py +204 -0
  254. sky/utils/control_master_utils.py +48 -0
  255. sky/utils/controller_utils.py +548 -266
  256. sky/utils/dag_utils.py +93 -32
  257. sky/utils/db_utils.py +18 -4
  258. sky/utils/env_options.py +29 -7
  259. sky/utils/kubernetes/create_cluster.sh +8 -60
  260. sky/utils/kubernetes/deploy_remote_cluster.sh +243 -0
  261. sky/utils/kubernetes/exec_kubeconfig_converter.py +73 -0
  262. sky/utils/kubernetes/generate_kubeconfig.sh +336 -0
  263. sky/utils/kubernetes/gpu_labeler.py +4 -4
  264. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +4 -3
  265. sky/utils/kubernetes/kubernetes_deploy_utils.py +228 -0
  266. sky/utils/kubernetes/rsync_helper.sh +24 -0
  267. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +1 -1
  268. sky/utils/log_utils.py +240 -33
  269. sky/utils/message_utils.py +81 -0
  270. sky/utils/registry.py +127 -0
  271. sky/utils/resources_utils.py +94 -22
  272. sky/utils/rich_utils.py +247 -18
  273. sky/utils/schemas.py +284 -64
  274. sky/{status_lib.py → utils/status_lib.py} +12 -7
  275. sky/utils/subprocess_utils.py +212 -46
  276. sky/utils/timeline.py +12 -7
  277. sky/utils/ux_utils.py +168 -15
  278. skypilot_nightly-1.0.0.dev2025022801.dist-info/METADATA +363 -0
  279. skypilot_nightly-1.0.0.dev2025022801.dist-info/RECORD +352 -0
  280. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/WHEEL +1 -1
  281. sky/clouds/cloud_registry.py +0 -31
  282. sky/jobs/core.py +0 -330
  283. sky/skylet/providers/azure/__init__.py +0 -2
  284. sky/skylet/providers/azure/azure-vm-template.json +0 -301
  285. sky/skylet/providers/azure/config.py +0 -170
  286. sky/skylet/providers/azure/node_provider.py +0 -466
  287. sky/skylet/providers/lambda_cloud/__init__.py +0 -2
  288. sky/skylet/providers/lambda_cloud/node_provider.py +0 -320
  289. sky/skylet/providers/oci/__init__.py +0 -2
  290. sky/skylet/providers/oci/node_provider.py +0 -488
  291. sky/skylet/providers/oci/query_helper.py +0 -383
  292. sky/skylet/providers/oci/utils.py +0 -21
  293. sky/utils/cluster_yaml_utils.py +0 -24
  294. sky/utils/kubernetes/generate_static_kubeconfig.sh +0 -137
  295. skypilot_nightly-1.0.0.dev2024053101.dist-info/METADATA +0 -315
  296. skypilot_nightly-1.0.0.dev2024053101.dist-info/RECORD +0 -275
  297. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/LICENSE +0 -0
  298. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/entry_points.txt +0 -0
  299. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/top_level.txt +0 -0
@@ -5,20 +5,23 @@ import os
5
5
  import pathlib
6
6
  import shlex
7
7
  import time
8
- from typing import Any, Iterable, List, Optional, Tuple, Type, Union
8
+ from typing import Any, Callable, Iterable, List, Optional, Tuple, Type, Union
9
9
 
10
10
  from sky import sky_logging
11
11
  from sky.skylet import constants
12
12
  from sky.skylet import log_lib
13
13
  from sky.utils import common_utils
14
+ from sky.utils import control_master_utils
14
15
  from sky.utils import subprocess_utils
15
16
  from sky.utils import timeline
16
17
 
17
18
  logger = sky_logging.init_logger(__name__)
18
19
 
19
- # The git exclude file to support.
20
- GIT_EXCLUDE = '.git/info/exclude'
21
20
  # Rsync options
21
+ # TODO(zhwu): This will print a per-file progress bar (with -P),
22
+ # shooting a lot of messages to the output. --info=progress2 is used
23
+ # to get a total progress bar, but it requires rsync>=3.1.0 and Mac
24
+ # OS has a default rsync==2.6.9 (16 years old).
22
25
  RSYNC_DISPLAY_OPTION = '-Pavz'
23
26
  # Legend
24
27
  # dir-merge: ignore file can appear in any subdir, applies to that
@@ -26,10 +29,14 @@ RSYNC_DISPLAY_OPTION = '-Pavz'
26
29
  # Note that "-" is mandatory for rsync and means all patterns in the ignore
27
30
  # files are treated as *exclude* patterns. Non-exclude patterns, e.g., "!
28
31
  # do_not_exclude" doesn't work, even though git allows it.
29
- RSYNC_FILTER_OPTION = '--filter=\'dir-merge,- .gitignore\''
32
+ RSYNC_FILTER_SKYIGNORE = f'--filter=\'dir-merge,- {constants.SKY_IGNORE_FILE}\''
33
+ RSYNC_FILTER_GITIGNORE = f'--filter=\'dir-merge,- {constants.GIT_IGNORE_FILE}\''
34
+ # The git exclude file to support.
35
+ GIT_EXCLUDE = '.git/info/exclude'
30
36
  RSYNC_EXCLUDE_OPTION = '--exclude-from={}'
31
37
 
32
38
  _HASH_MAX_LENGTH = 10
39
+ _DEFAULT_CONNECT_TIMEOUT = 30
33
40
 
34
41
 
35
42
  def _ssh_control_path(ssh_control_filename: Optional[str]) -> Optional[str]:
@@ -60,9 +67,12 @@ def ssh_options_list(
60
67
  ) -> List[str]:
61
68
  """Returns a list of sane options for 'ssh'."""
62
69
  if connect_timeout is None:
63
- connect_timeout = 30
70
+ connect_timeout = _DEFAULT_CONNECT_TIMEOUT
64
71
  # Forked from Ray SSHOptions:
65
72
  # https://github.com/ray-project/ray/blob/master/python/ray/autoscaler/_private/command_runner.py
73
+ # Do not allow agent forwarding because SkyPilot API server has access to
74
+ # all user cluster private keys, which should not be all forwarded to
75
+ # individual user clusters.
66
76
  arg_dict = {
67
77
  # SSH port
68
78
  'Port': port,
@@ -75,7 +85,7 @@ def ssh_options_list(
75
85
  # that case.
76
86
  'UserKnownHostsFile': os.devnull,
77
87
  # Suppresses the warning messages, such as:
78
- # Warning: Permanently added '34.69.216.203' (ED25519) to the list of
88
+ # Warning: Permanently added 'xx.xx.xx.xx' (EDxxx) to the list of
79
89
  # known hosts.
80
90
  'LogLevel': 'ERROR',
81
91
  # Try fewer extraneous key pairs.
@@ -89,18 +99,25 @@ def ssh_options_list(
89
99
  'ServerAliveCountMax': 3,
90
100
  # ConnectTimeout.
91
101
  'ConnectTimeout': f'{connect_timeout}s',
92
- # Agent forwarding for git.
93
- 'ForwardAgent': 'yes',
94
102
  }
95
103
  # SSH Control will have a severe delay when using docker_ssh_proxy_command.
96
104
  # TODO(tian): Investigate why.
105
+ #
106
+ # We disable ControlMaster when ssh_proxy_command is used, because the
107
+ # master connection will be idle although the connection might be shared
108
+ # by other ssh commands that is not idle. In that case, user's custom proxy
109
+ # command may drop the connection due to idle timeout, since it will only
110
+ # see the idle master connection. It is an issue even with the
111
+ # ServerAliveInterval set, since the keepalive message may not be recognized
112
+ # by the custom proxy command, such as AWS SSM Session Manager.
113
+ #
97
114
  # We also do not use ControlMaster when we use `kubectl port-forward`
98
115
  # to access Kubernetes pods over SSH+Proxycommand. This is because the
99
116
  # process running ProxyCommand is kept running as long as the ssh session
100
117
  # is running and the ControlMaster keeps the session, which results in
101
118
  # 'ControlPersist' number of seconds delay per ssh command run.
102
119
  if (ssh_control_name is not None and docker_ssh_proxy_command is None and
103
- not disable_control_master):
120
+ ssh_proxy_command is None and not disable_control_master):
104
121
  arg_dict.update({
105
122
  # Control path: important optimization as we do multiple ssh in one
106
123
  # sky.launch().
@@ -161,7 +178,7 @@ class CommandRunner:
161
178
  cmd: Union[str, List[str]],
162
179
  process_stream: bool,
163
180
  separate_stderr: bool,
164
- skip_lines: int,
181
+ skip_num_lines: int,
165
182
  source_bashrc: bool = False,
166
183
  ) -> str:
167
184
  """Returns the command to run."""
@@ -170,7 +187,7 @@ class CommandRunner:
170
187
 
171
188
  # We need this to correctly run the cmd, and get the output.
172
189
  command = [
173
- 'bash',
190
+ '/bin/bash',
174
191
  '--login',
175
192
  '-c',
176
193
  ]
@@ -193,12 +210,12 @@ class CommandRunner:
193
210
  ]
194
211
  if not separate_stderr:
195
212
  command.append('2>&1')
196
- if not process_stream and skip_lines:
213
+ if not process_stream and skip_num_lines:
197
214
  command += [
198
215
  # A hack to remove the following bash warnings (twice):
199
216
  # bash: cannot set terminal process group
200
217
  # bash: no job control in this shell
201
- f'| stdbuf -o0 tail -n +{skip_lines}',
218
+ f'| stdbuf -o0 tail -n +{skip_num_lines}',
202
219
  # This is required to make sure the executor of command can get
203
220
  # correct returncode, since linux pipe is used.
204
221
  '; exit ${PIPESTATUS[0]}'
@@ -207,6 +224,111 @@ class CommandRunner:
207
224
  command_str = ' '.join(command)
208
225
  return command_str
209
226
 
227
+ def _rsync(
228
+ self,
229
+ source: str,
230
+ target: str,
231
+ node_destination: str,
232
+ up: bool,
233
+ rsh_option: str,
234
+ # Advanced options.
235
+ log_path: str = os.devnull,
236
+ stream_logs: bool = True,
237
+ max_retry: int = 1,
238
+ prefix_command: Optional[str] = None,
239
+ get_remote_home_dir: Callable[[], str] = lambda: '~') -> None:
240
+ """Builds the rsync command."""
241
+ # Build command.
242
+ rsync_command = []
243
+ if prefix_command is not None:
244
+ rsync_command.append(prefix_command)
245
+ rsync_command += ['rsync', RSYNC_DISPLAY_OPTION]
246
+
247
+ def _get_remote_home_dir_with_retry():
248
+ backoff = common_utils.Backoff(initial_backoff=1,
249
+ max_backoff_factor=5)
250
+ retries_left = max_retry
251
+ assert retries_left > 0, f'max_retry {max_retry} must be positive.'
252
+ while retries_left >= 0:
253
+ try:
254
+ return get_remote_home_dir()
255
+ except Exception: # pylint: disable=broad-except
256
+ if retries_left == 0:
257
+ raise
258
+ sleep_time = backoff.current_backoff()
259
+ logger.warning(f'Failed to get remote home dir '
260
+ f'- retrying in {sleep_time} seconds.')
261
+ retries_left -= 1
262
+ time.sleep(sleep_time)
263
+
264
+ # --filter
265
+ # The source is a local path, so we need to resolve it.
266
+ resolved_source = pathlib.Path(source).expanduser().resolve()
267
+ if (resolved_source / constants.SKY_IGNORE_FILE).exists():
268
+ rsync_command.append(RSYNC_FILTER_SKYIGNORE)
269
+ else:
270
+ rsync_command.append(RSYNC_FILTER_GITIGNORE)
271
+ if up:
272
+ # Build --exclude-from argument.
273
+ if (resolved_source / GIT_EXCLUDE).exists():
274
+ # Ensure file exists; otherwise, rsync will error out.
275
+ #
276
+ # We shlex.quote() because the path may contain spaces:
277
+ # 'my dir/.git/info/exclude'
278
+ # Without quoting rsync fails.
279
+ rsync_command.append(
280
+ RSYNC_EXCLUDE_OPTION.format(
281
+ shlex.quote(str(resolved_source / GIT_EXCLUDE))))
282
+
283
+ rsync_command.append(f'-e {shlex.quote(rsh_option)}')
284
+
285
+ if up:
286
+ resolved_target = target
287
+ if target.startswith('~'):
288
+ remote_home_dir = _get_remote_home_dir_with_retry()
289
+ resolved_target = target.replace('~', remote_home_dir)
290
+ full_source_str = str(resolved_source)
291
+ if resolved_source.is_dir():
292
+ full_source_str = os.path.join(full_source_str, '')
293
+ rsync_command.extend([
294
+ f'{full_source_str!r}',
295
+ f'{node_destination}:{resolved_target!r}',
296
+ ])
297
+ else:
298
+ resolved_source = source
299
+ if source.startswith('~'):
300
+ remote_home_dir = _get_remote_home_dir_with_retry()
301
+ resolved_source = source.replace('~', remote_home_dir)
302
+ rsync_command.extend([
303
+ f'{node_destination}:{resolved_source!r}',
304
+ f'{os.path.expanduser(target)!r}',
305
+ ])
306
+ command = ' '.join(rsync_command)
307
+ logger.debug(f'Running rsync command: {command}')
308
+
309
+ backoff = common_utils.Backoff(initial_backoff=5, max_backoff_factor=5)
310
+ assert max_retry > 0, f'max_retry {max_retry} must be positive.'
311
+ while max_retry >= 0:
312
+ returncode, stdout, stderr = log_lib.run_with_log(
313
+ command,
314
+ log_path=log_path,
315
+ stream_logs=stream_logs,
316
+ shell=True,
317
+ require_outputs=True)
318
+ if returncode == 0:
319
+ break
320
+ max_retry -= 1
321
+ time.sleep(backoff.current_backoff())
322
+
323
+ direction = 'up' if up else 'down'
324
+ error_msg = (f'Failed to rsync {direction}: {source} -> {target}. '
325
+ 'Ensure that the network is stable, then retry.')
326
+ subprocess_utils.handle_returncode(returncode,
327
+ command,
328
+ error_msg,
329
+ stderr=stdout + stderr,
330
+ stream_logs=stream_logs)
331
+
210
332
  @timeline.event
211
333
  def run(
212
334
  self,
@@ -222,7 +344,7 @@ class CommandRunner:
222
344
  separate_stderr: bool = False,
223
345
  connect_timeout: Optional[int] = None,
224
346
  source_bashrc: bool = False,
225
- skip_lines: int = 0,
347
+ skip_num_lines: int = 0,
226
348
  **kwargs) -> Union[int, Tuple[int, str, str]]:
227
349
  """Runs the command on the cluster.
228
350
 
@@ -237,7 +359,7 @@ class CommandRunner:
237
359
  connect_timeout: timeout in seconds for the ssh connection.
238
360
  source_bashrc: Whether to source the ~/.bashrc before running the
239
361
  command.
240
- skip_lines: The number of lines to skip at the beginning of the
362
+ skip_num_lines: The number of lines to skip at the beginning of the
241
363
  output. This is used when the output is not processed by
242
364
  SkyPilot but we still want to get rid of some warning messages,
243
365
  such as SSH warnings.
@@ -293,6 +415,22 @@ class CommandRunner:
293
415
  returncode = self.run('true', connect_timeout=5, stream_logs=False)
294
416
  return returncode == 0
295
417
 
418
+ def close_cached_connection(self) -> None:
419
+ """Close the cached connection to the remote machine."""
420
+ pass
421
+
422
+ def port_forward_command(self,
423
+ port_forward: List[Tuple[int, int]],
424
+ connect_timeout: int = 1) -> List[str]:
425
+ """Command for forwarding ports from localhost to the remote machine.
426
+
427
+ Args:
428
+ port_forward: A list of ports to forward from the localhost to the
429
+ remote host.
430
+ connect_timeout: The timeout for the connection.
431
+ """
432
+ raise NotImplementedError
433
+
296
434
 
297
435
  class SSHCommandRunner(CommandRunner):
298
436
  """Runner for SSH commands."""
@@ -340,7 +478,9 @@ class SSHCommandRunner(CommandRunner):
340
478
  None if ssh_control_name is None else hashlib.md5(
341
479
  ssh_control_name.encode()).hexdigest()[:_HASH_MAX_LENGTH])
342
480
  self._ssh_proxy_command = ssh_proxy_command
343
- self.disable_control_master = disable_control_master
481
+ self.disable_control_master = (
482
+ disable_control_master or
483
+ control_master_utils.should_disable_control_master())
344
484
  if docker_user is not None:
345
485
  assert port is None or port == 22, (
346
486
  f'port must be None or 22 for docker_user, got {port}.')
@@ -359,9 +499,27 @@ class SSHCommandRunner(CommandRunner):
359
499
  self.port = port
360
500
  self._docker_ssh_proxy_command = None
361
501
 
362
- def _ssh_base_command(self, *, ssh_mode: SshMode,
363
- port_forward: Optional[List[int]],
364
- connect_timeout: Optional[int]) -> List[str]:
502
+ def port_forward_command(self,
503
+ port_forward: List[Tuple[int, int]],
504
+ connect_timeout: int = 1) -> List[str]:
505
+ """Command for forwarding ports from localhost to the remote machine.
506
+
507
+ Args:
508
+ port_forward: A list of ports to forward from the local port to the
509
+ remote port.
510
+ connect_timeout: The timeout for the ssh connection.
511
+
512
+ Returns:
513
+ The command for forwarding ports from localhost to the remote
514
+ machine.
515
+ """
516
+ return self.ssh_base_command(ssh_mode=SshMode.INTERACTIVE,
517
+ port_forward=port_forward,
518
+ connect_timeout=connect_timeout)
519
+
520
+ def ssh_base_command(self, *, ssh_mode: SshMode,
521
+ port_forward: Optional[List[Tuple[int, int]]],
522
+ connect_timeout: Optional[int]) -> List[str]:
365
523
  ssh = ['ssh']
366
524
  if ssh_mode == SshMode.NON_INTERACTIVE:
367
525
  # Disable pseudo-terminal allocation. Otherwise, the output of
@@ -371,11 +529,10 @@ class SSHCommandRunner(CommandRunner):
371
529
  # Force pseudo-terminal allocation for interactive/login mode.
372
530
  ssh += ['-tt']
373
531
  if port_forward is not None:
374
- for port in port_forward:
375
- local = remote = port
532
+ for local, remote in port_forward:
376
533
  logger.info(
377
534
  f'Forwarding port {local} to port {remote} on localhost.')
378
- ssh += ['-L', f'{remote}:localhost:{local}']
535
+ ssh += ['-NL', f'{remote}:localhost:{local}']
379
536
  if self._docker_ssh_proxy_command is not None:
380
537
  docker_ssh_proxy_command = self._docker_ssh_proxy_command(ssh)
381
538
  else:
@@ -391,13 +548,35 @@ class SSHCommandRunner(CommandRunner):
391
548
  f'{self.ssh_user}@{self.ip}'
392
549
  ]
393
550
 
551
+ def close_cached_connection(self) -> None:
552
+ """Close the cached connection to the remote machine.
553
+
554
+ This is useful when a permission update for an ssh user needs to take
555
+ effect, e.g. usermod -aG docker $USER.
556
+ """
557
+ if self.ssh_control_name is not None:
558
+ control_path = _ssh_control_path(self.ssh_control_name)
559
+ if control_path is not None:
560
+ # Suppress the `Exit request sent.` output for this command
561
+ # which would interrupt the CLI spinner.
562
+ cmd = (f'ssh -O exit -S {control_path}/%C '
563
+ f'{self.ssh_user}@{self.ip} > /dev/null 2>&1')
564
+ logger.debug(f'Closing cached connection {control_path!r} with '
565
+ f'cmd: {cmd}')
566
+ log_lib.run_with_log(cmd,
567
+ log_path=os.devnull,
568
+ require_outputs=False,
569
+ stream_logs=False,
570
+ process_stream=False,
571
+ shell=True)
572
+
394
573
  @timeline.event
395
574
  def run(
396
575
  self,
397
576
  cmd: Union[str, List[str]],
398
577
  *,
399
578
  require_outputs: bool = False,
400
- port_forward: Optional[List[int]] = None,
579
+ port_forward: Optional[List[Tuple[int, int]]] = None,
401
580
  # Advanced options.
402
581
  log_path: str = os.devnull,
403
582
  # If False, do not redirect stdout/stderr to optimize performance.
@@ -407,7 +586,7 @@ class SSHCommandRunner(CommandRunner):
407
586
  separate_stderr: bool = False,
408
587
  connect_timeout: Optional[int] = None,
409
588
  source_bashrc: bool = False,
410
- skip_lines: int = 0,
589
+ skip_num_lines: int = 0,
411
590
  **kwargs) -> Union[int, Tuple[int, str, str]]:
412
591
  """Uses 'ssh' to run 'cmd' on a node with ip.
413
592
 
@@ -428,7 +607,7 @@ class SSHCommandRunner(CommandRunner):
428
607
  connect_timeout: timeout in seconds for the ssh connection.
429
608
  source_bashrc: Whether to source the bashrc before running the
430
609
  command.
431
- skip_lines: The number of lines to skip at the beginning of the
610
+ skip_num_lines: The number of lines to skip at the beginning of the
432
611
  output. This is used when the output is not processed by
433
612
  SkyPilot but we still want to get rid of some warning messages,
434
613
  such as SSH warnings.
@@ -438,7 +617,7 @@ class SSHCommandRunner(CommandRunner):
438
617
  or
439
618
  A tuple of (returncode, stdout, stderr).
440
619
  """
441
- base_ssh_command = self._ssh_base_command(
620
+ base_ssh_command = self.ssh_base_command(
442
621
  ssh_mode=ssh_mode,
443
622
  port_forward=port_forward,
444
623
  connect_timeout=connect_timeout)
@@ -451,7 +630,7 @@ class SSHCommandRunner(CommandRunner):
451
630
  command_str = self._get_command_to_run(cmd,
452
631
  process_stream,
453
632
  separate_stderr,
454
- skip_lines=skip_lines,
633
+ skip_num_lines=skip_num_lines,
455
634
  source_bashrc=source_bashrc)
456
635
  command = base_ssh_command + [shlex.quote(command_str)]
457
636
 
@@ -506,30 +685,6 @@ class SSHCommandRunner(CommandRunner):
506
685
  Raises:
507
686
  exceptions.CommandError: rsync command failed.
508
687
  """
509
- # Build command.
510
- # TODO(zhwu): This will print a per-file progress bar (with -P),
511
- # shooting a lot of messages to the output. --info=progress2 is used
512
- # to get a total progress bar, but it requires rsync>=3.1.0 and Mac
513
- # OS has a default rsync==2.6.9 (16 years old).
514
- rsync_command = ['rsync', RSYNC_DISPLAY_OPTION]
515
-
516
- # --filter
517
- rsync_command.append(RSYNC_FILTER_OPTION)
518
-
519
- if up:
520
- # The source is a local path, so we need to resolve it.
521
- # --exclude-from
522
- resolved_source = pathlib.Path(source).expanduser().resolve()
523
- if (resolved_source / GIT_EXCLUDE).exists():
524
- # Ensure file exists; otherwise, rsync will error out.
525
- #
526
- # We shlex.quote() because the path may contain spaces:
527
- # 'my dir/.git/info/exclude'
528
- # Without quoting rsync fails.
529
- rsync_command.append(
530
- RSYNC_EXCLUDE_OPTION.format(
531
- shlex.quote(str(resolved_source / GIT_EXCLUDE))))
532
-
533
688
  if self._docker_ssh_proxy_command is not None:
534
689
  docker_ssh_proxy_command = self._docker_ssh_proxy_command(['ssh'])
535
690
  else:
@@ -542,43 +697,251 @@ class SSHCommandRunner(CommandRunner):
542
697
  docker_ssh_proxy_command=docker_ssh_proxy_command,
543
698
  port=self.port,
544
699
  disable_control_master=self.disable_control_master))
545
- rsync_command.append(f'-e "ssh {ssh_options}"')
546
- # To support spaces in the path, we need to quote source and target.
547
- # rsync doesn't support '~' in a quoted local path, but it is ok to
548
- # have '~' in a quoted remote path.
549
- if up:
550
- full_source_str = str(resolved_source)
551
- if resolved_source.is_dir():
552
- full_source_str = os.path.join(full_source_str, '')
553
- rsync_command.extend([
554
- f'{full_source_str!r}',
555
- f'{self.ssh_user}@{self.ip}:{target!r}',
556
- ])
557
- else:
558
- rsync_command.extend([
559
- f'{self.ssh_user}@{self.ip}:{source!r}',
560
- f'{os.path.expanduser(target)!r}',
561
- ])
562
- command = ' '.join(rsync_command)
700
+ rsh_option = f'ssh {ssh_options}'
701
+ self._rsync(source,
702
+ target,
703
+ node_destination=f'{self.ssh_user}@{self.ip}',
704
+ up=up,
705
+ rsh_option=rsh_option,
706
+ log_path=log_path,
707
+ stream_logs=stream_logs,
708
+ max_retry=max_retry)
563
709
 
564
- backoff = common_utils.Backoff(initial_backoff=5, max_backoff_factor=5)
565
- while max_retry >= 0:
566
- returncode, stdout, stderr = log_lib.run_with_log(
567
- command,
568
- log_path=log_path,
569
- stream_logs=stream_logs,
570
- shell=True,
571
- require_outputs=True)
572
- if returncode == 0:
573
- break
574
- max_retry -= 1
575
- time.sleep(backoff.current_backoff())
576
710
 
577
- direction = 'up' if up else 'down'
578
- error_msg = (f'Failed to rsync {direction}: {source} -> {target}. '
579
- 'Ensure that the network is stable, then retry.')
580
- subprocess_utils.handle_returncode(returncode,
581
- command,
582
- error_msg,
583
- stderr=stdout + stderr,
584
- stream_logs=stream_logs)
711
+ class KubernetesCommandRunner(CommandRunner):
712
+ """Runner for Kubernetes commands."""
713
+
714
+ _MAX_RETRIES_FOR_RSYNC = 3
715
+
716
+ def __init__(
717
+ self,
718
+ node: Tuple[Tuple[str, Optional[str]], str],
719
+ **kwargs,
720
+ ):
721
+ """Initialize KubernetesCommandRunner.
722
+
723
+ Example Usage:
724
+ runner = KubernetesCommandRunner(((namespace, context), pod_name))
725
+ runner.run('ls -l')
726
+ runner.rsync(source, target, up=True)
727
+
728
+ Args:
729
+ node: A ((namespace, context), pod_name) tuple identifying the remote pod.
730
+ """
731
+ del kwargs
732
+ super().__init__(node)
733
+ (self.namespace, self.context), self.pod_name = node
734
+
735
+ @property
736
+ def node_id(self) -> str:
737
+ return f'{self.context}-{self.namespace}-{self.pod_name}'
738
+
739
+ def port_forward_command(self,
740
+ port_forward: List[Tuple[int, int]],
741
+ connect_timeout: int = 1) -> List[str]:
742
+ """Command for forwarding ports from localhost to the remote machine.
743
+
744
+ Args:
745
+ port_forward: A list of (local_port, remote_port) pairs to forward
746
+ from localhost to the remote pod. Currently, only one pair is
747
+ supported, i.e. the list should have only one element.
748
+ connect_timeout: The timeout in seconds, passed to kubectl as --pod-running-timeout.
749
+ """
750
+ assert port_forward and len(port_forward) == 1, (
751
+ 'Only one port is supported for Kubernetes port-forward.')
752
+ kubectl_args = [
753
+ '--pod-running-timeout', f'{connect_timeout}s', '-n', self.namespace
754
+ ]
755
+ if self.context:
756
+ kubectl_args += ['--context', self.context]
757
+ local_port, remote_port = port_forward[0]
758
+ local_port_str = f'{local_port}' if local_port is not None else ''
759
+ kubectl_cmd = [
760
+ 'kubectl',
761
+ *kubectl_args,
762
+ 'port-forward',
763
+ f'pod/{self.pod_name}',
764
+ f'{local_port_str}:{remote_port}',
765
+ ]
766
+ return kubectl_cmd
767
+
768
+ @timeline.event
769
+ def run(
770
+ self,
771
+ cmd: Union[str, List[str]],
772
+ *,
773
+ port_forward: Optional[List[int]] = None,
774
+ require_outputs: bool = False,
775
+ # Advanced options.
776
+ log_path: str = os.devnull,
777
+ # If False, do not redirect stdout/stderr to optimize performance.
778
+ process_stream: bool = True,
779
+ stream_logs: bool = True,
780
+ ssh_mode: SshMode = SshMode.NON_INTERACTIVE,
781
+ separate_stderr: bool = False,
782
+ connect_timeout: Optional[int] = None,
783
+ source_bashrc: bool = False,
784
+ skip_num_lines: int = 0,
785
+ **kwargs) -> Union[int, Tuple[int, str, str]]:
786
+ """Uses 'kubectl exec' to run 'cmd' on a pod by its name and namespace.
787
+
788
+ Args:
789
+ cmd: The command to run.
790
+ port_forward: This should be None for k8s.
791
+
792
+ Advanced options:
793
+
794
+ require_outputs: Whether to return the stdout/stderr of the command.
795
+ log_path: Redirect stdout/stderr to the log_path.
796
+ stream_logs: Stream logs to the stdout/stderr.
797
+ check: Check the success of the command.
798
+ ssh_mode: The mode to use for ssh.
799
+ See SshMode for more details.
800
+ separate_stderr: Whether to separate stderr from stdout.
801
+ connect_timeout: timeout in seconds for the pod connection.
802
+ source_bashrc: Whether to source the bashrc before running the
803
+ command.
804
+ skip_num_lines: The number of lines to skip at the beginning of the
805
+ output. This is used when the output is not processed by
806
+ SkyPilot but we still want to get rid of some warning messages,
807
+ such as SSH warnings.
808
+
809
+
810
+ Returns:
811
+ returncode
812
+ or
813
+ A tuple of (returncode, stdout, stderr).
814
+ """
815
+ # TODO(zhwu): implement port_forward for k8s.
816
+ assert port_forward is None, ('port_forward is not supported for k8s '
817
+ f'for now, but got: {port_forward}')
818
+ if connect_timeout is None:
819
+ connect_timeout = _DEFAULT_CONNECT_TIMEOUT
820
+ kubectl_args = [
821
+ '--pod-running-timeout', f'{connect_timeout}s', '-n', self.namespace
822
+ ]
823
+ if self.context:
824
+ kubectl_args += ['--context', self.context]
825
+ # If context is None, it means we are using in-cluster auth. In this
826
+ # case, pass --kubeconfig /dev/null so that kubectl does not use a kubeconfig.
827
+ if self.context is None:
828
+ kubectl_args += ['--kubeconfig', '/dev/null']
829
+ kubectl_args += [self.pod_name]
830
+ if ssh_mode == SshMode.LOGIN:
831
+ assert isinstance(cmd, list), 'cmd must be a list for login mode.'
832
+ base_cmd = ['kubectl', 'exec', '-it', *kubectl_args, '--']
833
+ command = base_cmd + cmd
834
+ proc = subprocess_utils.run(command, shell=False, check=False)
835
+ return proc.returncode, '', ''
836
+
837
+ kubectl_base_command = ['kubectl', 'exec']
838
+
839
+ if ssh_mode == SshMode.INTERACTIVE:
840
+ kubectl_base_command.append('-i')
841
+ kubectl_base_command += [*kubectl_args, '--']
842
+
843
+ command_str = self._get_command_to_run(cmd,
844
+ process_stream,
845
+ separate_stderr,
846
+ skip_num_lines=skip_num_lines,
847
+ source_bashrc=source_bashrc)
848
+ command = kubectl_base_command + [
849
+ # It is important to use /bin/bash -c here to make sure we quote the
850
+ # command to be run properly. Otherwise, directly appending commands
851
+ # after '--' will not work for some commands, such as '&&', '>' etc.
852
+ '/bin/bash',
853
+ '-c',
854
+ shlex.quote(command_str)
855
+ ]
856
+
857
+ log_dir = os.path.expanduser(os.path.dirname(log_path))
858
+ os.makedirs(log_dir, exist_ok=True)
859
+
860
+ executable = None
861
+ if not process_stream:
862
+ if stream_logs:
863
+ command += [
864
+ f'| tee {log_path}',
865
+ # This also requires the executor to be '/bin/bash' instead
866
+ # of the default '/bin/sh'.
867
+ '; exit ${PIPESTATUS[0]}'
868
+ ]
869
+ else:
870
+ command += [f'> {log_path}']
871
+ executable = '/bin/bash'
872
+ return log_lib.run_with_log(' '.join(command),
873
+ log_path,
874
+ require_outputs=require_outputs,
875
+ stream_logs=stream_logs,
876
+ process_stream=process_stream,
877
+ shell=True,
878
+ executable=executable,
879
+ **kwargs)
880
+
881
+ @timeline.event
882
+ def rsync(
883
+ self,
884
+ source: str,
885
+ target: str,
886
+ *,
887
+ up: bool,
888
+ # Advanced options.
889
+ log_path: str = os.devnull,
890
+ stream_logs: bool = True,
891
+ max_retry: int = _MAX_RETRIES_FOR_RSYNC,
892
+ ) -> None:
893
+ """Uses 'rsync' to sync 'source' to 'target'.
894
+
895
+ Args:
896
+ source: The source path.
897
+ target: The target path.
898
+ up: The direction of the sync, True for local to cluster, False
899
+ for cluster to local.
900
+ log_path: Redirect stdout/stderr to the log_path.
901
+ stream_logs: Stream logs to the stdout/stderr.
902
+ max_retry: The maximum number of retries for the rsync command.
903
+ This value should be non-negative.
904
+
905
+ Raises:
906
+ exceptions.CommandError: rsync command failed.
907
+ """
908
+
909
+ def get_remote_home_dir() -> str:
910
+ # Use `echo ~` to get the remote home directory, instead of pwd or
911
+ # echo $HOME, because pwd can be `/` when the remote user is root
912
+ # and $HOME is not always set.
913
+ rc, remote_home_dir, stderr = self.run('echo ~',
914
+ require_outputs=True,
915
+ separate_stderr=True,
916
+ stream_logs=False)
917
+ if rc != 0:
918
+ raise ValueError('Failed to get remote home directory: '
919
+ f'{remote_home_dir + stderr}')
920
+ remote_home_dir = remote_home_dir.strip()
921
+ return remote_home_dir
922
+
923
+ # Build command.
924
+ helper_path = os.path.join(os.path.abspath(os.path.dirname(__file__)),
925
+ 'kubernetes', 'rsync_helper.sh')
926
+ namespace_context = f'{self.namespace}+{self.context}'
927
+ # Avoid rsync interpreting @, :, /, and + in namespace_context as the
928
+ # default delimiter for options and arguments.
929
+ # rsync_helper.sh will parse the namespace_context by reverting the
930
+ # encoding and pass it to kubectl exec.
931
+ encoded_namespace_context = (namespace_context.replace(
932
+ '@', '%40').replace(':', '%3A').replace('/',
933
+ '%2F').replace('+', '%2B'))
934
+ self._rsync(
935
+ source,
936
+ target,
937
+ node_destination=f'{self.pod_name}@{encoded_namespace_context}',
938
+ up=up,
939
+ rsh_option=helper_path,
940
+ log_path=log_path,
941
+ stream_logs=stream_logs,
942
+ max_retry=max_retry,
943
+ prefix_command=f'chmod +x {helper_path} && ',
944
+ # rsync with `kubectl` as the rsh command will cause ~/xx parsed as
945
+ # /~/xx, so we need to replace ~ with the remote home directory. We
946
+ # only need to do this when ~ is at the beginning of the path.
947
+ get_remote_home_dir=get_remote_home_dir)