skypilot-nightly 1.0.0.dev20251210__py3-none-any.whl → 1.0.0.dev20260112__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +4 -2
- sky/adaptors/slurm.py +159 -72
- sky/backends/backend_utils.py +52 -10
- sky/backends/cloud_vm_ray_backend.py +192 -32
- sky/backends/task_codegen.py +40 -2
- sky/catalog/data_fetchers/fetch_gcp.py +9 -1
- sky/catalog/data_fetchers/fetch_nebius.py +1 -1
- sky/catalog/data_fetchers/fetch_vast.py +4 -2
- sky/catalog/seeweb_catalog.py +30 -15
- sky/catalog/shadeform_catalog.py +5 -2
- sky/catalog/slurm_catalog.py +0 -7
- sky/catalog/vast_catalog.py +30 -6
- sky/check.py +11 -8
- sky/client/cli/command.py +106 -54
- sky/client/interactive_utils.py +190 -0
- sky/client/sdk.py +8 -0
- sky/client/sdk_async.py +9 -0
- sky/clouds/aws.py +60 -2
- sky/clouds/azure.py +2 -0
- sky/clouds/kubernetes.py +2 -0
- sky/clouds/runpod.py +38 -7
- sky/clouds/slurm.py +44 -12
- sky/clouds/ssh.py +1 -1
- sky/clouds/vast.py +30 -17
- sky/core.py +69 -1
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/3nu-b8raeKRNABZ2d4GAG/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1871-0565f8975a7dcd10.js +6 -0
- sky/dashboard/out/_next/static/chunks/2109-55a1546d793574a7.js +11 -0
- sky/dashboard/out/_next/static/chunks/2521-099b07cd9e4745bf.js +26 -0
- sky/dashboard/out/_next/static/chunks/2755.a636e04a928a700e.js +31 -0
- sky/dashboard/out/_next/static/chunks/3495.05eab4862217c1a5.js +6 -0
- sky/dashboard/out/_next/static/chunks/3785.cfc5dcc9434fd98c.js +1 -0
- sky/dashboard/out/_next/static/chunks/3981.645d01bf9c8cad0c.js +21 -0
- sky/dashboard/out/_next/static/chunks/4083-0115d67c1fb57d6c.js +21 -0
- sky/dashboard/out/_next/static/chunks/{8640.5b9475a2d18c5416.js → 429.a58e9ba9742309ed.js} +2 -2
- sky/dashboard/out/_next/static/chunks/4555.8e221537181b5dc1.js +6 -0
- sky/dashboard/out/_next/static/chunks/4725.937865b81fdaaebb.js +6 -0
- sky/dashboard/out/_next/static/chunks/6082-edabd8f6092300ce.js +25 -0
- sky/dashboard/out/_next/static/chunks/6989-49cb7dca83a7a62d.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-630bd2a2257275f8.js +1 -0
- sky/dashboard/out/_next/static/chunks/7248-a99800d4db8edabd.js +1 -0
- sky/dashboard/out/_next/static/chunks/754-cfc5d4ad1b843d29.js +18 -0
- sky/dashboard/out/_next/static/chunks/8050-dd8aa107b17dce00.js +16 -0
- sky/dashboard/out/_next/static/chunks/8056-d4ae1e0cb81e7368.js +1 -0
- sky/dashboard/out/_next/static/chunks/8555.011023e296c127b3.js +6 -0
- sky/dashboard/out/_next/static/chunks/8821-93c25df904a8362b.js +1 -0
- sky/dashboard/out/_next/static/chunks/8969-0662594b69432ade.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.f15c91c97d124a5f.js +6 -0
- sky/dashboard/out/_next/static/chunks/{9353-8369df1cf105221c.js → 9353-7ad6bd01858556f1.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/_app-5a86569acad99764.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-8297476714acb4ac.js +6 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-337c3ba1085f1210.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/{clusters-9e5d47818b9bdadd.js → clusters-57632ff3684a8b5c.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-5fd3a453c079c2ea.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-9f85c02c9c6cae9e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-90f16972cbecf354.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-2dd42fc37aad427a.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-ed806aeace26b972.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-bec34706b36f3524.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/{volumes-ef19d49c6d0e8500.js → volumes-a83ba9b38dff7ea9.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-96e0f298308da7e2.js → [name]-c781e9c3e52ef9fc.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-91e0942f47310aae.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-cfe59cf684ee13b9.js +1 -0
- sky/dashboard/out/_next/static/css/b0dbca28f027cc19.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/plugins/[...slug].html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/data_utils.py +26 -12
- sky/data/mounting_utils.py +29 -4
- sky/global_user_state.py +108 -16
- sky/jobs/client/sdk.py +8 -3
- sky/jobs/controller.py +191 -31
- sky/jobs/recovery_strategy.py +109 -11
- sky/jobs/server/core.py +81 -4
- sky/jobs/server/server.py +14 -0
- sky/jobs/state.py +417 -19
- sky/jobs/utils.py +73 -80
- sky/models.py +9 -0
- sky/optimizer.py +2 -1
- sky/provision/__init__.py +11 -9
- sky/provision/kubernetes/utils.py +122 -15
- sky/provision/kubernetes/volume.py +52 -17
- sky/provision/provisioner.py +2 -1
- sky/provision/runpod/instance.py +3 -1
- sky/provision/runpod/utils.py +13 -1
- sky/provision/runpod/volume.py +25 -9
- sky/provision/slurm/instance.py +75 -29
- sky/provision/slurm/utils.py +213 -107
- sky/provision/vast/utils.py +1 -0
- sky/resources.py +135 -13
- sky/schemas/api/responses.py +4 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +1 -1
- sky/schemas/db/spot_jobs/008_add_full_resources.py +34 -0
- sky/schemas/db/spot_jobs/009_job_events.py +32 -0
- sky/schemas/db/spot_jobs/010_job_events_timestamp_with_timezone.py +43 -0
- sky/schemas/db/spot_jobs/011_add_links.py +34 -0
- sky/schemas/generated/jobsv1_pb2.py +9 -5
- sky/schemas/generated/jobsv1_pb2.pyi +12 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +44 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +32 -28
- sky/schemas/generated/managed_jobsv1_pb2.pyi +11 -2
- sky/serve/serve_utils.py +232 -40
- sky/server/common.py +17 -0
- sky/server/constants.py +1 -1
- sky/server/metrics.py +6 -3
- sky/server/plugins.py +16 -0
- sky/server/requests/payloads.py +18 -0
- sky/server/requests/request_names.py +2 -0
- sky/server/requests/requests.py +28 -10
- sky/server/requests/serializers/encoders.py +5 -0
- sky/server/requests/serializers/return_value_serializers.py +14 -4
- sky/server/server.py +434 -107
- sky/server/uvicorn.py +5 -0
- sky/setup_files/MANIFEST.in +1 -0
- sky/setup_files/dependencies.py +21 -10
- sky/sky_logging.py +2 -1
- sky/skylet/constants.py +22 -5
- sky/skylet/executor/slurm.py +4 -6
- sky/skylet/job_lib.py +89 -4
- sky/skylet/services.py +18 -3
- sky/ssh_node_pools/deploy/tunnel/cleanup-tunnel.sh +62 -0
- sky/ssh_node_pools/deploy/tunnel/ssh-tunnel.sh +379 -0
- sky/templates/kubernetes-ray.yml.j2 +4 -6
- sky/templates/slurm-ray.yml.j2 +32 -2
- sky/templates/websocket_proxy.py +18 -41
- sky/users/permission.py +61 -51
- sky/utils/auth_utils.py +42 -0
- sky/utils/cli_utils/status_utils.py +19 -5
- sky/utils/cluster_utils.py +10 -3
- sky/utils/command_runner.py +256 -94
- sky/utils/command_runner.pyi +16 -0
- sky/utils/common_utils.py +30 -29
- sky/utils/context.py +32 -0
- sky/utils/db/db_utils.py +36 -6
- sky/utils/db/migration_utils.py +41 -21
- sky/utils/infra_utils.py +5 -1
- sky/utils/instance_links.py +139 -0
- sky/utils/interactive_utils.py +49 -0
- sky/utils/kubernetes/generate_kubeconfig.sh +42 -33
- sky/utils/kubernetes/rsync_helper.sh +5 -1
- sky/utils/plugin_extensions/__init__.py +14 -0
- sky/utils/plugin_extensions/external_failure_source.py +176 -0
- sky/utils/resources_utils.py +10 -8
- sky/utils/rich_utils.py +9 -11
- sky/utils/schemas.py +63 -20
- sky/utils/status_lib.py +7 -0
- sky/utils/subprocess_utils.py +17 -0
- sky/volumes/client/sdk.py +6 -3
- sky/volumes/server/core.py +65 -27
- sky_templates/ray/start_cluster +8 -4
- {skypilot_nightly-1.0.0.dev20251210.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/METADATA +53 -57
- {skypilot_nightly-1.0.0.dev20251210.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/RECORD +172 -162
- sky/dashboard/out/_next/static/KYAhEFa3FTfq4JyKVgo-s/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/1141-9c810f01ff4f398a.js +0 -11
- sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +0 -6
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +0 -1
- sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +0 -1
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +0 -15
- sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +0 -26
- sky/dashboard/out/_next/static/chunks/3294.ddda8c6c6f9f24dc.js +0 -1
- sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +0 -1
- sky/dashboard/out/_next/static/chunks/3800-b589397dc09c5b4e.js +0 -1
- sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +0 -1
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +0 -15
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +0 -13
- sky/dashboard/out/_next/static/chunks/6856-da20c5fd999f319c.js +0 -1
- sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +0 -1
- sky/dashboard/out/_next/static/chunks/6990-09cbf02d3cd518c3.js +0 -1
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +0 -30
- sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +0 -41
- sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +0 -1
- sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +0 -1
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +0 -6
- sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +0 -31
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +0 -30
- sky/dashboard/out/_next/static/chunks/pages/_app-68b647e26f9d2793.js +0 -34
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-33f525539665fdfd.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a7565f586ef86467.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-12c559ec4d81fdbd.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/infra-d187cd0413d72475.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-895847b6cf200b04.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-8d0f4655400b4eb9.js +0 -21
- sky/dashboard/out/_next/static/chunks/pages/jobs-e5a98f17f8513a96.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/users-2f7646eb77785a2c.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-cb4da3abe08ebf19.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-fba3de387ff6bb08.js +0 -1
- sky/dashboard/out/_next/static/css/c5a4cfd2600fc715.css +0 -3
- /sky/dashboard/out/_next/static/{KYAhEFa3FTfq4JyKVgo-s → 3nu-b8raeKRNABZ2d4GAG}/_ssgManifest.js +0 -0
- /sky/dashboard/out/_next/static/chunks/pages/plugins/{[...slug]-4f46050ca065d8f8.js → [...slug]-449a9f5a3bb20fb3.js} +0 -0
- {skypilot_nightly-1.0.0.dev20251210.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20251210.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20251210.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20251210.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/top_level.txt +0 -0
sky/utils/command_runner.py
CHANGED
|
@@ -1,14 +1,23 @@
|
|
|
1
1
|
"""Runner for commands to be executed on the cluster."""
|
|
2
2
|
import enum
|
|
3
|
+
import fcntl
|
|
3
4
|
import hashlib
|
|
4
5
|
import os
|
|
5
6
|
import pathlib
|
|
7
|
+
import pty
|
|
6
8
|
import re
|
|
7
9
|
import shlex
|
|
10
|
+
import signal
|
|
11
|
+
import socket
|
|
8
12
|
import sys
|
|
13
|
+
import termios
|
|
14
|
+
import threading
|
|
9
15
|
import time
|
|
10
16
|
from typing import (Any, Callable, Dict, Iterable, List, Optional, Tuple, Type,
|
|
11
17
|
Union)
|
|
18
|
+
import uuid
|
|
19
|
+
|
|
20
|
+
import colorama
|
|
12
21
|
|
|
13
22
|
from sky import exceptions
|
|
14
23
|
from sky import sky_logging
|
|
@@ -19,6 +28,7 @@ from sky.utils import common_utils
|
|
|
19
28
|
from sky.utils import context_utils
|
|
20
29
|
from sky.utils import control_master_utils
|
|
21
30
|
from sky.utils import git as git_utils
|
|
31
|
+
from sky.utils import interactive_utils
|
|
22
32
|
from sky.utils import subprocess_utils
|
|
23
33
|
from sky.utils import timeline
|
|
24
34
|
|
|
@@ -90,10 +100,12 @@ def ssh_options_list(
|
|
|
90
100
|
ssh_control_name: Optional[str],
|
|
91
101
|
*,
|
|
92
102
|
ssh_proxy_command: Optional[str] = None,
|
|
103
|
+
ssh_proxy_jump: Optional[str] = None,
|
|
93
104
|
docker_ssh_proxy_command: Optional[str] = None,
|
|
94
105
|
connect_timeout: Optional[int] = None,
|
|
95
106
|
port: int = 22,
|
|
96
107
|
disable_control_master: Optional[bool] = False,
|
|
108
|
+
escape_percent_expand: bool = False,
|
|
97
109
|
) -> List[str]:
|
|
98
110
|
"""Returns a list of sane options for 'ssh'."""
|
|
99
111
|
if connect_timeout is None:
|
|
@@ -133,11 +145,11 @@ def ssh_options_list(
|
|
|
133
145
|
# SSH Control will have a severe delay when using docker_ssh_proxy_command.
|
|
134
146
|
# TODO(tian): Investigate why.
|
|
135
147
|
#
|
|
136
|
-
# We disable ControlMaster when ssh_proxy_command is used,
|
|
137
|
-
# master connection will be idle although the connection might
|
|
138
|
-
# by other ssh commands that is not idle. In that case, user's
|
|
139
|
-
# command may drop the connection due to idle timeout, since it
|
|
140
|
-
# see the idle master connection. It is an issue even with the
|
|
148
|
+
# We disable ControlMaster when ssh_proxy_command is used,
|
|
149
|
+
# because the master connection will be idle although the connection might
|
|
150
|
+
# be shared by other ssh commands that are not idle. In that case, user's
|
|
151
|
+
# custom proxy command may drop the connection due to idle timeout, since it
|
|
152
|
+
# will only see the idle master connection. It is an issue even with the
|
|
141
153
|
# ServerAliveInterval set, since the keepalive message may not be recognized
|
|
142
154
|
# by the custom proxy command, such as AWS SSM Session Manager.
|
|
143
155
|
#
|
|
@@ -148,11 +160,14 @@ def ssh_options_list(
|
|
|
148
160
|
# 'ControlPersist' number of seconds delay per ssh commands ran.
|
|
149
161
|
if (ssh_control_name is not None and docker_ssh_proxy_command is None and
|
|
150
162
|
ssh_proxy_command is None and not disable_control_master):
|
|
163
|
+
control_path = f'{_ssh_control_path(ssh_control_name)}/%C'
|
|
164
|
+
if escape_percent_expand:
|
|
165
|
+
control_path = control_path.replace('%', '%%')
|
|
151
166
|
arg_dict.update({
|
|
152
167
|
# Control path: important optimization as we do multiple ssh in one
|
|
153
168
|
# sky.launch().
|
|
154
169
|
'ControlMaster': 'auto',
|
|
155
|
-
'ControlPath':
|
|
170
|
+
'ControlPath': control_path,
|
|
156
171
|
'ControlPersist': '300s',
|
|
157
172
|
})
|
|
158
173
|
ssh_key_option = [
|
|
@@ -174,6 +189,15 @@ def ssh_options_list(
|
|
|
174
189
|
'ProxyCommand': shlex.quote(ssh_proxy_command),
|
|
175
190
|
})
|
|
176
191
|
|
|
192
|
+
if ssh_proxy_jump is not None:
|
|
193
|
+
logger.debug(f'--- ProxyJump: {ssh_proxy_jump} ---')
|
|
194
|
+
if ssh_proxy_command is not None:
|
|
195
|
+
logger.warning('Both ProxyCommand and ProxyJump are specified. '
|
|
196
|
+
'ProxyCommand will take precedence.')
|
|
197
|
+
arg_dict.update({
|
|
198
|
+
'ProxyJump': shlex.quote(ssh_proxy_jump),
|
|
199
|
+
})
|
|
200
|
+
|
|
177
201
|
return ssh_key_option + [
|
|
178
202
|
x for y in (['-o', f'{k}={v}']
|
|
179
203
|
for k, v in arg_dict.items()
|
|
@@ -233,6 +257,7 @@ class CommandRunner:
|
|
|
233
257
|
skip_num_lines: int,
|
|
234
258
|
source_bashrc: bool = False,
|
|
235
259
|
use_login: bool = True,
|
|
260
|
+
run_in_background: bool = False,
|
|
236
261
|
) -> str:
|
|
237
262
|
"""Returns the command to run."""
|
|
238
263
|
if isinstance(cmd, list):
|
|
@@ -263,7 +288,11 @@ class CommandRunner:
|
|
|
263
288
|
]
|
|
264
289
|
if not separate_stderr:
|
|
265
290
|
command.append('2>&1')
|
|
291
|
+
if run_in_background:
|
|
292
|
+
command = ['nohup'] + command + ['&']
|
|
266
293
|
if not process_stream and skip_num_lines:
|
|
294
|
+
assert not run_in_background, (
|
|
295
|
+
'run_in_background and skip_num_lines cannot be used together')
|
|
267
296
|
command += [
|
|
268
297
|
# A hack to remove the following bash warnings (twice):
|
|
269
298
|
# bash: cannot set terminal process group
|
|
@@ -424,6 +453,7 @@ class CommandRunner:
|
|
|
424
453
|
connect_timeout: Optional[int] = None,
|
|
425
454
|
source_bashrc: bool = False,
|
|
426
455
|
skip_num_lines: int = 0,
|
|
456
|
+
run_in_background: bool = False,
|
|
427
457
|
**kwargs) -> Union[int, Tuple[int, str, str]]:
|
|
428
458
|
"""Runs the command on the cluster.
|
|
429
459
|
|
|
@@ -442,6 +472,7 @@ class CommandRunner:
|
|
|
442
472
|
output. This is used when the output is not processed by
|
|
443
473
|
SkyPilot but we still want to get rid of some warning messages,
|
|
444
474
|
such as SSH warnings.
|
|
475
|
+
run_in_background: Whether to run the command in the background.
|
|
445
476
|
|
|
446
477
|
Returns:
|
|
447
478
|
returncode
|
|
@@ -622,9 +653,11 @@ class SSHCommandRunner(CommandRunner):
|
|
|
622
653
|
ssh_private_key: Optional[str],
|
|
623
654
|
ssh_control_name: Optional[str] = '__default__',
|
|
624
655
|
ssh_proxy_command: Optional[str] = None,
|
|
656
|
+
ssh_proxy_jump: Optional[str] = None,
|
|
625
657
|
docker_user: Optional[str] = None,
|
|
626
658
|
disable_control_master: Optional[bool] = False,
|
|
627
659
|
port_forward_execute_remote_command: Optional[bool] = False,
|
|
660
|
+
enable_interactive_auth: bool = False,
|
|
628
661
|
):
|
|
629
662
|
"""Initialize SSHCommandRunner.
|
|
630
663
|
|
|
@@ -644,6 +677,8 @@ class SSHCommandRunner(CommandRunner):
|
|
|
644
677
|
ssh_proxy_command: Optional, the value to pass to '-o
|
|
645
678
|
ProxyCommand'. Useful for communicating with clusters without
|
|
646
679
|
public IPs using a "jump server".
|
|
680
|
+
ssh_proxy_jump: Optional, the value to pass to '-o ProxyJump' flag.
|
|
681
|
+
Similar to ssh_proxy_command, but more modern.
|
|
647
682
|
port: The port to use for ssh.
|
|
648
683
|
docker_user: The docker user to use for ssh. If specified, the
|
|
649
684
|
command will be run inside a docker container which has an ssh
|
|
@@ -663,6 +698,7 @@ class SSHCommandRunner(CommandRunner):
|
|
|
663
698
|
None if ssh_control_name is None else hashlib.md5(
|
|
664
699
|
ssh_control_name.encode()).hexdigest()[:_HASH_MAX_LENGTH])
|
|
665
700
|
self._ssh_proxy_command = ssh_proxy_command
|
|
701
|
+
self._ssh_proxy_jump = ssh_proxy_jump
|
|
666
702
|
self.disable_control_master = (
|
|
667
703
|
disable_control_master or
|
|
668
704
|
control_master_utils.should_disable_control_master())
|
|
@@ -712,6 +748,7 @@ class SSHCommandRunner(CommandRunner):
|
|
|
712
748
|
self._docker_ssh_proxy_command = None
|
|
713
749
|
self.port_forward_execute_remote_command = (
|
|
714
750
|
port_forward_execute_remote_command)
|
|
751
|
+
self.enable_interactive_auth = enable_interactive_auth
|
|
715
752
|
|
|
716
753
|
def port_forward_command(
|
|
717
754
|
self,
|
|
@@ -763,6 +800,7 @@ class SSHCommandRunner(CommandRunner):
|
|
|
763
800
|
self.ssh_private_key,
|
|
764
801
|
self.ssh_control_name,
|
|
765
802
|
ssh_proxy_command=self._ssh_proxy_command,
|
|
803
|
+
ssh_proxy_jump=self._ssh_proxy_jump,
|
|
766
804
|
docker_ssh_proxy_command=docker_ssh_proxy_command,
|
|
767
805
|
port=self.port,
|
|
768
806
|
connect_timeout=connect_timeout,
|
|
@@ -770,6 +808,127 @@ class SSHCommandRunner(CommandRunner):
|
|
|
770
808
|
f'{self.ssh_user}@{self.ip}'
|
|
771
809
|
]
|
|
772
810
|
|
|
811
|
+
def _retry_with_interactive_auth(
|
|
812
|
+
self, session_id: str, command: List[str], log_path: str,
|
|
813
|
+
require_outputs: bool, process_stream: bool, stream_logs: bool,
|
|
814
|
+
executable: str,
|
|
815
|
+
**kwargs) -> Union[int, Tuple[int, str, str], Tuple[int, int]]:
|
|
816
|
+
"""Retries command with interactive auth.
|
|
817
|
+
|
|
818
|
+
This handles SSH connections requiring keyboard-interactive
|
|
819
|
+
authentication (e.g., 2FA) by using a PTY for auth prompts and
|
|
820
|
+
establishing a persistent ControlMaster socket (if enabled) that
|
|
821
|
+
other SSH sessions can reuse without re-authenticating.
|
|
822
|
+
|
|
823
|
+
The PTY is bridged to a websocket connection that allows the client
|
|
824
|
+
to handle interactive authentication. Command output flows through
|
|
825
|
+
normal stdout/stderr pipes, which gets printed to log_path.
|
|
826
|
+
|
|
827
|
+
See ssh_options_list for when ControlMaster is not enabled.
|
|
828
|
+
"""
|
|
829
|
+
extra_options = [
|
|
830
|
+
# Override ControlPersist to reduce frequency of manual user
|
|
831
|
+
# intervention. The default from ssh_options_list is only 5m.
|
|
832
|
+
#
|
|
833
|
+
# NOTE: When used with ProxyJump, the connection can die
|
|
834
|
+
# earlier than expected, so it is recommended to also enable
|
|
835
|
+
# ControlMaster on the jump host's SSH config. It is hard to
|
|
836
|
+
# tell why exactly, because enabling -v makes this problem
|
|
837
|
+
# disappear for some reason.
|
|
838
|
+
'-o',
|
|
839
|
+
'ControlPersist=1d',
|
|
840
|
+
]
|
|
841
|
+
if self._ssh_proxy_jump is not None:
|
|
842
|
+
logger.warning(f'{colorama.Fore.YELLOW}When using ProxyJump, it is '
|
|
843
|
+
'recommended to also enable ControlMaster on the '
|
|
844
|
+
'jump host\'s SSH config to keep the authenticated '
|
|
845
|
+
f'connection alive for longer.{colorama.Fore.RESET}')
|
|
846
|
+
command = command[:1] + extra_options + command[1:]
|
|
847
|
+
|
|
848
|
+
# Create PTY for SSH. PTY slave for stdin from user, PTY master
|
|
849
|
+
# for password/auth prompts from SSH.
|
|
850
|
+
pty_m_fd, pty_s_fd = pty.openpty()
|
|
851
|
+
|
|
852
|
+
# Create Unix socket to pass PTY master fd to websocket handler
|
|
853
|
+
fd_socket_path = interactive_utils.get_pty_socket_path(session_id)
|
|
854
|
+
if os.path.exists(fd_socket_path):
|
|
855
|
+
os.unlink(fd_socket_path)
|
|
856
|
+
fd_server = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
|
|
857
|
+
fd_server.bind(fd_socket_path)
|
|
858
|
+
fd_server.listen(1)
|
|
859
|
+
fd_server.settimeout(60)
|
|
860
|
+
|
|
861
|
+
# Signal client to initiate websocket for interactive auth
|
|
862
|
+
interactive_signal = f'<sky-interactive session="{session_id}"/>'
|
|
863
|
+
print(interactive_signal, flush=True)
|
|
864
|
+
|
|
865
|
+
def handle_unix_socket_connection():
|
|
866
|
+
"""Background thread to handle Unix socket connection."""
|
|
867
|
+
conn = None
|
|
868
|
+
try:
|
|
869
|
+
# Wait for websocket handler to connect.
|
|
870
|
+
conn, _ = fd_server.accept()
|
|
871
|
+
# Send PTY master fd through Unix socket.
|
|
872
|
+
interactive_utils.send_fd(conn, pty_m_fd)
|
|
873
|
+
# We don't need to block here to wait for the websocket
|
|
874
|
+
# handler, as SSH will continue by itself once auth
|
|
875
|
+
# is complete.
|
|
876
|
+
except socket.timeout:
|
|
877
|
+
logger.debug('Timeout waiting for interactive auth connection')
|
|
878
|
+
except Exception as e: # pylint: disable=broad-except
|
|
879
|
+
logger.error(f'Error in Unix socket connection: '
|
|
880
|
+
f'{common_utils.format_exception(e)}')
|
|
881
|
+
finally:
|
|
882
|
+
if conn is not None:
|
|
883
|
+
try:
|
|
884
|
+
conn.close()
|
|
885
|
+
except Exception: # pylint: disable=broad-except
|
|
886
|
+
pass
|
|
887
|
+
try:
|
|
888
|
+
os.close(pty_m_fd)
|
|
889
|
+
except Exception: # pylint: disable=broad-except
|
|
890
|
+
pass
|
|
891
|
+
|
|
892
|
+
unix_sock_thread = threading.Thread(
|
|
893
|
+
target=handle_unix_socket_connection, daemon=True)
|
|
894
|
+
unix_sock_thread.start()
|
|
895
|
+
|
|
896
|
+
try:
|
|
897
|
+
|
|
898
|
+
def setup_pty_session():
|
|
899
|
+
# Set PTY as controlling terminal so SSH can access /dev/tty
|
|
900
|
+
# for keyboard-interactive auth. Without this:
|
|
901
|
+
# "can't open /dev/tty: Device not configured"
|
|
902
|
+
fcntl.ioctl(pty_s_fd, termios.TIOCSCTTY, 0)
|
|
903
|
+
# Ignore SIGHUP so ControlMaster survives when PTY closes.
|
|
904
|
+
signal.signal(signal.SIGHUP, signal.SIG_IGN)
|
|
905
|
+
# Ignore SIGTERM so ControlMaster survives subprocess_daemon
|
|
906
|
+
# killing the process group.
|
|
907
|
+
if self._ssh_proxy_jump is not None:
|
|
908
|
+
signal.signal(signal.SIGTERM, signal.SIG_IGN)
|
|
909
|
+
|
|
910
|
+
return log_lib.run_with_log(' '.join(command),
|
|
911
|
+
log_path,
|
|
912
|
+
require_outputs=require_outputs,
|
|
913
|
+
stream_logs=stream_logs,
|
|
914
|
+
process_stream=process_stream,
|
|
915
|
+
shell=True,
|
|
916
|
+
executable=executable,
|
|
917
|
+
preexec_fn=setup_pty_session,
|
|
918
|
+
**kwargs)
|
|
919
|
+
except Exception as e:
|
|
920
|
+
raise RuntimeError(f'Exception in setup: {e}') from e
|
|
921
|
+
finally:
|
|
922
|
+
# Clean up PTY fds and sockets.
|
|
923
|
+
fd_server.close()
|
|
924
|
+
if os.path.exists(fd_socket_path):
|
|
925
|
+
os.unlink(fd_socket_path)
|
|
926
|
+
try:
|
|
927
|
+
os.close(pty_m_fd)
|
|
928
|
+
except OSError:
|
|
929
|
+
pass # Already closed by background thread
|
|
930
|
+
os.close(pty_s_fd)
|
|
931
|
+
|
|
773
932
|
def close_cached_connection(self) -> None:
|
|
774
933
|
"""Close the cached connection to the remote machine.
|
|
775
934
|
|
|
@@ -810,6 +969,7 @@ class SSHCommandRunner(CommandRunner):
|
|
|
810
969
|
connect_timeout: Optional[int] = None,
|
|
811
970
|
source_bashrc: bool = False,
|
|
812
971
|
skip_num_lines: int = 0,
|
|
972
|
+
run_in_background: bool = False,
|
|
813
973
|
**kwargs) -> Union[int, Tuple[int, str, str]]:
|
|
814
974
|
"""Uses 'ssh' to run 'cmd' on a node with ip.
|
|
815
975
|
|
|
@@ -834,27 +994,32 @@ class SSHCommandRunner(CommandRunner):
|
|
|
834
994
|
output. This is used when the output is not processed by
|
|
835
995
|
SkyPilot but we still want to get rid of some warning messages,
|
|
836
996
|
such as SSH warnings.
|
|
997
|
+
run_in_background: Whether to run the command in the background.
|
|
837
998
|
|
|
838
999
|
Returns:
|
|
839
1000
|
returncode
|
|
840
1001
|
or
|
|
841
1002
|
A tuple of (returncode, stdout, stderr).
|
|
842
1003
|
"""
|
|
1004
|
+
|
|
843
1005
|
base_ssh_command = self.ssh_base_command(
|
|
844
1006
|
ssh_mode=ssh_mode,
|
|
845
1007
|
port_forward=port_forward,
|
|
846
1008
|
connect_timeout=connect_timeout)
|
|
1009
|
+
|
|
847
1010
|
if ssh_mode == SshMode.LOGIN:
|
|
848
1011
|
assert isinstance(cmd, list), 'cmd must be a list for login mode.'
|
|
849
1012
|
command = base_ssh_command + cmd
|
|
850
1013
|
proc = subprocess_utils.run(command, shell=False, check=False)
|
|
851
1014
|
return proc.returncode, '', ''
|
|
852
1015
|
|
|
853
|
-
command_str = self._get_command_to_run(
|
|
854
|
-
|
|
855
|
-
|
|
856
|
-
|
|
857
|
-
|
|
1016
|
+
command_str = self._get_command_to_run(
|
|
1017
|
+
cmd,
|
|
1018
|
+
process_stream,
|
|
1019
|
+
separate_stderr,
|
|
1020
|
+
skip_num_lines=skip_num_lines,
|
|
1021
|
+
source_bashrc=source_bashrc,
|
|
1022
|
+
run_in_background=run_in_background)
|
|
858
1023
|
command = base_ssh_command + [shlex.quote(command_str)]
|
|
859
1024
|
|
|
860
1025
|
log_dir = os.path.expanduser(os.path.dirname(log_path))
|
|
@@ -872,14 +1037,35 @@ class SSHCommandRunner(CommandRunner):
|
|
|
872
1037
|
else:
|
|
873
1038
|
command += [f'> {log_path}']
|
|
874
1039
|
executable = '/bin/bash'
|
|
875
|
-
|
|
876
|
-
|
|
877
|
-
|
|
878
|
-
|
|
879
|
-
|
|
880
|
-
|
|
881
|
-
|
|
882
|
-
|
|
1040
|
+
|
|
1041
|
+
result = log_lib.run_with_log(' '.join(command),
|
|
1042
|
+
log_path,
|
|
1043
|
+
require_outputs=require_outputs,
|
|
1044
|
+
stream_logs=stream_logs,
|
|
1045
|
+
process_stream=process_stream,
|
|
1046
|
+
shell=True,
|
|
1047
|
+
executable=executable,
|
|
1048
|
+
**kwargs)
|
|
1049
|
+
if not self.enable_interactive_auth:
|
|
1050
|
+
return result
|
|
1051
|
+
|
|
1052
|
+
if require_outputs:
|
|
1053
|
+
returncode, _, _ = result
|
|
1054
|
+
else:
|
|
1055
|
+
returncode = result
|
|
1056
|
+
|
|
1057
|
+
if returncode != 255:
|
|
1058
|
+
return result
|
|
1059
|
+
# Exit code 255 indicates an SSH connection error. It does not
|
|
1060
|
+
# necessarily mean an auth failure, but when ControlMaster is used,
|
|
1061
|
+
# the stdout/stderr does not contain the auth failure message,
|
|
1062
|
+
# which is why we don't check the output here, and just attempt
|
|
1063
|
+
# the interactive auth flow.
|
|
1064
|
+
session_id = str(uuid.uuid4())
|
|
1065
|
+
return self._retry_with_interactive_auth(session_id, command, log_path,
|
|
1066
|
+
require_outputs,
|
|
1067
|
+
process_stream, stream_logs,
|
|
1068
|
+
executable, **kwargs)
|
|
883
1069
|
|
|
884
1070
|
@timeline.event
|
|
885
1071
|
def rsync(
|
|
@@ -920,6 +1106,7 @@ class SSHCommandRunner(CommandRunner):
|
|
|
920
1106
|
self.ssh_private_key,
|
|
921
1107
|
self.ssh_control_name,
|
|
922
1108
|
ssh_proxy_command=self._ssh_proxy_command,
|
|
1109
|
+
ssh_proxy_jump=self._ssh_proxy_jump,
|
|
923
1110
|
docker_ssh_proxy_command=docker_ssh_proxy_command,
|
|
924
1111
|
port=self.port,
|
|
925
1112
|
disable_control_master=self.disable_control_master))
|
|
@@ -1033,6 +1220,7 @@ class KubernetesCommandRunner(CommandRunner):
|
|
|
1033
1220
|
connect_timeout: Optional[int] = None,
|
|
1034
1221
|
source_bashrc: bool = False,
|
|
1035
1222
|
skip_num_lines: int = 0,
|
|
1223
|
+
run_in_background: bool = False,
|
|
1036
1224
|
**kwargs) -> Union[int, Tuple[int, str, str]]:
|
|
1037
1225
|
"""Uses 'kubectl exec' to run 'cmd' on a pod or deployment by its
|
|
1038
1226
|
name and namespace.
|
|
@@ -1057,6 +1245,7 @@ class KubernetesCommandRunner(CommandRunner):
|
|
|
1057
1245
|
output. This is used when the output is not processed by
|
|
1058
1246
|
SkyPilot but we still want to get rid of some warning messages,
|
|
1059
1247
|
such as SSH warnings.
|
|
1248
|
+
run_in_background: Whether to run the command in the background.
|
|
1060
1249
|
|
|
1061
1250
|
Returns:
|
|
1062
1251
|
returncode
|
|
@@ -1093,11 +1282,13 @@ class KubernetesCommandRunner(CommandRunner):
|
|
|
1093
1282
|
kubectl_base_command.append('-i')
|
|
1094
1283
|
kubectl_base_command += [*kubectl_args, '--']
|
|
1095
1284
|
|
|
1096
|
-
command_str = self._get_command_to_run(
|
|
1097
|
-
|
|
1098
|
-
|
|
1099
|
-
|
|
1100
|
-
|
|
1285
|
+
command_str = self._get_command_to_run(
|
|
1286
|
+
cmd,
|
|
1287
|
+
process_stream,
|
|
1288
|
+
separate_stderr,
|
|
1289
|
+
skip_num_lines=skip_num_lines,
|
|
1290
|
+
source_bashrc=source_bashrc,
|
|
1291
|
+
run_in_background=run_in_background)
|
|
1101
1292
|
command = kubectl_base_command + [
|
|
1102
1293
|
# It is important to use /bin/bash -c here to make sure we quote the
|
|
1103
1294
|
# command to be run properly. Otherwise, directly appending commands
|
|
@@ -1211,16 +1402,19 @@ class LocalProcessCommandRunner(CommandRunner):
|
|
|
1211
1402
|
connect_timeout: Optional[int] = None,
|
|
1212
1403
|
source_bashrc: bool = False,
|
|
1213
1404
|
skip_num_lines: int = 0,
|
|
1405
|
+
run_in_background: bool = False,
|
|
1214
1406
|
**kwargs) -> Union[int, Tuple[int, str, str]]:
|
|
1215
1407
|
"""Use subprocess to run the command."""
|
|
1216
1408
|
del port_forward, ssh_mode, connect_timeout # Unused.
|
|
1217
1409
|
|
|
1218
|
-
command_str = self._get_command_to_run(
|
|
1219
|
-
|
|
1220
|
-
|
|
1221
|
-
|
|
1222
|
-
|
|
1223
|
-
|
|
1410
|
+
command_str = self._get_command_to_run(
|
|
1411
|
+
cmd,
|
|
1412
|
+
process_stream,
|
|
1413
|
+
separate_stderr,
|
|
1414
|
+
skip_num_lines=skip_num_lines,
|
|
1415
|
+
source_bashrc=source_bashrc,
|
|
1416
|
+
use_login=False,
|
|
1417
|
+
run_in_background=run_in_background)
|
|
1224
1418
|
|
|
1225
1419
|
log_dir = os.path.expanduser(os.path.dirname(log_path))
|
|
1226
1420
|
os.makedirs(log_dir, exist_ok=True)
|
|
@@ -1332,29 +1526,6 @@ class SlurmCommandRunner(SSHCommandRunner):
|
|
|
1332
1526
|
self.job_id = job_id
|
|
1333
1527
|
self.slurm_node = slurm_node
|
|
1334
1528
|
|
|
1335
|
-
# Build a chained ProxyCommand that goes through the login node to reach
|
|
1336
|
-
# the compute node where the job is running.
|
|
1337
|
-
|
|
1338
|
-
# First, build SSH options to reach the login node, using the user's
|
|
1339
|
-
# existing proxy command if provided.
|
|
1340
|
-
proxy_ssh_options = ' '.join(
|
|
1341
|
-
ssh_options_list(self.ssh_private_key,
|
|
1342
|
-
None,
|
|
1343
|
-
ssh_proxy_command=self._ssh_proxy_command,
|
|
1344
|
-
port=self.port,
|
|
1345
|
-
disable_control_master=True))
|
|
1346
|
-
login_node_proxy_command = (f'ssh {proxy_ssh_options} '
|
|
1347
|
-
f'-W %h:%p {self.ssh_user}@{self.ip}')
|
|
1348
|
-
|
|
1349
|
-
# Update the proxy command to be the login node proxy, which will
|
|
1350
|
-
# be used by super().run() to reach the compute node.
|
|
1351
|
-
self._ssh_proxy_command = login_node_proxy_command
|
|
1352
|
-
# Update self.ip to target the compute node.
|
|
1353
|
-
self.ip = slurm_node
|
|
1354
|
-
# Assume the compute node's SSH port is 22.
|
|
1355
|
-
# TODO(kevin): Make this configurable if needed.
|
|
1356
|
-
self.port = 22
|
|
1357
|
-
|
|
1358
1529
|
def rsync(
|
|
1359
1530
|
self,
|
|
1360
1531
|
source: str,
|
|
@@ -1365,40 +1536,35 @@ class SlurmCommandRunner(SSHCommandRunner):
|
|
|
1365
1536
|
stream_logs: bool = True,
|
|
1366
1537
|
max_retry: int = 1,
|
|
1367
1538
|
) -> None:
|
|
1368
|
-
"""Rsyncs files
|
|
1369
|
-
by proxying through the Slurm login node.
|
|
1370
|
-
|
|
1371
|
-
For Slurm, files need to be accessible by compute nodes where jobs
|
|
1372
|
-
execute via srun. This means either it has to be on the compute node's
|
|
1373
|
-
local filesystem, or on a shared filesystem.
|
|
1539
|
+
"""Rsyncs files to/from the Slurm compute node using srun as transport.
|
|
1374
1540
|
"""
|
|
1375
|
-
|
|
1376
|
-
|
|
1377
|
-
|
|
1378
|
-
|
|
1379
|
-
|
|
1380
|
-
#
|
|
1381
|
-
|
|
1382
|
-
|
|
1383
|
-
|
|
1384
|
-
|
|
1385
|
-
|
|
1386
|
-
|
|
1387
|
-
|
|
1388
|
-
|
|
1389
|
-
|
|
1390
|
-
|
|
1391
|
-
|
|
1392
|
-
|
|
1393
|
-
|
|
1394
|
-
|
|
1395
|
-
|
|
1396
|
-
|
|
1397
|
-
|
|
1398
|
-
|
|
1399
|
-
|
|
1400
|
-
|
|
1401
|
-
|
|
1541
|
+
ssh_command = ' '.join(
|
|
1542
|
+
self.ssh_base_command(ssh_mode=SshMode.NON_INTERACTIVE,
|
|
1543
|
+
port_forward=None,
|
|
1544
|
+
connect_timeout=None))
|
|
1545
|
+
|
|
1546
|
+
# rsh command: parse job_id+node_list from $1, ssh to login node,
|
|
1547
|
+
# run srun with rsync command.
|
|
1548
|
+
rsh_option = (
|
|
1549
|
+
f'bash --norc --noprofile -c \''
|
|
1550
|
+
f'job_id=$(echo "$1" | cut -d+ -f1); '
|
|
1551
|
+
f'node_list=$(echo "$1" | cut -d+ -f2); '
|
|
1552
|
+
f'shift; ' # Shift past the encoded job_id+node_list
|
|
1553
|
+
f'exec {ssh_command} ' # SSH to login node to run srun
|
|
1554
|
+
f'srun --unbuffered --quiet --overlap '
|
|
1555
|
+
f'--jobid="$job_id" --nodelist="$node_list" --nodes=1 --ntasks=1 '
|
|
1556
|
+
f'"$@"'
|
|
1557
|
+
f'\' --')
|
|
1558
|
+
encoded_info = f'{self.job_id}+{self.slurm_node}'
|
|
1559
|
+
self._rsync(source,
|
|
1560
|
+
target,
|
|
1561
|
+
node_destination=encoded_info,
|
|
1562
|
+
up=up,
|
|
1563
|
+
rsh_option=rsh_option,
|
|
1564
|
+
log_path=log_path,
|
|
1565
|
+
stream_logs=stream_logs,
|
|
1566
|
+
max_retry=max_retry,
|
|
1567
|
+
get_remote_home_dir=lambda: self.sky_dir)
|
|
1402
1568
|
|
|
1403
1569
|
@timeline.event
|
|
1404
1570
|
@context_utils.cancellation_guard
|
|
@@ -1420,14 +1586,6 @@ class SlurmCommandRunner(SSHCommandRunner):
|
|
|
1420
1586
|
# could be part of a shared filesystem.
|
|
1421
1587
|
# And similarly for SKY_RUNTIME_DIR. See constants.\
|
|
1422
1588
|
# SKY_RUNTIME_DIR_ENV_VAR_KEY for more details.
|
|
1423
|
-
#
|
|
1424
|
-
# SSH directly to the compute node instead of using srun.
|
|
1425
|
-
# This avoids Slurm's proctrack/cgroup which kills all processes
|
|
1426
|
-
# when the job step ends (including child processes launched as
|
|
1427
|
-
# a separate process group), breaking background process spawning
|
|
1428
|
-
# (e.g., JobScheduler._run_job which uses launch_new_process_tree).
|
|
1429
|
-
# Note: proctrack/cgroup is enabled by default on Nebius'
|
|
1430
|
-
# Managed Soperator.
|
|
1431
1589
|
cmd = (
|
|
1432
1590
|
f'export {constants.SKY_RUNTIME_DIR_ENV_VAR_KEY}='
|
|
1433
1591
|
f'"{self.skypilot_runtime_dir}" && '
|
|
@@ -1438,4 +1596,8 @@ class SlurmCommandRunner(SSHCommandRunner):
|
|
|
1438
1596
|
f'export UV_CACHE_DIR=/tmp/uv_cache_$(id -u) && '
|
|
1439
1597
|
f'cd {self.sky_dir} && export HOME=$(pwd) && {cmd}')
|
|
1440
1598
|
|
|
1599
|
+
cmd = (f'srun --unbuffered --quiet --overlap --jobid={self.job_id} '
|
|
1600
|
+
f'--nodelist={self.slurm_node} '
|
|
1601
|
+
f'--nodes=1 --ntasks=1 bash -c {shlex.quote(cmd)}')
|
|
1602
|
+
|
|
1441
1603
|
return super().run(cmd, **kwargs)
|