skypilot-nightly 1.0.0.dev20251203__py3-none-any.whl → 1.0.0.dev20260112__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +6 -2
- sky/adaptors/aws.py +1 -61
- sky/adaptors/slurm.py +565 -0
- sky/backends/backend_utils.py +95 -12
- sky/backends/cloud_vm_ray_backend.py +224 -65
- sky/backends/task_codegen.py +380 -4
- sky/catalog/__init__.py +0 -3
- sky/catalog/data_fetchers/fetch_gcp.py +9 -1
- sky/catalog/data_fetchers/fetch_nebius.py +1 -1
- sky/catalog/data_fetchers/fetch_vast.py +4 -2
- sky/catalog/kubernetes_catalog.py +12 -4
- sky/catalog/seeweb_catalog.py +30 -15
- sky/catalog/shadeform_catalog.py +5 -2
- sky/catalog/slurm_catalog.py +236 -0
- sky/catalog/vast_catalog.py +30 -6
- sky/check.py +25 -11
- sky/client/cli/command.py +391 -32
- sky/client/interactive_utils.py +190 -0
- sky/client/sdk.py +64 -2
- sky/client/sdk_async.py +9 -0
- sky/clouds/__init__.py +2 -0
- sky/clouds/aws.py +60 -2
- sky/clouds/azure.py +2 -0
- sky/clouds/cloud.py +7 -0
- sky/clouds/kubernetes.py +2 -0
- sky/clouds/runpod.py +38 -7
- sky/clouds/slurm.py +610 -0
- sky/clouds/ssh.py +3 -2
- sky/clouds/vast.py +39 -16
- sky/core.py +197 -37
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/3nu-b8raeKRNABZ2d4GAG/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1871-0565f8975a7dcd10.js +6 -0
- sky/dashboard/out/_next/static/chunks/2109-55a1546d793574a7.js +11 -0
- sky/dashboard/out/_next/static/chunks/2521-099b07cd9e4745bf.js +26 -0
- sky/dashboard/out/_next/static/chunks/2755.a636e04a928a700e.js +31 -0
- sky/dashboard/out/_next/static/chunks/3495.05eab4862217c1a5.js +6 -0
- sky/dashboard/out/_next/static/chunks/3785.cfc5dcc9434fd98c.js +1 -0
- sky/dashboard/out/_next/static/chunks/3850-fd5696f3bbbaddae.js +1 -0
- sky/dashboard/out/_next/static/chunks/3981.645d01bf9c8cad0c.js +21 -0
- sky/dashboard/out/_next/static/chunks/4083-0115d67c1fb57d6c.js +21 -0
- sky/dashboard/out/_next/static/chunks/{8640.5b9475a2d18c5416.js → 429.a58e9ba9742309ed.js} +2 -2
- sky/dashboard/out/_next/static/chunks/4555.8e221537181b5dc1.js +6 -0
- sky/dashboard/out/_next/static/chunks/4725.937865b81fdaaebb.js +6 -0
- sky/dashboard/out/_next/static/chunks/6082-edabd8f6092300ce.js +25 -0
- sky/dashboard/out/_next/static/chunks/6989-49cb7dca83a7a62d.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-630bd2a2257275f8.js +1 -0
- sky/dashboard/out/_next/static/chunks/7248-a99800d4db8edabd.js +1 -0
- sky/dashboard/out/_next/static/chunks/754-cfc5d4ad1b843d29.js +18 -0
- sky/dashboard/out/_next/static/chunks/8050-dd8aa107b17dce00.js +16 -0
- sky/dashboard/out/_next/static/chunks/8056-d4ae1e0cb81e7368.js +1 -0
- sky/dashboard/out/_next/static/chunks/8555.011023e296c127b3.js +6 -0
- sky/dashboard/out/_next/static/chunks/8821-93c25df904a8362b.js +1 -0
- sky/dashboard/out/_next/static/chunks/8969-0662594b69432ade.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.f15c91c97d124a5f.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-7ad6bd01858556f1.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-5a86569acad99764.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-8297476714acb4ac.js +6 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-337c3ba1085f1210.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/{clusters-ee39056f9851a3ff.js → clusters-57632ff3684a8b5c.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{config-dfb9bf07b13045f4.js → config-718cdc365de82689.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-5fd3a453c079c2ea.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-9f85c02c9c6cae9e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-90f16972cbecf354.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-2dd42fc37aad427a.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-ed806aeace26b972.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/plugins/[...slug]-449a9f5a3bb20fb3.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-bec34706b36f3524.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/{volumes-b84b948ff357c43e.js → volumes-a83ba9b38dff7ea9.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-84a40f8c7c627fe4.js → [name]-c781e9c3e52ef9fc.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-91e0942f47310aae.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-cfe59cf684ee13b9.js +1 -0
- sky/dashboard/out/_next/static/css/b0dbca28f027cc19.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/plugins/[...slug].html +1 -0
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/data_utils.py +26 -12
- sky/data/mounting_utils.py +44 -5
- sky/global_user_state.py +111 -19
- sky/jobs/client/sdk.py +8 -3
- sky/jobs/controller.py +191 -31
- sky/jobs/recovery_strategy.py +109 -11
- sky/jobs/server/core.py +81 -4
- sky/jobs/server/server.py +14 -0
- sky/jobs/state.py +417 -19
- sky/jobs/utils.py +73 -80
- sky/models.py +11 -0
- sky/optimizer.py +8 -6
- sky/provision/__init__.py +12 -9
- sky/provision/common.py +20 -0
- sky/provision/docker_utils.py +15 -2
- sky/provision/kubernetes/utils.py +163 -20
- sky/provision/kubernetes/volume.py +52 -17
- sky/provision/provisioner.py +17 -7
- sky/provision/runpod/instance.py +3 -1
- sky/provision/runpod/utils.py +13 -1
- sky/provision/runpod/volume.py +25 -9
- sky/provision/slurm/__init__.py +12 -0
- sky/provision/slurm/config.py +13 -0
- sky/provision/slurm/instance.py +618 -0
- sky/provision/slurm/utils.py +689 -0
- sky/provision/vast/instance.py +4 -1
- sky/provision/vast/utils.py +11 -6
- sky/resources.py +135 -13
- sky/schemas/api/responses.py +4 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +1 -1
- sky/schemas/db/spot_jobs/008_add_full_resources.py +34 -0
- sky/schemas/db/spot_jobs/009_job_events.py +32 -0
- sky/schemas/db/spot_jobs/010_job_events_timestamp_with_timezone.py +43 -0
- sky/schemas/db/spot_jobs/011_add_links.py +34 -0
- sky/schemas/generated/jobsv1_pb2.py +9 -5
- sky/schemas/generated/jobsv1_pb2.pyi +12 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +44 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +32 -28
- sky/schemas/generated/managed_jobsv1_pb2.pyi +11 -2
- sky/serve/serve_utils.py +232 -40
- sky/serve/server/impl.py +1 -1
- sky/server/common.py +17 -0
- sky/server/constants.py +1 -1
- sky/server/metrics.py +6 -3
- sky/server/plugins.py +238 -0
- sky/server/requests/executor.py +5 -2
- sky/server/requests/payloads.py +30 -1
- sky/server/requests/request_names.py +4 -0
- sky/server/requests/requests.py +33 -11
- sky/server/requests/serializers/encoders.py +22 -0
- sky/server/requests/serializers/return_value_serializers.py +70 -0
- sky/server/server.py +506 -109
- sky/server/server_utils.py +30 -0
- sky/server/uvicorn.py +5 -0
- sky/setup_files/MANIFEST.in +1 -0
- sky/setup_files/dependencies.py +22 -9
- sky/sky_logging.py +2 -1
- sky/skylet/attempt_skylet.py +13 -3
- sky/skylet/constants.py +55 -13
- sky/skylet/events.py +10 -4
- sky/skylet/executor/__init__.py +1 -0
- sky/skylet/executor/slurm.py +187 -0
- sky/skylet/job_lib.py +91 -5
- sky/skylet/log_lib.py +22 -6
- sky/skylet/log_lib.pyi +8 -6
- sky/skylet/services.py +18 -3
- sky/skylet/skylet.py +5 -1
- sky/skylet/subprocess_daemon.py +2 -1
- sky/ssh_node_pools/constants.py +12 -0
- sky/ssh_node_pools/core.py +40 -3
- sky/ssh_node_pools/deploy/__init__.py +4 -0
- sky/{utils/kubernetes/deploy_ssh_node_pools.py → ssh_node_pools/deploy/deploy.py} +279 -504
- sky/ssh_node_pools/deploy/tunnel/ssh-tunnel.sh +379 -0
- sky/ssh_node_pools/deploy/tunnel_utils.py +199 -0
- sky/ssh_node_pools/deploy/utils.py +173 -0
- sky/ssh_node_pools/server.py +11 -13
- sky/{utils/kubernetes/ssh_utils.py → ssh_node_pools/utils.py} +9 -6
- sky/templates/kubernetes-ray.yml.j2 +12 -6
- sky/templates/slurm-ray.yml.j2 +115 -0
- sky/templates/vast-ray.yml.j2 +1 -0
- sky/templates/websocket_proxy.py +18 -41
- sky/users/model.conf +1 -1
- sky/users/permission.py +85 -52
- sky/users/rbac.py +31 -3
- sky/utils/annotations.py +108 -8
- sky/utils/auth_utils.py +42 -0
- sky/utils/cli_utils/status_utils.py +19 -5
- sky/utils/cluster_utils.py +10 -3
- sky/utils/command_runner.py +389 -35
- sky/utils/command_runner.pyi +43 -4
- sky/utils/common_utils.py +47 -31
- sky/utils/context.py +32 -0
- sky/utils/db/db_utils.py +36 -6
- sky/utils/db/migration_utils.py +41 -21
- sky/utils/infra_utils.py +5 -1
- sky/utils/instance_links.py +139 -0
- sky/utils/interactive_utils.py +49 -0
- sky/utils/kubernetes/generate_kubeconfig.sh +42 -33
- sky/utils/kubernetes/kubernetes_deploy_utils.py +2 -94
- sky/utils/kubernetes/rsync_helper.sh +5 -1
- sky/utils/kubernetes/ssh-tunnel.sh +7 -376
- sky/utils/plugin_extensions/__init__.py +14 -0
- sky/utils/plugin_extensions/external_failure_source.py +176 -0
- sky/utils/resources_utils.py +10 -8
- sky/utils/rich_utils.py +9 -11
- sky/utils/schemas.py +93 -19
- sky/utils/status_lib.py +7 -0
- sky/utils/subprocess_utils.py +17 -0
- sky/volumes/client/sdk.py +6 -3
- sky/volumes/server/core.py +65 -27
- sky_templates/ray/start_cluster +8 -4
- {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/METADATA +67 -59
- {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/RECORD +208 -180
- sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +0 -11
- sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +0 -6
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +0 -1
- sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +0 -1
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +0 -15
- sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +0 -26
- sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +0 -1
- sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +0 -1
- sky/dashboard/out/_next/static/chunks/3800-7b45f9fbb6308557.js +0 -1
- sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +0 -1
- sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +0 -1
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +0 -15
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +0 -13
- sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +0 -1
- sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +0 -1
- sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +0 -1
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +0 -30
- sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +0 -41
- sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +0 -1
- sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +0 -1
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +0 -6
- sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +0 -1
- sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +0 -31
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +0 -30
- sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +0 -34
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +0 -21
- sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-531b2f8c4bf89f82.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +0 -1
- sky/dashboard/out/_next/static/css/0748ce22df867032.css +0 -3
- /sky/dashboard/out/_next/static/{96_E2yl3QAiIJGOYCkSpB → 3nu-b8raeKRNABZ2d4GAG}/_ssgManifest.js +0 -0
- /sky/{utils/kubernetes → ssh_node_pools/deploy/tunnel}/cleanup-tunnel.sh +0 -0
- {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/top_level.txt +0 -0
sky/utils/command_runner.py
CHANGED
|
@@ -1,14 +1,23 @@
|
|
|
1
1
|
"""Runner for commands to be executed on the cluster."""
|
|
2
2
|
import enum
|
|
3
|
+
import fcntl
|
|
3
4
|
import hashlib
|
|
4
5
|
import os
|
|
5
6
|
import pathlib
|
|
7
|
+
import pty
|
|
6
8
|
import re
|
|
7
9
|
import shlex
|
|
10
|
+
import signal
|
|
11
|
+
import socket
|
|
8
12
|
import sys
|
|
13
|
+
import termios
|
|
14
|
+
import threading
|
|
9
15
|
import time
|
|
10
16
|
from typing import (Any, Callable, Dict, Iterable, List, Optional, Tuple, Type,
|
|
11
17
|
Union)
|
|
18
|
+
import uuid
|
|
19
|
+
|
|
20
|
+
import colorama
|
|
12
21
|
|
|
13
22
|
from sky import exceptions
|
|
14
23
|
from sky import sky_logging
|
|
@@ -19,6 +28,7 @@ from sky.utils import common_utils
|
|
|
19
28
|
from sky.utils import context_utils
|
|
20
29
|
from sky.utils import control_master_utils
|
|
21
30
|
from sky.utils import git as git_utils
|
|
31
|
+
from sky.utils import interactive_utils
|
|
22
32
|
from sky.utils import subprocess_utils
|
|
23
33
|
from sky.utils import timeline
|
|
24
34
|
|
|
@@ -63,6 +73,22 @@ def _ssh_control_path(ssh_control_filename: Optional[str]) -> Optional[str]:
|
|
|
63
73
|
return path
|
|
64
74
|
|
|
65
75
|
|
|
76
|
+
def _is_skypilot_managed_key(key_path: str) -> bool:
|
|
77
|
+
"""Check if SSH key follows SkyPilot's managed key format.
|
|
78
|
+
|
|
79
|
+
SkyPilot-managed keys follow the pattern: ~/.sky/clients/<hash>/ssh/sky-key
|
|
80
|
+
External keys (like ~/.ssh/id_rsa) do not follow this pattern.
|
|
81
|
+
|
|
82
|
+
Args:
|
|
83
|
+
key_path: Path to the SSH private key.
|
|
84
|
+
|
|
85
|
+
Returns:
|
|
86
|
+
True if the key follows SkyPilot's managed format, False otherwise.
|
|
87
|
+
"""
|
|
88
|
+
parts = os.path.normpath(key_path).split(os.path.sep)
|
|
89
|
+
return len(parts) >= 2 and parts[-1] == 'sky-key' and parts[-2] == 'ssh'
|
|
90
|
+
|
|
91
|
+
|
|
66
92
|
# Disable sudo for root user. This is useful when the command is running in a
|
|
67
93
|
# docker container, i.e. image_id is a docker image.
|
|
68
94
|
ALIAS_SUDO_TO_EMPTY_FOR_ROOT_CMD = (
|
|
@@ -74,10 +100,12 @@ def ssh_options_list(
|
|
|
74
100
|
ssh_control_name: Optional[str],
|
|
75
101
|
*,
|
|
76
102
|
ssh_proxy_command: Optional[str] = None,
|
|
103
|
+
ssh_proxy_jump: Optional[str] = None,
|
|
77
104
|
docker_ssh_proxy_command: Optional[str] = None,
|
|
78
105
|
connect_timeout: Optional[int] = None,
|
|
79
106
|
port: int = 22,
|
|
80
107
|
disable_control_master: Optional[bool] = False,
|
|
108
|
+
escape_percent_expand: bool = False,
|
|
81
109
|
) -> List[str]:
|
|
82
110
|
"""Returns a list of sane options for 'ssh'."""
|
|
83
111
|
if connect_timeout is None:
|
|
@@ -117,11 +145,11 @@ def ssh_options_list(
|
|
|
117
145
|
# SSH Control will have a severe delay when using docker_ssh_proxy_command.
|
|
118
146
|
# TODO(tian): Investigate why.
|
|
119
147
|
#
|
|
120
|
-
# We disable ControlMaster when ssh_proxy_command is used,
|
|
121
|
-
# master connection will be idle although the connection might
|
|
122
|
-
# by other ssh commands that is not idle. In that case, user's
|
|
123
|
-
# command may drop the connection due to idle timeout, since it
|
|
124
|
-
# see the idle master connection. It is an issue even with the
|
|
148
|
+
# We disable ControlMaster when ssh_proxy_command is used,
|
|
149
|
+
# because the master connection will be idle although the connection might
|
|
150
|
+
# be shared by other ssh commands that are not idle. In that case, user's
|
|
151
|
+
# custom proxy command may drop the connection due to idle timeout, since it
|
|
152
|
+
# will only see the idle master connection. It is an issue even with the
|
|
125
153
|
# ServerAliveInterval set, since the keepalive message may not be recognized
|
|
126
154
|
# by the custom proxy command, such as AWS SSM Session Manager.
|
|
127
155
|
#
|
|
@@ -132,11 +160,14 @@ def ssh_options_list(
|
|
|
132
160
|
# 'ControlPersist' number of seconds delay per ssh commands ran.
|
|
133
161
|
if (ssh_control_name is not None and docker_ssh_proxy_command is None and
|
|
134
162
|
ssh_proxy_command is None and not disable_control_master):
|
|
163
|
+
control_path = f'{_ssh_control_path(ssh_control_name)}/%C'
|
|
164
|
+
if escape_percent_expand:
|
|
165
|
+
control_path = control_path.replace('%', '%%')
|
|
135
166
|
arg_dict.update({
|
|
136
167
|
# Control path: important optimization as we do multiple ssh in one
|
|
137
168
|
# sky.launch().
|
|
138
169
|
'ControlMaster': 'auto',
|
|
139
|
-
'ControlPath':
|
|
170
|
+
'ControlPath': control_path,
|
|
140
171
|
'ControlPersist': '300s',
|
|
141
172
|
})
|
|
142
173
|
ssh_key_option = [
|
|
@@ -158,6 +189,15 @@ def ssh_options_list(
|
|
|
158
189
|
'ProxyCommand': shlex.quote(ssh_proxy_command),
|
|
159
190
|
})
|
|
160
191
|
|
|
192
|
+
if ssh_proxy_jump is not None:
|
|
193
|
+
logger.debug(f'--- ProxyJump: {ssh_proxy_jump} ---')
|
|
194
|
+
if ssh_proxy_command is not None:
|
|
195
|
+
logger.warning('Both ProxyCommand and ProxyJump are specified. '
|
|
196
|
+
'ProxyCommand will take precedence.')
|
|
197
|
+
arg_dict.update({
|
|
198
|
+
'ProxyJump': shlex.quote(ssh_proxy_jump),
|
|
199
|
+
})
|
|
200
|
+
|
|
161
201
|
return ssh_key_option + [
|
|
162
202
|
x for y in (['-o', f'{k}={v}']
|
|
163
203
|
for k, v in arg_dict.items()
|
|
@@ -217,6 +257,7 @@ class CommandRunner:
|
|
|
217
257
|
skip_num_lines: int,
|
|
218
258
|
source_bashrc: bool = False,
|
|
219
259
|
use_login: bool = True,
|
|
260
|
+
run_in_background: bool = False,
|
|
220
261
|
) -> str:
|
|
221
262
|
"""Returns the command to run."""
|
|
222
263
|
if isinstance(cmd, list):
|
|
@@ -247,7 +288,11 @@ class CommandRunner:
|
|
|
247
288
|
]
|
|
248
289
|
if not separate_stderr:
|
|
249
290
|
command.append('2>&1')
|
|
291
|
+
if run_in_background:
|
|
292
|
+
command = ['nohup'] + command + ['&']
|
|
250
293
|
if not process_stream and skip_num_lines:
|
|
294
|
+
assert not run_in_background, (
|
|
295
|
+
'run_in_background and skip_num_lines cannot be used together')
|
|
251
296
|
command += [
|
|
252
297
|
# A hack to remove the following bash warnings (twice):
|
|
253
298
|
# bash: cannot set terminal process group
|
|
@@ -408,6 +453,7 @@ class CommandRunner:
|
|
|
408
453
|
connect_timeout: Optional[int] = None,
|
|
409
454
|
source_bashrc: bool = False,
|
|
410
455
|
skip_num_lines: int = 0,
|
|
456
|
+
run_in_background: bool = False,
|
|
411
457
|
**kwargs) -> Union[int, Tuple[int, str, str]]:
|
|
412
458
|
"""Runs the command on the cluster.
|
|
413
459
|
|
|
@@ -426,6 +472,7 @@ class CommandRunner:
|
|
|
426
472
|
output. This is used when the output is not processed by
|
|
427
473
|
SkyPilot but we still want to get rid of some warning messages,
|
|
428
474
|
such as SSH warnings.
|
|
475
|
+
run_in_background: Whether to run the command in the background.
|
|
429
476
|
|
|
430
477
|
Returns:
|
|
431
478
|
returncode
|
|
@@ -603,17 +650,19 @@ class SSHCommandRunner(CommandRunner):
|
|
|
603
650
|
self,
|
|
604
651
|
node: Tuple[str, int],
|
|
605
652
|
ssh_user: str,
|
|
606
|
-
ssh_private_key: str,
|
|
653
|
+
ssh_private_key: Optional[str],
|
|
607
654
|
ssh_control_name: Optional[str] = '__default__',
|
|
608
655
|
ssh_proxy_command: Optional[str] = None,
|
|
656
|
+
ssh_proxy_jump: Optional[str] = None,
|
|
609
657
|
docker_user: Optional[str] = None,
|
|
610
658
|
disable_control_master: Optional[bool] = False,
|
|
611
659
|
port_forward_execute_remote_command: Optional[bool] = False,
|
|
660
|
+
enable_interactive_auth: bool = False,
|
|
612
661
|
):
|
|
613
662
|
"""Initialize SSHCommandRunner.
|
|
614
663
|
|
|
615
664
|
Example Usage:
|
|
616
|
-
runner = SSHCommandRunner(ip, ssh_user, ssh_private_key)
|
|
665
|
+
runner = SSHCommandRunner((ip, port), ssh_user, ssh_private_key)
|
|
617
666
|
runner.run('ls -l', mode=SshMode.NON_INTERACTIVE)
|
|
618
667
|
runner.rsync(source, target, up=True)
|
|
619
668
|
|
|
@@ -628,6 +677,8 @@ class SSHCommandRunner(CommandRunner):
|
|
|
628
677
|
ssh_proxy_command: Optional, the value to pass to '-o
|
|
629
678
|
ProxyCommand'. Useful for communicating with clusters without
|
|
630
679
|
public IPs using a "jump server".
|
|
680
|
+
ssh_proxy_jump: Optional, the value to pass to '-o ProxyJump' flag.
|
|
681
|
+
Similar to ssh_proxy_command, but more modern.
|
|
631
682
|
port: The port to use for ssh.
|
|
632
683
|
docker_user: The docker user to use for ssh. If specified, the
|
|
633
684
|
command will be run inside a docker container which has an ssh
|
|
@@ -647,11 +698,21 @@ class SSHCommandRunner(CommandRunner):
|
|
|
647
698
|
None if ssh_control_name is None else hashlib.md5(
|
|
648
699
|
ssh_control_name.encode()).hexdigest()[:_HASH_MAX_LENGTH])
|
|
649
700
|
self._ssh_proxy_command = ssh_proxy_command
|
|
701
|
+
self._ssh_proxy_jump = ssh_proxy_jump
|
|
650
702
|
self.disable_control_master = (
|
|
651
703
|
disable_control_master or
|
|
652
704
|
control_master_utils.should_disable_control_master())
|
|
653
|
-
#
|
|
654
|
-
|
|
705
|
+
# Ensure SSH key is available. For SkyPilot-managed keys, create from
|
|
706
|
+
# database. For external keys (e.g., Slurm clusters), verify existence.
|
|
707
|
+
if ssh_private_key is not None and _is_skypilot_managed_key(
|
|
708
|
+
ssh_private_key):
|
|
709
|
+
auth_utils.create_ssh_key_files_from_db(ssh_private_key)
|
|
710
|
+
elif ssh_private_key is not None:
|
|
711
|
+
# Externally managed key - just verify it exists
|
|
712
|
+
expanded_key_path = os.path.expanduser(ssh_private_key)
|
|
713
|
+
if not os.path.exists(expanded_key_path):
|
|
714
|
+
raise FileNotFoundError(
|
|
715
|
+
f'SSH private key not found: {expanded_key_path}')
|
|
655
716
|
if docker_user is not None:
|
|
656
717
|
assert port is None or port == 22, (
|
|
657
718
|
f'port must be None or 22 for docker_user, got {port}.')
|
|
@@ -687,6 +748,7 @@ class SSHCommandRunner(CommandRunner):
|
|
|
687
748
|
self._docker_ssh_proxy_command = None
|
|
688
749
|
self.port_forward_execute_remote_command = (
|
|
689
750
|
port_forward_execute_remote_command)
|
|
751
|
+
self.enable_interactive_auth = enable_interactive_auth
|
|
690
752
|
|
|
691
753
|
def port_forward_command(
|
|
692
754
|
self,
|
|
@@ -738,6 +800,7 @@ class SSHCommandRunner(CommandRunner):
|
|
|
738
800
|
self.ssh_private_key,
|
|
739
801
|
self.ssh_control_name,
|
|
740
802
|
ssh_proxy_command=self._ssh_proxy_command,
|
|
803
|
+
ssh_proxy_jump=self._ssh_proxy_jump,
|
|
741
804
|
docker_ssh_proxy_command=docker_ssh_proxy_command,
|
|
742
805
|
port=self.port,
|
|
743
806
|
connect_timeout=connect_timeout,
|
|
@@ -745,6 +808,127 @@ class SSHCommandRunner(CommandRunner):
|
|
|
745
808
|
f'{self.ssh_user}@{self.ip}'
|
|
746
809
|
]
|
|
747
810
|
|
|
811
|
+
def _retry_with_interactive_auth(
|
|
812
|
+
self, session_id: str, command: List[str], log_path: str,
|
|
813
|
+
require_outputs: bool, process_stream: bool, stream_logs: bool,
|
|
814
|
+
executable: str,
|
|
815
|
+
**kwargs) -> Union[int, Tuple[int, str, str], Tuple[int, int]]:
|
|
816
|
+
"""Retries command with interactive auth.
|
|
817
|
+
|
|
818
|
+
This handles SSH connections requiring keyboard-interactive
|
|
819
|
+
authentication (e.g., 2FA) by using a PTY for auth prompts and
|
|
820
|
+
establishing a persistent ControlMaster socket (if enabled) that
|
|
821
|
+
other SSH sessions can reuse without re-authenticating.
|
|
822
|
+
|
|
823
|
+
The PTY is bridged to a websocket connection that allows the client
|
|
824
|
+
to handle interactive authentication. Command output flows through
|
|
825
|
+
normal stdout/stderr pipes, which gets printed to log_path.
|
|
826
|
+
|
|
827
|
+
See ssh_options_list for when ControlMaster is not enabled.
|
|
828
|
+
"""
|
|
829
|
+
extra_options = [
|
|
830
|
+
# Override ControlPersist to reduce frequency of manual user
|
|
831
|
+
# intervention. The default from ssh_options_list is only 5m.
|
|
832
|
+
#
|
|
833
|
+
# NOTE: When used with ProxyJump, the connection can die
|
|
834
|
+
# earlier than expected, so it is recommended to also enable
|
|
835
|
+
# ControlMaster on the jump host's SSH config. It is hard to
|
|
836
|
+
# tell why exactly, because enabling -v makes this problem
|
|
837
|
+
# disappear for some reason.
|
|
838
|
+
'-o',
|
|
839
|
+
'ControlPersist=1d',
|
|
840
|
+
]
|
|
841
|
+
if self._ssh_proxy_jump is not None:
|
|
842
|
+
logger.warning(f'{colorama.Fore.YELLOW}When using ProxyJump, it is '
|
|
843
|
+
'recommended to also enable ControlMaster on the '
|
|
844
|
+
'jump host\'s SSH config to keep the authenticated '
|
|
845
|
+
f'connection alive for longer.{colorama.Fore.RESET}')
|
|
846
|
+
command = command[:1] + extra_options + command[1:]
|
|
847
|
+
|
|
848
|
+
# Create PTY for SSH. PTY slave for stdin from user, PTY master
|
|
849
|
+
# for password/auth prompts from SSH.
|
|
850
|
+
pty_m_fd, pty_s_fd = pty.openpty()
|
|
851
|
+
|
|
852
|
+
# Create Unix socket to pass PTY master fd to websocket handler
|
|
853
|
+
fd_socket_path = interactive_utils.get_pty_socket_path(session_id)
|
|
854
|
+
if os.path.exists(fd_socket_path):
|
|
855
|
+
os.unlink(fd_socket_path)
|
|
856
|
+
fd_server = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
|
|
857
|
+
fd_server.bind(fd_socket_path)
|
|
858
|
+
fd_server.listen(1)
|
|
859
|
+
fd_server.settimeout(60)
|
|
860
|
+
|
|
861
|
+
# Signal client to initiate websocket for interactive auth
|
|
862
|
+
interactive_signal = f'<sky-interactive session="{session_id}"/>'
|
|
863
|
+
print(interactive_signal, flush=True)
|
|
864
|
+
|
|
865
|
+
def handle_unix_socket_connection():
|
|
866
|
+
"""Background thread to handle Unix socket connection."""
|
|
867
|
+
conn = None
|
|
868
|
+
try:
|
|
869
|
+
# Wait for websocket handler to connect.
|
|
870
|
+
conn, _ = fd_server.accept()
|
|
871
|
+
# Send PTY master fd through Unix socket.
|
|
872
|
+
interactive_utils.send_fd(conn, pty_m_fd)
|
|
873
|
+
# We don't need to block here to wait for the websocket
|
|
874
|
+
# handler, as SSH will continue by itself once auth
|
|
875
|
+
# is complete.
|
|
876
|
+
except socket.timeout:
|
|
877
|
+
logger.debug('Timeout waiting for interactive auth connection')
|
|
878
|
+
except Exception as e: # pylint: disable=broad-except
|
|
879
|
+
logger.error(f'Error in Unix socket connection: '
|
|
880
|
+
f'{common_utils.format_exception(e)}')
|
|
881
|
+
finally:
|
|
882
|
+
if conn is not None:
|
|
883
|
+
try:
|
|
884
|
+
conn.close()
|
|
885
|
+
except Exception: # pylint: disable=broad-except
|
|
886
|
+
pass
|
|
887
|
+
try:
|
|
888
|
+
os.close(pty_m_fd)
|
|
889
|
+
except Exception: # pylint: disable=broad-except
|
|
890
|
+
pass
|
|
891
|
+
|
|
892
|
+
unix_sock_thread = threading.Thread(
|
|
893
|
+
target=handle_unix_socket_connection, daemon=True)
|
|
894
|
+
unix_sock_thread.start()
|
|
895
|
+
|
|
896
|
+
try:
|
|
897
|
+
|
|
898
|
+
def setup_pty_session():
|
|
899
|
+
# Set PTY as controlling terminal so SSH can access /dev/tty
|
|
900
|
+
# for keyboard-interactive auth. Without this:
|
|
901
|
+
# "can't open /dev/tty: Device not configured"
|
|
902
|
+
fcntl.ioctl(pty_s_fd, termios.TIOCSCTTY, 0)
|
|
903
|
+
# Ignore SIGHUP so ControlMaster survives when PTY closes.
|
|
904
|
+
signal.signal(signal.SIGHUP, signal.SIG_IGN)
|
|
905
|
+
# Ignore SIGTERM so ControlMaster survives subprocess_daemon
|
|
906
|
+
# killing the process group.
|
|
907
|
+
if self._ssh_proxy_jump is not None:
|
|
908
|
+
signal.signal(signal.SIGTERM, signal.SIG_IGN)
|
|
909
|
+
|
|
910
|
+
return log_lib.run_with_log(' '.join(command),
|
|
911
|
+
log_path,
|
|
912
|
+
require_outputs=require_outputs,
|
|
913
|
+
stream_logs=stream_logs,
|
|
914
|
+
process_stream=process_stream,
|
|
915
|
+
shell=True,
|
|
916
|
+
executable=executable,
|
|
917
|
+
preexec_fn=setup_pty_session,
|
|
918
|
+
**kwargs)
|
|
919
|
+
except Exception as e:
|
|
920
|
+
raise RuntimeError(f'Exception in setup: {e}') from e
|
|
921
|
+
finally:
|
|
922
|
+
# Clean up PTY fds and sockets.
|
|
923
|
+
fd_server.close()
|
|
924
|
+
if os.path.exists(fd_socket_path):
|
|
925
|
+
os.unlink(fd_socket_path)
|
|
926
|
+
try:
|
|
927
|
+
os.close(pty_m_fd)
|
|
928
|
+
except OSError:
|
|
929
|
+
pass # Already closed by background thread
|
|
930
|
+
os.close(pty_s_fd)
|
|
931
|
+
|
|
748
932
|
def close_cached_connection(self) -> None:
|
|
749
933
|
"""Close the cached connection to the remote machine.
|
|
750
934
|
|
|
@@ -785,6 +969,7 @@ class SSHCommandRunner(CommandRunner):
|
|
|
785
969
|
connect_timeout: Optional[int] = None,
|
|
786
970
|
source_bashrc: bool = False,
|
|
787
971
|
skip_num_lines: int = 0,
|
|
972
|
+
run_in_background: bool = False,
|
|
788
973
|
**kwargs) -> Union[int, Tuple[int, str, str]]:
|
|
789
974
|
"""Uses 'ssh' to run 'cmd' on a node with ip.
|
|
790
975
|
|
|
@@ -809,27 +994,32 @@ class SSHCommandRunner(CommandRunner):
|
|
|
809
994
|
output. This is used when the output is not processed by
|
|
810
995
|
SkyPilot but we still want to get rid of some warning messages,
|
|
811
996
|
such as SSH warnings.
|
|
997
|
+
run_in_background: Whether to run the command in the background.
|
|
812
998
|
|
|
813
999
|
Returns:
|
|
814
1000
|
returncode
|
|
815
1001
|
or
|
|
816
1002
|
A tuple of (returncode, stdout, stderr).
|
|
817
1003
|
"""
|
|
1004
|
+
|
|
818
1005
|
base_ssh_command = self.ssh_base_command(
|
|
819
1006
|
ssh_mode=ssh_mode,
|
|
820
1007
|
port_forward=port_forward,
|
|
821
1008
|
connect_timeout=connect_timeout)
|
|
1009
|
+
|
|
822
1010
|
if ssh_mode == SshMode.LOGIN:
|
|
823
1011
|
assert isinstance(cmd, list), 'cmd must be a list for login mode.'
|
|
824
1012
|
command = base_ssh_command + cmd
|
|
825
1013
|
proc = subprocess_utils.run(command, shell=False, check=False)
|
|
826
1014
|
return proc.returncode, '', ''
|
|
827
1015
|
|
|
828
|
-
command_str = self._get_command_to_run(
|
|
829
|
-
|
|
830
|
-
|
|
831
|
-
|
|
832
|
-
|
|
1016
|
+
command_str = self._get_command_to_run(
|
|
1017
|
+
cmd,
|
|
1018
|
+
process_stream,
|
|
1019
|
+
separate_stderr,
|
|
1020
|
+
skip_num_lines=skip_num_lines,
|
|
1021
|
+
source_bashrc=source_bashrc,
|
|
1022
|
+
run_in_background=run_in_background)
|
|
833
1023
|
command = base_ssh_command + [shlex.quote(command_str)]
|
|
834
1024
|
|
|
835
1025
|
log_dir = os.path.expanduser(os.path.dirname(log_path))
|
|
@@ -847,14 +1037,35 @@ class SSHCommandRunner(CommandRunner):
|
|
|
847
1037
|
else:
|
|
848
1038
|
command += [f'> {log_path}']
|
|
849
1039
|
executable = '/bin/bash'
|
|
850
|
-
|
|
851
|
-
|
|
852
|
-
|
|
853
|
-
|
|
854
|
-
|
|
855
|
-
|
|
856
|
-
|
|
857
|
-
|
|
1040
|
+
|
|
1041
|
+
result = log_lib.run_with_log(' '.join(command),
|
|
1042
|
+
log_path,
|
|
1043
|
+
require_outputs=require_outputs,
|
|
1044
|
+
stream_logs=stream_logs,
|
|
1045
|
+
process_stream=process_stream,
|
|
1046
|
+
shell=True,
|
|
1047
|
+
executable=executable,
|
|
1048
|
+
**kwargs)
|
|
1049
|
+
if not self.enable_interactive_auth:
|
|
1050
|
+
return result
|
|
1051
|
+
|
|
1052
|
+
if require_outputs:
|
|
1053
|
+
returncode, _, _ = result
|
|
1054
|
+
else:
|
|
1055
|
+
returncode = result
|
|
1056
|
+
|
|
1057
|
+
if returncode != 255:
|
|
1058
|
+
return result
|
|
1059
|
+
# Exit code 255 indicates an SSH connection error. It does not
|
|
1060
|
+
# necessarily mean an auth failure, but when ControlMaster is used,
|
|
1061
|
+
# the stdout/stderr does not contain the auth failure message,
|
|
1062
|
+
# which is why we don't check the output here, and just attempt
|
|
1063
|
+
# the interactive auth flow.
|
|
1064
|
+
session_id = str(uuid.uuid4())
|
|
1065
|
+
return self._retry_with_interactive_auth(session_id, command, log_path,
|
|
1066
|
+
require_outputs,
|
|
1067
|
+
process_stream, stream_logs,
|
|
1068
|
+
executable, **kwargs)
|
|
858
1069
|
|
|
859
1070
|
@timeline.event
|
|
860
1071
|
def rsync(
|
|
@@ -867,6 +1078,7 @@ class SSHCommandRunner(CommandRunner):
|
|
|
867
1078
|
log_path: str = os.devnull,
|
|
868
1079
|
stream_logs: bool = True,
|
|
869
1080
|
max_retry: int = 1,
|
|
1081
|
+
get_remote_home_dir: Callable[[], str] = lambda: '~',
|
|
870
1082
|
) -> None:
|
|
871
1083
|
"""Uses 'rsync' to sync 'source' to 'target'.
|
|
872
1084
|
|
|
@@ -879,6 +1091,8 @@ class SSHCommandRunner(CommandRunner):
|
|
|
879
1091
|
stream_logs: Stream logs to the stdout/stderr.
|
|
880
1092
|
max_retry: The maximum number of retries for the rsync command.
|
|
881
1093
|
This value should be non-negative.
|
|
1094
|
+
get_remote_home_dir: A callable that returns the remote home
|
|
1095
|
+
directory. Defaults to '~'.
|
|
882
1096
|
|
|
883
1097
|
Raises:
|
|
884
1098
|
exceptions.CommandError: rsync command failed.
|
|
@@ -892,6 +1106,7 @@ class SSHCommandRunner(CommandRunner):
|
|
|
892
1106
|
self.ssh_private_key,
|
|
893
1107
|
self.ssh_control_name,
|
|
894
1108
|
ssh_proxy_command=self._ssh_proxy_command,
|
|
1109
|
+
ssh_proxy_jump=self._ssh_proxy_jump,
|
|
895
1110
|
docker_ssh_proxy_command=docker_ssh_proxy_command,
|
|
896
1111
|
port=self.port,
|
|
897
1112
|
disable_control_master=self.disable_control_master))
|
|
@@ -903,7 +1118,8 @@ class SSHCommandRunner(CommandRunner):
|
|
|
903
1118
|
rsh_option=rsh_option,
|
|
904
1119
|
log_path=log_path,
|
|
905
1120
|
stream_logs=stream_logs,
|
|
906
|
-
max_retry=max_retry
|
|
1121
|
+
max_retry=max_retry,
|
|
1122
|
+
get_remote_home_dir=get_remote_home_dir)
|
|
907
1123
|
|
|
908
1124
|
|
|
909
1125
|
class KubernetesCommandRunner(CommandRunner):
|
|
@@ -1004,6 +1220,7 @@ class KubernetesCommandRunner(CommandRunner):
|
|
|
1004
1220
|
connect_timeout: Optional[int] = None,
|
|
1005
1221
|
source_bashrc: bool = False,
|
|
1006
1222
|
skip_num_lines: int = 0,
|
|
1223
|
+
run_in_background: bool = False,
|
|
1007
1224
|
**kwargs) -> Union[int, Tuple[int, str, str]]:
|
|
1008
1225
|
"""Uses 'kubectl exec' to run 'cmd' on a pod or deployment by its
|
|
1009
1226
|
name and namespace.
|
|
@@ -1028,6 +1245,7 @@ class KubernetesCommandRunner(CommandRunner):
|
|
|
1028
1245
|
output. This is used when the output is not processed by
|
|
1029
1246
|
SkyPilot but we still want to get rid of some warning messages,
|
|
1030
1247
|
such as SSH warnings.
|
|
1248
|
+
run_in_background: Whether to run the command in the background.
|
|
1031
1249
|
|
|
1032
1250
|
Returns:
|
|
1033
1251
|
returncode
|
|
@@ -1064,11 +1282,13 @@ class KubernetesCommandRunner(CommandRunner):
|
|
|
1064
1282
|
kubectl_base_command.append('-i')
|
|
1065
1283
|
kubectl_base_command += [*kubectl_args, '--']
|
|
1066
1284
|
|
|
1067
|
-
command_str = self._get_command_to_run(
|
|
1068
|
-
|
|
1069
|
-
|
|
1070
|
-
|
|
1071
|
-
|
|
1285
|
+
command_str = self._get_command_to_run(
|
|
1286
|
+
cmd,
|
|
1287
|
+
process_stream,
|
|
1288
|
+
separate_stderr,
|
|
1289
|
+
skip_num_lines=skip_num_lines,
|
|
1290
|
+
source_bashrc=source_bashrc,
|
|
1291
|
+
run_in_background=run_in_background)
|
|
1072
1292
|
command = kubectl_base_command + [
|
|
1073
1293
|
# It is important to use /bin/bash -c here to make sure we quote the
|
|
1074
1294
|
# command to be run properly. Otherwise, directly appending commands
|
|
@@ -1182,16 +1402,19 @@ class LocalProcessCommandRunner(CommandRunner):
|
|
|
1182
1402
|
connect_timeout: Optional[int] = None,
|
|
1183
1403
|
source_bashrc: bool = False,
|
|
1184
1404
|
skip_num_lines: int = 0,
|
|
1405
|
+
run_in_background: bool = False,
|
|
1185
1406
|
**kwargs) -> Union[int, Tuple[int, str, str]]:
|
|
1186
1407
|
"""Use subprocess to run the command."""
|
|
1187
1408
|
del port_forward, ssh_mode, connect_timeout # Unused.
|
|
1188
1409
|
|
|
1189
|
-
command_str = self._get_command_to_run(
|
|
1190
|
-
|
|
1191
|
-
|
|
1192
|
-
|
|
1193
|
-
|
|
1194
|
-
|
|
1410
|
+
command_str = self._get_command_to_run(
|
|
1411
|
+
cmd,
|
|
1412
|
+
process_stream,
|
|
1413
|
+
separate_stderr,
|
|
1414
|
+
skip_num_lines=skip_num_lines,
|
|
1415
|
+
source_bashrc=source_bashrc,
|
|
1416
|
+
use_login=False,
|
|
1417
|
+
run_in_background=run_in_background)
|
|
1195
1418
|
|
|
1196
1419
|
log_dir = os.path.expanduser(os.path.dirname(log_path))
|
|
1197
1420
|
os.makedirs(log_dir, exist_ok=True)
|
|
@@ -1247,3 +1470,134 @@ class LocalProcessCommandRunner(CommandRunner):
|
|
|
1247
1470
|
log_path=log_path,
|
|
1248
1471
|
stream_logs=stream_logs,
|
|
1249
1472
|
max_retry=max_retry)
|
|
1473
|
+
|
|
1474
|
+
|
|
1475
|
+
class SlurmCommandRunner(SSHCommandRunner):
|
|
1476
|
+
"""Runner for Slurm commands.
|
|
1477
|
+
|
|
1478
|
+
SlurmCommandRunner sends commands over an SSH connection through the Slurm
|
|
1479
|
+
controller, to the virtual instances.
|
|
1480
|
+
"""
|
|
1481
|
+
|
|
1482
|
+
def __init__(
|
|
1483
|
+
self,
|
|
1484
|
+
node: Tuple[str, int],
|
|
1485
|
+
ssh_user: str,
|
|
1486
|
+
ssh_private_key: Optional[str],
|
|
1487
|
+
*,
|
|
1488
|
+
sky_dir: str,
|
|
1489
|
+
skypilot_runtime_dir: str,
|
|
1490
|
+
job_id: str,
|
|
1491
|
+
slurm_node: str,
|
|
1492
|
+
**kwargs,
|
|
1493
|
+
):
|
|
1494
|
+
"""Initialize SlurmCommandRunner.
|
|
1495
|
+
|
|
1496
|
+
Example Usage:
|
|
1497
|
+
runner = SlurmCommandRunner(
|
|
1498
|
+
(ip, port),
|
|
1499
|
+
ssh_user,
|
|
1500
|
+
ssh_private_key,
|
|
1501
|
+
sky_dir=sky_dir,
|
|
1502
|
+
skypilot_runtime_dir=skypilot_runtime_dir,
|
|
1503
|
+
job_id=job_id,
|
|
1504
|
+
slurm_node=slurm_node)
|
|
1505
|
+
runner.run('ls -l', mode=SshMode.NON_INTERACTIVE)
|
|
1506
|
+
runner.rsync(source, target, up=True)
|
|
1507
|
+
|
|
1508
|
+
Args:
|
|
1509
|
+
node: (ip, port) The IP address and port of the remote machine
|
|
1510
|
+
(login node).
|
|
1511
|
+
ssh_user: SSH username.
|
|
1512
|
+
ssh_private_key: Path to SSH private key.
|
|
1513
|
+
sky_dir: The private directory for the SkyPilot cluster on the
|
|
1514
|
+
Slurm cluster.
|
|
1515
|
+
skypilot_runtime_dir: The directory for the SkyPilot runtime
|
|
1516
|
+
on the Slurm cluster.
|
|
1517
|
+
job_id: The Slurm job ID for this instance.
|
|
1518
|
+
slurm_node: The Slurm node hostname for this instance
|
|
1519
|
+
(compute node).
|
|
1520
|
+
**kwargs: Additional arguments forwarded to SSHCommandRunner
|
|
1521
|
+
(e.g., ssh_proxy_command).
|
|
1522
|
+
"""
|
|
1523
|
+
super().__init__(node, ssh_user, ssh_private_key, **kwargs)
|
|
1524
|
+
self.sky_dir = sky_dir
|
|
1525
|
+
self.skypilot_runtime_dir = skypilot_runtime_dir
|
|
1526
|
+
self.job_id = job_id
|
|
1527
|
+
self.slurm_node = slurm_node
|
|
1528
|
+
|
|
1529
|
+
def rsync(
|
|
1530
|
+
self,
|
|
1531
|
+
source: str,
|
|
1532
|
+
target: str,
|
|
1533
|
+
*,
|
|
1534
|
+
up: bool,
|
|
1535
|
+
log_path: str = os.devnull,
|
|
1536
|
+
stream_logs: bool = True,
|
|
1537
|
+
max_retry: int = 1,
|
|
1538
|
+
) -> None:
|
|
1539
|
+
"""Rsyncs files to/from the Slurm compute node using srun as transport.
|
|
1540
|
+
"""
|
|
1541
|
+
ssh_command = ' '.join(
|
|
1542
|
+
self.ssh_base_command(ssh_mode=SshMode.NON_INTERACTIVE,
|
|
1543
|
+
port_forward=None,
|
|
1544
|
+
connect_timeout=None))
|
|
1545
|
+
|
|
1546
|
+
# rsh command: parse job_id+node_list from $1, ssh to login node,
|
|
1547
|
+
# run srun with rsync command.
|
|
1548
|
+
rsh_option = (
|
|
1549
|
+
f'bash --norc --noprofile -c \''
|
|
1550
|
+
f'job_id=$(echo "$1" | cut -d+ -f1); '
|
|
1551
|
+
f'node_list=$(echo "$1" | cut -d+ -f2); '
|
|
1552
|
+
f'shift; ' # Shift past the encoded job_id+node_list
|
|
1553
|
+
f'exec {ssh_command} ' # SSH to login node to run srun
|
|
1554
|
+
f'srun --unbuffered --quiet --overlap '
|
|
1555
|
+
f'--jobid="$job_id" --nodelist="$node_list" --nodes=1 --ntasks=1 '
|
|
1556
|
+
f'"$@"'
|
|
1557
|
+
f'\' --')
|
|
1558
|
+
encoded_info = f'{self.job_id}+{self.slurm_node}'
|
|
1559
|
+
self._rsync(source,
|
|
1560
|
+
target,
|
|
1561
|
+
node_destination=encoded_info,
|
|
1562
|
+
up=up,
|
|
1563
|
+
rsh_option=rsh_option,
|
|
1564
|
+
log_path=log_path,
|
|
1565
|
+
stream_logs=stream_logs,
|
|
1566
|
+
max_retry=max_retry,
|
|
1567
|
+
get_remote_home_dir=lambda: self.sky_dir)
|
|
1568
|
+
|
|
1569
|
+
@timeline.event
|
|
1570
|
+
@context_utils.cancellation_guard
|
|
1571
|
+
def run(self, cmd: Union[str, List[str]],
|
|
1572
|
+
**kwargs) -> Union[int, Tuple[int, str, str]]:
|
|
1573
|
+
"""Run Slurm-supported user commands over an SSH connection.
|
|
1574
|
+
|
|
1575
|
+
Args:
|
|
1576
|
+
cmd: The Slurm-supported user command to run.
|
|
1577
|
+
|
|
1578
|
+
Returns:
|
|
1579
|
+
returncode
|
|
1580
|
+
or
|
|
1581
|
+
A tuple of (returncode, stdout, stderr).
|
|
1582
|
+
"""
|
|
1583
|
+
# Override $HOME so that each SkyPilot cluster's state is isolated
|
|
1584
|
+
# from one another. We rely on the assumption that ~ is exclusively
|
|
1585
|
+
# used by a cluster, and in Slurm that is not the case, as $HOME
|
|
1586
|
+
# could be part of a shared filesystem.
|
|
1587
|
+
# And similarly for SKY_RUNTIME_DIR. See constants.\
|
|
1588
|
+
# SKY_RUNTIME_DIR_ENV_VAR_KEY for more details.
|
|
1589
|
+
cmd = (
|
|
1590
|
+
f'export {constants.SKY_RUNTIME_DIR_ENV_VAR_KEY}='
|
|
1591
|
+
f'"{self.skypilot_runtime_dir}" && '
|
|
1592
|
+
# Set the uv cache directory to /tmp/uv_cache_$(id -u) to speed up
|
|
1593
|
+
# package installation while avoiding permission conflicts when
|
|
1594
|
+
# multiple users share the same host. Otherwise it defaults to
|
|
1595
|
+
# ~/.cache/uv.
|
|
1596
|
+
f'export UV_CACHE_DIR=/tmp/uv_cache_$(id -u) && '
|
|
1597
|
+
f'cd {self.sky_dir} && export HOME=$(pwd) && {cmd}')
|
|
1598
|
+
|
|
1599
|
+
cmd = (f'srun --unbuffered --quiet --overlap --jobid={self.job_id} '
|
|
1600
|
+
f'--nodelist={self.slurm_node} '
|
|
1601
|
+
f'--nodes=1 --ntasks=1 bash -c {shlex.quote(cmd)}')
|
|
1602
|
+
|
|
1603
|
+
return super().run(cmd, **kwargs)
|