skypilot-nightly 1.0.0.dev20251203__py3-none-any.whl → 1.0.0.dev20260112__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (245) hide show
  1. sky/__init__.py +6 -2
  2. sky/adaptors/aws.py +1 -61
  3. sky/adaptors/slurm.py +565 -0
  4. sky/backends/backend_utils.py +95 -12
  5. sky/backends/cloud_vm_ray_backend.py +224 -65
  6. sky/backends/task_codegen.py +380 -4
  7. sky/catalog/__init__.py +0 -3
  8. sky/catalog/data_fetchers/fetch_gcp.py +9 -1
  9. sky/catalog/data_fetchers/fetch_nebius.py +1 -1
  10. sky/catalog/data_fetchers/fetch_vast.py +4 -2
  11. sky/catalog/kubernetes_catalog.py +12 -4
  12. sky/catalog/seeweb_catalog.py +30 -15
  13. sky/catalog/shadeform_catalog.py +5 -2
  14. sky/catalog/slurm_catalog.py +236 -0
  15. sky/catalog/vast_catalog.py +30 -6
  16. sky/check.py +25 -11
  17. sky/client/cli/command.py +391 -32
  18. sky/client/interactive_utils.py +190 -0
  19. sky/client/sdk.py +64 -2
  20. sky/client/sdk_async.py +9 -0
  21. sky/clouds/__init__.py +2 -0
  22. sky/clouds/aws.py +60 -2
  23. sky/clouds/azure.py +2 -0
  24. sky/clouds/cloud.py +7 -0
  25. sky/clouds/kubernetes.py +2 -0
  26. sky/clouds/runpod.py +38 -7
  27. sky/clouds/slurm.py +610 -0
  28. sky/clouds/ssh.py +3 -2
  29. sky/clouds/vast.py +39 -16
  30. sky/core.py +197 -37
  31. sky/dashboard/out/404.html +1 -1
  32. sky/dashboard/out/_next/static/3nu-b8raeKRNABZ2d4GAG/_buildManifest.js +1 -0
  33. sky/dashboard/out/_next/static/chunks/1871-0565f8975a7dcd10.js +6 -0
  34. sky/dashboard/out/_next/static/chunks/2109-55a1546d793574a7.js +11 -0
  35. sky/dashboard/out/_next/static/chunks/2521-099b07cd9e4745bf.js +26 -0
  36. sky/dashboard/out/_next/static/chunks/2755.a636e04a928a700e.js +31 -0
  37. sky/dashboard/out/_next/static/chunks/3495.05eab4862217c1a5.js +6 -0
  38. sky/dashboard/out/_next/static/chunks/3785.cfc5dcc9434fd98c.js +1 -0
  39. sky/dashboard/out/_next/static/chunks/3850-fd5696f3bbbaddae.js +1 -0
  40. sky/dashboard/out/_next/static/chunks/3981.645d01bf9c8cad0c.js +21 -0
  41. sky/dashboard/out/_next/static/chunks/4083-0115d67c1fb57d6c.js +21 -0
  42. sky/dashboard/out/_next/static/chunks/{8640.5b9475a2d18c5416.js → 429.a58e9ba9742309ed.js} +2 -2
  43. sky/dashboard/out/_next/static/chunks/4555.8e221537181b5dc1.js +6 -0
  44. sky/dashboard/out/_next/static/chunks/4725.937865b81fdaaebb.js +6 -0
  45. sky/dashboard/out/_next/static/chunks/6082-edabd8f6092300ce.js +25 -0
  46. sky/dashboard/out/_next/static/chunks/6989-49cb7dca83a7a62d.js +1 -0
  47. sky/dashboard/out/_next/static/chunks/6990-630bd2a2257275f8.js +1 -0
  48. sky/dashboard/out/_next/static/chunks/7248-a99800d4db8edabd.js +1 -0
  49. sky/dashboard/out/_next/static/chunks/754-cfc5d4ad1b843d29.js +18 -0
  50. sky/dashboard/out/_next/static/chunks/8050-dd8aa107b17dce00.js +16 -0
  51. sky/dashboard/out/_next/static/chunks/8056-d4ae1e0cb81e7368.js +1 -0
  52. sky/dashboard/out/_next/static/chunks/8555.011023e296c127b3.js +6 -0
  53. sky/dashboard/out/_next/static/chunks/8821-93c25df904a8362b.js +1 -0
  54. sky/dashboard/out/_next/static/chunks/8969-0662594b69432ade.js +1 -0
  55. sky/dashboard/out/_next/static/chunks/9025.f15c91c97d124a5f.js +6 -0
  56. sky/dashboard/out/_next/static/chunks/9353-7ad6bd01858556f1.js +1 -0
  57. sky/dashboard/out/_next/static/chunks/pages/_app-5a86569acad99764.js +34 -0
  58. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-8297476714acb4ac.js +6 -0
  59. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-337c3ba1085f1210.js +1 -0
  60. sky/dashboard/out/_next/static/chunks/pages/{clusters-ee39056f9851a3ff.js → clusters-57632ff3684a8b5c.js} +1 -1
  61. sky/dashboard/out/_next/static/chunks/pages/{config-dfb9bf07b13045f4.js → config-718cdc365de82689.js} +1 -1
  62. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-5fd3a453c079c2ea.js +1 -0
  63. sky/dashboard/out/_next/static/chunks/pages/infra-9f85c02c9c6cae9e.js +1 -0
  64. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-90f16972cbecf354.js +1 -0
  65. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-2dd42fc37aad427a.js +16 -0
  66. sky/dashboard/out/_next/static/chunks/pages/jobs-ed806aeace26b972.js +1 -0
  67. sky/dashboard/out/_next/static/chunks/pages/plugins/[...slug]-449a9f5a3bb20fb3.js +1 -0
  68. sky/dashboard/out/_next/static/chunks/pages/users-bec34706b36f3524.js +1 -0
  69. sky/dashboard/out/_next/static/chunks/pages/{volumes-b84b948ff357c43e.js → volumes-a83ba9b38dff7ea9.js} +1 -1
  70. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-84a40f8c7c627fe4.js → [name]-c781e9c3e52ef9fc.js} +1 -1
  71. sky/dashboard/out/_next/static/chunks/pages/workspaces-91e0942f47310aae.js +1 -0
  72. sky/dashboard/out/_next/static/chunks/webpack-cfe59cf684ee13b9.js +1 -0
  73. sky/dashboard/out/_next/static/css/b0dbca28f027cc19.css +3 -0
  74. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  75. sky/dashboard/out/clusters/[cluster].html +1 -1
  76. sky/dashboard/out/clusters.html +1 -1
  77. sky/dashboard/out/config.html +1 -1
  78. sky/dashboard/out/index.html +1 -1
  79. sky/dashboard/out/infra/[context].html +1 -1
  80. sky/dashboard/out/infra.html +1 -1
  81. sky/dashboard/out/jobs/[job].html +1 -1
  82. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  83. sky/dashboard/out/jobs.html +1 -1
  84. sky/dashboard/out/plugins/[...slug].html +1 -0
  85. sky/dashboard/out/users.html +1 -1
  86. sky/dashboard/out/volumes.html +1 -1
  87. sky/dashboard/out/workspace/new.html +1 -1
  88. sky/dashboard/out/workspaces/[name].html +1 -1
  89. sky/dashboard/out/workspaces.html +1 -1
  90. sky/data/data_utils.py +26 -12
  91. sky/data/mounting_utils.py +44 -5
  92. sky/global_user_state.py +111 -19
  93. sky/jobs/client/sdk.py +8 -3
  94. sky/jobs/controller.py +191 -31
  95. sky/jobs/recovery_strategy.py +109 -11
  96. sky/jobs/server/core.py +81 -4
  97. sky/jobs/server/server.py +14 -0
  98. sky/jobs/state.py +417 -19
  99. sky/jobs/utils.py +73 -80
  100. sky/models.py +11 -0
  101. sky/optimizer.py +8 -6
  102. sky/provision/__init__.py +12 -9
  103. sky/provision/common.py +20 -0
  104. sky/provision/docker_utils.py +15 -2
  105. sky/provision/kubernetes/utils.py +163 -20
  106. sky/provision/kubernetes/volume.py +52 -17
  107. sky/provision/provisioner.py +17 -7
  108. sky/provision/runpod/instance.py +3 -1
  109. sky/provision/runpod/utils.py +13 -1
  110. sky/provision/runpod/volume.py +25 -9
  111. sky/provision/slurm/__init__.py +12 -0
  112. sky/provision/slurm/config.py +13 -0
  113. sky/provision/slurm/instance.py +618 -0
  114. sky/provision/slurm/utils.py +689 -0
  115. sky/provision/vast/instance.py +4 -1
  116. sky/provision/vast/utils.py +11 -6
  117. sky/resources.py +135 -13
  118. sky/schemas/api/responses.py +4 -0
  119. sky/schemas/db/global_user_state/010_save_ssh_key.py +1 -1
  120. sky/schemas/db/spot_jobs/008_add_full_resources.py +34 -0
  121. sky/schemas/db/spot_jobs/009_job_events.py +32 -0
  122. sky/schemas/db/spot_jobs/010_job_events_timestamp_with_timezone.py +43 -0
  123. sky/schemas/db/spot_jobs/011_add_links.py +34 -0
  124. sky/schemas/generated/jobsv1_pb2.py +9 -5
  125. sky/schemas/generated/jobsv1_pb2.pyi +12 -0
  126. sky/schemas/generated/jobsv1_pb2_grpc.py +44 -0
  127. sky/schemas/generated/managed_jobsv1_pb2.py +32 -28
  128. sky/schemas/generated/managed_jobsv1_pb2.pyi +11 -2
  129. sky/serve/serve_utils.py +232 -40
  130. sky/serve/server/impl.py +1 -1
  131. sky/server/common.py +17 -0
  132. sky/server/constants.py +1 -1
  133. sky/server/metrics.py +6 -3
  134. sky/server/plugins.py +238 -0
  135. sky/server/requests/executor.py +5 -2
  136. sky/server/requests/payloads.py +30 -1
  137. sky/server/requests/request_names.py +4 -0
  138. sky/server/requests/requests.py +33 -11
  139. sky/server/requests/serializers/encoders.py +22 -0
  140. sky/server/requests/serializers/return_value_serializers.py +70 -0
  141. sky/server/server.py +506 -109
  142. sky/server/server_utils.py +30 -0
  143. sky/server/uvicorn.py +5 -0
  144. sky/setup_files/MANIFEST.in +1 -0
  145. sky/setup_files/dependencies.py +22 -9
  146. sky/sky_logging.py +2 -1
  147. sky/skylet/attempt_skylet.py +13 -3
  148. sky/skylet/constants.py +55 -13
  149. sky/skylet/events.py +10 -4
  150. sky/skylet/executor/__init__.py +1 -0
  151. sky/skylet/executor/slurm.py +187 -0
  152. sky/skylet/job_lib.py +91 -5
  153. sky/skylet/log_lib.py +22 -6
  154. sky/skylet/log_lib.pyi +8 -6
  155. sky/skylet/services.py +18 -3
  156. sky/skylet/skylet.py +5 -1
  157. sky/skylet/subprocess_daemon.py +2 -1
  158. sky/ssh_node_pools/constants.py +12 -0
  159. sky/ssh_node_pools/core.py +40 -3
  160. sky/ssh_node_pools/deploy/__init__.py +4 -0
  161. sky/{utils/kubernetes/deploy_ssh_node_pools.py → ssh_node_pools/deploy/deploy.py} +279 -504
  162. sky/ssh_node_pools/deploy/tunnel/ssh-tunnel.sh +379 -0
  163. sky/ssh_node_pools/deploy/tunnel_utils.py +199 -0
  164. sky/ssh_node_pools/deploy/utils.py +173 -0
  165. sky/ssh_node_pools/server.py +11 -13
  166. sky/{utils/kubernetes/ssh_utils.py → ssh_node_pools/utils.py} +9 -6
  167. sky/templates/kubernetes-ray.yml.j2 +12 -6
  168. sky/templates/slurm-ray.yml.j2 +115 -0
  169. sky/templates/vast-ray.yml.j2 +1 -0
  170. sky/templates/websocket_proxy.py +18 -41
  171. sky/users/model.conf +1 -1
  172. sky/users/permission.py +85 -52
  173. sky/users/rbac.py +31 -3
  174. sky/utils/annotations.py +108 -8
  175. sky/utils/auth_utils.py +42 -0
  176. sky/utils/cli_utils/status_utils.py +19 -5
  177. sky/utils/cluster_utils.py +10 -3
  178. sky/utils/command_runner.py +389 -35
  179. sky/utils/command_runner.pyi +43 -4
  180. sky/utils/common_utils.py +47 -31
  181. sky/utils/context.py +32 -0
  182. sky/utils/db/db_utils.py +36 -6
  183. sky/utils/db/migration_utils.py +41 -21
  184. sky/utils/infra_utils.py +5 -1
  185. sky/utils/instance_links.py +139 -0
  186. sky/utils/interactive_utils.py +49 -0
  187. sky/utils/kubernetes/generate_kubeconfig.sh +42 -33
  188. sky/utils/kubernetes/kubernetes_deploy_utils.py +2 -94
  189. sky/utils/kubernetes/rsync_helper.sh +5 -1
  190. sky/utils/kubernetes/ssh-tunnel.sh +7 -376
  191. sky/utils/plugin_extensions/__init__.py +14 -0
  192. sky/utils/plugin_extensions/external_failure_source.py +176 -0
  193. sky/utils/resources_utils.py +10 -8
  194. sky/utils/rich_utils.py +9 -11
  195. sky/utils/schemas.py +93 -19
  196. sky/utils/status_lib.py +7 -0
  197. sky/utils/subprocess_utils.py +17 -0
  198. sky/volumes/client/sdk.py +6 -3
  199. sky/volumes/server/core.py +65 -27
  200. sky_templates/ray/start_cluster +8 -4
  201. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/METADATA +67 -59
  202. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/RECORD +208 -180
  203. sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +0 -1
  204. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +0 -11
  205. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +0 -6
  206. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +0 -1
  207. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +0 -1
  208. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +0 -15
  209. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +0 -26
  210. sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +0 -1
  211. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +0 -1
  212. sky/dashboard/out/_next/static/chunks/3800-7b45f9fbb6308557.js +0 -1
  213. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +0 -1
  214. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +0 -1
  215. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +0 -15
  216. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +0 -13
  217. sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +0 -1
  218. sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +0 -1
  219. sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +0 -1
  220. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +0 -30
  221. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +0 -41
  222. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +0 -1
  223. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +0 -1
  224. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +0 -6
  225. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +0 -1
  226. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +0 -31
  227. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +0 -30
  228. sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +0 -34
  229. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +0 -16
  230. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +0 -1
  231. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +0 -1
  232. sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +0 -1
  233. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +0 -16
  234. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +0 -21
  235. sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +0 -1
  236. sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +0 -1
  237. sky/dashboard/out/_next/static/chunks/pages/workspaces-531b2f8c4bf89f82.js +0 -1
  238. sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +0 -1
  239. sky/dashboard/out/_next/static/css/0748ce22df867032.css +0 -3
  240. /sky/dashboard/out/_next/static/{96_E2yl3QAiIJGOYCkSpB → 3nu-b8raeKRNABZ2d4GAG}/_ssgManifest.js +0 -0
  241. /sky/{utils/kubernetes → ssh_node_pools/deploy/tunnel}/cleanup-tunnel.sh +0 -0
  242. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/WHEEL +0 -0
  243. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/entry_points.txt +0 -0
  244. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/licenses/LICENSE +0 -0
  245. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/top_level.txt +0 -0
@@ -1,14 +1,23 @@
1
1
  """Runner for commands to be executed on the cluster."""
2
2
  import enum
3
+ import fcntl
3
4
  import hashlib
4
5
  import os
5
6
  import pathlib
7
+ import pty
6
8
  import re
7
9
  import shlex
10
+ import signal
11
+ import socket
8
12
  import sys
13
+ import termios
14
+ import threading
9
15
  import time
10
16
  from typing import (Any, Callable, Dict, Iterable, List, Optional, Tuple, Type,
11
17
  Union)
18
+ import uuid
19
+
20
+ import colorama
12
21
 
13
22
  from sky import exceptions
14
23
  from sky import sky_logging
@@ -19,6 +28,7 @@ from sky.utils import common_utils
19
28
  from sky.utils import context_utils
20
29
  from sky.utils import control_master_utils
21
30
  from sky.utils import git as git_utils
31
+ from sky.utils import interactive_utils
22
32
  from sky.utils import subprocess_utils
23
33
  from sky.utils import timeline
24
34
 
@@ -63,6 +73,22 @@ def _ssh_control_path(ssh_control_filename: Optional[str]) -> Optional[str]:
63
73
  return path
64
74
 
65
75
 
76
+ def _is_skypilot_managed_key(key_path: str) -> bool:
77
+ """Check if SSH key follows SkyPilot's managed key format.
78
+
79
+ SkyPilot-managed keys follow the pattern: ~/.sky/clients/<hash>/ssh/sky-key
80
+ External keys (like ~/.ssh/id_rsa) do not follow this pattern.
81
+
82
+ Args:
83
+ key_path: Path to the SSH private key.
84
+
85
+ Returns:
86
+ True if the key follows SkyPilot's managed format, False otherwise.
87
+ """
88
+ parts = os.path.normpath(key_path).split(os.path.sep)
89
+ return len(parts) >= 2 and parts[-1] == 'sky-key' and parts[-2] == 'ssh'
90
+
91
+
66
92
  # Disable sudo for root user. This is useful when the command is running in a
67
93
  # docker container, i.e. image_id is a docker image.
68
94
  ALIAS_SUDO_TO_EMPTY_FOR_ROOT_CMD = (
@@ -74,10 +100,12 @@ def ssh_options_list(
74
100
  ssh_control_name: Optional[str],
75
101
  *,
76
102
  ssh_proxy_command: Optional[str] = None,
103
+ ssh_proxy_jump: Optional[str] = None,
77
104
  docker_ssh_proxy_command: Optional[str] = None,
78
105
  connect_timeout: Optional[int] = None,
79
106
  port: int = 22,
80
107
  disable_control_master: Optional[bool] = False,
108
+ escape_percent_expand: bool = False,
81
109
  ) -> List[str]:
82
110
  """Returns a list of sane options for 'ssh'."""
83
111
  if connect_timeout is None:
@@ -117,11 +145,11 @@ def ssh_options_list(
117
145
  # SSH Control will have a severe delay when using docker_ssh_proxy_command.
118
146
  # TODO(tian): Investigate why.
119
147
  #
120
- # We disable ControlMaster when ssh_proxy_command is used, because the
121
- # master connection will be idle although the connection might be shared
122
- # by other ssh commands that is not idle. In that case, user's custom proxy
123
- # command may drop the connection due to idle timeout, since it will only
124
- # see the idle master connection. It is an issue even with the
148
+ # We disable ControlMaster when ssh_proxy_command is used,
149
+ # because the master connection will be idle although the connection might
150
+ # be shared by other ssh commands that is not idle. In that case, user's
151
+ # custom proxy command may drop the connection due to idle timeout, since it
152
+ # will only see the idle master connection. It is an issue even with the
125
153
  # ServerAliveInterval set, since the keepalive message may not be recognized
126
154
  # by the custom proxy command, such as AWS SSM Session Manager.
127
155
  #
@@ -132,11 +160,14 @@ def ssh_options_list(
132
160
  # 'ControlPersist' number of seconds delay per ssh commands ran.
133
161
  if (ssh_control_name is not None and docker_ssh_proxy_command is None and
134
162
  ssh_proxy_command is None and not disable_control_master):
163
+ control_path = f'{_ssh_control_path(ssh_control_name)}/%C'
164
+ if escape_percent_expand:
165
+ control_path = control_path.replace('%', '%%')
135
166
  arg_dict.update({
136
167
  # Control path: important optimization as we do multiple ssh in one
137
168
  # sky.launch().
138
169
  'ControlMaster': 'auto',
139
- 'ControlPath': f'{_ssh_control_path(ssh_control_name)}/%C',
170
+ 'ControlPath': control_path,
140
171
  'ControlPersist': '300s',
141
172
  })
142
173
  ssh_key_option = [
@@ -158,6 +189,15 @@ def ssh_options_list(
158
189
  'ProxyCommand': shlex.quote(ssh_proxy_command),
159
190
  })
160
191
 
192
+ if ssh_proxy_jump is not None:
193
+ logger.debug(f'--- ProxyJump: {ssh_proxy_jump} ---')
194
+ if ssh_proxy_command is not None:
195
+ logger.warning('Both ProxyCommand and ProxyJump are specified. '
196
+ 'ProxyCommand will take precedence.')
197
+ arg_dict.update({
198
+ 'ProxyJump': shlex.quote(ssh_proxy_jump),
199
+ })
200
+
161
201
  return ssh_key_option + [
162
202
  x for y in (['-o', f'{k}={v}']
163
203
  for k, v in arg_dict.items()
@@ -217,6 +257,7 @@ class CommandRunner:
217
257
  skip_num_lines: int,
218
258
  source_bashrc: bool = False,
219
259
  use_login: bool = True,
260
+ run_in_background: bool = False,
220
261
  ) -> str:
221
262
  """Returns the command to run."""
222
263
  if isinstance(cmd, list):
@@ -247,7 +288,11 @@ class CommandRunner:
247
288
  ]
248
289
  if not separate_stderr:
249
290
  command.append('2>&1')
291
+ if run_in_background:
292
+ command = ['nohup'] + command + ['&']
250
293
  if not process_stream and skip_num_lines:
294
+ assert not run_in_background, (
295
+ 'run_in_background and skip_num_lines cannot be used together')
251
296
  command += [
252
297
  # A hack to remove the following bash warnings (twice):
253
298
  # bash: cannot set terminal process group
@@ -408,6 +453,7 @@ class CommandRunner:
408
453
  connect_timeout: Optional[int] = None,
409
454
  source_bashrc: bool = False,
410
455
  skip_num_lines: int = 0,
456
+ run_in_background: bool = False,
411
457
  **kwargs) -> Union[int, Tuple[int, str, str]]:
412
458
  """Runs the command on the cluster.
413
459
 
@@ -426,6 +472,7 @@ class CommandRunner:
426
472
  output. This is used when the output is not processed by
427
473
  SkyPilot but we still want to get rid of some warning messages,
428
474
  such as SSH warnings.
475
+ run_in_background: Whether to run the command in the background.
429
476
 
430
477
  Returns:
431
478
  returncode
@@ -603,17 +650,19 @@ class SSHCommandRunner(CommandRunner):
603
650
  self,
604
651
  node: Tuple[str, int],
605
652
  ssh_user: str,
606
- ssh_private_key: str,
653
+ ssh_private_key: Optional[str],
607
654
  ssh_control_name: Optional[str] = '__default__',
608
655
  ssh_proxy_command: Optional[str] = None,
656
+ ssh_proxy_jump: Optional[str] = None,
609
657
  docker_user: Optional[str] = None,
610
658
  disable_control_master: Optional[bool] = False,
611
659
  port_forward_execute_remote_command: Optional[bool] = False,
660
+ enable_interactive_auth: bool = False,
612
661
  ):
613
662
  """Initialize SSHCommandRunner.
614
663
 
615
664
  Example Usage:
616
- runner = SSHCommandRunner(ip, ssh_user, ssh_private_key)
665
+ runner = SSHCommandRunner((ip, port), ssh_user, ssh_private_key)
617
666
  runner.run('ls -l', mode=SshMode.NON_INTERACTIVE)
618
667
  runner.rsync(source, target, up=True)
619
668
 
@@ -628,6 +677,8 @@ class SSHCommandRunner(CommandRunner):
628
677
  ssh_proxy_command: Optional, the value to pass to '-o
629
678
  ProxyCommand'. Useful for communicating with clusters without
630
679
  public IPs using a "jump server".
680
+ ssh_proxy_jump: Optional, the value to pass to '-o ProxyJump' flag.
681
+ Similar to ssh_proxy_command, but more modern.
631
682
  port: The port to use for ssh.
632
683
  docker_user: The docker user to use for ssh. If specified, the
633
684
  command will be run inside a docker container which have a ssh
@@ -647,11 +698,21 @@ class SSHCommandRunner(CommandRunner):
647
698
  None if ssh_control_name is None else hashlib.md5(
648
699
  ssh_control_name.encode()).hexdigest()[:_HASH_MAX_LENGTH])
649
700
  self._ssh_proxy_command = ssh_proxy_command
701
+ self._ssh_proxy_jump = ssh_proxy_jump
650
702
  self.disable_control_master = (
651
703
  disable_control_master or
652
704
  control_master_utils.should_disable_control_master())
653
- # ensure the ssh key files are created from the database
654
- auth_utils.create_ssh_key_files_from_db(ssh_private_key)
705
+ # Ensure SSH key is available. For SkyPilot-managed keys, create from
706
+ # database. For external keys (e.g., Slurm clusters), verify existence.
707
+ if ssh_private_key is not None and _is_skypilot_managed_key(
708
+ ssh_private_key):
709
+ auth_utils.create_ssh_key_files_from_db(ssh_private_key)
710
+ elif ssh_private_key is not None:
711
+ # Externally managed key - just verify it exists
712
+ expanded_key_path = os.path.expanduser(ssh_private_key)
713
+ if not os.path.exists(expanded_key_path):
714
+ raise FileNotFoundError(
715
+ f'SSH private key not found: {expanded_key_path}')
655
716
  if docker_user is not None:
656
717
  assert port is None or port == 22, (
657
718
  f'port must be None or 22 for docker_user, got {port}.')
@@ -687,6 +748,7 @@ class SSHCommandRunner(CommandRunner):
687
748
  self._docker_ssh_proxy_command = None
688
749
  self.port_forward_execute_remote_command = (
689
750
  port_forward_execute_remote_command)
751
+ self.enable_interactive_auth = enable_interactive_auth
690
752
 
691
753
  def port_forward_command(
692
754
  self,
@@ -738,6 +800,7 @@ class SSHCommandRunner(CommandRunner):
738
800
  self.ssh_private_key,
739
801
  self.ssh_control_name,
740
802
  ssh_proxy_command=self._ssh_proxy_command,
803
+ ssh_proxy_jump=self._ssh_proxy_jump,
741
804
  docker_ssh_proxy_command=docker_ssh_proxy_command,
742
805
  port=self.port,
743
806
  connect_timeout=connect_timeout,
@@ -745,6 +808,127 @@ class SSHCommandRunner(CommandRunner):
745
808
  f'{self.ssh_user}@{self.ip}'
746
809
  ]
747
810
 
811
def _retry_with_interactive_auth(
        self, session_id: str, command: List[str], log_path: str,
        require_outputs: bool, process_stream: bool, stream_logs: bool,
        executable: str,
        **kwargs) -> Union[int, Tuple[int, str, str], Tuple[int, int]]:
    """Retries command with interactive auth.

    This handles SSH connections requiring keyboard-interactive
    authentication (e.g., 2FA) by using a PTY for auth prompts and
    establishing a persistent ControlMaster socket (if enabled) that
    other SSH sessions can reuse without re-authenticating.

    The PTY master fd is handed to the websocket handler over a Unix
    socket so that the client can answer the auth prompts. Command output
    flows through normal stdout/stderr pipes, which gets printed to
    log_path.

    See ssh_options_list for when ControlMaster is not enabled.

    Args:
        session_id: Identifier used to derive the Unix socket path and to
            tell the client which interactive session to attach to.
        command: The SSH command as an argument list; extra options are
            inserted right after its first element.
        log_path: Passed through to log_lib.run_with_log.
        require_outputs: Passed through to log_lib.run_with_log.
        process_stream: Passed through to log_lib.run_with_log.
        stream_logs: Passed through to log_lib.run_with_log.
        executable: Shell executable used to run the joined command.
        **kwargs: Extra keyword arguments for log_lib.run_with_log.

    Returns:
        Whatever log_lib.run_with_log returns for the retried command.

    Raises:
        RuntimeError: If log_lib.run_with_log raises; the original
            exception is chained as the cause.
    """
    extra_options = [
        # Override ControlPersist to reduce frequency of manual user
        # intervention. The default from ssh_options_list is only 5m.
        #
        # NOTE: When used with ProxyJump, the connection can die
        # earlier than expected, so it is recommended to also enable
        # ControlMaster on the jump host's SSH config. It is hard to
        # tell why exactly, because enabling -v makes this problem
        # disappear for some reason.
        '-o',
        'ControlPersist=1d',
    ]
    if self._ssh_proxy_jump is not None:
        logger.warning(f'{colorama.Fore.YELLOW}When using ProxyJump, it is '
                       'recommended to also enable ControlMaster on the '
                       'jump host\'s SSH config to keep the authenticated '
                       f'connection alive for longer.{colorama.Fore.RESET}')
    command = command[:1] + extra_options + command[1:]

    # Create PTY for SSH. PTY slave for stdin from user, PTY master
    # for password/auth prompts from SSH.
    pty_m_fd, pty_s_fd = pty.openpty()

    # The PTY master fd is shared between this thread and the background
    # thread below. Closing the same fd number twice is unsafe in a
    # multi-threaded process (the number may already have been reused for
    # an unrelated file), so every close goes through close_pty_master(),
    # which closes at most once.
    master_fd_lock = threading.Lock()
    master_fd_open = True

    def close_pty_master() -> None:
        """Close the PTY master fd exactly once, from whichever thread."""
        nonlocal master_fd_open
        with master_fd_lock:
            if not master_fd_open:
                return
            master_fd_open = False
            try:
                os.close(pty_m_fd)
            except OSError:
                pass

    def remove_socket_file(path: str) -> None:
        """Remove the Unix socket file if present, without a TOCTOU check."""
        try:
            os.unlink(path)
        except FileNotFoundError:
            pass

    try:
        # Create Unix socket to pass PTY master fd to websocket handler.
        # This happens inside the outer try so that a failure here does not
        # leak the PTY fds opened above.
        fd_socket_path = interactive_utils.get_pty_socket_path(session_id)
        remove_socket_file(fd_socket_path)
        fd_server = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
        try:
            fd_server.bind(fd_socket_path)
            fd_server.listen(1)
            fd_server.settimeout(60)

            # Signal client to initiate websocket for interactive auth
            interactive_signal = (
                f'<sky-interactive session="{session_id}"/>')
            print(interactive_signal, flush=True)

            def handle_unix_socket_connection() -> None:
                """Background thread to handle Unix socket connection."""
                conn = None
                try:
                    # Wait for websocket handler to connect.
                    conn, _ = fd_server.accept()
                    # Send PTY master fd through Unix socket. Hold the lock
                    # so the fd cannot be closed (and its number reused)
                    # while it is being sent.
                    with master_fd_lock:
                        if master_fd_open:
                            interactive_utils.send_fd(conn, pty_m_fd)
                    # We don't need to block here to wait for the websocket
                    # handler, as SSH will continue by itself once auth
                    # is complete.
                except socket.timeout:
                    logger.debug(
                        'Timeout waiting for interactive auth connection')
                except Exception as e:  # pylint: disable=broad-except
                    logger.error(f'Error in Unix socket connection: '
                                 f'{common_utils.format_exception(e)}')
                finally:
                    if conn is not None:
                        try:
                            conn.close()
                        except Exception:  # pylint: disable=broad-except
                            pass
                    close_pty_master()

            unix_sock_thread = threading.Thread(
                target=handle_unix_socket_connection, daemon=True)
            unix_sock_thread.start()

            def setup_pty_session():
                # Set PTY as controlling terminal so SSH can access /dev/tty
                # for keyboard-interactive auth. Without this:
                # "can't open /dev/tty: Device not configured"
                fcntl.ioctl(pty_s_fd, termios.TIOCSCTTY, 0)
                # Ignore SIGHUP so ControlMaster survives when PTY closes.
                signal.signal(signal.SIGHUP, signal.SIG_IGN)
                # Ignore SIGTERM so ControlMaster survives subprocess_daemon
                # killing the process group.
                if self._ssh_proxy_jump is not None:
                    signal.signal(signal.SIGTERM, signal.SIG_IGN)

            try:
                return log_lib.run_with_log(' '.join(command),
                                            log_path,
                                            require_outputs=require_outputs,
                                            stream_logs=stream_logs,
                                            process_stream=process_stream,
                                            shell=True,
                                            executable=executable,
                                            preexec_fn=setup_pty_session,
                                            **kwargs)
            except Exception as e:
                raise RuntimeError(f'Exception in setup: {e}') from e
        finally:
            # Clean up the listening socket and its filesystem entry.
            fd_server.close()
            remove_socket_file(fd_socket_path)
    finally:
        # Clean up PTY fds. The master may already have been closed by the
        # background thread, in which case this is a no-op.
        close_pty_master()
        os.close(pty_s_fd)
931
+
748
932
  def close_cached_connection(self) -> None:
749
933
  """Close the cached connection to the remote machine.
750
934
 
@@ -785,6 +969,7 @@ class SSHCommandRunner(CommandRunner):
785
969
  connect_timeout: Optional[int] = None,
786
970
  source_bashrc: bool = False,
787
971
  skip_num_lines: int = 0,
972
+ run_in_background: bool = False,
788
973
  **kwargs) -> Union[int, Tuple[int, str, str]]:
789
974
  """Uses 'ssh' to run 'cmd' on a node with ip.
790
975
 
@@ -809,27 +994,32 @@ class SSHCommandRunner(CommandRunner):
809
994
  output. This is used when the output is not processed by
810
995
  SkyPilot but we still want to get rid of some warning messages,
811
996
  such as SSH warnings.
997
+ run_in_background: Whether to run the command in the background.
812
998
 
813
999
  Returns:
814
1000
  returncode
815
1001
  or
816
1002
  A tuple of (returncode, stdout, stderr).
817
1003
  """
1004
+
818
1005
  base_ssh_command = self.ssh_base_command(
819
1006
  ssh_mode=ssh_mode,
820
1007
  port_forward=port_forward,
821
1008
  connect_timeout=connect_timeout)
1009
+
822
1010
  if ssh_mode == SshMode.LOGIN:
823
1011
  assert isinstance(cmd, list), 'cmd must be a list for login mode.'
824
1012
  command = base_ssh_command + cmd
825
1013
  proc = subprocess_utils.run(command, shell=False, check=False)
826
1014
  return proc.returncode, '', ''
827
1015
 
828
- command_str = self._get_command_to_run(cmd,
829
- process_stream,
830
- separate_stderr,
831
- skip_num_lines=skip_num_lines,
832
- source_bashrc=source_bashrc)
1016
+ command_str = self._get_command_to_run(
1017
+ cmd,
1018
+ process_stream,
1019
+ separate_stderr,
1020
+ skip_num_lines=skip_num_lines,
1021
+ source_bashrc=source_bashrc,
1022
+ run_in_background=run_in_background)
833
1023
  command = base_ssh_command + [shlex.quote(command_str)]
834
1024
 
835
1025
  log_dir = os.path.expanduser(os.path.dirname(log_path))
@@ -847,14 +1037,35 @@ class SSHCommandRunner(CommandRunner):
847
1037
  else:
848
1038
  command += [f'> {log_path}']
849
1039
  executable = '/bin/bash'
850
- return log_lib.run_with_log(' '.join(command),
851
- log_path,
852
- require_outputs=require_outputs,
853
- stream_logs=stream_logs,
854
- process_stream=process_stream,
855
- shell=True,
856
- executable=executable,
857
- **kwargs)
1040
+
1041
+ result = log_lib.run_with_log(' '.join(command),
1042
+ log_path,
1043
+ require_outputs=require_outputs,
1044
+ stream_logs=stream_logs,
1045
+ process_stream=process_stream,
1046
+ shell=True,
1047
+ executable=executable,
1048
+ **kwargs)
1049
+ if not self.enable_interactive_auth:
1050
+ return result
1051
+
1052
+ if require_outputs:
1053
+ returncode, _, _ = result
1054
+ else:
1055
+ returncode = result
1056
+
1057
+ if returncode != 255:
1058
+ return result
1059
+ # Exit code 255 indicates an SSH connection error. It does not
1060
+ # necessarily mean an auth failure, but when ControlMaster is used,
1061
+ # the stdout/stderr does not contain the auth failure message,
1062
+ # which is why we don't check the output here, and just attempt
1063
+ # the interactive auth flow.
1064
+ session_id = str(uuid.uuid4())
1065
+ return self._retry_with_interactive_auth(session_id, command, log_path,
1066
+ require_outputs,
1067
+ process_stream, stream_logs,
1068
+ executable, **kwargs)
858
1069
 
859
1070
  @timeline.event
860
1071
  def rsync(
@@ -867,6 +1078,7 @@ class SSHCommandRunner(CommandRunner):
867
1078
  log_path: str = os.devnull,
868
1079
  stream_logs: bool = True,
869
1080
  max_retry: int = 1,
1081
+ get_remote_home_dir: Callable[[], str] = lambda: '~',
870
1082
  ) -> None:
871
1083
  """Uses 'rsync' to sync 'source' to 'target'.
872
1084
 
@@ -879,6 +1091,8 @@ class SSHCommandRunner(CommandRunner):
879
1091
  stream_logs: Stream logs to the stdout/stderr.
880
1092
  max_retry: The maximum number of retries for the rsync command.
881
1093
  This value should be non-negative.
1094
+ get_remote_home_dir: A callable that returns the remote home
1095
+ directory. Defaults to '~'.
882
1096
 
883
1097
  Raises:
884
1098
  exceptions.CommandError: rsync command failed.
@@ -892,6 +1106,7 @@ class SSHCommandRunner(CommandRunner):
892
1106
  self.ssh_private_key,
893
1107
  self.ssh_control_name,
894
1108
  ssh_proxy_command=self._ssh_proxy_command,
1109
+ ssh_proxy_jump=self._ssh_proxy_jump,
895
1110
  docker_ssh_proxy_command=docker_ssh_proxy_command,
896
1111
  port=self.port,
897
1112
  disable_control_master=self.disable_control_master))
@@ -903,7 +1118,8 @@ class SSHCommandRunner(CommandRunner):
903
1118
  rsh_option=rsh_option,
904
1119
  log_path=log_path,
905
1120
  stream_logs=stream_logs,
906
- max_retry=max_retry)
1121
+ max_retry=max_retry,
1122
+ get_remote_home_dir=get_remote_home_dir)
907
1123
 
908
1124
 
909
1125
  class KubernetesCommandRunner(CommandRunner):
@@ -1004,6 +1220,7 @@ class KubernetesCommandRunner(CommandRunner):
1004
1220
  connect_timeout: Optional[int] = None,
1005
1221
  source_bashrc: bool = False,
1006
1222
  skip_num_lines: int = 0,
1223
+ run_in_background: bool = False,
1007
1224
  **kwargs) -> Union[int, Tuple[int, str, str]]:
1008
1225
  """Uses 'kubectl exec' to run 'cmd' on a pod or deployment by its
1009
1226
  name and namespace.
@@ -1028,6 +1245,7 @@ class KubernetesCommandRunner(CommandRunner):
1028
1245
  output. This is used when the output is not processed by
1029
1246
  SkyPilot but we still want to get rid of some warning messages,
1030
1247
  such as SSH warnings.
1248
+ run_in_background: Whether to run the command in the background.
1031
1249
 
1032
1250
  Returns:
1033
1251
  returncode
@@ -1064,11 +1282,13 @@ class KubernetesCommandRunner(CommandRunner):
1064
1282
  kubectl_base_command.append('-i')
1065
1283
  kubectl_base_command += [*kubectl_args, '--']
1066
1284
 
1067
- command_str = self._get_command_to_run(cmd,
1068
- process_stream,
1069
- separate_stderr,
1070
- skip_num_lines=skip_num_lines,
1071
- source_bashrc=source_bashrc)
1285
+ command_str = self._get_command_to_run(
1286
+ cmd,
1287
+ process_stream,
1288
+ separate_stderr,
1289
+ skip_num_lines=skip_num_lines,
1290
+ source_bashrc=source_bashrc,
1291
+ run_in_background=run_in_background)
1072
1292
  command = kubectl_base_command + [
1073
1293
  # It is important to use /bin/bash -c here to make sure we quote the
1074
1294
  # command to be run properly. Otherwise, directly appending commands
@@ -1182,16 +1402,19 @@ class LocalProcessCommandRunner(CommandRunner):
1182
1402
  connect_timeout: Optional[int] = None,
1183
1403
  source_bashrc: bool = False,
1184
1404
  skip_num_lines: int = 0,
1405
+ run_in_background: bool = False,
1185
1406
  **kwargs) -> Union[int, Tuple[int, str, str]]:
1186
1407
  """Use subprocess to run the command."""
1187
1408
  del port_forward, ssh_mode, connect_timeout # Unused.
1188
1409
 
1189
- command_str = self._get_command_to_run(cmd,
1190
- process_stream,
1191
- separate_stderr,
1192
- skip_num_lines=skip_num_lines,
1193
- source_bashrc=source_bashrc,
1194
- use_login=False)
1410
+ command_str = self._get_command_to_run(
1411
+ cmd,
1412
+ process_stream,
1413
+ separate_stderr,
1414
+ skip_num_lines=skip_num_lines,
1415
+ source_bashrc=source_bashrc,
1416
+ use_login=False,
1417
+ run_in_background=run_in_background)
1195
1418
 
1196
1419
  log_dir = os.path.expanduser(os.path.dirname(log_path))
1197
1420
  os.makedirs(log_dir, exist_ok=True)
@@ -1247,3 +1470,134 @@ class LocalProcessCommandRunner(CommandRunner):
1247
1470
  log_path=log_path,
1248
1471
  stream_logs=stream_logs,
1249
1472
  max_retry=max_retry)
1473
+
1474
+
1475
class SlurmCommandRunner(SSHCommandRunner):
    """Command runner that reaches a Slurm compute node via SSH + srun.

    Commands and file transfers go over the SSH connection configured on
    the parent runner (to the login node) and are then handed to ``srun``,
    which executes them on the compute node inside the given Slurm job.
    """

    def __init__(
        self,
        node: Tuple[str, int],
        ssh_user: str,
        ssh_private_key: Optional[str],
        *,
        sky_dir: str,
        skypilot_runtime_dir: str,
        job_id: str,
        slurm_node: str,
        **kwargs,
    ):
        """Creates a runner bound to one Slurm job on one compute node.

        Example Usage:
            runner = SlurmCommandRunner(
                (ip, port),
                ssh_user,
                ssh_private_key,
                sky_dir=sky_dir,
                skypilot_runtime_dir=skypilot_runtime_dir,
                job_id=job_id,
                slurm_node=slurm_node)
            runner.run('ls -l')
            runner.rsync(source, target, up=True)

        Args:
            node: (ip, port) of the remote machine to SSH into (the login
                node).
            ssh_user: SSH username.
            ssh_private_key: Path to the SSH private key.
            sky_dir: Private directory of this SkyPilot cluster on the Slurm
                cluster. `run` uses it as the working directory and as
                $HOME; `rsync` uses it as the remote home directory.
            skypilot_runtime_dir: Directory of the SkyPilot runtime on the
                Slurm cluster; exported to every command run via `run`.
            job_id: Slurm job ID of this instance.
            slurm_node: Hostname of the Slurm compute node of this instance.
            **kwargs: Extra arguments passed through to SSHCommandRunner
                (e.g., ssh_proxy_command).
        """
        super().__init__(node, ssh_user, ssh_private_key, **kwargs)
        self.job_id = job_id
        self.slurm_node = slurm_node
        self.sky_dir = sky_dir
        self.skypilot_runtime_dir = skypilot_runtime_dir

    def rsync(
        self,
        source: str,
        target: str,
        *,
        up: bool,
        log_path: str = os.devnull,
        stream_logs: bool = True,
        max_retry: int = 1,
    ) -> None:
        """Syncs files to/from the compute node, using srun as the transport.

        Args:
            source: Source path, forwarded to `_rsync`.
            target: Target path, forwarded to `_rsync`.
            up: Transfer direction flag, forwarded to `_rsync`.
            log_path: Path of the log file, forwarded to `_rsync`.
            stream_logs: Forwarded to `_rsync`.
            max_retry: Forwarded to `_rsync`.
        """
        login_ssh = ' '.join(
            self.ssh_base_command(ssh_mode=SshMode.NON_INTERACTIVE,
                                  port_forward=None,
                                  connect_timeout=None))

        # Script run in place of rsync's remote shell. Its first argument is
        # the '<job_id>+<slurm_node>' string passed as the destination below;
        # the remaining arguments are executed through srun.
        remote_shell_script = ''.join([
            'job_id=$(echo "$1" | cut -d+ -f1); ',
            'node_list=$(echo "$1" | cut -d+ -f2); ',
            # Drop the encoded job_id+node_list argument.
            'shift; ',
            # SSH to the login node, where srun is invoked.
            f'exec {login_ssh} ',
            'srun --unbuffered --quiet --overlap ',
            '--jobid="$job_id" --nodelist="$node_list" --nodes=1 --ntasks=1 ',
            '"$@"',
        ])
        rsh_option = f"bash --norc --noprofile -c '{remote_shell_script}' --"

        self._rsync(source,
                    target,
                    node_destination=f'{self.job_id}+{self.slurm_node}',
                    up=up,
                    rsh_option=rsh_option,
                    log_path=log_path,
                    stream_logs=stream_logs,
                    max_retry=max_retry,
                    get_remote_home_dir=lambda: self.sky_dir)

    @timeline.event
    @context_utils.cancellation_guard
    def run(self, cmd: Union[str, List[str]],
            **kwargs) -> Union[int, Tuple[int, str, str]]:
        """Runs a command on the Slurm compute node over SSH and srun.

        Args:
            cmd: The command to run on the compute node.
            **kwargs: Forwarded unchanged to SSHCommandRunner.run.

        Returns:
            returncode
            or
            A tuple of (returncode, stdout, stderr).
        """
        # $HOME is overridden so that each SkyPilot cluster keeps its state
        # separate: ~ is assumed to belong to a single cluster, which does
        # not hold on Slurm, where $HOME may live on a shared filesystem.
        # The runtime directory is exported for the same reason; see
        # constants.SKY_RUNTIME_DIR_ENV_VAR_KEY for more details.
        environment_setup = ' && '.join([
            f'export {constants.SKY_RUNTIME_DIR_ENV_VAR_KEY}='
            f'"{self.skypilot_runtime_dir}"',
            # A per-user uv cache under /tmp speeds up package installation
            # and avoids permission conflicts when several users share a
            # host. Without it uv falls back to ~/.cache/uv.
            'export UV_CACHE_DIR=/tmp/uv_cache_$(id -u)',
            f'cd {self.sky_dir}',
            'export HOME=$(pwd)',
        ])
        isolated_cmd = f'{environment_setup} && {cmd}'

        srun_parts = [
            'srun',
            '--unbuffered',
            '--quiet',
            '--overlap',
            f'--jobid={self.job_id}',
            f'--nodelist={self.slurm_node}',
            '--nodes=1',
            '--ntasks=1',
            'bash',
            '-c',
            shlex.quote(isolated_cmd),
        ]
        return super().run(' '.join(srun_parts), **kwargs)