skypilot-nightly 1.0.0.dev20251210__py3-none-any.whl → 1.0.0.dev20260112__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (207)
  1. sky/__init__.py +4 -2
  2. sky/adaptors/slurm.py +159 -72
  3. sky/backends/backend_utils.py +52 -10
  4. sky/backends/cloud_vm_ray_backend.py +192 -32
  5. sky/backends/task_codegen.py +40 -2
  6. sky/catalog/data_fetchers/fetch_gcp.py +9 -1
  7. sky/catalog/data_fetchers/fetch_nebius.py +1 -1
  8. sky/catalog/data_fetchers/fetch_vast.py +4 -2
  9. sky/catalog/seeweb_catalog.py +30 -15
  10. sky/catalog/shadeform_catalog.py +5 -2
  11. sky/catalog/slurm_catalog.py +0 -7
  12. sky/catalog/vast_catalog.py +30 -6
  13. sky/check.py +11 -8
  14. sky/client/cli/command.py +106 -54
  15. sky/client/interactive_utils.py +190 -0
  16. sky/client/sdk.py +8 -0
  17. sky/client/sdk_async.py +9 -0
  18. sky/clouds/aws.py +60 -2
  19. sky/clouds/azure.py +2 -0
  20. sky/clouds/kubernetes.py +2 -0
  21. sky/clouds/runpod.py +38 -7
  22. sky/clouds/slurm.py +44 -12
  23. sky/clouds/ssh.py +1 -1
  24. sky/clouds/vast.py +30 -17
  25. sky/core.py +69 -1
  26. sky/dashboard/out/404.html +1 -1
  27. sky/dashboard/out/_next/static/3nu-b8raeKRNABZ2d4GAG/_buildManifest.js +1 -0
  28. sky/dashboard/out/_next/static/chunks/1871-0565f8975a7dcd10.js +6 -0
  29. sky/dashboard/out/_next/static/chunks/2109-55a1546d793574a7.js +11 -0
  30. sky/dashboard/out/_next/static/chunks/2521-099b07cd9e4745bf.js +26 -0
  31. sky/dashboard/out/_next/static/chunks/2755.a636e04a928a700e.js +31 -0
  32. sky/dashboard/out/_next/static/chunks/3495.05eab4862217c1a5.js +6 -0
  33. sky/dashboard/out/_next/static/chunks/3785.cfc5dcc9434fd98c.js +1 -0
  34. sky/dashboard/out/_next/static/chunks/3981.645d01bf9c8cad0c.js +21 -0
  35. sky/dashboard/out/_next/static/chunks/4083-0115d67c1fb57d6c.js +21 -0
  36. sky/dashboard/out/_next/static/chunks/{8640.5b9475a2d18c5416.js → 429.a58e9ba9742309ed.js} +2 -2
  37. sky/dashboard/out/_next/static/chunks/4555.8e221537181b5dc1.js +6 -0
  38. sky/dashboard/out/_next/static/chunks/4725.937865b81fdaaebb.js +6 -0
  39. sky/dashboard/out/_next/static/chunks/6082-edabd8f6092300ce.js +25 -0
  40. sky/dashboard/out/_next/static/chunks/6989-49cb7dca83a7a62d.js +1 -0
  41. sky/dashboard/out/_next/static/chunks/6990-630bd2a2257275f8.js +1 -0
  42. sky/dashboard/out/_next/static/chunks/7248-a99800d4db8edabd.js +1 -0
  43. sky/dashboard/out/_next/static/chunks/754-cfc5d4ad1b843d29.js +18 -0
  44. sky/dashboard/out/_next/static/chunks/8050-dd8aa107b17dce00.js +16 -0
  45. sky/dashboard/out/_next/static/chunks/8056-d4ae1e0cb81e7368.js +1 -0
  46. sky/dashboard/out/_next/static/chunks/8555.011023e296c127b3.js +6 -0
  47. sky/dashboard/out/_next/static/chunks/8821-93c25df904a8362b.js +1 -0
  48. sky/dashboard/out/_next/static/chunks/8969-0662594b69432ade.js +1 -0
  49. sky/dashboard/out/_next/static/chunks/9025.f15c91c97d124a5f.js +6 -0
  50. sky/dashboard/out/_next/static/chunks/{9353-8369df1cf105221c.js → 9353-7ad6bd01858556f1.js} +1 -1
  51. sky/dashboard/out/_next/static/chunks/pages/_app-5a86569acad99764.js +34 -0
  52. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-8297476714acb4ac.js +6 -0
  53. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-337c3ba1085f1210.js +1 -0
  54. sky/dashboard/out/_next/static/chunks/pages/{clusters-9e5d47818b9bdadd.js → clusters-57632ff3684a8b5c.js} +1 -1
  55. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-5fd3a453c079c2ea.js +1 -0
  56. sky/dashboard/out/_next/static/chunks/pages/infra-9f85c02c9c6cae9e.js +1 -0
  57. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-90f16972cbecf354.js +1 -0
  58. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-2dd42fc37aad427a.js +16 -0
  59. sky/dashboard/out/_next/static/chunks/pages/jobs-ed806aeace26b972.js +1 -0
  60. sky/dashboard/out/_next/static/chunks/pages/users-bec34706b36f3524.js +1 -0
  61. sky/dashboard/out/_next/static/chunks/pages/{volumes-ef19d49c6d0e8500.js → volumes-a83ba9b38dff7ea9.js} +1 -1
  62. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-96e0f298308da7e2.js → [name]-c781e9c3e52ef9fc.js} +1 -1
  63. sky/dashboard/out/_next/static/chunks/pages/workspaces-91e0942f47310aae.js +1 -0
  64. sky/dashboard/out/_next/static/chunks/webpack-cfe59cf684ee13b9.js +1 -0
  65. sky/dashboard/out/_next/static/css/b0dbca28f027cc19.css +3 -0
  66. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  67. sky/dashboard/out/clusters/[cluster].html +1 -1
  68. sky/dashboard/out/clusters.html +1 -1
  69. sky/dashboard/out/config.html +1 -1
  70. sky/dashboard/out/index.html +1 -1
  71. sky/dashboard/out/infra/[context].html +1 -1
  72. sky/dashboard/out/infra.html +1 -1
  73. sky/dashboard/out/jobs/[job].html +1 -1
  74. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  75. sky/dashboard/out/jobs.html +1 -1
  76. sky/dashboard/out/plugins/[...slug].html +1 -1
  77. sky/dashboard/out/users.html +1 -1
  78. sky/dashboard/out/volumes.html +1 -1
  79. sky/dashboard/out/workspace/new.html +1 -1
  80. sky/dashboard/out/workspaces/[name].html +1 -1
  81. sky/dashboard/out/workspaces.html +1 -1
  82. sky/data/data_utils.py +26 -12
  83. sky/data/mounting_utils.py +29 -4
  84. sky/global_user_state.py +108 -16
  85. sky/jobs/client/sdk.py +8 -3
  86. sky/jobs/controller.py +191 -31
  87. sky/jobs/recovery_strategy.py +109 -11
  88. sky/jobs/server/core.py +81 -4
  89. sky/jobs/server/server.py +14 -0
  90. sky/jobs/state.py +417 -19
  91. sky/jobs/utils.py +73 -80
  92. sky/models.py +9 -0
  93. sky/optimizer.py +2 -1
  94. sky/provision/__init__.py +11 -9
  95. sky/provision/kubernetes/utils.py +122 -15
  96. sky/provision/kubernetes/volume.py +52 -17
  97. sky/provision/provisioner.py +2 -1
  98. sky/provision/runpod/instance.py +3 -1
  99. sky/provision/runpod/utils.py +13 -1
  100. sky/provision/runpod/volume.py +25 -9
  101. sky/provision/slurm/instance.py +75 -29
  102. sky/provision/slurm/utils.py +213 -107
  103. sky/provision/vast/utils.py +1 -0
  104. sky/resources.py +135 -13
  105. sky/schemas/api/responses.py +4 -0
  106. sky/schemas/db/global_user_state/010_save_ssh_key.py +1 -1
  107. sky/schemas/db/spot_jobs/008_add_full_resources.py +34 -0
  108. sky/schemas/db/spot_jobs/009_job_events.py +32 -0
  109. sky/schemas/db/spot_jobs/010_job_events_timestamp_with_timezone.py +43 -0
  110. sky/schemas/db/spot_jobs/011_add_links.py +34 -0
  111. sky/schemas/generated/jobsv1_pb2.py +9 -5
  112. sky/schemas/generated/jobsv1_pb2.pyi +12 -0
  113. sky/schemas/generated/jobsv1_pb2_grpc.py +44 -0
  114. sky/schemas/generated/managed_jobsv1_pb2.py +32 -28
  115. sky/schemas/generated/managed_jobsv1_pb2.pyi +11 -2
  116. sky/serve/serve_utils.py +232 -40
  117. sky/server/common.py +17 -0
  118. sky/server/constants.py +1 -1
  119. sky/server/metrics.py +6 -3
  120. sky/server/plugins.py +16 -0
  121. sky/server/requests/payloads.py +18 -0
  122. sky/server/requests/request_names.py +2 -0
  123. sky/server/requests/requests.py +28 -10
  124. sky/server/requests/serializers/encoders.py +5 -0
  125. sky/server/requests/serializers/return_value_serializers.py +14 -4
  126. sky/server/server.py +434 -107
  127. sky/server/uvicorn.py +5 -0
  128. sky/setup_files/MANIFEST.in +1 -0
  129. sky/setup_files/dependencies.py +21 -10
  130. sky/sky_logging.py +2 -1
  131. sky/skylet/constants.py +22 -5
  132. sky/skylet/executor/slurm.py +4 -6
  133. sky/skylet/job_lib.py +89 -4
  134. sky/skylet/services.py +18 -3
  135. sky/ssh_node_pools/deploy/tunnel/cleanup-tunnel.sh +62 -0
  136. sky/ssh_node_pools/deploy/tunnel/ssh-tunnel.sh +379 -0
  137. sky/templates/kubernetes-ray.yml.j2 +4 -6
  138. sky/templates/slurm-ray.yml.j2 +32 -2
  139. sky/templates/websocket_proxy.py +18 -41
  140. sky/users/permission.py +61 -51
  141. sky/utils/auth_utils.py +42 -0
  142. sky/utils/cli_utils/status_utils.py +19 -5
  143. sky/utils/cluster_utils.py +10 -3
  144. sky/utils/command_runner.py +256 -94
  145. sky/utils/command_runner.pyi +16 -0
  146. sky/utils/common_utils.py +30 -29
  147. sky/utils/context.py +32 -0
  148. sky/utils/db/db_utils.py +36 -6
  149. sky/utils/db/migration_utils.py +41 -21
  150. sky/utils/infra_utils.py +5 -1
  151. sky/utils/instance_links.py +139 -0
  152. sky/utils/interactive_utils.py +49 -0
  153. sky/utils/kubernetes/generate_kubeconfig.sh +42 -33
  154. sky/utils/kubernetes/rsync_helper.sh +5 -1
  155. sky/utils/plugin_extensions/__init__.py +14 -0
  156. sky/utils/plugin_extensions/external_failure_source.py +176 -0
  157. sky/utils/resources_utils.py +10 -8
  158. sky/utils/rich_utils.py +9 -11
  159. sky/utils/schemas.py +63 -20
  160. sky/utils/status_lib.py +7 -0
  161. sky/utils/subprocess_utils.py +17 -0
  162. sky/volumes/client/sdk.py +6 -3
  163. sky/volumes/server/core.py +65 -27
  164. sky_templates/ray/start_cluster +8 -4
  165. {skypilot_nightly-1.0.0.dev20251210.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/METADATA +53 -57
  166. {skypilot_nightly-1.0.0.dev20251210.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/RECORD +172 -162
  167. sky/dashboard/out/_next/static/KYAhEFa3FTfq4JyKVgo-s/_buildManifest.js +0 -1
  168. sky/dashboard/out/_next/static/chunks/1141-9c810f01ff4f398a.js +0 -11
  169. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +0 -6
  170. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +0 -1
  171. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +0 -1
  172. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +0 -15
  173. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +0 -26
  174. sky/dashboard/out/_next/static/chunks/3294.ddda8c6c6f9f24dc.js +0 -1
  175. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +0 -1
  176. sky/dashboard/out/_next/static/chunks/3800-b589397dc09c5b4e.js +0 -1
  177. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +0 -1
  178. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +0 -15
  179. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +0 -13
  180. sky/dashboard/out/_next/static/chunks/6856-da20c5fd999f319c.js +0 -1
  181. sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +0 -1
  182. sky/dashboard/out/_next/static/chunks/6990-09cbf02d3cd518c3.js +0 -1
  183. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +0 -30
  184. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +0 -41
  185. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +0 -1
  186. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +0 -1
  187. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +0 -6
  188. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +0 -31
  189. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +0 -30
  190. sky/dashboard/out/_next/static/chunks/pages/_app-68b647e26f9d2793.js +0 -34
  191. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-33f525539665fdfd.js +0 -16
  192. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a7565f586ef86467.js +0 -1
  193. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-12c559ec4d81fdbd.js +0 -1
  194. sky/dashboard/out/_next/static/chunks/pages/infra-d187cd0413d72475.js +0 -1
  195. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-895847b6cf200b04.js +0 -16
  196. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-8d0f4655400b4eb9.js +0 -21
  197. sky/dashboard/out/_next/static/chunks/pages/jobs-e5a98f17f8513a96.js +0 -1
  198. sky/dashboard/out/_next/static/chunks/pages/users-2f7646eb77785a2c.js +0 -1
  199. sky/dashboard/out/_next/static/chunks/pages/workspaces-cb4da3abe08ebf19.js +0 -1
  200. sky/dashboard/out/_next/static/chunks/webpack-fba3de387ff6bb08.js +0 -1
  201. sky/dashboard/out/_next/static/css/c5a4cfd2600fc715.css +0 -3
  202. /sky/dashboard/out/_next/static/{KYAhEFa3FTfq4JyKVgo-s → 3nu-b8raeKRNABZ2d4GAG}/_ssgManifest.js +0 -0
  203. /sky/dashboard/out/_next/static/chunks/pages/plugins/{[...slug]-4f46050ca065d8f8.js → [...slug]-449a9f5a3bb20fb3.js} +0 -0
  204. {skypilot_nightly-1.0.0.dev20251210.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/WHEEL +0 -0
  205. {skypilot_nightly-1.0.0.dev20251210.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/entry_points.txt +0 -0
  206. {skypilot_nightly-1.0.0.dev20251210.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/licenses/LICENSE +0 -0
  207. {skypilot_nightly-1.0.0.dev20251210.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/top_level.txt +0 -0
@@ -1,14 +1,23 @@
1
1
  """Runner for commands to be executed on the cluster."""
2
2
  import enum
3
+ import fcntl
3
4
  import hashlib
4
5
  import os
5
6
  import pathlib
7
+ import pty
6
8
  import re
7
9
  import shlex
10
+ import signal
11
+ import socket
8
12
  import sys
13
+ import termios
14
+ import threading
9
15
  import time
10
16
  from typing import (Any, Callable, Dict, Iterable, List, Optional, Tuple, Type,
11
17
  Union)
18
+ import uuid
19
+
20
+ import colorama
12
21
 
13
22
  from sky import exceptions
14
23
  from sky import sky_logging
@@ -19,6 +28,7 @@ from sky.utils import common_utils
19
28
  from sky.utils import context_utils
20
29
  from sky.utils import control_master_utils
21
30
  from sky.utils import git as git_utils
31
+ from sky.utils import interactive_utils
22
32
  from sky.utils import subprocess_utils
23
33
  from sky.utils import timeline
24
34
 
@@ -90,10 +100,12 @@ def ssh_options_list(
90
100
  ssh_control_name: Optional[str],
91
101
  *,
92
102
  ssh_proxy_command: Optional[str] = None,
103
+ ssh_proxy_jump: Optional[str] = None,
93
104
  docker_ssh_proxy_command: Optional[str] = None,
94
105
  connect_timeout: Optional[int] = None,
95
106
  port: int = 22,
96
107
  disable_control_master: Optional[bool] = False,
108
+ escape_percent_expand: bool = False,
97
109
  ) -> List[str]:
98
110
  """Returns a list of sane options for 'ssh'."""
99
111
  if connect_timeout is None:
@@ -133,11 +145,11 @@ def ssh_options_list(
133
145
  # SSH Control will have a severe delay when using docker_ssh_proxy_command.
134
146
  # TODO(tian): Investigate why.
135
147
  #
136
- # We disable ControlMaster when ssh_proxy_command is used, because the
137
- # master connection will be idle although the connection might be shared
138
- # by other ssh commands that is not idle. In that case, user's custom proxy
139
- # command may drop the connection due to idle timeout, since it will only
140
- # see the idle master connection. It is an issue even with the
148
+ # We disable ControlMaster when ssh_proxy_command is used,
149
+ # because the master connection will be idle although the connection might
150
+ # be shared by other ssh commands that is not idle. In that case, user's
151
+ # custom proxy command may drop the connection due to idle timeout, since it
152
+ # will only see the idle master connection. It is an issue even with the
141
153
  # ServerAliveInterval set, since the keepalive message may not be recognized
142
154
  # by the custom proxy command, such as AWS SSM Session Manager.
143
155
  #
@@ -148,11 +160,14 @@ def ssh_options_list(
148
160
  # 'ControlPersist' number of seconds delay per ssh commands ran.
149
161
  if (ssh_control_name is not None and docker_ssh_proxy_command is None and
150
162
  ssh_proxy_command is None and not disable_control_master):
163
+ control_path = f'{_ssh_control_path(ssh_control_name)}/%C'
164
+ if escape_percent_expand:
165
+ control_path = control_path.replace('%', '%%')
151
166
  arg_dict.update({
152
167
  # Control path: important optimization as we do multiple ssh in one
153
168
  # sky.launch().
154
169
  'ControlMaster': 'auto',
155
- 'ControlPath': f'{_ssh_control_path(ssh_control_name)}/%C',
170
+ 'ControlPath': control_path,
156
171
  'ControlPersist': '300s',
157
172
  })
158
173
  ssh_key_option = [
@@ -174,6 +189,15 @@ def ssh_options_list(
174
189
  'ProxyCommand': shlex.quote(ssh_proxy_command),
175
190
  })
176
191
 
192
+ if ssh_proxy_jump is not None:
193
+ logger.debug(f'--- ProxyJump: {ssh_proxy_jump} ---')
194
+ if ssh_proxy_command is not None:
195
+ logger.warning('Both ProxyCommand and ProxyJump are specified. '
196
+ 'ProxyCommand will take precedence.')
197
+ arg_dict.update({
198
+ 'ProxyJump': shlex.quote(ssh_proxy_jump),
199
+ })
200
+
177
201
  return ssh_key_option + [
178
202
  x for y in (['-o', f'{k}={v}']
179
203
  for k, v in arg_dict.items()
@@ -233,6 +257,7 @@ class CommandRunner:
233
257
  skip_num_lines: int,
234
258
  source_bashrc: bool = False,
235
259
  use_login: bool = True,
260
+ run_in_background: bool = False,
236
261
  ) -> str:
237
262
  """Returns the command to run."""
238
263
  if isinstance(cmd, list):
@@ -263,7 +288,11 @@ class CommandRunner:
263
288
  ]
264
289
  if not separate_stderr:
265
290
  command.append('2>&1')
291
+ if run_in_background:
292
+ command = ['nohup'] + command + ['&']
266
293
  if not process_stream and skip_num_lines:
294
+ assert not run_in_background, (
295
+ 'run_in_background and skip_num_lines cannot be used together')
267
296
  command += [
268
297
  # A hack to remove the following bash warnings (twice):
269
298
  # bash: cannot set terminal process group
@@ -424,6 +453,7 @@ class CommandRunner:
424
453
  connect_timeout: Optional[int] = None,
425
454
  source_bashrc: bool = False,
426
455
  skip_num_lines: int = 0,
456
+ run_in_background: bool = False,
427
457
  **kwargs) -> Union[int, Tuple[int, str, str]]:
428
458
  """Runs the command on the cluster.
429
459
 
@@ -442,6 +472,7 @@ class CommandRunner:
442
472
  output. This is used when the output is not processed by
443
473
  SkyPilot but we still want to get rid of some warning messages,
444
474
  such as SSH warnings.
475
+ run_in_background: Whether to run the command in the background.
445
476
 
446
477
  Returns:
447
478
  returncode
@@ -622,9 +653,11 @@ class SSHCommandRunner(CommandRunner):
622
653
  ssh_private_key: Optional[str],
623
654
  ssh_control_name: Optional[str] = '__default__',
624
655
  ssh_proxy_command: Optional[str] = None,
656
+ ssh_proxy_jump: Optional[str] = None,
625
657
  docker_user: Optional[str] = None,
626
658
  disable_control_master: Optional[bool] = False,
627
659
  port_forward_execute_remote_command: Optional[bool] = False,
660
+ enable_interactive_auth: bool = False,
628
661
  ):
629
662
  """Initialize SSHCommandRunner.
630
663
 
@@ -644,6 +677,8 @@ class SSHCommandRunner(CommandRunner):
644
677
  ssh_proxy_command: Optional, the value to pass to '-o
645
678
  ProxyCommand'. Useful for communicating with clusters without
646
679
  public IPs using a "jump server".
680
+ ssh_proxy_jump: Optional, the value to pass to '-o ProxyJump' flag.
681
+ Similar to ssh_proxy_command, but more modern.
647
682
  port: The port to use for ssh.
648
683
  docker_user: The docker user to use for ssh. If specified, the
649
684
  command will be run inside a docker container which have a ssh
@@ -663,6 +698,7 @@ class SSHCommandRunner(CommandRunner):
663
698
  None if ssh_control_name is None else hashlib.md5(
664
699
  ssh_control_name.encode()).hexdigest()[:_HASH_MAX_LENGTH])
665
700
  self._ssh_proxy_command = ssh_proxy_command
701
+ self._ssh_proxy_jump = ssh_proxy_jump
666
702
  self.disable_control_master = (
667
703
  disable_control_master or
668
704
  control_master_utils.should_disable_control_master())
@@ -712,6 +748,7 @@ class SSHCommandRunner(CommandRunner):
712
748
  self._docker_ssh_proxy_command = None
713
749
  self.port_forward_execute_remote_command = (
714
750
  port_forward_execute_remote_command)
751
+ self.enable_interactive_auth = enable_interactive_auth
715
752
 
716
753
  def port_forward_command(
717
754
  self,
@@ -763,6 +800,7 @@ class SSHCommandRunner(CommandRunner):
763
800
  self.ssh_private_key,
764
801
  self.ssh_control_name,
765
802
  ssh_proxy_command=self._ssh_proxy_command,
803
+ ssh_proxy_jump=self._ssh_proxy_jump,
766
804
  docker_ssh_proxy_command=docker_ssh_proxy_command,
767
805
  port=self.port,
768
806
  connect_timeout=connect_timeout,
@@ -770,6 +808,127 @@ class SSHCommandRunner(CommandRunner):
770
808
  f'{self.ssh_user}@{self.ip}'
771
809
  ]
772
810
 
811
+ def _retry_with_interactive_auth(
812
+ self, session_id: str, command: List[str], log_path: str,
813
+ require_outputs: bool, process_stream: bool, stream_logs: bool,
814
+ executable: str,
815
+ **kwargs) -> Union[int, Tuple[int, str, str], Tuple[int, int]]:
816
+ """Retries command with interactive auth.
817
+
818
+ This handles SSH connections requiring keyboard-interactive
819
+ authentication (e.g., 2FA) by using a PTY for auth prompts and
820
+ establishing a persistent ControlMaster socket (if enabled) that
821
+ other SSH sessions can reuse without re-authenticating.
822
+
823
+ The PTY is bridged to a websocket connection that allows the client
824
+ to handle interactive authentication. Command output flows through
825
+ normal stdout/stderr pipes, which gets printed to log_path.
826
+
827
+ See ssh_options_list for when ControlMaster is not enabled.
828
+ """
829
+ extra_options = [
830
+ # Override ControlPersist to reduce frequency of manual user
831
+ # intervention. The default from ssh_options_list is only 5m.
832
+ #
833
+ # NOTE: When used with ProxyJump, the connection can die
834
+ # earlier than expected, so it is recommended to also enable
835
+ # ControlMaster on the jump host's SSH config. It is hard to
836
+ # tell why exactly, because enabling -v makes this problem
837
+ # disappear for some reasons.
838
+ '-o',
839
+ 'ControlPersist=1d',
840
+ ]
841
+ if self._ssh_proxy_jump is not None:
842
+ logger.warning(f'{colorama.Fore.YELLOW}When using ProxyJump, it is '
843
+ 'recommended to also enable ControlMaster on the '
844
+ 'jump host\'s SSH config to keep the authenticated '
845
+ f'connection alive for longer.{colorama.Fore.RESET}')
846
+ command = command[:1] + extra_options + command[1:]
847
+
848
+ # Create PTY for SSH. PTY slave for stdin from user, PTY master
849
+ # for password/auth prompts from SSH.
850
+ pty_m_fd, pty_s_fd = pty.openpty()
851
+
852
+ # Create Unix socket to pass PTY master fd to websocket handler
853
+ fd_socket_path = interactive_utils.get_pty_socket_path(session_id)
854
+ if os.path.exists(fd_socket_path):
855
+ os.unlink(fd_socket_path)
856
+ fd_server = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
857
+ fd_server.bind(fd_socket_path)
858
+ fd_server.listen(1)
859
+ fd_server.settimeout(60)
860
+
861
+ # Signal client to initiate websocket for interactive auth
862
+ interactive_signal = f'<sky-interactive session="{session_id}"/>'
863
+ print(interactive_signal, flush=True)
864
+
865
+ def handle_unix_socket_connection():
866
+ """Background thread to handle Unix socket connection."""
867
+ conn = None
868
+ try:
869
+ # Wait for websocket handler to connect.
870
+ conn, _ = fd_server.accept()
871
+ # Send PTY master fd through Unix socket.
872
+ interactive_utils.send_fd(conn, pty_m_fd)
873
+ # We don't need to block here to wait for the websocket
874
+ # handler, as SSH will continue by itself once auth
875
+ # is complete.
876
+ except socket.timeout:
877
+ logger.debug('Timeout waiting for interactive auth connection')
878
+ except Exception as e: # pylint: disable=broad-except
879
+ logger.error(f'Error in Unix socket connection: '
880
+ f'{common_utils.format_exception(e)}')
881
+ finally:
882
+ if conn is not None:
883
+ try:
884
+ conn.close()
885
+ except Exception: # pylint: disable=broad-except
886
+ pass
887
+ try:
888
+ os.close(pty_m_fd)
889
+ except Exception: # pylint: disable=broad-except
890
+ pass
891
+
892
+ unix_sock_thread = threading.Thread(
893
+ target=handle_unix_socket_connection, daemon=True)
894
+ unix_sock_thread.start()
895
+
896
+ try:
897
+
898
+ def setup_pty_session():
899
+ # Set PTY as controlling terminal so SSH can access /dev/tty
900
+ # for keyboard-interactive auth. Without this:
901
+ # "can't open /dev/tty: Device not configured"
902
+ fcntl.ioctl(pty_s_fd, termios.TIOCSCTTY, 0)
903
+ # Ignore SIGHUP so ControlMaster survives when PTY closes.
904
+ signal.signal(signal.SIGHUP, signal.SIG_IGN)
905
+ # Ignore SIGTERM so ControlMaster survives subprocess_daemon
906
+ # killing the process group.
907
+ if self._ssh_proxy_jump is not None:
908
+ signal.signal(signal.SIGTERM, signal.SIG_IGN)
909
+
910
+ return log_lib.run_with_log(' '.join(command),
911
+ log_path,
912
+ require_outputs=require_outputs,
913
+ stream_logs=stream_logs,
914
+ process_stream=process_stream,
915
+ shell=True,
916
+ executable=executable,
917
+ preexec_fn=setup_pty_session,
918
+ **kwargs)
919
+ except Exception as e:
920
+ raise RuntimeError(f'Exception in setup: {e}') from e
921
+ finally:
922
+ # Clean up PTY fds and sockets.
923
+ fd_server.close()
924
+ if os.path.exists(fd_socket_path):
925
+ os.unlink(fd_socket_path)
926
+ try:
927
+ os.close(pty_m_fd)
928
+ except OSError:
929
+ pass # Already closed by background thread
930
+ os.close(pty_s_fd)
931
+
773
932
  def close_cached_connection(self) -> None:
774
933
  """Close the cached connection to the remote machine.
775
934
 
@@ -810,6 +969,7 @@ class SSHCommandRunner(CommandRunner):
810
969
  connect_timeout: Optional[int] = None,
811
970
  source_bashrc: bool = False,
812
971
  skip_num_lines: int = 0,
972
+ run_in_background: bool = False,
813
973
  **kwargs) -> Union[int, Tuple[int, str, str]]:
814
974
  """Uses 'ssh' to run 'cmd' on a node with ip.
815
975
 
@@ -834,27 +994,32 @@ class SSHCommandRunner(CommandRunner):
834
994
  output. This is used when the output is not processed by
835
995
  SkyPilot but we still want to get rid of some warning messages,
836
996
  such as SSH warnings.
997
+ run_in_background: Whether to run the command in the background.
837
998
 
838
999
  Returns:
839
1000
  returncode
840
1001
  or
841
1002
  A tuple of (returncode, stdout, stderr).
842
1003
  """
1004
+
843
1005
  base_ssh_command = self.ssh_base_command(
844
1006
  ssh_mode=ssh_mode,
845
1007
  port_forward=port_forward,
846
1008
  connect_timeout=connect_timeout)
1009
+
847
1010
  if ssh_mode == SshMode.LOGIN:
848
1011
  assert isinstance(cmd, list), 'cmd must be a list for login mode.'
849
1012
  command = base_ssh_command + cmd
850
1013
  proc = subprocess_utils.run(command, shell=False, check=False)
851
1014
  return proc.returncode, '', ''
852
1015
 
853
- command_str = self._get_command_to_run(cmd,
854
- process_stream,
855
- separate_stderr,
856
- skip_num_lines=skip_num_lines,
857
- source_bashrc=source_bashrc)
1016
+ command_str = self._get_command_to_run(
1017
+ cmd,
1018
+ process_stream,
1019
+ separate_stderr,
1020
+ skip_num_lines=skip_num_lines,
1021
+ source_bashrc=source_bashrc,
1022
+ run_in_background=run_in_background)
858
1023
  command = base_ssh_command + [shlex.quote(command_str)]
859
1024
 
860
1025
  log_dir = os.path.expanduser(os.path.dirname(log_path))
@@ -872,14 +1037,35 @@ class SSHCommandRunner(CommandRunner):
872
1037
  else:
873
1038
  command += [f'> {log_path}']
874
1039
  executable = '/bin/bash'
875
- return log_lib.run_with_log(' '.join(command),
876
- log_path,
877
- require_outputs=require_outputs,
878
- stream_logs=stream_logs,
879
- process_stream=process_stream,
880
- shell=True,
881
- executable=executable,
882
- **kwargs)
1040
+
1041
+ result = log_lib.run_with_log(' '.join(command),
1042
+ log_path,
1043
+ require_outputs=require_outputs,
1044
+ stream_logs=stream_logs,
1045
+ process_stream=process_stream,
1046
+ shell=True,
1047
+ executable=executable,
1048
+ **kwargs)
1049
+ if not self.enable_interactive_auth:
1050
+ return result
1051
+
1052
+ if require_outputs:
1053
+ returncode, _, _ = result
1054
+ else:
1055
+ returncode = result
1056
+
1057
+ if returncode != 255:
1058
+ return result
1059
+ # Exit code 255 indicates an SSH connection error. It does not
1060
+ # necessarily mean an auth failure, but when ControlMaster is used,
1061
+ # the stdout/stderr does not contain the auth failure message,
1062
+ # which is why we don't check the output here, and just attempt
1063
+ # the interactive auth flow.
1064
+ session_id = str(uuid.uuid4())
1065
+ return self._retry_with_interactive_auth(session_id, command, log_path,
1066
+ require_outputs,
1067
+ process_stream, stream_logs,
1068
+ executable, **kwargs)
883
1069
 
884
1070
  @timeline.event
885
1071
  def rsync(
@@ -920,6 +1106,7 @@ class SSHCommandRunner(CommandRunner):
920
1106
  self.ssh_private_key,
921
1107
  self.ssh_control_name,
922
1108
  ssh_proxy_command=self._ssh_proxy_command,
1109
+ ssh_proxy_jump=self._ssh_proxy_jump,
923
1110
  docker_ssh_proxy_command=docker_ssh_proxy_command,
924
1111
  port=self.port,
925
1112
  disable_control_master=self.disable_control_master))
@@ -1033,6 +1220,7 @@ class KubernetesCommandRunner(CommandRunner):
1033
1220
  connect_timeout: Optional[int] = None,
1034
1221
  source_bashrc: bool = False,
1035
1222
  skip_num_lines: int = 0,
1223
+ run_in_background: bool = False,
1036
1224
  **kwargs) -> Union[int, Tuple[int, str, str]]:
1037
1225
  """Uses 'kubectl exec' to run 'cmd' on a pod or deployment by its
1038
1226
  name and namespace.
@@ -1057,6 +1245,7 @@ class KubernetesCommandRunner(CommandRunner):
1057
1245
  output. This is used when the output is not processed by
1058
1246
  SkyPilot but we still want to get rid of some warning messages,
1059
1247
  such as SSH warnings.
1248
+ run_in_background: Whether to run the command in the background.
1060
1249
 
1061
1250
  Returns:
1062
1251
  returncode
@@ -1093,11 +1282,13 @@ class KubernetesCommandRunner(CommandRunner):
1093
1282
  kubectl_base_command.append('-i')
1094
1283
  kubectl_base_command += [*kubectl_args, '--']
1095
1284
 
1096
- command_str = self._get_command_to_run(cmd,
1097
- process_stream,
1098
- separate_stderr,
1099
- skip_num_lines=skip_num_lines,
1100
- source_bashrc=source_bashrc)
1285
+ command_str = self._get_command_to_run(
1286
+ cmd,
1287
+ process_stream,
1288
+ separate_stderr,
1289
+ skip_num_lines=skip_num_lines,
1290
+ source_bashrc=source_bashrc,
1291
+ run_in_background=run_in_background)
1101
1292
  command = kubectl_base_command + [
1102
1293
  # It is important to use /bin/bash -c here to make sure we quote the
1103
1294
  # command to be run properly. Otherwise, directly appending commands
@@ -1211,16 +1402,19 @@ class LocalProcessCommandRunner(CommandRunner):
1211
1402
  connect_timeout: Optional[int] = None,
1212
1403
  source_bashrc: bool = False,
1213
1404
  skip_num_lines: int = 0,
1405
+ run_in_background: bool = False,
1214
1406
  **kwargs) -> Union[int, Tuple[int, str, str]]:
1215
1407
  """Use subprocess to run the command."""
1216
1408
  del port_forward, ssh_mode, connect_timeout # Unused.
1217
1409
 
1218
- command_str = self._get_command_to_run(cmd,
1219
- process_stream,
1220
- separate_stderr,
1221
- skip_num_lines=skip_num_lines,
1222
- source_bashrc=source_bashrc,
1223
- use_login=False)
1410
+ command_str = self._get_command_to_run(
1411
+ cmd,
1412
+ process_stream,
1413
+ separate_stderr,
1414
+ skip_num_lines=skip_num_lines,
1415
+ source_bashrc=source_bashrc,
1416
+ use_login=False,
1417
+ run_in_background=run_in_background)
1224
1418
 
1225
1419
  log_dir = os.path.expanduser(os.path.dirname(log_path))
1226
1420
  os.makedirs(log_dir, exist_ok=True)
@@ -1332,29 +1526,6 @@ class SlurmCommandRunner(SSHCommandRunner):
1332
1526
  self.job_id = job_id
1333
1527
  self.slurm_node = slurm_node
1334
1528
 
1335
- # Build a chained ProxyCommand that goes through the login node to reach
1336
- # the compute node where the job is running.
1337
-
1338
- # First, build SSH options to reach the login node, using the user's
1339
- # existing proxy command if provided.
1340
- proxy_ssh_options = ' '.join(
1341
- ssh_options_list(self.ssh_private_key,
1342
- None,
1343
- ssh_proxy_command=self._ssh_proxy_command,
1344
- port=self.port,
1345
- disable_control_master=True))
1346
- login_node_proxy_command = (f'ssh {proxy_ssh_options} '
1347
- f'-W %h:%p {self.ssh_user}@{self.ip}')
1348
-
1349
- # Update the proxy command to be the login node proxy, which will
1350
- # be used by super().run() to reach the compute node.
1351
- self._ssh_proxy_command = login_node_proxy_command
1352
- # Update self.ip to target the compute node.
1353
- self.ip = slurm_node
1354
- # Assume the compute node's SSH port is 22.
1355
- # TODO(kevin): Make this configurable if needed.
1356
- self.port = 22
1357
-
1358
1529
  def rsync(
1359
1530
  self,
1360
1531
  source: str,
@@ -1365,40 +1536,35 @@ class SlurmCommandRunner(SSHCommandRunner):
1365
1536
  stream_logs: bool = True,
1366
1537
  max_retry: int = 1,
1367
1538
  ) -> None:
1368
- """Rsyncs files directly to the Slurm compute node,
1369
- by proxying through the Slurm login node.
1370
-
1371
- For Slurm, files need to be accessible by compute nodes where jobs
1372
- execute via srun. This means either it has to be on the compute node's
1373
- local filesystem, or on a shared filesystem.
1539
+ """Rsyncs files to/from the Slurm compute node using srun as transport.
1374
1540
  """
1375
- # TODO(kevin): We can probably optimize this to skip the proxying
1376
- # if the target dir is in a shared filesystem, since it will
1377
- # be accessible by the compute node.
1378
-
1379
- # Build SSH options for rsync using the ProxyCommand set up in __init__
1380
- # to reach the compute node through the login node.
1381
- ssh_options = ' '.join(
1382
- ssh_options_list(
1383
- # Assume nothing and rely on default SSH behavior when -i is
1384
- # not specified.
1385
- None,
1386
- None,
1387
- ssh_proxy_command=self._ssh_proxy_command,
1388
- disable_control_master=True))
1389
- rsh_option = f'ssh {ssh_options}'
1390
-
1391
- self._rsync(
1392
- source,
1393
- target,
1394
- # Compute node
1395
- node_destination=f'{self.ssh_user}@{self.slurm_node}',
1396
- up=up,
1397
- rsh_option=rsh_option,
1398
- log_path=log_path,
1399
- stream_logs=stream_logs,
1400
- max_retry=max_retry,
1401
- get_remote_home_dir=lambda: self.sky_dir)
1541
+ ssh_command = ' '.join(
1542
+ self.ssh_base_command(ssh_mode=SshMode.NON_INTERACTIVE,
1543
+ port_forward=None,
1544
+ connect_timeout=None))
1545
+
1546
+ # rsh command: parse job_id+node_list from $1, ssh to login node,
1547
+ # run srun with rsync command.
1548
+ rsh_option = (
1549
+ f'bash --norc --noprofile -c \''
1550
+ f'job_id=$(echo "$1" | cut -d+ -f1); '
1551
+ f'node_list=$(echo "$1" | cut -d+ -f2); '
1552
+ f'shift; ' # Shift past the encoded job_id+node_list
1553
+ f'exec {ssh_command} ' # SSH to login node to run srun
1554
+ f'srun --unbuffered --quiet --overlap '
1555
+ f'--jobid="$job_id" --nodelist="$node_list" --nodes=1 --ntasks=1 '
1556
+ f'"$@"'
1557
+ f'\' --')
1558
+ encoded_info = f'{self.job_id}+{self.slurm_node}'
1559
+ self._rsync(source,
1560
+ target,
1561
+ node_destination=encoded_info,
1562
+ up=up,
1563
+ rsh_option=rsh_option,
1564
+ log_path=log_path,
1565
+ stream_logs=stream_logs,
1566
+ max_retry=max_retry,
1567
+ get_remote_home_dir=lambda: self.sky_dir)
1402
1568
 
1403
1569
  @timeline.event
1404
1570
  @context_utils.cancellation_guard
@@ -1420,14 +1586,6 @@ class SlurmCommandRunner(SSHCommandRunner):
1420
1586
  # could be part of a shared filesystem.
1421
1587
  # And similarly for SKY_RUNTIME_DIR. See constants.\
1422
1588
  # SKY_RUNTIME_DIR_ENV_VAR_KEY for more details.
1423
- #
1424
- # SSH directly to the compute node instead of using srun.
1425
- # This avoids Slurm's proctrack/cgroup which kills all processes
1426
- # when the job step ends (including child processes launched as
1427
- # a separate process group), breaking background process spawning
1428
- # (e.g., JobScheduler._run_job which uses launch_new_process_tree).
1429
- # Note: proctrack/cgroup is enabled by default on Nebius'
1430
- # Managed Soperator.
1431
1589
  cmd = (
1432
1590
  f'export {constants.SKY_RUNTIME_DIR_ENV_VAR_KEY}='
1433
1591
  f'"{self.skypilot_runtime_dir}" && '
@@ -1438,4 +1596,8 @@ class SlurmCommandRunner(SSHCommandRunner):
1438
1596
  f'export UV_CACHE_DIR=/tmp/uv_cache_$(id -u) && '
1439
1597
  f'cd {self.sky_dir} && export HOME=$(pwd) && {cmd}')
1440
1598
 
1599
+ cmd = (f'srun --unbuffered --quiet --overlap --jobid={self.job_id} '
1600
+ f'--nodelist={self.slurm_node} '
1601
+ f'--nodes=1 --ntasks=1 bash -c {shlex.quote(cmd)}')
1602
+
1441
1603
  return super().run(cmd, **kwargs)