skypilot-nightly 1.0.0.dev20251203__py3-none-any.whl → 1.0.0.dev20260112__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (245) hide show
  1. sky/__init__.py +6 -2
  2. sky/adaptors/aws.py +1 -61
  3. sky/adaptors/slurm.py +565 -0
  4. sky/backends/backend_utils.py +95 -12
  5. sky/backends/cloud_vm_ray_backend.py +224 -65
  6. sky/backends/task_codegen.py +380 -4
  7. sky/catalog/__init__.py +0 -3
  8. sky/catalog/data_fetchers/fetch_gcp.py +9 -1
  9. sky/catalog/data_fetchers/fetch_nebius.py +1 -1
  10. sky/catalog/data_fetchers/fetch_vast.py +4 -2
  11. sky/catalog/kubernetes_catalog.py +12 -4
  12. sky/catalog/seeweb_catalog.py +30 -15
  13. sky/catalog/shadeform_catalog.py +5 -2
  14. sky/catalog/slurm_catalog.py +236 -0
  15. sky/catalog/vast_catalog.py +30 -6
  16. sky/check.py +25 -11
  17. sky/client/cli/command.py +391 -32
  18. sky/client/interactive_utils.py +190 -0
  19. sky/client/sdk.py +64 -2
  20. sky/client/sdk_async.py +9 -0
  21. sky/clouds/__init__.py +2 -0
  22. sky/clouds/aws.py +60 -2
  23. sky/clouds/azure.py +2 -0
  24. sky/clouds/cloud.py +7 -0
  25. sky/clouds/kubernetes.py +2 -0
  26. sky/clouds/runpod.py +38 -7
  27. sky/clouds/slurm.py +610 -0
  28. sky/clouds/ssh.py +3 -2
  29. sky/clouds/vast.py +39 -16
  30. sky/core.py +197 -37
  31. sky/dashboard/out/404.html +1 -1
  32. sky/dashboard/out/_next/static/3nu-b8raeKRNABZ2d4GAG/_buildManifest.js +1 -0
  33. sky/dashboard/out/_next/static/chunks/1871-0565f8975a7dcd10.js +6 -0
  34. sky/dashboard/out/_next/static/chunks/2109-55a1546d793574a7.js +11 -0
  35. sky/dashboard/out/_next/static/chunks/2521-099b07cd9e4745bf.js +26 -0
  36. sky/dashboard/out/_next/static/chunks/2755.a636e04a928a700e.js +31 -0
  37. sky/dashboard/out/_next/static/chunks/3495.05eab4862217c1a5.js +6 -0
  38. sky/dashboard/out/_next/static/chunks/3785.cfc5dcc9434fd98c.js +1 -0
  39. sky/dashboard/out/_next/static/chunks/3850-fd5696f3bbbaddae.js +1 -0
  40. sky/dashboard/out/_next/static/chunks/3981.645d01bf9c8cad0c.js +21 -0
  41. sky/dashboard/out/_next/static/chunks/4083-0115d67c1fb57d6c.js +21 -0
  42. sky/dashboard/out/_next/static/chunks/{8640.5b9475a2d18c5416.js → 429.a58e9ba9742309ed.js} +2 -2
  43. sky/dashboard/out/_next/static/chunks/4555.8e221537181b5dc1.js +6 -0
  44. sky/dashboard/out/_next/static/chunks/4725.937865b81fdaaebb.js +6 -0
  45. sky/dashboard/out/_next/static/chunks/6082-edabd8f6092300ce.js +25 -0
  46. sky/dashboard/out/_next/static/chunks/6989-49cb7dca83a7a62d.js +1 -0
  47. sky/dashboard/out/_next/static/chunks/6990-630bd2a2257275f8.js +1 -0
  48. sky/dashboard/out/_next/static/chunks/7248-a99800d4db8edabd.js +1 -0
  49. sky/dashboard/out/_next/static/chunks/754-cfc5d4ad1b843d29.js +18 -0
  50. sky/dashboard/out/_next/static/chunks/8050-dd8aa107b17dce00.js +16 -0
  51. sky/dashboard/out/_next/static/chunks/8056-d4ae1e0cb81e7368.js +1 -0
  52. sky/dashboard/out/_next/static/chunks/8555.011023e296c127b3.js +6 -0
  53. sky/dashboard/out/_next/static/chunks/8821-93c25df904a8362b.js +1 -0
  54. sky/dashboard/out/_next/static/chunks/8969-0662594b69432ade.js +1 -0
  55. sky/dashboard/out/_next/static/chunks/9025.f15c91c97d124a5f.js +6 -0
  56. sky/dashboard/out/_next/static/chunks/9353-7ad6bd01858556f1.js +1 -0
  57. sky/dashboard/out/_next/static/chunks/pages/_app-5a86569acad99764.js +34 -0
  58. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-8297476714acb4ac.js +6 -0
  59. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-337c3ba1085f1210.js +1 -0
  60. sky/dashboard/out/_next/static/chunks/pages/{clusters-ee39056f9851a3ff.js → clusters-57632ff3684a8b5c.js} +1 -1
  61. sky/dashboard/out/_next/static/chunks/pages/{config-dfb9bf07b13045f4.js → config-718cdc365de82689.js} +1 -1
  62. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-5fd3a453c079c2ea.js +1 -0
  63. sky/dashboard/out/_next/static/chunks/pages/infra-9f85c02c9c6cae9e.js +1 -0
  64. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-90f16972cbecf354.js +1 -0
  65. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-2dd42fc37aad427a.js +16 -0
  66. sky/dashboard/out/_next/static/chunks/pages/jobs-ed806aeace26b972.js +1 -0
  67. sky/dashboard/out/_next/static/chunks/pages/plugins/[...slug]-449a9f5a3bb20fb3.js +1 -0
  68. sky/dashboard/out/_next/static/chunks/pages/users-bec34706b36f3524.js +1 -0
  69. sky/dashboard/out/_next/static/chunks/pages/{volumes-b84b948ff357c43e.js → volumes-a83ba9b38dff7ea9.js} +1 -1
  70. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-84a40f8c7c627fe4.js → [name]-c781e9c3e52ef9fc.js} +1 -1
  71. sky/dashboard/out/_next/static/chunks/pages/workspaces-91e0942f47310aae.js +1 -0
  72. sky/dashboard/out/_next/static/chunks/webpack-cfe59cf684ee13b9.js +1 -0
  73. sky/dashboard/out/_next/static/css/b0dbca28f027cc19.css +3 -0
  74. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  75. sky/dashboard/out/clusters/[cluster].html +1 -1
  76. sky/dashboard/out/clusters.html +1 -1
  77. sky/dashboard/out/config.html +1 -1
  78. sky/dashboard/out/index.html +1 -1
  79. sky/dashboard/out/infra/[context].html +1 -1
  80. sky/dashboard/out/infra.html +1 -1
  81. sky/dashboard/out/jobs/[job].html +1 -1
  82. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  83. sky/dashboard/out/jobs.html +1 -1
  84. sky/dashboard/out/plugins/[...slug].html +1 -0
  85. sky/dashboard/out/users.html +1 -1
  86. sky/dashboard/out/volumes.html +1 -1
  87. sky/dashboard/out/workspace/new.html +1 -1
  88. sky/dashboard/out/workspaces/[name].html +1 -1
  89. sky/dashboard/out/workspaces.html +1 -1
  90. sky/data/data_utils.py +26 -12
  91. sky/data/mounting_utils.py +44 -5
  92. sky/global_user_state.py +111 -19
  93. sky/jobs/client/sdk.py +8 -3
  94. sky/jobs/controller.py +191 -31
  95. sky/jobs/recovery_strategy.py +109 -11
  96. sky/jobs/server/core.py +81 -4
  97. sky/jobs/server/server.py +14 -0
  98. sky/jobs/state.py +417 -19
  99. sky/jobs/utils.py +73 -80
  100. sky/models.py +11 -0
  101. sky/optimizer.py +8 -6
  102. sky/provision/__init__.py +12 -9
  103. sky/provision/common.py +20 -0
  104. sky/provision/docker_utils.py +15 -2
  105. sky/provision/kubernetes/utils.py +163 -20
  106. sky/provision/kubernetes/volume.py +52 -17
  107. sky/provision/provisioner.py +17 -7
  108. sky/provision/runpod/instance.py +3 -1
  109. sky/provision/runpod/utils.py +13 -1
  110. sky/provision/runpod/volume.py +25 -9
  111. sky/provision/slurm/__init__.py +12 -0
  112. sky/provision/slurm/config.py +13 -0
  113. sky/provision/slurm/instance.py +618 -0
  114. sky/provision/slurm/utils.py +689 -0
  115. sky/provision/vast/instance.py +4 -1
  116. sky/provision/vast/utils.py +11 -6
  117. sky/resources.py +135 -13
  118. sky/schemas/api/responses.py +4 -0
  119. sky/schemas/db/global_user_state/010_save_ssh_key.py +1 -1
  120. sky/schemas/db/spot_jobs/008_add_full_resources.py +34 -0
  121. sky/schemas/db/spot_jobs/009_job_events.py +32 -0
  122. sky/schemas/db/spot_jobs/010_job_events_timestamp_with_timezone.py +43 -0
  123. sky/schemas/db/spot_jobs/011_add_links.py +34 -0
  124. sky/schemas/generated/jobsv1_pb2.py +9 -5
  125. sky/schemas/generated/jobsv1_pb2.pyi +12 -0
  126. sky/schemas/generated/jobsv1_pb2_grpc.py +44 -0
  127. sky/schemas/generated/managed_jobsv1_pb2.py +32 -28
  128. sky/schemas/generated/managed_jobsv1_pb2.pyi +11 -2
  129. sky/serve/serve_utils.py +232 -40
  130. sky/serve/server/impl.py +1 -1
  131. sky/server/common.py +17 -0
  132. sky/server/constants.py +1 -1
  133. sky/server/metrics.py +6 -3
  134. sky/server/plugins.py +238 -0
  135. sky/server/requests/executor.py +5 -2
  136. sky/server/requests/payloads.py +30 -1
  137. sky/server/requests/request_names.py +4 -0
  138. sky/server/requests/requests.py +33 -11
  139. sky/server/requests/serializers/encoders.py +22 -0
  140. sky/server/requests/serializers/return_value_serializers.py +70 -0
  141. sky/server/server.py +506 -109
  142. sky/server/server_utils.py +30 -0
  143. sky/server/uvicorn.py +5 -0
  144. sky/setup_files/MANIFEST.in +1 -0
  145. sky/setup_files/dependencies.py +22 -9
  146. sky/sky_logging.py +2 -1
  147. sky/skylet/attempt_skylet.py +13 -3
  148. sky/skylet/constants.py +55 -13
  149. sky/skylet/events.py +10 -4
  150. sky/skylet/executor/__init__.py +1 -0
  151. sky/skylet/executor/slurm.py +187 -0
  152. sky/skylet/job_lib.py +91 -5
  153. sky/skylet/log_lib.py +22 -6
  154. sky/skylet/log_lib.pyi +8 -6
  155. sky/skylet/services.py +18 -3
  156. sky/skylet/skylet.py +5 -1
  157. sky/skylet/subprocess_daemon.py +2 -1
  158. sky/ssh_node_pools/constants.py +12 -0
  159. sky/ssh_node_pools/core.py +40 -3
  160. sky/ssh_node_pools/deploy/__init__.py +4 -0
  161. sky/{utils/kubernetes/deploy_ssh_node_pools.py → ssh_node_pools/deploy/deploy.py} +279 -504
  162. sky/ssh_node_pools/deploy/tunnel/ssh-tunnel.sh +379 -0
  163. sky/ssh_node_pools/deploy/tunnel_utils.py +199 -0
  164. sky/ssh_node_pools/deploy/utils.py +173 -0
  165. sky/ssh_node_pools/server.py +11 -13
  166. sky/{utils/kubernetes/ssh_utils.py → ssh_node_pools/utils.py} +9 -6
  167. sky/templates/kubernetes-ray.yml.j2 +12 -6
  168. sky/templates/slurm-ray.yml.j2 +115 -0
  169. sky/templates/vast-ray.yml.j2 +1 -0
  170. sky/templates/websocket_proxy.py +18 -41
  171. sky/users/model.conf +1 -1
  172. sky/users/permission.py +85 -52
  173. sky/users/rbac.py +31 -3
  174. sky/utils/annotations.py +108 -8
  175. sky/utils/auth_utils.py +42 -0
  176. sky/utils/cli_utils/status_utils.py +19 -5
  177. sky/utils/cluster_utils.py +10 -3
  178. sky/utils/command_runner.py +389 -35
  179. sky/utils/command_runner.pyi +43 -4
  180. sky/utils/common_utils.py +47 -31
  181. sky/utils/context.py +32 -0
  182. sky/utils/db/db_utils.py +36 -6
  183. sky/utils/db/migration_utils.py +41 -21
  184. sky/utils/infra_utils.py +5 -1
  185. sky/utils/instance_links.py +139 -0
  186. sky/utils/interactive_utils.py +49 -0
  187. sky/utils/kubernetes/generate_kubeconfig.sh +42 -33
  188. sky/utils/kubernetes/kubernetes_deploy_utils.py +2 -94
  189. sky/utils/kubernetes/rsync_helper.sh +5 -1
  190. sky/utils/kubernetes/ssh-tunnel.sh +7 -376
  191. sky/utils/plugin_extensions/__init__.py +14 -0
  192. sky/utils/plugin_extensions/external_failure_source.py +176 -0
  193. sky/utils/resources_utils.py +10 -8
  194. sky/utils/rich_utils.py +9 -11
  195. sky/utils/schemas.py +93 -19
  196. sky/utils/status_lib.py +7 -0
  197. sky/utils/subprocess_utils.py +17 -0
  198. sky/volumes/client/sdk.py +6 -3
  199. sky/volumes/server/core.py +65 -27
  200. sky_templates/ray/start_cluster +8 -4
  201. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/METADATA +67 -59
  202. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/RECORD +208 -180
  203. sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +0 -1
  204. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +0 -11
  205. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +0 -6
  206. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +0 -1
  207. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +0 -1
  208. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +0 -15
  209. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +0 -26
  210. sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +0 -1
  211. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +0 -1
  212. sky/dashboard/out/_next/static/chunks/3800-7b45f9fbb6308557.js +0 -1
  213. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +0 -1
  214. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +0 -1
  215. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +0 -15
  216. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +0 -13
  217. sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +0 -1
  218. sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +0 -1
  219. sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +0 -1
  220. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +0 -30
  221. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +0 -41
  222. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +0 -1
  223. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +0 -1
  224. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +0 -6
  225. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +0 -1
  226. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +0 -31
  227. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +0 -30
  228. sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +0 -34
  229. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +0 -16
  230. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +0 -1
  231. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +0 -1
  232. sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +0 -1
  233. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +0 -16
  234. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +0 -21
  235. sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +0 -1
  236. sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +0 -1
  237. sky/dashboard/out/_next/static/chunks/pages/workspaces-531b2f8c4bf89f82.js +0 -1
  238. sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +0 -1
  239. sky/dashboard/out/_next/static/css/0748ce22df867032.css +0 -3
  240. /sky/dashboard/out/_next/static/{96_E2yl3QAiIJGOYCkSpB → 3nu-b8raeKRNABZ2d4GAG}/_ssgManifest.js +0 -0
  241. /sky/{utils/kubernetes → ssh_node_pools/deploy/tunnel}/cleanup-tunnel.sh +0 -0
  242. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/WHEEL +0 -0
  243. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/entry_points.txt +0 -0
  244. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/licenses/LICENSE +0 -0
  245. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,190 @@
1
+ """Utilities for handling interactive SSH authentication."""
2
+ import asyncio
3
+ import fcntl
4
+ import os
5
+ import re
6
+ import sys
7
+ import termios
8
+ import tty
9
+ import typing
10
+
11
+ from sky import sky_logging
12
+ from sky.adaptors import common as adaptors_common
13
+ from sky.client import service_account_auth
14
+ from sky.server import common as server_common
15
+ from sky.utils import rich_utils
16
+
17
+ if typing.TYPE_CHECKING:
18
+ import websockets
19
+ else:
20
+ websockets = adaptors_common.LazyImport('websockets')
21
+
22
+ logger = sky_logging.init_logger(__name__)
23
+
24
+ SKY_INTERACTIVE_PATTERN = re.compile(r'<sky-interactive session="([^"]+)"/>')
25
+
26
+
27
+ # TODO(kevin): Refactor to share code with websocket_proxy.py.
28
async def _handle_interactive_auth_websocket(session_id: str) -> None:
    """Bridge the local terminal to the server-side PTY over a websocket.

    Opens a websocket to the API server's ``/ssh-interactive-auth`` endpoint
    and forwards bytes in both directions between the user's stdin/stdout and
    that socket, so the user can answer interactive SSH prompts (e.g., 2FA).
    The coroutine returns once the server closes the websocket.

    Args:
        session_id: The session identifier from the <sky-interactive> signal.

    Raises:
        Exception: Any error raised while setting up or running the bridge is
            logged and re-raised. Errors inside the two forwarding loops are
            only logged at debug level.
    """
    # Derive the websocket URL from the HTTP(S) server URL. maxsplit=1 keeps
    # the unpacking valid even if '://' appears again later in the URL.
    server_url = server_common.get_server_url()
    server_proto, server_fqdn = server_url.split('://', 1)
    websocket_proto = 'wss' if server_proto == 'https' else 'ws'
    ws_url = (f'{websocket_proto}://{server_fqdn}'
              f'/ssh-interactive-auth?session_id={session_id}')

    logger.info('Starting interactive SSH authentication...')

    headers = {}
    # Add service account auth if available
    headers.update(service_account_auth.get_service_account_headers())
    # Add cookie auth with URL-aware filtering
    headers.update(server_common.get_cookie_header_for_url(ws_url))

    # Set terminal to raw mode if stdin is a tty
    old_settings = None
    if os.isatty(sys.stdin.fileno()):
        old_settings = termios.tcgetattr(sys.stdin.fileno())
        tty.setraw(sys.stdin.fileno())

    stdin_dup_fd = None
    stdout_dup_fd = None
    # Pipe transports registered on the event loop; closed in `finally` so
    # the loop stops watching the terminal once authentication is over.
    stdin_transport = None
    stdout_transport = None
    try:
        # Duplicate stdin/stdout fds before passing to asyncio.
        # When asyncio's loop.connect_read/write_pipe() is called,
        # it creates a transport that takes ownership of the file passed to it.
        # By duplicating the fds, we give asyncio independent copies that it can
        # safely close, while preserving the original sys.stdin/stdout.
        stdin_dup_fd = os.dup(sys.stdin.fileno())
        stdout_dup_fd = os.dup(sys.stdout.fileno())

        async with websockets.connect(ws_url,
                                      additional_headers=headers,
                                      ping_interval=None) as ws:
            loop = asyncio.get_running_loop()

            stdin_reader = asyncio.StreamReader()
            stdin_protocol = asyncio.StreamReaderProtocol(stdin_reader)
            stdin_dup_file = os.fdopen(stdin_dup_fd, 'rb', buffering=0)
            stdin_dup_fd = None  # File object now owns the FD
            stdin_transport, _ = await loop.connect_read_pipe(
                lambda: stdin_protocol, stdin_dup_file)

            stdout_dup_file = os.fdopen(stdout_dup_fd, 'wb', buffering=0)
            stdout_dup_fd = None  # File object now owns the FD
            stdout_transport, stdout_protocol = await loop.connect_write_pipe(
                asyncio.streams.FlowControlMixin,
                stdout_dup_file)  # type: ignore
            stdout_writer = asyncio.StreamWriter(stdout_transport,
                                                 stdout_protocol, None, loop)

            async def stdin_to_websocket():
                """Forward stdin to websocket."""
                try:
                    while True:
                        data = await stdin_reader.read(4096)
                        if not data:
                            break
                        await ws.send(data)
                except asyncio.CancelledError:
                    # Task was cancelled - auth complete
                    pass
                except Exception as e:  # pylint: disable=broad-except
                    logger.debug(f'Error in stdin_to_websocket: {e}')

            async def websocket_to_stdout():
                """Forward websocket to stdout."""
                try:
                    async for message in ws:
                        stdout_writer.write(message)
                        await stdout_writer.drain()
                except Exception as e:  # pylint: disable=broad-except
                    logger.debug(f'Error in websocket_to_stdout: {e}')

            # Run both directions concurrently
            # Use tasks so we can cancel stdin reader when websocket closes
            stdin_task = asyncio.create_task(stdin_to_websocket())
            stdout_task = asyncio.create_task(websocket_to_stdout())

            # Wait for websocket to close (auth complete)
            await stdout_task
            # Cancel stdin reader so it doesn't consume the next keystroke
            stdin_task.cancel()
            try:
                await stdin_task
            except asyncio.CancelledError:
                pass
    except Exception as e:  # pylint: disable=broad-except
        logger.error(f'Failed to handle interactive authentication: {e}')
        raise
    finally:
        # Unregister the pipe transports from the event loop. Cancelling
        # stdin_task alone is not enough: the read transport would keep
        # pulling bytes from the terminal into the (now unused) StreamReader
        # for as long as the loop lives, which matters when this coroutine
        # runs on a caller-owned loop (the async entry point).
        for transport in (stdin_transport, stdout_transport):
            if transport is not None:
                transport.close()

        # Restore terminal settings if they were changed
        if old_settings:
            termios.tcsetattr(sys.stdin.fileno(), termios.TCSADRAIN,
                              old_settings)
            # Flush any buffered input from stdin
            termios.tcflush(sys.stdin.fileno(), termios.TCIFLUSH)
            # Ensure stdout is in blocking mode (can be non-blocking after
            # asyncio transport operations)
            flags = fcntl.fcntl(sys.stdout.fileno(), fcntl.F_GETFL)
            fcntl.fcntl(sys.stdout.fileno(), fcntl.F_SETFL,
                        flags & ~os.O_NONBLOCK)

        # Only fds that were never wrapped in a file object are still ours
        # to close here (the variables are reset to None after os.fdopen).
        for fd in [stdin_dup_fd, stdout_dup_fd]:
            if fd is not None:
                try:
                    os.close(fd)
                except OSError:
                    # Already closed by asyncio or never opened
                    pass
148
+
149
+
150
def handle_interactive_auth(line: str) -> typing.Optional[str]:
    """Run interactive SSH authentication if `line` requests it (sync).

    Args:
        line: A log line that may carry a <sky-interactive> marker.

    Returns:
        `line` unchanged when it contains no marker. None when a marker was
        found; in that case the interactive session has been run to
        completion and the caller should treat the line as consumed.
    """
    marker = SKY_INTERACTIVE_PATTERN.search(line)
    if marker is None:
        # Ordinary log line: hand it back untouched.
        return line

    session_id = marker.group(1)
    # Spinners are paused for the duration so the terminal is free for the
    # user's interactive I/O.
    with rich_utils.safe_logger():
        asyncio.run(_handle_interactive_auth_websocket(session_id))
    return None
170
+
171
+
172
async def handle_interactive_auth_async(line: str) -> typing.Optional[str]:
    """Run interactive SSH authentication if `line` requests it (async).

    Args:
        line: A log line that may carry a <sky-interactive> marker.

    Returns:
        `line` unchanged when it contains no marker. None when a marker was
        found; in that case the interactive session has been awaited to
        completion and the caller should treat the line as consumed.
    """
    marker = SKY_INTERACTIVE_PATTERN.search(line)
    if marker is None:
        # Ordinary log line: hand it back untouched.
        return line

    session_id = marker.group(1)
    # Awaited on the caller's running loop, inside the same logger guard the
    # sync variant uses.
    with rich_utils.safe_logger():
        await _handle_interactive_auth_websocket(session_id)
    return None
sky/client/sdk.py CHANGED
@@ -30,6 +30,7 @@ from sky import sky_logging
30
30
  from sky import skypilot_config
31
31
  from sky.adaptors import common as adaptors_common
32
32
  from sky.client import common as client_common
33
+ from sky.client import interactive_utils
33
34
  from sky.client import oauth as oauth_lib
34
35
  from sky.jobs import scheduler
35
36
  from sky.jobs import utils as managed_job_utils
@@ -42,6 +43,7 @@ from sky.server.requests import request_names
42
43
  from sky.server.requests import requests as requests_lib
43
44
  from sky.skylet import autostop_lib
44
45
  from sky.skylet import constants
46
+ from sky.ssh_node_pools import utils as ssh_utils
45
47
  from sky.usage import usage_lib
46
48
  from sky.utils import admin_policy_utils
47
49
  from sky.utils import annotations
@@ -57,7 +59,6 @@ from sky.utils import status_lib
57
59
  from sky.utils import subprocess_utils
58
60
  from sky.utils import ux_utils
59
61
  from sky.utils import yaml_utils
60
- from sky.utils.kubernetes import ssh_utils
61
62
 
62
63
  if typing.TYPE_CHECKING:
63
64
  import base64
@@ -157,9 +158,16 @@ def stream_response(request_id: Optional[server_common.RequestId[T]],
157
158
  retry_context = rest.get_retry_context()
158
159
  try:
159
160
  line_count = 0
161
+
160
162
  for line in rich_utils.decode_rich_status(response):
161
163
  if line is not None:
162
164
  line_count += 1
165
+
166
+ line = interactive_utils.handle_interactive_auth(line)
167
+ if line is None:
168
+ # Line was consumed by interactive auth handler
169
+ continue
170
+
163
171
  if retry_context is None:
164
172
  print(line, flush=True, end='', file=output_stream)
165
173
  elif line_count > retry_context.line_processed:
@@ -675,7 +683,7 @@ def _launch(
675
683
  clusters = get(status_request_id)
676
684
  cluster_user_hash = common_utils.get_user_hash()
677
685
  cluster_user_hash_str = ''
678
- current_user = common_utils.get_current_user_name()
686
+ current_user = common_utils.get_local_user_name()
679
687
  cluster_user_name = current_user
680
688
  if not clusters:
681
689
  # Show the optimize log before the prompt if the cluster does not
@@ -2744,3 +2752,57 @@ def api_logout() -> None:
2744
2752
  _clear_api_server_config()
2745
2753
  logger.info(f'{colorama.Fore.GREEN}Logged out of SkyPilot API server.'
2746
2754
  f'{colorama.Style.RESET_ALL}')
2755
+
2756
+
2757
@usage_lib.entrypoint
@server_common.check_server_healthy_or_start
@versions.minimal_api_version(24)
@annotations.client_api
def realtime_slurm_gpu_availability(
        name_filter: Optional[str] = None,
        quantity_filter: Optional[int] = None) -> server_common.RequestId:
    """Requests the real-time Slurm GPU availability from the API server.

    Args:
        name_filter: Optional name filter for GPUs.
        quantity_filter: Optional quantity filter for GPUs.

    Returns:
        The request ID of the Slurm GPU availability request.
    """
    request_body = payloads.SlurmGpuAvailabilityRequestBody(
        name_filter=name_filter,
        quantity_filter=quantity_filter,
    )
    # Round-trip through the model's JSON dump so the payload is a plain
    # JSON-compatible dict.
    payload = json.loads(request_body.model_dump_json())
    return server_common.get_request_id(
        server_common.make_authenticated_request(
            'POST',
            '/slurm_gpu_availability',
            json=payload,
        ))
2783
+
2784
+
2785
@usage_lib.entrypoint
@server_common.check_server_healthy_or_start
@versions.minimal_api_version(24)
@annotations.client_api
def slurm_node_info(
        slurm_cluster_name: Optional[str] = None) -> server_common.RequestId:
    """Requests resource information for the nodes of a Slurm cluster.

    Args:
        slurm_cluster_name: Optional Slurm cluster name, forwarded to the
            server in the request body.

    Returns:
        The request ID of the Slurm node info request.

    Request Returns:
        List[Dict[str, Any]]: A list of dictionaries, each containing info
            for a single Slurm node (node_name, partition, node_state,
            gpu_type, total_gpus, free_gpus, vcpu_count, memory_gb).
    """
    request_body = payloads.SlurmNodeInfoRequestBody(
        slurm_cluster_name=slurm_cluster_name)
    # Round-trip through the model's JSON dump so the payload is a plain
    # JSON-compatible dict.
    payload = json.loads(request_body.model_dump_json())
    return server_common.get_request_id(
        server_common.make_authenticated_request(
            'GET',
            '/slurm_node_info',
            json=payload,
        ))
sky/client/sdk_async.py CHANGED
@@ -23,6 +23,7 @@ from sky import catalog
23
23
  from sky import exceptions
24
24
  from sky import sky_logging
25
25
  from sky.client import common as client_common
26
+ from sky.client import interactive_utils
26
27
  from sky.client import sdk
27
28
  from sky.schemas.api import responses
28
29
  from sky.server import common as server_common
@@ -167,9 +168,17 @@ async def stream_response_async(request_id: Optional[str],
167
168
  retry_context = rest.get_retry_context()
168
169
  try:
169
170
  line_count = 0
171
+
170
172
  async for line in rich_utils.decode_rich_status_async(response):
171
173
  if line is not None:
172
174
  line_count += 1
175
+
176
+ line = await interactive_utils.handle_interactive_auth_async(
177
+ line)
178
+ if line is None:
179
+ # Line was consumed by interactive auth handler
180
+ continue
181
+
173
182
  if retry_context is None:
174
183
  print(line, flush=True, end='', file=output_stream)
175
184
  elif line_count > retry_context.line_processed:
sky/clouds/__init__.py CHANGED
@@ -31,6 +31,7 @@ from sky.clouds.runpod import RunPod
31
31
  from sky.clouds.scp import SCP
32
32
  from sky.clouds.seeweb import Seeweb
33
33
  from sky.clouds.shadeform import Shadeform
34
+ from sky.clouds.slurm import Slurm
34
35
  from sky.clouds.ssh import SSH
35
36
  from sky.clouds.vast import Vast
36
37
  from sky.clouds.vsphere import Vsphere
@@ -48,6 +49,7 @@ __all__ = [
48
49
  'Paperspace',
49
50
  'PrimeIntellect',
50
51
  'SCP',
52
+ 'Slurm',
51
53
  'RunPod',
52
54
  'Shadeform',
53
55
  'Vast',
sky/clouds/aws.py CHANGED
@@ -12,6 +12,7 @@ import typing
12
12
  from typing import (Any, Callable, Dict, Iterator, List, Literal, Optional, Set,
13
13
  Tuple, TypeVar, Union)
14
14
 
15
+ import colorama
15
16
  from typing_extensions import ParamSpec
16
17
 
17
18
  from sky import catalog
@@ -758,6 +759,36 @@ class AWS(clouds.Cloud):
758
759
  max_efa_interfaces = 0
759
760
  enable_efa = False
760
761
 
762
+ use_internal_ips = skypilot_config.get_effective_region_config(
763
+ cloud='aws',
764
+ region=region_name,
765
+ keys=('use_internal_ips',),
766
+ default_value=False)
767
+ if max_efa_interfaces > 1 and not use_internal_ips:
768
+ logger.warning(
769
+ f'{colorama.Fore.YELLOW}'
770
+ f'Instance type {resources.instance_type} supports up to '
771
+ f'{max_efa_interfaces} EFA interfaces, but '
772
+ '`use_internal_ips` is not enabled.\nLaunching with the '
773
+ 'current configuration will use only 1 EFA interface.\n'
774
+ f'To use all {max_efa_interfaces} EFA interfaces, enable '
775
+ 'internal IPs by adding one of the following '
776
+ 'configurations to SkyPilot config:\n'
777
+ 'Option 1 (with SSM):\n'
778
+ ' aws:\n'
779
+ ' use_internal_ips: true\n'
780
+ ' use_ssm: true\n'
781
+ 'Option 2 (with SSH proxy):\n'
782
+ ' aws:\n'
783
+ ' use_internal_ips: true\n'
784
+ ' ssh_proxy_command: ssh -W %h:%p -i <ssh key path> '
785
+ '-o StrictHostKeyChecking=no <user>@<jump server public'
786
+ ' ip>\n'
787
+ 'Refer to '
788
+ 'https://docs.skypilot.co/en/latest/reference/config.html'
789
+ '#aws-use-internal-ips for more details.'
790
+ f'{colorama.Style.RESET_ALL}')
791
+
761
792
  docker_run_options = []
762
793
  if resources.extract_docker_image() is not None:
763
794
  image_id_to_use = None
@@ -1005,8 +1036,10 @@ class AWS(clouds.Cloud):
1005
1036
  hints = 'AWS SSO is set.'
1006
1037
  if static_credential_exists:
1007
1038
  hints += (
1008
- ' To ensure multiple clouds work correctly, please use SkyPilot '
1009
- 'with static credentials (e.g., ~/.aws/credentials) by unsetting '
1039
+ ' To ensure S3 mounting and other features work correctly '
1040
+ 'on Kubernetes and other clouds, '
1041
+ 'please use SkyPilot with static AWS credentials '
1042
+ '(e.g., ~/.aws/credentials) by unsetting '
1010
1043
  'the AWS_PROFILE environment variable.')
1011
1044
  else:
1012
1045
  hints += single_cloud_hint
@@ -1081,6 +1114,31 @@ class AWS(clouds.Cloud):
1081
1114
  return identity_type
1082
1115
  return AWSIdentityType.SHARED_CREDENTIALS_FILE
1083
1116
 
1117
@classmethod
def should_use_env_auth_for_s3(cls) -> bool:
    """Tells whether S3 access should rely on environment-based auth.

    With non-static AWS credentials (SSO, IAM role, container role) the
    credentials should not be embedded into the rclone config; rclone
    should instead run with env_auth=true so it resolves credentials
    through the AWS SDK credential chain, which handles temporary
    credentials and IAM roles.

    Returns:
        True when the current identity type is SSO, IAM_ROLE or
        CONTAINER_ROLE; False when it is None or any other (static,
        embeddable) type.
    """
    current_type = cls._current_identity_type()
    # Identity types backed by temporary credentials, which must not be
    # written into config files.
    temporary_credential_types = {
        AWSIdentityType.SSO,
        AWSIdentityType.IAM_ROLE,
        AWSIdentityType.CONTAINER_ROLE,
    }
    return (current_type is not None and
            current_type in temporary_credential_types)
1141
+
1084
1142
  @classmethod
1085
1143
  @aws_profile_aware_lru_cache(scope='request',
1086
1144
  maxsize=_AWS_PROFILE_SCOPED_FUNC_CACHE_SIZE)
sky/clouds/azure.py CHANGED
@@ -97,6 +97,8 @@ class Azure(clouds.Cloud):
97
97
  clouds.CloudImplementationFeatures.HIGH_AVAILABILITY_CONTROLLERS: (
98
98
  f'High availability controllers are not supported on {cls._REPR}.'
99
99
  ),
100
+ clouds.CloudImplementationFeatures.CUSTOM_NETWORK_TIER:
101
+ (f'Custom network tier is not supported on {cls._REPR}.'),
100
102
  clouds.CloudImplementationFeatures.CUSTOM_MULTI_NETWORK: (
101
103
  f'Customized multiple network interfaces are not supported on {cls._REPR}.'
102
104
  ),
sky/clouds/cloud.py CHANGED
@@ -182,6 +182,13 @@ class Cloud:
182
182
  """
183
183
  return cls._SUPPORTS_SERVICE_ACCOUNT_ON_REMOTE
184
184
 
185
@classmethod
def uses_ray(cls) -> bool:
    """Reports whether this cloud runs on Ray.

    Returns:
        True. This base implementation always answers that Ray is the
        distributed execution framework.
    """
    ray_is_execution_framework = True
    return ray_is_execution_framework
191
+
185
192
  #### Regions/Zones ####
186
193
 
187
194
  @classmethod
sky/clouds/kubernetes.py CHANGED
@@ -766,6 +766,8 @@ class Kubernetes(clouds.Cloud):
766
766
  'ha_recovery_log_path':
767
767
  constants.HA_PERSISTENT_RECOVERY_LOG_PATH.format(''),
768
768
  'sky_python_cmd': constants.SKY_PYTHON_CMD,
769
+ 'sky_unset_pythonpath_and_set_cwd':
770
+ constants.SKY_UNSET_PYTHONPATH_AND_SET_CWD,
769
771
  'k8s_high_availability_storage_class_name':
770
772
  (k8s_ha_storage_class_name),
771
773
  'avoid_label_keys': avoid_label_keys,
sky/clouds/runpod.py CHANGED
@@ -7,6 +7,7 @@ from typing import Dict, Iterator, List, Optional, Tuple, Union
7
7
 
8
8
  from sky import catalog
9
9
  from sky import clouds
10
+ from sky.utils import common_utils
10
11
  from sky.utils import registry
11
12
  from sky.utils import resources_utils
12
13
 
@@ -312,18 +313,48 @@ class RunPod(clouds.Cloud):
312
313
  # If that happens to be set to None, then ValueError is raised.
313
314
  return False, dependency_error_msg
314
315
 
316
+ hint_msg = (
317
+ 'Credentials can be set up by running: \n'
318
+ ' $ pip install runpod \n'
319
+ ' $ runpod config\n'
320
+ ' For more information, see https://docs.skypilot.co/en/latest/getting-started/installation.html#runpod' # pylint: disable=line-too-long
321
+ )
322
+
315
323
  valid, error = cls._check_runpod_credentials()
316
324
  if not valid:
317
- return False, (
318
- f'{error} \n' # First line is indented by 4 spaces
319
- ' Credentials can be set up by running: \n'
320
- f' $ pip install runpod \n'
321
- f' $ runpod config\n'
322
- ' For more information, see https://docs.skypilot.co/en/latest/getting-started/installation.html#runpod' # pylint: disable=line-too-long
323
- )
325
+ return False, (f'{error} \n {hint_msg}')
326
+
327
+ # Validate credentials by making an actual API call
328
+ valid, error = cls._validate_api_key()
329
+ if not valid:
330
+ return False, (f'{error} \n {hint_msg}')
324
331
 
325
332
  return True, None
326
333
 
334
@classmethod
def _validate_api_key(cls) -> Tuple[bool, Optional[str]]:
    """Check the RunPod API key by issuing a real API request.

    Returns:
        (True, None) when listing instances succeeds; otherwise
        (False, message), where the message distinguishes an
        authentication-looking query error, any other query error, and
        an unexpected exception type.
    """
    # Imported lazily to avoid circular imports and to make sure runpod is
    # configured before use.
    # pylint: disable=import-outside-toplevel
    from sky.provision.runpod import utils as runpod_utils
    try:
        # Listing instances is the probe: it only succeeds with a key the
        # API accepts.
        runpod_utils.list_instances()
    except Exception as e:  # pylint: disable=broad-except
        from sky.adaptors import runpod
        error_msg = common_utils.format_exception(e, use_bracket=True)
        if not isinstance(e, runpod.runpod.error.QueryError):
            return False, ('An unexpected error occurred during RunPod API '
                           f'key validation. {error_msg}')
        # Classify query errors by looking for auth-related words/codes in
        # the lower-cased exception text.
        lowered_text = str(e).lower()
        looks_like_auth_failure = any(
            token in lowered_text
            for token in ['unauthorized', 'forbidden', '401', '403'])
        if looks_like_auth_failure:
            return False, ('RunPod API key is invalid or lacks required '
                           f'permissions. {error_msg}')
        return False, (f'Failed to verify RunPod API key. {error_msg}')
    else:
        return True, None
357
+
327
358
  @classmethod
328
359
  def _check_runpod_credentials(cls, profile: str = 'default'):
329
360
  """Checks if the credentials file exists and is valid."""