skypilot-nightly 1.0.0.dev20251210__py3-none-any.whl → 1.0.0.dev20260112__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (207)
  1. sky/__init__.py +4 -2
  2. sky/adaptors/slurm.py +159 -72
  3. sky/backends/backend_utils.py +52 -10
  4. sky/backends/cloud_vm_ray_backend.py +192 -32
  5. sky/backends/task_codegen.py +40 -2
  6. sky/catalog/data_fetchers/fetch_gcp.py +9 -1
  7. sky/catalog/data_fetchers/fetch_nebius.py +1 -1
  8. sky/catalog/data_fetchers/fetch_vast.py +4 -2
  9. sky/catalog/seeweb_catalog.py +30 -15
  10. sky/catalog/shadeform_catalog.py +5 -2
  11. sky/catalog/slurm_catalog.py +0 -7
  12. sky/catalog/vast_catalog.py +30 -6
  13. sky/check.py +11 -8
  14. sky/client/cli/command.py +106 -54
  15. sky/client/interactive_utils.py +190 -0
  16. sky/client/sdk.py +8 -0
  17. sky/client/sdk_async.py +9 -0
  18. sky/clouds/aws.py +60 -2
  19. sky/clouds/azure.py +2 -0
  20. sky/clouds/kubernetes.py +2 -0
  21. sky/clouds/runpod.py +38 -7
  22. sky/clouds/slurm.py +44 -12
  23. sky/clouds/ssh.py +1 -1
  24. sky/clouds/vast.py +30 -17
  25. sky/core.py +69 -1
  26. sky/dashboard/out/404.html +1 -1
  27. sky/dashboard/out/_next/static/3nu-b8raeKRNABZ2d4GAG/_buildManifest.js +1 -0
  28. sky/dashboard/out/_next/static/chunks/1871-0565f8975a7dcd10.js +6 -0
  29. sky/dashboard/out/_next/static/chunks/2109-55a1546d793574a7.js +11 -0
  30. sky/dashboard/out/_next/static/chunks/2521-099b07cd9e4745bf.js +26 -0
  31. sky/dashboard/out/_next/static/chunks/2755.a636e04a928a700e.js +31 -0
  32. sky/dashboard/out/_next/static/chunks/3495.05eab4862217c1a5.js +6 -0
  33. sky/dashboard/out/_next/static/chunks/3785.cfc5dcc9434fd98c.js +1 -0
  34. sky/dashboard/out/_next/static/chunks/3981.645d01bf9c8cad0c.js +21 -0
  35. sky/dashboard/out/_next/static/chunks/4083-0115d67c1fb57d6c.js +21 -0
  36. sky/dashboard/out/_next/static/chunks/{8640.5b9475a2d18c5416.js → 429.a58e9ba9742309ed.js} +2 -2
  37. sky/dashboard/out/_next/static/chunks/4555.8e221537181b5dc1.js +6 -0
  38. sky/dashboard/out/_next/static/chunks/4725.937865b81fdaaebb.js +6 -0
  39. sky/dashboard/out/_next/static/chunks/6082-edabd8f6092300ce.js +25 -0
  40. sky/dashboard/out/_next/static/chunks/6989-49cb7dca83a7a62d.js +1 -0
  41. sky/dashboard/out/_next/static/chunks/6990-630bd2a2257275f8.js +1 -0
  42. sky/dashboard/out/_next/static/chunks/7248-a99800d4db8edabd.js +1 -0
  43. sky/dashboard/out/_next/static/chunks/754-cfc5d4ad1b843d29.js +18 -0
  44. sky/dashboard/out/_next/static/chunks/8050-dd8aa107b17dce00.js +16 -0
  45. sky/dashboard/out/_next/static/chunks/8056-d4ae1e0cb81e7368.js +1 -0
  46. sky/dashboard/out/_next/static/chunks/8555.011023e296c127b3.js +6 -0
  47. sky/dashboard/out/_next/static/chunks/8821-93c25df904a8362b.js +1 -0
  48. sky/dashboard/out/_next/static/chunks/8969-0662594b69432ade.js +1 -0
  49. sky/dashboard/out/_next/static/chunks/9025.f15c91c97d124a5f.js +6 -0
  50. sky/dashboard/out/_next/static/chunks/{9353-8369df1cf105221c.js → 9353-7ad6bd01858556f1.js} +1 -1
  51. sky/dashboard/out/_next/static/chunks/pages/_app-5a86569acad99764.js +34 -0
  52. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-8297476714acb4ac.js +6 -0
  53. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-337c3ba1085f1210.js +1 -0
  54. sky/dashboard/out/_next/static/chunks/pages/{clusters-9e5d47818b9bdadd.js → clusters-57632ff3684a8b5c.js} +1 -1
  55. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-5fd3a453c079c2ea.js +1 -0
  56. sky/dashboard/out/_next/static/chunks/pages/infra-9f85c02c9c6cae9e.js +1 -0
  57. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-90f16972cbecf354.js +1 -0
  58. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-2dd42fc37aad427a.js +16 -0
  59. sky/dashboard/out/_next/static/chunks/pages/jobs-ed806aeace26b972.js +1 -0
  60. sky/dashboard/out/_next/static/chunks/pages/users-bec34706b36f3524.js +1 -0
  61. sky/dashboard/out/_next/static/chunks/pages/{volumes-ef19d49c6d0e8500.js → volumes-a83ba9b38dff7ea9.js} +1 -1
  62. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-96e0f298308da7e2.js → [name]-c781e9c3e52ef9fc.js} +1 -1
  63. sky/dashboard/out/_next/static/chunks/pages/workspaces-91e0942f47310aae.js +1 -0
  64. sky/dashboard/out/_next/static/chunks/webpack-cfe59cf684ee13b9.js +1 -0
  65. sky/dashboard/out/_next/static/css/b0dbca28f027cc19.css +3 -0
  66. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  67. sky/dashboard/out/clusters/[cluster].html +1 -1
  68. sky/dashboard/out/clusters.html +1 -1
  69. sky/dashboard/out/config.html +1 -1
  70. sky/dashboard/out/index.html +1 -1
  71. sky/dashboard/out/infra/[context].html +1 -1
  72. sky/dashboard/out/infra.html +1 -1
  73. sky/dashboard/out/jobs/[job].html +1 -1
  74. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  75. sky/dashboard/out/jobs.html +1 -1
  76. sky/dashboard/out/plugins/[...slug].html +1 -1
  77. sky/dashboard/out/users.html +1 -1
  78. sky/dashboard/out/volumes.html +1 -1
  79. sky/dashboard/out/workspace/new.html +1 -1
  80. sky/dashboard/out/workspaces/[name].html +1 -1
  81. sky/dashboard/out/workspaces.html +1 -1
  82. sky/data/data_utils.py +26 -12
  83. sky/data/mounting_utils.py +29 -4
  84. sky/global_user_state.py +108 -16
  85. sky/jobs/client/sdk.py +8 -3
  86. sky/jobs/controller.py +191 -31
  87. sky/jobs/recovery_strategy.py +109 -11
  88. sky/jobs/server/core.py +81 -4
  89. sky/jobs/server/server.py +14 -0
  90. sky/jobs/state.py +417 -19
  91. sky/jobs/utils.py +73 -80
  92. sky/models.py +9 -0
  93. sky/optimizer.py +2 -1
  94. sky/provision/__init__.py +11 -9
  95. sky/provision/kubernetes/utils.py +122 -15
  96. sky/provision/kubernetes/volume.py +52 -17
  97. sky/provision/provisioner.py +2 -1
  98. sky/provision/runpod/instance.py +3 -1
  99. sky/provision/runpod/utils.py +13 -1
  100. sky/provision/runpod/volume.py +25 -9
  101. sky/provision/slurm/instance.py +75 -29
  102. sky/provision/slurm/utils.py +213 -107
  103. sky/provision/vast/utils.py +1 -0
  104. sky/resources.py +135 -13
  105. sky/schemas/api/responses.py +4 -0
  106. sky/schemas/db/global_user_state/010_save_ssh_key.py +1 -1
  107. sky/schemas/db/spot_jobs/008_add_full_resources.py +34 -0
  108. sky/schemas/db/spot_jobs/009_job_events.py +32 -0
  109. sky/schemas/db/spot_jobs/010_job_events_timestamp_with_timezone.py +43 -0
  110. sky/schemas/db/spot_jobs/011_add_links.py +34 -0
  111. sky/schemas/generated/jobsv1_pb2.py +9 -5
  112. sky/schemas/generated/jobsv1_pb2.pyi +12 -0
  113. sky/schemas/generated/jobsv1_pb2_grpc.py +44 -0
  114. sky/schemas/generated/managed_jobsv1_pb2.py +32 -28
  115. sky/schemas/generated/managed_jobsv1_pb2.pyi +11 -2
  116. sky/serve/serve_utils.py +232 -40
  117. sky/server/common.py +17 -0
  118. sky/server/constants.py +1 -1
  119. sky/server/metrics.py +6 -3
  120. sky/server/plugins.py +16 -0
  121. sky/server/requests/payloads.py +18 -0
  122. sky/server/requests/request_names.py +2 -0
  123. sky/server/requests/requests.py +28 -10
  124. sky/server/requests/serializers/encoders.py +5 -0
  125. sky/server/requests/serializers/return_value_serializers.py +14 -4
  126. sky/server/server.py +434 -107
  127. sky/server/uvicorn.py +5 -0
  128. sky/setup_files/MANIFEST.in +1 -0
  129. sky/setup_files/dependencies.py +21 -10
  130. sky/sky_logging.py +2 -1
  131. sky/skylet/constants.py +22 -5
  132. sky/skylet/executor/slurm.py +4 -6
  133. sky/skylet/job_lib.py +89 -4
  134. sky/skylet/services.py +18 -3
  135. sky/ssh_node_pools/deploy/tunnel/cleanup-tunnel.sh +62 -0
  136. sky/ssh_node_pools/deploy/tunnel/ssh-tunnel.sh +379 -0
  137. sky/templates/kubernetes-ray.yml.j2 +4 -6
  138. sky/templates/slurm-ray.yml.j2 +32 -2
  139. sky/templates/websocket_proxy.py +18 -41
  140. sky/users/permission.py +61 -51
  141. sky/utils/auth_utils.py +42 -0
  142. sky/utils/cli_utils/status_utils.py +19 -5
  143. sky/utils/cluster_utils.py +10 -3
  144. sky/utils/command_runner.py +256 -94
  145. sky/utils/command_runner.pyi +16 -0
  146. sky/utils/common_utils.py +30 -29
  147. sky/utils/context.py +32 -0
  148. sky/utils/db/db_utils.py +36 -6
  149. sky/utils/db/migration_utils.py +41 -21
  150. sky/utils/infra_utils.py +5 -1
  151. sky/utils/instance_links.py +139 -0
  152. sky/utils/interactive_utils.py +49 -0
  153. sky/utils/kubernetes/generate_kubeconfig.sh +42 -33
  154. sky/utils/kubernetes/rsync_helper.sh +5 -1
  155. sky/utils/plugin_extensions/__init__.py +14 -0
  156. sky/utils/plugin_extensions/external_failure_source.py +176 -0
  157. sky/utils/resources_utils.py +10 -8
  158. sky/utils/rich_utils.py +9 -11
  159. sky/utils/schemas.py +63 -20
  160. sky/utils/status_lib.py +7 -0
  161. sky/utils/subprocess_utils.py +17 -0
  162. sky/volumes/client/sdk.py +6 -3
  163. sky/volumes/server/core.py +65 -27
  164. sky_templates/ray/start_cluster +8 -4
  165. {skypilot_nightly-1.0.0.dev20251210.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/METADATA +53 -57
  166. {skypilot_nightly-1.0.0.dev20251210.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/RECORD +172 -162
  167. sky/dashboard/out/_next/static/KYAhEFa3FTfq4JyKVgo-s/_buildManifest.js +0 -1
  168. sky/dashboard/out/_next/static/chunks/1141-9c810f01ff4f398a.js +0 -11
  169. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +0 -6
  170. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +0 -1
  171. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +0 -1
  172. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +0 -15
  173. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +0 -26
  174. sky/dashboard/out/_next/static/chunks/3294.ddda8c6c6f9f24dc.js +0 -1
  175. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +0 -1
  176. sky/dashboard/out/_next/static/chunks/3800-b589397dc09c5b4e.js +0 -1
  177. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +0 -1
  178. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +0 -15
  179. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +0 -13
  180. sky/dashboard/out/_next/static/chunks/6856-da20c5fd999f319c.js +0 -1
  181. sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +0 -1
  182. sky/dashboard/out/_next/static/chunks/6990-09cbf02d3cd518c3.js +0 -1
  183. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +0 -30
  184. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +0 -41
  185. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +0 -1
  186. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +0 -1
  187. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +0 -6
  188. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +0 -31
  189. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +0 -30
  190. sky/dashboard/out/_next/static/chunks/pages/_app-68b647e26f9d2793.js +0 -34
  191. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-33f525539665fdfd.js +0 -16
  192. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a7565f586ef86467.js +0 -1
  193. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-12c559ec4d81fdbd.js +0 -1
  194. sky/dashboard/out/_next/static/chunks/pages/infra-d187cd0413d72475.js +0 -1
  195. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-895847b6cf200b04.js +0 -16
  196. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-8d0f4655400b4eb9.js +0 -21
  197. sky/dashboard/out/_next/static/chunks/pages/jobs-e5a98f17f8513a96.js +0 -1
  198. sky/dashboard/out/_next/static/chunks/pages/users-2f7646eb77785a2c.js +0 -1
  199. sky/dashboard/out/_next/static/chunks/pages/workspaces-cb4da3abe08ebf19.js +0 -1
  200. sky/dashboard/out/_next/static/chunks/webpack-fba3de387ff6bb08.js +0 -1
  201. sky/dashboard/out/_next/static/css/c5a4cfd2600fc715.css +0 -3
  202. /sky/dashboard/out/_next/static/{KYAhEFa3FTfq4JyKVgo-s → 3nu-b8raeKRNABZ2d4GAG}/_ssgManifest.js +0 -0
  203. /sky/dashboard/out/_next/static/chunks/pages/plugins/{[...slug]-4f46050ca065d8f8.js → [...slug]-449a9f5a3bb20fb3.js} +0 -0
  204. {skypilot_nightly-1.0.0.dev20251210.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/WHEEL +0 -0
  205. {skypilot_nightly-1.0.0.dev20251210.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/entry_points.txt +0 -0
  206. {skypilot_nightly-1.0.0.dev20251210.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/licenses/LICENSE +0 -0
  207. {skypilot_nightly-1.0.0.dev20251210.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/top_level.txt +0 -0
sky/__init__.py CHANGED
@@ -7,7 +7,7 @@ import urllib.request
 from sky.utils import directory_utils
 
 # Replaced with the current commit when building the wheels.
-_SKYPILOT_COMMIT_SHA = 'c28d94abd967c1a7494e3c343f92eb6d02d29541'
+_SKYPILOT_COMMIT_SHA = '5f4cd3b33375c055093474b95f219d26018b7343'
 
 
 def _get_git_commit():
@@ -37,7 +37,7 @@ def _get_git_commit():
 
 
 __commit__ = _get_git_commit()
-__version__ = '1.0.0.dev20251210'
+__version__ = '1.0.0.dev20260112'
 __root_dir__ = directory_utils.get_sky_dir()
 
 
@@ -143,6 +143,7 @@ SCP = clouds.SCP
 Slurm = clouds.Slurm
 Kubernetes = clouds.Kubernetes
 K8s = Kubernetes
+SSH = clouds.SSH
 OCI = clouds.OCI
 Paperspace = clouds.Paperspace
 PrimeIntellect = clouds.PrimeIntellect
@@ -164,6 +165,7 @@ __all__ = [
     'IBM',
     'Kubernetes',
     'K8s',
+    'SSH',
     'Lambda',
     'OCI',
     'Paperspace',
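With SSH now exported at the top level alongside the other cloud classes, the SSH node-pool cloud can be referenced directly from the sky namespace. A minimal, illustrative snippet (the task and its resources are hypothetical, not taken from this diff):

    import sky

    task = sky.Task(run='nvidia-smi')
    # SSH is now available the same way as sky.AWS, sky.Kubernetes, etc.
    task.set_resources(sky.Resources(cloud=sky.SSH()))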
sky/adaptors/slurm.py CHANGED
@@ -1,11 +1,15 @@
 """Slurm adaptor for SkyPilot."""
 
+import ipaddress
 import logging
 import re
+import socket
 import time
 from typing import Dict, List, NamedTuple, Optional, Tuple
 
+from sky.adaptors import common
 from sky.utils import command_runner
+from sky.utils import common_utils
 from sky.utils import subprocess_utils
 from sky.utils import timeline
 
@@ -22,6 +26,11 @@ _PARTITION_NAME_REGEX = re.compile(r'PartitionName=(.+?)(?:\s+\w+=|$)')
 # Default timeout for waiting for job nodes to be allocated, in seconds.
 _SLURM_DEFAULT_PROVISION_TIMEOUT = 10
 
+_IMPORT_ERROR_MESSAGE = ('Failed to import dependencies for Slurm. '
+                         'Try running: pip install "skypilot[slurm]"')
+hostlist = common.LazyImport('hostlist',
+                             import_error_message=_IMPORT_ERROR_MESSAGE)
+
 
 class SlurmPartition(NamedTuple):
     """Information about the Slurm partitions."""
@@ -47,11 +56,13 @@ class SlurmClient:
 
     def __init__(
         self,
-        ssh_host: str,
-        ssh_port: int,
-        ssh_user: str,
-        ssh_key: Optional[str],
+        ssh_host: Optional[str] = None,
+        ssh_port: Optional[int] = None,
+        ssh_user: Optional[str] = None,
+        ssh_key: Optional[str] = None,
         ssh_proxy_command: Optional[str] = None,
+        ssh_proxy_jump: Optional[str] = None,
+        is_inside_slurm_cluster: bool = False,
     ):
         """Initialize SlurmClient.
 
@@ -61,21 +72,42 @@
             ssh_user: SSH username.
             ssh_key: Path to SSH private key, or None for keyless SSH.
             ssh_proxy_command: Optional SSH proxy command.
+            ssh_proxy_jump: Optional SSH proxy jump destination.
+            is_inside_slurm_cluster: If True, uses local execution mode (for
+                when running on the Slurm cluster itself). Defaults to False.
         """
         self.ssh_host = ssh_host
         self.ssh_port = ssh_port
         self.ssh_user = ssh_user
         self.ssh_key = ssh_key
         self.ssh_proxy_command = ssh_proxy_command
-
-        # Internal runner for executing Slurm CLI commands
-        # on the controller node.
-        self._runner = command_runner.SSHCommandRunner(
-            (ssh_host, ssh_port),
-            ssh_user,
-            ssh_key,
-            ssh_proxy_command=ssh_proxy_command,
-        )
+        self.ssh_proxy_jump = ssh_proxy_jump
+
+        self._runner: command_runner.CommandRunner
+
+        if is_inside_slurm_cluster:
+            # Local execution mode - for running on the Slurm cluster itself
+            # (e.g., autodown from skylet).
+            self._runner = command_runner.LocalProcessCommandRunner()
+        else:
+            # Remote execution via SSH
+            assert ssh_host is not None
+            assert ssh_port is not None
+            assert ssh_user is not None
+            self._runner = command_runner.SSHCommandRunner(
+                (ssh_host, ssh_port),
+                ssh_user,
+                ssh_key,
+                ssh_proxy_command=ssh_proxy_command,
+                ssh_proxy_jump=ssh_proxy_jump,
+                enable_interactive_auth=True,
+            )
+
+    def _run_slurm_cmd(self, cmd: str) -> Tuple[int, str, str]:
+        return self._runner.run(cmd,
+                                require_outputs=True,
+                                separate_stderr=True,
+                                stream_logs=False)
 
     def query_jobs(
         self,
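For orientation, a rough usage sketch of the reworked constructor shown in the hunk above; the host, user, and key values below are placeholders, not values from this diff:

    from sky.adaptors import slurm

    # Remote mode (default): Slurm CLI commands run over SSH on the login node.
    client = slurm.SlurmClient(ssh_host='slurm-login.example.com',
                               ssh_port=22,
                               ssh_user='ubuntu',
                               ssh_key='~/.ssh/slurm_key')

    # Local mode: used when the code already runs on the Slurm cluster
    # (e.g., autodown from skylet), backed by LocalProcessCommandRunner.
    local_client = slurm.SlurmClient(is_inside_slurm_cluster=True)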
@@ -99,13 +131,11 @@
         if job_name is not None:
             cmd += f' --name {job_name}'
 
-        rc, stdout, stderr = self._runner.run(cmd,
-                                              require_outputs=True,
-                                              stream_logs=False)
+        rc, stdout, stderr = self._run_slurm_cmd(cmd)
         subprocess_utils.handle_returncode(rc,
                                            cmd,
                                            'Failed to query Slurm jobs.',
-                                           stderr=stderr)
+                                           stderr=f'{stdout}\n{stderr}')
 
         job_ids = stdout.strip().splitlines()
         return job_ids
@@ -128,13 +158,11 @@
             cmd += f' --signal {signal}'
         if full:
             cmd += ' --full'
-        rc, stdout, stderr = self._runner.run(cmd,
-                                              require_outputs=True,
-                                              stream_logs=False)
+        rc, stdout, stderr = self._run_slurm_cmd(cmd)
         subprocess_utils.handle_returncode(rc,
                                            cmd,
                                            f'Failed to cancel job {job_name}.',
-                                           stderr=stderr)
+                                           stderr=f'{stdout}\n{stderr}')
         logger.debug(f'Successfully cancelled job {job_name}: {stdout}')
 
     def info(self) -> str:
@@ -147,11 +175,12 @@
             The stdout output from sinfo.
         """
         cmd = 'sinfo'
-        rc, stdout, stderr = self._runner.run(cmd,
-                                              require_outputs=True,
-                                              stream_logs=False)
+        rc, stdout, stderr = self._run_slurm_cmd(cmd)
         subprocess_utils.handle_returncode(
-            rc, cmd, 'Failed to get Slurm cluster information.', stderr=stderr)
+            rc,
+            cmd,
+            'Failed to get Slurm cluster information.',
+            stderr=f'{stdout}\n{stderr}')
         return stdout
 
     def info_nodes(self) -> List[NodeInfo]:
@@ -162,11 +191,12 @@
         """
         cmd = (f'sinfo -h --Node -o '
                f'"%N{SEP}%t{SEP}%G{SEP}%c{SEP}%m{SEP}%P"')
-        rc, stdout, stderr = self._runner.run(cmd,
-                                              require_outputs=True,
-                                              stream_logs=False)
+        rc, stdout, stderr = self._run_slurm_cmd(cmd)
         subprocess_utils.handle_returncode(
-            rc, cmd, 'Failed to get Slurm node information.', stderr=stderr)
+            rc,
+            cmd,
+            'Failed to get Slurm node information.',
+            stderr=f'{stdout}\n{stderr}')
 
         nodes = []
         for line in stdout.splitlines():
@@ -211,31 +241,63 @@
             return node_info
 
         cmd = f'scontrol show node {node_name}'
-        rc, node_details, _ = self._runner.run(cmd,
-                                               require_outputs=True,
-                                               stream_logs=False)
+        rc, node_details, stderr = self._run_slurm_cmd(cmd)
         subprocess_utils.handle_returncode(
             rc,
             cmd,
             f'Failed to get detailed node information for {node_name}.',
-            stderr=node_details)
+            stderr=f'{node_details}\n{stderr}')
         node_info = _parse_scontrol_node_output(node_details)
         return node_info
 
-    def get_node_jobs(self, node_name: str) -> List[str]:
-        """Get the list of jobs for a given node name.
+    def get_jobs_gres(self, node_name: str) -> List[str]:
+        """Get the list of jobs GRES for a given node name.
 
         Returns:
-            A list of job names for the current user on the node.
+            A list of GRES specs (e.g., 'gres/gpu:h100:4')
+            for jobs on the node.
         """
-        cmd = f'squeue --me -h --nodelist {node_name} -o "%b"'
-        rc, stdout, stderr = self._runner.run(cmd,
-                                              require_outputs=True,
-                                              stream_logs=False)
+        cmd = f'squeue -h --nodelist {node_name} -o "%b"'
+        rc, stdout, stderr = self._run_slurm_cmd(cmd)
         subprocess_utils.handle_returncode(
-            rc, cmd, f'Failed to get jobs for node {node_name}.', stderr=stderr)
+            rc,
+            cmd,
+            f'Failed to get jobs for node {node_name}.',
+            stderr=f'{stdout}\n{stderr}')
         return stdout.splitlines()
 
+    def get_all_jobs_gres(self) -> Dict[str, List[str]]:
+        """Get GRES allocation for all running jobs, grouped by node.
+
+        Returns:
+            Dict mapping node_name -> list of GRES strings for jobs on that
+            node.
+        """
+        cmd = f'squeue -h --states=running,completing -o "%N{SEP}%b"'
+        rc, stdout, stderr = self._run_slurm_cmd(cmd)
+        subprocess_utils.handle_returncode(rc,
+                                           cmd,
+                                           'Failed to get all jobs GRES.',
+                                           stderr=f'{stdout}\n{stderr}')
+
+        nodes_to_gres: Dict[str, List[str]] = {}
+        for line in stdout.splitlines():
+            line = line.strip()
+            if not line:
+                continue
+            parts = line.split(SEP)
+            if len(parts) != 2:
+                # We should never reach here, but just in case.
+                continue
+            nodelist_str, gres_str = parts
+            if not gres_str or gres_str == 'N/A':
+                continue
+
+            for node in hostlist.expand_hostlist(nodelist_str):
+                nodes_to_gres.setdefault(node, []).append(gres_str)
+
+        return nodes_to_gres
+
     def get_job_state(self, job_id: str) -> Optional[str]:
         """Get the state of a Slurm job.
 
@@ -249,17 +311,30 @@
         # Use --only-job-state since we only need the job state.
         # This reduces the work required by slurmctld.
         cmd = f'squeue -h --only-job-state --jobs {job_id} -o "%T"'
-        rc, stdout, stderr = self._runner.run(cmd,
-                                              require_outputs=True,
-                                              stream_logs=False)
-        if rc != 0:
-            # Job may not exist
-            logger.debug(f'Failed to get job state for job {job_id}: {stderr}')
-            return None
+        rc, stdout, stderr = self._run_slurm_cmd(cmd)
+        subprocess_utils.handle_returncode(
+            rc,
+            cmd,
+            f'Failed to get job state for job {job_id}.',
+            stderr=f'{stdout}\n{stderr}')
 
         state = stdout.strip()
         return state if state else None
 
+    def get_jobs_state_by_name(self, job_name: str) -> List[str]:
+        """Get the states of all Slurm jobs by name.
+        """
+        cmd = f'squeue -h --name {job_name} -o "%T"'
+        rc, stdout, stderr = self._run_slurm_cmd(cmd)
+        subprocess_utils.handle_returncode(
+            rc,
+            cmd,
+            f'Failed to get job state for job {job_name}.',
+            stderr=f'{stdout}\n{stderr}')
+
+        states = stdout.splitlines()
+        return states
+
     @timeline.event
     def get_job_reason(self, job_id: str) -> Optional[str]:
         """Get the reason a job is in its current state
@@ -269,12 +344,12 @@
         """
         # Without --states all, squeue omits terminated jobs.
         cmd = f'squeue -h --jobs {job_id} --states all -o "%r"'
-        rc, stdout, stderr = self._runner.run(cmd,
-                                              require_outputs=True,
-                                              stream_logs=False)
-        if rc != 0:
-            logger.debug(f'Failed to get job info for job {job_id}: {stderr}')
-            return None
+        rc, stdout, stderr = self._run_slurm_cmd(cmd)
+        subprocess_utils.handle_returncode(
+            rc,
+            cmd,
+            f'Failed to get job reason for job {job_id}.',
+            stderr=f'{stdout}\n{stderr}')
 
         output = stdout.strip()
         if not output:
@@ -312,9 +387,7 @@
 
         # Check if nodes are allocated by trying to get node list
         cmd = f'squeue -h --jobs {job_id} -o "%N"'
-        rc, stdout, stderr = self._runner.run(cmd,
-                                              require_outputs=True,
-                                              stream_logs=False)
+        rc, stdout, stderr = self._run_slurm_cmd(cmd)
 
         if rc == 0 and stdout.strip():
             # Nodes are allocated
@@ -322,7 +395,8 @@
                 f'Job {job_id} has nodes allocated: {stdout.strip()}')
             return
         elif rc != 0:
-            logger.debug(f'Failed to get nodes for job {job_id}: {stderr}')
+            logger.debug(f'Failed to get nodes for job {job_id}: '
+                         f'{stdout}\n{stderr}')
 
         # Wait before checking again
         time.sleep(2)
@@ -359,15 +433,16 @@
             f'squeue -h --jobs {job_id} -o "%N" | tr \',\' \'\\n\' | '
             f'while read node; do '
             # TODO(kevin): Use json output for more robust parsing.
-            f'ip=$(scontrol show node=$node | grep NodeAddr= | '
+            f'node_addr=$(scontrol show node=$node | grep NodeAddr= | '
             f'awk -F= \'{{print $2}}\' | awk \'{{print $1}}\'); '
-            f'echo "$node $ip"; '
+            f'echo "$node $node_addr"; '
             f'done')
-        rc, stdout, stderr = self._runner.run(cmd,
-                                              require_outputs=True,
-                                              stream_logs=False)
+        rc, stdout, stderr = self._run_slurm_cmd(cmd)
         subprocess_utils.handle_returncode(
-            rc, cmd, f'Failed to get nodes for job {job_id}.', stderr=stderr)
+            rc,
+            cmd,
+            f'Failed to get nodes for job {job_id}.',
+            stderr=f'{stdout}\n{stderr}')
         logger.debug(f'Successfully got nodes for job {job_id}: {stdout}')
 
 
@@ -377,7 +452,23 @@
             parts = line.split()
             if len(parts) >= 2:
                 node_name = parts[0]
-                node_ip = parts[1]
+                node_addr = parts[1]
+                # Resolve hostname to IP if node_addr is not already
+                # an IP address.
+                try:
+                    ipaddress.ip_address(node_addr)
+                    # Already an IP address
+                    node_ip = node_addr
+                except ValueError:
+                    # It's a hostname, resolve it to an IP
+                    try:
+                        node_ip = socket.gethostbyname(node_addr)
+                    except socket.gaierror as e:
+                        raise RuntimeError(
+                            f'Failed to resolve hostname {node_addr} to IP '
+                            f'for node {node_name}: '
+                            f'{common_utils.format_exception(e)}') from e
+
                 node_info[node_name] = node_ip
 
         nodes = list(node_info.keys())
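The hunk above accounts for the fact that scontrol's NodeAddr field may be either an IP literal or a hostname: ipaddress.ip_address() raises ValueError for hostnames, which triggers a DNS lookup. The same check in isolation, with a hypothetical hostname in the second call:

    import ipaddress
    import socket

    def resolve_node_addr(node_addr: str) -> str:
        """Return node_addr if it is already an IP; otherwise resolve via DNS."""
        try:
            ipaddress.ip_address(node_addr)
            return node_addr
        except ValueError:
            return socket.gethostbyname(node_addr)

    resolve_node_addr('10.0.0.7')        # -> '10.0.0.7'
    resolve_node_addr('slurm-node-001')  # -> an IPv4 address, if DNS can resolve it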
@@ -408,9 +499,7 @@
             The job ID of the submitted job.
         """
         cmd = f'sbatch --partition={partition} {script_path}'
-        rc, stdout, stderr = self._runner.run(cmd,
-                                              require_outputs=True,
-                                              stream_logs=False)
+        rc, stdout, stderr = self._run_slurm_cmd(cmd)
         subprocess_utils.handle_returncode(rc,
                                            cmd,
                                            'Failed to submit Slurm job.',
@@ -435,13 +524,11 @@
             List of SlurmPartition objects.
         """
         cmd = 'scontrol show partitions -o'
-        rc, stdout, stderr = self._runner.run(cmd,
-                                              require_outputs=True,
-                                              stream_logs=False)
+        rc, stdout, stderr = self._run_slurm_cmd(cmd)
         subprocess_utils.handle_returncode(rc,
                                            cmd,
                                            'Failed to get Slurm partitions.',
-                                           stderr=stderr)
+                                           stderr=f'{stdout}\n{stderr}')
 
         partitions = []
         for line in stdout.strip().splitlines():
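To illustrate the nodelist handling in the new get_all_jobs_gres() method above: squeue's %N field can be a compressed Slurm hostlist, which the python-hostlist dependency (lazily imported by the adaptor) expands before the per-node GRES strings are aggregated. A small standalone sketch; the node names, GRES string, and separator are made up, and SEP stands for whatever separator the adaptor actually uses:

    import hostlist  # python-hostlist

    line = 'node[001-003]|gres/gpu:h100:4'  # assume SEP == '|' for this sketch
    nodelist_str, gres_str = line.split('|')
    nodes_to_gres = {}
    for node in hostlist.expand_hostlist(nodelist_str):
        nodes_to_gres.setdefault(node, []).append(gres_str)
    # nodes_to_gres == {'node001': ['gres/gpu:h100:4'],
    #                   'node002': ['gres/gpu:h100:4'],
    #                   'node003': ['gres/gpu:h100:4']}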
sky/backends/backend_utils.py CHANGED
@@ -69,6 +69,7 @@ from sky.utils import timeline
 from sky.utils import ux_utils
 from sky.utils import volume as volume_utils
 from sky.utils import yaml_utils
+from sky.utils.plugin_extensions import ExternalFailureSource
 from sky.workspaces import core as workspaces_core
 
 if typing.TYPE_CHECKING:
@@ -763,7 +764,20 @@ def write_cluster_config(
                 keys=('allowed_contexts',),
                 default_value=None)
             if allowed_contexts is None:
-                excluded_clouds.add(cloud)
+                # Exclude both Kubernetes and SSH explicitly since:
+                # 1. isinstance(cloud, clouds.Kubernetes) matches both (SSH
+                #    inherits from Kubernetes)
+                # 2. Both share the same get_credential_file_mounts() which
+                #    returns the kubeconfig. So if we don't exclude both, the
+                #    unexcluded one will upload the kubeconfig.
+                # TODO(romilb): This is a workaround. The right long-term fix
+                # is to have SSH Node Pools use its own kubeconfig instead of
+                # sharing the global kubeconfig at ~/.kube/config. In the
+                # interim, SSH Node Pools' get_credential_file_mounts can filter
+                # contexts starting with ssh- and create a temp kubeconfig
+                # to upload.
+                excluded_clouds.add(clouds.Kubernetes())
+                excluded_clouds.add(clouds.SSH())
             else:
                 excluded_clouds.add(cloud)
 
@@ -2262,6 +2276,12 @@ def _update_cluster_status(
                         for status in node_statuses) and
                     len(node_statuses) == handle.launched_nodes)
 
+    external_cluster_failures = ExternalFailureSource.get(
+        cluster_hash=record['cluster_hash'])
+    logger.debug(f'Cluster {cluster_name} with cluster_hash '
+                 f'{record["cluster_hash"]} has external cluster failures: '
+                 f'{external_cluster_failures}')
+
     def get_node_counts_from_ray_status(
             runner: command_runner.CommandRunner) -> Tuple[int, int, str, str]:
         rc, output, stderr = runner.run(
@@ -2401,8 +2421,9 @@
 
     # For Slurm, skip Ray health check since it doesn't use Ray.
     should_check_ray = cloud is not None and cloud.uses_ray()
-    if all_nodes_up and (not should_check_ray or
-                         run_ray_status_to_check_ray_cluster_healthy()):
+    if (all_nodes_up and (not should_check_ray or
+                          run_ray_status_to_check_ray_cluster_healthy()) and
+            not external_cluster_failures):
         # NOTE: all_nodes_up calculation is fast due to calling cloud CLI;
         # run_ray_status_to_check_all_nodes_up() is slow due to calling `ray get
         # head-ip/worker-ips`.
@@ -2505,15 +2526,15 @@
     # (2) Otherwise, we will reset the autostop setting, unless the cluster is
     # autostopping/autodowning.
     some_nodes_terminated = 0 < len(node_statuses) < handle.launched_nodes
-    # If all nodes are up and ray cluster is health, we would have returned
-    # earlier. So if all_nodes_up is True and we are here, it means the ray
-    # cluster must have been unhealthy.
-    ray_cluster_unhealthy = all_nodes_up
     some_nodes_not_stopped = any(status[0] != status_lib.ClusterStatus.STOPPED
                                  for status in node_statuses)
     is_abnormal = (some_nodes_terminated or some_nodes_not_stopped)
 
-    if is_abnormal:
+    if is_abnormal and not external_cluster_failures:
+        # If all nodes are up and ray cluster is healthy, we would have returned
+        # earlier. So if all_nodes_up is True and we are here, it means the ray
+        # cluster must have been unhealthy.
+        ray_cluster_unhealthy = all_nodes_up
         status_reason = ', '.join(
             [status[1] for status in node_statuses if status[1] is not None])
@@ -2641,8 +2662,25 @@
         cluster_name,
         include_user_info=include_user_info,
         summary_response=summary_response)
-    # Now is_abnormal is False: either node_statuses is empty or all nodes are
-    # STOPPED.
+    # Now either:
+    # (1) is_abnormal is False: either node_statuses is empty or all nodes are
+    #     STOPPED
+    # or
+    # (2) there are external cluster failures reported by a plugin.
+
+    # If there are external cluster failures and the cluster has not been
+    # terminated on cloud (to_terminate), we can return the cluster record as is.
+    # This is because when an external failure is detected, the cluster will be
+    # marked as INIT with a reason indicating the details of the failure. So, we
+    # do not want to modify the cluster status in this function except for in the
+    # case where the cluster has been terminated on cloud, in which case we should
+    # clean up the cluster from SkyPilot's global state.
+    if external_cluster_failures and not to_terminate:
+        return global_user_state.get_cluster_from_name(
+            cluster_name,
+            include_user_info=include_user_info,
+            summary_response=summary_response)
+
     verb = 'terminated' if to_terminate else 'stopped'
     backend = backends.CloudVmRayBackend()
     global_user_state.add_cluster_event(
@@ -3368,6 +3406,8 @@ def get_clusters(
         handle = record['handle']
         record['nodes'] = handle.launched_nodes
         if handle.launched_resources is None:
+            # Set default values when launched_resources is None
+            record['labels'] = {}
             continue
         record['cloud'] = (f'{handle.launched_resources.cloud}'
                            if handle.launched_resources.cloud else None)
@@ -3380,6 +3420,8 @@
         record['accelerators'] = (
             f'{handle.launched_resources.accelerators}'
             if handle.launched_resources.accelerators else None)
+        record['labels'] = (handle.launched_resources.labels
+                            if handle.launched_resources.labels else {})
         if not include_handle:
             record.pop('handle', None)