skypilot-nightly 1.0.0.dev20251210__py3-none-any.whl → 1.0.0.dev20260112__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (207)
  1. sky/__init__.py +4 -2
  2. sky/adaptors/slurm.py +159 -72
  3. sky/backends/backend_utils.py +52 -10
  4. sky/backends/cloud_vm_ray_backend.py +192 -32
  5. sky/backends/task_codegen.py +40 -2
  6. sky/catalog/data_fetchers/fetch_gcp.py +9 -1
  7. sky/catalog/data_fetchers/fetch_nebius.py +1 -1
  8. sky/catalog/data_fetchers/fetch_vast.py +4 -2
  9. sky/catalog/seeweb_catalog.py +30 -15
  10. sky/catalog/shadeform_catalog.py +5 -2
  11. sky/catalog/slurm_catalog.py +0 -7
  12. sky/catalog/vast_catalog.py +30 -6
  13. sky/check.py +11 -8
  14. sky/client/cli/command.py +106 -54
  15. sky/client/interactive_utils.py +190 -0
  16. sky/client/sdk.py +8 -0
  17. sky/client/sdk_async.py +9 -0
  18. sky/clouds/aws.py +60 -2
  19. sky/clouds/azure.py +2 -0
  20. sky/clouds/kubernetes.py +2 -0
  21. sky/clouds/runpod.py +38 -7
  22. sky/clouds/slurm.py +44 -12
  23. sky/clouds/ssh.py +1 -1
  24. sky/clouds/vast.py +30 -17
  25. sky/core.py +69 -1
  26. sky/dashboard/out/404.html +1 -1
  27. sky/dashboard/out/_next/static/3nu-b8raeKRNABZ2d4GAG/_buildManifest.js +1 -0
  28. sky/dashboard/out/_next/static/chunks/1871-0565f8975a7dcd10.js +6 -0
  29. sky/dashboard/out/_next/static/chunks/2109-55a1546d793574a7.js +11 -0
  30. sky/dashboard/out/_next/static/chunks/2521-099b07cd9e4745bf.js +26 -0
  31. sky/dashboard/out/_next/static/chunks/2755.a636e04a928a700e.js +31 -0
  32. sky/dashboard/out/_next/static/chunks/3495.05eab4862217c1a5.js +6 -0
  33. sky/dashboard/out/_next/static/chunks/3785.cfc5dcc9434fd98c.js +1 -0
  34. sky/dashboard/out/_next/static/chunks/3981.645d01bf9c8cad0c.js +21 -0
  35. sky/dashboard/out/_next/static/chunks/4083-0115d67c1fb57d6c.js +21 -0
  36. sky/dashboard/out/_next/static/chunks/{8640.5b9475a2d18c5416.js → 429.a58e9ba9742309ed.js} +2 -2
  37. sky/dashboard/out/_next/static/chunks/4555.8e221537181b5dc1.js +6 -0
  38. sky/dashboard/out/_next/static/chunks/4725.937865b81fdaaebb.js +6 -0
  39. sky/dashboard/out/_next/static/chunks/6082-edabd8f6092300ce.js +25 -0
  40. sky/dashboard/out/_next/static/chunks/6989-49cb7dca83a7a62d.js +1 -0
  41. sky/dashboard/out/_next/static/chunks/6990-630bd2a2257275f8.js +1 -0
  42. sky/dashboard/out/_next/static/chunks/7248-a99800d4db8edabd.js +1 -0
  43. sky/dashboard/out/_next/static/chunks/754-cfc5d4ad1b843d29.js +18 -0
  44. sky/dashboard/out/_next/static/chunks/8050-dd8aa107b17dce00.js +16 -0
  45. sky/dashboard/out/_next/static/chunks/8056-d4ae1e0cb81e7368.js +1 -0
  46. sky/dashboard/out/_next/static/chunks/8555.011023e296c127b3.js +6 -0
  47. sky/dashboard/out/_next/static/chunks/8821-93c25df904a8362b.js +1 -0
  48. sky/dashboard/out/_next/static/chunks/8969-0662594b69432ade.js +1 -0
  49. sky/dashboard/out/_next/static/chunks/9025.f15c91c97d124a5f.js +6 -0
  50. sky/dashboard/out/_next/static/chunks/{9353-8369df1cf105221c.js → 9353-7ad6bd01858556f1.js} +1 -1
  51. sky/dashboard/out/_next/static/chunks/pages/_app-5a86569acad99764.js +34 -0
  52. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-8297476714acb4ac.js +6 -0
  53. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-337c3ba1085f1210.js +1 -0
  54. sky/dashboard/out/_next/static/chunks/pages/{clusters-9e5d47818b9bdadd.js → clusters-57632ff3684a8b5c.js} +1 -1
  55. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-5fd3a453c079c2ea.js +1 -0
  56. sky/dashboard/out/_next/static/chunks/pages/infra-9f85c02c9c6cae9e.js +1 -0
  57. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-90f16972cbecf354.js +1 -0
  58. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-2dd42fc37aad427a.js +16 -0
  59. sky/dashboard/out/_next/static/chunks/pages/jobs-ed806aeace26b972.js +1 -0
  60. sky/dashboard/out/_next/static/chunks/pages/users-bec34706b36f3524.js +1 -0
  61. sky/dashboard/out/_next/static/chunks/pages/{volumes-ef19d49c6d0e8500.js → volumes-a83ba9b38dff7ea9.js} +1 -1
  62. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-96e0f298308da7e2.js → [name]-c781e9c3e52ef9fc.js} +1 -1
  63. sky/dashboard/out/_next/static/chunks/pages/workspaces-91e0942f47310aae.js +1 -0
  64. sky/dashboard/out/_next/static/chunks/webpack-cfe59cf684ee13b9.js +1 -0
  65. sky/dashboard/out/_next/static/css/b0dbca28f027cc19.css +3 -0
  66. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  67. sky/dashboard/out/clusters/[cluster].html +1 -1
  68. sky/dashboard/out/clusters.html +1 -1
  69. sky/dashboard/out/config.html +1 -1
  70. sky/dashboard/out/index.html +1 -1
  71. sky/dashboard/out/infra/[context].html +1 -1
  72. sky/dashboard/out/infra.html +1 -1
  73. sky/dashboard/out/jobs/[job].html +1 -1
  74. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  75. sky/dashboard/out/jobs.html +1 -1
  76. sky/dashboard/out/plugins/[...slug].html +1 -1
  77. sky/dashboard/out/users.html +1 -1
  78. sky/dashboard/out/volumes.html +1 -1
  79. sky/dashboard/out/workspace/new.html +1 -1
  80. sky/dashboard/out/workspaces/[name].html +1 -1
  81. sky/dashboard/out/workspaces.html +1 -1
  82. sky/data/data_utils.py +26 -12
  83. sky/data/mounting_utils.py +29 -4
  84. sky/global_user_state.py +108 -16
  85. sky/jobs/client/sdk.py +8 -3
  86. sky/jobs/controller.py +191 -31
  87. sky/jobs/recovery_strategy.py +109 -11
  88. sky/jobs/server/core.py +81 -4
  89. sky/jobs/server/server.py +14 -0
  90. sky/jobs/state.py +417 -19
  91. sky/jobs/utils.py +73 -80
  92. sky/models.py +9 -0
  93. sky/optimizer.py +2 -1
  94. sky/provision/__init__.py +11 -9
  95. sky/provision/kubernetes/utils.py +122 -15
  96. sky/provision/kubernetes/volume.py +52 -17
  97. sky/provision/provisioner.py +2 -1
  98. sky/provision/runpod/instance.py +3 -1
  99. sky/provision/runpod/utils.py +13 -1
  100. sky/provision/runpod/volume.py +25 -9
  101. sky/provision/slurm/instance.py +75 -29
  102. sky/provision/slurm/utils.py +213 -107
  103. sky/provision/vast/utils.py +1 -0
  104. sky/resources.py +135 -13
  105. sky/schemas/api/responses.py +4 -0
  106. sky/schemas/db/global_user_state/010_save_ssh_key.py +1 -1
  107. sky/schemas/db/spot_jobs/008_add_full_resources.py +34 -0
  108. sky/schemas/db/spot_jobs/009_job_events.py +32 -0
  109. sky/schemas/db/spot_jobs/010_job_events_timestamp_with_timezone.py +43 -0
  110. sky/schemas/db/spot_jobs/011_add_links.py +34 -0
  111. sky/schemas/generated/jobsv1_pb2.py +9 -5
  112. sky/schemas/generated/jobsv1_pb2.pyi +12 -0
  113. sky/schemas/generated/jobsv1_pb2_grpc.py +44 -0
  114. sky/schemas/generated/managed_jobsv1_pb2.py +32 -28
  115. sky/schemas/generated/managed_jobsv1_pb2.pyi +11 -2
  116. sky/serve/serve_utils.py +232 -40
  117. sky/server/common.py +17 -0
  118. sky/server/constants.py +1 -1
  119. sky/server/metrics.py +6 -3
  120. sky/server/plugins.py +16 -0
  121. sky/server/requests/payloads.py +18 -0
  122. sky/server/requests/request_names.py +2 -0
  123. sky/server/requests/requests.py +28 -10
  124. sky/server/requests/serializers/encoders.py +5 -0
  125. sky/server/requests/serializers/return_value_serializers.py +14 -4
  126. sky/server/server.py +434 -107
  127. sky/server/uvicorn.py +5 -0
  128. sky/setup_files/MANIFEST.in +1 -0
  129. sky/setup_files/dependencies.py +21 -10
  130. sky/sky_logging.py +2 -1
  131. sky/skylet/constants.py +22 -5
  132. sky/skylet/executor/slurm.py +4 -6
  133. sky/skylet/job_lib.py +89 -4
  134. sky/skylet/services.py +18 -3
  135. sky/ssh_node_pools/deploy/tunnel/cleanup-tunnel.sh +62 -0
  136. sky/ssh_node_pools/deploy/tunnel/ssh-tunnel.sh +379 -0
  137. sky/templates/kubernetes-ray.yml.j2 +4 -6
  138. sky/templates/slurm-ray.yml.j2 +32 -2
  139. sky/templates/websocket_proxy.py +18 -41
  140. sky/users/permission.py +61 -51
  141. sky/utils/auth_utils.py +42 -0
  142. sky/utils/cli_utils/status_utils.py +19 -5
  143. sky/utils/cluster_utils.py +10 -3
  144. sky/utils/command_runner.py +256 -94
  145. sky/utils/command_runner.pyi +16 -0
  146. sky/utils/common_utils.py +30 -29
  147. sky/utils/context.py +32 -0
  148. sky/utils/db/db_utils.py +36 -6
  149. sky/utils/db/migration_utils.py +41 -21
  150. sky/utils/infra_utils.py +5 -1
  151. sky/utils/instance_links.py +139 -0
  152. sky/utils/interactive_utils.py +49 -0
  153. sky/utils/kubernetes/generate_kubeconfig.sh +42 -33
  154. sky/utils/kubernetes/rsync_helper.sh +5 -1
  155. sky/utils/plugin_extensions/__init__.py +14 -0
  156. sky/utils/plugin_extensions/external_failure_source.py +176 -0
  157. sky/utils/resources_utils.py +10 -8
  158. sky/utils/rich_utils.py +9 -11
  159. sky/utils/schemas.py +63 -20
  160. sky/utils/status_lib.py +7 -0
  161. sky/utils/subprocess_utils.py +17 -0
  162. sky/volumes/client/sdk.py +6 -3
  163. sky/volumes/server/core.py +65 -27
  164. sky_templates/ray/start_cluster +8 -4
  165. {skypilot_nightly-1.0.0.dev20251210.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/METADATA +53 -57
  166. {skypilot_nightly-1.0.0.dev20251210.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/RECORD +172 -162
  167. sky/dashboard/out/_next/static/KYAhEFa3FTfq4JyKVgo-s/_buildManifest.js +0 -1
  168. sky/dashboard/out/_next/static/chunks/1141-9c810f01ff4f398a.js +0 -11
  169. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +0 -6
  170. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +0 -1
  171. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +0 -1
  172. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +0 -15
  173. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +0 -26
  174. sky/dashboard/out/_next/static/chunks/3294.ddda8c6c6f9f24dc.js +0 -1
  175. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +0 -1
  176. sky/dashboard/out/_next/static/chunks/3800-b589397dc09c5b4e.js +0 -1
  177. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +0 -1
  178. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +0 -15
  179. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +0 -13
  180. sky/dashboard/out/_next/static/chunks/6856-da20c5fd999f319c.js +0 -1
  181. sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +0 -1
  182. sky/dashboard/out/_next/static/chunks/6990-09cbf02d3cd518c3.js +0 -1
  183. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +0 -30
  184. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +0 -41
  185. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +0 -1
  186. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +0 -1
  187. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +0 -6
  188. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +0 -31
  189. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +0 -30
  190. sky/dashboard/out/_next/static/chunks/pages/_app-68b647e26f9d2793.js +0 -34
  191. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-33f525539665fdfd.js +0 -16
  192. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a7565f586ef86467.js +0 -1
  193. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-12c559ec4d81fdbd.js +0 -1
  194. sky/dashboard/out/_next/static/chunks/pages/infra-d187cd0413d72475.js +0 -1
  195. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-895847b6cf200b04.js +0 -16
  196. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-8d0f4655400b4eb9.js +0 -21
  197. sky/dashboard/out/_next/static/chunks/pages/jobs-e5a98f17f8513a96.js +0 -1
  198. sky/dashboard/out/_next/static/chunks/pages/users-2f7646eb77785a2c.js +0 -1
  199. sky/dashboard/out/_next/static/chunks/pages/workspaces-cb4da3abe08ebf19.js +0 -1
  200. sky/dashboard/out/_next/static/chunks/webpack-fba3de387ff6bb08.js +0 -1
  201. sky/dashboard/out/_next/static/css/c5a4cfd2600fc715.css +0 -3
  202. /sky/dashboard/out/_next/static/{KYAhEFa3FTfq4JyKVgo-s → 3nu-b8raeKRNABZ2d4GAG}/_ssgManifest.js +0 -0
  203. /sky/dashboard/out/_next/static/chunks/pages/plugins/{[...slug]-4f46050ca065d8f8.js → [...slug]-449a9f5a3bb20fb3.js} +0 -0
  204. {skypilot_nightly-1.0.0.dev20251210.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/WHEEL +0 -0
  205. {skypilot_nightly-1.0.0.dev20251210.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/entry_points.txt +0 -0
  206. {skypilot_nightly-1.0.0.dev20251210.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/licenses/LICENSE +0 -0
  207. {skypilot_nightly-1.0.0.dev20251210.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/top_level.txt +0 -0
--- a/sky/provision/runpod/instance.py
+++ b/sky/provision/runpod/instance.py
@@ -1,5 +1,6 @@
 """RunPod instance provisioning."""
 import time
+import traceback
 from typing import Any, Dict, List, Optional, Tuple

 from sky import sky_logging
@@ -116,7 +117,8 @@ def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
                 volume_mount_path=volume_mount_path,
             )
         except Exception as e:  # pylint: disable=broad-except
-            logger.warning(f'run_instances error: {e}')
+            logger.warning(f'run_instances error: {e}\n'
+                           f'Full traceback:\n{traceback.format_exc()}')
             raise
         logger.info(f'Launched instance {instance_id}.')
         created_instance_ids.append(instance_id)
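
The run_instances change above replaces a bare exception message with a full traceback in the warning log before re-raising. A minimal, self-contained sketch of that logging pattern (generic names; not SkyPilot's actual provisioning code):

import logging
import traceback

logger = logging.getLogger(__name__)


def launch_with_logging(launch_fn, *args, **kwargs):
    """Call a launch function, logging the full traceback before re-raising."""
    try:
        return launch_fn(*args, **kwargs)
    except Exception as e:  # pylint: disable=broad-except
        logger.warning('launch error: %s\nFull traceback:\n%s',
                       e, traceback.format_exc())
        raise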
--- a/sky/provision/runpod/utils.py
+++ b/sky/provision/runpod/utils.py
@@ -80,7 +80,11 @@ def _construct_docker_login_template_name(cluster_name: str) -> str:


 def retry(func):
-    """Decorator to retry a function."""
+    """Decorator to retry a function.
+
+    Only retries on transient errors. Does not retry on authorization errors
+    (Unauthorized, Forbidden) as these are not recoverable.
+    """

     def wrapper(*args, **kwargs):
         """Wrapper for retrying a function."""
@@ -89,6 +93,14 @@ def retry(func):
            try:
                return func(*args, **kwargs)
            except runpod.runpod.error.QueryError as e:
+               error_msg = str(e).lower()
+               # Don't retry on authorization errors - these won't recover
+               auth_keywords = ['unauthorized', 'forbidden', '401', '403']
+               if any(keyword in error_msg for keyword in auth_keywords):
+                   logger.error(f'RunPod authorization error (not retrying): '
+                                f'{common_utils.format_exception(e)}')
+                   raise
+               cnt += 1
                if cnt >= 3:
                    raise
                logger.warning('Retrying for exception: '
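
The retry hunks above implement a common pattern: retry transient API errors a bounded number of times, but fail fast on authorization errors that will never succeed on retry. A standalone sketch of that pattern, using a generic exception type rather than RunPod's QueryError (names here are illustrative):

import functools
import logging
import time

logger = logging.getLogger(__name__)

_AUTH_KEYWORDS = ('unauthorized', 'forbidden', '401', '403')


def retry_transient(max_attempts: int = 3, delay_seconds: float = 1.0):
    """Retry on transient errors; raise immediately on auth-like errors."""

    def decorator(func):

        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            attempts = 0
            while True:
                try:
                    return func(*args, **kwargs)
                except Exception as e:  # Replace with the API's error type.
                    msg = str(e).lower()
                    if any(k in msg for k in _AUTH_KEYWORDS):
                        # Not recoverable: surface it right away.
                        logger.error('Authorization error (not retrying): %s', e)
                        raise
                    attempts += 1
                    if attempts >= max_attempts:
                        raise
                    logger.warning('Retrying after error: %s', e)
                    time.sleep(delay_seconds)

        return wrapper

    return decorator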
--- a/sky/provision/runpod/volume.py
+++ b/sky/provision/runpod/volume.py
@@ -1,5 +1,5 @@
 """RunPod network volume provisioning."""
-from typing import Any, Dict, List, Optional, Tuple
+from typing import Any, Dict, List, Optional, Set, Tuple

 from sky import global_user_state
 from sky import models
@@ -194,15 +194,31 @@ def get_volume_usedby(

 def get_all_volumes_usedby(
     configs: List[models.VolumeConfig],
-) -> Tuple[Dict[str, Any], Dict[str, Any]]:
-    """Gets the usedby resources of all volumes."""
-    used_by_results = [get_volume_usedby(config) for config in configs]
+) -> Tuple[Dict[str, Any], Dict[str, Any], Set[str]]:
+    """Gets the usedby resources of all volumes.
+
+    Args:
+        configs: List of VolumeConfig objects.
+
+    Returns:
+        usedby_pods: Dictionary of volume name to pods using the volume.
+        usedby_clusters: Dictionary of volume name to clusters using the volume.
+        failed_volume_names: Set of volume names whose usedby info failed to
+            fetch.
+    """
     used_by_pods, used_by_clusters = {}, {}
-    for i in range(len(configs)):
-        config = configs[i]
-        used_by_pods[config.name_on_cloud] = used_by_results[i][0]
-        used_by_clusters[config.name_on_cloud] = used_by_results[i][1]
-    return used_by_pods, used_by_clusters
+    failed_volume_names = set()
+    for config in configs:
+        try:
+            usedby_pods, usedby_clusters = get_volume_usedby(config)
+            used_by_pods[config.name_on_cloud] = usedby_pods
+            used_by_clusters[config.name_on_cloud] = usedby_clusters
+        except Exception as e:  # pylint: disable=broad-except
+            logger.debug(f'Failed to get usedby info for RunPod volume '
+                         f'{config.name}: {e}')
+            failed_volume_names.add(config.name)
+            continue
+    return used_by_pods, used_by_clusters, failed_volume_names


 def map_all_volumes_usedby(
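
The get_all_volumes_usedby change above moves from an all-or-nothing list comprehension to per-volume error handling, and reports which volumes could not be queried. A hedged sketch of how a caller might consume the new three-part return value (the helper name and output format here are illustrative, not taken from the diff):

from typing import Any, Dict, List, Set


def summarize_volume_usage(
    usedby_pods: Dict[str, Any],
    usedby_clusters: Dict[str, Any],
    failed_volume_names: Set[str],
) -> List[str]:
    """Render one status line per volume, marking volumes whose lookup failed."""
    lines = []
    for name, pods in usedby_pods.items():
        clusters = usedby_clusters.get(name, [])
        lines.append(f'{name}: pods={pods} clusters={clusters}')
    for name in sorted(failed_volume_names):
        # Usedby info could not be fetched; surface that instead of guessing.
        lines.append(f'{name}: usedby unavailable (lookup failed)')
    return lines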
--- a/sky/provision/slurm/instance.py
+++ b/sky/provision/slurm/instance.py
@@ -72,6 +72,7 @@ def _create_virtual_instance(
     ssh_user = ssh_config_dict['user']
     ssh_key = ssh_config_dict['private_key']
     ssh_proxy_command = ssh_config_dict.get('proxycommand', None)
+    ssh_proxy_jump = ssh_config_dict.get('proxyjump', None)
     partition = slurm_utils.get_partition_from_config(provider_config)

     client = slurm.SlurmClient(
@@ -80,6 +81,7 @@ def _create_virtual_instance(
         ssh_user,
         ssh_key,
         ssh_proxy_command=ssh_proxy_command,
+        ssh_proxy_jump=ssh_proxy_jump,
     )

     # COMPLETING state occurs when a job is being terminated - during this
@@ -168,12 +170,13 @@ def _create_virtual_instance(
     skypilot_runtime_dir = _skypilot_runtime_dir(cluster_name_on_cloud)
     sky_home_dir = _sky_cluster_home_dir(cluster_name_on_cloud)
     ready_signal = f'{sky_home_dir}/.sky_sbatch_ready'
+    slurm_marker_file = f'{sky_home_dir}/{slurm_utils.SLURM_MARKER_FILE}'

     # Build the sbatch script
     gpu_directive = ''
     if (accelerator_type is not None and accelerator_type.upper() != 'NONE' and
             accelerator_count > 0):
-        gpu_directive = (f'#SBATCH --gres=gpu:{accelerator_type.lower()}:'
+        gpu_directive = (f'#SBATCH --gres=gpu:{accelerator_type}:'
                          f'{accelerator_count}')

     # By default stdout and stderr will be written to $HOME/slurm-%j.out
@@ -215,6 +218,8 @@ def _create_virtual_instance(
 mkdir -p {sky_home_dir}
 # Create sky runtime directory on each node.
 srun --nodes={num_nodes} mkdir -p {skypilot_runtime_dir}
+# Marker file to indicate we're in a Slurm cluster.
+touch {slurm_marker_file}
 # Suppress login messages.
 touch {sky_home_dir}/.hushlogin
 # Signal that the sbatch script has completed setup.
229
234
  ssh_user,
230
235
  ssh_key,
231
236
  ssh_proxy_command=ssh_proxy_command,
237
+ ssh_proxy_jump=ssh_proxy_jump,
232
238
  )
233
239
 
234
240
  cmd = f'mkdir -p {PROVISION_SCRIPTS_DIRECTORY}'
@@ -305,6 +311,7 @@ def query_instances(
305
311
  ssh_user = ssh_config_dict['user']
306
312
  ssh_key = ssh_config_dict['private_key']
307
313
  ssh_proxy_command = ssh_config_dict.get('proxycommand', None)
314
+ ssh_proxy_jump = ssh_config_dict.get('proxyjump', None)
308
315
 
309
316
  client = slurm.SlurmClient(
310
317
  ssh_host,
@@ -312,6 +319,7 @@ def query_instances(
312
319
  ssh_user,
313
320
  ssh_key,
314
321
  ssh_proxy_command=ssh_proxy_command,
322
+ ssh_proxy_jump=ssh_proxy_jump,
315
323
  )
316
324
 
317
325
  # Map Slurm job states to SkyPilot ClusterStatus
@@ -401,6 +409,7 @@ def get_cluster_info(
401
409
  ssh_user = ssh_config_dict['user']
402
410
  ssh_key = ssh_config_dict['private_key']
403
411
  ssh_proxy_command = ssh_config_dict.get('proxycommand', None)
412
+ ssh_proxy_jump = ssh_config_dict.get('proxyjump', None)
404
413
 
405
414
  client = slurm.SlurmClient(
406
415
  ssh_host,
@@ -408,6 +417,7 @@ def get_cluster_info(
408
417
  ssh_user,
409
418
  ssh_key,
410
419
  ssh_proxy_command=ssh_proxy_command,
420
+ ssh_proxy_jump=ssh_proxy_jump,
411
421
  )
412
422
 
413
423
  # Find running job for this cluster
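
Each SlurmClient construction above now also forwards an optional ssh_proxy_jump value read from the provider's ssh config. How the client consumes it is not part of this excerpt; in plain OpenSSH terms, a ProxyJump host is passed with -J or -o ProxyJump=..., alongside any ProxyCommand. A hedged sketch of assembling such an ssh argument list (generic helper, not SkyPilot's SlurmClient):

from typing import List, Optional


def build_ssh_args(host: str,
                   port: int,
                   user: str,
                   identity_file: Optional[str] = None,
                   proxy_command: Optional[str] = None,
                   proxy_jump: Optional[str] = None) -> List[str]:
    """Build an ssh argv with optional ProxyCommand / ProxyJump options."""
    args = ['ssh', '-p', str(port)]
    if identity_file:
        args += ['-i', identity_file]
    if proxy_command:
        args += ['-o', f'ProxyCommand={proxy_command}']
    if proxy_jump:
        # Equivalent to `ssh -J <jump-host> ...`.
        args += ['-o', f'ProxyJump={proxy_jump}']
    args.append(f'{user}@{host}')
    return args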
@@ -480,36 +490,66 @@ def terminate_instances(
            'worker_only=True is not supported for Slurm, this is a no-op.')
        return

-    ssh_config_dict = provider_config['ssh']
-    ssh_host = ssh_config_dict['hostname']
-    ssh_port = int(ssh_config_dict['port'])
-    ssh_user = ssh_config_dict['user']
-    ssh_private_key = ssh_config_dict['private_key']
-    # Check if we are running inside a Slurm job (Only happens with autodown,
-    # where the Skylet will invoke terminate_instances on the remote cluster),
-    # where we assume SSH between nodes have been set up on each node's
-    # ssh config.
-    # TODO(kevin): Validate this assumption. Another way would be to
-    # mount the private key to the remote cluster, like we do with
-    # other clouds' API keys.
-    if slurm_utils.is_inside_slurm_job():
-        logger.debug('Running inside a Slurm job, using machine\'s ssh config')
-        ssh_private_key = None
-    ssh_proxy_command = ssh_config_dict.get('proxycommand', None)
-
-    client = slurm.SlurmClient(
-        ssh_host,
-        ssh_port,
-        ssh_user,
-        ssh_private_key,
-        ssh_proxy_command=ssh_proxy_command,
-    )
-    client.cancel_jobs_by_name(
-        cluster_name_on_cloud,
-        signal='TERM',
-        full=True,
+    # Check if we are running inside a Slurm cluster (only happens with
+    # autodown, where the Skylet invokes terminate_instances on the remote
+    # cluster). In this case, use local execution instead of SSH.
+    # This assumes that the compute node is able to run scancel.
+    # TODO(kevin): Validate this assumption.
+    if slurm_utils.is_inside_slurm_cluster():
+        logger.debug('Running inside a Slurm cluster, using local execution')
+        client = slurm.SlurmClient(is_inside_slurm_cluster=True)
+    else:
+        ssh_config_dict = provider_config['ssh']
+        ssh_host = ssh_config_dict['hostname']
+        ssh_port = int(ssh_config_dict['port'])
+        ssh_user = ssh_config_dict['user']
+        ssh_private_key = ssh_config_dict['private_key']
+        ssh_proxy_command = ssh_config_dict.get('proxycommand', None)
+        ssh_proxy_jump = ssh_config_dict.get('proxyjump', None)
+
+        client = slurm.SlurmClient(
+            ssh_host,
+            ssh_port,
+            ssh_user,
+            ssh_private_key,
+            ssh_proxy_command=ssh_proxy_command,
+            ssh_proxy_jump=ssh_proxy_jump,
+        )
+    jobs_state = client.get_jobs_state_by_name(cluster_name_on_cloud)
+    if not jobs_state:
+        logger.debug(f'Job for cluster {cluster_name_on_cloud} not found, '
+                     'it may have been terminated.')
+        return
+    assert len(jobs_state) == 1, (
+        f'Multiple jobs found for cluster {cluster_name_on_cloud}: {jobs_state}'
    )

+    job_state = jobs_state[0].strip()
+    # Terminal states where scancel is not needed or will fail.
+    terminal_states = {
+        'COMPLETED', 'CANCELLED', 'FAILED', 'TIMEOUT', 'NODE_FAIL', 'PREEMPTED',
+        'SPECIAL_EXIT'
+    }
+    if job_state in terminal_states:
+        logger.debug(
+            f'Job for cluster {cluster_name_on_cloud} is already in a terminal '
+            f'state {job_state}. No action needed.')
+        return
+
+    if job_state in ('PENDING', 'CONFIGURING'):
+        # For pending/configuring jobs, cancel without signal to avoid hangs.
+        client.cancel_jobs_by_name(cluster_name_on_cloud, signal=None)
+    elif job_state == 'COMPLETING':
+        # Job is already being terminated. No action needed.
+        logger.debug(
+            f'Job for cluster {cluster_name_on_cloud} is already completing. '
+            'No action needed.')
+    else:
+        # For other states (e.g., RUNNING, SUSPENDED), send a TERM signal.
+        client.cancel_jobs_by_name(cluster_name_on_cloud,
+                                   signal='TERM',
+                                   full=True)
+

 def open_ports(
     cluster_name_on_cloud: str,
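
The new terminate_instances logic dispatches on the Slurm job state returned by the client: terminal states need no action, PENDING/CONFIGURING jobs are cancelled without a signal to avoid hangs, COMPLETING jobs are left alone, and everything else gets a TERM signal. A pure-function sketch of that decision table (helper name and return shape are illustrative, not part of the diff), which is easy to unit test in isolation:

from typing import Optional, Tuple

_TERMINAL_STATES = frozenset({
    'COMPLETED', 'CANCELLED', 'FAILED', 'TIMEOUT', 'NODE_FAIL', 'PREEMPTED',
    'SPECIAL_EXIT'
})


def plan_cancel(job_state: str) -> Tuple[bool, Optional[str], bool]:
    """Return (should_cancel, signal, full) for a Slurm job state string."""
    state = job_state.strip().upper()
    if state in _TERMINAL_STATES or state == 'COMPLETING':
        # Already finished or already being torn down: nothing to do.
        return (False, None, False)
    if state in ('PENDING', 'CONFIGURING'):
        # Cancel without a signal so the cancel does not hang on unstarted jobs.
        return (True, None, False)
    # RUNNING, SUSPENDED, etc.: signal the whole job with SIGTERM.
    return (True, 'TERM', True)


# Example: plan_cancel('RUNNING') -> (True, 'TERM', True)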
@@ -557,6 +597,10 @@ def get_command_runners(
     # it is the login node's. The internal IP is the private IP of the node.
     ssh_user = cast(str, credentials.pop('ssh_user'))
     ssh_private_key = cast(str, credentials.pop('ssh_private_key'))
+    # ssh_proxy_jump is Slurm-specific, it does not exist in the auth section
+    # of the cluster yaml.
+    ssh_proxy_jump = cluster_info.provider_config.get('ssh', {}).get(
+        'proxyjump', None)
     runners = [
         command_runner.SlurmCommandRunner(
             (instance_info.external_ip or '', instance_info.ssh_port),
@@ -566,6 +610,8 @@ def get_command_runners(
             skypilot_runtime_dir=_skypilot_runtime_dir(cluster_name_on_cloud),
             job_id=instance_info.tags['job_id'],
             slurm_node=instance_info.tags['node'],
+            ssh_proxy_jump=ssh_proxy_jump,
+            enable_interactive_auth=True,
             **credentials) for instance_info in instances
     ]