skypilot-nightly 1.0.0.dev20251210-py3-none-any.whl → 1.0.0.dev20260112-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +4 -2
- sky/adaptors/slurm.py +159 -72
- sky/backends/backend_utils.py +52 -10
- sky/backends/cloud_vm_ray_backend.py +192 -32
- sky/backends/task_codegen.py +40 -2
- sky/catalog/data_fetchers/fetch_gcp.py +9 -1
- sky/catalog/data_fetchers/fetch_nebius.py +1 -1
- sky/catalog/data_fetchers/fetch_vast.py +4 -2
- sky/catalog/seeweb_catalog.py +30 -15
- sky/catalog/shadeform_catalog.py +5 -2
- sky/catalog/slurm_catalog.py +0 -7
- sky/catalog/vast_catalog.py +30 -6
- sky/check.py +11 -8
- sky/client/cli/command.py +106 -54
- sky/client/interactive_utils.py +190 -0
- sky/client/sdk.py +8 -0
- sky/client/sdk_async.py +9 -0
- sky/clouds/aws.py +60 -2
- sky/clouds/azure.py +2 -0
- sky/clouds/kubernetes.py +2 -0
- sky/clouds/runpod.py +38 -7
- sky/clouds/slurm.py +44 -12
- sky/clouds/ssh.py +1 -1
- sky/clouds/vast.py +30 -17
- sky/core.py +69 -1
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/3nu-b8raeKRNABZ2d4GAG/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1871-0565f8975a7dcd10.js +6 -0
- sky/dashboard/out/_next/static/chunks/2109-55a1546d793574a7.js +11 -0
- sky/dashboard/out/_next/static/chunks/2521-099b07cd9e4745bf.js +26 -0
- sky/dashboard/out/_next/static/chunks/2755.a636e04a928a700e.js +31 -0
- sky/dashboard/out/_next/static/chunks/3495.05eab4862217c1a5.js +6 -0
- sky/dashboard/out/_next/static/chunks/3785.cfc5dcc9434fd98c.js +1 -0
- sky/dashboard/out/_next/static/chunks/3981.645d01bf9c8cad0c.js +21 -0
- sky/dashboard/out/_next/static/chunks/4083-0115d67c1fb57d6c.js +21 -0
- sky/dashboard/out/_next/static/chunks/{8640.5b9475a2d18c5416.js → 429.a58e9ba9742309ed.js} +2 -2
- sky/dashboard/out/_next/static/chunks/4555.8e221537181b5dc1.js +6 -0
- sky/dashboard/out/_next/static/chunks/4725.937865b81fdaaebb.js +6 -0
- sky/dashboard/out/_next/static/chunks/6082-edabd8f6092300ce.js +25 -0
- sky/dashboard/out/_next/static/chunks/6989-49cb7dca83a7a62d.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-630bd2a2257275f8.js +1 -0
- sky/dashboard/out/_next/static/chunks/7248-a99800d4db8edabd.js +1 -0
- sky/dashboard/out/_next/static/chunks/754-cfc5d4ad1b843d29.js +18 -0
- sky/dashboard/out/_next/static/chunks/8050-dd8aa107b17dce00.js +16 -0
- sky/dashboard/out/_next/static/chunks/8056-d4ae1e0cb81e7368.js +1 -0
- sky/dashboard/out/_next/static/chunks/8555.011023e296c127b3.js +6 -0
- sky/dashboard/out/_next/static/chunks/8821-93c25df904a8362b.js +1 -0
- sky/dashboard/out/_next/static/chunks/8969-0662594b69432ade.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.f15c91c97d124a5f.js +6 -0
- sky/dashboard/out/_next/static/chunks/{9353-8369df1cf105221c.js → 9353-7ad6bd01858556f1.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/_app-5a86569acad99764.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-8297476714acb4ac.js +6 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-337c3ba1085f1210.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/{clusters-9e5d47818b9bdadd.js → clusters-57632ff3684a8b5c.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-5fd3a453c079c2ea.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-9f85c02c9c6cae9e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-90f16972cbecf354.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-2dd42fc37aad427a.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-ed806aeace26b972.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-bec34706b36f3524.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/{volumes-ef19d49c6d0e8500.js → volumes-a83ba9b38dff7ea9.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-96e0f298308da7e2.js → [name]-c781e9c3e52ef9fc.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-91e0942f47310aae.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-cfe59cf684ee13b9.js +1 -0
- sky/dashboard/out/_next/static/css/b0dbca28f027cc19.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/plugins/[...slug].html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/data_utils.py +26 -12
- sky/data/mounting_utils.py +29 -4
- sky/global_user_state.py +108 -16
- sky/jobs/client/sdk.py +8 -3
- sky/jobs/controller.py +191 -31
- sky/jobs/recovery_strategy.py +109 -11
- sky/jobs/server/core.py +81 -4
- sky/jobs/server/server.py +14 -0
- sky/jobs/state.py +417 -19
- sky/jobs/utils.py +73 -80
- sky/models.py +9 -0
- sky/optimizer.py +2 -1
- sky/provision/__init__.py +11 -9
- sky/provision/kubernetes/utils.py +122 -15
- sky/provision/kubernetes/volume.py +52 -17
- sky/provision/provisioner.py +2 -1
- sky/provision/runpod/instance.py +3 -1
- sky/provision/runpod/utils.py +13 -1
- sky/provision/runpod/volume.py +25 -9
- sky/provision/slurm/instance.py +75 -29
- sky/provision/slurm/utils.py +213 -107
- sky/provision/vast/utils.py +1 -0
- sky/resources.py +135 -13
- sky/schemas/api/responses.py +4 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +1 -1
- sky/schemas/db/spot_jobs/008_add_full_resources.py +34 -0
- sky/schemas/db/spot_jobs/009_job_events.py +32 -0
- sky/schemas/db/spot_jobs/010_job_events_timestamp_with_timezone.py +43 -0
- sky/schemas/db/spot_jobs/011_add_links.py +34 -0
- sky/schemas/generated/jobsv1_pb2.py +9 -5
- sky/schemas/generated/jobsv1_pb2.pyi +12 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +44 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +32 -28
- sky/schemas/generated/managed_jobsv1_pb2.pyi +11 -2
- sky/serve/serve_utils.py +232 -40
- sky/server/common.py +17 -0
- sky/server/constants.py +1 -1
- sky/server/metrics.py +6 -3
- sky/server/plugins.py +16 -0
- sky/server/requests/payloads.py +18 -0
- sky/server/requests/request_names.py +2 -0
- sky/server/requests/requests.py +28 -10
- sky/server/requests/serializers/encoders.py +5 -0
- sky/server/requests/serializers/return_value_serializers.py +14 -4
- sky/server/server.py +434 -107
- sky/server/uvicorn.py +5 -0
- sky/setup_files/MANIFEST.in +1 -0
- sky/setup_files/dependencies.py +21 -10
- sky/sky_logging.py +2 -1
- sky/skylet/constants.py +22 -5
- sky/skylet/executor/slurm.py +4 -6
- sky/skylet/job_lib.py +89 -4
- sky/skylet/services.py +18 -3
- sky/ssh_node_pools/deploy/tunnel/cleanup-tunnel.sh +62 -0
- sky/ssh_node_pools/deploy/tunnel/ssh-tunnel.sh +379 -0
- sky/templates/kubernetes-ray.yml.j2 +4 -6
- sky/templates/slurm-ray.yml.j2 +32 -2
- sky/templates/websocket_proxy.py +18 -41
- sky/users/permission.py +61 -51
- sky/utils/auth_utils.py +42 -0
- sky/utils/cli_utils/status_utils.py +19 -5
- sky/utils/cluster_utils.py +10 -3
- sky/utils/command_runner.py +256 -94
- sky/utils/command_runner.pyi +16 -0
- sky/utils/common_utils.py +30 -29
- sky/utils/context.py +32 -0
- sky/utils/db/db_utils.py +36 -6
- sky/utils/db/migration_utils.py +41 -21
- sky/utils/infra_utils.py +5 -1
- sky/utils/instance_links.py +139 -0
- sky/utils/interactive_utils.py +49 -0
- sky/utils/kubernetes/generate_kubeconfig.sh +42 -33
- sky/utils/kubernetes/rsync_helper.sh +5 -1
- sky/utils/plugin_extensions/__init__.py +14 -0
- sky/utils/plugin_extensions/external_failure_source.py +176 -0
- sky/utils/resources_utils.py +10 -8
- sky/utils/rich_utils.py +9 -11
- sky/utils/schemas.py +63 -20
- sky/utils/status_lib.py +7 -0
- sky/utils/subprocess_utils.py +17 -0
- sky/volumes/client/sdk.py +6 -3
- sky/volumes/server/core.py +65 -27
- sky_templates/ray/start_cluster +8 -4
- {skypilot_nightly-1.0.0.dev20251210.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/METADATA +53 -57
- {skypilot_nightly-1.0.0.dev20251210.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/RECORD +172 -162
- sky/dashboard/out/_next/static/KYAhEFa3FTfq4JyKVgo-s/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/1141-9c810f01ff4f398a.js +0 -11
- sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +0 -6
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +0 -1
- sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +0 -1
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +0 -15
- sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +0 -26
- sky/dashboard/out/_next/static/chunks/3294.ddda8c6c6f9f24dc.js +0 -1
- sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +0 -1
- sky/dashboard/out/_next/static/chunks/3800-b589397dc09c5b4e.js +0 -1
- sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +0 -1
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +0 -15
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +0 -13
- sky/dashboard/out/_next/static/chunks/6856-da20c5fd999f319c.js +0 -1
- sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +0 -1
- sky/dashboard/out/_next/static/chunks/6990-09cbf02d3cd518c3.js +0 -1
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +0 -30
- sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +0 -41
- sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +0 -1
- sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +0 -1
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +0 -6
- sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +0 -31
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +0 -30
- sky/dashboard/out/_next/static/chunks/pages/_app-68b647e26f9d2793.js +0 -34
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-33f525539665fdfd.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a7565f586ef86467.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-12c559ec4d81fdbd.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/infra-d187cd0413d72475.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-895847b6cf200b04.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-8d0f4655400b4eb9.js +0 -21
- sky/dashboard/out/_next/static/chunks/pages/jobs-e5a98f17f8513a96.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/users-2f7646eb77785a2c.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-cb4da3abe08ebf19.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-fba3de387ff6bb08.js +0 -1
- sky/dashboard/out/_next/static/css/c5a4cfd2600fc715.css +0 -3
- /sky/dashboard/out/_next/static/{KYAhEFa3FTfq4JyKVgo-s → 3nu-b8raeKRNABZ2d4GAG}/_ssgManifest.js +0 -0
- /sky/dashboard/out/_next/static/chunks/pages/plugins/{[...slug]-4f46050ca065d8f8.js → [...slug]-449a9f5a3bb20fb3.js} +0 -0
- {skypilot_nightly-1.0.0.dev20251210.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20251210.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20251210.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20251210.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/top_level.txt +0 -0
sky/provision/runpod/instance.py
CHANGED
@@ -1,5 +1,6 @@
 """RunPod instance provisioning."""
 import time
+import traceback
 from typing import Any, Dict, List, Optional, Tuple

 from sky import sky_logging
@@ -116,7 +117,8 @@ def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
                 volume_mount_path=volume_mount_path,
             )
         except Exception as e:  # pylint: disable=broad-except
-            logger.warning(f'run_instances error: {e}')
+            logger.warning(f'run_instances error: {e}\n'
+                           f'Full traceback:\n{traceback.format_exc()}')
             raise
         logger.info(f'Launched instance {instance_id}.')
         created_instance_ids.append(instance_id)
sky/provision/runpod/utils.py
CHANGED
@@ -80,7 +80,11 @@ def _construct_docker_login_template_name(cluster_name: str) -> str:


 def retry(func):
-    """Decorator to retry a function."""
+    """Decorator to retry a function.
+
+    Only retries on transient errors. Does not retry on authorization errors
+    (Unauthorized, Forbidden) as these are not recoverable.
+    """

     def wrapper(*args, **kwargs):
         """Wrapper for retrying a function."""
@@ -89,6 +93,14 @@ def retry(func):
             try:
                 return func(*args, **kwargs)
             except runpod.runpod.error.QueryError as e:
+                error_msg = str(e).lower()
+                # Don't retry on authorization errors - these won't recover
+                auth_keywords = ['unauthorized', 'forbidden', '401', '403']
+                if any(keyword in error_msg for keyword in auth_keywords):
+                    logger.error(f'RunPod authorization error (not retrying): '
+                                 f'{common_utils.format_exception(e)}')
+                    raise
+                cnt += 1
                 if cnt >= 3:
                     raise
                 logger.warning('Retrying for exception: '
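Two things change in `retry`: authorization failures are classified as non-retryable via a keyword match on the error message, and the newly added `cnt += 1` actually advances the retry counter (it appears only on the `+` side of the hunk, so before this change the `if cnt >= 3` cap could never trigger). A self-contained sketch of the same policy, with a generic `Exception` standing in for `runpod.runpod.error.QueryError`:

```python
import logging
import time

logger = logging.getLogger(__name__)

# Keywords mirroring the diff's non-retryable classification.
AUTH_KEYWORDS = ('unauthorized', 'forbidden', '401', '403')


def retry(func):
    """Retry transient failures up to 3 times; fail fast on auth errors."""

    def wrapper(*args, **kwargs):
        cnt = 0
        while True:
            try:
                return func(*args, **kwargs)
            except Exception as e:  # stand-in for runpod's QueryError
                if any(k in str(e).lower() for k in AUTH_KEYWORDS):
                    # Bad credentials will not recover on retry.
                    raise
                cnt += 1
                if cnt >= 3:
                    raise
                logger.warning('Retrying after transient error: %s', e)
                time.sleep(1)

    return wrapper
```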
sky/provision/runpod/volume.py
CHANGED
@@ -1,5 +1,5 @@
 """RunPod network volume provisioning."""
-from typing import Any, Dict, List, Optional, Tuple
+from typing import Any, Dict, List, Optional, Set, Tuple

 from sky import global_user_state
 from sky import models
@@ -194,15 +194,31 @@ def get_volume_usedby(

 def get_all_volumes_usedby(
     configs: List[models.VolumeConfig],
-) -> Tuple[Dict[str, Any], Dict[str, Any]]:
-    """Gets the usedby resources of all volumes."""
-
+) -> Tuple[Dict[str, Any], Dict[str, Any], Set[str]]:
+    """Gets the usedby resources of all volumes.
+
+    Args:
+        configs: List of VolumeConfig objects.
+
+    Returns:
+        usedby_pods: Dictionary of volume name to pods using the volume.
+        usedby_clusters: Dictionary of volume name to clusters using the volume.
+        failed_volume_names: Set of volume names whose usedby info failed to
+            fetch.
+    """
     used_by_pods, used_by_clusters = {}, {}
-    for config in configs:
-        usedby_pods, usedby_clusters = get_volume_usedby(config)
-        used_by_pods[config.name_on_cloud] = usedby_pods
-        used_by_clusters[config.name_on_cloud] = usedby_clusters
-    return used_by_pods, used_by_clusters
+    failed_volume_names = set()
+    for config in configs:
+        try:
+            usedby_pods, usedby_clusters = get_volume_usedby(config)
+            used_by_pods[config.name_on_cloud] = usedby_pods
+            used_by_clusters[config.name_on_cloud] = usedby_clusters
+        except Exception as e:  # pylint: disable=broad-except
+            logger.debug(f'Failed to get usedby info for RunPod volume '
+                         f'{config.name}: {e}')
+            failed_volume_names.add(config.name)
+            continue
+    return used_by_pods, used_by_clusters, failed_volume_names


 def map_all_volumes_usedby(
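Callers of `get_all_volumes_usedby` must now unpack a third element: a failed lookup for one volume no longer raises out of the scan, it just lands the volume's name in the failure set. A hypothetical caller illustrating the new contract (only `get_all_volumes_usedby` and `models.VolumeConfig` come from the diff; the rest is illustrative):

```python
from typing import List

from sky import models
from sky.provision.runpod import volume


def print_volume_usage(configs: List[models.VolumeConfig]) -> None:
    # New contract: three values instead of two.
    used_by_pods, used_by_clusters, failed = volume.get_all_volumes_usedby(
        configs)
    for name, pods in used_by_pods.items():
        print(f'{name}: pods={pods}, clusters={used_by_clusters.get(name)}')
    if failed:
        # Per-volume lookup failures no longer abort the whole scan; they
        # are surfaced here instead.
        print(f'usedby info unavailable for: {sorted(failed)}')
```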
sky/provision/slurm/instance.py
CHANGED
@@ -72,6 +72,7 @@ def _create_virtual_instance(
     ssh_user = ssh_config_dict['user']
     ssh_key = ssh_config_dict['private_key']
     ssh_proxy_command = ssh_config_dict.get('proxycommand', None)
+    ssh_proxy_jump = ssh_config_dict.get('proxyjump', None)
     partition = slurm_utils.get_partition_from_config(provider_config)

     client = slurm.SlurmClient(
@@ -80,6 +81,7 @@ def _create_virtual_instance(
         ssh_user,
         ssh_key,
         ssh_proxy_command=ssh_proxy_command,
+        ssh_proxy_jump=ssh_proxy_jump,
     )

     # COMPLETING state occurs when a job is being terminated - during this
@@ -168,12 +170,13 @@ def _create_virtual_instance(
     skypilot_runtime_dir = _skypilot_runtime_dir(cluster_name_on_cloud)
     sky_home_dir = _sky_cluster_home_dir(cluster_name_on_cloud)
     ready_signal = f'{sky_home_dir}/.sky_sbatch_ready'
+    slurm_marker_file = f'{sky_home_dir}/{slurm_utils.SLURM_MARKER_FILE}'

     # Build the sbatch script
     gpu_directive = ''
     if (accelerator_type is not None and accelerator_type.upper() != 'NONE' and
             accelerator_count > 0):
-        gpu_directive = (f'#SBATCH --gres=gpu:{accelerator_type}'
+        gpu_directive = (f'#SBATCH --gres=gpu:{accelerator_type}:'
                          f'{accelerator_count}')

     # By default stdout and stderr will be written to $HOME/slurm-%j.out
@@ -215,6 +218,8 @@ def _create_virtual_instance(
 mkdir -p {sky_home_dir}
 # Create sky runtime directory on each node.
 srun --nodes={num_nodes} mkdir -p {skypilot_runtime_dir}
+# Marker file to indicate we're in a Slurm cluster.
+touch {slurm_marker_file}
 # Suppress login messages.
 touch {sky_home_dir}/.hushlogin
 # Signal that the sbatch script has completed setup.
@@ -229,6 +234,7 @@ def _create_virtual_instance(
         ssh_user,
         ssh_key,
         ssh_proxy_command=ssh_proxy_command,
+        ssh_proxy_jump=ssh_proxy_jump,
     )

     cmd = f'mkdir -p {PROVISION_SCRIPTS_DIRECTORY}'
@@ -305,6 +311,7 @@ def query_instances(
     ssh_user = ssh_config_dict['user']
     ssh_key = ssh_config_dict['private_key']
     ssh_proxy_command = ssh_config_dict.get('proxycommand', None)
+    ssh_proxy_jump = ssh_config_dict.get('proxyjump', None)

     client = slurm.SlurmClient(
         ssh_host,
@@ -312,6 +319,7 @@ def query_instances(
         ssh_user,
         ssh_key,
         ssh_proxy_command=ssh_proxy_command,
+        ssh_proxy_jump=ssh_proxy_jump,
     )

     # Map Slurm job states to SkyPilot ClusterStatus
@@ -401,6 +409,7 @@ def get_cluster_info(
     ssh_user = ssh_config_dict['user']
     ssh_key = ssh_config_dict['private_key']
     ssh_proxy_command = ssh_config_dict.get('proxycommand', None)
+    ssh_proxy_jump = ssh_config_dict.get('proxyjump', None)

     client = slurm.SlurmClient(
         ssh_host,
@@ -408,6 +417,7 @@ def get_cluster_info(
         ssh_user,
         ssh_key,
         ssh_proxy_command=ssh_proxy_command,
+        ssh_proxy_jump=ssh_proxy_jump,
     )

     # Find running job for this cluster
@@ -480,36 +490,66 @@ def terminate_instances(
             'worker_only=True is not supported for Slurm, this is a no-op.')
         return

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    # Check if we are running inside a Slurm cluster (only happens with
+    # autodown, where the Skylet invokes terminate_instances on the remote
+    # cluster). In this case, use local execution instead of SSH.
+    # This assumes that the compute node is able to run scancel.
+    # TODO(kevin): Validate this assumption.
+    if slurm_utils.is_inside_slurm_cluster():
+        logger.debug('Running inside a Slurm cluster, using local execution')
+        client = slurm.SlurmClient(is_inside_slurm_cluster=True)
+    else:
+        ssh_config_dict = provider_config['ssh']
+        ssh_host = ssh_config_dict['hostname']
+        ssh_port = int(ssh_config_dict['port'])
+        ssh_user = ssh_config_dict['user']
+        ssh_private_key = ssh_config_dict['private_key']
+        ssh_proxy_command = ssh_config_dict.get('proxycommand', None)
+        ssh_proxy_jump = ssh_config_dict.get('proxyjump', None)
+
+        client = slurm.SlurmClient(
+            ssh_host,
+            ssh_port,
+            ssh_user,
+            ssh_private_key,
+            ssh_proxy_command=ssh_proxy_command,
+            ssh_proxy_jump=ssh_proxy_jump,
+        )
+    jobs_state = client.get_jobs_state_by_name(cluster_name_on_cloud)
+    if not jobs_state:
+        logger.debug(f'Job for cluster {cluster_name_on_cloud} not found, '
+                     'it may have been terminated.')
+        return
+    assert len(jobs_state) == 1, (
+        f'Multiple jobs found for cluster {cluster_name_on_cloud}: {jobs_state}'
     )

+    job_state = jobs_state[0].strip()
+    # Terminal states where scancel is not needed or will fail.
+    terminal_states = {
+        'COMPLETED', 'CANCELLED', 'FAILED', 'TIMEOUT', 'NODE_FAIL', 'PREEMPTED',
+        'SPECIAL_EXIT'
+    }
+    if job_state in terminal_states:
+        logger.debug(
+            f'Job for cluster {cluster_name_on_cloud} is already in a terminal '
+            f'state {job_state}. No action needed.')
+        return
+
+    if job_state in ('PENDING', 'CONFIGURING'):
+        # For pending/configuring jobs, cancel without signal to avoid hangs.
+        client.cancel_jobs_by_name(cluster_name_on_cloud, signal=None)
+    elif job_state == 'COMPLETING':
+        # Job is already being terminated. No action needed.
+        logger.debug(
+            f'Job for cluster {cluster_name_on_cloud} is already completing. '
+            'No action needed.')
+    else:
+        # For other states (e.g., RUNNING, SUSPENDED), send a TERM signal.
+        client.cancel_jobs_by_name(cluster_name_on_cloud,
+                                   signal='TERM',
+                                   full=True)
+

 def open_ports(
     cluster_name_on_cloud: str,
@@ -557,6 +597,10 @@ def get_command_runners(
     # it is the login node's. The internal IP is the private IP of the node.
     ssh_user = cast(str, credentials.pop('ssh_user'))
     ssh_private_key = cast(str, credentials.pop('ssh_private_key'))
+    # ssh_proxy_jump is Slurm-specific, it does not exist in the auth section
+    # of the cluster yaml.
+    ssh_proxy_jump = cluster_info.provider_config.get('ssh', {}).get(
+        'proxyjump', None)
     runners = [
         command_runner.SlurmCommandRunner(
             (instance_info.external_ip or '', instance_info.ssh_port),
@@ -566,6 +610,8 @@ def get_command_runners(
             skypilot_runtime_dir=_skypilot_runtime_dir(cluster_name_on_cloud),
             job_id=instance_info.tags['job_id'],
             slurm_node=instance_info.tags['node'],
+            ssh_proxy_jump=ssh_proxy_jump,
+            enable_interactive_auth=True,
             **credentials) for instance_info in instances
     ]
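Most hunks in this file simply thread a new `ssh_proxy_jump` option (read from the `proxyjump` key of the provider config's `ssh` section) through every `SlurmClient` construction and into `SlurmCommandRunner`. The substantive change is the rewritten `terminate_instances`, which now dispatches on the Slurm job's current state instead of unconditionally invoking scancel. A standalone sketch of that dispatch policy, where `cancel` stands in for `SlurmClient.cancel_jobs_by_name` and the state names are Slurm's squeue states:

```python
from typing import Callable

# Terminal states copied from the diff: scancel is unnecessary (or would
# fail) once a job has reached one of these.
TERMINAL_STATES = frozenset({
    'COMPLETED', 'CANCELLED', 'FAILED', 'TIMEOUT', 'NODE_FAIL', 'PREEMPTED',
    'SPECIAL_EXIT'
})


def terminate(job_state: str, cancel: Callable[..., None]) -> None:
    """Dispatch cancellation based on the job's current Slurm state."""
    if job_state in TERMINAL_STATES or job_state == 'COMPLETING':
        # Finished, or already being torn down: nothing to do.
        return
    if job_state in ('PENDING', 'CONFIGURING'):
        # Signalling a job that never started can hang; cancel without a
        # signal instead.
        cancel(signal=None)
    else:
        # RUNNING, SUSPENDED, etc.: send SIGTERM; full=True is assumed here
        # to correspond to scancel's --full (signal the batch script and all
        # of its steps).
        cancel(signal='TERM', full=True)
```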