skypilot-nightly 1.0.0.dev20251210__py3-none-any.whl → 1.0.0.dev20260112__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +4 -2
- sky/adaptors/slurm.py +159 -72
- sky/backends/backend_utils.py +52 -10
- sky/backends/cloud_vm_ray_backend.py +192 -32
- sky/backends/task_codegen.py +40 -2
- sky/catalog/data_fetchers/fetch_gcp.py +9 -1
- sky/catalog/data_fetchers/fetch_nebius.py +1 -1
- sky/catalog/data_fetchers/fetch_vast.py +4 -2
- sky/catalog/seeweb_catalog.py +30 -15
- sky/catalog/shadeform_catalog.py +5 -2
- sky/catalog/slurm_catalog.py +0 -7
- sky/catalog/vast_catalog.py +30 -6
- sky/check.py +11 -8
- sky/client/cli/command.py +106 -54
- sky/client/interactive_utils.py +190 -0
- sky/client/sdk.py +8 -0
- sky/client/sdk_async.py +9 -0
- sky/clouds/aws.py +60 -2
- sky/clouds/azure.py +2 -0
- sky/clouds/kubernetes.py +2 -0
- sky/clouds/runpod.py +38 -7
- sky/clouds/slurm.py +44 -12
- sky/clouds/ssh.py +1 -1
- sky/clouds/vast.py +30 -17
- sky/core.py +69 -1
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/3nu-b8raeKRNABZ2d4GAG/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1871-0565f8975a7dcd10.js +6 -0
- sky/dashboard/out/_next/static/chunks/2109-55a1546d793574a7.js +11 -0
- sky/dashboard/out/_next/static/chunks/2521-099b07cd9e4745bf.js +26 -0
- sky/dashboard/out/_next/static/chunks/2755.a636e04a928a700e.js +31 -0
- sky/dashboard/out/_next/static/chunks/3495.05eab4862217c1a5.js +6 -0
- sky/dashboard/out/_next/static/chunks/3785.cfc5dcc9434fd98c.js +1 -0
- sky/dashboard/out/_next/static/chunks/3981.645d01bf9c8cad0c.js +21 -0
- sky/dashboard/out/_next/static/chunks/4083-0115d67c1fb57d6c.js +21 -0
- sky/dashboard/out/_next/static/chunks/{8640.5b9475a2d18c5416.js → 429.a58e9ba9742309ed.js} +2 -2
- sky/dashboard/out/_next/static/chunks/4555.8e221537181b5dc1.js +6 -0
- sky/dashboard/out/_next/static/chunks/4725.937865b81fdaaebb.js +6 -0
- sky/dashboard/out/_next/static/chunks/6082-edabd8f6092300ce.js +25 -0
- sky/dashboard/out/_next/static/chunks/6989-49cb7dca83a7a62d.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-630bd2a2257275f8.js +1 -0
- sky/dashboard/out/_next/static/chunks/7248-a99800d4db8edabd.js +1 -0
- sky/dashboard/out/_next/static/chunks/754-cfc5d4ad1b843d29.js +18 -0
- sky/dashboard/out/_next/static/chunks/8050-dd8aa107b17dce00.js +16 -0
- sky/dashboard/out/_next/static/chunks/8056-d4ae1e0cb81e7368.js +1 -0
- sky/dashboard/out/_next/static/chunks/8555.011023e296c127b3.js +6 -0
- sky/dashboard/out/_next/static/chunks/8821-93c25df904a8362b.js +1 -0
- sky/dashboard/out/_next/static/chunks/8969-0662594b69432ade.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.f15c91c97d124a5f.js +6 -0
- sky/dashboard/out/_next/static/chunks/{9353-8369df1cf105221c.js → 9353-7ad6bd01858556f1.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/_app-5a86569acad99764.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-8297476714acb4ac.js +6 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-337c3ba1085f1210.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/{clusters-9e5d47818b9bdadd.js → clusters-57632ff3684a8b5c.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-5fd3a453c079c2ea.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-9f85c02c9c6cae9e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-90f16972cbecf354.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-2dd42fc37aad427a.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-ed806aeace26b972.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-bec34706b36f3524.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/{volumes-ef19d49c6d0e8500.js → volumes-a83ba9b38dff7ea9.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-96e0f298308da7e2.js → [name]-c781e9c3e52ef9fc.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-91e0942f47310aae.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-cfe59cf684ee13b9.js +1 -0
- sky/dashboard/out/_next/static/css/b0dbca28f027cc19.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/plugins/[...slug].html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/data_utils.py +26 -12
- sky/data/mounting_utils.py +29 -4
- sky/global_user_state.py +108 -16
- sky/jobs/client/sdk.py +8 -3
- sky/jobs/controller.py +191 -31
- sky/jobs/recovery_strategy.py +109 -11
- sky/jobs/server/core.py +81 -4
- sky/jobs/server/server.py +14 -0
- sky/jobs/state.py +417 -19
- sky/jobs/utils.py +73 -80
- sky/models.py +9 -0
- sky/optimizer.py +2 -1
- sky/provision/__init__.py +11 -9
- sky/provision/kubernetes/utils.py +122 -15
- sky/provision/kubernetes/volume.py +52 -17
- sky/provision/provisioner.py +2 -1
- sky/provision/runpod/instance.py +3 -1
- sky/provision/runpod/utils.py +13 -1
- sky/provision/runpod/volume.py +25 -9
- sky/provision/slurm/instance.py +75 -29
- sky/provision/slurm/utils.py +213 -107
- sky/provision/vast/utils.py +1 -0
- sky/resources.py +135 -13
- sky/schemas/api/responses.py +4 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +1 -1
- sky/schemas/db/spot_jobs/008_add_full_resources.py +34 -0
- sky/schemas/db/spot_jobs/009_job_events.py +32 -0
- sky/schemas/db/spot_jobs/010_job_events_timestamp_with_timezone.py +43 -0
- sky/schemas/db/spot_jobs/011_add_links.py +34 -0
- sky/schemas/generated/jobsv1_pb2.py +9 -5
- sky/schemas/generated/jobsv1_pb2.pyi +12 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +44 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +32 -28
- sky/schemas/generated/managed_jobsv1_pb2.pyi +11 -2
- sky/serve/serve_utils.py +232 -40
- sky/server/common.py +17 -0
- sky/server/constants.py +1 -1
- sky/server/metrics.py +6 -3
- sky/server/plugins.py +16 -0
- sky/server/requests/payloads.py +18 -0
- sky/server/requests/request_names.py +2 -0
- sky/server/requests/requests.py +28 -10
- sky/server/requests/serializers/encoders.py +5 -0
- sky/server/requests/serializers/return_value_serializers.py +14 -4
- sky/server/server.py +434 -107
- sky/server/uvicorn.py +5 -0
- sky/setup_files/MANIFEST.in +1 -0
- sky/setup_files/dependencies.py +21 -10
- sky/sky_logging.py +2 -1
- sky/skylet/constants.py +22 -5
- sky/skylet/executor/slurm.py +4 -6
- sky/skylet/job_lib.py +89 -4
- sky/skylet/services.py +18 -3
- sky/ssh_node_pools/deploy/tunnel/cleanup-tunnel.sh +62 -0
- sky/ssh_node_pools/deploy/tunnel/ssh-tunnel.sh +379 -0
- sky/templates/kubernetes-ray.yml.j2 +4 -6
- sky/templates/slurm-ray.yml.j2 +32 -2
- sky/templates/websocket_proxy.py +18 -41
- sky/users/permission.py +61 -51
- sky/utils/auth_utils.py +42 -0
- sky/utils/cli_utils/status_utils.py +19 -5
- sky/utils/cluster_utils.py +10 -3
- sky/utils/command_runner.py +256 -94
- sky/utils/command_runner.pyi +16 -0
- sky/utils/common_utils.py +30 -29
- sky/utils/context.py +32 -0
- sky/utils/db/db_utils.py +36 -6
- sky/utils/db/migration_utils.py +41 -21
- sky/utils/infra_utils.py +5 -1
- sky/utils/instance_links.py +139 -0
- sky/utils/interactive_utils.py +49 -0
- sky/utils/kubernetes/generate_kubeconfig.sh +42 -33
- sky/utils/kubernetes/rsync_helper.sh +5 -1
- sky/utils/plugin_extensions/__init__.py +14 -0
- sky/utils/plugin_extensions/external_failure_source.py +176 -0
- sky/utils/resources_utils.py +10 -8
- sky/utils/rich_utils.py +9 -11
- sky/utils/schemas.py +63 -20
- sky/utils/status_lib.py +7 -0
- sky/utils/subprocess_utils.py +17 -0
- sky/volumes/client/sdk.py +6 -3
- sky/volumes/server/core.py +65 -27
- sky_templates/ray/start_cluster +8 -4
- {skypilot_nightly-1.0.0.dev20251210.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/METADATA +53 -57
- {skypilot_nightly-1.0.0.dev20251210.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/RECORD +172 -162
- sky/dashboard/out/_next/static/KYAhEFa3FTfq4JyKVgo-s/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/1141-9c810f01ff4f398a.js +0 -11
- sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +0 -6
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +0 -1
- sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +0 -1
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +0 -15
- sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +0 -26
- sky/dashboard/out/_next/static/chunks/3294.ddda8c6c6f9f24dc.js +0 -1
- sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +0 -1
- sky/dashboard/out/_next/static/chunks/3800-b589397dc09c5b4e.js +0 -1
- sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +0 -1
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +0 -15
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +0 -13
- sky/dashboard/out/_next/static/chunks/6856-da20c5fd999f319c.js +0 -1
- sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +0 -1
- sky/dashboard/out/_next/static/chunks/6990-09cbf02d3cd518c3.js +0 -1
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +0 -30
- sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +0 -41
- sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +0 -1
- sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +0 -1
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +0 -6
- sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +0 -31
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +0 -30
- sky/dashboard/out/_next/static/chunks/pages/_app-68b647e26f9d2793.js +0 -34
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-33f525539665fdfd.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a7565f586ef86467.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-12c559ec4d81fdbd.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/infra-d187cd0413d72475.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-895847b6cf200b04.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-8d0f4655400b4eb9.js +0 -21
- sky/dashboard/out/_next/static/chunks/pages/jobs-e5a98f17f8513a96.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/users-2f7646eb77785a2c.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-cb4da3abe08ebf19.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-fba3de387ff6bb08.js +0 -1
- sky/dashboard/out/_next/static/css/c5a4cfd2600fc715.css +0 -3
- /sky/dashboard/out/_next/static/{KYAhEFa3FTfq4JyKVgo-s → 3nu-b8raeKRNABZ2d4GAG}/_ssgManifest.js +0 -0
- /sky/dashboard/out/_next/static/chunks/pages/plugins/{[...slug]-4f46050ca065d8f8.js → [...slug]-449a9f5a3bb20fb3.js} +0 -0
- {skypilot_nightly-1.0.0.dev20251210.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20251210.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20251210.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20251210.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/top_level.txt +0 -0
sky/__init__.py
CHANGED

@@ -7,7 +7,7 @@ import urllib.request
 from sky.utils import directory_utils
 
 # Replaced with the current commit when building the wheels.
-_SKYPILOT_COMMIT_SHA = '
+_SKYPILOT_COMMIT_SHA = '5f4cd3b33375c055093474b95f219d26018b7343'
 
 
 def _get_git_commit():
@@ -37,7 +37,7 @@ def _get_git_commit():
 
 
 __commit__ = _get_git_commit()
-__version__ = '1.0.0.dev20251210'
+__version__ = '1.0.0.dev20260112'
 __root_dir__ = directory_utils.get_sky_dir()
 
 
@@ -143,6 +143,7 @@ SCP = clouds.SCP
 Slurm = clouds.Slurm
 Kubernetes = clouds.Kubernetes
 K8s = Kubernetes
+SSH = clouds.SSH
 OCI = clouds.OCI
 Paperspace = clouds.Paperspace
 PrimeIntellect = clouds.PrimeIntellect
@@ -164,6 +165,7 @@ __all__ = [
     'IBM',
     'Kubernetes',
     'K8s',
+    'SSH',
     'Lambda',
     'OCI',
     'Paperspace',
sky/adaptors/slurm.py
CHANGED

@@ -1,11 +1,15 @@
 """Slurm adaptor for SkyPilot."""
 
+import ipaddress
 import logging
 import re
+import socket
 import time
 from typing import Dict, List, NamedTuple, Optional, Tuple
 
+from sky.adaptors import common
 from sky.utils import command_runner
+from sky.utils import common_utils
 from sky.utils import subprocess_utils
 from sky.utils import timeline
 
@@ -22,6 +26,11 @@ _PARTITION_NAME_REGEX = re.compile(r'PartitionName=(.+?)(?:\s+\w+=|$)')
 # Default timeout for waiting for job nodes to be allocated, in seconds.
 _SLURM_DEFAULT_PROVISION_TIMEOUT = 10
 
+_IMPORT_ERROR_MESSAGE = ('Failed to import dependencies for Slurm. '
+                         'Try running: pip install "skypilot[slurm]"')
+hostlist = common.LazyImport('hostlist',
+                             import_error_message=_IMPORT_ERROR_MESSAGE)
+
 
 class SlurmPartition(NamedTuple):
     """Information about the Slurm partitions."""
@@ -47,11 +56,13 @@ class SlurmClient:
 
     def __init__(
         self,
-        ssh_host: str,
-        ssh_port: int,
-        ssh_user: str,
-        ssh_key: Optional[str],
+        ssh_host: Optional[str] = None,
+        ssh_port: Optional[int] = None,
+        ssh_user: Optional[str] = None,
+        ssh_key: Optional[str] = None,
         ssh_proxy_command: Optional[str] = None,
+        ssh_proxy_jump: Optional[str] = None,
+        is_inside_slurm_cluster: bool = False,
     ):
         """Initialize SlurmClient.
 
@@ -61,21 +72,42 @@ class SlurmClient:
             ssh_user: SSH username.
             ssh_key: Path to SSH private key, or None for keyless SSH.
             ssh_proxy_command: Optional SSH proxy command.
+            ssh_proxy_jump: Optional SSH proxy jump destination.
+            is_inside_slurm_cluster: If True, uses local execution mode (for
+                when running on the Slurm cluster itself). Defaults to False.
         """
         self.ssh_host = ssh_host
         self.ssh_port = ssh_port
         self.ssh_user = ssh_user
         self.ssh_key = ssh_key
         self.ssh_proxy_command = ssh_proxy_command
-
-
-
-
-
-
-
-
-
+        self.ssh_proxy_jump = ssh_proxy_jump
+
+        self._runner: command_runner.CommandRunner
+
+        if is_inside_slurm_cluster:
+            # Local execution mode - for running on the Slurm cluster itself
+            # (e.g., autodown from skylet).
+            self._runner = command_runner.LocalProcessCommandRunner()
+        else:
+            # Remote execution via SSH
+            assert ssh_host is not None
+            assert ssh_port is not None
+            assert ssh_user is not None
+            self._runner = command_runner.SSHCommandRunner(
+                (ssh_host, ssh_port),
+                ssh_user,
+                ssh_key,
+                ssh_proxy_command=ssh_proxy_command,
+                ssh_proxy_jump=ssh_proxy_jump,
+                enable_interactive_auth=True,
+            )
+
+    def _run_slurm_cmd(self, cmd: str) -> Tuple[int, str, str]:
+        return self._runner.run(cmd,
+                                require_outputs=True,
+                                separate_stderr=True,
+                                stream_logs=False)
 
     def query_jobs(
         self,
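
The constructor now selects between two execution backends. A hedged sketch of both construction paths, with the host, user, and key values below as placeholders:

    from sky.adaptors import slurm

    # Remote mode: Slurm commands run on the login node over SSH.
    client = slurm.SlurmClient(ssh_host='login.example.com',
                               ssh_port=22,
                               ssh_user='ubuntu',
                               ssh_key='~/.ssh/id_rsa')

    # Local mode (e.g. autodown from skylet on the cluster itself):
    # commands go through LocalProcessCommandRunner, so no SSH
    # arguments are needed.
    local_client = slurm.SlurmClient(is_inside_slurm_cluster=True)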
@@ -99,13 +131,11 @@ class SlurmClient:
         if job_name is not None:
             cmd += f' --name {job_name}'
 
-        rc, stdout, stderr = self.
-                              require_outputs=True,
-                              stream_logs=False)
+        rc, stdout, stderr = self._run_slurm_cmd(cmd)
         subprocess_utils.handle_returncode(rc,
                                            cmd,
                                            'Failed to query Slurm jobs.',
-                                           stderr=stderr)
+                                           stderr=f'{stdout}\n{stderr}')
 
         job_ids = stdout.strip().splitlines()
         return job_ids
@@ -128,13 +158,11 @@ class SlurmClient:
             cmd += f' --signal {signal}'
         if full:
             cmd += ' --full'
-        rc, stdout, stderr = self.
-                              require_outputs=True,
-                              stream_logs=False)
+        rc, stdout, stderr = self._run_slurm_cmd(cmd)
         subprocess_utils.handle_returncode(rc,
                                            cmd,
                                            f'Failed to cancel job {job_name}.',
-                                           stderr=stderr)
+                                           stderr=f'{stdout}\n{stderr}')
         logger.debug(f'Successfully cancelled job {job_name}: {stdout}')
 
     def info(self) -> str:
@@ -147,11 +175,12 @@ class SlurmClient:
             The stdout output from sinfo.
         """
         cmd = 'sinfo'
-        rc, stdout, stderr = self.
-                              require_outputs=True,
-                              stream_logs=False)
+        rc, stdout, stderr = self._run_slurm_cmd(cmd)
         subprocess_utils.handle_returncode(
-            rc,
+            rc,
+            cmd,
+            'Failed to get Slurm cluster information.',
+            stderr=f'{stdout}\n{stderr}')
         return stdout
 
     def info_nodes(self) -> List[NodeInfo]:
@@ -162,11 +191,12 @@ class SlurmClient:
         """
         cmd = (f'sinfo -h --Node -o '
                f'"%N{SEP}%t{SEP}%G{SEP}%c{SEP}%m{SEP}%P"')
-        rc, stdout, stderr = self.
-                              require_outputs=True,
-                              stream_logs=False)
+        rc, stdout, stderr = self._run_slurm_cmd(cmd)
         subprocess_utils.handle_returncode(
-            rc,
+            rc,
+            cmd,
+            'Failed to get Slurm node information.',
+            stderr=f'{stdout}\n{stderr}')
 
         nodes = []
         for line in stdout.splitlines():
@@ -211,31 +241,63 @@ class SlurmClient:
             return node_info
 
         cmd = f'scontrol show node {node_name}'
-        rc, node_details,
-                          require_outputs=True,
-                          stream_logs=False)
+        rc, node_details, stderr = self._run_slurm_cmd(cmd)
         subprocess_utils.handle_returncode(
             rc,
             cmd,
             f'Failed to get detailed node information for {node_name}.',
-            stderr=node_details)
+            stderr=f'{node_details}\n{stderr}')
         node_info = _parse_scontrol_node_output(node_details)
         return node_info
 
-    def
-        """Get the list of jobs for a given node name.
+    def get_jobs_gres(self, node_name: str) -> List[str]:
+        """Get the list of jobs GRES for a given node name.
 
         Returns:
-            A list of
+            A list of GRES specs (e.g., 'gres/gpu:h100:4')
+            for jobs on the node.
         """
-        cmd = f'squeue
-        rc, stdout, stderr = self.
-                              require_outputs=True,
-                              stream_logs=False)
+        cmd = f'squeue -h --nodelist {node_name} -o "%b"'
+        rc, stdout, stderr = self._run_slurm_cmd(cmd)
         subprocess_utils.handle_returncode(
-            rc,
+            rc,
+            cmd,
+            f'Failed to get jobs for node {node_name}.',
+            stderr=f'{stdout}\n{stderr}')
         return stdout.splitlines()
 
+    def get_all_jobs_gres(self) -> Dict[str, List[str]]:
+        """Get GRES allocation for all running jobs, grouped by node.
+
+        Returns:
+            Dict mapping node_name -> list of GRES strings for jobs on that
+            node.
+        """
+        cmd = f'squeue -h --states=running,completing -o "%N{SEP}%b"'
+        rc, stdout, stderr = self._run_slurm_cmd(cmd)
+        subprocess_utils.handle_returncode(rc,
+                                           cmd,
+                                           'Failed to get all jobs GRES.',
+                                           stderr=f'{stdout}\n{stderr}')
+
+        nodes_to_gres: Dict[str, List[str]] = {}
+        for line in stdout.splitlines():
+            line = line.strip()
+            if not line:
+                continue
+            parts = line.split(SEP)
+            if len(parts) != 2:
+                # We should never reach here, but just in case.
+                continue
+            nodelist_str, gres_str = parts
+            if not gres_str or gres_str == 'N/A':
+                continue
+
+            for node in hostlist.expand_hostlist(nodelist_str):
+                nodes_to_gres.setdefault(node, []).append(gres_str)
+
+        return nodes_to_gres
+
     def get_job_state(self, job_id: str) -> Optional[str]:
         """Get the state of a Slurm job.
 
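
get_all_jobs_gres relies on the python-hostlist package (the lazy import added at the top of this file) to expand Slurm's compressed nodelists before grouping. An illustrative sketch, with the nodelist and GRES string below as made-up examples:

    import hostlist  # python-hostlist; lazily imported by the adaptor

    nodes_to_gres = {}
    # squeue reports one compressed nodelist per job, e.g. 'gpu[01-03]'.
    for node in hostlist.expand_hostlist('gpu[01-03]'):
        nodes_to_gres.setdefault(node, []).append('gres/gpu:h100:4')
    # -> {'gpu01': ['gres/gpu:h100:4'], 'gpu02': [...], 'gpu03': [...]}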
@@ -249,17 +311,30 @@ class SlurmClient:
         # Use --only-job-state since we only need the job state.
         # This reduces the work required by slurmctld.
         cmd = f'squeue -h --only-job-state --jobs {job_id} -o "%T"'
-        rc, stdout, stderr = self.
-
-
-
-
-
-            return None
+        rc, stdout, stderr = self._run_slurm_cmd(cmd)
+        subprocess_utils.handle_returncode(
+            rc,
+            cmd,
+            f'Failed to get job state for job {job_id}.',
+            stderr=f'{stdout}\n{stderr}')
 
         state = stdout.strip()
         return state if state else None
 
+    def get_jobs_state_by_name(self, job_name: str) -> List[str]:
+        """Get the states of all Slurm jobs by name.
+        """
+        cmd = f'squeue -h --name {job_name} -o "%T"'
+        rc, stdout, stderr = self._run_slurm_cmd(cmd)
+        subprocess_utils.handle_returncode(
+            rc,
+            cmd,
+            f'Failed to get job state for job {job_name}.',
+            stderr=f'{stdout}\n{stderr}')
+
+        states = stdout.splitlines()
+        return states
+
     @timeline.event
     def get_job_reason(self, job_id: str) -> Optional[str]:
         """Get the reason a job is in its current state
@@ -269,12 +344,12 @@ class SlurmClient:
         """
         # Without --states all, squeue omits terminated jobs.
         cmd = f'squeue -h --jobs {job_id} --states all -o "%r"'
-        rc, stdout, stderr = self.
-
-
-
-
-
+        rc, stdout, stderr = self._run_slurm_cmd(cmd)
+        subprocess_utils.handle_returncode(
+            rc,
+            cmd,
+            f'Failed to get job reason for job {job_id}.',
+            stderr=f'{stdout}\n{stderr}')
 
         output = stdout.strip()
         if not output:
@@ -312,9 +387,7 @@ class SlurmClient:
 
         # Check if nodes are allocated by trying to get node list
         cmd = f'squeue -h --jobs {job_id} -o "%N"'
-        rc, stdout, stderr = self.
-                              require_outputs=True,
-                              stream_logs=False)
+        rc, stdout, stderr = self._run_slurm_cmd(cmd)
 
         if rc == 0 and stdout.strip():
             # Nodes are allocated
@@ -322,7 +395,8 @@ class SlurmClient:
                 f'Job {job_id} has nodes allocated: {stdout.strip()}')
             return
         elif rc != 0:
-            logger.debug(f'Failed to get nodes for job {job_id}:
+            logger.debug(f'Failed to get nodes for job {job_id}: '
+                         f'{stdout}\n{stderr}')
 
         # Wait before checking again
         time.sleep(2)
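
Together with get_jobs_state_by_name, the state query above gives callers enough to poll a job until it leaves the queue. A hedged sketch of such a loop; the terminal-state set below is Slurm's conventional one and is an assumption, not taken from this diff:

    import time

    def wait_until_done(client, job_id, poll_secs=5):
        terminal = {'COMPLETED', 'FAILED', 'CANCELLED', 'TIMEOUT'}
        while True:
            state = client.get_job_state(job_id)
            # squeue stops listing finished jobs, so None also means done.
            if state is None or state in terminal:
                return state
            time.sleep(poll_secs)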
@@ -359,15 +433,16 @@ class SlurmClient:
             f'squeue -h --jobs {job_id} -o "%N" | tr \',\' \'\\n\' | '
             f'while read node; do '
             # TODO(kevin): Use json output for more robust parsing.
-            f'
+            f'node_addr=$(scontrol show node=$node | grep NodeAddr= | '
             f'awk -F= \'{{print $2}}\' | awk \'{{print $1}}\'); '
-            f'echo "$node $
+            f'echo "$node $node_addr"; '
             f'done')
-        rc, stdout, stderr = self.
-                              require_outputs=True,
-                              stream_logs=False)
+        rc, stdout, stderr = self._run_slurm_cmd(cmd)
         subprocess_utils.handle_returncode(
-            rc,
+            rc,
+            cmd,
+            f'Failed to get nodes for job {job_id}.',
+            stderr=f'{stdout}\n{stderr}')
         logger.debug(f'Successfully got nodes for job {job_id}: {stdout}')
 
         node_info = {}
@@ -377,7 +452,23 @@ class SlurmClient:
             parts = line.split()
             if len(parts) >= 2:
                 node_name = parts[0]
-
+                node_addr = parts[1]
+                # Resolve hostname to IP if node_addr is not already
+                # an IP address.
+                try:
+                    ipaddress.ip_address(node_addr)
+                    # Already an IP address
+                    node_ip = node_addr
+                except ValueError:
+                    # It's a hostname, resolve it to an IP
+                    try:
+                        node_ip = socket.gethostbyname(node_addr)
+                    except socket.gaierror as e:
+                        raise RuntimeError(
+                            f'Failed to resolve hostname {node_addr} to IP '
+                            f'for node {node_name}: '
+                            f'{common_utils.format_exception(e)}') from e
+
                 node_info[node_name] = node_ip
 
         nodes = list(node_info.keys())
@@ -408,9 +499,7 @@ class SlurmClient:
             The job ID of the submitted job.
         """
         cmd = f'sbatch --partition={partition} {script_path}'
-        rc, stdout, stderr = self.
-                              require_outputs=True,
-                              stream_logs=False)
+        rc, stdout, stderr = self._run_slurm_cmd(cmd)
         subprocess_utils.handle_returncode(rc,
                                            cmd,
                                            'Failed to submit Slurm job.',
@@ -435,13 +524,11 @@ class SlurmClient:
             List of SlurmPartition objects.
         """
         cmd = 'scontrol show partitions -o'
-        rc, stdout, stderr = self.
-                              require_outputs=True,
-                              stream_logs=False)
+        rc, stdout, stderr = self._run_slurm_cmd(cmd)
         subprocess_utils.handle_returncode(rc,
                                            cmd,
                                            'Failed to get Slurm partitions.',
-                                           stderr=stderr)
+                                           stderr=f'{stdout}\n{stderr}')
 
         partitions = []
         for line in stdout.strip().splitlines():
sky/backends/backend_utils.py
CHANGED

@@ -69,6 +69,7 @@ from sky.utils import timeline
 from sky.utils import ux_utils
 from sky.utils import volume as volume_utils
 from sky.utils import yaml_utils
+from sky.utils.plugin_extensions import ExternalFailureSource
 from sky.workspaces import core as workspaces_core
 
 if typing.TYPE_CHECKING:
@@ -763,7 +764,20 @@ def write_cluster_config(
         keys=('allowed_contexts',),
         default_value=None)
     if allowed_contexts is None:
-
+        # Exclude both Kubernetes and SSH explicitly since:
+        # 1. isinstance(cloud, clouds.Kubernetes) matches both (SSH
+        #    inherits from Kubernetes)
+        # 2. Both share the same get_credential_file_mounts() which
+        #    returns the kubeconfig. So if we don't exclude both, the
+        #    unexcluded one will upload the kubeconfig.
+        # TODO(romilb): This is a workaround. The right long-term fix
+        # is to have SSH Node Pools use its own kubeconfig instead of
+        # sharing the global kubeconfig at ~/.kube/config. In the
+        # interim, SSH Node Pools' get_credential_file_mounts can filter
+        # contexts starting with ssh- and create a temp kubeconfig
+        # to upload.
+        excluded_clouds.add(clouds.Kubernetes())
+        excluded_clouds.add(clouds.SSH())
     else:
         excluded_clouds.add(cloud)
 
@@ -2262,6 +2276,12 @@ def _update_cluster_status(
                        for status in node_statuses) and
                    len(node_statuses) == handle.launched_nodes)
 
+    external_cluster_failures = ExternalFailureSource.get(
+        cluster_hash=record['cluster_hash'])
+    logger.debug(f'Cluster {cluster_name} with cluster_hash '
+                 f'{record["cluster_hash"]} has external cluster failures: '
+                 f'{external_cluster_failures}')
+
     def get_node_counts_from_ray_status(
             runner: command_runner.CommandRunner) -> Tuple[int, int, str, str]:
         rc, output, stderr = runner.run(
@@ -2401,8 +2421,9 @@ def _update_cluster_status(
 
     # For Slurm, skip Ray health check since it doesn't use Ray.
     should_check_ray = cloud is not None and cloud.uses_ray()
-    if all_nodes_up and (not should_check_ray or
-
+    if (all_nodes_up and (not should_check_ray or
+                          run_ray_status_to_check_ray_cluster_healthy()) and
+            not external_cluster_failures):
         # NOTE: all_nodes_up calculation is fast due to calling cloud CLI;
         # run_ray_status_to_check_all_nodes_up() is slow due to calling `ray get
         # head-ip/worker-ips`.
@@ -2505,15 +2526,15 @@ def _update_cluster_status(
     # (2) Otherwise, we will reset the autostop setting, unless the cluster is
     # autostopping/autodowning.
     some_nodes_terminated = 0 < len(node_statuses) < handle.launched_nodes
-    # If all nodes are up and ray cluster is health, we would have returned
-    # earlier. So if all_nodes_up is True and we are here, it means the ray
-    # cluster must have been unhealthy.
-    ray_cluster_unhealthy = all_nodes_up
     some_nodes_not_stopped = any(status[0] != status_lib.ClusterStatus.STOPPED
                                  for status in node_statuses)
     is_abnormal = (some_nodes_terminated or some_nodes_not_stopped)
 
-    if is_abnormal:
+    if is_abnormal and not external_cluster_failures:
+        # If all nodes are up and ray cluster is healthy, we would have returned
+        # earlier. So if all_nodes_up is True and we are here, it means the ray
+        # cluster must have been unhealthy.
+        ray_cluster_unhealthy = all_nodes_up
         status_reason = ', '.join(
             [status[1] for status in node_statuses if status[1] is not None])
 
@@ -2641,8 +2662,25 @@ def _update_cluster_status(
         cluster_name,
         include_user_info=include_user_info,
         summary_response=summary_response)
-    # Now
-    #
+    # Now either:
+    # (1) is_abnormal is False: either node_statuses is empty or all nodes are
+    # STOPPED
+    # or
+    # (2) there are external cluster failures reported by a plugin.
+
+    # If there are external cluster failures and the cluster has not been
+    # terminated on cloud (to_terminate), we can return the cluster record as is.
+    # This is because when an external failure is detected, the cluster will be
+    # marked as INIT with a reason indicating the details of the failure. So, we
+    # do not want to modify the cluster status in this function except for in the
+    # case where the cluster has been terminated on cloud, in which case we should
+    # clean up the cluster from SkyPilot's global state.
+    if external_cluster_failures and not to_terminate:
+        return global_user_state.get_cluster_from_name(
+            cluster_name,
+            include_user_info=include_user_info,
+            summary_response=summary_response)
+
     verb = 'terminated' if to_terminate else 'stopped'
     backend = backends.CloudVmRayBackend()
     global_user_state.add_cluster_event(
@@ -3368,6 +3406,8 @@ def get_clusters(
         handle = record['handle']
         record['nodes'] = handle.launched_nodes
         if handle.launched_resources is None:
+            # Set default values when launched_resources is None
+            record['labels'] = {}
             continue
         record['cloud'] = (f'{handle.launched_resources.cloud}'
                            if handle.launched_resources.cloud else None)
@@ -3380,6 +3420,8 @@ def get_clusters(
         record['accelerators'] = (
             f'{handle.launched_resources.accelerators}'
             if handle.launched_resources.accelerators else None)
+        record['labels'] = (handle.launched_resources.labels
+                            if handle.launched_resources.labels else {})
         if not include_handle:
             record.pop('handle', None)
 
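
Across these hunks the only plugin API visible is ExternalFailureSource.get(cluster_hash=...). A hedged sketch of the short-circuit it enables in _update_cluster_status; the wrapper function is hypothetical and the call to get_cluster_from_name is simplified:

    from sky import global_user_state
    from sky.utils.plugin_extensions import ExternalFailureSource

    def short_circuit_on_external_failure(record, cluster_name, to_terminate):
        failures = ExternalFailureSource.get(
            cluster_hash=record['cluster_hash'])
        if failures and not to_terminate:
            # A plugin-reported failure leaves the cluster marked INIT with
            # a failure reason, so return the record unchanged.
            return global_user_state.get_cluster_from_name(cluster_name)
        return None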