skypilot-nightly 1.0.0.dev20251203__py3-none-any.whl → 1.0.0.dev20251210__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +4 -2
- sky/adaptors/aws.py +1 -61
- sky/adaptors/slurm.py +478 -0
- sky/backends/backend_utils.py +45 -4
- sky/backends/cloud_vm_ray_backend.py +32 -33
- sky/backends/task_codegen.py +340 -2
- sky/catalog/__init__.py +0 -3
- sky/catalog/kubernetes_catalog.py +12 -4
- sky/catalog/slurm_catalog.py +243 -0
- sky/check.py +14 -3
- sky/client/cli/command.py +329 -22
- sky/client/sdk.py +56 -2
- sky/clouds/__init__.py +2 -0
- sky/clouds/cloud.py +7 -0
- sky/clouds/slurm.py +578 -0
- sky/clouds/ssh.py +2 -1
- sky/clouds/vast.py +10 -0
- sky/core.py +128 -36
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/KYAhEFa3FTfq4JyKVgo-s/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/3294.ddda8c6c6f9f24dc.js +1 -0
- sky/dashboard/out/_next/static/chunks/3850-fd5696f3bbbaddae.js +1 -0
- sky/dashboard/out/_next/static/chunks/6856-da20c5fd999f319c.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-09cbf02d3cd518c3.js +1 -0
- sky/dashboard/out/_next/static/chunks/9353-8369df1cf105221c.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-68b647e26f9d2793.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-33f525539665fdfd.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/{[cluster]-abfcac9c137aa543.js → [cluster]-a7565f586ef86467.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{clusters-ee39056f9851a3ff.js → clusters-9e5d47818b9bdadd.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{config-dfb9bf07b13045f4.js → config-718cdc365de82689.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-c0b5935149902e6f.js → [context]-12c559ec4d81fdbd.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{infra-aed0ea19df7cf961.js → infra-d187cd0413d72475.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-895847b6cf200b04.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/{[pool]-9faf940b253e3e06.js → [pool]-8d0f4655400b4eb9.js} +2 -2
- sky/dashboard/out/_next/static/chunks/pages/{jobs-2072b48b617989c9.js → jobs-e5a98f17f8513a96.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/plugins/[...slug]-4f46050ca065d8f8.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/{users-f42674164aa73423.js → users-2f7646eb77785a2c.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{volumes-b84b948ff357c43e.js → volumes-ef19d49c6d0e8500.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-84a40f8c7c627fe4.js → [name]-96e0f298308da7e2.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{workspaces-531b2f8c4bf89f82.js → workspaces-cb4da3abe08ebf19.js} +1 -1
- sky/dashboard/out/_next/static/chunks/{webpack-64e05f17bf2cf8ce.js → webpack-fba3de387ff6bb08.js} +1 -1
- sky/dashboard/out/_next/static/css/c5a4cfd2600fc715.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/plugins/[...slug].html +1 -0
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/mounting_utils.py +16 -2
- sky/global_user_state.py +3 -3
- sky/models.py +2 -0
- sky/optimizer.py +6 -5
- sky/provision/__init__.py +1 -0
- sky/provision/common.py +20 -0
- sky/provision/docker_utils.py +15 -2
- sky/provision/kubernetes/utils.py +42 -6
- sky/provision/provisioner.py +15 -6
- sky/provision/slurm/__init__.py +12 -0
- sky/provision/slurm/config.py +13 -0
- sky/provision/slurm/instance.py +572 -0
- sky/provision/slurm/utils.py +583 -0
- sky/provision/vast/instance.py +4 -1
- sky/provision/vast/utils.py +10 -6
- sky/serve/server/impl.py +1 -1
- sky/server/constants.py +1 -1
- sky/server/plugins.py +222 -0
- sky/server/requests/executor.py +5 -2
- sky/server/requests/payloads.py +12 -1
- sky/server/requests/request_names.py +2 -0
- sky/server/requests/requests.py +5 -1
- sky/server/requests/serializers/encoders.py +17 -0
- sky/server/requests/serializers/return_value_serializers.py +60 -0
- sky/server/server.py +78 -8
- sky/server/server_utils.py +30 -0
- sky/setup_files/dependencies.py +2 -0
- sky/skylet/attempt_skylet.py +13 -3
- sky/skylet/constants.py +34 -9
- sky/skylet/events.py +10 -4
- sky/skylet/executor/__init__.py +1 -0
- sky/skylet/executor/slurm.py +189 -0
- sky/skylet/job_lib.py +2 -1
- sky/skylet/log_lib.py +22 -6
- sky/skylet/log_lib.pyi +8 -6
- sky/skylet/skylet.py +5 -1
- sky/skylet/subprocess_daemon.py +2 -1
- sky/ssh_node_pools/constants.py +12 -0
- sky/ssh_node_pools/core.py +40 -3
- sky/ssh_node_pools/deploy/__init__.py +4 -0
- sky/{utils/kubernetes/deploy_ssh_node_pools.py → ssh_node_pools/deploy/deploy.py} +279 -504
- sky/ssh_node_pools/deploy/tunnel_utils.py +199 -0
- sky/ssh_node_pools/deploy/utils.py +173 -0
- sky/ssh_node_pools/server.py +11 -13
- sky/{utils/kubernetes/ssh_utils.py → ssh_node_pools/utils.py} +9 -6
- sky/templates/kubernetes-ray.yml.j2 +8 -0
- sky/templates/slurm-ray.yml.j2 +85 -0
- sky/templates/vast-ray.yml.j2 +1 -0
- sky/users/model.conf +1 -1
- sky/users/permission.py +24 -1
- sky/users/rbac.py +31 -3
- sky/utils/annotations.py +108 -8
- sky/utils/command_runner.py +197 -5
- sky/utils/command_runner.pyi +27 -4
- sky/utils/common_utils.py +18 -3
- sky/utils/kubernetes/kubernetes_deploy_utils.py +2 -94
- sky/utils/kubernetes/ssh-tunnel.sh +7 -376
- sky/utils/schemas.py +31 -0
- {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/METADATA +48 -36
- {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/RECORD +125 -107
- sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +0 -1
- sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +0 -1
- sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +0 -1
- sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +0 -1
- sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +0 -34
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +0 -16
- sky/dashboard/out/_next/static/css/0748ce22df867032.css +0 -3
- sky/utils/kubernetes/cleanup-tunnel.sh +0 -62
- /sky/dashboard/out/_next/static/{96_E2yl3QAiIJGOYCkSpB → KYAhEFa3FTfq4JyKVgo-s}/_ssgManifest.js +0 -0
- /sky/dashboard/out/_next/static/chunks/{1141-e6aa9ab418717c59.js → 1141-9c810f01ff4f398a.js} +0 -0
- /sky/dashboard/out/_next/static/chunks/{3800-7b45f9fbb6308557.js → 3800-b589397dc09c5b4e.js} +0 -0
- {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/top_level.txt +0 -0
sky/__init__.py
CHANGED
@@ -7,7 +7,7 @@ import urllib.request
 from sky.utils import directory_utils
 
 # Replaced with the current commit when building the wheels.
-_SKYPILOT_COMMIT_SHA = '
+_SKYPILOT_COMMIT_SHA = 'c28d94abd967c1a7494e3c343f92eb6d02d29541'
 
 
 def _get_git_commit():
@@ -37,7 +37,7 @@ def _get_git_commit():
 
 
 __commit__ = _get_git_commit()
-__version__ = '1.0.0.
+__version__ = '1.0.0.dev20251210'
 __root_dir__ = directory_utils.get_sky_dir()
 
 
@@ -140,6 +140,7 @@ Cudo = clouds.Cudo
 GCP = clouds.GCP
 Lambda = clouds.Lambda
 SCP = clouds.SCP
+Slurm = clouds.Slurm
 Kubernetes = clouds.Kubernetes
 K8s = Kubernetes
 OCI = clouds.OCI
@@ -170,6 +171,7 @@ __all__ = [
     'RunPod',
     'Vast',
     'SCP',
+    'Slurm',
     'Vsphere',
     'Fluidstack',
     'Nebius',
sky/adaptors/aws.py
CHANGED
@@ -28,7 +28,6 @@ This is informed by the following boto3 docs:
 
 # pylint: disable=import-outside-toplevel
 
-import functools
 import logging
 import threading
 import time
@@ -69,65 +68,6 @@ version = 1
 _MAX_ATTEMPT_FOR_CREATION = 5
 
 
-class _ThreadLocalTTLCache(threading.local):
-    """Thread-local storage for _thread_local_lru_cache decorator."""
-
-    def __init__(self, func, maxsize: int, ttl: int):
-        super().__init__()
-        self.func = func
-        self.maxsize = maxsize
-        self.ttl = ttl
-
-    def get_cache(self):
-        if not hasattr(self, 'cache'):
-            self.cache = annotations.ttl_cache(scope='request',
-                                               maxsize=self.maxsize,
-                                               ttl=self.ttl,
-                                               timer=time.time)(self.func)
-        return self.cache
-
-
-def _thread_local_ttl_cache(maxsize=32, ttl=60 * 55):
-    """Thread-local TTL cache decorator.
-
-    Args:
-        maxsize: Maximum size of the cache.
-        ttl: Time to live for the cache in seconds.
-            Default is 55 minutes, a bit less than 1 hour
-            default lifetime of an STS token.
-    """
-
-    def decorator(func):
-        # Create thread-local storage for the LRU cache
-        local_cache = _ThreadLocalTTLCache(func, maxsize, ttl)
-
-        # We can't apply the lru_cache here, because this runs at import time
-        # so we will always have the main thread's cache.
-
-        @functools.wraps(func)
-        def wrapper(*args, **kwargs):
-            # We are within the actual function call, which may be on a thread,
-            # so local_cache.cache will return the correct thread-local cache,
-            # which we can now apply and immediately call.
-            return local_cache.get_cache()(*args, **kwargs)
-
-        def cache_info():
-            # Note that this will only give the cache info for the current
-            # thread's cache.
-            return local_cache.get_cache().cache_info()
-
-        def cache_clear():
-            # Note that this will only clear the cache for the current thread.
-            local_cache.get_cache().cache_clear()
-
-        wrapper.cache_info = cache_info  # type: ignore[attr-defined]
-        wrapper.cache_clear = cache_clear  # type: ignore[attr-defined]
-
-        return wrapper
-
-    return decorator
-
-
 def _assert_kwargs_builtin_type(kwargs):
     assert all(isinstance(v, (int, float, str)) for v in kwargs.values()), (
         f'kwargs should not contain none built-in types: {kwargs}')
@@ -174,7 +114,7 @@ def get_workspace_profile() -> Optional[str]:
 
 # The TTL cache needs to be thread-local to avoid multiple threads sharing the
 # same session object, which is not guaranteed to be thread-safe.
-@
+@annotations.thread_local_ttl_cache()
 def session(check_credentials: bool = True, profile: Optional[str] = None):
     """Create an AWS session.
 
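
Note: the removed local `_thread_local_ttl_cache` helper above is superseded by a shared decorator in `sky.utils.annotations` (see the `sky/utils/annotations.py +108 -8` entry in the file list). A minimal sketch of the decorator pattern as it is used here, based only on the lines shown in this diff; `expensive_lookup` is a made-up function, and the shared decorator's parameters and defaults are not shown in this hunk:

    import time

    from sky.utils import annotations


    @annotations.thread_local_ttl_cache()
    def expensive_lookup(key: str) -> float:
        # Each thread keeps its own TTL-bounded cache, mirroring how
        # aws.session() now keeps one boto3 session per thread instead of
        # sharing a session object that is not guaranteed to be thread-safe.
        time.sleep(0.1)
        return time.time()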
sky/adaptors/slurm.py
ADDED
@@ -0,0 +1,478 @@
+"""Slurm adaptor for SkyPilot."""
+
+import logging
+import re
+import time
+from typing import Dict, List, NamedTuple, Optional, Tuple
+
+from sky.utils import command_runner
+from sky.utils import subprocess_utils
+from sky.utils import timeline
+
+logger = logging.getLogger(__name__)
+
+# ASCII Unit Separator (\x1f) to handle values with spaces
+# and other special characters.
+SEP = r'\x1f'
+
+# Regex pattern to extract partition names from scontrol output
+# Matches PartitionName=<name> and captures until the next field
+_PARTITION_NAME_REGEX = re.compile(r'PartitionName=(.+?)(?:\s+\w+=|$)')
+
+# Default timeout for waiting for job nodes to be allocated, in seconds.
+_SLURM_DEFAULT_PROVISION_TIMEOUT = 10
+
+
+class SlurmPartition(NamedTuple):
+    """Information about the Slurm partitions."""
+    name: str
+    is_default: bool
+
+
+# TODO(kevin): Add more API types for other client functions.
+class NodeInfo(NamedTuple):
+    """Information about a Slurm node from sinfo."""
+    node: str
+    state: str
+    gres: str
+    cpus: int
+    memory_gb: float
+    # The default partition contains a '*' at the end of the name.
+    # It is the caller's responsibility to strip the '*' if needed.
+    partition: str
+
+
+class SlurmClient:
+    """Client for Slurm control plane operations."""
+
+    def __init__(
+        self,
+        ssh_host: str,
+        ssh_port: int,
+        ssh_user: str,
+        ssh_key: Optional[str],
+        ssh_proxy_command: Optional[str] = None,
+    ):
+        """Initialize SlurmClient.
+
+        Args:
+            ssh_host: Hostname of the Slurm controller.
+            ssh_port: SSH port on the controller.
+            ssh_user: SSH username.
+            ssh_key: Path to SSH private key, or None for keyless SSH.
+            ssh_proxy_command: Optional SSH proxy command.
+        """
+        self.ssh_host = ssh_host
+        self.ssh_port = ssh_port
+        self.ssh_user = ssh_user
+        self.ssh_key = ssh_key
+        self.ssh_proxy_command = ssh_proxy_command
+
+        # Internal runner for executing Slurm CLI commands
+        # on the controller node.
+        self._runner = command_runner.SSHCommandRunner(
+            (ssh_host, ssh_port),
+            ssh_user,
+            ssh_key,
+            ssh_proxy_command=ssh_proxy_command,
+        )
+
+    def query_jobs(
+        self,
+        job_name: Optional[str] = None,
+        state_filters: Optional[List[str]] = None,
+    ) -> List[str]:
+        """Query Slurm jobs by state and optional name.
+
+        Args:
+            job_name: Optional job name to filter by.
+            state_filters: List of job states to filter by
+                (e.g., ['running', 'pending']). If None, returns all jobs.
+
+        Returns:
+            List of job IDs matching the filters.
+        """
+        cmd = 'squeue --me -h -o "%i"'
+        if state_filters is not None:
+            state_filters_str = ','.join(state_filters)
+            cmd += f' --states {state_filters_str}'
+        if job_name is not None:
+            cmd += f' --name {job_name}'
+
+        rc, stdout, stderr = self._runner.run(cmd,
+                                              require_outputs=True,
+                                              stream_logs=False)
+        subprocess_utils.handle_returncode(rc,
+                                           cmd,
+                                           'Failed to query Slurm jobs.',
+                                           stderr=stderr)
+
+        job_ids = stdout.strip().splitlines()
+        return job_ids
+
+    def cancel_jobs_by_name(self,
+                            job_name: str,
+                            signal: Optional[str] = None,
+                            full: bool = False) -> None:
+        """Cancel Slurm job(s) by name.
+
+        Args:
+            job_name: Name of the job(s) to cancel.
+            signal: Optional signal to send to the job(s).
+            full: If True, signals the batch script and its children processes.
+                By default, signals other than SIGKILL are not sent to the
+                batch step (the shell script).
+        """
+        cmd = f'scancel --name {job_name}'
+        if signal is not None:
+            cmd += f' --signal {signal}'
+        if full:
+            cmd += ' --full'
+        rc, stdout, stderr = self._runner.run(cmd,
+                                              require_outputs=True,
+                                              stream_logs=False)
+        subprocess_utils.handle_returncode(rc,
+                                           cmd,
+                                           f'Failed to cancel job {job_name}.',
+                                           stderr=stderr)
+        logger.debug(f'Successfully cancelled job {job_name}: {stdout}')
+
+    def info(self) -> str:
+        """Get Slurm cluster information.
+
+        This is useful for checking if the cluster is accessible and
+        retrieving node information.
+
+        Returns:
+            The stdout output from sinfo.
+        """
+        cmd = 'sinfo'
+        rc, stdout, stderr = self._runner.run(cmd,
+                                              require_outputs=True,
+                                              stream_logs=False)
+        subprocess_utils.handle_returncode(
+            rc, cmd, 'Failed to get Slurm cluster information.', stderr=stderr)
+        return stdout
+
+    def info_nodes(self) -> List[NodeInfo]:
+        """Get Slurm node information.
+
+        Returns node names, states, GRES (generic resources like GPUs),
+        CPUs, memory (MB), and partitions.
+        """
+        cmd = (f'sinfo -h --Node -o '
+               f'"%N{SEP}%t{SEP}%G{SEP}%c{SEP}%m{SEP}%P"')
+        rc, stdout, stderr = self._runner.run(cmd,
+                                              require_outputs=True,
+                                              stream_logs=False)
+        subprocess_utils.handle_returncode(
+            rc, cmd, 'Failed to get Slurm node information.', stderr=stderr)
+
+        nodes = []
+        for line in stdout.splitlines():
+            parts = line.split(SEP)
+            if len(parts) != 6:
+                raise RuntimeError(
+                    f'Unexpected output format from sinfo: {line!r}')
+            try:
+                node_info = NodeInfo(node=parts[0],
+                                     state=parts[1],
+                                     gres=parts[2],
+                                     cpus=int(parts[3]),
+                                     memory_gb=int(parts[4]) / 1024.0,
+                                     partition=parts[5])
+                nodes.append(node_info)
+            except ValueError as e:
+                raise RuntimeError(
+                    f'Failed to parse node info from line: {line!r}. '
+                    f'Error: {e}') from e
+
+        return nodes
+
+    def node_details(self, node_name: str) -> Dict[str, str]:
+        """Get detailed Slurm node information.
+
+        Returns:
+            A dictionary of node attributes.
+        """
+
+        def _parse_scontrol_node_output(output: str) -> Dict[str, str]:
+            """Parses the key=value output of 'scontrol show node'."""
+            node_info = {}
+            # Split by space, handling values that might have spaces
+            # if quoted. This is simplified; scontrol can be complex.
+            parts = output.split()
+            for part in parts:
+                if '=' in part:
+                    key, value = part.split('=', 1)
+                    # Simple quote removal, might need refinement
+                    value = value.strip('\'"')
+                    node_info[key] = value
+            return node_info
+
+        cmd = f'scontrol show node {node_name}'
+        rc, node_details, _ = self._runner.run(cmd,
+                                               require_outputs=True,
+                                               stream_logs=False)
+        subprocess_utils.handle_returncode(
+            rc,
+            cmd,
+            f'Failed to get detailed node information for {node_name}.',
+            stderr=node_details)
+        node_info = _parse_scontrol_node_output(node_details)
+        return node_info
+
+    def get_node_jobs(self, node_name: str) -> List[str]:
+        """Get the list of jobs for a given node name.
+
+        Returns:
+            A list of job names for the current user on the node.
+        """
+        cmd = f'squeue --me -h --nodelist {node_name} -o "%b"'
+        rc, stdout, stderr = self._runner.run(cmd,
+                                              require_outputs=True,
+                                              stream_logs=False)
+        subprocess_utils.handle_returncode(
+            rc, cmd, f'Failed to get jobs for node {node_name}.', stderr=stderr)
+        return stdout.splitlines()
+
+    def get_job_state(self, job_id: str) -> Optional[str]:
+        """Get the state of a Slurm job.
+
+        Args:
+            job_id: The Slurm job ID.
+
+        Returns:
+            The job state (e.g., 'PENDING', 'RUNNING', 'COMPLETED', etc.),
+            or None if the job is not found.
+        """
+        # Use --only-job-state since we only need the job state.
+        # This reduces the work required by slurmctld.
+        cmd = f'squeue -h --only-job-state --jobs {job_id} -o "%T"'
+        rc, stdout, stderr = self._runner.run(cmd,
+                                              require_outputs=True,
+                                              stream_logs=False)
+        if rc != 0:
+            # Job may not exist
+            logger.debug(f'Failed to get job state for job {job_id}: {stderr}')
+            return None
+
+        state = stdout.strip()
+        return state if state else None
+
+    @timeline.event
+    def get_job_reason(self, job_id: str) -> Optional[str]:
+        """Get the reason a job is in its current state
+
+        Args:
+            job_id: The Slurm job ID.
+        """
+        # Without --states all, squeue omits terminated jobs.
+        cmd = f'squeue -h --jobs {job_id} --states all -o "%r"'
+        rc, stdout, stderr = self._runner.run(cmd,
+                                              require_outputs=True,
+                                              stream_logs=False)
+        if rc != 0:
+            logger.debug(f'Failed to get job info for job {job_id}: {stderr}')
+            return None
+
+        output = stdout.strip()
+        if not output:
+            return None
+
+        return output if output != 'None' else None
+
+    @timeline.event
+    def wait_for_job_nodes(self, job_id: str, timeout: int) -> None:
+        """Wait for a Slurm job to have nodes allocated.
+
+        Args:
+            job_id: The Slurm job ID.
+            timeout: Maximum time to wait in seconds.
+        """
+        start_time = time.time()
+        last_state = None
+
+        while time.time() - start_time < timeout:
+            state = self.get_job_state(job_id)
+
+            if state != last_state:
+                logger.debug(f'Job {job_id} state: {state}')
+                last_state = state
+
+            if state is None:
+                raise RuntimeError(f'Job {job_id} not found. It may have been '
+                                   'cancelled or failed.')
+
+            if state in ('COMPLETED', 'CANCELLED', 'FAILED', 'TIMEOUT'):
+                raise RuntimeError(
+                    f'Job {job_id} terminated with state {state} '
+                    'before nodes were allocated.')
+            # TODO(kevin): Log reason for pending.
+
+            # Check if nodes are allocated by trying to get node list
+            cmd = f'squeue -h --jobs {job_id} -o "%N"'
+            rc, stdout, stderr = self._runner.run(cmd,
+                                                  require_outputs=True,
+                                                  stream_logs=False)
+
+            if rc == 0 and stdout.strip():
+                # Nodes are allocated
+                logger.debug(
+                    f'Job {job_id} has nodes allocated: {stdout.strip()}')
+                return
+            elif rc != 0:
+                logger.debug(f'Failed to get nodes for job {job_id}: {stderr}')
+
+            # Wait before checking again
+            time.sleep(2)
+
+        raise TimeoutError(f'Job {job_id} did not get nodes allocated within '
+                           f'{timeout} seconds. Last state: {last_state}')
+
+    @timeline.event
+    def get_job_nodes(
+            self,
+            job_id: str,
+            wait: bool = True,
+            timeout: Optional[int] = None) -> Tuple[List[str], List[str]]:
+        """Get the list of nodes and their IPs for a given job ID.
+
+        The ordering is guaranteed to be stable for the lifetime of the job.
+
+        Args:
+            job_id: The Slurm job ID.
+            wait: If True, wait for nodes to be allocated before returning.
+            timeout: Maximum time to wait in seconds. Only used when wait=True.
+
+        Returns:
+            A tuple of (nodes, node_ips) where nodes is a list of node names
+            and node_ips is a list of corresponding IP addresses.
+        """
+        # Wait for nodes to be allocated if requested
+        if wait:
+            if timeout is None:
+                timeout = _SLURM_DEFAULT_PROVISION_TIMEOUT
+            self.wait_for_job_nodes(job_id, timeout=timeout)
+
+        cmd = (
+            f'squeue -h --jobs {job_id} -o "%N" | tr \',\' \'\\n\' | '
+            f'while read node; do '
+            # TODO(kevin): Use json output for more robust parsing.
+            f'ip=$(scontrol show node=$node | grep NodeAddr= | '
+            f'awk -F= \'{{print $2}}\' | awk \'{{print $1}}\'); '
+            f'echo "$node $ip"; '
+            f'done')
+        rc, stdout, stderr = self._runner.run(cmd,
+                                              require_outputs=True,
+                                              stream_logs=False)
+        subprocess_utils.handle_returncode(
+            rc, cmd, f'Failed to get nodes for job {job_id}.', stderr=stderr)
+        logger.debug(f'Successfully got nodes for job {job_id}: {stdout}')
+
+        node_info = {}
+        for line in stdout.strip().splitlines():
+            line = line.strip()
+            if line:
+                parts = line.split()
+                if len(parts) >= 2:
+                    node_name = parts[0]
+                    node_ip = parts[1]
+                    node_info[node_name] = node_ip
+
+        nodes = list(node_info.keys())
+        node_ips = [node_info[node] for node in nodes]
+        if not nodes:
+            raise RuntimeError(
+                f'No nodes found for job {job_id}. '
+                f'The job may have terminated or the output was empty.')
+        assert (len(nodes) == len(node_ips)
+               ), f'Number of nodes and IPs do not match: {nodes} != {node_ips}'
+
+        return nodes, node_ips
+
+    def submit_job(
+        self,
+        partition: str,
+        job_name: str,
+        script_path: str,
+    ) -> str:
+        """Submit a Slurm job script.
+
+        Args:
+            partition: Slurm partition to submit to.
+            job_name: Name to give the job.
+            script_path: Remote path where the script will be stored.
+
+        Returns:
+            The job ID of the submitted job.
+        """
+        cmd = f'sbatch --partition={partition} {script_path}'
+        rc, stdout, stderr = self._runner.run(cmd,
+                                              require_outputs=True,
+                                              stream_logs=False)
+        subprocess_utils.handle_returncode(rc,
+                                           cmd,
+                                           'Failed to submit Slurm job.',
+                                           stderr=f'{stdout}\n{stderr}')
+
+        # Parse job ID from sbatch output (format: "Submitted batch job 12345")
+        job_id_match = re.search(r'Submitted batch job (\d+)', stdout)
+        if not job_id_match:
+            raise RuntimeError(
+                f'Failed to parse job ID from sbatch output: {stdout}')
+
+        job_id = job_id_match.group(1).strip()
+        logger.debug(f'Successfully submitted Slurm job {job_id} with name '
+                     f'{job_name}: {stdout}')
+
+        return job_id
+
+    def get_partitions_info(self) -> List[SlurmPartition]:
+        """Get the partitions information for the Slurm cluster.
+
+        Returns:
+            List of SlurmPartition objects.
+        """
+        cmd = 'scontrol show partitions -o'
+        rc, stdout, stderr = self._runner.run(cmd,
+                                              require_outputs=True,
+                                              stream_logs=False)
+        subprocess_utils.handle_returncode(rc,
+                                           cmd,
+                                           'Failed to get Slurm partitions.',
+                                           stderr=stderr)
+
+        partitions = []
+        for line in stdout.strip().splitlines():
+            is_default = False
+            match = _PARTITION_NAME_REGEX.search(line)
+            if 'Default=YES' in line:
+                is_default = True
+            if match:
+                partition = match.group(1).strip()
+                if partition:
+                    partitions.append(
+                        SlurmPartition(name=partition, is_default=is_default))
+        return partitions
+
+    def get_default_partition(self) -> Optional[str]:
+        """Get the default partition name for the Slurm cluster.
+
+        Returns:
+            The default partition name, or None if it cannot be determined.
+        """
+        partitions = self.get_partitions_info()
+        for partition in partitions:
+            if partition.is_default:
+                return partition.name
+        return None
+
+    def get_partitions(self) -> List[str]:
+        """Get unique partition names in the Slurm cluster.
+
+        Returns:
+            List of partition names. The default partition will not have a '*'
+            at the end of the name.
+        """
+        return [partition.name for partition in self.get_partitions_info()]
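
For readers skimming the new adaptor, a minimal usage sketch of `SlurmClient` that sticks to the constructor and methods added in this file; the controller host, user, key path, job name, and script path are placeholders:

    from sky.adaptors import slurm

    # Placeholders: point these at a Slurm controller reachable over SSH.
    client = slurm.SlurmClient(ssh_host='slurm-login.example.com',
                               ssh_port=22,
                               ssh_user='ubuntu',
                               ssh_key='~/.ssh/id_rsa')

    # Inspect nodes (name, state, GRES, CPUs, memory, partition) and pick a
    # partition, preferring the cluster default.
    for node in client.info_nodes():
        print(node.node, node.state, node.gres, node.cpus, node.memory_gb)
    partition = client.get_default_partition() or client.get_partitions()[0]

    # Submit a previously uploaded sbatch script, then wait for and fetch the
    # allocated node names and IPs.
    job_id = client.submit_job(partition=partition,
                               job_name='sky-example',
                               script_path='~/sky_job.sbatch')
    nodes, node_ips = client.get_job_nodes(job_id, wait=True, timeout=60)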
sky/backends/backend_utils.py
CHANGED
@@ -147,6 +147,19 @@ CLUSTER_TUNNEL_LOCK_TIMEOUT_SECONDS = 10.0
 # Remote dir that holds our runtime files.
 _REMOTE_RUNTIME_FILES_DIR = '~/.sky/.runtime_files'
 
+# The maximum size of a command line arguments is 128 KB, i.e. the command
+# executed with /bin/sh should be less than 128KB.
+# https://github.com/torvalds/linux/blob/master/include/uapi/linux/binfmts.h
+#
+# If a user have very long run or setup commands, the generated command may
+# exceed the limit, as we directly include scripts in job submission commands.
+# If the command is too long, we instead write it to a file, rsync and execute
+# it.
+#
+# We use 100KB as a threshold to be safe for other arguments that
+# might be added during ssh.
+_MAX_INLINE_SCRIPT_LENGTH = 100 * 1024
+
 _ENDPOINTS_RETRY_MESSAGE = ('If the cluster was recently started, '
                             'please retry after a while.')
 
@@ -225,6 +238,18 @@ _ACK_MESSAGE = 'ack'
 _FORWARDING_FROM_MESSAGE = 'Forwarding from'
 
 
+def is_command_length_over_limit(command: str) -> bool:
+    """Check if the length of the command exceeds the limit.
+
+    We calculate the length of the command after quoting the command twice as
+    when it is executed by the CommandRunner, the command will be quoted twice
+    to ensure the correctness, which will add significant length to the command.
+    """
+
+    quoted_length = len(shlex.quote(shlex.quote(command)))
+    return quoted_length > _MAX_INLINE_SCRIPT_LENGTH
+
+
 def is_ip(s: str) -> bool:
     """Returns whether this string matches IP_ADDR_REGEX."""
     return len(re.findall(IP_ADDR_REGEX, s)) == 1
@@ -946,6 +971,9 @@ def write_cluster_config(
                 '{conda_auto_activate}',
                 conda_auto_activate).replace('{is_custom_docker}',
                                              is_custom_docker),
+            # Currently only used by Slurm. For other clouds, it is
+            # already part of ray_skypilot_installation_commands
+            'setup_sky_dirs_commands': constants.SETUP_SKY_DIRS_COMMANDS,
             'ray_skypilot_installation_commands':
                 (constants.RAY_SKYPILOT_INSTALLATION_COMMANDS.replace(
                     '{sky_wheel_hash}',
@@ -1058,7 +1086,11 @@ def write_cluster_config(
     with open(tmp_yaml_path, 'w', encoding='utf-8') as f:
         f.write(restored_yaml_content)
 
-
+    # Read the cluster_name_on_cloud from the restored yaml. This is a hack to
+    # make sure that launching on the same cluster across multiple users works
+    # correctly. See #8232.
+    yaml_config = yaml_utils.read_yaml(tmp_yaml_path)
+    config_dict['cluster_name_on_cloud'] = yaml_config['cluster_name']
 
     # Make sure to do this before we optimize file mounts. Optimization is
     # non-deterministic, but everything else before this point should be
@@ -1105,17 +1137,21 @@ def _add_auth_to_cluster_config(cloud: clouds.Cloud, tmp_yaml_path: str):
     """
     config = yaml_utils.read_yaml(tmp_yaml_path)
     # Check the availability of the cloud type.
-    if isinstance(
+    if isinstance(
+            cloud,
+        (
             clouds.AWS,
             clouds.OCI,
             clouds.SCP,
+            # TODO(jwj): Handle Slurm-specific auth logic
+            clouds.Slurm,
             clouds.Vsphere,
            clouds.Cudo,
            clouds.Paperspace,
            clouds.Azure,
            clouds.DO,
            clouds.Nebius,
-
+        )):
         config = auth.configure_ssh_info(config)
     elif isinstance(cloud, clouds.GCP):
         config = auth.setup_gcp_authentication(config)
@@ -2361,7 +2397,12 @@ def _update_cluster_status(
     # remain healthy for a while before the cloud completely preempts the VMs.
     # We have mitigated this by again first querying the VM state from the cloud
     # provider.
-
+    cloud = handle.launched_resources.cloud
+
+    # For Slurm, skip Ray health check since it doesn't use Ray.
+    should_check_ray = cloud is not None and cloud.uses_ray()
+    if all_nodes_up and (not should_check_ray or
+                         run_ray_status_to_check_ray_cluster_healthy()):
         # NOTE: all_nodes_up calculation is fast due to calling cloud CLI;
         # run_ray_status_to_check_all_nodes_up() is slow due to calling `ray get
         # head-ip/worker-ips`.