xmanager-slurm 0.4.19__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xm_slurm/__init__.py +47 -0
- xm_slurm/api/__init__.py +33 -0
- xm_slurm/api/abc.py +65 -0
- xm_slurm/api/models.py +70 -0
- xm_slurm/api/sqlite/client.py +358 -0
- xm_slurm/api/web/client.py +173 -0
- xm_slurm/batching.py +139 -0
- xm_slurm/config.py +189 -0
- xm_slurm/console.py +3 -0
- xm_slurm/constants.py +19 -0
- xm_slurm/contrib/__init__.py +0 -0
- xm_slurm/contrib/clusters/__init__.py +67 -0
- xm_slurm/contrib/clusters/drac.py +242 -0
- xm_slurm/dependencies.py +171 -0
- xm_slurm/executables.py +215 -0
- xm_slurm/execution.py +995 -0
- xm_slurm/executors.py +210 -0
- xm_slurm/experiment.py +1016 -0
- xm_slurm/experimental/parameter_controller.py +206 -0
- xm_slurm/filesystems.py +129 -0
- xm_slurm/job_blocks.py +21 -0
- xm_slurm/metadata_context.py +253 -0
- xm_slurm/packageables.py +309 -0
- xm_slurm/packaging/__init__.py +8 -0
- xm_slurm/packaging/docker.py +348 -0
- xm_slurm/packaging/registry.py +45 -0
- xm_slurm/packaging/router.py +56 -0
- xm_slurm/packaging/utils.py +22 -0
- xm_slurm/resources.py +350 -0
- xm_slurm/scripts/_cloudpickle.py +28 -0
- xm_slurm/scripts/cli.py +90 -0
- xm_slurm/status.py +197 -0
- xm_slurm/templates/docker/docker-bake.hcl.j2 +54 -0
- xm_slurm/templates/docker/mamba.Dockerfile +29 -0
- xm_slurm/templates/docker/python.Dockerfile +32 -0
- xm_slurm/templates/docker/uv.Dockerfile +38 -0
- xm_slurm/templates/slurm/entrypoint.bash.j2 +27 -0
- xm_slurm/templates/slurm/fragments/monitor.bash.j2 +78 -0
- xm_slurm/templates/slurm/fragments/proxy.bash.j2 +31 -0
- xm_slurm/templates/slurm/job-array.bash.j2 +31 -0
- xm_slurm/templates/slurm/job-group.bash.j2 +47 -0
- xm_slurm/templates/slurm/job.bash.j2 +90 -0
- xm_slurm/templates/slurm/library/retry.bash +62 -0
- xm_slurm/templates/slurm/runtimes/apptainer.bash.j2 +73 -0
- xm_slurm/templates/slurm/runtimes/podman.bash.j2 +43 -0
- xm_slurm/types.py +23 -0
- xm_slurm/utils.py +196 -0
- xmanager_slurm-0.4.19.dist-info/METADATA +28 -0
- xmanager_slurm-0.4.19.dist-info/RECORD +52 -0
- xmanager_slurm-0.4.19.dist-info/WHEEL +4 -0
- xmanager_slurm-0.4.19.dist-info/entry_points.txt +2 -0
- xmanager_slurm-0.4.19.dist-info/licenses/LICENSE.md +227 -0
xm_slurm/executors.py
ADDED
import collections.abc
import dataclasses
import datetime as dt
import signal
import typing as tp

from xmanager import xm

from xm_slurm import resources, utils

ResourceBindType = tp.Literal[
    resources.ResourceType.GPU,
    resources.ResourceType.MEMORY,
    resources.ResourceType.RAM,
]


@dataclasses.dataclass(frozen=True, kw_only=True)
class SlurmSpec(xm.ExecutorSpec):
    """Slurm executor specification that describes the location of the container image.

    Args:
        tag: The image URI to push and pull the container image from.
            For example, using the GitHub Container Registry: `ghcr.io/my-project/my-image:latest`.
    """

    tag: str | None = None


@dataclasses.dataclass(frozen=True, kw_only=True)
class Slurm(xm.Executor):
    """Slurm executor describing the runtime environment.

    Args:
        requirements: The resource requirements for the job.
        time: The maximum time to run the job.
        switches: Maximum count of leaf switches desired for the job allocation.
        switches_grace_period: Maximum time to wait for that number of switches.
        bind: How to bind tasks to resources (memory, GPU, or a generic resource).
        bind_flag: Generic resource task binding options.
        account: The account to charge the job to.
        partition: The partition to run the job in.
        qos: The quality of service to run the job with.
        priority: The priority of the job.
        reservation: Allocate resources for the job from the named reservation.
        exclusive: Do not share nodes with other running jobs.
        oversubscribe: Allow over-subscribing resources with other running jobs.
        overcommit: Allow sharing of allocated resources as if only one task per node was requested.
        nice: Run the job with an adjusted scheduling priority.
        kill_on_invalid_dependencies: Whether to kill the job if it has invalid dependencies.
        timeout_signal: The signal to send to the job when it runs out of time.
        timeout_signal_grace_period: How long before the time limit to send `timeout_signal`.
        requeue: Whether or not the job is eligible for requeueing.
        requeue_on_exit_code: The exit code that triggers requeueing.
        requeue_on_timeout: Whether to requeue the job when it reaches its time limit minus the grace period.
        requeue_max_attempts: The maximum number of times to attempt requeueing.
    """

    # Job requirements
    requirements: resources.JobRequirements
    time: dt.timedelta
    bind: tp.Mapping[ResourceBindType | str, str | None] | None = None
    bind_flag: str | None = None

    # Placement
    account: str | None = None
    partition: str | None = None
    qos: str | None = None
    priority: int | None = None
    reservation: str | tp.Iterable[str] | None = None
    exclusive: bool = False
    oversubscribe: bool = False
    overcommit: bool = False
    nice: int | None = None

    # Job dependency handling
    kill_on_invalid_dependencies: bool = True

    # Job rescheduling
    timeout_signal: signal.Signals = signal.SIGUSR2
    timeout_signal_grace_period: dt.timedelta = dt.timedelta(seconds=90)

    requeue: bool = True  # Is this job eligible for requeueing?
    requeue_on_exit_code: int = 42  # The exit code that triggers requeueing
    requeue_on_timeout: bool = True  # Should the job requeue upon timeout minus the grace period
    requeue_max_attempts: int = 5  # How many times to attempt requeueing

    @property
    def requeue_timeout(self) -> dt.timedelta:
        return self.time - self.timeout_signal_grace_period

    def __post_init__(self) -> None:
        if not isinstance(self.requirements, resources.JobRequirements):
            raise TypeError(
                f"requirements must be a `xm_slurm.JobRequirements`, got {type(self.requirements)}. "
                "If you're still using `xm.JobRequirements`, please update to `xm_slurm.JobRequirements`."
            )
        if not isinstance(self.time, dt.timedelta):
            raise TypeError(f"time must be a `datetime.timedelta`, got {type(self.time)}")
        if self.bind is not None:
            if not isinstance(self.bind, collections.abc.Mapping):
                raise TypeError(f"bind must be a mapping, got {type(self.bind)}")
            for resource, value in self.bind.items():
                if resource not in (
                    resources.ResourceType.GPU,
                    resources.ResourceType.MEMORY,
                    resources.ResourceType.RAM,
                ) and not isinstance(resource, str):
                    raise TypeError(
                        f"bind resource must be a {resources.ResourceType.GPU.name}, {resources.ResourceType.MEMORY.name}, or {resources.ResourceType.RAM.name}, got {type(resource)}"
                    )
                if value is not None and not isinstance(value, str):
                    raise TypeError(f"bind value must be None or a string, got {type(value)}")
        if self.bind_flag is not None and not isinstance(self.bind_flag, str):
            raise TypeError(f"bind_flag must be a string, got {type(self.bind_flag)}")

        if not isinstance(self.timeout_signal, signal.Signals):
            raise TypeError(
                f"timeout_signal must be a `signal.Signals`, got {type(self.timeout_signal)}"
            )
        if not isinstance(self.timeout_signal_grace_period, dt.timedelta):
            raise TypeError(
                f"timeout_signal_grace_period must be a `datetime.timedelta`, got {type(self.timeout_signal_grace_period)}"
            )
        if self.requeue_max_attempts < 0:
            raise ValueError(
                f"requeue_max_attempts must be greater than or equal to 0, got {self.requeue_max_attempts}"
            )
        if self.requeue_on_exit_code == 0:
            raise ValueError("requeue_on_exit_code should not be 0 to avoid unexpected behavior.")
        if self.exclusive and self.oversubscribe:
            raise ValueError("exclusive and oversubscribe are mutually exclusive.")
        if self.nice is not None and not (-2147483645 <= self.nice <= 2147483645):
            raise ValueError(f"nice must be between -2147483645 and 2147483645, got {self.nice}")

    @classmethod
    def Spec(cls, tag: str | None = None) -> SlurmSpec:
        return SlurmSpec(tag=tag)

    def batch_directives(self) -> list[str]:
        # Job requirements
        directives = self.requirements.batch_directives()

        # Time
        directives.append(f"--time={utils.timestr_from_timedelta(self.time)}")

        # Job dependency handling
        directives.append(
            f"--kill-on-invalid-dep={'yes' if self.kill_on_invalid_dependencies else 'no'}"
        )

        # Placement
        if self.account is not None:
            directives.append(f"--account={self.account}")
        if self.partition is not None:
            directives.append(f"--partition={self.partition}")
        if self.qos is not None:
            directives.append(f"--qos={self.qos}")
        if self.priority is not None:
            directives.append(f"--priority={self.priority}")
        if self.reservation is not None:
            match self.reservation:
                case str():
                    directives.append(f"--reservation={self.reservation}")
                case collections.abc.Iterable():
                    directives.append(f"--reservation={','.join(self.reservation)}")
                case _:
                    raise ValueError(f"Invalid reservation type: {type(self.reservation)}")
        if self.exclusive:
            directives.append("--exclusive")
        if self.oversubscribe:
            directives.append("--oversubscribe")
        if self.overcommit:
            directives.append("--overcommit")
        if self.nice is not None:
            directives.append(f"--nice={self.nice}")

        # Job rescheduling
        directives.append(
            f"--signal={self.timeout_signal.name.removeprefix('SIG')}@{self.timeout_signal_grace_period.seconds}"
        )
        if self.requeue and self.requeue_max_attempts > 0:
            directives.append("--requeue")
        else:
            directives.append("--no-requeue")

        return directives

    def step_directives(self) -> list[str]:
        directives = self.requirements.step_directives()

        # Resource binding
        if self.bind is not None:
            for resource, value in self.bind.items():
                if value is None:
                    value = "none"
                match resource:
                    case resources.ResourceType.MEMORY | resources.ResourceType.RAM:
                        directives.append(f"--mem-bind={value}")
                    case resources.ResourceType.GPU:
                        directives.append(f"--gpu-bind={value}")
                    case str():
                        directives.append(f"--tres-bind=gres/{resource}:{value}")
                    case _:
                        raise ValueError(f"Unsupported resource type {resource!r} for binding.")

        if self.bind_flag is not None:
            directives.append(f"--gres-flags={self.bind_flag}")

        return directives