xmanager-slurm 0.4.15__py3-none-any.whl → 0.4.17__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of xmanager-slurm might be problematic.
- xm_slurm/contrib/clusters/__init__.py +1 -0
- xm_slurm/contrib/clusters/drac.py +1 -0
- xm_slurm/executors.py +29 -17
- xm_slurm/resources.py +4 -1
- xm_slurm/templates/slurm/job-array.bash.j2 +4 -1
- xm_slurm/templates/slurm/job-group.bash.j2 +7 -2
- xm_slurm/templates/slurm/job.bash.j2 +5 -2
- xm_slurm/templates/slurm/runtimes/apptainer.bash.j2 +12 -8
- xm_slurm/templates/slurm/runtimes/podman.bash.j2 +1 -0
- {xmanager_slurm-0.4.15.dist-info → xmanager_slurm-0.4.17.dist-info}/METADATA +1 -1
- {xmanager_slurm-0.4.15.dist-info → xmanager_slurm-0.4.17.dist-info}/RECORD +14 -14
- {xmanager_slurm-0.4.15.dist-info → xmanager_slurm-0.4.17.dist-info}/WHEEL +0 -0
- {xmanager_slurm-0.4.15.dist-info → xmanager_slurm-0.4.17.dist-info}/entry_points.txt +0 -0
- {xmanager_slurm-0.4.15.dist-info → xmanager_slurm-0.4.17.dist-info}/licenses/LICENSE.md +0 -0
xm_slurm/contrib/clusters/__init__.py
CHANGED
@@ -26,6 +26,7 @@ def mila(
         "/home/mila/${USER:0:1}/$USER/.local/state/xm-slurm": "/xm-slurm-state",
         "/home/mila/${USER:0:1}/$USER/.ssh": "/home/mila/${USER:0:1}/$USER/.ssh",
     }
+    mounts = dict(mounts) | {"/dev/infiniband": "/dev/infiniband"}

     return config.SlurmClusterConfig(
         name="mila",
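The functional change here is the extra /dev/infiniband mount for the mila cluster. Note that dict(mounts) | {...} builds a fresh mapping (copy, then union) rather than mutating the existing mounts mapping in place. A minimal standalone sketch of that pattern, with illustrative values rather than the real defaults:

    # Copy-then-union: the original mapping is left untouched.
    defaults = {"/scratch": "/scratch"}
    mounts = dict(defaults) | {"/dev/infiniband": "/dev/infiniband"}
    assert "/dev/infiniband" not in defaults
    print(mounts)  # {'/scratch': '/scratch', '/dev/infiniband': '/dev/infiniband'}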
xm_slurm/executors.py
CHANGED
@@ -37,6 +37,7 @@ class Slurm(xm.Executor):
         switches: Maximum count of leaf switches desired for the job allocation.
         switches_grace_period: Maximum time to wait for that number of switches.
         bind: How to bind tasks to resource (memory, GPU, or generic resource).
+        bind_flag: Generic resource task binding options.
         account: The account to charge the job to.
         partition: The partition to run the job in.
         qos: The quality of service to run the job with.
@@ -59,6 +60,7 @@ class Slurm(xm.Executor):
     requirements: resources.JobRequirements
     time: dt.timedelta
     bind: tp.Mapping[ResourceBindType | str, str | None] | None = None
+    bind_flag: str | None = None

     # Placement
     account: str | None = None
@@ -109,6 +111,8 @@ class Slurm(xm.Executor):
                 )
             if value is not None and not isinstance(value, str):
                 raise TypeError(f"bind value must be None or a string, got {type(value)}")
+        if self.bind_flag is not None and not isinstance(self.bind_flag, str):
+            raise TypeError(f"bind_flag must be a string, got {type(self.bind_flag)}")

         if not isinstance(self.timeout_signal, signal.Signals):
             raise TypeError(
@@ -133,28 +137,13 @@ class Slurm(xm.Executor):
     def Spec(cls, tag: str | None = None) -> SlurmSpec:
         return SlurmSpec(tag=tag)

-    def
+    def batch_directives(self) -> list[str]:
         # Job requirements
-        directives = self.requirements.
+        directives = self.requirements.batch_directives()

         # Time
         directives.append(f"--time={utils.timestr_from_timedelta(self.time)}")

-        # Resource binding
-        if self.bind is not None:
-            for resource, value in self.bind.items():
-                if value is None:
-                    value = "none"
-                match resource:
-                    case resources.ResourceType.MEMORY | resources.ResourceType.RAM:
-                        directives.append(f"--mem-bind={value}")
-                    case resources.ResourceType.GPU:
-                        directives.append(f"--gpu-bind={value}")
-                    case str():
-                        directives.append(f"--tres-bind=gres/{resource}:{value}")
-                    case _:
-                        raise ValueError(f"Unsupported resource type {resource!r} for binding.")
-
         # Job dependency handling
         directives.append(
             f"--kill-on-invalid-dep={'yes' if self.kill_on_invalid_dependencies else 'no'}"
@@ -196,3 +185,26 @@ class Slurm(xm.Executor):
             directives.append("--no-requeue")

         return directives
+
+    def step_directives(self) -> list[str]:
+        directives = self.requirements.step_directives()
+
+        # Resource binding
+        if self.bind is not None:
+            for resource, value in self.bind.items():
+                if value is None:
+                    value = "none"
+                match resource:
+                    case resources.ResourceType.MEMORY | resources.ResourceType.RAM:
+                        directives.append(f"--mem-bind={value}")
+                    case resources.ResourceType.GPU:
+                        directives.append(f"--gpu-bind={value}")
+                    case str():
+                        directives.append(f"--tres-bind=gres/{resource}:{value}")
+                    case _:
+                        raise ValueError(f"Unsupported resource type {resource!r} for binding.")
+
+        if self.bind_flag is not None:
+            directives.append(f"--gres-flags={self.bind_flag}")
+
+        return directives
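Taken together, these executors.py hunks move resource binding out of the #SBATCH directives and into per-step flags: batch_directives() keeps only allocation-level options, while the new step_directives() emits --mem-bind/--gpu-bind/--tres-bind (plus --gres-flags when the new bind_flag field is set) so they can be attached to each srun invocation. A standalone sketch of that flag mapping, using a hypothetical stand-in enum rather than xm_slurm's real resources.ResourceType:

    import enum

    class ResourceType(enum.Enum):  # stand-in for resources.ResourceType
        MEMORY = enum.auto()
        GPU = enum.auto()

    def step_directives(bind, bind_flag):
        # Mirrors the bind -> srun flag mapping added in Slurm.step_directives().
        directives = []
        if bind is not None:
            for resource, value in bind.items():
                value = "none" if value is None else value
                match resource:
                    case ResourceType.MEMORY:
                        directives.append(f"--mem-bind={value}")
                    case ResourceType.GPU:
                        directives.append(f"--gpu-bind={value}")
                    case str():
                        directives.append(f"--tres-bind=gres/{resource}:{value}")
                    case _:
                        raise ValueError(f"Unsupported resource type {resource!r}")
        if bind_flag is not None:
            directives.append(f"--gres-flags={bind_flag}")
        return directives

    print(step_directives({ResourceType.GPU: "closest"}, "enforce-binding"))
    # ['--gpu-bind=closest', '--gres-flags=enforce-binding']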
xm_slurm/resources.py
CHANGED
@@ -232,7 +232,7 @@ class JobRequirements:
             raise ValueError(f"Replicas must be a positive integer, got {replicas!r}")
         self.replicas = replicas or 1

-    def
+    def batch_directives(self) -> list[str]:
         directives = []

         for resource, value in self.task_requirements.items():
@@ -302,6 +302,9 @@ class JobRequirements:

         return directives

+    def step_directives(self) -> list[str]:
+        return []
+
     def replace(
         self,
         replicas: int | None = None,
xm_slurm/templates/slurm/job-array.bash.j2
CHANGED
@@ -9,8 +9,11 @@
 srun \
     --label \
     --unbuffered \
-    --kill-on-bad-exit=
+    --kill-on-bad-exit=1 \
     --export="ALL" \
+    {% for directive in job.executor.step_directives() %}
+    {{ directive }} \
+    {% endfor %}
     bash <<'SRUN_EOF' &
 set -Eeuxo pipefail

xm_slurm/templates/slurm/job-group.bash.j2
CHANGED
@@ -21,7 +21,9 @@
 {% else %}
 #SBATCH --job-name=xm[{{ job_name }}@{{ experiment_id }}]
 {% endif %}
-{
+{% for directive in job.executor.batch_directives() %}
+#SBATCH {{ directive }}
+{% endfor %}
 {{ "\n#SBATCH hetjob\n" if not loop.last }}
 {% endfor %}
 {% endblock directives %}
@@ -31,8 +33,11 @@
 srun \
     --label \
     --unbuffered \
-    --kill-on-bad-exit=
+    --kill-on-bad-exit=1 \
     --export="ALL" \
+    {% for directive in job.executor.step_directives() %}
+    {{ directive }} \
+    {% endfor %}
     --het-group={{ loop.index0 }} \
     bash <<'SRUN_EOF' &
 set -Eeuxo pipefail
xm_slurm/templates/slurm/job.bash.j2
CHANGED
@@ -21,7 +21,7 @@
 {% endif %}
 #SBATCH --job-name=xm[{{ experiment_id }}]
 {% endif %}
-{% for directive in job.executor.
+{% for directive in job.executor.batch_directives() %}
 #SBATCH {{ directive }}
 {% endfor %}
 {% endblock directives %}
@@ -61,8 +61,11 @@ export {{ key }}="{{ value }}"
 srun \
     --label \
     --unbuffered \
-    --kill-on-bad-exit=
+    --kill-on-bad-exit=1 \
     --export="ALL" \
+    {% for directive in job.executor.step_directives() %}
+    {{ directive }} \
+    {% endfor %}
     bash <<'SRUN_EOF' &
 set -Eeuxo pipefail
 {{ run(cluster, job) }}
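Across the three job templates the wiring is identical: executor.batch_directives() populates the #SBATCH header, while executor.step_directives() is appended to each srun command line. A minimal jinja2 sketch of that rendering pattern, with made-up directive values and a hypothetical entrypoint name (not the actual xm-slurm templates):

    import jinja2

    template = jinja2.Template(
        "{% for d in batch_directives %}#SBATCH {{ d }}\n{% endfor %}"
        "srun \\\n"
        "{% for d in step_directives %}  {{ d }} \\\n{% endfor %}"
        "  my-entrypoint.sh\n"
    )
    print(template.render(
        batch_directives=["--time=01:00:00", "--partition=main"],
        step_directives=["--gpu-bind=closest", "--gres-flags=enforce-binding"],
    ))
    # #SBATCH --time=01:00:00
    # #SBATCH --partition=main
    # srun \
    #   --gpu-bind=closest \
    #   --gres-flags=enforce-binding \
    #   my-entrypoint.sh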
xm_slurm/templates/slurm/runtimes/apptainer.bash.j2
CHANGED
@@ -3,7 +3,9 @@

 # Bundle will be where our built sandbox image is stored
 # container-workdir will be our container's scratch directory
-
+# TODO(jfarebro): We can make this more efficient by doing an srun per node and downloading the container once per node.
+# but this requires apptainer support to have an overlay per procid
+mkdir -p "$SLURM_TMPDIR"/{container-"$SLURM_PROCID",container-workdir-"$SLURM_PROCID",container-overlay-"$SLURM_PROCID"}

 retry -c 255 -n 10 -d 1 -b 2 -- \
 {% if job.executable.credentials %}
@@ -14,19 +16,21 @@ retry -c 255 -n 10 -d 1 -b 2 -- \
     --force \
     --sandbox \
     --fix-perms \
-    "$SLURM_TMPDIR"/container \
+    "$SLURM_TMPDIR"/container-"$SLURM_PROCID" \
     docker://{{ job.executable.image }}

 {% if runtime == "singularity" and cluster.mounts %}
 {% for source, dest in cluster.mounts.items() %}
-mkdir -p "$SLURM_TMPDIR"/container/{{ dest | trim('/') }}
+mkdir -p "$SLURM_TMPDIR"/container-"$SLURM_PROCID"/{{ dest | trim('/') }}
 {% endfor %}
 {% endif %}

-cat << 'ENTRYPOINT_EOF' > "$SLURM_TMPDIR"/container/xm-slurm-entrypoint.sh
+cat << 'ENTRYPOINT_EOF' > "$SLURM_TMPDIR"/container-"$SLURM_PROCID"/xm-slurm-entrypoint.sh
 {{ entrypoint(cluster, job) }}
 ENTRYPOINT_EOF
-chmod +x "$SLURM_TMPDIR"/container/xm-slurm-entrypoint.sh
+chmod +x "$SLURM_TMPDIR"/container-"$SLURM_PROCID"/xm-slurm-entrypoint.sh
+
+for var in "${!SLURM_@}"; do export "{{ runtime | upper }}ENV_${var}=${!var}"; done

 exec {{ runtime }} exec \
 {% if job.executor.requirements.accelerator %}
@@ -45,16 +49,16 @@ exec {{ runtime }} exec \
     --bind {{ source }}:{{ dest }} \
 {% endfor %}
 {% endif %}
-    --workdir "$SLURM_TMPDIR"/container-workdir \
+    --workdir "$SLURM_TMPDIR"/container-workdir-"$SLURM_PROCID" \
 {% if (cluster.runtime | string) == "apptainer" %}
-    --overlay "$SLURM_TMPDIR"/container-overlay \
+    --overlay "$SLURM_TMPDIR"/container-overlay-"$SLURM_PROCID" \
 {% else %}
     --writable \
 {% endif %}
 {% if job.executable.workdir %}
     --pwd {{ job.executable.workdir }} \
 {% endif %}
-    "$SLURM_TMPDIR"/container \
+    "$SLURM_TMPDIR"/container-"$SLURM_PROCID" \
     /xm-slurm-entrypoint.sh \
 {% for arg in job.executable.args.to_list() %}
     {{ arg }} \
{xmanager_slurm-0.4.15.dist-info → xmanager_slurm-0.4.17.dist-info}/RECORD
CHANGED
@@ -6,13 +6,13 @@ xm_slurm/constants.py,sha256=zefVtlFdflgSolie5g_rVxWV-Zpydxapchm3y0a2FDc,999
 xm_slurm/dependencies.py,sha256=G-8vfmvSptZH6c_Ow51SwT84Dr6LI1clRj8F8wOUkiw,6421
 xm_slurm/executables.py,sha256=fGmrFBl-258bMn6ip5adYeM7xxUHAeIbDN9zD2FDGtY,6373
 xm_slurm/execution.py,sha256=mTy5u2oP2StIbGzjaSiGCUAwXuBFOiaJ5ephWoc25hI,31799
-xm_slurm/executors.py,sha256=
+xm_slurm/executors.py,sha256=27oiMwF84axeTcrcwL0f5seeLL_1j79OjiM_JZjioFs,9112
 xm_slurm/experiment.py,sha256=94r0mhtUPUzw4eaUEz0kpsufC25wEGqlDhV4Fcr1ukY,39883
 xm_slurm/filesystem.py,sha256=4rKtq3t-KDgxJbSGt6JVyRJT_3lCN_vIKTcwKHpTo3I,4389
 xm_slurm/job_blocks.py,sha256=BFOOYgeodoGIQsB5PdC7SsOUou5aZx-1qbQ7lcqqylI,604
 xm_slurm/metadata_context.py,sha256=mksVRbVUuistL1uE7TC-fkW-Y69On52jN_svP1e1kiQ,7841
 xm_slurm/packageables.py,sha256=aEZUQpddfq4FK6h4f6kgGEI4XcOufhm68MjoDFOYR4U,12261
-xm_slurm/resources.py,sha256=
+xm_slurm/resources.py,sha256=sTfwPc0QHRgfckOFq300FZ4fvtPfE4hq8B27DIvf6m4,12388
 xm_slurm/status.py,sha256=JIBCJPOYsmeJOQbzdACXA2vTWK7g8YWWhzpGP79e7JE,6911
 xm_slurm/types.py,sha256=TsVykDm-LazVkrjeJrTwCMs4Q8APKhy7BTk0yKIhFNg,805
 xm_slurm/utils.py,sha256=9w98HlXF0U9cKKtoB8QtGm0CnB0MnnzBARKlbbVNNpU,6211
@@ -22,8 +22,8 @@ xm_slurm/api/models.py,sha256=_INVh0j-4-rRs0WASyg4fNB6NF1L1nUeGgQ6-XnbwsM,1610
 xm_slurm/api/sqlite/client.py,sha256=jAesCKDuYwnNcAxwJk_1b1TB8cT_QGbSjo1UE3mZjEQ,14037
 xm_slurm/api/web/client.py,sha256=uO67Y7fnQ-w__Vm_A5BEuy7Qi8wQcWk3vIsBGEBkyfk,6261
 xm_slurm/contrib/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-xm_slurm/contrib/clusters/__init__.py,sha256=
-xm_slurm/contrib/clusters/drac.py,sha256
+xm_slurm/contrib/clusters/__init__.py,sha256=oYkuo4jAA6_VLIJs4k1Rjcp6l3qN0qmJExfkPWfmho4,2281
+xm_slurm/contrib/clusters/drac.py,sha256=-Nawzyx3p471f8Fti_tbweHn8h4ItRN4ec-ktHnUE4Q,7087
 xm_slurm/experimental/parameter_controller.py,sha256=IrFzq104LkZrhzuirit5GUZDXDvv2bBSYNMh3orsiPY,8518
 xm_slurm/packaging/__init__.py,sha256=dh307yLpUT9KN7rJ1e9fYC6hegGKfZcGboUq9nGpDVQ,233
 xm_slurm/packaging/docker.py,sha256=-DWcB9qqbeHmIEqyfF0-v6xOT25ae90u2x-QZ7kluOw,13579
@@ -37,16 +37,16 @@ xm_slurm/templates/docker/mamba.Dockerfile,sha256=Sgxr5IA5T-pT1Shumb5k3JngoG4pgC
 xm_slurm/templates/docker/python.Dockerfile,sha256=U4b4QVkopckQ0o9jJIE7d_M6TvExEYlYDirNwCoZ7W4,865
 xm_slurm/templates/docker/uv.Dockerfile,sha256=L2UJMX2c8waMdrRhiqPytQe3pTBu6u5PpMhJYsKkbEg,1040
 xm_slurm/templates/slurm/entrypoint.bash.j2,sha256=MRdSVwgGrgQdpEhqfkP35IidgsblrtVXB1YWzvE9hkk,666
-xm_slurm/templates/slurm/job-array.bash.j2,sha256=
-xm_slurm/templates/slurm/job-group.bash.j2,sha256=
-xm_slurm/templates/slurm/job.bash.j2,sha256=
+xm_slurm/templates/slurm/job-array.bash.j2,sha256=7cc0nZvEcHhZoo7jXI3fJWgMcc6z5H5FmopPRaklylI,637
+xm_slurm/templates/slurm/job-group.bash.j2,sha256=9H3zfJy8RZGFf00ZQJGmMEPyWQ9YMZfvGoD4Q8hMx9Y,1244
+xm_slurm/templates/slurm/job.bash.j2,sha256=GBKY3DPCODPTtEBfuvfaZAua_ZEd5cqPrShtPGE_IpY,2174
 xm_slurm/templates/slurm/fragments/monitor.bash.j2,sha256=ri5FgoKs6_bQVf5DO8SL4rJf4UsLxV34aOV-OD8VWDU,2526
 xm_slurm/templates/slurm/fragments/proxy.bash.j2,sha256=VJLglZo-Nvx9R-qe3rHTxr07CylTQ6Z9NwBzvIpAZrA,814
 xm_slurm/templates/slurm/library/retry.bash,sha256=bLe59qvfWEk17rE1wZ4EHiHba3RvR2WWZPq-kSe8RAA,2164
-xm_slurm/templates/slurm/runtimes/apptainer.bash.j2,sha256=
-xm_slurm/templates/slurm/runtimes/podman.bash.j2,sha256=
-xmanager_slurm-0.4.
-xmanager_slurm-0.4.
-xmanager_slurm-0.4.
-xmanager_slurm-0.4.
-xmanager_slurm-0.4.
+xm_slurm/templates/slurm/runtimes/apptainer.bash.j2,sha256=XxAQWLxZogL7zjn7tuzKn-DkYUJMx_HjaRzpVkz97lM,2414
+xm_slurm/templates/slurm/runtimes/podman.bash.j2,sha256=8N1ZtwHyXxP-Cjo4HBPsJiZXcTvf7q2GzvW9ao8_aok,1208
+xmanager_slurm-0.4.17.dist-info/METADATA,sha256=YUBZ6woSk-9-0GzFuFlTPy6hhaxFI-WPGVt7APaviT0,1007
+xmanager_slurm-0.4.17.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+xmanager_slurm-0.4.17.dist-info/entry_points.txt,sha256=_HLGmLgxuQLOPmF2gOFYDVq2HqtMVD_SzigHvUh8TCY,49
+xmanager_slurm-0.4.17.dist-info/licenses/LICENSE.md,sha256=IxstXr3MPHwTJ5jMrByHrQsR1ZAGQ2U_uz_4qzI_15Y,11756
+xmanager_slurm-0.4.17.dist-info/RECORD,,