xmanager_slurm-0.4.12-py3-none-any.whl → xmanager_slurm-0.4.14-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of xmanager-slurm might be problematic.

xm_slurm/contrib/clusters/drac.py CHANGED
@@ -4,7 +4,7 @@ from typing import Literal
 from xm_slurm import config
 from xm_slurm.resources import FeatureType, ResourceType
 
-__all__ = ["narval", "beluga", "cedar", "graham"]
+__all__ = ["beluga", "cedar", "fir", "graham", "narval"]
 
 
 def _drac_cluster(
@@ -121,6 +121,36 @@ def beluga(
     )
 
 
+def rorqual(
+    *,
+    user: str | None = None,
+    account: str | None = None,
+    proxy: Literal["submission-host"] | str | None = None,
+    mounts: dict[os.PathLike[str] | str, os.PathLike[str] | str] | None = None,
+) -> config.SlurmClusterConfig:
+    """DRAC Rorqual Cluster (https://docs.alliancecan.ca/wiki/Rorqual/en)."""
+    modules = []
+    if proxy != "submission-host":
+        modules.append("httpproxy")
+
+    return _drac_cluster(
+        name="rorqual",
+        host="robot.rorqual.alliancecan.ca",
+        host_public_key=config.PublicKey(
+            "ssh-ed25519", "AAAAC3NzaC1lZDI1NTE5AAAAINME5e9bifKZbuKKOQSpe3xrvC4g1b0QLMYj+AXBQGJe"
+        ),
+        user=user,
+        account=account,
+        mounts=mounts,
+        proxy=proxy,
+        modules=modules,
+        resources={ResourceType.H100: "h100"},
+        features={
+            FeatureType.NVIDIA_NVLINK: "nvlink",
+        },
+    )
+
+
 def cedar(
     *,
     user: str | None = None,
@@ -146,6 +176,27 @@ def cedar(
     )
 
 
+def fir(
+    *,
+    user: str | None = None,
+    account: str | None = None,
+    mounts: dict[os.PathLike[str] | str, os.PathLike[str] | str] | None = None,
+) -> config.SlurmClusterConfig:
+    """DRAC Fir Cluster (https://docs.alliancecan.ca/wiki/Fir/en)."""
+    return _drac_cluster(
+        name="fir",
+        host="robot.fir.alliancecan.ca",
+        host_public_key=config.PublicKey(
+            "ssh-ed25519",
+            "AAAAC3NzaC1lZDI1NTE5AAAAIJtenyJz+inwobvlJntWYFNu+ANcVWNcOHRKcEN6zmDo",
+        ),
+        user=user,
+        account=account,
+        mounts=mounts,
+        resources={ResourceType.H100: "h100"},
+    )
+
+
 def graham(
     *,
     user: str | None = None,
xm_slurm/execution.py CHANGED
@@ -3,6 +3,8 @@ import collections.abc
 import dataclasses
 import functools
 import hashlib
+import importlib
+import importlib.resources
 import logging
 import operator
 import os
@@ -311,6 +313,14 @@ def get_template_env(runtime: ContainerRuntime) -> j2.Environment:
     template_env.globals["raise"] = _raise_template_exception
     template_env.globals["operator"] = operator
 
+    # Iterate over stdlib files and insert them into the template environment
+    stdlib = []
+    for file in importlib.resources.files("xm_slurm.templates.slurm.library").iterdir():
+        if not file.is_file() or not file.name.endswith(".bash"):
+            continue
+        stdlib.append(file.read_text())
+    template_env.globals["stdlib"] = stdlib
+
     entrypoint_template = template_env.get_template("entrypoint.bash.j2")
     template_env.globals.update(entrypoint_template.module.__dict__)
 
xm_slurm/templates/slurm/fragments/monitor.bash.j2 CHANGED
@@ -25,6 +25,7 @@ __xm_slurm_wait_for_children() {
     while [ ${#children[@]} -gt 0 ]; do
     {% endraw %}
         echo "INFO: Waiting for child processes to finish..."
+        set +e
         {% if requeue_on_timeout %}
         # Wait on either one of the child processes or the timeout process.
         wait -n -p child_pid "${children[@]}" "${timeout_pid}"
@@ -32,6 +33,7 @@ __xm_slurm_wait_for_children() {
         wait -n -p child_pid "${children[@]}"
         {% endif %}
        local child_exit_code=$?
+        set -e
 
        {% if requeue_on_timeout %}
        # If the finished process is the watchdog, trigger the timeout handling.
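The `set +e` / `set -e` bracket is needed because the generated job script runs under `set -Eeuxo pipefail`: when a child exits nonzero, `wait` itself returns that nonzero status, and errexit would abort the monitor before `child_exit_code` is captured. A minimal standalone sketch of the pattern (a hypothetical script, not part of the package; it uses plain `wait PID` for brevity):

    #!/usr/bin/env bash
    set -e
    false & child=$!    # background a child that exits 1

    set +e              # suspend errexit so the failing wait doesn't kill the script
    wait "$child"       # reaps the child; returns its exit status
    status=$?           # safe to capture now
    set -e              # restore errexit

    echo "child exited with status $status"    # prints 1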
xm_slurm/templates/slurm/job.bash.j2 CHANGED
@@ -27,6 +27,13 @@
 {% endblock directives %}
 set -Eeuxo pipefail
 
+{% if stdlib %}
+# --- Helper functions ---
+{% for fn in stdlib %}
+{{ fn }}
+{% endfor %}
+{% endif %}
+
 {% block prolog %}
 {% if cluster.prolog %}
 {{- cluster.prolog -}}
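With this block in place, every `.bash` file under `xm_slurm/templates/slurm/library/` is pasted verbatim into the job script right after `set -Eeuxo pipefail`, so later fragments (the apptainer and podman runtimes below) can call `retry` unconditionally. A sketch of the rendered preamble, with a hypothetical directive and a stand-in helper body (the real body is `retry.bash`, shown next):

    #!/bin/bash
    #SBATCH --job-name=example    # hypothetical directive
    set -Eeuxo pipefail

    # --- Helper functions ---
    retry() { "$@"; }             # stand-in; the inlined library file defines the full helper
    export -f retry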
xm_slurm/templates/slurm/library/retry.bash ADDED
@@ -0,0 +1,62 @@
+# retry: rerun a command if it exits with certain codes
+# Options:
+#   -c CODE    Retry on this exit code (repeatable).
+#   -n N       Max attempts (incl. first). Default: unlimited
+#   -d SECS    Initial delay before first retry. Default: 1
+#   -b FACTOR  Integer backoff multiplier per retry. Default: 1 (no backoff)
+#   -q         Quiet (no logs)
+# Usage:
+#   retry [-c CODE ...] [-n N] [-d SECS] [-b FACTOR] [-q] -- cmd arg1 arg2 ...
+retry() {
+    local -a codes=()
+    local -i max=-1 delay=1 backoff=1 quiet=0 status
+    local opt OPTIND=1
+
+    while getopts ":c:n:d:b:q" opt; do
+        case "$opt" in
+            c) codes+=("$OPTARG") ;;
+            n) max=$OPTARG ;;
+            d) delay=$OPTARG ;;
+            b) backoff=$OPTARG ;;
+            q) quiet=1 ;;
+            :) printf 'retry: option -%s requires an argument\n' "$OPTARG" >&2; return 2 ;;
+            \?) printf 'retry: invalid option -- %s\n' "$OPTARG" >&2; return 2 ;;
+        esac
+    done
+    shift $((OPTIND-1))
+    (( $# )) || { printf 'retry: missing command\n' >&2; return 2; }
+
+    ((${#codes[@]})) || { printf 'retry: no return codes specified\n' >&2; return 2; }
+
+    for ((attempt=1; ; attempt++)); do
+        if "$@"; then  # safe with set -e (exception context)
+            return 0
+        else
+            status=$?  # capture failing status immediately
+        fi
+
+        # retryable?
+        local retryable=0 c
+        for c in "${codes[@]}"; do
+            (( status == c )) && { retryable=1; break; }
+        done
+
+        # stop if not retryable OR we've just hit the max attempt
+        if (( !retryable )) || (( max >= 0 && attempt >= max )); then
+            (( quiet )) || {
+                if (( attempt > 1 )); then
+                    printf 'retry: giving up after %d attempts; last exit=%d\n' "$attempt" "$status" >&2
+                else
+                    printf 'retry: command failed; exit=%d\n' "$status" >&2
+                fi
+            }
+            return "$status"  # propagate exact code; errexit will catch
+        fi
+
+        (( quiet )) || printf 'retry: attempt %d failed with %d; retrying in %ds...\n' \
+            "$attempt" "$status" "$delay" >&2
+        sleep "$delay" || :  # never trip set -e if sleep errors
+        (( delay *= backoff ))
+    done
+}
+export -f retry
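For illustration, a hypothetical invocation following the usage line in the header: retry a download up to 5 times on `curl`'s HTTP-error exit code 22, starting with a 2-second delay and doubling it on each retry (2s, 4s, 8s, 16s):

    # hypothetical example; the URL and exit code are illustrative only
    retry -c 22 -n 5 -d 2 -b 2 -- curl --fail -sS -O https://example.com/data.tar.gz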
xm_slurm/templates/slurm/runtimes/apptainer.bash.j2 CHANGED
@@ -5,16 +5,17 @@
 # container-workdir will be our container's scratch directory
 mkdir -p "$SLURM_TMPDIR"/{container,container-workdir,container-overlay}
 
+retry -c 255 -n 10 -d 1 -b 2 -- \
 {% if job.executable.credentials %}
-env {{ runtime | upper }}_DOCKER_USERNAME="{{ job.executable.credentials.username }}" {{ runtime | upper }}_DOCKER_PASSWORD="{{ job.executable.credentials.password }}" time {{ runtime }} build \
+    env {{ runtime | upper }}_DOCKER_USERNAME="{{ job.executable.credentials.username }}" {{ runtime | upper }}_DOCKER_PASSWORD="{{ job.executable.credentials.password }}" time {{ runtime }} build \
 {% else %}
-time {{ runtime }} build \
+    time {{ runtime }} build \
 {% endif %}
-    --force \
-    --sandbox \
-    --fix-perms \
-    "$SLURM_TMPDIR"/container \
-    docker://{{ job.executable.image }}
+        --force \
+        --sandbox \
+        --fix-perms \
+        "$SLURM_TMPDIR"/container \
+        docker://{{ job.executable.image }}
 
 {% if runtime == "singularity" and cluster.mounts %}
 {% for source, dest in cluster.mounts.items() %}
xm_slurm/templates/slurm/runtimes/podman.bash.j2 CHANGED
@@ -1,9 +1,10 @@
 {% macro run(cluster, job) -%}
-time podman pull \
+retry -c 255 -n 10 -d 1 -b 2 -- \
+    time podman pull \
 {% if job.executable.credentials %}
-    --creds {{ job.executable.credentials.username }}:{{ job.executable.credentials.password }} \
+        --creds {{ job.executable.credentials.username }}:{{ job.executable.credentials.password }} \
 {% endif %}
-    {{ job.executable.image }}
+        {{ job.executable.image }}
 
 cat << 'ENTRYPOINT_EOF' > "$SLURM_TMPDIR"/xm-slurm-entrypoint.sh
 {{ entrypoint(cluster, job) }}
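Concretely, for a job without registry credentials and a hypothetical image, the macro now renders to a pull that is retried on exit code 255 up to 10 times, with delays of 1s, 2s, 4s, and so on (factor-2 backoff):

    retry -c 255 -n 10 -d 1 -b 2 -- \
        time podman pull \
            docker.io/library/python:3.12    # hypothetical image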
xmanager_slurm-0.4.14.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: xmanager-slurm
-Version: 0.4.12
+Version: 0.4.14
 Summary: Slurm backend for XManager.
 Project-URL: GitHub, https://github.com/jessefarebro/xm-slurm
 Author-email: Jesse Farebrother <jfarebro@cs.mcgill.ca>
xmanager_slurm-0.4.14.dist-info/RECORD CHANGED
@@ -5,7 +5,7 @@ xm_slurm/console.py,sha256=UpMqeJ0C8i0pkue1AHnnyyX0bFJ9zZeJ7HBR6yhuA8A,54
 xm_slurm/constants.py,sha256=zefVtlFdflgSolie5g_rVxWV-Zpydxapchm3y0a2FDc,999
 xm_slurm/dependencies.py,sha256=-5gN_tpfs3dOA7H5_MIHO2ratb7F5Pm_yjkR5rZcgI8,6421
 xm_slurm/executables.py,sha256=fGmrFBl-258bMn6ip5adYeM7xxUHAeIbDN9zD2FDGtY,6373
-xm_slurm/execution.py,sha256=c0aV1h2tKQFyAGM6JLd16MWFgpRLKAbcutZz17xPUSw,31400
+xm_slurm/execution.py,sha256=mTy5u2oP2StIbGzjaSiGCUAwXuBFOiaJ5ephWoc25hI,31799
 xm_slurm/executors.py,sha256=bUgKcgtvf-nPGjcuHRzUAqD1r3_vwea_h-Y9MAB-Kqo,4887
 xm_slurm/experiment.py,sha256=94r0mhtUPUzw4eaUEz0kpsufC25wEGqlDhV4Fcr1ukY,39883
 xm_slurm/filesystem.py,sha256=4rKtq3t-KDgxJbSGt6JVyRJT_3lCN_vIKTcwKHpTo3I,4389
@@ -23,7 +23,7 @@ xm_slurm/api/sqlite/client.py,sha256=jAesCKDuYwnNcAxwJk_1b1TB8cT_QGbSjo1UE3mZjEQ
 xm_slurm/api/web/client.py,sha256=uO67Y7fnQ-w__Vm_A5BEuy7Qi8wQcWk3vIsBGEBkyfk,6261
 xm_slurm/contrib/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 xm_slurm/contrib/clusters/__init__.py,sha256=XFCVnkThiU3_8uA_tUgDByOBanXNHrxDvfmuptmQ2KE,2214
-xm_slurm/contrib/clusters/drac.py,sha256=ViLYerYBMSuZXnWVbz9RDIPPV7JA8BgBpgTfj1wPP28,5881
+xm_slurm/contrib/clusters/drac.py,sha256=vY3dxrNUk12H9Gq-tuCcqo2YcdTGq-4LJnQF6DzD4_k,7431
 xm_slurm/experimental/parameter_controller.py,sha256=b5LfglHV307F6QcPrHeZX5GJBtyOK9aQydke_SZ3Wto,8457
 xm_slurm/packaging/__init__.py,sha256=dh307yLpUT9KN7rJ1e9fYC6hegGKfZcGboUq9nGpDVQ,233
 xm_slurm/packaging/docker.py,sha256=-DWcB9qqbeHmIEqyfF0-v6xOT25ae90u2x-QZ7kluOw,13579
@@ -39,13 +39,14 @@ xm_slurm/templates/docker/uv.Dockerfile,sha256=L2UJMX2c8waMdrRhiqPytQe3pTBu6u5Pp
 xm_slurm/templates/slurm/entrypoint.bash.j2,sha256=MRdSVwgGrgQdpEhqfkP35IidgsblrtVXB1YWzvE9hkk,666
 xm_slurm/templates/slurm/job-array.bash.j2,sha256=smxmSSzBEUHm6MJF-nYPVVjK6CLKrb1fRxF_tfrzAX8,552
 xm_slurm/templates/slurm/job-group.bash.j2,sha256=Cp8YhNOxYqaOkl4MFjQlcaLMGZwdDh97m8OGT5RWbAo,1101
-xm_slurm/templates/slurm/job.bash.j2,sha256=pNKir1tkmRTGDiGxlQ3DkUaW9Zos_gdkkXJC_xX5Cxo,1985
-xm_slurm/templates/slurm/fragments/monitor.bash.j2,sha256=BJ1brSjhESOe9VX_OYaPyy9-qE3uiFlzxp8ZkFcTw8Y,2504
+xm_slurm/templates/slurm/job.bash.j2,sha256=DrDipliaEfiHbq9vDfOdfD8zBVFLy1jjlvCV-9-6k9s,2086
+xm_slurm/templates/slurm/fragments/monitor.bash.j2,sha256=ri5FgoKs6_bQVf5DO8SL4rJf4UsLxV34aOV-OD8VWDU,2526
 xm_slurm/templates/slurm/fragments/proxy.bash.j2,sha256=VJLglZo-Nvx9R-qe3rHTxr07CylTQ6Z9NwBzvIpAZrA,814
-xm_slurm/templates/slurm/runtimes/apptainer.bash.j2,sha256=lE2EWVCK2O-n08RL4_MJYIikVTvODjcYKuv7Eh73Q2w,1932
-xm_slurm/templates/slurm/runtimes/podman.bash.j2,sha256=3j7K5eyXt_WhXK0EoMlxnhlmFVJ2JyxRKbsMRaDqzSs,1148
-xmanager_slurm-0.4.12.dist-info/METADATA,sha256=ttPSMz6bQs8BEQVCurQ9lFWV7MuzLOHWU7lISOOoufA,1007
-xmanager_slurm-0.4.12.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-xmanager_slurm-0.4.12.dist-info/entry_points.txt,sha256=_HLGmLgxuQLOPmF2gOFYDVq2HqtMVD_SzigHvUh8TCY,49
-xmanager_slurm-0.4.12.dist-info/licenses/LICENSE.md,sha256=IxstXr3MPHwTJ5jMrByHrQsR1ZAGQ2U_uz_4qzI_15Y,11756
-xmanager_slurm-0.4.12.dist-info/RECORD,,
+xm_slurm/templates/slurm/library/retry.bash,sha256=bLe59qvfWEk17rE1wZ4EHiHba3RvR2WWZPq-kSe8RAA,2164
+xm_slurm/templates/slurm/runtimes/apptainer.bash.j2,sha256=v0LwHM-kBW8sJqVcVA2jYr1n44imDSZrJqmqlr5uTGc,1980
+xm_slurm/templates/slurm/runtimes/podman.bash.j2,sha256=zWLsFEuVzOMSETOmv4A5ZCV4oQHwCipiR6wi79XVzNI,1188
+xmanager_slurm-0.4.14.dist-info/METADATA,sha256=T7xNy0jmrKhQemaDhCg9E-J64gWkabjJGpxDYgdsBx8,1007
+xmanager_slurm-0.4.14.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+xmanager_slurm-0.4.14.dist-info/entry_points.txt,sha256=_HLGmLgxuQLOPmF2gOFYDVq2HqtMVD_SzigHvUh8TCY,49
+xmanager_slurm-0.4.14.dist-info/licenses/LICENSE.md,sha256=IxstXr3MPHwTJ5jMrByHrQsR1ZAGQ2U_uz_4qzI_15Y,11756
+xmanager_slurm-0.4.14.dist-info/RECORD,,