xmanager-slurm 0.3.0 (xmanager_slurm-0.3.0-py3-none-any.whl)

This diff shows the contents of this publicly released package version as it appears in its public registry, and is provided for informational purposes only.

Potentially problematic release: this version of xmanager-slurm might be problematic.

Files changed (38):
  1. xm_slurm/__init__.py +44 -0
  2. xm_slurm/api.py +261 -0
  3. xm_slurm/batching.py +139 -0
  4. xm_slurm/config.py +162 -0
  5. xm_slurm/console.py +3 -0
  6. xm_slurm/contrib/clusters/__init__.py +52 -0
  7. xm_slurm/contrib/clusters/drac.py +169 -0
  8. xm_slurm/executables.py +201 -0
  9. xm_slurm/execution.py +491 -0
  10. xm_slurm/executors.py +127 -0
  11. xm_slurm/experiment.py +737 -0
  12. xm_slurm/job_blocks.py +14 -0
  13. xm_slurm/packageables.py +292 -0
  14. xm_slurm/packaging/__init__.py +8 -0
  15. xm_slurm/packaging/docker/__init__.py +75 -0
  16. xm_slurm/packaging/docker/abc.py +112 -0
  17. xm_slurm/packaging/docker/cloud.py +503 -0
  18. xm_slurm/packaging/docker/local.py +206 -0
  19. xm_slurm/packaging/registry.py +45 -0
  20. xm_slurm/packaging/router.py +52 -0
  21. xm_slurm/packaging/utils.py +202 -0
  22. xm_slurm/resources.py +150 -0
  23. xm_slurm/status.py +188 -0
  24. xm_slurm/templates/docker/docker-bake.hcl.j2 +47 -0
  25. xm_slurm/templates/docker/mamba.Dockerfile +27 -0
  26. xm_slurm/templates/docker/pdm.Dockerfile +31 -0
  27. xm_slurm/templates/docker/python.Dockerfile +24 -0
  28. xm_slurm/templates/slurm/fragments/monitor.bash.j2 +32 -0
  29. xm_slurm/templates/slurm/fragments/proxy.bash.j2 +31 -0
  30. xm_slurm/templates/slurm/job-array.bash.j2 +29 -0
  31. xm_slurm/templates/slurm/job-group.bash.j2 +41 -0
  32. xm_slurm/templates/slurm/job.bash.j2 +78 -0
  33. xm_slurm/templates/slurm/runtimes/apptainer.bash.j2 +103 -0
  34. xm_slurm/templates/slurm/runtimes/podman.bash.j2 +56 -0
  35. xm_slurm/utils.py +69 -0
  36. xmanager_slurm-0.3.0.dist-info/METADATA +25 -0
  37. xmanager_slurm-0.3.0.dist-info/RECORD +38 -0
  38. xmanager_slurm-0.3.0.dist-info/WHEEL +4 -0
xm_slurm/status.py ADDED
@@ -0,0 +1,188 @@
+ """Implementation of Slurm work unit statuses."""
+
+ import enum
+ import re
+ from typing import Sequence
+
+ from xmanager import xm
+
+
+ class SlurmJobState(enum.Enum):
+     BOOT_FAIL = enum.auto()
+     CANCELLED = enum.auto()
+     COMPLETED = enum.auto()
+     DEADLINE = enum.auto()
+     FAILED = enum.auto()
+     NODE_FAIL = enum.auto()
+     OUT_OF_MEMORY = enum.auto()
+     PENDING = enum.auto()
+     PREEMPTED = enum.auto()
+     RUNNING = enum.auto()
+     REQUEUED = enum.auto()
+     RESIZING = enum.auto()
+     REVOKED = enum.auto()
+     SUSPENDED = enum.auto()
+     TIMEOUT = enum.auto()
+
+     @property
+     def message(self) -> str:
+         match self:
+             case SlurmJobState.BOOT_FAIL:
+                 return (
+                     "Job terminated due to launch failure, "
+                     "typically due to a hardware failure (e.g. unable to boot "
+                     "the node or block and the job can not be requeued)."
+                 )
+             case SlurmJobState.CANCELLED:
+                 return (
+                     "Job was explicitly cancelled by the user or "
+                     "system administrator. The job may or may not have been "
+                     "initiated."
+                 )
+             case SlurmJobState.COMPLETED:
+                 return "Job has terminated all processes on all nodes with an exit code of zero."
+             case SlurmJobState.DEADLINE:
+                 return "Job terminated on deadline."
+             case SlurmJobState.FAILED:
+                 return "Job terminated with non-zero exit code or other failure condition."
+             case SlurmJobState.NODE_FAIL:
+                 return "Job terminated due to failure of one or more allocated nodes."
+             case SlurmJobState.OUT_OF_MEMORY:
+                 return "Job experienced out of memory error."
+             case SlurmJobState.PENDING:
+                 return "Job is awaiting resource allocation."
+             case SlurmJobState.PREEMPTED:
+                 return "Job terminated due to preemption."
+             case SlurmJobState.RUNNING:
+                 return "Job currently has an allocation."
+             case SlurmJobState.REQUEUED:
+                 return "Job was requeued."
+             case SlurmJobState.RESIZING:
+                 return "Job is about to change size."
+             case SlurmJobState.REVOKED:
+                 return "Sibling was removed from cluster due to other cluster starting the job."
+             case SlurmJobState.SUSPENDED:
+                 return "Job has an allocation, but execution has been suspended."
+             case SlurmJobState.TIMEOUT:
+                 return "Job terminated upon reaching its time limit."
+             case _:
+                 raise ValueError(f"Invalid Slurm job state: {self}")
+
+     def __str__(self) -> str:
+         return f"{self.name}: {self.message}"
+
+     @classmethod
+     def from_str(cls, state: str) -> "SlurmJobState":
+         return cls[state]
+
+     @classmethod
+     def from_slurm_str(cls, state: str) -> "SlurmJobState":
+         _SLURM_JOB_STATE_REGEX = re.compile(f"({'|'.join(entry.name for entry in cls)})\\s?.*")
+         match = _SLURM_JOB_STATE_REGEX.match(state)
+         assert match and len(match.groups()) == 1, f"Failed to parse job state, {state!r}"
+         return cls.from_str(match.group(1))
+
+
+ SlurmPendingJobStates = set([
+     SlurmJobState.PENDING,
+     SlurmJobState.REQUEUED,
+     SlurmJobState.RESIZING,
+ ])
+ SlurmRunningJobStates = set([
+     SlurmJobState.RUNNING,
+     SlurmJobState.SUSPENDED,
+ ])
+ SlurmActiveJobStates = SlurmPendingJobStates | SlurmRunningJobStates
+ SlurmCompletedJobStates = set([SlurmJobState.COMPLETED])
+ SlurmFailedJobStates = set([
+     SlurmJobState.BOOT_FAIL,
+     SlurmJobState.DEADLINE,
+     SlurmJobState.FAILED,
+     SlurmJobState.NODE_FAIL,
+     SlurmJobState.OUT_OF_MEMORY,
+     SlurmJobState.PREEMPTED,
+     SlurmJobState.REVOKED,
+     SlurmJobState.TIMEOUT,
+ ])
+ SlurmCancelledJobStates = set([SlurmJobState.CANCELLED])
+
+
+ class SlurmWorkUnitStatusEnum(enum.IntEnum):
+     """Status of a Slurm work unit."""
+
+     # Work unit was created, but has not started yet.
+     PENDING = 0
+     # Work unit was created, but has not terminated yet.
+     RUNNING = 1
+     # Work unit terminated and was successful.
+     COMPLETED = 2
+     # Work unit terminated and was not successful.
+     FAILED = 3
+     # Work unit terminated because it was cancelled by the user.
+     CANCELLED = 4
+
+     @classmethod
+     def from_job_state(cls, state: SlurmJobState) -> "SlurmWorkUnitStatusEnum":
+         """Convert a Slurm job state to a SlurmWorkUnitStatusEnum."""
+         if state in SlurmPendingJobStates:
+             return cls.PENDING
+         elif state in SlurmRunningJobStates:
+             return cls.RUNNING
+         elif state in SlurmCompletedJobStates:
+             return cls.COMPLETED
+         elif state in SlurmFailedJobStates:
+             return cls.FAILED
+         elif state in SlurmCancelledJobStates:
+             return cls.CANCELLED
+         else:
+             raise ValueError(f"Invalid Slurm job state: {state}")
+
+
+ class SlurmWorkUnitStatus(xm.ExperimentUnitStatus):
+     """Status of a Slurm experiment job."""
+
+     @classmethod
+     def aggregate(cls, states: Sequence[SlurmJobState]) -> "SlurmWorkUnitStatus":
+         """Aggregate a sequence of statuses into a single status."""
+         assert len(states) > 0, "Cannot aggregate empty sequence of statuses."
+         max_error_state: SlurmJobState | None = None
+         for state in states:
+             if not max_error_state:
+                 max_error_state = state
+             elif SlurmWorkUnitStatusEnum.from_job_state(
+                 state
+             ) > SlurmWorkUnitStatusEnum.from_job_state(max_error_state):
+                 max_error_state = state
+         assert max_error_state is not None
+         return cls(max_error_state)
+
+     def __init__(self, state: SlurmJobState) -> None:
+         super().__init__()
+         self._state = state
+         self._status = SlurmWorkUnitStatusEnum.from_job_state(state)
+
+     @property
+     def is_active(self) -> bool:
+         return (
+             self._status == SlurmWorkUnitStatusEnum.RUNNING
+             or self._status == SlurmWorkUnitStatusEnum.PENDING
+         )
+
+     @property
+     def is_completed(self) -> bool:
+         return self._status == SlurmWorkUnitStatusEnum.COMPLETED
+
+     @property
+     def is_failed(self) -> bool:
+         return self._status == SlurmWorkUnitStatusEnum.FAILED
+
+     @property
+     def status(self) -> SlurmWorkUnitStatusEnum:
+         return self._status
+
+     @property
+     def message(self) -> str:
+         return str(self._state)
+
+     def __repr__(self) -> str:
+         return f"<SlurmWorkUnitStatus {self._state!r}>"
xm_slurm/templates/docker/docker-bake.hcl.j2 ADDED
@@ -0,0 +1,47 @@
+ {% for executable, executors in executables.items() %}
+ target "{{ hash(executable) }}" {
+   dockerfile-inline = <<EOF
+ {{ executable.dockerfile.read_text() }}
+ EOF
+   context = "{{ executable.context }}"
+   {% if executable.target %}
+   target = "{{ executable.target }}"
+   {% endif %}
+   pull = true
+   tags = [
+     {% for executor in executors %}
+     "{{ executor.tag }}"{% if not loop.last %},{% endif %}
+     {% endfor %}
+   ]
+   output = [
+     "type=registry"
+   ]
+   {% if executable.cache_from %}
+   cache-from = [
+     {% for cache_from in executable.cache_from %}
+     "{{ cache_from }}"{% if not loop.last %},{% endif %}
+     {% endfor %}
+   ]
+   {% endif %}
+   cache-to = [
+     "type=inline"
+   ]
+   platforms = [
+     {% for platform in executable.platforms %}
+     "{{ platform }}"{% if not loop.last %},{% endif %}
+     {% endfor %}
+   ]
+   labels = {
+     {% for key, value in executable.labels.items() %}
+     "{{ key }}" = "{{ value }}"{% if not loop.last %},{% endif %}
+     {% endfor %}
+   }
+   {% if executable.build_args %}
+   args = {
+     {% for key, value in executable.build_args.items() %}
+     "{{ key }}" = "{{ value }}"{% if not loop.last %},{% endif %}
+     {% endfor %}
+   }
+   {% endif %}
+ }
+ {% endfor %}
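
For orientation only, here is a minimal sketch of how a template like this might be rendered with Jinja directly. The Executable/Executor dataclasses and the hash helper below are hypothetical stand-ins (the real objects live in xm_slurm.executables, xm_slurm.executors, and the packaging code), and the loader path assumes you run from the repository root:

    # Hypothetical rendering sketch; the dataclasses below are illustrative stand-ins.
    import dataclasses
    import hashlib
    import pathlib
    import tempfile

    import jinja2


    @dataclasses.dataclass(frozen=True, eq=False)  # eq=False keeps instances hashable as dict keys
    class Executable:
        dockerfile: pathlib.Path
        context: str
        target: str | None = None
        cache_from: tuple[str, ...] = ()
        platforms: tuple[str, ...] = ("linux/amd64",)
        labels: dict = dataclasses.field(default_factory=dict)
        build_args: dict = dataclasses.field(default_factory=dict)


    @dataclasses.dataclass(frozen=True)
    class Executor:
        tag: str


    workdir = pathlib.Path(tempfile.mkdtemp())
    (workdir / "Dockerfile").write_text("FROM docker.io/python:3.10-slim\n")

    env = jinja2.Environment(loader=jinja2.FileSystemLoader("xm_slurm/templates/docker"))
    # The template names each bake target via a `hash()` global over the executable.
    env.globals["hash"] = lambda e: hashlib.sha256(repr(e).encode()).hexdigest()[:12]

    executables = {
        Executable(dockerfile=workdir / "Dockerfile", context=str(workdir)): [
            Executor(tag="registry.example.com/project/image:latest"),
        ],
    }
    print(env.get_template("docker-bake.hcl.j2").render(executables=executables))

The rendered HCL is the kind of file docker buildx bake consumes, which is how one bake definition can fan out to multiple tags, platforms, and registry caches.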
xm_slurm/templates/docker/mamba.Dockerfile ADDED
@@ -0,0 +1,27 @@
+ # syntax=docker/dockerfile:1.4
+ ARG BASE_IMAGE=gcr.io/distroless/base-debian10
+
+ FROM docker.io/mambaorg/micromamba:jammy as mamba
+ ARG CONDA_ENVIRONMENT=environment.yml
+
+ USER root
+
+ COPY $CONDA_ENVIRONMENT /tmp/
+
+ # Set up the mamba environment
+ RUN --mount=type=cache,target=/opt/conda/pkgs --mount=type=cache,target=/root/.cache/pip \
+     micromamba create --yes --always-copy --no-pyc --prefix /opt/env --file /tmp/environment.yml
+
+ RUN find /opt/env/ -follow -type f -name '*.a' -delete && \
+     find /opt/env/ -follow -type f -name '*.js.map' -delete
+
+ FROM $BASE_IMAGE
+
+ COPY --link --from=mamba /opt/env /opt/env
+
+ ENV PATH=$PATH:/opt/env/bin
+
+ WORKDIR /workspace
+ COPY --link . /workspace
+
+ ENTRYPOINT ["/opt/env/bin/python"]
xm_slurm/templates/docker/pdm.Dockerfile ADDED
@@ -0,0 +1,31 @@
+ # syntax=docker/dockerfile:1.4
+ ARG BASE_IMAGE
+
+ FROM $BASE_IMAGE AS builder
+
+ RUN apt-get update \
+     && apt-get install -y --no-install-recommends \
+         git \
+     && rm -rf /var/lib/apt/lists/*
+
+ RUN pip install -U pip setuptools wheel pysocks \
+     && pip install pdm
+
+ COPY --link pyproject.toml pdm.lock /workspace/
+ WORKDIR /workspace
+
+ RUN --mount=type=cache,target=/root/.cache/pdm mkdir __pypackages__ \
+     && PDM_CACHE_DIR=/root/.cache/pdm pdm sync --prod --no-editable
+
+ FROM $BASE_IMAGE
+
+ ARG PYTHON_MAJOR
+ ARG PYTHON_MINOR
+
+ ENV PYTHONPATH=/workspace/pkgs:$PYTHONPATH
+ COPY --link --from=builder /workspace/__pypackages__/$PYTHON_MAJOR.$PYTHON_MINOR/lib /workspace/pkgs
+
+ WORKDIR /workspace/src
+ COPY --link . /workspace/src
+
+ ENTRYPOINT ["python"]
xm_slurm/templates/docker/python.Dockerfile ADDED
@@ -0,0 +1,24 @@
+ # syntax=docker/dockerfile:1.4
+ ARG BASE_IMAGE=docker.io/python:3.10-slim
+ FROM $BASE_IMAGE as builder
+
+ RUN apt-get update \
+     && apt-get install -y --no-install-recommends \
+         git \
+     && rm -rf /var/lib/apt/lists/*
+
+ # Install and update necessary global Python packages
+ RUN pip install -U pip setuptools wheel pysocks
+
+ ARG PIP_REQUIREMENTS=requirements.txt
+
+ RUN python -m venv --copies --upgrade --upgrade-deps --system-site-packages /venv
+ COPY $PIP_REQUIREMENTS /tmp/requirements.txt
+ RUN --mount=type=cache,target=/root/.cache/pip \
+     PIP_CACHE_DIR=/root/.cache/pip /venv/bin/pip install -r /tmp/requirements.txt \
+     && rm -rf /tmp/requirements.txt
+
+ COPY --link . /workspace
+ WORKDIR /workspace
+
+ ENTRYPOINT [ "/venv/bin/python" ]
xm_slurm/templates/slurm/fragments/monitor.bash.j2 ADDED
@@ -0,0 +1,32 @@
+ {% macro monitor(requeue_max_attempts, requeue_exit_code) -%}
+ __xm_slurm_wait_for_children() {
+     if [[ -n "${SLURM_ARRAY_JOB_ID:-}" ]]; then
+         local -r JOB_ID="${SLURM_ARRAY_JOB_ID}_${SLURM_ARRAY_TASK_ID}"
+     else
+         local -r JOB_ID="${SLURM_JOB_ID}"
+     fi
+
+     # If there are no child jobs we should error out
+     if [ -z "$(jobs -p)" ]; then
+         echo "ERROR: no child jobs exist..." >&2
+         exit 1
+     fi
+
+     # Loop through all job IDs in the background job list and wait for them to finish
+     for job in $(jobs -p); do
+         echo "INFO: Waiting for job ${job} to finish..."
+         set +e
+         wait "${job}"
+         local JOB_EXIT_CODE="${?}"
+         set -e
+
+         if [ "${JOB_EXIT_CODE}" -eq "{{ requeue_exit_code }}" ] && [ "${SLURM_RESTART_COUNT-0}" -le "{{ requeue_max_attempts }}" ]; then
+             echo "INFO: Received requeue exit code {{ requeue_exit_code }} from job ${job}. Requeuing Slurm job ${JOB_ID} after ${SLURM_RESTART_COUNT-0} restarts." >&2
+             scontrol requeue "${JOB_ID}"
+             exit {{ requeue_exit_code }}
+         fi
+     done
+ }
+
+ __xm_slurm_wait_for_children
+ {%- endmacro %}
xm_slurm/templates/slurm/fragments/proxy.bash.j2 ADDED
@@ -0,0 +1,31 @@
+ {% macro proxy(host) -%}
+ __xm_slurm_proxy() {
+     local -r GATEWAY="$1"
+     local PORT
+
+     # Find an open port
+     while
+         PORT="$(shuf -n 1 -i 1024-65535)"
+         netstat -atun | grep -q "$PORT"
+     do
+         sleep 0.25
+         continue
+     done
+
+     # Dynamic SOCKS proxy through the gateway
+     ssh -D "$PORT" "$GATEWAY" -N -f
+
+     # Export all proxy env vars for applications to pick up
+     export ALL_PROXY="socks5://127.0.0.1:$PORT"
+     export all_proxy="socks5://127.0.0.1:$PORT"
+
+     export HTTP_PROXY="socks5://127.0.0.1:$PORT"
+     export http_proxy="socks5://127.0.0.1:$PORT"
+
+     export HTTPS_PROXY="socks5://127.0.0.1:$PORT"
+     export https_proxy="socks5://127.0.0.1:$PORT"
+
+     export JAVA_OPTS="-DsocksProxyHost=127.0.0.1 -DsocksProxyPort=$PORT"
+ }
+ __xm_slurm_proxy "{{ host }}"
+ {%- endmacro %}
xm_slurm/templates/slurm/job-array.bash.j2 ADDED
@@ -0,0 +1,29 @@
+ {% extends "job.bash.j2" %}
+ {% block directives %}
+ {{ super() -}}
+ #SBATCH --array=0-{{ args | length - 1 }}
+ #SBATCH --output=xm-%j-%a.stdout
+ #SBATCH --error=xm-%j-%a.stderr
+ {% endblock directives %}
+
+ {% block bootstrap %}
+ srun \
+ --unbuffered \
+ --kill-on-bad-exit=0 \
+ --overlap \
+ --export={{ export(job, "ALL") }} \
+ bash <<'SRUN_EOF' &
+ set -Eeuxo pipefail
+
+ readonly __XM_SLURM_TRIALS=(
+ {% for trial in args %}
+ "{{ trial.to_list() | join(" ") }}"
+ {% endfor %}
+ )
+
+ {% call run(job, cluster) %}
+ ${__XM_SLURM_TRIALS[$SLURM_ARRAY_TASK_ID]} \
+ {% endcall %}
+
+ SRUN_EOF
+ {%- endblock bootstrap %}
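
To make the indexing concrete: the template sizes the array with --array=0-{{ args | length - 1 }} and writes one quoted argument string per trial into __XM_SLURM_TRIALS, so array task N runs trial N. A small illustration of that mapping (plain Python, with a made-up hyperparameter grid standing in for the real args):

    # Illustration only; the real trials come from XManager argument objects.
    import itertools

    grid = {"--learning_rate": [1e-4, 3e-4], "--seed": [0, 1]}
    trials = [
        " ".join(f"{flag}={value}" for flag, value in zip(grid, combo))
        for combo in itertools.product(*grid.values())
    ]

    for task_id, trial in enumerate(trials):
        # Array task `task_id` appends these flags to the container command line.
        print(f"SLURM_ARRAY_TASK_ID={task_id}: {trial}")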
xm_slurm/templates/slurm/job-group.bash.j2 ADDED
@@ -0,0 +1,41 @@
+ {% extends "job.bash.j2" %}
+ {% block directives %}
+ #SBATCH --export=NONE
+ #SBATCH --comment="{'xid': {{ experiment_id }}}"
+ {% if cluster.account %}
+ #SBATCH --account={{ cluster.account }}
+ {% endif %}
+ {% if cluster.partition %}
+ #SBATCH --partition={{ cluster.partition }}
+ {% endif %}
+ {% if cluster.qos %}
+ #SBATCH --qos={{ cluster.qos }}
+ {% endif %}
+
+ {% for job_name, job in job_group.jobs.items() %}
+ #SBATCH --output=xm-%j+{{ job_name }}.stdout
+ #SBATCH --error=xm-%j+{{ job_name }}.stderr
+ {% if identity %}
+ #SBATCH --job-name=xm[{{ job_name }}@{{ experiment_id }}.{{ identity }}]
+ #SBATCH --dependency=singleton
+ {% else %}
+ #SBATCH --job-name=xm[{{ job_name }}@{{ experiment_id }}]
+ {% endif %}
+ {{ job.executor.to_directives() | join("\n") }}
+ {{ "\n#SBATCH hetjob\n" if not loop.last }}
+ {% endfor %}
+ {% endblock directives %}
+
+ {% block bootstrap %}
+ {% for job in job_group.jobs.values() +%}
+ srun \
+ --unbuffered \
+ --kill-on-bad-exit=0 \
+ --export={{ export(job, "ALL") }} \
+ --het-group={{ loop.index0 }} \
+ bash <<'SRUN_EOF' &
+ set -Eeuxo pipefail
+ {{ run(job, cluster) }}
+ SRUN_EOF
+ {% endfor +%}
+ {% endblock bootstrap %}
xm_slurm/templates/slurm/job.bash.j2 ADDED
@@ -0,0 +1,78 @@
+ #!/usr/bin/env bash
+ {% block directives %}
+ #SBATCH --open-mode=append
+ #SBATCH --export=NONE
+ #SBATCH --output=xm-%j.stdout
+ #SBATCH --error=xm-%j.stderr
+ #SBATCH --comment="{'xid': {{ experiment_id }}}"
+ {% if cluster.account and not job.executor.account %}
+ #SBATCH --account={{ cluster.account }}
+ {% endif %}
+ {% if cluster.partition and not job.executor.partition %}
+ #SBATCH --partition={{ cluster.partition }}
+ {% endif %}
+ {% if cluster.qos and not job.executor.qos %}
+ #SBATCH --qos={{ cluster.qos }}
+ {% endif %}
+ {% if identity %}
+ #SBATCH --job-name=xm[{{ experiment_id }}.{{ identity }}]
+ #SBATCH --dependency=singleton
+ {% else %}
+ #SBATCH --job-name=xm[{{ experiment_id }}]
+ {% endif %}
+ {% for directive in job.executor.to_directives() %}
+ #SBATCH {{ directive }}
+ {% endfor %}
+ {% endblock directives %}
+ set -Eeuxo pipefail
+
+ {% block prolog %}
+ {% if cluster.prolog %}
+ {{- cluster.prolog -}}
+ {% endif %}
+ {%- endblock prolog %}
+
+
+ {% block environment -%}
+ {{ env(cluster.environment) }}
+ {%- endblock environment %}
+
+ {% block proxy -%}
+ {%- if cluster.proxy %}
+ {% from 'fragments/proxy.bash.j2' import proxy %}
+ {% if cluster.proxy == "submission-host" %}
+ {{ proxy("$SLURM_SUBMIT_HOST") }}
+ {% else %}
+ {{ proxy(cluster.proxy) }}
+ {% endif %}
+ {% endif %}
+ {%- endblock proxy %}
+
+ {% block bootstrap %}
+ srun \
+ --unbuffered \
+ --kill-on-bad-exit=0 \
+ --overlap \
+ --export={{ export(job, "ALL") }} \
+ bash <<'SRUN_EOF' &
+ set -Eeuxo pipefail
+ {{ run(job, cluster) }}
+ SRUN_EOF
+ {%- endblock bootstrap %}
+
+ echo "[INFO] Start timestamp: $(date)"
+
+ {%- block epilog -%}
+ {% if cluster.epilog %}
+ {{ cluster.epilog }}
+ {% endif %}
+ {%- endblock epilog %}
+
+
+ {% block monitor -%}
+ {% from 'fragments/monitor.bash.j2' import monitor %}
+ {{ monitor(job.executor.requeue_max_attempts, job.executor.requeue_on_exit_code) }}
+ {%- endblock monitor %}
+
+
+ echo "[INFO] End timestamp: $(date)"
xm_slurm/templates/slurm/runtimes/apptainer.bash.j2 ADDED
@@ -0,0 +1,103 @@
+ {% macro env(env_vars) -%}
+ {% for key, value in env_vars.items() %}
+ {% if key.startswith("SINGULARITY") or key.startswith("APPTAINER") or key.startswith("_") %}
+ {% set key = key.lstrip('_') %}
+ export {{ key }}="{{ value }}"
+ {% else %}
+ export APPTAINERENV_{{ key }}="{{ value }}"
+ export SINGULARITYENV_{{ key }}="{{ value }}"
+ {% endif %}
+ {% endfor %}
+ {%- endmacro %}
+
+ {% macro export(job, mode=None) -%}
+ {%- set combined_envs = operator.or_(job.env_vars, job.executable.env_vars) -%}
+ {%- if job.executable.credentials -%}
+ {%- set combined_envs = operator.or_(combined_envs, {
+ "APPTAINER_DOCKER_USERNAME": job.executable.credentials.username,
+ "APPTAINER_DOCKER_PASSWORD": job.executable.credentials.password,
+ "SINGULARITY_DOCKER_USERNAME": job.executable.credentials.username,
+ "SINGULARITY_DOCKER_PASSWORD": job.executable.credentials.password,
+ })
+ -%}
+ {%- endif %}
+
+ {%- set env_strings = [] -%}
+ {%- for key, value in combined_envs.items() -%}
+ {%- if key.startswith("SINGULARITY") or key.startswith("APPTAINER") -%}
+ {%- set _ = env_strings.append('{0}="{1}"'.format(key, value)) -%}
+ {%- else -%}
+ {%- set _ = env_strings.append('APPTAINERENV_{0}="{1}",SINGULARITYENV_{0}="{1}"'.format(key, value)) -%}
+ {%- endif -%}
+ {%- endfor -%}
+
+ {%- if mode is not none -%}
+ {{- mode -}}{{- "," if combined_envs -}}
+ {%- endif -%}
+
+ {{- env_strings | join(",") -}}
+ {% endmacro %}
+
+ {% macro run(job, cluster) -%}
+ # Determine which container runtime binary to use, or fail if neither is available
+ if [[ $(command -v apptainer) ]]; then
+ readonly CONTAINER_RUNTIME="apptainer"
+ elif [[ $(command -v singularity) ]]; then
+ readonly CONTAINER_RUNTIME="singularity"
+ else
+ echo "Error: Neither singularity nor apptainer binaries found" >&2
+ exit 1
+ fi
+
+ # Bundle will be where our built sandbox image is stored
+ # container-workdir will be our container's scratch directory
+ mkdir -p "$SLURM_TMPDIR"/{container,container-workdir,container-overlay}
+
+ time ${CONTAINER_RUNTIME} build \
+ --force \
+ --sandbox \
+ --fix-perms \
+ "$SLURM_TMPDIR"/container \
+ docker://{{ job.executable.image }}
+
+ {% if (cluster.runtime | string) == "singularity" and cluster.mounts %}
+ {% for source, dest in cluster.mounts.items() %}
+ mkdir -p "$SLURM_TMPDIR"/container/{{ dest | trim('/') }}
+ {% endfor %}
+ {% endif %}
+
+ exec ${CONTAINER_RUNTIME} run \
+ {% if job.executor.requirements.accelerator %}
+ --nv \
+ {% endif %}
+ --no-init \
+ --no-umask \
+ --no-home \
+ --cleanenv \
+ --containall \
+ {% if cluster.mounts %}
+ {% for source, dest in cluster.mounts.items() %}
+ --mount type=bind,src={{ source }},dst={{ dest }} \
+ {% endfor %}
+ {% endif %}
+ --workdir "$SLURM_TMPDIR"/container-workdir \
+ {% if (cluster.runtime | string) == "apptainer" %}
+ --overlay "$SLURM_TMPDIR"/container-overlay \
+ {% else %}
+ --writable \
+ {% endif %}
+ {% if job.executable.workdir %}
+ --pwd {{ job.executable.workdir }} \
+ {% endif %}
+ "$SLURM_TMPDIR"/container \
+ {% for arg in job.executable.args.to_list() %}
+ {{ arg }} \
+ {% endfor %}
+ {% for arg in job.args.to_list() %}
+ {{ arg }} \
+ {% endfor %}
+ {% if caller %}
+ {{- caller() -}}
+ {% endif %}
+ "$@"
+ {%- endmacro %}
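
To see what the export macro produces, here is a hypothetical Python mirror of its logic (an illustration only, not code from the package): variables already prefixed with APPTAINER/SINGULARITY pass through unchanged, while everything else is duplicated under the APPTAINERENV_/SINGULARITYENV_ prefixes that the container runtime strips and injects into the container environment.

    # Hypothetical re-implementation of the `export` macro, for illustration.
    def export_value(env_vars: dict[str, str], mode: str | None = "ALL") -> str:
        parts = []
        for key, value in env_vars.items():
            if key.startswith(("SINGULARITY", "APPTAINER")):
                parts.append(f'{key}="{value}"')
            else:
                parts.append(f'APPTAINERENV_{key}="{value}",SINGULARITYENV_{key}="{value}"')
        prefix = f"{mode}," if mode is not None and parts else (mode or "")
        return prefix + ",".join(parts)

    # e.g. srun --export=ALL,APPTAINERENV_WANDB_MODE="offline",SINGULARITYENV_WANDB_MODE="offline"
    print("--export=" + export_value({"WANDB_MODE": "offline"}))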
xm_slurm/templates/slurm/runtimes/podman.bash.j2 ADDED
@@ -0,0 +1,56 @@
+ {% macro env(env_vars) -%}
+ {% for key, value in env_vars.items() %}
+ export PODMANENV_{{ key }}="{{ value }}"
+ {% endfor %}
+ {%- endmacro %}
+
+ {% macro export(job, mode=None) -%}
+ {%- set combined_envs = operator.or_(job.env_vars, job.executable.env_vars) -%}
+
+ {%- set env_strings = [] -%}
+ {%- for key, value in combined_envs.items() -%}
+ {%- set _ = env_strings.append('PODMANENV_{0}="{1}"'.format(key, value)) -%}
+ {%- endfor -%}
+
+ {%- if mode is not none -%}
+ {{- mode -}}{{- "," if combined_envs -}}
+ {%- endif -%}
+
+ {{- env_strings | join(",") -}}
+ {% endmacro %}
+
+ {% macro run(job, cluster) -%}
+ podman pull \
+ {% if job.executable.credentials %}
+ --creds {{ job.executable.credentials.username }}:{{ job.executable.credentials.password }} \
+ {% endif %}
+ {{ job.executable.image }}
+
+ exec podman run \
+ --env PODMANENV* \
+ --pull never \
+ --restart no \
+ --rm \
+ {% if job.executor.requirements.accelerator %}
+ --device nvidia.com/gpu=all \
+ {% endif %}
+ {% if cluster.mounts %}
+ {% for source, dest in cluster.mounts.items() %}
+ --mount type=bind,src={{ source }},dst={{ dest }} \
+ {% endfor %}
+ {% endif %}
+ {% if job.executable.workdir %}
+ --workdir {{ job.executable.workdir }} \
+ {% endif %}
+ {{ job.executable.image }} \
+ {% for arg in job.executable.args.to_list() %}
+ {{ arg }} \
+ {% endfor %}
+ {% for arg in job.args.to_list() %}
+ {{ arg }} \
+ {% endfor %}
+ {% if caller %}
+ {{- caller() -}}
+ {% endif %}
+ "$@"
+ {% endmacro %}