xmanager_slurm-0.4.19-py3-none-any.whl

This diff shows the contents of publicly available package versions as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (52)
  1. xm_slurm/__init__.py +47 -0
  2. xm_slurm/api/__init__.py +33 -0
  3. xm_slurm/api/abc.py +65 -0
  4. xm_slurm/api/models.py +70 -0
  5. xm_slurm/api/sqlite/client.py +358 -0
  6. xm_slurm/api/web/client.py +173 -0
  7. xm_slurm/batching.py +139 -0
  8. xm_slurm/config.py +189 -0
  9. xm_slurm/console.py +3 -0
  10. xm_slurm/constants.py +19 -0
  11. xm_slurm/contrib/__init__.py +0 -0
  12. xm_slurm/contrib/clusters/__init__.py +67 -0
  13. xm_slurm/contrib/clusters/drac.py +242 -0
  14. xm_slurm/dependencies.py +171 -0
  15. xm_slurm/executables.py +215 -0
  16. xm_slurm/execution.py +995 -0
  17. xm_slurm/executors.py +210 -0
  18. xm_slurm/experiment.py +1016 -0
  19. xm_slurm/experimental/parameter_controller.py +206 -0
  20. xm_slurm/filesystems.py +129 -0
  21. xm_slurm/job_blocks.py +21 -0
  22. xm_slurm/metadata_context.py +253 -0
  23. xm_slurm/packageables.py +309 -0
  24. xm_slurm/packaging/__init__.py +8 -0
  25. xm_slurm/packaging/docker.py +348 -0
  26. xm_slurm/packaging/registry.py +45 -0
  27. xm_slurm/packaging/router.py +56 -0
  28. xm_slurm/packaging/utils.py +22 -0
  29. xm_slurm/resources.py +350 -0
  30. xm_slurm/scripts/_cloudpickle.py +28 -0
  31. xm_slurm/scripts/cli.py +90 -0
  32. xm_slurm/status.py +197 -0
  33. xm_slurm/templates/docker/docker-bake.hcl.j2 +54 -0
  34. xm_slurm/templates/docker/mamba.Dockerfile +29 -0
  35. xm_slurm/templates/docker/python.Dockerfile +32 -0
  36. xm_slurm/templates/docker/uv.Dockerfile +38 -0
  37. xm_slurm/templates/slurm/entrypoint.bash.j2 +27 -0
  38. xm_slurm/templates/slurm/fragments/monitor.bash.j2 +78 -0
  39. xm_slurm/templates/slurm/fragments/proxy.bash.j2 +31 -0
  40. xm_slurm/templates/slurm/job-array.bash.j2 +31 -0
  41. xm_slurm/templates/slurm/job-group.bash.j2 +47 -0
  42. xm_slurm/templates/slurm/job.bash.j2 +90 -0
  43. xm_slurm/templates/slurm/library/retry.bash +62 -0
  44. xm_slurm/templates/slurm/runtimes/apptainer.bash.j2 +73 -0
  45. xm_slurm/templates/slurm/runtimes/podman.bash.j2 +43 -0
  46. xm_slurm/types.py +23 -0
  47. xm_slurm/utils.py +196 -0
  48. xmanager_slurm-0.4.19.dist-info/METADATA +28 -0
  49. xmanager_slurm-0.4.19.dist-info/RECORD +52 -0
  50. xmanager_slurm-0.4.19.dist-info/WHEEL +4 -0
  51. xmanager_slurm-0.4.19.dist-info/entry_points.txt +2 -0
  52. xmanager_slurm-0.4.19.dist-info/licenses/LICENSE.md +227 -0
xm_slurm/status.py ADDED
@@ -0,0 +1,197 @@
+ """Implementation of Slurm work unit statuses."""
+
+ import enum
+ import re
+ import typing as tp
+
+ from xmanager import xm
+
+
+ class SlurmJobState(enum.Enum):
+     BOOT_FAIL = enum.auto()
+     CANCELLED = enum.auto()
+     COMPLETED = enum.auto()
+     DEADLINE = enum.auto()
+     FAILED = enum.auto()
+     NODE_FAIL = enum.auto()
+     OUT_OF_MEMORY = enum.auto()
+     PENDING = enum.auto()
+     PREEMPTED = enum.auto()
+     RUNNING = enum.auto()
+     REQUEUED = enum.auto()
+     RESIZING = enum.auto()
+     REVOKED = enum.auto()
+     SUSPENDED = enum.auto()
+     TIMEOUT = enum.auto()
+
+     @property
+     def message(self) -> str:
+         match self:
+             case SlurmJobState.BOOT_FAIL:
+                 return (
+                     "Job terminated due to launch failure, "
+                     "typically due to a hardware failure (e.g. unable to boot "
+                     "the node or block and the job can not be requeued)."
+                 )
+             case SlurmJobState.CANCELLED:
+                 return (
+                     "Job was explicitly cancelled by the user or "
+                     "system administrator. The job may or may not have been "
+                     "initiated."
+                 )
+             case SlurmJobState.COMPLETED:
+                 return "Job has terminated all processes on all nodes with an exit code of zero."
+             case SlurmJobState.DEADLINE:
+                 return "Job terminated on deadline."
+             case SlurmJobState.FAILED:
+                 return "Job terminated with non-zero exit code or other failure condition."
+             case SlurmJobState.NODE_FAIL:
+                 return "Job terminated due to failure of one or more allocated nodes."
+             case SlurmJobState.OUT_OF_MEMORY:
+                 return "Job experienced out of memory error."
+             case SlurmJobState.PENDING:
+                 return "Job is awaiting resource allocation."
+             case SlurmJobState.PREEMPTED:
+                 return "Job terminated due to preemption."
+             case SlurmJobState.RUNNING:
+                 return "Job currently has an allocation."
+             case SlurmJobState.REQUEUED:
+                 return "Job was requeued."
+             case SlurmJobState.RESIZING:
+                 return "Job is about to change size."
+             case SlurmJobState.REVOKED:
+                 return "Sibling was removed from cluster due to other cluster starting the job."
+             case SlurmJobState.SUSPENDED:
+                 return "Job has an allocation, but execution has been suspended."
+             case SlurmJobState.TIMEOUT:
+                 return "Job terminated upon reaching its time limit."
+             case _:
+                 raise ValueError(f"Invalid Slurm job state: {self}")
+
+     def __str__(self) -> str:
+         return f"{self.name}: {self.message}"
+
+     @classmethod
+     def from_str(cls, state: str) -> "SlurmJobState":
+         return cls[state]
+
+     @classmethod
+     def from_slurm_str(cls, state: str) -> "SlurmJobState":
+         _SLURM_JOB_STATE_REGEX = re.compile(f"({'|'.join(entry.name for entry in cls)})\\s?.*")
+         match = _SLURM_JOB_STATE_REGEX.match(state)
+         assert match and len(match.groups()) == 1, f"Failed to parse job state, {state!r}"
+         return cls.from_str(match.group(1))
+
+
+ SlurmPendingJobStates = set([
+     SlurmJobState.PENDING,
+     SlurmJobState.REQUEUED,
+     SlurmJobState.RESIZING,
+ ])
+ SlurmRunningJobStates = set([
+     SlurmJobState.RUNNING,
+     SlurmJobState.SUSPENDED,
+ ])
+ SlurmActiveJobStates = SlurmPendingJobStates | SlurmRunningJobStates
+ SlurmCompletedJobStates = set([SlurmJobState.COMPLETED])
+ SlurmFailedJobStates = set([
+     SlurmJobState.BOOT_FAIL,
+     SlurmJobState.DEADLINE,
+     SlurmJobState.FAILED,
+     SlurmJobState.NODE_FAIL,
+     SlurmJobState.OUT_OF_MEMORY,
+     SlurmJobState.PREEMPTED,
+     SlurmJobState.REVOKED,
+     SlurmJobState.TIMEOUT,
+ ])
+ SlurmCancelledJobStates = set([SlurmJobState.CANCELLED])
+
+ assert (
+     SlurmPendingJobStates
+     | SlurmRunningJobStates
+     | SlurmActiveJobStates
+     | SlurmCompletedJobStates
+     | SlurmFailedJobStates
+     | SlurmCancelledJobStates
+ ) == set(SlurmJobState.__members__.values()), "Slurm job states are not exhaustive."
+
+
+ class SlurmWorkUnitStatusEnum(enum.IntEnum):
+     """Status of a Slurm experiment work unit."""
+
+     # Work unit was created, but has not started yet.
+     PENDING = 0
+     # Work unit was created, but has not terminated yet.
+     RUNNING = 1
+     # Work unit terminated and was successful.
+     COMPLETED = 2
+     # Work unit terminated and was not successful.
+     FAILED = 3
+     # Work unit terminated because it was cancelled by the user.
+     CANCELLED = 4
+
+     @classmethod
+     def from_job_state(cls, state: SlurmJobState) -> "SlurmWorkUnitStatusEnum":
+         """Convert a Slurm job state to a SlurmWorkUnitStatusEnum."""
+         if state in SlurmPendingJobStates:
+             return cls.PENDING
+         elif state in SlurmRunningJobStates:
+             return cls.RUNNING
+         elif state in SlurmCompletedJobStates:
+             return cls.COMPLETED
+         elif state in SlurmFailedJobStates:
+             return cls.FAILED
+         elif state in SlurmCancelledJobStates:
+             return cls.CANCELLED
+         else:
+             raise ValueError(f"Invalid Slurm job state: {state}")
+
+
+ class SlurmWorkUnitStatus(xm.ExperimentUnitStatus):
+     """Status of a Slurm experiment job."""
+
+     @classmethod
+     def aggregate(cls, states: tp.Sequence[SlurmJobState]) -> "SlurmWorkUnitStatus":
+         """Aggregate a sequence of statuses into a single status."""
+         assert len(states) > 0, "Cannot aggregate empty sequence of statuses."
+         max_error_state: SlurmJobState | None = None
+         for state in states:
+             if not max_error_state:
+                 max_error_state = state
+             elif SlurmWorkUnitStatusEnum.from_job_state(
+                 state
+             ) > SlurmWorkUnitStatusEnum.from_job_state(max_error_state):
+                 max_error_state = state
+         assert max_error_state is not None
+         return cls(max_error_state)
+
+     def __init__(self, state: SlurmJobState) -> None:
+         super().__init__()
+         self._state = state
+         self._status = SlurmWorkUnitStatusEnum.from_job_state(state)
+
+     @property
+     def is_active(self) -> bool:
+         return (
+             self._status == SlurmWorkUnitStatusEnum.RUNNING
+             or self._status == SlurmWorkUnitStatusEnum.PENDING
+         )
+
+     @property
+     def is_completed(self) -> bool:
+         return self._status == SlurmWorkUnitStatusEnum.COMPLETED
+
+     @property
+     def is_failed(self) -> bool:
+         return self._status == SlurmWorkUnitStatusEnum.FAILED
+
+     @property
+     def status(self) -> SlurmWorkUnitStatusEnum:
+         return self._status
+
+     @property
+     def message(self) -> str:
+         return str(self._state)
+
+     def __repr__(self) -> str:
+         return f"<SlurmWorkUnitStatus {self._state!r}>"
xm_slurm/templates/docker/docker-bake.hcl.j2 ADDED
@@ -0,0 +1,54 @@
+ {% for executable, executors in executables.items() %}
+ target "{{ hash(executable) }}" {
+   dockerfile-inline = <<EOF
+ {{ executable.dockerfile.read_text() }}
+ EOF
+   context = "{{ executable.context }}"
+   {% if executable.ssh %}
+   ssh = [
+     {% for ssh_val in executable.ssh %}
+     "{{ ssh_val }}"{% if not loop.last %},{% endif %}
+     {% endfor %}
+   ]
+   {% endif %}
+   {% if executable.target %}
+   target = "{{ executable.target }}"
+   {% endif %}
+   pull = false
+   tags = [
+     {% for executor in executors %}
+     "{{ executor.tag }}"{% if not loop.last %},{% endif %}
+     {% endfor %}
+   ]
+   output = [
+     "type=registry"
+   ]
+   {% if executable.cache_from %}
+   cache-from = [
+     {% for cache_from in executable.cache_from %}
+     "{{ cache_from }}"{% if not loop.last %},{% endif %}
+     {% endfor %}
+   ]
+   {% endif %}
+   cache-to = [
+     "type=inline"
+   ]
+   platforms = [
+     {% for platform in executable.platforms %}
+     "{{ platform }}"{% if not loop.last %},{% endif %}
+     {% endfor %}
+   ]
+   labels = {
+     {% for key, value in executable.labels.items() %}
+     "{{ key }}" = "{{ value }}"{% if not loop.last %},{% endif %}
+     {% endfor %}
+   }
+   {% if executable.build_args %}
+   args = {
+     {% for key, value in executable.build_args.items() %}
+     "{{ key }}" = "{{ value }}"{% if not loop.last %},{% endif %}
+     {% endfor %}
+   }
+   {% endif %}
+ }
+ {% endfor %}
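
The template above is consumed by the Docker packaging code (xm_slurm/packaging/docker.py). A hypothetical rendering sketch, using stand-in classes whose field names are inferred only from the variables the template references (the real types live in xm_slurm.executables and xm_slurm.executors):

import dataclasses
import pathlib

import jinja2


@dataclasses.dataclass(eq=False)  # eq=False keeps instances hashable as dict keys
class FakeExecutable:
    dockerfile: pathlib.Path
    context: str
    ssh: tuple = ()
    target: str | None = None
    cache_from: tuple = ()
    platforms: tuple = ("linux/amd64",)
    labels: dict = dataclasses.field(default_factory=dict)
    build_args: dict = dataclasses.field(default_factory=dict)


@dataclasses.dataclass(eq=False)
class FakeExecutor:
    tag: str


env = jinja2.Environment(loader=jinja2.FileSystemLoader("xm_slurm/templates/docker"))
template = env.get_template("docker-bake.hcl.j2")

# Assumes a Dockerfile exists in the working directory; the tag is a placeholder.
executable = FakeExecutable(dockerfile=pathlib.Path("Dockerfile"), context=".")
hcl = template.render(
    executables={executable: [FakeExecutor(tag="registry.example.com/project:latest")]},
    hash=lambda e: "target-0",  # stand-in for the hash() helper the template expects
)
pathlib.Path("docker-bake.hcl").write_text(hcl)
# Build with: docker buildx bake -f docker-bake.hcl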
xm_slurm/templates/docker/mamba.Dockerfile ADDED
@@ -0,0 +1,29 @@
+ # syntax=docker/dockerfile:1.4
+ ARG BASE_IMAGE=gcr.io/distroless/base-debian10
+
+ FROM docker.io/mambaorg/micromamba:bookworm-slim as mamba
+ ARG CONDA_ENVIRONMENT=environment.yml
+
+ USER root
+
+ COPY $CONDA_ENVIRONMENT /tmp/
+
+ # Set up the mamba environment
+ RUN --mount=type=cache,target=/opt/conda/pkgs \
+     --mount=type=cache,target=/root/.cache/pip \
+     --mount=type=ssh \
+     micromamba create --yes --always-copy --no-pyc --prefix /opt/env --file /tmp/environment.yml
+
+ RUN find /opt/env/ -follow -type f -name '*.a' -delete && \
+     find /opt/env/ -follow -type f -name '*.js.map' -delete
+
+ FROM $BASE_IMAGE
+
+ COPY --link --from=mamba /opt/env /opt/env
+
+ ENV PATH=$PATH:/opt/env/bin
+
+ WORKDIR /workspace
+ COPY --link . /workspace
+
+ ENTRYPOINT ["/opt/env/bin/python"]
xm_slurm/templates/docker/python.Dockerfile ADDED
@@ -0,0 +1,32 @@
+ # syntax=docker/dockerfile:1.4
+ ARG BASE_IMAGE=docker.io/python:3.10-slim-bookworm
+ FROM $BASE_IMAGE as builder
+ COPY --from=ghcr.io/astral-sh/uv:latest /uv /bin/uv
+
+ ARG EXTRA_SYSTEM_PACKAGES=""
+ ARG EXTRA_PYTHON_PACKAGES=""
+
+ ENV UV_PYTHON_DOWNLOADS=0
+ ENV UV_COMPILE_BYTECODE=1
+ ENV UV_LINK_MODE=copy
+
+ WORKDIR /workspace
+
+ RUN apt-get update \
+     && apt-get install -y --no-install-recommends \
+         git $EXTRA_SYSTEM_PACKAGES \
+     && rm -rf /var/lib/apt/lists/*
+
+ # Install and update necessary global Python packages
+ RUN uv pip install --system pysocks $EXTRA_PYTHON_PACKAGES
+
+ ARG PIP_REQUIREMENTS=requirements.txt
+
+ RUN --mount=type=cache,target=/root/.cache/uv \
+     --mount=type=bind,source=$PIP_REQUIREMENTS,target=requirements.txt \
+     --mount=type=ssh \
+     uv pip install --system --requirement requirements.txt
+
+ COPY --link . /workspace
+
+ ENTRYPOINT [ "python" ]
xm_slurm/templates/docker/uv.Dockerfile ADDED
@@ -0,0 +1,38 @@
+ # syntax=docker/dockerfile:1.4
+ ARG BASE_IMAGE=docker.io/python:3.10-slim-bookworm
+ FROM $BASE_IMAGE
+ COPY --from=ghcr.io/astral-sh/uv:latest /uv /bin/uv
+
+ ARG EXTRA_SYSTEM_PACKAGES=""
+ ARG EXTRA_PYTHON_PACKAGES=""
+
+ WORKDIR /workspace
+
+ ENV UV_PYTHON_DOWNLOADS=0
+ ENV UV_COMPILE_BYTECODE=1
+ ENV UV_LINK_MODE=copy
+
+ RUN apt-get update \
+     && apt-get install -y --no-install-recommends \
+         git $EXTRA_SYSTEM_PACKAGES \
+     && rm -rf /var/lib/apt/lists/*
+
+ RUN --mount=type=cache,target=/root/.cache/uv \
+     uv pip install --system pysocks $EXTRA_PYTHON_PACKAGES
+
+ RUN uv venv --system-site-packages
+
+ ENV PATH="/workspace/.venv/bin:$PATH"
+
+ RUN --mount=type=cache,target=/root/.cache/uv \
+     --mount=type=bind,source=uv.lock,target=uv.lock \
+     --mount=type=bind,source=pyproject.toml,target=pyproject.toml \
+     --mount=type=ssh \
+     uv sync --frozen --no-install-project --no-dev --no-editable
+
+ COPY --link . /workspace
+ RUN --mount=type=cache,target=/root/.cache/uv \
+     --mount=type=ssh \
+     uv sync --frozen --no-dev
+
+ ENTRYPOINT [ "python" ]
xm_slurm/templates/slurm/entrypoint.bash.j2 ADDED
@@ -0,0 +1,27 @@
+ {%- macro entrypoint(cluster, job) -%}
+ #!/bin/sh
+ set -eux
+
+ {% if cluster.container_environment %}
+ # Cluster environment variables
+ {% for key, value in cluster.container_environment.items() %}
+ export {{ key }}="{{ value }}"
+ {% endfor %}
+ {%- endif %}
+
+ {% if job.executable.env_vars %}
+ # Executable environment variables
+ {% for key, value in job.executable.env_vars.items() %}
+ export {{ key }}="{{ value }}"
+ {% endfor %}
+ {%- endif %}
+
+ {% if job.env_vars %}
+ # Job environment variables
+ {% for key, value in job.env_vars.items() %}
+ export {{ key }}="{{ value }}"
+ {% endfor %}
+ {%- endif %}
+
+ exec {{ job.executable.entrypoint.to_list() | join(' ') }} "$@"
+ {%- endmacro -%}
xm_slurm/templates/slurm/fragments/monitor.bash.j2 ADDED
@@ -0,0 +1,78 @@
+ {% macro monitor(requeue_max_attempts, requeue_exit_code, requeue_on_timeout, requeue_timeout) -%}
+ __xm_slurm_wait_for_children() {
+     if [[ -n "${SLURM_ARRAY_JOB_ID:-}" ]]; then
+         local -r JOB_ID="${SLURM_ARRAY_JOB_ID}_${SLURM_ARRAY_TASK_ID}"
+     else
+         local -r JOB_ID="${SLURM_JOB_ID}"
+     fi
+
+     # If there are no child jobs we should error out
+     children=( $(jobs -p) )
+     {% raw %}
+     if [ ${#children[@]} -eq 0 ]; then
+     {% endraw %}
+         echo "ERROR: no child jobs exist..." >&2
+         exit 1
+     fi
+
+     {% if requeue_on_timeout %}
+     # Start a watchdog process to signal timeout.
+     sleep {{ requeue_timeout }} &
+     timeout_pid=$!
+     {% endif %}
+
+     {% raw %}
+     while [ ${#children[@]} -gt 0 ]; do
+     {% endraw %}
+         echo "INFO: Waiting for child processes to finish..."
+         set +e
+         {% if requeue_on_timeout %}
+         # Wait on either one of the child processes or the timeout process.
+         wait -n -p child_pid "${children[@]}" "${timeout_pid}"
+         {% else %}
+         wait -n -p child_pid "${children[@]}"
+         {% endif %}
+         local child_exit_code=$?
+         set -e
+
+         {% if requeue_on_timeout %}
+         # If the finished process is the watchdog, trigger the timeout handling.
+         if [ "${child_pid}" = "${timeout_pid}" ]; then
+             echo "INFO: Timeout of {{ requeue_timeout }} seconds reached. Killing remaining processes: ${children[*]}" >&2
+             kill "${children[@]}" 2>/dev/null || true
+             scontrol requeue "${JOB_ID}"
+             exit {{ requeue_exit_code }}
+         fi
+         {% endif %}
+
+         echo "INFO: Process ${child_pid} finished with exit code ${child_exit_code}."
+
+         # Handle the exit code of the finished process.
+         if [ "${child_exit_code}" -eq "{{ requeue_exit_code }}" ] && [ "${SLURM_RESTART_COUNT:-0}" -le "{{ requeue_max_attempts }}" ]; then
+             echo "INFO: Received requeue exit code {{ requeue_exit_code }} from process ${child_pid}. Requeuing Slurm job ${JOB_ID} after ${SLURM_RESTART_COUNT-0} restarts." >&2
+             scontrol requeue "${JOB_ID}"
+             exit {{ requeue_exit_code }}
+         elif [ "${child_exit_code}" -ne 0 ]; then
+             echo "ERROR: Process ${child_pid} exited with code ${child_exit_code}." >&2
+             exit "${child_exit_code}"
+         fi
+
+         # Remove the finished PID from the array.
+         for i in "${!children[@]}"; do
+             if [ "${children[i]}" = "$child_pid" ]; then
+                 unset 'children[i]'
+                 break
+             fi
+         done
+
+         # Reindex the array.
+         children=( "${children[@]}" )
+     done
+
+     {% if requeue_on_timeout %}
+     kill "$timeout_pid" 2>/dev/null || true
+     {% endif %}
+ }
+
+ __xm_slurm_wait_for_children
+ {%- endmacro %}
xm_slurm/templates/slurm/fragments/proxy.bash.j2 ADDED
@@ -0,0 +1,31 @@
+ {% macro proxy(host) -%}
+ __xm_slurm_proxy() {
+     local -r GATEWAY="$1"
+     local PORT
+
+     # Find an open port
+     while
+         PORT="$(shuf -n 1 -i 1024-65535)"
+         netstat -atun | grep -q "$PORT"
+     do
+         sleep 0.25
+         continue
+     done
+
+     # Open a SOCKS proxy through the gateway
+     ssh -D "$PORT" "$GATEWAY" -N -f
+
+     # Export proxy environment variables for applications to pick up
+     export ALL_PROXY="socks5://127.0.0.1:$PORT"
+     export all_proxy="socks5://127.0.0.1:$PORT"
+
+     export HTTP_PROXY="socks5://127.0.0.1:$PORT"
+     export http_proxy="socks5://127.0.0.1:$PORT"
+
+     export HTTPS_PROXY="socks5://127.0.0.1:$PORT"
+     export https_proxy="socks5://127.0.0.1:$PORT"
+
+     export JAVA_OPTS="-DsocksProxyHost=127.0.0.1 -DsocksProxyPort=$PORT"
+ }
+ __xm_slurm_proxy "{{ host }}"
+ {%- endmacro %}
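
Once this fragment has exported ALL_PROXY and friends, processes launched by the job pick the proxy up through the usual environment-variable convention; this is also why the Dockerfiles above install pysocks, since socks5:// proxy URLs need SOCKS support on the Python side. A small illustrative check (the URL is a placeholder, not part of the package):

import os
import urllib.request

print(os.environ.get("ALL_PROXY"))   # e.g. socks5://127.0.0.1:<port>
print(urllib.request.getproxies())   # stdlib helper that reads the *_proxy variables

import requests  # honours HTTPS_PROXY/ALL_PROXY; socks5:// URLs require pysocks
response = requests.get("https://example.com", timeout=30)
print(response.status_code)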
xm_slurm/templates/slurm/job-array.bash.j2 ADDED
@@ -0,0 +1,31 @@
+ {% extends "job.bash.j2" %}
+ {% block directives %}
+ {{ super() -}}
+ #SBATCH --array=0-{{ args | length - 1 }}
+ #SBATCH --output=slurm-%A_%a.out
+ {% endblock directives %}
+
+ {% block bootstrap %}
+ srun \
+ --label \
+ --unbuffered \
+ --kill-on-bad-exit=1 \
+ --export="ALL" \
+ {% for directive in job.executor.step_directives() %}
+ {{ directive }} \
+ {% endfor %}
+ bash <<'SRUN_EOF' &
+ set -Eeuxo pipefail
+
+ readonly XM_SLURM_TRIAL_ARGS=(
+ {% for trial in args %}
+ "{{ trial.to_list() | join(" ") }}"
+ {% endfor %}
+ )
+
+ {% call run(cluster, job) %}
+ ${XM_SLURM_TRIAL_ARGS[$SLURM_ARRAY_TASK_ID]} \
+ {% endcall %}
+
+ SRUN_EOF
+ {%- endblock bootstrap %}
xm_slurm/templates/slurm/job-group.bash.j2 ADDED
@@ -0,0 +1,47 @@
+ {% extends "job.bash.j2" %}
+ {% block directives %}
+ #SBATCH --export=NONE
+ #SBATCH --comment="{'xid': {{ experiment_id }}}"
+ {% if cluster.account %}
+ #SBATCH --account={{ cluster.account }}
+ {% endif %}
+ {% if cluster.partition %}
+ #SBATCH --partition={{ cluster.partition }}
+ {% endif %}
+ {% if cluster.qos %}
+ #SBATCH --qos={{ cluster.qos }}
+ {% endif %}
+
+ {% for job_name, job in job_group.jobs.items() %}
+ #SBATCH --output=xm-%j+{{ job_name }}.stdout
+ #SBATCH --error=xm-%j+{{ job_name }}.stderr
+ {% if identity %}
+ #SBATCH --job-name=xm[{{ job_name }}@{{ experiment_id }}.{{ identity }}]
+ #SBATCH --dependency=singleton
+ {% else %}
+ #SBATCH --job-name=xm[{{ job_name }}@{{ experiment_id }}]
+ {% endif %}
+ {% for directive in job.executor.batch_directives() %}
+ #SBATCH {{ directive }}
+ {% endfor %}
+ {{ "\n#SBATCH hetjob\n" if not loop.last }}
+ {% endfor %}
+ {% endblock directives %}
+
+ {% block bootstrap %}
+ {% for job in job_group.jobs.values() +%}
+ srun \
+ --label \
+ --unbuffered \
+ --kill-on-bad-exit=1 \
+ --export="ALL" \
+ {% for directive in job.executor.step_directives() %}
+ {{ directive }} \
+ {% endfor %}
+ --het-group={{ loop.index0 }} \
+ bash <<'SRUN_EOF' &
+ set -Eeuxo pipefail
+ {{ run(cluster, job) }}
+ SRUN_EOF
+ {% endfor +%}
+ {% endblock bootstrap %}
xm_slurm/templates/slurm/job.bash.j2 ADDED
@@ -0,0 +1,90 @@
+ #!/usr/bin/env bash
+ {% block directives %}
+ #SBATCH --open-mode=append
+ #SBATCH --export=NONE
+ #SBATCH --output=slurm-%j.out
+ #SBATCH --comment="{'xid': {{ experiment_id }}}"
+ {% if cluster.account and not job.executor.account %}
+ #SBATCH --account={{ cluster.account }}
+ {% endif %}
+ {% if cluster.partition and not job.executor.partition %}
+ #SBATCH --partition={{ cluster.partition }}
+ {% endif %}
+ {% if cluster.qos and not job.executor.qos %}
+ #SBATCH --qos={{ cluster.qos }}
+ {% endif %}
+ {% if identity %}
+ #SBATCH --job-name=xm[{{ experiment_id }}.{{ identity }}]
+ {% else %}
+ {% if dependency %}
+ #SBATCH {{ dependency.to_directive() }}
+ {% endif %}
+ #SBATCH --job-name=xm[{{ experiment_id }}]
+ {% endif %}
+ {% for directive in job.executor.batch_directives() %}
+ #SBATCH {{ directive }}
+ {% endfor %}
+ {% endblock directives %}
+ set -Eeuxo pipefail
+
+ {% if stdlib %}
+ # --- Helper functions ---
+ {% for fn in stdlib %}
+ {{ fn }}
+ {% endfor %}
+ {% endif %}
+
+ {% block prolog %}
+ {% if cluster.prolog %}
+ {{- cluster.prolog -}}
+ {% endif %}
+ {%- endblock prolog %}
+
+ {% block environment -%}
+ {% for key, value in cluster.host_environment.items() %}
+ export {{ key }}="{{ value }}"
+ {% endfor %}
+ {%- endblock environment %}
+
+ {% block proxy -%}
+ {%- if cluster.proxy %}
+ {% from 'fragments/proxy.bash.j2' import proxy %}
+ {% if cluster.proxy == "submission-host" %}
+ {{ proxy("$SLURM_SUBMIT_HOST") }}
+ {% else %}
+ {{ proxy(cluster.proxy) }}
+ {% endif %}
+ {% endif %}
+ {%- endblock proxy %}
+
+ {% block bootstrap %}
+ srun \
+ --label \
+ --unbuffered \
+ --kill-on-bad-exit=1 \
+ --export="ALL" \
+ {% for directive in job.executor.step_directives() %}
+ {{ directive }} \
+ {% endfor %}
+ bash <<'SRUN_EOF' &
+ set -Eeuxo pipefail
+ {{ run(cluster, job) }}
+ SRUN_EOF
+ {%- endblock bootstrap %}
+
+ echo "[INFO] Start timestamp: $(date)"
+
+ {%- block epilog -%}
+ {% if cluster.epilog %}
+ {{ cluster.epilog }}
+ {% endif %}
+ {%- endblock epilog %}
+
+
+ {% block monitor -%}
+ {% from 'fragments/monitor.bash.j2' import monitor %}
+ {{ monitor(job.executor.requeue_max_attempts, job.executor.requeue_on_exit_code, job.executor.requeue_on_timeout, job.executor.requeue_timeout.seconds) }}
+ {%- endblock monitor %}
+
+
+ echo "[INFO] End timestamp: $(date)"