xmanager-slurm 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of xmanager-slurm might be problematic. Click here for more details.
- xm_slurm/__init__.py +44 -0
- xm_slurm/api.py +261 -0
- xm_slurm/batching.py +139 -0
- xm_slurm/config.py +162 -0
- xm_slurm/console.py +3 -0
- xm_slurm/contrib/clusters/__init__.py +52 -0
- xm_slurm/contrib/clusters/drac.py +169 -0
- xm_slurm/executables.py +201 -0
- xm_slurm/execution.py +491 -0
- xm_slurm/executors.py +127 -0
- xm_slurm/experiment.py +737 -0
- xm_slurm/job_blocks.py +14 -0
- xm_slurm/packageables.py +292 -0
- xm_slurm/packaging/__init__.py +8 -0
- xm_slurm/packaging/docker/__init__.py +75 -0
- xm_slurm/packaging/docker/abc.py +112 -0
- xm_slurm/packaging/docker/cloud.py +503 -0
- xm_slurm/packaging/docker/local.py +206 -0
- xm_slurm/packaging/registry.py +45 -0
- xm_slurm/packaging/router.py +52 -0
- xm_slurm/packaging/utils.py +202 -0
- xm_slurm/resources.py +150 -0
- xm_slurm/status.py +188 -0
- xm_slurm/templates/docker/docker-bake.hcl.j2 +47 -0
- xm_slurm/templates/docker/mamba.Dockerfile +27 -0
- xm_slurm/templates/docker/pdm.Dockerfile +31 -0
- xm_slurm/templates/docker/python.Dockerfile +24 -0
- xm_slurm/templates/slurm/fragments/monitor.bash.j2 +32 -0
- xm_slurm/templates/slurm/fragments/proxy.bash.j2 +31 -0
- xm_slurm/templates/slurm/job-array.bash.j2 +29 -0
- xm_slurm/templates/slurm/job-group.bash.j2 +41 -0
- xm_slurm/templates/slurm/job.bash.j2 +78 -0
- xm_slurm/templates/slurm/runtimes/apptainer.bash.j2 +103 -0
- xm_slurm/templates/slurm/runtimes/podman.bash.j2 +56 -0
- xm_slurm/utils.py +69 -0
- xmanager_slurm-0.3.0.dist-info/METADATA +25 -0
- xmanager_slurm-0.3.0.dist-info/RECORD +38 -0
- xmanager_slurm-0.3.0.dist-info/WHEEL +4 -0
xm_slurm/status.py
ADDED
|
@@ -0,0 +1,188 @@
|
|
|
1
|
+
"""Implementation of Slurm work unit statuses."""
|
|
2
|
+
|
|
3
|
+
import enum
import functools
import re
from typing import Sequence

from xmanager import xm
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class SlurmJobState(enum.Enum):
|
|
11
|
+
BOOT_FAIL = enum.auto()
|
|
12
|
+
CANCELLED = enum.auto()
|
|
13
|
+
COMPLETED = enum.auto()
|
|
14
|
+
DEADLINE = enum.auto()
|
|
15
|
+
FAILED = enum.auto()
|
|
16
|
+
NODE_FAIL = enum.auto()
|
|
17
|
+
OUT_OF_MEMORY = enum.auto()
|
|
18
|
+
PENDING = enum.auto()
|
|
19
|
+
PREEMPTED = enum.auto()
|
|
20
|
+
RUNNING = enum.auto()
|
|
21
|
+
REQUEUED = enum.auto()
|
|
22
|
+
RESIZING = enum.auto()
|
|
23
|
+
REVOKED = enum.auto()
|
|
24
|
+
SUSPENDED = enum.auto()
|
|
25
|
+
TIMEOUT = enum.auto()
|
|
26
|
+
|
|
27
|
+
@property
|
|
28
|
+
def message(self) -> str:
|
|
29
|
+
match self:
|
|
30
|
+
case SlurmJobState.BOOT_FAIL:
|
|
31
|
+
return (
|
|
32
|
+
"Job terminated due to launch failure, "
|
|
33
|
+
"typically due to a hardware failure (e.g. unable to boot "
|
|
34
|
+
"the node or block and the job can not be requeued)."
|
|
35
|
+
)
|
|
36
|
+
case SlurmJobState.CANCELLED:
|
|
37
|
+
return (
|
|
38
|
+
"Job was explicitly cancelled by the user or "
|
|
39
|
+
"system administrator. The job may or may not have been "
|
|
40
|
+
"initiated."
|
|
41
|
+
)
|
|
42
|
+
case SlurmJobState.COMPLETED:
|
|
43
|
+
return "Job has terminated all processes on all " "nodes with an exit code of zero."
|
|
44
|
+
case SlurmJobState.DEADLINE:
|
|
45
|
+
return "Job terminated on deadline."
|
|
46
|
+
case SlurmJobState.FAILED:
|
|
47
|
+
return "Job terminated with non-zero exit code or " "other failure condition."
|
|
48
|
+
case SlurmJobState.NODE_FAIL:
|
|
49
|
+
return "Job terminated due to failure of one or " "more allocated nodes."
|
|
50
|
+
case SlurmJobState.OUT_OF_MEMORY:
|
|
51
|
+
return "Job experienced out of memory error."
|
|
52
|
+
case SlurmJobState.PENDING:
|
|
53
|
+
return "Job is awaiting resource allocation."
|
|
54
|
+
case SlurmJobState.PREEMPTED:
|
|
55
|
+
return "Job terminated due to preemption."
|
|
56
|
+
case SlurmJobState.RUNNING:
|
|
57
|
+
return "Job currently has an allocation."
|
|
58
|
+
case SlurmJobState.REQUEUED:
|
|
59
|
+
return "Job was requeued."
|
|
60
|
+
case SlurmJobState.RESIZING:
|
|
61
|
+
return "Job is about to change size."
|
|
62
|
+
case SlurmJobState.REVOKED:
|
|
63
|
+
return "Sibling was removed from cluster due to " "other cluster starting the job."
|
|
64
|
+
case SlurmJobState.SUSPENDED:
|
|
65
|
+
return "Job has an allocation, but execution has been suspended."
|
|
66
|
+
case SlurmJobState.TIMEOUT:
|
|
67
|
+
return "Job terminated upon reaching its time limit."
|
|
68
|
+
case _:
|
|
69
|
+
raise ValueError(f"Invalid Slurm job state: {self}")
|
|
70
|
+
|
|
71
|
+
def __str__(self) -> str:
|
|
72
|
+
return f"{self.name}: {self.message}"
|
|
73
|
+
|
|
74
|
+
@classmethod
|
|
75
|
+
def from_str(cls, state: str) -> "SlurmJobState":
|
|
76
|
+
return cls[state]
|
|
77
|
+
|
|
78
|
+
@classmethod
|
|
79
|
+
def from_slurm_str(cls, state: str) -> "SlurmJobState":
|
|
80
|
+
_SLURM_JOB_STATE_REGEX = re.compile(f"({'|'.join(entry.name for entry in cls)})\\s?.*")
|
|
81
|
+
match = _SLURM_JOB_STATE_REGEX.match(state)
|
|
82
|
+
assert match and len(match.groups()) == 1, f"Failed to parse job state, {state!r}"
|
|
83
|
+
return cls.from_str(match.group(1))
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
# Jobs that are queued or otherwise waiting/changing before execution.
SlurmPendingJobStates = {
    SlurmJobState.PENDING,
    SlurmJobState.REQUEUED,
    SlurmJobState.RESIZING,
}
# Jobs that currently hold an allocation (running or suspended).
SlurmRunningJobStates = {
    SlurmJobState.RUNNING,
    SlurmJobState.SUSPENDED,
}
# Any job that has not yet reached a terminal state.
SlurmActiveJobStates = SlurmPendingJobStates | SlurmRunningJobStates
SlurmCompletedJobStates = {SlurmJobState.COMPLETED}
# Terminal states indicating the job did not finish successfully.
SlurmFailedJobStates = {
    SlurmJobState.BOOT_FAIL,
    SlurmJobState.DEADLINE,
    SlurmJobState.FAILED,
    SlurmJobState.NODE_FAIL,
    SlurmJobState.OUT_OF_MEMORY,
    SlurmJobState.PREEMPTED,
    SlurmJobState.REVOKED,
    SlurmJobState.TIMEOUT,
}
SlurmCancelledJobStates = {SlurmJobState.CANCELLED}
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
class SlurmWorkUnitStatusEnum(enum.IntEnum):
    """Status of a local experiment job."""

    # Work unit was created, but has not started yet.
    PENDING = 0
    # Work unit was created, but has not terminated yet.
    RUNNING = 1
    # Work unit terminated and was successful.
    COMPLETED = 2
    # Work unit terminated and was not successful.
    FAILED = 3
    # Work unit terminated because it was cancelled by the user.
    CANCELLED = 4

    @classmethod
    def from_job_state(cls, state: SlurmJobState) -> "SlurmWorkUnitStatusEnum":
        """Convert a Slurm job state to a SlurmWorkUnitStatusEnum."""
        # Scan the state buckets in order and return the first match.
        buckets = (
            (SlurmPendingJobStates, cls.PENDING),
            (SlurmRunningJobStates, cls.RUNNING),
            (SlurmCompletedJobStates, cls.COMPLETED),
            (SlurmFailedJobStates, cls.FAILED),
            (SlurmCancelledJobStates, cls.CANCELLED),
        )
        for states, status in buckets:
            if state in states:
                return status
        raise ValueError(f"Invalid Slurm job state: {state}")
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
class SlurmWorkUnitStatus(xm.ExperimentUnitStatus):
    """Status of a Slurm experiment job."""

    @classmethod
    def aggregate(cls, states: Sequence[SlurmJobState]) -> "SlurmWorkUnitStatus":
        """Aggregate a sequence of statuses into a single status."""
        assert len(states) > 0, "Cannot aggregate empty sequence of statuses."
        # The state whose derived work-unit status ranks highest wins.
        # `max` keeps the earliest state on ties, matching a left-to-right scan.
        worst_state = max(states, key=SlurmWorkUnitStatusEnum.from_job_state)
        return cls(worst_state)

    def __init__(self, state: SlurmJobState) -> None:
        super().__init__()
        # Keep both the raw Slurm state (for messages) and its coarse status.
        self._state = state
        self._status = SlurmWorkUnitStatusEnum.from_job_state(state)

    @property
    def is_active(self) -> bool:
        # Active means the work unit has not yet reached a terminal status.
        return self._status in (
            SlurmWorkUnitStatusEnum.RUNNING,
            SlurmWorkUnitStatusEnum.PENDING,
        )

    @property
    def is_completed(self) -> bool:
        return self._status is SlurmWorkUnitStatusEnum.COMPLETED

    @property
    def is_failed(self) -> bool:
        return self._status is SlurmWorkUnitStatusEnum.FAILED

    @property
    def status(self) -> SlurmWorkUnitStatusEnum:
        return self._status

    @property
    def message(self) -> str:
        return str(self._state)

    def __repr__(self) -> str:
        return f"<SlurmWorkUnitStatus {self._state!r}>"
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
{#
  Renders a Docker Buildx bake file: one `target` per executable, keyed by
  its hash. Each target inlines the Dockerfile, pushes to all executor tags
  (`output = type=registry`), embeds an inline cache, and forwards optional
  build target, cache sources, platforms, labels, and build args.
#}
{% for executable, executors in executables.items() %}
target "{{ hash(executable) }}" {
  dockerfile-inline = <<EOF
{{ executable.dockerfile.read_text() }}
EOF
  context = "{{ executable.context }}"
  {% if executable.target %}
  target = "{{ executable.target }}"
  {% endif %}
  pull = true
  tags = [
    {% for executor in executors %}
    "{{ executor.tag }}"{% if not loop.last %},{% endif %}
    {% endfor %}
  ]
  output = [
    "type=registry"
  ]
  {% if executable.cache_from %}
  cache-from = [
    {% for cache_from in executable.cache_from %}
    "{{ cache_from }}"{% if not loop.last %},{% endif %}
    {% endfor %}
  ]
  {% endif %}
  cache-to = [
    "type=inline"
  ]
  platforms = [
    {% for platform in executable.platforms %}
    "{{ platform }}"{% if not loop.last %},{% endif %}
    {% endfor %}
  ]
  labels = {
    {% for key, value in executable.labels.items() %}
    "{{ key }}" = "{{ value }}"{% if not loop.last %},{% endif %}
    {% endfor %}
  }
  {% if executable.build_args %}
  args = {
    {% for key, value in executable.build_args.items() %}
    "{{ key }}" = "{{ value }}"{% if not loop.last %},{% endif %}
    {% endfor %}
  }
  {% endif %}
}
{% endfor %}
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
# syntax=docker/dockerfile:1.4
# Two-stage build: create a micromamba environment in a builder stage, then
# copy only the resolved environment into a slim runtime image.
ARG BASE_IMAGE=gcr.io/distroless/base-debian10

FROM docker.io/mambaorg/micromamba:jammy as mamba
ARG CONDA_ENVIRONMENT=environment.yml

USER root

COPY $CONDA_ENVIRONMENT /tmp/

# Setup mamba environment
# NOTE(review): the command below hard-codes /tmp/environment.yml; if
# CONDA_ENVIRONMENT is overridden with a differently-named file, COPY and
# RUN will disagree — confirm intended usage.
RUN --mount=type=cache,target=/opt/conda/pkgs --mount=type=cache,target=/root/.cache/pip \
    micromamba create --yes --always-copy --no-pyc --prefix /opt/env --file /tmp/environment.yml

# Trim static libraries and source maps to shrink the copied environment.
RUN find /opt/env/ -follow -type f -name '*.a' -delete && \
    find /opt/env/ -follow -type f -name '*.js.map' -delete

FROM $BASE_IMAGE

COPY --link --from=mamba /opt/env /opt/env

ENV PATH=$PATH:/opt/env/bin

WORKDIR /workspace
COPY --link . /workspace

ENTRYPOINT ["/opt/env/bin/python"]
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
# syntax=docker/dockerfile:1.4
# Two-stage build for PDM-managed projects: resolve dependencies into
# __pypackages__ in a builder stage, then copy only the installed packages
# into the runtime image (PEP 582-style layout on PYTHONPATH).
ARG BASE_IMAGE

FROM $BASE_IMAGE AS builder

RUN apt-get update \
    && apt-get install -y --no-install-recommends \
        git \
    && rm -rf /var/lib/apt/lists/*

RUN pip install -U pip setuptools wheel pysocks \
    && pip install pdm

# Copy only the lockfiles first so dependency resolution is layer-cached.
COPY --link pyproject.toml pdm.lock /workspace/
WORKDIR /workspace

RUN --mount=type=cache,target=/root/.cache/pdm mkdir __pypackages__ \
    && PDM_CACHE_DIR=/root/.cache/pdm pdm sync --prod --no-editable

FROM $BASE_IMAGE

# Must match the builder image's Python version so the __pypackages__ path resolves.
ARG PYTHON_MAJOR
ARG PYTHON_MINOR

ENV PYTHONPATH=/workspace/pkgs:$PYTHONPATH
COPY --link --from=builder /workspace/__pypackages__/$PYTHON_MAJOR.$PYTHON_MINOR/lib /workspace/pkgs

WORKDIR /workspace/src
COPY --link . /workspace/src

ENTRYPOINT ["python"]
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
# syntax=docker/dockerfile:1.4
# Single-stage image for requirements.txt-based projects: install pinned
# requirements into a virtualenv and run the project with that interpreter.
ARG BASE_IMAGE=docker.io/python:3.10-slim
FROM $BASE_IMAGE as builder

RUN apt-get update \
    && apt-get install -y --no-install-recommends \
        git \
    && rm -rf /var/lib/apt/lists/*

# Install and update necessary global Python packages
RUN pip install -U pip setuptools wheel pysocks

ARG PIP_REQUIREMENTS=requirements.txt

RUN python -m venv --copies --upgrade --upgrade-deps --system-site-packages /venv
COPY $PIP_REQUIREMENTS /tmp/requirements.txt
RUN --mount=type=cache,target=/root/.cache/pip \
    PIP_CACHE_DIR=/root/.cache/pip /venv/bin/pip install -r /tmp/requirements.txt \
    && rm -rf /tmp/requirements.txt

COPY --link . /workspace
WORKDIR /workspace

ENTRYPOINT [ "/venv/bin/python" ]
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
{#
  Fragment: waits on the background `srun` children of the batch script.
  If a child exits with `requeue_exit_code` and the restart budget
  (`requeue_max_attempts`) is not exhausted, the whole Slurm job (or array
  task) is requeued via `scontrol requeue`.
#}
{% macro monitor(requeue_max_attempts, requeue_exit_code) -%}
__xm_slurm_wait_for_children() {
  if [[ -n "${SLURM_ARRAY_JOB_ID:-}" ]]; then
    local -r JOB_ID="${SLURM_ARRAY_JOB_ID}_${SLURM_ARRAY_TASK_ID}"
  else
    local -r JOB_ID="${SLURM_JOB_ID}"
  fi

  # If there are no child jobs we should error out
  if [ -z "$(jobs -p)" ]; then
    echo "ERROR: no child jobs exist..." >&2
    exit -1
  fi

  # Loop through all job IDs in the background job list and wait for them to finish
  {# NOTE(review): "$(jobs -p)" is quoted, so the loop runs exactly once with
     all PIDs as a single word; combined with `local -r` below (which would
     fail if redeclared on a second iteration), this presumably assumes a
     single background child — confirm. #}
  for job in "$(jobs -p)"; do
    echo "INFO: Waiting for job ${job} to finish..."
    set +e
    wait "${job}"
    local -r JOB_EXIT_CODE="${?}"
    set -e

    if [ "${JOB_EXIT_CODE}" -eq "{{ requeue_exit_code }}" ] && [ "${SLURM_RESTART_COUNT-0}" -le "{{ requeue_max_attempts }}" ]; then
      echo "INFO: Received requeue exit code {{ requeue_exit_code }} from job ${job}. Requeing Slurm job ${JOB_ID} after ${SLURM_RESTART_COUNT-0} restarts." >&2
      scontrol requeue "${JOB_ID}"
      exit {{ requeue_exit_code }}
    fi
  done
}

__xm_slurm_wait_for_children
{%- endmacro %}
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
{#
  Fragment: opens a SOCKS5 reverse proxy through `host` (an SSH gateway) on a
  randomly chosen free local port, then exports the usual *_PROXY environment
  variables so applications route traffic through it.
#}
{% macro proxy(host) -%}
__xm_slurm_proxy() {
  local -r GATEWAY="$1"
  local PORT

  # Find an open port
  {# NOTE(review): `grep -q "$PORT"` is a substring match (port 80 would also
     match 8080 in the netstat output) — overly conservative but harmless;
     confirm this is intentional. #}
  while
    PORT="$(shuf -n 1 -i 1024-65535)"
    netstat -atun | grep -q "$PORT"
  do
    sleep 0.25
    continue
  done

  # Reverse proxy through the gateway
  ssh -D "$PORT" "$GATEWAY" -N -f

  # Export all env vars for applications to pick up on proxy
  export ALL_PROXY="socks5://127.0.0.1:$PORT"
  export all_proxy="socks5://127.0.0.1:$PORT"

  export HTTP_PROXY="socks5://127.0.0.1:$PORT"
  export http_proxy="socks5://127.0.0.1:$PORT"

  export HTTPS_PROXY="socks5://127.0.0.1:$PORT"
  export https_proxy="socks5://127.0.0.1:$PORT"

  export JAVA_OPTS="-DsocksProxyHost=127.0.0.1 -DsocksProxyPort=$PORT"
}
__xm_slurm_proxy "{{ host }}"
{%- endmacro %}
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
{#
  Slurm job-array script: extends the base job template, adding --array
  directives and per-task stdout/stderr files. Each array task selects its
  trial arguments from __XM_SLURM_TRIALS by $SLURM_ARRAY_TASK_ID and appends
  them to the container invocation produced by `run`.
#}
{% extends "job.bash.j2" %}
{% block directives %}
{{ super() -}}
#SBATCH --array=0-{{ args | length - 1 }}
#SBATCH --output=xm-%j-%a.stdout
#SBATCH --error=xm-%j-%a.stderr
{% endblock directives %}

{% block bootstrap %}
srun \
    --unbuffered \
    --kill-on-bad-exit=0 \
    --overlap \
    --export={{ export(job, "ALL") }} \
    bash <<'SRUN_EOF' &
set -Eeuxo pipefail

readonly __XM_SLURM_TRIALS=(
{% for trial in args %}
    "{{ trial.to_list() | join(" ") }}"
{% endfor %}
)

{% call run(job, cluster) %}
${__XM_SLURM_TRIALS[$SLURM_ARRAY_TASK_ID]} \
{% endcall %}

SRUN_EOF
{%- endblock bootstrap %}
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
{#
  Slurm heterogeneous-job script: extends the base job template. One set of
  directives per job in the group, separated by `#SBATCH hetjob`; bootstrap
  launches one background `srun` per job, pinned to its --het-group.
#}
{% extends "job.bash.j2" %}
{% block directives %}
#SBATCH --export=NONE
#SBATCH --comment="{'xid': {{ experiment_id }}}"
{% if cluster.account %}
#SBATCH --account={{ cluster.account }}
{% endif %}
{% if cluster.partition %}
#SBATCH --partition={{ cluster.partition }}
{% endif %}
{% if cluster.qos %}
#SBATCH --qos={{ cluster.qos }}
{% endif %}

{% for job_name, job in job_group.jobs.items() %}
#SBATCH --output=xm-%j+{{ job_name }}.stdout
#SBATCH --error=xm-%j+{{ job_name }}.stderr
{% if identity %}
{# Singleton dependency serializes jobs sharing the same identity. #}
#SBATCH --job-name=xm[{{ job_name }}@{{ experiment_id }}.{{ identity }}]
#SBATCH --dependency=singleton
{% else %}
#SBATCH --job-name=xm[{{ job_name }}@{{ experiment_id }}]
{% endif %}
{{ job.executor.to_directives() | join("\n") }}
{{ "\n#SBATCH hetjob\n" if not loop.last }}
{% endfor %}
{% endblock directives %}

{% block bootstrap %}
{% for job in job_group.jobs.values() +%}
srun \
    --unbuffered \
    --kill-on-bad-exit=0 \
    --export={{ export(job, "ALL") }} \
    --het-group={{ loop.index0 }} \
    bash <<'SRUN_EOF' &
set -Eeuxo pipefail
{{ run(job, cluster) }}
SRUN_EOF
{% endfor +%}
{% endblock bootstrap %}
|
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
{#
  Base Slurm batch script template. Blocks, in order: sbatch directives,
  optional cluster prolog, environment export, optional SOCKS proxy setup,
  bootstrap (background `srun` running the container via the runtime's `run`
  macro), optional epilog, and the monitor fragment that waits on children
  and handles requeueing. `env`, `export`, and `run` are supplied by the
  selected runtime template (apptainer/podman).
#}
#!/usr/bin/env bash
{% block directives %}
#SBATCH --open-mode=append
#SBATCH --export=NONE
#SBATCH --output=xm-%j.stdout
#SBATCH --error=xm-%j.stderr
#SBATCH --comment="{'xid': {{ experiment_id }}}"
{# Executor-level settings take precedence over cluster defaults. #}
{% if cluster.account and not job.executor.account %}
#SBATCH --account={{ cluster.account }}
{% endif %}
{% if cluster.partition and not job.executor.partition %}
#SBATCH --partition={{ cluster.partition }}
{% endif %}
{% if cluster.qos and not job.executor.qos %}
#SBATCH --qos={{ cluster.qos }}
{% endif %}
{% if identity %}
{# Singleton dependency serializes jobs sharing the same identity. #}
#SBATCH --job-name=xm[{{ experiment_id }}.{{ identity }}]
#SBATCH --dependency=singleton
{% else %}
#SBATCH --job-name=xm[{{ experiment_id }}]
{% endif %}
{% for directive in job.executor.to_directives() %}
#SBATCH {{ directive }}
{% endfor %}
{% endblock directives %}
set -Eeuxo pipefail

{% block prolog %}
{% if cluster.prolog %}
{{- cluster.prolog -}}
{% endif %}
{%- endblock prolog %}


{% block environment -%}
{{ env(cluster.environment) }}
{%- endblock environment %}

{% block proxy -%}
{%- if cluster.proxy %}
{% from 'fragments/proxy.bash.j2' import proxy %}
{% if cluster.proxy == "submission-host" %}
{{ proxy("$SLURM_SUBMIT_HOST") }}
{% else %}
{{ proxy(cluster.proxy) }}
{% endif %}
{% endif %}
{%- endblock proxy %}

{% block bootstrap %}
srun \
    --unbuffered \
    --kill-on-bad-exit=0 \
    --overlap \
    --export={{ export(job, "ALL") }} \
    bash <<'SRUN_EOF' &
set -Eeuxo pipefail
{{ run(job, cluster) }}
SRUN_EOF
{%- endblock bootstrap %}

echo "[INFO] Start timestamp: $(date)"

{%- block epilog -%}
{% if cluster.epilog %}
{{ cluster.epilog }}
{% endif %}
{%- endblock epilog %}


{% block monitor -%}
{% from 'fragments/monitor.bash.j2' import monitor %}
{{ monitor(job.executor.requeue_max_attempts, job.executor.requeue_on_exit_code) }}
{%- endblock monitor %}


echo "[INFO] End timestamp: $(date)"
|
|
@@ -0,0 +1,103 @@
|
|
|
1
|
+
{#
  Apptainer/Singularity runtime macros used by the job templates:
    env     — exports env vars for the host script, prefixing non-runtime
              vars so they propagate into the container.
    export  — builds the value for srun's --export flag from job +
              executable env vars (plus registry credentials if present).
    run     — pulls the Docker image into a sandbox and `exec`s the
              container with the cluster's mounts and overlay/writable mode.
#}
{% macro env(env_vars) -%}
{% for key, value in env_vars.items() %}
{% if key.startswith("SINGULARITY") or key.startswith("APPTAINER") or key.startswith("_") %}
{# Leading-underscore keys are escapes: export verbatim, unprefixed. #}
{% set key = key.lstrip('_') %}
export {{ key }}="{{ value }}"
{% else %}
export APPTAINERENV_{{ key }}="{{ value }}"
export SINGULARITYENV_{{ key }}="{{ value }}"
{% endif %}
{% endfor %}
{%- endmacro %}

{% macro export(job, mode=None) -%}
{%- set combined_envs = operator.or_(job.env_vars, job.executable.env_vars) -%}
{%- if job.executable.credentials -%}
{%- set combined_envs = operator.or_(combined_envs, {
    "APPTAINER_DOCKER_USERNAME": job.executable.credentials.username,
    "APPTAINER_DOCKER_PASSWORD": job.executable.credentials.password,
    "SINGULARITY_DOCKER_USERNAME": job.executable.credentials.username,
    "SINGULARITY_DOCKER_PASSWORD": job.executable.credentials.password,
})
-%}
{%- endif %}

{%- set env_strings = [] -%}
{%- for key, value in combined_envs.items() -%}
{%- if key.startswith("SINGULARITY") or key.startswith("APPTAINER") -%}
{%- set _ = env_strings.append('{0}="{1}"'.format(key, value)) -%}
{%- else -%}
{%- set _ = env_strings.append('APPTAINERENV_{0}="{1}",SINGULARITYENV_{0}="{1}"'.format(key, value)) -%}
{%- endif -%}
{%- endfor -%}

{%- if mode is not none -%}
{{- mode -}}{{- "," if combined_envs -}}
{%- endif -%}

{{- env_strings | join(",") -}}
{% endmacro %}

{% macro run(job, cluster) -%}
# Determine which binary to use or if an error should be raised
if [[ $(command -v apptainer) ]]; then
  readonly CONTAINER_RUNTIME="apptainer"
elif [[ $(command -v singularity) ]]; then
  readonly CONTAINER_RUNTIME="singularity"
else
  echo "Error: Neither singularity nor apptainer binaries found" >&2
  exit 1
fi

# Bundle will be where our built sandbox image is stored
# container-workdir will be our container's scratch directory
mkdir -p "$SLURM_TMPDIR"/{container,container-workdir,container-overlay}

time ${CONTAINER_RUNTIME} build \
  --force \
  --sandbox \
  --fix-perms \
  "$SLURM_TMPDIR"/container \
  docker://{{ job.executable.image }}

{# Singularity lacks overlay support here: pre-create mount points in the
   sandbox so bind mounts succeed in --writable mode. #}
{% if (cluster.runtime | string) == "singularity" and cluster.mounts %}
{% for source, dest in cluster.mounts.items() %}
mkdir -p "$SLURM_TMPDIR"/container/{{ dest | trim('/') }}
{% endfor %}
{% endif %}

exec ${CONTAINER_RUNTIME} run \
{% if job.executor.requirements.accelerator %}
  --nv \
{% endif %}
  --no-init \
  --no-umask \
  --no-home \
  --cleanenv \
  --containall \
{% if cluster.mounts %}
{% for source, dest in cluster.mounts.items() %}
  --mount type=bind,src={{ source }},dst={{ dest }} \
{% endfor %}
{% endif %}
  --workdir "$SLURM_TMPDIR"/container-workdir \
{% if (cluster.runtime | string) == "apptainer" %}
  --overlay "$SLURM_TMPDIR"/container-overlay \
{% else %}
  --writable \
{% endif %}
{% if job.executable.workdir %}
  --pwd {{ job.executable.workdir }} \
{% endif %}
  "$SLURM_TMPDIR"/container \
{% for arg in job.executable.args.to_list() %}
  {{ arg }} \
{% endfor %}
{% for arg in job.args.to_list() %}
  {{ arg }} \
{% endfor %}
{% if caller %}
{{- caller() -}}
{% endif %}
  "$@"
{%- endmacro %}
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
{#
  Podman runtime macros, mirroring the apptainer template's interface:
    env / export — propagate job env vars under a PODMANENV_ prefix.
    run          — pulls the image (with optional registry creds) and
                   `exec`s `podman run` with the cluster's mounts.
#}
{% macro env(env_vars) -%}
{% for key, value in env_vars.items() %}
export PODMANENV_{{ key }}="{{ value }}"
{% endfor %}
{%- endmacro %}

{% macro export(job, mode=None) -%}
{%- set combined_envs = operator.or_(job.env_vars, job.executable.env_vars) -%}

{%- set env_strings = [] -%}
{%- for key, value in combined_envs.items() -%}
{%- set _ = env_strings.append('PODMANENV_{0}="{1}"'.format(key, value)) -%}
{%- endfor -%}

{%- if mode is not none -%}
{{- mode -}}{{- "," if combined_envs -}}
{%- endif -%}

{{- env_strings | join(",") -}}
{% endmacro %}

{% macro run(job, cluster) -%}
podman pull \
{% if job.executable.credentials %}
  --creds {{ job.executable.credentials.username }}:{{ job.executable.credentials.password }} \
{% endif %}
  {{ job.executable.image }}

exec podman run \
{# NOTE(review): `--env PODMANENV*` forwards host vars with the prefix
   intact, so the containerized app would see PODMANENV_-prefixed names —
   confirm that is the intended contract. #}
  --env PODMANENV* \
  --pull never \
  --restart no \
  --rm \
{% if job.executor.requirements.accelerator %}
  --device nvidia.com/gpu=all \
{% endif %}
{% if cluster.mounts %}
{% for source, dest in cluster.mounts.items() %}
  --mount type=bind,src={{ source }},dst={{ dest }} \
{% endfor %}
{% endif %}
{% if job.executable.workdir %}
  --workdir {{ job.executable.workdir }} \
{% endif %}
  {{ job.executable.image }} \
{% for arg in job.executable.args.to_list() %}
  {{ arg }} \
{% endfor %}
{% for arg in job.args.to_list() %}
  {{ arg }} \
{% endfor %}
{% if caller %}
{{- caller() -}}
{% endif %}
  "$@"
{% endmacro %}
|