xmanager-slurm 0.4.19__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xm_slurm/__init__.py +47 -0
- xm_slurm/api/__init__.py +33 -0
- xm_slurm/api/abc.py +65 -0
- xm_slurm/api/models.py +70 -0
- xm_slurm/api/sqlite/client.py +358 -0
- xm_slurm/api/web/client.py +173 -0
- xm_slurm/batching.py +139 -0
- xm_slurm/config.py +189 -0
- xm_slurm/console.py +3 -0
- xm_slurm/constants.py +19 -0
- xm_slurm/contrib/__init__.py +0 -0
- xm_slurm/contrib/clusters/__init__.py +67 -0
- xm_slurm/contrib/clusters/drac.py +242 -0
- xm_slurm/dependencies.py +171 -0
- xm_slurm/executables.py +215 -0
- xm_slurm/execution.py +995 -0
- xm_slurm/executors.py +210 -0
- xm_slurm/experiment.py +1016 -0
- xm_slurm/experimental/parameter_controller.py +206 -0
- xm_slurm/filesystems.py +129 -0
- xm_slurm/job_blocks.py +21 -0
- xm_slurm/metadata_context.py +253 -0
- xm_slurm/packageables.py +309 -0
- xm_slurm/packaging/__init__.py +8 -0
- xm_slurm/packaging/docker.py +348 -0
- xm_slurm/packaging/registry.py +45 -0
- xm_slurm/packaging/router.py +56 -0
- xm_slurm/packaging/utils.py +22 -0
- xm_slurm/resources.py +350 -0
- xm_slurm/scripts/_cloudpickle.py +28 -0
- xm_slurm/scripts/cli.py +90 -0
- xm_slurm/status.py +197 -0
- xm_slurm/templates/docker/docker-bake.hcl.j2 +54 -0
- xm_slurm/templates/docker/mamba.Dockerfile +29 -0
- xm_slurm/templates/docker/python.Dockerfile +32 -0
- xm_slurm/templates/docker/uv.Dockerfile +38 -0
- xm_slurm/templates/slurm/entrypoint.bash.j2 +27 -0
- xm_slurm/templates/slurm/fragments/monitor.bash.j2 +78 -0
- xm_slurm/templates/slurm/fragments/proxy.bash.j2 +31 -0
- xm_slurm/templates/slurm/job-array.bash.j2 +31 -0
- xm_slurm/templates/slurm/job-group.bash.j2 +47 -0
- xm_slurm/templates/slurm/job.bash.j2 +90 -0
- xm_slurm/templates/slurm/library/retry.bash +62 -0
- xm_slurm/templates/slurm/runtimes/apptainer.bash.j2 +73 -0
- xm_slurm/templates/slurm/runtimes/podman.bash.j2 +43 -0
- xm_slurm/types.py +23 -0
- xm_slurm/utils.py +196 -0
- xmanager_slurm-0.4.19.dist-info/METADATA +28 -0
- xmanager_slurm-0.4.19.dist-info/RECORD +52 -0
- xmanager_slurm-0.4.19.dist-info/WHEEL +4 -0
- xmanager_slurm-0.4.19.dist-info/entry_points.txt +2 -0
- xmanager_slurm-0.4.19.dist-info/licenses/LICENSE.md +227 -0
xm_slurm/status.py
ADDED
@@ -0,0 +1,197 @@
"""Implementation of Slurm work unit statuses."""

import enum
import re
import typing as tp

from xmanager import xm


class SlurmJobState(enum.Enum):
    BOOT_FAIL = enum.auto()
    CANCELLED = enum.auto()
    COMPLETED = enum.auto()
    DEADLINE = enum.auto()
    FAILED = enum.auto()
    NODE_FAIL = enum.auto()
    OUT_OF_MEMORY = enum.auto()
    PENDING = enum.auto()
    PREEMPTED = enum.auto()
    RUNNING = enum.auto()
    REQUEUED = enum.auto()
    RESIZING = enum.auto()
    REVOKED = enum.auto()
    SUSPENDED = enum.auto()
    TIMEOUT = enum.auto()

    @property
    def message(self) -> str:
        match self:
            case SlurmJobState.BOOT_FAIL:
                return (
                    "Job terminated due to launch failure, "
                    "typically due to a hardware failure (e.g. unable to boot "
                    "the node or block and the job can not be requeued)."
                )
            case SlurmJobState.CANCELLED:
                return (
                    "Job was explicitly cancelled by the user or "
                    "system administrator. The job may or may not have been "
                    "initiated."
                )
            case SlurmJobState.COMPLETED:
                return "Job has terminated all processes on all nodes with an exit code of zero."
            case SlurmJobState.DEADLINE:
                return "Job terminated on deadline."
            case SlurmJobState.FAILED:
                return "Job terminated with non-zero exit code or other failure condition."
            case SlurmJobState.NODE_FAIL:
                return "Job terminated due to failure of one or more allocated nodes."
            case SlurmJobState.OUT_OF_MEMORY:
                return "Job experienced out of memory error."
            case SlurmJobState.PENDING:
                return "Job is awaiting resource allocation."
            case SlurmJobState.PREEMPTED:
                return "Job terminated due to preemption."
            case SlurmJobState.RUNNING:
                return "Job currently has an allocation."
            case SlurmJobState.REQUEUED:
                return "Job was requeued."
            case SlurmJobState.RESIZING:
                return "Job is about to change size."
            case SlurmJobState.REVOKED:
                return "Sibling was removed from cluster due to other cluster starting the job."
            case SlurmJobState.SUSPENDED:
                return "Job has an allocation, but execution has been suspended."
            case SlurmJobState.TIMEOUT:
                return "Job terminated upon reaching its time limit."
            case _:
                raise ValueError(f"Invalid Slurm job state: {self}")

    def __str__(self) -> str:
        return f"{self.name}: {self.message}"

    @classmethod
    def from_str(cls, state: str) -> "SlurmJobState":
        return cls[state]

    @classmethod
    def from_slurm_str(cls, state: str) -> "SlurmJobState":
        _SLURM_JOB_STATE_REGEX = re.compile(f"({'|'.join(entry.name for entry in cls)})\\s?.*")
        match = _SLURM_JOB_STATE_REGEX.match(state)
        assert match and len(match.groups()) == 1, f"Failed to parse job state, {state!r}"
        return cls.from_str(match.group(1))


SlurmPendingJobStates = set([
    SlurmJobState.PENDING,
    SlurmJobState.REQUEUED,
    SlurmJobState.RESIZING,
])
SlurmRunningJobStates = set([
    SlurmJobState.RUNNING,
    SlurmJobState.SUSPENDED,
])
SlurmActiveJobStates = SlurmPendingJobStates | SlurmRunningJobStates
SlurmCompletedJobStates = set([SlurmJobState.COMPLETED])
SlurmFailedJobStates = set([
    SlurmJobState.BOOT_FAIL,
    SlurmJobState.DEADLINE,
    SlurmJobState.FAILED,
    SlurmJobState.NODE_FAIL,
    SlurmJobState.OUT_OF_MEMORY,
    SlurmJobState.PREEMPTED,
    SlurmJobState.REVOKED,
    SlurmJobState.TIMEOUT,
])
SlurmCancelledJobStates = set([SlurmJobState.CANCELLED])

assert (
    SlurmPendingJobStates
    | SlurmRunningJobStates
    | SlurmActiveJobStates
    | SlurmCompletedJobStates
    | SlurmFailedJobStates
    | SlurmCancelledJobStates
) == set(SlurmJobState.__members__.values()), "Slurm job states are not exhaustive."


class SlurmWorkUnitStatusEnum(enum.IntEnum):
    """Status of a local experiment job."""

    # Work unit was created, but has not started yet.
    PENDING = 0
    # Work unit was created, but has not terminated yet.
    RUNNING = 1
    # Work unit terminated and was successful.
    COMPLETED = 2
    # Work unit terminated and was not successful.
    FAILED = 3
    # Work unit terminated because it was cancelled by the user.
    CANCELLED = 4

    @classmethod
    def from_job_state(cls, state: SlurmJobState) -> "SlurmWorkUnitStatusEnum":
        """Convert a Slurm job state to a SlurmWorkUnitStatusEnum."""
        if state in SlurmPendingJobStates:
            return cls.PENDING
        elif state in SlurmRunningJobStates:
            return cls.RUNNING
        elif state in SlurmCompletedJobStates:
            return cls.COMPLETED
        elif state in SlurmFailedJobStates:
            return cls.FAILED
        elif state in SlurmCancelledJobStates:
            return cls.CANCELLED
        else:
            raise ValueError(f"Invalid Slurm job state: {state}")


class SlurmWorkUnitStatus(xm.ExperimentUnitStatus):
    """Status of a Slurm experiment job."""

    @classmethod
    def aggregate(cls, states: tp.Sequence[SlurmJobState]) -> "SlurmWorkUnitStatus":
        """Aggregate a sequence of statuses into a single status."""
        assert len(states) > 0, "Cannot aggregate empty sequence of statuses."
        max_error_state: SlurmJobState | None = None
        for state in states:
            if not max_error_state:
                max_error_state = state
            elif SlurmWorkUnitStatusEnum.from_job_state(
                state
            ) > SlurmWorkUnitStatusEnum.from_job_state(max_error_state):
                max_error_state = state
        assert max_error_state is not None
        return cls(max_error_state)

    def __init__(self, state: SlurmJobState) -> None:
        super().__init__()
        self._state = state
        self._status = SlurmWorkUnitStatusEnum.from_job_state(state)

    @property
    def is_active(self) -> bool:
        return (
            self._status == SlurmWorkUnitStatusEnum.RUNNING
            or self._status == SlurmWorkUnitStatusEnum.PENDING
        )

    @property
    def is_completed(self) -> bool:
        return self._status == SlurmWorkUnitStatusEnum.COMPLETED

    @property
    def is_failed(self) -> bool:
        return self._status == SlurmWorkUnitStatusEnum.FAILED

    @property
    def status(self) -> SlurmWorkUnitStatusEnum:
        return self._status

    @property
    def message(self) -> str:
        return str(self._state)

    def __repr__(self) -> str:
        return f"<SlurmWorkUnitStatus {self._state!r}>"
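
The classes above compose into a single work-unit status: raw Slurm state strings are parsed with SlurmJobState.from_slurm_str and then reduced with SlurmWorkUnitStatus.aggregate. The following is a minimal usage sketch, not part of the diff; the state strings are illustrative values of the kind sacct/squeue report.

from xm_slurm.status import SlurmJobState, SlurmWorkUnitStatus, SlurmWorkUnitStatusEnum

# Raw state strings as they might appear in `sacct --format=State` output;
# the trailing "by <uid>" suffix is stripped by from_slurm_str's regex.
states = [
    SlurmJobState.from_slurm_str("COMPLETED"),
    SlurmJobState.from_slurm_str("CANCELLED by 1000"),
    SlurmJobState.from_slurm_str("RUNNING"),
]

status = SlurmWorkUnitStatus.aggregate(states)
assert status.status is SlurmWorkUnitStatusEnum.CANCELLED  # largest enum value wins
assert not status.is_active and not status.is_failed       # CANCELLED is terminal but distinct from FAILED
print(status.message)  # "CANCELLED: Job was explicitly cancelled by the user or system administrator. ..."
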
xm_slurm/templates/docker/docker-bake.hcl.j2
ADDED
@@ -0,0 +1,54 @@
{% for executable, executors in executables.items() %}
target "{{ hash(executable) }}" {
  dockerfile-inline = <<EOF
{{ executable.dockerfile.read_text() }}
EOF
  context = "{{ executable.context }}"
  {% if executable.ssh %}
  ssh = [
    {% for ssh_val in executable.ssh %}
    "{{ ssh_val }}"{% if not loop.last %},{% endif %}
    {% endfor %}
  ]
  {% endif %}
  {% if executable.target %}
  target = "{{ executable.target }}"
  {% endif %}
  pull = false
  tags = [
    {% for executor in executors %}
    "{{ executor.tag }}"{% if not loop.last %},{% endif %}
    {% endfor %}
  ]
  output = [
    "type=registry"
  ]
  {% if executable.cache_from %}
  cache-from = [
    {% for cache_from in executable.cache_from %}
    "{{ cache_from }}"{% if not loop.last %},{% endif %}
    {% endfor %}
  ]
  {% endif %}
  cache-to = [
    "type=inline"
  ]
  platforms = [
    {% for platform in executable.platforms %}
    "{{ platform }}"{% if not loop.last %},{% endif %}
    {% endfor %}
  ]
  labels = {
    {% for key, value in executable.labels.items() %}
    "{{ key }}" = "{{ value }}"{% if not loop.last %},{% endif %}
    {% endfor %}
  }
  {% if executable.build_args %}
  args = {
    {% for key, value in executable.build_args.items() %}
    "{{ key }}" = "{{ value }}"{% if not loop.last %},{% endif %}
    {% endfor %}
  }
  {% endif %}
}
{% endfor %}
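
To show how this Jinja2 template is consumed, here is a hedged rendering sketch, not the package's own code path (that presumably lives in xm_slurm/packaging/docker.py, which is not shown here). FakeExecutable, FakeExecutor, the hash global, the template directory path, and the registry tag are all illustrative assumptions.

import dataclasses
import hashlib
import pathlib

import jinja2


@dataclasses.dataclass(frozen=True, eq=False)  # eq=False keeps identity hashing so instances can be dict keys
class FakeExecutable:  # hypothetical stand-in for the package's Dockerfile-based executable spec
    dockerfile: pathlib.Path
    context: str
    platforms: tuple[str, ...] = ("linux/amd64",)
    labels: dict = dataclasses.field(default_factory=dict)
    build_args: dict = dataclasses.field(default_factory=dict)
    ssh: tuple[str, ...] = ()
    target: str | None = None
    cache_from: tuple[str, ...] = ()


@dataclasses.dataclass(frozen=True, eq=False)
class FakeExecutor:  # hypothetical stand-in carrying the image tag to push
    tag: str


# The template calls read_text() on the Dockerfile path, so the file must exist.
pathlib.Path("Dockerfile.example").write_text("FROM docker.io/python:3.11-slim\n")

env = jinja2.Environment(
    loader=jinja2.FileSystemLoader("xm_slurm/templates/docker"),  # path assumed relative to an unpacked copy of the wheel
    undefined=jinja2.StrictUndefined,
)
# The template expects a `hash` global; a content digest is one plausible choice.
env.globals["hash"] = lambda e: hashlib.sha256(repr(e).encode()).hexdigest()[:12]

bake_hcl = env.get_template("docker-bake.hcl.j2").render(
    executables={
        FakeExecutable(dockerfile=pathlib.Path("Dockerfile.example"), context="."): [
            FakeExecutor(tag="registry.example.com/project/image:latest"),
        ],
    }
)
print(bake_hcl)  # HCL suitable for `docker buildx bake --file -`
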
xm_slurm/templates/docker/mamba.Dockerfile
ADDED
@@ -0,0 +1,29 @@
# syntax=docker/dockerfile:1.4
ARG BASE_IMAGE=gcr.io/distroless/base-debian10

FROM docker.io/mambaorg/micromamba:bookworm-slim as mamba
ARG CONDA_ENVIRONMENT=environment.yml

USER root

COPY $CONDA_ENVIRONMENT /tmp/

# Setup mamba environment
RUN --mount=type=cache,target=/opt/conda/pkgs \
    --mount=type=cache,target=/root/.cache/pip \
    --mount=type=ssh \
    micromamba create --yes --always-copy --no-pyc --prefix /opt/env --file /tmp/environment.yml

RUN find /opt/env/ -follow -type f -name '*.a' -delete && \
    find /opt/env/ -follow -type f -name '*.js.map' -delete

FROM $BASE_IMAGE

COPY --link --from=mamba /opt/env /opt/env

ENV PATH=$PATH:/opt/env/bin

WORKDIR /workspace
COPY --link . /workspace

ENTRYPOINT ["/opt/env/bin/python"]
xm_slurm/templates/docker/python.Dockerfile
ADDED
@@ -0,0 +1,32 @@
# syntax=docker/dockerfile:1.4
ARG BASE_IMAGE=docker.io/python:3.10-slim-bookworm
FROM $BASE_IMAGE as builder
COPY --from=ghcr.io/astral-sh/uv:latest /uv /bin/uv

ARG EXTRA_SYSTEM_PACKAGES=""
ARG EXTRA_PYTHON_PACKAGES=""

ENV UV_PYTHON_DOWNLOADS=0
ENV UV_COMPILE_BYTECODE=1
ENV UV_LINK_MODE=copy

WORKDIR /workspace

RUN apt-get update \
    && apt-get install -y --no-install-recommends \
        git $EXTRA_SYSTEM_PACKAGES \
    && rm -rf /var/lib/apt/lists/*

# Install and update necessary global Python packages
RUN uv pip install --system pysocks $EXTRA_PYTHON_PACKAGES

ARG PIP_REQUIREMENTS=requirements.txt

RUN --mount=type=cache,target=/root/.cache/uv \
    --mount=type=bind,source=$PIP_REQUIREMENTS,target=requirements.txt \
    --mount=type=ssh \
    uv pip install --system --requirement requirements.txt

COPY --link . /workspace

ENTRYPOINT [ "python" ]
xm_slurm/templates/docker/uv.Dockerfile
ADDED
@@ -0,0 +1,38 @@
# syntax=docker/dockerfile:1.4
ARG BASE_IMAGE=docker.io/python:3.10-slim-bookworm
FROM $BASE_IMAGE
COPY --from=ghcr.io/astral-sh/uv:latest /uv /bin/uv

ARG EXTRA_SYSTEM_PACKAGES=""
ARG EXTRA_PYTHON_PACKAGES=""

WORKDIR /workspace

ENV UV_PYTHON_DOWNLOADS=0
ENV UV_COMPILE_BYTECODE=1
ENV UV_LINK_MODE=copy

RUN apt-get update \
    && apt-get install -y --no-install-recommends \
        git $EXTRA_SYSTEM_PACKAGES \
    && rm -rf /var/lib/apt/lists/*

RUN --mount=type=cache,target=/root/.cache/uv \
    uv pip install --system pysocks $EXTRA_PYTHON_PACKAGES

RUN uv venv --system-site-packages

ENV PATH="/workspace/.venv/bin:$PATH"

RUN --mount=type=cache,target=/root/.cache/uv \
    --mount=type=bind,source=uv.lock,target=uv.lock \
    --mount=type=bind,source=pyproject.toml,target=pyproject.toml \
    --mount=type=ssh \
    uv sync --frozen --no-install-project --no-dev --no-editable

COPY --link . /workspace
RUN --mount=type=cache,target=/root/.cache/uv \
    --mount=type=ssh \
    uv sync --frozen --no-dev

ENTRYPOINT [ "python" ]
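
As a hedged sketch of building the uv-based image above by hand, outside whatever pipeline the package's own packaging code drives (the bake template suggests docker buildx, but that is an inference): the build context must contain pyproject.toml and uv.lock, since the Dockerfile bind-mounts them; the image name and extra packages below are illustrative.

import subprocess

subprocess.run(
    [
        "docker", "buildx", "build",
        "--file", "xm_slurm/templates/docker/uv.Dockerfile",
        "--build-arg", "BASE_IMAGE=docker.io/python:3.11-slim-bookworm",
        "--build-arg", "EXTRA_SYSTEM_PACKAGES=build-essential",
        "--ssh", "default",  # forwarded to the Dockerfile's --mount=type=ssh for private dependencies
        "--tag", "registry.example.com/project/image:latest",
        ".",
    ],
    check=True,
)
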
xm_slurm/templates/slurm/entrypoint.bash.j2
ADDED
@@ -0,0 +1,27 @@
{%- macro entrypoint(cluster, job) -%}
#!/bin/sh
set -eux

{% if cluster.container_environment %}
# Cluster environment variables
{% for key, value in cluster.container_environment.items() %}
export {{ key }}="{{ value }}"
{% endfor %}
{%- endif %}

{% if job.executable.env_vars %}
# Executable environment variables
{% for key, value in job.executable.env_vars.items() %}
export {{ key }}="{{ value }}"
{% endfor %}
{%- endif %}

{% if job.env_vars %}
# Job environment variables
{% for key, value in job.env_vars.items() %}
export {{ key }}="{{ value }}"
{% endfor %}
{%- endif %}

exec {{ job.executable.entrypoint.to_list() | join(' ') }} "$@"
{%- endmacro -%}
xm_slurm/templates/slurm/fragments/monitor.bash.j2
ADDED
@@ -0,0 +1,78 @@
{% macro monitor(requeue_max_attempts, requeue_exit_code, requeue_on_timeout, requeue_timeout) -%}
__xm_slurm_wait_for_children() {
    if [[ -n "${SLURM_ARRAY_JOB_ID:-}" ]]; then
        local -r JOB_ID="${SLURM_ARRAY_JOB_ID}_${SLURM_ARRAY_TASK_ID}"
    else
        local -r JOB_ID="${SLURM_JOB_ID}"
    fi

    # If there are no child jobs we should error out
    children=( $(jobs -p) )
    {% raw %}
    if [ ${#children[@]} -eq 0 ]; then
    {% endraw %}
        echo "ERROR: no child jobs exist..." >&2
        exit 1
    fi

    {% if requeue_on_timeout %}
    # Start a watchdog process to signal timeout.
    sleep {{ requeue_timeout }} &
    timeout_pid=$!
    {% endif %}

    {% raw %}
    while [ ${#children[@]} -gt 0 ]; do
    {% endraw %}
        echo "INFO: Waiting for child processes to finish..."
        set +e
        {% if requeue_on_timeout %}
        # Wait on either one of the child processes or the timeout process.
        wait -n -p child_pid "${children[@]}" "${timeout_pid}"
        {% else %}
        wait -n -p child_pid "${children[@]}"
        {% endif %}
        local child_exit_code=$?
        set -e

        {% if requeue_on_timeout %}
        # If the finished process is the watchdog, trigger the timeout handling.
        if [ "${child_pid}" = "${timeout_pid}" ]; then
            echo "INFO: Timeout of {{ requeue_timeout }} seconds reached. Killing remaining processes: ${children[*]}" >&2
            kill "${children[@]}" 2>/dev/null || true
            scontrol requeue "${JOB_ID}"
            exit {{ requeue_exit_code }}
        fi
        {% endif %}

        echo "INFO: Process ${child_pid} finished with exit code ${child_exit_code}."

        # Handle the exit code of the finished process.
        if [ "${child_exit_code}" -eq "{{ requeue_exit_code }}" ] && [ "${SLURM_RESTART_COUNT:-0}" -le "{{ requeue_max_attempts }}" ]; then
            echo "INFO: Received requeue exit code {{ requeue_exit_code }} from process ${child_pid}. Requeuing Slurm job ${JOB_ID} after ${SLURM_RESTART_COUNT-0} restarts." >&2
            scontrol requeue "${JOB_ID}"
            exit {{ requeue_exit_code }}
        elif [ "${child_exit_code}" -ne 0 ]; then
            echo "ERROR: Process ${child_pid} exited with code ${child_exit_code}." >&2
            exit "${child_exit_code}"
        fi

        # Remove the finished PID from the array in a concise way.
        for i in "${!children[@]}"; do
            if [ "${children[i]}" = "$child_pid" ]; then
                unset 'children[i]'
                break
            fi
        done

        # Reindex the array.
        children=( "${children[@]}" )
    done

    {% if requeue_on_timeout %}
    kill "$timeout_pid" 2>/dev/null || true
    {% endif %}
}

__xm_slurm_wait_for_children
{%- endmacro %}
xm_slurm/templates/slurm/fragments/proxy.bash.j2
ADDED
@@ -0,0 +1,31 @@
{% macro proxy(host) -%}
__xm_slurm_proxy() {
    local -r GATEWAY="$1"
    local PORT

    # Find an open port
    while
        PORT="$(shuf -n 1 -i 1024-65535)"
        netstat -atun | grep -q "$PORT"
    do
        sleep 0.25
        continue
    done

    # Reverse proxy through the gateway
    ssh -D "$PORT" "$GATEWAY" -N -f

    # Export all env vars for applications to pick up on proxy
    export ALL_PROXY="socks5://127.0.0.1:$PORT"
    export all_proxy="socks5://127.0.0.1:$PORT"

    export HTTP_PROXY="socks5://127.0.0.1:$PORT"
    export http_proxy="socks5://127.0.0.1:$PORT"

    export HTTPS_PROXY="socks5://127.0.0.1:$PORT"
    export https_proxy="socks5://127.0.0.1:$PORT"

    export JAVA_OPTS="-DsocksProxyHost=127.0.0.1 -DsocksProxyPort=$PORT"
}
__xm_slurm_proxy "{{ host }}"
{%- endmacro %}
xm_slurm/templates/slurm/job-array.bash.j2
ADDED
@@ -0,0 +1,31 @@
{% extends "job.bash.j2" %}
{% block directives %}
{{ super() -}}
#SBATCH --array=0-{{ args | length - 1 }}
#SBATCH --output=slurm-%A_%a.out
{% endblock directives %}

{% block bootstrap %}
srun \
    --label \
    --unbuffered \
    --kill-on-bad-exit=1 \
    --export="ALL" \
    {% for directive in job.executor.step_directives() %}
    {{ directive }} \
    {% endfor %}
    bash <<'SRUN_EOF' &
set -Eeuxo pipefail

readonly XM_SLURM_TRIAL_ARGS=(
    {% for trial in args %}
    "{{ trial.to_list() | join(" ") }}"
    {% endfor %}
)

{% call run(cluster, job) %}
    ${XM_SLURM_TRIAL_ARGS[$SLURM_ARRAY_TASK_ID]} \
{% endcall %}

SRUN_EOF
{%- endblock bootstrap %}
xm_slurm/templates/slurm/job-group.bash.j2
ADDED
@@ -0,0 +1,47 @@
{% extends "job.bash.j2" %}
{% block directives %}
#SBATCH --export=NONE
#SBATCH --comment="{'xid': {{ experiment_id }}}"
{% if cluster.account %}
#SBATCH --account={{ cluster.account }}
{% endif %}
{% if cluster.partition %}
#SBATCH --partition={{ cluster.partition }}
{% endif %}
{% if cluster.qos %}
#SBATCH --qos={{ cluster.qos }}
{% endif %}

{% for job_name, job in job_group.jobs.items() %}
#SBATCH --output=xm-%j+{{ job_name }}.stdout
#SBATCH --error=xm-%j+{{ job_name }}.stderr
{% if identity %}
#SBATCH --job-name=xm[{{ job_name }}@{{ experiment_id }}.{{ identity }}]
#SBATCH --dependency=singleton
{% else %}
#SBATCH --job-name=xm[{{ job_name }}@{{ experiment_id }}]
{% endif %}
{% for directive in job.executor.batch_directives() %}
#SBATCH {{ directive }}
{% endfor %}
{{ "\n#SBATCH hetjob\n" if not loop.last }}
{% endfor %}
{% endblock directives %}

{% block bootstrap %}
{% for job in job_group.jobs.values() +%}
srun \
    --label \
    --unbuffered \
    --kill-on-bad-exit=1 \
    --export="ALL" \
    {% for directive in job.executor.step_directives() %}
    {{ directive }} \
    {% endfor %}
    --het-group={{ loop.index0 }} \
    bash <<'SRUN_EOF' &
set -Eeuxo pipefail
{{ run(cluster, job) }}
SRUN_EOF
{% endfor +%}
{% endblock bootstrap %}
xm_slurm/templates/slurm/job.bash.j2
ADDED
@@ -0,0 +1,90 @@
#!/usr/bin/env bash
{% block directives %}
#SBATCH --open-mode=append
#SBATCH --export=NONE
#SBATCH --output=slurm-%j.out
#SBATCH --comment="{'xid': {{ experiment_id }}}"
{% if cluster.account and not job.executor.account %}
#SBATCH --account={{ cluster.account }}
{% endif %}
{% if cluster.partition and not job.executor.partition %}
#SBATCH --partition={{ cluster.partition }}
{% endif %}
{% if cluster.qos and not job.executor.qos %}
#SBATCH --qos={{ cluster.qos }}
{% endif %}
{% if identity %}
#SBATCH --job-name=xm[{{ experiment_id }}.{{ identity }}]
{% else %}
{% if dependency %}
#SBATCH {{ dependency.to_directive() }}
{% endif %}
#SBATCH --job-name=xm[{{ experiment_id }}]
{% endif %}
{% for directive in job.executor.batch_directives() %}
#SBATCH {{ directive }}
{% endfor %}
{% endblock directives %}
set -Eeuxo pipefail

{% if stdlib %}
# --- Helper functions ---
{% for fn in stdlib %}
{{ fn }}
{% endfor %}
{% endif %}

{% block prolog %}
{% if cluster.prolog %}
{{- cluster.prolog -}}
{% endif %}
{%- endblock prolog %}

{% block environment -%}
{% for key, value in cluster.host_environment.items() %}
export {{ key }}="{{ value }}"
{% endfor %}
{%- endblock environment %}

{% block proxy -%}
{%- if cluster.proxy %}
{% from 'fragments/proxy.bash.j2' import proxy %}
{% if cluster.proxy == "submission-host" %}
{{ proxy("$SLURM_SUBMIT_HOST") }}
{% else %}
{{ proxy(cluster.proxy) }}
{% endif %}
{% endif %}
{%- endblock proxy %}

{% block bootstrap %}
srun \
    --label \
    --unbuffered \
    --kill-on-bad-exit=1 \
    --export="ALL" \
    {% for directive in job.executor.step_directives() %}
    {{ directive }} \
    {% endfor %}
    bash <<'SRUN_EOF' &
set -Eeuxo pipefail
{{ run(cluster, job) }}
SRUN_EOF
{%- endblock bootstrap %}

echo "[INFO] Start timestamp: $(date)"

{%- block epilog -%}
{% if cluster.epilog %}
{{ cluster.epilog }}
{% endif %}
{%- endblock epilog %}


{% block monitor -%}
{% from 'fragments/monitor.bash.j2' import monitor %}
{{ monitor(job.executor.requeue_max_attempts, job.executor.requeue_on_exit_code, job.executor.requeue_on_timeout, job.executor.requeue_timeout.seconds) }}
{%- endblock monitor %}


echo "[INFO] End timestamp: $(date)"