xmanager-slurm 0.4.5__py3-none-any.whl → 0.4.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of xmanager-slurm might be problematic. Click here for more details.
- xm_slurm/__init__.py +0 -2
- xm_slurm/api/__init__.py +33 -0
- xm_slurm/api/abc.py +65 -0
- xm_slurm/api/models.py +70 -0
- xm_slurm/api/sqlite/client.py +358 -0
- xm_slurm/api/web/client.py +173 -0
- xm_slurm/config.py +11 -3
- xm_slurm/contrib/clusters/__init__.py +3 -6
- xm_slurm/contrib/clusters/drac.py +4 -3
- xm_slurm/executables.py +4 -7
- xm_slurm/execution.py +290 -159
- xm_slurm/experiment.py +26 -180
- xm_slurm/filesystem.py +129 -0
- xm_slurm/metadata_context.py +253 -0
- xm_slurm/packageables.py +0 -9
- xm_slurm/packaging/docker.py +72 -22
- xm_slurm/packaging/utils.py +0 -108
- xm_slurm/scripts/cli.py +9 -2
- xm_slurm/templates/docker/uv.Dockerfile +6 -3
- xm_slurm/templates/slurm/entrypoint.bash.j2 +27 -0
- xm_slurm/templates/slurm/job-array.bash.j2 +4 -4
- xm_slurm/templates/slurm/job-group.bash.j2 +2 -2
- xm_slurm/templates/slurm/job.bash.j2 +5 -4
- xm_slurm/templates/slurm/runtimes/apptainer.bash.j2 +18 -54
- xm_slurm/templates/slurm/runtimes/podman.bash.j2 +9 -24
- xm_slurm/utils.py +122 -41
- {xmanager_slurm-0.4.5.dist-info → xmanager_slurm-0.4.7.dist-info}/METADATA +7 -3
- xmanager_slurm-0.4.7.dist-info/RECORD +51 -0
- {xmanager_slurm-0.4.5.dist-info → xmanager_slurm-0.4.7.dist-info}/WHEEL +1 -1
- xm_slurm/api.py +0 -528
- xmanager_slurm-0.4.5.dist-info/RECORD +0 -44
- {xmanager_slurm-0.4.5.dist-info → xmanager_slurm-0.4.7.dist-info}/entry_points.txt +0 -0
- {xmanager_slurm-0.4.5.dist-info → xmanager_slurm-0.4.7.dist-info}/licenses/LICENSE.md +0 -0
xm_slurm/packageables.py
CHANGED
|
@@ -44,7 +44,6 @@ def docker_container(
|
|
|
44
44
|
build_args: Mapping[str, str] = immutabledict.immutabledict(),
|
|
45
45
|
cache_from: str | Sequence[str] | None = None,
|
|
46
46
|
labels: Mapping[str, str] = immutabledict.immutabledict(),
|
|
47
|
-
workdir: pathlib.Path | None = None,
|
|
48
47
|
args: xm.UserArgs | None = None,
|
|
49
48
|
env_vars: Mapping[str, str] = immutabledict.immutabledict(),
|
|
50
49
|
) -> xm.Packageable:
|
|
@@ -59,7 +58,6 @@ def docker_container(
|
|
|
59
58
|
build_args: Build arguments to docker.
|
|
60
59
|
cache_from: Where to pull the BuildKit cache from. See `--cache-from` in `docker build`.
|
|
61
60
|
labels: The container labels.
|
|
62
|
-
workdir: The working directory in container.
|
|
63
61
|
args: The user arguments to pass to the executable.
|
|
64
62
|
env_vars: The environment variables to pass to the executable.
|
|
65
63
|
|
|
@@ -93,7 +91,6 @@ def docker_container(
|
|
|
93
91
|
ssh=ssh,
|
|
94
92
|
build_args=build_args,
|
|
95
93
|
cache_from=cache_from,
|
|
96
|
-
workdir=workdir,
|
|
97
94
|
labels=labels,
|
|
98
95
|
),
|
|
99
96
|
args=xm.SequentialArgs.from_collection(args),
|
|
@@ -173,8 +170,6 @@ def python_container(
|
|
|
173
170
|
},
|
|
174
171
|
cache_from=cache_from,
|
|
175
172
|
labels=labels,
|
|
176
|
-
# We must specify the workdir manually for apptainer support
|
|
177
|
-
workdir=pathlib.Path("/workspace"),
|
|
178
173
|
args=args,
|
|
179
174
|
env_vars=env_vars,
|
|
180
175
|
)
|
|
@@ -241,8 +236,6 @@ def mamba_container(
|
|
|
241
236
|
},
|
|
242
237
|
cache_from=cache_from,
|
|
243
238
|
labels=labels,
|
|
244
|
-
# We must specify the workdir manually for apptainer support
|
|
245
|
-
workdir=pathlib.Path("/workspace"),
|
|
246
239
|
args=args,
|
|
247
240
|
env_vars=env_vars,
|
|
248
241
|
)
|
|
@@ -312,8 +305,6 @@ def uv_container(
|
|
|
312
305
|
},
|
|
313
306
|
cache_from=cache_from,
|
|
314
307
|
labels=labels,
|
|
315
|
-
# We must specify the workdir manually for apptainer support
|
|
316
|
-
workdir=pathlib.Path("/workspace"),
|
|
317
308
|
args=args,
|
|
318
309
|
env_vars=env_vars,
|
|
319
310
|
)
|
xm_slurm/packaging/docker.py
CHANGED
|
@@ -10,13 +10,13 @@ import os
|
|
|
10
10
|
import pathlib
|
|
11
11
|
import shlex
|
|
12
12
|
import shutil
|
|
13
|
-
import subprocess
|
|
14
13
|
import tempfile
|
|
15
|
-
|
|
14
|
+
import typing as tp
|
|
16
15
|
|
|
17
16
|
import jinja2 as j2
|
|
18
17
|
from xmanager import xm
|
|
19
18
|
|
|
19
|
+
from xm_slurm import utils
|
|
20
20
|
from xm_slurm.executables import (
|
|
21
21
|
Dockerfile,
|
|
22
22
|
DockerImage,
|
|
@@ -32,7 +32,7 @@ from xm_slurm.packaging.registry import IndexedContainer
|
|
|
32
32
|
logger = logging.getLogger(__name__)
|
|
33
33
|
|
|
34
34
|
|
|
35
|
-
def _hash_digest(obj: Hashable) -> str:
|
|
35
|
+
def _hash_digest(obj: tp.Hashable) -> str:
|
|
36
36
|
return hashlib.sha256(repr(obj).encode()).hexdigest()
|
|
37
37
|
|
|
38
38
|
|
|
@@ -52,7 +52,7 @@ class DockerClient:
|
|
|
52
52
|
raise RuntimeError("No Docker client found.")
|
|
53
53
|
self._client_call = client_call
|
|
54
54
|
|
|
55
|
-
backend_version = packaging_utils.run_command(
|
|
55
|
+
backend_version = utils.run_command(
|
|
56
56
|
xm.merge_args(self._client_call, ["buildx", "version"]), return_stdout=True
|
|
57
57
|
)
|
|
58
58
|
if backend_version.stdout.startswith("github.com/docker/buildx"):
|
|
@@ -71,14 +71,17 @@ class DockerClient:
|
|
|
71
71
|
"""Parse credentials from a Docker credential helper."""
|
|
72
72
|
if not shutil.which(f"docker-credential-{helper}"):
|
|
73
73
|
return None
|
|
74
|
-
|
|
75
|
-
f"
|
|
74
|
+
result = utils.run_command(
|
|
75
|
+
[f"docker-credential-{helper}", "get"],
|
|
76
|
+
stdin=hostname,
|
|
77
|
+
return_stdout=True,
|
|
76
78
|
)
|
|
77
79
|
|
|
78
|
-
if returncode == 0:
|
|
79
|
-
credentials = json.loads(
|
|
80
|
+
if result.returncode == 0:
|
|
81
|
+
credentials = json.loads(result.stdout)
|
|
80
82
|
return RemoteRepositoryCredentials(
|
|
81
|
-
username=credentials["Username"],
|
|
83
|
+
username=str.strip(credentials["Username"]),
|
|
84
|
+
password=str.strip(credentials["Secret"]),
|
|
82
85
|
)
|
|
83
86
|
return None
|
|
84
87
|
|
|
@@ -112,7 +115,10 @@ class DockerClient:
|
|
|
112
115
|
if registry.domain == hostname:
|
|
113
116
|
auth = base64.b64decode(metadata["auth"]).decode("utf-8")
|
|
114
117
|
username, password = auth.split(":")
|
|
115
|
-
credentials = RemoteRepositoryCredentials(
|
|
118
|
+
credentials = RemoteRepositoryCredentials(
|
|
119
|
+
str.strip(username),
|
|
120
|
+
str.strip(password),
|
|
121
|
+
)
|
|
116
122
|
self._credentials_cache[hostname] = credentials
|
|
117
123
|
return credentials
|
|
118
124
|
return None
|
|
@@ -133,6 +139,22 @@ class DockerClient:
|
|
|
133
139
|
)
|
|
134
140
|
return _parse_credentials_from_config(podman_config_path)
|
|
135
141
|
|
|
142
|
+
def inspect(
|
|
143
|
+
self, image: ImageURI, element: str | None = None, type: tp.Literal["image"] = "image"
|
|
144
|
+
) -> dict[str, tp.Any]:
|
|
145
|
+
output = utils.run_command(
|
|
146
|
+
xm.merge_args(
|
|
147
|
+
self._client_call,
|
|
148
|
+
["inspect"],
|
|
149
|
+
["--format", f"{{{{json .{element}}}}}"] if element else [],
|
|
150
|
+
["--type", type] if type else [],
|
|
151
|
+
[str(image)],
|
|
152
|
+
),
|
|
153
|
+
check=True,
|
|
154
|
+
return_stdout=True,
|
|
155
|
+
)
|
|
156
|
+
return json.loads(output.stdout.strip().strip("'"))
|
|
157
|
+
|
|
136
158
|
@functools.cached_property
|
|
137
159
|
def _bake_template(self) -> j2.Template:
|
|
138
160
|
template_loader = j2.PackageLoader("xm_slurm", "templates/docker")
|
|
@@ -143,17 +165,17 @@ class DockerClient:
|
|
|
143
165
|
def _bake_args(
|
|
144
166
|
self,
|
|
145
167
|
*,
|
|
146
|
-
targets: str | Sequence[str] | None = None,
|
|
168
|
+
targets: str | tp.Sequence[str] | None = None,
|
|
147
169
|
builder: str | None = None,
|
|
148
|
-
files: str | os.PathLike[str] | Sequence[os.PathLike[str] | str] | None = None,
|
|
170
|
+
files: str | os.PathLike[str] | tp.Sequence[os.PathLike[str] | str] | None = None,
|
|
149
171
|
load: bool = False,
|
|
150
172
|
cache: bool = True,
|
|
151
173
|
print: bool = False,
|
|
152
174
|
pull: bool = False,
|
|
153
175
|
push: bool = False,
|
|
154
176
|
metadata_file: str | os.PathLike[str] | None = None,
|
|
155
|
-
progress: Literal["auto", "plain", "tty"] = "auto",
|
|
156
|
-
set: Mapping[str, str] | None = None,
|
|
177
|
+
progress: tp.Literal["auto", "plain", "tty"] = "auto",
|
|
178
|
+
set: tp.Mapping[str, str] | None = None,
|
|
157
179
|
) -> xm.SequentialArgs:
|
|
158
180
|
files = files
|
|
159
181
|
if files is None:
|
|
@@ -184,9 +206,7 @@ class DockerClient:
|
|
|
184
206
|
)
|
|
185
207
|
|
|
186
208
|
def bake(
|
|
187
|
-
self,
|
|
188
|
-
*,
|
|
189
|
-
targets: Sequence[IndexedContainer[xm.Packageable]],
|
|
209
|
+
self, *, targets: tp.Sequence[IndexedContainer[xm.Packageable]]
|
|
190
210
|
) -> list[IndexedContainer[RemoteImage]]:
|
|
191
211
|
executors_by_executables = packaging_utils.collect_executors_by_executable(targets)
|
|
192
212
|
for executable, executors in executors_by_executables.items():
|
|
@@ -223,7 +243,7 @@ class DockerClient:
|
|
|
223
243
|
push=True,
|
|
224
244
|
),
|
|
225
245
|
)
|
|
226
|
-
|
|
246
|
+
utils.run_command(bake_command.to_list(), tty=True, check=True)
|
|
227
247
|
except Exception as ex:
|
|
228
248
|
raise RuntimeError(f"Failed to build Dockerfiles: {ex}") from ex
|
|
229
249
|
else:
|
|
@@ -239,13 +259,27 @@ class DockerClient:
|
|
|
239
259
|
uri = ImageURI(target.value.executor_spec.tag).with_digest(
|
|
240
260
|
executable_metadata["containerimage.digest"]
|
|
241
261
|
)
|
|
262
|
+
config = self.inspect(uri, "Config")
|
|
263
|
+
if "WorkingDir" not in config:
|
|
264
|
+
raise ValueError(
|
|
265
|
+
"Docker image does not have a working directory. "
|
|
266
|
+
"To support all runtimes, we need to set a working directory. "
|
|
267
|
+
"Please set `WORKDIR` in the `Dockerfile`."
|
|
268
|
+
)
|
|
269
|
+
if "Entrypoint" not in config:
|
|
270
|
+
raise ValueError(
|
|
271
|
+
"Docker image does not have an entrypoint. "
|
|
272
|
+
"To support all runtimes, we need to set an entrypoint. "
|
|
273
|
+
"Please set `ENTRYPOINT` in the `Dockerfile`."
|
|
274
|
+
)
|
|
242
275
|
|
|
243
276
|
images.append(
|
|
244
277
|
dataclasses.replace(
|
|
245
278
|
target,
|
|
246
279
|
value=RemoteImage( # type: ignore
|
|
247
280
|
image=str(uri),
|
|
248
|
-
workdir=
|
|
281
|
+
workdir=config["WorkingDir"],
|
|
282
|
+
entrypoint=xm.SequentialArgs.from_collection(config["Entrypoint"]),
|
|
249
283
|
args=target.value.args,
|
|
250
284
|
env_vars=target.value.env_vars,
|
|
251
285
|
credentials=self.credentials(uri.domain),
|
|
@@ -263,14 +297,14 @@ def docker_client() -> DockerClient:
|
|
|
263
297
|
|
|
264
298
|
@registry.register(Dockerfile)
|
|
265
299
|
def _(
|
|
266
|
-
targets: Sequence[IndexedContainer[xm.Packageable]],
|
|
300
|
+
targets: tp.Sequence[IndexedContainer[xm.Packageable]],
|
|
267
301
|
) -> list[IndexedContainer[RemoteImage]]:
|
|
268
302
|
return docker_client().bake(targets=targets)
|
|
269
303
|
|
|
270
304
|
|
|
271
305
|
@registry.register(DockerImage)
|
|
272
306
|
def _(
|
|
273
|
-
targets: Sequence[IndexedContainer[xm.Packageable]],
|
|
307
|
+
targets: tp.Sequence[IndexedContainer[xm.Packageable]],
|
|
274
308
|
) -> list[IndexedContainer[RemoteImage]]:
|
|
275
309
|
"""Build Docker images, this is essentially a passthrough."""
|
|
276
310
|
images = []
|
|
@@ -285,12 +319,28 @@ def _(
|
|
|
285
319
|
)
|
|
286
320
|
|
|
287
321
|
uri = ImageURI(target.value.executable_spec.image)
|
|
322
|
+
|
|
323
|
+
config = client.inspect(uri, "Config")
|
|
324
|
+
if "WorkingDir" not in config:
|
|
325
|
+
raise ValueError(
|
|
326
|
+
"Docker image does not have a working directory. "
|
|
327
|
+
"To support all runtimes, we need to set a working directory. "
|
|
328
|
+
"Please set `WORKDIR` in the `Dockerfile`."
|
|
329
|
+
)
|
|
330
|
+
if "Entrypoint" not in config:
|
|
331
|
+
raise ValueError(
|
|
332
|
+
"Docker image does not have an entrypoint. "
|
|
333
|
+
"To support all runtimes, we need to set an entrypoint. "
|
|
334
|
+
"Please set `ENTRYPOINT` in the `Dockerfile`."
|
|
335
|
+
)
|
|
336
|
+
|
|
288
337
|
images.append(
|
|
289
338
|
dataclasses.replace(
|
|
290
339
|
target,
|
|
291
340
|
value=RemoteImage( # type: ignore
|
|
292
341
|
image=str(uri),
|
|
293
|
-
workdir=
|
|
342
|
+
workdir=config["WorkingDir"],
|
|
343
|
+
entrypoint=xm.SequentialArgs.from_collection(config["Entrypoint"]),
|
|
294
344
|
args=target.value.args,
|
|
295
345
|
env_vars=target.value.env_vars,
|
|
296
346
|
credentials=client.credentials(hostname=uri.domain),
|
xm_slurm/packaging/utils.py
CHANGED
|
@@ -1,12 +1,5 @@
|
|
|
1
1
|
import collections
|
|
2
2
|
import logging
|
|
3
|
-
import os
|
|
4
|
-
import pathlib
|
|
5
|
-
import pty
|
|
6
|
-
import re
|
|
7
|
-
import select
|
|
8
|
-
import shutil
|
|
9
|
-
import subprocess
|
|
10
3
|
from typing import ParamSpec, Sequence, TypeVar
|
|
11
4
|
|
|
12
5
|
from xmanager import xm
|
|
@@ -27,104 +20,3 @@ def collect_executors_by_executable(
|
|
|
27
20
|
for target in targets:
|
|
28
21
|
executors_by_executable[target.value.executable_spec].add(target.value.executor_spec)
|
|
29
22
|
return executors_by_executable
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
# Cursor commands to filter out from the command data stream
|
|
33
|
-
_CURSOR_ESCAPE_SEQUENCES_REGEX = re.compile(
|
|
34
|
-
rb"\x1b\[\?25[hl]" # Matches cursor show/hide commands (CSI ?25h and CSI ?25l)
|
|
35
|
-
rb"|\x1b\[[0-9;]*[Hf]" # Matches cursor position commands (CSI n;mH and CSI n;mf)
|
|
36
|
-
rb"|\x1b\[s" # Matches cursor save position (CSI s)
|
|
37
|
-
rb"|\x1b\[u" # Matches cursor restore position (CSI u)
|
|
38
|
-
rb"|\x1b\[2J" # Matches clear screen (CSI 2J)
|
|
39
|
-
rb"|\x1b\[K" # Matches clear line (CSI K)
|
|
40
|
-
)
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
def run_command(
|
|
44
|
-
args: Sequence[str] | xm.SequentialArgs,
|
|
45
|
-
env: dict[str, str] | None = None,
|
|
46
|
-
tty: bool = False,
|
|
47
|
-
cwd: str | os.PathLike[str] | None = None,
|
|
48
|
-
check: bool = False,
|
|
49
|
-
return_stdout: bool = False,
|
|
50
|
-
return_stderr: bool = False,
|
|
51
|
-
) -> subprocess.CompletedProcess[str]:
|
|
52
|
-
if isinstance(args, xm.SequentialArgs):
|
|
53
|
-
args = args.to_list()
|
|
54
|
-
args = list(args)
|
|
55
|
-
|
|
56
|
-
executable = shutil.which(args[0])
|
|
57
|
-
if not executable:
|
|
58
|
-
raise RuntimeError(f"Couldn't find executable {args[0]}")
|
|
59
|
-
executable = pathlib.Path(executable)
|
|
60
|
-
|
|
61
|
-
subprocess_env = os.environ.copy() | (env if env else {})
|
|
62
|
-
if executable.name == "docker" and args[1] == "buildx":
|
|
63
|
-
subprocess_env |= {"DOCKER_CLI_EXPERIMENTAL": "enabled"}
|
|
64
|
-
|
|
65
|
-
logger.debug(f"env: {subprocess_env}")
|
|
66
|
-
logger.debug(f"command: {' '.join(args)}")
|
|
67
|
-
|
|
68
|
-
stdout_master, stdout_slave = pty.openpty()
|
|
69
|
-
stderr_master, stderr_slave = pty.openpty()
|
|
70
|
-
|
|
71
|
-
stdout_data, stderr_data = b"", b""
|
|
72
|
-
with subprocess.Popen(
|
|
73
|
-
executable=executable,
|
|
74
|
-
args=args,
|
|
75
|
-
shell=False,
|
|
76
|
-
text=True,
|
|
77
|
-
bufsize=0,
|
|
78
|
-
stdout=stdout_slave,
|
|
79
|
-
stderr=stderr_slave,
|
|
80
|
-
start_new_session=True,
|
|
81
|
-
close_fds=True,
|
|
82
|
-
cwd=cwd,
|
|
83
|
-
env=subprocess_env,
|
|
84
|
-
) as process:
|
|
85
|
-
os.close(stdout_slave)
|
|
86
|
-
os.close(stderr_slave)
|
|
87
|
-
|
|
88
|
-
fds = [stdout_master, stderr_master]
|
|
89
|
-
while fds:
|
|
90
|
-
rlist, _, _ = select.select(fds, [], [])
|
|
91
|
-
for fd in rlist:
|
|
92
|
-
try:
|
|
93
|
-
data = os.read(fd, 1024)
|
|
94
|
-
except OSError:
|
|
95
|
-
data = None
|
|
96
|
-
|
|
97
|
-
if not data:
|
|
98
|
-
os.close(fd)
|
|
99
|
-
fds.remove(fd)
|
|
100
|
-
continue
|
|
101
|
-
|
|
102
|
-
data = _CURSOR_ESCAPE_SEQUENCES_REGEX.sub(b"", data)
|
|
103
|
-
|
|
104
|
-
if fd == stdout_master:
|
|
105
|
-
if return_stdout:
|
|
106
|
-
stdout_data += data
|
|
107
|
-
if tty:
|
|
108
|
-
os.write(pty.STDOUT_FILENO, data)
|
|
109
|
-
elif fd == stderr_master:
|
|
110
|
-
if return_stderr:
|
|
111
|
-
stderr_data += data
|
|
112
|
-
if tty:
|
|
113
|
-
os.write(pty.STDERR_FILENO, data)
|
|
114
|
-
else:
|
|
115
|
-
raise RuntimeError("Unexpected file descriptor")
|
|
116
|
-
|
|
117
|
-
stdout = stdout_data.decode(errors="replace") if stdout_data else ""
|
|
118
|
-
stderr = stderr_data.decode(errors="replace") if stderr_data else ""
|
|
119
|
-
|
|
120
|
-
retcode = process.poll()
|
|
121
|
-
assert retcode is not None
|
|
122
|
-
|
|
123
|
-
if check and retcode:
|
|
124
|
-
raise subprocess.CalledProcessError(retcode, process.args)
|
|
125
|
-
return subprocess.CompletedProcess(
|
|
126
|
-
process.args,
|
|
127
|
-
retcode,
|
|
128
|
-
stdout=stdout,
|
|
129
|
-
stderr=stderr,
|
|
130
|
-
)
|
xm_slurm/scripts/cli.py
CHANGED
|
@@ -29,8 +29,15 @@ async def logs(
|
|
|
29
29
|
raise ValueError("Must specify either wid or identity.")
|
|
30
30
|
assert wu is not None
|
|
31
31
|
|
|
32
|
-
|
|
33
|
-
|
|
32
|
+
with console.status("Waiting for logs...") as status:
|
|
33
|
+
waiting = True
|
|
34
|
+
async for log in wu.logs(
|
|
35
|
+
num_lines=num_lines, block_size=block_size, wait=True, follow=follow
|
|
36
|
+
):
|
|
37
|
+
if waiting:
|
|
38
|
+
status.stop()
|
|
39
|
+
waiting = False
|
|
40
|
+
console.print(log, end="\n")
|
|
34
41
|
|
|
35
42
|
|
|
36
43
|
@xm.run_in_asyncio_loop
|
|
xm_slurm/templates/docker/uv.Dockerfile
CHANGED
|
@@ -17,7 +17,12 @@ RUN apt-get update \
|
|
|
17
17
|
git $EXTRA_SYSTEM_PACKAGES \
|
|
18
18
|
&& rm -rf /var/lib/apt/lists/*
|
|
19
19
|
|
|
20
|
-
RUN uv
|
|
20
|
+
RUN --mount=type=cache,target=/root/.cache/uv \
|
|
21
|
+
uv pip install --system pysocks $EXTRA_PYTHON_PACKAGES
|
|
22
|
+
|
|
23
|
+
RUN uv venv --system-site-packages
|
|
24
|
+
|
|
25
|
+
ENV PATH="/workspace/.venv/bin:$PATH"
|
|
21
26
|
|
|
22
27
|
RUN --mount=type=cache,target=/root/.cache/uv \
|
|
23
28
|
--mount=type=bind,source=uv.lock,target=uv.lock \
|
|
@@ -30,6 +35,4 @@ RUN --mount=type=cache,target=/root/.cache/uv \
|
|
|
30
35
|
--mount=type=ssh \
|
|
31
36
|
uv sync --frozen --no-dev
|
|
32
37
|
|
|
33
|
-
ENV PATH="/workspace/.venv/bin:$PATH"
|
|
34
|
-
|
|
35
38
|
ENTRYPOINT [ "python" ]
|
|
xm_slurm/templates/slurm/entrypoint.bash.j2
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
{%- macro entrypoint(cluster, job) -%}
|
|
2
|
+
#!/bin/sh
|
|
3
|
+
set -eux
|
|
4
|
+
|
|
5
|
+
{% if cluster.container_environment %}
|
|
6
|
+
# Cluster environment variables
|
|
7
|
+
{% for key, value in cluster.container_environment.items() %}
|
|
8
|
+
export {{ key }}="{{ value }}"
|
|
9
|
+
{% endfor %}
|
|
10
|
+
{%- endif %}
|
|
11
|
+
|
|
12
|
+
{% if job.executable.env_vars %}
|
|
13
|
+
# Executable environment variables
|
|
14
|
+
{% for key, value in job.executable.env_vars.items() %}
|
|
15
|
+
export {{ key }}="{{ value }}"
|
|
16
|
+
{% endfor %}
|
|
17
|
+
{%- endif %}
|
|
18
|
+
|
|
19
|
+
{% if job.env_vars %}
|
|
20
|
+
# Job environment variables
|
|
21
|
+
{% for key, value in job.env_vars.items() %}
|
|
22
|
+
export {{ key }}="{{ value }}"
|
|
23
|
+
{% endfor %}
|
|
24
|
+
{%- endif %}
|
|
25
|
+
|
|
26
|
+
exec {{ job.executable.entrypoint.to_list() | join(' ') }} "$@"
|
|
27
|
+
{%- endmacro -%}
|
|
xm_slurm/templates/slurm/job-array.bash.j2
CHANGED
|
@@ -10,18 +10,18 @@ srun \
|
|
|
10
10
|
--unbuffered \
|
|
11
11
|
--kill-on-bad-exit=0 \
|
|
12
12
|
--overlap \
|
|
13
|
-
--export=
|
|
13
|
+
--export="ALL" \
|
|
14
14
|
bash <<'SRUN_EOF' &
|
|
15
15
|
set -Eeuxo pipefail
|
|
16
16
|
|
|
17
|
-
readonly
|
|
17
|
+
readonly XM_SLURM_TRIAL_ARGS=(
|
|
18
18
|
{% for trial in args %}
|
|
19
19
|
"{{ trial.to_list() | join(" ") }}"
|
|
20
20
|
{% endfor %}
|
|
21
21
|
)
|
|
22
22
|
|
|
23
|
-
{% call run(
|
|
24
|
-
${
|
|
23
|
+
{% call run(cluster, job) %}
|
|
24
|
+
${XM_SLURM_TRIAL_ARGS[$SLURM_ARRAY_TASK_ID]} \
|
|
25
25
|
{% endcall %}
|
|
26
26
|
|
|
27
27
|
SRUN_EOF
|
|
xm_slurm/templates/slurm/job-group.bash.j2
CHANGED
|
@@ -31,11 +31,11 @@
|
|
|
31
31
|
srun \
|
|
32
32
|
--unbuffered \
|
|
33
33
|
--kill-on-bad-exit=0 \
|
|
34
|
-
--export=
|
|
34
|
+
--export="ALL" \
|
|
35
35
|
--het-group={{ loop.index0 }} \
|
|
36
36
|
bash <<'SRUN_EOF' &
|
|
37
37
|
set -Eeuxo pipefail
|
|
38
|
-
{{ run(
|
|
38
|
+
{{ run(cluster, job) }}
|
|
39
39
|
SRUN_EOF
|
|
40
40
|
{% endfor +%}
|
|
41
41
|
{% endblock bootstrap %}
|
|
xm_slurm/templates/slurm/job.bash.j2
CHANGED
|
@@ -33,9 +33,10 @@ set -Eeuxo pipefail
|
|
|
33
33
|
{% endif %}
|
|
34
34
|
{%- endblock prolog %}
|
|
35
35
|
|
|
36
|
-
|
|
37
36
|
{% block environment -%}
|
|
38
|
-
{
|
|
37
|
+
{% for key, value in cluster.host_environment.items() %}
|
|
38
|
+
export {{ key }}="{{ value }}"
|
|
39
|
+
{% endfor %}
|
|
39
40
|
{%- endblock environment %}
|
|
40
41
|
|
|
41
42
|
{% block proxy -%}
|
|
@@ -54,10 +55,10 @@ srun \
|
|
|
54
55
|
--unbuffered \
|
|
55
56
|
--kill-on-bad-exit=0 \
|
|
56
57
|
--overlap \
|
|
57
|
-
--export=
|
|
58
|
+
--export="ALL" \
|
|
58
59
|
bash <<'SRUN_EOF' &
|
|
59
60
|
set -Eeuxo pipefail
|
|
60
|
-
{{ run(
|
|
61
|
+
{{ run(cluster, job) }}
|
|
61
62
|
SRUN_EOF
|
|
62
63
|
{%- endblock bootstrap %}
|
|
63
64
|
|
|
xm_slurm/templates/slurm/runtimes/apptainer.bash.j2
CHANGED
|
@@ -1,73 +1,33 @@
|
|
|
1
|
-
{% macro
|
|
2
|
-
{
|
|
3
|
-
{% if key.startswith("SINGULARITY") or key.startswith("APPTAINER") or key.startswith("_") %}
|
|
4
|
-
{% set key = key.lstrip('_') %}
|
|
5
|
-
export {{ key }}="{{ value }}"
|
|
6
|
-
{% else %}
|
|
7
|
-
export APPTAINERENV_{{ key }}="{{ value }}"
|
|
8
|
-
export SINGULARITYENV_{{ key }}="{{ value }}"
|
|
9
|
-
export {{ key }}="{{ value }}"
|
|
10
|
-
{% endif %}
|
|
11
|
-
{% endfor %}
|
|
12
|
-
{%- endmacro %}
|
|
13
|
-
|
|
14
|
-
{% macro export(job, mode=None) -%}
|
|
15
|
-
{%- set combined_envs = operator.or_(job.env_vars, job.executable.env_vars) -%}
|
|
16
|
-
{%- if job.executable.credentials -%}
|
|
17
|
-
{%- set combined_envs = operator.or_(combined_envs, {
|
|
18
|
-
"APPTAINER_DOCKER_USERNAME": job.executable.credentials.username,
|
|
19
|
-
"APPTAINER_DOCKER_PASSWORD": job.executable.credentials.password,
|
|
20
|
-
"SINGULARITY_DOCKER_USERNAME": job.executable.credentials.username,
|
|
21
|
-
"SINGULARITY_DOCKER_PASSWORD": job.executable.credentials.password,
|
|
22
|
-
})
|
|
23
|
-
-%}
|
|
24
|
-
{%- endif %}
|
|
25
|
-
|
|
26
|
-
{%- set env_strings = [] -%}
|
|
27
|
-
{%- for key, value in combined_envs.items() -%}
|
|
28
|
-
{%- if key.startswith("SINGULARITY") or key.startswith("APPTAINER") -%}
|
|
29
|
-
{%- set _ = env_strings.append('{0}="{1}"'.format(key, value)) -%}
|
|
30
|
-
{%- else -%}
|
|
31
|
-
{%- set _ = env_strings.append('APPTAINERENV_{0}="{1}",SINGULARITYENV_{0}="{1}"'.format(key, value)) -%}
|
|
32
|
-
{%- endif -%}
|
|
33
|
-
{%- endfor -%}
|
|
34
|
-
|
|
35
|
-
{%- if mode is not none -%}
|
|
36
|
-
{{- mode -}}{{- "," if combined_envs -}}
|
|
37
|
-
{%- endif -%}
|
|
38
|
-
|
|
39
|
-
{{- env_strings | join(",") -}}
|
|
40
|
-
{% endmacro %}
|
|
41
|
-
|
|
42
|
-
{% macro run(job, cluster) -%}
|
|
43
|
-
# Determine which binary to use or if an error should be raised
|
|
44
|
-
if [[ $(command -v apptainer) ]]; then
|
|
45
|
-
readonly CONTAINER_RUNTIME="apptainer"
|
|
46
|
-
elif [[ $(command -v singularity) ]]; then
|
|
47
|
-
readonly CONTAINER_RUNTIME="singularity"
|
|
48
|
-
else
|
|
49
|
-
echo "Error: Neither singularity nor apptainer binaries found" >&2
|
|
50
|
-
exit 1
|
|
51
|
-
fi
|
|
1
|
+
{% macro run(cluster, job) -%}
|
|
2
|
+
{%- set runtime = (cluster.runtime | string | lower) -%}
|
|
52
3
|
|
|
53
4
|
# Bundle will be where our built sandbox image is stored
|
|
54
5
|
# container-workdir will be our container's scratch directory
|
|
55
6
|
mkdir -p "$SLURM_TMPDIR"/{container,container-workdir,container-overlay}
|
|
56
7
|
|
|
57
|
-
|
|
8
|
+
{% if job.executable.credentials %}
|
|
9
|
+
env {{ runtime | upper }}_DOCKER_USERNAME="{{ job.executable.credentials.username }}" {{ runtime | upper }}_DOCKER_PASSWORD="{{ job.executable.credentials.password }}" time {{ runtime }} build \
|
|
10
|
+
{% else %}
|
|
11
|
+
time {{ runtime }} build \
|
|
12
|
+
{% endif %}
|
|
58
13
|
--force \
|
|
59
14
|
--sandbox \
|
|
60
15
|
--fix-perms \
|
|
61
16
|
"$SLURM_TMPDIR"/container \
|
|
62
17
|
docker://{{ job.executable.image }}
|
|
63
18
|
|
|
64
|
-
{% if
|
|
19
|
+
{% if runtime == "singularity" and cluster.mounts %}
|
|
65
20
|
{% for source, dest in cluster.mounts.items() %}
|
|
66
21
|
mkdir -p "$SLURM_TMPDIR"/container/{{ dest | trim('/') }}
|
|
67
22
|
{% endfor %}
|
|
68
23
|
{% endif %}
|
|
69
24
|
|
|
70
|
-
|
|
25
|
+
cat << 'ENTRYPOINT_EOF' > "$SLURM_TMPDIR"/container/xm-slurm-entrypoint.sh
|
|
26
|
+
{{ entrypoint(cluster, job) }}
|
|
27
|
+
ENTRYPOINT_EOF
|
|
28
|
+
chmod +x "$SLURM_TMPDIR"/container/xm-slurm-entrypoint.sh
|
|
29
|
+
|
|
30
|
+
exec {{ runtime }} exec \
|
|
71
31
|
{% if job.executor.requirements.accelerator %}
|
|
72
32
|
--nv \
|
|
73
33
|
{% endif %}
|
|
@@ -75,6 +35,9 @@ exec ${CONTAINER_RUNTIME} run \
|
|
|
75
35
|
--no-umask \
|
|
76
36
|
--no-home \
|
|
77
37
|
--cleanenv \
|
|
38
|
+
{% if runtime == "apptainer" %}
|
|
39
|
+
--no-eval \
|
|
40
|
+
{% endif %}
|
|
78
41
|
--containall \
|
|
79
42
|
{% if cluster.mounts %}
|
|
80
43
|
{% for source, dest in cluster.mounts.items() %}
|
|
@@ -91,6 +54,7 @@ exec ${CONTAINER_RUNTIME} run \
|
|
|
91
54
|
--pwd {{ job.executable.workdir }} \
|
|
92
55
|
{% endif %}
|
|
93
56
|
"$SLURM_TMPDIR"/container \
|
|
57
|
+
/xm-slurm-entrypoint.sh \
|
|
94
58
|
{% for arg in job.executable.args.to_list() %}
|
|
95
59
|
{{ arg }} \
|
|
96
60
|
{% endfor %}
|
|
xm_slurm/templates/slurm/runtimes/podman.bash.j2
CHANGED
|
@@ -1,33 +1,18 @@
|
|
|
1
|
-
{% macro
|
|
2
|
-
|
|
3
|
-
export PODMANENV_{{ key }}="{{ value }}"
|
|
4
|
-
{% endfor %}
|
|
5
|
-
{%- endmacro %}
|
|
6
|
-
|
|
7
|
-
{% macro export(job, mode=None) -%}
|
|
8
|
-
{%- set combined_envs = operator.or_(job.env_vars, job.executable.env_vars) -%}
|
|
9
|
-
|
|
10
|
-
{%- set env_strings = [] -%}
|
|
11
|
-
{%- for key, value in combined_envs.items() -%}
|
|
12
|
-
{%- set _ = env_strings.append('PODMANENV_{0}="{1}"'.format(key, value)) -%}
|
|
13
|
-
{%- endfor -%}
|
|
14
|
-
|
|
15
|
-
{%- if mode is not none -%}
|
|
16
|
-
{{- mode -}}{{- "," if combined_envs -}}
|
|
17
|
-
{%- endif -%}
|
|
18
|
-
|
|
19
|
-
{{- env_strings | join(",") -}}
|
|
20
|
-
{% endmacro %}
|
|
21
|
-
|
|
22
|
-
{% macro run(job, cluster) -%}
|
|
23
|
-
podman pull \
|
|
1
|
+
{% macro run(cluster, job) -%}
|
|
2
|
+
time podman pull \
|
|
24
3
|
{% if job.executable.credentials %}
|
|
25
4
|
--creds {{ job.executable.credentials.username }}:{{ job.executable.credentials.password }} \
|
|
26
5
|
{% endif %}
|
|
27
6
|
{{ job.executable.image }}
|
|
28
7
|
|
|
8
|
+
cat << 'ENTRYPOINT_EOF' > "$SLURM_TMPDIR"/xm-slurm-entrypoint.sh
|
|
9
|
+
{{ entrypoint(cluster, job) }}
|
|
10
|
+
ENTRYPOINT_EOF
|
|
11
|
+
chmod +x "$SLURM_TMPDIR"/xm-slurm-entrypoint.sh
|
|
12
|
+
|
|
29
13
|
exec podman run \
|
|
30
|
-
--
|
|
14
|
+
--mount type=bind,src="$SLURM_TMPDIR"/xm-slurm-entrypoint.sh,dst=/xm-slurm-entrypoint.sh,ro \
|
|
15
|
+
--entrypoint /xm-slurm-entrypoint.sh \
|
|
31
16
|
--pull never \
|
|
32
17
|
--restart no \
|
|
33
18
|
--rm \
|