xmanager-slurm 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of xmanager-slurm might be problematic. Click here for more details.
- xm_slurm/__init__.py +44 -0
- xm_slurm/api.py +261 -0
- xm_slurm/batching.py +139 -0
- xm_slurm/config.py +162 -0
- xm_slurm/console.py +3 -0
- xm_slurm/contrib/clusters/__init__.py +52 -0
- xm_slurm/contrib/clusters/drac.py +169 -0
- xm_slurm/executables.py +201 -0
- xm_slurm/execution.py +491 -0
- xm_slurm/executors.py +127 -0
- xm_slurm/experiment.py +737 -0
- xm_slurm/job_blocks.py +14 -0
- xm_slurm/packageables.py +292 -0
- xm_slurm/packaging/__init__.py +8 -0
- xm_slurm/packaging/docker/__init__.py +75 -0
- xm_slurm/packaging/docker/abc.py +112 -0
- xm_slurm/packaging/docker/cloud.py +503 -0
- xm_slurm/packaging/docker/local.py +206 -0
- xm_slurm/packaging/registry.py +45 -0
- xm_slurm/packaging/router.py +52 -0
- xm_slurm/packaging/utils.py +202 -0
- xm_slurm/resources.py +150 -0
- xm_slurm/status.py +188 -0
- xm_slurm/templates/docker/docker-bake.hcl.j2 +47 -0
- xm_slurm/templates/docker/mamba.Dockerfile +27 -0
- xm_slurm/templates/docker/pdm.Dockerfile +31 -0
- xm_slurm/templates/docker/python.Dockerfile +24 -0
- xm_slurm/templates/slurm/fragments/monitor.bash.j2 +32 -0
- xm_slurm/templates/slurm/fragments/proxy.bash.j2 +31 -0
- xm_slurm/templates/slurm/job-array.bash.j2 +29 -0
- xm_slurm/templates/slurm/job-group.bash.j2 +41 -0
- xm_slurm/templates/slurm/job.bash.j2 +78 -0
- xm_slurm/templates/slurm/runtimes/apptainer.bash.j2 +103 -0
- xm_slurm/templates/slurm/runtimes/podman.bash.j2 +56 -0
- xm_slurm/utils.py +69 -0
- xmanager_slurm-0.3.0.dist-info/METADATA +25 -0
- xmanager_slurm-0.3.0.dist-info/RECORD +38 -0
- xmanager_slurm-0.3.0.dist-info/WHEEL +4 -0
|
@@ -0,0 +1,169 @@
|
|
|
1
|
+
import os
|
|
2
|
+
from typing import Literal
|
|
3
|
+
|
|
4
|
+
from xm_slurm import config
|
|
5
|
+
from xm_slurm.resources import ResourceType
|
|
6
|
+
|
|
7
|
+
# Names exported by `from <this module> import *`: the per-cluster factory functions.
# NOTE(review): the aggregate `all()` helper defined in this module is not listed here.
__all__ = ["narval", "beluga", "cedar", "graham"]
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def _drac_cluster(
    *,
    name: str,
    host: str,
    host_public_key: config.PublicKey,
    port: int = 22,
    user: str | None = None,
    account: str | None = None,
    modules: list[str] | None = None,
    proxy: Literal["submission-host"] | str | None = None,
    mounts: dict[os.PathLike[str] | str, os.PathLike[str] | str] | None = None,
    resources: dict[str, ResourceType] | None = None,
) -> config.SlurmClusterConfig:
    """Build the `SlurmClusterConfig` shared by every DRAC cluster factory.

    Args:
        name: Cluster name forwarded to the config.
        host: SSH host forwarded to the config.
        host_public_key: Public key of `host`, forwarded to the config.
        port: SSH port (defaults to 22).
        user: Optional user name forwarded to the config.
        account: Optional account forwarded to the config.
        modules: Extra environment modules to load after `apptainer` in the prolog.
        proxy: Proxy setting forwarded unchanged to the config.
        mounts: Mount mapping; defaults to `{"/scratch/$USER": "/scratch"}` when None.
        resources: Resource-name to `ResourceType` mapping; defaults to an empty dict.

    Returns:
        A cluster config that uses the Apptainer container runtime.
    """
    # `apptainer` is always loaded; any extra modules ride on the same `module load` command.
    prolog_command = " ".join(["module load apptainer", *(modules or [])]).rstrip()

    # Environment variables exported for Apptainer on the cluster.
    apptainer_environment = {
        "APPTAINER_CACHEDIR": "$SCRATCH/.apptainer",
        "APPTAINER_TMPDIR": "$SLURM_TMPDIR",
        "APPTAINER_LOCALCACHEDIR": "$SLURM_TMPDIR",
        "_XDG_DATA_HOME": "$SLURM_TMPDIR/.local",
        "SCRATCH": "/scratch",
    }

    return config.SlurmClusterConfig(
        name=name,
        user=user,
        host=host,
        host_public_key=host_public_key,
        port=port,
        account=account,
        proxy=proxy,
        runtime=config.ContainerRuntime.APPTAINER,
        prolog=prolog_command,
        environment=apptainer_environment,
        # Only an explicit None triggers the default; an empty dict is passed through as-is.
        mounts={"/scratch/$USER": "/scratch"} if mounts is None else mounts,
        resources=resources or {},
    )
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def narval(
    *,
    user: str | None = None,
    account: str | None = None,
    proxy: Literal["submission-host"] | str | None = None,
    mounts: dict[os.PathLike[str] | str, os.PathLike[str] | str] | None = None,
) -> config.SlurmClusterConfig:
    """DRAC Narval Cluster (https://docs.alliancecan.ca/wiki/Narval/en).

    The `httpproxy` module is added to the prolog unless `proxy` is
    "submission-host". All arguments are forwarded to `_drac_cluster`.
    """
    extra_modules = [] if proxy == "submission-host" else ["httpproxy"]
    robot_host_key = config.PublicKey(
        "ssh-ed25519",
        "AAAAC3NzaC1lZDI1NTE5AAAAILFxB0spH5RApc43sBx0zOxo1ARVH0ezU+FbQH95FW+h",
    )

    return _drac_cluster(
        name="narval",
        host="robot.narval.alliancecan.ca",
        host_public_key=robot_host_key,
        user=user,
        account=account,
        mounts=mounts,
        proxy=proxy,
        modules=extra_modules,
        resources={"a100": ResourceType.A100},
    )
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def beluga(
    *,
    user: str | None = None,
    account: str | None = None,
    proxy: Literal["submission-host"] | str | None = None,
    mounts: dict[os.PathLike[str] | str, os.PathLike[str] | str] | None = None,
) -> config.SlurmClusterConfig:
    """DRAC Beluga Cluster (https://docs.alliancecan.ca/wiki/B%C3%A9luga/en).

    The `httpproxy` module is added to the prolog unless `proxy` is
    "submission-host". All arguments are forwarded to `_drac_cluster`.
    """
    extra_modules = [] if proxy == "submission-host" else ["httpproxy"]
    robot_host_key = config.PublicKey(
        "ssh-ed25519",
        "AAAAC3NzaC1lZDI1NTE5AAAAIOAzTHRerKjcFhDqqgRss7Sj4xePWVn1f1QvBfUmX6Pe",
    )

    return _drac_cluster(
        name="beluga",
        host="robot.beluga.alliancecan.ca",
        host_public_key=robot_host_key,
        user=user,
        account=account,
        mounts=mounts,
        proxy=proxy,
        modules=extra_modules,
        resources={"tesla_v100-sxm2-16gb": ResourceType.V100},
    )
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
def cedar(
    *,
    user: str | None = None,
    account: str | None = None,
    mounts: dict[os.PathLike[str] | str, os.PathLike[str] | str] | None = None,
) -> config.SlurmClusterConfig:
    """DRAC Cedar Cluster (https://docs.alliancecan.ca/wiki/Cedar/en).

    Unlike the Narval and Beluga factories, this one takes no `proxy`
    argument and passes no extra modules to `_drac_cluster`.
    """
    robot_host_key = config.PublicKey(
        "ssh-ed25519",
        "AAAAC3NzaC1lZDI1NTE5AAAAIEsmR+vxeKYEDFIFj+nxlgp3ACs64VwVD5qBifQ2I5VS",
    )
    # Resource names as passed to the config, mapped to their resource types.
    gpu_resources = {
        "v100l": ResourceType.V100_32GIB,
        "p100": ResourceType.P100,
        "p100l": ResourceType.P100_16GIB,
    }

    return _drac_cluster(
        name="cedar",
        host="robot.cedar.alliancecan.ca",
        host_public_key=robot_host_key,
        user=user,
        account=account,
        mounts=mounts,
        resources=gpu_resources,
    )
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
def graham(
    *,
    user: str | None = None,
    account: str | None = None,
    proxy: Literal["submission-host"] | str | None = "submission-host",
    mounts: dict[os.PathLike[str] | str, os.PathLike[str] | str] | None = None,
) -> config.SlurmClusterConfig:
    """DRAC Graham Cluster (https://docs.alliancecan.ca/wiki/Graham/en).

    Args:
        user: Optional user name forwarded to the cluster config.
        account: Optional account forwarded to the cluster config.
        proxy: Proxy setting forwarded to the cluster config. Unlike the other
            cluster factories in this module, it defaults to "submission-host".
        mounts: Optional mount mapping forwarded to `_drac_cluster`.

    Returns:
        The cluster configuration for Graham.
    """
    return _drac_cluster(
        name="graham",
        host="robot.graham.alliancecan.ca",
        host_public_key=config.PublicKey(
            "ssh-ed25519",
            "AAAAC3NzaC1lZDI1NTE5AAAAIDPcZ+yKur5GvPoisN2KjtEbrem/0j+JviMfAk7GVlL9",
        ),
        user=user,
        account=account,
        mounts=mounts,
        proxy=proxy,
        resources={
            "v100": ResourceType.V100,
            "p100": ResourceType.P100,
            "a100": ResourceType.A100,
            "a5000": ResourceType.A5000,
        },
    )
|
|
156
|
+
|
|
157
|
+
|
|
158
|
+
def all(
    user: str | None = None,
    account: str | None = None,
    mounts: dict[os.PathLike[str] | str, os.PathLike[str] | str] | None = None,
) -> list[config.SlurmClusterConfig]:
    """Return the configs of all DRAC clusters: Narval, Beluga, Cedar, Graham (in that order).

    The same `user`, `account` and `mounts` are handed to every factory; each
    factory keeps its own default for any other setting (e.g. `proxy`).
    """
    shared_options = {"user": user, "account": account, "mounts": mounts}
    cluster_factories = (narval, beluga, cedar, graham)
    return [make_cluster(**shared_options) for make_cluster in cluster_factories]
|
xm_slurm/executables.py
ADDED
|
@@ -0,0 +1,201 @@
|
|
|
1
|
+
import dataclasses
|
|
2
|
+
import pathlib
|
|
3
|
+
import re
|
|
4
|
+
from typing import Mapping, NamedTuple, Sequence
|
|
5
|
+
|
|
6
|
+
from xmanager import xm
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
@dataclasses.dataclass(frozen=True, kw_only=True)
class Dockerfile(xm.ExecutableSpec):
    """Executable spec for an image that is built from a Dockerfile.

    Args:
        dockerfile: The path to the Dockerfile.
        context: The path to the Docker context.
        target: The Docker build target.
        build_args: Build arguments to docker.
        cache_from: Where to pull the BuildKit cache from. See `--cache-from` in `docker build`.
        workdir: The working directory in container.
        labels: The container labels.
        platforms: The target platform.
    """

    # Path to the Dockerfile.
    dockerfile: pathlib.Path
    # Path to the Docker build context.
    context: pathlib.Path

    # Optional build target.
    target: str | None = None

    # Build arguments handed to docker.
    build_args: Mapping[str, str] = dataclasses.field(default_factory=dict)

    # Values for BuildKit's --cache-from.
    cache_from: Sequence[str] = dataclasses.field(default_factory=list)

    # Working directory inside the container.
    workdir: pathlib.Path | None = None

    # Labels attached to the container.
    labels: Mapping[str, str] = dataclasses.field(default_factory=dict)

    # Target platforms; defaults to linux/amd64 only.
    platforms: Sequence[str] = dataclasses.field(default_factory=lambda: ["linux/amd64"])

    @property
    def name(self) -> str:
        """Dockerfile stem, suffixed with `-<target>` when a build target is set."""
        stem = self.dockerfile.stem
        return stem if self.target is None else f"{stem}-{self.target}"

    def __hash__(self) -> int:
        # Mappings and sequences are not hashable as-is, so each one is reduced to a
        # sorted tuple; the hash therefore ignores the ordering of their entries.
        build_args_key = tuple(sorted(self.build_args.items()))
        cache_from_key = tuple(sorted(self.cache_from))
        labels_key = tuple(sorted(self.labels.items()))
        platforms_key = tuple(sorted(self.platforms))
        return hash((
            self.dockerfile,
            self.context,
            self.target,
            build_args_key,
            cache_from_key,
            self.workdir,
            labels_key,
            platforms_key,
        ))
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
@dataclasses.dataclass(frozen=True, kw_only=True)
class DockerImage(xm.ExecutableSpec):
    """Executable spec for a Docker image that has already been built.

    Args:
        image: The remote image URI.
        workdir: The working directory in container.
    """

    # Remote image URI.
    image: str

    # Working directory inside the container.
    workdir: pathlib.Path | None = None

    @property
    def name(self) -> str:
        """The image URI doubles as the spec's name."""
        return self.image

    def __hash__(self) -> int:
        identity = (self.image, self.workdir)
        return hash(identity)
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
_IMAGE_URI_REGEX = re.compile(
|
|
91
|
+
r"^(?P<scheme>(?:[^:]+://)?)?(?P<domain>[^/]+)(?P<path>/[^:]*)?(?::(?P<tag>[^@]+))?@?(?P<digest>.+)?$"
|
|
92
|
+
)
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
@dataclasses.dataclass
|
|
96
|
+
class ImageURI:
|
|
97
|
+
image: dataclasses.InitVar[str]
|
|
98
|
+
|
|
99
|
+
scheme: str | None = dataclasses.field(init=False, default=None)
|
|
100
|
+
domain: str = dataclasses.field(init=False)
|
|
101
|
+
path: str = dataclasses.field(init=False)
|
|
102
|
+
tag: str | None = dataclasses.field(init=False, default=None)
|
|
103
|
+
digest: str | None = dataclasses.field(init=False, default=None)
|
|
104
|
+
|
|
105
|
+
def __post_init__(self, image: str):
|
|
106
|
+
match = _IMAGE_URI_REGEX.match(image)
|
|
107
|
+
if not match:
|
|
108
|
+
raise ValueError(f"Invalid OCI image URI: {image}")
|
|
109
|
+
groups = {k: v for k, v in match.groupdict().items() if v is not None}
|
|
110
|
+
for k, v in groups.items():
|
|
111
|
+
setattr(self, k, v)
|
|
112
|
+
|
|
113
|
+
if self.tag is None and self.digest is None:
|
|
114
|
+
self.tag = "latest"
|
|
115
|
+
|
|
116
|
+
@property
|
|
117
|
+
def locator(self) -> str:
|
|
118
|
+
"""Unique locator for this image.
|
|
119
|
+
|
|
120
|
+
Locator will return the digest if it exists otherwise the tag format.
|
|
121
|
+
If neither are present, it will raise an AssertionError.
|
|
122
|
+
"""
|
|
123
|
+
if self.digest is not None:
|
|
124
|
+
return f"@{self.digest}"
|
|
125
|
+
assert self.tag is not None
|
|
126
|
+
return f":{self.tag}"
|
|
127
|
+
|
|
128
|
+
@property
|
|
129
|
+
def url(self) -> str:
|
|
130
|
+
"""URL for this image without the locator."""
|
|
131
|
+
return f"{self.origin}{self.path}"
|
|
132
|
+
|
|
133
|
+
@property
|
|
134
|
+
def origin(self) -> str:
|
|
135
|
+
return f"{self.scheme}{self.domain}"
|
|
136
|
+
|
|
137
|
+
def with_tag(self, tag: str) -> "ImageURI":
|
|
138
|
+
self.tag = tag
|
|
139
|
+
return self
|
|
140
|
+
|
|
141
|
+
def with_digest(self, digest: str) -> "ImageURI":
|
|
142
|
+
self.digest = digest
|
|
143
|
+
return self
|
|
144
|
+
|
|
145
|
+
def __str__(self) -> str:
|
|
146
|
+
return self.format("{url}{locator}")
|
|
147
|
+
|
|
148
|
+
def __hash__(self) -> int:
|
|
149
|
+
return hash((
|
|
150
|
+
self.scheme,
|
|
151
|
+
self.domain,
|
|
152
|
+
self.path,
|
|
153
|
+
self.tag,
|
|
154
|
+
self.digest,
|
|
155
|
+
))
|
|
156
|
+
|
|
157
|
+
def format(self, format: str) -> str:
|
|
158
|
+
fields = {k: v for k, v in dataclasses.asdict(self).items() if v is not None}
|
|
159
|
+
fields |= {"locator": self.locator, "url": self.url}
|
|
160
|
+
return format.format(**fields)
|
|
161
|
+
|
|
162
|
+
|
|
163
|
+
class ImageDescriptor:
    """Descriptor that stores an `ImageURI` on the owning instance.

    The value lives under a private attribute named after the managed
    attribute (`_<name>`). Strings assigned through the descriptor are
    converted to `ImageURI`; any other value is stored unchanged.
    """

    def __set_name__(self, owner: type, name: str):
        # `owner` is not needed; only the attribute name determines the storage slot.
        self.image = "_" + name

    def __get__(self, instance: object, owner: type) -> ImageURI:
        # Lookup is delegated to the private slot on `instance`; if that attribute is not
        # present on `instance`, the resulting AttributeError propagates to the caller.
        storage_attr = self.image
        return getattr(instance, storage_attr)

    def __set__(self, instance: object, value: str | ImageURI):
        uri = ImageURI(value) if isinstance(value, str) else value
        setattr(instance, self.image, uri)
|
|
176
|
+
|
|
177
|
+
|
|
178
|
+
class RemoteRepositoryCredentials(NamedTuple):
    """Username/password pair for a remote image repository."""

    # Field order defines the tuple layout: (username, password).
    username: str
    password: str
|
|
181
|
+
|
|
182
|
+
|
|
183
|
+
@dataclasses.dataclass(kw_only=True)  # type: ignore
class RemoteImage(xm.Executable):
    """Executable backed by an image in a remote repository.

    `image` is managed by `ImageDescriptor`, so a plain string assigned to it
    is stored as an `ImageURI`.
    """

    # Remote base image (accepts a string or an ImageURI).
    image: ImageDescriptor = ImageDescriptor()

    # Working directory inside the container.
    workdir: pathlib.Path | None = None

    # Arguments passed to the container.
    args: xm.SequentialArgs = dataclasses.field(default_factory=xm.SequentialArgs)
    # Environment variables set in the container.
    env_vars: Mapping[str, str] = dataclasses.field(default_factory=dict)

    # Credentials for the remote repository, if it requires any.
    credentials: RemoteRepositoryCredentials | None = None

    @property
    def name(self) -> str:
        """String form of `image`."""
        return f"{self.image!s}"
|