xmanager-slurm 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of xmanager-slurm might be problematic. Click here for more details.

Files changed (38) hide show
  1. xm_slurm/__init__.py +44 -0
  2. xm_slurm/api.py +261 -0
  3. xm_slurm/batching.py +139 -0
  4. xm_slurm/config.py +162 -0
  5. xm_slurm/console.py +3 -0
  6. xm_slurm/contrib/clusters/__init__.py +52 -0
  7. xm_slurm/contrib/clusters/drac.py +169 -0
  8. xm_slurm/executables.py +201 -0
  9. xm_slurm/execution.py +491 -0
  10. xm_slurm/executors.py +127 -0
  11. xm_slurm/experiment.py +737 -0
  12. xm_slurm/job_blocks.py +14 -0
  13. xm_slurm/packageables.py +292 -0
  14. xm_slurm/packaging/__init__.py +8 -0
  15. xm_slurm/packaging/docker/__init__.py +75 -0
  16. xm_slurm/packaging/docker/abc.py +112 -0
  17. xm_slurm/packaging/docker/cloud.py +503 -0
  18. xm_slurm/packaging/docker/local.py +206 -0
  19. xm_slurm/packaging/registry.py +45 -0
  20. xm_slurm/packaging/router.py +52 -0
  21. xm_slurm/packaging/utils.py +202 -0
  22. xm_slurm/resources.py +150 -0
  23. xm_slurm/status.py +188 -0
  24. xm_slurm/templates/docker/docker-bake.hcl.j2 +47 -0
  25. xm_slurm/templates/docker/mamba.Dockerfile +27 -0
  26. xm_slurm/templates/docker/pdm.Dockerfile +31 -0
  27. xm_slurm/templates/docker/python.Dockerfile +24 -0
  28. xm_slurm/templates/slurm/fragments/monitor.bash.j2 +32 -0
  29. xm_slurm/templates/slurm/fragments/proxy.bash.j2 +31 -0
  30. xm_slurm/templates/slurm/job-array.bash.j2 +29 -0
  31. xm_slurm/templates/slurm/job-group.bash.j2 +41 -0
  32. xm_slurm/templates/slurm/job.bash.j2 +78 -0
  33. xm_slurm/templates/slurm/runtimes/apptainer.bash.j2 +103 -0
  34. xm_slurm/templates/slurm/runtimes/podman.bash.j2 +56 -0
  35. xm_slurm/utils.py +69 -0
  36. xmanager_slurm-0.3.0.dist-info/METADATA +25 -0
  37. xmanager_slurm-0.3.0.dist-info/RECORD +38 -0
  38. xmanager_slurm-0.3.0.dist-info/WHEEL +4 -0
@@ -0,0 +1,169 @@
1
+ import os
2
+ from typing import Literal
3
+
4
+ from xm_slurm import config
5
+ from xm_slurm.resources import ResourceType
6
+
7
+ __all__ = ["narval", "beluga", "cedar", "graham"]
8
+
9
+
10
def _drac_cluster(
    *,
    name: str,
    host: str,
    host_public_key: config.PublicKey,
    port: int = 22,
    user: str | None = None,
    account: str | None = None,
    modules: list[str] | None = None,
    proxy: Literal["submission-host"] | str | None = None,
    mounts: dict[os.PathLike[str] | str, os.PathLike[str] | str] | None = None,
    resources: dict[str, ResourceType] | None = None,
) -> config.SlurmClusterConfig:
    """Build the `SlurmClusterConfig` shared by all DRAC cluster factories.

    Args:
        name: Cluster name, forwarded unchanged.
        host: SSH host, forwarded unchanged.
        host_public_key: Public key of `host`, forwarded unchanged.
        port: SSH port, forwarded unchanged.
        user: Optional user, forwarded unchanged.
        account: Optional account, forwarded unchanged.
        modules: Extra module names appended to the `module load apptainer`
            prolog command.
        proxy: Proxy setting, forwarded unchanged.
        mounts: Mount mapping; defaults to `{"/scratch/$USER": "/scratch"}`
            when `None`.
        resources: Resource-name mapping; a falsy value becomes `{}`.

    Returns:
        A cluster configuration using the Apptainer container runtime.
    """
    effective_mounts = {"/scratch/$USER": "/scratch"} if mounts is None else mounts

    # Apptainer is always loaded; extra modules ride along on the same command.
    # The trailing rstrip() removes any whitespace left by empty module names.
    prolog_command = " ".join(["module load apptainer", *(modules or [])]).rstrip()

    apptainer_environment = {
        "APPTAINER_CACHEDIR": "$SCRATCH/.apptainer",
        "APPTAINER_TMPDIR": "$SLURM_TMPDIR",
        "APPTAINER_LOCALCACHEDIR": "$SLURM_TMPDIR",
        "_XDG_DATA_HOME": "$SLURM_TMPDIR/.local",
        "SCRATCH": "/scratch",
    }

    return config.SlurmClusterConfig(
        name=name,
        user=user,
        host=host,
        host_public_key=host_public_key,
        port=port,
        account=account,
        proxy=proxy,
        runtime=config.ContainerRuntime.APPTAINER,
        prolog=prolog_command,
        environment=apptainer_environment,
        mounts=effective_mounts,
        resources=resources or {},
    )
47
+
48
+
49
def narval(
    *,
    user: str | None = None,
    account: str | None = None,
    proxy: Literal["submission-host"] | str | None = None,
    mounts: dict[os.PathLike[str] | str, os.PathLike[str] | str] | None = None,
) -> config.SlurmClusterConfig:
    """Return the configuration for the DRAC Narval cluster.

    See https://docs.alliancecan.ca/wiki/Narval/en.

    Args:
        user: Optional user, forwarded to the shared DRAC builder.
        account: Optional account, forwarded to the shared DRAC builder.
        proxy: Proxy setting; also decides whether `httpproxy` is loaded.
        mounts: Optional mount mapping, forwarded to the shared DRAC builder.
    """
    # `httpproxy` is requested for every proxy setting except "submission-host".
    extra_modules = [] if proxy == "submission-host" else ["httpproxy"]

    narval_key = config.PublicKey(
        "ssh-ed25519",
        "AAAAC3NzaC1lZDI1NTE5AAAAILFxB0spH5RApc43sBx0zOxo1ARVH0ezU+FbQH95FW+h",
    )
    return _drac_cluster(
        name="narval",
        host="robot.narval.alliancecan.ca",
        host_public_key=narval_key,
        user=user,
        account=account,
        mounts=mounts,
        proxy=proxy,
        modules=extra_modules,
        resources={"a100": ResourceType.A100},
    )
75
+
76
+
77
def beluga(
    *,
    user: str | None = None,
    account: str | None = None,
    proxy: Literal["submission-host"] | str | None = None,
    mounts: dict[os.PathLike[str] | str, os.PathLike[str] | str] | None = None,
) -> config.SlurmClusterConfig:
    """Return the configuration for the DRAC Beluga cluster.

    See https://docs.alliancecan.ca/wiki/B%C3%A9luga/en.

    Args:
        user: Optional user, forwarded to the shared DRAC builder.
        account: Optional account, forwarded to the shared DRAC builder.
        proxy: Proxy setting; also decides whether `httpproxy` is loaded.
        mounts: Optional mount mapping, forwarded to the shared DRAC builder.
    """
    # `httpproxy` is requested for every proxy setting except "submission-host".
    extra_modules = [] if proxy == "submission-host" else ["httpproxy"]

    beluga_key = config.PublicKey(
        "ssh-ed25519",
        "AAAAC3NzaC1lZDI1NTE5AAAAIOAzTHRerKjcFhDqqgRss7Sj4xePWVn1f1QvBfUmX6Pe",
    )
    return _drac_cluster(
        name="beluga",
        host="robot.beluga.alliancecan.ca",
        host_public_key=beluga_key,
        user=user,
        account=account,
        mounts=mounts,
        proxy=proxy,
        modules=extra_modules,
        resources={"tesla_v100-sxm2-16gb": ResourceType.V100},
    )
103
+
104
+
105
def cedar(
    *,
    user: str | None = None,
    account: str | None = None,
    mounts: dict[os.PathLike[str] | str, os.PathLike[str] | str] | None = None,
) -> config.SlurmClusterConfig:
    """Return the configuration for the DRAC Cedar cluster.

    See https://docs.alliancecan.ca/wiki/Cedar/en.

    Args:
        user: Optional user, forwarded to the shared DRAC builder.
        account: Optional account, forwarded to the shared DRAC builder.
        mounts: Optional mount mapping, forwarded to the shared DRAC builder.
    """
    cedar_key = config.PublicKey(
        "ssh-ed25519",
        "AAAAC3NzaC1lZDI1NTE5AAAAIEsmR+vxeKYEDFIFj+nxlgp3ACs64VwVD5qBifQ2I5VS",
    )
    # Resource names as spelled on this cluster, mapped to ResourceType members.
    cedar_resources = {
        "v100l": ResourceType.V100_32GIB,
        "p100": ResourceType.P100,
        "p100l": ResourceType.P100_16GIB,
    }
    return _drac_cluster(
        name="cedar",
        host="robot.cedar.alliancecan.ca",
        host_public_key=cedar_key,
        user=user,
        account=account,
        mounts=mounts,
        resources=cedar_resources,
    )
128
+
129
+
130
def graham(
    *,
    user: str | None = None,
    account: str | None = None,
    proxy: Literal["submission-host"] | str | None = "submission-host",
    mounts: dict[os.PathLike[str] | str, os.PathLike[str] | str] | None = None,
) -> config.SlurmClusterConfig:
    """DRAC Graham Cluster (https://docs.alliancecan.ca/wiki/Graham/en).

    Args:
        user: Optional user, forwarded to the shared DRAC builder.
        account: Optional account, forwarded to the shared DRAC builder.
        proxy: Proxy setting, forwarded to the shared DRAC builder. Unlike the
            other DRAC factories this defaults to "submission-host".
        mounts: Optional mount mapping, forwarded to the shared DRAC builder.

    Returns:
        The cluster configuration for Graham.
    """
    return _drac_cluster(
        name="graham",
        host="robot.graham.alliancecan.ca",
        host_public_key=config.PublicKey(
            "ssh-ed25519",
            "AAAAC3NzaC1lZDI1NTE5AAAAIDPcZ+yKur5GvPoisN2KjtEbrem/0j+JviMfAk7GVlL9",
        ),
        user=user,
        account=account,
        mounts=mounts,
        proxy=proxy,
        # Resource names as spelled on this cluster, mapped to ResourceType members.
        resources={
            "v100": ResourceType.V100,
            "p100": ResourceType.P100,
            "a100": ResourceType.A100,
            "a5000": ResourceType.A5000,
        },
    )
156
+
157
+
158
def all(
    user: str | None = None,
    account: str | None = None,
    mounts: dict[os.PathLike[str] | str, os.PathLike[str] | str] | None = None,
) -> list[config.SlurmClusterConfig]:
    """Return the configurations of all DRAC clusters.

    The same `user`, `account` and `mounts` are handed to every cluster
    factory, in the order narval, beluga, cedar, graham. Each factory keeps
    its own default for any argument not passed here (e.g. `proxy`).

    NOTE: this function shadows the builtin `all` inside this module.
    """
    shared_kwargs = {"user": user, "account": account, "mounts": mounts}
    cluster_factories = (narval, beluga, cedar, graham)
    return [make_cluster(**shared_kwargs) for make_cluster in cluster_factories]
@@ -0,0 +1,201 @@
1
+ import dataclasses
2
+ import pathlib
3
+ import re
4
+ from typing import Mapping, NamedTuple, Sequence
5
+
6
+ from xmanager import xm
7
+
8
+
9
@dataclasses.dataclass(frozen=True, kw_only=True)
class Dockerfile(xm.ExecutableSpec):
    """Executable spec for an image built from a Dockerfile.

    Args:
        dockerfile: The path to the Dockerfile.
        context: The path to the Docker context.
        target: The Docker build target.
        build_args: Build arguments to docker.
        cache_from: Where to pull the BuildKit cache from. See `--cache-from` in `docker build`.
        workdir: The working directory in container.
        labels: The container labels.
        platforms: The target platform.
    """

    dockerfile: pathlib.Path  # Dockerfile
    context: pathlib.Path  # Docker context
    target: str | None = None  # Docker build target
    build_args: Mapping[str, str] = dataclasses.field(default_factory=dict)  # Build arguments
    cache_from: Sequence[str] = dataclasses.field(default_factory=list)  # BuildKit --cache-from
    workdir: pathlib.Path | None = None  # Working directory in container
    labels: Mapping[str, str] = dataclasses.field(default_factory=dict)  # Container labels
    platforms: Sequence[str] = dataclasses.field(default_factory=lambda: ["linux/amd64"])  # Targets

    @property
    def name(self) -> str:
        """Dockerfile stem, suffixed with `-<target>` when a target is set."""
        stem = self.dockerfile.stem
        return stem if self.target is None else f"{stem}-{self.target}"

    def __hash__(self) -> int:
        """Hash every field, turning mappings and sequences into sorted tuples.

        The mapping/sequence fields may hold unhashable containers (dict, list),
        so they are normalised into order-independent tuples first.
        """
        build_args_key = tuple(sorted(self.build_args.items()))
        cache_from_key = tuple(sorted(self.cache_from))
        labels_key = tuple(sorted(self.labels.items()))
        platforms_key = tuple(sorted(self.platforms))
        identity = (
            self.dockerfile,
            self.context,
            self.target,
            build_args_key,
            cache_from_key,
            self.workdir,
            labels_key,
            platforms_key,
        )
        return hash(identity)
65
+
66
+
67
@dataclasses.dataclass(frozen=True, kw_only=True)
class DockerImage(xm.ExecutableSpec):
    """Executable spec for an image that has already been built.

    Args:
        image: The remote image URI.
        workdir: The working directory in container.
    """

    image: str  # Remote image URI
    workdir: pathlib.Path | None = None  # Working directory in container

    @property
    def name(self) -> str:
        """The image URI doubles as the spec name."""
        image_uri = self.image
        return image_uri

    def __hash__(self) -> int:
        """Hash the (image, workdir) pair."""
        identity = (self.image, self.workdir)
        return hash(identity)
88
+
89
+
90
+ _IMAGE_URI_REGEX = re.compile(
91
+ r"^(?P<scheme>(?:[^:]+://)?)?(?P<domain>[^/]+)(?P<path>/[^:]*)?(?::(?P<tag>[^@]+))?@?(?P<digest>.+)?$"
92
+ )
93
+
94
+
95
+ @dataclasses.dataclass
96
+ class ImageURI:
97
+ image: dataclasses.InitVar[str]
98
+
99
+ scheme: str | None = dataclasses.field(init=False, default=None)
100
+ domain: str = dataclasses.field(init=False)
101
+ path: str = dataclasses.field(init=False)
102
+ tag: str | None = dataclasses.field(init=False, default=None)
103
+ digest: str | None = dataclasses.field(init=False, default=None)
104
+
105
+ def __post_init__(self, image: str):
106
+ match = _IMAGE_URI_REGEX.match(image)
107
+ if not match:
108
+ raise ValueError(f"Invalid OCI image URI: {image}")
109
+ groups = {k: v for k, v in match.groupdict().items() if v is not None}
110
+ for k, v in groups.items():
111
+ setattr(self, k, v)
112
+
113
+ if self.tag is None and self.digest is None:
114
+ self.tag = "latest"
115
+
116
+ @property
117
+ def locator(self) -> str:
118
+ """Unique locator for this image.
119
+
120
+ Locator will return the digest if it exists otherwise the tag format.
121
+ If neither are present, it will raise an AssertionError.
122
+ """
123
+ if self.digest is not None:
124
+ return f"@{self.digest}"
125
+ assert self.tag is not None
126
+ return f":{self.tag}"
127
+
128
+ @property
129
+ def url(self) -> str:
130
+ """URL for this image without the locator."""
131
+ return f"{self.origin}{self.path}"
132
+
133
+ @property
134
+ def origin(self) -> str:
135
+ return f"{self.scheme}{self.domain}"
136
+
137
+ def with_tag(self, tag: str) -> "ImageURI":
138
+ self.tag = tag
139
+ return self
140
+
141
+ def with_digest(self, digest: str) -> "ImageURI":
142
+ self.digest = digest
143
+ return self
144
+
145
+ def __str__(self) -> str:
146
+ return self.format("{url}{locator}")
147
+
148
+ def __hash__(self) -> int:
149
+ return hash((
150
+ self.scheme,
151
+ self.domain,
152
+ self.path,
153
+ self.tag,
154
+ self.digest,
155
+ ))
156
+
157
+ def format(self, format: str) -> str:
158
+ fields = {k: v for k, v in dataclasses.asdict(self).items() if v is not None}
159
+ fields |= {"locator": self.locator, "url": self.url}
160
+ return format.format(**fields)
161
+
162
+
163
class ImageDescriptor:
    """Descriptor that stores an image as an `ImageURI` on the owning instance.

    The value lives on the instance under `_<attribute name>`; strings are
    parsed into `ImageURI` when assigned.
    """

    def __set_name__(self, owner: type, name: str):
        """Record the private attribute name used for storage."""
        del owner
        self.image = "_" + name

    def __get__(self, instance: object, owner: type) -> ImageURI:
        """Return the stored value.

        Class-level access (`instance` is None) raises AttributeError, which
        `dataclasses` interprets as "this field has no default".
        """
        del owner
        storage_name = self.image
        return getattr(instance, storage_name)

    def __set__(self, instance: object, value: str | ImageURI):
        """Store `value`, converting plain strings to `ImageURI` first."""
        parsed = ImageURI(value) if isinstance(value, str) else value
        setattr(instance, self.image, parsed)
176
+
177
+
178
class RemoteRepositoryCredentials(NamedTuple):
    """Username/password pair for a remote image repository."""

    username: str
    password: str
181
+
182
+
183
@dataclasses.dataclass(kw_only=True)  # type: ignore
class RemoteImage(xm.Executable):
    """Executable backed by an image in a remote repository."""

    # Remote base image; plain strings are converted by the descriptor on assignment.
    image: ImageDescriptor = ImageDescriptor()

    # Working directory in container
    workdir: pathlib.Path | None = None

    # Container arguments
    args: xm.SequentialArgs = dataclasses.field(default_factory=xm.SequentialArgs)

    # Container environment variables
    env_vars: Mapping[str, str] = dataclasses.field(default_factory=dict)

    # Remote repository credentials
    credentials: RemoteRepositoryCredentials | None = None

    @property
    def name(self) -> str:
        """String form of the image."""
        image_uri = self.image
        return str(image_uri)