xmanager-slurm 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of xmanager-slurm might be problematic. Click here for more details.

Files changed (38) hide show
  1. xm_slurm/__init__.py +44 -0
  2. xm_slurm/api.py +261 -0
  3. xm_slurm/batching.py +139 -0
  4. xm_slurm/config.py +162 -0
  5. xm_slurm/console.py +3 -0
  6. xm_slurm/contrib/clusters/__init__.py +52 -0
  7. xm_slurm/contrib/clusters/drac.py +169 -0
  8. xm_slurm/executables.py +201 -0
  9. xm_slurm/execution.py +491 -0
  10. xm_slurm/executors.py +127 -0
  11. xm_slurm/experiment.py +737 -0
  12. xm_slurm/job_blocks.py +14 -0
  13. xm_slurm/packageables.py +292 -0
  14. xm_slurm/packaging/__init__.py +8 -0
  15. xm_slurm/packaging/docker/__init__.py +75 -0
  16. xm_slurm/packaging/docker/abc.py +112 -0
  17. xm_slurm/packaging/docker/cloud.py +503 -0
  18. xm_slurm/packaging/docker/local.py +206 -0
  19. xm_slurm/packaging/registry.py +45 -0
  20. xm_slurm/packaging/router.py +52 -0
  21. xm_slurm/packaging/utils.py +202 -0
  22. xm_slurm/resources.py +150 -0
  23. xm_slurm/status.py +188 -0
  24. xm_slurm/templates/docker/docker-bake.hcl.j2 +47 -0
  25. xm_slurm/templates/docker/mamba.Dockerfile +27 -0
  26. xm_slurm/templates/docker/pdm.Dockerfile +31 -0
  27. xm_slurm/templates/docker/python.Dockerfile +24 -0
  28. xm_slurm/templates/slurm/fragments/monitor.bash.j2 +32 -0
  29. xm_slurm/templates/slurm/fragments/proxy.bash.j2 +31 -0
  30. xm_slurm/templates/slurm/job-array.bash.j2 +29 -0
  31. xm_slurm/templates/slurm/job-group.bash.j2 +41 -0
  32. xm_slurm/templates/slurm/job.bash.j2 +78 -0
  33. xm_slurm/templates/slurm/runtimes/apptainer.bash.j2 +103 -0
  34. xm_slurm/templates/slurm/runtimes/podman.bash.j2 +56 -0
  35. xm_slurm/utils.py +69 -0
  36. xmanager_slurm-0.3.0.dist-info/METADATA +25 -0
  37. xmanager_slurm-0.3.0.dist-info/RECORD +38 -0
  38. xmanager_slurm-0.3.0.dist-info/WHEEL +4 -0
xm_slurm/job_blocks.py ADDED
@@ -0,0 +1,14 @@
1
+ from xmanager import xm
2
+
3
+
4
def get_args_for_python_entrypoint(
    entrypoint: xm.ModuleName | xm.CommandList,
) -> xm.SequentialArgs:
    """Converts a Python entrypoint into the arguments used to launch it.

    Args:
        entrypoint: Either a module name, which becomes `-m <module_name>`,
            or a command list, whose commands are used verbatim.

    Returns: The entrypoint arguments as sequential args.

    Raises:
        TypeError: If the entrypoint is neither a module name nor a command list.
    """
    if isinstance(entrypoint, xm.ModuleName):
        return xm.SequentialArgs.from_collection(["-m", entrypoint.module_name])
    if isinstance(entrypoint, xm.CommandList):
        return xm.SequentialArgs.from_collection(entrypoint.commands)
    raise TypeError(f"Invalid entrypoint type: {type(entrypoint)}")
@@ -0,0 +1,292 @@
1
+ import importlib.resources as resources
2
+ import pathlib
3
+ import sys
4
+ from typing import Mapping, Sequence
5
+
6
+ import immutabledict
7
+ from xmanager import xm
8
+
9
+ from xm_slurm import job_blocks, utils
10
+ from xm_slurm.executables import Dockerfile, DockerImage
11
+ from xm_slurm.executors import SlurmSpec
12
+
13
+
14
def docker_image(
    *,
    image: str,
    args: xm.UserArgs | None = None,
    env_vars: Mapping[str, str] = immutabledict.immutabledict(),
) -> xm.Packageable:
    """Wraps an already-built Docker image in a packageable.

    The packageable always uses a default `SlurmSpec` as its executor spec.

    Args:
        image: The remote image URI.
        args: The user arguments to pass to the executable.
        env_vars: The environment variables to pass to the executable.

    Returns: A packageable for a pre-built Docker image.
    """
    executor = SlurmSpec()
    executable = DockerImage(image=image)
    return xm.Packageable(
        executor_spec=executor,
        executable_spec=executable,
        args=args,
        env_vars=env_vars,
    )
35
+
36
+
37
+ def docker_container(
38
+ *,
39
+ executor_spec: xm.ExecutorSpec,
40
+ dockerfile: pathlib.Path | None = None,
41
+ context: pathlib.Path | None = None,
42
+ target: str | None = None,
43
+ build_args: Mapping[str, str] = immutabledict.immutabledict(),
44
+ cache_from: str | Sequence[str] | None = None,
45
+ labels: Mapping[str, str] = immutabledict.immutabledict(),
46
+ workdir: pathlib.Path | None = None,
47
+ args: xm.UserArgs | None = None,
48
+ env_vars: Mapping[str, str] = immutabledict.immutabledict(),
49
+ ) -> xm.Packageable:
50
+ """Creates a Docker container packageable from a dockerfile.
51
+
52
+ Args:
53
+ executor_spec: The executor specification, where will the container be stored at.
54
+ dockerfile: The path to the dockerfile.
55
+ context: The path to the docker context.
56
+ target: The docker build target.
57
+ build_args: Build arguments to docker.
58
+ cache_from: Where to pull the BuildKit cache from. See `--cache-from` in `docker build`.
59
+ labels: The container labels.
60
+ workdir: The working directory in container.
61
+ args: The user arguments to pass to the executable.
62
+ env_vars: The environment variables to pass to the executable.
63
+
64
+ Returns: A Docker container packageable.
65
+ """
66
+ if context is None:
67
+ context = utils.find_project_root()
68
+ context = context.resolve()
69
+ if dockerfile is None:
70
+ dockerfile = context / "Dockerfile"
71
+ dockerfile = dockerfile.resolve()
72
+ if cache_from is None and isinstance(executor_spec, SlurmSpec):
73
+ cache_from = executor_spec.tag
74
+ if cache_from is None:
75
+ cache_from = []
76
+ elif isinstance(cache_from, str):
77
+ cache_from = [cache_from]
78
+
79
+ return xm.Packageable(
80
+ executor_spec=executor_spec,
81
+ executable_spec=Dockerfile(
82
+ dockerfile=dockerfile,
83
+ context=context,
84
+ target=target,
85
+ build_args=build_args,
86
+ cache_from=cache_from,
87
+ workdir=workdir,
88
+ labels=labels,
89
+ ),
90
+ args=args,
91
+ env_vars=env_vars,
92
+ )
93
+
94
+
95
def python_container(
    *,
    executor_spec: xm.ExecutorSpec,
    entrypoint: xm.ModuleName | xm.CommandList,
    context: pathlib.Path | None = None,
    requirements: pathlib.Path | None = None,
    base_image: str = "docker.io/python:{major}.{minor}-slim",
    cache_from: str | Sequence[str] | None = None,
    labels: Mapping[str, str] = immutabledict.immutabledict(),
    args: xm.UserArgs | None = None,
    env_vars: Mapping[str, str] = immutabledict.immutabledict(),
) -> xm.Packageable:
    """Creates a pip-based Python container from a `requirements.txt` file.

    NOTE: `{major}`, `{minor}` and `{micro}` placeholders in `base_image` are
    filled in with the version of the current interpreter.

    Args:
        executor_spec: The executor specification, where will the container be stored at.
        entrypoint: The entrypoint to run in the container.
        context: The path to the docker context. Defaults to the project root.
        requirements: The path to the pip requirements file.
            Defaults to `<context>/requirements.txt`.
        base_image: The base image to use. NOTE: The base image must contain the Python runtime.
        cache_from: Where to pull the BuildKit cache from. See `--cache-from` in `docker build`.
        labels: The container labels.
        args: The user arguments to pass to the executable.
        env_vars: The environment variables to pass to the executable.

    Returns: A Python container packageable.

    Raises:
        ValueError: If the requirements file doesn't exist or lies outside of the context.
    """
    # Entrypoint arguments come first, followed by the user's arguments.
    args = xm.merge_args(job_blocks.get_args_for_python_entrypoint(entrypoint), args or {})

    context = (utils.find_project_root() if context is None else context).resolve()
    requirements = (
        context / "requirements.txt" if requirements is None else requirements
    ).resolve()
    if not requirements.exists():
        raise ValueError(f"Pip requirements `{requirements}` doesn't exist.")
    if not requirements.is_relative_to(context):
        raise ValueError(
            f"Pip requirements `{requirements}` must be relative to context: `{context}`"
        )

    version = sys.version_info
    template = resources.files("xm_slurm.templates").joinpath("docker/python.Dockerfile")
    # NOTE(review): `as_file` may hand out a temporary path that is removed when the
    # `with` exits (e.g. zipped installs), while the returned packageable keeps the
    # path. Presumably the package is always installed unpacked; confirm.
    with resources.as_file(template) as dockerfile:
        docker_build_args = {
            # The requirements path is passed relative to the build context.
            "PIP_REQUIREMENTS": requirements.relative_to(context).as_posix(),
            "PYTHON_MAJOR": str(version.major),
            "PYTHON_MINOR": str(version.minor),
            "PYTHON_MICRO": str(version.micro),
            "BASE_IMAGE": base_image.format_map({
                "major": version.major,
                "minor": version.minor,
                "micro": version.micro,
            }),
        }
        return docker_container(
            executor_spec=executor_spec,
            dockerfile=dockerfile,
            context=context,
            build_args=docker_build_args,
            cache_from=cache_from,
            labels=labels,
            # We must specify the workdir manually for apptainer support
            workdir=pathlib.Path("/workspace"),
            args=args,
            env_vars=env_vars,
        )
165
+
166
+
167
def mamba_container(
    *,
    executor_spec: xm.ExecutorSpec,
    entrypoint: xm.ModuleName | xm.CommandList,
    context: pathlib.Path | None = None,
    environment: pathlib.Path | None = None,
    base_image: str = "gcr.io/distroless/base-debian10",
    cache_from: str | Sequence[str] | None = None,
    labels: Mapping[str, str] = immutabledict.immutabledict(),
    args: xm.UserArgs | None = None,
    env_vars: Mapping[str, str] = immutabledict.immutabledict(),
) -> xm.Packageable:
    """Creates a Conda container using mamba from an `environment.yml` file.

    Note: The base image *doesn't* need to contain the Python runtime.

    Args:
        executor_spec: The executor specification, where will the container be stored at.
        entrypoint: The entrypoint to run in the container.
        context: The path to the docker context. Defaults to the project root.
        environment: The path to the conda environment file.
            Defaults to `<context>/environment.yml`.
        base_image: The base image to use.
        cache_from: Where to pull the BuildKit cache from. See `--cache-from` in `docker build`.
        labels: The container labels.
        args: The user arguments to pass to the executable.
        env_vars: The environment variables to pass to the executable.

    Returns: A Conda container packageable.

    Raises:
        ValueError: If the environment file doesn't exist or lies outside of the context.
    """
    # Entrypoint arguments come first, followed by the user's arguments.
    args = xm.merge_args(job_blocks.get_args_for_python_entrypoint(entrypoint), args or {})

    context = (utils.find_project_root() if context is None else context).resolve()
    environment = (context / "environment.yml" if environment is None else environment).resolve()
    if not environment.exists():
        raise ValueError(f"Conda environment manifest `{environment}` doesn't exist.")
    if not environment.is_relative_to(context):
        raise ValueError(
            f"Conda environment manifest `{environment}` must be relative to context: `{context}`"
        )

    template = resources.files("xm_slurm.templates").joinpath("docker/mamba.Dockerfile")
    # NOTE(review): `as_file` may hand out a temporary path that is removed when the
    # `with` exits (e.g. zipped installs), while the returned packageable keeps the
    # path. Presumably the package is always installed unpacked; confirm.
    with resources.as_file(template) as dockerfile:
        docker_build_args = {
            # The environment path is passed relative to the build context.
            "CONDA_ENVIRONMENT": environment.relative_to(context).as_posix(),
            "BASE_IMAGE": base_image,
        }
        return docker_container(
            executor_spec=executor_spec,
            dockerfile=dockerfile,
            context=context,
            build_args=docker_build_args,
            cache_from=cache_from,
            labels=labels,
            # We must specify the workdir manually for apptainer support
            workdir=pathlib.Path("/workspace"),
            args=args,
            env_vars=env_vars,
        )
230
+
231
+
232
+ conda_container = mamba_container
233
+
234
+
235
def pdm_container(
    *,
    executor_spec: xm.ExecutorSpec,
    entrypoint: xm.ModuleName | xm.CommandList,
    context: pathlib.Path | None = None,
    base_image: str = "docker.io/python:{major}.{minor}-slim",
    cache_from: str | Sequence[str] | None = None,
    labels: Mapping[str, str] = immutabledict.immutabledict(),
    args: xm.UserArgs | None = None,
    env_vars: Mapping[str, str] = immutabledict.immutabledict(),
) -> xm.Packageable:
    """Creates a pdm-based Python container from a `pdm.lock` file.

    `{major}`, `{minor}` and `{micro}` placeholders in `base_image` are filled
    in with the version of the current interpreter.

    Args:
        executor_spec: The executor specification, where will the container be stored at.
        entrypoint: The entrypoint to run in the container.
        context: The path to the docker context. Defaults to the project root.
            It must contain a `pdm.lock` file.
        base_image: The base image to use. NOTE: The base image must contain the Python runtime.
        cache_from: Where to pull the BuildKit cache from. See `--cache-from` in `docker build`.
        labels: The container labels.
        args: The user arguments to pass to the executable.
        env_vars: The environment variables to pass to the executable.

    Returns: A Python container packageable.

    Raises:
        ValueError: If the context doesn't contain a `pdm.lock` file.
    """
    # Entrypoint arguments come first, followed by the user's arguments.
    args = xm.merge_args(job_blocks.get_args_for_python_entrypoint(entrypoint), args or {})

    context = (utils.find_project_root() if context is None else context).resolve()
    lockfile = context / "pdm.lock"
    if not lockfile.exists():
        raise ValueError(f"PDM lockfile `{context / 'pdm.lock'}` doesn't exist.")

    version = sys.version_info
    template = resources.files("xm_slurm.templates").joinpath("docker/pdm.Dockerfile")
    # NOTE(review): `as_file` may hand out a temporary path that is removed when the
    # `with` exits (e.g. zipped installs), while the returned packageable keeps the
    # path. Presumably the package is always installed unpacked; confirm.
    with resources.as_file(template) as dockerfile:
        docker_build_args = {
            "PYTHON_MAJOR": str(version.major),
            "PYTHON_MINOR": str(version.minor),
            "PYTHON_MICRO": str(version.micro),
            "BASE_IMAGE": base_image.format_map({
                "major": version.major,
                "minor": version.minor,
                "micro": version.micro,
            }),
        }
        return docker_container(
            executor_spec=executor_spec,
            dockerfile=dockerfile,
            context=context,
            build_args=docker_build_args,
            cache_from=cache_from,
            labels=labels,
            # We must specify the workdir manually for apptainer support
            workdir=pathlib.Path("/workspace/src"),
            args=args,
            env_vars=env_vars,
        )
@@ -0,0 +1,8 @@
1
+ # First register our built-in packaging methods
2
+ import xm_slurm.packaging.docker # noqa: F401
3
+ from xm_slurm.packaging import registry, router
4
+
5
+ package = router.package
6
+ register = registry.register
7
+
8
+ __all__ = ["package", "register"]
@@ -0,0 +1,75 @@
1
+ import dataclasses
2
+ import functools
3
+ from typing import Sequence
4
+
5
+ from absl import flags
6
+ from xmanager import xm
7
+
8
+ from xm_slurm.executables import Dockerfile, DockerImage, ImageURI, RemoteImage
9
+ from xm_slurm.executors import SlurmSpec
10
+ from xm_slurm.packaging import registry
11
+ from xm_slurm.packaging.docker.abc import DockerClient
12
+
13
+ FLAGS = flags.FLAGS
14
+ REMOTE_BUILD = flags.DEFINE_enum(
15
+ "xm_builder", "local", ["local", "gcp", "azure"], "Remote build provider."
16
+ )
17
+
18
+ IndexedContainer = registry.IndexedContainer
19
+
20
+
21
@functools.cache
def docker_client() -> DockerClient:
    """Returns the Docker client selected by the `xm_builder` flag.

    The result is cached, so the flag is only consulted on the first call.
    Client implementations are imported lazily, only for the selected provider.

    Raises:
        NotImplementedError: If the `azure` provider is selected.
        ValueError: If the provider is not recognised.
    """
    provider = REMOTE_BUILD.value
    if provider == "local":
        from xm_slurm.packaging.docker.local import LocalDockerClient

        return LocalDockerClient()
    if provider == "gcp":
        from xm_slurm.packaging.docker.cloud import GoogleCloudRemoteDockerClient

        return GoogleCloudRemoteDockerClient()
    if provider == "azure":
        raise NotImplementedError("Azure remote build is not yet supported.")
    raise ValueError(f"Unknown remote build provider: {REMOTE_BUILD.value}")
36
+
37
+
38
@registry.register(Dockerfile)
def _(
    targets: Sequence[IndexedContainer[xm.Packageable]],
) -> list[IndexedContainer[RemoteImage]]:
    """Packages `Dockerfile` targets by baking them with the selected Docker client."""
    client = docker_client()
    return client.bake(targets=targets)
43
+
44
+
45
@registry.register(DockerImage)
def _(
    targets: Sequence[IndexedContainer[xm.Packageable]],
) -> list[IndexedContainer[RemoteImage]]:
    """Packages pre-built `DockerImage` targets; nothing is built.

    Each target is replaced by a `RemoteImage` pointing at the image URI, with
    credentials looked up for the URI's domain.

    Raises:
        ValueError: If a target's `SlurmSpec` carries a tag.
    """
    client = docker_client()

    def _to_remote_image(
        target: IndexedContainer[xm.Packageable],
    ) -> IndexedContainer[RemoteImage]:
        packageable = target.value
        executable = packageable.executable_spec
        executor = packageable.executor_spec
        assert isinstance(executable, DockerImage)
        assert isinstance(executor, SlurmSpec)
        if executor.tag is not None:
            raise ValueError(
                "Executable `DockerImage` should not be tagged via `SlurmSpec`. "
                "The image URI is provided by the `DockerImage` itself."
            )

        uri = ImageURI(executable.image)
        remote_image = RemoteImage(  # type: ignore
            image=str(uri),
            workdir=executable.workdir,
            args=packageable.args,
            env_vars=packageable.env_vars,
            credentials=client.credentials(hostname=uri.domain),
        )
        return dataclasses.replace(target, value=remote_image)

    return [_to_remote_image(target) for target in targets]
@@ -0,0 +1,112 @@
1
+ import abc
2
+ import collections.abc
3
+ import dataclasses
4
+ import functools
5
+ import os
6
+ from typing import Literal, Mapping, Protocol, Sequence
7
+
8
+ import jinja2 as j2
9
+ from xmanager import xm
10
+
11
+ from xm_slurm.executables import RemoteImage, RemoteRepositoryCredentials
12
+ from xm_slurm.packaging.registry import IndexedContainer
13
+
14
+
15
class DockerCommandProtocol(Protocol):
    """Structural interface for objects that render to Docker CLI arguments."""

    def to_args(self) -> xm.SequentialArgs:
        """Returns the command's arguments as sequential args."""
        ...
17
+
18
+
19
@dataclasses.dataclass(frozen=True, kw_only=True)
class DockerBakeCommand(DockerCommandProtocol):
    """Arguments for a `buildx bake` invocation.

    `files` and `targets` each accept a single value or a sequence of values.
    """

    # Bake target name(s), appended as positional arguments.
    targets: str | Sequence[str] | None = None
    # Rendered as `--builder=<builder>` when set.
    builder: str | None = None
    # Bake definition file(s), each rendered as `--file=<file>`.
    files: str | os.PathLike[str] | Sequence[os.PathLike[str] | str] | None = None
    # Adds `--load` when true.
    load: bool = False
    # Adds `--no-cache` when false.
    cache: bool = True
    # Adds `--print` when true.
    print: bool = False
    # Adds `--pull` when true.
    pull: bool = False
    # Adds `--push` when true.
    push: bool = False
    # Rendered as `--metadata-file=<path>` when set.
    metadata_file: str | os.PathLike[str] | None = None
    # Always rendered as `--progress=<progress>`.
    progress: Literal["auto", "plain", "tty"] = "auto"
    # Overrides, each rendered as `--set=<key>=<value>`.
    set: Mapping[str, str] | None = None

    @staticmethod
    def _as_list(value) -> list:
        """Normalises `None`, a single value, or a sequence of values to a list."""
        if value is None:
            return []
        # `str` is itself a `Sequence`, so it must be checked first; otherwise a
        # single name or path would be split into one argument per character.
        if isinstance(value, (str, os.PathLike)):
            return [value]
        if isinstance(value, collections.abc.Sequence):
            return list(value)
        return [value]

    def to_args(self) -> xm.SequentialArgs:
        """Renders the command as sequential args, starting with `buildx bake`.

        Returns: The flags followed by the target names.
        """
        files = self._as_list(self.files)
        targets = self._as_list(self.targets)

        return xm.merge_args(
            ["buildx", "bake"],
            [f"--progress={self.progress}"],
            [f"--builder={self.builder}"] if self.builder else [],
            [f"--metadata-file={self.metadata_file}"] if self.metadata_file else [],
            ["--print"] if self.print else [],
            ["--push"] if self.push else [],
            ["--pull"] if self.pull else [],
            ["--load"] if self.load else [],
            ["--no-cache"] if not self.cache else [],
            [f"--file={file}" for file in files],
            [f"--set={key}={value}" for key, value in self.set.items()] if self.set else [],
            targets,
        )
60
+
61
+
62
@dataclasses.dataclass(frozen=True, kw_only=True)
class DockerPullCommand(DockerCommandProtocol):
    """Arguments for a `pull` invocation of a single image."""

    # The image reference passed to `pull`.
    image: str

    def to_args(self) -> xm.SequentialArgs:
        """Renders the command as `pull <image>`."""
        pull_args = ["pull", self.image]
        return xm.merge_args(pull_args)
68
+
69
+
70
@dataclasses.dataclass(frozen=True, kw_only=True)
class DockerLoginCommand(DockerCommandProtocol):
    """Arguments for a `login` invocation against a registry server.

    Exactly one of `password` and `password_stdin` must be provided.
    """

    # The registry server, passed as the final positional argument.
    server: str
    # Rendered as `--username <username>`.
    username: str
    # Rendered as `--password <password>` when set.
    password: str | None = None
    # Adds `--password-stdin` when true.
    password_stdin: bool = False

    def __post_init__(self):
        """Validates that exactly one password source was chosen.

        Raises:
            ValueError: If neither or both of `password` and `password_stdin` are set.
        """
        has_password = self.password is not None
        if not (has_password or self.password_stdin):
            raise ValueError("Either password or password_stdin must be set")
        if has_password and self.password_stdin:
            raise ValueError("Only one of password or password_stdin must be set")

    def to_args(self) -> xm.SequentialArgs:
        """Renders the command as `login` flags followed by the server."""
        password_args = ["--password", self.password] if self.password else []
        stdin_args = ["--password-stdin"] if self.password_stdin else []
        return xm.merge_args(
            ["login", "--username", self.username],
            password_args,
            stdin_args,
            [self.server],
        )
90
+
91
+
92
@dataclasses.dataclass(frozen=True, kw_only=True)
class DockerVersionCommand(DockerCommandProtocol):
    """Arguments for a `buildx version` invocation."""

    def to_args(self) -> xm.SequentialArgs:
        """Renders the command as `buildx version`."""
        version_args = ["buildx", "version"]
        return xm.merge_args(version_args)
96
+
97
+
98
class DockerClient(abc.ABC):
    """Abstract base for clients that resolve registry credentials and bake images."""

    @functools.cached_property
    def _bake_template(self) -> j2.Template:
        """The `docker-bake.hcl.j2` template, loaded once per client instance."""
        environment = j2.Environment(
            loader=j2.PackageLoader("xm_slurm", "templates/docker"),
            trim_blocks=True,
            lstrip_blocks=False,
        )
        return environment.get_template("docker-bake.hcl.j2")

    @abc.abstractmethod
    def credentials(self, *, hostname: str) -> RemoteRepositoryCredentials | None:
        """Returns the credentials for `hostname`, or `None`."""
        ...

    @abc.abstractmethod
    def bake(
        self, *, targets: Sequence[IndexedContainer[xm.Packageable]]
    ) -> list[IndexedContainer[RemoteImage]]:
        """Turns the indexed packageable `targets` into indexed remote images."""
        ...