xmanager-slurm 0.4.0__py3-none-any.whl → 0.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of xmanager-slurm might be problematic. Click here for more details.
- xm_slurm/__init__.py +4 -2
- xm_slurm/api.py +1 -1
- xm_slurm/config.py +7 -2
- xm_slurm/constants.py +4 -0
- xm_slurm/contrib/clusters/__init__.py +25 -0
- xm_slurm/dependencies.py +171 -0
- xm_slurm/executables.py +20 -15
- xm_slurm/execution.py +246 -96
- xm_slurm/executors.py +8 -12
- xm_slurm/experiment.py +374 -83
- xm_slurm/experimental/parameter_controller.py +12 -10
- xm_slurm/packaging/{docker/local.py → docker.py} +126 -32
- xm_slurm/packaging/router.py +3 -1
- xm_slurm/packaging/utils.py +4 -28
- xm_slurm/scripts/cli.py +52 -0
- xm_slurm/templates/docker/mamba.Dockerfile +1 -1
- xm_slurm/templates/slurm/fragments/monitor.bash.j2 +5 -0
- xm_slurm/templates/slurm/job-array.bash.j2 +1 -2
- xm_slurm/templates/slurm/job.bash.j2 +4 -3
- xm_slurm/types.py +23 -0
- {xmanager_slurm-0.4.0.dist-info → xmanager_slurm-0.4.1.dist-info}/METADATA +1 -1
- xmanager_slurm-0.4.1.dist-info/RECORD +44 -0
- xmanager_slurm-0.4.1.dist-info/entry_points.txt +2 -0
- xm_slurm/packaging/docker/__init__.py +0 -69
- xm_slurm/packaging/docker/abc.py +0 -112
- xmanager_slurm-0.4.0.dist-info/RECORD +0 -42
- {xmanager_slurm-0.4.0.dist-info → xmanager_slurm-0.4.1.dist-info}/WHEEL +0 -0
- {xmanager_slurm-0.4.0.dist-info → xmanager_slurm-0.4.1.dist-info}/licenses/LICENSE.md +0 -0
|
@@ -18,6 +18,8 @@ from xm_slurm.experiment import SlurmAuxiliaryUnit, SlurmExperiment
|
|
|
18
18
|
P = ParamSpec("P")
|
|
19
19
|
T = TypeVar("T")
|
|
20
20
|
|
|
21
|
+
logger = logging.getLogger(__name__)
|
|
22
|
+
|
|
21
23
|
|
|
22
24
|
async def _monitor_parameter_controller(
|
|
23
25
|
aux_unit: SlurmAuxiliaryUnit,
|
|
@@ -33,16 +35,16 @@ async def _monitor_parameter_controller(
|
|
|
33
35
|
try:
|
|
34
36
|
_ = future.result()
|
|
35
37
|
except asyncio.CancelledError:
|
|
36
|
-
|
|
38
|
+
logger.info("Local parameter controller was cancelled, resuming on remote controller.")
|
|
37
39
|
pass
|
|
38
40
|
except Exception:
|
|
39
|
-
|
|
41
|
+
logger.error("Local parameter controller failed, stopping remote controller.")
|
|
40
42
|
aux_unit.stop(
|
|
41
43
|
mark_as_failed=True, mark_as_completed=False, message="Local controller failed."
|
|
42
44
|
)
|
|
43
45
|
raise
|
|
44
46
|
else:
|
|
45
|
-
|
|
47
|
+
logger.info(
|
|
46
48
|
"Local parameter controller finished before remote controller started, "
|
|
47
49
|
"stopping remote controller."
|
|
48
50
|
)
|
|
@@ -56,33 +58,33 @@ async def _monitor_parameter_controller(
|
|
|
56
58
|
interval=poll_interval,
|
|
57
59
|
)
|
|
58
60
|
async def wait_for_remote_controller() -> status.SlurmWorkUnitStatusEnum:
|
|
59
|
-
|
|
61
|
+
logger.info("Waiting for remote parameter controller to start.")
|
|
60
62
|
if local_controller_finished.is_set():
|
|
61
63
|
return status.SlurmWorkUnitStatusEnum.COMPLETED
|
|
62
64
|
return (await aux_unit.get_status()).status
|
|
63
65
|
|
|
64
|
-
|
|
66
|
+
logger.info("Monitoring remote parameter controller.")
|
|
65
67
|
# TODO(jfarebro): make get_status() more resilient to errors when initially scheduling.
|
|
66
68
|
# We run into issues if we call get_status() too quickly when Slurm hasn't ingested the job.
|
|
67
69
|
await asyncio.sleep(15)
|
|
68
70
|
match await wait_for_remote_controller():
|
|
69
71
|
case status.SlurmWorkUnitStatusEnum.RUNNING:
|
|
70
|
-
|
|
72
|
+
logger.info("Remote parameter controller started.")
|
|
71
73
|
local_parameter_controller.cancel("Remote parameter controller started.")
|
|
72
74
|
case status.SlurmWorkUnitStatusEnum.COMPLETED:
|
|
73
75
|
if local_parameter_controller.done():
|
|
74
|
-
|
|
76
|
+
logger.info("Local parameter controller finished, stopping remote controller.")
|
|
75
77
|
aux_unit.stop(
|
|
76
78
|
mark_as_completed=True, message="Local parameter controller finished."
|
|
77
79
|
)
|
|
78
80
|
else:
|
|
79
|
-
|
|
81
|
+
logger.info("Remote parameter controller finished, stopping local controller.")
|
|
80
82
|
local_parameter_controller.cancel()
|
|
81
83
|
case status.SlurmWorkUnitStatusEnum.FAILED:
|
|
82
|
-
|
|
84
|
+
logger.error("Remote parameter controller failed, stopping local controller.")
|
|
83
85
|
local_parameter_controller.cancel()
|
|
84
86
|
case status.SlurmWorkUnitStatusEnum.CANCELLED:
|
|
85
|
-
|
|
87
|
+
logger.info("Remote parameter controller was cancelled, stopping local controller.")
|
|
86
88
|
local_parameter_controller.cancel()
|
|
87
89
|
case status.SlurmWorkUnitStatusEnum.PENDING:
|
|
88
90
|
raise RuntimeError("Remote parameter controller is still pending, invalid state.")
|
|
@@ -1,6 +1,9 @@
|
|
|
1
1
|
import base64
|
|
2
|
+
import collections.abc
|
|
2
3
|
import dataclasses
|
|
3
4
|
import enum
|
|
5
|
+
import functools
|
|
6
|
+
import hashlib
|
|
4
7
|
import json
|
|
5
8
|
import logging
|
|
6
9
|
import os
|
|
@@ -9,29 +12,31 @@ import shlex
|
|
|
9
12
|
import shutil
|
|
10
13
|
import subprocess
|
|
11
14
|
import tempfile
|
|
12
|
-
from typing import Sequence
|
|
15
|
+
from typing import Hashable, Literal, Mapping, Sequence
|
|
13
16
|
|
|
17
|
+
import jinja2 as j2
|
|
14
18
|
from xmanager import xm
|
|
15
19
|
|
|
16
20
|
from xm_slurm.executables import (
|
|
17
21
|
Dockerfile,
|
|
22
|
+
DockerImage,
|
|
18
23
|
ImageURI,
|
|
19
24
|
RemoteImage,
|
|
20
25
|
RemoteRepositoryCredentials,
|
|
21
26
|
)
|
|
22
27
|
from xm_slurm.executors import SlurmSpec
|
|
28
|
+
from xm_slurm.packaging import registry
|
|
23
29
|
from xm_slurm.packaging import utils as packaging_utils
|
|
24
|
-
from xm_slurm.packaging.docker.abc import (
|
|
25
|
-
DockerBakeCommand,
|
|
26
|
-
DockerClient,
|
|
27
|
-
DockerVersionCommand,
|
|
28
|
-
)
|
|
29
30
|
from xm_slurm.packaging.registry import IndexedContainer
|
|
30
31
|
|
|
32
|
+
logger = logging.getLogger(__name__)
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def _hash_digest(obj: Hashable) -> str:
|
|
36
|
+
return hashlib.sha256(repr(obj).encode()).hexdigest()
|
|
31
37
|
|
|
32
|
-
class LocalDockerClient(DockerClient):
|
|
33
|
-
"""Build Docker images locally."""
|
|
34
38
|
|
|
39
|
+
class DockerClient:
|
|
35
40
|
class Builder(enum.Enum):
|
|
36
41
|
BUILDKIT = enum.auto()
|
|
37
42
|
BUILDAH = enum.auto()
|
|
@@ -47,12 +52,11 @@ class LocalDockerClient(DockerClient):
|
|
|
47
52
|
raise RuntimeError("No Docker client found.")
|
|
48
53
|
self._client_call = client_call
|
|
49
54
|
|
|
50
|
-
version_command = DockerVersionCommand()
|
|
51
55
|
backend_version = packaging_utils.run_command(
|
|
52
|
-
xm.merge_args(self._client_call,
|
|
56
|
+
xm.merge_args(self._client_call, ["buildx", "version"]), return_stdout=True
|
|
53
57
|
)
|
|
54
58
|
if backend_version.stdout.startswith("github.com/docker/buildx"):
|
|
55
|
-
self._builder =
|
|
59
|
+
self._builder = DockerClient.Builder.BUILDKIT
|
|
56
60
|
else:
|
|
57
61
|
raise NotImplementedError(f"Unsupported Docker build backend: {backend_version}")
|
|
58
62
|
|
|
@@ -115,21 +119,69 @@ class LocalDockerClient(DockerClient):
|
|
|
115
119
|
|
|
116
120
|
# Attempt to parse credentials from the Docker or Podman configuration
|
|
117
121
|
match self._builder:
|
|
118
|
-
case
|
|
122
|
+
case DockerClient.Builder.BUILDKIT:
|
|
119
123
|
docker_config_path = (
|
|
120
124
|
pathlib.Path(os.environ.get("DOCKER_CONFIG", "~/.docker")).expanduser()
|
|
121
125
|
/ "config.json"
|
|
122
126
|
)
|
|
123
127
|
return _parse_credentials_from_config(docker_config_path)
|
|
124
|
-
case
|
|
128
|
+
case DockerClient.Builder.BUILDAH:
|
|
125
129
|
podman_config_path = (
|
|
126
130
|
pathlib.Path(os.environ.get("XDG_CONFIG_HOME", "~/.config")).expanduser()
|
|
127
131
|
/ "containers"
|
|
128
132
|
/ "auth.json"
|
|
129
133
|
)
|
|
130
134
|
return _parse_credentials_from_config(podman_config_path)
|
|
131
|
-
|
|
132
|
-
|
|
135
|
+
|
|
136
|
+
@functools.cached_property
def _bake_template(self) -> j2.Template:
    """Jinja2 template used to render the ``docker-bake.hcl`` build definition.

    The template is looked up in the ``templates/docker`` directory of the
    ``xm_slurm`` package. Being a ``cached_property``, the lookup runs on first
    access and the resulting template is reused by this instance afterwards.
    """
    environment = j2.Environment(
        loader=j2.PackageLoader("xm_slurm", "templates/docker"),
        trim_blocks=True,
        lstrip_blocks=False,
    )
    return environment.get_template("docker-bake.hcl.j2")
|
|
142
|
+
|
|
143
|
+
def _bake_args(
    self,
    *,
    targets: str | Sequence[str] | None = None,
    builder: str | None = None,
    files: str | os.PathLike[str] | Sequence[os.PathLike[str] | str] | None = None,
    load: bool = False,
    cache: bool = True,
    print: bool = False,
    pull: bool = False,
    push: bool = False,
    metadata_file: str | os.PathLike[str] | None = None,
    progress: Literal["auto", "plain", "tty"] = "auto",
    set: Mapping[str, str] | None = None,
) -> xm.SequentialArgs:
    """Assemble the arguments of a ``buildx bake`` invocation.

    The client executable itself is not included; callers prepend it.

    Args:
        targets: Bake target name(s) appended after all flags. A single string
            is treated as one target.
        builder: Emitted as ``--builder=<builder>`` when truthy.
        files: Bake definition file(s), each emitted as ``--file=<file>``. A
            single string or path-like object is treated as one file.
        load: Emit ``--load``.
        cache: When false, emit ``--no-cache``.
        print: Emit ``--print``.
        pull: Emit ``--pull``.
        push: Emit ``--push``.
        metadata_file: Emitted as ``--metadata-file=<path>`` when truthy.
        progress: Emitted as ``--progress=<progress>``.
        set: Overrides, each emitted as ``--set=<key>=<value>``.

    Returns:
        The merged argument sequence starting with ``buildx bake``.
    """
    # NOTE: `print` and `set` shadow builtins inside this method; they are kept
    # because callers pass them by keyword.
    if files is None:
        files = []
    elif isinstance(files, (str, os.PathLike)):
        # A `str` is itself a `Sequence`, so it must be wrapped explicitly;
        # otherwise it would be iterated character by character below.
        files = [files]
    elif not isinstance(files, collections.abc.Sequence):
        files = [files]

    if targets is None:
        targets = []
    elif isinstance(targets, str):
        targets = [targets]
    assert isinstance(targets, collections.abc.Sequence)

    return xm.merge_args(
        ["buildx", "bake"],
        [f"--progress={progress}"],
        [f"--builder={builder}"] if builder else [],
        [f"--metadata-file={metadata_file}"] if metadata_file else [],
        ["--print"] if print else [],
        ["--push"] if push else [],
        ["--pull"] if pull else [],
        ["--load"] if load else [],
        ["--no-cache"] if not cache else [],
        [f"--file={file}" for file in files],
        [f"--set={key}={value}" for key, value in set.items()] if set else [],
        targets,
    )
|
|
133
185
|
|
|
134
186
|
def bake(
|
|
135
187
|
self,
|
|
@@ -150,28 +202,27 @@ class LocalDockerClient(DockerClient):
|
|
|
150
202
|
metadata_file = pathlib.Path(tempdir) / "metadata.json"
|
|
151
203
|
|
|
152
204
|
# Write HCL and bake it
|
|
205
|
+
# TODO(jfarebro): Need a better way to hash the executables
|
|
153
206
|
hcl = self._bake_template.render(
|
|
154
207
|
executables=executors_by_executables,
|
|
155
|
-
hash=
|
|
208
|
+
hash=_hash_digest,
|
|
156
209
|
)
|
|
157
210
|
hcl_file.write_text(hcl)
|
|
158
|
-
|
|
211
|
+
logger.debug(hcl)
|
|
159
212
|
|
|
160
213
|
try:
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
for target in targets
|
|
166
|
-
|
|
214
|
+
bake_command = xm.merge_args(
|
|
215
|
+
self._client_call,
|
|
216
|
+
self._bake_args(
|
|
217
|
+
targets=list(
|
|
218
|
+
set([_hash_digest(target.value.executable_spec) for target in targets])
|
|
219
|
+
),
|
|
220
|
+
files=[hcl_file],
|
|
221
|
+
metadata_file=metadata_file,
|
|
222
|
+
pull=True,
|
|
223
|
+
push=True,
|
|
167
224
|
),
|
|
168
|
-
files=[hcl_file],
|
|
169
|
-
metadata_file=metadata_file,
|
|
170
|
-
pull=True,
|
|
171
|
-
push=True,
|
|
172
225
|
)
|
|
173
|
-
|
|
174
|
-
bake_command = xm.merge_args(self._client_call, command.to_args())
|
|
175
226
|
packaging_utils.run_command(bake_command.to_list(), tty=True, check=True)
|
|
176
227
|
except Exception as ex:
|
|
177
228
|
raise RuntimeError(f"Failed to build Dockerfiles: {ex}") from ex
|
|
@@ -184,9 +235,7 @@ class LocalDockerClient(DockerClient):
|
|
|
184
235
|
assert isinstance(target.value.executor_spec, SlurmSpec)
|
|
185
236
|
assert target.value.executor_spec.tag
|
|
186
237
|
|
|
187
|
-
executable_metadata = metadata[
|
|
188
|
-
packaging_utils.hash_digest(target.value.executable_spec)
|
|
189
|
-
]
|
|
238
|
+
executable_metadata = metadata[_hash_digest(target.value.executable_spec)]
|
|
190
239
|
uri = ImageURI(target.value.executor_spec.tag).with_digest(
|
|
191
240
|
executable_metadata["containerimage.digest"]
|
|
192
241
|
)
|
|
@@ -205,3 +254,48 @@ class LocalDockerClient(DockerClient):
|
|
|
205
254
|
)
|
|
206
255
|
|
|
207
256
|
return images
|
|
257
|
+
|
|
258
|
+
|
|
259
|
+
@functools.cache
def docker_client() -> DockerClient:
    """Return the shared ``DockerClient``.

    The function is wrapped in ``functools.cache``, so the client is constructed
    on the first call and the same instance is returned on later calls.
    """
    client = DockerClient()
    return client
|
|
262
|
+
|
|
263
|
+
|
|
264
|
+
@registry.register(Dockerfile)
def _(
    targets: Sequence[IndexedContainer[xm.Packageable]],
) -> list[IndexedContainer[RemoteImage]]:
    """Package ``Dockerfile`` targets by baking them with the shared Docker client."""
    client = docker_client()
    return client.bake(targets=targets)
|
|
269
|
+
|
|
270
|
+
|
|
271
|
+
@registry.register(DockerImage)
def _(
    targets: Sequence[IndexedContainer[xm.Packageable]],
) -> list[IndexedContainer[RemoteImage]]:
    """Build Docker images, this is essentially a passthrough.

    Nothing is built: each target's value is replaced by a ``RemoteImage`` that
    points at the URI carried by its ``DockerImage`` spec, with credentials
    looked up for the URI's domain.

    Raises:
        ValueError: If a target's ``SlurmSpec`` carries a tag.
    """
    packaged = []
    client = docker_client()
    for container in targets:
        packageable = container.value
        assert isinstance(packageable.executable_spec, DockerImage)
        assert isinstance(packageable.executor_spec, SlurmSpec)
        if packageable.executor_spec.tag is not None:
            raise ValueError(
                "Executable `DockerImage` should not be tagged via `SlurmSpec`. "
                "The image URI is provided by the `DockerImage` itself."
            )

        uri = ImageURI(packageable.executable_spec.image)
        remote_image = RemoteImage(  # type: ignore
            image=str(uri),
            workdir=packageable.executable_spec.workdir,
            args=packageable.args,
            env_vars=packageable.env_vars,
            credentials=client.credentials(hostname=uri.domain),
        )
        packaged.append(dataclasses.replace(container, value=remote_image))

    return packaged
|
xm_slurm/packaging/router.py
CHANGED
|
@@ -10,6 +10,8 @@ from xm_slurm.packaging import registry
|
|
|
10
10
|
|
|
11
11
|
IndexedContainer = registry.IndexedContainer
|
|
12
12
|
|
|
13
|
+
logger = logging.getLogger(__name__)
|
|
14
|
+
|
|
13
15
|
|
|
14
16
|
def package(
|
|
15
17
|
packageables: Sequence[xm.Packageable],
|
|
@@ -39,7 +41,7 @@ def package(
|
|
|
39
41
|
# TODO(jfarebro): Could make this async as well...?
|
|
40
42
|
with console.status("[magenta] :package: Packaging executables..."):
|
|
41
43
|
for executable_spec_type, targets_for_type in targets_by_type.items():
|
|
42
|
-
|
|
44
|
+
logger.info(f"Packaging {len(targets_for_type)} {executable_spec_type!r} targets.")
|
|
43
45
|
targets.extend(registry.route(executable_spec_type, targets_for_type))
|
|
44
46
|
|
|
45
47
|
console.print(
|
xm_slurm/packaging/utils.py
CHANGED
|
@@ -1,7 +1,4 @@
|
|
|
1
1
|
import collections
|
|
2
|
-
import concurrent.futures
|
|
3
|
-
import functools
|
|
4
|
-
import hashlib
|
|
5
2
|
import logging
|
|
6
3
|
import os
|
|
7
4
|
import pathlib
|
|
@@ -10,7 +7,7 @@ import re
|
|
|
10
7
|
import select
|
|
11
8
|
import shutil
|
|
12
9
|
import subprocess
|
|
13
|
-
from typing import
|
|
10
|
+
from typing import ParamSpec, Sequence, TypeVar
|
|
14
11
|
|
|
15
12
|
from xmanager import xm
|
|
16
13
|
|
|
@@ -20,17 +17,7 @@ T = TypeVar("T")
|
|
|
20
17
|
P = ParamSpec("P")
|
|
21
18
|
ReturnT = TypeVar("ReturnT")
|
|
22
19
|
|
|
23
|
-
|
|
24
|
-
def hash_digest(obj: Hashable) -> str:
|
|
25
|
-
# TODO(jfarebro): Need a better way to hash these objects
|
|
26
|
-
# obj_hash = hash(obj)
|
|
27
|
-
# unsigned_obj_hash = obj_hash.from_bytes(
|
|
28
|
-
# obj_hash.to_bytes((obj_hash.bit_length() + 7) // 8, "big", signed=True),
|
|
29
|
-
# "big",
|
|
30
|
-
# signed=False,
|
|
31
|
-
# )
|
|
32
|
-
# return hex(unsigned_obj_hash).removeprefix("0x")
|
|
33
|
-
return hashlib.sha256(repr(obj).encode()).hexdigest()
|
|
20
|
+
logger = logging.getLogger(__name__)
|
|
34
21
|
|
|
35
22
|
|
|
36
23
|
def collect_executors_by_executable(
|
|
@@ -42,17 +29,6 @@ def collect_executors_by_executable(
|
|
|
42
29
|
return executors_by_executable
|
|
43
30
|
|
|
44
31
|
|
|
45
|
-
def parallel_map(
|
|
46
|
-
f: Callable[Concatenate[T, P], ReturnT],
|
|
47
|
-
) -> Callable[Concatenate[Sequence[T], P], list[ReturnT]]:
|
|
48
|
-
@functools.wraps(f)
|
|
49
|
-
def decorator(sequence: Sequence[T], *args: P.args, **kwargs: P.kwargs) -> list[ReturnT]:
|
|
50
|
-
with concurrent.futures.ThreadPoolExecutor() as executor:
|
|
51
|
-
return list(executor.map(lambda x: f(x, *args, **kwargs), sequence))
|
|
52
|
-
|
|
53
|
-
return decorator
|
|
54
|
-
|
|
55
|
-
|
|
56
32
|
# Cursor commands to filter out from the command data stream
|
|
57
33
|
_CURSOR_ESCAPE_SEQUENCES_REGEX = re.compile(
|
|
58
34
|
rb"\x1b\[\?25[hl]" # Matches cursor show/hide commands (CSI ?25h and CSI ?25l)
|
|
@@ -86,8 +62,8 @@ def run_command(
|
|
|
86
62
|
if executable.name == "docker" and args[1] == "buildx":
|
|
87
63
|
subprocess_env |= {"DOCKER_CLI_EXPERIMENTAL": "enabled"}
|
|
88
64
|
|
|
89
|
-
|
|
90
|
-
|
|
65
|
+
logger.debug(f"env: {subprocess_env}")
|
|
66
|
+
logger.debug(f"command: {' '.join(args)}")
|
|
91
67
|
|
|
92
68
|
stdout_master, stdout_slave = pty.openpty()
|
|
93
69
|
stderr_master, stderr_slave = pty.openpty()
|
xm_slurm/scripts/cli.py
ADDED
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
import argparse
|
|
2
|
+
|
|
3
|
+
from xmanager import xm
|
|
4
|
+
|
|
5
|
+
import xm_slurm
|
|
6
|
+
from xm_slurm.console import console
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
async def logs(
    experiment_id: int,
    wid: int,
    *,
    follow: bool = True,
    num_lines: int = 10,
    block_size: int = 1024,
):
    """Print the logs of one work unit of an experiment to the console.

    Args:
        experiment_id: ID passed to ``xm_slurm.get_experiment``.
        wid: Key used to select the work unit from the experiment's work units.
        follow: Forwarded to the work unit's ``logs`` call.
        num_lines: Forwarded to the work unit's ``logs`` call.
        block_size: Forwarded to the work unit's ``logs`` call.
    """
    experiment = xm_slurm.get_experiment(experiment_id)
    work_unit = experiment.work_units()[wid]
    log_stream = work_unit.logs(
        num_lines=num_lines, block_size=block_size, wait=True, follow=follow
    )
    async for entry in log_stream:
        console.print(entry, end="\n")
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
@xm.run_in_asyncio_loop
|
|
23
|
+
async def main():
|
|
24
|
+
parser = argparse.ArgumentParser(description="XManager.")
|
|
25
|
+
subparsers = parser.add_subparsers(dest="subcommand", required=True)
|
|
26
|
+
|
|
27
|
+
logs_parser = subparsers.add_parser("logs", help="Display logs for a specific experiment.")
|
|
28
|
+
logs_parser.add_argument("xid", type=int, help="Experiment ID.")
|
|
29
|
+
logs_parser.add_argument("wid", type=int, help="Work Unit ID.")
|
|
30
|
+
logs_parser.add_argument(
|
|
31
|
+
"-n",
|
|
32
|
+
"--n-lines",
|
|
33
|
+
type=int,
|
|
34
|
+
default=50,
|
|
35
|
+
help="Number of lines to display from the end of the log file.",
|
|
36
|
+
)
|
|
37
|
+
logs_parser.add_argument(
|
|
38
|
+
"-f",
|
|
39
|
+
"--follow",
|
|
40
|
+
default=True,
|
|
41
|
+
action="store_true",
|
|
42
|
+
help="Follow the log file as it is updated.",
|
|
43
|
+
)
|
|
44
|
+
|
|
45
|
+
args = parser.parse_args()
|
|
46
|
+
match args.subcommand:
|
|
47
|
+
case "logs":
|
|
48
|
+
await logs(args.xid, args.wid, follow=args.follow, num_lines=args.n_lines)
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
if __name__ == "__main__":
|
|
52
|
+
main() # type: ignore
|
|
@@ -24,6 +24,11 @@ __xm_slurm_wait_for_children() {
|
|
|
24
24
|
echo "INFO: Received requeue exit code {{ requeue_exit_code }} from job ${job}. Requeing Slurm job ${JOB_ID} after ${SLURM_RESTART_COUNT-0} restarts." >&2
|
|
25
25
|
scontrol requeue "${JOB_ID}"
|
|
26
26
|
exit {{ requeue_exit_code }}
|
|
27
|
+
elif [ "${JOB_EXIT_CODE}" -ne 0 ]; then
|
|
28
|
+
echo "ERROR: Job ${job} exited with code ${JOB_EXIT_CODE}." >&2
|
|
29
|
+
exit "${JOB_EXIT_CODE}"
|
|
30
|
+
else
|
|
31
|
+
echo "INFO: Job ${job} exited successfully." >&2
|
|
27
32
|
fi
|
|
28
33
|
done
|
|
29
34
|
}
|
|
@@ -2,8 +2,7 @@
|
|
|
2
2
|
{% block directives %}
|
|
3
3
|
#SBATCH --open-mode=append
|
|
4
4
|
#SBATCH --export=NONE
|
|
5
|
-
#SBATCH --output=
|
|
6
|
-
#SBATCH --error=xm-%j.stderr
|
|
5
|
+
#SBATCH --output=slurm-%j.out
|
|
7
6
|
#SBATCH --comment="{'xid': {{ experiment_id }}}"
|
|
8
7
|
{% if cluster.account and not job.executor.account %}
|
|
9
8
|
#SBATCH --account={{ cluster.account }}
|
|
@@ -16,8 +15,10 @@
|
|
|
16
15
|
{% endif %}
|
|
17
16
|
{% if identity %}
|
|
18
17
|
#SBATCH --job-name=xm[{{ experiment_id }}.{{ identity }}]
|
|
19
|
-
#SBATCH --dependency=singleton
|
|
20
18
|
{% else %}
|
|
19
|
+
{% if dependency %}
|
|
20
|
+
#SBATCH {{ dependency.to_directive() }}
|
|
21
|
+
{% endif %}
|
|
21
22
|
#SBATCH --job-name=xm[{{ experiment_id }}]
|
|
22
23
|
{% endif %}
|
|
23
24
|
{% for directive in job.executor.to_directives() %}
|
xm_slurm/types.py
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
import typing as tp
|
|
2
|
+
|
|
3
|
+
InstanceT_contra = tp.TypeVar("InstanceT_contra", contravariant=True)
|
|
4
|
+
GetterT_co = tp.TypeVar("GetterT_co", covariant=True)
|
|
5
|
+
SetterT_co = tp.TypeVar("SetterT_co", contravariant=True)
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class Descriptor(tp.Protocol[GetterT_co, SetterT_co]):
|
|
9
|
+
def __set_name__(self, owner: tp.Type[tp.Any], name: str) -> None: ...
|
|
10
|
+
|
|
11
|
+
@tp.overload
|
|
12
|
+
def __get__(
|
|
13
|
+
self, instance: InstanceT_contra, owner: tp.Type[InstanceT_contra] | None = None
|
|
14
|
+
) -> GetterT_co: ...
|
|
15
|
+
|
|
16
|
+
@tp.overload
|
|
17
|
+
def __get__(self, instance: None, owner: tp.Type[InstanceT_contra]) -> GetterT_co: ...
|
|
18
|
+
|
|
19
|
+
def __get__(
|
|
20
|
+
self, instance: InstanceT_contra | None, owner: tp.Type[InstanceT_contra] | None = None
|
|
21
|
+
) -> GetterT_co: ...
|
|
22
|
+
|
|
23
|
+
def __set__(self, instance: tp.Any, value: SetterT_co) -> None: ...
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
xm_slurm/__init__.py,sha256=Ld2w7ofLlTieWOHP_Jb3f48-qtVQBjFXynxUm9WF8mc,1116
|
|
2
|
+
xm_slurm/api.py,sha256=LeGgHz82t8Oay0Z1Ourv9-r-DBur3lhCUTnmmGhGFY4,18502
|
|
3
|
+
xm_slurm/batching.py,sha256=GbKBsNz9w8gIc2fHLZpslC0e4K9YUfLXFHmjduRRCfQ,4385
|
|
4
|
+
xm_slurm/config.py,sha256=GLLEkRLJxQW0urmHCLmwq_4ECmimEBQFl8Nz62SIo78,6787
|
|
5
|
+
xm_slurm/console.py,sha256=UpMqeJ0C8i0pkue1AHnnyyX0bFJ9zZeJ7HBR6yhuA8A,54
|
|
6
|
+
xm_slurm/constants.py,sha256=zefVtlFdflgSolie5g_rVxWV-Zpydxapchm3y0a2FDc,999
|
|
7
|
+
xm_slurm/dependencies.py,sha256=-5gN_tpfs3dOA7H5_MIHO2ratb7F5Pm_yjkR5rZcgI8,6421
|
|
8
|
+
xm_slurm/executables.py,sha256=S3z8jSDL6AdyGYpzy_cCs03Mj0vgA4ZTqIe8APYor3E,6469
|
|
9
|
+
xm_slurm/execution.py,sha256=i2oYH5RS-mHsHPwFDFZvo5qCudbgqBML-Hzq6DPNItw,25721
|
|
10
|
+
xm_slurm/executors.py,sha256=fMtxGUCi4vEKmb_p4JEpqPUTh7L_f1LcR_TamMLAWNg,4667
|
|
11
|
+
xm_slurm/experiment.py,sha256=trHapcYxPNKofzSqu7KZawML59tZ8FVjoEZYe2Wal7w,44521
|
|
12
|
+
xm_slurm/job_blocks.py,sha256=_F8CKCs5BQFj40a2-mjG71HfacvWoBXBDPDKEaKTbXc,616
|
|
13
|
+
xm_slurm/packageables.py,sha256=YZFTL6UWx9A_zyztTy1THUlj3pW1rA0cBPHJxD1LOJk,12884
|
|
14
|
+
xm_slurm/resources.py,sha256=EaYDATVudrEDPKKdSZoWgfqPiidc6DMjIctmzLQmiH0,5683
|
|
15
|
+
xm_slurm/status.py,sha256=WTWiDHi-ZHtwHRnDP0cGa-27zTSm6LkA-GCKsN-zBgg,6916
|
|
16
|
+
xm_slurm/types.py,sha256=TsVykDm-LazVkrjeJrTwCMs4Q8APKhy7BTk0yKIhFNg,805
|
|
17
|
+
xm_slurm/utils.py,sha256=ESjOkGT7bRSzIeZrUtZplSHP4oaH6VZ92y2woYdcyKM,2239
|
|
18
|
+
xm_slurm/contrib/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
19
|
+
xm_slurm/contrib/clusters/__init__.py,sha256=vugR50D9fPJQN5bTd7cSArDGrA6pC-YJHMXrEyvr_Uw,2980
|
|
20
|
+
xm_slurm/contrib/clusters/drac.py,sha256=tJeQFWFIpeZ1gD3j6AAJssNoLSiDkB-3lz1_ObnkRhc,5905
|
|
21
|
+
xm_slurm/experimental/parameter_controller.py,sha256=b5LfglHV307F6QcPrHeZX5GJBtyOK9aQydke_SZ3Wto,8457
|
|
22
|
+
xm_slurm/packaging/__init__.py,sha256=dh307yLpUT9KN7rJ1e9fYC6hegGKfZcGboUq9nGpDVQ,233
|
|
23
|
+
xm_slurm/packaging/docker.py,sha256=TA8-QG09EdhF4K1ixrEboVFarF9LcURNHhzUXL-7Iqg,11518
|
|
24
|
+
xm_slurm/packaging/registry.py,sha256=GrdmQg9MgSo38OiqOzMKWSkQyBuyryOfc3zcdgZ4CUE,1148
|
|
25
|
+
xm_slurm/packaging/router.py,sha256=yPbdA9clrhly97cLgDsSRZG2LZRKE-oz8Hhdb7WtYqk,2070
|
|
26
|
+
xm_slurm/packaging/utils.py,sha256=KI5s32rNTCfgwzY_7Ghck27jHKvKg5sl5_NEEqJbJqI,3999
|
|
27
|
+
xm_slurm/scripts/_cloudpickle.py,sha256=dlJYf2SceOuUn8wi-ozuoYAQg71wqD2MUVOUCyOwWIY,647
|
|
28
|
+
xm_slurm/scripts/cli.py,sha256=ZXqYOs8X23TYDdKxvV-wIa-0mTfpxSl4_Pli6TiKI7s,1435
|
|
29
|
+
xm_slurm/templates/docker/docker-bake.hcl.j2,sha256=ClsFpj91Mr1VfA8L6eqBG3HQz0Z8VenF6mEfmAhQgUo,1498
|
|
30
|
+
xm_slurm/templates/docker/mamba.Dockerfile,sha256=Sgxr5IA5T-pT1Shumb5k3JngoG4pgCdBXjzqslFJdZI,753
|
|
31
|
+
xm_slurm/templates/docker/python.Dockerfile,sha256=U4b4QVkopckQ0o9jJIE7d_M6TvExEYlYDirNwCoZ7W4,865
|
|
32
|
+
xm_slurm/templates/docker/uv.Dockerfile,sha256=kYD32oUS1jUaARsNV1o6EFnIfLCNh5GMmck27b-5NRU,969
|
|
33
|
+
xm_slurm/templates/slurm/job-array.bash.j2,sha256=iYtGMRDXgwwc2_8E3v4a30f3fKuq4zWgZHkxCXJ9iXc,567
|
|
34
|
+
xm_slurm/templates/slurm/job-group.bash.j2,sha256=UkjfBE7jg9mepcUWaHZEAjkiXsIM1j_sLxLzxkteD-Y,1120
|
|
35
|
+
xm_slurm/templates/slurm/job.bash.j2,sha256=v0xGYzagDdWW6Tg44qobGJLNSUP1Cf4CcekrPibYdrE,1864
|
|
36
|
+
xm_slurm/templates/slurm/fragments/monitor.bash.j2,sha256=HYqYhXsTv8TCed5UaGCZVGIYsqxSKHcnPyNNTHWNvxc,1279
|
|
37
|
+
xm_slurm/templates/slurm/fragments/proxy.bash.j2,sha256=VJLglZo-Nvx9R-qe3rHTxr07CylTQ6Z9NwBzvIpAZrA,814
|
|
38
|
+
xm_slurm/templates/slurm/runtimes/apptainer.bash.j2,sha256=dMntzelhs8DqKyIpO9S6wzMfH2PDevmgvyjCW8Xc2dY,3222
|
|
39
|
+
xm_slurm/templates/slurm/runtimes/podman.bash.j2,sha256=xKXYFvQvazMx0PgvmlRXR6eecoiBUl8y52dIzQtWkBE,1469
|
|
40
|
+
xmanager_slurm-0.4.1.dist-info/METADATA,sha256=3mT4XIm8evv-5qw7oney4nYn3IasIA_l1rWz86XNOY8,954
|
|
41
|
+
xmanager_slurm-0.4.1.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
|
|
42
|
+
xmanager_slurm-0.4.1.dist-info/entry_points.txt,sha256=_HLGmLgxuQLOPmF2gOFYDVq2HqtMVD_SzigHvUh8TCY,49
|
|
43
|
+
xmanager_slurm-0.4.1.dist-info/licenses/LICENSE.md,sha256=IxstXr3MPHwTJ5jMrByHrQsR1ZAGQ2U_uz_4qzI_15Y,11756
|
|
44
|
+
xmanager_slurm-0.4.1.dist-info/RECORD,,
|
|
@@ -1,69 +0,0 @@
|
|
|
1
|
-
import dataclasses
|
|
2
|
-
import functools
|
|
3
|
-
from typing import Sequence
|
|
4
|
-
|
|
5
|
-
from absl import flags
|
|
6
|
-
from xmanager import xm
|
|
7
|
-
|
|
8
|
-
from xm_slurm.executables import Dockerfile, DockerImage, ImageURI, RemoteImage
|
|
9
|
-
from xm_slurm.executors import SlurmSpec
|
|
10
|
-
from xm_slurm.packaging import registry
|
|
11
|
-
from xm_slurm.packaging.docker.abc import DockerClient
|
|
12
|
-
|
|
13
|
-
FLAGS = flags.FLAGS
|
|
14
|
-
DOCKER_CLIENT_PROVIDER = flags.DEFINE_enum(
|
|
15
|
-
"xm_docker_client", "docker", ["docker"], "Docker image build client."
|
|
16
|
-
)
|
|
17
|
-
|
|
18
|
-
IndexedContainer = registry.IndexedContainer
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
@functools.cache
|
|
22
|
-
def docker_client() -> DockerClient:
|
|
23
|
-
match DOCKER_CLIENT_PROVIDER.value:
|
|
24
|
-
case "docker":
|
|
25
|
-
from xm_slurm.packaging.docker.local import LocalDockerClient
|
|
26
|
-
|
|
27
|
-
return LocalDockerClient()
|
|
28
|
-
case _:
|
|
29
|
-
raise ValueError(f"Unknown build client: {DOCKER_CLIENT_PROVIDER.value}")
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
@registry.register(Dockerfile)
|
|
33
|
-
def _(
|
|
34
|
-
targets: Sequence[IndexedContainer[xm.Packageable]],
|
|
35
|
-
) -> list[IndexedContainer[RemoteImage]]:
|
|
36
|
-
return docker_client().bake(targets=targets)
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
@registry.register(DockerImage)
|
|
40
|
-
def _(
|
|
41
|
-
targets: Sequence[IndexedContainer[xm.Packageable]],
|
|
42
|
-
) -> list[IndexedContainer[RemoteImage]]:
|
|
43
|
-
"""Build Docker images, this is essentially a passthrough."""
|
|
44
|
-
images = []
|
|
45
|
-
client = docker_client()
|
|
46
|
-
for target in targets:
|
|
47
|
-
assert isinstance(target.value.executable_spec, DockerImage)
|
|
48
|
-
assert isinstance(target.value.executor_spec, SlurmSpec)
|
|
49
|
-
if target.value.executor_spec.tag is not None:
|
|
50
|
-
raise ValueError(
|
|
51
|
-
"Executable `DockerImage` should not be tagged via `SlurmSpec`. "
|
|
52
|
-
"The image URI is provided by the `DockerImage` itself."
|
|
53
|
-
)
|
|
54
|
-
|
|
55
|
-
uri = ImageURI(target.value.executable_spec.image)
|
|
56
|
-
images.append(
|
|
57
|
-
dataclasses.replace(
|
|
58
|
-
target,
|
|
59
|
-
value=RemoteImage( # type: ignore
|
|
60
|
-
image=str(uri),
|
|
61
|
-
workdir=target.value.executable_spec.workdir,
|
|
62
|
-
args=target.value.args,
|
|
63
|
-
env_vars=target.value.env_vars,
|
|
64
|
-
credentials=client.credentials(hostname=uri.domain),
|
|
65
|
-
),
|
|
66
|
-
)
|
|
67
|
-
)
|
|
68
|
-
|
|
69
|
-
return images
|