xmanager-slurm 0.4.0__py3-none-any.whl → 0.4.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of xmanager-slurm might be problematic.

xm_slurm/experimental/parameter_controller.py CHANGED
@@ -18,6 +18,8 @@ from xm_slurm.experiment import SlurmAuxiliaryUnit, SlurmExperiment
 P = ParamSpec("P")
 T = TypeVar("T")
 
+logger = logging.getLogger(__name__)
+
 
 async def _monitor_parameter_controller(
     aux_unit: SlurmAuxiliaryUnit,
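
Throughout this release, calls on the root `logging` module are replaced by a per-module `logger = logging.getLogger(__name__)`. A minimal sketch of the difference (the log message is illustrative):

    import logging

    # Root-logger call: records are attributed to the root logger, so a host
    # application cannot filter or handle them per module.
    logging.info("baking images")

    # Named logger: records carry the module's dotted path (e.g.
    # "xm_slurm.packaging.docker"), so verbosity can be raised or lowered
    # for just that subtree of the library.
    logger = logging.getLogger(__name__)
    logger.info("baking images")
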
@@ -33,16 +35,16 @@ async def _monitor_parameter_controller(
     try:
         _ = future.result()
     except asyncio.CancelledError:
-        logging.info("Local parameter controller was cancelled, resuming on remote controller.")
+        logger.info("Local parameter controller was cancelled, resuming on remote controller.")
         pass
     except Exception:
-        logging.error("Local parameter controller failed, stopping remote controller.")
+        logger.error("Local parameter controller failed, stopping remote controller.")
         aux_unit.stop(
             mark_as_failed=True, mark_as_completed=False, message="Local controller failed."
         )
         raise
     else:
-        logging.info(
+        logger.info(
             "Local parameter controller finished before remote controller started, "
             "stopping remote controller."
         )
@@ -56,33 +58,33 @@ async def _monitor_parameter_controller(
         interval=poll_interval,
     )
     async def wait_for_remote_controller() -> status.SlurmWorkUnitStatusEnum:
-        logging.info("Waiting for remote parameter controller to start.")
+        logger.info("Waiting for remote parameter controller to start.")
         if local_controller_finished.is_set():
             return status.SlurmWorkUnitStatusEnum.COMPLETED
         return (await aux_unit.get_status()).status
 
-    logging.info("Monitoring remote parameter controller.")
+    logger.info("Monitoring remote parameter controller.")
     # TODO(jfarebro): make get_status() more resiliant to errors when initially scheduling.
     # We run into issues if we call get_status() too quickly when Slurm hasn't ingested the job.
     await asyncio.sleep(15)
     match await wait_for_remote_controller():
         case status.SlurmWorkUnitStatusEnum.RUNNING:
-            logging.info("Remote parameter controller started.")
+            logger.info("Remote parameter controller started.")
             local_parameter_controller.cancel("Remote parameter controller started.")
         case status.SlurmWorkUnitStatusEnum.COMPLETED:
             if local_parameter_controller.done():
-                logging.info("Local parameter controller finished, stopping remote controller.")
+                logger.info("Local parameter controller finished, stopping remote controller.")
                 aux_unit.stop(
                     mark_as_completed=True, message="Local parameter controller finished."
                 )
             else:
-                logging.info("Remote parameter controller finished, stopping local controller.")
+                logger.info("Remote parameter controller finished, stopping local controller.")
                 local_parameter_controller.cancel()
         case status.SlurmWorkUnitStatusEnum.FAILED:
-            logging.error("Remote parameter controller failed, stopping local controller.")
+            logger.error("Remote parameter controller failed, stopping local controller.")
             local_parameter_controller.cancel()
         case status.SlurmWorkUnitStatusEnum.CANCELLED:
-            logging.info("Remote parameter controller was cancelled, stopping local controller.")
+            logger.info("Remote parameter controller was cancelled, stopping local controller.")
            local_parameter_controller.cancel()
         case status.SlurmWorkUnitStatusEnum.PENDING:
             raise RuntimeError("Remote parameter controller is still pending, invalid state.")
xm_slurm/packaging/docker.py CHANGED
@@ -1,6 +1,9 @@
 import base64
+import collections.abc
 import dataclasses
 import enum
+import functools
+import hashlib
 import json
 import logging
 import os
@@ -9,29 +12,31 @@ import shlex
 import shutil
 import subprocess
 import tempfile
-from typing import Sequence
+from typing import Hashable, Literal, Mapping, Sequence
 
+import jinja2 as j2
 from xmanager import xm
 
 from xm_slurm.executables import (
     Dockerfile,
+    DockerImage,
     ImageURI,
     RemoteImage,
     RemoteRepositoryCredentials,
 )
 from xm_slurm.executors import SlurmSpec
+from xm_slurm.packaging import registry
 from xm_slurm.packaging import utils as packaging_utils
-from xm_slurm.packaging.docker.abc import (
-    DockerBakeCommand,
-    DockerClient,
-    DockerVersionCommand,
-)
 from xm_slurm.packaging.registry import IndexedContainer
 
+logger = logging.getLogger(__name__)
+
+
+def _hash_digest(obj: Hashable) -> str:
+    return hashlib.sha256(repr(obj).encode()).hexdigest()
 
-class LocalDockerClient(DockerClient):
-    """Build Docker images locally."""
 
+class DockerClient:
     class Builder(enum.Enum):
         BUILDKIT = enum.auto()
         BUILDAH = enum.auto()
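
The new `_hash_digest` helper (moved here from `packaging/utils.py`) derives bake-target names from the `repr()` of an executable spec. A small sketch of the behaviour, assuming specs are frozen dataclasses; the `Spec` class below is hypothetical:

    import dataclasses
    import hashlib

    def _hash_digest(obj) -> str:
        return hashlib.sha256(repr(obj).encode()).hexdigest()

    @dataclasses.dataclass(frozen=True)
    class Spec:
        dockerfile: str
        target: str

    # Frozen dataclasses have a deterministic, field-based repr, so equal
    # specs always map to the same 64-character hex digest.
    a = _hash_digest(Spec("Dockerfile", "release"))
    b = _hash_digest(Spec("Dockerfile", "release"))
    assert a == b and len(a) == 64
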
@@ -47,12 +52,11 @@ class LocalDockerClient(DockerClient):
             raise RuntimeError("No Docker client found.")
         self._client_call = client_call
 
-        version_command = DockerVersionCommand()
         backend_version = packaging_utils.run_command(
-            xm.merge_args(self._client_call, version_command.to_args()), return_stdout=True
+            xm.merge_args(self._client_call, ["buildx", "version"]), return_stdout=True
         )
         if backend_version.stdout.startswith("github.com/docker/buildx"):
-            self._builder = LocalDockerClient.Builder.BUILDKIT
+            self._builder = DockerClient.Builder.BUILDKIT
         else:
             raise NotImplementedError(f"Unsupported Docker build backend: {backend_version}")
 
@@ -115,21 +119,69 @@ class LocalDockerClient(DockerClient):
 
         # Attempt to parse credentials from the Docker or Podman configuration
         match self._builder:
-            case LocalDockerClient.Builder.BUILDKIT:
+            case DockerClient.Builder.BUILDKIT:
                 docker_config_path = (
                     pathlib.Path(os.environ.get("DOCKER_CONFIG", "~/.docker")).expanduser()
                     / "config.json"
                 )
                 return _parse_credentials_from_config(docker_config_path)
-            case LocalDockerClient.Builder.BUILDAH:
+            case DockerClient.Builder.BUILDAH:
                 podman_config_path = (
                     pathlib.Path(os.environ.get("XDG_CONFIG_HOME", "~/.config")).expanduser()
                     / "containers"
                     / "auth.json"
                 )
                 return _parse_credentials_from_config(podman_config_path)
-            case _:
-                return None
+
+    @functools.cached_property
+    def _bake_template(self) -> j2.Template:
+        template_loader = j2.PackageLoader("xm_slurm", "templates/docker")
+        template_env = j2.Environment(loader=template_loader, trim_blocks=True, lstrip_blocks=False)
+
+        return template_env.get_template("docker-bake.hcl.j2")
+
+    def _bake_args(
+        self,
+        *,
+        targets: str | Sequence[str] | None = None,
+        builder: str | None = None,
+        files: str | os.PathLike[str] | Sequence[os.PathLike[str] | str] | None = None,
+        load: bool = False,
+        cache: bool = True,
+        print: bool = False,
+        pull: bool = False,
+        push: bool = False,
+        metadata_file: str | os.PathLike[str] | None = None,
+        progress: Literal["auto", "plain", "tty"] = "auto",
+        set: Mapping[str, str] | None = None,
+    ) -> xm.SequentialArgs:
+        files = files
+        if files is None:
+            files = []
+        if not isinstance(files, collections.abc.Sequence):
+            files = [files]
+
+        targets = targets
+        if targets is None:
+            targets = []
+        elif isinstance(targets, str):
+            targets = [targets]
+        assert isinstance(targets, collections.abc.Sequence)
+
+        return xm.merge_args(
+            ["buildx", "bake"],
+            [f"--progress={progress}"],
+            [f"--builder={builder}"] if builder else [],
+            [f"--metadata-file={metadata_file}"] if metadata_file else [],
+            ["--print"] if print else [],
+            ["--push"] if push else [],
+            ["--pull"] if pull else [],
+            ["--load"] if load else [],
+            ["--no-cache"] if not cache else [],
+            [f"--file={file}" for file in files],
+            [f"--set={key}={value}" for key, value in set.items()] if set else [],
+            targets,
+        )
 
     def bake(
         self,
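
`_bake_args` replaces the old `DockerBakeCommand` wrapper with a direct mapping from keyword arguments onto `docker buildx bake` flags, in the fixed order above: progress, builder, metadata file, the boolean flags, files, `--set` overrides, and finally the target names. As a rough sketch (here `client` stands for a `DockerClient` instance; the digest and file names are made up), the call issued by `bake()`:

    args = client._bake_args(
        targets=["3fa9..."],            # bake target name: the spec's sha256 digest
        files=["docker-bake.hcl"],      # rendered HCL definition
        metadata_file="metadata.json",  # where buildx writes build results
        pull=True,
        push=True,
    )

flattens to an argument vector equivalent to:

    buildx bake --progress=auto --metadata-file=metadata.json --push --pull --file=docker-bake.hcl 3fa9...
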
@@ -150,28 +202,27 @@ class LocalDockerClient(DockerClient):
             metadata_file = pathlib.Path(tempdir) / "metadata.json"
 
             # Write HCL and bake it
+            # TODO(jfarebro): Need a better way to hash the executables
             hcl = self._bake_template.render(
                 executables=executors_by_executables,
-                hash=packaging_utils.hash_digest,
+                hash=_hash_digest,
             )
             hcl_file.write_text(hcl)
-            logging.debug(hcl)
+            logger.debug(hcl)
 
             try:
-                command = DockerBakeCommand(
-                    targets=list(
-                        set([
-                            packaging_utils.hash_digest(target.value.executable_spec)
-                            for target in targets
-                        ])
+                bake_command = xm.merge_args(
+                    self._client_call,
+                    self._bake_args(
+                        targets=list(
+                            set([_hash_digest(target.value.executable_spec) for target in targets])
+                        ),
+                        files=[hcl_file],
+                        metadata_file=metadata_file,
+                        pull=True,
+                        push=True,
                     ),
-                    files=[hcl_file],
-                    metadata_file=metadata_file,
-                    pull=True,
-                    push=True,
                 )
-
-                bake_command = xm.merge_args(self._client_call, command.to_args())
                 packaging_utils.run_command(bake_command.to_list(), tty=True, check=True)
             except Exception as ex:
                 raise RuntimeError(f"Failed to build Dockerfiles: {ex}") from ex
@@ -184,9 +235,7 @@ class LocalDockerClient(DockerClient):
             assert isinstance(target.value.executor_spec, SlurmSpec)
             assert target.value.executor_spec.tag
 
-            executable_metadata = metadata[
-                packaging_utils.hash_digest(target.value.executable_spec)
-            ]
+            executable_metadata = metadata[_hash_digest(target.value.executable_spec)]
             uri = ImageURI(target.value.executor_spec.tag).with_digest(
                 executable_metadata["containerimage.digest"]
             )
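
`docker buildx bake --metadata-file` writes a JSON document keyed by bake target name, and each target's entry records the digest of the built image under "containerimage.digest"; that is the value read back above. A sketch of the lookup, with an illustrative path and target hash:

    import json
    import pathlib

    metadata = json.loads(pathlib.Path("metadata.json").read_text())
    # Target names in the generated HCL are the sha256 spec digests, so the
    # same _hash_digest(...) value indexes the bake output:
    digest = metadata["3fa9..."]["containerimage.digest"]  # e.g. "sha256:..."
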
@@ -205,3 +254,48 @@ class LocalDockerClient(DockerClient):
             )
 
         return images
+
+
+@functools.cache
+def docker_client() -> DockerClient:
+    return DockerClient()
+
+
+@registry.register(Dockerfile)
+def _(
+    targets: Sequence[IndexedContainer[xm.Packageable]],
+) -> list[IndexedContainer[RemoteImage]]:
+    return docker_client().bake(targets=targets)
+
+
+@registry.register(DockerImage)
+def _(
+    targets: Sequence[IndexedContainer[xm.Packageable]],
+) -> list[IndexedContainer[RemoteImage]]:
+    """Build Docker images, this is essentially a passthrough."""
+    images = []
+    client = docker_client()
+    for target in targets:
+        assert isinstance(target.value.executable_spec, DockerImage)
+        assert isinstance(target.value.executor_spec, SlurmSpec)
+        if target.value.executor_spec.tag is not None:
+            raise ValueError(
+                "Executable `DockerImage` should not be tagged via `SlurmSpec`. "
+                "The image URI is provided by the `DockerImage` itself."
+            )
+
+        uri = ImageURI(target.value.executable_spec.image)
+        images.append(
+            dataclasses.replace(
+                target,
+                value=RemoteImage(  # type: ignore
+                    image=str(uri),
+                    workdir=target.value.executable_spec.workdir,
+                    args=target.value.args,
+                    env_vars=target.value.env_vars,
+                    credentials=client.credentials(hostname=uri.domain),
+                ),
+            )
+        )
+
+    return images
xm_slurm/packaging/router.py CHANGED
@@ -10,6 +10,8 @@ from xm_slurm.packaging import registry
 
 IndexedContainer = registry.IndexedContainer
 
+logger = logging.getLogger(__name__)
+
 
 def package(
     packageables: Sequence[xm.Packageable],
@@ -39,7 +41,7 @@ def package(
     # TODO(jfarebro): Could make this async as well...?
     with console.status("[magenta] :package: Packaging executables..."):
         for executable_spec_type, targets_for_type in targets_by_type.items():
-            logging.info(f"Packaging {len(targets_for_type)} {executable_spec_type!r} targets.")
+            logger.info(f"Packaging {len(targets_for_type)} {executable_spec_type!r} targets.")
             targets.extend(registry.route(executable_spec_type, targets_for_type))
 
     console.print(
xm_slurm/packaging/utils.py CHANGED
@@ -1,7 +1,4 @@
 import collections
-import concurrent.futures
-import functools
-import hashlib
 import logging
 import os
 import pathlib
@@ -10,7 +7,7 @@ import re
 import select
 import shutil
 import subprocess
-from typing import Callable, Concatenate, Hashable, ParamSpec, Sequence, TypeVar
+from typing import ParamSpec, Sequence, TypeVar
 
 from xmanager import xm
 
@@ -20,17 +17,7 @@ T = TypeVar("T")
 P = ParamSpec("P")
 ReturnT = TypeVar("ReturnT")
 
-
-def hash_digest(obj: Hashable) -> str:
-    # TODO(jfarebro): Need a better way to hash these objects
-    # obj_hash = hash(obj)
-    # unsigned_obj_hash = obj_hash.from_bytes(
-    #     obj_hash.to_bytes((obj_hash.bit_length() + 7) // 8, "big", signed=True),
-    #     "big",
-    #     signed=False,
-    # )
-    # return hex(unsigned_obj_hash).removeprefix("0x")
-    return hashlib.sha256(repr(obj).encode()).hexdigest()
+logger = logging.getLogger(__name__)
 
 
 def collect_executors_by_executable(
  def collect_executors_by_executable(
@@ -42,17 +29,6 @@ def collect_executors_by_executable(
42
29
  return executors_by_executable
43
30
 
44
31
 
45
- def parallel_map(
46
- f: Callable[Concatenate[T, P], ReturnT],
47
- ) -> Callable[Concatenate[Sequence[T], P], list[ReturnT]]:
48
- @functools.wraps(f)
49
- def decorator(sequence: Sequence[T], *args: P.args, **kwargs: P.kwargs) -> list[ReturnT]:
50
- with concurrent.futures.ThreadPoolExecutor() as executor:
51
- return list(executor.map(lambda x: f(x, *args, **kwargs), sequence))
52
-
53
- return decorator
54
-
55
-
56
32
  # Cursor commands to filter out from the command data stream
57
33
  _CURSOR_ESCAPE_SEQUENCES_REGEX = re.compile(
58
34
  rb"\x1b\[\?25[hl]" # Matches cursor show/hide commands (CSI ?25h and CSI ?25l)
@@ -86,8 +62,8 @@ def run_command(
     if executable.name == "docker" and args[1] == "buildx":
         subprocess_env |= {"DOCKER_CLI_EXPERIMENTAL": "enabled"}
 
-    logging.debug(f"env: {subprocess_env}")
-    logging.debug(f"command: {' '.join(args)}")
+    logger.debug(f"env: {subprocess_env}")
+    logger.debug(f"command: {' '.join(args)}")
 
     stdout_master, stdout_slave = pty.openpty()
     stderr_master, stderr_slave = pty.openpty()
xm_slurm/scripts/cli.py ADDED
@@ -0,0 +1,52 @@
+import argparse
+
+from xmanager import xm
+
+import xm_slurm
+from xm_slurm.console import console
+
+
+async def logs(
+    experiment_id: int,
+    wid: int,
+    *,
+    follow: bool = True,
+    num_lines: int = 10,
+    block_size: int = 1024,
+):
+    wu = xm_slurm.get_experiment(experiment_id).work_units()[wid]
+    async for log in wu.logs(num_lines=num_lines, block_size=block_size, wait=True, follow=follow):
+        console.print(log, end="\n")
+
+
+@xm.run_in_asyncio_loop
+async def main():
+    parser = argparse.ArgumentParser(description="XManager.")
+    subparsers = parser.add_subparsers(dest="subcommand", required=True)
+
+    logs_parser = subparsers.add_parser("logs", help="Display logs for a specific experiment.")
+    logs_parser.add_argument("xid", type=int, help="Experiment ID.")
+    logs_parser.add_argument("wid", type=int, help="Work Unit ID.")
+    logs_parser.add_argument(
+        "-n",
+        "--n-lines",
+        type=int,
+        default=50,
+        help="Number of lines to display from the end of the log file.",
+    )
+    logs_parser.add_argument(
+        "-f",
+        "--follow",
+        default=True,
+        action="store_true",
+        help="Follow the log file as it is updated.",
+    )
+
+    args = parser.parse_args()
+    match args.subcommand:
+        case "logs":
+            await logs(args.xid, args.wid, follow=args.follow, num_lines=args.n_lines)
+
+
+if __name__ == "__main__":
+    main()  # type: ignore
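
Together with the new entry_points.txt further down, this module is exposed as the `xm` console script, so the subcommand can be invoked directly. A hypothetical session:

    xm logs 42 0 -n 100   # tail the last 100 lines of work unit 0 in experiment 42
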
xm_slurm/templates/docker/mamba.Dockerfile CHANGED
@@ -1,7 +1,7 @@
 # syntax=docker/dockerfile:1.4
 ARG BASE_IMAGE=gcr.io/distroless/base-debian10
 
-FROM docker.io/mambaorg/micromamba:jammy as mamba
+FROM docker.io/mambaorg/micromamba:bookworm-slim as mamba
 ARG CONDA_ENVIRONMENT=environment.yml
 
 USER root
xm_slurm/templates/slurm/fragments/monitor.bash.j2 CHANGED
@@ -24,6 +24,11 @@ __xm_slurm_wait_for_children() {
             echo "INFO: Received requeue exit code {{ requeue_exit_code }} from job ${job}. Requeing Slurm job ${JOB_ID} after ${SLURM_RESTART_COUNT-0} restarts." >&2
             scontrol requeue "${JOB_ID}"
             exit {{ requeue_exit_code }}
+        elif [ "${JOB_EXIT_CODE}" -ne 0 ]; then
+            echo "ERROR: Job ${job} exited with code ${JOB_EXIT_CODE}." >&2
+            exit "${JOB_EXIT_CODE}"
+        else
+            echo "INFO: Job ${job} exited successfully." >&2
         fi
     done
 }
xm_slurm/templates/slurm/job-array.bash.j2 CHANGED
@@ -2,8 +2,7 @@
 {% block directives %}
 {{ super() -}}
 #SBATCH --array=0-{{ args | length - 1 }}
-#SBATCH --output=xm-%j-%a.stdout
-#SBATCH --error=xm-%j-%a.stderr
+#SBATCH --output=slurm-%A_%a.out
 {% endblock directives %}
 
 {% block bootstrap %}
xm_slurm/templates/slurm/job.bash.j2 CHANGED
@@ -2,8 +2,7 @@
 {% block directives %}
 #SBATCH --open-mode=append
 #SBATCH --export=NONE
-#SBATCH --output=xm-%j.stdout
-#SBATCH --error=xm-%j.stderr
+#SBATCH --output=slurm-%j.out
 #SBATCH --comment="{'xid': {{ experiment_id }}}"
 {% if cluster.account and not job.executor.account %}
 #SBATCH --account={{ cluster.account }}
@@ -16,8 +15,10 @@
 {% endif %}
 {% if identity %}
 #SBATCH --job-name=xm[{{ experiment_id }}.{{ identity }}]
-#SBATCH --dependency=singleton
 {% else %}
+{% if dependency %}
+#SBATCH {{ dependency.to_directive() }}
+{% endif %}
 #SBATCH --job-name=xm[{{ experiment_id }}]
 {% endif %}
 {% for directive in job.executor.to_directives() %}
xm_slurm/types.py ADDED
@@ -0,0 +1,23 @@
+import typing as tp
+
+InstanceT_contra = tp.TypeVar("InstanceT_contra", contravariant=True)
+GetterT_co = tp.TypeVar("GetterT_co", covariant=True)
+SetterT_co = tp.TypeVar("SetterT_co", contravariant=True)
+
+
+class Descriptor(tp.Protocol[GetterT_co, SetterT_co]):
+    def __set_name__(self, owner: tp.Type[tp.Any], name: str) -> None: ...
+
+    @tp.overload
+    def __get__(
+        self, instance: InstanceT_contra, owner: tp.Type[InstanceT_contra] | None = None
+    ) -> GetterT_co: ...
+
+    @tp.overload
+    def __get__(self, instance: None, owner: tp.Type[InstanceT_contra]) -> GetterT_co: ...
+
+    def __get__(
+        self, instance: InstanceT_contra | None, owner: tp.Type[InstanceT_contra] | None = None
+    ) -> GetterT_co: ...
+
+    def __set__(self, instance: tp.Any, value: SetterT_co) -> None: ...
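
`Descriptor` is a structural `typing.Protocol`: any class that defines compatible `__set_name__`, `__get__`, and `__set__` methods satisfies it without inheriting from it. A minimal sketch, with illustrative `Positive` and `Config` classes that are not part of the package:

    import typing as tp

    # A data descriptor that structurally satisfies Descriptor[int, int]:
    # it defines __set_name__, __get__, and __set__ with matching shapes.
    class Positive:
        def __set_name__(self, owner: tp.Type[tp.Any], name: str) -> None:
            self._name = "_" + name

        def __get__(self, instance: tp.Any, owner: tp.Type[tp.Any] | None = None) -> int:
            return getattr(instance, self._name, 0)

        def __set__(self, instance: tp.Any, value: int) -> None:
            if value <= 0:
                raise ValueError("expected a positive value")
            setattr(instance, self._name, value)

    class Config:
        retries = Positive()

    config = Config()
    config.retries = 3      # routed through Positive.__set__
    print(config.retries)   # routed through Positive.__get__ -> 3
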
xmanager_slurm-0.4.1.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: xmanager-slurm
-Version: 0.4.0
+Version: 0.4.1
 Summary: Slurm backend for XManager.
 Project-URL: GitHub, https://github.com/jessefarebro/xm-slurm
 Author-email: Jesse Farebrother <jfarebro@cs.mcgill.ca>
xmanager_slurm-0.4.1.dist-info/RECORD ADDED
@@ -0,0 +1,44 @@
+xm_slurm/__init__.py,sha256=Ld2w7ofLlTieWOHP_Jb3f48-qtVQBjFXynxUm9WF8mc,1116
+xm_slurm/api.py,sha256=LeGgHz82t8Oay0Z1Ourv9-r-DBur3lhCUTnmmGhGFY4,18502
+xm_slurm/batching.py,sha256=GbKBsNz9w8gIc2fHLZpslC0e4K9YUfLXFHmjduRRCfQ,4385
+xm_slurm/config.py,sha256=GLLEkRLJxQW0urmHCLmwq_4ECmimEBQFl8Nz62SIo78,6787
+xm_slurm/console.py,sha256=UpMqeJ0C8i0pkue1AHnnyyX0bFJ9zZeJ7HBR6yhuA8A,54
+xm_slurm/constants.py,sha256=zefVtlFdflgSolie5g_rVxWV-Zpydxapchm3y0a2FDc,999
+xm_slurm/dependencies.py,sha256=-5gN_tpfs3dOA7H5_MIHO2ratb7F5Pm_yjkR5rZcgI8,6421
+xm_slurm/executables.py,sha256=S3z8jSDL6AdyGYpzy_cCs03Mj0vgA4ZTqIe8APYor3E,6469
+xm_slurm/execution.py,sha256=i2oYH5RS-mHsHPwFDFZvo5qCudbgqBML-Hzq6DPNItw,25721
+xm_slurm/executors.py,sha256=fMtxGUCi4vEKmb_p4JEpqPUTh7L_f1LcR_TamMLAWNg,4667
+xm_slurm/experiment.py,sha256=trHapcYxPNKofzSqu7KZawML59tZ8FVjoEZYe2Wal7w,44521
+xm_slurm/job_blocks.py,sha256=_F8CKCs5BQFj40a2-mjG71HfacvWoBXBDPDKEaKTbXc,616
+xm_slurm/packageables.py,sha256=YZFTL6UWx9A_zyztTy1THUlj3pW1rA0cBPHJxD1LOJk,12884
+xm_slurm/resources.py,sha256=EaYDATVudrEDPKKdSZoWgfqPiidc6DMjIctmzLQmiH0,5683
+xm_slurm/status.py,sha256=WTWiDHi-ZHtwHRnDP0cGa-27zTSm6LkA-GCKsN-zBgg,6916
+xm_slurm/types.py,sha256=TsVykDm-LazVkrjeJrTwCMs4Q8APKhy7BTk0yKIhFNg,805
+xm_slurm/utils.py,sha256=ESjOkGT7bRSzIeZrUtZplSHP4oaH6VZ92y2woYdcyKM,2239
+xm_slurm/contrib/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+xm_slurm/contrib/clusters/__init__.py,sha256=vugR50D9fPJQN5bTd7cSArDGrA6pC-YJHMXrEyvr_Uw,2980
+xm_slurm/contrib/clusters/drac.py,sha256=tJeQFWFIpeZ1gD3j6AAJssNoLSiDkB-3lz1_ObnkRhc,5905
+xm_slurm/experimental/parameter_controller.py,sha256=b5LfglHV307F6QcPrHeZX5GJBtyOK9aQydke_SZ3Wto,8457
+xm_slurm/packaging/__init__.py,sha256=dh307yLpUT9KN7rJ1e9fYC6hegGKfZcGboUq9nGpDVQ,233
+xm_slurm/packaging/docker.py,sha256=TA8-QG09EdhF4K1ixrEboVFarF9LcURNHhzUXL-7Iqg,11518
+xm_slurm/packaging/registry.py,sha256=GrdmQg9MgSo38OiqOzMKWSkQyBuyryOfc3zcdgZ4CUE,1148
+xm_slurm/packaging/router.py,sha256=yPbdA9clrhly97cLgDsSRZG2LZRKE-oz8Hhdb7WtYqk,2070
+xm_slurm/packaging/utils.py,sha256=KI5s32rNTCfgwzY_7Ghck27jHKvKg5sl5_NEEqJbJqI,3999
+xm_slurm/scripts/_cloudpickle.py,sha256=dlJYf2SceOuUn8wi-ozuoYAQg71wqD2MUVOUCyOwWIY,647
+xm_slurm/scripts/cli.py,sha256=ZXqYOs8X23TYDdKxvV-wIa-0mTfpxSl4_Pli6TiKI7s,1435
+xm_slurm/templates/docker/docker-bake.hcl.j2,sha256=ClsFpj91Mr1VfA8L6eqBG3HQz0Z8VenF6mEfmAhQgUo,1498
+xm_slurm/templates/docker/mamba.Dockerfile,sha256=Sgxr5IA5T-pT1Shumb5k3JngoG4pgCdBXjzqslFJdZI,753
+xm_slurm/templates/docker/python.Dockerfile,sha256=U4b4QVkopckQ0o9jJIE7d_M6TvExEYlYDirNwCoZ7W4,865
+xm_slurm/templates/docker/uv.Dockerfile,sha256=kYD32oUS1jUaARsNV1o6EFnIfLCNh5GMmck27b-5NRU,969
+xm_slurm/templates/slurm/job-array.bash.j2,sha256=iYtGMRDXgwwc2_8E3v4a30f3fKuq4zWgZHkxCXJ9iXc,567
+xm_slurm/templates/slurm/job-group.bash.j2,sha256=UkjfBE7jg9mepcUWaHZEAjkiXsIM1j_sLxLzxkteD-Y,1120
+xm_slurm/templates/slurm/job.bash.j2,sha256=v0xGYzagDdWW6Tg44qobGJLNSUP1Cf4CcekrPibYdrE,1864
+xm_slurm/templates/slurm/fragments/monitor.bash.j2,sha256=HYqYhXsTv8TCed5UaGCZVGIYsqxSKHcnPyNNTHWNvxc,1279
+xm_slurm/templates/slurm/fragments/proxy.bash.j2,sha256=VJLglZo-Nvx9R-qe3rHTxr07CylTQ6Z9NwBzvIpAZrA,814
+xm_slurm/templates/slurm/runtimes/apptainer.bash.j2,sha256=dMntzelhs8DqKyIpO9S6wzMfH2PDevmgvyjCW8Xc2dY,3222
+xm_slurm/templates/slurm/runtimes/podman.bash.j2,sha256=xKXYFvQvazMx0PgvmlRXR6eecoiBUl8y52dIzQtWkBE,1469
+xmanager_slurm-0.4.1.dist-info/METADATA,sha256=3mT4XIm8evv-5qw7oney4nYn3IasIA_l1rWz86XNOY8,954
+xmanager_slurm-0.4.1.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
+xmanager_slurm-0.4.1.dist-info/entry_points.txt,sha256=_HLGmLgxuQLOPmF2gOFYDVq2HqtMVD_SzigHvUh8TCY,49
+xmanager_slurm-0.4.1.dist-info/licenses/LICENSE.md,sha256=IxstXr3MPHwTJ5jMrByHrQsR1ZAGQ2U_uz_4qzI_15Y,11756
+xmanager_slurm-0.4.1.dist-info/RECORD,,
xmanager_slurm-0.4.1.dist-info/entry_points.txt ADDED
@@ -0,0 +1,2 @@
+[console_scripts]
+xm = xm_slurm.scripts.cli:main
@@ -1,69 +0,0 @@
-import dataclasses
-import functools
-from typing import Sequence
-
-from absl import flags
-from xmanager import xm
-
-from xm_slurm.executables import Dockerfile, DockerImage, ImageURI, RemoteImage
-from xm_slurm.executors import SlurmSpec
-from xm_slurm.packaging import registry
-from xm_slurm.packaging.docker.abc import DockerClient
-
-FLAGS = flags.FLAGS
-DOCKER_CLIENT_PROVIDER = flags.DEFINE_enum(
-    "xm_docker_client", "docker", ["docker"], "Docker image build client."
-)
-
-IndexedContainer = registry.IndexedContainer
-
-
-@functools.cache
-def docker_client() -> DockerClient:
-    match DOCKER_CLIENT_PROVIDER.value:
-        case "docker":
-            from xm_slurm.packaging.docker.local import LocalDockerClient
-
-            return LocalDockerClient()
-        case _:
-            raise ValueError(f"Unknown build client: {DOCKER_CLIENT_PROVIDER.value}")
-
-
-@registry.register(Dockerfile)
-def _(
-    targets: Sequence[IndexedContainer[xm.Packageable]],
-) -> list[IndexedContainer[RemoteImage]]:
-    return docker_client().bake(targets=targets)
-
-
-@registry.register(DockerImage)
-def _(
-    targets: Sequence[IndexedContainer[xm.Packageable]],
-) -> list[IndexedContainer[RemoteImage]]:
-    """Build Docker images, this is essentially a passthrough."""
-    images = []
-    client = docker_client()
-    for target in targets:
-        assert isinstance(target.value.executable_spec, DockerImage)
-        assert isinstance(target.value.executor_spec, SlurmSpec)
-        if target.value.executor_spec.tag is not None:
-            raise ValueError(
-                "Executable `DockerImage` should not be tagged via `SlurmSpec`. "
-                "The image URI is provided by the `DockerImage` itself."
-            )
-
-        uri = ImageURI(target.value.executable_spec.image)
-        images.append(
-            dataclasses.replace(
-                target,
-                value=RemoteImage(  # type: ignore
-                    image=str(uri),
-                    workdir=target.value.executable_spec.workdir,
-                    args=target.value.args,
-                    env_vars=target.value.env_vars,
-                    credentials=client.credentials(hostname=uri.domain),
-                ),
-            )
-        )
-
-    return images