xmanager-slurm 0.4.19__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52)
  1. xm_slurm/__init__.py +47 -0
  2. xm_slurm/api/__init__.py +33 -0
  3. xm_slurm/api/abc.py +65 -0
  4. xm_slurm/api/models.py +70 -0
  5. xm_slurm/api/sqlite/client.py +358 -0
  6. xm_slurm/api/web/client.py +173 -0
  7. xm_slurm/batching.py +139 -0
  8. xm_slurm/config.py +189 -0
  9. xm_slurm/console.py +3 -0
  10. xm_slurm/constants.py +19 -0
  11. xm_slurm/contrib/__init__.py +0 -0
  12. xm_slurm/contrib/clusters/__init__.py +67 -0
  13. xm_slurm/contrib/clusters/drac.py +242 -0
  14. xm_slurm/dependencies.py +171 -0
  15. xm_slurm/executables.py +215 -0
  16. xm_slurm/execution.py +995 -0
  17. xm_slurm/executors.py +210 -0
  18. xm_slurm/experiment.py +1016 -0
  19. xm_slurm/experimental/parameter_controller.py +206 -0
  20. xm_slurm/filesystems.py +129 -0
  21. xm_slurm/job_blocks.py +21 -0
  22. xm_slurm/metadata_context.py +253 -0
  23. xm_slurm/packageables.py +309 -0
  24. xm_slurm/packaging/__init__.py +8 -0
  25. xm_slurm/packaging/docker.py +348 -0
  26. xm_slurm/packaging/registry.py +45 -0
  27. xm_slurm/packaging/router.py +56 -0
  28. xm_slurm/packaging/utils.py +22 -0
  29. xm_slurm/resources.py +350 -0
  30. xm_slurm/scripts/_cloudpickle.py +28 -0
  31. xm_slurm/scripts/cli.py +90 -0
  32. xm_slurm/status.py +197 -0
  33. xm_slurm/templates/docker/docker-bake.hcl.j2 +54 -0
  34. xm_slurm/templates/docker/mamba.Dockerfile +29 -0
  35. xm_slurm/templates/docker/python.Dockerfile +32 -0
  36. xm_slurm/templates/docker/uv.Dockerfile +38 -0
  37. xm_slurm/templates/slurm/entrypoint.bash.j2 +27 -0
  38. xm_slurm/templates/slurm/fragments/monitor.bash.j2 +78 -0
  39. xm_slurm/templates/slurm/fragments/proxy.bash.j2 +31 -0
  40. xm_slurm/templates/slurm/job-array.bash.j2 +31 -0
  41. xm_slurm/templates/slurm/job-group.bash.j2 +47 -0
  42. xm_slurm/templates/slurm/job.bash.j2 +90 -0
  43. xm_slurm/templates/slurm/library/retry.bash +62 -0
  44. xm_slurm/templates/slurm/runtimes/apptainer.bash.j2 +73 -0
  45. xm_slurm/templates/slurm/runtimes/podman.bash.j2 +43 -0
  46. xm_slurm/types.py +23 -0
  47. xm_slurm/utils.py +196 -0
  48. xmanager_slurm-0.4.19.dist-info/METADATA +28 -0
  49. xmanager_slurm-0.4.19.dist-info/RECORD +52 -0
  50. xmanager_slurm-0.4.19.dist-info/WHEEL +4 -0
  51. xmanager_slurm-0.4.19.dist-info/entry_points.txt +2 -0
  52. xmanager_slurm-0.4.19.dist-info/licenses/LICENSE.md +227 -0
xm_slurm/executors.py ADDED
@@ -0,0 +1,210 @@
+ import collections.abc
+ import dataclasses
+ import datetime as dt
+ import signal
+ import typing as tp
+
+ from xmanager import xm
+
+ from xm_slurm import resources, utils
+
+ ResourceBindType = tp.Literal[
+     resources.ResourceType.GPU,
+     resources.ResourceType.MEMORY,
+     resources.ResourceType.RAM,
+ ]
+
+
+ @dataclasses.dataclass(frozen=True, kw_only=True)
+ class SlurmSpec(xm.ExecutorSpec):
+     """Slurm executor specification that describes the location of the container runtime.
+
+     Args:
+         tag: The image URI to push and pull the container image from.
+             For example, using the GitHub Container Registry: `ghcr.io/my-project/my-image:latest`.
+     """
+
+     tag: str | None = None
+
+
+ @dataclasses.dataclass(frozen=True, kw_only=True)
+ class Slurm(xm.Executor):
+     """Slurm executor describing the runtime environment.
+
+     Args:
+         requirements: The requirements for the job.
+         time: The maximum time to run the job.
+         bind: How to bind tasks to resources (memory, GPU, or a generic resource).
+         bind_flag: Generic resource task binding options.
+         account: The account to charge the job to.
+         partition: The partition to run the job in.
+         qos: The quality of service to run the job with.
+         priority: The priority of the job.
+         reservation: Allocate resources for the job from the named reservation.
+         exclusive: Do not share allocated nodes with other running jobs.
+         oversubscribe: Allow over-subscribing resources with other running jobs.
+         overcommit: Overcommit resources, allocating as if only one task per node was requested.
+         nice: Run the job with an adjusted scheduling priority.
+         kill_on_invalid_dependencies: Whether to kill the job if it has invalid dependencies.
+         timeout_signal: The signal to send to the job when it runs out of time.
+         timeout_signal_grace_period: The time to wait before sending `timeout_signal`.
+         requeue: Whether or not the job is eligible for requeueing.
+         requeue_on_exit_code: The exit code that triggers requeueing.
+         requeue_on_timeout: Whether to requeue the job when it times out.
+         requeue_max_attempts: The maximum number of times to attempt requeueing.
+     """
+
+     # Job requirements
+     requirements: resources.JobRequirements
+     time: dt.timedelta
+     bind: tp.Mapping[ResourceBindType | str, str | None] | None = None
+     bind_flag: str | None = None
+
+     # Placement
+     account: str | None = None
+     partition: str | None = None
+     qos: str | None = None
+     priority: int | None = None
+     reservation: str | tp.Iterable[str] | None = None
+     exclusive: bool = False
+     oversubscribe: bool = False
+     overcommit: bool = False
+     nice: int | None = None
+
+     # Job dependency handling
+     kill_on_invalid_dependencies: bool = True
+
+     # Job rescheduling
+     timeout_signal: signal.Signals = signal.SIGUSR2
+     timeout_signal_grace_period: dt.timedelta = dt.timedelta(seconds=90)
+
+     requeue: bool = True  # Is this job eligible for requeueing?
+     requeue_on_exit_code: int = 42  # The exit code that triggers requeueing
+     requeue_on_timeout: bool = True  # Should the job requeue upon timeout (at time minus the grace period)?
+     requeue_max_attempts: int = 5  # How many times to attempt requeueing
+
+     @property
+     def requeue_timeout(self) -> dt.timedelta:
+         return self.time - self.timeout_signal_grace_period
+
+     def __post_init__(self) -> None:
+         if not isinstance(self.requirements, resources.JobRequirements):
+             raise TypeError(
+                 f"requirements must be a `xm_slurm.JobRequirements`, got {type(self.requirements)}. "
+                 "If you're still using `xm.JobRequirements`, please update to `xm_slurm.JobRequirements`."
+             )
+         if not isinstance(self.time, dt.timedelta):
+             raise TypeError(f"time must be a `datetime.timedelta`, got {type(self.time)}")
+         if self.bind is not None:
+             if not isinstance(self.bind, collections.abc.Mapping):
+                 raise TypeError(f"bind must be a mapping, got {type(self.bind)}")
+             for resource, value in self.bind.items():
+                 if resource not in (
+                     resources.ResourceType.GPU,
+                     resources.ResourceType.MEMORY,
+                     resources.ResourceType.RAM,
+                 ) and not isinstance(resource, str):
+                     raise TypeError(
+                         f"bind resource must be {resources.ResourceType.GPU.name}, {resources.ResourceType.MEMORY.name}, {resources.ResourceType.RAM.name}, or a `str`, got {type(resource)}"
+                     )
+                 if value is not None and not isinstance(value, str):
+                     raise TypeError(f"bind value must be None or a string, got {type(value)}")
+         if self.bind_flag is not None and not isinstance(self.bind_flag, str):
+             raise TypeError(f"bind_flag must be a string, got {type(self.bind_flag)}")
+
+         if not isinstance(self.timeout_signal, signal.Signals):
+             raise TypeError(
+                 f"timeout_signal must be a `signal.Signals`, got {type(self.timeout_signal)}"
+             )
+         if not isinstance(self.timeout_signal_grace_period, dt.timedelta):
+             raise TypeError(
+                 f"timeout_signal_grace_period must be a `datetime.timedelta`, got {type(self.timeout_signal_grace_period)}"
+             )
+         if self.requeue_max_attempts < 0:
+             raise ValueError(
+                 f"requeue_max_attempts must be greater than or equal to 0, got {self.requeue_max_attempts}"
+             )
+         if self.requeue_on_exit_code == 0:
+             raise ValueError("requeue_on_exit_code must not be 0 to avoid unexpected behavior.")
+         if self.exclusive and self.oversubscribe:
+             raise ValueError("exclusive and oversubscribe are mutually exclusive.")
+         if self.nice is not None and not (-2147483645 <= self.nice <= 2147483645):
+             raise ValueError(f"nice must be between -2147483645 and 2147483645, got {self.nice}")
+
+     @classmethod
+     def Spec(cls, tag: str | None = None) -> SlurmSpec:
+         return SlurmSpec(tag=tag)
+
+     def batch_directives(self) -> list[str]:
+         # Job requirements
+         directives = self.requirements.batch_directives()
+
+         # Time
+         directives.append(f"--time={utils.timestr_from_timedelta(self.time)}")
+
+         # Job dependency handling
+         directives.append(
+             f"--kill-on-invalid-dep={'yes' if self.kill_on_invalid_dependencies else 'no'}"
+         )
+
+         # Placement
+         if self.account is not None:
+             directives.append(f"--account={self.account}")
+         if self.partition is not None:
+             directives.append(f"--partition={self.partition}")
+         if self.qos is not None:
+             directives.append(f"--qos={self.qos}")
+         if self.priority is not None:
+             directives.append(f"--priority={self.priority}")
+         if self.reservation is not None:
+             match self.reservation:
+                 case str():
+                     directives.append(f"--reservation={self.reservation}")
+                 case collections.abc.Iterable():
+                     directives.append(f"--reservation={','.join(self.reservation)}")
+                 case _:
+                     raise ValueError(f"Invalid reservation type: {type(self.reservation)}")
+         if self.exclusive:
+             directives.append("--exclusive")
+         if self.oversubscribe:
+             directives.append("--oversubscribe")
+         if self.overcommit:
+             directives.append("--overcommit")
+         if self.nice is not None:
+             directives.append(f"--nice={self.nice}")
+
+         # Job rescheduling
+         # Use total_seconds() so grace periods of a day or longer render correctly.
+         directives.append(
+             f"--signal={self.timeout_signal.name.removeprefix('SIG')}@{int(self.timeout_signal_grace_period.total_seconds())}"
+         )
+         if self.requeue and self.requeue_max_attempts > 0:
+             directives.append("--requeue")
+         else:
+             directives.append("--no-requeue")
+
+         return directives
+
+     def step_directives(self) -> list[str]:
+         directives = self.requirements.step_directives()
+
+         # Resource binding
+         if self.bind is not None:
+             for resource, value in self.bind.items():
+                 if value is None:
+                     value = "none"
+                 match resource:
+                     case resources.ResourceType.MEMORY | resources.ResourceType.RAM:
+                         directives.append(f"--mem-bind={value}")
+                     case resources.ResourceType.GPU:
+                         directives.append(f"--gpu-bind={value}")
+                     case str():
+                         directives.append(f"--tres-bind=gres/{resource}:{value}")
+                     case _:
+                         raise ValueError(f"Unsupported resource type {resource!r} for binding.")
+
+         if self.bind_flag is not None:
+             directives.append(f"--gres-flags={self.bind_flag}")
+
+         return directives
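
For a sense of how these directives come together, here is a minimal sketch (not part of the package) that builds an executor and prints the rendered flags. `JobRequirements` is defined in xm_slurm/resources.py, which this diff does not show, so the `cpu=4` constructor call is an assumption; the account and partition names are hypothetical, and everything else uses only the fields defined above.

import datetime as dt
import signal

from xm_slurm import executors, resources

executor = executors.Slurm(
    # Assumed JobRequirements signature; see xm_slurm/resources.py for the real one.
    requirements=resources.JobRequirements(cpu=4),
    time=dt.timedelta(hours=4),
    account="my-account",  # hypothetical account
    partition="gpu",  # hypothetical partition
    timeout_signal=signal.SIGUSR2,
    timeout_signal_grace_period=dt.timedelta(seconds=120),
    bind={resources.ResourceType.GPU: "closest"},
)

# sbatch-level flags: beyond whatever requirements.batch_directives() emits,
# this configuration should yield --time=..., --kill-on-invalid-dep=yes,
# --account=my-account, --partition=gpu, --signal=USR2@120, and --requeue.
print("\n".join(executor.batch_directives()))

# Per-step flags: the GPU binding above renders as --gpu-bind=closest.
print("\n".join(executor.step_directives()))

Note that `--signal=USR2@120` follows directly from the timeout fields: the signal name has its `SIG` prefix stripped and the grace period is rendered in whole seconds.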