experimaestro 1.5.1__py3-none-any.whl → 2.0.0a8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of experimaestro might be problematic.
- experimaestro/__init__.py +14 -4
- experimaestro/__main__.py +3 -423
- experimaestro/annotations.py +14 -4
- experimaestro/cli/__init__.py +311 -0
- experimaestro/{filter.py → cli/filter.py} +23 -9
- experimaestro/cli/jobs.py +268 -0
- experimaestro/cli/progress.py +269 -0
- experimaestro/click.py +0 -35
- experimaestro/commandline.py +3 -7
- experimaestro/connectors/__init__.py +29 -14
- experimaestro/connectors/local.py +19 -10
- experimaestro/connectors/ssh.py +27 -8
- experimaestro/core/arguments.py +45 -3
- experimaestro/core/callbacks.py +52 -0
- experimaestro/core/context.py +8 -9
- experimaestro/core/identifier.py +310 -0
- experimaestro/core/objects/__init__.py +44 -0
- experimaestro/core/{objects.py → objects/config.py} +399 -772
- experimaestro/core/objects/config_utils.py +58 -0
- experimaestro/core/objects/config_walk.py +151 -0
- experimaestro/core/objects.pyi +15 -45
- experimaestro/core/serialization.py +63 -9
- experimaestro/core/serializers.py +1 -8
- experimaestro/core/types.py +104 -66
- experimaestro/experiments/cli.py +154 -72
- experimaestro/experiments/configuration.py +10 -1
- experimaestro/generators.py +6 -1
- experimaestro/ipc.py +4 -1
- experimaestro/launcherfinder/__init__.py +1 -1
- experimaestro/launcherfinder/base.py +2 -18
- experimaestro/launcherfinder/parser.py +8 -3
- experimaestro/launcherfinder/registry.py +52 -140
- experimaestro/launcherfinder/specs.py +49 -10
- experimaestro/launchers/direct.py +0 -47
- experimaestro/launchers/slurm/base.py +54 -14
- experimaestro/mkdocs/__init__.py +1 -1
- experimaestro/mkdocs/base.py +6 -8
- experimaestro/notifications.py +38 -12
- experimaestro/progress.py +406 -0
- experimaestro/run.py +24 -3
- experimaestro/scheduler/__init__.py +18 -1
- experimaestro/scheduler/base.py +108 -808
- experimaestro/scheduler/dynamic_outputs.py +184 -0
- experimaestro/scheduler/experiment.py +387 -0
- experimaestro/scheduler/jobs.py +475 -0
- experimaestro/scheduler/signal_handler.py +32 -0
- experimaestro/scheduler/state.py +75 -0
- experimaestro/scheduler/workspace.py +27 -8
- experimaestro/scriptbuilder.py +18 -3
- experimaestro/server/__init__.py +36 -5
- experimaestro/server/data/1815e00441357e01619e.ttf +0 -0
- experimaestro/server/data/2463b90d9a316e4e5294.woff2 +0 -0
- experimaestro/server/data/2582b0e4bcf85eceead0.ttf +0 -0
- experimaestro/server/data/89999bdf5d835c012025.woff2 +0 -0
- experimaestro/server/data/914997e1bdfc990d0897.ttf +0 -0
- experimaestro/server/data/c210719e60948b211a12.woff2 +0 -0
- experimaestro/server/data/index.css +5187 -5068
- experimaestro/server/data/index.css.map +1 -1
- experimaestro/server/data/index.js +68887 -68064
- experimaestro/server/data/index.js.map +1 -1
- experimaestro/settings.py +45 -5
- experimaestro/sphinx/__init__.py +7 -17
- experimaestro/taskglobals.py +7 -2
- experimaestro/tests/core/__init__.py +0 -0
- experimaestro/tests/core/test_generics.py +206 -0
- experimaestro/tests/definitions_types.py +5 -3
- experimaestro/tests/launchers/bin/sbatch +34 -7
- experimaestro/tests/launchers/bin/srun +5 -0
- experimaestro/tests/launchers/common.py +17 -5
- experimaestro/tests/launchers/config_slurm/launchers.py +25 -0
- experimaestro/tests/restart.py +10 -5
- experimaestro/tests/tasks/all.py +23 -10
- experimaestro/tests/tasks/foreign.py +2 -4
- experimaestro/tests/test_checkers.py +2 -2
- experimaestro/tests/test_dependencies.py +11 -17
- experimaestro/tests/test_experiment.py +73 -0
- experimaestro/tests/test_file_progress.py +425 -0
- experimaestro/tests/test_file_progress_integration.py +477 -0
- experimaestro/tests/test_findlauncher.py +12 -5
- experimaestro/tests/test_forward.py +5 -5
- experimaestro/tests/test_generators.py +93 -0
- experimaestro/tests/test_identifier.py +182 -158
- experimaestro/tests/test_instance.py +19 -27
- experimaestro/tests/test_objects.py +13 -20
- experimaestro/tests/test_outputs.py +6 -6
- experimaestro/tests/test_param.py +68 -30
- experimaestro/tests/test_progress.py +4 -4
- experimaestro/tests/test_serializers.py +24 -64
- experimaestro/tests/test_ssh.py +7 -0
- experimaestro/tests/test_tags.py +50 -21
- experimaestro/tests/test_tasks.py +42 -51
- experimaestro/tests/test_tokens.py +11 -8
- experimaestro/tests/test_types.py +24 -21
- experimaestro/tests/test_validation.py +67 -110
- experimaestro/tests/token_reschedule.py +1 -1
- experimaestro/tokens.py +24 -13
- experimaestro/tools/diff.py +8 -1
- experimaestro/typingutils.py +20 -11
- experimaestro/utils/asyncio.py +6 -2
- experimaestro/utils/multiprocessing.py +44 -0
- experimaestro/utils/resources.py +11 -3
- {experimaestro-1.5.1.dist-info → experimaestro-2.0.0a8.dist-info}/METADATA +28 -36
- experimaestro-2.0.0a8.dist-info/RECORD +166 -0
- {experimaestro-1.5.1.dist-info → experimaestro-2.0.0a8.dist-info}/WHEEL +1 -1
- {experimaestro-1.5.1.dist-info → experimaestro-2.0.0a8.dist-info}/entry_points.txt +0 -4
- experimaestro/launchers/slurm/cli.py +0 -29
- experimaestro/launchers/slurm/configuration.py +0 -597
- experimaestro/scheduler/environment.py +0 -94
- experimaestro/server/data/016b4a6cdced82ab3aa1.ttf +0 -0
- experimaestro/server/data/50701fbb8177c2dde530.ttf +0 -0
- experimaestro/server/data/878f31251d960bd6266f.woff2 +0 -0
- experimaestro/server/data/b041b1fa4fe241b23445.woff2 +0 -0
- experimaestro/server/data/b6879d41b0852f01ed5b.woff2 +0 -0
- experimaestro/server/data/d75e3fd1eb12e9bd6655.ttf +0 -0
- experimaestro/tests/launchers/config_slurm/launchers.yaml +0 -134
- experimaestro/utils/yaml.py +0 -202
- experimaestro-1.5.1.dist-info/RECORD +0 -148
- {experimaestro-1.5.1.dist-info → experimaestro-2.0.0a8.dist-info/licenses}/LICENSE +0 -0
--- a/experimaestro/launchers/slurm/configuration.py
+++ /dev/null
@@ -1,597 +0,0 @@
-import codecs
-from collections import defaultdict
-from copy import deepcopy
-import io
-import math
-from attr import Factory
-from attrs import define
-import logging
-from experimaestro import Annotated
-from typing import (
-    Dict,
-    List,
-    Optional,
-    Set,
-    TextIO,
-)
-import re
-import humanfriendly
-from dataclasses import dataclass, field
-from experimaestro.launcherfinder import YAMLDataClass, HostRequirement
-from experimaestro.launcherfinder.base import LauncherConfiguration
-from experimaestro.launcherfinder.registry import (
-    Initialize,
-    LauncherRegistry,
-)
-from experimaestro.launcherfinder.specs import (
-    CPUSpecification,
-    CudaSpecification,
-    HostSpecification,
-)
-from experimaestro.compat import cached_property
-from . import Launcher
-from experimaestro.connectors import (
-    Redirect,
-)
-
-logger = logging.getLogger("xpm.slurm")
-
-
-def fill_nodes_configuration(input: TextIO, configuration: "SlurmConfiguration"):
-    """Parses the output of scontrol show nodes"""
-    re_nodename = re.compile(r"""^NodeName=([_\-\w]+)""")
-    re_features = re.compile(r"""^\s*AvailableFeatures=([,_\-\w]+)""")
-    re_partitions = re.compile(r"""^\s*Partitions=([,\-_\w]+)""")
-
-    nodename = ""
-    features = []
-    partition_names = []
-    partitions = configuration.partitions
-    partitions2features2nodes = defaultdict(lambda: {})
-
-    def process():
-        for partition_name in partition_names:
-            partition = partitions.setdefault(partition_name, SlurmPartition(nodes=[]))
-
-            fl = "&".join(sorted(features))
-            nodes = partitions2features2nodes[partition_name].get(fl)
-            if nodes is None:
-                nodes = SlurmNodes(hosts=[nodename], features=features)
-                partitions2features2nodes[partition_name][fl] = nodes
-                partition.nodes.append(nodes)
-            else:
-                if nodename not in nodes.hosts:
-                    nodes.hosts.append(nodename)
-
-    for line in input.readlines():
-        if match := re_nodename.search(line):
-            if nodename:
-                process()
-            nodename = match.group(1)
-        elif match := re_features.search(line):
-            features = match.group(1).split(",")
-        elif match := re_partitions.search(line):
-            partition_names = match.group(1).split(",")
-
-    if nodename:
-        process()
-
-
-def fill_partitions_configuration(input: TextIO, configuration: "SlurmConfiguration"):
-    """Parses the output of scontrol show --oneliner partition"""
-    re_partitionname = re.compile(r"""^PartitionName=(\w+)""")
-    re_mem_per_cpu = re.compile(r"""(?:=|\s)DefMemPerCPU=(\d+)(?:\D|$)""")
-    re_cpu_per_gpu = re.compile(r"""(?:=|\s)DefCpuPerGPU=(\d+)(?:\D|$)""")
-
-    for line in input.readlines():
-        if match := re_partitionname.search(line):
-            name = match.group(1)
-            cfg = configuration.partitions.setdefault(name, SlurmPartition(nodes=[]))
-
-            if m := re_mem_per_cpu.search(line):
-                cfg.mem_per_cpu = int(m.group(1)) * 1024
-
-            if m := re_cpu_per_gpu.search(line):
-                cfg.cpu_per_gpu = int(m.group(1))
-
-
-# ---- SLURM launcher finder
-
-
-def parse_size(s: Optional[str]):
-    return humanfriendly.parse_size(s) if s else None
-
-
-@dataclass
-class GPUConfig(YAMLDataClass):
-    """Represents a GPU"""
-
-    model: Optional[str] = None
-    count: int = 0
-    memory: Annotated[int, Initialize(parse_size)] = 0
-
-    min_memory: Annotated[int, Initialize(parse_size)] = 0
-    """Minimum memory to be allocated on this node"""
-
-    min_mem_ratio: Optional[float] = 0.0
-    """Minimum memory ratio"""
-
-    def update(self, other: "GPUConfig"):
-        if other.model:
-            self.model = other.model
-        if other.count:
-            self.count = other.count
-        if other.memory:
-            self.memory = other.memory
-        if other.min_memory:
-            self.min_memory = other.min_memory
-
-    def to_spec(self):
-        cuda = []
-        min_memory = max(int(self.memory * self.min_mem_ratio), self.min_memory)
-        cuda.extend(
-            [
-                CudaSpecification(self.memory, self.model, min_memory)
-                for _ in range(self.count)
-            ]
-        )
-        return cuda
-
-
-@dataclass
-class CPUConfig(YAMLDataClass):
-    cpu_per_gpu: int = 0
-    """Number of CPU per GPU"""
-
-    mem_per_cpu: Annotated[int, Initialize(humanfriendly.parse_size)] = 0
-    """Memory per CPU"""
-
-    cores: int = 0
-
-    memory: Annotated[int, Initialize(parse_size)] = 0
-
-    def update(self, other: "CPUConfig"):
-        if other.cpu_per_gpu:
-            self.cpu_per_gpu = other.cpu_per_gpu
-        if other.mem_per_cpu:
-            self.mem_per_cpu = other.mem_per_cpu
-        if other.memory:
-            self.memory = other.memory
-        if other.cores:
-            self.cores = other.cores
-
-    def to_spec(self):
-        return CPUSpecification(
-            memory=self.memory,
-            cores=self.cores,
-            mem_per_cpu=self.mem_per_cpu,
-            cpu_per_gpu=self.cpu_per_gpu,
-        )
-
-
-@define
-class SlurmHostSpecification(HostSpecification):
-    features: List[str] = Factory(list)
-    hosts: List[str] = Factory(list)
-    partition: str = Factory(str)
-
-    qos_id: Optional[str] = Factory(lambda: None)
-    """Quality of Service"""
-
-    account_id: Optional[str] = Factory(lambda: None)
-    """Account for this host"""
-
-
-@dataclass
-class SlurmNodeConfiguration(YAMLDataClass):
-    max_duration: Annotated[int, Initialize(humanfriendly.parse_timespan)] = 0
-    """Maximum duration of a job"""
-
-    gpu: GPUConfig = field(default_factory=GPUConfig)
-    """GPU Configuration"""
-
-    cpu: CPUConfig = field(default_factory=CPUConfig)
-    """CPU Configuration"""
-
-    def update(self, other: "SlurmNodeConfiguration"):
-        if other.max_duration:
-            self.max_duration = other.max_duration
-
-        if other.gpu:
-            self.gpu.update(other.gpu)
-
-        if other.cpu:
-            self.cpu.update(other.cpu)
-
-    def to_host_spec(self):
-        spec = SlurmHostSpecification(
-            cpu=(self.cpu or CPUConfig()).to_spec(),
-            cuda=(self.gpu or GPUConfig()).to_spec(),
-        )
-        spec.max_duration = self.max_duration or 0
-        return spec
-
-
-@dataclass
-class SlurmNodes(YAMLDataClass):
-    features: List[str] = field(default_factory=list)
-    """Nodes features"""
-
-    hosts: List[str] = field(default_factory=list)
-    """List of hostnames"""
-
-    configuration: Optional[SlurmNodeConfiguration] = None
-    """(optional) nodes configuration"""
-
-    count: int = 0
-    """Number of hosts (if list of hosts is empty)"""
-
-
-@dataclass
-class SlurmPartition(YAMLDataClass):
-    """A Slurm partition"""
-
-    accounts: List[str] = field(default_factory=list)
-    """List of accounts for this partition with the associated priority modifier"""
-
-    qos: List[str] = field(default_factory=list)
-    """List of QoS for this partition with the associated priority modifier"""
-
-    nodes: List[SlurmNodes] = field(default_factory=list)
-    """List of nodes"""
-
-    configuration: Optional[SlurmNodeConfiguration] = None
-    """Partition configuration"""
-
-    priority: int = 0
-    """Priority for choosing this partition (higher preferred)"""
-
-    disabled: bool = False
-    """Can be used to disable a partition"""
-
-
-class FeatureConjunction(List[str]):
-    def __init__(self, features: List[str]):
-        super().__init__(sorted(features))
-
-    def __hash__(self) -> int:
-        return sum([hash(tag) for tag in self])
-
-
-@dataclass
-class SlurmFeature(YAMLDataClass):
-    """Associate a configuration with a Slurm feature"""
-
-    configuration: Optional[SlurmNodeConfiguration] = None
-
-
-@dataclass
-class SlurmQOS(YAMLDataClass):
-    max_duration: Annotated[int, Initialize(humanfriendly.parse_timespan)] = 0
-    """Maximum duration of a job"""
-
-    min_gpu: int = 0
-    """Minimum number of GPUs"""
-
-    priority: int = 0
-    """Priority modifier for this QoS"""
-
-
-class NodesSpecComputer:
-    def __init__(self, config: "SlurmConfiguration", partition: SlurmPartition):
-        self.config = config
-        self.partition = partition
-        self.main_config = config
-        self.config = deepcopy(config.configuration)
-
-        self.config.max_duration = (
-            self.partition.configuration.max_duration
-            if self.partition.configuration
-            else None
-        )
-        self.priority = partition.priority
-        self.qos_id = None
-        self.min_gpu = 0
-
-    def update(self, config: SlurmNodeConfiguration):
-        self.config = deepcopy(self.config)
-        self.config.update(config)
-
-    def update_with_qos(self, qos_id: str):
-        self.qos_id = qos_id
-        if qos := self.main_config.qos.get(qos_id, None):
-            self.priority += qos.priority
-            self.min_gpu = qos.min_gpu
-            if qos.max_duration > 0:
-                self.config.max_duration = qos.max_duration
-
-    def get_host(self) -> SlurmHostSpecification:
-        host = self.config.to_host_spec()
-        host.priority = self.priority
-        host.qos_id = self.qos_id
-        host.min_gpu = self.min_gpu
-
-        return host
-
-
-class FeatureBooleanFormula:
-    clauses: Set[List[str]]
-
-    def __init__(self):
-        self.clauses = set()
-
-    def add(self, features: List[str]):
-        """Adds conjunction of tags"""
-        self.clauses.add(FeatureConjunction(features))
-
-    def to_constraint(self):
-        """Returns a constraint for sbatch/srun"""
-        it = ("&".join(clause) for clause in self.clauses)
-        s = f"""({")|(".join(it)})"""
-        return None if s == "()" else s
-
-
-class MatchingSpec:
-    def __init__(self):
-        self.fbf = FeatureBooleanFormula()
-        self.hosts: set[str] = set()
-        self.partitions: Set[str] = set()
-        self.qos: Optional[str] = None
-        self.account: Optional[str] = None
-        self.mem_per_cpu: int = 0
-
-    def update(self, host_spec: SlurmHostSpecification):
-        if host_spec.qos_id != self.qos and self.qos is not None:
-            # Cannot update with other QoS
-            return
-        self.qos = host_spec.qos_id
-
-        if host_spec.account_id != self.account and self.account is not None:
-            # Cannot update with other account
-            return
-
-        if (
-            host_spec.cpu.mem_per_cpu > 0
-            and self.mem_per_cpu > 0
-            and host_spec.cpu.mem_per_cpu != self.mem_per_cpu
-        ):
-            # Cannot update with different mem per cpu
-            return
-
-        if host_spec.cpu.mem_per_cpu:
-            self.mem_per_cpu = host_spec.cpu.mem_per_cpu
-
-        self.account = host_spec.account_id
-
-        self.partitions.add(host_spec.partition)
-        self.fbf.add(host_spec.features)
-        if host_spec.hosts:
-            self.hosts.update(host_spec.hosts)
-
-
-@dataclass
-class SlurmConfiguration(YAMLDataClass, LauncherConfiguration):
-    id: str
-    """Slurm ID"""
-
-    partitions: Dict[str, SlurmPartition]
-    """List of partitions"""
-
-    connector: str = "local"
-    """Name of the connector"""
-
-    path: str = "/usr/bin"
-    """Path for SLURM commands"""
-
-    use_features: bool = True
-    """Whether features should be used"""
-
-    use_hosts: bool = True
-    """Whether hosts should be used in the query"""
-
-    use_memory_contraint: bool = True
-    """Whether memory constraint can be specified"""
-
-    query_slurm: bool = False
-    """True to query SLURM directly (using scontrol)"""
-
-    tags: List[str] = field(default_factory=list)
-
-    weight: int = 0
-
-    qos: Dict[str, SlurmQOS] = field(default_factory=lambda: {})
-
-    features_regex: Annotated[
-        List[re.Pattern],
-        Initialize(lambda regexps: [re.compile(regex) for regex in regexps]),
-    ] = field(default_factory=list)
-    """
-    Regex to get the information from features
-    - CUDA: cuda:count, cuda:memory
-    """
-
-    features: Dict[str, SlurmFeature] = field(default_factory=lambda: {})
-    """List of features with associated configurations"""
-
-    configuration: Optional[SlurmNodeConfiguration] = None
-    """Partition configuration"""
-
-    def compute(self, registry: "LauncherRegistry"):
-        if self.query_slurm:
-            self.query_slurm = False
-
-            # Read node information
-            connector = registry.getConnector(self.connector)
-            pb = connector.processbuilder()
-            pb.command = ["scontrol", "--hide", "show", "nodes"]
-
-            def handle_output(input: io.BytesIO):
-                StreamReader = codecs.getreader("utf-8")
-                fill_nodes_configuration(StreamReader(input), self)
-
-            pb.stdout = Redirect.pipe(handle_output)
-            pb.start()
-
-            # Read partition information
-            pb = connector.processbuilder()
-            pb.command = ["scontrol", "--hide", "show", "--oneliner", "partition"]
-
-            def handle_output(input: io.BytesIO):
-                StreamReader = codecs.getreader("utf-8")
-                fill_partitions_configuration(StreamReader(input), self)
-
-            pb.stdout = Redirect.pipe(handle_output)
-            pb.start()
-
-    @cached_property
-    def computed_nodes(self) -> List[SlurmHostSpecification]:
-        """Computes the list of potential compute nodes (grouped by similar nodes)"""
-        hosts = []
-
-        for partition_name, partition in self.partitions.items():
-            if partition.disabled:
-                continue
-
-            for node in partition.nodes:
-                nodes_spec = NodesSpecComputer(self, partition)
-                nodes_spec.update(self.configuration)
-
-                # Set partition GPU
-                if partition.configuration:
-                    nodes_spec.update(partition.configuration)
-
-                if node.configuration:
-                    nodes_spec.update(node.configuration)
-
-                for feature in node.features:
-                    # Use feature data directly
-                    if data := self.features.get(feature, None):
-                        nodes_spec.update(data.configuration)
-
-                    # logger.debug("Looking at %s", self.features_regex)
-                    for regex in self.features_regex:
-                        # logger.debug("%s/%s => %s", regex, tag, regex.match(tag))
-                        if m := regex.match(feature):
-                            d = m.groupdict()
-                            if _count := d.get("cuda_count", None):
-                                nodes_spec.config.gpu.count = int(_count)
-                            if memory := d.get("cuda_memory", None):
-                                nodes_spec.config.gpu.memory = humanfriendly.parse_size(
-                                    memory
-                                )
-
-                qos_list = partition.qos or [None]
-                accounts = partition.accounts or [None]
-                for qos in qos_list:
-                    qos_nodes_spec = deepcopy(nodes_spec)
-                    qos_nodes_spec.update_with_qos(qos)
-
-                    host = qos_nodes_spec.get_host()
-                    host.features = node.features
-                    host.partition = partition_name
-                    host.hosts = node.hosts
-
-                    for account in accounts:
-                        account_host = deepcopy(host)
-                        account_host.account_id = account
-                        hosts.append(account_host)
-                        logging.debug("Computed slurm host: %s", host)
-
-        hosts.sort(key=lambda host: -host.priority)
-        return hosts
-
-    def get(
-        self, registry: "LauncherRegistry", requirement: HostRequirement
-    ) -> Optional["Launcher"]:
-        # Compute the configuration if needed
-        self.compute(registry)
-
-        # Compute tags or hosts
-
-        # Current set of constraints
-        current_match = None
-        matching_spec = MatchingSpec()
-
-        for node in self.computed_nodes:
-            if match := requirement.match(node):
-                logger.debug("Match %s for %s", match, node)
-
-                # If score is below the current one, goes to the next one
-                if current_match and (
-                    match.score <= current_match.score
-                    and match.requirement is not current_match.requirement
-                ):
-                    continue
-
-                # If the requirement has changed, clear everything
-                if not current_match or (
-                    match.requirement is not current_match.requirement
-                ):
-                    # Clear if the requirement changed
-                    logger.debug("Clearing %s / %s", current_match, match)
-                    matching_spec = MatchingSpec()
-                    current_match = match
-
-                logger.debug(
-                    "Adding %s, %s, %s", node.partition, node.features, node.hosts
-                )
-                matching_spec.update(node)
-
-        # Returns the appropriate launcher (if any)
-        use_features = matching_spec.fbf.clauses and self.use_features
-        if use_features or matching_spec.hosts:
-            assert current_match is not None
-
-            # Launching using tags
-            from .base import SlurmLauncher
-
-            launcher = SlurmLauncher(
-                connector=registry.getConnector(self.connector), binpath=self.path
-            )
-
-            launcher.options.partition = ",".join(matching_spec.partitions)
-            launcher.options.gpus_per_node = (
-                len(current_match.requirement.cuda_gpus)
-                if current_match.requirement.cuda_gpus
-                else None
-            )
-
-            launcher.options.qos = matching_spec.qos
-            launcher.options.account = matching_spec.account
-
-            if current_match.requirement.cpu.cores > 0:
-                launcher.options.cpus_per_task = current_match.requirement.cpu.cores
-
-            if current_match.requirement.cpu.memory > 0:
-                if self.use_memory_contraint:
-                    launcher.options.mem = (
-                        f"{current_match.requirement.cpu.memory // (1024*1024)}M"
-                    )
-                else:
-                    assert (
-                        matching_spec.mem_per_cpu > 0
-                    ), "Memory per CPU should be specified"
-                    cpus_per_task = math.ceil(
-                        current_match.requirement.cpu.memory / matching_spec.mem_per_cpu
-                    )
-                    launcher.options.cpus_per_task = max(
-                        launcher.options.cpus_per_task, cpus_per_task
-                    )
-
-            if use_features:
-                launcher.options.constraint = matching_spec.fbf.to_constraint()
-            else:
-                logger.warning("Selecting first host")
-                launcher.options.nodelist = next(iter(matching_spec.hosts))
-
-            if current_match.requirement.duration > 0:
-                total_seconds = current_match.requirement.duration
-                seconds = total_seconds % 60
-                minutes = (total_seconds // 60) % 60
-                hours = total_seconds // 3600
-                launcher.options.time = f"{hours}:{minutes}:{seconds}"
-
-            logger.debug("Slurm options: %s", " ".join(launcher.options.args()))
-            return launcher
-
-        return None
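The deleted FeatureBooleanFormula above is what turned groups of node features into an sbatch/srun --constraint expression (a disjunction of feature conjunctions). Below is a minimal standalone sketch of that behavior, with the two classes copied from the removed module; the feature names (gpu, v100, a100) are illustrative, not taken from any real cluster.

from typing import List, Set


class FeatureConjunction(List[str]):
    # Sorted feature list, so equal conjunctions compare and hash identically
    def __init__(self, features: List[str]):
        super().__init__(sorted(features))

    def __hash__(self) -> int:
        return sum(hash(tag) for tag in self)


class FeatureBooleanFormula:
    def __init__(self):
        self.clauses: Set[FeatureConjunction] = set()

    def add(self, features: List[str]):
        # Adds one conjunction of features (one group of similar nodes)
        self.clauses.add(FeatureConjunction(features))

    def to_constraint(self):
        # Disjunction of conjunctions, in Slurm constraint syntax
        it = ("&".join(clause) for clause in self.clauses)
        s = f"""({")|(".join(it)})"""
        return None if s == "()" else s


fbf = FeatureBooleanFormula()
fbf.add(["v100", "gpu"])  # canonicalized to gpu&v100
fbf.add(["a100"])
print(fbf.to_constraint())  # e.g. (gpu&v100)|(a100)

Note that clauses is a set, so the order of clauses in the resulting constraint string is unspecified; sorting inside FeatureConjunction only canonicalizes each individual conjunction.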
--- a/experimaestro/scheduler/environment.py
+++ /dev/null
@@ -1,94 +0,0 @@
-"""Defines an experimental environment"""
-
-from pathlib import Path
-from typing import Dict
-import marshmallow as mm
-from experimaestro.utils.settings import JsonSettings
-from pytools import memoize
-
-
-def schema(schema_cls):
-    def annotate(object_cls):
-        schema_cls.OBJECT_CLS = object_cls
-        object_cls.SCHEMA = schema_cls
-        return object_cls
-
-    return annotate
-
-
-class _Schema(mm.Schema):
-    @mm.post_load
-    def make_settings(self, data, **kwargs):
-        settings = self.__class__.OBJECT_CLS()
-        for key, value in data.items():
-            setattr(settings, key, value)
-        return settings
-
-
-class EnvironmentSchema(_Schema):
-    hostname = mm.fields.Str()
-    """The hostname (can be empty for localhost)"""
-
-    pythonpath = mm.fields.Str()
-    """Path to python executable"""
-    workdir = mm.fields.Str()
-    environ = mm.fields.Dict(keys=mm.fields.Str(), values=mm.fields.Str())
-
-
-class Schema(_Schema):
-    environments = mm.fields.Dict(
-        keys=mm.fields.Str(), values=mm.fields.Nested(EnvironmentSchema)
-    )
-
-
-@schema(Schema)
-class Settings(JsonSettings):
-    """User settings"""
-
-    def __init__(self):
-        self.environments: Dict[str, str] = {}
-
-
-@schema(EnvironmentSchema)
-class Environment:
-    """This defines the environment for an experiment, and can be stored"""
-
-    def __init__(self, workdir=None):
-        self.hostname = None
-        self._workdir = workdir
-        self.pythonpath = None
-        self.environ = {}
-
-    @property
-    def basepath(self):
-        if self.hostname:
-            from ..connectors.ssh import SshPath
-
-            return SshPath(f"ssh://{self.hostname}")
-        return Path()
-
-    @property
-    def workdir(self) -> Path:
-        assert self._workdir, "The working directory has not been set"
-        return self.basepath / self._workdir
-
-    @workdir.setter
-    def workdir(self, value):
-        self._workdir = value
-
-    def setenv(self, key: str, value: str):
-        """Set the environment variable with key"""
-        self.environ[key] = value
-
-    @staticmethod
-    @memoize()
-    def _load():
-        path = (
-            Path("~").expanduser() / ".config" / "experimaestro" / "environments.json"
-        )
-        return Settings.load(path)
-
-    @staticmethod
-    def get(name: str):
-        """Retrieve an environment by name"""
-        return Environment._load().environments[name]
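The removed environment module paired each settings class with a marshmallow schema through the small schema decorator above, using post_load to turn validated dictionaries into objects. Below is a self-contained sketch of that pattern; it assumes only that marshmallow is installed, and the field values ("cluster", "/scratch/xp") are illustrative, not part of the experimaestro API.

import marshmallow as mm


def schema(schema_cls):
    # Bind schema and settings class to each other (pattern from the deleted module)
    def annotate(object_cls):
        schema_cls.OBJECT_CLS = object_cls
        object_cls.SCHEMA = schema_cls
        return object_cls

    return annotate


class _Schema(mm.Schema):
    @mm.post_load
    def make_settings(self, data, **kwargs):
        # Instantiate the bound class and copy the validated fields onto it
        settings = self.__class__.OBJECT_CLS()
        for key, value in data.items():
            setattr(settings, key, value)
        return settings


class EnvironmentSchema(_Schema):
    hostname = mm.fields.Str()
    workdir = mm.fields.Str()


@schema(EnvironmentSchema)
class Environment:
    def __init__(self):
        self.hostname = None
        self.workdir = None


env = EnvironmentSchema().load({"hostname": "cluster", "workdir": "/scratch/xp"})
assert isinstance(env, Environment) and env.workdir == "/scratch/xp"

The design keeps plain Python objects free of serialization concerns: the schema validates and deserializes, while the settings class only holds state.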
Binary files (font assets) are not shown.