experimaestro 1.5.1__py3-none-any.whl → 2.0.0a8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of experimaestro might be problematic. Click here for more details.

Files changed (118) hide show
  1. experimaestro/__init__.py +14 -4
  2. experimaestro/__main__.py +3 -423
  3. experimaestro/annotations.py +14 -4
  4. experimaestro/cli/__init__.py +311 -0
  5. experimaestro/{filter.py → cli/filter.py} +23 -9
  6. experimaestro/cli/jobs.py +268 -0
  7. experimaestro/cli/progress.py +269 -0
  8. experimaestro/click.py +0 -35
  9. experimaestro/commandline.py +3 -7
  10. experimaestro/connectors/__init__.py +29 -14
  11. experimaestro/connectors/local.py +19 -10
  12. experimaestro/connectors/ssh.py +27 -8
  13. experimaestro/core/arguments.py +45 -3
  14. experimaestro/core/callbacks.py +52 -0
  15. experimaestro/core/context.py +8 -9
  16. experimaestro/core/identifier.py +310 -0
  17. experimaestro/core/objects/__init__.py +44 -0
  18. experimaestro/core/{objects.py → objects/config.py} +399 -772
  19. experimaestro/core/objects/config_utils.py +58 -0
  20. experimaestro/core/objects/config_walk.py +151 -0
  21. experimaestro/core/objects.pyi +15 -45
  22. experimaestro/core/serialization.py +63 -9
  23. experimaestro/core/serializers.py +1 -8
  24. experimaestro/core/types.py +104 -66
  25. experimaestro/experiments/cli.py +154 -72
  26. experimaestro/experiments/configuration.py +10 -1
  27. experimaestro/generators.py +6 -1
  28. experimaestro/ipc.py +4 -1
  29. experimaestro/launcherfinder/__init__.py +1 -1
  30. experimaestro/launcherfinder/base.py +2 -18
  31. experimaestro/launcherfinder/parser.py +8 -3
  32. experimaestro/launcherfinder/registry.py +52 -140
  33. experimaestro/launcherfinder/specs.py +49 -10
  34. experimaestro/launchers/direct.py +0 -47
  35. experimaestro/launchers/slurm/base.py +54 -14
  36. experimaestro/mkdocs/__init__.py +1 -1
  37. experimaestro/mkdocs/base.py +6 -8
  38. experimaestro/notifications.py +38 -12
  39. experimaestro/progress.py +406 -0
  40. experimaestro/run.py +24 -3
  41. experimaestro/scheduler/__init__.py +18 -1
  42. experimaestro/scheduler/base.py +108 -808
  43. experimaestro/scheduler/dynamic_outputs.py +184 -0
  44. experimaestro/scheduler/experiment.py +387 -0
  45. experimaestro/scheduler/jobs.py +475 -0
  46. experimaestro/scheduler/signal_handler.py +32 -0
  47. experimaestro/scheduler/state.py +75 -0
  48. experimaestro/scheduler/workspace.py +27 -8
  49. experimaestro/scriptbuilder.py +18 -3
  50. experimaestro/server/__init__.py +36 -5
  51. experimaestro/server/data/1815e00441357e01619e.ttf +0 -0
  52. experimaestro/server/data/2463b90d9a316e4e5294.woff2 +0 -0
  53. experimaestro/server/data/2582b0e4bcf85eceead0.ttf +0 -0
  54. experimaestro/server/data/89999bdf5d835c012025.woff2 +0 -0
  55. experimaestro/server/data/914997e1bdfc990d0897.ttf +0 -0
  56. experimaestro/server/data/c210719e60948b211a12.woff2 +0 -0
  57. experimaestro/server/data/index.css +5187 -5068
  58. experimaestro/server/data/index.css.map +1 -1
  59. experimaestro/server/data/index.js +68887 -68064
  60. experimaestro/server/data/index.js.map +1 -1
  61. experimaestro/settings.py +45 -5
  62. experimaestro/sphinx/__init__.py +7 -17
  63. experimaestro/taskglobals.py +7 -2
  64. experimaestro/tests/core/__init__.py +0 -0
  65. experimaestro/tests/core/test_generics.py +206 -0
  66. experimaestro/tests/definitions_types.py +5 -3
  67. experimaestro/tests/launchers/bin/sbatch +34 -7
  68. experimaestro/tests/launchers/bin/srun +5 -0
  69. experimaestro/tests/launchers/common.py +17 -5
  70. experimaestro/tests/launchers/config_slurm/launchers.py +25 -0
  71. experimaestro/tests/restart.py +10 -5
  72. experimaestro/tests/tasks/all.py +23 -10
  73. experimaestro/tests/tasks/foreign.py +2 -4
  74. experimaestro/tests/test_checkers.py +2 -2
  75. experimaestro/tests/test_dependencies.py +11 -17
  76. experimaestro/tests/test_experiment.py +73 -0
  77. experimaestro/tests/test_file_progress.py +425 -0
  78. experimaestro/tests/test_file_progress_integration.py +477 -0
  79. experimaestro/tests/test_findlauncher.py +12 -5
  80. experimaestro/tests/test_forward.py +5 -5
  81. experimaestro/tests/test_generators.py +93 -0
  82. experimaestro/tests/test_identifier.py +182 -158
  83. experimaestro/tests/test_instance.py +19 -27
  84. experimaestro/tests/test_objects.py +13 -20
  85. experimaestro/tests/test_outputs.py +6 -6
  86. experimaestro/tests/test_param.py +68 -30
  87. experimaestro/tests/test_progress.py +4 -4
  88. experimaestro/tests/test_serializers.py +24 -64
  89. experimaestro/tests/test_ssh.py +7 -0
  90. experimaestro/tests/test_tags.py +50 -21
  91. experimaestro/tests/test_tasks.py +42 -51
  92. experimaestro/tests/test_tokens.py +11 -8
  93. experimaestro/tests/test_types.py +24 -21
  94. experimaestro/tests/test_validation.py +67 -110
  95. experimaestro/tests/token_reschedule.py +1 -1
  96. experimaestro/tokens.py +24 -13
  97. experimaestro/tools/diff.py +8 -1
  98. experimaestro/typingutils.py +20 -11
  99. experimaestro/utils/asyncio.py +6 -2
  100. experimaestro/utils/multiprocessing.py +44 -0
  101. experimaestro/utils/resources.py +11 -3
  102. {experimaestro-1.5.1.dist-info → experimaestro-2.0.0a8.dist-info}/METADATA +28 -36
  103. experimaestro-2.0.0a8.dist-info/RECORD +166 -0
  104. {experimaestro-1.5.1.dist-info → experimaestro-2.0.0a8.dist-info}/WHEEL +1 -1
  105. {experimaestro-1.5.1.dist-info → experimaestro-2.0.0a8.dist-info}/entry_points.txt +0 -4
  106. experimaestro/launchers/slurm/cli.py +0 -29
  107. experimaestro/launchers/slurm/configuration.py +0 -597
  108. experimaestro/scheduler/environment.py +0 -94
  109. experimaestro/server/data/016b4a6cdced82ab3aa1.ttf +0 -0
  110. experimaestro/server/data/50701fbb8177c2dde530.ttf +0 -0
  111. experimaestro/server/data/878f31251d960bd6266f.woff2 +0 -0
  112. experimaestro/server/data/b041b1fa4fe241b23445.woff2 +0 -0
  113. experimaestro/server/data/b6879d41b0852f01ed5b.woff2 +0 -0
  114. experimaestro/server/data/d75e3fd1eb12e9bd6655.ttf +0 -0
  115. experimaestro/tests/launchers/config_slurm/launchers.yaml +0 -134
  116. experimaestro/utils/yaml.py +0 -202
  117. experimaestro-1.5.1.dist-info/RECORD +0 -148
  118. {experimaestro-1.5.1.dist-info → experimaestro-2.0.0a8.dist-info/licenses}/LICENSE +0 -0
@@ -1,597 +0,0 @@
1
- import codecs
2
- from collections import defaultdict
3
- from copy import deepcopy
4
- import io
5
- import math
6
- from attr import Factory
7
- from attrs import define
8
- import logging
9
- from experimaestro import Annotated
10
- from typing import (
11
- Dict,
12
- List,
13
- Optional,
14
- Set,
15
- TextIO,
16
- )
17
- import re
18
- import humanfriendly
19
- from dataclasses import dataclass, field
20
- from experimaestro.launcherfinder import YAMLDataClass, HostRequirement
21
- from experimaestro.launcherfinder.base import LauncherConfiguration
22
- from experimaestro.launcherfinder.registry import (
23
- Initialize,
24
- LauncherRegistry,
25
- )
26
- from experimaestro.launcherfinder.specs import (
27
- CPUSpecification,
28
- CudaSpecification,
29
- HostSpecification,
30
- )
31
- from experimaestro.compat import cached_property
32
- from . import Launcher
33
- from experimaestro.connectors import (
34
- Redirect,
35
- )
36
-
37
- logger = logging.getLogger("xpm.slurm")
38
-
39
-
40
def fill_nodes_configuration(input: TextIO, configuration: "SlurmConfiguration"):
    """Parse the output of ``scontrol show nodes`` into ``configuration.partitions``.

    Each node record starts with a ``NodeName=`` line, followed by lines
    holding ``AvailableFeatures=`` and ``Partitions=``. For every partition a
    node belongs to, the node is added to the group of nodes of that partition
    sharing exactly the same set of features; missing partitions and groups are
    created on the fly.

    :param input: text stream with the ``scontrol`` output
    :param configuration: configuration whose ``partitions`` mapping is updated
        in place
    """
    re_nodename = re.compile(r"""^NodeName=([_\-\w]+)""")
    re_features = re.compile(r"""^\s*AvailableFeatures=([,_\-\w]+)""")
    re_partitions = re.compile(r"""^\s*Partitions=([,\-_\w]+)""")

    partitions = configuration.partitions

    # partition name -> "&"-joined sorted features -> SlurmNodes group
    groups = defaultdict(dict)

    def register(nodename, features, partition_names):
        """Record one fully parsed node in each of its partitions"""
        key = "&".join(sorted(features))
        for partition_name in partition_names:
            partition = partitions.setdefault(partition_name, SlurmPartition(nodes=[]))

            nodes = groups[partition_name].get(key)
            if nodes is None:
                # Each group gets its own copy of the feature list
                nodes = SlurmNodes(hosts=[nodename], features=list(features))
                groups[partition_name][key] = nodes
                partition.nodes.append(nodes)
            elif nodename not in nodes.hosts:
                nodes.hosts.append(nodename)

    nodename = ""
    features = []
    partition_names = []

    for line in input:
        if match := re_nodename.search(line):
            if nodename:
                register(nodename, features, partition_names)

            # Start a new record: features/partitions must not leak from the
            # previous node when the corresponding lines are absent or do not
            # match (e.g. "AvailableFeatures=(null)")
            nodename = match.group(1)
            features = []
            partition_names = []
        elif match := re_features.search(line):
            features = match.group(1).split(",")
        elif match := re_partitions.search(line):
            partition_names = match.group(1).split(",")

    # Flush the last record
    if nodename:
        register(nodename, features, partition_names)
78
-
79
-
80
def fill_partitions_configuration(input: TextIO, configuration: "SlurmConfiguration"):
    """Parse the output of ``scontrol show --oneliner partition``.

    Every line describing a partition (``PartitionName=...``) creates the
    corresponding entry in ``configuration.partitions`` if it is missing, and
    the ``DefMemPerCPU`` / ``DefCpuPerGPU`` values found on the same line are
    recorded on the partition object.

    :param input: text stream with the ``scontrol`` output
    :param configuration: configuration whose ``partitions`` mapping is updated
        in place
    """
    # Same character set as the one used for partition names when parsing
    # nodes, so that both parsers agree on names such as "gpu-p1"
    re_partitionname = re.compile(r"""^PartitionName=([_\-\w]+)""")
    re_mem_per_cpu = re.compile(r"""(?:=|\s)DefMemPerCPU=(\d+)(?:\D|$)""")
    re_cpu_per_gpu = re.compile(r"""(?:=|\s)DefCpuPerGPU=(\d+)(?:\D|$)""")

    for line in input:
        match = re_partitionname.search(line)
        if match is None:
            # Not a partition line (e.g. blank): nothing to attach values to
            continue

        cfg = configuration.partitions.setdefault(
            match.group(1), SlurmPartition(nodes=[])
        )

        # NOTE(review): these attributes are set directly on the partition
        # object; confirm that consumers read them from there and that the
        # unit conversion (x1024) matches what they expect.
        if m := re_mem_per_cpu.search(line):
            cfg.mem_per_cpu = int(m.group(1)) * 1024

        if m := re_cpu_per_gpu.search(line):
            cfg.cpu_per_gpu = int(m.group(1))
96
-
97
-
98
- # ---- SLURM launcher finder
99
-
100
-
101
def parse_size(s: Optional[str]):
    """Parse a human readable size, mapping empty/missing input to ``None``.

    :param s: size description, or a falsy value (``None``, ``""``)
    :return: the result of ``humanfriendly.parse_size`` or ``None``
    """
    if not s:
        return None
    return humanfriendly.parse_size(s)
103
-
104
-
105
@dataclass
class GPUConfig(YAMLDataClass):
    """Represents a GPU"""

    model: Optional[str] = None
    count: int = 0
    memory: Annotated[int, Initialize(parse_size)] = 0

    min_memory: Annotated[int, Initialize(parse_size)] = 0
    """Minimum memory to be allocated on this node"""

    min_mem_ratio: Optional[float] = 0.0
    """Minimum memory ratio"""

    def update(self, other: "GPUConfig"):
        """Override the fields of this configuration with those set in ``other``.

        A field of ``other`` is considered set when it is truthy; unset fields
        leave the current value untouched.
        """
        for name in ("model", "count", "memory", "min_memory", "min_mem_ratio"):
            value = getattr(other, name)
            if value:
                setattr(self, name, value)

    def to_spec(self):
        """Return one ``CudaSpecification`` per GPU (``count`` of them).

        The minimum memory is the largest of ``min_memory`` and
        ``memory * min_mem_ratio``; missing (``None``) values count as 0.
        """
        ratio = self.min_mem_ratio or 0.0
        min_memory = max(int((self.memory or 0) * ratio), self.min_memory or 0)
        return [
            CudaSpecification(self.memory, self.model, min_memory)
            for _ in range(self.count)
        ]
139
-
140
-
141
@dataclass
class CPUConfig(YAMLDataClass):
    """CPU related settings of a group of nodes"""

    cpu_per_gpu: int = 0
    """Number of CPU per GPU"""

    mem_per_cpu: Annotated[int, Initialize(humanfriendly.parse_size)] = 0
    """Memory per CPU"""

    cores: int = 0

    memory: Annotated[int, Initialize(parse_size)] = 0

    def update(self, other: "CPUConfig"):
        """Copy into this configuration every field of ``other`` that is set (truthy)"""
        for name in ("cpu_per_gpu", "mem_per_cpu", "memory", "cores"):
            value = getattr(other, name)
            if value:
                setattr(self, name, value)

    def to_spec(self):
        """Build the ``CPUSpecification`` matching this configuration"""
        settings = {
            "memory": self.memory,
            "cores": self.cores,
            "mem_per_cpu": self.mem_per_cpu,
            "cpu_per_gpu": self.cpu_per_gpu,
        }
        return CPUSpecification(**settings)
170
-
171
-
172
@define
class SlurmHostSpecification(HostSpecification):
    """Host specification extended with Slurm-specific selection data"""

    # Slurm features shared by the hosts
    features: List[str] = Factory(list)

    # Host names
    hosts: List[str] = Factory(list)

    # Name of the partition (empty string by default)
    partition: str = Factory(str)

    qos_id: Optional[str] = Factory(lambda: None)
    """Quality of Service identifier (if any)"""

    account_id: Optional[str] = Factory(lambda: None)
    """Account used for this host (if any)"""
183
-
184
-
185
@dataclass
class SlurmNodeConfiguration(YAMLDataClass):
    """Resources (duration, GPU, CPU) attached to a set of Slurm nodes"""

    max_duration: Annotated[int, Initialize(humanfriendly.parse_timespan)] = 0
    """Maximum duration of a job"""

    gpu: GPUConfig = field(default_factory=GPUConfig)
    """GPU Configuration"""

    cpu: CPUConfig = field(default_factory=CPUConfig)
    """CPU Configuration"""

    def update(self, other: "SlurmNodeConfiguration"):
        """Override this configuration with the values set in ``other``"""
        if other.max_duration:
            self.max_duration = other.max_duration

        # GPU first, then CPU: each part merges itself
        for part in ("gpu", "cpu"):
            override = getattr(other, part)
            if override:
                getattr(self, part).update(override)

    def to_host_spec(self):
        """Convert this configuration into a ``SlurmHostSpecification``"""
        cpu_config = self.cpu or CPUConfig()
        cpu_spec = cpu_config.to_spec()

        gpu_config = self.gpu or GPUConfig()
        cuda_spec = gpu_config.to_spec()

        spec = SlurmHostSpecification(cpu=cpu_spec, cuda=cuda_spec)
        spec.max_duration = self.max_duration or 0
        return spec
213
-
214
-
215
@dataclass
class SlurmNodes(YAMLDataClass):
    """A group of nodes sharing the same features and configuration"""

    features: List[str] = field(default_factory=list)
    """Features of the nodes"""

    hosts: List[str] = field(default_factory=list)
    """Host names of the nodes"""

    configuration: Optional[SlurmNodeConfiguration] = None
    """Configuration of the nodes (optional)"""

    count: int = 0
    """Number of hosts (if list of hosts is empty)"""
228
-
229
-
230
@dataclass
class SlurmPartition(YAMLDataClass):
    """Description of a Slurm partition"""

    accounts: List[str] = field(default_factory=list)
    """Accounts for this partition with the associated priority modifier"""

    qos: List[str] = field(default_factory=list)
    """QoS for this partition with the associated priority modifier"""

    nodes: List[SlurmNodes] = field(default_factory=list)
    """Groups of nodes of the partition"""

    configuration: Optional[SlurmNodeConfiguration] = None
    """Configuration of the partition"""

    priority: int = 0
    """Priority for choosing this partition (higher preferred)"""

    disabled: bool = False
    """Set to true to disable the partition"""
251
-
252
-
253
class FeatureConjunction(List[str]):
    """Sorted list of features that can be stored in a set"""

    def __init__(self, features: List[str]):
        ordered = list(features)
        ordered.sort()
        super().__init__(ordered)

    def __hash__(self) -> int:
        # Sum of the hashes of the individual features
        total = 0
        for tag in self:
            total += hash(tag)
        return total
259
-
260
-
261
@dataclass
class SlurmFeature(YAMLDataClass):
    """Configuration attached to a Slurm feature"""

    # Node configuration implied by the feature (optional)
    configuration: Optional[SlurmNodeConfiguration] = None
266
-
267
-
268
@dataclass
class SlurmQOS(YAMLDataClass):
    """Settings of a Slurm Quality of Service"""

    max_duration: Annotated[int, Initialize(humanfriendly.parse_timespan)] = 0
    """Maximum duration of a job"""

    min_gpu: int = 0
    """Minimum number of GPUs"""

    priority: int = 0
    """Priority modifier applied when this QoS is used"""
278
-
279
-
280
class NodesSpecComputer:
    """Accumulates the configuration of a group of nodes of a partition.

    The computer starts from a copy of the global node configuration, is
    refined by successive calls to :meth:`update` / :meth:`update_with_qos`,
    and finally produces a host specification with :meth:`get_host`.
    """

    def __init__(self, config: "SlurmConfiguration", partition: "SlurmPartition"):
        """
        :param config: the main Slurm configuration (gives access to the global
            node configuration and to the QoS definitions)
        :param partition: the partition the nodes belong to
        """
        self.partition = partition
        self.main_config = config

        # Work on a private copy; fall back to an empty configuration when no
        # global one was given
        base = config.configuration
        self.config = (
            deepcopy(base) if base is not None else SlurmNodeConfiguration()
        )

        # The duration starts from the partition limit (unset when the
        # partition has no configuration)
        partition_config = self.partition.configuration
        self.config.max_duration = (
            partition_config.max_duration if partition_config else 0
        )

        self.priority = partition.priority
        self.qos_id = None
        self.min_gpu = 0

    def update(self, config: "SlurmNodeConfiguration"):
        """Merge ``config`` into the current configuration (no-op for ``None``)"""
        if config is None:
            return
        self.config = deepcopy(self.config)
        self.config.update(config)

    def update_with_qos(self, qos_id: str):
        """Select a QoS and apply its priority, GPU and duration settings.

        The QoS identifier is always recorded; the other settings are only
        applied when the identifier is defined in the main configuration.
        """
        self.qos_id = qos_id
        qos = self.main_config.qos.get(qos_id, None)
        if not qos:
            return

        self.priority += qos.priority
        self.min_gpu = qos.min_gpu
        if qos.max_duration > 0:
            self.config.max_duration = qos.max_duration

    def get_host(self) -> "SlurmHostSpecification":
        """Return the host specification for the accumulated configuration"""
        host = self.config.to_host_spec()
        host.priority = self.priority
        host.qos_id = self.qos_id
        host.min_gpu = self.min_gpu
        return host
315
-
316
-
317
class FeatureBooleanFormula:
    """Disjunction of conjunctions of Slurm features"""

    clauses: Set[List[str]]

    def __init__(self):
        self.clauses = set()

    def add(self, features: List[str]):
        """Adds conjunction of tags"""
        self.clauses.add(FeatureConjunction(features))

    def to_constraint(self):
        """Returns a constraint for sbatch/srun

        :return: a string such as ``(a&b)|(c)``, or ``None`` when no constraint
            applies (no clause, or one of the clauses is empty and thus
            satisfied by any node)
        """
        conjunctions = ["&".join(clause) for clause in self.clauses]

        # An empty clause would be rendered as "()", which is not a valid
        # constraint: it means "no feature required"
        if not conjunctions or "" in conjunctions:
            return None

        # Sort so that the output does not depend on set iteration order
        return "(" + ")|(".join(sorted(conjunctions)) + ")"
332
-
333
-
334
class MatchingSpec:
    """Accumulates the Slurm options compatible with a set of matching hosts"""

    def __init__(self):
        self.fbf = FeatureBooleanFormula()
        self.hosts: Set[str] = set()
        self.partitions: Set[str] = set()
        self.qos: Optional[str] = None
        self.account: Optional[str] = None
        self.mem_per_cpu: int = 0

    def update(self, host_spec: "SlurmHostSpecification"):
        """Add a host specification if it is compatible with the current state.

        A host is ignored when its QoS or account differs from the one already
        selected, or when both sides define a different memory per CPU. All
        the checks are performed before anything is modified, so that an
        ignored host leaves the specification untouched.
        """
        if self.qos is not None and host_spec.qos_id != self.qos:
            # Cannot update with other QoS
            return

        if self.account is not None and host_spec.account_id != self.account:
            # Cannot update with other account
            return

        mem_per_cpu = host_spec.cpu.mem_per_cpu
        if (
            mem_per_cpu > 0
            and self.mem_per_cpu > 0
            and mem_per_cpu != self.mem_per_cpu
        ):
            # Cannot update with different mem per cpu
            return

        # The host is accepted: record its settings
        self.qos = host_spec.qos_id
        self.account = host_spec.account_id
        if mem_per_cpu:
            self.mem_per_cpu = mem_per_cpu

        self.partitions.add(host_spec.partition)
        self.fbf.add(host_spec.features)
        if host_spec.hosts:
            self.hosts.update(host_spec.hosts)
370
-
371
-
372
@dataclass
class SlurmConfiguration(YAMLDataClass, LauncherConfiguration):
    """Description of a Slurm cluster, used to find a matching launcher"""

    id: str
    """Slurm ID"""

    partitions: Dict[str, SlurmPartition]
    """List of partitions"""

    connector: str = "local"
    """Name of the connector"""

    path: str = "/usr/bin"
    """Path for SLURM commands"""

    use_features: bool = True
    """Whether features should be used"""

    use_hosts: bool = True
    """Whether hosts should be used in the query"""

    use_memory_contraint: bool = True
    """Whether memory constraint can be specified"""

    query_slurm: bool = False
    """True to query SLURM directly (using scontrol)"""

    tags: List[str] = field(default_factory=list)

    weight: int = 0

    qos: Dict[str, SlurmQOS] = field(default_factory=lambda: {})

    features_regex: Annotated[
        List[re.Pattern],
        Initialize(lambda regexps: [re.compile(regex) for regex in regexps]),
    ] = field(default_factory=list)
    """
    Regex to get the information from features
    - CUDA: cuda:count, cuda:memory
    """

    features: Dict[str, SlurmFeature] = field(default_factory=lambda: {})
    """List of features with associated configurations"""

    configuration: Optional[SlurmNodeConfiguration] = None
    """Partition configuration"""

    def _query_scontrol(self, connector, arguments, parser):
        """Start ``scontrol --hide <arguments>`` and feed its output to ``parser``.

        ``parser`` is called with a UTF-8 text reader over the process output
        and this configuration.
        """
        pb = connector.processbuilder()
        pb.command = ["scontrol", "--hide", *arguments]

        def handle_output(input: io.BytesIO):
            StreamReader = codecs.getreader("utf-8")
            parser(StreamReader(input), self)

        pb.stdout = Redirect.pipe(handle_output)
        pb.start()

    def compute(self, registry: "LauncherRegistry"):
        """Query Slurm (once) for node and partition information if requested"""
        if not self.query_slurm:
            return

        # Only query once
        self.query_slurm = False
        connector = registry.getConnector(self.connector)

        # Read node information, then partition information
        self._query_scontrol(connector, ["show", "nodes"], fill_nodes_configuration)
        self._query_scontrol(
            connector,
            ["show", "--oneliner", "partition"],
            fill_partitions_configuration,
        )

    @cached_property
    def computed_nodes(self) -> List[SlurmHostSpecification]:
        """Computes the list of potential compute nodes (grouped by similar nodes)

        One host specification is produced for each combination of node group,
        QoS and account of every enabled partition. The result is sorted by
        decreasing priority.
        """
        hosts = []

        for partition_name, partition in self.partitions.items():
            if partition.disabled:
                continue

            for node in partition.nodes:
                nodes_spec = NodesSpecComputer(self, partition)

                # From the most general to the most specific configuration;
                # each of them is optional
                for configuration in (
                    self.configuration,
                    partition.configuration,
                    node.configuration,
                ):
                    if configuration:
                        nodes_spec.update(configuration)

                for feature in node.features:
                    # Use feature data directly
                    data = self.features.get(feature, None)
                    if data and data.configuration:
                        nodes_spec.update(data.configuration)

                    # Extract information from the feature name itself
                    for regex in self.features_regex:
                        if m := regex.match(feature):
                            d = m.groupdict()
                            if _count := d.get("cuda_count", None):
                                nodes_spec.config.gpu.count = int(_count)
                            if memory := d.get("cuda_memory", None):
                                nodes_spec.config.gpu.memory = humanfriendly.parse_size(
                                    memory
                                )

                # None stands for "no QoS" / "no account"
                qos_list = partition.qos or [None]
                accounts = partition.accounts or [None]
                for qos in qos_list:
                    qos_nodes_spec = deepcopy(nodes_spec)
                    qos_nodes_spec.update_with_qos(qos)

                    host = qos_nodes_spec.get_host()
                    host.features = node.features
                    host.partition = partition_name
                    host.hosts = node.hosts

                    for account in accounts:
                        account_host = deepcopy(host)
                        account_host.account_id = account
                        hosts.append(account_host)
                        logger.debug("Computed slurm host: %s", account_host)

        hosts.sort(key=lambda host: -host.priority)
        return hosts

    def get(
        self, registry: "LauncherRegistry", requirement: HostRequirement
    ) -> Optional["Launcher"]:
        """Return a Slurm launcher satisfying ``requirement`` (``None`` if none).

        The computed nodes are matched against the requirement; the nodes
        matching the best alternative are merged into a single set of Slurm
        options (partitions, QoS, account, features or hosts).
        """
        # Compute the configuration if needed
        self.compute(registry)

        # Current set of constraints
        current_match = None
        matching_spec = MatchingSpec()

        for node in self.computed_nodes:
            match = requirement.match(node)
            if not match:
                continue
            logger.debug("Match %s for %s", match, node)

            # If score is below the current one, goes to the next one
            if current_match and (
                match.score <= current_match.score
                and match.requirement is not current_match.requirement
            ):
                continue

            # If the requirement has changed, clear everything
            if not current_match or (
                match.requirement is not current_match.requirement
            ):
                logger.debug("Clearing %s / %s", current_match, match)
                matching_spec = MatchingSpec()
                current_match = match

            logger.debug(
                "Adding %s, %s, %s", node.partition, node.features, node.hosts
            )
            matching_spec.update(node)

        # Returns the appropriate launcher (if any)
        use_features = matching_spec.fbf.clauses and self.use_features
        if not (use_features or matching_spec.hosts):
            return None

        assert current_match is not None
        matched = current_match.requirement

        # Launching using tags
        from .base import SlurmLauncher

        launcher = SlurmLauncher(
            connector=registry.getConnector(self.connector), binpath=self.path
        )
        options = launcher.options

        options.partition = ",".join(matching_spec.partitions)
        options.gpus_per_node = len(matched.cuda_gpus) if matched.cuda_gpus else None
        options.qos = matching_spec.qos
        options.account = matching_spec.account

        if matched.cpu.cores > 0:
            options.cpus_per_task = matched.cpu.cores

        if matched.cpu.memory > 0:
            if self.use_memory_contraint:
                options.mem = f"{matched.cpu.memory // (1024*1024)}M"
            else:
                # Memory is requested indirectly, through the number of CPUs
                assert (
                    matching_spec.mem_per_cpu > 0
                ), "Memory per CPU should be specified"
                cpus_per_task = math.ceil(
                    matched.cpu.memory / matching_spec.mem_per_cpu
                )
                # The option may not have been set above (no core requested)
                options.cpus_per_task = max(options.cpus_per_task or 0, cpus_per_task)

        if use_features:
            options.constraint = matching_spec.fbf.to_constraint()
        else:
            logger.warning("Selecting first host")
            options.nodelist = next(iter(matching_spec.hosts))

        if matched.duration > 0:
            total_seconds = matched.duration
            seconds = total_seconds % 60
            minutes = (total_seconds // 60) % 60
            hours = total_seconds // 3600
            options.time = f"{hours}:{minutes}:{seconds}"

        logger.debug("Slurm options: %s", " ".join(options.args()))
        return launcher
@@ -1,94 +0,0 @@
1
- """Defines an experimental environment"""
2
-
3
- from pathlib import Path
4
- from typing import Dict
5
- import marshmallow as mm
6
- from experimaestro.utils.settings import JsonSettings
7
- from pytools import memoize
8
-
9
-
10
def schema(schema_cls):
    """Class decorator linking a schema class and the class it deserializes.

    After decoration, ``schema_cls.OBJECT_CLS`` is the decorated class and the
    decorated class has ``SCHEMA`` set to ``schema_cls``.
    """

    def link(object_cls):
        object_cls.SCHEMA = schema_cls
        schema_cls.OBJECT_CLS = object_cls
        return object_cls

    return link
17
-
18
-
19
class _Schema(mm.Schema):
    """Base schema building an instance of its associated ``OBJECT_CLS``"""

    @mm.post_load
    def make_settings(self, data, **kwargs):
        """Create the object and copy every loaded value as an attribute"""
        instance = type(self).OBJECT_CLS()
        for name in data:
            setattr(instance, name, data[name])
        return instance
26
-
27
-
28
class EnvironmentSchema(_Schema):
    """Schema of an experimental environment"""

    hostname = mm.fields.Str()
    """The hostname (can be empty for localhost)"""

    pythonpath = mm.fields.Str()
    """Path to python executable"""

    # Working directory
    workdir = mm.fields.Str()

    # Environment variables (name -> value)
    environ = mm.fields.Dict(keys=mm.fields.Str(), values=mm.fields.Str())
36
-
37
-
38
class Schema(_Schema):
    """Schema of the user settings"""

    # Environments indexed by name
    environments = mm.fields.Dict(
        keys=mm.fields.Str(),
        values=mm.fields.Nested(EnvironmentSchema),
    )
42
-
43
-
44
@schema(Schema)
class Settings(JsonSettings):
    """User settings"""

    def __init__(self):
        # Environments indexed by name: the schema deserializes each value
        # with EnvironmentSchema, i.e. into an Environment object
        self.environments: Dict[str, "Environment"] = {}
50
-
51
-
52
@schema(EnvironmentSchema)
class Environment:
    """Environment of an experiment (host, working directory, variables)

    Environments can be stored in, and retrieved from, the user settings.
    """

    def __init__(self, workdir=None):
        self._workdir = workdir
        self.hostname = None
        self.pythonpath = None
        self.environ = {}

    @property
    def basepath(self):
        """Root path: an SSH path when a hostname is set, a local path otherwise"""
        if not self.hostname:
            return Path()

        from ..connectors.ssh import SshPath

        return SshPath(f"ssh://{self.hostname}")

    @property
    def workdir(self) -> Path:
        """Working directory, resolved against the base path"""
        assert self._workdir, "The working directory has not been set"
        root = self.basepath
        return root / self._workdir

    @workdir.setter
    def workdir(self, value):
        self._workdir = value

    def setenv(self, key: str, value: str):
        """Define the environment variable ``key``"""
        self.environ[key] = value

    @staticmethod
    @memoize()
    def _load():
        """Load the settings from the user configuration directory"""
        home = Path("~").expanduser()
        return Settings.load(
            home.joinpath(".config", "experimaestro", "environments.json")
        )

    @staticmethod
    def get(name: str):
        """Return the environment registered under ``name``"""
        environments = Environment._load().environments
        return environments[name]