matensemble 0.3.7__tar.gz → 0.3.8__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {matensemble-0.3.7 → matensemble-0.3.8}/PKG-INFO +1 -1
- {matensemble-0.3.7 → matensemble-0.3.8}/pyproject.toml +1 -1
- {matensemble-0.3.7 → matensemble-0.3.8}/src/matensemble/chore.py +9 -0
- {matensemble-0.3.7 → matensemble-0.3.8}/src/matensemble/fluxlet.py +3 -5
- {matensemble-0.3.7 → matensemble-0.3.8}/src/matensemble/manager.py +31 -8
- {matensemble-0.3.7 → matensemble-0.3.8}/src/matensemble/strategy.py +11 -1
- {matensemble-0.3.7 → matensemble-0.3.8}/LICENSE +0 -0
- {matensemble-0.3.7 → matensemble-0.3.8}/README.md +0 -0
- {matensemble-0.3.7 → matensemble-0.3.8}/src/matensemble/.python-version +0 -0
- {matensemble-0.3.7 → matensemble-0.3.8}/src/matensemble/README.md +0 -0
- {matensemble-0.3.7 → matensemble-0.3.8}/src/matensemble/TODO.md +0 -0
- {matensemble-0.3.7 → matensemble-0.3.8}/src/matensemble/__init__.py +0 -0
- {matensemble-0.3.7 → matensemble-0.3.8}/src/matensemble/dash/assets/index-1X2cLUgt.js +0 -0
- {matensemble-0.3.7 → matensemble-0.3.8}/src/matensemble/dash/assets/index-DRkGfWlx.css +0 -0
- {matensemble-0.3.7 → matensemble-0.3.8}/src/matensemble/dash/index.html +0 -0
- {matensemble-0.3.7 → matensemble-0.3.8}/src/matensemble/dash/vite.svg +0 -0
- {matensemble-0.3.7 → matensemble-0.3.8}/src/matensemble/dynopro/__init__.py +0 -0
- {matensemble-0.3.7 → matensemble-0.3.8}/src/matensemble/dynopro/driver.py +0 -0
- {matensemble-0.3.7 → matensemble-0.3.8}/src/matensemble/dynopro/ensemble.py +0 -0
- {matensemble-0.3.7 → matensemble-0.3.8}/src/matensemble/dynopro/postprocessors/__init__.py +0 -0
- {matensemble-0.3.7 → matensemble-0.3.8}/src/matensemble/dynopro/postprocessors/bispectrum_calculator.py +0 -0
- {matensemble-0.3.7 → matensemble-0.3.8}/src/matensemble/dynopro/postprocessors/compute_diffraction.py +0 -0
- {matensemble-0.3.7 → matensemble-0.3.8}/src/matensemble/dynopro/postprocessors/compute_order_from_pairs.py +0 -0
- {matensemble-0.3.7 → matensemble-0.3.8}/src/matensemble/dynopro/postprocessors/compute_twist.py +0 -0
- {matensemble-0.3.7 → matensemble-0.3.8}/src/matensemble/dynopro/postprocessors/correlations.py +0 -0
- {matensemble-0.3.7 → matensemble-0.3.8}/src/matensemble/dynopro/postprocessors/ovito_calculators.py +0 -0
- {matensemble-0.3.7 → matensemble-0.3.8}/src/matensemble/dynopro/task_lib/AnalysisSubprocess.py +0 -0
- {matensemble-0.3.7 → matensemble-0.3.8}/src/matensemble/dynopro/task_lib/AnalysysDescriptor.py +0 -0
- {matensemble-0.3.7 → matensemble-0.3.8}/src/matensemble/dynopro/task_lib/MDSubprocess.py +0 -0
- {matensemble-0.3.7 → matensemble-0.3.8}/src/matensemble/dynopro/task_lib/__init__.py +0 -0
- {matensemble-0.3.7 → matensemble-0.3.8}/src/matensemble/dynopro/task_lib/analysis_registry.py +0 -0
- {matensemble-0.3.7 → matensemble-0.3.8}/src/matensemble/dynopro/utils/__init__.py +0 -0
- {matensemble-0.3.7 → matensemble-0.3.8}/src/matensemble/dynopro/utils/lammps_init.py +0 -0
- {matensemble-0.3.7 → matensemble-0.3.8}/src/matensemble/dynopro/utils/preprocessors.py +0 -0
- {matensemble-0.3.7 → matensemble-0.3.8}/src/matensemble/dynopro/utils/stat.py +0 -0
- {matensemble-0.3.7 → matensemble-0.3.8}/src/matensemble/dynopro/utils/stress_rotate_z_theta.py +0 -0
- {matensemble-0.3.7 → matensemble-0.3.8}/src/matensemble/logger.py +0 -0
- {matensemble-0.3.7 → matensemble-0.3.8}/src/matensemble/model.py +0 -0
- {matensemble-0.3.7 → matensemble-0.3.8}/src/matensemble/pipeline.py +0 -0
- {matensemble-0.3.7 → matensemble-0.3.8}/src/matensemble/redis/__init__.py +0 -0
- {matensemble-0.3.7 → matensemble-0.3.8}/src/matensemble/redis/service.py +0 -0
- {matensemble-0.3.7 → matensemble-0.3.8}/src/matensemble/redis/test.py +0 -0
- {matensemble-0.3.7 → matensemble-0.3.8}/src/matensemble/runtime_worker.py +0 -0
- {matensemble-0.3.7 → matensemble-0.3.8}/src/matensemble/utils.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: matensemble
|
|
3
|
-
Version: 0.3.7
|
|
3
|
+
Version: 0.3.8
|
|
4
4
|
Summary: An adaptive and highly asynchronous ensemble simulation workflow manager MatEnsemble (https://github.com/Q-CAD/MatEnsemble) built jointly on top of the hierarchical graph based scheduler FLUX and concurrent-futures infrastructure of python
|
|
5
5
|
Author: Soumendu Bagchi, Kaleb Duchesneau
|
|
6
6
|
Author-email: Soumendu Bagchi <soumendubagchi@gmail.com>, Kaleb Duchesneau <kalebduchesneau@gmail.com>
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "matensemble"
|
|
3
|
-
version = "0.3.7"
|
|
3
|
+
version = "0.3.8"
|
|
4
4
|
description = "An adaptive and highly asynchronous ensemble simulation workflow manager MatEnsemble (https://github.com/Q-CAD/MatEnsemble) built jointly on top of the hierarchical graph based scheduler FLUX and concurrent-futures infrastructure of python"
|
|
5
5
|
readme = "README.md"
|
|
6
6
|
license-files = ["LICENSE"]
|
|
@@ -53,6 +53,7 @@ class Chore:
|
|
|
53
53
|
deps: tuple[str, ...] = (),
|
|
54
54
|
args: tuple = (),
|
|
55
55
|
kwargs: dict | None = None,
|
|
56
|
+
nnodes: int | None = None,
|
|
56
57
|
) -> None:
|
|
57
58
|
"""
|
|
58
59
|
The constructor for a :obj:`Chore`
|
|
@@ -84,6 +85,12 @@ class Chore:
|
|
|
84
85
|
The arguments to give the function if type is PYTHON
|
|
85
86
|
kwargs : dict
|
|
86
87
|
The key-word arguments to give the function if flaovr is PYTHON
|
|
88
|
+
nnodes : int, optional
|
|
89
|
+
When set, this chore will be scheduled via ``per_resource`` and will
|
|
90
|
+
occupy *nnodes* whole nodes (all cores and all GPUs on each node).
|
|
91
|
+
The manager uses this to compute the true resource footprint instead
|
|
92
|
+
of ``num_tasks * cores_per_task`` / ``num_tasks * gpus_per_task``.
|
|
93
|
+
Leave as ``None`` for normal ``from_command`` chores.
|
|
87
94
|
"""
|
|
88
95
|
|
|
89
96
|
self.id = id
|
|
@@ -100,6 +107,7 @@ class Chore:
|
|
|
100
107
|
self.deps = deps
|
|
101
108
|
self.args = args
|
|
102
109
|
self.kwargs = {} if kwargs is None else kwargs
|
|
110
|
+
self.nnodes = nnodes
|
|
103
111
|
|
|
104
112
|
def graph(self) -> nx.DiGraph:
|
|
105
113
|
return nx.DiGraph()
|
|
@@ -121,6 +129,7 @@ class Chore:
|
|
|
121
129
|
"deps": list(self.deps),
|
|
122
130
|
"args": _json_safe(self.args),
|
|
123
131
|
"kwargs": _json_safe(self.kwargs),
|
|
132
|
+
"nnodes": self.nnodes,
|
|
124
133
|
}
|
|
125
134
|
|
|
126
135
|
def _write_metadata(self) -> None:
|
|
@@ -26,7 +26,7 @@ class Fluxlet:
|
|
|
26
26
|
handle: flux.Flux,
|
|
27
27
|
) -> None:
|
|
28
28
|
self.handle = handle
|
|
29
|
-
self.gpus_per_node = self.get_gpus_per_node()
|
|
29
|
+
self.num_nodes, self.gpus_per_node = self.get_gpus_per_node()
|
|
30
30
|
|
|
31
31
|
def get_gpus_per_node(self) -> tuple[int, int]:
|
|
32
32
|
"""
|
|
@@ -81,11 +81,11 @@ class Fluxlet:
|
|
|
81
81
|
if dynopro:
|
|
82
82
|
jobspec = flux.job.JobspecV1.per_resource(
|
|
83
83
|
chore.command,
|
|
84
|
-
ncores=chore.resources.num_tasks,
|
|
85
84
|
nnodes=nnodes,
|
|
86
85
|
gpus_per_node=self.gpus_per_node,
|
|
87
|
-
per_resource_type="core",
|
|
86
|
+
per_resource_type="node",
|
|
88
87
|
per_resource_count=1,
|
|
88
|
+
exclusive=True, # dynopro needs whole nodes
|
|
89
89
|
)
|
|
90
90
|
|
|
91
91
|
chore.workdir.mkdir(parents=True, exist_ok=True)
|
|
@@ -106,8 +106,6 @@ class Fluxlet:
|
|
|
106
106
|
base_env["SLURM_GPUS_PER_NODE"] = str(self.gpus_per_node)
|
|
107
107
|
jobspec.environment = base_env
|
|
108
108
|
|
|
109
|
-
fut = executor.submit(jobspec)
|
|
110
|
-
|
|
111
109
|
fut = executor.submit(jobspec)
|
|
112
110
|
fut.chore_id = chore.id
|
|
113
111
|
fut.chore_obj = chore
|
|
@@ -272,6 +272,26 @@ class FluxManager:
|
|
|
272
272
|
gpus_per_node = total_gpus // nnodes
|
|
273
273
|
return nnodes, cores_per_node, gpus_per_node
|
|
274
274
|
|
|
275
|
+
def _chore_resource_footprint(self, chore: Chore) -> tuple[int, int]:
|
|
276
|
+
"""
|
|
277
|
+
Return ``(needed_cores, needed_gpus)`` for a chore.
|
|
278
|
+
|
|
279
|
+
For whole-node (dynopro) chores — those with ``chore.nnodes`` set — the
|
|
280
|
+
footprint is ``nnodes * cores_per_node`` and ``nnodes * gpus_per_node``
|
|
281
|
+
because ``per_resource`` allocates entire nodes and every core and GPU on
|
|
282
|
+
them becomes unavailable. For ordinary chores the footprint is the
|
|
283
|
+
familiar ``num_tasks * cores_per_task`` / ``num_tasks * gpus_per_task``.
|
|
284
|
+
"""
|
|
285
|
+
|
|
286
|
+
if getattr(chore, "nnodes", None) is not None:
|
|
287
|
+
needed_cores = chore.nnodes * self._cores_per_node
|
|
288
|
+
needed_gpus = chore.nnodes * self._gpus_per_node
|
|
289
|
+
else:
|
|
290
|
+
needed_cores = chore.resources.num_tasks * chore.resources.cores_per_task
|
|
291
|
+
needed_gpus = chore.resources.num_tasks * chore.resources.gpus_per_task
|
|
292
|
+
|
|
293
|
+
return needed_cores, needed_gpus
|
|
294
|
+
|
|
275
295
|
def _chore_fits_allocation(self, chore: Chore) -> bool:
|
|
276
296
|
"""
|
|
277
297
|
Checks whether the given chore is too big to be submitted
|
|
@@ -282,8 +302,7 @@ class FluxManager:
|
|
|
282
302
|
The :obj:`Chore` to check if it will fit in the allocation
|
|
283
303
|
"""
|
|
284
304
|
|
|
285
|
-
needed_cores = chore.resources.num_tasks * chore.resources.cores_per_task
|
|
286
|
-
needed_gpus = chore.resources.num_tasks * chore.resources.gpus_per_task
|
|
305
|
+
needed_cores, needed_gpus = self._chore_resource_footprint(chore)
|
|
287
306
|
|
|
288
307
|
total_cores = self._nnodes_on_allocation * self._cores_per_node
|
|
289
308
|
total_gpus = self._nnodes_on_allocation * self._gpus_per_node
|
|
@@ -334,8 +353,7 @@ class FluxManager:
|
|
|
334
353
|
Checks to see if there are enough resources to submit the given :obj:`Chore`
|
|
335
354
|
"""
|
|
336
355
|
|
|
337
|
-
needed_cores = chore.resources.num_tasks * chore.resources.cores_per_task
|
|
338
|
-
needed_gpus = chore.resources.num_tasks * chore.resources.gpus_per_task
|
|
356
|
+
needed_cores, needed_gpus = self._chore_resource_footprint(chore)
|
|
339
357
|
return self._free_cores >= needed_cores and self._free_gpus >= needed_gpus
|
|
340
358
|
|
|
341
359
|
def _has_failed(self, chore_id: str) -> bool:
|
|
@@ -399,7 +417,9 @@ class FluxManager:
|
|
|
399
417
|
|
|
400
418
|
self._fail_dependents(dep_id)
|
|
401
419
|
|
|
402
|
-
def _submit_one(self, chore_id: str, buffer_time: float) -> None:
|
|
420
|
+
def _submit_one(
|
|
421
|
+
self, chore_id: str, buffer_time: float, dynopro: bool = False
|
|
422
|
+
) -> None:
|
|
403
423
|
"""
|
|
404
424
|
Submits a :obj:`Chore` and does book-keeping all the queues and resources
|
|
405
425
|
count
|
|
@@ -419,6 +439,7 @@ class FluxManager:
|
|
|
419
439
|
set_cpu_affinity=self._set_cpu_affinity,
|
|
420
440
|
set_gpu_affinity=self._set_gpu_affinity,
|
|
421
441
|
nnodes=None,
|
|
442
|
+
dynopro=dynopro,
|
|
422
443
|
)
|
|
423
444
|
except Exception as e:
|
|
424
445
|
self._logger.exception("CHORE SUBMIT FAILED: chore=%s", chore_id)
|
|
@@ -441,7 +462,9 @@ class FluxManager:
|
|
|
441
462
|
self._free_gpus -= chore.resources.num_tasks * chore.resources.gpus_per_task
|
|
442
463
|
time.sleep(buffer_time)
|
|
443
464
|
|
|
444
|
-
def _submit_until_ooresources(self, buffer_time: float) -> bool:
|
|
465
|
+
def _submit_until_ooresources(
|
|
466
|
+
self, buffer_time: float, dynopro: bool = False
|
|
467
|
+
) -> bool:
|
|
445
468
|
"""
|
|
446
469
|
Submit as many chores as possible until out-of-resources
|
|
447
470
|
|
|
@@ -459,7 +482,7 @@ class FluxManager:
|
|
|
459
482
|
chore = self._chores_by_id[chore_id]
|
|
460
483
|
|
|
461
484
|
if self._can_submit_now(chore):
|
|
462
|
-
self._submit_one(chore_id, buffer_time)
|
|
485
|
+
self._submit_one(chore_id, buffer_time, dynopro=dynopro)
|
|
463
486
|
submitted_any = True
|
|
464
487
|
else:
|
|
465
488
|
deferred.append(chore_id)
|
|
@@ -636,7 +659,7 @@ class FluxManager:
|
|
|
636
659
|
)
|
|
637
660
|
while not done:
|
|
638
661
|
self._check_resources()
|
|
639
|
-
self._submit_until_ooresources(buffer_time=buffer_time)
|
|
662
|
+
self._submit_until_ooresources(buffer_time=buffer_time, dynopro=dynopro)
|
|
640
663
|
proc_strat.process_futures(buffer_time=buffer_time)
|
|
641
664
|
|
|
642
665
|
done = (
|
|
@@ -11,6 +11,7 @@ from pathlib import Path
|
|
|
11
11
|
|
|
12
12
|
from abc import ABC, abstractmethod
|
|
13
13
|
|
|
14
|
+
from matensemble import dynopro
|
|
14
15
|
from matensemble.model import OutputReference
|
|
15
16
|
|
|
16
17
|
|
|
@@ -127,7 +128,10 @@ class AdaptiveStrategy(FutureProcessingStrategy):
|
|
|
127
128
|
self.manager._blocked.discard(dep_id)
|
|
128
129
|
|
|
129
130
|
# adaptively submit another chore
|
|
130
|
-
self.manager._submit_until_ooresources(buffer_time=buffer_time)
|
|
131
|
+
self.manager._submit_until_ooresources(
|
|
132
|
+
buffer_time=buffer_time,
|
|
133
|
+
dynopro=getattr(self.manager, "_dynopro", False),
|
|
134
|
+
)
|
|
131
135
|
|
|
132
136
|
if self.manager._write_restart_freq and (
|
|
133
137
|
len(self.manager._completed_chores) % self.manager._write_restart_freq
|
|
@@ -339,6 +343,12 @@ class UserStrategy(FutureProcessingStrategy):
|
|
|
339
343
|
f"bolo_match={chore_name} | due the following Exception ->\n{e}"
|
|
340
344
|
)
|
|
341
345
|
|
|
346
|
+
# adaptively submit another chore
|
|
347
|
+
self.manager._submit_until_ooresources(
|
|
348
|
+
buffer_time=buffer_time,
|
|
349
|
+
dynopro=getattr(self.manager, "_dynopro", False),
|
|
350
|
+
)
|
|
351
|
+
|
|
342
352
|
if self.manager._write_restart_freq and (
|
|
343
353
|
len(self.manager._completed_chores) % self.manager._write_restart_freq
|
|
344
354
|
== 0
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{matensemble-0.3.7 → matensemble-0.3.8}/src/matensemble/dynopro/postprocessors/compute_twist.py
RENAMED
|
File without changes
|
{matensemble-0.3.7 → matensemble-0.3.8}/src/matensemble/dynopro/postprocessors/correlations.py
RENAMED
|
File without changes
|
{matensemble-0.3.7 → matensemble-0.3.8}/src/matensemble/dynopro/postprocessors/ovito_calculators.py
RENAMED
|
File without changes
|
{matensemble-0.3.7 → matensemble-0.3.8}/src/matensemble/dynopro/task_lib/AnalysisSubprocess.py
RENAMED
|
File without changes
|
{matensemble-0.3.7 → matensemble-0.3.8}/src/matensemble/dynopro/task_lib/AnalysysDescriptor.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{matensemble-0.3.7 → matensemble-0.3.8}/src/matensemble/dynopro/task_lib/analysis_registry.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{matensemble-0.3.7 → matensemble-0.3.8}/src/matensemble/dynopro/utils/stress_rotate_z_theta.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|