matensemble 0.3.7__tar.gz → 0.3.9__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {matensemble-0.3.7 → matensemble-0.3.9}/PKG-INFO +1 -1
- {matensemble-0.3.7 → matensemble-0.3.9}/pyproject.toml +1 -1
- {matensemble-0.3.7 → matensemble-0.3.9}/src/matensemble/chore.py +9 -0
- {matensemble-0.3.7 → matensemble-0.3.9}/src/matensemble/dynopro/ensemble.py +3 -0
- {matensemble-0.3.7 → matensemble-0.3.9}/src/matensemble/fluxlet.py +4 -7
- {matensemble-0.3.7 → matensemble-0.3.9}/src/matensemble/manager.py +31 -9
- {matensemble-0.3.7 → matensemble-0.3.9}/src/matensemble/strategy.py +12 -2
- {matensemble-0.3.7 → matensemble-0.3.9}/LICENSE +0 -0
- {matensemble-0.3.7 → matensemble-0.3.9}/README.md +0 -0
- {matensemble-0.3.7 → matensemble-0.3.9}/src/matensemble/.python-version +0 -0
- {matensemble-0.3.7 → matensemble-0.3.9}/src/matensemble/README.md +0 -0
- {matensemble-0.3.7 → matensemble-0.3.9}/src/matensemble/TODO.md +0 -0
- {matensemble-0.3.7 → matensemble-0.3.9}/src/matensemble/__init__.py +0 -0
- {matensemble-0.3.7 → matensemble-0.3.9}/src/matensemble/dash/assets/index-1X2cLUgt.js +0 -0
- {matensemble-0.3.7 → matensemble-0.3.9}/src/matensemble/dash/assets/index-DRkGfWlx.css +0 -0
- {matensemble-0.3.7 → matensemble-0.3.9}/src/matensemble/dash/index.html +0 -0
- {matensemble-0.3.7 → matensemble-0.3.9}/src/matensemble/dash/vite.svg +0 -0
- {matensemble-0.3.7 → matensemble-0.3.9}/src/matensemble/dynopro/__init__.py +0 -0
- {matensemble-0.3.7 → matensemble-0.3.9}/src/matensemble/dynopro/driver.py +0 -0
- {matensemble-0.3.7 → matensemble-0.3.9}/src/matensemble/dynopro/postprocessors/__init__.py +0 -0
- {matensemble-0.3.7 → matensemble-0.3.9}/src/matensemble/dynopro/postprocessors/bispectrum_calculator.py +0 -0
- {matensemble-0.3.7 → matensemble-0.3.9}/src/matensemble/dynopro/postprocessors/compute_diffraction.py +0 -0
- {matensemble-0.3.7 → matensemble-0.3.9}/src/matensemble/dynopro/postprocessors/compute_order_from_pairs.py +0 -0
- {matensemble-0.3.7 → matensemble-0.3.9}/src/matensemble/dynopro/postprocessors/compute_twist.py +0 -0
- {matensemble-0.3.7 → matensemble-0.3.9}/src/matensemble/dynopro/postprocessors/correlations.py +0 -0
- {matensemble-0.3.7 → matensemble-0.3.9}/src/matensemble/dynopro/postprocessors/ovito_calculators.py +0 -0
- {matensemble-0.3.7 → matensemble-0.3.9}/src/matensemble/dynopro/task_lib/AnalysisSubprocess.py +0 -0
- {matensemble-0.3.7 → matensemble-0.3.9}/src/matensemble/dynopro/task_lib/AnalysysDescriptor.py +0 -0
- {matensemble-0.3.7 → matensemble-0.3.9}/src/matensemble/dynopro/task_lib/MDSubprocess.py +0 -0
- {matensemble-0.3.7 → matensemble-0.3.9}/src/matensemble/dynopro/task_lib/__init__.py +0 -0
- {matensemble-0.3.7 → matensemble-0.3.9}/src/matensemble/dynopro/task_lib/analysis_registry.py +0 -0
- {matensemble-0.3.7 → matensemble-0.3.9}/src/matensemble/dynopro/utils/__init__.py +0 -0
- {matensemble-0.3.7 → matensemble-0.3.9}/src/matensemble/dynopro/utils/lammps_init.py +0 -0
- {matensemble-0.3.7 → matensemble-0.3.9}/src/matensemble/dynopro/utils/preprocessors.py +0 -0
- {matensemble-0.3.7 → matensemble-0.3.9}/src/matensemble/dynopro/utils/stat.py +0 -0
- {matensemble-0.3.7 → matensemble-0.3.9}/src/matensemble/dynopro/utils/stress_rotate_z_theta.py +0 -0
- {matensemble-0.3.7 → matensemble-0.3.9}/src/matensemble/logger.py +0 -0
- {matensemble-0.3.7 → matensemble-0.3.9}/src/matensemble/model.py +0 -0
- {matensemble-0.3.7 → matensemble-0.3.9}/src/matensemble/pipeline.py +0 -0
- {matensemble-0.3.7 → matensemble-0.3.9}/src/matensemble/redis/__init__.py +0 -0
- {matensemble-0.3.7 → matensemble-0.3.9}/src/matensemble/redis/service.py +0 -0
- {matensemble-0.3.7 → matensemble-0.3.9}/src/matensemble/redis/test.py +0 -0
- {matensemble-0.3.7 → matensemble-0.3.9}/src/matensemble/runtime_worker.py +0 -0
- {matensemble-0.3.7 → matensemble-0.3.9}/src/matensemble/utils.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: matensemble
|
|
3
|
-
Version: 0.3.7
|
|
3
|
+
Version: 0.3.9
|
|
4
4
|
Summary: An adaptive and highly asynchronous ensemble simulation workflow manager MatEnsemble (https://github.com/Q-CAD/MatEnsemble) built jointly on top of the hierarchical graph based scheduler FLUX and concurrent-futures infrastructure of python
|
|
5
5
|
Author: Soumendu Bagchi, Kaleb Duchesneau
|
|
6
6
|
Author-email: Soumendu Bagchi <soumendubagchi@gmail.com>, Kaleb Duchesneau <kalebduchesneau@gmail.com>
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "matensemble"
|
|
3
|
-
version = "0.3.7"
|
|
3
|
+
version = "0.3.9"
|
|
4
4
|
description = "An adaptive and highly asynchronous ensemble simulation workflow manager MatEnsemble (https://github.com/Q-CAD/MatEnsemble) built jointly on top of the hierarchical graph based scheduler FLUX and concurrent-futures infrastructure of python"
|
|
5
5
|
readme = "README.md"
|
|
6
6
|
license-files = ["LICENSE"]
|
|
@@ -53,6 +53,7 @@ class Chore:
|
|
|
53
53
|
deps: tuple[str, ...] = (),
|
|
54
54
|
args: tuple = (),
|
|
55
55
|
kwargs: dict | None = None,
|
|
56
|
+
nnodes: int | None = None,
|
|
56
57
|
) -> None:
|
|
57
58
|
"""
|
|
58
59
|
The constructor for a :obj:`Chore`
|
|
@@ -84,6 +85,12 @@ class Chore:
|
|
|
84
85
|
The arguments to give the function if type is PYTHON
|
|
85
86
|
kwargs : dict
|
|
86
87
|
The key-word arguments to give the function if flaovr is PYTHON
|
|
88
|
+
nnodes : int, optional
|
|
89
|
+
When set, this chore will be scheduled via ``per_resource`` and will
|
|
90
|
+
occupy *nnodes* whole nodes (all cores and all GPUs on each node).
|
|
91
|
+
The manager uses this to compute the true resource footprint instead
|
|
92
|
+
of ``num_tasks * cores_per_task`` / ``num_tasks * gpus_per_task``.
|
|
93
|
+
Leave as ``None`` for normal ``from_command`` chores.
|
|
87
94
|
"""
|
|
88
95
|
|
|
89
96
|
self.id = id
|
|
@@ -100,6 +107,7 @@ class Chore:
|
|
|
100
107
|
self.deps = deps
|
|
101
108
|
self.args = args
|
|
102
109
|
self.kwargs = {} if kwargs is None else kwargs
|
|
110
|
+
self.nnodes = nnodes
|
|
103
111
|
|
|
104
112
|
def graph(self) -> nx.DiGraph:
|
|
105
113
|
return nx.DiGraph()
|
|
@@ -121,6 +129,7 @@ class Chore:
|
|
|
121
129
|
"deps": list(self.deps),
|
|
122
130
|
"args": _json_safe(self.args),
|
|
123
131
|
"kwargs": _json_safe(self.kwargs),
|
|
132
|
+
"nnodes": self.nnodes,
|
|
124
133
|
}
|
|
125
134
|
|
|
126
135
|
def _write_metadata(self) -> None:
|
|
@@ -7,6 +7,7 @@ from networkx import write_network_text
|
|
|
7
7
|
from matensemble.manager import FluxManager
|
|
8
8
|
from matensemble.chore import Chore, ChoreType
|
|
9
9
|
from matensemble.model import Resources
|
|
10
|
+
|
|
10
11
|
# from matensemble.dynopro.driver import online_dynamics
|
|
11
12
|
|
|
12
13
|
|
|
@@ -73,6 +74,7 @@ class EnsembleDynamicsRunner:
|
|
|
73
74
|
num_tasks=self.tasks_per_job,
|
|
74
75
|
cores_per_task=self.cores_per_task,
|
|
75
76
|
gpus_per_task=self.gpus_per_task,
|
|
77
|
+
mpi=True,
|
|
76
78
|
)
|
|
77
79
|
workdir = outdir / chore_id
|
|
78
80
|
|
|
@@ -85,6 +87,7 @@ class EnsembleDynamicsRunner:
|
|
|
85
87
|
chore_type=ChoreType.EXECUTABLE,
|
|
86
88
|
resources=resources,
|
|
87
89
|
workdir=workdir,
|
|
90
|
+
nnodes=self.nnodes,
|
|
88
91
|
)
|
|
89
92
|
)
|
|
90
93
|
|
|
@@ -26,7 +26,7 @@ class Fluxlet:
|
|
|
26
26
|
handle: flux.Flux,
|
|
27
27
|
) -> None:
|
|
28
28
|
self.handle = handle
|
|
29
|
-
self.gpus_per_node = self.get_gpus_per_node()
|
|
29
|
+
self.num_nodes, self.gpus_per_node = self.get_gpus_per_node()
|
|
30
30
|
|
|
31
31
|
def get_gpus_per_node(self) -> tuple[int, int]:
|
|
32
32
|
"""
|
|
@@ -50,7 +50,6 @@ class Fluxlet:
|
|
|
50
50
|
chore: Chore,
|
|
51
51
|
set_cpu_affinity: bool | None = None,
|
|
52
52
|
set_gpu_affinity: bool | None = None,
|
|
53
|
-
nnodes: int | None = None,
|
|
54
53
|
dynopro: bool | None = None,
|
|
55
54
|
) -> flux.job.FluxExecutorFuture:
|
|
56
55
|
"""
|
|
@@ -81,11 +80,11 @@ class Fluxlet:
|
|
|
81
80
|
if dynopro:
|
|
82
81
|
jobspec = flux.job.JobspecV1.per_resource(
|
|
83
82
|
chore.command,
|
|
84
|
-
|
|
85
|
-
nnodes=nnodes,
|
|
83
|
+
nnodes=chore.nnodes,
|
|
86
84
|
gpus_per_node=self.gpus_per_node,
|
|
87
|
-
per_resource_type="
|
|
85
|
+
per_resource_type="node",
|
|
88
86
|
per_resource_count=1,
|
|
87
|
+
exclusive=True, # dynopro needs whole nodes
|
|
89
88
|
)
|
|
90
89
|
|
|
91
90
|
chore.workdir.mkdir(parents=True, exist_ok=True)
|
|
@@ -106,8 +105,6 @@ class Fluxlet:
|
|
|
106
105
|
base_env["SLURM_GPUS_PER_NODE"] = str(self.gpus_per_node)
|
|
107
106
|
jobspec.environment = base_env
|
|
108
107
|
|
|
109
|
-
fut = executor.submit(jobspec)
|
|
110
|
-
|
|
111
108
|
fut = executor.submit(jobspec)
|
|
112
109
|
fut.chore_id = chore.id
|
|
113
110
|
fut.chore_obj = chore
|
|
@@ -272,6 +272,26 @@ class FluxManager:
|
|
|
272
272
|
gpus_per_node = total_gpus // nnodes
|
|
273
273
|
return nnodes, cores_per_node, gpus_per_node
|
|
274
274
|
|
|
275
|
+
def _chore_resource_footprint(self, chore: Chore) -> tuple[int, int]:
|
|
276
|
+
"""
|
|
277
|
+
Return ``(needed_cores, needed_gpus)`` for a chore.
|
|
278
|
+
|
|
279
|
+
For whole-node (dynopro) chores — those with ``chore.nnodes`` set — the
|
|
280
|
+
footprint is ``nnodes * cores_per_node`` and ``nnodes * gpus_per_node``
|
|
281
|
+
because ``per_resource`` allocates entire nodes and every core and GPU on
|
|
282
|
+
them becomes unavailable. For ordinary chores the footprint is the
|
|
283
|
+
familiar ``num_tasks * cores_per_task`` / ``num_tasks * gpus_per_task``.
|
|
284
|
+
"""
|
|
285
|
+
|
|
286
|
+
if getattr(chore, "nnodes", None) is not None:
|
|
287
|
+
needed_cores = chore.nnodes * self._cores_per_node
|
|
288
|
+
needed_gpus = chore.nnodes * self._gpus_per_node
|
|
289
|
+
else:
|
|
290
|
+
needed_cores = chore.resources.num_tasks * chore.resources.cores_per_task
|
|
291
|
+
needed_gpus = chore.resources.num_tasks * chore.resources.gpus_per_task
|
|
292
|
+
|
|
293
|
+
return needed_cores, needed_gpus
|
|
294
|
+
|
|
275
295
|
def _chore_fits_allocation(self, chore: Chore) -> bool:
|
|
276
296
|
"""
|
|
277
297
|
Checks whether the given chore is too big to be submitted
|
|
@@ -282,8 +302,7 @@ class FluxManager:
|
|
|
282
302
|
The :obj:`Chore` to check if it will fit in the allocation
|
|
283
303
|
"""
|
|
284
304
|
|
|
285
|
-
needed_cores = chore.resources.num_tasks * chore.resources.cores_per_task
|
|
286
|
-
needed_gpus = chore.resources.num_tasks * chore.resources.gpus_per_task
|
|
305
|
+
needed_cores, needed_gpus = self._chore_resource_footprint(chore)
|
|
287
306
|
|
|
288
307
|
total_cores = self._nnodes_on_allocation * self._cores_per_node
|
|
289
308
|
total_gpus = self._nnodes_on_allocation * self._gpus_per_node
|
|
@@ -334,8 +353,7 @@ class FluxManager:
|
|
|
334
353
|
Checks to see if there are enough resources to submit the given :obj:`Chore`
|
|
335
354
|
"""
|
|
336
355
|
|
|
337
|
-
needed_cores = chore.resources.num_tasks * chore.resources.cores_per_task
|
|
338
|
-
needed_gpus = chore.resources.num_tasks * chore.resources.gpus_per_task
|
|
356
|
+
needed_cores, needed_gpus = self._chore_resource_footprint(chore)
|
|
339
357
|
return self._free_cores >= needed_cores and self._free_gpus >= needed_gpus
|
|
340
358
|
|
|
341
359
|
def _has_failed(self, chore_id: str) -> bool:
|
|
@@ -399,7 +417,9 @@ class FluxManager:
|
|
|
399
417
|
|
|
400
418
|
self._fail_dependents(dep_id)
|
|
401
419
|
|
|
402
|
-
def _submit_one(
|
|
420
|
+
def _submit_one(
|
|
421
|
+
self, chore_id: str, buffer_time: float, dynopro: bool = False
|
|
422
|
+
) -> None:
|
|
403
423
|
"""
|
|
404
424
|
Submits a :obj:`Chore` and does book-keeping all the queues and resources
|
|
405
425
|
count
|
|
@@ -418,7 +438,7 @@ class FluxManager:
|
|
|
418
438
|
chore,
|
|
419
439
|
set_cpu_affinity=self._set_cpu_affinity,
|
|
420
440
|
set_gpu_affinity=self._set_gpu_affinity,
|
|
421
|
-
|
|
441
|
+
dynopro=dynopro,
|
|
422
442
|
)
|
|
423
443
|
except Exception as e:
|
|
424
444
|
self._logger.exception("CHORE SUBMIT FAILED: chore=%s", chore_id)
|
|
@@ -441,7 +461,9 @@ class FluxManager:
|
|
|
441
461
|
self._free_gpus -= chore.resources.num_tasks * chore.resources.gpus_per_task
|
|
442
462
|
time.sleep(buffer_time)
|
|
443
463
|
|
|
444
|
-
def _submit_until_ooresources(
|
|
464
|
+
def _submit_until_ooresources(
|
|
465
|
+
self, buffer_time: float, dynopro: bool = False
|
|
466
|
+
) -> bool:
|
|
445
467
|
"""
|
|
446
468
|
Submit as many chores as possible until out-of-resources
|
|
447
469
|
|
|
@@ -459,7 +481,7 @@ class FluxManager:
|
|
|
459
481
|
chore = self._chores_by_id[chore_id]
|
|
460
482
|
|
|
461
483
|
if self._can_submit_now(chore):
|
|
462
|
-
self._submit_one(chore_id, buffer_time)
|
|
484
|
+
self._submit_one(chore_id, buffer_time, dynopro=dynopro)
|
|
463
485
|
submitted_any = True
|
|
464
486
|
else:
|
|
465
487
|
deferred.append(chore_id)
|
|
@@ -636,7 +658,7 @@ class FluxManager:
|
|
|
636
658
|
)
|
|
637
659
|
while not done:
|
|
638
660
|
self._check_resources()
|
|
639
|
-
self._submit_until_ooresources(buffer_time=buffer_time)
|
|
661
|
+
self._submit_until_ooresources(buffer_time=buffer_time, dynopro=dynopro)
|
|
640
662
|
proc_strat.process_futures(buffer_time=buffer_time)
|
|
641
663
|
|
|
642
664
|
done = (
|
|
@@ -11,6 +11,7 @@ from pathlib import Path
|
|
|
11
11
|
|
|
12
12
|
from abc import ABC, abstractmethod
|
|
13
13
|
|
|
14
|
+
from matensemble import dynopro
|
|
14
15
|
from matensemble.model import OutputReference
|
|
15
16
|
|
|
16
17
|
|
|
@@ -127,7 +128,10 @@ class AdaptiveStrategy(FutureProcessingStrategy):
|
|
|
127
128
|
self.manager._blocked.discard(dep_id)
|
|
128
129
|
|
|
129
130
|
# adaptively submit another chore
|
|
130
|
-
self.manager._submit_until_ooresources(
|
|
131
|
+
self.manager._submit_until_ooresources(
|
|
132
|
+
buffer_time=buffer_time,
|
|
133
|
+
dynopro=getattr(self.manager, "_dynopro", False),
|
|
134
|
+
)
|
|
131
135
|
|
|
132
136
|
if self.manager._write_restart_freq and (
|
|
133
137
|
len(self.manager._completed_chores) % self.manager._write_restart_freq
|
|
@@ -304,7 +308,7 @@ class UserStrategy(FutureProcessingStrategy):
|
|
|
304
308
|
self.manager._blocked.discard(dep_id)
|
|
305
309
|
|
|
306
310
|
# --- Processing the chore and spawning the new one ---
|
|
307
|
-
if self.proc_chore
|
|
311
|
+
if chore_name == self.proc_chore:
|
|
308
312
|
try:
|
|
309
313
|
# Trust boundary: result.pickle is written by matensemble.runtime_worker
|
|
310
314
|
# in this workflow's chore workdir only—do not load pickles from
|
|
@@ -339,6 +343,12 @@ class UserStrategy(FutureProcessingStrategy):
|
|
|
339
343
|
f"bolo_match={chore_name} | due the following Exception ->\n{e}"
|
|
340
344
|
)
|
|
341
345
|
|
|
346
|
+
# adaptively submit another chore
|
|
347
|
+
self.manager._submit_until_ooresources(
|
|
348
|
+
buffer_time=buffer_time,
|
|
349
|
+
dynopro=getattr(self.manager, "_dynopro", False),
|
|
350
|
+
)
|
|
351
|
+
|
|
342
352
|
if self.manager._write_restart_freq and (
|
|
343
353
|
len(self.manager._completed_chores) % self.manager._write_restart_freq
|
|
344
354
|
== 0
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{matensemble-0.3.7 → matensemble-0.3.9}/src/matensemble/dynopro/postprocessors/compute_twist.py
RENAMED
|
File without changes
|
{matensemble-0.3.7 → matensemble-0.3.9}/src/matensemble/dynopro/postprocessors/correlations.py
RENAMED
|
File without changes
|
{matensemble-0.3.7 → matensemble-0.3.9}/src/matensemble/dynopro/postprocessors/ovito_calculators.py
RENAMED
|
File without changes
|
{matensemble-0.3.7 → matensemble-0.3.9}/src/matensemble/dynopro/task_lib/AnalysisSubprocess.py
RENAMED
|
File without changes
|
{matensemble-0.3.7 → matensemble-0.3.9}/src/matensemble/dynopro/task_lib/AnalysysDescriptor.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{matensemble-0.3.7 → matensemble-0.3.9}/src/matensemble/dynopro/task_lib/analysis_registry.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{matensemble-0.3.7 → matensemble-0.3.9}/src/matensemble/dynopro/utils/stress_rotate_z_theta.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|