matensemble 0.2.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- matensemble-0.2.1/LICENSE +29 -0
- matensemble-0.2.1/PKG-INFO +124 -0
- matensemble-0.2.1/README.md +101 -0
- matensemble-0.2.1/pyproject.toml +48 -0
- matensemble-0.2.1/src/matensemble/.python-version +1 -0
- matensemble-0.2.1/src/matensemble/README.md +0 -0
- matensemble-0.2.1/src/matensemble/TODO.md +208 -0
- matensemble-0.2.1/src/matensemble/__init__.py +15 -0
- matensemble-0.2.1/src/matensemble/chore.py +139 -0
- matensemble-0.2.1/src/matensemble/dash/assets/index-1X2cLUgt.js +50 -0
- matensemble-0.2.1/src/matensemble/dash/assets/index-DRkGfWlx.css +1 -0
- matensemble-0.2.1/src/matensemble/dash/index.html +14 -0
- matensemble-0.2.1/src/matensemble/dash/vite.svg +1 -0
- matensemble-0.2.1/src/matensemble/dynopro/__init__.py +0 -0
- matensemble-0.2.1/src/matensemble/dynopro/driver.py +84 -0
- matensemble-0.2.1/src/matensemble/dynopro/ensemble.py +119 -0
- matensemble-0.2.1/src/matensemble/dynopro/postprocessors/__init__.py +0 -0
- matensemble-0.2.1/src/matensemble/dynopro/postprocessors/bispectrum_calculator.py +42 -0
- matensemble-0.2.1/src/matensemble/dynopro/postprocessors/compute_diffraction.py +55 -0
- matensemble-0.2.1/src/matensemble/dynopro/postprocessors/compute_order_from_pairs.py +23 -0
- matensemble-0.2.1/src/matensemble/dynopro/postprocessors/compute_twist.py +240 -0
- matensemble-0.2.1/src/matensemble/dynopro/postprocessors/correlations.py +47 -0
- matensemble-0.2.1/src/matensemble/dynopro/postprocessors/ovito_calculators.py +84 -0
- matensemble-0.2.1/src/matensemble/dynopro/task_lib/AnalysisSubprocess.py +194 -0
- matensemble-0.2.1/src/matensemble/dynopro/task_lib/AnalysysDescriptor.py +37 -0
- matensemble-0.2.1/src/matensemble/dynopro/task_lib/MDSubprocess.py +170 -0
- matensemble-0.2.1/src/matensemble/dynopro/task_lib/__init__.py +0 -0
- matensemble-0.2.1/src/matensemble/dynopro/task_lib/analysis_registry.py +33 -0
- matensemble-0.2.1/src/matensemble/dynopro/utils/__init__.py +0 -0
- matensemble-0.2.1/src/matensemble/dynopro/utils/lammps_init.py +52 -0
- matensemble-0.2.1/src/matensemble/dynopro/utils/preprocessors.py +70 -0
- matensemble-0.2.1/src/matensemble/dynopro/utils/stat.py +33 -0
- matensemble-0.2.1/src/matensemble/dynopro/utils/stress_rotate_z_theta.py +50 -0
- matensemble-0.2.1/src/matensemble/fluxlet.py +162 -0
- matensemble-0.2.1/src/matensemble/logger.py +131 -0
- matensemble-0.2.1/src/matensemble/manager.py +592 -0
- matensemble-0.2.1/src/matensemble/model.py +75 -0
- matensemble-0.2.1/src/matensemble/pipeline.py +409 -0
- matensemble-0.2.1/src/matensemble/redis/__init__.py +0 -0
- matensemble-0.2.1/src/matensemble/redis/service.py +87 -0
- matensemble-0.2.1/src/matensemble/redis/test.py +13 -0
- matensemble-0.2.1/src/matensemble/runtime_worker.py +162 -0
- matensemble-0.2.1/src/matensemble/strategy.py +224 -0
- matensemble-0.2.1/src/matensemble/utils.py +181 -0
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
BSD 3-Clause License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025, Soumendu Bagchi
|
|
4
|
+
All rights reserved.
|
|
5
|
+
|
|
6
|
+
Redistribution and use in source and binary forms, with or without
|
|
7
|
+
modification, are permitted provided that the following conditions are met:
|
|
8
|
+
|
|
9
|
+
* Redistributions of source code must retain the above copyright notice, this
|
|
10
|
+
list of conditions and the following disclaimer.
|
|
11
|
+
|
|
12
|
+
* Redistributions in binary form must reproduce the above copyright notice,
|
|
13
|
+
this list of conditions and the following disclaimer in the documentation
|
|
14
|
+
and/or other materials provided with the distribution.
|
|
15
|
+
|
|
16
|
+
* Neither the name of the copyright holder nor the names of its
|
|
17
|
+
contributors may be used to endorse or promote products derived from
|
|
18
|
+
this software without specific prior written permission.
|
|
19
|
+
|
|
20
|
+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
|
21
|
+
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
22
|
+
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
|
23
|
+
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
|
24
|
+
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
|
25
|
+
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
|
26
|
+
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
|
27
|
+
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
|
28
|
+
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
|
29
|
+
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
@@ -0,0 +1,124 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: matensemble
|
|
3
|
+
Version: 0.2.1
|
|
4
|
+
Summary: An adaptive and highly asynchronous ensemble simulation workflow manager MatEnsemble (https://github.com/Q-CAD/MatEnsemble) built jointly on top of the hierarchical graph based scheduler FLUX and concurrent-futures infrastructure of python
|
|
5
|
+
Author: Soumendu Bagchi, Kaleb Duchesneau
|
|
6
|
+
Author-email: Soumendu Bagchi <soumendubagchi@gmail.com>, Kaleb Duchesneau <kalebduchesneau@gmail.com>
|
|
7
|
+
License-File: LICENSE
|
|
8
|
+
Requires-Dist: ase>=3.22.0
|
|
9
|
+
Requires-Dist: cloudpickle>=3.1.2
|
|
10
|
+
Requires-Dist: matplotlib>=3.4.0
|
|
11
|
+
Requires-Dist: networkx>=3.6.1
|
|
12
|
+
Requires-Dist: numpy>=1.21.0
|
|
13
|
+
Requires-Dist: ovito>=3.7.0
|
|
14
|
+
Requires-Dist: pandas>=1.3.0
|
|
15
|
+
Requires-Dist: pymatgen>=2022.0.0
|
|
16
|
+
Requires-Dist: scikit-learn>=1.0.0
|
|
17
|
+
Requires-Dist: scipy>=1.7.0
|
|
18
|
+
Requires-Dist: seaborn>=0.11.0
|
|
19
|
+
Requires-Dist: flux-python==0.66.0 ; extra == 'flux'
|
|
20
|
+
Requires-Python: >=3.12
|
|
21
|
+
Provides-Extra: flux
|
|
22
|
+
Description-Content-Type: text/markdown
|
|
23
|
+
|
|
24
|
+
[](https://pypi.org/project/matensemble/)
|
|
25
|
+
[](https://matensemble.readthedocs.io/en/latest/)
|
|
26
|
+
[](https://www.python.org/downloads/)
|
|
27
|
+
[](https://opensource.org/licenses/BSD-3-Clause)
|
|
28
|
+
|
|
29
|
+
<p align="center">
|
|
30
|
+
<img src="images/Logo-Matensemble.png" alt="MatEnsemble" width="720" />
|
|
31
|
+
</p>
|
|
32
|
+
|
|
33
|
+
# MatEnsemble
|
|
34
|
+
|
|
35
|
+
MatEnsemble is a Python library for **high-throughput workflows** on HPC systems. You define a directed acyclic graph (DAG) of tasks—**Python callables** or **executable commands**—and MatEnsemble submits work through **[Flux](https://flux-framework.readthedocs.io/)**, tracks completions, **adapts** scheduling to free CPUs and GPUs, and writes structured logs and per-chore output directories.
|
|
36
|
+
|
|
37
|
+
An optional in-tree **dynopro** stack supports streaming dynamics and on-the-fly analysis for advanced materials simulation workflows.
|
|
38
|
+
|
|
39
|
+
## Features
|
|
40
|
+
|
|
41
|
+
- **DAG-based workflows** with dependencies via deferred return values (`OutputReference`)
|
|
42
|
+
- **Adaptive scheduling** that back-fills the allocation as tasks finish (with a non-adaptive mode when you need it)
|
|
43
|
+
- **Two chore types**: Python chores (remotely unpickled and executed by `matensemble.runtime_worker`) and argv-style **executable** chores
|
|
44
|
+
- **Resource requests**: tasks, cores per task, GPUs per task, optional MPI (`pmi2`) via Flux
|
|
45
|
+
- **Observability**: `status.json`, `matensemble_workflow.log`, per-chore `stdout` / `stderr`, pickle and JSON result artifacts; optional **web dashboard** (FastAPI on port 8000)
|
|
46
|
+
|
|
47
|
+
<p align="center">
|
|
48
|
+
<img src="images/Cap_1_adaptive_task_management.png" alt="Adaptive task management" width="620" />
|
|
49
|
+
</p>
|
|
50
|
+
|
|
51
|
+
<p align="center">
|
|
52
|
+
<img src="images/Cap_2_dynopro.png" alt="On-the-fly dynamics and analysis" width="620" />
|
|
53
|
+
</p>
|
|
54
|
+
|
|
55
|
+
## Documentation
|
|
56
|
+
|
|
57
|
+
Documentation (overview, architecture, tutorials, API reference):
|
|
58
|
+
|
|
59
|
+
**[matensemble.readthedocs.io](https://matensemble.readthedocs.io/en/latest/)**
|
|
60
|
+
|
|
61
|
+
## Installation
|
|
62
|
+
|
|
63
|
+
### Containers (recommended on many clusters)
|
|
64
|
+
|
|
65
|
+
OCI images are published to GitHub Container Registry, for example:
|
|
66
|
+
|
|
67
|
+
`ghcr.io/freddude2004/matensemble:baseline-vX.Y.Z`
|
|
68
|
+
|
|
69
|
+
See the [container packages](https://github.com/FredDude2004/MatEnsemble/pkgs/container/matensemble) and the [Quick start](https://matensemble.readthedocs.io/en/latest/quickstart.html) in the docs for Apptainer/Singularity and site-specific notes.
|
|
70
|
+
|
|
71
|
+
### Development install
|
|
72
|
+
|
|
73
|
+
From a clone of this repository:
|
|
74
|
+
|
|
75
|
+
```bash
|
|
76
|
+
uv sync
|
|
77
|
+
uv sync --group dev # optional: docs and pytest tooling
|
|
78
|
+
uv run pytest
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
Or with pip:
|
|
82
|
+
|
|
83
|
+
```bash
|
|
84
|
+
pip install -e ".[flux]"
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
Site-specific Conda-style environment files live under `scripts/` (for example `scripts/baseline/environment.yaml`, `scripts/frontier/`, `scripts/perlmuter/`). Align Python with **3.12+** and Flux with your center’s modules.
|
|
88
|
+
|
|
89
|
+
## Quick example
|
|
90
|
+
|
|
91
|
+
```python
|
|
92
|
+
from matensemble.pipeline import Pipeline
|
|
93
|
+
|
|
94
|
+
pipe = Pipeline()
|
|
95
|
+
pipe.exec(command=["/bin/echo", "hello from MatEnsemble"])
|
|
96
|
+
pipe.submit()
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
For Python chores, dependency graphs, and the required split between an importable **chore module** and a **runner script**, see the [Tutorials](https://matensemble.readthedocs.io/en/latest/tutorials.html).
|
|
100
|
+
|
|
101
|
+
## Examples in the repository
|
|
102
|
+
|
|
103
|
+
Illustrative workflows live under [`example_workflows/`](https://github.com/FredDude2004/MatEnsemble/tree/main/example_workflows).
|
|
104
|
+
|
|
105
|
+
## Requirements and runtime
|
|
106
|
+
|
|
107
|
+
- A **Flux allocation** (or equivalent) on the machine where you call `Pipeline.submit()`
|
|
108
|
+
- For MPI Python or executable chores: a coherent MPI/Flux setup (e.g. PMI2) as expected by your site
|
|
109
|
+
- Optional: SSH port forwarding if you enable the dashboard on a compute node (see the architecture guide in the docs)
|
|
110
|
+
|
|
111
|
+
## Related links
|
|
112
|
+
|
|
113
|
+
- [Flux documentation](https://flux-framework.readthedocs.io/)
|
|
114
|
+
- [Flux Python guide](https://flux-framework.readthedocs.io/projects/flux-core/en/latest/guide/start.html)
|
|
115
|
+
- [Slurm documentation](https://slurm.schedmd.com/documentation.html) (common front-end to batch allocations)
|
|
116
|
+
- [LAMMPS manual](https://docs.lammps.org/Manual.html) (often used alongside ensemble MD workflows)
|
|
117
|
+
|
|
118
|
+
## Authors
|
|
119
|
+
|
|
120
|
+
Soumendu Bagchi, Kaleb Duchesneau (see `pyproject.toml` for contact details).
|
|
121
|
+
|
|
122
|
+
## License
|
|
123
|
+
|
|
124
|
+
BSD 3-Clause. See [`LICENSE`](LICENSE).
|
|
@@ -0,0 +1,101 @@
|
|
|
1
|
+
[PyPI package](https://pypi.org/project/matensemble/)
|
|
2
|
+
[Documentation](https://matensemble.readthedocs.io/en/latest/)
|
|
3
|
+
[Python 3.12+](https://www.python.org/downloads/)
|
|
4
|
+
[License: BSD 3-Clause](https://opensource.org/licenses/BSD-3-Clause)
|
|
5
|
+
|
|
6
|
+
<p align="center">
|
|
7
|
+
<img src="images/Logo-Matensemble.png" alt="MatEnsemble" width="720" />
|
|
8
|
+
</p>
|
|
9
|
+
|
|
10
|
+
# MatEnsemble
|
|
11
|
+
|
|
12
|
+
MatEnsemble is a Python library for **high-throughput workflows** on HPC systems. You define a directed acyclic graph (DAG) of tasks—**Python callables** or **executable commands**—and MatEnsemble submits work through **[Flux](https://flux-framework.readthedocs.io/)**, tracks completions, **adapts** scheduling to free CPUs and GPUs, and writes structured logs and per-chore output directories.
|
|
13
|
+
|
|
14
|
+
An optional in-tree **dynopro** stack supports streaming dynamics and on-the-fly analysis for advanced materials simulation workflows.
|
|
15
|
+
|
|
16
|
+
## Features
|
|
17
|
+
|
|
18
|
+
- **DAG-based workflows** with dependencies via deferred return values (`OutputReference`)
|
|
19
|
+
- **Adaptive scheduling** that back-fills the allocation as tasks finish (with a non-adaptive mode when you need it)
|
|
20
|
+
- **Two chore types**: Python chores (remotely unpickled and executed by `matensemble.runtime_worker`) and argv-style **executable** chores
|
|
21
|
+
- **Resource requests**: tasks, cores per task, GPUs per task, optional MPI (`pmi2`) via Flux
|
|
22
|
+
- **Observability**: `status.json`, `matensemble_workflow.log`, per-chore `stdout` / `stderr`, pickle and JSON result artifacts; optional **web dashboard** (FastAPI on port 8000)
|
|
23
|
+
|
|
24
|
+
<p align="center">
|
|
25
|
+
<img src="images/Cap_1_adaptive_task_management.png" alt="Adaptive task management" width="620" />
|
|
26
|
+
</p>
|
|
27
|
+
|
|
28
|
+
<p align="center">
|
|
29
|
+
<img src="images/Cap_2_dynopro.png" alt="On-the-fly dynamics and analysis" width="620" />
|
|
30
|
+
</p>
|
|
31
|
+
|
|
32
|
+
## Documentation
|
|
33
|
+
|
|
34
|
+
Documentation (overview, architecture, tutorials, API reference):
|
|
35
|
+
|
|
36
|
+
**[matensemble.readthedocs.io](https://matensemble.readthedocs.io/en/latest/)**
|
|
37
|
+
|
|
38
|
+
## Installation
|
|
39
|
+
|
|
40
|
+
### Containers (recommended on many clusters)
|
|
41
|
+
|
|
42
|
+
OCI images are published to GitHub Container Registry, for example:
|
|
43
|
+
|
|
44
|
+
`ghcr.io/freddude2004/matensemble:baseline-vX.Y.Z`
|
|
45
|
+
|
|
46
|
+
See the [container packages](https://github.com/FredDude2004/MatEnsemble/pkgs/container/matensemble) and the [Quick start](https://matensemble.readthedocs.io/en/latest/quickstart.html) in the docs for Apptainer/Singularity and site-specific notes.
|
|
47
|
+
|
|
48
|
+
### Development install
|
|
49
|
+
|
|
50
|
+
From a clone of this repository:
|
|
51
|
+
|
|
52
|
+
```bash
|
|
53
|
+
uv sync
|
|
54
|
+
uv sync --group dev # optional: docs and pytest tooling
|
|
55
|
+
uv run pytest
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
Or with pip:
|
|
59
|
+
|
|
60
|
+
```bash
|
|
61
|
+
pip install -e ".[flux]"
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
Site-specific Conda-style environment files live under `scripts/` (for example `scripts/baseline/environment.yaml`, `scripts/frontier/`, `scripts/perlmuter/`). Align Python with **3.12+** and Flux with your center’s modules.
|
|
65
|
+
|
|
66
|
+
## Quick example
|
|
67
|
+
|
|
68
|
+
```python
|
|
69
|
+
from matensemble.pipeline import Pipeline
|
|
70
|
+
|
|
71
|
+
pipe = Pipeline()
|
|
72
|
+
pipe.exec(command=["/bin/echo", "hello from MatEnsemble"])
|
|
73
|
+
pipe.submit()
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
For Python chores, dependency graphs, and the required split between an importable **chore module** and a **runner script**, see the [Tutorials](https://matensemble.readthedocs.io/en/latest/tutorials.html).
|
|
77
|
+
|
|
78
|
+
## Examples in the repository
|
|
79
|
+
|
|
80
|
+
Illustrative workflows live under [`example_workflows/`](https://github.com/FredDude2004/MatEnsemble/tree/main/example_workflows).
|
|
81
|
+
|
|
82
|
+
## Requirements and runtime
|
|
83
|
+
|
|
84
|
+
- A **Flux allocation** (or equivalent) on the machine where you call `Pipeline.submit()`
|
|
85
|
+
- For MPI Python or executable chores: a coherent MPI/Flux setup (e.g. PMI2) as expected by your site
|
|
86
|
+
- Optional: SSH port forwarding if you enable the dashboard on a compute node (see the architecture guide in the docs)
|
|
87
|
+
|
|
88
|
+
## Related links
|
|
89
|
+
|
|
90
|
+
- [Flux documentation](https://flux-framework.readthedocs.io/)
|
|
91
|
+
- [Flux Python guide](https://flux-framework.readthedocs.io/projects/flux-core/en/latest/guide/start.html)
|
|
92
|
+
- [Slurm documentation](https://slurm.schedmd.com/documentation.html) (common front-end to batch allocations)
|
|
93
|
+
- [LAMMPS manual](https://docs.lammps.org/Manual.html) (often used alongside ensemble MD workflows)
|
|
94
|
+
|
|
95
|
+
## Authors
|
|
96
|
+
|
|
97
|
+
Soumendu Bagchi, Kaleb Duchesneau (see `pyproject.toml` for contact details).
|
|
98
|
+
|
|
99
|
+
## License
|
|
100
|
+
|
|
101
|
+
BSD 3-Clause. See [`LICENSE`](LICENSE).
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "matensemble"
|
|
3
|
+
version = "0.2.1"
|
|
4
|
+
description = "An adaptive and highly asynchronous ensemble simulation workflow manager MatEnsemble (https://github.com/Q-CAD/MatEnsemble) built jointly on top of the hierarchical graph based scheduler FLUX and concurrent-futures infrastructure of python"
|
|
5
|
+
readme = "README.md"
|
|
6
|
+
license-files = ["LICENSE"]
|
|
7
|
+
authors = [
|
|
8
|
+
{ name = "Soumendu Bagchi", email = "soumendubagchi@gmail.com" },
|
|
9
|
+
{ name = "Kaleb Duchesneau", email = "kalebduchesneau@gmail.com" },
|
|
10
|
+
]
|
|
11
|
+
requires-python = ">=3.12"
|
|
12
|
+
dependencies = [
|
|
13
|
+
"ase>=3.22.0",
|
|
14
|
+
"cloudpickle>=3.1.2",
|
|
15
|
+
"matplotlib>=3.4.0",
|
|
16
|
+
"networkx>=3.6.1",
|
|
17
|
+
"numpy>=1.21.0",
|
|
18
|
+
"ovito>=3.7.0",
|
|
19
|
+
"pandas>=1.3.0",
|
|
20
|
+
"pymatgen>=2022.0.0",
|
|
21
|
+
"scikit-learn>=1.0.0",
|
|
22
|
+
"scipy>=1.7.0",
|
|
23
|
+
"seaborn>=0.11.0",
|
|
24
|
+
]
|
|
25
|
+
|
|
26
|
+
[project.optional-dependencies]
|
|
27
|
+
flux = [
|
|
28
|
+
"flux-python==0.66.0",
|
|
29
|
+
]
|
|
30
|
+
|
|
31
|
+
[build-system]
|
|
32
|
+
requires = ["uv_build>=0.10.0,<0.11.0"]
|
|
33
|
+
build-backend = "uv_build"
|
|
34
|
+
|
|
35
|
+
[dependency-groups]
|
|
36
|
+
dev = [
|
|
37
|
+
"myst-parser>=5.0.0",
|
|
38
|
+
"pytest>=9.0.2",
|
|
39
|
+
"sphinx>=9.1.0",
|
|
40
|
+
"sphinx-autodoc-typehints>=3.6.2",
|
|
41
|
+
"sphinx-rtd-theme>=3.1.0",
|
|
42
|
+
]
|
|
43
|
+
|
|
44
|
+
[tool.uv.workspace]
|
|
45
|
+
members = [
|
|
46
|
+
"src/mcp",
|
|
47
|
+
"src/mcp_matensemble",
|
|
48
|
+
]
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
3.12
|
|
File without changes
|
|
@@ -0,0 +1,208 @@
|
|
|
1
|
+
# === TODO ===
|
|
2
|
+
|
|
3
|
+
## --- Refactor MatEnsemble ---
|
|
4
|
+
- [x] Create strategy base class
|
|
5
|
+
- [x] Implement the strategies
|
|
6
|
+
- [x] Create strategy base class for processing futures
|
|
7
|
+
- [x] Implement strategies for processing futures
|
|
8
|
+
- [x] Refactor matflux.py and matfluxGen.py to be more modular and in one manager.py
|
|
9
|
+
- [x] Test matflux/matfluxGen refactor make sure it works before doing anything else
|
|
10
|
+
- [x] Fix problems and test again
|
|
11
|
+
|
|
12
|
+
### --- Problems ---
|
|
13
|
+
- [x] Use *ONE* executor in the manager super loop instead of spawning new ones each time
|
|
14
|
+
- [x] Make sure future objects have proper fields appended at creation (task_ or task + job_spec)
|
|
15
|
+
- [x] Move writing of restart files into the FutureProcessingStrategy implementations
|
|
16
|
+
- [x] Make sure you remove the finished future rather than popleft in FutureProcessingStrategy implementations
|
|
17
|
+
|
|
18
|
+
## --- NOTE: Refactored code runs way slower ---
|
|
19
|
+
- [x] Fix problems causing slowdown and test again
|
|
20
|
+
|
|
21
|
+
### --- More Problems ---
|
|
22
|
+
- [x] Make tests consistent so that we have an apples to apples comparison
|
|
23
|
+
- [x] Remove extra logging and RPC calls to limit traffic
|
|
24
|
+
- [x] Update resources calls to update in place in submit_until_ooresources()
|
|
25
|
+
|
|
26
|
+
- [x] Test matensemble again until it is working as before
|
|
27
|
+
|
|
28
|
+
**--- Got it working as before ---**
|
|
29
|
+
- [x] Update logging to be more industry standard
|
|
30
|
+
- [x] Refactor Fluxlet to remove global side effects
|
|
31
|
+
- [x] Add type annotations back to strategies
|
|
32
|
+
- [x] Document all of the code vigorously
|
|
33
|
+
- [x] Document manager.py
|
|
34
|
+
- [x] Document fluxlet.py
|
|
35
|
+
- [x] Document strategies/*
|
|
36
|
+
- [x] Remove all TODOs and HACKs
|
|
37
|
+
|
|
38
|
+
- [x] Update the documentation and make sure it has all of the strategies
|
|
39
|
+
- [x] Make a script to build the documentation
|
|
40
|
+
- [x] Remove all artifacts from the repository
|
|
41
|
+
|
|
42
|
+
## --- Add Testing ---
|
|
43
|
+
- [x] Make sure the simple hello world tests work
|
|
44
|
+
- [x] Figure out what is going on with the GPU tasks
|
|
45
|
+
- [x] Make some tests that have failures to make sure the failed tasks get logged appropriately
|
|
46
|
+
- [x] NOTE: Come back later -- Unit tests | Integration Tests --
|
|
47
|
+
|
|
48
|
+
## --- Find Solution For Distribution ---
|
|
49
|
+
- [x] Turn Matensemble into a uv project
|
|
50
|
+
- [x] Build the initial Apptainer container
|
|
51
|
+
- [x] Create the matensemble.def file
|
|
52
|
+
### --- Def File Spec ---
|
|
53
|
+
- The file should be based on a Frontier base image; use the Rocky Linux version
|
|
54
|
+
- Install build dependencies of flux-core
|
|
55
|
+
- Build flux-core from source
|
|
56
|
+
- Install build dependencies for flux-sched
|
|
57
|
+
- Build flux-sched from source
|
|
58
|
+
- Export all variables
|
|
59
|
+
- Install matensemble
|
|
60
|
+
- [x] Test the apptainer container
|
|
61
|
+
|
|
62
|
+
## --- Build Base Images ---
|
|
63
|
+
- [x] Build Base Image for Baseline
|
|
64
|
+
- [x] Build Base Image for Frontier
|
|
65
|
+
- [x] Build Base Image for Perlmutter
|
|
66
|
+
- [x] Push images to GitHub Container Registry
|
|
67
|
+
- [x] Build MatEnsemble Images with each base image
|
|
68
|
+
- [x] Test Images on each respective system
|
|
69
|
+
- [x] NOTE: Come back later -- Test Perlmutter Image
|
|
70
|
+
- [x] Create Perlmutter image from new base image [Neil's Containerfiles](https://github.com/namehta4/Containerfiles/blob/main/Base/GPU/Dockerfile)
|
|
71
|
+
|
|
72
|
+
## --- Setup GitHub Actions ---
|
|
73
|
+
- [x] Setup Matrix build action to build MatEnsemble images for baseline, frontier, and perlmutter and push them to ghcr
|
|
74
|
+
- [x] Setup action to build with uv and publish with uv
|
|
75
|
+
- [x] Setup action to build docs and publish them
|
|
76
|
+
|
|
77
|
+
## --- Test CI/CD ---
|
|
78
|
+
- [x] Make small change to MatEnsemble and Docs
|
|
79
|
+
- [x] Push to main see if dev builds succeed
|
|
80
|
+
- [x] Run release.sh script to see if releases happen properly
|
|
81
|
+
|
|
82
|
+
## --- Updated/Better UX ---
|
|
83
|
+
- [x] Refactor to be built around Task/Job Objects
|
|
84
|
+
- [x] Allow users to decorate python functions to create TaskSpec's
|
|
85
|
+
- [x] Allow functions to depend on other functions
|
|
86
|
+
- [x] Topologically sort all of the Jobs based on dependencies
|
|
87
|
+
- [x] Write worker runtime that flux can target and call user defined functions
|
|
88
|
+
- [x] Write the Job objects specification to a file in their directory
|
|
89
|
+
|
|
90
|
+
## --- Status Dashboard ---
|
|
91
|
+
- [x] Update the Pipeline.run() method to have a dashboard flag
|
|
92
|
+
- [x] Add logic to launch the dashboard when the user runs the workflow
|
|
93
|
+
|
|
94
|
+
## --- Science Example ---
|
|
95
|
+
- [x] Test the science example that Soumendu provided
|
|
96
|
+
- [x] Update version of LAMMPS
|
|
97
|
+
- [x] Fix Bug with jobspec.env -> jobspec.environment
|
|
98
|
+
- [x] Test it again
|
|
99
|
+
|
|
100
|
+
## --- Polish Everything ---
|
|
101
|
+
- [x] Update all the documentation
|
|
102
|
+
- [x] Update the example workflows
|
|
103
|
+
- [x] Provide tutorials for how to run the example workflows
|
|
104
|
+
- [x] Change name of 'Job' to 'Chore'
|
|
105
|
+
- [x] Change name of 'Pipeline' to something else
|
|
106
|
+
- [x] Make ChoreType.PYTHON have the ability to be defined in the runner script
|
|
107
|
+
|
|
108
|
+
## --- Fix Containers ---
|
|
109
|
+
- [x] Install latest version of lammps in frontier images
|
|
110
|
+
- [x] Test MPI problem with MatEnsemble in Frontier Images
|
|
111
|
+
- [x] Test Science Example with latest MPICH install
|
|
112
|
+
- [x] Ping neil to ask about MPICH in image
|
|
113
|
+
|
|
114
|
+
## --- Finish presentation ---
|
|
115
|
+
- [x] Conda environment might make this very simple
|
|
116
|
+
- [x] Create a jupyter notebook and screen record it
|
|
117
|
+
- [x] Place that at the end of the presentation
|
|
118
|
+
|
|
119
|
+
## MatEnsemble Fixes
|
|
120
|
+
- [x] Add the ability to print the results of the OutputReference objects
|
|
121
|
+
- [x] Give the user the ability to define workflows in a single file
|
|
122
|
+
- [x] Make the log updates threaded
|
|
123
|
+
- [x] Implement the restart files
|
|
124
|
+
- [x] Bring back the terminal view the log command
|
|
125
|
+
- [x] Dynopro fix thingy (read flux docs JobspecV1.from_command vs. JobspecV1.per_resource)
|
|
126
|
+
- [x] Test all the fixes with a simple dynopro example and print an
|
|
127
|
+
OutputReference and watch the logs and make a single file workflow and
|
|
128
|
+
|
|
129
|
+
## --- Test Perlmutter Container ---
|
|
130
|
+
- [x] Give them a test with the current command that you have been running
|
|
131
|
+
- [x] If that doesn't work break it down into smaller pieces
|
|
132
|
+
## --- Smaller pieces ---
|
|
133
|
+
- [x] Maybe start with an nvidia image rather than Neil's image
|
|
134
|
+
- [ ] Make sure that flux works in the container
|
|
135
|
+
- [ ] Create a container that just has flux and have some different tests for that
|
|
136
|
+
- [ ] Create a container that has just MPI and test that make sure it works
|
|
137
|
+
- [ ] Combine flux and MPI and see if that works
|
|
138
|
+
- [ ] Create a container that has lammps and make sure that that is working
|
|
139
|
+
- [ ] Combine all the pieces
|
|
140
|
+
|
|
141
|
+
## --- Test Frontier Apptainer container ---
|
|
142
|
+
- [x] Need lots more help here
|
|
143
|
+
- [x] Make test that is very small first
|
|
144
|
+
- [x] Test new container against the running examples
|
|
145
|
+
|
|
146
|
+
## --- Clean some things up ---
|
|
147
|
+
- [x] Make ticket for neil
|
|
148
|
+
- [x] Update frontier Dockerfile
|
|
149
|
+
- [x] Update README.md
|
|
150
|
+
- [x] Create multi architecture builds for baseline image
|
|
151
|
+
- [x] Run release script
|
|
152
|
+
|
|
153
|
+
## --- Create new strategy to enable autonomous workflows ---
|
|
154
|
+
- [x] Update the OutputReference objects to have the ability to get the results
|
|
155
|
+
- [x] Create a method in the Pipeline to be able to get the results of all chores
|
|
156
|
+
- [x] Make a strategy that can take in chore and does processing which spawns a new chore
|
|
157
|
+
- [x] Figure out how to spawn a new chore
|
|
158
|
+
- [x] Change to only use cloudpickle and only pickle the actual function once
|
|
159
|
+
- [x] Change the chore objects to not store the function
|
|
160
|
+
- [x] Change the chore objects to reference a function in the registry
|
|
161
|
+
- [x] Change runtime worker to load function from registry and call it with args/kwargs
|
|
162
|
+
- [x] Add a set of OutputReference objects in the pipeline
|
|
163
|
+
- [x] Connect the added chores back to the pipeline somehow
|
|
164
|
+
- [x] Add a method in pipeline where you can get the results of all of your chores
|
|
165
|
+
- [x] Ping Neil about MPICH
|
|
166
|
+
- [x] Convert Scaffold into PowerPoint Presentation
|
|
167
|
+
- [x] Email coordinator about length of presentation and audience
|
|
168
|
+
|
|
169
|
+
## === AFTER EVERYTHING ABOVE IS DONE AND STABLE ===
|
|
170
|
+
|
|
171
|
+
## --- Model Context Protocol ---
|
|
172
|
+
- [ ] MCP implementation
|
|
173
|
+
- [ ] Map out the Tool and Resources
|
|
174
|
+
### Implement the Resources
|
|
175
|
+
- [ ] Resource to Fetch ALL Docs
|
|
176
|
+
- [ ] Resource to Fetch Relevant Source Code
|
|
177
|
+
- [ ] Resource to Fetch Examples General or system dependent
|
|
178
|
+
### Implement the Tools
|
|
179
|
+
- [ ] Tool to create a directory for the workflow
|
|
180
|
+
- [ ] Tool to write a file in that directory
|
|
181
|
+
- [ ] Tool to delete a file in that directory
|
|
182
|
+
- [ ] Tool to create a workflow
|
|
183
|
+
- [ ] Tool to verify a workflow
|
|
184
|
+
- [ ] Tool to create a batch script
|
|
185
|
+
- [ ] Tool to setup container env
|
|
186
|
+
- [ ] Tool to submit a batch script
|
|
187
|
+
### Implement the Prompts
|
|
188
|
+
- [ ] ???
|
|
189
|
+
- [ ] Test the server locally
|
|
190
|
+
- [ ] Test the server on an HPC cluster
|
|
191
|
+
- [ ] Create documentation for setting it up
|
|
192
|
+
|
|
193
|
+
## --- Create first draft for JOSS ---
|
|
194
|
+
- [ ] Read some example papers
|
|
195
|
+
- [ ] Create draft and show Dr. Bagchi
|
|
196
|
+
- [ ] Polish the repository to be ready for review
|
|
197
|
+
- [ ] Make sure that the tests work
|
|
198
|
+
- [ ] Make sure that the example workflows work correctly
|
|
199
|
+
- [ ] Make sure that they can easily test the code and
|
|
200
|
+
- [ ] Create a conda package that they can easily test the code without having
|
|
201
|
+
to compile flux and flux-sched themselves
|
|
202
|
+
|
|
203
|
+
## --- Reading List ---
|
|
204
|
+
- [ ] [Agentic Orchestration of HPC Applications](https://vsoch.github.io/assets/posts/agentic-orchestration-hpc-workloads-cloud-sochat-milroy.pdf)
|
|
205
|
+
- [x] [Container Training Slides](https://drive.google.com/drive/folders/1_mTBBc98TEX3XFpNp0rqoqj1VjN9TKoO)
|
|
206
|
+
- [ ] [Containers as Jupyter Kernels](https://docs.nersc.gov/services/jupyter/how-to-guides/#how-to-use-a-container-to-run-a-jupyter-kernel)
|
|
207
|
+
- [ ] [Using SPIN to Run Persistent Containers](https://docs.nersc.gov/services/spin/)
|
|
208
|
+
- [ ] [Using uv to package lammps and flux into pip install???](https://sgoel.dev/posts/building-cython-or-c-extensions-using-uv/)
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
"""
|
|
2
|
+
MatEnsemble
|
|
3
|
+
|
|
4
|
+
MatEnsemble is a Python workflow library for building and running
|
|
5
|
+
high-throughput and dependency-aware workflows on HPC systems.
|
|
6
|
+
It lets users define delayed Python and executable chores, connect them
|
|
7
|
+
through data dependencies, and submit them for execution with Flux.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
__author__ = ["Soumendu Bagchi", "Kaleb Duchesneau"]
|
|
11
|
+
__package__ = "matensemble"
|
|
12
|
+
|
|
13
|
+
# Re-export core data model types at the package root for convenience and for
|
|
14
|
+
# backwards compatibility with code/tests that import from `matensemble`.
|
|
15
|
+
from .model import OutputReference, Resources, ChoreType # noqa: E402,F401
|
|
@@ -0,0 +1,139 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
|
|
5
|
+
import networkx as nx
|
|
6
|
+
import shlex
|
|
7
|
+
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
|
|
10
|
+
from matensemble.model import ChoreType, Resources
|
|
11
|
+
from matensemble.utils import _json_safe
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class Chore:
|
|
15
|
+
"""
|
|
16
|
+
A :obj:`Chore` is what MatEnsemble is built around. A :obj:`Chore` can have one of two
|
|
17
|
+
different types. ``PYTHON`` or ``EXECUTABLE``
|
|
18
|
+
|
|
19
|
+
Python chores are delayed function calls that will be submitted to the
|
|
20
|
+
runtime-worker when the :obj:`Chore`'s dependencies are resolved and they are
|
|
21
|
+
scheduled in the queue.
|
|
22
|
+
|
|
23
|
+
Executable chores are simply commands that will usually call an Executable script
|
|
24
|
+
when the chore is scheduled.
|
|
25
|
+
|
|
26
|
+
"""
|
|
27
|
+
|
|
28
|
+
def __init__(
|
|
29
|
+
self,
|
|
30
|
+
id: str,
|
|
31
|
+
command: str | list[str],
|
|
32
|
+
chore_type: ChoreType,
|
|
33
|
+
resources: Resources,
|
|
34
|
+
workdir: Path,
|
|
35
|
+
func_module: str | None = None,
|
|
36
|
+
func_qualname: str | None = None,
|
|
37
|
+
serialized_callable: bytes | None = None,
|
|
38
|
+
deps: tuple[str, ...] = (),
|
|
39
|
+
args: tuple = (),
|
|
40
|
+
kwargs: dict | None = None,
|
|
41
|
+
) -> None:
|
|
42
|
+
"""
|
|
43
|
+
The constructor for a :obj:`Chore`
|
|
44
|
+
|
|
45
|
+
Parameters
|
|
46
|
+
----------
|
|
47
|
+
id : str
|
|
48
|
+
The ID for the :obj:`Chore`
|
|
49
|
+
command : str, list[str]
|
|
50
|
+
The command that will be run when the :obj:`Chore` is submitted
|
|
51
|
+
chore_type: ChoreType
|
|
52
|
+
Either PYTHON or EXECUTABLE
|
|
53
|
+
resources : Resources
|
|
54
|
+
An instance of :obj:`Resources` that holds all the information about
|
|
55
|
+
what resources are needed to run the :obj:`Chore`
|
|
56
|
+
workdir : Path
|
|
57
|
+
The Path to the directory where the output of the :obj:`Chore` will be
|
|
58
|
+
handled
|
|
59
|
+
func_module : str
|
|
60
|
+
The module where the function definition is if the type of the :obj:`Chore`
|
|
61
|
+
is PYTHON
|
|
62
|
+
func_qualname : str
|
|
63
|
+
The name of the function if the type of the :obj:`Chore` is PYTHON
|
|
64
|
+
serialized_callable : bytes
|
|
65
|
+
The original function that was wrapped stored as bytes
|
|
66
|
+
deps : tuple[str, ...]
|
|
67
|
+
A tupele of chore-id's which results this :obj:`Chore
|
|
68
|
+
args : tuple
|
|
69
|
+
The arguments to give the function if type is PYTHON
|
|
70
|
+
kwargs : dict
|
|
71
|
+
The key-word arguments to give the function if flaovr is PYTHON
|
|
72
|
+
"""
|
|
73
|
+
|
|
74
|
+
self.id = id
|
|
75
|
+
self.command = (
|
|
76
|
+
shlex.split(command) if isinstance(command, str) else list(command)
|
|
77
|
+
)
|
|
78
|
+
|
|
79
|
+
if chore_type == ChoreType.PYTHON:
|
|
80
|
+
if serialized_callable is None and not (func_module and func_qualname):
|
|
81
|
+
raise ValueError(
|
|
82
|
+
"Python chores require either serialized_callable or func_module+func_qualname"
|
|
83
|
+
)
|
|
84
|
+
|
|
85
|
+
self.chore_type = chore_type
|
|
86
|
+
self.resources = resources
|
|
87
|
+
self.workdir = workdir.resolve()
|
|
88
|
+
self.spec_path = self.workdir / "chore.pkl"
|
|
89
|
+
|
|
90
|
+
self.func_module = func_module
|
|
91
|
+
self.func_qualname = func_qualname
|
|
92
|
+
self.serialized_callable = serialized_callable
|
|
93
|
+
|
|
94
|
+
self.deps = deps
|
|
95
|
+
self.args = args
|
|
96
|
+
self.kwargs = {} if kwargs is None else kwargs
|
|
97
|
+
|
|
98
|
+
def graph(self) -> nx.DiGraph:
|
|
99
|
+
return nx.DiGraph()
|
|
100
|
+
|
|
101
|
+
def _to_debug_dict(self) -> dict:
|
|
102
|
+
return {
|
|
103
|
+
"id": self.id,
|
|
104
|
+
"command": self.command,
|
|
105
|
+
"chore_type": _json_safe(self.chore_type),
|
|
106
|
+
"resources": {
|
|
107
|
+
"num_tasks": self.resources.num_tasks,
|
|
108
|
+
"cores_per_task": self.resources.cores_per_task,
|
|
109
|
+
"gpus_per_task": self.resources.gpus_per_task,
|
|
110
|
+
"mpi": self.resources.mpi,
|
|
111
|
+
"env": _json_safe(self.resources.env),
|
|
112
|
+
"inherit_env": self.resources.inherit_env,
|
|
113
|
+
},
|
|
114
|
+
"spec_file": str(self.spec_path),
|
|
115
|
+
"func_module": self.func_module,
|
|
116
|
+
"func_qualname": self.func_qualname,
|
|
117
|
+
"has_serialized_callable": self.serialized_callable is not None,
|
|
118
|
+
"deps": list(self.deps),
|
|
119
|
+
"args": _json_safe(self.args),
|
|
120
|
+
"kwargs": _json_safe(self.kwargs),
|
|
121
|
+
}
|
|
122
|
+
|
|
123
|
+
def _write_debug_json(self) -> None:
|
|
124
|
+
"""
|
|
125
|
+
The :obj:`Chore` is pickled at runtime to be used later on, but it is also
|
|
126
|
+
written as json for debugging.
|
|
127
|
+
"""
|
|
128
|
+
|
|
129
|
+
debug_file = self.spec_path.parent / "chore.json"
|
|
130
|
+
debug_file.parent.mkdir(parents=True, exist_ok=True)
|
|
131
|
+
with debug_file.open("w") as f:
|
|
132
|
+
json.dump(self._to_debug_dict(), f, indent=2)
|
|
133
|
+
|
|
134
|
+
def __str__(self) -> str:
|
|
135
|
+
"""
|
|
136
|
+
Return the :obj:`Chore` as a JSON string.
|
|
137
|
+
"""
|
|
138
|
+
|
|
139
|
+
return json.dumps(self._to_debug_dict(), indent=2)
|