matensemble 0.2.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44) hide show
  1. matensemble-0.2.1/LICENSE +29 -0
  2. matensemble-0.2.1/PKG-INFO +124 -0
  3. matensemble-0.2.1/README.md +101 -0
  4. matensemble-0.2.1/pyproject.toml +48 -0
  5. matensemble-0.2.1/src/matensemble/.python-version +1 -0
  6. matensemble-0.2.1/src/matensemble/README.md +0 -0
  7. matensemble-0.2.1/src/matensemble/TODO.md +208 -0
  8. matensemble-0.2.1/src/matensemble/__init__.py +15 -0
  9. matensemble-0.2.1/src/matensemble/chore.py +139 -0
  10. matensemble-0.2.1/src/matensemble/dash/assets/index-1X2cLUgt.js +50 -0
  11. matensemble-0.2.1/src/matensemble/dash/assets/index-DRkGfWlx.css +1 -0
  12. matensemble-0.2.1/src/matensemble/dash/index.html +14 -0
  13. matensemble-0.2.1/src/matensemble/dash/vite.svg +1 -0
  14. matensemble-0.2.1/src/matensemble/dynopro/__init__.py +0 -0
  15. matensemble-0.2.1/src/matensemble/dynopro/driver.py +84 -0
  16. matensemble-0.2.1/src/matensemble/dynopro/ensemble.py +119 -0
  17. matensemble-0.2.1/src/matensemble/dynopro/postprocessors/__init__.py +0 -0
  18. matensemble-0.2.1/src/matensemble/dynopro/postprocessors/bispectrum_calculator.py +42 -0
  19. matensemble-0.2.1/src/matensemble/dynopro/postprocessors/compute_diffraction.py +55 -0
  20. matensemble-0.2.1/src/matensemble/dynopro/postprocessors/compute_order_from_pairs.py +23 -0
  21. matensemble-0.2.1/src/matensemble/dynopro/postprocessors/compute_twist.py +240 -0
  22. matensemble-0.2.1/src/matensemble/dynopro/postprocessors/correlations.py +47 -0
  23. matensemble-0.2.1/src/matensemble/dynopro/postprocessors/ovito_calculators.py +84 -0
  24. matensemble-0.2.1/src/matensemble/dynopro/task_lib/AnalysisSubprocess.py +194 -0
  25. matensemble-0.2.1/src/matensemble/dynopro/task_lib/AnalysysDescriptor.py +37 -0
  26. matensemble-0.2.1/src/matensemble/dynopro/task_lib/MDSubprocess.py +170 -0
  27. matensemble-0.2.1/src/matensemble/dynopro/task_lib/__init__.py +0 -0
  28. matensemble-0.2.1/src/matensemble/dynopro/task_lib/analysis_registry.py +33 -0
  29. matensemble-0.2.1/src/matensemble/dynopro/utils/__init__.py +0 -0
  30. matensemble-0.2.1/src/matensemble/dynopro/utils/lammps_init.py +52 -0
  31. matensemble-0.2.1/src/matensemble/dynopro/utils/preprocessors.py +70 -0
  32. matensemble-0.2.1/src/matensemble/dynopro/utils/stat.py +33 -0
  33. matensemble-0.2.1/src/matensemble/dynopro/utils/stress_rotate_z_theta.py +50 -0
  34. matensemble-0.2.1/src/matensemble/fluxlet.py +162 -0
  35. matensemble-0.2.1/src/matensemble/logger.py +131 -0
  36. matensemble-0.2.1/src/matensemble/manager.py +592 -0
  37. matensemble-0.2.1/src/matensemble/model.py +75 -0
  38. matensemble-0.2.1/src/matensemble/pipeline.py +409 -0
  39. matensemble-0.2.1/src/matensemble/redis/__init__.py +0 -0
  40. matensemble-0.2.1/src/matensemble/redis/service.py +87 -0
  41. matensemble-0.2.1/src/matensemble/redis/test.py +13 -0
  42. matensemble-0.2.1/src/matensemble/runtime_worker.py +162 -0
  43. matensemble-0.2.1/src/matensemble/strategy.py +224 -0
  44. matensemble-0.2.1/src/matensemble/utils.py +181 -0
@@ -0,0 +1,29 @@
1
+ BSD 3-Clause License
2
+
3
+ Copyright (c) 2025, Soumendu Bagchi
4
+ All rights reserved.
5
+
6
+ Redistribution and use in source and binary forms, with or without
7
+ modification, are permitted provided that the following conditions are met:
8
+
9
+ * Redistributions of source code must retain the above copyright notice, this
10
+ list of conditions and the following disclaimer.
11
+
12
+ * Redistributions in binary form must reproduce the above copyright notice,
13
+ this list of conditions and the following disclaimer in the documentation
14
+ and/or other materials provided with the distribution.
15
+
16
+ * Neither the name of the copyright holder nor the names of its
17
+ contributors may be used to endorse or promote products derived from
18
+ this software without specific prior written permission.
19
+
20
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
23
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
24
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25
+ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
26
+ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
27
+ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
28
+ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
@@ -0,0 +1,124 @@
1
+ Metadata-Version: 2.4
2
+ Name: matensemble
3
+ Version: 0.2.1
4
+ Summary: An adaptive and highly asynchronous ensemble simulation workflow manager MatEnsemble (https://github.com/Q-CAD/MatEnsemble) built jointly on top of the hierarchical graph based scheduler FLUX and concurrent-futures infrastructure of python
5
+ Author: Soumendu Bagchi, Kaleb Duchesneau
6
+ Author-email: Soumendu Bagchi <soumendubagchi@gmail.com>, Kaleb Duchesneau <kalebduchesneau@gmail.com>
7
+ License-File: LICENSE
8
+ Requires-Dist: ase>=3.22.0
9
+ Requires-Dist: cloudpickle>=3.1.2
10
+ Requires-Dist: matplotlib>=3.4.0
11
+ Requires-Dist: networkx>=3.6.1
12
+ Requires-Dist: numpy>=1.21.0
13
+ Requires-Dist: ovito>=3.7.0
14
+ Requires-Dist: pandas>=1.3.0
15
+ Requires-Dist: pymatgen>=2022.0.0
16
+ Requires-Dist: scikit-learn>=1.0.0
17
+ Requires-Dist: scipy>=1.7.0
18
+ Requires-Dist: seaborn>=0.11.0
19
+ Requires-Dist: flux-python==0.66.0 ; extra == 'flux'
20
+ Requires-Python: >=3.12
21
+ Provides-Extra: flux
22
+ Description-Content-Type: text/markdown
23
+
24
+ [![PyPI version](https://badge.fury.io/py/matensemble.svg)](https://pypi.org/project/matensemble/)
25
+ [![Documentation](https://readthedocs.org/projects/matensemble/badge/?version=latest)](https://matensemble.readthedocs.io/en/latest/)
26
+ [![Python](https://img.shields.io/badge/python-3.12%2B-blue)](https://www.python.org/downloads/)
27
+ [![License](https://img.shields.io/badge/License-BSD%203--Clause-blue.svg)](https://opensource.org/licenses/BSD-3-Clause)
28
+
29
+ <p align="center">
30
+ <img src="images/Logo-Matensemble.png" alt="MatEnsemble" width="720" />
31
+ </p>
32
+
33
+ # MatEnsemble
34
+
35
+ MatEnsemble is a Python library for **high-throughput workflows** on HPC systems. You define a directed acyclic graph (DAG) of tasks—**Python callables** or **executable commands**—and MatEnsemble submits work through **[Flux](https://flux-framework.readthedocs.io/)**, tracks completions, **adapts** scheduling to free CPUs and GPUs, and writes structured logs and per-chore output directories.
36
+
37
+ An optional in-tree **dynopro** stack supports streaming dynamics and on-the-fly analysis for advanced materials simulation workflows.
38
+
39
+ ## Features
40
+
41
+ - **DAG-based workflows** with dependencies via deferred return values (`OutputReference`)
42
+ - **Adaptive scheduling** that back-fills the allocation as tasks finish (with a non-adaptive mode when you need it)
43
+ - **Two chore types**: Python chores (remotely unpickled and executed by `matensemble.runtime_worker`) and argv-style **executable** chores
44
+ - **Resource requests**: tasks, cores per task, GPUs per task, optional MPI (`pmi2`) via Flux
45
+ - **Observability**: `status.json`, `matensemble_workflow.log`, per-chore `stdout` / `stderr`, pickle and JSON result artifacts; optional **web dashboard** (FastAPI on port 8000)
46
+
47
+ <p align="center">
48
+ <img src="images/Cap_1_adaptive_task_management.png" alt="Adaptive task management" width="620" />
49
+ </p>
50
+
51
+ <p align="center">
52
+ <img src="images/Cap_2_dynopro.png" alt="On-the-fly dynamics and analysis" width="620" />
53
+ </p>
54
+
55
+ ## Documentation
56
+
57
+ Documentation (overview, architecture, tutorials, API reference):
58
+
59
+ **[matensemble.readthedocs.io](https://matensemble.readthedocs.io/en/latest/)**
60
+
61
+ ## Installation
62
+
63
+ ### Containers (recommended on many clusters)
64
+
65
+ OCI images are published to GitHub Container Registry, for example:
66
+
67
+ `ghcr.io/freddude2004/matensemble:baseline-vX.Y.Z`
68
+
69
+ See the [container packages](https://github.com/FredDude2004/MatEnsemble/pkgs/container/matensemble) and the [Quick start](https://matensemble.readthedocs.io/en/latest/quickstart.html) in the docs for Apptainer/Singularity and site-specific notes.
70
+
71
+ ### Development install
72
+
73
+ From a clone of this repository:
74
+
75
+ ```bash
76
+ uv sync
77
+ uv sync --group dev # optional: docs and pytest tooling
78
+ uv run pytest
79
+ ```
80
+
81
+ Or with pip:
82
+
83
+ ```bash
84
+ pip install -e ".[flux]"
85
+ ```
86
+
87
+ Site-specific Conda-style environment files live under `scripts/` (for example `scripts/baseline/environment.yaml`, `scripts/frontier/`, `scripts/perlmuter/`). Align Python with **3.12+** and Flux with your center’s modules.
88
+
89
+ ## Quick example
90
+
91
+ ```python
92
+ from matensemble.pipeline import Pipeline
93
+
94
+ pipe = Pipeline()
95
+ pipe.exec(command=["/bin/echo", "hello from MatEnsemble"])
96
+ pipe.submit()
97
+ ```
98
+
99
+ For Python chores, dependency graphs, and the required split between an importable **chore module** and a **runner script**, see the [Tutorials](https://matensemble.readthedocs.io/en/latest/tutorials.html).
100
+
101
+ ## Examples in the repository
102
+
103
+ Illustrative workflows live under [`example_workflows/`](https://github.com/FredDude2004/MatEnsemble/tree/main/example_workflows).
104
+
105
+ ## Requirements and runtime
106
+
107
+ - A **Flux allocation** (or equivalent) on the machine where you call `Pipeline.submit()`
108
+ - For MPI Python or executable chores: a coherent MPI/Flux setup (e.g. PMI2) as expected by your site
109
+ - Optional: SSH port forwarding if you enable the dashboard on a compute node (see the architecture guide in the docs)
110
+
111
+ ## Related links
112
+
113
+ - [Flux documentation](https://flux-framework.readthedocs.io/)
114
+ - [Flux Python guide](https://flux-framework.readthedocs.io/projects/flux-core/en/latest/guide/start.html)
115
+ - [Slurm documentation](https://slurm.schedmd.com/documentation.html) (common front-end to batch allocations)
116
+ - [LAMMPS manual](https://docs.lammps.org/Manual.html) (often used alongside ensemble MD workflows)
117
+
118
+ ## Authors
119
+
120
+ Soumendu Bagchi, Kaleb Duchesneau (see `pyproject.toml` for contact details).
121
+
122
+ ## License
123
+
124
+ BSD 3-Clause. See [`LICENSE`](LICENSE).
@@ -0,0 +1,101 @@
1
+ [![PyPI version](https://badge.fury.io/py/matensemble.svg)](https://pypi.org/project/matensemble/)
2
+ [![Documentation](https://readthedocs.org/projects/matensemble/badge/?version=latest)](https://matensemble.readthedocs.io/en/latest/)
3
+ [![Python](https://img.shields.io/badge/python-3.12%2B-blue)](https://www.python.org/downloads/)
4
+ [![License](https://img.shields.io/badge/License-BSD%203--Clause-blue.svg)](https://opensource.org/licenses/BSD-3-Clause)
5
+
6
+ <p align="center">
7
+ <img src="images/Logo-Matensemble.png" alt="MatEnsemble" width="720" />
8
+ </p>
9
+
10
+ # MatEnsemble
11
+
12
+ MatEnsemble is a Python library for **high-throughput workflows** on HPC systems. You define a directed acyclic graph (DAG) of tasks—**Python callables** or **executable commands**—and MatEnsemble submits work through **[Flux](https://flux-framework.readthedocs.io/)**, tracks completions, **adapts** scheduling to free CPUs and GPUs, and writes structured logs and per-chore output directories.
13
+
14
+ An optional in-tree **dynopro** stack supports streaming dynamics and on-the-fly analysis for advanced materials simulation workflows.
15
+
16
+ ## Features
17
+
18
+ - **DAG-based workflows** with dependencies via deferred return values (`OutputReference`)
19
+ - **Adaptive scheduling** that back-fills the allocation as tasks finish (with a non-adaptive mode when you need it)
20
+ - **Two chore types**: Python chores (remotely unpickled and executed by `matensemble.runtime_worker`) and argv-style **executable** chores
21
+ - **Resource requests**: tasks, cores per task, GPUs per task, optional MPI (`pmi2`) via Flux
22
+ - **Observability**: `status.json`, `matensemble_workflow.log`, per-chore `stdout` / `stderr`, pickle and JSON result artifacts; optional **web dashboard** (FastAPI on port 8000)
23
+
24
+ <p align="center">
25
+ <img src="images/Cap_1_adaptive_task_management.png" alt="Adaptive task management" width="620" />
26
+ </p>
27
+
28
+ <p align="center">
29
+ <img src="images/Cap_2_dynopro.png" alt="On-the-fly dynamics and analysis" width="620" />
30
+ </p>
31
+
32
+ ## Documentation
33
+
34
+ Documentation (overview, architecture, tutorials, API reference):
35
+
36
+ **[matensemble.readthedocs.io](https://matensemble.readthedocs.io/en/latest/)**
37
+
38
+ ## Installation
39
+
40
+ ### Containers (recommended on many clusters)
41
+
42
+ OCI images are published to GitHub Container Registry, for example:
43
+
44
+ `ghcr.io/freddude2004/matensemble:baseline-vX.Y.Z`
45
+
46
+ See the [container packages](https://github.com/FredDude2004/MatEnsemble/pkgs/container/matensemble) and the [Quick start](https://matensemble.readthedocs.io/en/latest/quickstart.html) in the docs for Apptainer/Singularity and site-specific notes.
47
+
48
+ ### Development install
49
+
50
+ From a clone of this repository:
51
+
52
+ ```bash
53
+ uv sync
54
+ uv sync --group dev # optional: docs and pytest tooling
55
+ uv run pytest
56
+ ```
57
+
58
+ Or with pip:
59
+
60
+ ```bash
61
+ pip install -e ".[flux]"
62
+ ```
63
+
64
+ Site-specific Conda-style environment files live under `scripts/` (for example `scripts/baseline/environment.yaml`, `scripts/frontier/`, `scripts/perlmuter/`). Align Python with **3.12+** and Flux with your center’s modules.
65
+
66
+ ## Quick example
67
+
68
+ ```python
69
+ from matensemble.pipeline import Pipeline
70
+
71
+ pipe = Pipeline()
72
+ pipe.exec(command=["/bin/echo", "hello from MatEnsemble"])
73
+ pipe.submit()
74
+ ```
75
+
76
+ For Python chores, dependency graphs, and the required split between an importable **chore module** and a **runner script**, see the [Tutorials](https://matensemble.readthedocs.io/en/latest/tutorials.html).
77
+
78
+ ## Examples in the repository
79
+
80
+ Illustrative workflows live under [`example_workflows/`](https://github.com/FredDude2004/MatEnsemble/tree/main/example_workflows).
81
+
82
+ ## Requirements and runtime
83
+
84
+ - A **Flux allocation** (or equivalent) on the machine where you call `Pipeline.submit()`
85
+ - For MPI Python or executable chores: a coherent MPI/Flux setup (e.g. PMI2) as expected by your site
86
+ - Optional: SSH port forwarding if you enable the dashboard on a compute node (see the architecture guide in the docs)
87
+
88
+ ## Related links
89
+
90
+ - [Flux documentation](https://flux-framework.readthedocs.io/)
91
+ - [Flux Python guide](https://flux-framework.readthedocs.io/projects/flux-core/en/latest/guide/start.html)
92
+ - [Slurm documentation](https://slurm.schedmd.com/documentation.html) (common front-end to batch allocations)
93
+ - [LAMMPS manual](https://docs.lammps.org/Manual.html) (often used alongside ensemble MD workflows)
94
+
95
+ ## Authors
96
+
97
+ Soumendu Bagchi, Kaleb Duchesneau (see `pyproject.toml` for contact details).
98
+
99
+ ## License
100
+
101
+ BSD 3-Clause. See [`LICENSE`](LICENSE).
@@ -0,0 +1,48 @@
1
+ [project]
2
+ name = "matensemble"
3
+ version = "0.2.1"
4
+ description = "An adaptive and highly asynchronous ensemble simulation workflow manager MatEnsemble (https://github.com/Q-CAD/MatEnsemble) built jointly on top of the hierarchical graph based scheduler FLUX and concurrent-futures infrastructure of python"
5
+ readme = "README.md"
6
+ license-files = ["LICENSE"]
7
+ authors = [
8
+ { name = "Soumendu Bagchi", email = "soumendubagchi@gmail.com" },
9
+ { name = "Kaleb Duchesneau", email = "kalebduchesneau@gmail.com" },
10
+ ]
11
+ requires-python = ">=3.12"
12
+ dependencies = [
13
+ "ase>=3.22.0",
14
+ "cloudpickle>=3.1.2",
15
+ "matplotlib>=3.4.0",
16
+ "networkx>=3.6.1",
17
+ "numpy>=1.21.0",
18
+ "ovito>=3.7.0",
19
+ "pandas>=1.3.0",
20
+ "pymatgen>=2022.0.0",
21
+ "scikit-learn>=1.0.0",
22
+ "scipy>=1.7.0",
23
+ "seaborn>=0.11.0",
24
+ ]
25
+
26
+ [project.optional-dependencies]
27
+ flux = [
28
+ "flux-python==0.66.0",
29
+ ]
30
+
31
+ [build-system]
32
+ requires = ["uv_build>=0.10.0,<0.11.0"]
33
+ build-backend = "uv_build"
34
+
35
+ [dependency-groups]
36
+ dev = [
37
+ "myst-parser>=5.0.0",
38
+ "pytest>=9.0.2",
39
+ "sphinx>=9.1.0",
40
+ "sphinx-autodoc-typehints>=3.6.2",
41
+ "sphinx-rtd-theme>=3.1.0",
42
+ ]
43
+
44
+ [tool.uv.workspace]
45
+ members = [
46
+ "src/mcp",
47
+ "src/mcp_matensemble",
48
+ ]
@@ -0,0 +1 @@
1
+ 3.12
File without changes
@@ -0,0 +1,208 @@
1
+ # === TODO ===
2
+
3
+ ## --- Refactor MatEnsemble ---
4
+ - [x] Create strategy base class
5
+ - [x] Implement the stategies
6
+ - [x] Create strategy base class for processing futures
7
+ - [x] Implement strategies for processing futures
8
+ - [x] Refactor matflux.py and matfluxGen.py to be more modular and in one manager.py
9
+ - [x] Test matflux/matfluxGen refactor make sure it works before doing anything else
10
+ - [x] Fix problems and test again
11
+
12
+ ### --- Problems ---
13
+ - [x] Use *ONE* executor in the manager super loop instead of spawning new ones each time
14
+ - [x] Make sure future objects have proper fields appended at creation (task_ or task + job_spec)
15
+ - [x] Move writing of restart files into the FutureProcessingStrategy implementations
16
+ - [x] Make sure you remove the finished future rather than popleft in FutureProcessingStrategy implementations
17
+
18
+ ## --- NOTE: Refactored code runs way slower ---
19
+ - [x] Fix problems causing slowdown and test again
20
+
21
+ ### --- More Problems ---
22
+ - [x] Make tests consistent so that we have an apples to apples comparison
23
+ - [x] Remove extra logging and RPC calls to limit traffic
24
+ - [x] Update resources calls to update in place in submit_until_ooresources()
25
+
26
+ - [x] Test matensemble again until it is working as before
27
+
28
+ **--- Got it working as before ---**
29
+ - [x] Update logging to be more industry standard
30
+ - [x] Refactor Fluxlet to remove global side effects
31
+ - [x] Add type annotations back to strategies
32
+ - [x] Document all of the code vigorously
33
+ - [x] Document manager.py
34
+ - [x] Document fluxlet.py
35
+ - [x] Document strategies/*
36
+ - [x] Remove all TODOs and HACKs
37
+
38
+ - [x] Update the documentation and make sure it has all of the strategies
39
+ - [x] Make a script to build the documentation
40
+ - [x] Remove all artifacts from the repository
41
+
42
+ ## --- Add Testing ---
43
+ - [x] Make sure the simple hello world tests work
44
+ - [x] Figure out what is going on with the GPU tasks
45
+ - [x] Make some tests that have failures to make sure the failed tasks get logged appropriately
46
+ - [x] NOTE: Come back later -- Unit tests | Integration Tests --
47
+
48
+ ## --- Find Solution For Distribution ---
49
+ - [x] Turn Matensemble into a uv project
50
+ - [x] Build the initial Apptainer container
51
+ - [x] Create the matensemble.def file
52
+ ### --- Def File Spec ---
53
+ - The file should be off of a frontier base image use rocky linux version
54
+ - Install build dependecies of flux-core
55
+ - Build flux-core from source
56
+ - Install build dependencies for flux-sched
57
+ - Build flux-sched from source
58
+ - Export all variables
59
+ - Install matensemble
60
+ - [x] Test the apptainer container
61
+
62
+ ## --- Build Base Images ---
63
+ - [x] Build Base Image for Baseline
64
+ - [x] Build Base Image for Frontier
65
+ - [x] Build Base Image for Perlmutter
66
+ - [x] Push images to GitHub Container Registry
67
+ - [x] Build MatEnsemble Images with each base image
68
+ - [x] Test Images on each respective system
69
+ - [x] NOTE: Come back later -- Test Perlmutter Image
70
+ - [x] Create Perlmutter image from new base image [Neil's Containerfiles](https://github.com/namehta4/Containerfiles/blob/main/Base/GPU/Dockerfile)
71
+
72
+ ## --- Setup GitHub Actions ---
73
+ - [x] Setup Matrix build action to build MatEnsemble images for baseline, frontier, and perlmutter and push them to ghcr
74
+ - [x] Setup action to build with uv and publish with uv
75
+ - [x] Setup action to build docs and publish them
76
+
77
+ ## --- Test CI/CD ---
78
+ - [x] Make small change to MatEnsemble and Docs
79
+ - [x] Push to main see if dev builds succeed
80
+ - [x] Run release.sh script to see if releases happen properly
81
+
82
+ ## --- Updated/Better UX ---
83
+ - [x] Refactor to be built around Task/Job Objects
84
+ - [x] Allow users to decorate python functions to create TaskSpec's
85
+ - [x] Allow functions to depend on other functions
86
+ - [x] Topologically sort all of the Jobs based on dependencies
87
+ - [x] Write worker runtime that flux can target and call user defined functions
88
+ - [x] Write the Job objects specification to a file in their direcotry
89
+
90
+ ## --- Status Dashboard ---
91
+ - [x] Update the Pipeline.run() method to have a dashboard flag
92
+ - [x] Add logic to launch the dashboard when the user runs the workflow
93
+
94
+ ## --- Science Example ---
95
+ - [x] Test the science example that Soumendu provided
96
+ - [x] Update version of LAMMPS
97
+ - [x] Fix Bug with jobspec.env -> jobspec.environment
98
+ - [x] Test it again
99
+
100
+ ## --- Polish Everything ---
101
+ - [x] Update all the documentation
102
+ - [x] Update the example workflows
103
+ - [x] Provide tutorials for how to run the example workflows
104
+ - [x] Change name of 'Job' to 'Chore'
105
+ - [x] Change name of 'Pipeline' to something else
106
+ - [x] Make ChoreType.PYTHON have the ability to be defined in the runner script
107
+
108
+ ## --- Fix Containers ---
109
+ - [x] Install latest version of lammps in frontier images
110
+ - [x] Test MPI problem with MatEnsemble in Frontier Images
111
+ - [x] Test Science Example with latest MPICH install
112
+ - [x] Ping neil to ask about MPICH in image
113
+
114
+ ## --- Finish presentation ---
115
+ - [x] Conda environment might make this very simple
116
+ - [x] Create a jupyter notebook and screen record it
117
+ - [x] Place that at the end of the presentation
118
+
119
+ ## MatEnsemble Fixes
120
+ - [x] Add the ability to print the results of the OutputReference objects
121
+ - [x] Give the user the ability to define workflows in a single file
122
+ - [x] Make the log updates threaded
123
+ - [x] Implement the restart files
124
+ - [x] Bring back the terminal view the log command
125
+ - [x] Dynopro fix thingy (read flux docs JobspecV1.from_command vs. JobspecV1.per_resource)
126
+ - [x] Test all the fixes with a simple dynopro example and print an
127
+ OutputReference and watch the logs and make a single file workflow and
128
+
129
+ ## --- Test Perlmutter Container ---
130
+ - [x] Give them a test with the current command that you have been running
131
+ - [x] If that doesn't work break it down into smaller pieces
132
+ ## --- Smaller pieces ---
133
+ - [x] Maybe start with an nvidia image rather than Neil's image
134
+ - [ ] Make sure that flux works in the container
135
+ - [ ] Create a container that just has flux and have some different tests for that
136
+ - [ ] Create a container that has just MPI and test that make sure it works
137
+ - [ ] Combine flux and MPI and see if that works
138
+ - [ ] Create a container that has lammps and make sure that that is working
139
+ - [ ] Combine all the pieces
140
+
141
+ ## --- Test Frontier Apptainer container ---
142
+ - [x] Need lots more help here
143
+ - [x] Make test that is very small first
144
+ - [x] Test new container against the running examples
145
+
146
+ ## --- Clean some things up ---
147
+ - [x] Make ticket for neil
148
+ - [x] Update frontier Dockerfile
149
+ - [x] Update README.md
150
+ - [x] Create multi architecture builds for baseline image
151
+ - [x] Run release script
152
+
153
+ ## --- Create new strategy to enable autonomous workflows ---
154
+ - [x] Update the OutputReference objects to have the ability to get the results
155
+ - [x] Create a method in the Pipeline to be able to get the results of all chores
156
+ - [x] Make a strategy that can take in chore and does processing which spawns a new chore
157
+ - [x] Figure out how to spawn a new chore
158
+ - [x] Change to only use cloudpickle and only pickle the actual function once
159
+ - [x] Change the chore objects to not store the function
160
+ - [x] Change the chore objects to reference a function in the registry
161
+ - [x] Change runtime worker to load function from registry and call it with args/kwargs
162
+ - [x] Add a set of OutputReference objects in the pipeline
163
+ - [x] Connect the added chores back to the pipeline somehow
164
+ - [x] Add a method in pipeline where you can get the results of all of your chores
165
+ - [x] Ping Neil about MPICH
166
+ - [x] Convert Scaffold into PowerPoint Presentation
167
+ - [x] Email coordinatior about length of presentation and audience
168
+
169
+ ## === AFTER EVERYTHING ABOVE IS DONE AND STABLE ===
170
+
171
+ ## --- Model Context Protocol ---
172
+ - [ ] MCP implementation
173
+ - [ ] Map out the Tool and Resources
174
+ ### Implement the Resources
175
+ - [ ] Resource to Fetch ALL Docs
176
+ - [ ] Resource to Fetch Relavant Source Code
177
+ - [ ] Resource to Fetch Examples General or system dependent
178
+ ### Implement the Tools
179
+ - [ ] Tool to create a directory for the workflow
180
+ - [ ] Tool to write a file in that directory
181
+ - [ ] Tool to delete a file in that directory
182
+ - [ ] Tool to create a workflow
183
+ - [ ] Tool to verify a workflow
184
+ - [ ] Tool to create a batch script
185
+ - [ ] Tool to setup container env
186
+ - [ ] Tool to submit a batch script
187
+ ### Implement the Prompts
188
+ - [ ] ???
189
+ - [ ] Test the server locally
190
+ - [ ] Test the server on an HPC cluster
191
+ - [ ] Create documentation for setting it up
192
+
193
+ ## --- Create first draft for JOSS ---
194
+ - [ ] Read some example papers
195
+ - [ ] Create draft and show Dr. Bagchi
196
+ - [ ] Polish the repository to be ready for review
197
+ - [ ] Make sure that the tests work
198
+ - [ ] Make sure that the example workflows work correctly
199
+ - [ ] Make sure that they can easily test the code and
200
+ - [ ] Create a conda package that they can easily test the code without having
201
+ to compile flux and flux-sched themselves
202
+
203
+ ## --- Reading List ---
204
+ - [ ] [Agentic Orchestration of HPC Applications](https://vsoch.github.io/assets/posts/agentic-orchestration-hpc-workloads-cloud-sochat-milroy.pdf)
205
+ - [x] [Container Training Slides](https://drive.google.com/drive/folders/1_mTBBc98TEX3XFpNp0rqoqj1VjN9TKoO)
206
+ - [ ] [Containers as Jupyter Kernels](https://docs.nersc.gov/services/jupyter/how-to-guides/#how-to-use-a-container-to-run-a-jupyter-kernel)
207
+ - [ ] [Using SPIN to Run Persistent Containers](https://docs.nersc.gov/services/spin/)
208
+ - [ ] [Using uv to package lammps and flux into pip install???](https://sgoel.dev/posts/building-cython-or-c-extensions-using-uv/)
@@ -0,0 +1,15 @@
1
+ """
2
+ MatEnsemble
3
+
4
+ MatEnsemble is a Python workflow library for building and running
5
+ high-throughput and dependency-aware workflows on HPC systems.
6
+ It lets users define delayed Python and executable chores, connect them
7
+ through data dependencies, and submit them for execution with Flux.
8
+ """
9
+
10
+ __author__ = ["Soumendu Bagchi", "Kaleb Duchesneau"]
11
+ __package__ = "matensemble"
12
+
13
+ # Re-export core data model types at the package root for convenience and for
14
+ # backwards compatibility with code/tests that import from `matensemble`.
15
+ from .model import OutputReference, Resources, ChoreType # noqa: E402,F401
@@ -0,0 +1,139 @@
1
+ from __future__ import annotations
2
+
3
+ import json
4
+
5
+ import networkx as nx
6
+ import shlex
7
+
8
+ from pathlib import Path
9
+
10
+ from matensemble.model import ChoreType, Resources
11
+ from matensemble.utils import _json_safe
12
+
13
+
14
class Chore:
    """
    A :obj:`Chore` is what MatEnsemble is built around. A :obj:`Chore` has one
    of two types: ``PYTHON`` or ``EXECUTABLE``.

    Python chores are delayed function calls that will be submitted to the
    runtime-worker when the :obj:`Chore`'s dependencies are resolved and they
    are scheduled in the queue.

    Executable chores are simply commands that will usually call an executable
    script when the chore is scheduled.
    """

    def __init__(
        self,
        id: str,
        command: str | list[str],
        chore_type: ChoreType,
        resources: Resources,
        workdir: Path,
        func_module: str | None = None,
        func_qualname: str | None = None,
        serialized_callable: bytes | None = None,
        deps: tuple[str, ...] = (),
        args: tuple = (),
        kwargs: dict | None = None,
    ) -> None:
        """
        The constructor for a :obj:`Chore`

        Parameters
        ----------
        id : str
            The ID for the :obj:`Chore`
        command : str, list[str]
            The command that will be run when the :obj:`Chore` is submitted.
            A string is tokenized with :func:`shlex.split`; a list is copied.
        chore_type : ChoreType
            Either PYTHON or EXECUTABLE
        resources : Resources
            An instance of :obj:`Resources` that holds all the information about
            what resources are needed to run the :obj:`Chore`
        workdir : Path
            The Path to the directory where the output of the :obj:`Chore` will
            be handled. It is resolved to an absolute path; a plain string is
            accepted as well.
        func_module : str
            The module where the function definition is if the type of the
            :obj:`Chore` is PYTHON
        func_qualname : str
            The name of the function if the type of the :obj:`Chore` is PYTHON
        serialized_callable : bytes
            The original function that was wrapped, stored as bytes
        deps : tuple[str, ...]
            A tuple of chore IDs that this :obj:`Chore` depends on
        args : tuple
            The arguments to give the function if type is PYTHON
        kwargs : dict
            The key-word arguments to give the function if type is PYTHON

        Raises
        ------
        ValueError
            If ``chore_type`` is PYTHON and neither ``serialized_callable`` nor
            both ``func_module`` and ``func_qualname`` are provided.
        """

        self.id = id
        # Always store the command as a fresh list: strings are split with
        # shell-like rules, other sequences are copied so the caller's list is
        # never shared with this object.
        self.command = (
            shlex.split(command) if isinstance(command, str) else list(command)
        )

        if chore_type == ChoreType.PYTHON:
            # A Python chore must be locatable either by its serialized bytes
            # or by an importable module + qualified name.
            if serialized_callable is None and not (func_module and func_qualname):
                raise ValueError(
                    "Python chores require either serialized_callable or func_module+func_qualname"
                )

        self.chore_type = chore_type
        self.resources = resources
        # Path(...) makes a str workdir work too; for a Path it is a no-op.
        self.workdir = Path(workdir).resolve()
        self.spec_path = self.workdir / "chore.pkl"

        self.func_module = func_module
        self.func_qualname = func_qualname
        self.serialized_callable = serialized_callable

        self.deps = deps
        self.args = args
        # Avoid a shared mutable default: each Chore gets its own dict.
        self.kwargs = {} if kwargs is None else kwargs

    def graph(self) -> nx.DiGraph:
        """
        Return a new, empty directed graph.

        NOTE(review): this looks like a placeholder -- nothing about the
        :obj:`Chore` is added to the graph; confirm the intended behavior.
        """
        return nx.DiGraph()

    def _to_debug_dict(self) -> dict:
        """
        Build a dictionary describing this :obj:`Chore` for debugging output.

        The serialized callable itself is not included; only a boolean flag
        (``has_serialized_callable``) records whether one is present. The
        chore type, environment, args and kwargs are passed through
        ``_json_safe`` before being stored.
        """
        return {
            "id": self.id,
            "command": self.command,
            "chore_type": _json_safe(self.chore_type),
            "resources": {
                "num_tasks": self.resources.num_tasks,
                "cores_per_task": self.resources.cores_per_task,
                "gpus_per_task": self.resources.gpus_per_task,
                "mpi": self.resources.mpi,
                "env": _json_safe(self.resources.env),
                "inherit_env": self.resources.inherit_env,
            },
            "spec_file": str(self.spec_path),
            "func_module": self.func_module,
            "func_qualname": self.func_qualname,
            "has_serialized_callable": self.serialized_callable is not None,
            "deps": list(self.deps),
            "args": _json_safe(self.args),
            "kwargs": _json_safe(self.kwargs),
        }

    def _write_debug_json(self) -> None:
        """
        Write the debug description of the :obj:`Chore` to ``chore.json``.

        The file is placed in the same directory as ``spec_path`` (the
        ``chore.pkl`` location); the directory is created if it is missing.
        """

        debug_file = self.spec_path.parent / "chore.json"
        debug_file.parent.mkdir(parents=True, exist_ok=True)
        with debug_file.open("w", encoding="utf-8") as f:
            json.dump(self._to_debug_dict(), f, indent=2)

    def __str__(self) -> str:
        """
        Return the :obj:`Chore` as a JSON string.
        """

        return json.dumps(self._to_debug_dict(), indent=2)