matensemble 0.2.1__tar.gz → 0.3.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44) hide show
  1. {matensemble-0.2.1 → matensemble-0.3.3}/PKG-INFO +9 -26
  2. {matensemble-0.2.1 → matensemble-0.3.3}/README.md +8 -25
  3. {matensemble-0.2.1 → matensemble-0.3.3}/pyproject.toml +1 -1
  4. {matensemble-0.2.1 → matensemble-0.3.3}/src/matensemble/TODO.md +7 -2
  5. {matensemble-0.2.1 → matensemble-0.3.3}/src/matensemble/chore.py +25 -22
  6. {matensemble-0.2.1 → matensemble-0.3.3}/src/matensemble/fluxlet.py +1 -1
  7. {matensemble-0.2.1 → matensemble-0.3.3}/src/matensemble/manager.py +63 -0
  8. {matensemble-0.2.1 → matensemble-0.3.3}/src/matensemble/model.py +10 -2
  9. {matensemble-0.2.1 → matensemble-0.3.3}/src/matensemble/pipeline.py +369 -48
  10. {matensemble-0.2.1 → matensemble-0.3.3}/src/matensemble/runtime_worker.py +9 -42
  11. {matensemble-0.2.1 → matensemble-0.3.3}/src/matensemble/strategy.py +161 -12
  12. {matensemble-0.2.1 → matensemble-0.3.3}/LICENSE +0 -0
  13. {matensemble-0.2.1 → matensemble-0.3.3}/src/matensemble/.python-version +0 -0
  14. {matensemble-0.2.1 → matensemble-0.3.3}/src/matensemble/README.md +0 -0
  15. {matensemble-0.2.1 → matensemble-0.3.3}/src/matensemble/__init__.py +0 -0
  16. {matensemble-0.2.1 → matensemble-0.3.3}/src/matensemble/dash/assets/index-1X2cLUgt.js +0 -0
  17. {matensemble-0.2.1 → matensemble-0.3.3}/src/matensemble/dash/assets/index-DRkGfWlx.css +0 -0
  18. {matensemble-0.2.1 → matensemble-0.3.3}/src/matensemble/dash/index.html +0 -0
  19. {matensemble-0.2.1 → matensemble-0.3.3}/src/matensemble/dash/vite.svg +0 -0
  20. {matensemble-0.2.1 → matensemble-0.3.3}/src/matensemble/dynopro/__init__.py +0 -0
  21. {matensemble-0.2.1 → matensemble-0.3.3}/src/matensemble/dynopro/driver.py +0 -0
  22. {matensemble-0.2.1 → matensemble-0.3.3}/src/matensemble/dynopro/ensemble.py +0 -0
  23. {matensemble-0.2.1 → matensemble-0.3.3}/src/matensemble/dynopro/postprocessors/__init__.py +0 -0
  24. {matensemble-0.2.1 → matensemble-0.3.3}/src/matensemble/dynopro/postprocessors/bispectrum_calculator.py +0 -0
  25. {matensemble-0.2.1 → matensemble-0.3.3}/src/matensemble/dynopro/postprocessors/compute_diffraction.py +0 -0
  26. {matensemble-0.2.1 → matensemble-0.3.3}/src/matensemble/dynopro/postprocessors/compute_order_from_pairs.py +0 -0
  27. {matensemble-0.2.1 → matensemble-0.3.3}/src/matensemble/dynopro/postprocessors/compute_twist.py +0 -0
  28. {matensemble-0.2.1 → matensemble-0.3.3}/src/matensemble/dynopro/postprocessors/correlations.py +0 -0
  29. {matensemble-0.2.1 → matensemble-0.3.3}/src/matensemble/dynopro/postprocessors/ovito_calculators.py +0 -0
  30. {matensemble-0.2.1 → matensemble-0.3.3}/src/matensemble/dynopro/task_lib/AnalysisSubprocess.py +0 -0
  31. {matensemble-0.2.1 → matensemble-0.3.3}/src/matensemble/dynopro/task_lib/AnalysysDescriptor.py +0 -0
  32. {matensemble-0.2.1 → matensemble-0.3.3}/src/matensemble/dynopro/task_lib/MDSubprocess.py +0 -0
  33. {matensemble-0.2.1 → matensemble-0.3.3}/src/matensemble/dynopro/task_lib/__init__.py +0 -0
  34. {matensemble-0.2.1 → matensemble-0.3.3}/src/matensemble/dynopro/task_lib/analysis_registry.py +0 -0
  35. {matensemble-0.2.1 → matensemble-0.3.3}/src/matensemble/dynopro/utils/__init__.py +0 -0
  36. {matensemble-0.2.1 → matensemble-0.3.3}/src/matensemble/dynopro/utils/lammps_init.py +0 -0
  37. {matensemble-0.2.1 → matensemble-0.3.3}/src/matensemble/dynopro/utils/preprocessors.py +0 -0
  38. {matensemble-0.2.1 → matensemble-0.3.3}/src/matensemble/dynopro/utils/stat.py +0 -0
  39. {matensemble-0.2.1 → matensemble-0.3.3}/src/matensemble/dynopro/utils/stress_rotate_z_theta.py +0 -0
  40. {matensemble-0.2.1 → matensemble-0.3.3}/src/matensemble/logger.py +0 -0
  41. {matensemble-0.2.1 → matensemble-0.3.3}/src/matensemble/redis/__init__.py +0 -0
  42. {matensemble-0.2.1 → matensemble-0.3.3}/src/matensemble/redis/service.py +0 -0
  43. {matensemble-0.2.1 → matensemble-0.3.3}/src/matensemble/redis/test.py +0 -0
  44. {matensemble-0.2.1 → matensemble-0.3.3}/src/matensemble/utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: matensemble
3
- Version: 0.2.1
3
+ Version: 0.3.3
4
4
  Summary: An adaptive and highly asynchronous ensemble simulation workflow manager MatEnsemble (https://github.com/Q-CAD/MatEnsemble) built jointly on top of the hierarchical graph based scheduler FLUX and concurrent-futures infrastructure of python
5
5
  Author: Soumendu Bagchi, Kaleb Duchesneau
6
6
  Author-email: Soumendu Bagchi <soumendubagchi@gmail.com>, Kaleb Duchesneau <kalebduchesneau@gmail.com>
@@ -32,17 +32,17 @@ Description-Content-Type: text/markdown
32
32
 
33
33
  # MatEnsemble
34
34
 
35
- MatEnsemble is a Python library for **high-throughput workflows** on HPC systems. You define a directed acyclic graph (DAG) of tasks—**Python callables** or **executable commands**—and MatEnsemble submits work through **[Flux](https://flux-framework.readthedocs.io/)**, tracks completions, **adapts** scheduling to free CPUs and GPUs, and writes structured logs and per-chore output directories.
35
+ MatEnsemble is a Python library for **high-throughput workflows** on HPC systems. You define a directed acyclic graph (DAG) of chores—**Python callables** or **executable commands**—and MatEnsemble submits work through **[Flux](https://flux-framework.readthedocs.io/)**, tracks completions, **adapts** scheduling to free CPUs and GPUs, and writes structured logs and per-chore output directories.
36
36
 
37
37
  An optional in-tree **dynopro** stack supports streaming dynamics and on-the-fly analysis for advanced materials simulation workflows.
38
38
 
39
39
  ## Features
40
40
 
41
41
  - **DAG-based workflows** with dependencies via deferred return values (`OutputReference`)
42
- - **Adaptive scheduling** that back-fills the allocation as tasks finish (with a non-adaptive mode when you need it)
42
+ - **Adaptive scheduling** that back-fills the allocation as chores finish (with a non-adaptive mode available)
43
43
  - **Two chore types**: Python chores (remotely unpickled and executed by `matensemble.runtime_worker`) and argv-style **executable** chores
44
44
  - **Resource requests**: tasks, cores per task, GPUs per task, optional MPI (`pmi2`) via Flux
45
- - **Observability**: `status.json`, `matensemble_workflow.log`, per-chore `stdout` / `stderr`, pickle and JSON result artifacts; optional **web dashboard** (FastAPI on port 8000)
45
+ - **Observability**: `status.json`, `matensemble_workflow.log`, per-chore `stdout` / `stderr`, pickle and JSON result artifacts; optional **web dashboard**
46
46
 
47
47
  <p align="center">
48
48
  <img src="images/Cap_1_adaptive_task_management.png" alt="Adaptive task management" width="620" />
@@ -52,41 +52,24 @@ An optional in-tree **dynopro** stack supports streaming dynamics and on-the-fly
52
52
  <img src="images/Cap_2_dynopro.png" alt="On-the-fly dynamics and analysis" width="620" />
53
53
  </p>
54
54
 
55
- ## Documentation
56
-
57
- Documentation (overview, architecture, tutorials, API reference):
58
-
59
- **[matensemble.readthedocs.io](https://matensemble.readthedocs.io/en/latest/)**
60
55
 
61
56
  ## Installation
62
57
 
63
- ### Containers (recommended on many clusters)
64
-
65
- OCI images are published to GitHub Container Registry, for example:
58
+ OCI images are published to GitHub Container Registry:
66
59
 
67
60
  `ghcr.io/freddude2004/matensemble:baseline-vX.Y.Z`
68
61
 
69
62
  See the [container packages](https://github.com/FredDude2004/MatEnsemble/pkgs/container/matensemble) and the [Quick start](https://matensemble.readthedocs.io/en/latest/quickstart.html) in the docs for Apptainer/Singularity and site-specific notes.
70
63
 
71
- ### Development install
72
-
73
- From a clone of this repository:
64
+ ### Anaconda
74
65
 
75
- ```bash
76
- uv sync
77
- uv sync --group dev # optional: docs and pytest tooling
78
- uv run pytest
79
- ```
80
-
81
- Or with pip:
66
+ You can build a Conda environment with MatEnsemble and dependencies installed using the environment.yaml file.
82
67
 
83
68
  ```bash
84
- pip install -e ".[flux]"
69
+ conda env create -f environment.yaml
85
70
  ```
86
71
 
87
- Site-specific Conda-style environment files live under `scripts/` (for example `scripts/baseline/environment.yaml`, `scripts/frontier/`, `scripts/perlmuter/`). Align Python with **3.12+** and Flux with your center’s modules.
88
-
89
- ## Quick example
72
+ ## Example
90
73
 
91
74
  ```python
92
75
  from matensemble.pipeline import Pipeline
@@ -9,17 +9,17 @@
9
9
 
10
10
  # MatEnsemble
11
11
 
12
- MatEnsemble is a Python library for **high-throughput workflows** on HPC systems. You define a directed acyclic graph (DAG) of tasks—**Python callables** or **executable commands**—and MatEnsemble submits work through **[Flux](https://flux-framework.readthedocs.io/)**, tracks completions, **adapts** scheduling to free CPUs and GPUs, and writes structured logs and per-chore output directories.
12
+ MatEnsemble is a Python library for **high-throughput workflows** on HPC systems. You define a directed acyclic graph (DAG) of chores—**Python callables** or **executable commands**—and MatEnsemble submits work through **[Flux](https://flux-framework.readthedocs.io/)**, tracks completions, **adapts** scheduling to free CPUs and GPUs, and writes structured logs and per-chore output directories.
13
13
 
14
14
  An optional in-tree **dynopro** stack supports streaming dynamics and on-the-fly analysis for advanced materials simulation workflows.
15
15
 
16
16
  ## Features
17
17
 
18
18
  - **DAG-based workflows** with dependencies via deferred return values (`OutputReference`)
19
- - **Adaptive scheduling** that back-fills the allocation as tasks finish (with a non-adaptive mode when you need it)
19
+ - **Adaptive scheduling** that back-fills the allocation as chores finish (with a non-adaptive mode available)
20
20
  - **Two chore types**: Python chores (remotely unpickled and executed by `matensemble.runtime_worker`) and argv-style **executable** chores
21
21
  - **Resource requests**: tasks, cores per task, GPUs per task, optional MPI (`pmi2`) via Flux
22
- - **Observability**: `status.json`, `matensemble_workflow.log`, per-chore `stdout` / `stderr`, pickle and JSON result artifacts; optional **web dashboard** (FastAPI on port 8000)
22
+ - **Observability**: `status.json`, `matensemble_workflow.log`, per-chore `stdout` / `stderr`, pickle and JSON result artifacts; optional **web dashboard**
23
23
 
24
24
  <p align="center">
25
25
  <img src="images/Cap_1_adaptive_task_management.png" alt="Adaptive task management" width="620" />
@@ -29,41 +29,24 @@ An optional in-tree **dynopro** stack supports streaming dynamics and on-the-fly
29
29
  <img src="images/Cap_2_dynopro.png" alt="On-the-fly dynamics and analysis" width="620" />
30
30
  </p>
31
31
 
32
- ## Documentation
33
-
34
- Documentation (overview, architecture, tutorials, API reference):
35
-
36
- **[matensemble.readthedocs.io](https://matensemble.readthedocs.io/en/latest/)**
37
32
 
38
33
  ## Installation
39
34
 
40
- ### Containers (recommended on many clusters)
41
-
42
- OCI images are published to GitHub Container Registry, for example:
35
+ OCI images are published to GitHub Container Registry:
43
36
 
44
37
  `ghcr.io/freddude2004/matensemble:baseline-vX.Y.Z`
45
38
 
46
39
  See the [container packages](https://github.com/FredDude2004/MatEnsemble/pkgs/container/matensemble) and the [Quick start](https://matensemble.readthedocs.io/en/latest/quickstart.html) in the docs for Apptainer/Singularity and site-specific notes.
47
40
 
48
- ### Development install
49
-
50
- From a clone of this repository:
41
+ ### Anaconda
51
42
 
52
- ```bash
53
- uv sync
54
- uv sync --group dev # optional: docs and pytest tooling
55
- uv run pytest
56
- ```
57
-
58
- Or with pip:
43
+ You can build a Conda environment with MatEnsemble and dependencies installed using the environment.yaml file.
59
44
 
60
45
  ```bash
61
- pip install -e ".[flux]"
46
+ conda env create -f environment.yaml
62
47
  ```
63
48
 
64
- Site-specific Conda-style environment files live under `scripts/` (for example `scripts/baseline/environment.yaml`, `scripts/frontier/`, `scripts/perlmuter/`). Align Python with **3.12+** and Flux with your center’s modules.
65
-
66
- ## Quick example
49
+ ## Example
67
50
 
68
51
  ```python
69
52
  from matensemble.pipeline import Pipeline
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "matensemble"
3
- version = "0.2.1"
3
+ version = "0.3.3"
4
4
  description = "An adaptive and highly asynchronous ensemble simulation workflow manager MatEnsemble (https://github.com/Q-CAD/MatEnsemble) built jointly on top of the hierarchical graph based scheduler FLUX and concurrent-futures infrastructure of python"
5
5
  readme = "README.md"
6
6
  license-files = ["LICENSE"]
@@ -149,11 +149,11 @@
149
149
  - [x] Update README.md
150
150
  - [x] Create multi architecture builds for baseline image
151
151
  - [x] Run release script
152
+ - [x] Try the MPI Hello examples in perlmutter image and see if they work
152
153
 
153
154
  ## --- Create new strategy to enable autonomous workflows ---
154
155
  - [x] Update the OutputReference objects to have the ability to get the results
155
156
  - [x] Create a method in the Pipeline to be able to get the results of all chores
156
- - [x] Make a strategy that can take in chore and does processing which spawns a new chore
157
157
  - [x] Figure out how to spawn a new chore
158
158
  - [x] Change to only use cloudpickle and only pickle the actual function once
159
159
  - [x] Change the chore objects to not store the function
@@ -162,9 +162,14 @@
162
162
  - [x] Add a set of OutputReference objects in the pipeline
163
163
  - [x] Connect the added chores back to the pipeline somehow
164
164
  - [x] Add a method in pipeline where you can get the results of all of your chores
165
+ - [x] Make the Pipeline.submit() function asynchronous
166
+ - [x] Factor out the common pieces of the FutureProcessingStrategy into the base class
167
+ - [x] Add a UserStrategy class
168
+ - [x] Make a strategy that can take in chore and does processing which spawns a new chore
169
+ - [x] Make a method in the pipeline that constructs a UserStrategy
165
170
  - [x] Ping Neil about MPICH
166
171
  - [x] Convert Scaffold into PowerPoint Presentation
167
- - [x] Email coordinatior about length of presentation and audience
172
+ - [x] Email coordinator about length of presentation and audience
168
173
 
169
174
  ## === AFTER EVERYTHING ABOVE IS DONE AND STABLE ===
170
175
 
@@ -11,6 +11,23 @@ from matensemble.model import ChoreType, Resources
11
11
  from matensemble.utils import _json_safe
12
12
 
13
13
 
14
+ class ChoreSpec:
15
+ """
16
+ The specification of a :obj:`Chore`
17
+
18
+ Holds the arguments, key-word
19
+ arguments and the name of the chore that you want those arguments to be
20
+ passed to. This class is used by the user when creating a UserStrategy
21
+ that does processing on completed chores and can spawn new chores.
22
+ """
23
+
24
+ def __init__(self, args, kwargs, qualname, resources: Resources) -> None:
25
+ self.args = args
26
+ self.kwargs = kwargs
27
+ self.qualname = qualname
28
+ self.resources = resources
29
+
30
+
14
31
  class Chore:
15
32
  """
16
33
  A :obj:`Chore` is what MatEnsemble is built around. :obj:`Job`'s can have two
@@ -28,13 +45,11 @@ class Chore:
28
45
  def __init__(
29
46
  self,
30
47
  id: str,
48
+ workdir: Path,
31
49
  command: str | list[str],
32
- chore_type: ChoreType,
50
+ chore_type: ChoreType | int,
33
51
  resources: Resources,
34
- workdir: Path,
35
- func_module: str | None = None,
36
- func_qualname: str | None = None,
37
- serialized_callable: bytes | None = None,
52
+ chore_qualname: str | None = None,
38
53
  deps: tuple[str, ...] = (),
39
54
  args: tuple = (),
40
55
  kwargs: dict | None = None,
@@ -76,21 +91,12 @@ class Chore:
76
91
  shlex.split(command) if isinstance(command, str) else list(command)
77
92
  )
78
93
 
79
- if chore_type == ChoreType.PYTHON:
80
- if serialized_callable is None and not (func_module and func_qualname):
81
- raise ValueError(
82
- "Python chores require either serialized_callable or func_module+func_qualname"
83
- )
84
-
85
94
  self.chore_type = chore_type
86
95
  self.resources = resources
87
96
  self.workdir = workdir.resolve()
88
- self.spec_path = self.workdir / "chore.pkl"
97
+ self.spec_path = self.workdir / "chore.pickle"
89
98
 
90
- self.func_module = func_module
91
- self.func_qualname = func_qualname
92
- self.serialized_callable = serialized_callable
93
-
99
+ self.chore_qualname = chore_qualname
94
100
  self.deps = deps
95
101
  self.args = args
96
102
  self.kwargs = {} if kwargs is None else kwargs
@@ -111,22 +117,19 @@ class Chore:
111
117
  "env": _json_safe(self.resources.env),
112
118
  "inherit_env": self.resources.inherit_env,
113
119
  },
114
- "spec_file": str(self.spec_path),
115
- "func_module": self.func_module,
116
- "func_qualname": self.func_qualname,
117
- "has_serialized_callable": self.serialized_callable is not None,
120
+ "chore_qualname": self.chore_qualname,
118
121
  "deps": list(self.deps),
119
122
  "args": _json_safe(self.args),
120
123
  "kwargs": _json_safe(self.kwargs),
121
124
  }
122
125
 
123
- def _write_debug_json(self) -> None:
126
+ def _write_metadata(self) -> None:
124
127
  """
125
128
  The :obj:`Chore` is pickled at runtime to be used later on, but it is also
126
129
  written as json for debugging.
127
130
  """
128
131
 
129
- debug_file = self.spec_path.parent / "chore.json"
132
+ debug_file = self.spec_path.parent / "metadata.json"
130
133
  debug_file.parent.mkdir(parents=True, exist_ok=True)
131
134
  with debug_file.open("w") as f:
132
135
  json.dump(self._to_debug_dict(), f, indent=2)
@@ -148,7 +148,7 @@ class Fluxlet:
148
148
  jobspec.environment = base_env
149
149
 
150
150
  # helpful for debugging
151
- chore._write_debug_json()
151
+ chore._write_metadata()
152
152
 
153
153
  # only set this if you truly want every chore to span a fixed node count
154
154
  if nnodes is not None:
@@ -485,6 +485,69 @@ class FluxManager:
485
485
  and len(self._blocked) == 0
486
486
  )
487
487
 
488
+ def _add_chore(self, chore: Chore) -> bool:
489
+ """
490
+ Add a UserStrategy spawned chore to the queue.
491
+
492
+ Returns
493
+ -------
494
+ bool
495
+ True if *chore* was admitted to the manager, False if it was rejected.
496
+ """
497
+
498
+ if not self._chore_fits_allocation(chore):
499
+ self._record_failure(chore.id, reason="chore_exceeds_allocation")
500
+ self._logger.error(
501
+ "CHORE INVALID: chore=%s requires more resources than the allocation can provide",
502
+ chore.id,
503
+ )
504
+ self._fail_dependents(chore.id)
505
+ return False
506
+
507
+ if chore.id in self._chores_by_id:
508
+ self._logger.error(
509
+ "CHORE DUPLICATE: chore=%s already exists, rejecting spawn",
510
+ chore.id,
511
+ )
512
+ return False
513
+
514
+ for dep in chore.deps:
515
+ if dep not in self._chores_by_id:
516
+ self._record_failure(
517
+ chore.id, reason="unknown_dependency", upstream=dep
518
+ )
519
+ self._logger.error(
520
+ "CHORE INVALID: chore=%s has unknown dependency %s",
521
+ chore.id,
522
+ dep,
523
+ )
524
+ return False
525
+ if self._has_failed(dep):
526
+ self._record_failure(chore.id, reason="dependency_failed", upstream=dep)
527
+ self._logger.error(
528
+ "CHORE SKIPPED: chore=%s because dependency %s already failed",
529
+ chore.id,
530
+ dep,
531
+ )
532
+ return False
533
+
534
+ self._chores_by_id[chore.id] = chore
535
+ self._dependents.setdefault(chore.id, [])
536
+
537
+ remaining = sum(1 for dep in chore.deps if dep not in self._completed_chores)
538
+ self._remaining_deps[chore.id] = remaining
539
+
540
+ for dep in chore.deps:
541
+ self._dependents.setdefault(dep, []).append(chore.id)
542
+
543
+ if remaining == 0:
544
+ self._ready.appendleft(chore.id)
545
+ self._blocked.discard(chore.id)
546
+ else:
547
+ self._blocked.add(chore.id)
548
+
549
+ return True
550
+
488
551
  def run(
489
552
  self,
490
553
  buffer_time: float = 1.0,
@@ -18,14 +18,22 @@ class OutputReference:
18
18
  """
19
19
  Return the deserialized result of the referenced chore as a string.
20
20
  """
21
- # TODO: Make sure that this file exists and add some exception handling
22
- dep_result = self.workdir / "result.pkl"
21
+
22
+ dep_result = self.workdir / "result.pickle"
23
23
  try:
24
24
  with dep_result.open("rb") as f:
25
25
  return str(pickle.load(f))
26
26
  except Exception as e:
27
27
  return f"Error: Could not open result of chore: {self.chore_id} becuase of the following exception: {e}"
28
28
 
29
+ def result(self):
30
+ dep_result = self.workdir / "result.pickle"
31
+ try:
32
+ with dep_result.open("rb") as f:
33
+ return pickle.load(f)
34
+ except Exception as e:
35
+ return f"Error: Could not open result of chore: {self.chore_id} becuase of the following exception: {e}"
36
+
29
37
 
30
38
  @dataclass
31
39
  class Resources: