executorlib 0.0.4__tar.gz → 0.0.5__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- executorlib-0.0.5/PKG-INFO +229 -0
- executorlib-0.0.5/README.md +154 -0
- {executorlib-0.0.4 → executorlib-0.0.5}/executorlib/__init__.py +87 -68
- {executorlib-0.0.4 → executorlib-0.0.5}/executorlib/_version.py +3 -3
- {executorlib-0.0.4 → executorlib-0.0.5}/executorlib/backend/cache_parallel.py +1 -1
- executorlib-0.0.5/executorlib/backend/cache_serial.py +6 -0
- {executorlib-0.0.4 → executorlib-0.0.5}/executorlib/backend/interactive_parallel.py +2 -2
- {executorlib-0.0.4 → executorlib-0.0.5}/executorlib/backend/interactive_serial.py +2 -2
- executorlib-0.0.5/executorlib/base/executor.py +163 -0
- executorlib-0.0.5/executorlib/cache/__init__.py +0 -0
- executorlib-0.0.5/executorlib/cache/backend.py +68 -0
- executorlib-0.0.5/executorlib/cache/executor.py +118 -0
- executorlib-0.0.5/executorlib/cache/queue_spawner.py +109 -0
- {executorlib-0.0.4 → executorlib-0.0.5}/executorlib/cache/shared.py +44 -131
- executorlib-0.0.5/executorlib/cache/subprocess_spawner.py +65 -0
- executorlib-0.0.5/executorlib/interactive/__init__.py +0 -0
- executorlib-0.0.5/executorlib/interactive/executor.py +289 -0
- {executorlib-0.0.4 → executorlib-0.0.5}/executorlib/interactive/flux.py +3 -3
- executorlib-0.0.4/executorlib/shared/executor.py → executorlib-0.0.5/executorlib/interactive/shared.py +226 -240
- {executorlib-0.0.4/executorlib/shared → executorlib-0.0.5/executorlib/standalone}/__init__.py +3 -5
- executorlib-0.0.5/executorlib/standalone/command.py +14 -0
- {executorlib-0.0.4/executorlib/cache → executorlib-0.0.5/executorlib/standalone}/hdf.py +10 -1
- {executorlib-0.0.4/executorlib/shared → executorlib-0.0.5/executorlib/standalone}/inputcheck.py +64 -19
- executorlib-0.0.5/executorlib/standalone/interactive/__init__.py +0 -0
- {executorlib-0.0.4/executorlib/shared → executorlib-0.0.5/executorlib/standalone/interactive}/communication.py +12 -6
- {executorlib-0.0.4/executorlib/shared → executorlib-0.0.5/executorlib/standalone/interactive}/spawner.py +4 -1
- executorlib-0.0.5/executorlib/standalone/queue.py +19 -0
- executorlib-0.0.5/executorlib/standalone/serialize.py +82 -0
- executorlib-0.0.5/executorlib.egg-info/PKG-INFO +229 -0
- {executorlib-0.0.4 → executorlib-0.0.5}/executorlib.egg-info/SOURCES.txt +20 -13
- executorlib-0.0.5/executorlib.egg-info/requires.txt +27 -0
- {executorlib-0.0.4 → executorlib-0.0.5}/pyproject.toml +18 -6
- {executorlib-0.0.4 → executorlib-0.0.5}/tests/test_cache_executor_mpi.py +10 -7
- executorlib-0.0.5/tests/test_cache_executor_pysqa_flux.py +49 -0
- executorlib-0.0.5/tests/test_cache_executor_serial.py +193 -0
- {executorlib-0.0.4 → executorlib-0.0.5}/tests/test_cache_hdf.py +34 -5
- {executorlib-0.0.4 → executorlib-0.0.5}/tests/test_cache_shared.py +19 -22
- {executorlib-0.0.4 → executorlib-0.0.5}/tests/test_dependencies_executor.py +19 -14
- {executorlib-0.0.4 → executorlib-0.0.5}/tests/test_executor_backend_flux.py +9 -9
- executorlib-0.0.5/tests/test_executor_backend_mpi.py +112 -0
- {executorlib-0.0.4 → executorlib-0.0.5}/tests/test_executor_backend_mpi_noblock.py +22 -17
- {executorlib-0.0.4 → executorlib-0.0.5}/tests/test_flux_executor.py +3 -2
- {executorlib-0.0.4 → executorlib-0.0.5}/tests/test_integration_pyiron_workflow.py +7 -7
- {executorlib-0.0.4 → executorlib-0.0.5}/tests/test_local_executor.py +75 -34
- {executorlib-0.0.4 → executorlib-0.0.5}/tests/test_local_executor_future.py +8 -6
- executorlib-0.0.5/tests/test_pysqa_subprocess.py +45 -0
- {executorlib-0.0.4 → executorlib-0.0.5}/tests/test_shared_backend.py +2 -2
- {executorlib-0.0.4 → executorlib-0.0.5}/tests/test_shared_communication.py +3 -3
- {executorlib-0.0.4 → executorlib-0.0.5}/tests/test_shared_executorbase.py +1 -1
- executorlib-0.0.5/tests/test_shared_input_check.py +120 -0
- {executorlib-0.0.4 → executorlib-0.0.5}/tests/test_shared_thread.py +1 -1
- executorlib-0.0.5/tests/test_shell_executor.py +149 -0
- executorlib-0.0.5/tests/test_shell_interactive.py +129 -0
- executorlib-0.0.4/PKG-INFO +0 -190
- executorlib-0.0.4/README.md +0 -127
- executorlib-0.0.4/executorlib/backend/cache_serial.py +0 -6
- executorlib-0.0.4/executorlib/cache/executor.py +0 -36
- executorlib-0.0.4/executorlib/interactive/__init__.py +0 -152
- executorlib-0.0.4/executorlib/interactive/dependencies.py +0 -118
- executorlib-0.0.4/executorlib/interactive/executor.py +0 -112
- executorlib-0.0.4/executorlib/shell/__init__.py +0 -7
- executorlib-0.0.4/executorlib/shell/executor.py +0 -96
- executorlib-0.0.4/executorlib/shell/interactive.py +0 -184
- executorlib-0.0.4/executorlib.egg-info/PKG-INFO +0 -190
- executorlib-0.0.4/executorlib.egg-info/requires.txt +0 -14
- executorlib-0.0.4/tests/test_cache_executor_serial.py +0 -110
- executorlib-0.0.4/tests/test_executor_backend_mpi.py +0 -80
- executorlib-0.0.4/tests/test_shared_input_check.py +0 -71
- executorlib-0.0.4/tests/test_shell_executor.py +0 -86
- executorlib-0.0.4/tests/test_shell_interactive.py +0 -70
- {executorlib-0.0.4 → executorlib-0.0.5}/LICENSE +0 -0
- {executorlib-0.0.4 → executorlib-0.0.5}/MANIFEST.in +0 -0
- {executorlib-0.0.4 → executorlib-0.0.5}/executorlib/backend/__init__.py +0 -0
- {executorlib-0.0.4/executorlib/cache → executorlib-0.0.5/executorlib/base}/__init__.py +0 -0
- {executorlib-0.0.4/executorlib → executorlib-0.0.5/executorlib/standalone}/interactive/backend.py +0 -0
- {executorlib-0.0.4/executorlib/shared → executorlib-0.0.5/executorlib/standalone}/plot.py +0 -0
- {executorlib-0.0.4/executorlib/shared → executorlib-0.0.5/executorlib/standalone}/thread.py +0 -0
- {executorlib-0.0.4 → executorlib-0.0.5}/executorlib.egg-info/dependency_links.txt +0 -0
- {executorlib-0.0.4 → executorlib-0.0.5}/executorlib.egg-info/top_level.txt +0 -0
- {executorlib-0.0.4 → executorlib-0.0.5}/setup.cfg +0 -0
- {executorlib-0.0.4 → executorlib-0.0.5}/setup.py +0 -0
- {executorlib-0.0.4 → executorlib-0.0.5}/tests/test_backend_serial.py +0 -0
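Taken together, the moves above amount to a package reorganization: the `executorlib.shared` helpers now live under `executorlib.standalone`, the `executorlib.shell` executors are removed, and a new `executorlib.cache` subpackage (backend, executor, queue and subprocess spawners) carries the file-based execution. For downstream code that imported the moved helpers directly, a minimal compatibility sketch is shown below; it assumes the moved modules keep their public function names, which the `executorlib/__init__.py` diff further down confirms for `check_refresh_rate`.

```python
# Hedged compatibility sketch for code that imported helpers from the old
# executorlib.shared location; 0.0.5 moves them to executorlib.standalone
# (see the file list above). Assumes the function names are unchanged.
try:
    # executorlib >= 0.0.5
    from executorlib.standalone.inputcheck import check_refresh_rate
except ImportError:
    # executorlib <= 0.0.4
    from executorlib.shared.inputcheck import check_refresh_rate
```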
executorlib-0.0.5/PKG-INFO
````diff
@@ -0,0 +1,229 @@
+Metadata-Version: 2.1
+Name: executorlib
+Version: 0.0.5
+Summary: Scale serial and MPI-parallel python functions over hundreds of compute nodes all from within a jupyter notebook or serial python process.
+Author-email: Jan Janssen <janssen@lanl.gov>
+License: BSD 3-Clause License
+
+Copyright (c) 2022, Jan Janssen
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+* Redistributions of source code must retain the above copyright notice, this
+  list of conditions and the following disclaimer.
+
+* Redistributions in binary form must reproduce the above copyright notice,
+  this list of conditions and the following disclaimer in the documentation
+  and/or other materials provided with the distribution.
+
+* Neither the name of the copyright holder nor the names of its
+  contributors may be used to endorse or promote products derived from
+  this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+Project-URL: Homepage, https://github.com/pyiron/executorlib
+Project-URL: Documentation, https://executorlib.readthedocs.io
+Project-URL: Repository, https://github.com/pyiron/executorlib
+Keywords: pyiron
+Classifier: Development Status :: 5 - Production/Stable
+Classifier: Topic :: Scientific/Engineering :: Physics
+Classifier: License :: OSI Approved :: BSD License
+Classifier: Intended Audience :: Science/Research
+Classifier: Operating System :: OS Independent
+Classifier: Programming Language :: Python :: 3.9
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Programming Language :: Python :: 3.13
+Requires-Python: <3.14,>=3.9
+Description-Content-Type: text/markdown
+License-File: LICENSE
+Requires-Dist: cloudpickle<=3.1.0,>=2.0.0
+Requires-Dist: pyzmq<=26.2.0,>=25.0.0
+Provides-Extra: cache
+Requires-Dist: h5py<=3.12.1,>=3.6.0; extra == "cache"
+Provides-Extra: graph
+Requires-Dist: pygraphviz<=1.14,>=1.10; extra == "graph"
+Requires-Dist: matplotlib<=3.9.2,>=3.5.3; extra == "graph"
+Requires-Dist: networkx<=3.4.2,>=2.8.8; extra == "graph"
+Requires-Dist: ipython<=8.29.0,>=7.33.0; extra == "graph"
+Provides-Extra: mpi
+Requires-Dist: mpi4py<=4.0.1,>=3.1.4; extra == "mpi"
+Provides-Extra: submission
+Requires-Dist: pysqa==0.2.2; extra == "submission"
+Requires-Dist: h5py<=3.12.1,>=3.6.0; extra == "submission"
+Provides-Extra: all
+Requires-Dist: mpi4py<=4.0.1,>=3.1.4; extra == "all"
+Requires-Dist: pysqa==0.2.2; extra == "all"
+Requires-Dist: h5py<=3.12.1,>=3.6.0; extra == "all"
+Requires-Dist: pygraphviz<=1.14,>=1.10; extra == "all"
+Requires-Dist: matplotlib<=3.9.2,>=3.5.3; extra == "all"
+Requires-Dist: networkx<=3.4.2,>=2.8.8; extra == "all"
+Requires-Dist: ipython<=8.29.0,>=7.33.0; extra == "all"
+
+# executorlib
+[](https://github.com/pyiron/executorlib/actions/workflows/unittest-openmpi.yml)
+[](https://coveralls.io/github/pyiron/executorlib?branch=main)
+[](https://mybinder.org/v2/gh/pyiron/executorlib/HEAD?labpath=notebooks%2Fexamples.ipynb)
+
+Up-scale Python functions for high performance computing (HPC) with executorlib.
+
+## Key Features
+* **Up-scale your Python functions beyond a single computer.** - executorlib extends the [Executor interface](https://docs.python.org/3/library/concurrent.futures.html#executor-objects)
+  from the Python standard library and combines it with job schedulers for high performance computing (HPC) including
+  the [Simple Linux Utility for Resource Management (SLURM)](https://slurm.schedmd.com) and [flux](http://flux-framework.org).
+  With this combination executorlib allows users to distribute their Python functions over multiple compute nodes.
+* **Parallelize your Python program one function at a time** - executorlib allows users to assign dedicated computing
+  resources like CPU cores, threads or GPUs to one Python function call at a time. So you can accelerate your Python
+  code function by function.
+* **Permanent caching of intermediate results to accelerate rapid prototyping** - To accelerate the development of
+  machine learning pipelines and simulation workflows executorlib provides optional caching of intermediate results for
+  iterative development in interactive environments like jupyter notebooks.
+
+## Examples
+The Python standard library provides the [Executor interface](https://docs.python.org/3/library/concurrent.futures.html#executor-objects)
+with the [ProcessPoolExecutor](https://docs.python.org/3/library/concurrent.futures.html#processpoolexecutor) and the
+[ThreadPoolExecutor](https://docs.python.org/3/library/concurrent.futures.html#threadpoolexecutor) for parallel
+execution of Python functions on a single computer. executorlib extends this functionality to distribute Python
+functions over multiple computers within a high performance computing (HPC) cluster. This can be achieved either by
+submitting each function as an individual job to the HPC job scheduler - [HPC Submission Mode](https://executorlib.readthedocs.io/en/latest/2-hpc-submission.html) -
+or by requesting a compute allocation of multiple nodes and then distributing the Python functions within this allocation -
+[HPC Allocation Mode](https://executorlib.readthedocs.io/en/latest/3-hpc-allocation.html). Finally, to accelerate the
+development process executorlib also provides a [Local Mode](https://executorlib.readthedocs.io/en/latest/1-local.html)
+to use the executorlib functionality on a single workstation for testing. Start with the [Local Mode](https://executorlib.readthedocs.io/en/latest/1-local.html)
+by setting the backend parameter to local - `backend="local"`:
+```python
+from executorlib import Executor
+
+
+with Executor(backend="local") as exe:
+    future_lst = [exe.submit(sum, [i, i]) for i in range(1, 5)]
+    print([f.result() for f in future_lst])
+```
+In the same way executorlib can also execute Python functions which use additional computing resources, like multiple
+CPU cores, CPU threads or GPUs. For example if the Python function internally uses the Message Passing Interface (MPI)
+via the [mpi4py](https://mpi4py.readthedocs.io) Python library:
+```python
+from executorlib import Executor
+
+
+def calc(i):
+    from mpi4py import MPI
+
+    size = MPI.COMM_WORLD.Get_size()
+    rank = MPI.COMM_WORLD.Get_rank()
+    return i, size, rank
+
+
+with Executor(backend="local") as exe:
+    fs = exe.submit(calc, 3, resource_dict={"cores": 2})
+    print(fs.result())
+```
+The additional `resource_dict` parameter defines the computing resources allocated to the execution of the submitted
+Python function. In addition to the compute cores `cores`, the resource dictionary can also define the threads per core
+as `threads_per_core`, the GPUs per core as `gpus_per_core`, the working directory with `cwd`, the option to use the
+OpenMPI oversubscribe feature with `openmpi_oversubscribe` and finally for the [Simple Linux Utility for Resource
+Management (SLURM)](https://slurm.schedmd.com) queuing system the option to provide additional command line arguments
+with the `slurm_cmd_args` parameter - [resource dictionary](https://executorlib.readthedocs.io/en/latest/trouble_shooting.html#resource-dictionary).
+This flexibility to assign computing resources on a per-function-call basis simplifies the up-scaling of Python programs.
+Only the parts of the Python program which benefit from parallel execution are implemented as MPI parallel Python
+functions, while the rest of the program remains serial.
+
+The same function can be submitted to the [SLURM](https://slurm.schedmd.com) queuing system by just changing the `backend`
+parameter to `slurm_submission`. The rest of the example remains the same, which highlights how executorlib accelerates
+the rapid prototyping and up-scaling of HPC Python programs.
+```python
+from executorlib import Executor
+
+
+def calc(i):
+    from mpi4py import MPI
+
+    size = MPI.COMM_WORLD.Get_size()
+    rank = MPI.COMM_WORLD.Get_rank()
+    return i, size, rank
+
+
+with Executor(backend="slurm_submission") as exe:
+    fs = exe.submit(calc, 3, resource_dict={"cores": 2})
+    print(fs.result())
+```
+In this case the [Python simple queuing system adapter (pysqa)](https://pysqa.readthedocs.io) is used to submit the
+`calc()` function to the [SLURM](https://slurm.schedmd.com) job scheduler and request an allocation with two CPU cores
+for the execution of the function - [HPC Submission Mode](https://executorlib.readthedocs.io/en/latest/2-hpc-submission.html). In the background the [sbatch](https://slurm.schedmd.com/sbatch.html)
+command is used to request the allocation to execute the Python function.
+
+Within a given [SLURM](https://slurm.schedmd.com) allocation executorlib can also be used to assign a subset of the
+available computing resources to execute a given Python function. In terms of the [SLURM](https://slurm.schedmd.com)
+commands, this functionality internally uses the [srun](https://slurm.schedmd.com/srun.html) command to receive a subset
+of the resources of a given queuing system allocation.
+```python
+from executorlib import Executor
+
+
+def calc(i):
+    from mpi4py import MPI
+
+    size = MPI.COMM_WORLD.Get_size()
+    rank = MPI.COMM_WORLD.Get_rank()
+    return i, size, rank
+
+
+with Executor(backend="slurm_allocation") as exe:
+    fs = exe.submit(calc, 3, resource_dict={"cores": 2})
+    print(fs.result())
+```
+In addition to [SLURM](https://slurm.schedmd.com), executorlib also provides support for the hierarchical
+[flux](http://flux-framework.org) job scheduler. The [flux](http://flux-framework.org) job scheduler is developed at
+[Lawrence Livermore National Laboratory](https://computing.llnl.gov/projects/flux-building-framework-resource-management)
+to address the needs of the upcoming generation of exascale computers. Still, even on traditional HPC clusters the
+hierarchical approach of [flux](http://flux-framework.org) is beneficial to distribute hundreds of tasks within a
+given allocation. Even when [SLURM](https://slurm.schedmd.com) is used as the primary job scheduler of your HPC, it is
+recommended to use [SLURM with flux](https://executorlib.readthedocs.io/en/latest/3-hpc-allocation.html#slurm-with-flux)
+as the hierarchical job scheduler within the allocations.
+
+## Documentation
+* [Installation](https://executorlib.readthedocs.io/en/latest/installation.html)
+  * [Minimal](https://executorlib.readthedocs.io/en/latest/installation.html#minimal)
+  * [MPI Support](https://executorlib.readthedocs.io/en/latest/installation.html#mpi-support)
+  * [Caching](https://executorlib.readthedocs.io/en/latest/installation.html#caching)
+  * [HPC Submission Mode](https://executorlib.readthedocs.io/en/latest/installation.html#hpc-submission-mode)
+  * [HPC Allocation Mode](https://executorlib.readthedocs.io/en/latest/installation.html#hpc-allocation-mode)
+  * [Visualisation](https://executorlib.readthedocs.io/en/latest/installation.html#visualisation)
+  * [For Developers](https://executorlib.readthedocs.io/en/latest/installation.html#for-developers)
+* [Local Mode](https://executorlib.readthedocs.io/en/latest/1-local.html)
+  * [Basic Functionality](https://executorlib.readthedocs.io/en/latest/1-local.html#basic-functionality)
+  * [Parallel Functions](https://executorlib.readthedocs.io/en/latest/1-local.html#parallel-functions)
+  * [Performance Optimization](https://executorlib.readthedocs.io/en/latest/1-local.html#performance-optimization)
+* [HPC Submission Mode](https://executorlib.readthedocs.io/en/latest/2-hpc-submission.html)
+  * [SLURM](https://executorlib.readthedocs.io/en/latest/2-hpc-submission.html#slurm)
+  * [Flux](https://executorlib.readthedocs.io/en/latest/2-hpc-submission.html#flux)
+* [HPC Allocation Mode](https://executorlib.readthedocs.io/en/latest/3-hpc-allocation.html)
+  * [SLURM](https://executorlib.readthedocs.io/en/latest/3-hpc-allocation.html#slurm)
+  * [SLURM with Flux](https://executorlib.readthedocs.io/en/latest/3-hpc-allocation.html#slurm-with-flux)
+  * [Flux](https://executorlib.readthedocs.io/en/latest/3-hpc-allocation.html#flux)
+* [Trouble Shooting](https://executorlib.readthedocs.io/en/latest/trouble_shooting.html)
+  * [Filesystem Usage](https://executorlib.readthedocs.io/en/latest/trouble_shooting.html#filesystem-usage)
+  * [Firewall Issues](https://executorlib.readthedocs.io/en/latest/trouble_shooting.html#firewall-issues)
+  * [Message Passing Interface](https://executorlib.readthedocs.io/en/latest/trouble_shooting.html#message-passing-interface)
+  * [Python Version](https://executorlib.readthedocs.io/en/latest/trouble_shooting.html#python-version)
+  * [Resource Dictionary](https://executorlib.readthedocs.io/en/latest/trouble_shooting.html#resource-dictionary)
+  * [SSH Connection](https://executorlib.readthedocs.io/en/latest/trouble_shooting.html#ssh-connection)
+* [Developer](https://executorlib.readthedocs.io/en/latest/4-developer.html)
+  * [Communication](https://executorlib.readthedocs.io/en/latest/4-developer.html#communication)
+  * [External Executables](https://executorlib.readthedocs.io/en/latest/4-developer.html#external-executables)
+  * [License](https://executorlib.readthedocs.io/en/latest/4-developer.html#license)
+  * [Modules](https://executorlib.readthedocs.io/en/latest/4-developer.html#modules)
+* [Interface](https://executorlib.readthedocs.io/en/latest/api.html)
````
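The metadata above pins an upper bound on every dependency and defines the `cache`, `graph`, `mpi`, `submission` and `all` extras. A small sketch to read these fields back from an installed copy, using only the standard library (`importlib.metadata`, available since Python 3.8):

```python
# Inspect the PKG-INFO fields shown above from an installed executorlib.
from importlib.metadata import metadata, requires

info = metadata("executorlib")
print(info["Name"], info["Version"])  # expected: executorlib 0.0.5
for entry in requires("executorlib") or []:
    print(entry)  # one line per Requires-Dist entry, extras markers included
```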
executorlib-0.0.5/README.md
````diff
@@ -0,0 +1,154 @@
+# executorlib
+[](https://github.com/pyiron/executorlib/actions/workflows/unittest-openmpi.yml)
+[](https://coveralls.io/github/pyiron/executorlib?branch=main)
+[](https://mybinder.org/v2/gh/pyiron/executorlib/HEAD?labpath=notebooks%2Fexamples.ipynb)
+
+Up-scale Python functions for high performance computing (HPC) with executorlib.
+
+## Key Features
+* **Up-scale your Python functions beyond a single computer.** - executorlib extends the [Executor interface](https://docs.python.org/3/library/concurrent.futures.html#executor-objects)
+  from the Python standard library and combines it with job schedulers for high performance computing (HPC) including
+  the [Simple Linux Utility for Resource Management (SLURM)](https://slurm.schedmd.com) and [flux](http://flux-framework.org).
+  With this combination executorlib allows users to distribute their Python functions over multiple compute nodes.
+* **Parallelize your Python program one function at a time** - executorlib allows users to assign dedicated computing
+  resources like CPU cores, threads or GPUs to one Python function call at a time. So you can accelerate your Python
+  code function by function.
+* **Permanent caching of intermediate results to accelerate rapid prototyping** - To accelerate the development of
+  machine learning pipelines and simulation workflows executorlib provides optional caching of intermediate results for
+  iterative development in interactive environments like jupyter notebooks.
+
+## Examples
+The Python standard library provides the [Executor interface](https://docs.python.org/3/library/concurrent.futures.html#executor-objects)
+with the [ProcessPoolExecutor](https://docs.python.org/3/library/concurrent.futures.html#processpoolexecutor) and the
+[ThreadPoolExecutor](https://docs.python.org/3/library/concurrent.futures.html#threadpoolexecutor) for parallel
+execution of Python functions on a single computer. executorlib extends this functionality to distribute Python
+functions over multiple computers within a high performance computing (HPC) cluster. This can be achieved either by
+submitting each function as an individual job to the HPC job scheduler - [HPC Submission Mode](https://executorlib.readthedocs.io/en/latest/2-hpc-submission.html) -
+or by requesting a compute allocation of multiple nodes and then distributing the Python functions within this allocation -
+[HPC Allocation Mode](https://executorlib.readthedocs.io/en/latest/3-hpc-allocation.html). Finally, to accelerate the
+development process executorlib also provides a [Local Mode](https://executorlib.readthedocs.io/en/latest/1-local.html)
+to use the executorlib functionality on a single workstation for testing. Start with the [Local Mode](https://executorlib.readthedocs.io/en/latest/1-local.html)
+by setting the backend parameter to local - `backend="local"`:
+```python
+from executorlib import Executor
+
+
+with Executor(backend="local") as exe:
+    future_lst = [exe.submit(sum, [i, i]) for i in range(1, 5)]
+    print([f.result() for f in future_lst])
+```
+In the same way executorlib can also execute Python functions which use additional computing resources, like multiple
+CPU cores, CPU threads or GPUs. For example if the Python function internally uses the Message Passing Interface (MPI)
+via the [mpi4py](https://mpi4py.readthedocs.io) Python library:
+```python
+from executorlib import Executor
+
+
+def calc(i):
+    from mpi4py import MPI
+
+    size = MPI.COMM_WORLD.Get_size()
+    rank = MPI.COMM_WORLD.Get_rank()
+    return i, size, rank
+
+
+with Executor(backend="local") as exe:
+    fs = exe.submit(calc, 3, resource_dict={"cores": 2})
+    print(fs.result())
+```
+The additional `resource_dict` parameter defines the computing resources allocated to the execution of the submitted
+Python function. In addition to the compute cores `cores`, the resource dictionary can also define the threads per core
+as `threads_per_core`, the GPUs per core as `gpus_per_core`, the working directory with `cwd`, the option to use the
+OpenMPI oversubscribe feature with `openmpi_oversubscribe` and finally for the [Simple Linux Utility for Resource
+Management (SLURM)](https://slurm.schedmd.com) queuing system the option to provide additional command line arguments
+with the `slurm_cmd_args` parameter - [resource dictionary](https://executorlib.readthedocs.io/en/latest/trouble_shooting.html#resource-dictionary).
+This flexibility to assign computing resources on a per-function-call basis simplifies the up-scaling of Python programs.
+Only the parts of the Python program which benefit from parallel execution are implemented as MPI parallel Python
+functions, while the rest of the program remains serial.
+
+The same function can be submitted to the [SLURM](https://slurm.schedmd.com) queuing system by just changing the `backend`
+parameter to `slurm_submission`. The rest of the example remains the same, which highlights how executorlib accelerates
+the rapid prototyping and up-scaling of HPC Python programs.
+```python
+from executorlib import Executor
+
+
+def calc(i):
+    from mpi4py import MPI
+
+    size = MPI.COMM_WORLD.Get_size()
+    rank = MPI.COMM_WORLD.Get_rank()
+    return i, size, rank
+
+
+with Executor(backend="slurm_submission") as exe:
+    fs = exe.submit(calc, 3, resource_dict={"cores": 2})
+    print(fs.result())
+```
+In this case the [Python simple queuing system adapter (pysqa)](https://pysqa.readthedocs.io) is used to submit the
+`calc()` function to the [SLURM](https://slurm.schedmd.com) job scheduler and request an allocation with two CPU cores
+for the execution of the function - [HPC Submission Mode](https://executorlib.readthedocs.io/en/latest/2-hpc-submission.html). In the background the [sbatch](https://slurm.schedmd.com/sbatch.html)
+command is used to request the allocation to execute the Python function.
+
+Within a given [SLURM](https://slurm.schedmd.com) allocation executorlib can also be used to assign a subset of the
+available computing resources to execute a given Python function. In terms of the [SLURM](https://slurm.schedmd.com)
+commands, this functionality internally uses the [srun](https://slurm.schedmd.com/srun.html) command to receive a subset
+of the resources of a given queuing system allocation.
+```python
+from executorlib import Executor
+
+
+def calc(i):
+    from mpi4py import MPI
+
+    size = MPI.COMM_WORLD.Get_size()
+    rank = MPI.COMM_WORLD.Get_rank()
+    return i, size, rank
+
+
+with Executor(backend="slurm_allocation") as exe:
+    fs = exe.submit(calc, 3, resource_dict={"cores": 2})
+    print(fs.result())
+```
+In addition to [SLURM](https://slurm.schedmd.com), executorlib also provides support for the hierarchical
+[flux](http://flux-framework.org) job scheduler. The [flux](http://flux-framework.org) job scheduler is developed at
+[Lawrence Livermore National Laboratory](https://computing.llnl.gov/projects/flux-building-framework-resource-management)
+to address the needs of the upcoming generation of exascale computers. Still, even on traditional HPC clusters the
+hierarchical approach of [flux](http://flux-framework.org) is beneficial to distribute hundreds of tasks within a
+given allocation. Even when [SLURM](https://slurm.schedmd.com) is used as the primary job scheduler of your HPC, it is
+recommended to use [SLURM with flux](https://executorlib.readthedocs.io/en/latest/3-hpc-allocation.html#slurm-with-flux)
+as the hierarchical job scheduler within the allocations.
+
+## Documentation
+* [Installation](https://executorlib.readthedocs.io/en/latest/installation.html)
+  * [Minimal](https://executorlib.readthedocs.io/en/latest/installation.html#minimal)
+  * [MPI Support](https://executorlib.readthedocs.io/en/latest/installation.html#mpi-support)
+  * [Caching](https://executorlib.readthedocs.io/en/latest/installation.html#caching)
+  * [HPC Submission Mode](https://executorlib.readthedocs.io/en/latest/installation.html#hpc-submission-mode)
+  * [HPC Allocation Mode](https://executorlib.readthedocs.io/en/latest/installation.html#hpc-allocation-mode)
+  * [Visualisation](https://executorlib.readthedocs.io/en/latest/installation.html#visualisation)
+  * [For Developers](https://executorlib.readthedocs.io/en/latest/installation.html#for-developers)
+* [Local Mode](https://executorlib.readthedocs.io/en/latest/1-local.html)
+  * [Basic Functionality](https://executorlib.readthedocs.io/en/latest/1-local.html#basic-functionality)
+  * [Parallel Functions](https://executorlib.readthedocs.io/en/latest/1-local.html#parallel-functions)
+  * [Performance Optimization](https://executorlib.readthedocs.io/en/latest/1-local.html#performance-optimization)
+* [HPC Submission Mode](https://executorlib.readthedocs.io/en/latest/2-hpc-submission.html)
+  * [SLURM](https://executorlib.readthedocs.io/en/latest/2-hpc-submission.html#slurm)
+  * [Flux](https://executorlib.readthedocs.io/en/latest/2-hpc-submission.html#flux)
+* [HPC Allocation Mode](https://executorlib.readthedocs.io/en/latest/3-hpc-allocation.html)
+  * [SLURM](https://executorlib.readthedocs.io/en/latest/3-hpc-allocation.html#slurm)
+  * [SLURM with Flux](https://executorlib.readthedocs.io/en/latest/3-hpc-allocation.html#slurm-with-flux)
+  * [Flux](https://executorlib.readthedocs.io/en/latest/3-hpc-allocation.html#flux)
+* [Trouble Shooting](https://executorlib.readthedocs.io/en/latest/trouble_shooting.html)
+  * [Filesystem Usage](https://executorlib.readthedocs.io/en/latest/trouble_shooting.html#filesystem-usage)
+  * [Firewall Issues](https://executorlib.readthedocs.io/en/latest/trouble_shooting.html#firewall-issues)
+  * [Message Passing Interface](https://executorlib.readthedocs.io/en/latest/trouble_shooting.html#message-passing-interface)
+  * [Python Version](https://executorlib.readthedocs.io/en/latest/trouble_shooting.html#python-version)
+  * [Resource Dictionary](https://executorlib.readthedocs.io/en/latest/trouble_shooting.html#resource-dictionary)
+  * [SSH Connection](https://executorlib.readthedocs.io/en/latest/trouble_shooting.html#ssh-connection)
+* [Developer](https://executorlib.readthedocs.io/en/latest/4-developer.html)
+  * [Communication](https://executorlib.readthedocs.io/en/latest/4-developer.html#communication)
+  * [External Executables](https://executorlib.readthedocs.io/en/latest/4-developer.html#external-executables)
+  * [License](https://executorlib.readthedocs.io/en/latest/4-developer.html#license)
+  * [Modules](https://executorlib.readthedocs.io/en/latest/4-developer.html#modules)
+* [Interface](https://executorlib.readthedocs.io/en/latest/api.html)
````
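The README exercises only the `cores` key of the resource dictionary. As a complement, here is a sketch that spells out all six documented keys with their default values (taken from the `default_resource_dict` in the `executorlib/__init__.py` diff below); the submitted function and values are illustrative:

```python
# Illustrative use of every documented resource_dict key; the defaults mirror
# the default_resource_dict introduced in executorlib/__init__.py for 0.0.5.
from executorlib import Executor

resource_dict = {
    "cores": 1,                      # MPI cores per function call
    "threads_per_core": 1,           # OpenMP threads per core
    "gpus_per_core": 0,              # GPUs per core
    "cwd": None,                     # working directory of the task
    "openmpi_oversubscribe": False,  # --oversubscribe flag (OpenMPI/SLURM only)
    "slurm_cmd_args": [],            # extra srun arguments (SLURM only)
}

with Executor(backend="local") as exe:
    fs = exe.submit(sum, [1, 2, 3], resource_dict=resource_dict)
    print(fs.result())  # 6
```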
{executorlib-0.0.4 → executorlib-0.0.5}/executorlib/__init__.py
```diff
@@ -1,31 +1,22 @@
 from typing import Optional
 
-from executorlib.
-from executorlib.interactive.
-
+from executorlib._version import get_versions as _get_versions
+from executorlib.interactive.executor import (
+    ExecutorWithDependencies as _ExecutorWithDependencies,
+)
+from executorlib.interactive.executor import create_executor as _create_executor
+from executorlib.standalone.inputcheck import (
     check_plot_dependency_graph as _check_plot_dependency_graph,
 )
-from executorlib.
+from executorlib.standalone.inputcheck import (
+    check_pysqa_config_directory as _check_pysqa_config_directory,
+)
+from executorlib.standalone.inputcheck import (
     check_refresh_rate as _check_refresh_rate,
 )
-from executorlib.shell.executor import SubprocessExecutor
-from executorlib.shell.interactive import ShellExecutor
-
-from ._version import get_versions
-
-__version__ = get_versions()["version"]
-__all__ = [
-    SubprocessExecutor,
-    ShellExecutor,
-]
-
 
-
-
-
-    __all__ += [FileExecutor]
-except ImportError:
-    pass
+__version__ = _get_versions()["version"]
+__all__ = []
 
 
 class Executor:
@@ -41,16 +32,20 @@ class Executor:
                      cores which can be used in parallel - just like the max_cores parameter. Using max_cores is
                      recommended, as computers have a limited number of compute cores.
         backend (str): Switch between the different backends "flux", "local" or "slurm". The default is "local".
+        cache_directory (str, optional): The directory to store cache files. Defaults to "cache".
         max_cores (int): defines the number of cores which can be used in parallel
-
-
-
-
-
-
+        resource_dict (dict): A dictionary of resources required by the task. With the following keys:
+                              - cores_per_worker (int): number of MPI cores to be used for each function call
+                              - threads_per_core (int): number of OpenMP threads to be used for each function call
+                              - gpus_per_worker (int): number of GPUs per worker - defaults to 0
+                              - cwd (str/None): current working directory where the parallel python task is executed
+                              - openmpi_oversubscribe (bool): adds the `--oversubscribe` command line flag (OpenMPI and
+                                SLURM only) - default False
+                              - slurm_cmd_args (list): Additional command line arguments for the srun call (SLURM only)
         flux_executor (flux.job.FluxExecutor): Flux Python interface to submit the workers to flux
         flux_executor_pmi_mode (str): PMI interface to use (OpenMPI v5 requires pmix) default is None (Flux only)
         flux_executor_nesting (bool): Provide hierarchically nested Flux job scheduler inside the submitted function.
+        pysqa_config_directory (str, optional): path to the pysqa config directory (only for pysqa based backend).
         hostname_localhost (boolean): use localhost instead of the hostname to establish the zmq connection. In the
                                       context of an HPC cluster this is essential to be able to communicate to an
                                       Executor running on a different compute node within the same allocation. And
@@ -91,20 +86,17 @@ class Executor:
 
     def __init__(
         self,
-        max_workers: int =
+        max_workers: Optional[int] = None,
         backend: str = "local",
-
-
-
-        gpus_per_worker: int = 0,
-        cwd: Optional[str] = None,
-        openmpi_oversubscribe: bool = False,
-        slurm_cmd_args: list[str] = [],
+        cache_directory: Optional[str] = None,
+        max_cores: Optional[int] = None,
+        resource_dict: Optional[dict] = None,
        flux_executor=None,
         flux_executor_pmi_mode: Optional[str] = None,
         flux_executor_nesting: bool = False,
-
-
+        pysqa_config_directory: Optional[str] = None,
+        hostname_localhost: Optional[bool] = None,
+        block_allocation: bool = False,
         init_function: Optional[callable] = None,
         disable_dependencies: bool = False,
         refresh_rate: float = 0.01,
@@ -115,20 +107,17 @@ class Executor:
 
     def __new__(
         cls,
-        max_workers: int =
+        max_workers: Optional[int] = None,
        backend: str = "local",
-
-
-
-        gpus_per_worker: int = 0,
-        cwd: Optional[str] = None,
-        openmpi_oversubscribe: bool = False,
-        slurm_cmd_args: list[str] = [],
+        cache_directory: Optional[str] = None,
+        max_cores: Optional[int] = None,
+        resource_dict: Optional[dict] = None,
         flux_executor=None,
         flux_executor_pmi_mode: Optional[str] = None,
         flux_executor_nesting: bool = False,
-
-
+        pysqa_config_directory: Optional[str] = None,
+        hostname_localhost: Optional[bool] = None,
+        block_allocation: bool = False,
         init_function: Optional[callable] = None,
         disable_dependencies: bool = False,
         refresh_rate: float = 0.01,
@@ -147,16 +136,21 @@ class Executor:
                          number of cores which can be used in parallel - just like the max_cores parameter. Using
                          max_cores is recommended, as computers have a limited number of compute cores.
             backend (str): Switch between the different backends "flux", "local" or "slurm". The default is "local".
+            cache_directory (str, optional): The directory to store cache files. Defaults to "cache".
             max_cores (int): defines the number of cores which can be used in parallel
-
-
-
-
-
-
+            resource_dict (dict): A dictionary of resources required by the task. With the following keys:
+                                  - cores (int): number of MPI cores to be used for each function call
+                                  - threads_per_core (int): number of OpenMP threads to be used for each function call
+                                  - gpus_per_core (int): number of GPUs per worker - defaults to 0
+                                  - cwd (str/None): current working directory where the parallel python task is executed
+                                  - openmpi_oversubscribe (bool): adds the `--oversubscribe` command line flag (OpenMPI
+                                    and SLURM only) - default False
+                                  - slurm_cmd_args (list): Additional command line arguments for the srun call (SLURM
+                                    only)
             flux_executor (flux.job.FluxExecutor): Flux Python interface to submit the workers to flux
             flux_executor_pmi_mode (str): PMI interface to use (OpenMPI v5 requires pmix) default is None (Flux only)
             flux_executor_nesting (bool): Provide hierarchically nested Flux job scheduler inside the submitted function.
+            pysqa_config_directory (str, optional): path to the pysqa config directory (only for pysqa based backend).
             hostname_localhost (boolean): use localhost instead of the hostname to establish the zmq connection. In the
                                           context of an HPC cluster this is essential to be able to communicate to an
                                           Executor running on a different compute node within the same allocation. And
@@ -175,17 +169,45 @@ class Executor:
                                     debugging purposes and to get an overview of the specified dependencies.
 
         """
-
-
+        default_resource_dict = {
+            "cores": 1,
+            "threads_per_core": 1,
+            "gpus_per_core": 0,
+            "cwd": None,
+            "openmpi_oversubscribe": False,
+            "slurm_cmd_args": [],
+        }
+        if resource_dict is None:
+            resource_dict = {}
+        resource_dict.update(
+            {k: v for k, v in default_resource_dict.items() if k not in resource_dict}
+        )
+        if "_submission" in backend and not plot_dependency_graph:
+            from executorlib.cache.executor import create_file_executor
+
+            return create_file_executor(
+                max_workers=max_workers,
+                backend=backend,
+                max_cores=max_cores,
+                cache_directory=cache_directory,
+                resource_dict=resource_dict,
+                flux_executor=flux_executor,
+                flux_executor_pmi_mode=flux_executor_pmi_mode,
+                flux_executor_nesting=flux_executor_nesting,
+                pysqa_config_directory=pysqa_config_directory,
+                hostname_localhost=hostname_localhost,
+                block_allocation=block_allocation,
+                init_function=init_function,
+                disable_dependencies=disable_dependencies,
+            )
+        elif not disable_dependencies:
+            _check_pysqa_config_directory(pysqa_config_directory=pysqa_config_directory)
+            return _ExecutorWithDependencies(
                 max_workers=max_workers,
                 backend=backend,
+                cache_directory=cache_directory,
                 max_cores=max_cores,
-
-                threads_per_core=threads_per_core,
-                gpus_per_worker=gpus_per_worker,
-                cwd=cwd,
-                openmpi_oversubscribe=openmpi_oversubscribe,
-                slurm_cmd_args=slurm_cmd_args,
+                resource_dict=resource_dict,
                 flux_executor=flux_executor,
                 flux_executor_pmi_mode=flux_executor_pmi_mode,
                 flux_executor_nesting=flux_executor_nesting,
@@ -196,18 +218,15 @@ class Executor:
                 plot_dependency_graph=plot_dependency_graph,
             )
         else:
+            _check_pysqa_config_directory(pysqa_config_directory=pysqa_config_directory)
             _check_plot_dependency_graph(plot_dependency_graph=plot_dependency_graph)
             _check_refresh_rate(refresh_rate=refresh_rate)
-            return
+            return _create_executor(
                 max_workers=max_workers,
                 backend=backend,
+                cache_directory=cache_directory,
                 max_cores=max_cores,
-
-                threads_per_core=threads_per_core,
-                gpus_per_worker=gpus_per_worker,
-                cwd=cwd,
-                openmpi_oversubscribe=openmpi_oversubscribe,
-                slurm_cmd_args=slurm_cmd_args,
+                resource_dict=resource_dict,
                 flux_executor=flux_executor,
                 flux_executor_pmi_mode=flux_executor_pmi_mode,
                 flux_executor_nesting=flux_executor_nesting,
```
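The most consequential change in this file is the collapse of several separate keyword arguments (`threads_per_core`, `gpus_per_worker`, `cwd`, `openmpi_oversubscribe`, `slurm_cmd_args`, ...) into the single `resource_dict`, with user-supplied keys taking precedence over the defaults. The merge idiom from `Executor.__new__` is worth isolating; `merge_defaults` below is a hypothetical standalone copy, not part of the executorlib API:

```python
# Standalone copy of the default-merging idiom from Executor.__new__ above;
# merge_defaults is a hypothetical helper name for illustration only.
default_resource_dict = {
    "cores": 1,
    "threads_per_core": 1,
    "gpus_per_core": 0,
    "cwd": None,
    "openmpi_oversubscribe": False,
    "slurm_cmd_args": [],
}


def merge_defaults(resource_dict=None):
    if resource_dict is None:
        resource_dict = {}
    # copy in every default key the caller did not set explicitly
    resource_dict.update(
        {k: v for k, v in default_resource_dict.items() if k not in resource_dict}
    )
    return resource_dict


print(merge_defaults({"cores": 2}))
# {'cores': 2, 'threads_per_core': 1, 'gpus_per_core': 0, 'cwd': None,
#  'openmpi_oversubscribe': False, 'slurm_cmd_args': []}
```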
{executorlib-0.0.4 → executorlib-0.0.5}/executorlib/_version.py
```diff
@@ -8,11 +8,11 @@ import json
 
 version_json = '''
 {
- "date": "2024-
+ "date": "2024-11-20T13:20:24+0100",
  "dirty": true,
  "error": null,
- "full-revisionid": "
- "version": "0.0.
+ "full-revisionid": "eb8cb29e2ebbc6930da51c5511b873ea5bf92f69",
+ "version": "0.0.5"
 }
 ''' # END VERSION_JSON
 
```
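The `_version.py` change just updates the bundled versioneer data to the 0.0.5 release commit; `executorlib/__init__.py` exposes it as `__version__` through the renamed `_get_versions` import. A one-line sanity check against an installed copy:

```python
# Check the bundled versioneer string; __version__ is populated in
# executorlib/__init__.py via _get_versions()["version"].
import executorlib

print(executorlib.__version__)  # expected: 0.0.5
```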