executorlib 1.6.0__tar.gz → 1.6.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {executorlib-1.6.0 → executorlib-1.6.2}/PKG-INFO +5 -4
- {executorlib-1.6.0 → executorlib-1.6.2}/README.md +1 -0
- {executorlib-1.6.0 → executorlib-1.6.2}/executorlib/__init__.py +42 -10
- {executorlib-1.6.0 → executorlib-1.6.2}/executorlib/_version.py +16 -3
- {executorlib-1.6.0 → executorlib-1.6.2}/executorlib/backend/interactive_parallel.py +2 -2
- {executorlib-1.6.0 → executorlib-1.6.2}/executorlib/backend/interactive_serial.py +2 -2
- {executorlib-1.6.0 → executorlib-1.6.2}/executorlib/executor/base.py +27 -10
- {executorlib-1.6.0 → executorlib-1.6.2}/executorlib/executor/flux.py +14 -11
- {executorlib-1.6.0 → executorlib-1.6.2}/executorlib/executor/single.py +1 -1
- {executorlib-1.6.0 → executorlib-1.6.2}/executorlib/executor/slurm.py +12 -1
- executorlib-1.6.2/executorlib/standalone/batched.py +27 -0
- executorlib-1.6.2/executorlib/standalone/command.py +114 -0
- {executorlib-1.6.0/executorlib/task_scheduler/file → executorlib-1.6.2/executorlib/standalone}/hdf.py +61 -1
- {executorlib-1.6.0 → executorlib-1.6.2}/executorlib/standalone/inputcheck.py +11 -4
- {executorlib-1.6.0 → executorlib-1.6.2}/executorlib/standalone/interactive/backend.py +2 -1
- {executorlib-1.6.0 → executorlib-1.6.2}/executorlib/standalone/interactive/communication.py +5 -0
- {executorlib-1.6.0 → executorlib-1.6.2}/executorlib/standalone/interactive/spawner.py +4 -1
- {executorlib-1.6.0 → executorlib-1.6.2}/executorlib/standalone/plot.py +7 -3
- executorlib-1.6.2/executorlib/standalone/scheduler.py +65 -0
- {executorlib-1.6.0 → executorlib-1.6.2}/executorlib/standalone/serialize.py +11 -11
- executorlib-1.6.2/executorlib/standalone/slurm_command.py +51 -0
- {executorlib-1.6.0 → executorlib-1.6.2}/executorlib/task_scheduler/base.py +27 -10
- {executorlib-1.6.0 → executorlib-1.6.2}/executorlib/task_scheduler/file/backend.py +1 -1
- {executorlib-1.6.0 → executorlib-1.6.2}/executorlib/task_scheduler/file/queue_spawner.py +5 -66
- {executorlib-1.6.0 → executorlib-1.6.2}/executorlib/task_scheduler/file/shared.py +13 -35
- {executorlib-1.6.0 → executorlib-1.6.2}/executorlib/task_scheduler/file/subprocess_spawner.py +3 -1
- {executorlib-1.6.0 → executorlib-1.6.2}/executorlib/task_scheduler/file/task_scheduler.py +12 -7
- {executorlib-1.6.0 → executorlib-1.6.2}/executorlib/task_scheduler/interactive/blockallocation.py +12 -12
- {executorlib-1.6.0 → executorlib-1.6.2}/executorlib/task_scheduler/interactive/dependency.py +56 -7
- {executorlib-1.6.0 → executorlib-1.6.2}/executorlib/task_scheduler/interactive/fluxspawner.py +6 -5
- {executorlib-1.6.0 → executorlib-1.6.2}/executorlib/task_scheduler/interactive/shared.py +11 -32
- {executorlib-1.6.0 → executorlib-1.6.2}/executorlib/task_scheduler/interactive/slurmspawner.py +16 -57
- {executorlib-1.6.0 → executorlib-1.6.2}/pyproject.toml +8 -4
- executorlib-1.6.0/executorlib/standalone/cache.py +0 -57
- executorlib-1.6.0/executorlib/standalone/command.py +0 -14
- {executorlib-1.6.0 → executorlib-1.6.2}/.gitignore +0 -0
- {executorlib-1.6.0 → executorlib-1.6.2}/LICENSE +0 -0
- {executorlib-1.6.0 → executorlib-1.6.2}/executorlib/api.py +0 -0
- {executorlib-1.6.0 → executorlib-1.6.2}/executorlib/backend/__init__.py +0 -0
- {executorlib-1.6.0 → executorlib-1.6.2}/executorlib/backend/cache_parallel.py +0 -0
- {executorlib-1.6.0 → executorlib-1.6.2}/executorlib/backend/cache_serial.py +0 -0
- {executorlib-1.6.0 → executorlib-1.6.2}/executorlib/executor/__init__.py +0 -0
- {executorlib-1.6.0 → executorlib-1.6.2}/executorlib/standalone/__init__.py +0 -0
- {executorlib-1.6.0 → executorlib-1.6.2}/executorlib/standalone/error.py +0 -0
- {executorlib-1.6.0 → executorlib-1.6.2}/executorlib/standalone/interactive/__init__.py +0 -0
- {executorlib-1.6.0 → executorlib-1.6.2}/executorlib/standalone/interactive/arguments.py +0 -0
- {executorlib-1.6.0 → executorlib-1.6.2}/executorlib/standalone/queue.py +0 -0
- {executorlib-1.6.0 → executorlib-1.6.2}/executorlib/task_scheduler/__init__.py +0 -0
- {executorlib-1.6.0 → executorlib-1.6.2}/executorlib/task_scheduler/file/__init__.py +0 -0
- {executorlib-1.6.0 → executorlib-1.6.2}/executorlib/task_scheduler/interactive/__init__.py +0 -0
- {executorlib-1.6.0 → executorlib-1.6.2}/executorlib/task_scheduler/interactive/onetoone.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: executorlib
|
|
3
|
-
Version: 1.6.
|
|
3
|
+
Version: 1.6.2
|
|
4
4
|
Summary: Up-scale python functions for high performance computing (HPC) with executorlib.
|
|
5
5
|
Project-URL: Homepage, https://github.com/pyiron/executorlib
|
|
6
6
|
Project-URL: Documentation, https://executorlib.readthedocs.io
|
|
@@ -48,19 +48,19 @@ Classifier: Programming Language :: Python :: 3.13
|
|
|
48
48
|
Classifier: Topic :: Scientific/Engineering :: Physics
|
|
49
49
|
Requires-Python: <3.14,>3.9
|
|
50
50
|
Requires-Dist: cloudpickle<=3.1.1,>=2.0.0
|
|
51
|
-
Requires-Dist: pyzmq<=27.0.
|
|
51
|
+
Requires-Dist: pyzmq<=27.0.2,>=25.0.0
|
|
52
52
|
Provides-Extra: all
|
|
53
53
|
Requires-Dist: h5py<=3.14.0,>=3.6.0; extra == 'all'
|
|
54
54
|
Requires-Dist: ipython<=9.0.2,>=7.33.0; extra == 'all'
|
|
55
55
|
Requires-Dist: mpi4py<=4.0.1,>=3.1.4; extra == 'all'
|
|
56
56
|
Requires-Dist: networkx<=3.4.2,>=2.8.8; extra == 'all'
|
|
57
57
|
Requires-Dist: pygraphviz<=1.14,>=1.10; extra == 'all'
|
|
58
|
-
Requires-Dist: pysqa==0.
|
|
58
|
+
Requires-Dist: pysqa==0.3.1; extra == 'all'
|
|
59
59
|
Provides-Extra: cache
|
|
60
60
|
Requires-Dist: h5py<=3.14.0,>=3.6.0; extra == 'cache'
|
|
61
61
|
Provides-Extra: cluster
|
|
62
62
|
Requires-Dist: h5py<=3.14.0,>=3.6.0; extra == 'cluster'
|
|
63
|
-
Requires-Dist: pysqa==0.
|
|
63
|
+
Requires-Dist: pysqa==0.3.1; extra == 'cluster'
|
|
64
64
|
Provides-Extra: graph
|
|
65
65
|
Requires-Dist: networkx<=3.4.2,>=2.8.8; extra == 'graph'
|
|
66
66
|
Requires-Dist: pygraphviz<=1.14,>=1.10; extra == 'graph'
|
|
@@ -208,6 +208,7 @@ as hierarchical job scheduler within the allocations.
|
|
|
208
208
|
* [Basic Functionality](https://executorlib.readthedocs.io/en/latest/1-single-node.html#basic-functionality)
|
|
209
209
|
* [Parallel Functions](https://executorlib.readthedocs.io/en/latest/1-single-node.html#parallel-functions)
|
|
210
210
|
* [Performance Optimization](https://executorlib.readthedocs.io/en/latest/1-single-node.html#performance-optimization)
|
|
211
|
+
* [Testing and Debugging](https://executorlib.readthedocs.io/en/latest/1-single-node.html#testing-and-debugging)
|
|
211
212
|
* [HPC Cluster Executor](https://executorlib.readthedocs.io/en/latest/2-hpc-cluster.html)
|
|
212
213
|
* [SLURM](https://executorlib.readthedocs.io/en/latest/2-hpc-cluster.html#slurm)
|
|
213
214
|
* [Flux](https://executorlib.readthedocs.io/en/latest/2-hpc-cluster.html#flux)
|
|
@@ -134,6 +134,7 @@ as hierarchical job scheduler within the allocations.
|
|
|
134
134
|
* [Basic Functionality](https://executorlib.readthedocs.io/en/latest/1-single-node.html#basic-functionality)
|
|
135
135
|
* [Parallel Functions](https://executorlib.readthedocs.io/en/latest/1-single-node.html#parallel-functions)
|
|
136
136
|
* [Performance Optimization](https://executorlib.readthedocs.io/en/latest/1-single-node.html#performance-optimization)
|
|
137
|
+
* [Testing and Debugging](https://executorlib.readthedocs.io/en/latest/1-single-node.html#testing-and-debugging)
|
|
137
138
|
* [HPC Cluster Executor](https://executorlib.readthedocs.io/en/latest/2-hpc-cluster.html)
|
|
138
139
|
* [SLURM](https://executorlib.readthedocs.io/en/latest/2-hpc-cluster.html#slurm)
|
|
139
140
|
* [Flux](https://executorlib.readthedocs.io/en/latest/2-hpc-cluster.html#flux)
|
|
@@ -12,6 +12,9 @@ Finally, the get_cache_data() function allows users to cache the content of thei
|
|
|
12
12
|
pandas.DataFrame.
|
|
13
13
|
"""
|
|
14
14
|
|
|
15
|
+
from typing import Optional
|
|
16
|
+
|
|
17
|
+
import executorlib._version
|
|
15
18
|
from executorlib.executor.base import BaseExecutor
|
|
16
19
|
from executorlib.executor.flux import (
|
|
17
20
|
FluxClusterExecutor,
|
|
@@ -22,12 +25,48 @@ from executorlib.executor.slurm import (
|
|
|
22
25
|
SlurmClusterExecutor,
|
|
23
26
|
SlurmJobExecutor,
|
|
24
27
|
)
|
|
25
|
-
from executorlib.standalone.cache import get_cache_data
|
|
26
28
|
|
|
27
|
-
|
|
29
|
+
|
|
30
|
+
def get_cache_data(cache_directory: str) -> list[dict]:
|
|
31
|
+
"""
|
|
32
|
+
Collect all HDF5 files in the cache directory
|
|
33
|
+
|
|
34
|
+
Args:
|
|
35
|
+
cache_directory (str): The directory to store cache files.
|
|
36
|
+
|
|
37
|
+
Returns:
|
|
38
|
+
list[dict]: List of dictionaries each representing on of the HDF5 files in the cache directory.
|
|
39
|
+
"""
|
|
40
|
+
from executorlib.standalone.hdf import get_cache_data
|
|
41
|
+
|
|
42
|
+
return get_cache_data(cache_directory=cache_directory)
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def terminate_tasks_in_cache(
|
|
46
|
+
cache_directory: str,
|
|
47
|
+
config_directory: Optional[str] = None,
|
|
48
|
+
backend: Optional[str] = None,
|
|
49
|
+
):
|
|
50
|
+
"""
|
|
51
|
+
Delete all jobs stored in the cache directory from the queuing system
|
|
52
|
+
|
|
53
|
+
Args:
|
|
54
|
+
cache_directory (str): The directory to store cache files.
|
|
55
|
+
config_directory (str, optional): path to the config directory.
|
|
56
|
+
backend (str, optional): name of the backend used to spawn tasks ["slurm", "flux"].
|
|
57
|
+
"""
|
|
58
|
+
from executorlib.task_scheduler.file.queue_spawner import terminate_tasks_in_cache
|
|
59
|
+
|
|
60
|
+
return terminate_tasks_in_cache(
|
|
61
|
+
cache_directory=cache_directory,
|
|
62
|
+
config_directory=config_directory,
|
|
63
|
+
backend=backend,
|
|
64
|
+
)
|
|
65
|
+
|
|
28
66
|
|
|
29
67
|
__all__: list[str] = [
|
|
30
68
|
"get_cache_data",
|
|
69
|
+
"terminate_tasks_in_cache",
|
|
31
70
|
"BaseExecutor",
|
|
32
71
|
"FluxJobExecutor",
|
|
33
72
|
"FluxClusterExecutor",
|
|
@@ -36,11 +75,4 @@ __all__: list[str] = [
|
|
|
36
75
|
"SlurmClusterExecutor",
|
|
37
76
|
]
|
|
38
77
|
|
|
39
|
-
|
|
40
|
-
from executorlib.task_scheduler.file.queue_spawner import terminate_tasks_in_cache
|
|
41
|
-
|
|
42
|
-
__all__ += ["terminate_tasks_in_cache"]
|
|
43
|
-
except ImportError:
|
|
44
|
-
pass
|
|
45
|
-
|
|
46
|
-
__version__ = _version.__version__
|
|
78
|
+
__version__ = executorlib._version.__version__
|
|
@@ -1,7 +1,14 @@
|
|
|
1
1
|
# file generated by setuptools-scm
|
|
2
2
|
# don't change, don't track in version control
|
|
3
3
|
|
|
4
|
-
__all__ = [
|
|
4
|
+
__all__ = [
|
|
5
|
+
"__version__",
|
|
6
|
+
"__version_tuple__",
|
|
7
|
+
"version",
|
|
8
|
+
"version_tuple",
|
|
9
|
+
"__commit_id__",
|
|
10
|
+
"commit_id",
|
|
11
|
+
]
|
|
5
12
|
|
|
6
13
|
TYPE_CHECKING = False
|
|
7
14
|
if TYPE_CHECKING:
|
|
@@ -9,13 +16,19 @@ if TYPE_CHECKING:
|
|
|
9
16
|
from typing import Union
|
|
10
17
|
|
|
11
18
|
VERSION_TUPLE = Tuple[Union[int, str], ...]
|
|
19
|
+
COMMIT_ID = Union[str, None]
|
|
12
20
|
else:
|
|
13
21
|
VERSION_TUPLE = object
|
|
22
|
+
COMMIT_ID = object
|
|
14
23
|
|
|
15
24
|
version: str
|
|
16
25
|
__version__: str
|
|
17
26
|
__version_tuple__: VERSION_TUPLE
|
|
18
27
|
version_tuple: VERSION_TUPLE
|
|
28
|
+
commit_id: COMMIT_ID
|
|
29
|
+
__commit_id__: COMMIT_ID
|
|
19
30
|
|
|
20
|
-
__version__ = version = '1.6.
|
|
21
|
-
__version_tuple__ = version_tuple = (1, 6,
|
|
31
|
+
__version__ = version = '1.6.2'
|
|
32
|
+
__version_tuple__ = version_tuple = (1, 6, 2)
|
|
33
|
+
|
|
34
|
+
__commit_id__ = commit_id = None
|
|
@@ -43,7 +43,7 @@ def main() -> None:
|
|
|
43
43
|
host=argument_dict["host"], port=argument_dict["zmqport"]
|
|
44
44
|
)
|
|
45
45
|
|
|
46
|
-
memory =
|
|
46
|
+
memory = {"executorlib_worker_id": int(argument_dict["worker_id"])}
|
|
47
47
|
|
|
48
48
|
# required for flux interface - otherwise the current path is not included in the python path
|
|
49
49
|
cwd = abspath(".")
|
|
@@ -97,7 +97,7 @@ def main() -> None:
|
|
|
97
97
|
and "args" in input_dict
|
|
98
98
|
and "kwargs" in input_dict
|
|
99
99
|
):
|
|
100
|
-
memory
|
|
100
|
+
memory.update(call_funct(input_dict=input_dict, funct=None, memory=memory))
|
|
101
101
|
|
|
102
102
|
|
|
103
103
|
if __name__ == "__main__":
|
|
@@ -29,7 +29,7 @@ def main(argument_lst: Optional[list[str]] = None):
|
|
|
29
29
|
host=argument_dict["host"], port=argument_dict["zmqport"]
|
|
30
30
|
)
|
|
31
31
|
|
|
32
|
-
memory =
|
|
32
|
+
memory = {"executorlib_worker_id": int(argument_dict["worker_id"])}
|
|
33
33
|
|
|
34
34
|
# required for flux interface - otherwise the current path is not included in the python path
|
|
35
35
|
cwd = abspath(".")
|
|
@@ -72,7 +72,7 @@ def main(argument_lst: Optional[list[str]] = None):
|
|
|
72
72
|
and "args" in input_dict
|
|
73
73
|
and "kwargs" in input_dict
|
|
74
74
|
):
|
|
75
|
-
memory
|
|
75
|
+
memory.update(call_funct(input_dict=input_dict, funct=None, memory=memory))
|
|
76
76
|
|
|
77
77
|
|
|
78
78
|
if __name__ == "__main__":
|
|
@@ -50,6 +50,23 @@ class BaseExecutor(FutureExecutor, ABC):
|
|
|
50
50
|
"""
|
|
51
51
|
return self._task_scheduler.future_queue
|
|
52
52
|
|
|
53
|
+
def batched(
|
|
54
|
+
self,
|
|
55
|
+
iterable: list[Future],
|
|
56
|
+
n: int,
|
|
57
|
+
) -> list[Future]:
|
|
58
|
+
"""
|
|
59
|
+
Batch futures from the iterable into tuples of length n. The last batch may be shorter than n.
|
|
60
|
+
|
|
61
|
+
Args:
|
|
62
|
+
iterable (list): list of future objects to batch based on which future objects finish first
|
|
63
|
+
n (int): badge size
|
|
64
|
+
|
|
65
|
+
Returns:
|
|
66
|
+
list[Future]: list of future objects one for each batch
|
|
67
|
+
"""
|
|
68
|
+
return self._task_scheduler.batched(iterable=iterable, n=n)
|
|
69
|
+
|
|
53
70
|
def submit( # type: ignore
|
|
54
71
|
self,
|
|
55
72
|
fn: Callable,
|
|
@@ -68,16 +85,16 @@ class BaseExecutor(FutureExecutor, ABC):
|
|
|
68
85
|
fn (callable): function to submit for execution
|
|
69
86
|
args: arguments for the submitted function
|
|
70
87
|
kwargs: keyword arguments for the submitted function
|
|
71
|
-
resource_dict (dict):
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
88
|
+
resource_dict (dict): A dictionary of resources required by the task. With the following keys:
|
|
89
|
+
- cores (int): number of MPI cores to be used for each function call
|
|
90
|
+
- threads_per_core (int): number of OpenMP threads to be used for each function call
|
|
91
|
+
- gpus_per_core (int): number of GPUs per worker - defaults to 0
|
|
92
|
+
- cwd (str/None): current working directory where the parallel python task is executed
|
|
93
|
+
- openmpi_oversubscribe (bool): adds the `--oversubscribe` command line flag (OpenMPI and
|
|
94
|
+
SLURM only) - default False
|
|
95
|
+
- slurm_cmd_args (list): Additional command line arguments for the srun call (SLURM only)
|
|
96
|
+
- error_log_file (str): Name of the error log file to use for storing exceptions raised
|
|
97
|
+
by the Python functions submitted to the Executor.
|
|
81
98
|
|
|
82
99
|
Returns:
|
|
83
100
|
Future: A Future representing the given call.
|
|
@@ -43,8 +43,8 @@ class FluxJobExecutor(BaseExecutor):
|
|
|
43
43
|
compute notes. Defaults to False.
|
|
44
44
|
- error_log_file (str): Name of the error log file to use for storing exceptions raised
|
|
45
45
|
by the Python functions submitted to the Executor.
|
|
46
|
+
pmi_mode (str): PMI interface to use (OpenMPI v5 requires pmix) default is None
|
|
46
47
|
flux_executor (flux.job.FluxExecutor): Flux Python interface to submit the workers to flux
|
|
47
|
-
flux_executor_pmi_mode (str): PMI interface to use (OpenMPI v5 requires pmix) default is None (Flux only)
|
|
48
48
|
flux_executor_nesting (bool): Provide hierarchically nested Flux job scheduler inside the submitted function.
|
|
49
49
|
flux_log_files (bool, optional): Write flux stdout and stderr files. Defaults to False.
|
|
50
50
|
hostname_localhost (boolean): use localhost instead of the hostname to establish the zmq connection. In the
|
|
@@ -93,8 +93,8 @@ class FluxJobExecutor(BaseExecutor):
|
|
|
93
93
|
cache_directory: Optional[str] = None,
|
|
94
94
|
max_cores: Optional[int] = None,
|
|
95
95
|
resource_dict: Optional[dict] = None,
|
|
96
|
+
pmi_mode: Optional[str] = None,
|
|
96
97
|
flux_executor=None,
|
|
97
|
-
flux_executor_pmi_mode: Optional[str] = None,
|
|
98
98
|
flux_executor_nesting: bool = False,
|
|
99
99
|
flux_log_files: bool = False,
|
|
100
100
|
hostname_localhost: Optional[bool] = None,
|
|
@@ -130,8 +130,8 @@ class FluxJobExecutor(BaseExecutor):
|
|
|
130
130
|
compute notes. Defaults to False.
|
|
131
131
|
- error_log_file (str): Name of the error log file to use for storing exceptions
|
|
132
132
|
raised by the Python functions submitted to the Executor.
|
|
133
|
+
pmi_mode (str): PMI interface to use (OpenMPI v5 requires pmix) default is None
|
|
133
134
|
flux_executor (flux.job.FluxExecutor): Flux Python interface to submit the workers to flux
|
|
134
|
-
flux_executor_pmi_mode (str): PMI interface to use (OpenMPI v5 requires pmix) default is None (Flux only)
|
|
135
135
|
flux_executor_nesting (bool): Provide hierarchically nested Flux job scheduler inside the submitted function.
|
|
136
136
|
flux_log_files (bool, optional): Write flux stdout and stderr files. Defaults to False.
|
|
137
137
|
hostname_localhost (boolean): use localhost instead of the hostname to establish the zmq connection. In the
|
|
@@ -175,8 +175,8 @@ class FluxJobExecutor(BaseExecutor):
|
|
|
175
175
|
cache_directory=cache_directory,
|
|
176
176
|
max_cores=max_cores,
|
|
177
177
|
resource_dict=resource_dict,
|
|
178
|
+
pmi_mode=pmi_mode,
|
|
178
179
|
flux_executor=flux_executor,
|
|
179
|
-
flux_executor_pmi_mode=flux_executor_pmi_mode,
|
|
180
180
|
flux_executor_nesting=flux_executor_nesting,
|
|
181
181
|
flux_log_files=flux_log_files,
|
|
182
182
|
hostname_localhost=hostname_localhost,
|
|
@@ -199,8 +199,8 @@ class FluxJobExecutor(BaseExecutor):
|
|
|
199
199
|
cache_directory=cache_directory,
|
|
200
200
|
max_cores=max_cores,
|
|
201
201
|
resource_dict=resource_dict,
|
|
202
|
+
pmi_mode=pmi_mode,
|
|
202
203
|
flux_executor=flux_executor,
|
|
203
|
-
flux_executor_pmi_mode=flux_executor_pmi_mode,
|
|
204
204
|
flux_executor_nesting=flux_executor_nesting,
|
|
205
205
|
flux_log_files=flux_log_files,
|
|
206
206
|
hostname_localhost=hostname_localhost,
|
|
@@ -236,6 +236,7 @@ class FluxClusterExecutor(BaseExecutor):
|
|
|
236
236
|
- error_log_file (str): Name of the error log file to use for storing exceptions raised
|
|
237
237
|
by the Python functions submitted to the Executor.
|
|
238
238
|
pysqa_config_directory (str, optional): path to the pysqa config directory (only for pysqa based backend).
|
|
239
|
+
pmi_mode (str): PMI interface to use (OpenMPI v5 requires pmix) default is None
|
|
239
240
|
hostname_localhost (boolean): use localhost instead of the hostname to establish the zmq connection. In the
|
|
240
241
|
context of an HPC cluster this essential to be able to communicate to an
|
|
241
242
|
Executor running on a different compute node within the same allocation. And
|
|
@@ -283,6 +284,7 @@ class FluxClusterExecutor(BaseExecutor):
|
|
|
283
284
|
max_cores: Optional[int] = None,
|
|
284
285
|
resource_dict: Optional[dict] = None,
|
|
285
286
|
pysqa_config_directory: Optional[str] = None,
|
|
287
|
+
pmi_mode: Optional[str] = None,
|
|
286
288
|
hostname_localhost: Optional[bool] = None,
|
|
287
289
|
block_allocation: bool = False,
|
|
288
290
|
init_function: Optional[Callable] = None,
|
|
@@ -317,6 +319,7 @@ class FluxClusterExecutor(BaseExecutor):
|
|
|
317
319
|
- error_log_file (str): Name of the error log file to use for storing exceptions
|
|
318
320
|
raised by the Python functions submitted to the Executor.
|
|
319
321
|
pysqa_config_directory (str, optional): path to the pysqa config directory (only for pysqa based backend).
|
|
322
|
+
pmi_mode (str): PMI interface to use (OpenMPI v5 requires pmix) default is None
|
|
320
323
|
hostname_localhost (boolean): use localhost instead of the hostname to establish the zmq connection. In the
|
|
321
324
|
context of an HPC cluster this essential to be able to communicate to an
|
|
322
325
|
Executor running on a different compute node within the same allocation. And
|
|
@@ -366,7 +369,7 @@ class FluxClusterExecutor(BaseExecutor):
|
|
|
366
369
|
cache_directory=cache_directory,
|
|
367
370
|
resource_dict=resource_dict,
|
|
368
371
|
flux_executor=None,
|
|
369
|
-
|
|
372
|
+
pmi_mode=pmi_mode,
|
|
370
373
|
flux_executor_nesting=False,
|
|
371
374
|
flux_log_files=False,
|
|
372
375
|
pysqa_config_directory=pysqa_config_directory,
|
|
@@ -384,8 +387,8 @@ class FluxClusterExecutor(BaseExecutor):
|
|
|
384
387
|
cache_directory=cache_directory,
|
|
385
388
|
max_cores=max_cores,
|
|
386
389
|
resource_dict=resource_dict,
|
|
390
|
+
pmi_mode=None,
|
|
387
391
|
flux_executor=None,
|
|
388
|
-
flux_executor_pmi_mode=None,
|
|
389
392
|
flux_executor_nesting=False,
|
|
390
393
|
flux_log_files=False,
|
|
391
394
|
hostname_localhost=hostname_localhost,
|
|
@@ -405,8 +408,8 @@ def create_flux_executor(
|
|
|
405
408
|
max_cores: Optional[int] = None,
|
|
406
409
|
cache_directory: Optional[str] = None,
|
|
407
410
|
resource_dict: Optional[dict] = None,
|
|
411
|
+
pmi_mode: Optional[str] = None,
|
|
408
412
|
flux_executor=None,
|
|
409
|
-
flux_executor_pmi_mode: Optional[str] = None,
|
|
410
413
|
flux_executor_nesting: bool = False,
|
|
411
414
|
flux_log_files: bool = False,
|
|
412
415
|
hostname_localhost: Optional[bool] = None,
|
|
@@ -434,8 +437,8 @@ def create_flux_executor(
|
|
|
434
437
|
compute notes. Defaults to False.
|
|
435
438
|
- error_log_file (str): Name of the error log file to use for storing exceptions raised
|
|
436
439
|
by the Python functions submitted to the Executor.
|
|
440
|
+
pmi_mode (str): PMI interface to use (OpenMPI v5 requires pmix) default is None
|
|
437
441
|
flux_executor (flux.job.FluxExecutor): Flux Python interface to submit the workers to flux
|
|
438
|
-
flux_executor_pmi_mode (str): PMI interface to use (OpenMPI v5 requires pmix) default is None (Flux only)
|
|
439
442
|
flux_executor_nesting (bool): Provide hierarchically nested Flux job scheduler inside the submitted function.
|
|
440
443
|
flux_log_files (bool, optional): Write flux stdout and stderr files. Defaults to False.
|
|
441
444
|
hostname_localhost (boolean): use localhost instead of the hostname to establish the zmq connection. In the
|
|
@@ -467,7 +470,7 @@ def create_flux_executor(
|
|
|
467
470
|
resource_dict["hostname_localhost"] = hostname_localhost
|
|
468
471
|
resource_dict["log_obj_size"] = log_obj_size
|
|
469
472
|
check_init_function(block_allocation=block_allocation, init_function=init_function)
|
|
470
|
-
check_pmi(backend="flux_allocation", pmi=
|
|
473
|
+
check_pmi(backend="flux_allocation", pmi=pmi_mode)
|
|
471
474
|
check_oversubscribe(oversubscribe=resource_dict.get("openmpi_oversubscribe", False))
|
|
472
475
|
check_command_line_argument_lst(
|
|
473
476
|
command_line_argument_lst=resource_dict.get("slurm_cmd_args", [])
|
|
@@ -476,8 +479,8 @@ def create_flux_executor(
|
|
|
476
479
|
del resource_dict["openmpi_oversubscribe"]
|
|
477
480
|
if "slurm_cmd_args" in resource_dict:
|
|
478
481
|
del resource_dict["slurm_cmd_args"]
|
|
482
|
+
resource_dict["pmi_mode"] = pmi_mode
|
|
479
483
|
resource_dict["flux_executor"] = flux_executor
|
|
480
|
-
resource_dict["flux_executor_pmi_mode"] = flux_executor_pmi_mode
|
|
481
484
|
resource_dict["flux_executor_nesting"] = flux_executor_nesting
|
|
482
485
|
resource_dict["flux_log_files"] = flux_log_files
|
|
483
486
|
if block_allocation:
|
|
@@ -329,7 +329,7 @@ class TestClusterExecutor(BaseExecutor):
|
|
|
329
329
|
cache_directory=cache_directory,
|
|
330
330
|
resource_dict=resource_dict,
|
|
331
331
|
flux_executor=None,
|
|
332
|
-
|
|
332
|
+
pmi_mode=None,
|
|
333
333
|
flux_executor_nesting=False,
|
|
334
334
|
flux_log_files=False,
|
|
335
335
|
pysqa_config_directory=None,
|
|
@@ -44,6 +44,7 @@ class SlurmClusterExecutor(BaseExecutor):
|
|
|
44
44
|
- error_log_file (str): Name of the error log file to use for storing exceptions raised
|
|
45
45
|
by the Python functions submitted to the Executor.
|
|
46
46
|
pysqa_config_directory (str, optional): path to the pysqa config directory (only for pysqa based backend).
|
|
47
|
+
pmi_mode (str): PMI interface to use (OpenMPI v5 requires pmix) default is None
|
|
47
48
|
hostname_localhost (boolean): use localhost instead of the hostname to establish the zmq connection. In the
|
|
48
49
|
context of an HPC cluster this essential to be able to communicate to an
|
|
49
50
|
Executor running on a different compute node within the same allocation. And
|
|
@@ -91,6 +92,7 @@ class SlurmClusterExecutor(BaseExecutor):
|
|
|
91
92
|
max_cores: Optional[int] = None,
|
|
92
93
|
resource_dict: Optional[dict] = None,
|
|
93
94
|
pysqa_config_directory: Optional[str] = None,
|
|
95
|
+
pmi_mode: Optional[str] = None,
|
|
94
96
|
hostname_localhost: Optional[bool] = None,
|
|
95
97
|
block_allocation: bool = False,
|
|
96
98
|
init_function: Optional[Callable] = None,
|
|
@@ -125,6 +127,7 @@ class SlurmClusterExecutor(BaseExecutor):
|
|
|
125
127
|
- error_log_file (str): Name of the error log file to use for storing exceptions
|
|
126
128
|
raised by the Python functions submitted to the Executor.
|
|
127
129
|
pysqa_config_directory (str, optional): path to the pysqa config directory (only for pysqa based backend).
|
|
130
|
+
pmi_mode (str): PMI interface to use (OpenMPI v5 requires pmix) default is None
|
|
128
131
|
hostname_localhost (boolean): use localhost instead of the hostname to establish the zmq connection. In the
|
|
129
132
|
context of an HPC cluster this essential to be able to communicate to an
|
|
130
133
|
Executor running on a different compute node within the same allocation. And
|
|
@@ -173,8 +176,8 @@ class SlurmClusterExecutor(BaseExecutor):
|
|
|
173
176
|
max_cores=max_cores,
|
|
174
177
|
cache_directory=cache_directory,
|
|
175
178
|
resource_dict=resource_dict,
|
|
179
|
+
pmi_mode=pmi_mode,
|
|
176
180
|
flux_executor=None,
|
|
177
|
-
flux_executor_pmi_mode=None,
|
|
178
181
|
flux_executor_nesting=False,
|
|
179
182
|
flux_log_files=False,
|
|
180
183
|
pysqa_config_directory=pysqa_config_directory,
|
|
@@ -232,6 +235,7 @@ class SlurmJobExecutor(BaseExecutor):
|
|
|
232
235
|
compute notes. Defaults to False.
|
|
233
236
|
- error_log_file (str): Name of the error log file to use for storing exceptions raised
|
|
234
237
|
by the Python functions submitted to the Executor.
|
|
238
|
+
pmi_mode (str): PMI interface to use (OpenMPI v5 requires pmix) default is None
|
|
235
239
|
hostname_localhost (boolean): use localhost instead of the hostname to establish the zmq connection. In the
|
|
236
240
|
context of an HPC cluster this essential to be able to communicate to an
|
|
237
241
|
Executor running on a different compute node within the same allocation. And
|
|
@@ -278,6 +282,7 @@ class SlurmJobExecutor(BaseExecutor):
|
|
|
278
282
|
cache_directory: Optional[str] = None,
|
|
279
283
|
max_cores: Optional[int] = None,
|
|
280
284
|
resource_dict: Optional[dict] = None,
|
|
285
|
+
pmi_mode: Optional[str] = None,
|
|
281
286
|
hostname_localhost: Optional[bool] = None,
|
|
282
287
|
block_allocation: bool = False,
|
|
283
288
|
init_function: Optional[Callable] = None,
|
|
@@ -315,6 +320,7 @@ class SlurmJobExecutor(BaseExecutor):
|
|
|
315
320
|
compute notes. Defaults to False.
|
|
316
321
|
- error_log_file (str): Name of the error log file to use for storing exceptions
|
|
317
322
|
raised by the Python functions submitted to the Executor.
|
|
323
|
+
pmi_mode (str): PMI interface to use (OpenMPI v5 requires pmix) default is None
|
|
318
324
|
hostname_localhost (boolean): use localhost instead of the hostname to establish the zmq connection. In the
|
|
319
325
|
context of an HPC cluster this essential to be able to communicate to an
|
|
320
326
|
Executor running on a different compute node within the same allocation. And
|
|
@@ -356,6 +362,7 @@ class SlurmJobExecutor(BaseExecutor):
|
|
|
356
362
|
cache_directory=cache_directory,
|
|
357
363
|
max_cores=max_cores,
|
|
358
364
|
resource_dict=resource_dict,
|
|
365
|
+
pmi_mode=pmi_mode,
|
|
359
366
|
hostname_localhost=hostname_localhost,
|
|
360
367
|
block_allocation=block_allocation,
|
|
361
368
|
init_function=init_function,
|
|
@@ -376,6 +383,7 @@ class SlurmJobExecutor(BaseExecutor):
|
|
|
376
383
|
cache_directory=cache_directory,
|
|
377
384
|
max_cores=max_cores,
|
|
378
385
|
resource_dict=resource_dict,
|
|
386
|
+
pmi_mode=pmi_mode,
|
|
379
387
|
hostname_localhost=hostname_localhost,
|
|
380
388
|
block_allocation=block_allocation,
|
|
381
389
|
init_function=init_function,
|
|
@@ -389,6 +397,7 @@ def create_slurm_executor(
|
|
|
389
397
|
max_cores: Optional[int] = None,
|
|
390
398
|
cache_directory: Optional[str] = None,
|
|
391
399
|
resource_dict: Optional[dict] = None,
|
|
400
|
+
pmi_mode: Optional[str] = None,
|
|
392
401
|
hostname_localhost: Optional[bool] = None,
|
|
393
402
|
block_allocation: bool = False,
|
|
394
403
|
init_function: Optional[Callable] = None,
|
|
@@ -418,6 +427,7 @@ def create_slurm_executor(
|
|
|
418
427
|
compute notes. Defaults to False.
|
|
419
428
|
- error_log_file (str): Name of the error log file to use for storing exceptions raised
|
|
420
429
|
by the Python functions submitted to the Executor.
|
|
430
|
+
pmi_mode (str): PMI interface to use (OpenMPI v5 requires pmix) default is None
|
|
421
431
|
hostname_localhost (boolean): use localhost instead of the hostname to establish the zmq connection. In the
|
|
422
432
|
context of an HPC cluster this essential to be able to communicate to an
|
|
423
433
|
Executor running on a different compute node within the same allocation. And
|
|
@@ -441,6 +451,7 @@ def create_slurm_executor(
|
|
|
441
451
|
resource_dict["cache_directory"] = cache_directory
|
|
442
452
|
resource_dict["hostname_localhost"] = hostname_localhost
|
|
443
453
|
resource_dict["log_obj_size"] = log_obj_size
|
|
454
|
+
resource_dict["pmi_mode"] = pmi_mode
|
|
444
455
|
check_init_function(block_allocation=block_allocation, init_function=init_function)
|
|
445
456
|
if block_allocation:
|
|
446
457
|
resource_dict["init_function"] = init_function
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
from concurrent.futures import Future
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
def batched_futures(lst: list[Future], skip_lst: list[list], n: int) -> list[list]:
|
|
5
|
+
"""
|
|
6
|
+
Batch n completed future objects. If the number of completed futures is smaller than n and the end of the batch is
|
|
7
|
+
not reached yet, then an empty list is returned. If n future objects are done, which are not included in the skip_lst
|
|
8
|
+
then they are returned as batch.
|
|
9
|
+
|
|
10
|
+
Args:
|
|
11
|
+
lst (list): list of all future objects
|
|
12
|
+
skip_lst (list): list of previous batches of future objects
|
|
13
|
+
n (int): batch size
|
|
14
|
+
|
|
15
|
+
Returns:
|
|
16
|
+
list: results of the batched futures
|
|
17
|
+
"""
|
|
18
|
+
skipped_elements_lst = [item for items in skip_lst for item in items]
|
|
19
|
+
|
|
20
|
+
done_lst = []
|
|
21
|
+
n_expected = min(n, len(lst) - len(skipped_elements_lst))
|
|
22
|
+
for v in lst:
|
|
23
|
+
if v.done() and v.result() not in skipped_elements_lst:
|
|
24
|
+
done_lst.append(v.result())
|
|
25
|
+
if len(done_lst) == n_expected:
|
|
26
|
+
return done_lst
|
|
27
|
+
return []
|
|
@@ -0,0 +1,114 @@
|
|
|
1
|
+
import importlib.util
|
|
2
|
+
import os
|
|
3
|
+
import sys
|
|
4
|
+
from typing import Optional
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def get_command_path(executable: str) -> str:
    """
    Resolve the absolute path of a backend executable script.

    Args:
        executable (str): Name of the backend executable script, either mpiexec.py or serial.py

    Returns:
        str: absolute path to the executable script
    """
    # The backend scripts live in the sibling "backend" package, one level
    # above the directory containing this module.
    package_dir = os.path.dirname(__file__)
    backend_dir = os.path.join(package_dir, "..", "backend")
    return os.path.abspath(os.path.join(backend_dir, executable))
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def get_cache_execute_command(
    file_name: str,
    cores: int = 1,
    backend: Optional[str] = None,
    exclusive: bool = False,
    openmpi_oversubscribe: bool = False,
    pmi_mode: Optional[str] = None,
) -> list:
    """
    Get command to call backend as a list of two strings

    Args:
        file_name (str): The name of the file.
        cores (int, optional): Number of cores used to execute the task. Defaults to 1.
        backend (str, optional): name of the backend used to spawn tasks ["slurm", "flux"].
        exclusive (bool): Whether to exclusively reserve the compute nodes, or allow sharing compute notes. Defaults to False.
        openmpi_oversubscribe (bool, optional): Whether to oversubscribe the cores. Defaults to False.
        pmi_mode (str): PMI interface to use (OpenMPI v5 requires pmix) default is None

    Returns:
        list[str]: List of strings containing the python executable path and the backend script to execute
    """
    # Serial execution: no MPI launcher required.
    if cores <= 1:
        return [
            sys.executable,
            get_command_path(executable="cache_serial.py"),
            file_name,
        ]

    # Parallel execution requires mpi4py on the Python side.
    if importlib.util.find_spec("mpi4py") is None:
        raise ImportError(
            "mpi4py is required for parallel calculations. Please install mpi4py."
        )

    # Build the backend-specific launcher prefix.
    if backend is None:
        launcher = ["mpiexec", "-n", str(cores)]
    elif backend == "slurm":
        launcher = ["srun", "-n", str(cores)]
        if pmi_mode is not None:
            launcher.append("--mpi=" + pmi_mode)
        if openmpi_oversubscribe:
            launcher.append("--oversubscribe")
        if exclusive:
            launcher.append("--exact")
    elif backend == "flux":
        # flux run has no equivalent of these srun flags.
        if openmpi_oversubscribe:
            raise ValueError(
                "The option openmpi_oversubscribe is not available with the flux backend."
            )
        if exclusive:
            raise ValueError(
                "The option exclusive is not available with the flux backend."
            )
        launcher = ["flux", "run"]
        if pmi_mode is not None:
            launcher += ["-o", "pmi=" + pmi_mode]
        launcher += ["-n", str(cores)]
    else:
        raise ValueError(f"backend should be None, slurm or flux, not {backend}")

    return launcher + [
        sys.executable,
        get_command_path(executable="cache_parallel.py"),
        file_name,
    ]
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
def get_interactive_execute_command(
    cores: int,
) -> list:
    """
    Get command to call backend as a list of two strings

    Args:
        cores (int): Number of cores used to execute the task, if it is greater than one use interactive_parallel.py
                     else interactive_serial.py

    Returns:
        list[str]: List of strings containing the python executable path and the backend script to execute
    """
    # Single-core tasks use the serial backend script.
    if cores <= 1:
        return [sys.executable, get_command_path(executable="interactive_serial.py")]

    # Multi-core tasks need mpi4py for the MPI-parallel backend script.
    if importlib.util.find_spec("mpi4py") is None:
        raise ImportError(
            "mpi4py is required for parallel calculations. Please install mpi4py."
        )
    return [sys.executable, get_command_path(executable="interactive_parallel.py")]
|