executorlib 1.6.2__tar.gz → 1.7.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {executorlib-1.6.2 → executorlib-1.7.0}/PKG-INFO +1 -1
- {executorlib-1.6.2 → executorlib-1.7.0}/executorlib/__init__.py +1 -1
- {executorlib-1.6.2 → executorlib-1.7.0}/executorlib/_version.py +2 -2
- {executorlib-1.6.2 → executorlib-1.7.0}/executorlib/backend/interactive_parallel.py +17 -1
- {executorlib-1.6.2 → executorlib-1.7.0}/executorlib/backend/interactive_serial.py +15 -1
- {executorlib-1.6.2 → executorlib-1.7.0}/executorlib/executor/flux.py +42 -21
- {executorlib-1.6.2 → executorlib-1.7.0}/executorlib/executor/single.py +2 -1
- {executorlib-1.6.2 → executorlib-1.7.0}/executorlib/executor/slurm.py +43 -21
- {executorlib-1.6.2 → executorlib-1.7.0}/executorlib/standalone/command.py +50 -0
- {executorlib-1.6.2 → executorlib-1.7.0}/executorlib/standalone/interactive/communication.py +63 -14
- {executorlib-1.6.2 → executorlib-1.7.0}/executorlib/standalone/interactive/spawner.py +14 -3
- {executorlib-1.6.2 → executorlib-1.7.0}/executorlib/task_scheduler/file/shared.py +1 -1
- {executorlib-1.6.2 → executorlib-1.7.0}/executorlib/task_scheduler/file/task_scheduler.py +2 -2
- executorlib-1.7.0/executorlib/task_scheduler/interactive/blockallocation.py +313 -0
- {executorlib-1.6.2 → executorlib-1.7.0}/executorlib/task_scheduler/interactive/onetoone.py +93 -35
- executorlib-1.7.0/executorlib/task_scheduler/interactive/shared.py +159 -0
- executorlib-1.6.2/executorlib/task_scheduler/interactive/fluxspawner.py → executorlib-1.7.0/executorlib/task_scheduler/interactive/spawner_flux.py +9 -2
- executorlib-1.7.0/executorlib/task_scheduler/interactive/spawner_pysqa.py +248 -0
- executorlib-1.6.2/executorlib/task_scheduler/interactive/slurmspawner.py → executorlib-1.7.0/executorlib/task_scheduler/interactive/spawner_slurm.py +1 -1
- executorlib-1.6.2/executorlib/standalone/slurm_command.py +0 -51
- executorlib-1.6.2/executorlib/task_scheduler/interactive/blockallocation.py +0 -177
- executorlib-1.6.2/executorlib/task_scheduler/interactive/shared.py +0 -172
- {executorlib-1.6.2 → executorlib-1.7.0}/.gitignore +0 -0
- {executorlib-1.6.2 → executorlib-1.7.0}/LICENSE +0 -0
- {executorlib-1.6.2 → executorlib-1.7.0}/README.md +0 -0
- {executorlib-1.6.2 → executorlib-1.7.0}/executorlib/api.py +0 -0
- {executorlib-1.6.2 → executorlib-1.7.0}/executorlib/backend/__init__.py +0 -0
- {executorlib-1.6.2 → executorlib-1.7.0}/executorlib/backend/cache_parallel.py +0 -0
- {executorlib-1.6.2 → executorlib-1.7.0}/executorlib/backend/cache_serial.py +0 -0
- {executorlib-1.6.2 → executorlib-1.7.0}/executorlib/executor/__init__.py +0 -0
- {executorlib-1.6.2 → executorlib-1.7.0}/executorlib/executor/base.py +0 -0
- {executorlib-1.6.2 → executorlib-1.7.0}/executorlib/standalone/__init__.py +0 -0
- {executorlib-1.6.2 → executorlib-1.7.0}/executorlib/standalone/batched.py +0 -0
- {executorlib-1.6.2 → executorlib-1.7.0}/executorlib/standalone/error.py +0 -0
- {executorlib-1.6.2 → executorlib-1.7.0}/executorlib/standalone/hdf.py +0 -0
- {executorlib-1.6.2 → executorlib-1.7.0}/executorlib/standalone/inputcheck.py +0 -0
- {executorlib-1.6.2 → executorlib-1.7.0}/executorlib/standalone/interactive/__init__.py +0 -0
- {executorlib-1.6.2 → executorlib-1.7.0}/executorlib/standalone/interactive/arguments.py +0 -0
- {executorlib-1.6.2 → executorlib-1.7.0}/executorlib/standalone/interactive/backend.py +0 -0
- {executorlib-1.6.2 → executorlib-1.7.0}/executorlib/standalone/plot.py +0 -0
- {executorlib-1.6.2 → executorlib-1.7.0}/executorlib/standalone/queue.py +0 -0
- {executorlib-1.6.2 → executorlib-1.7.0}/executorlib/standalone/scheduler.py +0 -0
- {executorlib-1.6.2 → executorlib-1.7.0}/executorlib/standalone/serialize.py +0 -0
- {executorlib-1.6.2 → executorlib-1.7.0}/executorlib/task_scheduler/__init__.py +0 -0
- {executorlib-1.6.2 → executorlib-1.7.0}/executorlib/task_scheduler/base.py +0 -0
- {executorlib-1.6.2 → executorlib-1.7.0}/executorlib/task_scheduler/file/__init__.py +0 -0
- {executorlib-1.6.2 → executorlib-1.7.0}/executorlib/task_scheduler/file/backend.py +0 -0
- /executorlib-1.6.2/executorlib/task_scheduler/file/queue_spawner.py → /executorlib-1.7.0/executorlib/task_scheduler/file/spawner_pysqa.py +0 -0
- /executorlib-1.6.2/executorlib/task_scheduler/file/subprocess_spawner.py → /executorlib-1.7.0/executorlib/task_scheduler/file/spawner_subprocess.py +0 -0
- {executorlib-1.6.2 → executorlib-1.7.0}/executorlib/task_scheduler/interactive/__init__.py +0 -0
- {executorlib-1.6.2 → executorlib-1.7.0}/executorlib/task_scheduler/interactive/dependency.py +0 -0
- {executorlib-1.6.2 → executorlib-1.7.0}/pyproject.toml +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: executorlib
|
|
3
|
-
Version: 1.6.2
|
|
3
|
+
Version: 1.7.0
|
|
4
4
|
Summary: Up-scale python functions for high performance computing (HPC) with executorlib.
|
|
5
5
|
Project-URL: Homepage, https://github.com/pyiron/executorlib
|
|
6
6
|
Project-URL: Documentation, https://executorlib.readthedocs.io
|
|
@@ -55,7 +55,7 @@ def terminate_tasks_in_cache(
|
|
|
55
55
|
config_directory (str, optional): path to the config directory.
|
|
56
56
|
backend (str, optional): name of the backend used to spawn tasks ["slurm", "flux"].
|
|
57
57
|
"""
|
|
58
|
-
from executorlib.task_scheduler.file.queue_spawner import terminate_tasks_in_cache
|
|
58
|
+
from executorlib.task_scheduler.file.spawner_pysqa import terminate_tasks_in_cache
|
|
59
59
|
|
|
60
60
|
return terminate_tasks_in_cache(
|
|
61
61
|
cache_directory=cache_directory,
|
|
@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
|
|
|
28
28
|
commit_id: COMMIT_ID
|
|
29
29
|
__commit_id__: COMMIT_ID
|
|
30
30
|
|
|
31
|
-
__version__ = version = '1.6.2'
|
|
32
|
-
__version_tuple__ = version_tuple = (1, 6, 2)
|
|
31
|
+
__version__ = version = '1.7.0'
|
|
32
|
+
__version_tuple__ = version_tuple = (1, 7, 0)
|
|
33
33
|
|
|
34
34
|
__commit_id__ = commit_id = None
|
|
@@ -97,7 +97,23 @@ def main() -> None:
|
|
|
97
97
|
and "args" in input_dict
|
|
98
98
|
and "kwargs" in input_dict
|
|
99
99
|
):
|
|
100
|
-
|
|
100
|
+
try:
|
|
101
|
+
memory.update(
|
|
102
|
+
call_funct(input_dict=input_dict, funct=None, memory=memory)
|
|
103
|
+
)
|
|
104
|
+
except Exception as error:
|
|
105
|
+
if mpi_rank_zero:
|
|
106
|
+
interface_send(
|
|
107
|
+
socket=socket,
|
|
108
|
+
result_dict={"error": error},
|
|
109
|
+
)
|
|
110
|
+
backend_write_error_file(
|
|
111
|
+
error=error,
|
|
112
|
+
apply_dict=input_dict,
|
|
113
|
+
)
|
|
114
|
+
else:
|
|
115
|
+
if mpi_rank_zero:
|
|
116
|
+
interface_send(socket=socket, result_dict={"result": True})
|
|
101
117
|
|
|
102
118
|
|
|
103
119
|
if __name__ == "__main__":
|
|
@@ -72,7 +72,21 @@ def main(argument_lst: Optional[list[str]] = None):
|
|
|
72
72
|
and "args" in input_dict
|
|
73
73
|
and "kwargs" in input_dict
|
|
74
74
|
):
|
|
75
|
-
|
|
75
|
+
try:
|
|
76
|
+
memory.update(
|
|
77
|
+
call_funct(input_dict=input_dict, funct=None, memory=memory)
|
|
78
|
+
)
|
|
79
|
+
except Exception as error:
|
|
80
|
+
interface_send(
|
|
81
|
+
socket=socket,
|
|
82
|
+
result_dict={"error": error},
|
|
83
|
+
)
|
|
84
|
+
backend_write_error_file(
|
|
85
|
+
error=error,
|
|
86
|
+
apply_dict=input_dict,
|
|
87
|
+
)
|
|
88
|
+
else:
|
|
89
|
+
interface_send(socket=socket, result_dict={"result": True})
|
|
76
90
|
|
|
77
91
|
|
|
78
92
|
if __name__ == "__main__":
|
|
@@ -43,6 +43,7 @@ class FluxJobExecutor(BaseExecutor):
|
|
|
43
43
|
compute notes. Defaults to False.
|
|
44
44
|
- error_log_file (str): Name of the error log file to use for storing exceptions raised
|
|
45
45
|
by the Python functions submitted to the Executor.
|
|
46
|
+
- restart_limit (int): The maximum number of restarting worker processes. Default: 0
|
|
46
47
|
pmi_mode (str): PMI interface to use (OpenMPI v5 requires pmix) default is None
|
|
47
48
|
flux_executor (flux.job.FluxExecutor): Flux Python interface to submit the workers to flux
|
|
48
49
|
flux_executor_nesting (bool): Provide hierarchically nested Flux job scheduler inside the submitted function.
|
|
@@ -357,28 +358,48 @@ class FluxClusterExecutor(BaseExecutor):
|
|
|
357
358
|
if not plot_dependency_graph:
|
|
358
359
|
import pysqa # noqa
|
|
359
360
|
|
|
360
|
-
|
|
361
|
-
|
|
362
|
-
|
|
361
|
+
if block_allocation:
|
|
362
|
+
from executorlib.task_scheduler.interactive.spawner_pysqa import (
|
|
363
|
+
create_pysqa_block_allocation_scheduler,
|
|
364
|
+
)
|
|
363
365
|
|
|
364
|
-
|
|
365
|
-
|
|
366
|
-
|
|
367
|
-
|
|
368
|
-
|
|
369
|
-
|
|
370
|
-
|
|
371
|
-
|
|
372
|
-
|
|
373
|
-
|
|
374
|
-
|
|
375
|
-
|
|
376
|
-
|
|
377
|
-
|
|
378
|
-
|
|
379
|
-
|
|
366
|
+
super().__init__(
|
|
367
|
+
executor=create_pysqa_block_allocation_scheduler(
|
|
368
|
+
max_cores=max_cores,
|
|
369
|
+
cache_directory=cache_directory,
|
|
370
|
+
hostname_localhost=hostname_localhost,
|
|
371
|
+
log_obj_size=log_obj_size,
|
|
372
|
+
pmi_mode=pmi_mode,
|
|
373
|
+
init_function=init_function,
|
|
374
|
+
max_workers=max_workers,
|
|
375
|
+
resource_dict=resource_dict,
|
|
376
|
+
pysqa_config_directory=pysqa_config_directory,
|
|
377
|
+
backend="flux",
|
|
378
|
+
)
|
|
379
|
+
)
|
|
380
|
+
else:
|
|
381
|
+
from executorlib.task_scheduler.file.task_scheduler import (
|
|
382
|
+
create_file_executor,
|
|
383
|
+
)
|
|
384
|
+
|
|
385
|
+
super().__init__(
|
|
386
|
+
executor=create_file_executor(
|
|
387
|
+
max_workers=max_workers,
|
|
388
|
+
backend="flux",
|
|
389
|
+
max_cores=max_cores,
|
|
390
|
+
cache_directory=cache_directory,
|
|
391
|
+
resource_dict=resource_dict,
|
|
392
|
+
flux_executor=None,
|
|
393
|
+
pmi_mode=pmi_mode,
|
|
394
|
+
flux_executor_nesting=False,
|
|
395
|
+
flux_log_files=False,
|
|
396
|
+
pysqa_config_directory=pysqa_config_directory,
|
|
397
|
+
hostname_localhost=hostname_localhost,
|
|
398
|
+
block_allocation=block_allocation,
|
|
399
|
+
init_function=init_function,
|
|
400
|
+
disable_dependencies=disable_dependencies,
|
|
401
|
+
)
|
|
380
402
|
)
|
|
381
|
-
)
|
|
382
403
|
else:
|
|
383
404
|
super().__init__(
|
|
384
405
|
executor=DependencyTaskScheduler(
|
|
@@ -458,7 +479,7 @@ def create_flux_executor(
|
|
|
458
479
|
Returns:
|
|
459
480
|
InteractiveStepExecutor/ InteractiveExecutor
|
|
460
481
|
"""
|
|
461
|
-
from executorlib.task_scheduler.interactive.fluxspawner import (
|
|
482
|
+
from executorlib.task_scheduler.interactive.spawner_flux import (
|
|
462
483
|
FluxPythonSpawner,
|
|
463
484
|
validate_max_workers,
|
|
464
485
|
)
|
|
@@ -120,6 +120,7 @@ class SingleNodeExecutor(BaseExecutor):
|
|
|
120
120
|
only)
|
|
121
121
|
- error_log_file (str): Name of the error log file to use for storing exceptions
|
|
122
122
|
raised by the Python functions submitted to the Executor.
|
|
123
|
+
- restart_limit (int): The maximum number of restarting worker processes. Default: 0
|
|
123
124
|
hostname_localhost (boolean): use localhost instead of the hostname to establish the zmq connection. In the
|
|
124
125
|
context of an HPC cluster this essential to be able to communicate to an
|
|
125
126
|
Executor running on a different compute node within the same allocation. And
|
|
@@ -314,7 +315,7 @@ class TestClusterExecutor(BaseExecutor):
|
|
|
314
315
|
{k: v for k, v in default_resource_dict.items() if k not in resource_dict}
|
|
315
316
|
)
|
|
316
317
|
if not plot_dependency_graph:
|
|
317
|
-
from executorlib.task_scheduler.file.subprocess_spawner import (
|
|
318
|
+
from executorlib.task_scheduler.file.spawner_subprocess import (
|
|
318
319
|
execute_in_subprocess,
|
|
319
320
|
)
|
|
320
321
|
from executorlib.task_scheduler.file.task_scheduler import (
|
|
@@ -13,7 +13,7 @@ from executorlib.task_scheduler.interactive.blockallocation import (
|
|
|
13
13
|
)
|
|
14
14
|
from executorlib.task_scheduler.interactive.dependency import DependencyTaskScheduler
|
|
15
15
|
from executorlib.task_scheduler.interactive.onetoone import OneProcessTaskScheduler
|
|
16
|
-
from executorlib.task_scheduler.interactive.slurmspawner import (
|
|
16
|
+
from executorlib.task_scheduler.interactive.spawner_slurm import (
|
|
17
17
|
SrunSpawner,
|
|
18
18
|
validate_max_workers,
|
|
19
19
|
)
|
|
@@ -43,6 +43,7 @@ class SlurmClusterExecutor(BaseExecutor):
|
|
|
43
43
|
- slurm_cmd_args (list): Additional command line arguments for the srun call (SLURM only)
|
|
44
44
|
- error_log_file (str): Name of the error log file to use for storing exceptions raised
|
|
45
45
|
by the Python functions submitted to the Executor.
|
|
46
|
+
- restart_limit (int): The maximum number of restarting worker processes. Default: 0
|
|
46
47
|
pysqa_config_directory (str, optional): path to the pysqa config directory (only for pysqa based backend).
|
|
47
48
|
pmi_mode (str): PMI interface to use (OpenMPI v5 requires pmix) default is None
|
|
48
49
|
hostname_localhost (boolean): use localhost instead of the hostname to establish the zmq connection. In the
|
|
@@ -165,28 +166,49 @@ class SlurmClusterExecutor(BaseExecutor):
|
|
|
165
166
|
if not plot_dependency_graph:
|
|
166
167
|
import pysqa # noqa
|
|
167
168
|
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
169
|
+
if block_allocation:
|
|
170
|
+
from executorlib.task_scheduler.interactive.spawner_pysqa import (
|
|
171
|
+
create_pysqa_block_allocation_scheduler,
|
|
172
|
+
)
|
|
171
173
|
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
174
|
+
super().__init__(
|
|
175
|
+
executor=create_pysqa_block_allocation_scheduler(
|
|
176
|
+
max_cores=max_cores,
|
|
177
|
+
cache_directory=cache_directory,
|
|
178
|
+
hostname_localhost=hostname_localhost,
|
|
179
|
+
log_obj_size=log_obj_size,
|
|
180
|
+
pmi_mode=pmi_mode,
|
|
181
|
+
init_function=init_function,
|
|
182
|
+
max_workers=max_workers,
|
|
183
|
+
resource_dict=resource_dict,
|
|
184
|
+
pysqa_config_directory=pysqa_config_directory,
|
|
185
|
+
backend="slurm",
|
|
186
|
+
),
|
|
187
|
+
)
|
|
188
|
+
|
|
189
|
+
else:
|
|
190
|
+
from executorlib.task_scheduler.file.task_scheduler import (
|
|
191
|
+
create_file_executor,
|
|
192
|
+
)
|
|
193
|
+
|
|
194
|
+
super().__init__(
|
|
195
|
+
executor=create_file_executor(
|
|
196
|
+
max_workers=max_workers,
|
|
197
|
+
backend="slurm",
|
|
198
|
+
max_cores=max_cores,
|
|
199
|
+
cache_directory=cache_directory,
|
|
200
|
+
resource_dict=resource_dict,
|
|
201
|
+
pmi_mode=pmi_mode,
|
|
202
|
+
flux_executor=None,
|
|
203
|
+
flux_executor_nesting=False,
|
|
204
|
+
flux_log_files=False,
|
|
205
|
+
pysqa_config_directory=pysqa_config_directory,
|
|
206
|
+
hostname_localhost=hostname_localhost,
|
|
207
|
+
block_allocation=block_allocation,
|
|
208
|
+
init_function=init_function,
|
|
209
|
+
disable_dependencies=disable_dependencies,
|
|
210
|
+
)
|
|
188
211
|
)
|
|
189
|
-
)
|
|
190
212
|
else:
|
|
191
213
|
super().__init__(
|
|
192
214
|
executor=DependencyTaskScheduler(
|
|
@@ -3,6 +3,8 @@ import os
|
|
|
3
3
|
import sys
|
|
4
4
|
from typing import Optional
|
|
5
5
|
|
|
6
|
+
SLURM_COMMAND = "srun"
|
|
7
|
+
|
|
6
8
|
|
|
7
9
|
def get_command_path(executable: str) -> str:
|
|
8
10
|
"""
|
|
@@ -112,3 +114,51 @@ def get_interactive_execute_command(
|
|
|
112
114
|
else:
|
|
113
115
|
command_lst += [get_command_path(executable="interactive_serial.py")]
|
|
114
116
|
return command_lst
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
def generate_slurm_command(
|
|
120
|
+
cores: int,
|
|
121
|
+
cwd: Optional[str],
|
|
122
|
+
threads_per_core: int = 1,
|
|
123
|
+
gpus_per_core: int = 0,
|
|
124
|
+
num_nodes: Optional[int] = None,
|
|
125
|
+
exclusive: bool = False,
|
|
126
|
+
openmpi_oversubscribe: bool = False,
|
|
127
|
+
slurm_cmd_args: Optional[list[str]] = None,
|
|
128
|
+
pmi_mode: Optional[str] = None,
|
|
129
|
+
) -> list[str]:
|
|
130
|
+
"""
|
|
131
|
+
Generate the command list for the SLURM interface.
|
|
132
|
+
|
|
133
|
+
Args:
|
|
134
|
+
cores (int): The number of cores.
|
|
135
|
+
cwd (str): The current working directory.
|
|
136
|
+
threads_per_core (int, optional): The number of threads per core. Defaults to 1.
|
|
137
|
+
gpus_per_core (int, optional): The number of GPUs per core. Defaults to 0.
|
|
138
|
+
num_nodes (int, optional): The number of compute nodes to use for executing the task. Defaults to None.
|
|
139
|
+
exclusive (bool): Whether to exclusively reserve the compute nodes, or allow sharing compute notes. Defaults to False.
|
|
140
|
+
openmpi_oversubscribe (bool, optional): Whether to oversubscribe the cores. Defaults to False.
|
|
141
|
+
slurm_cmd_args (list[str], optional): Additional command line arguments. Defaults to [].
|
|
142
|
+
pmi_mode (str): PMI interface to use (OpenMPI v5 requires pmix) default is None
|
|
143
|
+
|
|
144
|
+
Returns:
|
|
145
|
+
list[str]: The generated command list.
|
|
146
|
+
"""
|
|
147
|
+
command_prepend_lst = [SLURM_COMMAND, "-n", str(cores)]
|
|
148
|
+
if cwd is not None:
|
|
149
|
+
command_prepend_lst += ["-D", cwd]
|
|
150
|
+
if pmi_mode is not None:
|
|
151
|
+
command_prepend_lst += ["--mpi=" + pmi_mode]
|
|
152
|
+
if num_nodes is not None:
|
|
153
|
+
command_prepend_lst += ["-N", str(num_nodes)]
|
|
154
|
+
if threads_per_core > 1:
|
|
155
|
+
command_prepend_lst += ["--cpus-per-task=" + str(threads_per_core)]
|
|
156
|
+
if gpus_per_core > 0:
|
|
157
|
+
command_prepend_lst += ["--gpus-per-task=" + str(gpus_per_core)]
|
|
158
|
+
if exclusive:
|
|
159
|
+
command_prepend_lst += ["--exact"]
|
|
160
|
+
if openmpi_oversubscribe:
|
|
161
|
+
command_prepend_lst += ["--oversubscribe"]
|
|
162
|
+
if slurm_cmd_args is not None and len(slurm_cmd_args) > 0:
|
|
163
|
+
command_prepend_lst += slurm_cmd_args
|
|
164
|
+
return command_prepend_lst
|
|
@@ -1,12 +1,16 @@
|
|
|
1
1
|
import logging
|
|
2
2
|
import sys
|
|
3
3
|
from socket import gethostname
|
|
4
|
-
from typing import Optional
|
|
4
|
+
from typing import Any, Callable, Optional
|
|
5
5
|
|
|
6
6
|
import cloudpickle
|
|
7
7
|
import zmq
|
|
8
8
|
|
|
9
9
|
|
|
10
|
+
class ExecutorlibSocketError(RuntimeError):
|
|
11
|
+
pass
|
|
12
|
+
|
|
13
|
+
|
|
10
14
|
class SocketInterface:
|
|
11
15
|
"""
|
|
12
16
|
The SocketInterface is an abstraction layer on top of the zero message queue.
|
|
@@ -14,23 +18,41 @@ class SocketInterface:
|
|
|
14
18
|
Args:
|
|
15
19
|
spawner (executorlib.shared.spawner.BaseSpawner): Interface for starting the parallel process
|
|
16
20
|
log_obj_size (boolean): Enable debug mode which reports the size of the communicated objects.
|
|
21
|
+
time_out_ms (int): Time out for waiting for a message on socket in milliseconds.
|
|
17
22
|
"""
|
|
18
23
|
|
|
19
|
-
def __init__(
|
|
24
|
+
def __init__(
|
|
25
|
+
self, spawner=None, log_obj_size: bool = False, time_out_ms: int = 1000
|
|
26
|
+
):
|
|
20
27
|
"""
|
|
21
28
|
Initialize the SocketInterface.
|
|
22
29
|
|
|
23
30
|
Args:
|
|
24
31
|
spawner (executorlib.shared.spawner.BaseSpawner): Interface for starting the parallel process
|
|
32
|
+
log_obj_size (boolean): Enable debug mode which reports the size of the communicated objects.
|
|
33
|
+
time_out_ms (int): Time out for waiting for a message on socket in milliseconds.
|
|
25
34
|
"""
|
|
26
35
|
self._context = zmq.Context()
|
|
27
36
|
self._socket = self._context.socket(zmq.PAIR)
|
|
37
|
+
self._poller = zmq.Poller()
|
|
38
|
+
self._poller.register(self._socket, zmq.POLLIN)
|
|
28
39
|
self._process = None
|
|
40
|
+
self._time_out_ms = time_out_ms
|
|
41
|
+
self._logger: Optional[logging.Logger] = None
|
|
29
42
|
if log_obj_size:
|
|
30
43
|
self._logger = logging.getLogger("executorlib")
|
|
31
|
-
else:
|
|
32
|
-
self._logger = None
|
|
33
44
|
self._spawner = spawner
|
|
45
|
+
self._command_lst: list[str] = []
|
|
46
|
+
self._booted_sucessfully: bool = False
|
|
47
|
+
self._stop_function: Optional[Callable] = None
|
|
48
|
+
|
|
49
|
+
@property
|
|
50
|
+
def status(self) -> bool:
|
|
51
|
+
return self._booted_sucessfully
|
|
52
|
+
|
|
53
|
+
@status.setter
|
|
54
|
+
def status(self, status: bool):
|
|
55
|
+
self._booted_sucessfully = status
|
|
34
56
|
|
|
35
57
|
def send_dict(self, input_dict: dict):
|
|
36
58
|
"""
|
|
@@ -52,7 +74,14 @@ class SocketInterface:
|
|
|
52
74
|
Returns:
|
|
53
75
|
dict: dictionary with response received from the connected client
|
|
54
76
|
"""
|
|
55
|
-
|
|
77
|
+
response_lst: list[tuple[Any, int]] = []
|
|
78
|
+
while len(response_lst) == 0:
|
|
79
|
+
response_lst = self._poller.poll(self._time_out_ms)
|
|
80
|
+
if not self._spawner.poll():
|
|
81
|
+
raise ExecutorlibSocketError(
|
|
82
|
+
"SocketInterface crashed during execution."
|
|
83
|
+
)
|
|
84
|
+
data = self._socket.recv(zmq.NOBLOCK)
|
|
56
85
|
if self._logger is not None:
|
|
57
86
|
self._logger.warning(
|
|
58
87
|
"Received dictionary of size: " + str(sys.getsizeof(data))
|
|
@@ -69,7 +98,7 @@ class SocketInterface:
|
|
|
69
98
|
|
|
70
99
|
Args:
|
|
71
100
|
input_dict (dict): dictionary of commands to be communicated. The key "shutdown" is reserved to stop the
|
|
72
|
-
|
|
101
|
+
connected client from listening.
|
|
73
102
|
|
|
74
103
|
Returns:
|
|
75
104
|
dict: dictionary with response received from the connected client
|
|
@@ -89,17 +118,30 @@ class SocketInterface:
|
|
|
89
118
|
|
|
90
119
|
def bootup(
|
|
91
120
|
self,
|
|
92
|
-
command_lst: list[str],
|
|
121
|
+
command_lst: Optional[list[str]] = None,
|
|
122
|
+
stop_function: Optional[Callable] = None,
|
|
93
123
|
):
|
|
94
124
|
"""
|
|
95
125
|
Boot up the client process to connect to the SocketInterface.
|
|
96
126
|
|
|
97
127
|
Args:
|
|
98
128
|
command_lst (list): list of strings to start the client process
|
|
129
|
+
stop_function (Callable): Function to stop the interface.
|
|
99
130
|
"""
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
131
|
+
if command_lst is not None:
|
|
132
|
+
self._command_lst = command_lst
|
|
133
|
+
if stop_function is not None:
|
|
134
|
+
self._stop_function = stop_function
|
|
135
|
+
if len(self._command_lst) == 0:
|
|
136
|
+
raise ValueError("No command defined to boot up SocketInterface.")
|
|
137
|
+
if not self._spawner.bootup(
|
|
138
|
+
command_lst=self._command_lst,
|
|
139
|
+
stop_function=self._stop_function,
|
|
140
|
+
):
|
|
141
|
+
self._reset_socket()
|
|
142
|
+
self._booted_sucessfully = False
|
|
143
|
+
else:
|
|
144
|
+
self._booted_sucessfully = True
|
|
103
145
|
|
|
104
146
|
def shutdown(self, wait: bool = True):
|
|
105
147
|
"""
|
|
@@ -114,6 +156,13 @@ class SocketInterface:
|
|
|
114
156
|
input_dict={"shutdown": True, "wait": wait}
|
|
115
157
|
)
|
|
116
158
|
self._spawner.shutdown(wait=wait)
|
|
159
|
+
self._reset_socket()
|
|
160
|
+
return result
|
|
161
|
+
|
|
162
|
+
def _reset_socket(self):
|
|
163
|
+
"""
|
|
164
|
+
Reset the socket and context of the SocketInterface instance.
|
|
165
|
+
"""
|
|
117
166
|
if self._socket is not None:
|
|
118
167
|
self._socket.close()
|
|
119
168
|
if self._context is not None:
|
|
@@ -121,7 +170,6 @@ class SocketInterface:
|
|
|
121
170
|
self._process = None
|
|
122
171
|
self._socket = None
|
|
123
172
|
self._context = None
|
|
124
|
-
return result
|
|
125
173
|
|
|
126
174
|
def __del__(self):
|
|
127
175
|
"""
|
|
@@ -137,6 +185,7 @@ def interface_bootup(
|
|
|
137
185
|
hostname_localhost: Optional[bool] = None,
|
|
138
186
|
log_obj_size: bool = False,
|
|
139
187
|
worker_id: Optional[int] = None,
|
|
188
|
+
stop_function: Optional[Callable] = None,
|
|
140
189
|
) -> SocketInterface:
|
|
141
190
|
"""
|
|
142
191
|
Start interface for ZMQ communication
|
|
@@ -155,13 +204,12 @@ def interface_bootup(
|
|
|
155
204
|
log_obj_size (boolean): Enable debug mode which reports the size of the communicated objects.
|
|
156
205
|
worker_id (int): Communicate the worker which ID was assigned to it for future reference and resource
|
|
157
206
|
distribution.
|
|
207
|
+
stop_function (Callable): Function to stop the interface.
|
|
158
208
|
|
|
159
209
|
Returns:
|
|
160
210
|
executorlib.shared.communication.SocketInterface: socket interface for zmq communication
|
|
161
211
|
"""
|
|
162
|
-
if hostname_localhost is None and sys.platform == "darwin":
|
|
163
|
-
hostname_localhost = True
|
|
164
|
-
elif hostname_localhost is None:
|
|
212
|
+
if hostname_localhost is None and sys.platform != "darwin":
|
|
165
213
|
hostname_localhost = False
|
|
166
214
|
if not hostname_localhost:
|
|
167
215
|
command_lst += [
|
|
@@ -180,6 +228,7 @@ def interface_bootup(
|
|
|
180
228
|
]
|
|
181
229
|
interface.bootup(
|
|
182
230
|
command_lst=command_lst,
|
|
231
|
+
stop_function=stop_function,
|
|
183
232
|
)
|
|
184
233
|
return interface
|
|
185
234
|
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
import os
|
|
2
2
|
import subprocess
|
|
3
3
|
from abc import ABC, abstractmethod
|
|
4
|
-
from typing import Optional
|
|
4
|
+
from typing import Callable, Optional
|
|
5
5
|
|
|
6
6
|
MPI_COMMAND = "mpiexec"
|
|
7
7
|
|
|
@@ -29,12 +29,17 @@ class BaseSpawner(ABC):
|
|
|
29
29
|
def bootup(
|
|
30
30
|
self,
|
|
31
31
|
command_lst: list[str],
|
|
32
|
-
|
|
32
|
+
stop_function: Optional[Callable] = None,
|
|
33
|
+
) -> bool:
|
|
33
34
|
"""
|
|
34
35
|
Method to start the interface.
|
|
35
36
|
|
|
36
37
|
Args:
|
|
37
38
|
command_lst (list[str]): The command list to execute.
|
|
39
|
+
stop_function (Callable): Function to stop the interface.
|
|
40
|
+
|
|
41
|
+
Returns:
|
|
42
|
+
bool: Whether the interface was successfully started.
|
|
38
43
|
"""
|
|
39
44
|
raise NotImplementedError
|
|
40
45
|
|
|
@@ -87,12 +92,17 @@ class SubprocessSpawner(BaseSpawner):
|
|
|
87
92
|
def bootup(
|
|
88
93
|
self,
|
|
89
94
|
command_lst: list[str],
|
|
90
|
-
|
|
95
|
+
stop_function: Optional[Callable] = None,
|
|
96
|
+
) -> bool:
|
|
91
97
|
"""
|
|
92
98
|
Method to start the subprocess interface.
|
|
93
99
|
|
|
94
100
|
Args:
|
|
95
101
|
command_lst (list[str]): The command list to execute.
|
|
102
|
+
stop_function (Callable): Function to stop the interface.
|
|
103
|
+
|
|
104
|
+
Returns:
|
|
105
|
+
bool: Whether the interface was successfully started.
|
|
96
106
|
"""
|
|
97
107
|
if self._cwd is not None:
|
|
98
108
|
os.makedirs(self._cwd, exist_ok=True)
|
|
@@ -101,6 +111,7 @@ class SubprocessSpawner(BaseSpawner):
|
|
|
101
111
|
cwd=self._cwd,
|
|
102
112
|
stdin=subprocess.DEVNULL,
|
|
103
113
|
)
|
|
114
|
+
return self.poll()
|
|
104
115
|
|
|
105
116
|
def generate_command(self, command_lst: list[str]) -> list[str]:
|
|
106
117
|
"""
|
|
@@ -7,7 +7,7 @@ from typing import Any, Callable, Optional
|
|
|
7
7
|
from executorlib.standalone.command import get_cache_execute_command
|
|
8
8
|
from executorlib.standalone.hdf import get_cache_files, get_output
|
|
9
9
|
from executorlib.standalone.serialize import serialize_funct
|
|
10
|
-
from executorlib.task_scheduler.file.subprocess_spawner import terminate_subprocess
|
|
10
|
+
from executorlib.task_scheduler.file.spawner_subprocess import terminate_subprocess
|
|
11
11
|
|
|
12
12
|
|
|
13
13
|
class FutureItem:
|
|
@@ -11,14 +11,14 @@ from executorlib.standalone.inputcheck import (
|
|
|
11
11
|
)
|
|
12
12
|
from executorlib.task_scheduler.base import TaskSchedulerBase
|
|
13
13
|
from executorlib.task_scheduler.file.shared import execute_tasks_h5
|
|
14
|
-
from executorlib.task_scheduler.file.subprocess_spawner import (
|
|
14
|
+
from executorlib.task_scheduler.file.spawner_subprocess import (
|
|
15
15
|
execute_in_subprocess,
|
|
16
16
|
terminate_subprocess,
|
|
17
17
|
)
|
|
18
18
|
|
|
19
19
|
try:
|
|
20
20
|
from executorlib.standalone.scheduler import terminate_with_pysqa
|
|
21
|
-
from executorlib.task_scheduler.file.queue_spawner import execute_with_pysqa
|
|
21
|
+
from executorlib.task_scheduler.file.spawner_pysqa import execute_with_pysqa
|
|
22
22
|
except ImportError:
|
|
23
23
|
# If pysqa is not available fall back to executing tasks in a subprocess
|
|
24
24
|
execute_with_pysqa = execute_in_subprocess # type: ignore
|