executorlib 1.5.2__tar.gz → 1.5.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {executorlib-1.5.2 → executorlib-1.5.3}/PKG-INFO +1 -1
- {executorlib-1.5.2 → executorlib-1.5.3}/executorlib/_version.py +2 -2
- {executorlib-1.5.2 → executorlib-1.5.3}/executorlib/api.py +2 -0
- {executorlib-1.5.2 → executorlib-1.5.3}/executorlib/executor/flux.py +6 -1
- {executorlib-1.5.2 → executorlib-1.5.3}/executorlib/executor/single.py +169 -0
- {executorlib-1.5.2 → executorlib-1.5.3}/executorlib/executor/slurm.py +6 -1
- {executorlib-1.5.2 → executorlib-1.5.3}/executorlib/standalone/inputcheck.py +14 -0
- {executorlib-1.5.2 → executorlib-1.5.3}/executorlib/task_scheduler/file/hdf.py +1 -1
- {executorlib-1.5.2 → executorlib-1.5.3}/executorlib/task_scheduler/file/queue_spawner.py +36 -4
- {executorlib-1.5.2 → executorlib-1.5.3}/executorlib/task_scheduler/file/shared.py +25 -5
- {executorlib-1.5.2 → executorlib-1.5.3}/executorlib/task_scheduler/file/subprocess_spawner.py +11 -4
- {executorlib-1.5.2 → executorlib-1.5.3}/executorlib/task_scheduler/file/task_scheduler.py +14 -5
- {executorlib-1.5.2 → executorlib-1.5.3}/.gitignore +0 -0
- {executorlib-1.5.2 → executorlib-1.5.3}/LICENSE +0 -0
- {executorlib-1.5.2 → executorlib-1.5.3}/README.md +0 -0
- {executorlib-1.5.2 → executorlib-1.5.3}/executorlib/__init__.py +0 -0
- {executorlib-1.5.2 → executorlib-1.5.3}/executorlib/backend/__init__.py +0 -0
- {executorlib-1.5.2 → executorlib-1.5.3}/executorlib/backend/cache_parallel.py +0 -0
- {executorlib-1.5.2 → executorlib-1.5.3}/executorlib/backend/cache_serial.py +0 -0
- {executorlib-1.5.2 → executorlib-1.5.3}/executorlib/backend/interactive_parallel.py +0 -0
- {executorlib-1.5.2 → executorlib-1.5.3}/executorlib/backend/interactive_serial.py +0 -0
- {executorlib-1.5.2 → executorlib-1.5.3}/executorlib/executor/__init__.py +0 -0
- {executorlib-1.5.2 → executorlib-1.5.3}/executorlib/executor/base.py +0 -0
- {executorlib-1.5.2 → executorlib-1.5.3}/executorlib/standalone/__init__.py +0 -0
- {executorlib-1.5.2 → executorlib-1.5.3}/executorlib/standalone/cache.py +0 -0
- {executorlib-1.5.2 → executorlib-1.5.3}/executorlib/standalone/command.py +0 -0
- {executorlib-1.5.2 → executorlib-1.5.3}/executorlib/standalone/interactive/__init__.py +0 -0
- {executorlib-1.5.2 → executorlib-1.5.3}/executorlib/standalone/interactive/arguments.py +0 -0
- {executorlib-1.5.2 → executorlib-1.5.3}/executorlib/standalone/interactive/backend.py +0 -0
- {executorlib-1.5.2 → executorlib-1.5.3}/executorlib/standalone/interactive/communication.py +0 -0
- {executorlib-1.5.2 → executorlib-1.5.3}/executorlib/standalone/interactive/spawner.py +0 -0
- {executorlib-1.5.2 → executorlib-1.5.3}/executorlib/standalone/plot.py +0 -0
- {executorlib-1.5.2 → executorlib-1.5.3}/executorlib/standalone/queue.py +0 -0
- {executorlib-1.5.2 → executorlib-1.5.3}/executorlib/standalone/serialize.py +0 -0
- {executorlib-1.5.2 → executorlib-1.5.3}/executorlib/task_scheduler/__init__.py +0 -0
- {executorlib-1.5.2 → executorlib-1.5.3}/executorlib/task_scheduler/base.py +0 -0
- {executorlib-1.5.2 → executorlib-1.5.3}/executorlib/task_scheduler/file/__init__.py +0 -0
- {executorlib-1.5.2 → executorlib-1.5.3}/executorlib/task_scheduler/file/backend.py +0 -0
- {executorlib-1.5.2 → executorlib-1.5.3}/executorlib/task_scheduler/interactive/__init__.py +0 -0
- {executorlib-1.5.2 → executorlib-1.5.3}/executorlib/task_scheduler/interactive/blockallocation.py +0 -0
- {executorlib-1.5.2 → executorlib-1.5.3}/executorlib/task_scheduler/interactive/dependency.py +0 -0
- {executorlib-1.5.2 → executorlib-1.5.3}/executorlib/task_scheduler/interactive/fluxspawner.py +0 -0
- {executorlib-1.5.2 → executorlib-1.5.3}/executorlib/task_scheduler/interactive/onetoone.py +0 -0
- {executorlib-1.5.2 → executorlib-1.5.3}/executorlib/task_scheduler/interactive/shared.py +0 -0
- {executorlib-1.5.2 → executorlib-1.5.3}/executorlib/task_scheduler/interactive/slurmspawner.py +0 -0
- {executorlib-1.5.2 → executorlib-1.5.3}/pyproject.toml +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: executorlib
|
|
3
|
-
Version: 1.5.2
|
|
3
|
+
Version: 1.5.3
|
|
4
4
|
Summary: Up-scale python functions for high performance computing (HPC) with executorlib.
|
|
5
5
|
Project-URL: Homepage, https://github.com/pyiron/executorlib
|
|
6
6
|
Project-URL: Documentation, https://executorlib.readthedocs.io
|
|
@@ -5,6 +5,7 @@ only use the functionality in this API in combination with the user interface de
|
|
|
5
5
|
functionality is considered internal and might change during minor releases.
|
|
6
6
|
"""
|
|
7
7
|
|
|
8
|
+
from executorlib.executor.single import TestClusterExecutor
|
|
8
9
|
from executorlib.standalone.command import get_command_path
|
|
9
10
|
from executorlib.standalone.interactive.communication import (
|
|
10
11
|
SocketInterface,
|
|
@@ -19,6 +20,7 @@ from executorlib.standalone.queue import cancel_items_in_queue
|
|
|
19
20
|
from executorlib.standalone.serialize import cloudpickle_register
|
|
20
21
|
|
|
21
22
|
__all__: list[str] = [
|
|
23
|
+
"TestClusterExecutor",
|
|
22
24
|
"cancel_items_in_queue",
|
|
23
25
|
"cloudpickle_register",
|
|
24
26
|
"get_command_path",
|
|
@@ -4,6 +4,7 @@ from executorlib.executor.base import BaseExecutor
|
|
|
4
4
|
from executorlib.standalone.inputcheck import (
|
|
5
5
|
check_command_line_argument_lst,
|
|
6
6
|
check_init_function,
|
|
7
|
+
check_log_obj_size,
|
|
7
8
|
check_oversubscribe,
|
|
8
9
|
check_plot_dependency_graph,
|
|
9
10
|
check_pmi,
|
|
@@ -246,6 +247,7 @@ class FluxClusterExecutor(BaseExecutor):
|
|
|
246
247
|
plot_dependency_graph (bool): Plot the dependencies of multiple future objects without executing them. For
|
|
247
248
|
debugging purposes and to get an overview of the specified dependencies.
|
|
248
249
|
plot_dependency_graph_filename (str): Name of the file to store the plotted graph in.
|
|
250
|
+
log_obj_size (bool): Enable debug mode which reports the size of the communicated objects.
|
|
249
251
|
|
|
250
252
|
Examples:
|
|
251
253
|
```
|
|
@@ -282,6 +284,7 @@ class FluxClusterExecutor(BaseExecutor):
|
|
|
282
284
|
refresh_rate: float = 0.01,
|
|
283
285
|
plot_dependency_graph: bool = False,
|
|
284
286
|
plot_dependency_graph_filename: Optional[str] = None,
|
|
287
|
+
log_obj_size: bool = False,
|
|
285
288
|
):
|
|
286
289
|
"""
|
|
287
290
|
The executorlib.FluxClusterExecutor leverages either the message passing interface (MPI), the SLURM workload
|
|
@@ -323,6 +326,7 @@ class FluxClusterExecutor(BaseExecutor):
|
|
|
323
326
|
plot_dependency_graph (bool): Plot the dependencies of multiple future objects without executing them. For
|
|
324
327
|
debugging purposes and to get an overview of the specified dependencies.
|
|
325
328
|
plot_dependency_graph_filename (str): Name of the file to store the plotted graph in.
|
|
329
|
+
log_obj_size (bool): Enable debug mode which reports the size of the communicated objects.
|
|
326
330
|
|
|
327
331
|
"""
|
|
328
332
|
default_resource_dict: dict = {
|
|
@@ -338,6 +342,7 @@ class FluxClusterExecutor(BaseExecutor):
|
|
|
338
342
|
resource_dict.update(
|
|
339
343
|
{k: v for k, v in default_resource_dict.items() if k not in resource_dict}
|
|
340
344
|
)
|
|
345
|
+
check_log_obj_size(log_obj_size=log_obj_size)
|
|
341
346
|
if not plot_dependency_graph:
|
|
342
347
|
import pysqa # noqa
|
|
343
348
|
|
|
@@ -348,7 +353,7 @@ class FluxClusterExecutor(BaseExecutor):
|
|
|
348
353
|
super().__init__(
|
|
349
354
|
executor=create_file_executor(
|
|
350
355
|
max_workers=max_workers,
|
|
351
|
-
backend="
|
|
356
|
+
backend="flux",
|
|
352
357
|
max_cores=max_cores,
|
|
353
358
|
cache_directory=cache_directory,
|
|
354
359
|
resource_dict=resource_dict,
|
|
@@ -56,6 +56,7 @@ class SingleNodeExecutor(BaseExecutor):
|
|
|
56
56
|
plot_dependency_graph (bool): Plot the dependencies of multiple future objects without executing them. For
|
|
57
57
|
debugging purposes and to get an overview of the specified dependencies.
|
|
58
58
|
plot_dependency_graph_filename (str): Name of the file to store the plotted graph in.
|
|
59
|
+
log_obj_size (bool): Enable debug mode which reports the size of the communicated objects.
|
|
59
60
|
|
|
60
61
|
Examples:
|
|
61
62
|
```
|
|
@@ -184,6 +185,174 @@ class SingleNodeExecutor(BaseExecutor):
|
|
|
184
185
|
)
|
|
185
186
|
|
|
186
187
|
|
|
188
|
+
class TestClusterExecutor(BaseExecutor):
|
|
189
|
+
"""
|
|
190
|
+
The executorlib.api.TestClusterExecutor is designed to test the file based communication used in the
|
|
191
|
+
SlurmClusterExecutor and the FluxClusterExecutor locally. It is not recommended for production use, rather use the
|
|
192
|
+
SingleNodeExecutor.
|
|
193
|
+
|
|
194
|
+
Args:
|
|
195
|
+
max_workers (int): for backwards compatibility with the standard library, max_workers also defines the number of
|
|
196
|
+
cores which can be used in parallel - just like the max_cores parameter. Using max_cores is
|
|
197
|
+
recommended, as computers have a limited number of compute cores.
|
|
198
|
+
cache_directory (str, optional): The directory to store cache files. Defaults to "executorlib_cache".
|
|
199
|
+
max_cores (int): defines the number of cores which can be used in parallel
|
|
200
|
+
resource_dict (dict): A dictionary of resources required by the task. With the following keys:
|
|
201
|
+
- cores (int): number of MPI cores to be used for each function call
|
|
202
|
+
- threads_per_core (int): number of OpenMP threads to be used for each function call
|
|
203
|
+
- gpus_per_core (int): number of GPUs per worker - defaults to 0
|
|
204
|
+
- cwd (str/None): current working directory where the parallel python task is executed
|
|
205
|
+
hostname_localhost (boolean): use localhost instead of the hostname to establish the zmq connection. In the
|
|
206
|
+
context of an HPC cluster this is essential to be able to communicate to an
|
|
207
|
+
Executor running on a different compute node within the same allocation. And
|
|
208
|
+
in principle any computer should be able to resolve that their own hostname
|
|
209
|
+
points to the same address as localhost. Still MacOS >= 12 seems to disable
|
|
210
|
+
this look up for security reasons. So on MacOS it is required to set this
|
|
211
|
+
option to true
|
|
212
|
+
block_allocation (boolean): To accelerate the submission of a series of python functions with the same resource
|
|
213
|
+
requirements, executorlib supports block allocation. In this case all resources have
|
|
214
|
+
to be defined on the executor, rather than during the submission of the individual
|
|
215
|
+
function.
|
|
216
|
+
init_function (None): optional function to preset arguments for functions which are submitted later
|
|
217
|
+
disable_dependencies (boolean): Disable resolving future objects during the submission.
|
|
218
|
+
refresh_rate (float): Set the refresh rate in seconds, how frequently the input queue is checked.
|
|
219
|
+
plot_dependency_graph (bool): Plot the dependencies of multiple future objects without executing them. For
|
|
220
|
+
debugging purposes and to get an overview of the specified dependencies.
|
|
221
|
+
plot_dependency_graph_filename (str): Name of the file to store the plotted graph in.
|
|
222
|
+
log_obj_size (bool): Enable debug mode which reports the size of the communicated objects.
|
|
223
|
+
|
|
224
|
+
Examples:
|
|
225
|
+
```
|
|
226
|
+
>>> import numpy as np
|
|
227
|
+
>>> from executorlib.api import TestClusterExecutor
|
|
228
|
+
>>>
|
|
229
|
+
>>> def calc(i, j, k):
|
|
230
|
+
>>> from mpi4py import MPI
|
|
231
|
+
>>> size = MPI.COMM_WORLD.Get_size()
|
|
232
|
+
>>> rank = MPI.COMM_WORLD.Get_rank()
|
|
233
|
+
>>> return np.array([i, j, k]), size, rank
|
|
234
|
+
>>>
|
|
235
|
+
>>> def init_k():
|
|
236
|
+
>>> return {"k": 3}
|
|
237
|
+
>>>
|
|
238
|
+
>>> with TestClusterExecutor(max_workers=2, init_function=init_k) as p:
|
|
239
|
+
>>> fs = p.submit(calc, 2, j=4)
|
|
240
|
+
>>> print(fs.result())
|
|
241
|
+
[(array([2, 4, 3]), 2, 0), (array([2, 4, 3]), 2, 1)]
|
|
242
|
+
```
|
|
243
|
+
"""
|
|
244
|
+
|
|
245
|
+
def __init__(
|
|
246
|
+
self,
|
|
247
|
+
max_workers: Optional[int] = None,
|
|
248
|
+
cache_directory: Optional[str] = None,
|
|
249
|
+
max_cores: Optional[int] = None,
|
|
250
|
+
resource_dict: Optional[dict] = None,
|
|
251
|
+
hostname_localhost: Optional[bool] = None,
|
|
252
|
+
block_allocation: bool = False,
|
|
253
|
+
init_function: Optional[Callable] = None,
|
|
254
|
+
disable_dependencies: bool = False,
|
|
255
|
+
refresh_rate: float = 0.01,
|
|
256
|
+
plot_dependency_graph: bool = False,
|
|
257
|
+
plot_dependency_graph_filename: Optional[str] = None,
|
|
258
|
+
log_obj_size: bool = False,
|
|
259
|
+
):
|
|
260
|
+
"""
|
|
261
|
+
The executorlib.api.TestClusterExecutor is designed to test the file based communication used in the
|
|
262
|
+
SlurmClusterExecutor and the FluxClusterExecutor locally. It is not recommended for production use, rather use
|
|
263
|
+
the SingleNodeExecutor.
|
|
264
|
+
|
|
265
|
+
Args:
|
|
266
|
+
max_workers (int): for backwards compatibility with the standard library, max_workers also defines the
|
|
267
|
+
number of cores which can be used in parallel - just like the max_cores parameter. Using
|
|
268
|
+
max_cores is recommended, as computers have a limited number of compute cores.
|
|
269
|
+
cache_directory (str, optional): The directory to store cache files. Defaults to "executorlib_cache".
|
|
270
|
+
max_cores (int): defines the number of cores which can be used in parallel
|
|
271
|
+
resource_dict (dict): A dictionary of resources required by the task. With the following keys:
|
|
272
|
+
- cores (int): number of MPI cores to be used for each function call
|
|
273
|
+
- threads_per_core (int): number of OpenMP threads to be used for each function call
|
|
274
|
+
- gpus_per_core (int): number of GPUs per worker - defaults to 0
|
|
275
|
+
- cwd (str/None): current working directory where the parallel python task is executed
|
|
276
|
+
hostname_localhost (boolean): use localhost instead of the hostname to establish the zmq connection. In the
|
|
277
|
+
context of an HPC cluster this is essential to be able to communicate to an
|
|
278
|
+
Executor running on a different compute node within the same allocation. And
|
|
279
|
+
in principle any computer should be able to resolve that their own hostname
|
|
280
|
+
points to the same address as localhost. Still MacOS >= 12 seems to disable
|
|
281
|
+
this look up for security reasons. So on MacOS it is required to set this
|
|
282
|
+
option to true
|
|
283
|
+
block_allocation (boolean): To accelerate the submission of a series of python functions with the same
|
|
284
|
+
resource requirements, executorlib supports block allocation. In this case all
|
|
285
|
+
resources have to be defined on the executor, rather than during the submission
|
|
286
|
+
of the individual function.
|
|
287
|
+
init_function (None): optional function to preset arguments for functions which are submitted later
|
|
288
|
+
disable_dependencies (boolean): Disable resolving future objects during the submission.
|
|
289
|
+
refresh_rate (float): Set the refresh rate in seconds, how frequently the input queue is checked.
|
|
290
|
+
plot_dependency_graph (bool): Plot the dependencies of multiple future objects without executing them. For
|
|
291
|
+
debugging purposes and to get an overview of the specified dependencies.
|
|
292
|
+
plot_dependency_graph_filename (str): Name of the file to store the plotted graph in.
|
|
293
|
+
log_obj_size (bool): Enable debug mode which reports the size of the communicated objects.
|
|
294
|
+
|
|
295
|
+
"""
|
|
296
|
+
default_resource_dict: dict = {
|
|
297
|
+
"cores": 1,
|
|
298
|
+
"threads_per_core": 1,
|
|
299
|
+
"gpus_per_core": 0,
|
|
300
|
+
"cwd": None,
|
|
301
|
+
"openmpi_oversubscribe": False,
|
|
302
|
+
}
|
|
303
|
+
if resource_dict is None:
|
|
304
|
+
resource_dict = {}
|
|
305
|
+
resource_dict.update(
|
|
306
|
+
{k: v for k, v in default_resource_dict.items() if k not in resource_dict}
|
|
307
|
+
)
|
|
308
|
+
if not plot_dependency_graph:
|
|
309
|
+
from executorlib.task_scheduler.file.subprocess_spawner import (
|
|
310
|
+
execute_in_subprocess,
|
|
311
|
+
)
|
|
312
|
+
from executorlib.task_scheduler.file.task_scheduler import (
|
|
313
|
+
create_file_executor,
|
|
314
|
+
)
|
|
315
|
+
|
|
316
|
+
super().__init__(
|
|
317
|
+
executor=create_file_executor(
|
|
318
|
+
max_workers=max_workers,
|
|
319
|
+
backend=None,
|
|
320
|
+
max_cores=max_cores,
|
|
321
|
+
cache_directory=cache_directory,
|
|
322
|
+
resource_dict=resource_dict,
|
|
323
|
+
flux_executor=None,
|
|
324
|
+
flux_executor_pmi_mode=None,
|
|
325
|
+
flux_executor_nesting=False,
|
|
326
|
+
flux_log_files=False,
|
|
327
|
+
pysqa_config_directory=None,
|
|
328
|
+
hostname_localhost=hostname_localhost,
|
|
329
|
+
block_allocation=block_allocation,
|
|
330
|
+
init_function=init_function,
|
|
331
|
+
disable_dependencies=disable_dependencies,
|
|
332
|
+
execute_function=execute_in_subprocess,
|
|
333
|
+
)
|
|
334
|
+
)
|
|
335
|
+
else:
|
|
336
|
+
super().__init__(
|
|
337
|
+
executor=DependencyTaskScheduler(
|
|
338
|
+
executor=create_single_node_executor(
|
|
339
|
+
max_workers=max_workers,
|
|
340
|
+
cache_directory=cache_directory,
|
|
341
|
+
max_cores=max_cores,
|
|
342
|
+
resource_dict=resource_dict,
|
|
343
|
+
hostname_localhost=hostname_localhost,
|
|
344
|
+
block_allocation=block_allocation,
|
|
345
|
+
init_function=init_function,
|
|
346
|
+
log_obj_size=log_obj_size,
|
|
347
|
+
),
|
|
348
|
+
max_cores=max_cores,
|
|
349
|
+
refresh_rate=refresh_rate,
|
|
350
|
+
plot_dependency_graph=plot_dependency_graph,
|
|
351
|
+
plot_dependency_graph_filename=plot_dependency_graph_filename,
|
|
352
|
+
)
|
|
353
|
+
)
|
|
354
|
+
|
|
355
|
+
|
|
187
356
|
def create_single_node_executor(
|
|
188
357
|
max_workers: Optional[int] = None,
|
|
189
358
|
max_cores: Optional[int] = None,
|
|
@@ -3,6 +3,7 @@ from typing import Callable, Optional, Union
|
|
|
3
3
|
from executorlib.executor.base import BaseExecutor
|
|
4
4
|
from executorlib.standalone.inputcheck import (
|
|
5
5
|
check_init_function,
|
|
6
|
+
check_log_obj_size,
|
|
6
7
|
check_plot_dependency_graph,
|
|
7
8
|
check_refresh_rate,
|
|
8
9
|
validate_number_of_cores,
|
|
@@ -58,6 +59,7 @@ class SlurmClusterExecutor(BaseExecutor):
|
|
|
58
59
|
plot_dependency_graph (bool): Plot the dependencies of multiple future objects without executing them. For
|
|
59
60
|
debugging purposes and to get an overview of the specified dependencies.
|
|
60
61
|
plot_dependency_graph_filename (str): Name of the file to store the plotted graph in.
|
|
62
|
+
log_obj_size (bool): Enable debug mode which reports the size of the communicated objects.
|
|
61
63
|
|
|
62
64
|
Examples:
|
|
63
65
|
```
|
|
@@ -94,6 +96,7 @@ class SlurmClusterExecutor(BaseExecutor):
|
|
|
94
96
|
refresh_rate: float = 0.01,
|
|
95
97
|
plot_dependency_graph: bool = False,
|
|
96
98
|
plot_dependency_graph_filename: Optional[str] = None,
|
|
99
|
+
log_obj_size: bool = False,
|
|
97
100
|
):
|
|
98
101
|
"""
|
|
99
102
|
The executorlib.SlurmClusterExecutor leverages either the message passing interface (MPI), the SLURM workload
|
|
@@ -135,6 +138,7 @@ class SlurmClusterExecutor(BaseExecutor):
|
|
|
135
138
|
plot_dependency_graph (bool): Plot the dependencies of multiple future objects without executing them. For
|
|
136
139
|
debugging purposes and to get an overview of the specified dependencies.
|
|
137
140
|
plot_dependency_graph_filename (str): Name of the file to store the plotted graph in.
|
|
141
|
+
log_obj_size (bool): Enable debug mode which reports the size of the communicated objects.
|
|
138
142
|
|
|
139
143
|
"""
|
|
140
144
|
default_resource_dict: dict = {
|
|
@@ -150,6 +154,7 @@ class SlurmClusterExecutor(BaseExecutor):
|
|
|
150
154
|
resource_dict.update(
|
|
151
155
|
{k: v for k, v in default_resource_dict.items() if k not in resource_dict}
|
|
152
156
|
)
|
|
157
|
+
check_log_obj_size(log_obj_size=log_obj_size)
|
|
153
158
|
if not plot_dependency_graph:
|
|
154
159
|
import pysqa # noqa
|
|
155
160
|
|
|
@@ -160,7 +165,7 @@ class SlurmClusterExecutor(BaseExecutor):
|
|
|
160
165
|
super().__init__(
|
|
161
166
|
executor=create_file_executor(
|
|
162
167
|
max_workers=max_workers,
|
|
163
|
-
backend="
|
|
168
|
+
backend="slurm",
|
|
164
169
|
max_cores=max_cores,
|
|
165
170
|
cache_directory=cache_directory,
|
|
166
171
|
resource_dict=resource_dict,
|
|
@@ -194,7 +194,21 @@ def validate_number_of_cores(
|
|
|
194
194
|
|
|
195
195
|
|
|
196
196
|
def check_file_exists(file_name: Optional[str]):
|
|
197
|
+
"""
|
|
198
|
+
Check if file exists and raise a ValueError if it does not or file_name is None.
|
|
199
|
+
"""
|
|
197
200
|
if file_name is None:
|
|
198
201
|
raise ValueError("file_name is not set.")
|
|
199
202
|
if not os.path.exists(file_name):
|
|
200
203
|
raise ValueError("file_name is not written to the file system.")
|
|
204
|
+
|
|
205
|
+
|
|
206
|
+
def check_log_obj_size(log_obj_size: bool) -> None:
|
|
207
|
+
"""
|
|
208
|
+
Check if log_obj_size is True and raise a ValueError if it is.
|
|
209
|
+
"""
|
|
210
|
+
if log_obj_size:
|
|
211
|
+
raise ValueError(
|
|
212
|
+
"log_obj_size is not supported for the executorlib.SlurmClusterExecutor and executorlib.FluxClusterExecutor."
|
|
213
|
+
"Please use log_obj_size=False instead of log_obj_size=True."
|
|
214
|
+
)
|
|
@@ -101,7 +101,7 @@ def get_queue_id(file_name: Optional[str]) -> Optional[int]:
|
|
|
101
101
|
Returns:
|
|
102
102
|
int: queuing system id from the execution of the python function
|
|
103
103
|
"""
|
|
104
|
-
if file_name is not None:
|
|
104
|
+
if file_name is not None and os.path.exists(file_name):
|
|
105
105
|
with h5py.File(file_name, "r") as hdf:
|
|
106
106
|
if "queue_id" in hdf:
|
|
107
107
|
return cloudpickle.loads(np.void(hdf["/queue_id"]))
|
|
@@ -10,9 +10,10 @@ from executorlib.task_scheduler.file.hdf import dump, get_queue_id
|
|
|
10
10
|
|
|
11
11
|
def execute_with_pysqa(
|
|
12
12
|
command: list,
|
|
13
|
+
file_name: str,
|
|
14
|
+
data_dict: dict,
|
|
13
15
|
cache_directory: str,
|
|
14
16
|
task_dependent_lst: Optional[list[int]] = None,
|
|
15
|
-
file_name: Optional[str] = None,
|
|
16
17
|
resource_dict: Optional[dict] = None,
|
|
17
18
|
config_directory: Optional[str] = None,
|
|
18
19
|
backend: Optional[str] = None,
|
|
@@ -22,9 +23,10 @@ def execute_with_pysqa(
|
|
|
22
23
|
|
|
23
24
|
Args:
|
|
24
25
|
command (list): The command to be executed.
|
|
26
|
+
file_name (str): Name of the HDF5 file which contains the Python function
|
|
27
|
+
data_dict (dict): dictionary containing the python function to be executed {"fn": ..., "args": (), "kwargs": {}}
|
|
25
28
|
cache_directory (str): The directory to store the HDF5 files.
|
|
26
29
|
task_dependent_lst (list): A list of subprocesses that the current subprocess depends on. Defaults to [].
|
|
27
|
-
file_name (str): Name of the HDF5 file which contains the Python function
|
|
28
30
|
resource_dict (dict): resource dictionary, which defines the resources used for the execution of the function.
|
|
29
31
|
Example resource dictionary: {
|
|
30
32
|
cwd: None,
|
|
@@ -37,13 +39,20 @@ def execute_with_pysqa(
|
|
|
37
39
|
"""
|
|
38
40
|
if task_dependent_lst is None:
|
|
39
41
|
task_dependent_lst = []
|
|
40
|
-
check_file_exists(file_name=file_name)
|
|
41
|
-
queue_id = get_queue_id(file_name=file_name)
|
|
42
42
|
qa = QueueAdapter(
|
|
43
43
|
directory=config_directory,
|
|
44
44
|
queue_type=backend,
|
|
45
45
|
execute_command=_pysqa_execute_command,
|
|
46
46
|
)
|
|
47
|
+
queue_id = get_queue_id(file_name=file_name)
|
|
48
|
+
if os.path.exists(file_name) and (
|
|
49
|
+
queue_id is None or qa.get_status_of_job(process_id=queue_id) is None
|
|
50
|
+
):
|
|
51
|
+
os.remove(file_name)
|
|
52
|
+
dump(file_name=file_name, data_dict=data_dict)
|
|
53
|
+
elif not os.path.exists(file_name):
|
|
54
|
+
dump(file_name=file_name, data_dict=data_dict)
|
|
55
|
+
check_file_exists(file_name=file_name)
|
|
47
56
|
if queue_id is None or qa.get_status_of_job(process_id=queue_id) is None:
|
|
48
57
|
if resource_dict is None:
|
|
49
58
|
resource_dict = {}
|
|
@@ -81,6 +90,29 @@ def execute_with_pysqa(
|
|
|
81
90
|
return queue_id
|
|
82
91
|
|
|
83
92
|
|
|
93
|
+
def terminate_with_pysqa(
|
|
94
|
+
queue_id: int,
|
|
95
|
+
config_directory: Optional[str] = None,
|
|
96
|
+
backend: Optional[str] = None,
|
|
97
|
+
):
|
|
98
|
+
"""
|
|
99
|
+
Delete job from queuing system
|
|
100
|
+
|
|
101
|
+
Args:
|
|
102
|
+
queue_id (int): Queuing system ID of the job to delete.
|
|
103
|
+
config_directory (str, optional): path to the config directory.
|
|
104
|
+
backend (str, optional): name of the backend used to spawn tasks.
|
|
105
|
+
"""
|
|
106
|
+
qa = QueueAdapter(
|
|
107
|
+
directory=config_directory,
|
|
108
|
+
queue_type=backend,
|
|
109
|
+
execute_command=_pysqa_execute_command,
|
|
110
|
+
)
|
|
111
|
+
status = qa.get_status_of_job(process_id=queue_id)
|
|
112
|
+
if status is not None and status not in ["finished", "error"]:
|
|
113
|
+
qa.delete_job(process_id=queue_id)
|
|
114
|
+
|
|
115
|
+
|
|
84
116
|
def _pysqa_execute_command(
|
|
85
117
|
commands: str,
|
|
86
118
|
working_directory: Optional[str] = None,
|
|
@@ -9,7 +9,8 @@ from typing import Any, Callable, Optional
|
|
|
9
9
|
from executorlib.standalone.cache import get_cache_files
|
|
10
10
|
from executorlib.standalone.command import get_command_path
|
|
11
11
|
from executorlib.standalone.serialize import serialize_funct_h5
|
|
12
|
-
from executorlib.task_scheduler.file.hdf import dump, get_output
|
|
12
|
+
from executorlib.task_scheduler.file.hdf import get_output
|
|
13
|
+
from executorlib.task_scheduler.file.subprocess_spawner import terminate_subprocess
|
|
13
14
|
|
|
14
15
|
|
|
15
16
|
class FutureItem:
|
|
@@ -86,9 +87,30 @@ def execute_tasks_h5(
|
|
|
86
87
|
with contextlib.suppress(queue.Empty):
|
|
87
88
|
task_dict = future_queue.get_nowait()
|
|
88
89
|
if task_dict is not None and "shutdown" in task_dict and task_dict["shutdown"]:
|
|
89
|
-
if
|
|
90
|
+
if task_dict["wait"]:
|
|
91
|
+
while len(memory_dict) > 0:
|
|
92
|
+
memory_dict = {
|
|
93
|
+
key: _check_task_output(
|
|
94
|
+
task_key=key,
|
|
95
|
+
future_obj=value,
|
|
96
|
+
cache_directory=cache_dir_dict[key],
|
|
97
|
+
)
|
|
98
|
+
for key, value in memory_dict.items()
|
|
99
|
+
if not value.done()
|
|
100
|
+
}
|
|
101
|
+
if (
|
|
102
|
+
terminate_function is not None
|
|
103
|
+
and terminate_function == terminate_subprocess
|
|
104
|
+
):
|
|
90
105
|
for task in process_dict.values():
|
|
91
106
|
terminate_function(task=task)
|
|
107
|
+
elif terminate_function is not None:
|
|
108
|
+
for queue_id in process_dict.values():
|
|
109
|
+
terminate_function(
|
|
110
|
+
queue_id=queue_id,
|
|
111
|
+
config_directory=pysqa_config_directory,
|
|
112
|
+
backend=backend,
|
|
113
|
+
)
|
|
92
114
|
future_queue.task_done()
|
|
93
115
|
future_queue.join()
|
|
94
116
|
break
|
|
@@ -116,9 +138,6 @@ def execute_tasks_h5(
|
|
|
116
138
|
cache_directory, task_key + "_o.h5"
|
|
117
139
|
) not in get_cache_files(cache_directory=cache_directory):
|
|
118
140
|
file_name = os.path.join(cache_directory, task_key + "_i.h5")
|
|
119
|
-
if os.path.exists(file_name):
|
|
120
|
-
os.remove(file_name)
|
|
121
|
-
dump(file_name=file_name, data_dict=data_dict)
|
|
122
141
|
if not disable_dependencies:
|
|
123
142
|
task_dependent_lst = [
|
|
124
143
|
process_dict[k] for k in future_wait_key_lst
|
|
@@ -137,6 +156,7 @@ def execute_tasks_h5(
|
|
|
137
156
|
cores=task_resource_dict["cores"],
|
|
138
157
|
),
|
|
139
158
|
file_name=file_name,
|
|
159
|
+
data_dict=data_dict,
|
|
140
160
|
task_dependent_lst=task_dependent_lst,
|
|
141
161
|
resource_dict=task_resource_dict,
|
|
142
162
|
config_directory=pysqa_config_directory,
|
{executorlib-1.5.2 → executorlib-1.5.3}/executorlib/task_scheduler/file/subprocess_spawner.py
RENAMED
|
@@ -1,33 +1,37 @@
|
|
|
1
|
+
import os
|
|
1
2
|
import subprocess
|
|
2
3
|
import time
|
|
3
4
|
from typing import Optional
|
|
4
5
|
|
|
5
6
|
from executorlib.standalone.inputcheck import check_file_exists
|
|
7
|
+
from executorlib.task_scheduler.file.hdf import dump
|
|
6
8
|
|
|
7
9
|
|
|
8
10
|
def execute_in_subprocess(
|
|
9
11
|
command: list,
|
|
12
|
+
file_name: str,
|
|
13
|
+
data_dict: dict,
|
|
14
|
+
cache_directory: Optional[str] = None,
|
|
10
15
|
task_dependent_lst: Optional[list] = None,
|
|
11
|
-
file_name: Optional[str] = None,
|
|
12
16
|
resource_dict: Optional[dict] = None,
|
|
13
17
|
config_directory: Optional[str] = None,
|
|
14
18
|
backend: Optional[str] = None,
|
|
15
|
-
cache_directory: Optional[str] = None,
|
|
16
19
|
) -> subprocess.Popen:
|
|
17
20
|
"""
|
|
18
21
|
Execute a command in a subprocess.
|
|
19
22
|
|
|
20
23
|
Args:
|
|
21
24
|
command (list): The command to be executed.
|
|
22
|
-
task_dependent_lst (list): A list of subprocesses that the current subprocess depends on. Defaults to [].
|
|
23
25
|
file_name (str): Name of the HDF5 file which contains the Python function
|
|
26
|
+
data_dict (dict): dictionary containing the python function to be executed {"fn": ..., "args": (), "kwargs": {}}
|
|
27
|
+
cache_directory (str): The directory to store the HDF5 files.
|
|
28
|
+
task_dependent_lst (list): A list of subprocesses that the current subprocess depends on. Defaults to [].
|
|
24
29
|
resource_dict (dict): resource dictionary, which defines the resources used for the execution of the function.
|
|
25
30
|
Example resource dictionary: {
|
|
26
31
|
cwd: None,
|
|
27
32
|
}
|
|
28
33
|
config_directory (str, optional): path to the config directory.
|
|
29
34
|
backend (str, optional): name of the backend used to spawn tasks.
|
|
30
|
-
cache_directory (str): The directory to store the HDF5 files.
|
|
31
35
|
|
|
32
36
|
Returns:
|
|
33
37
|
subprocess.Popen: The subprocess object.
|
|
@@ -35,6 +39,9 @@ def execute_in_subprocess(
|
|
|
35
39
|
"""
|
|
36
40
|
if task_dependent_lst is None:
|
|
37
41
|
task_dependent_lst = []
|
|
42
|
+
if os.path.exists(file_name):
|
|
43
|
+
os.remove(file_name)
|
|
44
|
+
dump(file_name=file_name, data_dict=data_dict)
|
|
38
45
|
check_file_exists(file_name=file_name)
|
|
39
46
|
while len(task_dependent_lst) > 0:
|
|
40
47
|
task_dependent_lst = [
|
|
@@ -17,10 +17,14 @@ from executorlib.task_scheduler.file.subprocess_spawner import (
|
|
|
17
17
|
)
|
|
18
18
|
|
|
19
19
|
try:
|
|
20
|
-
from executorlib.task_scheduler.file.queue_spawner import execute_with_pysqa
|
|
20
|
+
from executorlib.task_scheduler.file.queue_spawner import (
|
|
21
|
+
execute_with_pysqa,
|
|
22
|
+
terminate_with_pysqa,
|
|
23
|
+
)
|
|
21
24
|
except ImportError:
|
|
22
25
|
# If pysqa is not available fall back to executing tasks in a subprocess
|
|
23
26
|
execute_with_pysqa = execute_in_subprocess # type: ignore
|
|
27
|
+
terminate_with_pysqa = None # type: ignore
|
|
24
28
|
|
|
25
29
|
|
|
26
30
|
class FileTaskScheduler(TaskSchedulerBase):
|
|
@@ -58,8 +62,6 @@ class FileTaskScheduler(TaskSchedulerBase):
|
|
|
58
62
|
resource_dict.update(
|
|
59
63
|
{k: v for k, v in default_resource_dict.items() if k not in resource_dict}
|
|
60
64
|
)
|
|
61
|
-
if execute_function == execute_in_subprocess and terminate_function is None:
|
|
62
|
-
terminate_function = terminate_subprocess
|
|
63
65
|
self._process_kwargs = {
|
|
64
66
|
"resource_dict": resource_dict,
|
|
65
67
|
"future_queue": self._future_queue,
|
|
@@ -80,7 +82,7 @@ class FileTaskScheduler(TaskSchedulerBase):
|
|
|
80
82
|
def create_file_executor(
|
|
81
83
|
resource_dict: dict,
|
|
82
84
|
max_workers: Optional[int] = None,
|
|
83
|
-
backend: str =
|
|
85
|
+
backend: Optional[str] = None,
|
|
84
86
|
max_cores: Optional[int] = None,
|
|
85
87
|
cache_directory: Optional[str] = None,
|
|
86
88
|
flux_executor=None,
|
|
@@ -92,6 +94,7 @@ def create_file_executor(
|
|
|
92
94
|
block_allocation: bool = False,
|
|
93
95
|
init_function: Optional[Callable] = None,
|
|
94
96
|
disable_dependencies: bool = False,
|
|
97
|
+
execute_function: Callable = execute_with_pysqa,
|
|
95
98
|
):
|
|
96
99
|
if block_allocation:
|
|
97
100
|
raise ValueError(
|
|
@@ -109,9 +112,15 @@ def create_file_executor(
|
|
|
109
112
|
check_executor(executor=flux_executor)
|
|
110
113
|
check_nested_flux_executor(nested_flux_executor=flux_executor_nesting)
|
|
111
114
|
check_flux_log_files(flux_log_files=flux_log_files)
|
|
115
|
+
if execute_function != execute_in_subprocess:
|
|
116
|
+
terminate_function = terminate_with_pysqa # type: ignore
|
|
117
|
+
else:
|
|
118
|
+
terminate_function = terminate_subprocess # type: ignore
|
|
112
119
|
return FileTaskScheduler(
|
|
113
120
|
resource_dict=resource_dict,
|
|
114
121
|
pysqa_config_directory=pysqa_config_directory,
|
|
115
|
-
backend=backend
|
|
122
|
+
backend=backend,
|
|
116
123
|
disable_dependencies=disable_dependencies,
|
|
124
|
+
execute_function=execute_function,
|
|
125
|
+
terminate_function=terminate_function,
|
|
117
126
|
)
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{executorlib-1.5.2 → executorlib-1.5.3}/executorlib/task_scheduler/interactive/blockallocation.py
RENAMED
|
File without changes
|
{executorlib-1.5.2 → executorlib-1.5.3}/executorlib/task_scheduler/interactive/dependency.py
RENAMED
|
File without changes
|
{executorlib-1.5.2 → executorlib-1.5.3}/executorlib/task_scheduler/interactive/fluxspawner.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{executorlib-1.5.2 → executorlib-1.5.3}/executorlib/task_scheduler/interactive/slurmspawner.py
RENAMED
|
File without changes
|
|
File without changes
|