executorlib 0.0.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- executorlib/__init__.py +248 -0
- executorlib/_version.py +716 -0
- executorlib/backend/__init__.py +0 -0
- executorlib/backend/cache_parallel.py +57 -0
- executorlib/backend/cache_serial.py +6 -0
- executorlib/backend/interactive_parallel.py +99 -0
- executorlib/backend/interactive_serial.py +74 -0
- executorlib/base/__init__.py +0 -0
- executorlib/base/executor.py +167 -0
- executorlib/cache/__init__.py +0 -0
- executorlib/cache/backend.py +75 -0
- executorlib/cache/executor.py +121 -0
- executorlib/cache/queue_spawner.py +109 -0
- executorlib/cache/shared.py +249 -0
- executorlib/cache/subprocess_spawner.py +65 -0
- executorlib/interactive/__init__.py +0 -0
- executorlib/interactive/executor.py +329 -0
- executorlib/interactive/flux.py +135 -0
- executorlib/interactive/shared.py +657 -0
- executorlib/interactive/slurm.py +109 -0
- executorlib/standalone/__init__.py +21 -0
- executorlib/standalone/command.py +14 -0
- executorlib/standalone/hdf.py +116 -0
- executorlib/standalone/inputcheck.py +201 -0
- executorlib/standalone/interactive/__init__.py +0 -0
- executorlib/standalone/interactive/backend.py +98 -0
- executorlib/standalone/interactive/communication.py +213 -0
- executorlib/standalone/interactive/spawner.py +174 -0
- executorlib/standalone/plot.py +134 -0
- executorlib/standalone/queue.py +19 -0
- executorlib/standalone/serialize.py +82 -0
- executorlib/standalone/thread.py +42 -0
- executorlib-0.0.8.dist-info/LICENSE +29 -0
- executorlib-0.0.8.dist-info/METADATA +230 -0
- executorlib-0.0.8.dist-info/RECORD +37 -0
- executorlib-0.0.8.dist-info/WHEEL +5 -0
- executorlib-0.0.8.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,329 @@
|
|
|
1
|
+
from concurrent.futures import Future
|
|
2
|
+
from typing import Any, Callable, Dict, Optional
|
|
3
|
+
|
|
4
|
+
from executorlib.base.executor import ExecutorBase
|
|
5
|
+
from executorlib.interactive.shared import (
|
|
6
|
+
InteractiveExecutor,
|
|
7
|
+
InteractiveStepExecutor,
|
|
8
|
+
execute_tasks_with_dependencies,
|
|
9
|
+
)
|
|
10
|
+
from executorlib.interactive.slurm import SrunSpawner
|
|
11
|
+
from executorlib.interactive.slurm import (
|
|
12
|
+
validate_max_workers as validate_max_workers_slurm,
|
|
13
|
+
)
|
|
14
|
+
from executorlib.standalone.inputcheck import (
|
|
15
|
+
check_command_line_argument_lst,
|
|
16
|
+
check_executor,
|
|
17
|
+
check_flux_log_files,
|
|
18
|
+
check_gpus_per_worker,
|
|
19
|
+
check_init_function,
|
|
20
|
+
check_nested_flux_executor,
|
|
21
|
+
check_oversubscribe,
|
|
22
|
+
check_pmi,
|
|
23
|
+
validate_number_of_cores,
|
|
24
|
+
)
|
|
25
|
+
from executorlib.standalone.interactive.spawner import MpiExecSpawner
|
|
26
|
+
from executorlib.standalone.plot import (
|
|
27
|
+
draw,
|
|
28
|
+
generate_nodes_and_edges,
|
|
29
|
+
generate_task_hash,
|
|
30
|
+
)
|
|
31
|
+
from executorlib.standalone.thread import RaisingThread
|
|
32
|
+
|
|
33
|
+
try: # The PyFluxExecutor requires flux-base to be installed.
|
|
34
|
+
from executorlib.interactive.flux import FluxPythonSpawner
|
|
35
|
+
from executorlib.interactive.flux import (
|
|
36
|
+
validate_max_workers as validate_max_workers_flux,
|
|
37
|
+
)
|
|
38
|
+
except ImportError:
|
|
39
|
+
pass
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
class ExecutorWithDependencies(ExecutorBase):
    """
    ExecutorWithDependencies is a class that extends ExecutorBase and provides functionality for executing tasks with
    dependencies.

    On construction a backend executor is created with create_executor() and a RaisingThread targeting
    execute_tasks_with_dependencies() is registered via _set_process(); the thread receives the queue of this
    executor as well as the queue of the backend executor. When the dependency graph is requested, submit() does not
    forward tasks for execution but only records them, and the graph is drawn when the context manager exits.

    Args:
        refresh_rate (float, optional): The refresh rate for updating the executor queue. Defaults to 0.01.
        plot_dependency_graph (bool, optional): Whether to generate and plot the dependency graph. Defaults to False.
        plot_dependency_graph_filename (str, optional): Name of the file to store the plotted graph in. Providing a
            filename enables the dependency graph even if plot_dependency_graph is False.
        *args: Positional arguments forwarded to create_executor().
        **kwargs: Keyword arguments forwarded to create_executor(). The "max_cores" entry, if present, is also
            passed to the ExecutorBase constructor.

    Attributes:
        _future_hash_dict (Dict[str, Future]): A dictionary mapping task hash to future object.
        _task_hash_dict (Dict[str, Dict]): A dictionary mapping task hash to task dictionary.
        _generate_dependency_graph (bool): Whether to generate the dependency graph.
        _plot_dependency_graph_filename (Optional[str]): Name of the file to store the plotted graph in.

    """

    def __init__(
        self,
        *args: Any,
        refresh_rate: float = 0.01,
        plot_dependency_graph: bool = False,
        plot_dependency_graph_filename: Optional[str] = None,
        **kwargs: Any,
    ) -> None:
        super().__init__(max_cores=kwargs.get("max_cores", None))
        executor = create_executor(*args, **kwargs)
        self._set_process(
            RaisingThread(
                target=execute_tasks_with_dependencies,
                kwargs={
                    # Executor Arguments
                    "future_queue": self._future_queue,
                    "executor_queue": executor._future_queue,
                    "executor": executor,
                    "refresh_rate": refresh_rate,
                },
            )
        )
        self._future_hash_dict: dict = {}
        self._task_hash_dict: dict = {}
        self._plot_dependency_graph_filename = plot_dependency_graph_filename
        if plot_dependency_graph_filename is None:
            self._generate_dependency_graph = plot_dependency_graph
        else:
            # A filename implies that the graph should be generated.
            self._generate_dependency_graph = True

    def submit(  # type: ignore
        self,
        fn: Callable[..., Any],
        *args: Any,
        resource_dict: Optional[Dict[str, Any]] = None,
        **kwargs: Any,
    ) -> Future:
        """
        Submits a task to the executor.

        When the dependency graph is generated the task is not executed. Instead a future which is already resolved
        to None is returned and the task is recorded for plotting.

        Args:
            fn (Callable): The function to be executed.
            *args: Variable length argument list.
            resource_dict (dict, optional): A dictionary of resources required by the task. Defaults to an empty
                dictionary.
            **kwargs: Arbitrary keyword arguments.

        Returns:
            Future: A future object representing the result of the task.

        """
        # Use a fresh dictionary per call rather than a shared mutable default, because the dictionary is stored in
        # the task dictionary and handed over to the parent class.
        if resource_dict is None:
            resource_dict = {}
        if not self._generate_dependency_graph:
            f = super().submit(fn, *args, resource_dict=resource_dict, **kwargs)
        else:
            f = Future()
            f.set_result(None)
            task_dict = {
                "fn": fn,
                "args": args,
                "kwargs": kwargs,
                "future": f,
                "resource_dict": resource_dict,
            }
            task_hash = generate_task_hash(
                task_dict=task_dict,
                future_hash_inverse_dict={
                    v: k for k, v in self._future_hash_dict.items()
                },
            )
            self._future_hash_dict[task_hash] = f
            self._task_hash_dict[task_hash] = task_dict
        return f

    def __exit__(
        self,
        exc_type: Any,
        exc_val: Any,
        exc_tb: Any,
    ) -> None:
        """
        Exit method called when exiting the context manager.

        After the parent class has handled the exit, the dependency graph is drawn if it was requested.

        Args:
            exc_type: The type of the exception.
            exc_val: The exception instance.
            exc_tb: The traceback object.

        """
        super().__exit__(exc_type=exc_type, exc_val=exc_val, exc_tb=exc_tb)  # type: ignore
        if self._generate_dependency_graph:
            node_lst, edge_lst = generate_nodes_and_edges(
                task_hash_dict=self._task_hash_dict,
                future_hash_inverse_dict={
                    v: k for k, v in self._future_hash_dict.items()
                },
            )
            # Deliberately do not return the result of draw(): a truthy return value from __exit__ would make the
            # with-statement swallow an exception raised inside the context.
            draw(
                node_lst=node_lst,
                edge_lst=edge_lst,
                filename=self._plot_dependency_graph_filename,
            )
|
|
162
|
+
|
|
163
|
+
|
|
164
|
+
def create_executor(
    max_workers: Optional[int] = None,
    backend: str = "local",
    max_cores: Optional[int] = None,
    cache_directory: Optional[str] = None,
    resource_dict: Optional[dict] = None,
    flux_executor=None,
    flux_executor_pmi_mode: Optional[str] = None,
    flux_executor_nesting: bool = False,
    flux_log_files: bool = False,
    hostname_localhost: Optional[bool] = None,
    block_allocation: bool = False,
    init_function: Optional[Callable] = None,
):
    """
    Create the backend specific executor.

    Depending on the selected backend the executor is built with the FluxPythonSpawner ("flux_allocation"), the
    SrunSpawner ("slurm_allocation") or the MpiExecSpawner ("local"). With block_allocation enabled an
    InteractiveExecutor is returned, otherwise an InteractiveStepExecutor. The flux backend requires flux-base from
    the flux-framework to be installed and in addition flux-sched to enable GPU scheduling; the names used for it are
    only available when the optional flux import at the top of this module succeeded. The SLURM backend requires the
    SLURM workload manager to be installed on the system.

    The resource_dict provided by the caller is never modified, the function operates on a shallow copy.

    Args:
        max_workers (int): for backwards compatibility with the standard library, max_workers also defines the number of
                           cores which can be used in parallel - just like the max_cores parameter. Using max_cores is
                           recommended, as computers have a limited number of compute cores.
        backend (str): Switch between the different backends "flux_allocation", "slurm_allocation" or "local". The
                       default is "local". When a flux_executor is provided the backend is set to "flux_allocation".
        max_cores (int): defines the number cores which can be used in parallel
        cache_directory (str, optional): The directory to store cache files. Defaults to None; the value is passed on
                                         to the executor as part of the resource dictionary.
        resource_dict (dict, optional): A dictionary of resources required by the task. Defaults to an empty
                                        dictionary. With the following keys:
                              - cores (int): number of MPI cores to be used for each function call
                              - threads_per_core (int): number of OpenMP threads to be used for each function call
                              - gpus_per_core (int): number of GPUs per worker - defaults to 0
                              - cwd (str/None): current working directory where the parallel python task is executed
                              - openmpi_oversubscribe (bool): adds the `--oversubscribe` command line flag (OpenMPI and
                                                              SLURM only) - default False
                              - slurm_cmd_args (list): Additional command line arguments for the srun call (SLURM only)
        flux_executor (flux.job.FluxExecutor): Flux Python interface to submit the workers to flux
        flux_executor_pmi_mode (str): PMI interface to use (OpenMPI v5 requires pmix) default is None (Flux only)
        flux_executor_nesting (bool): Provide hierarchically nested Flux job scheduler inside the submitted function.
        flux_log_files (bool, optional): Write flux stdout and stderr files. Defaults to False.
        hostname_localhost (boolean): use localhost instead of the hostname to establish the zmq connection. In the
                                      context of an HPC cluster this essential to be able to communicate to an Executor
                                      running on a different compute node within the same allocation. And in principle
                                      any computer should be able to resolve that their own hostname points to the same
                                      address as localhost. Still MacOS >= 12 seems to disable this look up for security
                                      reasons. So on MacOS it is required to set this option to true
        block_allocation (boolean): To accelerate the submission of a series of python functions with the same
                                    resource requirements, executorlib supports block allocation. In this case all
                                    resources have to be defined on the executor, rather than during the submission
                                    of the individual function.
        init_function (None): optional function to preset arguments for functions which are submitted later

    Returns:
        InteractiveExecutor when block_allocation is True, otherwise InteractiveStepExecutor.

    Raises:
        ValueError: If the backend is not one of the supported backends.
    """
    check_init_function(block_allocation=block_allocation, init_function=init_function)
    if flux_executor is not None and backend != "flux_allocation":
        backend = "flux_allocation"
    check_pmi(backend=backend, pmi=flux_executor_pmi_mode)
    # Work on a private copy: the dictionary is extended and pruned below, which must neither leak into the
    # dictionary of the caller nor persist between calls through a shared default argument.
    resource_dict = {} if resource_dict is None else dict(resource_dict)
    cores_per_worker = resource_dict.get("cores", 1)
    resource_dict["cache_directory"] = cache_directory
    resource_dict["hostname_localhost"] = hostname_localhost
    if backend == "flux_allocation":
        check_oversubscribe(
            oversubscribe=resource_dict.get("openmpi_oversubscribe", False)
        )
        check_command_line_argument_lst(
            command_line_argument_lst=resource_dict.get("slurm_cmd_args", [])
        )
        # Options which were validated above but are not forwarded to the flux spawner.
        resource_dict.pop("openmpi_oversubscribe", None)
        resource_dict.pop("slurm_cmd_args", None)
        resource_dict["flux_executor"] = flux_executor
        resource_dict["flux_executor_pmi_mode"] = flux_executor_pmi_mode
        resource_dict["flux_executor_nesting"] = flux_executor_nesting
        resource_dict["flux_log_files"] = flux_log_files
        if block_allocation:
            resource_dict["init_function"] = init_function
            max_workers = validate_number_of_cores(
                max_cores=max_cores,
                max_workers=max_workers,
                cores_per_worker=cores_per_worker,
                set_local_cores=False,
            )
            validate_max_workers_flux(
                max_workers=max_workers,
                cores=cores_per_worker,
                threads_per_core=resource_dict.get("threads_per_core", 1),
            )
            return InteractiveExecutor(
                max_workers=max_workers,
                executor_kwargs=resource_dict,
                spawner=FluxPythonSpawner,
            )
        else:
            return InteractiveStepExecutor(
                max_cores=max_cores,
                max_workers=max_workers,
                executor_kwargs=resource_dict,
                spawner=FluxPythonSpawner,
            )
    elif backend == "slurm_allocation":
        check_executor(executor=flux_executor)
        check_nested_flux_executor(nested_flux_executor=flux_executor_nesting)
        check_flux_log_files(flux_log_files=flux_log_files)
        if block_allocation:
            resource_dict["init_function"] = init_function
            max_workers = validate_number_of_cores(
                max_cores=max_cores,
                max_workers=max_workers,
                cores_per_worker=cores_per_worker,
                set_local_cores=False,
            )
            validate_max_workers_slurm(
                max_workers=max_workers,
                cores=cores_per_worker,
                threads_per_core=resource_dict.get("threads_per_core", 1),
            )
            return InteractiveExecutor(
                max_workers=max_workers,
                executor_kwargs=resource_dict,
                spawner=SrunSpawner,
            )
        else:
            return InteractiveStepExecutor(
                max_cores=max_cores,
                max_workers=max_workers,
                executor_kwargs=resource_dict,
                spawner=SrunSpawner,
            )
    elif backend == "local":
        check_executor(executor=flux_executor)
        check_nested_flux_executor(nested_flux_executor=flux_executor_nesting)
        check_flux_log_files(flux_log_files=flux_log_files)
        check_gpus_per_worker(gpus_per_worker=resource_dict.get("gpus_per_core", 0))
        check_command_line_argument_lst(
            command_line_argument_lst=resource_dict.get("slurm_cmd_args", [])
        )
        # Options which were validated above but are not forwarded to the local spawner.
        resource_dict.pop("threads_per_core", None)
        resource_dict.pop("gpus_per_core", None)
        resource_dict.pop("slurm_cmd_args", None)
        if block_allocation:
            resource_dict["init_function"] = init_function
            return InteractiveExecutor(
                max_workers=validate_number_of_cores(
                    max_cores=max_cores,
                    max_workers=max_workers,
                    cores_per_worker=cores_per_worker,
                    set_local_cores=True,
                ),
                executor_kwargs=resource_dict,
                spawner=MpiExecSpawner,
            )
        else:
            return InteractiveStepExecutor(
                max_cores=max_cores,
                max_workers=max_workers,
                executor_kwargs=resource_dict,
                spawner=MpiExecSpawner,
            )
    else:
        raise ValueError(
            "The supported backends are slurm_allocation, slurm_submission, flux_allocation, flux_submission and local."
        )
|
|
@@ -0,0 +1,135 @@
|
|
|
1
|
+
import os
|
|
2
|
+
from typing import Optional
|
|
3
|
+
|
|
4
|
+
import flux
|
|
5
|
+
import flux.job
|
|
6
|
+
|
|
7
|
+
from executorlib.standalone.interactive.spawner import BaseSpawner
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def validate_max_workers(max_workers: int, cores: int, threads_per_core: int):
    """
    Check that the requested resources fit into the cores flux reports as up.

    The number of requested cores is the product max_workers * cores * threads_per_core. It is compared against
    the ncores value of the "up" resources returned by the flux resource list.

    Args:
        max_workers (int): Number of workers to be started.
        cores (int): Number of cores per worker.
        threads_per_core (int): Number of threads per core.

    Raises:
        ValueError: If more cores are requested than flux reports as available.
    """
    available = flux.resource.list.resource_list(flux.Flux()).get().up.ncores
    requested = max_workers * cores * threads_per_core
    if available < requested:
        raise ValueError(
            f"The number of requested cores is larger than the available cores {available} < {requested}"
        )
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class FluxPythonSpawner(BaseSpawner):
    """
    Spawner which starts the client process as a flux job through a flux.job.FluxExecutor.

    Args:
        cwd (str, optional): Working directory assigned to the flux job. Defaults to None.
        cores (int, optional): Number of tasks (or slots for nested jobs) of the flux job. Defaults to 1.
        threads_per_core (int, optional): Number of cores per task (or slot). Defaults to 1.
        gpus_per_core (int, optional): Number of GPUs per task (or slot). Defaults to 0.
        openmpi_oversubscribe (bool, optional): Oversubscription flag; bootup() rejects True. Defaults to False.
        flux_executor (flux.job.FluxExecutor, optional): Executor used to submit the job. When None, a new
            FluxExecutor is created during bootup(). Defaults to None.
        flux_executor_pmi_mode (str, optional): Value for the "pmi" shell option of the job. Defaults to None.
        flux_executor_nesting (bool, optional): Build the job specification with from_nest_command() instead of
            from_command(). Defaults to False.
        flux_log_files (bool, optional): Write flux stdout and stderr files. Defaults to False.
    """

    def __init__(
        self,
        cwd: Optional[str] = None,
        cores: int = 1,
        threads_per_core: int = 1,
        gpus_per_core: int = 0,
        openmpi_oversubscribe: bool = False,
        flux_executor: Optional[flux.job.FluxExecutor] = None,
        flux_executor_pmi_mode: Optional[str] = None,
        flux_executor_nesting: bool = False,
        flux_log_files: bool = False,
    ):
        super().__init__(
            cwd=cwd,
            cores=cores,
            openmpi_oversubscribe=openmpi_oversubscribe,
        )
        # Flux specific settings which the base class does not receive.
        self._threads_per_core = threads_per_core
        self._gpus_per_core = gpus_per_core
        self._flux_executor = flux_executor
        self._flux_executor_pmi_mode = flux_executor_pmi_mode
        self._flux_executor_nesting = flux_executor_nesting
        self._flux_log_files = flux_log_files
        # Future of the submitted flux job, assigned in bootup().
        self._future = None

    def bootup(
        self,
        command_lst: list[str],
    ):
        """
        Submit the client process as a flux job.

        Args:
            command_lst (list[str]): Command to start the client process.

        Raises:
            ValueError: If oversubscription was requested, which this spawner does not support.
        """
        if self._openmpi_oversubscribe:
            raise ValueError(
                "Oversubscribing is currently not supported for the Flux adapter."
            )
        if self._flux_executor is None:
            self._flux_executor = flux.job.FluxExecutor()
        if self._flux_executor_nesting:
            jobspec = flux.job.JobspecV1.from_nest_command(
                command=command_lst,
                num_slots=self._cores,
                cores_per_slot=self._threads_per_core,
                gpus_per_slot=self._gpus_per_core,
                num_nodes=None,
                exclusive=False,
            )
        else:
            jobspec = flux.job.JobspecV1.from_command(
                command=command_lst,
                num_tasks=self._cores,
                cores_per_task=self._threads_per_core,
                gpus_per_task=self._gpus_per_core,
                num_nodes=None,
                exclusive=False,
            )
        # The job receives a copy of the environment of the current process.
        jobspec.environment = dict(os.environ)
        if self._flux_executor_pmi_mode is not None:
            jobspec.setattr_shell_option("pmi", self._flux_executor_pmi_mode)
        working_directory = self._cwd
        if working_directory is not None:
            jobspec.cwd = working_directory
        if self._flux_log_files:
            # Log files are placed in the working directory of the job when one is set, otherwise they are
            # resolved relative to the current directory of this process.
            if working_directory is not None:
                jobspec.stderr = os.path.join(working_directory, "flux.err")
                jobspec.stdout = os.path.join(working_directory, "flux.out")
            else:
                jobspec.stderr = os.path.abspath("flux.err")
                jobspec.stdout = os.path.abspath("flux.out")
        self._future = self._flux_executor.submit(jobspec)

    def shutdown(self, wait: bool = True):
        """
        Cancel the flux job if it is still running and wait for its future to finish.

        Args:
            wait (bool, optional): Whether to wait for the execution to complete. Defaults to True. The argument
                is not used by this implementation.
        """
        if self._future is None:
            return
        if self.poll():
            self._future.cancel()
        # The flux future objects are not instantly updated,
        # still showing running after cancel was called,
        # so we wait until the execution is completed.
        self._future.result()

    def poll(self):
        """
        Report whether the submitted flux job is still running.

        Returns:
            bool: True if a job was submitted and its future is not done yet, False otherwise.
        """
        if self._future is None:
            return False
        return not self._future.done()
|