fractal-server 2.13.1__py3-none-any.whl → 2.14.0a0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fractal_server/__init__.py +1 -1
- fractal_server/app/history/__init__.py +4 -0
- fractal_server/app/history/image_updates.py +142 -0
- fractal_server/app/history/status_enum.py +16 -0
- fractal_server/app/models/v2/__init__.py +5 -1
- fractal_server/app/models/v2/history.py +53 -0
- fractal_server/app/routes/api/v2/__init__.py +2 -2
- fractal_server/app/routes/api/v2/_aux_functions.py +78 -0
- fractal_server/app/routes/api/v2/dataset.py +12 -9
- fractal_server/app/routes/api/v2/history.py +247 -0
- fractal_server/app/routes/api/v2/workflow.py +18 -3
- fractal_server/app/routes/api/v2/workflowtask.py +22 -0
- fractal_server/app/runner/executors/base_runner.py +114 -0
- fractal_server/app/runner/{v2/_local → executors/local}/_local_config.py +3 -3
- fractal_server/app/runner/executors/local/_submit_setup.py +54 -0
- fractal_server/app/runner/executors/local/runner.py +200 -0
- fractal_server/app/runner/executors/{slurm → slurm_common}/_batching.py +1 -1
- fractal_server/app/runner/executors/{slurm → slurm_common}/_slurm_config.py +3 -3
- fractal_server/app/runner/{v2/_slurm_ssh → executors/slurm_common}/_submit_setup.py +13 -12
- fractal_server/app/runner/{v2/_slurm_common → executors/slurm_common}/get_slurm_config.py +9 -15
- fractal_server/app/runner/executors/{slurm/ssh → slurm_ssh}/_executor_wait_thread.py +1 -1
- fractal_server/app/runner/executors/{slurm/ssh → slurm_ssh}/_slurm_job.py +1 -1
- fractal_server/app/runner/executors/{slurm/ssh → slurm_ssh}/executor.py +13 -14
- fractal_server/app/runner/executors/{slurm/sudo → slurm_sudo}/_check_jobs_status.py +11 -9
- fractal_server/app/runner/executors/{slurm/sudo → slurm_sudo}/_executor_wait_thread.py +3 -3
- fractal_server/app/runner/executors/{slurm/sudo → slurm_sudo}/_subprocess_run_as_user.py +2 -68
- fractal_server/app/runner/executors/slurm_sudo/runner.py +632 -0
- fractal_server/app/runner/task_files.py +70 -96
- fractal_server/app/runner/v2/__init__.py +5 -19
- fractal_server/app/runner/v2/_local.py +84 -0
- fractal_server/app/runner/v2/{_slurm_ssh/__init__.py → _slurm_ssh.py} +10 -13
- fractal_server/app/runner/v2/{_slurm_sudo/__init__.py → _slurm_sudo.py} +10 -12
- fractal_server/app/runner/v2/runner.py +93 -28
- fractal_server/app/runner/v2/runner_functions.py +85 -62
- fractal_server/app/runner/v2/runner_functions_low_level.py +20 -20
- fractal_server/app/schemas/v2/dataset.py +0 -17
- fractal_server/app/schemas/v2/history.py +23 -0
- fractal_server/config.py +2 -2
- fractal_server/migrations/versions/8223fcef886c_image_status.py +63 -0
- fractal_server/migrations/versions/87cd72a537a2_add_historyitem_table.py +68 -0
- {fractal_server-2.13.1.dist-info → fractal_server-2.14.0a0.dist-info}/METADATA +1 -1
- {fractal_server-2.13.1.dist-info → fractal_server-2.14.0a0.dist-info}/RECORD +52 -46
- fractal_server/app/routes/api/v2/status.py +0 -168
- fractal_server/app/runner/executors/slurm/sudo/executor.py +0 -1281
- fractal_server/app/runner/v2/_local/__init__.py +0 -132
- fractal_server/app/runner/v2/_local/_submit_setup.py +0 -52
- fractal_server/app/runner/v2/_local/executor.py +0 -100
- fractal_server/app/runner/v2/_slurm_sudo/_submit_setup.py +0 -83
- fractal_server/app/runner/v2/handle_failed_job.py +0 -59
- /fractal_server/app/runner/executors/{slurm → local}/__init__.py +0 -0
- /fractal_server/app/runner/executors/{slurm/ssh → slurm_common}/__init__.py +0 -0
- /fractal_server/app/runner/executors/{_job_states.py → slurm_common/_job_states.py} +0 -0
- /fractal_server/app/runner/executors/{slurm → slurm_common}/remote.py +0 -0
- /fractal_server/app/runner/executors/{slurm → slurm_common}/utils_executors.py +0 -0
- /fractal_server/app/runner/executors/{slurm/sudo → slurm_ssh}/__init__.py +0 -0
- /fractal_server/app/runner/{v2/_slurm_common → executors/slurm_sudo}/__init__.py +0 -0
- {fractal_server-2.13.1.dist-info → fractal_server-2.14.0a0.dist-info}/LICENSE +0 -0
- {fractal_server-2.13.1.dist-info → fractal_server-2.14.0a0.dist-info}/WHEEL +0 -0
- {fractal_server-2.13.1.dist-info → fractal_server-2.14.0a0.dist-info}/entry_points.txt +0 -0
```diff
--- fractal_server/app/runner/executors/slurm/sudo/executor.py
+++ /dev/null
@@ -1,1281 +0,0 @@
-import json
-import math
-import shlex
-import subprocess  # nosec
-import sys
-import threading
-import time
-import uuid
-from concurrent.futures import Executor
-from concurrent.futures import Future
-from concurrent.futures import InvalidStateError
-from copy import copy
-from pathlib import Path
-from subprocess import CompletedProcess  # nosec
-from typing import Any
-from typing import Callable
-from typing import Optional
-from typing import Sequence
-
-import cloudpickle
-
-from ......config import get_settings
-from ......logger import set_logger
-from ......syringe import Inject
-from ....exceptions import JobExecutionError
-from ....exceptions import TaskExecutionError
-from ....filenames import SHUTDOWN_FILENAME
-from ....task_files import get_task_file_paths
-from ....task_files import TaskFiles
-from ...slurm._slurm_config import SlurmConfig
-from .._batching import heuristics
-from ..utils_executors import get_pickle_file_path
-from ..utils_executors import get_slurm_file_path
-from ..utils_executors import get_slurm_script_file_path
-from ._executor_wait_thread import FractalSlurmSudoWaitThread
-from ._subprocess_run_as_user import _glob_as_user
-from ._subprocess_run_as_user import _glob_as_user_strict
-from ._subprocess_run_as_user import _path_exists_as_user
-from ._subprocess_run_as_user import _run_command_as_user
-from fractal_server import __VERSION__
-from fractal_server.app.runner.components import _COMPONENT_KEY_
-from fractal_server.string_tools import validate_cmd
-
-
-logger = set_logger(__name__)
-
-
-def _subprocess_run_or_raise(full_command: str) -> Optional[CompletedProcess]:
-    """
-    Wrap `subprocess.run` and raise appropriate `JobExecutionError` if needed.
-
-    Args:
-        full_command: Full string of the command to execute.
-
-    Raises:
-        JobExecutionError: If `subprocess.run` raises a `CalledProcessError`.
-
-    Returns:
-        The actual `CompletedProcess` output of `subprocess.run`.
-    """
-    validate_cmd(full_command)
-    try:
-        output = subprocess.run(  # nosec
-            shlex.split(full_command),
-            capture_output=True,
-            check=True,
-            encoding="utf-8",
-        )
-        return output
-    except subprocess.CalledProcessError as e:
-        error_msg = (
-            f"Submit command `{full_command}` failed. "
-            f"Original error:\n{str(e)}\n"
-            f"Original stdout:\n{e.stdout}\n"
-            f"Original stderr:\n{e.stderr}\n"
-        )
-        logger.error(error_msg)
-        raise JobExecutionError(info=error_msg)
-
-
-class SlurmJob:
-    """
-    Collect information related to a FractalSlurmExecutor job
-
-    This includes three groups of attributes:
-
-    1. Attributes related to the (possibly multi-task) SLURM job, e.g.
-       submission-file path.
-    2. Attributes related to single tasks, e.g. the paths of their input/output
-       pickle files.
-    3. SLURM configuration options, encoded in a SlurmConfig object.
-
-    Note: A SlurmJob object is generally defined as a multi-task job. Jobs
-    coming from the `map` method must have `single_task_submission=False` (even
-    if `num_tasks_tot=1`), while jobs coming from `submit` must have it set to
-    `True`.
-
-    Attributes:
-        num_tasks_tot:
-            Total number of tasks to be executed as part of this SLURM job.
-        single_task_submission:
-            This must be `True` for jobs submitted as part of the `submit`
-            method, and `False` for jobs coming from the `map` method.
-        slurm_file_prefix:
-            Prefix for SLURM-job related files (submission script and SLURM
-            stdout/stderr); this is also needed in the
-            `_copy_files_from_remote_to_local` method.
-        wftask_file_prefixes:
-            Prefix for files that are created as part of the functions
-            submitted for execution on the `FractalSlurmExecutor`; this is
-            needed in the `_copy_files_from_remote_to_local` method, and also
-            to construct the names of per-task input/output pickle files.
-        wftask_subfolder_name:
-            Name of the per-task subfolder (e.g. `7_task_name`).
-        slurm_script:
-            Path of SLURM submission script.
-        slurm_stdout:
-            Path of SLURM stdout file; if this includes `"%j"`, then this
-            string will be replaced by the SLURM job ID upon `sbatch`
-            submission.
-        slurm_stderr:
-            Path of SLURM stderr file; see `slurm_stdout` concerning `"%j"`.
-        workerids:
-            IDs that enter in the per-task input/output pickle files (one per
-            task).
-        input_pickle_files:
-            Input pickle files (one per task).
-        output_pickle_files:
-            Output pickle files (one per task).
-        slurm_config:
-            `SlurmConfig` object.
-    """
-
-    # Job-related attributes
-    num_tasks_tot: int
-    single_task_submission: bool
-    slurm_file_prefix: str
-    slurm_script: Path
-    slurm_stdout: Path
-    slurm_stderr: Path
-    # Per-task attributes
-    workerids: tuple[str, ...]
-    wftask_file_prefixes: tuple[str, ...]
-    wftask_subfolder_name: str
-    input_pickle_files: tuple[Path, ...]
-    output_pickle_files: tuple[Path, ...]
-    # Slurm configuration
-    slurm_config: SlurmConfig
-
-    def __init__(
-        self,
-        num_tasks_tot: int,
-        slurm_config: SlurmConfig,
-        slurm_file_prefix: Optional[str] = None,
-        wftask_file_prefixes: Optional[tuple[str, ...]] = None,
-        single_task_submission: bool = False,
-    ):
-        if single_task_submission and num_tasks_tot > 1:
-            raise ValueError(
-                "Trying to initialize SlurmJob with "
-                f"{single_task_submission=} and {num_tasks_tot=}."
-            )
-        self.num_tasks_tot = num_tasks_tot
-        self.single_task_submission = single_task_submission
-        self.slurm_file_prefix = slurm_file_prefix or "default_slurm_prefix"
-        if wftask_file_prefixes is None:
-            self.wftask_file_prefixes = tuple(
-                "default_wftask_prefix" for i in range(self.num_tasks_tot)
-            )
-        else:
-            self.wftask_file_prefixes = wftask_file_prefixes
-        self.workerids = tuple(uuid.uuid4() for i in range(self.num_tasks_tot))
-        self.slurm_config = slurm_config
-
-    def get_clean_output_pickle_files(self) -> tuple[str, ...]:
-        """
-        Transform all pathlib.Path objects in self.output_pickle_files to
-        strings
-        """
-        return tuple(str(f.as_posix()) for f in self.output_pickle_files)
-
-
-class FractalSlurmSudoExecutor(Executor):
-    """
-    Executor to submit SLURM jobs as a different user, via `sudo -u`
-
-    This class is a custom re-implementation of the SLURM executor from
-
-    > clusterfutures <https://github.com/sampsyo/clusterfutures>
-    > Original Copyright
-    > Copyright 2021 Adrian Sampson <asampson@cs.washington.edu>
-    > License: MIT
-
-
-    Attributes:
-        slurm_user:
-            Shell username that runs the `sbatch` command.
-        common_script_lines:
-            Arbitrary script lines that will always be included in the
-            sbatch script
-        workflow_dir_local:
-            Directory for both the cfut/SLURM and fractal-server files and logs
-        workflow_dir_remote:
-            Directory for both the cfut/SLURM and fractal-server files and logs
-        map_jobid_to_slurm_files:
-            Dictionary with paths of slurm-related files for active jobs
-    """
-
-    wait_thread_cls = FractalSlurmSudoWaitThread
-    slurm_user: str
-    shutdown_file: str
-    common_script_lines: list[str]
-    user_cache_dir: str
-    workflow_dir_local: Path
-    workflow_dir_remote: Path
-    map_jobid_to_slurm_files: dict[str, tuple[str, str, str]]
-    slurm_account: Optional[str] = None
-    jobs: dict[str, tuple[Future, SlurmJob]]
-
-    def __init__(
-        self,
-        slurm_user: str,
-        workflow_dir_local: Path,
-        workflow_dir_remote: Path,
-        shutdown_file: Optional[str] = None,
-        user_cache_dir: Optional[str] = None,
-        common_script_lines: Optional[list[str]] = None,
-        slurm_poll_interval: Optional[int] = None,
-        slurm_account: Optional[str] = None,
-        *args,
-        **kwargs,
-    ):
-        """
-        Init method for FractalSlurmExecutor
-        """
-
-        if not slurm_user:
-            raise RuntimeError(
-                "Missing attribute FractalSlurmExecutor.slurm_user"
-            )
-
-        self.jobs = {}
-        self.job_outfiles = {}
-        self.jobs_lock = threading.Lock()
-        self.jobs_empty_cond = threading.Condition(self.jobs_lock)
-
-        self.wait_thread = self.wait_thread_cls(self._completion)
-        self.wait_thread.start()
-
-        # Assign `wait_thread.shutdown_callback` early, since it may be called
-        # from within `_stop_and_join_wait_thread` (e.g. if an exception is
-        # raised within `__init__`).
-        self.wait_thread.shutdown_callback = self.shutdown
-
-        self.slurm_user = slurm_user
-        self.slurm_account = slurm_account
-
-        self.common_script_lines = common_script_lines or []
-        settings = Inject(get_settings)
-
-        if settings.FRACTAL_SLURM_WORKER_PYTHON is not None:
-            try:
-                self.check_remote_python_interpreter()
-            except Exception as e:
-                self._stop_and_join_wait_thread()
-                raise RuntimeError(f"Original error {str(e)}")
-
-        # Check that SLURM account is not set here
-        try:
-            invalid_line = next(
-                line
-                for line in self.common_script_lines
-                if line.startswith("#SBATCH --account=")
-            )
-            self._stop_and_join_wait_thread()
-            raise RuntimeError(
-                "Invalid line in `FractalSlurmExecutor.common_script_lines`: "
-                f"'{invalid_line}'.\n"
-                "SLURM account must be set via the request body of the "
-                "apply-workflow endpoint, or by modifying the user properties."
-            )
-        except StopIteration:
-            pass
-
-        self.workflow_dir_local = workflow_dir_local
-        if not _path_exists_as_user(
-            path=str(workflow_dir_remote), user=self.slurm_user
-        ):
-            logger.info(f"Missing folder {workflow_dir_remote=}")
-        self.user_cache_dir = user_cache_dir
-
-        self.workflow_dir_remote = workflow_dir_remote
-        self.map_jobid_to_slurm_files = {}
-
-        # Set the attribute slurm_poll_interval for self.wait_thread (see
-        # cfut.SlurmWaitThread)
-        if not slurm_poll_interval:
-            slurm_poll_interval = settings.FRACTAL_SLURM_POLL_INTERVAL
-        self.wait_thread.slurm_poll_interval = slurm_poll_interval
-        self.wait_thread.slurm_user = self.slurm_user
-
-        self.wait_thread.shutdown_file = (
-            shutdown_file
-            or (self.workflow_dir_local / SHUTDOWN_FILENAME).as_posix()
-        )
-
-    def _cleanup(self, jobid: str) -> None:
-        """
-        Given a job ID as returned by _start, perform any necessary
-        cleanup after the job has finished.
-        """
-        with self.jobs_lock:
-            self.map_jobid_to_slurm_files.pop(jobid)
-
-    def submit(
-        self,
-        fun: Callable[..., Any],
-        *fun_args: Sequence[Any],
-        slurm_config: SlurmConfig,
-        task_files: TaskFiles,
-        **fun_kwargs: dict,
-    ) -> Future:
-        """
-        Submit a function for execution on `FractalSlurmExecutor`
-
-        Arguments:
-            fun: The function to be executed
-            fun_args: Function positional arguments
-            fun_kwargs: Function keyword arguments
-            slurm_config:
-                A `SlurmConfig` object.
-            task_files:
-                A `TaskFiles` object.
-
-        Returns:
-            Future representing the execution of the current SLURM job.
-        """
-
-        # Do not continue if auxiliary thread was shut down
-        if self.wait_thread.shutdown:
-            error_msg = "Cannot call `submit` method after executor shutdown"
-            logger.warning(error_msg)
-            raise JobExecutionError(info=error_msg)
-
-        # Set slurm_file_prefix
-        slurm_file_prefix = task_files.file_prefix
-
-        # Include common_script_lines in extra_lines
-        logger.debug(
-            f"Adding {self.common_script_lines=} to "
-            f"{slurm_config.extra_lines=}, from submit method."
-        )
-        current_extra_lines = slurm_config.extra_lines or []
-        slurm_config.extra_lines = (
-            current_extra_lines + self.common_script_lines
-        )
-
-        # Adapt slurm_config to the fact that this is a single-task SlurmJob
-        # instance
-        slurm_config.tasks_per_job = 1
-        slurm_config.parallel_tasks_per_job = 1
-
-        fut = self._submit_job(
-            fun,
-            slurm_config=slurm_config,
-            slurm_file_prefix=slurm_file_prefix,
-            task_files=task_files,
-            single_task_submission=True,
-            args=fun_args,
-            kwargs=fun_kwargs,
-        )
-        return fut
-
-    def map(
-        self,
-        fn: Callable[..., Any],
-        iterable: list[Sequence[Any]],
-        *,
-        slurm_config: SlurmConfig,
-        task_files: TaskFiles,
-    ):
-        """
-        Return an iterator with the results of several execution of a function
-
-        This function is based on `concurrent.futures.Executor.map` from Python
-        Standard Library 3.11.
-        Original Copyright 2009 Brian Quinlan. All Rights Reserved. Licensed to
-        PSF under a Contributor Agreement.
-
-        Main modifications from the PSF function:
-
-        1. Only `fn` and `iterable` can be assigned as positional arguments;
-        2. `*iterables` argument replaced with a single `iterable`;
-        3. `timeout` and `chunksize` arguments are not supported.
-
-        Arguments:
-            fn:
-                The function to be executed
-            iterable:
-                An iterable such that each element is the list of arguments to
-                be passed to `fn`, as in `fn(*args)`.
-            slurm_config:
-                A `SlurmConfig` object.
-            task_files:
-                A `TaskFiles` object.
-
-        """
-
-        # Do not continue if auxiliary thread was shut down
-        if self.wait_thread.shutdown:
-            error_msg = "Cannot call `map` method after executor shutdown"
-            logger.warning(error_msg)
-            raise JobExecutionError(info=error_msg)
-
-        def _result_or_cancel(fut):
-            """
-            This function is based on the Python Standard Library 3.11.
-            Original Copyright 2009 Brian Quinlan. All Rights Reserved.
-            Licensed to PSF under a Contributor Agreement.
-            """
-            try:
-                try:
-                    return fut.result()
-                finally:
-                    fut.cancel()
-            finally:
-                # Break a reference cycle with the exception in
-                # self._exception
-                del fut
-
-        # Include common_script_lines in extra_lines
-        logger.debug(
-            f"Adding {self.common_script_lines=} to "
-            f"{slurm_config.extra_lines=}, from map method."
-        )
-        current_extra_lines = slurm_config.extra_lines or []
-        slurm_config.extra_lines = (
-            current_extra_lines + self.common_script_lines
-        )
-
-        # Set file prefixes
-        general_slurm_file_prefix = str(task_files.task_order)
-
-        # Transform iterable into a list and count its elements
-        list_args = list(iterable)
-        tot_tasks = len(list_args)
-
-        # Set/validate parameters for task batching
-        tasks_per_job, parallel_tasks_per_job = heuristics(
-            # Number of parallel components (always known)
-            tot_tasks=len(list_args),
-            # Optional WorkflowTask attributes:
-            tasks_per_job=slurm_config.tasks_per_job,
-            parallel_tasks_per_job=slurm_config.parallel_tasks_per_job,  # noqa
-            # Task requirements (multiple possible sources):
-            cpus_per_task=slurm_config.cpus_per_task,
-            mem_per_task=slurm_config.mem_per_task_MB,
-            # Fractal configuration variables (soft/hard limits):
-            target_cpus_per_job=slurm_config.target_cpus_per_job,
-            target_mem_per_job=slurm_config.target_mem_per_job,
-            target_num_jobs=slurm_config.target_num_jobs,
-            max_cpus_per_job=slurm_config.max_cpus_per_job,
-            max_mem_per_job=slurm_config.max_mem_per_job,
-            max_num_jobs=slurm_config.max_num_jobs,
-        )
-        slurm_config.parallel_tasks_per_job = parallel_tasks_per_job
-        slurm_config.tasks_per_job = tasks_per_job
-
-        # Divide arguments in batches of `n_tasks_per_script` tasks each
-        args_batches = []
-        batch_size = tasks_per_job
-        for ind_chunk in range(0, tot_tasks, batch_size):
-            args_batches.append(
-                list_args[ind_chunk : ind_chunk + batch_size]  # noqa
-            )
-        if len(args_batches) != math.ceil(tot_tasks / tasks_per_job):
-            raise RuntimeError("Something wrong here while batching tasks")
-
-        # Fetch configuration variable
-        settings = Inject(get_settings)
-        FRACTAL_SLURM_SBATCH_SLEEP = settings.FRACTAL_SLURM_SBATCH_SLEEP
-
-        # Construct list of futures (one per SLURM job, i.e. one per batch)
-        fs = []
-        current_component_index = 0
-        for ind_batch, batch in enumerate(args_batches):
-            batch_size = len(batch)
-            this_slurm_file_prefix = (
-                f"{general_slurm_file_prefix}_batch_{ind_batch:06d}"
-            )
-            fs.append(
-                self._submit_job(
-                    fn,
-                    slurm_config=slurm_config,
-                    slurm_file_prefix=this_slurm_file_prefix,
-                    task_files=task_files,
-                    single_task_submission=False,
-                    components=batch,
-                )
-            )
-            current_component_index += batch_size
-            time.sleep(FRACTAL_SLURM_SBATCH_SLEEP)
-
-        # Yield must be hidden in closure so that the futures are submitted
-        # before the first iterator value is required.
-        # NOTE: In this custom map() method, _result_or_cancel(fs.pop()) is an
-        # iterable of results (if successful), and we should yield its elements
-        # rather than the whole iterable.
-        def result_iterator():
-            """
-            This function is based on the Python Standard Library 3.11.
-            Original Copyright 2009 Brian Quinlan. All Rights Reserved.
-            Licensed to PSF under a Contributor Agreement.
-            """
-            try:
-                # reverse to keep finishing order
-                fs.reverse()
-                while fs:
-                    # Careful not to keep a reference to the popped future
-                    results = _result_or_cancel(fs.pop())
-                    for res in results:
-                        yield res
-            finally:
-                for future in fs:
-                    future.cancel()
-
-        return result_iterator()
-
-    def _submit_job(
-        self,
-        fun: Callable[..., Any],
-        slurm_file_prefix: str,
-        task_files: TaskFiles,
-        slurm_config: SlurmConfig,
-        single_task_submission: bool = False,
-        args: Optional[Sequence[Any]] = None,
-        kwargs: Optional[dict] = None,
-        components: Optional[list[Any]] = None,
-    ) -> Future:
-        """
-        Submit a multi-task job to the pool, where each task is handled via the
-        pickle/remote logic
-
-        NOTE: this method has different behaviors when it is called from the
-        `self.submit` or `self.map` methods (which is also encoded in
-        `single_task_submission`):
-
-        * When called from `self.submit`, it supports general `args` and
-          `kwargs` arguments;
-        * When called from `self.map`, there cannot be any `args` or `kwargs`
-          argument, but there must be a `components` argument.
-
-        Arguments:
-            fun:
-            slurm_file_prefix:
-            task_files:
-            slurm_config:
-            single_task_submission:
-            args:
-            kwargs:
-            components:
-
-        Returns:
-            Future representing the execution of the current SLURM job.
-        """
-
-        # Prevent calling sbatch if auxiliary thread was shut down
-        if self.wait_thread.shutdown:
-            error_msg = (
-                "Cannot call `_submit_job` method after executor shutdown"
-            )
-            logger.warning(error_msg)
-            raise JobExecutionError(info=error_msg)
-
-        fut: Future = Future()
-
-        # Inject SLURM account (if set) into slurm_config
-        if self.slurm_account:
-            slurm_config.account = self.slurm_account
-
-        # Define slurm-job-related files
-        if single_task_submission:
-            if components is not None:
-                raise ValueError(
-                    f"{single_task_submission=} but components is not None"
-                )
-            job = SlurmJob(
-                slurm_file_prefix=slurm_file_prefix,
-                num_tasks_tot=1,
-                slurm_config=slurm_config,
-            )
-            if job.num_tasks_tot > 1:
-                raise ValueError(
-                    f"{single_task_submission=} but {job.num_tasks_tot=}"
-                )
-            job.single_task_submission = True
-            job.wftask_file_prefixes = (task_files.file_prefix,)
-            job.wftask_subfolder_name = task_files.subfolder_name
-
-        else:
-            if not components or len(components) < 1:
-                raise ValueError(
-                    "In FractalSlurmExecutor._submit_job, given "
-                    f"{components=}."
-                )
-            num_tasks_tot = len(components)
-            job = SlurmJob(
-                slurm_file_prefix=slurm_file_prefix,
-                num_tasks_tot=num_tasks_tot,
-                slurm_config=slurm_config,
-            )
-
-            _prefixes = []
-            _subfolder_names = []
-            for component in components:
-                # In Fractal, `component` is a `dict` by construction (e.g.
-                # `component = {"zarr_url": "/something", "param": 1}``). The
-                # try/except covers the case of e.g. `executor.map([1, 2])`,
-                # which is useful for testing.
-                try:
-                    actual_component = component.get(_COMPONENT_KEY_, None)
-                except AttributeError:
-                    actual_component = str(component)
-                _task_file_paths = get_task_file_paths(
-                    workflow_dir_local=task_files.workflow_dir_local,
-                    workflow_dir_remote=task_files.workflow_dir_remote,
-                    task_name=task_files.task_name,
-                    task_order=task_files.task_order,
-                    component=actual_component,
-                )
-                _prefixes.append(_task_file_paths.file_prefix)
-                _subfolder_names.append(_task_file_paths.subfolder_name)
-            job.wftask_file_prefixes = tuple(_prefixes)
-
-            num_subfolders = len(set(_subfolder_names))
-            if num_subfolders != 1:
-                error_msg_short = (
-                    f"[_submit_job] Subfolder list has {num_subfolders} "
-                    "different values, but it must have only one (since "
-                    "workflow tasks are executed one by one)."
-                )
-                error_msg_detail = (
-                    "[_submit_job] Current unique subfolder names: "
-                    f"{set(_subfolder_names)}"
-                )
-                logger.error(error_msg_short)
-                logger.error(error_msg_detail)
-                raise ValueError(error_msg_short)
-            job.wftask_subfolder_name = _subfolder_names[0]
-
-        # Check that server-side subfolder exists
-        subfolder_path = self.workflow_dir_local / job.wftask_subfolder_name
-        if not subfolder_path.exists():
-            raise FileNotFoundError(
-                f"Missing folder {subfolder_path.as_posix()}."
-            )
-
-        job.input_pickle_files = tuple(
-            get_pickle_file_path(
-                arg=job.workerids[ind],
-                workflow_dir=self.workflow_dir_local,
-                subfolder_name=job.wftask_subfolder_name,
-                in_or_out="in",
-                prefix=job.wftask_file_prefixes[ind],
-            )
-            for ind in range(job.num_tasks_tot)
-        )
-        job.output_pickle_files = tuple(
-            get_pickle_file_path(
-                arg=job.workerids[ind],
-                workflow_dir=self.workflow_dir_remote,
-                subfolder_name=job.wftask_subfolder_name,
-                in_or_out="out",
-                prefix=job.wftask_file_prefixes[ind],
-            )
-            for ind in range(job.num_tasks_tot)
-        )
-        # Define SLURM-job file names/paths
-        job.slurm_script = get_slurm_script_file_path(
-            workflow_dir=self.workflow_dir_local,
-            subfolder_name=job.wftask_subfolder_name,
-            prefix=job.slurm_file_prefix,
-        )
-        job.slurm_stdout = get_slurm_file_path(
-            workflow_dir=self.workflow_dir_remote,
-            subfolder_name=job.wftask_subfolder_name,
-            out_or_err="out",
-            prefix=job.slurm_file_prefix,
-        )
-        job.slurm_stderr = get_slurm_file_path(
-            workflow_dir=self.workflow_dir_remote,
-            subfolder_name=job.wftask_subfolder_name,
-            out_or_err="err",
-            prefix=job.slurm_file_prefix,
-        )
-
-        # Dump serialized versions+function+args+kwargs to pickle
-        versions = dict(
-            python=sys.version_info[:3],
-            cloudpickle=cloudpickle.__version__,
-            fractal_server=__VERSION__,
-        )
-        if job.single_task_submission:
-            _args = args or []
-            _kwargs = kwargs or {}
-            funcser = cloudpickle.dumps((versions, fun, _args, _kwargs))
-            with open(job.input_pickle_files[0], "wb") as f:
-                f.write(funcser)
-        else:
-            for ind_component, component in enumerate(components):
-                _args = [component]
-                _kwargs = {}
-                funcser = cloudpickle.dumps((versions, fun, _args, _kwargs))
-                with open(job.input_pickle_files[ind_component], "wb") as f:
-                    f.write(funcser)
-
-        # Submit job to SLURM, and get jobid
-        jobid, job = self._start(job)
-
-        # Add the SLURM script/out/err paths to map_jobid_to_slurm_files (this
-        # must be after self._start(job), so that "%j" has already been
-        # replaced with the job ID)
-        with self.jobs_lock:
-            self.map_jobid_to_slurm_files[jobid] = (
-                job.slurm_script.as_posix(),
-                job.slurm_stdout.as_posix(),
-                job.slurm_stderr.as_posix(),
-            )
-
-        # Thread will wait for it to finish.
-        self.wait_thread.wait(
-            filenames=job.get_clean_output_pickle_files(),
-            jobid=jobid,
-        )
-
-        with self.jobs_lock:
-            self.jobs[jobid] = (fut, job)
-        return fut
-
-    def _prepare_JobExecutionError(
-        self, jobid: str, info: str
-    ) -> JobExecutionError:
-        """
-        Prepare the `JobExecutionError` for a given job
-
-        This method creates a `JobExecutionError` object and sets its attribute
-        to the appropriate SLURM-related file names. Note that the method
-        should always be called after values in `self.map_jobid_to_slurm_files`
-        have been updated, so that they point to `self.workflow_dir_local`
-        files which are readable from `fractal-server`.
-
-        Arguments:
-            jobid:
-                ID of the SLURM job.
-            info:
-        """
-        # Extract SLURM file paths
-        with self.jobs_lock:
-            (
-                slurm_script_file,
-                slurm_stdout_file,
-                slurm_stderr_file,
-            ) = self.map_jobid_to_slurm_files[jobid]
-        # Construct JobExecutionError exception
-        job_exc = JobExecutionError(
-            cmd_file=slurm_script_file,
-            stdout_file=slurm_stdout_file,
-            stderr_file=slurm_stderr_file,
-            info=info,
-        )
-        return job_exc
-
-    def _completion(self, jobid: str) -> None:
-        """
-        Callback function to be executed whenever a job finishes.
-
-        This function is executed by self.wait_thread (triggered by either
-        finding an existing output pickle file `out_path` or finding that the
-        SLURM job is over). Since this takes place on a different thread,
-        failures may not be captured by the main thread; we use a broad
-        try/except block, so that those exceptions are reported to the main
-        thread via `fut.set_exception(...)`.
-
-        Arguments:
-            jobid: ID of the SLURM job
-        """
-        # Handle all uncaught exceptions in this broad try/except block
-        try:
-            # Retrieve job
-            with self.jobs_lock:
-                try:
-                    fut, job = self.jobs.pop(jobid)
-                except KeyError:
-                    return
-                if not self.jobs:
-                    self.jobs_empty_cond.notify_all()
-
-            # Copy all relevant files from self.workflow_dir_remote to
-            # self.workflow_dir_local
-
-            self._copy_files_from_remote_to_local(job)
-
-            # Update the paths to use the files in self.workflow_dir_local
-            # (rather than the user's ones in self.workflow_dir_remote)
-            with self.jobs_lock:
-                self.map_jobid_to_slurm_files[jobid]
-                (
-                    slurm_script_file,
-                    slurm_stdout_file,
-                    slurm_stderr_file,
-                ) = self.map_jobid_to_slurm_files[jobid]
-            new_slurm_stdout_file = str(
-                self.workflow_dir_local
-                / job.wftask_subfolder_name
-                / Path(slurm_stdout_file).name
-            )
-            new_slurm_stderr_file = str(
-                self.workflow_dir_local
-                / job.wftask_subfolder_name
-                / Path(slurm_stderr_file).name
-            )
-            with self.jobs_lock:
-                self.map_jobid_to_slurm_files[jobid] = (
-                    slurm_script_file,
-                    new_slurm_stdout_file,
-                    new_slurm_stderr_file,
-                )
-
-            in_paths = job.input_pickle_files
-            out_paths = tuple(
-                (self.workflow_dir_local / job.wftask_subfolder_name / f.name)
-                for f in job.output_pickle_files
-            )
-
-            outputs = []
-            for ind_out_path, out_path in enumerate(out_paths):
-                in_path = in_paths[ind_out_path]
-
-                # The output pickle file may be missing because of some slow
-                # filesystem operation; wait some time before considering it as
-                # missing
-                if not out_path.exists():
-                    settings = Inject(get_settings)
-                    time.sleep(settings.FRACTAL_SLURM_ERROR_HANDLING_INTERVAL)
-                if not out_path.exists():
-                    # Output pickle file is missing
-                    info = (
-                        "Output pickle file of the FractalSlurmExecutor job "
-                        "not found.\n"
-                        f"Expected file path: {str(out_path)}.\n"
-                        "Here are some possible reasons:\n"
-                        "1. The SLURM job was scancel-ed, either by the user "
-                        "or due to an error (e.g. an out-of-memory or timeout "
-                        "error). Note that if the scancel took place before "
-                        "the job started running, the SLURM out/err files "
-                        "will be empty.\n"
-                        "2. Some error occurred upon writing the file to disk "
-                        "(e.g. because there is not enough space on disk, or "
-                        "due to an overloaded NFS filesystem). "
-                        "Note that the server configuration has "
-                        "FRACTAL_SLURM_ERROR_HANDLING_INTERVAL="
-                        f"{settings.FRACTAL_SLURM_ERROR_HANDLING_INTERVAL} "
-                        "seconds.\n"
-                    )
-                    job_exc = self._prepare_JobExecutionError(jobid, info=info)
-                    try:
-                        fut.set_exception(job_exc)
-                        return
-                    except InvalidStateError:
-                        logger.warning(
-                            f"Future {fut} (SLURM job ID: {jobid}) was already"
-                            " cancelled, exit from"
-                            " FractalSlurmExecutor._completion."
-                        )
-                        in_path.unlink()
-                        self._cleanup(jobid)
-                        return
-
-                # Read the task output (note: we now know that out_path exists)
-                with out_path.open("rb") as f:
-                    outdata = f.read()
-                # Note: output can be either the task result (typically a
-                # dictionary) or an ExceptionProxy object; in the latter
-                # case, the ExceptionProxy definition is also part of the
-                # pickle file (thanks to cloudpickle.dumps).
-                success, output = cloudpickle.loads(outdata)
-                try:
-                    if success:
-                        outputs.append(output)
-                    else:
-                        proxy = output
-                        if proxy.exc_type_name == "JobExecutionError":
-                            job_exc = self._prepare_JobExecutionError(
-                                jobid, info=proxy.kwargs.get("info", None)
-                            )
-                            fut.set_exception(job_exc)
-                            return
-                        else:
-                            # This branch catches both TaskExecutionError's
-                            # (coming from the typical fractal-server
-                            # execution of tasks, and with additional
-                            # fractal-specific kwargs) or arbitrary
-                            # exceptions (coming from a direct use of
-                            # FractalSlurmExecutor, possibly outside
-                            # fractal-server)
-                            kwargs = {}
-                            for key in [
-                                "workflow_task_id",
-                                "workflow_task_order",
-                                "task_name",
-                            ]:
-                                if key in proxy.kwargs.keys():
-                                    kwargs[key] = proxy.kwargs[key]
-                            exc = TaskExecutionError(proxy.tb, **kwargs)
-                            fut.set_exception(exc)
-                            return
-                    out_path.unlink()
-                except InvalidStateError:
-                    logger.warning(
-                        f"Future {fut} (SLURM job ID: {jobid}) was already"
-                        " cancelled, exit from"
-                        " FractalSlurmExecutor._completion."
-                    )
-                    out_path.unlink()
-                    in_path.unlink()
-                    self._cleanup(jobid)
-                    return
-
-                # Clean up input pickle file
-                in_path.unlink()
-            self._cleanup(jobid)
-            if job.single_task_submission:
-                fut.set_result(outputs[0])
-            else:
-                fut.set_result(outputs)
-            return
-
-        except Exception as e:
-            try:
-                fut.set_exception(e)
-                return
-            except InvalidStateError:
-                logger.warning(
-                    f"Future {fut} (SLURM job ID: {jobid}) was already"
-                    " cancelled, exit from"
-                    " FractalSlurmExecutor._completion."
-                )
-
-    def _copy_files_from_remote_to_local(
-        self,
-        job: SlurmJob,
-    ):
-        """
-        Impersonate the user and copy task-related files
-
-        For all files in `self.workflow_dir_remote` that start with
-        `job.file_prefix`, read them (with `sudo -u` impersonation) and write
-        them to `self.workflow_dir_local`.
-
-        Files to copy:
-        * Job-related files (SLURM stderr/stdout files); with prefix
-          `job.slurm_file_prefix`;
-        * Task-related files (stderr/stdout, args.json, metadiff.json, output
-          pickle), with prefixes `job.wftask_file_prefixes`.
-
-        Arguments:
-            job:
-                `SlurmJob` object (needed for its prefixes-related attributes).
-
-        Raises:
-            JobExecutionError: If a `cat` command fails.
-        """
-        logger.debug("[_copy_files_from_remote_to_local] Start")
-
-        if self.workflow_dir_remote == self.workflow_dir_local:
-            logger.debug(
-                "[_copy_files_from_remote_to_local] "
-                "workflow_dir_local corresponds to workflow_dir_remote, "
-                "return."
-            )
-            return
-
-        subfolder_name = job.wftask_subfolder_name
-        prefixes = set(
-            [job.slurm_file_prefix] + list(job.wftask_file_prefixes)
-        )
-
-        logger.debug(
-            "[_copy_files_from_remote_to_local] "
-            f"WorkflowTask subfolder_name: {subfolder_name}"
-        )
-        logger.debug(f"[_copy_files_from_remote_to_local] {prefixes=}")
-        logger.debug(
-            "[_copy_files_from_remote_to_local] "
-            f"{str(self.workflow_dir_remote)=}"
-        )
-
-        for prefix in prefixes:
-            if prefix == job.slurm_file_prefix:
-                files_to_copy = _glob_as_user(
-                    folder=str(self.workflow_dir_remote / subfolder_name),
-                    user=self.slurm_user,
-                    startswith=prefix,
-                )
-            else:
-                files_to_copy = _glob_as_user_strict(
-                    folder=str(self.workflow_dir_remote / subfolder_name),
-                    user=self.slurm_user,
-                    startswith=prefix,
-                )
-
-            logger.debug(
-                "[_copy_files_from_remote_to_local] "
-                f"{prefix=}, {len(files_to_copy)=}"
-            )
-
-            for source_file_name in files_to_copy:
-                if " " in source_file_name:
-                    raise ValueError(
-                        f'source_file_name="{source_file_name}" '
-                        "contains whitespaces"
-                    )
-                source_file_path = str(
-                    self.workflow_dir_remote
-                    / subfolder_name
-                    / source_file_name
-                )
-
-                # Read source_file_path (requires sudo)
-                # NOTE: By setting encoding=None, we read/write bytes instead
-                # of strings; this is needed to also handle pickle files.
-                cmd = f"cat {source_file_path}"
-                res = _run_command_as_user(
-                    cmd=cmd, user=self.slurm_user, encoding=None
-                )
-                if res.returncode != 0:
-                    info = (
-                        f'Running cmd="{cmd}" as {self.slurm_user=} failed\n\n'
-                        f"{res.returncode=}\n\n"
-                        f"{res.stdout=}\n\n{res.stderr=}\n"
-                    )
-                    logger.error(info)
-                    raise JobExecutionError(info)
-                # Write to dest_file_path (including empty files)
-                dest_file_path = str(
-                    self.workflow_dir_local / subfolder_name / source_file_name
-                )
-                with open(dest_file_path, "wb") as f:
-                    f.write(res.stdout)
-        logger.debug("[_copy_files_from_remote_to_local] End")
-
-    def _start(
-        self,
-        job: SlurmJob,
-    ) -> tuple[str, SlurmJob]:
-        """
-        Submit function for execution on a SLURM cluster
-        """
-
-        # Prepare commands to be included in SLURM submission script
-        settings = Inject(get_settings)
-        python_worker_interpreter = (
-            settings.FRACTAL_SLURM_WORKER_PYTHON or sys.executable
-        )
-
-        cmdlines = []
-        for ind_task in range(job.num_tasks_tot):
-            input_pickle_file = job.input_pickle_files[ind_task]
-            output_pickle_file = job.output_pickle_files[ind_task]
-            cmdlines.append(
-                (
-                    f"{python_worker_interpreter}"
-                    " -m fractal_server.app.runner.executors.slurm.remote "
-                    f"--input-file {input_pickle_file} "
-                    f"--output-file {output_pickle_file}"
-                )
-            )
-
-        # ...
-        sbatch_script = self._prepare_sbatch_script(
-            slurm_config=job.slurm_config,
-            list_commands=cmdlines,
-            slurm_out_path=str(job.slurm_stdout),
-            slurm_err_path=str(job.slurm_stderr),
-        )
-
-        # Print warning for ignored parameter
-        if len(job.slurm_config.pre_submission_commands) > 0:
-            logger.warning(
-                f"Ignoring {job.slurm_config.pre_submission_commands=}."
-            )
-
-        # Submit job via sbatch, and retrieve jobid
-
-        # Write script content to a job.slurm_script
-        with job.slurm_script.open("w") as f:
-            f.write(sbatch_script)
-
-        # Prepare submission command
-        pre_command = f"sudo --set-home --non-interactive -u {self.slurm_user}"
-        submit_command = f"sbatch --parsable {job.slurm_script}"
-        full_command = f"{pre_command} {submit_command}"
-
-        # Submit SLURM job and retrieve job ID
-        output = _subprocess_run_or_raise(full_command)
-        try:
-            jobid = int(output.stdout)
-        except ValueError as e:
-            error_msg = (
-                f"Submit command `{full_command}` returned "
-                f"`{output.stdout=}` which cannot be cast to an integer "
-                f"SLURM-job ID. Original error:\n{str(e)}"
-            )
-            logger.error(error_msg)
-            raise JobExecutionError(info=error_msg)
-        jobid_str = str(jobid)
-
-        # Plug SLURM job id in stdout/stderr file paths
-        job.slurm_stdout = Path(
-            job.slurm_stdout.as_posix().replace("%j", jobid_str)
-        )
-        job.slurm_stderr = Path(
-            job.slurm_stderr.as_posix().replace("%j", jobid_str)
-        )
-
-        return jobid_str, job
-
-    def _prepare_sbatch_script(
-        self,
-        *,
-        list_commands: list[str],
-        slurm_out_path: str,
-        slurm_err_path: str,
-        slurm_config: SlurmConfig,
-    ):
-        num_tasks_max_running = slurm_config.parallel_tasks_per_job
-        mem_per_task_MB = slurm_config.mem_per_task_MB
-
-        # Set ntasks
-        ntasks = min(len(list_commands), num_tasks_max_running)
-        if len(list_commands) < num_tasks_max_running:
-            ntasks = len(list_commands)
-            slurm_config.parallel_tasks_per_job = ntasks
-            logger.debug(
-                f"{len(list_commands)=} is smaller than "
-                f"{num_tasks_max_running=}. Setting {ntasks=}."
-            )
-
-        # Prepare SLURM preamble based on SlurmConfig object
-        script_lines = slurm_config.to_sbatch_preamble(
-            remote_export_dir=self.user_cache_dir
-        )
-
-        # Extend SLURM preamble with variable which are not in SlurmConfig, and
-        # fix their order
-        script_lines.extend(
-            [
-                f"#SBATCH --err={slurm_err_path}",
-                f"#SBATCH --out={slurm_out_path}",
-                f"#SBATCH -D {self.workflow_dir_remote}",
-            ]
-        )
-        script_lines = slurm_config.sort_script_lines(script_lines)
-        logger.debug(script_lines)
-
-        # Always print output of `uname -n` and `pwd`
-        script_lines.append(
-            'echo "Hostname: `uname -n`; current directory: `pwd`"\n'
-        )
-
-        # Complete script preamble
-        script_lines.append("\n")
-
-        # Include command lines
-        tmp_list_commands = copy(list_commands)
-        while tmp_list_commands:
-            if tmp_list_commands:
-                cmd = tmp_list_commands.pop(0)  # take first element
-                script_lines.append(
-                    "srun --ntasks=1 --cpus-per-task=$SLURM_CPUS_PER_TASK "
-                    f"--mem={mem_per_task_MB}MB "
-                    f"{cmd} &"
-                )
-        script_lines.append("wait\n")
-
-        script = "\n".join(script_lines)
-        return script
-
-    def shutdown(self, wait=True, *, cancel_futures=False):
-        """
-        Clean up all executor variables. Note that this function is executed on
-        the self.wait_thread thread, see _completion.
-        """
-
-        logger.debug("Executor shutdown: start")
-
-        # Handle all job futures
-        slurm_jobs_to_scancel = []
-        with self.jobs_lock:
-            while self.jobs:
-                jobid, fut_and_job = self.jobs.popitem()
-                slurm_jobs_to_scancel.append(jobid)
-                fut = fut_and_job[0]
-                self.map_jobid_to_slurm_files.pop(jobid)
-                if not fut.cancelled():
-                    fut.set_exception(
-                        JobExecutionError(
-                            "Job cancelled due to executor shutdown."
-                        )
-                    )
-                    fut.cancel()
-
-        # Cancel SLURM jobs
-        if slurm_jobs_to_scancel:
-            scancel_string = " ".join(slurm_jobs_to_scancel)
-            logger.warning(f"Now scancel-ing SLURM jobs {scancel_string}")
-            pre_command = f"sudo --non-interactive -u {self.slurm_user}"
-            submit_command = f"scancel {scancel_string}"
-            full_command = f"{pre_command} {submit_command}"
-            validate_cmd(full_command)
-            logger.debug(f"Now execute `{full_command}`")
-            try:
-                subprocess.run(  # nosec
-                    shlex.split(full_command),
-                    capture_output=True,
-                    check=True,
-                    encoding="utf-8",
-                )
-            except subprocess.CalledProcessError as e:
-                error_msg = (
-                    f"Cancel command `{full_command}` failed. "
-                    f"Original error:\n{str(e)}"
-                )
-                logger.error(error_msg)
-                raise JobExecutionError(info=error_msg)
-
-        # Redundantly set thread shutdown attribute to True
-        self.wait_thread.shutdown = True
-
-        logger.debug("Executor shutdown: end")
-
-    def _stop_and_join_wait_thread(self):
-        self.wait_thread.shutdown = True
-        self.wait_thread.join()
-
-    def __exit__(self, *args, **kwargs):
-        """
-        See
-        https://github.com/fractal-analytics-platform/fractal-server/issues/1508
-        """
-        logger.debug(
-            "[FractalSlurmExecutor.__exit__] Stop and join `wait_thread`"
-        )
-        self._stop_and_join_wait_thread()
-        logger.debug("[FractalSlurmExecutor.__exit__] End")
-
-    def check_remote_python_interpreter(self):
-        """
-        Check fractal-server version on the _remote_ Python interpreter.
-        """
-        settings = Inject(get_settings)
-        output = _subprocess_run_or_raise(
-            (
-                f"{settings.FRACTAL_SLURM_WORKER_PYTHON} "
-                "-m fractal_server.app.runner.versions"
-            )
-        )
-        runner_version = json.loads(output.stdout.strip("\n"))[
-            "fractal_server"
-        ]
-
-        if runner_version != __VERSION__:
-            error_msg = (
-                "Fractal-server version mismatch.\n"
-                "Local interpreter: "
-                f"({sys.executable}): {__VERSION__}.\n"
-                "Remote interpreter: "
-                f"({settings.FRACTAL_SLURM_WORKER_PYTHON}): {runner_version}."
-            )
-            logger.error(error_msg)
-            raise ValueError(error_msg)
```