fractal-server 2.2.0a0__py3-none-any.whl → 2.3.0a0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fractal_server/__init__.py +1 -1
- fractal_server/app/db/__init__.py +1 -1
- fractal_server/app/models/v1/state.py +1 -2
- fractal_server/app/routes/admin/v1.py +2 -2
- fractal_server/app/routes/admin/v2.py +2 -2
- fractal_server/app/routes/api/v1/job.py +2 -2
- fractal_server/app/routes/api/v1/task_collection.py +4 -4
- fractal_server/app/routes/api/v2/__init__.py +23 -3
- fractal_server/app/routes/api/v2/job.py +2 -2
- fractal_server/app/routes/api/v2/submit.py +6 -0
- fractal_server/app/routes/api/v2/task_collection.py +74 -34
- fractal_server/app/routes/api/v2/task_collection_custom.py +144 -0
- fractal_server/app/routes/api/v2/task_collection_ssh.py +125 -0
- fractal_server/app/routes/aux/_runner.py +10 -2
- fractal_server/app/runner/compress_folder.py +120 -0
- fractal_server/app/runner/executors/slurm/__init__.py +0 -3
- fractal_server/app/runner/executors/slurm/_batching.py +0 -1
- fractal_server/app/runner/executors/slurm/_slurm_config.py +9 -9
- fractal_server/app/runner/executors/slurm/ssh/__init__.py +3 -0
- fractal_server/app/runner/executors/slurm/ssh/_executor_wait_thread.py +112 -0
- fractal_server/app/runner/executors/slurm/ssh/_slurm_job.py +120 -0
- fractal_server/app/runner/executors/slurm/ssh/executor.py +1490 -0
- fractal_server/app/runner/executors/slurm/sudo/__init__.py +3 -0
- fractal_server/app/runner/executors/slurm/{_check_jobs_status.py → sudo/_check_jobs_status.py} +1 -1
- fractal_server/app/runner/executors/slurm/{_executor_wait_thread.py → sudo/_executor_wait_thread.py} +1 -1
- fractal_server/app/runner/executors/slurm/{_subprocess_run_as_user.py → sudo/_subprocess_run_as_user.py} +1 -1
- fractal_server/app/runner/executors/slurm/{executor.py → sudo/executor.py} +12 -12
- fractal_server/app/runner/extract_archive.py +38 -0
- fractal_server/app/runner/v1/__init__.py +78 -40
- fractal_server/app/runner/v1/_slurm/__init__.py +1 -1
- fractal_server/app/runner/v2/__init__.py +183 -82
- fractal_server/app/runner/v2/_local_experimental/__init__.py +22 -12
- fractal_server/app/runner/v2/_local_experimental/executor.py +12 -8
- fractal_server/app/runner/v2/_slurm/__init__.py +1 -6
- fractal_server/app/runner/v2/_slurm_ssh/__init__.py +126 -0
- fractal_server/app/runner/v2/_slurm_ssh/_submit_setup.py +83 -0
- fractal_server/app/runner/v2/_slurm_ssh/get_slurm_config.py +182 -0
- fractal_server/app/runner/v2/runner_functions_low_level.py +9 -11
- fractal_server/app/runner/versions.py +30 -0
- fractal_server/app/schemas/v1/__init__.py +1 -0
- fractal_server/app/schemas/{state.py → v1/state.py} +4 -21
- fractal_server/app/schemas/v2/__init__.py +4 -1
- fractal_server/app/schemas/v2/task_collection.py +97 -27
- fractal_server/config.py +222 -21
- fractal_server/main.py +25 -1
- fractal_server/migrations/env.py +1 -1
- fractal_server/ssh/__init__.py +4 -0
- fractal_server/ssh/_fabric.py +190 -0
- fractal_server/tasks/utils.py +12 -64
- fractal_server/tasks/v1/background_operations.py +2 -2
- fractal_server/tasks/{endpoint_operations.py → v1/endpoint_operations.py} +7 -12
- fractal_server/tasks/v1/utils.py +67 -0
- fractal_server/tasks/v2/_TaskCollectPip.py +61 -32
- fractal_server/tasks/v2/_venv_pip.py +195 -0
- fractal_server/tasks/v2/background_operations.py +257 -295
- fractal_server/tasks/v2/background_operations_ssh.py +304 -0
- fractal_server/tasks/v2/endpoint_operations.py +136 -0
- fractal_server/tasks/v2/templates/_1_create_venv.sh +46 -0
- fractal_server/tasks/v2/templates/_2_upgrade_pip.sh +30 -0
- fractal_server/tasks/v2/templates/_3_pip_install.sh +32 -0
- fractal_server/tasks/v2/templates/_4_pip_freeze.sh +21 -0
- fractal_server/tasks/v2/templates/_5_pip_show.sh +59 -0
- fractal_server/tasks/v2/utils.py +54 -0
- {fractal_server-2.2.0a0.dist-info → fractal_server-2.3.0a0.dist-info}/METADATA +6 -2
- {fractal_server-2.2.0a0.dist-info → fractal_server-2.3.0a0.dist-info}/RECORD +68 -44
- fractal_server/tasks/v2/get_collection_data.py +0 -14
- {fractal_server-2.2.0a0.dist-info → fractal_server-2.3.0a0.dist-info}/LICENSE +0 -0
- {fractal_server-2.2.0a0.dist-info → fractal_server-2.3.0a0.dist-info}/WHEEL +0 -0
- {fractal_server-2.2.0a0.dist-info → fractal_server-2.3.0a0.dist-info}/entry_points.txt +0 -0
@@ -0,0 +1,1490 @@
+# This adapts clusterfutures <https://github.com/sampsyo/clusterfutures>
+# Original Copyright
+# Copyright 2021 Adrian Sampson <asampson@cs.washington.edu>
+# License: MIT
+#
+# Modified by:
+# Jacopo Nespolo <jacopo.nespolo@exact-lab.it>
+# Tommaso Comparin <tommaso.comparin@exact-lab.it>
+# Marco Franzon <marco.franzon@exact-lab.it>
+#
+# Copyright 2022 (C) Friedrich Miescher Institute for Biomedical Research and
+# University of Zurich
+import json
+import math
+import sys
+import tarfile
+import threading
+import time
+from concurrent.futures import Future
+from concurrent.futures import InvalidStateError
+from copy import copy
+from pathlib import Path
+from typing import Any
+from typing import Callable
+from typing import Optional
+from typing import Sequence
+
+import cloudpickle
+from cfut import SlurmExecutor
+from fabric.connection import Connection
+from paramiko.ssh_exception import NoValidConnectionsError
+
+from ....filenames import SHUTDOWN_FILENAME
+from ....task_files import get_task_file_paths
+from ....task_files import TaskFiles
+from ....versions import get_versions
+from ...slurm._slurm_config import get_default_slurm_config
+from ...slurm._slurm_config import SlurmConfig
+from .._batching import heuristics
+from ._executor_wait_thread import FractalSlurmWaitThread
+from fractal_server.app.runner.components import _COMPONENT_KEY_
+from fractal_server.app.runner.exceptions import JobExecutionError
+from fractal_server.app.runner.exceptions import TaskExecutionError
+from fractal_server.app.runner.executors.slurm.ssh._slurm_job import SlurmJob
+from fractal_server.config import get_settings
+from fractal_server.logger import set_logger
+from fractal_server.ssh._fabric import check_connection
+from fractal_server.ssh._fabric import run_command_over_ssh
+from fractal_server.syringe import Inject
+
+logger = set_logger(__name__)
+
+
+class FractalSlurmSSHExecutor(SlurmExecutor):
+    """
+    FractalSlurmSSHExecutor (inherits from cfut.SlurmExecutor)
+
+    FIXME: docstring
+
+    Attributes:
+        connection: SSH connection
+        shutdown_file:
+        python_remote: Equal to `settings.FRACTAL_SLURM_WORKER_PYTHON`
+        wait_thread_cls: Class for waiting thread
+        keep_pickle_files:
+        workflow_dir_local:
+            Directory for both the cfut/SLURM and fractal-server files and logs
+        workflow_dir_remote:
+            Directory for both the cfut/SLURM and fractal-server files and logs
+        common_script_lines:
+            Arbitrary script lines that will always be included in the
+            sbatch script
+        slurm_account:
+        jobs:
+        map_jobid_to_slurm_files:
+            Dictionary with paths of slurm-related files for active jobs
+    """
+
+    connection: Connection
+
+    workflow_dir_local: Path
+    workflow_dir_remote: Path
+    shutdown_file: str
+    python_remote: str
+
+    wait_thread_cls = FractalSlurmWaitThread
+    keep_pickle_files: bool
+
+    common_script_lines: list[str]
+    slurm_account: Optional[str]
+
+    jobs: dict[str, tuple[Future, SlurmJob]]
+    map_jobid_to_slurm_files_local: dict[str, tuple[str, str, str]]
+
+    def __init__(
+        self,
+        *,
+        # SSH connection
+        connection: Connection,
+        # Folders and files
+        workflow_dir_local: Path,
+        workflow_dir_remote: Path,
+        # Runner options
+        keep_pickle_files: bool = False,
+        # Monitoring options
+        slurm_poll_interval: Optional[int] = None,
+        # SLURM submission script options
+        common_script_lines: Optional[list[str]] = None,
+        slurm_account: Optional[str] = None,
+        # Other kwargs are ignored
+        **kwargs,
+    ):
+        """
+        Init method for FractalSlurmSSHExecutor
+
+        Note: since we are not using `super().__init__`, we duplicate some
+        relevant bits of `cfut.ClusterExecutor.__init__`.
+
+        Args:
+            connection:
+            workflow_dir_local:
+            workflow_dir_remote:
+            keep_pickle_files:
+            slurm_poll_interval:
+            common_script_lines:
+            slurm_account:
+        """
+
+        if kwargs != {}:
+            raise ValueError(
+                f"FractalSlurmSSHExecutor received unexpected {kwargs=}"
+            )
+
+        self.workflow_dir_local = workflow_dir_local
+        self.workflow_dir_remote = workflow_dir_remote
+
+        # Relevant bits of cfut.ClusterExecutor.__init__ are copied here,
+        # postponing the .start() call to when the callbacks are defined
+        self.jobs = {}
+        self.job_outfiles = {}
+        self.jobs_lock = threading.Lock()
+        self.jobs_empty_cond = threading.Condition(self.jobs_lock)
+        self.wait_thread = self.wait_thread_cls(self._completion)
+
+        # Set up attributes and methods for self.wait_thread
+        # cfut.SlurmWaitThread)
+        self.wait_thread.shutdown_callback = self.shutdown
+        self.wait_thread.jobs_finished_callback = self._jobs_finished
+        if slurm_poll_interval is None:
+            settings = Inject(get_settings)
+            slurm_poll_interval = settings.FRACTAL_SLURM_POLL_INTERVAL
+        elif slurm_poll_interval <= 0:
+            raise ValueError(f"Invalid attribute {slurm_poll_interval=}")
+        self.wait_thread.slurm_poll_interval = slurm_poll_interval
+        self.wait_thread.shutdown_file = (
+            self.workflow_dir_local / SHUTDOWN_FILENAME
+        ).as_posix()
+
+        # Now start self.wait_thread (note: this must be *after* its callback
+        # methods have been defined)
+        self.wait_thread.start()
+
+        # Define remote Python interpreter
+        settings = Inject(get_settings)
+        self.python_remote = settings.FRACTAL_SLURM_WORKER_PYTHON
+        if self.python_remote is None:
+            raise ValueError("FRACTAL_SLURM_WORKER_PYTHON is not set. Exit.")
+
+        # Initialize connection and perform handshake
+        self.connection = connection
+        logger.warning(self.connection)
+        self.handshake()
+
+        # Set/validate parameters for SLURM submission scripts
+        self.slurm_account = slurm_account
+        self.common_script_lines = common_script_lines or []
+        self._validate_common_script_lines()
+
+        # Set/initialize some more options
+        self.keep_pickle_files = keep_pickle_files
+        self.map_jobid_to_slurm_files_local = {}
+
+    def _validate_common_script_lines(self):
+        """
+        Check that SLURM account is not set in `self.common_script_lines`.
+        """
+        try:
+            invalid_line = next(
+                line
+                for line in self.common_script_lines
+                if line.startswith("#SBATCH --account=")
+            )
+            raise RuntimeError(
+                "Invalid line in `FractalSlurmSSHExecutor."
+                "common_script_lines`: "
+                f"'{invalid_line}'.\n"
+                "SLURM account must be set via the request body of the "
+                "apply-workflow endpoint, or by modifying the user properties."
+            )
+        except StopIteration:
+            pass
+
+    def _cleanup(self, jobid: str) -> None:
+        """
+        Given a job ID, perform any necessary cleanup after the job has
+        finished.
+        """
+        with self.jobs_lock:
+            self.map_jobid_to_slurm_files_local.pop(jobid)
+
+    def get_input_pickle_file_path_local(
+        self, *, arg: str, subfolder_name: str, prefix: Optional[str] = None
+    ) -> Path:
+
+        prefix = prefix or "cfut"
+        output = (
+            self.workflow_dir_local
+            / subfolder_name
+            / f"{prefix}_in_{arg}.pickle"
+        )
+        return output
+
+    def get_input_pickle_file_path_remote(
+        self, *, arg: str, subfolder_name: str, prefix: Optional[str] = None
+    ) -> Path:
+
+        prefix = prefix or "cfut"
+        output = (
+            self.workflow_dir_remote
+            / subfolder_name
+            / f"{prefix}_in_{arg}.pickle"
+        )
+        return output
+
+    def get_output_pickle_file_path_local(
+        self, *, arg: str, subfolder_name: str, prefix: Optional[str] = None
+    ) -> Path:
+        prefix = prefix or "cfut"
+        return (
+            self.workflow_dir_local
+            / subfolder_name
+            / f"{prefix}_out_{arg}.pickle"
+        )
+
+    def get_output_pickle_file_path_remote(
+        self, *, arg: str, subfolder_name: str, prefix: Optional[str] = None
+    ) -> Path:
+        prefix = prefix or "cfut"
+        return (
+            self.workflow_dir_remote
+            / subfolder_name
+            / f"{prefix}_out_{arg}.pickle"
+        )
+
+    def get_slurm_script_file_path_local(
+        self, *, subfolder_name: str, prefix: Optional[str] = None
+    ) -> Path:
+        prefix = prefix or "_temp"
+        return (
+            self.workflow_dir_local
+            / subfolder_name
+            / f"{prefix}_slurm_submit.sbatch"
+        )
+
+    def get_slurm_script_file_path_remote(
+        self, *, subfolder_name: str, prefix: Optional[str] = None
+    ) -> Path:
+        prefix = prefix or "_temp"
+        return (
+            self.workflow_dir_remote
+            / subfolder_name
+            / f"{prefix}_slurm_submit.sbatch"
+        )
+
+    def get_slurm_stdout_file_path_local(
+        self,
+        *,
+        subfolder_name: str,
+        arg: str = "%j",
+        prefix: Optional[str] = None,
+    ) -> Path:
+        prefix = prefix or "slurmpy.stdout"
+        return (
+            self.workflow_dir_local
+            / subfolder_name
+            / f"{prefix}_slurm_{arg}.out"
+        )
+
+    def get_slurm_stdout_file_path_remote(
+        self,
+        *,
+        subfolder_name: str,
+        arg: str = "%j",
+        prefix: Optional[str] = None,
+    ) -> Path:
+        prefix = prefix or "slurmpy.stdout"
+        return (
+            self.workflow_dir_remote
+            / subfolder_name
+            / f"{prefix}_slurm_{arg}.out"
+        )
+
+    def get_slurm_stderr_file_path_local(
+        self,
+        *,
+        subfolder_name: str,
+        arg: str = "%j",
+        prefix: Optional[str] = None,
+    ) -> Path:
+        prefix = prefix or "slurmpy.stderr"
+        return (
+            self.workflow_dir_local
+            / subfolder_name
+            / f"{prefix}_slurm_{arg}.err"
+        )
+
+    def get_slurm_stderr_file_path_remote(
+        self,
+        *,
+        subfolder_name: str,
+        arg: str = "%j",
+        prefix: Optional[str] = None,
+    ) -> Path:
+        prefix = prefix or "slurmpy.stderr"
+        return (
+            self.workflow_dir_remote
+            / subfolder_name
+            / f"{prefix}_slurm_{arg}.err"
+        )
+
+    def submit(
+        self,
+        fun: Callable[..., Any],
+        *fun_args: Sequence[Any],
+        slurm_config: Optional[SlurmConfig] = None,
+        task_files: Optional[TaskFiles] = None,
+        **fun_kwargs: dict,
+    ) -> Future:
+        """
+        Submit a function for execution on `FractalSlurmSSHExecutor`
+
+        Arguments:
+            fun: The function to be executed
+            fun_args: Function positional arguments
+            fun_kwargs: Function keyword arguments
+            slurm_config:
+                A `SlurmConfig` object; if `None`, use
+                `get_default_slurm_config()`.
+            task_files:
+                A `TaskFiles` object; if `None`, use
+                `self.get_default_task_files()`.
+
+        Returns:
+            Future representing the execution of the current SLURM job.
+        """
+
+        # Set defaults, if needed
+        if slurm_config is None:
+            slurm_config = get_default_slurm_config()
+        if task_files is None:
+            task_files = self.get_default_task_files()
+
+        # Set slurm_file_prefix
+        slurm_file_prefix = task_files.file_prefix
+
+        # Include common_script_lines in extra_lines
+        logger.debug(
+            f"Adding {self.common_script_lines=} to "
+            f"{slurm_config.extra_lines=}, from submit method."
+        )
+        current_extra_lines = slurm_config.extra_lines or []
+        slurm_config.extra_lines = (
+            current_extra_lines + self.common_script_lines
+        )
+
+        # Adapt slurm_config to the fact that this is a single-task SlurmJob
+        # instance
+        slurm_config.tasks_per_job = 1
+        slurm_config.parallel_tasks_per_job = 1
+
+        job = self._prepare_job(
+            fun,
+            slurm_config=slurm_config,
+            slurm_file_prefix=slurm_file_prefix,
+            task_files=task_files,
+            single_task_submission=True,
+            args=fun_args,
+            kwargs=fun_kwargs,
+        )
+        try:
+            self._put_subfolder_sftp(jobs=[job])
+        except NoValidConnectionsError as e:
+            logger.error("NoValidConnectionError")
+            logger.error(f"{str(e)=}")
+            logger.error(f"{e.errors=}")
+            for err in e.errors:
+                logger.error(f"{str(err)}")
+            raise e
+        future, job_id_str = self._submit_job(job)
+        self.wait_thread.wait(job_id=job_id_str)
+        return future
+
+    def map(
+        self,
+        fn: Callable[..., Any],
+        iterable: list[Sequence[Any]],
+        *,
+        slurm_config: Optional[SlurmConfig] = None,
+        task_files: Optional[TaskFiles] = None,
+    ):
+        """
+        Return an iterator with the results of several execution of a function
+
+        This function is based on `concurrent.futures.Executor.map` from Python
+        Standard Library 3.11.
+        Original Copyright 2009 Brian Quinlan. All Rights Reserved. Licensed to
+        PSF under a Contributor Agreement.
+
+        Main modifications from the PSF function:
+
+        1. Only `fn` and `iterable` can be assigned as positional arguments;
+        2. `*iterables` argument replaced with a single `iterable`;
+        3. `timeout` and `chunksize` arguments are not supported.
+
+        Arguments:
+            fn:
+                The function to be executed
+            iterable:
+                An iterable such that each element is the list of arguments to
+                be passed to `fn`, as in `fn(*args)`.
+            slurm_config:
+                A `SlurmConfig` object; if `None`, use
+                `get_default_slurm_config()`.
+            task_files:
+                A `TaskFiles` object; if `None`, use
+                `self.get_default_task_files()`.
+
+        """
+
+        def _result_or_cancel(fut):
+            """
+            This function is based on the Python Standard Library 3.11.
+            Original Copyright 2009 Brian Quinlan. All Rights Reserved.
+            Licensed to PSF under a Contributor Agreement.
+            """
+            try:
+                try:
+                    return fut.result()
+                finally:
+                    fut.cancel()
+            finally:
+                # Break a reference cycle with the exception in
+                # self._exception
+                del fut
+
+        # Set defaults, if needed
+        if not slurm_config:
+            slurm_config = get_default_slurm_config()
+        if task_files is None:
+            task_files = self.get_default_task_files()
+
+        # Include common_script_lines in extra_lines
+        logger.debug(
+            f"Adding {self.common_script_lines=} to "
+            f"{slurm_config.extra_lines=}, from map method."
+        )
+        current_extra_lines = slurm_config.extra_lines or []
+        slurm_config.extra_lines = (
+            current_extra_lines + self.common_script_lines
+        )
+
+        # Set file prefixes
+        general_slurm_file_prefix = str(task_files.task_order)
+
+        # Transform iterable into a list and count its elements
+        list_args = list(iterable)
+        tot_tasks = len(list_args)
+
+        # Set/validate parameters for task batching
+        tasks_per_job, parallel_tasks_per_job = heuristics(
+            # Number of parallel components (always known)
+            tot_tasks=len(list_args),
+            # Optional WorkflowTask attributes:
+            tasks_per_job=slurm_config.tasks_per_job,
+            parallel_tasks_per_job=slurm_config.parallel_tasks_per_job,  # noqa
+            # Task requirements (multiple possible sources):
+            cpus_per_task=slurm_config.cpus_per_task,
+            mem_per_task=slurm_config.mem_per_task_MB,
+            # Fractal configuration variables (soft/hard limits):
+            target_cpus_per_job=slurm_config.target_cpus_per_job,
+            target_mem_per_job=slurm_config.target_mem_per_job,
+            target_num_jobs=slurm_config.target_num_jobs,
+            max_cpus_per_job=slurm_config.max_cpus_per_job,
+            max_mem_per_job=slurm_config.max_mem_per_job,
+            max_num_jobs=slurm_config.max_num_jobs,
+        )
+        slurm_config.parallel_tasks_per_job = parallel_tasks_per_job
+        slurm_config.tasks_per_job = tasks_per_job
+
+        # Divide arguments in batches of `n_tasks_per_script` tasks each
+        args_batches = []
+        batch_size = tasks_per_job
+        for ind_chunk in range(0, tot_tasks, batch_size):
+            args_batches.append(
+                list_args[ind_chunk : ind_chunk + batch_size]  # noqa
+            )
+        if len(args_batches) != math.ceil(tot_tasks / tasks_per_job):
+            raise RuntimeError("Something wrong here while batching tasks")
+
+        # Fetch configuration variable
+        settings = Inject(get_settings)
+        FRACTAL_SLURM_SBATCH_SLEEP = settings.FRACTAL_SLURM_SBATCH_SLEEP
+
+        logger.debug("[map] Job preparation - START")
+        current_component_index = 0
+        jobs_to_submit = []
+        for ind_batch, batch in enumerate(args_batches):
+            batch_size = len(batch)
+            this_slurm_file_prefix = (
+                f"{general_slurm_file_prefix}_batch_{ind_batch:06d}"
+            )
+            new_job_to_submit = self._prepare_job(
+                fn,
+                slurm_config=slurm_config,
+                slurm_file_prefix=this_slurm_file_prefix,
+                task_files=task_files,
+                single_task_submission=False,
+                components=batch,
+            )
+            jobs_to_submit.append(new_job_to_submit)
+            current_component_index += batch_size
+        logger.debug("[map] Job preparation - END")
+
+        try:
+            self._put_subfolder_sftp(jobs=jobs_to_submit)
+        except NoValidConnectionsError as e:
+            logger.error("NoValidConnectionError")
+            logger.error(f"{str(e)=}")
+            logger.error(f"{e.errors=}")
+            for err in e.errors:
+                logger.error(f"{str(err)}")
+
+            raise e
+
+        # Construct list of futures (one per SLURM job, i.e. one per batch)
+        # FIXME SSH: we may create a single `_submit_many_jobs` method to
+        # reduce the number of commands run over SSH
+        logger.debug("[map] Job submission - START")
+        fs = []
+        job_ids = []
+        for job in jobs_to_submit:
+            future, job_id = self._submit_job(job)
+            job_ids.append(job_id)
+            fs.append(future)
+            time.sleep(FRACTAL_SLURM_SBATCH_SLEEP)
+        for job_id in job_ids:
+            self.wait_thread.wait(job_id=job_id)
+        logger.debug("[map] Job submission - END")
+
+        # Yield must be hidden in closure so that the futures are submitted
+        # before the first iterator value is required.
+        # NOTE: In this custom map() method, _result_or_cancel(fs.pop()) is an
+        # iterable of results (if successful), and we should yield its elements
+        # rather than the whole iterable.
+        def result_iterator():
+            """
+            This function is based on the Python Standard Library 3.11.
+            Original Copyright 2009 Brian Quinlan. All Rights Reserved.
+            Licensed to PSF under a Contributor Agreement.
+            """
+            try:
+                # reverse to keep finishing order
+                fs.reverse()
+                while fs:
+                    # Careful not to keep a reference to the popped future
+                    results = _result_or_cancel(fs.pop())
+                    for res in results:
+                        yield res
+            finally:
+                for future in fs:
+                    future.cancel()
+
+        return result_iterator()
+
+    def _prepare_job(
+        self,
+        fun: Callable[..., Any],
+        slurm_file_prefix: str,
+        task_files: TaskFiles,
+        slurm_config: SlurmConfig,
+        single_task_submission: bool = False,
+        args: Optional[Sequence[Any]] = None,
+        kwargs: Optional[dict] = None,
+        components: Optional[list[Any]] = None,
+    ) -> SlurmJob:
+        """
+        Prepare a SLURM job locally, without submitting it
+
+        This function prepares and writes the local submission script, but it
+        does not transfer it to the SLURM cluster.
+
+        NOTE: this method has different behaviors when it is called from the
+        `self.submit` or `self.map` methods (which is also encoded in
+        `single_task_submission`):
+
+        * When called from `self.submit`, it supports general `args` and
+          `kwargs` arguments;
+        * When called from `self.map`, there cannot be any `args` or `kwargs`
+          argument, but there must be a `components` argument.
+
+        Arguments:
+            fun:
+            slurm_file_prefix:
+            task_files:
+            slurm_config:
+            single_task_submission:
+            args:
+            kwargs:
+            components:
+
+        Returns:
+            SlurmJob object
+        """
+
+        # Inject SLURM account (if set) into slurm_config
+        if self.slurm_account:
+            slurm_config.account = self.slurm_account
+
+        # Define slurm-job-related files
+        if single_task_submission:
+            if components is not None:
+                raise ValueError(
+                    f"{single_task_submission=} but components is not None"
+                )
+            job = SlurmJob(
+                slurm_file_prefix=slurm_file_prefix,
+                num_tasks_tot=1,
+                slurm_config=slurm_config,
+            )
+            if job.num_tasks_tot > 1:
+                raise ValueError(
+                    "{single_task_submission=} but {job.num_tasks_tot=}"
+                )
+            job.single_task_submission = True
+            job.wftask_file_prefixes = (task_files.file_prefix,)
+            job.wftask_subfolder_name = task_files.subfolder_name
+
+        else:
+            if not components or len(components) < 1:
+                raise ValueError(
+                    "In FractalSlurmSSHExecutor._submit_job, given "
+                    f"{components=}."
+                )
+            num_tasks_tot = len(components)
+            job = SlurmJob(
+                slurm_file_prefix=slurm_file_prefix,
+                num_tasks_tot=num_tasks_tot,
+                slurm_config=slurm_config,
+            )
+
+            _prefixes = []
+            _subfolder_names = []
+            for component in components:
+                if isinstance(component, dict):
+                    actual_component = component.get(_COMPONENT_KEY_, None)
+                else:
+                    actual_component = component
+                _task_file_paths = get_task_file_paths(
+                    workflow_dir_local=task_files.workflow_dir_local,
+                    workflow_dir_remote=task_files.workflow_dir_remote,
+                    task_name=task_files.task_name,
+                    task_order=task_files.task_order,
+                    component=actual_component,
+                )
+                _prefixes.append(_task_file_paths.file_prefix)
+                _subfolder_names.append(_task_file_paths.subfolder_name)
+            job.wftask_file_prefixes = tuple(_prefixes)
+
+            # Check that all components share the same subfolder
+            num_subfolders = len(set(_subfolder_names))
+            if num_subfolders != 1:
+                error_msg_short = (
+                    f"[_submit_job] Subfolder list has {num_subfolders} "
+                    "different values, but it must have only one (since "
+                    "workflow tasks are executed one by one)."
+                )
+                error_msg_detail = (
+                    "[_submit_job] Current unique subfolder names: "
+                    f"{set(_subfolder_names)}"
+                )
+                logger.error(error_msg_short)
+                logger.error(error_msg_detail)
+                raise ValueError(error_msg_short)
+            job.wftask_subfolder_name = _subfolder_names[0]
+
+        # Check that server-side subfolder exists
+        subfolder_path = self.workflow_dir_local / job.wftask_subfolder_name
+        if not subfolder_path.exists():
+            raise FileNotFoundError(
+                f"Missing folder {subfolder_path.as_posix()}."
+            )
+
+        # Define I/O pickle file local/remote paths
+        job.input_pickle_files_local = tuple(
+            self.get_input_pickle_file_path_local(
+                arg=job.workerids[ind],
+                subfolder_name=job.wftask_subfolder_name,
+                prefix=job.wftask_file_prefixes[ind],
+            )
+            for ind in range(job.num_tasks_tot)
+        )
+        job.input_pickle_files_remote = tuple(
+            self.get_input_pickle_file_path_remote(
+                arg=job.workerids[ind],
+                subfolder_name=job.wftask_subfolder_name,
+                prefix=job.wftask_file_prefixes[ind],
+            )
+            for ind in range(job.num_tasks_tot)
+        )
+        job.output_pickle_files_local = tuple(
+            self.get_output_pickle_file_path_local(
+                arg=job.workerids[ind],
+                subfolder_name=job.wftask_subfolder_name,
+                prefix=job.wftask_file_prefixes[ind],
+            )
+            for ind in range(job.num_tasks_tot)
+        )
+        job.output_pickle_files_remote = tuple(
+            self.get_output_pickle_file_path_remote(
+                arg=job.workerids[ind],
+                subfolder_name=job.wftask_subfolder_name,
+                prefix=job.wftask_file_prefixes[ind],
+            )
+            for ind in range(job.num_tasks_tot)
+        )
+
+        # Define SLURM-job file local/remote paths
+        job.slurm_script_local = self.get_slurm_script_file_path_local(
+            subfolder_name=job.wftask_subfolder_name,
+            prefix=job.slurm_file_prefix,
+        )
+        job.slurm_script_remote = self.get_slurm_script_file_path_remote(
+            subfolder_name=job.wftask_subfolder_name,
+            prefix=job.slurm_file_prefix,
+        )
+        job.slurm_stdout_local = self.get_slurm_stdout_file_path_local(
+            subfolder_name=job.wftask_subfolder_name,
+            prefix=job.slurm_file_prefix,
+        )
+        job.slurm_stdout_remote = self.get_slurm_stdout_file_path_remote(
+            subfolder_name=job.wftask_subfolder_name,
+            prefix=job.slurm_file_prefix,
+        )
+        job.slurm_stderr_local = self.get_slurm_stderr_file_path_local(
+            subfolder_name=job.wftask_subfolder_name,
+            prefix=job.slurm_file_prefix,
+        )
+        job.slurm_stderr_remote = self.get_slurm_stderr_file_path_remote(
+            subfolder_name=job.wftask_subfolder_name,
+            prefix=job.slurm_file_prefix,
+        )
+
+        # Dump serialized versions+function+args+kwargs to pickle file(s)
+        versions = get_versions()
+        if job.single_task_submission:
+            _args = args or []
+            _kwargs = kwargs or {}
+            funcser = cloudpickle.dumps((versions, fun, _args, _kwargs))
+            with open(job.input_pickle_files_local[0], "wb") as f:
+                f.write(funcser)
+        else:
+            for ind_component, component in enumerate(components):
+                _args = [component]
+                _kwargs = {}
+                funcser = cloudpickle.dumps((versions, fun, _args, _kwargs))
+                with open(
+                    job.input_pickle_files_local[ind_component], "wb"
+                ) as f:
+                    f.write(funcser)
+
+        # Prepare commands to be included in SLURM submission script
+        cmdlines = []
+        for ind_task in range(job.num_tasks_tot):
+            input_pickle_file = job.input_pickle_files_remote[ind_task]
+            output_pickle_file = job.output_pickle_files_remote[ind_task]
+            cmdlines.append(
+                (
+                    f"{self.python_remote}"
+                    " -m fractal_server.app.runner.executors.slurm.remote "
+                    f"--input-file {input_pickle_file} "
+                    f"--output-file {output_pickle_file}"
+                )
+            )
+
+        # Prepare SLURM submission script
+        sbatch_script_content = self._prepare_sbatch_script(
+            slurm_config=job.slurm_config,
+            list_commands=cmdlines,
+            slurm_out_path=str(job.slurm_stdout_remote),
+            slurm_err_path=str(job.slurm_stderr_remote),
+        )
+        with job.slurm_script_local.open("w") as f:
+            f.write(sbatch_script_content)
+
+        return job
+
+    def _put_subfolder_sftp(self, jobs: list[SlurmJob]) -> None:
+        """
+        Transfer the jobs subfolder to the remote host.
+
+        Arguments:
+            jobs: The list of `SlurmJob` objects associated to a given
+                subfolder.
+        """
+
+        # Check that the subfolder is unique
+        subfolder_names = [job.wftask_subfolder_name for job in jobs]
+        if len(set(subfolder_names)) > 1:
+            raise ValueError(
+                "[_put_subfolder] Invalid list of jobs, "
+                f"{set(subfolder_names)=}."
+            )
+        subfolder_name = subfolder_names[0]
+
+        # Create compressed subfolder archive (locally)
+        local_subfolder = self.workflow_dir_local / subfolder_name
+        tarfile_name = f"{subfolder_name}.tar.gz"
+        tarfile_path_local = (
+            self.workflow_dir_local / tarfile_name
+        ).as_posix()
+        tarfile_path_remote = (
+            self.workflow_dir_remote / tarfile_name
+        ).as_posix()
+        with tarfile.open(tarfile_path_local, "w:gz") as tar:
+            for this_file in local_subfolder.glob("*"):
+                tar.add(this_file, arcname=this_file.name)
+        logger.info(f"Subfolder archive created at {tarfile_path_local}")
+
+        # Transfer archive
+        t_0_put = time.perf_counter()
+        self.connection.put(
+            local=tarfile_path_local,
+            remote=tarfile_path_remote,
+        )
+        t_1_put = time.perf_counter()
+        logger.info(
+            f"Subfolder archive transferred to {tarfile_path_remote}"
+            f" - elapsed: {t_1_put - t_0_put:.3f} s"
+        )
+        # Uncompress archive (remotely)
+        tar_command = (
+            f"{self.python_remote} -m "
+            "fractal_server.app.runner.extract_archive "
+            f"{tarfile_path_remote}"
+        )
+        run_command_over_ssh(cmd=tar_command, connection=self.connection)
+
+        # Remove local version
+        t_0_rm = time.perf_counter()
+        Path(tarfile_path_local).unlink()
+        t_1_rm = time.perf_counter()
+        logger.info(
+            f"Local archive removed - elapsed: {t_1_rm - t_0_rm:.3f} s"
+        )
+
+    def _submit_job(self, job: SlurmJob) -> tuple[Future, str]:
+        """
+        Submit a job to SLURM via SSH.
+
+        This method must always be called after `self._put_subfolder`.
+
+        Arguments:
+            job: The `SlurmJob` object to submit.
+        """
+
+        # Submit job to SLURM, and get jobid
+        sbatch_command = f"sbatch --parsable {job.slurm_script_remote}"
+        sbatch_stdout = run_command_over_ssh(
+            cmd=sbatch_command,
+            connection=self.connection,
+        )
+
+        # Extract SLURM job ID from stdout
+        try:
+            stdout = sbatch_stdout.strip("\n")
+            jobid = int(stdout)
+        except ValueError as e:
+            error_msg = (
+                f"Submit command `{sbatch_command}` returned "
+                f"`{stdout=}` which cannot be cast to an integer "
+                f"SLURM-job ID. Original error:\n{str(e)}"
+            )
+            logger.error(error_msg)
+            raise JobExecutionError(info=error_msg)
+        job_id_str = str(jobid)
+
+        # Plug job id in stdout/stderr SLURM file paths (local and remote)
+        def _replace_job_id(_old_path: Path) -> Path:
+            return Path(_old_path.as_posix().replace("%j", job_id_str))
+
+        job.slurm_stdout_local = _replace_job_id(job.slurm_stdout_local)
+        job.slurm_stdout_remote = _replace_job_id(job.slurm_stdout_remote)
+        job.slurm_stderr_local = _replace_job_id(job.slurm_stderr_local)
+        job.slurm_stderr_remote = _replace_job_id(job.slurm_stderr_remote)
+
+        # Add the SLURM script/out/err paths to map_jobid_to_slurm_files (this
+        # must be after the `sbatch` call, so that "%j" has already been
+        # replaced with the job ID)
+        with self.jobs_lock:
+            self.map_jobid_to_slurm_files_local[job_id_str] = (
+                job.slurm_script_local.as_posix(),
+                job.slurm_stdout_local.as_posix(),
+                job.slurm_stderr_local.as_posix(),
+            )
+
+        # Create future
+        future = Future()
+        with self.jobs_lock:
+            self.jobs[job_id_str] = (future, job)
+        return future, job_id_str
+
+    def _prepare_JobExecutionError(
+        self, jobid: str, info: str
+    ) -> JobExecutionError:
+        """
+        Prepare the `JobExecutionError` for a given job
+
+        This method creates a `JobExecutionError` object and sets its attribute
+        to the appropriate SLURM-related file names. Note that the SLURM files
+        are the local ones (i.e. the ones in `self.workflow_dir_local`).
+
+        Arguments:
+            jobid:
+                ID of the SLURM job.
+            info:
+        """
+        # Extract SLURM file paths
+        with self.jobs_lock:
+            (
+                slurm_script_file,
+                slurm_stdout_file,
+                slurm_stderr_file,
+            ) = self.map_jobid_to_slurm_files_local[jobid]
+        # Construct JobExecutionError exception
+        job_exc = JobExecutionError(
+            cmd_file=slurm_script_file,
+            stdout_file=slurm_stdout_file,
+            stderr_file=slurm_stderr_file,
+            info=info,
+        )
+        return job_exc
+
+    def _missing_pickle_error_msg(self, out_path: Path) -> str:
+        settings = Inject(get_settings)
+        info = (
+            "Output pickle file of the FractalSlurmSSHExecutor "
+            "job not found.\n"
+            f"Expected file path: {out_path.as_posix()}n"
+            "Here are some possible reasons:\n"
+            "1. The SLURM job was scancel-ed, either by the user "
+            "or due to an error (e.g. an out-of-memory or timeout "
+            "error). Note that if the scancel took place before "
+            "the job started running, the SLURM out/err files "
+            "will be empty.\n"
+            "2. Some error occurred upon writing the file to disk "
+            "(e.g. because there is not enough space on disk, or "
+            "due to an overloaded NFS filesystem). "
+            "Note that the server configuration has "
+            "FRACTAL_SLURM_ERROR_HANDLING_INTERVAL="
+            f"{settings.FRACTAL_SLURM_ERROR_HANDLING_INTERVAL} "
+            "seconds.\n"
+        )
+        return info
+
+    def _handle_remaining_jobs(
+        self,
+        remaining_futures: list[Future],
+        remaining_job_ids: list[str],
+        remaining_jobs: list[SlurmJob],
+    ) -> None:
+        """
+        Helper function used within _completion, when looping over a list of
+        several jobs/futures.
+        """
+        for future in remaining_futures:
+            try:
+                future.cancel()
+            except InvalidStateError:
+                pass
+        for job_id in remaining_job_ids:
+            self._cleanup(job_id)
+        if not self.keep_pickle_files:
+            for job in remaining_jobs:
+                for path in job.output_pickle_files_local:
+                    path.unlink()
+                for path in job.input_pickle_files_local:
+                    path.unlink()
+
+    def _completion(self, job_ids: list[str]) -> None:
+        """
+        Callback function to be executed whenever a job finishes.
+
+        This function is executed by self.wait_thread (triggered by either
+        finding an existing output pickle file `out_path` or finding that the
+        SLURM job is over). Since this takes place on a different thread,
+        failures may not be captured by the main thread; we use a broad
+        try/except block, so that those exceptions are reported to the main
+        thread via `fut.set_exception(...)`.
+
+        Arguments:
+            jobid: ID of the SLURM job
+        """
+
+        # Loop over all job_ids, and fetch future and job objects
+        futures: list[Future] = []
+        jobs: list[SlurmJob] = []
+        with self.jobs_lock:
+            for job_id in job_ids:
+                future, job = self.jobs.pop(job_id)
+                futures.append(future)
+                jobs.append(job)
+            if not self.jobs:
+                self.jobs_empty_cond.notify_all()
+
+        # Fetch subfolder from remote host
+        try:
+            self._get_subfolder_sftp(jobs=jobs)
+        except NoValidConnectionsError as e:
+            logger.error("NoValidConnectionError")
+            logger.error(f"{str(e)=}")
+            logger.error(f"{e.errors=}")
+            for err in e.errors:
+                logger.error(f"{str(err)}")
+
+            raise e
+
+        # First round of checking whether all output files exist
+        missing_out_paths = []
+        for job in jobs:
+            for ind_out_path, out_path in enumerate(
+                job.output_pickle_files_local
+            ):
+                if not out_path.exists():
+                    missing_out_paths.append(out_path)
+        num_missing = len(missing_out_paths)
+        if num_missing > 0:
+            # Output pickle files may be missing e.g. because of some slow
+            # filesystem operation; wait some time before re-trying
+            settings = Inject(get_settings)
+            sleep_time = settings.FRACTAL_SLURM_ERROR_HANDLING_INTERVAL
+            logger.info(
+                f"{num_missing} output pickle files are missing; "
+                f"sleep {sleep_time} seconds."
+            )
+            for missing_file in missing_out_paths:
+                logger.debug(f"Missing output pickle file: {missing_file}")
+            time.sleep(sleep_time)
+
+        # Handle all jobs
+        for ind_job, job_id in enumerate(job_ids):
+            try:
+                # Retrieve job and future objects
+                job = jobs[ind_job]
+                future = futures[ind_job]
+                remaining_job_ids = job_ids[ind_job + 1 :]  # noqa: E203
+                remaining_futures = futures[ind_job + 1 :]  # noqa: E203
+
+                outputs = []
+
+                for ind_out_path, out_path in enumerate(
+                    job.output_pickle_files_local
+                ):
+                    in_path = job.input_pickle_files_local[ind_out_path]
+                    if not out_path.exists():
+                        # Output pickle file is still missing
+                        info = self._missing_pickle_error_msg(out_path)
+                        job_exc = self._prepare_JobExecutionError(
+                            job_id, info=info
+                        )
+                        try:
+                            future.set_exception(job_exc)
+                            self._handle_remaining_jobs(
+                                remaining_futures=remaining_futures,
+                                remaining_job_ids=remaining_job_ids,
+                            )
+                            return
+                        except InvalidStateError:
+                            logger.warning(
+                                f"Future {future} (SLURM job ID: {job_id}) "
+                                "was already cancelled."
+                            )
+                            if not self.keep_pickle_files:
+                                in_path.unlink()
+                            self._cleanup(job_id)
+                            self._handle_remaining_jobs(
+                                remaining_futures=remaining_futures,
+                                remaining_job_ids=remaining_job_ids,
+                            )
+                            return
+
+                    # Read the task output
+                    with out_path.open("rb") as f:
+                        outdata = f.read()
+                    # Note: output can be either the task result (typically a
+                    # dictionary) or an ExceptionProxy object; in the latter
+                    # case, the ExceptionProxy definition is also part of the
+                    # pickle file (thanks to cloudpickle.dumps).
+                    success, output = cloudpickle.loads(outdata)
+                    try:
+                        if success:
+                            outputs.append(output)
+                        else:
+                            proxy = output
+                            if proxy.exc_type_name == "JobExecutionError":
+                                job_exc = self._prepare_JobExecutionError(
+                                    job_id, info=proxy.kwargs.get("info", None)
+                                )
+                                future.set_exception(job_exc)
+                                self._handle_remaining_jobs(
+                                    remaining_futures=remaining_futures,
+                                    remaining_job_ids=remaining_job_ids,
+                                )
+                                return
+                            else:
+                                # This branch catches both TaskExecutionError's
+                                # (coming from the typical fractal-server
+                                # execution of tasks, and with additional
+                                # fractal-specific kwargs) or arbitrary
+                                # exceptions (coming from a direct use of
+                                # FractalSlurmSSHExecutor, possibly outside
+                                # fractal-server)
+                                kwargs = {}
+                                for key in [
+                                    "workflow_task_id",
+                                    "workflow_task_order",
+                                    "task_name",
+                                ]:
+                                    if key in proxy.kwargs.keys():
+                                        kwargs[key] = proxy.kwargs[key]
+                                exc = TaskExecutionError(proxy.tb, **kwargs)
+                                future.set_exception(exc)
+                                self._handle_remaining_jobs(
+                                    remaining_futures=remaining_futures,
+                                    remaining_job_ids=remaining_job_ids,
+                                )
+                                return
+                        if not self.keep_pickle_files:
+                            out_path.unlink()
+                    except InvalidStateError:
+                        logger.warning(
+                            f"Future {future} (SLURM job ID: {job_id}) was "
+                            "already cancelled, exit from "
+                            "FractalSlurmSSHExecutor._completion."
+                        )
+                        if not self.keep_pickle_files:
+                            out_path.unlink()
+                            in_path.unlink()
+
+                        self._cleanup(job_id)
+                        self._handle_remaining_jobs(
+                            remaining_futures=remaining_futures,
+                            remaining_job_ids=remaining_job_ids,
+                        )
+                        return
+
+                    # Clean up input pickle file
+                    if not self.keep_pickle_files:
+                        in_path.unlink()
+                self._cleanup(job_id)
+                if job.single_task_submission:
+                    future.set_result(outputs[0])
+                else:
+                    future.set_result(outputs)
+
+            except Exception as e:
+                try:
+                    future.set_exception(e)
+                    return
+                except InvalidStateError:
+                    logger.warning(
+                        f"Future {future} (SLURM job ID: {job_id}) was already"
+                        " cancelled, exit from"
+                        " FractalSlurmSSHExecutor._completion."
+                    )
+
+    def _get_subfolder_sftp(self, jobs: list[SlurmJob]) -> None:
+        """
+        Fetch a remote folder via tar+sftp+tar
+
+        Arguments:
+            job:
+                `SlurmJob` object (needed for its prefixes-related attributes).
+        """
+
+        # Check that the subfolder is unique
+        subfolder_names = [job.wftask_subfolder_name for job in jobs]
+        if len(set(subfolder_names)) > 1:
+            raise ValueError(
+                "[_put_subfolder] Invalid list of jobs, "
+                f"{set(subfolder_names)=}."
+            )
+        subfolder_name = subfolder_names[0]
+
+        t_0 = time.perf_counter()
+        logger.debug("[_get_subfolder_sftp] Start")
+        tarfile_path_local = (
+            self.workflow_dir_local / f"{subfolder_name}.tar.gz"
+        ).as_posix()
+        tarfile_path_remote = (
+            self.workflow_dir_remote / f"{subfolder_name}.tar.gz"
+        ).as_posix()
+
+        # Remove local tarfile - FIXME SSH: is this needed?
+        logger.warning(f"In principle I just removed {tarfile_path_local}")
+        logger.warning(f"{Path(tarfile_path_local).exists()=}")
+
+        # Remove remote tarfile - FIXME SSH: is this needed?
+        # rm_command = f"rm {tarfile_path_remote}"
+        # _run_command_over_ssh(cmd=rm_command, connection=self.connection)
+        logger.warning(f"Unlink {tarfile_path_remote=} - START")
+        self.connection.sftp().unlink(tarfile_path_remote)
+        logger.warning(f"Unlink {tarfile_path_remote=} - STOP")
+
+        # Create remote tarfile
+        tar_command = (
+            f"{self.python_remote} "
+            "-m fractal_server.app.runner.compress_folder "
+            f"{(self.workflow_dir_remote / subfolder_name).as_posix()}"
+        )
+        stdout = run_command_over_ssh(
+            cmd=tar_command, connection=self.connection
+        )
+        print(stdout)
+
+        # Fetch tarfile
+        t_0_get = time.perf_counter()
+        self.connection.get(
+            remote=tarfile_path_remote,
+            local=tarfile_path_local,
+        )
+        t_1_get = time.perf_counter()
+        logger.info(
+            f"Subfolder archive transferred back to {tarfile_path_local}"
+            f" - elapsed: {t_1_get - t_0_get:.3f} s"
+        )
+
+        # Extract tarfile locally
+        with tarfile.open(tarfile_path_local) as tar:
+            tar.extractall(path=(self.workflow_dir_local / subfolder_name))
+
+        t_1 = time.perf_counter()
+        logger.info("[_get_subfolder_sftp] End - " f"elapsed: {t_1-t_0:.3f} s")
+
+    def _prepare_sbatch_script(
+        self,
+        *,
+        list_commands: list[str],
+        slurm_out_path: str,
+        slurm_err_path: str,
+        slurm_config: SlurmConfig,
+    ):
+
+        num_tasks_max_running = slurm_config.parallel_tasks_per_job
+        mem_per_task_MB = slurm_config.mem_per_task_MB
+
+        # Set ntasks
+        ntasks = min(len(list_commands), num_tasks_max_running)
+        if len(list_commands) < num_tasks_max_running:
+            ntasks = len(list_commands)
+            slurm_config.parallel_tasks_per_job = ntasks
+            logger.debug(
+                f"{len(list_commands)=} is smaller than "
+                f"{num_tasks_max_running=}. Setting {ntasks=}."
+            )
+
+        # Prepare SLURM preamble based on SlurmConfig object
+        script_lines = slurm_config.to_sbatch_preamble(
+            remote_export_dir=self.workflow_dir_remote.as_posix()
+        )
+
+        # Extend SLURM preamble with variable which are not in SlurmConfig, and
+        # fix their order
+        script_lines.extend(
+            [
+                f"#SBATCH --err={slurm_err_path}",
+                f"#SBATCH --out={slurm_out_path}",
+                f"#SBATCH -D {self.workflow_dir_remote}",
+            ]
+        )
+        script_lines = slurm_config.sort_script_lines(script_lines)
+        logger.debug(script_lines)
+
+        # Always print output of `pwd`
+        script_lines.append('echo "Working directory (pwd): `pwd`"\n')
+
+        # Complete script preamble
+        script_lines.append("\n")
+
+        # Include command lines
+        tmp_list_commands = copy(list_commands)
+        while tmp_list_commands:
+            if tmp_list_commands:
+                cmd = tmp_list_commands.pop(0)  # take first element
+                script_lines.append(
+                    "srun --ntasks=1 --cpus-per-task=$SLURM_CPUS_PER_TASK "
+                    f"--mem={mem_per_task_MB}MB "
+                    f"{cmd} &"
+                )
+        script_lines.append("wait\n")
+
+        script = "\n".join(script_lines)
+        return script
+
+    def get_default_task_files(self) -> TaskFiles:
+        """
+        This will be called when self.submit or self.map are called from
+        outside fractal-server, and then lack some optional arguments.
+        """
+        task_files = TaskFiles(
+            workflow_dir_local=self.workflow_dir_local,
+            workflow_dir_remote=self.workflow_dir_remote,
+            task_order=None,
+            task_name="name",
+        )
+        return task_files
+
+    def shutdown(self, wait=True, *, cancel_futures=False):
+        """
+        Clean up all executor variables. Note that this function is executed on
+        the self.wait_thread thread, see _completion.
+        """
+
+        logger.debug("Executor shutdown: start")
+        # self.connection.close()
+
+        # Handle all job futures
+        slurm_jobs_to_scancel = []
+        with self.jobs_lock:
+            while self.jobs:
+                jobid, fut_and_job = self.jobs.popitem()
+                slurm_jobs_to_scancel.append(jobid)
+                fut = fut_and_job[0]
+                self.map_jobid_to_slurm_files_local.pop(jobid)
+                if not fut.cancelled():
+                    fut.set_exception(
+                        JobExecutionError(
+                            "Job cancelled due to executor shutdown."
+                        )
+                    )
+                    fut.cancel()
+
+        # Cancel SLURM jobs
+        if slurm_jobs_to_scancel:
+            scancel_string = " ".join(slurm_jobs_to_scancel)
+            logger.warning(f"Now scancel-ing SLURM jobs {scancel_string}")
+            scancel_command = f"scancel {scancel_string}"
+            run_command_over_ssh(
+                cmd=scancel_command, connection=self.connection
+            )
+        logger.debug("Executor shutdown: end")
+
+    def __exit__(self, *args, **kwargs):
+        """
+        See
+        https://github.com/fractal-analytics-platform/fractal-server/issues/1508
+        """
+        logger.debug(
+            "[FractalSlurmSSHExecutor.__exit__] Stop and join `wait_thread`"
+        )
+        self.wait_thread.stop()
+        self.wait_thread.join()
+        logger.debug("[FractalSlurmSSHExecutor.__exit__] End")
+
+    def run_squeue(self, job_ids):
+        squeue_command = (
+            "squeue "
+            "--noheader "
+            "--format='%i %T' "
+            "--jobs __JOBS__ "
+            "--states=all"
+        )
+        job_ids = ",".join([str(j) for j in job_ids])
+        squeue_command = squeue_command.replace("__JOBS__", job_ids)
+        stdout = run_command_over_ssh(
+            cmd=squeue_command,
+            connection=self.connection,
+        )
+        return stdout
+
+    def _jobs_finished(self, job_ids: list[str]) -> set[str]:
+        """
+        Check which ones of the given Slurm jobs already finished
+
+        The function is based on the `_jobs_finished` function from
+        clusterfutures (version 0.5).
+        Original Copyright: 2022 Adrian Sampson
+        (released under the MIT licence)
+        """
+
+        from cfut.slurm import STATES_FINISHED
+
+        logger.debug(
+            f"[FractalSlurmSSHExecutor._jobs_finished] START ({job_ids=})"
+        )
+
+        # If there is no Slurm job to check, return right away
+        if not job_ids:
+            logger.debug(
+                "[FractalSlurmSSHExecutor._jobs_finished] "
+                "No jobs provided, return."
+            )
+            return set()
+
+        try:
+            stdout = self.run_squeue(job_ids)
+            id_to_state = {
+                out.split()[0]: out.split()[1] for out in stdout.splitlines()
+            }
+            # Finished jobs only stay in squeue for a few mins (configurable).
+            # If a job ID isn't there, we'll assume it's finished.
+            output = {
+                _id
+                for _id in job_ids
+                if id_to_state.get(_id, "COMPLETED") in STATES_FINISHED
+            }
+            logger.debug(
+                f"[FractalSlurmSSHExecutor._jobs_finished] END - {output=}"
+            )
+            return output
+        except Exception as e:
+            # If something goes wrong, proceed anyway
+            logger.error(
+                f"Something wrong in _jobs_finished. Original error: {str(e)}"
+            )
+            output = set()
+            logger.debug(
+                f"[FractalSlurmSSHExecutor._jobs_finished] END - {output=}"
+            )
+            return output
+
+        id_to_state = dict()
+        for j in job_ids:
+            res = self.run_squeue([j])
+            if res.returncode != 0:
+                logger.info(f"Job {j} not found. Marked it as completed")
+                id_to_state.update({str(j): "COMPLETED"})
+            else:
+                id_to_state.update(
+                    {res.stdout.split()[0]: res.stdout.split()[1]}
+                )
+
+    def handshake(self) -> dict:
+        """
+        Healthcheck for SSH connection and for versions match.
+
+        FIXME SSH: We should add a timeout here
+        FIXME SSH: We could include checks on the existence of folders
+        FIXME SSH: We could include further checks on version matches
+        """
+
+        check_connection(self.connection)
+
+        t_start_handshake = time.perf_counter()
+
+        logger.info("[FractalSlurmSSHExecutor.ssh_handshake] START")
+        cmd = f"{self.python_remote} -m fractal_server.app.runner.versions"
+        stdout = run_command_over_ssh(cmd=cmd, connection=self.connection)
+        remote_versions = json.loads(stdout.strip("\n"))
+
+        # Check compatibility with local versions
+        local_versions = get_versions()
+        remote_fractal_server = remote_versions["fractal_server"]
+        local_fractal_server = local_versions["fractal_server"]
+        if remote_fractal_server != local_fractal_server:
+            error_msg = (
+                "Fractal-server version mismatch.\n"
+                "Local interpreter: "
+                f"({sys.executable}): {local_versions}.\n"
+                "Remote interpreter: "
+                f"({self.python_remote}): {remote_versions}."
+            )
+            logger.error(error_msg)
+            raise ValueError(error_msg)
+
+        t_end_handshake = time.perf_counter()
+        logger.info(
+            "[FractalSlurmSSHExecutor.ssh_handshake] END"
+            f" - elapsed: {t_end_handshake-t_start_handshake:.3f} s"
+        )
+        return remote_versions