fractal-server 2.2.0a1__py3-none-any.whl → 2.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67)
  1. fractal_server/__init__.py +1 -1
  2. fractal_server/app/models/v1/state.py +1 -2
  3. fractal_server/app/routes/admin/v1.py +2 -2
  4. fractal_server/app/routes/admin/v2.py +2 -2
  5. fractal_server/app/routes/api/v1/job.py +2 -2
  6. fractal_server/app/routes/api/v1/task_collection.py +4 -4
  7. fractal_server/app/routes/api/v2/__init__.py +23 -3
  8. fractal_server/app/routes/api/v2/job.py +2 -2
  9. fractal_server/app/routes/api/v2/submit.py +6 -0
  10. fractal_server/app/routes/api/v2/task_collection.py +74 -34
  11. fractal_server/app/routes/api/v2/task_collection_custom.py +170 -0
  12. fractal_server/app/routes/api/v2/task_collection_ssh.py +125 -0
  13. fractal_server/app/routes/aux/_runner.py +10 -2
  14. fractal_server/app/runner/compress_folder.py +120 -0
  15. fractal_server/app/runner/executors/slurm/__init__.py +0 -3
  16. fractal_server/app/runner/executors/slurm/_batching.py +0 -1
  17. fractal_server/app/runner/executors/slurm/_slurm_config.py +9 -9
  18. fractal_server/app/runner/executors/slurm/ssh/__init__.py +3 -0
  19. fractal_server/app/runner/executors/slurm/ssh/_executor_wait_thread.py +112 -0
  20. fractal_server/app/runner/executors/slurm/ssh/_slurm_job.py +120 -0
  21. fractal_server/app/runner/executors/slurm/ssh/executor.py +1488 -0
  22. fractal_server/app/runner/executors/slurm/sudo/__init__.py +3 -0
  23. fractal_server/app/runner/executors/slurm/{_check_jobs_status.py → sudo/_check_jobs_status.py} +1 -1
  24. fractal_server/app/runner/executors/slurm/{_executor_wait_thread.py → sudo/_executor_wait_thread.py} +1 -1
  25. fractal_server/app/runner/executors/slurm/{_subprocess_run_as_user.py → sudo/_subprocess_run_as_user.py} +1 -1
  26. fractal_server/app/runner/executors/slurm/{executor.py → sudo/executor.py} +12 -12
  27. fractal_server/app/runner/extract_archive.py +38 -0
  28. fractal_server/app/runner/v1/__init__.py +78 -40
  29. fractal_server/app/runner/v1/_slurm/__init__.py +1 -1
  30. fractal_server/app/runner/v2/__init__.py +147 -62
  31. fractal_server/app/runner/v2/_local_experimental/__init__.py +22 -12
  32. fractal_server/app/runner/v2/_local_experimental/executor.py +12 -8
  33. fractal_server/app/runner/v2/_slurm/__init__.py +1 -6
  34. fractal_server/app/runner/v2/_slurm_ssh/__init__.py +125 -0
  35. fractal_server/app/runner/v2/_slurm_ssh/_submit_setup.py +83 -0
  36. fractal_server/app/runner/v2/_slurm_ssh/get_slurm_config.py +182 -0
  37. fractal_server/app/runner/v2/runner_functions_low_level.py +9 -11
  38. fractal_server/app/runner/versions.py +30 -0
  39. fractal_server/app/schemas/v1/__init__.py +1 -0
  40. fractal_server/app/schemas/{state.py → v1/state.py} +4 -21
  41. fractal_server/app/schemas/v2/__init__.py +4 -1
  42. fractal_server/app/schemas/v2/task_collection.py +101 -30
  43. fractal_server/config.py +184 -3
  44. fractal_server/main.py +27 -1
  45. fractal_server/ssh/__init__.py +4 -0
  46. fractal_server/ssh/_fabric.py +245 -0
  47. fractal_server/tasks/utils.py +12 -64
  48. fractal_server/tasks/v1/background_operations.py +2 -2
  49. fractal_server/tasks/{endpoint_operations.py → v1/endpoint_operations.py} +7 -12
  50. fractal_server/tasks/v1/utils.py +67 -0
  51. fractal_server/tasks/v2/_TaskCollectPip.py +61 -32
  52. fractal_server/tasks/v2/_venv_pip.py +195 -0
  53. fractal_server/tasks/v2/background_operations.py +257 -295
  54. fractal_server/tasks/v2/background_operations_ssh.py +317 -0
  55. fractal_server/tasks/v2/endpoint_operations.py +136 -0
  56. fractal_server/tasks/v2/templates/_1_create_venv.sh +46 -0
  57. fractal_server/tasks/v2/templates/_2_upgrade_pip.sh +30 -0
  58. fractal_server/tasks/v2/templates/_3_pip_install.sh +32 -0
  59. fractal_server/tasks/v2/templates/_4_pip_freeze.sh +21 -0
  60. fractal_server/tasks/v2/templates/_5_pip_show.sh +59 -0
  61. fractal_server/tasks/v2/utils.py +54 -0
  62. {fractal_server-2.2.0a1.dist-info → fractal_server-2.3.0.dist-info}/METADATA +4 -2
  63. {fractal_server-2.2.0a1.dist-info → fractal_server-2.3.0.dist-info}/RECORD +66 -42
  64. fractal_server/tasks/v2/get_collection_data.py +0 -14
  65. {fractal_server-2.2.0a1.dist-info → fractal_server-2.3.0.dist-info}/LICENSE +0 -0
  66. {fractal_server-2.2.0a1.dist-info → fractal_server-2.3.0.dist-info}/WHEEL +0 -0
  67. {fractal_server-2.2.0a1.dist-info → fractal_server-2.3.0.dist-info}/entry_points.txt +0 -0
fractal_server/app/runner/executors/slurm/ssh/executor.py (new file)
@@ -0,0 +1,1488 @@
1
+ # This adapts clusterfutures <https://github.com/sampsyo/clusterfutures>
2
+ # Original Copyright
3
+ # Copyright 2021 Adrian Sampson <asampson@cs.washington.edu>
4
+ # License: MIT
5
+ #
6
+ # Modified by:
7
+ # Jacopo Nespolo <jacopo.nespolo@exact-lab.it>
8
+ # Tommaso Comparin <tommaso.comparin@exact-lab.it>
9
+ # Marco Franzon <marco.franzon@exact-lab.it>
10
+ #
11
+ # Copyright 2022 (C) Friedrich Miescher Institute for Biomedical Research and
12
+ # University of Zurich
13
+ import json
14
+ import math
15
+ import sys
16
+ import tarfile
17
+ import threading
18
+ import time
19
+ from concurrent.futures import Future
20
+ from concurrent.futures import InvalidStateError
21
+ from copy import copy
22
+ from pathlib import Path
23
+ from typing import Any
24
+ from typing import Callable
25
+ from typing import Optional
26
+ from typing import Sequence
27
+
28
+ import cloudpickle
29
+ from cfut import SlurmExecutor
30
+ from paramiko.ssh_exception import NoValidConnectionsError
31
+
32
+ from ....filenames import SHUTDOWN_FILENAME
33
+ from ....task_files import get_task_file_paths
34
+ from ....task_files import TaskFiles
35
+ from ....versions import get_versions
36
+ from ...slurm._slurm_config import get_default_slurm_config
37
+ from ...slurm._slurm_config import SlurmConfig
38
+ from .._batching import heuristics
39
+ from ._executor_wait_thread import FractalSlurmWaitThread
40
+ from fractal_server.app.runner.components import _COMPONENT_KEY_
41
+ from fractal_server.app.runner.exceptions import JobExecutionError
42
+ from fractal_server.app.runner.exceptions import TaskExecutionError
43
+ from fractal_server.app.runner.executors.slurm.ssh._slurm_job import SlurmJob
44
+ from fractal_server.config import get_settings
45
+ from fractal_server.logger import set_logger
46
+ from fractal_server.ssh._fabric import FractalSSH
47
+ from fractal_server.ssh._fabric import run_command_over_ssh
48
+ from fractal_server.syringe import Inject
49
+
50
+ logger = set_logger(__name__)
51
+
52
+
53
+ class FractalSlurmSSHExecutor(SlurmExecutor):
54
+ """
55
+ FractalSlurmSSHExecutor (inherits from cfut.SlurmExecutor)
56
+
57
+ FIXME: docstring
58
+
59
+ Attributes:
60
+ fractal_ssh: FractalSSH connection with custom lock
61
+ shutdown_file:
62
+ python_remote: Equal to `settings.FRACTAL_SLURM_WORKER_PYTHON`
63
+ wait_thread_cls: Class for waiting thread
64
+ keep_pickle_files:
65
+ workflow_dir_local:
66
+ Directory for both the cfut/SLURM and fractal-server files and logs
67
+ workflow_dir_remote:
68
+ Directory for both the cfut/SLURM and fractal-server files and logs
69
+ common_script_lines:
70
+ Arbitrary script lines that will always be included in the
71
+ sbatch script
72
+ slurm_account:
73
+ jobs:
74
+ map_jobid_to_slurm_files:
75
+ Dictionary with paths of slurm-related files for active jobs
76
+ """
77
+
78
+ fractal_ssh: FractalSSH
79
+
80
+ workflow_dir_local: Path
81
+ workflow_dir_remote: Path
82
+ shutdown_file: str
83
+ python_remote: str
84
+
85
+ wait_thread_cls = FractalSlurmWaitThread
86
+ keep_pickle_files: bool
87
+
88
+ common_script_lines: list[str]
89
+ slurm_account: Optional[str]
90
+
91
+ jobs: dict[str, tuple[Future, SlurmJob]]
92
+ map_jobid_to_slurm_files_local: dict[str, tuple[str, str, str]]
93
+
94
+ def __init__(
95
+ self,
96
+ *,
97
+ # FractalSSH connection
98
+ fractal_ssh: FractalSSH,
99
+ # Folders and files
100
+ workflow_dir_local: Path,
101
+ workflow_dir_remote: Path,
102
+ # Runner options
103
+ keep_pickle_files: bool = False,
104
+ # Monitoring options
105
+ slurm_poll_interval: Optional[int] = None,
106
+ # SLURM submission script options
107
+ common_script_lines: Optional[list[str]] = None,
108
+ slurm_account: Optional[str] = None,
109
+ # Other kwargs are ignored
110
+ **kwargs,
111
+ ):
112
+ """
113
+ Init method for FractalSlurmSSHExecutor
114
+
115
+ Note: since we are not using `super().__init__`, we duplicate some
116
+ relevant bits of `cfut.ClusterExecutor.__init__`.
117
+
118
+ Args:
119
+ fractal_ssh:
120
+ workflow_dir_local:
121
+ workflow_dir_remote:
122
+ keep_pickle_files:
123
+ slurm_poll_interval:
124
+ common_script_lines:
125
+ slurm_account:
126
+ """
127
+
128
+ if kwargs != {}:
129
+ raise ValueError(
130
+ f"FractalSlurmSSHExecutor received unexpected {kwargs=}"
131
+ )
132
+
133
+ self.workflow_dir_local = workflow_dir_local
134
+ self.workflow_dir_remote = workflow_dir_remote
135
+
136
+ # Relevant bits of cfut.ClusterExecutor.__init__ are copied here,
137
+ # postponing the .start() call to when the callbacks are defined
138
+ self.jobs = {}
139
+ self.job_outfiles = {}
140
+ self.jobs_lock = threading.Lock()
141
+ self.jobs_empty_cond = threading.Condition(self.jobs_lock)
142
+ self.wait_thread = self.wait_thread_cls(self._completion)
143
+
144
+ # Set up attributes and methods for self.wait_thread
145
+ # (i.e. for the FractalSlurmWaitThread instance, cf. cfut.SlurmWaitThread)
146
+ self.wait_thread.shutdown_callback = self.shutdown
147
+ self.wait_thread.jobs_finished_callback = self._jobs_finished
148
+ if slurm_poll_interval is None:
149
+ settings = Inject(get_settings)
150
+ slurm_poll_interval = settings.FRACTAL_SLURM_POLL_INTERVAL
151
+ elif slurm_poll_interval <= 0:
152
+ raise ValueError(f"Invalid attribute {slurm_poll_interval=}")
153
+ self.wait_thread.slurm_poll_interval = slurm_poll_interval
154
+ self.wait_thread.shutdown_file = (
155
+ self.workflow_dir_local / SHUTDOWN_FILENAME
156
+ ).as_posix()
157
+
158
+ # Now start self.wait_thread (note: this must be *after* its callback
159
+ # methods have been defined)
160
+ self.wait_thread.start()
161
+
162
+ # Define remote Python interpreter
163
+ settings = Inject(get_settings)
164
+ self.python_remote = settings.FRACTAL_SLURM_WORKER_PYTHON
165
+ if self.python_remote is None:
166
+ raise ValueError("FRACTAL_SLURM_WORKER_PYTHON is not set. Exit.")
167
+
168
+ # Initialize connection and perform handshake
169
+ self.fractal_ssh = fractal_ssh
170
+ logger.warning(self.fractal_ssh)
171
+ self.handshake()
172
+
173
+ # Set/validate parameters for SLURM submission scripts
174
+ self.slurm_account = slurm_account
175
+ self.common_script_lines = common_script_lines or []
176
+ self._validate_common_script_lines()
177
+
178
+ # Set/initialize some more options
179
+ self.keep_pickle_files = keep_pickle_files
180
+ self.map_jobid_to_slurm_files_local = {}
181
+
182
+ def _validate_common_script_lines(self):
183
+ """
184
+ Check that SLURM account is not set in `self.common_script_lines`.
185
+ """
186
+ try:
187
+ invalid_line = next(
188
+ line
189
+ for line in self.common_script_lines
190
+ if line.startswith("#SBATCH --account=")
191
+ )
192
+ raise RuntimeError(
193
+ "Invalid line in `FractalSlurmSSHExecutor."
194
+ "common_script_lines`: "
195
+ f"'{invalid_line}'.\n"
196
+ "SLURM account must be set via the request body of the "
197
+ "apply-workflow endpoint, or by modifying the user properties."
198
+ )
199
+ except StopIteration:
200
+ pass
201
+
202
+ def _cleanup(self, jobid: str) -> None:
203
+ """
204
+ Given a job ID, perform any necessary cleanup after the job has
205
+ finished.
206
+ """
207
+ with self.jobs_lock:
208
+ self.map_jobid_to_slurm_files_local.pop(jobid)
209
+
210
+ def get_input_pickle_file_path_local(
211
+ self, *, arg: str, subfolder_name: str, prefix: Optional[str] = None
212
+ ) -> Path:
213
+
214
+ prefix = prefix or "cfut"
215
+ output = (
216
+ self.workflow_dir_local
217
+ / subfolder_name
218
+ / f"{prefix}_in_{arg}.pickle"
219
+ )
220
+ return output
221
+
222
+ def get_input_pickle_file_path_remote(
223
+ self, *, arg: str, subfolder_name: str, prefix: Optional[str] = None
224
+ ) -> Path:
225
+
226
+ prefix = prefix or "cfut"
227
+ output = (
228
+ self.workflow_dir_remote
229
+ / subfolder_name
230
+ / f"{prefix}_in_{arg}.pickle"
231
+ )
232
+ return output
233
+
234
+ def get_output_pickle_file_path_local(
235
+ self, *, arg: str, subfolder_name: str, prefix: Optional[str] = None
236
+ ) -> Path:
237
+ prefix = prefix or "cfut"
238
+ return (
239
+ self.workflow_dir_local
240
+ / subfolder_name
241
+ / f"{prefix}_out_{arg}.pickle"
242
+ )
243
+
244
+ def get_output_pickle_file_path_remote(
245
+ self, *, arg: str, subfolder_name: str, prefix: Optional[str] = None
246
+ ) -> Path:
247
+ prefix = prefix or "cfut"
248
+ return (
249
+ self.workflow_dir_remote
250
+ / subfolder_name
251
+ / f"{prefix}_out_{arg}.pickle"
252
+ )
253
+
254
+ def get_slurm_script_file_path_local(
255
+ self, *, subfolder_name: str, prefix: Optional[str] = None
256
+ ) -> Path:
257
+ prefix = prefix or "_temp"
258
+ return (
259
+ self.workflow_dir_local
260
+ / subfolder_name
261
+ / f"{prefix}_slurm_submit.sbatch"
262
+ )
263
+
264
+ def get_slurm_script_file_path_remote(
265
+ self, *, subfolder_name: str, prefix: Optional[str] = None
266
+ ) -> Path:
267
+ prefix = prefix or "_temp"
268
+ return (
269
+ self.workflow_dir_remote
270
+ / subfolder_name
271
+ / f"{prefix}_slurm_submit.sbatch"
272
+ )
273
+
274
+ def get_slurm_stdout_file_path_local(
275
+ self,
276
+ *,
277
+ subfolder_name: str,
278
+ arg: str = "%j",
279
+ prefix: Optional[str] = None,
280
+ ) -> Path:
281
+ prefix = prefix or "slurmpy.stdout"
282
+ return (
283
+ self.workflow_dir_local
284
+ / subfolder_name
285
+ / f"{prefix}_slurm_{arg}.out"
286
+ )
287
+
288
+ def get_slurm_stdout_file_path_remote(
289
+ self,
290
+ *,
291
+ subfolder_name: str,
292
+ arg: str = "%j",
293
+ prefix: Optional[str] = None,
294
+ ) -> Path:
295
+ prefix = prefix or "slurmpy.stdout"
296
+ return (
297
+ self.workflow_dir_remote
298
+ / subfolder_name
299
+ / f"{prefix}_slurm_{arg}.out"
300
+ )
301
+
302
+ def get_slurm_stderr_file_path_local(
303
+ self,
304
+ *,
305
+ subfolder_name: str,
306
+ arg: str = "%j",
307
+ prefix: Optional[str] = None,
308
+ ) -> Path:
309
+ prefix = prefix or "slurmpy.stderr"
310
+ return (
311
+ self.workflow_dir_local
312
+ / subfolder_name
313
+ / f"{prefix}_slurm_{arg}.err"
314
+ )
315
+
316
+ def get_slurm_stderr_file_path_remote(
317
+ self,
318
+ *,
319
+ subfolder_name: str,
320
+ arg: str = "%j",
321
+ prefix: Optional[str] = None,
322
+ ) -> Path:
323
+ prefix = prefix or "slurmpy.stderr"
324
+ return (
325
+ self.workflow_dir_remote
326
+ / subfolder_name
327
+ / f"{prefix}_slurm_{arg}.err"
328
+ )
329
+
330
+ def submit(
331
+ self,
332
+ fun: Callable[..., Any],
333
+ *fun_args: Sequence[Any],
334
+ slurm_config: Optional[SlurmConfig] = None,
335
+ task_files: Optional[TaskFiles] = None,
336
+ **fun_kwargs: dict,
337
+ ) -> Future:
338
+ """
339
+ Submit a function for execution on `FractalSlurmSSHExecutor`
340
+
341
+ Arguments:
342
+ fun: The function to be executed
343
+ fun_args: Function positional arguments
344
+ fun_kwargs: Function keyword arguments
345
+ slurm_config:
346
+ A `SlurmConfig` object; if `None`, use
347
+ `get_default_slurm_config()`.
348
+ task_files:
349
+ A `TaskFiles` object; if `None`, use
350
+ `self.get_default_task_files()`.
351
+
352
+ Returns:
353
+ Future representing the execution of the current SLURM job.
354
+ """
355
+
356
+ # Set defaults, if needed
357
+ if slurm_config is None:
358
+ slurm_config = get_default_slurm_config()
359
+ if task_files is None:
360
+ task_files = self.get_default_task_files()
361
+
362
+ # Set slurm_file_prefix
363
+ slurm_file_prefix = task_files.file_prefix
364
+
365
+ # Include common_script_lines in extra_lines
366
+ logger.debug(
367
+ f"Adding {self.common_script_lines=} to "
368
+ f"{slurm_config.extra_lines=}, from submit method."
369
+ )
370
+ current_extra_lines = slurm_config.extra_lines or []
371
+ slurm_config.extra_lines = (
372
+ current_extra_lines + self.common_script_lines
373
+ )
374
+
375
+ # Adapt slurm_config to the fact that this is a single-task SlurmJob
376
+ # instance
377
+ slurm_config.tasks_per_job = 1
378
+ slurm_config.parallel_tasks_per_job = 1
379
+
380
+ job = self._prepare_job(
381
+ fun,
382
+ slurm_config=slurm_config,
383
+ slurm_file_prefix=slurm_file_prefix,
384
+ task_files=task_files,
385
+ single_task_submission=True,
386
+ args=fun_args,
387
+ kwargs=fun_kwargs,
388
+ )
389
+ try:
390
+ self._put_subfolder_sftp(jobs=[job])
391
+ except NoValidConnectionsError as e:
392
+ logger.error("NoValidConnectionError")
393
+ logger.error(f"{str(e)=}")
394
+ logger.error(f"{e.errors=}")
395
+ for err in e.errors:
396
+ logger.error(f"{str(err)}")
397
+ raise e
398
+ future, job_id_str = self._submit_job(job)
399
+ self.wait_thread.wait(job_id=job_id_str)
400
+ return future
401
+
402
+ def map(
403
+ self,
404
+ fn: Callable[..., Any],
405
+ iterable: list[Sequence[Any]],
406
+ *,
407
+ slurm_config: Optional[SlurmConfig] = None,
408
+ task_files: Optional[TaskFiles] = None,
409
+ ):
410
+ """
411
+ Return an iterator with the results of several executions of a function
412
+
413
+ This function is based on `concurrent.futures.Executor.map` from Python
414
+ Standard Library 3.11.
415
+ Original Copyright 2009 Brian Quinlan. All Rights Reserved. Licensed to
416
+ PSF under a Contributor Agreement.
417
+
418
+ Main modifications from the PSF function:
419
+
420
+ 1. Only `fn` and `iterable` can be assigned as positional arguments;
421
+ 2. `*iterables` argument replaced with a single `iterable`;
422
+ 3. `timeout` and `chunksize` arguments are not supported.
423
+
424
+ Arguments:
425
+ fn:
426
+ The function to be executed
427
+ iterable:
428
+ An iterable such that each element is the list of arguments to
429
+ be passed to `fn`, as in `fn(*args)`.
430
+ slurm_config:
431
+ A `SlurmConfig` object; if `None`, use
432
+ `get_default_slurm_config()`.
433
+ task_files:
434
+ A `TaskFiles` object; if `None`, use
435
+ `self.get_default_task_files()`.
436
+
437
+ """
438
+
439
+ def _result_or_cancel(fut):
440
+ """
441
+ This function is based on the Python Standard Library 3.11.
442
+ Original Copyright 2009 Brian Quinlan. All Rights Reserved.
443
+ Licensed to PSF under a Contributor Agreement.
444
+ """
445
+ try:
446
+ try:
447
+ return fut.result()
448
+ finally:
449
+ fut.cancel()
450
+ finally:
451
+ # Break a reference cycle with the exception in
452
+ # self._exception
453
+ del fut
454
+
455
+ # Set defaults, if needed
456
+ if not slurm_config:
457
+ slurm_config = get_default_slurm_config()
458
+ if task_files is None:
459
+ task_files = self.get_default_task_files()
460
+
461
+ # Include common_script_lines in extra_lines
462
+ logger.debug(
463
+ f"Adding {self.common_script_lines=} to "
464
+ f"{slurm_config.extra_lines=}, from map method."
465
+ )
466
+ current_extra_lines = slurm_config.extra_lines or []
467
+ slurm_config.extra_lines = (
468
+ current_extra_lines + self.common_script_lines
469
+ )
470
+
471
+ # Set file prefixes
472
+ general_slurm_file_prefix = str(task_files.task_order)
473
+
474
+ # Transform iterable into a list and count its elements
475
+ list_args = list(iterable)
476
+ tot_tasks = len(list_args)
477
+
478
+ # Set/validate parameters for task batching
479
+ tasks_per_job, parallel_tasks_per_job = heuristics(
480
+ # Number of parallel components (always known)
481
+ tot_tasks=len(list_args),
482
+ # Optional WorkflowTask attributes:
483
+ tasks_per_job=slurm_config.tasks_per_job,
484
+ parallel_tasks_per_job=slurm_config.parallel_tasks_per_job, # noqa
485
+ # Task requirements (multiple possible sources):
486
+ cpus_per_task=slurm_config.cpus_per_task,
487
+ mem_per_task=slurm_config.mem_per_task_MB,
488
+ # Fractal configuration variables (soft/hard limits):
489
+ target_cpus_per_job=slurm_config.target_cpus_per_job,
490
+ target_mem_per_job=slurm_config.target_mem_per_job,
491
+ target_num_jobs=slurm_config.target_num_jobs,
492
+ max_cpus_per_job=slurm_config.max_cpus_per_job,
493
+ max_mem_per_job=slurm_config.max_mem_per_job,
494
+ max_num_jobs=slurm_config.max_num_jobs,
495
+ )
496
+ slurm_config.parallel_tasks_per_job = parallel_tasks_per_job
497
+ slurm_config.tasks_per_job = tasks_per_job
498
+
499
+ # Divide arguments in batches of `n_tasks_per_script` tasks each
500
+ args_batches = []
501
+ batch_size = tasks_per_job
502
+ for ind_chunk in range(0, tot_tasks, batch_size):
503
+ args_batches.append(
504
+ list_args[ind_chunk : ind_chunk + batch_size] # noqa
505
+ )
506
+ if len(args_batches) != math.ceil(tot_tasks / tasks_per_job):
507
+ raise RuntimeError("Something wrong here while batching tasks")
508
+
509
+ # Fetch configuration variable
510
+ settings = Inject(get_settings)
511
+ FRACTAL_SLURM_SBATCH_SLEEP = settings.FRACTAL_SLURM_SBATCH_SLEEP
512
+
513
+ logger.debug("[map] Job preparation - START")
514
+ current_component_index = 0
515
+ jobs_to_submit = []
516
+ for ind_batch, batch in enumerate(args_batches):
517
+ batch_size = len(batch)
518
+ this_slurm_file_prefix = (
519
+ f"{general_slurm_file_prefix}_batch_{ind_batch:06d}"
520
+ )
521
+ new_job_to_submit = self._prepare_job(
522
+ fn,
523
+ slurm_config=slurm_config,
524
+ slurm_file_prefix=this_slurm_file_prefix,
525
+ task_files=task_files,
526
+ single_task_submission=False,
527
+ components=batch,
528
+ )
529
+ jobs_to_submit.append(new_job_to_submit)
530
+ current_component_index += batch_size
531
+ logger.debug("[map] Job preparation - END")
532
+
533
+ try:
534
+ self._put_subfolder_sftp(jobs=jobs_to_submit)
535
+ except NoValidConnectionsError as e:
536
+ logger.error("NoValidConnectionError")
537
+ logger.error(f"{str(e)=}")
538
+ logger.error(f"{e.errors=}")
539
+ for err in e.errors:
540
+ logger.error(f"{str(err)}")
541
+
542
+ raise e
543
+
544
+ # Construct list of futures (one per SLURM job, i.e. one per batch)
545
+ # FIXME SSH: we may create a single `_submit_many_jobs` method to
546
+ # reduce the number of commands run over SSH
547
+ logger.debug("[map] Job submission - START")
548
+ fs = []
549
+ job_ids = []
550
+ for job in jobs_to_submit:
551
+ future, job_id = self._submit_job(job)
552
+ job_ids.append(job_id)
553
+ fs.append(future)
554
+ time.sleep(FRACTAL_SLURM_SBATCH_SLEEP)
555
+ for job_id in job_ids:
556
+ self.wait_thread.wait(job_id=job_id)
557
+ logger.debug("[map] Job submission - END")
558
+
559
+ # Yield must be hidden in closure so that the futures are submitted
560
+ # before the first iterator value is required.
561
+ # NOTE: In this custom map() method, _result_or_cancel(fs.pop()) is an
562
+ # iterable of results (if successful), and we should yield its elements
563
+ # rather than the whole iterable.
564
+ def result_iterator():
565
+ """
566
+ This function is based on the Python Standard Library 3.11.
567
+ Original Copyright 2009 Brian Quinlan. All Rights Reserved.
568
+ Licensed to PSF under a Contributor Agreement.
569
+ """
570
+ try:
571
+ # reverse to keep finishing order
572
+ fs.reverse()
573
+ while fs:
574
+ # Careful not to keep a reference to the popped future
575
+ results = _result_or_cancel(fs.pop())
576
+ for res in results:
577
+ yield res
578
+ finally:
579
+ for future in fs:
580
+ future.cancel()
581
+
582
+ return result_iterator()
583
+
584
+ def _prepare_job(
585
+ self,
586
+ fun: Callable[..., Any],
587
+ slurm_file_prefix: str,
588
+ task_files: TaskFiles,
589
+ slurm_config: SlurmConfig,
590
+ single_task_submission: bool = False,
591
+ args: Optional[Sequence[Any]] = None,
592
+ kwargs: Optional[dict] = None,
593
+ components: Optional[list[Any]] = None,
594
+ ) -> SlurmJob:
595
+ """
596
+ Prepare a SLURM job locally, without submitting it
597
+
598
+ This function prepares and writes the local submission script, but it
599
+ does not transfer it to the SLURM cluster.
600
+
601
+ NOTE: this method has different behaviors when it is called from the
602
+ `self.submit` or `self.map` methods (which is also encoded in
603
+ `single_task_submission`):
604
+
605
+ * When called from `self.submit`, it supports general `args` and
606
+ `kwargs` arguments;
607
+ * When called from `self.map`, there cannot be any `args` or `kwargs`
608
+ argument, but there must be a `components` argument.
609
+
610
+ Arguments:
611
+ fun:
612
+ slurm_file_prefix:
613
+ task_files:
614
+ slurm_config:
615
+ single_task_submission:
616
+ args:
617
+ kwargs:
618
+ components:
619
+
620
+ Returns:
621
+ SlurmJob object
622
+ """
623
+
624
+ # Inject SLURM account (if set) into slurm_config
625
+ if self.slurm_account:
626
+ slurm_config.account = self.slurm_account
627
+
628
+ # Define slurm-job-related files
629
+ if single_task_submission:
630
+ if components is not None:
631
+ raise ValueError(
632
+ f"{single_task_submission=} but components is not None"
633
+ )
634
+ job = SlurmJob(
635
+ slurm_file_prefix=slurm_file_prefix,
636
+ num_tasks_tot=1,
637
+ slurm_config=slurm_config,
638
+ )
639
+ if job.num_tasks_tot > 1:
640
+ raise ValueError(
641
+ "{single_task_submission=} but {job.num_tasks_tot=}"
642
+ )
643
+ job.single_task_submission = True
644
+ job.wftask_file_prefixes = (task_files.file_prefix,)
645
+ job.wftask_subfolder_name = task_files.subfolder_name
646
+
647
+ else:
648
+ if not components or len(components) < 1:
649
+ raise ValueError(
650
+ "In FractalSlurmSSHExecutor._submit_job, given "
651
+ f"{components=}."
652
+ )
653
+ num_tasks_tot = len(components)
654
+ job = SlurmJob(
655
+ slurm_file_prefix=slurm_file_prefix,
656
+ num_tasks_tot=num_tasks_tot,
657
+ slurm_config=slurm_config,
658
+ )
659
+
660
+ _prefixes = []
661
+ _subfolder_names = []
662
+ for component in components:
663
+ if isinstance(component, dict):
664
+ actual_component = component.get(_COMPONENT_KEY_, None)
665
+ else:
666
+ actual_component = component
667
+ _task_file_paths = get_task_file_paths(
668
+ workflow_dir_local=task_files.workflow_dir_local,
669
+ workflow_dir_remote=task_files.workflow_dir_remote,
670
+ task_name=task_files.task_name,
671
+ task_order=task_files.task_order,
672
+ component=actual_component,
673
+ )
674
+ _prefixes.append(_task_file_paths.file_prefix)
675
+ _subfolder_names.append(_task_file_paths.subfolder_name)
676
+ job.wftask_file_prefixes = tuple(_prefixes)
677
+
678
+ # Check that all components share the same subfolder
679
+ num_subfolders = len(set(_subfolder_names))
680
+ if num_subfolders != 1:
681
+ error_msg_short = (
682
+ f"[_submit_job] Subfolder list has {num_subfolders} "
683
+ "different values, but it must have only one (since "
684
+ "workflow tasks are executed one by one)."
685
+ )
686
+ error_msg_detail = (
687
+ "[_submit_job] Current unique subfolder names: "
688
+ f"{set(_subfolder_names)}"
689
+ )
690
+ logger.error(error_msg_short)
691
+ logger.error(error_msg_detail)
692
+ raise ValueError(error_msg_short)
693
+ job.wftask_subfolder_name = _subfolder_names[0]
694
+
695
+ # Check that server-side subfolder exists
696
+ subfolder_path = self.workflow_dir_local / job.wftask_subfolder_name
697
+ if not subfolder_path.exists():
698
+ raise FileNotFoundError(
699
+ f"Missing folder {subfolder_path.as_posix()}."
700
+ )
701
+
702
+ # Define I/O pickle file local/remote paths
703
+ job.input_pickle_files_local = tuple(
704
+ self.get_input_pickle_file_path_local(
705
+ arg=job.workerids[ind],
706
+ subfolder_name=job.wftask_subfolder_name,
707
+ prefix=job.wftask_file_prefixes[ind],
708
+ )
709
+ for ind in range(job.num_tasks_tot)
710
+ )
711
+ job.input_pickle_files_remote = tuple(
712
+ self.get_input_pickle_file_path_remote(
713
+ arg=job.workerids[ind],
714
+ subfolder_name=job.wftask_subfolder_name,
715
+ prefix=job.wftask_file_prefixes[ind],
716
+ )
717
+ for ind in range(job.num_tasks_tot)
718
+ )
719
+ job.output_pickle_files_local = tuple(
720
+ self.get_output_pickle_file_path_local(
721
+ arg=job.workerids[ind],
722
+ subfolder_name=job.wftask_subfolder_name,
723
+ prefix=job.wftask_file_prefixes[ind],
724
+ )
725
+ for ind in range(job.num_tasks_tot)
726
+ )
727
+ job.output_pickle_files_remote = tuple(
728
+ self.get_output_pickle_file_path_remote(
729
+ arg=job.workerids[ind],
730
+ subfolder_name=job.wftask_subfolder_name,
731
+ prefix=job.wftask_file_prefixes[ind],
732
+ )
733
+ for ind in range(job.num_tasks_tot)
734
+ )
735
+
736
+ # Define SLURM-job file local/remote paths
737
+ job.slurm_script_local = self.get_slurm_script_file_path_local(
738
+ subfolder_name=job.wftask_subfolder_name,
739
+ prefix=job.slurm_file_prefix,
740
+ )
741
+ job.slurm_script_remote = self.get_slurm_script_file_path_remote(
742
+ subfolder_name=job.wftask_subfolder_name,
743
+ prefix=job.slurm_file_prefix,
744
+ )
745
+ job.slurm_stdout_local = self.get_slurm_stdout_file_path_local(
746
+ subfolder_name=job.wftask_subfolder_name,
747
+ prefix=job.slurm_file_prefix,
748
+ )
749
+ job.slurm_stdout_remote = self.get_slurm_stdout_file_path_remote(
750
+ subfolder_name=job.wftask_subfolder_name,
751
+ prefix=job.slurm_file_prefix,
752
+ )
753
+ job.slurm_stderr_local = self.get_slurm_stderr_file_path_local(
754
+ subfolder_name=job.wftask_subfolder_name,
755
+ prefix=job.slurm_file_prefix,
756
+ )
757
+ job.slurm_stderr_remote = self.get_slurm_stderr_file_path_remote(
758
+ subfolder_name=job.wftask_subfolder_name,
759
+ prefix=job.slurm_file_prefix,
760
+ )
761
+
762
+ # Dump serialized versions+function+args+kwargs to pickle file(s)
763
+ versions = get_versions()
764
+ if job.single_task_submission:
765
+ _args = args or []
766
+ _kwargs = kwargs or {}
767
+ funcser = cloudpickle.dumps((versions, fun, _args, _kwargs))
768
+ with open(job.input_pickle_files_local[0], "wb") as f:
769
+ f.write(funcser)
770
+ else:
771
+ for ind_component, component in enumerate(components):
772
+ _args = [component]
773
+ _kwargs = {}
774
+ funcser = cloudpickle.dumps((versions, fun, _args, _kwargs))
775
+ with open(
776
+ job.input_pickle_files_local[ind_component], "wb"
777
+ ) as f:
778
+ f.write(funcser)
779
+
780
+ # Prepare commands to be included in SLURM submission script
781
+ cmdlines = []
782
+ for ind_task in range(job.num_tasks_tot):
783
+ input_pickle_file = job.input_pickle_files_remote[ind_task]
784
+ output_pickle_file = job.output_pickle_files_remote[ind_task]
785
+ cmdlines.append(
786
+ (
787
+ f"{self.python_remote}"
788
+ " -m fractal_server.app.runner.executors.slurm.remote "
789
+ f"--input-file {input_pickle_file} "
790
+ f"--output-file {output_pickle_file}"
791
+ )
792
+ )
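+ # Editor's note: one generated command line looks like (paths are placeholders)
+ #   <python_remote> -m fractal_server.app.runner.executors.slurm.remote \
+ #       --input-file <remote subfolder>/<prefix>_in_<workerid>.pickle \
+ #       --output-file <remote subfolder>/<prefix>_out_<workerid>.pickle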
793
+
794
+ # Prepare SLURM submission script
795
+ sbatch_script_content = self._prepare_sbatch_script(
796
+ slurm_config=job.slurm_config,
797
+ list_commands=cmdlines,
798
+ slurm_out_path=str(job.slurm_stdout_remote),
799
+ slurm_err_path=str(job.slurm_stderr_remote),
800
+ )
801
+ with job.slurm_script_local.open("w") as f:
802
+ f.write(sbatch_script_content)
803
+
804
+ return job
805
+
806
+ def _put_subfolder_sftp(self, jobs: list[SlurmJob]) -> None:
807
+ """
808
+ Transfer the jobs subfolder to the remote host.
809
+
810
+ Arguments:
811
+ jobs: The list of `SlurmJob` objects associated with a given
812
+ subfolder.
813
+ """
814
+
815
+ # Check that the subfolder is unique
816
+ subfolder_names = [job.wftask_subfolder_name for job in jobs]
817
+ if len(set(subfolder_names)) > 1:
818
+ raise ValueError(
819
+ "[_put_subfolder] Invalid list of jobs, "
820
+ f"{set(subfolder_names)=}."
821
+ )
822
+ subfolder_name = subfolder_names[0]
823
+
824
+ # Create compressed subfolder archive (locally)
825
+ local_subfolder = self.workflow_dir_local / subfolder_name
826
+ tarfile_name = f"{subfolder_name}.tar.gz"
827
+ tarfile_path_local = (
828
+ self.workflow_dir_local / tarfile_name
829
+ ).as_posix()
830
+ tarfile_path_remote = (
831
+ self.workflow_dir_remote / tarfile_name
832
+ ).as_posix()
833
+ with tarfile.open(tarfile_path_local, "w:gz") as tar:
834
+ for this_file in local_subfolder.glob("*"):
835
+ tar.add(this_file, arcname=this_file.name)
836
+ logger.info(f"Subfolder archive created at {tarfile_path_local}")
837
+
838
+ # Transfer archive
839
+ t_0_put = time.perf_counter()
840
+ self.fractal_ssh.put(
841
+ local=tarfile_path_local,
842
+ remote=tarfile_path_remote,
843
+ )
844
+ t_1_put = time.perf_counter()
845
+ logger.info(
846
+ f"Subfolder archive transferred to {tarfile_path_remote}"
847
+ f" - elapsed: {t_1_put - t_0_put:.3f} s"
848
+ )
849
+ # Uncompress archive (remotely)
850
+ tar_command = (
851
+ f"{self.python_remote} -m "
852
+ "fractal_server.app.runner.extract_archive "
853
+ f"{tarfile_path_remote}"
854
+ )
855
+ run_command_over_ssh(cmd=tar_command, fractal_ssh=self.fractal_ssh)
856
+
857
+ # Remove local version
858
+ t_0_rm = time.perf_counter()
859
+ Path(tarfile_path_local).unlink()
860
+ t_1_rm = time.perf_counter()
861
+ logger.info(
862
+ f"Local archive removed - elapsed: {t_1_rm - t_0_rm:.3f} s"
863
+ )
864
+
865
+ def _submit_job(self, job: SlurmJob) -> tuple[Future, str]:
866
+ """
867
+ Submit a job to SLURM via SSH.
868
+
869
+ This method must always be called after `self._put_subfolder`.
870
+
871
+ Arguments:
872
+ job: The `SlurmJob` object to submit.
873
+ """
874
+
875
+ # Submit job to SLURM, and get jobid
876
+ sbatch_command = f"sbatch --parsable {job.slurm_script_remote}"
877
+ sbatch_stdout = run_command_over_ssh(
878
+ cmd=sbatch_command,
879
+ fractal_ssh=self.fractal_ssh,
880
+ )
881
+
882
+ # Extract SLURM job ID from stdout
883
+ try:
884
+ stdout = sbatch_stdout.strip("\n")
885
+ jobid = int(stdout)
886
+ except ValueError as e:
887
+ error_msg = (
888
+ f"Submit command `{sbatch_command}` returned "
889
+ f"`{stdout=}` which cannot be cast to an integer "
890
+ f"SLURM-job ID. Original error:\n{str(e)}"
891
+ )
892
+ logger.error(error_msg)
893
+ raise JobExecutionError(info=error_msg)
894
+ job_id_str = str(jobid)
895
+
896
+ # Plug job id in stdout/stderr SLURM file paths (local and remote)
897
+ def _replace_job_id(_old_path: Path) -> Path:
898
+ return Path(_old_path.as_posix().replace("%j", job_id_str))
899
+
900
+ job.slurm_stdout_local = _replace_job_id(job.slurm_stdout_local)
901
+ job.slurm_stdout_remote = _replace_job_id(job.slurm_stdout_remote)
902
+ job.slurm_stderr_local = _replace_job_id(job.slurm_stderr_local)
903
+ job.slurm_stderr_remote = _replace_job_id(job.slurm_stderr_remote)
904
+
905
+ # Add the SLURM script/out/err paths to map_jobid_to_slurm_files (this
906
+ # must be after the `sbatch` call, so that "%j" has already been
907
+ # replaced with the job ID)
908
+ with self.jobs_lock:
909
+ self.map_jobid_to_slurm_files_local[job_id_str] = (
910
+ job.slurm_script_local.as_posix(),
911
+ job.slurm_stdout_local.as_posix(),
912
+ job.slurm_stderr_local.as_posix(),
913
+ )
914
+
915
+ # Create future
916
+ future = Future()
917
+ with self.jobs_lock:
918
+ self.jobs[job_id_str] = (future, job)
919
+ return future, job_id_str
920
+
921
+ def _prepare_JobExecutionError(
922
+ self, jobid: str, info: str
923
+ ) -> JobExecutionError:
924
+ """
925
+ Prepare the `JobExecutionError` for a given job
926
+
927
+ This method creates a `JobExecutionError` object and sets its attribute
928
+ to the appropriate SLURM-related file names. Note that the SLURM files
929
+ are the local ones (i.e. the ones in `self.workflow_dir_local`).
930
+
931
+ Arguments:
932
+ jobid:
933
+ ID of the SLURM job.
934
+ info:
935
+ """
936
+ # Extract SLURM file paths
937
+ with self.jobs_lock:
938
+ (
939
+ slurm_script_file,
940
+ slurm_stdout_file,
941
+ slurm_stderr_file,
942
+ ) = self.map_jobid_to_slurm_files_local[jobid]
943
+ # Construct JobExecutionError exception
944
+ job_exc = JobExecutionError(
945
+ cmd_file=slurm_script_file,
946
+ stdout_file=slurm_stdout_file,
947
+ stderr_file=slurm_stderr_file,
948
+ info=info,
949
+ )
950
+ return job_exc
951
+
952
+ def _missing_pickle_error_msg(self, out_path: Path) -> str:
953
+ settings = Inject(get_settings)
954
+ info = (
955
+ "Output pickle file of the FractalSlurmSSHExecutor "
956
+ "job not found.\n"
957
+ f"Expected file path: {out_path.as_posix()}n"
958
+ "Here are some possible reasons:\n"
959
+ "1. The SLURM job was scancel-ed, either by the user "
960
+ "or due to an error (e.g. an out-of-memory or timeout "
961
+ "error). Note that if the scancel took place before "
962
+ "the job started running, the SLURM out/err files "
963
+ "will be empty.\n"
964
+ "2. Some error occurred upon writing the file to disk "
965
+ "(e.g. because there is not enough space on disk, or "
966
+ "due to an overloaded NFS filesystem). "
967
+ "Note that the server configuration has "
968
+ "FRACTAL_SLURM_ERROR_HANDLING_INTERVAL="
969
+ f"{settings.FRACTAL_SLURM_ERROR_HANDLING_INTERVAL} "
970
+ "seconds.\n"
971
+ )
972
+ return info
973
+
974
+ def _handle_remaining_jobs(
975
+ self,
976
+ remaining_futures: list[Future],
977
+ remaining_job_ids: list[str],
978
+ remaining_jobs: list[SlurmJob],
979
+ ) -> None:
980
+ """
981
+ Helper function used within _completion, when looping over a list of
982
+ several jobs/futures.
983
+ """
984
+ for future in remaining_futures:
985
+ try:
986
+ future.cancel()
987
+ except InvalidStateError:
988
+ pass
989
+ for job_id in remaining_job_ids:
990
+ self._cleanup(job_id)
991
+ if not self.keep_pickle_files:
992
+ for job in remaining_jobs:
993
+ for path in job.output_pickle_files_local:
994
+ path.unlink()
995
+ for path in job.input_pickle_files_local:
996
+ path.unlink()
997
+
998
+ def _completion(self, job_ids: list[str]) -> None:
999
+ """
1000
+ Callback function to be executed whenever a job finishes.
1001
+
1002
+ This function is executed by self.wait_thread (triggered by either
1003
+ finding an existing output pickle file `out_path` or finding that the
1004
+ SLURM job is over). Since this takes place on a different thread,
1005
+ failures may not be captured by the main thread; we use a broad
1006
+ try/except block, so that those exceptions are reported to the main
1007
+ thread via `fut.set_exception(...)`.
1008
+
1009
+ Arguments:
1010
+ job_ids: IDs of the SLURM jobs
1011
+ """
1012
+
1013
+ # Loop over all job_ids, and fetch future and job objects
1014
+ futures: list[Future] = []
1015
+ jobs: list[SlurmJob] = []
1016
+ with self.jobs_lock:
1017
+ for job_id in job_ids:
1018
+ future, job = self.jobs.pop(job_id)
1019
+ futures.append(future)
1020
+ jobs.append(job)
1021
+ if not self.jobs:
1022
+ self.jobs_empty_cond.notify_all()
1023
+
1024
+ # Fetch subfolder from remote host
1025
+ try:
1026
+ self._get_subfolder_sftp(jobs=jobs)
1027
+ except NoValidConnectionsError as e:
1028
+ logger.error("NoValidConnectionError")
1029
+ logger.error(f"{str(e)=}")
1030
+ logger.error(f"{e.errors=}")
1031
+ for err in e.errors:
1032
+ logger.error(f"{str(err)}")
1033
+
1034
+ raise e
1035
+
1036
+ # First round of checking whether all output files exist
1037
+ missing_out_paths = []
1038
+ for job in jobs:
1039
+ for ind_out_path, out_path in enumerate(
1040
+ job.output_pickle_files_local
1041
+ ):
1042
+ if not out_path.exists():
1043
+ missing_out_paths.append(out_path)
1044
+ num_missing = len(missing_out_paths)
1045
+ if num_missing > 0:
1046
+ # Output pickle files may be missing e.g. because of some slow
1047
+ # filesystem operation; wait some time before re-trying
1048
+ settings = Inject(get_settings)
1049
+ sleep_time = settings.FRACTAL_SLURM_ERROR_HANDLING_INTERVAL
1050
+ logger.info(
1051
+ f"{num_missing} output pickle files are missing; "
1052
+ f"sleep {sleep_time} seconds."
1053
+ )
1054
+ for missing_file in missing_out_paths:
1055
+ logger.debug(f"Missing output pickle file: {missing_file}")
1056
+ time.sleep(sleep_time)
1057
+
1058
+ # Handle all jobs
1059
+ for ind_job, job_id in enumerate(job_ids):
1060
+ try:
1061
+ # Retrieve job and future objects
1062
+ job = jobs[ind_job]
1063
+ future = futures[ind_job]
1064
+ remaining_job_ids = job_ids[ind_job + 1 :] # noqa: E203
1065
+ remaining_futures = futures[ind_job + 1 :] # noqa: E203
1066
+
1067
+ outputs = []
1068
+
1069
+ for ind_out_path, out_path in enumerate(
1070
+ job.output_pickle_files_local
1071
+ ):
1072
+ in_path = job.input_pickle_files_local[ind_out_path]
1073
+ if not out_path.exists():
1074
+ # Output pickle file is still missing
1075
+ info = self._missing_pickle_error_msg(out_path)
1076
+ job_exc = self._prepare_JobExecutionError(
1077
+ job_id, info=info
1078
+ )
1079
+ try:
1080
+ future.set_exception(job_exc)
1081
+ self._handle_remaining_jobs(
1082
+ remaining_futures=remaining_futures,
1083
+ remaining_job_ids=remaining_job_ids,
1084
+ )
1085
+ return
1086
+ except InvalidStateError:
1087
+ logger.warning(
1088
+ f"Future {future} (SLURM job ID: {job_id}) "
1089
+ "was already cancelled."
1090
+ )
1091
+ if not self.keep_pickle_files:
1092
+ in_path.unlink()
1093
+ self._cleanup(job_id)
1094
+ self._handle_remaining_jobs(
1095
+ remaining_futures=remaining_futures,
1096
+ remaining_job_ids=remaining_job_ids,
1097
+ )
1098
+ return
1099
+
1100
+ # Read the task output
1101
+ with out_path.open("rb") as f:
1102
+ outdata = f.read()
1103
+ # Note: output can be either the task result (typically a
1104
+ # dictionary) or an ExceptionProxy object; in the latter
1105
+ # case, the ExceptionProxy definition is also part of the
1106
+ # pickle file (thanks to cloudpickle.dumps).
1107
+ success, output = cloudpickle.loads(outdata)
1108
+ try:
1109
+ if success:
1110
+ outputs.append(output)
1111
+ else:
1112
+ proxy = output
1113
+ if proxy.exc_type_name == "JobExecutionError":
1114
+ job_exc = self._prepare_JobExecutionError(
1115
+ job_id, info=proxy.kwargs.get("info", None)
1116
+ )
1117
+ future.set_exception(job_exc)
1118
+ self._handle_remaining_jobs(
1119
+ remaining_futures=remaining_futures,
1120
+ remaining_job_ids=remaining_job_ids,
1121
+ )
1122
+ return
1123
+ else:
1124
+ # This branch catches both TaskExecutionError's
1125
+ # (coming from the typical fractal-server
1126
+ # execution of tasks, and with additional
1127
+ # fractal-specific kwargs) or arbitrary
1128
+ # exceptions (coming from a direct use of
1129
+ # FractalSlurmSSHExecutor, possibly outside
1130
+ # fractal-server)
1131
+ kwargs = {}
1132
+ for key in [
1133
+ "workflow_task_id",
1134
+ "workflow_task_order",
1135
+ "task_name",
1136
+ ]:
1137
+ if key in proxy.kwargs.keys():
1138
+ kwargs[key] = proxy.kwargs[key]
1139
+ exc = TaskExecutionError(proxy.tb, **kwargs)
1140
+ future.set_exception(exc)
1141
+ self._handle_remaining_jobs(
1142
+ remaining_futures=remaining_futures,
1143
+ remaining_job_ids=remaining_job_ids,
1144
+ )
1145
+ return
1146
+ if not self.keep_pickle_files:
1147
+ out_path.unlink()
1148
+ except InvalidStateError:
1149
+ logger.warning(
1150
+ f"Future {future} (SLURM job ID: {job_id}) was "
1151
+ "already cancelled, exit from "
1152
+ "FractalSlurmSSHExecutor._completion."
1153
+ )
1154
+ if not self.keep_pickle_files:
1155
+ out_path.unlink()
1156
+ in_path.unlink()
1157
+
1158
+ self._cleanup(job_id)
1159
+ self._handle_remaining_jobs(
1160
+ remaining_futures=remaining_futures,
1161
+ remaining_job_ids=remaining_job_ids,
1162
+ )
1163
+ return
1164
+
1165
+ # Clean up input pickle file
1166
+ if not self.keep_pickle_files:
1167
+ in_path.unlink()
1168
+ self._cleanup(job_id)
1169
+ if job.single_task_submission:
1170
+ future.set_result(outputs[0])
1171
+ else:
1172
+ future.set_result(outputs)
1173
+
1174
+ except Exception as e:
1175
+ try:
1176
+ future.set_exception(e)
1177
+ return
1178
+ except InvalidStateError:
1179
+ logger.warning(
1180
+ f"Future {future} (SLURM job ID: {job_id}) was already"
1181
+ " cancelled, exit from"
1182
+ " FractalSlurmSSHExecutor._completion."
1183
+ )
1184
+
1185
+ def _get_subfolder_sftp(self, jobs: list[SlurmJob]) -> None:
1186
+ """
1187
+ Fetch a remote folder via tar+sftp+tar
1188
+
1189
+ Arguments:
1190
+ jobs:
1191
+ List of `SlurmJob` objects (needed for their prefixes-related attributes).
1192
+ """
1193
+
1194
+ # Check that the subfolder is unique
1195
+ subfolder_names = [job.wftask_subfolder_name for job in jobs]
1196
+ if len(set(subfolder_names)) > 1:
1197
+ raise ValueError(
1198
+ "[_put_subfolder] Invalid list of jobs, "
1199
+ f"{set(subfolder_names)=}."
1200
+ )
1201
+ subfolder_name = subfolder_names[0]
1202
+
1203
+ t_0 = time.perf_counter()
1204
+ logger.debug("[_get_subfolder_sftp] Start")
1205
+ tarfile_path_local = (
1206
+ self.workflow_dir_local / f"{subfolder_name}.tar.gz"
1207
+ ).as_posix()
1208
+ tarfile_path_remote = (
1209
+ self.workflow_dir_remote / f"{subfolder_name}.tar.gz"
1210
+ ).as_posix()
1211
+
1212
+ # Remove local tarfile - FIXME SSH: is this needed?
1213
+ logger.warning(f"In principle I just removed {tarfile_path_local}")
1214
+ logger.warning(f"{Path(tarfile_path_local).exists()=}")
1215
+
1216
+ # Remove remote tarfile - FIXME SSH: is this needed?
1217
+ # rm_command = f"rm {tarfile_path_remote}"
1218
+ # _run_command_over_ssh(cmd=rm_command, fractal_ssh=self.fractal_ssh)
1219
+ logger.warning(f"Unlink {tarfile_path_remote=} - START")
1220
+ self.fractal_ssh.sftp().unlink(tarfile_path_remote)
1221
+ logger.warning(f"Unlink {tarfile_path_remote=} - STOP")
1222
+
1223
+ # Create remote tarfile
1224
+ tar_command = (
1225
+ f"{self.python_remote} "
1226
+ "-m fractal_server.app.runner.compress_folder "
1227
+ f"{(self.workflow_dir_remote / subfolder_name).as_posix()}"
1228
+ )
1229
+ stdout = run_command_over_ssh(
1230
+ cmd=tar_command, fractal_ssh=self.fractal_ssh
1231
+ )
1232
+ print(stdout)
1233
+
1234
+ # Fetch tarfile
1235
+ t_0_get = time.perf_counter()
1236
+ self.fractal_ssh.get(
1237
+ remote=tarfile_path_remote,
1238
+ local=tarfile_path_local,
1239
+ )
1240
+ t_1_get = time.perf_counter()
1241
+ logger.info(
1242
+ f"Subfolder archive transferred back to {tarfile_path_local}"
1243
+ f" - elapsed: {t_1_get - t_0_get:.3f} s"
1244
+ )
1245
+
1246
+ # Extract tarfile locally
1247
+ with tarfile.open(tarfile_path_local) as tar:
1248
+ tar.extractall(path=(self.workflow_dir_local / subfolder_name))
1249
+
1250
+ t_1 = time.perf_counter()
1251
+ logger.info("[_get_subfolder_sftp] End - " f"elapsed: {t_1-t_0:.3f} s")
1252
+
1253
+ def _prepare_sbatch_script(
1254
+ self,
1255
+ *,
1256
+ list_commands: list[str],
1257
+ slurm_out_path: str,
1258
+ slurm_err_path: str,
1259
+ slurm_config: SlurmConfig,
1260
+ ):
1261
+
1262
+ num_tasks_max_running = slurm_config.parallel_tasks_per_job
1263
+ mem_per_task_MB = slurm_config.mem_per_task_MB
1264
+
1265
+ # Set ntasks
1266
+ ntasks = min(len(list_commands), num_tasks_max_running)
1267
+ if len(list_commands) < num_tasks_max_running:
1268
+ ntasks = len(list_commands)
1269
+ slurm_config.parallel_tasks_per_job = ntasks
1270
+ logger.debug(
1271
+ f"{len(list_commands)=} is smaller than "
1272
+ f"{num_tasks_max_running=}. Setting {ntasks=}."
1273
+ )
1274
+
1275
+ # Prepare SLURM preamble based on SlurmConfig object
1276
+ script_lines = slurm_config.to_sbatch_preamble(
1277
+ remote_export_dir=self.workflow_dir_remote.as_posix()
1278
+ )
1279
+
1280
+ # Extend SLURM preamble with variable which are not in SlurmConfig, and
1281
+ # fix their order
1282
+ script_lines.extend(
1283
+ [
1284
+ f"#SBATCH --err={slurm_err_path}",
1285
+ f"#SBATCH --out={slurm_out_path}",
1286
+ f"#SBATCH -D {self.workflow_dir_remote}",
1287
+ ]
1288
+ )
1289
+ script_lines = slurm_config.sort_script_lines(script_lines)
1290
+ logger.debug(script_lines)
1291
+
1292
+ # Always print output of `pwd`
1293
+ script_lines.append('echo "Working directory (pwd): `pwd`"\n')
1294
+
1295
+ # Complete script preamble
1296
+ script_lines.append("\n")
1297
+
1298
+ # Include command lines
1299
+ tmp_list_commands = copy(list_commands)
1300
+ while tmp_list_commands:
1301
+ if tmp_list_commands:
1302
+ cmd = tmp_list_commands.pop(0) # take first element
1303
+ script_lines.append(
1304
+ "srun --ntasks=1 --cpus-per-task=$SLURM_CPUS_PER_TASK "
1305
+ f"--mem={mem_per_task_MB}MB "
1306
+ f"{cmd} &"
1307
+ )
1308
+ script_lines.append("wait\n")
1309
+
1310
+ script = "\n".join(script_lines)
1311
+ return script
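+ # Editor's note: the generated script has roughly this shape (the #SBATCH
+ # preamble produced by SlurmConfig.to_sbatch_preamble and the final line order
+ # are only indicative):
+ #
+ #   #SBATCH --err=<remote subfolder>/<prefix>_slurm_%j.err
+ #   #SBATCH --out=<remote subfolder>/<prefix>_slurm_%j.out
+ #   #SBATCH -D <workflow_dir_remote>
+ #   echo "Working directory (pwd): `pwd`"
+ #
+ #   srun --ntasks=1 --cpus-per-task=$SLURM_CPUS_PER_TASK --mem=<mem_per_task_MB>MB <cmd-1> &
+ #   srun --ntasks=1 --cpus-per-task=$SLURM_CPUS_PER_TASK --mem=<mem_per_task_MB>MB <cmd-2> &
+ #   wait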
1312
+
1313
+ def get_default_task_files(self) -> TaskFiles:
1314
+ """
1315
+ This will be called when self.submit or self.map are called from
1316
+ outside fractal-server, and then lack some optional arguments.
1317
+ """
1318
+ task_files = TaskFiles(
1319
+ workflow_dir_local=self.workflow_dir_local,
1320
+ workflow_dir_remote=self.workflow_dir_remote,
1321
+ task_order=None,
1322
+ task_name="name",
1323
+ )
1324
+ return task_files
1325
+
1326
+ def shutdown(self, wait=True, *, cancel_futures=False):
1327
+ """
1328
+ Clean up all executor variables. Note that this function is executed on
1329
+ the self.wait_thread thread, see _completion.
1330
+ """
1331
+
1332
+ logger.debug("Executor shutdown: start")
1333
+
1334
+ # Handle all job futures
1335
+ slurm_jobs_to_scancel = []
1336
+ with self.jobs_lock:
1337
+ while self.jobs:
1338
+ jobid, fut_and_job = self.jobs.popitem()
1339
+ slurm_jobs_to_scancel.append(jobid)
1340
+ fut = fut_and_job[0]
1341
+ self.map_jobid_to_slurm_files_local.pop(jobid)
1342
+ if not fut.cancelled():
1343
+ fut.set_exception(
1344
+ JobExecutionError(
1345
+ "Job cancelled due to executor shutdown."
1346
+ )
1347
+ )
1348
+ fut.cancel()
1349
+
1350
+ # Cancel SLURM jobs
1351
+ if slurm_jobs_to_scancel:
1352
+ scancel_string = " ".join(slurm_jobs_to_scancel)
1353
+ logger.warning(f"Now scancel-ing SLURM jobs {scancel_string}")
1354
+ scancel_command = f"scancel {scancel_string}"
1355
+ run_command_over_ssh(
1356
+ cmd=scancel_command, fractal_ssh=self.fractal_ssh
1357
+ )
1358
+ logger.debug("Executor shutdown: end")
1359
+
1360
+ def __exit__(self, *args, **kwargs):
1361
+ """
1362
+ See
1363
+ https://github.com/fractal-analytics-platform/fractal-server/issues/1508
1364
+ """
1365
+ logger.debug(
1366
+ "[FractalSlurmSSHExecutor.__exit__] Stop and join `wait_thread`"
1367
+ )
1368
+ self.wait_thread.stop()
1369
+ self.wait_thread.join()
1370
+ logger.debug("[FractalSlurmSSHExecutor.__exit__] End")
1371
+
1372
+ def run_squeue(self, job_ids):
1373
+ squeue_command = (
1374
+ "squeue "
1375
+ "--noheader "
1376
+ "--format='%i %T' "
1377
+ "--jobs __JOBS__ "
1378
+ "--states=all"
1379
+ )
1380
+ job_ids = ",".join([str(j) for j in job_ids])
1381
+ squeue_command = squeue_command.replace("__JOBS__", job_ids)
1382
+ stdout = run_command_over_ssh(
1383
+ cmd=squeue_command,
1384
+ fractal_ssh=self.fractal_ssh,
1385
+ )
1386
+ return stdout
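+ # Editor's note: for job_ids=[1, 2] the command is
+ #   squeue --noheader --format='%i %T' --jobs 1,2 --states=all
+ # and its stdout is a list of "<jobid> <state>" lines, e.g. "1 RUNNING".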
1387
+
1388
+ def _jobs_finished(self, job_ids: list[str]) -> set[str]:
1389
+ """
1390
+ Check which ones of the given Slurm jobs already finished
1391
+
1392
+ The function is based on the `_jobs_finished` function from
1393
+ clusterfutures (version 0.5).
1394
+ Original Copyright: 2022 Adrian Sampson
1395
+ (released under the MIT licence)
1396
+ """
1397
+
1398
+ from cfut.slurm import STATES_FINISHED
1399
+
1400
+ logger.debug(
1401
+ f"[FractalSlurmSSHExecutor._jobs_finished] START ({job_ids=})"
1402
+ )
1403
+
1404
+ # If there is no Slurm job to check, return right away
1405
+ if not job_ids:
1406
+ logger.debug(
1407
+ "[FractalSlurmSSHExecutor._jobs_finished] "
1408
+ "No jobs provided, return."
1409
+ )
1410
+ return set()
1411
+
1412
+ try:
1413
+ stdout = self.run_squeue(job_ids)
1414
+ id_to_state = {
1415
+ out.split()[0]: out.split()[1] for out in stdout.splitlines()
1416
+ }
1417
+ # Finished jobs only stay in squeue for a few mins (configurable).
1418
+ # If a job ID isn't there, we'll assume it's finished.
1419
+ output = {
1420
+ _id
1421
+ for _id in job_ids
1422
+ if id_to_state.get(_id, "COMPLETED") in STATES_FINISHED
1423
+ }
1424
+ logger.debug(
1425
+ f"[FractalSlurmSSHExecutor._jobs_finished] END - {output=}"
1426
+ )
1427
+ return output
1428
+ except Exception as e:
1429
+ # If something goes wrong, proceed anyway
1430
+ logger.error(
1431
+ f"Something wrong in _jobs_finished. Original error: {str(e)}"
1432
+ )
1433
+ output = set()
1434
+ logger.debug(
1435
+ f"[FractalSlurmSSHExecutor._jobs_finished] END - {output=}"
1436
+ )
1437
+ return output
1438
+
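+ # NOTE (editor): the block below is unreachable (both branches of the
+ # try/except above return) and appears to treat the string returned by
+ # run_squeue as a subprocess-run result (`.returncode`, `.stdout`).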
1439
+ id_to_state = dict()
1440
+ for j in job_ids:
1441
+ res = self.run_squeue([j])
1442
+ if res.returncode != 0:
1443
+ logger.info(f"Job {j} not found. Marked it as completed")
1444
+ id_to_state.update({str(j): "COMPLETED"})
1445
+ else:
1446
+ id_to_state.update(
1447
+ {res.stdout.split()[0]: res.stdout.split()[1]}
1448
+ )
1449
+
1450
+ def handshake(self) -> dict:
1451
+ """
1452
+ Healthcheck for SSH connection and for versions match.
1453
+
1454
+ FIXME SSH: We should add a timeout here
1455
+ FIXME SSH: We could include checks on the existence of folders
1456
+ FIXME SSH: We could include further checks on version matches
1457
+ """
1458
+
1459
+ self.fractal_ssh.check_connection()
1460
+
1461
+ t_start_handshake = time.perf_counter()
1462
+
1463
+ logger.info("[FractalSlurmSSHExecutor.ssh_handshake] START")
1464
+ cmd = f"{self.python_remote} -m fractal_server.app.runner.versions"
1465
+ stdout = run_command_over_ssh(cmd=cmd, fractal_ssh=self.fractal_ssh)
1466
+ remote_versions = json.loads(stdout.strip("\n"))
1467
+
1468
+ # Check compatibility with local versions
1469
+ local_versions = get_versions()
1470
+ remote_fractal_server = remote_versions["fractal_server"]
1471
+ local_fractal_server = local_versions["fractal_server"]
1472
+ if remote_fractal_server != local_fractal_server:
1473
+ error_msg = (
1474
+ "Fractal-server version mismatch.\n"
1475
+ "Local interpreter: "
1476
+ f"({sys.executable}): {local_versions}.\n"
1477
+ "Remote interpreter: "
1478
+ f"({self.python_remote}): {remote_versions}."
1479
+ )
1480
+ logger.error(error_msg)
1481
+ raise ValueError(error_msg)
1482
+
1483
+ t_end_handshake = time.perf_counter()
1484
+ logger.info(
1485
+ "[FractalSlurmSSHExecutor.ssh_handshake] END"
1486
+ f" - elapsed: {t_end_handshake-t_start_handshake:.3f} s"
1487
+ )
1488
+ return remote_versions