fractal-server 2.2.0a0__py3-none-any.whl → 2.3.0a0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69)
  1. fractal_server/__init__.py +1 -1
  2. fractal_server/app/db/__init__.py +1 -1
  3. fractal_server/app/models/v1/state.py +1 -2
  4. fractal_server/app/routes/admin/v1.py +2 -2
  5. fractal_server/app/routes/admin/v2.py +2 -2
  6. fractal_server/app/routes/api/v1/job.py +2 -2
  7. fractal_server/app/routes/api/v1/task_collection.py +4 -4
  8. fractal_server/app/routes/api/v2/__init__.py +23 -3
  9. fractal_server/app/routes/api/v2/job.py +2 -2
  10. fractal_server/app/routes/api/v2/submit.py +6 -0
  11. fractal_server/app/routes/api/v2/task_collection.py +74 -34
  12. fractal_server/app/routes/api/v2/task_collection_custom.py +144 -0
  13. fractal_server/app/routes/api/v2/task_collection_ssh.py +125 -0
  14. fractal_server/app/routes/aux/_runner.py +10 -2
  15. fractal_server/app/runner/compress_folder.py +120 -0
  16. fractal_server/app/runner/executors/slurm/__init__.py +0 -3
  17. fractal_server/app/runner/executors/slurm/_batching.py +0 -1
  18. fractal_server/app/runner/executors/slurm/_slurm_config.py +9 -9
  19. fractal_server/app/runner/executors/slurm/ssh/__init__.py +3 -0
  20. fractal_server/app/runner/executors/slurm/ssh/_executor_wait_thread.py +112 -0
  21. fractal_server/app/runner/executors/slurm/ssh/_slurm_job.py +120 -0
  22. fractal_server/app/runner/executors/slurm/ssh/executor.py +1490 -0
  23. fractal_server/app/runner/executors/slurm/sudo/__init__.py +3 -0
  24. fractal_server/app/runner/executors/slurm/{_check_jobs_status.py → sudo/_check_jobs_status.py} +1 -1
  25. fractal_server/app/runner/executors/slurm/{_executor_wait_thread.py → sudo/_executor_wait_thread.py} +1 -1
  26. fractal_server/app/runner/executors/slurm/{_subprocess_run_as_user.py → sudo/_subprocess_run_as_user.py} +1 -1
  27. fractal_server/app/runner/executors/slurm/{executor.py → sudo/executor.py} +12 -12
  28. fractal_server/app/runner/extract_archive.py +38 -0
  29. fractal_server/app/runner/v1/__init__.py +78 -40
  30. fractal_server/app/runner/v1/_slurm/__init__.py +1 -1
  31. fractal_server/app/runner/v2/__init__.py +183 -82
  32. fractal_server/app/runner/v2/_local_experimental/__init__.py +22 -12
  33. fractal_server/app/runner/v2/_local_experimental/executor.py +12 -8
  34. fractal_server/app/runner/v2/_slurm/__init__.py +1 -6
  35. fractal_server/app/runner/v2/_slurm_ssh/__init__.py +126 -0
  36. fractal_server/app/runner/v2/_slurm_ssh/_submit_setup.py +83 -0
  37. fractal_server/app/runner/v2/_slurm_ssh/get_slurm_config.py +182 -0
  38. fractal_server/app/runner/v2/runner_functions_low_level.py +9 -11
  39. fractal_server/app/runner/versions.py +30 -0
  40. fractal_server/app/schemas/v1/__init__.py +1 -0
  41. fractal_server/app/schemas/{state.py → v1/state.py} +4 -21
  42. fractal_server/app/schemas/v2/__init__.py +4 -1
  43. fractal_server/app/schemas/v2/task_collection.py +97 -27
  44. fractal_server/config.py +222 -21
  45. fractal_server/main.py +25 -1
  46. fractal_server/migrations/env.py +1 -1
  47. fractal_server/ssh/__init__.py +4 -0
  48. fractal_server/ssh/_fabric.py +190 -0
  49. fractal_server/tasks/utils.py +12 -64
  50. fractal_server/tasks/v1/background_operations.py +2 -2
  51. fractal_server/tasks/{endpoint_operations.py → v1/endpoint_operations.py} +7 -12
  52. fractal_server/tasks/v1/utils.py +67 -0
  53. fractal_server/tasks/v2/_TaskCollectPip.py +61 -32
  54. fractal_server/tasks/v2/_venv_pip.py +195 -0
  55. fractal_server/tasks/v2/background_operations.py +257 -295
  56. fractal_server/tasks/v2/background_operations_ssh.py +304 -0
  57. fractal_server/tasks/v2/endpoint_operations.py +136 -0
  58. fractal_server/tasks/v2/templates/_1_create_venv.sh +46 -0
  59. fractal_server/tasks/v2/templates/_2_upgrade_pip.sh +30 -0
  60. fractal_server/tasks/v2/templates/_3_pip_install.sh +32 -0
  61. fractal_server/tasks/v2/templates/_4_pip_freeze.sh +21 -0
  62. fractal_server/tasks/v2/templates/_5_pip_show.sh +59 -0
  63. fractal_server/tasks/v2/utils.py +54 -0
  64. {fractal_server-2.2.0a0.dist-info → fractal_server-2.3.0a0.dist-info}/METADATA +6 -2
  65. {fractal_server-2.2.0a0.dist-info → fractal_server-2.3.0a0.dist-info}/RECORD +68 -44
  66. fractal_server/tasks/v2/get_collection_data.py +0 -14
  67. {fractal_server-2.2.0a0.dist-info → fractal_server-2.3.0a0.dist-info}/LICENSE +0 -0
  68. {fractal_server-2.2.0a0.dist-info → fractal_server-2.3.0a0.dist-info}/WHEEL +0 -0
  69. {fractal_server-2.2.0a0.dist-info → fractal_server-2.3.0a0.dist-info}/entry_points.txt +0 -0
@@ -0,0 +1,1490 @@
1
+ # This adapts clusterfutures <https://github.com/sampsyo/clusterfutures>
2
+ # Original Copyright
3
+ # Copyright 2021 Adrian Sampson <asampson@cs.washington.edu>
4
+ # License: MIT
5
+ #
6
+ # Modified by:
7
+ # Jacopo Nespolo <jacopo.nespolo@exact-lab.it>
8
+ # Tommaso Comparin <tommaso.comparin@exact-lab.it>
9
+ # Marco Franzon <marco.franzon@exact-lab.it>
10
+ #
11
+ # Copyright 2022 (C) Friedrich Miescher Institute for Biomedical Research and
12
+ # University of Zurich
13
+ import json
14
+ import math
15
+ import sys
16
+ import tarfile
17
+ import threading
18
+ import time
19
+ from concurrent.futures import Future
20
+ from concurrent.futures import InvalidStateError
21
+ from copy import copy
22
+ from pathlib import Path
23
+ from typing import Any
24
+ from typing import Callable
25
+ from typing import Optional
26
+ from typing import Sequence
27
+
28
+ import cloudpickle
29
+ from cfut import SlurmExecutor
30
+ from fabric.connection import Connection
31
+ from paramiko.ssh_exception import NoValidConnectionsError
32
+
33
+ from ....filenames import SHUTDOWN_FILENAME
34
+ from ....task_files import get_task_file_paths
35
+ from ....task_files import TaskFiles
36
+ from ....versions import get_versions
37
+ from ...slurm._slurm_config import get_default_slurm_config
38
+ from ...slurm._slurm_config import SlurmConfig
39
+ from .._batching import heuristics
40
+ from ._executor_wait_thread import FractalSlurmWaitThread
41
+ from fractal_server.app.runner.components import _COMPONENT_KEY_
42
+ from fractal_server.app.runner.exceptions import JobExecutionError
43
+ from fractal_server.app.runner.exceptions import TaskExecutionError
44
+ from fractal_server.app.runner.executors.slurm.ssh._slurm_job import SlurmJob
45
+ from fractal_server.config import get_settings
46
+ from fractal_server.logger import set_logger
47
+ from fractal_server.ssh._fabric import check_connection
48
+ from fractal_server.ssh._fabric import run_command_over_ssh
49
+ from fractal_server.syringe import Inject
50
+
51
# Module-level logger, named after this module via fractal-server's set_logger
logger = set_logger(__name__)
52
+
53
+
54
class FractalSlurmSSHExecutor(SlurmExecutor):
    """
    FractalSlurmSSHExecutor (inherits from cfut.SlurmExecutor)

    Executor that runs SLURM jobs on a remote cluster over SSH: input pickle
    files and the sbatch script are prepared locally, transferred via SFTP
    (see `_put_subfolder_sftp`), and submitted by running `sbatch` over the
    SSH connection (see `_submit_job`).

    Attributes:
        connection: SSH (fabric) connection to the SLURM cluster
        shutdown_file:
            POSIX path of the local shutdown-sentinel file, watched by the
            wait thread
        python_remote: Equal to `settings.FRACTAL_SLURM_WORKER_PYTHON`
        wait_thread_cls: Class for waiting thread
        keep_pickle_files:
            Presumably controls whether pickle files are kept after job
            completion — TODO confirm (not used within this view)
        workflow_dir_local:
            Directory for both the cfut/SLURM and fractal-server files and logs
        workflow_dir_remote:
            Directory for both the cfut/SLURM and fractal-server files and logs
        common_script_lines:
            Arbitrary script lines that will always be included in the
            sbatch script
        slurm_account: SLURM account injected into each job's configuration
        jobs: Map from SLURM job ID to `(Future, SlurmJob)` pairs
        map_jobid_to_slurm_files_local:
            Dictionary with paths of slurm-related files for active jobs
    """

    connection: Connection

    workflow_dir_local: Path
    workflow_dir_remote: Path
    shutdown_file: str
    python_remote: str

    wait_thread_cls = FractalSlurmWaitThread
    keep_pickle_files: bool

    common_script_lines: list[str]
    slurm_account: Optional[str]

    jobs: dict[str, tuple[Future, SlurmJob]]
    map_jobid_to_slurm_files_local: dict[str, tuple[str, str, str]]
94
+
95
    def __init__(
        self,
        *,
        # SSH connection
        connection: Connection,
        # Folders and files
        workflow_dir_local: Path,
        workflow_dir_remote: Path,
        # Runner options
        keep_pickle_files: bool = False,
        # Monitoring options
        slurm_poll_interval: Optional[int] = None,
        # SLURM submission script options
        common_script_lines: Optional[list[str]] = None,
        slurm_account: Optional[str] = None,
        # Other kwargs are rejected (a non-empty `kwargs` raises ValueError)
        **kwargs,
    ):
        """
        Init method for FractalSlurmSSHExecutor

        Note: since we are not using `super().__init__`, we duplicate some
        relevant bits of `cfut.ClusterExecutor.__init__`.

        Args:
            connection: Fabric SSH connection to the SLURM cluster.
            workflow_dir_local: Local folder for job files and logs.
            workflow_dir_remote: Remote counterpart of `workflow_dir_local`.
            keep_pickle_files:
                Stored on the instance; presumably controls pickle-file
                cleanup elsewhere — TODO confirm.
            slurm_poll_interval:
                Polling interval for the wait thread; when `None`, fall back
                to `settings.FRACTAL_SLURM_POLL_INTERVAL`.
            common_script_lines:
                Script lines appended to every sbatch script.
            slurm_account: SLURM account injected into job configurations.

        Raises:
            ValueError: If unexpected `kwargs` are received, if
                `slurm_poll_interval` is not positive, or if
                `FRACTAL_SLURM_WORKER_PYTHON` is unset.
        """

        # Reject any unexpected keyword arguments explicitly
        if kwargs != {}:
            raise ValueError(
                f"FractalSlurmSSHExecutor received unexpected {kwargs=}"
            )

        self.workflow_dir_local = workflow_dir_local
        self.workflow_dir_remote = workflow_dir_remote

        # Relevant bits of cfut.ClusterExecutor.__init__ are copied here,
        # postponing the .start() call to when the callbacks are defined
        self.jobs = {}
        self.job_outfiles = {}
        self.jobs_lock = threading.Lock()
        self.jobs_empty_cond = threading.Condition(self.jobs_lock)
        self.wait_thread = self.wait_thread_cls(self._completion)

        # Set up attributes and methods for self.wait_thread (an instance of
        # cfut.SlurmWaitThread); callbacks must be assigned before .start()
        self.wait_thread.shutdown_callback = self.shutdown
        self.wait_thread.jobs_finished_callback = self._jobs_finished
        if slurm_poll_interval is None:
            settings = Inject(get_settings)
            slurm_poll_interval = settings.FRACTAL_SLURM_POLL_INTERVAL
        elif slurm_poll_interval <= 0:
            raise ValueError(f"Invalid attribute {slurm_poll_interval=}")
        self.wait_thread.slurm_poll_interval = slurm_poll_interval
        self.wait_thread.shutdown_file = (
            self.workflow_dir_local / SHUTDOWN_FILENAME
        ).as_posix()

        # Now start self.wait_thread (note: this must be *after* its callback
        # methods have been defined)
        self.wait_thread.start()

        # Define remote Python interpreter
        settings = Inject(get_settings)
        self.python_remote = settings.FRACTAL_SLURM_WORKER_PYTHON
        if self.python_remote is None:
            raise ValueError("FRACTAL_SLURM_WORKER_PYTHON is not set. Exit.")

        # Initialize connection and perform handshake
        self.connection = connection
        logger.warning(self.connection)
        self.handshake()

        # Set/validate parameters for SLURM submission scripts
        self.slurm_account = slurm_account
        self.common_script_lines = common_script_lines or []
        self._validate_common_script_lines()

        # Set/initialize some more options
        self.keep_pickle_files = keep_pickle_files
        self.map_jobid_to_slurm_files_local = {}
182
+
183
+ def _validate_common_script_lines(self):
184
+ """
185
+ Check that SLURM account is not set in `self.common_script_lines`.
186
+ """
187
+ try:
188
+ invalid_line = next(
189
+ line
190
+ for line in self.common_script_lines
191
+ if line.startswith("#SBATCH --account=")
192
+ )
193
+ raise RuntimeError(
194
+ "Invalid line in `FractalSlurmSSHExecutor."
195
+ "common_script_lines`: "
196
+ f"'{invalid_line}'.\n"
197
+ "SLURM account must be set via the request body of the "
198
+ "apply-workflow endpoint, or by modifying the user properties."
199
+ )
200
+ except StopIteration:
201
+ pass
202
+
203
+ def _cleanup(self, jobid: str) -> None:
204
+ """
205
+ Given a job ID, perform any necessary cleanup after the job has
206
+ finished.
207
+ """
208
+ with self.jobs_lock:
209
+ self.map_jobid_to_slurm_files_local.pop(jobid)
210
+
211
+ def get_input_pickle_file_path_local(
212
+ self, *, arg: str, subfolder_name: str, prefix: Optional[str] = None
213
+ ) -> Path:
214
+
215
+ prefix = prefix or "cfut"
216
+ output = (
217
+ self.workflow_dir_local
218
+ / subfolder_name
219
+ / f"{prefix}_in_{arg}.pickle"
220
+ )
221
+ return output
222
+
223
+ def get_input_pickle_file_path_remote(
224
+ self, *, arg: str, subfolder_name: str, prefix: Optional[str] = None
225
+ ) -> Path:
226
+
227
+ prefix = prefix or "cfut"
228
+ output = (
229
+ self.workflow_dir_remote
230
+ / subfolder_name
231
+ / f"{prefix}_in_{arg}.pickle"
232
+ )
233
+ return output
234
+
235
+ def get_output_pickle_file_path_local(
236
+ self, *, arg: str, subfolder_name: str, prefix: Optional[str] = None
237
+ ) -> Path:
238
+ prefix = prefix or "cfut"
239
+ return (
240
+ self.workflow_dir_local
241
+ / subfolder_name
242
+ / f"{prefix}_out_{arg}.pickle"
243
+ )
244
+
245
+ def get_output_pickle_file_path_remote(
246
+ self, *, arg: str, subfolder_name: str, prefix: Optional[str] = None
247
+ ) -> Path:
248
+ prefix = prefix or "cfut"
249
+ return (
250
+ self.workflow_dir_remote
251
+ / subfolder_name
252
+ / f"{prefix}_out_{arg}.pickle"
253
+ )
254
+
255
+ def get_slurm_script_file_path_local(
256
+ self, *, subfolder_name: str, prefix: Optional[str] = None
257
+ ) -> Path:
258
+ prefix = prefix or "_temp"
259
+ return (
260
+ self.workflow_dir_local
261
+ / subfolder_name
262
+ / f"{prefix}_slurm_submit.sbatch"
263
+ )
264
+
265
+ def get_slurm_script_file_path_remote(
266
+ self, *, subfolder_name: str, prefix: Optional[str] = None
267
+ ) -> Path:
268
+ prefix = prefix or "_temp"
269
+ return (
270
+ self.workflow_dir_remote
271
+ / subfolder_name
272
+ / f"{prefix}_slurm_submit.sbatch"
273
+ )
274
+
275
+ def get_slurm_stdout_file_path_local(
276
+ self,
277
+ *,
278
+ subfolder_name: str,
279
+ arg: str = "%j",
280
+ prefix: Optional[str] = None,
281
+ ) -> Path:
282
+ prefix = prefix or "slurmpy.stdout"
283
+ return (
284
+ self.workflow_dir_local
285
+ / subfolder_name
286
+ / f"{prefix}_slurm_{arg}.out"
287
+ )
288
+
289
+ def get_slurm_stdout_file_path_remote(
290
+ self,
291
+ *,
292
+ subfolder_name: str,
293
+ arg: str = "%j",
294
+ prefix: Optional[str] = None,
295
+ ) -> Path:
296
+ prefix = prefix or "slurmpy.stdout"
297
+ return (
298
+ self.workflow_dir_remote
299
+ / subfolder_name
300
+ / f"{prefix}_slurm_{arg}.out"
301
+ )
302
+
303
+ def get_slurm_stderr_file_path_local(
304
+ self,
305
+ *,
306
+ subfolder_name: str,
307
+ arg: str = "%j",
308
+ prefix: Optional[str] = None,
309
+ ) -> Path:
310
+ prefix = prefix or "slurmpy.stderr"
311
+ return (
312
+ self.workflow_dir_local
313
+ / subfolder_name
314
+ / f"{prefix}_slurm_{arg}.err"
315
+ )
316
+
317
+ def get_slurm_stderr_file_path_remote(
318
+ self,
319
+ *,
320
+ subfolder_name: str,
321
+ arg: str = "%j",
322
+ prefix: Optional[str] = None,
323
+ ) -> Path:
324
+ prefix = prefix or "slurmpy.stderr"
325
+ return (
326
+ self.workflow_dir_remote
327
+ / subfolder_name
328
+ / f"{prefix}_slurm_{arg}.err"
329
+ )
330
+
331
+ def submit(
332
+ self,
333
+ fun: Callable[..., Any],
334
+ *fun_args: Sequence[Any],
335
+ slurm_config: Optional[SlurmConfig] = None,
336
+ task_files: Optional[TaskFiles] = None,
337
+ **fun_kwargs: dict,
338
+ ) -> Future:
339
+ """
340
+ Submit a function for execution on `FractalSlurmSSHExecutor`
341
+
342
+ Arguments:
343
+ fun: The function to be executed
344
+ fun_args: Function positional arguments
345
+ fun_kwargs: Function keyword arguments
346
+ slurm_config:
347
+ A `SlurmConfig` object; if `None`, use
348
+ `get_default_slurm_config()`.
349
+ task_files:
350
+ A `TaskFiles` object; if `None`, use
351
+ `self.get_default_task_files()`.
352
+
353
+ Returns:
354
+ Future representing the execution of the current SLURM job.
355
+ """
356
+
357
+ # Set defaults, if needed
358
+ if slurm_config is None:
359
+ slurm_config = get_default_slurm_config()
360
+ if task_files is None:
361
+ task_files = self.get_default_task_files()
362
+
363
+ # Set slurm_file_prefix
364
+ slurm_file_prefix = task_files.file_prefix
365
+
366
+ # Include common_script_lines in extra_lines
367
+ logger.debug(
368
+ f"Adding {self.common_script_lines=} to "
369
+ f"{slurm_config.extra_lines=}, from submit method."
370
+ )
371
+ current_extra_lines = slurm_config.extra_lines or []
372
+ slurm_config.extra_lines = (
373
+ current_extra_lines + self.common_script_lines
374
+ )
375
+
376
+ # Adapt slurm_config to the fact that this is a single-task SlurmJob
377
+ # instance
378
+ slurm_config.tasks_per_job = 1
379
+ slurm_config.parallel_tasks_per_job = 1
380
+
381
+ job = self._prepare_job(
382
+ fun,
383
+ slurm_config=slurm_config,
384
+ slurm_file_prefix=slurm_file_prefix,
385
+ task_files=task_files,
386
+ single_task_submission=True,
387
+ args=fun_args,
388
+ kwargs=fun_kwargs,
389
+ )
390
+ try:
391
+ self._put_subfolder_sftp(jobs=[job])
392
+ except NoValidConnectionsError as e:
393
+ logger.error("NoValidConnectionError")
394
+ logger.error(f"{str(e)=}")
395
+ logger.error(f"{e.errors=}")
396
+ for err in e.errors:
397
+ logger.error(f"{str(err)}")
398
+ raise e
399
+ future, job_id_str = self._submit_job(job)
400
+ self.wait_thread.wait(job_id=job_id_str)
401
+ return future
402
+
403
+ def map(
404
+ self,
405
+ fn: Callable[..., Any],
406
+ iterable: list[Sequence[Any]],
407
+ *,
408
+ slurm_config: Optional[SlurmConfig] = None,
409
+ task_files: Optional[TaskFiles] = None,
410
+ ):
411
+ """
412
+ Return an iterator with the results of several execution of a function
413
+
414
+ This function is based on `concurrent.futures.Executor.map` from Python
415
+ Standard Library 3.11.
416
+ Original Copyright 2009 Brian Quinlan. All Rights Reserved. Licensed to
417
+ PSF under a Contributor Agreement.
418
+
419
+ Main modifications from the PSF function:
420
+
421
+ 1. Only `fn` and `iterable` can be assigned as positional arguments;
422
+ 2. `*iterables` argument replaced with a single `iterable`;
423
+ 3. `timeout` and `chunksize` arguments are not supported.
424
+
425
+ Arguments:
426
+ fn:
427
+ The function to be executed
428
+ iterable:
429
+ An iterable such that each element is the list of arguments to
430
+ be passed to `fn`, as in `fn(*args)`.
431
+ slurm_config:
432
+ A `SlurmConfig` object; if `None`, use
433
+ `get_default_slurm_config()`.
434
+ task_files:
435
+ A `TaskFiles` object; if `None`, use
436
+ `self.get_default_task_files()`.
437
+
438
+ """
439
+
440
+ def _result_or_cancel(fut):
441
+ """
442
+ This function is based on the Python Standard Library 3.11.
443
+ Original Copyright 2009 Brian Quinlan. All Rights Reserved.
444
+ Licensed to PSF under a Contributor Agreement.
445
+ """
446
+ try:
447
+ try:
448
+ return fut.result()
449
+ finally:
450
+ fut.cancel()
451
+ finally:
452
+ # Break a reference cycle with the exception in
453
+ # self._exception
454
+ del fut
455
+
456
+ # Set defaults, if needed
457
+ if not slurm_config:
458
+ slurm_config = get_default_slurm_config()
459
+ if task_files is None:
460
+ task_files = self.get_default_task_files()
461
+
462
+ # Include common_script_lines in extra_lines
463
+ logger.debug(
464
+ f"Adding {self.common_script_lines=} to "
465
+ f"{slurm_config.extra_lines=}, from map method."
466
+ )
467
+ current_extra_lines = slurm_config.extra_lines or []
468
+ slurm_config.extra_lines = (
469
+ current_extra_lines + self.common_script_lines
470
+ )
471
+
472
+ # Set file prefixes
473
+ general_slurm_file_prefix = str(task_files.task_order)
474
+
475
+ # Transform iterable into a list and count its elements
476
+ list_args = list(iterable)
477
+ tot_tasks = len(list_args)
478
+
479
+ # Set/validate parameters for task batching
480
+ tasks_per_job, parallel_tasks_per_job = heuristics(
481
+ # Number of parallel components (always known)
482
+ tot_tasks=len(list_args),
483
+ # Optional WorkflowTask attributes:
484
+ tasks_per_job=slurm_config.tasks_per_job,
485
+ parallel_tasks_per_job=slurm_config.parallel_tasks_per_job, # noqa
486
+ # Task requirements (multiple possible sources):
487
+ cpus_per_task=slurm_config.cpus_per_task,
488
+ mem_per_task=slurm_config.mem_per_task_MB,
489
+ # Fractal configuration variables (soft/hard limits):
490
+ target_cpus_per_job=slurm_config.target_cpus_per_job,
491
+ target_mem_per_job=slurm_config.target_mem_per_job,
492
+ target_num_jobs=slurm_config.target_num_jobs,
493
+ max_cpus_per_job=slurm_config.max_cpus_per_job,
494
+ max_mem_per_job=slurm_config.max_mem_per_job,
495
+ max_num_jobs=slurm_config.max_num_jobs,
496
+ )
497
+ slurm_config.parallel_tasks_per_job = parallel_tasks_per_job
498
+ slurm_config.tasks_per_job = tasks_per_job
499
+
500
+ # Divide arguments in batches of `n_tasks_per_script` tasks each
501
+ args_batches = []
502
+ batch_size = tasks_per_job
503
+ for ind_chunk in range(0, tot_tasks, batch_size):
504
+ args_batches.append(
505
+ list_args[ind_chunk : ind_chunk + batch_size] # noqa
506
+ )
507
+ if len(args_batches) != math.ceil(tot_tasks / tasks_per_job):
508
+ raise RuntimeError("Something wrong here while batching tasks")
509
+
510
+ # Fetch configuration variable
511
+ settings = Inject(get_settings)
512
+ FRACTAL_SLURM_SBATCH_SLEEP = settings.FRACTAL_SLURM_SBATCH_SLEEP
513
+
514
+ logger.debug("[map] Job preparation - START")
515
+ current_component_index = 0
516
+ jobs_to_submit = []
517
+ for ind_batch, batch in enumerate(args_batches):
518
+ batch_size = len(batch)
519
+ this_slurm_file_prefix = (
520
+ f"{general_slurm_file_prefix}_batch_{ind_batch:06d}"
521
+ )
522
+ new_job_to_submit = self._prepare_job(
523
+ fn,
524
+ slurm_config=slurm_config,
525
+ slurm_file_prefix=this_slurm_file_prefix,
526
+ task_files=task_files,
527
+ single_task_submission=False,
528
+ components=batch,
529
+ )
530
+ jobs_to_submit.append(new_job_to_submit)
531
+ current_component_index += batch_size
532
+ logger.debug("[map] Job preparation - END")
533
+
534
+ try:
535
+ self._put_subfolder_sftp(jobs=jobs_to_submit)
536
+ except NoValidConnectionsError as e:
537
+ logger.error("NoValidConnectionError")
538
+ logger.error(f"{str(e)=}")
539
+ logger.error(f"{e.errors=}")
540
+ for err in e.errors:
541
+ logger.error(f"{str(err)}")
542
+
543
+ raise e
544
+
545
+ # Construct list of futures (one per SLURM job, i.e. one per batch)
546
+ # FIXME SSH: we may create a single `_submit_many_jobs` method to
547
+ # reduce the number of commands run over SSH
548
+ logger.debug("[map] Job submission - START")
549
+ fs = []
550
+ job_ids = []
551
+ for job in jobs_to_submit:
552
+ future, job_id = self._submit_job(job)
553
+ job_ids.append(job_id)
554
+ fs.append(future)
555
+ time.sleep(FRACTAL_SLURM_SBATCH_SLEEP)
556
+ for job_id in job_ids:
557
+ self.wait_thread.wait(job_id=job_id)
558
+ logger.debug("[map] Job submission - END")
559
+
560
+ # Yield must be hidden in closure so that the futures are submitted
561
+ # before the first iterator value is required.
562
+ # NOTE: In this custom map() method, _result_or_cancel(fs.pop()) is an
563
+ # iterable of results (if successful), and we should yield its elements
564
+ # rather than the whole iterable.
565
+ def result_iterator():
566
+ """
567
+ This function is based on the Python Standard Library 3.11.
568
+ Original Copyright 2009 Brian Quinlan. All Rights Reserved.
569
+ Licensed to PSF under a Contributor Agreement.
570
+ """
571
+ try:
572
+ # reverse to keep finishing order
573
+ fs.reverse()
574
+ while fs:
575
+ # Careful not to keep a reference to the popped future
576
+ results = _result_or_cancel(fs.pop())
577
+ for res in results:
578
+ yield res
579
+ finally:
580
+ for future in fs:
581
+ future.cancel()
582
+
583
+ return result_iterator()
584
+
585
+ def _prepare_job(
586
+ self,
587
+ fun: Callable[..., Any],
588
+ slurm_file_prefix: str,
589
+ task_files: TaskFiles,
590
+ slurm_config: SlurmConfig,
591
+ single_task_submission: bool = False,
592
+ args: Optional[Sequence[Any]] = None,
593
+ kwargs: Optional[dict] = None,
594
+ components: Optional[list[Any]] = None,
595
+ ) -> SlurmJob:
596
+ """
597
+ Prepare a SLURM job locally, without submitting it
598
+
599
+ This function prepares and writes the local submission script, but it
600
+ does not transfer it to the SLURM cluster.
601
+
602
+ NOTE: this method has different behaviors when it is called from the
603
+ `self.submit` or `self.map` methods (which is also encoded in
604
+ `single_task_submission`):
605
+
606
+ * When called from `self.submit`, it supports general `args` and
607
+ `kwargs` arguments;
608
+ * When called from `self.map`, there cannot be any `args` or `kwargs`
609
+ argument, but there must be a `components` argument.
610
+
611
+ Arguments:
612
+ fun:
613
+ slurm_file_prefix:
614
+ task_files:
615
+ slurm_config:
616
+ single_task_submission:
617
+ args:
618
+ kwargs:
619
+ components:
620
+
621
+ Returns:
622
+ SlurmJob object
623
+ """
624
+
625
+ # Inject SLURM account (if set) into slurm_config
626
+ if self.slurm_account:
627
+ slurm_config.account = self.slurm_account
628
+
629
+ # Define slurm-job-related files
630
+ if single_task_submission:
631
+ if components is not None:
632
+ raise ValueError(
633
+ f"{single_task_submission=} but components is not None"
634
+ )
635
+ job = SlurmJob(
636
+ slurm_file_prefix=slurm_file_prefix,
637
+ num_tasks_tot=1,
638
+ slurm_config=slurm_config,
639
+ )
640
+ if job.num_tasks_tot > 1:
641
+ raise ValueError(
642
+ "{single_task_submission=} but {job.num_tasks_tot=}"
643
+ )
644
+ job.single_task_submission = True
645
+ job.wftask_file_prefixes = (task_files.file_prefix,)
646
+ job.wftask_subfolder_name = task_files.subfolder_name
647
+
648
+ else:
649
+ if not components or len(components) < 1:
650
+ raise ValueError(
651
+ "In FractalSlurmSSHExecutor._submit_job, given "
652
+ f"{components=}."
653
+ )
654
+ num_tasks_tot = len(components)
655
+ job = SlurmJob(
656
+ slurm_file_prefix=slurm_file_prefix,
657
+ num_tasks_tot=num_tasks_tot,
658
+ slurm_config=slurm_config,
659
+ )
660
+
661
+ _prefixes = []
662
+ _subfolder_names = []
663
+ for component in components:
664
+ if isinstance(component, dict):
665
+ actual_component = component.get(_COMPONENT_KEY_, None)
666
+ else:
667
+ actual_component = component
668
+ _task_file_paths = get_task_file_paths(
669
+ workflow_dir_local=task_files.workflow_dir_local,
670
+ workflow_dir_remote=task_files.workflow_dir_remote,
671
+ task_name=task_files.task_name,
672
+ task_order=task_files.task_order,
673
+ component=actual_component,
674
+ )
675
+ _prefixes.append(_task_file_paths.file_prefix)
676
+ _subfolder_names.append(_task_file_paths.subfolder_name)
677
+ job.wftask_file_prefixes = tuple(_prefixes)
678
+
679
+ # Check that all components share the same subfolder
680
+ num_subfolders = len(set(_subfolder_names))
681
+ if num_subfolders != 1:
682
+ error_msg_short = (
683
+ f"[_submit_job] Subfolder list has {num_subfolders} "
684
+ "different values, but it must have only one (since "
685
+ "workflow tasks are executed one by one)."
686
+ )
687
+ error_msg_detail = (
688
+ "[_submit_job] Current unique subfolder names: "
689
+ f"{set(_subfolder_names)}"
690
+ )
691
+ logger.error(error_msg_short)
692
+ logger.error(error_msg_detail)
693
+ raise ValueError(error_msg_short)
694
+ job.wftask_subfolder_name = _subfolder_names[0]
695
+
696
+ # Check that server-side subfolder exists
697
+ subfolder_path = self.workflow_dir_local / job.wftask_subfolder_name
698
+ if not subfolder_path.exists():
699
+ raise FileNotFoundError(
700
+ f"Missing folder {subfolder_path.as_posix()}."
701
+ )
702
+
703
+ # Define I/O pickle file local/remote paths
704
+ job.input_pickle_files_local = tuple(
705
+ self.get_input_pickle_file_path_local(
706
+ arg=job.workerids[ind],
707
+ subfolder_name=job.wftask_subfolder_name,
708
+ prefix=job.wftask_file_prefixes[ind],
709
+ )
710
+ for ind in range(job.num_tasks_tot)
711
+ )
712
+ job.input_pickle_files_remote = tuple(
713
+ self.get_input_pickle_file_path_remote(
714
+ arg=job.workerids[ind],
715
+ subfolder_name=job.wftask_subfolder_name,
716
+ prefix=job.wftask_file_prefixes[ind],
717
+ )
718
+ for ind in range(job.num_tasks_tot)
719
+ )
720
+ job.output_pickle_files_local = tuple(
721
+ self.get_output_pickle_file_path_local(
722
+ arg=job.workerids[ind],
723
+ subfolder_name=job.wftask_subfolder_name,
724
+ prefix=job.wftask_file_prefixes[ind],
725
+ )
726
+ for ind in range(job.num_tasks_tot)
727
+ )
728
+ job.output_pickle_files_remote = tuple(
729
+ self.get_output_pickle_file_path_remote(
730
+ arg=job.workerids[ind],
731
+ subfolder_name=job.wftask_subfolder_name,
732
+ prefix=job.wftask_file_prefixes[ind],
733
+ )
734
+ for ind in range(job.num_tasks_tot)
735
+ )
736
+
737
+ # Define SLURM-job file local/remote paths
738
+ job.slurm_script_local = self.get_slurm_script_file_path_local(
739
+ subfolder_name=job.wftask_subfolder_name,
740
+ prefix=job.slurm_file_prefix,
741
+ )
742
+ job.slurm_script_remote = self.get_slurm_script_file_path_remote(
743
+ subfolder_name=job.wftask_subfolder_name,
744
+ prefix=job.slurm_file_prefix,
745
+ )
746
+ job.slurm_stdout_local = self.get_slurm_stdout_file_path_local(
747
+ subfolder_name=job.wftask_subfolder_name,
748
+ prefix=job.slurm_file_prefix,
749
+ )
750
+ job.slurm_stdout_remote = self.get_slurm_stdout_file_path_remote(
751
+ subfolder_name=job.wftask_subfolder_name,
752
+ prefix=job.slurm_file_prefix,
753
+ )
754
+ job.slurm_stderr_local = self.get_slurm_stderr_file_path_local(
755
+ subfolder_name=job.wftask_subfolder_name,
756
+ prefix=job.slurm_file_prefix,
757
+ )
758
+ job.slurm_stderr_remote = self.get_slurm_stderr_file_path_remote(
759
+ subfolder_name=job.wftask_subfolder_name,
760
+ prefix=job.slurm_file_prefix,
761
+ )
762
+
763
+ # Dump serialized versions+function+args+kwargs to pickle file(s)
764
+ versions = get_versions()
765
+ if job.single_task_submission:
766
+ _args = args or []
767
+ _kwargs = kwargs or {}
768
+ funcser = cloudpickle.dumps((versions, fun, _args, _kwargs))
769
+ with open(job.input_pickle_files_local[0], "wb") as f:
770
+ f.write(funcser)
771
+ else:
772
+ for ind_component, component in enumerate(components):
773
+ _args = [component]
774
+ _kwargs = {}
775
+ funcser = cloudpickle.dumps((versions, fun, _args, _kwargs))
776
+ with open(
777
+ job.input_pickle_files_local[ind_component], "wb"
778
+ ) as f:
779
+ f.write(funcser)
780
+
781
+ # Prepare commands to be included in SLURM submission script
782
+ cmdlines = []
783
+ for ind_task in range(job.num_tasks_tot):
784
+ input_pickle_file = job.input_pickle_files_remote[ind_task]
785
+ output_pickle_file = job.output_pickle_files_remote[ind_task]
786
+ cmdlines.append(
787
+ (
788
+ f"{self.python_remote}"
789
+ " -m fractal_server.app.runner.executors.slurm.remote "
790
+ f"--input-file {input_pickle_file} "
791
+ f"--output-file {output_pickle_file}"
792
+ )
793
+ )
794
+
795
+ # Prepare SLURM submission script
796
+ sbatch_script_content = self._prepare_sbatch_script(
797
+ slurm_config=job.slurm_config,
798
+ list_commands=cmdlines,
799
+ slurm_out_path=str(job.slurm_stdout_remote),
800
+ slurm_err_path=str(job.slurm_stderr_remote),
801
+ )
802
+ with job.slurm_script_local.open("w") as f:
803
+ f.write(sbatch_script_content)
804
+
805
+ return job
806
+
807
    def _put_subfolder_sftp(self, jobs: list[SlurmJob]) -> None:
        """
        Transfer the jobs subfolder to the remote host.

        The subfolder is first compressed into a local `.tar.gz` archive,
        then transferred via SFTP, extracted remotely (by running the
        `fractal_server.app.runner.extract_archive` module with the remote
        Python interpreter over SSH), and finally the local archive is
        removed.

        Arguments:
            jobs: The list of `SlurmJob` objects associated to a given
                subfolder.

        Raises:
            ValueError: If the jobs do not all share the same subfolder.
        """

        # Check that the subfolder is unique
        subfolder_names = [job.wftask_subfolder_name for job in jobs]
        if len(set(subfolder_names)) > 1:
            raise ValueError(
                "[_put_subfolder] Invalid list of jobs, "
                f"{set(subfolder_names)=}."
            )
        subfolder_name = subfolder_names[0]

        # Create compressed subfolder archive (locally)
        local_subfolder = self.workflow_dir_local / subfolder_name
        tarfile_name = f"{subfolder_name}.tar.gz"
        tarfile_path_local = (
            self.workflow_dir_local / tarfile_name
        ).as_posix()
        tarfile_path_remote = (
            self.workflow_dir_remote / tarfile_name
        ).as_posix()
        with tarfile.open(tarfile_path_local, "w:gz") as tar:
            # Top-level entries only; archive members keep bare file names
            for this_file in local_subfolder.glob("*"):
                tar.add(this_file, arcname=this_file.name)
        logger.info(f"Subfolder archive created at {tarfile_path_local}")

        # Transfer archive (timing the SFTP put for the log)
        t_0_put = time.perf_counter()
        self.connection.put(
            local=tarfile_path_local,
            remote=tarfile_path_remote,
        )
        t_1_put = time.perf_counter()
        logger.info(
            f"Subfolder archive transferred to {tarfile_path_remote}"
            f" - elapsed: {t_1_put - t_0_put:.3f} s"
        )
        # Uncompress archive (remotely), using the remote Python interpreter
        tar_command = (
            f"{self.python_remote} -m "
            "fractal_server.app.runner.extract_archive "
            f"{tarfile_path_remote}"
        )
        run_command_over_ssh(cmd=tar_command, connection=self.connection)

        # Remove local version
        t_0_rm = time.perf_counter()
        Path(tarfile_path_local).unlink()
        t_1_rm = time.perf_counter()
        logger.info(
            f"Local archive removed - elapsed: {t_1_rm - t_0_rm:.3f} s"
        )
865
+
866
+ def _submit_job(self, job: SlurmJob) -> tuple[Future, str]:
867
+ """
868
+ Submit a job to SLURM via SSH.
869
+
870
+ This method must always be called after `self._put_subfolder`.
871
+
872
+ Arguments:
873
+ job: The `SlurmJob` object to submit.
874
+ """
875
+
876
+ # Submit job to SLURM, and get jobid
877
+ sbatch_command = f"sbatch --parsable {job.slurm_script_remote}"
878
+ sbatch_stdout = run_command_over_ssh(
879
+ cmd=sbatch_command,
880
+ connection=self.connection,
881
+ )
882
+
883
+ # Extract SLURM job ID from stdout
884
+ try:
885
+ stdout = sbatch_stdout.strip("\n")
886
+ jobid = int(stdout)
887
+ except ValueError as e:
888
+ error_msg = (
889
+ f"Submit command `{sbatch_command}` returned "
890
+ f"`{stdout=}` which cannot be cast to an integer "
891
+ f"SLURM-job ID. Original error:\n{str(e)}"
892
+ )
893
+ logger.error(error_msg)
894
+ raise JobExecutionError(info=error_msg)
895
+ job_id_str = str(jobid)
896
+
897
+ # Plug job id in stdout/stderr SLURM file paths (local and remote)
898
+ def _replace_job_id(_old_path: Path) -> Path:
899
+ return Path(_old_path.as_posix().replace("%j", job_id_str))
900
+
901
+ job.slurm_stdout_local = _replace_job_id(job.slurm_stdout_local)
902
+ job.slurm_stdout_remote = _replace_job_id(job.slurm_stdout_remote)
903
+ job.slurm_stderr_local = _replace_job_id(job.slurm_stderr_local)
904
+ job.slurm_stderr_remote = _replace_job_id(job.slurm_stderr_remote)
905
+
906
+ # Add the SLURM script/out/err paths to map_jobid_to_slurm_files (this
907
+ # must be after the `sbatch` call, so that "%j" has already been
908
+ # replaced with the job ID)
909
+ with self.jobs_lock:
910
+ self.map_jobid_to_slurm_files_local[job_id_str] = (
911
+ job.slurm_script_local.as_posix(),
912
+ job.slurm_stdout_local.as_posix(),
913
+ job.slurm_stderr_local.as_posix(),
914
+ )
915
+
916
+ # Create future
917
+ future = Future()
918
+ with self.jobs_lock:
919
+ self.jobs[job_id_str] = (future, job)
920
+ return future, job_id_str
921
+
922
+ def _prepare_JobExecutionError(
923
+ self, jobid: str, info: str
924
+ ) -> JobExecutionError:
925
+ """
926
+ Prepare the `JobExecutionError` for a given job
927
+
928
+ This method creates a `JobExecutionError` object and sets its attribute
929
+ to the appropriate SLURM-related file names. Note that the SLURM files
930
+ are the local ones (i.e. the ones in `self.workflow_dir_local`).
931
+
932
+ Arguments:
933
+ jobid:
934
+ ID of the SLURM job.
935
+ info:
936
+ """
937
+ # Extract SLURM file paths
938
+ with self.jobs_lock:
939
+ (
940
+ slurm_script_file,
941
+ slurm_stdout_file,
942
+ slurm_stderr_file,
943
+ ) = self.map_jobid_to_slurm_files_local[jobid]
944
+ # Construct JobExecutionError exception
945
+ job_exc = JobExecutionError(
946
+ cmd_file=slurm_script_file,
947
+ stdout_file=slurm_stdout_file,
948
+ stderr_file=slurm_stderr_file,
949
+ info=info,
950
+ )
951
+ return job_exc
952
+
953
+ def _missing_pickle_error_msg(self, out_path: Path) -> str:
954
+ settings = Inject(get_settings)
955
+ info = (
956
+ "Output pickle file of the FractalSlurmSSHExecutor "
957
+ "job not found.\n"
958
+ f"Expected file path: {out_path.as_posix()}n"
959
+ "Here are some possible reasons:\n"
960
+ "1. The SLURM job was scancel-ed, either by the user "
961
+ "or due to an error (e.g. an out-of-memory or timeout "
962
+ "error). Note that if the scancel took place before "
963
+ "the job started running, the SLURM out/err files "
964
+ "will be empty.\n"
965
+ "2. Some error occurred upon writing the file to disk "
966
+ "(e.g. because there is not enough space on disk, or "
967
+ "due to an overloaded NFS filesystem). "
968
+ "Note that the server configuration has "
969
+ "FRACTAL_SLURM_ERROR_HANDLING_INTERVAL="
970
+ f"{settings.FRACTAL_SLURM_ERROR_HANDLING_INTERVAL} "
971
+ "seconds.\n"
972
+ )
973
+ return info
974
+
975
+ def _handle_remaining_jobs(
976
+ self,
977
+ remaining_futures: list[Future],
978
+ remaining_job_ids: list[str],
979
+ remaining_jobs: list[SlurmJob],
980
+ ) -> None:
981
+ """
982
+ Helper function used within _completion, when looping over a list of
983
+ several jobs/futures.
984
+ """
985
+ for future in remaining_futures:
986
+ try:
987
+ future.cancel()
988
+ except InvalidStateError:
989
+ pass
990
+ for job_id in remaining_job_ids:
991
+ self._cleanup(job_id)
992
+ if not self.keep_pickle_files:
993
+ for job in remaining_jobs:
994
+ for path in job.output_pickle_files_local:
995
+ path.unlink()
996
+ for path in job.input_pickle_files_local:
997
+ path.unlink()
998
+
999
+ def _completion(self, job_ids: list[str]) -> None:
1000
+ """
1001
+ Callback function to be executed whenever a job finishes.
1002
+
1003
+ This function is executed by self.wait_thread (triggered by either
1004
+ finding an existing output pickle file `out_path` or finding that the
1005
+ SLURM job is over). Since this takes place on a different thread,
1006
+ failures may not be captured by the main thread; we use a broad
1007
+ try/except block, so that those exceptions are reported to the main
1008
+ thread via `fut.set_exception(...)`.
1009
+
1010
+ Arguments:
1011
+ jobid: ID of the SLURM job
1012
+ """
1013
+
1014
+ # Loop over all job_ids, and fetch future and job objects
1015
+ futures: list[Future] = []
1016
+ jobs: list[SlurmJob] = []
1017
+ with self.jobs_lock:
1018
+ for job_id in job_ids:
1019
+ future, job = self.jobs.pop(job_id)
1020
+ futures.append(future)
1021
+ jobs.append(job)
1022
+ if not self.jobs:
1023
+ self.jobs_empty_cond.notify_all()
1024
+
1025
+ # Fetch subfolder from remote host
1026
+ try:
1027
+ self._get_subfolder_sftp(jobs=jobs)
1028
+ except NoValidConnectionsError as e:
1029
+ logger.error("NoValidConnectionError")
1030
+ logger.error(f"{str(e)=}")
1031
+ logger.error(f"{e.errors=}")
1032
+ for err in e.errors:
1033
+ logger.error(f"{str(err)}")
1034
+
1035
+ raise e
1036
+
1037
+ # First round of checking whether all output files exist
1038
+ missing_out_paths = []
1039
+ for job in jobs:
1040
+ for ind_out_path, out_path in enumerate(
1041
+ job.output_pickle_files_local
1042
+ ):
1043
+ if not out_path.exists():
1044
+ missing_out_paths.append(out_path)
1045
+ num_missing = len(missing_out_paths)
1046
+ if num_missing > 0:
1047
+ # Output pickle files may be missing e.g. because of some slow
1048
+ # filesystem operation; wait some time before re-trying
1049
+ settings = Inject(get_settings)
1050
+ sleep_time = settings.FRACTAL_SLURM_ERROR_HANDLING_INTERVAL
1051
+ logger.info(
1052
+ f"{num_missing} output pickle files are missing; "
1053
+ f"sleep {sleep_time} seconds."
1054
+ )
1055
+ for missing_file in missing_out_paths:
1056
+ logger.debug(f"Missing output pickle file: {missing_file}")
1057
+ time.sleep(sleep_time)
1058
+
1059
+ # Handle all jobs
1060
+ for ind_job, job_id in enumerate(job_ids):
1061
+ try:
1062
+ # Retrieve job and future objects
1063
+ job = jobs[ind_job]
1064
+ future = futures[ind_job]
1065
+ remaining_job_ids = job_ids[ind_job + 1 :] # noqa: E203
1066
+ remaining_futures = futures[ind_job + 1 :] # noqa: E203
1067
+
1068
+ outputs = []
1069
+
1070
+ for ind_out_path, out_path in enumerate(
1071
+ job.output_pickle_files_local
1072
+ ):
1073
+ in_path = job.input_pickle_files_local[ind_out_path]
1074
+ if not out_path.exists():
1075
+ # Output pickle file is still missing
1076
+ info = self._missing_pickle_error_msg(out_path)
1077
+ job_exc = self._prepare_JobExecutionError(
1078
+ job_id, info=info
1079
+ )
1080
+ try:
1081
+ future.set_exception(job_exc)
1082
+ self._handle_remaining_jobs(
1083
+ remaining_futures=remaining_futures,
1084
+ remaining_job_ids=remaining_job_ids,
1085
+ )
1086
+ return
1087
+ except InvalidStateError:
1088
+ logger.warning(
1089
+ f"Future {future} (SLURM job ID: {job_id}) "
1090
+ "was already cancelled."
1091
+ )
1092
+ if not self.keep_pickle_files:
1093
+ in_path.unlink()
1094
+ self._cleanup(job_id)
1095
+ self._handle_remaining_jobs(
1096
+ remaining_futures=remaining_futures,
1097
+ remaining_job_ids=remaining_job_ids,
1098
+ )
1099
+ return
1100
+
1101
+ # Read the task output
1102
+ with out_path.open("rb") as f:
1103
+ outdata = f.read()
1104
+ # Note: output can be either the task result (typically a
1105
+ # dictionary) or an ExceptionProxy object; in the latter
1106
+ # case, the ExceptionProxy definition is also part of the
1107
+ # pickle file (thanks to cloudpickle.dumps).
1108
+ success, output = cloudpickle.loads(outdata)
1109
+ try:
1110
+ if success:
1111
+ outputs.append(output)
1112
+ else:
1113
+ proxy = output
1114
+ if proxy.exc_type_name == "JobExecutionError":
1115
+ job_exc = self._prepare_JobExecutionError(
1116
+ job_id, info=proxy.kwargs.get("info", None)
1117
+ )
1118
+ future.set_exception(job_exc)
1119
+ self._handle_remaining_jobs(
1120
+ remaining_futures=remaining_futures,
1121
+ remaining_job_ids=remaining_job_ids,
1122
+ )
1123
+ return
1124
+ else:
1125
+ # This branch catches both TaskExecutionError's
1126
+ # (coming from the typical fractal-server
1127
+ # execution of tasks, and with additional
1128
+ # fractal-specific kwargs) or arbitrary
1129
+ # exceptions (coming from a direct use of
1130
+ # FractalSlurmSSHExecutor, possibly outside
1131
+ # fractal-server)
1132
+ kwargs = {}
1133
+ for key in [
1134
+ "workflow_task_id",
1135
+ "workflow_task_order",
1136
+ "task_name",
1137
+ ]:
1138
+ if key in proxy.kwargs.keys():
1139
+ kwargs[key] = proxy.kwargs[key]
1140
+ exc = TaskExecutionError(proxy.tb, **kwargs)
1141
+ future.set_exception(exc)
1142
+ self._handle_remaining_jobs(
1143
+ remaining_futures=remaining_futures,
1144
+ remaining_job_ids=remaining_job_ids,
1145
+ )
1146
+ return
1147
+ if not self.keep_pickle_files:
1148
+ out_path.unlink()
1149
+ except InvalidStateError:
1150
+ logger.warning(
1151
+ f"Future {future} (SLURM job ID: {job_id}) was "
1152
+ "already cancelled, exit from "
1153
+ "FractalSlurmSSHExecutor._completion."
1154
+ )
1155
+ if not self.keep_pickle_files:
1156
+ out_path.unlink()
1157
+ in_path.unlink()
1158
+
1159
+ self._cleanup(job_id)
1160
+ self._handle_remaining_jobs(
1161
+ remaining_futures=remaining_futures,
1162
+ remaining_job_ids=remaining_job_ids,
1163
+ )
1164
+ return
1165
+
1166
+ # Clean up input pickle file
1167
+ if not self.keep_pickle_files:
1168
+ in_path.unlink()
1169
+ self._cleanup(job_id)
1170
+ if job.single_task_submission:
1171
+ future.set_result(outputs[0])
1172
+ else:
1173
+ future.set_result(outputs)
1174
+
1175
+ except Exception as e:
1176
+ try:
1177
+ future.set_exception(e)
1178
+ return
1179
+ except InvalidStateError:
1180
+ logger.warning(
1181
+ f"Future {future} (SLURM job ID: {job_id}) was already"
1182
+ " cancelled, exit from"
1183
+ " FractalSlurmSSHExecutor._completion."
1184
+ )
1185
+
1186
+ def _get_subfolder_sftp(self, jobs: list[SlurmJob]) -> None:
1187
+ """
1188
+ Fetch a remote folder via tar+sftp+tar
1189
+
1190
+ Arguments:
1191
+ job:
1192
+ `SlurmJob` object (needed for its prefixes-related attributes).
1193
+ """
1194
+
1195
+ # Check that the subfolder is unique
1196
+ subfolder_names = [job.wftask_subfolder_name for job in jobs]
1197
+ if len(set(subfolder_names)) > 1:
1198
+ raise ValueError(
1199
+ "[_put_subfolder] Invalid list of jobs, "
1200
+ f"{set(subfolder_names)=}."
1201
+ )
1202
+ subfolder_name = subfolder_names[0]
1203
+
1204
+ t_0 = time.perf_counter()
1205
+ logger.debug("[_get_subfolder_sftp] Start")
1206
+ tarfile_path_local = (
1207
+ self.workflow_dir_local / f"{subfolder_name}.tar.gz"
1208
+ ).as_posix()
1209
+ tarfile_path_remote = (
1210
+ self.workflow_dir_remote / f"{subfolder_name}.tar.gz"
1211
+ ).as_posix()
1212
+
1213
+ # Remove local tarfile - FIXME SSH: is this needed?
1214
+ logger.warning(f"In principle I just removed {tarfile_path_local}")
1215
+ logger.warning(f"{Path(tarfile_path_local).exists()=}")
1216
+
1217
+ # Remove remote tarfile - FIXME SSH: is this needed?
1218
+ # rm_command = f"rm {tarfile_path_remote}"
1219
+ # _run_command_over_ssh(cmd=rm_command, connection=self.connection)
1220
+ logger.warning(f"Unlink {tarfile_path_remote=} - START")
1221
+ self.connection.sftp().unlink(tarfile_path_remote)
1222
+ logger.warning(f"Unlink {tarfile_path_remote=} - STOP")
1223
+
1224
+ # Create remote tarfile
1225
+ tar_command = (
1226
+ f"{self.python_remote} "
1227
+ "-m fractal_server.app.runner.compress_folder "
1228
+ f"{(self.workflow_dir_remote / subfolder_name).as_posix()}"
1229
+ )
1230
+ stdout = run_command_over_ssh(
1231
+ cmd=tar_command, connection=self.connection
1232
+ )
1233
+ print(stdout)
1234
+
1235
+ # Fetch tarfile
1236
+ t_0_get = time.perf_counter()
1237
+ self.connection.get(
1238
+ remote=tarfile_path_remote,
1239
+ local=tarfile_path_local,
1240
+ )
1241
+ t_1_get = time.perf_counter()
1242
+ logger.info(
1243
+ f"Subfolder archive transferred back to {tarfile_path_local}"
1244
+ f" - elapsed: {t_1_get - t_0_get:.3f} s"
1245
+ )
1246
+
1247
+ # Extract tarfile locally
1248
+ with tarfile.open(tarfile_path_local) as tar:
1249
+ tar.extractall(path=(self.workflow_dir_local / subfolder_name))
1250
+
1251
+ t_1 = time.perf_counter()
1252
+ logger.info("[_get_subfolder_sftp] End - " f"elapsed: {t_1-t_0:.3f} s")
1253
+
1254
+ def _prepare_sbatch_script(
1255
+ self,
1256
+ *,
1257
+ list_commands: list[str],
1258
+ slurm_out_path: str,
1259
+ slurm_err_path: str,
1260
+ slurm_config: SlurmConfig,
1261
+ ):
1262
+
1263
+ num_tasks_max_running = slurm_config.parallel_tasks_per_job
1264
+ mem_per_task_MB = slurm_config.mem_per_task_MB
1265
+
1266
+ # Set ntasks
1267
+ ntasks = min(len(list_commands), num_tasks_max_running)
1268
+ if len(list_commands) < num_tasks_max_running:
1269
+ ntasks = len(list_commands)
1270
+ slurm_config.parallel_tasks_per_job = ntasks
1271
+ logger.debug(
1272
+ f"{len(list_commands)=} is smaller than "
1273
+ f"{num_tasks_max_running=}. Setting {ntasks=}."
1274
+ )
1275
+
1276
+ # Prepare SLURM preamble based on SlurmConfig object
1277
+ script_lines = slurm_config.to_sbatch_preamble(
1278
+ remote_export_dir=self.workflow_dir_remote.as_posix()
1279
+ )
1280
+
1281
+ # Extend SLURM preamble with variable which are not in SlurmConfig, and
1282
+ # fix their order
1283
+ script_lines.extend(
1284
+ [
1285
+ f"#SBATCH --err={slurm_err_path}",
1286
+ f"#SBATCH --out={slurm_out_path}",
1287
+ f"#SBATCH -D {self.workflow_dir_remote}",
1288
+ ]
1289
+ )
1290
+ script_lines = slurm_config.sort_script_lines(script_lines)
1291
+ logger.debug(script_lines)
1292
+
1293
+ # Always print output of `pwd`
1294
+ script_lines.append('echo "Working directory (pwd): `pwd`"\n')
1295
+
1296
+ # Complete script preamble
1297
+ script_lines.append("\n")
1298
+
1299
+ # Include command lines
1300
+ tmp_list_commands = copy(list_commands)
1301
+ while tmp_list_commands:
1302
+ if tmp_list_commands:
1303
+ cmd = tmp_list_commands.pop(0) # take first element
1304
+ script_lines.append(
1305
+ "srun --ntasks=1 --cpus-per-task=$SLURM_CPUS_PER_TASK "
1306
+ f"--mem={mem_per_task_MB}MB "
1307
+ f"{cmd} &"
1308
+ )
1309
+ script_lines.append("wait\n")
1310
+
1311
+ script = "\n".join(script_lines)
1312
+ return script
1313
+
1314
+ def get_default_task_files(self) -> TaskFiles:
1315
+ """
1316
+ This will be called when self.submit or self.map are called from
1317
+ outside fractal-server, and then lack some optional arguments.
1318
+ """
1319
+ task_files = TaskFiles(
1320
+ workflow_dir_local=self.workflow_dir_local,
1321
+ workflow_dir_remote=self.workflow_dir_remote,
1322
+ task_order=None,
1323
+ task_name="name",
1324
+ )
1325
+ return task_files
1326
+
1327
+ def shutdown(self, wait=True, *, cancel_futures=False):
1328
+ """
1329
+ Clean up all executor variables. Note that this function is executed on
1330
+ the self.wait_thread thread, see _completion.
1331
+ """
1332
+
1333
+ logger.debug("Executor shutdown: start")
1334
+ # self.connection.close()
1335
+
1336
+ # Handle all job futures
1337
+ slurm_jobs_to_scancel = []
1338
+ with self.jobs_lock:
1339
+ while self.jobs:
1340
+ jobid, fut_and_job = self.jobs.popitem()
1341
+ slurm_jobs_to_scancel.append(jobid)
1342
+ fut = fut_and_job[0]
1343
+ self.map_jobid_to_slurm_files_local.pop(jobid)
1344
+ if not fut.cancelled():
1345
+ fut.set_exception(
1346
+ JobExecutionError(
1347
+ "Job cancelled due to executor shutdown."
1348
+ )
1349
+ )
1350
+ fut.cancel()
1351
+
1352
+ # Cancel SLURM jobs
1353
+ if slurm_jobs_to_scancel:
1354
+ scancel_string = " ".join(slurm_jobs_to_scancel)
1355
+ logger.warning(f"Now scancel-ing SLURM jobs {scancel_string}")
1356
+ scancel_command = f"scancel {scancel_string}"
1357
+ run_command_over_ssh(
1358
+ cmd=scancel_command, connection=self.connection
1359
+ )
1360
+ logger.debug("Executor shutdown: end")
1361
+
1362
+ def __exit__(self, *args, **kwargs):
1363
+ """
1364
+ See
1365
+ https://github.com/fractal-analytics-platform/fractal-server/issues/1508
1366
+ """
1367
+ logger.debug(
1368
+ "[FractalSlurmSSHExecutor.__exit__] Stop and join `wait_thread`"
1369
+ )
1370
+ self.wait_thread.stop()
1371
+ self.wait_thread.join()
1372
+ logger.debug("[FractalSlurmSSHExecutor.__exit__] End")
1373
+
1374
+ def run_squeue(self, job_ids):
1375
+ squeue_command = (
1376
+ "squeue "
1377
+ "--noheader "
1378
+ "--format='%i %T' "
1379
+ "--jobs __JOBS__ "
1380
+ "--states=all"
1381
+ )
1382
+ job_ids = ",".join([str(j) for j in job_ids])
1383
+ squeue_command = squeue_command.replace("__JOBS__", job_ids)
1384
+ stdout = run_command_over_ssh(
1385
+ cmd=squeue_command,
1386
+ connection=self.connection,
1387
+ )
1388
+ return stdout
1389
+
1390
+ def _jobs_finished(self, job_ids: list[str]) -> set[str]:
1391
+ """
1392
+ Check which ones of the given Slurm jobs already finished
1393
+
1394
+ The function is based on the `_jobs_finished` function from
1395
+ clusterfutures (version 0.5).
1396
+ Original Copyright: 2022 Adrian Sampson
1397
+ (released under the MIT licence)
1398
+ """
1399
+
1400
+ from cfut.slurm import STATES_FINISHED
1401
+
1402
+ logger.debug(
1403
+ f"[FractalSlurmSSHExecutor._jobs_finished] START ({job_ids=})"
1404
+ )
1405
+
1406
+ # If there is no Slurm job to check, return right away
1407
+ if not job_ids:
1408
+ logger.debug(
1409
+ "[FractalSlurmSSHExecutor._jobs_finished] "
1410
+ "No jobs provided, return."
1411
+ )
1412
+ return set()
1413
+
1414
+ try:
1415
+ stdout = self.run_squeue(job_ids)
1416
+ id_to_state = {
1417
+ out.split()[0]: out.split()[1] for out in stdout.splitlines()
1418
+ }
1419
+ # Finished jobs only stay in squeue for a few mins (configurable).
1420
+ # If a job ID isn't there, we'll assume it's finished.
1421
+ output = {
1422
+ _id
1423
+ for _id in job_ids
1424
+ if id_to_state.get(_id, "COMPLETED") in STATES_FINISHED
1425
+ }
1426
+ logger.debug(
1427
+ f"[FractalSlurmSSHExecutor._jobs_finished] END - {output=}"
1428
+ )
1429
+ return output
1430
+ except Exception as e:
1431
+ # If something goes wrong, proceed anyway
1432
+ logger.error(
1433
+ f"Something wrong in _jobs_finished. Original error: {str(e)}"
1434
+ )
1435
+ output = set()
1436
+ logger.debug(
1437
+ f"[FractalSlurmSSHExecutor._jobs_finished] END - {output=}"
1438
+ )
1439
+ return output
1440
+
1441
+ id_to_state = dict()
1442
+ for j in job_ids:
1443
+ res = self.run_squeue([j])
1444
+ if res.returncode != 0:
1445
+ logger.info(f"Job {j} not found. Marked it as completed")
1446
+ id_to_state.update({str(j): "COMPLETED"})
1447
+ else:
1448
+ id_to_state.update(
1449
+ {res.stdout.split()[0]: res.stdout.split()[1]}
1450
+ )
1451
+
1452
+ def handshake(self) -> dict:
1453
+ """
1454
+ Healthcheck for SSH connection and for versions match.
1455
+
1456
+ FIXME SSH: We should add a timeout here
1457
+ FIXME SSH: We could include checks on the existence of folders
1458
+ FIXME SSH: We could include further checks on version matches
1459
+ """
1460
+
1461
+ check_connection(self.connection)
1462
+
1463
+ t_start_handshake = time.perf_counter()
1464
+
1465
+ logger.info("[FractalSlurmSSHExecutor.ssh_handshake] START")
1466
+ cmd = f"{self.python_remote} -m fractal_server.app.runner.versions"
1467
+ stdout = run_command_over_ssh(cmd=cmd, connection=self.connection)
1468
+ remote_versions = json.loads(stdout.strip("\n"))
1469
+
1470
+ # Check compatibility with local versions
1471
+ local_versions = get_versions()
1472
+ remote_fractal_server = remote_versions["fractal_server"]
1473
+ local_fractal_server = local_versions["fractal_server"]
1474
+ if remote_fractal_server != local_fractal_server:
1475
+ error_msg = (
1476
+ "Fractal-server version mismatch.\n"
1477
+ "Local interpreter: "
1478
+ f"({sys.executable}): {local_versions}.\n"
1479
+ "Remote interpreter: "
1480
+ f"({self.python_remote}): {remote_versions}."
1481
+ )
1482
+ logger.error(error_msg)
1483
+ raise ValueError(error_msg)
1484
+
1485
+ t_end_handshake = time.perf_counter()
1486
+ logger.info(
1487
+ "[FractalSlurmSSHExecutor.ssh_handshake] END"
1488
+ f" - elapsed: {t_end_handshake-t_start_handshake:.3f} s"
1489
+ )
1490
+ return remote_versions