fractal-server 2.14.0a12__py3-none-any.whl → 2.14.0a14__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as published to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
Files changed (46)
  1. fractal_server/__init__.py +1 -1
  2. fractal_server/app/models/linkusergroup.py +6 -2
  3. fractal_server/app/models/v2/dataset.py +1 -1
  4. fractal_server/app/models/v2/job.py +7 -3
  5. fractal_server/app/models/v2/task_group.py +2 -2
  6. fractal_server/app/models/v2/workflow.py +1 -1
  7. fractal_server/app/models/v2/workflowtask.py +1 -1
  8. fractal_server/app/routes/admin/v2/task_group.py +0 -17
  9. fractal_server/app/routes/api/v2/dataset.py +0 -8
  10. fractal_server/app/routes/api/v2/history.py +112 -27
  11. fractal_server/app/routes/api/v2/images.py +16 -14
  12. fractal_server/app/routes/api/v2/project.py +0 -52
  13. fractal_server/app/routes/api/v2/task_group.py +0 -17
  14. fractal_server/app/routes/api/v2/workflow.py +0 -8
  15. fractal_server/app/routes/auth/group.py +0 -16
  16. fractal_server/app/runner/executors/base_runner.py +5 -0
  17. fractal_server/app/runner/executors/local/runner.py +15 -7
  18. fractal_server/app/runner/executors/slurm_common/_handle_exception_proxy.py +17 -0
  19. fractal_server/app/runner/executors/slurm_common/base_slurm_runner.py +676 -0
  20. fractal_server/app/runner/executors/slurm_common/slurm_job_task_models.py +102 -0
  21. fractal_server/app/runner/executors/slurm_ssh/runner.py +110 -648
  22. fractal_server/app/runner/executors/slurm_sudo/runner.py +32 -661
  23. fractal_server/app/runner/task_files.py +20 -6
  24. fractal_server/app/runner/v2/_slurm_ssh.py +6 -6
  25. fractal_server/app/runner/v2/_slurm_sudo.py +4 -4
  26. fractal_server/app/runner/v2/db_tools.py +1 -0
  27. fractal_server/app/runner/v2/runner.py +4 -0
  28. fractal_server/app/runner/v2/runner_functions.py +2 -2
  29. fractal_server/app/runner/v2/submit_workflow.py +7 -16
  30. fractal_server/app/schemas/v2/__init__.py +3 -1
  31. fractal_server/app/schemas/v2/history.py +27 -2
  32. fractal_server/config.py +6 -2
  33. fractal_server/images/tools.py +23 -0
  34. fractal_server/migrations/versions/5b6007027595_on_cascade.py +250 -0
  35. fractal_server/migrations/versions/fbce16ff4e47_new_history_items.py +2 -2
  36. fractal_server/tasks/v2/utils_background.py +0 -19
  37. {fractal_server-2.14.0a12.dist-info → fractal_server-2.14.0a14.dist-info}/METADATA +1 -1
  38. {fractal_server-2.14.0a12.dist-info → fractal_server-2.14.0a14.dist-info}/RECORD +41 -42
  39. fractal_server/app/runner/executors/slurm_common/_check_jobs_status.py +0 -77
  40. fractal_server/app/runner/executors/slurm_ssh/_check_job_status_ssh.py +0 -67
  41. fractal_server/app/runner/executors/slurm_ssh/_executor_wait_thread.py +0 -126
  42. fractal_server/app/runner/executors/slurm_ssh/_slurm_job.py +0 -116
  43. fractal_server/app/runner/executors/slurm_ssh/executor.py +0 -1386
  44. {fractal_server-2.14.0a12.dist-info → fractal_server-2.14.0a14.dist-info}/LICENSE +0 -0
  45. {fractal_server-2.14.0a12.dist-info → fractal_server-2.14.0a14.dist-info}/WHEEL +0 -0
  46. {fractal_server-2.14.0a12.dist-info → fractal_server-2.14.0a14.dist-info}/entry_points.txt +0 -0
fractal_server/app/runner/executors/slurm_ssh/executor.py
@@ -1,1386 +0,0 @@
1
- import json
2
- import math
3
- import sys
4
- import threading
5
- import time
6
- from concurrent.futures import Executor
7
- from concurrent.futures import Future
8
- from concurrent.futures import InvalidStateError
9
- from copy import copy
10
- from pathlib import Path
11
- from typing import Any
12
- from typing import Optional
13
- from typing import Sequence
14
-
15
- import cloudpickle
16
-
17
- from ...filenames import SHUTDOWN_FILENAME
18
- from ...task_files import TaskFiles
19
- from ...versions import get_versions
20
- from ..slurm_common._batching import heuristics
21
- from ..slurm_common._job_states import STATES_FINISHED
22
- from ..slurm_common._slurm_config import SlurmConfig
23
- from ..slurm_common.utils_executors import get_pickle_file_path
24
- from ..slurm_common.utils_executors import get_slurm_file_path
25
- from ..slurm_common.utils_executors import get_slurm_script_file_path
26
- from ._executor_wait_thread import FractalSlurmSSHWaitThread
27
- from fractal_server.app.runner.compress_folder import compress_folder
28
- from fractal_server.app.runner.exceptions import JobExecutionError
29
- from fractal_server.app.runner.exceptions import TaskExecutionError
30
- from fractal_server.app.runner.executors.slurm_ssh._slurm_job import SlurmJob
31
- from fractal_server.app.runner.extract_archive import extract_archive
32
- from fractal_server.config import get_settings
33
- from fractal_server.logger import set_logger
34
- from fractal_server.ssh._fabric import FractalSSH
35
- from fractal_server.syringe import Inject
36
-
37
-
38
- logger = set_logger(__name__)
39
-
40
-
41
- class FractalSlurmSSHExecutor(Executor):
42
- """
43
- Executor to submit SLURM jobs via SSH
44
-
45
- This class is a custom re-implementation of the SLURM executor from
46
-
47
- > clusterfutures <https://github.com/sampsyo/clusterfutures>
48
- > Original Copyright
49
- > Copyright 2021 Adrian Sampson <asampson@cs.washington.edu>
50
- > License: MIT
51
-
52
-
53
- Attributes:
54
- fractal_ssh: FractalSSH connection with custom lock
55
- workflow_dir_local:
56
- Directory for both the cfut/SLURM and fractal-server files and logs
57
- workflow_dir_remote:
58
- Directory for both the cfut/SLURM and fractal-server files and logs
59
- shutdown_file:
60
- python_remote: Equal to `settings.FRACTAL_SLURM_WORKER_PYTHON`
61
- wait_thread_cls: Class for waiting thread
62
- common_script_lines:
63
- Arbitrary script lines that will always be included in the
64
- sbatch script
65
- slurm_account:
66
- jobs:
67
- map_jobid_to_slurm_files:
68
- Dictionary with paths of slurm-related files for active jobs
69
- """
70
-
71
- fractal_ssh: FractalSSH
72
-
73
- workflow_dir_local: Path
74
- workflow_dir_remote: Path
75
- shutdown_file: str
76
- python_remote: str
77
-
78
- wait_thread_cls = FractalSlurmSSHWaitThread
79
-
80
- common_script_lines: list[str]
81
- slurm_account: Optional[str] = None
82
-
83
- jobs: dict[str, tuple[Future, SlurmJob]]
84
- map_jobid_to_slurm_files_local: dict[str, tuple[str, str, str]]
85
-
86
- def __init__(
87
- self,
88
- *,
89
- # FractalSSH connection
90
- fractal_ssh: FractalSSH,
91
- # Folders and files
92
- workflow_dir_local: Path,
93
- workflow_dir_remote: Path,
94
- # Monitoring options
95
- slurm_poll_interval: Optional[int] = None,
96
- # SLURM submission script options
97
- common_script_lines: Optional[list[str]] = None,
98
- slurm_account: Optional[str] = None,
99
- # Other kwargs are ignored
100
- **kwargs,
101
- ):
102
- """
103
- Init method for FractalSlurmSSHExecutor
104
-
105
- Note: since we are not using `super().__init__`, we duplicate some
106
- relevant bits of `cfut.ClusterExecutor.__init__`.
107
-
108
- Args:
109
- fractal_ssh:
110
- workflow_dir_local:
111
- workflow_dir_remote:
112
- slurm_poll_interval:
113
- common_script_lines:
114
- slurm_account:
115
- """
116
-
117
- if kwargs != {}:
118
- raise ValueError(
119
- f"FractalSlurmSSHExecutor received unexpected {kwargs=}"
120
- )
121
-
122
- self.workflow_dir_local = workflow_dir_local
123
- self.workflow_dir_remote = workflow_dir_remote
124
-
125
- # Relevant bits of cfut.ClusterExecutor.__init__ are copied here,
126
- # postponing the .start() call to when the callbacks are defined
127
- self.jobs = {}
128
- self.job_outfiles = {}
129
- self.jobs_lock = threading.Lock()
130
- self.jobs_empty_cond = threading.Condition(self.jobs_lock)
131
- self.wait_thread = self.wait_thread_cls(self._completion)
132
-
133
- # Set up attributes and methods for self.wait_thread
134
- # cfut.SlurmWaitThread)
135
- self.wait_thread.shutdown_callback = self.shutdown
136
- self.wait_thread.jobs_finished_callback = self._jobs_finished
137
- if slurm_poll_interval is None:
138
- settings = Inject(get_settings)
139
- slurm_poll_interval = settings.FRACTAL_SLURM_POLL_INTERVAL
140
- elif slurm_poll_interval <= 0:
141
- raise ValueError(f"Invalid attribute {slurm_poll_interval=}")
142
- self.wait_thread.slurm_poll_interval = slurm_poll_interval
143
- self.wait_thread.shutdown_file = (
144
- self.workflow_dir_local / SHUTDOWN_FILENAME
145
- ).as_posix()
146
-
147
- # Now start self.wait_thread (note: this must be *after* its callback
148
- # methods have been defined)
149
- self.wait_thread.start()
150
-
151
- # Define remote Python interpreter
152
- settings = Inject(get_settings)
153
- self.python_remote = settings.FRACTAL_SLURM_WORKER_PYTHON
154
- if self.python_remote is None:
155
- self._stop_and_join_wait_thread()
156
- raise ValueError("FRACTAL_SLURM_WORKER_PYTHON is not set. Exit.")
157
-
158
- # Initialize connection and perform handshake
159
- self.fractal_ssh = fractal_ssh
160
- logger.warning(self.fractal_ssh)
161
- try:
162
- self.handshake()
163
- except Exception as e:
164
- logger.warning(
165
- "Stop/join waiting thread and then "
166
- f"re-raise original error {str(e)}"
167
- )
168
- self._stop_and_join_wait_thread()
169
- raise e
170
-
171
- # Set/validate parameters for SLURM submission scripts
172
- self.slurm_account = slurm_account
173
- self.common_script_lines = common_script_lines or []
174
- try:
175
- self._validate_common_script_lines()
176
- except Exception as e:
177
- logger.warning(
178
- "Stop/join waiting thread and then "
179
- f"re-raise original error {str(e)}"
180
- )
181
- self._stop_and_join_wait_thread()
182
- raise e
183
-
184
- # Set/initialize some more options
185
- self.map_jobid_to_slurm_files_local = {}
186
-
187
- def _validate_common_script_lines(self):
188
- """
189
- Check that SLURM account is not set in `self.common_script_lines`.
190
- """
191
- try:
192
- invalid_line = next(
193
- line
194
- for line in self.common_script_lines
195
- if line.startswith("#SBATCH --account=")
196
- )
197
- raise RuntimeError(
198
- "Invalid line in `FractalSlurmSSHExecutor."
199
- "common_script_lines`: "
200
- f"'{invalid_line}'.\n"
201
- "SLURM account must be set via the request body of the "
202
- "apply-workflow endpoint, or by modifying the user properties."
203
- )
204
- except StopIteration:
205
- pass
206
-
207
- def _cleanup(self, jobid: str) -> None:
208
- """
209
- Given a job ID, perform any necessary cleanup after the job has
210
- finished.
211
- """
212
- with self.jobs_lock:
213
- self.map_jobid_to_slurm_files_local.pop(jobid)
214
-
215
- def submit(
216
- self,
217
- fun: callable,
218
- *fun_args: Sequence[Any],
219
- slurm_config: SlurmConfig,
220
- task_files: TaskFiles,
221
- **fun_kwargs: dict,
222
- ) -> Future:
223
- """
224
- Submit a function for execution on `FractalSlurmSSHExecutor`
225
-
226
- Arguments:
227
- fun: The function to be executed
228
- fun_args: Function positional arguments
229
- fun_kwargs: Function keyword arguments
230
- slurm_config:
231
- A `SlurmConfig` object.
232
- task_files:
233
- A `TaskFiles` object.
234
-
235
- Returns:
236
- Future representing the execution of the current SLURM job.
237
- """
238
-
239
- # Do not continue if auxiliary thread was shut down
240
- if self.wait_thread.shutdown:
241
- error_msg = "Cannot call `submit` method after executor shutdown"
242
- logger.warning(error_msg)
243
- raise JobExecutionError(info=error_msg)
244
-
245
- # Set slurm_file_prefix
246
- slurm_file_prefix = task_files.file_prefix
247
-
248
- # Include common_script_lines in extra_lines
249
- logger.debug(
250
- f"Adding {self.common_script_lines=} to "
251
- f"{slurm_config.extra_lines=}, from submit method."
252
- )
253
- current_extra_lines = slurm_config.extra_lines or []
254
- slurm_config.extra_lines = (
255
- current_extra_lines + self.common_script_lines
256
- )
257
-
258
- # Adapt slurm_config to the fact that this is a single-task SlurmJob
259
- # instance
260
- slurm_config.tasks_per_job = 1
261
- slurm_config.parallel_tasks_per_job = 1
262
-
263
- job = self._prepare_job(
264
- fun,
265
- slurm_config=slurm_config,
266
- slurm_file_prefix=slurm_file_prefix,
267
- task_files=task_files,
268
- single_task_submission=True,
269
- args=fun_args,
270
- kwargs=fun_kwargs,
271
- )
272
- self._put_subfolder_sftp(jobs=[job])
273
- future, job_id_str = self._submit_job(job)
274
- self.wait_thread.wait(job_id=job_id_str)
275
- return future
276
-
277
- def map(
278
- self,
279
- fn: callable,
280
- iterable: list[Sequence[Any]],
281
- *,
282
- slurm_config: SlurmConfig,
283
- task_files: TaskFiles,
284
- ):
285
- """
286
- Return an iterator with the results of several executions of a function
287
-
288
- This function is based on `concurrent.futures.Executor.map` from Python
289
- Standard Library 3.11.
290
- Original Copyright 2009 Brian Quinlan. All Rights Reserved. Licensed to
291
- PSF under a Contributor Agreement.
292
-
293
- Main modifications from the PSF function:
294
-
295
- 1. Only `fn` and `iterable` can be assigned as positional arguments;
296
- 2. `*iterables` argument replaced with a single `iterable`;
297
- 3. `timeout` and `chunksize` arguments are not supported.
298
-
299
- Arguments:
300
- fn:
301
- The function to be executed
302
- iterable:
303
- An iterable such that each element is the list of arguments to
304
- be passed to `fn`, as in `fn(*args)`.
305
- slurm_config:
306
- A `SlurmConfig` object.
307
- task_files:
308
- A `TaskFiles` object.
309
- """
310
-
311
- # Do not continue if auxiliary thread was shut down
312
- if self.wait_thread.shutdown:
313
- error_msg = "Cannot call `map` method after executor shutdown"
314
- logger.warning(error_msg)
315
- raise JobExecutionError(info=error_msg)
316
-
317
- def _result_or_cancel(fut):
318
- """
319
- This function is based on the Python Standard Library 3.11.
320
- Original Copyright 2009 Brian Quinlan. All Rights Reserved.
321
- Licensed to PSF under a Contributor Agreement.
322
- """
323
- try:
324
- try:
325
- return fut.result()
326
- finally:
327
- fut.cancel()
328
- finally:
329
- # Break a reference cycle with the exception in
330
- # self._exception
331
- del fut
332
-
333
- # Include common_script_lines in extra_lines
334
- logger.debug(
335
- f"Adding {self.common_script_lines=} to "
336
- f"{slurm_config.extra_lines=}, from map method."
337
- )
338
- current_extra_lines = slurm_config.extra_lines or []
339
- slurm_config.extra_lines = (
340
- current_extra_lines + self.common_script_lines
341
- )
342
-
343
- # Set file prefixes
344
- general_slurm_file_prefix = str(task_files.task_order)
345
-
346
- # Transform iterable into a list and count its elements
347
- list_args = list(iterable)
348
- tot_tasks = len(list_args)
349
-
350
- # Set/validate parameters for task batching
351
- tasks_per_job, parallel_tasks_per_job = heuristics(
352
- # Number of parallel components (always known)
353
- tot_tasks=len(list_args),
354
- # Optional WorkflowTask attributes:
355
- tasks_per_job=slurm_config.tasks_per_job,
356
- parallel_tasks_per_job=slurm_config.parallel_tasks_per_job, # noqa
357
- # Task requirements (multiple possible sources):
358
- cpus_per_task=slurm_config.cpus_per_task,
359
- mem_per_task=slurm_config.mem_per_task_MB,
360
- # Fractal configuration variables (soft/hard limits):
361
- target_cpus_per_job=slurm_config.target_cpus_per_job,
362
- target_mem_per_job=slurm_config.target_mem_per_job,
363
- target_num_jobs=slurm_config.target_num_jobs,
364
- max_cpus_per_job=slurm_config.max_cpus_per_job,
365
- max_mem_per_job=slurm_config.max_mem_per_job,
366
- max_num_jobs=slurm_config.max_num_jobs,
367
- )
368
- slurm_config.parallel_tasks_per_job = parallel_tasks_per_job
369
- slurm_config.tasks_per_job = tasks_per_job
370
-
371
- # Divide arguments in batches of `n_tasks_per_script` tasks each
372
- args_batches = []
373
- batch_size = tasks_per_job
374
- for ind_chunk in range(0, tot_tasks, batch_size):
375
- args_batches.append(list_args[ind_chunk : ind_chunk + batch_size])
376
- if len(args_batches) != math.ceil(tot_tasks / tasks_per_job):
377
- raise RuntimeError("Something wrong here while batching tasks")
378
-
379
- # Fetch configuration variable
380
- settings = Inject(get_settings)
381
- FRACTAL_SLURM_SBATCH_SLEEP = settings.FRACTAL_SLURM_SBATCH_SLEEP
382
-
383
- logger.debug("[map] Job preparation - START")
384
- current_component_index = 0
385
- jobs_to_submit = []
386
- for ind_batch, batch in enumerate(args_batches):
387
- batch_size = len(batch)
388
- this_slurm_file_prefix = (
389
- f"{general_slurm_file_prefix}_batch_{ind_batch:06d}"
390
- )
391
- new_job_to_submit = self._prepare_job(
392
- fn,
393
- slurm_config=slurm_config,
394
- slurm_file_prefix=this_slurm_file_prefix,
395
- task_files=task_files,
396
- single_task_submission=False,
397
- components=batch,
398
- )
399
- jobs_to_submit.append(new_job_to_submit)
400
- current_component_index += batch_size
401
- logger.debug("[map] Job preparation - END")
402
-
403
- self._put_subfolder_sftp(jobs=jobs_to_submit)
404
-
405
- # Construct list of futures (one per SLURM job, i.e. one per batch)
406
- # FIXME SSH: we may create a single `_submit_many_jobs` method to
407
- # reduce the number of commands run over SSH
408
- logger.debug("[map] Job submission - START")
409
- fs = []
410
- job_ids = []
411
- for job in jobs_to_submit:
412
- future, job_id = self._submit_job(job)
413
- job_ids.append(job_id)
414
- fs.append(future)
415
- time.sleep(FRACTAL_SLURM_SBATCH_SLEEP)
416
- for job_id in job_ids:
417
- self.wait_thread.wait(job_id=job_id)
418
- logger.debug("[map] Job submission - END")
419
-
420
- # Yield must be hidden in closure so that the futures are submitted
421
- # before the first iterator value is required.
422
- # NOTE: In this custom map() method, _result_or_cancel(fs.pop()) is an
423
- # iterable of results (if successful), and we should yield its elements
424
- # rather than the whole iterable.
425
- def result_iterator():
426
- """
427
- This function is based on the Python Standard Library 3.11.
428
- Original Copyright 2009 Brian Quinlan. All Rights Reserved.
429
- Licensed to PSF under a Contributor Agreement.
430
- """
431
- try:
432
- # reverse to keep finishing order
433
- fs.reverse()
434
- while fs:
435
- # Careful not to keep a reference to the popped future
436
- results = _result_or_cancel(fs.pop())
437
- for res in results:
438
- yield res
439
- finally:
440
- for future in fs:
441
- future.cancel()
442
-
443
- return result_iterator()
444
-
445
- def _prepare_job(
446
- self,
447
- fun: callable,
448
- slurm_file_prefix: str,
449
- task_files: TaskFiles,
450
- slurm_config: SlurmConfig,
451
- single_task_submission: bool = False,
452
- args: Optional[Sequence[Any]] = None,
453
- kwargs: Optional[dict] = None,
454
- components: Optional[list[Any]] = None,
455
- ) -> SlurmJob:
456
- """
457
- Prepare a SLURM job locally, without submitting it
458
-
459
- This function prepares and writes the local submission script, but it
460
- does not transfer it to the SLURM cluster.
461
-
462
- NOTE: this method has different behaviors when it is called from the
463
- `self.submit` or `self.map` methods (which is also encoded in
464
- `single_task_submission`):
465
-
466
- * When called from `self.submit`, it supports general `args` and
467
- `kwargs` arguments;
468
- * When called from `self.map`, there cannot be any `args` or `kwargs`
469
- argument, but there must be a `components` argument.
470
-
471
- Arguments:
472
- fun:
473
- slurm_file_prefix:
474
- task_files:
475
- slurm_config:
476
- single_task_submission:
477
- args:
478
- kwargs:
479
- components:
480
-
481
- Returns:
482
- SlurmJob object
483
- """
484
-
485
- # Inject SLURM account (if set) into slurm_config
486
- if self.slurm_account:
487
- slurm_config.account = self.slurm_account
488
-
489
- # Define slurm-job-related files
490
- if single_task_submission:
491
- if components is not None:
492
- raise ValueError(
493
- f"{single_task_submission=} but components is not None"
494
- )
495
- job = SlurmJob(
496
- slurm_file_prefix=slurm_file_prefix,
497
- num_tasks_tot=1,
498
- slurm_config=slurm_config,
499
- )
500
- if job.num_tasks_tot > 1:
501
- raise ValueError(
502
- "{single_task_submission=} but {job.num_tasks_tot=}"
503
- )
504
- job.single_task_submission = True
505
- job.wftask_file_prefixes = (task_files.file_prefix,)
506
- job.wftask_subfolder_name = task_files.subfolder_name
507
-
508
- else:
509
- if not components or len(components) < 1:
510
- raise ValueError(
511
- "In FractalSlurmSSHExecutor._submit_job, given "
512
- f"{components=}."
513
- )
514
- num_tasks_tot = len(components)
515
- job = SlurmJob(
516
- slurm_file_prefix=slurm_file_prefix,
517
- num_tasks_tot=num_tasks_tot,
518
- slurm_config=slurm_config,
519
- )
520
-
521
- _prefixes = []
522
- _subfolder_names = []
523
- for component in components:
524
- # In Fractal, `component` is `dict` by construction (e.g.
525
- # `component = {"zarr_url": "/something", "param": 1}``). The
526
- # try/except covers the case of e.g. `executor.map([1, 2])`,
527
- # which is useful for testing.
528
-
529
- # FIXME: the use of _COMPONENT_KEY_ is now deprecated
530
- # try:
531
- # actual_component = component.get(_COMPONENT_KEY_, None)
532
- # except AttributeError:
533
- # actual_component = str(component)
534
- actual_component = "FAKE_INVALID_VALUE_FIXME"
535
-
536
- _task_file_paths = TaskFiles(
537
- root_dir_local=task_files.workflow_dir_local,
538
- root_dir_remote=task_files.workflow_dir_remote,
539
- task_name=task_files.task_name,
540
- task_order=task_files.task_order,
541
- component=actual_component,
542
- )
543
- _prefixes.append(_task_file_paths.file_prefix)
544
- _subfolder_names.append(_task_file_paths.subfolder_name)
545
- job.wftask_file_prefixes = tuple(_prefixes)
546
-
547
- # Check that all components share the same subfolder
548
- num_subfolders = len(set(_subfolder_names))
549
- if num_subfolders != 1:
550
- error_msg_short = (
551
- f"[_submit_job] Subfolder list has {num_subfolders} "
552
- "different values, but it must have only one (since "
553
- "workflow tasks are executed one by one)."
554
- )
555
- error_msg_detail = (
556
- "[_submit_job] Current unique subfolder names: "
557
- f"{set(_subfolder_names)}"
558
- )
559
- logger.error(error_msg_short)
560
- logger.error(error_msg_detail)
561
- raise ValueError(error_msg_short)
562
- job.wftask_subfolder_name = _subfolder_names[0]
563
-
564
- # Check that server-side subfolder exists
565
- subfolder_path = self.workflow_dir_local / job.wftask_subfolder_name
566
- if not subfolder_path.exists():
567
- raise FileNotFoundError(
568
- f"Missing folder {subfolder_path.as_posix()}."
569
- )
570
-
571
- job.input_pickle_files_local = tuple(
572
- get_pickle_file_path(
573
- arg=job.workerids[ind],
574
- workflow_dir=self.workflow_dir_local,
575
- subfolder_name=job.wftask_subfolder_name,
576
- in_or_out="in",
577
- prefix=job.wftask_file_prefixes[ind],
578
- )
579
- for ind in range(job.num_tasks_tot)
580
- )
581
-
582
- job.input_pickle_files_remote = tuple(
583
- get_pickle_file_path(
584
- arg=job.workerids[ind],
585
- workflow_dir=self.workflow_dir_remote,
586
- subfolder_name=job.wftask_subfolder_name,
587
- in_or_out="in",
588
- prefix=job.wftask_file_prefixes[ind],
589
- )
590
- for ind in range(job.num_tasks_tot)
591
- )
592
- job.output_pickle_files_local = tuple(
593
- get_pickle_file_path(
594
- arg=job.workerids[ind],
595
- workflow_dir=self.workflow_dir_local,
596
- subfolder_name=job.wftask_subfolder_name,
597
- in_or_out="out",
598
- prefix=job.wftask_file_prefixes[ind],
599
- )
600
- for ind in range(job.num_tasks_tot)
601
- )
602
- job.output_pickle_files_remote = tuple(
603
- get_pickle_file_path(
604
- arg=job.workerids[ind],
605
- workflow_dir=self.workflow_dir_remote,
606
- subfolder_name=job.wftask_subfolder_name,
607
- in_or_out="out",
608
- prefix=job.wftask_file_prefixes[ind],
609
- )
610
- for ind in range(job.num_tasks_tot)
611
- )
612
- # define slurm-job file local/remote paths
613
- job.slurm_script_local = get_slurm_script_file_path(
614
- workflow_dir=self.workflow_dir_local,
615
- subfolder_name=job.wftask_subfolder_name,
616
- prefix=job.slurm_file_prefix,
617
- )
618
- job.slurm_script_remote = get_slurm_script_file_path(
619
- workflow_dir=self.workflow_dir_remote,
620
- subfolder_name=job.wftask_subfolder_name,
621
- prefix=job.slurm_file_prefix,
622
- )
623
- job.slurm_stdout_local = get_slurm_file_path(
624
- workflow_dir=self.workflow_dir_local,
625
- subfolder_name=job.wftask_subfolder_name,
626
- out_or_err="out",
627
- prefix=job.slurm_file_prefix,
628
- )
629
- job.slurm_stdout_remote = get_slurm_file_path(
630
- workflow_dir=self.workflow_dir_remote,
631
- subfolder_name=job.wftask_subfolder_name,
632
- out_or_err="out",
633
- prefix=job.slurm_file_prefix,
634
- )
635
- job.slurm_stderr_local = get_slurm_file_path(
636
- workflow_dir=self.workflow_dir_local,
637
- subfolder_name=job.wftask_subfolder_name,
638
- out_or_err="err",
639
- prefix=job.slurm_file_prefix,
640
- )
641
- job.slurm_stderr_remote = get_slurm_file_path(
642
- workflow_dir=self.workflow_dir_remote,
643
- subfolder_name=job.wftask_subfolder_name,
644
- out_or_err="err",
645
- prefix=job.slurm_file_prefix,
646
- )
647
-
648
- # Dump serialized versions+function+args+kwargs to pickle file(s)
649
- versions = get_versions()
650
- if job.single_task_submission:
651
- _args = args or []
652
- _kwargs = kwargs or {}
653
- funcser = cloudpickle.dumps((versions, fun, _args, _kwargs))
654
- with open(job.input_pickle_files_local[0], "wb") as f:
655
- f.write(funcser)
656
- else:
657
- for ind_component, component in enumerate(components):
658
- _args = [component]
659
- _kwargs = {}
660
- funcser = cloudpickle.dumps((versions, fun, _args, _kwargs))
661
- with open(
662
- job.input_pickle_files_local[ind_component], "wb"
663
- ) as f:
664
- f.write(funcser)
665
-
666
- # Prepare commands to be included in SLURM submission script
667
- cmdlines = []
668
- for ind_task in range(job.num_tasks_tot):
669
- input_pickle_file = job.input_pickle_files_remote[ind_task]
670
- output_pickle_file = job.output_pickle_files_remote[ind_task]
671
- cmdlines.append(
672
- (
673
- f"{self.python_remote}"
674
- " -m fractal_server.app.runner.executors.slurm.remote "
675
- f"--input-file {input_pickle_file} "
676
- f"--output-file {output_pickle_file}"
677
- )
678
- )
679
-
680
- # Prepare SLURM submission script
681
- sbatch_script_content = self._prepare_sbatch_script(
682
- slurm_config=job.slurm_config,
683
- list_commands=cmdlines,
684
- slurm_out_path=str(job.slurm_stdout_remote),
685
- slurm_err_path=str(job.slurm_stderr_remote),
686
- )
687
- with job.slurm_script_local.open("w") as f:
688
- f.write(sbatch_script_content)
689
-
690
- return job
691
-
692
- def _put_subfolder_sftp(self, jobs: list[SlurmJob]) -> None:
693
- """
694
- Transfer the jobs subfolder to the remote host.
695
-
696
- Arguments:
697
- jobs: The list of `SlurmJob` objects associated to a given
698
- subfolder.
699
- """
700
-
701
- # Check that the subfolder is unique
702
- subfolder_names = [job.wftask_subfolder_name for job in jobs]
703
- if len(set(subfolder_names)) > 1:
704
- raise ValueError(
705
- "[_put_subfolder] Invalid list of jobs, "
706
- f"{set(subfolder_names)=}."
707
- )
708
- subfolder_name = subfolder_names[0]
709
-
710
- # Create compressed subfolder archive (locally)
711
- local_subfolder = self.workflow_dir_local / subfolder_name
712
- tarfile_path_local = compress_folder(local_subfolder)
713
- tarfile_name = Path(tarfile_path_local).name
714
- logger.info(f"Subfolder archive created at {tarfile_path_local}")
715
- tarfile_path_remote = (
716
- self.workflow_dir_remote / tarfile_name
717
- ).as_posix()
718
-
719
- # Transfer archive
720
- t_0_put = time.perf_counter()
721
- self.fractal_ssh.send_file(
722
- local=tarfile_path_local,
723
- remote=tarfile_path_remote,
724
- )
725
- t_1_put = time.perf_counter()
726
- logger.info(
727
- f"Subfolder archive transferred to {tarfile_path_remote}"
728
- f" - elapsed: {t_1_put - t_0_put:.3f} s"
729
- )
730
- # Uncompress archive (remotely)
731
- tar_command = (
732
- f"{self.python_remote} -m "
733
- "fractal_server.app.runner.extract_archive "
734
- f"{tarfile_path_remote}"
735
- )
736
- self.fractal_ssh.run_command(cmd=tar_command)
737
-
738
- # Remove local version
739
- t_0_rm = time.perf_counter()
740
- Path(tarfile_path_local).unlink()
741
- t_1_rm = time.perf_counter()
742
- logger.info(
743
- f"Local archive removed - elapsed: {t_1_rm - t_0_rm:.3f} s"
744
- )
745
-
746
- def _submit_job(self, job: SlurmJob) -> tuple[Future, str]:
747
- """
748
- Submit a job to SLURM via SSH.
749
-
750
- This method must always be called after `self._put_subfolder`.
751
-
752
- Arguments:
753
- job: The `SlurmJob` object to submit.
754
- """
755
-
756
- # Prevent calling sbatch if auxiliary thread was shut down
757
- if self.wait_thread.shutdown:
758
- error_msg = (
759
- "Cannot call `_submit_job` method after executor shutdown"
760
- )
761
- logger.warning(error_msg)
762
- raise JobExecutionError(info=error_msg)
763
-
764
- # Submit job to SLURM, and get jobid
765
- sbatch_command = f"sbatch --parsable {job.slurm_script_remote}"
766
- pre_submission_cmds = job.slurm_config.pre_submission_commands
767
- if len(pre_submission_cmds) == 0:
768
- sbatch_stdout = self.fractal_ssh.run_command(cmd=sbatch_command)
769
- else:
770
- logger.debug(f"Now using {pre_submission_cmds=}")
771
- script_lines = pre_submission_cmds + [sbatch_command]
772
- script_content = "\n".join(script_lines)
773
- script_content = f"{script_content}\n"
774
- script_path_remote = (
775
- f"{job.slurm_script_remote.as_posix()}_wrapper.sh"
776
- )
777
- self.fractal_ssh.write_remote_file(
778
- path=script_path_remote, content=script_content
779
- )
780
- cmd = f"bash {script_path_remote}"
781
- sbatch_stdout = self.fractal_ssh.run_command(cmd=cmd)
782
-
783
- # Extract SLURM job ID from stdout
784
- try:
785
- stdout = sbatch_stdout.strip("\n")
786
- jobid = int(stdout)
787
- except ValueError as e:
788
- error_msg = (
789
- f"Submit command `{sbatch_command}` returned "
790
- f"`{stdout=}` which cannot be cast to an integer "
791
- f"SLURM-job ID.\n"
792
- f"Note that {pre_submission_cmds=}.\n"
793
- f"Original error:\n{str(e)}"
794
- )
795
- logger.error(error_msg)
796
- raise JobExecutionError(info=error_msg)
797
- job_id_str = str(jobid)
798
-
799
- # Plug job id in stdout/stderr SLURM file paths (local and remote)
800
- def _replace_job_id(_old_path: Path) -> Path:
801
- return Path(_old_path.as_posix().replace("%j", job_id_str))
802
-
803
- job.slurm_stdout_local = _replace_job_id(job.slurm_stdout_local)
804
- job.slurm_stdout_remote = _replace_job_id(job.slurm_stdout_remote)
805
- job.slurm_stderr_local = _replace_job_id(job.slurm_stderr_local)
806
- job.slurm_stderr_remote = _replace_job_id(job.slurm_stderr_remote)
807
-
808
- # Add the SLURM script/out/err paths to map_jobid_to_slurm_files (this
809
- # must be after the `sbatch` call, so that "%j" has already been
810
- # replaced with the job ID)
811
- with self.jobs_lock:
812
- self.map_jobid_to_slurm_files_local[job_id_str] = (
813
- job.slurm_script_local.as_posix(),
814
- job.slurm_stdout_local.as_posix(),
815
- job.slurm_stderr_local.as_posix(),
816
- )
817
-
818
- # Create future
819
- future = Future()
820
- with self.jobs_lock:
821
- self.jobs[job_id_str] = (future, job)
822
- return future, job_id_str
823
-
824
- def _prepare_JobExecutionError(
825
- self, jobid: str, info: str
826
- ) -> JobExecutionError:
827
- """
828
- Prepare the `JobExecutionError` for a given job
829
-
830
- This method creates a `JobExecutionError` object and sets its attribute
831
- to the appropriate SLURM-related file names. Note that the SLURM files
832
- are the local ones (i.e. the ones in `self.workflow_dir_local`).
833
-
834
- Arguments:
835
- jobid:
836
- ID of the SLURM job.
837
- info:
838
- """
839
- # Extract SLURM file paths
840
- with self.jobs_lock:
841
- (
842
- slurm_script_file,
843
- slurm_stdout_file,
844
- slurm_stderr_file,
845
- ) = self.map_jobid_to_slurm_files_local[jobid]
846
- # Construct JobExecutionError exception
847
- job_exc = JobExecutionError(
848
- cmd_file=slurm_script_file,
849
- stdout_file=slurm_stdout_file,
850
- stderr_file=slurm_stderr_file,
851
- info=info,
852
- )
853
- return job_exc
854
-
855
- def _missing_pickle_error_msg(self, out_path: Path) -> str:
856
- settings = Inject(get_settings)
857
- info = (
858
- "Output pickle file of the FractalSlurmSSHExecutor "
859
- "job not found.\n"
860
- f"Expected file path: {out_path.as_posix()}n"
861
- "Here are some possible reasons:\n"
862
- "1. The SLURM job was scancel-ed, either by the user "
863
- "or due to an error (e.g. an out-of-memory or timeout "
864
- "error). Note that if the scancel took place before "
865
- "the job started running, the SLURM out/err files "
866
- "will be empty.\n"
867
- "2. Some error occurred upon writing the file to disk "
868
- "(e.g. because there is not enough space on disk, or "
869
- "due to an overloaded NFS filesystem). "
870
- "Note that the server configuration has "
871
- "FRACTAL_SLURM_ERROR_HANDLING_INTERVAL="
872
- f"{settings.FRACTAL_SLURM_ERROR_HANDLING_INTERVAL} "
873
- "seconds.\n"
874
- )
875
- return info
876
-
877
- def _handle_remaining_jobs(
878
- self,
879
- remaining_futures: list[Future],
880
- remaining_job_ids: list[str],
881
- remaining_jobs: list[SlurmJob],
882
- ) -> None:
883
- """
884
- Helper function used within _completion, when looping over a list of
885
- several jobs/futures.
886
- """
887
- for future in remaining_futures:
888
- try:
889
- future.cancel()
890
- except InvalidStateError:
891
- pass
892
- for job_id in remaining_job_ids:
893
- self._cleanup(job_id)
894
- for job in remaining_jobs:
895
- for path in job.output_pickle_files_local:
896
- path.unlink()
897
- for path in job.input_pickle_files_local:
898
- path.unlink()
899
-
900
- def _completion(self, job_ids: list[str]) -> None:
901
- """
902
- Callback function to be executed whenever a job finishes.
903
-
904
- This function is executed by self.wait_thread (triggered by either
905
- finding an existing output pickle file `out_path` or finding that the
906
- SLURM job is over). Since this takes place on a different thread,
907
- failures may not be captured by the main thread; we use a broad
908
- try/except block, so that those exceptions are reported to the main
909
- thread via `fut.set_exception(...)`.
910
-
911
- Arguments:
912
- job_ids: IDs of the SLURM jobs to handle.
913
- """
914
- # Handle all uncaught exceptions in this broad try/except block
915
- try:
916
- logger.info(
917
- f"[FractalSlurmSSHExecutor._completion] START, for {job_ids=}."
918
- )
919
-
920
- # Loop over all job_ids, and fetch future and job objects
921
- futures: list[Future] = []
922
- jobs: list[SlurmJob] = []
923
- with self.jobs_lock:
924
- for job_id in job_ids:
925
- future, job = self.jobs.pop(job_id)
926
- futures.append(future)
927
- jobs.append(job)
928
- if not self.jobs:
929
- self.jobs_empty_cond.notify_all()
930
-
931
- # Fetch subfolder from remote host
932
- self._get_subfolder_sftp(jobs=jobs)
933
-
934
- # First round of checking whether all output files exist
935
- missing_out_paths = []
936
- for job in jobs:
937
- for ind_out_path, out_path in enumerate(
938
- job.output_pickle_files_local
939
- ):
940
- if not out_path.exists():
941
- missing_out_paths.append(out_path)
942
- num_missing = len(missing_out_paths)
943
- if num_missing > 0:
944
- # Output pickle files may be missing e.g. because of some slow
945
- # filesystem operation; wait some time before re-trying
946
- settings = Inject(get_settings)
947
- sleep_time = settings.FRACTAL_SLURM_ERROR_HANDLING_INTERVAL
948
- logger.info(
949
- f"{num_missing} output pickle files are missing; "
950
- f"sleep {sleep_time} seconds."
951
- )
952
- for missing_file in missing_out_paths:
953
- logger.debug(f"Missing output pickle file: {missing_file}")
954
- time.sleep(sleep_time)
955
-
956
- # Handle all jobs
957
- for ind_job, job_id in enumerate(job_ids):
958
- # Retrieve job and future objects
959
- job = jobs[ind_job]
960
- future = futures[ind_job]
961
- remaining_job_ids = job_ids[ind_job + 1 :]
962
- remaining_futures = futures[ind_job + 1 :]
963
-
964
- outputs = []
965
-
966
- for ind_out_path, out_path in enumerate(
967
- job.output_pickle_files_local
968
- ):
969
- in_path = job.input_pickle_files_local[ind_out_path]
970
- if not out_path.exists():
971
- # Output pickle file is still missing
972
- info = self._missing_pickle_error_msg(out_path)
973
- job_exc = self._prepare_JobExecutionError(
974
- job_id, info=info
975
- )
976
- try:
977
- future.set_exception(job_exc)
978
- self._handle_remaining_jobs(
979
- remaining_futures=remaining_futures,
980
- remaining_job_ids=remaining_job_ids,
981
- )
982
- logger.info(
983
- "[FractalSlurmSSHExecutor._completion] END, "
984
- f"for {job_ids=}, with JobExecutionError due "
985
- f"to missing {out_path.as_posix()}."
986
- )
987
- return
988
- except InvalidStateError:
989
- logger.warning(
990
- f"Future {future} (SLURM job ID: {job_id}) "
991
- "was already cancelled."
992
- )
993
- in_path.unlink()
994
- self._cleanup(job_id)
995
- self._handle_remaining_jobs(
996
- remaining_futures=remaining_futures,
997
- remaining_job_ids=remaining_job_ids,
998
- )
999
- logger.info(
1000
- "[FractalSlurmSSHExecutor._completion] END, "
1001
- f"for {job_ids=}, with JobExecutionError/"
1002
- "InvalidStateError due to "
1003
- f"missing {out_path.as_posix()}."
1004
- )
1005
- return
1006
-
1007
- # Read the task output
1008
- with out_path.open("rb") as f:
1009
- outdata = f.read()
1010
- # Note: output can be either the task result (typically a
1011
- # dictionary) or an ExceptionProxy object; in the latter
1012
- # case, the ExceptionProxy definition is also part of the
1013
- # pickle file (thanks to cloudpickle.dumps).
1014
- success, output = cloudpickle.loads(outdata)
1015
- try:
1016
- if success:
1017
- outputs.append(output)
1018
- else:
1019
- proxy = output
1020
- if proxy.exc_type_name == "JobExecutionError":
1021
- job_exc = self._prepare_JobExecutionError(
1022
- job_id, info=proxy.kwargs.get("info", None)
1023
- )
1024
- future.set_exception(job_exc)
1025
- self._handle_remaining_jobs(
1026
- remaining_futures=remaining_futures,
1027
- remaining_job_ids=remaining_job_ids,
1028
- )
1029
- return
1030
- else:
1031
- # This branch catches both TaskExecutionError's
1032
- # (coming from the typical fractal-server
1033
- # execution of tasks, and with additional
1034
- # fractal-specific kwargs) or arbitrary
1035
- # exceptions (coming from a direct use of
1036
- # FractalSlurmSSHExecutor, possibly outside
1037
- # fractal-server)
1038
- kwargs = {}
1039
- for key in [
1040
- "workflow_task_id",
1041
- "workflow_task_order",
1042
- "task_name",
1043
- ]:
1044
- if key in proxy.kwargs.keys():
1045
- kwargs[key] = proxy.kwargs[key]
1046
- exc = TaskExecutionError(proxy.tb, **kwargs)
1047
- future.set_exception(exc)
1048
- self._handle_remaining_jobs(
1049
- remaining_futures=remaining_futures,
1050
- remaining_job_ids=remaining_job_ids,
1051
- )
1052
- return
1053
- out_path.unlink()
1054
- except InvalidStateError:
1055
- logger.warning(
1056
- f"Future {future} (SLURM job ID: {job_id}) was "
1057
- "already cancelled, exit from "
1058
- "FractalSlurmSSHExecutor._completion."
1059
- )
1060
- out_path.unlink()
1061
- in_path.unlink()
1062
-
1063
- self._cleanup(job_id)
1064
- self._handle_remaining_jobs(
1065
- remaining_futures=remaining_futures,
1066
- remaining_job_ids=remaining_job_ids,
1067
- )
1068
- return
1069
-
1070
- # Clean up input pickle file
1071
- in_path.unlink()
1072
- self._cleanup(job_id)
1073
- if job.single_task_submission:
1074
- future.set_result(outputs[0])
1075
- else:
1076
- future.set_result(outputs)
1077
-
1078
- except Exception as e:
1079
- logger.warning(
1080
- "[FractalSlurmSSHExecutor._completion] "
1081
- f"An exception took place: {str(e)}."
1082
- )
1083
- for future in futures:
1084
- try:
1085
- logger.info(f"Set exception for {future=}")
1086
- future.set_exception(e)
1087
- except InvalidStateError:
1088
- logger.info(f"Future {future} was already cancelled.")
1089
- logger.info(
1090
- f"[FractalSlurmSSHExecutor._completion] END, for {job_ids=}, "
1091
- "from within exception handling."
1092
- )
1093
- return
1094
-
1095
- def _get_subfolder_sftp(self, jobs: list[SlurmJob]) -> None:
1096
- """
1097
- Fetch a remote folder via tar+sftp+tar
1098
-
1099
- Arguments:
1100
- jobs:
1101
- List of `SlurmJob` object (needed for their prefix-related
1102
- attributes).
1103
- """
1104
-
1105
- # Check that the subfolder is unique
1106
- subfolder_names = [job.wftask_subfolder_name for job in jobs]
1107
- if len(set(subfolder_names)) > 1:
1108
- raise ValueError(
1109
- "[_put_subfolder] Invalid list of jobs, "
1110
- f"{set(subfolder_names)=}."
1111
- )
1112
- subfolder_name = subfolder_names[0]
1113
-
1114
- t_0 = time.perf_counter()
1115
- logger.debug("[_get_subfolder_sftp] Start")
1116
- tarfile_path_local = (
1117
- self.workflow_dir_local / f"{subfolder_name}.tar.gz"
1118
- ).as_posix()
1119
- tarfile_path_remote = (
1120
- self.workflow_dir_remote / f"{subfolder_name}.tar.gz"
1121
- ).as_posix()
1122
-
1123
- # Remove remote tarfile
1124
- rm_command = f"rm {tarfile_path_remote}"
1125
- self.fractal_ssh.run_command(cmd=rm_command)
1126
-
1127
- # Create remote tarfile
1128
- tar_command = (
1129
- f"{self.python_remote} "
1130
- "-m fractal_server.app.runner.compress_folder "
1131
- f"{(self.workflow_dir_remote / subfolder_name).as_posix()} "
1132
- "--remote-to-local"
1133
- )
1134
- stdout = self.fractal_ssh.run_command(cmd=tar_command)
1135
- print(stdout)
1136
-
1137
- # Fetch tarfile
1138
- t_0_get = time.perf_counter()
1139
- self.fractal_ssh.fetch_file(
1140
- remote=tarfile_path_remote,
1141
- local=tarfile_path_local,
1142
- )
1143
- t_1_get = time.perf_counter()
1144
- logger.info(
1145
- f"Subfolder archive transferred back to {tarfile_path_local}"
1146
- f" - elapsed: {t_1_get - t_0_get:.3f} s"
1147
- )
1148
-
1149
- # Extract tarfile locally
1150
- extract_archive(Path(tarfile_path_local))
1151
-
1152
- # Remove local tarfile
1153
- if Path(tarfile_path_local).exists():
1154
- logger.warning(f"Remove existing file {tarfile_path_local}.")
1155
- Path(tarfile_path_local).unlink()
1156
-
1157
- t_1 = time.perf_counter()
1158
- logger.info(f"[_get_subfolder_sftp] End - elapsed: {t_1 - t_0:.3f} s")
1159
-
1160
- def _prepare_sbatch_script(
1161
- self,
1162
- *,
1163
- list_commands: list[str],
1164
- slurm_out_path: str,
1165
- slurm_err_path: str,
1166
- slurm_config: SlurmConfig,
1167
- ):
1168
- num_tasks_max_running = slurm_config.parallel_tasks_per_job
1169
- mem_per_task_MB = slurm_config.mem_per_task_MB
1170
-
1171
- # Set ntasks
1172
- ntasks = min(len(list_commands), num_tasks_max_running)
1173
- if len(list_commands) < num_tasks_max_running:
1174
- ntasks = len(list_commands)
1175
- slurm_config.parallel_tasks_per_job = ntasks
1176
- logger.debug(
1177
- f"{len(list_commands)=} is smaller than "
1178
- f"{num_tasks_max_running=}. Setting {ntasks=}."
1179
- )
1180
-
1181
- # Prepare SLURM preamble based on SlurmConfig object
1182
- script_lines = slurm_config.to_sbatch_preamble(
1183
- remote_export_dir=self.workflow_dir_remote.as_posix()
1184
- )
1185
-
1186
- # Extend SLURM preamble with variable which are not in SlurmConfig, and
1187
- # fix their order
1188
- script_lines.extend(
1189
- [
1190
- f"#SBATCH --err={slurm_err_path}",
1191
- f"#SBATCH --out={slurm_out_path}",
1192
- f"#SBATCH -D {self.workflow_dir_remote}",
1193
- ]
1194
- )
1195
- script_lines = slurm_config.sort_script_lines(script_lines)
1196
- logger.debug(script_lines)
1197
-
1198
- # Always print output of `uname -n` and `pwd`
1199
- script_lines.append(
1200
- '"Hostname: `uname -n`; current directory: `pwd`"\n'
1201
- )
1202
-
1203
- # Complete script preamble
1204
- script_lines.append("\n")
1205
-
1206
- # Include command lines
1207
- tmp_list_commands = copy(list_commands)
1208
- while tmp_list_commands:
1209
- if tmp_list_commands:
1210
- cmd = tmp_list_commands.pop(0) # take first element
1211
- script_lines.append(
1212
- "srun --ntasks=1 --cpus-per-task=$SLURM_CPUS_PER_TASK "
1213
- f"--mem={mem_per_task_MB}MB "
1214
- f"{cmd} &"
1215
- )
1216
- script_lines.append("wait\n")
1217
-
1218
- script = "\n".join(script_lines)
1219
- return script
1220
-
1221
- def shutdown(self, wait=True, *, cancel_futures=False):
1222
- """
1223
- Clean up all executor variables. Note that this function is executed on
1224
- the self.wait_thread thread, see _completion.
1225
- """
1226
-
1227
- # Redundantly set thread shutdown attribute to True
1228
- self.wait_thread.shutdown = True
1229
-
1230
- logger.debug("Executor shutdown: start")
1231
-
1232
- # Handle all job futures
1233
- slurm_jobs_to_scancel = []
1234
- with self.jobs_lock:
1235
- while self.jobs:
1236
- jobid, fut_and_job = self.jobs.popitem()
1237
- slurm_jobs_to_scancel.append(jobid)
1238
- fut = fut_and_job[0]
1239
- self.map_jobid_to_slurm_files_local.pop(jobid)
1240
- if not fut.cancelled():
1241
- fut.set_exception(
1242
- JobExecutionError(
1243
- "Job cancelled due to executor shutdown."
1244
- )
1245
- )
1246
- fut.cancel()
1247
-
1248
- # Cancel SLURM jobs
1249
- if slurm_jobs_to_scancel:
1250
- scancel_string = " ".join(slurm_jobs_to_scancel)
1251
- logger.warning(f"Now scancel-ing SLURM jobs {scancel_string}")
1252
- scancel_command = f"scancel {scancel_string}"
1253
- self.fractal_ssh.run_command(cmd=scancel_command)
1254
- logger.debug("Executor shutdown: end")
1255
-
1256
- def _stop_and_join_wait_thread(self):
1257
- self.wait_thread.shutdown = True
1258
- self.wait_thread.join()
1259
-
1260
- def __exit__(self, *args, **kwargs):
1261
- """
1262
- See
1263
- https://github.com/fractal-analytics-platform/fractal-server/issues/1508
1264
- """
1265
- logger.debug(
1266
- "[FractalSlurmSSHExecutor.__exit__] Stop and join `wait_thread`"
1267
- )
1268
- self._stop_and_join_wait_thread()
1269
- logger.debug("[FractalSlurmSSHExecutor.__exit__] End")
1270
-
1271
- def run_squeue(self, job_ids):
1272
- squeue_command = (
1273
- "squeue "
1274
- "--noheader "
1275
- "--format='%i %T' "
1276
- "--jobs __JOBS__ "
1277
- "--states=all"
1278
- )
1279
- job_ids = ",".join([str(j) for j in job_ids])
1280
- squeue_command = squeue_command.replace("__JOBS__", job_ids)
1281
- stdout = self.fractal_ssh.run_command(cmd=squeue_command)
1282
- return stdout
1283
-
1284
- def _jobs_finished(self, job_ids: list[str]) -> set[str]:
1285
- """
1286
- Check which of the given Slurm jobs have already finished
1287
-
1288
- The function is based on the `_jobs_finished` function from
1289
- clusterfutures (version 0.5).
1290
- Original Copyright: 2022 Adrian Sampson
1291
- (released under the MIT licence)
1292
- """
1293
-
1294
- logger.debug(
1295
- f"[FractalSlurmSSHExecutor._jobs_finished] START ({job_ids=})"
1296
- )
1297
-
1298
- # If there is no Slurm job to check, return right away
1299
- if not job_ids:
1300
- logger.debug(
1301
- "[FractalSlurmSSHExecutor._jobs_finished] "
1302
- "No jobs provided, return."
1303
- )
1304
- return set()
1305
-
1306
- try:
1307
- stdout = self.run_squeue(job_ids)
1308
- id_to_state = {
1309
- out.split()[0]: out.split()[1] for out in stdout.splitlines()
1310
- }
1311
- # Finished jobs only stay in squeue for a few mins (configurable).
1312
- # If a job ID isn't there, we'll assume it's finished.
1313
- output = {
1314
- _id
1315
- for _id in job_ids
1316
- if id_to_state.get(_id, "COMPLETED") in STATES_FINISHED
1317
- }
1318
- logger.debug(
1319
- f"[FractalSlurmSSHExecutor._jobs_finished] END - {output=}"
1320
- )
1321
- return output
1322
- except Exception as e:
1323
- # If something goes wrong, proceed anyway
1324
- logger.error(
1325
- f"Something wrong in _jobs_finished. Original error: {str(e)}"
1326
- )
1327
- output = set()
1328
- logger.debug(
1329
- f"[FractalSlurmSSHExecutor._jobs_finished] END - {output=}"
1330
- )
1331
- return output
1332
-
1333
- id_to_state = dict()
1334
- for j in job_ids:
1335
- res = self.run_squeue([j])
1336
- if res.returncode != 0:
1337
- logger.info(f"Job {j} not found. Marked it as completed")
1338
- id_to_state.update({str(j): "COMPLETED"})
1339
- else:
1340
- id_to_state.update(
1341
- {res.stdout.split()[0]: res.stdout.split()[1]}
1342
- )
1343
-
1344
- def handshake(self) -> dict:
1345
- """
1346
- Healthcheck for SSH connection and for versions match.
1347
-
1348
- FIXME SSH: We should add a timeout here
1349
- FIXME SSH: We could include checks on the existence of folders
1350
- FIXME SSH: We could include further checks on version matches
1351
- """
1352
-
1353
- self.fractal_ssh.check_connection()
1354
-
1355
- t_start_handshake = time.perf_counter()
1356
-
1357
- logger.info("[FractalSlurmSSHExecutor.ssh_handshake] START")
1358
- cmd = f"{self.python_remote} -m fractal_server.app.runner.versions"
1359
- stdout = self.fractal_ssh.run_command(cmd=cmd)
1360
- try:
1361
- remote_versions = json.loads(stdout.strip("\n"))
1362
- except json.decoder.JSONDecodeError as e:
1363
- logger.error("Fractal server versions not available")
1364
- raise e
1365
-
1366
- # Check compatibility with local versions
1367
- local_versions = get_versions()
1368
- remote_fractal_server = remote_versions["fractal_server"]
1369
- local_fractal_server = local_versions["fractal_server"]
1370
- if remote_fractal_server != local_fractal_server:
1371
- error_msg = (
1372
- "Fractal-server version mismatch.\n"
1373
- "Local interpreter: "
1374
- f"({sys.executable}): {local_versions}.\n"
1375
- "Remote interpreter: "
1376
- f"({self.python_remote}): {remote_versions}."
1377
- )
1378
- logger.error(error_msg)
1379
- raise ValueError(error_msg)
1380
-
1381
- t_end_handshake = time.perf_counter()
1382
- logger.info(
1383
- "[FractalSlurmSSHExecutor.ssh_handshake] END"
1384
- f" - elapsed: {t_end_handshake - t_start_handshake:.3f} s"
1385
- )
1386
- return remote_versions
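
The removed `map` method submits one SLURM job per batch of task arguments and then flattens the per-batch results back into a single iterator. Below is a minimal, self-contained sketch of that batching-and-flattening pattern, using only the standard library and a thread pool as a stand-in for SLURM job submission; the `run_batch` helper and the concrete numbers are illustrative, not part of fractal-server.

import math
from concurrent.futures import ThreadPoolExecutor


def run_batch(batch: list[int]) -> list[int]:
    # Stand-in for a single SLURM job that processes one batch of components.
    return [x * x for x in batch]


def map_in_batches(components: list[int], tasks_per_job: int) -> list[int]:
    # Split the arguments into batches of `tasks_per_job` elements each,
    # mirroring the batching performed by the removed `map` method.
    batches = [
        components[ind : ind + tasks_per_job]
        for ind in range(0, len(components), tasks_per_job)
    ]
    assert len(batches) == math.ceil(len(components) / tasks_per_job)

    with ThreadPoolExecutor() as pool:
        # One future per batch (one per SLURM job in the original code).
        futures = [pool.submit(run_batch, batch) for batch in batches]

        def result_iterator():
            # Reverse and pop from the end, so results come back in
            # submission order; yield the *elements* of each batch result
            # rather than the whole list.
            futures.reverse()
            while futures:
                yield from futures.pop().result()

        return list(result_iterator())


print(map_in_batches(list(range(7)), tasks_per_job=3))  # [0, 1, 4, 9, 16, 25, 36]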
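
Similarly, the removed `_jobs_finished` helper decides which SLURM jobs are over by parsing the output of `squeue --noheader --format='%i %T'` and treating job IDs that are no longer listed as `COMPLETED`. A small sketch of just that parsing step, with the `squeue` output passed in as a plain string; the state set below is a reduced example rather than the full `STATES_FINISHED` constant.

# Reduced example; the real STATES_FINISHED constant covers more states.
FINISHED_STATES = {"COMPLETED", "CANCELLED", "FAILED", "TIMEOUT", "OUT_OF_MEMORY"}


def jobs_finished(squeue_stdout: str, job_ids: list[str]) -> set[str]:
    # Build a job-ID -> state mapping from lines like "1234 RUNNING".
    id_to_state = {
        line.split()[0]: line.split()[1]
        for line in squeue_stdout.splitlines()
        if line.strip()
    }
    # Finished jobs only stay in squeue for a short time; if an ID is
    # missing from the output, assume the job has completed.
    return {
        job_id
        for job_id in job_ids
        if id_to_state.get(job_id, "COMPLETED") in FINISHED_STATES
    }


stdout = "101 RUNNING\n102 COMPLETED\n"
print(jobs_finished(stdout, ["101", "102", "103"]))  # {'102', '103'}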
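
Finally, the removed `_prepare_sbatch_script` method assembles the submission script from an `#SBATCH` preamble plus one backgrounded `srun` line per command, closed by `wait`. A simplified sketch of that assembly; the preamble shown here is a placeholder for the lines that the real method derives from `SlurmConfig` and orders via `SlurmConfig.sort_script_lines`.

def prepare_sbatch_script(
    commands: list[str],
    mem_per_task_mb: int,
    stdout_path: str,
    stderr_path: str,
) -> str:
    # Minimal preamble; the real method builds these lines from SlurmConfig.
    lines = [
        "#!/bin/sh",
        f"#SBATCH --ntasks={len(commands)}",
        f"#SBATCH --out={stdout_path}",
        f"#SBATCH --err={stderr_path}",
        "",
    ]
    # One backgrounded srun step per command, so the tasks of a batch run
    # in parallel within the single SLURM job.
    for cmd in commands:
        lines.append(
            "srun --ntasks=1 --cpus-per-task=$SLURM_CPUS_PER_TASK "
            f"--mem={mem_per_task_mb}MB {cmd} &"
        )
    # Wait for all backgrounded steps before the job exits.
    lines.append("wait\n")
    return "\n".join(lines)


print(prepare_sbatch_script(["python3 task.py --index 0"], 4000, "job.out", "job.err"))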