fractal-server 2.13.1__py3-none-any.whl → 2.14.0a0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59)
  1. fractal_server/__init__.py +1 -1
  2. fractal_server/app/history/__init__.py +4 -0
  3. fractal_server/app/history/image_updates.py +142 -0
  4. fractal_server/app/history/status_enum.py +16 -0
  5. fractal_server/app/models/v2/__init__.py +5 -1
  6. fractal_server/app/models/v2/history.py +53 -0
  7. fractal_server/app/routes/api/v2/__init__.py +2 -2
  8. fractal_server/app/routes/api/v2/_aux_functions.py +78 -0
  9. fractal_server/app/routes/api/v2/dataset.py +12 -9
  10. fractal_server/app/routes/api/v2/history.py +247 -0
  11. fractal_server/app/routes/api/v2/workflow.py +18 -3
  12. fractal_server/app/routes/api/v2/workflowtask.py +22 -0
  13. fractal_server/app/runner/executors/base_runner.py +114 -0
  14. fractal_server/app/runner/{v2/_local → executors/local}/_local_config.py +3 -3
  15. fractal_server/app/runner/executors/local/_submit_setup.py +54 -0
  16. fractal_server/app/runner/executors/local/runner.py +200 -0
  17. fractal_server/app/runner/executors/{slurm → slurm_common}/_batching.py +1 -1
  18. fractal_server/app/runner/executors/{slurm → slurm_common}/_slurm_config.py +3 -3
  19. fractal_server/app/runner/{v2/_slurm_ssh → executors/slurm_common}/_submit_setup.py +13 -12
  20. fractal_server/app/runner/{v2/_slurm_common → executors/slurm_common}/get_slurm_config.py +9 -15
  21. fractal_server/app/runner/executors/{slurm/ssh → slurm_ssh}/_executor_wait_thread.py +1 -1
  22. fractal_server/app/runner/executors/{slurm/ssh → slurm_ssh}/_slurm_job.py +1 -1
  23. fractal_server/app/runner/executors/{slurm/ssh → slurm_ssh}/executor.py +13 -14
  24. fractal_server/app/runner/executors/{slurm/sudo → slurm_sudo}/_check_jobs_status.py +11 -9
  25. fractal_server/app/runner/executors/{slurm/sudo → slurm_sudo}/_executor_wait_thread.py +3 -3
  26. fractal_server/app/runner/executors/{slurm/sudo → slurm_sudo}/_subprocess_run_as_user.py +2 -68
  27. fractal_server/app/runner/executors/slurm_sudo/runner.py +632 -0
  28. fractal_server/app/runner/task_files.py +70 -96
  29. fractal_server/app/runner/v2/__init__.py +5 -19
  30. fractal_server/app/runner/v2/_local.py +84 -0
  31. fractal_server/app/runner/v2/{_slurm_ssh/__init__.py → _slurm_ssh.py} +10 -13
  32. fractal_server/app/runner/v2/{_slurm_sudo/__init__.py → _slurm_sudo.py} +10 -12
  33. fractal_server/app/runner/v2/runner.py +93 -28
  34. fractal_server/app/runner/v2/runner_functions.py +85 -62
  35. fractal_server/app/runner/v2/runner_functions_low_level.py +20 -20
  36. fractal_server/app/schemas/v2/dataset.py +0 -17
  37. fractal_server/app/schemas/v2/history.py +23 -0
  38. fractal_server/config.py +2 -2
  39. fractal_server/migrations/versions/8223fcef886c_image_status.py +63 -0
  40. fractal_server/migrations/versions/87cd72a537a2_add_historyitem_table.py +68 -0
  41. {fractal_server-2.13.1.dist-info → fractal_server-2.14.0a0.dist-info}/METADATA +1 -1
  42. {fractal_server-2.13.1.dist-info → fractal_server-2.14.0a0.dist-info}/RECORD +52 -46
  43. fractal_server/app/routes/api/v2/status.py +0 -168
  44. fractal_server/app/runner/executors/slurm/sudo/executor.py +0 -1281
  45. fractal_server/app/runner/v2/_local/__init__.py +0 -132
  46. fractal_server/app/runner/v2/_local/_submit_setup.py +0 -52
  47. fractal_server/app/runner/v2/_local/executor.py +0 -100
  48. fractal_server/app/runner/v2/_slurm_sudo/_submit_setup.py +0 -83
  49. fractal_server/app/runner/v2/handle_failed_job.py +0 -59
  50. /fractal_server/app/runner/executors/{slurm → local}/__init__.py +0 -0
  51. /fractal_server/app/runner/executors/{slurm/ssh → slurm_common}/__init__.py +0 -0
  52. /fractal_server/app/runner/executors/{_job_states.py → slurm_common/_job_states.py} +0 -0
  53. /fractal_server/app/runner/executors/{slurm → slurm_common}/remote.py +0 -0
  54. /fractal_server/app/runner/executors/{slurm → slurm_common}/utils_executors.py +0 -0
  55. /fractal_server/app/runner/executors/{slurm/sudo → slurm_ssh}/__init__.py +0 -0
  56. /fractal_server/app/runner/{v2/_slurm_common → executors/slurm_sudo}/__init__.py +0 -0
  57. {fractal_server-2.13.1.dist-info → fractal_server-2.14.0a0.dist-info}/LICENSE +0 -0
  58. {fractal_server-2.13.1.dist-info → fractal_server-2.14.0a0.dist-info}/WHEEL +0 -0
  59. {fractal_server-2.13.1.dist-info → fractal_server-2.14.0a0.dist-info}/entry_points.txt +0 -0
fractal_server/app/runner/executors/slurm/sudo/executor.py (deleted)
@@ -1,1281 +0,0 @@
1
- import json
2
- import math
3
- import shlex
4
- import subprocess # nosec
5
- import sys
6
- import threading
7
- import time
8
- import uuid
9
- from concurrent.futures import Executor
10
- from concurrent.futures import Future
11
- from concurrent.futures import InvalidStateError
12
- from copy import copy
13
- from pathlib import Path
14
- from subprocess import CompletedProcess # nosec
15
- from typing import Any
16
- from typing import Callable
17
- from typing import Optional
18
- from typing import Sequence
19
-
20
- import cloudpickle
21
-
22
- from ......config import get_settings
23
- from ......logger import set_logger
24
- from ......syringe import Inject
25
- from ....exceptions import JobExecutionError
26
- from ....exceptions import TaskExecutionError
27
- from ....filenames import SHUTDOWN_FILENAME
28
- from ....task_files import get_task_file_paths
29
- from ....task_files import TaskFiles
30
- from ...slurm._slurm_config import SlurmConfig
31
- from .._batching import heuristics
32
- from ..utils_executors import get_pickle_file_path
33
- from ..utils_executors import get_slurm_file_path
34
- from ..utils_executors import get_slurm_script_file_path
35
- from ._executor_wait_thread import FractalSlurmSudoWaitThread
36
- from ._subprocess_run_as_user import _glob_as_user
37
- from ._subprocess_run_as_user import _glob_as_user_strict
38
- from ._subprocess_run_as_user import _path_exists_as_user
39
- from ._subprocess_run_as_user import _run_command_as_user
40
- from fractal_server import __VERSION__
41
- from fractal_server.app.runner.components import _COMPONENT_KEY_
42
- from fractal_server.string_tools import validate_cmd
43
-
44
-
45
- logger = set_logger(__name__)
46
-
47
-
48
- def _subprocess_run_or_raise(full_command: str) -> Optional[CompletedProcess]:
49
- """
50
- Wrap `subprocess.run` and raise appropriate `JobExecutionError` if needed.
51
-
52
- Args:
53
- full_command: Full string of the command to execute.
54
-
55
- Raises:
56
- JobExecutionError: If `subprocess.run` raises a `CalledProcessError`.
57
-
58
- Returns:
59
- The actual `CompletedProcess` output of `subprocess.run`.
60
- """
61
- validate_cmd(full_command)
62
- try:
63
- output = subprocess.run( # nosec
64
- shlex.split(full_command),
65
- capture_output=True,
66
- check=True,
67
- encoding="utf-8",
68
- )
69
- return output
70
- except subprocess.CalledProcessError as e:
71
- error_msg = (
72
- f"Submit command `{full_command}` failed. "
73
- f"Original error:\n{str(e)}\n"
74
- f"Original stdout:\n{e.stdout}\n"
75
- f"Original stderr:\n{e.stderr}\n"
76
- )
77
- logger.error(error_msg)
78
- raise JobExecutionError(info=error_msg)
79
-
80
-
81
- class SlurmJob:
82
- """
83
- Collect information related to a FractalSlurmExecutor job
84
-
85
- This includes three groups of attributes:
86
-
87
- 1. Attributes related to the (possibly multi-task) SLURM job, e.g.
88
- submission-file path.
89
- 2. Attributes related to single tasks, e.g. the paths of their input/output
90
- pickle files.
91
- 3. SLURM configuration options, encoded in a SlurmConfig object.
92
-
93
- Note: A SlurmJob object is generally defined as a multi-task job. Jobs
94
- coming from the `map` method must have `single_task_submission=False` (even
95
- if `num_tasks_tot=1`), while jobs coming from `submit` must have it set to
96
- `True`.
97
-
98
- Attributes:
99
- num_tasks_tot:
100
- Total number of tasks to be executed as part of this SLURM job.
101
- single_task_submission:
102
- This must be `True` for jobs submitted as part of the `submit`
103
- method, and `False` for jobs coming from the `map` method.
104
- slurm_file_prefix:
105
- Prefix for SLURM-job related files (submission script and SLURM
106
- stdout/stderr); this is also needed in the
107
- `_copy_files_from_remote_to_local` method.
108
- wftask_file_prefixes:
109
- Prefix for files that are created as part of the functions
110
- submitted for execution on the `FractalSlurmExecutor`; this is
111
- needed in the `_copy_files_from_remote_to_local` method, and also
112
- to construct the names of per-task input/output pickle files.
113
- wftask_subfolder_name:
114
- Name of the per-task subfolder (e.g. `7_task_name`).
115
- slurm_script:
116
- Path of SLURM submission script.
117
- slurm_stdout:
118
- Path of SLURM stdout file; if this includes `"%j"`, then this
119
- string will be replaced by the SLURM job ID upon `sbatch`
120
- submission.
121
- slurm_stderr:
122
- Path of SLURM stderr file; see `slurm_stdout` concerning `"%j"`.
123
- workerids:
124
- IDs that enter in the per-task input/output pickle files (one per
125
- task).
126
- input_pickle_files:
127
- Input pickle files (one per task).
128
- output_pickle_files:
129
- Output pickle files (one per task).
130
- slurm_config:
131
- `SlurmConfig` object.
132
- """
133
-
134
- # Job-related attributes
135
- num_tasks_tot: int
136
- single_task_submission: bool
137
- slurm_file_prefix: str
138
- slurm_script: Path
139
- slurm_stdout: Path
140
- slurm_stderr: Path
141
- # Per-task attributes
142
- workerids: tuple[str, ...]
143
- wftask_file_prefixes: tuple[str, ...]
144
- wftask_subfolder_name: str
145
- input_pickle_files: tuple[Path, ...]
146
- output_pickle_files: tuple[Path, ...]
147
- # Slurm configuration
148
- slurm_config: SlurmConfig
149
-
150
- def __init__(
151
- self,
152
- num_tasks_tot: int,
153
- slurm_config: SlurmConfig,
154
- slurm_file_prefix: Optional[str] = None,
155
- wftask_file_prefixes: Optional[tuple[str, ...]] = None,
156
- single_task_submission: bool = False,
157
- ):
158
- if single_task_submission and num_tasks_tot > 1:
159
- raise ValueError(
160
- "Trying to initialize SlurmJob with"
161
- f"{single_task_submission=} and {num_tasks_tot=}."
162
- )
163
- self.num_tasks_tot = num_tasks_tot
164
- self.single_task_submission = single_task_submission
165
- self.slurm_file_prefix = slurm_file_prefix or "default_slurm_prefix"
166
- if wftask_file_prefixes is None:
167
- self.wftask_file_prefixes = tuple(
168
- "default_wftask_prefix" for i in range(self.num_tasks_tot)
169
- )
170
- else:
171
- self.wftask_file_prefixes = wftask_file_prefixes
172
- self.workerids = tuple(uuid.uuid4() for i in range(self.num_tasks_tot))
173
- self.slurm_config = slurm_config
174
-
175
- def get_clean_output_pickle_files(self) -> tuple[str, ...]:
176
- """
177
- Transform all pathlib.Path objects in self.output_pickle_files to
178
- strings
179
- """
180
- return tuple(str(f.as_posix()) for f in self.output_pickle_files)
181
-
182
-
183
- class FractalSlurmSudoExecutor(Executor):
184
- """
185
- Executor to submit SLURM jobs as a different user, via `sudo -u`
186
-
187
- This class is a custom re-implementation of the SLURM executor from
188
-
189
- > clusterfutures <https://github.com/sampsyo/clusterfutures>
190
- > Original Copyright
191
- > Copyright 2021 Adrian Sampson <asampson@cs.washington.edu>
192
- > License: MIT
193
-
194
-
195
- Attributes:
196
- slurm_user:
197
- Shell username that runs the `sbatch` command.
198
- common_script_lines:
199
- Arbitrary script lines that will always be included in the
200
- sbatch script
201
- workflow_dir_local:
202
- Directory for both the cfut/SLURM and fractal-server files and logs
203
- workflow_dir_remote:
204
- Directory for both the cfut/SLURM and fractal-server files and logs
205
- map_jobid_to_slurm_files:
206
- Dictionary with paths of slurm-related files for active jobs
207
- """
208
-
209
- wait_thread_cls = FractalSlurmSudoWaitThread
210
- slurm_user: str
211
- shutdown_file: str
212
- common_script_lines: list[str]
213
- user_cache_dir: str
214
- workflow_dir_local: Path
215
- workflow_dir_remote: Path
216
- map_jobid_to_slurm_files: dict[str, tuple[str, str, str]]
217
- slurm_account: Optional[str] = None
218
- jobs: dict[str, tuple[Future, SlurmJob]]
219
-
220
- def __init__(
221
- self,
222
- slurm_user: str,
223
- workflow_dir_local: Path,
224
- workflow_dir_remote: Path,
225
- shutdown_file: Optional[str] = None,
226
- user_cache_dir: Optional[str] = None,
227
- common_script_lines: Optional[list[str]] = None,
228
- slurm_poll_interval: Optional[int] = None,
229
- slurm_account: Optional[str] = None,
230
- *args,
231
- **kwargs,
232
- ):
233
- """
234
- Init method for FractalSlurmExecutor
235
- """
236
-
237
- if not slurm_user:
238
- raise RuntimeError(
239
- "Missing attribute FractalSlurmExecutor.slurm_user"
240
- )
241
-
242
- self.jobs = {}
243
- self.job_outfiles = {}
244
- self.jobs_lock = threading.Lock()
245
- self.jobs_empty_cond = threading.Condition(self.jobs_lock)
246
-
247
- self.wait_thread = self.wait_thread_cls(self._completion)
248
- self.wait_thread.start()
249
-
250
- # Assign `wait_thread.shutdown_callback` early, since it may be called
251
- # from within `_stop_and_join_wait_thread` (e.g. if an exception is
252
- # raised within `__init__`).
253
- self.wait_thread.shutdown_callback = self.shutdown
254
-
255
- self.slurm_user = slurm_user
256
- self.slurm_account = slurm_account
257
-
258
- self.common_script_lines = common_script_lines or []
259
- settings = Inject(get_settings)
260
-
261
- if settings.FRACTAL_SLURM_WORKER_PYTHON is not None:
262
- try:
263
- self.check_remote_python_interpreter()
264
- except Exception as e:
265
- self._stop_and_join_wait_thread()
266
- raise RuntimeError(f"Original error {str(e)}")
267
-
268
- # Check that SLURM account is not set here
269
- try:
270
- invalid_line = next(
271
- line
272
- for line in self.common_script_lines
273
- if line.startswith("#SBATCH --account=")
274
- )
275
- self._stop_and_join_wait_thread()
276
- raise RuntimeError(
277
- "Invalid line in `FractalSlurmExecutor.common_script_lines`: "
278
- f"'{invalid_line}'.\n"
279
- "SLURM account must be set via the request body of the "
280
- "apply-workflow endpoint, or by modifying the user properties."
281
- )
282
- except StopIteration:
283
- pass
284
-
285
- self.workflow_dir_local = workflow_dir_local
286
- if not _path_exists_as_user(
287
- path=str(workflow_dir_remote), user=self.slurm_user
288
- ):
289
- logger.info(f"Missing folder {workflow_dir_remote=}")
290
- self.user_cache_dir = user_cache_dir
291
-
292
- self.workflow_dir_remote = workflow_dir_remote
293
- self.map_jobid_to_slurm_files = {}
294
-
295
- # Set the attribute slurm_poll_interval for self.wait_thread (see
296
- # cfut.SlurmWaitThread)
297
- if not slurm_poll_interval:
298
- slurm_poll_interval = settings.FRACTAL_SLURM_POLL_INTERVAL
299
- self.wait_thread.slurm_poll_interval = slurm_poll_interval
300
- self.wait_thread.slurm_user = self.slurm_user
301
-
302
- self.wait_thread.shutdown_file = (
303
- shutdown_file
304
- or (self.workflow_dir_local / SHUTDOWN_FILENAME).as_posix()
305
- )
306
-
307
- def _cleanup(self, jobid: str) -> None:
308
- """
309
- Given a job ID as returned by _start, perform any necessary
310
- cleanup after the job has finished.
311
- """
312
- with self.jobs_lock:
313
- self.map_jobid_to_slurm_files.pop(jobid)
314
-
315
- def submit(
316
- self,
317
- fun: Callable[..., Any],
318
- *fun_args: Sequence[Any],
319
- slurm_config: SlurmConfig,
320
- task_files: TaskFiles,
321
- **fun_kwargs: dict,
322
- ) -> Future:
323
- """
324
- Submit a function for execution on `FractalSlurmExecutor`
325
-
326
- Arguments:
327
- fun: The function to be executed
328
- fun_args: Function positional arguments
329
- fun_kwargs: Function keyword arguments
330
- slurm_config:
331
- A `SlurmConfig` object.
332
- task_files:
333
- A `TaskFiles` object.
334
-
335
- Returns:
336
- Future representing the execution of the current SLURM job.
337
- """
338
-
339
- # Do not continue if auxiliary thread was shut down
340
- if self.wait_thread.shutdown:
341
- error_msg = "Cannot call `submit` method after executor shutdown"
342
- logger.warning(error_msg)
343
- raise JobExecutionError(info=error_msg)
344
-
345
- # Set slurm_file_prefix
346
- slurm_file_prefix = task_files.file_prefix
347
-
348
- # Include common_script_lines in extra_lines
349
- logger.debug(
350
- f"Adding {self.common_script_lines=} to "
351
- f"{slurm_config.extra_lines=}, from submit method."
352
- )
353
- current_extra_lines = slurm_config.extra_lines or []
354
- slurm_config.extra_lines = (
355
- current_extra_lines + self.common_script_lines
356
- )
357
-
358
- # Adapt slurm_config to the fact that this is a single-task SlurmJob
359
- # instance
360
- slurm_config.tasks_per_job = 1
361
- slurm_config.parallel_tasks_per_job = 1
362
-
363
- fut = self._submit_job(
364
- fun,
365
- slurm_config=slurm_config,
366
- slurm_file_prefix=slurm_file_prefix,
367
- task_files=task_files,
368
- single_task_submission=True,
369
- args=fun_args,
370
- kwargs=fun_kwargs,
371
- )
372
- return fut
373
-
374
- def map(
375
- self,
376
- fn: Callable[..., Any],
377
- iterable: list[Sequence[Any]],
378
- *,
379
- slurm_config: SlurmConfig,
380
- task_files: TaskFiles,
381
- ):
382
- """
383
- Return an iterator with the results of several execution of a function
384
-
385
- This function is based on `concurrent.futures.Executor.map` from Python
386
- Standard Library 3.11.
387
- Original Copyright 2009 Brian Quinlan. All Rights Reserved. Licensed to
388
- PSF under a Contributor Agreement.
389
-
390
- Main modifications from the PSF function:
391
-
392
- 1. Only `fn` and `iterable` can be assigned as positional arguments;
393
- 2. `*iterables` argument replaced with a single `iterable`;
394
- 3. `timeout` and `chunksize` arguments are not supported.
395
-
396
- Arguments:
397
- fn:
398
- The function to be executed
399
- iterable:
400
- An iterable such that each element is the list of arguments to
401
- be passed to `fn`, as in `fn(*args)`.
402
- slurm_config:
403
- A `SlurmConfig` object.
404
- task_files:
405
- A `TaskFiles` object.
406
-
407
- """
408
-
409
- # Do not continue if auxiliary thread was shut down
410
- if self.wait_thread.shutdown:
411
- error_msg = "Cannot call `map` method after executor shutdown"
412
- logger.warning(error_msg)
413
- raise JobExecutionError(info=error_msg)
414
-
415
- def _result_or_cancel(fut):
416
- """
417
- This function is based on the Python Standard Library 3.11.
418
- Original Copyright 2009 Brian Quinlan. All Rights Reserved.
419
- Licensed to PSF under a Contributor Agreement.
420
- """
421
- try:
422
- try:
423
- return fut.result()
424
- finally:
425
- fut.cancel()
426
- finally:
427
- # Break a reference cycle with the exception in
428
- # self._exception
429
- del fut
430
-
431
- # Include common_script_lines in extra_lines
432
- logger.debug(
433
- f"Adding {self.common_script_lines=} to "
434
- f"{slurm_config.extra_lines=}, from map method."
435
- )
436
- current_extra_lines = slurm_config.extra_lines or []
437
- slurm_config.extra_lines = (
438
- current_extra_lines + self.common_script_lines
439
- )
440
-
441
- # Set file prefixes
442
- general_slurm_file_prefix = str(task_files.task_order)
443
-
444
- # Transform iterable into a list and count its elements
445
- list_args = list(iterable)
446
- tot_tasks = len(list_args)
447
-
448
- # Set/validate parameters for task batching
449
- tasks_per_job, parallel_tasks_per_job = heuristics(
450
- # Number of parallel components (always known)
451
- tot_tasks=len(list_args),
452
- # Optional WorkflowTask attributes:
453
- tasks_per_job=slurm_config.tasks_per_job,
454
- parallel_tasks_per_job=slurm_config.parallel_tasks_per_job, # noqa
455
- # Task requirements (multiple possible sources):
456
- cpus_per_task=slurm_config.cpus_per_task,
457
- mem_per_task=slurm_config.mem_per_task_MB,
458
- # Fractal configuration variables (soft/hard limits):
459
- target_cpus_per_job=slurm_config.target_cpus_per_job,
460
- target_mem_per_job=slurm_config.target_mem_per_job,
461
- target_num_jobs=slurm_config.target_num_jobs,
462
- max_cpus_per_job=slurm_config.max_cpus_per_job,
463
- max_mem_per_job=slurm_config.max_mem_per_job,
464
- max_num_jobs=slurm_config.max_num_jobs,
465
- )
466
- slurm_config.parallel_tasks_per_job = parallel_tasks_per_job
467
- slurm_config.tasks_per_job = tasks_per_job
468
-
469
- # Divide arguments in batches of `n_tasks_per_script` tasks each
470
- args_batches = []
471
- batch_size = tasks_per_job
472
- for ind_chunk in range(0, tot_tasks, batch_size):
473
- args_batches.append(
474
- list_args[ind_chunk : ind_chunk + batch_size] # noqa
475
- )
476
- if len(args_batches) != math.ceil(tot_tasks / tasks_per_job):
477
- raise RuntimeError("Something wrong here while batching tasks")
478
-
479
- # Fetch configuration variable
480
- settings = Inject(get_settings)
481
- FRACTAL_SLURM_SBATCH_SLEEP = settings.FRACTAL_SLURM_SBATCH_SLEEP
482
-
483
- # Construct list of futures (one per SLURM job, i.e. one per batch)
484
- fs = []
485
- current_component_index = 0
486
- for ind_batch, batch in enumerate(args_batches):
487
- batch_size = len(batch)
488
- this_slurm_file_prefix = (
489
- f"{general_slurm_file_prefix}_batch_{ind_batch:06d}"
490
- )
491
- fs.append(
492
- self._submit_job(
493
- fn,
494
- slurm_config=slurm_config,
495
- slurm_file_prefix=this_slurm_file_prefix,
496
- task_files=task_files,
497
- single_task_submission=False,
498
- components=batch,
499
- )
500
- )
501
- current_component_index += batch_size
502
- time.sleep(FRACTAL_SLURM_SBATCH_SLEEP)
503
-
504
- # Yield must be hidden in closure so that the futures are submitted
505
- # before the first iterator value is required.
506
- # NOTE: In this custom map() method, _result_or_cancel(fs.pop()) is an
507
- # iterable of results (if successful), and we should yield its elements
508
- # rather than the whole iterable.
509
- def result_iterator():
510
- """
511
- This function is based on the Python Standard Library 3.11.
512
- Original Copyright 2009 Brian Quinlan. All Rights Reserved.
513
- Licensed to PSF under a Contributor Agreement.
514
- """
515
- try:
516
- # reverse to keep finishing order
517
- fs.reverse()
518
- while fs:
519
- # Careful not to keep a reference to the popped future
520
- results = _result_or_cancel(fs.pop())
521
- for res in results:
522
- yield res
523
- finally:
524
- for future in fs:
525
- future.cancel()
526
-
527
- return result_iterator()
528
-
529
- def _submit_job(
530
- self,
531
- fun: Callable[..., Any],
532
- slurm_file_prefix: str,
533
- task_files: TaskFiles,
534
- slurm_config: SlurmConfig,
535
- single_task_submission: bool = False,
536
- args: Optional[Sequence[Any]] = None,
537
- kwargs: Optional[dict] = None,
538
- components: Optional[list[Any]] = None,
539
- ) -> Future:
540
- """
541
- Submit a multi-task job to the pool, where each task is handled via the
542
- pickle/remote logic
543
-
544
- NOTE: this method has different behaviors when it is called from the
545
- `self.submit` or `self.map` methods (which is also encoded in
546
- `single_task_submission`):
547
-
548
- * When called from `self.submit`, it supports general `args` and
549
- `kwargs` arguments;
550
- * When called from `self.map`, there cannot be any `args` or `kwargs`
551
- argument, but there must be a `components` argument.
552
-
553
- Arguments:
554
- fun:
555
- slurm_file_prefix:
556
- task_files:
557
- slurm_config:
558
- single_task_submission:
559
- args:
560
- kwargs:
561
- components:
562
-
563
- Returns:
564
- Future representing the execution of the current SLURM job.
565
- """
566
-
567
- # Prevent calling sbatch if auxiliary thread was shut down
568
- if self.wait_thread.shutdown:
569
- error_msg = (
570
- "Cannot call `_submit_job` method after executor shutdown"
571
- )
572
- logger.warning(error_msg)
573
- raise JobExecutionError(info=error_msg)
574
-
575
- fut: Future = Future()
576
-
577
- # Inject SLURM account (if set) into slurm_config
578
- if self.slurm_account:
579
- slurm_config.account = self.slurm_account
580
-
581
- # Define slurm-job-related files
582
- if single_task_submission:
583
- if components is not None:
584
- raise ValueError(
585
- f"{single_task_submission=} but components is not None"
586
- )
587
- job = SlurmJob(
588
- slurm_file_prefix=slurm_file_prefix,
589
- num_tasks_tot=1,
590
- slurm_config=slurm_config,
591
- )
592
- if job.num_tasks_tot > 1:
593
- raise ValueError(
594
- "{single_task_submission=} but {job.num_tasks_tot=}"
595
- )
596
- job.single_task_submission = True
597
- job.wftask_file_prefixes = (task_files.file_prefix,)
598
- job.wftask_subfolder_name = task_files.subfolder_name
599
-
600
- else:
601
- if not components or len(components) < 1:
602
- raise ValueError(
603
- "In FractalSlurmExecutor._submit_job, given "
604
- f"{components=}."
605
- )
606
- num_tasks_tot = len(components)
607
- job = SlurmJob(
608
- slurm_file_prefix=slurm_file_prefix,
609
- num_tasks_tot=num_tasks_tot,
610
- slurm_config=slurm_config,
611
- )
612
-
613
- _prefixes = []
614
- _subfolder_names = []
615
- for component in components:
616
- # In Fractal, `component` is a `dict` by construction (e.g.
617
- # `component = {"zarr_url": "/something", "param": 1}``). The
618
- # try/except covers the case of e.g. `executor.map([1, 2])`,
619
- # which is useful for testing.
620
- try:
621
- actual_component = component.get(_COMPONENT_KEY_, None)
622
- except AttributeError:
623
- actual_component = str(component)
624
- _task_file_paths = get_task_file_paths(
625
- workflow_dir_local=task_files.workflow_dir_local,
626
- workflow_dir_remote=task_files.workflow_dir_remote,
627
- task_name=task_files.task_name,
628
- task_order=task_files.task_order,
629
- component=actual_component,
630
- )
631
- _prefixes.append(_task_file_paths.file_prefix)
632
- _subfolder_names.append(_task_file_paths.subfolder_name)
633
- job.wftask_file_prefixes = tuple(_prefixes)
634
-
635
- num_subfolders = len(set(_subfolder_names))
636
- if num_subfolders != 1:
637
- error_msg_short = (
638
- f"[_submit_job] Subfolder list has {num_subfolders} "
639
- "different values, but it must have only one (since "
640
- "workflow tasks are executed one by one)."
641
- )
642
- error_msg_detail = (
643
- "[_submit_job] Current unique subfolder names: "
644
- f"{set(_subfolder_names)}"
645
- )
646
- logger.error(error_msg_short)
647
- logger.error(error_msg_detail)
648
- raise ValueError(error_msg_short)
649
- job.wftask_subfolder_name = _subfolder_names[0]
650
-
651
- # Check that server-side subfolder exists
652
- subfolder_path = self.workflow_dir_local / job.wftask_subfolder_name
653
- if not subfolder_path.exists():
654
- raise FileNotFoundError(
655
- f"Missing folder {subfolder_path.as_posix()}."
656
- )
657
-
658
- job.input_pickle_files = tuple(
659
- get_pickle_file_path(
660
- arg=job.workerids[ind],
661
- workflow_dir=self.workflow_dir_local,
662
- subfolder_name=job.wftask_subfolder_name,
663
- in_or_out="in",
664
- prefix=job.wftask_file_prefixes[ind],
665
- )
666
- for ind in range(job.num_tasks_tot)
667
- )
668
- job.output_pickle_files = tuple(
669
- get_pickle_file_path(
670
- arg=job.workerids[ind],
671
- workflow_dir=self.workflow_dir_remote,
672
- subfolder_name=job.wftask_subfolder_name,
673
- in_or_out="out",
674
- prefix=job.wftask_file_prefixes[ind],
675
- )
676
- for ind in range(job.num_tasks_tot)
677
- )
678
- # Define SLURM-job file names/paths
679
- job.slurm_script = get_slurm_script_file_path(
680
- workflow_dir=self.workflow_dir_local,
681
- subfolder_name=job.wftask_subfolder_name,
682
- prefix=job.slurm_file_prefix,
683
- )
684
- job.slurm_stdout = get_slurm_file_path(
685
- workflow_dir=self.workflow_dir_remote,
686
- subfolder_name=job.wftask_subfolder_name,
687
- out_or_err="out",
688
- prefix=job.slurm_file_prefix,
689
- )
690
- job.slurm_stderr = get_slurm_file_path(
691
- workflow_dir=self.workflow_dir_remote,
692
- subfolder_name=job.wftask_subfolder_name,
693
- out_or_err="err",
694
- prefix=job.slurm_file_prefix,
695
- )
696
-
697
- # Dump serialized versions+function+args+kwargs to pickle
698
- versions = dict(
699
- python=sys.version_info[:3],
700
- cloudpickle=cloudpickle.__version__,
701
- fractal_server=__VERSION__,
702
- )
703
- if job.single_task_submission:
704
- _args = args or []
705
- _kwargs = kwargs or {}
706
- funcser = cloudpickle.dumps((versions, fun, _args, _kwargs))
707
- with open(job.input_pickle_files[0], "wb") as f:
708
- f.write(funcser)
709
- else:
710
- for ind_component, component in enumerate(components):
711
- _args = [component]
712
- _kwargs = {}
713
- funcser = cloudpickle.dumps((versions, fun, _args, _kwargs))
714
- with open(job.input_pickle_files[ind_component], "wb") as f:
715
- f.write(funcser)
716
-
717
- # Submit job to SLURM, and get jobid
718
- jobid, job = self._start(job)
719
-
720
- # Add the SLURM script/out/err paths to map_jobid_to_slurm_files (this
721
- # must be after self._start(job), so that "%j" has already been
722
- # replaced with the job ID)
723
- with self.jobs_lock:
724
- self.map_jobid_to_slurm_files[jobid] = (
725
- job.slurm_script.as_posix(),
726
- job.slurm_stdout.as_posix(),
727
- job.slurm_stderr.as_posix(),
728
- )
729
-
730
- # Thread will wait for it to finish.
731
- self.wait_thread.wait(
732
- filenames=job.get_clean_output_pickle_files(),
733
- jobid=jobid,
734
- )
735
-
736
- with self.jobs_lock:
737
- self.jobs[jobid] = (fut, job)
738
- return fut
739
-
740
- def _prepare_JobExecutionError(
741
- self, jobid: str, info: str
742
- ) -> JobExecutionError:
743
- """
744
- Prepare the `JobExecutionError` for a given job
745
-
746
- This method creates a `JobExecutionError` object and sets its attribute
747
- to the appropriate SLURM-related file names. Note that the method
748
- should always be called after values in `self.map_jobid_to_slurm_files`
749
- have been updated, so that they point to `self.workflow_dir_local`
750
- files which are readable from `fractal-server`.
751
-
752
- Arguments:
753
- jobid:
754
- ID of the SLURM job.
755
- info:
756
- """
757
- # Extract SLURM file paths
758
- with self.jobs_lock:
759
- (
760
- slurm_script_file,
761
- slurm_stdout_file,
762
- slurm_stderr_file,
763
- ) = self.map_jobid_to_slurm_files[jobid]
764
- # Construct JobExecutionError exception
765
- job_exc = JobExecutionError(
766
- cmd_file=slurm_script_file,
767
- stdout_file=slurm_stdout_file,
768
- stderr_file=slurm_stderr_file,
769
- info=info,
770
- )
771
- return job_exc
772
-
773
- def _completion(self, jobid: str) -> None:
774
- """
775
- Callback function to be executed whenever a job finishes.
776
-
777
- This function is executed by self.wait_thread (triggered by either
778
- finding an existing output pickle file `out_path` or finding that the
779
- SLURM job is over). Since this takes place on a different thread,
780
- failures may not be captured by the main thread; we use a broad
781
- try/except block, so that those exceptions are reported to the main
782
- thread via `fut.set_exception(...)`.
783
-
784
- Arguments:
785
- jobid: ID of the SLURM job
786
- """
787
- # Handle all uncaught exceptions in this broad try/except block
788
- try:
789
- # Retrieve job
790
- with self.jobs_lock:
791
- try:
792
- fut, job = self.jobs.pop(jobid)
793
- except KeyError:
794
- return
795
- if not self.jobs:
796
- self.jobs_empty_cond.notify_all()
797
-
798
- # Copy all relevant files from self.workflow_dir_remote to
799
- # self.workflow_dir_local
800
-
801
- self._copy_files_from_remote_to_local(job)
802
-
803
- # Update the paths to use the files in self.workflow_dir_local
804
- # (rather than the user's ones in self.workflow_dir_remote)
805
- with self.jobs_lock:
806
- self.map_jobid_to_slurm_files[jobid]
807
- (
808
- slurm_script_file,
809
- slurm_stdout_file,
810
- slurm_stderr_file,
811
- ) = self.map_jobid_to_slurm_files[jobid]
812
- new_slurm_stdout_file = str(
813
- self.workflow_dir_local
814
- / job.wftask_subfolder_name
815
- / Path(slurm_stdout_file).name
816
- )
817
- new_slurm_stderr_file = str(
818
- self.workflow_dir_local
819
- / job.wftask_subfolder_name
820
- / Path(slurm_stderr_file).name
821
- )
822
- with self.jobs_lock:
823
- self.map_jobid_to_slurm_files[jobid] = (
824
- slurm_script_file,
825
- new_slurm_stdout_file,
826
- new_slurm_stderr_file,
827
- )
828
-
829
- in_paths = job.input_pickle_files
830
- out_paths = tuple(
831
- (self.workflow_dir_local / job.wftask_subfolder_name / f.name)
832
- for f in job.output_pickle_files
833
- )
834
-
835
- outputs = []
836
- for ind_out_path, out_path in enumerate(out_paths):
837
- in_path = in_paths[ind_out_path]
838
-
839
- # The output pickle file may be missing because of some slow
840
- # filesystem operation; wait some time before considering it as
841
- # missing
842
- if not out_path.exists():
843
- settings = Inject(get_settings)
844
- time.sleep(settings.FRACTAL_SLURM_ERROR_HANDLING_INTERVAL)
845
- if not out_path.exists():
846
- # Output pickle file is missing
847
- info = (
848
- "Output pickle file of the FractalSlurmExecutor job "
849
- "not found.\n"
850
- f"Expected file path: {str(out_path)}.\n"
851
- "Here are some possible reasons:\n"
852
- "1. The SLURM job was scancel-ed, either by the user "
853
- "or due to an error (e.g. an out-of-memory or timeout "
854
- "error). Note that if the scancel took place before "
855
- "the job started running, the SLURM out/err files "
856
- "will be empty.\n"
857
- "2. Some error occurred upon writing the file to disk "
858
- "(e.g. because there is not enough space on disk, or "
859
- "due to an overloaded NFS filesystem). "
860
- "Note that the server configuration has "
861
- "FRACTAL_SLURM_ERROR_HANDLING_INTERVAL="
862
- f"{settings.FRACTAL_SLURM_ERROR_HANDLING_INTERVAL} "
863
- "seconds.\n"
864
- )
865
- job_exc = self._prepare_JobExecutionError(jobid, info=info)
866
- try:
867
- fut.set_exception(job_exc)
868
- return
869
- except InvalidStateError:
870
- logger.warning(
871
- f"Future {fut} (SLURM job ID: {jobid}) was already"
872
- " cancelled, exit from"
873
- " FractalSlurmExecutor._completion."
874
- )
875
- in_path.unlink()
876
- self._cleanup(jobid)
877
- return
878
-
879
- # Read the task output (note: we now know that out_path exists)
880
- with out_path.open("rb") as f:
881
- outdata = f.read()
882
- # Note: output can be either the task result (typically a
883
- # dictionary) or an ExceptionProxy object; in the latter
884
- # case, the ExceptionProxy definition is also part of the
885
- # pickle file (thanks to cloudpickle.dumps).
886
- success, output = cloudpickle.loads(outdata)
887
- try:
888
- if success:
889
- outputs.append(output)
890
- else:
891
- proxy = output
892
- if proxy.exc_type_name == "JobExecutionError":
893
- job_exc = self._prepare_JobExecutionError(
894
- jobid, info=proxy.kwargs.get("info", None)
895
- )
896
- fut.set_exception(job_exc)
897
- return
898
- else:
899
- # This branch catches both TaskExecutionError's
900
- # (coming from the typical fractal-server
901
- # execution of tasks, and with additional
902
- # fractal-specific kwargs) or arbitrary
903
- # exceptions (coming from a direct use of
904
- # FractalSlurmExecutor, possibly outside
905
- # fractal-server)
906
- kwargs = {}
907
- for key in [
908
- "workflow_task_id",
909
- "workflow_task_order",
910
- "task_name",
911
- ]:
912
- if key in proxy.kwargs.keys():
913
- kwargs[key] = proxy.kwargs[key]
914
- exc = TaskExecutionError(proxy.tb, **kwargs)
915
- fut.set_exception(exc)
916
- return
917
- out_path.unlink()
918
- except InvalidStateError:
919
- logger.warning(
920
- f"Future {fut} (SLURM job ID: {jobid}) was already"
921
- " cancelled, exit from"
922
- " FractalSlurmExecutor._completion."
923
- )
924
- out_path.unlink()
925
- in_path.unlink()
926
- self._cleanup(jobid)
927
- return
928
-
929
- # Clean up input pickle file
930
- in_path.unlink()
931
- self._cleanup(jobid)
932
- if job.single_task_submission:
933
- fut.set_result(outputs[0])
934
- else:
935
- fut.set_result(outputs)
936
- return
937
-
938
- except Exception as e:
939
- try:
940
- fut.set_exception(e)
941
- return
942
- except InvalidStateError:
943
- logger.warning(
944
- f"Future {fut} (SLURM job ID: {jobid}) was already"
945
- " cancelled, exit from"
946
- " FractalSlurmExecutor._completion."
947
- )
948
-
949
- def _copy_files_from_remote_to_local(
950
- self,
951
- job: SlurmJob,
952
- ):
953
- """
954
- Impersonate the user and copy task-related files
955
-
956
- For all files in `self.workflow_dir_remote` that start with
957
- `job.file_prefix`, read them (with `sudo -u` impersonation) and write
958
- them to `self.workflow_dir_local`.
959
-
960
- Files to copy:
961
- * Job-related files (SLURM stderr/stdout files); with prefix
962
- `job.slurm_file_prefix`;
963
- * Task-related files (stderr/stdout, args.json, metadiff.json, output
964
- pickle), with prefixes `job.wftask_file_prefixes`.
965
-
966
- Arguments:
967
- job:
968
- `SlurmJob` object (needed for its prefixes-related attributes).
969
-
970
- Raises:
971
- JobExecutionError: If a `cat` command fails.
972
- """
973
- logger.debug("[_copy_files_from_remote_to_local] Start")
974
-
975
- if self.workflow_dir_remote == self.workflow_dir_local:
976
- logger.debug(
977
- "[_copy_files_from_remote_to_local] "
978
- "workflow_dir_local corresponds to workflow_dir_remote, "
979
- "return."
980
- )
981
- return
982
-
983
- subfolder_name = job.wftask_subfolder_name
984
- prefixes = set(
985
- [job.slurm_file_prefix] + list(job.wftask_file_prefixes)
986
- )
987
-
988
- logger.debug(
989
- "[_copy_files_from_remote_to_local] "
990
- f"WorkflowTask subfolder_name: {subfolder_name}"
991
- )
992
- logger.debug(f"[_copy_files_from_remote_to_local] {prefixes=}")
993
- logger.debug(
994
- "[_copy_files_from_remote_to_local] "
995
- f"{str(self.workflow_dir_remote)=}"
996
- )
997
-
998
- for prefix in prefixes:
999
- if prefix == job.slurm_file_prefix:
1000
- files_to_copy = _glob_as_user(
1001
- folder=str(self.workflow_dir_remote / subfolder_name),
1002
- user=self.slurm_user,
1003
- startswith=prefix,
1004
- )
1005
- else:
1006
- files_to_copy = _glob_as_user_strict(
1007
- folder=str(self.workflow_dir_remote / subfolder_name),
1008
- user=self.slurm_user,
1009
- startswith=prefix,
1010
- )
1011
-
1012
- logger.debug(
1013
- "[_copy_files_from_remote_to_local] "
1014
- f"{prefix=}, {len(files_to_copy)=}"
1015
- )
1016
-
1017
- for source_file_name in files_to_copy:
1018
- if " " in source_file_name:
1019
- raise ValueError(
1020
- f'source_file_name="{source_file_name}" '
1021
- "contains whitespaces"
1022
- )
1023
- source_file_path = str(
1024
- self.workflow_dir_remote
1025
- / subfolder_name
1026
- / source_file_name
1027
- )
1028
-
1029
- # Read source_file_path (requires sudo)
1030
- # NOTE: By setting encoding=None, we read/write bytes instead
1031
- # of strings; this is needed to also handle pickle files.
1032
- cmd = f"cat {source_file_path}"
1033
- res = _run_command_as_user(
1034
- cmd=cmd, user=self.slurm_user, encoding=None
1035
- )
1036
- if res.returncode != 0:
1037
- info = (
1038
- f'Running cmd="{cmd}" as {self.slurm_user=} failed\n\n'
1039
- f"{res.returncode=}\n\n"
1040
- f"{res.stdout=}\n\n{res.stderr=}\n"
1041
- )
1042
- logger.error(info)
1043
- raise JobExecutionError(info)
1044
- # Write to dest_file_path (including empty files)
1045
- dest_file_path = str(
1046
- self.workflow_dir_local / subfolder_name / source_file_name
1047
- )
1048
- with open(dest_file_path, "wb") as f:
1049
- f.write(res.stdout)
1050
- logger.debug("[_copy_files_from_remote_to_local] End")
1051
-
1052
- def _start(
1053
- self,
1054
- job: SlurmJob,
1055
- ) -> tuple[str, SlurmJob]:
1056
- """
1057
- Submit function for execution on a SLURM cluster
1058
- """
1059
-
1060
- # Prepare commands to be included in SLURM submission script
1061
- settings = Inject(get_settings)
1062
- python_worker_interpreter = (
1063
- settings.FRACTAL_SLURM_WORKER_PYTHON or sys.executable
1064
- )
1065
-
1066
- cmdlines = []
1067
- for ind_task in range(job.num_tasks_tot):
1068
- input_pickle_file = job.input_pickle_files[ind_task]
1069
- output_pickle_file = job.output_pickle_files[ind_task]
1070
- cmdlines.append(
1071
- (
1072
- f"{python_worker_interpreter}"
1073
- " -m fractal_server.app.runner.executors.slurm.remote "
1074
- f"--input-file {input_pickle_file} "
1075
- f"--output-file {output_pickle_file}"
1076
- )
1077
- )
1078
-
1079
- # ...
1080
- sbatch_script = self._prepare_sbatch_script(
1081
- slurm_config=job.slurm_config,
1082
- list_commands=cmdlines,
1083
- slurm_out_path=str(job.slurm_stdout),
1084
- slurm_err_path=str(job.slurm_stderr),
1085
- )
1086
-
1087
- # Print warning for ignored parameter
1088
- if len(job.slurm_config.pre_submission_commands) > 0:
1089
- logger.warning(
1090
- f"Ignoring {job.slurm_config.pre_submission_commands=}."
1091
- )
1092
-
1093
- # Submit job via sbatch, and retrieve jobid
1094
-
1095
- # Write script content to a job.slurm_script
1096
- with job.slurm_script.open("w") as f:
1097
- f.write(sbatch_script)
1098
-
1099
- # Prepare submission command
1100
- pre_command = f"sudo --set-home --non-interactive -u {self.slurm_user}"
1101
- submit_command = f"sbatch --parsable {job.slurm_script}"
1102
- full_command = f"{pre_command} {submit_command}"
1103
-
1104
- # Submit SLURM job and retrieve job ID
1105
- output = _subprocess_run_or_raise(full_command)
1106
- try:
1107
- jobid = int(output.stdout)
1108
- except ValueError as e:
1109
- error_msg = (
1110
- f"Submit command `{full_command}` returned "
1111
- f"`{output.stdout=}` which cannot be cast to an integer "
1112
- f"SLURM-job ID. Original error:\n{str(e)}"
1113
- )
1114
- logger.error(error_msg)
1115
- raise JobExecutionError(info=error_msg)
1116
- jobid_str = str(jobid)
1117
-
1118
- # Plug SLURM job id in stdout/stderr file paths
1119
- job.slurm_stdout = Path(
1120
- job.slurm_stdout.as_posix().replace("%j", jobid_str)
1121
- )
1122
- job.slurm_stderr = Path(
1123
- job.slurm_stderr.as_posix().replace("%j", jobid_str)
1124
- )
1125
-
1126
- return jobid_str, job
1127
-
1128
- def _prepare_sbatch_script(
1129
- self,
1130
- *,
1131
- list_commands: list[str],
1132
- slurm_out_path: str,
1133
- slurm_err_path: str,
1134
- slurm_config: SlurmConfig,
1135
- ):
1136
- num_tasks_max_running = slurm_config.parallel_tasks_per_job
1137
- mem_per_task_MB = slurm_config.mem_per_task_MB
1138
-
1139
- # Set ntasks
1140
- ntasks = min(len(list_commands), num_tasks_max_running)
1141
- if len(list_commands) < num_tasks_max_running:
1142
- ntasks = len(list_commands)
1143
- slurm_config.parallel_tasks_per_job = ntasks
1144
- logger.debug(
1145
- f"{len(list_commands)=} is smaller than "
1146
- f"{num_tasks_max_running=}. Setting {ntasks=}."
1147
- )
1148
-
1149
- # Prepare SLURM preamble based on SlurmConfig object
1150
- script_lines = slurm_config.to_sbatch_preamble(
1151
- remote_export_dir=self.user_cache_dir
1152
- )
1153
-
1154
- # Extend SLURM preamble with variables which are not in SlurmConfig, and
1155
- # fix their order
1156
- script_lines.extend(
1157
- [
1158
- f"#SBATCH --err={slurm_err_path}",
1159
- f"#SBATCH --out={slurm_out_path}",
1160
- f"#SBATCH -D {self.workflow_dir_remote}",
1161
- ]
1162
- )
1163
- script_lines = slurm_config.sort_script_lines(script_lines)
1164
- logger.debug(script_lines)
1165
-
1166
- # Always print output of `uname -n` and `pwd`
1167
- script_lines.append(
1168
- '"Hostname: `uname -n`; current directory: `pwd`"\n'
1169
- )
1170
-
1171
- # Complete script preamble
1172
- script_lines.append("\n")
1173
-
1174
- # Include command lines
1175
- tmp_list_commands = copy(list_commands)
1176
- while tmp_list_commands:
1177
- if tmp_list_commands:
1178
- cmd = tmp_list_commands.pop(0) # take first element
1179
- script_lines.append(
1180
- "srun --ntasks=1 --cpus-per-task=$SLURM_CPUS_PER_TASK "
1181
- f"--mem={mem_per_task_MB}MB "
1182
- f"{cmd} &"
1183
- )
1184
- script_lines.append("wait\n")
1185
-
1186
- script = "\n".join(script_lines)
1187
- return script
1188
-
1189
- def shutdown(self, wait=True, *, cancel_futures=False):
1190
- """
1191
- Clean up all executor variables. Note that this function is executed on
1192
- the self.wait_thread thread, see _completion.
1193
- """
1194
-
1195
- logger.debug("Executor shutdown: start")
1196
-
1197
- # Handle all job futures
1198
- slurm_jobs_to_scancel = []
1199
- with self.jobs_lock:
1200
- while self.jobs:
1201
- jobid, fut_and_job = self.jobs.popitem()
1202
- slurm_jobs_to_scancel.append(jobid)
1203
- fut = fut_and_job[0]
1204
- self.map_jobid_to_slurm_files.pop(jobid)
1205
- if not fut.cancelled():
1206
- fut.set_exception(
1207
- JobExecutionError(
1208
- "Job cancelled due to executor shutdown."
1209
- )
1210
- )
1211
- fut.cancel()
1212
-
1213
- # Cancel SLURM jobs
1214
- if slurm_jobs_to_scancel:
1215
- scancel_string = " ".join(slurm_jobs_to_scancel)
1216
- logger.warning(f"Now scancel-ing SLURM jobs {scancel_string}")
1217
- pre_command = f"sudo --non-interactive -u {self.slurm_user}"
1218
- submit_command = f"scancel {scancel_string}"
1219
- full_command = f"{pre_command} {submit_command}"
1220
- validate_cmd(full_command)
1221
- logger.debug(f"Now execute `{full_command}`")
1222
- try:
1223
- subprocess.run( # nosec
1224
- shlex.split(full_command),
1225
- capture_output=True,
1226
- check=True,
1227
- encoding="utf-8",
1228
- )
1229
- except subprocess.CalledProcessError as e:
1230
- error_msg = (
1231
- f"Cancel command `{full_command}` failed. "
1232
- f"Original error:\n{str(e)}"
1233
- )
1234
- logger.error(error_msg)
1235
- raise JobExecutionError(info=error_msg)
1236
-
1237
- # Redundantly set thread shutdown attribute to True
1238
- self.wait_thread.shutdown = True
1239
-
1240
- logger.debug("Executor shutdown: end")
1241
-
1242
- def _stop_and_join_wait_thread(self):
1243
- self.wait_thread.shutdown = True
1244
- self.wait_thread.join()
1245
-
1246
- def __exit__(self, *args, **kwargs):
1247
- """
1248
- See
1249
- https://github.com/fractal-analytics-platform/fractal-server/issues/1508
1250
- """
1251
- logger.debug(
1252
- "[FractalSlurmExecutor.__exit__] Stop and join `wait_thread`"
1253
- )
1254
- self._stop_and_join_wait_thread()
1255
- logger.debug("[FractalSlurmExecutor.__exit__] End")
1256
-
1257
- def check_remote_python_interpreter(self):
1258
- """
1259
- Check fractal-server version on the _remote_ Python interpreter.
1260
- """
1261
- settings = Inject(get_settings)
1262
- output = _subprocess_run_or_raise(
1263
- (
1264
- f"{settings.FRACTAL_SLURM_WORKER_PYTHON} "
1265
- "-m fractal_server.app.runner.versions"
1266
- )
1267
- )
1268
- runner_version = json.loads(output.stdout.strip("\n"))[
1269
- "fractal_server"
1270
- ]
1271
-
1272
- if runner_version != __VERSION__:
1273
- error_msg = (
1274
- "Fractal-server version mismatch.\n"
1275
- "Local interpreter: "
1276
- f"({sys.executable}): {__VERSION__}.\n"
1277
- "Remote interpreter: "
1278
- f"({settings.FRACTAL_SLURM_WORKER_PYTHON}): {runner_version}."
1279
- )
1280
- logger.error(error_msg)
1281
- raise ValueError(error_msg)
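
For reference, the removed executor serializes each task as a cloudpickle payload of the form (versions, function, args, kwargs), writes it to a per-task input pickle file, and expects the remote worker to write back a (success, output) pair that `_completion` then reads (see `_submit_job` and `_completion` above). The snippet below is a minimal, self-contained sketch of that round trip and is not part of the diff: `double_it` and its argument are placeholders, the `versions` dict is abbreviated, and only the `cloudpickle` package is assumed to be installed.

import sys

import cloudpickle


def double_it(x):
    # Placeholder task function standing in for a Fractal task wrapper.
    return 2 * x


# Payload written to the per-task input pickle file (see `_submit_job`).
versions = dict(
    python=sys.version_info[:3],
    cloudpickle=cloudpickle.__version__,
)
funcser = cloudpickle.dumps((versions, double_it, [21], {}))

# What the remote worker conceptually does before writing the output
# pickle file that `_completion` reads back (error handling omitted).
_versions, fun, args, kwargs = cloudpickle.loads(funcser)
outdata = cloudpickle.dumps((True, fun(*args, **kwargs)))

# `_completion` unpickles the output file into a (success, output) pair.
success, output = cloudpickle.loads(outdata)
print(success, output)  # True 42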