fractal-server 2.12.1__py3-none-any.whl → 2.13.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fractal_server/__init__.py +1 -1
- fractal_server/app/models/security.py +9 -12
- fractal_server/app/models/v2/dataset.py +2 -2
- fractal_server/app/models/v2/job.py +11 -9
- fractal_server/app/models/v2/task.py +2 -3
- fractal_server/app/models/v2/task_group.py +6 -2
- fractal_server/app/models/v2/workflowtask.py +15 -8
- fractal_server/app/routes/admin/v2/task.py +1 -1
- fractal_server/app/routes/admin/v2/task_group.py +1 -1
- fractal_server/app/routes/api/v2/dataset.py +4 -4
- fractal_server/app/routes/api/v2/images.py +11 -11
- fractal_server/app/routes/api/v2/project.py +2 -2
- fractal_server/app/routes/api/v2/status.py +1 -1
- fractal_server/app/routes/api/v2/submit.py +8 -6
- fractal_server/app/routes/api/v2/task.py +4 -2
- fractal_server/app/routes/api/v2/task_collection.py +3 -2
- fractal_server/app/routes/api/v2/task_group.py +2 -2
- fractal_server/app/routes/api/v2/workflow.py +3 -3
- fractal_server/app/routes/api/v2/workflow_import.py +3 -3
- fractal_server/app/routes/api/v2/workflowtask.py +3 -1
- fractal_server/app/routes/auth/_aux_auth.py +4 -1
- fractal_server/app/routes/auth/current_user.py +3 -5
- fractal_server/app/routes/auth/group.py +1 -1
- fractal_server/app/routes/auth/users.py +2 -4
- fractal_server/app/routes/aux/_runner.py +1 -1
- fractal_server/app/routes/aux/validate_user_settings.py +1 -2
- fractal_server/app/runner/executors/_job_states.py +13 -0
- fractal_server/app/runner/executors/slurm/_slurm_config.py +26 -18
- fractal_server/app/runner/executors/slurm/ssh/__init__.py +0 -3
- fractal_server/app/runner/executors/slurm/ssh/_executor_wait_thread.py +31 -22
- fractal_server/app/runner/executors/slurm/ssh/_slurm_job.py +2 -5
- fractal_server/app/runner/executors/slurm/ssh/executor.py +21 -27
- fractal_server/app/runner/executors/slurm/sudo/__init__.py +0 -3
- fractal_server/app/runner/executors/slurm/sudo/_check_jobs_status.py +1 -2
- fractal_server/app/runner/executors/slurm/sudo/_executor_wait_thread.py +37 -47
- fractal_server/app/runner/executors/slurm/sudo/executor.py +25 -24
- fractal_server/app/runner/v2/__init__.py +0 -9
- fractal_server/app/runner/v2/_local/_local_config.py +5 -4
- fractal_server/app/runner/v2/_slurm_common/get_slurm_config.py +4 -4
- fractal_server/app/runner/v2/_slurm_sudo/__init__.py +2 -2
- fractal_server/app/runner/v2/deduplicate_list.py +1 -1
- fractal_server/app/runner/v2/runner.py +9 -4
- fractal_server/app/runner/v2/task_interface.py +15 -7
- fractal_server/app/schemas/_filter_validators.py +6 -3
- fractal_server/app/schemas/_validators.py +7 -5
- fractal_server/app/schemas/user.py +23 -18
- fractal_server/app/schemas/user_group.py +25 -11
- fractal_server/app/schemas/user_settings.py +31 -24
- fractal_server/app/schemas/v2/dataset.py +48 -35
- fractal_server/app/schemas/v2/dumps.py +16 -14
- fractal_server/app/schemas/v2/job.py +49 -29
- fractal_server/app/schemas/v2/manifest.py +32 -28
- fractal_server/app/schemas/v2/project.py +18 -8
- fractal_server/app/schemas/v2/task.py +86 -75
- fractal_server/app/schemas/v2/task_collection.py +41 -30
- fractal_server/app/schemas/v2/task_group.py +39 -20
- fractal_server/app/schemas/v2/workflow.py +24 -12
- fractal_server/app/schemas/v2/workflowtask.py +63 -61
- fractal_server/app/security/__init__.py +1 -1
- fractal_server/config.py +32 -25
- fractal_server/images/models.py +18 -12
- fractal_server/main.py +1 -1
- fractal_server/tasks/v2/utils_background.py +1 -1
- fractal_server/tasks/v2/utils_database.py +1 -1
- {fractal_server-2.12.1.dist-info → fractal_server-2.13.0.dist-info}/METADATA +9 -10
- {fractal_server-2.12.1.dist-info → fractal_server-2.13.0.dist-info}/RECORD +69 -72
- fractal_server/app/runner/v2/_local_experimental/__init__.py +0 -121
- fractal_server/app/runner/v2/_local_experimental/_local_config.py +0 -108
- fractal_server/app/runner/v2/_local_experimental/_submit_setup.py +0 -42
- fractal_server/app/runner/v2/_local_experimental/executor.py +0 -157
- {fractal_server-2.12.1.dist-info → fractal_server-2.13.0.dist-info}/LICENSE +0 -0
- {fractal_server-2.12.1.dist-info → fractal_server-2.13.0.dist-info}/WHEEL +0 -0
- {fractal_server-2.12.1.dist-info → fractal_server-2.13.0.dist-info}/entry_points.txt +0 -0
@@ -18,9 +18,9 @@ from typing import Optional
|
|
18
18
|
from typing import Union
|
19
19
|
|
20
20
|
from pydantic import BaseModel
|
21
|
-
from pydantic import Extra
|
21
|
+
from pydantic import ConfigDict
|
22
22
|
from pydantic import Field
|
23
|
-
from pydantic
|
23
|
+
from pydantic import ValidationError
|
24
24
|
|
25
25
|
from .....config import get_settings
|
26
26
|
from .....logger import set_logger
|
@@ -37,7 +37,7 @@ class SlurmConfigError(ValueError):
|
|
37
37
|
pass
|
38
38
|
|
39
39
|
|
40
|
-
class _SlurmConfigSet(BaseModel, extra=Extra.forbid):
|
40
|
+
class _SlurmConfigSet(BaseModel):
|
41
41
|
"""
|
42
42
|
Options that can be set in `FRACTAL_SLURM_CONFIG_FILE` for the default/gpu
|
43
43
|
SLURM config. Only used as part of `SlurmConfigFile`.
|
@@ -54,19 +54,21 @@ class _SlurmConfigSet(BaseModel, extra=Extra.forbid):
|
|
54
54
|
extra_lines:
|
55
55
|
"""
|
56
56
|
|
57
|
-
|
58
|
-
cpus_per_task: Optional[int]
|
59
|
-
mem: Optional[Union[int, str]]
|
60
|
-
constraint: Optional[str]
|
61
|
-
gres: Optional[str]
|
62
|
-
time: Optional[str]
|
63
|
-
account: Optional[str]
|
64
|
-
extra_lines: Optional[list[str]]
|
65
|
-
pre_submission_commands: Optional[list[str]]
|
66
|
-
gpus: Optional[str]
|
57
|
+
model_config = ConfigDict(extra="forbid")
|
67
58
|
|
59
|
+
partition: Optional[str] = None
|
60
|
+
cpus_per_task: Optional[int] = None
|
61
|
+
mem: Optional[Union[int, str]] = None
|
62
|
+
constraint: Optional[str] = None
|
63
|
+
gres: Optional[str] = None
|
64
|
+
time: Optional[str] = None
|
65
|
+
account: Optional[str] = None
|
66
|
+
extra_lines: Optional[list[str]] = None
|
67
|
+
pre_submission_commands: Optional[list[str]] = None
|
68
|
+
gpus: Optional[str] = None
|
68
69
|
|
69
|
-
|
70
|
+
|
71
|
+
class _BatchingConfigSet(BaseModel):
|
70
72
|
"""
|
71
73
|
Options that can be set in `FRACTAL_SLURM_CONFIG_FILE` to configure the
|
72
74
|
batching strategy (that is, how to combine several tasks in a single SLURM
|
@@ -83,6 +85,8 @@ class _BatchingConfigSet(BaseModel, extra=Extra.forbid):
|
|
83
85
|
max_num_jobs:
|
84
86
|
"""
|
85
87
|
|
88
|
+
model_config = ConfigDict(extra="forbid")
|
89
|
+
|
86
90
|
target_cpus_per_job: int
|
87
91
|
max_cpus_per_job: int
|
88
92
|
target_mem_per_job: Union[int, str]
|
@@ -91,7 +95,7 @@ class _BatchingConfigSet(BaseModel, extra=Extra.forbid):
|
|
91
95
|
max_num_jobs: int
|
92
96
|
|
93
97
|
|
94
|
-
class SlurmConfigFile(BaseModel, extra=Extra.forbid):
|
98
|
+
class SlurmConfigFile(BaseModel):
|
95
99
|
"""
|
96
100
|
Specifications for the content of `FRACTAL_SLURM_CONFIG_FILE`
|
97
101
|
|
@@ -136,10 +140,12 @@ class SlurmConfigFile(BaseModel, extra=Extra.forbid):
|
|
136
140
|
directory.
|
137
141
|
"""
|
138
142
|
|
143
|
+
model_config = ConfigDict(extra="forbid")
|
144
|
+
|
139
145
|
default_slurm_config: _SlurmConfigSet
|
140
|
-
gpu_slurm_config: Optional[_SlurmConfigSet]
|
146
|
+
gpu_slurm_config: Optional[_SlurmConfigSet] = None
|
141
147
|
batching_config: _BatchingConfigSet
|
142
|
-
user_local_exports: Optional[dict[str, str]]
|
148
|
+
user_local_exports: Optional[dict[str, str]] = None
|
143
149
|
|
144
150
|
|
145
151
|
def load_slurm_config_file(
|
@@ -196,7 +202,7 @@ def load_slurm_config_file(
|
|
196
202
|
return obj
|
197
203
|
|
198
204
|
|
199
|
-
class SlurmConfig(BaseModel, extra=Extra.forbid):
|
205
|
+
class SlurmConfig(BaseModel):
|
200
206
|
"""
|
201
207
|
Abstraction for SLURM parameters
|
202
208
|
|
@@ -247,6 +253,8 @@ class SlurmConfig(BaseModel, extra=Extra.forbid):
|
|
247
253
|
command.
|
248
254
|
"""
|
249
255
|
|
256
|
+
model_config = ConfigDict(extra="forbid")
|
257
|
+
|
250
258
|
# Required SLURM parameters (note that the integer attributes are those
|
251
259
|
# that will need to scale up with the number of parallel tasks per job)
|
252
260
|
partition: str
|
@@ -1,10 +1,8 @@
|
|
1
1
|
import os
|
2
|
+
import threading
|
2
3
|
import time
|
3
4
|
import traceback
|
4
5
|
from itertools import count
|
5
|
-
from typing import Callable
|
6
|
-
|
7
|
-
from cfut import FileWaitThread
|
8
6
|
|
9
7
|
from ......logger import set_logger
|
10
8
|
from fractal_server.app.runner.exceptions import JobExecutionError
|
@@ -12,35 +10,46 @@ from fractal_server.app.runner.exceptions import JobExecutionError
|
|
12
10
|
logger = set_logger(__name__)
|
13
11
|
|
14
12
|
|
15
|
-
class FractalSlurmWaitThread(FileWaitThread):
|
13
|
+
class FractalSlurmSSHWaitThread(threading.Thread):
|
16
14
|
"""
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
15
|
+
Thread that monitors a pool of SLURM jobs
|
16
|
+
|
17
|
+
This class is a custom re-implementation of the waiting thread class from:
|
18
|
+
|
19
|
+
> clusterfutures <https://github.com/sampsyo/clusterfutures>
|
20
|
+
> Original Copyright
|
21
|
+
> Copyright 2021 Adrian Sampson <asampson@cs.washington.edu>
|
22
|
+
> License: MIT
|
23
|
+
|
24
|
+
Attributes:
|
25
|
+
shutdown_file:
|
26
|
+
shutdown_callback:
|
27
|
+
slurm_poll_interval:
|
28
|
+
jobs_finished_callback:
|
29
|
+
active_job_ids:
|
30
|
+
shutdown:
|
31
|
+
lock:
|
29
32
|
"""
|
30
33
|
|
31
34
|
shutdown_file: str
|
32
|
-
shutdown_callback:
|
33
|
-
jobs_finished_callback: Callable
|
35
|
+
shutdown_callback: callable
|
34
36
|
slurm_poll_interval = 30
|
37
|
+
jobs_finished_callback: callable
|
35
38
|
active_job_ids: list[str]
|
39
|
+
shutdown: bool
|
40
|
+
_lock: threading.Lock
|
36
41
|
|
37
|
-
def __init__(self,
|
42
|
+
def __init__(self, callback: callable, interval=1):
|
38
43
|
"""
|
39
44
|
Init method
|
40
45
|
|
41
46
|
This method is executed on the main thread.
|
42
47
|
"""
|
43
|
-
|
48
|
+
threading.Thread.__init__(self, daemon=True)
|
49
|
+
self.callback = callback
|
50
|
+
self.interval = interval
|
51
|
+
self._lock = threading.Lock()
|
52
|
+
self.shutdown = False
|
44
53
|
self.active_job_ids = []
|
45
54
|
|
46
55
|
def wait(self, *, job_id: str):
|
@@ -53,7 +62,7 @@ class FractalSlurmWaitThread(FileWaitThread):
|
|
53
62
|
error_msg = "Cannot call `wait` method after executor shutdown."
|
54
63
|
logger.warning(error_msg)
|
55
64
|
raise JobExecutionError(info=error_msg)
|
56
|
-
with self.
|
65
|
+
with self._lock:
|
57
66
|
self.active_job_ids.append(job_id)
|
58
67
|
|
59
68
|
def check_shutdown(self):
|
@@ -109,7 +118,7 @@ class FractalSlurmWaitThread(FileWaitThread):
|
|
109
118
|
pass
|
110
119
|
return
|
111
120
|
if ind % skip == 0:
|
112
|
-
with self.
|
121
|
+
with self._lock:
|
113
122
|
try:
|
114
123
|
self.check_jobs()
|
115
124
|
except Exception: # nosec
|
@@ -1,8 +1,7 @@
|
|
1
|
+
import uuid
|
1
2
|
from pathlib import Path
|
2
3
|
from typing import Optional
|
3
4
|
|
4
|
-
from cfut.util import random_string
|
5
|
-
|
6
5
|
from fractal_server.app.runner.executors.slurm._slurm_config import (
|
7
6
|
SlurmConfig,
|
8
7
|
)
|
@@ -106,9 +105,7 @@ class SlurmJob:
|
|
106
105
|
)
|
107
106
|
else:
|
108
107
|
self.wftask_file_prefixes = wftask_file_prefixes
|
109
|
-
self.workerids = tuple(
|
110
|
-
random_string() for i in range(self.num_tasks_tot)
|
111
|
-
)
|
108
|
+
self.workerids = tuple(uuid.uuid4() for i in range(self.num_tasks_tot))
|
112
109
|
self.slurm_config = slurm_config
|
113
110
|
|
114
111
|
def get_clean_output_pickle_files(self) -> tuple[str, ...]:
|
@@ -1,20 +1,9 @@
|
|
1
|
-
# This adapts clusterfutures <https://github.com/sampsyo/clusterfutures>
|
2
|
-
# Original Copyright
|
3
|
-
# Copyright 2021 Adrian Sampson <asampson@cs.washington.edu>
|
4
|
-
# License: MIT
|
5
|
-
#
|
6
|
-
# Modified by:
|
7
|
-
# Jacopo Nespolo <jacopo.nespolo@exact-lab.it>
|
8
|
-
# Tommaso Comparin <tommaso.comparin@exact-lab.it>
|
9
|
-
# Marco Franzon <marco.franzon@exact-lab.it>
|
10
|
-
#
|
11
|
-
# Copyright 2022 (C) Friedrich Miescher Institute for Biomedical Research and
|
12
|
-
# University of Zurich
|
13
1
|
import json
|
14
2
|
import math
|
15
3
|
import sys
|
16
4
|
import threading
|
17
5
|
import time
|
6
|
+
from concurrent.futures import Executor
|
18
7
|
from concurrent.futures import Future
|
19
8
|
from concurrent.futures import InvalidStateError
|
20
9
|
from copy import copy
|
@@ -25,18 +14,18 @@ from typing import Optional
|
|
25
14
|
from typing import Sequence
|
26
15
|
|
27
16
|
import cloudpickle
|
28
|
-
from cfut import SlurmExecutor
|
29
17
|
|
30
18
|
from ....filenames import SHUTDOWN_FILENAME
|
31
19
|
from ....task_files import get_task_file_paths
|
32
20
|
from ....task_files import TaskFiles
|
33
21
|
from ....versions import get_versions
|
22
|
+
from ..._job_states import STATES_FINISHED
|
34
23
|
from ...slurm._slurm_config import SlurmConfig
|
35
24
|
from .._batching import heuristics
|
36
25
|
from ..utils_executors import get_pickle_file_path
|
37
26
|
from ..utils_executors import get_slurm_file_path
|
38
27
|
from ..utils_executors import get_slurm_script_file_path
|
39
|
-
from ._executor_wait_thread import
|
28
|
+
from ._executor_wait_thread import FractalSlurmSSHWaitThread
|
40
29
|
from fractal_server.app.runner.components import _COMPONENT_KEY_
|
41
30
|
from fractal_server.app.runner.compress_folder import compress_folder
|
42
31
|
from fractal_server.app.runner.exceptions import JobExecutionError
|
@@ -48,24 +37,31 @@ from fractal_server.logger import set_logger
|
|
48
37
|
from fractal_server.ssh._fabric import FractalSSH
|
49
38
|
from fractal_server.syringe import Inject
|
50
39
|
|
40
|
+
|
51
41
|
logger = set_logger(__name__)
|
52
42
|
|
53
43
|
|
54
|
-
class FractalSlurmSSHExecutor(SlurmExecutor):
|
44
|
+
class FractalSlurmSSHExecutor(Executor):
|
55
45
|
"""
|
56
|
-
|
46
|
+
Executor to submit SLURM jobs via SSH
|
47
|
+
|
48
|
+
This class is a custom re-implementation of the SLURM executor from
|
49
|
+
|
50
|
+
> clusterfutures <https://github.com/sampsyo/clusterfutures>
|
51
|
+
> Original Copyright
|
52
|
+
> Copyright 2021 Adrian Sampson <asampson@cs.washington.edu>
|
53
|
+
> License: MIT
|
57
54
|
|
58
|
-
FIXME: docstring
|
59
55
|
|
60
56
|
Attributes:
|
61
57
|
fractal_ssh: FractalSSH connection with custom lock
|
62
|
-
shutdown_file:
|
63
|
-
python_remote: Equal to `settings.FRACTAL_SLURM_WORKER_PYTHON`
|
64
|
-
wait_thread_cls: Class for waiting thread
|
65
58
|
workflow_dir_local:
|
66
59
|
Directory for both the cfut/SLURM and fractal-server files and logs
|
67
60
|
workflow_dir_remote:
|
68
61
|
Directory for both the cfut/SLURM and fractal-server files and logs
|
62
|
+
shutdown_file:
|
63
|
+
python_remote: Equal to `settings.FRACTAL_SLURM_WORKER_PYTHON`
|
64
|
+
wait_thread_cls: Class for waiting thread
|
69
65
|
common_script_lines:
|
70
66
|
Arbitrary script lines that will always be included in the
|
71
67
|
sbatch script
|
@@ -82,10 +78,10 @@ class FractalSlurmSSHExecutor(SlurmExecutor):
|
|
82
78
|
shutdown_file: str
|
83
79
|
python_remote: str
|
84
80
|
|
85
|
-
wait_thread_cls =
|
81
|
+
wait_thread_cls = FractalSlurmSSHWaitThread
|
86
82
|
|
87
83
|
common_script_lines: list[str]
|
88
|
-
slurm_account: Optional[str]
|
84
|
+
slurm_account: Optional[str] = None
|
89
85
|
|
90
86
|
jobs: dict[str, tuple[Future, SlurmJob]]
|
91
87
|
map_jobid_to_slurm_files_local: dict[str, tuple[str, str, str]]
|
@@ -1159,7 +1155,7 @@ class FractalSlurmSSHExecutor(SlurmExecutor):
|
|
1159
1155
|
Path(tarfile_path_local).unlink()
|
1160
1156
|
|
1161
1157
|
t_1 = time.perf_counter()
|
1162
|
-
logger.info("[_get_subfolder_sftp] End -
|
1158
|
+
logger.info(f"[_get_subfolder_sftp] End - elapsed: {t_1 - t_0:.3f} s")
|
1163
1159
|
|
1164
1160
|
def _prepare_sbatch_script(
|
1165
1161
|
self,
|
@@ -1258,7 +1254,7 @@ class FractalSlurmSSHExecutor(SlurmExecutor):
|
|
1258
1254
|
logger.debug("Executor shutdown: end")
|
1259
1255
|
|
1260
1256
|
def _stop_and_join_wait_thread(self):
|
1261
|
-
self.wait_thread.
|
1257
|
+
self.wait_thread.shutdown = True
|
1262
1258
|
self.wait_thread.join()
|
1263
1259
|
|
1264
1260
|
def __exit__(self, *args, **kwargs):
|
@@ -1295,8 +1291,6 @@ class FractalSlurmSSHExecutor(SlurmExecutor):
|
|
1295
1291
|
(released under the MIT licence)
|
1296
1292
|
"""
|
1297
1293
|
|
1298
|
-
from cfut.slurm import STATES_FINISHED
|
1299
|
-
|
1300
1294
|
logger.debug(
|
1301
1295
|
f"[FractalSlurmSSHExecutor._jobs_finished] START ({job_ids=})"
|
1302
1296
|
)
|
@@ -1387,6 +1381,6 @@ class FractalSlurmSSHExecutor(SlurmExecutor):
|
|
1387
1381
|
t_end_handshake = time.perf_counter()
|
1388
1382
|
logger.info(
|
1389
1383
|
"[FractalSlurmSSHExecutor.ssh_handshake] END"
|
1390
|
-
f" - elapsed: {t_end_handshake-t_start_handshake:.3f} s"
|
1384
|
+
f" - elapsed: {t_end_handshake - t_start_handshake:.3f} s"
|
1391
1385
|
)
|
1392
1386
|
return remote_versions
|
@@ -1,12 +1,10 @@
|
|
1
1
|
import os
|
2
|
+
import threading
|
2
3
|
import time
|
3
4
|
import traceback
|
4
5
|
from itertools import count
|
5
|
-
from typing import Callable
|
6
6
|
from typing import Optional
|
7
7
|
|
8
|
-
from cfut import FileWaitThread
|
9
|
-
|
10
8
|
from ......logger import set_logger
|
11
9
|
from ._check_jobs_status import _jobs_finished
|
12
10
|
from fractal_server.app.runner.exceptions import JobExecutionError
|
@@ -14,33 +12,43 @@ from fractal_server.app.runner.exceptions import JobExecutionError
|
|
14
12
|
logger = set_logger(__name__)
|
15
13
|
|
16
14
|
|
17
|
-
class FractalFileWaitThread(FileWaitThread):
|
15
|
+
class FractalSlurmSudoWaitThread(threading.Thread):
|
18
16
|
"""
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
17
|
+
Thread that monitors a pool of SLURM jobs
|
18
|
+
|
19
|
+
This class is a custom re-implementation of the waiting thread class from:
|
20
|
+
|
21
|
+
> clusterfutures <https://github.com/sampsyo/clusterfutures>
|
22
|
+
> Original Copyright
|
23
|
+
> Copyright 2021 Adrian Sampson <asampson@cs.washington.edu>
|
24
|
+
> License: MIT
|
25
|
+
|
26
|
+
Attributes:
|
27
|
+
slurm_user:
|
28
|
+
shutdown_file:
|
29
|
+
shutdown_callback:
|
30
|
+
slurm_poll_interval:
|
31
|
+
waiting:
|
32
|
+
shutdown:
|
33
|
+
lock:
|
36
34
|
"""
|
37
35
|
|
38
36
|
slurm_user: str
|
39
37
|
shutdown_file: Optional[str] = None
|
40
|
-
shutdown_callback:
|
41
|
-
|
42
|
-
|
43
|
-
|
38
|
+
shutdown_callback: callable
|
39
|
+
slurm_poll_interval: int = 30
|
40
|
+
waiting: dict[tuple[str, ...], str]
|
41
|
+
shutdown: bool
|
42
|
+
_lock: threading.Lock
|
43
|
+
|
44
|
+
def __init__(self, callback: callable, interval=1):
|
45
|
+
threading.Thread.__init__(self, daemon=True)
|
46
|
+
self.callback = callback
|
47
|
+
self.interval = interval
|
48
|
+
self.waiting = {}
|
49
|
+
self._lock = threading.Lock() # To protect the .waiting dict
|
50
|
+
self.shutdown = False
|
51
|
+
self.active_job_ids = []
|
44
52
|
|
45
53
|
def wait(
|
46
54
|
self,
|
@@ -61,10 +69,10 @@ class FractalFileWaitThread(FileWaitThread):
|
|
61
69
|
error_msg = "Cannot call `wait` method after executor shutdown."
|
62
70
|
logger.warning(error_msg)
|
63
71
|
raise JobExecutionError(info=error_msg)
|
64
|
-
with self.
|
72
|
+
with self._lock:
|
65
73
|
self.waiting[filenames] = jobid
|
66
74
|
|
67
|
-
def
|
75
|
+
def check_shutdown(self, i):
|
68
76
|
"""
|
69
77
|
Do one shutdown-file-existence check.
|
70
78
|
|
@@ -99,30 +107,12 @@ class FractalFileWaitThread(FileWaitThread):
|
|
99
107
|
if self.shutdown:
|
100
108
|
self.shutdown_callback()
|
101
109
|
return
|
102
|
-
with self.
|
110
|
+
with self._lock:
|
103
111
|
self.check(i)
|
104
112
|
time.sleep(self.interval)
|
105
113
|
|
106
|
-
|
107
|
-
class FractalSlurmWaitThread(FractalFileWaitThread):
|
108
|
-
"""
|
109
|
-
Replaces the original clusterfutures.SlurmWaitThread, to inherit from
|
110
|
-
FractalFileWaitThread instead of FileWaitThread.
|
111
|
-
|
112
|
-
The function is copied from clusterfutures 0.5. Original Copyright: 2022
|
113
|
-
Adrian Sampson, released under the MIT licence
|
114
|
-
|
115
|
-
**Note**: if `self.interval != 1` then this should be modified, but for
|
116
|
-
`clusterfutures` v0.5 `self.interval` is indeed equal to `1`.
|
117
|
-
|
118
|
-
Changed from clusterfutures:
|
119
|
-
* Rename `id_to_filename` to `id_to_filenames`
|
120
|
-
"""
|
121
|
-
|
122
|
-
slurm_poll_interval = 30
|
123
|
-
|
124
114
|
def check(self, i):
|
125
|
-
|
115
|
+
self.check_shutdown(i)
|
126
116
|
if i % (self.slurm_poll_interval // self.interval) == 0:
|
127
117
|
try:
|
128
118
|
finished_jobs = _jobs_finished(self.waiting.values())
|
@@ -1,21 +1,12 @@
|
|
1
|
-
# This adapts clusterfutures <https://github.com/sampsyo/clusterfutures>
|
2
|
-
# Original Copyright
|
3
|
-
# Copyright 2021 Adrian Sampson <asampson@cs.washington.edu>
|
4
|
-
# License: MIT
|
5
|
-
#
|
6
|
-
# Modified by:
|
7
|
-
# Jacopo Nespolo <jacopo.nespolo@exact-lab.it>
|
8
|
-
# Tommaso Comparin <tommaso.comparin@exact-lab.it>
|
9
|
-
# Marco Franzon <marco.franzon@exact-lab.it>
|
10
|
-
#
|
11
|
-
# Copyright 2022 (C) Friedrich Miescher Institute for Biomedical Research and
|
12
|
-
# University of Zurich
|
13
1
|
import json
|
14
2
|
import math
|
15
3
|
import shlex
|
16
4
|
import subprocess # nosec
|
17
5
|
import sys
|
6
|
+
import threading
|
18
7
|
import time
|
8
|
+
import uuid
|
9
|
+
from concurrent.futures import Executor
|
19
10
|
from concurrent.futures import Future
|
20
11
|
from concurrent.futures import InvalidStateError
|
21
12
|
from copy import copy
|
@@ -27,8 +18,6 @@ from typing import Optional
|
|
27
18
|
from typing import Sequence
|
28
19
|
|
29
20
|
import cloudpickle
|
30
|
-
from cfut import SlurmExecutor
|
31
|
-
from cfut.util import random_string
|
32
21
|
|
33
22
|
from ......config import get_settings
|
34
23
|
from ......logger import set_logger
|
@@ -43,7 +32,7 @@ from .._batching import heuristics
|
|
43
32
|
from ..utils_executors import get_pickle_file_path
|
44
33
|
from ..utils_executors import get_slurm_file_path
|
45
34
|
from ..utils_executors import get_slurm_script_file_path
|
46
|
-
from ._executor_wait_thread import
|
35
|
+
from ._executor_wait_thread import FractalSlurmSudoWaitThread
|
47
36
|
from ._subprocess_run_as_user import _glob_as_user
|
48
37
|
from ._subprocess_run_as_user import _glob_as_user_strict
|
49
38
|
from ._subprocess_run_as_user import _path_exists_as_user
|
@@ -180,9 +169,7 @@ class SlurmJob:
|
|
180
169
|
)
|
181
170
|
else:
|
182
171
|
self.wftask_file_prefixes = wftask_file_prefixes
|
183
|
-
self.workerids = tuple(
|
184
|
-
random_string() for i in range(self.num_tasks_tot)
|
185
|
-
)
|
172
|
+
self.workerids = tuple(uuid.uuid4() for i in range(self.num_tasks_tot))
|
186
173
|
self.slurm_config = slurm_config
|
187
174
|
|
188
175
|
def get_clean_output_pickle_files(self) -> tuple[str, ...]:
|
@@ -193,9 +180,17 @@ class SlurmJob:
|
|
193
180
|
return tuple(str(f.as_posix()) for f in self.output_pickle_files)
|
194
181
|
|
195
182
|
|
196
|
-
class FractalSlurmExecutor(SlurmExecutor):
|
183
|
+
class FractalSlurmSudoExecutor(Executor):
|
197
184
|
"""
|
198
|
-
|
185
|
+
Executor to submit SLURM jobs as a different user, via `sudo -u`
|
186
|
+
|
187
|
+
This class is a custom re-implementation of the SLURM executor from
|
188
|
+
|
189
|
+
> clusterfutures <https://github.com/sampsyo/clusterfutures>
|
190
|
+
> Original Copyright
|
191
|
+
> Copyright 2021 Adrian Sampson <asampson@cs.washington.edu>
|
192
|
+
> License: MIT
|
193
|
+
|
199
194
|
|
200
195
|
Attributes:
|
201
196
|
slurm_user:
|
@@ -211,7 +206,7 @@ class FractalSlurmExecutor(SlurmExecutor):
|
|
211
206
|
Dictionary with paths of slurm-related files for active jobs
|
212
207
|
"""
|
213
208
|
|
214
|
-
wait_thread_cls =
|
209
|
+
wait_thread_cls = FractalSlurmSudoWaitThread
|
215
210
|
slurm_user: str
|
216
211
|
shutdown_file: str
|
217
212
|
common_script_lines: list[str]
|
@@ -219,7 +214,7 @@ class FractalSlurmExecutor(SlurmExecutor):
|
|
219
214
|
workflow_dir_local: Path
|
220
215
|
workflow_dir_remote: Path
|
221
216
|
map_jobid_to_slurm_files: dict[str, tuple[str, str, str]]
|
222
|
-
slurm_account: Optional[str]
|
217
|
+
slurm_account: Optional[str] = None
|
223
218
|
jobs: dict[str, tuple[Future, SlurmJob]]
|
224
219
|
|
225
220
|
def __init__(
|
@@ -244,7 +239,13 @@ class FractalSlurmExecutor(SlurmExecutor):
|
|
244
239
|
"Missing attribute FractalSlurmExecutor.slurm_user"
|
245
240
|
)
|
246
241
|
|
247
|
-
|
242
|
+
self.jobs = {}
|
243
|
+
self.job_outfiles = {}
|
244
|
+
self.jobs_lock = threading.Lock()
|
245
|
+
self.jobs_empty_cond = threading.Condition(self.jobs_lock)
|
246
|
+
|
247
|
+
self.wait_thread = self.wait_thread_cls(self._completion)
|
248
|
+
self.wait_thread.start()
|
248
249
|
|
249
250
|
# Assign `wait_thread.shutdown_callback` early, since it may be called
|
250
251
|
# from within `_stop_and_join_wait_thread` (e.g. if an exception is
|
@@ -1239,7 +1240,7 @@ class FractalSlurmExecutor(SlurmExecutor):
|
|
1239
1240
|
logger.debug("Executor shutdown: end")
|
1240
1241
|
|
1241
1242
|
def _stop_and_join_wait_thread(self):
|
1242
|
-
self.wait_thread.
|
1243
|
+
self.wait_thread.shutdown = True
|
1243
1244
|
self.wait_thread.join()
|
1244
1245
|
|
1245
1246
|
def __exit__(self, *args, **kwargs):
|
@@ -31,9 +31,6 @@ from ..executors.slurm.sudo._subprocess_run_as_user import _mkdir_as_user
|
|
31
31
|
from ..filenames import WORKFLOW_LOG_FILENAME
|
32
32
|
from ..task_files import task_subfolder_name
|
33
33
|
from ._local import process_workflow as local_process_workflow
|
34
|
-
from ._local_experimental import (
|
35
|
-
process_workflow as local_experimental_process_workflow,
|
36
|
-
)
|
37
34
|
from ._slurm_ssh import process_workflow as slurm_ssh_process_workflow
|
38
35
|
from ._slurm_sudo import process_workflow as slurm_sudo_process_workflow
|
39
36
|
from .handle_failed_job import mark_last_wftask_as_failed
|
@@ -45,7 +42,6 @@ _backends = {}
|
|
45
42
|
_backends["local"] = local_process_workflow
|
46
43
|
_backends["slurm"] = slurm_sudo_process_workflow
|
47
44
|
_backends["slurm_ssh"] = slurm_ssh_process_workflow
|
48
|
-
_backends["local_experimental"] = local_experimental_process_workflow
|
49
45
|
|
50
46
|
|
51
47
|
def fail_job(
|
@@ -184,8 +180,6 @@ def submit_workflow(
|
|
184
180
|
# Define and create WORKFLOW_DIR_REMOTE
|
185
181
|
if FRACTAL_RUNNER_BACKEND == "local":
|
186
182
|
WORKFLOW_DIR_REMOTE = WORKFLOW_DIR_LOCAL
|
187
|
-
elif FRACTAL_RUNNER_BACKEND == "local_experimental":
|
188
|
-
WORKFLOW_DIR_REMOTE = WORKFLOW_DIR_LOCAL
|
189
183
|
elif FRACTAL_RUNNER_BACKEND == "slurm":
|
190
184
|
WORKFLOW_DIR_REMOTE = (
|
191
185
|
Path(user_cache_dir) / WORKFLOW_DIR_LOCAL.name
|
@@ -287,9 +281,6 @@ def submit_workflow(
|
|
287
281
|
if FRACTAL_RUNNER_BACKEND == "local":
|
288
282
|
process_workflow = local_process_workflow
|
289
283
|
backend_specific_kwargs = {}
|
290
|
-
elif FRACTAL_RUNNER_BACKEND == "local_experimental":
|
291
|
-
process_workflow = local_experimental_process_workflow
|
292
|
-
backend_specific_kwargs = {}
|
293
284
|
elif FRACTAL_RUNNER_BACKEND == "slurm":
|
294
285
|
process_workflow = slurm_sudo_process_workflow
|
295
286
|
backend_specific_kwargs = dict(
|
@@ -17,8 +17,8 @@ from typing import Literal
|
|
17
17
|
from typing import Optional
|
18
18
|
|
19
19
|
from pydantic import BaseModel
|
20
|
-
from pydantic import Extra
|
21
|
-
from pydantic
|
20
|
+
from pydantic import ConfigDict
|
21
|
+
from pydantic import ValidationError
|
22
22
|
|
23
23
|
from .....config import get_settings
|
24
24
|
from .....syringe import Inject
|
@@ -33,7 +33,7 @@ class LocalBackendConfigError(ValueError):
|
|
33
33
|
pass
|
34
34
|
|
35
35
|
|
36
|
-
class LocalBackendConfig(BaseModel
|
36
|
+
class LocalBackendConfig(BaseModel):
|
37
37
|
"""
|
38
38
|
Specifications of the local-backend configuration
|
39
39
|
|
@@ -44,7 +44,8 @@ class LocalBackendConfig(BaseModel, extra=Extra.forbid):
|
|
44
44
|
start at the same time.
|
45
45
|
"""
|
46
46
|
|
47
|
-
|
47
|
+
model_config = ConfigDict(extra="forbid")
|
48
|
+
parallel_tasks_per_job: Optional[int] = None
|
48
49
|
|
49
50
|
|
50
51
|
def get_default_local_backend_config():
|