fractal-server 2.12.0a1__py3-none-any.whl → 2.13.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fractal_server/__init__.py +1 -1
- fractal_server/__main__.py +17 -63
- fractal_server/app/models/security.py +9 -12
- fractal_server/app/models/v2/dataset.py +2 -2
- fractal_server/app/models/v2/job.py +11 -9
- fractal_server/app/models/v2/task.py +2 -3
- fractal_server/app/models/v2/task_group.py +6 -2
- fractal_server/app/models/v2/workflowtask.py +15 -8
- fractal_server/app/routes/admin/v2/task.py +1 -1
- fractal_server/app/routes/admin/v2/task_group.py +1 -1
- fractal_server/app/routes/api/v2/dataset.py +4 -4
- fractal_server/app/routes/api/v2/images.py +11 -23
- fractal_server/app/routes/api/v2/project.py +2 -2
- fractal_server/app/routes/api/v2/status.py +1 -1
- fractal_server/app/routes/api/v2/submit.py +8 -6
- fractal_server/app/routes/api/v2/task.py +4 -2
- fractal_server/app/routes/api/v2/task_collection.py +3 -2
- fractal_server/app/routes/api/v2/task_group.py +2 -2
- fractal_server/app/routes/api/v2/workflow.py +3 -3
- fractal_server/app/routes/api/v2/workflow_import.py +3 -3
- fractal_server/app/routes/api/v2/workflowtask.py +3 -1
- fractal_server/app/routes/auth/_aux_auth.py +4 -1
- fractal_server/app/routes/auth/current_user.py +3 -5
- fractal_server/app/routes/auth/group.py +1 -1
- fractal_server/app/routes/auth/users.py +2 -4
- fractal_server/app/routes/aux/_runner.py +1 -1
- fractal_server/app/routes/aux/validate_user_settings.py +1 -2
- fractal_server/app/runner/executors/_job_states.py +13 -0
- fractal_server/app/runner/executors/slurm/_slurm_config.py +26 -18
- fractal_server/app/runner/executors/slurm/ssh/__init__.py +0 -3
- fractal_server/app/runner/executors/slurm/ssh/_executor_wait_thread.py +31 -22
- fractal_server/app/runner/executors/slurm/ssh/_slurm_job.py +2 -6
- fractal_server/app/runner/executors/slurm/ssh/executor.py +35 -50
- fractal_server/app/runner/executors/slurm/sudo/__init__.py +0 -3
- fractal_server/app/runner/executors/slurm/sudo/_check_jobs_status.py +1 -2
- fractal_server/app/runner/executors/slurm/sudo/_executor_wait_thread.py +37 -47
- fractal_server/app/runner/executors/slurm/sudo/executor.py +77 -41
- fractal_server/app/runner/v2/__init__.py +0 -9
- fractal_server/app/runner/v2/_local/_local_config.py +5 -4
- fractal_server/app/runner/v2/_slurm_common/get_slurm_config.py +4 -4
- fractal_server/app/runner/v2/_slurm_sudo/__init__.py +2 -2
- fractal_server/app/runner/v2/deduplicate_list.py +1 -1
- fractal_server/app/runner/v2/runner.py +9 -4
- fractal_server/app/runner/v2/task_interface.py +15 -7
- fractal_server/app/schemas/_filter_validators.py +6 -3
- fractal_server/app/schemas/_validators.py +7 -5
- fractal_server/app/schemas/user.py +23 -18
- fractal_server/app/schemas/user_group.py +25 -11
- fractal_server/app/schemas/user_settings.py +31 -24
- fractal_server/app/schemas/v2/dataset.py +48 -35
- fractal_server/app/schemas/v2/dumps.py +16 -14
- fractal_server/app/schemas/v2/job.py +49 -29
- fractal_server/app/schemas/v2/manifest.py +32 -28
- fractal_server/app/schemas/v2/project.py +18 -8
- fractal_server/app/schemas/v2/task.py +86 -75
- fractal_server/app/schemas/v2/task_collection.py +41 -30
- fractal_server/app/schemas/v2/task_group.py +39 -20
- fractal_server/app/schemas/v2/workflow.py +24 -12
- fractal_server/app/schemas/v2/workflowtask.py +63 -61
- fractal_server/app/security/__init__.py +7 -4
- fractal_server/app/security/signup_email.py +21 -12
- fractal_server/config.py +123 -75
- fractal_server/images/models.py +18 -12
- fractal_server/main.py +13 -10
- fractal_server/migrations/env.py +16 -63
- fractal_server/tasks/v2/local/collect.py +9 -8
- fractal_server/tasks/v2/local/deactivate.py +3 -0
- fractal_server/tasks/v2/local/reactivate.py +3 -0
- fractal_server/tasks/v2/ssh/collect.py +8 -8
- fractal_server/tasks/v2/ssh/deactivate.py +3 -0
- fractal_server/tasks/v2/ssh/reactivate.py +9 -6
- fractal_server/tasks/v2/utils_background.py +1 -1
- fractal_server/tasks/v2/utils_database.py +1 -1
- {fractal_server-2.12.0a1.dist-info → fractal_server-2.13.0.dist-info}/METADATA +10 -11
- {fractal_server-2.12.0a1.dist-info → fractal_server-2.13.0.dist-info}/RECORD +78 -81
- fractal_server/app/runner/v2/_local_experimental/__init__.py +0 -121
- fractal_server/app/runner/v2/_local_experimental/_local_config.py +0 -108
- fractal_server/app/runner/v2/_local_experimental/_submit_setup.py +0 -42
- fractal_server/app/runner/v2/_local_experimental/executor.py +0 -157
- {fractal_server-2.12.0a1.dist-info → fractal_server-2.13.0.dist-info}/LICENSE +0 -0
- {fractal_server-2.12.0a1.dist-info → fractal_server-2.13.0.dist-info}/WHEEL +0 -0
- {fractal_server-2.12.0a1.dist-info → fractal_server-2.13.0.dist-info}/entry_points.txt +0 -0
@@ -58,11 +58,14 @@ async def _get_single_user_with_groups(
|
|
58
58
|
group_ids_names.insert(0, default_group)
|
59
59
|
else:
|
60
60
|
pass
|
61
|
+
oauth_accounts = [
|
62
|
+
oauth_account.model_dump() for oauth_account in user.oauth_accounts
|
63
|
+
]
|
61
64
|
|
62
65
|
return UserRead(
|
63
66
|
**user.model_dump(),
|
64
67
|
group_ids_names=group_ids_names,
|
65
|
-
oauth_accounts=
|
68
|
+
oauth_accounts=oauth_accounts,
|
66
69
|
)
|
67
70
|
|
68
71
|
|
@@ -57,14 +57,14 @@ async def patch_current_user(
|
|
57
57
|
Note: a user cannot patch their own password (as enforced within the
|
58
58
|
`UserUpdateStrict` schema).
|
59
59
|
"""
|
60
|
-
update = UserUpdate(**user_update.
|
60
|
+
update = UserUpdate(**user_update.model_dump(exclude_unset=True))
|
61
61
|
|
62
62
|
# NOTE: here it would be relevant to catch an `InvalidPasswordException`
|
63
63
|
# (from `fastapi_users.exceptions`), if we were to allow users change
|
64
64
|
# their own password
|
65
65
|
|
66
66
|
user = await user_manager.update(update, current_user, safe=True)
|
67
|
-
validated_user = schemas.model_validate(UserOAuth, user)
|
67
|
+
validated_user = schemas.model_validate(UserOAuth, user.model_dump())
|
68
68
|
|
69
69
|
patched_user = await db.get(
|
70
70
|
UserOAuth, validated_user.id, populate_existing=True
|
@@ -82,7 +82,6 @@ async def get_current_user_settings(
|
|
82
82
|
current_user: UserOAuth = Depends(current_active_user),
|
83
83
|
db: AsyncSession = Depends(get_async_db),
|
84
84
|
) -> UserSettingsReadStrict:
|
85
|
-
|
86
85
|
verify_user_has_settings(current_user)
|
87
86
|
user_settings = await db.get(UserSettings, current_user.user_settings_id)
|
88
87
|
return user_settings
|
@@ -96,13 +95,12 @@ async def patch_current_user_settings(
|
|
96
95
|
current_user: UserOAuth = Depends(current_active_user),
|
97
96
|
db: AsyncSession = Depends(get_async_db),
|
98
97
|
) -> UserSettingsReadStrict:
|
99
|
-
|
100
98
|
verify_user_has_settings(current_user)
|
101
99
|
current_user_settings = await db.get(
|
102
100
|
UserSettings, current_user.user_settings_id
|
103
101
|
)
|
104
102
|
|
105
|
-
for k, v in settings_update.
|
103
|
+
for k, v in settings_update.model_dump(exclude_unset=True).items():
|
106
104
|
setattr(current_user_settings, k, v)
|
107
105
|
|
108
106
|
db.add(current_user_settings)
|
@@ -194,7 +194,7 @@ async def patch_user_settings_bulk(
|
|
194
194
|
.where(LinkUserGroup.group_id == group_id)
|
195
195
|
)
|
196
196
|
settings_list = res.scalars().all()
|
197
|
-
update = settings_update.
|
197
|
+
update = settings_update.model_dump(exclude_unset=True)
|
198
198
|
for settings in settings_list:
|
199
199
|
for k, v in update.items():
|
200
200
|
setattr(settings, k, v)
|
@@ -75,7 +75,7 @@ async def patch_user(
|
|
75
75
|
safe=False,
|
76
76
|
request=None,
|
77
77
|
)
|
78
|
-
validated_user = schemas.model_validate(UserOAuth, user)
|
78
|
+
validated_user = schemas.model_validate(UserOAuth, user.model_dump())
|
79
79
|
patched_user = await db.get(
|
80
80
|
UserOAuth, validated_user.id, populate_existing=True
|
81
81
|
)
|
@@ -139,7 +139,6 @@ async def set_user_groups(
|
|
139
139
|
superuser: UserOAuth = Depends(current_active_superuser),
|
140
140
|
db: AsyncSession = Depends(get_async_db),
|
141
141
|
) -> UserRead:
|
142
|
-
|
143
142
|
# Preliminary check that all objects exist in the db
|
144
143
|
user = await _user_or_404(user_id=user_id, db=db)
|
145
144
|
target_group_ids = user_update.group_ids
|
@@ -209,7 +208,6 @@ async def get_user_settings(
|
|
209
208
|
superuser: UserOAuth = Depends(current_active_superuser),
|
210
209
|
db: AsyncSession = Depends(get_async_db),
|
211
210
|
) -> UserSettingsRead:
|
212
|
-
|
213
211
|
user = await _user_or_404(user_id=user_id, db=db)
|
214
212
|
verify_user_has_settings(user)
|
215
213
|
user_settings = await db.get(UserSettings, user.user_settings_id)
|
@@ -229,7 +227,7 @@ async def patch_user_settings(
|
|
229
227
|
verify_user_has_settings(user)
|
230
228
|
user_settings = await db.get(UserSettings, user.user_settings_id)
|
231
229
|
|
232
|
-
for k, v in settings_update.
|
230
|
+
for k, v in settings_update.model_dump(exclude_unset=True).items():
|
233
231
|
setattr(user_settings, k, v)
|
234
232
|
|
235
233
|
db.add(user_settings)
|
@@ -1,6 +1,5 @@
|
|
1
1
|
from fastapi import HTTPException
|
2
2
|
from fastapi import status
|
3
|
-
from pydantic import BaseModel
|
4
3
|
from pydantic import ValidationError
|
5
4
|
|
6
5
|
from fractal_server.app.db import AsyncSession
|
@@ -55,7 +54,7 @@ async def validate_user_settings(
|
|
55
54
|
UserSettingsValidationModel = SlurmSudoUserSettings
|
56
55
|
else:
|
57
56
|
# For other backends, we don't validate anything
|
58
|
-
|
57
|
+
return user_settings
|
59
58
|
|
60
59
|
try:
|
61
60
|
UserSettingsValidationModel(**user_settings.model_dump())
|
@@ -18,9 +18,9 @@ from typing import Optional
|
|
18
18
|
from typing import Union
|
19
19
|
|
20
20
|
from pydantic import BaseModel
|
21
|
-
from pydantic import
|
21
|
+
from pydantic import ConfigDict
|
22
22
|
from pydantic import Field
|
23
|
-
from pydantic
|
23
|
+
from pydantic import ValidationError
|
24
24
|
|
25
25
|
from .....config import get_settings
|
26
26
|
from .....logger import set_logger
|
@@ -37,7 +37,7 @@ class SlurmConfigError(ValueError):
|
|
37
37
|
pass
|
38
38
|
|
39
39
|
|
40
|
-
class _SlurmConfigSet(BaseModel
|
40
|
+
class _SlurmConfigSet(BaseModel):
|
41
41
|
"""
|
42
42
|
Options that can be set in `FRACTAL_SLURM_CONFIG_FILE` for the default/gpu
|
43
43
|
SLURM config. Only used as part of `SlurmConfigFile`.
|
@@ -54,19 +54,21 @@ class _SlurmConfigSet(BaseModel, extra=Extra.forbid):
|
|
54
54
|
extra_lines:
|
55
55
|
"""
|
56
56
|
|
57
|
-
|
58
|
-
cpus_per_task: Optional[int]
|
59
|
-
mem: Optional[Union[int, str]]
|
60
|
-
constraint: Optional[str]
|
61
|
-
gres: Optional[str]
|
62
|
-
time: Optional[str]
|
63
|
-
account: Optional[str]
|
64
|
-
extra_lines: Optional[list[str]]
|
65
|
-
pre_submission_commands: Optional[list[str]]
|
66
|
-
gpus: Optional[str]
|
57
|
+
model_config = ConfigDict(extra="forbid")
|
67
58
|
|
59
|
+
partition: Optional[str] = None
|
60
|
+
cpus_per_task: Optional[int] = None
|
61
|
+
mem: Optional[Union[int, str]] = None
|
62
|
+
constraint: Optional[str] = None
|
63
|
+
gres: Optional[str] = None
|
64
|
+
time: Optional[str] = None
|
65
|
+
account: Optional[str] = None
|
66
|
+
extra_lines: Optional[list[str]] = None
|
67
|
+
pre_submission_commands: Optional[list[str]] = None
|
68
|
+
gpus: Optional[str] = None
|
68
69
|
|
69
|
-
|
70
|
+
|
71
|
+
class _BatchingConfigSet(BaseModel):
|
70
72
|
"""
|
71
73
|
Options that can be set in `FRACTAL_SLURM_CONFIG_FILE` to configure the
|
72
74
|
batching strategy (that is, how to combine several tasks in a single SLURM
|
@@ -83,6 +85,8 @@ class _BatchingConfigSet(BaseModel, extra=Extra.forbid):
|
|
83
85
|
max_num_jobs:
|
84
86
|
"""
|
85
87
|
|
88
|
+
model_config = ConfigDict(extra="forbid")
|
89
|
+
|
86
90
|
target_cpus_per_job: int
|
87
91
|
max_cpus_per_job: int
|
88
92
|
target_mem_per_job: Union[int, str]
|
@@ -91,7 +95,7 @@ class _BatchingConfigSet(BaseModel, extra=Extra.forbid):
|
|
91
95
|
max_num_jobs: int
|
92
96
|
|
93
97
|
|
94
|
-
class SlurmConfigFile(BaseModel
|
98
|
+
class SlurmConfigFile(BaseModel):
|
95
99
|
"""
|
96
100
|
Specifications for the content of `FRACTAL_SLURM_CONFIG_FILE`
|
97
101
|
|
@@ -136,10 +140,12 @@ class SlurmConfigFile(BaseModel, extra=Extra.forbid):
|
|
136
140
|
directory.
|
137
141
|
"""
|
138
142
|
|
143
|
+
model_config = ConfigDict(extra="forbid")
|
144
|
+
|
139
145
|
default_slurm_config: _SlurmConfigSet
|
140
|
-
gpu_slurm_config: Optional[_SlurmConfigSet]
|
146
|
+
gpu_slurm_config: Optional[_SlurmConfigSet] = None
|
141
147
|
batching_config: _BatchingConfigSet
|
142
|
-
user_local_exports: Optional[dict[str, str]]
|
148
|
+
user_local_exports: Optional[dict[str, str]] = None
|
143
149
|
|
144
150
|
|
145
151
|
def load_slurm_config_file(
|
@@ -196,7 +202,7 @@ def load_slurm_config_file(
|
|
196
202
|
return obj
|
197
203
|
|
198
204
|
|
199
|
-
class SlurmConfig(BaseModel
|
205
|
+
class SlurmConfig(BaseModel):
|
200
206
|
"""
|
201
207
|
Abstraction for SLURM parameters
|
202
208
|
|
@@ -247,6 +253,8 @@ class SlurmConfig(BaseModel, extra=Extra.forbid):
|
|
247
253
|
command.
|
248
254
|
"""
|
249
255
|
|
256
|
+
model_config = ConfigDict(extra="forbid")
|
257
|
+
|
250
258
|
# Required SLURM parameters (note that the integer attributes are those
|
251
259
|
# that will need to scale up with the number of parallel tasks per job)
|
252
260
|
partition: str
|
@@ -1,10 +1,8 @@
|
|
1
1
|
import os
|
2
|
+
import threading
|
2
3
|
import time
|
3
4
|
import traceback
|
4
5
|
from itertools import count
|
5
|
-
from typing import Callable
|
6
|
-
|
7
|
-
from cfut import FileWaitThread
|
8
6
|
|
9
7
|
from ......logger import set_logger
|
10
8
|
from fractal_server.app.runner.exceptions import JobExecutionError
|
@@ -12,35 +10,46 @@ from fractal_server.app.runner.exceptions import JobExecutionError
|
|
12
10
|
logger = set_logger(__name__)
|
13
11
|
|
14
12
|
|
15
|
-
class
|
13
|
+
class FractalSlurmSSHWaitThread(threading.Thread):
|
16
14
|
"""
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
15
|
+
Thread that monitors a pool of SLURM jobs
|
16
|
+
|
17
|
+
This class is a custom re-implementation of the waiting thread class from:
|
18
|
+
|
19
|
+
> clusterfutures <https://github.com/sampsyo/clusterfutures>
|
20
|
+
> Original Copyright
|
21
|
+
> Copyright 2021 Adrian Sampson <asampson@cs.washington.edu>
|
22
|
+
> License: MIT
|
23
|
+
|
24
|
+
Attributes:
|
25
|
+
shutdown_file:
|
26
|
+
shutdown_callback:
|
27
|
+
slurm_poll_interval:
|
28
|
+
jobs_finished_callback:
|
29
|
+
active_job_ids:
|
30
|
+
shutdown:
|
31
|
+
lock:
|
29
32
|
"""
|
30
33
|
|
31
34
|
shutdown_file: str
|
32
|
-
shutdown_callback:
|
33
|
-
jobs_finished_callback: Callable
|
35
|
+
shutdown_callback: callable
|
34
36
|
slurm_poll_interval = 30
|
37
|
+
jobs_finished_callback: callable
|
35
38
|
active_job_ids: list[str]
|
39
|
+
shutdown: bool
|
40
|
+
_lock: threading.Lock
|
36
41
|
|
37
|
-
def __init__(self,
|
42
|
+
def __init__(self, callback: callable, interval=1):
|
38
43
|
"""
|
39
44
|
Init method
|
40
45
|
|
41
46
|
This method is executed on the main thread.
|
42
47
|
"""
|
43
|
-
|
48
|
+
threading.Thread.__init__(self, daemon=True)
|
49
|
+
self.callback = callback
|
50
|
+
self.interval = interval
|
51
|
+
self._lock = threading.Lock()
|
52
|
+
self.shutdown = False
|
44
53
|
self.active_job_ids = []
|
45
54
|
|
46
55
|
def wait(self, *, job_id: str):
|
@@ -53,7 +62,7 @@ class FractalSlurmWaitThread(FileWaitThread):
|
|
53
62
|
error_msg = "Cannot call `wait` method after executor shutdown."
|
54
63
|
logger.warning(error_msg)
|
55
64
|
raise JobExecutionError(info=error_msg)
|
56
|
-
with self.
|
65
|
+
with self._lock:
|
57
66
|
self.active_job_ids.append(job_id)
|
58
67
|
|
59
68
|
def check_shutdown(self):
|
@@ -109,7 +118,7 @@ class FractalSlurmWaitThread(FileWaitThread):
|
|
109
118
|
pass
|
110
119
|
return
|
111
120
|
if ind % skip == 0:
|
112
|
-
with self.
|
121
|
+
with self._lock:
|
113
122
|
try:
|
114
123
|
self.check_jobs()
|
115
124
|
except Exception: # nosec
|
@@ -1,8 +1,7 @@
|
|
1
|
+
import uuid
|
1
2
|
from pathlib import Path
|
2
3
|
from typing import Optional
|
3
4
|
|
4
|
-
from cfut.util import random_string
|
5
|
-
|
6
5
|
from fractal_server.app.runner.executors.slurm._slurm_config import (
|
7
6
|
SlurmConfig,
|
8
7
|
)
|
@@ -88,7 +87,6 @@ class SlurmJob:
|
|
88
87
|
self,
|
89
88
|
num_tasks_tot: int,
|
90
89
|
slurm_config: SlurmConfig,
|
91
|
-
workflow_task_file_prefix: Optional[str] = None,
|
92
90
|
slurm_file_prefix: Optional[str] = None,
|
93
91
|
wftask_file_prefixes: Optional[tuple[str, ...]] = None,
|
94
92
|
single_task_submission: bool = False,
|
@@ -107,9 +105,7 @@ class SlurmJob:
|
|
107
105
|
)
|
108
106
|
else:
|
109
107
|
self.wftask_file_prefixes = wftask_file_prefixes
|
110
|
-
self.workerids = tuple(
|
111
|
-
random_string() for i in range(self.num_tasks_tot)
|
112
|
-
)
|
108
|
+
self.workerids = tuple(uuid.uuid4() for i in range(self.num_tasks_tot))
|
113
109
|
self.slurm_config = slurm_config
|
114
110
|
|
115
111
|
def get_clean_output_pickle_files(self) -> tuple[str, ...]:
|
@@ -1,20 +1,9 @@
|
|
1
|
-
# This adapts clusterfutures <https://github.com/sampsyo/clusterfutures>
|
2
|
-
# Original Copyright
|
3
|
-
# Copyright 2021 Adrian Sampson <asampson@cs.washington.edu>
|
4
|
-
# License: MIT
|
5
|
-
#
|
6
|
-
# Modified by:
|
7
|
-
# Jacopo Nespolo <jacopo.nespolo@exact-lab.it>
|
8
|
-
# Tommaso Comparin <tommaso.comparin@exact-lab.it>
|
9
|
-
# Marco Franzon <marco.franzon@exact-lab.it>
|
10
|
-
#
|
11
|
-
# Copyright 2022 (C) Friedrich Miescher Institute for Biomedical Research and
|
12
|
-
# University of Zurich
|
13
1
|
import json
|
14
2
|
import math
|
15
3
|
import sys
|
16
4
|
import threading
|
17
5
|
import time
|
6
|
+
from concurrent.futures import Executor
|
18
7
|
from concurrent.futures import Future
|
19
8
|
from concurrent.futures import InvalidStateError
|
20
9
|
from copy import copy
|
@@ -25,18 +14,18 @@ from typing import Optional
|
|
25
14
|
from typing import Sequence
|
26
15
|
|
27
16
|
import cloudpickle
|
28
|
-
from cfut import SlurmExecutor
|
29
17
|
|
30
18
|
from ....filenames import SHUTDOWN_FILENAME
|
31
19
|
from ....task_files import get_task_file_paths
|
32
20
|
from ....task_files import TaskFiles
|
33
21
|
from ....versions import get_versions
|
22
|
+
from ..._job_states import STATES_FINISHED
|
34
23
|
from ...slurm._slurm_config import SlurmConfig
|
35
24
|
from .._batching import heuristics
|
36
25
|
from ..utils_executors import get_pickle_file_path
|
37
26
|
from ..utils_executors import get_slurm_file_path
|
38
27
|
from ..utils_executors import get_slurm_script_file_path
|
39
|
-
from ._executor_wait_thread import
|
28
|
+
from ._executor_wait_thread import FractalSlurmSSHWaitThread
|
40
29
|
from fractal_server.app.runner.components import _COMPONENT_KEY_
|
41
30
|
from fractal_server.app.runner.compress_folder import compress_folder
|
42
31
|
from fractal_server.app.runner.exceptions import JobExecutionError
|
@@ -48,25 +37,31 @@ from fractal_server.logger import set_logger
|
|
48
37
|
from fractal_server.ssh._fabric import FractalSSH
|
49
38
|
from fractal_server.syringe import Inject
|
50
39
|
|
40
|
+
|
51
41
|
logger = set_logger(__name__)
|
52
42
|
|
53
43
|
|
54
|
-
class FractalSlurmSSHExecutor(
|
44
|
+
class FractalSlurmSSHExecutor(Executor):
|
55
45
|
"""
|
56
|
-
|
46
|
+
Executor to submit SLURM jobs via SSH
|
47
|
+
|
48
|
+
This class is a custom re-implementation of the SLURM executor from
|
49
|
+
|
50
|
+
> clusterfutures <https://github.com/sampsyo/clusterfutures>
|
51
|
+
> Original Copyright
|
52
|
+
> Copyright 2021 Adrian Sampson <asampson@cs.washington.edu>
|
53
|
+
> License: MIT
|
57
54
|
|
58
|
-
FIXME: docstring
|
59
55
|
|
60
56
|
Attributes:
|
61
57
|
fractal_ssh: FractalSSH connection with custom lock
|
62
|
-
shutdown_file:
|
63
|
-
python_remote: Equal to `settings.FRACTAL_SLURM_WORKER_PYTHON`
|
64
|
-
wait_thread_cls: Class for waiting thread
|
65
|
-
keep_pickle_files:
|
66
58
|
workflow_dir_local:
|
67
59
|
Directory for both the cfut/SLURM and fractal-server files and logs
|
68
60
|
workflow_dir_remote:
|
69
61
|
Directory for both the cfut/SLURM and fractal-server files and logs
|
62
|
+
shutdown_file:
|
63
|
+
python_remote: Equal to `settings.FRACTAL_SLURM_WORKER_PYTHON`
|
64
|
+
wait_thread_cls: Class for waiting thread
|
70
65
|
common_script_lines:
|
71
66
|
Arbitrary script lines that will always be included in the
|
72
67
|
sbatch script
|
@@ -83,11 +78,10 @@ class FractalSlurmSSHExecutor(SlurmExecutor):
|
|
83
78
|
shutdown_file: str
|
84
79
|
python_remote: str
|
85
80
|
|
86
|
-
wait_thread_cls =
|
87
|
-
keep_pickle_files: bool
|
81
|
+
wait_thread_cls = FractalSlurmSSHWaitThread
|
88
82
|
|
89
83
|
common_script_lines: list[str]
|
90
|
-
slurm_account: Optional[str]
|
84
|
+
slurm_account: Optional[str] = None
|
91
85
|
|
92
86
|
jobs: dict[str, tuple[Future, SlurmJob]]
|
93
87
|
map_jobid_to_slurm_files_local: dict[str, tuple[str, str, str]]
|
@@ -100,8 +94,6 @@ class FractalSlurmSSHExecutor(SlurmExecutor):
|
|
100
94
|
# Folders and files
|
101
95
|
workflow_dir_local: Path,
|
102
96
|
workflow_dir_remote: Path,
|
103
|
-
# Runner options
|
104
|
-
keep_pickle_files: bool = False,
|
105
97
|
# Monitoring options
|
106
98
|
slurm_poll_interval: Optional[int] = None,
|
107
99
|
# SLURM submission script options
|
@@ -120,7 +112,6 @@ class FractalSlurmSSHExecutor(SlurmExecutor):
|
|
120
112
|
fractal_ssh:
|
121
113
|
workflow_dir_local:
|
122
114
|
workflow_dir_remote:
|
123
|
-
keep_pickle_files:
|
124
115
|
slurm_poll_interval:
|
125
116
|
common_script_lines:
|
126
117
|
slurm_account:
|
@@ -194,7 +185,6 @@ class FractalSlurmSSHExecutor(SlurmExecutor):
|
|
194
185
|
raise e
|
195
186
|
|
196
187
|
# Set/initialize some more options
|
197
|
-
self.keep_pickle_files = keep_pickle_files
|
198
188
|
self.map_jobid_to_slurm_files_local = {}
|
199
189
|
|
200
190
|
def _validate_common_script_lines(self):
|
@@ -901,12 +891,11 @@ class FractalSlurmSSHExecutor(SlurmExecutor):
|
|
901
891
|
pass
|
902
892
|
for job_id in remaining_job_ids:
|
903
893
|
self._cleanup(job_id)
|
904
|
-
|
905
|
-
for
|
906
|
-
|
907
|
-
|
908
|
-
|
909
|
-
path.unlink()
|
894
|
+
for job in remaining_jobs:
|
895
|
+
for path in job.output_pickle_files_local:
|
896
|
+
path.unlink()
|
897
|
+
for path in job.input_pickle_files_local:
|
898
|
+
path.unlink()
|
910
899
|
|
911
900
|
def _completion(self, job_ids: list[str]) -> None:
|
912
901
|
"""
|
@@ -1001,8 +990,7 @@ class FractalSlurmSSHExecutor(SlurmExecutor):
|
|
1001
990
|
f"Future {future} (SLURM job ID: {job_id}) "
|
1002
991
|
"was already cancelled."
|
1003
992
|
)
|
1004
|
-
|
1005
|
-
in_path.unlink()
|
993
|
+
in_path.unlink()
|
1006
994
|
self._cleanup(job_id)
|
1007
995
|
self._handle_remaining_jobs(
|
1008
996
|
remaining_futures=remaining_futures,
|
@@ -1062,17 +1050,15 @@ class FractalSlurmSSHExecutor(SlurmExecutor):
|
|
1062
1050
|
remaining_job_ids=remaining_job_ids,
|
1063
1051
|
)
|
1064
1052
|
return
|
1065
|
-
|
1066
|
-
out_path.unlink()
|
1053
|
+
out_path.unlink()
|
1067
1054
|
except InvalidStateError:
|
1068
1055
|
logger.warning(
|
1069
1056
|
f"Future {future} (SLURM job ID: {job_id}) was "
|
1070
1057
|
"already cancelled, exit from "
|
1071
1058
|
"FractalSlurmSSHExecutor._completion."
|
1072
1059
|
)
|
1073
|
-
|
1074
|
-
|
1075
|
-
in_path.unlink()
|
1060
|
+
out_path.unlink()
|
1061
|
+
in_path.unlink()
|
1076
1062
|
|
1077
1063
|
self._cleanup(job_id)
|
1078
1064
|
self._handle_remaining_jobs(
|
@@ -1082,8 +1068,7 @@ class FractalSlurmSSHExecutor(SlurmExecutor):
|
|
1082
1068
|
return
|
1083
1069
|
|
1084
1070
|
# Clean up input pickle file
|
1085
|
-
|
1086
|
-
in_path.unlink()
|
1071
|
+
in_path.unlink()
|
1087
1072
|
self._cleanup(job_id)
|
1088
1073
|
if job.single_task_submission:
|
1089
1074
|
future.set_result(outputs[0])
|
@@ -1170,7 +1155,7 @@ class FractalSlurmSSHExecutor(SlurmExecutor):
|
|
1170
1155
|
Path(tarfile_path_local).unlink()
|
1171
1156
|
|
1172
1157
|
t_1 = time.perf_counter()
|
1173
|
-
logger.info("[_get_subfolder_sftp] End -
|
1158
|
+
logger.info(f"[_get_subfolder_sftp] End - elapsed: {t_1 - t_0:.3f} s")
|
1174
1159
|
|
1175
1160
|
def _prepare_sbatch_script(
|
1176
1161
|
self,
|
@@ -1210,8 +1195,10 @@ class FractalSlurmSSHExecutor(SlurmExecutor):
|
|
1210
1195
|
script_lines = slurm_config.sort_script_lines(script_lines)
|
1211
1196
|
logger.debug(script_lines)
|
1212
1197
|
|
1213
|
-
# Always print output of `pwd`
|
1214
|
-
script_lines.append(
|
1198
|
+
# Always print output of `uname -n` and `pwd`
|
1199
|
+
script_lines.append(
|
1200
|
+
'"Hostname: `uname -n`; current directory: `pwd`"\n'
|
1201
|
+
)
|
1215
1202
|
|
1216
1203
|
# Complete script preamble
|
1217
1204
|
script_lines.append("\n")
|
@@ -1267,7 +1254,7 @@ class FractalSlurmSSHExecutor(SlurmExecutor):
|
|
1267
1254
|
logger.debug("Executor shutdown: end")
|
1268
1255
|
|
1269
1256
|
def _stop_and_join_wait_thread(self):
|
1270
|
-
self.wait_thread.
|
1257
|
+
self.wait_thread.shutdown = True
|
1271
1258
|
self.wait_thread.join()
|
1272
1259
|
|
1273
1260
|
def __exit__(self, *args, **kwargs):
|
@@ -1304,8 +1291,6 @@ class FractalSlurmSSHExecutor(SlurmExecutor):
|
|
1304
1291
|
(released under the MIT licence)
|
1305
1292
|
"""
|
1306
1293
|
|
1307
|
-
from cfut.slurm import STATES_FINISHED
|
1308
|
-
|
1309
1294
|
logger.debug(
|
1310
1295
|
f"[FractalSlurmSSHExecutor._jobs_finished] START ({job_ids=})"
|
1311
1296
|
)
|
@@ -1396,6 +1381,6 @@ class FractalSlurmSSHExecutor(SlurmExecutor):
|
|
1396
1381
|
t_end_handshake = time.perf_counter()
|
1397
1382
|
logger.info(
|
1398
1383
|
"[FractalSlurmSSHExecutor.ssh_handshake] END"
|
1399
|
-
f" - elapsed: {t_end_handshake-t_start_handshake:.3f} s"
|
1384
|
+
f" - elapsed: {t_end_handshake - t_start_handshake:.3f} s"
|
1400
1385
|
)
|
1401
1386
|
return remote_versions
|