fractal-server 2.13.1__py3-none-any.whl → 2.14.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fractal_server/__init__.py +1 -1
- fractal_server/__main__.py +3 -1
- fractal_server/app/models/linkusergroup.py +6 -2
- fractal_server/app/models/v2/__init__.py +7 -1
- fractal_server/app/models/v2/dataset.py +1 -11
- fractal_server/app/models/v2/history.py +78 -0
- fractal_server/app/models/v2/job.py +10 -3
- fractal_server/app/models/v2/task_group.py +2 -2
- fractal_server/app/models/v2/workflow.py +1 -1
- fractal_server/app/models/v2/workflowtask.py +1 -1
- fractal_server/app/routes/admin/v2/accounting.py +18 -28
- fractal_server/app/routes/admin/v2/task.py +1 -1
- fractal_server/app/routes/admin/v2/task_group.py +0 -17
- fractal_server/app/routes/api/__init__.py +1 -1
- fractal_server/app/routes/api/v2/__init__.py +8 -2
- fractal_server/app/routes/api/v2/_aux_functions.py +66 -0
- fractal_server/app/routes/api/v2/_aux_functions_history.py +166 -0
- fractal_server/app/routes/api/v2/dataset.py +0 -17
- fractal_server/app/routes/api/v2/history.py +544 -0
- fractal_server/app/routes/api/v2/images.py +31 -43
- fractal_server/app/routes/api/v2/job.py +30 -0
- fractal_server/app/routes/api/v2/project.py +1 -53
- fractal_server/app/routes/api/v2/{status.py → status_legacy.py} +6 -6
- fractal_server/app/routes/api/v2/submit.py +16 -14
- fractal_server/app/routes/api/v2/task.py +3 -10
- fractal_server/app/routes/api/v2/task_collection_custom.py +4 -9
- fractal_server/app/routes/api/v2/task_group.py +0 -17
- fractal_server/app/routes/api/v2/verify_image_types.py +61 -0
- fractal_server/app/routes/api/v2/workflow.py +28 -69
- fractal_server/app/routes/api/v2/workflowtask.py +53 -50
- fractal_server/app/routes/auth/group.py +0 -16
- fractal_server/app/routes/auth/oauth.py +5 -3
- fractal_server/app/routes/pagination.py +47 -0
- fractal_server/app/runner/components.py +0 -3
- fractal_server/app/runner/compress_folder.py +57 -29
- fractal_server/app/runner/exceptions.py +4 -0
- fractal_server/app/runner/executors/base_runner.py +157 -0
- fractal_server/app/runner/{v2/_local/_local_config.py → executors/local/get_local_config.py} +7 -9
- fractal_server/app/runner/executors/local/runner.py +248 -0
- fractal_server/app/runner/executors/{slurm → slurm_common}/_batching.py +1 -1
- fractal_server/app/runner/executors/{slurm → slurm_common}/_slurm_config.py +9 -7
- fractal_server/app/runner/executors/slurm_common/base_slurm_runner.py +868 -0
- fractal_server/app/runner/{v2/_slurm_common → executors/slurm_common}/get_slurm_config.py +48 -17
- fractal_server/app/runner/executors/{slurm → slurm_common}/remote.py +36 -47
- fractal_server/app/runner/executors/slurm_common/slurm_job_task_models.py +134 -0
- fractal_server/app/runner/executors/slurm_ssh/runner.py +268 -0
- fractal_server/app/runner/executors/slurm_sudo/__init__.py +0 -0
- fractal_server/app/runner/executors/{slurm/sudo → slurm_sudo}/_subprocess_run_as_user.py +2 -83
- fractal_server/app/runner/executors/slurm_sudo/runner.py +193 -0
- fractal_server/app/runner/extract_archive.py +1 -3
- fractal_server/app/runner/task_files.py +134 -87
- fractal_server/app/runner/v2/__init__.py +0 -399
- fractal_server/app/runner/v2/_local.py +88 -0
- fractal_server/app/runner/v2/{_slurm_ssh/__init__.py → _slurm_ssh.py} +20 -19
- fractal_server/app/runner/v2/{_slurm_sudo/__init__.py → _slurm_sudo.py} +17 -15
- fractal_server/app/runner/v2/db_tools.py +119 -0
- fractal_server/app/runner/v2/runner.py +206 -95
- fractal_server/app/runner/v2/runner_functions.py +488 -187
- fractal_server/app/runner/v2/runner_functions_low_level.py +40 -43
- fractal_server/app/runner/v2/submit_workflow.py +358 -0
- fractal_server/app/runner/v2/task_interface.py +31 -0
- fractal_server/app/schemas/_validators.py +13 -24
- fractal_server/app/schemas/user.py +10 -7
- fractal_server/app/schemas/user_settings.py +9 -21
- fractal_server/app/schemas/v2/__init__.py +9 -1
- fractal_server/app/schemas/v2/dataset.py +12 -94
- fractal_server/app/schemas/v2/dumps.py +26 -9
- fractal_server/app/schemas/v2/history.py +80 -0
- fractal_server/app/schemas/v2/job.py +15 -8
- fractal_server/app/schemas/v2/manifest.py +14 -7
- fractal_server/app/schemas/v2/project.py +9 -7
- fractal_server/app/schemas/v2/status_legacy.py +35 -0
- fractal_server/app/schemas/v2/task.py +72 -77
- fractal_server/app/schemas/v2/task_collection.py +14 -32
- fractal_server/app/schemas/v2/task_group.py +10 -9
- fractal_server/app/schemas/v2/workflow.py +10 -11
- fractal_server/app/schemas/v2/workflowtask.py +2 -21
- fractal_server/app/security/__init__.py +3 -3
- fractal_server/app/security/signup_email.py +2 -2
- fractal_server/config.py +41 -46
- fractal_server/images/tools.py +23 -0
- fractal_server/migrations/versions/47351f8c7ebc_drop_dataset_filters.py +50 -0
- fractal_server/migrations/versions/9db60297b8b2_set_ondelete.py +250 -0
- fractal_server/migrations/versions/c90a7c76e996_job_id_in_history_run.py +41 -0
- fractal_server/migrations/versions/e81103413827_add_job_type_filters.py +36 -0
- fractal_server/migrations/versions/f37aceb45062_make_historyunit_logfile_required.py +39 -0
- fractal_server/migrations/versions/fbce16ff4e47_new_history_items.py +120 -0
- fractal_server/ssh/_fabric.py +28 -14
- fractal_server/tasks/v2/local/collect.py +2 -2
- fractal_server/tasks/v2/ssh/collect.py +2 -2
- fractal_server/tasks/v2/templates/2_pip_install.sh +1 -1
- fractal_server/tasks/v2/templates/4_pip_show.sh +1 -1
- fractal_server/tasks/v2/utils_background.py +0 -19
- fractal_server/tasks/v2/utils_database.py +30 -17
- fractal_server/tasks/v2/utils_templates.py +6 -0
- {fractal_server-2.13.1.dist-info → fractal_server-2.14.0.dist-info}/METADATA +4 -4
- {fractal_server-2.13.1.dist-info → fractal_server-2.14.0.dist-info}/RECORD +106 -96
- {fractal_server-2.13.1.dist-info → fractal_server-2.14.0.dist-info}/WHEEL +1 -1
- fractal_server/app/runner/executors/slurm/ssh/_executor_wait_thread.py +0 -126
- fractal_server/app/runner/executors/slurm/ssh/_slurm_job.py +0 -116
- fractal_server/app/runner/executors/slurm/ssh/executor.py +0 -1386
- fractal_server/app/runner/executors/slurm/sudo/_check_jobs_status.py +0 -71
- fractal_server/app/runner/executors/slurm/sudo/_executor_wait_thread.py +0 -130
- fractal_server/app/runner/executors/slurm/sudo/executor.py +0 -1281
- fractal_server/app/runner/v2/_local/__init__.py +0 -132
- fractal_server/app/runner/v2/_local/_submit_setup.py +0 -52
- fractal_server/app/runner/v2/_local/executor.py +0 -100
- fractal_server/app/runner/v2/_slurm_ssh/_submit_setup.py +0 -83
- fractal_server/app/runner/v2/_slurm_sudo/_submit_setup.py +0 -83
- fractal_server/app/runner/v2/handle_failed_job.py +0 -59
- fractal_server/app/schemas/v2/status.py +0 -16
- /fractal_server/app/{runner/executors/slurm → history}/__init__.py +0 -0
- /fractal_server/app/runner/executors/{slurm/ssh → local}/__init__.py +0 -0
- /fractal_server/app/runner/executors/{slurm/sudo → slurm_common}/__init__.py +0 -0
- /fractal_server/app/runner/executors/{_job_states.py → slurm_common/_job_states.py} +0 -0
- /fractal_server/app/runner/executors/{slurm → slurm_common}/utils_executors.py +0 -0
- /fractal_server/app/runner/{v2/_slurm_common → executors/slurm_ssh}/__init__.py +0 -0
- {fractal_server-2.13.1.dist-info → fractal_server-2.14.0.dist-info}/LICENSE +0 -0
- {fractal_server-2.13.1.dist-info → fractal_server-2.14.0.dist-info}/entry_points.txt +0 -0
@@ -1,399 +0,0 @@
|
|
1
|
-
"""
|
2
|
-
Runner backend subsystem root V2
|
3
|
-
|
4
|
-
This module is the single entry point to the runner backend subsystem V2.
|
5
|
-
Other subystems should only import this module and not its submodules or
|
6
|
-
the individual backends.
|
7
|
-
"""
|
8
|
-
import os
|
9
|
-
import traceback
|
10
|
-
from pathlib import Path
|
11
|
-
from typing import Optional
|
12
|
-
|
13
|
-
from sqlalchemy.orm import Session as DBSyncSession
|
14
|
-
|
15
|
-
from ....config import get_settings
|
16
|
-
from ....logger import get_logger
|
17
|
-
from ....logger import reset_logger_handlers
|
18
|
-
from ....logger import set_logger
|
19
|
-
from ....ssh._fabric import FractalSSH
|
20
|
-
from ....syringe import Inject
|
21
|
-
from ....utils import get_timestamp
|
22
|
-
from ....zip_tools import _zip_folder_to_file_and_remove
|
23
|
-
from ...db import DB
|
24
|
-
from ...models.v2 import DatasetV2
|
25
|
-
from ...models.v2 import JobV2
|
26
|
-
from ...models.v2 import WorkflowV2
|
27
|
-
from ...schemas.v2 import JobStatusTypeV2
|
28
|
-
from ..exceptions import JobExecutionError
|
29
|
-
from ..exceptions import TaskExecutionError
|
30
|
-
from ..executors.slurm.sudo._subprocess_run_as_user import _mkdir_as_user
|
31
|
-
from ..filenames import WORKFLOW_LOG_FILENAME
|
32
|
-
from ..task_files import task_subfolder_name
|
33
|
-
from ._local import process_workflow as local_process_workflow
|
34
|
-
from ._slurm_ssh import process_workflow as slurm_ssh_process_workflow
|
35
|
-
from ._slurm_sudo import process_workflow as slurm_sudo_process_workflow
|
36
|
-
from .handle_failed_job import mark_last_wftask_as_failed
|
37
|
-
from fractal_server import __VERSION__
|
38
|
-
from fractal_server.app.models import UserSettings
|
39
|
-
|
40
|
-
|
41
|
-
_backends = {}
|
42
|
-
_backends["local"] = local_process_workflow
|
43
|
-
_backends["slurm"] = slurm_sudo_process_workflow
|
44
|
-
_backends["slurm_ssh"] = slurm_ssh_process_workflow
|
45
|
-
|
46
|
-
|
47
|
-
def fail_job(
|
48
|
-
*,
|
49
|
-
db: DBSyncSession,
|
50
|
-
job: JobV2,
|
51
|
-
log_msg: str,
|
52
|
-
logger_name: str,
|
53
|
-
emit_log: bool = False,
|
54
|
-
) -> None:
|
55
|
-
logger = get_logger(logger_name=logger_name)
|
56
|
-
if emit_log:
|
57
|
-
logger.error(log_msg)
|
58
|
-
reset_logger_handlers(logger)
|
59
|
-
job.status = JobStatusTypeV2.FAILED
|
60
|
-
job.end_timestamp = get_timestamp()
|
61
|
-
job.log = log_msg
|
62
|
-
db.merge(job)
|
63
|
-
db.commit()
|
64
|
-
db.close()
|
65
|
-
return
|
66
|
-
|
67
|
-
|
68
|
-
def submit_workflow(
|
69
|
-
*,
|
70
|
-
workflow_id: int,
|
71
|
-
dataset_id: int,
|
72
|
-
job_id: int,
|
73
|
-
user_id: int,
|
74
|
-
user_settings: UserSettings,
|
75
|
-
worker_init: Optional[str] = None,
|
76
|
-
slurm_user: Optional[str] = None,
|
77
|
-
user_cache_dir: Optional[str] = None,
|
78
|
-
fractal_ssh: Optional[FractalSSH] = None,
|
79
|
-
) -> None:
|
80
|
-
"""
|
81
|
-
Prepares a workflow and applies it to a dataset
|
82
|
-
|
83
|
-
This function wraps the process_workflow one, which is different for each
|
84
|
-
backend (e.g. local or slurm backend).
|
85
|
-
|
86
|
-
Args:
|
87
|
-
workflow_id:
|
88
|
-
ID of the workflow being applied
|
89
|
-
dataset_id:
|
90
|
-
Dataset ID
|
91
|
-
job_id:
|
92
|
-
Id of the job record which stores the state for the current
|
93
|
-
workflow application.
|
94
|
-
user_id:
|
95
|
-
User ID.
|
96
|
-
worker_init:
|
97
|
-
Custom executor parameters that get parsed before the execution of
|
98
|
-
each task.
|
99
|
-
user_cache_dir:
|
100
|
-
Cache directory (namely a path where the user can write); for the
|
101
|
-
slurm backend, this is used as a base directory for
|
102
|
-
`job.working_dir_user`.
|
103
|
-
slurm_user:
|
104
|
-
The username to impersonate for the workflow execution, for the
|
105
|
-
slurm backend.
|
106
|
-
"""
|
107
|
-
# Declare runner backend and set `process_workflow` function
|
108
|
-
settings = Inject(get_settings)
|
109
|
-
FRACTAL_RUNNER_BACKEND = settings.FRACTAL_RUNNER_BACKEND
|
110
|
-
logger_name = f"WF{workflow_id}_job{job_id}"
|
111
|
-
logger = set_logger(logger_name=logger_name)
|
112
|
-
|
113
|
-
with next(DB.get_sync_db()) as db_sync:
|
114
|
-
try:
|
115
|
-
job: Optional[JobV2] = db_sync.get(JobV2, job_id)
|
116
|
-
dataset: Optional[DatasetV2] = db_sync.get(DatasetV2, dataset_id)
|
117
|
-
workflow: Optional[WorkflowV2] = db_sync.get(
|
118
|
-
WorkflowV2, workflow_id
|
119
|
-
)
|
120
|
-
except Exception as e:
|
121
|
-
logger.error(
|
122
|
-
f"Error conneting to the database. Original error: {str(e)}"
|
123
|
-
)
|
124
|
-
reset_logger_handlers(logger)
|
125
|
-
return
|
126
|
-
|
127
|
-
if job is None:
|
128
|
-
logger.error(f"JobV2 {job_id} does not exist")
|
129
|
-
reset_logger_handlers(logger)
|
130
|
-
return
|
131
|
-
if dataset is None or workflow is None:
|
132
|
-
log_msg = ""
|
133
|
-
if not dataset:
|
134
|
-
log_msg += f"Cannot fetch dataset {dataset_id} from database\n"
|
135
|
-
if not workflow:
|
136
|
-
log_msg += (
|
137
|
-
f"Cannot fetch workflow {workflow_id} from database\n"
|
138
|
-
)
|
139
|
-
fail_job(
|
140
|
-
db=db_sync, job=job, log_msg=log_msg, logger_name=logger_name
|
141
|
-
)
|
142
|
-
return
|
143
|
-
|
144
|
-
# Declare runner backend and set `process_workflow` function
|
145
|
-
settings = Inject(get_settings)
|
146
|
-
FRACTAL_RUNNER_BACKEND = settings.FRACTAL_RUNNER_BACKEND
|
147
|
-
try:
|
148
|
-
process_workflow = _backends[settings.FRACTAL_RUNNER_BACKEND]
|
149
|
-
except KeyError as e:
|
150
|
-
fail_job(
|
151
|
-
db=db_sync,
|
152
|
-
job=job,
|
153
|
-
log_msg=(
|
154
|
-
f"Invalid {FRACTAL_RUNNER_BACKEND=}.\n"
|
155
|
-
f"Original KeyError: {str(e)}"
|
156
|
-
),
|
157
|
-
logger_name=logger_name,
|
158
|
-
emit_log=True,
|
159
|
-
)
|
160
|
-
return
|
161
|
-
|
162
|
-
# Define and create server-side working folder
|
163
|
-
WORKFLOW_DIR_LOCAL = Path(job.working_dir)
|
164
|
-
if WORKFLOW_DIR_LOCAL.exists():
|
165
|
-
fail_job(
|
166
|
-
db=db_sync,
|
167
|
-
job=job,
|
168
|
-
log_msg=f"Workflow dir {WORKFLOW_DIR_LOCAL} already exists.",
|
169
|
-
logger_name=logger_name,
|
170
|
-
emit_log=True,
|
171
|
-
)
|
172
|
-
return
|
173
|
-
|
174
|
-
try:
|
175
|
-
# Create WORKFLOW_DIR_LOCAL
|
176
|
-
if FRACTAL_RUNNER_BACKEND == "slurm":
|
177
|
-
original_umask = os.umask(0)
|
178
|
-
WORKFLOW_DIR_LOCAL.mkdir(parents=True, mode=0o755)
|
179
|
-
os.umask(original_umask)
|
180
|
-
else:
|
181
|
-
WORKFLOW_DIR_LOCAL.mkdir(parents=True)
|
182
|
-
|
183
|
-
# Define and create WORKFLOW_DIR_REMOTE
|
184
|
-
if FRACTAL_RUNNER_BACKEND == "local":
|
185
|
-
WORKFLOW_DIR_REMOTE = WORKFLOW_DIR_LOCAL
|
186
|
-
elif FRACTAL_RUNNER_BACKEND == "slurm":
|
187
|
-
WORKFLOW_DIR_REMOTE = (
|
188
|
-
Path(user_cache_dir) / WORKFLOW_DIR_LOCAL.name
|
189
|
-
)
|
190
|
-
_mkdir_as_user(
|
191
|
-
folder=str(WORKFLOW_DIR_REMOTE), user=slurm_user
|
192
|
-
)
|
193
|
-
elif FRACTAL_RUNNER_BACKEND == "slurm_ssh":
|
194
|
-
# Folder creation is deferred to _process_workflow
|
195
|
-
WORKFLOW_DIR_REMOTE = (
|
196
|
-
Path(user_settings.ssh_jobs_dir) / WORKFLOW_DIR_LOCAL.name
|
197
|
-
)
|
198
|
-
else:
|
199
|
-
logger.error(
|
200
|
-
"Invalid FRACTAL_RUNNER_BACKEND="
|
201
|
-
f"{settings.FRACTAL_RUNNER_BACKEND}."
|
202
|
-
)
|
203
|
-
|
204
|
-
# Create all tasks subfolders
|
205
|
-
for order in range(job.first_task_index, job.last_task_index + 1):
|
206
|
-
this_wftask = workflow.task_list[order]
|
207
|
-
task_name = this_wftask.task.name
|
208
|
-
subfolder_name = task_subfolder_name(
|
209
|
-
order=order,
|
210
|
-
task_name=task_name,
|
211
|
-
)
|
212
|
-
if FRACTAL_RUNNER_BACKEND == "slurm":
|
213
|
-
# Create local subfolder (with 755) and remote one
|
214
|
-
# (via `sudo -u`)
|
215
|
-
original_umask = os.umask(0)
|
216
|
-
(WORKFLOW_DIR_LOCAL / subfolder_name).mkdir(mode=0o755)
|
217
|
-
os.umask(original_umask)
|
218
|
-
_mkdir_as_user(
|
219
|
-
folder=str(WORKFLOW_DIR_REMOTE / subfolder_name),
|
220
|
-
user=slurm_user,
|
221
|
-
)
|
222
|
-
else:
|
223
|
-
# Create local subfolder (with standard permission set)
|
224
|
-
(WORKFLOW_DIR_LOCAL / subfolder_name).mkdir()
|
225
|
-
logger.info("Skip remote-subfolder creation")
|
226
|
-
except Exception as e:
|
227
|
-
error_type = type(e).__name__
|
228
|
-
fail_job(
|
229
|
-
db=db_sync,
|
230
|
-
job=job,
|
231
|
-
log_msg=(
|
232
|
-
f"{error_type} error occurred while creating job folder "
|
233
|
-
f"and subfolders.\nOriginal error: {str(e)}"
|
234
|
-
),
|
235
|
-
logger_name=logger_name,
|
236
|
-
emit_log=True,
|
237
|
-
)
|
238
|
-
return
|
239
|
-
|
240
|
-
# After Session.commit() is called, either explicitly or when using a
|
241
|
-
# context manager, all objects associated with the Session are expired.
|
242
|
-
# https://docs.sqlalchemy.org/en/14/orm/
|
243
|
-
# session_basics.html#opening-and-closing-a-session
|
244
|
-
# https://docs.sqlalchemy.org/en/14/orm/
|
245
|
-
# session_state_management.html#refreshing-expiring
|
246
|
-
|
247
|
-
# See issue #928:
|
248
|
-
# https://github.com/fractal-analytics-platform/
|
249
|
-
# fractal-server/issues/928
|
250
|
-
|
251
|
-
db_sync.refresh(dataset)
|
252
|
-
db_sync.refresh(workflow)
|
253
|
-
for wftask in workflow.task_list:
|
254
|
-
db_sync.refresh(wftask)
|
255
|
-
|
256
|
-
# Write logs
|
257
|
-
log_file_path = WORKFLOW_DIR_LOCAL / WORKFLOW_LOG_FILENAME
|
258
|
-
logger = set_logger(
|
259
|
-
logger_name=logger_name,
|
260
|
-
log_file_path=log_file_path,
|
261
|
-
)
|
262
|
-
logger.info(
|
263
|
-
f'Start execution of workflow "{workflow.name}"; '
|
264
|
-
f"more logs at {str(log_file_path)}"
|
265
|
-
)
|
266
|
-
logger.debug(f"fractal_server.__VERSION__: {__VERSION__}")
|
267
|
-
logger.debug(f"FRACTAL_RUNNER_BACKEND: {FRACTAL_RUNNER_BACKEND}")
|
268
|
-
if FRACTAL_RUNNER_BACKEND == "slurm":
|
269
|
-
logger.debug(f"slurm_user: {slurm_user}")
|
270
|
-
logger.debug(f"slurm_account: {job.slurm_account}")
|
271
|
-
logger.debug(f"worker_init: {worker_init}")
|
272
|
-
elif FRACTAL_RUNNER_BACKEND == "slurm_ssh":
|
273
|
-
logger.debug(f"ssh_user: {user_settings.ssh_username}")
|
274
|
-
logger.debug(f"base dir: {user_settings.ssh_tasks_dir}")
|
275
|
-
logger.debug(f"worker_init: {worker_init}")
|
276
|
-
logger.debug(f"job.id: {job.id}")
|
277
|
-
logger.debug(f"job.working_dir: {job.working_dir}")
|
278
|
-
logger.debug(f"job.working_dir_user: {job.working_dir_user}")
|
279
|
-
logger.debug(f"job.first_task_index: {job.first_task_index}")
|
280
|
-
logger.debug(f"job.last_task_index: {job.last_task_index}")
|
281
|
-
logger.debug(f'START workflow "{workflow.name}"')
|
282
|
-
|
283
|
-
try:
|
284
|
-
if FRACTAL_RUNNER_BACKEND == "local":
|
285
|
-
process_workflow = local_process_workflow
|
286
|
-
backend_specific_kwargs = {}
|
287
|
-
elif FRACTAL_RUNNER_BACKEND == "slurm":
|
288
|
-
process_workflow = slurm_sudo_process_workflow
|
289
|
-
backend_specific_kwargs = dict(
|
290
|
-
slurm_user=slurm_user,
|
291
|
-
slurm_account=job.slurm_account,
|
292
|
-
user_cache_dir=user_cache_dir,
|
293
|
-
)
|
294
|
-
elif FRACTAL_RUNNER_BACKEND == "slurm_ssh":
|
295
|
-
process_workflow = slurm_ssh_process_workflow
|
296
|
-
backend_specific_kwargs = dict(fractal_ssh=fractal_ssh)
|
297
|
-
else:
|
298
|
-
raise RuntimeError(
|
299
|
-
f"Invalid runner backend {FRACTAL_RUNNER_BACKEND=}"
|
300
|
-
)
|
301
|
-
|
302
|
-
# "The Session.close() method does not prevent the Session from being
|
303
|
-
# used again. The Session itself does not actually have a distinct
|
304
|
-
# “closed” state; it merely means the Session will release all database
|
305
|
-
# connections and ORM objects."
|
306
|
-
# (https://docs.sqlalchemy.org/en/20/orm/session_api.html#sqlalchemy.orm.Session.close).
|
307
|
-
#
|
308
|
-
# We close the session before the (possibly long) process_workflow
|
309
|
-
# call, to make sure all DB connections are released. The reason why we
|
310
|
-
# are not using a context manager within the try block is that we also
|
311
|
-
# need access to db_sync in the except branches.
|
312
|
-
db_sync = next(DB.get_sync_db())
|
313
|
-
db_sync.close()
|
314
|
-
|
315
|
-
process_workflow(
|
316
|
-
workflow=workflow,
|
317
|
-
dataset=dataset,
|
318
|
-
user_id=user_id,
|
319
|
-
workflow_dir_local=WORKFLOW_DIR_LOCAL,
|
320
|
-
workflow_dir_remote=WORKFLOW_DIR_REMOTE,
|
321
|
-
logger_name=logger_name,
|
322
|
-
worker_init=worker_init,
|
323
|
-
first_task_index=job.first_task_index,
|
324
|
-
last_task_index=job.last_task_index,
|
325
|
-
job_attribute_filters=job.attribute_filters,
|
326
|
-
**backend_specific_kwargs,
|
327
|
-
)
|
328
|
-
|
329
|
-
logger.info(
|
330
|
-
f'End execution of workflow "{workflow.name}"; '
|
331
|
-
f"more logs at {str(log_file_path)}"
|
332
|
-
)
|
333
|
-
logger.debug(f'END workflow "{workflow.name}"')
|
334
|
-
|
335
|
-
# Update job DB entry
|
336
|
-
job.status = JobStatusTypeV2.DONE
|
337
|
-
job.end_timestamp = get_timestamp()
|
338
|
-
with log_file_path.open("r") as f:
|
339
|
-
logs = f.read()
|
340
|
-
job.log = logs
|
341
|
-
db_sync.merge(job)
|
342
|
-
db_sync.commit()
|
343
|
-
|
344
|
-
except TaskExecutionError as e:
|
345
|
-
logger.debug(f'FAILED workflow "{workflow.name}", TaskExecutionError.')
|
346
|
-
logger.info(f'Workflow "{workflow.name}" failed (TaskExecutionError).')
|
347
|
-
|
348
|
-
mark_last_wftask_as_failed(
|
349
|
-
dataset_id=dataset_id,
|
350
|
-
logger_name=logger_name,
|
351
|
-
)
|
352
|
-
exception_args_string = "\n".join(e.args)
|
353
|
-
log_msg = (
|
354
|
-
f"TASK ERROR: "
|
355
|
-
f"Task name: {e.task_name}, "
|
356
|
-
f"position in Workflow: {e.workflow_task_order}\n"
|
357
|
-
f"TRACEBACK:\n{exception_args_string}"
|
358
|
-
)
|
359
|
-
fail_job(db=db_sync, job=job, log_msg=log_msg, logger_name=logger_name)
|
360
|
-
|
361
|
-
except JobExecutionError as e:
|
362
|
-
logger.debug(f'FAILED workflow "{workflow.name}", JobExecutionError.')
|
363
|
-
logger.info(f'Workflow "{workflow.name}" failed (JobExecutionError).')
|
364
|
-
mark_last_wftask_as_failed(
|
365
|
-
dataset_id=dataset_id,
|
366
|
-
logger_name=logger_name,
|
367
|
-
)
|
368
|
-
fail_job(
|
369
|
-
db=db_sync,
|
370
|
-
job=job,
|
371
|
-
log_msg=(
|
372
|
-
f"JOB ERROR in Fractal job {job.id}:\n"
|
373
|
-
f"TRACEBACK:\n{e.assemble_error()}"
|
374
|
-
),
|
375
|
-
logger_name=logger_name,
|
376
|
-
)
|
377
|
-
|
378
|
-
except Exception:
|
379
|
-
logger.debug(f'FAILED workflow "{workflow.name}", unknown error.')
|
380
|
-
logger.info(f'Workflow "{workflow.name}" failed (unkwnon error).')
|
381
|
-
mark_last_wftask_as_failed(
|
382
|
-
dataset_id=dataset_id,
|
383
|
-
logger_name=logger_name,
|
384
|
-
)
|
385
|
-
current_traceback = traceback.format_exc()
|
386
|
-
fail_job(
|
387
|
-
db=db_sync,
|
388
|
-
job=job,
|
389
|
-
log_msg=(
|
390
|
-
f"UNKNOWN ERROR in Fractal job {job.id}\n"
|
391
|
-
f"TRACEBACK:\n{current_traceback}"
|
392
|
-
),
|
393
|
-
logger_name=logger_name,
|
394
|
-
)
|
395
|
-
|
396
|
-
finally:
|
397
|
-
reset_logger_handlers(logger)
|
398
|
-
db_sync.close()
|
399
|
-
_zip_folder_to_file_and_remove(folder=job.working_dir)
|
@@ -0,0 +1,88 @@
|
|
1
|
+
from pathlib import Path
|
2
|
+
from typing import Optional
|
3
|
+
|
4
|
+
from ...models.v2 import DatasetV2
|
5
|
+
from ...models.v2 import WorkflowV2
|
6
|
+
from ..executors.local.get_local_config import get_local_backend_config
|
7
|
+
from ..executors.local.runner import LocalRunner
|
8
|
+
from ..set_start_and_last_task_index import set_start_and_last_task_index
|
9
|
+
from .runner import execute_tasks_v2
|
10
|
+
from fractal_server.images.models import AttributeFiltersType
|
11
|
+
|
12
|
+
|
13
|
+
def process_workflow(
|
14
|
+
*,
|
15
|
+
workflow: WorkflowV2,
|
16
|
+
dataset: DatasetV2,
|
17
|
+
workflow_dir_local: Path,
|
18
|
+
job_id: int,
|
19
|
+
workflow_dir_remote: Optional[Path] = None,
|
20
|
+
first_task_index: Optional[int] = None,
|
21
|
+
last_task_index: Optional[int] = None,
|
22
|
+
logger_name: str,
|
23
|
+
job_attribute_filters: AttributeFiltersType,
|
24
|
+
job_type_filters: dict[str, bool],
|
25
|
+
user_id: int,
|
26
|
+
**kwargs,
|
27
|
+
) -> None:
|
28
|
+
"""
|
29
|
+
Run a workflow through
|
30
|
+
|
31
|
+
Args:
|
32
|
+
workflow:
|
33
|
+
The workflow to be run
|
34
|
+
dataset:
|
35
|
+
Initial dataset.
|
36
|
+
workflow_dir_local:
|
37
|
+
Working directory for this run.
|
38
|
+
workflow_dir_remote:
|
39
|
+
Working directory for this run, on the user side. This argument is
|
40
|
+
present for compatibility with the standard backend interface, but
|
41
|
+
for the `local` backend it cannot be different from
|
42
|
+
`workflow_dir_local`.
|
43
|
+
first_task_index:
|
44
|
+
Positional index of the first task to execute; if `None`, start
|
45
|
+
from `0`.
|
46
|
+
last_task_index:
|
47
|
+
Positional index of the last task to execute; if `None`, proceed
|
48
|
+
until the last task.
|
49
|
+
logger_name: Logger name
|
50
|
+
user_id:
|
51
|
+
|
52
|
+
Raises:
|
53
|
+
TaskExecutionError: wrapper for errors raised during tasks' execution
|
54
|
+
(positive exit codes).
|
55
|
+
JobExecutionError: wrapper for errors raised by the tasks' executors
|
56
|
+
(negative exit codes).
|
57
|
+
"""
|
58
|
+
|
59
|
+
if workflow_dir_remote and (workflow_dir_remote != workflow_dir_local):
|
60
|
+
raise NotImplementedError(
|
61
|
+
"Local backend does not support different directories "
|
62
|
+
f"{workflow_dir_local=} and {workflow_dir_remote=}"
|
63
|
+
)
|
64
|
+
|
65
|
+
# Set values of first_task_index and last_task_index
|
66
|
+
num_tasks = len(workflow.task_list)
|
67
|
+
first_task_index, last_task_index = set_start_and_last_task_index(
|
68
|
+
num_tasks,
|
69
|
+
first_task_index=first_task_index,
|
70
|
+
last_task_index=last_task_index,
|
71
|
+
)
|
72
|
+
|
73
|
+
with LocalRunner(root_dir_local=workflow_dir_local) as runner:
|
74
|
+
execute_tasks_v2(
|
75
|
+
wf_task_list=workflow.task_list[
|
76
|
+
first_task_index : (last_task_index + 1)
|
77
|
+
],
|
78
|
+
dataset=dataset,
|
79
|
+
job_id=job_id,
|
80
|
+
runner=runner,
|
81
|
+
workflow_dir_local=workflow_dir_local,
|
82
|
+
workflow_dir_remote=workflow_dir_local,
|
83
|
+
logger_name=logger_name,
|
84
|
+
get_runner_config=get_local_backend_config,
|
85
|
+
job_attribute_filters=job_attribute_filters,
|
86
|
+
job_type_filters=job_type_filters,
|
87
|
+
user_id=user_id,
|
88
|
+
)
|
@@ -11,7 +11,7 @@
|
|
11
11
|
# Institute for Biomedical Research and Pelkmans Lab from the University of
|
12
12
|
# Zurich.
|
13
13
|
"""
|
14
|
-
Slurm
|
14
|
+
Slurm Backend
|
15
15
|
|
16
16
|
This backend runs fractal workflows in a SLURM cluster using Clusterfutures
|
17
17
|
Executor objects.
|
@@ -19,14 +19,14 @@ Executor objects.
|
|
19
19
|
from pathlib import Path
|
20
20
|
from typing import Optional
|
21
21
|
|
22
|
-
from
|
23
|
-
from
|
24
|
-
from
|
25
|
-
from
|
26
|
-
from
|
27
|
-
from
|
28
|
-
from ..
|
29
|
-
from .
|
22
|
+
from ....ssh._fabric import FractalSSH
|
23
|
+
from ...models.v2 import DatasetV2
|
24
|
+
from ...models.v2 import WorkflowV2
|
25
|
+
from ..exceptions import JobExecutionError
|
26
|
+
from ..executors.slurm_common.get_slurm_config import get_slurm_config
|
27
|
+
from ..executors.slurm_ssh.runner import SlurmSSHRunner
|
28
|
+
from ..set_start_and_last_task_index import set_start_and_last_task_index
|
29
|
+
from .runner import execute_tasks_v2
|
30
30
|
from fractal_server.images.models import AttributeFiltersType
|
31
31
|
from fractal_server.logger import set_logger
|
32
32
|
|
@@ -38,18 +38,17 @@ def process_workflow(
|
|
38
38
|
workflow: WorkflowV2,
|
39
39
|
dataset: DatasetV2,
|
40
40
|
workflow_dir_local: Path,
|
41
|
+
job_id: int,
|
41
42
|
workflow_dir_remote: Optional[Path] = None,
|
42
43
|
first_task_index: Optional[int] = None,
|
43
44
|
last_task_index: Optional[int] = None,
|
44
45
|
logger_name: str,
|
45
46
|
job_attribute_filters: AttributeFiltersType,
|
47
|
+
job_type_filters: dict[str, bool],
|
46
48
|
fractal_ssh: FractalSSH,
|
47
49
|
worker_init: Optional[str] = None,
|
48
50
|
user_id: int,
|
49
|
-
#
|
50
|
-
user_cache_dir: Optional[str] = None,
|
51
|
-
slurm_user: Optional[str] = None,
|
52
|
-
slurm_account: Optional[str] = None,
|
51
|
+
**kwargs, # not used
|
53
52
|
) -> None:
|
54
53
|
"""
|
55
54
|
Process workflow (SLURM backend public interface)
|
@@ -78,22 +77,24 @@ def process_workflow(
|
|
78
77
|
logger.error(error_msg)
|
79
78
|
raise JobExecutionError(info=error_msg)
|
80
79
|
|
81
|
-
with
|
80
|
+
with SlurmSSHRunner(
|
82
81
|
fractal_ssh=fractal_ssh,
|
83
|
-
|
84
|
-
|
82
|
+
root_dir_local=workflow_dir_local,
|
83
|
+
root_dir_remote=workflow_dir_remote,
|
85
84
|
common_script_lines=worker_init,
|
86
|
-
) as
|
85
|
+
) as runner:
|
87
86
|
execute_tasks_v2(
|
88
87
|
wf_task_list=workflow.task_list[
|
89
88
|
first_task_index : (last_task_index + 1)
|
90
89
|
],
|
91
90
|
dataset=dataset,
|
92
|
-
|
91
|
+
job_id=job_id,
|
92
|
+
runner=runner,
|
93
93
|
workflow_dir_local=workflow_dir_local,
|
94
94
|
workflow_dir_remote=workflow_dir_remote,
|
95
95
|
logger_name=logger_name,
|
96
|
-
|
96
|
+
get_runner_config=get_slurm_config,
|
97
97
|
job_attribute_filters=job_attribute_filters,
|
98
|
+
job_type_filters=job_type_filters,
|
98
99
|
user_id=user_id,
|
99
100
|
)
|
@@ -11,7 +11,7 @@
|
|
11
11
|
# Institute for Biomedical Research and Pelkmans Lab from the University of
|
12
12
|
# Zurich.
|
13
13
|
"""
|
14
|
-
Slurm
|
14
|
+
Slurm Backend
|
15
15
|
|
16
16
|
This backend runs fractal workflows in a SLURM cluster using Clusterfutures
|
17
17
|
Executor objects.
|
@@ -19,12 +19,12 @@ Executor objects.
|
|
19
19
|
from pathlib import Path
|
20
20
|
from typing import Optional
|
21
21
|
|
22
|
-
from
|
23
|
-
from
|
24
|
-
from
|
25
|
-
from
|
26
|
-
from ..
|
27
|
-
from .
|
22
|
+
from ...models.v2 import DatasetV2
|
23
|
+
from ...models.v2 import WorkflowV2
|
24
|
+
from ..executors.slurm_common.get_slurm_config import get_slurm_config
|
25
|
+
from ..executors.slurm_sudo.runner import SudoSlurmRunner
|
26
|
+
from ..set_start_and_last_task_index import set_start_and_last_task_index
|
27
|
+
from .runner import execute_tasks_v2
|
28
28
|
from fractal_server.images.models import AttributeFiltersType
|
29
29
|
|
30
30
|
|
@@ -33,11 +33,13 @@ def process_workflow(
|
|
33
33
|
workflow: WorkflowV2,
|
34
34
|
dataset: DatasetV2,
|
35
35
|
workflow_dir_local: Path,
|
36
|
+
job_id: int,
|
36
37
|
workflow_dir_remote: Optional[Path] = None,
|
37
38
|
first_task_index: Optional[int] = None,
|
38
39
|
last_task_index: Optional[int] = None,
|
39
40
|
logger_name: str,
|
40
41
|
job_attribute_filters: AttributeFiltersType,
|
42
|
+
job_type_filters: dict[str, bool],
|
41
43
|
user_id: int,
|
42
44
|
# Slurm-specific
|
43
45
|
user_cache_dir: Optional[str] = None,
|
@@ -65,26 +67,26 @@ def process_workflow(
|
|
65
67
|
if isinstance(worker_init, str):
|
66
68
|
worker_init = worker_init.split("\n")
|
67
69
|
|
68
|
-
with
|
69
|
-
debug=True,
|
70
|
-
keep_logs=True,
|
70
|
+
with SudoSlurmRunner(
|
71
71
|
slurm_user=slurm_user,
|
72
72
|
user_cache_dir=user_cache_dir,
|
73
|
-
|
74
|
-
|
73
|
+
root_dir_local=workflow_dir_local,
|
74
|
+
root_dir_remote=workflow_dir_remote,
|
75
75
|
common_script_lines=worker_init,
|
76
76
|
slurm_account=slurm_account,
|
77
|
-
) as
|
77
|
+
) as runner:
|
78
78
|
execute_tasks_v2(
|
79
79
|
wf_task_list=workflow.task_list[
|
80
80
|
first_task_index : (last_task_index + 1)
|
81
81
|
],
|
82
82
|
dataset=dataset,
|
83
|
-
|
83
|
+
job_id=job_id,
|
84
|
+
runner=runner,
|
84
85
|
workflow_dir_local=workflow_dir_local,
|
85
86
|
workflow_dir_remote=workflow_dir_remote,
|
86
87
|
logger_name=logger_name,
|
87
|
-
|
88
|
+
get_runner_config=get_slurm_config,
|
88
89
|
job_attribute_filters=job_attribute_filters,
|
90
|
+
job_type_filters=job_type_filters,
|
89
91
|
user_id=user_id,
|
90
92
|
)
|