fractal-server 2.2.0a0__py3-none-any.whl → 2.3.0a0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fractal_server/__init__.py +1 -1
- fractal_server/app/db/__init__.py +1 -1
- fractal_server/app/models/v1/state.py +1 -2
- fractal_server/app/routes/admin/v1.py +2 -2
- fractal_server/app/routes/admin/v2.py +2 -2
- fractal_server/app/routes/api/v1/job.py +2 -2
- fractal_server/app/routes/api/v1/task_collection.py +4 -4
- fractal_server/app/routes/api/v2/__init__.py +23 -3
- fractal_server/app/routes/api/v2/job.py +2 -2
- fractal_server/app/routes/api/v2/submit.py +6 -0
- fractal_server/app/routes/api/v2/task_collection.py +74 -34
- fractal_server/app/routes/api/v2/task_collection_custom.py +144 -0
- fractal_server/app/routes/api/v2/task_collection_ssh.py +125 -0
- fractal_server/app/routes/aux/_runner.py +10 -2
- fractal_server/app/runner/compress_folder.py +120 -0
- fractal_server/app/runner/executors/slurm/__init__.py +0 -3
- fractal_server/app/runner/executors/slurm/_batching.py +0 -1
- fractal_server/app/runner/executors/slurm/_slurm_config.py +9 -9
- fractal_server/app/runner/executors/slurm/ssh/__init__.py +3 -0
- fractal_server/app/runner/executors/slurm/ssh/_executor_wait_thread.py +112 -0
- fractal_server/app/runner/executors/slurm/ssh/_slurm_job.py +120 -0
- fractal_server/app/runner/executors/slurm/ssh/executor.py +1490 -0
- fractal_server/app/runner/executors/slurm/sudo/__init__.py +3 -0
- fractal_server/app/runner/executors/slurm/{_check_jobs_status.py → sudo/_check_jobs_status.py} +1 -1
- fractal_server/app/runner/executors/slurm/{_executor_wait_thread.py → sudo/_executor_wait_thread.py} +1 -1
- fractal_server/app/runner/executors/slurm/{_subprocess_run_as_user.py → sudo/_subprocess_run_as_user.py} +1 -1
- fractal_server/app/runner/executors/slurm/{executor.py → sudo/executor.py} +12 -12
- fractal_server/app/runner/extract_archive.py +38 -0
- fractal_server/app/runner/v1/__init__.py +78 -40
- fractal_server/app/runner/v1/_slurm/__init__.py +1 -1
- fractal_server/app/runner/v2/__init__.py +183 -82
- fractal_server/app/runner/v2/_local_experimental/__init__.py +22 -12
- fractal_server/app/runner/v2/_local_experimental/executor.py +12 -8
- fractal_server/app/runner/v2/_slurm/__init__.py +1 -6
- fractal_server/app/runner/v2/_slurm_ssh/__init__.py +126 -0
- fractal_server/app/runner/v2/_slurm_ssh/_submit_setup.py +83 -0
- fractal_server/app/runner/v2/_slurm_ssh/get_slurm_config.py +182 -0
- fractal_server/app/runner/v2/runner_functions_low_level.py +9 -11
- fractal_server/app/runner/versions.py +30 -0
- fractal_server/app/schemas/v1/__init__.py +1 -0
- fractal_server/app/schemas/{state.py → v1/state.py} +4 -21
- fractal_server/app/schemas/v2/__init__.py +4 -1
- fractal_server/app/schemas/v2/task_collection.py +97 -27
- fractal_server/config.py +222 -21
- fractal_server/main.py +25 -1
- fractal_server/migrations/env.py +1 -1
- fractal_server/ssh/__init__.py +4 -0
- fractal_server/ssh/_fabric.py +190 -0
- fractal_server/tasks/utils.py +12 -64
- fractal_server/tasks/v1/background_operations.py +2 -2
- fractal_server/tasks/{endpoint_operations.py → v1/endpoint_operations.py} +7 -12
- fractal_server/tasks/v1/utils.py +67 -0
- fractal_server/tasks/v2/_TaskCollectPip.py +61 -32
- fractal_server/tasks/v2/_venv_pip.py +195 -0
- fractal_server/tasks/v2/background_operations.py +257 -295
- fractal_server/tasks/v2/background_operations_ssh.py +304 -0
- fractal_server/tasks/v2/endpoint_operations.py +136 -0
- fractal_server/tasks/v2/templates/_1_create_venv.sh +46 -0
- fractal_server/tasks/v2/templates/_2_upgrade_pip.sh +30 -0
- fractal_server/tasks/v2/templates/_3_pip_install.sh +32 -0
- fractal_server/tasks/v2/templates/_4_pip_freeze.sh +21 -0
- fractal_server/tasks/v2/templates/_5_pip_show.sh +59 -0
- fractal_server/tasks/v2/utils.py +54 -0
- {fractal_server-2.2.0a0.dist-info → fractal_server-2.3.0a0.dist-info}/METADATA +6 -2
- {fractal_server-2.2.0a0.dist-info → fractal_server-2.3.0a0.dist-info}/RECORD +68 -44
- fractal_server/tasks/v2/get_collection_data.py +0 -14
- {fractal_server-2.2.0a0.dist-info → fractal_server-2.3.0a0.dist-info}/LICENSE +0 -0
- {fractal_server-2.2.0a0.dist-info → fractal_server-2.3.0a0.dist-info}/WHEEL +0 -0
- {fractal_server-2.2.0a0.dist-info → fractal_server-2.3.0a0.dist-info}/entry_points.txt +0 -0
@@ -5,14 +5,18 @@ This module is the single entry point to the runner backend subsystem V2.
|
|
5
5
|
Other subsystems should only import this module and not its submodules or
|
6
6
|
the individual backends.
|
7
7
|
"""
|
8
|
+
import logging
|
8
9
|
import os
|
9
10
|
import traceback
|
10
11
|
from pathlib import Path
|
11
12
|
from typing import Optional
|
12
13
|
|
14
|
+
from fabric import Connection # FIXME SSH: try/except import
|
15
|
+
from sqlalchemy.orm import Session as DBSyncSession
|
13
16
|
from sqlalchemy.orm.attributes import flag_modified
|
14
17
|
|
15
18
|
from ....config import get_settings
|
19
|
+
from ....logger import get_logger
|
16
20
|
from ....logger import reset_logger_handlers
|
17
21
|
from ....logger import set_logger
|
18
22
|
from ....syringe import Inject
|
@@ -25,14 +29,15 @@ from ...models.v2 import WorkflowV2
|
|
25
29
|
from ...schemas.v2 import JobStatusTypeV2
|
26
30
|
from ..exceptions import JobExecutionError
|
27
31
|
from ..exceptions import TaskExecutionError
|
28
|
-
from ..executors.slurm._subprocess_run_as_user import _mkdir_as_user
|
32
|
+
from ..executors.slurm.sudo._subprocess_run_as_user import _mkdir_as_user
|
29
33
|
from ..filenames import WORKFLOW_LOG_FILENAME
|
30
34
|
from ..task_files import task_subfolder_name
|
31
35
|
from ._local import process_workflow as local_process_workflow
|
32
36
|
from ._local_experimental import (
|
33
37
|
process_workflow as local_experimental_process_workflow,
|
34
38
|
)
|
35
|
-
from ._slurm import process_workflow as
|
39
|
+
from ._slurm import process_workflow as slurm_sudo_process_workflow
|
40
|
+
from ._slurm_ssh import process_workflow as slurm_ssh_process_workflow
|
36
41
|
from .handle_failed_job import assemble_filters_failed_job
|
37
42
|
from .handle_failed_job import assemble_history_failed_job
|
38
43
|
from .handle_failed_job import assemble_images_failed_job
|
@@ -40,8 +45,30 @@ from fractal_server import __VERSION__
|
|
40
45
|
|
41
46
|
_backends = {}
|
42
47
|
_backends["local"] = local_process_workflow
|
48
|
+
_backends["slurm"] = slurm_sudo_process_workflow
|
49
|
+
_backends["slurm_ssh"] = slurm_ssh_process_workflow
|
43
50
|
_backends["local_experimental"] = local_experimental_process_workflow
|
44
|
-
|
51
|
+
|
52
|
+
|
53
|
+
def fail_job(
|
54
|
+
*,
|
55
|
+
db: DBSyncSession,
|
56
|
+
job: JobV2,
|
57
|
+
log_msg: str,
|
58
|
+
logger_name: str,
|
59
|
+
emit_log: bool = False,
|
60
|
+
) -> None:
|
61
|
+
logger = get_logger(logger_name=logger_name)
|
62
|
+
if emit_log:
|
63
|
+
logger.error(log_msg)
|
64
|
+
reset_logger_handlers(logger)
|
65
|
+
job.status = JobStatusTypeV2.FAILED
|
66
|
+
job.end_timestamp = get_timestamp()
|
67
|
+
job.log = log_msg
|
68
|
+
db.merge(job)
|
69
|
+
db.commit()
|
70
|
+
db.close()
|
71
|
+
return
|
45
72
|
|
46
73
|
|
47
74
|
async def submit_workflow(
|
@@ -52,6 +79,7 @@ async def submit_workflow(
|
|
52
79
|
worker_init: Optional[str] = None,
|
53
80
|
slurm_user: Optional[str] = None,
|
54
81
|
user_cache_dir: Optional[str] = None,
|
82
|
+
connection: Optional[Connection] = None,
|
55
83
|
) -> None:
|
56
84
|
"""
|
57
85
|
Prepares a workflow and applies it to a dataset
|
@@ -78,24 +106,36 @@ async def submit_workflow(
|
|
78
106
|
The username to impersonate for the workflow execution, for the
|
79
107
|
slurm backend.
|
80
108
|
"""
|
81
|
-
|
82
109
|
# Declare runner backend and set `process_workflow` function
|
83
110
|
settings = Inject(get_settings)
|
84
111
|
FRACTAL_RUNNER_BACKEND = settings.FRACTAL_RUNNER_BACKEND
|
85
|
-
|
86
|
-
|
87
|
-
elif FRACTAL_RUNNER_BACKEND == "local_experimental":
|
88
|
-
process_workflow = local_experimental_process_workflow
|
89
|
-
elif FRACTAL_RUNNER_BACKEND == "slurm":
|
90
|
-
process_workflow = slurm_process_workflow
|
91
|
-
else:
|
92
|
-
raise RuntimeError(f"Invalid runner backend {FRACTAL_RUNNER_BACKEND=}")
|
112
|
+
logger_name = f"WF{workflow_id}_job{job_id}"
|
113
|
+
logger = set_logger(logger_name=logger_name)
|
93
114
|
|
94
115
|
with next(DB.get_sync_db()) as db_sync:
|
95
116
|
|
96
117
|
job: JobV2 = db_sync.get(JobV2, job_id)
|
97
118
|
if not job:
|
98
|
-
|
119
|
+
logger.error(f"JobV2 {job_id} does not exist")
|
120
|
+
return
|
121
|
+
|
122
|
+
# Declare runner backend and set `process_workflow` function
|
123
|
+
settings = Inject(get_settings)
|
124
|
+
FRACTAL_RUNNER_BACKEND = settings.FRACTAL_RUNNER_BACKEND
|
125
|
+
try:
|
126
|
+
process_workflow = _backends[settings.FRACTAL_RUNNER_BACKEND]
|
127
|
+
except KeyError as e:
|
128
|
+
fail_job(
|
129
|
+
db=db_sync,
|
130
|
+
job=job,
|
131
|
+
log_msg=(
|
132
|
+
f"Invalid {FRACTAL_RUNNER_BACKEND=}.\n"
|
133
|
+
f"Original KeyError: {str(e)}"
|
134
|
+
),
|
135
|
+
logger_name=logger_name,
|
136
|
+
emit_log=True,
|
137
|
+
)
|
138
|
+
return
|
99
139
|
|
100
140
|
dataset: DatasetV2 = db_sync.get(DatasetV2, dataset_id)
|
101
141
|
workflow: WorkflowV2 = db_sync.get(WorkflowV2, workflow_id)
|
@@ -107,61 +147,96 @@ async def submit_workflow(
|
|
107
147
|
log_msg += (
|
108
148
|
f"Cannot fetch workflow {workflow_id} from database\n"
|
109
149
|
)
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
db_sync.merge(job)
|
114
|
-
db_sync.commit()
|
115
|
-
db_sync.close()
|
150
|
+
fail_job(
|
151
|
+
db=db_sync, job=job, log_msg=log_msg, logger_name=logger_name
|
152
|
+
)
|
116
153
|
return
|
117
154
|
|
118
155
|
# Define and create server-side working folder
|
119
156
|
WORKFLOW_DIR_LOCAL = Path(job.working_dir)
|
120
157
|
if WORKFLOW_DIR_LOCAL.exists():
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
158
|
+
fail_job(
|
159
|
+
db=db_sync,
|
160
|
+
job=job,
|
161
|
+
log_msg=f"Workflow dir {WORKFLOW_DIR_LOCAL} already exists.",
|
162
|
+
logger_name=logger_name,
|
163
|
+
emit_log=True,
|
164
|
+
)
|
127
165
|
return
|
128
166
|
|
129
|
-
|
130
|
-
original_umask = os.umask(0)
|
131
|
-
WORKFLOW_DIR_LOCAL.mkdir(parents=True, mode=0o755)
|
132
|
-
os.umask(original_umask)
|
133
|
-
|
134
|
-
# Define and create WORKFLOW_DIR_REMOTE
|
135
|
-
if FRACTAL_RUNNER_BACKEND == "local":
|
136
|
-
WORKFLOW_DIR_REMOTE = WORKFLOW_DIR_LOCAL
|
137
|
-
elif FRACTAL_RUNNER_BACKEND == "local_experimental":
|
138
|
-
WORKFLOW_DIR_REMOTE = WORKFLOW_DIR_LOCAL
|
139
|
-
elif FRACTAL_RUNNER_BACKEND == "slurm":
|
140
|
-
WORKFLOW_DIR_REMOTE = (
|
141
|
-
Path(user_cache_dir) / WORKFLOW_DIR_LOCAL.name
|
142
|
-
)
|
143
|
-
_mkdir_as_user(folder=str(WORKFLOW_DIR_REMOTE), user=slurm_user)
|
167
|
+
try:
|
144
168
|
|
145
|
-
|
146
|
-
for order in range(job.first_task_index, job.last_task_index + 1):
|
147
|
-
this_wftask = workflow.task_list[order]
|
148
|
-
if this_wftask.is_legacy_task:
|
149
|
-
task_name = this_wftask.task_legacy.name
|
150
|
-
else:
|
151
|
-
task_name = this_wftask.task.name
|
152
|
-
subfolder_name = task_subfolder_name(
|
153
|
-
order=order,
|
154
|
-
task_name=task_name,
|
155
|
-
)
|
169
|
+
# Create WORKFLOW_DIR_LOCAL
|
156
170
|
original_umask = os.umask(0)
|
157
|
-
|
171
|
+
WORKFLOW_DIR_LOCAL.mkdir(parents=True, mode=0o755)
|
158
172
|
os.umask(original_umask)
|
159
|
-
|
173
|
+
|
174
|
+
# Define and create WORKFLOW_DIR_REMOTE
|
175
|
+
if FRACTAL_RUNNER_BACKEND == "local":
|
176
|
+
WORKFLOW_DIR_REMOTE = WORKFLOW_DIR_LOCAL
|
177
|
+
elif FRACTAL_RUNNER_BACKEND == "local_experimental":
|
178
|
+
WORKFLOW_DIR_REMOTE = WORKFLOW_DIR_LOCAL
|
179
|
+
elif FRACTAL_RUNNER_BACKEND == "slurm":
|
180
|
+
WORKFLOW_DIR_REMOTE = (
|
181
|
+
Path(user_cache_dir) / WORKFLOW_DIR_LOCAL.name
|
182
|
+
)
|
160
183
|
_mkdir_as_user(
|
161
|
-
folder=str(WORKFLOW_DIR_REMOTE
|
162
|
-
|
184
|
+
folder=str(WORKFLOW_DIR_REMOTE), user=slurm_user
|
185
|
+
)
|
186
|
+
elif FRACTAL_RUNNER_BACKEND == "slurm_ssh":
|
187
|
+
WORKFLOW_DIR_REMOTE = (
|
188
|
+
Path(settings.FRACTAL_SLURM_SSH_WORKING_BASE_DIR)
|
189
|
+
/ WORKFLOW_DIR_LOCAL.name
|
190
|
+
)
|
191
|
+
# FIXME SSH: move mkdir to executor, likely within handshake
|
192
|
+
|
193
|
+
from ....ssh._fabric import _mkdir_over_ssh
|
194
|
+
|
195
|
+
_mkdir_over_ssh(
|
196
|
+
folder=str(WORKFLOW_DIR_REMOTE), connection=connection
|
197
|
+
)
|
198
|
+
logging.info(f"Created {str(WORKFLOW_DIR_REMOTE)} via SSH.")
|
199
|
+
else:
|
200
|
+
logging.error(
|
201
|
+
"Invalid FRACTAL_RUNNER_BACKEND="
|
202
|
+
f"{settings.FRACTAL_RUNNER_BACKEND}."
|
163
203
|
)
|
164
204
|
|
205
|
+
# Create all tasks subfolders
|
206
|
+
for order in range(job.first_task_index, job.last_task_index + 1):
|
207
|
+
this_wftask = workflow.task_list[order]
|
208
|
+
if this_wftask.is_legacy_task:
|
209
|
+
task_name = this_wftask.task_legacy.name
|
210
|
+
else:
|
211
|
+
task_name = this_wftask.task.name
|
212
|
+
subfolder_name = task_subfolder_name(
|
213
|
+
order=order,
|
214
|
+
task_name=task_name,
|
215
|
+
)
|
216
|
+
original_umask = os.umask(0)
|
217
|
+
(WORKFLOW_DIR_LOCAL / subfolder_name).mkdir(mode=0o755)
|
218
|
+
os.umask(original_umask)
|
219
|
+
if FRACTAL_RUNNER_BACKEND == "slurm":
|
220
|
+
_mkdir_as_user(
|
221
|
+
folder=str(WORKFLOW_DIR_REMOTE / subfolder_name),
|
222
|
+
user=slurm_user,
|
223
|
+
)
|
224
|
+
else:
|
225
|
+
logging.info("Skip remote-subfolder creation")
|
226
|
+
except Exception as e:
|
227
|
+
error_type = type(e).__name__
|
228
|
+
fail_job(
|
229
|
+
db=db_sync,
|
230
|
+
job=job,
|
231
|
+
log_msg=(
|
232
|
+
f"{error_type} error occurred while creating job folder "
|
233
|
+
f"and subfolders.\nOriginal error: {str(e)}"
|
234
|
+
),
|
235
|
+
logger_name=logger_name,
|
236
|
+
emit_log=True,
|
237
|
+
)
|
238
|
+
return
|
239
|
+
|
165
240
|
# After Session.commit() is called, either explicitly or when using a
|
166
241
|
# context manager, all objects associated with the Session are expired.
|
167
242
|
# https://docs.sqlalchemy.org/en/14/orm/
|
@@ -179,7 +254,6 @@ async def submit_workflow(
|
|
179
254
|
db_sync.refresh(wftask)
|
180
255
|
|
181
256
|
# Write logs
|
182
|
-
logger_name = f"WF{workflow_id}_job{job_id}"
|
183
257
|
log_file_path = WORKFLOW_DIR_LOCAL / WORKFLOW_LOG_FILENAME
|
184
258
|
logger = set_logger(
|
185
259
|
logger_name=logger_name,
|
@@ -191,9 +265,17 @@ async def submit_workflow(
|
|
191
265
|
)
|
192
266
|
logger.debug(f"fractal_server.__VERSION__: {__VERSION__}")
|
193
267
|
logger.debug(f"FRACTAL_RUNNER_BACKEND: {FRACTAL_RUNNER_BACKEND}")
|
194
|
-
|
195
|
-
|
196
|
-
|
268
|
+
if FRACTAL_RUNNER_BACKEND == "slurm":
|
269
|
+
logger.debug(f"slurm_user: {slurm_user}")
|
270
|
+
logger.debug(f"slurm_account: {job.slurm_account}")
|
271
|
+
logger.debug(f"worker_init: {worker_init}")
|
272
|
+
elif FRACTAL_RUNNER_BACKEND == "slurm_ssh":
|
273
|
+
logger.debug(f"ssh_host: {settings.FRACTAL_SLURM_SSH_HOST}")
|
274
|
+
logger.debug(f"ssh_user: {settings.FRACTAL_SLURM_SSH_USER}")
|
275
|
+
logger.debug(
|
276
|
+
f"base dir: {settings.FRACTAL_SLURM_SSH_WORKING_BASE_DIR}"
|
277
|
+
)
|
278
|
+
logger.debug(f"worker_init: {worker_init}")
|
197
279
|
logger.debug(f"job.id: {job.id}")
|
198
280
|
logger.debug(f"job.working_dir: {job.working_dir}")
|
199
281
|
logger.debug(f"job.working_dir_user: {job.working_dir_user}")
|
@@ -202,6 +284,27 @@ async def submit_workflow(
|
|
202
284
|
logger.debug(f'START workflow "{workflow.name}"')
|
203
285
|
|
204
286
|
try:
|
287
|
+
if FRACTAL_RUNNER_BACKEND == "local":
|
288
|
+
process_workflow = local_process_workflow
|
289
|
+
backend_specific_kwargs = {}
|
290
|
+
elif FRACTAL_RUNNER_BACKEND == "local_experimental":
|
291
|
+
process_workflow = local_experimental_process_workflow
|
292
|
+
backend_specific_kwargs = {}
|
293
|
+
elif FRACTAL_RUNNER_BACKEND == "slurm":
|
294
|
+
process_workflow = slurm_sudo_process_workflow
|
295
|
+
backend_specific_kwargs = dict(
|
296
|
+
slurm_user=slurm_user,
|
297
|
+
slurm_account=job.slurm_account,
|
298
|
+
user_cache_dir=user_cache_dir,
|
299
|
+
)
|
300
|
+
elif FRACTAL_RUNNER_BACKEND == "slurm_ssh":
|
301
|
+
process_workflow = slurm_ssh_process_workflow
|
302
|
+
backend_specific_kwargs = dict(connection=connection)
|
303
|
+
else:
|
304
|
+
raise RuntimeError(
|
305
|
+
f"Invalid runner backend {FRACTAL_RUNNER_BACKEND=}"
|
306
|
+
)
|
307
|
+
|
205
308
|
# "The Session.close() method does not prevent the Session from being
|
206
309
|
# used again. The Session itself does not actually have a distinct
|
207
310
|
# “closed” state; it merely means the Session will release all database
|
@@ -218,15 +321,13 @@ async def submit_workflow(
|
|
218
321
|
new_dataset_attributes = await process_workflow(
|
219
322
|
workflow=workflow,
|
220
323
|
dataset=dataset,
|
221
|
-
slurm_user=slurm_user,
|
222
|
-
slurm_account=job.slurm_account,
|
223
|
-
user_cache_dir=user_cache_dir,
|
224
324
|
workflow_dir_local=WORKFLOW_DIR_LOCAL,
|
225
325
|
workflow_dir_remote=WORKFLOW_DIR_REMOTE,
|
226
326
|
logger_name=logger_name,
|
227
327
|
worker_init=worker_init,
|
228
328
|
first_task_index=job.first_task_index,
|
229
329
|
last_task_index=job.last_task_index,
|
330
|
+
**backend_specific_kwargs,
|
230
331
|
)
|
231
332
|
|
232
333
|
logger.info(
|
@@ -275,18 +376,14 @@ async def submit_workflow(
|
|
275
376
|
dataset.images = latest_images
|
276
377
|
db_sync.merge(dataset)
|
277
378
|
|
278
|
-
job.status = JobStatusTypeV2.FAILED
|
279
|
-
job.end_timestamp = get_timestamp()
|
280
|
-
|
281
379
|
exception_args_string = "\n".join(e.args)
|
282
|
-
|
380
|
+
log_msg = (
|
283
381
|
f"TASK ERROR: "
|
284
382
|
f"Task name: {e.task_name}, "
|
285
383
|
f"position in Workflow: {e.workflow_task_order}\n"
|
286
384
|
f"TRACEBACK:\n{exception_args_string}"
|
287
385
|
)
|
288
|
-
db_sync
|
289
|
-
db_sync.commit()
|
386
|
+
fail_job(db=db_sync, job=job, log_msg=log_msg, logger_name=logger_name)
|
290
387
|
|
291
388
|
except JobExecutionError as e:
|
292
389
|
|
@@ -309,12 +406,15 @@ async def submit_workflow(
|
|
309
406
|
dataset.images = latest_images
|
310
407
|
db_sync.merge(dataset)
|
311
408
|
|
312
|
-
|
313
|
-
|
314
|
-
|
315
|
-
|
316
|
-
|
317
|
-
|
409
|
+
fail_job(
|
410
|
+
db=db_sync,
|
411
|
+
job=job,
|
412
|
+
log_msg=(
|
413
|
+
f"JOB ERROR in Fractal job {job.id}:\n"
|
414
|
+
f"TRACEBACK:\n{e.assemble_error()}"
|
415
|
+
),
|
416
|
+
logger_name=logger_name,
|
417
|
+
)
|
318
418
|
|
319
419
|
except Exception:
|
320
420
|
|
@@ -338,15 +438,16 @@ async def submit_workflow(
|
|
338
438
|
if latest_images is not None:
|
339
439
|
dataset.images = latest_images
|
340
440
|
db_sync.merge(dataset)
|
341
|
-
|
342
|
-
|
343
|
-
|
344
|
-
|
345
|
-
|
346
|
-
|
441
|
+
fail_job(
|
442
|
+
db=db_sync,
|
443
|
+
job=job,
|
444
|
+
log_msg=(
|
445
|
+
f"UNKNOWN ERROR in Fractal job {job.id}\n"
|
446
|
+
f"TRACEBACK:\n{current_traceback}"
|
447
|
+
),
|
448
|
+
logger_name=logger_name,
|
347
449
|
)
|
348
|
-
|
349
|
-
db_sync.commit()
|
450
|
+
|
350
451
|
finally:
|
351
452
|
reset_logger_handlers(logger)
|
352
453
|
db_sync.close()
|
@@ -1,9 +1,11 @@
|
|
1
|
+
from concurrent.futures.process import BrokenProcessPool
|
1
2
|
from pathlib import Path
|
2
3
|
from typing import Optional
|
3
4
|
|
4
5
|
from ....models.v2 import DatasetV2
|
5
6
|
from ....models.v2 import WorkflowV2
|
6
7
|
from ...async_wrap import async_wrap
|
8
|
+
from ...exceptions import JobExecutionError
|
7
9
|
from ...filenames import SHUTDOWN_FILENAME
|
8
10
|
from ...set_start_and_last_task_index import set_start_and_last_task_index
|
9
11
|
from ..runner import execute_tasks_v2
|
@@ -29,21 +31,29 @@ def _process_workflow(
|
|
29
31
|
[process_workflow][fractal_server.app.runner.v2._local_experimental.process_workflow]
|
30
32
|
for the call signature.
|
31
33
|
"""
|
32
|
-
|
33
34
|
with FractalProcessPoolExecutor(
|
34
35
|
shutdown_file=workflow_dir_local / SHUTDOWN_FILENAME
|
35
36
|
) as executor:
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
37
|
+
try:
|
38
|
+
new_dataset_attributes = execute_tasks_v2(
|
39
|
+
wf_task_list=workflow.task_list[
|
40
|
+
first_task_index : (last_task_index + 1) # noqa
|
41
|
+
],
|
42
|
+
dataset=dataset,
|
43
|
+
executor=executor,
|
44
|
+
workflow_dir_local=workflow_dir_local,
|
45
|
+
workflow_dir_remote=workflow_dir_local,
|
46
|
+
logger_name=logger_name,
|
47
|
+
submit_setup_call=_local_submit_setup,
|
48
|
+
)
|
49
|
+
except BrokenProcessPool as e:
|
50
|
+
raise JobExecutionError(
|
51
|
+
info=(
|
52
|
+
"Job failed with BrokenProcessPool error, likely due to "
|
53
|
+
f"an executor shutdown.\nOriginal error:\n{e.args[0]}"
|
54
|
+
)
|
55
|
+
)
|
56
|
+
|
47
57
|
return new_dataset_attributes
|
48
58
|
|
49
59
|
|
@@ -2,8 +2,6 @@
|
|
2
2
|
Custom version of Python
|
3
3
|
[ProcessPoolExecutor](https://docs.python.org/3/library/concurrent.futures.html#concurrent.futures.ProcessPoolExecutor)).
|
4
4
|
"""
|
5
|
-
import os
|
6
|
-
import signal
|
7
5
|
import threading
|
8
6
|
import time
|
9
7
|
from concurrent.futures import ProcessPoolExecutor
|
@@ -14,13 +12,14 @@ from typing import Iterable
|
|
14
12
|
from typing import Optional
|
15
13
|
from typing import Sequence
|
16
14
|
|
15
|
+
import psutil
|
16
|
+
|
17
17
|
from ._local_config import get_default_local_backend_config
|
18
18
|
from ._local_config import LocalBackendConfig
|
19
19
|
from fractal_server.app.runner.exceptions import JobExecutionError
|
20
|
-
from fractal_server.logger import
|
21
|
-
|
20
|
+
from fractal_server.logger import set_logger
|
22
21
|
|
23
|
-
logger =
|
22
|
+
logger = set_logger("FractalProcessPoolExecutor")
|
24
23
|
|
25
24
|
|
26
25
|
class FractalProcessPoolExecutor(ProcessPoolExecutor):
|
@@ -66,12 +65,17 @@ class FractalProcessPoolExecutor(ProcessPoolExecutor):
|
|
66
65
|
"""
|
67
66
|
Running on '_shutdown_file_thread'.
|
68
67
|
"""
|
68
|
+
|
69
69
|
logger.info("Start terminating FractalProcessPoolExecutor processes.")
|
70
|
+
# We use 'psutil' in order to easily access the PIDs of the children.
|
70
71
|
if self._processes is not None:
|
71
72
|
for pid in self._processes.keys():
|
72
|
-
|
73
|
-
|
74
|
-
|
73
|
+
parent = psutil.Process(pid)
|
74
|
+
children = parent.children(recursive=True)
|
75
|
+
for child in children:
|
76
|
+
child.kill()
|
77
|
+
parent.kill()
|
78
|
+
logger.info(f"Process {pid} and its children terminated.")
|
75
79
|
logger.info("FractalProcessPoolExecutor processes terminated.")
|
76
80
|
|
77
81
|
def shutdown(self, *args, **kwargs) -> None:
|
@@ -24,16 +24,11 @@ from typing import Union
|
|
24
24
|
from ....models.v2 import DatasetV2
|
25
25
|
from ....models.v2 import WorkflowV2
|
26
26
|
from ...async_wrap import async_wrap
|
27
|
-
from ...executors.slurm.executor import FractalSlurmExecutor
|
27
|
+
from ...executors.slurm.sudo.executor import FractalSlurmExecutor
|
28
28
|
from ...set_start_and_last_task_index import set_start_and_last_task_index
|
29
29
|
from ..runner import execute_tasks_v2
|
30
30
|
from ._submit_setup import _slurm_submit_setup
|
31
31
|
|
32
|
-
# from .._common import execute_tasks
|
33
|
-
# from ..common import async_wrap
|
34
|
-
# from ..common import set_start_and_last_task_index
|
35
|
-
# from ..common import TaskParameters
|
36
|
-
|
37
32
|
|
38
33
|
def _process_workflow(
|
39
34
|
*,
|
@@ -0,0 +1,126 @@
|
|
1
|
+
# Copyright 2022 (C) Friedrich Miescher Institute for Biomedical Research and
|
2
|
+
# University of Zurich
|
3
|
+
#
|
4
|
+
# Original authors:
|
5
|
+
# Jacopo Nespolo <jacopo.nespolo@exact-lab.it>
|
6
|
+
# Tommaso Comparin <tommaso.comparin@exact-lab.it>
|
7
|
+
# Marco Franzon <marco.franzon@exact-lab.it>
|
8
|
+
#
|
9
|
+
# This file is part of Fractal and was originally developed by eXact lab S.r.l.
|
10
|
+
# <exact-lab.it> under contract with Liberali Lab from the Friedrich Miescher
|
11
|
+
# Institute for Biomedical Research and Pelkmans Lab from the University of
|
12
|
+
# Zurich.
|
13
|
+
"""
|
14
|
+
Slurm Backend
|
15
|
+
|
16
|
+
This backend runs fractal workflows in a SLURM cluster using Clusterfutures
|
17
|
+
Executor objects.
|
18
|
+
"""
|
19
|
+
from pathlib import Path
|
20
|
+
from typing import Any
|
21
|
+
from typing import Optional
|
22
|
+
from typing import Union
|
23
|
+
|
24
|
+
from fabric import Connection
|
25
|
+
|
26
|
+
from ....models.v2 import DatasetV2
|
27
|
+
from ....models.v2 import WorkflowV2
|
28
|
+
from ...async_wrap import async_wrap
|
29
|
+
from ...executors.slurm.ssh.executor import FractalSlurmSSHExecutor
|
30
|
+
from ...set_start_and_last_task_index import set_start_and_last_task_index
|
31
|
+
from ..runner import execute_tasks_v2
|
32
|
+
from ._submit_setup import _slurm_submit_setup
|
33
|
+
|
34
|
+
|
35
|
+
def _process_workflow(
|
36
|
+
*,
|
37
|
+
workflow: WorkflowV2,
|
38
|
+
dataset: DatasetV2,
|
39
|
+
logger_name: str,
|
40
|
+
workflow_dir_local: Path,
|
41
|
+
workflow_dir_remote: Path,
|
42
|
+
first_task_index: int,
|
43
|
+
last_task_index: int,
|
44
|
+
connection: Connection,
|
45
|
+
worker_init: Optional[Union[str, list[str]]] = None,
|
46
|
+
) -> dict[str, Any]:
|
47
|
+
"""
|
48
|
+
Internal processing routine for the SLURM backend
|
49
|
+
|
50
|
+
This function initialises a FractalSlurmSSHExecutor, setting logging,
|
51
|
+
workflow working dir and user to impersonate. It then schedules the
|
52
|
+
workflow tasks and returns the new dataset attributes
|
53
|
+
|
54
|
+
Cf.
|
55
|
+
[process_workflow][fractal_server.app.runner.v2._local.process_workflow]
|
56
|
+
|
57
|
+
Returns:
|
58
|
+
new_dataset_attributes:
|
59
|
+
"""
|
60
|
+
|
61
|
+
if isinstance(worker_init, str):
|
62
|
+
worker_init = worker_init.split("\n")
|
63
|
+
|
64
|
+
with FractalSlurmSSHExecutor(
|
65
|
+
connection=connection,
|
66
|
+
workflow_dir_local=workflow_dir_local,
|
67
|
+
workflow_dir_remote=workflow_dir_remote,
|
68
|
+
common_script_lines=worker_init,
|
69
|
+
) as executor:
|
70
|
+
new_dataset_attributes = execute_tasks_v2(
|
71
|
+
wf_task_list=workflow.task_list[
|
72
|
+
first_task_index : (last_task_index + 1) # noqa
|
73
|
+
], # noqa
|
74
|
+
dataset=dataset,
|
75
|
+
executor=executor,
|
76
|
+
workflow_dir_local=workflow_dir_local,
|
77
|
+
workflow_dir_remote=workflow_dir_remote,
|
78
|
+
logger_name=logger_name,
|
79
|
+
submit_setup_call=_slurm_submit_setup,
|
80
|
+
)
|
81
|
+
return new_dataset_attributes
|
82
|
+
|
83
|
+
|
84
|
+
async def process_workflow(
|
85
|
+
*,
|
86
|
+
workflow: WorkflowV2,
|
87
|
+
dataset: DatasetV2,
|
88
|
+
workflow_dir_local: Path,
|
89
|
+
workflow_dir_remote: Optional[Path] = None,
|
90
|
+
first_task_index: Optional[int] = None,
|
91
|
+
last_task_index: Optional[int] = None,
|
92
|
+
logger_name: str,
|
93
|
+
# Not used
|
94
|
+
connection: Connection,
|
95
|
+
user_cache_dir: Optional[str] = None,
|
96
|
+
slurm_user: Optional[str] = None,
|
97
|
+
slurm_account: Optional[str] = None,
|
98
|
+
worker_init: Optional[str] = None,
|
99
|
+
) -> dict:
|
100
|
+
"""
|
101
|
+
Process workflow (SLURM backend public interface)
|
102
|
+
|
103
|
+
Cf.
|
104
|
+
[process_workflow][fractal_server.app.runner.v2._local.process_workflow]
|
105
|
+
"""
|
106
|
+
|
107
|
+
# Set values of first_task_index and last_task_index
|
108
|
+
num_tasks = len(workflow.task_list)
|
109
|
+
first_task_index, last_task_index = set_start_and_last_task_index(
|
110
|
+
num_tasks,
|
111
|
+
first_task_index=first_task_index,
|
112
|
+
last_task_index=last_task_index,
|
113
|
+
)
|
114
|
+
|
115
|
+
new_dataset_attributes = await async_wrap(_process_workflow)(
|
116
|
+
workflow=workflow,
|
117
|
+
dataset=dataset,
|
118
|
+
logger_name=logger_name,
|
119
|
+
workflow_dir_local=workflow_dir_local,
|
120
|
+
workflow_dir_remote=workflow_dir_remote,
|
121
|
+
first_task_index=first_task_index,
|
122
|
+
last_task_index=last_task_index,
|
123
|
+
worker_init=worker_init,
|
124
|
+
connection=connection,
|
125
|
+
)
|
126
|
+
return new_dataset_attributes
|