fractal-server 2.2.0a1__py3-none-any.whl → 2.3.0a0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fractal_server/__init__.py +1 -1
- fractal_server/app/models/v1/state.py +1 -2
- fractal_server/app/routes/admin/v1.py +2 -2
- fractal_server/app/routes/admin/v2.py +2 -2
- fractal_server/app/routes/api/v1/job.py +2 -2
- fractal_server/app/routes/api/v1/task_collection.py +4 -4
- fractal_server/app/routes/api/v2/__init__.py +23 -3
- fractal_server/app/routes/api/v2/job.py +2 -2
- fractal_server/app/routes/api/v2/submit.py +6 -0
- fractal_server/app/routes/api/v2/task_collection.py +74 -34
- fractal_server/app/routes/api/v2/task_collection_custom.py +144 -0
- fractal_server/app/routes/api/v2/task_collection_ssh.py +125 -0
- fractal_server/app/routes/aux/_runner.py +10 -2
- fractal_server/app/runner/compress_folder.py +120 -0
- fractal_server/app/runner/executors/slurm/__init__.py +0 -3
- fractal_server/app/runner/executors/slurm/_batching.py +0 -1
- fractal_server/app/runner/executors/slurm/_slurm_config.py +9 -9
- fractal_server/app/runner/executors/slurm/ssh/__init__.py +3 -0
- fractal_server/app/runner/executors/slurm/ssh/_executor_wait_thread.py +112 -0
- fractal_server/app/runner/executors/slurm/ssh/_slurm_job.py +120 -0
- fractal_server/app/runner/executors/slurm/ssh/executor.py +1490 -0
- fractal_server/app/runner/executors/slurm/sudo/__init__.py +3 -0
- fractal_server/app/runner/executors/slurm/{_check_jobs_status.py → sudo/_check_jobs_status.py} +1 -1
- fractal_server/app/runner/executors/slurm/{_executor_wait_thread.py → sudo/_executor_wait_thread.py} +1 -1
- fractal_server/app/runner/executors/slurm/{_subprocess_run_as_user.py → sudo/_subprocess_run_as_user.py} +1 -1
- fractal_server/app/runner/executors/slurm/{executor.py → sudo/executor.py} +12 -12
- fractal_server/app/runner/extract_archive.py +38 -0
- fractal_server/app/runner/v1/__init__.py +78 -40
- fractal_server/app/runner/v1/_slurm/__init__.py +1 -1
- fractal_server/app/runner/v2/__init__.py +147 -62
- fractal_server/app/runner/v2/_local_experimental/__init__.py +22 -12
- fractal_server/app/runner/v2/_local_experimental/executor.py +12 -8
- fractal_server/app/runner/v2/_slurm/__init__.py +1 -6
- fractal_server/app/runner/v2/_slurm_ssh/__init__.py +126 -0
- fractal_server/app/runner/v2/_slurm_ssh/_submit_setup.py +83 -0
- fractal_server/app/runner/v2/_slurm_ssh/get_slurm_config.py +182 -0
- fractal_server/app/runner/v2/runner_functions_low_level.py +9 -11
- fractal_server/app/runner/versions.py +30 -0
- fractal_server/app/schemas/v1/__init__.py +1 -0
- fractal_server/app/schemas/{state.py → v1/state.py} +4 -21
- fractal_server/app/schemas/v2/__init__.py +4 -1
- fractal_server/app/schemas/v2/task_collection.py +97 -27
- fractal_server/config.py +184 -3
- fractal_server/main.py +25 -1
- fractal_server/ssh/__init__.py +4 -0
- fractal_server/ssh/_fabric.py +190 -0
- fractal_server/tasks/utils.py +12 -64
- fractal_server/tasks/v1/background_operations.py +2 -2
- fractal_server/tasks/{endpoint_operations.py → v1/endpoint_operations.py} +7 -12
- fractal_server/tasks/v1/utils.py +67 -0
- fractal_server/tasks/v2/_TaskCollectPip.py +61 -32
- fractal_server/tasks/v2/_venv_pip.py +195 -0
- fractal_server/tasks/v2/background_operations.py +257 -295
- fractal_server/tasks/v2/background_operations_ssh.py +304 -0
- fractal_server/tasks/v2/endpoint_operations.py +136 -0
- fractal_server/tasks/v2/templates/_1_create_venv.sh +46 -0
- fractal_server/tasks/v2/templates/_2_upgrade_pip.sh +30 -0
- fractal_server/tasks/v2/templates/_3_pip_install.sh +32 -0
- fractal_server/tasks/v2/templates/_4_pip_freeze.sh +21 -0
- fractal_server/tasks/v2/templates/_5_pip_show.sh +59 -0
- fractal_server/tasks/v2/utils.py +54 -0
- {fractal_server-2.2.0a1.dist-info → fractal_server-2.3.0a0.dist-info}/METADATA +4 -2
- {fractal_server-2.2.0a1.dist-info → fractal_server-2.3.0a0.dist-info}/RECORD +66 -42
- fractal_server/tasks/v2/get_collection_data.py +0 -14
- {fractal_server-2.2.0a1.dist-info → fractal_server-2.3.0a0.dist-info}/LICENSE +0 -0
- {fractal_server-2.2.0a1.dist-info → fractal_server-2.3.0a0.dist-info}/WHEEL +0 -0
- {fractal_server-2.2.0a1.dist-info → fractal_server-2.3.0a0.dist-info}/entry_points.txt +0 -0
@@ -29,18 +29,18 @@ import cloudpickle
 from cfut import SlurmExecutor
 from cfut.util import random_string
 
-from
-from
-from
-from
-from
-from
-from
-from
-from .
+from ......config import get_settings
+from ......logger import set_logger
+from ......syringe import Inject
+from ....exceptions import JobExecutionError
+from ....exceptions import TaskExecutionError
+from ....filenames import SHUTDOWN_FILENAME
+from ....task_files import get_task_file_paths
+from ....task_files import TaskFiles
+from ...slurm._slurm_config import get_default_slurm_config
+from ...slurm._slurm_config import SlurmConfig
+from .._batching import heuristics
 from ._executor_wait_thread import FractalSlurmWaitThread
-from ._slurm_config import get_default_slurm_config
-from ._slurm_config import SlurmConfig
 from ._subprocess_run_as_user import _glob_as_user
 from ._subprocess_run_as_user import _glob_as_user_strict
 from ._subprocess_run_as_user import _path_exists_as_user
@@ -1180,7 +1180,7 @@ class FractalSlurmExecutor(SlurmExecutor):
 
         # Prepare SLURM preamble based on SlurmConfig object
         script_lines = slurm_config.to_sbatch_preamble(
-
+            remote_export_dir=self.user_cache_dir
         )
 
         # Extend SLURM preamble with variable which are not in SlurmConfig, and
@@ -0,0 +1,38 @@
+import sys
+import tarfile
+from pathlib import Path
+
+
+def _remove_suffix(*, string: str, suffix: str) -> str:
+    if string.endswith(suffix):
+        return string[: -len(suffix)]
+    else:
+        raise ValueError(f"Cannot remove {suffix=} from {string=}.")
+
+
+if __name__ == "__main__":
+    help_msg = (
+        "Expected use:\n"
+        "python -m fractal_server.app.runner.extract_archive "
+        "path/to/archive.tar.gz"
+    )
+
+    if len(sys.argv[1:]) != 1:
+        raise ValueError(
+            f"Invalid argument.\n{help_msg}\nProvided: {sys.argv=}"
+        )
+    elif not sys.argv[1].endswith(".tar.gz"):
+        raise ValueError(
+            f"Invalid argument.\n{help_msg}\nProvided: {sys.argv=}"
+        )
+
+    tarfile_path = Path(sys.argv[1])
+
+    print(f"[extract_archive.py] {tarfile_path=}")
+
+    job_folder = tarfile_path.parent
+    subfolder_name = _remove_suffix(string=tarfile_path.name, suffix=".tar.gz")
+    with tarfile.open(tarfile_path) as tar:
+        tar.extractall(path=Path(job_folder, subfolder_name).as_posix())
+
+    print(f"[extract_archive.py] {tarfile_path=}")
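The hunk above is the new fractal_server/app/runner/extract_archive.py (the +38-line file in the list above): it unpacks path/to/<name>.tar.gz into a sibling <name>/ folder and is invoked as its own help message states, via python -m fractal_server.app.runner.extract_archive path/to/archive.tar.gz. Its counterpart fractal_server/app/runner/compress_folder.py (+120 lines) is not shown in this diff; purely as an illustration of the expected round trip, and not the actual implementation, a minimal compression step could look like this:

import sys
import tarfile
from pathlib import Path

# Illustrative sketch only: the real compress_folder.py added in 2.3.0a0 is not
# shown in this diff and is substantially larger than this.
if __name__ == "__main__":
    subfolder = Path(sys.argv[1])  # e.g. /path/to/job_dir/0_some_task
    tar_path = subfolder.parent / f"{subfolder.name}.tar.gz"
    with tarfile.open(tar_path, "w:gz") as tar:
        # Store members relative to the subfolder root, so that
        # extract_archive.py can unpack them into <name>/ next to the archive.
        tar.add(subfolder, arcname=".")
    print(f"[compress_folder sketch] wrote {tar_path}")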
@@ -22,6 +22,10 @@ import traceback
 from pathlib import Path
 from typing import Optional
 
+from sqlalchemy.orm import Session as DBSyncSession
+
+from ....logger import get_logger
+from ....logger import reset_logger_handlers
 from ....logger import set_logger
 from ....syringe import Inject
 from ....utils import get_timestamp
@@ -33,7 +37,7 @@ from ...models.v1 import WorkflowTask
 from ...schemas.v1 import JobStatusTypeV1
 from ..exceptions import JobExecutionError
 from ..exceptions import TaskExecutionError
-from ..executors.slurm._subprocess_run_as_user import (
+from ..executors.slurm.sudo._subprocess_run_as_user import (
     _mkdir_as_user,
 )
 from ..filenames import WORKFLOW_LOG_FILENAME
@@ -53,6 +57,27 @@ _backends["local"] = local_process_workflow
 _backends["slurm"] = slurm_process_workflow
 
 
+def fail_job(
+    *,
+    db: DBSyncSession,
+    job: ApplyWorkflow,
+    log_msg: str,
+    logger_name: str,
+    emit_log: bool = False,
+) -> None:
+    logger = get_logger(logger_name=logger_name)
+    if emit_log:
+        logger.error(log_msg)
+    reset_logger_handlers(logger)
+    job.status = JobStatusTypeV1.FAILED
+    job.end_timestamp = get_timestamp()
+    job.log = log_msg
+    db.merge(job)
+    db.commit()
+    db.close()
+    return
+
+
 async def submit_workflow(
     *,
     workflow_id: int,
@@ -91,21 +116,41 @@ async def submit_workflow(
         slurm backend.
     """
 
-
-
-    FRACTAL_RUNNER_BACKEND = settings.FRACTAL_RUNNER_BACKEND
-    if FRACTAL_RUNNER_BACKEND == "local":
-        process_workflow = local_process_workflow
-    elif FRACTAL_RUNNER_BACKEND == "slurm":
-        process_workflow = slurm_process_workflow
-    else:
-        raise RuntimeError(f"Invalid runner backend {FRACTAL_RUNNER_BACKEND=}")
+    logger_name = f"WF{workflow_id}_job{job_id}"
+    logger = set_logger(logger_name=logger_name)
 
     with next(DB.get_sync_db()) as db_sync:
 
         job: ApplyWorkflow = db_sync.get(ApplyWorkflow, job_id)
         if not job:
-
+            logger.error(f"ApplyWorkflow {job_id} does not exist")
+            return
+
+        settings = Inject(get_settings)
+        FRACTAL_RUNNER_BACKEND = settings.FRACTAL_RUNNER_BACKEND
+        if FRACTAL_RUNNER_BACKEND == "local":
+            process_workflow = local_process_workflow
+        elif FRACTAL_RUNNER_BACKEND == "slurm":
+            process_workflow = slurm_process_workflow
+        else:
+
+            if FRACTAL_RUNNER_BACKEND == "local_experimental":
+                log_msg = (
+                    f"{FRACTAL_RUNNER_BACKEND=} is not available for v1 jobs."
+                )
+            else:
+                log_msg = f"Invalid {FRACTAL_RUNNER_BACKEND=}"
+
+            fail_job(
+                job=job,
+                db=db_sync,
+                log_msg=log_msg,
+                logger_name=logger_name,
+                emit_log=True,
+            )
+            return
+
+        # Declare runner backend and set `process_workflow` function
 
         input_dataset: Dataset = db_sync.get(Dataset, input_dataset_id)
         output_dataset: Dataset = db_sync.get(Dataset, output_dataset_id)
@@ -126,12 +171,9 @@ async def submit_workflow(
             log_msg += (
                 f"Cannot fetch workflow {workflow_id} from database\n"
             )
-
-
-
-            db_sync.merge(job)
-            db_sync.commit()
-            db_sync.close()
+            fail_job(
+                db=db_sync, job=job, log_msg=log_msg, logger_name=logger_name
+            )
             return
 
         # Prepare some of process_workflow arguments
@@ -147,9 +189,14 @@ async def submit_workflow(
         )
 
         if WORKFLOW_DIR_LOCAL.exists():
-
-
+            fail_job(
+                db=db_sync,
+                job=job,
+                log_msg=f"Workflow dir {WORKFLOW_DIR_LOCAL} already exists.",
+                logger_name=logger_name,
+                emit_log=True,
             )
+            return
 
         # Create WORKFLOW_DIR
         original_umask = os.umask(0)
@@ -202,7 +249,6 @@ async def submit_workflow(
         db_sync.refresh(workflow)
 
         # Write logs
-        logger_name = f"WF{workflow_id}_job{job_id}"
         log_file_path = WORKFLOW_DIR_LOCAL / WORKFLOW_LOG_FILENAME
         logger = set_logger(
             logger_name=logger_name,
@@ -302,19 +348,14 @@ async def submit_workflow(
 
             db_sync.merge(output_dataset)
 
-            job.status = JobStatusTypeV1.FAILED
-            job.end_timestamp = get_timestamp()
-
             exception_args_string = "\n".join(e.args)
-
+            log_msg = (
                 f"TASK ERROR: "
                 f"Task name: {e.task_name}, "
                 f"position in Workflow: {e.workflow_task_order}\n"
                 f"TRACEBACK:\n{exception_args_string}"
             )
-            db_sync
-            close_job_logger(logger)
-            db_sync.commit()
+            fail_job(db=db_sync, job=job, log_msg=log_msg, logger_name=logger_name)
 
         except JobExecutionError as e:
 
@@ -334,14 +375,13 @@ async def submit_workflow(
             )
 
             db_sync.merge(output_dataset)
-
-            job.status = JobStatusTypeV1.FAILED
-            job.end_timestamp = get_timestamp()
             error = e.assemble_error()
-
-
-
-
+            fail_job(
+                db=db_sync,
+                job=job,
+                log_msg=f"JOB ERROR in Fractal job {job.id}:\nTRACEBACK:\n{error}",
+                logger_name=logger_name,
+            )
 
         except Exception:
 
@@ -364,14 +404,12 @@ async def submit_workflow(
 
             db_sync.merge(output_dataset)
 
-
-            job.end_timestamp = get_timestamp()
-            job.log = (
+            log_msg = (
                 f"UNKNOWN ERROR in Fractal job {job.id}\n"
                 f"TRACEBACK:\n{current_traceback}"
             )
-            db_sync
-
-            db_sync.commit()
+            fail_job(db=db_sync, job=job, log_msg=log_msg, logger_name=logger_name)
+
     finally:
         db_sync.close()
+        reset_logger_handlers(logger)
@@ -22,7 +22,7 @@ from typing import Optional
 from typing import Union
 
 from ...async_wrap import async_wrap
-from ...executors.slurm.executor import FractalSlurmExecutor
+from ...executors.slurm.sudo.executor import FractalSlurmExecutor
 from ...set_start_and_last_task_index import set_start_and_last_task_index
 from .._common import execute_tasks
 from ..common import TaskParameters
@@ -5,14 +5,18 @@ This module is the single entry point to the runner backend subsystem V2.
 Other subystems should only import this module and not its submodules or
 the individual backends.
 """
+import logging
 import os
 import traceback
 from pathlib import Path
 from typing import Optional
 
+from fabric import Connection  # FIXME SSH: try/except import
+from sqlalchemy.orm import Session as DBSyncSession
 from sqlalchemy.orm.attributes import flag_modified
 
 from ....config import get_settings
+from ....logger import get_logger
 from ....logger import reset_logger_handlers
 from ....logger import set_logger
 from ....syringe import Inject
@@ -25,14 +29,15 @@ from ...models.v2 import WorkflowV2
 from ...schemas.v2 import JobStatusTypeV2
 from ..exceptions import JobExecutionError
 from ..exceptions import TaskExecutionError
-from ..executors.slurm._subprocess_run_as_user import _mkdir_as_user
+from ..executors.slurm.sudo._subprocess_run_as_user import _mkdir_as_user
 from ..filenames import WORKFLOW_LOG_FILENAME
 from ..task_files import task_subfolder_name
 from ._local import process_workflow as local_process_workflow
 from ._local_experimental import (
     process_workflow as local_experimental_process_workflow,
 )
-from ._slurm import process_workflow as
+from ._slurm import process_workflow as slurm_sudo_process_workflow
+from ._slurm_ssh import process_workflow as slurm_ssh_process_workflow
 from .handle_failed_job import assemble_filters_failed_job
 from .handle_failed_job import assemble_history_failed_job
 from .handle_failed_job import assemble_images_failed_job
@@ -40,8 +45,30 @@ from fractal_server import __VERSION__
 
 _backends = {}
 _backends["local"] = local_process_workflow
+_backends["slurm"] = slurm_sudo_process_workflow
+_backends["slurm_ssh"] = slurm_ssh_process_workflow
 _backends["local_experimental"] = local_experimental_process_workflow
-
+
+
+def fail_job(
+    *,
+    db: DBSyncSession,
+    job: JobV2,
+    log_msg: str,
+    logger_name: str,
+    emit_log: bool = False,
+) -> None:
+    logger = get_logger(logger_name=logger_name)
+    if emit_log:
+        logger.error(log_msg)
+    reset_logger_handlers(logger)
+    job.status = JobStatusTypeV2.FAILED
+    job.end_timestamp = get_timestamp()
+    job.log = log_msg
+    db.merge(job)
+    db.commit()
+    db.close()
+    return
 
 
 async def submit_workflow(
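The v2 runner module now registers the slurm and slurm_ssh backends in the `_backends` dict and funnels every failure path through the new `fail_job` helper. A self-contained sketch of this dispatch-plus-fail pattern, with stand-in types and backend callables rather than the real fractal-server objects:

from typing import Callable

# Stand-in registry and job type; the real code maps backend names to the
# process_workflow callables imported above and uses JobV2 / DBSyncSession.
_backends_sketch: dict[str, Callable[[], str]] = {
    "local": lambda: "ran locally",
    "slurm_ssh": lambda: "ran via SSH-submitted SLURM job",
}


class DummyJob:
    def __init__(self) -> None:
        self.status = "submitted"
        self.log = ""


def fail_job_sketch(*, job: DummyJob, log_msg: str) -> None:
    # Reduced version of the helper above: mark the job FAILED and record why.
    job.status = "failed"
    job.log = log_msg


def run(job: DummyJob, backend: str) -> None:
    try:
        process_workflow = _backends_sketch[backend]
    except KeyError as e:
        fail_job_sketch(job=job, log_msg=f"Invalid backend {backend!r} ({e})")
        return
    print(process_workflow())


run(DummyJob(), "slurm_ssh")      # prints the stand-in "ran via ..." message
run(DummyJob(), "not-a-backend")  # marks the dummy job as failed instead of raising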
@@ -52,6 +79,7 @@ async def submit_workflow(
     worker_init: Optional[str] = None,
     slurm_user: Optional[str] = None,
     user_cache_dir: Optional[str] = None,
+    connection: Optional[Connection] = None,
 ) -> None:
     """
     Prepares a workflow and applies it to a dataset
@@ -78,24 +106,36 @@ async def submit_workflow(
         The username to impersonate for the workflow execution, for the
         slurm backend.
     """
-
     # Declare runner backend and set `process_workflow` function
     settings = Inject(get_settings)
     FRACTAL_RUNNER_BACKEND = settings.FRACTAL_RUNNER_BACKEND
-
-
-    elif FRACTAL_RUNNER_BACKEND == "local_experimental":
-        process_workflow = local_experimental_process_workflow
-    elif FRACTAL_RUNNER_BACKEND == "slurm":
-        process_workflow = slurm_process_workflow
-    else:
-        raise RuntimeError(f"Invalid runner backend {FRACTAL_RUNNER_BACKEND=}")
+    logger_name = f"WF{workflow_id}_job{job_id}"
+    logger = set_logger(logger_name=logger_name)
 
     with next(DB.get_sync_db()) as db_sync:
 
         job: JobV2 = db_sync.get(JobV2, job_id)
         if not job:
-
+            logger.error(f"JobV2 {job_id} does not exist")
+            return
+
+        # Declare runner backend and set `process_workflow` function
+        settings = Inject(get_settings)
+        FRACTAL_RUNNER_BACKEND = settings.FRACTAL_RUNNER_BACKEND
+        try:
+            process_workflow = _backends[settings.FRACTAL_RUNNER_BACKEND]
+        except KeyError as e:
+            fail_job(
+                db=db_sync,
+                job=job,
+                log_msg=(
+                    f"Invalid {FRACTAL_RUNNER_BACKEND=}.\n"
+                    f"Original KeyError: {str(e)}"
+                ),
+                logger_name=logger_name,
+                emit_log=True,
+            )
+            return
 
         dataset: DatasetV2 = db_sync.get(DatasetV2, dataset_id)
         workflow: WorkflowV2 = db_sync.get(WorkflowV2, workflow_id)
@@ -107,31 +147,28 @@ async def submit_workflow(
             log_msg += (
                 f"Cannot fetch workflow {workflow_id} from database\n"
             )
-
-
-
-            db_sync.merge(job)
-            db_sync.commit()
-            db_sync.close()
+            fail_job(
+                db=db_sync, job=job, log_msg=log_msg, logger_name=logger_name
+            )
             return
 
         # Define and create server-side working folder
         WORKFLOW_DIR_LOCAL = Path(job.working_dir)
         if WORKFLOW_DIR_LOCAL.exists():
-
-
-
-
-
-
+            fail_job(
+                db=db_sync,
+                job=job,
+                log_msg=f"Workflow dir {WORKFLOW_DIR_LOCAL} already exists.",
+                logger_name=logger_name,
+                emit_log=True,
+            )
             return
 
         try:
 
-            # Create
+            # Create WORKFLOW_DIR_LOCAL
             original_umask = os.umask(0)
             WORKFLOW_DIR_LOCAL.mkdir(parents=True, mode=0o755)
-
             os.umask(original_umask)
 
             # Define and create WORKFLOW_DIR_REMOTE
@@ -146,6 +183,24 @@ async def submit_workflow(
                 _mkdir_as_user(
                     folder=str(WORKFLOW_DIR_REMOTE), user=slurm_user
                 )
+            elif FRACTAL_RUNNER_BACKEND == "slurm_ssh":
+                WORKFLOW_DIR_REMOTE = (
+                    Path(settings.FRACTAL_SLURM_SSH_WORKING_BASE_DIR)
+                    / WORKFLOW_DIR_LOCAL.name
+                )
+                # FIXME SSH: move mkdir to executor, likely within handshake
+
+                from ....ssh._fabric import _mkdir_over_ssh
+
+                _mkdir_over_ssh(
+                    folder=str(WORKFLOW_DIR_REMOTE), connection=connection
+                )
+                logging.info(f"Created {str(WORKFLOW_DIR_REMOTE)} via SSH.")
+            else:
+                logging.error(
+                    "Invalid FRACTAL_RUNNER_BACKEND="
+                    f"{settings.FRACTAL_RUNNER_BACKEND}."
+                )
 
             # Create all tasks subfolders
             for order in range(job.first_task_index, job.last_task_index + 1):
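The slurm_ssh branch above imports `_mkdir_over_ssh` from the new fractal_server/ssh/_fabric.py (+190 lines), whose body is not part of this excerpt. As a rough sketch of what such a helper can do with the fabric Connection already imported at the top of this module (an assumption, not the actual implementation):

from fabric import Connection


def _mkdir_over_ssh_sketch(*, folder: str, connection: Connection) -> None:
    # Hypothetical stand-in for _mkdir_over_ssh: `mkdir -p` is idempotent,
    # so repeating it for an existing remote folder is harmless.
    connection.run(f"mkdir -p {folder}", hide=True)


# Usage with assumed host/user values (e.g. from FRACTAL_SLURM_SSH_* settings):
# with Connection(host="slurm-login.example.org", user="fractal") as conn:
#     _mkdir_over_ssh_sketch(folder="/scratch/fractal/workflow_0001", connection=conn)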
@@ -166,16 +221,20 @@ async def submit_workflow(
                         folder=str(WORKFLOW_DIR_REMOTE / subfolder_name),
                         user=slurm_user,
                     )
+                else:
+                    logging.info("Skip remote-subfolder creation")
         except Exception as e:
-
-
-
-
-
+            error_type = type(e).__name__
+            fail_job(
+                db=db_sync,
+                job=job,
+                log_msg=(
+                    f"{error_type} error occurred while creating job folder "
+                    f"and subfolders.\nOriginal error: {str(e)}"
+                ),
+                logger_name=logger_name,
+                emit_log=True,
             )
-            db_sync.merge(job)
-            db_sync.commit()
-            db_sync.close()
             return
 
         # After Session.commit() is called, either explicitly or when using a
@@ -195,7 +254,6 @@ async def submit_workflow(
             db_sync.refresh(wftask)
 
         # Write logs
-        logger_name = f"WF{workflow_id}_job{job_id}"
        log_file_path = WORKFLOW_DIR_LOCAL / WORKFLOW_LOG_FILENAME
         logger = set_logger(
             logger_name=logger_name,
@@ -207,9 +265,17 @@ async def submit_workflow(
         )
         logger.debug(f"fractal_server.__VERSION__: {__VERSION__}")
         logger.debug(f"FRACTAL_RUNNER_BACKEND: {FRACTAL_RUNNER_BACKEND}")
-
-
-
+        if FRACTAL_RUNNER_BACKEND == "slurm":
+            logger.debug(f"slurm_user: {slurm_user}")
+            logger.debug(f"slurm_account: {job.slurm_account}")
+            logger.debug(f"worker_init: {worker_init}")
+        elif FRACTAL_RUNNER_BACKEND == "slurm_ssh":
+            logger.debug(f"ssh_host: {settings.FRACTAL_SLURM_SSH_HOST}")
+            logger.debug(f"ssh_user: {settings.FRACTAL_SLURM_SSH_USER}")
+            logger.debug(
+                f"base dir: {settings.FRACTAL_SLURM_SSH_WORKING_BASE_DIR}"
+            )
+            logger.debug(f"worker_init: {worker_init}")
         logger.debug(f"job.id: {job.id}")
         logger.debug(f"job.working_dir: {job.working_dir}")
         logger.debug(f"job.working_dir_user: {job.working_dir_user}")
@@ -218,6 +284,27 @@ async def submit_workflow(
         logger.debug(f'START workflow "{workflow.name}"')
 
         try:
+            if FRACTAL_RUNNER_BACKEND == "local":
+                process_workflow = local_process_workflow
+                backend_specific_kwargs = {}
+            elif FRACTAL_RUNNER_BACKEND == "local_experimental":
+                process_workflow = local_experimental_process_workflow
+                backend_specific_kwargs = {}
+            elif FRACTAL_RUNNER_BACKEND == "slurm":
+                process_workflow = slurm_sudo_process_workflow
+                backend_specific_kwargs = dict(
+                    slurm_user=slurm_user,
+                    slurm_account=job.slurm_account,
+                    user_cache_dir=user_cache_dir,
+                )
+            elif FRACTAL_RUNNER_BACKEND == "slurm_ssh":
+                process_workflow = slurm_ssh_process_workflow
+                backend_specific_kwargs = dict(connection=connection)
+            else:
+                raise RuntimeError(
+                    f"Invalid runner backend {FRACTAL_RUNNER_BACKEND=}"
+                )
+
             # "The Session.close() method does not prevent the Session from being
             # used again. The Session itself does not actually have a distinct
             # “closed” state; it merely means the Session will release all database
@@ -234,15 +321,13 @@ async def submit_workflow(
             new_dataset_attributes = await process_workflow(
                 workflow=workflow,
                 dataset=dataset,
-                slurm_user=slurm_user,
-                slurm_account=job.slurm_account,
-                user_cache_dir=user_cache_dir,
                 workflow_dir_local=WORKFLOW_DIR_LOCAL,
                 workflow_dir_remote=WORKFLOW_DIR_REMOTE,
                 logger_name=logger_name,
                 worker_init=worker_init,
                 first_task_index=job.first_task_index,
                 last_task_index=job.last_task_index,
+                **backend_specific_kwargs,
             )
 
             logger.info(
@@ -291,18 +376,14 @@ async def submit_workflow(
             dataset.images = latest_images
             db_sync.merge(dataset)
 
-            job.status = JobStatusTypeV2.FAILED
-            job.end_timestamp = get_timestamp()
-
             exception_args_string = "\n".join(e.args)
-
+            log_msg = (
                 f"TASK ERROR: "
                 f"Task name: {e.task_name}, "
                 f"position in Workflow: {e.workflow_task_order}\n"
                 f"TRACEBACK:\n{exception_args_string}"
             )
-            db_sync
-            db_sync.commit()
+            fail_job(db=db_sync, job=job, log_msg=log_msg, logger_name=logger_name)
 
         except JobExecutionError as e:
 
@@ -325,12 +406,15 @@ async def submit_workflow(
             dataset.images = latest_images
             db_sync.merge(dataset)
 
-
-
-
-
-
-
+            fail_job(
+                db=db_sync,
+                job=job,
+                log_msg=(
+                    f"JOB ERROR in Fractal job {job.id}:\n"
+                    f"TRACEBACK:\n{e.assemble_error()}"
+                ),
+                logger_name=logger_name,
+            )
 
         except Exception:
 
@@ -354,15 +438,16 @@ async def submit_workflow(
             if latest_images is not None:
                 dataset.images = latest_images
             db_sync.merge(dataset)
-
-
-
-
-
-
+            fail_job(
+                db=db_sync,
+                job=job,
+                log_msg=(
+                    f"UNKNOWN ERROR in Fractal job {job.id}\n"
+                    f"TRACEBACK:\n{current_traceback}"
+                ),
+                logger_name=logger_name,
             )
-
-            db_sync.commit()
+
     finally:
         reset_logger_handlers(logger)
         db_sync.close()