fractal-server 1.4.9__py3-none-any.whl → 2.0.0a0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fractal_server/__init__.py +1 -1
- fractal_server/app/models/__init__.py +4 -7
- fractal_server/app/models/linkuserproject.py +9 -0
- fractal_server/app/models/security.py +6 -0
- fractal_server/app/models/state.py +1 -1
- fractal_server/app/models/v1/__init__.py +10 -0
- fractal_server/app/models/{dataset.py → v1/dataset.py} +5 -5
- fractal_server/app/models/{job.py → v1/job.py} +5 -5
- fractal_server/app/models/{project.py → v1/project.py} +5 -5
- fractal_server/app/models/{task.py → v1/task.py} +7 -2
- fractal_server/app/models/{workflow.py → v1/workflow.py} +5 -5
- fractal_server/app/models/v2/__init__.py +20 -0
- fractal_server/app/models/v2/dataset.py +55 -0
- fractal_server/app/models/v2/job.py +51 -0
- fractal_server/app/models/v2/project.py +31 -0
- fractal_server/app/models/v2/task.py +93 -0
- fractal_server/app/models/v2/workflow.py +43 -0
- fractal_server/app/models/v2/workflowtask.py +90 -0
- fractal_server/app/routes/{admin.py → admin/v1.py} +42 -42
- fractal_server/app/routes/admin/v2.py +275 -0
- fractal_server/app/routes/api/v1/__init__.py +7 -7
- fractal_server/app/routes/api/v1/_aux_functions.py +2 -2
- fractal_server/app/routes/api/v1/dataset.py +44 -37
- fractal_server/app/routes/api/v1/job.py +12 -12
- fractal_server/app/routes/api/v1/project.py +23 -21
- fractal_server/app/routes/api/v1/task.py +24 -14
- fractal_server/app/routes/api/v1/task_collection.py +16 -14
- fractal_server/app/routes/api/v1/workflow.py +24 -24
- fractal_server/app/routes/api/v1/workflowtask.py +10 -10
- fractal_server/app/routes/api/v2/__init__.py +28 -0
- fractal_server/app/routes/api/v2/_aux_functions.py +497 -0
- fractal_server/app/routes/api/v2/apply.py +220 -0
- fractal_server/app/routes/api/v2/dataset.py +310 -0
- fractal_server/app/routes/api/v2/images.py +212 -0
- fractal_server/app/routes/api/v2/job.py +200 -0
- fractal_server/app/routes/api/v2/project.py +205 -0
- fractal_server/app/routes/api/v2/task.py +222 -0
- fractal_server/app/routes/api/v2/task_collection.py +229 -0
- fractal_server/app/routes/api/v2/workflow.py +398 -0
- fractal_server/app/routes/api/v2/workflowtask.py +269 -0
- fractal_server/app/routes/aux/_job.py +1 -1
- fractal_server/app/runner/async_wrap.py +27 -0
- fractal_server/app/runner/exceptions.py +129 -0
- fractal_server/app/runner/executors/local/__init__.py +3 -0
- fractal_server/app/runner/{_local → executors/local}/executor.py +2 -2
- fractal_server/app/runner/executors/slurm/__init__.py +3 -0
- fractal_server/app/runner/{_slurm → executors/slurm}/_batching.py +1 -1
- fractal_server/app/runner/executors/slurm/_check_jobs_status.py +72 -0
- fractal_server/app/runner/{_slurm → executors/slurm}/_executor_wait_thread.py +3 -4
- fractal_server/app/runner/{_slurm → executors/slurm}/_slurm_config.py +3 -152
- fractal_server/app/runner/{_slurm → executors/slurm}/_subprocess_run_as_user.py +1 -1
- fractal_server/app/runner/{_slurm → executors/slurm}/executor.py +9 -9
- fractal_server/app/runner/filenames.py +6 -0
- fractal_server/app/runner/set_start_and_last_task_index.py +39 -0
- fractal_server/app/runner/task_files.py +105 -0
- fractal_server/app/runner/{__init__.py → v1/__init__.py} +36 -49
- fractal_server/app/runner/{_common.py → v1/_common.py} +13 -120
- fractal_server/app/runner/{_local → v1/_local}/__init__.py +6 -6
- fractal_server/app/runner/{_local → v1/_local}/_local_config.py +6 -7
- fractal_server/app/runner/{_local → v1/_local}/_submit_setup.py +1 -5
- fractal_server/app/runner/v1/_slurm/__init__.py +310 -0
- fractal_server/app/runner/{_slurm → v1/_slurm}/_submit_setup.py +3 -9
- fractal_server/app/runner/v1/_slurm/get_slurm_config.py +163 -0
- fractal_server/app/runner/v1/common.py +117 -0
- fractal_server/app/runner/{handle_failed_job.py → v1/handle_failed_job.py} +8 -8
- fractal_server/app/runner/v2/__init__.py +337 -0
- fractal_server/app/runner/v2/_local/__init__.py +169 -0
- fractal_server/app/runner/v2/_local/_local_config.py +118 -0
- fractal_server/app/runner/v2/_local/_submit_setup.py +52 -0
- fractal_server/app/runner/v2/_slurm/__init__.py +157 -0
- fractal_server/app/runner/v2/_slurm/_submit_setup.py +83 -0
- fractal_server/app/runner/v2/_slurm/get_slurm_config.py +179 -0
- fractal_server/app/runner/v2/components.py +5 -0
- fractal_server/app/runner/v2/deduplicate_list.py +24 -0
- fractal_server/app/runner/v2/handle_failed_job.py +156 -0
- fractal_server/app/runner/v2/merge_outputs.py +41 -0
- fractal_server/app/runner/v2/runner.py +264 -0
- fractal_server/app/runner/v2/runner_functions.py +339 -0
- fractal_server/app/runner/v2/runner_functions_low_level.py +134 -0
- fractal_server/app/runner/v2/task_interface.py +43 -0
- fractal_server/app/runner/v2/v1_compat.py +21 -0
- fractal_server/app/schemas/__init__.py +4 -42
- fractal_server/app/schemas/v1/__init__.py +42 -0
- fractal_server/app/schemas/{applyworkflow.py → v1/applyworkflow.py} +18 -18
- fractal_server/app/schemas/{dataset.py → v1/dataset.py} +30 -30
- fractal_server/app/schemas/{dumps.py → v1/dumps.py} +8 -8
- fractal_server/app/schemas/{manifest.py → v1/manifest.py} +5 -5
- fractal_server/app/schemas/{project.py → v1/project.py} +9 -9
- fractal_server/app/schemas/{task.py → v1/task.py} +12 -12
- fractal_server/app/schemas/{task_collection.py → v1/task_collection.py} +7 -7
- fractal_server/app/schemas/{workflow.py → v1/workflow.py} +38 -38
- fractal_server/app/schemas/v2/__init__.py +34 -0
- fractal_server/app/schemas/v2/dataset.py +88 -0
- fractal_server/app/schemas/v2/dumps.py +87 -0
- fractal_server/app/schemas/v2/job.py +113 -0
- fractal_server/app/schemas/v2/manifest.py +109 -0
- fractal_server/app/schemas/v2/project.py +36 -0
- fractal_server/app/schemas/v2/task.py +121 -0
- fractal_server/app/schemas/v2/task_collection.py +105 -0
- fractal_server/app/schemas/v2/workflow.py +78 -0
- fractal_server/app/schemas/v2/workflowtask.py +118 -0
- fractal_server/config.py +5 -10
- fractal_server/images/__init__.py +50 -0
- fractal_server/images/tools.py +86 -0
- fractal_server/main.py +11 -3
- fractal_server/migrations/versions/4b35c5cefbe3_tmp_is_v2_compatible.py +39 -0
- fractal_server/migrations/versions/56af171b0159_v2.py +217 -0
- fractal_server/migrations/versions/876f28db9d4e_tmp_split_task_and_wftask_meta.py +68 -0
- fractal_server/migrations/versions/974c802f0dd0_tmp_workflowtaskv2_type_in_db.py +37 -0
- fractal_server/migrations/versions/9cd305cd6023_tmp_workflowtaskv2.py +40 -0
- fractal_server/migrations/versions/a6231ed6273c_tmp_args_schemas_in_taskv2.py +42 -0
- fractal_server/migrations/versions/b9e9eed9d442_tmp_taskv2_type.py +37 -0
- fractal_server/migrations/versions/e3e639454d4b_tmp_make_task_meta_non_optional.py +50 -0
- fractal_server/tasks/__init__.py +0 -5
- fractal_server/tasks/endpoint_operations.py +13 -19
- fractal_server/tasks/utils.py +35 -0
- fractal_server/tasks/{_TaskCollectPip.py → v1/_TaskCollectPip.py} +3 -3
- fractal_server/tasks/{background_operations.py → v1/background_operations.py} +18 -50
- fractal_server/tasks/v1/get_collection_data.py +14 -0
- fractal_server/tasks/v2/_TaskCollectPip.py +103 -0
- fractal_server/tasks/v2/background_operations.py +382 -0
- fractal_server/tasks/v2/get_collection_data.py +14 -0
- {fractal_server-1.4.9.dist-info → fractal_server-2.0.0a0.dist-info}/METADATA +3 -4
- fractal_server-2.0.0a0.dist-info/RECORD +166 -0
- fractal_server/app/runner/_slurm/.gitignore +0 -2
- fractal_server/app/runner/_slurm/__init__.py +0 -150
- fractal_server/app/runner/common.py +0 -311
- fractal_server-1.4.9.dist-info/RECORD +0 -97
- /fractal_server/app/runner/{_slurm → executors/slurm}/remote.py +0 -0
- {fractal_server-1.4.9.dist-info → fractal_server-2.0.0a0.dist-info}/LICENSE +0 -0
- {fractal_server-1.4.9.dist-info → fractal_server-2.0.0a0.dist-info}/WHEEL +0 -0
- {fractal_server-1.4.9.dist-info → fractal_server-2.0.0a0.dist-info}/entry_points.txt +0 -0
`fractal_server/app/runner/v1/_slurm/get_slurm_config.py` (new file, +163):

```diff
@@ -0,0 +1,163 @@
+from pathlib import Path
+from typing import Optional
+
+from fractal_server.app.models.v1 import WorkflowTask
+from fractal_server.app.runner.executors.slurm._slurm_config import (
+    _parse_mem_value,
+)
+from fractal_server.app.runner.executors.slurm._slurm_config import (
+    load_slurm_config_file,
+)
+from fractal_server.app.runner.executors.slurm._slurm_config import logger
+from fractal_server.app.runner.executors.slurm._slurm_config import SlurmConfig
+from fractal_server.app.runner.executors.slurm._slurm_config import (
+    SlurmConfigError,
+)
+
+
+def get_slurm_config(
+    wftask: WorkflowTask,
+    workflow_dir: Path,
+    workflow_dir_user: Path,
+    config_path: Optional[Path] = None,
+) -> SlurmConfig:
+    """
+    Prepare a `SlurmConfig` configuration object
+
+    The sources for `SlurmConfig` attributes, in increasing priority order,
+    are:
+
+    1. The general content of the Fractal SLURM configuration file.
+    2. The GPU-specific content of the Fractal SLURM configuration file, if
+       appropriate.
+    3. Properties in `wftask.meta` (which, for `WorkflowTask`s added through
+       `Workflow.insert_task`, also includes `wftask.task.meta`).
+
+    Note: `wftask.meta` may be `None`.
+
+    Arguments:
+        wftask:
+            WorkflowTask for which the SLURM configuration is to be
+            prepared.
+        workflow_dir:
+            Server-owned directory to store all task-execution-related
+            files (inputs, outputs, errors, and all meta files related to the
+            job execution). Note: users cannot write directly to this folder.
+        workflow_dir_user:
+            User-side directory with the same scope as `workflow_dir`, and
+            where a user can write.
+        config_path:
+            Path of a Fractal SLURM configuration file; if `None`, use the
+            `FRACTAL_SLURM_CONFIG_FILE` variable from settings.
+
+    Returns:
+        slurm_config:
+            The SlurmConfig object
+    """
+
+    logger.debug(
+        f"[get_slurm_config] WorkflowTask meta attribute: {wftask.meta=}"
+    )
+
+    # Incorporate slurm_env.default_slurm_config
+    slurm_env = load_slurm_config_file(config_path=config_path)
+    slurm_dict = slurm_env.default_slurm_config.dict(
+        exclude_unset=True, exclude={"mem"}
+    )
+    if slurm_env.default_slurm_config.mem:
+        slurm_dict["mem_per_task_MB"] = slurm_env.default_slurm_config.mem
+
+    # Incorporate slurm_env.batching_config
+    for key, value in slurm_env.batching_config.dict().items():
+        slurm_dict[key] = value
+
+    # Incorporate slurm_env.user_local_exports
+    slurm_dict["user_local_exports"] = slurm_env.user_local_exports
+
+    logger.debug(
+        "[get_slurm_config] Fractal SLURM configuration file: "
+        f"{slurm_env.dict()=}"
+    )
+
+    # GPU-related options
+    # Notes about priority:
+    # 1. This block of definitions takes priority over other definitions from
+    #    slurm_env which are not under the `needs_gpu` subgroup
+    # 2. This block of definitions has lower priority than whatever comes next
+    #    (i.e. from WorkflowTask.meta).
+    if wftask.meta is not None:
+        needs_gpu = wftask.meta.get("needs_gpu", False)
+    else:
+        needs_gpu = False
+    logger.debug(f"[get_slurm_config] {needs_gpu=}")
+    if needs_gpu:
+        for key, value in slurm_env.gpu_slurm_config.dict(
+            exclude_unset=True, exclude={"mem"}
+        ).items():
+            slurm_dict[key] = value
+        if slurm_env.gpu_slurm_config.mem:
+            slurm_dict["mem_per_task_MB"] = slurm_env.gpu_slurm_config.mem
+
+    # Number of CPUs per task, for multithreading
+    if wftask.meta is not None and "cpus_per_task" in wftask.meta:
+        cpus_per_task = int(wftask.meta["cpus_per_task"])
+        slurm_dict["cpus_per_task"] = cpus_per_task
+
+    # Required memory per task, in MB
+    if wftask.meta is not None and "mem" in wftask.meta:
+        raw_mem = wftask.meta["mem"]
+        mem_per_task_MB = _parse_mem_value(raw_mem)
+        slurm_dict["mem_per_task_MB"] = mem_per_task_MB
+
+    # Job name
+    job_name = wftask.task.name.replace(" ", "_")
+    slurm_dict["job_name"] = job_name
+
+    # Optional SLURM arguments and extra lines
+    if wftask.meta is not None:
+        account = wftask.meta.get("account", None)
+        if account is not None:
+            error_msg = (
+                f"Invalid {account=} property in WorkflowTask `meta` "
+                "attribute.\n"
+                "SLURM account must be set in the request body of the "
+                "apply-workflow endpoint, or by modifying the user properties."
+            )
+            logger.error(error_msg)
+            raise SlurmConfigError(error_msg)
+        for key in ["time", "gres", "constraint"]:
+            value = wftask.meta.get(key, None)
+            if value:
+                slurm_dict[key] = value
+    if wftask.meta is not None:
+        extra_lines = wftask.meta.get("extra_lines", [])
+    else:
+        extra_lines = []
+    extra_lines = slurm_dict.get("extra_lines", []) + extra_lines
+    if len(set(extra_lines)) != len(extra_lines):
+        logger.debug(
+            "[get_slurm_config] Removing repeated elements "
+            f"from {extra_lines=}."
+        )
+        extra_lines = list(set(extra_lines))
+    slurm_dict["extra_lines"] = extra_lines
+
+    # Job-batching parameters (if None, they will be determined heuristically)
+    if wftask.meta is not None:
+        tasks_per_job = wftask.meta.get("tasks_per_job", None)
+        parallel_tasks_per_job = wftask.meta.get(
+            "parallel_tasks_per_job", None
+        )
+    else:
+        tasks_per_job = None
+        parallel_tasks_per_job = None
+    slurm_dict["tasks_per_job"] = tasks_per_job
+    slurm_dict["parallel_tasks_per_job"] = parallel_tasks_per_job
+
+    # Put everything together
+    logger.debug(
+        "[get_slurm_config] Now create a SlurmConfig object based "
+        f"on {slurm_dict=}"
+    )
+    slurm_config = SlurmConfig(**slurm_dict)
+
+    return slurm_config
```
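The three-level priority merge performed by `get_slurm_config` can be summarized with a short sketch. This is an illustration only, using plain dicts in place of the real `SlurmConfig` and `load_slurm_config_file` objects; the keys and values are hypothetical.

```python
# Minimal sketch of the three-level priority merge in get_slurm_config
# (illustrative dicts only; keys and values are hypothetical).
config_file_defaults = {"partition": "main", "cpus_per_task": 1}
gpu_overrides = {"partition": "gpu", "gres": "gpu:1"}
wftask_meta = {"needs_gpu": True, "cpus_per_task": 8}

slurm_dict = dict(config_file_defaults)  # 1. general config-file content
if wftask_meta.get("needs_gpu", False):
    slurm_dict.update(gpu_overrides)     # 2. GPU-specific overrides
if "cpus_per_task" in wftask_meta:       # 3. wftask.meta has highest priority
    slurm_dict["cpus_per_task"] = int(wftask_meta["cpus_per_task"])

print(slurm_dict)
# {'partition': 'gpu', 'cpus_per_task': 8, 'gres': 'gpu:1'}
```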
`fractal_server/app/runner/v1/common.py` (new file, +117):

```diff
@@ -0,0 +1,117 @@
+"""
+Common utilities and routines for runner backends (public API)
+
+This module includes utilities and routines that are of use to implement
+runner backends but that should also be exposed to the other components of
+`Fractal Server`.
+"""
+import json
+from json import JSONEncoder
+from pathlib import Path
+from typing import Any
+
+from pydantic import BaseModel
+
+from ....logger import close_logger as close_job_logger  # noqa F401
+from ...models.v1 import Dataset
+from ...models.v1 import Workflow
+
+
+class TaskParameterEncoder(JSONEncoder):
+    """
+    Convenience JSONEncoder that serialises `Path`s as strings
+    """
+
+    def default(self, value):
+        if isinstance(value, Path):
+            return value.as_posix()
+        return JSONEncoder.default(self, value)
+
+
+class TaskParameters(BaseModel):
+    """
+    Wrapper for task input parameters
+
+    Instances of this class are used to pass parameters from the output of a
+    task to the input of the next one.
+
+    Attributes:
+        input_paths:
+            Input paths as derived from the input dataset.
+        output_path:
+            Output path as derived from the output dataset.
+        metadata:
+            Dataset metadata, as found in the input dataset or as updated by
+            the previous task.
+        history:
+            Dataset history, as found in the input dataset or as updated by
+            the previous task.
+    """
+
+    input_paths: list[Path]
+    output_path: Path
+    metadata: dict[str, Any]
+    history: list[dict[str, Any]]
+
+    class Config:
+        arbitrary_types_allowed = True
+        extra = "forbid"
+
+
+def validate_workflow_compatibility(
+    *,
+    input_dataset: Dataset,
+    workflow: Workflow,
+    output_dataset: Dataset,
+    first_task_index: int,
+    last_task_index: int,
+) -> None:
+    """
+    Check compatibility of workflow and input / output dataset
+    """
+    # Check input_dataset type
+    workflow_input_type = workflow.task_list[first_task_index].task.input_type
+    if (
+        workflow_input_type != "Any"
+        and workflow_input_type != input_dataset.type
+    ):
+        raise TypeError(
+            f"Incompatible types `{workflow_input_type}` of workflow "
+            f"`{workflow.name}` and `{input_dataset.type}` of dataset "
+            f"`{input_dataset.name}`"
+        )
+
+    # Check output_dataset type
+    workflow_output_type = workflow.task_list[last_task_index].task.output_type
+    if (
+        workflow_output_type != "Any"
+        and workflow_output_type != output_dataset.type
+    ):
+        raise TypeError(
+            f"Incompatible types `{workflow_output_type}` of workflow "
+            f"`{workflow.name}` and `{output_dataset.type}` of dataset "
+            f"`{output_dataset.name}`"
+        )
+
+
+def write_args_file(
+    *args: dict[str, Any],
+    path: Path,
+):
+    """
+    Merge arbitrary dictionaries and write to file
+
+    Args:
+        *args:
+            One or more dictionaries that will be merged into one, respecting
+            the order in which they are passed, i.e. the last one overrides
+            previous ones.
+        path:
+            Destination for the serialised file.
+    """
+    out = {}
+    for d in args:
+        out.update(d)
+
+    with open(path, "w") as f:
+        json.dump(out, f, cls=TaskParameterEncoder, indent=4)
```
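As a quick illustration of the two utilities above, the following standalone snippet reproduces the last-in-wins merge of `write_args_file` and the `Path` serialisation of `TaskParameterEncoder`; the dictionary keys are hypothetical.

```python
import json
from json import JSONEncoder
from pathlib import Path


class TaskParameterEncoder(JSONEncoder):
    """Serialise `Path` objects as POSIX strings, as in the module above."""

    def default(self, value):
        if isinstance(value, Path):
            return value.as_posix()
        return JSONEncoder.default(self, value)


# Last-in-wins merge, as in write_args_file
out = {}
for d in ({"zarr_path": Path("/data/plate.zarr"), "level": 0}, {"level": 2}):
    out.update(d)

print(json.dumps(out, cls=TaskParameterEncoder, indent=4))
# {
#     "zarr_path": "/data/plate.zarr",
#     "level": 2
# }
```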
`fractal_server/app/runner/v1/handle_failed_job.py` (moved from `fractal_server/app/runner/handle_failed_job.py`, +8 -8; the removed import targets were truncated in the rendered diff):

```diff
@@ -19,13 +19,13 @@ from pathlib import Path
 from typing import Any
 from typing import Optional
 
-from
-from
-from
-from
-from
-from
-from
+from ...models.v1 import ApplyWorkflow
+from ...models.v1 import Dataset
+from ...models.v1 import Workflow
+from ...models.v1 import WorkflowTask
+from ...schemas.v1 import WorkflowTaskStatusTypeV1
+from ..filenames import HISTORY_FILENAME
+from ..filenames import METADATA_FILENAME
 
 
 def assemble_history_failed_job(
@@ -98,7 +98,7 @@ def assemble_history_failed_job(
     failed_wftask_dump["task"] = failed_wftask.task.model_dump()
     new_history_item = dict(
         workflowtask=failed_wftask_dump,
-        status=
+        status=WorkflowTaskStatusTypeV1.FAILED,
         parallelization=dict(
             parallelization_level=failed_wftask.parallelization_level,
         ),
```
`fractal_server/app/runner/v2/__init__.py` (new file, +337):

```diff
@@ -0,0 +1,337 @@
+"""
+Runner backend subsystem root V2
+
+This module is the single entry point to the runner backend subsystem V2.
+Other subsystems should only import this module and not its submodules or
+the individual backends.
+"""
+import os
+import traceback
+from pathlib import Path
+from typing import Optional
+
+from sqlalchemy.orm.attributes import flag_modified
+
+from ....config import get_settings
+from ....logger import close_logger
+from ....logger import set_logger
+from ....syringe import Inject
+from ....utils import get_timestamp
+from ...db import DB
+from ...models.v2 import DatasetV2
+from ...models.v2 import JobV2
+from ...models.v2 import WorkflowTaskV2
+from ...models.v2 import WorkflowV2
+from ...schemas.v2 import JobStatusTypeV2
+from ..exceptions import JobExecutionError
+from ..exceptions import TaskExecutionError
+from ..filenames import WORKFLOW_LOG_FILENAME
+from ._local import process_workflow as local_process_workflow
+from ._slurm import process_workflow as slurm_process_workflow
+from .handle_failed_job import assemble_filters_failed_job
+from .handle_failed_job import assemble_history_failed_job
+from .handle_failed_job import assemble_images_failed_job
+from .runner import execute_tasks_v2  # noqa
+from fractal_server import __VERSION__
+
+_backends = {}
+_backends["local"] = local_process_workflow
+_backends["slurm"] = slurm_process_workflow
+
+
+async def submit_workflow(
+    *,
+    workflow_id: int,
+    dataset_id: int,
+    job_id: int,
+    worker_init: Optional[str] = None,
+    slurm_user: Optional[str] = None,
+    user_cache_dir: Optional[str] = None,
+) -> None:
+    """
+    Prepares a workflow and applies it to a dataset
+
+    This function wraps the `process_workflow` function, which is different
+    for each backend (e.g. local or slurm backend).
+
+    Args:
+        workflow_id:
+            ID of the workflow being applied
+        dataset_id:
+            Dataset ID
+        job_id:
+            ID of the job record which stores the state for the current
+            workflow application.
+        worker_init:
+            Custom executor parameters that get parsed before the execution of
+            each task.
+        user_cache_dir:
+            Cache directory (namely a path where the user can write); for the
+            slurm backend, this is used as a base directory for
+            `job.working_dir_user`.
+        slurm_user:
+            The username to impersonate for the workflow execution, for the
+            slurm backend.
+    """
+
+    # Declare runner backend and set `process_workflow` function
+    settings = Inject(get_settings)
+    FRACTAL_RUNNER_BACKEND = settings.FRACTAL_RUNNER_BACKEND
+    if FRACTAL_RUNNER_BACKEND == "local":
+        process_workflow = local_process_workflow
+    elif FRACTAL_RUNNER_BACKEND == "slurm":
+        process_workflow = slurm_process_workflow
+    else:
+        raise RuntimeError(f"Invalid runner backend {FRACTAL_RUNNER_BACKEND=}")
+
+    with next(DB.get_sync_db()) as db_sync:
+
+        job: JobV2 = db_sync.get(JobV2, job_id)
+        if not job:
+            raise ValueError(f"Cannot fetch job {job_id} from database")
+
+        dataset: DatasetV2 = db_sync.get(DatasetV2, dataset_id)
+        workflow: WorkflowV2 = db_sync.get(WorkflowV2, workflow_id)
+        if not (dataset and workflow):
+            log_msg = ""
+            if not dataset:
+                log_msg += f"Cannot fetch dataset {dataset_id} from database\n"
+            if not workflow:
+                log_msg += (
+                    f"Cannot fetch workflow {workflow_id} from database\n"
+                )
+            job.status = JobStatusTypeV2.FAILED
+            job.end_timestamp = get_timestamp()
+            job.log = log_msg
+            db_sync.merge(job)
+            db_sync.commit()
+            db_sync.close()
+            return
+
+        # Define and create server-side working folder
+        project_id = workflow.project_id
+        timestamp_string = get_timestamp().strftime("%Y%m%d_%H%M%S")
+        WORKFLOW_DIR = (
+            settings.FRACTAL_RUNNER_WORKING_BASE_DIR
+            / (
+                f"proj_{project_id:07d}_wf_{workflow_id:07d}_job_{job_id:07d}"
+                f"_{timestamp_string}"
+            )
+        ).resolve()
+
+        if WORKFLOW_DIR.exists():
+            raise RuntimeError(f"Workflow dir {WORKFLOW_DIR} already exists.")
+
+        # Create WORKFLOW_DIR with 755 permissions
+        original_umask = os.umask(0)
+        WORKFLOW_DIR.mkdir(parents=True, mode=0o755)
+        os.umask(original_umask)
+
+        # Define and create user-side working folder, if needed
+        if FRACTAL_RUNNER_BACKEND == "local":
+            WORKFLOW_DIR_USER = WORKFLOW_DIR
+        elif FRACTAL_RUNNER_BACKEND == "slurm":
+
+            from ..executors.slurm._subprocess_run_as_user import (
+                _mkdir_as_user,
+            )
+
+            WORKFLOW_DIR_USER = (
+                Path(user_cache_dir) / f"{WORKFLOW_DIR.name}"
+            ).resolve()
+            _mkdir_as_user(folder=str(WORKFLOW_DIR_USER), user=slurm_user)
+        else:
+            raise ValueError(f"{FRACTAL_RUNNER_BACKEND=} not supported")
+
+        # Update db
+        job.working_dir = WORKFLOW_DIR.as_posix()
+        job.working_dir_user = WORKFLOW_DIR_USER.as_posix()
+        db_sync.merge(job)
+        db_sync.commit()
+
+        # After Session.commit() is called, either explicitly or when using a
+        # context manager, all objects associated with the Session are expired.
+        # https://docs.sqlalchemy.org/en/14/orm/
+        # session_basics.html#opening-and-closing-a-session
+        # https://docs.sqlalchemy.org/en/14/orm/
+        # session_state_management.html#refreshing-expiring
+
+        # See issue #928:
+        # https://github.com/fractal-analytics-platform/
+        # fractal-server/issues/928
+
+        db_sync.refresh(dataset)
+        db_sync.refresh(workflow)
+
+    # Write logs
+    logger_name = f"WF{workflow_id}_job{job_id}"
+    log_file_path = WORKFLOW_DIR / WORKFLOW_LOG_FILENAME
+    logger = set_logger(
+        logger_name=logger_name,
+        log_file_path=log_file_path,
+    )
+    logger.info(
+        f'Start execution of workflow "{workflow.name}"; '
+        f"more logs at {str(log_file_path)}"
+    )
+    logger.debug(f"fractal_server.__VERSION__: {__VERSION__}")
+    logger.debug(f"FRACTAL_RUNNER_BACKEND: {FRACTAL_RUNNER_BACKEND}")
+    logger.debug(f"slurm_user: {slurm_user}")
+    logger.debug(f"slurm_account: {job.slurm_account}")
+    logger.debug(f"worker_init: {worker_init}")
+    logger.debug(f"job.id: {job.id}")
+    logger.debug(f"job.working_dir: {job.working_dir}")
+    logger.debug(f"job.working_dir_user: {job.working_dir_user}")
+    logger.debug(f"job.first_task_index: {job.first_task_index}")
+    logger.debug(f"job.last_task_index: {job.last_task_index}")
+    logger.debug(f'START workflow "{workflow.name}"')
+
+    try:
+        # "The Session.close() method does not prevent the Session from being
+        # used again. The Session itself does not actually have a distinct
+        # “closed” state; it merely means the Session will release all database
+        # connections and ORM objects."
+        # (https://docs.sqlalchemy.org/en/20/orm/session_api.html#sqlalchemy.orm.Session.close).
+        #
+        # We close the session before the (possibly long) process_workflow
+        # call, to make sure all DB connections are released. The reason why we
+        # are not using a context manager within the try block is that we also
+        # need access to db_sync in the except branches.
+        db_sync = next(DB.get_sync_db())
+        db_sync.close()
+
+        new_dataset_attributes = await process_workflow(
+            workflow=workflow,
+            dataset=dataset,
+            slurm_user=slurm_user,
+            slurm_account=job.slurm_account,
+            user_cache_dir=user_cache_dir,
+            workflow_dir=WORKFLOW_DIR,
+            workflow_dir_user=WORKFLOW_DIR_USER,
+            logger_name=logger_name,
+            worker_init=worker_init,
+            first_task_index=job.first_task_index,
+            last_task_index=job.last_task_index,
+        )
+
+        logger.info(
+            f'End execution of workflow "{workflow.name}"; '
+            f"more logs at {str(log_file_path)}"
+        )
+        logger.debug(f'END workflow "{workflow.name}"')
+
+        # Update dataset attributes, in case of successful execution
+        dataset.history.extend(new_dataset_attributes["history"])
+        dataset.filters = new_dataset_attributes["filters"]
+        dataset.images = new_dataset_attributes["images"]
+        for attribute_name in ["filters", "history", "images"]:
+            flag_modified(dataset, attribute_name)
+        db_sync.merge(dataset)
+
+        # Update job DB entry
+        job.status = JobStatusTypeV2.DONE
+        job.end_timestamp = get_timestamp()
+        with log_file_path.open("r") as f:
+            logs = f.read()
+        job.log = logs
+        db_sync.merge(job)
+        db_sync.commit()
+
+    except TaskExecutionError as e:
+
+        logger.debug(f'FAILED workflow "{workflow.name}", TaskExecutionError.')
+        logger.info(f'Workflow "{workflow.name}" failed (TaskExecutionError).')
+
+        # Read dataset attributes produced by the last successful task, and
+        # update the DB dataset accordingly
+        failed_wftask = db_sync.get(WorkflowTaskV2, e.workflow_task_id)
+        dataset.history = assemble_history_failed_job(
+            job,
+            dataset,
+            workflow,
+            logger,
+            failed_wftask=failed_wftask,
+        )
+        latest_filters = assemble_filters_failed_job(job)
+        if latest_filters is not None:
+            dataset.filters = latest_filters
+        latest_images = assemble_images_failed_job(job)
+        if latest_images is not None:
+            dataset.images = latest_images
+        db_sync.merge(dataset)
+
+        job.status = JobStatusTypeV2.FAILED
+        job.end_timestamp = get_timestamp()
+
+        exception_args_string = "\n".join(e.args)
+        job.log = (
+            f"TASK ERROR: "
+            f"Task name: {e.task_name}, "
+            f"position in Workflow: {e.workflow_task_order}\n"
+            f"TRACEBACK:\n{exception_args_string}"
+        )
+        db_sync.merge(job)
+        db_sync.commit()
+
+    except JobExecutionError as e:
+
+        logger.debug(f'FAILED workflow "{workflow.name}", JobExecutionError.')
+        logger.info(f'Workflow "{workflow.name}" failed (JobExecutionError).')
+
+        # Read dataset attributes produced by the last successful task, and
+        # update the DB dataset accordingly
+        dataset.history = assemble_history_failed_job(
+            job,
+            dataset,
+            workflow,
+            logger,
+        )
+        latest_filters = assemble_filters_failed_job(job)
+        if latest_filters is not None:
+            dataset.filters = latest_filters
+        latest_images = assemble_images_failed_job(job)
+        if latest_images is not None:
+            dataset.images = latest_images
+        db_sync.merge(dataset)
+
+        job.status = JobStatusTypeV2.FAILED
+        job.end_timestamp = get_timestamp()
+        error = e.assemble_error()
+        job.log = f"JOB ERROR in Fractal job {job.id}:\nTRACEBACK:\n{error}"
+        db_sync.merge(job)
+        db_sync.commit()
+
+    except Exception:
+
+        logger.debug(f'FAILED workflow "{workflow.name}", unknown error.')
+        logger.info(f'Workflow "{workflow.name}" failed (unknown error).')
+
+        current_traceback = traceback.format_exc()
+
+        # Read dataset attributes produced by the last successful task, and
+        # update the DB dataset accordingly
+        dataset.history = assemble_history_failed_job(
+            job,
+            dataset,
+            workflow,
+            logger,
+        )
+        latest_filters = assemble_filters_failed_job(job)
+        if latest_filters is not None:
+            dataset.filters = latest_filters
+        latest_images = assemble_images_failed_job(job)
+        if latest_images is not None:
+            dataset.images = latest_images
+        db_sync.merge(dataset)
+
+        job.status = JobStatusTypeV2.FAILED
+        job.end_timestamp = get_timestamp()
+        job.log = (
+            f"UNKNOWN ERROR in Fractal job {job.id}\n"
+            f"TRACEBACK:\n{current_traceback}"
+        )
+        db_sync.merge(job)
+        db_sync.commit()
+    finally:
+        close_logger(logger)
+        db_sync.close()
```
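The `_backends` mapping and the `if`/`elif` chain above implement a simple settings-driven dispatch. The following is a minimal standalone sketch of that pattern, with stub callables standing in for the real local/slurm `process_workflow` implementations and a plain string in place of `settings.FRACTAL_RUNNER_BACKEND`.

```python
from typing import Any, Callable


def local_process_workflow(**kwargs: Any) -> dict:
    # Stub standing in for the local backend's process_workflow
    return {"backend": "local", **kwargs}


def slurm_process_workflow(**kwargs: Any) -> dict:
    # Stub standing in for the slurm backend's process_workflow
    return {"backend": "slurm", **kwargs}


_backends: dict[str, Callable[..., dict]] = {
    "local": local_process_workflow,
    "slurm": slurm_process_workflow,
}


def select_backend(name: str) -> Callable[..., dict]:
    # Fail early on unknown backends, as submit_workflow does
    try:
        return _backends[name]
    except KeyError:
        raise RuntimeError(f"Invalid runner backend {name!r}") from None


print(select_backend("local")(workflow_id=1, dataset_id=2))
# {'backend': 'local', 'workflow_id': 1, 'dataset_id': 2}
```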