fractal-server 2.14.0a10__py3-none-any.whl → 2.14.0a12__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fractal_server/__init__.py +1 -1
- fractal_server/app/routes/api/v2/submit.py +1 -1
- fractal_server/app/runner/components.py +0 -3
- fractal_server/app/runner/exceptions.py +4 -0
- fractal_server/app/runner/executors/base_runner.py +38 -17
- fractal_server/app/runner/executors/local/{_local_config.py → get_local_config.py} +0 -7
- fractal_server/app/runner/executors/local/runner.py +109 -59
- fractal_server/app/runner/executors/slurm_common/_check_jobs_status.py +4 -0
- fractal_server/app/runner/executors/slurm_ssh/executor.py +7 -5
- fractal_server/app/runner/executors/slurm_ssh/runner.py +6 -10
- fractal_server/app/runner/executors/slurm_sudo/runner.py +196 -99
- fractal_server/app/runner/task_files.py +8 -0
- fractal_server/app/runner/v2/__init__.py +0 -366
- fractal_server/app/runner/v2/_local.py +2 -2
- fractal_server/app/runner/v2/_slurm_ssh.py +2 -2
- fractal_server/app/runner/v2/_slurm_sudo.py +2 -2
- fractal_server/app/runner/v2/db_tools.py +87 -0
- fractal_server/app/runner/v2/runner.py +77 -81
- fractal_server/app/runner/v2/runner_functions.py +274 -436
- fractal_server/app/runner/v2/runner_functions_low_level.py +37 -39
- fractal_server/app/runner/v2/submit_workflow.py +366 -0
- fractal_server/app/runner/v2/task_interface.py +31 -0
- {fractal_server-2.14.0a10.dist-info → fractal_server-2.14.0a12.dist-info}/METADATA +1 -1
- {fractal_server-2.14.0a10.dist-info → fractal_server-2.14.0a12.dist-info}/RECORD +27 -28
- fractal_server/app/runner/executors/local/_submit_setup.py +0 -46
- fractal_server/app/runner/executors/slurm_common/_submit_setup.py +0 -84
- fractal_server/app/runner/v2/_db_tools.py +0 -48
- {fractal_server-2.14.0a10.dist-info → fractal_server-2.14.0a12.dist-info}/LICENSE +0 -0
- {fractal_server-2.14.0a10.dist-info → fractal_server-2.14.0a12.dist-info}/WHEEL +0 -0
- {fractal_server-2.14.0a10.dist-info → fractal_server-2.14.0a12.dist-info}/entry_points.txt +0 -0
fractal_server/__init__.py
CHANGED
@@ -1 +1 @@
-__VERSION__ = "2.14.0a10"
+__VERSION__ = "2.14.0a12"

fractal_server/app/routes/api/v2/submit.py
CHANGED
@@ -30,7 +30,7 @@ from fractal_server.app.routes.aux.validate_user_settings import (
 from fractal_server.app.runner.set_start_and_last_task_index import (
     set_start_and_last_task_index,
 )
-from fractal_server.app.runner.v2 import submit_workflow
+from fractal_server.app.runner.v2.submit_workflow import submit_workflow
 from fractal_server.app.schemas.v2 import JobCreateV2
 from fractal_server.app.schemas.v2 import JobReadV2
 from fractal_server.app.schemas.v2 import JobStatusTypeV2
|
@@ -37,6 +37,10 @@ class TaskExecutionError(RuntimeError):
|
|
37
37
|
self.task_name = task_name
|
38
38
|
|
39
39
|
|
40
|
+
class TaskOutputValidationError(ValueError):
|
41
|
+
pass
|
42
|
+
|
43
|
+
|
40
44
|
class JobExecutionError(RuntimeError):
|
41
45
|
"""
|
42
46
|
Forwards errors in the execution of a task that are due to external factors
|

fractal_server/app/runner/executors/base_runner.py
CHANGED
@@ -1,6 +1,6 @@
 from typing import Any

-from fractal_server.app.runner.components import _COMPONENT_KEY_
+from fractal_server.app.runner.task_files import TaskFiles
 from fractal_server.app.schemas.v2.task import TaskTypeType


@@ -29,9 +29,10 @@ class BaseRunner(object):
         self,
         func: callable,
         parameters: dict[str, Any],
-        …
+        history_unit_id: int,
+        task_files: TaskFiles,
         task_type: TaskTypeType,
-        …
+        config: Any,
     ) -> tuple[Any, BaseException]:
         """
         Run a single fractal task.
@@ -45,7 +46,7 @@
            history_item_id:
                Database ID of the corresponding `HistoryItemV2` entry.
            task_type: Task type.
-           …
+           config: Runner-specific parameters.
        """
        raise NotImplementedError()

@@ -53,9 +54,10 @@
         self,
         func: callable,
         list_parameters: list[dict[str, Any]],
-        …
+        history_unit_ids: list[int],
+        list_task_files: list[TaskFiles],
         task_type: TaskTypeType,
-        …
+        config: Any,
     ) -> tuple[dict[int, Any], dict[int, BaseException]]:
         """
         Run a parallel fractal task.
@@ -70,7 +72,7 @@
            history_item_id:
                Database ID of the corresponding `HistoryItemV2` entry.
            task_type: Task type.
-           …
+           config: Runner-specific parameters.
        """
        raise NotImplementedError()

@@ -101,15 +103,11 @@
                 f"Forbidden 'zarr_urls' key in {list(parameters.keys())}"
             )

-        if _COMPONENT_KEY_ not in parameters.keys():
-            raise ValueError(
-                f"No '{_COMPONENT_KEY_}' key in in {list(parameters.keys())}"
-            )
-
     def validate_multisubmit_parameters(
         self,
         list_parameters: list[dict[str, Any]],
         task_type: TaskTypeType,
+        list_task_files: list[TaskFiles],
     ) -> None:
         """
         Validate parameters for `multi_submit` method
@@ -121,6 +119,12 @@
         if task_type not in TASK_TYPES_MULTISUBMIT:
             raise ValueError(f"Invalid {task_type=} for `multisubmit`.")

+        subfolders = set(
+            task_file.wftask_subfolder_local for task_file in list_task_files
+        )
+        if len(subfolders) != 1:
+            raise ValueError(f"More than one subfolders: {subfolders}.")
+
         if not isinstance(list_parameters, list):
             raise ValueError("`parameters` must be a list.")

@@ -131,12 +135,29 @@
                raise ValueError(
                    f"No 'zarr_url' key in in {list(single_kwargs.keys())}"
                )
-            if _COMPONENT_KEY_ not in single_kwargs.keys():
-                raise ValueError(
-                    f"No '{_COMPONENT_KEY_}' key "
-                    f"in {list(single_kwargs.keys())}"
-                )
         if task_type == "parallel":
             zarr_urls = [kwargs["zarr_url"] for kwargs in list_parameters]
             if len(zarr_urls) != len(set(zarr_urls)):
                 raise ValueError("Non-unique zarr_urls")
+
+    def validate_multisubmit_history_unit_ids(
+        self,
+        *,
+        history_unit_ids: list[int],
+        task_type: TaskTypeType,
+        list_parameters: list[dict[str, Any]],
+    ) -> None:
+        if task_type in ["compound", "converter_compound"]:
+            if len(history_unit_ids) != 1:
+                raise NotImplementedError(
+                    "We are breaking the assumption that compound/multisubmit "
+                    "is associated to a single HistoryUnit. This is not "
+                    "supported."
+                )
+        elif task_type == "parallel" and len(history_unit_ids) != len(
+            list_parameters
+        ):
+            raise ValueError(
+                f"{len(history_unit_ids)=} differs from "
+                f"{len(list_parameters)=}."
+            )
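
Taken together, the two new validators pin down the bookkeeping contract of `multisubmit`: all `TaskFiles` must share one working subfolder, compound-type tasks map to exactly one `HistoryUnit`, and parallel tasks need one `HistoryUnit` per parameter set. A standalone sketch of that last invariant, mirroring `validate_multisubmit_history_unit_ids` above but detached from `BaseRunner` for illustration:

```python
from typing import Any


def check_history_unit_ids(
    *,
    history_unit_ids: list[int],
    task_type: str,
    list_parameters: list[dict[str, Any]],
) -> None:
    # Compound tasks share a single HistoryUnit across all parameter sets.
    if task_type in ["compound", "converter_compound"]:
        if len(history_unit_ids) != 1:
            raise NotImplementedError("Compound multisubmit needs one HistoryUnit.")
    # Parallel tasks pair each parameter set with its own HistoryUnit.
    elif task_type == "parallel" and len(history_unit_ids) != len(list_parameters):
        raise ValueError("One HistoryUnit per parameter set is required.")


# Passes: parallel task with one HistoryUnit per parameter set.
check_history_unit_ids(
    history_unit_ids=[1, 2],
    task_type="parallel",
    list_parameters=[{"zarr_url": "/a"}, {"zarr_url": "/b"}],
)
```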

fractal_server/app/runner/executors/local/{_local_config.py → get_local_config.py}
RENAMED
@@ -48,13 +48,6 @@ class LocalBackendConfig(BaseModel):
     parallel_tasks_per_job: Optional[int] = None


-def get_default_local_backend_config():
-    """
-    Return a default `LocalBackendConfig` configuration object
-    """
-    return LocalBackendConfig(parallel_tasks_per_job=None)
-
-
 def get_local_backend_config(
     wftask: WorkflowTaskV2,
     which_type: Literal["non_parallel", "parallel"],
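
With `get_default_local_backend_config()` removed, a default configuration is just the model's own defaults. A minimal sketch of the equivalence, assuming only the `parallel_tasks_per_job` field shown above (the real model may carry more settings):

```python
from typing import Optional

from pydantic import BaseModel


class LocalBackendConfig(BaseModel):
    # Mirrors the field shown in the diff above.
    parallel_tasks_per_job: Optional[int] = None


# The removed helper returned LocalBackendConfig(parallel_tasks_per_job=None),
# which is identical to instantiating the model with its defaults:
assert LocalBackendConfig() == LocalBackendConfig(parallel_tasks_per_job=None)
```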

fractal_server/app/runner/executors/local/runner.py
CHANGED
@@ -2,14 +2,14 @@ from concurrent.futures import Future
 from concurrent.futures import ThreadPoolExecutor
 from pathlib import Path
 from typing import Any
-from typing import …
+from typing import Literal

-from .…
-from .…
-from fractal_server.app.runner.components import _COMPONENT_KEY_
+from .get_local_config import LocalBackendConfig
+from fractal_server.app.db import get_sync_db
 from fractal_server.app.runner.executors.base_runner import BaseRunner
 from fractal_server.app.runner.task_files import TaskFiles
-from fractal_server.app.…
+from fractal_server.app.runner.v2.db_tools import update_status_of_history_unit
+from fractal_server.app.schemas.v2 import HistoryUnitStatus
 from fractal_server.logger import set_logger

 logger = set_logger(__name__)
@@ -49,67 +49,97 @@ class LocalRunner(BaseRunner):
         self,
         func: callable,
         parameters: dict[str, Any],
+        history_unit_id: int,
         task_files: TaskFiles,
-        task_type: …
-        …
+        task_type: Literal[
+            "non_parallel",
+            "converter_non_parallel",
+            "compound",
+            "converter_compound",
+        ],
+        config: LocalBackendConfig,
     ) -> tuple[Any, Exception]:
         logger.debug("[submit] START")

-        current_task_files = TaskFiles(
-            **task_files.model_dump(
-                exclude={"component"},
-            ),
-            component=parameters[_COMPONENT_KEY_],
-        )
-
         self.validate_submit_parameters(parameters, task_type=task_type)
-        workdir_local = …
+        workdir_local = task_files.wftask_subfolder_local
         workdir_local.mkdir()

         # SUBMISSION PHASE
-        future = self.executor.submit(…
+        future = self.executor.submit(
+            func,
+            parameters=parameters,
+            remote_files=task_files.remote_files_dict,
+        )

         # RETRIEVAL PHASE
-        …
+        with next(get_sync_db()) as db:
+            try:
+                result = future.result()
+                logger.debug("[submit] END with result")
+                if task_type not in ["compound", "converter_compound"]:
+                    update_status_of_history_unit(
+                        history_unit_id=history_unit_id,
+                        status=HistoryUnitStatus.DONE,
+                        db_sync=db,
+                    )
+                return result, None
+            except Exception as e:
+                exception = e
+                logger.debug("[submit] END with exception")
+                update_status_of_history_unit(
+                    history_unit_id=history_unit_id,
+                    status=HistoryUnitStatus.FAILED,
+                    db_sync=db,
+                )
+
+                return None, exception

     def multisubmit(
         self,
         func: callable,
         list_parameters: list[dict],
-        …
+        history_unit_ids: list[int],
+        list_task_files: list[TaskFiles],
+        task_type: Literal["parallel", "compound", "converter_compound"],
+        config: LocalBackendConfig,
     ):
-        …
+        """
+        Note:
+
+        1. The number of sruns and futures is equal to `len(list_parameters)`.
+        2. The number of `HistoryUnit`s is equal to `len(history_unit_ids)`.
+        3. For compound tasks, these two numbers are not the same.
+
+        For this reason, we defer database updates to the caller function,
+        when we are in one of the "compound" cases
+
+        """

         self.validate_multisubmit_parameters(
             list_parameters=list_parameters,
             task_type=task_type,
+            list_task_files=list_task_files,
         )

-        …
+        self.validate_multisubmit_history_unit_ids(
+            history_unit_ids=history_unit_ids,
+            task_type=task_type,
+            list_parameters=list_parameters,
+        )

-        …
+        logger.debug(f"[multisubmit] START, {len(list_parameters)=}")
+
+        workdir_local = list_task_files[0].wftask_subfolder_local
+        if task_type == "parallel":
+            workdir_local.mkdir()

         # Set `n_elements` and `parallel_tasks_per_job`
         n_elements = len(list_parameters)
-        parallel_tasks_per_job = …
+        parallel_tasks_per_job = config.parallel_tasks_per_job
         if parallel_tasks_per_job is None:
            parallel_tasks_per_job = n_elements

-        original_task_files = task_files
-
         # Execute tasks, in chunks of size `parallel_tasks_per_job`
         results: dict[int, Any] = {}
         exceptions: dict[int, BaseException] = {}
@@ -119,37 +149,57 @@ class LocalRunner(BaseRunner):
             ]

             active_futures: dict[int, Future] = {}
-            active_task_files: dict[int, TaskFiles] = {}
             for ind_within_chunk, kwargs in enumerate(list_parameters_chunk):
                 positional_index = ind_chunk + ind_within_chunk
-                …
+                future = self.executor.submit(
+                    func,
+                    parameters=kwargs,
+                    remote_files=list_task_files[
+                        positional_index
+                    ].remote_files_dict,
                 )
+                active_futures[positional_index] = future

             while active_futures:
                 # FIXME: add shutdown detection
                 # if file exists: cancel all futures, and raise
                 finished_futures = [
-                    …
-                    for …
-                    if not …
+                    index_and_future
+                    for index_and_future in active_futures.items()
+                    if not index_and_future[1].running()
                 ]
-                …
+                if len(finished_futures) == 0:
+                    continue
+
+                with next(get_sync_db()) as db:
+                    for positional_index, fut in finished_futures:
+                        active_futures.pop(positional_index)
+                        if task_type == "parallel":
+                            current_history_unit_id = history_unit_ids[
+                                positional_index
+                            ]
+
+                        try:
+                            results[positional_index] = fut.result()
+                            if task_type == "parallel":
+                                update_status_of_history_unit(
+                                    history_unit_id=current_history_unit_id,
+                                    status=HistoryUnitStatus.DONE,
+                                    db_sync=db,
+                                )
+
+                        except Exception as e:
+                            exceptions[positional_index] = e
+                            if task_type == "parallel":
+                                update_status_of_history_unit(
+                                    history_unit_id=current_history_unit_id,
+                                    status=HistoryUnitStatus.FAILED,
+                                    db_sync=db,
+                                )
+
+                            # FIXME: what should happen here? Option 1: stop
+                            # all existing tasks and shutdown runner (for the
+                            # compound-task case)

             logger.debug(f"[multisubmit] END, {results=}, {exceptions=}")

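
The chunking above is the local backend's concurrency control: `parallel_tasks_per_job` caps how many futures are in flight at once. A self-contained sketch of the same pattern outside any fractal-server class; the `work` function and chunk size are illustrative only:

```python
from concurrent.futures import ThreadPoolExecutor
from typing import Any


def work(parameters: dict[str, Any]) -> int:
    # Stand-in for a fractal task; just counts the parameters.
    return len(parameters)


list_parameters = [{"zarr_url": f"/plate/{i}"} for i in range(10)]
n_elements = len(list_parameters)
parallel_tasks_per_job = 4  # from LocalBackendConfig; None means "all at once"

results: dict[int, Any] = {}
exceptions: dict[int, BaseException] = {}
with ThreadPoolExecutor() as executor:
    # Submit in chunks of size `parallel_tasks_per_job`, as in multisubmit.
    for ind_chunk in range(0, n_elements, parallel_tasks_per_job):
        chunk = list_parameters[ind_chunk : ind_chunk + parallel_tasks_per_job]
        futures = {
            ind_chunk + i: executor.submit(work, parameters=kwargs)
            for i, kwargs in enumerate(chunk)
        }
        for positional_index, fut in futures.items():
            try:
                results[positional_index] = fut.result()
            except Exception as e:
                exceptions[positional_index] = e

print(results, exceptions)
```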
fractal_server/app/runner/executors/slurm_common/_check_jobs_status.py
CHANGED
@@ -32,6 +32,10 @@ def run_squeue(job_ids: list[str]) -> subprocess.CompletedProcess:
     return res


+def are_all_jobs_on_squeue(job_ids: list[str]) -> bool:
+    pass
+
+
 def get_finished_jobs(job_ids: list[str]) -> set[str]:
     """
     Check which ones of the given Slurm jobs already finished
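
`are_all_jobs_on_squeue` lands in this release as a bare `pass` stub. A hypothetical body, assuming `run_squeue` is invoked in text mode and its stdout lists one job ID in the first column of each line (that output format is an assumption, not something this diff specifies):

```python
def are_all_jobs_on_squeue(job_ids: list[str]) -> bool:
    # Hypothetical sketch only: the released 2.14.0a12 function body is `pass`.
    # Assumes text-mode stdout with one job ID per line, first column.
    res = run_squeue(job_ids)
    listed_ids = {
        line.split()[0] for line in res.stdout.splitlines() if line.strip()
    }
    return all(str(job_id) in listed_ids for job_id in job_ids)
```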

fractal_server/app/runner/executors/slurm_ssh/executor.py
CHANGED
@@ -24,7 +24,6 @@ from ..slurm_common.utils_executors import get_pickle_file_path
 from ..slurm_common.utils_executors import get_slurm_file_path
 from ..slurm_common.utils_executors import get_slurm_script_file_path
 from ._executor_wait_thread import FractalSlurmSSHWaitThread
-from fractal_server.app.runner.components import _COMPONENT_KEY_
 from fractal_server.app.runner.compress_folder import compress_folder
 from fractal_server.app.runner.exceptions import JobExecutionError
 from fractal_server.app.runner.exceptions import TaskExecutionError
@@ -526,10 +525,13 @@ class FractalSlurmSSHExecutor(Executor):
             # `component = {"zarr_url": "/something", "param": 1}``). The
             # try/except covers the case of e.g. `executor.map([1, 2])`,
             # which is useful for testing.
-            try:
-                actual_component = component.get(_COMPONENT_KEY_, None)
-            except AttributeError:
-                actual_component = str(component)
+
+            # FIXME: the use of _COMPONENT_KEY_ is now deprecated
+            # try:
+            #     actual_component = component.get(_COMPONENT_KEY_, None)
+            # except AttributeError:
+            #     actual_component = str(component)
+            actual_component = "FAKE_INVALID_VALUE_FIXME"

             _task_file_paths = TaskFiles(
                 root_dir_local=task_files.workflow_dir_local,

fractal_server/app/runner/executors/slurm_ssh/runner.py
CHANGED
@@ -13,7 +13,6 @@ from pydantic import ConfigDict

 from ._check_job_status_ssh import get_finished_jobs_ssh
 from fractal_server import __VERSION__
-from fractal_server.app.runner.components import _COMPONENT_KEY_
 from fractal_server.app.runner.exceptions import JobExecutionError
 from fractal_server.app.runner.exceptions import TaskExecutionError
 from fractal_server.app.runner.executors.base_runner import BaseRunner
@@ -31,11 +30,6 @@ from fractal_server.logger import set_logger
 from fractal_server.ssh._fabric import FractalSSH
 from fractal_server.syringe import Inject

-# from fractal_server.app.history import ImageStatus
-# from fractal_server.app.history import update_all_images
-# from fractal_server.app.history import update_single_image
-# from fractal_server.app.history import update_single_image_logfile
-

 logger = set_logger(__name__)

@@ -500,7 +494,9 @@ class RunnerSlurmSSH(BaseRunner):
             **task_files.model_dump(
                 exclude={"component"},
             ),
-            component=parameters[_COMPONENT_KEY_],
+            # FIXME _COMPONENT_KEY_ is deprecated
+            component="FIXME_INVALID_FAKE_VALUE",
+            # component=parameters[_COMPONENT_KEY_],
         )

         if self.jobs != {}:
@@ -546,8 +542,6 @@
                 slurm_config=slurm_config,
             )

-        # LOGFILE = task_files.log_file_local
-
         # Retrieval phase
         while len(self.jobs) > 0:
             if self.is_shutdown():
@@ -638,7 +632,9 @@
             # TODO: replace with actual values
             tasks = []
             for ind_chunk, parameters in enumerate(chunk):
-                component = parameters[_COMPONENT_KEY_]
+                # FIXME: _COMPONENT_KEY_ is deprecated
+                # component = parameters[_COMPONENT_KEY_]
+                component = "INVALID_FAKE_VALUE_FIXME"
                 tasks.append(
                     SlurmTask(
                         index=(ind_batch * batch_size) + ind_chunk,