fractal-server 1.4.9__py3-none-any.whl → 2.0.0a0__py3-none-any.whl
This diff compares two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only and reflects the package contents exactly as they appear in the public registry.
- fractal_server/__init__.py +1 -1
- fractal_server/app/models/__init__.py +4 -7
- fractal_server/app/models/linkuserproject.py +9 -0
- fractal_server/app/models/security.py +6 -0
- fractal_server/app/models/state.py +1 -1
- fractal_server/app/models/v1/__init__.py +10 -0
- fractal_server/app/models/{dataset.py → v1/dataset.py} +5 -5
- fractal_server/app/models/{job.py → v1/job.py} +5 -5
- fractal_server/app/models/{project.py → v1/project.py} +5 -5
- fractal_server/app/models/{task.py → v1/task.py} +7 -2
- fractal_server/app/models/{workflow.py → v1/workflow.py} +5 -5
- fractal_server/app/models/v2/__init__.py +20 -0
- fractal_server/app/models/v2/dataset.py +55 -0
- fractal_server/app/models/v2/job.py +51 -0
- fractal_server/app/models/v2/project.py +31 -0
- fractal_server/app/models/v2/task.py +93 -0
- fractal_server/app/models/v2/workflow.py +43 -0
- fractal_server/app/models/v2/workflowtask.py +90 -0
- fractal_server/app/routes/{admin.py → admin/v1.py} +42 -42
- fractal_server/app/routes/admin/v2.py +275 -0
- fractal_server/app/routes/api/v1/__init__.py +7 -7
- fractal_server/app/routes/api/v1/_aux_functions.py +2 -2
- fractal_server/app/routes/api/v1/dataset.py +44 -37
- fractal_server/app/routes/api/v1/job.py +12 -12
- fractal_server/app/routes/api/v1/project.py +23 -21
- fractal_server/app/routes/api/v1/task.py +24 -14
- fractal_server/app/routes/api/v1/task_collection.py +16 -14
- fractal_server/app/routes/api/v1/workflow.py +24 -24
- fractal_server/app/routes/api/v1/workflowtask.py +10 -10
- fractal_server/app/routes/api/v2/__init__.py +28 -0
- fractal_server/app/routes/api/v2/_aux_functions.py +497 -0
- fractal_server/app/routes/api/v2/apply.py +220 -0
- fractal_server/app/routes/api/v2/dataset.py +310 -0
- fractal_server/app/routes/api/v2/images.py +212 -0
- fractal_server/app/routes/api/v2/job.py +200 -0
- fractal_server/app/routes/api/v2/project.py +205 -0
- fractal_server/app/routes/api/v2/task.py +222 -0
- fractal_server/app/routes/api/v2/task_collection.py +229 -0
- fractal_server/app/routes/api/v2/workflow.py +398 -0
- fractal_server/app/routes/api/v2/workflowtask.py +269 -0
- fractal_server/app/routes/aux/_job.py +1 -1
- fractal_server/app/runner/async_wrap.py +27 -0
- fractal_server/app/runner/exceptions.py +129 -0
- fractal_server/app/runner/executors/local/__init__.py +3 -0
- fractal_server/app/runner/{_local → executors/local}/executor.py +2 -2
- fractal_server/app/runner/executors/slurm/__init__.py +3 -0
- fractal_server/app/runner/{_slurm → executors/slurm}/_batching.py +1 -1
- fractal_server/app/runner/executors/slurm/_check_jobs_status.py +72 -0
- fractal_server/app/runner/{_slurm → executors/slurm}/_executor_wait_thread.py +3 -4
- fractal_server/app/runner/{_slurm → executors/slurm}/_slurm_config.py +3 -152
- fractal_server/app/runner/{_slurm → executors/slurm}/_subprocess_run_as_user.py +1 -1
- fractal_server/app/runner/{_slurm → executors/slurm}/executor.py +9 -9
- fractal_server/app/runner/filenames.py +6 -0
- fractal_server/app/runner/set_start_and_last_task_index.py +39 -0
- fractal_server/app/runner/task_files.py +105 -0
- fractal_server/app/runner/{__init__.py → v1/__init__.py} +36 -49
- fractal_server/app/runner/{_common.py → v1/_common.py} +13 -120
- fractal_server/app/runner/{_local → v1/_local}/__init__.py +6 -6
- fractal_server/app/runner/{_local → v1/_local}/_local_config.py +6 -7
- fractal_server/app/runner/{_local → v1/_local}/_submit_setup.py +1 -5
- fractal_server/app/runner/v1/_slurm/__init__.py +310 -0
- fractal_server/app/runner/{_slurm → v1/_slurm}/_submit_setup.py +3 -9
- fractal_server/app/runner/v1/_slurm/get_slurm_config.py +163 -0
- fractal_server/app/runner/v1/common.py +117 -0
- fractal_server/app/runner/{handle_failed_job.py → v1/handle_failed_job.py} +8 -8
- fractal_server/app/runner/v2/__init__.py +337 -0
- fractal_server/app/runner/v2/_local/__init__.py +169 -0
- fractal_server/app/runner/v2/_local/_local_config.py +118 -0
- fractal_server/app/runner/v2/_local/_submit_setup.py +52 -0
- fractal_server/app/runner/v2/_slurm/__init__.py +157 -0
- fractal_server/app/runner/v2/_slurm/_submit_setup.py +83 -0
- fractal_server/app/runner/v2/_slurm/get_slurm_config.py +179 -0
- fractal_server/app/runner/v2/components.py +5 -0
- fractal_server/app/runner/v2/deduplicate_list.py +24 -0
- fractal_server/app/runner/v2/handle_failed_job.py +156 -0
- fractal_server/app/runner/v2/merge_outputs.py +41 -0
- fractal_server/app/runner/v2/runner.py +264 -0
- fractal_server/app/runner/v2/runner_functions.py +339 -0
- fractal_server/app/runner/v2/runner_functions_low_level.py +134 -0
- fractal_server/app/runner/v2/task_interface.py +43 -0
- fractal_server/app/runner/v2/v1_compat.py +21 -0
- fractal_server/app/schemas/__init__.py +4 -42
- fractal_server/app/schemas/v1/__init__.py +42 -0
- fractal_server/app/schemas/{applyworkflow.py → v1/applyworkflow.py} +18 -18
- fractal_server/app/schemas/{dataset.py → v1/dataset.py} +30 -30
- fractal_server/app/schemas/{dumps.py → v1/dumps.py} +8 -8
- fractal_server/app/schemas/{manifest.py → v1/manifest.py} +5 -5
- fractal_server/app/schemas/{project.py → v1/project.py} +9 -9
- fractal_server/app/schemas/{task.py → v1/task.py} +12 -12
- fractal_server/app/schemas/{task_collection.py → v1/task_collection.py} +7 -7
- fractal_server/app/schemas/{workflow.py → v1/workflow.py} +38 -38
- fractal_server/app/schemas/v2/__init__.py +34 -0
- fractal_server/app/schemas/v2/dataset.py +88 -0
- fractal_server/app/schemas/v2/dumps.py +87 -0
- fractal_server/app/schemas/v2/job.py +113 -0
- fractal_server/app/schemas/v2/manifest.py +109 -0
- fractal_server/app/schemas/v2/project.py +36 -0
- fractal_server/app/schemas/v2/task.py +121 -0
- fractal_server/app/schemas/v2/task_collection.py +105 -0
- fractal_server/app/schemas/v2/workflow.py +78 -0
- fractal_server/app/schemas/v2/workflowtask.py +118 -0
- fractal_server/config.py +5 -10
- fractal_server/images/__init__.py +50 -0
- fractal_server/images/tools.py +86 -0
- fractal_server/main.py +11 -3
- fractal_server/migrations/versions/4b35c5cefbe3_tmp_is_v2_compatible.py +39 -0
- fractal_server/migrations/versions/56af171b0159_v2.py +217 -0
- fractal_server/migrations/versions/876f28db9d4e_tmp_split_task_and_wftask_meta.py +68 -0
- fractal_server/migrations/versions/974c802f0dd0_tmp_workflowtaskv2_type_in_db.py +37 -0
- fractal_server/migrations/versions/9cd305cd6023_tmp_workflowtaskv2.py +40 -0
- fractal_server/migrations/versions/a6231ed6273c_tmp_args_schemas_in_taskv2.py +42 -0
- fractal_server/migrations/versions/b9e9eed9d442_tmp_taskv2_type.py +37 -0
- fractal_server/migrations/versions/e3e639454d4b_tmp_make_task_meta_non_optional.py +50 -0
- fractal_server/tasks/__init__.py +0 -5
- fractal_server/tasks/endpoint_operations.py +13 -19
- fractal_server/tasks/utils.py +35 -0
- fractal_server/tasks/{_TaskCollectPip.py → v1/_TaskCollectPip.py} +3 -3
- fractal_server/tasks/{background_operations.py → v1/background_operations.py} +18 -50
- fractal_server/tasks/v1/get_collection_data.py +14 -0
- fractal_server/tasks/v2/_TaskCollectPip.py +103 -0
- fractal_server/tasks/v2/background_operations.py +382 -0
- fractal_server/tasks/v2/get_collection_data.py +14 -0
- {fractal_server-1.4.9.dist-info → fractal_server-2.0.0a0.dist-info}/METADATA +3 -4
- fractal_server-2.0.0a0.dist-info/RECORD +166 -0
- fractal_server/app/runner/_slurm/.gitignore +0 -2
- fractal_server/app/runner/_slurm/__init__.py +0 -150
- fractal_server/app/runner/common.py +0 -311
- fractal_server-1.4.9.dist-info/RECORD +0 -97
- /fractal_server/app/runner/{_slurm → executors/slurm}/remote.py +0 -0
- {fractal_server-1.4.9.dist-info → fractal_server-2.0.0a0.dist-info}/LICENSE +0 -0
- {fractal_server-1.4.9.dist-info → fractal_server-2.0.0a0.dist-info}/WHEEL +0 -0
- {fractal_server-1.4.9.dist-info → fractal_server-2.0.0a0.dist-info}/entry_points.txt +0 -0
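The headline change is the v1/v2 split: existing modules under `app/models`, `app/schemas`, `app/routes`, `app/runner`, and `tasks` move into `v1/` subpackages, with parallel `v2/` implementations added alongside. For third-party code importing fractal-server internals, a minimal sketch of the implied import migration (only `Task` is confirmed by this diff's own imports; other moved modules should follow the same pattern):

```python
# fractal-server 1.4.9 (old layout):
#   from fractal_server.app.models.task import Task

# fractal-server 2.0.0a0 (new layout); the v2 route code in this diff
# itself imports Task from the models.v1 package:
from fractal_server.app.models.v1 import Task
```

The diff hunks reproduced below cover the new v2 `workflowtask` routes, the new runner helpers, and the SLURM executor changes.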
fractal_server/app/routes/api/v2/workflowtask.py (new file):

```diff
@@ -0,0 +1,269 @@
+from copy import deepcopy
+from typing import Optional
+
+from fastapi import APIRouter
+from fastapi import Depends
+from fastapi import HTTPException
+from fastapi import Response
+from fastapi import status
+
+from ....db import AsyncSession
+from ....db import get_async_db
+from ....models.v1 import Task
+from ....models.v2 import TaskV2
+from ....schemas.v2 import WorkflowTaskCreateV2
+from ....schemas.v2 import WorkflowTaskReadV2
+from ....schemas.v2 import WorkflowTaskUpdateV2
+from ....security import current_active_user
+from ....security import User
+from ._aux_functions import _get_workflow_check_owner
+from ._aux_functions import _get_workflow_task_check_owner
+from ._aux_functions import _workflow_insert_task
+
+router = APIRouter()
+
+
+@router.post(
+    "/project/{project_id}/workflow/{workflow_id}/wftask/",
+    response_model=WorkflowTaskReadV2,
+    status_code=status.HTTP_201_CREATED,
+)
+async def create_workflowtask(
+    project_id: int,
+    workflow_id: int,
+    task_id: int,
+    new_task: WorkflowTaskCreateV2,
+    user: User = Depends(current_active_user),
+    db: AsyncSession = Depends(get_async_db),
+) -> Optional[WorkflowTaskReadV2]:
+    """
+    Add a WorkflowTask to a Workflow
+    """
+
+    workflow = await _get_workflow_check_owner(
+        project_id=project_id, workflow_id=workflow_id, user_id=user.id, db=db
+    )
+
+    if new_task.is_legacy_task is True:
+        task = await db.get(Task, task_id)
+        if not task.is_v2_compatible:
+            raise HTTPException(
+                status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
+                detail=f"Task {task_id} is not V2-compatible.",
+            )
+    else:
+        task = await db.get(TaskV2, task_id)
+
+    if not task:
+        if new_task.is_legacy_task:
+            error = f"Task {task_id} not found."
+        else:
+            error = f"TaskV2 {task_id} not found."
+
+        raise HTTPException(
+            status_code=status.HTTP_404_NOT_FOUND, detail=error
+        )
+
+    if new_task.is_legacy_task is True or task.type == "parallel":
+        if (
+            new_task.meta_non_parallel is not None
+            or new_task.args_non_parallel is not None
+        ):
+            raise HTTPException(
+                status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
+                detail=(
+                    "Cannot set `WorkflowTaskV2.meta_non_parallel` or "
+                    "`WorkflowTask.args_non_parallel` if the associated Task "
+                    "is `parallel` (or legacy)."
+                ),
+            )
+    elif task.type == "non_parallel":
+        if (
+            new_task.meta_parallel is not None
+            or new_task.args_parallel is not None
+        ):
+            raise HTTPException(
+                status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
+                detail=(
+                    "Cannot set `WorkflowTaskV2.meta_parallel` or "
+                    "`WorkflowTask.args_parallel` if the associated Task "
+                    "is `non_parallel`."
+                ),
+            )
+
+    async with db:
+        workflow_task = await _workflow_insert_task(
+            workflow_id=workflow.id,
+            is_legacy_task=new_task.is_legacy_task,
+            task_id=task_id,
+            order=new_task.order,
+            meta_non_parallel=new_task.meta_non_parallel,
+            meta_parallel=new_task.meta_parallel,
+            args_non_parallel=new_task.args_non_parallel,
+            args_parallel=new_task.args_parallel,
+            input_filters=new_task.input_filters,
+            db=db,
+        )
+
+    await db.close()
+
+    return workflow_task
+
+
+@router.get(
+    "/project/{project_id}/workflow/{workflow_id}/wftask/{workflow_task_id}/",
+    response_model=WorkflowTaskReadV2,
+)
+async def read_workflowtask(
+    project_id: int,
+    workflow_id: int,
+    workflow_task_id: int,
+    user: User = Depends(current_active_user),
+    db: AsyncSession = Depends(get_async_db),
+):
+    workflow_task, _ = await _get_workflow_task_check_owner(
+        project_id=project_id,
+        workflow_task_id=workflow_task_id,
+        workflow_id=workflow_id,
+        user_id=user.id,
+        db=db,
+    )
+    return workflow_task
+
+
+@router.patch(
+    "/project/{project_id}/workflow/{workflow_id}/wftask/{workflow_task_id}/",
+    response_model=WorkflowTaskReadV2,
+)
+async def update_workflowtask(
+    project_id: int,
+    workflow_id: int,
+    workflow_task_id: int,
+    workflow_task_update: WorkflowTaskUpdateV2,
+    user: User = Depends(current_active_user),
+    db: AsyncSession = Depends(get_async_db),
+) -> Optional[WorkflowTaskReadV2]:
+    """
+    Edit a WorkflowTask of a Workflow
+    """
+
+    db_wf_task, db_workflow = await _get_workflow_task_check_owner(
+        project_id=project_id,
+        workflow_task_id=workflow_task_id,
+        workflow_id=workflow_id,
+        user_id=user.id,
+        db=db,
+    )
+
+    if db_wf_task.task_type == "parallel" and (
+        workflow_task_update.args_non_parallel is not None
+        or workflow_task_update.meta_non_parallel is not None
+    ):
+        raise HTTPException(
+            status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
+            detail=(
+                "Cannot patch `WorkflowTaskV2.args_non_parallel` or "
+                "`WorkflowTask.meta_non_parallel` if the associated Task is "
+                "parallel."
+            ),
+        )
+    elif db_wf_task.task_type == "non_parallel" and (
+        workflow_task_update.args_parallel is not None
+        or workflow_task_update.meta_parallel is not None
+    ):
+        raise HTTPException(
+            status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
+            detail=(
+                "Cannot patch `WorkflowTaskV2.args_parallel` or "
+                "`WorkflowTask.meta_parallel` if the associated Task is "
+                "non parallel."
+            ),
+        )
+
+    for key, value in workflow_task_update.dict(exclude_unset=True).items():
+        if key == "args_parallel":
+            # Get default arguments via a Task property method
+            if db_wf_task.is_legacy_task:
+                default_args = (
+                    db_wf_task.task_legacy.default_args_from_args_schema
+                )
+            else:
+                default_args = (
+                    db_wf_task.task.default_args_parallel_from_args_schema
+                )
+            # Override default_args with args value items
+            actual_args = deepcopy(default_args)
+            if value is not None:
+                for k, v in value.items():
+                    actual_args[k] = v
+            if not actual_args:
+                actual_args = None
+            setattr(db_wf_task, key, actual_args)
+        elif key == "args_non_parallel":
+            # Get default arguments via a Task property method
+            default_args = deepcopy(
+                db_wf_task.task.default_args_non_parallel_from_args_schema
+            )
+            # Override default_args with args value items
+            actual_args = default_args.copy()
+            if value is not None:
+                for k, v in value.items():
+                    actual_args[k] = v
+            if not actual_args:
+                actual_args = None
+            setattr(db_wf_task, key, actual_args)
+        elif key == "meta_parallel":
+            current_meta_parallel = deepcopy(db_wf_task.meta_parallel) or {}
+            current_meta_parallel.update(value)
+            setattr(db_wf_task, key, current_meta_parallel)
+        elif key == "meta_non_parallel":
+            current_meta_non_parallel = (
+                deepcopy(db_wf_task.meta_non_parallel) or {}
+            )
+            current_meta_non_parallel.update(value)
+            setattr(db_wf_task, key, current_meta_non_parallel)
+        # FIXME handle `input_filters`
+        else:
+            raise HTTPException(
+                status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
+                detail=f"patch_workflow_task endpoint cannot set {key=}",
+            )
+
+    await db.commit()
+    await db.refresh(db_wf_task)
+    await db.close()
+
+    return db_wf_task
+
+
+@router.delete(
+    "/project/{project_id}/workflow/{workflow_id}/wftask/{workflow_task_id}/",
+    status_code=status.HTTP_204_NO_CONTENT,
+)
+async def delete_workflowtask(
+    project_id: int,
+    workflow_id: int,
+    workflow_task_id: int,
+    user: User = Depends(current_active_user),
+    db: AsyncSession = Depends(get_async_db),
+) -> Response:
+    """
+    Delete a WorkflowTask of a Workflow
+    """
+
+    db_workflow_task, db_workflow = await _get_workflow_task_check_owner(
+        project_id=project_id,
+        workflow_task_id=workflow_task_id,
+        workflow_id=workflow_id,
+        user_id=user.id,
+        db=db,
+    )
+
+    await db.delete(db_workflow_task)
+    await db.commit()
+
+    await db.refresh(db_workflow)
+    db_workflow.task_list.reorder()
+    await db.commit()
+
+    return Response(status_code=status.HTTP_204_NO_CONTENT)
```
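The PATCH endpoint above merges patched `args_*` values over the defaults extracted from the task's args schema, and normalizes an empty result to `None`. A standalone sketch of that merge rule (`merge_args` is an illustrative helper, not part of the server):

```python
from copy import deepcopy
from typing import Optional


def merge_args(
    default_args: dict, patch_args: Optional[dict]
) -> Optional[dict]:
    """Overlay patched values on schema defaults, as update_workflowtask does."""
    actual_args = deepcopy(default_args)
    if patch_args is not None:
        actual_args.update(patch_args)
    # The endpoint stores None instead of an empty dict
    return actual_args or None


print(merge_args({"level": 0, "overwrite": True}, {"overwrite": False}))
# {'level': 0, 'overwrite': False}
print(merge_args({}, None))
# None
```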
fractal_server/app/runner/async_wrap.py (new file):

```diff
@@ -0,0 +1,27 @@
+import asyncio
+from functools import partial
+from functools import wraps
+from typing import Callable
+
+
+def async_wrap(func: Callable) -> Callable:
+    """
+    Wrap a synchronous callable in an async task
+
+    Ref: [issue #140](https://github.com/fractal-analytics-platform/fractal-server/issues/140)
+    and [this StackOverflow answer](https://stackoverflow.com/q/43241221/19085332).
+
+    Returns:
+        async_wrapper:
+            A factory that allows wrapping a blocking callable within a
+            coroutine.
+    """  # noqa: E501
+
+    @wraps(func)
+    async def async_wrapper(*args, loop=None, executor=None, **kwargs):
+        if loop is None:
+            loop = asyncio.get_event_loop()
+        pfunc = partial(func, *args, **kwargs)
+        return await loop.run_in_executor(executor, pfunc)
+
+    return async_wrapper
```
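A minimal usage sketch for `async_wrap` (the blocking function and its timing are invented for illustration):

```python
import asyncio
import time

from fractal_server.app.runner.async_wrap import async_wrap


def blocking_work(name: str) -> str:
    # Stand-in for a CPU- or I/O-bound synchronous call
    time.sleep(1)
    return f"done: {name}"


async def main():
    # The wrapped call runs in the default executor, so the event loop
    # stays responsive while blocking_work() sleeps.
    result = await async_wrap(blocking_work)("collection")
    print(result)


asyncio.run(main())
```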
fractal_server/app/runner/exceptions.py (new file):

```diff
@@ -0,0 +1,129 @@
+import os
+from typing import Optional
+
+
+class TaskExecutionError(RuntimeError):
+    """
+    Forwards errors occurred during the execution of a task
+
+    This error wraps and forwards errors occurred during the execution of
+    tasks, when the exit code is larger than 0 (i.e. the error took place
+    within the task). This error also adds information that is useful to track
+    down and debug the failing task within a workflow.
+
+    Attributes:
+        workflow_task_id:
+            ID of the workflow task that failed.
+        workflow_task_order:
+            Order of the task within the workflow.
+        task_name:
+            Human readable name of the failing task.
+    """
+
+    workflow_task_id: Optional[int] = None
+    workflow_task_order: Optional[int] = None
+    task_name: Optional[str] = None
+
+    def __init__(
+        self,
+        *args,
+        workflow_task_id: Optional[int] = None,
+        workflow_task_order: Optional[int] = None,
+        task_name: Optional[str] = None,
+    ):
+        super().__init__(*args)
+        self.workflow_task_id = workflow_task_id
+        self.workflow_task_order = workflow_task_order
+        self.task_name = task_name
+
+
+class JobExecutionError(RuntimeError):
+    """
+    Forwards errors in the execution of a task that are due to external factors
+
+    This error wraps and forwards errors occurred during the execution of
+    tasks, but related to external factors like:
+
+    1. A negative exit code (e.g. because the task received a TERM or KILL
+       signal);
+    2. An error on the executor side (e.g. the SLURM executor could not
+       find the pickled file with task output).
+
+    This error also adds information that is useful to track down and debug the
+    failing task within a workflow.
+
+    Attributes:
+        info:
+            A free field for additional information
+        cmd_file:
+            Path to the file of the command that was executed (e.g. a SLURM
+            submission script).
+        stdout_file:
+            Path to the file with the command stdout
+        stderr_file:
+            Path to the file with the command stderr
+    """
+
+    cmd_file: Optional[str] = None
+    stdout_file: Optional[str] = None
+    stderr_file: Optional[str] = None
+    info: Optional[str] = None
+
+    def __init__(
+        self,
+        *args,
+        cmd_file: Optional[str] = None,
+        stdout_file: Optional[str] = None,
+        stderr_file: Optional[str] = None,
+        info: Optional[str] = None,
+    ):
+        super().__init__(*args)
+        self.cmd_file = cmd_file
+        self.stdout_file = stdout_file
+        self.stderr_file = stderr_file
+        self.info = info
+
+    def _read_file(self, filepath: str) -> str:
+        """
+        Return the content of a text file, and handle the cases where it is
+        empty or missing
+        """
+        if os.path.exists(filepath):
+            with open(filepath, "r") as f:
+                content = f.read()
+            if content:
+                return f"Content of {filepath}:\n{content}"
+            else:
+                return f"File {filepath} is empty\n"
+        else:
+            return f"File {filepath} is missing\n"
+
+    def assemble_error(self) -> str:
+        """
+        Read the files that are specified in attributes, and combine them in an
+        error message.
+        """
+        if self.cmd_file:
+            content = self._read_file(self.cmd_file)
+            cmd_content = f"COMMAND:\n{content}\n\n"
+        else:
+            cmd_content = ""
+        if self.stdout_file:
+            content = self._read_file(self.stdout_file)
+            out_content = f"STDOUT:\n{content}\n\n"
+        else:
+            out_content = ""
+        if self.stderr_file:
+            content = self._read_file(self.stderr_file)
+            err_content = f"STDERR:\n{content}\n\n"
+        else:
+            err_content = ""
+
+        content = f"{cmd_content}{out_content}{err_content}"
+        if self.info:
+            content = f"{content}ADDITIONAL INFO:\n{self.info}\n\n"
+
+        if not content:
+            content = str(self)
+        message = f"JobExecutionError\n\n{content}"
+        return message
```
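A short sketch of how `JobExecutionError.assemble_error` combines its inputs (all paths and messages below are invented):

```python
import tempfile

from fractal_server.app.runner.exceptions import JobExecutionError

# Pretend a job failed, leaving a stderr file but no cmd/stdout files.
with tempfile.NamedTemporaryFile("w", suffix=".err", delete=False) as f:
    f.write("slurmstepd: error: job terminated\n")
    stderr_path = f.name

err = JobExecutionError(
    cmd_file="/tmp/does-not-exist.sbatch",  # hypothetical missing file
    stderr_file=stderr_path,
    info="Output pickle not found",
)
print(err.assemble_error())
# Prints a "JobExecutionError" report with COMMAND (file missing),
# STDERR (file content), and ADDITIONAL INFO sections.
```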
fractal_server/app/runner/executors/local/executor.py (moved from fractal_server/app/runner/_local/executor.py):

```diff
@@ -18,8 +18,8 @@ from typing import Iterable
 from typing import Optional
 from typing import Sequence
 
-from ._local_config import get_default_local_backend_config
-from ._local_config import LocalBackendConfig
+from ...v1._local._local_config import get_default_local_backend_config
+from ...v1._local._local_config import LocalBackendConfig
 
 
 class FractalThreadPoolExecutor(ThreadPoolExecutor):
```
fractal_server/app/runner/executors/slurm/_check_jobs_status.py (new file):

```diff
@@ -0,0 +1,72 @@
+from subprocess import run  # nosec
+
+from cfut.slurm import STATES_FINISHED
+
+from .....logger import set_logger
+
+
+logger = set_logger(__name__)
+
+
+def run_squeue(job_ids):
+    res = run(  # nosec
+        [
+            "squeue",
+            "--noheader",
+            "--format=%i %T",
+            "--jobs",
+            ",".join([str(j) for j in job_ids]),
+            "--states=all",
+        ],
+        capture_output=True,
+        encoding="utf-8",
+        check=False,
+    )
+    if res.returncode != 0:
+        logger.warning(
+            f"squeue command with {job_ids}"
+            f" failed with:\n{res.stderr=}\n{res.stdout=}"
+        )
+
+    return res
+
+
+def _jobs_finished(job_ids) -> set[str]:
+    """
+    Check which ones of the given Slurm jobs already finished
+
+    The function is based on the `_jobs_finished` function from
+    clusterfutures (version 0.5).
+    Original Copyright: 2022 Adrian Sampson
+    (released under the MIT licence)
+    """
+
+    # If there is no Slurm job to check, return right away
+    if not job_ids:
+        return set()
+    id_to_state = dict()
+
+    res = run_squeue(job_ids)
+    if res.returncode == 0:
+        id_to_state = {
+            out.split()[0]: out.split()[1] for out in res.stdout.splitlines()
+        }
+    else:
+        id_to_state = dict()
+        for j in job_ids:
+            res = run_squeue([j])
+            if res.returncode != 0:
+                logger.info(f"Job {j} not found. Marked it as completed")
+                id_to_state.update({str(j): "COMPLETED"})
+            else:
+                id_to_state.update(
+                    {res.stdout.split()[0]: res.stdout.split()[1]}
+                )
+
+    # Finished jobs only stay in squeue for a few mins (configurable). If
+    # a job ID isn't there, we'll assume it's finished.
+    return {
+        j
+        for j in job_ids
+        if id_to_state.get(j, "COMPLETED") in STATES_FINISHED
+    }
```
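`_jobs_finished` assumes the `squeue --noheader --format="%i %T"` output has one `<job_id> <STATE>` pair per line; a small sketch of that parsing (sample output invented):

```python
# Invented sample of `squeue --noheader --format="%i %T" --states=all` output
stdout = "101 RUNNING\n102 COMPLETED\n103 FAILED\n"

id_to_state = {
    line.split()[0]: line.split()[1] for line in stdout.splitlines()
}
print(id_to_state)
# {'101': 'RUNNING', '102': 'COMPLETED', '103': 'FAILED'}

# cfut's STATES_FINISHED covers terminal states such as COMPLETED and
# FAILED, so jobs 102 and 103 count as finished; any ID missing from the
# squeue output is treated as COMPLETED.
```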
fractal_server/app/runner/executors/slurm/_executor_wait_thread.py (moved from fractal_server/app/runner/_slurm/_executor_wait_thread.py):

```diff
@@ -6,10 +6,9 @@ from typing import Callable
 from typing import Optional
 
 from cfut import FileWaitThread
-from cfut import slurm
-
-from ....logger import set_logger
 
+from .....logger import set_logger
+from ._check_jobs_status import _jobs_finished
 
 logger = set_logger(__name__)
 
```
fractal_server/app/runner/executors/slurm/_executor_wait_thread.py (continued):

```diff
@@ -121,7 +120,7 @@ class FractalSlurmWaitThread(FractalFileWaitThread):
         super().check(i)
         if i % (self.slurm_poll_interval // self.interval) == 0:
             try:
-                finished_jobs =
+                finished_jobs = _jobs_finished(self.waiting.values())
             except Exception:
                 # Don't abandon completion checking if jobs_finished errors
                 traceback.print_exc()
```
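The `i % (self.slurm_poll_interval // self.interval) == 0` guard polls `squeue` only on a fraction of the wait thread's wake-ups; a toy illustration with invented values:

```python
interval = 2  # seconds between wait-thread wake-ups (invented)
slurm_poll_interval = 10  # desired seconds between squeue polls (invented)

for i in range(12):
    if i % (slurm_poll_interval // interval) == 0:
        print(f"tick {i}: poll squeue")
# Polls at ticks 0, 5, 10, i.e. roughly every 10 s at a 2 s wake-up cadence.
```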