fractal-server 2.1.0__py3-none-any.whl → 2.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fractal_server/__init__.py +1 -1
- fractal_server/app/db/__init__.py +1 -1
- fractal_server/app/routes/admin/v1.py +2 -4
- fractal_server/app/routes/admin/v2.py +2 -4
- fractal_server/app/routes/api/v1/_aux_functions.py +24 -0
- fractal_server/app/routes/api/v1/job.py +3 -4
- fractal_server/app/routes/api/v1/project.py +28 -18
- fractal_server/app/routes/api/v2/_aux_functions.py +35 -12
- fractal_server/app/routes/api/v2/job.py +3 -4
- fractal_server/app/routes/api/v2/project.py +21 -0
- fractal_server/app/routes/api/v2/submit.py +33 -7
- fractal_server/app/routes/aux/_job.py +3 -1
- fractal_server/app/routes/aux/_runner.py +3 -3
- fractal_server/app/runner/executors/slurm/executor.py +157 -68
- fractal_server/app/runner/shutdown.py +88 -0
- fractal_server/app/runner/task_files.py +59 -27
- fractal_server/app/runner/v1/__init__.py +110 -56
- fractal_server/app/runner/v1/_common.py +53 -51
- fractal_server/app/runner/v1/_local/__init__.py +12 -11
- fractal_server/app/runner/v1/_local/_submit_setup.py +4 -4
- fractal_server/app/runner/v1/_slurm/__init__.py +16 -16
- fractal_server/app/runner/v1/_slurm/_submit_setup.py +11 -10
- fractal_server/app/runner/v1/_slurm/get_slurm_config.py +6 -6
- fractal_server/app/runner/v2/__init__.py +139 -60
- fractal_server/app/runner/v2/_local/__init__.py +12 -11
- fractal_server/app/runner/v2/_local/_local_config.py +1 -1
- fractal_server/app/runner/v2/_local/_submit_setup.py +4 -4
- fractal_server/app/runner/v2/_local_experimental/__init__.py +155 -0
- fractal_server/app/runner/v2/_local_experimental/_local_config.py +108 -0
- fractal_server/app/runner/v2/_local_experimental/_submit_setup.py +42 -0
- fractal_server/app/runner/v2/_local_experimental/executor.py +156 -0
- fractal_server/app/runner/v2/_slurm/__init__.py +10 -10
- fractal_server/app/runner/v2/_slurm/_submit_setup.py +11 -10
- fractal_server/app/runner/v2/_slurm/get_slurm_config.py +6 -6
- fractal_server/app/runner/v2/runner.py +17 -15
- fractal_server/app/runner/v2/runner_functions.py +38 -38
- fractal_server/app/runner/v2/runner_functions_low_level.py +12 -6
- fractal_server/config.py +52 -19
- fractal_server/gunicorn_fractal.py +40 -0
- fractal_server/{logger/__init__.py → logger.py} +2 -2
- fractal_server/main.py +24 -1
- fractal_server/migrations/env.py +1 -1
- {fractal_server-2.1.0.dist-info → fractal_server-2.2.0.dist-info}/METADATA +4 -1
- {fractal_server-2.1.0.dist-info → fractal_server-2.2.0.dist-info}/RECORD +47 -42
- fractal_server/logger/gunicorn_logger.py +0 -19
- {fractal_server-2.1.0.dist-info → fractal_server-2.2.0.dist-info}/LICENSE +0 -0
- {fractal_server-2.1.0.dist-info → fractal_server-2.2.0.dist-info}/WHEEL +0 -0
- {fractal_server-2.1.0.dist-info → fractal_server-2.2.0.dist-info}/entry_points.txt +0 -0
fractal_server/app/runner/task_files.py

```diff
@@ -1,32 +1,55 @@
 from pathlib import Path
 from typing import Optional
+from typing import Union
+
+from fractal_server.tasks.utils import slugify_task_name
 
 
 def sanitize_component(value: str) -> str:
     """
     Remove {" ", "/", "."} form a string, e.g. going from
     'plate.zarr/B/03/0' to 'plate_zarr_B_03_0'.
+
+    Args:
+        value: Input strig
     """
     return value.replace(" ", "_").replace("/", "_").replace(".", "_")
 
 
+def task_subfolder_name(order: Union[int, str], task_name: str) -> str:
+    """
+    Get name of task-specific subfolder.
+
+    Args:
+        order:
+        task_name:
+    """
+    task_name_slug = slugify_task_name(task_name)
+    return f"{order}_{task_name_slug}"
+
+
 class TaskFiles:
     """
     Group all file paths pertaining to a task
 
     Attributes:
-        workflow_dir:
+        workflow_dir_local:
             Server-owned directory to store all task-execution-related relevant
-            files
-
-
-
-
+            files. Note: users cannot write directly to this folder.
+        workflow_dir_remote:
+            User-side directory with the same scope as `workflow_dir_local`,
+            and where a user can write.
+        subfolder_name:
+            Name of task-specific subfolder
+        remote_subfolder:
+            Path to user-side task-specific subfolder
+        task_name:
+            Name of the task
         task_order:
             Positional order of the task within a workflow.
         component:
-            Specific component to run the task for (relevant for tasks
-
+            Specific component to run the task for (relevant for tasks to be
+            executed in parallel over many components).
         file_prefix:
             Prefix for all task-related files.
         args:
@@ -39,12 +62,16 @@ class TaskFiles:
         Path for task-execution stderr.
     """
 
-    workflow_dir: Path
-    workflow_dir_user: Path
+    workflow_dir_local: Path
+    workflow_dir_remote: Path
+    remote_subfolder: Path
+    subfolder_name: str
+    task_name: str
     task_order: Optional[int] = None
     component: Optional[str] = None
 
     file_prefix: str
+    file_prefix_with_subfolder: str
     args: Path
     out: Path
     err: Path
@@ -53,14 +80,16 @@
 
     def __init__(
         self,
-        workflow_dir: Path,
-        workflow_dir_user: Path,
+        workflow_dir_local: Path,
+        workflow_dir_remote: Path,
+        task_name: str,
         task_order: Optional[int] = None,
         component: Optional[str] = None,
     ):
-        self.workflow_dir = workflow_dir
-        self.workflow_dir_user = workflow_dir_user
+        self.workflow_dir_local = workflow_dir_local
+        self.workflow_dir_remote = workflow_dir_remote
         self.task_order = task_order
+        self.task_name = task_name
         self.component = component
 
         if self.component is not None:
@@ -72,32 +101,35 @@ class TaskFiles:
         if self.task_order is not None:
             order = str(self.task_order)
         else:
-            order = "
+            order = "0"
         self.file_prefix = f"{order}{component_safe}"
-        self.
-
-
-        self.
+        self.subfolder_name = task_subfolder_name(
+            order=order, task_name=self.task_name
+        )
+        self.remote_subfolder = self.workflow_dir_remote / self.subfolder_name
+        self.args = self.remote_subfolder / f"{self.file_prefix}.args.json"
+        self.out = self.remote_subfolder / f"{self.file_prefix}.out"
+        self.err = self.remote_subfolder / f"{self.file_prefix}.err"
+        self.log = self.remote_subfolder / f"{self.file_prefix}.log"
         self.metadiff = (
-            self.
+            self.remote_subfolder / f"{self.file_prefix}.metadiff.json"
        )
 
 
 def get_task_file_paths(
-    workflow_dir: Path,
-    workflow_dir_user: Path,
+    workflow_dir_local: Path,
+    workflow_dir_remote: Path,
+    task_name: str,
     task_order: Optional[int] = None,
     component: Optional[str] = None,
 ) -> TaskFiles:
     """
     Return the corrisponding TaskFiles object
-
-    This function is mainly used as a cache to avoid instantiating needless
-    objects.
     """
     return TaskFiles(
-        workflow_dir=workflow_dir,
-        workflow_dir_user=workflow_dir_user,
+        workflow_dir_local=workflow_dir_local,
+        workflow_dir_remote=workflow_dir_remote,
+        task_name=task_name,
        task_order=task_order,
         component=component,
     )
```
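The net effect of the `task_files.py` refactor: every job now has a server-side tree (`workflow_dir_local`, not user-writable) and a user-side tree (`workflow_dir_remote`), and all per-task artifacts resolve inside an `{order}_{slug}` subfolder of the latter. A minimal sketch of how the new class composes paths — the directories and task name below are made up for illustration, and the import path follows the file shown above:

```python
from pathlib import Path

from fractal_server.app.runner.task_files import TaskFiles
from fractal_server.app.runner.task_files import task_subfolder_name

# Hypothetical job folders; in production these come from submit_workflow.
local_dir = Path("/srv/fractal/proj_0000001_wf_0000001_job_0000001_20240101_120000")
remote_dir = Path("/home/user/cache/proj_0000001_wf_0000001_job_0000001_20240101_120000")

task_files = TaskFiles(
    workflow_dir_local=local_dir,
    workflow_dir_remote=remote_dir,
    task_name="Cellpose Segmentation",
    task_order=2,
    component="plate.zarr/B/03/0",
)

# The subfolder is "<order>_<slug>", matching the per-task folders that
# submit_workflow now creates under both job directories (see the v1 diff below).
assert task_files.subfolder_name == task_subfolder_name(
    order=2, task_name="Cellpose Segmentation"
)
# args/out/err/log/metadiff all resolve inside the user-side subfolder.
print(task_files.remote_subfolder)  # remote_dir / task_files.subfolder_name
print(task_files.args.name)         # "<file_prefix>.args.json"
```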
fractal_server/app/runner/v1/__init__.py

```diff
@@ -22,6 +22,10 @@ import traceback
 from pathlib import Path
 from typing import Optional
 
+from sqlalchemy.orm import Session as DBSyncSession
+
+from ....logger import get_logger
+from ....logger import reset_logger_handlers
 from ....logger import set_logger
 from ....syringe import Inject
 from ....utils import get_timestamp
@@ -33,7 +37,11 @@ from ...models.v1 import WorkflowTask
 from ...schemas.v1 import JobStatusTypeV1
 from ..exceptions import JobExecutionError
 from ..exceptions import TaskExecutionError
+from ..executors.slurm._subprocess_run_as_user import (
+    _mkdir_as_user,
+)
 from ..filenames import WORKFLOW_LOG_FILENAME
+from ..task_files import task_subfolder_name
 from ._local import process_workflow as local_process_workflow
 from ._slurm import process_workflow as slurm_process_workflow
 from .common import close_job_logger
@@ -49,6 +57,27 @@ _backends["local"] = local_process_workflow
 _backends["slurm"] = slurm_process_workflow
 
 
+def fail_job(
+    *,
+    db: DBSyncSession,
+    job: ApplyWorkflow,
+    log_msg: str,
+    logger_name: str,
+    emit_log: bool = False,
+) -> None:
+    logger = get_logger(logger_name=logger_name)
+    if emit_log:
+        logger.error(log_msg)
+    reset_logger_handlers(logger)
+    job.status = JobStatusTypeV1.FAILED
+    job.end_timestamp = get_timestamp()
+    job.log = log_msg
+    db.merge(job)
+    db.commit()
+    db.close()
+    return
+
+
 async def submit_workflow(
     *,
     workflow_id: int,
@@ -87,21 +116,41 @@ async def submit_workflow(
     slurm backend.
     """
 
-
-
-    FRACTAL_RUNNER_BACKEND = settings.FRACTAL_RUNNER_BACKEND
-    if FRACTAL_RUNNER_BACKEND == "local":
-        process_workflow = local_process_workflow
-    elif FRACTAL_RUNNER_BACKEND == "slurm":
-        process_workflow = slurm_process_workflow
-    else:
-        raise RuntimeError(f"Invalid runner backend {FRACTAL_RUNNER_BACKEND=}")
+    logger_name = f"WF{workflow_id}_job{job_id}"
+    logger = set_logger(logger_name=logger_name)
 
     with next(DB.get_sync_db()) as db_sync:
 
         job: ApplyWorkflow = db_sync.get(ApplyWorkflow, job_id)
         if not job:
-
+            logger.error(f"ApplyWorkflow {job_id} does not exist")
+            return
+
+        settings = Inject(get_settings)
+        FRACTAL_RUNNER_BACKEND = settings.FRACTAL_RUNNER_BACKEND
+        if FRACTAL_RUNNER_BACKEND == "local":
+            process_workflow = local_process_workflow
+        elif FRACTAL_RUNNER_BACKEND == "slurm":
+            process_workflow = slurm_process_workflow
+        else:
+
+            if FRACTAL_RUNNER_BACKEND == "local_experimental":
+                log_msg = (
+                    f"{FRACTAL_RUNNER_BACKEND=} is not available for v1 jobs."
+                )
+            else:
+                log_msg = f"Invalid {FRACTAL_RUNNER_BACKEND=}"
+
+            fail_job(
+                job=job,
+                db=db_sync,
+                log_msg=log_msg,
+                logger_name=logger_name,
+                emit_log=True,
+            )
+            return
+
+        # Declare runner backend and set `process_workflow` function
 
         input_dataset: Dataset = db_sync.get(Dataset, input_dataset_id)
         output_dataset: Dataset = db_sync.get(Dataset, output_dataset_id)
@@ -122,12 +171,9 @@ async def submit_workflow(
             log_msg += (
                 f"Cannot fetch workflow {workflow_id} from database\n"
             )
-
-
-
-            db_sync.merge(job)
-            db_sync.commit()
-            db_sync.close()
+            fail_job(
+                db=db_sync, job=job, log_msg=log_msg, logger_name=logger_name
+            )
             return
 
         # Prepare some of process_workflow arguments
@@ -137,36 +183,53 @@ async def submit_workflow(
         # Define and create server-side working folder
         project_id = workflow.project_id
         timestamp_string = get_timestamp().strftime("%Y%m%d_%H%M%S")
-        WORKFLOW_DIR = settings.FRACTAL_RUNNER_WORKING_BASE_DIR / (
+        WORKFLOW_DIR_LOCAL = settings.FRACTAL_RUNNER_WORKING_BASE_DIR / (
             f"proj_{project_id:07d}_wf_{workflow_id:07d}_job_{job_id:07d}"
             f"_{timestamp_string}"
         )
 
-        if
-
+        if WORKFLOW_DIR_LOCAL.exists():
+            fail_job(
+                db=db_sync,
+                job=job,
+                log_msg=f"Workflow dir {WORKFLOW_DIR_LOCAL} already exists.",
+                logger_name=logger_name,
+                emit_log=True,
+            )
+            return
 
-        # Create WORKFLOW_DIR
+        # Create WORKFLOW_DIR
         original_umask = os.umask(0)
-        WORKFLOW_DIR.mkdir(parents=True, mode=0o755)
+        WORKFLOW_DIR_LOCAL.mkdir(parents=True, mode=0o755)
         os.umask(original_umask)
 
-        # Define and create
+        # Define and create WORKFLOW_DIR_REMOTE
         if FRACTAL_RUNNER_BACKEND == "local":
-            WORKFLOW_DIR_USER = WORKFLOW_DIR
+            WORKFLOW_DIR_REMOTE = WORKFLOW_DIR_LOCAL
         elif FRACTAL_RUNNER_BACKEND == "slurm":
-
-
-                _mkdir_as_user,
+            WORKFLOW_DIR_REMOTE = (
+                Path(user_cache_dir) / WORKFLOW_DIR_LOCAL.name
             )
+            _mkdir_as_user(folder=str(WORKFLOW_DIR_REMOTE), user=slurm_user)
 
-
-
-
-
+        # Create all tasks subfolders
+        for order in range(job.first_task_index, job.last_task_index + 1):
+            subfolder_name = task_subfolder_name(
+                order=order,
+                task_name=workflow.task_list[order].task.name,
+            )
+            original_umask = os.umask(0)
+            (WORKFLOW_DIR_LOCAL / subfolder_name).mkdir(mode=0o755)
+            os.umask(original_umask)
+            if FRACTAL_RUNNER_BACKEND == "slurm":
+                _mkdir_as_user(
+                    folder=str(WORKFLOW_DIR_REMOTE / subfolder_name),
+                    user=slurm_user,
+                )
 
         # Update db
-        job.working_dir = WORKFLOW_DIR.as_posix()
-        job.working_dir_user = WORKFLOW_DIR_USER.as_posix()
+        job.working_dir = WORKFLOW_DIR_LOCAL.as_posix()
+        job.working_dir_user = WORKFLOW_DIR_REMOTE.as_posix()
         db_sync.merge(job)
         db_sync.commit()
 
@@ -186,8 +249,7 @@ async def submit_workflow(
         db_sync.refresh(workflow)
 
         # Write logs
-
-        log_file_path = WORKFLOW_DIR / WORKFLOW_LOG_FILENAME
+        log_file_path = WORKFLOW_DIR_LOCAL / WORKFLOW_LOG_FILENAME
         logger = set_logger(
             logger_name=logger_name,
             log_file_path=log_file_path,
@@ -234,8 +296,8 @@ async def submit_workflow(
             slurm_user=slurm_user,
             slurm_account=job.slurm_account,
             user_cache_dir=user_cache_dir,
-            workflow_dir=WORKFLOW_DIR,
-            workflow_dir_user=WORKFLOW_DIR_USER,
+            workflow_dir_local=WORKFLOW_DIR_LOCAL,
+            workflow_dir_remote=WORKFLOW_DIR_REMOTE,
             logger_name=logger_name,
             worker_init=worker_init,
             first_task_index=job.first_task_index,
@@ -286,19 +348,14 @@ async def submit_workflow(
 
         db_sync.merge(output_dataset)
 
-        job.status = JobStatusTypeV1.FAILED
-        job.end_timestamp = get_timestamp()
-
         exception_args_string = "\n".join(e.args)
-        job.log = (
+        log_msg = (
             f"TASK ERROR: "
             f"Task name: {e.task_name}, "
             f"position in Workflow: {e.workflow_task_order}\n"
             f"TRACEBACK:\n{exception_args_string}"
         )
-        db_sync.merge(job)
-        close_job_logger(logger)
-        db_sync.commit()
+        fail_job(db=db_sync, job=job, log_msg=log_msg, logger_name=logger_name)
 
     except JobExecutionError as e:
 
@@ -318,14 +375,13 @@ async def submit_workflow(
         )
 
         db_sync.merge(output_dataset)
-
-        job.status = JobStatusTypeV1.FAILED
-        job.end_timestamp = get_timestamp()
         error = e.assemble_error()
-
-
-
-
+        fail_job(
+            db=db_sync,
+            job=job,
+            log_msg=f"JOB ERROR in Fractal job {job.id}:\nTRACEBACK:\n{error}",
+            logger_name=logger_name,
+        )
 
     except Exception:
 
@@ -348,14 +404,12 @@ async def submit_workflow(
 
         db_sync.merge(output_dataset)
 
-        job.status = JobStatusTypeV1.FAILED
-        job.end_timestamp = get_timestamp()
-        job.log = (
+        log_msg = (
             f"UNKNOWN ERROR in Fractal job {job.id}\n"
             f"TRACEBACK:\n{current_traceback}"
        )
-        db_sync.merge(job)
-        close_job_logger(logger)
-        db_sync.commit()
+        fail_job(db=db_sync, job=job, log_msg=log_msg, logger_name=logger_name)
+
     finally:
         db_sync.close()
+        reset_logger_handlers(logger)
```