fractal-server 2.0.6__py3-none-any.whl → 2.2.0__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as published to one of the supported registries. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their respective public registries.
- fractal_server/__init__.py +1 -1
- fractal_server/app/db/__init__.py +1 -1
- fractal_server/app/routes/admin/v1.py +2 -4
- fractal_server/app/routes/admin/v2.py +2 -4
- fractal_server/app/routes/api/v1/_aux_functions.py +24 -0
- fractal_server/app/routes/api/v1/job.py +3 -4
- fractal_server/app/routes/api/v1/project.py +28 -18
- fractal_server/app/routes/api/v2/_aux_functions.py +35 -12
- fractal_server/app/routes/api/v2/job.py +3 -4
- fractal_server/app/routes/api/v2/project.py +21 -0
- fractal_server/app/routes/api/v2/submit.py +36 -15
- fractal_server/app/routes/aux/_job.py +3 -1
- fractal_server/app/routes/aux/_runner.py +3 -3
- fractal_server/app/runner/executors/slurm/executor.py +169 -68
- fractal_server/app/runner/shutdown.py +88 -0
- fractal_server/app/runner/task_files.py +59 -27
- fractal_server/app/runner/v1/__init__.py +113 -64
- fractal_server/app/runner/v1/_common.py +53 -51
- fractal_server/app/runner/v1/_local/__init__.py +12 -11
- fractal_server/app/runner/v1/_local/_submit_setup.py +4 -4
- fractal_server/app/runner/v1/_slurm/__init__.py +16 -16
- fractal_server/app/runner/v1/_slurm/_submit_setup.py +11 -10
- fractal_server/app/runner/v1/_slurm/get_slurm_config.py +6 -6
- fractal_server/app/runner/v2/__init__.py +139 -60
- fractal_server/app/runner/v2/_local/__init__.py +12 -11
- fractal_server/app/runner/v2/_local/_local_config.py +1 -1
- fractal_server/app/runner/v2/_local/_submit_setup.py +4 -4
- fractal_server/app/runner/v2/_local_experimental/__init__.py +155 -0
- fractal_server/app/runner/v2/_local_experimental/_local_config.py +108 -0
- fractal_server/app/runner/v2/_local_experimental/_submit_setup.py +42 -0
- fractal_server/app/runner/v2/_local_experimental/executor.py +156 -0
- fractal_server/app/runner/v2/_slurm/__init__.py +10 -10
- fractal_server/app/runner/v2/_slurm/_submit_setup.py +11 -10
- fractal_server/app/runner/v2/_slurm/get_slurm_config.py +6 -6
- fractal_server/app/runner/v2/runner.py +17 -15
- fractal_server/app/runner/v2/runner_functions.py +38 -38
- fractal_server/app/runner/v2/runner_functions_low_level.py +12 -6
- fractal_server/app/security/__init__.py +4 -5
- fractal_server/config.py +73 -19
- fractal_server/gunicorn_fractal.py +40 -0
- fractal_server/{logger/__init__.py → logger.py} +2 -2
- fractal_server/main.py +45 -26
- fractal_server/migrations/env.py +1 -1
- {fractal_server-2.0.6.dist-info → fractal_server-2.2.0.dist-info}/METADATA +4 -1
- {fractal_server-2.0.6.dist-info → fractal_server-2.2.0.dist-info}/RECORD +48 -43
- fractal_server/logger/gunicorn_logger.py +0 -19
- {fractal_server-2.0.6.dist-info → fractal_server-2.2.0.dist-info}/LICENSE +0 -0
- {fractal_server-2.0.6.dist-info → fractal_server-2.2.0.dist-info}/WHEEL +0 -0
- {fractal_server-2.0.6.dist-info → fractal_server-2.2.0.dist-info}/entry_points.txt +0 -0
fractal_server/app/runner/task_files.py

```diff
@@ -1,32 +1,55 @@
 from pathlib import Path
 from typing import Optional
+from typing import Union
+
+from fractal_server.tasks.utils import slugify_task_name
 
 
 def sanitize_component(value: str) -> str:
     """
     Remove {" ", "/", "."} form a string, e.g. going from
     'plate.zarr/B/03/0' to 'plate_zarr_B_03_0'.
+
+    Args:
+        value: Input strig
     """
     return value.replace(" ", "_").replace("/", "_").replace(".", "_")
 
 
+def task_subfolder_name(order: Union[int, str], task_name: str) -> str:
+    """
+    Get name of task-specific subfolder.
+
+    Args:
+        order:
+        task_name:
+    """
+    task_name_slug = slugify_task_name(task_name)
+    return f"{order}_{task_name_slug}"
+
+
 class TaskFiles:
     """
     Group all file paths pertaining to a task
 
     Attributes:
-
+        workflow_dir_local:
             Server-owned directory to store all task-execution-related relevant
-            files
-
-
-
-
+            files. Note: users cannot write directly to this folder.
+        workflow_dir_remote:
+            User-side directory with the same scope as `workflow_dir_local`,
+            and where a user can write.
+        subfolder_name:
+            Name of task-specific subfolder
+        remote_subfolder:
+            Path to user-side task-specific subfolder
+        task_name:
+            Name of the task
         task_order:
             Positional order of the task within a workflow.
         component:
-            Specific component to run the task for (relevant for tasks
-
+            Specific component to run the task for (relevant for tasks to be
+            executed in parallel over many components).
         file_prefix:
             Prefix for all task-related files.
         args:
@@ -39,12 +62,16 @@ class TaskFiles:
         Path for task-execution stderr.
     """
 
-
-
+    workflow_dir_local: Path
+    workflow_dir_remote: Path
+    remote_subfolder: Path
+    subfolder_name: str
+    task_name: str
     task_order: Optional[int] = None
     component: Optional[str] = None
 
     file_prefix: str
+    file_prefix_with_subfolder: str
     args: Path
     out: Path
     err: Path
@@ -53,14 +80,16 @@ class TaskFiles:
 
     def __init__(
         self,
-
-
+        workflow_dir_local: Path,
+        workflow_dir_remote: Path,
+        task_name: str,
         task_order: Optional[int] = None,
         component: Optional[str] = None,
     ):
-        self.
-        self.
+        self.workflow_dir_local = workflow_dir_local
+        self.workflow_dir_remote = workflow_dir_remote
         self.task_order = task_order
+        self.task_name = task_name
         self.component = component
 
         if self.component is not None:
@@ -72,32 +101,35 @@ class TaskFiles:
         if self.task_order is not None:
             order = str(self.task_order)
         else:
-            order = "
+            order = "0"
         self.file_prefix = f"{order}{component_safe}"
-        self.
-
-
-        self.
+        self.subfolder_name = task_subfolder_name(
+            order=order, task_name=self.task_name
+        )
+        self.remote_subfolder = self.workflow_dir_remote / self.subfolder_name
+        self.args = self.remote_subfolder / f"{self.file_prefix}.args.json"
+        self.out = self.remote_subfolder / f"{self.file_prefix}.out"
+        self.err = self.remote_subfolder / f"{self.file_prefix}.err"
+        self.log = self.remote_subfolder / f"{self.file_prefix}.log"
         self.metadiff = (
-            self.
+            self.remote_subfolder / f"{self.file_prefix}.metadiff.json"
        )
 
 
 def get_task_file_paths(
-
-
+    workflow_dir_local: Path,
+    workflow_dir_remote: Path,
+    task_name: str,
     task_order: Optional[int] = None,
     component: Optional[str] = None,
 ) -> TaskFiles:
     """
     Return the corrisponding TaskFiles object
-
-    This function is mainly used as a cache to avoid instantiating needless
-    objects.
     """
     return TaskFiles(
-
-
+        workflow_dir_local=workflow_dir_local,
+        workflow_dir_remote=workflow_dir_remote,
+        task_name=task_name,
        task_order=task_order,
        component=component,
    )
```
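With these changes, each task's I/O files (`.args.json`, `.out`, `.err`, `.log`, `.metadiff.json`) move into a per-task subfolder named `{order}_{slugified task name}` under `workflow_dir_remote`. A minimal sketch of the resulting paths, assuming fractal-server 2.2.0 is importable; the directories and task name below are illustrative, not from the package:

```python
from pathlib import Path

from fractal_server.app.runner.task_files import TaskFiles

# Illustrative arguments; in production these come from submit_workflow.
tf = TaskFiles(
    workflow_dir_local=Path("/srv/fractal/proj_0000001_wf_0000001_job_0000001"),
    workflow_dir_remote=Path("/home/user/cache/proj_0000001_wf_0000001_job_0000001"),
    task_name="Create OME-Zarr",
    task_order=2,
)

print(tf.subfolder_name)    # "2_" + slugify_task_name("Create OME-Zarr")
print(tf.remote_subfolder)  # workflow_dir_remote / tf.subfolder_name
print(tf.args)              # remote_subfolder / f"{tf.file_prefix}.args.json"
```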
fractal_server/app/runner/v1/__init__.py

```diff
@@ -22,6 +22,10 @@ import traceback
 from pathlib import Path
 from typing import Optional
 
+from sqlalchemy.orm import Session as DBSyncSession
+
+from ....logger import get_logger
+from ....logger import reset_logger_handlers
 from ....logger import set_logger
 from ....syringe import Inject
 from ....utils import get_timestamp
@@ -33,7 +37,11 @@ from ...models.v1 import WorkflowTask
 from ...schemas.v1 import JobStatusTypeV1
 from ..exceptions import JobExecutionError
 from ..exceptions import TaskExecutionError
+from ..executors.slurm._subprocess_run_as_user import (
+    _mkdir_as_user,
+)
 from ..filenames import WORKFLOW_LOG_FILENAME
+from ..task_files import task_subfolder_name
 from ._local import process_workflow as local_process_workflow
 from ._slurm import process_workflow as slurm_process_workflow
 from .common import close_job_logger
@@ -49,6 +57,27 @@ _backends["local"] = local_process_workflow
 _backends["slurm"] = slurm_process_workflow
 
 
+def fail_job(
+    *,
+    db: DBSyncSession,
+    job: ApplyWorkflow,
+    log_msg: str,
+    logger_name: str,
+    emit_log: bool = False,
+) -> None:
+    logger = get_logger(logger_name=logger_name)
+    if emit_log:
+        logger.error(log_msg)
+    reset_logger_handlers(logger)
+    job.status = JobStatusTypeV1.FAILED
+    job.end_timestamp = get_timestamp()
+    job.log = log_msg
+    db.merge(job)
+    db.commit()
+    db.close()
+    return
+
+
 async def submit_workflow(
     *,
     workflow_id: int,
```
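The new `fail_job` helper consolidates the failure bookkeeping that the old code repeated inline in every error branch: mark the job `FAILED`, set the end timestamp, store the log message, then merge/commit/close the sync session. A stand-alone sketch of that pattern with stand-in objects (not the package's models or session), just to show the state transitions:

```python
from datetime import datetime, timezone


class FakeJob:
    """Stand-in for the ApplyWorkflow model (illustrative only)."""
    status = "submitted"
    end_timestamp = None
    log = None


class FakeSession:
    """Stand-in for the sync SQLAlchemy session that fail_job receives."""
    def merge(self, obj):
        self.merged = obj

    def commit(self):
        self.committed = True

    def close(self):
        self.closed = True


def fail_job(*, db, job, log_msg):
    # Same bookkeeping as the helper above: FAILED status, end timestamp,
    # log message, then merge/commit/close on the sync session.
    job.status = "failed"
    job.end_timestamp = datetime.now(tz=timezone.utc)
    job.log = log_msg
    db.merge(job)
    db.commit()
    db.close()


job, db = FakeJob(), FakeSession()
fail_job(db=db, job=job, log_msg="TASK ERROR: ...")
assert job.status == "failed" and db.closed
```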
```diff
@@ -87,21 +116,41 @@ async def submit_workflow(
     slurm backend.
     """
 
-
-
-    FRACTAL_RUNNER_BACKEND = settings.FRACTAL_RUNNER_BACKEND
-    if FRACTAL_RUNNER_BACKEND == "local":
-        process_workflow = local_process_workflow
-    elif FRACTAL_RUNNER_BACKEND == "slurm":
-        process_workflow = slurm_process_workflow
-    else:
-        raise RuntimeError(f"Invalid runner backend {FRACTAL_RUNNER_BACKEND=}")
+    logger_name = f"WF{workflow_id}_job{job_id}"
+    logger = set_logger(logger_name=logger_name)
 
     with next(DB.get_sync_db()) as db_sync:
 
         job: ApplyWorkflow = db_sync.get(ApplyWorkflow, job_id)
         if not job:
-
+            logger.error(f"ApplyWorkflow {job_id} does not exist")
+            return
+
+        settings = Inject(get_settings)
+        FRACTAL_RUNNER_BACKEND = settings.FRACTAL_RUNNER_BACKEND
+        if FRACTAL_RUNNER_BACKEND == "local":
+            process_workflow = local_process_workflow
+        elif FRACTAL_RUNNER_BACKEND == "slurm":
+            process_workflow = slurm_process_workflow
+        else:
+
+            if FRACTAL_RUNNER_BACKEND == "local_experimental":
+                log_msg = (
+                    f"{FRACTAL_RUNNER_BACKEND=} is not available for v1 jobs."
+                )
+            else:
+                log_msg = f"Invalid {FRACTAL_RUNNER_BACKEND=}"
+
+            fail_job(
+                job=job,
+                db=db_sync,
+                log_msg=log_msg,
+                logger_name=logger_name,
+                emit_log=True,
+            )
+            return
+
+        # Declare runner backend and set `process_workflow` function
 
         input_dataset: Dataset = db_sync.get(Dataset, input_dataset_id)
         output_dataset: Dataset = db_sync.get(Dataset, output_dataset_id)
```
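The log messages above rely on the f-string `=` specifier (Python 3.8+), which expands to both the expression text and its `repr`. A one-line illustration with a made-up value:

```python
FRACTAL_RUNNER_BACKEND = "local_experimental"  # illustrative value
print(f"{FRACTAL_RUNNER_BACKEND=} is not available for v1 jobs.")
# FRACTAL_RUNNER_BACKEND='local_experimental' is not available for v1 jobs.
```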
```diff
@@ -122,12 +171,9 @@ async def submit_workflow(
             log_msg += (
                 f"Cannot fetch workflow {workflow_id} from database\n"
             )
-
-
-
-            db_sync.merge(job)
-            db_sync.commit()
-            db_sync.close()
+            fail_job(
+                db=db_sync, job=job, log_msg=log_msg, logger_name=logger_name
+            )
             return
 
         # Prepare some of process_workflow arguments
```
```diff
@@ -137,41 +183,53 @@ async def submit_workflow(
         # Define and create server-side working folder
         project_id = workflow.project_id
         timestamp_string = get_timestamp().strftime("%Y%m%d_%H%M%S")
-
-
-
-
-            f"_{timestamp_string}"
-        )
-        ).resolve()
+        WORKFLOW_DIR_LOCAL = settings.FRACTAL_RUNNER_WORKING_BASE_DIR / (
+            f"proj_{project_id:07d}_wf_{workflow_id:07d}_job_{job_id:07d}"
+            f"_{timestamp_string}"
+        )
 
-        if
-
+        if WORKFLOW_DIR_LOCAL.exists():
+            fail_job(
+                db=db_sync,
+                job=job,
+                log_msg=f"Workflow dir {WORKFLOW_DIR_LOCAL} already exists.",
+                logger_name=logger_name,
+                emit_log=True,
+            )
+            return
 
-        # Create WORKFLOW_DIR
+        # Create WORKFLOW_DIR
         original_umask = os.umask(0)
-
+        WORKFLOW_DIR_LOCAL.mkdir(parents=True, mode=0o755)
         os.umask(original_umask)
 
-        # Define and create
+        # Define and create WORKFLOW_DIR_REMOTE
         if FRACTAL_RUNNER_BACKEND == "local":
-
+            WORKFLOW_DIR_REMOTE = WORKFLOW_DIR_LOCAL
         elif FRACTAL_RUNNER_BACKEND == "slurm":
-
-
-                _mkdir_as_user,
+            WORKFLOW_DIR_REMOTE = (
+                Path(user_cache_dir) / WORKFLOW_DIR_LOCAL.name
             )
+            _mkdir_as_user(folder=str(WORKFLOW_DIR_REMOTE), user=slurm_user)
 
-
-
-
-
-
-
+        # Create all tasks subfolders
+        for order in range(job.first_task_index, job.last_task_index + 1):
+            subfolder_name = task_subfolder_name(
+                order=order,
+                task_name=workflow.task_list[order].task.name,
+            )
+            original_umask = os.umask(0)
+            (WORKFLOW_DIR_LOCAL / subfolder_name).mkdir(mode=0o755)
+            os.umask(original_umask)
+            if FRACTAL_RUNNER_BACKEND == "slurm":
+                _mkdir_as_user(
+                    folder=str(WORKFLOW_DIR_REMOTE / subfolder_name),
+                    user=slurm_user,
+                )
 
         # Update db
-        job.working_dir =
-        job.working_dir_user =
+        job.working_dir = WORKFLOW_DIR_LOCAL.as_posix()
+        job.working_dir_user = WORKFLOW_DIR_REMOTE.as_posix()
         db_sync.merge(job)
         db_sync.commit()
 
```
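The folder creation above brackets each `mkdir(mode=0o755)` in an `os.umask(0)` / `os.umask(original_umask)` pair, because the mode passed to `mkdir` is filtered through the process umask. A small stand-alone illustration of why that matters (POSIX only; the temp directory is just a scratch location):

```python
import os
import stat
import tempfile
from pathlib import Path

base = Path(tempfile.mkdtemp())

# Without clearing the umask, mkdir(mode=0o755) can yield a narrower mode,
# e.g. 0o750 under a restrictive umask such as 0o027.
os.umask(0o027)
(base / "masked").mkdir(mode=0o755)
print(oct(stat.S_IMODE((base / "masked").stat().st_mode)))  # 0o750

# The pattern used in the diff: clear the umask, create, restore the
# previous value.
original_umask = os.umask(0)
(base / "unmasked").mkdir(mode=0o755)
os.umask(original_umask)
print(oct(stat.S_IMODE((base / "unmasked").stat().st_mode)))  # 0o755
```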
```diff
@@ -191,8 +249,7 @@ async def submit_workflow(
         db_sync.refresh(workflow)
 
         # Write logs
-
-        log_file_path = WORKFLOW_DIR / WORKFLOW_LOG_FILENAME
+        log_file_path = WORKFLOW_DIR_LOCAL / WORKFLOW_LOG_FILENAME
         logger = set_logger(
             logger_name=logger_name,
             log_file_path=log_file_path,
@@ -239,8 +296,8 @@ async def submit_workflow(
             slurm_user=slurm_user,
             slurm_account=job.slurm_account,
             user_cache_dir=user_cache_dir,
-
-
+            workflow_dir_local=WORKFLOW_DIR_LOCAL,
+            workflow_dir_remote=WORKFLOW_DIR_REMOTE,
             logger_name=logger_name,
             worker_init=worker_init,
             first_task_index=job.first_task_index,
@@ -291,19 +348,14 @@ async def submit_workflow(
 
         db_sync.merge(output_dataset)
 
-        job.status = JobStatusTypeV1.FAILED
-        job.end_timestamp = get_timestamp()
-
         exception_args_string = "\n".join(e.args)
-
+        log_msg = (
             f"TASK ERROR: "
             f"Task name: {e.task_name}, "
             f"position in Workflow: {e.workflow_task_order}\n"
             f"TRACEBACK:\n{exception_args_string}"
         )
-        db_sync
-        close_job_logger(logger)
-        db_sync.commit()
+        fail_job(db=db_sync, job=job, log_msg=log_msg, logger_name=logger_name)
 
     except JobExecutionError as e:
 
@@ -323,14 +375,13 @@ async def submit_workflow(
         )
 
         db_sync.merge(output_dataset)
-
-        job.status = JobStatusTypeV1.FAILED
-        job.end_timestamp = get_timestamp()
         error = e.assemble_error()
-
-
-
-
+        fail_job(
+            db=db_sync,
+            job=job,
+            log_msg=f"JOB ERROR in Fractal job {job.id}:\nTRACEBACK:\n{error}",
+            logger_name=logger_name,
+        )
 
     except Exception:
 
@@ -353,14 +404,12 @@ async def submit_workflow(
 
         db_sync.merge(output_dataset)
 
-
-        job.end_timestamp = get_timestamp()
-        job.log = (
+        log_msg = (
             f"UNKNOWN ERROR in Fractal job {job.id}\n"
             f"TRACEBACK:\n{current_traceback}"
         )
-        db_sync
-
-        db_sync.commit()
+        fail_job(db=db_sync, job=job, log_msg=log_msg, logger_name=logger_name)
+
     finally:
         db_sync.close()
+        reset_logger_handlers(logger)
```
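All three `except` branches now funnel into `fail_job`, and the `finally` block both closes the sync session and calls the new `reset_logger_handlers` helper imported from the reworked `fractal_server/logger.py`. That helper's body is not part of this excerpt; the following is a hypothetical local re-implementation, sketching the usual reason for such a reset (flush and detach handlers so repeated job runs do not accumulate them). The log path is illustrative:

```python
import logging


def reset_logger_handlers(logger: logging.Logger) -> None:
    # Hypothetical stand-in for the package helper: close each handler
    # (flushing file handlers) and detach it from the logger.
    for handler in list(logger.handlers):
        handler.close()
        logger.removeHandler(handler)


logger = logging.getLogger("WF1_job1")
logger.addHandler(logging.FileHandler("/tmp/WF1_job1.log"))
logger.error("UNKNOWN ERROR in Fractal job 1")
reset_logger_handlers(logger)
assert logger.handlers == []
```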