fractal-server 2.14.0a2__py3-none-any.whl → 2.14.0a4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fractal_server/__init__.py +1 -1
- fractal_server/__main__.py +3 -1
- fractal_server/app/history/__init__.py +4 -4
- fractal_server/app/history/image_updates.py +124 -142
- fractal_server/app/history/status_enum.py +2 -2
- fractal_server/app/models/v2/__init__.py +6 -4
- fractal_server/app/models/v2/history.py +44 -20
- fractal_server/app/routes/admin/v2/task.py +1 -1
- fractal_server/app/routes/api/__init__.py +1 -1
- fractal_server/app/routes/api/v2/__init__.py +4 -0
- fractal_server/app/routes/api/v2/_aux_functions_history.py +49 -0
- fractal_server/app/routes/api/v2/dataset.py +0 -12
- fractal_server/app/routes/api/v2/history.py +302 -176
- fractal_server/app/routes/api/v2/project.py +1 -26
- fractal_server/app/routes/api/v2/status_legacy.py +168 -0
- fractal_server/app/routes/api/v2/workflow.py +2 -17
- fractal_server/app/routes/api/v2/workflowtask.py +41 -71
- fractal_server/app/routes/auth/oauth.py +5 -3
- fractal_server/app/runner/executors/base_runner.py +2 -1
- fractal_server/app/runner/executors/local/_submit_setup.py +5 -13
- fractal_server/app/runner/executors/local/runner.py +10 -55
- fractal_server/app/runner/executors/slurm_common/_slurm_config.py +1 -1
- fractal_server/app/runner/executors/slurm_common/get_slurm_config.py +1 -1
- fractal_server/app/runner/executors/slurm_common/remote.py +1 -1
- fractal_server/app/runner/executors/slurm_sudo/runner.py +171 -108
- fractal_server/app/runner/v2/__init__.py +2 -22
- fractal_server/app/runner/v2/_slurm_ssh.py +1 -1
- fractal_server/app/runner/v2/_slurm_sudo.py +1 -1
- fractal_server/app/runner/v2/runner.py +47 -59
- fractal_server/app/runner/v2/runner_functions.py +185 -69
- fractal_server/app/schemas/_validators.py +13 -24
- fractal_server/app/schemas/user.py +10 -7
- fractal_server/app/schemas/user_settings.py +9 -21
- fractal_server/app/schemas/v2/dataset.py +8 -6
- fractal_server/app/schemas/v2/job.py +9 -5
- fractal_server/app/schemas/v2/manifest.py +3 -7
- fractal_server/app/schemas/v2/project.py +9 -7
- fractal_server/app/schemas/v2/task.py +41 -77
- fractal_server/app/schemas/v2/task_collection.py +14 -32
- fractal_server/app/schemas/v2/task_group.py +10 -9
- fractal_server/app/schemas/v2/workflow.py +10 -11
- fractal_server/app/security/__init__.py +3 -3
- fractal_server/app/security/signup_email.py +2 -2
- fractal_server/config.py +33 -34
- fractal_server/migrations/versions/fbce16ff4e47_new_history_items.py +120 -0
- fractal_server/tasks/v2/templates/2_pip_install.sh +1 -1
- fractal_server/tasks/v2/templates/4_pip_show.sh +1 -1
- fractal_server/tasks/v2/utils_templates.py +6 -0
- {fractal_server-2.14.0a2.dist-info → fractal_server-2.14.0a4.dist-info}/METADATA +1 -1
- {fractal_server-2.14.0a2.dist-info → fractal_server-2.14.0a4.dist-info}/RECORD +53 -54
- fractal_server/app/runner/executors/slurm_sudo/_executor_wait_thread.py +0 -130
- fractal_server/app/schemas/v2/history.py +0 -23
- fractal_server/migrations/versions/87cd72a537a2_add_historyitem_table.py +0 -68
- fractal_server/migrations/versions/954ddc64425a_image_status.py +0 -63
- {fractal_server-2.14.0a2.dist-info → fractal_server-2.14.0a4.dist-info}/LICENSE +0 -0
- {fractal_server-2.14.0a2.dist-info → fractal_server-2.14.0a4.dist-info}/WHEEL +0 -0
- {fractal_server-2.14.0a2.dist-info → fractal_server-2.14.0a4.dist-info}/entry_points.txt +0 -0
@@ -1,5 +1,7 @@
|
|
1
1
|
import json
|
2
2
|
import logging
|
3
|
+
import math
|
4
|
+
import os
|
3
5
|
import shlex
|
4
6
|
import subprocess # nosec
|
5
7
|
import sys
|
@@ -16,13 +18,13 @@ from ._check_jobs_status import get_finished_jobs
|
|
16
18
|
from ._subprocess_run_as_user import _mkdir_as_user
|
17
19
|
from ._subprocess_run_as_user import _run_command_as_user
|
18
20
|
from fractal_server import __VERSION__
|
19
|
-
from fractal_server.app.history import HistoryItemImageStatus
|
20
|
-
from fractal_server.app.history import update_all_images
|
21
|
-
from fractal_server.app.history import update_single_image
|
22
21
|
from fractal_server.app.runner.components import _COMPONENT_KEY_
|
23
22
|
from fractal_server.app.runner.exceptions import JobExecutionError
|
24
23
|
from fractal_server.app.runner.exceptions import TaskExecutionError
|
25
24
|
from fractal_server.app.runner.executors.base_runner import BaseRunner
|
25
|
+
from fractal_server.app.runner.executors.slurm_common._batching import (
|
26
|
+
heuristics,
|
27
|
+
)
|
26
28
|
from fractal_server.app.runner.executors.slurm_common._slurm_config import (
|
27
29
|
SlurmConfig,
|
28
30
|
)
|
@@ -32,6 +34,11 @@ from fractal_server.config import get_settings
|
|
32
34
|
from fractal_server.logger import set_logger
|
33
35
|
from fractal_server.syringe import Inject
|
34
36
|
|
37
|
+
# from fractal_server.app.history import ImageStatus
|
38
|
+
# from fractal_server.app.history import update_all_images
|
39
|
+
# from fractal_server.app.history import update_single_image
|
40
|
+
# from fractal_server.app.history import update_single_image_logfile
|
41
|
+
|
35
42
|
|
36
43
|
logger = set_logger(__name__)
|
37
44
|
|
@@ -56,8 +63,10 @@ class SlurmTask(BaseModel):
|
|
56
63
|
component: str
|
57
64
|
workdir_local: Path
|
58
65
|
workdir_remote: Path
|
66
|
+
parameters: dict[str, Any]
|
59
67
|
zarr_url: Optional[str] = None
|
60
68
|
task_files: TaskFiles
|
69
|
+
index: int
|
61
70
|
|
62
71
|
@property
|
63
72
|
def input_pickle_file_local(self) -> str:
|
@@ -89,7 +98,7 @@ class SlurmJob(BaseModel):
|
|
89
98
|
label: str
|
90
99
|
workdir_local: Path
|
91
100
|
workdir_remote: Path
|
92
|
-
tasks:
|
101
|
+
tasks: list[SlurmTask]
|
93
102
|
|
94
103
|
@property
|
95
104
|
def slurm_log_file_local(self) -> str:
|
@@ -213,7 +222,9 @@ class RunnerSlurmSudo(BaseRunner):
|
|
213
222
|
self.root_dir_remote = root_dir_remote
|
214
223
|
|
215
224
|
# Create folders
|
216
|
-
|
225
|
+
original_umask = os.umask(0)
|
226
|
+
self.root_dir_local.mkdir(parents=True, exist_ok=True, mode=0o755)
|
227
|
+
os.umask(original_umask)
|
217
228
|
_mkdir_as_user(
|
218
229
|
folder=self.root_dir_remote.as_posix(),
|
219
230
|
user=self.slurm_user,
|
@@ -242,9 +253,8 @@ class RunnerSlurmSudo(BaseRunner):
|
|
242
253
|
def is_shutdown(self) -> bool:
|
243
254
|
return self.shutdown_file.exists()
|
244
255
|
|
245
|
-
def
|
246
|
-
|
247
|
-
logger.debug("[exit_if_shutdown] START")
|
256
|
+
def scancel_jobs(self) -> None:
|
257
|
+
logger.debug("[scancel_jobs] START")
|
248
258
|
|
249
259
|
if self.jobs:
|
250
260
|
scancel_string = " ".join(self.job_ids)
|
@@ -258,22 +268,20 @@ class RunnerSlurmSudo(BaseRunner):
|
|
258
268
|
)
|
259
269
|
except RuntimeError as e:
|
260
270
|
logger.warning(
|
261
|
-
"[
|
271
|
+
"[scancel_jobs] `scancel` command failed. "
|
262
272
|
f"Original error:\n{str(e)}"
|
263
273
|
)
|
264
274
|
|
265
|
-
logger.debug("[
|
275
|
+
logger.debug("[scancel_jobs] END")
|
266
276
|
|
267
277
|
def _submit_single_sbatch(
|
268
278
|
self,
|
269
279
|
func,
|
270
|
-
parameters, # FIXME this should be per-task
|
271
280
|
slurm_job: SlurmJob,
|
272
281
|
slurm_config: SlurmConfig,
|
273
282
|
) -> str:
|
274
|
-
|
275
|
-
|
276
|
-
raise NotImplementedError()
|
283
|
+
# if len(slurm_job.tasks) > 1:
|
284
|
+
# raise NotImplementedError()
|
277
285
|
|
278
286
|
# Prepare input pickle(s)
|
279
287
|
versions = dict(
|
@@ -283,10 +291,7 @@ class RunnerSlurmSudo(BaseRunner):
|
|
283
291
|
)
|
284
292
|
for task in slurm_job.tasks:
|
285
293
|
_args = []
|
286
|
-
|
287
|
-
_kwargs = dict(
|
288
|
-
parameters=parameters
|
289
|
-
) # FIXME: this should be per-tas
|
294
|
+
_kwargs = dict(parameters=task.parameters)
|
290
295
|
funcser = cloudpickle.dumps((versions, func, _args, _kwargs))
|
291
296
|
with open(task.input_pickle_file_local, "wb") as f:
|
292
297
|
f.write(funcser)
|
@@ -339,6 +344,7 @@ class RunnerSlurmSudo(BaseRunner):
|
|
339
344
|
|
340
345
|
# Add job to self.jobs
|
341
346
|
self.jobs[slurm_job.slurm_job_id] = slurm_job
|
347
|
+
logger.debug(f"Added {slurm_job.slurm_job_id} to self.jobs.")
|
342
348
|
|
343
349
|
@property
|
344
350
|
def job_ids(self) -> list[str]:
|
@@ -352,22 +358,25 @@ class RunnerSlurmSudo(BaseRunner):
|
|
352
358
|
(job.slurm_log_file_remote, job.slurm_log_file_local)
|
353
359
|
]
|
354
360
|
for task in job.tasks:
|
355
|
-
source_target_list.
|
356
|
-
|
357
|
-
|
358
|
-
|
359
|
-
|
360
|
-
|
361
|
-
|
362
|
-
|
363
|
-
|
364
|
-
|
365
|
-
|
366
|
-
|
367
|
-
|
368
|
-
|
369
|
-
|
370
|
-
|
361
|
+
source_target_list.extend(
|
362
|
+
[
|
363
|
+
(
|
364
|
+
task.output_pickle_file_remote,
|
365
|
+
task.output_pickle_file_local,
|
366
|
+
),
|
367
|
+
(
|
368
|
+
task.task_files.log_file_remote,
|
369
|
+
task.task_files.log_file_local,
|
370
|
+
),
|
371
|
+
(
|
372
|
+
task.task_files.args_file_remote,
|
373
|
+
task.task_files.args_file_local,
|
374
|
+
),
|
375
|
+
(
|
376
|
+
task.task_files.metadiff_file_remote,
|
377
|
+
task.task_files.metadiff_file_local,
|
378
|
+
),
|
379
|
+
]
|
371
380
|
)
|
372
381
|
|
373
382
|
for source, target in source_target_list:
|
@@ -383,12 +392,12 @@ class RunnerSlurmSudo(BaseRunner):
|
|
383
392
|
# Write local file
|
384
393
|
with open(target, "wb") as f:
|
385
394
|
f.write(res.stdout)
|
395
|
+
logger.critical(f"Copied {source} into {target}")
|
386
396
|
except RuntimeError as e:
|
387
397
|
logger.warning(
|
388
398
|
f"SKIP copy {source} into {target}. "
|
389
399
|
f"Original error: {str(e)}"
|
390
400
|
)
|
391
|
-
logger.debug(f"Copied {source} into {target}")
|
392
401
|
|
393
402
|
def _postprocess_single_task(
|
394
403
|
self, *, task: SlurmTask
|
@@ -416,11 +425,9 @@ class RunnerSlurmSudo(BaseRunner):
|
|
416
425
|
parameters: dict[str, Any],
|
417
426
|
history_item_id: int,
|
418
427
|
task_files: TaskFiles,
|
428
|
+
slurm_config: SlurmConfig,
|
419
429
|
in_compound_task: bool = False,
|
420
|
-
slurm_config: Optional[SlurmConfig] = None,
|
421
|
-
**kwargs,
|
422
430
|
) -> tuple[Any, Exception]:
|
423
|
-
|
424
431
|
workdir_local = task_files.wftask_subfolder_local
|
425
432
|
workdir_remote = task_files.wftask_subfolder_remote
|
426
433
|
|
@@ -433,25 +440,29 @@ class RunnerSlurmSudo(BaseRunner):
|
|
433
440
|
|
434
441
|
if self.jobs != {}:
|
435
442
|
if not in_compound_task:
|
436
|
-
|
437
|
-
|
438
|
-
|
439
|
-
|
443
|
+
pass
|
444
|
+
# update_all_images(
|
445
|
+
# history_item_id=history_item_id,
|
446
|
+
# status=ImageStatus.FAILED,
|
447
|
+
# )
|
440
448
|
raise JobExecutionError("Unexpected branch: jobs should be empty.")
|
441
449
|
|
442
450
|
if self.is_shutdown():
|
443
451
|
if not in_compound_task:
|
444
|
-
|
445
|
-
|
446
|
-
|
447
|
-
|
452
|
+
pass
|
453
|
+
# update_all_images(
|
454
|
+
# history_item_id=history_item_id,
|
455
|
+
# status=ImageStatus.FAILED,
|
456
|
+
# )
|
448
457
|
raise JobExecutionError("Cannot continue after shutdown.")
|
449
458
|
|
450
459
|
# Validation phase
|
451
460
|
self.validate_submit_parameters(parameters)
|
452
461
|
|
453
462
|
# Create task subfolder
|
454
|
-
|
463
|
+
original_umask = os.umask(0)
|
464
|
+
workdir_local.mkdir(parents=True, mode=0o755)
|
465
|
+
os.umask(original_umask)
|
455
466
|
_mkdir_as_user(
|
456
467
|
folder=workdir_remote.as_posix(),
|
457
468
|
user=self.slurm_user,
|
@@ -464,7 +475,9 @@ class RunnerSlurmSudo(BaseRunner):
|
|
464
475
|
workdir_remote=workdir_remote,
|
465
476
|
tasks=[
|
466
477
|
SlurmTask(
|
478
|
+
index=0,
|
467
479
|
component="0",
|
480
|
+
parameters=parameters,
|
468
481
|
workdir_remote=workdir_remote,
|
469
482
|
workdir_local=workdir_local,
|
470
483
|
task_files=task_files,
|
@@ -473,16 +486,16 @@ class RunnerSlurmSudo(BaseRunner):
|
|
473
486
|
) # TODO: replace with actual values (BASED ON TASKFILES)
|
474
487
|
self._submit_single_sbatch(
|
475
488
|
func,
|
476
|
-
parameters=parameters,
|
477
489
|
slurm_job=slurm_job,
|
490
|
+
slurm_config=slurm_config,
|
478
491
|
)
|
479
492
|
|
480
|
-
LOGFILE = task_files.log_file_local
|
493
|
+
# LOGFILE = task_files.log_file_local
|
481
494
|
|
482
495
|
# Retrieval phase
|
483
496
|
while len(self.jobs) > 0:
|
484
497
|
if self.is_shutdown():
|
485
|
-
self.
|
498
|
+
self.scancel_jobs()
|
486
499
|
finished_job_ids = get_finished_jobs(job_ids=self.job_ids)
|
487
500
|
for slurm_job_id in finished_job_ids:
|
488
501
|
slurm_job = self.jobs.pop(slurm_job_id)
|
@@ -494,17 +507,19 @@ class RunnerSlurmSudo(BaseRunner):
|
|
494
507
|
|
495
508
|
if not in_compound_task:
|
496
509
|
if exception is None:
|
497
|
-
|
498
|
-
|
499
|
-
|
500
|
-
|
501
|
-
|
510
|
+
pass
|
511
|
+
# update_all_images(
|
512
|
+
# history_item_id=history_item_id,
|
513
|
+
# status=ImageStatus.DONE,
|
514
|
+
# logfile=LOGFILE,
|
515
|
+
# )
|
502
516
|
else:
|
503
|
-
|
504
|
-
|
505
|
-
|
506
|
-
|
507
|
-
|
517
|
+
pass
|
518
|
+
# update_all_images(
|
519
|
+
# history_item_id=history_item_id,
|
520
|
+
# status=ImageStatus.FAILED,
|
521
|
+
# logfile=LOGFILE,
|
522
|
+
# )
|
508
523
|
|
509
524
|
return result, exception
|
510
525
|
|
@@ -514,10 +529,10 @@ class RunnerSlurmSudo(BaseRunner):
|
|
514
529
|
list_parameters: list[dict],
|
515
530
|
history_item_id: int,
|
516
531
|
task_files: TaskFiles,
|
532
|
+
slurm_config: SlurmConfig,
|
517
533
|
in_compound_task: bool = False,
|
518
|
-
**kwargs,
|
519
534
|
):
|
520
|
-
self.
|
535
|
+
# self.scancel_jobs()
|
521
536
|
|
522
537
|
self.validate_multisubmit_parameters(
|
523
538
|
list_parameters=list_parameters,
|
@@ -527,82 +542,130 @@ class RunnerSlurmSudo(BaseRunner):
|
|
527
542
|
workdir_local = task_files.wftask_subfolder_local
|
528
543
|
workdir_remote = task_files.wftask_subfolder_remote
|
529
544
|
|
530
|
-
# Create
|
531
|
-
|
532
|
-
|
533
|
-
|
534
|
-
|
535
|
-
|
545
|
+
# Create local&remote task subfolders
|
546
|
+
if not in_compound_task:
|
547
|
+
original_umask = os.umask(0)
|
548
|
+
workdir_local.mkdir(parents=True, mode=0o755)
|
549
|
+
os.umask(original_umask)
|
550
|
+
_mkdir_as_user(
|
551
|
+
folder=workdir_remote.as_posix(),
|
552
|
+
user=self.slurm_user,
|
553
|
+
)
|
536
554
|
|
537
555
|
# Execute tasks, in chunks of size `parallel_tasks_per_job`
|
538
556
|
# TODO Pick a data structure for results and exceptions, or review the
|
539
557
|
# interface
|
540
|
-
results =
|
541
|
-
exceptions =
|
542
|
-
jobs: dict[str, SlurmJob] = {}
|
558
|
+
results: dict[int, Any] = {}
|
559
|
+
exceptions: dict[int, BaseException] = {}
|
543
560
|
|
544
561
|
original_task_files = task_files
|
545
|
-
|
546
|
-
|
547
|
-
|
562
|
+
tot_tasks = len(list_parameters)
|
563
|
+
|
564
|
+
# Set/validate parameters for task batching
|
565
|
+
tasks_per_job, parallel_tasks_per_job = heuristics(
|
566
|
+
# Number of parallel components (always known)
|
567
|
+
tot_tasks=tot_tasks,
|
568
|
+
# Optional WorkflowTask attributes:
|
569
|
+
tasks_per_job=slurm_config.tasks_per_job,
|
570
|
+
parallel_tasks_per_job=slurm_config.parallel_tasks_per_job, # noqa
|
571
|
+
# Task requirements (multiple possible sources):
|
572
|
+
cpus_per_task=slurm_config.cpus_per_task,
|
573
|
+
mem_per_task=slurm_config.mem_per_task_MB,
|
574
|
+
# Fractal configuration variables (soft/hard limits):
|
575
|
+
target_cpus_per_job=slurm_config.target_cpus_per_job,
|
576
|
+
target_mem_per_job=slurm_config.target_mem_per_job,
|
577
|
+
target_num_jobs=slurm_config.target_num_jobs,
|
578
|
+
max_cpus_per_job=slurm_config.max_cpus_per_job,
|
579
|
+
max_mem_per_job=slurm_config.max_mem_per_job,
|
580
|
+
max_num_jobs=slurm_config.max_num_jobs,
|
581
|
+
)
|
582
|
+
slurm_config.parallel_tasks_per_job = parallel_tasks_per_job
|
583
|
+
slurm_config.tasks_per_job = tasks_per_job
|
584
|
+
|
585
|
+
# Divide arguments in batches of `tasks_per_job` tasks each
|
586
|
+
args_batches = []
|
587
|
+
batch_size = tasks_per_job
|
588
|
+
for ind_chunk in range(0, tot_tasks, batch_size):
|
589
|
+
args_batches.append(
|
590
|
+
list_parameters[ind_chunk : ind_chunk + batch_size] # noqa
|
591
|
+
)
|
592
|
+
if len(args_batches) != math.ceil(tot_tasks / tasks_per_job):
|
593
|
+
raise RuntimeError("Something wrong here while batching tasks")
|
548
594
|
|
549
|
-
|
550
|
-
|
551
|
-
|
552
|
-
|
553
|
-
|
554
|
-
|
595
|
+
logger.info(f"START submission phase, {list(self.jobs.keys())=}")
|
596
|
+
for ind_batch, chunk in enumerate(args_batches):
|
597
|
+
# TODO: replace with actual values
|
598
|
+
tasks = []
|
599
|
+
for ind_chunk, parameters in enumerate(chunk):
|
600
|
+
component = parameters[_COMPONENT_KEY_]
|
601
|
+
tasks.append(
|
555
602
|
SlurmTask(
|
603
|
+
index=(ind_batch * batch_size) + ind_chunk,
|
556
604
|
component=component,
|
557
605
|
workdir_local=workdir_local,
|
558
606
|
workdir_remote=workdir_remote,
|
607
|
+
parameters=parameters,
|
559
608
|
zarr_url=parameters["zarr_url"],
|
560
609
|
task_files=TaskFiles(
|
561
|
-
**original_task_files
|
610
|
+
**original_task_files.model_dump(
|
611
|
+
exclude={"component"}
|
612
|
+
),
|
562
613
|
component=component,
|
563
614
|
),
|
564
|
-
)
|
565
|
-
|
615
|
+
),
|
616
|
+
)
|
617
|
+
|
618
|
+
slurm_job = SlurmJob(
|
619
|
+
label=f"{ind_batch:06d}",
|
620
|
+
workdir_local=workdir_local,
|
621
|
+
workdir_remote=workdir_remote,
|
622
|
+
tasks=tasks,
|
566
623
|
)
|
567
|
-
|
624
|
+
self._submit_single_sbatch(
|
568
625
|
func,
|
569
|
-
parameters=parameters,
|
570
626
|
slurm_job=slurm_job,
|
627
|
+
slurm_config=slurm_config,
|
571
628
|
)
|
572
|
-
|
573
|
-
jobs[slurm_job_id] = slurm_job
|
629
|
+
logger.info(f"END submission phase, {list(self.jobs.keys())=}")
|
574
630
|
|
575
631
|
# Retrieval phase
|
576
|
-
while len(jobs) > 0:
|
632
|
+
while len(self.jobs) > 0:
|
577
633
|
if self.is_shutdown():
|
578
|
-
self.
|
579
|
-
|
580
|
-
|
581
|
-
|
582
|
-
slurm_job = jobs.pop(slurm_job_id)
|
634
|
+
self.scancel_jobs()
|
635
|
+
finished_job_ids = get_finished_jobs(job_ids=self.job_ids)
|
636
|
+
for slurm_job_id in finished_job_ids:
|
637
|
+
slurm_job = self.jobs.pop(slurm_job_id)
|
583
638
|
self._copy_files_from_remote_to_local(slurm_job)
|
584
639
|
for task in slurm_job.tasks:
|
585
640
|
result, exception = self._postprocess_single_task(
|
586
641
|
task=task
|
587
642
|
)
|
643
|
+
if not in_compound_task:
|
644
|
+
pass
|
645
|
+
# update_single_image_logfile(
|
646
|
+
# history_item_id=history_item_id,
|
647
|
+
# zarr_url=task.zarr_url,
|
648
|
+
# logfile=task.task_files.log_file_local,
|
649
|
+
# )
|
588
650
|
if not in_compound_task:
|
589
651
|
if exception is None:
|
590
|
-
|
591
|
-
|
592
|
-
|
593
|
-
|
594
|
-
|
595
|
-
)
|
652
|
+
pass
|
653
|
+
# update_single_image(
|
654
|
+
# zarr_url=task.zarr_url,
|
655
|
+
# history_item_id=history_item_id,
|
656
|
+
# status=ImageStatus.DONE,
|
657
|
+
# )
|
596
658
|
else:
|
597
|
-
|
598
|
-
|
599
|
-
|
600
|
-
|
601
|
-
|
602
|
-
)
|
603
|
-
|
604
|
-
|
605
|
-
|
659
|
+
pass
|
660
|
+
# update_single_image(
|
661
|
+
# zarr_url=task.zarr_url,
|
662
|
+
# history_item_id=history_item_id,
|
663
|
+
# status=ImageStatus.FAILED,
|
664
|
+
# )
|
665
|
+
if exception is None:
|
666
|
+
results[task.index] = result
|
667
|
+
else:
|
668
|
+
exceptions[task.index] = exception
|
606
669
|
time.sleep(self.slurm_poll_interval)
|
607
670
|
return results, exceptions
|
608
671
|
|
@@ -2,7 +2,7 @@
|
|
2
2
|
Runner backend subsystem root V2
|
3
3
|
|
4
4
|
This module is the single entry point to the runner backend subsystem V2.
|
5
|
-
Other
|
5
|
+
Other subsystems should only import this module and not its submodules or
|
6
6
|
the individual backends.
|
7
7
|
"""
|
8
8
|
import os
|
@@ -29,7 +29,6 @@ from ..exceptions import JobExecutionError
|
|
29
29
|
from ..exceptions import TaskExecutionError
|
30
30
|
from ..executors.slurm_sudo._subprocess_run_as_user import _mkdir_as_user
|
31
31
|
from ..filenames import WORKFLOW_LOG_FILENAME
|
32
|
-
from ..task_files import task_subfolder_name
|
33
32
|
from ._local import process_workflow as local_process_workflow
|
34
33
|
from ._slurm_ssh import process_workflow as slurm_ssh_process_workflow
|
35
34
|
from ._slurm_sudo import process_workflow as slurm_sudo_process_workflow
|
@@ -118,7 +117,7 @@ def submit_workflow(
|
|
118
117
|
)
|
119
118
|
except Exception as e:
|
120
119
|
logger.error(
|
121
|
-
f"Error
|
120
|
+
f"Error connecting to the database. Original error: {str(e)}"
|
122
121
|
)
|
123
122
|
reset_logger_handlers(logger)
|
124
123
|
return
|
@@ -200,25 +199,6 @@ def submit_workflow(
|
|
200
199
|
f"{settings.FRACTAL_RUNNER_BACKEND}."
|
201
200
|
)
|
202
201
|
|
203
|
-
# Create all tasks subfolders # FIXME: do this with Runner
|
204
|
-
for order in range(job.first_task_index, job.last_task_index + 1):
|
205
|
-
this_wftask = workflow.task_list[order]
|
206
|
-
task_name = this_wftask.task.name
|
207
|
-
subfolder_name = task_subfolder_name(
|
208
|
-
order=order,
|
209
|
-
task_name=task_name,
|
210
|
-
)
|
211
|
-
if FRACTAL_RUNNER_BACKEND == "slurm":
|
212
|
-
# Create local subfolder (with 755) and remote one
|
213
|
-
# (via `sudo -u`)
|
214
|
-
original_umask = os.umask(0)
|
215
|
-
(WORKFLOW_DIR_LOCAL / subfolder_name).mkdir(mode=0o755)
|
216
|
-
os.umask(original_umask)
|
217
|
-
_mkdir_as_user(
|
218
|
-
folder=str(WORKFLOW_DIR_REMOTE / subfolder_name),
|
219
|
-
user=slurm_user,
|
220
|
-
)
|
221
|
-
|
222
202
|
except Exception as e:
|
223
203
|
error_type = type(e).__name__
|
224
204
|
fail_job(
|