fractal-server 2.14.0a9__py3-none-any.whl → 2.14.0a11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fractal_server/__init__.py +1 -1
- fractal_server/app/models/v2/dataset.py +0 -10
- fractal_server/app/models/v2/job.py +3 -0
- fractal_server/app/routes/api/v2/__init__.py +2 -0
- fractal_server/app/routes/api/v2/history.py +14 -9
- fractal_server/app/routes/api/v2/images.py +5 -2
- fractal_server/app/routes/api/v2/submit.py +16 -14
- fractal_server/app/routes/api/v2/verify_image_types.py +64 -0
- fractal_server/app/routes/api/v2/workflow.py +11 -7
- fractal_server/app/runner/components.py +0 -3
- fractal_server/app/runner/exceptions.py +4 -0
- fractal_server/app/runner/executors/base_runner.py +16 -17
- fractal_server/app/runner/executors/local/{_local_config.py → get_local_config.py} +0 -7
- fractal_server/app/runner/executors/local/runner.py +117 -58
- fractal_server/app/runner/executors/{slurm_sudo → slurm_common}/_check_jobs_status.py +4 -0
- fractal_server/app/runner/executors/slurm_ssh/_check_job_status_ssh.py +67 -0
- fractal_server/app/runner/executors/slurm_ssh/executor.py +7 -5
- fractal_server/app/runner/executors/slurm_ssh/runner.py +707 -0
- fractal_server/app/runner/executors/slurm_sudo/runner.py +265 -114
- fractal_server/app/runner/task_files.py +8 -0
- fractal_server/app/runner/v2/__init__.py +0 -365
- fractal_server/app/runner/v2/_local.py +4 -2
- fractal_server/app/runner/v2/_slurm_ssh.py +4 -2
- fractal_server/app/runner/v2/_slurm_sudo.py +4 -2
- fractal_server/app/runner/v2/db_tools.py +87 -0
- fractal_server/app/runner/v2/runner.py +83 -89
- fractal_server/app/runner/v2/runner_functions.py +279 -436
- fractal_server/app/runner/v2/runner_functions_low_level.py +37 -39
- fractal_server/app/runner/v2/submit_workflow.py +366 -0
- fractal_server/app/runner/v2/task_interface.py +31 -0
- fractal_server/app/schemas/v2/dataset.py +4 -71
- fractal_server/app/schemas/v2/dumps.py +6 -5
- fractal_server/app/schemas/v2/job.py +6 -3
- fractal_server/migrations/versions/47351f8c7ebc_drop_dataset_filters.py +50 -0
- fractal_server/migrations/versions/e81103413827_add_job_type_filters.py +36 -0
- {fractal_server-2.14.0a9.dist-info → fractal_server-2.14.0a11.dist-info}/METADATA +1 -1
- {fractal_server-2.14.0a9.dist-info → fractal_server-2.14.0a11.dist-info}/RECORD +40 -36
- fractal_server/app/runner/executors/local/_submit_setup.py +0 -46
- fractal_server/app/runner/executors/slurm_common/_submit_setup.py +0 -84
- fractal_server/app/runner/v2/_db_tools.py +0 -48
- {fractal_server-2.14.0a9.dist-info → fractal_server-2.14.0a11.dist-info}/LICENSE +0 -0
- {fractal_server-2.14.0a9.dist-info → fractal_server-2.14.0a11.dist-info}/WHEEL +0 -0
- {fractal_server-2.14.0a9.dist-info → fractal_server-2.14.0a11.dist-info}/entry_points.txt +0 -0
fractal_server/app/runner/executors/slurm_sudo/runner.py

```diff
@@ -6,19 +6,22 @@ import shlex
 import subprocess  # nosec
 import sys
 import time
+from copy import copy
 from pathlib import Path
 from typing import Any
+from typing import Literal
 from typing import Optional

 import cloudpickle
 from pydantic import BaseModel
 from pydantic import ConfigDict

-from ._check_jobs_status import get_finished_jobs
+from ..slurm_common._check_jobs_status import get_finished_jobs
+from ..slurm_common._check_jobs_status import run_squeue
 from ._subprocess_run_as_user import _mkdir_as_user
 from ._subprocess_run_as_user import _run_command_as_user
 from fractal_server import __VERSION__
-from fractal_server.app.
+from fractal_server.app.db import get_sync_db
 from fractal_server.app.runner.exceptions import JobExecutionError
 from fractal_server.app.runner.exceptions import TaskExecutionError
 from fractal_server.app.runner.executors.base_runner import BaseRunner
```
```diff
@@ -30,7 +33,8 @@ from fractal_server.app.runner.executors.slurm_common._slurm_config import (
 )
 from fractal_server.app.runner.filenames import SHUTDOWN_FILENAME
 from fractal_server.app.runner.task_files import TaskFiles
-from fractal_server.app.
+from fractal_server.app.runner.v2.db_tools import update_status_of_history_unit
+from fractal_server.app.schemas.v2 import HistoryUnitStatus
 from fractal_server.config import get_settings
 from fractal_server.logger import set_logger
 from fractal_server.syringe import Inject
@@ -97,40 +101,68 @@ class SlurmJob(BaseModel):
     tasks: list[SlurmTask]

     @property
-    def
+    def slurm_submission_script_local(self) -> str:
+        return (
+            self.workdir_local / f"slurm-{self.label}-submit.sh"
+        ).as_posix()
+
+    @property
+    def slurm_submission_script_remote(self) -> str:
+        return (
+            self.workdir_remote / f"slurm-{self.label}-submit.sh"
+        ).as_posix()
+
+    @property
+    def slurm_stdout_remote(self) -> str:
         if self.slurm_job_id:
             return (
-                self.
-                / f"slurm-{self.label}-{self.slurm_job_id}.
+                self.workdir_remote
+                / f"slurm-{self.label}-{self.slurm_job_id}.out"
             ).as_posix()
+
         else:
             return (
-                self.
+                self.workdir_remote / f"slurm-{self.label}-%j.out"
             ).as_posix()

     @property
-    def
+    def slurm_stderr_remote(self) -> str:
         if self.slurm_job_id:
             return (
                 self.workdir_remote
-                / f"slurm-{self.label}-{self.slurm_job_id}.
+                / f"slurm-{self.label}-{self.slurm_job_id}.err"
             ).as_posix()
+
         else:
             return (
-                self.workdir_remote / f"slurm-{self.label}-%j.
+                self.workdir_remote / f"slurm-{self.label}-%j.err"
             ).as_posix()

     @property
-    def
-
-
-
+    def slurm_stdout_local(self) -> str:
+        if self.slurm_job_id:
+            return (
+                self.workdir_local
+                / f"slurm-{self.label}-{self.slurm_job_id}.out"
+            ).as_posix()
+
+        else:
+            return (
+                self.workdir_local / f"slurm-{self.label}-%j.out"
+            ).as_posix()

     @property
-    def
-
-
-
+    def slurm_stderr_local(self) -> str:
+        if self.slurm_job_id:
+            return (
+                self.workdir_local
+                / f"slurm-{self.label}-{self.slurm_job_id}.err"
+            ).as_posix()
+
+        else:
+            return (
+                self.workdir_local / f"slurm-{self.label}-%j.err"
+            ).as_posix()

     @property
     def log_files_local(self) -> list[str]:
```
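The four new `slurm_stdout_remote`/`slurm_stderr_remote`/`slurm_stdout_local`/`slurm_stderr_local` properties all encode one naming scheme: `slurm-<label>-%j.{out,err}` before a SLURM job id is assigned, and the concrete id afterwards. A minimal standalone sketch of that scheme (paths and labels here are invented, not taken from the package):

```python
from pathlib import Path
from typing import Optional


def stdout_path(workdir: Path, label: str, slurm_job_id: Optional[str]) -> str:
    # Same naming scheme as SlurmJob.slurm_stdout_remote in the hunk above:
    # use SLURM's "%j" placeholder until the job id is known.
    job_part = slurm_job_id if slurm_job_id else "%j"
    return (workdir / f"slurm-{label}-{job_part}.out").as_posix()


print(stdout_path(Path("/home/user/job7"), "0-init", None))
# /home/user/job7/slurm-0-init-%j.out
print(stdout_path(Path("/home/user/job7"), "0-init", "12345"))
# /home/user/job7/slurm-0-init-12345.out
```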
```diff
@@ -276,9 +308,7 @@ class RunnerSlurmSudo(BaseRunner):
         slurm_job: SlurmJob,
         slurm_config: SlurmConfig,
     ) -> str:
-
-        # raise NotImplementedError()
-
+        logger.debug("[_submit_single_sbatch] START")
         # Prepare input pickle(s)
         versions = dict(
             python=sys.version_info[:3],
@@ -287,44 +317,87 @@ class RunnerSlurmSudo(BaseRunner):
         )
         for task in slurm_job.tasks:
             _args = []
-            _kwargs = dict(
+            _kwargs = dict(
+                parameters=task.parameters,
+                remote_files=task.task_files.remote_files_dict,
+            )
             funcser = cloudpickle.dumps((versions, func, _args, _kwargs))
             with open(task.input_pickle_file_local, "wb") as f:
                 f.write(funcser)
-
+            logger.debug(
+                "[_submit_single_sbatch] Written "
+                f"{task.input_pickle_file_local=}"
+            )
         # Prepare commands to be included in SLURM submission script
-
-
-
-
-            "#SBATCH --ntasks=1",
-            "#SBATCH --cpus-per-task=1",
-            "#SBATCH --mem=10M",
-            f"#SBATCH --err={slurm_job.slurm_log_file_remote}",
-            f"#SBATCH --out={slurm_job.slurm_log_file_remote}",
-            f"#SBATCH -D {slurm_job.workdir_remote}",
-            "#SBATCH --job-name=test",
-            "\n",
-        ]
-
+        settings = Inject(get_settings)
+        python_worker_interpreter = (
+            settings.FRACTAL_SLURM_WORKER_PYTHON or sys.executable
+        )
         cmdlines = []
         for task in slurm_job.tasks:
-
-
-                " -m fractal_server.app.runner.executors.slurm_common.remote "
-                f"--input-file {task.input_pickle_file_local} "
-                f"--output-file {task.output_pickle_file_remote}"
-            )
-            cmdlines.append("whoami")
+            input_pickle_file = task.input_pickle_file_local
+            output_pickle_file = task.output_pickle_file_remote
             cmdlines.append(
-
+                (
+                    f"{python_worker_interpreter}"
+                    " -m fractal_server.app.runner."
+                    "executors.slurm_common.remote "
+                    f"--input-file {input_pickle_file} "
+                    f"--output-file {output_pickle_file}"
+                )
             )
-
+
+        # ...
+        num_tasks_max_running = slurm_config.parallel_tasks_per_job
+        mem_per_task_MB = slurm_config.mem_per_task_MB
+
+        # Set ntasks
+        ntasks = min(len(cmdlines), num_tasks_max_running)
+        slurm_config.parallel_tasks_per_job = ntasks
+
+        # Prepare SLURM preamble based on SlurmConfig object
+        script_lines = slurm_config.to_sbatch_preamble(
+            remote_export_dir=self.user_cache_dir
+        )
+
+        # Extend SLURM preamble with variable which are not in SlurmConfig, and
+        # fix their order
+        script_lines.extend(
+            [
+                f"#SBATCH --out={slurm_job.slurm_stdout_remote}",
+                f"#SBATCH --err={slurm_job.slurm_stderr_remote}",
+                f"#SBATCH -D {slurm_job.workdir_remote}",
+            ]
+        )
+        script_lines = slurm_config.sort_script_lines(script_lines)
+        logger.debug(script_lines)
+
+        # Always print output of `uname -n` and `pwd`
+        script_lines.append(
+            '"Hostname: `uname -n`; current directory: `pwd`"\n'
+        )
+
+        # Complete script preamble
+        script_lines.append("\n")
+
+        # Include command lines
+        tmp_list_commands = copy(cmdlines)
+        while tmp_list_commands:
+            if tmp_list_commands:
+                cmd = tmp_list_commands.pop(0)  # take first element
+                script_lines.append(
+                    "srun --ntasks=1 --cpus-per-task=$SLURM_CPUS_PER_TASK "
+                    f"--mem={mem_per_task_MB}MB "
+                    f"{cmd} &"
+                )
+        script_lines.append("wait\n")
+
+        script = "\n".join(script_lines)

         # Write submission script
-        submission_script_contents = "\n".join(preamble_lines + cmdlines)
+        # submission_script_contents = "\n".join(preamble_lines + cmdlines)
         with open(slurm_job.slurm_submission_script_local, "w") as f:
-            f.write(
+            f.write(script)

         # Run sbatch
         pre_command = f"sudo --set-home --non-interactive -u {self.slurm_user}"
```
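The rewritten `_submit_single_sbatch` now builds the submission script from a `SlurmConfig`-derived preamble plus one backgrounded `srun` line per task and a trailing `wait`. A simplified sketch of that assembly step, with hypothetical preamble and command lines (in the package the preamble comes from `SlurmConfig.to_sbatch_preamble` and `sort_script_lines`, which are not shown here):

```python
from copy import copy


def build_submission_script(
    preamble_lines: list[str],
    cmdlines: list[str],
    mem_per_task_MB: int,
) -> str:
    # Preamble (assumed to already contain the #SBATCH directives), then one
    # backgrounded srun per task so tasks of a batch run concurrently, then a
    # final `wait` so the SLURM job only ends when all tasks have finished.
    script_lines = copy(preamble_lines)
    for cmd in cmdlines:
        script_lines.append(
            "srun --ntasks=1 --cpus-per-task=$SLURM_CPUS_PER_TASK "
            f"--mem={mem_per_task_MB}MB {cmd} &"
        )
    script_lines.append("wait\n")
    return "\n".join(script_lines)


# Illustrative call with made-up values
print(
    build_submission_script(
        preamble_lines=["#!/bin/bash", "#SBATCH --ntasks=2", "\n"],
        cmdlines=["python -m task --input a.json", "python -m task --input b.json"],
        mem_per_task_MB=4000,
    )
)
```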
```diff
@@ -350,8 +423,10 @@ class RunnerSlurmSudo(BaseRunner):
         """
         Note: this would differ for SSH
         """
+        logger.debug(f"[_copy_files_from_remote_to_local] {job.slurm_job_id=}")
         source_target_list = [
-            (job.
+            (job.slurm_stdout_remote, job.slurm_stdout_local),
+            (job.slurm_stderr_remote, job.slurm_stderr_local),
         ]
         for task in job.tasks:
             source_target_list.extend(
@@ -419,21 +494,22 @@ class RunnerSlurmSudo(BaseRunner):
         self,
         func: callable,
         parameters: dict[str, Any],
-
+        history_unit_id: int,
         task_files: TaskFiles,
-
-
+        task_type: Literal[
+            "non_parallel",
+            "converter_non_parallel",
+            "compound",
+            "converter_compound",
+        ],
+        config: SlurmConfig,
     ) -> tuple[Any, Exception]:
-        workdir_local = task_files.wftask_subfolder_local
-        workdir_remote = task_files.wftask_subfolder_remote

-
-
-                exclude={"component"},
-            ),
-            component=parameters[_COMPONENT_KEY_],
-        )
+        if len(self.jobs) > 0:
+            raise RuntimeError(f"Cannot run .submit when {len(self.jobs)=}")

+        workdir_local = task_files.wftask_subfolder_local
+        workdir_remote = task_files.wftask_subfolder_remote
         if self.jobs != {}:
             raise JobExecutionError("Unexpected branch: jobs should be empty.")

@@ -441,7 +517,7 @@ class RunnerSlurmSudo(BaseRunner):
             raise JobExecutionError("Cannot continue after shutdown.")

         # Validation phase
-        self.validate_submit_parameters(parameters)
+        self.validate_submit_parameters(parameters, task_type=task_type)

         # Create task subfolder
         original_umask = os.umask(0)
@@ -460,7 +536,7 @@ class RunnerSlurmSudo(BaseRunner):
             tasks=[
                 SlurmTask(
                     index=0,
-                    component=
+                    component=task_files.component,
                     parameters=parameters,
                     workdir_remote=workdir_remote,
                     workdir_local=workdir_local,
@@ -468,25 +544,57 @@ class RunnerSlurmSudo(BaseRunner):
                 )
             ],
         )  # TODO: replace with actual values (BASED ON TASKFILES)
+
+        config.parallel_tasks_per_job = 1
         self._submit_single_sbatch(
             func,
             slurm_job=slurm_job,
-            slurm_config=
+            slurm_config=config,
         )
-
-
+        logger.debug("END SUBMISSION PHASE")
+        logger.debug(f"{self.jobs=}")
+        logger.debug(f"{self.job_ids=}")
+
+        # FIXME
+        jobs_that_started = set()
+        while len(jobs_that_started) != len(self.job_ids):
+            logger.debug("CALL SQUEUE")
+            res = run_squeue(self.job_ids)
+            new_jobs = set(out.split()[0] for out in res.stdout.splitlines())
+            jobs_that_started = jobs_that_started.union(new_jobs)
+            logger.debug(f"{new_jobs=}")
+            logger.debug(f"{len(jobs_that_started)=}")
+
+        logger.debug("START RETRIEVAL PHASE")

         # Retrieval phase
         while len(self.jobs) > 0:
             if self.is_shutdown():
                 self.scancel_jobs()
             finished_job_ids = get_finished_jobs(job_ids=self.job_ids)
-
-
-
-
-
-
+            logger.debug(f"{finished_job_ids=}")
+            with next(get_sync_db()) as db:
+                for slurm_job_id in finished_job_ids:
+                    logger.debug(f"Now process {slurm_job_id=}")
+                    slurm_job = self.jobs.pop(slurm_job_id)
+                    self._copy_files_from_remote_to_local(slurm_job)
+                    result, exception = self._postprocess_single_task(
+                        task=slurm_job.tasks[0]
+                    )
+                    if result is not None:
+                        if task_type not in ["compound", "converter_compound"]:
+                            update_status_of_history_unit(
+                                history_unit_id=history_unit_id,
+                                status=HistoryUnitStatus.DONE,
+                                db_sync=db,
+                            )
+                    if exception is not None:
+                        update_status_of_history_unit(
+                            history_unit_id=history_unit_id,
+                            status=HistoryUnitStatus.FAILED,
+                            db_sync=db,
+                        )
+
             time.sleep(self.slurm_poll_interval)

         return result, exception
@@ -495,19 +603,38 @@ class RunnerSlurmSudo(BaseRunner):
         self,
         func: callable,
         list_parameters: list[dict],
-
-
-
-
+        history_unit_ids: list[int],
+        list_task_files: list[TaskFiles],
+        task_type: Literal["parallel", "compound", "converter_compound"],
+        config: SlurmConfig,
     ):
-
+
+        if len(self.jobs) > 0:
+            raise RuntimeError(f"Cannot run .submit when {len(self.jobs)=}")
+
+        if task_type in ["compound", "converter_compound"]:
+            if len(history_unit_ids) != 1:
+                raise NotImplementedError(
+                    "We are breaking the assumption that compound/multisubmit "
+                    "is associated to a single HistoryUnit. This is not "
+                    "supported."
+                )
+        elif task_type == "parallel" and len(history_unit_ids) != len(
+            list_parameters
+        ):
+            raise ValueError(
+                f"{len(history_unit_ids)=} differs from "
+                f"{len(list_parameters)=}."
+            )

         self.validate_multisubmit_parameters(
-            list_parameters=list_parameters,
+            list_parameters=list_parameters,
+            task_type=task_type,
+            list_task_files=list_task_files,
         )

-        workdir_local =
-        workdir_remote =
+        workdir_local = list_task_files[0].wftask_subfolder_local
+        workdir_remote = list_task_files[0].wftask_subfolder_remote

         # Create local&remote task subfolders
         if task_type not in ["converter_compound", "compound"]:
@@ -525,7 +652,7 @@ class RunnerSlurmSudo(BaseRunner):
         results: dict[int, Any] = {}
         exceptions: dict[int, BaseException] = {}

-        original_task_files =
+        original_task_files = list_task_files
         tot_tasks = len(list_parameters)

         # Set/validate parameters for task batching
@@ -533,21 +660,21 @@ class RunnerSlurmSudo(BaseRunner):
             # Number of parallel components (always known)
             tot_tasks=tot_tasks,
             # Optional WorkflowTask attributes:
-            tasks_per_job=
-            parallel_tasks_per_job=
+            tasks_per_job=config.tasks_per_job,
+            parallel_tasks_per_job=config.parallel_tasks_per_job,  # noqa
             # Task requirements (multiple possible sources):
-            cpus_per_task=
-            mem_per_task=
+            cpus_per_task=config.cpus_per_task,
+            mem_per_task=config.mem_per_task_MB,
             # Fractal configuration variables (soft/hard limits):
-            target_cpus_per_job=
-            target_mem_per_job=
-            target_num_jobs=
-            max_cpus_per_job=
-            max_mem_per_job=
-            max_num_jobs=
+            target_cpus_per_job=config.target_cpus_per_job,
+            target_mem_per_job=config.target_mem_per_job,
+            target_num_jobs=config.target_num_jobs,
+            max_cpus_per_job=config.max_cpus_per_job,
+            max_mem_per_job=config.max_mem_per_job,
+            max_num_jobs=config.max_num_jobs,
         )
-
-
+        config.parallel_tasks_per_job = parallel_tasks_per_job
+        config.tasks_per_job = tasks_per_job

         # Divide arguments in batches of `tasks_per_job` tasks each
         args_batches = []
@@ -561,24 +688,18 @@ class RunnerSlurmSudo(BaseRunner):

         logger.info(f"START submission phase, {list(self.jobs.keys())=}")
         for ind_batch, chunk in enumerate(args_batches):
-            # TODO: replace with actual values
             tasks = []
             for ind_chunk, parameters in enumerate(chunk):
-
+                index = (ind_batch * batch_size) + ind_chunk
                 tasks.append(
                     SlurmTask(
-                        index=
-                        component=component,
+                        index=index,
+                        component=original_task_files[index].component,
                         workdir_local=workdir_local,
                         workdir_remote=workdir_remote,
                         parameters=parameters,
                         zarr_url=parameters["zarr_url"],
-                        task_files=
-                            **original_task_files.model_dump(
-                                exclude={"component"}
-                            ),
-                            component=component,
-                        ),
+                        task_files=original_task_files[index],
                     ),
                 )

```
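In `multisubmit`, the batching heuristics above yield `tasks_per_job` and `parallel_tasks_per_job`, and the loop that follows computes each task's global `index` as `(ind_batch * batch_size) + ind_chunk`. A worked example with assumed numbers (5 parallel tasks, batches of 2), only to illustrate the arithmetic:

```python
# Assumed values; not taken from the package.
list_parameters = [{"zarr_url": f"/tmp/plate.zarr/A/01/{i}"} for i in range(5)]
batch_size = 2  # i.e. tasks_per_job

# Divide arguments in batches of `tasks_per_job` tasks each
args_batches = [
    list_parameters[i : i + batch_size]
    for i in range(0, len(list_parameters), batch_size)
]
assert [len(chunk) for chunk in args_batches] == [2, 2, 1]

# Global task index, as in `index = (ind_batch * batch_size) + ind_chunk`
indices = [
    (ind_batch * batch_size) + ind_chunk
    for ind_batch, chunk in enumerate(args_batches)
    for ind_chunk, _ in enumerate(chunk)
]
assert indices == [0, 1, 2, 3, 4]
```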
```diff
@@ -591,26 +712,56 @@ class RunnerSlurmSudo(BaseRunner):
             self._submit_single_sbatch(
                 func,
                 slurm_job=slurm_job,
-                slurm_config=
+                slurm_config=config,
             )
         logger.info(f"END submission phase, {list(self.jobs.keys())=}")

+        # FIXME
+        jobs_that_started = set()
+        while len(jobs_that_started) != len(self.job_ids):
+            res = run_squeue(self.job_ids)
+            new_jobs = set(out.split()[0] for out in res.stdout.splitlines())
+            jobs_that_started = jobs_that_started.union(new_jobs)
+            logger.debug(f"{new_jobs=}")
+            logger.debug(f"{len(jobs_that_started)=}")
+
         # Retrieval phase
         while len(self.jobs) > 0:
             if self.is_shutdown():
                 self.scancel_jobs()
             finished_job_ids = get_finished_jobs(job_ids=self.job_ids)
-
-
-
-
-
-
-
-
-
-
-
+            logger.debug(f"{finished_job_ids=}")
+            with next(get_sync_db()) as db:
+                for slurm_job_id in finished_job_ids:
+                    logger.debug(f"Now processing {slurm_job_id=}")
+                    slurm_job = self.jobs.pop(slurm_job_id)
+                    self._copy_files_from_remote_to_local(slurm_job)
+                    for task in slurm_job.tasks:
+                        result, exception = self._postprocess_single_task(
+                            task=task
+                        )
+
+                        if result is not None:
+                            results[task.index] = result
+                            if task_type == "parallel":
+                                update_status_of_history_unit(
+                                    history_unit_id=history_unit_ids[
+                                        task.index
+                                    ],
+                                    status=HistoryUnitStatus.DONE,
+                                    db_sync=db,
+                                )
+                        if exception is not None:
+                            exceptions[task.index] = exception
+                            if task_type == "parallel":
+                                update_status_of_history_unit(
+                                    history_unit_id=history_unit_ids[
+                                        task.index
+                                    ],
+                                    status=HistoryUnitStatus.FAILED,
+                                    db_sync=db,
+                                )
+
             time.sleep(self.slurm_poll_interval)
         return results, exceptions

```
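Both retrieval loops push per-unit status updates through `update_status_of_history_unit` from the new `fractal_server/app/runner/v2/db_tools.py` module, whose body is not part of this diff. Based only on how it is called here, a plausible sketch of such a helper could look like the following; the `HistoryUnit` model name and its import path are assumptions, not shown in this diff:

```python
from sqlalchemy.orm import Session

from fractal_server.app.models.v2 import HistoryUnit  # assumed import path
from fractal_server.app.schemas.v2 import HistoryUnitStatus


def update_status_of_history_unit(
    *,
    history_unit_id: int,
    status: HistoryUnitStatus,
    db_sync: Session,
) -> None:
    # Hypothetical: load the HistoryUnit row, set its status, and commit,
    # using the synchronous session provided by the caller.
    unit = db_sync.get(HistoryUnit, history_unit_id)
    if unit is None:
        raise ValueError(f"HistoryUnit {history_unit_id} not found.")
    unit.status = status
    db_sync.add(unit)
    db_sync.commit()
```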
fractal_server/app/runner/task_files.py

```diff
@@ -96,3 +96,11 @@ class TaskFiles(BaseModel):
         return (
             self.wftask_subfolder_remote / f"{self.component}-metadiff.json"
         ).as_posix()
+
+    @property
+    def remote_files_dict(self) -> dict[str, str]:
+        return dict(
+            args_file_remote=self.args_file_remote,
+            metadiff_file_remote=self.metadiff_file_remote,
+            log_file_remote=self.log_file_remote,
+        )
```
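The new `TaskFiles.remote_files_dict` property is what `_submit_single_sbatch` embeds in the pickled kwargs for each task (see the `remote_files=task.task_files.remote_files_dict` line above). An illustrative sketch of that handoff, with invented paths and a stand-in `func`; only the `-metadiff.json` suffix is visible in this diff, the other two file names are assumptions:

```python
import cloudpickle

# Shape of the dict returned by TaskFiles.remote_files_dict (paths invented)
remote_files = dict(
    args_file_remote="/home/user/job7/0_task/unit0-args.json",
    metadiff_file_remote="/home/user/job7/0_task/unit0-metadiff.json",
    log_file_remote="/home/user/job7/0_task/unit0-log.txt",
)


def func(parameters: dict, remote_files: dict) -> None:
    ...  # stand-in for the wrapper that actually runs on the SLURM node


# Mirrors the pickling step in _submit_single_sbatch (versions dict simplified)
versions = dict(fractal_server="2.14.0a11")
_kwargs = dict(
    parameters={"zarr_url": "/tmp/plate.zarr/A/01/0"},
    remote_files=remote_files,
)
funcser = cloudpickle.dumps((versions, func, [], _kwargs))
```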