fractal-server 2.14.0a33__py3-none-any.whl → 2.14.0a35__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fractal_server/__init__.py +1 -1
- fractal_server/app/runner/executors/base_runner.py +4 -0
- fractal_server/app/runner/executors/local/runner.py +97 -35
- fractal_server/app/runner/executors/slurm_common/base_slurm_runner.py +327 -253
- fractal_server/app/runner/executors/slurm_common/remote.py +14 -11
- fractal_server/app/runner/executors/slurm_ssh/runner.py +66 -6
- fractal_server/app/runner/executors/slurm_sudo/_subprocess_run_as_user.py +0 -15
- fractal_server/app/runner/executors/slurm_sudo/runner.py +13 -1
- fractal_server/app/runner/v2/runner.py +3 -0
- fractal_server/app/runner/v2/runner_functions.py +7 -0
- fractal_server/ssh/_fabric.py +24 -12
- {fractal_server-2.14.0a33.dist-info → fractal_server-2.14.0a35.dist-info}/METADATA +1 -1
- {fractal_server-2.14.0a33.dist-info → fractal_server-2.14.0a35.dist-info}/RECORD +16 -16
- {fractal_server-2.14.0a33.dist-info → fractal_server-2.14.0a35.dist-info}/LICENSE +0 -0
- {fractal_server-2.14.0a33.dist-info → fractal_server-2.14.0a35.dist-info}/WHEEL +0 -0
- {fractal_server-2.14.0a33.dist-info → fractal_server-2.14.0a35.dist-info}/entry_points.txt +0 -0
fractal_server/app/runner/executors/slurm_common/base_slurm_runner.py

@@ -15,6 +15,7 @@ from ..slurm_common.slurm_job_task_models import SlurmTask
 from ._job_states import STATES_FINISHED
 from fractal_server import __VERSION__
 from fractal_server.app.db import get_sync_db
+from fractal_server.app.models.v2 import AccountingRecordSlurm
 from fractal_server.app.runner.exceptions import JobExecutionError
 from fractal_server.app.runner.exceptions import TaskExecutionError
 from fractal_server.app.runner.executors.base_runner import BaseRunner
@@ -34,7 +35,20 @@ SHUTDOWN_EXCEPTION = JobExecutionError(SHUTDOWN_ERROR_MESSAGE)
 
 logger = set_logger(__name__)
 
-
+
+def create_accounting_record_slurm(
+    *,
+    user_id: int,
+    slurm_job_ids: list[int],
+) -> None:
+    with next(get_sync_db()) as db:
+        db.add(
+            AccountingRecordSlurm(
+                user_id=user_id,
+                slurm_job_ids=slurm_job_ids,
+            )
+        )
+        db.commit()
 
 
 class BaseSlurmRunner(BaseRunner):
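For orientation, the sketch below mirrors the call pattern that `submit`/`multisubmit` adopt further down in this diff: one accounting record per submission call, containing the user ID and all submitted SLURM job IDs. It is a standalone illustration, not fractal-server code; the in-memory `FakeAccountingTable` and `record_slurm_accounting` are invented stand-ins for the real DB session and the `AccountingRecordSlurm` model.

```python
# Sketch only: mirrors how the new helper is used after `sbatch` succeeds.
# `FakeAccountingTable` replaces the real DB session + `AccountingRecordSlurm`
# model so that this example runs standalone.
from dataclasses import dataclass, field


@dataclass
class FakeAccountingTable:
    rows: list[dict] = field(default_factory=list)

    def add(self, *, user_id: int, slurm_job_ids: list[int]) -> None:
        self.rows.append({"user_id": user_id, "slurm_job_ids": slurm_job_ids})


def record_slurm_accounting(
    *, user_id: int, slurm_job_ids: list[int], table: FakeAccountingTable
) -> None:
    # In fractal-server this opens a sync DB session, adds an
    # `AccountingRecordSlurm` row, and commits; here we append to a list.
    table.add(user_id=user_id, slurm_job_ids=slurm_job_ids)


if __name__ == "__main__":
    table = FakeAccountingTable()
    # Called once per submit()/multisubmit() call, with all job IDs at once.
    record_slurm_accounting(user_id=1, slurm_job_ids=[101, 102], table=table)
    print(table.rows)
```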
@@ -100,65 +114,51 @@ class BaseSlurmRunner(BaseRunner):
     def __exit__(self, exc_type, exc_val, exc_tb):
         return False
 
-    def _run_local_cmd(self, cmd: str) -> str:
-        raise NotImplementedError("Implement in child class.")
-
     def _run_remote_cmd(self, cmd: str) -> str:
         raise NotImplementedError("Implement in child class.")
 
-    def run_squeue(self, job_ids: list[str]) ->
-
-
-        if len(job_ids) == 0:
-            return (False, "")
-
-        job_id_single_str = ",".join([str(j) for j in job_ids])
-        cmd = (
-            f"squeue --noheader --format='%i %T' --jobs {job_id_single_str}"
-            " --states=all"
-        )
-
-        try:
-            if self.slurm_runner_type == "sudo":
-                stdout = self._run_local_cmd(cmd)
-            else:
-                stdout = self._run_remote_cmd(cmd)
-            return (True, stdout)
-        except Exception as e:
-            logger.info(f"{cmd=} failed with {str(e)}")
-            return (False, "")
+    def run_squeue(self, *, job_ids: list[str], **kwargs) -> str:
+        raise NotImplementedError("Implement in child class.")
 
     def _get_finished_jobs(self, job_ids: list[str]) -> set[str]:
-        # If there is no Slurm job to check, return right away
 
+        # If there is no Slurm job to check, return right away
         if not job_ids:
            return set()
-        id_to_state = dict()
 
-
-
-
+        try:
+            stdout = self.run_squeue(job_ids=job_ids)
+            slurm_statuses = {
                 out.split()[0]: out.split()[1] for out in stdout.splitlines()
             }
-
-
-
-
-
-
-
-
-
-
+        except Exception as e:
+            logger.warning(
+                "[_get_finished_jobs] `squeue` failed, "
+                "retry with individual job IDs. "
+                f"Original error: {str(e)}."
+            )
+            slurm_statuses = dict()
+            for job_id in job_ids:
+                try:
+                    stdout = self.run_squeue(job_ids=[job_id])
+                    slurm_statuses.update(
+                        {stdout.split()[0]: stdout.split()[1]}
                     )
+                except Exception as e:
+                    logger.warning(
+                        "[_get_finished_jobs] `squeue` failed for "
+                        f"{job_id=}, mark job as completed. "
+                        f"Original error: {str(e)}."
+                    )
+                    slurm_statuses.update({str(job_id): "COMPLETED"})
 
-        #
-
-
-
-
-            if id_to_state.get(j, "COMPLETED") in STATES_FINISHED
+        # If a job is not in `squeue` output, mark it as completed.
+        finished_jobs = {
+            job_id
+            for job_id in job_ids
+            if slurm_statuses.get(job_id, "COMPLETED") in STATES_FINISHED
         }
+        return finished_jobs
 
     def _mkdir_local_folder(self, folder: str) -> None:
         raise NotImplementedError("Implement in child class.")
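The reworked `_get_finished_jobs` adds a two-level fallback around `squeue`: one bulk query, then per-job retries, and any job that cannot be queried (or is absent from the output) is treated as completed. Below is a minimal standalone sketch of that logic, assuming `%i %T`-style `squeue` output; the `STATES_FINISHED` set here is an illustrative subset, not the library's definition.

```python
# Standalone sketch of the retry/fallback logic in `_get_finished_jobs`.
# The real method lives on the runner class; here `run_squeue` is injected.
from collections.abc import Callable

STATES_FINISHED = {"COMPLETED", "FAILED", "CANCELLED", "TIMEOUT"}  # illustrative subset


def get_finished_jobs(
    job_ids: list[str],
    run_squeue: Callable[[list[str]], str],
) -> set[str]:
    if not job_ids:
        return set()
    try:
        stdout = run_squeue(job_ids)
        statuses = {line.split()[0]: line.split()[1] for line in stdout.splitlines()}
    except Exception:
        # Bulk call failed: retry one job at a time, and treat a job whose
        # individual query also fails as COMPLETED.
        statuses = {}
        for job_id in job_ids:
            try:
                out = run_squeue([job_id])
                statuses[out.split()[0]] = out.split()[1]
            except Exception:
                statuses[job_id] = "COMPLETED"
    # A job missing from the squeue output is also considered completed.
    return {j for j in job_ids if statuses.get(j, "COMPLETED") in STATES_FINISHED}


if __name__ == "__main__":
    fake_output = "1001 RUNNING\n1002 COMPLETED\n"
    print(get_finished_jobs(["1001", "1002", "1003"], lambda ids: fake_output))
    # 1001 is RUNNING, 1002 is COMPLETED, 1003 is absent (hence "completed")
```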
@@ -172,7 +172,7 @@ class BaseSlurmRunner(BaseRunner):
         slurm_job: SlurmJob,
         slurm_config: SlurmConfig,
     ) -> str:
-        logger.
+        logger.debug("[_submit_single_sbatch] START")
         # Prepare input pickle(s)
         versions = dict(
             python=sys.version_info[:3],
@@ -189,7 +189,7 @@ class BaseSlurmRunner(BaseRunner):
         funcser = cloudpickle.dumps((versions, func, _args, _kwargs))
         with open(task.input_pickle_file_local, "wb") as f:
             f.write(funcser)
-        logger.
+        logger.debug(
             "[_submit_single_sbatch] Written "
             f"{task.input_pickle_file_local=}"
         )
@@ -200,7 +200,7 @@ class BaseSlurmRunner(BaseRunner):
             local=task.input_pickle_file_local,
             remote=task.input_pickle_file_remote,
         )
-        logger.
+        logger.debug(
             "[_submit_single_sbatch] Transferred "
             f"{task.input_pickle_file_local=}"
         )
@@ -243,7 +243,7 @@ class BaseSlurmRunner(BaseRunner):
             ]
         )
         script_lines = slurm_config.sort_script_lines(script_lines)
-        logger.
+        logger.debug(script_lines)
 
         # Always print output of `uname -n` and `pwd`
         script_lines.append('\necho "Hostname: $(uname -n)"')
@@ -272,7 +272,7 @@ class BaseSlurmRunner(BaseRunner):
         # Write submission script
         with open(slurm_job.slurm_submission_script_local, "w") as f:
             f.write(script)
-        logger.
+        logger.debug(
             "[_submit_single_sbatch] Written "
             f"{slurm_job.slurm_submission_script_local=}"
         )
@@ -294,10 +294,10 @@ class BaseSlurmRunner(BaseRunner):
         # Run sbatch
         pre_submission_cmds = slurm_config.pre_submission_commands
         if len(pre_submission_cmds) == 0:
-            logger.
+            logger.debug(f"Now run {submit_command=}")
             sbatch_stdout = self._run_remote_cmd(submit_command)
         else:
-            logger.
+            logger.debug(f"Now using {pre_submission_cmds=}")
             script_lines = pre_submission_cmds + [submit_command]
             wrapper_script_contents = "\n".join(script_lines)
             wrapper_script_contents = f"{wrapper_script_contents}\n"
@@ -314,22 +314,22 @@ class BaseSlurmRunner(BaseRunner):
             )
             with open(wrapper_script, "w") as f:
                 f.write(wrapper_script_contents)
-            logger.
+            logger.debug(f"Now run {wrapper_script=}")
             sbatch_stdout = self._run_remote_cmd(f"bash {wrapper_script}")
 
         # Submit SLURM job and retrieve job ID
-        logger.info(f"[
+        logger.info(f"[_submit_single_sbatch] {sbatch_stdout=}")
         stdout = sbatch_stdout.strip("\n")
         submitted_job_id = int(stdout)
         slurm_job.slurm_job_id = str(submitted_job_id)
 
         # Add job to self.jobs
         self.jobs[slurm_job.slurm_job_id] = slurm_job
-        logger.
+        logger.debug(
             "[_submit_single_sbatch] Added "
             f"{slurm_job.slurm_job_id} to self.jobs."
         )
-        logger.
+        logger.debug("[_submit_single_sbatch] END")
 
     def _fetch_artifacts(
         self,
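These hunks keep the existing submission flow: run `sbatch` directly when there are no pre-submission commands, otherwise write the pre-commands plus the `sbatch` call into a wrapper script and execute it with `bash`. The sketch below illustrates that decision only; command execution is left to the caller and the wrapper file name is hypothetical.

```python
# Sketch of how the submission command is assembled (based on the hunks
# above), with no SLURM dependency: the returned string is what the runner
# would pass to its remote-command helper.
from pathlib import Path
import tempfile


def build_submission(
    submit_command: str,
    pre_submission_commands: list[str],
    workdir: Path,
) -> str:
    """Return the shell command that would actually be executed."""
    if not pre_submission_commands:
        return submit_command
    # Write pre-commands plus the sbatch call into a small wrapper script.
    wrapper = workdir / "wrapper_submit.sh"  # hypothetical file name
    wrapper.write_text("\n".join([*pre_submission_commands, submit_command]) + "\n")
    return f"bash {wrapper}"


if __name__ == "__main__":
    with tempfile.TemporaryDirectory() as tmp:
        cmd = build_submission(
            "sbatch /path/to/script.sh",
            ["module load slurm", "export FOO=bar"],
            Path(tmp),
        )
        print(cmd)  # e.g. "bash <tmpdir>/wrapper_submit.sh"
```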
@@ -421,27 +421,34 @@ class BaseSlurmRunner(BaseRunner):
         """
         # Sleep for `self.poll_interval`, but keep checking for shutdowns
         start_time = time.perf_counter()
-
-
+        # Always wait at least 0.2 (note: this is for cases where
+        # `poll_interval=0`).
+        waiting_time = max(self.poll_interval, 0.2)
+        max_time = start_time + waiting_time
         logger.debug(
             "[wait_and_check_shutdown] "
             f"I will wait at most {self.poll_interval} s, "
             f"in blocks of {self.poll_interval_internal} s."
         )
 
-        while
-            # Handle shutdown
+        while time.perf_counter() < max_time:
             if self.is_shutdown():
                 logger.info("[wait_and_check_shutdown] Shutdown file detected")
                 scancelled_job_ids = self.scancel_jobs()
                 logger.info(f"[wait_and_check_shutdown] {scancelled_job_ids=}")
                 return scancelled_job_ids
-            can_return = True
             time.sleep(self.poll_interval_internal)
 
         logger.debug("[wait_and_check_shutdown] No shutdown file detected")
         return []
 
+    def _check_no_active_jobs(self):
+        if self.jobs != {}:
+            raise JobExecutionError(
+                "Unexpected branch: jobs must be empty before new "
+                "submissions."
+            )
+
     def submit(
         self,
         func: callable,
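The new waiting logic bounds the loop with `max_time = start_time + max(poll_interval, 0.2)`, so even a `poll_interval=0` configuration waits briefly, while shutdown is still checked every `poll_interval_internal` seconds. A standalone sketch of the same control flow, with the runner's shutdown and scancel methods passed in as callables:

```python
# Sketch of the reworked waiting loop; no fractal-server imports needed.
import time
from collections.abc import Callable


def wait_and_check_shutdown(
    poll_interval: float,
    poll_interval_internal: float,
    is_shutdown: Callable[[], bool],
    scancel_jobs: Callable[[], list[str]],
) -> list[str]:
    start_time = time.perf_counter()
    # Always wait at least 0.2 s, even when poll_interval is 0.
    waiting_time = max(poll_interval, 0.2)
    max_time = start_time + waiting_time
    while time.perf_counter() < max_time:
        if is_shutdown():
            return scancel_jobs()
        time.sleep(poll_interval_internal)
    return []


if __name__ == "__main__":
    # No shutdown requested: returns [] after ~0.2 s despite poll_interval=0.
    print(wait_and_check_shutdown(0.0, 0.05, lambda: False, lambda: ["123"]))
```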
@@ -455,109 +462,133 @@ class BaseSlurmRunner(BaseRunner):
             "compound",
             "converter_compound",
         ],
+        user_id: int,
     ) -> tuple[Any, Exception]:
-        logger.
-
-
-
-
-        if self.jobs != {}:
-            raise JobExecutionError("Unexpected branch: jobs should be empty.")
+        logger.debug("[submit] START")
+        try:
+            workdir_local = task_files.wftask_subfolder_local
+            workdir_remote = task_files.wftask_subfolder_remote
 
-
-
-
-
-
-
-
+            if self.is_shutdown():
+                with next(get_sync_db()) as db:
+                    update_status_of_history_unit(
+                        history_unit_id=history_unit_id,
+                        status=HistoryUnitStatus.FAILED,
+                        db_sync=db,
+                    )
 
-
+                return None, SHUTDOWN_EXCEPTION
 
-
-        self.validate_submit_parameters(
-            parameters=parameters,
-            task_type=task_type,
-        )
+            self._check_no_active_jobs()
 
-
-
-
-
-
-
-        # Submission phase
-        slurm_job = SlurmJob(
-            prefix=task_files.prefix,
-            workdir_local=workdir_local,
-            workdir_remote=workdir_remote,
-            tasks=[
-                SlurmTask(
-                    prefix=task_files.prefix,
-                    index=0,
-                    component=task_files.component,
-                    parameters=parameters,
-                    workdir_remote=workdir_remote,
-                    workdir_local=workdir_local,
-                    task_files=task_files,
-                )
-            ],
-        )
+            # Validation phase
+            self.validate_submit_parameters(
+                parameters=parameters,
+                task_type=task_type,
+            )
 
-
-
-
-
-
-
-
+            # Create task subfolder
+            logger.debug("[submit] Create local/remote folders - START")
+            self._mkdir_local_folder(folder=workdir_local.as_posix())
+            self._mkdir_remote_folder(folder=workdir_remote.as_posix())
+            logger.debug("[submit] Create local/remote folders - END")
+
+            # Submission phase
+            slurm_job = SlurmJob(
+                prefix=task_files.prefix,
+                workdir_local=workdir_local,
+                workdir_remote=workdir_remote,
+                tasks=[
+                    SlurmTask(
+                        prefix=task_files.prefix,
+                        index=0,
+                        component=task_files.component,
+                        parameters=parameters,
+                        workdir_remote=workdir_remote,
+                        workdir_local=workdir_local,
+                        task_files=task_files,
+                    )
+                ],
+            )
 
-
-
-
-
-
+            config.parallel_tasks_per_job = 1
+            self._submit_single_sbatch(
+                func,
+                slurm_job=slurm_job,
+                slurm_config=config,
+            )
+            logger.debug(f"[submit] END submission phase, {self.job_ids=}")
 
-
-
-
-
-        # Look for finished jobs
-        finished_job_ids = self._get_finished_jobs(job_ids=self.job_ids)
-        logger.debug(f"[submit] {finished_job_ids=}")
-        finished_jobs = [
-            self.jobs[_slurm_job_id] for _slurm_job_id in finished_job_ids
-        ]
-        self._fetch_artifacts(finished_jobs)
-        with next(get_sync_db()) as db:
-            for slurm_job_id in finished_job_ids:
-                logger.debug(f"[submit] Now process {slurm_job_id=}")
-                slurm_job = self.jobs.pop(slurm_job_id)
-                was_job_scancelled = slurm_job_id in scancelled_job_ids
-                result, exception = self._postprocess_single_task(
-                    task=slurm_job.tasks[0],
-                    was_job_scancelled=was_job_scancelled,
-                )
+            create_accounting_record_slurm(
+                user_id=user_id,
+                slurm_job_ids=self.job_ids,
+            )
 
-
-
-
-
-
+            # NOTE: see issue 2444
+            settings = Inject(get_settings)
+            sleep_time = settings.FRACTAL_SLURM_INTERVAL_BEFORE_RETRIEVAL
+            logger.warning(f"[submit] Now sleep {sleep_time} seconds.")
+            time.sleep(sleep_time)
+
+            # Retrieval phase
+            logger.debug("[submit] START retrieval phase")
+            scancelled_job_ids = []
+            while len(self.jobs) > 0:
+                # Look for finished jobs
+                finished_job_ids = self._get_finished_jobs(
+                    job_ids=self.job_ids
+                )
+                logger.debug(f"[submit] {finished_job_ids=}")
+                finished_jobs = [
+                    self.jobs[_slurm_job_id]
+                    for _slurm_job_id in finished_job_ids
+                ]
+                self._fetch_artifacts(finished_jobs)
+                with next(get_sync_db()) as db:
+                    for slurm_job_id in finished_job_ids:
+                        logger.debug(f"[submit] Now process {slurm_job_id=}")
+                        slurm_job = self.jobs.pop(slurm_job_id)
+                        was_job_scancelled = slurm_job_id in scancelled_job_ids
+                        result, exception = self._postprocess_single_task(
+                            task=slurm_job.tasks[0],
+                            was_job_scancelled=was_job_scancelled,
                         )
-
-                    if
+
+                        if exception is not None:
                             update_status_of_history_unit(
                                 history_unit_id=history_unit_id,
-                                status=HistoryUnitStatus.
+                                status=HistoryUnitStatus.FAILED,
                                 db_sync=db,
                             )
+                        else:
+                            if task_type not in [
+                                "compound",
+                                "converter_compound",
+                            ]:
+                                update_status_of_history_unit(
+                                    history_unit_id=history_unit_id,
+                                    status=HistoryUnitStatus.DONE,
+                                    db_sync=db,
+                                )
 
-
-
+                if len(self.jobs) > 0:
+                    scancelled_job_ids = self.wait_and_check_shutdown()
 
-
-
+            logger.debug("[submit] END")
+            return result, exception
+
+        except Exception as e:
+            logger.error(
+                f"[submit] Unexpected exception. Original error: {str(e)}"
+            )
+            with next(get_sync_db()) as db:
+                update_status_of_history_unit(
+                    history_unit_id=history_unit_id,
+                    status=HistoryUnitStatus.FAILED,
+                    db_sync=db,
+                )
+            self.scancel_jobs()
+            return None, e
 
     def multisubmit(
         self,
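The `submit` rework wraps the whole submission-plus-retrieval flow in a single `try`: an unexpected exception is logged, the history unit is marked `FAILED`, outstanding jobs are scancelled, and the error is returned rather than raised. Below is a stripped-down sketch of that shape, with no DB and no SLURM; all callables are stand-ins for the real runner methods.

```python
# Sketch of the error-handling shape introduced in `submit`: on unexpected
# failure, flag the unit, cancel jobs, and hand back (None, exception).
from typing import Any, Callable, Optional


def submit_like(
    do_submit_and_retrieve: Callable[[], Any],
    mark_unit_failed: Callable[[], None],
    scancel_jobs: Callable[[], list],
) -> tuple[Any, Optional[Exception]]:
    try:
        result = do_submit_and_retrieve()
        return result, None
    except Exception as e:
        # Mirror of the new `except` branch: mark the history unit as FAILED,
        # cancel any outstanding SLURM jobs, and return the error.
        mark_unit_failed()
        scancel_jobs()
        return None, e


if __name__ == "__main__":
    def failing_submission() -> Any:
        raise RuntimeError("sbatch failed")  # simulated unexpected error

    print(submit_like(lambda: 42, lambda: None, lambda: []))
    print(
        submit_like(
            failing_submission,
            lambda: print("history unit -> FAILED"),
            lambda: print("scancel issued") or [],
        )
    )
```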
@@ -567,6 +598,7 @@ class BaseSlurmRunner(BaseRunner):
         list_task_files: list[TaskFiles],
         task_type: Literal["parallel", "compound", "converter_compound"],
         config: SlurmConfig,
+        user_id: int,
     ) -> tuple[dict[int, Any], dict[int, BaseException]]:
         """
         Note: `list_parameters`, `list_task_files` and `history_unit_ids`
@@ -574,111 +606,128 @@ class BaseSlurmRunner(BaseRunner):
         input images, while for compound tasks these can differ.
         """
 
-
-
-
+        logger.debug(f"[multisubmit] START, {len(list_parameters)=}")
+        try:
+
+            if self.is_shutdown():
+                if task_type == "parallel":
+                    with next(get_sync_db()) as db:
+                        bulk_update_status_of_history_unit(
+                            history_unit_ids=history_unit_ids,
+                            status=HistoryUnitStatus.FAILED,
+                            db_sync=db,
+                        )
+                results = {}
+                exceptions = {
+                    ind: SHUTDOWN_EXCEPTION
+                    for ind in range(len(list_parameters))
+                }
+                return results, exceptions
+
+            self._check_no_active_jobs()
+            self.validate_multisubmit_parameters(
+                list_parameters=list_parameters,
+                task_type=task_type,
+                list_task_files=list_task_files,
+                history_unit_ids=history_unit_ids,
             )
 
-
+            workdir_local = list_task_files[0].wftask_subfolder_local
+            workdir_remote = list_task_files[0].wftask_subfolder_remote
+
+            # Create local&remote task subfolders
             if task_type == "parallel":
-
-
-
-
-
+                self._mkdir_local_folder(workdir_local.as_posix())
+                self._mkdir_remote_folder(folder=workdir_remote.as_posix())
+
+            results: dict[int, Any] = {}
+            exceptions: dict[int, BaseException] = {}
+
+            # NOTE: chunking has already taken place in `get_slurm_config`,
+            # so that `config.tasks_per_job` is now set.
+
+            # Divide arguments in batches of `tasks_per_job` tasks each
+            tot_tasks = len(list_parameters)
+            args_batches = []
+            batch_size = config.tasks_per_job
+            for ind_chunk in range(0, tot_tasks, batch_size):
+                args_batches.append(
+                    list_parameters[ind_chunk : ind_chunk + batch_size]  # noqa
+                )
+            if len(args_batches) != math.ceil(
+                tot_tasks / config.tasks_per_job
+            ):
+                raise RuntimeError("Something wrong here while batching tasks")
+
+            # Part 1/3: Iterate over chunks, prepare SlurmJob objects
+            logger.debug("[multisubmit] Prepare `SlurmJob`s.")
+            jobs_to_submit = []
+            for ind_batch, chunk in enumerate(args_batches):
+                # Read prefix based on the first task of this batch
+                prefix = list_task_files[ind_batch * batch_size].prefix
+                tasks = []
+                for ind_chunk, parameters in enumerate(chunk):
+                    index = (ind_batch * batch_size) + ind_chunk
+                    tasks.append(
+                        SlurmTask(
+                            prefix=prefix,
+                            index=index,
+                            component=list_task_files[index].component,
+                            workdir_local=workdir_local,
+                            workdir_remote=workdir_remote,
+                            parameters=parameters,
+                            zarr_url=parameters["zarr_url"],
+                            task_files=list_task_files[index],
+                        ),
                     )
-
-
-                ind: SHUTDOWN_EXCEPTION for ind in range(len(list_parameters))
-            }
-            return results, exceptions
-
-        self.validate_multisubmit_parameters(
-            list_parameters=list_parameters,
-            task_type=task_type,
-            list_task_files=list_task_files,
-            history_unit_ids=history_unit_ids,
-        )
-
-        logger.info(f"[multisubmit] START, {len(list_parameters)=}")
-
-        workdir_local = list_task_files[0].wftask_subfolder_local
-        workdir_remote = list_task_files[0].wftask_subfolder_remote
-
-        # Create local&remote task subfolders
-        if task_type == "parallel":
-            self._mkdir_local_folder(workdir_local.as_posix())
-            self._mkdir_remote_folder(folder=workdir_remote.as_posix())
-
-        # Execute tasks, in chunks of size `parallel_tasks_per_job`
-        # TODO Pick a data structure for results and exceptions, or review the
-        # interface
-        results: dict[int, Any] = {}
-        exceptions: dict[int, BaseException] = {}
-
-        tot_tasks = len(list_parameters)
-
-        # NOTE: chunking has already taken place in `get_slurm_config`,
-        # so that `config.tasks_per_job` is now set.
-
-        # Divide arguments in batches of `tasks_per_job` tasks each
-        args_batches = []
-        batch_size = config.tasks_per_job
-        for ind_chunk in range(0, tot_tasks, batch_size):
-            args_batches.append(
-                list_parameters[ind_chunk : ind_chunk + batch_size]  # noqa
-            )
-        if len(args_batches) != math.ceil(tot_tasks / config.tasks_per_job):
-            raise RuntimeError("Something wrong here while batching tasks")
-
-        # Part 1/3: Iterate over chunks, prepare SlurmJob objects
-        logger.info("[multisubmit] Prepare `SlurmJob`s.")
-        jobs_to_submit = []
-        for ind_batch, chunk in enumerate(args_batches):
-            # Read prefix based on the first task of this batch
-            prefix = list_task_files[ind_batch * batch_size].prefix
-            tasks = []
-            for ind_chunk, parameters in enumerate(chunk):
-                index = (ind_batch * batch_size) + ind_chunk
-                tasks.append(
-                    SlurmTask(
+                jobs_to_submit.append(
+                    SlurmJob(
                         prefix=prefix,
-                        index=index,
-                        component=list_task_files[index].component,
                         workdir_local=workdir_local,
                         workdir_remote=workdir_remote,
-
-
-                        task_files=list_task_files[index],
-                    ),
+                        tasks=tasks,
+                    )
                 )
-
-
-
-
-
-
+
+            # NOTE: see issue 2431
+            logger.debug("[multisubmit] Transfer files and submit jobs.")
+            for slurm_job in jobs_to_submit:
+                self._submit_single_sbatch(
+                    func,
+                    slurm_job=slurm_job,
+                    slurm_config=config,
                 )
-            )
 
-
-        logger.info("[multisubmit] Transfer files and submit jobs.")
-        for slurm_job in jobs_to_submit:
-            self._submit_single_sbatch(
-                func,
-                slurm_job=slurm_job,
-                slurm_config=config,
-            )
+            logger.info(f"[multisubmit] END submission phase, {self.job_ids=}")
 
-
+            create_accounting_record_slurm(
+                user_id=user_id,
+                slurm_job_ids=self.job_ids,
+            )
 
-
-
-
-
+            settings = Inject(get_settings)
+            sleep_time = settings.FRACTAL_SLURM_INTERVAL_BEFORE_RETRIEVAL
+            logger.warning(f"[multisubmit] Now sleep {sleep_time} seconds.")
+            time.sleep(sleep_time)
+        except Exception as e:
+            logger.error(
+                "[multisubmit] Unexpected exception during submission."
+                f" Original error {str(e)}"
+            )
+            self.scancel_jobs()
+            if task_type == "parallel":
+                with next(get_sync_db()) as db:
+                    bulk_update_status_of_history_unit(
+                        history_unit_ids=history_unit_ids,
+                        status=HistoryUnitStatus.FAILED,
+                        db_sync=db,
+                    )
+            results = {}
+            exceptions = {ind: e for ind in range(len(list_parameters))}
+            return results, exceptions
 
         # Retrieval phase
-        logger.
+        logger.debug("[multisubmit] START retrieval phase")
         scancelled_job_ids = []
         while len(self.jobs) > 0:
             # Look for finished jobs
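The batching step in `multisubmit` is plain arithmetic: split `list_parameters` into chunks of `config.tasks_per_job` elements and check that the number of chunks equals `ceil(tot_tasks / tasks_per_job)`. A standalone sketch of exactly that computation:

```python
# Sketch of the chunking performed in `multisubmit`; runs standalone.
import math
from typing import Any


def split_in_batches(list_parameters: list[Any], tasks_per_job: int) -> list[list[Any]]:
    tot_tasks = len(list_parameters)
    args_batches = [
        list_parameters[ind : ind + tasks_per_job]
        for ind in range(0, tot_tasks, tasks_per_job)
    ]
    # Consistency check, as in the diff above.
    if len(args_batches) != math.ceil(tot_tasks / tasks_per_job):
        raise RuntimeError("Something wrong here while batching tasks")
    return args_batches


if __name__ == "__main__":
    print(split_in_batches(list(range(7)), 3))
    # -> [[0, 1, 2], [3, 4, 5], [6]]  (3 batches = ceil(7 / 3))
```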
@@ -687,20 +736,46 @@ class BaseSlurmRunner(BaseRunner):
             finished_jobs = [
                 self.jobs[_slurm_job_id] for _slurm_job_id in finished_job_ids
             ]
-
+            fetch_artifacts_exception = None
+            try:
+                self._fetch_artifacts(finished_jobs)
+            except Exception as e:
+                logger.error(
+                    "[multisubmit] Unexpected exception in "
+                    "`_fetch_artifacts`. "
+                    f"Original error: {str(e)}"
+                )
+                fetch_artifacts_exception = e
 
             with next(get_sync_db()) as db:
                 for slurm_job_id in finished_job_ids:
-                    logger.
+                    logger.debug(f"[multisubmit] Now process {slurm_job_id=}")
                     slurm_job = self.jobs.pop(slurm_job_id)
                     for task in slurm_job.tasks:
-                        logger.
-
-                        result, exception = self._postprocess_single_task(
-                            task=task,
-                            was_job_scancelled=was_job_scancelled,
+                        logger.debug(
+                            f"[multisubmit] Now process {task.index=}"
                         )
-
+                        was_job_scancelled = slurm_job_id in scancelled_job_ids
+                        if fetch_artifacts_exception is not None:
+                            result = None
+                            exception = fetch_artifacts_exception
+                        else:
+                            try:
+                                (
+                                    result,
+                                    exception,
+                                ) = self._postprocess_single_task(
+                                    task=task,
+                                    was_job_scancelled=was_job_scancelled,
+                                )
+                            except Exception as e:
+                                logger.error(
+                                    "[multisubmit] Unexpected exception in "
+                                    "`_postprocess_single_task`. "
+                                    f"Original error: {str(e)}"
+                                )
+                                result = None
+                                exception = e
                         # Note: the relevant done/failed check is based on
                         # whether `exception is None`. The fact that
                         # `result is None` is not relevant for this purpose.
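This hunk changes how failures propagate during retrieval: a `_fetch_artifacts` failure becomes the exception of every task in the affected jobs, while a `_postprocess_single_task` failure only affects that one task. A standalone sketch of the same propagation rules, with jobs and tasks modeled as plain dicts:

```python
# Sketch of per-task error propagation during the retrieval phase.
from typing import Any, Callable, Optional


def process_finished_jobs(
    finished_jobs: list[dict],
    fetch_artifacts: Callable[[list[dict]], None],
    postprocess_task: Callable[[dict], tuple[Any, Optional[Exception]]],
) -> dict[int, Optional[Exception]]:
    fetch_artifacts_exception: Optional[Exception] = None
    try:
        fetch_artifacts(finished_jobs)
    except Exception as e:
        # Fetching failed: every task of these jobs inherits this exception.
        fetch_artifacts_exception = e

    exceptions: dict[int, Optional[Exception]] = {}
    for job in finished_jobs:
        for task in job["tasks"]:
            if fetch_artifacts_exception is not None:
                exceptions[task["index"]] = fetch_artifacts_exception
            else:
                try:
                    _, exc = postprocess_task(task)
                    exceptions[task["index"]] = exc
                except Exception as e:
                    # A post-processing failure only affects this task.
                    exceptions[task["index"]] = e
    return exceptions


if __name__ == "__main__":
    jobs = [{"tasks": [{"index": 0}, {"index": 1}]}]

    def broken_fetch(_jobs: list[dict]) -> None:
        raise OSError("scp failed")  # simulated transfer error

    print(process_finished_jobs(jobs, broken_fetch, lambda t: (None, None)))
    # Both tasks end up with the same OSError
```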
@@ -728,7 +803,7 @@ class BaseSlurmRunner(BaseRunner):
             if len(self.jobs) > 0:
                 scancelled_job_ids = self.wait_and_check_shutdown()
 
-        logger.
+        logger.debug("[multisubmit] END")
         return results, exceptions
 
     def check_fractal_server_versions(self) -> None:
@@ -763,16 +838,15 @@ class BaseSlurmRunner(BaseRunner):
 
     def scancel_jobs(self) -> list[str]:
         logger.info("[scancel_jobs] START")
-
+        scancelled_job_ids = self.job_ids
         if self.jobs:
-            scancelled_job_ids = self.job_ids
             scancel_string = " ".join(scancelled_job_ids)
             scancel_cmd = f"scancel {scancel_string}"
-            logger.warning(f"
+            logger.warning(f"[scancel_jobs] {scancel_string}")
             try:
                 self._run_remote_cmd(scancel_cmd)
             except Exception as e:
-                logger.
+                logger.error(
                     "[scancel_jobs] `scancel` command failed. "
                     f"Original error:\n{str(e)}"
                 )