fractal-server 2.14.0a33__py3-none-any.whl → 2.14.0a35__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -15,6 +15,7 @@ from ..slurm_common.slurm_job_task_models import SlurmTask
 from ._job_states import STATES_FINISHED
 from fractal_server import __VERSION__
 from fractal_server.app.db import get_sync_db
+from fractal_server.app.models.v2 import AccountingRecordSlurm
 from fractal_server.app.runner.exceptions import JobExecutionError
 from fractal_server.app.runner.exceptions import TaskExecutionError
 from fractal_server.app.runner.executors.base_runner import BaseRunner
@@ -34,7 +35,20 @@ SHUTDOWN_EXCEPTION = JobExecutionError(SHUTDOWN_ERROR_MESSAGE)

 logger = set_logger(__name__)

-# NOTE: see issue 2481.
+
+def create_accounting_record_slurm(
+    *,
+    user_id: int,
+    slurm_job_ids: list[int],
+) -> None:
+    with next(get_sync_db()) as db:
+        db.add(
+            AccountingRecordSlurm(
+                user_id=user_id,
+                slurm_job_ids=slurm_job_ids,
+            )
+        )
+        db.commit()


 class BaseSlurmRunner(BaseRunner):
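Note on the new helper: `create_accounting_record_slurm` opens a synchronous database session via `next(get_sync_db())` and commits one `AccountingRecordSlurm` row listing the submitted SLURM job IDs. The sketch below is illustrative only (stand-in classes, not the fractal_server.app.db implementation); it shows why `with next(get_sync_db()) as db:` works — the dependency is a generator that yields a session object usable as a context manager.

    # Illustrative stand-ins only: DummySession/get_sync_db here are
    # assumptions, not fractal-server code.
    from typing import Iterator


    class DummySession:
        def __init__(self) -> None:
            self.pending: list[object] = []
            self.committed: list[object] = []

        def add(self, obj: object) -> None:
            self.pending.append(obj)

        def commit(self) -> None:
            self.committed.extend(self.pending)
            self.pending.clear()

        def __enter__(self) -> "DummySession":
            return self

        def __exit__(self, *exc_info) -> None:
            # A real session would release its connection here.
            pass


    def get_sync_db() -> Iterator[DummySession]:
        # Mirrors the generator-style dependency: `next(...)` pulls one session.
        yield DummySession()


    with next(get_sync_db()) as db:
        db.add({"user_id": 1, "slurm_job_ids": [101, 102]})
        db.commit()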
@@ -100,65 +114,51 @@ class BaseSlurmRunner(BaseRunner):
     def __exit__(self, exc_type, exc_val, exc_tb):
         return False

-    def _run_local_cmd(self, cmd: str) -> str:
-        raise NotImplementedError("Implement in child class.")
-
     def _run_remote_cmd(self, cmd: str) -> str:
         raise NotImplementedError("Implement in child class.")

-    def run_squeue(self, job_ids: list[str]) -> tuple[bool, str]:
-        # NOTE: see issue 2482
-
-        if len(job_ids) == 0:
-            return (False, "")
-
-        job_id_single_str = ",".join([str(j) for j in job_ids])
-        cmd = (
-            f"squeue --noheader --format='%i %T' --jobs {job_id_single_str}"
-            " --states=all"
-        )
-
-        try:
-            if self.slurm_runner_type == "sudo":
-                stdout = self._run_local_cmd(cmd)
-            else:
-                stdout = self._run_remote_cmd(cmd)
-            return (True, stdout)
-        except Exception as e:
-            logger.info(f"{cmd=} failed with {str(e)}")
-            return (False, "")
+    def run_squeue(self, *, job_ids: list[str], **kwargs) -> str:
+        raise NotImplementedError("Implement in child class.")

     def _get_finished_jobs(self, job_ids: list[str]) -> set[str]:
-        # If there is no Slurm job to check, return right away

+        # If there is no Slurm job to check, return right away
         if not job_ids:
             return set()
-        id_to_state = dict()

-        success, stdout = self.run_squeue(job_ids)
-        if success:
-            id_to_state = {
+        try:
+            stdout = self.run_squeue(job_ids=job_ids)
+            slurm_statuses = {
                 out.split()[0]: out.split()[1] for out in stdout.splitlines()
             }
-        else:
-            id_to_state = dict()
-            for j in job_ids:
-                success, res = self.run_squeue([j])
-                if not success:
-                    logger.info(f"Job {j} not found. Marked it as completed")
-                    id_to_state.update({str(j): "COMPLETED"})
-                else:
-                    id_to_state.update(
-                        {res.stdout.split()[0]: res.stdout.split()[1]}
+        except Exception as e:
+            logger.warning(
+                "[_get_finished_jobs] `squeue` failed, "
+                "retry with individual job IDs. "
+                f"Original error: {str(e)}."
+            )
+            slurm_statuses = dict()
+            for job_id in job_ids:
+                try:
+                    stdout = self.run_squeue(job_ids=[job_id])
+                    slurm_statuses.update(
+                        {stdout.split()[0]: stdout.split()[1]}
                     )
+                except Exception as e:
+                    logger.warning(
+                        "[_get_finished_jobs] `squeue` failed for "
+                        f"{job_id=}, mark job as completed. "
+                        f"Original error: {str(e)}."
+                    )
+                    slurm_statuses.update({str(job_id): "COMPLETED"})

-        # Finished jobs only stay in squeue for a few mins (configurable). If
-        # a job ID isn't there, we'll assume it's finished.
-        return {
-            j
-            for j in job_ids
-            if id_to_state.get(j, "COMPLETED") in STATES_FINISHED
+        # If a job is not in `squeue` output, mark it as completed.
+        finished_jobs = {
+            job_id
+            for job_id in job_ids
+            if slurm_statuses.get(job_id, "COMPLETED") in STATES_FINISHED
         }
+        return finished_jobs

     def _mkdir_local_folder(self, folder: str) -> None:
         raise NotImplementedError("Implement in child class.")
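`run_squeue` is now an abstract hook (its sudo/SSH child classes are expected to provide the actual command execution), and `_get_finished_jobs` retries job by job when the batched `squeue` call fails, treating any job missing from the output as COMPLETED. A hypothetical child-class implementation could look like the sketch below; it reuses the `squeue --noheader --format='%i %T' ... --states=all` invocation from the removed base-class code, but the real sudo/SSH runners may differ.

    # Hypothetical subclass sketch; LocalSlurmRunnerSketch is an assumption.
    import shlex
    import subprocess


    class LocalSlurmRunnerSketch:
        def run_squeue(self, *, job_ids: list[str], **kwargs) -> str:
            if not job_ids:
                return ""
            cmd = (
                "squeue --noheader --format='%i %T' "
                f"--jobs {','.join(job_ids)} --states=all"
            )
            # check=True lets `_get_finished_jobs` catch failures and fall
            # back to per-job queries.
            res = subprocess.run(
                shlex.split(cmd), capture_output=True, text=True, check=True
            )
            return res.stdout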
@@ -172,7 +172,7 @@ class BaseSlurmRunner(BaseRunner):
         slurm_job: SlurmJob,
         slurm_config: SlurmConfig,
     ) -> str:
-        logger.info("[_submit_single_sbatch] START")
+        logger.debug("[_submit_single_sbatch] START")
         # Prepare input pickle(s)
         versions = dict(
             python=sys.version_info[:3],
@@ -189,7 +189,7 @@ class BaseSlurmRunner(BaseRunner):
             funcser = cloudpickle.dumps((versions, func, _args, _kwargs))
             with open(task.input_pickle_file_local, "wb") as f:
                 f.write(funcser)
-            logger.info(
+            logger.debug(
                 "[_submit_single_sbatch] Written "
                 f"{task.input_pickle_file_local=}"
             )
@@ -200,7 +200,7 @@ class BaseSlurmRunner(BaseRunner):
                 local=task.input_pickle_file_local,
                 remote=task.input_pickle_file_remote,
             )
-            logger.info(
+            logger.debug(
                 "[_submit_single_sbatch] Transferred "
                 f"{task.input_pickle_file_local=}"
             )
@@ -243,7 +243,7 @@ class BaseSlurmRunner(BaseRunner):
             ]
         )
         script_lines = slurm_config.sort_script_lines(script_lines)
-        logger.info(script_lines)
+        logger.debug(script_lines)

         # Always print output of `uname -n` and `pwd`
         script_lines.append('\necho "Hostname: $(uname -n)"')
@@ -272,7 +272,7 @@ class BaseSlurmRunner(BaseRunner):
         # Write submission script
         with open(slurm_job.slurm_submission_script_local, "w") as f:
             f.write(script)
-        logger.info(
+        logger.debug(
             "[_submit_single_sbatch] Written "
             f"{slurm_job.slurm_submission_script_local=}"
         )
@@ -294,10 +294,10 @@ class BaseSlurmRunner(BaseRunner):
         # Run sbatch
         pre_submission_cmds = slurm_config.pre_submission_commands
         if len(pre_submission_cmds) == 0:
-            logger.info(f"Now run {submit_command=}")
+            logger.debug(f"Now run {submit_command=}")
             sbatch_stdout = self._run_remote_cmd(submit_command)
         else:
-            logger.info(f"Now using {pre_submission_cmds=}")
+            logger.debug(f"Now using {pre_submission_cmds=}")
             script_lines = pre_submission_cmds + [submit_command]
             wrapper_script_contents = "\n".join(script_lines)
             wrapper_script_contents = f"{wrapper_script_contents}\n"
@@ -314,22 +314,22 @@ class BaseSlurmRunner(BaseRunner):
             )
             with open(wrapper_script, "w") as f:
                 f.write(wrapper_script_contents)
-            logger.info(f"Now run {wrapper_script=}")
+            logger.debug(f"Now run {wrapper_script=}")
             sbatch_stdout = self._run_remote_cmd(f"bash {wrapper_script}")

         # Submit SLURM job and retrieve job ID
-        logger.info(f"[_submit_single_sbatc] {sbatch_stdout=}")
+        logger.info(f"[_submit_single_sbatch] {sbatch_stdout=}")
         stdout = sbatch_stdout.strip("\n")
         submitted_job_id = int(stdout)
         slurm_job.slurm_job_id = str(submitted_job_id)

         # Add job to self.jobs
         self.jobs[slurm_job.slurm_job_id] = slurm_job
-        logger.info(
+        logger.debug(
             "[_submit_single_sbatch] Added "
             f"{slurm_job.slurm_job_id} to self.jobs."
         )
-        logger.info("[_submit_single_sbatch] END")
+        logger.debug("[_submit_single_sbatch] END")

     def _fetch_artifacts(
         self,
@@ -421,27 +421,34 @@ class BaseSlurmRunner(BaseRunner):
         """
         # Sleep for `self.poll_interval`, but keep checking for shutdowns
         start_time = time.perf_counter()
-        max_time = start_time + self.poll_interval
-        can_return = False
+        # Always wait at least 0.2 (note: this is for cases where
+        # `poll_interval=0`).
+        waiting_time = max(self.poll_interval, 0.2)
+        max_time = start_time + waiting_time
         logger.debug(
             "[wait_and_check_shutdown] "
             f"I will wait at most {self.poll_interval} s, "
             f"in blocks of {self.poll_interval_internal} s."
         )

-        while (time.perf_counter() < max_time) or (can_return is False):
-            # Handle shutdown
+        while time.perf_counter() < max_time:
            if self.is_shutdown():
                logger.info("[wait_and_check_shutdown] Shutdown file detected")
                scancelled_job_ids = self.scancel_jobs()
                logger.info(f"[wait_and_check_shutdown] {scancelled_job_ids=}")
                return scancelled_job_ids
-            can_return = True
            time.sleep(self.poll_interval_internal)

        logger.debug("[wait_and_check_shutdown] No shutdown file detected")
        return []

+    def _check_no_active_jobs(self):
+        if self.jobs != {}:
+            raise JobExecutionError(
+                "Unexpected branch: jobs must be empty before new "
+                "submissions."
+            )
+
     def submit(
         self,
         func: callable,
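The reworked `wait_and_check_shutdown` drops the `can_return` flag and instead enforces a minimum total wait of 0.2 s (relevant when `poll_interval=0`), checking for a shutdown request on every internal tick; `_check_no_active_jobs` centralizes the guard that used to be an inline `if self.jobs != {}` check. A self-contained sketch of the wait loop, with stand-in arguments in place of the runner's attributes:

    import time


    def wait_and_check_shutdown_sketch(
        poll_interval: float,
        poll_interval_internal: float,
        is_shutdown,  # callable returning bool, e.g. checking a shutdown file
    ) -> bool:
        """Return True if a shutdown request was seen while waiting."""
        start_time = time.perf_counter()
        # Always wait at least 0.2 s, as in the new implementation.
        waiting_time = max(poll_interval, 0.2)
        max_time = start_time + waiting_time
        while time.perf_counter() < max_time:
            if is_shutdown():
                return True
            time.sleep(poll_interval_internal)
        return False


    print(wait_and_check_shutdown_sketch(0.0, 0.05, lambda: False))  # False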
@@ -455,109 +462,133 @@ class BaseSlurmRunner(BaseRunner):
             "compound",
             "converter_compound",
         ],
+        user_id: int,
     ) -> tuple[Any, Exception]:
-        logger.info("[submit] START")
-
-        workdir_local = task_files.wftask_subfolder_local
-        workdir_remote = task_files.wftask_subfolder_remote
-
-        if self.jobs != {}:
-            raise JobExecutionError("Unexpected branch: jobs should be empty.")
+        logger.debug("[submit] START")
+        try:
+            workdir_local = task_files.wftask_subfolder_local
+            workdir_remote = task_files.wftask_subfolder_remote

-        if self.is_shutdown():
-            with next(get_sync_db()) as db:
-                update_status_of_history_unit(
-                    history_unit_id=history_unit_id,
-                    status=HistoryUnitStatus.FAILED,
-                    db_sync=db,
-                )
+            if self.is_shutdown():
+                with next(get_sync_db()) as db:
+                    update_status_of_history_unit(
+                        history_unit_id=history_unit_id,
+                        status=HistoryUnitStatus.FAILED,
+                        db_sync=db,
+                    )

-            return None, SHUTDOWN_EXCEPTION
+                return None, SHUTDOWN_EXCEPTION

-        # Validation phase
-        self.validate_submit_parameters(
-            parameters=parameters,
-            task_type=task_type,
-        )
+            self._check_no_active_jobs()

-        # Create task subfolder
-        logger.info("[submit] Create local/remote folders - START")
-        self._mkdir_local_folder(folder=workdir_local.as_posix())
-        self._mkdir_remote_folder(folder=workdir_remote.as_posix())
-        logger.info("[submit] Create local/remote folders - END")
-
-        # Submission phase
-        slurm_job = SlurmJob(
-            prefix=task_files.prefix,
-            workdir_local=workdir_local,
-            workdir_remote=workdir_remote,
-            tasks=[
-                SlurmTask(
-                    prefix=task_files.prefix,
-                    index=0,
-                    component=task_files.component,
-                    parameters=parameters,
-                    workdir_remote=workdir_remote,
-                    workdir_local=workdir_local,
-                    task_files=task_files,
-                )
-            ],
-        )
+            # Validation phase
+            self.validate_submit_parameters(
+                parameters=parameters,
+                task_type=task_type,
+            )

-        config.parallel_tasks_per_job = 1
-        self._submit_single_sbatch(
-            func,
-            slurm_job=slurm_job,
-            slurm_config=config,
-        )
-        logger.info(f"[submit] END submission phase, {self.job_ids=}")
+            # Create task subfolder
+            logger.debug("[submit] Create local/remote folders - START")
+            self._mkdir_local_folder(folder=workdir_local.as_posix())
+            self._mkdir_remote_folder(folder=workdir_remote.as_posix())
+            logger.debug("[submit] Create local/remote folders - END")
+
+            # Submission phase
+            slurm_job = SlurmJob(
+                prefix=task_files.prefix,
+                workdir_local=workdir_local,
+                workdir_remote=workdir_remote,
+                tasks=[
+                    SlurmTask(
+                        prefix=task_files.prefix,
+                        index=0,
+                        component=task_files.component,
+                        parameters=parameters,
+                        workdir_remote=workdir_remote,
+                        workdir_local=workdir_local,
+                        task_files=task_files,
+                    )
+                ],
+            )

-        # NOTE: see issue 2444
-        settings = Inject(get_settings)
-        sleep_time = settings.FRACTAL_SLURM_INTERVAL_BEFORE_RETRIEVAL
-        logger.warning(f"[submit] Now sleep {sleep_time} seconds.")
-        time.sleep(sleep_time)
+            config.parallel_tasks_per_job = 1
+            self._submit_single_sbatch(
+                func,
+                slurm_job=slurm_job,
+                slurm_config=config,
+            )
+            logger.debug(f"[submit] END submission phase, {self.job_ids=}")

-        # Retrieval phase
-        logger.info("[submit] START retrieval phase")
-        scancelled_job_ids = []
-        while len(self.jobs) > 0:
-            # Look for finished jobs
-            finished_job_ids = self._get_finished_jobs(job_ids=self.job_ids)
-            logger.debug(f"[submit] {finished_job_ids=}")
-            finished_jobs = [
-                self.jobs[_slurm_job_id] for _slurm_job_id in finished_job_ids
-            ]
-            self._fetch_artifacts(finished_jobs)
-            with next(get_sync_db()) as db:
-                for slurm_job_id in finished_job_ids:
-                    logger.debug(f"[submit] Now process {slurm_job_id=}")
-                    slurm_job = self.jobs.pop(slurm_job_id)
-                    was_job_scancelled = slurm_job_id in scancelled_job_ids
-                    result, exception = self._postprocess_single_task(
-                        task=slurm_job.tasks[0],
-                        was_job_scancelled=was_job_scancelled,
-                    )
+            create_accounting_record_slurm(
+                user_id=user_id,
+                slurm_job_ids=self.job_ids,
+            )

-                    if exception is not None:
-                        update_status_of_history_unit(
-                            history_unit_id=history_unit_id,
-                            status=HistoryUnitStatus.FAILED,
-                            db_sync=db,
+            # NOTE: see issue 2444
+            settings = Inject(get_settings)
+            sleep_time = settings.FRACTAL_SLURM_INTERVAL_BEFORE_RETRIEVAL
+            logger.warning(f"[submit] Now sleep {sleep_time} seconds.")
+            time.sleep(sleep_time)
+
+            # Retrieval phase
+            logger.debug("[submit] START retrieval phase")
+            scancelled_job_ids = []
+            while len(self.jobs) > 0:
+                # Look for finished jobs
+                finished_job_ids = self._get_finished_jobs(
+                    job_ids=self.job_ids
+                )
+                logger.debug(f"[submit] {finished_job_ids=}")
+                finished_jobs = [
+                    self.jobs[_slurm_job_id]
+                    for _slurm_job_id in finished_job_ids
+                ]
+                self._fetch_artifacts(finished_jobs)
+                with next(get_sync_db()) as db:
+                    for slurm_job_id in finished_job_ids:
+                        logger.debug(f"[submit] Now process {slurm_job_id=}")
+                        slurm_job = self.jobs.pop(slurm_job_id)
+                        was_job_scancelled = slurm_job_id in scancelled_job_ids
+                        result, exception = self._postprocess_single_task(
+                            task=slurm_job.tasks[0],
+                            was_job_scancelled=was_job_scancelled,
                         )
-                    else:
-                        if task_type not in ["compound", "converter_compound"]:
+
+                        if exception is not None:
                             update_status_of_history_unit(
                                 history_unit_id=history_unit_id,
-                                status=HistoryUnitStatus.DONE,
+                                status=HistoryUnitStatus.FAILED,
                                 db_sync=db,
                             )
+                        else:
+                            if task_type not in [
+                                "compound",
+                                "converter_compound",
+                            ]:
+                                update_status_of_history_unit(
+                                    history_unit_id=history_unit_id,
+                                    status=HistoryUnitStatus.DONE,
+                                    db_sync=db,
+                                )

-            if len(self.jobs) > 0:
-                scancelled_job_ids = self.wait_and_check_shutdown()
+                if len(self.jobs) > 0:
+                    scancelled_job_ids = self.wait_and_check_shutdown()

-        logger.info("[submit] END")
-        return result, exception
+            logger.debug("[submit] END")
+            return result, exception
+
+        except Exception as e:
+            logger.error(
+                f"[submit] Unexpected exception. Original error: {str(e)}"
+            )
+            with next(get_sync_db()) as db:
+                update_status_of_history_unit(
+                    history_unit_id=history_unit_id,
+                    status=HistoryUnitStatus.FAILED,
+                    db_sync=db,
+                )
+            self.scancel_jobs()
+            return None, e


     def multisubmit(
@@ -567,6 +598,7 @@ class BaseSlurmRunner(BaseRunner):
         list_task_files: list[TaskFiles],
         task_type: Literal["parallel", "compound", "converter_compound"],
         config: SlurmConfig,
+        user_id: int,
     ) -> tuple[dict[int, Any], dict[int, BaseException]]:
         """
         Note: `list_parameters`, `list_task_files` and `history_unit_ids`
@@ -574,111 +606,128 @@ class BaseSlurmRunner(BaseRunner):
         input images, while for compound tasks these can differ.
         """

-        if len(self.jobs) > 0:
-            raise RuntimeError(
-                f"Cannot run `multisubmit` when {len(self.jobs)=}"
+        logger.debug(f"[multisubmit] START, {len(list_parameters)=}")
+        try:
+
+            if self.is_shutdown():
+                if task_type == "parallel":
+                    with next(get_sync_db()) as db:
+                        bulk_update_status_of_history_unit(
+                            history_unit_ids=history_unit_ids,
+                            status=HistoryUnitStatus.FAILED,
+                            db_sync=db,
+                        )
+                results = {}
+                exceptions = {
+                    ind: SHUTDOWN_EXCEPTION
+                    for ind in range(len(list_parameters))
+                }
+                return results, exceptions
+
+            self._check_no_active_jobs()
+            self.validate_multisubmit_parameters(
+                list_parameters=list_parameters,
+                task_type=task_type,
+                list_task_files=list_task_files,
+                history_unit_ids=history_unit_ids,
             )

-        if self.is_shutdown():
+            workdir_local = list_task_files[0].wftask_subfolder_local
+            workdir_remote = list_task_files[0].wftask_subfolder_remote
+
+            # Create local&remote task subfolders
             if task_type == "parallel":
-                with next(get_sync_db()) as db:
-                    bulk_update_status_of_history_unit(
-                        history_unit_ids=history_unit_ids,
-                        status=HistoryUnitStatus.FAILED,
-                        db_sync=db,
+                self._mkdir_local_folder(workdir_local.as_posix())
+                self._mkdir_remote_folder(folder=workdir_remote.as_posix())
+
+            results: dict[int, Any] = {}
+            exceptions: dict[int, BaseException] = {}
+
+            # NOTE: chunking has already taken place in `get_slurm_config`,
+            # so that `config.tasks_per_job` is now set.
+
+            # Divide arguments in batches of `tasks_per_job` tasks each
+            tot_tasks = len(list_parameters)
+            args_batches = []
+            batch_size = config.tasks_per_job
+            for ind_chunk in range(0, tot_tasks, batch_size):
+                args_batches.append(
+                    list_parameters[ind_chunk : ind_chunk + batch_size] # noqa
+                )
+            if len(args_batches) != math.ceil(
+                tot_tasks / config.tasks_per_job
+            ):
+                raise RuntimeError("Something wrong here while batching tasks")
+
+            # Part 1/3: Iterate over chunks, prepare SlurmJob objects
+            logger.debug("[multisubmit] Prepare `SlurmJob`s.")
+            jobs_to_submit = []
+            for ind_batch, chunk in enumerate(args_batches):
+                # Read prefix based on the first task of this batch
+                prefix = list_task_files[ind_batch * batch_size].prefix
+                tasks = []
+                for ind_chunk, parameters in enumerate(chunk):
+                    index = (ind_batch * batch_size) + ind_chunk
+                    tasks.append(
+                        SlurmTask(
+                            prefix=prefix,
+                            index=index,
+                            component=list_task_files[index].component,
+                            workdir_local=workdir_local,
+                            workdir_remote=workdir_remote,
+                            parameters=parameters,
+                            zarr_url=parameters["zarr_url"],
+                            task_files=list_task_files[index],
+                        ),
                     )
-            results = {}
-            exceptions = {
-                ind: SHUTDOWN_EXCEPTION for ind in range(len(list_parameters))
-            }
-            return results, exceptions
-
-        self.validate_multisubmit_parameters(
-            list_parameters=list_parameters,
-            task_type=task_type,
-            list_task_files=list_task_files,
-            history_unit_ids=history_unit_ids,
-        )
-
-        logger.info(f"[multisubmit] START, {len(list_parameters)=}")
-
-        workdir_local = list_task_files[0].wftask_subfolder_local
-        workdir_remote = list_task_files[0].wftask_subfolder_remote
-
-        # Create local&remote task subfolders
-        if task_type == "parallel":
-            self._mkdir_local_folder(workdir_local.as_posix())
-            self._mkdir_remote_folder(folder=workdir_remote.as_posix())
-
-        # Execute tasks, in chunks of size `parallel_tasks_per_job`
-        # TODO Pick a data structure for results and exceptions, or review the
-        # interface
-        results: dict[int, Any] = {}
-        exceptions: dict[int, BaseException] = {}
-
-        tot_tasks = len(list_parameters)
-
-        # NOTE: chunking has already taken place in `get_slurm_config`,
-        # so that `config.tasks_per_job` is now set.
-
-        # Divide arguments in batches of `tasks_per_job` tasks each
-        args_batches = []
-        batch_size = config.tasks_per_job
-        for ind_chunk in range(0, tot_tasks, batch_size):
-            args_batches.append(
-                list_parameters[ind_chunk : ind_chunk + batch_size] # noqa
-            )
-        if len(args_batches) != math.ceil(tot_tasks / config.tasks_per_job):
-            raise RuntimeError("Something wrong here while batching tasks")
-
-        # Part 1/3: Iterate over chunks, prepare SlurmJob objects
-        logger.info("[multisubmit] Prepare `SlurmJob`s.")
-        jobs_to_submit = []
-        for ind_batch, chunk in enumerate(args_batches):
-            # Read prefix based on the first task of this batch
-            prefix = list_task_files[ind_batch * batch_size].prefix
-            tasks = []
-            for ind_chunk, parameters in enumerate(chunk):
-                index = (ind_batch * batch_size) + ind_chunk
-                tasks.append(
-                    SlurmTask(
+                jobs_to_submit.append(
+                    SlurmJob(
                         prefix=prefix,
-                        index=index,
-                        component=list_task_files[index].component,
                         workdir_local=workdir_local,
                         workdir_remote=workdir_remote,
-                        parameters=parameters,
-                        zarr_url=parameters["zarr_url"],
-                        task_files=list_task_files[index],
-                    ),
+                        tasks=tasks,
+                    )
                 )
-            jobs_to_submit.append(
-                SlurmJob(
-                    prefix=prefix,
-                    workdir_local=workdir_local,
-                    workdir_remote=workdir_remote,
-                    tasks=tasks,
+
+            # NOTE: see issue 2431
+            logger.debug("[multisubmit] Transfer files and submit jobs.")
+            for slurm_job in jobs_to_submit:
+                self._submit_single_sbatch(
+                    func,
+                    slurm_job=slurm_job,
+                    slurm_config=config,
                 )
-            )

-        # NOTE: see issue 2431
-        logger.info("[multisubmit] Transfer files and submit jobs.")
-        for slurm_job in jobs_to_submit:
-            self._submit_single_sbatch(
-                func,
-                slurm_job=slurm_job,
-                slurm_config=config,
-            )
+            logger.info(f"[multisubmit] END submission phase, {self.job_ids=}")

-        logger.info(f"END submission phase, {self.job_ids=}")
+            create_accounting_record_slurm(
+                user_id=user_id,
+                slurm_job_ids=self.job_ids,
+            )

-        settings = Inject(get_settings)
-        sleep_time = settings.FRACTAL_SLURM_INTERVAL_BEFORE_RETRIEVAL
-        logger.warning(f"[submit] Now sleep {sleep_time} seconds.")
-        time.sleep(sleep_time)
+            settings = Inject(get_settings)
+            sleep_time = settings.FRACTAL_SLURM_INTERVAL_BEFORE_RETRIEVAL
+            logger.warning(f"[multisubmit] Now sleep {sleep_time} seconds.")
+            time.sleep(sleep_time)
+        except Exception as e:
+            logger.error(
+                "[multisubmit] Unexpected exception during submission."
+                f" Original error {str(e)}"
+            )
+            self.scancel_jobs()
+            if task_type == "parallel":
+                with next(get_sync_db()) as db:
+                    bulk_update_status_of_history_unit(
+                        history_unit_ids=history_unit_ids,
+                        status=HistoryUnitStatus.FAILED,
+                        db_sync=db,
+                    )
+            results = {}
+            exceptions = {ind: e for ind in range(len(list_parameters))}
+            return results, exceptions

         # Retrieval phase
-        logger.info("[multisubmit] START retrieval phase")
+        logger.debug("[multisubmit] START retrieval phase")
         scancelled_job_ids = []
         while len(self.jobs) > 0:
             # Look for finished jobs
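`multisubmit` follows the same pattern for its submission phase: a failure inside the try block cancels the submitted jobs, marks parallel-task history units as FAILED, and returns an `exceptions` dict that maps every input index to the same exception, so the caller sees a uniform per-task outcome. Illustrative sketch of that return contract (stand-in function name):

    from typing import Any


    def failed_multisubmit_outcome(
        list_parameters: list[dict], error: BaseException
    ) -> tuple[dict[int, Any], dict[int, BaseException]]:
        results: dict[int, Any] = {}
        exceptions: dict[int, BaseException] = {
            ind: error for ind in range(len(list_parameters))
        }
        return results, exceptions


    _, exceptions = failed_multisubmit_outcome(
        [{"zarr_url": "/tmp/a.zarr"}, {"zarr_url": "/tmp/b.zarr"}],
        RuntimeError("sbatch failed"),
    )
    print(exceptions)  # both indices map to the same RuntimeError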
@@ -687,20 +736,46 @@ class BaseSlurmRunner(BaseRunner):
             finished_jobs = [
                 self.jobs[_slurm_job_id] for _slurm_job_id in finished_job_ids
             ]
-            self._fetch_artifacts(finished_jobs)
+            fetch_artifacts_exception = None
+            try:
+                self._fetch_artifacts(finished_jobs)
+            except Exception as e:
+                logger.error(
+                    "[multisubmit] Unexpected exception in "
+                    "`_fetch_artifacts`. "
+                    f"Original error: {str(e)}"
+                )
+                fetch_artifacts_exception = e

             with next(get_sync_db()) as db:
                 for slurm_job_id in finished_job_ids:
-                    logger.info(f"[multisubmit] Now process {slurm_job_id=}")
+                    logger.debug(f"[multisubmit] Now process {slurm_job_id=}")
                     slurm_job = self.jobs.pop(slurm_job_id)
                     for task in slurm_job.tasks:
-                        logger.info(f"[multisubmit] Now process {task.index=}")
-                        was_job_scancelled = slurm_job_id in scancelled_job_ids
-                        result, exception = self._postprocess_single_task(
-                            task=task,
-                            was_job_scancelled=was_job_scancelled,
+                        logger.debug(
+                            f"[multisubmit] Now process {task.index=}"
                         )
-
+                        was_job_scancelled = slurm_job_id in scancelled_job_ids
+                        if fetch_artifacts_exception is not None:
+                            result = None
+                            exception = fetch_artifacts_exception
+                        else:
+                            try:
+                                (
+                                    result,
+                                    exception,
+                                ) = self._postprocess_single_task(
+                                    task=task,
+                                    was_job_scancelled=was_job_scancelled,
+                                )
+                            except Exception as e:
+                                logger.error(
+                                    "[multisubmit] Unexpected exception in "
+                                    "`_postprocess_single_task`. "
+                                    f"Original error: {str(e)}"
+                                )
+                                result = None
+                                exception = e
                         # Note: the relevant done/failed check is based on
                         # whether `exception is None`. The fact that
                         # `result is None` is not relevant for this purpose.
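In the retrieval phase, a `_fetch_artifacts` failure is now captured once and reused as the exception for every task of the affected jobs, and `_postprocess_single_task` failures are caught per task, so one bad job no longer aborts the whole retrieval loop. A stand-in sketch of that propagation logic:

    def postprocess_with_fallback(tasks, fetch_artifacts_exception, postprocess):
        """Return {task_index: (result, exception)} for one SLURM job."""
        outcomes = {}
        for task_index, task in enumerate(tasks):
            if fetch_artifacts_exception is not None:
                outcomes[task_index] = (None, fetch_artifacts_exception)
            else:
                try:
                    outcomes[task_index] = (postprocess(task), None)
                except Exception as e:
                    outcomes[task_index] = (None, e)
        return outcomes


    print(postprocess_with_fallback(["t0", "t1"], None, lambda t: f"ok-{t}"))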
@@ -728,7 +803,7 @@ class BaseSlurmRunner(BaseRunner):
             if len(self.jobs) > 0:
                 scancelled_job_ids = self.wait_and_check_shutdown()

-        logger.info("[multisubmit] END")
+        logger.debug("[multisubmit] END")
         return results, exceptions

     def check_fractal_server_versions(self) -> None:
@@ -763,16 +838,15 @@ class BaseSlurmRunner(BaseRunner):

     def scancel_jobs(self) -> list[str]:
         logger.info("[scancel_jobs] START")
-
+        scancelled_job_ids = self.job_ids
         if self.jobs:
-            scancelled_job_ids = self.job_ids
             scancel_string = " ".join(scancelled_job_ids)
             scancel_cmd = f"scancel {scancel_string}"
-            logger.warning(f"Now scancel-ing SLURM jobs {scancel_string}")
+            logger.warning(f"[scancel_jobs] {scancel_string}")
             try:
                 self._run_remote_cmd(scancel_cmd)
             except Exception as e:
-                logger.warning(
+                logger.error(
                     "[scancel_jobs] `scancel` command failed. "
                     f"Original error:\n{str(e)}"
                 )
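Finally, `scancel_jobs` now computes `scancelled_job_ids` before the `if self.jobs:` check, so it is defined even when there are no active jobs, and a failed `scancel` is logged at error level. The batched command itself is simply the space-joined job IDs (illustrative snippet):

    scancelled_job_ids = ["101", "102", "103"]
    scancel_string = " ".join(scancelled_job_ids)
    scancel_cmd = f"scancel {scancel_string}"
    print(scancel_cmd)  # scancel 101 102 103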