fractal-server 2.14.0a10__py3-none-any.whl → 2.14.0a12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (30)
  1. fractal_server/__init__.py +1 -1
  2. fractal_server/app/routes/api/v2/submit.py +1 -1
  3. fractal_server/app/runner/components.py +0 -3
  4. fractal_server/app/runner/exceptions.py +4 -0
  5. fractal_server/app/runner/executors/base_runner.py +38 -17
  6. fractal_server/app/runner/executors/local/{_local_config.py → get_local_config.py} +0 -7
  7. fractal_server/app/runner/executors/local/runner.py +109 -59
  8. fractal_server/app/runner/executors/slurm_common/_check_jobs_status.py +4 -0
  9. fractal_server/app/runner/executors/slurm_ssh/executor.py +7 -5
  10. fractal_server/app/runner/executors/slurm_ssh/runner.py +6 -10
  11. fractal_server/app/runner/executors/slurm_sudo/runner.py +196 -99
  12. fractal_server/app/runner/task_files.py +8 -0
  13. fractal_server/app/runner/v2/__init__.py +0 -366
  14. fractal_server/app/runner/v2/_local.py +2 -2
  15. fractal_server/app/runner/v2/_slurm_ssh.py +2 -2
  16. fractal_server/app/runner/v2/_slurm_sudo.py +2 -2
  17. fractal_server/app/runner/v2/db_tools.py +87 -0
  18. fractal_server/app/runner/v2/runner.py +77 -81
  19. fractal_server/app/runner/v2/runner_functions.py +274 -436
  20. fractal_server/app/runner/v2/runner_functions_low_level.py +37 -39
  21. fractal_server/app/runner/v2/submit_workflow.py +366 -0
  22. fractal_server/app/runner/v2/task_interface.py +31 -0
  23. {fractal_server-2.14.0a10.dist-info → fractal_server-2.14.0a12.dist-info}/METADATA +1 -1
  24. {fractal_server-2.14.0a10.dist-info → fractal_server-2.14.0a12.dist-info}/RECORD +27 -28
  25. fractal_server/app/runner/executors/local/_submit_setup.py +0 -46
  26. fractal_server/app/runner/executors/slurm_common/_submit_setup.py +0 -84
  27. fractal_server/app/runner/v2/_db_tools.py +0 -48
  28. {fractal_server-2.14.0a10.dist-info → fractal_server-2.14.0a12.dist-info}/LICENSE +0 -0
  29. {fractal_server-2.14.0a10.dist-info → fractal_server-2.14.0a12.dist-info}/WHEEL +0 -0
  30. {fractal_server-2.14.0a10.dist-info → fractal_server-2.14.0a12.dist-info}/entry_points.txt +0 -0
fractal_server/app/runner/executors/slurm_sudo/runner.py
@@ -9,19 +9,18 @@ import time
  from copy import copy
  from pathlib import Path
  from typing import Any
+ from typing import Literal
  from typing import Optional

  import cloudpickle
  from pydantic import BaseModel
  from pydantic import ConfigDict

- from ..slurm_common._check_jobs_status import (
-     get_finished_jobs,
- )
+ from ..slurm_common._check_jobs_status import get_finished_jobs
  from ._subprocess_run_as_user import _mkdir_as_user
  from ._subprocess_run_as_user import _run_command_as_user
  from fractal_server import __VERSION__
- from fractal_server.app.runner.components import _COMPONENT_KEY_
+ from fractal_server.app.db import get_sync_db
  from fractal_server.app.runner.exceptions import JobExecutionError
  from fractal_server.app.runner.exceptions import TaskExecutionError
  from fractal_server.app.runner.executors.base_runner import BaseRunner
@@ -33,7 +32,8 @@ from fractal_server.app.runner.executors.slurm_common._slurm_config import (
  )
  from fractal_server.app.runner.filenames import SHUTDOWN_FILENAME
  from fractal_server.app.runner.task_files import TaskFiles
- from fractal_server.app.schemas.v2.task import TaskTypeType
+ from fractal_server.app.runner.v2.db_tools import update_status_of_history_unit
+ from fractal_server.app.schemas.v2 import HistoryUnitStatus
  from fractal_server.config import get_settings
  from fractal_server.logger import set_logger
  from fractal_server.syringe import Inject
@@ -100,48 +100,68 @@ class SlurmJob(BaseModel):
      tasks: list[SlurmTask]

      @property
-     def slurm_log_file_local(self) -> str:
+     def slurm_submission_script_local(self) -> str:
+         return (
+             self.workdir_local / f"slurm-{self.label}-submit.sh"
+         ).as_posix()
+
+     @property
+     def slurm_submission_script_remote(self) -> str:
+         return (
+             self.workdir_remote / f"slurm-{self.label}-submit.sh"
+         ).as_posix()
+
+     @property
+     def slurm_stdout_remote(self) -> str:
          if self.slurm_job_id:
              return (
-                 self.workdir_local
-                 / f"slurm-{self.label}-{self.slurm_job_id}.log"
+                 self.workdir_remote
+                 / f"slurm-{self.label}-{self.slurm_job_id}.out"
              ).as_posix()
+
          else:
              return (
-                 self.workdir_local / f"slurm-{self.label}-%j.log"
+                 self.workdir_remote / f"slurm-{self.label}-%j.out"
              ).as_posix()

      @property
-     def slurm_log_file_remote(self) -> str:
+     def slurm_stderr_remote(self) -> str:
          if self.slurm_job_id:
              return (
                  self.workdir_remote
-                 / f"slurm-{self.label}-{self.slurm_job_id}.log"
+                 / f"slurm-{self.label}-{self.slurm_job_id}.err"
              ).as_posix()
+
          else:
              return (
-                 self.workdir_remote / f"slurm-{self.label}-%j.log"
+                 self.workdir_remote / f"slurm-{self.label}-%j.err"
              ).as_posix()

      @property
-     def slurm_submission_script_local(self) -> str:
-         return (
-             self.workdir_local / f"slurm-{self.label}-submit.sh"
-         ).as_posix()
+     def slurm_stdout_local(self) -> str:
+         if self.slurm_job_id:
+             return (
+                 self.workdir_local
+                 / f"slurm-{self.label}-{self.slurm_job_id}.out"
+             ).as_posix()

-     @property
-     def slurm_submission_script_remote(self) -> str:
-         return (
-             self.workdir_remote / f"slurm-{self.label}-submit.sh"
-         ).as_posix()
+         else:
+             return (
+                 self.workdir_local / f"slurm-{self.label}-%j.out"
+             ).as_posix()

      @property
-     def slurm_stdout(self) -> str:
-         return (self.workdir_remote / f"slurm-{self.label}.out").as_posix()
+     def slurm_stderr_local(self) -> str:
+         if self.slurm_job_id:
+             return (
+                 self.workdir_local
+                 / f"slurm-{self.label}-{self.slurm_job_id}.err"
+             ).as_posix()

-     @property
-     def slurm_stderr(self) -> str:
-         return (self.workdir_remote / f"slurm-{self.label}.err").as_posix()
+         else:
+             return (
+                 self.workdir_local / f"slurm-{self.label}-%j.err"
+             ).as_posix()

      @property
      def log_files_local(self) -> list[str]:
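
The hunk above replaces the single `slurm-{label}-{job_id}.log` file with separate stdout/stderr files, each in a local and a remote flavor. A minimal standalone sketch (not the package's code; the `label` value below is illustrative) of the resulting naming convention:

```python
from pathlib import Path
from typing import Optional


def slurm_output_paths(
    workdir: Path, label: str, slurm_job_id: Optional[str] = None
) -> tuple[str, str]:
    # Before submission the SLURM job id is unknown, so the "%j" placeholder is
    # used and SLURM expands it; afterwards the concrete id is embedded.
    job_part = slurm_job_id if slurm_job_id else "%j"
    stdout = (workdir / f"slurm-{label}-{job_part}.out").as_posix()
    stderr = (workdir / f"slurm-{label}-{job_part}.err").as_posix()
    return stdout, stderr


print(slurm_output_paths(Path("/remote/workdir"), "batch-000000"))
print(slurm_output_paths(Path("/remote/workdir"), "batch-000000", slurm_job_id="1234"))
```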
@@ -287,6 +307,7 @@ class RunnerSlurmSudo(BaseRunner):
          slurm_job: SlurmJob,
          slurm_config: SlurmConfig,
      ) -> str:
+         logger.debug("[_submit_single_sbatch] START")
          # Prepare input pickle(s)
          versions = dict(
              python=sys.version_info[:3],
@@ -295,10 +316,17 @@ class RunnerSlurmSudo(BaseRunner):
          )
          for task in slurm_job.tasks:
              _args = []
-             _kwargs = dict(parameters=task.parameters)
+             _kwargs = dict(
+                 parameters=task.parameters,
+                 remote_files=task.task_files.remote_files_dict,
+             )
              funcser = cloudpickle.dumps((versions, func, _args, _kwargs))
              with open(task.input_pickle_file_local, "wb") as f:
                  f.write(funcser)
+             logger.debug(
+                 "[_submit_single_sbatch] Written "
+                 f"{task.input_pickle_file_local=}"
+             )
          # Prepare commands to be included in SLURM submission script
          settings = Inject(get_settings)
          python_worker_interpreter = (
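
Each task's input pickle now carries the remote file locations alongside the task parameters. A hedged sketch of the payload round-trip, assuming (as the diff suggests) that the worker side unpacks the same `(versions, func, args, kwargs)` tuple; the function and paths below are illustrative stand-ins:

```python
import sys

import cloudpickle


def fake_task(parameters: dict, remote_files: dict) -> dict:
    # Illustrative stand-in for the real task function.
    return {"n_params": len(parameters), "log": remote_files["log_file_remote"]}


versions = dict(python=sys.version_info[:3], cloudpickle=cloudpickle.__version__)
_args = []
_kwargs = dict(
    parameters={"zarr_url": "/data/plate.zarr/A/01/0"},
    remote_files={
        "args_file_remote": "/remote/workdir/0-args.json",
        "metadiff_file_remote": "/remote/workdir/0-metadiff.json",
        "log_file_remote": "/remote/workdir/0-log.txt",
    },
)
funcser = cloudpickle.dumps((versions, fake_task, _args, _kwargs))

# The worker-side counterpart would then load the tuple and call the function.
_, func, args, kwargs = cloudpickle.loads(funcser)
print(func(*args, **kwargs))
```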
@@ -335,8 +363,8 @@ class RunnerSlurmSudo(BaseRunner):
          # fix their order
          script_lines.extend(
              [
-                 f"#SBATCH --err={slurm_job.slurm_stderr}",
-                 f"#SBATCH --out={slurm_job.slurm_stdout}",
+                 f"#SBATCH --out={slurm_job.slurm_stdout_remote}",
+                 f"#SBATCH --err={slurm_job.slurm_stderr_remote}",
                  f"#SBATCH -D {slurm_job.workdir_remote}",
              ]
          )
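
With the split stdout/stderr properties, the `#SBATCH` preamble now points at the remote `.out`/`.err` paths. A rough, simplified sketch (not the full script builder) of how these directives fit together:

```python
def build_sbatch_preamble(
    stdout_remote: str, stderr_remote: str, workdir_remote: str
) -> str:
    # Only the directives touched by this diff; the real script adds more options.
    script_lines = [
        "#!/bin/bash",
        f"#SBATCH --out={stdout_remote}",
        f"#SBATCH --err={stderr_remote}",
        f"#SBATCH -D {workdir_remote}",
    ]
    return "\n".join(script_lines)


print(
    build_sbatch_preamble(
        stdout_remote="/remote/workdir/slurm-batch-000000-%j.out",
        stderr_remote="/remote/workdir/slurm-batch-000000-%j.err",
        workdir_remote="/remote/workdir",
    )
)
```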
@@ -394,8 +422,10 @@ class RunnerSlurmSudo(BaseRunner):
          """
          Note: this would differ for SSH
          """
+         logger.debug(f"[_copy_files_from_remote_to_local] {job.slurm_job_id=}")
          source_target_list = [
-             (job.slurm_log_file_remote, job.slurm_log_file_local)
+             (job.slurm_stdout_remote, job.slurm_stdout_local),
+             (job.slurm_stderr_remote, job.slurm_stderr_local),
          ]
          for task in job.tasks:
              source_target_list.extend(
@@ -463,21 +493,22 @@ class RunnerSlurmSudo(BaseRunner):
          self,
          func: callable,
          parameters: dict[str, Any],
-         history_item_id: int,
+         history_unit_id: int,
          task_files: TaskFiles,
-         slurm_config: SlurmConfig,
-         task_type: TaskTypeType,
+         task_type: Literal[
+             "non_parallel",
+             "converter_non_parallel",
+             "compound",
+             "converter_compound",
+         ],
+         config: SlurmConfig,
      ) -> tuple[Any, Exception]:
-         workdir_local = task_files.wftask_subfolder_local
-         workdir_remote = task_files.wftask_subfolder_remote

-         task_files = TaskFiles(
-             **task_files.model_dump(
-                 exclude={"component"},
-             ),
-             component=parameters[_COMPONENT_KEY_],
-         )
+         if len(self.jobs) > 0:
+             raise RuntimeError(f"Cannot run .submit when {len(self.jobs)=}")

+         workdir_local = task_files.wftask_subfolder_local
+         workdir_remote = task_files.wftask_subfolder_remote
          if self.jobs != {}:
              raise JobExecutionError("Unexpected branch: jobs should be empty.")

@@ -485,7 +516,7 @@ class RunnerSlurmSudo(BaseRunner):
              raise JobExecutionError("Cannot continue after shutdown.")

          # Validation phase
-         self.validate_submit_parameters(parameters)
+         self.validate_submit_parameters(parameters, task_type=task_type)

          # Create task subfolder
          original_umask = os.umask(0)
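
Taken together, the hunks above rework the `submit()` interface. A consolidated sketch of the new signature (signature only; keyword names mirrored from the diff, with `TaskFiles` and `SlurmConfig` left as forward references):

```python
from typing import Any, Literal


def submit(
    self,
    func: callable,
    parameters: dict[str, Any],
    history_unit_id: int,  # was: history_item_id
    task_files: "TaskFiles",
    task_type: Literal[
        "non_parallel",
        "converter_non_parallel",
        "compound",
        "converter_compound",
    ],
    config: "SlurmConfig",  # was: slurm_config, previously listed before task_type
) -> tuple[Any, Exception]:
    ...
```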
@@ -504,35 +535,58 @@ class RunnerSlurmSudo(BaseRunner):
              tasks=[
                  SlurmTask(
                      index=0,
-                     component="0",
+                     component=task_files.component,
                      parameters=parameters,
                      workdir_remote=workdir_remote,
                      workdir_local=workdir_local,
                      task_files=task_files,
                  )
              ],
-         )  # TODO: replace with actual values (BASED ON TASKFILES)
-
-         slurm_config.parallel_tasks_per_job = 1
+         )
+         config.parallel_tasks_per_job = 1
          self._submit_single_sbatch(
              func,
              slurm_job=slurm_job,
-             slurm_config=slurm_config,
+             slurm_config=config,
          )
+         logger.info(f"END submission phase, {self.job_ids=}")

-         # LOGFILE = task_files.log_file_local
+         # FIXME: Replace with more robust/efficient logic
+         logger.warning("Now sleep 4 (FIXME)")
+         time.sleep(4)

          # Retrieval phase
+         logger.info("START retrieval phase")
          while len(self.jobs) > 0:
              if self.is_shutdown():
                  self.scancel_jobs()
              finished_job_ids = get_finished_jobs(job_ids=self.job_ids)
-             for slurm_job_id in finished_job_ids:
-                 slurm_job = self.jobs.pop(slurm_job_id)
-                 self._copy_files_from_remote_to_local(slurm_job)
-                 result, exception = self._postprocess_single_task(
-                     task=slurm_job.tasks[0]
-                 )
+             logger.debug(f"{finished_job_ids=}")
+             with next(get_sync_db()) as db:
+                 for slurm_job_id in finished_job_ids:
+                     logger.debug(f"Now process {slurm_job_id=}")
+                     slurm_job = self.jobs.pop(slurm_job_id)
+                     self._copy_files_from_remote_to_local(slurm_job)
+                     result, exception = self._postprocess_single_task(
+                         task=slurm_job.tasks[0]
+                     )
+                     # Note: the relevant done/failed check is based on
+                     # whether `exception is None`. The fact that
+                     # `result is None` is not relevant for this purpose.
+                     if exception is not None:
+                         update_status_of_history_unit(
+                             history_unit_id=history_unit_id,
+                             status=HistoryUnitStatus.FAILED,
+                             db_sync=db,
+                         )
+                     else:
+                         if task_type not in ["compound", "converter_compound"]:
+                             update_status_of_history_unit(
+                                 history_unit_id=history_unit_id,
+                                 status=HistoryUnitStatus.DONE,
+                                 db_sync=db,
+                             )
+
              time.sleep(self.slurm_poll_interval)

          return result, exception
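
A condensed restatement of the new retrieval phase (illustrative only; the real method above interleaves logging and shutdown handling): poll for finished SLURM jobs, post-process the single task, and mark the history unit FAILED on any exception, or DONE for non-compound task types. Import paths are taken from the diff itself.

```python
import time

from fractal_server.app.db import get_sync_db
from fractal_server.app.runner.executors.slurm_common._check_jobs_status import (
    get_finished_jobs,
)
from fractal_server.app.runner.v2.db_tools import update_status_of_history_unit
from fractal_server.app.schemas.v2 import HistoryUnitStatus


def wait_and_record(runner, history_unit_id: int, task_type: str):
    # Simplified sketch of RunnerSlurmSudo.submit()'s retrieval phase.
    result, exception = None, None
    while len(runner.jobs) > 0:
        finished_job_ids = get_finished_jobs(job_ids=runner.job_ids)
        with next(get_sync_db()) as db:
            for slurm_job_id in finished_job_ids:
                slurm_job = runner.jobs.pop(slurm_job_id)
                runner._copy_files_from_remote_to_local(slurm_job)
                result, exception = runner._postprocess_single_task(
                    task=slurm_job.tasks[0]
                )
                if exception is not None:
                    update_status_of_history_unit(
                        history_unit_id=history_unit_id,
                        status=HistoryUnitStatus.FAILED,
                        db_sync=db,
                    )
                elif task_type not in ["compound", "converter_compound"]:
                    update_status_of_history_unit(
                        history_unit_id=history_unit_id,
                        status=HistoryUnitStatus.DONE,
                        db_sync=db,
                    )
        time.sleep(runner.slurm_poll_interval)
    return result, exception
```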
@@ -541,22 +595,35 @@ class RunnerSlurmSudo(BaseRunner):
          self,
          func: callable,
          list_parameters: list[dict],
-         history_item_id: int,
-         task_files: TaskFiles,
-         slurm_config: SlurmConfig,
-         task_type: TaskTypeType,
+         history_unit_ids: list[int],
+         list_task_files: list[TaskFiles],
+         task_type: Literal["parallel", "compound", "converter_compound"],
+         config: SlurmConfig,
      ):
-         # self.scancel_jobs()
+
+         if len(self.jobs) > 0:
+             raise RuntimeError(
+                 f"Cannot run .multisubmit when {len(self.jobs)=}"
+             )

          self.validate_multisubmit_parameters(
-             list_parameters=list_parameters, task_type=task_type
+             list_parameters=list_parameters,
+             task_type=task_type,
+             list_task_files=list_task_files,
+         )
+         self.validate_multisubmit_history_unit_ids(
+             history_unit_ids=history_unit_ids,
+             task_type=task_type,
+             list_parameters=list_parameters,
          )

-         workdir_local = task_files.wftask_subfolder_local
-         workdir_remote = task_files.wftask_subfolder_remote
+         logger.debug(f"[multisubmit] START, {len(list_parameters)=}")
+
+         workdir_local = list_task_files[0].wftask_subfolder_local
+         workdir_remote = list_task_files[0].wftask_subfolder_remote

          # Create local&remote task subfolders
-         if task_type not in ["converter_compound", "compound"]:
+         if task_type == "parallel":
              original_umask = os.umask(0)
              workdir_local.mkdir(parents=True, mode=0o755)
              os.umask(original_umask)
@@ -571,7 +638,7 @@ class RunnerSlurmSudo(BaseRunner):
          results: dict[int, Any] = {}
          exceptions: dict[int, BaseException] = {}

-         original_task_files = task_files
+         original_task_files = list_task_files
          tot_tasks = len(list_parameters)

          # Set/validate parameters for task batching
@@ -579,21 +646,21 @@ class RunnerSlurmSudo(BaseRunner):
              # Number of parallel components (always known)
              tot_tasks=tot_tasks,
              # Optional WorkflowTask attributes:
-             tasks_per_job=slurm_config.tasks_per_job,
-             parallel_tasks_per_job=slurm_config.parallel_tasks_per_job,  # noqa
+             tasks_per_job=config.tasks_per_job,
+             parallel_tasks_per_job=config.parallel_tasks_per_job,  # noqa
              # Task requirements (multiple possible sources):
-             cpus_per_task=slurm_config.cpus_per_task,
-             mem_per_task=slurm_config.mem_per_task_MB,
+             cpus_per_task=config.cpus_per_task,
+             mem_per_task=config.mem_per_task_MB,
              # Fractal configuration variables (soft/hard limits):
-             target_cpus_per_job=slurm_config.target_cpus_per_job,
-             target_mem_per_job=slurm_config.target_mem_per_job,
-             target_num_jobs=slurm_config.target_num_jobs,
-             max_cpus_per_job=slurm_config.max_cpus_per_job,
-             max_mem_per_job=slurm_config.max_mem_per_job,
-             max_num_jobs=slurm_config.max_num_jobs,
+             target_cpus_per_job=config.target_cpus_per_job,
+             target_mem_per_job=config.target_mem_per_job,
+             target_num_jobs=config.target_num_jobs,
+             max_cpus_per_job=config.max_cpus_per_job,
+             max_mem_per_job=config.max_mem_per_job,
+             max_num_jobs=config.max_num_jobs,
          )
-         slurm_config.parallel_tasks_per_job = parallel_tasks_per_job
-         slurm_config.tasks_per_job = tasks_per_job
+         config.parallel_tasks_per_job = parallel_tasks_per_job
+         config.tasks_per_job = tasks_per_job

          # Divide arguments in batches of `tasks_per_job` tasks each
          args_batches = []
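
The batching heuristics above fix `tasks_per_job` (how many components share one SLURM job) and `parallel_tasks_per_job` (how many of them run concurrently). A minimal illustration of the subsequent splitting step, assuming simple consecutive chunking:

```python
def split_into_batches(
    list_parameters: list[dict], tasks_per_job: int
) -> list[list[dict]]:
    # Consecutive chunks of at most `tasks_per_job` items, one SLURM job each.
    return [
        list_parameters[ind : ind + tasks_per_job]
        for ind in range(0, len(list_parameters), tasks_per_job)
    ]


params = [{"zarr_url": f"/data/plate.zarr/A/01/{i}"} for i in range(5)]
print([len(batch) for batch in split_into_batches(params, tasks_per_job=2)])  # [2, 2, 1]
```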
@@ -607,24 +674,18 @@ class RunnerSlurmSudo(BaseRunner):

          logger.info(f"START submission phase, {list(self.jobs.keys())=}")
          for ind_batch, chunk in enumerate(args_batches):
-             # TODO: replace with actual values
              tasks = []
              for ind_chunk, parameters in enumerate(chunk):
-                 component = parameters[_COMPONENT_KEY_]
+                 index = (ind_batch * batch_size) + ind_chunk
                  tasks.append(
                      SlurmTask(
-                         index=(ind_batch * batch_size) + ind_chunk,
-                         component=component,
+                         index=index,
+                         component=original_task_files[index].component,
                          workdir_local=workdir_local,
                          workdir_remote=workdir_remote,
                          parameters=parameters,
                          zarr_url=parameters["zarr_url"],
-                         task_files=TaskFiles(
-                             **original_task_files.model_dump(
-                                 exclude={"component"}
-                             ),
-                             component=component,
-                         ),
+                         task_files=original_task_files[index],
                      ),
                  )

@@ -637,26 +698,62 @@ class RunnerSlurmSudo(BaseRunner):
              self._submit_single_sbatch(
                  func,
                  slurm_job=slurm_job,
-                 slurm_config=slurm_config,
+                 slurm_config=config,
              )
-         logger.info(f"END submission phase, {list(self.jobs.keys())=}")
+         logger.info(f"END submission phase, {self.job_ids=}")
+
+         # FIXME: Replace with more robust/efficient logic
+         logger.warning("Now sleep 4 (FIXME)")
+         time.sleep(4)

          # Retrieval phase
+         logger.info("START retrieval phase")
          while len(self.jobs) > 0:
              if self.is_shutdown():
                  self.scancel_jobs()
              finished_job_ids = get_finished_jobs(job_ids=self.job_ids)
-             for slurm_job_id in finished_job_ids:
-                 slurm_job = self.jobs.pop(slurm_job_id)
-                 self._copy_files_from_remote_to_local(slurm_job)
-                 for task in slurm_job.tasks:
-                     result, exception = self._postprocess_single_task(
-                         task=task
-                     )
-                     if exception is None:
-                         results[task.index] = result
-                     else:
-                         exceptions[task.index] = exception
+             logger.debug(f"{finished_job_ids=}")
+             with next(get_sync_db()) as db:
+                 for slurm_job_id in finished_job_ids:
+                     logger.debug(f"Now processing {slurm_job_id=}")
+                     slurm_job = self.jobs.pop(slurm_job_id)
+                     self._copy_files_from_remote_to_local(slurm_job)
+                     for task in slurm_job.tasks:
+                         logger.debug(f"Now processing {task.index=}")
+                         result, exception = self._postprocess_single_task(
+                             task=task
+                         )
+
+                         # Note: the relevant done/failed check is based on
+                         # whether `exception is None`. The fact that
+                         # `result is None` is not relevant for this purpose.
+                         if exception is not None:
+                             logger.debug(
+                                 f"Task {task.index} has an exception."
+                             )  # FIXME # noqa
+                             exceptions[task.index] = exception
+                             if task_type == "parallel":
+                                 update_status_of_history_unit(
+                                     history_unit_id=history_unit_ids[
+                                         task.index
+                                     ],
+                                     status=HistoryUnitStatus.FAILED,
+                                     db_sync=db,
+                                 )
+                         else:
+                             logger.debug(
+                                 f"Task {task.index} has no exception."
+                             )  # FIXME # noqa
+                             results[task.index] = result
+                             if task_type == "parallel":
+                                 update_status_of_history_unit(
+                                     history_unit_id=history_unit_ids[
+                                         task.index
+                                     ],
+                                     status=HistoryUnitStatus.DONE,
+                                     db_sync=db,
+                                 )
+
              time.sleep(self.slurm_poll_interval)
          return results, exceptions

fractal_server/app/runner/task_files.py
@@ -96,3 +96,11 @@ class TaskFiles(BaseModel):
          return (
              self.wftask_subfolder_remote / f"{self.component}-metadiff.json"
          ).as_posix()
+
+     @property
+     def remote_files_dict(self) -> dict[str, str]:
+         return dict(
+             args_file_remote=self.args_file_remote,
+             metadiff_file_remote=self.metadiff_file_remote,
+             log_file_remote=self.log_file_remote,
+         )
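
A standalone sketch (not the package's model, whose remote paths are computed properties) showing the shape of the new `remote_files_dict` and how it bundles the three remote paths that `_submit_single_sbatch` now ships to the worker; the values are illustrative:

```python
from pydantic import BaseModel


class MiniTaskFiles(BaseModel):
    args_file_remote: str
    metadiff_file_remote: str
    log_file_remote: str

    @property
    def remote_files_dict(self) -> dict[str, str]:
        # Same body as the new TaskFiles.remote_files_dict property above.
        return dict(
            args_file_remote=self.args_file_remote,
            metadiff_file_remote=self.metadiff_file_remote,
            log_file_remote=self.log_file_remote,
        )


files = MiniTaskFiles(
    args_file_remote="/remote/workdir/0-args.json",
    metadiff_file_remote="/remote/workdir/0-metadiff.json",
    log_file_remote="/remote/workdir/0-log.txt",
)
print(files.remote_files_dict)  # passed as the `remote_files` kwarg in the pickle
```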