fractal-server 2.14.0a10__py3-none-any.whl → 2.14.0a11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (30)
  1. fractal_server/__init__.py +1 -1
  2. fractal_server/app/routes/api/v2/submit.py +1 -1
  3. fractal_server/app/runner/components.py +0 -3
  4. fractal_server/app/runner/exceptions.py +4 -0
  5. fractal_server/app/runner/executors/base_runner.py +16 -17
  6. fractal_server/app/runner/executors/local/{_local_config.py → get_local_config.py} +0 -7
  7. fractal_server/app/runner/executors/local/runner.py +117 -58
  8. fractal_server/app/runner/executors/slurm_common/_check_jobs_status.py +4 -0
  9. fractal_server/app/runner/executors/slurm_ssh/executor.py +7 -5
  10. fractal_server/app/runner/executors/slurm_ssh/runner.py +6 -10
  11. fractal_server/app/runner/executors/slurm_sudo/runner.py +201 -96
  12. fractal_server/app/runner/task_files.py +8 -0
  13. fractal_server/app/runner/v2/__init__.py +0 -366
  14. fractal_server/app/runner/v2/_local.py +2 -2
  15. fractal_server/app/runner/v2/_slurm_ssh.py +2 -2
  16. fractal_server/app/runner/v2/_slurm_sudo.py +2 -2
  17. fractal_server/app/runner/v2/db_tools.py +87 -0
  18. fractal_server/app/runner/v2/runner.py +77 -81
  19. fractal_server/app/runner/v2/runner_functions.py +274 -436
  20. fractal_server/app/runner/v2/runner_functions_low_level.py +37 -39
  21. fractal_server/app/runner/v2/submit_workflow.py +366 -0
  22. fractal_server/app/runner/v2/task_interface.py +31 -0
  23. {fractal_server-2.14.0a10.dist-info → fractal_server-2.14.0a11.dist-info}/METADATA +1 -1
  24. {fractal_server-2.14.0a10.dist-info → fractal_server-2.14.0a11.dist-info}/RECORD +27 -28
  25. fractal_server/app/runner/executors/local/_submit_setup.py +0 -46
  26. fractal_server/app/runner/executors/slurm_common/_submit_setup.py +0 -84
  27. fractal_server/app/runner/v2/_db_tools.py +0 -48
  28. {fractal_server-2.14.0a10.dist-info → fractal_server-2.14.0a11.dist-info}/LICENSE +0 -0
  29. {fractal_server-2.14.0a10.dist-info → fractal_server-2.14.0a11.dist-info}/WHEEL +0 -0
  30. {fractal_server-2.14.0a10.dist-info → fractal_server-2.14.0a11.dist-info}/entry_points.txt +0 -0
fractal_server/app/runner/executors/slurm_sudo/runner.py
@@ -9,19 +9,19 @@ import time
 from copy import copy
 from pathlib import Path
 from typing import Any
+from typing import Literal
 from typing import Optional
 
 import cloudpickle
 from pydantic import BaseModel
 from pydantic import ConfigDict
 
-from ..slurm_common._check_jobs_status import (
-    get_finished_jobs,
-)
+from ..slurm_common._check_jobs_status import get_finished_jobs
+from ..slurm_common._check_jobs_status import run_squeue
 from ._subprocess_run_as_user import _mkdir_as_user
 from ._subprocess_run_as_user import _run_command_as_user
 from fractal_server import __VERSION__
-from fractal_server.app.runner.components import _COMPONENT_KEY_
+from fractal_server.app.db import get_sync_db
 from fractal_server.app.runner.exceptions import JobExecutionError
 from fractal_server.app.runner.exceptions import TaskExecutionError
 from fractal_server.app.runner.executors.base_runner import BaseRunner
@@ -33,7 +33,8 @@ from fractal_server.app.runner.executors.slurm_common._slurm_config import (
 )
 from fractal_server.app.runner.filenames import SHUTDOWN_FILENAME
 from fractal_server.app.runner.task_files import TaskFiles
-from fractal_server.app.schemas.v2.task import TaskTypeType
+from fractal_server.app.runner.v2.db_tools import update_status_of_history_unit
+from fractal_server.app.schemas.v2 import HistoryUnitStatus
 from fractal_server.config import get_settings
 from fractal_server.logger import set_logger
 from fractal_server.syringe import Inject
@@ -100,48 +101,68 @@ class SlurmJob(BaseModel):
     tasks: list[SlurmTask]
 
     @property
-    def slurm_log_file_local(self) -> str:
+    def slurm_submission_script_local(self) -> str:
+        return (
+            self.workdir_local / f"slurm-{self.label}-submit.sh"
+        ).as_posix()
+
+    @property
+    def slurm_submission_script_remote(self) -> str:
+        return (
+            self.workdir_remote / f"slurm-{self.label}-submit.sh"
+        ).as_posix()
+
+    @property
+    def slurm_stdout_remote(self) -> str:
         if self.slurm_job_id:
             return (
-                self.workdir_local
-                / f"slurm-{self.label}-{self.slurm_job_id}.log"
+                self.workdir_remote
+                / f"slurm-{self.label}-{self.slurm_job_id}.out"
             ).as_posix()
+
         else:
             return (
-                self.workdir_local / f"slurm-{self.label}-%j.log"
+                self.workdir_remote / f"slurm-{self.label}-%j.out"
             ).as_posix()
 
     @property
-    def slurm_log_file_remote(self) -> str:
+    def slurm_stderr_remote(self) -> str:
         if self.slurm_job_id:
             return (
                 self.workdir_remote
-                / f"slurm-{self.label}-{self.slurm_job_id}.log"
+                / f"slurm-{self.label}-{self.slurm_job_id}.err"
             ).as_posix()
+
         else:
             return (
-                self.workdir_remote / f"slurm-{self.label}-%j.log"
+                self.workdir_remote / f"slurm-{self.label}-%j.err"
             ).as_posix()
 
     @property
-    def slurm_submission_script_local(self) -> str:
-        return (
-            self.workdir_local / f"slurm-{self.label}-submit.sh"
-        ).as_posix()
+    def slurm_stdout_local(self) -> str:
+        if self.slurm_job_id:
+            return (
+                self.workdir_local
+                / f"slurm-{self.label}-{self.slurm_job_id}.out"
+            ).as_posix()
 
-    @property
-    def slurm_submission_script_remote(self) -> str:
-        return (
-            self.workdir_remote / f"slurm-{self.label}-submit.sh"
-        ).as_posix()
+        else:
+            return (
+                self.workdir_local / f"slurm-{self.label}-%j.out"
+            ).as_posix()
 
     @property
-    def slurm_stdout(self) -> str:
-        return (self.workdir_remote / f"slurm-{self.label}.out").as_posix()
+    def slurm_stderr_local(self) -> str:
+        if self.slurm_job_id:
+            return (
+                self.workdir_local
+                / f"slurm-{self.label}-{self.slurm_job_id}.err"
+            ).as_posix()
 
-    @property
-    def slurm_stderr(self) -> str:
-        return (self.workdir_remote / f"slurm-{self.label}.err").as_posix()
+        else:
+            return (
+                self.workdir_local / f"slurm-{self.label}-%j.err"
+            ).as_posix()
 
     @property
     def log_files_local(self) -> list[str]:
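Note: the previous slurm_log_file_local/slurm_log_file_remote/slurm_stdout/slurm_stderr properties are replaced by four per-stream properties (stdout/stderr, local/remote) whose filenames embed the SLURM job id once it is assigned and fall back to SLURM's %j placeholder before submission. A minimal standalone sketch of the naming scheme (illustrative only, not fractal-server code; the workdir and label values are made up):

from pathlib import Path
from typing import Optional

workdir_remote = Path("/remote/workdir")  # hypothetical job subfolder
label = "batch-000000"                    # hypothetical SlurmJob.label

def slurm_stdout_remote(slurm_job_id: Optional[str] = None) -> str:
    # Mirrors the new property: use the real id once known,
    # otherwise keep SLURM's %j placeholder.
    suffix = slurm_job_id if slurm_job_id else "%j"
    return (workdir_remote / f"slurm-{label}-{suffix}.out").as_posix()

print(slurm_stdout_remote())         # /remote/workdir/slurm-batch-000000-%j.out
print(slurm_stdout_remote("12345"))  # /remote/workdir/slurm-batch-000000-12345.out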
@@ -287,6 +308,7 @@ class RunnerSlurmSudo(BaseRunner):
         slurm_job: SlurmJob,
         slurm_config: SlurmConfig,
     ) -> str:
+        logger.debug("[_submit_single_sbatch] START")
         # Prepare input pickle(s)
         versions = dict(
             python=sys.version_info[:3],
@@ -295,10 +317,17 @@ )
         )
         for task in slurm_job.tasks:
             _args = []
-            _kwargs = dict(parameters=task.parameters)
+            _kwargs = dict(
+                parameters=task.parameters,
+                remote_files=task.task_files.remote_files_dict,
+            )
             funcser = cloudpickle.dumps((versions, func, _args, _kwargs))
             with open(task.input_pickle_file_local, "wb") as f:
                 f.write(funcser)
+            logger.debug(
+                "[_submit_single_sbatch] Written "
+                f"{task.input_pickle_file_local=}"
+            )
         # Prepare commands to be included in SLURM submission script
         settings = Inject(get_settings)
         python_worker_interpreter = (
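Note: each task's input pickle now carries a remote_files dict (mirroring the new TaskFiles.remote_files_dict property added in task_files.py at the end of this diff) alongside the task parameters. A short sketch of the payload shape (illustrative, assuming cloudpickle is installed; the function, parameter values, and paths are invented):

import sys
import cloudpickle

def fake_task(parameters: dict, remote_files: dict) -> dict:
    # Stand-in for the real task function shipped to the SLURM worker.
    return {"parameters": parameters, "remote_files": remote_files}

versions = dict(python=sys.version_info[:3])  # the real dict also records
                                              # cloudpickle/fractal versions
_args = []
_kwargs = dict(
    parameters={"zarr_url": "/tmp/plate.zarr/A/01/0"},
    remote_files={
        "args_file_remote": "/remote/subfolder/0-args.json",
        "metadiff_file_remote": "/remote/subfolder/0-metadiff.json",
        "log_file_remote": "/remote/subfolder/0-log.txt",
    },
)
funcser = cloudpickle.dumps((versions, fake_task, _args, _kwargs))
with open("/tmp/input.pickle", "wb") as f:
    f.write(funcser)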
@@ -335,8 +364,8 @@
         # fix their order
         script_lines.extend(
             [
-                f"#SBATCH --err={slurm_job.slurm_stderr}",
-                f"#SBATCH --out={slurm_job.slurm_stdout}",
+                f"#SBATCH --out={slurm_job.slurm_stdout_remote}",
+                f"#SBATCH --err={slurm_job.slurm_stderr_remote}",
                 f"#SBATCH -D {slurm_job.workdir_remote}",
             ]
         )
@@ -394,8 +423,10 @@
         """
         Note: this would differ for SSH
         """
+        logger.debug(f"[_copy_files_from_remote_to_local] {job.slurm_job_id=}")
         source_target_list = [
-            (job.slurm_log_file_remote, job.slurm_log_file_local)
+            (job.slurm_stdout_remote, job.slurm_stdout_local),
+            (job.slurm_stderr_remote, job.slurm_stderr_local),
         ]
         for task in job.tasks:
             source_target_list.extend(
@@ -463,21 +494,22 @@
         self,
         func: callable,
         parameters: dict[str, Any],
-        history_item_id: int,
+        history_unit_id: int,
         task_files: TaskFiles,
-        slurm_config: SlurmConfig,
-        task_type: TaskTypeType,
+        task_type: Literal[
+            "non_parallel",
+            "converter_non_parallel",
+            "compound",
+            "converter_compound",
+        ],
+        config: SlurmConfig,
     ) -> tuple[Any, Exception]:
-        workdir_local = task_files.wftask_subfolder_local
-        workdir_remote = task_files.wftask_subfolder_remote
 
-        task_files = TaskFiles(
-            **task_files.model_dump(
-                exclude={"component"},
-            ),
-            component=parameters[_COMPONENT_KEY_],
-        )
+        if len(self.jobs) > 0:
+            raise RuntimeError(f"Cannot run .submit when {len(self.jobs)=}")
 
+        workdir_local = task_files.wftask_subfolder_local
+        workdir_remote = task_files.wftask_subfolder_remote
         if self.jobs != {}:
             raise JobExecutionError("Unexpected branch: jobs should be empty.")
 
@@ -485,7 +517,7 @@
             raise JobExecutionError("Cannot continue after shutdown.")
 
         # Validation phase
-        self.validate_submit_parameters(parameters)
+        self.validate_submit_parameters(parameters, task_type=task_type)
 
         # Create task subfolder
         original_umask = os.umask(0)
@@ -504,7 +536,7 @@
             tasks=[
                 SlurmTask(
                     index=0,
-                    component="0",
+                    component=task_files.component,
                     parameters=parameters,
                     workdir_remote=workdir_remote,
                     workdir_local=workdir_local,
@@ -513,26 +545,56 @@
             ],
         )  # TODO: replace with actual values (BASED ON TASKFILES)
 
-        slurm_config.parallel_tasks_per_job = 1
+        config.parallel_tasks_per_job = 1
         self._submit_single_sbatch(
             func,
             slurm_job=slurm_job,
-            slurm_config=slurm_config,
+            slurm_config=config,
         )
-
-        # LOGFILE = task_files.log_file_local
+        logger.debug("END SUBMISSION PHASE")
+        logger.debug(f"{self.jobs=}")
+        logger.debug(f"{self.job_ids=}")
+
+        # FIXME
+        jobs_that_started = set()
+        while len(jobs_that_started) != len(self.job_ids):
+            logger.debug("CALL SQUEUE")
+            res = run_squeue(self.job_ids)
+            new_jobs = set(out.split()[0] for out in res.stdout.splitlines())
+            jobs_that_started = jobs_that_started.union(new_jobs)
+            logger.debug(f"{new_jobs=}")
+            logger.debug(f"{len(jobs_that_started)=}")
+
+        logger.debug("START RETRIEVAL PHASE")
 
         # Retrieval phase
         while len(self.jobs) > 0:
             if self.is_shutdown():
                 self.scancel_jobs()
             finished_job_ids = get_finished_jobs(job_ids=self.job_ids)
-            for slurm_job_id in finished_job_ids:
-                slurm_job = self.jobs.pop(slurm_job_id)
-                self._copy_files_from_remote_to_local(slurm_job)
-                result, exception = self._postprocess_single_task(
-                    task=slurm_job.tasks[0]
-                )
+            logger.debug(f"{finished_job_ids=}")
+            with next(get_sync_db()) as db:
+                for slurm_job_id in finished_job_ids:
+                    logger.debug(f"Now process {slurm_job_id=}")
+                    slurm_job = self.jobs.pop(slurm_job_id)
+                    self._copy_files_from_remote_to_local(slurm_job)
+                    result, exception = self._postprocess_single_task(
+                        task=slurm_job.tasks[0]
+                    )
+                    if result is not None:
+                        if task_type not in ["compound", "converter_compound"]:
+                            update_status_of_history_unit(
+                                history_unit_id=history_unit_id,
+                                status=HistoryUnitStatus.DONE,
+                                db_sync=db,
+                            )
+                    if exception is not None:
+                        update_status_of_history_unit(
+                            history_unit_id=history_unit_id,
+                            status=HistoryUnitStatus.FAILED,
+                            db_sync=db,
+                        )
+
             time.sleep(self.slurm_poll_interval)
 
         return result, exception
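Note: submit now blocks until every submitted job id shows up in squeue before entering the retrieval loop; the same startup-wait pattern appears again in multisubmit below. The run_squeue helper itself is not part of this diff, so the following is only a rough standalone approximation of the pattern, calling squeue directly (job-id list and polling interval are invented):

import subprocess
import time

def wait_until_jobs_visible(job_ids: list[str], interval: float = 1.0) -> None:
    seen: set[str] = set()
    while len(seen) != len(job_ids):
        res = subprocess.run(
            ["squeue", "--noheader", "--format=%i", "--jobs", ",".join(job_ids)],
            capture_output=True,
            text=True,
        )
        # First whitespace-separated token of each line is the job id,
        # matching how the runner parses the squeue output above.
        seen |= {
            line.split()[0] for line in res.stdout.splitlines() if line.strip()
        }
        time.sleep(interval)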
@@ -541,19 +603,38 @@
         self,
         func: callable,
         list_parameters: list[dict],
-        history_item_id: int,
-        task_files: TaskFiles,
-        slurm_config: SlurmConfig,
-        task_type: TaskTypeType,
+        history_unit_ids: list[int],
+        list_task_files: list[TaskFiles],
+        task_type: Literal["parallel", "compound", "converter_compound"],
+        config: SlurmConfig,
     ):
-        # self.scancel_jobs()
+
+        if len(self.jobs) > 0:
+            raise RuntimeError(f"Cannot run .submit when {len(self.jobs)=}")
+
+        if task_type in ["compound", "converter_compound"]:
+            if len(history_unit_ids) != 1:
+                raise NotImplementedError(
+                    "We are breaking the assumption that compound/multisubmit "
+                    "is associated to a single HistoryUnit. This is not "
+                    "supported."
+                )
+        elif task_type == "parallel" and len(history_unit_ids) != len(
+            list_parameters
+        ):
+            raise ValueError(
+                f"{len(history_unit_ids)=} differs from "
+                f"{len(list_parameters)=}."
+            )
 
         self.validate_multisubmit_parameters(
-            list_parameters=list_parameters, task_type=task_type
+            list_parameters=list_parameters,
+            task_type=task_type,
+            list_task_files=list_task_files,
         )
 
-        workdir_local = task_files.wftask_subfolder_local
-        workdir_remote = task_files.wftask_subfolder_remote
+        workdir_local = list_task_files[0].wftask_subfolder_local
+        workdir_remote = list_task_files[0].wftask_subfolder_remote
 
         # Create local&remote task subfolders
         if task_type not in ["converter_compound", "compound"]:
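Note: multisubmit now receives history_unit_ids and enforces two invariants before validation: compound-style tasks must map to a single HistoryUnit, while parallel tasks need one HistoryUnit per parameter set. A compact sketch of the same checks (hypothetical helper, not part of the package):

def check_history_units(task_type: str, n_units: int, n_parameters: int) -> None:
    # Compound-style tasks: exactly one HistoryUnit for the whole multisubmit.
    if task_type in ["compound", "converter_compound"] and n_units != 1:
        raise NotImplementedError(
            "compound/multisubmit must be associated to a single HistoryUnit"
        )
    # Parallel tasks: one HistoryUnit per parameter set.
    if task_type == "parallel" and n_units != n_parameters:
        raise ValueError(f"{n_units=} differs from {n_parameters=}")

check_history_units("parallel", n_units=3, n_parameters=3)  # OK
check_history_units("compound", n_units=1, n_parameters=5)  # OK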
@@ -571,7 +652,7 @@
         results: dict[int, Any] = {}
         exceptions: dict[int, BaseException] = {}
 
-        original_task_files = task_files
+        original_task_files = list_task_files
         tot_tasks = len(list_parameters)
 
         # Set/validate parameters for task batching
@@ -579,21 +660,21 @@
             # Number of parallel components (always known)
             tot_tasks=tot_tasks,
             # Optional WorkflowTask attributes:
-            tasks_per_job=slurm_config.tasks_per_job,
-            parallel_tasks_per_job=slurm_config.parallel_tasks_per_job,  # noqa
+            tasks_per_job=config.tasks_per_job,
+            parallel_tasks_per_job=config.parallel_tasks_per_job,  # noqa
             # Task requirements (multiple possible sources):
-            cpus_per_task=slurm_config.cpus_per_task,
-            mem_per_task=slurm_config.mem_per_task_MB,
+            cpus_per_task=config.cpus_per_task,
+            mem_per_task=config.mem_per_task_MB,
             # Fractal configuration variables (soft/hard limits):
-            target_cpus_per_job=slurm_config.target_cpus_per_job,
-            target_mem_per_job=slurm_config.target_mem_per_job,
-            target_num_jobs=slurm_config.target_num_jobs,
-            max_cpus_per_job=slurm_config.max_cpus_per_job,
-            max_mem_per_job=slurm_config.max_mem_per_job,
-            max_num_jobs=slurm_config.max_num_jobs,
+            target_cpus_per_job=config.target_cpus_per_job,
+            target_mem_per_job=config.target_mem_per_job,
+            target_num_jobs=config.target_num_jobs,
+            max_cpus_per_job=config.max_cpus_per_job,
+            max_mem_per_job=config.max_mem_per_job,
+            max_num_jobs=config.max_num_jobs,
         )
-        slurm_config.parallel_tasks_per_job = parallel_tasks_per_job
-        slurm_config.tasks_per_job = tasks_per_job
+        config.parallel_tasks_per_job = parallel_tasks_per_job
+        config.tasks_per_job = tasks_per_job
 
         # Divide arguments in batches of `tasks_per_job` tasks each
         args_batches = []
@@ -607,24 +688,18 @@
 
         logger.info(f"START submission phase, {list(self.jobs.keys())=}")
         for ind_batch, chunk in enumerate(args_batches):
-            # TODO: replace with actual values
             tasks = []
             for ind_chunk, parameters in enumerate(chunk):
-                component = parameters[_COMPONENT_KEY_]
+                index = (ind_batch * batch_size) + ind_chunk
                 tasks.append(
                     SlurmTask(
-                        index=(ind_batch * batch_size) + ind_chunk,
-                        component=component,
+                        index=index,
+                        component=original_task_files[index].component,
                         workdir_local=workdir_local,
                         workdir_remote=workdir_remote,
                         parameters=parameters,
                         zarr_url=parameters["zarr_url"],
-                        task_files=TaskFiles(
-                            **original_task_files.model_dump(
-                                exclude={"component"}
-                            ),
-                            component=component,
-                        ),
+                        task_files=original_task_files[index],
                     ),
                 )
 
@@ -637,26 +712,56 @@
             self._submit_single_sbatch(
                 func,
                 slurm_job=slurm_job,
-                slurm_config=slurm_config,
+                slurm_config=config,
             )
         logger.info(f"END submission phase, {list(self.jobs.keys())=}")
 
+        # FIXME
+        jobs_that_started = set()
+        while len(jobs_that_started) != len(self.job_ids):
+            res = run_squeue(self.job_ids)
+            new_jobs = set(out.split()[0] for out in res.stdout.splitlines())
+            jobs_that_started = jobs_that_started.union(new_jobs)
+            logger.debug(f"{new_jobs=}")
+            logger.debug(f"{len(jobs_that_started)=}")
+
         # Retrieval phase
         while len(self.jobs) > 0:
             if self.is_shutdown():
                 self.scancel_jobs()
             finished_job_ids = get_finished_jobs(job_ids=self.job_ids)
-            for slurm_job_id in finished_job_ids:
-                slurm_job = self.jobs.pop(slurm_job_id)
-                self._copy_files_from_remote_to_local(slurm_job)
-                for task in slurm_job.tasks:
-                    result, exception = self._postprocess_single_task(
-                        task=task
-                    )
-                    if exception is None:
-                        results[task.index] = result
-                    else:
-                        exceptions[task.index] = exception
+            logger.debug(f"{finished_job_ids=}")
+            with next(get_sync_db()) as db:
+                for slurm_job_id in finished_job_ids:
+                    logger.debug(f"Now processing {slurm_job_id=}")
+                    slurm_job = self.jobs.pop(slurm_job_id)
+                    self._copy_files_from_remote_to_local(slurm_job)
+                    for task in slurm_job.tasks:
+                        result, exception = self._postprocess_single_task(
+                            task=task
+                        )
+
+                        if result is not None:
+                            results[task.index] = result
+                            if task_type == "parallel":
+                                update_status_of_history_unit(
+                                    history_unit_id=history_unit_ids[
+                                        task.index
+                                    ],
+                                    status=HistoryUnitStatus.DONE,
+                                    db_sync=db,
+                                )
+                        if exception is not None:
+                            exceptions[task.index] = exception
+                            if task_type == "parallel":
+                                update_status_of_history_unit(
+                                    history_unit_id=history_unit_ids[
+                                        task.index
+                                    ],
+                                    status=HistoryUnitStatus.FAILED,
+                                    db_sync=db,
+                                )
+
             time.sleep(self.slurm_poll_interval)
         return results, exceptions
 
fractal_server/app/runner/task_files.py
@@ -96,3 +96,11 @@ class TaskFiles(BaseModel):
         return (
             self.wftask_subfolder_remote / f"{self.component}-metadiff.json"
         ).as_posix()
+
+    @property
+    def remote_files_dict(self) -> dict[str, str]:
+        return dict(
+            args_file_remote=self.args_file_remote,
+            metadiff_file_remote=self.metadiff_file_remote,
+            log_file_remote=self.log_file_remote,
+        )
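Note: the new TaskFiles.remote_files_dict property bundles the three remote paths that the runner now ships inside each input pickle (see the _kwargs change above). A standalone illustration of the resulting dict (only the metadiff filename pattern is visible in this diff; the subfolder, component, args, and log patterns below are placeholders):

from pathlib import Path

subfolder_remote = Path("/remote/job/0_task")  # hypothetical wftask subfolder
component = "0"

remote_files_dict = dict(
    args_file_remote=(subfolder_remote / f"{component}-args.json").as_posix(),
    metadiff_file_remote=(
        subfolder_remote / f"{component}-metadiff.json"
    ).as_posix(),
    log_file_remote=(subfolder_remote / f"{component}-log.txt").as_posix(),
)
print(remote_files_dict)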