fractal-server 2.14.0a21__py3-none-any.whl → 2.14.0a23__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1 +1 @@
- __VERSION__ = "2.14.0a21"
+ __VERSION__ = "2.14.0a23"
@@ -1,3 +1,4 @@
+ from copy import deepcopy
  from typing import Any
  from typing import Optional
 
@@ -92,7 +93,7 @@ async def get_workflow_tasks_statuses(
  db=db,
  )
 
- response = {}
+ response: dict[int, dict[str, int | str] | None] = {}
  for wftask in workflow.task_list:
  res = await db.execute(
  select(HistoryRun)
@@ -130,7 +131,18 @@ async def get_workflow_tasks_statuses(
  f"num_{target_status.value}_images"
  ] = num_images
 
- return JSONResponse(content=response, status_code=200)
+ new_response = deepcopy(response)
+ for key, value in response.items():
+ if value is not None:
+ num_total_images = sum(
+ value[f"num_{target_status.value}_images"]
+ for target_status in HistoryUnitStatus
+ )
+ if num_total_images > value["num_available_images"]:
+ value["num_available_images"] = None
+ new_response[key] = value
+
+ return JSONResponse(content=new_response, status_code=200)
 
 
  @router.get("/project/{project_id}/status/run/")
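
The consistency check added in the hunk above can be illustrated in isolation. A minimal sketch, using hypothetical counters and a simplified stand-in for the real `HistoryUnitStatus` enum:

    from enum import Enum


    class HistoryUnitStatus(str, Enum):
        # Simplified stand-in for the real enum
        SUBMITTED = "submitted"
        DONE = "done"
        FAILED = "failed"


    # Hypothetical per-task counters, as assembled before the final loop
    value = {
        "num_available_images": 10,
        "num_submitted_images": 0,
        "num_done_images": 8,
        "num_failed_images": 4,
    }

    num_total_images = sum(
        value[f"num_{status.value}_images"] for status in HistoryUnitStatus
    )
    if num_total_images > value["num_available_images"]:
        # Inconsistent counters: null the field instead of reporting a stale value
        value["num_available_images"] = None

    print(value["num_available_images"])  # None, since 0 + 8 + 4 = 12 > 10
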
@@ -11,8 +11,8 @@ built-in `tarfile` library has to do with performance issues we observed
  when handling files which were just created within a SLURM job, and in the
  context of a CephFS filesystem.
  """
- import shutil
  import sys
+ import time
  from pathlib import Path
 
  from fractal_server.app.runner.run_subprocess import run_subprocess
@@ -20,48 +20,66 @@ from fractal_server.logger import get_logger
  from fractal_server.logger import set_logger
 
 
- def copy_subfolder(src: Path, dest: Path, logger_name: str):
+ def _copy_subfolder(src: Path, dest: Path, logger_name: str):
+ t_start = time.perf_counter()
  cmd_cp = f"cp -r {src.as_posix()} {dest.as_posix()}"
  logger = get_logger(logger_name=logger_name)
  logger.debug(f"{cmd_cp=}")
  res = run_subprocess(cmd=cmd_cp, logger_name=logger_name)
+ elapsed = time.perf_counter() - t_start
+ logger.debug(f"[_copy_subfolder] END {elapsed=} s ({dest.as_posix()})")
  return res
 
 
- def create_tar_archive(
- tarfile_path: Path,
+ def _create_tar_archive(
+ tarfile_path: str,
  subfolder_path_tmp_copy: Path,
  logger_name: str,
- remote_to_local: bool,
+ filelist_path: str | None,
  ):
  logger = get_logger(logger_name)
-
- if remote_to_local:
- exclude_options = "--exclude *sbatch --exclude *_in_*.pickle "
+ logger.debug(f"[_create_tar_archive] START ({tarfile_path})")
+ t_start = time.perf_counter()
+
+ if filelist_path is None:
+ cmd_tar = (
+ f"tar -c -z -f {tarfile_path} "
+ f"--directory={subfolder_path_tmp_copy.as_posix()} "
+ "."
+ )
  else:
- exclude_options = ""
+ cmd_tar = (
+ f"tar -c -z -f {tarfile_path} "
+ f"--directory={subfolder_path_tmp_copy.as_posix()} "
+ f"--files-from={filelist_path} --ignore-failed-read"
+ )
+
+ logger.critical(f"cmd tar:\n{cmd_tar}")
 
- cmd_tar = (
- f"tar czf {tarfile_path} "
- f"{exclude_options} "
- f"--directory={subfolder_path_tmp_copy.as_posix()} "
- "."
- )
- logger.debug(f"cmd tar:\n{cmd_tar}")
  run_subprocess(cmd=cmd_tar, logger_name=logger_name, allow_char="*")
+ elapsed = time.perf_counter() - t_start
+ logger.debug(f"[_create_tar_archive] END {elapsed=} s ({tarfile_path})")
 
 
- def remove_temp_subfolder(subfolder_path_tmp_copy: Path, logger_name: str):
+ def _remove_temp_subfolder(subfolder_path_tmp_copy: Path, logger_name: str):
  logger = get_logger(logger_name)
+ t_start = time.perf_counter()
  try:
- logger.debug(f"Now remove {subfolder_path_tmp_copy}")
- shutil.rmtree(subfolder_path_tmp_copy)
+ cmd_rm = f"rm -rf {subfolder_path_tmp_copy}"
+ logger.debug(f"cmd rm:\n{cmd_rm}")
+ run_subprocess(cmd=cmd_rm, logger_name=logger_name, allow_char="*")
  except Exception as e:
- logger.debug(f"ERROR during shutil.rmtree: {e}")
+ logger.debug(f"ERROR during {cmd_rm}: {e}")
+ elapsed = time.perf_counter() - t_start
+ logger.debug(
+ f"[_remove_temp_subfolder] END {elapsed=} s "
+ f"({subfolder_path_tmp_copy=})"
+ )
 
 
  def compress_folder(
- subfolder_path: Path, remote_to_local: bool = False
+ subfolder_path: Path,
+ filelist_path: str | None,
  ) -> str:
  """
  Compress e.g. `/path/archive` into `/path/archive.tar.gz`
@@ -91,14 +109,16 @@ def compress_folder(
  subfolder_path.parent / f"{subfolder_path.name}_copy"
  )
  try:
- copy_subfolder(
- subfolder_path, subfolder_path_tmp_copy, logger_name=logger_name
+ _copy_subfolder(
+ subfolder_path,
+ subfolder_path_tmp_copy,
+ logger_name=logger_name,
  )
- create_tar_archive(
+ _create_tar_archive(
  tarfile_path,
  subfolder_path_tmp_copy,
  logger_name=logger_name,
- remote_to_local=remote_to_local,
+ filelist_path=filelist_path,
  )
  return tarfile_path
 
@@ -107,7 +127,9 @@ def compress_folder(
  sys.exit(1)
 
  finally:
- remove_temp_subfolder(subfolder_path_tmp_copy, logger_name=logger_name)
+ _remove_temp_subfolder(
+ subfolder_path_tmp_copy, logger_name=logger_name
+ )
 
 
  def main(sys_argv: list[str]):
@@ -115,15 +137,21 @@ def main(sys_argv: list[str]):
  help_msg = (
  "Expected use:\n"
  "python -m fractal_server.app.runner.compress_folder "
- "path/to/folder [--remote-to-local]\n"
+ "path/to/folder [--filelist /path/to/filelist]\n"
  )
  num_args = len(sys_argv[1:])
  if num_args == 0:
  sys.exit(f"Invalid argument.\n{help_msg}\nProvided: {sys_argv[1:]=}")
  elif num_args == 1:
- compress_folder(subfolder_path=Path(sys_argv[1]))
- elif num_args == 2 and sys_argv[2] == "--remote-to-local":
- compress_folder(subfolder_path=Path(sys_argv[1]), remote_to_local=True)
+ compress_folder(
+ subfolder_path=Path(sys_argv[1]),
+ filelist_path=None,
+ )
+ elif num_args == 3 and sys_argv[2] == "--filelist":
+ compress_folder(
+ subfolder_path=Path(sys_argv[1]),
+ filelist_path=sys_argv[3],
+ )
  else:
  sys.exit(f"Invalid argument.\n{help_msg}\nProvided: {sys_argv[1:]=}")
 
@@ -60,6 +60,7 @@ class BaseSlurmRunner(BaseRunner):
  root_dir_local: Path,
  root_dir_remote: Path,
  slurm_runner_type: Literal["ssh", "sudo"],
+ python_worker_interpreter: str,
  common_script_lines: Optional[list[str]] = None,
  user_cache_dir: Optional[str] = None,
  poll_interval: Optional[int] = None,
@@ -70,6 +71,7 @@ class BaseSlurmRunner(BaseRunner):
  self.common_script_lines = common_script_lines or []
  self._check_slurm_account()
  self.user_cache_dir = user_cache_dir
+ self.python_worker_interpreter = python_worker_interpreter
 
  settings = Inject(get_settings)
 
@@ -327,9 +329,9 @@ class BaseSlurmRunner(BaseRunner):
  )
  logger.info("[_submit_single_sbatch] END")
 
- def _copy_files_from_remote_to_local(
+ def _fetch_artifacts(
  self,
- slurm_job: SlurmJob,
+ finished_slurm_jobs: list[SlurmJob],
  ) -> None:
  raise NotImplementedError("Implement in child class.")
 
@@ -530,14 +532,14 @@ class BaseSlurmRunner(BaseRunner):
  # Look for finished jobs
  finished_job_ids = self._get_finished_jobs(job_ids=self.job_ids)
  logger.debug(f"[submit] {finished_job_ids=}")
-
+ finished_jobs = [
+ self.jobs[_slurm_job_id] for _slurm_job_id in finished_job_ids
+ ]
+ self._fetch_artifacts(finished_jobs)
  with next(get_sync_db()) as db:
  for slurm_job_id in finished_job_ids:
  logger.debug(f"[submit] Now process {slurm_job_id=}")
  slurm_job = self.jobs.pop(slurm_job_id)
- self._copy_files_from_remote_to_local(
- slurm_job
- ) # FIXME: add prefix # noqa
  was_job_scancelled = slurm_job_id in scancelled_job_ids
  result, exception = self._postprocess_single_task(
  task=slurm_job.tasks[0],
@@ -653,7 +655,9 @@ class BaseSlurmRunner(BaseRunner):
  if len(args_batches) != math.ceil(tot_tasks / tasks_per_job):
  raise RuntimeError("Something wrong here while batching tasks")
 
- logger.info(f"START submission phase, {list(self.jobs.keys())=}")
+ # Part 1/3: Iterate over chunks, prepare SlurmJob objects
+ logger.info("[multisubmit] Prepare `SlurmJob`s.")
+ jobs_to_submit = []
  for ind_batch, chunk in enumerate(args_batches):
  prefix = f"{MULTISUBMIT_PREFIX}-{ind_batch:06d}"
  tasks = []
@@ -673,17 +677,26 @@ class BaseSlurmRunner(BaseRunner):
  ),
  )
 
- slurm_job = SlurmJob(
- prefix=prefix,
- workdir_local=workdir_local,
- workdir_remote=workdir_remote,
- tasks=tasks,
+ jobs_to_submit.append(
+ SlurmJob(
+ prefix=prefix,
+ workdir_local=workdir_local,
+ workdir_remote=workdir_remote,
+ tasks=tasks,
+ )
  )
+
+ # FIXME: split parts 2 and 3
+ # Part 2/3. Transfer all relevant input files (for SSH)
+ # Part 3/3. Run all `sbatch`es and update `self.jobs`
+ logger.info("[multisubmit] Transfer files and submit jobs.")
+ for slurm_job in jobs_to_submit:
  self._submit_single_sbatch(
  func,
  slurm_job=slurm_job,
  slurm_config=config,
  )
+
  if task_type == "parallel":
  # FIXME: replace loop with a `bulk_update_history_unit` function
  for ind, task_files in enumerate(list_task_files):
@@ -711,20 +724,21 @@ class BaseSlurmRunner(BaseRunner):
 
  # Retrieval phase
  logger.info("[multisubmit] START retrieval phase")
+ scancelled_job_ids = []
  while len(self.jobs) > 0:
 
  # Look for finished jobs
  finished_job_ids = self._get_finished_jobs(job_ids=self.job_ids)
  logger.debug(f"[multisubmit] {finished_job_ids=}")
+ finished_jobs = [
+ self.jobs[_slurm_job_id] for _slurm_job_id in finished_job_ids
+ ]
+ self._fetch_artifacts(finished_jobs)
 
- scancelled_job_ids = []
  with next(get_sync_db()) as db:
  for slurm_job_id in finished_job_ids:
  logger.info(f"[multisubmit] Now process {slurm_job_id=}")
  slurm_job = self.jobs.pop(slurm_job_id)
- self._copy_files_from_remote_to_local(
- slurm_job
- ) # FIXME: add prefix # noqa
  for task in slurm_job.tasks:
  logger.info(f"[multisubmit] Now process {task.index=}")
  was_job_scancelled = slurm_job_id in scancelled_job_ids
@@ -810,3 +824,19 @@ class BaseSlurmRunner(BaseRunner):
  )
  logger.info("[scancel_jobs] END")
  return scancelled_job_ids
+
+ def validate_slurm_jobs_workdirs(
+ self,
+ slurm_jobs: list[SlurmJob],
+ ) -> None:
+ """
+ Check that a list of `SlurmJob`s have homogeneous working folders.
+ """
+ # Extract `workdir_remote` and `workdir_local`
+ set_workdir_local = set(_job.workdir_local for _job in slurm_jobs)
+ set_workdir_remote = set(_job.workdir_remote for _job in slurm_jobs)
+
+ if len(set_workdir_local) > 1:
+ raise ValueError(f"Non-unique values in {set_workdir_local=}.")
+ if len(set_workdir_remote) > 1:
+ raise ValueError(f"Non-unique values in {set_workdir_remote=}.")
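
The homogeneity check added at the end of the hunk above is essentially a set-cardinality test. A standalone sketch under the same idea, using a hypothetical dataclass in place of the real `SlurmJob` model:

    from dataclasses import dataclass
    from pathlib import Path


    @dataclass
    class FakeJob:
        # Hypothetical stand-in for SlurmJob, reduced to the two relevant fields
        workdir_local: Path
        workdir_remote: Path


    def validate_workdirs(jobs: list[FakeJob]) -> None:
        # All jobs in one batch must share a single local and a single remote folder
        if len({job.workdir_local for job in jobs}) > 1:
            raise ValueError("Non-unique local workdirs.")
        if len({job.workdir_remote for job in jobs}) > 1:
            raise ValueError("Non-unique remote workdirs.")


    jobs = [
        FakeJob(Path("/local/wf/7_task"), Path("/remote/wf/7_task")),
        FakeJob(Path("/local/wf/7_task"), Path("/remote/wf/7_task")),
    ]
    validate_workdirs(jobs)  # passes; a mixed list would raise ValueError
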
@@ -20,31 +20,47 @@ class SlurmTask(BaseModel):
  index: int
 
  @property
- def input_pickle_file_local(self) -> str:
+ def input_pickle_file_local_path(self) -> Path:
  return (
  self.workdir_local / f"{self.prefix}-{self.component}-input.pickle"
- ).as_posix()
+ )
 
  @property
- def input_pickle_file_remote(self) -> str:
+ def input_pickle_file_remote_path(self) -> Path:
  return (
  self.workdir_remote
  / f"{self.prefix}-{self.component}-input.pickle"
- ).as_posix()
+ )
 
  @property
- def output_pickle_file_local(self) -> str:
+ def output_pickle_file_local_path(self) -> Path:
  return (
  self.workdir_local
  / f"{self.prefix}-{self.component}-output.pickle"
- ).as_posix()
+ )
 
  @property
- def output_pickle_file_remote(self) -> str:
+ def output_pickle_file_remote_path(self) -> Path:
  return (
  self.workdir_remote
  / f"{self.prefix}-{self.component}-output.pickle"
- ).as_posix()
+ )
+
+ @property
+ def input_pickle_file_local(self) -> str:
+ return self.input_pickle_file_local_path.as_posix()
+
+ @property
+ def input_pickle_file_remote(self) -> str:
+ return self.input_pickle_file_remote_path.as_posix()
+
+ @property
+ def output_pickle_file_local(self) -> str:
+ return self.output_pickle_file_local_path.as_posix()
+
+ @property
+ def output_pickle_file_remote(self) -> str:
+ return self.output_pickle_file_remote_path.as_posix()
 
 
  class SlurmJob(BaseModel):
@@ -74,29 +90,45 @@ class SlurmJob(BaseModel):
  return "%j"
 
  @property
- def slurm_stdout_remote(self) -> str:
+ def slurm_stdout_remote_path(self) -> Path:
  return (
  self.workdir_remote
  / f"{self.prefix}-slurm-{self.slurm_job_id_placeholder}.out"
- ).as_posix()
+ )
 
  @property
- def slurm_stderr_remote(self) -> str:
+ def slurm_stdout_remote(self) -> str:
+ return self.slurm_stdout_remote_path.as_posix()
+
+ @property
+ def slurm_stderr_remote_path(self) -> Path:
  return (
  self.workdir_remote
  / f"{self.prefix}-slurm-{self.slurm_job_id_placeholder}.err"
- ).as_posix()
+ )
 
  @property
- def slurm_stdout_local(self) -> str:
+ def slurm_stderr_remote(self) -> str:
+ return self.slurm_stderr_remote_path.as_posix()
+
+ @property
+ def slurm_stdout_local_path(self) -> str:
  return (
  self.workdir_local
  / f"{self.prefix}-slurm-{self.slurm_job_id_placeholder}.out"
- ).as_posix()
+ )
 
  @property
- def slurm_stderr_local(self) -> str:
+ def slurm_stdout_local(self) -> str:
+ return self.slurm_stdout_local_path.as_posix()
+
+ @property
+ def slurm_stderr_local_path(self) -> Path:
  return (
  self.workdir_local
  / f"{self.prefix}-slurm-{self.slurm_job_id_placeholder}.err"
- ).as_posix()
+ )
+
+ @property
+ def slurm_stderr_local(self) -> str:
+ return self.slurm_stderr_local_path.as_posix()
@@ -38,7 +38,6 @@ class SlurmSSHRunner(BaseSlurmRunner):
  logger.warning(self.fractal_ssh)
 
  settings = Inject(get_settings)
- self.python_worker_interpreter = settings.FRACTAL_SLURM_WORKER_PYTHON
 
  super().__init__(
  slurm_runner_type="ssh",
@@ -47,6 +46,7 @@ class SlurmSSHRunner(BaseSlurmRunner):
  common_script_lines=common_script_lines,
  user_cache_dir=user_cache_dir,
  poll_interval=poll_interval,
+ python_worker_interpreter=settings.FRACTAL_SLURM_WORKER_PYTHON,
  )
 
  def _mkdir_local_folder(self, folder: str) -> None:
@@ -58,86 +58,81 @@ class SlurmSSHRunner(BaseSlurmRunner):
  parents=True,
  )
 
- def _copy_files_from_remote_to_local(self, slurm_job: SlurmJob) -> None:
- self._get_subfolder_sftp(job=slurm_job)
-
- def _put_subfolder_sftp(self, job: SlurmJob) -> None:
- # FIXME re-introduce use of this function, but only after splitting
- # submission logic into
- # 1. prepare all
- # 2. send folder
- # 3. submit all
+ def _fetch_artifacts(
+ self,
+ finished_slurm_jobs: list[SlurmJob],
+ ) -> None:
  """
- Transfer the jobs subfolder to the remote host.
+ Fetch artifacts for a list of SLURM jobs.
  """
 
- # Create local archive
- tarfile_path_local = compress_folder(job.workdir_local)
- tarfile_name = Path(tarfile_path_local).name
- logger.info(f"Subfolder archive created at {tarfile_path_local}")
+ # Check length
+ if len(finished_slurm_jobs) == 0:
+ logger.debug(f"[_fetch_artifacts] EXIT ({finished_slurm_jobs=}).")
+ return None
 
- # Transfer archive
- tarfile_path_remote = (
- job.workdir_remote.parent / tarfile_name
- ).as_posix()
- t_0_put = time.perf_counter()
- self.fractal_ssh.send_file(
- local=tarfile_path_local,
- remote=tarfile_path_remote,
- )
- t_1_put = time.perf_counter()
- logger.info(
- f"Subfolder archive transferred to {tarfile_path_remote}"
- f" - elapsed: {t_1_put - t_0_put:.3f} s"
+ t_0 = time.perf_counter()
+ logger.debug(
+ f"[_fetch_artifacts] START ({len(finished_slurm_jobs)=})."
  )
 
- # Remove local archive
- Path(tarfile_path_local).unlink()
- logger.debug(f"Local archive {tarfile_path_local} removed")
-
- # Uncompress remote archive
- tar_command = (
- f"{self.python_worker_interpreter} -m "
- "fractal_server.app.runner.extract_archive "
- f"{tarfile_path_remote}"
- )
- self.fractal_ssh.run_command(cmd=tar_command)
+ # Extract `workdir_remote` and `workdir_local`
+ self.validate_slurm_jobs_workdirs(finished_slurm_jobs)
+ workdir_local = finished_slurm_jobs[0].workdir_local
+ workdir_remote = finished_slurm_jobs[0].workdir_remote
 
- def _get_subfolder_sftp(self, job: SlurmJob) -> None:
- """
- Fetch a remote folder via tar+sftp+tar
- """
-
- t_0 = time.perf_counter()
- logger.debug("[_get_subfolder_sftp] Start")
+ # Define local/remote tarfile paths
  tarfile_path_local = (
- job.workdir_local.parent / f"{job.workdir_local.name}.tar.gz"
+ workdir_local.parent / f"{workdir_local.name}.tar.gz"
  ).as_posix()
  tarfile_path_remote = (
- job.workdir_remote.parent / f"{job.workdir_remote.name}.tar.gz"
+ workdir_remote.parent / f"{workdir_remote.name}.tar.gz"
  ).as_posix()
 
- # Remove remote tarfile
- try:
- rm_command = f"rm {tarfile_path_remote}"
- self.fractal_ssh.run_command(cmd=rm_command)
- logger.info(f"Removed {tarfile_path_remote=}")
- except RuntimeError as e:
- logger.info(
- f"Could not remove {tarfile_path_remote=}.\n"
- f"Original error: {str(e)}"
- )
+ # Create file list
+ # # FIXME can we make this more efficient with iterations?
+ filelist = []
+ for _slurm_job in finished_slurm_jobs:
+ _single_job_filelist = [
+ _slurm_job.slurm_stdout_remote_path.name,
+ _slurm_job.slurm_stderr_remote_path.name,
+ ]
+ for task in _slurm_job.tasks:
+ _single_job_filelist.extend(
+ [
+ task.output_pickle_file_remote_path.name,
+ task.task_files.log_file_remote_path.name,
+ task.task_files.args_file_remote_path.name,
+ task.task_files.metadiff_file_remote_path.name,
+ ]
+ )
+ filelist.extend(_single_job_filelist)
+ filelist_string = "\n".join(filelist)
+ elapsed = time.perf_counter() - t_0
+ logger.debug(
+ "[_fetch_artifacts] Created filelist "
+ f"({len(filelist)=}, from start: {elapsed:.3f} s)."
+ )
+
+ # Write filelist to file remotely
+ tmp_filelist_path = workdir_remote / f"filelist_{time.time()}.txt"
+ self.fractal_ssh.write_remote_file(
+ path=tmp_filelist_path.as_posix(),
+ content=f"{filelist_string}\n",
+ )
+ elapsed = time.perf_counter() - t_0
+ logger.debug(
+ f"[_fetch_artifacts] File list written to {tmp_filelist_path} "
+ f"(from start: {elapsed:.3f} s)."
+ )
 
  # Create remote tarfile
- # FIXME: introduce filtering by prefix, so that when the subfolder
- # includes N SLURM jobs we don't always copy the cumulative folder
- # but only the relevant part
  t_0_tar = time.perf_counter()
  tar_command = (
  f"{self.python_worker_interpreter} "
  "-m fractal_server.app.runner.compress_folder "
- f"{job.workdir_remote.as_posix()} "
- "--remote-to-local"
+ f"{workdir_remote.as_posix()} "
+ f"--filelist {tmp_filelist_path}"
  )
  self.fractal_ssh.run_command(cmd=tar_command)
  t_1_tar = time.perf_counter()
@@ -167,6 +162,47 @@ class SlurmSSHRunner(BaseSlurmRunner):
  t_1 = time.perf_counter()
  logger.info(f"[_get_subfolder_sftp] End - elapsed: {t_1 - t_0:.3f} s")
 
+ def _send_inputs(self, jobs: list[SlurmJob]) -> None:
+ """
+ Transfer the jobs subfolder to the remote host.
+ """
+ for job in jobs:
+
+ # Create local archive
+ tarfile_path_local = compress_folder(
+ job.workdir_local,
+ filelist_path=None,
+ )
+ tarfile_name = Path(tarfile_path_local).name
+ logger.info(f"Subfolder archive created at {tarfile_path_local}")
+
+ # Transfer archive
+ tarfile_path_remote = (
+ job.workdir_remote.parent / tarfile_name
+ ).as_posix()
+ t_0_put = time.perf_counter()
+ self.fractal_ssh.send_file(
+ local=tarfile_path_local,
+ remote=tarfile_path_remote,
+ )
+ t_1_put = time.perf_counter()
+ logger.info(
+ f"Subfolder archive transferred to {tarfile_path_remote}"
+ f" - elapsed: {t_1_put - t_0_put:.3f} s"
+ )
+
+ # Remove local archive
+ Path(tarfile_path_local).unlink()
+ logger.debug(f"Local archive {tarfile_path_local} removed")
+
+ # Uncompress remote archive
+ tar_command = (
+ f"{self.python_worker_interpreter} -m "
+ "fractal_server.app.runner.extract_archive "
+ f"{tarfile_path_remote}"
+ )
+ self.fractal_ssh.run_command(cmd=tar_command)
+
  def _run_remote_cmd(self, cmd: str) -> str:
  stdout = self.fractal_ssh.run_command(cmd=cmd)
  return stdout
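
The file list built in `_fetch_artifacts` above holds only basenames, since the remote tar invocation already points `--directory` at the job subfolder. A standalone sketch of that construction, with hypothetical artifact paths:

    from pathlib import Path

    # Hypothetical remote artifact paths for one finished job
    remote_artifacts = [
        Path("/remote/wf/7_task/batch-000000-slurm-1234.out"),
        Path("/remote/wf/7_task/batch-000000-slurm-1234.err"),
        Path("/remote/wf/7_task/batch-000000-0-output.pickle"),
        Path("/remote/wf/7_task/batch-000000-0-log.txt"),
    ]

    # Only basenames go into the file list: tar resolves them against --directory
    filelist_string = "\n".join(path.name for path in remote_artifacts)
    print(filelist_string)
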
@@ -3,6 +3,7 @@ import os
  import shlex
  import subprocess # nosec
  import sys
+ from concurrent.futures import ThreadPoolExecutor
  from pathlib import Path
  from typing import Optional
 
@@ -15,7 +16,6 @@ from fractal_server.config import get_settings
  from fractal_server.logger import set_logger
  from fractal_server.syringe import Inject
 
-
  logger = set_logger(__name__)
 
 
@@ -67,10 +67,6 @@ class SudoSlurmRunner(BaseSlurmRunner):
  self.slurm_account = slurm_account
  settings = Inject(get_settings)
 
- self.python_worker_interpreter = (
- settings.FRACTAL_SLURM_WORKER_PYTHON or sys.executable
- )
-
  super().__init__(
  slurm_runner_type="sudo",
  root_dir_local=root_dir_local,
@@ -78,6 +74,9 @@ class SudoSlurmRunner(BaseSlurmRunner):
  common_script_lines=common_script_lines,
  user_cache_dir=user_cache_dir,
  poll_interval=poll_interval,
+ python_worker_interpreter=(
+ settings.FRACTAL_SLURM_WORKER_PYTHON or sys.executable
+ ),
  )
 
  def _mkdir_local_folder(self, folder: str) -> None:
@@ -88,12 +87,12 @@ class SudoSlurmRunner(BaseSlurmRunner):
  def _mkdir_remote_folder(self, folder: str) -> None:
  _mkdir_as_user(folder=folder, user=self.slurm_user)
 
- def _copy_files_from_remote_to_local(self, job: SlurmJob) -> None:
+ def _fetch_artifacts_single_job(self, job: SlurmJob) -> None:
  """
- Note: this would differ for SSH
+ Fetch artifacts for a single SLURM jobs.
  """
  logger.debug(
- f"[_copy_files_from_remote_to_local] {job.slurm_job_id=} START"
+ f"[_fetch_artifacts_single_job] {job.slurm_job_id=} START"
  )
  source_target_list = [
  (job.slurm_stdout_remote, job.slurm_stdout_local),
@@ -140,9 +139,30 @@ class SudoSlurmRunner(BaseSlurmRunner):
  f"SKIP copy {source} into {target}. "
  f"Original error: {str(e)}"
  )
+ logger.debug(f"[_fetch_artifacts_single_job] {job.slurm_job_id=} END")
+
+ def _fetch_artifacts(
+ self,
+ finished_slurm_jobs: list[SlurmJob],
+ ) -> None:
+ """
+ Fetch artifacts for a list of SLURM jobs.
+ """
+ MAX_NUM_THREADS = 4
+ THREAD_NAME_PREFIX = "fetch_artifacts"
  logger.debug(
- f"[_copy_files_from_remote_to_local] {job.slurm_job_id=} END"
+ "[_fetch_artifacts] START "
+ f"({MAX_NUM_THREADS=}, {len(finished_slurm_jobs)=})."
  )
+ with ThreadPoolExecutor(
+ max_workers=MAX_NUM_THREADS,
+ thread_name_prefix=THREAD_NAME_PREFIX,
+ ) as executor:
+ executor.map(
+ self._fetch_artifacts_single_job,
+ finished_slurm_jobs,
+ )
+ logger.debug("[_fetch_artifacts] END.")
 
  def _run_remote_cmd(self, cmd: str) -> str:
  res = _run_command_as_user(
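
The `_fetch_artifacts` override above fans the per-job copy out over a small thread pool. A standalone sketch of that pattern with a dummy fetch function (names and the pool size are illustrative):

    from concurrent.futures import ThreadPoolExecutor


    def fetch_one(job_id: int) -> None:
        # Placeholder for copying one job's artifacts (stdout/stderr, pickles, logs)
        print(f"fetching artifacts for job {job_id}")


    finished_job_ids = [101, 102, 103]

    # Bounded pool: a few concurrent copies rather than one thread per job
    with ThreadPoolExecutor(
        max_workers=4, thread_name_prefix="fetch_artifacts"
    ) as executor:
        # Consuming the iterator also surfaces any exception raised in a worker
        list(executor.map(fetch_one, finished_job_ids))
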
@@ -57,9 +57,7 @@ def extract_archive(archive_path: Path):
 
  # Run tar command
  cmd_tar = (
- f"tar -xzvf {archive_path} "
- f"--directory={subfolder_path.as_posix()} "
- "."
+ f"tar -xzvf {archive_path} " f"--directory={subfolder_path.as_posix()}"
  )
  logger.debug(f"{cmd_tar=}")
  run_subprocess(cmd=cmd_tar, logger_name=logger_name)
@@ -85,11 +85,15 @@ class TaskFiles(BaseModel):
  ).as_posix()
 
  @property
- def log_file_remote(self) -> str:
+ def log_file_remote_path(self) -> Path:
  self._check_component()
  return (
  self.wftask_subfolder_remote / f"{self.prefix_component}-log.txt"
- ).as_posix()
+ )
+
+ @property
+ def log_file_remote(self) -> str:
+ return self.log_file_remote_path.as_posix()
 
  @property
  def args_file_local(self) -> str:
@@ -99,11 +103,15 @@ class TaskFiles(BaseModel):
  ).as_posix()
 
  @property
- def args_file_remote(self) -> str:
+ def args_file_remote_path(self) -> Path:
  self._check_component()
  return (
  self.wftask_subfolder_remote / f"{self.prefix_component}-args.json"
- ).as_posix()
+ )
+
+ @property
+ def args_file_remote(self) -> str:
+ return self.args_file_remote_path.as_posix()
 
  @property
  def metadiff_file_local(self) -> str:
@@ -114,12 +122,16 @@ class TaskFiles(BaseModel):
  ).as_posix()
 
  @property
- def metadiff_file_remote(self) -> str:
+ def metadiff_file_remote_path(self) -> Path:
  self._check_component()
  return (
  self.wftask_subfolder_remote
  / f"{self.prefix_component}-metadiff.json"
- ).as_posix()
+ )
+
+ @property
+ def metadiff_file_remote(self) -> str:
+ return self.metadiff_file_remote_path.as_posix()
 
  @property
  def remote_files_dict(self) -> dict[str, str]:
@@ -501,7 +501,8 @@ class FractalSSH(object):
  content: Contents to be written to file.
  lock_timeout: Timeout for lock acquisition (overrides default).
  """
- self.logger.info(f"START writing to remote file {path}.")
+ t_start = time.perf_counter()
+ self.logger.info(f"[write_remote_file] START ({path}).")
  actual_lock_timeout = self.default_lock_timeout
  if lock_timeout is not None:
  actual_lock_timeout = lock_timeout
@@ -518,7 +519,8 @@ class FractalSSH(object):
  e=e, message=f"Error in `write_remote_file`, for {path=}."
  )
 
- self.logger.info(f"END writing to remote file {path}.")
+ elapsed = time.perf_counter() - t_start
+ self.logger.info(f"[write_remote_file] END, {elapsed=} s ({path}).")
 
  def remote_exists(self, path: str) -> bool:
  """
@@ -1,6 +1,6 @@
  Metadata-Version: 2.3
  Name: fractal-server
- Version: 2.14.0a21
+ Version: 2.14.0a23
  Summary: Backend component of the Fractal analytics platform
  License: BSD-3-Clause
  Author: Tommaso Comparin
@@ -1,4 +1,4 @@
- fractal_server/__init__.py,sha256=X5Dy_f87GBiFeLzs2riLgudM2HP43U0ZuXNsU2NF7Os,26
+ fractal_server/__init__.py,sha256=HmMn7o6LFtf7DorxF19ycJBUGPrTwAzfEUuVJ_cF14Q,26
  fractal_server/__main__.py,sha256=rkM8xjY1KeS3l63irB8yCrlVobR-73uDapC4wvrIlxI,6957
  fractal_server/alembic.ini,sha256=MWwi7GzjzawI9cCAK1LW7NxIBQDUqD12-ptJoq5JpP0,3153
  fractal_server/app/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -36,7 +36,7 @@ fractal_server/app/routes/api/v2/_aux_functions_history.py,sha256=ZlI6nwzB5r9AiY
  fractal_server/app/routes/api/v2/_aux_functions_task_lifecycle.py,sha256=qdXCb6IP8-qPEAxGZKljtjIqNzIAyRaAsQSRi5VqFHM,6773
  fractal_server/app/routes/api/v2/_aux_functions_tasks.py,sha256=uhNSs-jcS7ndIUFKiOC1yrDiViw3uvKEXi9UL04BMks,11642
  fractal_server/app/routes/api/v2/dataset.py,sha256=h5AhE0sdhQ20ZlIbEJsFnHIOUW0S1VHFpoflpBkVScs,8936
- fractal_server/app/routes/api/v2/history.py,sha256=lMbaybooBzzbCgD9vdzPyNxdgAZuzCH_YrW9ost-UgI,17253
+ fractal_server/app/routes/api/v2/history.py,sha256=FvZGl66hIdo70GvWoOhRQ__knbkbp5u440sl6qhj7nA,17748
  fractal_server/app/routes/api/v2/images.py,sha256=BGpO94gVd8BTpCN6Mun2RXmjrPmfkIp73m8RN7uiGW4,8361
  fractal_server/app/routes/api/v2/job.py,sha256=MU1sHIKk_89WrD0TD44d4ufzqnywot7On_W71KjyUbQ,6500
  fractal_server/app/routes/api/v2/project.py,sha256=uAZgATiHcOvbnRX-vv1D3HoaEUvLUd7vzVmGcqOP8ZY,4602
@@ -67,7 +67,7 @@ fractal_server/app/routes/aux/validate_user_settings.py,sha256=FLVi__8YFcm_6c_K5
  fractal_server/app/routes/pagination.py,sha256=L8F5JqekF39qz-LpeScdlhb57MQnSRXjK4ZEtsZqYLk,1210
  fractal_server/app/runner/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  fractal_server/app/runner/components.py,sha256=-Ii5l8d_V6f5DFOd-Zsr8VYmOsyqw0Hox9fEFQiuqxY,66
- fractal_server/app/runner/compress_folder.py,sha256=HSc1tv7x2DBjBoXwugZlC79rm9GNBIWtQKK9yWn5ZBI,3991
+ fractal_server/app/runner/compress_folder.py,sha256=LswrzzW7h8LJzwNcVIdhhTwHMYo2-dua3BeO272DCaw,4879
  fractal_server/app/runner/exceptions.py,sha256=JC5ufHyeA1hYD_rkZUscI30DD8D903ncag7Z3AArmUY,4215
  fractal_server/app/runner/executors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  fractal_server/app/runner/executors/base_runner.py,sha256=_elfqkuw1TGRacPG4aTTYKUTbF4A-Y2Rmft9LFi-Mwc,5554
@@ -78,22 +78,22 @@ fractal_server/app/runner/executors/slurm_common/__init__.py,sha256=47DEQpj8HBSa
  fractal_server/app/runner/executors/slurm_common/_batching.py,sha256=ZY020JZlDS5mfpgpWTChQkyHU7iLE5kx2HVd57_C6XA,8850
  fractal_server/app/runner/executors/slurm_common/_job_states.py,sha256=nuV-Zba38kDrRESOVB3gaGbrSPZc4q7YGichQaeqTW0,238
  fractal_server/app/runner/executors/slurm_common/_slurm_config.py,sha256=fZaFUUXqDH0p3DndCFUpFqTqyD2tMVCuSYgYLAycpVw,15897
- fractal_server/app/runner/executors/slurm_common/base_slurm_runner.py,sha256=f01IDAtBoatOYEP4UtrH0Y4qN7BwM1ov4Bx8rotQg1M,31099
+ fractal_server/app/runner/executors/slurm_common/base_slurm_runner.py,sha256=UIQ3W0HGXLw8vGt-WeqFq_UafXtyPbuyDAAyXbV96Vo,32314
  fractal_server/app/runner/executors/slurm_common/get_slurm_config.py,sha256=-fAX1DZMB5RZnyYanIJD72mWOJAPkh21jd4loDXKJw4,5994
  fractal_server/app/runner/executors/slurm_common/remote.py,sha256=FS_F8EaPp-A5eQT5_ZH3ICCHt0-C8b_2OSYcyRkXnb4,5851
- fractal_server/app/runner/executors/slurm_common/slurm_job_task_models.py,sha256=YGgzTspkK9ItSMzwuYv_1tY7_1g89Qpeny5Auinxk1E,2708
+ fractal_server/app/runner/executors/slurm_common/slurm_job_task_models.py,sha256=RoxHLKOn0_wGjnY0Sv0a9nDSiqxYZHKRoMkT3p9_G1E,3607
  fractal_server/app/runner/executors/slurm_common/utils_executors.py,sha256=naPyJI0I3lD-sYHbSXbMFGUBK4h_SggA5V91Z1Ch1Xg,1416
  fractal_server/app/runner/executors/slurm_ssh/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- fractal_server/app/runner/executors/slurm_ssh/runner.py,sha256=WhcrcvDMvSoQ-GroDXWzIfESbOLxMf3m-ZE16pbqQRg,5777
+ fractal_server/app/runner/executors/slurm_ssh/runner.py,sha256=B_L4MczNPoOIqJPHYmJwwVRl5nLsVRecIcyW-OrgU6E,7138
  fractal_server/app/runner/executors/slurm_sudo/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  fractal_server/app/runner/executors/slurm_sudo/_subprocess_run_as_user.py,sha256=O1bNg1DiSDJmQE0RmOk2Ii47DagiXp5ryd0R6KxO2OM,3177
- fractal_server/app/runner/executors/slurm_sudo/runner.py,sha256=FdhENPwz_efXSPqG3saEvJOUiv6uG1vd0ET61QmbC4o,5136
- fractal_server/app/runner/extract_archive.py,sha256=tLpjDrX47OjTNhhoWvm6iNukg8KoieWyTb7ZfvE9eWU,2483
+ fractal_server/app/runner/executors/slurm_sudo/runner.py,sha256=WGGVHX_juqyC6OVhln9yg-YKjLiuAoWZhAGxBjhNkWw,5873
+ fractal_server/app/runner/extract_archive.py,sha256=iQOsJNYW-ae1TCK5WqmBvmqw7FPEaEh3U7UivVCLayY,2462
  fractal_server/app/runner/filenames.py,sha256=lPnxKHtdRizr6FqG3zOdjDPyWA7GoaJGTtiuJV0gA8E,70
  fractal_server/app/runner/run_subprocess.py,sha256=c3JbYXq3hX2aaflQU19qJ5Xs6J6oXGNvnTEoAfv2bxc,959
  fractal_server/app/runner/set_start_and_last_task_index.py,sha256=-q4zVybAj8ek2XlbENKlfOAJ39hT_zoJoZkqzDqiAMY,1254
  fractal_server/app/runner/shutdown.py,sha256=9pfSKHDNdIcm0eY-opgRTi7y0HmvfPmYiu9JR6Idark,2082
- fractal_server/app/runner/task_files.py,sha256=KfjDqQV9b9wImusxJtQfVST42FyvT3Kza0WooI-6OBk,3198
+ fractal_server/app/runner/task_files.py,sha256=wWp7GjZAt04X-N5M1YHDRyWHeNz8kFjK-jSA6DaxC20,3510
  fractal_server/app/runner/v2/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  fractal_server/app/runner/v2/_local.py,sha256=DK8yagbvd6HHjcDVhUzTy0f7MURlTkQha-NM6OZKgJc,3044
  fractal_server/app/runner/v2/_slurm_ssh.py,sha256=_bytOf8z9sdrhI03D6eqg-aQPnJ7V2-qnqpcHAYizns,3278
@@ -177,7 +177,7 @@ fractal_server/migrations/versions/f384e1c0cf5d_drop_task_default_args_columns.p
  fractal_server/migrations/versions/fbce16ff4e47_new_history_items.py,sha256=TDWCaIoM0Q4SpRWmR9zr_rdp3lJXhCfBPTMhtrP5xYE,3950
  fractal_server/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  fractal_server/ssh/__init__.py,sha256=sVUmzxf7_DuXG1xoLQ1_00fo5NPhi2LJipSmU5EAkPs,124
- fractal_server/ssh/_fabric.py,sha256=lNy4IX1I4We6VoWa4Bz4fUPuApLMSoejpyE6I3jDZeM,22869
+ fractal_server/ssh/_fabric.py,sha256=gnSv_DaQ8uYLS35Rqb84wo3HRkMazXGVd-D19fo9zqA,22967
  fractal_server/string_tools.py,sha256=niViRrrZAOo0y6pEFI9L_eUYS1PoOiQZUBtngiLc2_k,1877
  fractal_server/syringe.py,sha256=3qSMW3YaMKKnLdgnooAINOPxnCOxP7y2jeAQYB21Gdo,2786
  fractal_server/tasks/__init__.py,sha256=kadmVUoIghl8s190_Tt-8f-WBqMi8u8oU4Pvw39NHE8,23
@@ -207,8 +207,8 @@ fractal_server/tasks/v2/utils_templates.py,sha256=Kc_nSzdlV6KIsO0CQSPs1w70zLyENP
  fractal_server/urls.py,sha256=QjIKAC1a46bCdiPMu3AlpgFbcv6a4l3ABcd5xz190Og,471
  fractal_server/utils.py,sha256=PMwrxWFxRTQRl1b9h-NRIbFGPKqpH_hXnkAT3NfZdpY,3571
  fractal_server/zip_tools.py,sha256=GjDgo_sf6V_DDg6wWeBlZu5zypIxycn_l257p_YVKGc,4876
- fractal_server-2.14.0a21.dist-info/LICENSE,sha256=QKAharUuhxL58kSoLizKJeZE3mTCBnX6ucmz8W0lxlk,1576
- fractal_server-2.14.0a21.dist-info/METADATA,sha256=OZesW99axIUpT41CcJOkIszG_F5jLrDfKLT29EctIgI,4563
- fractal_server-2.14.0a21.dist-info/WHEEL,sha256=7dDg4QLnNKTvwIDR9Ac8jJaAmBC_owJrckbC0jjThyA,88
- fractal_server-2.14.0a21.dist-info/entry_points.txt,sha256=8tV2kynvFkjnhbtDnxAqImL6HMVKsopgGfew0DOp5UY,58
- fractal_server-2.14.0a21.dist-info/RECORD,,
+ fractal_server-2.14.0a23.dist-info/LICENSE,sha256=QKAharUuhxL58kSoLizKJeZE3mTCBnX6ucmz8W0lxlk,1576
+ fractal_server-2.14.0a23.dist-info/METADATA,sha256=qQR1L3oMXG0_a_rgeKkdsUbv8EjsBt8j11xNKnTZm5Q,4563
+ fractal_server-2.14.0a23.dist-info/WHEEL,sha256=7dDg4QLnNKTvwIDR9Ac8jJaAmBC_owJrckbC0jjThyA,88
+ fractal_server-2.14.0a23.dist-info/entry_points.txt,sha256=8tV2kynvFkjnhbtDnxAqImL6HMVKsopgGfew0DOp5UY,58
+ fractal_server-2.14.0a23.dist-info/RECORD,,