fractal-server 2.14.0a22__py3-none-any.whl → 2.14.0a24__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1 +1 @@
-__VERSION__ = "2.14.0a22"
+__VERSION__ = "2.14.0a24"
@@ -35,24 +35,27 @@ def _create_tar_archive(
     tarfile_path: str,
     subfolder_path_tmp_copy: Path,
     logger_name: str,
-    remote_to_local: bool,
+    filelist_path: str | None,
 ):
     logger = get_logger(logger_name)
     logger.debug(f"[_create_tar_archive] START ({tarfile_path})")
     t_start = time.perf_counter()

-    if remote_to_local:
-        exclude_options = "--exclude *sbatch --exclude *_in_*.pickle "
+    if filelist_path is None:
+        cmd_tar = (
+            f"tar -c -z -f {tarfile_path} "
+            f"--directory={subfolder_path_tmp_copy.as_posix()} "
+            "."
+        )
     else:
-        exclude_options = ""
+        cmd_tar = (
+            f"tar -c -z -f {tarfile_path} "
+            f"--directory={subfolder_path_tmp_copy.as_posix()} "
+            f"--files-from={filelist_path} --ignore-failed-read"
+        )

-    cmd_tar = (
-        f"tar czf {tarfile_path} "
-        f"{exclude_options} "
-        f"--directory={subfolder_path_tmp_copy.as_posix()} "
-        "."
-    )
     logger.debug(f"cmd tar:\n{cmd_tar}")
+
     run_subprocess(cmd=cmd_tar, logger_name=logger_name, allow_char="*")
     elapsed = time.perf_counter() - t_start
     logger.debug(f"[_create_tar_archive] END {elapsed=} s ({tarfile_path})")
@@ -75,7 +78,8 @@ def _remove_temp_subfolder(subfolder_path_tmp_copy: Path, logger_name: str):


 def compress_folder(
-    subfolder_path: Path, remote_to_local: bool = False
+    subfolder_path: Path,
+    filelist_path: str | None,
 ) -> str:
     """
     Compress e.g. `/path/archive` into `/path/archive.tar.gz`
@@ -114,7 +118,7 @@ def compress_folder(
         tarfile_path,
         subfolder_path_tmp_copy,
         logger_name=logger_name,
-        remote_to_local=remote_to_local,
+        filelist_path=filelist_path,
     )
     return tarfile_path

@@ -133,15 +137,21 @@ def main(sys_argv: list[str]):
     help_msg = (
         "Expected use:\n"
         "python -m fractal_server.app.runner.compress_folder "
-        "path/to/folder [--remote-to-local]\n"
+        "path/to/folder [--filelist /path/to/filelist]\n"
     )
     num_args = len(sys_argv[1:])
     if num_args == 0:
         sys.exit(f"Invalid argument.\n{help_msg}\nProvided: {sys_argv[1:]=}")
     elif num_args == 1:
-        compress_folder(subfolder_path=Path(sys_argv[1]))
-    elif num_args == 2 and sys_argv[2] == "--remote-to-local":
-        compress_folder(subfolder_path=Path(sys_argv[1]), remote_to_local=True)
+        compress_folder(
+            subfolder_path=Path(sys_argv[1]),
+            filelist_path=None,
+        )
+    elif num_args == 3 and sys_argv[2] == "--filelist":
+        compress_folder(
+            subfolder_path=Path(sys_argv[1]),
+            filelist_path=sys_argv[3],
+        )
     else:
         sys.exit(f"Invalid argument.\n{help_msg}\nProvided: {sys_argv[1:]=}")

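As a usage sketch for the updated entry point (paths are hypothetical), the module is now called either with just a folder or with the new `--filelist` flag followed by a path, which maps onto the two `compress_folder(...)` calls above:

# Hypothetical invocations (paths are made up):
#   python -m fractal_server.app.runner.compress_folder /tmp/archive
#   python -m fractal_server.app.runner.compress_folder /tmp/archive --filelist /tmp/archive/filelist.txt
# The second form corresponds to:
sys_argv = ["compress_folder", "/tmp/archive", "--filelist", "/tmp/archive/filelist.txt"]
# num_args == 3 and sys_argv[2] == "--filelist", so the script calls
# compress_folder(subfolder_path=Path("/tmp/archive"), filelist_path="/tmp/archive/filelist.txt")
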
@@ -60,6 +60,7 @@ class BaseSlurmRunner(BaseRunner):
         root_dir_local: Path,
         root_dir_remote: Path,
         slurm_runner_type: Literal["ssh", "sudo"],
+        python_worker_interpreter: str,
         common_script_lines: Optional[list[str]] = None,
         user_cache_dir: Optional[str] = None,
         poll_interval: Optional[int] = None,
@@ -70,6 +71,7 @@ class BaseSlurmRunner(BaseRunner):
         self.common_script_lines = common_script_lines or []
         self._check_slurm_account()
         self.user_cache_dir = user_cache_dir
+        self.python_worker_interpreter = python_worker_interpreter

         settings = Inject(get_settings)

@@ -327,9 +329,9 @@ class BaseSlurmRunner(BaseRunner):
         )
         logger.info("[_submit_single_sbatch] END")

-    def _copy_files_from_remote_to_local(
+    def _fetch_artifacts(
         self,
-        slurm_job: SlurmJob,
+        finished_slurm_jobs: list[SlurmJob],
     ) -> None:
         raise NotImplementedError("Implement in child class.")

@@ -530,14 +532,14 @@ class BaseSlurmRunner(BaseRunner):
             # Look for finished jobs
             finished_job_ids = self._get_finished_jobs(job_ids=self.job_ids)
             logger.debug(f"[submit] {finished_job_ids=}")
-
+            finished_jobs = [
+                self.jobs[_slurm_job_id] for _slurm_job_id in finished_job_ids
+            ]
+            self._fetch_artifacts(finished_jobs)
             with next(get_sync_db()) as db:
                 for slurm_job_id in finished_job_ids:
                     logger.debug(f"[submit] Now process {slurm_job_id=}")
                     slurm_job = self.jobs.pop(slurm_job_id)
-                    self._copy_files_from_remote_to_local(
-                        slurm_job
-                    )  # FIXME: add prefix # noqa
                     was_job_scancelled = slurm_job_id in scancelled_job_ids
                     result, exception = self._postprocess_single_task(
                         task=slurm_job.tasks[0],
@@ -653,7 +655,9 @@ class BaseSlurmRunner(BaseRunner):
         if len(args_batches) != math.ceil(tot_tasks / tasks_per_job):
             raise RuntimeError("Something wrong here while batching tasks")

-        logger.info(f"START submission phase, {list(self.jobs.keys())=}")
+        # Part 1/3: Iterate over chunks, prepare SlurmJob objects
+        logger.info("[multisubmit] Prepare `SlurmJob`s.")
+        jobs_to_submit = []
         for ind_batch, chunk in enumerate(args_batches):
             prefix = f"{MULTISUBMIT_PREFIX}-{ind_batch:06d}"
             tasks = []
@@ -673,17 +677,26 @@ class BaseSlurmRunner(BaseRunner):
                 ),
             )

-            slurm_job = SlurmJob(
-                prefix=prefix,
-                workdir_local=workdir_local,
-                workdir_remote=workdir_remote,
-                tasks=tasks,
+            jobs_to_submit.append(
+                SlurmJob(
+                    prefix=prefix,
+                    workdir_local=workdir_local,
+                    workdir_remote=workdir_remote,
+                    tasks=tasks,
+                )
             )
+
+        # FIXME: split parts 2 and 3
+        # Part 2/3. Transfer all relevant input files (for SSH)
+        # Part 3/3. Run all `sbatch`es and update `self.jobs`
+        logger.info("[multisubmit] Transfer files and submit jobs.")
+        for slurm_job in jobs_to_submit:
             self._submit_single_sbatch(
                 func,
                 slurm_job=slurm_job,
                 slurm_config=config,
             )
+
         if task_type == "parallel":
             # FIXME: replace loop with a `bulk_update_history_unit` function
             for ind, task_files in enumerate(list_task_files):
@@ -711,20 +724,21 @@ class BaseSlurmRunner(BaseRunner):

         # Retrieval phase
         logger.info("[multisubmit] START retrieval phase")
+        scancelled_job_ids = []
         while len(self.jobs) > 0:

             # Look for finished jobs
             finished_job_ids = self._get_finished_jobs(job_ids=self.job_ids)
             logger.debug(f"[multisubmit] {finished_job_ids=}")
+            finished_jobs = [
+                self.jobs[_slurm_job_id] for _slurm_job_id in finished_job_ids
+            ]
+            self._fetch_artifacts(finished_jobs)

-            scancelled_job_ids = []
             with next(get_sync_db()) as db:
                 for slurm_job_id in finished_job_ids:
                     logger.info(f"[multisubmit] Now process {slurm_job_id=}")
                     slurm_job = self.jobs.pop(slurm_job_id)
-                    self._copy_files_from_remote_to_local(
-                        slurm_job
-                    )  # FIXME: add prefix # noqa
                     for task in slurm_job.tasks:
                         logger.info(f"[multisubmit] Now process {task.index=}")
                         was_job_scancelled = slurm_job_id in scancelled_job_ids
@@ -810,3 +824,17 @@ class BaseSlurmRunner(BaseRunner):
             )
         logger.info("[scancel_jobs] END")
         return scancelled_job_ids
+
+    def validate_slurm_jobs_workdirs(
+        self,
+        slurm_jobs: list[SlurmJob],
+    ) -> None:
+        """
+        Check that a list of `SlurmJob`s have homogeneous working folders.
+        """
+        set_workdir_local = set(_job.workdir_local for _job in slurm_jobs)
+        set_workdir_remote = set(_job.workdir_remote for _job in slurm_jobs)
+        if len(set_workdir_local) > 1:
+            raise ValueError(f"Non-unique values in {set_workdir_local=}.")
+        if len(set_workdir_remote) > 1:
+            raise ValueError(f"Non-unique values in {set_workdir_remote=}.")
@@ -20,31 +20,47 @@ class SlurmTask(BaseModel):
     index: int

     @property
-    def input_pickle_file_local(self) -> str:
+    def input_pickle_file_local_path(self) -> Path:
         return (
             self.workdir_local / f"{self.prefix}-{self.component}-input.pickle"
-        ).as_posix()
+        )

     @property
-    def input_pickle_file_remote(self) -> str:
+    def input_pickle_file_remote_path(self) -> Path:
         return (
             self.workdir_remote
             / f"{self.prefix}-{self.component}-input.pickle"
-        ).as_posix()
+        )

     @property
-    def output_pickle_file_local(self) -> str:
+    def output_pickle_file_local_path(self) -> Path:
         return (
             self.workdir_local
             / f"{self.prefix}-{self.component}-output.pickle"
-        ).as_posix()
+        )

     @property
-    def output_pickle_file_remote(self) -> str:
+    def output_pickle_file_remote_path(self) -> Path:
         return (
             self.workdir_remote
             / f"{self.prefix}-{self.component}-output.pickle"
-        ).as_posix()
+        )
+
+    @property
+    def input_pickle_file_local(self) -> str:
+        return self.input_pickle_file_local_path.as_posix()
+
+    @property
+    def input_pickle_file_remote(self) -> str:
+        return self.input_pickle_file_remote_path.as_posix()
+
+    @property
+    def output_pickle_file_local(self) -> str:
+        return self.output_pickle_file_local_path.as_posix()
+
+    @property
+    def output_pickle_file_remote(self) -> str:
+        return self.output_pickle_file_remote_path.as_posix()


 class SlurmJob(BaseModel):
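The pattern introduced above: each pre-existing string property now delegates to a new `*_path` property that returns a `pathlib.Path`. A minimal standalone sketch of the same idea (the `Example` model and its fields are hypothetical, not from the package):

from pathlib import Path
from pydantic import BaseModel


class Example(BaseModel):
    # Hypothetical model, mirroring SlurmTask/SlurmJob's paired properties.
    workdir_local: Path
    prefix: str

    @property
    def log_file_local_path(self) -> Path:
        return self.workdir_local / f"{self.prefix}-log.txt"

    @property
    def log_file_local(self) -> str:
        # The string view simply delegates to the Path view.
        return self.log_file_local_path.as_posix()


ex = Example(workdir_local=Path("/tmp/wd"), prefix="job-0")
assert ex.log_file_local == ex.log_file_local_path.as_posix() == "/tmp/wd/job-0-log.txt"
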
@@ -74,29 +90,45 @@ class SlurmJob(BaseModel):
         return "%j"

     @property
-    def slurm_stdout_remote(self) -> str:
+    def slurm_stdout_remote_path(self) -> Path:
         return (
             self.workdir_remote
             / f"{self.prefix}-slurm-{self.slurm_job_id_placeholder}.out"
-        ).as_posix()
+        )

     @property
-    def slurm_stderr_remote(self) -> str:
+    def slurm_stdout_remote(self) -> str:
+        return self.slurm_stdout_remote_path.as_posix()
+
+    @property
+    def slurm_stderr_remote_path(self) -> Path:
         return (
             self.workdir_remote
             / f"{self.prefix}-slurm-{self.slurm_job_id_placeholder}.err"
-        ).as_posix()
+        )

     @property
-    def slurm_stdout_local(self) -> str:
+    def slurm_stderr_remote(self) -> str:
+        return self.slurm_stderr_remote_path.as_posix()
+
+    @property
+    def slurm_stdout_local_path(self) -> str:
         return (
             self.workdir_local
             / f"{self.prefix}-slurm-{self.slurm_job_id_placeholder}.out"
-        ).as_posix()
+        )

     @property
-    def slurm_stderr_local(self) -> str:
+    def slurm_stdout_local(self) -> str:
+        return self.slurm_stdout_local_path.as_posix()
+
+    @property
+    def slurm_stderr_local_path(self) -> Path:
         return (
             self.workdir_local
             / f"{self.prefix}-slurm-{self.slurm_job_id_placeholder}.err"
-        ).as_posix()
+        )
+
+    @property
+    def slurm_stderr_local(self) -> str:
+        return self.slurm_stderr_local_path.as_posix()
@@ -38,7 +38,6 @@ class SlurmSSHRunner(BaseSlurmRunner):
         logger.warning(self.fractal_ssh)

         settings = Inject(get_settings)
-        self.python_worker_interpreter = settings.FRACTAL_SLURM_WORKER_PYTHON

         super().__init__(
             slurm_runner_type="ssh",
@@ -47,6 +46,7 @@ class SlurmSSHRunner(BaseSlurmRunner):
             common_script_lines=common_script_lines,
             user_cache_dir=user_cache_dir,
             poll_interval=poll_interval,
+            python_worker_interpreter=settings.FRACTAL_SLURM_WORKER_PYTHON,
         )

     def _mkdir_local_folder(self, folder: str) -> None:
@@ -58,86 +58,81 @@ class SlurmSSHRunner(BaseSlurmRunner):
             parents=True,
         )

-    def _copy_files_from_remote_to_local(self, slurm_job: SlurmJob) -> None:
-        self._get_subfolder_sftp(job=slurm_job)
-
-    def _put_subfolder_sftp(self, job: SlurmJob) -> None:
-        # FIXME re-introduce use of this function, but only after splitting
-        # submission logic into
-        # 1. prepare all
-        # 2. send folder
-        # 3. submit all
+    def _fetch_artifacts(
+        self,
+        finished_slurm_jobs: list[SlurmJob],
+    ) -> None:
         """
-        Transfer the jobs subfolder to the remote host.
+        Fetch artifacts for a list of SLURM jobs.
         """

-        # Create local archive
-        tarfile_path_local = compress_folder(job.workdir_local)
-        tarfile_name = Path(tarfile_path_local).name
-        logger.info(f"Subfolder archive created at {tarfile_path_local}")
+        # Check length
+        if len(finished_slurm_jobs) == 0:
+            logger.debug(f"[_fetch_artifacts] EXIT ({finished_slurm_jobs=}).")
+            return None

-        # Transfer archive
-        tarfile_path_remote = (
-            job.workdir_remote.parent / tarfile_name
-        ).as_posix()
-        t_0_put = time.perf_counter()
-        self.fractal_ssh.send_file(
-            local=tarfile_path_local,
-            remote=tarfile_path_remote,
-        )
-        t_1_put = time.perf_counter()
-        logger.info(
-            f"Subfolder archive transferred to {tarfile_path_remote}"
-            f" - elapsed: {t_1_put - t_0_put:.3f} s"
+        t_0 = time.perf_counter()
+        logger.debug(
+            f"[_fetch_artifacts] START ({len(finished_slurm_jobs)=})."
         )

-        # Remove local archive
-        Path(tarfile_path_local).unlink()
-        logger.debug(f"Local archive {tarfile_path_local} removed")
-
-        # Uncompress remote archive
-        tar_command = (
-            f"{self.python_worker_interpreter} -m "
-            "fractal_server.app.runner.extract_archive "
-            f"{tarfile_path_remote}"
-        )
-        self.fractal_ssh.run_command(cmd=tar_command)
+        # Extract `workdir_remote` and `workdir_local`
+        self.validate_slurm_jobs_workdirs(finished_slurm_jobs)
+        workdir_local = finished_slurm_jobs[0].workdir_local
+        workdir_remote = finished_slurm_jobs[0].workdir_remote

-    def _get_subfolder_sftp(self, job: SlurmJob) -> None:
-        """
-        Fetch a remote folder via tar+sftp+tar
-        """
-
-        t_0 = time.perf_counter()
-        logger.debug("[_get_subfolder_sftp] Start")
+        # Define local/remote tarfile paths
         tarfile_path_local = (
-            job.workdir_local.parent / f"{job.workdir_local.name}.tar.gz"
+            workdir_local.parent / f"{workdir_local.name}.tar.gz"
         ).as_posix()
         tarfile_path_remote = (
-            job.workdir_remote.parent / f"{job.workdir_remote.name}.tar.gz"
+            workdir_remote.parent / f"{workdir_remote.name}.tar.gz"
         ).as_posix()

-        # Remove remote tarfile
-        try:
-            rm_command = f"rm {tarfile_path_remote}"
-            self.fractal_ssh.run_command(cmd=rm_command)
-            logger.info(f"Removed {tarfile_path_remote=}")
-        except RuntimeError as e:
-            logger.info(
-                f"Could not remove {tarfile_path_remote=}.\n"
-                f"Original error: {str(e)}"
-            )
+        # Create file list
+        # # FIXME can we make this more efficient with iterators?
+        filelist = []
+        for _slurm_job in finished_slurm_jobs:
+            _single_job_filelist = [
+                _slurm_job.slurm_stdout_remote_path.name,
+                _slurm_job.slurm_stderr_remote_path.name,
+            ]
+            for task in _slurm_job.tasks:
+                _single_job_filelist.extend(
+                    [
+                        task.output_pickle_file_remote_path.name,
+                        task.task_files.log_file_remote_path.name,
+                        task.task_files.args_file_remote_path.name,
+                        task.task_files.metadiff_file_remote_path.name,
+                    ]
+                )
+            filelist.extend(_single_job_filelist)
+        filelist_string = "\n".join(filelist)
+        elapsed = time.perf_counter() - t_0
+        logger.debug(
+            "[_fetch_artifacts] Created filelist "
+            f"({len(filelist)=}, from start: {elapsed:.3f} s)."
+        )
+
+        # Write filelist to file remotely
+        tmp_filelist_path = workdir_remote / f"filelist_{time.time()}.txt"
+        self.fractal_ssh.write_remote_file(
+            path=tmp_filelist_path.as_posix(),
+            content=f"{filelist_string}\n",
+        )
+        elapsed = time.perf_counter() - t_0
+        logger.debug(
+            f"[_fetch_artifacts] File list written to {tmp_filelist_path} "
+            f"(from start: {elapsed:.3f} s)."
+        )

         # Create remote tarfile
-        # FIXME: introduce filtering by prefix, so that when the subfolder
-        # includes N SLURM jobs we don't always copy the cumulative folder
-        # but only the relevant part
         t_0_tar = time.perf_counter()
         tar_command = (
             f"{self.python_worker_interpreter} "
             "-m fractal_server.app.runner.compress_folder "
-            f"{job.workdir_remote.as_posix()} "
-            "--remote-to-local"
+            f"{workdir_remote.as_posix()} "
+            f"--filelist {tmp_filelist_path}"
         )
         self.fractal_ssh.run_command(cmd=tar_command)
         t_1_tar = time.perf_counter()
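To make the new SSH retrieval scheme concrete: `_fetch_artifacts` writes a plain-text list of bare file names (one per line, relative to the job subfolder) to `tmp_filelist_path`, and the remote `compress_folder --filelist ...` call feeds that list to `tar --files-from`. A hypothetical list for one job with a single task (names made up, merely shaped like the `*_remote_path` properties above) might be:

# Hypothetical filelist content; real names come from the SlurmJob/SlurmTask/
# TaskFiles *_remote_path properties collected in the loop above.
filelist_string = "\n".join(
    [
        "batch-000000-slurm-4242.out",
        "batch-000000-slurm-4242.err",
        "batch-000000-0-output.pickle",
        "batch-000000-0-log.txt",
        "batch-000000-0-args.json",
        "batch-000000-0-metadiff.json",
    ]
)
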
@@ -167,6 +162,47 @@ class SlurmSSHRunner(BaseSlurmRunner):
         t_1 = time.perf_counter()
         logger.info(f"[_get_subfolder_sftp] End - elapsed: {t_1 - t_0:.3f} s")

+    def _send_inputs(self, jobs: list[SlurmJob]) -> None:
+        """
+        Transfer the jobs subfolder to the remote host.
+        """
+        for job in jobs:
+
+            # Create local archive
+            tarfile_path_local = compress_folder(
+                job.workdir_local,
+                filelist_path=None,
+            )
+            tarfile_name = Path(tarfile_path_local).name
+            logger.info(f"Subfolder archive created at {tarfile_path_local}")
+
+            # Transfer archive
+            tarfile_path_remote = (
+                job.workdir_remote.parent / tarfile_name
+            ).as_posix()
+            t_0_put = time.perf_counter()
+            self.fractal_ssh.send_file(
+                local=tarfile_path_local,
+                remote=tarfile_path_remote,
+            )
+            t_1_put = time.perf_counter()
+            logger.info(
+                f"Subfolder archive transferred to {tarfile_path_remote}"
+                f" - elapsed: {t_1_put - t_0_put:.3f} s"
+            )
+
+            # Remove local archive
+            Path(tarfile_path_local).unlink()
+            logger.debug(f"Local archive {tarfile_path_local} removed")
+
+            # Uncompress remote archive
+            tar_command = (
+                f"{self.python_worker_interpreter} -m "
+                "fractal_server.app.runner.extract_archive "
+                f"{tarfile_path_remote}"
+            )
+            self.fractal_ssh.run_command(cmd=tar_command)
+
     def _run_remote_cmd(self, cmd: str) -> str:
         stdout = self.fractal_ssh.run_command(cmd=cmd)
         return stdout
@@ -3,6 +3,7 @@ import os
 import shlex
 import subprocess  # nosec
 import sys
+from concurrent.futures import ThreadPoolExecutor
 from pathlib import Path
 from typing import Optional

@@ -15,7 +16,6 @@ from fractal_server.config import get_settings
 from fractal_server.logger import set_logger
 from fractal_server.syringe import Inject

-
 logger = set_logger(__name__)


@@ -67,10 +67,6 @@ class SudoSlurmRunner(BaseSlurmRunner):
         self.slurm_account = slurm_account
         settings = Inject(get_settings)

-        self.python_worker_interpreter = (
-            settings.FRACTAL_SLURM_WORKER_PYTHON or sys.executable
-        )
-
         super().__init__(
             slurm_runner_type="sudo",
             root_dir_local=root_dir_local,
@@ -78,6 +74,9 @@ class SudoSlurmRunner(BaseSlurmRunner):
             common_script_lines=common_script_lines,
             user_cache_dir=user_cache_dir,
             poll_interval=poll_interval,
+            python_worker_interpreter=(
+                settings.FRACTAL_SLURM_WORKER_PYTHON or sys.executable
+            ),
         )

     def _mkdir_local_folder(self, folder: str) -> None:
@@ -88,12 +87,12 @@ class SudoSlurmRunner(BaseSlurmRunner):
     def _mkdir_remote_folder(self, folder: str) -> None:
         _mkdir_as_user(folder=folder, user=self.slurm_user)

-    def _copy_files_from_remote_to_local(self, job: SlurmJob) -> None:
+    def _fetch_artifacts_single_job(self, job: SlurmJob) -> None:
         """
-        Note: this would differ for SSH
+        Fetch artifacts for a single SLURM jobs.
         """
         logger.debug(
-            f"[_copy_files_from_remote_to_local] {job.slurm_job_id=} START"
+            f"[_fetch_artifacts_single_job] {job.slurm_job_id=} START"
         )
         source_target_list = [
             (job.slurm_stdout_remote, job.slurm_stdout_local),
@@ -140,9 +139,30 @@ class SudoSlurmRunner(BaseSlurmRunner):
                     f"SKIP copy {source} into {target}. "
                     f"Original error: {str(e)}"
                 )
+        logger.debug(f"[_fetch_artifacts_single_job] {job.slurm_job_id=} END")
+
+    def _fetch_artifacts(
+        self,
+        finished_slurm_jobs: list[SlurmJob],
+    ) -> None:
+        """
+        Fetch artifacts for a list of SLURM jobs.
+        """
+        MAX_NUM_THREADS = 4
+        THREAD_NAME_PREFIX = "fetch_artifacts"
         logger.debug(
-            f"[_copy_files_from_remote_to_local] {job.slurm_job_id=} END"
+            "[_fetch_artifacts] START "
+            f"({MAX_NUM_THREADS=}, {len(finished_slurm_jobs)=})."
         )
+        with ThreadPoolExecutor(
+            max_workers=MAX_NUM_THREADS,
+            thread_name_prefix=THREAD_NAME_PREFIX,
+        ) as executor:
+            executor.map(
+                self._fetch_artifacts_single_job,
+                finished_slurm_jobs,
+            )
+        logger.debug("[_fetch_artifacts] END.")

     def _run_remote_cmd(self, cmd: str) -> str:
         res = _run_command_as_user(
@@ -57,9 +57,7 @@ def extract_archive(archive_path: Path):

     # Run tar command
     cmd_tar = (
-        f"tar -xzvf {archive_path} "
-        f"--directory={subfolder_path.as_posix()} "
-        "."
+        f"tar -xzvf {archive_path} --directory={subfolder_path.as_posix()}"
     )
     logger.debug(f"{cmd_tar=}")
     run_subprocess(cmd=cmd_tar, logger_name=logger_name)
@@ -85,11 +85,15 @@ class TaskFiles(BaseModel):
         ).as_posix()

     @property
-    def log_file_remote(self) -> str:
+    def log_file_remote_path(self) -> Path:
         self._check_component()
         return (
             self.wftask_subfolder_remote / f"{self.prefix_component}-log.txt"
-        ).as_posix()
+        )
+
+    @property
+    def log_file_remote(self) -> str:
+        return self.log_file_remote_path.as_posix()

     @property
     def args_file_local(self) -> str:
@@ -99,11 +103,15 @@ class TaskFiles(BaseModel):
         ).as_posix()

     @property
-    def args_file_remote(self) -> str:
+    def args_file_remote_path(self) -> Path:
         self._check_component()
         return (
             self.wftask_subfolder_remote / f"{self.prefix_component}-args.json"
-        ).as_posix()
+        )
+
+    @property
+    def args_file_remote(self) -> str:
+        return self.args_file_remote_path.as_posix()

     @property
     def metadiff_file_local(self) -> str:
@@ -114,12 +122,16 @@ class TaskFiles(BaseModel):
         ).as_posix()

     @property
-    def metadiff_file_remote(self) -> str:
+    def metadiff_file_remote_path(self) -> Path:
         self._check_component()
         return (
             self.wftask_subfolder_remote
             / f"{self.prefix_component}-metadiff.json"
-        ).as_posix()
+        )
+
+    @property
+    def metadiff_file_remote(self) -> str:
+        return self.metadiff_file_remote_path.as_posix()

     @property
     def remote_files_dict(self) -> dict[str, str]:
@@ -501,7 +501,8 @@ class FractalSSH(object):
             content: Contents to be written to file.
            lock_timeout: Timeout for lock acquisition (overrides default).
         """
-        self.logger.info(f"START writing to remote file {path}.")
+        t_start = time.perf_counter()
+        self.logger.info(f"[write_remote_file] START ({path}).")
         actual_lock_timeout = self.default_lock_timeout
         if lock_timeout is not None:
             actual_lock_timeout = lock_timeout
@@ -518,7 +519,8 @@ class FractalSSH(object):
                 e=e, message=f"Error in `write_remote_file`, for {path=}."
             )

-        self.logger.info(f"END writing to remote file {path}.")
+        elapsed = time.perf_counter() - t_start
+        self.logger.info(f"[write_remote_file] END, {elapsed=} s ({path}).")

     def remote_exists(self, path: str) -> bool:
         """
@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: fractal-server
-Version: 2.14.0a22
+Version: 2.14.0a24
 Summary: Backend component of the Fractal analytics platform
 License: BSD-3-Clause
 Author: Tommaso Comparin
@@ -1,4 +1,4 @@
-fractal_server/__init__.py,sha256=vmLoAsXfXV-STPVYAotX-Rf--p3F2TkLnffSIuukSq4,26
+fractal_server/__init__.py,sha256=CUzmJ2W7Rx8Ttm0LRwSuqR8wQNBOWiu69MFDz4uNgAo,26
 fractal_server/__main__.py,sha256=rkM8xjY1KeS3l63irB8yCrlVobR-73uDapC4wvrIlxI,6957
 fractal_server/alembic.ini,sha256=MWwi7GzjzawI9cCAK1LW7NxIBQDUqD12-ptJoq5JpP0,3153
 fractal_server/app/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -67,7 +67,7 @@ fractal_server/app/routes/aux/validate_user_settings.py,sha256=FLVi__8YFcm_6c_K5
 fractal_server/app/routes/pagination.py,sha256=L8F5JqekF39qz-LpeScdlhb57MQnSRXjK4ZEtsZqYLk,1210
 fractal_server/app/runner/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 fractal_server/app/runner/components.py,sha256=-Ii5l8d_V6f5DFOd-Zsr8VYmOsyqw0Hox9fEFQiuqxY,66
-fractal_server/app/runner/compress_folder.py,sha256=yEboyXe6WcNq5QUmXTJSJrC9TFfst9XYC3sVWZ6OcNE,4670
+fractal_server/app/runner/compress_folder.py,sha256=DX-4IYlSXlMd0EmXDD8M8FxisfKLbooSTrdNtzYAQAM,4876
 fractal_server/app/runner/exceptions.py,sha256=JC5ufHyeA1hYD_rkZUscI30DD8D903ncag7Z3AArmUY,4215
 fractal_server/app/runner/executors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 fractal_server/app/runner/executors/base_runner.py,sha256=_elfqkuw1TGRacPG4aTTYKUTbF4A-Y2Rmft9LFi-Mwc,5554
@@ -78,22 +78,22 @@ fractal_server/app/runner/executors/slurm_common/__init__.py,sha256=47DEQpj8HBSa
 fractal_server/app/runner/executors/slurm_common/_batching.py,sha256=ZY020JZlDS5mfpgpWTChQkyHU7iLE5kx2HVd57_C6XA,8850
 fractal_server/app/runner/executors/slurm_common/_job_states.py,sha256=nuV-Zba38kDrRESOVB3gaGbrSPZc4q7YGichQaeqTW0,238
 fractal_server/app/runner/executors/slurm_common/_slurm_config.py,sha256=fZaFUUXqDH0p3DndCFUpFqTqyD2tMVCuSYgYLAycpVw,15897
-fractal_server/app/runner/executors/slurm_common/base_slurm_runner.py,sha256=f01IDAtBoatOYEP4UtrH0Y4qN7BwM1ov4Bx8rotQg1M,31099
+fractal_server/app/runner/executors/slurm_common/base_slurm_runner.py,sha256=_WXa9sDrx8MgsllO7OpHLrVjqpkapnIekqug6QyMqYE,32258
 fractal_server/app/runner/executors/slurm_common/get_slurm_config.py,sha256=-fAX1DZMB5RZnyYanIJD72mWOJAPkh21jd4loDXKJw4,5994
 fractal_server/app/runner/executors/slurm_common/remote.py,sha256=FS_F8EaPp-A5eQT5_ZH3ICCHt0-C8b_2OSYcyRkXnb4,5851
-fractal_server/app/runner/executors/slurm_common/slurm_job_task_models.py,sha256=YGgzTspkK9ItSMzwuYv_1tY7_1g89Qpeny5Auinxk1E,2708
+fractal_server/app/runner/executors/slurm_common/slurm_job_task_models.py,sha256=RoxHLKOn0_wGjnY0Sv0a9nDSiqxYZHKRoMkT3p9_G1E,3607
 fractal_server/app/runner/executors/slurm_common/utils_executors.py,sha256=naPyJI0I3lD-sYHbSXbMFGUBK4h_SggA5V91Z1Ch1Xg,1416
 fractal_server/app/runner/executors/slurm_ssh/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-fractal_server/app/runner/executors/slurm_ssh/runner.py,sha256=WhcrcvDMvSoQ-GroDXWzIfESbOLxMf3m-ZE16pbqQRg,5777
+fractal_server/app/runner/executors/slurm_ssh/runner.py,sha256=tHdgSnEzIZAFvq23UcA8zBndDemMU0cdwW4R4VZfIfA,7137
 fractal_server/app/runner/executors/slurm_sudo/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 fractal_server/app/runner/executors/slurm_sudo/_subprocess_run_as_user.py,sha256=O1bNg1DiSDJmQE0RmOk2Ii47DagiXp5ryd0R6KxO2OM,3177
-fractal_server/app/runner/executors/slurm_sudo/runner.py,sha256=FdhENPwz_efXSPqG3saEvJOUiv6uG1vd0ET61QmbC4o,5136
-fractal_server/app/runner/extract_archive.py,sha256=tLpjDrX47OjTNhhoWvm6iNukg8KoieWyTb7ZfvE9eWU,2483
+fractal_server/app/runner/executors/slurm_sudo/runner.py,sha256=WGGVHX_juqyC6OVhln9yg-YKjLiuAoWZhAGxBjhNkWw,5873
+fractal_server/app/runner/extract_archive.py,sha256=I7UGIHXXuFvlgVPsP7GMWPu2-DiS1EiyBs7a1bvgkxI,2458
 fractal_server/app/runner/filenames.py,sha256=lPnxKHtdRizr6FqG3zOdjDPyWA7GoaJGTtiuJV0gA8E,70
 fractal_server/app/runner/run_subprocess.py,sha256=c3JbYXq3hX2aaflQU19qJ5Xs6J6oXGNvnTEoAfv2bxc,959
 fractal_server/app/runner/set_start_and_last_task_index.py,sha256=-q4zVybAj8ek2XlbENKlfOAJ39hT_zoJoZkqzDqiAMY,1254
 fractal_server/app/runner/shutdown.py,sha256=9pfSKHDNdIcm0eY-opgRTi7y0HmvfPmYiu9JR6Idark,2082
-fractal_server/app/runner/task_files.py,sha256=KfjDqQV9b9wImusxJtQfVST42FyvT3Kza0WooI-6OBk,3198
+fractal_server/app/runner/task_files.py,sha256=wWp7GjZAt04X-N5M1YHDRyWHeNz8kFjK-jSA6DaxC20,3510
 fractal_server/app/runner/v2/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 fractal_server/app/runner/v2/_local.py,sha256=DK8yagbvd6HHjcDVhUzTy0f7MURlTkQha-NM6OZKgJc,3044
 fractal_server/app/runner/v2/_slurm_ssh.py,sha256=_bytOf8z9sdrhI03D6eqg-aQPnJ7V2-qnqpcHAYizns,3278
@@ -177,7 +177,7 @@ fractal_server/migrations/versions/f384e1c0cf5d_drop_task_default_args_columns.p
 fractal_server/migrations/versions/fbce16ff4e47_new_history_items.py,sha256=TDWCaIoM0Q4SpRWmR9zr_rdp3lJXhCfBPTMhtrP5xYE,3950
 fractal_server/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 fractal_server/ssh/__init__.py,sha256=sVUmzxf7_DuXG1xoLQ1_00fo5NPhi2LJipSmU5EAkPs,124
-fractal_server/ssh/_fabric.py,sha256=lNy4IX1I4We6VoWa4Bz4fUPuApLMSoejpyE6I3jDZeM,22869
+fractal_server/ssh/_fabric.py,sha256=gnSv_DaQ8uYLS35Rqb84wo3HRkMazXGVd-D19fo9zqA,22967
 fractal_server/string_tools.py,sha256=niViRrrZAOo0y6pEFI9L_eUYS1PoOiQZUBtngiLc2_k,1877
 fractal_server/syringe.py,sha256=3qSMW3YaMKKnLdgnooAINOPxnCOxP7y2jeAQYB21Gdo,2786
 fractal_server/tasks/__init__.py,sha256=kadmVUoIghl8s190_Tt-8f-WBqMi8u8oU4Pvw39NHE8,23
@@ -207,8 +207,8 @@ fractal_server/tasks/v2/utils_templates.py,sha256=Kc_nSzdlV6KIsO0CQSPs1w70zLyENP
 fractal_server/urls.py,sha256=QjIKAC1a46bCdiPMu3AlpgFbcv6a4l3ABcd5xz190Og,471
 fractal_server/utils.py,sha256=PMwrxWFxRTQRl1b9h-NRIbFGPKqpH_hXnkAT3NfZdpY,3571
 fractal_server/zip_tools.py,sha256=GjDgo_sf6V_DDg6wWeBlZu5zypIxycn_l257p_YVKGc,4876
-fractal_server-2.14.0a22.dist-info/LICENSE,sha256=QKAharUuhxL58kSoLizKJeZE3mTCBnX6ucmz8W0lxlk,1576
-fractal_server-2.14.0a22.dist-info/METADATA,sha256=rfqOHrXBNtolpECEX8UmGRPTaBqE8w2bFgqccZQp_d8,4563
-fractal_server-2.14.0a22.dist-info/WHEEL,sha256=7dDg4QLnNKTvwIDR9Ac8jJaAmBC_owJrckbC0jjThyA,88
-fractal_server-2.14.0a22.dist-info/entry_points.txt,sha256=8tV2kynvFkjnhbtDnxAqImL6HMVKsopgGfew0DOp5UY,58
-fractal_server-2.14.0a22.dist-info/RECORD,,
+fractal_server-2.14.0a24.dist-info/LICENSE,sha256=QKAharUuhxL58kSoLizKJeZE3mTCBnX6ucmz8W0lxlk,1576
+fractal_server-2.14.0a24.dist-info/METADATA,sha256=htM5Xh5zqxmdam8kHpl9STp9eMAU3M_pBW7umqJsBR0,4563
+fractal_server-2.14.0a24.dist-info/WHEEL,sha256=7dDg4QLnNKTvwIDR9Ac8jJaAmBC_owJrckbC0jjThyA,88
+fractal_server-2.14.0a24.dist-info/entry_points.txt,sha256=8tV2kynvFkjnhbtDnxAqImL6HMVKsopgGfew0DOp5UY,58
+fractal_server-2.14.0a24.dist-info/RECORD,,