fractal-server 2.14.0a13__py3-none-any.whl → 2.14.0a14__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45)
  1. fractal_server/__init__.py +1 -1
  2. fractal_server/app/models/linkusergroup.py +6 -2
  3. fractal_server/app/models/v2/dataset.py +1 -1
  4. fractal_server/app/models/v2/job.py +7 -3
  5. fractal_server/app/models/v2/task_group.py +2 -2
  6. fractal_server/app/models/v2/workflow.py +1 -1
  7. fractal_server/app/models/v2/workflowtask.py +1 -1
  8. fractal_server/app/routes/admin/v2/task_group.py +0 -17
  9. fractal_server/app/routes/api/v2/dataset.py +0 -8
  10. fractal_server/app/routes/api/v2/history.py +112 -27
  11. fractal_server/app/routes/api/v2/images.py +16 -14
  12. fractal_server/app/routes/api/v2/project.py +0 -52
  13. fractal_server/app/routes/api/v2/task_group.py +0 -17
  14. fractal_server/app/routes/api/v2/workflow.py +0 -8
  15. fractal_server/app/routes/auth/group.py +0 -16
  16. fractal_server/app/runner/executors/base_runner.py +5 -0
  17. fractal_server/app/runner/executors/local/runner.py +15 -7
  18. fractal_server/app/runner/executors/slurm_common/_handle_exception_proxy.py +17 -0
  19. fractal_server/app/runner/executors/slurm_common/base_slurm_runner.py +676 -0
  20. fractal_server/app/runner/executors/slurm_common/slurm_job_task_models.py +102 -0
  21. fractal_server/app/runner/executors/slurm_ssh/runner.py +110 -648
  22. fractal_server/app/runner/executors/slurm_sudo/runner.py +32 -661
  23. fractal_server/app/runner/task_files.py +20 -6
  24. fractal_server/app/runner/v2/_slurm_ssh.py +6 -6
  25. fractal_server/app/runner/v2/_slurm_sudo.py +4 -4
  26. fractal_server/app/runner/v2/runner.py +4 -0
  27. fractal_server/app/runner/v2/runner_functions.py +2 -2
  28. fractal_server/app/runner/v2/submit_workflow.py +7 -16
  29. fractal_server/app/schemas/v2/__init__.py +3 -1
  30. fractal_server/app/schemas/v2/history.py +27 -2
  31. fractal_server/config.py +6 -2
  32. fractal_server/images/tools.py +23 -0
  33. fractal_server/migrations/versions/5b6007027595_on_cascade.py +250 -0
  34. fractal_server/migrations/versions/fbce16ff4e47_new_history_items.py +2 -2
  35. fractal_server/tasks/v2/utils_background.py +0 -19
  36. {fractal_server-2.14.0a13.dist-info → fractal_server-2.14.0a14.dist-info}/METADATA +1 -1
  37. {fractal_server-2.14.0a13.dist-info → fractal_server-2.14.0a14.dist-info}/RECORD +40 -41
  38. fractal_server/app/runner/executors/slurm_common/_check_jobs_status.py +0 -77
  39. fractal_server/app/runner/executors/slurm_ssh/_check_job_status_ssh.py +0 -67
  40. fractal_server/app/runner/executors/slurm_ssh/_executor_wait_thread.py +0 -126
  41. fractal_server/app/runner/executors/slurm_ssh/_slurm_job.py +0 -116
  42. fractal_server/app/runner/executors/slurm_ssh/executor.py +0 -1386
  43. {fractal_server-2.14.0a13.dist-info → fractal_server-2.14.0a14.dist-info}/LICENSE +0 -0
  44. {fractal_server-2.14.0a13.dist-info → fractal_server-2.14.0a14.dist-info}/WHEEL +0 -0
  45. {fractal_server-2.14.0a13.dist-info → fractal_server-2.14.0a14.dist-info}/entry_points.txt +0 -0
@@ -1,30 +1,11 @@
1
- import json
2
- import math
3
- import sys
4
1
  import time
5
- from copy import copy
6
2
  from pathlib import Path
7
- from typing import Any
8
3
  from typing import Optional
9
4
 
10
- import cloudpickle
11
- from pydantic import BaseModel
12
- from pydantic import ConfigDict
13
-
14
- from ._check_job_status_ssh import get_finished_jobs_ssh
15
- from fractal_server import __VERSION__
16
- from fractal_server.app.runner.exceptions import JobExecutionError
17
- from fractal_server.app.runner.exceptions import TaskExecutionError
18
- from fractal_server.app.runner.executors.base_runner import BaseRunner
19
- from fractal_server.app.runner.executors.slurm_common._batching import (
20
- heuristics,
21
- )
22
- from fractal_server.app.runner.executors.slurm_common._slurm_config import (
23
- SlurmConfig,
24
- )
25
- from fractal_server.app.runner.filenames import SHUTDOWN_FILENAME
26
- from fractal_server.app.runner.task_files import TaskFiles
27
- from fractal_server.app.schemas.v2.task import TaskTypeType
5
+ from ..slurm_common.base_slurm_runner import BaseSlurmRunner
6
+ from ..slurm_common.slurm_job_task_models import SlurmJob
7
+ from fractal_server.app.runner.compress_folder import compress_folder
8
+ from fractal_server.app.runner.extract_archive import extract_archive
28
9
  from fractal_server.config import get_settings
29
10
  from fractal_server.logger import set_logger
30
11
  from fractal_server.ssh._fabric import FractalSSH
@@ -32,676 +13,157 @@ from fractal_server.syringe import Inject
32
13
 
33
14
 
34
15
  logger = set_logger(__name__)
16
+ # FIXME: Transform several logger.info into logger.debug.
35
17
 
36
18
 
37
- def _handle_exception_proxy(proxy): # FIXME
38
- if proxy.exc_type_name == "JobExecutionError":
39
- return JobExecutionError(str(proxy))
40
- else:
41
- kwargs = {}
42
- for key in [
43
- "workflow_task_id",
44
- "workflow_task_order",
45
- "task_name",
46
- ]:
47
- if key in proxy.kwargs.keys():
48
- kwargs[key] = proxy.kwargs[key]
49
- return TaskExecutionError(proxy.tb, **kwargs)
50
-
51
-
52
- class SlurmTask(BaseModel):
53
- model_config = ConfigDict(arbitrary_types_allowed=True)
54
- component: str
55
- workdir_local: Path
56
- workdir_remote: Path
57
- parameters: dict[str, Any]
58
- zarr_url: Optional[str] = None
59
- task_files: TaskFiles
60
- index: int
61
-
62
- @property
63
- def input_pickle_file_local(self) -> str:
64
- return (
65
- self.workdir_local / f"{self.component}-input.pickle"
66
- ).as_posix()
67
-
68
- @property
69
- def output_pickle_file_local(self) -> str:
70
- return (
71
- self.workdir_local / f"{self.component}-output.pickle"
72
- ).as_posix()
73
-
74
- @property
75
- def input_pickle_file_remote(self) -> str:
76
- return (
77
- self.workdir_remote / f"{self.component}-input.pickle"
78
- ).as_posix()
79
-
80
- @property
81
- def output_pickle_file_remote(self) -> str:
82
- return (
83
- self.workdir_remote / f"{self.component}-output.pickle"
84
- ).as_posix()
85
-
86
-
87
- class SlurmJob(BaseModel):
88
- slurm_job_id: Optional[str] = None
89
- label: str
90
- workdir_local: Path
91
- workdir_remote: Path
92
- tasks: list[SlurmTask]
93
-
94
- @property
95
- def slurm_log_file_local(self) -> str:
96
- if self.slurm_job_id:
97
- return (
98
- self.workdir_local
99
- / f"slurm-{self.label}-{self.slurm_job_id}.log"
100
- ).as_posix()
101
- else:
102
- return (
103
- self.workdir_local / f"slurm-{self.label}-%j.log"
104
- ).as_posix()
105
-
106
- @property
107
- def slurm_log_file_remote(self) -> str:
108
- if self.slurm_job_id:
109
- return (
110
- self.workdir_remote
111
- / f"slurm-{self.label}-{self.slurm_job_id}.log"
112
- ).as_posix()
113
- else:
114
- return (
115
- self.workdir_remote / f"slurm-{self.label}-%j.log"
116
- ).as_posix()
117
-
118
- @property
119
- def slurm_submission_script_local(self) -> str:
120
- return (
121
- self.workdir_local / f"slurm-{self.label}-submit.sh"
122
- ).as_posix()
123
-
124
- @property
125
- def slurm_submission_script_remote(self) -> str:
126
- return (
127
- self.workdir_remote / f"slurm-{self.label}-submit.sh"
128
- ).as_posix()
129
-
130
- @property
131
- def slurm_stdout(self) -> str:
132
- return (self.workdir_remote / f"slurm-{self.label}.out").as_posix()
133
-
134
- @property
135
- def slurm_stderr(self) -> str:
136
- return (self.workdir_remote / f"slurm-{self.label}.err").as_posix()
137
-
138
- @property
139
- def log_files_local(self) -> list[str]:
140
- return [task.task_files.log_file_local for task in self.tasks]
141
-
142
-
143
- # def _subprocess_run_or_raise(
144
- # full_command: str,
145
- # ) -> Optional[subprocess.CompletedProcess]:
146
- # try:
147
- # output = subprocess.run( # nosec
148
- # shlex.split(full_command),
149
- # capture_output=True,
150
- # check=True,
151
- # encoding="utf-8",
152
- # )
153
- # return output
154
- # except subprocess.CalledProcessError as e:
155
- # error_msg = (
156
- # f"Submit command `{full_command}` failed. "
157
- # f"Original error:\n{str(e)}\n"
158
- # f"Original stdout:\n{e.stdout}\n"
159
- # f"Original stderr:\n{e.stderr}\n"
160
- # )
161
- # logging.error(error_msg)
162
- # raise JobExecutionError(info=error_msg)
163
-
164
-
165
- class RunnerSlurmSSH(BaseRunner):
19
+ class SlurmSSHRunner(BaseSlurmRunner):
166
20
  fractal_ssh: FractalSSH
167
21
 
168
- slurm_user: str
169
- shutdown_file: Path
170
- common_script_lines: list[str]
171
- user_cache_dir: str
172
- root_dir_local: Path
173
- root_dir_remote: Path
174
- slurm_account: Optional[str] = None
175
- poll_interval: int
176
- python_worker_interpreter: str
177
- jobs: dict[str, SlurmJob]
178
-
179
22
  def __init__(
180
23
  self,
181
24
  *,
182
- fractal_ssh: FractalSSH,
183
- slurm_user: str,
25
+ # Common
184
26
  root_dir_local: Path,
185
27
  root_dir_remote: Path,
186
- slurm_account: Optional[str] = None,
187
28
  common_script_lines: Optional[list[str]] = None,
188
29
  user_cache_dir: Optional[str] = None,
189
- slurm_poll_interval: Optional[int] = None,
30
+ poll_interval: Optional[int] = None,
31
+ # Specific
32
+ fractal_ssh: FractalSSH,
190
33
  ) -> None:
191
34
  """
192
35
  Set parameters that are the same for different Fractal tasks and for
193
36
  different SLURM jobs/tasks.
194
37
  """
195
-
196
- self.slurm_user = slurm_user
197
- self.slurm_account = slurm_account
198
- self.common_script_lines = common_script_lines or []
199
-
200
- # Check that SLURM account is not set here
201
- # FIXME: move to little method
202
- try:
203
- invalid_line = next(
204
- line
205
- for line in self.common_script_lines
206
- if line.startswith("#SBATCH --account=")
207
- )
208
- raise RuntimeError(
209
- "Invalid line in `RunnerSlurmSSH.common_script_lines`: "
210
- f"'{invalid_line}'.\n"
211
- "SLURM account must be set via the request body of the "
212
- "apply-workflow endpoint, or by modifying the user properties."
213
- )
214
- except StopIteration:
215
- pass
216
-
217
- # Check Python versions
218
- settings = Inject(get_settings)
219
38
  self.fractal_ssh = fractal_ssh
220
39
  logger.warning(self.fractal_ssh)
221
40
 
222
- # It is the new handshanke
223
- if settings.FRACTAL_SLURM_WORKER_PYTHON is not None:
224
- self.check_remote_python_interpreter()
225
-
226
- # Initialize connection and perform handshake
227
- self.root_dir_local = root_dir_local
228
- self.root_dir_remote = root_dir_remote
229
-
230
- # # Create folders
231
- # original_umask = os.umask(0)
232
- # self.root_dir_local.mkdir(parents=True, exist_ok=True, mode=0o755)
233
- # os.umask(original_umask)
234
- # _mkdir_as_user(
235
- # folder=self.root_dir_remote.as_posix(),
236
- # user=self.slurm_user,
237
- # )
238
-
239
- self.user_cache_dir = user_cache_dir
240
-
241
- self.slurm_poll_interval = (
242
- slurm_poll_interval or settings.FRACTAL_SLURM_POLL_INTERVAL
243
- )
244
-
245
- self.shutdown_file = self.root_dir_local / SHUTDOWN_FILENAME
246
-
247
- self.python_worker_interpreter = (
248
- settings.FRACTAL_SLURM_WORKER_PYTHON or sys.executable
249
- )
250
-
251
- self.jobs = {}
252
-
253
- def __enter__(self):
254
- return self
255
-
256
- def __exit__(self, exc_type, exc_val, exc_tb):
257
- return False
258
-
259
- def is_shutdown(self) -> bool:
260
- return self.shutdown_file.exists()
261
-
262
- def scancel_jobs(self) -> None:
263
- logger.debug("[scancel_jobs] START")
264
-
265
- if self.jobs:
266
- scancel_string = " ".join(self.job_ids)
267
- scancel_cmd = f"scancel {scancel_string}"
268
- logger.warning(f"Now scancel-ing SLURM jobs {scancel_string}")
269
- try:
270
- self.fractal_ssh.run_command(cmd=scancel_cmd)
271
- # _run_command_as_user(
272
- # cmd=scancel_cmd,
273
- # user=self.slurm_user,
274
- # check=True,
275
- # )
276
- except RuntimeError as e:
277
- logger.warning(
278
- "[scancel_jobs] `scancel` command failed. "
279
- f"Original error:\n{str(e)}"
280
- )
281
-
282
- logger.debug("[scancel_jobs] END")
283
-
284
- def _submit_single_sbatch(
285
- self,
286
- func,
287
- slurm_job: SlurmJob,
288
- slurm_config: SlurmConfig,
289
- ) -> str:
290
- # Prepare input pickle(s)
291
- versions = dict(
292
- python=sys.version_info[:3],
293
- cloudpickle=cloudpickle.__version__,
294
- fractal_server=__VERSION__,
295
- )
296
- for task in slurm_job.tasks:
297
- _args = []
298
- _kwargs = dict(parameters=task.parameters)
299
- funcser = cloudpickle.dumps((versions, func, _args, _kwargs))
300
- with open(task.input_pickle_file_local, "wb") as f:
301
- f.write(funcser)
302
- # Prepare commands to be included in SLURM submission script
303
41
  settings = Inject(get_settings)
304
- python_worker_interpreter = (
305
- settings.FRACTAL_SLURM_WORKER_PYTHON or sys.executable
306
- )
307
- cmdlines = []
308
- for task in slurm_job.tasks:
309
- input_pickle_file = task.input_pickle_file_local
310
- output_pickle_file = task.output_pickle_file_remote
311
- cmdlines.append(
312
- (
313
- f"{python_worker_interpreter}"
314
- " -m fractal_server.app.runner."
315
- "executors.slurm_common.remote "
316
- f"--input-file {input_pickle_file} "
317
- f"--output-file {output_pickle_file}"
318
- )
319
- )
320
-
321
- # ...
322
- num_tasks_max_running = slurm_config.parallel_tasks_per_job
323
- mem_per_task_MB = slurm_config.mem_per_task_MB
324
-
325
- # Set ntasks
326
- ntasks = min(len(cmdlines), num_tasks_max_running)
327
- slurm_config.parallel_tasks_per_job = ntasks
42
+ self.python_worker_interpreter = settings.FRACTAL_SLURM_WORKER_PYTHON
328
43
 
329
- # Prepare SLURM preamble based on SlurmConfig object
330
- script_lines = slurm_config.to_sbatch_preamble(
331
- remote_export_dir=self.user_cache_dir
44
+ super().__init__(
45
+ slurm_runner_type="ssh",
46
+ root_dir_local=root_dir_local,
47
+ root_dir_remote=root_dir_remote,
48
+ common_script_lines=common_script_lines,
49
+ user_cache_dir=user_cache_dir,
50
+ poll_interval=poll_interval,
332
51
  )
333
52
 
334
- # Extend SLURM preamble with variable which are not in SlurmConfig, and
335
- # fix their order
336
- script_lines.extend(
337
- [
338
- f"#SBATCH --err={slurm_job.slurm_stderr}",
339
- f"#SBATCH --out={slurm_job.slurm_stdout}",
340
- f"#SBATCH -D {slurm_job.workdir_remote}",
341
- ]
342
- )
343
- script_lines = slurm_config.sort_script_lines(script_lines)
344
- logger.debug(script_lines)
53
+ def _mkdir_local_folder(self, folder: str) -> None:
54
+ Path(folder).mkdir(parents=True)
345
55
 
346
- # Always print output of `uname -n` and `pwd`
347
- script_lines.append(
348
- '"Hostname: `uname -n`; current directory: `pwd`"\n'
56
+ def _mkdir_remote_folder(self, folder: str):
57
+ self.fractal_ssh.mkdir(
58
+ folder=folder,
59
+ parents=True,
349
60
  )
350
61
 
351
- # Complete script preamble
352
- script_lines.append("\n")
62
+ def _copy_files_from_remote_to_local(self, slurm_job: SlurmJob) -> None:
63
+ self._get_subfolder_sftp(job=slurm_job)
353
64
 
354
- # Include command lines
355
- tmp_list_commands = copy(cmdlines)
356
- while tmp_list_commands:
357
- if tmp_list_commands:
358
- cmd = tmp_list_commands.pop(0) # take first element
359
- script_lines.append(
360
- "srun --ntasks=1 --cpus-per-task=$SLURM_CPUS_PER_TASK "
361
- f"--mem={mem_per_task_MB}MB "
362
- f"{cmd} &"
363
- )
364
- script_lines.append("wait\n")
365
-
366
- script = "\n".join(script_lines)
65
+ def _put_subfolder_sftp(self, job: SlurmJob) -> None:
66
+ # FIXME re-introduce use of this function, but only after splitting
67
+ # submission logic into
68
+ # 1. prepare all
69
+ # 2. send folder
70
+ # 3. submit all
71
+ """
72
+ Transfer the jobs subfolder to the remote host.
73
+ """
367
74
 
368
- # Write submission script
369
- # submission_script_contents = "\n".join(preamble_lines + cmdlines)
370
- with open(slurm_job.slurm_submission_script_local, "w") as f:
371
- f.write(script)
75
+ # Create compressed subfolder archive (locally)
76
+ tarfile_path_local = compress_folder(job.workdir_local)
372
77
 
78
+ tarfile_name = Path(tarfile_path_local).name
79
+ logger.info(f"Subfolder archive created at {tarfile_path_local}")
80
+ tarfile_path_remote = (
81
+ job.workdir_remote.parent / tarfile_name
82
+ ).as_posix()
83
+ # Transfer archive
84
+ t_0_put = time.perf_counter()
373
85
  self.fractal_ssh.send_file(
374
- local=slurm_job.slurm_submission_script_local,
375
- remote=slurm_job.slurm_submission_script_remote,
86
+ local=tarfile_path_local,
87
+ remote=tarfile_path_remote,
376
88
  )
377
-
378
- # Run sbatch
379
- submit_command = (
380
- f"sbatch --parsable {slurm_job.slurm_submission_script_remote}"
89
+ t_1_put = time.perf_counter()
90
+ logger.info(
91
+ f"Subfolder archive transferred to {tarfile_path_remote}"
92
+ f" - elapsed: {t_1_put - t_0_put:.3f} s"
381
93
  )
382
- pre_submission_cmds = slurm_config.pre_submission_commands
383
- if len(pre_submission_cmds) == 0:
384
- sbatch_stdout = self.fractal_ssh.run_command(cmd=submit_command)
385
- else:
386
- logger.debug(f"Now using {pre_submission_cmds=}")
387
- script_lines = pre_submission_cmds + [submit_command]
388
- script_content = "\n".join(script_lines)
389
- script_content = f"{script_content}\n"
390
- script_path_remote = (
391
- f"{slurm_job.slurm_script_remote.as_posix()}_wrapper.sh"
392
- )
393
- self.fractal_ssh.write_remote_file(
394
- path=script_path_remote, content=script_content
395
- )
396
- cmd = f"bash {script_path_remote}"
397
- sbatch_stdout = self.fractal_ssh.run_command(cmd=cmd)
398
-
399
- # Submit SLURM job and retrieve job ID
400
- stdout = sbatch_stdout.strip("\n")
401
- submitted_job_id = int(stdout)
402
- slurm_job.slurm_job_id = str(submitted_job_id)
403
-
404
- # Add job to self.jobs
405
- self.jobs[slurm_job.slurm_job_id] = slurm_job
406
- logger.debug(f"Added {slurm_job.slurm_job_id} to self.jobs.")
94
+ # Uncompress archive (remotely)
95
+ tar_command = (
96
+ f"{self.python_worker_interpreter} -m "
97
+ "fractal_server.app.runner.extract_archive "
98
+ f"{tarfile_path_remote}"
99
+ )
100
+ self.fractal_ssh.run_command(cmd=tar_command)
407
101
 
408
- @property
409
- def job_ids(self) -> list[str]:
410
- return list(self.jobs.keys())
102
+ # Remove local version
103
+ t_0_rm = time.perf_counter()
104
+ Path(tarfile_path_local).unlink()
105
+ t_1_rm = time.perf_counter()
106
+ logger.info(
107
+ f"Local archive removed - elapsed: {t_1_rm - t_0_rm:.3f} s"
108
+ )
411
109
 
412
- def _copy_files_from_remote_to_local(self, job: SlurmJob) -> None:
413
- # FIXME: This should only transfer archives, not single files
110
+ def _get_subfolder_sftp(self, job: SlurmJob) -> None:
414
111
  """
415
- Note: this would differ for SSH
112
+ Fetch a remote folder via tar+sftp+tar
416
113
  """
417
- source_target_list = [
418
- (job.slurm_log_file_remote, job.slurm_log_file_local)
419
- ]
420
- for task in job.tasks:
421
- source_target_list.extend(
422
- [
423
- (
424
- task.output_pickle_file_remote,
425
- task.output_pickle_file_local,
426
- ),
427
- (
428
- task.task_files.log_file_remote,
429
- task.task_files.log_file_local,
430
- ),
431
- (
432
- task.task_files.args_file_remote,
433
- task.task_files.args_file_local,
434
- ),
435
- (
436
- task.task_files.metadiff_file_remote,
437
- task.task_files.metadiff_file_local,
438
- ),
439
- ]
440
- )
441
114
 
442
- for source, target in source_target_list:
443
- try:
444
- self.fractal_ssh.fetch_file(local=target, remote=source)
445
- # res = _run_command_as_user(
446
- # cmd=f"cat {source}",
447
- # user=self.slurm_user,
448
- # encoding=None,
449
- # check=True,
450
- # )
451
- # Write local file
452
- # with open(target, "wb") as f:
453
- # f.write(res.stdout)
454
- # logger.critical(f"Copied {source} into {target}")
455
- except (RuntimeError, FileNotFoundError) as e:
456
- logger.warning(
457
- f"SKIP copy {target} into {source}. "
458
- f"Original error: {str(e)}"
459
- )
115
+ t_0 = time.perf_counter()
116
+ logger.debug("[_get_subfolder_sftp] Start")
117
+ tarfile_path_local = (
118
+ job.workdir_local.parent / f"{job.workdir_local.name}.tar.gz"
119
+ ).as_posix()
120
+ tarfile_path_remote = (
121
+ job.workdir_remote.parent / f"{job.workdir_remote.name}.tar.gz"
122
+ ).as_posix()
460
123
 
461
- def _postprocess_single_task(
462
- self, *, task: SlurmTask
463
- ) -> tuple[Any, Exception]:
124
+ # Remove remote tarfile
464
125
  try:
465
- with open(task.output_pickle_file_local, "rb") as f:
466
- outdata = f.read()
467
- success, output = cloudpickle.loads(outdata)
468
- if success:
469
- result = output
470
- return result, None
471
- else:
472
- exception = _handle_exception_proxy(output)
473
- return None, exception
474
- except Exception as e:
475
- exception = JobExecutionError(f"ERROR, {str(e)}")
476
- return None, exception
477
- finally:
478
- Path(task.input_pickle_file_local).unlink(missing_ok=True)
479
- Path(task.output_pickle_file_local).unlink(missing_ok=True)
480
-
481
- def submit(
482
- self,
483
- func: callable,
484
- parameters: dict[str, Any],
485
- history_item_id: int,
486
- task_files: TaskFiles,
487
- slurm_config: SlurmConfig,
488
- task_type: TaskTypeType,
489
- ) -> tuple[Any, Exception]:
490
- workdir_local = task_files.wftask_subfolder_local
491
- workdir_remote = task_files.wftask_subfolder_remote
492
-
493
- task_files = TaskFiles(
494
- **task_files.model_dump(
495
- exclude={"component"},
496
- ),
497
- # FIXME _COMPONENT_KEY_ is deprecated
498
- component="FIXME_INVALID_FAKE_VALUE",
499
- # component=parameters[_COMPONENT_KEY_],
500
- )
501
-
502
- if self.jobs != {}:
503
- raise JobExecutionError("Unexpected branch: jobs should be empty.")
504
-
505
- if self.is_shutdown():
506
- raise JobExecutionError("Cannot continue after shutdown.")
507
-
508
- # Validation phase
509
- self.validate_submit_parameters(
510
- parameters=parameters,
511
- task_type=task_type,
512
- )
513
-
514
- # Create task subfolder
515
- workdir_local.mkdir(parents=True)
516
- self.fractal_ssh.mkdir(
517
- folder=workdir_remote.as_posix(),
518
- parents=True,
519
- )
520
-
521
- # Submission phase
522
- slurm_job = SlurmJob(
523
- label="0",
524
- workdir_local=workdir_local,
525
- workdir_remote=workdir_remote,
526
- tasks=[
527
- SlurmTask(
528
- index=0,
529
- component="0",
530
- parameters=parameters,
531
- workdir_remote=workdir_remote,
532
- workdir_local=workdir_local,
533
- task_files=task_files,
534
- )
535
- ],
536
- ) # TODO: replace with actual values (BASED ON TASKFILES)
537
-
538
- slurm_config.parallel_tasks_per_job = 1
539
- self._submit_single_sbatch(
540
- func,
541
- slurm_job=slurm_job,
542
- slurm_config=slurm_config,
126
+ rm_command = f"rm {tarfile_path_remote}"
127
+ self.fractal_ssh.run_command(cmd=rm_command)
128
+ except RuntimeError as e:
129
+ logger.warning(f"{tarfile_path_remote} already exists!\n {str(e)}")
130
+
131
+ # Create remote tarfile
132
+ # FIXME: introduce filtering by prefix, so that when the subfolder
133
+ # includes N SLURM jobs we don't always copy the cumulative folder
134
+ # but only the relevant part
135
+ tar_command = (
136
+ f"{self.python_worker_interpreter} "
137
+ "-m fractal_server.app.runner.compress_folder "
138
+ f"{job.workdir_remote.as_posix()} "
139
+ "--remote-to-local"
543
140
  )
141
+ stdout = self.fractal_ssh.run_command(cmd=tar_command)
142
+ print(stdout)
544
143
 
545
- # Retrieval phase
546
- while len(self.jobs) > 0:
547
- if self.is_shutdown():
548
- self.scancel_jobs()
549
- finished_job_ids = get_finished_jobs_ssh(
550
- job_ids=self.job_ids,
551
- fractal_ssh=self.fractal_ssh,
552
- )
553
- for slurm_job_id in finished_job_ids:
554
- slurm_job = self.jobs.pop(slurm_job_id)
555
- self._copy_files_from_remote_to_local(slurm_job)
556
- result, exception = self._postprocess_single_task(
557
- task=slurm_job.tasks[0]
558
- )
559
- time.sleep(self.slurm_poll_interval)
560
-
561
- return result, exception
562
-
563
- def multisubmit(
564
- self,
565
- func: callable,
566
- list_parameters: list[dict],
567
- history_item_id: int,
568
- task_files: TaskFiles,
569
- slurm_config: SlurmConfig,
570
- task_type: TaskTypeType,
571
- ):
572
- # self.scancel_jobs()
573
-
574
- self.validate_multisubmit_parameters(
575
- list_parameters=list_parameters,
576
- task_type=task_type,
144
+ # Fetch tarfile
145
+ t_0_get = time.perf_counter()
146
+ self.fractal_ssh.fetch_file(
147
+ remote=tarfile_path_remote,
148
+ local=tarfile_path_local,
577
149
  )
578
-
579
- workdir_local = task_files.wftask_subfolder_local
580
- workdir_remote = task_files.wftask_subfolder_remote
581
-
582
- # Create local&remote task subfolders
583
- if task_type not in ["compound", "converter_compound"]:
584
- workdir_local.mkdir(parents=True)
585
- self.fractal_ssh.mkdir(
586
- folder=workdir_remote.as_posix(),
587
- parents=True,
588
- )
589
-
590
- # Execute tasks, in chunks of size `parallel_tasks_per_job`
591
- # TODO Pick a data structure for results and exceptions, or review the
592
- # interface
593
- results: dict[int, Any] = {}
594
- exceptions: dict[int, BaseException] = {}
595
-
596
- original_task_files = task_files
597
- tot_tasks = len(list_parameters)
598
-
599
- # Set/validate parameters for task batching
600
- tasks_per_job, parallel_tasks_per_job = heuristics(
601
- # Number of parallel components (always known)
602
- tot_tasks=tot_tasks,
603
- # Optional WorkflowTask attributes:
604
- tasks_per_job=slurm_config.tasks_per_job,
605
- parallel_tasks_per_job=slurm_config.parallel_tasks_per_job, # noqa
606
- # Task requirements (multiple possible sources):
607
- cpus_per_task=slurm_config.cpus_per_task,
608
- mem_per_task=slurm_config.mem_per_task_MB,
609
- # Fractal configuration variables (soft/hard limits):
610
- target_cpus_per_job=slurm_config.target_cpus_per_job,
611
- target_mem_per_job=slurm_config.target_mem_per_job,
612
- target_num_jobs=slurm_config.target_num_jobs,
613
- max_cpus_per_job=slurm_config.max_cpus_per_job,
614
- max_mem_per_job=slurm_config.max_mem_per_job,
615
- max_num_jobs=slurm_config.max_num_jobs,
150
+ t_1_get = time.perf_counter()
151
+ logger.info(
152
+ f"Subfolder archive transferred back to {tarfile_path_local}"
153
+ f" - elapsed: {t_1_get - t_0_get:.3f} s"
616
154
  )
617
- slurm_config.parallel_tasks_per_job = parallel_tasks_per_job
618
- slurm_config.tasks_per_job = tasks_per_job
619
-
620
- # Divide arguments in batches of `tasks_per_job` tasks each
621
- args_batches = []
622
- batch_size = tasks_per_job
623
- for ind_chunk in range(0, tot_tasks, batch_size):
624
- args_batches.append(
625
- list_parameters[ind_chunk : ind_chunk + batch_size] # noqa
626
- )
627
- if len(args_batches) != math.ceil(tot_tasks / tasks_per_job):
628
- raise RuntimeError("Something wrong here while batching tasks")
629
155
 
630
- logger.info(f"START submission phase, {list(self.jobs.keys())=}")
631
- for ind_batch, chunk in enumerate(args_batches):
632
- # TODO: replace with actual values
633
- tasks = []
634
- for ind_chunk, parameters in enumerate(chunk):
635
- # FIXME: _COMPONENT_KEY_ is deprecated
636
- # component = parameters[_COMPONENT_KEY_]
637
- component = "INVALID_FAKE_VALUE_FIXME"
638
- tasks.append(
639
- SlurmTask(
640
- index=(ind_batch * batch_size) + ind_chunk,
641
- component=component,
642
- workdir_local=workdir_local,
643
- workdir_remote=workdir_remote,
644
- parameters=parameters,
645
- zarr_url=parameters["zarr_url"],
646
- task_files=TaskFiles(
647
- **original_task_files.model_dump(
648
- exclude={"component"}
649
- ),
650
- component=component,
651
- ),
652
- ),
653
- )
156
+ # Extract tarfile locally
157
+ extract_archive(Path(tarfile_path_local))
654
158
 
655
- slurm_job = SlurmJob(
656
- label=f"{ind_batch:06d}",
657
- workdir_local=workdir_local,
658
- workdir_remote=workdir_remote,
659
- tasks=tasks,
660
- )
661
- self._submit_single_sbatch(
662
- func,
663
- slurm_job=slurm_job,
664
- slurm_config=slurm_config,
665
- )
666
- logger.info(f"END submission phase, {list(self.jobs.keys())=}")
159
+ # Remove local tarfile
160
+ if Path(tarfile_path_local).exists():
161
+ logger.warning(f"Remove existing file {tarfile_path_local}.")
162
+ Path(tarfile_path_local).unlink()
667
163
 
668
- # Retrieval phase
669
- while len(self.jobs) > 0:
670
- if self.is_shutdown():
671
- self.scancel_jobs()
672
- finished_job_ids = get_finished_jobs_ssh(
673
- job_ids=self.job_ids,
674
- fractal_ssh=self.fractal_ssh,
675
- )
676
- for slurm_job_id in finished_job_ids:
677
- slurm_job = self.jobs.pop(slurm_job_id)
678
- self._copy_files_from_remote_to_local(slurm_job)
679
- for task in slurm_job.tasks:
680
- result, exception = self._postprocess_single_task(
681
- task=task
682
- )
683
- if exception is None:
684
- results[task.index] = result
685
- else:
686
- exceptions[task.index] = exception
687
- time.sleep(self.slurm_poll_interval)
688
- return results, exceptions
164
+ t_1 = time.perf_counter()
165
+ logger.info(f"[_get_subfolder_sftp] End - elapsed: {t_1 - t_0:.3f} s")
689
166
 
690
- def check_remote_python_interpreter(self):
691
- settings = Inject(get_settings)
692
- cmd = (
693
- f"{self.python_worker_interpreter} "
694
- "-m fractal_server.app.runner.versions"
695
- )
167
+ def _run_remote_cmd(self, cmd: str) -> str:
696
168
  stdout = self.fractal_ssh.run_command(cmd=cmd)
697
- remote_version = json.loads(stdout.strip("\n"))["fractal_server"]
698
- if remote_version != __VERSION__:
699
- error_msg = (
700
- "Fractal-server version mismatch.\n"
701
- "Local interpreter: "
702
- f"({sys.executable}): {__VERSION__}.\n"
703
- "Remote interpreter: "
704
- f"({settings.FRACTAL_SLURM_WORKER_PYTHON}): {remote_version}."
705
- )
706
- logger.error(error_msg)
707
- raise RuntimeError(error_msg)
169
+ return stdout