fractal-server 2.2.0a1__py3-none-any.whl → 2.3.0a0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67)
  1. fractal_server/__init__.py +1 -1
  2. fractal_server/app/models/v1/state.py +1 -2
  3. fractal_server/app/routes/admin/v1.py +2 -2
  4. fractal_server/app/routes/admin/v2.py +2 -2
  5. fractal_server/app/routes/api/v1/job.py +2 -2
  6. fractal_server/app/routes/api/v1/task_collection.py +4 -4
  7. fractal_server/app/routes/api/v2/__init__.py +23 -3
  8. fractal_server/app/routes/api/v2/job.py +2 -2
  9. fractal_server/app/routes/api/v2/submit.py +6 -0
  10. fractal_server/app/routes/api/v2/task_collection.py +74 -34
  11. fractal_server/app/routes/api/v2/task_collection_custom.py +144 -0
  12. fractal_server/app/routes/api/v2/task_collection_ssh.py +125 -0
  13. fractal_server/app/routes/aux/_runner.py +10 -2
  14. fractal_server/app/runner/compress_folder.py +120 -0
  15. fractal_server/app/runner/executors/slurm/__init__.py +0 -3
  16. fractal_server/app/runner/executors/slurm/_batching.py +0 -1
  17. fractal_server/app/runner/executors/slurm/_slurm_config.py +9 -9
  18. fractal_server/app/runner/executors/slurm/ssh/__init__.py +3 -0
  19. fractal_server/app/runner/executors/slurm/ssh/_executor_wait_thread.py +112 -0
  20. fractal_server/app/runner/executors/slurm/ssh/_slurm_job.py +120 -0
  21. fractal_server/app/runner/executors/slurm/ssh/executor.py +1490 -0
  22. fractal_server/app/runner/executors/slurm/sudo/__init__.py +3 -0
  23. fractal_server/app/runner/executors/slurm/{_check_jobs_status.py → sudo/_check_jobs_status.py} +1 -1
  24. fractal_server/app/runner/executors/slurm/{_executor_wait_thread.py → sudo/_executor_wait_thread.py} +1 -1
  25. fractal_server/app/runner/executors/slurm/{_subprocess_run_as_user.py → sudo/_subprocess_run_as_user.py} +1 -1
  26. fractal_server/app/runner/executors/slurm/{executor.py → sudo/executor.py} +12 -12
  27. fractal_server/app/runner/extract_archive.py +38 -0
  28. fractal_server/app/runner/v1/__init__.py +78 -40
  29. fractal_server/app/runner/v1/_slurm/__init__.py +1 -1
  30. fractal_server/app/runner/v2/__init__.py +147 -62
  31. fractal_server/app/runner/v2/_local_experimental/__init__.py +22 -12
  32. fractal_server/app/runner/v2/_local_experimental/executor.py +12 -8
  33. fractal_server/app/runner/v2/_slurm/__init__.py +1 -6
  34. fractal_server/app/runner/v2/_slurm_ssh/__init__.py +126 -0
  35. fractal_server/app/runner/v2/_slurm_ssh/_submit_setup.py +83 -0
  36. fractal_server/app/runner/v2/_slurm_ssh/get_slurm_config.py +182 -0
  37. fractal_server/app/runner/v2/runner_functions_low_level.py +9 -11
  38. fractal_server/app/runner/versions.py +30 -0
  39. fractal_server/app/schemas/v1/__init__.py +1 -0
  40. fractal_server/app/schemas/{state.py → v1/state.py} +4 -21
  41. fractal_server/app/schemas/v2/__init__.py +4 -1
  42. fractal_server/app/schemas/v2/task_collection.py +97 -27
  43. fractal_server/config.py +184 -3
  44. fractal_server/main.py +25 -1
  45. fractal_server/ssh/__init__.py +4 -0
  46. fractal_server/ssh/_fabric.py +190 -0
  47. fractal_server/tasks/utils.py +12 -64
  48. fractal_server/tasks/v1/background_operations.py +2 -2
  49. fractal_server/tasks/{endpoint_operations.py → v1/endpoint_operations.py} +7 -12
  50. fractal_server/tasks/v1/utils.py +67 -0
  51. fractal_server/tasks/v2/_TaskCollectPip.py +61 -32
  52. fractal_server/tasks/v2/_venv_pip.py +195 -0
  53. fractal_server/tasks/v2/background_operations.py +257 -295
  54. fractal_server/tasks/v2/background_operations_ssh.py +304 -0
  55. fractal_server/tasks/v2/endpoint_operations.py +136 -0
  56. fractal_server/tasks/v2/templates/_1_create_venv.sh +46 -0
  57. fractal_server/tasks/v2/templates/_2_upgrade_pip.sh +30 -0
  58. fractal_server/tasks/v2/templates/_3_pip_install.sh +32 -0
  59. fractal_server/tasks/v2/templates/_4_pip_freeze.sh +21 -0
  60. fractal_server/tasks/v2/templates/_5_pip_show.sh +59 -0
  61. fractal_server/tasks/v2/utils.py +54 -0
  62. {fractal_server-2.2.0a1.dist-info → fractal_server-2.3.0a0.dist-info}/METADATA +4 -2
  63. {fractal_server-2.2.0a1.dist-info → fractal_server-2.3.0a0.dist-info}/RECORD +66 -42
  64. fractal_server/tasks/v2/get_collection_data.py +0 -14
  65. {fractal_server-2.2.0a1.dist-info → fractal_server-2.3.0a0.dist-info}/LICENSE +0 -0
  66. {fractal_server-2.2.0a1.dist-info → fractal_server-2.3.0a0.dist-info}/WHEEL +0 -0
  67. {fractal_server-2.2.0a1.dist-info → fractal_server-2.3.0a0.dist-info}/entry_points.txt +0 -0
@@ -0,0 +1,120 @@
1
+ import shlex
2
+ import subprocess # nosec
3
+ import sys
4
+ import tarfile
5
+ import time
6
+ from pathlib import Path
7
+ from typing import Optional
8
+
9
+
10
+ # COMPRESS_FOLDER_MODALITY = "python"
11
+ COMPRESS_FOLDER_MODALITY = "cp-tar-rmtree"
12
+
13
+
14
+ def _filter(info: tarfile.TarInfo) -> Optional[tarfile.TarInfo]:
15
+ if info.name.endswith(".pickle"):
16
+ filename = info.name.split("/")[-1]
17
+ parts = filename.split("_")
18
+ if len(parts) == 3 and parts[1] == "in":
19
+ return None
20
+ elif len(parts) == 5 and parts[3] == "in":
21
+ return None
22
+ elif info.name.endswith("slurm_submit.sbatch"):
23
+ return None
24
+ return info
25
+
26
+
27
if __name__ == "__main__":
    # CLI entry point: compress a per-task job subfolder into
    # `<subfolder>.tar.gz` (placed next to the subfolder), excluding
    # input pickles and sbatch scripts.
    help_msg = (
        "Expected use:\n"
        "python -m fractal_server.app.runner.compress_folder "
        "path/to/folder"
    )

    # Exactly one CLI argument (the folder to compress) is required.
    if len(sys.argv[1:]) != 1:
        raise ValueError(
            "Invalid argument(s).\n" f"{help_msg}\n" f"Provided: {sys.argv=}"
        )

    subfolder_path = Path(sys.argv[1])
    t_0 = time.perf_counter()
    print("[compress_folder.py] START")
    print(f"[compress_folder.py] {COMPRESS_FOLDER_MODALITY=}")
    print(f"[compress_folder.py] {subfolder_path=}")

    # The archive is written next to the subfolder, as `<name>.tar.gz`.
    job_folder = subfolder_path.parent
    subfolder_name = subfolder_path.name
    tarfile_path = (job_folder / f"{subfolder_name}.tar.gz").as_posix()
    print(f"[compress_folder.py] {tarfile_path=}")

    if COMPRESS_FOLDER_MODALITY == "python":
        # The pure-Python `tarfile`-based implementation is not supported;
        # the unreachable code that used to follow this `raise` was removed.
        raise NotImplementedError()
    elif COMPRESS_FOLDER_MODALITY == "cp-tar-rmtree":
        import shutil

        # Work on a temporary copy of the subfolder, presumably so that
        # `tar` does not race with files being created/removed in the
        # original folder — TODO confirm rationale.
        subfolder_path_tmp_copy = (
            subfolder_path.parent / f"{subfolder_path.name}_copy"
        )

        t0 = time.perf_counter()
        cmd_cp = (
            "cp -r "
            f"{subfolder_path.as_posix()} "
            f"{subfolder_path_tmp_copy.as_posix()}"
        )
        # `check=True`: a failing copy aborts the whole script with
        # CalledProcessError. The result object is not needed afterwards.
        subprocess.run(  # nosec
            shlex.split(cmd_cp),
            check=True,
            capture_output=True,
            encoding="utf-8",
        )
        t1 = time.perf_counter()
        print("[compress_folder.py] `cp -r` END - " f"elapsed: {t1-t0:.3f} s")

        cmd_tar = (
            "tar czf "
            f"{tarfile_path} "
            "--exclude *sbatch --exclude *_in_*.pickle "
            f"--directory={subfolder_path_tmp_copy.as_posix()} "
            "."
        )

        print(f"[compress_folder.py] cmd tar:\n{cmd_tar}")
        t0 = time.perf_counter()
        # No `check=True` here: the return code is inspected below, so that
        # the temporary copy can be cleaned up before exiting on error.
        res = subprocess.run(  # nosec
            shlex.split(cmd_tar),
            capture_output=True,
            encoding="utf-8",
        )
        t1 = time.perf_counter()
        t_1 = time.perf_counter()
        print(f"[compress_folder.py] tar END - elapsed: {t1-t0:.3f} s")

        print(f"[compress_folder] END - elapsed {t_1 - t_0:.3f} seconds")

        if res.returncode != 0:
            # On tar failure: log stdout/stderr, remove the temporary copy,
            # and exit with a non-zero status.
            print("[compress_folder.py] ERROR in tar")
            print(f"[compress_folder.py] tar stdout:\n{res.stdout}")
            print(f"[compress_folder.py] tar stderr:\n{res.stderr}")

            shutil.rmtree(subfolder_path_tmp_copy)
            sys.exit(1)

        t0 = time.perf_counter()
        shutil.rmtree(subfolder_path_tmp_copy)
        t1 = time.perf_counter()
        print(
            f"[compress_folder.py] shutil.rmtree END - elapsed: {t1-t0:.3f} s"
        )

        t_1 = time.perf_counter()
        print(f"[compress_folder] END - elapsed {t_1 - t_0:.3f} seconds")
@@ -1,3 +0,0 @@
1
- from .executor import SlurmExecutor
2
-
3
- __all__ = ["SlurmExecutor"]
@@ -33,7 +33,6 @@ def _estimate_parallel_tasks_per_job(
33
33
  """
34
34
  Compute how many parallel tasks can fit in a given SLURM job
35
35
 
36
-
37
36
  Note: If more resources than available are requested, return 1. This
38
37
  assumes that further checks will be performed on the output of the current
39
38
  function, as is the case in the `heuristics` function below.
@@ -312,7 +312,7 @@ class SlurmConfig(BaseModel, extra=Extra.forbid):
312
312
 
313
313
  def to_sbatch_preamble(
314
314
  self,
315
- user_cache_dir: Optional[str] = None,
315
+ remote_export_dir: Optional[str] = None,
316
316
  ) -> list[str]:
317
317
  """
318
318
  Compile `SlurmConfig` object into the preamble of a SLURM submission
@@ -345,14 +345,14 @@ class SlurmConfig(BaseModel, extra=Extra.forbid):
345
345
  if value is not None:
346
346
  # Handle the `time` parameter
347
347
  if key == "time" and self.parallel_tasks_per_job > 1:
348
+ # FIXME SSH: time setting must be handled better. Right now
349
+ # we simply propagate `time`, but this is not enough when
350
+ # several `srun` are combined in a single script.
348
351
  logger.warning(
349
- "Ignore `#SBATCH --time=...` line (given: "
350
- f"{self.time=}) for parallel_tasks_per_job>1"
351
- f" (given: {self.parallel_tasks_per_job}), "
352
- "since scaling of time with number of tasks is "
353
- "not implemented."
352
+ f"`time` SLURM parameter is set to {self.time}, "
353
+ "but this does not take into account the number of "
354
+ f"SLURM tasks ({self.parallel_tasks_per_job})."
354
355
  )
355
- continue
356
356
  option = key.replace("_", "-")
357
357
  lines.append(f"{self.prefix} --{option}={value}")
358
358
 
@@ -361,12 +361,12 @@ class SlurmConfig(BaseModel, extra=Extra.forbid):
361
361
  lines.append(line)
362
362
 
363
363
  if self.user_local_exports:
364
- if user_cache_dir is None:
364
+ if remote_export_dir is None:
365
365
  raise ValueError(
366
366
  f"user_cache_dir=None but {self.user_local_exports=}"
367
367
  )
368
368
  for key, value in self.user_local_exports.items():
369
- tmp_value = str(Path(user_cache_dir) / value)
369
+ tmp_value = str(Path(remote_export_dir) / value)
370
370
  lines.append(f"export {key}={tmp_value}")
371
371
 
372
372
  """
@@ -0,0 +1,3 @@
1
+ from .executor import SlurmExecutor
2
+
3
+ __all__ = ["SlurmExecutor"]
@@ -0,0 +1,112 @@
1
+ import os
2
+ import time
3
+ import traceback
4
+ from itertools import count
5
+ from typing import Callable
6
+
7
+ from cfut import FileWaitThread
8
+
9
+ from ......logger import set_logger
10
+
11
+ logger = set_logger(__name__)
12
+
13
+
14
class FractalSlurmWaitThread(FileWaitThread):
    """
    Customized version of clusterfutures' `FileWaitThread`, where:

    1. Each jobid in the waiting list is associated to a tuple of filenames,
       rather than a single one.
    2. The `check` method avoids output-file existence checks (which would
       require `sudo -u user ls` calls) and only looks for the existence of
       the shutdown file; deciding whether a job is complete is deferred to
       the `cfut.slurm.jobs_finished` function.
    3. Additional attributes are available (see below).

    This class is based on clusterfutures 0.5. Original Copyright: 2022
    Adrian Sampson, released under the MIT licence

    Attributes:
        shutdown_file: Path whose existence triggers executor shutdown.
        shutdown_callback: Called once when shutdown is detected.
        jobs_finished_callback: Maps the active job IDs to the set of
            finished ones.
        slurm_poll_interval: Seconds between SLURM polls (approximate).
        active_job_ids: SLURM job IDs currently being waited for.
    """

    shutdown_file: str
    shutdown_callback: Callable
    jobs_finished_callback: Callable
    slurm_poll_interval = 30
    active_job_ids: list[str]

    def __init__(self, *args, **kwargs):
        """
        Initialize the thread with an empty list of active jobs.

        This method is executed on the main thread.
        """
        super().__init__(*args, **kwargs)
        self.active_job_ids = []

    def wait(self, *, job_id: str):
        """
        Register a new job in the set of jobs being waited for.

        This method is executed on the main thread.
        """
        with self.lock:
            self.active_job_ids.append(job_id)

    def check_shutdown(self):
        """
        Set the shutdown flag if the shutdown file exists.

        This method is executed on the waiting thread.
        """
        if not os.path.exists(self.shutdown_file):
            return
        logger.info(f"Detected executor-shutdown file {self.shutdown_file}")
        self.shutdown = True

    def check_jobs(self):
        """
        Detect whether all active jobs are over, and trigger the callback.

        This method is executed on the waiting thread.
        """
        try:
            if not self.active_job_ids:
                return
            finished = self.jobs_finished_callback(self.active_job_ids)
            # Fire the callback only once *all* active jobs are finished.
            if finished == set(self.active_job_ids):
                self.callback(self.active_job_ids)
                self.active_job_ids = []
        except Exception:
            # Never let an error kill the waiting thread; just report it.
            traceback.print_exc()

    def run(self):
        """
        Loop forever (until a shutdown takes place) and trigger callbacks.

        This method is executed on the waiting thread.

        Note that `shutdown_callback` only takes care of cleaning up the
        FractalSlurmExecutor variables, so the plain `return` here is enough
        to fully clean up the `FractalFileWaitThread` object.
        """

        # FIXME SSH: are those try/except below needed?

        # Poll SLURM only every `poll_every` iterations of the main loop.
        poll_every = max(self.slurm_poll_interval // self.interval, 1)
        for iteration in count():
            self.check_shutdown()
            if self.shutdown:
                try:
                    self.shutdown_callback()
                except Exception:  # nosec
                    pass
                return
            if iteration % poll_every == 0:
                with self.lock:
                    try:
                        self.check_jobs()
                    except Exception:  # nosec
                        pass
            time.sleep(self.interval)
@@ -0,0 +1,120 @@
1
+ from pathlib import Path
2
+ from typing import Optional
3
+
4
+ from cfut.util import random_string
5
+
6
+ from fractal_server.app.runner.executors.slurm._slurm_config import (
7
+ SlurmConfig,
8
+ )
9
+
10
+
11
class SlurmJob:
    """
    Collect information related to a FractalSlurmSSHExecutor job

    This includes three groups of attributes:

    1. Attributes related to the (possibly multi-task) SLURM job, e.g.
       submission-file path.
    2. Attributes related to single tasks, e.g. the paths of their
       input/output pickle files.
    3. SLURM configuration options, encoded in a SlurmConfig object.

    Note: A SlurmJob object is generally defined as a multi-task job. Jobs
    coming from the `map` method must have `single_task_submission=False`
    (even if `num_tasks_tot=1`), while jobs coming from `submit` must have
    it set to `True`.

    Attributes:
        num_tasks_tot:
            Total number of tasks to be executed as part of this SLURM job.
        single_task_submission:
            This must be `True` for jobs submitted as part of the `submit`
            method, and `False` for jobs coming from the `map` method.
        slurm_file_prefix:
            Prefix for SLURM-job related files (submission script and SLURM
            stdout/stderr); this is also needed in the
            `_copy_files_from_remote_to_local` method.
        wftask_file_prefixes:
            Prefix for files that are created as part of the functions
            submitted for execution on the `FractalSlurmSSHExecutor`; this
            is needed in the `_copy_files_from_remote_to_local` method, and
            also to construct the names of per-task input/output pickle
            files.
        wftask_subfolder_name:
            Name of the per-task subfolder (e.g. `7_task_name`).
        slurm_script:
            Path of SLURM submission script.
        slurm_stdout:
            Path of SLURM stdout file; if this includes `"%j"`, then this
            string will be replaced by the SLURM job ID upon `sbatch`
            submission.
        slurm_stderr:
            Path of SLURM stderr file; see `slurm_stdout` concerning `"%j"`.
        workerids:
            IDs that enter in the per-task input/output pickle files (one
            per task).
        input_pickle_files:
            Input pickle files (one per task).
        output_pickle_files:
            Output pickle files (one per task).
        slurm_config:
            `SlurmConfig` object.
    """

    # Job-related attributes
    num_tasks_tot: int
    single_task_submission: bool
    slurm_file_prefix: str
    slurm_script_local: Path
    slurm_script_remote: Path
    slurm_stdout_local: Path
    slurm_stdout_remote: Path
    slurm_stderr_local: Path
    slurm_stderr_remote: Path

    # Per-task attributes
    wftask_subfolder_name: str
    workerids: tuple[str, ...]
    wftask_file_prefixes: tuple[str, ...]
    input_pickle_files_local: tuple[Path, ...]
    input_pickle_files_remote: tuple[Path, ...]
    output_pickle_files_local: tuple[Path, ...]
    output_pickle_files_remote: tuple[Path, ...]

    # Slurm configuration
    slurm_config: SlurmConfig

    def __init__(
        self,
        num_tasks_tot: int,
        slurm_config: SlurmConfig,
        workflow_task_file_prefix: Optional[str] = None,
        slurm_file_prefix: Optional[str] = None,
        wftask_file_prefixes: Optional[tuple[str, ...]] = None,
        single_task_submission: bool = False,
    ):
        """
        Initialize job-level attributes and generate per-task worker IDs.

        Raises:
            ValueError: If `single_task_submission=True` is combined with
                more than one task.
        """
        if single_task_submission and num_tasks_tot > 1:
            # NOTE: the trailing space in the first string fragment is
            # required, since implicit concatenation adds no separator.
            raise ValueError(
                "Trying to initialize SlurmJob with "
                f"{single_task_submission=} and {num_tasks_tot=}."
            )
        self.num_tasks_tot = num_tasks_tot
        self.single_task_submission = single_task_submission
        self.slurm_file_prefix = slurm_file_prefix or "default_slurm_prefix"
        # NOTE(review): `workflow_task_file_prefix` is accepted but never
        # used in this class; kept for backward compatibility — TODO confirm
        # with callers whether it can be removed.
        if wftask_file_prefixes is None:
            self.wftask_file_prefixes = tuple(
                "default_wftask_prefix" for _ in range(self.num_tasks_tot)
            )
        else:
            self.wftask_file_prefixes = wftask_file_prefixes
        # One random worker ID per task, used in pickle-file names.
        self.workerids = tuple(
            random_string() for _ in range(self.num_tasks_tot)
        )
        self.slurm_config = slurm_config

    def get_clean_output_pickle_files(self) -> tuple[str, ...]:
        """
        Transform all pathlib.Path objects in self.output_pickle_files to
        strings
        """
        return tuple(str(f.as_posix()) for f in self.output_pickle_files_local)