fractal-server 2.13.0__py3-none-any.whl → 2.14.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (127) hide show
  1. fractal_server/__init__.py +1 -1
  2. fractal_server/__main__.py +3 -1
  3. fractal_server/app/models/linkusergroup.py +6 -2
  4. fractal_server/app/models/v2/__init__.py +11 -1
  5. fractal_server/app/models/v2/accounting.py +35 -0
  6. fractal_server/app/models/v2/dataset.py +1 -11
  7. fractal_server/app/models/v2/history.py +78 -0
  8. fractal_server/app/models/v2/job.py +10 -3
  9. fractal_server/app/models/v2/task_group.py +2 -2
  10. fractal_server/app/models/v2/workflow.py +1 -1
  11. fractal_server/app/models/v2/workflowtask.py +1 -1
  12. fractal_server/app/routes/admin/v2/__init__.py +4 -0
  13. fractal_server/app/routes/admin/v2/accounting.py +98 -0
  14. fractal_server/app/routes/admin/v2/impersonate.py +35 -0
  15. fractal_server/app/routes/admin/v2/job.py +5 -13
  16. fractal_server/app/routes/admin/v2/task.py +1 -1
  17. fractal_server/app/routes/admin/v2/task_group.py +4 -29
  18. fractal_server/app/routes/api/__init__.py +1 -1
  19. fractal_server/app/routes/api/v2/__init__.py +8 -2
  20. fractal_server/app/routes/api/v2/_aux_functions.py +66 -0
  21. fractal_server/app/routes/api/v2/_aux_functions_history.py +166 -0
  22. fractal_server/app/routes/api/v2/_aux_functions_task_lifecycle.py +3 -3
  23. fractal_server/app/routes/api/v2/dataset.py +0 -17
  24. fractal_server/app/routes/api/v2/history.py +544 -0
  25. fractal_server/app/routes/api/v2/images.py +31 -43
  26. fractal_server/app/routes/api/v2/job.py +30 -0
  27. fractal_server/app/routes/api/v2/project.py +1 -53
  28. fractal_server/app/routes/api/v2/{status.py → status_legacy.py} +6 -6
  29. fractal_server/app/routes/api/v2/submit.py +17 -14
  30. fractal_server/app/routes/api/v2/task.py +3 -10
  31. fractal_server/app/routes/api/v2/task_collection_custom.py +4 -9
  32. fractal_server/app/routes/api/v2/task_group.py +2 -22
  33. fractal_server/app/routes/api/v2/verify_image_types.py +61 -0
  34. fractal_server/app/routes/api/v2/workflow.py +28 -69
  35. fractal_server/app/routes/api/v2/workflowtask.py +53 -50
  36. fractal_server/app/routes/auth/group.py +0 -16
  37. fractal_server/app/routes/auth/oauth.py +5 -3
  38. fractal_server/app/routes/aux/__init__.py +0 -20
  39. fractal_server/app/routes/pagination.py +47 -0
  40. fractal_server/app/runner/components.py +0 -3
  41. fractal_server/app/runner/compress_folder.py +57 -29
  42. fractal_server/app/runner/exceptions.py +4 -0
  43. fractal_server/app/runner/executors/base_runner.py +157 -0
  44. fractal_server/app/runner/{v2/_local/_local_config.py → executors/local/get_local_config.py} +7 -9
  45. fractal_server/app/runner/executors/local/runner.py +248 -0
  46. fractal_server/app/runner/executors/{slurm → slurm_common}/_batching.py +1 -1
  47. fractal_server/app/runner/executors/{slurm → slurm_common}/_slurm_config.py +9 -7
  48. fractal_server/app/runner/executors/slurm_common/base_slurm_runner.py +868 -0
  49. fractal_server/app/runner/{v2/_slurm_common → executors/slurm_common}/get_slurm_config.py +48 -17
  50. fractal_server/app/runner/executors/{slurm → slurm_common}/remote.py +36 -47
  51. fractal_server/app/runner/executors/slurm_common/slurm_job_task_models.py +134 -0
  52. fractal_server/app/runner/executors/slurm_ssh/runner.py +268 -0
  53. fractal_server/app/runner/executors/slurm_sudo/__init__.py +0 -0
  54. fractal_server/app/runner/executors/{slurm/sudo → slurm_sudo}/_subprocess_run_as_user.py +2 -83
  55. fractal_server/app/runner/executors/slurm_sudo/runner.py +193 -0
  56. fractal_server/app/runner/extract_archive.py +1 -3
  57. fractal_server/app/runner/task_files.py +134 -87
  58. fractal_server/app/runner/v2/__init__.py +0 -395
  59. fractal_server/app/runner/v2/_local.py +88 -0
  60. fractal_server/app/runner/v2/{_slurm_ssh/__init__.py → _slurm_ssh.py} +22 -19
  61. fractal_server/app/runner/v2/{_slurm_sudo/__init__.py → _slurm_sudo.py} +19 -15
  62. fractal_server/app/runner/v2/db_tools.py +119 -0
  63. fractal_server/app/runner/v2/runner.py +219 -98
  64. fractal_server/app/runner/v2/runner_functions.py +491 -189
  65. fractal_server/app/runner/v2/runner_functions_low_level.py +40 -43
  66. fractal_server/app/runner/v2/submit_workflow.py +358 -0
  67. fractal_server/app/runner/v2/task_interface.py +31 -0
  68. fractal_server/app/schemas/_validators.py +13 -24
  69. fractal_server/app/schemas/user.py +10 -7
  70. fractal_server/app/schemas/user_settings.py +9 -21
  71. fractal_server/app/schemas/v2/__init__.py +10 -1
  72. fractal_server/app/schemas/v2/accounting.py +18 -0
  73. fractal_server/app/schemas/v2/dataset.py +12 -94
  74. fractal_server/app/schemas/v2/dumps.py +26 -9
  75. fractal_server/app/schemas/v2/history.py +80 -0
  76. fractal_server/app/schemas/v2/job.py +15 -8
  77. fractal_server/app/schemas/v2/manifest.py +14 -7
  78. fractal_server/app/schemas/v2/project.py +9 -7
  79. fractal_server/app/schemas/v2/status_legacy.py +35 -0
  80. fractal_server/app/schemas/v2/task.py +72 -77
  81. fractal_server/app/schemas/v2/task_collection.py +14 -32
  82. fractal_server/app/schemas/v2/task_group.py +10 -9
  83. fractal_server/app/schemas/v2/workflow.py +10 -11
  84. fractal_server/app/schemas/v2/workflowtask.py +2 -21
  85. fractal_server/app/security/__init__.py +3 -3
  86. fractal_server/app/security/signup_email.py +2 -2
  87. fractal_server/config.py +91 -90
  88. fractal_server/images/tools.py +23 -0
  89. fractal_server/migrations/versions/47351f8c7ebc_drop_dataset_filters.py +50 -0
  90. fractal_server/migrations/versions/9db60297b8b2_set_ondelete.py +250 -0
  91. fractal_server/migrations/versions/af1ef1c83c9b_add_accounting_tables.py +57 -0
  92. fractal_server/migrations/versions/c90a7c76e996_job_id_in_history_run.py +41 -0
  93. fractal_server/migrations/versions/e81103413827_add_job_type_filters.py +36 -0
  94. fractal_server/migrations/versions/f37aceb45062_make_historyunit_logfile_required.py +39 -0
  95. fractal_server/migrations/versions/fbce16ff4e47_new_history_items.py +120 -0
  96. fractal_server/ssh/_fabric.py +28 -14
  97. fractal_server/tasks/v2/local/collect.py +2 -2
  98. fractal_server/tasks/v2/ssh/collect.py +2 -2
  99. fractal_server/tasks/v2/templates/2_pip_install.sh +1 -1
  100. fractal_server/tasks/v2/templates/4_pip_show.sh +1 -1
  101. fractal_server/tasks/v2/utils_background.py +1 -20
  102. fractal_server/tasks/v2/utils_database.py +30 -17
  103. fractal_server/tasks/v2/utils_templates.py +6 -0
  104. {fractal_server-2.13.0.dist-info → fractal_server-2.14.0.dist-info}/METADATA +4 -4
  105. {fractal_server-2.13.0.dist-info → fractal_server-2.14.0.dist-info}/RECORD +114 -99
  106. {fractal_server-2.13.0.dist-info → fractal_server-2.14.0.dist-info}/WHEEL +1 -1
  107. fractal_server/app/runner/executors/slurm/ssh/_executor_wait_thread.py +0 -126
  108. fractal_server/app/runner/executors/slurm/ssh/_slurm_job.py +0 -116
  109. fractal_server/app/runner/executors/slurm/ssh/executor.py +0 -1386
  110. fractal_server/app/runner/executors/slurm/sudo/_check_jobs_status.py +0 -71
  111. fractal_server/app/runner/executors/slurm/sudo/_executor_wait_thread.py +0 -130
  112. fractal_server/app/runner/executors/slurm/sudo/executor.py +0 -1281
  113. fractal_server/app/runner/v2/_local/__init__.py +0 -129
  114. fractal_server/app/runner/v2/_local/_submit_setup.py +0 -52
  115. fractal_server/app/runner/v2/_local/executor.py +0 -100
  116. fractal_server/app/runner/v2/_slurm_ssh/_submit_setup.py +0 -83
  117. fractal_server/app/runner/v2/_slurm_sudo/_submit_setup.py +0 -83
  118. fractal_server/app/runner/v2/handle_failed_job.py +0 -59
  119. fractal_server/app/schemas/v2/status.py +0 -16
  120. /fractal_server/app/{runner/executors/slurm → history}/__init__.py +0 -0
  121. /fractal_server/app/runner/executors/{slurm/ssh → local}/__init__.py +0 -0
  122. /fractal_server/app/runner/executors/{slurm/sudo → slurm_common}/__init__.py +0 -0
  123. /fractal_server/app/runner/executors/{_job_states.py → slurm_common/_job_states.py} +0 -0
  124. /fractal_server/app/runner/executors/{slurm → slurm_common}/utils_executors.py +0 -0
  125. /fractal_server/app/runner/{v2/_slurm_common → executors/slurm_ssh}/__init__.py +0 -0
  126. {fractal_server-2.13.0.dist-info → fractal_server-2.14.0.dist-info}/LICENSE +0 -0
  127. {fractal_server-2.13.0.dist-info → fractal_server-2.14.0.dist-info}/entry_points.txt +0 -0
@@ -2,21 +2,16 @@ from pathlib import Path
2
2
  from typing import Literal
3
3
  from typing import Optional
4
4
 
5
+ from ._batching import heuristics
6
+ from ._slurm_config import _parse_mem_value
7
+ from ._slurm_config import load_slurm_config_file
8
+ from ._slurm_config import logger
9
+ from ._slurm_config import SlurmConfig
10
+ from ._slurm_config import SlurmConfigError
5
11
  from fractal_server.app.models.v2 import WorkflowTaskV2
6
- from fractal_server.app.runner.executors.slurm._slurm_config import (
7
- _parse_mem_value,
8
- )
9
- from fractal_server.app.runner.executors.slurm._slurm_config import (
10
- load_slurm_config_file,
11
- )
12
- from fractal_server.app.runner.executors.slurm._slurm_config import logger
13
- from fractal_server.app.runner.executors.slurm._slurm_config import SlurmConfig
14
- from fractal_server.app.runner.executors.slurm._slurm_config import (
15
- SlurmConfigError,
16
- )
17
12
 
18
13
 
19
- def get_slurm_config(
14
+ def get_slurm_config_internal(
20
15
  wftask: WorkflowTaskV2,
21
16
  which_type: Literal["non_parallel", "parallel"],
22
17
  config_path: Optional[Path] = None,
@@ -25,7 +20,7 @@ def get_slurm_config(
25
20
  Prepare a `SlurmConfig` configuration object
26
21
 
27
22
  The argument `which_type` determines whether we use `wftask.meta_parallel`
28
- or `wftask.meta_non_parallel`. In the following descritpion, let us assume
23
+ or `wftask.meta_non_parallel`. In the following description, let us assume
29
24
  that `which_type="parallel"`.
30
25
 
31
26
  The sources for `SlurmConfig` attributes, in increasing priority order, are
@@ -142,8 +137,8 @@ def get_slurm_config(
142
137
  extra_lines = slurm_dict.get("extra_lines", []) + extra_lines
143
138
  if len(set(extra_lines)) != len(extra_lines):
144
139
  logger.debug(
145
- "[get_slurm_config] Removing repeated elements "
146
- f"from {extra_lines=}."
140
+ "[get_slurm_config] Removing repeated elements from "
141
+ f"{extra_lines=}."
147
142
  )
148
143
  extra_lines = list(set(extra_lines))
149
144
  slurm_dict["extra_lines"] = extra_lines
@@ -162,9 +157,45 @@ def get_slurm_config(
162
157
 
163
158
  # Put everything together
164
159
  logger.debug(
165
- "[get_slurm_config] Now create a SlurmConfig object based "
166
- f"on {slurm_dict=}"
160
+ "[get_slurm_config] Now create a SlurmConfig object based on "
161
+ f"{slurm_dict=}"
167
162
  )
168
163
  slurm_config = SlurmConfig(**slurm_dict)
169
164
 
170
165
  return slurm_config
166
+
167
+
168
def get_slurm_config(
    wftask: WorkflowTaskV2,
    which_type: Literal["non_parallel", "parallel"],
    config_path: Optional[Path] = None,
    tot_tasks: int = 1,
) -> SlurmConfig:
    """
    Prepare a `SlurmConfig` object and set its task-batching parameters.

    This wraps `get_slurm_config_internal` and then runs the batching
    heuristics to fill in `tasks_per_job` and `parallel_tasks_per_job`.

    Args:
        wftask: WorkflowTask providing `meta_parallel`/`meta_non_parallel`.
        which_type: Which of the two `meta_*` attributes to use.
        config_path: Optional path of a SLURM configuration file.
        tot_tasks: Total number of parallel components (always known).

    Returns:
        A fully-populated `SlurmConfig` object.
    """
    slurm_config = get_slurm_config_internal(
        wftask=wftask,
        which_type=which_type,
        config_path=config_path,
    )

    # Set/validate parameters for task batching
    batch_size, parallelism = heuristics(
        # Number of parallel components (always known)
        tot_tasks=tot_tasks,
        # Optional WorkflowTask attributes:
        tasks_per_job=slurm_config.tasks_per_job,
        parallel_tasks_per_job=slurm_config.parallel_tasks_per_job,  # noqa
        # Task requirements (multiple possible sources):
        cpus_per_task=slurm_config.cpus_per_task,
        mem_per_task=slurm_config.mem_per_task_MB,
        # Fractal configuration variables (soft/hard limits):
        target_cpus_per_job=slurm_config.target_cpus_per_job,
        target_mem_per_job=slurm_config.target_mem_per_job,
        target_num_jobs=slurm_config.target_num_jobs,
        max_cpus_per_job=slurm_config.max_cpus_per_job,
        max_mem_per_job=slurm_config.max_mem_per_job,
        max_num_jobs=slurm_config.max_num_jobs,
    )
    slurm_config.tasks_per_job = batch_size
    slurm_config.parallel_tasks_per_job = parallelism

    return slurm_config
@@ -19,7 +19,6 @@ import os
19
19
  import sys
20
20
  from typing import Literal
21
21
  from typing import Optional
22
- from typing import Type
23
22
  from typing import Union
24
23
 
25
24
  import cloudpickle
@@ -27,30 +26,6 @@ import cloudpickle
27
26
  from fractal_server import __VERSION__
28
27
 
29
28
 
30
- class ExceptionProxy:
31
- """
32
- Proxy class to serialise exceptions
33
-
34
- In general exceptions are not serialisable. This proxy class saves the
35
- serialisable content of an exception. On the receiving end, it can be used
36
- to reconstruct a TaskExecutionError.
37
-
38
- Attributes:
39
- exc_type_name: Name of the exception type
40
- tb: TBD
41
- args: TBD
42
- kwargs: TBD
43
- """
44
-
45
- def __init__(
46
- self, exc_type: Type[BaseException], tb: str, *args, **kwargs
47
- ):
48
- self.exc_type_name: str = exc_type.__name__
49
- self.tb: str = tb
50
- self.args = args
51
- self.kwargs: dict = kwargs
52
-
53
-
54
29
  class FractalVersionMismatch(RuntimeError):
55
30
  """
56
31
  Custom exception for version mismatch
@@ -79,18 +54,21 @@ def _check_versions_mismatch(
79
54
  do not match with the ones on the server
80
55
  """
81
56
 
82
- server_python_version = server_versions["python"]
83
- worker_python_version = sys.version_info[:3]
57
+ server_python_version = list(server_versions["python"])
58
+ worker_python_version = list(sys.version_info[:3])
84
59
  if worker_python_version != server_python_version:
85
- # FIXME: turn this into an error, after fixing a broader CI issue, see
86
- # https://github.com/fractal-analytics-platform/fractal-server/issues/375
87
- logging.critical(
88
- f"{server_python_version=} but {worker_python_version=}. "
89
- "cloudpickle is not guaranteed to correctly load "
90
- "pickle files created with different python versions. "
91
- "Note, however, that if you reached this line it means that "
92
- "the pickle file was likely loaded correctly."
93
- )
60
+ if worker_python_version[:2] != server_python_version[:2]:
61
+ # FIXME: Turn this into an error, in some version post 2.14.
62
+ logging.error(
63
+ f"{server_python_version=} but {worker_python_version=}. "
64
+ "This configuration will be deprecated in a future version, "
65
+ "please contact the admin of this Fractal instance."
66
+ )
67
+ else:
68
+ # Major.minor versions match, patch versions differ
69
+ logging.warning(
70
+ f"{server_python_version=} but {worker_python_version=}."
71
+ )
94
72
 
95
73
  server_cloudpickle_version = server_versions["cloudpickle"]
96
74
  worker_cloudpickle_version = cloudpickle.__version__
@@ -134,28 +112,39 @@ def worker(
134
112
  _extra_import_paths = extra_import_paths.split(":")
135
113
  sys.path[:0] = _extra_import_paths
136
114
 
137
- # Execute the job and catpure exceptions
115
+ # Execute the job and capture exceptions
138
116
  try:
139
117
  with open(in_fname, "rb") as f:
140
118
  indata = f.read()
141
119
  server_versions, fun, args, kwargs = cloudpickle.loads(indata)
142
120
  _check_versions_mismatch(server_versions)
143
121
 
144
- result = True, fun(*args, **kwargs)
122
+ result = (True, fun(*args, **kwargs))
145
123
  out = cloudpickle.dumps(result)
146
124
  except Exception as e:
125
+ # Exception objects are not serialisable. Here we save the relevant
126
+ # exception contents in a serializable dictionary. Note that whenever
127
+ # the task failed "properly", the exception is a `TaskExecutionError`
128
+ # and it has additional attributes.
129
+
147
130
  import traceback
148
131
 
149
- typ, value, tb = sys.exc_info()
150
- tb = tb.tb_next
151
- exc_proxy = ExceptionProxy(
152
- typ,
153
- "".join(traceback.format_exception(typ, value, tb)),
154
- *e.args,
155
- **e.__dict__,
132
+ exc_type, exc_value, traceback_obj = sys.exc_info()
133
+ traceback_obj = traceback_obj.tb_next
134
+ traceback_list = traceback.format_exception(
135
+ exc_type,
136
+ exc_value,
137
+ traceback_obj,
156
138
  )
157
-
158
- result = False, exc_proxy
139
+ traceback_string = "".join(traceback_list)
140
+ exc_proxy = dict(
141
+ exc_type_name=exc_type.__name__,
142
+ traceback_string=traceback_string,
143
+ workflow_task_order=getattr(e, "workflow_task_order", None),
144
+ workflow_task_id=getattr(e, "workflow_task_id", None),
145
+ task_name=getattr(e, "task_name", None),
146
+ )
147
+ result = (False, exc_proxy)
159
148
  out = cloudpickle.dumps(result)
160
149
 
161
150
  # Write the output pickle file
@@ -0,0 +1,134 @@
1
+ from pathlib import Path
2
+ from typing import Any
3
+ from typing import Optional
4
+
5
+ from pydantic import BaseModel
6
+ from pydantic import ConfigDict
7
+
8
+ from fractal_server.app.runner.task_files import TaskFiles
9
+
10
+
11
+ class SlurmTask(BaseModel):
12
+ model_config = ConfigDict(arbitrary_types_allowed=True)
13
+ component: str
14
+ prefix: str
15
+ workdir_local: Path
16
+ workdir_remote: Path
17
+ parameters: dict[str, Any]
18
+ zarr_url: Optional[str] = None
19
+ task_files: TaskFiles
20
+ index: int
21
+
22
+ @property
23
+ def input_pickle_file_local_path(self) -> Path:
24
+ return (
25
+ self.workdir_local / f"{self.prefix}-{self.component}-input.pickle"
26
+ )
27
+
28
+ @property
29
+ def input_pickle_file_remote_path(self) -> Path:
30
+ return (
31
+ self.workdir_remote
32
+ / f"{self.prefix}-{self.component}-input.pickle"
33
+ )
34
+
35
+ @property
36
+ def output_pickle_file_local_path(self) -> Path:
37
+ return (
38
+ self.workdir_local
39
+ / f"{self.prefix}-{self.component}-output.pickle"
40
+ )
41
+
42
+ @property
43
+ def output_pickle_file_remote_path(self) -> Path:
44
+ return (
45
+ self.workdir_remote
46
+ / f"{self.prefix}-{self.component}-output.pickle"
47
+ )
48
+
49
+ @property
50
+ def input_pickle_file_local(self) -> str:
51
+ return self.input_pickle_file_local_path.as_posix()
52
+
53
+ @property
54
+ def input_pickle_file_remote(self) -> str:
55
+ return self.input_pickle_file_remote_path.as_posix()
56
+
57
+ @property
58
+ def output_pickle_file_local(self) -> str:
59
+ return self.output_pickle_file_local_path.as_posix()
60
+
61
+ @property
62
+ def output_pickle_file_remote(self) -> str:
63
+ return self.output_pickle_file_remote_path.as_posix()
64
+
65
+
66
+ class SlurmJob(BaseModel):
67
+ slurm_job_id: Optional[str] = None
68
+ prefix: str
69
+ workdir_local: Path
70
+ workdir_remote: Path
71
+ tasks: list[SlurmTask]
72
+
73
+ @property
74
+ def slurm_submission_script_local(self) -> str:
75
+ return (
76
+ self.workdir_local / f"{self.prefix}-slurm-submit.sh"
77
+ ).as_posix()
78
+
79
+ @property
80
+ def slurm_submission_script_remote(self) -> str:
81
+ return (
82
+ self.workdir_remote / f"{self.prefix}-slurm-submit.sh"
83
+ ).as_posix()
84
+
85
+ @property
86
+ def slurm_job_id_placeholder(self) -> str:
87
+ if self.slurm_job_id:
88
+ return self.slurm_job_id
89
+ else:
90
+ return "%j"
91
+
92
+ @property
93
+ def slurm_stdout_remote_path(self) -> Path:
94
+ return (
95
+ self.workdir_remote
96
+ / f"{self.prefix}-slurm-{self.slurm_job_id_placeholder}.out"
97
+ )
98
+
99
+ @property
100
+ def slurm_stdout_remote(self) -> str:
101
+ return self.slurm_stdout_remote_path.as_posix()
102
+
103
+ @property
104
+ def slurm_stderr_remote_path(self) -> Path:
105
+ return (
106
+ self.workdir_remote
107
+ / f"{self.prefix}-slurm-{self.slurm_job_id_placeholder}.err"
108
+ )
109
+
110
+ @property
111
+ def slurm_stderr_remote(self) -> str:
112
+ return self.slurm_stderr_remote_path.as_posix()
113
+
114
+ @property
115
+ def slurm_stdout_local_path(self) -> str:
116
+ return (
117
+ self.workdir_local
118
+ / f"{self.prefix}-slurm-{self.slurm_job_id_placeholder}.out"
119
+ )
120
+
121
+ @property
122
+ def slurm_stdout_local(self) -> str:
123
+ return self.slurm_stdout_local_path.as_posix()
124
+
125
+ @property
126
+ def slurm_stderr_local_path(self) -> Path:
127
+ return (
128
+ self.workdir_local
129
+ / f"{self.prefix}-slurm-{self.slurm_job_id_placeholder}.err"
130
+ )
131
+
132
+ @property
133
+ def slurm_stderr_local(self) -> str:
134
+ return self.slurm_stderr_local_path.as_posix()
@@ -0,0 +1,268 @@
1
+ import time
2
+ from pathlib import Path
3
+ from typing import Optional
4
+
5
+ from ..slurm_common.base_slurm_runner import BaseSlurmRunner
6
+ from ..slurm_common.slurm_job_task_models import SlurmJob
7
+ from fractal_server.app.runner.compress_folder import compress_folder
8
+ from fractal_server.app.runner.extract_archive import extract_archive
9
+ from fractal_server.config import get_settings
10
+ from fractal_server.logger import set_logger
11
+ from fractal_server.ssh._fabric import FractalSSH
12
+ from fractal_server.ssh._fabric import FractalSSHCommandError
13
+ from fractal_server.ssh._fabric import FractalSSHTimeoutError
14
+ from fractal_server.syringe import Inject
15
+
16
+
17
+ logger = set_logger(__name__)
18
+
19
+
20
class SlurmSSHRunner(BaseSlurmRunner):
    """
    SLURM runner that submits jobs on a remote host over SSH.

    All data exchange with the remote host (job inputs, logs, output
    pickles) happens through tar.gz archives, transferred via the
    `FractalSSH` connection.
    """

    # SSH connection wrapper, shared by all operations of this runner.
    fractal_ssh: FractalSSH

    def __init__(
        self,
        *,
        # Common
        root_dir_local: Path,
        root_dir_remote: Path,
        common_script_lines: Optional[list[str]] = None,
        user_cache_dir: Optional[str] = None,
        poll_interval: Optional[int] = None,
        # Specific
        fractal_ssh: FractalSSH,
    ) -> None:
        """
        Set parameters that are the same for different Fractal tasks and for
        different SLURM jobs/tasks.

        Args:
            root_dir_local: Base local working directory.
            root_dir_remote: Base remote working directory.
            common_script_lines: Extra lines for all submission scripts.
            user_cache_dir: Optional user cache directory.
            poll_interval: Interval (seconds) between job-status polls.
            fractal_ssh: SSH connection to the remote SLURM host.
        """
        self.fractal_ssh = fractal_ssh
        logger.warning(self.fractal_ssh)

        settings = Inject(get_settings)

        super().__init__(
            slurm_runner_type="ssh",
            root_dir_local=root_dir_local,
            root_dir_remote=root_dir_remote,
            common_script_lines=common_script_lines,
            user_cache_dir=user_cache_dir,
            poll_interval=poll_interval,
            python_worker_interpreter=settings.FRACTAL_SLURM_WORKER_PYTHON,
        )

    def _mkdir_local_folder(self, folder: str) -> None:
        # Create a local folder; fails if it already exists (no exist_ok).
        Path(folder).mkdir(parents=True)

    def _mkdir_remote_folder(self, folder: str) -> None:
        # Create a folder on the remote host through the SSH connection.
        self.fractal_ssh.mkdir(
            folder=folder,
            parents=True,
        )

    def _fetch_artifacts(
        self,
        finished_slurm_jobs: list[SlurmJob],
    ) -> None:
        """
        Fetch artifacts for a list of SLURM jobs.

        Builds a list of relevant remote files (SLURM stdout/stderr and,
        per task, output pickle / log / args / metadiff files), creates a
        remote tar.gz archive restricted to that list, transfers it and
        extracts it locally.

        Args:
            finished_slurm_jobs: Jobs whose artifacts should be fetched;
                all jobs are assumed to share the same working directories
                (validated below).
        """

        # Check length
        if len(finished_slurm_jobs) == 0:
            logger.debug(f"[_fetch_artifacts] EXIT ({finished_slurm_jobs=}).")
            return None

        t_0 = time.perf_counter()
        logger.debug(
            f"[_fetch_artifacts] START ({len(finished_slurm_jobs)=})."
        )

        # Extract `workdir_remote` and `workdir_local`
        self.validate_slurm_jobs_workdirs(finished_slurm_jobs)
        workdir_local = finished_slurm_jobs[0].workdir_local
        workdir_remote = finished_slurm_jobs[0].workdir_remote

        # Define local/remote tarfile paths
        tarfile_path_local = (
            workdir_local.parent / f"{workdir_local.name}.tar.gz"
        ).as_posix()
        tarfile_path_remote = (
            workdir_remote.parent / f"{workdir_remote.name}.tar.gz"
        ).as_posix()

        # Create file list
        # NOTE: see issue 2483
        filelist = []
        for _slurm_job in finished_slurm_jobs:
            _single_job_filelist = [
                _slurm_job.slurm_stdout_remote_path.name,
                _slurm_job.slurm_stderr_remote_path.name,
            ]
            for task in _slurm_job.tasks:
                _single_job_filelist.extend(
                    [
                        task.output_pickle_file_remote_path.name,
                        task.task_files.log_file_remote_path.name,
                        task.task_files.args_file_remote_path.name,
                        task.task_files.metadiff_file_remote_path.name,
                    ]
                )
            filelist.extend(_single_job_filelist)
        filelist_string = "\n".join(filelist)
        elapsed = time.perf_counter() - t_0
        logger.debug(
            "[_fetch_artifacts] Created filelist "
            f"({len(filelist)=}, from start: {elapsed=:.3f} s)."
        )

        # Write filelist to file remotely
        # (timestamped name avoids clashes between successive fetches)
        tmp_filelist_path = workdir_remote / f"filelist_{time.time()}.txt"
        self.fractal_ssh.write_remote_file(
            path=tmp_filelist_path.as_posix(),
            content=f"{filelist_string}\n",
        )
        elapsed = time.perf_counter() - t_0
        logger.debug(
            f"[_fetch_artifacts] File list written to {tmp_filelist_path} "
            f"(from start: {elapsed=:.3f} s)."
        )

        # Create remote tarfile, by running the `compress_folder` module
        # with the remote worker interpreter
        t_0_tar = time.perf_counter()
        tar_command = (
            f"{self.python_worker_interpreter} "
            "-m fractal_server.app.runner.compress_folder "
            f"{workdir_remote.as_posix()} "
            f"--filelist {tmp_filelist_path}"
        )
        self.fractal_ssh.run_command(cmd=tar_command)
        t_1_tar = time.perf_counter()
        logger.info(
            f"[_fetch_artifacts] Remote archive {tarfile_path_remote} created"
            f" - elapsed={t_1_tar - t_0_tar:.3f} s"
        )

        # Fetch tarfile
        t_0_get = time.perf_counter()
        self.fractal_ssh.fetch_file(
            remote=tarfile_path_remote,
            local=tarfile_path_local,
        )
        t_1_get = time.perf_counter()
        logger.info(
            "[_fetch_artifacts] Subfolder archive transferred back "
            f"to {tarfile_path_local}"
            f" - elapsed={t_1_get - t_0_get:.3f} s"
        )

        # Extract tarfile locally
        extract_archive(Path(tarfile_path_local))

        # Remove local tarfile
        Path(tarfile_path_local).unlink(missing_ok=True)

        t_1 = time.perf_counter()
        logger.info(f"[_fetch_artifacts] End - elapsed={t_1 - t_0:.3f} s")

    def _send_inputs(self, jobs: list[SlurmJob]) -> None:
        """
        Transfer the jobs subfolder to the remote host.

        For each job: compress the local working directory, transfer the
        archive over SSH, remove the local copy, and extract it remotely.
        """
        for job in jobs:
            # Create local archive
            tarfile_path_local = compress_folder(
                job.workdir_local,
                filelist_path=None,
            )
            tarfile_name = Path(tarfile_path_local).name
            logger.info(f"Subfolder archive created at {tarfile_path_local}")

            # Transfer archive
            tarfile_path_remote = (
                job.workdir_remote.parent / tarfile_name
            ).as_posix()
            t_0_put = time.perf_counter()
            self.fractal_ssh.send_file(
                local=tarfile_path_local,
                remote=tarfile_path_remote,
            )
            t_1_put = time.perf_counter()
            logger.info(
                f"Subfolder archive transferred to {tarfile_path_remote}"
                f" - elapsed={t_1_put - t_0_put:.3f} s"
            )

            # Remove local archive
            Path(tarfile_path_local).unlink()
            logger.debug(f"Local archive {tarfile_path_local} removed")

            # Uncompress remote archive, by running the `extract_archive`
            # module with the remote worker interpreter
            tar_command = (
                f"{self.python_worker_interpreter} -m "
                "fractal_server.app.runner.extract_archive "
                f"{tarfile_path_remote}"
            )
            self.fractal_ssh.run_command(cmd=tar_command)

    def _run_remote_cmd(self, cmd: str) -> str:
        """Run a command on the remote host and return its stdout."""
        stdout = self.fractal_ssh.run_command(cmd=cmd)
        return stdout

    def run_squeue(
        self,
        *,
        job_ids: list[str],
        base_interval: float = 2.0,
        max_attempts: int = 7,
    ) -> str:
        """
        Run `squeue` for a set of SLURM job IDs.

        Different scenarios:

        1. When `squeue -j` succeeds (with exit code 0), return its stdout.
        2. When `squeue -j` fails (typical example:
           `squeue -j {invalid_job_id}` fails with exit code 1), re-raise.
           The error will be handled upstream.
        3. When the SSH command fails because another thread is keeping the
           lock of the `FractalSSH` object for a long time, mock the standard
           output of the `squeue` command so that it looks like jobs are not
           completed yet.
        4. When the SSH command fails for other reasons, despite a forgiving
           setup (7 connection attempts with base waiting interval of 2
           seconds, with a cumulative timeout of 126 seconds), return an empty
           string. This will be treated upstream as an empty `squeue` output,
           indirectly resulting in marking the job as completed.
        """

        if len(job_ids) == 0:
            return ""

        job_id_single_str = ",".join([str(j) for j in job_ids])
        cmd = (
            "squeue --noheader --format='%i %T' --states=all "
            f"--jobs={job_id_single_str}"
        )

        try:
            stdout = self.fractal_ssh.run_command(
                cmd=cmd,
                base_interval=base_interval,
                max_attempts=max_attempts,
            )
            return stdout
        except FractalSSHCommandError as e:
            # Scenario 2: let the caller handle `squeue` failures.
            raise e
        except FractalSSHTimeoutError:
            # Scenario 3: could not acquire the SSH lock; pretend jobs are
            # still running by emitting a placeholder status per job.
            logger.warning(
                "[run_squeue] Could not acquire lock, use stdout placeholder."
            )
            FAKE_STATUS = "FRACTAL_STATUS_PLACEHOLDER"
            placeholder_stdout = "\n".join(
                [f"{job_id} {FAKE_STATUS}" for job_id in job_ids]
            )
            return placeholder_stdout
        except Exception as e:
            # Scenario 4: any other failure is treated as "no jobs listed".
            logger.error(f"Ignoring `squeue` command failure {e}")
            return ""