fractal-server 2.13.1__py3-none-any.whl → 2.14.0__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as published to one of the supported registries. It is provided for informational purposes only.
- fractal_server/__init__.py +1 -1
- fractal_server/__main__.py +3 -1
- fractal_server/app/models/linkusergroup.py +6 -2
- fractal_server/app/models/v2/__init__.py +7 -1
- fractal_server/app/models/v2/dataset.py +1 -11
- fractal_server/app/models/v2/history.py +78 -0
- fractal_server/app/models/v2/job.py +10 -3
- fractal_server/app/models/v2/task_group.py +2 -2
- fractal_server/app/models/v2/workflow.py +1 -1
- fractal_server/app/models/v2/workflowtask.py +1 -1
- fractal_server/app/routes/admin/v2/accounting.py +18 -28
- fractal_server/app/routes/admin/v2/task.py +1 -1
- fractal_server/app/routes/admin/v2/task_group.py +0 -17
- fractal_server/app/routes/api/__init__.py +1 -1
- fractal_server/app/routes/api/v2/__init__.py +8 -2
- fractal_server/app/routes/api/v2/_aux_functions.py +66 -0
- fractal_server/app/routes/api/v2/_aux_functions_history.py +166 -0
- fractal_server/app/routes/api/v2/dataset.py +0 -17
- fractal_server/app/routes/api/v2/history.py +544 -0
- fractal_server/app/routes/api/v2/images.py +31 -43
- fractal_server/app/routes/api/v2/job.py +30 -0
- fractal_server/app/routes/api/v2/project.py +1 -53
- fractal_server/app/routes/api/v2/{status.py → status_legacy.py} +6 -6
- fractal_server/app/routes/api/v2/submit.py +16 -14
- fractal_server/app/routes/api/v2/task.py +3 -10
- fractal_server/app/routes/api/v2/task_collection_custom.py +4 -9
- fractal_server/app/routes/api/v2/task_group.py +0 -17
- fractal_server/app/routes/api/v2/verify_image_types.py +61 -0
- fractal_server/app/routes/api/v2/workflow.py +28 -69
- fractal_server/app/routes/api/v2/workflowtask.py +53 -50
- fractal_server/app/routes/auth/group.py +0 -16
- fractal_server/app/routes/auth/oauth.py +5 -3
- fractal_server/app/routes/pagination.py +47 -0
- fractal_server/app/runner/components.py +0 -3
- fractal_server/app/runner/compress_folder.py +57 -29
- fractal_server/app/runner/exceptions.py +4 -0
- fractal_server/app/runner/executors/base_runner.py +157 -0
- fractal_server/app/runner/{v2/_local/_local_config.py → executors/local/get_local_config.py} +7 -9
- fractal_server/app/runner/executors/local/runner.py +248 -0
- fractal_server/app/runner/executors/{slurm → slurm_common}/_batching.py +1 -1
- fractal_server/app/runner/executors/{slurm → slurm_common}/_slurm_config.py +9 -7
- fractal_server/app/runner/executors/slurm_common/base_slurm_runner.py +868 -0
- fractal_server/app/runner/{v2/_slurm_common → executors/slurm_common}/get_slurm_config.py +48 -17
- fractal_server/app/runner/executors/{slurm → slurm_common}/remote.py +36 -47
- fractal_server/app/runner/executors/slurm_common/slurm_job_task_models.py +134 -0
- fractal_server/app/runner/executors/slurm_ssh/runner.py +268 -0
- fractal_server/app/runner/executors/slurm_sudo/__init__.py +0 -0
- fractal_server/app/runner/executors/{slurm/sudo → slurm_sudo}/_subprocess_run_as_user.py +2 -83
- fractal_server/app/runner/executors/slurm_sudo/runner.py +193 -0
- fractal_server/app/runner/extract_archive.py +1 -3
- fractal_server/app/runner/task_files.py +134 -87
- fractal_server/app/runner/v2/__init__.py +0 -399
- fractal_server/app/runner/v2/_local.py +88 -0
- fractal_server/app/runner/v2/{_slurm_ssh/__init__.py → _slurm_ssh.py} +20 -19
- fractal_server/app/runner/v2/{_slurm_sudo/__init__.py → _slurm_sudo.py} +17 -15
- fractal_server/app/runner/v2/db_tools.py +119 -0
- fractal_server/app/runner/v2/runner.py +206 -95
- fractal_server/app/runner/v2/runner_functions.py +488 -187
- fractal_server/app/runner/v2/runner_functions_low_level.py +40 -43
- fractal_server/app/runner/v2/submit_workflow.py +358 -0
- fractal_server/app/runner/v2/task_interface.py +31 -0
- fractal_server/app/schemas/_validators.py +13 -24
- fractal_server/app/schemas/user.py +10 -7
- fractal_server/app/schemas/user_settings.py +9 -21
- fractal_server/app/schemas/v2/__init__.py +9 -1
- fractal_server/app/schemas/v2/dataset.py +12 -94
- fractal_server/app/schemas/v2/dumps.py +26 -9
- fractal_server/app/schemas/v2/history.py +80 -0
- fractal_server/app/schemas/v2/job.py +15 -8
- fractal_server/app/schemas/v2/manifest.py +14 -7
- fractal_server/app/schemas/v2/project.py +9 -7
- fractal_server/app/schemas/v2/status_legacy.py +35 -0
- fractal_server/app/schemas/v2/task.py +72 -77
- fractal_server/app/schemas/v2/task_collection.py +14 -32
- fractal_server/app/schemas/v2/task_group.py +10 -9
- fractal_server/app/schemas/v2/workflow.py +10 -11
- fractal_server/app/schemas/v2/workflowtask.py +2 -21
- fractal_server/app/security/__init__.py +3 -3
- fractal_server/app/security/signup_email.py +2 -2
- fractal_server/config.py +41 -46
- fractal_server/images/tools.py +23 -0
- fractal_server/migrations/versions/47351f8c7ebc_drop_dataset_filters.py +50 -0
- fractal_server/migrations/versions/9db60297b8b2_set_ondelete.py +250 -0
- fractal_server/migrations/versions/c90a7c76e996_job_id_in_history_run.py +41 -0
- fractal_server/migrations/versions/e81103413827_add_job_type_filters.py +36 -0
- fractal_server/migrations/versions/f37aceb45062_make_historyunit_logfile_required.py +39 -0
- fractal_server/migrations/versions/fbce16ff4e47_new_history_items.py +120 -0
- fractal_server/ssh/_fabric.py +28 -14
- fractal_server/tasks/v2/local/collect.py +2 -2
- fractal_server/tasks/v2/ssh/collect.py +2 -2
- fractal_server/tasks/v2/templates/2_pip_install.sh +1 -1
- fractal_server/tasks/v2/templates/4_pip_show.sh +1 -1
- fractal_server/tasks/v2/utils_background.py +0 -19
- fractal_server/tasks/v2/utils_database.py +30 -17
- fractal_server/tasks/v2/utils_templates.py +6 -0
- {fractal_server-2.13.1.dist-info → fractal_server-2.14.0.dist-info}/METADATA +4 -4
- {fractal_server-2.13.1.dist-info → fractal_server-2.14.0.dist-info}/RECORD +106 -96
- {fractal_server-2.13.1.dist-info → fractal_server-2.14.0.dist-info}/WHEEL +1 -1
- fractal_server/app/runner/executors/slurm/ssh/_executor_wait_thread.py +0 -126
- fractal_server/app/runner/executors/slurm/ssh/_slurm_job.py +0 -116
- fractal_server/app/runner/executors/slurm/ssh/executor.py +0 -1386
- fractal_server/app/runner/executors/slurm/sudo/_check_jobs_status.py +0 -71
- fractal_server/app/runner/executors/slurm/sudo/_executor_wait_thread.py +0 -130
- fractal_server/app/runner/executors/slurm/sudo/executor.py +0 -1281
- fractal_server/app/runner/v2/_local/__init__.py +0 -132
- fractal_server/app/runner/v2/_local/_submit_setup.py +0 -52
- fractal_server/app/runner/v2/_local/executor.py +0 -100
- fractal_server/app/runner/v2/_slurm_ssh/_submit_setup.py +0 -83
- fractal_server/app/runner/v2/_slurm_sudo/_submit_setup.py +0 -83
- fractal_server/app/runner/v2/handle_failed_job.py +0 -59
- fractal_server/app/schemas/v2/status.py +0 -16
- /fractal_server/app/{runner/executors/slurm → history}/__init__.py +0 -0
- /fractal_server/app/runner/executors/{slurm/ssh → local}/__init__.py +0 -0
- /fractal_server/app/runner/executors/{slurm/sudo → slurm_common}/__init__.py +0 -0
- /fractal_server/app/runner/executors/{_job_states.py → slurm_common/_job_states.py} +0 -0
- /fractal_server/app/runner/executors/{slurm → slurm_common}/utils_executors.py +0 -0
- /fractal_server/app/runner/{v2/_slurm_common → executors/slurm_ssh}/__init__.py +0 -0
- {fractal_server-2.13.1.dist-info → fractal_server-2.14.0.dist-info}/LICENSE +0 -0
- {fractal_server-2.13.1.dist-info → fractal_server-2.14.0.dist-info}/entry_points.txt +0 -0
--- fractal_server/app/runner/executors/slurm/ssh/executor.py
+++ /dev/null
@@ -1,1386 +0,0 @@
-import json
-import math
-import sys
-import threading
-import time
-from concurrent.futures import Executor
-from concurrent.futures import Future
-from concurrent.futures import InvalidStateError
-from copy import copy
-from pathlib import Path
-from typing import Any
-from typing import Callable
-from typing import Optional
-from typing import Sequence
-
-import cloudpickle
-
-from ....filenames import SHUTDOWN_FILENAME
-from ....task_files import get_task_file_paths
-from ....task_files import TaskFiles
-from ....versions import get_versions
-from ..._job_states import STATES_FINISHED
-from ...slurm._slurm_config import SlurmConfig
-from .._batching import heuristics
-from ..utils_executors import get_pickle_file_path
-from ..utils_executors import get_slurm_file_path
-from ..utils_executors import get_slurm_script_file_path
-from ._executor_wait_thread import FractalSlurmSSHWaitThread
-from fractal_server.app.runner.components import _COMPONENT_KEY_
-from fractal_server.app.runner.compress_folder import compress_folder
-from fractal_server.app.runner.exceptions import JobExecutionError
-from fractal_server.app.runner.exceptions import TaskExecutionError
-from fractal_server.app.runner.executors.slurm.ssh._slurm_job import SlurmJob
-from fractal_server.app.runner.extract_archive import extract_archive
-from fractal_server.config import get_settings
-from fractal_server.logger import set_logger
-from fractal_server.ssh._fabric import FractalSSH
-from fractal_server.syringe import Inject
-
-
-logger = set_logger(__name__)
-
-
-class FractalSlurmSSHExecutor(Executor):
-    """
-    Executor to submit SLURM jobs via SSH
-
-    This class is a custom re-implementation of the SLURM executor from
-
-    > clusterfutures <https://github.com/sampsyo/clusterfutures>
-    > Original Copyright
-    > Copyright 2021 Adrian Sampson <asampson@cs.washington.edu>
-    > License: MIT
-
-
-    Attributes:
-        fractal_ssh: FractalSSH connection with custom lock
-        workflow_dir_local:
-            Directory for both the cfut/SLURM and fractal-server files and logs
-        workflow_dir_remote:
-            Directory for both the cfut/SLURM and fractal-server files and logs
-        shutdown_file:
-        python_remote: Equal to `settings.FRACTAL_SLURM_WORKER_PYTHON`
-        wait_thread_cls: Class for waiting thread
-        common_script_lines:
-            Arbitrary script lines that will always be included in the
-            sbatch script
-        slurm_account:
-        jobs:
-        map_jobid_to_slurm_files:
-            Dictionary with paths of slurm-related files for active jobs
-    """
-
-    fractal_ssh: FractalSSH
-
-    workflow_dir_local: Path
-    workflow_dir_remote: Path
-    shutdown_file: str
-    python_remote: str
-
-    wait_thread_cls = FractalSlurmSSHWaitThread
-
-    common_script_lines: list[str]
-    slurm_account: Optional[str] = None
-
-    jobs: dict[str, tuple[Future, SlurmJob]]
-    map_jobid_to_slurm_files_local: dict[str, tuple[str, str, str]]
-
-    def __init__(
-        self,
-        *,
-        # FractalSSH connection
-        fractal_ssh: FractalSSH,
-        # Folders and files
-        workflow_dir_local: Path,
-        workflow_dir_remote: Path,
-        # Monitoring options
-        slurm_poll_interval: Optional[int] = None,
-        # SLURM submission script options
-        common_script_lines: Optional[list[str]] = None,
-        slurm_account: Optional[str] = None,
-        # Other kwargs are ignored
-        **kwargs,
-    ):
-        """
-        Init method for FractalSlurmSSHExecutor
-
-        Note: since we are not using `super().__init__`, we duplicate some
-        relevant bits of `cfut.ClusterExecutor.__init__`.
-
-        Args:
-            fractal_ssh:
-            workflow_dir_local:
-            workflow_dir_remote:
-            slurm_poll_interval:
-            common_script_lines:
-            slurm_account:
-        """
-
-        if kwargs != {}:
-            raise ValueError(
-                f"FractalSlurmSSHExecutor received unexpected {kwargs=}"
-            )
-
-        self.workflow_dir_local = workflow_dir_local
-        self.workflow_dir_remote = workflow_dir_remote
-
-        # Relevant bits of cfut.ClusterExecutor.__init__ are copied here,
-        # postponing the .start() call to when the callbacks are defined
-        self.jobs = {}
-        self.job_outfiles = {}
-        self.jobs_lock = threading.Lock()
-        self.jobs_empty_cond = threading.Condition(self.jobs_lock)
-        self.wait_thread = self.wait_thread_cls(self._completion)
-
-        # Set up attributes and methods for self.wait_thread
-        # cfut.SlurmWaitThread)
-        self.wait_thread.shutdown_callback = self.shutdown
-        self.wait_thread.jobs_finished_callback = self._jobs_finished
-        if slurm_poll_interval is None:
-            settings = Inject(get_settings)
-            slurm_poll_interval = settings.FRACTAL_SLURM_POLL_INTERVAL
-        elif slurm_poll_interval <= 0:
-            raise ValueError(f"Invalid attribute {slurm_poll_interval=}")
-        self.wait_thread.slurm_poll_interval = slurm_poll_interval
-        self.wait_thread.shutdown_file = (
-            self.workflow_dir_local / SHUTDOWN_FILENAME
-        ).as_posix()
-
-        # Now start self.wait_thread (note: this must be *after* its callback
-        # methods have been defined)
-        self.wait_thread.start()
-
-        # Define remote Python interpreter
-        settings = Inject(get_settings)
-        self.python_remote = settings.FRACTAL_SLURM_WORKER_PYTHON
-        if self.python_remote is None:
-            self._stop_and_join_wait_thread()
-            raise ValueError("FRACTAL_SLURM_WORKER_PYTHON is not set. Exit.")
-
-        # Initialize connection and perform handshake
-        self.fractal_ssh = fractal_ssh
-        logger.warning(self.fractal_ssh)
-        try:
-            self.handshake()
-        except Exception as e:
-            logger.warning(
-                "Stop/join waiting thread and then "
-                f"re-raise original error {str(e)}"
-            )
-            self._stop_and_join_wait_thread()
-            raise e
-
-        # Set/validate parameters for SLURM submission scripts
-        self.slurm_account = slurm_account
-        self.common_script_lines = common_script_lines or []
-        try:
-            self._validate_common_script_lines()
-        except Exception as e:
-            logger.warning(
-                "Stop/join waiting thread and then "
-                f"re-raise original error {str(e)}"
-            )
-            self._stop_and_join_wait_thread()
-            raise e
-
-        # Set/initialize some more options
-        self.map_jobid_to_slurm_files_local = {}
-
-    def _validate_common_script_lines(self):
-        """
-        Check that SLURM account is not set in `self.common_script_lines`.
-        """
-        try:
-            invalid_line = next(
-                line
-                for line in self.common_script_lines
-                if line.startswith("#SBATCH --account=")
-            )
-            raise RuntimeError(
-                "Invalid line in `FractalSlurmSSHExecutor."
-                "common_script_lines`: "
-                f"'{invalid_line}'.\n"
-                "SLURM account must be set via the request body of the "
-                "apply-workflow endpoint, or by modifying the user properties."
-            )
-        except StopIteration:
-            pass
-
-    def _cleanup(self, jobid: str) -> None:
-        """
-        Given a job ID, perform any necessary cleanup after the job has
-        finished.
-        """
-        with self.jobs_lock:
-            self.map_jobid_to_slurm_files_local.pop(jobid)
-
-    def submit(
-        self,
-        fun: Callable[..., Any],
-        *fun_args: Sequence[Any],
-        slurm_config: SlurmConfig,
-        task_files: TaskFiles,
-        **fun_kwargs: dict,
-    ) -> Future:
-        """
-        Submit a function for execution on `FractalSlurmSSHExecutor`
-
-        Arguments:
-            fun: The function to be executed
-            fun_args: Function positional arguments
-            fun_kwargs: Function keyword arguments
-            slurm_config:
-                A `SlurmConfig` object.
-            task_files:
-                A `TaskFiles` object.
-
-        Returns:
-            Future representing the execution of the current SLURM job.
-        """
-
-        # Do not continue if auxiliary thread was shut down
-        if self.wait_thread.shutdown:
-            error_msg = "Cannot call `submit` method after executor shutdown"
-            logger.warning(error_msg)
-            raise JobExecutionError(info=error_msg)
-
-        # Set slurm_file_prefix
-        slurm_file_prefix = task_files.file_prefix
-
-        # Include common_script_lines in extra_lines
-        logger.debug(
-            f"Adding {self.common_script_lines=} to "
-            f"{slurm_config.extra_lines=}, from submit method."
-        )
-        current_extra_lines = slurm_config.extra_lines or []
-        slurm_config.extra_lines = (
-            current_extra_lines + self.common_script_lines
-        )
-
-        # Adapt slurm_config to the fact that this is a single-task SlurmJob
-        # instance
-        slurm_config.tasks_per_job = 1
-        slurm_config.parallel_tasks_per_job = 1
-
-        job = self._prepare_job(
-            fun,
-            slurm_config=slurm_config,
-            slurm_file_prefix=slurm_file_prefix,
-            task_files=task_files,
-            single_task_submission=True,
-            args=fun_args,
-            kwargs=fun_kwargs,
-        )
-        self._put_subfolder_sftp(jobs=[job])
-        future, job_id_str = self._submit_job(job)
-        self.wait_thread.wait(job_id=job_id_str)
-        return future
-
-    def map(
-        self,
-        fn: Callable[..., Any],
-        iterable: list[Sequence[Any]],
-        *,
-        slurm_config: SlurmConfig,
-        task_files: TaskFiles,
-    ):
-        """
-        Return an iterator with the results of several executions of a function
-
-        This function is based on `concurrent.futures.Executor.map` from Python
-        Standard Library 3.11.
-        Original Copyright 2009 Brian Quinlan. All Rights Reserved. Licensed to
-        PSF under a Contributor Agreement.
-
-        Main modifications from the PSF function:
-
-        1. Only `fn` and `iterable` can be assigned as positional arguments;
-        2. `*iterables` argument replaced with a single `iterable`;
-        3. `timeout` and `chunksize` arguments are not supported.
-
-        Arguments:
-            fn:
-                The function to be executed
-            iterable:
-                An iterable such that each element is the list of arguments to
-                be passed to `fn`, as in `fn(*args)`.
-            slurm_config:
-                A `SlurmConfig` object.
-            task_files:
-                A `TaskFiles` object.
-        """
-
-        # Do not continue if auxiliary thread was shut down
-        if self.wait_thread.shutdown:
-            error_msg = "Cannot call `map` method after executor shutdown"
-            logger.warning(error_msg)
-            raise JobExecutionError(info=error_msg)
-
-        def _result_or_cancel(fut):
-            """
-            This function is based on the Python Standard Library 3.11.
-            Original Copyright 2009 Brian Quinlan. All Rights Reserved.
-            Licensed to PSF under a Contributor Agreement.
-            """
-            try:
-                try:
-                    return fut.result()
-                finally:
-                    fut.cancel()
-            finally:
-                # Break a reference cycle with the exception in
-                # self._exception
-                del fut
-
-        # Include common_script_lines in extra_lines
-        logger.debug(
-            f"Adding {self.common_script_lines=} to "
-            f"{slurm_config.extra_lines=}, from map method."
-        )
-        current_extra_lines = slurm_config.extra_lines or []
-        slurm_config.extra_lines = (
-            current_extra_lines + self.common_script_lines
-        )
-
-        # Set file prefixes
-        general_slurm_file_prefix = str(task_files.task_order)
-
-        # Transform iterable into a list and count its elements
-        list_args = list(iterable)
-        tot_tasks = len(list_args)
-
-        # Set/validate parameters for task batching
-        tasks_per_job, parallel_tasks_per_job = heuristics(
-            # Number of parallel components (always known)
-            tot_tasks=len(list_args),
-            # Optional WorkflowTask attributes:
-            tasks_per_job=slurm_config.tasks_per_job,
-            parallel_tasks_per_job=slurm_config.parallel_tasks_per_job,  # noqa
-            # Task requirements (multiple possible sources):
-            cpus_per_task=slurm_config.cpus_per_task,
-            mem_per_task=slurm_config.mem_per_task_MB,
-            # Fractal configuration variables (soft/hard limits):
-            target_cpus_per_job=slurm_config.target_cpus_per_job,
-            target_mem_per_job=slurm_config.target_mem_per_job,
-            target_num_jobs=slurm_config.target_num_jobs,
-            max_cpus_per_job=slurm_config.max_cpus_per_job,
-            max_mem_per_job=slurm_config.max_mem_per_job,
-            max_num_jobs=slurm_config.max_num_jobs,
-        )
-        slurm_config.parallel_tasks_per_job = parallel_tasks_per_job
-        slurm_config.tasks_per_job = tasks_per_job
-
-        # Divide arguments in batches of `n_tasks_per_script` tasks each
-        args_batches = []
-        batch_size = tasks_per_job
-        for ind_chunk in range(0, tot_tasks, batch_size):
-            args_batches.append(list_args[ind_chunk : ind_chunk + batch_size])
-        if len(args_batches) != math.ceil(tot_tasks / tasks_per_job):
-            raise RuntimeError("Something wrong here while batching tasks")
-
-        # Fetch configuration variable
-        settings = Inject(get_settings)
-        FRACTAL_SLURM_SBATCH_SLEEP = settings.FRACTAL_SLURM_SBATCH_SLEEP
-
-        logger.debug("[map] Job preparation - START")
-        current_component_index = 0
-        jobs_to_submit = []
-        for ind_batch, batch in enumerate(args_batches):
-            batch_size = len(batch)
-            this_slurm_file_prefix = (
-                f"{general_slurm_file_prefix}_batch_{ind_batch:06d}"
-            )
-            new_job_to_submit = self._prepare_job(
-                fn,
-                slurm_config=slurm_config,
-                slurm_file_prefix=this_slurm_file_prefix,
-                task_files=task_files,
-                single_task_submission=False,
-                components=batch,
-            )
-            jobs_to_submit.append(new_job_to_submit)
-            current_component_index += batch_size
-        logger.debug("[map] Job preparation - END")
-
-        self._put_subfolder_sftp(jobs=jobs_to_submit)
-
-        # Construct list of futures (one per SLURM job, i.e. one per batch)
-        # FIXME SSH: we may create a single `_submit_many_jobs` method to
-        # reduce the number of commands run over SSH
-        logger.debug("[map] Job submission - START")
-        fs = []
-        job_ids = []
-        for job in jobs_to_submit:
-            future, job_id = self._submit_job(job)
-            job_ids.append(job_id)
-            fs.append(future)
-            time.sleep(FRACTAL_SLURM_SBATCH_SLEEP)
-        for job_id in job_ids:
-            self.wait_thread.wait(job_id=job_id)
-        logger.debug("[map] Job submission - END")
-
-        # Yield must be hidden in closure so that the futures are submitted
-        # before the first iterator value is required.
-        # NOTE: In this custom map() method, _result_or_cancel(fs.pop()) is an
-        # iterable of results (if successful), and we should yield its elements
-        # rather than the whole iterable.
-        def result_iterator():
-            """
-            This function is based on the Python Standard Library 3.11.
-            Original Copyright 2009 Brian Quinlan. All Rights Reserved.
-            Licensed to PSF under a Contributor Agreement.
-            """
-            try:
-                # reverse to keep finishing order
-                fs.reverse()
-                while fs:
-                    # Careful not to keep a reference to the popped future
-                    results = _result_or_cancel(fs.pop())
-                    for res in results:
-                        yield res
-            finally:
-                for future in fs:
-                    future.cancel()
-
-        return result_iterator()
-
-    def _prepare_job(
-        self,
-        fun: Callable[..., Any],
-        slurm_file_prefix: str,
-        task_files: TaskFiles,
-        slurm_config: SlurmConfig,
-        single_task_submission: bool = False,
-        args: Optional[Sequence[Any]] = None,
-        kwargs: Optional[dict] = None,
-        components: Optional[list[Any]] = None,
-    ) -> SlurmJob:
-        """
-        Prepare a SLURM job locally, without submitting it
-
-        This function prepares and writes the local submission script, but it
-        does not transfer it to the SLURM cluster.
-
-        NOTE: this method has different behaviors when it is called from the
-        `self.submit` or `self.map` methods (which is also encoded in
-        `single_task_submission`):
-
-        * When called from `self.submit`, it supports general `args` and
-          `kwargs` arguments;
-        * When called from `self.map`, there cannot be any `args` or `kwargs`
-          argument, but there must be a `components` argument.
-
-        Arguments:
-            fun:
-            slurm_file_prefix:
-            task_files:
-            slurm_config:
-            single_task_submission:
-            args:
-            kwargs:
-            components:
-
-        Returns:
-            SlurmJob object
-        """
-
-        # Inject SLURM account (if set) into slurm_config
-        if self.slurm_account:
-            slurm_config.account = self.slurm_account
-
-        # Define slurm-job-related files
-        if single_task_submission:
-            if components is not None:
-                raise ValueError(
-                    f"{single_task_submission=} but components is not None"
-                )
-            job = SlurmJob(
-                slurm_file_prefix=slurm_file_prefix,
-                num_tasks_tot=1,
-                slurm_config=slurm_config,
-            )
-            if job.num_tasks_tot > 1:
-                raise ValueError(
-                    "{single_task_submission=} but {job.num_tasks_tot=}"
-                )
-            job.single_task_submission = True
-            job.wftask_file_prefixes = (task_files.file_prefix,)
-            job.wftask_subfolder_name = task_files.subfolder_name
-
-        else:
-            if not components or len(components) < 1:
-                raise ValueError(
-                    "In FractalSlurmSSHExecutor._submit_job, given "
-                    f"{components=}."
-                )
-            num_tasks_tot = len(components)
-            job = SlurmJob(
-                slurm_file_prefix=slurm_file_prefix,
-                num_tasks_tot=num_tasks_tot,
-                slurm_config=slurm_config,
-            )
-
-            _prefixes = []
-            _subfolder_names = []
-            for component in components:
-                # In Fractal, `component` is `dict` by construction (e.g.
-                # `component = {"zarr_url": "/something", "param": 1}``). The
-                # try/except covers the case of e.g. `executor.map([1, 2])`,
-                # which is useful for testing.
-                try:
-                    actual_component = component.get(_COMPONENT_KEY_, None)
-                except AttributeError:
-                    actual_component = str(component)
-
-                _task_file_paths = get_task_file_paths(
-                    workflow_dir_local=task_files.workflow_dir_local,
-                    workflow_dir_remote=task_files.workflow_dir_remote,
-                    task_name=task_files.task_name,
-                    task_order=task_files.task_order,
-                    component=actual_component,
-                )
-                _prefixes.append(_task_file_paths.file_prefix)
-                _subfolder_names.append(_task_file_paths.subfolder_name)
-            job.wftask_file_prefixes = tuple(_prefixes)
-
-            # Check that all components share the same subfolder
-            num_subfolders = len(set(_subfolder_names))
-            if num_subfolders != 1:
-                error_msg_short = (
-                    f"[_submit_job] Subfolder list has {num_subfolders} "
-                    "different values, but it must have only one (since "
-                    "workflow tasks are executed one by one)."
-                )
-                error_msg_detail = (
-                    "[_submit_job] Current unique subfolder names: "
-                    f"{set(_subfolder_names)}"
-                )
-                logger.error(error_msg_short)
-                logger.error(error_msg_detail)
-                raise ValueError(error_msg_short)
-            job.wftask_subfolder_name = _subfolder_names[0]
-
-        # Check that server-side subfolder exists
-        subfolder_path = self.workflow_dir_local / job.wftask_subfolder_name
-        if not subfolder_path.exists():
-            raise FileNotFoundError(
-                f"Missing folder {subfolder_path.as_posix()}."
-            )
-
-        job.input_pickle_files_local = tuple(
-            get_pickle_file_path(
-                arg=job.workerids[ind],
-                workflow_dir=self.workflow_dir_local,
-                subfolder_name=job.wftask_subfolder_name,
-                in_or_out="in",
-                prefix=job.wftask_file_prefixes[ind],
-            )
-            for ind in range(job.num_tasks_tot)
-        )
-
-        job.input_pickle_files_remote = tuple(
-            get_pickle_file_path(
-                arg=job.workerids[ind],
-                workflow_dir=self.workflow_dir_remote,
-                subfolder_name=job.wftask_subfolder_name,
-                in_or_out="in",
-                prefix=job.wftask_file_prefixes[ind],
-            )
-            for ind in range(job.num_tasks_tot)
-        )
-        job.output_pickle_files_local = tuple(
-            get_pickle_file_path(
-                arg=job.workerids[ind],
-                workflow_dir=self.workflow_dir_local,
-                subfolder_name=job.wftask_subfolder_name,
-                in_or_out="out",
-                prefix=job.wftask_file_prefixes[ind],
-            )
-            for ind in range(job.num_tasks_tot)
-        )
-        job.output_pickle_files_remote = tuple(
-            get_pickle_file_path(
-                arg=job.workerids[ind],
-                workflow_dir=self.workflow_dir_remote,
-                subfolder_name=job.wftask_subfolder_name,
-                in_or_out="out",
-                prefix=job.wftask_file_prefixes[ind],
-            )
-            for ind in range(job.num_tasks_tot)
-        )
-        # define slurm-job file local/remote paths
-        job.slurm_script_local = get_slurm_script_file_path(
-            workflow_dir=self.workflow_dir_local,
-            subfolder_name=job.wftask_subfolder_name,
-            prefix=job.slurm_file_prefix,
-        )
-        job.slurm_script_remote = get_slurm_script_file_path(
-            workflow_dir=self.workflow_dir_remote,
-            subfolder_name=job.wftask_subfolder_name,
-            prefix=job.slurm_file_prefix,
-        )
-        job.slurm_stdout_local = get_slurm_file_path(
-            workflow_dir=self.workflow_dir_local,
-            subfolder_name=job.wftask_subfolder_name,
-            out_or_err="out",
-            prefix=job.slurm_file_prefix,
-        )
-        job.slurm_stdout_remote = get_slurm_file_path(
-            workflow_dir=self.workflow_dir_remote,
-            subfolder_name=job.wftask_subfolder_name,
-            out_or_err="out",
-            prefix=job.slurm_file_prefix,
-        )
-        job.slurm_stderr_local = get_slurm_file_path(
-            workflow_dir=self.workflow_dir_local,
-            subfolder_name=job.wftask_subfolder_name,
-            out_or_err="err",
-            prefix=job.slurm_file_prefix,
-        )
-        job.slurm_stderr_remote = get_slurm_file_path(
-            workflow_dir=self.workflow_dir_remote,
-            subfolder_name=job.wftask_subfolder_name,
-            out_or_err="err",
-            prefix=job.slurm_file_prefix,
-        )
-
-        # Dump serialized versions+function+args+kwargs to pickle file(s)
-        versions = get_versions()
-        if job.single_task_submission:
-            _args = args or []
-            _kwargs = kwargs or {}
-            funcser = cloudpickle.dumps((versions, fun, _args, _kwargs))
-            with open(job.input_pickle_files_local[0], "wb") as f:
-                f.write(funcser)
-        else:
-            for ind_component, component in enumerate(components):
-                _args = [component]
-                _kwargs = {}
-                funcser = cloudpickle.dumps((versions, fun, _args, _kwargs))
-                with open(
-                    job.input_pickle_files_local[ind_component], "wb"
-                ) as f:
-                    f.write(funcser)
-
-        # Prepare commands to be included in SLURM submission script
-        cmdlines = []
-        for ind_task in range(job.num_tasks_tot):
-            input_pickle_file = job.input_pickle_files_remote[ind_task]
-            output_pickle_file = job.output_pickle_files_remote[ind_task]
-            cmdlines.append(
-                (
-                    f"{self.python_remote}"
-                    " -m fractal_server.app.runner.executors.slurm.remote "
-                    f"--input-file {input_pickle_file} "
-                    f"--output-file {output_pickle_file}"
-                )
-            )
-
-        # Prepare SLURM submission script
-        sbatch_script_content = self._prepare_sbatch_script(
-            slurm_config=job.slurm_config,
-            list_commands=cmdlines,
-            slurm_out_path=str(job.slurm_stdout_remote),
-            slurm_err_path=str(job.slurm_stderr_remote),
-        )
-        with job.slurm_script_local.open("w") as f:
-            f.write(sbatch_script_content)
-
-        return job
-
-    def _put_subfolder_sftp(self, jobs: list[SlurmJob]) -> None:
-        """
-        Transfer the jobs subfolder to the remote host.
-
-        Arguments:
-            jobs: The list of `SlurmJob` objects associated to a given
-                subfolder.
-        """
-
-        # Check that the subfolder is unique
-        subfolder_names = [job.wftask_subfolder_name for job in jobs]
-        if len(set(subfolder_names)) > 1:
-            raise ValueError(
-                "[_put_subfolder] Invalid list of jobs, "
-                f"{set(subfolder_names)=}."
-            )
-        subfolder_name = subfolder_names[0]
-
-        # Create compressed subfolder archive (locally)
-        local_subfolder = self.workflow_dir_local / subfolder_name
-        tarfile_path_local = compress_folder(local_subfolder)
-        tarfile_name = Path(tarfile_path_local).name
-        logger.info(f"Subfolder archive created at {tarfile_path_local}")
-        tarfile_path_remote = (
-            self.workflow_dir_remote / tarfile_name
-        ).as_posix()
-
-        # Transfer archive
-        t_0_put = time.perf_counter()
-        self.fractal_ssh.send_file(
-            local=tarfile_path_local,
-            remote=tarfile_path_remote,
-        )
-        t_1_put = time.perf_counter()
-        logger.info(
-            f"Subfolder archive transferred to {tarfile_path_remote}"
-            f" - elapsed: {t_1_put - t_0_put:.3f} s"
-        )
-        # Uncompress archive (remotely)
-        tar_command = (
-            f"{self.python_remote} -m "
-            "fractal_server.app.runner.extract_archive "
-            f"{tarfile_path_remote}"
-        )
-        self.fractal_ssh.run_command(cmd=tar_command)
-
-        # Remove local version
-        t_0_rm = time.perf_counter()
-        Path(tarfile_path_local).unlink()
-        t_1_rm = time.perf_counter()
-        logger.info(
-            f"Local archive removed - elapsed: {t_1_rm - t_0_rm:.3f} s"
-        )
-
-    def _submit_job(self, job: SlurmJob) -> tuple[Future, str]:
-        """
-        Submit a job to SLURM via SSH.
-
-        This method must always be called after `self._put_subfolder`.
-
-        Arguments:
-            job: The `SlurmJob` object to submit.
-        """
-
-        # Prevent calling sbatch if auxiliary thread was shut down
-        if self.wait_thread.shutdown:
-            error_msg = (
-                "Cannot call `_submit_job` method after executor shutdown"
-            )
-            logger.warning(error_msg)
-            raise JobExecutionError(info=error_msg)
-
-        # Submit job to SLURM, and get jobid
-        sbatch_command = f"sbatch --parsable {job.slurm_script_remote}"
-        pre_submission_cmds = job.slurm_config.pre_submission_commands
-        if len(pre_submission_cmds) == 0:
-            sbatch_stdout = self.fractal_ssh.run_command(cmd=sbatch_command)
-        else:
-            logger.debug(f"Now using {pre_submission_cmds=}")
-            script_lines = pre_submission_cmds + [sbatch_command]
-            script_content = "\n".join(script_lines)
-            script_content = f"{script_content}\n"
-            script_path_remote = (
-                f"{job.slurm_script_remote.as_posix()}_wrapper.sh"
-            )
-            self.fractal_ssh.write_remote_file(
-                path=script_path_remote, content=script_content
-            )
-            cmd = f"bash {script_path_remote}"
-            sbatch_stdout = self.fractal_ssh.run_command(cmd=cmd)
-
-        # Extract SLURM job ID from stdout
-        try:
-            stdout = sbatch_stdout.strip("\n")
-            jobid = int(stdout)
-        except ValueError as e:
-            error_msg = (
-                f"Submit command `{sbatch_command}` returned "
-                f"`{stdout=}` which cannot be cast to an integer "
-                f"SLURM-job ID.\n"
-                f"Note that {pre_submission_cmds=}.\n"
-                f"Original error:\n{str(e)}"
-            )
-            logger.error(error_msg)
-            raise JobExecutionError(info=error_msg)
-        job_id_str = str(jobid)
-
-        # Plug job id in stdout/stderr SLURM file paths (local and remote)
-        def _replace_job_id(_old_path: Path) -> Path:
-            return Path(_old_path.as_posix().replace("%j", job_id_str))
-
-        job.slurm_stdout_local = _replace_job_id(job.slurm_stdout_local)
-        job.slurm_stdout_remote = _replace_job_id(job.slurm_stdout_remote)
-        job.slurm_stderr_local = _replace_job_id(job.slurm_stderr_local)
-        job.slurm_stderr_remote = _replace_job_id(job.slurm_stderr_remote)
-
-        # Add the SLURM script/out/err paths to map_jobid_to_slurm_files (this
-        # must be after the `sbatch` call, so that "%j" has already been
-        # replaced with the job ID)
-        with self.jobs_lock:
-            self.map_jobid_to_slurm_files_local[job_id_str] = (
-                job.slurm_script_local.as_posix(),
-                job.slurm_stdout_local.as_posix(),
-                job.slurm_stderr_local.as_posix(),
-            )
-
-        # Create future
-        future = Future()
-        with self.jobs_lock:
-            self.jobs[job_id_str] = (future, job)
-        return future, job_id_str
-
-    def _prepare_JobExecutionError(
-        self, jobid: str, info: str
-    ) -> JobExecutionError:
-        """
-        Prepare the `JobExecutionError` for a given job
-
-        This method creates a `JobExecutionError` object and sets its attribute
-        to the appropriate SLURM-related file names. Note that the SLURM files
-        are the local ones (i.e. the ones in `self.workflow_dir_local`).
-
-        Arguments:
-            jobid:
-                ID of the SLURM job.
-            info:
-        """
-        # Extract SLURM file paths
-        with self.jobs_lock:
-            (
-                slurm_script_file,
-                slurm_stdout_file,
-                slurm_stderr_file,
-            ) = self.map_jobid_to_slurm_files_local[jobid]
-        # Construct JobExecutionError exception
-        job_exc = JobExecutionError(
-            cmd_file=slurm_script_file,
-            stdout_file=slurm_stdout_file,
-            stderr_file=slurm_stderr_file,
-            info=info,
-        )
-        return job_exc
-
-    def _missing_pickle_error_msg(self, out_path: Path) -> str:
-        settings = Inject(get_settings)
-        info = (
-            "Output pickle file of the FractalSlurmSSHExecutor "
-            "job not found.\n"
-            f"Expected file path: {out_path.as_posix()}\n"
-            "Here are some possible reasons:\n"
-            "1. The SLURM job was scancel-ed, either by the user "
-            "or due to an error (e.g. an out-of-memory or timeout "
-            "error). Note that if the scancel took place before "
-            "the job started running, the SLURM out/err files "
-            "will be empty.\n"
-            "2. Some error occurred upon writing the file to disk "
-            "(e.g. because there is not enough space on disk, or "
-            "due to an overloaded NFS filesystem). "
-            "Note that the server configuration has "
-            "FRACTAL_SLURM_ERROR_HANDLING_INTERVAL="
-            f"{settings.FRACTAL_SLURM_ERROR_HANDLING_INTERVAL} "
-            "seconds.\n"
-        )
-        return info
-
-    def _handle_remaining_jobs(
-        self,
-        remaining_futures: list[Future],
-        remaining_job_ids: list[str],
-        remaining_jobs: list[SlurmJob],
-    ) -> None:
-        """
-        Helper function used within _completion, when looping over a list of
-        several jobs/futures.
-        """
-        for future in remaining_futures:
-            try:
-                future.cancel()
-            except InvalidStateError:
-                pass
-        for job_id in remaining_job_ids:
-            self._cleanup(job_id)
-        for job in remaining_jobs:
-            for path in job.output_pickle_files_local:
-                path.unlink()
-            for path in job.input_pickle_files_local:
-                path.unlink()
-
-    def _completion(self, job_ids: list[str]) -> None:
-        """
-        Callback function to be executed whenever a job finishes.
-
-        This function is executed by self.wait_thread (triggered by either
-        finding an existing output pickle file `out_path` or finding that the
-        SLURM job is over). Since this takes place on a different thread,
-        failures may not be captured by the main thread; we use a broad
-        try/except block, so that those exceptions are reported to the main
-        thread via `fut.set_exception(...)`.
-
-        Arguments:
-            job_ids: IDs of the SLURM jobs to handle.
-        """
-        # Handle all uncaught exceptions in this broad try/except block
-        try:
-            logger.info(
-                f"[FractalSlurmSSHExecutor._completion] START, for {job_ids=}."
-            )
-
-            # Loop over all job_ids, and fetch future and job objects
-            futures: list[Future] = []
-            jobs: list[SlurmJob] = []
-            with self.jobs_lock:
-                for job_id in job_ids:
-                    future, job = self.jobs.pop(job_id)
-                    futures.append(future)
-                    jobs.append(job)
-                if not self.jobs:
-                    self.jobs_empty_cond.notify_all()
-
-            # Fetch subfolder from remote host
-            self._get_subfolder_sftp(jobs=jobs)
-
-            # First round of checking whether all output files exist
-            missing_out_paths = []
-            for job in jobs:
-                for ind_out_path, out_path in enumerate(
-                    job.output_pickle_files_local
-                ):
-                    if not out_path.exists():
-                        missing_out_paths.append(out_path)
-            num_missing = len(missing_out_paths)
-            if num_missing > 0:
-                # Output pickle files may be missing e.g. because of some slow
-                # filesystem operation; wait some time before re-trying
-                settings = Inject(get_settings)
-                sleep_time = settings.FRACTAL_SLURM_ERROR_HANDLING_INTERVAL
-                logger.info(
-                    f"{num_missing} output pickle files are missing; "
-                    f"sleep {sleep_time} seconds."
-                )
-                for missing_file in missing_out_paths:
-                    logger.debug(f"Missing output pickle file: {missing_file}")
-                time.sleep(sleep_time)
-
-            # Handle all jobs
-            for ind_job, job_id in enumerate(job_ids):
-                # Retrieve job and future objects
-                job = jobs[ind_job]
-                future = futures[ind_job]
-                remaining_job_ids = job_ids[ind_job + 1 :]
-                remaining_futures = futures[ind_job + 1 :]
-
-                outputs = []
-
-                for ind_out_path, out_path in enumerate(
-                    job.output_pickle_files_local
-                ):
-                    in_path = job.input_pickle_files_local[ind_out_path]
-                    if not out_path.exists():
-                        # Output pickle file is still missing
-                        info = self._missing_pickle_error_msg(out_path)
-                        job_exc = self._prepare_JobExecutionError(
-                            job_id, info=info
-                        )
-                        try:
-                            future.set_exception(job_exc)
-                            self._handle_remaining_jobs(
-                                remaining_futures=remaining_futures,
-                                remaining_job_ids=remaining_job_ids,
-                            )
-                            logger.info(
-                                "[FractalSlurmSSHExecutor._completion] END, "
-                                f"for {job_ids=}, with JobExecutionError due "
-                                f"to missing {out_path.as_posix()}."
-                            )
-                            return
-                        except InvalidStateError:
-                            logger.warning(
-                                f"Future {future} (SLURM job ID: {job_id}) "
-                                "was already cancelled."
-                            )
-                            in_path.unlink()
-                            self._cleanup(job_id)
-                            self._handle_remaining_jobs(
-                                remaining_futures=remaining_futures,
-                                remaining_job_ids=remaining_job_ids,
-                            )
-                            logger.info(
-                                "[FractalSlurmSSHExecutor._completion] END, "
-                                f"for {job_ids=}, with JobExecutionError/"
-                                "InvalidStateError due to "
-                                f"missing {out_path.as_posix()}."
-                            )
-                            return
-
-                    # Read the task output
-                    with out_path.open("rb") as f:
-                        outdata = f.read()
-                    # Note: output can be either the task result (typically a
-                    # dictionary) or an ExceptionProxy object; in the latter
-                    # case, the ExceptionProxy definition is also part of the
-                    # pickle file (thanks to cloudpickle.dumps).
-                    success, output = cloudpickle.loads(outdata)
-                    try:
-                        if success:
-                            outputs.append(output)
-                        else:
-                            proxy = output
-                            if proxy.exc_type_name == "JobExecutionError":
-                                job_exc = self._prepare_JobExecutionError(
-                                    job_id, info=proxy.kwargs.get("info", None)
-                                )
-                                future.set_exception(job_exc)
-                                self._handle_remaining_jobs(
-                                    remaining_futures=remaining_futures,
-                                    remaining_job_ids=remaining_job_ids,
-                                )
-                                return
-                            else:
-                                # This branch catches both TaskExecutionError's
-                                # (coming from the typical fractal-server
-                                # execution of tasks, and with additional
-                                # fractal-specific kwargs) or arbitrary
-                                # exceptions (coming from a direct use of
-                                # FractalSlurmSSHExecutor, possibly outside
-                                # fractal-server)
-                                kwargs = {}
-                                for key in [
-                                    "workflow_task_id",
-                                    "workflow_task_order",
-                                    "task_name",
-                                ]:
-                                    if key in proxy.kwargs.keys():
-                                        kwargs[key] = proxy.kwargs[key]
-                                exc = TaskExecutionError(proxy.tb, **kwargs)
-                                future.set_exception(exc)
-                                self._handle_remaining_jobs(
-                                    remaining_futures=remaining_futures,
-                                    remaining_job_ids=remaining_job_ids,
-                                )
-                                return
-                        out_path.unlink()
-                    except InvalidStateError:
-                        logger.warning(
-                            f"Future {future} (SLURM job ID: {job_id}) was "
-                            "already cancelled, exit from "
-                            "FractalSlurmSSHExecutor._completion."
-                        )
-                        out_path.unlink()
-                        in_path.unlink()
-
-                        self._cleanup(job_id)
-                        self._handle_remaining_jobs(
-                            remaining_futures=remaining_futures,
-                            remaining_job_ids=remaining_job_ids,
-                        )
-                        return
-
-                    # Clean up input pickle file
-                    in_path.unlink()
-                self._cleanup(job_id)
-                if job.single_task_submission:
-                    future.set_result(outputs[0])
-                else:
-                    future.set_result(outputs)
-
-        except Exception as e:
-            logger.warning(
-                "[FractalSlurmSSHExecutor._completion] "
-                f"An exception took place: {str(e)}."
-            )
-            for future in futures:
-                try:
-                    logger.info(f"Set exception for {future=}")
-                    future.set_exception(e)
-                except InvalidStateError:
-                    logger.info(f"Future {future} was already cancelled.")
-            logger.info(
-                f"[FractalSlurmSSHExecutor._completion] END, for {job_ids=}, "
-                "from within exception handling."
-            )
-            return
-
-    def _get_subfolder_sftp(self, jobs: list[SlurmJob]) -> None:
-        """
-        Fetch a remote folder via tar+sftp+tar
-
-        Arguments:
-            jobs:
-                List of `SlurmJob` object (needed for their prefix-related
-                attributes).
-        """
-
-        # Check that the subfolder is unique
-        subfolder_names = [job.wftask_subfolder_name for job in jobs]
-        if len(set(subfolder_names)) > 1:
-            raise ValueError(
-                "[_put_subfolder] Invalid list of jobs, "
-                f"{set(subfolder_names)=}."
-            )
-        subfolder_name = subfolder_names[0]
-
-        t_0 = time.perf_counter()
-        logger.debug("[_get_subfolder_sftp] Start")
-        tarfile_path_local = (
-            self.workflow_dir_local / f"{subfolder_name}.tar.gz"
-        ).as_posix()
-        tarfile_path_remote = (
-            self.workflow_dir_remote / f"{subfolder_name}.tar.gz"
-        ).as_posix()
-
-        # Remove remote tarfile
-        rm_command = f"rm {tarfile_path_remote}"
-        self.fractal_ssh.run_command(cmd=rm_command)
-
-        # Create remote tarfile
-        tar_command = (
-            f"{self.python_remote} "
-            "-m fractal_server.app.runner.compress_folder "
-            f"{(self.workflow_dir_remote / subfolder_name).as_posix()} "
-            "--remote-to-local"
-        )
-        stdout = self.fractal_ssh.run_command(cmd=tar_command)
-        print(stdout)
-
-        # Fetch tarfile
-        t_0_get = time.perf_counter()
-        self.fractal_ssh.fetch_file(
-            remote=tarfile_path_remote,
-            local=tarfile_path_local,
-        )
-        t_1_get = time.perf_counter()
-        logger.info(
-            f"Subfolder archive transferred back to {tarfile_path_local}"
-            f" - elapsed: {t_1_get - t_0_get:.3f} s"
-        )
-
-        # Extract tarfile locally
-        extract_archive(Path(tarfile_path_local))
-
-        # Remove local tarfile
-        if Path(tarfile_path_local).exists():
-            logger.warning(f"Remove existing file {tarfile_path_local}.")
-            Path(tarfile_path_local).unlink()
-
-        t_1 = time.perf_counter()
-        logger.info(f"[_get_subfolder_sftp] End - elapsed: {t_1 - t_0:.3f} s")
-
-    def _prepare_sbatch_script(
-        self,
-        *,
-        list_commands: list[str],
-        slurm_out_path: str,
-        slurm_err_path: str,
-        slurm_config: SlurmConfig,
-    ):
-        num_tasks_max_running = slurm_config.parallel_tasks_per_job
-        mem_per_task_MB = slurm_config.mem_per_task_MB
-
-        # Set ntasks
-        ntasks = min(len(list_commands), num_tasks_max_running)
-        if len(list_commands) < num_tasks_max_running:
-            ntasks = len(list_commands)
-            slurm_config.parallel_tasks_per_job = ntasks
-            logger.debug(
-                f"{len(list_commands)=} is smaller than "
-                f"{num_tasks_max_running=}. Setting {ntasks=}."
-            )
-
-        # Prepare SLURM preamble based on SlurmConfig object
-        script_lines = slurm_config.to_sbatch_preamble(
-            remote_export_dir=self.workflow_dir_remote.as_posix()
-        )
-
-        # Extend SLURM preamble with variables which are not in SlurmConfig,
-        # and fix their order
-        script_lines.extend(
-            [
-                f"#SBATCH --err={slurm_err_path}",
-                f"#SBATCH --out={slurm_out_path}",
-                f"#SBATCH -D {self.workflow_dir_remote}",
-            ]
-        )
-        script_lines = slurm_config.sort_script_lines(script_lines)
-        logger.debug(script_lines)
-
-        # Always print output of `uname -n` and `pwd`
-        script_lines.append(
-            'echo "Hostname: `uname -n`; current directory: `pwd`"\n'
-        )
-
-        # Complete script preamble
-        script_lines.append("\n")
-
-        # Include command lines
-        tmp_list_commands = copy(list_commands)
-        while tmp_list_commands:
-            if tmp_list_commands:
-                cmd = tmp_list_commands.pop(0)  # take first element
-                script_lines.append(
-                    "srun --ntasks=1 --cpus-per-task=$SLURM_CPUS_PER_TASK "
-                    f"--mem={mem_per_task_MB}MB "
-                    f"{cmd} &"
-                )
-        script_lines.append("wait\n")
-
-        script = "\n".join(script_lines)
-        return script
-
-    def shutdown(self, wait=True, *, cancel_futures=False):
-        """
-        Clean up all executor variables. Note that this function is executed on
-        the self.wait_thread thread, see _completion.
-        """
-
-        # Redundantly set thread shutdown attribute to True
-        self.wait_thread.shutdown = True
-
-        logger.debug("Executor shutdown: start")
-
-        # Handle all job futures
-        slurm_jobs_to_scancel = []
-        with self.jobs_lock:
-            while self.jobs:
-                jobid, fut_and_job = self.jobs.popitem()
-                slurm_jobs_to_scancel.append(jobid)
-                fut = fut_and_job[0]
-                self.map_jobid_to_slurm_files_local.pop(jobid)
-                if not fut.cancelled():
-                    fut.set_exception(
-                        JobExecutionError(
-                            "Job cancelled due to executor shutdown."
-                        )
-                    )
-                    fut.cancel()
-
-        # Cancel SLURM jobs
-        if slurm_jobs_to_scancel:
-            scancel_string = " ".join(slurm_jobs_to_scancel)
-            logger.warning(f"Now scancel-ing SLURM jobs {scancel_string}")
-            scancel_command = f"scancel {scancel_string}"
-            self.fractal_ssh.run_command(cmd=scancel_command)
-        logger.debug("Executor shutdown: end")
-
-    def _stop_and_join_wait_thread(self):
-        self.wait_thread.shutdown = True
-        self.wait_thread.join()
-
-    def __exit__(self, *args, **kwargs):
-        """
-        See
-        https://github.com/fractal-analytics-platform/fractal-server/issues/1508
-        """
-        logger.debug(
-            "[FractalSlurmSSHExecutor.__exit__] Stop and join `wait_thread`"
-        )
-        self._stop_and_join_wait_thread()
-        logger.debug("[FractalSlurmSSHExecutor.__exit__] End")
-
-    def run_squeue(self, job_ids):
-        squeue_command = (
-            "squeue "
-            "--noheader "
-            "--format='%i %T' "
-            "--jobs __JOBS__ "
-            "--states=all"
-        )
-        job_ids = ",".join([str(j) for j in job_ids])
-        squeue_command = squeue_command.replace("__JOBS__", job_ids)
-        stdout = self.fractal_ssh.run_command(cmd=squeue_command)
-        return stdout
-
-    def _jobs_finished(self, job_ids: list[str]) -> set[str]:
-        """
-        Check which of the given SLURM jobs already finished
-
-        The function is based on the `_jobs_finished` function from
-        clusterfutures (version 0.5).
-        Original Copyright: 2022 Adrian Sampson
-        (released under the MIT licence)
-        """
-
-        logger.debug(
-            f"[FractalSlurmSSHExecutor._jobs_finished] START ({job_ids=})"
-        )
-
-        # If there is no Slurm job to check, return right away
-        if not job_ids:
-            logger.debug(
-                "[FractalSlurmSSHExecutor._jobs_finished] "
-                "No jobs provided, return."
-            )
-            return set()
-
-        try:
-            stdout = self.run_squeue(job_ids)
-            id_to_state = {
-                out.split()[0]: out.split()[1] for out in stdout.splitlines()
-            }
-            # Finished jobs only stay in squeue for a few mins (configurable).
-            # If a job ID isn't there, we'll assume it's finished.
-            output = {
-                _id
-                for _id in job_ids
-                if id_to_state.get(_id, "COMPLETED") in STATES_FINISHED
-            }
-            logger.debug(
-                f"[FractalSlurmSSHExecutor._jobs_finished] END - {output=}"
-            )
-            return output
-        except Exception as e:
-            # If something goes wrong, proceed anyway
-            logger.error(
-                f"Something wrong in _jobs_finished. Original error: {str(e)}"
-            )
-            output = set()
-            logger.debug(
-                f"[FractalSlurmSSHExecutor._jobs_finished] END - {output=}"
-            )
-            return output
-
-        id_to_state = dict()
-        for j in job_ids:
-            res = self.run_squeue([j])
-            if res.returncode != 0:
-                logger.info(f"Job {j} not found. Marked it as completed")
-                id_to_state.update({str(j): "COMPLETED"})
-            else:
-                id_to_state.update(
-                    {res.stdout.split()[0]: res.stdout.split()[1]}
-                )
-
-    def handshake(self) -> dict:
-        """
-        Healthcheck for SSH connection and for versions match.
-
-        FIXME SSH: We should add a timeout here
-        FIXME SSH: We could include checks on the existence of folders
-        FIXME SSH: We could include further checks on version matches
-        """
-
-        self.fractal_ssh.check_connection()
-
-        t_start_handshake = time.perf_counter()
-
-        logger.info("[FractalSlurmSSHExecutor.ssh_handshake] START")
-        cmd = f"{self.python_remote} -m fractal_server.app.runner.versions"
-        stdout = self.fractal_ssh.run_command(cmd=cmd)
-        try:
-            remote_versions = json.loads(stdout.strip("\n"))
-        except json.decoder.JSONDecodeError as e:
-            logger.error("Fractal server versions not available")
-            raise e
-
-        # Check compatibility with local versions
-        local_versions = get_versions()
-        remote_fractal_server = remote_versions["fractal_server"]
-        local_fractal_server = local_versions["fractal_server"]
-        if remote_fractal_server != local_fractal_server:
-            error_msg = (
-                "Fractal-server version mismatch.\n"
-                "Local interpreter: "
-                f"({sys.executable}): {local_versions}.\n"
-                "Remote interpreter: "
-                f"({self.python_remote}): {remote_versions}."
-            )
-            logger.error(error_msg)
-            raise ValueError(error_msg)
-
-        t_end_handshake = time.perf_counter()
-        logger.info(
-            "[FractalSlurmSSHExecutor.ssh_handshake] END"
-            f" - elapsed: {t_end_handshake - t_start_handshake:.3f} s"
-        )
-        return remote_versions