fractal-server 2.14.0a13__py3-none-any.whl → 2.14.0a15__py3-none-any.whl
This diff compares the contents of two publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the packages as they appear in their respective public registries.
- fractal_server/__init__.py +1 -1
- fractal_server/app/models/linkusergroup.py +6 -2
- fractal_server/app/models/v2/dataset.py +1 -1
- fractal_server/app/models/v2/job.py +7 -3
- fractal_server/app/models/v2/task_group.py +2 -2
- fractal_server/app/models/v2/workflow.py +1 -1
- fractal_server/app/models/v2/workflowtask.py +1 -1
- fractal_server/app/routes/admin/v2/task_group.py +0 -17
- fractal_server/app/routes/api/v2/_aux_functions_history.py +8 -0
- fractal_server/app/routes/api/v2/dataset.py +0 -8
- fractal_server/app/routes/api/v2/history.py +111 -27
- fractal_server/app/routes/api/v2/images.py +16 -14
- fractal_server/app/routes/api/v2/project.py +0 -52
- fractal_server/app/routes/api/v2/task_group.py +0 -17
- fractal_server/app/routes/api/v2/workflow.py +0 -8
- fractal_server/app/routes/auth/group.py +0 -16
- fractal_server/app/runner/executors/base_runner.py +5 -0
- fractal_server/app/runner/executors/local/runner.py +15 -7
- fractal_server/app/runner/executors/slurm_common/_handle_exception_proxy.py +17 -0
- fractal_server/app/runner/executors/slurm_common/base_slurm_runner.py +677 -0
- fractal_server/app/runner/executors/slurm_common/slurm_job_task_models.py +102 -0
- fractal_server/app/runner/executors/slurm_ssh/runner.py +110 -648
- fractal_server/app/runner/executors/slurm_sudo/runner.py +32 -661
- fractal_server/app/runner/task_files.py +20 -6
- fractal_server/app/runner/v2/_slurm_ssh.py +6 -6
- fractal_server/app/runner/v2/_slurm_sudo.py +4 -4
- fractal_server/app/runner/v2/runner.py +4 -0
- fractal_server/app/runner/v2/runner_functions.py +2 -2
- fractal_server/app/runner/v2/submit_workflow.py +7 -16
- fractal_server/app/schemas/v2/__init__.py +3 -1
- fractal_server/app/schemas/v2/history.py +27 -2
- fractal_server/config.py +6 -2
- fractal_server/images/tools.py +23 -0
- fractal_server/migrations/versions/5b6007027595_on_cascade.py +250 -0
- fractal_server/migrations/versions/fbce16ff4e47_new_history_items.py +2 -2
- fractal_server/tasks/v2/utils_background.py +0 -19
- {fractal_server-2.14.0a13.dist-info → fractal_server-2.14.0a15.dist-info}/METADATA +1 -1
- {fractal_server-2.14.0a13.dist-info → fractal_server-2.14.0a15.dist-info}/RECORD +41 -42
- fractal_server/app/runner/executors/slurm_common/_check_jobs_status.py +0 -77
- fractal_server/app/runner/executors/slurm_ssh/_check_job_status_ssh.py +0 -67
- fractal_server/app/runner/executors/slurm_ssh/_executor_wait_thread.py +0 -126
- fractal_server/app/runner/executors/slurm_ssh/_slurm_job.py +0 -116
- fractal_server/app/runner/executors/slurm_ssh/executor.py +0 -1386
- {fractal_server-2.14.0a13.dist-info → fractal_server-2.14.0a15.dist-info}/LICENSE +0 -0
- {fractal_server-2.14.0a13.dist-info → fractal_server-2.14.0a15.dist-info}/WHEEL +0 -0
- {fractal_server-2.14.0a13.dist-info → fractal_server-2.14.0a15.dist-info}/entry_points.txt +0 -0
fractal_server/app/runner/executors/slurm_ssh/runner.py
@@ -1,30 +1,11 @@
-import json
-import math
-import sys
 import time
-from copy import copy
 from pathlib import Path
-from typing import Any
 from typing import Optional

-import
-from
-from
-
-from ._check_job_status_ssh import get_finished_jobs_ssh
-from fractal_server import __VERSION__
-from fractal_server.app.runner.exceptions import JobExecutionError
-from fractal_server.app.runner.exceptions import TaskExecutionError
-from fractal_server.app.runner.executors.base_runner import BaseRunner
-from fractal_server.app.runner.executors.slurm_common._batching import (
-    heuristics,
-)
-from fractal_server.app.runner.executors.slurm_common._slurm_config import (
-    SlurmConfig,
-)
-from fractal_server.app.runner.filenames import SHUTDOWN_FILENAME
-from fractal_server.app.runner.task_files import TaskFiles
-from fractal_server.app.schemas.v2.task import TaskTypeType
+from ..slurm_common.base_slurm_runner import BaseSlurmRunner
+from ..slurm_common.slurm_job_task_models import SlurmJob
+from fractal_server.app.runner.compress_folder import compress_folder
+from fractal_server.app.runner.extract_archive import extract_archive
 from fractal_server.config import get_settings
 from fractal_server.logger import set_logger
 from fractal_server.ssh._fabric import FractalSSH
@@ -32,676 +13,157 @@ from fractal_server.syringe import Inject


 logger = set_logger(__name__)
+# FIXME: Transform several logger.info into logger.debug.


-
-    if proxy.exc_type_name == "JobExecutionError":
-        return JobExecutionError(str(proxy))
-    else:
-        kwargs = {}
-        for key in [
-            "workflow_task_id",
-            "workflow_task_order",
-            "task_name",
-        ]:
-            if key in proxy.kwargs.keys():
-                kwargs[key] = proxy.kwargs[key]
-        return TaskExecutionError(proxy.tb, **kwargs)
-
-
-class SlurmTask(BaseModel):
-    model_config = ConfigDict(arbitrary_types_allowed=True)
-    component: str
-    workdir_local: Path
-    workdir_remote: Path
-    parameters: dict[str, Any]
-    zarr_url: Optional[str] = None
-    task_files: TaskFiles
-    index: int
-
-    @property
-    def input_pickle_file_local(self) -> str:
-        return (
-            self.workdir_local / f"{self.component}-input.pickle"
-        ).as_posix()
-
-    @property
-    def output_pickle_file_local(self) -> str:
-        return (
-            self.workdir_local / f"{self.component}-output.pickle"
-        ).as_posix()
-
-    @property
-    def input_pickle_file_remote(self) -> str:
-        return (
-            self.workdir_remote / f"{self.component}-input.pickle"
-        ).as_posix()
-
-    @property
-    def output_pickle_file_remote(self) -> str:
-        return (
-            self.workdir_remote / f"{self.component}-output.pickle"
-        ).as_posix()
-
-
-class SlurmJob(BaseModel):
-    slurm_job_id: Optional[str] = None
-    label: str
-    workdir_local: Path
-    workdir_remote: Path
-    tasks: list[SlurmTask]
-
-    @property
-    def slurm_log_file_local(self) -> str:
-        if self.slurm_job_id:
-            return (
-                self.workdir_local
-                / f"slurm-{self.label}-{self.slurm_job_id}.log"
-            ).as_posix()
-        else:
-            return (
-                self.workdir_local / f"slurm-{self.label}-%j.log"
-            ).as_posix()
-
-    @property
-    def slurm_log_file_remote(self) -> str:
-        if self.slurm_job_id:
-            return (
-                self.workdir_remote
-                / f"slurm-{self.label}-{self.slurm_job_id}.log"
-            ).as_posix()
-        else:
-            return (
-                self.workdir_remote / f"slurm-{self.label}-%j.log"
-            ).as_posix()
-
-    @property
-    def slurm_submission_script_local(self) -> str:
-        return (
-            self.workdir_local / f"slurm-{self.label}-submit.sh"
-        ).as_posix()
-
-    @property
-    def slurm_submission_script_remote(self) -> str:
-        return (
-            self.workdir_remote / f"slurm-{self.label}-submit.sh"
-        ).as_posix()
-
-    @property
-    def slurm_stdout(self) -> str:
-        return (self.workdir_remote / f"slurm-{self.label}.out").as_posix()
-
-    @property
-    def slurm_stderr(self) -> str:
-        return (self.workdir_remote / f"slurm-{self.label}.err").as_posix()
-
-    @property
-    def log_files_local(self) -> list[str]:
-        return [task.task_files.log_file_local for task in self.tasks]
-
-
-# def _subprocess_run_or_raise(
-#     full_command: str,
-# ) -> Optional[subprocess.CompletedProcess]:
-#     try:
-#         output = subprocess.run(  # nosec
-#             shlex.split(full_command),
-#             capture_output=True,
-#             check=True,
-#             encoding="utf-8",
-#         )
-#         return output
-#     except subprocess.CalledProcessError as e:
-#         error_msg = (
-#             f"Submit command `{full_command}` failed. "
-#             f"Original error:\n{str(e)}\n"
-#             f"Original stdout:\n{e.stdout}\n"
-#             f"Original stderr:\n{e.stderr}\n"
-#         )
-#         logging.error(error_msg)
-#         raise JobExecutionError(info=error_msg)
-
-
-class RunnerSlurmSSH(BaseRunner):
+class SlurmSSHRunner(BaseSlurmRunner):
     fractal_ssh: FractalSSH

-    slurm_user: str
-    shutdown_file: Path
-    common_script_lines: list[str]
-    user_cache_dir: str
-    root_dir_local: Path
-    root_dir_remote: Path
-    slurm_account: Optional[str] = None
-    poll_interval: int
-    python_worker_interpreter: str
-    jobs: dict[str, SlurmJob]
-
     def __init__(
         self,
         *,
-
-        slurm_user: str,
+        # Common
         root_dir_local: Path,
         root_dir_remote: Path,
-        slurm_account: Optional[str] = None,
         common_script_lines: Optional[list[str]] = None,
         user_cache_dir: Optional[str] = None,
-
+        poll_interval: Optional[int] = None,
+        # Specific
+        fractal_ssh: FractalSSH,
     ) -> None:
         """
         Set parameters that are the same for different Fractal tasks and for
         different SLURM jobs/tasks.
         """
-
-        self.slurm_user = slurm_user
-        self.slurm_account = slurm_account
-        self.common_script_lines = common_script_lines or []
-
-        # Check that SLURM account is not set here
-        # FIXME: move to little method
-        try:
-            invalid_line = next(
-                line
-                for line in self.common_script_lines
-                if line.startswith("#SBATCH --account=")
-            )
-            raise RuntimeError(
-                "Invalid line in `RunnerSlurmSSH.common_script_lines`: "
-                f"'{invalid_line}'.\n"
-                "SLURM account must be set via the request body of the "
-                "apply-workflow endpoint, or by modifying the user properties."
-            )
-        except StopIteration:
-            pass
-
-        # Check Python versions
-        settings = Inject(get_settings)
         self.fractal_ssh = fractal_ssh
         logger.warning(self.fractal_ssh)

-        # It is the new handshanke
-        if settings.FRACTAL_SLURM_WORKER_PYTHON is not None:
-            self.check_remote_python_interpreter()
-
-        # Initialize connection and perform handshake
-        self.root_dir_local = root_dir_local
-        self.root_dir_remote = root_dir_remote
-
-        # # Create folders
-        # original_umask = os.umask(0)
-        # self.root_dir_local.mkdir(parents=True, exist_ok=True, mode=0o755)
-        # os.umask(original_umask)
-        # _mkdir_as_user(
-        #     folder=self.root_dir_remote.as_posix(),
-        #     user=self.slurm_user,
-        # )
-
-        self.user_cache_dir = user_cache_dir
-
-        self.slurm_poll_interval = (
-            slurm_poll_interval or settings.FRACTAL_SLURM_POLL_INTERVAL
-        )
-
-        self.shutdown_file = self.root_dir_local / SHUTDOWN_FILENAME
-
-        self.python_worker_interpreter = (
-            settings.FRACTAL_SLURM_WORKER_PYTHON or sys.executable
-        )
-
-        self.jobs = {}
-
-    def __enter__(self):
-        return self
-
-    def __exit__(self, exc_type, exc_val, exc_tb):
-        return False
-
-    def is_shutdown(self) -> bool:
-        return self.shutdown_file.exists()
-
-    def scancel_jobs(self) -> None:
-        logger.debug("[scancel_jobs] START")
-
-        if self.jobs:
-            scancel_string = " ".join(self.job_ids)
-            scancel_cmd = f"scancel {scancel_string}"
-            logger.warning(f"Now scancel-ing SLURM jobs {scancel_string}")
-            try:
-                self.fractal_ssh.run_command(cmd=scancel_cmd)
-                # _run_command_as_user(
-                #     cmd=scancel_cmd,
-                #     user=self.slurm_user,
-                #     check=True,
-                # )
-            except RuntimeError as e:
-                logger.warning(
-                    "[scancel_jobs] `scancel` command failed. "
-                    f"Original error:\n{str(e)}"
-                )
-
-        logger.debug("[scancel_jobs] END")
-
-    def _submit_single_sbatch(
-        self,
-        func,
-        slurm_job: SlurmJob,
-        slurm_config: SlurmConfig,
-    ) -> str:
-        # Prepare input pickle(s)
-        versions = dict(
-            python=sys.version_info[:3],
-            cloudpickle=cloudpickle.__version__,
-            fractal_server=__VERSION__,
-        )
-        for task in slurm_job.tasks:
-            _args = []
-            _kwargs = dict(parameters=task.parameters)
-            funcser = cloudpickle.dumps((versions, func, _args, _kwargs))
-            with open(task.input_pickle_file_local, "wb") as f:
-                f.write(funcser)
-        # Prepare commands to be included in SLURM submission script
         settings = Inject(get_settings)
-        python_worker_interpreter =
-            settings.FRACTAL_SLURM_WORKER_PYTHON or sys.executable
-        )
-        cmdlines = []
-        for task in slurm_job.tasks:
-            input_pickle_file = task.input_pickle_file_local
-            output_pickle_file = task.output_pickle_file_remote
-            cmdlines.append(
-                (
-                    f"{python_worker_interpreter}"
-                    " -m fractal_server.app.runner."
-                    "executors.slurm_common.remote "
-                    f"--input-file {input_pickle_file} "
-                    f"--output-file {output_pickle_file}"
-                )
-            )
-
-        # ...
-        num_tasks_max_running = slurm_config.parallel_tasks_per_job
-        mem_per_task_MB = slurm_config.mem_per_task_MB
-
-        # Set ntasks
-        ntasks = min(len(cmdlines), num_tasks_max_running)
-        slurm_config.parallel_tasks_per_job = ntasks
+        self.python_worker_interpreter = settings.FRACTAL_SLURM_WORKER_PYTHON

-
-
-
+        super().__init__(
+            slurm_runner_type="ssh",
+            root_dir_local=root_dir_local,
+            root_dir_remote=root_dir_remote,
+            common_script_lines=common_script_lines,
+            user_cache_dir=user_cache_dir,
+            poll_interval=poll_interval,
         )

-
-
-        script_lines.extend(
-            [
-                f"#SBATCH --err={slurm_job.slurm_stderr}",
-                f"#SBATCH --out={slurm_job.slurm_stdout}",
-                f"#SBATCH -D {slurm_job.workdir_remote}",
-            ]
-        )
-        script_lines = slurm_config.sort_script_lines(script_lines)
-        logger.debug(script_lines)
+    def _mkdir_local_folder(self, folder: str) -> None:
+        Path(folder).mkdir(parents=True)

-
-
-
+    def _mkdir_remote_folder(self, folder: str):
+        self.fractal_ssh.mkdir(
+            folder=folder,
+            parents=True,
         )

-
-
+    def _copy_files_from_remote_to_local(self, slurm_job: SlurmJob) -> None:
+        self._get_subfolder_sftp(job=slurm_job)

-
-
-
-
-
-
-
-
-
-        )
-        script_lines.append("wait\n")
-
-        script = "\n".join(script_lines)
+    def _put_subfolder_sftp(self, job: SlurmJob) -> None:
+        # FIXME re-introduce use of this function, but only after splitting
+        # submission logic into
+        # 1. prepare all
+        # 2. send folder
+        # 3. submit all
+        """
+        Transfer the jobs subfolder to the remote host.
+        """

-        #
-
-        with open(slurm_job.slurm_submission_script_local, "w") as f:
-            f.write(script)
+        # Create compressed subfolder archive (locally)
+        tarfile_path_local = compress_folder(job.workdir_local)

+        tarfile_name = Path(tarfile_path_local).name
+        logger.info(f"Subfolder archive created at {tarfile_path_local}")
+        tarfile_path_remote = (
+            job.workdir_remote.parent / tarfile_name
+        ).as_posix()
+        # Transfer archive
+        t_0_put = time.perf_counter()
         self.fractal_ssh.send_file(
-            local=
-            remote=
+            local=tarfile_path_local,
+            remote=tarfile_path_remote,
         )
-
-
-
-            f"
+        t_1_put = time.perf_counter()
+        logger.info(
+            f"Subfolder archive transferred to {tarfile_path_remote}"
+            f" - elapsed: {t_1_put - t_0_put:.3f} s"
         )
-
-
-
-
-
-
-
-        script_content = f"{script_content}\n"
-        script_path_remote = (
-            f"{slurm_job.slurm_script_remote.as_posix()}_wrapper.sh"
-        )
-        self.fractal_ssh.write_remote_file(
-            path=script_path_remote, content=script_content
-        )
-        cmd = f"bash {script_path_remote}"
-        sbatch_stdout = self.fractal_ssh.run_command(cmd=cmd)
-
-        # Submit SLURM job and retrieve job ID
-        stdout = sbatch_stdout.strip("\n")
-        submitted_job_id = int(stdout)
-        slurm_job.slurm_job_id = str(submitted_job_id)
-
-        # Add job to self.jobs
-        self.jobs[slurm_job.slurm_job_id] = slurm_job
-        logger.debug(f"Added {slurm_job.slurm_job_id} to self.jobs.")
+        # Uncompress archive (remotely)
+        tar_command = (
+            f"{self.python_worker_interpreter} -m "
+            "fractal_server.app.runner.extract_archive "
+            f"{tarfile_path_remote}"
+        )
+        self.fractal_ssh.run_command(cmd=tar_command)

-
-
-
+        # Remove local version
+        t_0_rm = time.perf_counter()
+        Path(tarfile_path_local).unlink()
+        t_1_rm = time.perf_counter()
+        logger.info(
+            f"Local archive removed - elapsed: {t_1_rm - t_0_rm:.3f} s"
+        )

-    def
-    # FIXME: This should only transfer archives, not single files
+    def _get_subfolder_sftp(self, job: SlurmJob) -> None:
         """
-
+        Fetch a remote folder via tar+sftp+tar
         """
-        source_target_list = [
-            (job.slurm_log_file_remote, job.slurm_log_file_local)
-        ]
-        for task in job.tasks:
-            source_target_list.extend(
-                [
-                    (
-                        task.output_pickle_file_remote,
-                        task.output_pickle_file_local,
-                    ),
-                    (
-                        task.task_files.log_file_remote,
-                        task.task_files.log_file_local,
-                    ),
-                    (
-                        task.task_files.args_file_remote,
-                        task.task_files.args_file_local,
-                    ),
-                    (
-                        task.task_files.metadiff_file_remote,
-                        task.task_files.metadiff_file_local,
-                    ),
-                ]
-            )

-
-
-
-
-
-
-
-
-            # )
-            # Write local file
-            # with open(target, "wb") as f:
-            #     f.write(res.stdout)
-            # logger.critical(f"Copied {source} into {target}")
-        except (RuntimeError, FileNotFoundError) as e:
-            logger.warning(
-                f"SKIP copy {target} into {source}. "
-                f"Original error: {str(e)}"
-            )
+        t_0 = time.perf_counter()
+        logger.debug("[_get_subfolder_sftp] Start")
+        tarfile_path_local = (
+            job.workdir_local.parent / f"{job.workdir_local.name}.tar.gz"
+        ).as_posix()
+        tarfile_path_remote = (
+            job.workdir_remote.parent / f"{job.workdir_remote.name}.tar.gz"
+        ).as_posix()

-
-        self, *, task: SlurmTask
-    ) -> tuple[Any, Exception]:
+        # Remove remote tarfile
         try:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            Path(task.output_pickle_file_local).unlink(missing_ok=True)
-
-    def submit(
-        self,
-        func: callable,
-        parameters: dict[str, Any],
-        history_item_id: int,
-        task_files: TaskFiles,
-        slurm_config: SlurmConfig,
-        task_type: TaskTypeType,
-    ) -> tuple[Any, Exception]:
-        workdir_local = task_files.wftask_subfolder_local
-        workdir_remote = task_files.wftask_subfolder_remote
-
-        task_files = TaskFiles(
-            **task_files.model_dump(
-                exclude={"component"},
-            ),
-            # FIXME _COMPONENT_KEY_ is deprecated
-            component="FIXME_INVALID_FAKE_VALUE",
-            # component=parameters[_COMPONENT_KEY_],
-        )
-
-        if self.jobs != {}:
-            raise JobExecutionError("Unexpected branch: jobs should be empty.")
-
-        if self.is_shutdown():
-            raise JobExecutionError("Cannot continue after shutdown.")
-
-        # Validation phase
-        self.validate_submit_parameters(
-            parameters=parameters,
-            task_type=task_type,
-        )
-
-        # Create task subfolder
-        workdir_local.mkdir(parents=True)
-        self.fractal_ssh.mkdir(
-            folder=workdir_remote.as_posix(),
-            parents=True,
-        )
-
-        # Submission phase
-        slurm_job = SlurmJob(
-            label="0",
-            workdir_local=workdir_local,
-            workdir_remote=workdir_remote,
-            tasks=[
-                SlurmTask(
-                    index=0,
-                    component="0",
-                    parameters=parameters,
-                    workdir_remote=workdir_remote,
-                    workdir_local=workdir_local,
-                    task_files=task_files,
-                )
-            ],
-        )  # TODO: replace with actual values (BASED ON TASKFILES)
-
-        slurm_config.parallel_tasks_per_job = 1
-        self._submit_single_sbatch(
-            func,
-            slurm_job=slurm_job,
-            slurm_config=slurm_config,
+            rm_command = f"rm {tarfile_path_remote}"
+            self.fractal_ssh.run_command(cmd=rm_command)
+        except RuntimeError as e:
+            logger.warning(f"{tarfile_path_remote} already exists!\n {str(e)}")
+
+        # Create remote tarfile
+        # FIXME: introduce filtering by prefix, so that when the subfolder
+        # includes N SLURM jobs we don't always copy the cumulative folder
+        # but only the relevant part
+        tar_command = (
+            f"{self.python_worker_interpreter} "
+            "-m fractal_server.app.runner.compress_folder "
+            f"{job.workdir_remote.as_posix()} "
+            "--remote-to-local"
        )
+        stdout = self.fractal_ssh.run_command(cmd=tar_command)
+        print(stdout)

-        #
-
-
-
-
-                job_ids=self.job_ids,
-                fractal_ssh=self.fractal_ssh,
-            )
-            for slurm_job_id in finished_job_ids:
-                slurm_job = self.jobs.pop(slurm_job_id)
-                self._copy_files_from_remote_to_local(slurm_job)
-                result, exception = self._postprocess_single_task(
-                    task=slurm_job.tasks[0]
-                )
-            time.sleep(self.slurm_poll_interval)
-
-        return result, exception
-
-    def multisubmit(
-        self,
-        func: callable,
-        list_parameters: list[dict],
-        history_item_id: int,
-        task_files: TaskFiles,
-        slurm_config: SlurmConfig,
-        task_type: TaskTypeType,
-    ):
-        # self.scancel_jobs()
-
-        self.validate_multisubmit_parameters(
-            list_parameters=list_parameters,
-            task_type=task_type,
+        # Fetch tarfile
+        t_0_get = time.perf_counter()
+        self.fractal_ssh.fetch_file(
+            remote=tarfile_path_remote,
+            local=tarfile_path_local,
         )
-
-
-
-
-        # Create local&remote task subfolders
-        if task_type not in ["compound", "converter_compound"]:
-            workdir_local.mkdir(parents=True)
-            self.fractal_ssh.mkdir(
-                folder=workdir_remote.as_posix(),
-                parents=True,
-            )
-
-        # Execute tasks, in chunks of size `parallel_tasks_per_job`
-        # TODO Pick a data structure for results and exceptions, or review the
-        # interface
-        results: dict[int, Any] = {}
-        exceptions: dict[int, BaseException] = {}
-
-        original_task_files = task_files
-        tot_tasks = len(list_parameters)
-
-        # Set/validate parameters for task batching
-        tasks_per_job, parallel_tasks_per_job = heuristics(
-            # Number of parallel components (always known)
-            tot_tasks=tot_tasks,
-            # Optional WorkflowTask attributes:
-            tasks_per_job=slurm_config.tasks_per_job,
-            parallel_tasks_per_job=slurm_config.parallel_tasks_per_job,  # noqa
-            # Task requirements (multiple possible sources):
-            cpus_per_task=slurm_config.cpus_per_task,
-            mem_per_task=slurm_config.mem_per_task_MB,
-            # Fractal configuration variables (soft/hard limits):
-            target_cpus_per_job=slurm_config.target_cpus_per_job,
-            target_mem_per_job=slurm_config.target_mem_per_job,
-            target_num_jobs=slurm_config.target_num_jobs,
-            max_cpus_per_job=slurm_config.max_cpus_per_job,
-            max_mem_per_job=slurm_config.max_mem_per_job,
-            max_num_jobs=slurm_config.max_num_jobs,
+        t_1_get = time.perf_counter()
+        logger.info(
+            f"Subfolder archive transferred back to {tarfile_path_local}"
+            f" - elapsed: {t_1_get - t_0_get:.3f} s"
         )
-        slurm_config.parallel_tasks_per_job = parallel_tasks_per_job
-        slurm_config.tasks_per_job = tasks_per_job
-
-        # Divide arguments in batches of `tasks_per_job` tasks each
-        args_batches = []
-        batch_size = tasks_per_job
-        for ind_chunk in range(0, tot_tasks, batch_size):
-            args_batches.append(
-                list_parameters[ind_chunk : ind_chunk + batch_size]  # noqa
-            )
-        if len(args_batches) != math.ceil(tot_tasks / tasks_per_job):
-            raise RuntimeError("Something wrong here while batching tasks")

-
-
-            # TODO: replace with actual values
-            tasks = []
-            for ind_chunk, parameters in enumerate(chunk):
-                # FIXME: _COMPONENT_KEY_ is deprecated
-                # component = parameters[_COMPONENT_KEY_]
-                component = "INVALID_FAKE_VALUE_FIXME"
-                tasks.append(
-                    SlurmTask(
-                        index=(ind_batch * batch_size) + ind_chunk,
-                        component=component,
-                        workdir_local=workdir_local,
-                        workdir_remote=workdir_remote,
-                        parameters=parameters,
-                        zarr_url=parameters["zarr_url"],
-                        task_files=TaskFiles(
-                            **original_task_files.model_dump(
-                                exclude={"component"}
-                            ),
-                            component=component,
-                        ),
-                    ),
-                )
+        # Extract tarfile locally
+        extract_archive(Path(tarfile_path_local))

-
-
-
-
-                tasks=tasks,
-            )
-            self._submit_single_sbatch(
-                func,
-                slurm_job=slurm_job,
-                slurm_config=slurm_config,
-            )
-        logger.info(f"END submission phase, {list(self.jobs.keys())=}")
+        # Remove local tarfile
+        if Path(tarfile_path_local).exists():
+            logger.warning(f"Remove existing file {tarfile_path_local}.")
+            Path(tarfile_path_local).unlink()

-
-
-            if self.is_shutdown():
-                self.scancel_jobs()
-            finished_job_ids = get_finished_jobs_ssh(
-                job_ids=self.job_ids,
-                fractal_ssh=self.fractal_ssh,
-            )
-            for slurm_job_id in finished_job_ids:
-                slurm_job = self.jobs.pop(slurm_job_id)
-                self._copy_files_from_remote_to_local(slurm_job)
-                for task in slurm_job.tasks:
-                    result, exception = self._postprocess_single_task(
-                        task=task
-                    )
-                    if exception is None:
-                        results[task.index] = result
-                    else:
-                        exceptions[task.index] = exception
-            time.sleep(self.slurm_poll_interval)
-        return results, exceptions
+        t_1 = time.perf_counter()
+        logger.info(f"[_get_subfolder_sftp] End - elapsed: {t_1 - t_0:.3f} s")

-    def
-        settings = Inject(get_settings)
-        cmd = (
-            f"{self.python_worker_interpreter} "
-            "-m fractal_server.app.runner.versions"
-        )
+    def _run_remote_cmd(self, cmd: str) -> str:
         stdout = self.fractal_ssh.run_command(cmd=cmd)
-
-        if remote_version != __VERSION__:
-            error_msg = (
-                "Fractal-server version mismatch.\n"
-                "Local interpreter: "
-                f"({sys.executable}): {__VERSION__}.\n"
-                "Remote interpreter: "
-                f"({settings.FRACTAL_SLURM_WORKER_PYTHON}): {remote_version}."
-            )
-            logger.error(error_msg)
-            raise RuntimeError(error_msg)
+        return stdout