fractal-server 2.13.1__py3-none-any.whl → 2.14.0a0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fractal_server/__init__.py +1 -1
- fractal_server/app/history/__init__.py +4 -0
- fractal_server/app/history/image_updates.py +142 -0
- fractal_server/app/history/status_enum.py +16 -0
- fractal_server/app/models/v2/__init__.py +5 -1
- fractal_server/app/models/v2/history.py +53 -0
- fractal_server/app/routes/api/v2/__init__.py +2 -2
- fractal_server/app/routes/api/v2/_aux_functions.py +78 -0
- fractal_server/app/routes/api/v2/dataset.py +12 -9
- fractal_server/app/routes/api/v2/history.py +247 -0
- fractal_server/app/routes/api/v2/workflow.py +18 -3
- fractal_server/app/routes/api/v2/workflowtask.py +22 -0
- fractal_server/app/runner/executors/base_runner.py +114 -0
- fractal_server/app/runner/{v2/_local → executors/local}/_local_config.py +3 -3
- fractal_server/app/runner/executors/local/_submit_setup.py +54 -0
- fractal_server/app/runner/executors/local/runner.py +200 -0
- fractal_server/app/runner/executors/{slurm → slurm_common}/_batching.py +1 -1
- fractal_server/app/runner/executors/{slurm → slurm_common}/_slurm_config.py +3 -3
- fractal_server/app/runner/{v2/_slurm_ssh → executors/slurm_common}/_submit_setup.py +13 -12
- fractal_server/app/runner/{v2/_slurm_common → executors/slurm_common}/get_slurm_config.py +9 -15
- fractal_server/app/runner/executors/{slurm/ssh → slurm_ssh}/_executor_wait_thread.py +1 -1
- fractal_server/app/runner/executors/{slurm/ssh → slurm_ssh}/_slurm_job.py +1 -1
- fractal_server/app/runner/executors/{slurm/ssh → slurm_ssh}/executor.py +13 -14
- fractal_server/app/runner/executors/{slurm/sudo → slurm_sudo}/_check_jobs_status.py +11 -9
- fractal_server/app/runner/executors/{slurm/sudo → slurm_sudo}/_executor_wait_thread.py +3 -3
- fractal_server/app/runner/executors/{slurm/sudo → slurm_sudo}/_subprocess_run_as_user.py +2 -68
- fractal_server/app/runner/executors/slurm_sudo/runner.py +632 -0
- fractal_server/app/runner/task_files.py +70 -96
- fractal_server/app/runner/v2/__init__.py +5 -19
- fractal_server/app/runner/v2/_local.py +84 -0
- fractal_server/app/runner/v2/{_slurm_ssh/__init__.py → _slurm_ssh.py} +10 -13
- fractal_server/app/runner/v2/{_slurm_sudo/__init__.py → _slurm_sudo.py} +10 -12
- fractal_server/app/runner/v2/runner.py +93 -28
- fractal_server/app/runner/v2/runner_functions.py +85 -62
- fractal_server/app/runner/v2/runner_functions_low_level.py +20 -20
- fractal_server/app/schemas/v2/dataset.py +0 -17
- fractal_server/app/schemas/v2/history.py +23 -0
- fractal_server/config.py +2 -2
- fractal_server/migrations/versions/8223fcef886c_image_status.py +63 -0
- fractal_server/migrations/versions/87cd72a537a2_add_historyitem_table.py +68 -0
- {fractal_server-2.13.1.dist-info → fractal_server-2.14.0a0.dist-info}/METADATA +1 -1
- {fractal_server-2.13.1.dist-info → fractal_server-2.14.0a0.dist-info}/RECORD +52 -46
- fractal_server/app/routes/api/v2/status.py +0 -168
- fractal_server/app/runner/executors/slurm/sudo/executor.py +0 -1281
- fractal_server/app/runner/v2/_local/__init__.py +0 -132
- fractal_server/app/runner/v2/_local/_submit_setup.py +0 -52
- fractal_server/app/runner/v2/_local/executor.py +0 -100
- fractal_server/app/runner/v2/_slurm_sudo/_submit_setup.py +0 -83
- fractal_server/app/runner/v2/handle_failed_job.py +0 -59
- /fractal_server/app/runner/executors/{slurm → local}/__init__.py +0 -0
- /fractal_server/app/runner/executors/{slurm/ssh → slurm_common}/__init__.py +0 -0
- /fractal_server/app/runner/executors/{_job_states.py → slurm_common/_job_states.py} +0 -0
- /fractal_server/app/runner/executors/{slurm → slurm_common}/remote.py +0 -0
- /fractal_server/app/runner/executors/{slurm → slurm_common}/utils_executors.py +0 -0
- /fractal_server/app/runner/executors/{slurm/sudo → slurm_ssh}/__init__.py +0 -0
- /fractal_server/app/runner/{v2/_slurm_common → executors/slurm_sudo}/__init__.py +0 -0
- {fractal_server-2.13.1.dist-info → fractal_server-2.14.0a0.dist-info}/LICENSE +0 -0
- {fractal_server-2.13.1.dist-info → fractal_server-2.14.0a0.dist-info}/WHEEL +0 -0
- {fractal_server-2.13.1.dist-info → fractal_server-2.14.0a0.dist-info}/entry_points.txt +0 -0
fractal_server/app/runner/executors/slurm_sudo/runner.py (new file)
@@ -0,0 +1,632 @@
import json
import logging
import shlex
import subprocess  # nosec
import sys
import time
from pathlib import Path
from typing import Any
from typing import Optional

import cloudpickle
from pydantic import BaseModel
from pydantic import ConfigDict

from ._check_jobs_status import get_finished_jobs
from ._subprocess_run_as_user import _mkdir_as_user
from ._subprocess_run_as_user import _run_command_as_user
from fractal_server import __VERSION__
from fractal_server.app.history import HistoryItemImageStatus
from fractal_server.app.history import update_all_images
from fractal_server.app.history import update_single_image
from fractal_server.app.runner.components import _COMPONENT_KEY_
from fractal_server.app.runner.exceptions import JobExecutionError
from fractal_server.app.runner.exceptions import TaskExecutionError
from fractal_server.app.runner.executors.base_runner import BaseRunner
from fractal_server.app.runner.executors.slurm_common._slurm_config import (
    SlurmConfig,
)
from fractal_server.app.runner.filenames import SHUTDOWN_FILENAME
from fractal_server.app.runner.task_files import TaskFiles
from fractal_server.config import get_settings
from fractal_server.logger import set_logger
from fractal_server.syringe import Inject


logger = set_logger(__name__)

def _handle_exception_proxy(proxy):  # FIXME
    if proxy.exc_type_name == "JobExecutionError":
        return JobExecutionError(str(proxy))
    else:
        kwargs = {}
        for key in [
            "workflow_task_id",
            "workflow_task_order",
            "task_name",
        ]:
            if key in proxy.kwargs.keys():
                kwargs[key] = proxy.kwargs[key]
        return TaskExecutionError(proxy.tb, **kwargs)
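For orientation, a minimal sketch of the dispatch above, using a hypothetical stand-in for the exception proxy that the remote worker (presumably `slurm_common.remote`) returns on failure; only the three attributes that `_handle_exception_proxy` reads are modeled:

# Illustrative sketch, not part of runner.py.
from types import SimpleNamespace

proxy = SimpleNamespace(
    exc_type_name="ValueError",
    tb="Traceback (most recent call last): ...",
    kwargs={"task_name": "my-task", "unrelated-key": 1},
)
exc = _handle_exception_proxy(proxy)
# -> TaskExecutionError carrying the remote traceback, with `task_name`
#    forwarded; an exc_type_name of "JobExecutionError" would instead
#    yield a JobExecutionError.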

class SlurmTask(BaseModel):
    model_config = ConfigDict(arbitrary_types_allowed=True)
    component: str
    workdir_local: Path
    workdir_remote: Path
    zarr_url: Optional[str] = None
    task_files: TaskFiles

    @property
    def input_pickle_file_local(self) -> str:
        return (
            self.workdir_local / f"{self.component}-input.pickle"
        ).as_posix()

    @property
    def output_pickle_file_local(self) -> str:
        return (
            self.workdir_local / f"{self.component}-output.pickle"
        ).as_posix()

    @property
    def input_pickle_file_remote(self) -> str:
        return (
            self.workdir_remote / f"{self.component}-input.pickle"
        ).as_posix()

    @property
    def output_pickle_file_remote(self) -> str:
        return (
            self.workdir_remote / f"{self.component}-output.pickle"
        ).as_posix()
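The four properties only differ in the working directory and the `-input`/`-output` suffix. A self-contained sketch of the resulting paths (all names made up):

# Illustrative sketch, not part of runner.py.
from pathlib import Path

workdir_local = Path("/srv/fractal/server-side/7_my_task")  # hypothetical
component = "000001"
print((workdir_local / f"{component}-input.pickle").as_posix())
# -> /srv/fractal/server-side/7_my_task/000001-input.pickle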

class SlurmJob(BaseModel):
    slurm_job_id: Optional[str] = None
    label: str
    workdir_local: Path
    workdir_remote: Path
    tasks: tuple[SlurmTask]

    @property
    def slurm_log_file_local(self) -> str:
        if self.slurm_job_id:
            return (
                self.workdir_local
                / f"slurm-{self.label}-{self.slurm_job_id}.log"
            ).as_posix()
        else:
            return (
                self.workdir_local / f"slurm-{self.label}-%j.log"
            ).as_posix()

    @property
    def slurm_log_file_remote(self) -> str:
        if self.slurm_job_id:
            return (
                self.workdir_remote
                / f"slurm-{self.label}-{self.slurm_job_id}.log"
            ).as_posix()
        else:
            return (
                self.workdir_remote / f"slurm-{self.label}-%j.log"
            ).as_posix()

    @property
    def slurm_submission_script_local(self) -> str:
        return (
            self.workdir_local / f"slurm-{self.label}-submit.sh"
        ).as_posix()

    @property
    def slurm_submission_script_remote(self) -> str:
        return (
            self.workdir_remote / f"slurm-{self.label}-submit.sh"
        ).as_posix()

    @property
    def log_files_local(self) -> list[str]:
        return [task.task_files.log_file_local for task in self.tasks]
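Before submission `slurm_job_id` is `None`, so the log-file properties fall back to SLURM's `%j` placeholder (which sbatch expands to the job ID); once the job is submitted and the real ID is stored, the concrete filename is returned. A sketch of the switch:

# Illustrative sketch, not part of runner.py.
label, slurm_job_id = "000003", None
print(f"slurm-{label}-{slurm_job_id or '%j'}.log")  # slurm-000003-%j.log
slurm_job_id = "12345"
print(f"slurm-{label}-{slurm_job_id or '%j'}.log")  # slurm-000003-12345.log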

def _subprocess_run_or_raise(
    full_command: str,
) -> Optional[subprocess.CompletedProcess]:
    try:
        output = subprocess.run(  # nosec
            shlex.split(full_command),
            capture_output=True,
            check=True,
            encoding="utf-8",
        )
        return output
    except subprocess.CalledProcessError as e:
        error_msg = (
            f"Submit command `{full_command}` failed. "
            f"Original error:\n{str(e)}\n"
            f"Original stdout:\n{e.stdout}\n"
            f"Original stderr:\n{e.stderr}\n"
        )
        logging.error(error_msg)
        raise JobExecutionError(info=error_msg)
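Although its error message mentions "Submit", `_subprocess_run_or_raise` is a generic wrapper: any non-zero exit status is converted into a `JobExecutionError`. A minimal sketch:

# Illustrative sketch, not part of runner.py.
res = _subprocess_run_or_raise("echo hello")
print(res.stdout)  # "hello\n"
# _subprocess_run_or_raise("false") would raise JobExecutionError,
# with the stdout/stderr of the failed command embedded in the message.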

class RunnerSlurmSudo(BaseRunner):
    slurm_user: str
    shutdown_file: Path
    common_script_lines: list[str]
    user_cache_dir: Optional[str]
    root_dir_local: Path
    root_dir_remote: Path
    slurm_account: Optional[str] = None
    slurm_poll_interval: int
    python_worker_interpreter: str
    jobs: dict[str, SlurmJob]

    def __init__(
        self,
        *,
        slurm_user: str,
        root_dir_local: Path,
        root_dir_remote: Path,
        slurm_account: Optional[str] = None,
        common_script_lines: Optional[list[str]] = None,
        user_cache_dir: Optional[str] = None,
        slurm_poll_interval: Optional[int] = None,
    ) -> None:
        """
        Set parameters that are the same across different Fractal tasks
        and different SLURM jobs/tasks.
        """
        self.slurm_user = slurm_user
        self.slurm_account = slurm_account
        self.common_script_lines = common_script_lines or []

        # Check that the SLURM account is not set from `common_script_lines`
        # FIXME: move to a small helper method
        try:
            invalid_line = next(
                line
                for line in self.common_script_lines
                if line.startswith("#SBATCH --account=")
            )
            raise RuntimeError(
                "Invalid line in `RunnerSlurmSudo.common_script_lines`: "
                f"'{invalid_line}'.\n"
                "SLURM account must be set via the request body of the "
                "apply-workflow endpoint, or by modifying the user properties."
            )
        except StopIteration:
            pass

        # Check Python versions
        settings = Inject(get_settings)
        if settings.FRACTAL_SLURM_WORKER_PYTHON is not None:
            self.check_remote_python_interpreter()

        self.root_dir_local = root_dir_local
        self.root_dir_remote = root_dir_remote

        # Create folders
        self.root_dir_local.mkdir(parents=True, exist_ok=True)
        _mkdir_as_user(
            folder=self.root_dir_remote.as_posix(),
            user=self.slurm_user,
        )

        self.user_cache_dir = user_cache_dir

        self.slurm_poll_interval = (
            slurm_poll_interval or settings.FRACTAL_SLURM_POLL_INTERVAL
        )

        self.shutdown_file = self.root_dir_local / SHUTDOWN_FILENAME

        self.python_worker_interpreter = (
            settings.FRACTAL_SLURM_WORKER_PYTHON or sys.executable
        )

        self.jobs = {}
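A construction sketch (the user and all paths are made up; this assumes the server process may run `sudo -u <slurm_user>` non-interactively, plus a configured fractal-server settings object):

# Illustrative sketch, not part of runner.py.
from pathlib import Path

runner = RunnerSlurmSudo(
    slurm_user="fractal-worker",                        # hypothetical user
    root_dir_local=Path("/srv/fractal/server-side"),    # server-owned
    root_dir_remote=Path("/shared/fractal/user-side"),  # slurm_user-owned
    common_script_lines=["#SBATCH --constraint=cpu"],   # no --account here
)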
    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        return False

    def is_shutdown(self) -> bool:
        return self.shutdown_file.exists()

    def scancel_if_shutdown(self) -> None:

        logger.debug("[scancel_if_shutdown] START")

        if self.jobs:
            scancel_string = " ".join(self.job_ids)
            scancel_cmd = f"scancel {scancel_string}"
            logger.warning(f"Now scancel-ing SLURM jobs {scancel_string}")
            try:
                _run_command_as_user(
                    cmd=scancel_cmd,
                    user=self.slurm_user,
                    check=True,
                )
            except RuntimeError as e:
                logger.warning(
                    "[scancel_if_shutdown] `scancel` command failed. "
                    f"Original error:\n{str(e)}"
                )

        logger.debug("[scancel_if_shutdown] END")
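Shutdown is cooperative: another component creates the shutdown file (`SHUTDOWN_FILENAME` inside `root_dir_local`), the polling loops below notice it via `is_shutdown()`, and `scancel_if_shutdown()` cancels whatever is tracked in `self.jobs`. Continuing the construction sketch above:

# Illustrative sketch, not part of runner.py.
runner.shutdown_file.touch()   # request a shutdown
assert runner.is_shutdown()
runner.scancel_if_shutdown()   # scancel-s all tracked job IDs, if any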
    def _submit_single_sbatch(
        self,
        func,
        parameters,  # FIXME: this should be per-task
        slurm_job: SlurmJob,
        slurm_config: Optional[SlurmConfig] = None,  # FIXME: not used yet
    ) -> None:

        if len(slurm_job.tasks) > 1:
            raise NotImplementedError()

        # Prepare input pickle(s)
        versions = dict(
            python=sys.version_info[:3],
            cloudpickle=cloudpickle.__version__,
            fractal_server=__VERSION__,
        )
        for task in slurm_job.tasks:
            _args = []
            # TODO: make parameters task-dependent
            _kwargs = dict(
                parameters=parameters
            )  # FIXME: this should be per-task
            funcser = cloudpickle.dumps((versions, func, _args, _kwargs))
            with open(task.input_pickle_file_local, "wb") as f:
                f.write(funcser)

        # Prepare commands to be included in SLURM submission script

        preamble_lines = [
            "#!/bin/bash",
            "#SBATCH --partition=main",
            "#SBATCH --ntasks=1",
            "#SBATCH --cpus-per-task=1",
            "#SBATCH --mem=10M",
            f"#SBATCH --err={slurm_job.slurm_log_file_remote}",
            f"#SBATCH --out={slurm_job.slurm_log_file_remote}",
            f"#SBATCH -D {slurm_job.workdir_remote}",
            "#SBATCH --job-name=test",
            "\n",
        ]

        cmdlines = []
        for task in slurm_job.tasks:
            cmd = (
                f"{self.python_worker_interpreter}"
                " -m fractal_server.app.runner.executors.slurm_common.remote "
                f"--input-file {task.input_pickle_file_local} "
                f"--output-file {task.output_pickle_file_remote}"
            )
            cmdlines.append("whoami")
            cmdlines.append(
                f"srun --ntasks=1 --cpus-per-task=1 --mem=10MB {cmd} &"
            )
            cmdlines.append("wait\n")

        # Write submission script
        submission_script_contents = "\n".join(preamble_lines + cmdlines)
        with open(slurm_job.slurm_submission_script_local, "w") as f:
            f.write(submission_script_contents)

        # Run sbatch
        pre_command = f"sudo --set-home --non-interactive -u {self.slurm_user}"
        submit_command = (
            f"sbatch --parsable {slurm_job.slurm_submission_script_local}"
        )
        full_command = f"{pre_command} {submit_command}"

        # Submit SLURM job and retrieve job ID
        res = _subprocess_run_or_raise(full_command)
        submitted_job_id = int(res.stdout)
        slurm_job.slurm_job_id = str(submitted_job_id)

        # Add job to self.jobs
        self.jobs[slurm_job.slurm_job_id] = slurm_job

    @property
    def job_ids(self) -> list[str]:
        return list(self.jobs.keys())
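With the hardcoded preamble above, the script written for a one-task job looks roughly as follows (paths hypothetical). Note that `--err`/`--out` still contain `%j` because the job ID is unknown at script-writing time, and that `--input-file` points at the *local* pickle path, which assumes the local and remote directories share a filesystem:

#!/bin/bash
#SBATCH --partition=main
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=1
#SBATCH --mem=10M
#SBATCH --err=/shared/fractal/user-side/7_my_task/slurm-000001-%j.log
#SBATCH --out=/shared/fractal/user-side/7_my_task/slurm-000001-%j.log
#SBATCH -D /shared/fractal/user-side/7_my_task
#SBATCH --job-name=test

whoami
srun --ntasks=1 --cpus-per-task=1 --mem=10MB /usr/bin/python3 -m fractal_server.app.runner.executors.slurm_common.remote --input-file /srv/fractal/server-side/7_my_task/000001-input.pickle --output-file /shared/fractal/user-side/7_my_task/000001-output.pickle &
wait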
    def _copy_files_from_remote_to_local(self, job: SlurmJob) -> None:
        """
        Note: this would differ for SSH.
        """
        source_target_list = [
            (job.slurm_log_file_remote, job.slurm_log_file_local)
        ]
        for task in job.tasks:
            source_target_list.extend(
                [
                    (
                        task.output_pickle_file_remote,
                        task.output_pickle_file_local,
                    ),
                    (
                        task.task_files.log_file_remote,
                        task.task_files.log_file_local,
                    ),
                    (
                        task.task_files.args_file_remote,
                        task.task_files.args_file_local,
                    ),
                    (
                        task.task_files.metadiff_file_remote,
                        task.task_files.metadiff_file_local,
                    ),
                ]
            )

        for source, target in source_target_list:
            # NOTE: By setting encoding=None, we read/write bytes instead
            # of strings; this is needed to also handle pickle files.
            try:
                res = _run_command_as_user(
                    cmd=f"cat {source}",
                    user=self.slurm_user,
                    encoding=None,
                    check=True,
                )
                # Write local file
                with open(target, "wb") as f:
                    f.write(res.stdout)
                logger.debug(f"Copied {source} into {target}")
            except RuntimeError as e:
                logger.warning(
                    f"SKIP copy {source} into {target}. "
                    f"Original error: {str(e)}"
                )
    def _postprocess_single_task(
        self, *, task: SlurmTask
    ) -> tuple[Any, Exception]:
        try:
            with open(task.output_pickle_file_local, "rb") as f:
                outdata = f.read()
            success, output = cloudpickle.loads(outdata)
            if success:
                result = output
                return result, None
            else:
                exception = _handle_exception_proxy(output)
                return None, exception
        except Exception as e:
            exception = JobExecutionError(f"ERROR, {str(e)}")
            return None, exception
        finally:
            Path(task.input_pickle_file_local).unlink(missing_ok=True)
            Path(task.output_pickle_file_local).unlink(missing_ok=True)
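The contract with the remote worker is a two-element cloudpickle payload: `(True, result)` on success and, judging from the `_handle_exception_proxy` call above, `(False, exception_proxy)` on failure. A sketch of the success path:

# Illustrative sketch, not part of runner.py.
import cloudpickle

blob = cloudpickle.dumps((True, {"n_images": 42}))  # what the worker writes
success, output = cloudpickle.loads(blob)           # what the code above reads
assert success and output == {"n_images": 42}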
    def submit(
        self,
        func: callable,
        parameters: dict[str, Any],
        history_item_id: int,
        task_files: TaskFiles,
        in_compound_task: bool = False,
        slurm_config: Optional[SlurmConfig] = None,
        **kwargs,
    ) -> tuple[Any, Exception]:

        workdir_local = task_files.wftask_subfolder_local
        workdir_remote = task_files.wftask_subfolder_remote

        task_files = TaskFiles(
            **task_files.model_dump(
                exclude={"component"},
            ),
            component=parameters[_COMPONENT_KEY_],
        )

        if self.jobs:
            if not in_compound_task:
                update_all_images(
                    history_item_id=history_item_id,
                    status=HistoryItemImageStatus.FAILED,
                )
            raise JobExecutionError("Unexpected branch: jobs should be empty.")

        if self.is_shutdown():
            if not in_compound_task:
                update_all_images(
                    history_item_id=history_item_id,
                    status=HistoryItemImageStatus.FAILED,
                )
            raise JobExecutionError("Cannot continue after shutdown.")

        # Validation phase
        self.validate_submit_parameters(parameters)

        # Create task subfolder
        workdir_local.mkdir(parents=True, exist_ok=True)
        _mkdir_as_user(
            folder=workdir_remote.as_posix(),
            user=self.slurm_user,
        )

        # Submission phase
        slurm_job = SlurmJob(
            label="0",
            workdir_local=workdir_local,
            workdir_remote=workdir_remote,
            tasks=[
                SlurmTask(
                    component="0",
                    workdir_remote=workdir_remote,
                    workdir_local=workdir_local,
                    task_files=task_files,
                )
            ],
        )  # TODO: replace with actual values (BASED ON TASKFILES)
        self._submit_single_sbatch(
            func,
            parameters=parameters,
            slurm_job=slurm_job,
            slurm_config=slurm_config,
        )

        LOGFILE = task_files.log_file_local

        # Retrieval phase
        result, exception = None, None
        while len(self.jobs) > 0:
            if self.is_shutdown():
                self.scancel_if_shutdown()
            finished_job_ids = get_finished_jobs(job_ids=self.job_ids)
            for slurm_job_id in finished_job_ids:
                slurm_job = self.jobs.pop(slurm_job_id)
                self._copy_files_from_remote_to_local(slurm_job)
                result, exception = self._postprocess_single_task(
                    task=slurm_job.tasks[0]
                )
            time.sleep(self.slurm_poll_interval)

        if not in_compound_task:
            if exception is None:
                update_all_images(
                    history_item_id=history_item_id,
                    status=HistoryItemImageStatus.DONE,
                    logfile=LOGFILE,
                )
            else:
                update_all_images(
                    history_item_id=history_item_id,
                    status=HistoryItemImageStatus.FAILED,
                    logfile=LOGFILE,
                )

        return result, exception
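A schematic `submit()` call for a non-parallel task; `run_single_task`, `history_item_id`, and `task_files` are hypothetical stand-ins, and the call only makes sense inside a configured fractal-server instance (the history updates hit the database):

# Illustrative sketch, not part of runner.py.
from fractal_server.app.runner.components import _COMPONENT_KEY_

result, exception = runner.submit(
    func=run_single_task,        # hypothetical callable, executed remotely
    parameters={_COMPONENT_KEY_: "0", "zarr_dir": "/shared/my.zarr"},
    history_item_id=history_item_id,  # hypothetical HistoryItem row id
    task_files=task_files,            # pre-built TaskFiles object
)
if exception is not None:
    raise exception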
    def multisubmit(
        self,
        func: callable,
        list_parameters: list[dict],
        history_item_id: int,
        task_files: TaskFiles,
        in_compound_task: bool = False,
        **kwargs,
    ):
        self.scancel_if_shutdown()

        self.validate_multisubmit_parameters(
            list_parameters=list_parameters,
            in_compound_task=in_compound_task,
        )

        workdir_local = task_files.wftask_subfolder_local
        workdir_remote = task_files.wftask_subfolder_remote

        # Create folders
        workdir_local.mkdir(parents=True, exist_ok=True)
        _mkdir_as_user(
            folder=workdir_remote.as_posix(),
            user=self.slurm_user,
        )

        # Execute tasks, in chunks of size `parallel_tasks_per_job`
        # TODO: pick a data structure for results and exceptions, or review
        # the interface
        results = []
        exceptions = []
        jobs: dict[str, SlurmJob] = {}

        original_task_files = task_files
        # TODO: add batching
        for ind, parameters in enumerate(list_parameters):
            # TODO: replace with actual values
            component = parameters[_COMPONENT_KEY_]
            slurm_job = SlurmJob(
                label=f"{ind:06d}",
                workdir_local=workdir_local,
                workdir_remote=workdir_remote,
                tasks=[
                    SlurmTask(
                        component=component,
                        workdir_local=workdir_local,
                        workdir_remote=workdir_remote,
                        zarr_url=parameters["zarr_url"],
                        task_files=TaskFiles(
                            **original_task_files.model_dump(
                                exclude={"component"},
                            ),
                            component=component,
                        ),
                    )
                ],
            )
            self._submit_single_sbatch(
                func,
                parameters=parameters,
                slurm_job=slurm_job,
            )
            jobs[slurm_job.slurm_job_id] = slurm_job

        # Retrieval phase
        while len(jobs) > 0:
            if self.is_shutdown():
                self.scancel_if_shutdown()
            remaining_jobs = list(jobs.keys())
            finished_jobs = get_finished_jobs(job_ids=remaining_jobs)
            for slurm_job_id in finished_jobs:
                slurm_job = jobs.pop(slurm_job_id)
                # Keep the shared registry in sync with the local one
                self.jobs.pop(slurm_job_id, None)
                self._copy_files_from_remote_to_local(slurm_job)
                for task in slurm_job.tasks:
                    result, exception = self._postprocess_single_task(
                        task=task
                    )
                    if not in_compound_task:
                        if exception is None:
                            update_single_image(
                                zarr_url=task.zarr_url,
                                history_item_id=history_item_id,
                                status=HistoryItemImageStatus.DONE,
                                logfile=task.task_files.log_file_local,
                            )
                        else:
                            update_single_image(
                                zarr_url=task.zarr_url,
                                history_item_id=history_item_id,
                                status=HistoryItemImageStatus.FAILED,
                                logfile=task.task_files.log_file_local,
                            )
                    # TODO: now just appending, but this should be done better
                    results.append(result)
                    exceptions.append(exception)
            time.sleep(self.slurm_poll_interval)
        return results, exceptions
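`multisubmit()` is the per-image counterpart of `submit()`: one SLURM job per element of `list_parameters`, and two parallel lists back. A schematic call, with the same hypothetical stand-ins as above:

# Illustrative sketch, not part of runner.py.
zarr_urls = ["/shared/my.zarr/A/01/0", "/shared/my.zarr/A/02/0"]  # made up
results, exceptions = runner.multisubmit(
    func=run_single_task,
    list_parameters=[
        {_COMPONENT_KEY_: f"{i:06d}", "zarr_url": url}
        for i, url in enumerate(zarr_urls)
    ],
    history_item_id=history_item_id,
    task_files=task_files,
)
for url, exc in zip(zarr_urls, exceptions):
    if exc is not None:
        logger.warning(f"Processing failed for {url}: {exc}")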
    def check_remote_python_interpreter(self):
        """
        Check the fractal-server version on the _remote_ Python interpreter.
        """
        settings = Inject(get_settings)
        output = _subprocess_run_or_raise(
            (
                f"{settings.FRACTAL_SLURM_WORKER_PYTHON} "
                "-m fractal_server.app.runner.versions"
            )
        )
        runner_version = json.loads(output.stdout.strip("\n"))[
            "fractal_server"
        ]
        if runner_version != __VERSION__:
            error_msg = (
                "Fractal-server version mismatch.\n"
                "Local interpreter "
                f"({sys.executable}): {__VERSION__}.\n"
                "Remote interpreter "
                f"({settings.FRACTAL_SLURM_WORKER_PYTHON}): {runner_version}."
            )
            logger.error(error_msg)
            raise RuntimeError(error_msg)
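Finally, the version handshake above expects `<remote-python> -m fractal_server.app.runner.versions` to print JSON containing (at least) a `fractal_server` key; a sketch of the comparison it performs:

# Illustrative sketch, not part of runner.py.
import json

stdout = '{"fractal_server": "2.14.0a0"}\n'  # made-up remote output
remote_version = json.loads(stdout.strip("\n"))["fractal_server"]
if remote_version != "2.14.0a0":             # stands in for __VERSION__
    raise RuntimeError("Fractal-server version mismatch.")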