fractal-server 2.14.0a13__py3-none-any.whl → 2.14.0a15__py3-none-any.whl
This diff compares the contents of two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the packages as they appear in their respective public registries.
- fractal_server/__init__.py +1 -1
- fractal_server/app/models/linkusergroup.py +6 -2
- fractal_server/app/models/v2/dataset.py +1 -1
- fractal_server/app/models/v2/job.py +7 -3
- fractal_server/app/models/v2/task_group.py +2 -2
- fractal_server/app/models/v2/workflow.py +1 -1
- fractal_server/app/models/v2/workflowtask.py +1 -1
- fractal_server/app/routes/admin/v2/task_group.py +0 -17
- fractal_server/app/routes/api/v2/_aux_functions_history.py +8 -0
- fractal_server/app/routes/api/v2/dataset.py +0 -8
- fractal_server/app/routes/api/v2/history.py +111 -27
- fractal_server/app/routes/api/v2/images.py +16 -14
- fractal_server/app/routes/api/v2/project.py +0 -52
- fractal_server/app/routes/api/v2/task_group.py +0 -17
- fractal_server/app/routes/api/v2/workflow.py +0 -8
- fractal_server/app/routes/auth/group.py +0 -16
- fractal_server/app/runner/executors/base_runner.py +5 -0
- fractal_server/app/runner/executors/local/runner.py +15 -7
- fractal_server/app/runner/executors/slurm_common/_handle_exception_proxy.py +17 -0
- fractal_server/app/runner/executors/slurm_common/base_slurm_runner.py +677 -0
- fractal_server/app/runner/executors/slurm_common/slurm_job_task_models.py +102 -0
- fractal_server/app/runner/executors/slurm_ssh/runner.py +110 -648
- fractal_server/app/runner/executors/slurm_sudo/runner.py +32 -661
- fractal_server/app/runner/task_files.py +20 -6
- fractal_server/app/runner/v2/_slurm_ssh.py +6 -6
- fractal_server/app/runner/v2/_slurm_sudo.py +4 -4
- fractal_server/app/runner/v2/runner.py +4 -0
- fractal_server/app/runner/v2/runner_functions.py +2 -2
- fractal_server/app/runner/v2/submit_workflow.py +7 -16
- fractal_server/app/schemas/v2/__init__.py +3 -1
- fractal_server/app/schemas/v2/history.py +27 -2
- fractal_server/config.py +6 -2
- fractal_server/images/tools.py +23 -0
- fractal_server/migrations/versions/5b6007027595_on_cascade.py +250 -0
- fractal_server/migrations/versions/fbce16ff4e47_new_history_items.py +2 -2
- fractal_server/tasks/v2/utils_background.py +0 -19
- {fractal_server-2.14.0a13.dist-info → fractal_server-2.14.0a15.dist-info}/METADATA +1 -1
- {fractal_server-2.14.0a13.dist-info → fractal_server-2.14.0a15.dist-info}/RECORD +41 -42
- fractal_server/app/runner/executors/slurm_common/_check_jobs_status.py +0 -77
- fractal_server/app/runner/executors/slurm_ssh/_check_job_status_ssh.py +0 -67
- fractal_server/app/runner/executors/slurm_ssh/_executor_wait_thread.py +0 -126
- fractal_server/app/runner/executors/slurm_ssh/_slurm_job.py +0 -116
- fractal_server/app/runner/executors/slurm_ssh/executor.py +0 -1386
- {fractal_server-2.14.0a13.dist-info → fractal_server-2.14.0a15.dist-info}/LICENSE +0 -0
- {fractal_server-2.14.0a13.dist-info → fractal_server-2.14.0a15.dist-info}/WHEEL +0 -0
- {fractal_server-2.14.0a13.dist-info → fractal_server-2.14.0a15.dist-info}/entry_points.txt +0 -0
fractal_server/app/runner/executors/slurm_sudo/runner.py

@@ -1,171 +1,23 @@
-import json
 import logging
-import math
 import os
 import shlex
 import subprocess  # nosec
 import sys
-import time
-from copy import copy
 from pathlib import Path
-from typing import Any
-from typing import Literal
 from typing import Optional
 
-import cloudpickle
-from pydantic import BaseModel
-from pydantic import ConfigDict
-
-from ..slurm_common._check_jobs_status import get_finished_jobs
+from ..slurm_common.base_slurm_runner import BaseSlurmRunner
+from ..slurm_common.slurm_job_task_models import SlurmJob
 from ._subprocess_run_as_user import _mkdir_as_user
 from ._subprocess_run_as_user import _run_command_as_user
-from fractal_server import __VERSION__
-from fractal_server.app.db import get_sync_db
 from fractal_server.app.runner.exceptions import JobExecutionError
-from fractal_server.app.runner.exceptions import TaskExecutionError
-from fractal_server.app.runner.executors.base_runner import BaseRunner
-from fractal_server.app.runner.executors.slurm_common._batching import (
-    heuristics,
-)
-from fractal_server.app.runner.executors.slurm_common._slurm_config import (
-    SlurmConfig,
-)
-from fractal_server.app.runner.filenames import SHUTDOWN_FILENAME
-from fractal_server.app.runner.task_files import TaskFiles
-from fractal_server.app.runner.v2.db_tools import update_status_of_history_unit
-from fractal_server.app.schemas.v2 import HistoryUnitStatus
 from fractal_server.config import get_settings
 from fractal_server.logger import set_logger
 from fractal_server.syringe import Inject
 
 
 logger = set_logger(__name__)
-
-
-def _handle_exception_proxy(proxy):  # FIXME
-    if proxy.exc_type_name == "JobExecutionError":
-        return JobExecutionError(str(proxy))
-    else:
-        kwargs = {}
-        for key in [
-            "workflow_task_id",
-            "workflow_task_order",
-            "task_name",
-        ]:
-            if key in proxy.kwargs.keys():
-                kwargs[key] = proxy.kwargs[key]
-        return TaskExecutionError(proxy.tb, **kwargs)
-
-
-class SlurmTask(BaseModel):
-    model_config = ConfigDict(arbitrary_types_allowed=True)
-    component: str
-    workdir_local: Path
-    workdir_remote: Path
-    parameters: dict[str, Any]
-    zarr_url: Optional[str] = None
-    task_files: TaskFiles
-    index: int
-
-    @property
-    def input_pickle_file_local(self) -> str:
-        return (
-            self.workdir_local / f"{self.component}-input.pickle"
-        ).as_posix()
-
-    @property
-    def output_pickle_file_local(self) -> str:
-        return (
-            self.workdir_local / f"{self.component}-output.pickle"
-        ).as_posix()
-
-    @property
-    def input_pickle_file_remote(self) -> str:
-        return (
-            self.workdir_remote / f"{self.component}-input.pickle"
-        ).as_posix()
-
-    @property
-    def output_pickle_file_remote(self) -> str:
-        return (
-            self.workdir_remote / f"{self.component}-output.pickle"
-        ).as_posix()
-
-
-class SlurmJob(BaseModel):
-    slurm_job_id: Optional[str] = None
-    label: str
-    workdir_local: Path
-    workdir_remote: Path
-    tasks: list[SlurmTask]
-
-    @property
-    def slurm_submission_script_local(self) -> str:
-        return (
-            self.workdir_local / f"slurm-{self.label}-submit.sh"
-        ).as_posix()
-
-    @property
-    def slurm_submission_script_remote(self) -> str:
-        return (
-            self.workdir_remote / f"slurm-{self.label}-submit.sh"
-        ).as_posix()
-
-    @property
-    def slurm_stdout_remote(self) -> str:
-        if self.slurm_job_id:
-            return (
-                self.workdir_remote
-                / f"slurm-{self.label}-{self.slurm_job_id}.out"
-            ).as_posix()
-
-        else:
-            return (
-                self.workdir_remote / f"slurm-{self.label}-%j.out"
-            ).as_posix()
-
-    @property
-    def slurm_stderr_remote(self) -> str:
-        if self.slurm_job_id:
-            return (
-                self.workdir_remote
-                / f"slurm-{self.label}-{self.slurm_job_id}.err"
-            ).as_posix()
-
-        else:
-            return (
-                self.workdir_remote / f"slurm-{self.label}-%j.err"
-            ).as_posix()
-
-    @property
-    def slurm_stdout_local(self) -> str:
-        if self.slurm_job_id:
-            return (
-                self.workdir_local
-                / f"slurm-{self.label}-{self.slurm_job_id}.out"
-            ).as_posix()
-
-        else:
-            return (
-                self.workdir_local / f"slurm-{self.label}-%j.out"
-            ).as_posix()
-
-    @property
-    def slurm_stderr_local(self) -> str:
-        if self.slurm_job_id:
-            return (
-                self.workdir_local
-                / f"slurm-{self.label}-{self.slurm_job_id}.err"
-            ).as_posix()
-
-        else:
-            return (
-                self.workdir_local / f"slurm-{self.label}-%j.err"
-            ).as_posix()
-
-    @property
-    def log_files_local(self) -> list[str]:
-        return [task.task_files.log_file_local for task in self.tasks]
+# FIXME: Transform several logger.info into logger.debug.
 
 
 def _subprocess_run_or_raise(
@@ -190,29 +42,22 @@ def _subprocess_run_or_raise(
         raise JobExecutionError(info=error_msg)
 
 
-class RunnerSlurmSudo(BaseRunner):
+class SudoSlurmRunner(BaseSlurmRunner):
     slurm_user: str
-    slurm_user: str
-    shutdown_file: Path
-    common_script_lines: list[str]
-    user_cache_dir: str
-    root_dir_local: Path
-    root_dir_remote: Path
     slurm_account: Optional[str] = None
-    poll_interval: int
-    python_worker_interpreter: str
-    jobs: dict[str, SlurmJob]
 
     def __init__(
         self,
         *,
-
+        # Common
        root_dir_local: Path,
        root_dir_remote: Path,
-        slurm_account: Optional[str] = None,
        common_script_lines: Optional[list[str]] = None,
        user_cache_dir: Optional[str] = None,
-
+        poll_interval: Optional[int] = None,
+        # Specific
+        slurm_account: Optional[str] = None,
+        slurm_user: str,
     ) -> None:
         """
         Set parameters that are the same for different Fractal tasks and for
@@ -221,208 +66,34 @@ class RunnerSlurmSudo(BaseRunner):
 
         self.slurm_user = slurm_user
         self.slurm_account = slurm_account
-        self.common_script_lines = common_script_lines or []
-
-        # Check that SLURM account is not set here
-        # FIXME: move to little method
-        try:
-            invalid_line = next(
-                line
-                for line in self.common_script_lines
-                if line.startswith("#SBATCH --account=")
-            )
-            raise RuntimeError(
-                "Invalid line in `FractalSlurmExecutor.common_script_lines`: "
-                f"'{invalid_line}'.\n"
-                "SLURM account must be set via the request body of the "
-                "apply-workflow endpoint, or by modifying the user properties."
-            )
-        except StopIteration:
-            pass
-
-        # Check Python versions
         settings = Inject(get_settings)
-        if settings.FRACTAL_SLURM_WORKER_PYTHON is not None:
-            self.check_remote_python_interpreter()
-
-        self.root_dir_local = root_dir_local
-        self.root_dir_remote = root_dir_remote
-
-        # Create folders
-        original_umask = os.umask(0)
-        self.root_dir_local.mkdir(parents=True, exist_ok=True, mode=0o755)
-        os.umask(original_umask)
-        _mkdir_as_user(
-            folder=self.root_dir_remote.as_posix(),
-            user=self.slurm_user,
-        )
-
-        self.user_cache_dir = user_cache_dir
-
-        self.slurm_poll_interval = (
-            slurm_poll_interval or settings.FRACTAL_SLURM_POLL_INTERVAL
-        )
-
-        self.shutdown_file = self.root_dir_local / SHUTDOWN_FILENAME
 
         self.python_worker_interpreter = (
             settings.FRACTAL_SLURM_WORKER_PYTHON or sys.executable
         )
 
-
-
-
-
-
-
-
-
-    def is_shutdown(self) -> bool:
-        return self.shutdown_file.exists()
-
-    def scancel_jobs(self) -> None:
-        logger.debug("[scancel_jobs] START")
-
-        if self.jobs:
-            scancel_string = " ".join(self.job_ids)
-            scancel_cmd = f"scancel {scancel_string}"
-            logger.warning(f"Now scancel-ing SLURM jobs {scancel_string}")
-            try:
-                _run_command_as_user(
-                    cmd=scancel_cmd,
-                    user=self.slurm_user,
-                    check=True,
-                )
-            except RuntimeError as e:
-                logger.warning(
-                    "[scancel_jobs] `scancel` command failed. "
-                    f"Original error:\n{str(e)}"
-                )
-
-        logger.debug("[scancel_jobs] END")
-
-    def _submit_single_sbatch(
-        self,
-        func,
-        slurm_job: SlurmJob,
-        slurm_config: SlurmConfig,
-    ) -> str:
-        logger.debug("[_submit_single_sbatch] START")
-        # Prepare input pickle(s)
-        versions = dict(
-            python=sys.version_info[:3],
-            cloudpickle=cloudpickle.__version__,
-            fractal_server=__VERSION__,
-        )
-        for task in slurm_job.tasks:
-            _args = []
-            _kwargs = dict(
-                parameters=task.parameters,
-                remote_files=task.task_files.remote_files_dict,
-            )
-            funcser = cloudpickle.dumps((versions, func, _args, _kwargs))
-            with open(task.input_pickle_file_local, "wb") as f:
-                f.write(funcser)
-            logger.debug(
-                "[_submit_single_sbatch] Written "
-                f"{task.input_pickle_file_local=}"
-            )
-        # Prepare commands to be included in SLURM submission script
-        settings = Inject(get_settings)
-        python_worker_interpreter = (
-            settings.FRACTAL_SLURM_WORKER_PYTHON or sys.executable
-        )
-        cmdlines = []
-        for task in slurm_job.tasks:
-            input_pickle_file = task.input_pickle_file_local
-            output_pickle_file = task.output_pickle_file_remote
-            cmdlines.append(
-                (
-                    f"{python_worker_interpreter}"
-                    " -m fractal_server.app.runner."
-                    "executors.slurm_common.remote "
-                    f"--input-file {input_pickle_file} "
-                    f"--output-file {output_pickle_file}"
-                )
-            )
-
-        # ...
-        num_tasks_max_running = slurm_config.parallel_tasks_per_job
-        mem_per_task_MB = slurm_config.mem_per_task_MB
-
-        # Set ntasks
-        ntasks = min(len(cmdlines), num_tasks_max_running)
-        slurm_config.parallel_tasks_per_job = ntasks
-
-        # Prepare SLURM preamble based on SlurmConfig object
-        script_lines = slurm_config.to_sbatch_preamble(
-            remote_export_dir=self.user_cache_dir
-        )
-
-        # Extend SLURM preamble with variable which are not in SlurmConfig, and
-        # fix their order
-        script_lines.extend(
-            [
-                f"#SBATCH --out={slurm_job.slurm_stdout_remote}",
-                f"#SBATCH --err={slurm_job.slurm_stderr_remote}",
-                f"#SBATCH -D {slurm_job.workdir_remote}",
-            ]
+        super().__init__(
+            slurm_runner_type="sudo",
+            root_dir_local=root_dir_local,
+            root_dir_remote=root_dir_remote,
+            common_script_lines=common_script_lines,
+            user_cache_dir=user_cache_dir,
+            poll_interval=poll_interval,
         )
-        script_lines = slurm_config.sort_script_lines(script_lines)
-        logger.debug(script_lines)
-
-        # Always print output of `uname -n` and `pwd`
-        script_lines.append(
-            '"Hostname: `uname -n`; current directory: `pwd`"\n'
-        )
-
-        # Complete script preamble
-        script_lines.append("\n")
-
-        # Include command lines
-        tmp_list_commands = copy(cmdlines)
-        while tmp_list_commands:
-            if tmp_list_commands:
-                cmd = tmp_list_commands.pop(0)  # take first element
-                script_lines.append(
-                    "srun --ntasks=1 --cpus-per-task=$SLURM_CPUS_PER_TASK "
-                    f"--mem={mem_per_task_MB}MB "
-                    f"{cmd} &"
-                )
-        script_lines.append("wait\n")
-
-        script = "\n".join(script_lines)
-
-        # Write submission script
-        # submission_script_contents = "\n".join(preamble_lines + cmdlines)
-        with open(slurm_job.slurm_submission_script_local, "w") as f:
-            f.write(script)
 
-
-
-
-
-        )
-        full_command = f"{pre_command} {submit_command}"
-
-        # Submit SLURM job and retrieve job ID
-        res = _subprocess_run_or_raise(full_command)
-        submitted_job_id = int(res.stdout)
-        slurm_job.slurm_job_id = str(submitted_job_id)
-
-        # Add job to self.jobs
-        self.jobs[slurm_job.slurm_job_id] = slurm_job
-        logger.debug(f"Added {slurm_job.slurm_job_id} to self.jobs.")
+    def _mkdir_local_folder(self, folder: str) -> None:
+        original_umask = os.umask(0)
+        Path(folder).mkdir(parents=True, mode=0o755)
+        os.umask(original_umask)
 
-
-
-        return list(self.jobs.keys())
+    def _mkdir_remote_folder(self, folder: str) -> None:
+        _mkdir_as_user(folder=folder, user=self.slurm_user)
 
     def _copy_files_from_remote_to_local(self, job: SlurmJob) -> None:
         """
         Note: this would differ for SSH
         """
-        logger.
+        logger.info(f"[_copy_files_from_remote_to_local] {job.slurm_job_id=}")
         source_target_list = [
             (job.slurm_stdout_remote, job.slurm_stdout_local),
             (job.slurm_stderr_remote, job.slurm_stderr_local),
@@ -469,315 +140,15 @@ class RunnerSlurmSudo(BaseRunner):
                     f"Original error: {str(e)}"
                 )
 
-    def _postprocess_single_task(
-
-
-        try:
-            with open(task.output_pickle_file_local, "rb") as f:
-                outdata = f.read()
-            success, output = cloudpickle.loads(outdata)
-            if success:
-                result = output
-                return result, None
-            else:
-                exception = _handle_exception_proxy(output)
-                return None, exception
-        except Exception as e:
-            exception = JobExecutionError(f"ERROR, {str(e)}")
-            return None, exception
-        finally:
-            Path(task.input_pickle_file_local).unlink(missing_ok=True)
-            Path(task.output_pickle_file_local).unlink(missing_ok=True)
-
-    def submit(
-        self,
-        func: callable,
-        parameters: dict[str, Any],
-        history_unit_id: int,
-        task_files: TaskFiles,
-        task_type: Literal[
-            "non_parallel",
-            "converter_non_parallel",
-            "compound",
-            "converter_compound",
-        ],
-        config: SlurmConfig,
-    ) -> tuple[Any, Exception]:
-
-        if len(self.jobs) > 0:
-            raise RuntimeError(f"Cannot run .submit when {len(self.jobs)=}")
-
-        workdir_local = task_files.wftask_subfolder_local
-        workdir_remote = task_files.wftask_subfolder_remote
-        if self.jobs != {}:
-            raise JobExecutionError("Unexpected branch: jobs should be empty.")
-
-        if self.is_shutdown():
-            raise JobExecutionError("Cannot continue after shutdown.")
-
-        # Validation phase
-        self.validate_submit_parameters(parameters, task_type=task_type)
-
-        # Create task subfolder
-        original_umask = os.umask(0)
-        workdir_local.mkdir(parents=True, mode=0o755)
-        os.umask(original_umask)
-        _mkdir_as_user(
-            folder=workdir_remote.as_posix(),
+    def _run_remote_cmd(self, cmd: str):
+        res = _run_command_as_user(
+            cmd=cmd,
             user=self.slurm_user,
+            encoding="utf-8",
+            check=True,
         )
+        return res.stdout
 
-
-
-
-            workdir_local=workdir_local,
-            workdir_remote=workdir_remote,
-            tasks=[
-                SlurmTask(
-                    index=0,
-                    component=task_files.component,
-                    parameters=parameters,
-                    workdir_remote=workdir_remote,
-                    workdir_local=workdir_local,
-                    task_files=task_files,
-                )
-            ],
-        )
-        config.parallel_tasks_per_job = 1
-        self._submit_single_sbatch(
-            func,
-            slurm_job=slurm_job,
-            slurm_config=config,
-        )
-        logger.info(f"END submission phase, {self.job_ids=}")
-
-        # FIXME: Replace with more robust/efficient logic
-        logger.warning("Now sleep 4 (FIXME)")
-        time.sleep(4)
-
-        # Retrieval phase
-        logger.info("START retrieval phase")
-        while len(self.jobs) > 0:
-            if self.is_shutdown():
-                self.scancel_jobs()
-            finished_job_ids = get_finished_jobs(job_ids=self.job_ids)
-            logger.debug(f"{finished_job_ids=}")
-            with next(get_sync_db()) as db:
-                for slurm_job_id in finished_job_ids:
-                    logger.debug(f"Now process {slurm_job_id=}")
-                    slurm_job = self.jobs.pop(slurm_job_id)
-                    self._copy_files_from_remote_to_local(slurm_job)
-                    result, exception = self._postprocess_single_task(
-                        task=slurm_job.tasks[0]
-                    )
-                    # Note: the relevant done/failed check is based on
-                    # whether `exception is None`. The fact that
-                    # `result is None` is not relevant for this purpose.
-                    if exception is not None:
-                        update_status_of_history_unit(
-                            history_unit_id=history_unit_id,
-                            status=HistoryUnitStatus.FAILED,
-                            db_sync=db,
-                        )
-                    else:
-                        if task_type not in ["compound", "converter_compound"]:
-                            update_status_of_history_unit(
-                                history_unit_id=history_unit_id,
-                                status=HistoryUnitStatus.DONE,
-                                db_sync=db,
-                            )
-
-            time.sleep(self.slurm_poll_interval)
-
-        return result, exception
-
-    def multisubmit(
-        self,
-        func: callable,
-        list_parameters: list[dict],
-        history_unit_ids: list[int],
-        list_task_files: list[TaskFiles],
-        task_type: Literal["parallel", "compound", "converter_compound"],
-        config: SlurmConfig,
-    ):
-
-        if len(self.jobs) > 0:
-            raise RuntimeError(
-                f"Cannot run .multisubmit when {len(self.jobs)=}"
-            )
-
-        self.validate_multisubmit_parameters(
-            list_parameters=list_parameters,
-            task_type=task_type,
-            list_task_files=list_task_files,
-        )
-        self.validate_multisubmit_history_unit_ids(
-            history_unit_ids=history_unit_ids,
-            task_type=task_type,
-            list_parameters=list_parameters,
-        )
-
-        logger.debug(f"[multisubmit] START, {len(list_parameters)=}")
-
-        workdir_local = list_task_files[0].wftask_subfolder_local
-        workdir_remote = list_task_files[0].wftask_subfolder_remote
-
-        # Create local&remote task subfolders
-        if task_type == "parallel":
-            original_umask = os.umask(0)
-            workdir_local.mkdir(parents=True, mode=0o755)
-            os.umask(original_umask)
-            _mkdir_as_user(
-                folder=workdir_remote.as_posix(),
-                user=self.slurm_user,
-            )
-
-        # Execute tasks, in chunks of size `parallel_tasks_per_job`
-        # TODO Pick a data structure for results and exceptions, or review the
-        # interface
-        results: dict[int, Any] = {}
-        exceptions: dict[int, BaseException] = {}
-
-        original_task_files = list_task_files
-        tot_tasks = len(list_parameters)
-
-        # Set/validate parameters for task batching
-        tasks_per_job, parallel_tasks_per_job = heuristics(
-            # Number of parallel components (always known)
-            tot_tasks=tot_tasks,
-            # Optional WorkflowTask attributes:
-            tasks_per_job=config.tasks_per_job,
-            parallel_tasks_per_job=config.parallel_tasks_per_job,  # noqa
-            # Task requirements (multiple possible sources):
-            cpus_per_task=config.cpus_per_task,
-            mem_per_task=config.mem_per_task_MB,
-            # Fractal configuration variables (soft/hard limits):
-            target_cpus_per_job=config.target_cpus_per_job,
-            target_mem_per_job=config.target_mem_per_job,
-            target_num_jobs=config.target_num_jobs,
-            max_cpus_per_job=config.max_cpus_per_job,
-            max_mem_per_job=config.max_mem_per_job,
-            max_num_jobs=config.max_num_jobs,
-        )
-        config.parallel_tasks_per_job = parallel_tasks_per_job
-        config.tasks_per_job = tasks_per_job
-
-        # Divide arguments in batches of `tasks_per_job` tasks each
-        args_batches = []
-        batch_size = tasks_per_job
-        for ind_chunk in range(0, tot_tasks, batch_size):
-            args_batches.append(
-                list_parameters[ind_chunk : ind_chunk + batch_size]  # noqa
-            )
-        if len(args_batches) != math.ceil(tot_tasks / tasks_per_job):
-            raise RuntimeError("Something wrong here while batching tasks")
-
-        logger.info(f"START submission phase, {list(self.jobs.keys())=}")
-        for ind_batch, chunk in enumerate(args_batches):
-            tasks = []
-            for ind_chunk, parameters in enumerate(chunk):
-                index = (ind_batch * batch_size) + ind_chunk
-                tasks.append(
-                    SlurmTask(
-                        index=index,
-                        component=original_task_files[index].component,
-                        workdir_local=workdir_local,
-                        workdir_remote=workdir_remote,
-                        parameters=parameters,
-                        zarr_url=parameters["zarr_url"],
-                        task_files=original_task_files[index],
-                    ),
-                )
-
-            slurm_job = SlurmJob(
-                label=f"{ind_batch:06d}",
-                workdir_local=workdir_local,
-                workdir_remote=workdir_remote,
-                tasks=tasks,
-            )
-            self._submit_single_sbatch(
-                func,
-                slurm_job=slurm_job,
-                slurm_config=config,
-            )
-        logger.info(f"END submission phase, {self.job_ids=}")
-
-        # FIXME: Replace with more robust/efficient logic
-        logger.warning("Now sleep 4 (FIXME)")
-        time.sleep(4)
-
-        # Retrieval phase
-        logger.info("START retrieval phase")
-        while len(self.jobs) > 0:
-            if self.is_shutdown():
-                self.scancel_jobs()
-            finished_job_ids = get_finished_jobs(job_ids=self.job_ids)
-            logger.debug(f"{finished_job_ids=}")
-            with next(get_sync_db()) as db:
-                for slurm_job_id in finished_job_ids:
-                    logger.debug(f"Now processing {slurm_job_id=}")
-                    slurm_job = self.jobs.pop(slurm_job_id)
-                    self._copy_files_from_remote_to_local(slurm_job)
-                    for task in slurm_job.tasks:
-                        logger.debug(f"Now processing {task.index=}")
-                        result, exception = self._postprocess_single_task(
-                            task=task
-                        )
-
-                        # Note: the relevant done/failed check is based on
-                        # whether `exception is None`. The fact that
-                        # `result is None` is not relevant for this purpose.
-                        if exception is not None:
-                            logger.debug(
-                                f"Task {task.index} has an exception."
-                            )  # FIXME # noqa
-                            exceptions[task.index] = exception
-                            if task_type == "parallel":
-                                update_status_of_history_unit(
-                                    history_unit_id=history_unit_ids[
-                                        task.index
-                                    ],
-                                    status=HistoryUnitStatus.FAILED,
-                                    db_sync=db,
-                                )
-                        else:
-                            logger.debug(
-                                f"Task {task.index} has no exception."
-                            )  # FIXME # noqa
-                            results[task.index] = result
-                            if task_type == "parallel":
-                                update_status_of_history_unit(
-                                    history_unit_id=history_unit_ids[
-                                        task.index
-                                    ],
-                                    status=HistoryUnitStatus.DONE,
-                                    db_sync=db,
-                                )
-
-            time.sleep(self.slurm_poll_interval)
-        return results, exceptions
-
-    def check_remote_python_interpreter(self):
-        """
-        Check fractal-server version on the _remote_ Python interpreter.
-        """
-        settings = Inject(get_settings)
-        output = _subprocess_run_or_raise(
-            (
-                f"{settings.FRACTAL_SLURM_WORKER_PYTHON} "
-                "-m fractal_server.app.runner.versions"
-            )
-        )
-        runner_version = json.loads(output.stdout.strip("\n"))[
-            "fractal_server"
-        ]
-        if runner_version != __VERSION__:
-            error_msg = (
-                "Fractal-server version mismatch.\n"
-                "Local interpreter: "
-                f"({sys.executable}): {__VERSION__}.\n"
-                "Remote interpreter: "
-                f"({settings.FRACTAL_SLURM_WORKER_PYTHON}): {runner_version}."
-            )
-            logger.error(error_msg)
-            raise RuntimeError(error_msg)
+    def _run_local_cmd(self, cmd: str):
+        res = _subprocess_run_or_raise(cmd)
+        return res.stdout