fractal-server 2.14.0a9__py3-none-any.whl → 2.14.0a11__py3-none-any.whl
This diff shows the changes between two publicly released versions of this package, as they appear in their public registry, and is provided for informational purposes only.
- fractal_server/__init__.py +1 -1
- fractal_server/app/models/v2/dataset.py +0 -10
- fractal_server/app/models/v2/job.py +3 -0
- fractal_server/app/routes/api/v2/__init__.py +2 -0
- fractal_server/app/routes/api/v2/history.py +14 -9
- fractal_server/app/routes/api/v2/images.py +5 -2
- fractal_server/app/routes/api/v2/submit.py +16 -14
- fractal_server/app/routes/api/v2/verify_image_types.py +64 -0
- fractal_server/app/routes/api/v2/workflow.py +11 -7
- fractal_server/app/runner/components.py +0 -3
- fractal_server/app/runner/exceptions.py +4 -0
- fractal_server/app/runner/executors/base_runner.py +16 -17
- fractal_server/app/runner/executors/local/{_local_config.py → get_local_config.py} +0 -7
- fractal_server/app/runner/executors/local/runner.py +117 -58
- fractal_server/app/runner/executors/{slurm_sudo → slurm_common}/_check_jobs_status.py +4 -0
- fractal_server/app/runner/executors/slurm_ssh/_check_job_status_ssh.py +67 -0
- fractal_server/app/runner/executors/slurm_ssh/executor.py +7 -5
- fractal_server/app/runner/executors/slurm_ssh/runner.py +707 -0
- fractal_server/app/runner/executors/slurm_sudo/runner.py +265 -114
- fractal_server/app/runner/task_files.py +8 -0
- fractal_server/app/runner/v2/__init__.py +0 -365
- fractal_server/app/runner/v2/_local.py +4 -2
- fractal_server/app/runner/v2/_slurm_ssh.py +4 -2
- fractal_server/app/runner/v2/_slurm_sudo.py +4 -2
- fractal_server/app/runner/v2/db_tools.py +87 -0
- fractal_server/app/runner/v2/runner.py +83 -89
- fractal_server/app/runner/v2/runner_functions.py +279 -436
- fractal_server/app/runner/v2/runner_functions_low_level.py +37 -39
- fractal_server/app/runner/v2/submit_workflow.py +366 -0
- fractal_server/app/runner/v2/task_interface.py +31 -0
- fractal_server/app/schemas/v2/dataset.py +4 -71
- fractal_server/app/schemas/v2/dumps.py +6 -5
- fractal_server/app/schemas/v2/job.py +6 -3
- fractal_server/migrations/versions/47351f8c7ebc_drop_dataset_filters.py +50 -0
- fractal_server/migrations/versions/e81103413827_add_job_type_filters.py +36 -0
- {fractal_server-2.14.0a9.dist-info → fractal_server-2.14.0a11.dist-info}/METADATA +1 -1
- {fractal_server-2.14.0a9.dist-info → fractal_server-2.14.0a11.dist-info}/RECORD +40 -36
- fractal_server/app/runner/executors/local/_submit_setup.py +0 -46
- fractal_server/app/runner/executors/slurm_common/_submit_setup.py +0 -84
- fractal_server/app/runner/v2/_db_tools.py +0 -48
- {fractal_server-2.14.0a9.dist-info → fractal_server-2.14.0a11.dist-info}/LICENSE +0 -0
- {fractal_server-2.14.0a9.dist-info → fractal_server-2.14.0a11.dist-info}/WHEEL +0 -0
- {fractal_server-2.14.0a9.dist-info → fractal_server-2.14.0a11.dist-info}/entry_points.txt +0 -0
fractal_server/app/runner/executors/slurm_ssh/runner.py (new file)
@@ -0,0 +1,707 @@
```python
import json
import math
import sys
import time
from copy import copy
from pathlib import Path
from typing import Any
from typing import Optional

import cloudpickle
from pydantic import BaseModel
from pydantic import ConfigDict

from ._check_job_status_ssh import get_finished_jobs_ssh
from fractal_server import __VERSION__
from fractal_server.app.runner.exceptions import JobExecutionError
from fractal_server.app.runner.exceptions import TaskExecutionError
from fractal_server.app.runner.executors.base_runner import BaseRunner
from fractal_server.app.runner.executors.slurm_common._batching import (
    heuristics,
)
from fractal_server.app.runner.executors.slurm_common._slurm_config import (
    SlurmConfig,
)
from fractal_server.app.runner.filenames import SHUTDOWN_FILENAME
from fractal_server.app.runner.task_files import TaskFiles
from fractal_server.app.schemas.v2.task import TaskTypeType
from fractal_server.config import get_settings
from fractal_server.logger import set_logger
from fractal_server.ssh._fabric import FractalSSH
from fractal_server.syringe import Inject


logger = set_logger(__name__)


def _handle_exception_proxy(proxy):  # FIXME
    if proxy.exc_type_name == "JobExecutionError":
        return JobExecutionError(str(proxy))
    else:
        kwargs = {}
        for key in [
            "workflow_task_id",
            "workflow_task_order",
            "task_name",
        ]:
            if key in proxy.kwargs.keys():
                kwargs[key] = proxy.kwargs[key]
        return TaskExecutionError(proxy.tb, **kwargs)


class SlurmTask(BaseModel):
    model_config = ConfigDict(arbitrary_types_allowed=True)
    component: str
    workdir_local: Path
    workdir_remote: Path
    parameters: dict[str, Any]
    zarr_url: Optional[str] = None
    task_files: TaskFiles
    index: int

    @property
    def input_pickle_file_local(self) -> str:
        return (
            self.workdir_local / f"{self.component}-input.pickle"
        ).as_posix()

    @property
    def output_pickle_file_local(self) -> str:
        return (
            self.workdir_local / f"{self.component}-output.pickle"
        ).as_posix()

    @property
    def input_pickle_file_remote(self) -> str:
        return (
            self.workdir_remote / f"{self.component}-input.pickle"
        ).as_posix()

    @property
    def output_pickle_file_remote(self) -> str:
        return (
            self.workdir_remote / f"{self.component}-output.pickle"
        ).as_posix()


class SlurmJob(BaseModel):
    slurm_job_id: Optional[str] = None
    label: str
    workdir_local: Path
    workdir_remote: Path
    tasks: list[SlurmTask]

    @property
    def slurm_log_file_local(self) -> str:
        if self.slurm_job_id:
            return (
                self.workdir_local
                / f"slurm-{self.label}-{self.slurm_job_id}.log"
            ).as_posix()
        else:
            return (
                self.workdir_local / f"slurm-{self.label}-%j.log"
            ).as_posix()

    @property
    def slurm_log_file_remote(self) -> str:
        if self.slurm_job_id:
            return (
                self.workdir_remote
                / f"slurm-{self.label}-{self.slurm_job_id}.log"
            ).as_posix()
        else:
            return (
                self.workdir_remote / f"slurm-{self.label}-%j.log"
            ).as_posix()

    @property
    def slurm_submission_script_local(self) -> str:
        return (
            self.workdir_local / f"slurm-{self.label}-submit.sh"
        ).as_posix()

    @property
    def slurm_submission_script_remote(self) -> str:
        return (
            self.workdir_remote / f"slurm-{self.label}-submit.sh"
        ).as_posix()

    @property
    def slurm_stdout(self) -> str:
        return (self.workdir_remote / f"slurm-{self.label}.out").as_posix()

    @property
    def slurm_stderr(self) -> str:
        return (self.workdir_remote / f"slurm-{self.label}.err").as_posix()

    @property
    def log_files_local(self) -> list[str]:
        return [task.task_files.log_file_local for task in self.tasks]


# def _subprocess_run_or_raise(
#     full_command: str,
# ) -> Optional[subprocess.CompletedProcess]:
#     try:
#         output = subprocess.run(  # nosec
#             shlex.split(full_command),
#             capture_output=True,
#             check=True,
#             encoding="utf-8",
#         )
#         return output
#     except subprocess.CalledProcessError as e:
#         error_msg = (
#             f"Submit command `{full_command}` failed. "
#             f"Original error:\n{str(e)}\n"
#             f"Original stdout:\n{e.stdout}\n"
#             f"Original stderr:\n{e.stderr}\n"
#         )
#         logging.error(error_msg)
#         raise JobExecutionError(info=error_msg)


class RunnerSlurmSSH(BaseRunner):
    fractal_ssh: FractalSSH

    slurm_user: str
    shutdown_file: Path
    common_script_lines: list[str]
    user_cache_dir: str
    root_dir_local: Path
    root_dir_remote: Path
    slurm_account: Optional[str] = None
    poll_interval: int
    python_worker_interpreter: str
    jobs: dict[str, SlurmJob]

    def __init__(
        self,
        *,
        fractal_ssh: FractalSSH,
        slurm_user: str,
        root_dir_local: Path,
        root_dir_remote: Path,
        slurm_account: Optional[str] = None,
        common_script_lines: Optional[list[str]] = None,
        user_cache_dir: Optional[str] = None,
        slurm_poll_interval: Optional[int] = None,
    ) -> None:
        """
        Set parameters that are the same for different Fractal tasks and for
        different SLURM jobs/tasks.
        """

        self.slurm_user = slurm_user
        self.slurm_account = slurm_account
        self.common_script_lines = common_script_lines or []

        # Check that SLURM account is not set here
        # FIXME: move to little method
        try:
            invalid_line = next(
                line
                for line in self.common_script_lines
                if line.startswith("#SBATCH --account=")
            )
            raise RuntimeError(
                "Invalid line in `RunnerSlurmSSH.common_script_lines`: "
                f"'{invalid_line}'.\n"
                "SLURM account must be set via the request body of the "
                "apply-workflow endpoint, or by modifying the user properties."
            )
        except StopIteration:
            pass

        # Check Python versions
        settings = Inject(get_settings)
        self.fractal_ssh = fractal_ssh
        logger.warning(self.fractal_ssh)

        # It is the new handshake
        if settings.FRACTAL_SLURM_WORKER_PYTHON is not None:
            self.check_remote_python_interpreter()

        # Initialize connection and perform handshake
        self.root_dir_local = root_dir_local
        self.root_dir_remote = root_dir_remote

        # # Create folders
        # original_umask = os.umask(0)
        # self.root_dir_local.mkdir(parents=True, exist_ok=True, mode=0o755)
        # os.umask(original_umask)
        # _mkdir_as_user(
        #     folder=self.root_dir_remote.as_posix(),
        #     user=self.slurm_user,
        # )

        self.user_cache_dir = user_cache_dir

        self.slurm_poll_interval = (
            slurm_poll_interval or settings.FRACTAL_SLURM_POLL_INTERVAL
        )

        self.shutdown_file = self.root_dir_local / SHUTDOWN_FILENAME

        self.python_worker_interpreter = (
            settings.FRACTAL_SLURM_WORKER_PYTHON or sys.executable
        )

        self.jobs = {}

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        return False

    def is_shutdown(self) -> bool:
        return self.shutdown_file.exists()

    def scancel_jobs(self) -> None:
        logger.debug("[scancel_jobs] START")

        if self.jobs:
            scancel_string = " ".join(self.job_ids)
            scancel_cmd = f"scancel {scancel_string}"
            logger.warning(f"Now scancel-ing SLURM jobs {scancel_string}")
            try:
                self.fractal_ssh.run_command(cmd=scancel_cmd)
                # _run_command_as_user(
                #     cmd=scancel_cmd,
                #     user=self.slurm_user,
                #     check=True,
                # )
            except RuntimeError as e:
                logger.warning(
                    "[scancel_jobs] `scancel` command failed. "
                    f"Original error:\n{str(e)}"
                )

        logger.debug("[scancel_jobs] END")

    def _submit_single_sbatch(
        self,
        func,
        slurm_job: SlurmJob,
        slurm_config: SlurmConfig,
    ) -> str:
        # Prepare input pickle(s)
        versions = dict(
            python=sys.version_info[:3],
            cloudpickle=cloudpickle.__version__,
            fractal_server=__VERSION__,
        )
        for task in slurm_job.tasks:
            _args = []
            _kwargs = dict(parameters=task.parameters)
            funcser = cloudpickle.dumps((versions, func, _args, _kwargs))
            with open(task.input_pickle_file_local, "wb") as f:
                f.write(funcser)
        # Prepare commands to be included in SLURM submission script
        settings = Inject(get_settings)
        python_worker_interpreter = (
            settings.FRACTAL_SLURM_WORKER_PYTHON or sys.executable
        )
        cmdlines = []
        for task in slurm_job.tasks:
            input_pickle_file = task.input_pickle_file_local
            output_pickle_file = task.output_pickle_file_remote
            cmdlines.append(
                (
                    f"{python_worker_interpreter}"
                    " -m fractal_server.app.runner."
                    "executors.slurm_common.remote "
                    f"--input-file {input_pickle_file} "
                    f"--output-file {output_pickle_file}"
                )
            )

        # ...
        num_tasks_max_running = slurm_config.parallel_tasks_per_job
        mem_per_task_MB = slurm_config.mem_per_task_MB

        # Set ntasks
        ntasks = min(len(cmdlines), num_tasks_max_running)
        slurm_config.parallel_tasks_per_job = ntasks

        # Prepare SLURM preamble based on SlurmConfig object
        script_lines = slurm_config.to_sbatch_preamble(
            remote_export_dir=self.user_cache_dir
        )

        # Extend SLURM preamble with variables which are not in SlurmConfig,
        # and fix their order
        script_lines.extend(
            [
                f"#SBATCH --err={slurm_job.slurm_stderr}",
                f"#SBATCH --out={slurm_job.slurm_stdout}",
                f"#SBATCH -D {slurm_job.workdir_remote}",
            ]
        )
        script_lines = slurm_config.sort_script_lines(script_lines)
        logger.debug(script_lines)

        # Always print output of `uname -n` and `pwd`
        script_lines.append(
'"Hostname: `uname -n`; current directory: `pwd`"\n'
        )

        # Complete script preamble
        script_lines.append("\n")

        # Include command lines
        tmp_list_commands = copy(cmdlines)
        while tmp_list_commands:
            if tmp_list_commands:
                cmd = tmp_list_commands.pop(0)  # take first element
                script_lines.append(
                    "srun --ntasks=1 --cpus-per-task=$SLURM_CPUS_PER_TASK "
                    f"--mem={mem_per_task_MB}MB "
                    f"{cmd} &"
                )
        script_lines.append("wait\n")

        script = "\n".join(script_lines)

        # Write submission script
        # submission_script_contents = "\n".join(preamble_lines + cmdlines)
        with open(slurm_job.slurm_submission_script_local, "w") as f:
            f.write(script)

        self.fractal_ssh.send_file(
            local=slurm_job.slurm_submission_script_local,
            remote=slurm_job.slurm_submission_script_remote,
        )

        # Run sbatch
        submit_command = (
            f"sbatch --parsable {slurm_job.slurm_submission_script_remote}"
        )
        pre_submission_cmds = slurm_config.pre_submission_commands
        if len(pre_submission_cmds) == 0:
            sbatch_stdout = self.fractal_ssh.run_command(cmd=submit_command)
        else:
            logger.debug(f"Now using {pre_submission_cmds=}")
            script_lines = pre_submission_cmds + [submit_command]
            script_content = "\n".join(script_lines)
            script_content = f"{script_content}\n"
            script_path_remote = (
f"{slurm_job.slurm_script_remote.as_posix()}_wrapper.sh"
            )
            self.fractal_ssh.write_remote_file(
                path=script_path_remote, content=script_content
            )
            cmd = f"bash {script_path_remote}"
            sbatch_stdout = self.fractal_ssh.run_command(cmd=cmd)

        # Submit SLURM job and retrieve job ID
        stdout = sbatch_stdout.strip("\n")
        submitted_job_id = int(stdout)
        slurm_job.slurm_job_id = str(submitted_job_id)

        # Add job to self.jobs
        self.jobs[slurm_job.slurm_job_id] = slurm_job
        logger.debug(f"Added {slurm_job.slurm_job_id} to self.jobs.")

    @property
    def job_ids(self) -> list[str]:
        return list(self.jobs.keys())

    def _copy_files_from_remote_to_local(self, job: SlurmJob) -> None:
        # FIXME: This should only transfer archives, not single files
        """
        Note: this would differ for SSH
        """
        source_target_list = [
            (job.slurm_log_file_remote, job.slurm_log_file_local)
        ]
        for task in job.tasks:
            source_target_list.extend(
                [
                    (
                        task.output_pickle_file_remote,
                        task.output_pickle_file_local,
                    ),
                    (
                        task.task_files.log_file_remote,
                        task.task_files.log_file_local,
                    ),
                    (
                        task.task_files.args_file_remote,
                        task.task_files.args_file_local,
                    ),
                    (
                        task.task_files.metadiff_file_remote,
                        task.task_files.metadiff_file_local,
                    ),
                ]
            )

        for source, target in source_target_list:
            try:
                self.fractal_ssh.fetch_file(local=target, remote=source)
                # res = _run_command_as_user(
                #     cmd=f"cat {source}",
                #     user=self.slurm_user,
                #     encoding=None,
                #     check=True,
                # )
                # Write local file
                # with open(target, "wb") as f:
                #     f.write(res.stdout)
                # logger.critical(f"Copied {source} into {target}")
            except (RuntimeError, FileNotFoundError) as e:
                logger.warning(
f"SKIP copy {target} into {source}. "
                    f"Original error: {str(e)}"
                )

    def _postprocess_single_task(
        self, *, task: SlurmTask
    ) -> tuple[Any, Exception]:
        try:
            with open(task.output_pickle_file_local, "rb") as f:
                outdata = f.read()
            success, output = cloudpickle.loads(outdata)
            if success:
                result = output
                return result, None
            else:
                exception = _handle_exception_proxy(output)
                return None, exception
        except Exception as e:
            exception = JobExecutionError(f"ERROR, {str(e)}")
            return None, exception
        finally:
            Path(task.input_pickle_file_local).unlink(missing_ok=True)
            Path(task.output_pickle_file_local).unlink(missing_ok=True)

    def submit(
        self,
        func: callable,
        parameters: dict[str, Any],
        history_item_id: int,
        task_files: TaskFiles,
        slurm_config: SlurmConfig,
        task_type: TaskTypeType,
    ) -> tuple[Any, Exception]:
        workdir_local = task_files.wftask_subfolder_local
        workdir_remote = task_files.wftask_subfolder_remote

        task_files = TaskFiles(
            **task_files.model_dump(
                exclude={"component"},
            ),
            # FIXME _COMPONENT_KEY_ is deprecated
            component="FIXME_INVALID_FAKE_VALUE",
            # component=parameters[_COMPONENT_KEY_],
        )

        if self.jobs != {}:
            raise JobExecutionError("Unexpected branch: jobs should be empty.")

        if self.is_shutdown():
            raise JobExecutionError("Cannot continue after shutdown.")

        # Validation phase
        self.validate_submit_parameters(
            parameters=parameters,
            task_type=task_type,
        )

        # Create task subfolder
        workdir_local.mkdir(parents=True)
        self.fractal_ssh.mkdir(
            folder=workdir_remote.as_posix(),
            parents=True,
        )

        # Submission phase
        slurm_job = SlurmJob(
            label="0",
            workdir_local=workdir_local,
            workdir_remote=workdir_remote,
            tasks=[
                SlurmTask(
                    index=0,
                    component="0",
                    parameters=parameters,
                    workdir_remote=workdir_remote,
                    workdir_local=workdir_local,
                    task_files=task_files,
                )
            ],
        )  # TODO: replace with actual values (BASED ON TASKFILES)

        slurm_config.parallel_tasks_per_job = 1
        self._submit_single_sbatch(
            func,
            slurm_job=slurm_job,
            slurm_config=slurm_config,
        )

        # Retrieval phase
        while len(self.jobs) > 0:
            if self.is_shutdown():
                self.scancel_jobs()
            finished_job_ids = get_finished_jobs_ssh(
                job_ids=self.job_ids,
                fractal_ssh=self.fractal_ssh,
            )
            for slurm_job_id in finished_job_ids:
                slurm_job = self.jobs.pop(slurm_job_id)
                self._copy_files_from_remote_to_local(slurm_job)
                result, exception = self._postprocess_single_task(
                    task=slurm_job.tasks[0]
                )
            time.sleep(self.slurm_poll_interval)

        return result, exception

    def multisubmit(
        self,
        func: callable,
        list_parameters: list[dict],
        history_item_id: int,
        task_files: TaskFiles,
        slurm_config: SlurmConfig,
        task_type: TaskTypeType,
    ):
        # self.scancel_jobs()

        self.validate_multisubmit_parameters(
            list_parameters=list_parameters,
            task_type=task_type,
        )

        workdir_local = task_files.wftask_subfolder_local
        workdir_remote = task_files.wftask_subfolder_remote

        # Create local&remote task subfolders
        if task_type not in ["compound", "converter_compound"]:
            workdir_local.mkdir(parents=True)
            self.fractal_ssh.mkdir(
                folder=workdir_remote.as_posix(),
                parents=True,
            )

        # Execute tasks, in chunks of size `parallel_tasks_per_job`
        # TODO Pick a data structure for results and exceptions, or review the
        # interface
        results: dict[int, Any] = {}
        exceptions: dict[int, BaseException] = {}

        original_task_files = task_files
        tot_tasks = len(list_parameters)

        # Set/validate parameters for task batching
        tasks_per_job, parallel_tasks_per_job = heuristics(
            # Number of parallel components (always known)
            tot_tasks=tot_tasks,
            # Optional WorkflowTask attributes:
            tasks_per_job=slurm_config.tasks_per_job,
            parallel_tasks_per_job=slurm_config.parallel_tasks_per_job,  # noqa
            # Task requirements (multiple possible sources):
            cpus_per_task=slurm_config.cpus_per_task,
            mem_per_task=slurm_config.mem_per_task_MB,
            # Fractal configuration variables (soft/hard limits):
            target_cpus_per_job=slurm_config.target_cpus_per_job,
            target_mem_per_job=slurm_config.target_mem_per_job,
            target_num_jobs=slurm_config.target_num_jobs,
            max_cpus_per_job=slurm_config.max_cpus_per_job,
            max_mem_per_job=slurm_config.max_mem_per_job,
            max_num_jobs=slurm_config.max_num_jobs,
        )
        slurm_config.parallel_tasks_per_job = parallel_tasks_per_job
        slurm_config.tasks_per_job = tasks_per_job

        # Divide arguments in batches of `tasks_per_job` tasks each
        args_batches = []
        batch_size = tasks_per_job
        for ind_chunk in range(0, tot_tasks, batch_size):
            args_batches.append(
                list_parameters[ind_chunk : ind_chunk + batch_size]  # noqa
            )
        if len(args_batches) != math.ceil(tot_tasks / tasks_per_job):
            raise RuntimeError("Something wrong here while batching tasks")

        logger.info(f"START submission phase, {list(self.jobs.keys())=}")
        for ind_batch, chunk in enumerate(args_batches):
            # TODO: replace with actual values
            tasks = []
            for ind_chunk, parameters in enumerate(chunk):
                # FIXME: _COMPONENT_KEY_ is deprecated
                # component = parameters[_COMPONENT_KEY_]
                component = "INVALID_FAKE_VALUE_FIXME"
                tasks.append(
                    SlurmTask(
                        index=(ind_batch * batch_size) + ind_chunk,
                        component=component,
                        workdir_local=workdir_local,
                        workdir_remote=workdir_remote,
                        parameters=parameters,
                        zarr_url=parameters["zarr_url"],
                        task_files=TaskFiles(
                            **original_task_files.model_dump(
                                exclude={"component"}
                            ),
                            component=component,
                        ),
                    ),
                )

            slurm_job = SlurmJob(
                label=f"{ind_batch:06d}",
                workdir_local=workdir_local,
                workdir_remote=workdir_remote,
                tasks=tasks,
            )
            self._submit_single_sbatch(
                func,
                slurm_job=slurm_job,
                slurm_config=slurm_config,
            )
        logger.info(f"END submission phase, {list(self.jobs.keys())=}")

        # Retrieval phase
        while len(self.jobs) > 0:
            if self.is_shutdown():
                self.scancel_jobs()
            finished_job_ids = get_finished_jobs_ssh(
                job_ids=self.job_ids,
                fractal_ssh=self.fractal_ssh,
            )
            for slurm_job_id in finished_job_ids:
                slurm_job = self.jobs.pop(slurm_job_id)
                self._copy_files_from_remote_to_local(slurm_job)
                for task in slurm_job.tasks:
                    result, exception = self._postprocess_single_task(
                        task=task
                    )
                    if exception is None:
                        results[task.index] = result
                    else:
                        exceptions[task.index] = exception
            time.sleep(self.slurm_poll_interval)
        return results, exceptions

    def check_remote_python_interpreter(self):
        settings = Inject(get_settings)
        cmd = (
            f"{self.python_worker_interpreter} "
            "-m fractal_server.app.runner.versions"
        )
        stdout = self.fractal_ssh.run_command(cmd=cmd)
        remote_version = json.loads(stdout.strip("\n"))["fractal_server"]
        if remote_version != __VERSION__:
            error_msg = (
                "Fractal-server version mismatch.\n"
                "Local interpreter: "
                f"({sys.executable}): {__VERSION__}.\n"
                "Remote interpreter: "
                f"({settings.FRACTAL_SLURM_WORKER_PYTHON}): {remote_version}."
            )
            logger.error(error_msg)
            raise RuntimeError(error_msg)
```