fractal-server 2.14.0a12__py3-none-any.whl → 2.14.0a14__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
- fractal_server/__init__.py +1 -1
- fractal_server/app/models/linkusergroup.py +6 -2
- fractal_server/app/models/v2/dataset.py +1 -1
- fractal_server/app/models/v2/job.py +7 -3
- fractal_server/app/models/v2/task_group.py +2 -2
- fractal_server/app/models/v2/workflow.py +1 -1
- fractal_server/app/models/v2/workflowtask.py +1 -1
- fractal_server/app/routes/admin/v2/task_group.py +0 -17
- fractal_server/app/routes/api/v2/dataset.py +0 -8
- fractal_server/app/routes/api/v2/history.py +112 -27
- fractal_server/app/routes/api/v2/images.py +16 -14
- fractal_server/app/routes/api/v2/project.py +0 -52
- fractal_server/app/routes/api/v2/task_group.py +0 -17
- fractal_server/app/routes/api/v2/workflow.py +0 -8
- fractal_server/app/routes/auth/group.py +0 -16
- fractal_server/app/runner/executors/base_runner.py +5 -0
- fractal_server/app/runner/executors/local/runner.py +15 -7
- fractal_server/app/runner/executors/slurm_common/_handle_exception_proxy.py +17 -0
- fractal_server/app/runner/executors/slurm_common/base_slurm_runner.py +676 -0
- fractal_server/app/runner/executors/slurm_common/slurm_job_task_models.py +102 -0
- fractal_server/app/runner/executors/slurm_ssh/runner.py +110 -648
- fractal_server/app/runner/executors/slurm_sudo/runner.py +32 -661
- fractal_server/app/runner/task_files.py +20 -6
- fractal_server/app/runner/v2/_slurm_ssh.py +6 -6
- fractal_server/app/runner/v2/_slurm_sudo.py +4 -4
- fractal_server/app/runner/v2/db_tools.py +1 -0
- fractal_server/app/runner/v2/runner.py +4 -0
- fractal_server/app/runner/v2/runner_functions.py +2 -2
- fractal_server/app/runner/v2/submit_workflow.py +7 -16
- fractal_server/app/schemas/v2/__init__.py +3 -1
- fractal_server/app/schemas/v2/history.py +27 -2
- fractal_server/config.py +6 -2
- fractal_server/images/tools.py +23 -0
- fractal_server/migrations/versions/5b6007027595_on_cascade.py +250 -0
- fractal_server/migrations/versions/fbce16ff4e47_new_history_items.py +2 -2
- fractal_server/tasks/v2/utils_background.py +0 -19
- {fractal_server-2.14.0a12.dist-info → fractal_server-2.14.0a14.dist-info}/METADATA +1 -1
- {fractal_server-2.14.0a12.dist-info → fractal_server-2.14.0a14.dist-info}/RECORD +41 -42
- fractal_server/app/runner/executors/slurm_common/_check_jobs_status.py +0 -77
- fractal_server/app/runner/executors/slurm_ssh/_check_job_status_ssh.py +0 -67
- fractal_server/app/runner/executors/slurm_ssh/_executor_wait_thread.py +0 -126
- fractal_server/app/runner/executors/slurm_ssh/_slurm_job.py +0 -116
- fractal_server/app/runner/executors/slurm_ssh/executor.py +0 -1386
- {fractal_server-2.14.0a12.dist-info → fractal_server-2.14.0a14.dist-info}/LICENSE +0 -0
- {fractal_server-2.14.0a12.dist-info → fractal_server-2.14.0a14.dist-info}/WHEEL +0 -0
- {fractal_server-2.14.0a12.dist-info → fractal_server-2.14.0a14.dist-info}/entry_points.txt +0 -0

fractal_server/app/runner/executors/slurm_common/_handle_exception_proxy.py
@@ -0,0 +1,17 @@
+from fractal_server.app.runner.exceptions import JobExecutionError
+from fractal_server.app.runner.exceptions import TaskExecutionError
+
+
+def _handle_exception_proxy(proxy):  # FIXME
+    if proxy.exc_type_name == "JobExecutionError":
+        return JobExecutionError(str(proxy))
+    else:
+        kwargs = {}
+        for key in [
+            "workflow_task_id",
+            "workflow_task_order",
+            "task_name",
+        ]:
+            if key in proxy.kwargs.keys():
+                kwargs[key] = proxy.kwargs[key]
+        return TaskExecutionError(proxy.tb, **kwargs)
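
As a reading aid (not part of the diff): the helper above converts the exception proxy returned in a task's output pickle into a fractal-server exception. The sketch below is illustrative only; `FakeProxy` is a hypothetical stand-in for the worker-side proxy object, modeling just the attributes the helper reads (`exc_type_name`, `tb`, `kwargs`).

```python
from dataclasses import dataclass, field

from fractal_server.app.runner.executors.slurm_common._handle_exception_proxy import (  # noqa: E501
    _handle_exception_proxy,
)


@dataclass
class FakeProxy:
    """Hypothetical stand-in for the remote exception proxy."""

    exc_type_name: str
    tb: str = ""
    kwargs: dict = field(default_factory=dict)


# A non-JobExecutionError proxy becomes a TaskExecutionError; only the
# whitelisted keys (workflow_task_id, workflow_task_order, task_name) are
# forwarded, and anything else in proxy.kwargs is dropped.
exc = _handle_exception_proxy(
    FakeProxy(
        exc_type_name="ValueError",
        tb="Traceback (most recent call last): ...",
        kwargs={"workflow_task_id": 1, "task_name": "my-task", "extra": "x"},
    )
)
print(type(exc).__name__)  # TaskExecutionError
```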

fractal_server/app/runner/executors/slurm_common/base_slurm_runner.py
@@ -0,0 +1,676 @@
+import json
+import math
+import sys
+import time
+from pathlib import Path
+from typing import Any
+from typing import Literal
+from typing import Optional
+
+import cloudpickle
+
+from ..slurm_common._slurm_config import SlurmConfig
+from ..slurm_common.slurm_job_task_models import SlurmJob
+from ..slurm_common.slurm_job_task_models import SlurmTask
+from ._batching import heuristics
+from ._handle_exception_proxy import _handle_exception_proxy
+from ._job_states import STATES_FINISHED
+from fractal_server import __VERSION__
+from fractal_server.app.db import get_sync_db
+from fractal_server.app.runner.exceptions import JobExecutionError
+from fractal_server.app.runner.executors.base_runner import BaseRunner
+from fractal_server.app.runner.filenames import SHUTDOWN_FILENAME
+from fractal_server.app.runner.task_files import MULTISUBMIT_PREFIX
+from fractal_server.app.runner.task_files import SUBMIT_PREFIX
+from fractal_server.app.runner.task_files import TaskFiles
+from fractal_server.app.runner.v2.db_tools import update_status_of_history_unit
+from fractal_server.app.schemas.v2 import HistoryUnitStatus
+from fractal_server.config import get_settings
+from fractal_server.logger import set_logger
+from fractal_server.syringe import Inject
+
+logger = set_logger(__name__)
+
+# FIXME: Transform several logger.info into logger.debug.
+
+
+class BaseSlurmRunner(BaseRunner):
+    shutdown_file: Path
+    common_script_lines: list[str]
+    user_cache_dir: str
+    root_dir_local: Path
+    root_dir_remote: Path
+    poll_interval: int
+    jobs: dict[str, SlurmJob]
+    python_worker_interpreter: str
+    slurm_runner_type: Literal["ssh", "sudo"]
+
+    def __init__(
+        self,
+        root_dir_local: Path,
+        root_dir_remote: Path,
+        slurm_runner_type: Literal["ssh", "sudo"],
+        common_script_lines: Optional[list[str]] = None,
+        user_cache_dir: Optional[str] = None,
+        poll_interval: Optional[int] = None,
+    ):
+        self.slurm_runner_type = slurm_runner_type
+        self.root_dir_local = root_dir_local
+        self.root_dir_remote = root_dir_remote
+        self.common_script_lines = common_script_lines or []
+        self._check_slurm_account()
+        self.user_cache_dir = user_cache_dir
+
+        settings = Inject(get_settings)
+
+        self.poll_interval = (
+            poll_interval or settings.FRACTAL_SLURM_POLL_INTERVAL
+        )
+        self.check_fractal_server_versions()
+
+        # Create job folders. Note that the local one may or may not exist
+        # depending on whether it is a test or an actual run
+        if not self.root_dir_local.is_dir():
+            self._mkdir_local_folder(self.root_dir_local.as_posix())
+        self._mkdir_remote_folder(self.root_dir_remote.as_posix())
+
+        self.shutdown_file = self.root_dir_local / SHUTDOWN_FILENAME
+        self.jobs = {}
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        return False
+
+    def _run_local_cmd(self, cmd: str) -> str:
+        raise NotImplementedError("Implement in child class.")
+
+    def _run_remote_cmd(self, cmd: str) -> str:
+        raise NotImplementedError("Implement in child class.")
+
+    def run_squeue(self, job_ids: list[str]) -> tuple[bool, str]:
+        # FIXME: review different cases (exception vs no job found)
+        job_id_single_str = ",".join([str(j) for j in job_ids])
+        cmd = (
+            f"squeue --noheader --format='%i %T' --jobs {job_id_single_str}"
+            " --states=all"
+        )
+
+        try:
+            if self.slurm_runner_type == "sudo":
+                stdout = self._run_local_cmd(cmd)
+            else:
+                stdout = self._run_remote_cmd(cmd)
+            return True, stdout
+        except Exception as e:
+            logger.info(f"{cmd=} failed with {str(e)}")
+            return False, ""
+
+    def _get_finished_jobs(self, job_ids: list[str]) -> set[str]:
+        # If there is no Slurm job to check, return right away
+
+        if not job_ids:
+            return set()
+        id_to_state = dict()
+
+        success, stdout = self.run_squeue(job_ids)
+        if success:
+            id_to_state = {
+                out.split()[0]: out.split()[1] for out in stdout.splitlines()
+            }
+        else:
+            id_to_state = dict()
+            for j in job_ids:
+                success, res = self.run_squeue([j])
+                if not success:
+                    logger.info(f"Job {j} not found. Marked it as completed")
+                    id_to_state.update({str(j): "COMPLETED"})
+                else:
+                    id_to_state.update(
+                        {res.stdout.split()[0]: res.stdout.split()[1]}
+                    )
+
+        # Finished jobs only stay in squeue for a few mins (configurable). If
+        # a job ID isn't there, we'll assume it's finished.
+        return {
+            j
+            for j in job_ids
+            if id_to_state.get(j, "COMPLETED") in STATES_FINISHED
+        }
+
+    def _mkdir_local_folder(self, folder: str) -> None:
+        raise NotImplementedError("Implement in child class.")
+
+    def _mkdir_remote_folder(self, folder: str) -> None:
+        raise NotImplementedError("Implement in child class.")
+
+    def _submit_single_sbatch(
+        self,
+        func,
+        slurm_job: SlurmJob,
+        slurm_config: SlurmConfig,
+    ) -> str:
+        logger.info("[_submit_single_sbatch] START")
+        # Prepare input pickle(s)
+        versions = dict(
+            python=sys.version_info[:3],
+            cloudpickle=cloudpickle.__version__,
+            fractal_server=__VERSION__,
+        )
+        for task in slurm_job.tasks:
+            # Wrinte input pickle
+            _args = []
+            _kwargs = dict(
+                parameters=task.parameters,
+                remote_files=task.task_files.remote_files_dict,
+            )
+            funcser = cloudpickle.dumps((versions, func, _args, _kwargs))
+            with open(task.input_pickle_file_local, "wb") as f:
+                f.write(funcser)
+            logger.info(
+                "[_submit_single_sbatch] Written "
+                f"{task.input_pickle_file_local=}"
+            )
+
+            if self.slurm_runner_type == "ssh":
+                # Send input pickle (only relevant for SSH)
+                self.fractal_ssh.send_file(
+                    local=task.input_pickle_file_local,
+                    remote=task.input_pickle_file_remote,
+                )
+                logger.info(
+                    "[_submit_single_sbatch] Transferred "
+                    f"{task.input_pickle_file_local=}"
+                )
+
+        # Prepare commands to be included in SLURM submission script
+        cmdlines = []
+        for task in slurm_job.tasks:
+            if self.slurm_runner_type == "ssh":
+                input_pickle_file = task.input_pickle_file_remote
+            else:
+                input_pickle_file = task.input_pickle_file_local
+            output_pickle_file = task.output_pickle_file_remote
+            cmdlines.append(
+                (
+                    f"{self.python_worker_interpreter}"
+                    " -m fractal_server.app.runner."
+                    "executors.slurm_common.remote "
+                    f"--input-file {input_pickle_file} "
+                    f"--output-file {output_pickle_file}"
+                )
+            )
+
+        # Set ntasks
+        num_tasks_max_running = slurm_config.parallel_tasks_per_job
+        ntasks = min(len(cmdlines), num_tasks_max_running)
+        slurm_config.parallel_tasks_per_job = ntasks
+
+        # Prepare SLURM preamble based on SlurmConfig object
+        script_lines = slurm_config.to_sbatch_preamble(
+            remote_export_dir=self.user_cache_dir
+        )
+
+        # Extend SLURM preamble with variable which are not in SlurmConfig, and
+        # fix their order
+        script_lines.extend(
+            [
+                f"#SBATCH --err={slurm_job.slurm_stderr_remote}",
+                f"#SBATCH --out={slurm_job.slurm_stdout_remote}",
+                f"#SBATCH -D {slurm_job.workdir_remote}",
+            ]
+        )
+        script_lines = slurm_config.sort_script_lines(script_lines)
+        logger.info(script_lines)
+
+        # Always print output of `uname -n` and `pwd`
+        script_lines.append(
+            '"Hostname: `uname -n`; current directory: `pwd`"\n'
+        )
+
+        # Complete script preamble
+        script_lines.append("\n")
+
+        # Include command lines
+        mem_per_task_MB = slurm_config.mem_per_task_MB
+        for cmd in cmdlines:
+            script_lines.append(
+                "srun --ntasks=1 --cpus-per-task=$SLURM_CPUS_PER_TASK "
+                f"--mem={mem_per_task_MB}MB "
+                f"{cmd} &"
+            )
+        script_lines.append("wait\n")
+        script = "\n".join(script_lines)
+
+        # Write submission script
+        with open(slurm_job.slurm_submission_script_local, "w") as f:
+            f.write(script)
+        logger.info(
+            "[_submit_single_sbatch] Written "
+            f"{slurm_job.slurm_submission_script_local=}"
+        )
+
+        if self.slurm_runner_type == "ssh":
+            self.fractal_ssh.send_file(
+                local=slurm_job.slurm_submission_script_local,
+                remote=slurm_job.slurm_submission_script_remote,
+            )
+            submit_command = (
+                "sbatch --parsable "
+                f"{slurm_job.slurm_submission_script_remote}"
+            )
+        else:
+            submit_command = (
+                "sbatch --parsable "
+                f"{slurm_job.slurm_submission_script_local}"
+            )
+        # Run sbatch
+        pre_submission_cmds = slurm_config.pre_submission_commands
+        if len(pre_submission_cmds) == 0:
+            logger.info(f"Now run {submit_command=}")
+            sbatch_stdout = self._run_remote_cmd(submit_command)
+        else:
+            logger.info(f"Now using {pre_submission_cmds=}")
+            script_lines = pre_submission_cmds + [submit_command]
+            wrapper_script_contents = "\n".join(script_lines)
+            wrapper_script_contents = f"{wrapper_script_contents}\n"
+            if self.slurm_runner_type == "ssh":
+                wrapper_script = (
+                    f"{slurm_job.slurm_submission_script_remote}_wrapper.sh"
+                )
+                self.fractal_ssh.write_remote_file(
+                    path=wrapper_script, content=wrapper_script_contents
+                )
+            else:
+                wrapper_script = (
+                    f"{slurm_job.slurm_submission_script_local}_wrapper.sh"
+                )
+                with open(wrapper_script, "w") as f:
+                    f.write(wrapper_script_contents)
+            logger.info(f"Now run {wrapper_script=}")
+            sbatch_stdout = self._run_remote_cmd(f"bash {wrapper_script}")
+
+        # Submit SLURM job and retrieve job ID
+        logger.info(f"[_submit_single_sbatc] {sbatch_stdout=}")
+        stdout = sbatch_stdout.strip("\n")
+        submitted_job_id = int(stdout)
+        slurm_job.slurm_job_id = str(submitted_job_id)
+
+        # Add job to self.jobs
+        self.jobs[slurm_job.slurm_job_id] = slurm_job
+        logger.info(
+            "[_submit_single_sbatch] Added "
+            f"{slurm_job.slurm_job_id} to self.jobs."
+        )
+        logger.info("[_submit_single_sbatch] END")
+
+    def _copy_files_from_remote_to_local(
+        self,
+        slurm_job: SlurmJob,
+    ) -> None:
+        raise NotImplementedError("Implement in child class.")
+
+    def _check_slurm_account(self) -> None:
+        """
+        Check that SLURM account is not set here in `common_script_lines`.
+        """
+        try:
+            invalid_line = next(
+                line
+                for line in self.common_script_lines
+                if line.startswith("#SBATCH --account=")
+            )
+            raise RuntimeError(
+                "Invalid line in `common_script_lines`: "
+                f"'{invalid_line}'.\n"
+                "SLURM account must be set via the request body of the "
+                "apply-workflow endpoint, or by modifying the user properties."
+            )
+        except StopIteration:
+            pass
+
+    def _postprocess_single_task(
+        self, *, task: SlurmTask
+    ) -> tuple[Any, Exception]:
+        try:
+            with open(task.output_pickle_file_local, "rb") as f:
+                outdata = f.read()
+            success, output = cloudpickle.loads(outdata)
+            if success:
+                result = output
+                return result, None
+            else:
+                exception = _handle_exception_proxy(output)
+                return None, exception
+        except Exception as e:
+            exception = JobExecutionError(f"ERROR, {str(e)}")
+            return None, exception
+        finally:
+            pass
+            # FIXME: Re-include unlinks of pickle files
+            # Path(task.input_pickle_file_local).unlink(missing_ok=True)
+            # Path(task.output_pickle_file_local).unlink(missing_ok=True)
+
+    def is_shutdown(self) -> bool:
+        # FIXME: shutdown is not implemented
+        return self.shutdown_file.exists()
+
+    @property
+    def job_ids(self) -> list[str]:
+        return list(self.jobs.keys())
+
+    def submit(
+        self,
+        func: callable,
+        parameters: dict[str, Any],
+        history_unit_id: int,
+        task_files: TaskFiles,
+        config: SlurmConfig,
+        task_type: Literal[
+            "non_parallel",
+            "converter_non_parallel",
+            "compound",
+            "converter_compound",
+        ],
+    ) -> tuple[Any, Exception]:
+
+        logger.info("[submit] START")
+
+        workdir_local = task_files.wftask_subfolder_local
+        workdir_remote = task_files.wftask_subfolder_remote
+
+        if self.jobs != {}:
+            raise JobExecutionError("Unexpected branch: jobs should be empty.")
+
+        if self.is_shutdown():
+            raise JobExecutionError("Cannot continue after shutdown.")
+
+        # Validation phase
+        self.validate_submit_parameters(
+            parameters=parameters,
+            task_type=task_type,
+        )
+
+        # Create task subfolder
+        logger.info("[submit] Create local/remote folders - START")
+        self._mkdir_local_folder(folder=workdir_local.as_posix())
+        self._mkdir_remote_folder(folder=workdir_remote.as_posix())
+        logger.info("[submit] Create local/remote folders - END")
+
+        # Add prefix to task_files object
+        task_files.prefix = SUBMIT_PREFIX
+
+        # Submission phase
+        slurm_job = SlurmJob(
+            prefix=SUBMIT_PREFIX,
+            workdir_local=workdir_local,
+            workdir_remote=workdir_remote,
+            tasks=[
+                SlurmTask(
+                    prefix=SUBMIT_PREFIX,
+                    index=0,
+                    component=task_files.component,
+                    parameters=parameters,
+                    workdir_remote=workdir_remote,
+                    workdir_local=workdir_local,
+                    task_files=task_files,
+                )
+            ],
+        )
+
+        config.parallel_tasks_per_job = 1
+        self._submit_single_sbatch(
+            func,
+            slurm_job=slurm_job,
+            slurm_config=config,
+        )
+        logger.info(f"[submit] END submission phase, {self.job_ids=}")
+
+        # FIXME: replace this sleep a more precise check
+        settings = Inject(get_settings)
+        sleep_time = settings.FRACTAL_SLURM_INTERVAL_BEFORE_RETRIEVAL
+        logger.warning(f"[submit] Now sleep {sleep_time} (FIXME)")
+        time.sleep(sleep_time)
+
+        # Retrieval phase
+        logger.info("[submit] START retrieval phase")
+        while len(self.jobs) > 0:
+            if self.is_shutdown():
+                self.scancel_jobs()
+            finished_job_ids = self._get_finished_jobs(job_ids=self.job_ids)
+            logger.info(f"{finished_job_ids=}")
+            with next(get_sync_db()) as db:
+                for slurm_job_id in finished_job_ids:
+                    logger.info(f"Now process {slurm_job_id=}")
+                    slurm_job = self.jobs.pop(slurm_job_id)
+
+                    self._copy_files_from_remote_to_local(slurm_job)
+                    result, exception = self._postprocess_single_task(
+                        task=slurm_job.tasks[0]
+                    )
+                    if exception is not None:
+                        update_status_of_history_unit(
+                            history_unit_id=history_unit_id,
+                            status=HistoryUnitStatus.FAILED,
+                            db_sync=db,
+                        )
+                    else:
+                        if task_type not in ["compound", "converter_compound"]:
+                            update_status_of_history_unit(
+                                history_unit_id=history_unit_id,
+                                status=HistoryUnitStatus.DONE,
+                                db_sync=db,
+                            )
+
+            time.sleep(self.poll_interval)
+
+        logger.info("[submit] END")
+        return result, exception
+
+    def multisubmit(
+        self,
+        func: callable,
+        list_parameters: list[dict],
+        history_unit_ids: list[int],
+        list_task_files: list[TaskFiles],
+        task_type: Literal["parallel", "compound", "converter_compound"],
+        config: SlurmConfig,
+    ):
+
+        if len(self.jobs) > 0:
+            raise RuntimeError(
+                f"Cannot run .multisubmit when {len(self.jobs)=}"
+            )
+
+        self.validate_multisubmit_parameters(
+            list_parameters=list_parameters,
+            task_type=task_type,
+            list_task_files=list_task_files,
+        )
+        self.validate_multisubmit_history_unit_ids(
+            history_unit_ids=history_unit_ids,
+            task_type=task_type,
+            list_parameters=list_parameters,
+        )
+
+        logger.info(f"[multisubmit] START, {len(list_parameters)=}")
+
+        workdir_local = list_task_files[0].wftask_subfolder_local
+        workdir_remote = list_task_files[0].wftask_subfolder_remote
+
+        # Create local&remote task subfolders
+        if task_type == "parallel":
+            self._mkdir_local_folder(workdir_local.as_posix())
+            self._mkdir_remote_folder(folder=workdir_remote.as_posix())
+
+        # Execute tasks, in chunks of size `parallel_tasks_per_job`
+        # TODO Pick a data structure for results and exceptions, or review the
+        # interface
+        results: dict[int, Any] = {}
+        exceptions: dict[int, BaseException] = {}
+
+        tot_tasks = len(list_parameters)
+
+        # Set/validate parameters for task batching
+        tasks_per_job, parallel_tasks_per_job = heuristics(
+            # Number of parallel components (always known)
+            tot_tasks=tot_tasks,
+            # Optional WorkflowTask attributes:
+            tasks_per_job=config.tasks_per_job,
+            parallel_tasks_per_job=config.parallel_tasks_per_job,  # noqa
+            # Task requirements (multiple possible sources):
+            cpus_per_task=config.cpus_per_task,
+            mem_per_task=config.mem_per_task_MB,
+            # Fractal configuration variables (soft/hard limits):
+            target_cpus_per_job=config.target_cpus_per_job,
+            target_mem_per_job=config.target_mem_per_job,
+            target_num_jobs=config.target_num_jobs,
+            max_cpus_per_job=config.max_cpus_per_job,
+            max_mem_per_job=config.max_mem_per_job,
+            max_num_jobs=config.max_num_jobs,
+        )
+        config.parallel_tasks_per_job = parallel_tasks_per_job
+        config.tasks_per_job = tasks_per_job
+
+        # Divide arguments in batches of `tasks_per_job` tasks each
+        args_batches = []
+        batch_size = tasks_per_job
+        for ind_chunk in range(0, tot_tasks, batch_size):
+            args_batches.append(
+                list_parameters[ind_chunk : ind_chunk + batch_size]  # noqa
+            )
+        if len(args_batches) != math.ceil(tot_tasks / tasks_per_job):
+            raise RuntimeError("Something wrong here while batching tasks")
+
+        logger.info(f"START submission phase, {list(self.jobs.keys())=}")
+        for ind_batch, chunk in enumerate(args_batches):
+            prefix = f"{MULTISUBMIT_PREFIX}-{ind_batch:06d}"
+            tasks = []
+            for ind_chunk, parameters in enumerate(chunk):
+                index = (ind_batch * batch_size) + ind_chunk
+                current_task_files = list_task_files[index]
+                current_task_files.prefix = prefix
+                tasks.append(
+                    SlurmTask(
+                        prefix=prefix,
+                        index=index,
+                        component=current_task_files.component,
+                        workdir_local=workdir_local,
+                        workdir_remote=workdir_remote,
+                        parameters=parameters,
+                        zarr_url=parameters["zarr_url"],
+                        task_files=current_task_files,
+                    ),
+                )
+
+            slurm_job = SlurmJob(
+                prefix=prefix,
+                workdir_local=workdir_local,
+                workdir_remote=workdir_remote,
+                tasks=tasks,
+            )
+            self._submit_single_sbatch(
+                func,
+                slurm_job=slurm_job,
+                slurm_config=config,
+            )
+        logger.info(f"END submission phase, {self.job_ids=}")
+
+        # FIXME: replace this sleep a more precise check
+        settings = Inject(get_settings)
+        sleep_time = settings.FRACTAL_SLURM_INTERVAL_BEFORE_RETRIEVAL
+        logger.warning(f"[submit] Now sleep {sleep_time} (FIXME)")
+        time.sleep(sleep_time)
+
+        # Retrieval phase
+        logger.info("START retrieval phase")
+        while len(self.jobs) > 0:
+            if self.is_shutdown():
+                self.scancel_jobs()
+            finished_job_ids = self._get_finished_jobs(job_ids=self.job_ids)
+            logger.info(f"{finished_job_ids=}")
+            with next(get_sync_db()) as db:
+                for slurm_job_id in finished_job_ids:
+                    logger.info(f"Now processing {slurm_job_id=}")
+                    slurm_job = self.jobs.pop(slurm_job_id)
+                    self._copy_files_from_remote_to_local(slurm_job)
+                    for task in slurm_job.tasks:
+                        logger.info(f"Now processing {task.index=}")
+                        result, exception = self._postprocess_single_task(
+                            task=task
+                        )
+
+                        # Note: the relevant done/failed check is based on
+                        # whether `exception is None`. The fact that
+                        # `result is None` is not relevant for this purpose.
+                        if exception is not None:
+                            exceptions[task.index] = exception
+                            if task_type == "parallel":
+                                update_status_of_history_unit(
+                                    history_unit_id=history_unit_ids[
+                                        task.index
+                                    ],
+                                    status=HistoryUnitStatus.FAILED,
+                                    db_sync=db,
+                                )
+                        else:
+                            results[task.index] = result
+                            if task_type == "parallel":
+                                update_status_of_history_unit(
+                                    history_unit_id=history_unit_ids[
+                                        task.index
+                                    ],
+                                    status=HistoryUnitStatus.DONE,
+                                    db_sync=db,
+                                )
+
+            time.sleep(self.poll_interval)
+        return results, exceptions
+
+    def check_fractal_server_versions(self):
+        """
+        Compare fractal-server versions of local/remote Python interpreters.
+        """
+
+        # Skip check when the local and remote interpreters are the same
+        # (notably for some sudo-slurm deployments)
+        if self.python_worker_interpreter == sys.executable:
+            return
+
+        # Fetch remote fractal-server version
+        cmd = (
+            f"{self.python_worker_interpreter} "
+            "-m fractal_server.app.runner.versions"
+        )
+        stdout = self._run_remote_cmd(cmd)
+        remote_version = json.loads(stdout.strip("\n"))["fractal_server"]
+
+        # Verify local/remote version match
+        if remote_version != __VERSION__:
+            error_msg = (
+                "Fractal-server version mismatch.\n"
+                "Local interpreter: "
+                f"({sys.executable}): {__VERSION__}.\n"
+                "Remote interpreter: "
+                f"({self.python_worker_interpreter}): {remote_version}."
+            )
+            logger.error(error_msg)
+            raise RuntimeError(error_msg)
+
+    def scancel_jobs(self) -> None:
+        logger.info("[scancel_jobs] START")
+
+        if self.jobs:
+            scancel_string = " ".join(self.job_ids)
+            scancel_cmd = f"scancel {scancel_string}"
+            logger.warning(f"Now scancel-ing SLURM jobs {scancel_string}")
+            try:
+                self._run_remote_cmd(scancel_cmd)
+            except Exception as e:
+                logger.warning(
+                    "[scancel_jobs] `scancel` command failed. "
+                    f"Original error:\n{str(e)}"
+                )
+
+        logger.info("[scancel_jobs] END")
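
For orientation (not part of the diff): `BaseSlurmRunner` leaves the transport- and filesystem-specific hooks (`_run_local_cmd`, `_run_remote_cmd`, `_mkdir_local_folder`, `_mkdir_remote_folder`, `_copy_files_from_remote_to_local`) to its children; the actual implementations are the reworked `slurm_sudo`/`slurm_ssh` runners listed above. The snippet below is a hypothetical, simplified sketch of a sudo-style subclass, shown only to illustrate which hooks a child class must provide; the class name and the shared-filesystem copy logic are assumptions, not code from the package.

```python
import shlex
import shutil
import subprocess
import sys
from pathlib import Path

from fractal_server.app.runner.executors.slurm_common.base_slurm_runner import (  # noqa: E501
    BaseSlurmRunner,
)


class MinimalSudoSlurmRunner(BaseSlurmRunner):
    """Hypothetical sketch; the real runner is slurm_sudo/runner.py."""

    def __init__(self, root_dir_local: Path, root_dir_remote: Path, **kwargs):
        # Using the same interpreter locally and remotely makes the base
        # class skip its fractal-server version check.
        self.python_worker_interpreter = sys.executable
        super().__init__(
            root_dir_local=root_dir_local,
            root_dir_remote=root_dir_remote,
            slurm_runner_type="sudo",
            **kwargs,
        )

    def _run_local_cmd(self, cmd: str) -> str:
        res = subprocess.run(
            shlex.split(cmd), capture_output=True, text=True, check=True
        )
        return res.stdout

    def _run_remote_cmd(self, cmd: str) -> str:
        # In a sudo-style deployment, "remote" commands also run on this
        # host (the real runner additionally impersonates the user).
        return self._run_local_cmd(cmd)

    def _mkdir_local_folder(self, folder: str) -> None:
        Path(folder).mkdir(parents=True, exist_ok=True)

    def _mkdir_remote_folder(self, folder: str) -> None:
        Path(folder).mkdir(parents=True, exist_ok=True)

    def _copy_files_from_remote_to_local(self, slurm_job) -> None:
        # With a shared filesystem, copy job files back into the local
        # working directory.
        for path in Path(slurm_job.workdir_remote).glob("*"):
            if path.is_file():
                shutil.copy2(path, Path(slurm_job.workdir_local) / path.name)
```

As the base class shows, the two flavors differ mainly in transport: the sudo flavor runs `squeue` via `_run_local_cmd`, while the SSH flavor uses `_run_remote_cmd` and transfers pickles and submission scripts through `self.fractal_ssh`.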