fractal-server 2.14.0a9__py3-none-any.whl → 2.14.0a11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. fractal_server/__init__.py +1 -1
  2. fractal_server/app/models/v2/dataset.py +0 -10
  3. fractal_server/app/models/v2/job.py +3 -0
  4. fractal_server/app/routes/api/v2/__init__.py +2 -0
  5. fractal_server/app/routes/api/v2/history.py +14 -9
  6. fractal_server/app/routes/api/v2/images.py +5 -2
  7. fractal_server/app/routes/api/v2/submit.py +16 -14
  8. fractal_server/app/routes/api/v2/verify_image_types.py +64 -0
  9. fractal_server/app/routes/api/v2/workflow.py +11 -7
  10. fractal_server/app/runner/components.py +0 -3
  11. fractal_server/app/runner/exceptions.py +4 -0
  12. fractal_server/app/runner/executors/base_runner.py +16 -17
  13. fractal_server/app/runner/executors/local/{_local_config.py → get_local_config.py} +0 -7
  14. fractal_server/app/runner/executors/local/runner.py +117 -58
  15. fractal_server/app/runner/executors/{slurm_sudo → slurm_common}/_check_jobs_status.py +4 -0
  16. fractal_server/app/runner/executors/slurm_ssh/_check_job_status_ssh.py +67 -0
  17. fractal_server/app/runner/executors/slurm_ssh/executor.py +7 -5
  18. fractal_server/app/runner/executors/slurm_ssh/runner.py +707 -0
  19. fractal_server/app/runner/executors/slurm_sudo/runner.py +265 -114
  20. fractal_server/app/runner/task_files.py +8 -0
  21. fractal_server/app/runner/v2/__init__.py +0 -365
  22. fractal_server/app/runner/v2/_local.py +4 -2
  23. fractal_server/app/runner/v2/_slurm_ssh.py +4 -2
  24. fractal_server/app/runner/v2/_slurm_sudo.py +4 -2
  25. fractal_server/app/runner/v2/db_tools.py +87 -0
  26. fractal_server/app/runner/v2/runner.py +83 -89
  27. fractal_server/app/runner/v2/runner_functions.py +279 -436
  28. fractal_server/app/runner/v2/runner_functions_low_level.py +37 -39
  29. fractal_server/app/runner/v2/submit_workflow.py +366 -0
  30. fractal_server/app/runner/v2/task_interface.py +31 -0
  31. fractal_server/app/schemas/v2/dataset.py +4 -71
  32. fractal_server/app/schemas/v2/dumps.py +6 -5
  33. fractal_server/app/schemas/v2/job.py +6 -3
  34. fractal_server/migrations/versions/47351f8c7ebc_drop_dataset_filters.py +50 -0
  35. fractal_server/migrations/versions/e81103413827_add_job_type_filters.py +36 -0
  36. {fractal_server-2.14.0a9.dist-info → fractal_server-2.14.0a11.dist-info}/METADATA +1 -1
  37. {fractal_server-2.14.0a9.dist-info → fractal_server-2.14.0a11.dist-info}/RECORD +40 -36
  38. fractal_server/app/runner/executors/local/_submit_setup.py +0 -46
  39. fractal_server/app/runner/executors/slurm_common/_submit_setup.py +0 -84
  40. fractal_server/app/runner/v2/_db_tools.py +0 -48
  41. {fractal_server-2.14.0a9.dist-info → fractal_server-2.14.0a11.dist-info}/LICENSE +0 -0
  42. {fractal_server-2.14.0a9.dist-info → fractal_server-2.14.0a11.dist-info}/WHEEL +0 -0
  43. {fractal_server-2.14.0a9.dist-info → fractal_server-2.14.0a11.dist-info}/entry_points.txt +0 -0
fractal_server/app/runner/executors/slurm_ssh/runner.py ADDED
@@ -0,0 +1,707 @@
+ import json
+ import math
+ import sys
+ import time
+ from copy import copy
+ from pathlib import Path
+ from typing import Any
+ from typing import Optional
+
+ import cloudpickle
+ from pydantic import BaseModel
+ from pydantic import ConfigDict
+
+ from ._check_job_status_ssh import get_finished_jobs_ssh
+ from fractal_server import __VERSION__
+ from fractal_server.app.runner.exceptions import JobExecutionError
+ from fractal_server.app.runner.exceptions import TaskExecutionError
+ from fractal_server.app.runner.executors.base_runner import BaseRunner
+ from fractal_server.app.runner.executors.slurm_common._batching import (
+     heuristics,
+ )
+ from fractal_server.app.runner.executors.slurm_common._slurm_config import (
+     SlurmConfig,
+ )
+ from fractal_server.app.runner.filenames import SHUTDOWN_FILENAME
+ from fractal_server.app.runner.task_files import TaskFiles
+ from fractal_server.app.schemas.v2.task import TaskTypeType
+ from fractal_server.config import get_settings
+ from fractal_server.logger import set_logger
+ from fractal_server.ssh._fabric import FractalSSH
+ from fractal_server.syringe import Inject
+
+
+ logger = set_logger(__name__)
+
+
+ def _handle_exception_proxy(proxy):  # FIXME
+     if proxy.exc_type_name == "JobExecutionError":
+         return JobExecutionError(str(proxy))
+     else:
+         kwargs = {}
+         for key in [
+             "workflow_task_id",
+             "workflow_task_order",
+             "task_name",
+         ]:
+             if key in proxy.kwargs.keys():
+                 kwargs[key] = proxy.kwargs[key]
+         return TaskExecutionError(proxy.tb, **kwargs)
+
+
+ class SlurmTask(BaseModel):
+     model_config = ConfigDict(arbitrary_types_allowed=True)
+     component: str
+     workdir_local: Path
+     workdir_remote: Path
+     parameters: dict[str, Any]
+     zarr_url: Optional[str] = None
+     task_files: TaskFiles
+     index: int
+
+     @property
+     def input_pickle_file_local(self) -> str:
+         return (
+             self.workdir_local / f"{self.component}-input.pickle"
+         ).as_posix()
+
+     @property
+     def output_pickle_file_local(self) -> str:
+         return (
+             self.workdir_local / f"{self.component}-output.pickle"
+         ).as_posix()
+
+     @property
+     def input_pickle_file_remote(self) -> str:
+         return (
+             self.workdir_remote / f"{self.component}-input.pickle"
+         ).as_posix()
+
+     @property
+     def output_pickle_file_remote(self) -> str:
+         return (
+             self.workdir_remote / f"{self.component}-output.pickle"
+         ).as_posix()
+
+
+ class SlurmJob(BaseModel):
+     slurm_job_id: Optional[str] = None
+     label: str
+     workdir_local: Path
+     workdir_remote: Path
+     tasks: list[SlurmTask]
+
+     @property
+     def slurm_log_file_local(self) -> str:
+         if self.slurm_job_id:
+             return (
+                 self.workdir_local
+                 / f"slurm-{self.label}-{self.slurm_job_id}.log"
+             ).as_posix()
+         else:
+             return (
+                 self.workdir_local / f"slurm-{self.label}-%j.log"
+             ).as_posix()
+
+     @property
+     def slurm_log_file_remote(self) -> str:
+         if self.slurm_job_id:
+             return (
+                 self.workdir_remote
+                 / f"slurm-{self.label}-{self.slurm_job_id}.log"
+             ).as_posix()
+         else:
+             return (
+                 self.workdir_remote / f"slurm-{self.label}-%j.log"
+             ).as_posix()
+
+     @property
+     def slurm_submission_script_local(self) -> str:
+         return (
+             self.workdir_local / f"slurm-{self.label}-submit.sh"
+         ).as_posix()
+
+     @property
+     def slurm_submission_script_remote(self) -> str:
+         return (
+             self.workdir_remote / f"slurm-{self.label}-submit.sh"
+         ).as_posix()
+
+     @property
+     def slurm_stdout(self) -> str:
+         return (self.workdir_remote / f"slurm-{self.label}.out").as_posix()
+
+     @property
+     def slurm_stderr(self) -> str:
+         return (self.workdir_remote / f"slurm-{self.label}.err").as_posix()
+
+     @property
+     def log_files_local(self) -> list[str]:
+         return [task.task_files.log_file_local for task in self.tasks]
+
+
+ # def _subprocess_run_or_raise(
+ #     full_command: str,
+ # ) -> Optional[subprocess.CompletedProcess]:
+ #     try:
+ #         output = subprocess.run(  # nosec
+ #             shlex.split(full_command),
+ #             capture_output=True,
+ #             check=True,
+ #             encoding="utf-8",
+ #         )
+ #         return output
+ #     except subprocess.CalledProcessError as e:
+ #         error_msg = (
+ #             f"Submit command `{full_command}` failed. "
+ #             f"Original error:\n{str(e)}\n"
+ #             f"Original stdout:\n{e.stdout}\n"
+ #             f"Original stderr:\n{e.stderr}\n"
+ #         )
+ #         logging.error(error_msg)
+ #         raise JobExecutionError(info=error_msg)
+
+
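+ # Runner that executes Fractal tasks on a SLURM cluster over SSH: submission
+ # scripts and pickled inputs are prepared locally, transferred via
+ # `FractalSSH`, submitted with `sbatch --parsable`, and output files are
+ # fetched back and deserialized once jobs complete.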
+ class RunnerSlurmSSH(BaseRunner):
+     fractal_ssh: FractalSSH
+
+     slurm_user: str
+     shutdown_file: Path
+     common_script_lines: list[str]
+     user_cache_dir: str
+     root_dir_local: Path
+     root_dir_remote: Path
+     slurm_account: Optional[str] = None
+     slurm_poll_interval: int
+     python_worker_interpreter: str
+     jobs: dict[str, SlurmJob]
+
+     def __init__(
+         self,
+         *,
+         fractal_ssh: FractalSSH,
+         slurm_user: str,
+         root_dir_local: Path,
+         root_dir_remote: Path,
+         slurm_account: Optional[str] = None,
+         common_script_lines: Optional[list[str]] = None,
+         user_cache_dir: Optional[str] = None,
+         slurm_poll_interval: Optional[int] = None,
+     ) -> None:
+         """
+         Set parameters that are the same for different Fractal tasks and for
+         different SLURM jobs/tasks.
+         """
+
+         self.slurm_user = slurm_user
+         self.slurm_account = slurm_account
+         self.common_script_lines = common_script_lines or []
+
+         # Check that the SLURM account is not set in `common_script_lines`
+         # FIXME: move to a small helper method
+         try:
+             invalid_line = next(
+                 line
+                 for line in self.common_script_lines
+                 if line.startswith("#SBATCH --account=")
+             )
+             raise RuntimeError(
+                 "Invalid line in `RunnerSlurmSSH.common_script_lines`: "
+                 f"'{invalid_line}'.\n"
+                 "SLURM account must be set via the request body of the "
+                 "apply-workflow endpoint, or by modifying the user properties."
+             )
+         except StopIteration:
+             pass
+
+         # Check Python versions
+         settings = Inject(get_settings)
+         self.fractal_ssh = fractal_ssh
+         logger.warning(self.fractal_ssh)
+
+         # This is the new handshake
+         if settings.FRACTAL_SLURM_WORKER_PYTHON is not None:
+             self.check_remote_python_interpreter()
+
+         # Initialize connection and perform handshake
+         self.root_dir_local = root_dir_local
+         self.root_dir_remote = root_dir_remote
+
+         # # Create folders
+         # original_umask = os.umask(0)
+         # self.root_dir_local.mkdir(parents=True, exist_ok=True, mode=0o755)
+         # os.umask(original_umask)
+         # _mkdir_as_user(
+         #     folder=self.root_dir_remote.as_posix(),
+         #     user=self.slurm_user,
+         # )
+
+         self.user_cache_dir = user_cache_dir
+
+         self.slurm_poll_interval = (
+             slurm_poll_interval or settings.FRACTAL_SLURM_POLL_INTERVAL
+         )
+
+         self.shutdown_file = self.root_dir_local / SHUTDOWN_FILENAME
+
+         self.python_worker_interpreter = (
+             settings.FRACTAL_SLURM_WORKER_PYTHON or sys.executable
+         )
+
+         self.jobs = {}
+
+     def __enter__(self):
+         return self
+
+     def __exit__(self, exc_type, exc_val, exc_tb):
+         return False
+
+     def is_shutdown(self) -> bool:
+         return self.shutdown_file.exists()
+
+     def scancel_jobs(self) -> None:
+         logger.debug("[scancel_jobs] START")
+
+         if self.jobs:
+             scancel_string = " ".join(self.job_ids)
+             scancel_cmd = f"scancel {scancel_string}"
+             logger.warning(f"Now scancel-ing SLURM jobs {scancel_string}")
+             try:
+                 self.fractal_ssh.run_command(cmd=scancel_cmd)
+                 # _run_command_as_user(
+                 #     cmd=scancel_cmd,
+                 #     user=self.slurm_user,
+                 #     check=True,
+                 # )
+             except RuntimeError as e:
+                 logger.warning(
+                     "[scancel_jobs] `scancel` command failed. "
+                     f"Original error:\n{str(e)}"
+                 )
+
+         logger.debug("[scancel_jobs] END")
+
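+     # The sbatch script generated below runs one
+     # `srun ... -m fractal_server.app.runner.executors.slurm_common.remote`
+     # line per task; each line reads a cloudpickled
+     # `(versions, func, args, kwargs)` input pickle and writes the
+     # corresponding output pickle.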
+     def _submit_single_sbatch(
+         self,
+         func,
+         slurm_job: SlurmJob,
+         slurm_config: SlurmConfig,
+     ) -> None:
+         # Prepare input pickle(s)
+         versions = dict(
+             python=sys.version_info[:3],
+             cloudpickle=cloudpickle.__version__,
+             fractal_server=__VERSION__,
+         )
+         for task in slurm_job.tasks:
+             _args = []
+             _kwargs = dict(parameters=task.parameters)
+             funcser = cloudpickle.dumps((versions, func, _args, _kwargs))
+             with open(task.input_pickle_file_local, "wb") as f:
+                 f.write(funcser)
+         # Prepare commands to be included in SLURM submission script
+         settings = Inject(get_settings)
+         python_worker_interpreter = (
+             settings.FRACTAL_SLURM_WORKER_PYTHON or sys.executable
+         )
+         cmdlines = []
+         for task in slurm_job.tasks:
+             input_pickle_file = task.input_pickle_file_local
+             output_pickle_file = task.output_pickle_file_remote
+             cmdlines.append(
+                 (
+                     f"{python_worker_interpreter}"
+                     " -m fractal_server.app.runner."
+                     "executors.slurm_common.remote "
+                     f"--input-file {input_pickle_file} "
+                     f"--output-file {output_pickle_file}"
+                 )
+             )
+
+         # ...
+         num_tasks_max_running = slurm_config.parallel_tasks_per_job
+         mem_per_task_MB = slurm_config.mem_per_task_MB
+
+         # Set ntasks
+         ntasks = min(len(cmdlines), num_tasks_max_running)
+         slurm_config.parallel_tasks_per_job = ntasks
+
+         # Prepare SLURM preamble based on SlurmConfig object
+         script_lines = slurm_config.to_sbatch_preamble(
+             remote_export_dir=self.user_cache_dir
+         )
+
+         # Extend SLURM preamble with variables that are not in SlurmConfig,
+         # and fix their order
+         script_lines.extend(
+             [
+                 f"#SBATCH --err={slurm_job.slurm_stderr}",
+                 f"#SBATCH --out={slurm_job.slurm_stdout}",
+                 f"#SBATCH -D {slurm_job.workdir_remote}",
+             ]
+         )
+         script_lines = slurm_config.sort_script_lines(script_lines)
+         logger.debug(script_lines)
+
+         # Always print output of `uname -n` and `pwd`
+         script_lines.append(
+             'echo "Hostname: `uname -n`; current directory: `pwd`"\n'
+         )
+
+         # Complete script preamble
+         script_lines.append("\n")
+
+         # Include command lines
+         tmp_list_commands = copy(cmdlines)
+         while tmp_list_commands:
+             cmd = tmp_list_commands.pop(0)  # take first element
+             script_lines.append(
+                 "srun --ntasks=1 --cpus-per-task=$SLURM_CPUS_PER_TASK "
+                 f"--mem={mem_per_task_MB}MB "
+                 f"{cmd} &"
+             )
+         script_lines.append("wait\n")
+
+         script = "\n".join(script_lines)
+
+         # Write submission script
+         # submission_script_contents = "\n".join(preamble_lines + cmdlines)
+         with open(slurm_job.slurm_submission_script_local, "w") as f:
+             f.write(script)
+
+         self.fractal_ssh.send_file(
+             local=slurm_job.slurm_submission_script_local,
+             remote=slurm_job.slurm_submission_script_remote,
+         )
+
+         # Run sbatch
+         submit_command = (
+             f"sbatch --parsable {slurm_job.slurm_submission_script_remote}"
+         )
+         pre_submission_cmds = slurm_config.pre_submission_commands
+         if len(pre_submission_cmds) == 0:
+             sbatch_stdout = self.fractal_ssh.run_command(cmd=submit_command)
+         else:
+             logger.debug(f"Now using {pre_submission_cmds=}")
+             script_lines = pre_submission_cmds + [submit_command]
+             script_content = "\n".join(script_lines)
+             script_content = f"{script_content}\n"
+             script_path_remote = (
+                 f"{slurm_job.slurm_submission_script_remote}_wrapper.sh"
+             )
+             self.fractal_ssh.write_remote_file(
+                 path=script_path_remote, content=script_content
+             )
+             cmd = f"bash {script_path_remote}"
+             sbatch_stdout = self.fractal_ssh.run_command(cmd=cmd)
+
+         # Retrieve SLURM job ID from `sbatch` stdout
+         stdout = sbatch_stdout.strip("\n")
+         submitted_job_id = int(stdout)
+         slurm_job.slurm_job_id = str(submitted_job_id)
+
+         # Add job to self.jobs
+         self.jobs[slurm_job.slurm_job_id] = slurm_job
+         logger.debug(f"Added {slurm_job.slurm_job_id} to self.jobs.")
+
+     @property
+     def job_ids(self) -> list[str]:
+         return list(self.jobs.keys())
+
+     def _copy_files_from_remote_to_local(self, job: SlurmJob) -> None:
+         # FIXME: This should only transfer archives, not single files
+         """
+         Note: this would differ for SSH
+         """
+         source_target_list = [
+             (job.slurm_log_file_remote, job.slurm_log_file_local)
+         ]
+         for task in job.tasks:
+             source_target_list.extend(
+                 [
+                     (
+                         task.output_pickle_file_remote,
+                         task.output_pickle_file_local,
+                     ),
+                     (
+                         task.task_files.log_file_remote,
+                         task.task_files.log_file_local,
+                     ),
+                     (
+                         task.task_files.args_file_remote,
+                         task.task_files.args_file_local,
+                     ),
+                     (
+                         task.task_files.metadiff_file_remote,
+                         task.task_files.metadiff_file_local,
+                     ),
+                 ]
+             )
+
+         for source, target in source_target_list:
+             try:
+                 self.fractal_ssh.fetch_file(local=target, remote=source)
+                 # res = _run_command_as_user(
+                 #     cmd=f"cat {source}",
+                 #     user=self.slurm_user,
+                 #     encoding=None,
+                 #     check=True,
+                 # )
+                 # # Write local file
+                 # with open(target, "wb") as f:
+                 #     f.write(res.stdout)
+                 # logger.critical(f"Copied {source} into {target}")
+             except (RuntimeError, FileNotFoundError) as e:
+                 logger.warning(
+                     f"SKIP copy {source} into {target}. "
+                     f"Original error: {str(e)}"
+                 )
+
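+     # Each output pickle holds a `(success, output)` pair: on success,
+     # `output` is the task result; on failure, it is an exception proxy that
+     # `_handle_exception_proxy` converts into a JobExecutionError or
+     # TaskExecutionError. Both pickle files are removed once processed.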
+     def _postprocess_single_task(
+         self, *, task: SlurmTask
+     ) -> tuple[Any, Exception]:
+         try:
+             with open(task.output_pickle_file_local, "rb") as f:
+                 outdata = f.read()
+             success, output = cloudpickle.loads(outdata)
+             if success:
+                 result = output
+                 return result, None
+             else:
+                 exception = _handle_exception_proxy(output)
+                 return None, exception
+         except Exception as e:
+             exception = JobExecutionError(f"ERROR, {str(e)}")
+             return None, exception
+         finally:
+             Path(task.input_pickle_file_local).unlink(missing_ok=True)
+             Path(task.output_pickle_file_local).unlink(missing_ok=True)
+
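+     # `submit` runs a single task as a one-task SLURM job and blocks,
+     # polling `get_finished_jobs_ssh` every `slurm_poll_interval` seconds,
+     # until the job has been retrieved and postprocessed.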
+     def submit(
+         self,
+         func: callable,
+         parameters: dict[str, Any],
+         history_item_id: int,
+         task_files: TaskFiles,
+         slurm_config: SlurmConfig,
+         task_type: TaskTypeType,
+     ) -> tuple[Any, Exception]:
+         workdir_local = task_files.wftask_subfolder_local
+         workdir_remote = task_files.wftask_subfolder_remote
+
+         task_files = TaskFiles(
+             **task_files.model_dump(
+                 exclude={"component"},
+             ),
+             # FIXME: _COMPONENT_KEY_ is deprecated
+             component="FIXME_INVALID_FAKE_VALUE",
+             # component=parameters[_COMPONENT_KEY_],
+         )
+
+         if self.jobs != {}:
+             raise JobExecutionError("Unexpected branch: jobs should be empty.")
+
+         if self.is_shutdown():
+             raise JobExecutionError("Cannot continue after shutdown.")
+
+         # Validation phase
+         self.validate_submit_parameters(
+             parameters=parameters,
+             task_type=task_type,
+         )
+
+         # Create task subfolder
+         workdir_local.mkdir(parents=True)
+         self.fractal_ssh.mkdir(
+             folder=workdir_remote.as_posix(),
+             parents=True,
+         )
+
+         # Submission phase
+         slurm_job = SlurmJob(
+             label="0",
+             workdir_local=workdir_local,
+             workdir_remote=workdir_remote,
+             tasks=[
+                 SlurmTask(
+                     index=0,
+                     component="0",
+                     parameters=parameters,
+                     workdir_remote=workdir_remote,
+                     workdir_local=workdir_local,
+                     task_files=task_files,
+                 )
+             ],
+         )  # TODO: replace with actual values (BASED ON TASKFILES)
+
+         slurm_config.parallel_tasks_per_job = 1
+         self._submit_single_sbatch(
+             func,
+             slurm_job=slurm_job,
+             slurm_config=slurm_config,
+         )
+
+         # Retrieval phase
+         while len(self.jobs) > 0:
+             if self.is_shutdown():
+                 self.scancel_jobs()
+             finished_job_ids = get_finished_jobs_ssh(
+                 job_ids=self.job_ids,
+                 fractal_ssh=self.fractal_ssh,
+             )
+             for slurm_job_id in finished_job_ids:
+                 slurm_job = self.jobs.pop(slurm_job_id)
+                 self._copy_files_from_remote_to_local(slurm_job)
+                 result, exception = self._postprocess_single_task(
+                     task=slurm_job.tasks[0]
+                 )
+             time.sleep(self.slurm_poll_interval)
+
+         return result, exception
+
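+     # `multisubmit` is the parallel counterpart of `submit`: the parameter
+     # list is split into batches via `heuristics`, each batch becomes one
+     # SLURM job, and results/exceptions are collected per task index.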
+     def multisubmit(
+         self,
+         func: callable,
+         list_parameters: list[dict],
+         history_item_id: int,
+         task_files: TaskFiles,
+         slurm_config: SlurmConfig,
+         task_type: TaskTypeType,
+     ):
+         # self.scancel_jobs()
+
+         self.validate_multisubmit_parameters(
+             list_parameters=list_parameters,
+             task_type=task_type,
+         )
+
+         workdir_local = task_files.wftask_subfolder_local
+         workdir_remote = task_files.wftask_subfolder_remote
+
+         # Create local & remote task subfolders
+         if task_type not in ["compound", "converter_compound"]:
+             workdir_local.mkdir(parents=True)
+             self.fractal_ssh.mkdir(
+                 folder=workdir_remote.as_posix(),
+                 parents=True,
+             )
+
+         # Execute tasks, in chunks of size `parallel_tasks_per_job`
+         # TODO Pick a data structure for results and exceptions, or review
+         # the interface
+         results: dict[int, Any] = {}
+         exceptions: dict[int, BaseException] = {}
+
+         original_task_files = task_files
+         tot_tasks = len(list_parameters)
+
+         # Set/validate parameters for task batching
+         tasks_per_job, parallel_tasks_per_job = heuristics(
+             # Number of parallel components (always known)
+             tot_tasks=tot_tasks,
+             # Optional WorkflowTask attributes:
+             tasks_per_job=slurm_config.tasks_per_job,
+             parallel_tasks_per_job=slurm_config.parallel_tasks_per_job,  # noqa
+             # Task requirements (multiple possible sources):
+             cpus_per_task=slurm_config.cpus_per_task,
+             mem_per_task=slurm_config.mem_per_task_MB,
+             # Fractal configuration variables (soft/hard limits):
+             target_cpus_per_job=slurm_config.target_cpus_per_job,
+             target_mem_per_job=slurm_config.target_mem_per_job,
+             target_num_jobs=slurm_config.target_num_jobs,
+             max_cpus_per_job=slurm_config.max_cpus_per_job,
+             max_mem_per_job=slurm_config.max_mem_per_job,
+             max_num_jobs=slurm_config.max_num_jobs,
+         )
+         slurm_config.parallel_tasks_per_job = parallel_tasks_per_job
+         slurm_config.tasks_per_job = tasks_per_job
+
+         # Divide arguments in batches of `tasks_per_job` tasks each
+         args_batches = []
+         batch_size = tasks_per_job
+         for ind_chunk in range(0, tot_tasks, batch_size):
+             args_batches.append(
+                 list_parameters[ind_chunk : ind_chunk + batch_size]  # noqa
+             )
+         if len(args_batches) != math.ceil(tot_tasks / tasks_per_job):
+             raise RuntimeError("Something wrong here while batching tasks")
+
+         logger.info(f"START submission phase, {list(self.jobs.keys())=}")
+         for ind_batch, chunk in enumerate(args_batches):
+             # TODO: replace with actual values
+             tasks = []
+             for ind_chunk, parameters in enumerate(chunk):
+                 # FIXME: _COMPONENT_KEY_ is deprecated
+                 # component = parameters[_COMPONENT_KEY_]
+                 component = "INVALID_FAKE_VALUE_FIXME"
+                 tasks.append(
+                     SlurmTask(
+                         index=(ind_batch * batch_size) + ind_chunk,
+                         component=component,
+                         workdir_local=workdir_local,
+                         workdir_remote=workdir_remote,
+                         parameters=parameters,
+                         zarr_url=parameters["zarr_url"],
+                         task_files=TaskFiles(
+                             **original_task_files.model_dump(
+                                 exclude={"component"}
+                             ),
+                             component=component,
+                         ),
+                     )
+                 )
+
+             slurm_job = SlurmJob(
+                 label=f"{ind_batch:06d}",
+                 workdir_local=workdir_local,
+                 workdir_remote=workdir_remote,
+                 tasks=tasks,
+             )
+             self._submit_single_sbatch(
+                 func,
+                 slurm_job=slurm_job,
+                 slurm_config=slurm_config,
+             )
+         logger.info(f"END submission phase, {list(self.jobs.keys())=}")
+
+         # Retrieval phase
+         while len(self.jobs) > 0:
+             if self.is_shutdown():
+                 self.scancel_jobs()
+             finished_job_ids = get_finished_jobs_ssh(
+                 job_ids=self.job_ids,
+                 fractal_ssh=self.fractal_ssh,
+             )
+             for slurm_job_id in finished_job_ids:
+                 slurm_job = self.jobs.pop(slurm_job_id)
+                 self._copy_files_from_remote_to_local(slurm_job)
+                 for task in slurm_job.tasks:
+                     result, exception = self._postprocess_single_task(
+                         task=task
+                     )
+                     if exception is None:
+                         results[task.index] = result
+                     else:
+                         exceptions[task.index] = exception
+             time.sleep(self.slurm_poll_interval)
+         return results, exceptions
+
+     def check_remote_python_interpreter(self):
+         settings = Inject(get_settings)
+         cmd = (
+             f"{self.python_worker_interpreter} "
+             "-m fractal_server.app.runner.versions"
+         )
+         stdout = self.fractal_ssh.run_command(cmd=cmd)
+         remote_version = json.loads(stdout.strip("\n"))["fractal_server"]
+         if remote_version != __VERSION__:
+             error_msg = (
+                 "Fractal-server version mismatch.\n"
+                 "Local interpreter "
+                 f"({sys.executable}): {__VERSION__}.\n"
+                 "Remote interpreter "
+                 f"({settings.FRACTAL_SLURM_WORKER_PYTHON}): {remote_version}."
+             )
+             logger.error(error_msg)
+             raise RuntimeError(error_msg)
+ raise RuntimeError(error_msg)