fractal-server 2.14.0a8__py3-none-any.whl → 2.14.0a10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (29)
  1. fractal_server/__init__.py +1 -1
  2. fractal_server/app/models/v2/dataset.py +0 -10
  3. fractal_server/app/models/v2/job.py +3 -0
  4. fractal_server/app/routes/api/v2/__init__.py +2 -0
  5. fractal_server/app/routes/api/v2/history.py +14 -9
  6. fractal_server/app/routes/api/v2/images.py +5 -2
  7. fractal_server/app/routes/api/v2/submit.py +16 -14
  8. fractal_server/app/routes/api/v2/verify_image_types.py +64 -0
  9. fractal_server/app/routes/api/v2/workflow.py +27 -60
  10. fractal_server/app/runner/executors/slurm_ssh/_check_job_status_ssh.py +67 -0
  11. fractal_server/app/runner/executors/slurm_ssh/runner.py +711 -0
  12. fractal_server/app/runner/executors/slurm_sudo/runner.py +76 -30
  13. fractal_server/app/runner/v2/__init__.py +1 -0
  14. fractal_server/app/runner/v2/_local.py +2 -0
  15. fractal_server/app/runner/v2/_slurm_ssh.py +2 -0
  16. fractal_server/app/runner/v2/_slurm_sudo.py +2 -0
  17. fractal_server/app/runner/v2/runner.py +6 -8
  18. fractal_server/app/runner/v2/runner_functions.py +9 -4
  19. fractal_server/app/schemas/v2/dataset.py +4 -71
  20. fractal_server/app/schemas/v2/dumps.py +6 -5
  21. fractal_server/app/schemas/v2/job.py +6 -3
  22. fractal_server/migrations/versions/47351f8c7ebc_drop_dataset_filters.py +50 -0
  23. fractal_server/migrations/versions/e81103413827_add_job_type_filters.py +36 -0
  24. {fractal_server-2.14.0a8.dist-info → fractal_server-2.14.0a10.dist-info}/METADATA +1 -1
  25. {fractal_server-2.14.0a8.dist-info → fractal_server-2.14.0a10.dist-info}/RECORD +29 -24
  26. /fractal_server/app/runner/executors/{slurm_sudo → slurm_common}/_check_jobs_status.py +0 -0
  27. {fractal_server-2.14.0a8.dist-info → fractal_server-2.14.0a10.dist-info}/LICENSE +0 -0
  28. {fractal_server-2.14.0a8.dist-info → fractal_server-2.14.0a10.dist-info}/WHEEL +0 -0
  29. {fractal_server-2.14.0a8.dist-info → fractal_server-2.14.0a10.dist-info}/entry_points.txt +0 -0
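The headline change in this release pair is the new SSH-based SLURM executor, `fractal_server/app/runner/executors/slurm_ssh/runner.py` (711 added lines), reproduced in full below; the remaining entries are smaller API, runner, schema, and migration updates. To reproduce this file-level comparison locally, a minimal sketch along the following lines is enough, using only the Python standard library. The wheel filenames are assumptions; download both wheels first (for example with `pip download fractal-server==2.14.0a8 --no-deps`).

import difflib
import zipfile

OLD_WHEEL = "fractal_server-2.14.0a8-py3-none-any.whl"
NEW_WHEEL = "fractal_server-2.14.0a10-py3-none-any.whl"


def wheel_py_files(path: str) -> dict[str, list[str]]:
    # Map each .py member of the wheel to its decoded lines.
    with zipfile.ZipFile(path) as zf:
        return {
            name: zf.read(name).decode("utf-8").splitlines(keepends=True)
            for name in zf.namelist()
            if name.endswith(".py")
        }


old, new = wheel_py_files(OLD_WHEEL), wheel_py_files(NEW_WHEEL)
for name in sorted(set(old) | set(new)):
    diff = difflib.unified_diff(
        old.get(name, []), new.get(name, []), fromfile=name, tofile=name
    )
    print("".join(diff), end="")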
fractal_server/app/runner/executors/slurm_ssh/runner.py (new file)
@@ -0,0 +1,711 @@
+ import json
+ import math
+ import sys
+ import time
+ from copy import copy
+ from pathlib import Path
+ from typing import Any
+ from typing import Optional
+
+ import cloudpickle
+ from pydantic import BaseModel
+ from pydantic import ConfigDict
+
+ from ._check_job_status_ssh import get_finished_jobs_ssh
+ from fractal_server import __VERSION__
+ from fractal_server.app.runner.components import _COMPONENT_KEY_
+ from fractal_server.app.runner.exceptions import JobExecutionError
+ from fractal_server.app.runner.exceptions import TaskExecutionError
+ from fractal_server.app.runner.executors.base_runner import BaseRunner
+ from fractal_server.app.runner.executors.slurm_common._batching import (
+     heuristics,
+ )
+ from fractal_server.app.runner.executors.slurm_common._slurm_config import (
+     SlurmConfig,
+ )
+ from fractal_server.app.runner.filenames import SHUTDOWN_FILENAME
+ from fractal_server.app.runner.task_files import TaskFiles
+ from fractal_server.app.schemas.v2.task import TaskTypeType
+ from fractal_server.config import get_settings
+ from fractal_server.logger import set_logger
+ from fractal_server.ssh._fabric import FractalSSH
+ from fractal_server.syringe import Inject
+
+ # from fractal_server.app.history import ImageStatus
+ # from fractal_server.app.history import update_all_images
+ # from fractal_server.app.history import update_single_image
+ # from fractal_server.app.history import update_single_image_logfile
+
+
+ logger = set_logger(__name__)
+
+
+ def _handle_exception_proxy(proxy):  # FIXME
+     if proxy.exc_type_name == "JobExecutionError":
+         return JobExecutionError(str(proxy))
+     else:
+         kwargs = {}
+         for key in [
+             "workflow_task_id",
+             "workflow_task_order",
+             "task_name",
+         ]:
+             if key in proxy.kwargs.keys():
+                 kwargs[key] = proxy.kwargs[key]
+         return TaskExecutionError(proxy.tb, **kwargs)
+
+
+ class SlurmTask(BaseModel):
+     model_config = ConfigDict(arbitrary_types_allowed=True)
+     component: str
+     workdir_local: Path
+     workdir_remote: Path
+     parameters: dict[str, Any]
+     zarr_url: Optional[str] = None
+     task_files: TaskFiles
+     index: int
+
+     @property
+     def input_pickle_file_local(self) -> str:
+         return (
+             self.workdir_local / f"{self.component}-input.pickle"
+         ).as_posix()
+
+     @property
+     def output_pickle_file_local(self) -> str:
+         return (
+             self.workdir_local / f"{self.component}-output.pickle"
+         ).as_posix()
+
+     @property
+     def input_pickle_file_remote(self) -> str:
+         return (
+             self.workdir_remote / f"{self.component}-input.pickle"
+         ).as_posix()
+
+     @property
+     def output_pickle_file_remote(self) -> str:
+         return (
+             self.workdir_remote / f"{self.component}-output.pickle"
+         ).as_posix()
+
+
+ class SlurmJob(BaseModel):
+     slurm_job_id: Optional[str] = None
+     label: str
+     workdir_local: Path
+     workdir_remote: Path
+     tasks: list[SlurmTask]
+
+     @property
+     def slurm_log_file_local(self) -> str:
+         if self.slurm_job_id:
+             return (
+                 self.workdir_local
+                 / f"slurm-{self.label}-{self.slurm_job_id}.log"
+             ).as_posix()
+         else:
+             return (
+                 self.workdir_local / f"slurm-{self.label}-%j.log"
+             ).as_posix()
+
+     @property
+     def slurm_log_file_remote(self) -> str:
+         if self.slurm_job_id:
+             return (
+                 self.workdir_remote
+                 / f"slurm-{self.label}-{self.slurm_job_id}.log"
+             ).as_posix()
+         else:
+             return (
+                 self.workdir_remote / f"slurm-{self.label}-%j.log"
+             ).as_posix()
+
+     @property
+     def slurm_submission_script_local(self) -> str:
+         return (
+             self.workdir_local / f"slurm-{self.label}-submit.sh"
+         ).as_posix()
+
+     @property
+     def slurm_submission_script_remote(self) -> str:
+         return (
+             self.workdir_remote / f"slurm-{self.label}-submit.sh"
+         ).as_posix()
+
+     @property
+     def slurm_stdout(self) -> str:
+         return (self.workdir_remote / f"slurm-{self.label}.out").as_posix()
+
+     @property
+     def slurm_stderr(self) -> str:
+         return (self.workdir_remote / f"slurm-{self.label}.err").as_posix()
+
+     @property
+     def log_files_local(self) -> list[str]:
+         return [task.task_files.log_file_local for task in self.tasks]
+
+
+ # def _subprocess_run_or_raise(
+ #     full_command: str,
+ # ) -> Optional[subprocess.CompletedProcess]:
+ #     try:
+ #         output = subprocess.run(  # nosec
+ #             shlex.split(full_command),
+ #             capture_output=True,
+ #             check=True,
+ #             encoding="utf-8",
+ #         )
+ #         return output
+ #     except subprocess.CalledProcessError as e:
+ #         error_msg = (
+ #             f"Submit command `{full_command}` failed. "
+ #             f"Original error:\n{str(e)}\n"
+ #             f"Original stdout:\n{e.stdout}\n"
+ #             f"Original stderr:\n{e.stderr}\n"
+ #         )
+ #         logging.error(error_msg)
+ #         raise JobExecutionError(info=error_msg)
+
+
+ class RunnerSlurmSSH(BaseRunner):
+     fractal_ssh: FractalSSH
+
+     slurm_user: str
+     shutdown_file: Path
+     common_script_lines: list[str]
+     user_cache_dir: str
+     root_dir_local: Path
+     root_dir_remote: Path
+     slurm_account: Optional[str] = None
+     poll_interval: int
+     python_worker_interpreter: str
+     jobs: dict[str, SlurmJob]
+
+     def __init__(
+         self,
+         *,
+         fractal_ssh: FractalSSH,
+         slurm_user: str,
+         root_dir_local: Path,
+         root_dir_remote: Path,
+         slurm_account: Optional[str] = None,
+         common_script_lines: Optional[list[str]] = None,
+         user_cache_dir: Optional[str] = None,
+         slurm_poll_interval: Optional[int] = None,
+     ) -> None:
+         """
+         Set parameters that are the same for different Fractal tasks and for
+         different SLURM jobs/tasks.
+         """
+
+         self.slurm_user = slurm_user
+         self.slurm_account = slurm_account
+         self.common_script_lines = common_script_lines or []
+
+         # Check that SLURM account is not set here
+         # FIXME: move to little method
+         try:
+             invalid_line = next(
+                 line
+                 for line in self.common_script_lines
+                 if line.startswith("#SBATCH --account=")
+             )
+             raise RuntimeError(
+                 "Invalid line in `RunnerSlurmSSH.common_script_lines`: "
+                 f"'{invalid_line}'.\n"
+                 "SLURM account must be set via the request body of the "
+                 "apply-workflow endpoint, or by modifying the user properties."
+             )
+         except StopIteration:
+             pass
+
+         # Check Python versions
+         settings = Inject(get_settings)
+         self.fractal_ssh = fractal_ssh
+         logger.warning(self.fractal_ssh)
+
+         # It is the new handshake
+         if settings.FRACTAL_SLURM_WORKER_PYTHON is not None:
+             self.check_remote_python_interpreter()
+
+         # Initialize connection and perform handshake
+         self.root_dir_local = root_dir_local
+         self.root_dir_remote = root_dir_remote
+
+         # # Create folders
+         # original_umask = os.umask(0)
+         # self.root_dir_local.mkdir(parents=True, exist_ok=True, mode=0o755)
+         # os.umask(original_umask)
+         # _mkdir_as_user(
+         #     folder=self.root_dir_remote.as_posix(),
+         #     user=self.slurm_user,
+         # )
+
+         self.user_cache_dir = user_cache_dir
+
+         self.slurm_poll_interval = (
+             slurm_poll_interval or settings.FRACTAL_SLURM_POLL_INTERVAL
+         )
+
+         self.shutdown_file = self.root_dir_local / SHUTDOWN_FILENAME
+
+         self.python_worker_interpreter = (
+             settings.FRACTAL_SLURM_WORKER_PYTHON or sys.executable
+         )
+
+         self.jobs = {}
+
+     def __enter__(self):
+         return self
+
+     def __exit__(self, exc_type, exc_val, exc_tb):
+         return False
+
+     def is_shutdown(self) -> bool:
+         return self.shutdown_file.exists()
+
+     def scancel_jobs(self) -> None:
+         logger.debug("[scancel_jobs] START")
+
+         if self.jobs:
+             scancel_string = " ".join(self.job_ids)
+             scancel_cmd = f"scancel {scancel_string}"
+             logger.warning(f"Now scancel-ing SLURM jobs {scancel_string}")
+             try:
+                 self.fractal_ssh.run_command(cmd=scancel_cmd)
+                 # _run_command_as_user(
+                 #     cmd=scancel_cmd,
+                 #     user=self.slurm_user,
+                 #     check=True,
+                 # )
+             except RuntimeError as e:
+                 logger.warning(
+                     "[scancel_jobs] `scancel` command failed. "
+                     f"Original error:\n{str(e)}"
+                 )
+
+         logger.debug("[scancel_jobs] END")
+
+     def _submit_single_sbatch(
+         self,
+         func,
+         slurm_job: SlurmJob,
+         slurm_config: SlurmConfig,
+     ) -> str:
+         # Prepare input pickle(s)
+         versions = dict(
+             python=sys.version_info[:3],
+             cloudpickle=cloudpickle.__version__,
+             fractal_server=__VERSION__,
+         )
+         for task in slurm_job.tasks:
+             _args = []
+             _kwargs = dict(parameters=task.parameters)
+             funcser = cloudpickle.dumps((versions, func, _args, _kwargs))
+             with open(task.input_pickle_file_local, "wb") as f:
+                 f.write(funcser)
+         # Prepare commands to be included in SLURM submission script
+         settings = Inject(get_settings)
+         python_worker_interpreter = (
+             settings.FRACTAL_SLURM_WORKER_PYTHON or sys.executable
+         )
+         cmdlines = []
+         for task in slurm_job.tasks:
+             input_pickle_file = task.input_pickle_file_local
+             output_pickle_file = task.output_pickle_file_remote
+             cmdlines.append(
+                 (
+                     f"{python_worker_interpreter}"
+                     " -m fractal_server.app.runner."
+                     "executors.slurm_common.remote "
+                     f"--input-file {input_pickle_file} "
+                     f"--output-file {output_pickle_file}"
+                 )
+             )
+
+         # ...
+         num_tasks_max_running = slurm_config.parallel_tasks_per_job
+         mem_per_task_MB = slurm_config.mem_per_task_MB
+
+         # Set ntasks
+         ntasks = min(len(cmdlines), num_tasks_max_running)
+         slurm_config.parallel_tasks_per_job = ntasks
+
+         # Prepare SLURM preamble based on SlurmConfig object
+         script_lines = slurm_config.to_sbatch_preamble(
+             remote_export_dir=self.user_cache_dir
+         )
+
+         # Extend SLURM preamble with variables which are not in SlurmConfig, and
+         # fix their order
+         script_lines.extend(
+             [
+                 f"#SBATCH --err={slurm_job.slurm_stderr}",
+                 f"#SBATCH --out={slurm_job.slurm_stdout}",
+                 f"#SBATCH -D {slurm_job.workdir_remote}",
+             ]
+         )
+         script_lines = slurm_config.sort_script_lines(script_lines)
+         logger.debug(script_lines)
+
+         # Always print output of `uname -n` and `pwd`
+         script_lines.append(
+             '"Hostname: `uname -n`; current directory: `pwd`"\n'
+         )
+
+         # Complete script preamble
+         script_lines.append("\n")
+
+         # Include command lines
+         tmp_list_commands = copy(cmdlines)
+         while tmp_list_commands:
+             if tmp_list_commands:
+                 cmd = tmp_list_commands.pop(0)  # take first element
+                 script_lines.append(
+                     "srun --ntasks=1 --cpus-per-task=$SLURM_CPUS_PER_TASK "
+                     f"--mem={mem_per_task_MB}MB "
+                     f"{cmd} &"
+                 )
+         script_lines.append("wait\n")
+
+         script = "\n".join(script_lines)
+
+         # Write submission script
+         # submission_script_contents = "\n".join(preamble_lines + cmdlines)
+         with open(slurm_job.slurm_submission_script_local, "w") as f:
+             f.write(script)
+
+         self.fractal_ssh.send_file(
+             local=slurm_job.slurm_submission_script_local,
+             remote=slurm_job.slurm_submission_script_remote,
+         )
+
+         # Run sbatch
+         submit_command = (
+             f"sbatch --parsable {slurm_job.slurm_submission_script_remote}"
+         )
+         pre_submission_cmds = slurm_config.pre_submission_commands
+         if len(pre_submission_cmds) == 0:
+             sbatch_stdout = self.fractal_ssh.run_command(cmd=submit_command)
+         else:
+             logger.debug(f"Now using {pre_submission_cmds=}")
+             script_lines = pre_submission_cmds + [submit_command]
+             script_content = "\n".join(script_lines)
+             script_content = f"{script_content}\n"
+             script_path_remote = (
+                 f"{slurm_job.slurm_script_remote.as_posix()}_wrapper.sh"
+             )
+             self.fractal_ssh.write_remote_file(
+                 path=script_path_remote, content=script_content
+             )
+             cmd = f"bash {script_path_remote}"
+             sbatch_stdout = self.fractal_ssh.run_command(cmd=cmd)
+
+         # Submit SLURM job and retrieve job ID
+         stdout = sbatch_stdout.strip("\n")
+         submitted_job_id = int(stdout)
+         slurm_job.slurm_job_id = str(submitted_job_id)
+
+         # Add job to self.jobs
+         self.jobs[slurm_job.slurm_job_id] = slurm_job
+         logger.debug(f"Added {slurm_job.slurm_job_id} to self.jobs.")
+
+     @property
+     def job_ids(self) -> list[str]:
+         return list(self.jobs.keys())
+
+     def _copy_files_from_remote_to_local(self, job: SlurmJob) -> None:
+         # FIXME: This should only transfer archives, not single files
+         """
+         Note: this would differ for SSH
+         """
+         source_target_list = [
+             (job.slurm_log_file_remote, job.slurm_log_file_local)
+         ]
+         for task in job.tasks:
+             source_target_list.extend(
+                 [
+                     (
+                         task.output_pickle_file_remote,
+                         task.output_pickle_file_local,
+                     ),
+                     (
+                         task.task_files.log_file_remote,
+                         task.task_files.log_file_local,
+                     ),
+                     (
+                         task.task_files.args_file_remote,
+                         task.task_files.args_file_local,
+                     ),
+                     (
+                         task.task_files.metadiff_file_remote,
+                         task.task_files.metadiff_file_local,
+                     ),
+                 ]
+             )
+
+         for source, target in source_target_list:
+             try:
+                 self.fractal_ssh.fetch_file(local=target, remote=source)
+                 # res = _run_command_as_user(
+                 #     cmd=f"cat {source}",
+                 #     user=self.slurm_user,
+                 #     encoding=None,
+                 #     check=True,
+                 # )
+                 # Write local file
+                 # with open(target, "wb") as f:
+                 #     f.write(res.stdout)
+                 # logger.critical(f"Copied {source} into {target}")
+             except (RuntimeError, FileNotFoundError) as e:
+                 logger.warning(
+                     f"SKIP copy {target} into {source}. "
+                     f"Original error: {str(e)}"
+                 )
+
+     def _postprocess_single_task(
+         self, *, task: SlurmTask
+     ) -> tuple[Any, Exception]:
+         try:
+             with open(task.output_pickle_file_local, "rb") as f:
+                 outdata = f.read()
+             success, output = cloudpickle.loads(outdata)
+             if success:
+                 result = output
+                 return result, None
+             else:
+                 exception = _handle_exception_proxy(output)
+                 return None, exception
+         except Exception as e:
+             exception = JobExecutionError(f"ERROR, {str(e)}")
+             return None, exception
+         finally:
+             Path(task.input_pickle_file_local).unlink(missing_ok=True)
+             Path(task.output_pickle_file_local).unlink(missing_ok=True)
+
+     def submit(
+         self,
+         func: callable,
+         parameters: dict[str, Any],
+         history_item_id: int,
+         task_files: TaskFiles,
+         slurm_config: SlurmConfig,
+         task_type: TaskTypeType,
+     ) -> tuple[Any, Exception]:
+         workdir_local = task_files.wftask_subfolder_local
+         workdir_remote = task_files.wftask_subfolder_remote
+
+         task_files = TaskFiles(
+             **task_files.model_dump(
+                 exclude={"component"},
+             ),
+             component=parameters[_COMPONENT_KEY_],
+         )
+
+         if self.jobs != {}:
+             raise JobExecutionError("Unexpected branch: jobs should be empty.")
+
+         if self.is_shutdown():
+             raise JobExecutionError("Cannot continue after shutdown.")
+
+         # Validation phase
+         self.validate_submit_parameters(
+             parameters=parameters,
+             task_type=task_type,
+         )
+
+         # Create task subfolder
+         workdir_local.mkdir(parents=True)
+         self.fractal_ssh.mkdir(
+             folder=workdir_remote.as_posix(),
+             parents=True,
+         )
+
+         # Submission phase
+         slurm_job = SlurmJob(
+             label="0",
+             workdir_local=workdir_local,
+             workdir_remote=workdir_remote,
+             tasks=[
+                 SlurmTask(
+                     index=0,
+                     component="0",
+                     parameters=parameters,
+                     workdir_remote=workdir_remote,
+                     workdir_local=workdir_local,
+                     task_files=task_files,
+                 )
+             ],
+         )  # TODO: replace with actual values (BASED ON TASKFILES)
+
+         slurm_config.parallel_tasks_per_job = 1
+         self._submit_single_sbatch(
+             func,
+             slurm_job=slurm_job,
+             slurm_config=slurm_config,
+         )
+
+         # LOGFILE = task_files.log_file_local
+
+         # Retrieval phase
+         while len(self.jobs) > 0:
+             if self.is_shutdown():
+                 self.scancel_jobs()
+             finished_job_ids = get_finished_jobs_ssh(
+                 job_ids=self.job_ids,
+                 fractal_ssh=self.fractal_ssh,
+             )
+             for slurm_job_id in finished_job_ids:
+                 slurm_job = self.jobs.pop(slurm_job_id)
+                 self._copy_files_from_remote_to_local(slurm_job)
+                 result, exception = self._postprocess_single_task(
+                     task=slurm_job.tasks[0]
+                 )
+             time.sleep(self.slurm_poll_interval)
+
+         return result, exception
+
+     def multisubmit(
+         self,
+         func: callable,
+         list_parameters: list[dict],
+         history_item_id: int,
+         task_files: TaskFiles,
+         slurm_config: SlurmConfig,
+         task_type: TaskTypeType,
+     ):
+         # self.scancel_jobs()
+
+         self.validate_multisubmit_parameters(
+             list_parameters=list_parameters,
+             task_type=task_type,
+         )
+
+         workdir_local = task_files.wftask_subfolder_local
+         workdir_remote = task_files.wftask_subfolder_remote
+
+         # Create local&remote task subfolders
+         if task_type not in ["compound", "converter_compound"]:
+             workdir_local.mkdir(parents=True)
+             self.fractal_ssh.mkdir(
+                 folder=workdir_remote.as_posix(),
+                 parents=True,
+             )
+
+         # Execute tasks, in chunks of size `parallel_tasks_per_job`
+         # TODO Pick a data structure for results and exceptions, or review the
+         # interface
+         results: dict[int, Any] = {}
+         exceptions: dict[int, BaseException] = {}
+
+         original_task_files = task_files
+         tot_tasks = len(list_parameters)
+
+         # Set/validate parameters for task batching
+         tasks_per_job, parallel_tasks_per_job = heuristics(
+             # Number of parallel components (always known)
+             tot_tasks=tot_tasks,
+             # Optional WorkflowTask attributes:
+             tasks_per_job=slurm_config.tasks_per_job,
+             parallel_tasks_per_job=slurm_config.parallel_tasks_per_job,  # noqa
+             # Task requirements (multiple possible sources):
+             cpus_per_task=slurm_config.cpus_per_task,
+             mem_per_task=slurm_config.mem_per_task_MB,
+             # Fractal configuration variables (soft/hard limits):
+             target_cpus_per_job=slurm_config.target_cpus_per_job,
+             target_mem_per_job=slurm_config.target_mem_per_job,
+             target_num_jobs=slurm_config.target_num_jobs,
+             max_cpus_per_job=slurm_config.max_cpus_per_job,
+             max_mem_per_job=slurm_config.max_mem_per_job,
+             max_num_jobs=slurm_config.max_num_jobs,
+         )
+         slurm_config.parallel_tasks_per_job = parallel_tasks_per_job
+         slurm_config.tasks_per_job = tasks_per_job
+
+         # Divide arguments in batches of `tasks_per_job` tasks each
+         args_batches = []
+         batch_size = tasks_per_job
+         for ind_chunk in range(0, tot_tasks, batch_size):
+             args_batches.append(
+                 list_parameters[ind_chunk : ind_chunk + batch_size]  # noqa
+             )
+         if len(args_batches) != math.ceil(tot_tasks / tasks_per_job):
+             raise RuntimeError("Something wrong here while batching tasks")
+
+         logger.info(f"START submission phase, {list(self.jobs.keys())=}")
+         for ind_batch, chunk in enumerate(args_batches):
+             # TODO: replace with actual values
+             tasks = []
+             for ind_chunk, parameters in enumerate(chunk):
+                 component = parameters[_COMPONENT_KEY_]
+                 tasks.append(
+                     SlurmTask(
+                         index=(ind_batch * batch_size) + ind_chunk,
+                         component=component,
+                         workdir_local=workdir_local,
+                         workdir_remote=workdir_remote,
+                         parameters=parameters,
+                         zarr_url=parameters["zarr_url"],
+                         task_files=TaskFiles(
+                             **original_task_files.model_dump(
+                                 exclude={"component"}
+                             ),
+                             component=component,
+                         ),
+                     ),
+                 )
+
+             slurm_job = SlurmJob(
+                 label=f"{ind_batch:06d}",
+                 workdir_local=workdir_local,
+                 workdir_remote=workdir_remote,
+                 tasks=tasks,
+             )
+             self._submit_single_sbatch(
+                 func,
+                 slurm_job=slurm_job,
+                 slurm_config=slurm_config,
+             )
+         logger.info(f"END submission phase, {list(self.jobs.keys())=}")
+
+         # Retrieval phase
+         while len(self.jobs) > 0:
+             if self.is_shutdown():
+                 self.scancel_jobs()
+             finished_job_ids = get_finished_jobs_ssh(
+                 job_ids=self.job_ids,
+                 fractal_ssh=self.fractal_ssh,
+             )
+             for slurm_job_id in finished_job_ids:
+                 slurm_job = self.jobs.pop(slurm_job_id)
+                 self._copy_files_from_remote_to_local(slurm_job)
+                 for task in slurm_job.tasks:
+                     result, exception = self._postprocess_single_task(
+                         task=task
+                     )
+                     if exception is None:
+                         results[task.index] = result
+                     else:
+                         exceptions[task.index] = exception
+             time.sleep(self.slurm_poll_interval)
+         return results, exceptions
+
+     def check_remote_python_interpreter(self):
+         settings = Inject(get_settings)
+         cmd = (
+             f"{self.python_worker_interpreter} "
+             "-m fractal_server.app.runner.versions"
+         )
+         stdout = self.fractal_ssh.run_command(cmd=cmd)
+         remote_version = json.loads(stdout.strip("\n"))["fractal_server"]
+         if remote_version != __VERSION__:
+             error_msg = (
+                 "Fractal-server version mismatch.\n"
+                 "Local interpreter: "
+                 f"({sys.executable}): {__VERSION__}.\n"
+                 "Remote interpreter: "
+                 f"({settings.FRACTAL_SLURM_WORKER_PYTHON}): {remote_version}."
+             )
+             logger.error(error_msg)
+             raise RuntimeError(error_msg)
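For orientation, the sketch below shows how the new runner class is meant to be driven, based only on the interface visible in this diff: the context-manager protocol (`__enter__`/`__exit__`), `submit` for a single task, and `multisubmit` for a list of per-image parameter sets. The `fractal_ssh`, `task_files`, and `slurm_config` objects are assumed to be built elsewhere in fractal-server (`FractalSSH`, `TaskFiles`, `SlurmConfig`); all literal values below are illustrative, not taken from the package.

from pathlib import Path

from fractal_server.app.runner.executors.slurm_ssh.runner import RunnerSlurmSSH


def run_one_task(fractal_ssh, task_files, slurm_config, task_callable, parameters):
    # Illustrative wiring only: the SSH connection, task files, and SLURM
    # config come from the surrounding fractal-server machinery.
    with RunnerSlurmSSH(
        fractal_ssh=fractal_ssh,
        slurm_user="fractal",  # illustrative
        root_dir_local=Path("/srv/fractal/artifacts"),  # illustrative
        root_dir_remote=Path("/home/fractal/artifacts"),  # illustrative
        slurm_account=None,
        # Must not contain "#SBATCH --account=..." (the constructor raises).
        common_script_lines=["#SBATCH --partition=main"],
        slurm_poll_interval=5,
    ) as runner:
        # `parameters` must carry the component key (`_COMPONENT_KEY_`).
        result, exception = runner.submit(
            func=task_callable,
            parameters=parameters,
            history_item_id=1,  # illustrative
            task_files=task_files,
            slurm_config=slurm_config,
            task_type="non_parallel",  # assumed TaskTypeType value
        )
    if exception is not None:
        raise exception
    return result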