fractal-server 2.14.0a13__py3-none-any.whl → 2.14.0a14__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45)
  1. fractal_server/__init__.py +1 -1
  2. fractal_server/app/models/linkusergroup.py +6 -2
  3. fractal_server/app/models/v2/dataset.py +1 -1
  4. fractal_server/app/models/v2/job.py +7 -3
  5. fractal_server/app/models/v2/task_group.py +2 -2
  6. fractal_server/app/models/v2/workflow.py +1 -1
  7. fractal_server/app/models/v2/workflowtask.py +1 -1
  8. fractal_server/app/routes/admin/v2/task_group.py +0 -17
  9. fractal_server/app/routes/api/v2/dataset.py +0 -8
  10. fractal_server/app/routes/api/v2/history.py +112 -27
  11. fractal_server/app/routes/api/v2/images.py +16 -14
  12. fractal_server/app/routes/api/v2/project.py +0 -52
  13. fractal_server/app/routes/api/v2/task_group.py +0 -17
  14. fractal_server/app/routes/api/v2/workflow.py +0 -8
  15. fractal_server/app/routes/auth/group.py +0 -16
  16. fractal_server/app/runner/executors/base_runner.py +5 -0
  17. fractal_server/app/runner/executors/local/runner.py +15 -7
  18. fractal_server/app/runner/executors/slurm_common/_handle_exception_proxy.py +17 -0
  19. fractal_server/app/runner/executors/slurm_common/base_slurm_runner.py +676 -0
  20. fractal_server/app/runner/executors/slurm_common/slurm_job_task_models.py +102 -0
  21. fractal_server/app/runner/executors/slurm_ssh/runner.py +110 -648
  22. fractal_server/app/runner/executors/slurm_sudo/runner.py +32 -661
  23. fractal_server/app/runner/task_files.py +20 -6
  24. fractal_server/app/runner/v2/_slurm_ssh.py +6 -6
  25. fractal_server/app/runner/v2/_slurm_sudo.py +4 -4
  26. fractal_server/app/runner/v2/runner.py +4 -0
  27. fractal_server/app/runner/v2/runner_functions.py +2 -2
  28. fractal_server/app/runner/v2/submit_workflow.py +7 -16
  29. fractal_server/app/schemas/v2/__init__.py +3 -1
  30. fractal_server/app/schemas/v2/history.py +27 -2
  31. fractal_server/config.py +6 -2
  32. fractal_server/images/tools.py +23 -0
  33. fractal_server/migrations/versions/5b6007027595_on_cascade.py +250 -0
  34. fractal_server/migrations/versions/fbce16ff4e47_new_history_items.py +2 -2
  35. fractal_server/tasks/v2/utils_background.py +0 -19
  36. {fractal_server-2.14.0a13.dist-info → fractal_server-2.14.0a14.dist-info}/METADATA +1 -1
  37. {fractal_server-2.14.0a13.dist-info → fractal_server-2.14.0a14.dist-info}/RECORD +40 -41
  38. fractal_server/app/runner/executors/slurm_common/_check_jobs_status.py +0 -77
  39. fractal_server/app/runner/executors/slurm_ssh/_check_job_status_ssh.py +0 -67
  40. fractal_server/app/runner/executors/slurm_ssh/_executor_wait_thread.py +0 -126
  41. fractal_server/app/runner/executors/slurm_ssh/_slurm_job.py +0 -116
  42. fractal_server/app/runner/executors/slurm_ssh/executor.py +0 -1386
  43. {fractal_server-2.14.0a13.dist-info → fractal_server-2.14.0a14.dist-info}/LICENSE +0 -0
  44. {fractal_server-2.14.0a13.dist-info → fractal_server-2.14.0a14.dist-info}/WHEEL +0 -0
  45. {fractal_server-2.14.0a13.dist-info → fractal_server-2.14.0a14.dist-info}/entry_points.txt +0 -0
fractal_server/app/runner/executors/slurm_common/_handle_exception_proxy.py
@@ -0,0 +1,17 @@
+ from fractal_server.app.runner.exceptions import JobExecutionError
+ from fractal_server.app.runner.exceptions import TaskExecutionError
+
+
+ def _handle_exception_proxy(proxy): # FIXME
+     if proxy.exc_type_name == "JobExecutionError":
+         return JobExecutionError(str(proxy))
+     else:
+         kwargs = {}
+         for key in [
+             "workflow_task_id",
+             "workflow_task_order",
+             "task_name",
+         ]:
+             if key in proxy.kwargs.keys():
+                 kwargs[key] = proxy.kwargs[key]
+         return TaskExecutionError(proxy.tb, **kwargs)
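Note: the `proxy` argument is the exception-proxy object unpickled from a failed task's output pickle (see `_postprocess_single_task` in `base_slurm_runner.py` below); only its `exc_type_name`, `kwargs`, and `tb` attributes plus its string form are used. As a rough illustration of that contract, under the assumption below (the real proxy class produced by the remote worker is not shown in this diff, so `FakeProxy` is a hypothetical stand-in):

from dataclasses import dataclass, field

from fractal_server.app.runner.executors.slurm_common._handle_exception_proxy import (
    _handle_exception_proxy,
)


@dataclass
class FakeProxy:
    # Hypothetical stand-in exposing only the attributes the helper reads.
    exc_type_name: str
    tb: str = ""
    kwargs: dict = field(default_factory=dict)

    def __str__(self) -> str:
        return self.tb


proxy = FakeProxy(
    exc_type_name="TaskExecutionError",
    tb="Traceback (most recent call last): ...",
    kwargs={"workflow_task_id": 1, "task_name": "my-task", "other": "dropped"},
)
exc = _handle_exception_proxy(proxy)
# -> TaskExecutionError carrying tb plus workflow_task_id and task_name only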
fractal_server/app/runner/executors/slurm_common/base_slurm_runner.py
@@ -0,0 +1,676 @@
+ import json
+ import math
+ import sys
+ import time
+ from pathlib import Path
+ from typing import Any
+ from typing import Literal
+ from typing import Optional
+
+ import cloudpickle
+
+ from ..slurm_common._slurm_config import SlurmConfig
+ from ..slurm_common.slurm_job_task_models import SlurmJob
+ from ..slurm_common.slurm_job_task_models import SlurmTask
+ from ._batching import heuristics
+ from ._handle_exception_proxy import _handle_exception_proxy
+ from ._job_states import STATES_FINISHED
+ from fractal_server import __VERSION__
+ from fractal_server.app.db import get_sync_db
+ from fractal_server.app.runner.exceptions import JobExecutionError
+ from fractal_server.app.runner.executors.base_runner import BaseRunner
+ from fractal_server.app.runner.filenames import SHUTDOWN_FILENAME
+ from fractal_server.app.runner.task_files import MULTISUBMIT_PREFIX
+ from fractal_server.app.runner.task_files import SUBMIT_PREFIX
+ from fractal_server.app.runner.task_files import TaskFiles
+ from fractal_server.app.runner.v2.db_tools import update_status_of_history_unit
+ from fractal_server.app.schemas.v2 import HistoryUnitStatus
+ from fractal_server.config import get_settings
+ from fractal_server.logger import set_logger
+ from fractal_server.syringe import Inject
+
+ logger = set_logger(__name__)
+
+ # FIXME: Transform several logger.info into logger.debug.
+
+
+ class BaseSlurmRunner(BaseRunner):
+     shutdown_file: Path
+     common_script_lines: list[str]
+     user_cache_dir: str
+     root_dir_local: Path
+     root_dir_remote: Path
+     poll_interval: int
+     jobs: dict[str, SlurmJob]
+     python_worker_interpreter: str
+     slurm_runner_type: Literal["ssh", "sudo"]
+
+     def __init__(
+         self,
+         root_dir_local: Path,
+         root_dir_remote: Path,
+         slurm_runner_type: Literal["ssh", "sudo"],
+         common_script_lines: Optional[list[str]] = None,
+         user_cache_dir: Optional[str] = None,
+         poll_interval: Optional[int] = None,
+     ):
+         self.slurm_runner_type = slurm_runner_type
+         self.root_dir_local = root_dir_local
+         self.root_dir_remote = root_dir_remote
+         self.common_script_lines = common_script_lines or []
+         self._check_slurm_account()
+         self.user_cache_dir = user_cache_dir
+
+         settings = Inject(get_settings)
+
+         self.poll_interval = (
+             poll_interval or settings.FRACTAL_SLURM_POLL_INTERVAL
+         )
+         self.check_fractal_server_versions()
+
+         # Create job folders. Note that the local one may or may not exist
+         # depending on whether it is a test or an actual run
+         if not self.root_dir_local.is_dir():
+             self._mkdir_local_folder(self.root_dir_local.as_posix())
+         self._mkdir_remote_folder(self.root_dir_remote.as_posix())
+
+         self.shutdown_file = self.root_dir_local / SHUTDOWN_FILENAME
+         self.jobs = {}
+
+     def __enter__(self):
+         return self
+
+     def __exit__(self, exc_type, exc_val, exc_tb):
+         return False
+
+     def _run_local_cmd(self, cmd: str) -> str:
+         raise NotImplementedError("Implement in child class.")
+
+     def _run_remote_cmd(self, cmd: str) -> str:
+         raise NotImplementedError("Implement in child class.")
+
+     def run_squeue(self, job_ids: list[str]) -> tuple[bool, str]:
+         # FIXME: review different cases (exception vs no job found)
+         job_id_single_str = ",".join([str(j) for j in job_ids])
+         cmd = (
+             f"squeue --noheader --format='%i %T' --jobs {job_id_single_str}"
+             " --states=all"
+         )
+
+         try:
+             if self.slurm_runner_type == "sudo":
+                 stdout = self._run_local_cmd(cmd)
+             else:
+                 stdout = self._run_remote_cmd(cmd)
+             return True, stdout
+         except Exception as e:
+             logger.info(f"{cmd=} failed with {str(e)}")
+             return False, ""
+
+     def _get_finished_jobs(self, job_ids: list[str]) -> set[str]:
+         # If there is no Slurm job to check, return right away
+
+         if not job_ids:
+             return set()
+         id_to_state = dict()
+
+         success, stdout = self.run_squeue(job_ids)
+         if success:
+             id_to_state = {
+                 out.split()[0]: out.split()[1] for out in stdout.splitlines()
+             }
+         else:
+             id_to_state = dict()
+             for j in job_ids:
+                 success, res = self.run_squeue([j])
+                 if not success:
+                     logger.info(f"Job {j} not found. Marked it as completed")
+                     id_to_state.update({str(j): "COMPLETED"})
+                 else:
+                     id_to_state.update(
+                         {res.split()[0]: res.split()[1]}
+                     )
+
+         # Finished jobs only stay in squeue for a few mins (configurable). If
+         # a job ID isn't there, we'll assume it's finished.
+         return {
+             j
+             for j in job_ids
+             if id_to_state.get(j, "COMPLETED") in STATES_FINISHED
+         }
+
+     def _mkdir_local_folder(self, folder: str) -> None:
+         raise NotImplementedError("Implement in child class.")
+
+     def _mkdir_remote_folder(self, folder: str) -> None:
+         raise NotImplementedError("Implement in child class.")
+
+     def _submit_single_sbatch(
+         self,
+         func,
+         slurm_job: SlurmJob,
+         slurm_config: SlurmConfig,
+     ) -> str:
+         logger.info("[_submit_single_sbatch] START")
+         # Prepare input pickle(s)
+         versions = dict(
+             python=sys.version_info[:3],
+             cloudpickle=cloudpickle.__version__,
+             fractal_server=__VERSION__,
+         )
+         for task in slurm_job.tasks:
+             # Write input pickle
+             _args = []
+             _kwargs = dict(
+                 parameters=task.parameters,
+                 remote_files=task.task_files.remote_files_dict,
+             )
+             funcser = cloudpickle.dumps((versions, func, _args, _kwargs))
+             with open(task.input_pickle_file_local, "wb") as f:
+                 f.write(funcser)
+             logger.info(
+                 "[_submit_single_sbatch] Written "
+                 f"{task.input_pickle_file_local=}"
+             )
+
+             if self.slurm_runner_type == "ssh":
+                 # Send input pickle (only relevant for SSH)
+                 self.fractal_ssh.send_file(
+                     local=task.input_pickle_file_local,
+                     remote=task.input_pickle_file_remote,
+                 )
+                 logger.info(
+                     "[_submit_single_sbatch] Transferred "
+                     f"{task.input_pickle_file_local=}"
+                 )
+
+         # Prepare commands to be included in SLURM submission script
+         cmdlines = []
+         for task in slurm_job.tasks:
+             if self.slurm_runner_type == "ssh":
+                 input_pickle_file = task.input_pickle_file_remote
+             else:
+                 input_pickle_file = task.input_pickle_file_local
+             output_pickle_file = task.output_pickle_file_remote
+             cmdlines.append(
+                 (
+                     f"{self.python_worker_interpreter}"
+                     " -m fractal_server.app.runner."
+                     "executors.slurm_common.remote "
+                     f"--input-file {input_pickle_file} "
+                     f"--output-file {output_pickle_file}"
+                 )
+             )
+
+         # Set ntasks
+         num_tasks_max_running = slurm_config.parallel_tasks_per_job
+         ntasks = min(len(cmdlines), num_tasks_max_running)
+         slurm_config.parallel_tasks_per_job = ntasks
+
+         # Prepare SLURM preamble based on SlurmConfig object
+         script_lines = slurm_config.to_sbatch_preamble(
+             remote_export_dir=self.user_cache_dir
+         )
+
+         # Extend SLURM preamble with variables which are not in SlurmConfig,
+         # and fix their order
+         script_lines.extend(
+             [
+                 f"#SBATCH --err={slurm_job.slurm_stderr_remote}",
+                 f"#SBATCH --out={slurm_job.slurm_stdout_remote}",
+                 f"#SBATCH -D {slurm_job.workdir_remote}",
+             ]
+         )
+         script_lines = slurm_config.sort_script_lines(script_lines)
+         logger.info(script_lines)
+
+         # Always print output of `uname -n` and `pwd`
+         script_lines.append(
+             'echo "Hostname: `uname -n`; current directory: `pwd`"\n'
+         )
+
+         # Complete script preamble
+         script_lines.append("\n")
+
+         # Include command lines
+         mem_per_task_MB = slurm_config.mem_per_task_MB
+         for cmd in cmdlines:
+             script_lines.append(
+                 "srun --ntasks=1 --cpus-per-task=$SLURM_CPUS_PER_TASK "
+                 f"--mem={mem_per_task_MB}MB "
+                 f"{cmd} &"
+             )
+         script_lines.append("wait\n")
+         script = "\n".join(script_lines)
+
+         # Write submission script
+         with open(slurm_job.slurm_submission_script_local, "w") as f:
+             f.write(script)
+         logger.info(
+             "[_submit_single_sbatch] Written "
+             f"{slurm_job.slurm_submission_script_local=}"
+         )
+
+         if self.slurm_runner_type == "ssh":
+             self.fractal_ssh.send_file(
+                 local=slurm_job.slurm_submission_script_local,
+                 remote=slurm_job.slurm_submission_script_remote,
+             )
+             submit_command = (
+                 "sbatch --parsable "
+                 f"{slurm_job.slurm_submission_script_remote}"
+             )
+         else:
+             submit_command = (
+                 "sbatch --parsable "
+                 f"{slurm_job.slurm_submission_script_local}"
+             )
+         # Run sbatch
+         pre_submission_cmds = slurm_config.pre_submission_commands
+         if len(pre_submission_cmds) == 0:
+             logger.info(f"Now run {submit_command=}")
+             sbatch_stdout = self._run_remote_cmd(submit_command)
+         else:
+             logger.info(f"Now using {pre_submission_cmds=}")
+             script_lines = pre_submission_cmds + [submit_command]
+             wrapper_script_contents = "\n".join(script_lines)
+             wrapper_script_contents = f"{wrapper_script_contents}\n"
+             if self.slurm_runner_type == "ssh":
+                 wrapper_script = (
+                     f"{slurm_job.slurm_submission_script_remote}_wrapper.sh"
+                 )
+                 self.fractal_ssh.write_remote_file(
+                     path=wrapper_script, content=wrapper_script_contents
+                 )
+             else:
+                 wrapper_script = (
+                     f"{slurm_job.slurm_submission_script_local}_wrapper.sh"
+                 )
+                 with open(wrapper_script, "w") as f:
+                     f.write(wrapper_script_contents)
+             logger.info(f"Now run {wrapper_script=}")
+             sbatch_stdout = self._run_remote_cmd(f"bash {wrapper_script}")
+
+         # Submit SLURM job and retrieve job ID
+         logger.info(f"[_submit_single_sbatch] {sbatch_stdout=}")
+         stdout = sbatch_stdout.strip("\n")
+         submitted_job_id = int(stdout)
+         slurm_job.slurm_job_id = str(submitted_job_id)
+
+         # Add job to self.jobs
+         self.jobs[slurm_job.slurm_job_id] = slurm_job
+         logger.info(
+             "[_submit_single_sbatch] Added "
+             f"{slurm_job.slurm_job_id} to self.jobs."
+         )
+         logger.info("[_submit_single_sbatch] END")
+
+     def _copy_files_from_remote_to_local(
+         self,
+         slurm_job: SlurmJob,
+     ) -> None:
+         raise NotImplementedError("Implement in child class.")
+
+     def _check_slurm_account(self) -> None:
+         """
+         Check that the SLURM account is not set in `common_script_lines`.
+         """
+         try:
+             invalid_line = next(
+                 line
+                 for line in self.common_script_lines
+                 if line.startswith("#SBATCH --account=")
+             )
+             raise RuntimeError(
+                 "Invalid line in `common_script_lines`: "
+                 f"'{invalid_line}'.\n"
+                 "SLURM account must be set via the request body of the "
+                 "apply-workflow endpoint, or by modifying the user properties."
+             )
+         except StopIteration:
+             pass
+
+     def _postprocess_single_task(
+         self, *, task: SlurmTask
+     ) -> tuple[Any, Exception]:
+         try:
+             with open(task.output_pickle_file_local, "rb") as f:
+                 outdata = f.read()
+             success, output = cloudpickle.loads(outdata)
+             if success:
+                 result = output
+                 return result, None
+             else:
+                 exception = _handle_exception_proxy(output)
+                 return None, exception
+         except Exception as e:
+             exception = JobExecutionError(f"ERROR, {str(e)}")
+             return None, exception
+         finally:
+             pass
+             # FIXME: Re-include unlinks of pickle files
+             # Path(task.input_pickle_file_local).unlink(missing_ok=True)
+             # Path(task.output_pickle_file_local).unlink(missing_ok=True)
+
+     def is_shutdown(self) -> bool:
+         # FIXME: shutdown is not implemented
+         return self.shutdown_file.exists()
+
+     @property
+     def job_ids(self) -> list[str]:
+         return list(self.jobs.keys())
+
+     def submit(
+         self,
+         func: callable,
+         parameters: dict[str, Any],
+         history_unit_id: int,
+         task_files: TaskFiles,
+         config: SlurmConfig,
+         task_type: Literal[
+             "non_parallel",
+             "converter_non_parallel",
+             "compound",
+             "converter_compound",
+         ],
+     ) -> tuple[Any, Exception]:
+
+         logger.info("[submit] START")
+
+         workdir_local = task_files.wftask_subfolder_local
+         workdir_remote = task_files.wftask_subfolder_remote
+
+         if self.jobs != {}:
+             raise JobExecutionError("Unexpected branch: jobs should be empty.")
+
+         if self.is_shutdown():
+             raise JobExecutionError("Cannot continue after shutdown.")
+
+         # Validation phase
+         self.validate_submit_parameters(
+             parameters=parameters,
+             task_type=task_type,
+         )
+
+         # Create task subfolder
+         logger.info("[submit] Create local/remote folders - START")
+         self._mkdir_local_folder(folder=workdir_local.as_posix())
+         self._mkdir_remote_folder(folder=workdir_remote.as_posix())
+         logger.info("[submit] Create local/remote folders - END")
+
+         # Add prefix to task_files object
+         task_files.prefix = SUBMIT_PREFIX
+
+         # Submission phase
+         slurm_job = SlurmJob(
+             prefix=SUBMIT_PREFIX,
+             workdir_local=workdir_local,
+             workdir_remote=workdir_remote,
+             tasks=[
+                 SlurmTask(
+                     prefix=SUBMIT_PREFIX,
+                     index=0,
+                     component=task_files.component,
+                     parameters=parameters,
+                     workdir_remote=workdir_remote,
+                     workdir_local=workdir_local,
+                     task_files=task_files,
+                 )
+             ],
+         )
+
+         config.parallel_tasks_per_job = 1
+         self._submit_single_sbatch(
+             func,
+             slurm_job=slurm_job,
+             slurm_config=config,
+         )
+         logger.info(f"[submit] END submission phase, {self.job_ids=}")
+
+         # FIXME: replace this sleep with a more precise check
+         settings = Inject(get_settings)
+         sleep_time = settings.FRACTAL_SLURM_INTERVAL_BEFORE_RETRIEVAL
+         logger.warning(f"[submit] Now sleep {sleep_time} (FIXME)")
+         time.sleep(sleep_time)
+
+         # Retrieval phase
+         logger.info("[submit] START retrieval phase")
+         while len(self.jobs) > 0:
+             if self.is_shutdown():
+                 self.scancel_jobs()
+             finished_job_ids = self._get_finished_jobs(job_ids=self.job_ids)
+             logger.info(f"{finished_job_ids=}")
+             with next(get_sync_db()) as db:
+                 for slurm_job_id in finished_job_ids:
+                     logger.info(f"Now process {slurm_job_id=}")
+                     slurm_job = self.jobs.pop(slurm_job_id)
+
+                     self._copy_files_from_remote_to_local(slurm_job)
+                     result, exception = self._postprocess_single_task(
+                         task=slurm_job.tasks[0]
+                     )
+                     if exception is not None:
+                         update_status_of_history_unit(
+                             history_unit_id=history_unit_id,
+                             status=HistoryUnitStatus.FAILED,
+                             db_sync=db,
+                         )
+                     else:
+                         if task_type not in ["compound", "converter_compound"]:
+                             update_status_of_history_unit(
+                                 history_unit_id=history_unit_id,
+                                 status=HistoryUnitStatus.DONE,
+                                 db_sync=db,
+                             )
+
+             time.sleep(self.poll_interval)
+
+         logger.info("[submit] END")
+         return result, exception
+
+     def multisubmit(
+         self,
+         func: callable,
+         list_parameters: list[dict],
+         history_unit_ids: list[int],
+         list_task_files: list[TaskFiles],
+         task_type: Literal["parallel", "compound", "converter_compound"],
+         config: SlurmConfig,
+     ):
+
+         if len(self.jobs) > 0:
+             raise RuntimeError(
+                 f"Cannot run .multisubmit when {len(self.jobs)=}"
+             )
+
+         self.validate_multisubmit_parameters(
+             list_parameters=list_parameters,
+             task_type=task_type,
+             list_task_files=list_task_files,
+         )
+         self.validate_multisubmit_history_unit_ids(
+             history_unit_ids=history_unit_ids,
+             task_type=task_type,
+             list_parameters=list_parameters,
+         )
+
+         logger.info(f"[multisubmit] START, {len(list_parameters)=}")
+
+         workdir_local = list_task_files[0].wftask_subfolder_local
+         workdir_remote = list_task_files[0].wftask_subfolder_remote
+
+         # Create local&remote task subfolders
+         if task_type == "parallel":
+             self._mkdir_local_folder(workdir_local.as_posix())
+             self._mkdir_remote_folder(folder=workdir_remote.as_posix())
+
+         # Execute tasks, in chunks of size `parallel_tasks_per_job`
+         # TODO Pick a data structure for results and exceptions, or review the
+         # interface
+         results: dict[int, Any] = {}
+         exceptions: dict[int, BaseException] = {}
+
+         tot_tasks = len(list_parameters)
+
+         # Set/validate parameters for task batching
+         tasks_per_job, parallel_tasks_per_job = heuristics(
+             # Number of parallel components (always known)
+             tot_tasks=tot_tasks,
+             # Optional WorkflowTask attributes:
+             tasks_per_job=config.tasks_per_job,
+             parallel_tasks_per_job=config.parallel_tasks_per_job,  # noqa
+             # Task requirements (multiple possible sources):
+             cpus_per_task=config.cpus_per_task,
+             mem_per_task=config.mem_per_task_MB,
+             # Fractal configuration variables (soft/hard limits):
+             target_cpus_per_job=config.target_cpus_per_job,
+             target_mem_per_job=config.target_mem_per_job,
+             target_num_jobs=config.target_num_jobs,
+             max_cpus_per_job=config.max_cpus_per_job,
+             max_mem_per_job=config.max_mem_per_job,
+             max_num_jobs=config.max_num_jobs,
+         )
+         config.parallel_tasks_per_job = parallel_tasks_per_job
+         config.tasks_per_job = tasks_per_job
+
+         # Divide arguments in batches of `tasks_per_job` tasks each
+         args_batches = []
+         batch_size = tasks_per_job
+         for ind_chunk in range(0, tot_tasks, batch_size):
+             args_batches.append(
+                 list_parameters[ind_chunk : ind_chunk + batch_size]  # noqa
+             )
+         if len(args_batches) != math.ceil(tot_tasks / tasks_per_job):
+             raise RuntimeError("Something wrong here while batching tasks")
+
+         logger.info(f"START submission phase, {list(self.jobs.keys())=}")
+         for ind_batch, chunk in enumerate(args_batches):
+             prefix = f"{MULTISUBMIT_PREFIX}-{ind_batch:06d}"
+             tasks = []
+             for ind_chunk, parameters in enumerate(chunk):
+                 index = (ind_batch * batch_size) + ind_chunk
+                 current_task_files = list_task_files[index]
+                 current_task_files.prefix = prefix
+                 tasks.append(
+                     SlurmTask(
+                         prefix=prefix,
+                         index=index,
+                         component=current_task_files.component,
+                         workdir_local=workdir_local,
+                         workdir_remote=workdir_remote,
+                         parameters=parameters,
+                         zarr_url=parameters["zarr_url"],
+                         task_files=current_task_files,
+                     ),
+                 )
+
+             slurm_job = SlurmJob(
+                 prefix=prefix,
+                 workdir_local=workdir_local,
+                 workdir_remote=workdir_remote,
+                 tasks=tasks,
+             )
+             self._submit_single_sbatch(
+                 func,
+                 slurm_job=slurm_job,
+                 slurm_config=config,
+             )
+         logger.info(f"END submission phase, {self.job_ids=}")
+
+         # FIXME: replace this sleep with a more precise check
+         settings = Inject(get_settings)
+         sleep_time = settings.FRACTAL_SLURM_INTERVAL_BEFORE_RETRIEVAL
+         logger.warning(f"[multisubmit] Now sleep {sleep_time} (FIXME)")
+         time.sleep(sleep_time)
+
+         # Retrieval phase
+         logger.info("START retrieval phase")
+         while len(self.jobs) > 0:
+             if self.is_shutdown():
+                 self.scancel_jobs()
+             finished_job_ids = self._get_finished_jobs(job_ids=self.job_ids)
+             logger.info(f"{finished_job_ids=}")
+             with next(get_sync_db()) as db:
+                 for slurm_job_id in finished_job_ids:
+                     logger.info(f"Now processing {slurm_job_id=}")
+                     slurm_job = self.jobs.pop(slurm_job_id)
+                     self._copy_files_from_remote_to_local(slurm_job)
+                     for task in slurm_job.tasks:
+                         logger.info(f"Now processing {task.index=}")
+                         result, exception = self._postprocess_single_task(
+                             task=task
+                         )
+
+                         # Note: the relevant done/failed check is based on
+                         # whether `exception is None`. The fact that
+                         # `result is None` is not relevant for this purpose.
+                         if exception is not None:
+                             exceptions[task.index] = exception
+                             if task_type == "parallel":
+                                 update_status_of_history_unit(
+                                     history_unit_id=history_unit_ids[
+                                         task.index
+                                     ],
+                                     status=HistoryUnitStatus.FAILED,
+                                     db_sync=db,
+                                 )
+                         else:
+                             results[task.index] = result
+                             if task_type == "parallel":
+                                 update_status_of_history_unit(
+                                     history_unit_id=history_unit_ids[
+                                         task.index
+                                     ],
+                                     status=HistoryUnitStatus.DONE,
+                                     db_sync=db,
+                                 )
+
+             time.sleep(self.poll_interval)
+         return results, exceptions
+
+     def check_fractal_server_versions(self):
+         """
+         Compare fractal-server versions of local/remote Python interpreters.
+         """
+
+         # Skip check when the local and remote interpreters are the same
+         # (notably for some sudo-slurm deployments)
+         if self.python_worker_interpreter == sys.executable:
+             return
+
+         # Fetch remote fractal-server version
+         cmd = (
+             f"{self.python_worker_interpreter} "
+             "-m fractal_server.app.runner.versions"
+         )
+         stdout = self._run_remote_cmd(cmd)
+         remote_version = json.loads(stdout.strip("\n"))["fractal_server"]
+
+         # Verify local/remote version match
+         if remote_version != __VERSION__:
+             error_msg = (
+                 "Fractal-server version mismatch.\n"
+                 "Local interpreter: "
+                 f"({sys.executable}): {__VERSION__}.\n"
+                 "Remote interpreter: "
+                 f"({self.python_worker_interpreter}): {remote_version}."
+             )
+             logger.error(error_msg)
+             raise RuntimeError(error_msg)
+
+     def scancel_jobs(self) -> None:
+         logger.info("[scancel_jobs] START")
+
+         if self.jobs:
+             scancel_string = " ".join(self.job_ids)
+             scancel_cmd = f"scancel {scancel_string}"
+             logger.warning(f"Now scancel-ing SLURM jobs {scancel_string}")
+             try:
+                 self._run_remote_cmd(scancel_cmd)
+             except Exception as e:
+                 logger.warning(
+                     "[scancel_jobs] `scancel` command failed. "
+                     f"Original error:\n{str(e)}"
+                 )
+
+         logger.info("[scancel_jobs] END")