fractal-server 2.14.0a13__py3-none-any.whl → 2.14.0a15__py3-none-any.whl

This diff shows the contents of publicly released package versions as they appear in their respective public registries; it is provided for informational purposes only.
Files changed (46)
  1. fractal_server/__init__.py +1 -1
  2. fractal_server/app/models/linkusergroup.py +6 -2
  3. fractal_server/app/models/v2/dataset.py +1 -1
  4. fractal_server/app/models/v2/job.py +7 -3
  5. fractal_server/app/models/v2/task_group.py +2 -2
  6. fractal_server/app/models/v2/workflow.py +1 -1
  7. fractal_server/app/models/v2/workflowtask.py +1 -1
  8. fractal_server/app/routes/admin/v2/task_group.py +0 -17
  9. fractal_server/app/routes/api/v2/_aux_functions_history.py +8 -0
  10. fractal_server/app/routes/api/v2/dataset.py +0 -8
  11. fractal_server/app/routes/api/v2/history.py +111 -27
  12. fractal_server/app/routes/api/v2/images.py +16 -14
  13. fractal_server/app/routes/api/v2/project.py +0 -52
  14. fractal_server/app/routes/api/v2/task_group.py +0 -17
  15. fractal_server/app/routes/api/v2/workflow.py +0 -8
  16. fractal_server/app/routes/auth/group.py +0 -16
  17. fractal_server/app/runner/executors/base_runner.py +5 -0
  18. fractal_server/app/runner/executors/local/runner.py +15 -7
  19. fractal_server/app/runner/executors/slurm_common/_handle_exception_proxy.py +17 -0
  20. fractal_server/app/runner/executors/slurm_common/base_slurm_runner.py +677 -0
  21. fractal_server/app/runner/executors/slurm_common/slurm_job_task_models.py +102 -0
  22. fractal_server/app/runner/executors/slurm_ssh/runner.py +110 -648
  23. fractal_server/app/runner/executors/slurm_sudo/runner.py +32 -661
  24. fractal_server/app/runner/task_files.py +20 -6
  25. fractal_server/app/runner/v2/_slurm_ssh.py +6 -6
  26. fractal_server/app/runner/v2/_slurm_sudo.py +4 -4
  27. fractal_server/app/runner/v2/runner.py +4 -0
  28. fractal_server/app/runner/v2/runner_functions.py +2 -2
  29. fractal_server/app/runner/v2/submit_workflow.py +7 -16
  30. fractal_server/app/schemas/v2/__init__.py +3 -1
  31. fractal_server/app/schemas/v2/history.py +27 -2
  32. fractal_server/config.py +6 -2
  33. fractal_server/images/tools.py +23 -0
  34. fractal_server/migrations/versions/5b6007027595_on_cascade.py +250 -0
  35. fractal_server/migrations/versions/fbce16ff4e47_new_history_items.py +2 -2
  36. fractal_server/tasks/v2/utils_background.py +0 -19
  37. {fractal_server-2.14.0a13.dist-info → fractal_server-2.14.0a15.dist-info}/METADATA +1 -1
  38. {fractal_server-2.14.0a13.dist-info → fractal_server-2.14.0a15.dist-info}/RECORD +41 -42
  39. fractal_server/app/runner/executors/slurm_common/_check_jobs_status.py +0 -77
  40. fractal_server/app/runner/executors/slurm_ssh/_check_job_status_ssh.py +0 -67
  41. fractal_server/app/runner/executors/slurm_ssh/_executor_wait_thread.py +0 -126
  42. fractal_server/app/runner/executors/slurm_ssh/_slurm_job.py +0 -116
  43. fractal_server/app/runner/executors/slurm_ssh/executor.py +0 -1386
  44. {fractal_server-2.14.0a13.dist-info → fractal_server-2.14.0a15.dist-info}/LICENSE +0 -0
  45. {fractal_server-2.14.0a13.dist-info → fractal_server-2.14.0a15.dist-info}/WHEEL +0 -0
  46. {fractal_server-2.14.0a13.dist-info → fractal_server-2.14.0a15.dist-info}/entry_points.txt +0 -0
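Judging from the runner entries above (the new slurm_common/base_slurm_runner.py and slurm_common/slurm_job_task_models.py modules, together with the large deletions in slurm_ssh/runner.py and slurm_sudo/runner.py), most of the SLURM submission logic appears to have been consolidated into a shared BaseSlurmRunner class, with the SSH and sudo runners overriding a handful of transport-specific hooks. As rough orientation for the diff below, here is a minimal sketch of that layering; the class name DemoSudoRunner and the helper _subprocess_run are hypothetical and not part of fractal-server.

# Illustrative sketch only; not fractal-server code. It shows how a
# transport-specific runner could fill in the hooks that BaseSlurmRunner
# (diffed below) leaves as NotImplementedError.
import shlex
import subprocess
from pathlib import Path


class DemoSudoRunner:  # in fractal-server this would subclass BaseSlurmRunner
    def _subprocess_run(self, cmd: str) -> str:
        # Run a command on the local host and return its stdout.
        res = subprocess.run(
            shlex.split(cmd), capture_output=True, text=True, check=True
        )
        return res.stdout

    def _run_local_cmd(self, cmd: str) -> str:
        return self._subprocess_run(cmd)

    def _run_remote_cmd(self, cmd: str) -> str:
        # For the sudo flavour, "remote" commands also run on the local host.
        return self._subprocess_run(cmd)

    def _mkdir_local_folder(self, folder: str) -> None:
        Path(folder).mkdir(parents=True, exist_ok=True)

    def _mkdir_remote_folder(self, folder: str) -> None:
        Path(folder).mkdir(parents=True, exist_ok=True)

    def _copy_files_from_remote_to_local(self, slurm_job) -> None:
        # Local and "remote" folders coincide here, so nothing to copy.
        pass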
fractal_server/app/runner/executors/slurm_common/base_slurm_runner.py (new file)
@@ -0,0 +1,677 @@
+ import json
+ import math
+ import sys
+ import time
+ from pathlib import Path
+ from typing import Any
+ from typing import Literal
+ from typing import Optional
+
+ import cloudpickle
+
+ from ..slurm_common._slurm_config import SlurmConfig
+ from ..slurm_common.slurm_job_task_models import SlurmJob
+ from ..slurm_common.slurm_job_task_models import SlurmTask
+ from ._batching import heuristics
+ from ._handle_exception_proxy import _handle_exception_proxy
+ from ._job_states import STATES_FINISHED
+ from fractal_server import __VERSION__
+ from fractal_server.app.db import get_sync_db
+ from fractal_server.app.runner.exceptions import JobExecutionError
+ from fractal_server.app.runner.executors.base_runner import BaseRunner
+ from fractal_server.app.runner.filenames import SHUTDOWN_FILENAME
+ from fractal_server.app.runner.task_files import MULTISUBMIT_PREFIX
+ from fractal_server.app.runner.task_files import SUBMIT_PREFIX
+ from fractal_server.app.runner.task_files import TaskFiles
+ from fractal_server.app.runner.v2.db_tools import update_status_of_history_unit
+ from fractal_server.app.schemas.v2 import HistoryUnitStatus
+ from fractal_server.config import get_settings
+ from fractal_server.logger import set_logger
+ from fractal_server.syringe import Inject
+
+ logger = set_logger(__name__)
+
+ # FIXME: Transform several logger.info into logger.debug.
+
+
+ class BaseSlurmRunner(BaseRunner):
+     shutdown_file: Path
+     common_script_lines: list[str]
+     user_cache_dir: str
+     root_dir_local: Path
+     root_dir_remote: Path
+     poll_interval: int
+     jobs: dict[str, SlurmJob]
+     python_worker_interpreter: str
+     slurm_runner_type: Literal["ssh", "sudo"]
+
+     def __init__(
+         self,
+         root_dir_local: Path,
+         root_dir_remote: Path,
+         slurm_runner_type: Literal["ssh", "sudo"],
+         common_script_lines: Optional[list[str]] = None,
+         user_cache_dir: Optional[str] = None,
+         poll_interval: Optional[int] = None,
+     ):
+         self.slurm_runner_type = slurm_runner_type
+         self.root_dir_local = root_dir_local
+         self.root_dir_remote = root_dir_remote
+         self.common_script_lines = common_script_lines or []
+         self._check_slurm_account()
+         self.user_cache_dir = user_cache_dir
+
+         settings = Inject(get_settings)
+
+         self.poll_interval = (
+             poll_interval or settings.FRACTAL_SLURM_POLL_INTERVAL
+         )
+         self.check_fractal_server_versions()
+
+         # Create job folders. Note that the local one may or may not exist
+         # depending on whether it is a test or an actual run
+         if not self.root_dir_local.is_dir():
+             self._mkdir_local_folder(self.root_dir_local.as_posix())
+         self._mkdir_remote_folder(self.root_dir_remote.as_posix())
+
+         self.shutdown_file = self.root_dir_local / SHUTDOWN_FILENAME
+         self.jobs = {}
+
+     def __enter__(self):
+         return self
+
+     def __exit__(self, exc_type, exc_val, exc_tb):
+         return False
+
+     def _run_local_cmd(self, cmd: str) -> str:
+         raise NotImplementedError("Implement in child class.")
+
+     def _run_remote_cmd(self, cmd: str) -> str:
+         raise NotImplementedError("Implement in child class.")
+
+     def run_squeue(self, job_ids: list[str]) -> tuple[bool, str]:
+         # FIXME: review different cases (exception vs no job found)
+         job_id_single_str = ",".join([str(j) for j in job_ids])
+         cmd = (
+             f"squeue --noheader --format='%i %T' --jobs {job_id_single_str}"
+             " --states=all"
+         )
+
+         try:
+             if self.slurm_runner_type == "sudo":
+                 stdout = self._run_local_cmd(cmd)
+             else:
+                 stdout = self._run_remote_cmd(cmd)
+             return True, stdout
+         except Exception as e:
+             logger.info(f"{cmd=} failed with {str(e)}")
+             return False, ""
+
+     def _get_finished_jobs(self, job_ids: list[str]) -> set[str]:
+         # If there is no Slurm job to check, return right away
+
+         if not job_ids:
+             return set()
+         id_to_state = dict()
+
+         success, stdout = self.run_squeue(job_ids)
+         if success:
+             id_to_state = {
+                 out.split()[0]: out.split()[1] for out in stdout.splitlines()
+             }
+         else:
+             id_to_state = dict()
+             for j in job_ids:
+                 success, res = self.run_squeue([j])
+                 if not success:
+                     logger.info(f"Job {j} not found. Marking it as completed.")
+                     id_to_state.update({str(j): "COMPLETED"})
+                 else:
+                     id_to_state.update(
+                         {res.split()[0]: res.split()[1]}
+                     )
+
+         # Finished jobs only stay in squeue for a few mins (configurable). If
+         # a job ID isn't there, we'll assume it's finished.
+         return {
+             j
+             for j in job_ids
+             if id_to_state.get(j, "COMPLETED") in STATES_FINISHED
+         }
+
+     def _mkdir_local_folder(self, folder: str) -> None:
+         raise NotImplementedError("Implement in child class.")
+
+     def _mkdir_remote_folder(self, folder: str) -> None:
+         raise NotImplementedError("Implement in child class.")
+
+     def _submit_single_sbatch(
+         self,
+         func,
+         slurm_job: SlurmJob,
+         slurm_config: SlurmConfig,
+     ) -> str:
+         logger.info("[_submit_single_sbatch] START")
+         # Prepare input pickle(s)
+         versions = dict(
+             python=sys.version_info[:3],
+             cloudpickle=cloudpickle.__version__,
+             fractal_server=__VERSION__,
+         )
+         for task in slurm_job.tasks:
+             # Write input pickle
+             _args = []
+             _kwargs = dict(
+                 parameters=task.parameters,
+                 remote_files=task.task_files.remote_files_dict,
+             )
+             funcser = cloudpickle.dumps((versions, func, _args, _kwargs))
+             with open(task.input_pickle_file_local, "wb") as f:
+                 f.write(funcser)
+             logger.info(
+                 "[_submit_single_sbatch] Written "
+                 f"{task.input_pickle_file_local=}"
+             )
+
+             if self.slurm_runner_type == "ssh":
+                 # Send input pickle (only relevant for SSH)
+                 self.fractal_ssh.send_file(
+                     local=task.input_pickle_file_local,
+                     remote=task.input_pickle_file_remote,
+                 )
+                 logger.info(
+                     "[_submit_single_sbatch] Transferred "
+                     f"{task.input_pickle_file_local=}"
+                 )
+
+         # Prepare commands to be included in SLURM submission script
+         cmdlines = []
+         for task in slurm_job.tasks:
+             if self.slurm_runner_type == "ssh":
+                 input_pickle_file = task.input_pickle_file_remote
+             else:
+                 input_pickle_file = task.input_pickle_file_local
+             output_pickle_file = task.output_pickle_file_remote
+             cmdlines.append(
+                 (
+                     f"{self.python_worker_interpreter}"
+                     " -m fractal_server.app.runner."
+                     "executors.slurm_common.remote "
+                     f"--input-file {input_pickle_file} "
+                     f"--output-file {output_pickle_file}"
+                 )
+             )
+
+         # Set ntasks
+         num_tasks_max_running = slurm_config.parallel_tasks_per_job
+         ntasks = min(len(cmdlines), num_tasks_max_running)
+         slurm_config.parallel_tasks_per_job = ntasks
+
+         # Prepare SLURM preamble based on SlurmConfig object
+         script_lines = slurm_config.to_sbatch_preamble(
+             remote_export_dir=self.user_cache_dir
+         )
+
+         # Extend SLURM preamble with variables which are not in SlurmConfig,
+         # and fix their order
+         script_lines.extend(
+             [
+                 f"#SBATCH --err={slurm_job.slurm_stderr_remote}",
+                 f"#SBATCH --out={slurm_job.slurm_stdout_remote}",
+                 f"#SBATCH -D {slurm_job.workdir_remote}",
+             ]
+         )
+         script_lines = slurm_config.sort_script_lines(script_lines)
+         logger.info(script_lines)
+
+         # Always print output of `uname -n` and `pwd`
+         script_lines.append('echo "Hostname: $(uname -n)"')
+         script_lines.append('echo "Current directory: $(pwd)"')
+         script_lines.append(
+             'echo "Start time: $(date +%Y-%m-%dT%H:%M:%S%z)"'
+         )
+
+         # Complete script preamble
+         script_lines.append("\n")
+
+         # Include command lines
+         mem_per_task_MB = slurm_config.mem_per_task_MB
+         for cmd in cmdlines:
+             script_lines.append(
+                 "srun --ntasks=1 --cpus-per-task=$SLURM_CPUS_PER_TASK "
+                 f"--mem={mem_per_task_MB}MB "
+                 f"{cmd} &"
+             )
+         script_lines.append("wait\n")
+         script_lines.append('echo "End time: $(date +%Y-%m-%dT%H:%M:%S%z)"')
+         script = "\n".join(script_lines)
+
+         # Write submission script
+         with open(slurm_job.slurm_submission_script_local, "w") as f:
+             f.write(script)
+         logger.info(
+             "[_submit_single_sbatch] Written "
+             f"{slurm_job.slurm_submission_script_local=}"
+         )
+
+         if self.slurm_runner_type == "ssh":
+             self.fractal_ssh.send_file(
+                 local=slurm_job.slurm_submission_script_local,
+                 remote=slurm_job.slurm_submission_script_remote,
+             )
+             submit_command = (
+                 "sbatch --parsable "
+                 f"{slurm_job.slurm_submission_script_remote}"
+             )
+         else:
+             submit_command = (
+                 "sbatch --parsable "
+                 f"{slurm_job.slurm_submission_script_local}"
+             )
+         # Run sbatch
+         pre_submission_cmds = slurm_config.pre_submission_commands
+         if len(pre_submission_cmds) == 0:
+             logger.info(f"Now run {submit_command=}")
+             sbatch_stdout = self._run_remote_cmd(submit_command)
+         else:
+             logger.info(f"Now using {pre_submission_cmds=}")
+             script_lines = pre_submission_cmds + [submit_command]
+             wrapper_script_contents = "\n".join(script_lines)
+             wrapper_script_contents = f"{wrapper_script_contents}\n"
+             if self.slurm_runner_type == "ssh":
+                 wrapper_script = (
+                     f"{slurm_job.slurm_submission_script_remote}_wrapper.sh"
+                 )
+                 self.fractal_ssh.write_remote_file(
+                     path=wrapper_script, content=wrapper_script_contents
+                 )
+             else:
+                 wrapper_script = (
+                     f"{slurm_job.slurm_submission_script_local}_wrapper.sh"
+                 )
+                 with open(wrapper_script, "w") as f:
+                     f.write(wrapper_script_contents)
+             logger.info(f"Now run {wrapper_script=}")
+             sbatch_stdout = self._run_remote_cmd(f"bash {wrapper_script}")
+
+         # Retrieve the SLURM job ID from the sbatch output
+         logger.info(f"[_submit_single_sbatch] {sbatch_stdout=}")
+         stdout = sbatch_stdout.strip("\n")
+         submitted_job_id = int(stdout)
+         slurm_job.slurm_job_id = str(submitted_job_id)
+
+         # Add job to self.jobs
+         self.jobs[slurm_job.slurm_job_id] = slurm_job
+         logger.info(
+             "[_submit_single_sbatch] Added "
+             f"{slurm_job.slurm_job_id} to self.jobs."
+         )
+         logger.info("[_submit_single_sbatch] END")
+
+     def _copy_files_from_remote_to_local(
+         self,
+         slurm_job: SlurmJob,
+     ) -> None:
+         raise NotImplementedError("Implement in child class.")
+
+     def _check_slurm_account(self) -> None:
+         """
+         Check that the SLURM account is not set in `common_script_lines`.
+         """
+         try:
+             invalid_line = next(
+                 line
+                 for line in self.common_script_lines
+                 if line.startswith("#SBATCH --account=")
+             )
+             raise RuntimeError(
+                 "Invalid line in `common_script_lines`: "
+                 f"'{invalid_line}'.\n"
+                 "SLURM account must be set via the request body of the "
+                 "apply-workflow endpoint, or by modifying the user properties."
+             )
+         except StopIteration:
+             pass
+
+     def _postprocess_single_task(
+         self, *, task: SlurmTask
+     ) -> tuple[Any, Exception]:
+         try:
+             with open(task.output_pickle_file_local, "rb") as f:
+                 outdata = f.read()
+             success, output = cloudpickle.loads(outdata)
+             if success:
+                 result = output
+                 return result, None
+             else:
+                 exception = _handle_exception_proxy(output)
+                 return None, exception
+         except Exception as e:
+             exception = JobExecutionError(f"ERROR, {str(e)}")
+             return None, exception
+         finally:
+             pass
+             # FIXME: Re-include unlinks of pickle files
+             # Path(task.input_pickle_file_local).unlink(missing_ok=True)
+             # Path(task.output_pickle_file_local).unlink(missing_ok=True)
+
+     def is_shutdown(self) -> bool:
+         # FIXME: shutdown is not implemented
+         return self.shutdown_file.exists()
+
+     @property
+     def job_ids(self) -> list[str]:
+         return list(self.jobs.keys())
+
+     def submit(
+         self,
+         func: callable,
+         parameters: dict[str, Any],
+         history_unit_id: int,
+         task_files: TaskFiles,
+         config: SlurmConfig,
+         task_type: Literal[
+             "non_parallel",
+             "converter_non_parallel",
+             "compound",
+             "converter_compound",
+         ],
+     ) -> tuple[Any, Exception]:
+
+         logger.info("[submit] START")
+
+         workdir_local = task_files.wftask_subfolder_local
+         workdir_remote = task_files.wftask_subfolder_remote
+
+         if self.jobs != {}:
+             raise JobExecutionError("Unexpected branch: jobs should be empty.")
+
+         if self.is_shutdown():
+             raise JobExecutionError("Cannot continue after shutdown.")
+
+         # Validation phase
+         self.validate_submit_parameters(
+             parameters=parameters,
+             task_type=task_type,
+         )
+
+         # Create task subfolder
+         logger.info("[submit] Create local/remote folders - START")
+         self._mkdir_local_folder(folder=workdir_local.as_posix())
+         self._mkdir_remote_folder(folder=workdir_remote.as_posix())
+         logger.info("[submit] Create local/remote folders - END")
+
+         # Add prefix to task_files object
+         task_files.prefix = SUBMIT_PREFIX
+
+         # Submission phase
+         slurm_job = SlurmJob(
+             prefix=SUBMIT_PREFIX,
+             workdir_local=workdir_local,
+             workdir_remote=workdir_remote,
+             tasks=[
+                 SlurmTask(
+                     prefix=SUBMIT_PREFIX,
+                     index=0,
+                     component=task_files.component,
+                     parameters=parameters,
+                     workdir_remote=workdir_remote,
+                     workdir_local=workdir_local,
+                     task_files=task_files,
+                 )
+             ],
+         )
+
+         config.parallel_tasks_per_job = 1
+         self._submit_single_sbatch(
+             func,
+             slurm_job=slurm_job,
+             slurm_config=config,
+         )
+         logger.info(f"[submit] END submission phase, {self.job_ids=}")
+
+         # FIXME: replace this sleep with a more precise check
+         settings = Inject(get_settings)
+         sleep_time = settings.FRACTAL_SLURM_INTERVAL_BEFORE_RETRIEVAL
+         logger.warning(f"[submit] Now sleep {sleep_time} (FIXME)")
+         time.sleep(sleep_time)
+
+         # Retrieval phase
+         logger.info("[submit] START retrieval phase")
+         while len(self.jobs) > 0:
+             if self.is_shutdown():
+                 self.scancel_jobs()
+             finished_job_ids = self._get_finished_jobs(job_ids=self.job_ids)
+             logger.info(f"{finished_job_ids=}")
+             with next(get_sync_db()) as db:
+                 for slurm_job_id in finished_job_ids:
+                     logger.info(f"Now process {slurm_job_id=}")
+                     slurm_job = self.jobs.pop(slurm_job_id)
+
+                     self._copy_files_from_remote_to_local(slurm_job)
+                     result, exception = self._postprocess_single_task(
+                         task=slurm_job.tasks[0]
+                     )
+                     if exception is not None:
+                         update_status_of_history_unit(
+                             history_unit_id=history_unit_id,
+                             status=HistoryUnitStatus.FAILED,
+                             db_sync=db,
+                         )
+                     else:
+                         if task_type not in ["compound", "converter_compound"]:
+                             update_status_of_history_unit(
+                                 history_unit_id=history_unit_id,
+                                 status=HistoryUnitStatus.DONE,
+                                 db_sync=db,
+                             )
+
+             time.sleep(self.poll_interval)
+
+         logger.info("[submit] END")
+         return result, exception
+
+     def multisubmit(
+         self,
+         func: callable,
+         list_parameters: list[dict],
+         history_unit_ids: list[int],
+         list_task_files: list[TaskFiles],
+         task_type: Literal["parallel", "compound", "converter_compound"],
+         config: SlurmConfig,
+     ):
+
+         if len(self.jobs) > 0:
+             raise RuntimeError(
+                 f"Cannot run .multisubmit when {len(self.jobs)=}"
+             )
+
+         self.validate_multisubmit_parameters(
+             list_parameters=list_parameters,
+             task_type=task_type,
+             list_task_files=list_task_files,
+         )
+         self.validate_multisubmit_history_unit_ids(
+             history_unit_ids=history_unit_ids,
+             task_type=task_type,
+             list_parameters=list_parameters,
+         )
+
+         logger.info(f"[multisubmit] START, {len(list_parameters)=}")
+
+         workdir_local = list_task_files[0].wftask_subfolder_local
+         workdir_remote = list_task_files[0].wftask_subfolder_remote
+
+         # Create local & remote task subfolders
+         if task_type == "parallel":
+             self._mkdir_local_folder(workdir_local.as_posix())
+             self._mkdir_remote_folder(folder=workdir_remote.as_posix())
+
+         # Execute tasks, in chunks of size `parallel_tasks_per_job`
+         # TODO Pick a data structure for results and exceptions, or review
+         # the interface
+         results: dict[int, Any] = {}
+         exceptions: dict[int, BaseException] = {}
+
+         tot_tasks = len(list_parameters)
+
+         # Set/validate parameters for task batching
+         tasks_per_job, parallel_tasks_per_job = heuristics(
+             # Number of parallel components (always known)
+             tot_tasks=tot_tasks,
+             # Optional WorkflowTask attributes:
+             tasks_per_job=config.tasks_per_job,
+             parallel_tasks_per_job=config.parallel_tasks_per_job,  # noqa
+             # Task requirements (multiple possible sources):
+             cpus_per_task=config.cpus_per_task,
+             mem_per_task=config.mem_per_task_MB,
+             # Fractal configuration variables (soft/hard limits):
+             target_cpus_per_job=config.target_cpus_per_job,
+             target_mem_per_job=config.target_mem_per_job,
+             target_num_jobs=config.target_num_jobs,
+             max_cpus_per_job=config.max_cpus_per_job,
+             max_mem_per_job=config.max_mem_per_job,
+             max_num_jobs=config.max_num_jobs,
+         )
+         config.parallel_tasks_per_job = parallel_tasks_per_job
+         config.tasks_per_job = tasks_per_job
+
+         # Divide arguments in batches of `tasks_per_job` tasks each
+         args_batches = []
+         batch_size = tasks_per_job
+         for ind_chunk in range(0, tot_tasks, batch_size):
+             args_batches.append(
+                 list_parameters[ind_chunk : ind_chunk + batch_size]  # noqa
+             )
+         if len(args_batches) != math.ceil(tot_tasks / tasks_per_job):
+             raise RuntimeError("Something wrong here while batching tasks")
+
+         logger.info(f"START submission phase, {list(self.jobs.keys())=}")
+         for ind_batch, chunk in enumerate(args_batches):
+             prefix = f"{MULTISUBMIT_PREFIX}-{ind_batch:06d}"
+             tasks = []
+             for ind_chunk, parameters in enumerate(chunk):
+                 index = (ind_batch * batch_size) + ind_chunk
+                 current_task_files = list_task_files[index]
+                 current_task_files.prefix = prefix
+                 tasks.append(
+                     SlurmTask(
+                         prefix=prefix,
+                         index=index,
+                         component=current_task_files.component,
+                         workdir_local=workdir_local,
+                         workdir_remote=workdir_remote,
+                         parameters=parameters,
+                         zarr_url=parameters["zarr_url"],
+                         task_files=current_task_files,
+                     ),
+                 )
+
+             slurm_job = SlurmJob(
+                 prefix=prefix,
+                 workdir_local=workdir_local,
+                 workdir_remote=workdir_remote,
+                 tasks=tasks,
+             )
+             self._submit_single_sbatch(
+                 func,
+                 slurm_job=slurm_job,
+                 slurm_config=config,
+             )
+         logger.info(f"END submission phase, {self.job_ids=}")
+
+         # FIXME: replace this sleep with a more precise check
+         settings = Inject(get_settings)
+         sleep_time = settings.FRACTAL_SLURM_INTERVAL_BEFORE_RETRIEVAL
+         logger.warning(f"[multisubmit] Now sleep {sleep_time} (FIXME)")
+         time.sleep(sleep_time)
+
+         # Retrieval phase
+         logger.info("START retrieval phase")
+         while len(self.jobs) > 0:
+             if self.is_shutdown():
+                 self.scancel_jobs()
+             finished_job_ids = self._get_finished_jobs(job_ids=self.job_ids)
+             logger.info(f"{finished_job_ids=}")
+             with next(get_sync_db()) as db:
+                 for slurm_job_id in finished_job_ids:
+                     logger.info(f"Now processing {slurm_job_id=}")
+                     slurm_job = self.jobs.pop(slurm_job_id)
+                     self._copy_files_from_remote_to_local(slurm_job)
+                     for task in slurm_job.tasks:
+                         logger.info(f"Now processing {task.index=}")
+                         result, exception = self._postprocess_single_task(
+                             task=task
+                         )
+
+                         # Note: the relevant done/failed check is based on
+                         # whether `exception is None`. The fact that
+                         # `result is None` is not relevant for this purpose.
+                         if exception is not None:
+                             exceptions[task.index] = exception
+                             if task_type == "parallel":
+                                 update_status_of_history_unit(
+                                     history_unit_id=history_unit_ids[
+                                         task.index
+                                     ],
+                                     status=HistoryUnitStatus.FAILED,
+                                     db_sync=db,
+                                 )
+                         else:
+                             results[task.index] = result
+                             if task_type == "parallel":
+                                 update_status_of_history_unit(
+                                     history_unit_id=history_unit_ids[
+                                         task.index
+                                     ],
+                                     status=HistoryUnitStatus.DONE,
+                                     db_sync=db,
+                                 )
+
+             time.sleep(self.poll_interval)
+         return results, exceptions
+
+     def check_fractal_server_versions(self):
+         """
+         Compare fractal-server versions of local/remote Python interpreters.
+         """
+
+         # Skip check when the local and remote interpreters are the same
+         # (notably for some sudo-slurm deployments)
+         if self.python_worker_interpreter == sys.executable:
+             return
+
+         # Fetch remote fractal-server version
+         cmd = (
+             f"{self.python_worker_interpreter} "
+             "-m fractal_server.app.runner.versions"
+         )
+         stdout = self._run_remote_cmd(cmd)
+         remote_version = json.loads(stdout.strip("\n"))["fractal_server"]
+
+         # Verify local/remote version match
+         if remote_version != __VERSION__:
+             error_msg = (
+                 "Fractal-server version mismatch.\n"
+                 "Local interpreter: "
+                 f"({sys.executable}): {__VERSION__}.\n"
+                 "Remote interpreter: "
+                 f"({self.python_worker_interpreter}): {remote_version}."
+             )
+             logger.error(error_msg)
+             raise RuntimeError(error_msg)
+
+     def scancel_jobs(self) -> None:
+         logger.info("[scancel_jobs] START")
+
+         if self.jobs:
+             scancel_string = " ".join(self.job_ids)
+             scancel_cmd = f"scancel {scancel_string}"
+             logger.warning(f"Now scancel-ing SLURM jobs {scancel_string}")
+             try:
+                 self._run_remote_cmd(scancel_cmd)
+             except Exception as e:
+                 logger.warning(
+                     "[scancel_jobs] `scancel` command failed. "
+                     f"Original error:\n{str(e)}"
+                 )
+
+         logger.info("[scancel_jobs] END")