fractal-server 2.14.0a12__py3-none-any.whl → 2.14.0a14__py3-none-any.whl

This diff compares the contents of two package versions that were publicly released to a supported registry. It is provided for informational purposes only and reflects the changes between the versions as they appear in their public registry.
Files changed (46)
  1. fractal_server/__init__.py +1 -1
  2. fractal_server/app/models/linkusergroup.py +6 -2
  3. fractal_server/app/models/v2/dataset.py +1 -1
  4. fractal_server/app/models/v2/job.py +7 -3
  5. fractal_server/app/models/v2/task_group.py +2 -2
  6. fractal_server/app/models/v2/workflow.py +1 -1
  7. fractal_server/app/models/v2/workflowtask.py +1 -1
  8. fractal_server/app/routes/admin/v2/task_group.py +0 -17
  9. fractal_server/app/routes/api/v2/dataset.py +0 -8
  10. fractal_server/app/routes/api/v2/history.py +112 -27
  11. fractal_server/app/routes/api/v2/images.py +16 -14
  12. fractal_server/app/routes/api/v2/project.py +0 -52
  13. fractal_server/app/routes/api/v2/task_group.py +0 -17
  14. fractal_server/app/routes/api/v2/workflow.py +0 -8
  15. fractal_server/app/routes/auth/group.py +0 -16
  16. fractal_server/app/runner/executors/base_runner.py +5 -0
  17. fractal_server/app/runner/executors/local/runner.py +15 -7
  18. fractal_server/app/runner/executors/slurm_common/_handle_exception_proxy.py +17 -0
  19. fractal_server/app/runner/executors/slurm_common/base_slurm_runner.py +676 -0
  20. fractal_server/app/runner/executors/slurm_common/slurm_job_task_models.py +102 -0
  21. fractal_server/app/runner/executors/slurm_ssh/runner.py +110 -648
  22. fractal_server/app/runner/executors/slurm_sudo/runner.py +32 -661
  23. fractal_server/app/runner/task_files.py +20 -6
  24. fractal_server/app/runner/v2/_slurm_ssh.py +6 -6
  25. fractal_server/app/runner/v2/_slurm_sudo.py +4 -4
  26. fractal_server/app/runner/v2/db_tools.py +1 -0
  27. fractal_server/app/runner/v2/runner.py +4 -0
  28. fractal_server/app/runner/v2/runner_functions.py +2 -2
  29. fractal_server/app/runner/v2/submit_workflow.py +7 -16
  30. fractal_server/app/schemas/v2/__init__.py +3 -1
  31. fractal_server/app/schemas/v2/history.py +27 -2
  32. fractal_server/config.py +6 -2
  33. fractal_server/images/tools.py +23 -0
  34. fractal_server/migrations/versions/5b6007027595_on_cascade.py +250 -0
  35. fractal_server/migrations/versions/fbce16ff4e47_new_history_items.py +2 -2
  36. fractal_server/tasks/v2/utils_background.py +0 -19
  37. {fractal_server-2.14.0a12.dist-info → fractal_server-2.14.0a14.dist-info}/METADATA +1 -1
  38. {fractal_server-2.14.0a12.dist-info → fractal_server-2.14.0a14.dist-info}/RECORD +41 -42
  39. fractal_server/app/runner/executors/slurm_common/_check_jobs_status.py +0 -77
  40. fractal_server/app/runner/executors/slurm_ssh/_check_job_status_ssh.py +0 -67
  41. fractal_server/app/runner/executors/slurm_ssh/_executor_wait_thread.py +0 -126
  42. fractal_server/app/runner/executors/slurm_ssh/_slurm_job.py +0 -116
  43. fractal_server/app/runner/executors/slurm_ssh/executor.py +0 -1386
  44. {fractal_server-2.14.0a12.dist-info → fractal_server-2.14.0a14.dist-info}/LICENSE +0 -0
  45. {fractal_server-2.14.0a12.dist-info → fractal_server-2.14.0a14.dist-info}/WHEEL +0 -0
  46. {fractal_server-2.14.0a12.dist-info → fractal_server-2.14.0a14.dist-info}/entry_points.txt +0 -0
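
The bulk of this release is a refactor of the SLURM executors: the shared submit/poll/retrieve machinery moves into the new slurm_common/base_slurm_runner.py (+676) and the SlurmTask/SlurmJob models into slurm_common/slurm_job_task_models.py (+102), while the SSH and sudo runners (items 21-22) shrink to thin transport-specific subclasses and the five legacy executor modules (items 39-43) are deleted outright. Based only on what the diff below shows, the new contract looks roughly like the following sketch; the body of BaseSlurmRunner is not reproduced in this diff, so its exact signature is an assumption:

    # Sketch of the inferred hierarchy; BaseSlurmRunner's real body lives in
    # slurm_common/base_slurm_runner.py and is not part of this excerpt.
    from pathlib import Path
    from typing import Optional

    class BaseSlurmRunner:  # assumed interface
        def __init__(
            self,
            *,
            slurm_runner_type: str,
            root_dir_local: Path,
            root_dir_remote: Path,
            common_script_lines: Optional[list[str]] = None,
            user_cache_dir: Optional[str] = None,
            poll_interval: Optional[int] = None,
        ) -> None:
            ...

        # Transport hooks that SudoSlurmRunner overrides below:
        def _mkdir_local_folder(self, folder: str) -> None:
            raise NotImplementedError

        def _mkdir_remote_folder(self, folder: str) -> None:
            raise NotImplementedError

        def _run_local_cmd(self, cmd: str) -> str:
            raise NotImplementedError

        def _run_remote_cmd(self, cmd: str) -> str:
            raise NotImplementedError

The diff reproduced below is fractal_server/app/runner/executors/slurm_sudo/runner.py (+32 -661).
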
@@ -1,171 +1,23 @@
-import json
 import logging
-import math
 import os
 import shlex
 import subprocess  # nosec
 import sys
-import time
-from copy import copy
 from pathlib import Path
-from typing import Any
-from typing import Literal
 from typing import Optional
 
-import cloudpickle
-from pydantic import BaseModel
-from pydantic import ConfigDict
-
-from ..slurm_common._check_jobs_status import get_finished_jobs
+from ..slurm_common.base_slurm_runner import BaseSlurmRunner
+from ..slurm_common.slurm_job_task_models import SlurmJob
 from ._subprocess_run_as_user import _mkdir_as_user
 from ._subprocess_run_as_user import _run_command_as_user
-from fractal_server import __VERSION__
-from fractal_server.app.db import get_sync_db
 from fractal_server.app.runner.exceptions import JobExecutionError
-from fractal_server.app.runner.exceptions import TaskExecutionError
-from fractal_server.app.runner.executors.base_runner import BaseRunner
-from fractal_server.app.runner.executors.slurm_common._batching import (
-    heuristics,
-)
-from fractal_server.app.runner.executors.slurm_common._slurm_config import (
-    SlurmConfig,
-)
-from fractal_server.app.runner.filenames import SHUTDOWN_FILENAME
-from fractal_server.app.runner.task_files import TaskFiles
-from fractal_server.app.runner.v2.db_tools import update_status_of_history_unit
-from fractal_server.app.schemas.v2 import HistoryUnitStatus
 from fractal_server.config import get_settings
 from fractal_server.logger import set_logger
 from fractal_server.syringe import Inject
 
 
 logger = set_logger(__name__)
-
-
-def _handle_exception_proxy(proxy):  # FIXME
-    if proxy.exc_type_name == "JobExecutionError":
-        return JobExecutionError(str(proxy))
-    else:
-        kwargs = {}
-        for key in [
-            "workflow_task_id",
-            "workflow_task_order",
-            "task_name",
-        ]:
-            if key in proxy.kwargs.keys():
-                kwargs[key] = proxy.kwargs[key]
-        return TaskExecutionError(proxy.tb, **kwargs)
-
-
-class SlurmTask(BaseModel):
-    model_config = ConfigDict(arbitrary_types_allowed=True)
-    component: str
-    workdir_local: Path
-    workdir_remote: Path
-    parameters: dict[str, Any]
-    zarr_url: Optional[str] = None
-    task_files: TaskFiles
-    index: int
-
-    @property
-    def input_pickle_file_local(self) -> str:
-        return (
-            self.workdir_local / f"{self.component}-input.pickle"
-        ).as_posix()
-
-    @property
-    def output_pickle_file_local(self) -> str:
-        return (
-            self.workdir_local / f"{self.component}-output.pickle"
-        ).as_posix()
-
-    @property
-    def input_pickle_file_remote(self) -> str:
-        return (
-            self.workdir_remote / f"{self.component}-input.pickle"
-        ).as_posix()
-
-    @property
-    def output_pickle_file_remote(self) -> str:
-        return (
-            self.workdir_remote / f"{self.component}-output.pickle"
-        ).as_posix()
-
-
-class SlurmJob(BaseModel):
-    slurm_job_id: Optional[str] = None
-    label: str
-    workdir_local: Path
-    workdir_remote: Path
-    tasks: list[SlurmTask]
-
-    @property
-    def slurm_submission_script_local(self) -> str:
-        return (
-            self.workdir_local / f"slurm-{self.label}-submit.sh"
-        ).as_posix()
-
-    @property
-    def slurm_submission_script_remote(self) -> str:
-        return (
-            self.workdir_remote / f"slurm-{self.label}-submit.sh"
-        ).as_posix()
-
-    @property
-    def slurm_stdout_remote(self) -> str:
-        if self.slurm_job_id:
-            return (
-                self.workdir_remote
-                / f"slurm-{self.label}-{self.slurm_job_id}.out"
-            ).as_posix()
-
-        else:
-            return (
-                self.workdir_remote / f"slurm-{self.label}-%j.out"
-            ).as_posix()
-
-    @property
-    def slurm_stderr_remote(self) -> str:
-        if self.slurm_job_id:
-            return (
-                self.workdir_remote
-                / f"slurm-{self.label}-{self.slurm_job_id}.err"
-            ).as_posix()
-
-        else:
-            return (
-                self.workdir_remote / f"slurm-{self.label}-%j.err"
-            ).as_posix()
-
-    @property
-    def slurm_stdout_local(self) -> str:
-        if self.slurm_job_id:
-            return (
-                self.workdir_local
-                / f"slurm-{self.label}-{self.slurm_job_id}.out"
-            ).as_posix()
-
-        else:
-            return (
-                self.workdir_local / f"slurm-{self.label}-%j.out"
-            ).as_posix()
-
-    @property
-    def slurm_stderr_local(self) -> str:
-        if self.slurm_job_id:
-            return (
-                self.workdir_local
-                / f"slurm-{self.label}-{self.slurm_job_id}.err"
-            ).as_posix()
-
-        else:
-            return (
-                self.workdir_local / f"slurm-{self.label}-%j.err"
-            ).as_posix()
-
-    @property
-    def log_files_local(self) -> list[str]:
-        return [task.task_files.log_file_local for task in self.tasks]
+# FIXME: Transform several logger.info into logger.debug.
 
 
 def _subprocess_run_or_raise(
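
The SlurmTask and SlurmJob models removed above were moved, not dropped: they now live in slurm_common/slurm_job_task_models.py (the +102-line file in the list) and are re-imported at the top of this module. Assuming the fields and properties survived the move unchanged, the path helpers still behave as in the removed code; a small illustration with hypothetical paths:

    from pathlib import Path

    from fractal_server.app.runner.executors.slurm_common.slurm_job_task_models import (
        SlurmJob,
    )

    # Placeholder directories, for illustration only.
    job = SlurmJob(
        label="0",
        workdir_local=Path("/data/local/job0"),
        workdir_remote=Path("/scratch/remote/job0"),
        tasks=[],
    )
    print(job.slurm_stdout_remote)  # /scratch/remote/job0/slurm-0-%j.out
    job.slurm_job_id = "12345"
    print(job.slurm_stdout_remote)  # /scratch/remote/job0/slurm-0-12345.out
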
@@ -190,29 +42,22 @@ def _subprocess_run_or_raise(
         raise JobExecutionError(info=error_msg)
 
 
-class RunnerSlurmSudo(BaseRunner):
+class SudoSlurmRunner(BaseSlurmRunner):
     slurm_user: str
-    slurm_user: str
-    shutdown_file: Path
-    common_script_lines: list[str]
-    user_cache_dir: str
-    root_dir_local: Path
-    root_dir_remote: Path
     slurm_account: Optional[str] = None
-    poll_interval: int
-    python_worker_interpreter: str
-    jobs: dict[str, SlurmJob]
 
     def __init__(
         self,
         *,
-        slurm_user: str,
+        # Common
         root_dir_local: Path,
         root_dir_remote: Path,
-        slurm_account: Optional[str] = None,
         common_script_lines: Optional[list[str]] = None,
         user_cache_dir: Optional[str] = None,
-        slurm_poll_interval: Optional[int] = None,
+        poll_interval: Optional[int] = None,
+        # Specific
+        slurm_account: Optional[str] = None,
+        slurm_user: str,
     ) -> None:
         """
         Set parameters that are the same for different Fractal tasks and for
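
Because every constructor argument sits after the bare `*`, all of them are keyword-only, so regrouping them into "Common" and "Specific" blocks cannot break callers; the visible breaking changes are the class rename (RunnerSlurmSudo -> SudoSlurmRunner) and the rename of slurm_poll_interval to poll_interval. A hypothetical instantiation under the new signature:

    from pathlib import Path

    # Paths and values below are placeholders, not defaults from the package.
    runner = SudoSlurmRunner(
        root_dir_local=Path("/srv/fractal/artifacts/job-1"),
        root_dir_remote=Path("/scratch/fractal/job-1"),
        poll_interval=5,  # was `slurm_poll_interval` in 2.14.0a12
        slurm_user="fractal-worker",
    )
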
@@ -221,208 +66,34 @@ class RunnerSlurmSudo(BaseRunner):
 
         self.slurm_user = slurm_user
         self.slurm_account = slurm_account
-        self.common_script_lines = common_script_lines or []
-
-        # Check that SLURM account is not set here
-        # FIXME: move to little method
-        try:
-            invalid_line = next(
-                line
-                for line in self.common_script_lines
-                if line.startswith("#SBATCH --account=")
-            )
-            raise RuntimeError(
-                "Invalid line in `FractalSlurmExecutor.common_script_lines`: "
-                f"'{invalid_line}'.\n"
-                "SLURM account must be set via the request body of the "
-                "apply-workflow endpoint, or by modifying the user properties."
-            )
-        except StopIteration:
-            pass
-
-        # Check Python versions
         settings = Inject(get_settings)
-        if settings.FRACTAL_SLURM_WORKER_PYTHON is not None:
-            self.check_remote_python_interpreter()
-
-        self.root_dir_local = root_dir_local
-        self.root_dir_remote = root_dir_remote
-
-        # Create folders
-        original_umask = os.umask(0)
-        self.root_dir_local.mkdir(parents=True, exist_ok=True, mode=0o755)
-        os.umask(original_umask)
-        _mkdir_as_user(
-            folder=self.root_dir_remote.as_posix(),
-            user=self.slurm_user,
-        )
-
-        self.user_cache_dir = user_cache_dir
-
-        self.slurm_poll_interval = (
-            slurm_poll_interval or settings.FRACTAL_SLURM_POLL_INTERVAL
-        )
-
-        self.shutdown_file = self.root_dir_local / SHUTDOWN_FILENAME
 
         self.python_worker_interpreter = (
             settings.FRACTAL_SLURM_WORKER_PYTHON or sys.executable
         )
 
-        self.jobs = {}
-
-    def __enter__(self):
-        return self
-
-    def __exit__(self, exc_type, exc_val, exc_tb):
-        return False
-
-    def is_shutdown(self) -> bool:
-        return self.shutdown_file.exists()
-
-    def scancel_jobs(self) -> None:
-        logger.debug("[scancel_jobs] START")
-
-        if self.jobs:
-            scancel_string = " ".join(self.job_ids)
-            scancel_cmd = f"scancel {scancel_string}"
-            logger.warning(f"Now scancel-ing SLURM jobs {scancel_string}")
-            try:
-                _run_command_as_user(
-                    cmd=scancel_cmd,
-                    user=self.slurm_user,
-                    check=True,
-                )
-            except RuntimeError as e:
-                logger.warning(
-                    "[scancel_jobs] `scancel` command failed. "
-                    f"Original error:\n{str(e)}"
-                )
-
-        logger.debug("[scancel_jobs] END")
-
-    def _submit_single_sbatch(
-        self,
-        func,
-        slurm_job: SlurmJob,
-        slurm_config: SlurmConfig,
-    ) -> str:
-        logger.debug("[_submit_single_sbatch] START")
-        # Prepare input pickle(s)
-        versions = dict(
-            python=sys.version_info[:3],
-            cloudpickle=cloudpickle.__version__,
-            fractal_server=__VERSION__,
-        )
-        for task in slurm_job.tasks:
-            _args = []
-            _kwargs = dict(
-                parameters=task.parameters,
-                remote_files=task.task_files.remote_files_dict,
-            )
-            funcser = cloudpickle.dumps((versions, func, _args, _kwargs))
-            with open(task.input_pickle_file_local, "wb") as f:
-                f.write(funcser)
-            logger.debug(
-                "[_submit_single_sbatch] Written "
-                f"{task.input_pickle_file_local=}"
-            )
-        # Prepare commands to be included in SLURM submission script
-        settings = Inject(get_settings)
-        python_worker_interpreter = (
-            settings.FRACTAL_SLURM_WORKER_PYTHON or sys.executable
-        )
-        cmdlines = []
-        for task in slurm_job.tasks:
-            input_pickle_file = task.input_pickle_file_local
-            output_pickle_file = task.output_pickle_file_remote
-            cmdlines.append(
-                (
-                    f"{python_worker_interpreter}"
-                    " -m fractal_server.app.runner."
-                    "executors.slurm_common.remote "
-                    f"--input-file {input_pickle_file} "
-                    f"--output-file {output_pickle_file}"
-                )
-            )
-
-        # ...
-        num_tasks_max_running = slurm_config.parallel_tasks_per_job
-        mem_per_task_MB = slurm_config.mem_per_task_MB
-
-        # Set ntasks
-        ntasks = min(len(cmdlines), num_tasks_max_running)
-        slurm_config.parallel_tasks_per_job = ntasks
-
-        # Prepare SLURM preamble based on SlurmConfig object
-        script_lines = slurm_config.to_sbatch_preamble(
-            remote_export_dir=self.user_cache_dir
-        )
-
-        # Extend SLURM preamble with variable which are not in SlurmConfig, and
-        # fix their order
-        script_lines.extend(
-            [
-                f"#SBATCH --out={slurm_job.slurm_stdout_remote}",
-                f"#SBATCH --err={slurm_job.slurm_stderr_remote}",
-                f"#SBATCH -D {slurm_job.workdir_remote}",
-            ]
+        super().__init__(
+            slurm_runner_type="sudo",
+            root_dir_local=root_dir_local,
+            root_dir_remote=root_dir_remote,
+            common_script_lines=common_script_lines,
+            user_cache_dir=user_cache_dir,
+            poll_interval=poll_interval,
         )
-        script_lines = slurm_config.sort_script_lines(script_lines)
-        logger.debug(script_lines)
-
-        # Always print output of `uname -n` and `pwd`
-        script_lines.append(
-            '"Hostname: `uname -n`; current directory: `pwd`"\n'
-        )
-
-        # Complete script preamble
-        script_lines.append("\n")
-
-        # Include command lines
-        tmp_list_commands = copy(cmdlines)
-        while tmp_list_commands:
-            if tmp_list_commands:
-                cmd = tmp_list_commands.pop(0)  # take first element
-                script_lines.append(
-                    "srun --ntasks=1 --cpus-per-task=$SLURM_CPUS_PER_TASK "
-                    f"--mem={mem_per_task_MB}MB "
-                    f"{cmd} &"
-                )
-        script_lines.append("wait\n")
-
-        script = "\n".join(script_lines)
-
-        # Write submission script
-        # submission_script_contents = "\n".join(preamble_lines + cmdlines)
-        with open(slurm_job.slurm_submission_script_local, "w") as f:
-            f.write(script)
 
-        # Run sbatch
-        pre_command = f"sudo --set-home --non-interactive -u {self.slurm_user}"
-        submit_command = (
-            f"sbatch --parsable {slurm_job.slurm_submission_script_local}"
-        )
-        full_command = f"{pre_command} {submit_command}"
-
-        # Submit SLURM job and retrieve job ID
-        res = _subprocess_run_or_raise(full_command)
-        submitted_job_id = int(res.stdout)
-        slurm_job.slurm_job_id = str(submitted_job_id)
-
-        # Add job to self.jobs
-        self.jobs[slurm_job.slurm_job_id] = slurm_job
-        logger.debug(f"Added {slurm_job.slurm_job_id} to self.jobs.")
+    def _mkdir_local_folder(self, folder: str) -> None:
+        original_umask = os.umask(0)
+        Path(folder).mkdir(parents=True, mode=0o755)
+        os.umask(original_umask)
 
-    @property
-    def job_ids(self) -> list[str]:
-        return list(self.jobs.keys())
+    def _mkdir_remote_folder(self, folder: str) -> None:
+        _mkdir_as_user(folder=folder, user=self.slurm_user)
 
     def _copy_files_from_remote_to_local(self, job: SlurmJob) -> None:
         """
         Note: this would differ for SSH
         """
-        logger.debug(f"[_copy_files_from_remote_to_local] {job.slurm_job_id=}")
+        logger.info(f"[_copy_files_from_remote_to_local] {job.slurm_job_id=}")
         source_target_list = [
             (job.slurm_stdout_remote, job.slurm_stdout_local),
             (job.slurm_stderr_remote, job.slurm_stderr_local),
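
The umask juggling in the new _mkdir_local_folder reproduces the deleted inline code: Path.mkdir(mode=0o755) is filtered through the process umask, so the requested mode only lands verbatim if the umask is cleared first. A standalone illustration of the pattern (adding a try/finally around the restore, which the code in the diff omits):

    import os
    from pathlib import Path

    # Under a restrictive umask such as 0o077, mkdir(mode=0o755) would
    # actually create a 0o700 directory; clearing the umask avoids that.
    original_umask = os.umask(0)
    try:
        Path("/tmp/fractal-demo").mkdir(parents=True, exist_ok=True, mode=0o755)
    finally:
        os.umask(original_umask)
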
@@ -469,315 +140,15 @@ class RunnerSlurmSudo(BaseRunner):
                     f"Original error: {str(e)}"
                 )
 
-    def _postprocess_single_task(
-        self, *, task: SlurmTask
-    ) -> tuple[Any, Exception]:
-        try:
-            with open(task.output_pickle_file_local, "rb") as f:
-                outdata = f.read()
-            success, output = cloudpickle.loads(outdata)
-            if success:
-                result = output
-                return result, None
-            else:
-                exception = _handle_exception_proxy(output)
-                return None, exception
-        except Exception as e:
-            exception = JobExecutionError(f"ERROR, {str(e)}")
-            return None, exception
-        finally:
-            Path(task.input_pickle_file_local).unlink(missing_ok=True)
-            Path(task.output_pickle_file_local).unlink(missing_ok=True)
-
-    def submit(
-        self,
-        func: callable,
-        parameters: dict[str, Any],
-        history_unit_id: int,
-        task_files: TaskFiles,
-        task_type: Literal[
-            "non_parallel",
-            "converter_non_parallel",
-            "compound",
-            "converter_compound",
-        ],
-        config: SlurmConfig,
-    ) -> tuple[Any, Exception]:
-
-        if len(self.jobs) > 0:
-            raise RuntimeError(f"Cannot run .submit when {len(self.jobs)=}")
-
-        workdir_local = task_files.wftask_subfolder_local
-        workdir_remote = task_files.wftask_subfolder_remote
-        if self.jobs != {}:
-            raise JobExecutionError("Unexpected branch: jobs should be empty.")
-
-        if self.is_shutdown():
-            raise JobExecutionError("Cannot continue after shutdown.")
-
-        # Validation phase
-        self.validate_submit_parameters(parameters, task_type=task_type)
-
-        # Create task subfolder
-        original_umask = os.umask(0)
-        workdir_local.mkdir(parents=True, mode=0o755)
-        os.umask(original_umask)
-        _mkdir_as_user(
-            folder=workdir_remote.as_posix(),
+    def _run_remote_cmd(self, cmd: str):
+        res = _run_command_as_user(
+            cmd=cmd,
             user=self.slurm_user,
+            encoding="utf-8",
+            check=True,
         )
+        return res.stdout
 
-        # Submission phase
-        slurm_job = SlurmJob(
-            label="0",
-            workdir_local=workdir_local,
-            workdir_remote=workdir_remote,
-            tasks=[
-                SlurmTask(
-                    index=0,
-                    component=task_files.component,
-                    parameters=parameters,
-                    workdir_remote=workdir_remote,
-                    workdir_local=workdir_local,
-                    task_files=task_files,
-                )
-            ],
-        )
-        config.parallel_tasks_per_job = 1
-        self._submit_single_sbatch(
-            func,
-            slurm_job=slurm_job,
-            slurm_config=config,
-        )
-        logger.info(f"END submission phase, {self.job_ids=}")
-
-        # FIXME: Replace with more robust/efficient logic
-        logger.warning("Now sleep 4 (FIXME)")
-        time.sleep(4)
-
-        # Retrieval phase
-        logger.info("START retrieval phase")
-        while len(self.jobs) > 0:
-            if self.is_shutdown():
-                self.scancel_jobs()
-            finished_job_ids = get_finished_jobs(job_ids=self.job_ids)
-            logger.debug(f"{finished_job_ids=}")
-            with next(get_sync_db()) as db:
-                for slurm_job_id in finished_job_ids:
-                    logger.debug(f"Now process {slurm_job_id=}")
-                    slurm_job = self.jobs.pop(slurm_job_id)
-                    self._copy_files_from_remote_to_local(slurm_job)
-                    result, exception = self._postprocess_single_task(
-                        task=slurm_job.tasks[0]
-                    )
-                    # Note: the relevant done/failed check is based on
-                    # whether `exception is None`. The fact that
-                    # `result is None` is not relevant for this purpose.
-                    if exception is not None:
-                        update_status_of_history_unit(
-                            history_unit_id=history_unit_id,
-                            status=HistoryUnitStatus.FAILED,
-                            db_sync=db,
-                        )
-                    else:
-                        if task_type not in ["compound", "converter_compound"]:
-                            update_status_of_history_unit(
-                                history_unit_id=history_unit_id,
-                                status=HistoryUnitStatus.DONE,
-                                db_sync=db,
-                            )
-
-            time.sleep(self.slurm_poll_interval)
-
-        return result, exception
-
-    def multisubmit(
-        self,
-        func: callable,
-        list_parameters: list[dict],
-        history_unit_ids: list[int],
-        list_task_files: list[TaskFiles],
-        task_type: Literal["parallel", "compound", "converter_compound"],
-        config: SlurmConfig,
-    ):
-
-        if len(self.jobs) > 0:
-            raise RuntimeError(
-                f"Cannot run .multisubmit when {len(self.jobs)=}"
-            )
-
-        self.validate_multisubmit_parameters(
-            list_parameters=list_parameters,
-            task_type=task_type,
-            list_task_files=list_task_files,
-        )
-        self.validate_multisubmit_history_unit_ids(
-            history_unit_ids=history_unit_ids,
-            task_type=task_type,
-            list_parameters=list_parameters,
-        )
-
-        logger.debug(f"[multisubmit] START, {len(list_parameters)=}")
-
-        workdir_local = list_task_files[0].wftask_subfolder_local
-        workdir_remote = list_task_files[0].wftask_subfolder_remote
-
-        # Create local&remote task subfolders
-        if task_type == "parallel":
-            original_umask = os.umask(0)
-            workdir_local.mkdir(parents=True, mode=0o755)
-            os.umask(original_umask)
-            _mkdir_as_user(
-                folder=workdir_remote.as_posix(),
-                user=self.slurm_user,
-            )
-
-        # Execute tasks, in chunks of size `parallel_tasks_per_job`
-        # TODO Pick a data structure for results and exceptions, or review the
-        # interface
-        results: dict[int, Any] = {}
-        exceptions: dict[int, BaseException] = {}
-
-        original_task_files = list_task_files
-        tot_tasks = len(list_parameters)
-
-        # Set/validate parameters for task batching
-        tasks_per_job, parallel_tasks_per_job = heuristics(
-            # Number of parallel components (always known)
-            tot_tasks=tot_tasks,
-            # Optional WorkflowTask attributes:
-            tasks_per_job=config.tasks_per_job,
-            parallel_tasks_per_job=config.parallel_tasks_per_job,  # noqa
-            # Task requirements (multiple possible sources):
-            cpus_per_task=config.cpus_per_task,
-            mem_per_task=config.mem_per_task_MB,
-            # Fractal configuration variables (soft/hard limits):
-            target_cpus_per_job=config.target_cpus_per_job,
-            target_mem_per_job=config.target_mem_per_job,
-            target_num_jobs=config.target_num_jobs,
-            max_cpus_per_job=config.max_cpus_per_job,
-            max_mem_per_job=config.max_mem_per_job,
-            max_num_jobs=config.max_num_jobs,
-        )
-        config.parallel_tasks_per_job = parallel_tasks_per_job
-        config.tasks_per_job = tasks_per_job
-
-        # Divide arguments in batches of `tasks_per_job` tasks each
-        args_batches = []
-        batch_size = tasks_per_job
-        for ind_chunk in range(0, tot_tasks, batch_size):
-            args_batches.append(
-                list_parameters[ind_chunk : ind_chunk + batch_size]  # noqa
-            )
-        if len(args_batches) != math.ceil(tot_tasks / tasks_per_job):
-            raise RuntimeError("Something wrong here while batching tasks")
-
-        logger.info(f"START submission phase, {list(self.jobs.keys())=}")
-        for ind_batch, chunk in enumerate(args_batches):
-            tasks = []
-            for ind_chunk, parameters in enumerate(chunk):
-                index = (ind_batch * batch_size) + ind_chunk
-                tasks.append(
-                    SlurmTask(
-                        index=index,
-                        component=original_task_files[index].component,
-                        workdir_local=workdir_local,
-                        workdir_remote=workdir_remote,
-                        parameters=parameters,
-                        zarr_url=parameters["zarr_url"],
-                        task_files=original_task_files[index],
-                    ),
-                )
-
-            slurm_job = SlurmJob(
-                label=f"{ind_batch:06d}",
-                workdir_local=workdir_local,
-                workdir_remote=workdir_remote,
-                tasks=tasks,
-            )
-            self._submit_single_sbatch(
-                func,
-                slurm_job=slurm_job,
-                slurm_config=config,
-            )
-        logger.info(f"END submission phase, {self.job_ids=}")
-
-        # FIXME: Replace with more robust/efficient logic
-        logger.warning("Now sleep 4 (FIXME)")
-        time.sleep(4)
-
-        # Retrieval phase
-        logger.info("START retrieval phase")
-        while len(self.jobs) > 0:
-            if self.is_shutdown():
-                self.scancel_jobs()
-            finished_job_ids = get_finished_jobs(job_ids=self.job_ids)
-            logger.debug(f"{finished_job_ids=}")
-            with next(get_sync_db()) as db:
-                for slurm_job_id in finished_job_ids:
-                    logger.debug(f"Now processing {slurm_job_id=}")
-                    slurm_job = self.jobs.pop(slurm_job_id)
-                    self._copy_files_from_remote_to_local(slurm_job)
-                    for task in slurm_job.tasks:
-                        logger.debug(f"Now processing {task.index=}")
-                        result, exception = self._postprocess_single_task(
-                            task=task
-                        )
-
-                        # Note: the relevant done/failed check is based on
-                        # whether `exception is None`. The fact that
-                        # `result is None` is not relevant for this purpose.
-                        if exception is not None:
-                            logger.debug(
-                                f"Task {task.index} has an exception."
-                            )  # FIXME # noqa
-                            exceptions[task.index] = exception
-                            if task_type == "parallel":
-                                update_status_of_history_unit(
-                                    history_unit_id=history_unit_ids[
-                                        task.index
-                                    ],
-                                    status=HistoryUnitStatus.FAILED,
-                                    db_sync=db,
-                                )
-                        else:
-                            logger.debug(
-                                f"Task {task.index} has no exception."
-                            )  # FIXME # noqa
-                            results[task.index] = result
-                            if task_type == "parallel":
-                                update_status_of_history_unit(
-                                    history_unit_id=history_unit_ids[
-                                        task.index
-                                    ],
-                                    status=HistoryUnitStatus.DONE,
-                                    db_sync=db,
-                                )
-
-            time.sleep(self.slurm_poll_interval)
-        return results, exceptions
-
-    def check_remote_python_interpreter(self):
-        """
-        Check fractal-server version on the _remote_ Python interpreter.
-        """
-        settings = Inject(get_settings)
-        output = _subprocess_run_or_raise(
-            (
-                f"{settings.FRACTAL_SLURM_WORKER_PYTHON} "
-                "-m fractal_server.app.runner.versions"
-            )
-        )
-        runner_version = json.loads(output.stdout.strip("\n"))[
-            "fractal_server"
-        ]
-        if runner_version != __VERSION__:
-            error_msg = (
-                "Fractal-server version mismatch.\n"
-                "Local interpreter: "
-                f"({sys.executable}): {__VERSION__}.\n"
-                "Remote interpreter: "
-                f"({settings.FRACTAL_SLURM_WORKER_PYTHON}): {runner_version}."
-            )
-            logger.error(error_msg)
-            raise RuntimeError(error_msg)
+    def _run_local_cmd(self, cmd: str):
+        res = _subprocess_run_or_raise(cmd)
+        return res.stdout
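
With these two hooks, the sudo-specific mechanics collapse to a few lines: remote commands are routed through _run_command_as_user (the same `sudo -u <slurm_user>` mechanism as the inline sbatch call removed above), local ones through _subprocess_run_or_raise, and the shared submission/polling logic can stay transport-agnostic in BaseSlurmRunner. Presumably the base class drives submission through these hooks along the following lines (hypothetical; the base-class body is not part of this diff):

    # Hypothetical call site inside BaseSlurmRunner, shown only to motivate
    # the hook signatures; `sbatch --parsable` prints just the job ID.
    stdout = self._run_remote_cmd(
        f"sbatch --parsable {slurm_job.slurm_submission_script_remote}"
    )
    slurm_job.slurm_job_id = stdout.strip()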