fractal-server 2.14.0a9__py3-none-any.whl → 2.14.0a11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. fractal_server/__init__.py +1 -1
  2. fractal_server/app/models/v2/dataset.py +0 -10
  3. fractal_server/app/models/v2/job.py +3 -0
  4. fractal_server/app/routes/api/v2/__init__.py +2 -0
  5. fractal_server/app/routes/api/v2/history.py +14 -9
  6. fractal_server/app/routes/api/v2/images.py +5 -2
  7. fractal_server/app/routes/api/v2/submit.py +16 -14
  8. fractal_server/app/routes/api/v2/verify_image_types.py +64 -0
  9. fractal_server/app/routes/api/v2/workflow.py +11 -7
  10. fractal_server/app/runner/components.py +0 -3
  11. fractal_server/app/runner/exceptions.py +4 -0
  12. fractal_server/app/runner/executors/base_runner.py +16 -17
  13. fractal_server/app/runner/executors/local/{_local_config.py → get_local_config.py} +0 -7
  14. fractal_server/app/runner/executors/local/runner.py +117 -58
  15. fractal_server/app/runner/executors/{slurm_sudo → slurm_common}/_check_jobs_status.py +4 -0
  16. fractal_server/app/runner/executors/slurm_ssh/_check_job_status_ssh.py +67 -0
  17. fractal_server/app/runner/executors/slurm_ssh/executor.py +7 -5
  18. fractal_server/app/runner/executors/slurm_ssh/runner.py +707 -0
  19. fractal_server/app/runner/executors/slurm_sudo/runner.py +265 -114
  20. fractal_server/app/runner/task_files.py +8 -0
  21. fractal_server/app/runner/v2/__init__.py +0 -365
  22. fractal_server/app/runner/v2/_local.py +4 -2
  23. fractal_server/app/runner/v2/_slurm_ssh.py +4 -2
  24. fractal_server/app/runner/v2/_slurm_sudo.py +4 -2
  25. fractal_server/app/runner/v2/db_tools.py +87 -0
  26. fractal_server/app/runner/v2/runner.py +83 -89
  27. fractal_server/app/runner/v2/runner_functions.py +279 -436
  28. fractal_server/app/runner/v2/runner_functions_low_level.py +37 -39
  29. fractal_server/app/runner/v2/submit_workflow.py +366 -0
  30. fractal_server/app/runner/v2/task_interface.py +31 -0
  31. fractal_server/app/schemas/v2/dataset.py +4 -71
  32. fractal_server/app/schemas/v2/dumps.py +6 -5
  33. fractal_server/app/schemas/v2/job.py +6 -3
  34. fractal_server/migrations/versions/47351f8c7ebc_drop_dataset_filters.py +50 -0
  35. fractal_server/migrations/versions/e81103413827_add_job_type_filters.py +36 -0
  36. {fractal_server-2.14.0a9.dist-info → fractal_server-2.14.0a11.dist-info}/METADATA +1 -1
  37. {fractal_server-2.14.0a9.dist-info → fractal_server-2.14.0a11.dist-info}/RECORD +40 -36
  38. fractal_server/app/runner/executors/local/_submit_setup.py +0 -46
  39. fractal_server/app/runner/executors/slurm_common/_submit_setup.py +0 -84
  40. fractal_server/app/runner/v2/_db_tools.py +0 -48
  41. {fractal_server-2.14.0a9.dist-info → fractal_server-2.14.0a11.dist-info}/LICENSE +0 -0
  42. {fractal_server-2.14.0a9.dist-info → fractal_server-2.14.0a11.dist-info}/WHEEL +0 -0
  43. {fractal_server-2.14.0a9.dist-info → fractal_server-2.14.0a11.dist-info}/entry_points.txt +0 -0
fractal_server/app/runner/executors/slurm_sudo/runner.py
@@ -6,19 +6,22 @@ import shlex
  import subprocess # nosec
  import sys
  import time
+ from copy import copy
  from pathlib import Path
  from typing import Any
+ from typing import Literal
  from typing import Optional

  import cloudpickle
  from pydantic import BaseModel
  from pydantic import ConfigDict

- from ._check_jobs_status import get_finished_jobs
+ from ..slurm_common._check_jobs_status import get_finished_jobs
+ from ..slurm_common._check_jobs_status import run_squeue
  from ._subprocess_run_as_user import _mkdir_as_user
  from ._subprocess_run_as_user import _run_command_as_user
  from fractal_server import __VERSION__
- from fractal_server.app.runner.components import _COMPONENT_KEY_
+ from fractal_server.app.db import get_sync_db
  from fractal_server.app.runner.exceptions import JobExecutionError
  from fractal_server.app.runner.exceptions import TaskExecutionError
  from fractal_server.app.runner.executors.base_runner import BaseRunner
@@ -30,7 +33,8 @@ from fractal_server.app.runner.executors.slurm_common._slurm_config import (
  )
  from fractal_server.app.runner.filenames import SHUTDOWN_FILENAME
  from fractal_server.app.runner.task_files import TaskFiles
- from fractal_server.app.schemas.v2.task import TaskTypeType
+ from fractal_server.app.runner.v2.db_tools import update_status_of_history_unit
+ from fractal_server.app.schemas.v2 import HistoryUnitStatus
  from fractal_server.config import get_settings
  from fractal_server.logger import set_logger
  from fractal_server.syringe import Inject
@@ -97,40 +101,68 @@ class SlurmJob(BaseModel):
  tasks: list[SlurmTask]

  @property
- def slurm_log_file_local(self) -> str:
+ def slurm_submission_script_local(self) -> str:
+ return (
+ self.workdir_local / f"slurm-{self.label}-submit.sh"
+ ).as_posix()
+
+ @property
+ def slurm_submission_script_remote(self) -> str:
+ return (
+ self.workdir_remote / f"slurm-{self.label}-submit.sh"
+ ).as_posix()
+
+ @property
+ def slurm_stdout_remote(self) -> str:
  if self.slurm_job_id:
  return (
- self.workdir_local
- / f"slurm-{self.label}-{self.slurm_job_id}.log"
+ self.workdir_remote
+ / f"slurm-{self.label}-{self.slurm_job_id}.out"
  ).as_posix()
+
  else:
  return (
- self.workdir_local / f"slurm-{self.label}-%j.log"
+ self.workdir_remote / f"slurm-{self.label}-%j.out"
  ).as_posix()

  @property
- def slurm_log_file_remote(self) -> str:
+ def slurm_stderr_remote(self) -> str:
  if self.slurm_job_id:
  return (
  self.workdir_remote
- / f"slurm-{self.label}-{self.slurm_job_id}.log"
+ / f"slurm-{self.label}-{self.slurm_job_id}.err"
  ).as_posix()
+
  else:
  return (
- self.workdir_remote / f"slurm-{self.label}-%j.log"
+ self.workdir_remote / f"slurm-{self.label}-%j.err"
  ).as_posix()

  @property
- def slurm_submission_script_local(self) -> str:
- return (
- self.workdir_local / f"slurm-{self.label}-submit.sh"
- ).as_posix()
+ def slurm_stdout_local(self) -> str:
+ if self.slurm_job_id:
+ return (
+ self.workdir_local
+ / f"slurm-{self.label}-{self.slurm_job_id}.out"
+ ).as_posix()
+
+ else:
+ return (
+ self.workdir_local / f"slurm-{self.label}-%j.out"
+ ).as_posix()

  @property
- def slurm_submission_script_remote(self) -> str:
- return (
- self.workdir_remote / f"slurm-{self.label}-submit.sh"
- ).as_posix()
+ def slurm_stderr_local(self) -> str:
+ if self.slurm_job_id:
+ return (
+ self.workdir_local
+ / f"slurm-{self.label}-{self.slurm_job_id}.err"
+ ).as_posix()
+
+ else:
+ return (
+ self.workdir_local / f"slurm-{self.label}-%j.err"
+ ).as_posix()

  @property
  def log_files_local(self) -> list[str]:
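Note: the hunk above replaces the single slurm_log_file_{local,remote} pair with four properties that keep stdout and stderr separate, on both the local and the remote side. A minimal sketch of the naming scheme (the workdir/label/job-id values below are hypothetical, only the f-string patterns come from the diff):

    from pathlib import Path

    # Hypothetical values, for illustration only.
    workdir_remote = Path("/shared/scratch/job-42/task-0")
    label = "0"
    slurm_job_id = "12345"

    # Before submission the job id is unknown, so SLURM's %j placeholder is used;
    # once the id is known, the concrete value replaces it.
    stdout_pattern = (workdir_remote / f"slurm-{label}-%j.out").as_posix()
    stdout_actual = (workdir_remote / f"slurm-{label}-{slurm_job_id}.out").as_posix()
    stderr_actual = (workdir_remote / f"slurm-{label}-{slurm_job_id}.err").as_posix()

    print(stdout_pattern)  # /shared/scratch/job-42/task-0/slurm-0-%j.out
    print(stdout_actual)   # /shared/scratch/job-42/task-0/slurm-0-12345.out
    print(stderr_actual)   # /shared/scratch/job-42/task-0/slurm-0-12345.err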
@@ -276,9 +308,7 @@ class RunnerSlurmSudo(BaseRunner):
  slurm_job: SlurmJob,
  slurm_config: SlurmConfig,
  ) -> str:
- # if len(slurm_job.tasks) > 1:
- # raise NotImplementedError()
-
+ logger.debug("[_submit_single_sbatch] START")
  # Prepare input pickle(s)
  versions = dict(
  python=sys.version_info[:3],
@@ -287,44 +317,87 @@ class RunnerSlurmSudo(BaseRunner):
  )
  for task in slurm_job.tasks:
  _args = []
- _kwargs = dict(parameters=task.parameters)
+ _kwargs = dict(
+ parameters=task.parameters,
+ remote_files=task.task_files.remote_files_dict,
+ )
  funcser = cloudpickle.dumps((versions, func, _args, _kwargs))
  with open(task.input_pickle_file_local, "wb") as f:
  f.write(funcser)
-
+ logger.debug(
+ "[_submit_single_sbatch] Written "
+ f"{task.input_pickle_file_local=}"
+ )
  # Prepare commands to be included in SLURM submission script
-
- preamble_lines = [
- "#!/bin/bash",
- "#SBATCH --partition=main",
- "#SBATCH --ntasks=1",
- "#SBATCH --cpus-per-task=1",
- "#SBATCH --mem=10M",
- f"#SBATCH --err={slurm_job.slurm_log_file_remote}",
- f"#SBATCH --out={slurm_job.slurm_log_file_remote}",
- f"#SBATCH -D {slurm_job.workdir_remote}",
- "#SBATCH --job-name=test",
- "\n",
- ]
-
+ settings = Inject(get_settings)
+ python_worker_interpreter = (
+ settings.FRACTAL_SLURM_WORKER_PYTHON or sys.executable
+ )
  cmdlines = []
  for task in slurm_job.tasks:
- cmd = (
- f"{self.python_worker_interpreter}"
- " -m fractal_server.app.runner.executors.slurm_common.remote "
- f"--input-file {task.input_pickle_file_local} "
- f"--output-file {task.output_pickle_file_remote}"
- )
- cmdlines.append("whoami")
+ input_pickle_file = task.input_pickle_file_local
+ output_pickle_file = task.output_pickle_file_remote
  cmdlines.append(
- f"srun --ntasks=1 --cpus-per-task=1 --mem=10MB {cmd} &"
+ (
+ f"{python_worker_interpreter}"
+ " -m fractal_server.app.runner."
+ "executors.slurm_common.remote "
+ f"--input-file {input_pickle_file} "
+ f"--output-file {output_pickle_file}"
+ )
  )
- cmdlines.append("wait\n")
+
+ # ...
+ num_tasks_max_running = slurm_config.parallel_tasks_per_job
+ mem_per_task_MB = slurm_config.mem_per_task_MB
+
+ # Set ntasks
+ ntasks = min(len(cmdlines), num_tasks_max_running)
+ slurm_config.parallel_tasks_per_job = ntasks
+
+ # Prepare SLURM preamble based on SlurmConfig object
+ script_lines = slurm_config.to_sbatch_preamble(
+ remote_export_dir=self.user_cache_dir
+ )
+
+ # Extend SLURM preamble with variable which are not in SlurmConfig, and
+ # fix their order
+ script_lines.extend(
+ [
+ f"#SBATCH --out={slurm_job.slurm_stdout_remote}",
+ f"#SBATCH --err={slurm_job.slurm_stderr_remote}",
+ f"#SBATCH -D {slurm_job.workdir_remote}",
+ ]
+ )
+ script_lines = slurm_config.sort_script_lines(script_lines)
+ logger.debug(script_lines)
+
+ # Always print output of `uname -n` and `pwd`
+ script_lines.append(
+ '"Hostname: `uname -n`; current directory: `pwd`"\n'
+ )
+
+ # Complete script preamble
+ script_lines.append("\n")
+
+ # Include command lines
+ tmp_list_commands = copy(cmdlines)
+ while tmp_list_commands:
+ if tmp_list_commands:
+ cmd = tmp_list_commands.pop(0) # take first element
+ script_lines.append(
+ "srun --ntasks=1 --cpus-per-task=$SLURM_CPUS_PER_TASK "
+ f"--mem={mem_per_task_MB}MB "
+ f"{cmd} &"
+ )
+ script_lines.append("wait\n")
+
+ script = "\n".join(script_lines)

  # Write submission script
- submission_script_contents = "\n".join(preamble_lines + cmdlines)
+ # submission_script_contents = "\n".join(preamble_lines + cmdlines)
  with open(slurm_job.slurm_submission_script_local, "w") as f:
- f.write(submission_script_contents)
+ f.write(script)

  # Run sbatch
  pre_command = f"sudo --set-home --non-interactive -u {self.slurm_user}"
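Note: in the new _submit_single_sbatch body above, the hard-coded preamble is gone; the #SBATCH preamble now comes from SlurmConfig.to_sbatch_preamble(), stdout/stderr/workdir directives are appended, and every worker command becomes a backgrounded srun step followed by a single wait. A simplified, self-contained sketch of that assembly (function and variable names here are illustrative, not the package API):

    from copy import copy

    def build_script(preamble: list[str], cmdlines: list[str], mem_per_task_MB: int) -> str:
        """Assemble an sbatch script: preamble first, then one backgrounded srun per command."""
        script_lines = copy(preamble)
        script_lines.append("\n")
        for cmd in cmdlines:
            script_lines.append(
                "srun --ntasks=1 --cpus-per-task=$SLURM_CPUS_PER_TASK "
                f"--mem={mem_per_task_MB}MB {cmd} &"
            )
        script_lines.append("wait\n")
        return "\n".join(script_lines)

    preamble = ["#!/bin/bash", "#SBATCH --ntasks=2", "#SBATCH --cpus-per-task=1"]
    commands = [
        "python -m fractal_server.app.runner.executors.slurm_common.remote "
        f"--input-file in-{i}.pickle --output-file out-{i}.pickle"
        for i in range(2)
    ]
    print(build_script(preamble, commands, mem_per_task_MB=100))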
@@ -350,8 +423,10 @@ class RunnerSlurmSudo(BaseRunner):
  """
  Note: this would differ for SSH
  """
+ logger.debug(f"[_copy_files_from_remote_to_local] {job.slurm_job_id=}")
  source_target_list = [
- (job.slurm_log_file_remote, job.slurm_log_file_local)
+ (job.slurm_stdout_remote, job.slurm_stdout_local),
+ (job.slurm_stderr_remote, job.slurm_stderr_local),
  ]
  for task in job.tasks:
  source_target_list.extend(
@@ -419,21 +494,22 @@ class RunnerSlurmSudo(BaseRunner):
  self,
  func: callable,
  parameters: dict[str, Any],
- history_item_id: int,
+ history_unit_id: int,
  task_files: TaskFiles,
- slurm_config: SlurmConfig,
- task_type: TaskTypeType,
+ task_type: Literal[
+ "non_parallel",
+ "converter_non_parallel",
+ "compound",
+ "converter_compound",
+ ],
+ config: SlurmConfig,
  ) -> tuple[Any, Exception]:
- workdir_local = task_files.wftask_subfolder_local
- workdir_remote = task_files.wftask_subfolder_remote

- task_files = TaskFiles(
- **task_files.model_dump(
- exclude={"component"},
- ),
- component=parameters[_COMPONENT_KEY_],
- )
+ if len(self.jobs) > 0:
+ raise RuntimeError(f"Cannot run .submit when {len(self.jobs)=}")

+ workdir_local = task_files.wftask_subfolder_local
+ workdir_remote = task_files.wftask_subfolder_remote
  if self.jobs != {}:
  raise JobExecutionError("Unexpected branch: jobs should be empty.")

@@ -441,7 +517,7 @@ class RunnerSlurmSudo(BaseRunner):
  raise JobExecutionError("Cannot continue after shutdown.")

  # Validation phase
- self.validate_submit_parameters(parameters)
+ self.validate_submit_parameters(parameters, task_type=task_type)

  # Create task subfolder
  original_umask = os.umask(0)
@@ -460,7 +536,7 @@ class RunnerSlurmSudo(BaseRunner):
  tasks=[
  SlurmTask(
  index=0,
- component="0",
+ component=task_files.component,
  parameters=parameters,
  workdir_remote=workdir_remote,
  workdir_local=workdir_local,
@@ -468,25 +544,57 @@ class RunnerSlurmSudo(BaseRunner):
  )
  ],
  ) # TODO: replace with actual values (BASED ON TASKFILES)
+
+ config.parallel_tasks_per_job = 1
  self._submit_single_sbatch(
  func,
  slurm_job=slurm_job,
- slurm_config=slurm_config,
+ slurm_config=config,
  )
-
- # LOGFILE = task_files.log_file_local
+ logger.debug("END SUBMISSION PHASE")
+ logger.debug(f"{self.jobs=}")
+ logger.debug(f"{self.job_ids=}")
+
+ # FIXME
+ jobs_that_started = set()
+ while len(jobs_that_started) != len(self.job_ids):
+ logger.debug("CALL SQUEUE")
+ res = run_squeue(self.job_ids)
+ new_jobs = set(out.split()[0] for out in res.stdout.splitlines())
+ jobs_that_started = jobs_that_started.union(new_jobs)
+ logger.debug(f"{new_jobs=}")
+ logger.debug(f"{len(jobs_that_started)=}")
+
+ logger.debug("START RETRIEVAL PHASE")

  # Retrieval phase
  while len(self.jobs) > 0:
  if self.is_shutdown():
  self.scancel_jobs()
  finished_job_ids = get_finished_jobs(job_ids=self.job_ids)
- for slurm_job_id in finished_job_ids:
- slurm_job = self.jobs.pop(slurm_job_id)
- self._copy_files_from_remote_to_local(slurm_job)
- result, exception = self._postprocess_single_task(
- task=slurm_job.tasks[0]
- )
+ logger.debug(f"{finished_job_ids=}")
+ with next(get_sync_db()) as db:
+ for slurm_job_id in finished_job_ids:
+ logger.debug(f"Now process {slurm_job_id=}")
+ slurm_job = self.jobs.pop(slurm_job_id)
+ self._copy_files_from_remote_to_local(slurm_job)
+ result, exception = self._postprocess_single_task(
+ task=slurm_job.tasks[0]
+ )
+ if result is not None:
+ if task_type not in ["compound", "converter_compound"]:
+ update_status_of_history_unit(
+ history_unit_id=history_unit_id,
+ status=HistoryUnitStatus.DONE,
+ db_sync=db,
+ )
+ if exception is not None:
+ update_status_of_history_unit(
+ history_unit_id=history_unit_id,
+ status=HistoryUnitStatus.FAILED,
+ db_sync=db,
+ )
+
  time.sleep(self.slurm_poll_interval)

  return result, exception
@@ -495,19 +603,38 @@ class RunnerSlurmSudo(BaseRunner):
  self,
  func: callable,
  list_parameters: list[dict],
- history_item_id: int,
- task_files: TaskFiles,
- slurm_config: SlurmConfig,
- task_type: TaskTypeType,
+ history_unit_ids: list[int],
+ list_task_files: list[TaskFiles],
+ task_type: Literal["parallel", "compound", "converter_compound"],
+ config: SlurmConfig,
  ):
- # self.scancel_jobs()
+
+ if len(self.jobs) > 0:
+ raise RuntimeError(f"Cannot run .submit when {len(self.jobs)=}")
+
+ if task_type in ["compound", "converter_compound"]:
+ if len(history_unit_ids) != 1:
+ raise NotImplementedError(
+ "We are breaking the assumption that compound/multisubmit "
+ "is associated to a single HistoryUnit. This is not "
+ "supported."
+ )
+ elif task_type == "parallel" and len(history_unit_ids) != len(
+ list_parameters
+ ):
+ raise ValueError(
+ f"{len(history_unit_ids)=} differs from "
+ f"{len(list_parameters)=}."
+ )

  self.validate_multisubmit_parameters(
- list_parameters=list_parameters, task_type=task_type
+ list_parameters=list_parameters,
+ task_type=task_type,
+ list_task_files=list_task_files,
  )

- workdir_local = task_files.wftask_subfolder_local
- workdir_remote = task_files.wftask_subfolder_remote
+ workdir_local = list_task_files[0].wftask_subfolder_local
+ workdir_remote = list_task_files[0].wftask_subfolder_remote

  # Create local&remote task subfolders
  if task_type not in ["converter_compound", "compound"]:
@@ -525,7 +652,7 @@ class RunnerSlurmSudo(BaseRunner):
  results: dict[int, Any] = {}
  exceptions: dict[int, BaseException] = {}

- original_task_files = task_files
+ original_task_files = list_task_files
  tot_tasks = len(list_parameters)

  # Set/validate parameters for task batching
@@ -533,21 +660,21 @@ class RunnerSlurmSudo(BaseRunner):
  # Number of parallel components (always known)
  tot_tasks=tot_tasks,
  # Optional WorkflowTask attributes:
- tasks_per_job=slurm_config.tasks_per_job,
- parallel_tasks_per_job=slurm_config.parallel_tasks_per_job, # noqa
+ tasks_per_job=config.tasks_per_job,
+ parallel_tasks_per_job=config.parallel_tasks_per_job, # noqa
  # Task requirements (multiple possible sources):
- cpus_per_task=slurm_config.cpus_per_task,
- mem_per_task=slurm_config.mem_per_task_MB,
+ cpus_per_task=config.cpus_per_task,
+ mem_per_task=config.mem_per_task_MB,
  # Fractal configuration variables (soft/hard limits):
- target_cpus_per_job=slurm_config.target_cpus_per_job,
- target_mem_per_job=slurm_config.target_mem_per_job,
- target_num_jobs=slurm_config.target_num_jobs,
- max_cpus_per_job=slurm_config.max_cpus_per_job,
- max_mem_per_job=slurm_config.max_mem_per_job,
- max_num_jobs=slurm_config.max_num_jobs,
+ target_cpus_per_job=config.target_cpus_per_job,
+ target_mem_per_job=config.target_mem_per_job,
+ target_num_jobs=config.target_num_jobs,
+ max_cpus_per_job=config.max_cpus_per_job,
+ max_mem_per_job=config.max_mem_per_job,
+ max_num_jobs=config.max_num_jobs,
  )
- slurm_config.parallel_tasks_per_job = parallel_tasks_per_job
- slurm_config.tasks_per_job = tasks_per_job
+ config.parallel_tasks_per_job = parallel_tasks_per_job
+ config.tasks_per_job = tasks_per_job

  # Divide arguments in batches of `tasks_per_job` tasks each
  args_batches = []
@@ -561,24 +688,18 @@ class RunnerSlurmSudo(BaseRunner):

  logger.info(f"START submission phase, {list(self.jobs.keys())=}")
  for ind_batch, chunk in enumerate(args_batches):
- # TODO: replace with actual values
  tasks = []
  for ind_chunk, parameters in enumerate(chunk):
- component = parameters[_COMPONENT_KEY_]
+ index = (ind_batch * batch_size) + ind_chunk
  tasks.append(
  SlurmTask(
- index=(ind_batch * batch_size) + ind_chunk,
- component=component,
+ index=index,
+ component=original_task_files[index].component,
  workdir_local=workdir_local,
  workdir_remote=workdir_remote,
  parameters=parameters,
  zarr_url=parameters["zarr_url"],
- task_files=TaskFiles(
- **original_task_files.model_dump(
- exclude={"component"}
- ),
- component=component,
- ),
+ task_files=original_task_files[index],
  ),
  )

@@ -591,26 +712,56 @@ class RunnerSlurmSudo(BaseRunner):
  self._submit_single_sbatch(
  func,
  slurm_job=slurm_job,
- slurm_config=slurm_config,
+ slurm_config=config,
  )
  logger.info(f"END submission phase, {list(self.jobs.keys())=}")

+ # FIXME
+ jobs_that_started = set()
+ while len(jobs_that_started) != len(self.job_ids):
+ res = run_squeue(self.job_ids)
+ new_jobs = set(out.split()[0] for out in res.stdout.splitlines())
+ jobs_that_started = jobs_that_started.union(new_jobs)
+ logger.debug(f"{new_jobs=}")
+ logger.debug(f"{len(jobs_that_started)=}")
+
  # Retrieval phase
  while len(self.jobs) > 0:
  if self.is_shutdown():
  self.scancel_jobs()
  finished_job_ids = get_finished_jobs(job_ids=self.job_ids)
- for slurm_job_id in finished_job_ids:
- slurm_job = self.jobs.pop(slurm_job_id)
- self._copy_files_from_remote_to_local(slurm_job)
- for task in slurm_job.tasks:
- result, exception = self._postprocess_single_task(
- task=task
- )
- if exception is None:
- results[task.index] = result
- else:
- exceptions[task.index] = exception
+ logger.debug(f"{finished_job_ids=}")
+ with next(get_sync_db()) as db:
+ for slurm_job_id in finished_job_ids:
+ logger.debug(f"Now processing {slurm_job_id=}")
+ slurm_job = self.jobs.pop(slurm_job_id)
+ self._copy_files_from_remote_to_local(slurm_job)
+ for task in slurm_job.tasks:
+ result, exception = self._postprocess_single_task(
+ task=task
+ )
+
+ if result is not None:
+ results[task.index] = result
+ if task_type == "parallel":
+ update_status_of_history_unit(
+ history_unit_id=history_unit_ids[
+ task.index
+ ],
+ status=HistoryUnitStatus.DONE,
+ db_sync=db,
+ )
+ if exception is not None:
+ exceptions[task.index] = exception
+ if task_type == "parallel":
+ update_status_of_history_unit(
+ history_unit_id=history_unit_ids[
+ task.index
+ ],
+ status=HistoryUnitStatus.FAILED,
+ db_sync=db,
+ )
+
  time.sleep(self.slurm_poll_interval)
  return results, exceptions

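Note: both submit and multisubmit now end with the same pattern, waiting until every submitted job shows up in squeue, then polling for finished jobs, copying files back, post-processing each task, and recording a DONE/FAILED HistoryUnitStatus through a sync DB session. A stripped-down sketch of that loop (the runner and DB helpers below are stand-ins for the ones imported in the diff, not the package API):

    import time

    def drain_jobs(runner, poll_interval: float = 1.0) -> dict[int, object]:
        """Poll until all submitted SLURM jobs are done; collect results, record statuses."""
        results: dict[int, object] = {}
        while runner.jobs:                           # still-pending jobs, keyed by SLURM id
            for job_id in runner.get_finished_job_ids():
                job = runner.jobs.pop(job_id)
                runner.copy_files_back(job)          # fetch remote stdout/stderr/output pickles
                for task in job.tasks:
                    result, exc = runner.postprocess(task)
                    if exc is None:
                        results[task.index] = result
                        runner.mark_history_unit(task, status="done")
                    else:
                        runner.mark_history_unit(task, status="failed")
            time.sleep(poll_interval)
        return results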
fractal_server/app/runner/task_files.py
@@ -96,3 +96,11 @@ class TaskFiles(BaseModel):
  return (
  self.wftask_subfolder_remote / f"{self.component}-metadiff.json"
  ).as_posix()
+
+ @property
+ def remote_files_dict(self) -> dict[str, str]:
+ return dict(
+ args_file_remote=self.args_file_remote,
+ metadiff_file_remote=self.metadiff_file_remote,
+ log_file_remote=self.log_file_remote,
+ )
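Note: the new TaskFiles.remote_files_dict property bundles the remote args/metadiff/log paths so that _submit_single_sbatch can ship them to the worker as the remote_files entry of the pickled kwargs. A minimal sketch of the shape of that dictionary (paths are made up; only the "-metadiff.json" suffix appears verbatim in this diff, the other two filenames are hypothetical placeholders):

    from pathlib import Path

    subfolder = Path("/shared/scratch/job-42/task-0")
    component = "0"

    remote_files = dict(
        args_file_remote=(subfolder / f"{component}-args.json").as_posix(),
        metadiff_file_remote=(subfolder / f"{component}-metadiff.json").as_posix(),
        log_file_remote=(subfolder / f"{component}-log.txt").as_posix(),
    )
    print(remote_files["metadiff_file_remote"])
    # /shared/scratch/job-42/task-0/0-metadiff.json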