fractal-server 2.0.6__py3-none-any.whl → 2.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fractal_server/__init__.py +1 -1
- fractal_server/app/db/__init__.py +1 -1
- fractal_server/app/routes/admin/v1.py +2 -4
- fractal_server/app/routes/admin/v2.py +2 -4
- fractal_server/app/routes/api/v1/_aux_functions.py +24 -0
- fractal_server/app/routes/api/v1/job.py +3 -4
- fractal_server/app/routes/api/v1/project.py +28 -18
- fractal_server/app/routes/api/v2/_aux_functions.py +35 -12
- fractal_server/app/routes/api/v2/job.py +3 -4
- fractal_server/app/routes/api/v2/project.py +21 -0
- fractal_server/app/routes/api/v2/submit.py +36 -15
- fractal_server/app/routes/aux/_job.py +3 -1
- fractal_server/app/routes/aux/_runner.py +3 -3
- fractal_server/app/runner/executors/slurm/executor.py +169 -68
- fractal_server/app/runner/shutdown.py +88 -0
- fractal_server/app/runner/task_files.py +59 -27
- fractal_server/app/runner/v1/__init__.py +113 -64
- fractal_server/app/runner/v1/_common.py +53 -51
- fractal_server/app/runner/v1/_local/__init__.py +12 -11
- fractal_server/app/runner/v1/_local/_submit_setup.py +4 -4
- fractal_server/app/runner/v1/_slurm/__init__.py +16 -16
- fractal_server/app/runner/v1/_slurm/_submit_setup.py +11 -10
- fractal_server/app/runner/v1/_slurm/get_slurm_config.py +6 -6
- fractal_server/app/runner/v2/__init__.py +139 -60
- fractal_server/app/runner/v2/_local/__init__.py +12 -11
- fractal_server/app/runner/v2/_local/_local_config.py +1 -1
- fractal_server/app/runner/v2/_local/_submit_setup.py +4 -4
- fractal_server/app/runner/v2/_local_experimental/__init__.py +155 -0
- fractal_server/app/runner/v2/_local_experimental/_local_config.py +108 -0
- fractal_server/app/runner/v2/_local_experimental/_submit_setup.py +42 -0
- fractal_server/app/runner/v2/_local_experimental/executor.py +156 -0
- fractal_server/app/runner/v2/_slurm/__init__.py +10 -10
- fractal_server/app/runner/v2/_slurm/_submit_setup.py +11 -10
- fractal_server/app/runner/v2/_slurm/get_slurm_config.py +6 -6
- fractal_server/app/runner/v2/runner.py +17 -15
- fractal_server/app/runner/v2/runner_functions.py +38 -38
- fractal_server/app/runner/v2/runner_functions_low_level.py +12 -6
- fractal_server/app/security/__init__.py +4 -5
- fractal_server/config.py +73 -19
- fractal_server/gunicorn_fractal.py +40 -0
- fractal_server/{logger/__init__.py → logger.py} +2 -2
- fractal_server/main.py +45 -26
- fractal_server/migrations/env.py +1 -1
- {fractal_server-2.0.6.dist-info → fractal_server-2.2.0.dist-info}/METADATA +4 -1
- {fractal_server-2.0.6.dist-info → fractal_server-2.2.0.dist-info}/RECORD +48 -43
- fractal_server/logger/gunicorn_logger.py +0 -19
- {fractal_server-2.0.6.dist-info → fractal_server-2.2.0.dist-info}/LICENSE +0 -0
- {fractal_server-2.0.6.dist-info → fractal_server-2.2.0.dist-info}/WHEEL +0 -0
- {fractal_server-2.0.6.dist-info → fractal_server-2.2.0.dist-info}/entry_points.txt +0 -0
fractal_server/app/runner/executors/slurm/executor.py:

@@ -110,12 +110,14 @@ class SlurmJob:
         slurm_file_prefix:
             Prefix for SLURM-job related files (submission script and SLURM
             stdout/stderr); this is also needed in the
-            `
+            `_copy_files_from_remote_to_local` method.
         wftask_file_prefixes:
             Prefix for files that are created as part of the functions
             submitted for execution on the `FractalSlurmExecutor`; this is
-            needed in the `
-            construct the names of per-task input/output pickle files.
+            needed in the `_copy_files_from_remote_to_local` method, and also
+            to construct the names of per-task input/output pickle files.
+        wftask_subfolder_name:
+            Name of the per-task subfolder (e.g. `7_task_name`).
         slurm_script:
             Path of SLURM submission script.
         slurm_stdout:
@@ -145,6 +147,7 @@ class SlurmJob:
     # Per-task attributes
     workerids: tuple[str, ...]
     wftask_file_prefixes: tuple[str, ...]
+    wftask_subfolder_name: str
     input_pickle_files: tuple[Path, ...]
     output_pickle_files: tuple[Path, ...]
     # Slurm configuration
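The new `wftask_subfolder_name` attribute groups all files of one WorkflowTask under a single per-task subfolder. As a rough sketch of the resulting layout (the job directory and worker id are illustrative; only the subfolder pattern and file-name patterns come from this diff):

```python
from pathlib import Path

# Illustrative job directory; the subfolder name follows the
# "<order>_<task_name>" pattern mentioned in the docstring (e.g. "7_task_name").
workflow_dir_local = Path("/tmp/fractal/job_42")
subfolder = workflow_dir_local / "7_task_name"

# File-name patterns taken from the path getters below, with default prefixes:
for name in (
    "cfut_in_worker0.pickle",        # input pickle
    "cfut_out_worker0.pickle",       # output pickle
    "_temp_slurm_submit.sbatch",     # SLURM submission script
    "slurmpy.stdout_slurm_123.out",  # SLURM stdout ("%j" -> SLURM job id)
    "slurmpy.stderr_slurm_123.err",  # SLURM stderr
):
    print(subfolder / name)
```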
@@ -196,9 +199,9 @@ class FractalSlurmExecutor(SlurmExecutor):
         common_script_lines:
             Arbitrary script lines that will always be included in the
             sbatch script
-
+        workflow_dir_local:
             Directory for both the cfut/SLURM and fractal-server files and logs
-
+        workflow_dir_remote:
             Directory for both the cfut/SLURM and fractal-server files and logs
         map_jobid_to_slurm_files:
             Dictionary with paths of slurm-related files for active jobs
@@ -209,17 +212,18 @@ class FractalSlurmExecutor(SlurmExecutor):
     shutdown_file: str
     common_script_lines: list[str]
     user_cache_dir: str
-
-
+    workflow_dir_local: Path
+    workflow_dir_remote: Path
     map_jobid_to_slurm_files: dict[str, tuple[str, str, str]]
     keep_pickle_files: bool
     slurm_account: Optional[str]
+    jobs: dict[str, tuple[Future, SlurmJob]]

     def __init__(
         self,
         slurm_user: str,
-
-
+        workflow_dir_local: Path,
+        workflow_dir_remote: Path,
         shutdown_file: Optional[str] = None,
         user_cache_dir: Optional[str] = None,
         common_script_lines: Optional[list[str]] = None,
@@ -262,14 +266,14 @@ class FractalSlurmExecutor(SlurmExecutor):
         except StopIteration:
             pass

-        self.
+        self.workflow_dir_local = workflow_dir_local
         if not _path_exists_as_user(
-            path=str(
+            path=str(workflow_dir_remote), user=self.slurm_user
         ):
-            logger.info(f"Missing folder {
+            logger.info(f"Missing folder {workflow_dir_remote=}")
         self.user_cache_dir = user_cache_dir

-        self.
+        self.workflow_dir_remote = workflow_dir_remote
         self.map_jobid_to_slurm_files = {}

         # Set the attribute slurm_poll_interval for self.wait_thread (see
@@ -281,7 +285,8 @@ class FractalSlurmExecutor(SlurmExecutor):
         self.wait_thread.slurm_user = self.slurm_user

         self.wait_thread.shutdown_file = (
-            shutdown_file
+            shutdown_file
+            or (self.workflow_dir_local / SHUTDOWN_FILENAME).as_posix()
         )
         self.wait_thread.shutdown_callback = self.shutdown
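The shutdown file now defaults to `<workflow_dir_local>/SHUTDOWN_FILENAME`. A minimal sketch of the sentinel check that the wait thread performs (the function name and the constant's value are assumptions, not taken from this diff):

```python
from pathlib import Path

SHUTDOWN_FILENAME = "shutdown"  # assumed value of the fractal-server constant

def shutdown_requested(workflow_dir_local: Path) -> bool:
    # The wait thread polls for this sentinel file; once it exists,
    # it invokes the executor's shutdown callback.
    return (workflow_dir_local / SHUTDOWN_FILENAME).exists()

print(shutdown_requested(Path("/tmp/fractal/job_42")))
```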
@@ -294,32 +299,64 @@ class FractalSlurmExecutor(SlurmExecutor):
         self.map_jobid_to_slurm_files.pop(jobid)

     def get_input_pickle_file_path(
-        self, arg: str, prefix: Optional[str] = None
+        self, *, arg: str, subfolder_name: str, prefix: Optional[str] = None
     ) -> Path:
+
         prefix = prefix or "cfut"
-
+        output = (
+            self.workflow_dir_local
+            / subfolder_name
+            / f"{prefix}_in_{arg}.pickle"
+        )
+        return output

     def get_output_pickle_file_path(
-        self, arg: str, prefix: Optional[str] = None
+        self, *, arg: str, subfolder_name: str, prefix: Optional[str] = None
     ) -> Path:
         prefix = prefix or "cfut"
-        return
+        return (
+            self.workflow_dir_remote
+            / subfolder_name
+            / f"{prefix}_out_{arg}.pickle"
+        )

-    def get_slurm_script_file_path(
+    def get_slurm_script_file_path(
+        self, *, subfolder_name: str, prefix: Optional[str] = None
+    ) -> Path:
         prefix = prefix or "_temp"
-        return
+        return (
+            self.workflow_dir_local
+            / subfolder_name
+            / f"{prefix}_slurm_submit.sbatch"
+        )

     def get_slurm_stdout_file_path(
-        self,
+        self,
+        *,
+        subfolder_name: str,
+        arg: str = "%j",
+        prefix: Optional[str] = None,
     ) -> Path:
         prefix = prefix or "slurmpy.stdout"
-        return
+        return (
+            self.workflow_dir_remote
+            / subfolder_name
+            / f"{prefix}_slurm_{arg}.out"
+        )

     def get_slurm_stderr_file_path(
-        self,
+        self,
+        *,
+        subfolder_name: str,
+        arg: str = "%j",
+        prefix: Optional[str] = None,
     ) -> Path:
         prefix = prefix or "slurmpy.stderr"
-        return
+        return (
+            self.workflow_dir_remote
+            / subfolder_name
+            / f"{prefix}_slurm_{arg}.err"
+        )

     def submit(
         self,
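All five path getters are now keyword-only and require a `subfolder_name`, so callers cannot silently mix up positional arguments. A self-contained sketch mirroring the new `get_input_pickle_file_path` logic (the directory values are made up; the real method reads `workflow_dir_local` from `self`):

```python
from pathlib import Path
from typing import Optional

def get_input_pickle_file_path(
    workflow_dir_local: Path,
    *,
    arg: str,
    subfolder_name: str,
    prefix: Optional[str] = None,
) -> Path:
    # Same layout as the method in the diff above:
    # <workflow_dir_local>/<subfolder_name>/<prefix>_in_<arg>.pickle
    prefix = prefix or "cfut"
    return workflow_dir_local / subfolder_name / f"{prefix}_in_{arg}.pickle"

path = get_input_pickle_file_path(
    Path("/tmp/fractal/job_42"),
    arg="worker0",
    subfolder_name="7_task_name",
)
print(path)  # /tmp/fractal/job_42/7_task_name/cfut_in_worker0.pickle
```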
@@ -599,6 +636,8 @@ class FractalSlurmExecutor(SlurmExecutor):
             )
             job.single_task_submission = True
             job.wftask_file_prefixes = (task_files.file_prefix,)
+            job.wftask_subfolder_name = task_files.subfolder_name
+
         else:
             if not components or len(components) < 1:
                 raise ValueError(
@@ -613,33 +652,60 @@ class FractalSlurmExecutor(SlurmExecutor):
                 )

             _prefixes = []
+            _subfolder_names = []
             for component in components:
                 if isinstance(component, dict):
                     # This is needed for V2
                     actual_component = component.get(_COMPONENT_KEY_, None)
                 else:
                     actual_component = component
-
-
-
-
-
-
-                ).file_prefix
+                _task_file_paths = get_task_file_paths(
+                    workflow_dir_local=task_files.workflow_dir_local,
+                    workflow_dir_remote=task_files.workflow_dir_remote,
+                    task_name=task_files.task_name,
+                    task_order=task_files.task_order,
+                    component=actual_component,
                 )
+                _prefixes.append(_task_file_paths.file_prefix)
+                _subfolder_names.append(_task_file_paths.subfolder_name)
             job.wftask_file_prefixes = tuple(_prefixes)

+            num_subfolders = len(set(_subfolder_names))
+            if num_subfolders != 1:
+                error_msg_short = (
+                    f"[_submit_job] Subfolder list has {num_subfolders} "
+                    "different values, but it must have only one (since "
+                    "workflow tasks are executed one by one)."
+                )
+                error_msg_detail = (
+                    "[_submit_job] Current unique subfolder names: "
+                    f"{set(_subfolder_names)}"
+                )
+                logger.error(error_msg_short)
+                logger.error(error_msg_detail)
+                raise ValueError(error_msg_short)
+            job.wftask_subfolder_name = _subfolder_names[0]
+
+        # Check that server-side subfolder exists
+        subfolder_path = self.workflow_dir_local / job.wftask_subfolder_name
+        if not subfolder_path.exists():
+            raise FileNotFoundError(
+                f"Missing folder {subfolder_path.as_posix()}."
+            )
+
         # Define I/O pickle file names/paths
         job.input_pickle_files = tuple(
             self.get_input_pickle_file_path(
-                job.workerids[ind],
+                arg=job.workerids[ind],
+                subfolder_name=job.wftask_subfolder_name,
                 prefix=job.wftask_file_prefixes[ind],
             )
             for ind in range(job.num_tasks_tot)
         )
         job.output_pickle_files = tuple(
             self.get_output_pickle_file_path(
-                job.workerids[ind],
+                arg=job.workerids[ind],
+                subfolder_name=job.wftask_subfolder_name,
                 prefix=job.wftask_file_prefixes[ind],
             )
             for ind in range(job.num_tasks_tot)
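Because workflow tasks run one at a time, every component submitted in a single call must resolve to the same subfolder; the check above fails fast otherwise. The validation in isolation:

```python
_subfolder_names = ["7_task_name", "7_task_name", "7_task_name"]

num_subfolders = len(set(_subfolder_names))
if num_subfolders != 1:
    raise ValueError(
        f"Subfolder list has {num_subfolders} different values, "
        "but it must have only one."
    )
wftask_subfolder_name = _subfolder_names[0]
print(wftask_subfolder_name)  # 7_task_name
```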
@@ -647,13 +713,16 @@ class FractalSlurmExecutor(SlurmExecutor):

         # Define SLURM-job file names/paths
         job.slurm_script = self.get_slurm_script_file_path(
-
+            subfolder_name=job.wftask_subfolder_name,
+            prefix=job.slurm_file_prefix,
         )
         job.slurm_stdout = self.get_slurm_stdout_file_path(
-
+            subfolder_name=job.wftask_subfolder_name,
+            prefix=job.slurm_file_prefix,
         )
         job.slurm_stderr = self.get_slurm_stderr_file_path(
-
+            subfolder_name=job.wftask_subfolder_name,
+            prefix=job.slurm_file_prefix,
         )

         # Dump serialized versions+function+args+kwargs to pickle file
@@ -706,10 +775,10 @@ class FractalSlurmExecutor(SlurmExecutor):
         Prepare the `JobExecutionError` for a given job

         This method creates a `JobExecutionError` object and sets its attribute
-        to the appropriate SLURM-related file names. Note that the method
-        always be called after values in `self.map_jobid_to_slurm_files`
-        been updated, so that they point to `self.
-        readable from `fractal-server`.
+        to the appropriate SLURM-related file names. Note that the method
+        should always be called after values in `self.map_jobid_to_slurm_files`
+        have been updated, so that they point to `self.workflow_dir_local`
+        files which are readable from `fractal-server`.

         Arguments:
             jobid:
@@ -758,13 +827,13 @@ class FractalSlurmExecutor(SlurmExecutor):
             if not self.jobs:
                 self.jobs_empty_cond.notify_all()

-        # Copy all relevant files from self.
-        # self.
+        # Copy all relevant files from self.workflow_dir_remote to
+        # self.workflow_dir_local

-        self.
+        self._copy_files_from_remote_to_local(job)

-        # Update the paths to use the files in self.
-        # than the user's ones in self.
+        # Update the paths to use the files in self.workflow_dir_local
+        # (rather than the user's ones in self.workflow_dir_remote)
         with self.jobs_lock:
             self.map_jobid_to_slurm_files[jobid]
             (
@@ -773,10 +842,14 @@ class FractalSlurmExecutor(SlurmExecutor):
                 slurm_stderr_file,
             ) = self.map_jobid_to_slurm_files[jobid]
             new_slurm_stdout_file = str(
-                self.
+                self.workflow_dir_local
+                / job.wftask_subfolder_name
+                / Path(slurm_stdout_file).name
             )
             new_slurm_stderr_file = str(
-                self.
+                self.workflow_dir_local
+                / job.wftask_subfolder_name
+                / Path(slurm_stderr_file).name
             )
         with self.jobs_lock:
             self.map_jobid_to_slurm_files[jobid] = (
@@ -787,7 +860,8 @@ class FractalSlurmExecutor(SlurmExecutor):

         in_paths = job.input_pickle_files
         out_paths = tuple(
-            self.
+            (self.workflow_dir_local / job.wftask_subfolder_name / f.name)
+            for f in job.output_pickle_files
         )

         outputs = []
@@ -908,16 +982,16 @@ class FractalSlurmExecutor(SlurmExecutor):
                 " FractalSlurmExecutor._completion."
             )

-    def
+    def _copy_files_from_remote_to_local(
         self,
         job: SlurmJob,
     ):
         """
         Impersonate the user and copy task-related files

-        For all files in `self.
+        For all files in `self.workflow_dir_remote` that start with
         `job.file_prefix`, read them (with `sudo -u` impersonation) and write
-        them to `self.
+        them to `self.workflow_dir_local`.

         Files to copy:
             * Job-related files (SLURM stderr/stdout files); with prefix
@@ -932,36 +1006,48 @@ class FractalSlurmExecutor(SlurmExecutor):
         Raises:
             JobExecutionError: If a `cat` command fails.
         """
-        logger.debug("
-
+        logger.debug("[_copy_files_from_remote_to_local] Start")
+
+        if self.workflow_dir_remote == self.workflow_dir_local:
+            logger.debug(
+                "[_copy_files_from_remote_to_local] "
+                "workflow_dir_local corresponds to workflow_dir_remote, "
+                "return."
+            )
             return

+        subfolder_name = job.wftask_subfolder_name
         prefixes = set(
             [job.slurm_file_prefix] + list(job.wftask_file_prefixes)
         )

-        logger.debug(f"[_copy_files_from_user_to_server] {prefixes=}")
         logger.debug(
-
+            "[_copy_files_from_remote_to_local] "
+            f"WorkflowTask subfolder_name: {subfolder_name}"
+        )
+        logger.debug(f"[_copy_files_from_remote_to_local] {prefixes=}")
+        logger.debug(
+            "[_copy_files_from_remote_to_local] "
+            f"{str(self.workflow_dir_remote)=}"
         )

         for prefix in prefixes:

             if prefix == job.slurm_file_prefix:
                 files_to_copy = _glob_as_user(
-                    folder=str(self.
+                    folder=str(self.workflow_dir_remote / subfolder_name),
                     user=self.slurm_user,
                     startswith=prefix,
                 )
             else:
                 files_to_copy = _glob_as_user_strict(
-                    folder=str(self.
+                    folder=str(self.workflow_dir_remote / subfolder_name),
                     user=self.slurm_user,
                     startswith=prefix,
                 )

             logger.debug(
-                "[
+                "[_copy_files_from_remote_to_local] "
                 f"{prefix=}, {len(files_to_copy)=}"
             )
@@ -972,7 +1058,9 @@ class FractalSlurmExecutor(SlurmExecutor):
                     "contains whitespaces"
                 )
             source_file_path = str(
-                self.
+                self.workflow_dir_remote
+                / subfolder_name
+                / source_file_name
             )

             # Read source_file_path (requires sudo)
@@ -991,10 +1079,12 @@ class FractalSlurmExecutor(SlurmExecutor):
                 logger.error(info)
                 raise JobExecutionError(info)
             # Write to dest_file_path (including empty files)
-            dest_file_path = str(
+            dest_file_path = str(
+                self.workflow_dir_local / subfolder_name / source_file_name
+            )
             with open(dest_file_path, "wb") as f:
                 f.write(res.stdout)
-        logger.debug("[
+        logger.debug("[_copy_files_from_remote_to_local] End")

     def _start(
         self,
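Each file is read with `sudo -u` impersonation and its bytes are rewritten server-side, as the docstring above describes. A simplified sketch of that read-then-write pattern (the helper name and error handling are illustrative, not the library's API):

```python
import shlex
import subprocess
from pathlib import Path

def copy_file_as_user(source: Path, dest: Path, user: str) -> None:
    # Read `source` while impersonating `user`, then write the bytes
    # (including empty files) to the server-side destination.
    cmd = f"sudo --non-interactive -u {user} cat {source}"
    res = subprocess.run(shlex.split(cmd), capture_output=True)
    if res.returncode != 0:
        raise RuntimeError(
            f"Reading {source} as {user} failed: {res.stderr.decode()!r}"
        )
    dest.write_bytes(res.stdout)
```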
@@ -1099,7 +1189,7 @@ class FractalSlurmExecutor(SlurmExecutor):
             [
                 f"#SBATCH --err={slurm_err_path}",
                 f"#SBATCH --out={slurm_out_path}",
-                f"#SBATCH -D {self.
+                f"#SBATCH -D {self.workflow_dir_remote}",
             ]
         )
         script_lines = slurm_config.sort_script_lines(script_lines)
@@ -1131,12 +1221,11 @@ class FractalSlurmExecutor(SlurmExecutor):
         This will be called when self.submit or self.map are called from
         outside fractal-server, and then lack some optional arguments.
         """
-        import random
-
         task_files = TaskFiles(
-
-
-            task_order=
+            workflow_dir_local=self.workflow_dir_local,
+            workflow_dir_remote=self.workflow_dir_remote,
+            task_order=None,
+            task_name="name",
         )
         return task_files
@@ -1154,7 +1243,7 @@ class FractalSlurmExecutor(SlurmExecutor):
             while self.jobs:
                 jobid, fut_and_job = self.jobs.popitem()
                 slurm_jobs_to_scancel.append(jobid)
-                fut
+                fut = fut_and_job[0]
                 self.map_jobid_to_slurm_files.pop(jobid)
                 if not fut.cancelled():
                     fut.set_exception(
@@ -1188,3 +1277,15 @@ class FractalSlurmExecutor(SlurmExecutor):
             raise JobExecutionError(info=error_msg)

         logger.debug("Executor shutdown: end")
+
+    def __exit__(self, *args, **kwargs):
+        """
+        See
+        https://github.com/fractal-analytics-platform/fractal-server/issues/1508
+        """
+        logger.debug(
+            "[FractalSlurmExecutor.__exit__] Stop and join `wait_thread`"
+        )
+        self.wait_thread.stop()
+        self.wait_thread.join()
+        logger.debug("[FractalSlurmExecutor.__exit__] End")
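With `__exit__` stopping and joining the wait thread, the executor now cleans up its background thread even when the body raises, provided it is used as a context manager. Illustrative usage (constructor arguments are placeholders; actually running this requires a SLURM cluster and the sudo setup fractal-server expects):

```python
from pathlib import Path

from fractal_server.app.runner.executors.slurm.executor import (
    FractalSlurmExecutor,
)

with FractalSlurmExecutor(
    slurm_user="fractal",
    workflow_dir_local=Path("/tmp/fractal/job_42"),
    workflow_dir_remote=Path("/home/fractal/job_42"),
) as executor:
    future = executor.submit(sum, [1, 2, 3])
# On exit, wait_thread.stop() and wait_thread.join() run, even on errors.
```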
fractal_server/app/runner/shutdown.py (new file):

@@ -0,0 +1,88 @@
+import time
+
+from sqlmodel import select
+
+from fractal_server.app.db import get_async_db
+from fractal_server.app.models.v1 import ApplyWorkflow
+from fractal_server.app.models.v1.job import JobStatusTypeV1
+from fractal_server.app.models.v2 import JobV2
+from fractal_server.app.models.v2.job import JobStatusTypeV2
+from fractal_server.app.routes.aux._job import _write_shutdown_file
+from fractal_server.config import get_settings
+from fractal_server.logger import get_logger
+from fractal_server.syringe import Inject
+
+
+async def cleanup_after_shutdown(
+    *, jobsV1: list[int], jobsV2: list[int], logger_name: str
+):
+    logger = get_logger(logger_name)
+    logger.info("Cleanup function after shutdown")
+    stm_v2 = (
+        select(JobV2)
+        .where(JobV2.id.in_(jobsV2))
+        .where(JobV2.status == JobStatusTypeV2.SUBMITTED)
+    )
+
+    stm_v1 = (
+        select(ApplyWorkflow)
+        .where(ApplyWorkflow.id.in_(jobsV1))
+        .where(ApplyWorkflow.status == JobStatusTypeV1.SUBMITTED)
+    )
+
+    async for session in get_async_db():
+        jobsV2_db = (await session.execute(stm_v2)).scalars().all()
+        jobsV1_db = (await session.execute(stm_v1)).scalars().all()
+
+        for job in jobsV2_db:
+            _write_shutdown_file(job=job)
+
+        for job in jobsV1_db:
+            _write_shutdown_file(job=job)
+
+        settings = Inject(get_settings)
+
+        t_start = time.perf_counter()
+        while (
+            time.perf_counter() - t_start
+        ) < settings.FRACTAL_GRACEFUL_SHUTDOWN_TIME:  # 30 seconds
+            logger.info("Waiting 3 seconds before checking")
+            time.sleep(3)
+            jobsV2_db = (await session.execute(stm_v2)).scalars().all()
+            jobsV1_db = (await session.execute(stm_v1)).scalars().all()
+
+            if len(jobsV2_db) == 0 and len(jobsV1_db) == 0:
+                logger.info(
+                    (
+                        "All jobs associated to this app are "
+                        "either done or failed. Exit."
+                    )
+                )
+                return
+            else:
+                logger.info(
+                    (
+                        f"Some jobs are still 'submitted' "
+                        f"{jobsV1_db=}, {jobsV2_db=}"
+                    )
+                )
+        logger.info(
+            (
+                "Graceful shutdown reached its maximum time, "
+                "but some jobs are still submitted"
+            )
+        )
+
+        for job in jobsV2_db:
+            job.status = "failed"
+            job.log = (job.log or "") + "\nJob stopped due to app shutdown\n"
+            session.add(job)
+        await session.commit()
+
+        for job in jobsV1_db:
+            job.status = "failed"
+            job.log = (job.log or "") + "\nJob stopped due to app shutdown\n"
+            session.add(job)
+        await session.commit()
+
+        logger.info("Exit from shutdown logic")
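Nothing in this hunk shows how `cleanup_after_shutdown` is invoked; the related wiring lives in `main.py` and `gunicorn_fractal.py` (see the file list above). A hypothetical FastAPI shutdown hook might call it like this:

```python
# Hypothetical wiring, not taken from this diff.
from fastapi import FastAPI

from fractal_server.app.runner.shutdown import cleanup_after_shutdown

app = FastAPI()

@app.on_event("shutdown")
async def on_shutdown():
    # Job ids would be collected elsewhere; empty lists are placeholders.
    await cleanup_after_shutdown(
        jobsV1=[], jobsV2=[], logger_name="fractal_server"
    )
```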