fractal-server 2.16.6__py3-none-any.whl → 2.17.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fractal_server/__init__.py +1 -1
- fractal_server/__main__.py +178 -52
- fractal_server/app/db/__init__.py +9 -11
- fractal_server/app/models/security.py +30 -22
- fractal_server/app/models/user_settings.py +5 -4
- fractal_server/app/models/v2/__init__.py +4 -0
- fractal_server/app/models/v2/profile.py +16 -0
- fractal_server/app/models/v2/project.py +5 -0
- fractal_server/app/models/v2/resource.py +130 -0
- fractal_server/app/models/v2/task_group.py +4 -0
- fractal_server/app/routes/admin/v2/__init__.py +4 -0
- fractal_server/app/routes/admin/v2/_aux_functions.py +55 -0
- fractal_server/app/routes/admin/v2/accounting.py +3 -3
- fractal_server/app/routes/admin/v2/impersonate.py +2 -2
- fractal_server/app/routes/admin/v2/job.py +51 -15
- fractal_server/app/routes/admin/v2/profile.py +100 -0
- fractal_server/app/routes/admin/v2/project.py +2 -2
- fractal_server/app/routes/admin/v2/resource.py +222 -0
- fractal_server/app/routes/admin/v2/task.py +59 -32
- fractal_server/app/routes/admin/v2/task_group.py +17 -12
- fractal_server/app/routes/admin/v2/task_group_lifecycle.py +52 -86
- fractal_server/app/routes/api/__init__.py +45 -8
- fractal_server/app/routes/api/v2/_aux_functions.py +17 -1
- fractal_server/app/routes/api/v2/_aux_functions_history.py +2 -2
- fractal_server/app/routes/api/v2/_aux_functions_task_lifecycle.py +3 -3
- fractal_server/app/routes/api/v2/_aux_functions_tasks.py +55 -19
- fractal_server/app/routes/api/v2/_aux_task_group_disambiguation.py +21 -17
- fractal_server/app/routes/api/v2/dataset.py +10 -19
- fractal_server/app/routes/api/v2/history.py +8 -8
- fractal_server/app/routes/api/v2/images.py +5 -5
- fractal_server/app/routes/api/v2/job.py +8 -8
- fractal_server/app/routes/api/v2/pre_submission_checks.py +3 -3
- fractal_server/app/routes/api/v2/project.py +15 -7
- fractal_server/app/routes/api/v2/status_legacy.py +2 -2
- fractal_server/app/routes/api/v2/submit.py +49 -42
- fractal_server/app/routes/api/v2/task.py +26 -8
- fractal_server/app/routes/api/v2/task_collection.py +39 -50
- fractal_server/app/routes/api/v2/task_collection_custom.py +10 -6
- fractal_server/app/routes/api/v2/task_collection_pixi.py +34 -42
- fractal_server/app/routes/api/v2/task_group.py +19 -9
- fractal_server/app/routes/api/v2/task_group_lifecycle.py +43 -86
- fractal_server/app/routes/api/v2/task_version_update.py +3 -3
- fractal_server/app/routes/api/v2/workflow.py +9 -9
- fractal_server/app/routes/api/v2/workflow_import.py +25 -13
- fractal_server/app/routes/api/v2/workflowtask.py +5 -5
- fractal_server/app/routes/auth/__init__.py +34 -5
- fractal_server/app/routes/auth/_aux_auth.py +39 -20
- fractal_server/app/routes/auth/current_user.py +56 -67
- fractal_server/app/routes/auth/group.py +29 -46
- fractal_server/app/routes/auth/oauth.py +55 -38
- fractal_server/app/routes/auth/register.py +2 -2
- fractal_server/app/routes/auth/router.py +4 -2
- fractal_server/app/routes/auth/users.py +29 -53
- fractal_server/app/routes/aux/_runner.py +2 -1
- fractal_server/app/routes/aux/validate_user_profile.py +62 -0
- fractal_server/app/schemas/__init__.py +0 -1
- fractal_server/app/schemas/user.py +43 -13
- fractal_server/app/schemas/user_group.py +2 -1
- fractal_server/app/schemas/v2/__init__.py +12 -0
- fractal_server/app/schemas/v2/profile.py +78 -0
- fractal_server/app/schemas/v2/resource.py +137 -0
- fractal_server/app/schemas/v2/task_collection.py +11 -3
- fractal_server/app/schemas/v2/task_group.py +5 -0
- fractal_server/app/security/__init__.py +174 -75
- fractal_server/app/security/signup_email.py +52 -34
- fractal_server/config/__init__.py +27 -0
- fractal_server/config/_data.py +68 -0
- fractal_server/config/_database.py +59 -0
- fractal_server/config/_email.py +133 -0
- fractal_server/config/_main.py +78 -0
- fractal_server/config/_oauth.py +69 -0
- fractal_server/config/_settings_config.py +7 -0
- fractal_server/data_migrations/2_17_0.py +339 -0
- fractal_server/images/tools.py +3 -3
- fractal_server/logger.py +3 -3
- fractal_server/main.py +17 -23
- fractal_server/migrations/naming_convention.py +1 -1
- fractal_server/migrations/versions/83bc2ad3ffcc_2_17_0.py +195 -0
- fractal_server/runner/config/__init__.py +2 -0
- fractal_server/runner/config/_local.py +21 -0
- fractal_server/runner/config/_slurm.py +129 -0
- fractal_server/runner/config/slurm_mem_to_MB.py +63 -0
- fractal_server/runner/exceptions.py +4 -0
- fractal_server/runner/executors/base_runner.py +17 -7
- fractal_server/runner/executors/local/get_local_config.py +21 -86
- fractal_server/runner/executors/local/runner.py +48 -5
- fractal_server/runner/executors/slurm_common/_batching.py +2 -2
- fractal_server/runner/executors/slurm_common/base_slurm_runner.py +60 -26
- fractal_server/runner/executors/slurm_common/get_slurm_config.py +39 -55
- fractal_server/runner/executors/slurm_common/remote.py +1 -1
- fractal_server/runner/executors/slurm_common/slurm_config.py +214 -0
- fractal_server/runner/executors/slurm_common/slurm_job_task_models.py +1 -1
- fractal_server/runner/executors/slurm_ssh/runner.py +12 -14
- fractal_server/runner/executors/slurm_sudo/_subprocess_run_as_user.py +2 -2
- fractal_server/runner/executors/slurm_sudo/runner.py +12 -12
- fractal_server/runner/v2/_local.py +36 -21
- fractal_server/runner/v2/_slurm_ssh.py +41 -4
- fractal_server/runner/v2/_slurm_sudo.py +42 -12
- fractal_server/runner/v2/db_tools.py +1 -1
- fractal_server/runner/v2/runner.py +3 -11
- fractal_server/runner/v2/runner_functions.py +42 -28
- fractal_server/runner/v2/submit_workflow.py +88 -109
- fractal_server/runner/versions.py +8 -3
- fractal_server/ssh/_fabric.py +6 -6
- fractal_server/tasks/config/__init__.py +3 -0
- fractal_server/tasks/config/_pixi.py +127 -0
- fractal_server/tasks/config/_python.py +51 -0
- fractal_server/tasks/v2/local/_utils.py +7 -7
- fractal_server/tasks/v2/local/collect.py +13 -5
- fractal_server/tasks/v2/local/collect_pixi.py +26 -10
- fractal_server/tasks/v2/local/deactivate.py +7 -1
- fractal_server/tasks/v2/local/deactivate_pixi.py +5 -1
- fractal_server/tasks/v2/local/delete.py +5 -1
- fractal_server/tasks/v2/local/reactivate.py +13 -5
- fractal_server/tasks/v2/local/reactivate_pixi.py +27 -9
- fractal_server/tasks/v2/ssh/_pixi_slurm_ssh.py +11 -10
- fractal_server/tasks/v2/ssh/_utils.py +6 -7
- fractal_server/tasks/v2/ssh/collect.py +19 -12
- fractal_server/tasks/v2/ssh/collect_pixi.py +34 -16
- fractal_server/tasks/v2/ssh/deactivate.py +12 -8
- fractal_server/tasks/v2/ssh/deactivate_pixi.py +14 -10
- fractal_server/tasks/v2/ssh/delete.py +12 -9
- fractal_server/tasks/v2/ssh/reactivate.py +18 -12
- fractal_server/tasks/v2/ssh/reactivate_pixi.py +36 -17
- fractal_server/tasks/v2/templates/4_pip_show.sh +4 -6
- fractal_server/tasks/v2/utils_database.py +2 -2
- fractal_server/tasks/v2/utils_pixi.py +3 -0
- fractal_server/tasks/v2/utils_python_interpreter.py +8 -16
- fractal_server/tasks/v2/utils_templates.py +7 -10
- fractal_server/utils.py +1 -1
- {fractal_server-2.16.6.dist-info → fractal_server-2.17.0.dist-info}/METADATA +4 -6
- {fractal_server-2.16.6.dist-info → fractal_server-2.17.0.dist-info}/RECORD +136 -117
- fractal_server/app/routes/aux/validate_user_settings.py +0 -73
- fractal_server/app/schemas/user_settings.py +0 -67
- fractal_server/app/user_settings.py +0 -42
- fractal_server/config.py +0 -906
- fractal_server/data_migrations/2_14_10.py +0 -48
- fractal_server/runner/executors/slurm_common/_slurm_config.py +0 -471
- /fractal_server/{runner → app}/shutdown.py +0 -0
- {fractal_server-2.16.6.dist-info → fractal_server-2.17.0.dist-info}/WHEEL +0 -0
- {fractal_server-2.16.6.dist-info → fractal_server-2.17.0.dist-info}/entry_points.txt +0 -0
- {fractal_server-2.16.6.dist-info → fractal_server-2.17.0.dist-info}/licenses/LICENSE +0 -0
fractal_server/runner/executors/slurm_common/base_slurm_runner.py

```diff
@@ -9,17 +9,17 @@ from typing import Literal
 from pydantic import BaseModel
 from pydantic import ConfigDict
 
-from ..slurm_common._slurm_config import SlurmConfig
 from ..slurm_common.slurm_job_task_models import SlurmJob
 from ..slurm_common.slurm_job_task_models import SlurmTask
 from ._job_states import STATES_FINISHED
+from .slurm_config import SlurmConfig
 from fractal_server import __VERSION__
 from fractal_server.app.db import get_sync_db
 from fractal_server.app.models.v2 import AccountingRecordSlurm
 from fractal_server.app.schemas.v2 import HistoryUnitStatus
 from fractal_server.app.schemas.v2 import TaskType
-from fractal_server.config import get_settings
 from fractal_server.logger import set_logger
+from fractal_server.runner.config import JobRunnerConfigSLURM
 from fractal_server.runner.exceptions import JobExecutionError
 from fractal_server.runner.exceptions import TaskExecutionError
 from fractal_server.runner.executors.base_runner import BaseRunner
@@ -31,7 +31,6 @@ from fractal_server.runner.v2.db_tools import (
     bulk_update_status_of_history_unit,
 )
 from fractal_server.runner.v2.db_tools import update_status_of_history_unit
-from fractal_server.syringe import Inject
 
 SHUTDOWN_ERROR_MESSAGE = "Failed due to job-execution shutdown."
 SHUTDOWN_EXCEPTION = JobExecutionError(SHUTDOWN_ERROR_MESSAGE)
@@ -77,16 +76,18 @@ class BaseSlurmRunner(BaseRunner):
     python_worker_interpreter: str
     slurm_runner_type: Literal["ssh", "sudo"]
     slurm_account: str | None = None
+    shared_config: JobRunnerConfigSLURM
 
     def __init__(
         self,
+        *,
         root_dir_local: Path,
         root_dir_remote: Path,
         slurm_runner_type: Literal["ssh", "sudo"],
         python_worker_interpreter: str,
+        poll_interval: int,
         common_script_lines: list[str] | None = None,
-        user_cache_dir: str
-        poll_interval: int | None = None,
+        user_cache_dir: str,
         slurm_account: str | None = None,
     ):
         self.slurm_runner_type = slurm_runner_type
@@ -98,11 +99,7 @@ class BaseSlurmRunner(BaseRunner):
         self.python_worker_interpreter = python_worker_interpreter
         self.slurm_account = slurm_account
 
-
-
-        self.poll_interval = (
-            poll_interval or settings.FRACTAL_SLURM_POLL_INTERVAL
-        )
+        self.poll_interval = poll_interval
         self.poll_interval_internal = self.poll_interval / 10.0
 
         self.check_fractal_server_versions()
@@ -134,12 +131,10 @@ class BaseSlurmRunner(BaseRunner):
     def _run_remote_cmd(self, cmd: str) -> str:
         raise NotImplementedError("Implement in child class.")
 
-    def run_squeue(self, *, job_ids: list[str]
+    def run_squeue(self, *, job_ids: list[str]) -> str:
         raise NotImplementedError("Implement in child class.")
 
-    def _is_squeue_error_recoverable(
-        self, exception: BaseException
-    ) -> Literal[True]:
+    def _is_squeue_error_recoverable(self, exception: BaseException) -> bool:
         """
         Determine whether a `squeue` error is considered recoverable.
 
@@ -245,7 +240,7 @@ class BaseSlurmRunner(BaseRunner):
             A new, up-to-date, `SlurmConfig` object.
         """
 
-        new_slurm_config = slurm_config.model_copy()
+        new_slurm_config = slurm_config.model_copy(deep=True)
 
         # Include SLURM account in `slurm_config`.
         if self.slurm_account is not None:
@@ -257,7 +252,7 @@ class BaseSlurmRunner(BaseRunner):
                 f"Add {self.common_script_lines} to "
                 f"{new_slurm_config.extra_lines=}."
             )
-            current_extra_lines = new_slurm_config.extra_lines
+            current_extra_lines = new_slurm_config.extra_lines
             new_slurm_config.extra_lines = (
                 current_extra_lines + self.common_script_lines
             )
@@ -473,7 +468,7 @@ class BaseSlurmRunner(BaseRunner):
         *,
         task: SlurmTask,
         was_job_scancelled: bool = False,
-    ) -> tuple[Any, Exception]:
+    ) -> tuple[Any, Exception | None]:
         try:
             with open(task.output_file_local) as f:
                 output = json.load(f)
@@ -566,6 +561,10 @@ class BaseSlurmRunner(BaseRunner):
     def job_ids(self) -> list[str]:
         return list(self.jobs.keys())
 
+    @property
+    def job_ids_int(self) -> list[int]:
+        return list(map(int, self.jobs.keys()))
+
     def wait_and_check_shutdown(self) -> list[str]:
         """
         Wait at most `self.poll_interval`, while also checking for shutdown.
@@ -602,6 +601,7 @@ class BaseSlurmRunner(BaseRunner):
 
     def submit(
         self,
+        *,
         base_command: str,
         workflow_task_order: int,
         workflow_task_id: int,
@@ -612,7 +612,23 @@ class BaseSlurmRunner(BaseRunner):
         config: SlurmConfig,
         task_type: SubmitTaskType,
         user_id: int,
-    ) -> tuple[Any, Exception]:
+    ) -> tuple[Any, Exception | None]:
+        """
+        Run a single fractal task.
+
+        Args:
+            base_command:
+            workflow_task_order:
+            workflow_task_id:
+            task_name:
+            parameters: Dictionary of parameters.
+            history_unit_id:
+                Database ID of the corresponding `HistoryUnit` entry.
+            task_type: Task type.
+            task_files: `TaskFiles` object.
+            config: Runner-specific parameters.
+            user_id:
+        """
         logger.debug("[submit] START")
 
         # Always refresh `executor_error_log` before starting a task
@@ -687,7 +703,7 @@ class BaseSlurmRunner(BaseRunner):
 
             create_accounting_record_slurm(
                 user_id=user_id,
-                slurm_job_ids=self.
+                slurm_job_ids=self.job_ids_int,
             )
 
             # Retrieval phase
@@ -757,11 +773,12 @@ class BaseSlurmRunner(BaseRunner):
 
     def multisubmit(
         self,
+        *,
         base_command: str,
         workflow_task_order: int,
         workflow_task_id: int,
         task_name: str,
-        list_parameters: list[dict],
+        list_parameters: list[dict[str, Any]],
         history_unit_ids: list[int],
         list_task_files: list[TaskFiles],
         task_type: MultisubmitTaskType,
@@ -769,9 +786,26 @@ class BaseSlurmRunner(BaseRunner):
         user_id: int,
     ) -> tuple[dict[int, Any], dict[int, BaseException]]:
         """
+        Run a parallel fractal task.
+
         Note: `list_parameters`, `list_task_files` and `history_unit_ids`
         have the same size. For parallel tasks, this is also the number of
         input images, while for compound tasks these can differ.
+
+        Args:
+            base_command:
+            workflow_task_order:
+            workflow_task_id:
+            task_name:
+            list_parameters:
+                List of dictionaries of parameters (each one must include
+                `zarr_urls` key).
+            history_unit_ids:
+                Database IDs of the corresponding `HistoryUnit` entries.
+            list_task_files: `TaskFiles` objects.
+            task_type: Task type.
+            config: Runner-specific parameters.
+            user_id:
         """
 
         # Always refresh `executor_error_log` before starting a task
@@ -779,6 +813,9 @@ class BaseSlurmRunner(BaseRunner):
 
         config = self._enrich_slurm_config(config)
 
+        results: dict[int, Any] = {}
+        exceptions: dict[int, BaseException] = {}
+
         logger.debug(f"[multisubmit] START, {len(list_parameters)=}")
         try:
             if self.is_shutdown():
@@ -789,8 +826,8 @@ class BaseSlurmRunner(BaseRunner):
                         status=HistoryUnitStatus.FAILED,
                         db_sync=db,
                     )
-                results
-                exceptions
+                results = {}
+                exceptions = {
                     ind: SHUTDOWN_EXCEPTION
                     for ind in range(len(list_parameters))
                 }
@@ -812,9 +849,6 @@ class BaseSlurmRunner(BaseRunner):
         self._mkdir_local_folder(workdir_local.as_posix())
         self._mkdir_remote_folder(folder=workdir_remote.as_posix())
 
-        results: dict[int, Any] = {}
-        exceptions: dict[int, BaseException] = {}
-
         # NOTE: chunking has already taken place in `get_slurm_config`,
         # so that `config.tasks_per_job` is now set.
 
@@ -889,7 +923,7 @@ class BaseSlurmRunner(BaseRunner):
 
             create_accounting_record_slurm(
                 user_id=user_id,
-                slurm_job_ids=self.
+                slurm_job_ids=self.job_ids_int,
             )
 
         except Exception as e:
```
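The constructor hunks above make every `BaseSlurmRunner.__init__` argument keyword-only and turn `poll_interval` and `user_cache_dir` into required parameters, dropping the previous in-constructor fallback to the `FRACTAL_SLURM_POLL_INTERVAL` setting. A minimal toy sketch of the new calling convention follows; it is not the real class (which also performs extra setup such as `check_fractal_server_versions()`), and all values are hypothetical:

```python
from pathlib import Path
from typing import Literal


class ToyRunner:
    """Toy stand-in mirroring the keyword-only 2.17.0 signature."""

    def __init__(
        self,
        *,
        root_dir_local: Path,
        root_dir_remote: Path,
        slurm_runner_type: Literal["ssh", "sudo"],
        python_worker_interpreter: str,
        poll_interval: int,
        common_script_lines: list[str] | None = None,
        user_cache_dir: str,
        slurm_account: str | None = None,
    ):
        # `poll_interval` is now injected by the caller instead of being
        # read from global settings inside the constructor.
        self.poll_interval = poll_interval
        self.poll_interval_internal = poll_interval / 10.0


# All arguments must be passed by keyword; positional calls raise TypeError.
runner = ToyRunner(
    root_dir_local=Path("/tmp/job/local"),
    root_dir_remote=Path("/tmp/job/remote"),
    slurm_runner_type="sudo",
    python_worker_interpreter="/usr/bin/python3",
    poll_interval=5,
    user_cache_dir="/tmp/cache",
)
```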
fractal_server/runner/executors/slurm_common/get_slurm_config.py

```diff
@@ -1,50 +1,42 @@
-from pathlib import Path
 from typing import Literal
 
 from ._batching import heuristics
-from .
-from .
-from ._slurm_config import logger
-from ._slurm_config import SlurmConfig
-from ._slurm_config import SlurmConfigError
+from .slurm_config import logger
+from .slurm_config import SlurmConfig
 from fractal_server.app.models.v2 import WorkflowTaskV2
+from fractal_server.runner.config import JobRunnerConfigSLURM
+from fractal_server.runner.config.slurm_mem_to_MB import slurm_mem_to_MB
+from fractal_server.runner.exceptions import SlurmConfigError
 from fractal_server.string_tools import interpret_as_bool
 
 
-def
+def _get_slurm_config_internal(
+    shared_config: JobRunnerConfigSLURM,
     wftask: WorkflowTaskV2,
     which_type: Literal["non_parallel", "parallel"],
-    config_path: Path | None = None,
 ) -> SlurmConfig:
     """
-    Prepare a `SlurmConfig` configuration object
 
-
-    or `wftask.meta_non_parallel`. In the following description, let us assume
-    that `which_type="parallel"`.
+    Prepare a specific `SlurmConfig` configuration.
 
-    The
+    The base configuration is the runner-level `shared_config` object, based
+    on `resource.jobs_runner_config` (note that GPU-specific properties take
+    priority, when `needs_gpu=True`). We then incorporate attributes from
+    `wftask.meta_{non_parallel,parallel}` - with higher priority.
 
-
-
-
-    3. Properties in `wftask.meta_parallel` (which typically include those in
-    `wftask.task.meta_parallel`). Note that `wftask.meta_parallel` may be
-    `None`.
-
-    Arguments:
+    Args:
+        shared_config:
+            Configuration object based on `resource.jobs_runner_config`.
         wftask:
-
-            prepared.
-        config_path:
-            Path of a Fractal SLURM configuration file; if `None`, use
-            `FRACTAL_SLURM_CONFIG_FILE` variable from settings.
+            WorkflowTaskV2 for which the backend configuration should
+            be prepared.
         which_type:
-
+            Whether we should look at the non-parallel or parallel part
+            of `wftask`.
+        tot_tasks: Not used here, only present as a common interface.
 
     Returns:
-
-        The SlurmConfig object
+        A ready-to-use `SlurmConfig` object.
     """
 
     if which_type == "non_parallel":
@@ -60,25 +52,19 @@ def get_slurm_config_internal(
         f"[get_slurm_config] WorkflowTask meta attribute: {wftask_meta=}"
     )
 
-    #
-
-    slurm_dict = slurm_env.default_slurm_config.model_dump(
+    # Start from `shared_config`
+    slurm_dict = shared_config.default_slurm_config.model_dump(
         exclude_unset=True, exclude={"mem"}
     )
-    if
-        slurm_dict["mem_per_task_MB"] =
+    if shared_config.default_slurm_config.mem:
+        slurm_dict["mem_per_task_MB"] = shared_config.default_slurm_config.mem
 
     # Incorporate slurm_env.batching_config
-    for key, value in
+    for key, value in shared_config.batching_config.model_dump().items():
        slurm_dict[key] = value
 
     # Incorporate slurm_env.user_local_exports
-    slurm_dict["user_local_exports"] =
-
-    logger.debug(
-        "[get_slurm_config] Fractal SLURM configuration file: "
-        f"{slurm_env.model_dump()=}"
-    )
+    slurm_dict["user_local_exports"] = shared_config.user_local_exports
 
     # GPU-related options
     # Notes about priority:
@@ -91,13 +77,13 @@ def get_slurm_config_internal(
     else:
         needs_gpu = False
     logger.debug(f"[get_slurm_config] {needs_gpu=}")
-    if needs_gpu:
-        for key, value in
+    if needs_gpu and shared_config.gpu_slurm_config is not None:
+        for key, value in shared_config.gpu_slurm_config.model_dump(
             exclude_unset=True, exclude={"mem"}
         ).items():
             slurm_dict[key] = value
-        if
-            slurm_dict["mem_per_task_MB"] =
+        if shared_config.gpu_slurm_config.mem:
+            slurm_dict["mem_per_task_MB"] = shared_config.gpu_slurm_config.mem
 
     # Number of CPUs per task, for multithreading
     if wftask_meta is not None and "cpus_per_task" in wftask_meta:
@@ -107,7 +93,7 @@ def get_slurm_config_internal(
     # Required memory per task, in MB
     if wftask_meta is not None and "mem" in wftask_meta:
         raw_mem = wftask_meta["mem"]
-        mem_per_task_MB =
+        mem_per_task_MB = slurm_mem_to_MB(raw_mem)
         slurm_dict["mem_per_task_MB"] = mem_per_task_MB
 
     # Job name
```
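In the hunk above, parsing of `wftask_meta["mem"]` now goes through the new `fractal_server.runner.config.slurm_mem_to_MB` helper (a new module in this release, see the file list). The sketch below only illustrates the kind of conversion such a helper typically performs, assuming plain integers are already MB and that `M`/`G`/`T` suffixes follow SLURM conventions; it is not the actual implementation:

```python
import re


def mem_to_MB(raw_mem: int | str) -> int:
    """Convert a SLURM-style memory value (e.g. 64, "64M", "16G") into MB."""
    if isinstance(raw_mem, int):
        return raw_mem
    match = re.fullmatch(r"(\d+)([MGT]?)", raw_mem)
    if match is None:
        raise ValueError(f"Invalid memory value: {raw_mem!r}")
    value, unit = int(match.group(1)), match.group(2)
    factor = {"": 1, "M": 1, "G": 1024, "T": 1024**2}[unit]
    return value * factor


assert mem_to_MB("16G") == 16 * 1024
```

The remaining hunks of `get_slurm_config.py` follow.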
```diff
@@ -144,8 +130,7 @@ def get_slurm_config_internal(
     extra_lines = slurm_dict.get("extra_lines", []) + extra_lines
     if len(set(extra_lines)) != len(extra_lines):
         logger.debug(
-            "[get_slurm_config] Removing repeated elements
-            f"{extra_lines=}."
+            f"[get_slurm_config] Removing repeated elements in {extra_lines=}."
         )
         extra_lines = list(set(extra_lines))
     slurm_dict["extra_lines"] = extra_lines
@@ -164,8 +149,7 @@ def get_slurm_config_internal(
 
     # Put everything together
     logger.debug(
-        "[get_slurm_config]
-        f"{slurm_dict=}"
+        f"[get_slurm_config] Create SlurmConfig object based on {slurm_dict=}"
     )
     slurm_config = SlurmConfig(**slurm_dict)
 
@@ -173,15 +157,15 @@ def get_slurm_config_internal(
 
 
 def get_slurm_config(
+    shared_config: JobRunnerConfigSLURM,
     wftask: WorkflowTaskV2,
     which_type: Literal["non_parallel", "parallel"],
-    config_path: Path | None = None,
     tot_tasks: int = 1,
 ) -> SlurmConfig:
-    config =
-
-
-
+    config = _get_slurm_config_internal(
+        shared_config=shared_config,
+        wftask=wftask,
+        which_type=which_type,
     )
 
     # Set/validate parameters for task batching
```
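The hunks above rework `get_slurm_config` so that the per-task configuration is layered from three sources, in increasing priority: the runner-level `shared_config` (built from `resource.jobs_runner_config`), its GPU-specific section when `needs_gpu=True`, and finally `wftask.meta_{non_parallel,parallel}`. A toy sketch of that priority order, using plain dictionaries instead of the real `JobRunnerConfigSLURM`/`WorkflowTaskV2` objects (all values below are hypothetical):

```python
# Hypothetical values; only the merge order mirrors the diff above.
default_slurm_config = {"partition": "cpu", "cpus_per_task": 1, "mem_per_task_MB": 4000}
gpu_slurm_config = {"partition": "gpu", "gres": "gpu:1", "mem_per_task_MB": 16000}
wftask_meta = {"needs_gpu": True, "cpus_per_task": 8, "mem": 32000}

slurm_dict = dict(default_slurm_config)            # 1. shared defaults
if wftask_meta.get("needs_gpu") and gpu_slurm_config is not None:
    slurm_dict.update(gpu_slurm_config)            # 2. GPU section wins over defaults
if "cpus_per_task" in wftask_meta:                 # 3. wftask.meta_* has top priority
    slurm_dict["cpus_per_task"] = wftask_meta["cpus_per_task"]
if "mem" in wftask_meta:
    slurm_dict["mem_per_task_MB"] = wftask_meta["mem"]

print(slurm_dict)
# {'partition': 'gpu', 'cpus_per_task': 8, 'mem_per_task_MB': 32000, 'gres': 'gpu:1'}
```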
fractal_server/runner/executors/slurm_common/slurm_config.py (new file)

```diff
@@ -0,0 +1,214 @@
+"""
+Submodule to handle the SLURM configuration for a WorkflowTask
+"""
+from pathlib import Path
+
+from pydantic import BaseModel
+from pydantic import ConfigDict
+from pydantic import Field
+
+from fractal_server.logger import set_logger
+
+logger = set_logger(__name__)
+
+
+class SlurmConfig(BaseModel):
+    """
+    Abstraction for SLURM parameters
+
+    **NOTE**: `SlurmConfig` objects are created internally in `fractal-server`,
+    and they are not meant to be initialized by the user; the same holds for
+    `SlurmConfig` attributes (e.g. `mem_per_task_MB`), which are not meant to
+    be part of the superuser-defined `resource.jobs_runner_config` JSON field.
+
+    Part of the attributes map directly to some of the SLURM attributes (see
+    https://slurm.schedmd.com/sbatch.html), e.g. `partition`. Other attributes
+    are metaparameters which are needed in fractal-server to combine multiple
+    tasks in the same SLURM job (e.g. `parallel_tasks_per_job` or
+    `max_num_jobs`).
+
+    Attributes:
+        partition: Corresponds to SLURM option.
+        cpus_per_task: Corresponds to SLURM option.
+        mem_per_task_MB: Corresponds to `mem` SLURM option.
+        job_name: Corresponds to `name` SLURM option.
+        constraint: Corresponds to SLURM option.
+        gres: Corresponds to SLURM option.
+        account: Corresponds to SLURM option.
+        gpus: Corresponds to SLURM option.
+        time: Corresponds to SLURM option (WARNING: not fully supported).
+        nodelist: Corresponds to SLURM option.
+        exclude: Corresponds to SLURM option.
+        prefix: Prefix of configuration lines in SLURM submission scripts.
+        shebang_line: Shebang line for SLURM submission scripts.
+        extra_lines: Additional lines to include in SLURM submission scripts.
+        tasks_per_job: Number of tasks for each SLURM job.
+        parallel_tasks_per_job: Number of tasks to run in parallel for
+            each SLURM job.
+        target_cpus_per_job: Optimal number of CPUs to be requested in each
+            SLURM job.
+        max_cpus_per_job: Maximum number of CPUs that can be requested in each
+            SLURM job.
+        target_mem_per_job: Optimal amount of memory (in MB) to be requested in
+            each SLURM job.
+        max_mem_per_job: Maximum amount of memory (in MB) that can be requested
+            in each SLURM job.
+        target_num_jobs: Optimal number of SLURM jobs for a given WorkflowTask.
+        max_num_jobs: Maximum number of SLURM jobs for a given WorkflowTask.
+        user_local_exports:
+            Key-value pairs to be included as `export`-ed variables in SLURM
+            submission script, after prepending values with the user's cache
+            directory.
+    """
+
+    model_config = ConfigDict(extra="forbid")
+
+    # Required SLURM parameters (note that the integer attributes are those
+    # that will need to scale up with the number of parallel tasks per job)
+    partition: str
+    cpus_per_task: int
+    mem_per_task_MB: int
+    prefix: str = "#SBATCH"
+    shebang_line: str = "#!/bin/sh"
+
+    # Optional SLURM parameters
+    job_name: str | None = None
+    constraint: str | None = None
+    gres: str | None = None
+    gpus: str | None = None
+    time: str | None = None
+    account: str | None = None
+    nodelist: str | None = None
+    exclude: str | None = None
+
+    # Free-field attribute for extra lines to be added to the SLURM job
+    # preamble
+    extra_lines: list[str] = Field(default_factory=list)
+
+    # Variables that will be `export`ed in the SLURM submission script
+    user_local_exports: dict[str, str] = Field(default_factory=dict)
+
+    # Metaparameters needed to combine multiple tasks in each SLURM job
+    tasks_per_job: int | None = None
+    parallel_tasks_per_job: int | None = None
+    target_cpus_per_job: int
+    max_cpus_per_job: int
+    target_mem_per_job: int
+    max_mem_per_job: int
+    target_num_jobs: int
+    max_num_jobs: int
+
+    def _sorted_extra_lines(self) -> list[str]:
+        """
+        Return a copy of `self.extra_lines`, where lines starting with
+        `self.prefix` are listed first.
+        """
+
+        def _no_prefix(_line):
+            if _line.startswith(self.prefix):
+                return 0
+            else:
+                return 1
+
+        return sorted(self.extra_lines, key=_no_prefix)
+
+    def sort_script_lines(self, script_lines: list[str]) -> list[str]:
+        """
+        Return a copy of `script_lines`, where lines are sorted as in:
+
+        1. `self.shebang_line` (if present);
+        2. Lines starting with `self.prefix`;
+        3. Other lines.
+
+        Args:
+            script_lines:
+        """
+
+        def _sorting_function(_line):
+            if _line == self.shebang_line:
+                return 0
+            elif _line.startswith(self.prefix):
+                return 1
+            else:
+                return 2
+
+        return sorted(script_lines, key=_sorting_function)
+
+    def to_sbatch_preamble(
+        self,
+        remote_export_dir: str,
+    ) -> list[str]:
+        """
+        Compile `SlurmConfig` object into the preamble of a SLURM submission
+        script.
+
+        Args:
+            remote_export_dir:
+                Base directory for exports defined in
+                `self.user_local_exports`.
+        """
+        if self.parallel_tasks_per_job is None:
+            raise ValueError(
+                "SlurmConfig.sbatch_preamble requires that "
+                f"{self.parallel_tasks_per_job=} is not None."
+            )
+        if len(self.extra_lines) != len(set(self.extra_lines)):
+            raise ValueError(f"{self.extra_lines=} contains repetitions")
+
+        mem_per_job_MB = self.parallel_tasks_per_job * self.mem_per_task_MB
+        lines = [
+            self.shebang_line,
+            f"{self.prefix} --partition={self.partition}",
+            f"{self.prefix} --ntasks={self.parallel_tasks_per_job}",
+            f"{self.prefix} --cpus-per-task={self.cpus_per_task}",
+            f"{self.prefix} --mem={mem_per_job_MB}M",
+        ]
+        for key in [
+            "job_name",
+            "constraint",
+            "gres",
+            "gpus",
+            "time",
+            "account",
+            "exclude",
+            "nodelist",
+        ]:
+            value = getattr(self, key)
+            if value is not None:
+                # Handle the `time` parameter
+                if key == "time" and self.parallel_tasks_per_job > 1:
+                    # NOTE: see issue #1632
+                    logger.warning(
+                        f"`time` SLURM parameter is set to {self.time}, "
+                        "but this does not take into account the number of "
+                        f"SLURM tasks ({self.parallel_tasks_per_job})."
+                    )
+                option = key.replace("_", "-")
+                lines.append(f"{self.prefix} --{option}={value}")
+
+        for line in self._sorted_extra_lines():
+            lines.append(line)
+
+        if self.user_local_exports:
+            for key, value in self.user_local_exports.items():
+                tmp_value = str(Path(remote_export_dir) / value)
+                lines.append(f"export {key}={tmp_value}")
+
+        """
+        FIXME export SRUN_CPUS_PER_TASK
+        # From https://slurm.schedmd.com/sbatch.html: Beginning with 22.05,
+        # srun will not inherit the --cpus-per-task value requested by salloc
+        # or sbatch. It must be requested again with the call to srun or set
+        # with the SRUN_CPUS_PER_TASK environment variable if desired for the
+        # task(s).
+        if config.cpus_per_task:
+            #additional_setup_lines.append(
+            f"export SRUN_CPUS_PER_TASK={config.cpus_per_task}"
+            )
+        """
+
+        return lines
+
+    @property
+    def batch_size(self) -> int:
+        return self.tasks_per_job
```
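Since `SlurmConfig` now lives in this new `slurm_config.py` module, here is a short usage sketch based directly on the code above. Field values are hypothetical, and in fractal-server these objects are built internally rather than by users:

```python
from fractal_server.runner.executors.slurm_common.slurm_config import SlurmConfig

config = SlurmConfig(
    partition="main",
    cpus_per_task=2,
    mem_per_task_MB=8000,
    target_cpus_per_job=8,
    max_cpus_per_job=16,
    target_mem_per_job=32000,
    max_mem_per_job=64000,
    target_num_jobs=5,
    max_num_jobs=50,
    parallel_tasks_per_job=4,
    job_name="my-task",
)
print("\n".join(config.to_sbatch_preamble(remote_export_dir="/home/user/cache")))
# #!/bin/sh
# #SBATCH --partition=main
# #SBATCH --ntasks=4
# #SBATCH --cpus-per-task=2
# #SBATCH --mem=32000M
# #SBATCH --job-name=my-task
```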
fractal_server/runner/executors/slurm_common/slurm_job_task_models.py

```diff
@@ -112,7 +112,7 @@ class SlurmJob(BaseModel):
         return self.slurm_stderr_remote_path.as_posix()
 
     @property
-    def slurm_stdout_local_path(self) ->
+    def slurm_stdout_local_path(self) -> Path:
         return (
             self.workdir_local
             / f"{self.prefix}-slurm-{self.slurm_job_id_placeholder}.out"
```