fractal-server 2.16.5__py3-none-any.whl → 2.17.0a0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fractal_server/__init__.py +1 -1
- fractal_server/__main__.py +129 -22
- fractal_server/app/db/__init__.py +9 -11
- fractal_server/app/models/security.py +7 -3
- fractal_server/app/models/user_settings.py +0 -4
- fractal_server/app/models/v2/__init__.py +4 -0
- fractal_server/app/models/v2/job.py +3 -4
- fractal_server/app/models/v2/profile.py +16 -0
- fractal_server/app/models/v2/project.py +3 -0
- fractal_server/app/models/v2/resource.py +130 -0
- fractal_server/app/models/v2/task_group.py +3 -0
- fractal_server/app/routes/admin/v2/__init__.py +4 -0
- fractal_server/app/routes/admin/v2/_aux_functions.py +55 -0
- fractal_server/app/routes/admin/v2/profile.py +86 -0
- fractal_server/app/routes/admin/v2/resource.py +229 -0
- fractal_server/app/routes/admin/v2/task_group_lifecycle.py +48 -82
- fractal_server/app/routes/api/__init__.py +26 -7
- fractal_server/app/routes/api/v2/_aux_functions.py +27 -1
- fractal_server/app/routes/api/v2/_aux_functions_history.py +2 -2
- fractal_server/app/routes/api/v2/_aux_functions_task_lifecycle.py +3 -3
- fractal_server/app/routes/api/v2/_aux_functions_tasks.py +7 -7
- fractal_server/app/routes/api/v2/project.py +5 -1
- fractal_server/app/routes/api/v2/submit.py +32 -24
- fractal_server/app/routes/api/v2/task.py +5 -0
- fractal_server/app/routes/api/v2/task_collection.py +36 -47
- fractal_server/app/routes/api/v2/task_collection_custom.py +11 -5
- fractal_server/app/routes/api/v2/task_collection_pixi.py +34 -40
- fractal_server/app/routes/api/v2/task_group_lifecycle.py +39 -82
- fractal_server/app/routes/api/v2/workflow_import.py +4 -3
- fractal_server/app/routes/auth/_aux_auth.py +3 -3
- fractal_server/app/routes/auth/current_user.py +45 -7
- fractal_server/app/routes/auth/oauth.py +1 -1
- fractal_server/app/routes/auth/users.py +9 -0
- fractal_server/app/routes/aux/_runner.py +2 -1
- fractal_server/app/routes/aux/validate_user_profile.py +62 -0
- fractal_server/app/routes/aux/validate_user_settings.py +12 -9
- fractal_server/app/schemas/user.py +20 -13
- fractal_server/app/schemas/user_settings.py +0 -4
- fractal_server/app/schemas/v2/__init__.py +11 -0
- fractal_server/app/schemas/v2/profile.py +72 -0
- fractal_server/app/schemas/v2/resource.py +117 -0
- fractal_server/app/security/__init__.py +6 -13
- fractal_server/app/security/signup_email.py +2 -2
- fractal_server/app/user_settings.py +2 -12
- fractal_server/config/__init__.py +23 -0
- fractal_server/config/_database.py +58 -0
- fractal_server/config/_email.py +170 -0
- fractal_server/config/_init_data.py +27 -0
- fractal_server/config/_main.py +216 -0
- fractal_server/config/_settings_config.py +7 -0
- fractal_server/images/tools.py +3 -3
- fractal_server/logger.py +3 -3
- fractal_server/main.py +14 -21
- fractal_server/migrations/versions/90f6508c6379_drop_useroauth_username.py +36 -0
- fractal_server/migrations/versions/a80ac5a352bf_resource_profile.py +195 -0
- fractal_server/runner/config/__init__.py +2 -0
- fractal_server/runner/config/_local.py +21 -0
- fractal_server/runner/config/_slurm.py +128 -0
- fractal_server/runner/config/slurm_mem_to_MB.py +63 -0
- fractal_server/runner/exceptions.py +4 -0
- fractal_server/runner/executors/base_runner.py +17 -7
- fractal_server/runner/executors/local/get_local_config.py +21 -86
- fractal_server/runner/executors/local/runner.py +48 -5
- fractal_server/runner/executors/slurm_common/_batching.py +2 -2
- fractal_server/runner/executors/slurm_common/base_slurm_runner.py +59 -25
- fractal_server/runner/executors/slurm_common/get_slurm_config.py +38 -54
- fractal_server/runner/executors/slurm_common/remote.py +1 -1
- fractal_server/runner/executors/slurm_common/{_slurm_config.py → slurm_config.py} +3 -254
- fractal_server/runner/executors/slurm_common/slurm_job_task_models.py +1 -1
- fractal_server/runner/executors/slurm_ssh/runner.py +12 -14
- fractal_server/runner/executors/slurm_sudo/_subprocess_run_as_user.py +2 -2
- fractal_server/runner/executors/slurm_sudo/runner.py +12 -12
- fractal_server/runner/v2/_local.py +36 -21
- fractal_server/runner/v2/_slurm_ssh.py +40 -4
- fractal_server/runner/v2/_slurm_sudo.py +41 -11
- fractal_server/runner/v2/db_tools.py +1 -1
- fractal_server/runner/v2/runner.py +3 -11
- fractal_server/runner/v2/runner_functions.py +42 -28
- fractal_server/runner/v2/submit_workflow.py +87 -108
- fractal_server/runner/versions.py +8 -3
- fractal_server/ssh/_fabric.py +6 -6
- fractal_server/tasks/config/__init__.py +3 -0
- fractal_server/tasks/config/_pixi.py +127 -0
- fractal_server/tasks/config/_python.py +51 -0
- fractal_server/tasks/v2/local/_utils.py +7 -7
- fractal_server/tasks/v2/local/collect.py +13 -5
- fractal_server/tasks/v2/local/collect_pixi.py +26 -10
- fractal_server/tasks/v2/local/deactivate.py +7 -1
- fractal_server/tasks/v2/local/deactivate_pixi.py +5 -1
- fractal_server/tasks/v2/local/delete.py +4 -0
- fractal_server/tasks/v2/local/reactivate.py +13 -5
- fractal_server/tasks/v2/local/reactivate_pixi.py +27 -9
- fractal_server/tasks/v2/ssh/_pixi_slurm_ssh.py +11 -10
- fractal_server/tasks/v2/ssh/_utils.py +6 -7
- fractal_server/tasks/v2/ssh/collect.py +19 -12
- fractal_server/tasks/v2/ssh/collect_pixi.py +34 -16
- fractal_server/tasks/v2/ssh/deactivate.py +12 -8
- fractal_server/tasks/v2/ssh/deactivate_pixi.py +14 -10
- fractal_server/tasks/v2/ssh/delete.py +12 -9
- fractal_server/tasks/v2/ssh/reactivate.py +18 -12
- fractal_server/tasks/v2/ssh/reactivate_pixi.py +36 -17
- fractal_server/tasks/v2/templates/4_pip_show.sh +4 -6
- fractal_server/tasks/v2/utils_database.py +2 -2
- fractal_server/tasks/v2/utils_python_interpreter.py +8 -16
- fractal_server/tasks/v2/utils_templates.py +7 -10
- fractal_server/utils.py +1 -1
- {fractal_server-2.16.5.dist-info → fractal_server-2.17.0a0.dist-info}/METADATA +5 -5
- {fractal_server-2.16.5.dist-info → fractal_server-2.17.0a0.dist-info}/RECORD +112 -90
- {fractal_server-2.16.5.dist-info → fractal_server-2.17.0a0.dist-info}/WHEEL +1 -1
- fractal_server/config.py +0 -906
- /fractal_server/{runner → app}/shutdown.py +0 -0
- {fractal_server-2.16.5.dist-info → fractal_server-2.17.0a0.dist-info}/entry_points.txt +0 -0
- {fractal_server-2.16.5.dist-info → fractal_server-2.17.0a0.dist-info/licenses}/LICENSE +0 -0
|
@@ -9,17 +9,17 @@ from typing import Literal
|
|
|
9
9
|
from pydantic import BaseModel
|
|
10
10
|
from pydantic import ConfigDict
|
|
11
11
|
|
|
12
|
-
from ..slurm_common._slurm_config import SlurmConfig
|
|
13
12
|
from ..slurm_common.slurm_job_task_models import SlurmJob
|
|
14
13
|
from ..slurm_common.slurm_job_task_models import SlurmTask
|
|
15
14
|
from ._job_states import STATES_FINISHED
|
|
15
|
+
from .slurm_config import SlurmConfig
|
|
16
16
|
from fractal_server import __VERSION__
|
|
17
17
|
from fractal_server.app.db import get_sync_db
|
|
18
18
|
from fractal_server.app.models.v2 import AccountingRecordSlurm
|
|
19
19
|
from fractal_server.app.schemas.v2 import HistoryUnitStatus
|
|
20
20
|
from fractal_server.app.schemas.v2 import TaskType
|
|
21
|
-
from fractal_server.config import get_settings
|
|
22
21
|
from fractal_server.logger import set_logger
|
|
22
|
+
from fractal_server.runner.config import JobRunnerConfigSLURM
|
|
23
23
|
from fractal_server.runner.exceptions import JobExecutionError
|
|
24
24
|
from fractal_server.runner.exceptions import TaskExecutionError
|
|
25
25
|
from fractal_server.runner.executors.base_runner import BaseRunner
|
|
@@ -31,7 +31,6 @@ from fractal_server.runner.v2.db_tools import (
|
|
|
31
31
|
bulk_update_status_of_history_unit,
|
|
32
32
|
)
|
|
33
33
|
from fractal_server.runner.v2.db_tools import update_status_of_history_unit
|
|
34
|
-
from fractal_server.syringe import Inject
|
|
35
34
|
|
|
36
35
|
SHUTDOWN_ERROR_MESSAGE = "Failed due to job-execution shutdown."
|
|
37
36
|
SHUTDOWN_EXCEPTION = JobExecutionError(SHUTDOWN_ERROR_MESSAGE)
|
|
@@ -77,16 +76,18 @@ class BaseSlurmRunner(BaseRunner):
|
|
|
77
76
|
python_worker_interpreter: str
|
|
78
77
|
slurm_runner_type: Literal["ssh", "sudo"]
|
|
79
78
|
slurm_account: str | None = None
|
|
79
|
+
shared_config: JobRunnerConfigSLURM
|
|
80
80
|
|
|
81
81
|
def __init__(
|
|
82
82
|
self,
|
|
83
|
+
*,
|
|
83
84
|
root_dir_local: Path,
|
|
84
85
|
root_dir_remote: Path,
|
|
85
86
|
slurm_runner_type: Literal["ssh", "sudo"],
|
|
86
87
|
python_worker_interpreter: str,
|
|
88
|
+
poll_interval: int,
|
|
87
89
|
common_script_lines: list[str] | None = None,
|
|
88
|
-
user_cache_dir: str | None = None,
|
|
89
|
-
poll_interval: int | None = None,
|
|
90
|
+
user_cache_dir: str | None = None, # FIXME: make required?
|
|
90
91
|
slurm_account: str | None = None,
|
|
91
92
|
):
|
|
92
93
|
self.slurm_runner_type = slurm_runner_type
|
|
@@ -98,11 +99,7 @@ class BaseSlurmRunner(BaseRunner):
|
|
|
98
99
|
self.python_worker_interpreter = python_worker_interpreter
|
|
99
100
|
self.slurm_account = slurm_account
|
|
100
101
|
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
self.poll_interval = (
|
|
104
|
-
poll_interval or settings.FRACTAL_SLURM_POLL_INTERVAL
|
|
105
|
-
)
|
|
102
|
+
self.poll_interval = poll_interval
|
|
106
103
|
self.poll_interval_internal = self.poll_interval / 10.0
|
|
107
104
|
|
|
108
105
|
self.check_fractal_server_versions()
|
|
@@ -134,12 +131,10 @@ class BaseSlurmRunner(BaseRunner):
|
|
|
134
131
|
def _run_remote_cmd(self, cmd: str) -> str:
|
|
135
132
|
raise NotImplementedError("Implement in child class.")
|
|
136
133
|
|
|
137
|
-
def run_squeue(self, *, job_ids: list[str]
|
|
134
|
+
def run_squeue(self, *, job_ids: list[str]) -> str:
|
|
138
135
|
raise NotImplementedError("Implement in child class.")
|
|
139
136
|
|
|
140
|
-
def _is_squeue_error_recoverable(
|
|
141
|
-
self, exception: BaseException
|
|
142
|
-
) -> Literal[True]:
|
|
137
|
+
def _is_squeue_error_recoverable(self, exception: BaseException) -> bool:
|
|
143
138
|
"""
|
|
144
139
|
Determine whether a `squeue` error is considered recoverable.
|
|
145
140
|
|
|
@@ -245,7 +240,7 @@ class BaseSlurmRunner(BaseRunner):
|
|
|
245
240
|
A new, up-to-date, `SlurmConfig` object.
|
|
246
241
|
"""
|
|
247
242
|
|
|
248
|
-
new_slurm_config = slurm_config.model_copy()
|
|
243
|
+
new_slurm_config = slurm_config.model_copy(deep=True)
|
|
249
244
|
|
|
250
245
|
# Include SLURM account in `slurm_config`.
|
|
251
246
|
if self.slurm_account is not None:
|
|
@@ -473,7 +468,7 @@ class BaseSlurmRunner(BaseRunner):
|
|
|
473
468
|
*,
|
|
474
469
|
task: SlurmTask,
|
|
475
470
|
was_job_scancelled: bool = False,
|
|
476
|
-
) -> tuple[Any, Exception]:
|
|
471
|
+
) -> tuple[Any, Exception | None]:
|
|
477
472
|
try:
|
|
478
473
|
with open(task.output_file_local) as f:
|
|
479
474
|
output = json.load(f)
|
|
@@ -566,6 +561,10 @@ class BaseSlurmRunner(BaseRunner):
|
|
|
566
561
|
def job_ids(self) -> list[str]:
|
|
567
562
|
return list(self.jobs.keys())
|
|
568
563
|
|
|
564
|
+
@property
|
|
565
|
+
def job_ids_int(self) -> list[int]:
|
|
566
|
+
return list(map(int, self.jobs.keys()))
|
|
567
|
+
|
|
569
568
|
def wait_and_check_shutdown(self) -> list[str]:
|
|
570
569
|
"""
|
|
571
570
|
Wait at most `self.poll_interval`, while also checking for shutdown.
|
|
@@ -602,6 +601,7 @@ class BaseSlurmRunner(BaseRunner):
|
|
|
602
601
|
|
|
603
602
|
def submit(
|
|
604
603
|
self,
|
|
604
|
+
*,
|
|
605
605
|
base_command: str,
|
|
606
606
|
workflow_task_order: int,
|
|
607
607
|
workflow_task_id: int,
|
|
@@ -612,7 +612,23 @@ class BaseSlurmRunner(BaseRunner):
|
|
|
612
612
|
config: SlurmConfig,
|
|
613
613
|
task_type: SubmitTaskType,
|
|
614
614
|
user_id: int,
|
|
615
|
-
) -> tuple[Any, Exception]:
|
|
615
|
+
) -> tuple[Any, Exception | None]:
|
|
616
|
+
"""
|
|
617
|
+
Run a single fractal task.
|
|
618
|
+
|
|
619
|
+
Args:
|
|
620
|
+
base_command:
|
|
621
|
+
workflow_task_order:
|
|
622
|
+
workflow_task_id:
|
|
623
|
+
task_name:
|
|
624
|
+
parameters: Dictionary of parameters.
|
|
625
|
+
history_unit_id:
|
|
626
|
+
Database ID of the corresponding `HistoryUnit` entry.
|
|
627
|
+
task_type: Task type.
|
|
628
|
+
task_files: `TaskFiles` object.
|
|
629
|
+
config: Runner-specific parameters.
|
|
630
|
+
user_id:
|
|
631
|
+
"""
|
|
616
632
|
logger.debug("[submit] START")
|
|
617
633
|
|
|
618
634
|
# Always refresh `executor_error_log` before starting a task
|
|
@@ -687,7 +703,7 @@ class BaseSlurmRunner(BaseRunner):
|
|
|
687
703
|
|
|
688
704
|
create_accounting_record_slurm(
|
|
689
705
|
user_id=user_id,
|
|
690
|
-
slurm_job_ids=self.
|
|
706
|
+
slurm_job_ids=self.job_ids_int,
|
|
691
707
|
)
|
|
692
708
|
|
|
693
709
|
# Retrieval phase
|
|
@@ -757,11 +773,12 @@ class BaseSlurmRunner(BaseRunner):
|
|
|
757
773
|
|
|
758
774
|
def multisubmit(
|
|
759
775
|
self,
|
|
776
|
+
*,
|
|
760
777
|
base_command: str,
|
|
761
778
|
workflow_task_order: int,
|
|
762
779
|
workflow_task_id: int,
|
|
763
780
|
task_name: str,
|
|
764
|
-
list_parameters: list[dict],
|
|
781
|
+
list_parameters: list[dict[str, Any]],
|
|
765
782
|
history_unit_ids: list[int],
|
|
766
783
|
list_task_files: list[TaskFiles],
|
|
767
784
|
task_type: MultisubmitTaskType,
|
|
@@ -769,9 +786,26 @@ class BaseSlurmRunner(BaseRunner):
|
|
|
769
786
|
user_id: int,
|
|
770
787
|
) -> tuple[dict[int, Any], dict[int, BaseException]]:
|
|
771
788
|
"""
|
|
789
|
+
Run a parallel fractal task.
|
|
790
|
+
|
|
772
791
|
Note: `list_parameters`, `list_task_files` and `history_unit_ids`
|
|
773
792
|
have the same size. For parallel tasks, this is also the number of
|
|
774
793
|
input images, while for compound tasks these can differ.
|
|
794
|
+
|
|
795
|
+
Args:
|
|
796
|
+
base_command:
|
|
797
|
+
workflow_task_order:
|
|
798
|
+
workflow_task_id:
|
|
799
|
+
task_name:
|
|
800
|
+
list_parameters:
|
|
801
|
+
List of dictionaries of parameters (each one must include
|
|
802
|
+
`zarr_urls` key).
|
|
803
|
+
history_unit_ids:
|
|
804
|
+
Database IDs of the corresponding `HistoryUnit` entries.
|
|
805
|
+
list_task_files: `TaskFiles` objects.
|
|
806
|
+
task_type: Task type.
|
|
807
|
+
config: Runner-specific parameters.
|
|
808
|
+
user_id:
|
|
775
809
|
"""
|
|
776
810
|
|
|
777
811
|
# Always refresh `executor_error_log` before starting a task
|
|
@@ -779,6 +813,9 @@ class BaseSlurmRunner(BaseRunner):
|
|
|
779
813
|
|
|
780
814
|
config = self._enrich_slurm_config(config)
|
|
781
815
|
|
|
816
|
+
results: dict[int, Any] = {}
|
|
817
|
+
exceptions: dict[int, BaseException] = {}
|
|
818
|
+
|
|
782
819
|
logger.debug(f"[multisubmit] START, {len(list_parameters)=}")
|
|
783
820
|
try:
|
|
784
821
|
if self.is_shutdown():
|
|
@@ -789,8 +826,8 @@ class BaseSlurmRunner(BaseRunner):
|
|
|
789
826
|
status=HistoryUnitStatus.FAILED,
|
|
790
827
|
db_sync=db,
|
|
791
828
|
)
|
|
792
|
-
results
|
|
793
|
-
exceptions
|
|
829
|
+
results = {}
|
|
830
|
+
exceptions = {
|
|
794
831
|
ind: SHUTDOWN_EXCEPTION
|
|
795
832
|
for ind in range(len(list_parameters))
|
|
796
833
|
}
|
|
@@ -812,9 +849,6 @@ class BaseSlurmRunner(BaseRunner):
|
|
|
812
849
|
self._mkdir_local_folder(workdir_local.as_posix())
|
|
813
850
|
self._mkdir_remote_folder(folder=workdir_remote.as_posix())
|
|
814
851
|
|
|
815
|
-
results: dict[int, Any] = {}
|
|
816
|
-
exceptions: dict[int, BaseException] = {}
|
|
817
|
-
|
|
818
852
|
# NOTE: chunking has already taken place in `get_slurm_config`,
|
|
819
853
|
# so that `config.tasks_per_job` is now set.
|
|
820
854
|
|
|
@@ -889,7 +923,7 @@ class BaseSlurmRunner(BaseRunner):
|
|
|
889
923
|
|
|
890
924
|
create_accounting_record_slurm(
|
|
891
925
|
user_id=user_id,
|
|
892
|
-
slurm_job_ids=self.
|
|
926
|
+
slurm_job_ids=self.job_ids_int,
|
|
893
927
|
)
|
|
894
928
|
|
|
895
929
|
except Exception as e:
|
|
@@ -1,50 +1,42 @@
|
|
|
1
|
-
from pathlib import Path
|
|
2
1
|
from typing import Literal
|
|
3
2
|
|
|
4
3
|
from ._batching import heuristics
|
|
5
|
-
from .
|
|
6
|
-
from .
|
|
7
|
-
from ._slurm_config import logger
|
|
8
|
-
from ._slurm_config import SlurmConfig
|
|
9
|
-
from ._slurm_config import SlurmConfigError
|
|
4
|
+
from .slurm_config import logger
|
|
5
|
+
from .slurm_config import SlurmConfig
|
|
10
6
|
from fractal_server.app.models.v2 import WorkflowTaskV2
|
|
7
|
+
from fractal_server.runner.config import JobRunnerConfigSLURM
|
|
8
|
+
from fractal_server.runner.config.slurm_mem_to_MB import slurm_mem_to_MB
|
|
9
|
+
from fractal_server.runner.exceptions import SlurmConfigError
|
|
11
10
|
from fractal_server.string_tools import interpret_as_bool
|
|
12
11
|
|
|
13
12
|
|
|
14
|
-
def
|
|
13
|
+
def _get_slurm_config_internal(
|
|
14
|
+
shared_config: JobRunnerConfigSLURM,
|
|
15
15
|
wftask: WorkflowTaskV2,
|
|
16
16
|
which_type: Literal["non_parallel", "parallel"],
|
|
17
|
-
config_path: Path | None = None,
|
|
18
17
|
) -> SlurmConfig:
|
|
19
18
|
"""
|
|
20
|
-
Prepare a `SlurmConfig` configuration object
|
|
21
19
|
|
|
22
|
-
|
|
23
|
-
or `wftask.meta_non_parallel`. In the following description, let us assume
|
|
24
|
-
that `which_type="parallel"`.
|
|
20
|
+
Prepare a specific `SlurmConfig` configuration.
|
|
25
21
|
|
|
26
|
-
The
|
|
22
|
+
The base configuration is the runner-level `shared_config` object, based
|
|
23
|
+
on `resource.jobs_runner_config` (note that GPU-specific properties take
|
|
24
|
+
priority, when `needs_gpu=True`). We then incorporate attributes from
|
|
25
|
+
`wftask.meta_{non_parallel,parallel}` - with higher priority.
|
|
27
26
|
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
3. Properties in `wftask.meta_parallel` (which typically include those in
|
|
32
|
-
`wftask.task.meta_parallel`). Note that `wftask.meta_parallel` may be
|
|
33
|
-
`None`.
|
|
34
|
-
|
|
35
|
-
Arguments:
|
|
27
|
+
Args:
|
|
28
|
+
shared_config:
|
|
29
|
+
Configuration object based on `resource.jobs_runner_config`.
|
|
36
30
|
wftask:
|
|
37
|
-
|
|
38
|
-
prepared.
|
|
39
|
-
config_path:
|
|
40
|
-
Path of a Fractal SLURM configuration file; if `None`, use
|
|
41
|
-
`FRACTAL_SLURM_CONFIG_FILE` variable from settings.
|
|
31
|
+
WorkflowTaskV2 for which the backend configuration should
|
|
32
|
+
be prepared.
|
|
42
33
|
which_type:
|
|
43
|
-
|
|
34
|
+
Whether we should look at the non-parallel or parallel part
|
|
35
|
+
of `wftask`.
|
|
36
|
+
tot_tasks: Not used here, only present as a common interface.
|
|
44
37
|
|
|
45
38
|
Returns:
|
|
46
|
-
|
|
47
|
-
The SlurmConfig object
|
|
39
|
+
A ready-to-use `SlurmConfig` object.
|
|
48
40
|
"""
|
|
49
41
|
|
|
50
42
|
if which_type == "non_parallel":
|
|
@@ -60,25 +52,19 @@ def get_slurm_config_internal(
|
|
|
60
52
|
f"[get_slurm_config] WorkflowTask meta attribute: {wftask_meta=}"
|
|
61
53
|
)
|
|
62
54
|
|
|
63
|
-
#
|
|
64
|
-
|
|
65
|
-
slurm_dict = slurm_env.default_slurm_config.model_dump(
|
|
55
|
+
# Start from `shared_config`
|
|
56
|
+
slurm_dict = shared_config.default_slurm_config.model_dump(
|
|
66
57
|
exclude_unset=True, exclude={"mem"}
|
|
67
58
|
)
|
|
68
|
-
if
|
|
69
|
-
slurm_dict["mem_per_task_MB"] =
|
|
59
|
+
if shared_config.default_slurm_config.mem:
|
|
60
|
+
slurm_dict["mem_per_task_MB"] = shared_config.default_slurm_config.mem
|
|
70
61
|
|
|
71
62
|
# Incorporate slurm_env.batching_config
|
|
72
|
-
for key, value in
|
|
63
|
+
for key, value in shared_config.batching_config.model_dump().items():
|
|
73
64
|
slurm_dict[key] = value
|
|
74
65
|
|
|
75
66
|
# Incorporate slurm_env.user_local_exports
|
|
76
|
-
slurm_dict["user_local_exports"] =
|
|
77
|
-
|
|
78
|
-
logger.debug(
|
|
79
|
-
"[get_slurm_config] Fractal SLURM configuration file: "
|
|
80
|
-
f"{slurm_env.model_dump()=}"
|
|
81
|
-
)
|
|
67
|
+
slurm_dict["user_local_exports"] = shared_config.user_local_exports
|
|
82
68
|
|
|
83
69
|
# GPU-related options
|
|
84
70
|
# Notes about priority:
|
|
@@ -92,12 +78,12 @@ def get_slurm_config_internal(
|
|
|
92
78
|
needs_gpu = False
|
|
93
79
|
logger.debug(f"[get_slurm_config] {needs_gpu=}")
|
|
94
80
|
if needs_gpu:
|
|
95
|
-
for key, value in
|
|
81
|
+
for key, value in shared_config.gpu_slurm_config.model_dump(
|
|
96
82
|
exclude_unset=True, exclude={"mem"}
|
|
97
83
|
).items():
|
|
98
84
|
slurm_dict[key] = value
|
|
99
|
-
if
|
|
100
|
-
slurm_dict["mem_per_task_MB"] =
|
|
85
|
+
if shared_config.gpu_slurm_config.mem:
|
|
86
|
+
slurm_dict["mem_per_task_MB"] = shared_config.gpu_slurm_config.mem
|
|
101
87
|
|
|
102
88
|
# Number of CPUs per task, for multithreading
|
|
103
89
|
if wftask_meta is not None and "cpus_per_task" in wftask_meta:
|
|
@@ -107,7 +93,7 @@ def get_slurm_config_internal(
|
|
|
107
93
|
# Required memory per task, in MB
|
|
108
94
|
if wftask_meta is not None and "mem" in wftask_meta:
|
|
109
95
|
raw_mem = wftask_meta["mem"]
|
|
110
|
-
mem_per_task_MB =
|
|
96
|
+
mem_per_task_MB = slurm_mem_to_MB(raw_mem)
|
|
111
97
|
slurm_dict["mem_per_task_MB"] = mem_per_task_MB
|
|
112
98
|
|
|
113
99
|
# Job name
|
|
@@ -144,8 +130,7 @@ def get_slurm_config_internal(
|
|
|
144
130
|
extra_lines = slurm_dict.get("extra_lines", []) + extra_lines
|
|
145
131
|
if len(set(extra_lines)) != len(extra_lines):
|
|
146
132
|
logger.debug(
|
|
147
|
-
"[get_slurm_config] Removing repeated elements
|
|
148
|
-
f"{extra_lines=}."
|
|
133
|
+
f"[get_slurm_config] Removing repeated elements in {extra_lines=}."
|
|
149
134
|
)
|
|
150
135
|
extra_lines = list(set(extra_lines))
|
|
151
136
|
slurm_dict["extra_lines"] = extra_lines
|
|
@@ -164,8 +149,7 @@ def get_slurm_config_internal(
|
|
|
164
149
|
|
|
165
150
|
# Put everything together
|
|
166
151
|
logger.debug(
|
|
167
|
-
"[get_slurm_config]
|
|
168
|
-
f"{slurm_dict=}"
|
|
152
|
+
f"[get_slurm_config] Create SlurmConfig object based on {slurm_dict=}"
|
|
169
153
|
)
|
|
170
154
|
slurm_config = SlurmConfig(**slurm_dict)
|
|
171
155
|
|
|
@@ -173,15 +157,15 @@ def get_slurm_config_internal(
|
|
|
173
157
|
|
|
174
158
|
|
|
175
159
|
def get_slurm_config(
|
|
160
|
+
shared_config: JobRunnerConfigSLURM,
|
|
176
161
|
wftask: WorkflowTaskV2,
|
|
177
162
|
which_type: Literal["non_parallel", "parallel"],
|
|
178
|
-
config_path: Path | None = None,
|
|
179
163
|
tot_tasks: int = 1,
|
|
180
164
|
) -> SlurmConfig:
|
|
181
|
-
config =
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
165
|
+
config = _get_slurm_config_internal(
|
|
166
|
+
shared_config=shared_config,
|
|
167
|
+
wftask=wftask,
|
|
168
|
+
which_type=which_type,
|
|
185
169
|
)
|
|
186
170
|
|
|
187
171
|
# Set/validate parameters for task batching
|
|
@@ -1,208 +1,17 @@
|
|
|
1
|
-
# Copyright 2022 (C) Friedrich Miescher Institute for Biomedical Research and
|
|
2
|
-
# University of Zurich
|
|
3
|
-
#
|
|
4
|
-
# Original authors:
|
|
5
|
-
# Jacopo Nespolo <jacopo.nespolo@exact-lab.it>
|
|
6
|
-
# Tommaso Comparin <tommaso.comparin@exact-lab.it>
|
|
7
|
-
#
|
|
8
|
-
# This file is part of Fractal and was originally developed by eXact lab S.r.l.
|
|
9
|
-
# <exact-lab.it> under contract with Liberali Lab from the Friedrich Miescher
|
|
10
|
-
# Institute for Biomedical Research and Pelkmans Lab from the University of
|
|
11
|
-
# Zurich.
|
|
12
1
|
"""
|
|
13
2
|
Submodule to handle the SLURM configuration for a WorkflowTask
|
|
14
3
|
"""
|
|
15
|
-
import json
|
|
16
4
|
from pathlib import Path
|
|
17
5
|
|
|
18
6
|
from pydantic import BaseModel
|
|
19
7
|
from pydantic import ConfigDict
|
|
20
8
|
from pydantic import Field
|
|
21
|
-
from pydantic import ValidationError
|
|
22
9
|
|
|
23
|
-
from fractal_server.config import get_settings
|
|
24
10
|
from fractal_server.logger import set_logger
|
|
25
|
-
from fractal_server.syringe import Inject
|
|
26
11
|
|
|
27
12
|
logger = set_logger(__name__)
|
|
28
13
|
|
|
29
14
|
|
|
30
|
-
class SlurmConfigError(ValueError):
|
|
31
|
-
"""
|
|
32
|
-
Slurm configuration error
|
|
33
|
-
"""
|
|
34
|
-
|
|
35
|
-
pass
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
class _SlurmConfigSet(BaseModel):
|
|
39
|
-
"""
|
|
40
|
-
Options that can be set in `FRACTAL_SLURM_CONFIG_FILE` for the default/gpu
|
|
41
|
-
SLURM config. Only used as part of `SlurmConfigFile`.
|
|
42
|
-
|
|
43
|
-
Attributes:
|
|
44
|
-
partition:
|
|
45
|
-
cpus_per_task:
|
|
46
|
-
mem:
|
|
47
|
-
See `_parse_mem_value` for details on allowed values.
|
|
48
|
-
constraint:
|
|
49
|
-
gres:
|
|
50
|
-
time:
|
|
51
|
-
exclude:
|
|
52
|
-
nodelist:
|
|
53
|
-
account:
|
|
54
|
-
extra_lines:
|
|
55
|
-
"""
|
|
56
|
-
|
|
57
|
-
model_config = ConfigDict(extra="forbid")
|
|
58
|
-
|
|
59
|
-
partition: str | None = None
|
|
60
|
-
cpus_per_task: int | None = None
|
|
61
|
-
mem: int | str | None = None
|
|
62
|
-
constraint: str | None = None
|
|
63
|
-
gres: str | None = None
|
|
64
|
-
exclude: str | None = None
|
|
65
|
-
nodelist: str | None = None
|
|
66
|
-
time: str | None = None
|
|
67
|
-
account: str | None = None
|
|
68
|
-
extra_lines: list[str] | None = None
|
|
69
|
-
gpus: str | None = None
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
class _BatchingConfigSet(BaseModel):
|
|
73
|
-
"""
|
|
74
|
-
Options that can be set in `FRACTAL_SLURM_CONFIG_FILE` to configure the
|
|
75
|
-
batching strategy (that is, how to combine several tasks in a single SLURM
|
|
76
|
-
job). Only used as part of `SlurmConfigFile`.
|
|
77
|
-
|
|
78
|
-
Attributes:
|
|
79
|
-
target_cpus_per_job:
|
|
80
|
-
max_cpus_per_job:
|
|
81
|
-
target_mem_per_job:
|
|
82
|
-
(see `_parse_mem_value` for details on allowed values)
|
|
83
|
-
max_mem_per_job:
|
|
84
|
-
(see `_parse_mem_value` for details on allowed values)
|
|
85
|
-
target_num_jobs:
|
|
86
|
-
max_num_jobs:
|
|
87
|
-
"""
|
|
88
|
-
|
|
89
|
-
model_config = ConfigDict(extra="forbid")
|
|
90
|
-
|
|
91
|
-
target_cpus_per_job: int
|
|
92
|
-
max_cpus_per_job: int
|
|
93
|
-
target_mem_per_job: int | str
|
|
94
|
-
max_mem_per_job: int | str
|
|
95
|
-
target_num_jobs: int
|
|
96
|
-
max_num_jobs: int
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
class SlurmConfigFile(BaseModel):
|
|
100
|
-
"""
|
|
101
|
-
Specifications for the content of `FRACTAL_SLURM_CONFIG_FILE`
|
|
102
|
-
|
|
103
|
-
This must be a JSON file, and a valid example is
|
|
104
|
-
```JSON
|
|
105
|
-
{
|
|
106
|
-
"default_slurm_config": {
|
|
107
|
-
"partition": "main",
|
|
108
|
-
"cpus_per_task": 1
|
|
109
|
-
},
|
|
110
|
-
"gpu_slurm_config": {
|
|
111
|
-
"partition": "gpu",
|
|
112
|
-
"extra_lines": ["#SBATCH --gres=gpu:v100:1"]
|
|
113
|
-
},
|
|
114
|
-
"batching_config": {
|
|
115
|
-
"target_cpus_per_job": 1,
|
|
116
|
-
"max_cpus_per_job": 1,
|
|
117
|
-
"target_mem_per_job": 200,
|
|
118
|
-
"max_mem_per_job": 500,
|
|
119
|
-
"target_num_jobs": 2,
|
|
120
|
-
"max_num_jobs": 4
|
|
121
|
-
},
|
|
122
|
-
"user_local_exports": {
|
|
123
|
-
"CELLPOSE_LOCAL_MODELS_PATH": "CELLPOSE_LOCAL_MODELS_PATH",
|
|
124
|
-
"NAPARI_CONFIG": "napari_config.json"
|
|
125
|
-
}
|
|
126
|
-
}
|
|
127
|
-
```
|
|
128
|
-
|
|
129
|
-
See `_SlurmConfigSet` and `_BatchingConfigSet` for more details.
|
|
130
|
-
|
|
131
|
-
Attributes:
|
|
132
|
-
default_slurm_config:
|
|
133
|
-
Common default options for all tasks.
|
|
134
|
-
gpu_slurm_config:
|
|
135
|
-
Default configuration for all GPU tasks.
|
|
136
|
-
batching_config:
|
|
137
|
-
Configuration of the batching strategy.
|
|
138
|
-
user_local_exports:
|
|
139
|
-
Key-value pairs to be included as `export`-ed variables in SLURM
|
|
140
|
-
submission script, after prepending values with the user's cache
|
|
141
|
-
directory.
|
|
142
|
-
"""
|
|
143
|
-
|
|
144
|
-
model_config = ConfigDict(extra="forbid")
|
|
145
|
-
|
|
146
|
-
default_slurm_config: _SlurmConfigSet
|
|
147
|
-
gpu_slurm_config: _SlurmConfigSet | None = None
|
|
148
|
-
batching_config: _BatchingConfigSet
|
|
149
|
-
user_local_exports: dict[str, str] | None = None
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
def load_slurm_config_file(
|
|
153
|
-
config_path: Path | None = None,
|
|
154
|
-
) -> SlurmConfigFile:
|
|
155
|
-
"""
|
|
156
|
-
Load a SLURM configuration file and validate its content with
|
|
157
|
-
`SlurmConfigFile`.
|
|
158
|
-
|
|
159
|
-
Arguments:
|
|
160
|
-
config_path:
|
|
161
|
-
"""
|
|
162
|
-
|
|
163
|
-
if not config_path:
|
|
164
|
-
settings = Inject(get_settings)
|
|
165
|
-
config_path = settings.FRACTAL_SLURM_CONFIG_FILE
|
|
166
|
-
|
|
167
|
-
# Load file
|
|
168
|
-
logger.debug(f"[get_slurm_config] Now loading {config_path=}")
|
|
169
|
-
try:
|
|
170
|
-
with config_path.open("r") as f:
|
|
171
|
-
slurm_env = json.load(f)
|
|
172
|
-
except Exception as e:
|
|
173
|
-
raise SlurmConfigError(
|
|
174
|
-
f"Error while loading {config_path=}. "
|
|
175
|
-
f"Original error:\n{str(e)}"
|
|
176
|
-
)
|
|
177
|
-
|
|
178
|
-
# Validate file content
|
|
179
|
-
logger.debug(f"[load_slurm_config_file] Now validating {config_path=}")
|
|
180
|
-
logger.debug(f"[load_slurm_config_file] {slurm_env=}")
|
|
181
|
-
try:
|
|
182
|
-
obj = SlurmConfigFile(**slurm_env)
|
|
183
|
-
except ValidationError as e:
|
|
184
|
-
raise SlurmConfigError(
|
|
185
|
-
f"Error while loading {config_path=}. "
|
|
186
|
-
f"Original error:\n{str(e)}"
|
|
187
|
-
)
|
|
188
|
-
|
|
189
|
-
# Convert memory to MB units, in all relevant attributes
|
|
190
|
-
if obj.default_slurm_config.mem:
|
|
191
|
-
obj.default_slurm_config.mem = _parse_mem_value(
|
|
192
|
-
obj.default_slurm_config.mem
|
|
193
|
-
)
|
|
194
|
-
if obj.gpu_slurm_config and obj.gpu_slurm_config.mem:
|
|
195
|
-
obj.gpu_slurm_config.mem = _parse_mem_value(obj.gpu_slurm_config.mem)
|
|
196
|
-
obj.batching_config.target_mem_per_job = _parse_mem_value(
|
|
197
|
-
obj.batching_config.target_mem_per_job
|
|
198
|
-
)
|
|
199
|
-
obj.batching_config.max_mem_per_job = _parse_mem_value(
|
|
200
|
-
obj.batching_config.max_mem_per_job
|
|
201
|
-
)
|
|
202
|
-
|
|
203
|
-
return obj
|
|
204
|
-
|
|
205
|
-
|
|
206
15
|
class SlurmConfig(BaseModel):
|
|
207
16
|
"""
|
|
208
17
|
Abstraction for SLURM parameters
|
|
@@ -210,9 +19,7 @@ class SlurmConfig(BaseModel):
|
|
|
210
19
|
**NOTE**: `SlurmConfig` objects are created internally in `fractal-server`,
|
|
211
20
|
and they are not meant to be initialized by the user; the same holds for
|
|
212
21
|
`SlurmConfig` attributes (e.g. `mem_per_task_MB`), which are not meant to
|
|
213
|
-
be part of the `
|
|
214
|
-
expected file content are defined in
|
|
215
|
-
[`SlurmConfigFile`](#fractal_server.runner._slurm._slurm_config.SlurmConfigFile)).
|
|
22
|
+
be part of the superuser-defined `resource.jobs_runner_config` JSON field.
|
|
216
23
|
|
|
217
24
|
Part of the attributes map directly to some of the SLURM attributes (see
|
|
218
25
|
https://slurm.schedmd.com/sbatch.html), e.g. `partition`. Other attributes
|
|
@@ -313,7 +120,7 @@ class SlurmConfig(BaseModel):
|
|
|
313
120
|
2. Lines starting with `self.prefix`;
|
|
314
121
|
3. Other lines.
|
|
315
122
|
|
|
316
|
-
|
|
123
|
+
Args:
|
|
317
124
|
script_lines:
|
|
318
125
|
"""
|
|
319
126
|
|
|
@@ -335,7 +142,7 @@ class SlurmConfig(BaseModel):
|
|
|
335
142
|
Compile `SlurmConfig` object into the preamble of a SLURM submission
|
|
336
143
|
script.
|
|
337
144
|
|
|
338
|
-
|
|
145
|
+
Args:
|
|
339
146
|
remote_export_dir:
|
|
340
147
|
Base directory for exports defined in
|
|
341
148
|
`self.user_local_exports`.
|
|
@@ -411,61 +218,3 @@ class SlurmConfig(BaseModel):
|
|
|
411
218
|
@property
|
|
412
219
|
def batch_size(self) -> int:
|
|
413
220
|
return self.tasks_per_job
|
|
414
|
-
|
|
415
|
-
|
|
416
|
-
def _parse_mem_value(raw_mem: str | int) -> int:
|
|
417
|
-
"""
|
|
418
|
-
Convert a memory-specification string into an integer (in MB units), or
|
|
419
|
-
simply return the input if it is already an integer.
|
|
420
|
-
|
|
421
|
-
Supported units are `"M", "G", "T"`, with `"M"` being the default; some
|
|
422
|
-
parsing examples are: `"10M" -> 10000`, `"3G" -> 3000000`.
|
|
423
|
-
|
|
424
|
-
Arguments:
|
|
425
|
-
raw_mem:
|
|
426
|
-
A string (e.g. `"100M"`) or an integer (in MB).
|
|
427
|
-
|
|
428
|
-
Returns:
|
|
429
|
-
Integer value of memory in MB units.
|
|
430
|
-
"""
|
|
431
|
-
|
|
432
|
-
info = f"[_parse_mem_value] {raw_mem=}"
|
|
433
|
-
error_msg = (
|
|
434
|
-
f"{info}, invalid specification of memory requirements "
|
|
435
|
-
"(valid examples: 93, 71M, 93G, 71T)."
|
|
436
|
-
)
|
|
437
|
-
|
|
438
|
-
# Handle integer argument
|
|
439
|
-
if type(raw_mem) is int:
|
|
440
|
-
return raw_mem
|
|
441
|
-
|
|
442
|
-
# Handle string argument
|
|
443
|
-
if not raw_mem[0].isdigit(): # fail e.g. for raw_mem="M100"
|
|
444
|
-
logger.error(error_msg)
|
|
445
|
-
raise SlurmConfigError(error_msg)
|
|
446
|
-
if raw_mem.isdigit():
|
|
447
|
-
mem_MB = int(raw_mem)
|
|
448
|
-
elif raw_mem.endswith("M"):
|
|
449
|
-
stripped_raw_mem = raw_mem.strip("M")
|
|
450
|
-
if not stripped_raw_mem.isdigit():
|
|
451
|
-
logger.error(error_msg)
|
|
452
|
-
raise SlurmConfigError(error_msg)
|
|
453
|
-
mem_MB = int(stripped_raw_mem)
|
|
454
|
-
elif raw_mem.endswith("G"):
|
|
455
|
-
stripped_raw_mem = raw_mem.strip("G")
|
|
456
|
-
if not stripped_raw_mem.isdigit():
|
|
457
|
-
logger.error(error_msg)
|
|
458
|
-
raise SlurmConfigError(error_msg)
|
|
459
|
-
mem_MB = int(stripped_raw_mem) * 10**3
|
|
460
|
-
elif raw_mem.endswith("T"):
|
|
461
|
-
stripped_raw_mem = raw_mem.strip("T")
|
|
462
|
-
if not stripped_raw_mem.isdigit():
|
|
463
|
-
logger.error(error_msg)
|
|
464
|
-
raise SlurmConfigError(error_msg)
|
|
465
|
-
mem_MB = int(stripped_raw_mem) * 10**6
|
|
466
|
-
else:
|
|
467
|
-
logger.error(error_msg)
|
|
468
|
-
raise SlurmConfigError(error_msg)
|
|
469
|
-
|
|
470
|
-
logger.debug(f"{info}, return {mem_MB}")
|
|
471
|
-
return mem_MB
|