fractal-server 2.13.1__py3-none-any.whl → 2.14.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fractal_server/__init__.py +1 -1
- fractal_server/__main__.py +3 -1
- fractal_server/app/models/linkusergroup.py +6 -2
- fractal_server/app/models/v2/__init__.py +7 -1
- fractal_server/app/models/v2/dataset.py +1 -11
- fractal_server/app/models/v2/history.py +78 -0
- fractal_server/app/models/v2/job.py +10 -3
- fractal_server/app/models/v2/task_group.py +2 -2
- fractal_server/app/models/v2/workflow.py +1 -1
- fractal_server/app/models/v2/workflowtask.py +1 -1
- fractal_server/app/routes/admin/v2/accounting.py +18 -28
- fractal_server/app/routes/admin/v2/task.py +1 -1
- fractal_server/app/routes/admin/v2/task_group.py +0 -17
- fractal_server/app/routes/api/__init__.py +1 -1
- fractal_server/app/routes/api/v2/__init__.py +8 -2
- fractal_server/app/routes/api/v2/_aux_functions.py +66 -0
- fractal_server/app/routes/api/v2/_aux_functions_history.py +166 -0
- fractal_server/app/routes/api/v2/dataset.py +0 -17
- fractal_server/app/routes/api/v2/history.py +544 -0
- fractal_server/app/routes/api/v2/images.py +31 -43
- fractal_server/app/routes/api/v2/job.py +30 -0
- fractal_server/app/routes/api/v2/project.py +1 -53
- fractal_server/app/routes/api/v2/{status.py → status_legacy.py} +6 -6
- fractal_server/app/routes/api/v2/submit.py +16 -14
- fractal_server/app/routes/api/v2/task.py +3 -10
- fractal_server/app/routes/api/v2/task_collection_custom.py +4 -9
- fractal_server/app/routes/api/v2/task_group.py +0 -17
- fractal_server/app/routes/api/v2/verify_image_types.py +61 -0
- fractal_server/app/routes/api/v2/workflow.py +28 -69
- fractal_server/app/routes/api/v2/workflowtask.py +53 -50
- fractal_server/app/routes/auth/group.py +0 -16
- fractal_server/app/routes/auth/oauth.py +5 -3
- fractal_server/app/routes/pagination.py +47 -0
- fractal_server/app/runner/components.py +0 -3
- fractal_server/app/runner/compress_folder.py +57 -29
- fractal_server/app/runner/exceptions.py +4 -0
- fractal_server/app/runner/executors/base_runner.py +157 -0
- fractal_server/app/runner/{v2/_local/_local_config.py → executors/local/get_local_config.py} +7 -9
- fractal_server/app/runner/executors/local/runner.py +248 -0
- fractal_server/app/runner/executors/{slurm → slurm_common}/_batching.py +1 -1
- fractal_server/app/runner/executors/{slurm → slurm_common}/_slurm_config.py +9 -7
- fractal_server/app/runner/executors/slurm_common/base_slurm_runner.py +868 -0
- fractal_server/app/runner/{v2/_slurm_common → executors/slurm_common}/get_slurm_config.py +48 -17
- fractal_server/app/runner/executors/{slurm → slurm_common}/remote.py +36 -47
- fractal_server/app/runner/executors/slurm_common/slurm_job_task_models.py +134 -0
- fractal_server/app/runner/executors/slurm_ssh/runner.py +268 -0
- fractal_server/app/runner/executors/slurm_sudo/__init__.py +0 -0
- fractal_server/app/runner/executors/{slurm/sudo → slurm_sudo}/_subprocess_run_as_user.py +2 -83
- fractal_server/app/runner/executors/slurm_sudo/runner.py +193 -0
- fractal_server/app/runner/extract_archive.py +1 -3
- fractal_server/app/runner/task_files.py +134 -87
- fractal_server/app/runner/v2/__init__.py +0 -399
- fractal_server/app/runner/v2/_local.py +88 -0
- fractal_server/app/runner/v2/{_slurm_ssh/__init__.py → _slurm_ssh.py} +20 -19
- fractal_server/app/runner/v2/{_slurm_sudo/__init__.py → _slurm_sudo.py} +17 -15
- fractal_server/app/runner/v2/db_tools.py +119 -0
- fractal_server/app/runner/v2/runner.py +206 -95
- fractal_server/app/runner/v2/runner_functions.py +488 -187
- fractal_server/app/runner/v2/runner_functions_low_level.py +40 -43
- fractal_server/app/runner/v2/submit_workflow.py +358 -0
- fractal_server/app/runner/v2/task_interface.py +31 -0
- fractal_server/app/schemas/_validators.py +13 -24
- fractal_server/app/schemas/user.py +10 -7
- fractal_server/app/schemas/user_settings.py +9 -21
- fractal_server/app/schemas/v2/__init__.py +9 -1
- fractal_server/app/schemas/v2/dataset.py +12 -94
- fractal_server/app/schemas/v2/dumps.py +26 -9
- fractal_server/app/schemas/v2/history.py +80 -0
- fractal_server/app/schemas/v2/job.py +15 -8
- fractal_server/app/schemas/v2/manifest.py +14 -7
- fractal_server/app/schemas/v2/project.py +9 -7
- fractal_server/app/schemas/v2/status_legacy.py +35 -0
- fractal_server/app/schemas/v2/task.py +72 -77
- fractal_server/app/schemas/v2/task_collection.py +14 -32
- fractal_server/app/schemas/v2/task_group.py +10 -9
- fractal_server/app/schemas/v2/workflow.py +10 -11
- fractal_server/app/schemas/v2/workflowtask.py +2 -21
- fractal_server/app/security/__init__.py +3 -3
- fractal_server/app/security/signup_email.py +2 -2
- fractal_server/config.py +41 -46
- fractal_server/images/tools.py +23 -0
- fractal_server/migrations/versions/47351f8c7ebc_drop_dataset_filters.py +50 -0
- fractal_server/migrations/versions/9db60297b8b2_set_ondelete.py +250 -0
- fractal_server/migrations/versions/c90a7c76e996_job_id_in_history_run.py +41 -0
- fractal_server/migrations/versions/e81103413827_add_job_type_filters.py +36 -0
- fractal_server/migrations/versions/f37aceb45062_make_historyunit_logfile_required.py +39 -0
- fractal_server/migrations/versions/fbce16ff4e47_new_history_items.py +120 -0
- fractal_server/ssh/_fabric.py +28 -14
- fractal_server/tasks/v2/local/collect.py +2 -2
- fractal_server/tasks/v2/ssh/collect.py +2 -2
- fractal_server/tasks/v2/templates/2_pip_install.sh +1 -1
- fractal_server/tasks/v2/templates/4_pip_show.sh +1 -1
- fractal_server/tasks/v2/utils_background.py +0 -19
- fractal_server/tasks/v2/utils_database.py +30 -17
- fractal_server/tasks/v2/utils_templates.py +6 -0
- {fractal_server-2.13.1.dist-info → fractal_server-2.14.0.dist-info}/METADATA +4 -4
- {fractal_server-2.13.1.dist-info → fractal_server-2.14.0.dist-info}/RECORD +106 -96
- {fractal_server-2.13.1.dist-info → fractal_server-2.14.0.dist-info}/WHEEL +1 -1
- fractal_server/app/runner/executors/slurm/ssh/_executor_wait_thread.py +0 -126
- fractal_server/app/runner/executors/slurm/ssh/_slurm_job.py +0 -116
- fractal_server/app/runner/executors/slurm/ssh/executor.py +0 -1386
- fractal_server/app/runner/executors/slurm/sudo/_check_jobs_status.py +0 -71
- fractal_server/app/runner/executors/slurm/sudo/_executor_wait_thread.py +0 -130
- fractal_server/app/runner/executors/slurm/sudo/executor.py +0 -1281
- fractal_server/app/runner/v2/_local/__init__.py +0 -132
- fractal_server/app/runner/v2/_local/_submit_setup.py +0 -52
- fractal_server/app/runner/v2/_local/executor.py +0 -100
- fractal_server/app/runner/v2/_slurm_ssh/_submit_setup.py +0 -83
- fractal_server/app/runner/v2/_slurm_sudo/_submit_setup.py +0 -83
- fractal_server/app/runner/v2/handle_failed_job.py +0 -59
- fractal_server/app/schemas/v2/status.py +0 -16
- /fractal_server/app/{runner/executors/slurm → history}/__init__.py +0 -0
- /fractal_server/app/runner/executors/{slurm/ssh → local}/__init__.py +0 -0
- /fractal_server/app/runner/executors/{slurm/sudo → slurm_common}/__init__.py +0 -0
- /fractal_server/app/runner/executors/{_job_states.py → slurm_common/_job_states.py} +0 -0
- /fractal_server/app/runner/executors/{slurm → slurm_common}/utils_executors.py +0 -0
- /fractal_server/app/runner/{v2/_slurm_common → executors/slurm_ssh}/__init__.py +0 -0
- {fractal_server-2.13.1.dist-info → fractal_server-2.14.0.dist-info}/LICENSE +0 -0
- {fractal_server-2.13.1.dist-info → fractal_server-2.14.0.dist-info}/entry_points.txt +0 -0
fractal_server/app/runner/{v2/_slurm_common → executors/slurm_common}/get_slurm_config.py

```diff
@@ -2,21 +2,16 @@ from pathlib import Path
 from typing import Literal
 from typing import Optional

+from ._batching import heuristics
+from ._slurm_config import _parse_mem_value
+from ._slurm_config import load_slurm_config_file
+from ._slurm_config import logger
+from ._slurm_config import SlurmConfig
+from ._slurm_config import SlurmConfigError
 from fractal_server.app.models.v2 import WorkflowTaskV2
-from fractal_server.app.runner.executors.slurm._slurm_config import (
-    _parse_mem_value,
-)
-from fractal_server.app.runner.executors.slurm._slurm_config import (
-    load_slurm_config_file,
-)
-from fractal_server.app.runner.executors.slurm._slurm_config import logger
-from fractal_server.app.runner.executors.slurm._slurm_config import SlurmConfig
-from fractal_server.app.runner.executors.slurm._slurm_config import (
-    SlurmConfigError,
-)


-def get_slurm_config(
+def get_slurm_config_internal(
     wftask: WorkflowTaskV2,
     which_type: Literal["non_parallel", "parallel"],
     config_path: Optional[Path] = None,
```
```diff
@@ -25,7 +20,7 @@ def get_slurm_config(
     Prepare a `SlurmConfig` configuration object

     The argument `which_type` determines whether we use `wftask.meta_parallel`
-    or `wftask.meta_non_parallel`. In the following
+    or `wftask.meta_non_parallel`. In the following description, let us assume
     that `which_type="parallel"`.

     The sources for `SlurmConfig` attributes, in increasing priority order, are
```
```diff
@@ -142,8 +137,8 @@ def get_slurm_config(
     extra_lines = slurm_dict.get("extra_lines", []) + extra_lines
     if len(set(extra_lines)) != len(extra_lines):
         logger.debug(
-            "[get_slurm_config] Removing repeated elements "
-            f"
+            "[get_slurm_config] Removing repeated elements from "
+            f"{extra_lines=}."
         )
         extra_lines = list(set(extra_lines))
     slurm_dict["extra_lines"] = extra_lines
```
```diff
@@ -162,9 +157,45 @@ def get_slurm_config(

     # Put everything together
     logger.debug(
-        "[get_slurm_config] Now create a SlurmConfig object based "
-        f"
+        "[get_slurm_config] Now create a SlurmConfig object based on "
+        f"{slurm_dict=}"
     )
     slurm_config = SlurmConfig(**slurm_dict)

     return slurm_config
+
+
+def get_slurm_config(
+    wftask: WorkflowTaskV2,
+    which_type: Literal["non_parallel", "parallel"],
+    config_path: Optional[Path] = None,
+    tot_tasks: int = 1,
+) -> SlurmConfig:
+    config = get_slurm_config_internal(
+        wftask,
+        which_type,
+        config_path,
+    )
+
+    # Set/validate parameters for task batching
+    tasks_per_job, parallel_tasks_per_job = heuristics(
+        # Number of parallel components (always known)
+        tot_tasks=tot_tasks,
+        # Optional WorkflowTask attributes:
+        tasks_per_job=config.tasks_per_job,
+        parallel_tasks_per_job=config.parallel_tasks_per_job,  # noqa
+        # Task requirements (multiple possible sources):
+        cpus_per_task=config.cpus_per_task,
+        mem_per_task=config.mem_per_task_MB,
+        # Fractal configuration variables (soft/hard limits):
+        target_cpus_per_job=config.target_cpus_per_job,
+        target_mem_per_job=config.target_mem_per_job,
+        target_num_jobs=config.target_num_jobs,
+        max_cpus_per_job=config.max_cpus_per_job,
+        max_mem_per_job=config.max_mem_per_job,
+        max_num_jobs=config.max_num_jobs,
+    )
+    config.parallel_tasks_per_job = parallel_tasks_per_job
+    config.tasks_per_job = tasks_per_job
+
+    return config
```
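The split above separates configuration parsing (`get_slurm_config_internal`) from batching: the new public `get_slurm_config` additionally runs the `heuristics` step and writes `tasks_per_job`/`parallel_tasks_per_job` back onto the config. A minimal usage sketch, assuming an existing `WorkflowTaskV2` instance and an optional SLURM configuration file (both placeholders, not taken from the package):

```python
from pathlib import Path
from typing import Optional

from fractal_server.app.runner.executors.slurm_common.get_slurm_config import (
    get_slurm_config,
)


def build_batched_slurm_config(
    wftask,  # assumed to be a WorkflowTaskV2 instance
    tot_tasks: int,
    config_path: Optional[Path] = None,  # optional SLURM config file
):
    config = get_slurm_config(
        wftask=wftask,
        which_type="parallel",  # read wftask.meta_parallel
        config_path=config_path,
        tot_tasks=tot_tasks,  # number of parallel components
    )
    # Batching parameters are now set consistently with the task requirements.
    return config
```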
fractal_server/app/runner/executors/{slurm → slurm_common}/remote.py

```diff
@@ -19,7 +19,6 @@ import os
 import sys
 from typing import Literal
 from typing import Optional
-from typing import Type
 from typing import Union

 import cloudpickle
```
```diff
@@ -27,30 +26,6 @@ import cloudpickle
 from fractal_server import __VERSION__


-class ExceptionProxy:
-    """
-    Proxy class to serialise exceptions
-
-    In general exceptions are not serialisable. This proxy class saves the
-    serialisable content of an exception. On the receiving end, it can be used
-    to reconstruct a TaskExecutionError.
-
-    Attributes:
-        exc_type_name: Name of the exception type
-        tb: TBD
-        args: TBD
-        kwargs: TBD
-    """
-
-    def __init__(
-        self, exc_type: Type[BaseException], tb: str, *args, **kwargs
-    ):
-        self.exc_type_name: str = exc_type.__name__
-        self.tb: str = tb
-        self.args = args
-        self.kwargs: dict = kwargs
-
-
 class FractalVersionMismatch(RuntimeError):
     """
     Custom exception for version mismatch
```
```diff
@@ -79,18 +54,21 @@ def _check_versions_mismatch(
     do not match with the ones on the server
     """

-    server_python_version = server_versions["python"]
-    worker_python_version = sys.version_info[:3]
+    server_python_version = list(server_versions["python"])
+    worker_python_version = list(sys.version_info[:3])
     if worker_python_version != server_python_version:
-
-
-
-
-
-
-
-
-
+        if worker_python_version[:2] != server_python_version[:2]:
+            # FIXME: Turn this into an error, in some version post 2.14.
+            logging.error(
+                f"{server_python_version=} but {worker_python_version=}. "
+                "This configuration will be deprecated in a future version, "
+                "please contact the admin of this Fractal instance."
+            )
+        else:
+            # Major.minor versions match, patch versions differ
+            logging.warning(
+                f"{server_python_version=} but {worker_python_version=}."
+            )

     server_cloudpickle_version = server_versions["cloudpickle"]
     worker_cloudpickle_version = cloudpickle.__version__
```
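The rewritten check downgrades a Python-version mismatch from a hard stop to logging: a differing patch version only logs a warning, while a differing major or minor version logs an error (scheduled to become a hard error after 2.14, per the FIXME). A worked example of the branch condition, using made-up version triples:

```python
# Illustrative values only; the lists mirror `list(sys.version_info[:3])`.
server = [3, 11, 7]

worker_patch_diff = [3, 11, 4]
worker_minor_diff = [3, 10, 4]

# Same major.minor, different patch -> only logging.warning is emitted
assert worker_patch_diff != server and worker_patch_diff[:2] == server[:2]

# Different major.minor -> logging.error is emitted (future hard error)
assert worker_minor_diff[:2] != server[:2]
```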
```diff
@@ -134,28 +112,39 @@ def worker(
     _extra_import_paths = extra_import_paths.split(":")
     sys.path[:0] = _extra_import_paths

-    # Execute the job and
+    # Execute the job and capture exceptions
     try:
         with open(in_fname, "rb") as f:
             indata = f.read()
         server_versions, fun, args, kwargs = cloudpickle.loads(indata)
         _check_versions_mismatch(server_versions)

-        result = True, fun(*args, **kwargs)
+        result = (True, fun(*args, **kwargs))
         out = cloudpickle.dumps(result)
     except Exception as e:
+        # Exception objects are not serialisable. Here we save the relevant
+        # exception contents in a serializable dictionary. Note that whenever
+        # the task failed "properly", the exception is a `TaskExecutionError`
+        # and it has additional attributes.
+
         import traceback

-
-
-
-
-
-
-            **e.__dict__,
+        exc_type, exc_value, traceback_obj = sys.exc_info()
+        traceback_obj = traceback_obj.tb_next
+        traceback_list = traceback.format_exception(
+            exc_type,
+            exc_value,
+            traceback_obj,
         )
-
-
+        traceback_string = "".join(traceback_list)
+        exc_proxy = dict(
+            exc_type_name=exc_type.__name__,
+            traceback_string=traceback_string,
+            workflow_task_order=getattr(e, "workflow_task_order", None),
+            workflow_task_id=getattr(e, "workflow_task_id", None),
+            task_name=getattr(e, "task_name", None),
+        )
+        result = (False, exc_proxy)
         out = cloudpickle.dumps(result)

     # Write the output pickle file
```
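With `ExceptionProxy` removed, the worker now pickles a `(success, payload)` tuple, where a failure payload is the plain dictionary built above. The receiving side is not shown in this diff; the sketch below only illustrates how such a payload could be turned back into an exception-like object (the `RemoteTaskError` class and `load_result` helper are hypothetical, not fractal-server API):

```python
import cloudpickle


class RemoteTaskError(RuntimeError):
    """Hypothetical stand-in for an error rebuilt from the proxy dict."""

    def __init__(self, proxy: dict):
        super().__init__(proxy["traceback_string"])
        self.exc_type_name = proxy["exc_type_name"]
        self.workflow_task_order = proxy.get("workflow_task_order")
        self.workflow_task_id = proxy.get("workflow_task_id")
        self.task_name = proxy.get("task_name")


def load_result(out_fname: str):
    # Read the output pickle written by `worker()` and either return the
    # task result or raise an error carrying the serialized traceback.
    with open(out_fname, "rb") as f:
        success, payload = cloudpickle.loads(f.read())
    if success:
        return payload
    raise RemoteTaskError(payload)
```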
fractal_server/app/runner/executors/slurm_common/slurm_job_task_models.py (new file)

```diff
@@ -0,0 +1,134 @@
+from pathlib import Path
+from typing import Any
+from typing import Optional
+
+from pydantic import BaseModel
+from pydantic import ConfigDict
+
+from fractal_server.app.runner.task_files import TaskFiles
+
+
+class SlurmTask(BaseModel):
+    model_config = ConfigDict(arbitrary_types_allowed=True)
+    component: str
+    prefix: str
+    workdir_local: Path
+    workdir_remote: Path
+    parameters: dict[str, Any]
+    zarr_url: Optional[str] = None
+    task_files: TaskFiles
+    index: int
+
+    @property
+    def input_pickle_file_local_path(self) -> Path:
+        return (
+            self.workdir_local / f"{self.prefix}-{self.component}-input.pickle"
+        )
+
+    @property
+    def input_pickle_file_remote_path(self) -> Path:
+        return (
+            self.workdir_remote
+            / f"{self.prefix}-{self.component}-input.pickle"
+        )
+
+    @property
+    def output_pickle_file_local_path(self) -> Path:
+        return (
+            self.workdir_local
+            / f"{self.prefix}-{self.component}-output.pickle"
+        )
+
+    @property
+    def output_pickle_file_remote_path(self) -> Path:
+        return (
+            self.workdir_remote
+            / f"{self.prefix}-{self.component}-output.pickle"
+        )
+
+    @property
+    def input_pickle_file_local(self) -> str:
+        return self.input_pickle_file_local_path.as_posix()
+
+    @property
+    def input_pickle_file_remote(self) -> str:
+        return self.input_pickle_file_remote_path.as_posix()
+
+    @property
+    def output_pickle_file_local(self) -> str:
+        return self.output_pickle_file_local_path.as_posix()
+
+    @property
+    def output_pickle_file_remote(self) -> str:
+        return self.output_pickle_file_remote_path.as_posix()
+
+
+class SlurmJob(BaseModel):
+    slurm_job_id: Optional[str] = None
+    prefix: str
+    workdir_local: Path
+    workdir_remote: Path
+    tasks: list[SlurmTask]
+
+    @property
+    def slurm_submission_script_local(self) -> str:
+        return (
+            self.workdir_local / f"{self.prefix}-slurm-submit.sh"
+        ).as_posix()
+
+    @property
+    def slurm_submission_script_remote(self) -> str:
+        return (
+            self.workdir_remote / f"{self.prefix}-slurm-submit.sh"
+        ).as_posix()
+
+    @property
+    def slurm_job_id_placeholder(self) -> str:
+        if self.slurm_job_id:
+            return self.slurm_job_id
+        else:
+            return "%j"
+
+    @property
+    def slurm_stdout_remote_path(self) -> Path:
+        return (
+            self.workdir_remote
+            / f"{self.prefix}-slurm-{self.slurm_job_id_placeholder}.out"
+        )
+
+    @property
+    def slurm_stdout_remote(self) -> str:
+        return self.slurm_stdout_remote_path.as_posix()
+
+    @property
+    def slurm_stderr_remote_path(self) -> Path:
+        return (
+            self.workdir_remote
+            / f"{self.prefix}-slurm-{self.slurm_job_id_placeholder}.err"
+        )
+
+    @property
+    def slurm_stderr_remote(self) -> str:
+        return self.slurm_stderr_remote_path.as_posix()
+
+    @property
+    def slurm_stdout_local_path(self) -> str:
+        return (
+            self.workdir_local
+            / f"{self.prefix}-slurm-{self.slurm_job_id_placeholder}.out"
+        )
+
+    @property
+    def slurm_stdout_local(self) -> str:
+        return self.slurm_stdout_local_path.as_posix()
+
+    @property
+    def slurm_stderr_local_path(self) -> Path:
+        return (
+            self.workdir_local
+            / f"{self.prefix}-slurm-{self.slurm_job_id_placeholder}.err"
+        )
+
+    @property
+    def slurm_stderr_local(self) -> str:
+        return self.slurm_stderr_local_path.as_posix()
```
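Most of these two models is path bookkeeping: every local/remote artifact name is derived from `prefix`, `component`, and the working directories. A short sketch of the derived paths for a job that has not been submitted yet, so `slurm_job_id` is `None` and SLURM's `%j` placeholder is used (the directories and prefix are illustrative):

```python
from pathlib import Path

from fractal_server.app.runner.executors.slurm_common.slurm_job_task_models import (
    SlurmJob,
)

job = SlurmJob(
    prefix="batch-000000",
    workdir_local=Path("/srv/fractal/artifacts/0_create_ome_zarr"),
    workdir_remote=Path("/home/user/fractal/artifacts/0_create_ome_zarr"),
    tasks=[],  # SlurmTask entries omitted in this sketch
)

print(job.slurm_submission_script_remote)
# /home/user/fractal/artifacts/0_create_ome_zarr/batch-000000-slurm-submit.sh
print(job.slurm_stdout_remote)
# /home/user/fractal/artifacts/0_create_ome_zarr/batch-000000-slurm-%j.out
```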
fractal_server/app/runner/executors/slurm_ssh/runner.py (new file)

```diff
@@ -0,0 +1,268 @@
+import time
+from pathlib import Path
+from typing import Optional
+
+from ..slurm_common.base_slurm_runner import BaseSlurmRunner
+from ..slurm_common.slurm_job_task_models import SlurmJob
+from fractal_server.app.runner.compress_folder import compress_folder
+from fractal_server.app.runner.extract_archive import extract_archive
+from fractal_server.config import get_settings
+from fractal_server.logger import set_logger
+from fractal_server.ssh._fabric import FractalSSH
+from fractal_server.ssh._fabric import FractalSSHCommandError
+from fractal_server.ssh._fabric import FractalSSHTimeoutError
+from fractal_server.syringe import Inject
+
+
+logger = set_logger(__name__)
+
+
+class SlurmSSHRunner(BaseSlurmRunner):
+    fractal_ssh: FractalSSH
+
+    def __init__(
+        self,
+        *,
+        # Common
+        root_dir_local: Path,
+        root_dir_remote: Path,
+        common_script_lines: Optional[list[str]] = None,
+        user_cache_dir: Optional[str] = None,
+        poll_interval: Optional[int] = None,
+        # Specific
+        fractal_ssh: FractalSSH,
+    ) -> None:
+        """
+        Set parameters that are the same for different Fractal tasks and for
+        different SLURM jobs/tasks.
+        """
+        self.fractal_ssh = fractal_ssh
+        logger.warning(self.fractal_ssh)
+
+        settings = Inject(get_settings)
+
+        super().__init__(
+            slurm_runner_type="ssh",
+            root_dir_local=root_dir_local,
+            root_dir_remote=root_dir_remote,
+            common_script_lines=common_script_lines,
+            user_cache_dir=user_cache_dir,
+            poll_interval=poll_interval,
+            python_worker_interpreter=settings.FRACTAL_SLURM_WORKER_PYTHON,
+        )
+
+    def _mkdir_local_folder(self, folder: str) -> None:
+        Path(folder).mkdir(parents=True)
+
+    def _mkdir_remote_folder(self, folder: str):
+        self.fractal_ssh.mkdir(
+            folder=folder,
+            parents=True,
+        )
+
+    def _fetch_artifacts(
+        self,
+        finished_slurm_jobs: list[SlurmJob],
+    ) -> None:
+        """
+        Fetch artifacts for a list of SLURM jobs.
+        """
+
+        # Check length
+        if len(finished_slurm_jobs) == 0:
+            logger.debug(f"[_fetch_artifacts] EXIT ({finished_slurm_jobs=}).")
+            return None
+
+        t_0 = time.perf_counter()
+        logger.debug(
+            f"[_fetch_artifacts] START ({len(finished_slurm_jobs)=})."
+        )
+
+        # Extract `workdir_remote` and `workdir_local`
+        self.validate_slurm_jobs_workdirs(finished_slurm_jobs)
+        workdir_local = finished_slurm_jobs[0].workdir_local
+        workdir_remote = finished_slurm_jobs[0].workdir_remote
+
+        # Define local/remote tarfile paths
+        tarfile_path_local = (
+            workdir_local.parent / f"{workdir_local.name}.tar.gz"
+        ).as_posix()
+        tarfile_path_remote = (
+            workdir_remote.parent / f"{workdir_remote.name}.tar.gz"
+        ).as_posix()
+
+        # Create file list
+        # NOTE: see issue 2483
+        filelist = []
+        for _slurm_job in finished_slurm_jobs:
+            _single_job_filelist = [
+                _slurm_job.slurm_stdout_remote_path.name,
+                _slurm_job.slurm_stderr_remote_path.name,
+            ]
+            for task in _slurm_job.tasks:
+                _single_job_filelist.extend(
+                    [
+                        task.output_pickle_file_remote_path.name,
+                        task.task_files.log_file_remote_path.name,
+                        task.task_files.args_file_remote_path.name,
+                        task.task_files.metadiff_file_remote_path.name,
+                    ]
+                )
+            filelist.extend(_single_job_filelist)
+        filelist_string = "\n".join(filelist)
+        elapsed = time.perf_counter() - t_0
+        logger.debug(
+            "[_fetch_artifacts] Created filelist "
+            f"({len(filelist)=}, from start: {elapsed=:.3f} s)."
+        )
+
+        # Write filelist to file remotely
+        tmp_filelist_path = workdir_remote / f"filelist_{time.time()}.txt"
+        self.fractal_ssh.write_remote_file(
+            path=tmp_filelist_path.as_posix(),
+            content=f"{filelist_string}\n",
+        )
+        elapsed = time.perf_counter() - t_0
+        logger.debug(
+            f"[_fetch_artifacts] File list written to {tmp_filelist_path} "
+            f"(from start: {elapsed=:.3f} s)."
+        )
+
+        # Create remote tarfile
+        t_0_tar = time.perf_counter()
+        tar_command = (
+            f"{self.python_worker_interpreter} "
+            "-m fractal_server.app.runner.compress_folder "
+            f"{workdir_remote.as_posix()} "
+            f"--filelist {tmp_filelist_path}"
+        )
+        self.fractal_ssh.run_command(cmd=tar_command)
+        t_1_tar = time.perf_counter()
+        logger.info(
+            f"[_fetch_artifacts] Remote archive {tarfile_path_remote} created"
+            f" - elapsed={t_1_tar - t_0_tar:.3f} s"
+        )
+
+        # Fetch tarfile
+        t_0_get = time.perf_counter()
+        self.fractal_ssh.fetch_file(
+            remote=tarfile_path_remote,
+            local=tarfile_path_local,
+        )
+        t_1_get = time.perf_counter()
+        logger.info(
+            "[_fetch_artifacts] Subfolder archive transferred back "
+            f"to {tarfile_path_local}"
+            f" - elapsed={t_1_get - t_0_get:.3f} s"
+        )
+
+        # Extract tarfile locally
+        extract_archive(Path(tarfile_path_local))
+
+        # Remove local tarfile
+        Path(tarfile_path_local).unlink(missing_ok=True)
+
+        t_1 = time.perf_counter()
+        logger.info(f"[_fetch_artifacts] End - elapsed={t_1 - t_0:.3f} s")
+
+    def _send_inputs(self, jobs: list[SlurmJob]) -> None:
+        """
+        Transfer the jobs subfolder to the remote host.
+        """
+        for job in jobs:
+            # Create local archive
+            tarfile_path_local = compress_folder(
+                job.workdir_local,
+                filelist_path=None,
+            )
+            tarfile_name = Path(tarfile_path_local).name
+            logger.info(f"Subfolder archive created at {tarfile_path_local}")
+
+            # Transfer archive
+            tarfile_path_remote = (
+                job.workdir_remote.parent / tarfile_name
+            ).as_posix()
+            t_0_put = time.perf_counter()
+            self.fractal_ssh.send_file(
+                local=tarfile_path_local,
+                remote=tarfile_path_remote,
+            )
+            t_1_put = time.perf_counter()
+            logger.info(
+                f"Subfolder archive transferred to {tarfile_path_remote}"
+                f" - elapsed={t_1_put - t_0_put:.3f} s"
+            )
+
+            # Remove local archive
+            Path(tarfile_path_local).unlink()
+            logger.debug(f"Local archive {tarfile_path_local} removed")
+
+            # Uncompress remote archive
+            tar_command = (
+                f"{self.python_worker_interpreter} -m "
+                "fractal_server.app.runner.extract_archive "
+                f"{tarfile_path_remote}"
+            )
+            self.fractal_ssh.run_command(cmd=tar_command)
+
+    def _run_remote_cmd(self, cmd: str) -> str:
+        stdout = self.fractal_ssh.run_command(cmd=cmd)
+        return stdout
+
+    def run_squeue(
+        self,
+        *,
+        job_ids: list[str],
+        base_interval: float = 2.0,
+        max_attempts: int = 7,
+    ) -> str:
+        """
+        Run `squeue` for a set of SLURM job IDs.
+
+        Different scenarios:
+
+        1. When `squeue -j` succeeds (with exit code 0), return its stdout.
+        2. When `squeue -j` fails (typical example:
+           `squeue -j {invalid_job_id}` fails with exit code 1), re-raise.
+           The error will be handled upstream.
+        3. When the SSH command fails because another thread is keeping the
+           lock of the `FractalSSH` object for a long time, mock the standard
+           output of the `squeue` command so that it looks like jobs are not
+           completed yet.
+        4. When the SSH command fails for other reasons, despite a forgiving
+           setup (7 connection attempts with base waiting interval of 2
+           seconds, with a cumulative timeout of 126 seconds), return an empty
+           string. This will be treated upstream as an empty `squeue` output,
+           indirectly resulting in marking the job as completed.
+        """
+
+        if len(job_ids) == 0:
+            return ""
+
+        job_id_single_str = ",".join([str(j) for j in job_ids])
+        cmd = (
+            "squeue --noheader --format='%i %T' --states=all "
+            f"--jobs={job_id_single_str}"
+        )
+
+        try:
+            stdout = self.fractal_ssh.run_command(
+                cmd=cmd,
+                base_interval=base_interval,
+                max_attempts=max_attempts,
+            )
+            return stdout
+        except FractalSSHCommandError as e:
+            raise e
+        except FractalSSHTimeoutError:
+            logger.warning(
+                "[run_squeue] Could not acquire lock, use stdout placeholder."
+            )
+            FAKE_STATUS = "FRACTAL_STATUS_PLACEHOLDER"
+            placeholder_stdout = "\n".join(
+                [f"{job_id} {FAKE_STATUS}" for job_id in job_ids]
+            )
+            return placeholder_stdout
+        except Exception as e:
+            logger.error(f"Ignoring `squeue` command failure {e}")
+            return ""
```
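`run_squeue` requests `--noheader --format='%i %T'`, so its stdout is one `<job_id> <STATE>` pair per line, and the lock-timeout fallback injects the `FRACTAL_STATUS_PLACEHOLDER` state so jobs are not prematurely treated as finished (the quoted 126 s presumably corresponds to growing waits of 2+4+8+16+32+64 s between the 7 attempts). A small parsing sketch, not part of the package:

```python
def parse_squeue_output(stdout: str) -> dict[str, str]:
    # Map each SLURM job ID to its reported state.
    states: dict[str, str] = {}
    for line in stdout.splitlines():
        parts = line.strip().split()
        if len(parts) >= 2:
            job_id, state = parts[0], parts[1]
            states[job_id] = state
    return states


print(parse_squeue_output("123 RUNNING\n124 FRACTAL_STATUS_PLACEHOLDER"))
# {'123': 'RUNNING', '124': 'FRACTAL_STATUS_PLACEHOLDER'}
```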