fractal-server 2.13.0__py3-none-any.whl → 2.14.0a0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fractal_server/__init__.py +1 -1
- fractal_server/app/history/__init__.py +4 -0
- fractal_server/app/history/image_updates.py +142 -0
- fractal_server/app/history/status_enum.py +16 -0
- fractal_server/app/models/v2/__init__.py +9 -1
- fractal_server/app/models/v2/accounting.py +35 -0
- fractal_server/app/models/v2/history.py +53 -0
- fractal_server/app/routes/admin/v2/__init__.py +4 -0
- fractal_server/app/routes/admin/v2/accounting.py +108 -0
- fractal_server/app/routes/admin/v2/impersonate.py +35 -0
- fractal_server/app/routes/admin/v2/job.py +5 -13
- fractal_server/app/routes/admin/v2/task_group.py +4 -12
- fractal_server/app/routes/api/v2/__init__.py +2 -2
- fractal_server/app/routes/api/v2/_aux_functions.py +78 -0
- fractal_server/app/routes/api/v2/_aux_functions_task_lifecycle.py +3 -3
- fractal_server/app/routes/api/v2/dataset.py +12 -9
- fractal_server/app/routes/api/v2/history.py +247 -0
- fractal_server/app/routes/api/v2/submit.py +1 -0
- fractal_server/app/routes/api/v2/task_group.py +2 -5
- fractal_server/app/routes/api/v2/workflow.py +18 -3
- fractal_server/app/routes/api/v2/workflowtask.py +22 -0
- fractal_server/app/routes/aux/__init__.py +0 -20
- fractal_server/app/runner/executors/base_runner.py +114 -0
- fractal_server/app/runner/{v2/_local → executors/local}/_local_config.py +3 -3
- fractal_server/app/runner/executors/local/_submit_setup.py +54 -0
- fractal_server/app/runner/executors/local/runner.py +200 -0
- fractal_server/app/runner/executors/{slurm → slurm_common}/_batching.py +1 -1
- fractal_server/app/runner/executors/{slurm → slurm_common}/_slurm_config.py +3 -3
- fractal_server/app/runner/{v2/_slurm_ssh → executors/slurm_common}/_submit_setup.py +13 -12
- fractal_server/app/runner/{v2/_slurm_common → executors/slurm_common}/get_slurm_config.py +9 -15
- fractal_server/app/runner/executors/{slurm/ssh → slurm_ssh}/_executor_wait_thread.py +1 -1
- fractal_server/app/runner/executors/{slurm/ssh → slurm_ssh}/_slurm_job.py +1 -1
- fractal_server/app/runner/executors/{slurm/ssh → slurm_ssh}/executor.py +13 -14
- fractal_server/app/runner/executors/{slurm/sudo → slurm_sudo}/_check_jobs_status.py +11 -9
- fractal_server/app/runner/executors/{slurm/sudo → slurm_sudo}/_executor_wait_thread.py +3 -3
- fractal_server/app/runner/executors/{slurm/sudo → slurm_sudo}/_subprocess_run_as_user.py +2 -68
- fractal_server/app/runner/executors/slurm_sudo/runner.py +632 -0
- fractal_server/app/runner/task_files.py +70 -96
- fractal_server/app/runner/v2/__init__.py +9 -19
- fractal_server/app/runner/v2/_local.py +84 -0
- fractal_server/app/runner/v2/{_slurm_ssh/__init__.py → _slurm_ssh.py} +12 -13
- fractal_server/app/runner/v2/{_slurm_sudo/__init__.py → _slurm_sudo.py} +12 -12
- fractal_server/app/runner/v2/runner.py +106 -31
- fractal_server/app/runner/v2/runner_functions.py +88 -64
- fractal_server/app/runner/v2/runner_functions_low_level.py +20 -20
- fractal_server/app/schemas/v2/__init__.py +1 -0
- fractal_server/app/schemas/v2/accounting.py +18 -0
- fractal_server/app/schemas/v2/dataset.py +0 -17
- fractal_server/app/schemas/v2/history.py +23 -0
- fractal_server/config.py +58 -52
- fractal_server/migrations/versions/8223fcef886c_image_status.py +63 -0
- fractal_server/migrations/versions/87cd72a537a2_add_historyitem_table.py +68 -0
- fractal_server/migrations/versions/af1ef1c83c9b_add_accounting_tables.py +57 -0
- fractal_server/tasks/v2/utils_background.py +1 -1
- {fractal_server-2.13.0.dist-info → fractal_server-2.14.0a0.dist-info}/METADATA +1 -1
- {fractal_server-2.13.0.dist-info → fractal_server-2.14.0a0.dist-info}/RECORD +66 -55
- fractal_server/app/routes/api/v2/status.py +0 -168
- fractal_server/app/runner/executors/slurm/sudo/executor.py +0 -1281
- fractal_server/app/runner/v2/_local/__init__.py +0 -129
- fractal_server/app/runner/v2/_local/_submit_setup.py +0 -52
- fractal_server/app/runner/v2/_local/executor.py +0 -100
- fractal_server/app/runner/v2/_slurm_sudo/_submit_setup.py +0 -83
- fractal_server/app/runner/v2/handle_failed_job.py +0 -59
- /fractal_server/app/runner/executors/{slurm → local}/__init__.py +0 -0
- /fractal_server/app/runner/executors/{slurm/ssh → slurm_common}/__init__.py +0 -0
- /fractal_server/app/runner/executors/{_job_states.py → slurm_common/_job_states.py} +0 -0
- /fractal_server/app/runner/executors/{slurm → slurm_common}/remote.py +0 -0
- /fractal_server/app/runner/executors/{slurm → slurm_common}/utils_executors.py +0 -0
- /fractal_server/app/runner/executors/{slurm/sudo → slurm_ssh}/__init__.py +0 -0
- /fractal_server/app/runner/{v2/_slurm_common → executors/slurm_sudo}/__init__.py +0 -0
- {fractal_server-2.13.0.dist-info → fractal_server-2.14.0a0.dist-info}/LICENSE +0 -0
- {fractal_server-2.13.0.dist-info → fractal_server-2.14.0a0.dist-info}/WHEEL +0 -0
- {fractal_server-2.13.0.dist-info → fractal_server-2.14.0a0.dist-info}/entry_points.txt +0 -0
@@ -0,0 +1,200 @@
|
|
1
|
+
from concurrent.futures import ThreadPoolExecutor
|
2
|
+
from pathlib import Path
|
3
|
+
from typing import Any
|
4
|
+
from typing import Optional
|
5
|
+
|
6
|
+
from ._local_config import get_default_local_backend_config
|
7
|
+
from ._local_config import LocalBackendConfig
|
8
|
+
from fractal_server.app.history import HistoryItemImageStatus
|
9
|
+
from fractal_server.app.history import update_all_images
|
10
|
+
from fractal_server.app.history import update_single_image
|
11
|
+
from fractal_server.app.history import update_single_image_logfile
|
12
|
+
from fractal_server.app.runner.components import _COMPONENT_KEY_
|
13
|
+
from fractal_server.app.runner.executors.base_runner import BaseRunner
|
14
|
+
from fractal_server.app.runner.task_files import TaskFiles
|
15
|
+
from fractal_server.logger import set_logger
|
16
|
+
|
17
|
+
logger = set_logger(__name__)
|
18
|
+
|
19
|
+
|
20
|
+
class LocalRunner(BaseRunner):
    """
    Runner that executes task functions in a local `ThreadPoolExecutor`.

    Attributes:
        executor: Thread pool used to run the submitted callables.
        root_dir_local: Base folder of the runner; it is created (with
            parents) when the runner is instantiated.
    """

    executor: ThreadPoolExecutor
    root_dir_local: Path

    def __init__(
        self,
        root_dir_local: Path,
    ):
        """
        Create `root_dir_local` if needed and start the thread pool.

        Arguments:
            root_dir_local: Base folder for this runner.
        """
        self.root_dir_local = root_dir_local
        self.root_dir_local.mkdir(parents=True, exist_ok=True)
        self.executor = ThreadPoolExecutor()
        logger.debug("Create LocalRunner")

    def __enter__(self):
        """Return the runner itself, for use in a `with` statement."""
        logger.debug("Enter LocalRunner")
        return self

    def shutdown(self):
        """
        Shut the thread pool down without waiting, cancelling the futures
        that have not started yet.
        """
        logger.debug("Now shut LocalRunner.executor down")
        self.executor.shutdown(
            wait=False,
            cancel_futures=True,
        )

    def __exit__(self, exc_type, exc_val, exc_tb):
        """
        Cancel pending work via `shutdown()`, then delegate to the
        executor's own `__exit__` and return its value.
        """
        logger.debug("Exit LocalRunner")
        self.shutdown()
        return self.executor.__exit__(exc_type, exc_val, exc_tb)

    def submit(
        self,
        func: callable,
        parameters: dict[str, Any],
        history_item_id: int,
        task_files: TaskFiles,
        in_compound_task: bool = False,
        **kwargs,
    ) -> tuple[Any, Optional[Exception]]:
        """
        Run `func(parameters=parameters)` in the pool and wait for it.

        Arguments:
            func: Callable to execute; it is called with the single keyword
                argument `parameters`.
            parameters: Arguments for `func`; must include the
                `_COMPONENT_KEY_` entry, used to build the `TaskFiles`.
            history_item_id: Identifier forwarded to `update_all_images`.
            task_files: Template `TaskFiles`; its `component` is replaced
                with the one found in `parameters`.
            in_compound_task: If `True`, images are not marked as `DONE`
                upon success (they are still marked `FAILED` upon failure).
            kwargs: Accepted and ignored.

        Returns:
            `(result, None)` upon success, `(None, exception)` upon failure.
        """
        logger.debug("[submit] START")

        # Validate before reading `parameters[_COMPONENT_KEY_]`, so that
        # the validation error (rather than a bare `KeyError`) is what
        # surfaces for malformed input.
        self.validate_submit_parameters(parameters)

        current_task_files = TaskFiles(
            **task_files.model_dump(
                exclude={"component"},
            ),
            component=parameters[_COMPONENT_KEY_],
        )
        workdir_local = current_task_files.wftask_subfolder_local
        workdir_local.mkdir()

        # SUBMISSION PHASE
        future = self.executor.submit(func, parameters=parameters)

        # RETRIEVAL PHASE
        try:
            result = future.result()
            if not in_compound_task:
                update_all_images(
                    history_item_id=history_item_id,
                    status=HistoryItemImageStatus.DONE,
                    logfile=current_task_files.log_file_local,
                )
            logger.debug(f"[submit] END {result=}")
            return result, None
        except Exception as e:
            exception = e
            update_all_images(
                history_item_id=history_item_id,
                status=HistoryItemImageStatus.FAILED,
                logfile=current_task_files.log_file_local,
            )
            logger.debug(f"[submit] END {exception=}")
            return None, exception

    def multisubmit(
        self,
        func: callable,
        list_parameters: list[dict],
        history_item_id: int,
        task_files: TaskFiles,
        in_compound_task: bool = False,
        local_backend_config: Optional[LocalBackendConfig] = None,
        **kwargs,
    ) -> tuple[dict[int, Any], dict[int, Exception]]:
        """
        Run `func` once per element of `list_parameters`, in chunks.

        Arguments:
            func: Callable to execute; it is called with the single keyword
                argument `parameters`.
            list_parameters: One parameters dictionary per call; each one
                must include the `_COMPONENT_KEY_` and `"zarr_url"` entries.
            history_item_id: Identifier forwarded to the image-status
                update functions.
            task_files: Template `TaskFiles`; a copy with the appropriate
                `component` is built for each element.
            in_compound_task: If `True`, the working folder is not created
                here, per-image updates are skipped, and a single
                `update_all_images` call is made at the end.
            local_backend_config: Configuration providing
                `parallel_tasks_per_job`; if `None`, the default one is
                used.
            kwargs: Accepted and ignored.

        Returns:
            Two dictionaries, both keyed by the position of the element in
            `list_parameters`: the results of the successful calls and the
            exceptions of the failed ones.
        """
        # Function-scope imports: these names are only needed here
        from concurrent.futures import FIRST_COMPLETED
        from concurrent.futures import Future
        from concurrent.futures import wait

        logger.debug(f"[multisubmit] START, {len(list_parameters)=}")

        self.validate_multisubmit_parameters(
            list_parameters=list_parameters,
            in_compound_task=in_compound_task,
        )

        workdir_local = task_files.wftask_subfolder_local
        if not in_compound_task:
            workdir_local.mkdir()

        # Get local_backend_config
        if local_backend_config is None:
            local_backend_config = get_default_local_backend_config()

        # Set `n_elements` and `parallel_tasks_per_job`
        n_elements = len(list_parameters)
        parallel_tasks_per_job = local_backend_config.parallel_tasks_per_job
        if parallel_tasks_per_job is None:
            # `range()` rejects a zero step: use 1 when there is no element
            parallel_tasks_per_job = max(n_elements, 1)

        # Execute tasks, in chunks of size `parallel_tasks_per_job`
        results: dict[int, Any] = {}
        exceptions: dict[int, Exception] = {}
        for ind_chunk in range(0, n_elements, parallel_tasks_per_job):
            list_parameters_chunk = list_parameters[
                ind_chunk : ind_chunk + parallel_tasks_per_job
            ]

            active_futures: dict[Future, int] = {}
            active_task_files: dict[int, TaskFiles] = {}
            for ind_within_chunk, parameters in enumerate(
                list_parameters_chunk
            ):
                positional_index = ind_chunk + ind_within_chunk
                # Build the per-component `TaskFiles` before submitting, so
                # that a failure here does not leave an untracked future
                active_task_files[positional_index] = TaskFiles(
                    **task_files.model_dump(exclude={"component"}),
                    component=parameters[_COMPONENT_KEY_],
                )
                future = self.executor.submit(func, parameters=parameters)
                active_futures[future] = positional_index

            while active_futures:
                # FIXME: add shutdown detection
                # if file exists: cancel all futures, and raise

                # Block until at least one future is done, rather than
                # re-polling in a tight loop
                done_futures, _ = wait(
                    active_futures,
                    return_when=FIRST_COMPLETED,
                )
                # Process in positional order, for reproducible output
                for fut in sorted(done_futures, key=active_futures.get):
                    positional_index = active_futures.pop(fut)
                    current_task_files = active_task_files.pop(
                        positional_index
                    )
                    parameters = list_parameters[positional_index]
                    zarr_url = parameters["zarr_url"]
                    if not in_compound_task:
                        update_single_image_logfile(
                            history_item_id=history_item_id,
                            zarr_url=zarr_url,
                            logfile=current_task_files.log_file_local,
                        )
                    try:
                        results[positional_index] = fut.result()
                        logger.debug(
                            f"[multisubmit] Mark {zarr_url=} as done, "
                            f"{parameters=}"
                        )
                        if not in_compound_task:
                            update_single_image(
                                history_item_id=history_item_id,
                                zarr_url=zarr_url,
                                status=HistoryItemImageStatus.DONE,
                            )
                    except Exception as e:
                        logger.debug(
                            f"[multisubmit] Mark {zarr_url=} as failed, "
                            f"{parameters=} - {e}"
                        )
                        exceptions[positional_index] = e
                        if not in_compound_task:
                            update_single_image(
                                history_item_id=history_item_id,
                                zarr_url=zarr_url,
                                status=HistoryItemImageStatus.FAILED,
                            )

        if in_compound_task:
            # A single failure marks all images of the item as failed
            if exceptions:
                final_status = HistoryItemImageStatus.FAILED
            else:
                final_status = HistoryItemImageStatus.DONE
            update_all_images(
                history_item_id=history_item_id,
                status=final_status,
            )
        logger.debug(f"[multisubmit] END, {results=}, {exceptions=}")

        return results, exceptions
|
@@ -22,9 +22,9 @@ from pydantic import ConfigDict
|
|
22
22
|
from pydantic import Field
|
23
23
|
from pydantic import ValidationError
|
24
24
|
|
25
|
-
from
|
26
|
-
from
|
27
|
-
from
|
25
|
+
from fractal_server.config import get_settings
|
26
|
+
from fractal_server.logger import set_logger
|
27
|
+
from fractal_server.syringe import Inject
|
28
28
|
|
29
29
|
logger = set_logger(__name__)
|
30
30
|
|
@@ -14,11 +14,12 @@ Submodule to define _slurm_submit_setup, which is also the reference
|
|
14
14
|
implementation of `submit_setup_call`.
|
15
15
|
"""
|
16
16
|
from pathlib import Path
|
17
|
+
from typing import Any
|
17
18
|
from typing import Literal
|
18
19
|
|
19
|
-
from ...task_files import
|
20
|
+
from ...task_files import TaskFiles
|
20
21
|
from fractal_server.app.models.v2 import WorkflowTaskV2
|
21
|
-
from fractal_server.app.runner.
|
22
|
+
from fractal_server.app.runner.executors.slurm_common.get_slurm_config import (
|
22
23
|
get_slurm_config,
|
23
24
|
)
|
24
25
|
|
@@ -26,14 +27,16 @@ from fractal_server.app.runner.v2._slurm_common.get_slurm_config import (
|
|
26
27
|
def _slurm_submit_setup(
|
27
28
|
*,
|
28
29
|
wftask: WorkflowTaskV2,
|
29
|
-
|
30
|
-
|
30
|
+
root_dir_local: Path,
|
31
|
+
root_dir_remote: Path,
|
31
32
|
which_type: Literal["non_parallel", "parallel"],
|
32
|
-
) -> dict[str,
|
33
|
+
) -> dict[str, Any]:
|
33
34
|
"""
|
34
|
-
Collect
|
35
|
+
Collect WorkflowTask-specific configuration parameters from different
|
35
36
|
sources, and inject them for execution.
|
36
37
|
|
38
|
+
FIXME
|
39
|
+
|
37
40
|
Here goes all the logic for reading attributes from the appropriate sources
|
38
41
|
and transforming them into an appropriate `SlurmConfig` object (encoding
|
39
42
|
SLURM configuration) and `TaskFiles` object (with details e.g. about file
|
@@ -68,16 +71,14 @@ def _slurm_submit_setup(
|
|
68
71
|
)
|
69
72
|
|
70
73
|
# Get TaskFiles object
|
71
|
-
task_files =
|
72
|
-
|
73
|
-
|
74
|
+
task_files = TaskFiles(
|
75
|
+
root_dir_local=root_dir_local,
|
76
|
+
root_dir_remote=root_dir_remote,
|
74
77
|
task_order=wftask.order,
|
75
78
|
task_name=wftask.task.name,
|
76
79
|
)
|
77
80
|
|
78
|
-
|
79
|
-
submit_setup_dict = dict(
|
81
|
+
return dict(
|
80
82
|
slurm_config=slurm_config,
|
81
83
|
task_files=task_files,
|
82
84
|
)
|
83
|
-
return submit_setup_dict
|
@@ -2,18 +2,12 @@ from pathlib import Path
|
|
2
2
|
from typing import Literal
|
3
3
|
from typing import Optional
|
4
4
|
|
5
|
+
from ._slurm_config import _parse_mem_value
|
6
|
+
from ._slurm_config import load_slurm_config_file
|
7
|
+
from ._slurm_config import logger
|
8
|
+
from ._slurm_config import SlurmConfig
|
9
|
+
from ._slurm_config import SlurmConfigError
|
5
10
|
from fractal_server.app.models.v2 import WorkflowTaskV2
|
6
|
-
from fractal_server.app.runner.executors.slurm._slurm_config import (
|
7
|
-
_parse_mem_value,
|
8
|
-
)
|
9
|
-
from fractal_server.app.runner.executors.slurm._slurm_config import (
|
10
|
-
load_slurm_config_file,
|
11
|
-
)
|
12
|
-
from fractal_server.app.runner.executors.slurm._slurm_config import logger
|
13
|
-
from fractal_server.app.runner.executors.slurm._slurm_config import SlurmConfig
|
14
|
-
from fractal_server.app.runner.executors.slurm._slurm_config import (
|
15
|
-
SlurmConfigError,
|
16
|
-
)
|
17
11
|
|
18
12
|
|
19
13
|
def get_slurm_config(
|
@@ -142,8 +136,8 @@ def get_slurm_config(
|
|
142
136
|
extra_lines = slurm_dict.get("extra_lines", []) + extra_lines
|
143
137
|
if len(set(extra_lines)) != len(extra_lines):
|
144
138
|
logger.debug(
|
145
|
-
"[get_slurm_config] Removing repeated elements "
|
146
|
-
f"
|
139
|
+
"[get_slurm_config] Removing repeated elements from "
|
140
|
+
f"{extra_lines=}."
|
147
141
|
)
|
148
142
|
extra_lines = list(set(extra_lines))
|
149
143
|
slurm_dict["extra_lines"] = extra_lines
|
@@ -162,8 +156,8 @@ def get_slurm_config(
|
|
162
156
|
|
163
157
|
# Put everything together
|
164
158
|
logger.debug(
|
165
|
-
"[get_slurm_config] Now create a SlurmConfig object based "
|
166
|
-
f"
|
159
|
+
"[get_slurm_config] Now create a SlurmConfig object based on "
|
160
|
+
f"{slurm_dict=}"
|
167
161
|
)
|
168
162
|
slurm_config = SlurmConfig(**slurm_dict)
|
169
163
|
|
@@ -15,22 +15,21 @@ from typing import Sequence
|
|
15
15
|
|
16
16
|
import cloudpickle
|
17
17
|
|
18
|
-
from
|
19
|
-
from
|
20
|
-
from
|
21
|
-
from
|
22
|
-
from
|
23
|
-
from
|
24
|
-
from ..
|
25
|
-
from ..utils_executors import
|
26
|
-
from ..utils_executors import
|
27
|
-
from ..utils_executors import get_slurm_script_file_path
|
18
|
+
from ...filenames import SHUTDOWN_FILENAME
|
19
|
+
from ...task_files import TaskFiles
|
20
|
+
from ...versions import get_versions
|
21
|
+
from ..slurm_common._batching import heuristics
|
22
|
+
from ..slurm_common._job_states import STATES_FINISHED
|
23
|
+
from ..slurm_common._slurm_config import SlurmConfig
|
24
|
+
from ..slurm_common.utils_executors import get_pickle_file_path
|
25
|
+
from ..slurm_common.utils_executors import get_slurm_file_path
|
26
|
+
from ..slurm_common.utils_executors import get_slurm_script_file_path
|
28
27
|
from ._executor_wait_thread import FractalSlurmSSHWaitThread
|
29
28
|
from fractal_server.app.runner.components import _COMPONENT_KEY_
|
30
29
|
from fractal_server.app.runner.compress_folder import compress_folder
|
31
30
|
from fractal_server.app.runner.exceptions import JobExecutionError
|
32
31
|
from fractal_server.app.runner.exceptions import TaskExecutionError
|
33
|
-
from fractal_server.app.runner.executors.
|
32
|
+
from fractal_server.app.runner.executors.slurm_ssh._slurm_job import SlurmJob
|
34
33
|
from fractal_server.app.runner.extract_archive import extract_archive
|
35
34
|
from fractal_server.config import get_settings
|
36
35
|
from fractal_server.logger import set_logger
|
@@ -533,9 +532,9 @@ class FractalSlurmSSHExecutor(Executor):
|
|
533
532
|
except AttributeError:
|
534
533
|
actual_component = str(component)
|
535
534
|
|
536
|
-
_task_file_paths =
|
537
|
-
|
538
|
-
|
535
|
+
_task_file_paths = TaskFiles(
|
536
|
+
root_dir_local=task_files.workflow_dir_local,
|
537
|
+
root_dir_remote=task_files.workflow_dir_remote,
|
539
538
|
task_name=task_files.task_name,
|
540
539
|
task_order=task_files.task_order,
|
541
540
|
component=actual_component,
|
@@ -1,18 +1,20 @@
|
|
1
|
-
|
1
|
+
import subprocess # nosec
|
2
2
|
|
3
|
-
from
|
4
|
-
|
3
|
+
from fractal_server.app.runner.executors.slurm_common._job_states import (
|
4
|
+
STATES_FINISHED,
|
5
|
+
)
|
6
|
+
from fractal_server.logger import set_logger
|
5
7
|
|
6
8
|
|
7
9
|
logger = set_logger(__name__)
|
8
10
|
|
9
11
|
|
10
|
-
def run_squeue(job_ids):
|
11
|
-
res = run( # nosec
|
12
|
+
def run_squeue(job_ids: list[str]) -> subprocess.CompletedProcess:
|
13
|
+
res = subprocess.run( # nosec
|
12
14
|
[
|
13
15
|
"squeue",
|
14
16
|
"--noheader",
|
15
|
-
"--format
|
17
|
+
"--format='%i %T'",
|
16
18
|
"--jobs",
|
17
19
|
",".join([str(j) for j in job_ids]),
|
18
20
|
"--states=all",
|
@@ -23,14 +25,14 @@ def run_squeue(job_ids):
|
|
23
25
|
)
|
24
26
|
if res.returncode != 0:
|
25
27
|
logger.warning(
|
26
|
-
f"squeue command with {job_ids}"
|
27
|
-
f"
|
28
|
+
f"squeue command with {job_ids} failed with:"
|
29
|
+
f"\n{res.stderr=}\n{res.stdout=}"
|
28
30
|
)
|
29
31
|
|
30
32
|
return res
|
31
33
|
|
32
34
|
|
33
|
-
def
|
35
|
+
def get_finished_jobs(job_ids: list[str]) -> set[str]:
|
34
36
|
"""
|
35
37
|
Check which ones of the given Slurm jobs already finished
|
36
38
|
|
@@ -5,9 +5,9 @@ import traceback
|
|
5
5
|
from itertools import count
|
6
6
|
from typing import Optional
|
7
7
|
|
8
|
-
from
|
9
|
-
from ._check_jobs_status import _jobs_finished
|
8
|
+
from ._check_jobs_status import get_finished_jobs
|
10
9
|
from fractal_server.app.runner.exceptions import JobExecutionError
|
10
|
+
from fractal_server.logger import set_logger
|
11
11
|
|
12
12
|
logger = set_logger(__name__)
|
13
13
|
|
@@ -115,7 +115,7 @@ class FractalSlurmSudoWaitThread(threading.Thread):
|
|
115
115
|
self.check_shutdown(i)
|
116
116
|
if i % (self.slurm_poll_interval // self.interval) == 0:
|
117
117
|
try:
|
118
|
-
finished_jobs =
|
118
|
+
finished_jobs = get_finished_jobs(self.waiting.values())
|
119
119
|
except Exception:
|
120
120
|
# Don't abandon completion checking if jobs_finished errors
|
121
121
|
traceback.print_exc()
|
@@ -19,7 +19,7 @@ import shlex
|
|
19
19
|
import subprocess # nosec
|
20
20
|
from typing import Optional
|
21
21
|
|
22
|
-
from
|
22
|
+
from fractal_server.logger import set_logger
|
23
23
|
from fractal_server.string_tools import validate_cmd
|
24
24
|
|
25
25
|
logger = set_logger(__name__)
|
@@ -65,10 +65,7 @@ def _run_command_as_user(
|
|
65
65
|
|
66
66
|
if check and not res.returncode == 0:
|
67
67
|
raise RuntimeError(
|
68
|
-
f"{cmd=}\n\n"
|
69
|
-
f"{res.returncode=}\n\n"
|
70
|
-
f"{res.stdout=}\n\n"
|
71
|
-
f"{res.stderr=}\n"
|
68
|
+
f"{cmd=}\n\n{res.returncode=}\n\n{res.stdout=}\n\n{res.stderr=}\n"
|
72
69
|
)
|
73
70
|
|
74
71
|
return res
|
@@ -93,69 +90,6 @@ def _mkdir_as_user(*, folder: str, user: str) -> None:
|
|
93
90
|
_run_command_as_user(cmd=cmd, user=user, check=True)
|
94
91
|
|
95
92
|
|
96
|
-
def _glob_as_user(
|
97
|
-
*, folder: str, user: str, startswith: Optional[str] = None
|
98
|
-
) -> list[str]:
|
99
|
-
"""
|
100
|
-
Run `ls` in a folder (as a user) and filter results
|
101
|
-
|
102
|
-
Execute `ls` on a folder (impersonating a user, if `user` is not `None`)
|
103
|
-
and select results that start with `startswith` (if not `None`).
|
104
|
-
|
105
|
-
Arguments:
|
106
|
-
folder: Absolute path to the folder
|
107
|
-
user: If not `None`, the user to be impersonated via `sudo -u`
|
108
|
-
startswith: If not `None`, this is used to filter output of `ls`.
|
109
|
-
"""
|
110
|
-
|
111
|
-
res = _run_command_as_user(cmd=f"ls {folder}", user=user, check=True)
|
112
|
-
output = res.stdout.split()
|
113
|
-
if startswith:
|
114
|
-
output = [f for f in output if f.startswith(startswith)]
|
115
|
-
return output
|
116
|
-
|
117
|
-
|
118
|
-
def _glob_as_user_strict(
|
119
|
-
*,
|
120
|
-
folder: str,
|
121
|
-
user: str,
|
122
|
-
startswith: str,
|
123
|
-
) -> list[str]:
|
124
|
-
"""
|
125
|
-
Run `ls` in a folder (as a user) and filter results
|
126
|
-
|
127
|
-
Execute `ls` on a folder (impersonating a user, if `user` is not `None`)
|
128
|
-
and select results that comply with a set of rules. They all start with
|
129
|
-
`startswith` (if not `None`), and they match one of the known filename
|
130
|
-
patterns. See details in
|
131
|
-
https://github.com/fractal-analytics-platform/fractal-server/issues/1240
|
132
|
-
|
133
|
-
|
134
|
-
Arguments:
|
135
|
-
folder: Absolute path to the folder
|
136
|
-
user: If not `None`, the user to be impersonated via `sudo -u`
|
137
|
-
startswith: If not `None`, this is used to filter output of `ls`.
|
138
|
-
"""
|
139
|
-
|
140
|
-
res = _run_command_as_user(cmd=f"ls {folder}", user=user, check=True)
|
141
|
-
output = res.stdout.split()
|
142
|
-
|
143
|
-
new_output = []
|
144
|
-
known_filenames = [
|
145
|
-
f"{startswith}{suffix}"
|
146
|
-
for suffix in [".args.json", ".metadiff.json", ".err", ".out", ".log"]
|
147
|
-
]
|
148
|
-
for filename in output:
|
149
|
-
if filename in known_filenames:
|
150
|
-
new_output.append(filename)
|
151
|
-
elif filename.startswith(f"{startswith}_out_") and filename.endswith(
|
152
|
-
".pickle"
|
153
|
-
):
|
154
|
-
new_output.append(filename)
|
155
|
-
|
156
|
-
return new_output
|
157
|
-
|
158
|
-
|
159
93
|
def _path_exists_as_user(*, path: str, user: Optional[str] = None) -> bool:
|
160
94
|
"""
|
161
95
|
Impersonate a user and check if `path` exists via `ls`
|