fractal-server 2.13.1__py3-none-any.whl → 2.14.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fractal_server/__init__.py +1 -1
- fractal_server/__main__.py +3 -1
- fractal_server/app/models/linkusergroup.py +6 -2
- fractal_server/app/models/v2/__init__.py +7 -1
- fractal_server/app/models/v2/dataset.py +1 -11
- fractal_server/app/models/v2/history.py +78 -0
- fractal_server/app/models/v2/job.py +10 -3
- fractal_server/app/models/v2/task_group.py +2 -2
- fractal_server/app/models/v2/workflow.py +1 -1
- fractal_server/app/models/v2/workflowtask.py +1 -1
- fractal_server/app/routes/admin/v2/accounting.py +18 -28
- fractal_server/app/routes/admin/v2/task.py +1 -1
- fractal_server/app/routes/admin/v2/task_group.py +0 -17
- fractal_server/app/routes/api/__init__.py +1 -1
- fractal_server/app/routes/api/v2/__init__.py +8 -2
- fractal_server/app/routes/api/v2/_aux_functions.py +66 -0
- fractal_server/app/routes/api/v2/_aux_functions_history.py +166 -0
- fractal_server/app/routes/api/v2/dataset.py +0 -17
- fractal_server/app/routes/api/v2/history.py +544 -0
- fractal_server/app/routes/api/v2/images.py +31 -43
- fractal_server/app/routes/api/v2/job.py +30 -0
- fractal_server/app/routes/api/v2/project.py +1 -53
- fractal_server/app/routes/api/v2/{status.py → status_legacy.py} +6 -6
- fractal_server/app/routes/api/v2/submit.py +16 -14
- fractal_server/app/routes/api/v2/task.py +3 -10
- fractal_server/app/routes/api/v2/task_collection_custom.py +4 -9
- fractal_server/app/routes/api/v2/task_group.py +0 -17
- fractal_server/app/routes/api/v2/verify_image_types.py +61 -0
- fractal_server/app/routes/api/v2/workflow.py +28 -69
- fractal_server/app/routes/api/v2/workflowtask.py +53 -50
- fractal_server/app/routes/auth/group.py +0 -16
- fractal_server/app/routes/auth/oauth.py +5 -3
- fractal_server/app/routes/pagination.py +47 -0
- fractal_server/app/runner/components.py +0 -3
- fractal_server/app/runner/compress_folder.py +57 -29
- fractal_server/app/runner/exceptions.py +4 -0
- fractal_server/app/runner/executors/base_runner.py +157 -0
- fractal_server/app/runner/{v2/_local/_local_config.py → executors/local/get_local_config.py} +7 -9
- fractal_server/app/runner/executors/local/runner.py +248 -0
- fractal_server/app/runner/executors/{slurm → slurm_common}/_batching.py +1 -1
- fractal_server/app/runner/executors/{slurm → slurm_common}/_slurm_config.py +9 -7
- fractal_server/app/runner/executors/slurm_common/base_slurm_runner.py +868 -0
- fractal_server/app/runner/{v2/_slurm_common → executors/slurm_common}/get_slurm_config.py +48 -17
- fractal_server/app/runner/executors/{slurm → slurm_common}/remote.py +36 -47
- fractal_server/app/runner/executors/slurm_common/slurm_job_task_models.py +134 -0
- fractal_server/app/runner/executors/slurm_ssh/runner.py +268 -0
- fractal_server/app/runner/executors/slurm_sudo/__init__.py +0 -0
- fractal_server/app/runner/executors/{slurm/sudo → slurm_sudo}/_subprocess_run_as_user.py +2 -83
- fractal_server/app/runner/executors/slurm_sudo/runner.py +193 -0
- fractal_server/app/runner/extract_archive.py +1 -3
- fractal_server/app/runner/task_files.py +134 -87
- fractal_server/app/runner/v2/__init__.py +0 -399
- fractal_server/app/runner/v2/_local.py +88 -0
- fractal_server/app/runner/v2/{_slurm_ssh/__init__.py → _slurm_ssh.py} +20 -19
- fractal_server/app/runner/v2/{_slurm_sudo/__init__.py → _slurm_sudo.py} +17 -15
- fractal_server/app/runner/v2/db_tools.py +119 -0
- fractal_server/app/runner/v2/runner.py +206 -95
- fractal_server/app/runner/v2/runner_functions.py +488 -187
- fractal_server/app/runner/v2/runner_functions_low_level.py +40 -43
- fractal_server/app/runner/v2/submit_workflow.py +358 -0
- fractal_server/app/runner/v2/task_interface.py +31 -0
- fractal_server/app/schemas/_validators.py +13 -24
- fractal_server/app/schemas/user.py +10 -7
- fractal_server/app/schemas/user_settings.py +9 -21
- fractal_server/app/schemas/v2/__init__.py +9 -1
- fractal_server/app/schemas/v2/dataset.py +12 -94
- fractal_server/app/schemas/v2/dumps.py +26 -9
- fractal_server/app/schemas/v2/history.py +80 -0
- fractal_server/app/schemas/v2/job.py +15 -8
- fractal_server/app/schemas/v2/manifest.py +14 -7
- fractal_server/app/schemas/v2/project.py +9 -7
- fractal_server/app/schemas/v2/status_legacy.py +35 -0
- fractal_server/app/schemas/v2/task.py +72 -77
- fractal_server/app/schemas/v2/task_collection.py +14 -32
- fractal_server/app/schemas/v2/task_group.py +10 -9
- fractal_server/app/schemas/v2/workflow.py +10 -11
- fractal_server/app/schemas/v2/workflowtask.py +2 -21
- fractal_server/app/security/__init__.py +3 -3
- fractal_server/app/security/signup_email.py +2 -2
- fractal_server/config.py +41 -46
- fractal_server/images/tools.py +23 -0
- fractal_server/migrations/versions/47351f8c7ebc_drop_dataset_filters.py +50 -0
- fractal_server/migrations/versions/9db60297b8b2_set_ondelete.py +250 -0
- fractal_server/migrations/versions/c90a7c76e996_job_id_in_history_run.py +41 -0
- fractal_server/migrations/versions/e81103413827_add_job_type_filters.py +36 -0
- fractal_server/migrations/versions/f37aceb45062_make_historyunit_logfile_required.py +39 -0
- fractal_server/migrations/versions/fbce16ff4e47_new_history_items.py +120 -0
- fractal_server/ssh/_fabric.py +28 -14
- fractal_server/tasks/v2/local/collect.py +2 -2
- fractal_server/tasks/v2/ssh/collect.py +2 -2
- fractal_server/tasks/v2/templates/2_pip_install.sh +1 -1
- fractal_server/tasks/v2/templates/4_pip_show.sh +1 -1
- fractal_server/tasks/v2/utils_background.py +0 -19
- fractal_server/tasks/v2/utils_database.py +30 -17
- fractal_server/tasks/v2/utils_templates.py +6 -0
- {fractal_server-2.13.1.dist-info → fractal_server-2.14.0.dist-info}/METADATA +4 -4
- {fractal_server-2.13.1.dist-info → fractal_server-2.14.0.dist-info}/RECORD +106 -96
- {fractal_server-2.13.1.dist-info → fractal_server-2.14.0.dist-info}/WHEEL +1 -1
- fractal_server/app/runner/executors/slurm/ssh/_executor_wait_thread.py +0 -126
- fractal_server/app/runner/executors/slurm/ssh/_slurm_job.py +0 -116
- fractal_server/app/runner/executors/slurm/ssh/executor.py +0 -1386
- fractal_server/app/runner/executors/slurm/sudo/_check_jobs_status.py +0 -71
- fractal_server/app/runner/executors/slurm/sudo/_executor_wait_thread.py +0 -130
- fractal_server/app/runner/executors/slurm/sudo/executor.py +0 -1281
- fractal_server/app/runner/v2/_local/__init__.py +0 -132
- fractal_server/app/runner/v2/_local/_submit_setup.py +0 -52
- fractal_server/app/runner/v2/_local/executor.py +0 -100
- fractal_server/app/runner/v2/_slurm_ssh/_submit_setup.py +0 -83
- fractal_server/app/runner/v2/_slurm_sudo/_submit_setup.py +0 -83
- fractal_server/app/runner/v2/handle_failed_job.py +0 -59
- fractal_server/app/schemas/v2/status.py +0 -16
- /fractal_server/app/{runner/executors/slurm → history}/__init__.py +0 -0
- /fractal_server/app/runner/executors/{slurm/ssh → local}/__init__.py +0 -0
- /fractal_server/app/runner/executors/{slurm/sudo → slurm_common}/__init__.py +0 -0
- /fractal_server/app/runner/executors/{_job_states.py → slurm_common/_job_states.py} +0 -0
- /fractal_server/app/runner/executors/{slurm → slurm_common}/utils_executors.py +0 -0
- /fractal_server/app/runner/{v2/_slurm_common → executors/slurm_ssh}/__init__.py +0 -0
- {fractal_server-2.13.1.dist-info → fractal_server-2.14.0.dist-info}/LICENSE +0 -0
- {fractal_server-2.13.1.dist-info → fractal_server-2.14.0.dist-info}/entry_points.txt +0 -0
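
New file: fractal_server/app/runner/executors/slurm_common/base_slurm_runner.py (the only entry above with +868 -0, matching the hunk below).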
@@ -0,0 +1,868 @@
import json
import math
import sys
import time
from pathlib import Path
from typing import Any
from typing import Literal
from typing import Optional

import cloudpickle

from ..slurm_common._slurm_config import SlurmConfig
from ..slurm_common.slurm_job_task_models import SlurmJob
from ..slurm_common.slurm_job_task_models import SlurmTask
from ._job_states import STATES_FINISHED
from fractal_server import __VERSION__
from fractal_server.app.db import get_sync_db
from fractal_server.app.models.v2 import AccountingRecordSlurm
from fractal_server.app.runner.exceptions import JobExecutionError
from fractal_server.app.runner.exceptions import TaskExecutionError
from fractal_server.app.runner.executors.base_runner import BaseRunner
from fractal_server.app.runner.filenames import SHUTDOWN_FILENAME
from fractal_server.app.runner.task_files import TaskFiles
from fractal_server.app.runner.v2.db_tools import (
    bulk_update_status_of_history_unit,
)
from fractal_server.app.runner.v2.db_tools import update_status_of_history_unit
from fractal_server.app.schemas.v2 import HistoryUnitStatus
from fractal_server.config import get_settings
from fractal_server.logger import set_logger
from fractal_server.syringe import Inject

SHUTDOWN_ERROR_MESSAGE = "Failed due to job-execution shutdown."
SHUTDOWN_EXCEPTION = JobExecutionError(SHUTDOWN_ERROR_MESSAGE)

logger = set_logger(__name__)


def create_accounting_record_slurm(
    *,
    user_id: int,
    slurm_job_ids: list[int],
) -> None:
    with next(get_sync_db()) as db:
        db.add(
            AccountingRecordSlurm(
                user_id=user_id,
                slurm_job_ids=slurm_job_ids,
            )
        )
        db.commit()


class BaseSlurmRunner(BaseRunner):
    shutdown_file: Path
    common_script_lines: list[str]
    user_cache_dir: str
    root_dir_local: Path
    root_dir_remote: Path
    poll_interval: int
    poll_interval_internal: float
    jobs: dict[str, SlurmJob]
    python_worker_interpreter: str
    slurm_runner_type: Literal["ssh", "sudo"]

    def __init__(
        self,
        root_dir_local: Path,
        root_dir_remote: Path,
        slurm_runner_type: Literal["ssh", "sudo"],
        python_worker_interpreter: str,
        common_script_lines: Optional[list[str]] = None,
        user_cache_dir: Optional[str] = None,
        poll_interval: Optional[int] = None,
    ):
        self.slurm_runner_type = slurm_runner_type
        self.root_dir_local = root_dir_local
        self.root_dir_remote = root_dir_remote
        self.common_script_lines = common_script_lines or []
        self._check_slurm_account()
        self.user_cache_dir = user_cache_dir
        self.python_worker_interpreter = python_worker_interpreter

        settings = Inject(get_settings)

        self.poll_interval = (
            poll_interval or settings.FRACTAL_SLURM_POLL_INTERVAL
        )
        self.poll_interval_internal = self.poll_interval / 10.0

        self.check_fractal_server_versions()

        # Create job folders. Note that the local one may or may not exist
        # depending on whether it is a test or an actual run
        try:
            if not self.root_dir_local.is_dir():
                self._mkdir_local_folder(self.root_dir_local.as_posix())
            self._mkdir_remote_folder(self.root_dir_remote.as_posix())
        except Exception as e:
            error_msg = (
                f"Could not mkdir {self.root_dir_local.as_posix()} or "
                f"{self.root_dir_remote.as_posix()}. "
                f"Original error: {str(e)}."
            )
            logger.error(error_msg)
            raise RuntimeError(error_msg)

        self.shutdown_file = self.root_dir_local / SHUTDOWN_FILENAME
        self.jobs = {}

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        return False

    def _run_remote_cmd(self, cmd: str) -> str:
        raise NotImplementedError("Implement in child class.")

    def run_squeue(self, *, job_ids: list[str], **kwargs) -> str:
        raise NotImplementedError("Implement in child class.")

    def _get_finished_jobs(self, job_ids: list[str]) -> set[str]:

        # If there is no Slurm job to check, return right away
        if not job_ids:
            return set()

        try:
            stdout = self.run_squeue(job_ids=job_ids)
            slurm_statuses = {
                out.split()[0]: out.split()[1] for out in stdout.splitlines()
            }
        except Exception as e:
            logger.warning(
                "[_get_finished_jobs] `squeue` failed, "
                "retry with individual job IDs. "
                f"Original error: {str(e)}."
            )
            slurm_statuses = dict()
            for job_id in job_ids:
                try:
                    stdout = self.run_squeue(job_ids=[job_id])
                    slurm_statuses.update(
                        {stdout.split()[0]: stdout.split()[1]}
                    )
                except Exception as e:
                    logger.warning(
                        "[_get_finished_jobs] `squeue` failed for "
                        f"{job_id=}, mark job as completed. "
                        f"Original error: {str(e)}."
                    )
                    slurm_statuses.update({str(job_id): "COMPLETED"})

        # If a job is not in `squeue` output, mark it as completed.
        finished_jobs = {
            job_id
            for job_id in job_ids
            if slurm_statuses.get(job_id, "COMPLETED") in STATES_FINISHED
        }
        return finished_jobs

    def _mkdir_local_folder(self, folder: str) -> None:
        raise NotImplementedError("Implement in child class.")

    def _mkdir_remote_folder(self, folder: str) -> None:
        raise NotImplementedError("Implement in child class.")

    def _submit_single_sbatch(
        self,
        func,
        slurm_job: SlurmJob,
        slurm_config: SlurmConfig,
    ) -> str:
        logger.debug("[_submit_single_sbatch] START")
        # Prepare input pickle(s)
        versions = dict(
            python=sys.version_info[:3],
            cloudpickle=cloudpickle.__version__,
            fractal_server=__VERSION__,
        )
        for task in slurm_job.tasks:
            # Write input pickle
            _args = []
            _kwargs = dict(
                parameters=task.parameters,
                remote_files=task.task_files.remote_files_dict,
            )
            funcser = cloudpickle.dumps((versions, func, _args, _kwargs))
            with open(task.input_pickle_file_local, "wb") as f:
                f.write(funcser)
            logger.debug(
                "[_submit_single_sbatch] Written "
                f"{task.input_pickle_file_local=}"
            )

            if self.slurm_runner_type == "ssh":
                # Send input pickle (only relevant for SSH)
                self.fractal_ssh.send_file(
                    local=task.input_pickle_file_local,
                    remote=task.input_pickle_file_remote,
                )
                logger.debug(
                    "[_submit_single_sbatch] Transferred "
                    f"{task.input_pickle_file_local=}"
                )

        # Prepare commands to be included in SLURM submission script
        cmdlines = []
        for task in slurm_job.tasks:
            if self.slurm_runner_type == "ssh":
                input_pickle_file = task.input_pickle_file_remote
            else:
                input_pickle_file = task.input_pickle_file_local
            output_pickle_file = task.output_pickle_file_remote
            cmdlines.append(
                (
                    f"{self.python_worker_interpreter}"
                    " -m fractal_server.app.runner."
                    "executors.slurm_common.remote "
                    f"--input-file {input_pickle_file} "
                    f"--output-file {output_pickle_file}"
                )
            )

        # Set ntasks
        num_tasks_max_running = slurm_config.parallel_tasks_per_job
        ntasks = min(len(cmdlines), num_tasks_max_running)
        slurm_config.parallel_tasks_per_job = ntasks

        # Prepare SLURM preamble based on SlurmConfig object
        script_lines = slurm_config.to_sbatch_preamble(
            remote_export_dir=self.user_cache_dir
        )

        # Extend SLURM preamble with variable which are not in SlurmConfig, and
        # fix their order
        script_lines.extend(
            [
                f"#SBATCH --err={slurm_job.slurm_stderr_remote}",
                f"#SBATCH --out={slurm_job.slurm_stdout_remote}",
                f"#SBATCH -D {slurm_job.workdir_remote}",
            ]
        )
        script_lines = slurm_config.sort_script_lines(script_lines)
        logger.debug(script_lines)

        # Always print output of `uname -n` and `pwd`
        script_lines.append('\necho "Hostname: $(uname -n)"')
        script_lines.append('echo "Current directory: $(pwd)"')
        script_lines.append(
            'echo "Start time: $(date +"%Y-%m-%dT%H:%M:%S%z")"'
        )

        # Complete script preamble
        script_lines.append("\n")

        # Include command lines
        mem_per_task_MB = slurm_config.mem_per_task_MB
        for cmd in cmdlines:
            script_lines.append(
                "srun --ntasks=1 --cpus-per-task=$SLURM_CPUS_PER_TASK "
                f"--mem={mem_per_task_MB}MB "
                f"{cmd} &"
            )
        script_lines.append("wait\n")
        script = "\n".join(script_lines)
        script_lines.append(
            'echo "End time: $(date +"%Y-%m-%dT%H:%M:%S%z")"'
        )

        # Write submission script
        with open(slurm_job.slurm_submission_script_local, "w") as f:
            f.write(script)
        logger.debug(
            "[_submit_single_sbatch] Written "
            f"{slurm_job.slurm_submission_script_local=}"
        )

        if self.slurm_runner_type == "ssh":
            self.fractal_ssh.send_file(
                local=slurm_job.slurm_submission_script_local,
                remote=slurm_job.slurm_submission_script_remote,
            )
            submit_command = (
                "sbatch --parsable "
                f"{slurm_job.slurm_submission_script_remote}"
            )
        else:
            submit_command = (
                "sbatch --parsable "
                f"{slurm_job.slurm_submission_script_local}"
            )
        # Run sbatch
        pre_submission_cmds = slurm_config.pre_submission_commands
        if len(pre_submission_cmds) == 0:
            logger.debug(f"Now run {submit_command=}")
            sbatch_stdout = self._run_remote_cmd(submit_command)
        else:
            logger.debug(f"Now using {pre_submission_cmds=}")
            script_lines = pre_submission_cmds + [submit_command]
            wrapper_script_contents = "\n".join(script_lines)
            wrapper_script_contents = f"{wrapper_script_contents}\n"
            if self.slurm_runner_type == "ssh":
                wrapper_script = (
                    f"{slurm_job.slurm_submission_script_remote}_wrapper.sh"
                )
                self.fractal_ssh.write_remote_file(
                    path=wrapper_script, content=wrapper_script_contents
                )
            else:
                wrapper_script = (
                    f"{slurm_job.slurm_submission_script_local}_wrapper.sh"
                )
                with open(wrapper_script, "w") as f:
                    f.write(wrapper_script_contents)
            logger.debug(f"Now run {wrapper_script=}")
            sbatch_stdout = self._run_remote_cmd(f"bash {wrapper_script}")

        # Submit SLURM job and retrieve job ID
        logger.info(f"[_submit_single_sbatch] {sbatch_stdout=}")
        stdout = sbatch_stdout.strip("\n")
        submitted_job_id = int(stdout)
        slurm_job.slurm_job_id = str(submitted_job_id)

        # Add job to self.jobs
        self.jobs[slurm_job.slurm_job_id] = slurm_job
        logger.debug(
            "[_submit_single_sbatch] Added "
            f"{slurm_job.slurm_job_id} to self.jobs."
        )
        logger.debug("[_submit_single_sbatch] END")

    def _fetch_artifacts(
        self,
        finished_slurm_jobs: list[SlurmJob],
    ) -> None:
        raise NotImplementedError("Implement in child class.")

    def _check_slurm_account(self) -> None:
        """
        Check that SLURM account is not set here in `common_script_lines`.
        """
        try:
            invalid_line = next(
                line
                for line in self.common_script_lines
                if line.startswith("#SBATCH --account=")
            )
            raise RuntimeError(
                "Invalid line in `common_script_lines`: "
                f"'{invalid_line}'.\n"
                "SLURM account must be set via the request body of the "
                "apply-workflow endpoint, or by modifying the user properties."
            )
        except StopIteration:
            pass

    def _postprocess_single_task(
        self,
        *,
        task: SlurmTask,
        was_job_scancelled: bool = False,
    ) -> tuple[Any, Exception]:
        try:
            with open(task.output_pickle_file_local, "rb") as f:
                outdata = f.read()
            success, output = cloudpickle.loads(outdata)
            if success:
                # Task succeeded
                result = output
                return (result, None)
            else:
                # Task failed in a controlled way, and produced an `output`
                # object which is a dictionary with required keys
                # `exc_type_name` and `traceback_string` and with optional
                # keys `workflow_task_order`, `workflow_task_id` and
                # `task_name`.
                exc_type_name = output.get("exc_type_name")
                logger.debug(
                    f"Output pickle contains a '{exc_type_name}' exception."
                )
                traceback_string = output.get("traceback_string")
                kwargs = {
                    key: output[key]
                    for key in [
                        "workflow_task_order",
                        "workflow_task_id",
                        "task_name",
                    ]
                    if key in output.keys()
                }
                exception = TaskExecutionError(traceback_string, **kwargs)
                return (None, exception)

        except Exception as e:
            exception = JobExecutionError(f"ERROR, {str(e)}")
            # If job was scancelled and task failed, replace
            # exception with a shutdown-related one.
            if was_job_scancelled:
                logger.debug(
                    "Replacing exception with a shutdown-related one, "
                    f"for {task.index=}."
                )
                exception = SHUTDOWN_EXCEPTION
            return (None, exception)
        finally:
            Path(task.input_pickle_file_local).unlink(missing_ok=True)
            Path(task.output_pickle_file_local).unlink(missing_ok=True)

    def is_shutdown(self) -> bool:
        return self.shutdown_file.exists()

    @property
    def job_ids(self) -> list[str]:
        return list(self.jobs.keys())

    def wait_and_check_shutdown(self) -> list[str]:
        """
        Wait at most `self.poll_interval`, while also checking for shutdown.
        """
        # Sleep for `self.poll_interval`, but keep checking for shutdowns
        start_time = time.perf_counter()
        # Always wait at least 0.2 (note: this is for cases where
        # `poll_interval=0`).
        waiting_time = max(self.poll_interval, 0.2)
        max_time = start_time + waiting_time
        logger.debug(
            "[wait_and_check_shutdown] "
            f"I will wait at most {self.poll_interval} s, "
            f"in blocks of {self.poll_interval_internal} s."
        )

        while time.perf_counter() < max_time:
            if self.is_shutdown():
                logger.info("[wait_and_check_shutdown] Shutdown file detected")
                scancelled_job_ids = self.scancel_jobs()
                logger.info(f"[wait_and_check_shutdown] {scancelled_job_ids=}")
                return scancelled_job_ids
            time.sleep(self.poll_interval_internal)

        logger.debug("[wait_and_check_shutdown] No shutdown file detected")
        return []

    def _check_no_active_jobs(self):
        if self.jobs != {}:
            raise JobExecutionError(
                "Unexpected branch: jobs must be empty before new "
                "submissions."
            )

    def submit(
        self,
        func: callable,
        parameters: dict[str, Any],
        history_unit_id: int,
        task_files: TaskFiles,
        config: SlurmConfig,
        task_type: Literal[
            "non_parallel",
            "converter_non_parallel",
            "compound",
            "converter_compound",
        ],
        user_id: int,
    ) -> tuple[Any, Exception]:
        logger.debug("[submit] START")
        try:
            workdir_local = task_files.wftask_subfolder_local
            workdir_remote = task_files.wftask_subfolder_remote

            if self.is_shutdown():
                with next(get_sync_db()) as db:
                    update_status_of_history_unit(
                        history_unit_id=history_unit_id,
                        status=HistoryUnitStatus.FAILED,
                        db_sync=db,
                    )

                return None, SHUTDOWN_EXCEPTION

            self._check_no_active_jobs()

            # Validation phase
            self.validate_submit_parameters(
                parameters=parameters,
                task_type=task_type,
            )

            # Create task subfolder
            logger.debug("[submit] Create local/remote folders - START")
            self._mkdir_local_folder(folder=workdir_local.as_posix())
            self._mkdir_remote_folder(folder=workdir_remote.as_posix())
            logger.debug("[submit] Create local/remote folders - END")

            # Submission phase
            slurm_job = SlurmJob(
                prefix=task_files.prefix,
                workdir_local=workdir_local,
                workdir_remote=workdir_remote,
                tasks=[
                    SlurmTask(
                        prefix=task_files.prefix,
                        index=0,
                        component=task_files.component,
                        parameters=parameters,
                        workdir_remote=workdir_remote,
                        workdir_local=workdir_local,
                        task_files=task_files,
                    )
                ],
            )

            config.parallel_tasks_per_job = 1
            self._submit_single_sbatch(
                func,
                slurm_job=slurm_job,
                slurm_config=config,
            )
            logger.debug(f"[submit] END submission phase, {self.job_ids=}")

            create_accounting_record_slurm(
                user_id=user_id,
                slurm_job_ids=self.job_ids,
            )

            # NOTE: see issue 2444
            settings = Inject(get_settings)
            sleep_time = settings.FRACTAL_SLURM_INTERVAL_BEFORE_RETRIEVAL
            logger.warning(f"[submit] Now sleep {sleep_time} seconds.")
            time.sleep(sleep_time)

            # Retrieval phase
            logger.debug("[submit] START retrieval phase")
            scancelled_job_ids = []
            while len(self.jobs) > 0:
                # Look for finished jobs
                finished_job_ids = self._get_finished_jobs(
                    job_ids=self.job_ids
                )
                logger.debug(f"[submit] {finished_job_ids=}")
                finished_jobs = [
                    self.jobs[_slurm_job_id]
                    for _slurm_job_id in finished_job_ids
                ]
                self._fetch_artifacts(finished_jobs)
                with next(get_sync_db()) as db:
                    for slurm_job_id in finished_job_ids:
                        logger.debug(f"[submit] Now process {slurm_job_id=}")
                        slurm_job = self.jobs.pop(slurm_job_id)
                        was_job_scancelled = slurm_job_id in scancelled_job_ids
                        result, exception = self._postprocess_single_task(
                            task=slurm_job.tasks[0],
                            was_job_scancelled=was_job_scancelled,
                        )

                        if exception is not None:
                            update_status_of_history_unit(
                                history_unit_id=history_unit_id,
                                status=HistoryUnitStatus.FAILED,
                                db_sync=db,
                            )
                        else:
                            if task_type not in [
                                "compound",
                                "converter_compound",
                            ]:
                                update_status_of_history_unit(
                                    history_unit_id=history_unit_id,
                                    status=HistoryUnitStatus.DONE,
                                    db_sync=db,
                                )

                if len(self.jobs) > 0:
                    scancelled_job_ids = self.wait_and_check_shutdown()

            logger.debug("[submit] END")
            return result, exception

        except Exception as e:
            logger.error(
                f"[submit] Unexpected exception. Original error: {str(e)}"
            )
            with next(get_sync_db()) as db:
                update_status_of_history_unit(
                    history_unit_id=history_unit_id,
                    status=HistoryUnitStatus.FAILED,
                    db_sync=db,
                )
            self.scancel_jobs()
            return None, e

    def multisubmit(
        self,
        func: callable,
        list_parameters: list[dict],
        history_unit_ids: list[int],
        list_task_files: list[TaskFiles],
        task_type: Literal["parallel", "compound", "converter_compound"],
        config: SlurmConfig,
        user_id: int,
    ) -> tuple[dict[int, Any], dict[int, BaseException]]:
        """
        Note: `list_parameters`, `list_task_files` and `history_unit_ids`
        have the same size. For parallel tasks, this is also the number of
        input images, while for compound tasks these can differ.
        """

        logger.debug(f"[multisubmit] START, {len(list_parameters)=}")
        try:

            if self.is_shutdown():
                if task_type == "parallel":
                    with next(get_sync_db()) as db:
                        bulk_update_status_of_history_unit(
                            history_unit_ids=history_unit_ids,
                            status=HistoryUnitStatus.FAILED,
                            db_sync=db,
                        )
                results = {}
                exceptions = {
                    ind: SHUTDOWN_EXCEPTION
                    for ind in range(len(list_parameters))
                }
                return results, exceptions

            self._check_no_active_jobs()
            self.validate_multisubmit_parameters(
                list_parameters=list_parameters,
                task_type=task_type,
                list_task_files=list_task_files,
                history_unit_ids=history_unit_ids,
            )

            workdir_local = list_task_files[0].wftask_subfolder_local
            workdir_remote = list_task_files[0].wftask_subfolder_remote

            # Create local&remote task subfolders
            if task_type == "parallel":
                self._mkdir_local_folder(workdir_local.as_posix())
                self._mkdir_remote_folder(folder=workdir_remote.as_posix())

            results: dict[int, Any] = {}
            exceptions: dict[int, BaseException] = {}

            # NOTE: chunking has already taken place in `get_slurm_config`,
            # so that `config.tasks_per_job` is now set.

            # Divide arguments in batches of `tasks_per_job` tasks each
            tot_tasks = len(list_parameters)
            args_batches = []
            batch_size = config.tasks_per_job
            for ind_chunk in range(0, tot_tasks, batch_size):
                args_batches.append(
                    list_parameters[ind_chunk : ind_chunk + batch_size]  # noqa
                )
            if len(args_batches) != math.ceil(
                tot_tasks / config.tasks_per_job
            ):
                raise RuntimeError("Something wrong here while batching tasks")

            # Part 1/3: Iterate over chunks, prepare SlurmJob objects
            logger.debug("[multisubmit] Prepare `SlurmJob`s.")
            jobs_to_submit = []
            for ind_batch, chunk in enumerate(args_batches):
                # Read prefix based on the first task of this batch
                prefix = list_task_files[ind_batch * batch_size].prefix
                tasks = []
                for ind_chunk, parameters in enumerate(chunk):
                    index = (ind_batch * batch_size) + ind_chunk
                    tasks.append(
                        SlurmTask(
                            prefix=prefix,
                            index=index,
                            component=list_task_files[index].component,
                            workdir_local=workdir_local,
                            workdir_remote=workdir_remote,
                            parameters=parameters,
                            zarr_url=parameters["zarr_url"],
                            task_files=list_task_files[index],
                        ),
                    )
                jobs_to_submit.append(
                    SlurmJob(
                        prefix=prefix,
                        workdir_local=workdir_local,
                        workdir_remote=workdir_remote,
                        tasks=tasks,
                    )
                )

            # NOTE: see issue 2431
            logger.debug("[multisubmit] Transfer files and submit jobs.")
            for slurm_job in jobs_to_submit:
                self._submit_single_sbatch(
                    func,
                    slurm_job=slurm_job,
                    slurm_config=config,
                )

            logger.info(f"[multisubmit] END submission phase, {self.job_ids=}")

            create_accounting_record_slurm(
                user_id=user_id,
                slurm_job_ids=self.job_ids,
            )

            settings = Inject(get_settings)
            sleep_time = settings.FRACTAL_SLURM_INTERVAL_BEFORE_RETRIEVAL
            logger.warning(f"[multisubmit] Now sleep {sleep_time} seconds.")
            time.sleep(sleep_time)
        except Exception as e:
            logger.error(
                "[multisubmit] Unexpected exception during submission."
                f" Original error {str(e)}"
            )
            self.scancel_jobs()
            if task_type == "parallel":
                with next(get_sync_db()) as db:
                    bulk_update_status_of_history_unit(
                        history_unit_ids=history_unit_ids,
                        status=HistoryUnitStatus.FAILED,
                        db_sync=db,
                    )
            results = {}
            exceptions = {ind: e for ind in range(len(list_parameters))}
            return results, exceptions

        # Retrieval phase
        logger.debug("[multisubmit] START retrieval phase")
        scancelled_job_ids = []
        while len(self.jobs) > 0:
            # Look for finished jobs
            finished_job_ids = self._get_finished_jobs(job_ids=self.job_ids)
            logger.debug(f"[multisubmit] {finished_job_ids=}")
            finished_jobs = [
                self.jobs[_slurm_job_id] for _slurm_job_id in finished_job_ids
            ]
            fetch_artifacts_exception = None
            try:
                self._fetch_artifacts(finished_jobs)
            except Exception as e:
                logger.error(
                    "[multisubmit] Unexpected exception in "
                    "`_fetch_artifacts`. "
                    f"Original error: {str(e)}"
                )
                fetch_artifacts_exception = e

            with next(get_sync_db()) as db:
                for slurm_job_id in finished_job_ids:
                    logger.debug(f"[multisubmit] Now process {slurm_job_id=}")
                    slurm_job = self.jobs.pop(slurm_job_id)
                    for task in slurm_job.tasks:
                        logger.debug(
                            f"[multisubmit] Now process {task.index=}"
                        )
                        was_job_scancelled = slurm_job_id in scancelled_job_ids
                        if fetch_artifacts_exception is not None:
                            result = None
                            exception = fetch_artifacts_exception
                        else:
                            try:
                                (
                                    result,
                                    exception,
                                ) = self._postprocess_single_task(
                                    task=task,
                                    was_job_scancelled=was_job_scancelled,
                                )
                            except Exception as e:
                                logger.error(
                                    "[multisubmit] Unexpected exception in "
                                    "`_postprocess_single_task`. "
                                    f"Original error: {str(e)}"
                                )
                                result = None
                                exception = e
                        # Note: the relevant done/failed check is based on
                        # whether `exception is None`. The fact that
                        # `result is None` is not relevant for this purpose.
                        if exception is not None:
                            exceptions[task.index] = exception
                            if task_type == "parallel":
                                update_status_of_history_unit(
                                    history_unit_id=history_unit_ids[
                                        task.index
                                    ],
                                    status=HistoryUnitStatus.FAILED,
                                    db_sync=db,
                                )
                        else:
                            results[task.index] = result
                            if task_type == "parallel":
                                update_status_of_history_unit(
                                    history_unit_id=history_unit_ids[
                                        task.index
                                    ],
                                    status=HistoryUnitStatus.DONE,
                                    db_sync=db,
                                )

            if len(self.jobs) > 0:
                scancelled_job_ids = self.wait_and_check_shutdown()

        logger.debug("[multisubmit] END")
        return results, exceptions

    def check_fractal_server_versions(self) -> None:
        """
        Compare fractal-server versions of local/remote Python interpreters.
        """

        # Skip check when the local and remote interpreters are the same
        # (notably for some sudo-slurm deployments)
        if self.python_worker_interpreter == sys.executable:
            return

        # Fetch remote fractal-server version
        cmd = (
            f"{self.python_worker_interpreter} "
            "-m fractal_server.app.runner.versions"
        )
        stdout = self._run_remote_cmd(cmd)
        remote_version = json.loads(stdout.strip("\n"))["fractal_server"]

        # Verify local/remote version match
        if remote_version != __VERSION__:
            error_msg = (
                "Fractal-server version mismatch.\n"
                "Local interpreter: "
                f"({sys.executable}): {__VERSION__}.\n"
                "Remote interpreter: "
                f"({self.python_worker_interpreter}): {remote_version}."
            )
            logger.error(error_msg)
            raise RuntimeError(error_msg)

    def scancel_jobs(self) -> list[str]:
        logger.info("[scancel_jobs] START")
        scancelled_job_ids = self.job_ids
        if self.jobs:
            scancel_string = " ".join(scancelled_job_ids)
            scancel_cmd = f"scancel {scancel_string}"
            logger.warning(f"[scancel_jobs] {scancel_string}")
            try:
                self._run_remote_cmd(scancel_cmd)
            except Exception as e:
                logger.error(
                    "[scancel_jobs] `scancel` command failed. "
                    f"Original error:\n{str(e)}"
                )
        logger.info("[scancel_jobs] END")
        return scancelled_job_ids

    def validate_slurm_jobs_workdirs(
        self,
        slurm_jobs: list[SlurmJob],
    ) -> None:
        """
        Check that a list of `SlurmJob`s have homogeneous working folders.
        """
        set_workdir_local = set(_job.workdir_local for _job in slurm_jobs)
        set_workdir_remote = set(_job.workdir_remote for _job in slurm_jobs)
        if len(set_workdir_local) > 1:
            raise ValueError(f"Non-unique values in {set_workdir_local=}.")
        if len(set_workdir_remote) > 1:
            raise ValueError(f"Non-unique values in {set_workdir_remote=}.")