fractal-server 2.15.6__py3-none-any.whl → 2.15.8__py3-none-any.whl
This diff represents the changes between two publicly released package versions, as they appear in their public registry, and is provided for informational purposes only.
- fractal_server/__init__.py +1 -1
- fractal_server/app/models/security.py +1 -2
- fractal_server/app/routes/admin/v2/job.py +1 -1
- fractal_server/app/routes/admin/v2/task_group_lifecycle.py +2 -3
- fractal_server/app/routes/api/v2/_aux_functions_history.py +3 -2
- fractal_server/app/routes/api/v2/_aux_functions_task_version_update.py +1 -1
- fractal_server/app/routes/api/v2/_aux_functions_tasks.py +19 -10
- fractal_server/app/routes/api/v2/submit.py +22 -12
- fractal_server/app/routes/api/v2/task.py +1 -1
- fractal_server/app/routes/api/v2/task_group_lifecycle.py +1 -1
- fractal_server/app/runner/executors/base_runner.py +5 -4
- fractal_server/app/runner/executors/slurm_common/_slurm_config.py +1 -6
- fractal_server/app/runner/executors/slurm_common/base_slurm_runner.py +89 -65
- fractal_server/app/runner/executors/slurm_ssh/runner.py +59 -4
- fractal_server/app/runner/executors/slurm_ssh/tar_commands.py +1 -2
- fractal_server/app/runner/executors/slurm_sudo/runner.py +1 -1
- fractal_server/app/runner/v2/runner.py +1 -1
- fractal_server/app/runner/v2/runner_functions.py +5 -5
- fractal_server/app/schemas/v2/task_collection.py +3 -1
- fractal_server/config.py +70 -25
- fractal_server/ssh/_fabric.py +74 -79
- fractal_server/tasks/v2/ssh/_utils.py +1 -1
- fractal_server/tasks/v2/utils_pixi.py +6 -2
- fractal_server/tasks/v2/utils_python_interpreter.py +1 -1
- {fractal_server-2.15.6.dist-info → fractal_server-2.15.8.dist-info}/METADATA +4 -3
- {fractal_server-2.15.6.dist-info → fractal_server-2.15.8.dist-info}/RECORD +29 -29
- {fractal_server-2.15.6.dist-info → fractal_server-2.15.8.dist-info}/WHEEL +1 -1
- {fractal_server-2.15.6.dist-info → fractal_server-2.15.8.dist-info}/LICENSE +0 -0
- {fractal_server-2.15.6.dist-info → fractal_server-2.15.8.dist-info}/entry_points.txt +0 -0
fractal_server/__init__.py
CHANGED
@@ -1 +1 @@
-__VERSION__ = "2.15.6"
+__VERSION__ = "2.15.8"

fractal_server/app/routes/admin/v2/job.py
CHANGED
@@ -109,7 +109,7 @@ async def view_job(
 
 @router.get("/{job_id}/", response_model=JobReadV2)
 async def view_single_job(
-    job_id: int
+    job_id: int,
     show_tmp_logs: bool = False,
     user: UserOAuth = Depends(current_active_superuser),
     db: AsyncSession = Depends(get_async_db),

fractal_server/app/routes/admin/v2/task_group_lifecycle.py
CHANGED
@@ -25,7 +25,6 @@ from fractal_server.app.routes.aux.validate_user_settings import (
 from fractal_server.app.schemas.v2 import TaskGroupActivityActionV2
 from fractal_server.app.schemas.v2 import TaskGroupActivityStatusV2
 from fractal_server.app.schemas.v2 import TaskGroupActivityV2Read
-from fractal_server.app.schemas.v2 import TaskGroupReadV2
 from fractal_server.app.schemas.v2 import TaskGroupV2OriginEnum
 from fractal_server.config import get_settings
 from fractal_server.logger import set_logger
@@ -52,7 +51,7 @@ async def deactivate_task_group(
     response: Response,
     superuser: UserOAuth = Depends(current_active_superuser),
     db: AsyncSession = Depends(get_async_db),
-) -> TaskGroupReadV2:
+) -> TaskGroupActivityV2Read:
     """
     Deactivate task-group venv
     """
@@ -157,7 +156,7 @@ async def reactivate_task_group(
     response: Response,
     superuser: UserOAuth = Depends(current_active_superuser),
     db: AsyncSession = Depends(get_async_db),
-) -> TaskGroupReadV2:
+) -> TaskGroupActivityV2Read:
     """
     Deactivate task-group venv
     """

fractal_server/app/routes/api/v2/_aux_functions_history.py
CHANGED
@@ -102,8 +102,9 @@ async def _verify_workflow_and_dataset_access(
     Verify user access to a dataset/workflow pair.
 
     Args:
+        project_id:
+        workflow_id:
         dataset_id:
-        workflow_task_id:
         user_id:
         db:
     """
@@ -148,7 +149,7 @@ async def get_wftask_check_owner(
     Args:
         project_id:
         dataset_id:
-
+        workflowtask_id:
         user_id:
         db:
     """

fractal_server/app/routes/api/v2/_aux_functions_task_version_update.py
CHANGED
@@ -6,7 +6,7 @@ def get_new_workflow_task_meta(
     old_workflow_task_meta: dict | None,
     old_task_meta: dict | None,
     new_task_meta: dict | None,
-) -> dict[str, Any]:
+) -> dict[str, Any] | None:
     """
     Prepare new meta field based on old/new tasks and old workflow task.
     """

fractal_server/app/routes/api/v2/_aux_functions_tasks.py
CHANGED
@@ -231,11 +231,14 @@ async def _get_collection_task_group_activity_status_message(
     )
     task_group_activity_list = res.scalars().all()
     if len(task_group_activity_list) > 1:
-        msg = (
-            "\nWarning: "
+        msg_short = (
             "Expected only one TaskGroupActivityV2 associated to TaskGroup "
             f"{task_group_id}, found {len(task_group_activity_list)} "
             f"(IDs: {[tga.id for tga in task_group_activity_list]})."
+        )
+        logger.error(f"UnreachableBranchError: {msg_short}")
+        msg = (
+            f"\nWarning: {msg_short}\n"
             "Warning: this should have not happened, please contact an admin."
         )
     elif len(task_group_activity_list) == 1:
@@ -268,13 +271,16 @@ async def _verify_non_duplication_user_constraint(
     if duplicate:
         user = await db.get(UserOAuth, user_id)
         if len(duplicate) > 1:
+            error_msg = (
+                f"User '{user.email}' already owns {len(duplicate)} task "
+                f"groups with name='{pkg_name}' and {version=} "
+                f"(IDs: {[group.id for group in duplicate]})."
+            )
+            logger.error(f"UnreachableBranchError: {error_msg}")
             raise HTTPException(
                 status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
                 detail=(
-                    "Invalid state:\n"
-                    f"User '{user.email}' already owns {len(duplicate)} task "
-                    f"groups with name='{pkg_name}' and {version=} "
-                    f"(IDs: {[group.id for group in duplicate]}).\n"
+                    f"Invalid state: {error_msg}\n"
                     "This should have not happened: please contact an admin."
                 ),
             )
@@ -310,13 +316,16 @@ async def _verify_non_duplication_group_constraint(
     if duplicate:
         user_group = await db.get(UserGroup, user_group_id)
         if len(duplicate) > 1:
+            error_msg = (
+                f"UserGroup '{user_group.name}' already owns "
+                f"{len(duplicate)} task groups with name='{pkg_name}' and "
+                f"{version=} (IDs: {[group.id for group in duplicate]}).\n"
+            )
+            logger.error(error_msg)
             raise HTTPException(
                 status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
                 detail=(
-                    "Invalid state:\n"
-                    f"UserGroup '{user_group.name}' already owns "
-                    f"{len(duplicate)} task groups with name='{pkg_name}' and "
-                    f"{version=} (IDs: {[group.id for group in duplicate]}).\n"
+                    f"Invalid state:\n{error_msg}"
                     "This should have not happened: please contact an admin."
                 ),
             )

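Note: the three hunks above apply one pattern: build the human-readable message once, log it server-side with an UnreachableBranchError marker, then reuse the same text in the HTTP error detail. A minimal framework-free sketch of the pattern (the function name and the ValueError stand-in are illustrative, not fractal-server API):

    import logging

    logger = logging.getLogger(__name__)


    def fail_on_duplicates(duplicate_ids: list[int]) -> None:
        # Build the message once ...
        error_msg = f"Found {len(duplicate_ids)} duplicates (IDs: {duplicate_ids})."
        # ... log it for operators, flagged as an unreachable branch ...
        logger.error(f"UnreachableBranchError: {error_msg}")
        # ... and reuse the same text in the user-facing error.
        raise ValueError(f"Invalid state: {error_msg}")
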
fractal_server/app/routes/api/v2/submit.py
CHANGED
@@ -156,6 +156,28 @@ async def apply_workflow(
     if len(user_settings.slurm_accounts) > 0:
         job_create.slurm_account = user_settings.slurm_accounts[0]
 
+    # User appropriate FractalSSH object
+    if settings.FRACTAL_RUNNER_BACKEND == "slurm_ssh":
+        ssh_config = dict(
+            user=user_settings.ssh_username,
+            host=user_settings.ssh_host,
+            key_path=user_settings.ssh_private_key_path,
+        )
+        fractal_ssh_list = request.app.state.fractal_ssh_list
+        try:
+            fractal_ssh = fractal_ssh_list.get(**ssh_config)
+        except Exception as e:
+            logger.error(
+                "Could not get a valid SSH connection in the submit endpoint. "
+                f"Original error: '{str(e)}'."
+            )
+            raise HTTPException(
+                status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
+                detail="Error in setting up the SSH connection.",
+            )
+    else:
+        fractal_ssh = None
+
     # Add new Job object to DB
     job = JobV2(
         project_id=project_id,
@@ -219,18 +241,6 @@ async def apply_workflow(
     await db.merge(job)
     await db.commit()
 
-    # User appropriate FractalSSH object
-    if settings.FRACTAL_RUNNER_BACKEND == "slurm_ssh":
-        ssh_config = dict(
-            user=user_settings.ssh_username,
-            host=user_settings.ssh_host,
-            key_path=user_settings.ssh_private_key_path,
-        )
-        fractal_ssh_list = request.app.state.fractal_ssh_list
-        fractal_ssh = fractal_ssh_list.get(**ssh_config)
-    else:
-        fractal_ssh = None
-
     # Expunge user settings from db, to use in background task
     db.expunge(user_settings)
 

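Note: the two submit.py hunks are a single move plus error handling. In 2.15.6 the FractalSSH object was looked up only after the JobV2 row had been committed, so a failed lookup could leave a job record behind; in 2.15.8 the connection is acquired first and failures are mapped to a 422 before anything is written. A framework-free sketch of the reordering (pool and db are hypothetical stand-ins for fractal_ssh_list and the database session):

    def submit_job(backend: str, pool, db):
        # 1) Acquire the external resource first: a failure here cannot
        #    leave a half-created job behind, because nothing was written yet.
        if backend == "slurm_ssh":
            try:
                # Stand-in for `fractal_ssh_list.get(**ssh_config)`
                connection = pool.get()
            except Exception as e:
                # Stand-in for the HTTP 422 raised by the endpoint
                raise RuntimeError("Error in setting up the SSH connection.") from e
        else:
            connection = None

        # 2) Only now create and commit the job record.
        return db.create_job(connection=connection)
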
fractal_server/app/routes/api/v2/task.py
CHANGED
@@ -69,7 +69,7 @@ async def get_list_task(
         stm = stm.where(TaskV2.authors.icontains(author))
 
     res = await db.execute(stm)
-    task_list = res.scalars().all()
+    task_list = list(res.scalars().all())
    await db.close()
    if args_schema is False:
        for task in task_list:

fractal_server/app/runner/executors/base_runner.py
CHANGED
@@ -91,14 +91,15 @@ class BaseRunner:
         workflow_task_order:
         workflow_task_id:
         task_name:
-
-
+        list_parameters:
+            List of dictionaries of parameters (each one must include
+            `zarr_urls` key).
         history_unit_ids:
             Database IDs of the corresponding `HistoryUnit` entries.
+        list_task_files: `TaskFiles` objects.
         task_type: Task type.
-        task_files: `TaskFiles` object.
         config: Runner-specific parameters.
-        user_id
+        user_id:
     """
     raise NotImplementedError()
 

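Note: the corrected docstring pins down the shape of list_parameters. An illustrative value (paths and IDs are made up; only the mandatory zarr_urls key comes from the docstring):

    # One dict per task invocation; each must include `zarr_urls`.
    list_parameters = [
        {"zarr_urls": ["/data/plate.zarr/A/01/0"]},
        {"zarr_urls": ["/data/plate.zarr/A/02/0"]},
    ]
    history_unit_ids = [101, 102]  # one `HistoryUnit` DB id per entry
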
fractal_server/app/runner/executors/slurm_common/_slurm_config.py
CHANGED
@@ -66,7 +66,6 @@ class _SlurmConfigSet(BaseModel):
     time: str | None = None
     account: str | None = None
     extra_lines: list[str] | None = None
-    pre_submission_commands: list[str] | None = None
     gpus: str | None = None
 
 
@@ -213,7 +212,7 @@ class SlurmConfig(BaseModel):
     `SlurmConfig` attributes (e.g. `mem_per_task_MB`), which are not meant to
     be part of the `FRACTAL_SLURM_CONFIG_FILE` JSON file (details on the
     expected file content are defined in
-    [`SlurmConfigFile`](
+    [`SlurmConfigFile`](#fractal_server.app.runner._slurm._slurm_config.SlurmConfigFile)).
 
     Part of the attributes map directly to some of the SLURM attributes (see
     https://slurm.schedmd.com/sbatch.html), e.g. `partition`. Other attributes
@@ -253,8 +252,6 @@ class SlurmConfig(BaseModel):
         Key-value pairs to be included as `export`-ed variables in SLURM
         submission script, after prepending values with the user's cache
         directory.
-        pre_submission_commands: List of commands to be prepended to the sbatch
-            command.
     """
 
     model_config = ConfigDict(extra="forbid")
@@ -294,8 +291,6 @@ class SlurmConfig(BaseModel):
     target_num_jobs: int
     max_num_jobs: int
 
-    pre_submission_commands: list[str] = Field(default_factory=list)
-
     def _sorted_extra_lines(self) -> list[str]:
         """
         Return a copy of `self.extra_lines`, where lines starting with

fractal_server/app/runner/executors/slurm_common/base_slurm_runner.py
CHANGED
@@ -137,7 +137,9 @@ class BaseSlurmRunner(BaseRunner):
     def run_squeue(self, *, job_ids: list[str], **kwargs) -> str:
         raise NotImplementedError("Implement in child class.")
 
-    def _is_squeue_error_recoverable(
+    def _is_squeue_error_recoverable(
+        self, exception: BaseException
+    ) -> Literal[True]:
         """
         Determine whether a `squeue` error is considered recoverable.
 
@@ -262,14 +264,25 @@
 
         return new_slurm_config
 
-    def _submit_single_sbatch(
+    def _prepare_single_slurm_job(
         self,
         *,
         base_command: str,
         slurm_job: SlurmJob,
         slurm_config: SlurmConfig,
     ) -> str:
-
+        """
+        Prepare submission script locally.
+
+        Args:
+            base_command: Base of task executable command.
+            slurm_job: `SlurmJob` object
+            slurm_config: Configuration for SLURM job
+
+        Returns:
+            Command to submit the SLURM job.
+        """
+        logger.debug("[_prepare_single_slurm_job] START")
 
         for task in slurm_job.tasks:
             # Write input file
@@ -299,24 +312,10 @@
                 json.dump(task.parameters, f, indent=2)
 
             logger.debug(
-                f"[_submit_single_sbatch] Written {task.input_file_local=}"
+                "[_prepare_single_slurm_job] Written "
+                f"{task.input_file_local=}"
             )
 
-            if self.slurm_runner_type == "ssh":
-                # Send input file (only relevant for SSH)
-                self.fractal_ssh.send_file(
-                    local=task.input_file_local,
-                    remote=task.input_file_remote,
-                )
-                self.fractal_ssh.send_file(
-                    local=task.task_files.args_file_local,
-                    remote=task.task_files.args_file_remote,
-                )
-                logger.debug(
-                    "[_submit_single_sbatch] Transferred "
-                    f"{task.input_file_local=}"
-                )
-
         # Prepare commands to be included in SLURM submission script
         cmdlines = []
         for task in slurm_job.tasks:
@@ -353,7 +352,7 @@
             ]
         )
         script_lines = slurm_config.sort_script_lines(script_lines)
-        logger.debug(script_lines)
+        logger.debug(f"[_prepare_single_slurm_job] {script_lines=}")
 
         # Always print output of `uname -n` and `pwd`
         script_lines.append('\necho "Hostname: $(uname -n)"')
@@ -373,61 +372,64 @@
                 f"--mem={mem_per_task_MB}MB "
                 f"{cmd} &"
             )
-        script_lines.append("wait\n")
-        script = "\n".join(script_lines)
+        script_lines.append("wait\n\n")
         script_lines.append(
             'echo "End time: $(date +"%Y-%m-%dT%H:%M:%S%z")"'
         )
+        script = "\n".join(script_lines)
 
         # Write submission script
         with open(slurm_job.slurm_submission_script_local, "w") as f:
             f.write(script)
         logger.debug(
-            "[_submit_single_sbatch] Written "
+            "[_prepare_single_slurm_job] Written "
             f"{slurm_job.slurm_submission_script_local=}"
         )
 
         if self.slurm_runner_type == "ssh":
-            self.fractal_ssh.send_file(
-                local=slurm_job.slurm_submission_script_local,
-                remote=slurm_job.slurm_submission_script_remote,
-            )
             submit_command = (
-                "sbatch --parsable "
-                f"{slurm_job.slurm_submission_script_remote}"
+                f"sbatch --parsable {slurm_job.slurm_submission_script_remote}"
             )
         else:
             submit_command = (
-                "sbatch --parsable "
-                f"{slurm_job.slurm_submission_script_local}"
+                f"sbatch --parsable {slurm_job.slurm_submission_script_local}"
             )
-        (25 removed lines, old 404-428, not rendered in this diff view)
+        logger.debug("[_prepare_single_slurm_job] END")
+        return submit_command
+
+    def _send_many_job_inputs(
+        self, *, workdir_local: Path, workdir_remote: Path
+    ) -> None:
+        """
+        Placeholder method.
+
+        This method is intentionally left unimplemented in the base class.
+        Subclasses must override it to provide the logic for transferring
+        input data.
+        """
+        pass
+
+    def _submit_single_sbatch(
+        self,
+        *,
+        submit_command: str,
+        slurm_job: SlurmJob,
+    ) -> None:
+        """
+        Run `sbatch` and add the `slurm_job` to `self.jobs`.
+
+        Args:
+            submit_command:
+                The SLURM submission command prepared in
+                `self._prepare_single_slurm_job`.
+            slurm_job: The `SlurmJob` object.
+        """
+
+        logger.debug("[_submit_single_sbatch] START")
 
         # Submit SLURM job and retrieve job ID
+        logger.debug(f"[_submit_single_sbatch] Now run {submit_command=}")
+        sbatch_stdout = self._run_remote_cmd(submit_command)
         logger.info(f"[_submit_single_sbatch] {sbatch_stdout=}")
         stdout = sbatch_stdout.strip("\n")
         submitted_job_id = int(stdout)
@@ -623,11 +625,19 @@
             )
 
             config.parallel_tasks_per_job = 1
-            self._submit_single_sbatch(
+            submit_command = self._prepare_single_slurm_job(
                 base_command=base_command,
                 slurm_job=slurm_job,
                 slurm_config=config,
             )
+            self._send_many_job_inputs(
+                workdir_local=workdir_local,
+                workdir_remote=workdir_remote,
+            )
+            self._submit_single_sbatch(
+                submit_command=submit_command,
+                slurm_job=slurm_job,
+            )
         logger.debug(f"[submit] END submission phase, {self.job_ids=}")
 
         create_accounting_record_slurm(
@@ -726,8 +736,8 @@
                 status=HistoryUnitStatus.FAILED,
                 db_sync=db,
             )
-        results = {}
-        exceptions = {
+        results: dict[int, Any] = {}
+        exceptions: dict[int, BaseException] = {
             ind: SHUTDOWN_EXCEPTION
             for ind in range(len(list_parameters))
         }
@@ -801,13 +811,25 @@
                 )
             )
 
-
-        logger.debug("[multisubmit] Transfer files and submit jobs.")
+        submit_commands = []
         for slurm_job in jobs_to_submit:
+            submit_commands.append(
+                self._prepare_single_slurm_job(
+                    base_command=base_command,
+                    slurm_job=slurm_job,
+                    slurm_config=config,
+                )
+            )
+        self._send_many_job_inputs(
+            workdir_local=workdir_local,
+            workdir_remote=workdir_remote,
+        )
+        for slurm_job, submit_command in zip(
+            jobs_to_submit, submit_commands
+        ):
             self._submit_single_sbatch(
-                base_command=base_command,
+                submit_command=submit_command,
                 slurm_job=slurm_job,
-                slurm_config=config,
             )
 
         logger.info(f"[multisubmit] END submission phase, {self.job_ids=}")
@@ -830,8 +852,10 @@
                     status=HistoryUnitStatus.FAILED,
                     db_sync=db,
                 )
-            results = {}
-            exceptions = {ind: e for ind in range(len(list_parameters))}
+            results: dict[int, Any] = {}
+            exceptions: dict[int, BaseException] = {
+                ind: e for ind in range(len(list_parameters))
+            }
             return results, exceptions
 
         # Retrieval phase

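Note: together with the submit hunk above, this refactor splits each submission into three phases: prepare all sbatch scripts locally, transfer inputs once, then submit each job. A runnable toy sketch of the control flow (the three helpers are stand-ins for _prepare_single_slurm_job, _send_many_job_inputs, and _submit_single_sbatch, not the actual runner code):

    from pathlib import Path


    def prepare(job: str) -> str:
        # Stand-in for `_prepare_single_slurm_job`: write the script locally,
        # return the submission command.
        return f"sbatch --parsable /scripts/{job}.sh"


    def send_inputs(local: Path, remote: Path) -> None:
        # Stand-in for `_send_many_job_inputs`: one bulk transfer
        # (a no-op in the base class, a tar upload in the SSH runner).
        print(f"transfer {local} -> {remote} (once, for all jobs)")


    def run_sbatch(command: str) -> None:
        # Stand-in for `_submit_single_sbatch`: actually run `sbatch`.
        print(f"run: {command}")


    jobs_to_submit = ["job0", "job1"]
    commands = [prepare(job) for job in jobs_to_submit]   # phase 1: prepare
    send_inputs(Path("/tmp/wd"), Path("/remote/wd"))      # phase 2: transfer
    for command in commands:                              # phase 3: submit
        run_sbatch(command)
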
fractal_server/app/runner/executors/slurm_ssh/runner.py
CHANGED
@@ -166,12 +166,69 @@ class SlurmSSHRunner(BaseSlurmRunner):
         stdout = self.fractal_ssh.run_command(cmd=cmd)
         return stdout
 
+    def _send_many_job_inputs(
+        self, *, workdir_local: Path, workdir_remote: Path
+    ) -> None:
+        """
+        Compress, transfer, and extract a local working directory onto a remote
+        host.
+
+        This method creates a temporary `.tar.gz` archive of the given
+        `workdir_local`, transfers it to the remote machine via the configured
+        SSH connection, extracts it into `workdir_remote`, and removes the
+        temporary archive from both local and remote filesystems.
+        """
+
+        logger.debug("[_send_many_job_inputs] START")
+
+        tar_path_local = workdir_local.with_suffix(".tar.gz")
+        tar_name = Path(tar_path_local).name
+        tar_path_remote = workdir_remote.parent / tar_name
+
+        tar_compression_cmd = get_tar_compression_cmd(
+            subfolder_path=workdir_local, filelist_path=None
+        )
+        _, tar_extraction_cmd = get_tar_extraction_cmd(
+            archive_path=tar_path_remote
+        )
+        rm_tar_cmd = f"rm {tar_path_remote.as_posix()}"
+
+        try:
+            run_subprocess(tar_compression_cmd, logger_name=logger.name)
+            logger.debug(
+                "[_send_many_job_inputs] "
+                f"{workdir_local=} compressed to {tar_path_local=}."
+            )
+            self.fractal_ssh.send_file(
+                local=tar_path_local.as_posix(),
+                remote=tar_path_remote.as_posix(),
+            )
+            logger.debug(
+                "[_send_many_job_inputs] "
+                f"{tar_path_local=} sent via SSH to {tar_path_remote=}."
+            )
+            self.fractal_ssh.run_command(cmd=tar_extraction_cmd)
+            logger.debug(
+                "[_send_many_job_inputs] "
+                f"{tar_path_remote=} extracted to {workdir_remote=}."
+            )
+            self.fractal_ssh.run_command(cmd=rm_tar_cmd)
+            logger.debug(
+                "[_send_many_job_inputs] "
+                f"{tar_path_remote=} removed from remote server."
+            )
+        except Exception as e:
+            raise e
+        finally:
+            Path(tar_path_local).unlink(missing_ok=True)
+            logger.debug(f"[_send_many_job_inputs] {tar_path_local=} removed.")
+
+        logger.debug("[_send_many_job_inputs] END.")
+
     def run_squeue(
         self,
         *,
         job_ids: list[str],
-        base_interval: float = 2.0,
-        max_attempts: int = 7,
     ) -> str:
         """
         Run `squeue` for a set of SLURM job IDs.
@@ -205,8 +262,6 @@ class SlurmSSHRunner(BaseSlurmRunner):
         try:
             stdout = self.fractal_ssh.run_command(
                 cmd=cmd,
-                base_interval=base_interval,
-                max_attempts=max_attempts,
             )
             return stdout
         except FractalSSHCommandError as e:

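Note: _send_many_job_inputs replaces the per-file send_file calls removed from base_slurm_runner.py with a single archive round-trip. A standard-library-only sketch of the local half of that idea (shutil.make_archive stands in for get_tar_compression_cmd plus run_subprocess; the remote half would extract and delete the archive, as in the method above):

    import shutil
    from pathlib import Path


    def pack_workdir(workdir_local: Path) -> Path:
        # Create `<workdir_local>.tar.gz` next to the directory and
        # return its path.
        archive = shutil.make_archive(
            base_name=str(workdir_local),
            format="gztar",
            root_dir=str(workdir_local.parent),
            base_dir=workdir_local.name,
        )
        return Path(archive)
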
fractal_server/app/runner/executors/slurm_ssh/tar_commands.py
CHANGED
@@ -7,7 +7,7 @@ from pathlib import Path
 def get_tar_compression_cmd(
     subfolder_path: Path,
     filelist_path: Path | None,
-) ->
+) -> str:
     """
     Prepare command to compress e.g. `/path/dir` into `/path/dir.tar.gz`.
 
@@ -17,7 +17,6 @@ def get_tar_compression_cmd(
     Args:
         subfolder_path: Absolute path to the folder to compress.
         filelist_path: If set, to be used in the `--files-from` option.
-        expected_tarfile: If set, it should match to the returned one.
 
     Returns:
         tar command

fractal_server/app/runner/v2/runner.py
CHANGED
@@ -47,7 +47,7 @@ def _remove_status_from_attributes(
     Drop attribute `IMAGE_STATUS_KEY` from all images.
     """
     images_copy = deepcopy(images)
-    [img["attributes"].pop(IMAGE_STATUS_KEY) for img in images_copy]
+    [img["attributes"].pop(IMAGE_STATUS_KEY, None) for img in images_copy]
     return images_copy
 
 

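Note: the `, None` default is the entire fix: dict.pop(key) raises KeyError when the key is missing, so images without the status attribute used to break this cleanup. A two-line demonstration:

    img = {"attributes": {}}  # image lacking the status attribute
    img["attributes"].pop("status", None)  # returns None, no error
    # img["attributes"].pop("status")  # without the default: KeyError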