fractal-server 2.16.2a0__py3-none-any.whl → 2.16.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fractal_server/__init__.py +1 -1
- fractal_server/app/routes/admin/v2/job.py +3 -3
- fractal_server/app/routes/admin/v2/task.py +1 -1
- fractal_server/app/routes/admin/v2/task_group.py +1 -1
- fractal_server/app/routes/admin/v2/task_group_lifecycle.py +3 -3
- fractal_server/app/routes/api/v2/_aux_functions.py +7 -7
- fractal_server/app/routes/api/v2/_aux_functions_history.py +2 -2
- fractal_server/app/routes/api/v2/_aux_functions_task_lifecycle.py +37 -13
- fractal_server/app/routes/api/v2/_aux_functions_tasks.py +8 -8
- fractal_server/app/routes/api/v2/dataset.py +4 -4
- fractal_server/app/routes/api/v2/history.py +2 -2
- fractal_server/app/routes/api/v2/images.py +3 -3
- fractal_server/app/routes/api/v2/job.py +1 -1
- fractal_server/app/routes/api/v2/project.py +1 -1
- fractal_server/app/routes/api/v2/status_legacy.py +1 -1
- fractal_server/app/routes/api/v2/submit.py +9 -9
- fractal_server/app/routes/api/v2/task.py +4 -4
- fractal_server/app/routes/api/v2/task_collection.py +5 -5
- fractal_server/app/routes/api/v2/task_collection_custom.py +6 -6
- fractal_server/app/routes/api/v2/task_collection_pixi.py +5 -5
- fractal_server/app/routes/api/v2/task_group_lifecycle.py +3 -3
- fractal_server/app/routes/api/v2/task_version_update.py +3 -3
- fractal_server/app/routes/api/v2/workflow.py +4 -4
- fractal_server/app/routes/api/v2/workflow_import.py +1 -1
- fractal_server/app/routes/api/v2/workflowtask.py +6 -6
- fractal_server/app/routes/auth/group.py +2 -2
- fractal_server/app/routes/auth/users.py +1 -1
- fractal_server/app/routes/aux/_job.py +1 -1
- fractal_server/app/routes/aux/_runner.py +2 -2
- fractal_server/app/routes/aux/validate_user_settings.py +2 -2
- fractal_server/config.py +2 -2
- fractal_server/main.py +1 -1
- fractal_server/{app/runner → runner}/executors/base_runner.py +1 -1
- fractal_server/{app/runner → runner}/executors/call_command_wrapper.py +1 -1
- fractal_server/{app/runner → runner}/executors/local/runner.py +9 -9
- fractal_server/{app/runner → runner}/executors/slurm_common/_slurm_config.py +1 -1
- fractal_server/{app/runner → runner}/executors/slurm_common/base_slurm_runner.py +13 -13
- fractal_server/{app/runner → runner}/executors/slurm_common/slurm_job_task_models.py +1 -1
- fractal_server/{app/runner → runner}/executors/slurm_sudo/runner.py +1 -1
- fractal_server/{app/runner → runner}/task_files.py +1 -1
- fractal_server/{app/runner → runner}/v2/_local.py +2 -2
- fractal_server/{app/runner → runner}/v2/_slurm_ssh.py +3 -3
- fractal_server/{app/runner → runner}/v2/_slurm_sudo.py +2 -2
- fractal_server/{app/runner → runner}/v2/deduplicate_list.py +2 -2
- fractal_server/{app/runner → runner}/v2/merge_outputs.py +2 -2
- fractal_server/{app/runner → runner}/v2/runner.py +3 -3
- fractal_server/{app/runner → runner}/v2/runner_functions.py +12 -12
- fractal_server/{app/runner → runner}/v2/submit_workflow.py +13 -13
- fractal_server/{app/runner → runner}/v2/task_interface.py +2 -2
- fractal_server/ssh/_fabric.py +61 -18
- fractal_server/tasks/v2/ssh/_pixi_slurm_ssh.py +173 -33
- fractal_server/tasks/v2/ssh/collect_pixi.py +20 -15
- fractal_server/tasks/v2/ssh/reactivate_pixi.py +20 -15
- fractal_server/tasks/v2/utils_background.py +1 -1
- {fractal_server-2.16.2a0.dist-info → fractal_server-2.16.4.dist-info}/METADATA +6 -6
- {fractal_server-2.16.2a0.dist-info → fractal_server-2.16.4.dist-info}/RECORD +82 -82
- /fractal_server/{app/runner → runner}/__init__.py +0 -0
- /fractal_server/{app/runner → runner}/components.py +0 -0
- /fractal_server/{app/runner → runner}/exceptions.py +0 -0
- /fractal_server/{app/runner → runner}/executors/__init__.py +0 -0
- /fractal_server/{app/runner → runner}/executors/local/__init__.py +0 -0
- /fractal_server/{app/runner → runner}/executors/local/get_local_config.py +0 -0
- /fractal_server/{app/runner → runner}/executors/slurm_common/__init__.py +0 -0
- /fractal_server/{app/runner → runner}/executors/slurm_common/_batching.py +0 -0
- /fractal_server/{app/runner → runner}/executors/slurm_common/_job_states.py +0 -0
- /fractal_server/{app/runner → runner}/executors/slurm_common/get_slurm_config.py +0 -0
- /fractal_server/{app/runner → runner}/executors/slurm_common/remote.py +0 -0
- /fractal_server/{app/runner → runner}/executors/slurm_ssh/__init__.py +0 -0
- /fractal_server/{app/runner → runner}/executors/slurm_ssh/run_subprocess.py +0 -0
- /fractal_server/{app/runner → runner}/executors/slurm_ssh/runner.py +0 -0
- /fractal_server/{app/runner → runner}/executors/slurm_ssh/tar_commands.py +0 -0
- /fractal_server/{app/runner → runner}/executors/slurm_sudo/__init__.py +0 -0
- /fractal_server/{app/runner → runner}/executors/slurm_sudo/_subprocess_run_as_user.py +0 -0
- /fractal_server/{app/runner → runner}/filenames.py +0 -0
- /fractal_server/{app/runner → runner}/set_start_and_last_task_index.py +0 -0
- /fractal_server/{app/runner → runner}/shutdown.py +0 -0
- /fractal_server/{app/runner → runner}/v2/__init__.py +0 -0
- /fractal_server/{app/runner → runner}/v2/db_tools.py +0 -0
- /fractal_server/{app/runner → runner}/versions.py +0 -0
- {fractal_server-2.16.2a0.dist-info → fractal_server-2.16.4.dist-info}/LICENSE +0 -0
- {fractal_server-2.16.2a0.dist-info → fractal_server-2.16.4.dist-info}/WHEEL +0 -0
- {fractal_server-2.16.2a0.dist-info → fractal_server-2.16.4.dist-info}/entry_points.txt +0 -0
|
@@ -11,26 +11,26 @@ from pathlib import Path
|
|
|
11
11
|
|
|
12
12
|
from sqlalchemy.orm import Session as DBSyncSession
|
|
13
13
|
|
|
14
|
-
from
|
|
15
|
-
from
|
|
16
|
-
from
|
|
17
|
-
from
|
|
18
|
-
from
|
|
19
|
-
from
|
|
20
|
-
from
|
|
21
|
-
from
|
|
22
|
-
from ...db import DB
|
|
23
|
-
from ...models.v2 import DatasetV2
|
|
24
|
-
from ...models.v2 import JobV2
|
|
25
|
-
from ...models.v2 import WorkflowV2
|
|
26
|
-
from ...schemas.v2 import JobStatusTypeV2
|
|
14
|
+
from ...config import get_settings
|
|
15
|
+
from ...logger import get_logger
|
|
16
|
+
from ...logger import reset_logger_handlers
|
|
17
|
+
from ...logger import set_logger
|
|
18
|
+
from ...ssh._fabric import FractalSSH
|
|
19
|
+
from ...syringe import Inject
|
|
20
|
+
from ...utils import get_timestamp
|
|
21
|
+
from ...zip_tools import _zip_folder_to_file_and_remove
|
|
27
22
|
from ..exceptions import JobExecutionError
|
|
28
23
|
from ..filenames import WORKFLOW_LOG_FILENAME
|
|
29
24
|
from ._local import process_workflow as local_process_workflow
|
|
30
25
|
from ._slurm_ssh import process_workflow as slurm_ssh_process_workflow
|
|
31
26
|
from ._slurm_sudo import process_workflow as slurm_sudo_process_workflow
|
|
32
27
|
from fractal_server import __VERSION__
|
|
28
|
+
from fractal_server.app.db import DB
|
|
33
29
|
from fractal_server.app.models import UserSettings
|
|
30
|
+
from fractal_server.app.models.v2 import DatasetV2
|
|
31
|
+
from fractal_server.app.models.v2 import JobV2
|
|
32
|
+
from fractal_server.app.models.v2 import WorkflowV2
|
|
33
|
+
from fractal_server.app.schemas.v2 import JobStatusTypeV2
|
|
34
34
|
|
|
35
35
|
|
|
36
36
|
_backends = {}
|
|
@@ -5,8 +5,8 @@ from pydantic import ConfigDict
|
|
|
5
5
|
from pydantic import Field
|
|
6
6
|
from pydantic import ValidationError
|
|
7
7
|
|
|
8
|
-
from
|
|
9
|
-
from fractal_server.
|
|
8
|
+
from fractal_server.images import SingleImageTaskOutput
|
|
9
|
+
from fractal_server.runner.exceptions import TaskOutputValidationError
|
|
10
10
|
from fractal_server.types import ZarrUrlStr
|
|
11
11
|
|
|
12
12
|
|
fractal_server/ssh/_fabric.py
CHANGED
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
import json
|
|
2
2
|
import logging
|
|
3
|
+
import os
|
|
3
4
|
import time
|
|
4
5
|
from collections.abc import Generator
|
|
5
6
|
from contextlib import contextmanager
|
|
@@ -15,11 +16,15 @@ from invoke import UnexpectedExit
|
|
|
15
16
|
from paramiko.ssh_exception import NoValidConnectionsError
|
|
16
17
|
from pydantic import BaseModel
|
|
17
18
|
|
|
19
|
+
from ..logger import close_logger
|
|
18
20
|
from ..logger import get_logger
|
|
19
21
|
from ..logger import set_logger
|
|
20
22
|
from fractal_server.string_tools import validate_cmd
|
|
21
23
|
|
|
22
24
|
|
|
25
|
+
SSH_MONITORING_LOGGER_NAME = "ssh-log"
|
|
26
|
+
|
|
27
|
+
|
|
23
28
|
class FractalSSHTimeoutError(RuntimeError):
|
|
24
29
|
pass
|
|
25
30
|
|
|
@@ -42,9 +47,6 @@ class SSHConfig(BaseModel):
|
|
|
42
47
|
key_path: str
|
|
43
48
|
|
|
44
49
|
|
|
45
|
-
logger = set_logger(__name__)
|
|
46
|
-
|
|
47
|
-
|
|
48
50
|
def retry_if_socket_error(func):
|
|
49
51
|
@wraps(func)
|
|
50
52
|
def func_with_retry(*args, **kwargs):
|
|
@@ -76,7 +78,8 @@ def _acquire_lock_with_timeout(
|
|
|
76
78
|
lock: Lock,
|
|
77
79
|
label: str,
|
|
78
80
|
timeout: float,
|
|
79
|
-
|
|
81
|
+
pid: int,
|
|
82
|
+
logger_name: str,
|
|
80
83
|
) -> Generator[Literal[True], Any, None]:
|
|
81
84
|
"""
|
|
82
85
|
Given a `threading.Lock` object, try to acquire it within a given timeout.
|
|
@@ -88,8 +91,9 @@ def _acquire_lock_with_timeout(
|
|
|
88
91
|
logger_name:
|
|
89
92
|
"""
|
|
90
93
|
logger = get_logger(logger_name)
|
|
94
|
+
ssh_logger = get_logger(SSH_MONITORING_LOGGER_NAME)
|
|
91
95
|
logger.info(f"Trying to acquire lock for '{label}', with {timeout=}")
|
|
92
|
-
|
|
96
|
+
t_lock_request = time.perf_counter()
|
|
93
97
|
result = lock.acquire(timeout=timeout)
|
|
94
98
|
try:
|
|
95
99
|
if not result:
|
|
@@ -98,14 +102,25 @@ def _acquire_lock_with_timeout(
|
|
|
98
102
|
f"Failed to acquire lock for '{label}' within "
|
|
99
103
|
f"{timeout} seconds"
|
|
100
104
|
)
|
|
101
|
-
|
|
102
|
-
elapsed =
|
|
105
|
+
t_lock_acquisition = time.perf_counter()
|
|
106
|
+
elapsed = t_lock_acquisition - t_lock_request
|
|
103
107
|
logger.info(f"Lock for '{label}' was acquired - {elapsed=:.4f} s")
|
|
104
108
|
yield result
|
|
105
109
|
finally:
|
|
106
110
|
if result:
|
|
107
111
|
lock.release()
|
|
108
112
|
logger.info(f"Lock for '{label}' was released.")
|
|
113
|
+
t_lock_release = time.perf_counter()
|
|
114
|
+
lock_was_acquired = 1
|
|
115
|
+
else:
|
|
116
|
+
t_lock_release = time.perf_counter()
|
|
117
|
+
t_lock_acquisition = t_lock_release
|
|
118
|
+
lock_was_acquired = 0
|
|
119
|
+
lock_waiting_time = t_lock_acquisition - t_lock_request
|
|
120
|
+
lock_holding_time = t_lock_release - t_lock_acquisition
|
|
121
|
+
ssh_logger.info(
|
|
122
|
+
f"{pid} {lock_waiting_time:.6e} {lock_holding_time:.6e} {lock_was_acquired} {label.replace(' ', '_')}" # noqa
|
|
123
|
+
)
|
|
109
124
|
|
|
110
125
|
|
|
111
126
|
class FractalSSH:
|
|
@@ -130,11 +145,12 @@ class FractalSSH:
|
|
|
130
145
|
sftp_get_prefetch: bool
|
|
131
146
|
sftp_get_max_requests: int
|
|
132
147
|
logger_name: str
|
|
148
|
+
_pid: int
|
|
133
149
|
|
|
134
150
|
def __init__(
|
|
135
151
|
self,
|
|
136
152
|
connection: Connection,
|
|
137
|
-
default_timeout: float =
|
|
153
|
+
default_timeout: float = 500.0,
|
|
138
154
|
sftp_get_prefetch: bool = False,
|
|
139
155
|
sftp_get_max_requests: int = 64,
|
|
140
156
|
logger_name: str = __name__,
|
|
@@ -146,6 +162,8 @@ class FractalSSH:
|
|
|
146
162
|
self.sftp_get_max_requests = sftp_get_max_requests
|
|
147
163
|
self.logger_name = logger_name
|
|
148
164
|
set_logger(self.logger_name)
|
|
165
|
+
set_logger(SSH_MONITORING_LOGGER_NAME)
|
|
166
|
+
self._pid = os.getpid()
|
|
149
167
|
|
|
150
168
|
@property
|
|
151
169
|
def is_connected(self) -> bool:
|
|
@@ -197,6 +215,8 @@ class FractalSSH:
|
|
|
197
215
|
lock=self._lock,
|
|
198
216
|
label=label,
|
|
199
217
|
timeout=actual_lock_timeout,
|
|
218
|
+
pid=self._pid,
|
|
219
|
+
logger_name=self.logger_name,
|
|
200
220
|
):
|
|
201
221
|
return self._connection.run(*args, **kwargs)
|
|
202
222
|
|
|
@@ -212,8 +232,10 @@ class FractalSSH:
|
|
|
212
232
|
self.logger.info(f"START reading remote JSON file {filepath}.")
|
|
213
233
|
with _acquire_lock_with_timeout(
|
|
214
234
|
lock=self._lock,
|
|
215
|
-
label="read_remote_json_file",
|
|
216
235
|
timeout=self.default_lock_timeout,
|
|
236
|
+
logger_name=self.logger_name,
|
|
237
|
+
pid=self._pid,
|
|
238
|
+
label=f"read_remote_json_file({filepath})",
|
|
217
239
|
):
|
|
218
240
|
try:
|
|
219
241
|
with self._sftp_unsafe().open(filepath, "r") as f:
|
|
@@ -239,8 +261,10 @@ class FractalSSH:
|
|
|
239
261
|
self.logger.info(f"START reading remote text file {filepath}.")
|
|
240
262
|
with _acquire_lock_with_timeout(
|
|
241
263
|
lock=self._lock,
|
|
242
|
-
label="read_remote_text_file",
|
|
264
|
+
label=f"read_remote_text_file({filepath})",
|
|
243
265
|
timeout=self.default_lock_timeout,
|
|
266
|
+
pid=self._pid,
|
|
267
|
+
logger_name=self.logger_name,
|
|
244
268
|
):
|
|
245
269
|
try:
|
|
246
270
|
with self._sftp_unsafe().open(filepath, "r") as f:
|
|
@@ -298,9 +322,10 @@ class FractalSSH:
|
|
|
298
322
|
self.close()
|
|
299
323
|
with _acquire_lock_with_timeout(
|
|
300
324
|
lock=self._lock,
|
|
301
|
-
label="_connection.open",
|
|
325
|
+
label="FractalSSH._connection.{open,open_sftp}()",
|
|
302
326
|
timeout=self.default_lock_timeout,
|
|
303
327
|
logger_name=self.logger_name,
|
|
328
|
+
pid=self._pid,
|
|
304
329
|
):
|
|
305
330
|
self._connection.open()
|
|
306
331
|
self._connection.client.open_sftp()
|
|
@@ -324,12 +349,16 @@ class FractalSSH:
|
|
|
324
349
|
"""
|
|
325
350
|
with _acquire_lock_with_timeout(
|
|
326
351
|
lock=self._lock,
|
|
327
|
-
label="_connection.close",
|
|
352
|
+
label="FractalSSH._connection.close()",
|
|
328
353
|
timeout=self.default_lock_timeout,
|
|
354
|
+
pid=self._pid,
|
|
355
|
+
logger_name=self.logger_name,
|
|
329
356
|
):
|
|
330
357
|
self._connection.close()
|
|
331
358
|
if self._connection.client is not None:
|
|
332
359
|
self._connection.client.close()
|
|
360
|
+
close_logger(get_logger(self.logger_name))
|
|
361
|
+
close_logger(get_logger(SSH_MONITORING_LOGGER_NAME))
|
|
333
362
|
|
|
334
363
|
@retry_if_socket_error
|
|
335
364
|
def run_command(
|
|
@@ -364,14 +393,14 @@ class FractalSSH:
|
|
|
364
393
|
# Case 1: Command runs successfully
|
|
365
394
|
res = self._run(
|
|
366
395
|
cmd,
|
|
367
|
-
label=
|
|
396
|
+
label=cmd,
|
|
368
397
|
lock_timeout=actual_lock_timeout,
|
|
369
398
|
hide=True,
|
|
370
399
|
in_stream=False,
|
|
371
400
|
)
|
|
372
401
|
t_1 = time.perf_counter()
|
|
373
402
|
self.logger.info(
|
|
374
|
-
f"END running '{cmd}' over SSH,
|
|
403
|
+
f"END running '{cmd}' over SSH, elapsed={t_1 - t_0:.3f}"
|
|
375
404
|
)
|
|
376
405
|
self.logger.debug("STDOUT:")
|
|
377
406
|
self.logger.debug(res.stdout)
|
|
@@ -423,8 +452,10 @@ class FractalSSH:
|
|
|
423
452
|
actual_lock_timeout = lock_timeout
|
|
424
453
|
with _acquire_lock_with_timeout(
|
|
425
454
|
lock=self._lock,
|
|
426
|
-
label=f"send_file
|
|
455
|
+
label=f"send_file({local},{remote})",
|
|
427
456
|
timeout=actual_lock_timeout,
|
|
457
|
+
pid=self._pid,
|
|
458
|
+
logger_name=self.logger_name,
|
|
428
459
|
):
|
|
429
460
|
self._sftp_unsafe().put(local, remote)
|
|
430
461
|
self.logger.info(
|
|
@@ -463,8 +494,10 @@ class FractalSSH:
|
|
|
463
494
|
actual_lock_timeout = lock_timeout
|
|
464
495
|
with _acquire_lock_with_timeout(
|
|
465
496
|
lock=self._lock,
|
|
466
|
-
label=f"fetch_file
|
|
497
|
+
label=f"fetch_file({local},{remote})",
|
|
467
498
|
timeout=actual_lock_timeout,
|
|
499
|
+
pid=self._pid,
|
|
500
|
+
logger_name=self.logger_name,
|
|
468
501
|
):
|
|
469
502
|
self._sftp_unsafe().get(
|
|
470
503
|
remote,
|
|
@@ -554,8 +587,10 @@ class FractalSSH:
|
|
|
554
587
|
actual_lock_timeout = lock_timeout
|
|
555
588
|
with _acquire_lock_with_timeout(
|
|
556
589
|
lock=self._lock,
|
|
557
|
-
label=f"write_remote_file
|
|
590
|
+
label=f"write_remote_file({path})",
|
|
558
591
|
timeout=actual_lock_timeout,
|
|
592
|
+
pid=self._pid,
|
|
593
|
+
logger_name=self.logger_name,
|
|
559
594
|
):
|
|
560
595
|
try:
|
|
561
596
|
with self._sftp_unsafe().open(filename=path, mode="w") as f:
|
|
@@ -576,8 +611,10 @@ class FractalSSH:
|
|
|
576
611
|
self.logger.info(f"START remote_file_exists {path}")
|
|
577
612
|
with _acquire_lock_with_timeout(
|
|
578
613
|
lock=self._lock,
|
|
579
|
-
label=f"remote_file_exists
|
|
614
|
+
label=f"remote_file_exists({path})",
|
|
580
615
|
timeout=self.default_lock_timeout,
|
|
616
|
+
pid=self._pid,
|
|
617
|
+
logger_name=self.logger_name,
|
|
581
618
|
):
|
|
582
619
|
try:
|
|
583
620
|
self._sftp_unsafe().stat(path)
|
|
@@ -613,6 +650,7 @@ class FractalSSHList:
|
|
|
613
650
|
_lock: Lock
|
|
614
651
|
_timeout: float
|
|
615
652
|
_logger_name: str
|
|
653
|
+
_pid: int
|
|
616
654
|
|
|
617
655
|
def __init__(
|
|
618
656
|
self,
|
|
@@ -625,6 +663,7 @@ class FractalSSHList:
|
|
|
625
663
|
self._timeout = timeout
|
|
626
664
|
self._logger_name = logger_name
|
|
627
665
|
set_logger(self._logger_name)
|
|
666
|
+
self._pid = os.getpid()
|
|
628
667
|
|
|
629
668
|
@property
|
|
630
669
|
def logger(self) -> logging.Logger:
|
|
@@ -677,6 +716,8 @@ class FractalSSHList:
|
|
|
677
716
|
lock=self._lock,
|
|
678
717
|
label="FractalSSHList.get",
|
|
679
718
|
timeout=self._timeout,
|
|
719
|
+
pid=self._pid,
|
|
720
|
+
logger_name=self._logger_name,
|
|
680
721
|
):
|
|
681
722
|
self._data[key] = FractalSSH(connection=connection)
|
|
682
723
|
return self._data[key]
|
|
@@ -721,6 +762,8 @@ class FractalSSHList:
|
|
|
721
762
|
lock=self._lock,
|
|
722
763
|
timeout=self._timeout,
|
|
723
764
|
label="FractalSSHList.remove",
|
|
765
|
+
pid=self._pid,
|
|
766
|
+
logger_name=self._logger_name,
|
|
724
767
|
):
|
|
725
768
|
self.logger.info(
|
|
726
769
|
f"Removing FractalSSH object for {user}@{host} "
|
|
@@ -2,12 +2,18 @@ import os
|
|
|
2
2
|
import time
|
|
3
3
|
from pathlib import Path
|
|
4
4
|
|
|
5
|
+
from sqlalchemy.orm import Session
|
|
6
|
+
|
|
7
|
+
from fractal_server.app.models.v2 import TaskGroupActivityV2
|
|
5
8
|
from fractal_server.config import get_settings
|
|
6
9
|
from fractal_server.config import PixiSLURMConfig
|
|
7
10
|
from fractal_server.logger import get_logger
|
|
8
11
|
from fractal_server.ssh._fabric import FractalSSH
|
|
9
12
|
from fractal_server.syringe import Inject
|
|
13
|
+
from fractal_server.tasks.v2.utils_background import add_commit_refresh
|
|
14
|
+
from fractal_server.tasks.v2.utils_background import get_current_log
|
|
10
15
|
|
|
16
|
+
FRACTAL_SQUEUE_ERROR_STATE = "__FRACTAL_SQUEUE_ERROR__"
|
|
11
17
|
|
|
12
18
|
# https://slurm.schedmd.com/squeue.html#lbAG
|
|
13
19
|
STATES_FINISHED = {
|
|
@@ -21,20 +27,130 @@ STATES_FINISHED = {
|
|
|
21
27
|
"PREEMPTED",
|
|
22
28
|
"SPECIAL_EXIT",
|
|
23
29
|
"TIMEOUT",
|
|
30
|
+
FRACTAL_SQUEUE_ERROR_STATE,
|
|
24
31
|
}
|
|
25
32
|
|
|
26
33
|
|
|
34
|
+
def _get_workdir_remote(script_paths: list[str]) -> str:
|
|
35
|
+
"""
|
|
36
|
+
Check that there is one and only one `workdir`, and return it.
|
|
37
|
+
|
|
38
|
+
Note: The `is_absolute` check is to filter out a `chmod` command.
|
|
39
|
+
"""
|
|
40
|
+
workdirs = [
|
|
41
|
+
Path(script_path).parent.as_posix()
|
|
42
|
+
for script_path in script_paths
|
|
43
|
+
if Path(script_path).is_absolute()
|
|
44
|
+
]
|
|
45
|
+
if not len(set(workdirs)) == 1:
|
|
46
|
+
raise ValueError(f"Invalid {script_paths=}.")
|
|
47
|
+
return workdirs[0]
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def _read_file_if_exists(
|
|
51
|
+
*,
|
|
52
|
+
fractal_ssh: FractalSSH,
|
|
53
|
+
path: str,
|
|
54
|
+
) -> str:
|
|
55
|
+
"""
|
|
56
|
+
Read a remote file if it exists, or return an empty string.
|
|
57
|
+
"""
|
|
58
|
+
if fractal_ssh.remote_exists(path=path):
|
|
59
|
+
return fractal_ssh.read_remote_text_file(path)
|
|
60
|
+
else:
|
|
61
|
+
return ""
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def _log_change_of_job_state(
|
|
65
|
+
*,
|
|
66
|
+
old_state: str | None,
|
|
67
|
+
new_state: str,
|
|
68
|
+
logger_name: str,
|
|
69
|
+
) -> None:
|
|
70
|
+
"""
|
|
71
|
+
Emit a log for state changes.
|
|
72
|
+
|
|
73
|
+
Args:
|
|
74
|
+
old_state:
|
|
75
|
+
new_state:
|
|
76
|
+
logger_name:
|
|
77
|
+
"""
|
|
78
|
+
if new_state != old_state:
|
|
79
|
+
logger = get_logger(logger_name=logger_name)
|
|
80
|
+
logger.debug(
|
|
81
|
+
f"SLURM-job state changed from {old_state=} to {new_state=}."
|
|
82
|
+
)
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
def _run_squeue(
|
|
86
|
+
*,
|
|
87
|
+
fractal_ssh: FractalSSH,
|
|
88
|
+
squeue_cmd: str,
|
|
89
|
+
logger_name: str,
|
|
90
|
+
) -> str:
|
|
91
|
+
"""
|
|
92
|
+
Run a `squeue` command and handle exceptions.
|
|
93
|
+
|
|
94
|
+
Args:
|
|
95
|
+
fractal_ssh:
|
|
96
|
+
logger_name:
|
|
97
|
+
squeue_cmd:
|
|
98
|
+
|
|
99
|
+
Return:
|
|
100
|
+
state: The SLURM-job state.
|
|
101
|
+
"""
|
|
102
|
+
try:
|
|
103
|
+
cmd_stdout = fractal_ssh.run_command(cmd=squeue_cmd)
|
|
104
|
+
state = cmd_stdout.strip().split()[1]
|
|
105
|
+
return state
|
|
106
|
+
except Exception as e:
|
|
107
|
+
logger = get_logger(logger_name=logger_name)
|
|
108
|
+
logger.info(f"`squeue` command failed (original error: {e})")
|
|
109
|
+
return FRACTAL_SQUEUE_ERROR_STATE
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
def _verify_success_file_exists(
|
|
113
|
+
*,
|
|
114
|
+
fractal_ssh: FractalSSH,
|
|
115
|
+
success_file_remote: str,
|
|
116
|
+
logger_name: str,
|
|
117
|
+
stderr_remote: str,
|
|
118
|
+
) -> None:
|
|
119
|
+
"""
|
|
120
|
+
Fail if the success sentinel file does not exist remotely.
|
|
121
|
+
|
|
122
|
+
Note: the `FractalSSH` methods in this function may fail, and such failures
|
|
123
|
+
are not handled in this function. Any such failure, however, will lead to
|
|
124
|
+
a "failed" task-group lifecycle activity (because it will raise an
|
|
125
|
+
exception from within `run_script_on_remote_slurm`, which will then be
|
|
126
|
+
handled at the calling-function level.
|
|
127
|
+
"""
|
|
128
|
+
if not fractal_ssh.remote_exists(path=success_file_remote):
|
|
129
|
+
logger = get_logger(logger_name=logger_name)
|
|
130
|
+
error_msg = f"{success_file_remote=} missing."
|
|
131
|
+
logger.info(error_msg)
|
|
132
|
+
|
|
133
|
+
stderr = _read_file_if_exists(
|
|
134
|
+
fractal_ssh=fractal_ssh, path=stderr_remote
|
|
135
|
+
)
|
|
136
|
+
if stderr:
|
|
137
|
+
logger.info(f"SLURM-job stderr:\n{stderr}")
|
|
138
|
+
raise RuntimeError(error_msg)
|
|
139
|
+
|
|
140
|
+
|
|
27
141
|
def run_script_on_remote_slurm(
|
|
28
142
|
*,
|
|
29
|
-
|
|
143
|
+
script_paths: list[str],
|
|
30
144
|
slurm_config: PixiSLURMConfig,
|
|
31
145
|
fractal_ssh: FractalSSH,
|
|
32
146
|
logger_name: str,
|
|
147
|
+
log_file_path: Path,
|
|
33
148
|
prefix: str,
|
|
149
|
+
db: Session,
|
|
150
|
+
activity: TaskGroupActivityV2,
|
|
34
151
|
):
|
|
35
152
|
"""
|
|
36
|
-
|
|
37
|
-
|
|
153
|
+
Run a `pixi install` script as a SLURM job.
|
|
38
154
|
|
|
39
155
|
NOTE: This is called from within a try/except, thus we can use exceptions
|
|
40
156
|
as a mechanism to propagate failure/errors.
|
|
@@ -44,7 +160,7 @@ def run_script_on_remote_slurm(
|
|
|
44
160
|
settings = Inject(get_settings)
|
|
45
161
|
|
|
46
162
|
# (1) Prepare remote submission script
|
|
47
|
-
workdir_remote =
|
|
163
|
+
workdir_remote = _get_workdir_remote(script_paths)
|
|
48
164
|
submission_script_remote = os.path.join(
|
|
49
165
|
workdir_remote, f"{prefix}-submit.sh"
|
|
50
166
|
)
|
|
@@ -61,55 +177,79 @@ def run_script_on_remote_slurm(
|
|
|
61
177
|
f"#SBATCH --out={stdout_remote}",
|
|
62
178
|
f"#SBATCH -D {workdir_remote}",
|
|
63
179
|
"",
|
|
64
|
-
f"bash {script_path}",
|
|
65
|
-
f"touch {success_file_remote}",
|
|
66
|
-
"",
|
|
67
180
|
]
|
|
181
|
+
for script_path in script_paths:
|
|
182
|
+
script_lines.append(f"bash {script_path}")
|
|
183
|
+
script_lines.append(f"touch {success_file_remote}")
|
|
184
|
+
|
|
68
185
|
script_contents = "\n".join(script_lines)
|
|
69
186
|
fractal_ssh.write_remote_file(
|
|
70
187
|
path=submission_script_remote,
|
|
71
188
|
content=script_contents,
|
|
72
189
|
)
|
|
190
|
+
logger.debug(f"Written {submission_script_remote=}.")
|
|
191
|
+
|
|
192
|
+
activity.log = get_current_log(log_file_path)
|
|
193
|
+
activity = add_commit_refresh(obj=activity, db=db)
|
|
73
194
|
|
|
74
195
|
# (2) Submit SLURM job
|
|
75
|
-
|
|
196
|
+
logger.debug("Now submit SLURM job.")
|
|
197
|
+
sbatch_cmd = f"sbatch --parsable {submission_script_remote}"
|
|
76
198
|
try:
|
|
77
199
|
stdout = fractal_ssh.run_command(cmd=sbatch_cmd)
|
|
200
|
+
job_id = int(stdout)
|
|
201
|
+
logger.debug(f"SLURM-job submission successful ({job_id=}).")
|
|
78
202
|
except Exception as e:
|
|
79
203
|
logger.error(
|
|
80
|
-
|
|
81
|
-
|
|
204
|
+
(
|
|
205
|
+
f"Submission of {submission_script_remote} failed. "
|
|
206
|
+
f"Original error: {str(e)}"
|
|
207
|
+
)
|
|
82
208
|
)
|
|
83
209
|
raise e
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
210
|
+
finally:
|
|
211
|
+
activity.log = get_current_log(log_file_path)
|
|
212
|
+
activity = add_commit_refresh(obj=activity, db=db)
|
|
87
213
|
|
|
88
214
|
# (3) Monitor job
|
|
89
215
|
squeue_cmd = (
|
|
90
216
|
f"squeue --noheader --format='%i %T' --states=all --jobs={job_id}"
|
|
91
217
|
)
|
|
218
|
+
logger.debug(f"Start monitoring job with {squeue_cmd=}.")
|
|
219
|
+
old_state = None
|
|
92
220
|
while True:
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
221
|
+
new_state = _run_squeue(
|
|
222
|
+
fractal_ssh=fractal_ssh,
|
|
223
|
+
squeue_cmd=squeue_cmd,
|
|
224
|
+
logger_name=logger_name,
|
|
225
|
+
)
|
|
226
|
+
_log_change_of_job_state(
|
|
227
|
+
old_state=old_state,
|
|
228
|
+
new_state=new_state,
|
|
229
|
+
logger_name=logger_name,
|
|
230
|
+
)
|
|
231
|
+
activity.log = get_current_log(log_file_path)
|
|
232
|
+
activity = add_commit_refresh(obj=activity, db=db)
|
|
233
|
+
if new_state in STATES_FINISHED:
|
|
234
|
+
logger.debug(f"Exit retrieval loop (state={new_state}).")
|
|
106
235
|
break
|
|
236
|
+
old_state = new_state
|
|
107
237
|
time.sleep(settings.FRACTAL_SLURM_POLL_INTERVAL)
|
|
108
238
|
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
239
|
+
_verify_success_file_exists(
|
|
240
|
+
fractal_ssh=fractal_ssh,
|
|
241
|
+
logger_name=logger_name,
|
|
242
|
+
success_file_remote=success_file_remote,
|
|
243
|
+
stderr_remote=stderr_remote,
|
|
244
|
+
)
|
|
245
|
+
|
|
246
|
+
stdout = _read_file_if_exists(
|
|
247
|
+
fractal_ssh=fractal_ssh,
|
|
248
|
+
path=stdout_remote,
|
|
249
|
+
)
|
|
250
|
+
|
|
251
|
+
logger.info("SLURM-job execution completed successfully, continue.")
|
|
252
|
+
activity.log = get_current_log(log_file_path)
|
|
253
|
+
activity = add_commit_refresh(obj=activity, db=db)
|
|
254
|
+
|
|
255
|
+
return stdout
|
|
@@ -203,12 +203,21 @@ def collect_ssh_pixi(
|
|
|
203
203
|
pyproject_toml_path=pyproject_toml_path,
|
|
204
204
|
)
|
|
205
205
|
|
|
206
|
-
#
|
|
206
|
+
# Prepare scripts 2 and 3
|
|
207
207
|
remote_script2_path = _customize_and_send_template(
|
|
208
208
|
template_filename="pixi_2_install.sh",
|
|
209
209
|
replacements=replacements,
|
|
210
210
|
**common_args,
|
|
211
211
|
)
|
|
212
|
+
remote_script3_path = _customize_and_send_template(
|
|
213
|
+
template_filename="pixi_3_post_install.sh",
|
|
214
|
+
replacements=replacements,
|
|
215
|
+
**common_args,
|
|
216
|
+
)
|
|
217
|
+
logger.debug(
|
|
218
|
+
"Post-installation script written to "
|
|
219
|
+
f"{remote_script3_path=}."
|
|
220
|
+
)
|
|
212
221
|
logger.debug(
|
|
213
222
|
"Installation script written to "
|
|
214
223
|
f"{remote_script2_path=}."
|
|
@@ -216,26 +225,24 @@ def collect_ssh_pixi(
|
|
|
216
225
|
activity.log = get_current_log(log_file_path)
|
|
217
226
|
activity = add_commit_refresh(obj=activity, db=db)
|
|
218
227
|
|
|
219
|
-
|
|
220
|
-
|
|
228
|
+
# Run scripts 2 and 3
|
|
229
|
+
stdout = run_script_on_remote_slurm(
|
|
230
|
+
script_paths=[
|
|
231
|
+
remote_script2_path,
|
|
232
|
+
remote_script3_path,
|
|
233
|
+
f"chmod -R 755 {source_dir}",
|
|
234
|
+
],
|
|
221
235
|
slurm_config=settings.pixi.SLURM_CONFIG,
|
|
222
236
|
fractal_ssh=fractal_ssh,
|
|
223
237
|
logger_name=LOGGER_NAME,
|
|
224
238
|
prefix=common_args["prefix"],
|
|
239
|
+
db=db,
|
|
240
|
+
activity=activity,
|
|
241
|
+
log_file_path=log_file_path,
|
|
225
242
|
)
|
|
226
243
|
activity.log = get_current_log(log_file_path)
|
|
227
244
|
activity = add_commit_refresh(obj=activity, db=db)
|
|
228
245
|
|
|
229
|
-
# Run script 3 - post-install
|
|
230
|
-
stdout = _customize_and_run_template(
|
|
231
|
-
template_filename="pixi_3_post_install.sh",
|
|
232
|
-
replacements=replacements,
|
|
233
|
-
**common_args,
|
|
234
|
-
)
|
|
235
|
-
logger.debug(f"STDOUT: {stdout}")
|
|
236
|
-
activity.log = get_current_log(log_file_path)
|
|
237
|
-
activity = add_commit_refresh(obj=activity, db=db)
|
|
238
|
-
|
|
239
246
|
# Parse stdout
|
|
240
247
|
parsed_output = parse_collect_stdout(stdout)
|
|
241
248
|
package_root_remote = parsed_output["package_root"]
|
|
@@ -245,8 +252,6 @@ def collect_ssh_pixi(
|
|
|
245
252
|
"project_python_wrapper"
|
|
246
253
|
]
|
|
247
254
|
|
|
248
|
-
fractal_ssh.run_command(cmd=f"chmod -R 755 {source_dir}")
|
|
249
|
-
|
|
250
255
|
# Read and validate remote manifest file
|
|
251
256
|
manifest_path_remote = (
|
|
252
257
|
f"{package_root_remote}/__FRACTAL_MANIFEST__.json"
|