fractal-server 2.16.2a0__py3-none-any.whl → 2.16.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (82)
  1. fractal_server/__init__.py +1 -1
  2. fractal_server/app/routes/admin/v2/job.py +3 -3
  3. fractal_server/app/routes/admin/v2/task.py +1 -1
  4. fractal_server/app/routes/admin/v2/task_group.py +1 -1
  5. fractal_server/app/routes/admin/v2/task_group_lifecycle.py +3 -3
  6. fractal_server/app/routes/api/v2/_aux_functions.py +7 -7
  7. fractal_server/app/routes/api/v2/_aux_functions_history.py +2 -2
  8. fractal_server/app/routes/api/v2/_aux_functions_task_lifecycle.py +37 -13
  9. fractal_server/app/routes/api/v2/_aux_functions_tasks.py +8 -8
  10. fractal_server/app/routes/api/v2/dataset.py +4 -4
  11. fractal_server/app/routes/api/v2/history.py +2 -2
  12. fractal_server/app/routes/api/v2/images.py +3 -3
  13. fractal_server/app/routes/api/v2/job.py +1 -1
  14. fractal_server/app/routes/api/v2/project.py +1 -1
  15. fractal_server/app/routes/api/v2/status_legacy.py +1 -1
  16. fractal_server/app/routes/api/v2/submit.py +9 -9
  17. fractal_server/app/routes/api/v2/task.py +4 -4
  18. fractal_server/app/routes/api/v2/task_collection.py +5 -5
  19. fractal_server/app/routes/api/v2/task_collection_custom.py +6 -6
  20. fractal_server/app/routes/api/v2/task_collection_pixi.py +5 -5
  21. fractal_server/app/routes/api/v2/task_group_lifecycle.py +3 -3
  22. fractal_server/app/routes/api/v2/task_version_update.py +3 -3
  23. fractal_server/app/routes/api/v2/workflow.py +4 -4
  24. fractal_server/app/routes/api/v2/workflow_import.py +1 -1
  25. fractal_server/app/routes/api/v2/workflowtask.py +6 -6
  26. fractal_server/app/routes/auth/group.py +2 -2
  27. fractal_server/app/routes/auth/users.py +1 -1
  28. fractal_server/app/routes/aux/_job.py +1 -1
  29. fractal_server/app/routes/aux/_runner.py +2 -2
  30. fractal_server/app/routes/aux/validate_user_settings.py +2 -2
  31. fractal_server/config.py +2 -2
  32. fractal_server/main.py +1 -1
  33. fractal_server/{app/runner → runner}/executors/base_runner.py +1 -1
  34. fractal_server/{app/runner → runner}/executors/call_command_wrapper.py +1 -1
  35. fractal_server/{app/runner → runner}/executors/local/runner.py +9 -9
  36. fractal_server/{app/runner → runner}/executors/slurm_common/_slurm_config.py +1 -1
  37. fractal_server/{app/runner → runner}/executors/slurm_common/base_slurm_runner.py +13 -13
  38. fractal_server/{app/runner → runner}/executors/slurm_common/slurm_job_task_models.py +1 -1
  39. fractal_server/{app/runner → runner}/executors/slurm_sudo/runner.py +1 -1
  40. fractal_server/{app/runner → runner}/task_files.py +1 -1
  41. fractal_server/{app/runner → runner}/v2/_local.py +2 -2
  42. fractal_server/{app/runner → runner}/v2/_slurm_ssh.py +3 -3
  43. fractal_server/{app/runner → runner}/v2/_slurm_sudo.py +2 -2
  44. fractal_server/{app/runner → runner}/v2/deduplicate_list.py +2 -2
  45. fractal_server/{app/runner → runner}/v2/merge_outputs.py +2 -2
  46. fractal_server/{app/runner → runner}/v2/runner.py +3 -3
  47. fractal_server/{app/runner → runner}/v2/runner_functions.py +12 -12
  48. fractal_server/{app/runner → runner}/v2/submit_workflow.py +13 -13
  49. fractal_server/{app/runner → runner}/v2/task_interface.py +2 -2
  50. fractal_server/ssh/_fabric.py +61 -18
  51. fractal_server/tasks/v2/ssh/_pixi_slurm_ssh.py +173 -33
  52. fractal_server/tasks/v2/ssh/collect_pixi.py +20 -15
  53. fractal_server/tasks/v2/ssh/reactivate_pixi.py +20 -15
  54. fractal_server/tasks/v2/utils_background.py +1 -1
  55. {fractal_server-2.16.2a0.dist-info → fractal_server-2.16.4.dist-info}/METADATA +6 -6
  56. {fractal_server-2.16.2a0.dist-info → fractal_server-2.16.4.dist-info}/RECORD +82 -82
  57. /fractal_server/{app/runner → runner}/__init__.py +0 -0
  58. /fractal_server/{app/runner → runner}/components.py +0 -0
  59. /fractal_server/{app/runner → runner}/exceptions.py +0 -0
  60. /fractal_server/{app/runner → runner}/executors/__init__.py +0 -0
  61. /fractal_server/{app/runner → runner}/executors/local/__init__.py +0 -0
  62. /fractal_server/{app/runner → runner}/executors/local/get_local_config.py +0 -0
  63. /fractal_server/{app/runner → runner}/executors/slurm_common/__init__.py +0 -0
  64. /fractal_server/{app/runner → runner}/executors/slurm_common/_batching.py +0 -0
  65. /fractal_server/{app/runner → runner}/executors/slurm_common/_job_states.py +0 -0
  66. /fractal_server/{app/runner → runner}/executors/slurm_common/get_slurm_config.py +0 -0
  67. /fractal_server/{app/runner → runner}/executors/slurm_common/remote.py +0 -0
  68. /fractal_server/{app/runner → runner}/executors/slurm_ssh/__init__.py +0 -0
  69. /fractal_server/{app/runner → runner}/executors/slurm_ssh/run_subprocess.py +0 -0
  70. /fractal_server/{app/runner → runner}/executors/slurm_ssh/runner.py +0 -0
  71. /fractal_server/{app/runner → runner}/executors/slurm_ssh/tar_commands.py +0 -0
  72. /fractal_server/{app/runner → runner}/executors/slurm_sudo/__init__.py +0 -0
  73. /fractal_server/{app/runner → runner}/executors/slurm_sudo/_subprocess_run_as_user.py +0 -0
  74. /fractal_server/{app/runner → runner}/filenames.py +0 -0
  75. /fractal_server/{app/runner → runner}/set_start_and_last_task_index.py +0 -0
  76. /fractal_server/{app/runner → runner}/shutdown.py +0 -0
  77. /fractal_server/{app/runner → runner}/v2/__init__.py +0 -0
  78. /fractal_server/{app/runner → runner}/v2/db_tools.py +0 -0
  79. /fractal_server/{app/runner → runner}/versions.py +0 -0
  80. {fractal_server-2.16.2a0.dist-info → fractal_server-2.16.4.dist-info}/LICENSE +0 -0
  81. {fractal_server-2.16.2a0.dist-info → fractal_server-2.16.4.dist-info}/WHEEL +0 -0
  82. {fractal_server-2.16.2a0.dist-info → fractal_server-2.16.4.dist-info}/entry_points.txt +0 -0
fractal_server/{app/runner → runner}/v2/submit_workflow.py
@@ -11,26 +11,26 @@ from pathlib import Path

  from sqlalchemy.orm import Session as DBSyncSession

- from ....config import get_settings
- from ....logger import get_logger
- from ....logger import reset_logger_handlers
- from ....logger import set_logger
- from ....ssh._fabric import FractalSSH
- from ....syringe import Inject
- from ....utils import get_timestamp
- from ....zip_tools import _zip_folder_to_file_and_remove
- from ...db import DB
- from ...models.v2 import DatasetV2
- from ...models.v2 import JobV2
- from ...models.v2 import WorkflowV2
- from ...schemas.v2 import JobStatusTypeV2
+ from ...config import get_settings
+ from ...logger import get_logger
+ from ...logger import reset_logger_handlers
+ from ...logger import set_logger
+ from ...ssh._fabric import FractalSSH
+ from ...syringe import Inject
+ from ...utils import get_timestamp
+ from ...zip_tools import _zip_folder_to_file_and_remove
  from ..exceptions import JobExecutionError
  from ..filenames import WORKFLOW_LOG_FILENAME
  from ._local import process_workflow as local_process_workflow
  from ._slurm_ssh import process_workflow as slurm_ssh_process_workflow
  from ._slurm_sudo import process_workflow as slurm_sudo_process_workflow
  from fractal_server import __VERSION__
+ from fractal_server.app.db import DB
  from fractal_server.app.models import UserSettings
+ from fractal_server.app.models.v2 import DatasetV2
+ from fractal_server.app.models.v2 import JobV2
+ from fractal_server.app.models.v2 import WorkflowV2
+ from fractal_server.app.schemas.v2 import JobStatusTypeV2


  _backends = {}
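
The hunk above is representative of the headline change in this release: the `fractal_server.app.runner` package now lives at `fractal_server.runner`, and most of the moved or re-imported modules in the file list follow the same pattern. A minimal sketch of what the move means for code importing from the old location (module names taken from the file list; the task_interface.py hunk below shows the same rewrite inside the package):

# Before (fractal-server 2.16.2a0)
# from fractal_server.app.runner.exceptions import JobExecutionError
# After (fractal-server 2.16.4)
from fractal_server.runner.exceptions import JobExecutionError
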
fractal_server/{app/runner → runner}/v2/task_interface.py
@@ -5,8 +5,8 @@ from pydantic import ConfigDict
  from pydantic import Field
  from pydantic import ValidationError

- from ....images import SingleImageTaskOutput
- from fractal_server.app.runner.exceptions import TaskOutputValidationError
+ from fractal_server.images import SingleImageTaskOutput
+ from fractal_server.runner.exceptions import TaskOutputValidationError
  from fractal_server.types import ZarrUrlStr


fractal_server/ssh/_fabric.py
@@ -1,5 +1,6 @@
  import json
  import logging
+ import os
  import time
  from collections.abc import Generator
  from contextlib import contextmanager
@@ -15,11 +16,15 @@ from invoke import UnexpectedExit
  from paramiko.ssh_exception import NoValidConnectionsError
  from pydantic import BaseModel

+ from ..logger import close_logger
  from ..logger import get_logger
  from ..logger import set_logger
  from fractal_server.string_tools import validate_cmd


+ SSH_MONITORING_LOGGER_NAME = "ssh-log"
+
+
  class FractalSSHTimeoutError(RuntimeError):
      pass

@@ -42,9 +47,6 @@ class SSHConfig(BaseModel):
      key_path: str


- logger = set_logger(__name__)
-
-
  def retry_if_socket_error(func):
      @wraps(func)
      def func_with_retry(*args, **kwargs):
@@ -76,7 +78,8 @@ def _acquire_lock_with_timeout(
      lock: Lock,
      label: str,
      timeout: float,
-     logger_name: str = __name__,
+     pid: int,
+     logger_name: str,
  ) -> Generator[Literal[True], Any, None]:
      """
      Given a `threading.Lock` object, try to acquire it within a given timeout.
@@ -88,8 +91,9 @@ def _acquire_lock_with_timeout(
          logger_name:
      """
      logger = get_logger(logger_name)
+     ssh_logger = get_logger(SSH_MONITORING_LOGGER_NAME)
      logger.info(f"Trying to acquire lock for '{label}', with {timeout=}")
-     t_start_lock_acquire = time.perf_counter()
+     t_lock_request = time.perf_counter()
      result = lock.acquire(timeout=timeout)
      try:
          if not result:
@@ -98,14 +102,25 @@ def _acquire_lock_with_timeout(
                  f"Failed to acquire lock for '{label}' within "
                  f"{timeout} seconds"
              )
-         t_end_lock_acquire = time.perf_counter()
-         elapsed = t_end_lock_acquire - t_start_lock_acquire
+         t_lock_acquisition = time.perf_counter()
+         elapsed = t_lock_acquisition - t_lock_request
          logger.info(f"Lock for '{label}' was acquired - {elapsed=:.4f} s")
          yield result
      finally:
          if result:
              lock.release()
              logger.info(f"Lock for '{label}' was released.")
+             t_lock_release = time.perf_counter()
+             lock_was_acquired = 1
+         else:
+             t_lock_release = time.perf_counter()
+             t_lock_acquisition = t_lock_release
+             lock_was_acquired = 0
+         lock_waiting_time = t_lock_acquisition - t_lock_request
+         lock_holding_time = t_lock_release - t_lock_acquisition
+         ssh_logger.info(
+             f"{pid} {lock_waiting_time:.6e} {lock_holding_time:.6e} {lock_was_acquired} {label.replace(' ', '_')}"  # noqa
+         )


  class FractalSSH:
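
Each lock acquisition now also writes one whitespace-separated record to the `ssh-log` monitoring logger: PID, lock-waiting time, lock-holding time, an acquired flag (1/0), and the label with spaces replaced by underscores. A rough sketch of how such records could be parsed for offline analysis, assuming the monitoring logger emits the bare message with no extra formatting (this helper is illustrative and not part of fractal-server):

from dataclasses import dataclass


@dataclass
class LockRecord:
    pid: int
    waiting_s: float  # time spent waiting for the lock
    holding_s: float  # time spent holding the lock
    acquired: bool    # False if acquisition timed out
    label: str        # e.g. "read_remote_json_file(/some/path.json)"


def parse_lock_record(line: str) -> LockRecord:
    # Record layout: "<pid> <waiting> <holding> <acquired-flag> <label>"
    pid, waiting, holding, acquired, label = line.split(maxsplit=4)
    return LockRecord(int(pid), float(waiting), float(holding), bool(int(acquired)), label)
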
@@ -130,11 +145,12 @@ class FractalSSH:
      sftp_get_prefetch: bool
      sftp_get_max_requests: int
      logger_name: str
+     _pid: int

      def __init__(
          self,
          connection: Connection,
-         default_timeout: float = 250,
+         default_timeout: float = 500.0,
          sftp_get_prefetch: bool = False,
          sftp_get_max_requests: int = 64,
          logger_name: str = __name__,
@@ -146,6 +162,8 @@ class FractalSSH:
          self.sftp_get_max_requests = sftp_get_max_requests
          self.logger_name = logger_name
          set_logger(self.logger_name)
+         set_logger(SSH_MONITORING_LOGGER_NAME)
+         self._pid = os.getpid()

      @property
      def is_connected(self) -> bool:
@@ -197,6 +215,8 @@ class FractalSSH:
              lock=self._lock,
              label=label,
              timeout=actual_lock_timeout,
+             pid=self._pid,
+             logger_name=self.logger_name,
          ):
              return self._connection.run(*args, **kwargs)

@@ -212,8 +232,10 @@ class FractalSSH:
          self.logger.info(f"START reading remote JSON file {filepath}.")
          with _acquire_lock_with_timeout(
              lock=self._lock,
-             label="read_remote_json_file",
              timeout=self.default_lock_timeout,
+             logger_name=self.logger_name,
+             pid=self._pid,
+             label=f"read_remote_json_file({filepath})",
          ):
              try:
                  with self._sftp_unsafe().open(filepath, "r") as f:
@@ -239,8 +261,10 @@ class FractalSSH:
          self.logger.info(f"START reading remote text file {filepath}.")
          with _acquire_lock_with_timeout(
              lock=self._lock,
-             label="read_remote_text_file",
+             label=f"read_remote_text_file({filepath})",
              timeout=self.default_lock_timeout,
+             pid=self._pid,
+             logger_name=self.logger_name,
          ):
              try:
                  with self._sftp_unsafe().open(filepath, "r") as f:
@@ -298,9 +322,10 @@ class FractalSSH:
          self.close()
          with _acquire_lock_with_timeout(
              lock=self._lock,
-             label="_connection.open",
+             label="FractalSSH._connection.{open,open_sftp}()",
              timeout=self.default_lock_timeout,
              logger_name=self.logger_name,
+             pid=self._pid,
          ):
              self._connection.open()
              self._connection.client.open_sftp()
@@ -324,12 +349,16 @@ class FractalSSH:
          """
          with _acquire_lock_with_timeout(
              lock=self._lock,
-             label="_connection.close",
+             label="FractalSSH._connection.close()",
              timeout=self.default_lock_timeout,
+             pid=self._pid,
+             logger_name=self.logger_name,
          ):
              self._connection.close()
              if self._connection.client is not None:
                  self._connection.client.close()
+             close_logger(get_logger(self.logger_name))
+             close_logger(get_logger(SSH_MONITORING_LOGGER_NAME))

      @retry_if_socket_error
      def run_command(
@@ -364,14 +393,14 @@ class FractalSSH:
              # Case 1: Command runs successfully
              res = self._run(
                  cmd,
-                 label=f"run {cmd}",
+                 label=cmd,
                  lock_timeout=actual_lock_timeout,
                  hide=True,
                  in_stream=False,
              )
              t_1 = time.perf_counter()
              self.logger.info(
-                 f"END running '{cmd}' over SSH, " f"elapsed={t_1 - t_0:.3f}"
+                 f"END running '{cmd}' over SSH, elapsed={t_1 - t_0:.3f}"
              )
              self.logger.debug("STDOUT:")
              self.logger.debug(res.stdout)
@@ -423,8 +452,10 @@ class FractalSSH:
              actual_lock_timeout = lock_timeout
          with _acquire_lock_with_timeout(
              lock=self._lock,
-             label=f"send_file {local=} {remote=}",
+             label=f"send_file({local},{remote})",
              timeout=actual_lock_timeout,
+             pid=self._pid,
+             logger_name=self.logger_name,
          ):
              self._sftp_unsafe().put(local, remote)
              self.logger.info(
@@ -463,8 +494,10 @@ class FractalSSH:
              actual_lock_timeout = lock_timeout
          with _acquire_lock_with_timeout(
              lock=self._lock,
-             label=f"fetch_file {local=} {remote=}",
+             label=f"fetch_file({local},{remote})",
              timeout=actual_lock_timeout,
+             pid=self._pid,
+             logger_name=self.logger_name,
          ):
              self._sftp_unsafe().get(
                  remote,
@@ -554,8 +587,10 @@ class FractalSSH:
              actual_lock_timeout = lock_timeout
          with _acquire_lock_with_timeout(
              lock=self._lock,
-             label=f"write_remote_file {path=}",
+             label=f"write_remote_file({path})",
              timeout=actual_lock_timeout,
+             pid=self._pid,
+             logger_name=self.logger_name,
          ):
              try:
                  with self._sftp_unsafe().open(filename=path, mode="w") as f:
@@ -576,8 +611,10 @@ class FractalSSH:
          self.logger.info(f"START remote_file_exists {path}")
          with _acquire_lock_with_timeout(
              lock=self._lock,
-             label=f"remote_file_exists {path=}",
+             label=f"remote_file_exists({path})",
              timeout=self.default_lock_timeout,
+             pid=self._pid,
+             logger_name=self.logger_name,
          ):
              try:
                  self._sftp_unsafe().stat(path)
@@ -613,6 +650,7 @@ class FractalSSHList:
      _lock: Lock
      _timeout: float
      _logger_name: str
+     _pid: int

      def __init__(
          self,
@@ -625,6 +663,7 @@ class FractalSSHList:
          self._timeout = timeout
          self._logger_name = logger_name
          set_logger(self._logger_name)
+         self._pid = os.getpid()

      @property
      def logger(self) -> logging.Logger:
@@ -677,6 +716,8 @@ class FractalSSHList:
              lock=self._lock,
              label="FractalSSHList.get",
              timeout=self._timeout,
+             pid=self._pid,
+             logger_name=self._logger_name,
          ):
              self._data[key] = FractalSSH(connection=connection)
          return self._data[key]
@@ -721,6 +762,8 @@ class FractalSSHList:
              lock=self._lock,
              timeout=self._timeout,
              label="FractalSSHList.remove",
+             pid=self._pid,
+             logger_name=self._logger_name,
          ):
              self.logger.info(
                  f"Removing FractalSSH object for {user}@{host} "
fractal_server/tasks/v2/ssh/_pixi_slurm_ssh.py
@@ -2,12 +2,18 @@ import os
  import time
  from pathlib import Path

+ from sqlalchemy.orm import Session
+
+ from fractal_server.app.models.v2 import TaskGroupActivityV2
  from fractal_server.config import get_settings
  from fractal_server.config import PixiSLURMConfig
  from fractal_server.logger import get_logger
  from fractal_server.ssh._fabric import FractalSSH
  from fractal_server.syringe import Inject
+ from fractal_server.tasks.v2.utils_background import add_commit_refresh
+ from fractal_server.tasks.v2.utils_background import get_current_log

+ FRACTAL_SQUEUE_ERROR_STATE = "__FRACTAL_SQUEUE_ERROR__"

  # https://slurm.schedmd.com/squeue.html#lbAG
  STATES_FINISHED = {
@@ -21,20 +27,130 @@ STATES_FINISHED = {
      "PREEMPTED",
      "SPECIAL_EXIT",
      "TIMEOUT",
+     FRACTAL_SQUEUE_ERROR_STATE,
  }


+ def _get_workdir_remote(script_paths: list[str]) -> str:
+     """
+     Check that there is one and only one `workdir`, and return it.
+
+     Note: The `is_absolute` check is to filter out a `chmod` command.
+     """
+     workdirs = [
+         Path(script_path).parent.as_posix()
+         for script_path in script_paths
+         if Path(script_path).is_absolute()
+     ]
+     if not len(set(workdirs)) == 1:
+         raise ValueError(f"Invalid {script_paths=}.")
+     return workdirs[0]
+
+
+ def _read_file_if_exists(
+     *,
+     fractal_ssh: FractalSSH,
+     path: str,
+ ) -> str:
+     """
+     Read a remote file if it exists, or return an empty string.
+     """
+     if fractal_ssh.remote_exists(path=path):
+         return fractal_ssh.read_remote_text_file(path)
+     else:
+         return ""
+
+
+ def _log_change_of_job_state(
+     *,
+     old_state: str | None,
+     new_state: str,
+     logger_name: str,
+ ) -> None:
+     """
+     Emit a log for state changes.
+
+     Args:
+         old_state:
+         new_state:
+         logger_name:
+     """
+     if new_state != old_state:
+         logger = get_logger(logger_name=logger_name)
+         logger.debug(
+             f"SLURM-job state changed from {old_state=} to {new_state=}."
+         )
+
+
+ def _run_squeue(
+     *,
+     fractal_ssh: FractalSSH,
+     squeue_cmd: str,
+     logger_name: str,
+ ) -> str:
+     """
+     Run a `squeue` command and handle exceptions.
+
+     Args:
+         fractal_ssh:
+         logger_name:
+         squeue_cmd:
+
+     Return:
+         state: The SLURM-job state.
+     """
+     try:
+         cmd_stdout = fractal_ssh.run_command(cmd=squeue_cmd)
+         state = cmd_stdout.strip().split()[1]
+         return state
+     except Exception as e:
+         logger = get_logger(logger_name=logger_name)
+         logger.info(f"`squeue` command failed (original error: {e})")
+         return FRACTAL_SQUEUE_ERROR_STATE
+
+
+ def _verify_success_file_exists(
+     *,
+     fractal_ssh: FractalSSH,
+     success_file_remote: str,
+     logger_name: str,
+     stderr_remote: str,
+ ) -> None:
+     """
+     Fail if the success sentinel file does not exist remotely.
+
+     Note: the `FractalSSH` methods in this function may fail, and such failures
+     are not handled in this function. Any such failure, however, will lead to
+     a "failed" task-group lifecycle activity (because it will raise an
+     exception from within `run_script_on_remote_slurm`, which will then be
+     handled at the calling-function level).
+     """
+     if not fractal_ssh.remote_exists(path=success_file_remote):
+         logger = get_logger(logger_name=logger_name)
+         error_msg = f"{success_file_remote=} missing."
+         logger.info(error_msg)
+
+         stderr = _read_file_if_exists(
+             fractal_ssh=fractal_ssh, path=stderr_remote
+         )
+         if stderr:
+             logger.info(f"SLURM-job stderr:\n{stderr}")
+         raise RuntimeError(error_msg)
+
+
  def run_script_on_remote_slurm(
      *,
-     script_path: str,
+     script_paths: list[str],
      slurm_config: PixiSLURMConfig,
      fractal_ssh: FractalSSH,
      logger_name: str,
+     log_file_path: Path,
      prefix: str,
+     db: Session,
+     activity: TaskGroupActivityV2,
  ):
      """
-     FIXME
-
+     Run a `pixi install` script as a SLURM job.

      NOTE: This is called from within a try/except, thus we can use exceptions
      as a mechanism to propagate failure/errors.
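
The helpers added above change how job monitoring fails: a broken `squeue` call no longer silently ends the loop but is mapped to the sentinel `FRACTAL_SQUEUE_ERROR_STATE`, which is itself listed in `STATES_FINISHED`, and the final verdict comes from the success sentinel file rather than from the reported job state. A minimal local sketch of the same polling pattern, using `subprocess` in place of `FractalSSH` and hypothetical names:

import subprocess
import time

FINISHED = {"COMPLETED", "FAILED", "CANCELLED", "TIMEOUT", "__SQUEUE_ERROR__"}


def poll_slurm_job(job_id: int, interval_s: float = 5.0) -> str:
    # Poll `squeue` until the job reaches a finished state or squeue itself fails.
    cmd = ["squeue", "--noheader", "--format=%i %T", "--states=all", f"--jobs={job_id}"]
    while True:
        try:
            out = subprocess.run(cmd, capture_output=True, text=True, check=True).stdout
            state = out.strip().split()[1]
        except Exception:
            # Any squeue failure is treated as a terminal sentinel, as in the diff above.
            state = "__SQUEUE_ERROR__"
        if state in FINISHED:
            return state
        time.sleep(interval_s)
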
@@ -44,7 +160,7 @@ def run_script_on_remote_slurm(
      settings = Inject(get_settings)

      # (1) Prepare remote submission script
-     workdir_remote = Path(script_path).parent.as_posix()
+     workdir_remote = _get_workdir_remote(script_paths)
      submission_script_remote = os.path.join(
          workdir_remote, f"{prefix}-submit.sh"
      )
@@ -61,55 +177,79 @@ def run_script_on_remote_slurm(
          f"#SBATCH --out={stdout_remote}",
          f"#SBATCH -D {workdir_remote}",
          "",
-         f"bash {script_path}",
-         f"touch {success_file_remote}",
-         "",
      ]
+     for script_path in script_paths:
+         script_lines.append(f"bash {script_path}")
+     script_lines.append(f"touch {success_file_remote}")
+
      script_contents = "\n".join(script_lines)
      fractal_ssh.write_remote_file(
          path=submission_script_remote,
          content=script_contents,
      )
+     logger.debug(f"Written {submission_script_remote=}.")
+
+     activity.log = get_current_log(log_file_path)
+     activity = add_commit_refresh(obj=activity, db=db)

      # (2) Submit SLURM job
-     sbatch_cmd = f"sbatch --parsable {submission_script_remote} "
+     logger.debug("Now submit SLURM job.")
+     sbatch_cmd = f"sbatch --parsable {submission_script_remote}"
      try:
          stdout = fractal_ssh.run_command(cmd=sbatch_cmd)
+         job_id = int(stdout)
+         logger.debug(f"SLURM-job submission successful ({job_id=}).")
      except Exception as e:
          logger.error(
-             f"Submission of {submission_script_remote} failed. "
-             f"Original error: {str(e)}"
+             (
+                 f"Submission of {submission_script_remote} failed. "
+                 f"Original error: {str(e)}"
+             )
          )
          raise e
-     logger.debug(f"Now submit job {submission_script_remote} to SLURM.")
-     job_id = int(stdout)
-     logger.debug(f"SLURM-job submission successful ({job_id=}).")
+     finally:
+         activity.log = get_current_log(log_file_path)
+         activity = add_commit_refresh(obj=activity, db=db)

      # (3) Monitor job
      squeue_cmd = (
          f"squeue --noheader --format='%i %T' --states=all --jobs={job_id}"
      )
+     logger.debug(f"Start monitoring job with {squeue_cmd=}.")
+     old_state = None
      while True:
-         try:
-             stdout = fractal_ssh.run_command(cmd=squeue_cmd)
-         except Exception as e:
-             # FIXME: review this logic
-             logger.info(
-                 f"`squeue` command failed (original error: {e}), "
-                 "consider the job as complete."
-             )
-             break
-         state = stdout.strip().split()[1]
-         logger.debug(f"Status of SLURM job {job_id}: {state}")
-         if state in STATES_FINISHED:
-             logger.debug(f"Exit retrieval loop ({state=}).")
+         new_state = _run_squeue(
+             fractal_ssh=fractal_ssh,
+             squeue_cmd=squeue_cmd,
+             logger_name=logger_name,
+         )
+         _log_change_of_job_state(
+             old_state=old_state,
+             new_state=new_state,
+             logger_name=logger_name,
+         )
+         activity.log = get_current_log(log_file_path)
+         activity = add_commit_refresh(obj=activity, db=db)
+         if new_state in STATES_FINISHED:
+             logger.debug(f"Exit retrieval loop (state={new_state}).")
              break
+         old_state = new_state
          time.sleep(settings.FRACTAL_SLURM_POLL_INTERVAL)

-     if fractal_ssh.remote_exists(path=success_file_remote):
-         logger.info(f"{success_file_remote=} exists.")
-     else:
-         raise RuntimeError(
-             "SLURM job did not complete correctly "
-             f"({success_file_remote=} missing)."
-         )
+     _verify_success_file_exists(
+         fractal_ssh=fractal_ssh,
+         logger_name=logger_name,
+         success_file_remote=success_file_remote,
+         stderr_remote=stderr_remote,
+     )
+
+     stdout = _read_file_if_exists(
+         fractal_ssh=fractal_ssh,
+         path=stdout_remote,
+     )
+
+     logger.info("SLURM-job execution completed successfully, continue.")
+     activity.log = get_current_log(log_file_path)
+     activity = add_commit_refresh(obj=activity, db=db)
+
+     return stdout
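
`run_script_on_remote_slurm` now assembles a single submission script from several payload scripts and ends it with a `touch` of a success sentinel, and the post-install and `chmod` steps are folded into that same SLURM job (see the collect_pixi.py hunks below). A small sketch of the assembly step in isolation, with a hypothetical helper name and a simplified SBATCH header:

def build_submission_script(
    script_paths: list[str],
    success_file: str,
    workdir: str,
) -> str:
    # One `bash <script>` line per payload; the success sentinel is touched
    # as the very last line of the job script.
    lines = [
        "#!/bin/bash",
        f"#SBATCH -D {workdir}",
        "",
    ]
    lines.extend(f"bash {path}" for path in script_paths)
    lines.append(f"touch {success_file}")
    return "\n".join(lines)
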
fractal_server/tasks/v2/ssh/collect_pixi.py
@@ -203,12 +203,21 @@ def collect_ssh_pixi(
              pyproject_toml_path=pyproject_toml_path,
          )

-         # Run script 2 - run pixi-install command
+         # Prepare scripts 2 and 3
          remote_script2_path = _customize_and_send_template(
              template_filename="pixi_2_install.sh",
              replacements=replacements,
              **common_args,
          )
+         remote_script3_path = _customize_and_send_template(
+             template_filename="pixi_3_post_install.sh",
+             replacements=replacements,
+             **common_args,
+         )
+         logger.debug(
+             "Post-installation script written to "
+             f"{remote_script3_path=}."
+         )
          logger.debug(
              "Installation script written to "
              f"{remote_script2_path=}."
@@ -216,26 +225,24 @@ def collect_ssh_pixi(
          activity.log = get_current_log(log_file_path)
          activity = add_commit_refresh(obj=activity, db=db)

-         run_script_on_remote_slurm(
-             script_path=remote_script2_path,
+         # Run scripts 2 and 3
+         stdout = run_script_on_remote_slurm(
+             script_paths=[
+                 remote_script2_path,
+                 remote_script3_path,
+                 f"chmod -R 755 {source_dir}",
+             ],
              slurm_config=settings.pixi.SLURM_CONFIG,
              fractal_ssh=fractal_ssh,
              logger_name=LOGGER_NAME,
              prefix=common_args["prefix"],
+             db=db,
+             activity=activity,
+             log_file_path=log_file_path,
          )
          activity.log = get_current_log(log_file_path)
          activity = add_commit_refresh(obj=activity, db=db)

-         # Run script 3 - post-install
-         stdout = _customize_and_run_template(
-             template_filename="pixi_3_post_install.sh",
-             replacements=replacements,
-             **common_args,
-         )
-         logger.debug(f"STDOUT: {stdout}")
-         activity.log = get_current_log(log_file_path)
-         activity = add_commit_refresh(obj=activity, db=db)
-
          # Parse stdout
          parsed_output = parse_collect_stdout(stdout)
          package_root_remote = parsed_output["package_root"]
@@ -245,8 +252,6 @@ def collect_ssh_pixi(
              "project_python_wrapper"
          ]

-         fractal_ssh.run_command(cmd=f"chmod -R 755 {source_dir}")
-
          # Read and validate remote manifest file
          manifest_path_remote = (
              f"{package_root_remote}/__FRACTAL_MANIFEST__.json"