fractal-server 2.2.0a1__py3-none-any.whl → 2.3.0a0__py3-none-any.whl

This diff compares the contents of two publicly available package versions, as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
Files changed (67)
  1. fractal_server/__init__.py +1 -1
  2. fractal_server/app/models/v1/state.py +1 -2
  3. fractal_server/app/routes/admin/v1.py +2 -2
  4. fractal_server/app/routes/admin/v2.py +2 -2
  5. fractal_server/app/routes/api/v1/job.py +2 -2
  6. fractal_server/app/routes/api/v1/task_collection.py +4 -4
  7. fractal_server/app/routes/api/v2/__init__.py +23 -3
  8. fractal_server/app/routes/api/v2/job.py +2 -2
  9. fractal_server/app/routes/api/v2/submit.py +6 -0
  10. fractal_server/app/routes/api/v2/task_collection.py +74 -34
  11. fractal_server/app/routes/api/v2/task_collection_custom.py +144 -0
  12. fractal_server/app/routes/api/v2/task_collection_ssh.py +125 -0
  13. fractal_server/app/routes/aux/_runner.py +10 -2
  14. fractal_server/app/runner/compress_folder.py +120 -0
  15. fractal_server/app/runner/executors/slurm/__init__.py +0 -3
  16. fractal_server/app/runner/executors/slurm/_batching.py +0 -1
  17. fractal_server/app/runner/executors/slurm/_slurm_config.py +9 -9
  18. fractal_server/app/runner/executors/slurm/ssh/__init__.py +3 -0
  19. fractal_server/app/runner/executors/slurm/ssh/_executor_wait_thread.py +112 -0
  20. fractal_server/app/runner/executors/slurm/ssh/_slurm_job.py +120 -0
  21. fractal_server/app/runner/executors/slurm/ssh/executor.py +1490 -0
  22. fractal_server/app/runner/executors/slurm/sudo/__init__.py +3 -0
  23. fractal_server/app/runner/executors/slurm/{_check_jobs_status.py → sudo/_check_jobs_status.py} +1 -1
  24. fractal_server/app/runner/executors/slurm/{_executor_wait_thread.py → sudo/_executor_wait_thread.py} +1 -1
  25. fractal_server/app/runner/executors/slurm/{_subprocess_run_as_user.py → sudo/_subprocess_run_as_user.py} +1 -1
  26. fractal_server/app/runner/executors/slurm/{executor.py → sudo/executor.py} +12 -12
  27. fractal_server/app/runner/extract_archive.py +38 -0
  28. fractal_server/app/runner/v1/__init__.py +78 -40
  29. fractal_server/app/runner/v1/_slurm/__init__.py +1 -1
  30. fractal_server/app/runner/v2/__init__.py +147 -62
  31. fractal_server/app/runner/v2/_local_experimental/__init__.py +22 -12
  32. fractal_server/app/runner/v2/_local_experimental/executor.py +12 -8
  33. fractal_server/app/runner/v2/_slurm/__init__.py +1 -6
  34. fractal_server/app/runner/v2/_slurm_ssh/__init__.py +126 -0
  35. fractal_server/app/runner/v2/_slurm_ssh/_submit_setup.py +83 -0
  36. fractal_server/app/runner/v2/_slurm_ssh/get_slurm_config.py +182 -0
  37. fractal_server/app/runner/v2/runner_functions_low_level.py +9 -11
  38. fractal_server/app/runner/versions.py +30 -0
  39. fractal_server/app/schemas/v1/__init__.py +1 -0
  40. fractal_server/app/schemas/{state.py → v1/state.py} +4 -21
  41. fractal_server/app/schemas/v2/__init__.py +4 -1
  42. fractal_server/app/schemas/v2/task_collection.py +97 -27
  43. fractal_server/config.py +184 -3
  44. fractal_server/main.py +25 -1
  45. fractal_server/ssh/__init__.py +4 -0
  46. fractal_server/ssh/_fabric.py +190 -0
  47. fractal_server/tasks/utils.py +12 -64
  48. fractal_server/tasks/v1/background_operations.py +2 -2
  49. fractal_server/tasks/{endpoint_operations.py → v1/endpoint_operations.py} +7 -12
  50. fractal_server/tasks/v1/utils.py +67 -0
  51. fractal_server/tasks/v2/_TaskCollectPip.py +61 -32
  52. fractal_server/tasks/v2/_venv_pip.py +195 -0
  53. fractal_server/tasks/v2/background_operations.py +257 -295
  54. fractal_server/tasks/v2/background_operations_ssh.py +304 -0
  55. fractal_server/tasks/v2/endpoint_operations.py +136 -0
  56. fractal_server/tasks/v2/templates/_1_create_venv.sh +46 -0
  57. fractal_server/tasks/v2/templates/_2_upgrade_pip.sh +30 -0
  58. fractal_server/tasks/v2/templates/_3_pip_install.sh +32 -0
  59. fractal_server/tasks/v2/templates/_4_pip_freeze.sh +21 -0
  60. fractal_server/tasks/v2/templates/_5_pip_show.sh +59 -0
  61. fractal_server/tasks/v2/utils.py +54 -0
  62. {fractal_server-2.2.0a1.dist-info → fractal_server-2.3.0a0.dist-info}/METADATA +4 -2
  63. {fractal_server-2.2.0a1.dist-info → fractal_server-2.3.0a0.dist-info}/RECORD +66 -42
  64. fractal_server/tasks/v2/get_collection_data.py +0 -14
  65. {fractal_server-2.2.0a1.dist-info → fractal_server-2.3.0a0.dist-info}/LICENSE +0 -0
  66. {fractal_server-2.2.0a1.dist-info → fractal_server-2.3.0a0.dist-info}/WHEEL +0 -0
  67. {fractal_server-2.2.0a1.dist-info → fractal_server-2.3.0a0.dist-info}/entry_points.txt +0 -0
@@ -0,0 +1,3 @@
+ from .executor import SlurmExecutor
+
+ __all__ = ["SlurmExecutor"]
@@ -2,7 +2,7 @@ from subprocess import run # nosec

  from cfut.slurm import STATES_FINISHED

- from .....logger import set_logger
+ from ......logger import set_logger


  logger = set_logger(__name__)
@@ -7,7 +7,7 @@ from typing import Optional

  from cfut import FileWaitThread

- from .....logger import set_logger
+ from ......logger import set_logger
  from ._check_jobs_status import _jobs_finished

  logger = set_logger(__name__)
@@ -19,7 +19,7 @@ import shlex
  import subprocess # nosec
  from typing import Optional

- from .....logger import set_logger
+ from ......logger import set_logger

  logger = set_logger(__name__)

@@ -29,18 +29,18 @@ import cloudpickle
  from cfut import SlurmExecutor
  from cfut.util import random_string

- from .....config import get_settings
- from .....logger import set_logger
- from .....syringe import Inject
- from ...exceptions import JobExecutionError
- from ...exceptions import TaskExecutionError
- from ...filenames import SHUTDOWN_FILENAME
- from ...task_files import get_task_file_paths
- from ...task_files import TaskFiles
- from ._batching import heuristics
+ from ......config import get_settings
+ from ......logger import set_logger
+ from ......syringe import Inject
+ from ....exceptions import JobExecutionError
+ from ....exceptions import TaskExecutionError
+ from ....filenames import SHUTDOWN_FILENAME
+ from ....task_files import get_task_file_paths
+ from ....task_files import TaskFiles
+ from ...slurm._slurm_config import get_default_slurm_config
+ from ...slurm._slurm_config import SlurmConfig
+ from .._batching import heuristics
  from ._executor_wait_thread import FractalSlurmWaitThread
- from ._slurm_config import get_default_slurm_config
- from ._slurm_config import SlurmConfig
  from ._subprocess_run_as_user import _glob_as_user
  from ._subprocess_run_as_user import _glob_as_user_strict
  from ._subprocess_run_as_user import _path_exists_as_user
@@ -1180,7 +1180,7 @@ class FractalSlurmExecutor(SlurmExecutor):

          # Prepare SLURM preamble based on SlurmConfig object
          script_lines = slurm_config.to_sbatch_preamble(
-             user_cache_dir=self.user_cache_dir
+             remote_export_dir=self.user_cache_dir
          )

          # Extend SLURM preamble with variable which are not in SlurmConfig, and
@@ -0,0 +1,38 @@
+ import sys
+ import tarfile
+ from pathlib import Path
+
+
+ def _remove_suffix(*, string: str, suffix: str) -> str:
+     if string.endswith(suffix):
+         return string[: -len(suffix)]
+     else:
+         raise ValueError(f"Cannot remove {suffix=} from {string=}.")
+
+
+ if __name__ == "__main__":
+     help_msg = (
+         "Expected use:\n"
+         "python -m fractal_server.app.runner.extract_archive "
+         "path/to/archive.tar.gz"
+     )
+
+     if len(sys.argv[1:]) != 1:
+         raise ValueError(
+             f"Invalid argument.\n{help_msg}\nProvided: {sys.argv=}"
+         )
+     elif not sys.argv[1].endswith(".tar.gz"):
+         raise ValueError(
+             f"Invalid argument.\n{help_msg}\nProvided: {sys.argv=}"
+         )
+
+     tarfile_path = Path(sys.argv[1])
+
+     print(f"[extract_archive.py] {tarfile_path=}")
+
+     job_folder = tarfile_path.parent
+     subfolder_name = _remove_suffix(string=tarfile_path.name, suffix=".tar.gz")
+     with tarfile.open(tarfile_path) as tar:
+         tar.extractall(path=Path(job_folder, subfolder_name).as_posix())
+
+     print(f"[extract_archive.py] {tarfile_path=}")
@@ -22,6 +22,10 @@ import traceback
  from pathlib import Path
  from typing import Optional

+ from sqlalchemy.orm import Session as DBSyncSession
+
+ from ....logger import get_logger
+ from ....logger import reset_logger_handlers
  from ....logger import set_logger
  from ....syringe import Inject
  from ....utils import get_timestamp
@@ -33,7 +37,7 @@ from ...models.v1 import WorkflowTask
  from ...schemas.v1 import JobStatusTypeV1
  from ..exceptions import JobExecutionError
  from ..exceptions import TaskExecutionError
- from ..executors.slurm._subprocess_run_as_user import (
+ from ..executors.slurm.sudo._subprocess_run_as_user import (
      _mkdir_as_user,
  )
  from ..filenames import WORKFLOW_LOG_FILENAME
@@ -53,6 +57,27 @@ _backends["local"] = local_process_workflow
  _backends["slurm"] = slurm_process_workflow


+ def fail_job(
+     *,
+     db: DBSyncSession,
+     job: ApplyWorkflow,
+     log_msg: str,
+     logger_name: str,
+     emit_log: bool = False,
+ ) -> None:
+     logger = get_logger(logger_name=logger_name)
+     if emit_log:
+         logger.error(log_msg)
+     reset_logger_handlers(logger)
+     job.status = JobStatusTypeV1.FAILED
+     job.end_timestamp = get_timestamp()
+     job.log = log_msg
+     db.merge(job)
+     db.commit()
+     db.close()
+     return
+
+
  async def submit_workflow(
      *,
      workflow_id: int,
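
The hunks below replace the repeated failure bookkeeping with single calls to this helper; schematically (both fragments are taken from the surrounding diff):

# before (repeated in each error path)
job.status = JobStatusTypeV1.FAILED
job.end_timestamp = get_timestamp()
job.log = log_msg
db_sync.merge(job)
db_sync.commit()
db_sync.close()

# after
fail_job(db=db_sync, job=job, log_msg=log_msg, logger_name=logger_name)
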
@@ -91,21 +116,41 @@ async def submit_workflow(
          slurm backend.
      """

-     # Declare runner backend and set `process_workflow` function
-     settings = Inject(get_settings)
-     FRACTAL_RUNNER_BACKEND = settings.FRACTAL_RUNNER_BACKEND
-     if FRACTAL_RUNNER_BACKEND == "local":
-         process_workflow = local_process_workflow
-     elif FRACTAL_RUNNER_BACKEND == "slurm":
-         process_workflow = slurm_process_workflow
-     else:
-         raise RuntimeError(f"Invalid runner backend {FRACTAL_RUNNER_BACKEND=}")
+     logger_name = f"WF{workflow_id}_job{job_id}"
+     logger = set_logger(logger_name=logger_name)

      with next(DB.get_sync_db()) as db_sync:

          job: ApplyWorkflow = db_sync.get(ApplyWorkflow, job_id)
          if not job:
-             raise ValueError(f"Cannot fetch job {job_id} from database")
+             logger.error(f"ApplyWorkflow {job_id} does not exist")
+             return
+
+         settings = Inject(get_settings)
+         FRACTAL_RUNNER_BACKEND = settings.FRACTAL_RUNNER_BACKEND
+         if FRACTAL_RUNNER_BACKEND == "local":
+             process_workflow = local_process_workflow
+         elif FRACTAL_RUNNER_BACKEND == "slurm":
+             process_workflow = slurm_process_workflow
+         else:
+
+             if FRACTAL_RUNNER_BACKEND == "local_experimental":
+                 log_msg = (
+                     f"{FRACTAL_RUNNER_BACKEND=} is not available for v1 jobs."
+                 )
+             else:
+                 log_msg = f"Invalid {FRACTAL_RUNNER_BACKEND=}"
+
+             fail_job(
+                 job=job,
+                 db=db_sync,
+                 log_msg=log_msg,
+                 logger_name=logger_name,
+                 emit_log=True,
+             )
+             return
+
+         # Declare runner backend and set `process_workflow` function

          input_dataset: Dataset = db_sync.get(Dataset, input_dataset_id)
          output_dataset: Dataset = db_sync.get(Dataset, output_dataset_id)
@@ -126,12 +171,9 @@ async def submit_workflow(
                  log_msg += (
                      f"Cannot fetch workflow {workflow_id} from database\n"
                  )
-             job.status = JobStatusTypeV1.FAILED
-             job.end_timestamp = get_timestamp()
-             job.log = log_msg
-             db_sync.merge(job)
-             db_sync.commit()
-             db_sync.close()
+             fail_job(
+                 db=db_sync, job=job, log_msg=log_msg, logger_name=logger_name
+             )
              return

          # Prepare some of process_workflow arguments
@@ -147,9 +189,14 @@ async def submit_workflow(
          )

          if WORKFLOW_DIR_LOCAL.exists():
-             raise RuntimeError(
-                 f"Workflow dir {WORKFLOW_DIR_LOCAL} already exists."
+             fail_job(
+                 db=db_sync,
+                 job=job,
+                 log_msg=f"Workflow dir {WORKFLOW_DIR_LOCAL} already exists.",
+                 logger_name=logger_name,
+                 emit_log=True,
              )
+             return

          # Create WORKFLOW_DIR
          original_umask = os.umask(0)
@@ -202,7 +249,6 @@ async def submit_workflow(
          db_sync.refresh(workflow)

          # Write logs
-         logger_name = f"WF{workflow_id}_job{job_id}"
          log_file_path = WORKFLOW_DIR_LOCAL / WORKFLOW_LOG_FILENAME
          logger = set_logger(
              logger_name=logger_name,
@@ -302,19 +348,14 @@ async def submit_workflow(

              db_sync.merge(output_dataset)

-             job.status = JobStatusTypeV1.FAILED
-             job.end_timestamp = get_timestamp()
-
              exception_args_string = "\n".join(e.args)
-             job.log = (
+             log_msg = (
                  f"TASK ERROR: "
                  f"Task name: {e.task_name}, "
                  f"position in Workflow: {e.workflow_task_order}\n"
                  f"TRACEBACK:\n{exception_args_string}"
              )
-             db_sync.merge(job)
-             close_job_logger(logger)
-             db_sync.commit()
+             fail_job(db=db_sync, job=job, log_msg=log_msg, logger_name=logger_name)

          except JobExecutionError as e:

@@ -334,14 +375,13 @@ async def submit_workflow(
              )

              db_sync.merge(output_dataset)
-
-             job.status = JobStatusTypeV1.FAILED
-             job.end_timestamp = get_timestamp()
              error = e.assemble_error()
-             job.log = f"JOB ERROR in Fractal job {job.id}:\nTRACEBACK:\n{error}"
-             db_sync.merge(job)
-             close_job_logger(logger)
-             db_sync.commit()
+             fail_job(
+                 db=db_sync,
+                 job=job,
+                 log_msg=f"JOB ERROR in Fractal job {job.id}:\nTRACEBACK:\n{error}",
+                 logger_name=logger_name,
+             )

          except Exception:

@@ -364,14 +404,12 @@ async def submit_workflow(

              db_sync.merge(output_dataset)

-             job.status = JobStatusTypeV1.FAILED
-             job.end_timestamp = get_timestamp()
-             job.log = (
+             log_msg = (
                  f"UNKNOWN ERROR in Fractal job {job.id}\n"
                  f"TRACEBACK:\n{current_traceback}"
              )
-             db_sync.merge(job)
-             close_job_logger(logger)
-             db_sync.commit()
+             fail_job(db=db_sync, job=job, log_msg=log_msg, logger_name=logger_name)
+
          finally:
              db_sync.close()
+             reset_logger_handlers(logger)
@@ -22,7 +22,7 @@ from typing import Optional
  from typing import Union

  from ...async_wrap import async_wrap
- from ...executors.slurm.executor import FractalSlurmExecutor
+ from ...executors.slurm.sudo.executor import FractalSlurmExecutor
  from ...set_start_and_last_task_index import set_start_and_last_task_index
  from .._common import execute_tasks
  from ..common import TaskParameters
@@ -5,14 +5,18 @@ This module is the single entry point to the runner backend subsystem V2.
  Other subystems should only import this module and not its submodules or
  the individual backends.
  """
+ import logging
  import os
  import traceback
  from pathlib import Path
  from typing import Optional

+ from fabric import Connection # FIXME SSH: try/except import
+ from sqlalchemy.orm import Session as DBSyncSession
  from sqlalchemy.orm.attributes import flag_modified

  from ....config import get_settings
+ from ....logger import get_logger
  from ....logger import reset_logger_handlers
  from ....logger import set_logger
  from ....syringe import Inject
@@ -25,14 +29,15 @@ from ...models.v2 import WorkflowV2
  from ...schemas.v2 import JobStatusTypeV2
  from ..exceptions import JobExecutionError
  from ..exceptions import TaskExecutionError
- from ..executors.slurm._subprocess_run_as_user import _mkdir_as_user
+ from ..executors.slurm.sudo._subprocess_run_as_user import _mkdir_as_user
  from ..filenames import WORKFLOW_LOG_FILENAME
  from ..task_files import task_subfolder_name
  from ._local import process_workflow as local_process_workflow
  from ._local_experimental import (
      process_workflow as local_experimental_process_workflow,
  )
- from ._slurm import process_workflow as slurm_process_workflow
+ from ._slurm import process_workflow as slurm_sudo_process_workflow
+ from ._slurm_ssh import process_workflow as slurm_ssh_process_workflow
  from .handle_failed_job import assemble_filters_failed_job
  from .handle_failed_job import assemble_history_failed_job
  from .handle_failed_job import assemble_images_failed_job
@@ -40,8 +45,30 @@ from fractal_server import __VERSION__

  _backends = {}
  _backends["local"] = local_process_workflow
+ _backends["slurm"] = slurm_sudo_process_workflow
+ _backends["slurm_ssh"] = slurm_ssh_process_workflow
  _backends["local_experimental"] = local_experimental_process_workflow
- _backends["slurm"] = slurm_process_workflow
+
+
+ def fail_job(
+     *,
+     db: DBSyncSession,
+     job: JobV2,
+     log_msg: str,
+     logger_name: str,
+     emit_log: bool = False,
+ ) -> None:
+     logger = get_logger(logger_name=logger_name)
+     if emit_log:
+         logger.error(log_msg)
+     reset_logger_handlers(logger)
+     job.status = JobStatusTypeV2.FAILED
+     job.end_timestamp = get_timestamp()
+     job.log = log_msg
+     db.merge(job)
+     db.commit()
+     db.close()
+     return


  async def submit_workflow(
@@ -52,6 +79,7 @@ async def submit_workflow(
      worker_init: Optional[str] = None,
      slurm_user: Optional[str] = None,
      user_cache_dir: Optional[str] = None,
+     connection: Optional[Connection] = None,
  ) -> None:
      """
      Prepares a workflow and applies it to a dataset
@@ -78,24 +106,36 @@ async def submit_workflow(
              The username to impersonate for the workflow execution, for the
              slurm backend.
      """
-
      # Declare runner backend and set `process_workflow` function
      settings = Inject(get_settings)
      FRACTAL_RUNNER_BACKEND = settings.FRACTAL_RUNNER_BACKEND
-     if FRACTAL_RUNNER_BACKEND == "local":
-         process_workflow = local_process_workflow
-     elif FRACTAL_RUNNER_BACKEND == "local_experimental":
-         process_workflow = local_experimental_process_workflow
-     elif FRACTAL_RUNNER_BACKEND == "slurm":
-         process_workflow = slurm_process_workflow
-     else:
-         raise RuntimeError(f"Invalid runner backend {FRACTAL_RUNNER_BACKEND=}")
+     logger_name = f"WF{workflow_id}_job{job_id}"
+     logger = set_logger(logger_name=logger_name)

      with next(DB.get_sync_db()) as db_sync:

          job: JobV2 = db_sync.get(JobV2, job_id)
          if not job:
-             raise ValueError(f"Cannot fetch job {job_id} from database")
+             logger.error(f"JobV2 {job_id} does not exist")
+             return
+
+         # Declare runner backend and set `process_workflow` function
+         settings = Inject(get_settings)
+         FRACTAL_RUNNER_BACKEND = settings.FRACTAL_RUNNER_BACKEND
+         try:
+             process_workflow = _backends[settings.FRACTAL_RUNNER_BACKEND]
+         except KeyError as e:
+             fail_job(
+                 db=db_sync,
+                 job=job,
+                 log_msg=(
+                     f"Invalid {FRACTAL_RUNNER_BACKEND=}.\n"
+                     f"Original KeyError: {str(e)}"
+                 ),
+                 logger_name=logger_name,
+                 emit_log=True,
+             )
+             return

          dataset: DatasetV2 = db_sync.get(DatasetV2, dataset_id)
          workflow: WorkflowV2 = db_sync.get(WorkflowV2, workflow_id)
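
Compared with the v1 runner, the backend choice is now a lookup in the module-level _backends dict, and an unknown key marks the job as FAILED instead of raising. A standalone sketch of the same dispatch pattern, with generic stand-in callables rather than the real process_workflow functions:

from typing import Any, Callable

# Stand-in registry, keyed like FRACTAL_RUNNER_BACKEND.
_backends: dict[str, Callable[..., Any]] = {
    "local": lambda **kwargs: "ran locally",
    "slurm": lambda **kwargs: "ran via sudo-based SLURM",
    "slurm_ssh": lambda **kwargs: "ran via SSH-based SLURM",
}


def dispatch(backend_name: str, **kwargs: Any) -> Any:
    try:
        process_workflow = _backends[backend_name]
    except KeyError as e:
        # submit_workflow fails the job via fail_job() at this point; the
        # sketch just re-raises with a clearer message.
        raise RuntimeError(f"Invalid backend {backend_name!r}") from e
    return process_workflow(**kwargs)


print(dispatch("slurm_ssh"))  # -> "ran via SSH-based SLURM"
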
@@ -107,31 +147,28 @@ async def submit_workflow(
                  log_msg += (
                      f"Cannot fetch workflow {workflow_id} from database\n"
                  )
-             job.status = JobStatusTypeV2.FAILED
-             job.end_timestamp = get_timestamp()
-             job.log = log_msg
-             db_sync.merge(job)
-             db_sync.commit()
-             db_sync.close()
+             fail_job(
+                 db=db_sync, job=job, log_msg=log_msg, logger_name=logger_name
+             )
              return

          # Define and create server-side working folder
          WORKFLOW_DIR_LOCAL = Path(job.working_dir)
          if WORKFLOW_DIR_LOCAL.exists():
-             job.status = JobStatusTypeV2.FAILED
-             job.end_timestamp = get_timestamp()
-             job.log = f"Workflow dir {WORKFLOW_DIR_LOCAL} already exists."
-             db_sync.merge(job)
-             db_sync.commit()
-             db_sync.close()
+             fail_job(
+                 db=db_sync,
+                 job=job,
+                 log_msg=f"Workflow dir {WORKFLOW_DIR_LOCAL} already exists.",
+                 logger_name=logger_name,
+                 emit_log=True,
+             )
              return

          try:

-             # Create WORKFLOW_DIR
+             # Create WORKFLOW_DIR_LOCAL
              original_umask = os.umask(0)
              WORKFLOW_DIR_LOCAL.mkdir(parents=True, mode=0o755)
-
              os.umask(original_umask)

              # Define and create WORKFLOW_DIR_REMOTE
@@ -146,6 +183,24 @@ async def submit_workflow(
                  _mkdir_as_user(
                      folder=str(WORKFLOW_DIR_REMOTE), user=slurm_user
                  )
+             elif FRACTAL_RUNNER_BACKEND == "slurm_ssh":
+                 WORKFLOW_DIR_REMOTE = (
+                     Path(settings.FRACTAL_SLURM_SSH_WORKING_BASE_DIR)
+                     / WORKFLOW_DIR_LOCAL.name
+                 )
+                 # FIXME SSH: move mkdir to executor, likely within handshake
+
+                 from ....ssh._fabric import _mkdir_over_ssh
+
+                 _mkdir_over_ssh(
+                     folder=str(WORKFLOW_DIR_REMOTE), connection=connection
+                 )
+                 logging.info(f"Created {str(WORKFLOW_DIR_REMOTE)} via SSH.")
+             else:
+                 logging.error(
+                     "Invalid FRACTAL_RUNNER_BACKEND="
+                     f"{settings.FRACTAL_RUNNER_BACKEND}."
+                 )

              # Create all tasks subfolders
              for order in range(job.first_task_index, job.last_task_index + 1):
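
The _mkdir_over_ssh helper lives in the new fractal_server/ssh/_fabric.py, whose body is not part of this diff. As a rough, assumption-laden sketch, such a helper could be as small as a single fabric call (hypothetical code, not the actual implementation):

from fabric import Connection


def mkdir_over_ssh(*, folder: str, connection: Connection) -> None:
    # `mkdir -p` keeps the call idempotent if the folder already exists;
    # `hide=True` suppresses echoing of stdout/stderr.
    connection.run(f"mkdir -p {folder}", hide=True)
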
@@ -166,16 +221,20 @@ async def submit_workflow(
                          folder=str(WORKFLOW_DIR_REMOTE / subfolder_name),
                          user=slurm_user,
                      )
+                 else:
+                     logging.info("Skip remote-subfolder creation")
          except Exception as e:
-             job.status = JobStatusTypeV2.FAILED
-             job.end_timestamp = get_timestamp()
-             job.log = (
-                 "An error occurred while creating job folder and subfolders.\n"
-                 f"Original error: {str(e)}"
+             error_type = type(e).__name__
+             fail_job(
+                 db=db_sync,
+                 job=job,
+                 log_msg=(
+                     f"{error_type} error occurred while creating job folder "
+                     f"and subfolders.\nOriginal error: {str(e)}"
+                 ),
+                 logger_name=logger_name,
+                 emit_log=True,
              )
-             db_sync.merge(job)
-             db_sync.commit()
-             db_sync.close()
              return

          # After Session.commit() is called, either explicitly or when using a
@@ -195,7 +254,6 @@ async def submit_workflow(
          db_sync.refresh(wftask)

          # Write logs
-         logger_name = f"WF{workflow_id}_job{job_id}"
          log_file_path = WORKFLOW_DIR_LOCAL / WORKFLOW_LOG_FILENAME
          logger = set_logger(
              logger_name=logger_name,
@@ -207,9 +265,17 @@ async def submit_workflow(
          )
          logger.debug(f"fractal_server.__VERSION__: {__VERSION__}")
          logger.debug(f"FRACTAL_RUNNER_BACKEND: {FRACTAL_RUNNER_BACKEND}")
-         logger.debug(f"slurm_user: {slurm_user}")
-         logger.debug(f"slurm_account: {job.slurm_account}")
-         logger.debug(f"worker_init: {worker_init}")
+         if FRACTAL_RUNNER_BACKEND == "slurm":
+             logger.debug(f"slurm_user: {slurm_user}")
+             logger.debug(f"slurm_account: {job.slurm_account}")
+             logger.debug(f"worker_init: {worker_init}")
+         elif FRACTAL_RUNNER_BACKEND == "slurm_ssh":
+             logger.debug(f"ssh_host: {settings.FRACTAL_SLURM_SSH_HOST}")
+             logger.debug(f"ssh_user: {settings.FRACTAL_SLURM_SSH_USER}")
+             logger.debug(
+                 f"base dir: {settings.FRACTAL_SLURM_SSH_WORKING_BASE_DIR}"
+             )
+             logger.debug(f"worker_init: {worker_init}")
          logger.debug(f"job.id: {job.id}")
          logger.debug(f"job.working_dir: {job.working_dir}")
          logger.debug(f"job.working_dir_user: {job.working_dir_user}")
@@ -218,6 +284,27 @@ async def submit_workflow(
          logger.debug(f'START workflow "{workflow.name}"')

          try:
+             if FRACTAL_RUNNER_BACKEND == "local":
+                 process_workflow = local_process_workflow
+                 backend_specific_kwargs = {}
+             elif FRACTAL_RUNNER_BACKEND == "local_experimental":
+                 process_workflow = local_experimental_process_workflow
+                 backend_specific_kwargs = {}
+             elif FRACTAL_RUNNER_BACKEND == "slurm":
+                 process_workflow = slurm_sudo_process_workflow
+                 backend_specific_kwargs = dict(
+                     slurm_user=slurm_user,
+                     slurm_account=job.slurm_account,
+                     user_cache_dir=user_cache_dir,
+                 )
+             elif FRACTAL_RUNNER_BACKEND == "slurm_ssh":
+                 process_workflow = slurm_ssh_process_workflow
+                 backend_specific_kwargs = dict(connection=connection)
+             else:
+                 raise RuntimeError(
+                     f"Invalid runner backend {FRACTAL_RUNNER_BACKEND=}"
+                 )
+
              # "The Session.close() method does not prevent the Session from being
              # used again. The Session itself does not actually have a distinct
              # “closed” state; it merely means the Session will release all database
@@ -234,15 +321,13 @@ async def submit_workflow(
              new_dataset_attributes = await process_workflow(
                  workflow=workflow,
                  dataset=dataset,
-                 slurm_user=slurm_user,
-                 slurm_account=job.slurm_account,
-                 user_cache_dir=user_cache_dir,
                  workflow_dir_local=WORKFLOW_DIR_LOCAL,
                  workflow_dir_remote=WORKFLOW_DIR_REMOTE,
                  logger_name=logger_name,
                  worker_init=worker_init,
                  first_task_index=job.first_task_index,
                  last_task_index=job.last_task_index,
+                 **backend_specific_kwargs,
              )

              logger.info(
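
Only the arguments shared by every backend stay explicit in this call; anything backend-dependent travels in backend_specific_kwargs and is merged in via ** unpacking. A generic illustration of the pattern (toy function, not the fractal-server signature):

def process_workflow(*, workflow: str, dataset: str, **backend_kwargs) -> dict:
    # Toy stand-in: just echo what was received.
    return {"workflow": workflow, "dataset": dataset, **backend_kwargs}


# slurm case: impersonation details are backend-specific
backend_specific_kwargs = dict(slurm_user="alice", user_cache_dir="/tmp/cache")
result = process_workflow(
    workflow="wf-1",
    dataset="ds-1",
    **backend_specific_kwargs,
)
print(result)
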
@@ -291,18 +376,14 @@ async def submit_workflow(
                  dataset.images = latest_images
              db_sync.merge(dataset)

-             job.status = JobStatusTypeV2.FAILED
-             job.end_timestamp = get_timestamp()
-
              exception_args_string = "\n".join(e.args)
-             job.log = (
+             log_msg = (
                  f"TASK ERROR: "
                  f"Task name: {e.task_name}, "
                  f"position in Workflow: {e.workflow_task_order}\n"
                  f"TRACEBACK:\n{exception_args_string}"
              )
-             db_sync.merge(job)
-             db_sync.commit()
+             fail_job(db=db_sync, job=job, log_msg=log_msg, logger_name=logger_name)

          except JobExecutionError as e:

@@ -325,12 +406,15 @@ async def submit_workflow(
                  dataset.images = latest_images
              db_sync.merge(dataset)

-             job.status = JobStatusTypeV2.FAILED
-             job.end_timestamp = get_timestamp()
-             error = e.assemble_error()
-             job.log = f"JOB ERROR in Fractal job {job.id}:\nTRACEBACK:\n{error}"
-             db_sync.merge(job)
-             db_sync.commit()
+             fail_job(
+                 db=db_sync,
+                 job=job,
+                 log_msg=(
+                     f"JOB ERROR in Fractal job {job.id}:\n"
+                     f"TRACEBACK:\n{e.assemble_error()}"
+                 ),
+                 logger_name=logger_name,
+             )

          except Exception:

@@ -354,15 +438,16 @@ async def submit_workflow(
              if latest_images is not None:
                  dataset.images = latest_images
              db_sync.merge(dataset)
-
-             job.status = JobStatusTypeV2.FAILED
-             job.end_timestamp = get_timestamp()
-             job.log = (
-                 f"UNKNOWN ERROR in Fractal job {job.id}\n"
-                 f"TRACEBACK:\n{current_traceback}"
+             fail_job(
+                 db=db_sync,
+                 job=job,
+                 log_msg=(
+                     f"UNKNOWN ERROR in Fractal job {job.id}\n"
+                     f"TRACEBACK:\n{current_traceback}"
+                 ),
+                 logger_name=logger_name,
              )
-             db_sync.merge(job)
-             db_sync.commit()
+
          finally:
              reset_logger_handlers(logger)
              db_sync.close()