fractal-server 2.2.0a0__py3-none-any.whl → 2.3.0a0__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
Files changed (69)
  1. fractal_server/__init__.py +1 -1
  2. fractal_server/app/db/__init__.py +1 -1
  3. fractal_server/app/models/v1/state.py +1 -2
  4. fractal_server/app/routes/admin/v1.py +2 -2
  5. fractal_server/app/routes/admin/v2.py +2 -2
  6. fractal_server/app/routes/api/v1/job.py +2 -2
  7. fractal_server/app/routes/api/v1/task_collection.py +4 -4
  8. fractal_server/app/routes/api/v2/__init__.py +23 -3
  9. fractal_server/app/routes/api/v2/job.py +2 -2
  10. fractal_server/app/routes/api/v2/submit.py +6 -0
  11. fractal_server/app/routes/api/v2/task_collection.py +74 -34
  12. fractal_server/app/routes/api/v2/task_collection_custom.py +144 -0
  13. fractal_server/app/routes/api/v2/task_collection_ssh.py +125 -0
  14. fractal_server/app/routes/aux/_runner.py +10 -2
  15. fractal_server/app/runner/compress_folder.py +120 -0
  16. fractal_server/app/runner/executors/slurm/__init__.py +0 -3
  17. fractal_server/app/runner/executors/slurm/_batching.py +0 -1
  18. fractal_server/app/runner/executors/slurm/_slurm_config.py +9 -9
  19. fractal_server/app/runner/executors/slurm/ssh/__init__.py +3 -0
  20. fractal_server/app/runner/executors/slurm/ssh/_executor_wait_thread.py +112 -0
  21. fractal_server/app/runner/executors/slurm/ssh/_slurm_job.py +120 -0
  22. fractal_server/app/runner/executors/slurm/ssh/executor.py +1490 -0
  23. fractal_server/app/runner/executors/slurm/sudo/__init__.py +3 -0
  24. fractal_server/app/runner/executors/slurm/{_check_jobs_status.py → sudo/_check_jobs_status.py} +1 -1
  25. fractal_server/app/runner/executors/slurm/{_executor_wait_thread.py → sudo/_executor_wait_thread.py} +1 -1
  26. fractal_server/app/runner/executors/slurm/{_subprocess_run_as_user.py → sudo/_subprocess_run_as_user.py} +1 -1
  27. fractal_server/app/runner/executors/slurm/{executor.py → sudo/executor.py} +12 -12
  28. fractal_server/app/runner/extract_archive.py +38 -0
  29. fractal_server/app/runner/v1/__init__.py +78 -40
  30. fractal_server/app/runner/v1/_slurm/__init__.py +1 -1
  31. fractal_server/app/runner/v2/__init__.py +183 -82
  32. fractal_server/app/runner/v2/_local_experimental/__init__.py +22 -12
  33. fractal_server/app/runner/v2/_local_experimental/executor.py +12 -8
  34. fractal_server/app/runner/v2/_slurm/__init__.py +1 -6
  35. fractal_server/app/runner/v2/_slurm_ssh/__init__.py +126 -0
  36. fractal_server/app/runner/v2/_slurm_ssh/_submit_setup.py +83 -0
  37. fractal_server/app/runner/v2/_slurm_ssh/get_slurm_config.py +182 -0
  38. fractal_server/app/runner/v2/runner_functions_low_level.py +9 -11
  39. fractal_server/app/runner/versions.py +30 -0
  40. fractal_server/app/schemas/v1/__init__.py +1 -0
  41. fractal_server/app/schemas/{state.py → v1/state.py} +4 -21
  42. fractal_server/app/schemas/v2/__init__.py +4 -1
  43. fractal_server/app/schemas/v2/task_collection.py +97 -27
  44. fractal_server/config.py +222 -21
  45. fractal_server/main.py +25 -1
  46. fractal_server/migrations/env.py +1 -1
  47. fractal_server/ssh/__init__.py +4 -0
  48. fractal_server/ssh/_fabric.py +190 -0
  49. fractal_server/tasks/utils.py +12 -64
  50. fractal_server/tasks/v1/background_operations.py +2 -2
  51. fractal_server/tasks/{endpoint_operations.py → v1/endpoint_operations.py} +7 -12
  52. fractal_server/tasks/v1/utils.py +67 -0
  53. fractal_server/tasks/v2/_TaskCollectPip.py +61 -32
  54. fractal_server/tasks/v2/_venv_pip.py +195 -0
  55. fractal_server/tasks/v2/background_operations.py +257 -295
  56. fractal_server/tasks/v2/background_operations_ssh.py +304 -0
  57. fractal_server/tasks/v2/endpoint_operations.py +136 -0
  58. fractal_server/tasks/v2/templates/_1_create_venv.sh +46 -0
  59. fractal_server/tasks/v2/templates/_2_upgrade_pip.sh +30 -0
  60. fractal_server/tasks/v2/templates/_3_pip_install.sh +32 -0
  61. fractal_server/tasks/v2/templates/_4_pip_freeze.sh +21 -0
  62. fractal_server/tasks/v2/templates/_5_pip_show.sh +59 -0
  63. fractal_server/tasks/v2/utils.py +54 -0
  64. {fractal_server-2.2.0a0.dist-info → fractal_server-2.3.0a0.dist-info}/METADATA +6 -2
  65. {fractal_server-2.2.0a0.dist-info → fractal_server-2.3.0a0.dist-info}/RECORD +68 -44
  66. fractal_server/tasks/v2/get_collection_data.py +0 -14
  67. {fractal_server-2.2.0a0.dist-info → fractal_server-2.3.0a0.dist-info}/LICENSE +0 -0
  68. {fractal_server-2.2.0a0.dist-info → fractal_server-2.3.0a0.dist-info}/WHEEL +0 -0
  69. {fractal_server-2.2.0a0.dist-info → fractal_server-2.3.0a0.dist-info}/entry_points.txt +0 -0
--- a/fractal_server/app/runner/v2/__init__.py
+++ b/fractal_server/app/runner/v2/__init__.py
@@ -5,14 +5,18 @@ This module is the single entry point to the runner backend subsystem V2.
 Other subystems should only import this module and not its submodules or
 the individual backends.
 """
+import logging
 import os
 import traceback
 from pathlib import Path
 from typing import Optional
 
+from fabric import Connection  # FIXME SSH: try/except import
+from sqlalchemy.orm import Session as DBSyncSession
 from sqlalchemy.orm.attributes import flag_modified
 
 from ....config import get_settings
+from ....logger import get_logger
 from ....logger import reset_logger_handlers
 from ....logger import set_logger
 from ....syringe import Inject
@@ -25,14 +29,15 @@ from ...models.v2 import WorkflowV2
 from ...schemas.v2 import JobStatusTypeV2
 from ..exceptions import JobExecutionError
 from ..exceptions import TaskExecutionError
-from ..executors.slurm._subprocess_run_as_user import _mkdir_as_user
+from ..executors.slurm.sudo._subprocess_run_as_user import _mkdir_as_user
 from ..filenames import WORKFLOW_LOG_FILENAME
 from ..task_files import task_subfolder_name
 from ._local import process_workflow as local_process_workflow
 from ._local_experimental import (
     process_workflow as local_experimental_process_workflow,
 )
-from ._slurm import process_workflow as slurm_process_workflow
+from ._slurm import process_workflow as slurm_sudo_process_workflow
+from ._slurm_ssh import process_workflow as slurm_ssh_process_workflow
 from .handle_failed_job import assemble_filters_failed_job
 from .handle_failed_job import assemble_history_failed_job
 from .handle_failed_job import assemble_images_failed_job
@@ -40,8 +45,30 @@ from fractal_server import __VERSION__
 
 _backends = {}
 _backends["local"] = local_process_workflow
+_backends["slurm"] = slurm_sudo_process_workflow
+_backends["slurm_ssh"] = slurm_ssh_process_workflow
 _backends["local_experimental"] = local_experimental_process_workflow
-_backends["slurm"] = slurm_process_workflow
+
+
+def fail_job(
+    *,
+    db: DBSyncSession,
+    job: JobV2,
+    log_msg: str,
+    logger_name: str,
+    emit_log: bool = False,
+) -> None:
+    logger = get_logger(logger_name=logger_name)
+    if emit_log:
+        logger.error(log_msg)
+    reset_logger_handlers(logger)
+    job.status = JobStatusTypeV2.FAILED
+    job.end_timestamp = get_timestamp()
+    job.log = log_msg
+    db.merge(job)
+    db.commit()
+    db.close()
+    return
 
 
 async def submit_workflow(
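
Note: this hunk lands two related refactors. The module-level _backends dict becomes the single registry of runner backends (used below to replace an if/elif chain with a dictionary lookup), and the new fail_job helper centralizes the job-failure bookkeeping (status, end timestamp, log, merge, commit, close) that was previously repeated at every error path. A self-contained sketch of the registry pattern, with hypothetical stand-in backends rather than the package's real callables:

    from typing import Callable

    def local_process_workflow(**kwargs) -> str:
        # Hypothetical stand-in for the real backend callables.
        return "processed locally"

    def slurm_ssh_process_workflow(**kwargs) -> str:
        return "processed on SLURM over SSH"

    _backends: dict[str, Callable[..., str]] = {
        "local": local_process_workflow,
        "slurm_ssh": slurm_ssh_process_workflow,
    }

    def select_backend(name: str) -> Callable[..., str]:
        try:
            return _backends[name]
        except KeyError as e:
            # fractal-server calls fail_job(...) and returns here, instead of raising.
            raise RuntimeError(f"Invalid FRACTAL_RUNNER_BACKEND={name!r}") from e

    if __name__ == "__main__":
        print(select_backend("local")())      # processed locally
        print(select_backend("slurm_ssh")())  # processed on SLURM over SSH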
@@ -52,6 +79,7 @@ async def submit_workflow(
     worker_init: Optional[str] = None,
     slurm_user: Optional[str] = None,
     user_cache_dir: Optional[str] = None,
+    connection: Optional[Connection] = None,
 ) -> None:
     """
     Prepares a workflow and applies it to a dataset
@@ -78,24 +106,36 @@ async def submit_workflow(
         The username to impersonate for the workflow execution, for the
         slurm backend.
     """
-
     # Declare runner backend and set `process_workflow` function
     settings = Inject(get_settings)
     FRACTAL_RUNNER_BACKEND = settings.FRACTAL_RUNNER_BACKEND
-    if FRACTAL_RUNNER_BACKEND == "local":
-        process_workflow = local_process_workflow
-    elif FRACTAL_RUNNER_BACKEND == "local_experimental":
-        process_workflow = local_experimental_process_workflow
-    elif FRACTAL_RUNNER_BACKEND == "slurm":
-        process_workflow = slurm_process_workflow
-    else:
-        raise RuntimeError(f"Invalid runner backend {FRACTAL_RUNNER_BACKEND=}")
+    logger_name = f"WF{workflow_id}_job{job_id}"
+    logger = set_logger(logger_name=logger_name)
 
     with next(DB.get_sync_db()) as db_sync:
 
         job: JobV2 = db_sync.get(JobV2, job_id)
         if not job:
-            raise ValueError(f"Cannot fetch job {job_id} from database")
+            logger.error(f"JobV2 {job_id} does not exist")
+            return
+
+        # Declare runner backend and set `process_workflow` function
+        settings = Inject(get_settings)
+        FRACTAL_RUNNER_BACKEND = settings.FRACTAL_RUNNER_BACKEND
+        try:
+            process_workflow = _backends[settings.FRACTAL_RUNNER_BACKEND]
+        except KeyError as e:
+            fail_job(
+                db=db_sync,
+                job=job,
+                log_msg=(
+                    f"Invalid {FRACTAL_RUNNER_BACKEND=}.\n"
+                    f"Original KeyError: {str(e)}"
+                ),
+                logger_name=logger_name,
+                emit_log=True,
+            )
+            return
 
         dataset: DatasetV2 = db_sync.get(DatasetV2, dataset_id)
         workflow: WorkflowV2 = db_sync.get(WorkflowV2, workflow_id)
@@ -107,61 +147,96 @@ async def submit_workflow(
                 log_msg += (
                     f"Cannot fetch workflow {workflow_id} from database\n"
                 )
-            job.status = JobStatusTypeV2.FAILED
-            job.end_timestamp = get_timestamp()
-            job.log = log_msg
-            db_sync.merge(job)
-            db_sync.commit()
-            db_sync.close()
+            fail_job(
+                db=db_sync, job=job, log_msg=log_msg, logger_name=logger_name
+            )
             return
 
         # Define and create server-side working folder
         WORKFLOW_DIR_LOCAL = Path(job.working_dir)
         if WORKFLOW_DIR_LOCAL.exists():
-            job.status = JobStatusTypeV2.FAILED
-            job.end_timestamp = get_timestamp()
-            job.log = f"Workflow dir {WORKFLOW_DIR_LOCAL} already exists."
-            db_sync.merge(job)
-            db_sync.commit()
-            db_sync.close()
+            fail_job(
+                db=db_sync,
+                job=job,
+                log_msg=f"Workflow dir {WORKFLOW_DIR_LOCAL} already exists.",
+                logger_name=logger_name,
+                emit_log=True,
+            )
             return
 
-        # Create WORKFLOW_DIR
-        original_umask = os.umask(0)
-        WORKFLOW_DIR_LOCAL.mkdir(parents=True, mode=0o755)
-        os.umask(original_umask)
-
-        # Define and create WORKFLOW_DIR_REMOTE
-        if FRACTAL_RUNNER_BACKEND == "local":
-            WORKFLOW_DIR_REMOTE = WORKFLOW_DIR_LOCAL
-        elif FRACTAL_RUNNER_BACKEND == "local_experimental":
-            WORKFLOW_DIR_REMOTE = WORKFLOW_DIR_LOCAL
-        elif FRACTAL_RUNNER_BACKEND == "slurm":
-            WORKFLOW_DIR_REMOTE = (
-                Path(user_cache_dir) / WORKFLOW_DIR_LOCAL.name
-            )
-            _mkdir_as_user(folder=str(WORKFLOW_DIR_REMOTE), user=slurm_user)
+        try:
 
-        # Create all tasks subfolders
-        for order in range(job.first_task_index, job.last_task_index + 1):
-            this_wftask = workflow.task_list[order]
-            if this_wftask.is_legacy_task:
-                task_name = this_wftask.task_legacy.name
-            else:
-                task_name = this_wftask.task.name
-            subfolder_name = task_subfolder_name(
-                order=order,
-                task_name=task_name,
-            )
+            # Create WORKFLOW_DIR_LOCAL
             original_umask = os.umask(0)
-            (WORKFLOW_DIR_LOCAL / subfolder_name).mkdir(mode=0o755)
+            WORKFLOW_DIR_LOCAL.mkdir(parents=True, mode=0o755)
             os.umask(original_umask)
-            if FRACTAL_RUNNER_BACKEND == "slurm":
+
+            # Define and create WORKFLOW_DIR_REMOTE
+            if FRACTAL_RUNNER_BACKEND == "local":
+                WORKFLOW_DIR_REMOTE = WORKFLOW_DIR_LOCAL
+            elif FRACTAL_RUNNER_BACKEND == "local_experimental":
+                WORKFLOW_DIR_REMOTE = WORKFLOW_DIR_LOCAL
+            elif FRACTAL_RUNNER_BACKEND == "slurm":
+                WORKFLOW_DIR_REMOTE = (
+                    Path(user_cache_dir) / WORKFLOW_DIR_LOCAL.name
+                )
                 _mkdir_as_user(
-                    folder=str(WORKFLOW_DIR_REMOTE / subfolder_name),
-                    user=slurm_user,
+                    folder=str(WORKFLOW_DIR_REMOTE), user=slurm_user
+                )
+            elif FRACTAL_RUNNER_BACKEND == "slurm_ssh":
+                WORKFLOW_DIR_REMOTE = (
+                    Path(settings.FRACTAL_SLURM_SSH_WORKING_BASE_DIR)
+                    / WORKFLOW_DIR_LOCAL.name
+                )
+                # FIXME SSH: move mkdir to executor, likely within handshake
+
+                from ....ssh._fabric import _mkdir_over_ssh
+
+                _mkdir_over_ssh(
+                    folder=str(WORKFLOW_DIR_REMOTE), connection=connection
+                )
+                logging.info(f"Created {str(WORKFLOW_DIR_REMOTE)} via SSH.")
+            else:
+                logging.error(
+                    "Invalid FRACTAL_RUNNER_BACKEND="
+                    f"{settings.FRACTAL_RUNNER_BACKEND}."
                 )
 
+            # Create all tasks subfolders
+            for order in range(job.first_task_index, job.last_task_index + 1):
+                this_wftask = workflow.task_list[order]
+                if this_wftask.is_legacy_task:
+                    task_name = this_wftask.task_legacy.name
+                else:
+                    task_name = this_wftask.task.name
+                subfolder_name = task_subfolder_name(
+                    order=order,
+                    task_name=task_name,
+                )
+                original_umask = os.umask(0)
+                (WORKFLOW_DIR_LOCAL / subfolder_name).mkdir(mode=0o755)
+                os.umask(original_umask)
+                if FRACTAL_RUNNER_BACKEND == "slurm":
+                    _mkdir_as_user(
+                        folder=str(WORKFLOW_DIR_REMOTE / subfolder_name),
+                        user=slurm_user,
+                    )
+                else:
+                    logging.info("Skip remote-subfolder creation")
+        except Exception as e:
+            error_type = type(e).__name__
+            fail_job(
+                db=db_sync,
+                job=job,
+                log_msg=(
+                    f"{error_type} error occurred while creating job folder "
+                    f"and subfolders.\nOriginal error: {str(e)}"
+                ),
+                logger_name=logger_name,
+                emit_log=True,
+            )
+            return
+
         # After Session.commit() is called, either explicitly or when using a
         # context manager, all objects associated with the Session are expired.
         # https://docs.sqlalchemy.org/en/14/orm/
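
Note: for the new slurm_ssh backend, the remote working directory is created over SSH with _mkdir_over_ssh from the new fractal_server/ssh/_fabric.py module, whose body is not shown in this diff. A minimal sketch of what such a helper could look like with Fabric; the implementation below is an assumption for illustration, not the package's actual code:

    import shlex

    from fabric import Connection

    def mkdir_over_ssh(*, folder: str, connection: Connection) -> None:
        # `mkdir -p` is idempotent and creates intermediate directories;
        # shlex.quote guards against shell-special characters in the path.
        connection.run(f"mkdir -p {shlex.quote(folder)}", hide=True)

    # Usage sketch (assumes key-based SSH auth to the configured SLURM host):
    # with Connection(host="slurm.example.org", user="fractal") as conn:
    #     mkdir_over_ssh(folder="/remote/base/job_0001", connection=conn)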
@@ -179,7 +254,6 @@ async def submit_workflow(
             db_sync.refresh(wftask)
 
         # Write logs
-        logger_name = f"WF{workflow_id}_job{job_id}"
         log_file_path = WORKFLOW_DIR_LOCAL / WORKFLOW_LOG_FILENAME
         logger = set_logger(
             logger_name=logger_name,
@@ -191,9 +265,17 @@ async def submit_workflow(
         )
         logger.debug(f"fractal_server.__VERSION__: {__VERSION__}")
         logger.debug(f"FRACTAL_RUNNER_BACKEND: {FRACTAL_RUNNER_BACKEND}")
-        logger.debug(f"slurm_user: {slurm_user}")
-        logger.debug(f"slurm_account: {job.slurm_account}")
-        logger.debug(f"worker_init: {worker_init}")
+        if FRACTAL_RUNNER_BACKEND == "slurm":
+            logger.debug(f"slurm_user: {slurm_user}")
+            logger.debug(f"slurm_account: {job.slurm_account}")
+            logger.debug(f"worker_init: {worker_init}")
+        elif FRACTAL_RUNNER_BACKEND == "slurm_ssh":
+            logger.debug(f"ssh_host: {settings.FRACTAL_SLURM_SSH_HOST}")
+            logger.debug(f"ssh_user: {settings.FRACTAL_SLURM_SSH_USER}")
+            logger.debug(
+                f"base dir: {settings.FRACTAL_SLURM_SSH_WORKING_BASE_DIR}"
+            )
+            logger.debug(f"worker_init: {worker_init}")
         logger.debug(f"job.id: {job.id}")
         logger.debug(f"job.working_dir: {job.working_dir}")
         logger.debug(f"job.working_dir_user: {job.working_dir_user}")
@@ -202,6 +284,27 @@ async def submit_workflow(
         logger.debug(f'START workflow "{workflow.name}"')
 
         try:
+            if FRACTAL_RUNNER_BACKEND == "local":
+                process_workflow = local_process_workflow
+                backend_specific_kwargs = {}
+            elif FRACTAL_RUNNER_BACKEND == "local_experimental":
+                process_workflow = local_experimental_process_workflow
+                backend_specific_kwargs = {}
+            elif FRACTAL_RUNNER_BACKEND == "slurm":
+                process_workflow = slurm_sudo_process_workflow
+                backend_specific_kwargs = dict(
+                    slurm_user=slurm_user,
+                    slurm_account=job.slurm_account,
+                    user_cache_dir=user_cache_dir,
+                )
+            elif FRACTAL_RUNNER_BACKEND == "slurm_ssh":
+                process_workflow = slurm_ssh_process_workflow
+                backend_specific_kwargs = dict(connection=connection)
+            else:
+                raise RuntimeError(
+                    f"Invalid runner backend {FRACTAL_RUNNER_BACKEND=}"
+                )
+
             # "The Session.close() method does not prevent the Session from being
             # used again. The Session itself does not actually have a distinct
             # “closed” state; it merely means the Session will release all database
@@ -218,15 +321,13 @@ async def submit_workflow(
             new_dataset_attributes = await process_workflow(
                 workflow=workflow,
                 dataset=dataset,
-                slurm_user=slurm_user,
-                slurm_account=job.slurm_account,
-                user_cache_dir=user_cache_dir,
                 workflow_dir_local=WORKFLOW_DIR_LOCAL,
                 workflow_dir_remote=WORKFLOW_DIR_REMOTE,
                 logger_name=logger_name,
                 worker_init=worker_init,
                 first_task_index=job.first_task_index,
                 last_task_index=job.last_task_index,
+                **backend_specific_kwargs,
             )
 
             logger.info(
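
Note: the backend-specific arguments (slurm_user, slurm_account, user_cache_dir, connection) no longer appear in the shared call; they are collected into backend_specific_kwargs in the previous hunk and forwarded here with **. A self-contained toy version of this forwarding pattern, with hypothetical backends:

    from typing import Any, Callable

    def process_local(*, data: str) -> str:
        return f"local({data})"

    def process_slurm(*, data: str, slurm_user: str, slurm_account: str) -> str:
        return f"slurm({data}) as {slurm_user}/{slurm_account}"

    def submit(backend: str, data: str) -> str:
        process: Callable[..., str]
        backend_specific_kwargs: dict[str, Any]
        if backend == "local":
            process = process_local
            backend_specific_kwargs = {}
        elif backend == "slurm":
            process = process_slurm
            backend_specific_kwargs = dict(slurm_user="alice", slurm_account="lab")
        else:
            raise RuntimeError(f"Invalid runner backend {backend!r}")
        # One shared call site; per-backend arguments are forwarded transparently.
        return process(data=data, **backend_specific_kwargs)

    if __name__ == "__main__":
        print(submit("local", "x"))  # local(x)
        print(submit("slurm", "x"))  # slurm(x) as alice/lab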
@@ -275,18 +376,14 @@ async def submit_workflow(
             dataset.images = latest_images
             db_sync.merge(dataset)
 
-            job.status = JobStatusTypeV2.FAILED
-            job.end_timestamp = get_timestamp()
-
             exception_args_string = "\n".join(e.args)
-            job.log = (
+            log_msg = (
                 f"TASK ERROR: "
                 f"Task name: {e.task_name}, "
                 f"position in Workflow: {e.workflow_task_order}\n"
                 f"TRACEBACK:\n{exception_args_string}"
             )
-            db_sync.merge(job)
-            db_sync.commit()
+            fail_job(db=db_sync, job=job, log_msg=log_msg, logger_name=logger_name)
 
         except JobExecutionError as e:
 
@@ -309,12 +406,15 @@ async def submit_workflow(
             dataset.images = latest_images
             db_sync.merge(dataset)
 
-            job.status = JobStatusTypeV2.FAILED
-            job.end_timestamp = get_timestamp()
-            error = e.assemble_error()
-            job.log = f"JOB ERROR in Fractal job {job.id}:\nTRACEBACK:\n{error}"
-            db_sync.merge(job)
-            db_sync.commit()
+            fail_job(
+                db=db_sync,
+                job=job,
+                log_msg=(
+                    f"JOB ERROR in Fractal job {job.id}:\n"
+                    f"TRACEBACK:\n{e.assemble_error()}"
+                ),
+                logger_name=logger_name,
+            )
 
         except Exception:
 
@@ -338,15 +438,16 @@ async def submit_workflow(
             if latest_images is not None:
                 dataset.images = latest_images
             db_sync.merge(dataset)
-
-            job.status = JobStatusTypeV2.FAILED
-            job.end_timestamp = get_timestamp()
-            job.log = (
-                f"UNKNOWN ERROR in Fractal job {job.id}\n"
-                f"TRACEBACK:\n{current_traceback}"
+            fail_job(
+                db=db_sync,
+                job=job,
+                log_msg=(
+                    f"UNKNOWN ERROR in Fractal job {job.id}\n"
+                    f"TRACEBACK:\n{current_traceback}"
+                ),
+                logger_name=logger_name,
             )
-            db_sync.merge(job)
-            db_sync.commit()
+
         finally:
             reset_logger_handlers(logger)
             db_sync.close()
--- a/fractal_server/app/runner/v2/_local_experimental/__init__.py
+++ b/fractal_server/app/runner/v2/_local_experimental/__init__.py
@@ -1,9 +1,11 @@
+from concurrent.futures.process import BrokenProcessPool
 from pathlib import Path
 from typing import Optional
 
 from ....models.v2 import DatasetV2
 from ....models.v2 import WorkflowV2
 from ...async_wrap import async_wrap
+from ...exceptions import JobExecutionError
 from ...filenames import SHUTDOWN_FILENAME
 from ...set_start_and_last_task_index import set_start_and_last_task_index
 from ..runner import execute_tasks_v2
@@ -29,21 +31,29 @@ def _process_workflow(
     [process_workflow][fractal_server.app.runner.v2._local_experimental.process_workflow]
     for the call signature.
     """
-
     with FractalProcessPoolExecutor(
         shutdown_file=workflow_dir_local / SHUTDOWN_FILENAME
     ) as executor:
-        new_dataset_attributes = execute_tasks_v2(
-            wf_task_list=workflow.task_list[
-                first_task_index : (last_task_index + 1)  # noqa
-            ],  # noqa
-            dataset=dataset,
-            executor=executor,
-            workflow_dir_local=workflow_dir_local,
-            workflow_dir_remote=workflow_dir_local,
-            logger_name=logger_name,
-            submit_setup_call=_local_submit_setup,
-        )
+        try:
+            new_dataset_attributes = execute_tasks_v2(
+                wf_task_list=workflow.task_list[
+                    first_task_index : (last_task_index + 1)  # noqa
+                ],
+                dataset=dataset,
+                executor=executor,
+                workflow_dir_local=workflow_dir_local,
+                workflow_dir_remote=workflow_dir_local,
+                logger_name=logger_name,
+                submit_setup_call=_local_submit_setup,
+            )
+        except BrokenProcessPool as e:
+            raise JobExecutionError(
+                info=(
+                    "Job failed with BrokenProcessPool error, likely due to "
+                    f"an executor shutdown.\nOriginal error:\n{e.args[0]}"
+                )
+            )
+
     return new_dataset_attributes
 
 
--- a/fractal_server/app/runner/v2/_local_experimental/executor.py
+++ b/fractal_server/app/runner/v2/_local_experimental/executor.py
@@ -2,8 +2,6 @@
 Custom version of Python
 [ProcessPoolExecutor](https://docs.python.org/3/library/concurrent.futures.html#concurrent.futures.ProcessPoolExecutor)).
 """
-import os
-import signal
 import threading
 import time
 from concurrent.futures import ProcessPoolExecutor
@@ -14,13 +12,14 @@ from typing import Iterable
 from typing import Optional
 from typing import Sequence
 
+import psutil
+
 from ._local_config import get_default_local_backend_config
 from ._local_config import LocalBackendConfig
 from fractal_server.app.runner.exceptions import JobExecutionError
-from fractal_server.logger import get_logger
-
+from fractal_server.logger import set_logger
 
-logger = get_logger("FractalProcessPoolExecutor")
+logger = set_logger("FractalProcessPoolExecutor")
 
 
 class FractalProcessPoolExecutor(ProcessPoolExecutor):
@@ -66,12 +65,17 @@ class FractalProcessPoolExecutor(ProcessPoolExecutor):
         """
         Running on '_shutdown_file_thread'.
         """
+
         logger.info("Start terminating FractalProcessPoolExecutor processes.")
+        # We use 'psutil' in order to easily access the PIDs of the children.
         if self._processes is not None:
             for pid in self._processes.keys():
-                logger.debug(f"Sending SIGTERM to process {pid}")
-                os.kill(pid, signal.SIGTERM)
-                logger.debug(f"Process {pid} terminated.")
+                parent = psutil.Process(pid)
+                children = parent.children(recursive=True)
+                for child in children:
+                    child.kill()
+                parent.kill()
+                logger.info(f"Process {pid} and its children terminated.")
         logger.info("FractalProcessPoolExecutor processes terminated.")
 
     def shutdown(self, *args, **kwargs) -> None:
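
Note: the previous implementation sent SIGTERM to each pool worker via os.kill; the new one uses psutil to walk each worker's process tree and kill descendants first, presumably so that subprocesses spawned by tasks do not outlive the shutdown. A standalone demonstration of the psutil calls used above (the worker here is a hypothetical stand-in):

    import multiprocessing
    import time

    import psutil

    def worker() -> None:
        time.sleep(60)  # stands in for a long-running pool worker

    if __name__ == "__main__":
        proc = multiprocessing.Process(target=worker)
        proc.start()
        parent = psutil.Process(proc.pid)
        # Kill the whole tree bottom-up: descendants first, then the worker.
        for child in parent.children(recursive=True):
            child.kill()
        parent.kill()
        proc.join()
        print(f"worker exit code: {proc.exitcode}")  # negative: killed by a signal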
--- a/fractal_server/app/runner/v2/_slurm/__init__.py
+++ b/fractal_server/app/runner/v2/_slurm/__init__.py
@@ -24,16 +24,11 @@ from typing import Union
 from ....models.v2 import DatasetV2
 from ....models.v2 import WorkflowV2
 from ...async_wrap import async_wrap
-from ...executors.slurm.executor import FractalSlurmExecutor
+from ...executors.slurm.sudo.executor import FractalSlurmExecutor
 from ...set_start_and_last_task_index import set_start_and_last_task_index
 from ..runner import execute_tasks_v2
 from ._submit_setup import _slurm_submit_setup
 
-# from .._common import execute_tasks
-# from ..common import async_wrap
-# from ..common import set_start_and_last_task_index
-# from ..common import TaskParameters
-
 
 def _process_workflow(
     *,
--- /dev/null
+++ b/fractal_server/app/runner/v2/_slurm_ssh/__init__.py
@@ -0,0 +1,126 @@
+# Copyright 2022 (C) Friedrich Miescher Institute for Biomedical Research and
+# University of Zurich
+#
+# Original authors:
+# Jacopo Nespolo <jacopo.nespolo@exact-lab.it>
+# Tommaso Comparin <tommaso.comparin@exact-lab.it>
+# Marco Franzon <marco.franzon@exact-lab.it>
+#
+# This file is part of Fractal and was originally developed by eXact lab S.r.l.
+# <exact-lab.it> under contract with Liberali Lab from the Friedrich Miescher
+# Institute for Biomedical Research and Pelkmans Lab from the University of
+# Zurich.
+"""
+Slurm Bakend
+
+This backend runs fractal workflows in a SLURM cluster using Clusterfutures
+Executor objects.
+"""
+from pathlib import Path
+from typing import Any
+from typing import Optional
+from typing import Union
+
+from fabric import Connection
+
+from ....models.v2 import DatasetV2
+from ....models.v2 import WorkflowV2
+from ...async_wrap import async_wrap
+from ...executors.slurm.ssh.executor import FractalSlurmSSHExecutor
+from ...set_start_and_last_task_index import set_start_and_last_task_index
+from ..runner import execute_tasks_v2
+from ._submit_setup import _slurm_submit_setup
+
+
+def _process_workflow(
+    *,
+    workflow: WorkflowV2,
+    dataset: DatasetV2,
+    logger_name: str,
+    workflow_dir_local: Path,
+    workflow_dir_remote: Path,
+    first_task_index: int,
+    last_task_index: int,
+    connection: Connection,
+    worker_init: Optional[Union[str, list[str]]] = None,
+) -> dict[str, Any]:
+    """
+    Internal processing routine for the SLURM backend
+
+    This function initialises the a FractalSlurmExecutor, setting logging,
+    workflow working dir and user to impersonate. It then schedules the
+    workflow tasks and returns the new dataset attributes
+
+    Cf.
+    [process_workflow][fractal_server.app.runner.v2._local.process_workflow]
+
+    Returns:
+        new_dataset_attributes:
+    """
+
+    if isinstance(worker_init, str):
+        worker_init = worker_init.split("\n")
+
+    with FractalSlurmSSHExecutor(
+        connection=connection,
+        workflow_dir_local=workflow_dir_local,
+        workflow_dir_remote=workflow_dir_remote,
+        common_script_lines=worker_init,
+    ) as executor:
+        new_dataset_attributes = execute_tasks_v2(
+            wf_task_list=workflow.task_list[
+                first_task_index : (last_task_index + 1)  # noqa
+            ],  # noqa
+            dataset=dataset,
+            executor=executor,
+            workflow_dir_local=workflow_dir_local,
+            workflow_dir_remote=workflow_dir_remote,
+            logger_name=logger_name,
+            submit_setup_call=_slurm_submit_setup,
+        )
+    return new_dataset_attributes
+
+
+async def process_workflow(
+    *,
+    workflow: WorkflowV2,
+    dataset: DatasetV2,
+    workflow_dir_local: Path,
+    workflow_dir_remote: Optional[Path] = None,
+    first_task_index: Optional[int] = None,
+    last_task_index: Optional[int] = None,
+    logger_name: str,
+    # Not used
+    connection: Connection,
+    user_cache_dir: Optional[str] = None,
+    slurm_user: Optional[str] = None,
+    slurm_account: Optional[str] = None,
+    worker_init: Optional[str] = None,
+) -> dict:
+    """
+    Process workflow (SLURM backend public interface)
+
+    Cf.
+    [process_workflow][fractal_server.app.runner.v2._local.process_workflow]
+    """
+
+    # Set values of first_task_index and last_task_index
+    num_tasks = len(workflow.task_list)
+    first_task_index, last_task_index = set_start_and_last_task_index(
+        num_tasks,
+        first_task_index=first_task_index,
+        last_task_index=last_task_index,
+    )
+
+    new_dataset_attributes = await async_wrap(_process_workflow)(
+        workflow=workflow,
+        dataset=dataset,
+        logger_name=logger_name,
+        workflow_dir_local=workflow_dir_local,
+        workflow_dir_remote=workflow_dir_remote,
+        first_task_index=first_task_index,
+        last_task_index=last_task_index,
+        worker_init=worker_init,
+        connection=connection,
+    )
+    return new_dataset_attributes
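
Note: as in the other backends, the public process_workflow coroutine delegates to the synchronous _process_workflow through async_wrap, so the blocking executor work runs off the event loop. async_wrap lives in fractal_server/app/runner/async_wrap.py and is not part of this diff; a common shape for such a wrapper, shown here as a sketch rather than the package's exact code:

    import asyncio
    import functools
    from typing import Any, Callable, Coroutine

    def async_wrap(func: Callable[..., Any]) -> Callable[..., Coroutine[Any, Any, Any]]:
        # Run a blocking callable in the default thread pool so that awaiting
        # it does not block the event loop.
        @functools.wraps(func)
        async def run(*args: Any, **kwargs: Any) -> Any:
            loop = asyncio.get_running_loop()
            pfunc = functools.partial(func, *args, **kwargs)
            return await loop.run_in_executor(None, pfunc)

        return run

    # Usage, mirroring the call above:
    #     new_dataset_attributes = await async_wrap(_process_workflow)(...)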