fractal-server 1.4.6__py3-none-any.whl → 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (139)
  1. fractal_server/__init__.py +1 -1
  2. fractal_server/app/db/__init__.py +0 -1
  3. fractal_server/app/models/__init__.py +6 -8
  4. fractal_server/app/models/linkuserproject.py +9 -0
  5. fractal_server/app/models/security.py +6 -0
  6. fractal_server/app/models/v1/__init__.py +12 -0
  7. fractal_server/app/models/{dataset.py → v1/dataset.py} +5 -5
  8. fractal_server/app/models/{job.py → v1/job.py} +5 -5
  9. fractal_server/app/models/{project.py → v1/project.py} +5 -5
  10. fractal_server/app/models/{state.py → v1/state.py} +2 -2
  11. fractal_server/app/models/{task.py → v1/task.py} +7 -2
  12. fractal_server/app/models/{workflow.py → v1/workflow.py} +5 -5
  13. fractal_server/app/models/v2/__init__.py +22 -0
  14. fractal_server/app/models/v2/collection_state.py +21 -0
  15. fractal_server/app/models/v2/dataset.py +54 -0
  16. fractal_server/app/models/v2/job.py +51 -0
  17. fractal_server/app/models/v2/project.py +30 -0
  18. fractal_server/app/models/v2/task.py +93 -0
  19. fractal_server/app/models/v2/workflow.py +35 -0
  20. fractal_server/app/models/v2/workflowtask.py +49 -0
  21. fractal_server/app/routes/admin/__init__.py +0 -0
  22. fractal_server/app/routes/{admin.py → admin/v1.py} +42 -42
  23. fractal_server/app/routes/admin/v2.py +309 -0
  24. fractal_server/app/routes/api/v1/__init__.py +7 -7
  25. fractal_server/app/routes/api/v1/_aux_functions.py +8 -8
  26. fractal_server/app/routes/api/v1/dataset.py +48 -41
  27. fractal_server/app/routes/api/v1/job.py +14 -14
  28. fractal_server/app/routes/api/v1/project.py +30 -27
  29. fractal_server/app/routes/api/v1/task.py +26 -16
  30. fractal_server/app/routes/api/v1/task_collection.py +28 -16
  31. fractal_server/app/routes/api/v1/workflow.py +28 -28
  32. fractal_server/app/routes/api/v1/workflowtask.py +11 -11
  33. fractal_server/app/routes/api/v2/__init__.py +34 -0
  34. fractal_server/app/routes/api/v2/_aux_functions.py +502 -0
  35. fractal_server/app/routes/api/v2/dataset.py +293 -0
  36. fractal_server/app/routes/api/v2/images.py +279 -0
  37. fractal_server/app/routes/api/v2/job.py +200 -0
  38. fractal_server/app/routes/api/v2/project.py +186 -0
  39. fractal_server/app/routes/api/v2/status.py +150 -0
  40. fractal_server/app/routes/api/v2/submit.py +210 -0
  41. fractal_server/app/routes/api/v2/task.py +222 -0
  42. fractal_server/app/routes/api/v2/task_collection.py +239 -0
  43. fractal_server/app/routes/api/v2/task_legacy.py +59 -0
  44. fractal_server/app/routes/api/v2/workflow.py +380 -0
  45. fractal_server/app/routes/api/v2/workflowtask.py +265 -0
  46. fractal_server/app/routes/aux/_job.py +2 -2
  47. fractal_server/app/runner/__init__.py +0 -379
  48. fractal_server/app/runner/async_wrap.py +27 -0
  49. fractal_server/app/runner/components.py +5 -0
  50. fractal_server/app/runner/exceptions.py +129 -0
  51. fractal_server/app/runner/executors/__init__.py +0 -0
  52. fractal_server/app/runner/executors/slurm/__init__.py +3 -0
  53. fractal_server/app/runner/{_slurm → executors/slurm}/_batching.py +1 -1
  54. fractal_server/app/runner/executors/slurm/_check_jobs_status.py +72 -0
  55. fractal_server/app/runner/{_slurm → executors/slurm}/_executor_wait_thread.py +3 -4
  56. fractal_server/app/runner/{_slurm → executors/slurm}/_slurm_config.py +3 -152
  57. fractal_server/app/runner/{_slurm → executors/slurm}/_subprocess_run_as_user.py +42 -1
  58. fractal_server/app/runner/{_slurm → executors/slurm}/executor.py +46 -27
  59. fractal_server/app/runner/filenames.py +6 -0
  60. fractal_server/app/runner/set_start_and_last_task_index.py +39 -0
  61. fractal_server/app/runner/task_files.py +103 -0
  62. fractal_server/app/runner/v1/__init__.py +366 -0
  63. fractal_server/app/runner/{_common.py → v1/_common.py} +56 -111
  64. fractal_server/app/runner/{_local → v1/_local}/__init__.py +5 -4
  65. fractal_server/app/runner/{_local → v1/_local}/_local_config.py +6 -7
  66. fractal_server/app/runner/{_local → v1/_local}/_submit_setup.py +1 -5
  67. fractal_server/app/runner/v1/_slurm/__init__.py +312 -0
  68. fractal_server/app/runner/{_slurm → v1/_slurm}/_submit_setup.py +5 -11
  69. fractal_server/app/runner/v1/_slurm/get_slurm_config.py +163 -0
  70. fractal_server/app/runner/v1/common.py +117 -0
  71. fractal_server/app/runner/{handle_failed_job.py → v1/handle_failed_job.py} +8 -8
  72. fractal_server/app/runner/v2/__init__.py +336 -0
  73. fractal_server/app/runner/v2/_local/__init__.py +162 -0
  74. fractal_server/app/runner/v2/_local/_local_config.py +118 -0
  75. fractal_server/app/runner/v2/_local/_submit_setup.py +52 -0
  76. fractal_server/app/runner/v2/_local/executor.py +100 -0
  77. fractal_server/app/runner/{_slurm → v2/_slurm}/__init__.py +38 -47
  78. fractal_server/app/runner/v2/_slurm/_submit_setup.py +82 -0
  79. fractal_server/app/runner/v2/_slurm/get_slurm_config.py +182 -0
  80. fractal_server/app/runner/v2/deduplicate_list.py +23 -0
  81. fractal_server/app/runner/v2/handle_failed_job.py +165 -0
  82. fractal_server/app/runner/v2/merge_outputs.py +38 -0
  83. fractal_server/app/runner/v2/runner.py +343 -0
  84. fractal_server/app/runner/v2/runner_functions.py +374 -0
  85. fractal_server/app/runner/v2/runner_functions_low_level.py +130 -0
  86. fractal_server/app/runner/v2/task_interface.py +62 -0
  87. fractal_server/app/runner/v2/v1_compat.py +31 -0
  88. fractal_server/app/schemas/__init__.py +1 -42
  89. fractal_server/app/schemas/_validators.py +28 -5
  90. fractal_server/app/schemas/v1/__init__.py +36 -0
  91. fractal_server/app/schemas/{applyworkflow.py → v1/applyworkflow.py} +18 -18
  92. fractal_server/app/schemas/{dataset.py → v1/dataset.py} +30 -30
  93. fractal_server/app/schemas/{dumps.py → v1/dumps.py} +8 -8
  94. fractal_server/app/schemas/{manifest.py → v1/manifest.py} +5 -5
  95. fractal_server/app/schemas/{project.py → v1/project.py} +9 -9
  96. fractal_server/app/schemas/{task.py → v1/task.py} +12 -12
  97. fractal_server/app/schemas/{task_collection.py → v1/task_collection.py} +7 -7
  98. fractal_server/app/schemas/{workflow.py → v1/workflow.py} +38 -38
  99. fractal_server/app/schemas/v2/__init__.py +37 -0
  100. fractal_server/app/schemas/v2/dataset.py +126 -0
  101. fractal_server/app/schemas/v2/dumps.py +87 -0
  102. fractal_server/app/schemas/v2/job.py +114 -0
  103. fractal_server/app/schemas/v2/manifest.py +159 -0
  104. fractal_server/app/schemas/v2/project.py +34 -0
  105. fractal_server/app/schemas/v2/status.py +16 -0
  106. fractal_server/app/schemas/v2/task.py +151 -0
  107. fractal_server/app/schemas/v2/task_collection.py +109 -0
  108. fractal_server/app/schemas/v2/workflow.py +79 -0
  109. fractal_server/app/schemas/v2/workflowtask.py +208 -0
  110. fractal_server/config.py +13 -10
  111. fractal_server/images/__init__.py +4 -0
  112. fractal_server/images/models.py +136 -0
  113. fractal_server/images/tools.py +84 -0
  114. fractal_server/main.py +11 -3
  115. fractal_server/migrations/env.py +0 -2
  116. fractal_server/migrations/versions/5bf02391cfef_v2.py +245 -0
  117. fractal_server/tasks/__init__.py +0 -5
  118. fractal_server/tasks/endpoint_operations.py +13 -19
  119. fractal_server/tasks/utils.py +35 -0
  120. fractal_server/tasks/{_TaskCollectPip.py → v1/_TaskCollectPip.py} +3 -3
  121. fractal_server/tasks/v1/__init__.py +0 -0
  122. fractal_server/tasks/{background_operations.py → v1/background_operations.py} +20 -52
  123. fractal_server/tasks/v1/get_collection_data.py +14 -0
  124. fractal_server/tasks/v2/_TaskCollectPip.py +103 -0
  125. fractal_server/tasks/v2/__init__.py +0 -0
  126. fractal_server/tasks/v2/background_operations.py +381 -0
  127. fractal_server/tasks/v2/get_collection_data.py +14 -0
  128. fractal_server/urls.py +13 -0
  129. {fractal_server-1.4.6.dist-info → fractal_server-2.0.0.dist-info}/METADATA +11 -12
  130. fractal_server-2.0.0.dist-info/RECORD +169 -0
  131. fractal_server/app/runner/_slurm/.gitignore +0 -2
  132. fractal_server/app/runner/common.py +0 -307
  133. fractal_server/app/schemas/json_schemas/manifest.json +0 -81
  134. fractal_server-1.4.6.dist-info/RECORD +0 -97
  135. /fractal_server/app/runner/{_slurm → executors/slurm}/remote.py +0 -0
  136. /fractal_server/app/runner/{_local → v1/_local}/executor.py +0 -0
  137. {fractal_server-1.4.6.dist-info → fractal_server-2.0.0.dist-info}/LICENSE +0 -0
  138. {fractal_server-1.4.6.dist-info → fractal_server-2.0.0.dist-info}/WHEEL +0 -0
  139. {fractal_server-1.4.6.dist-info → fractal_server-2.0.0.dist-info}/entry_points.txt +0 -0
fractal_server/app/routes/aux/_job.py
@@ -3,8 +3,8 @@ from pathlib import Path
  from zipfile import ZIP_DEFLATED
  from zipfile import ZipFile

- from ...models import ApplyWorkflow
- from ...runner._common import SHUTDOWN_FILENAME
+ from ...models.v1 import ApplyWorkflow
+ from ...runner.filenames import SHUTDOWN_FILENAME


  def _write_shutdown_file(*, job: ApplyWorkflow):
fractal_server/app/runner/__init__.py
@@ -1,379 +0,0 @@
- # Copyright 2022 (C) Friedrich Miescher Institute for Biomedical Research and
- # University of Zurich
- #
- # Original authors:
- # Jacopo Nespolo <jacopo.nespolo@exact-lab.it>
- # Tommaso Comparin <tommaso.comparin@exact-lab.it>
- # Marco Franzon <marco.franzon@exact-lab.it>
- #
- # This file is part of Fractal and was originally developed by eXact lab S.r.l.
- # <exact-lab.it> under contract with Liberali Lab from the Friedrich Miescher
- # Institute for Biomedical Research and Pelkmans Lab from the University of
- # Zurich.
- """
- Runner backend subsystem root
-
- This module is the single entry point to the runner backend subsystem. Other
- subystems should only import this module and not its submodules or the
- individual backends.
- """
- import os
- import traceback
- from pathlib import Path
- from typing import Optional
-
- from ... import __VERSION__
- from ...config import get_settings
- from ...logger import set_logger
- from ...syringe import Inject
- from ...utils import get_timestamp
- from ..db import DB
- from ..models import ApplyWorkflow
- from ..models import Dataset
- from ..models import Workflow
- from ..models import WorkflowTask
- from ..schemas import JobStatusType
- from ._common import WORKFLOW_LOG_FILENAME
- from ._local import process_workflow as local_process_workflow
- from .common import close_job_logger
- from .common import JobExecutionError
- from .common import TaskExecutionError
- from .common import validate_workflow_compatibility # noqa: F401
- from .handle_failed_job import assemble_history_failed_job
- from .handle_failed_job import assemble_meta_failed_job
-
-
- _backends = {}
- _backend_errors: dict[str, Exception] = {}
- _backends["local"] = local_process_workflow
-
- try:
-     from ._slurm import process_workflow as slurm_process_workflow
-
-     _backends["slurm"] = slurm_process_workflow
- except ModuleNotFoundError as e:
-     _backend_errors["slurm"] = e
-
-
- def get_process_workflow():
-     settings = Inject(get_settings)
-     try:
-         process_workflow = _backends[settings.FRACTAL_RUNNER_BACKEND]
-     except KeyError:
-         raise _backend_errors.get(
-             settings.FRACTAL_RUNNER_BACKEND,
-             RuntimeError(
-                 "Unknown error during collection of backend "
-                 f"`{settings.FRACTAL_RUNNER_BACKEND}`"
-             ),
-         )
-     return process_workflow
-
-
- async def submit_workflow(
-     *,
-     workflow_id: int,
-     input_dataset_id: int,
-     output_dataset_id: int,
-     job_id: int,
-     worker_init: Optional[str] = None,
-     slurm_user: Optional[str] = None,
-     user_cache_dir: Optional[str] = None,
- ) -> None:
-     """
-     Prepares a workflow and applies it to a dataset
-
-     This function wraps the process_workflow one, which is different for each
-     backend (e.g. local or slurm backend).
-
-     Args:
-         workflow_id:
-             ID of the workflow being applied
-         input_dataset_id:
-             Input dataset ID
-         output_dataset_id:
-             ID of the destination dataset of the workflow.
-         job_id:
-             Id of the job record which stores the state for the current
-             workflow application.
-         worker_init:
-             Custom executor parameters that get parsed before the execution of
-             each task.
-         user_cache_dir:
-             Cache directory (namely a path where the user can write); for the
-             slurm backend, this is used as a base directory for
-             `job.working_dir_user`.
-         slurm_user:
-             The username to impersonate for the workflow execution, for the
-             slurm backend.
-     """
-     with next(DB.get_sync_db()) as db_sync:
-
-         job: ApplyWorkflow = db_sync.get(ApplyWorkflow, job_id)
-         if not job:
-             raise ValueError(f"Cannot fetch job {job_id} from database")
-
-         input_dataset: Dataset = db_sync.get(Dataset, input_dataset_id)
-         output_dataset: Dataset = db_sync.get(Dataset, output_dataset_id)
-         workflow: Workflow = db_sync.get(Workflow, workflow_id)
-         if not (input_dataset and output_dataset and workflow):
-             log_msg = ""
-             if not input_dataset:
-                 log_msg += (
-                     f"Cannot fetch input_dataset {input_dataset_id} "
-                     "from database\n"
-                 )
-             if not output_dataset:
-                 log_msg += (
-                     f"Cannot fetch output_dataset {output_dataset_id} "
-                     "from database\n"
-                 )
-             if not workflow:
-                 log_msg += (
-                     f"Cannot fetch workflow {workflow_id} from database\n"
-                 )
-             job.status = JobStatusType.FAILED
-             job.end_timestamp = get_timestamp()
-             job.log = log_msg
-             db_sync.merge(job)
-             db_sync.commit()
-             db_sync.close()
-             return
-
-         # Select backend
-         settings = Inject(get_settings)
-         FRACTAL_RUNNER_BACKEND = settings.FRACTAL_RUNNER_BACKEND
-         process_workflow = get_process_workflow()
-
-         # Prepare some of process_workflow arguments
-         input_paths = input_dataset.paths
-         output_path = output_dataset.paths[0]
-
-         # Define and create server-side working folder
-         project_id = workflow.project_id
-         timestamp_string = get_timestamp().strftime("%Y%m%d_%H%M%S")
-         WORKFLOW_DIR = (
-             settings.FRACTAL_RUNNER_WORKING_BASE_DIR
-             / (
-                 f"proj_{project_id:07d}_wf_{workflow_id:07d}_job_{job_id:07d}"
-                 f"_{timestamp_string}"
-             )
-         ).resolve()
-
-         if WORKFLOW_DIR.exists():
-             raise RuntimeError(f"Workflow dir {WORKFLOW_DIR} already exists.")
-
-         # Create WORKFLOW_DIR with 755 permissions
-         original_umask = os.umask(0)
-         WORKFLOW_DIR.mkdir(parents=True, mode=0o755)
-         os.umask(original_umask)
-
-         # Define and create user-side working folder, if needed
-         if FRACTAL_RUNNER_BACKEND == "local":
-             WORKFLOW_DIR_USER = WORKFLOW_DIR
-         elif FRACTAL_RUNNER_BACKEND == "slurm":
-
-             from ._slurm._subprocess_run_as_user import _mkdir_as_user
-
-             WORKFLOW_DIR_USER = (
-                 Path(user_cache_dir) / f"{WORKFLOW_DIR.name}"
-             ).resolve()
-             _mkdir_as_user(folder=str(WORKFLOW_DIR_USER), user=slurm_user)
-         else:
-             raise ValueError(f"{FRACTAL_RUNNER_BACKEND=} not supported")
-
-         # Update db
-         job.working_dir = WORKFLOW_DIR.as_posix()
-         job.working_dir_user = WORKFLOW_DIR_USER.as_posix()
-         db_sync.merge(job)
-         db_sync.commit()
-
-         # After Session.commit() is called, either explicitly or when using a
-         # context manager, all objects associated with the Session are expired.
-         # https://docs.sqlalchemy.org/en/14/orm/
-         # session_basics.html#opening-and-closing-a-session
-         # https://docs.sqlalchemy.org/en/14/orm/
-         # session_state_management.html#refreshing-expiring
-
-         # See issue #928:
-         # https://github.com/fractal-analytics-platform/
-         # fractal-server/issues/928
-
-         db_sync.refresh(input_dataset)
-         db_sync.refresh(output_dataset)
-         db_sync.refresh(workflow)
-
-         # Write logs
-         logger_name = f"WF{workflow_id}_job{job_id}"
-         log_file_path = WORKFLOW_DIR / WORKFLOW_LOG_FILENAME
-         logger = set_logger(
-             logger_name=logger_name,
-             log_file_path=log_file_path,
-         )
-         logger.info(
-             f'Start execution of workflow "{workflow.name}"; '
-             f"more logs at {str(log_file_path)}"
-         )
-         logger.debug(f"fractal_server.__VERSION__: {__VERSION__}")
-         logger.debug(f"FRACTAL_RUNNER_BACKEND: {FRACTAL_RUNNER_BACKEND}")
-         logger.debug(f"slurm_user: {slurm_user}")
-         logger.debug(f"slurm_account: {job.slurm_account}")
-         logger.debug(f"worker_init: {worker_init}")
-         logger.debug(f"input metadata: {input_dataset.meta}")
-         logger.debug(f"input_paths: {input_paths}")
-         logger.debug(f"output_path: {output_path}")
-         logger.debug(f"job.id: {job.id}")
-         logger.debug(f"job.working_dir: {job.working_dir}")
-         logger.debug(f"job.working_dir_user: {job.working_dir_user}")
-         logger.debug(f"job.first_task_index: {job.first_task_index}")
-         logger.debug(f"job.last_task_index: {job.last_task_index}")
-         logger.debug(f'START workflow "{workflow.name}"')
-
-     try:
-         # "The Session.close() method does not prevent the Session from being
-         # used again. The Session itself does not actually have a distinct
-         # “closed” state; it merely means the Session will release all database
-         # connections and ORM objects."
-         # (https://docs.sqlalchemy.org/en/20/orm/session_api.html#sqlalchemy.orm.Session.close).
-         #
-         # We close the session before the (possibly long) process_workflow
-         # call, to make sure all DB connections are released. The reason why we
-         # are not using a context manager within the try block is that we also
-         # need access to db_sync in the except branches.
-         db_sync = next(DB.get_sync_db())
-         db_sync.close()
-
-         output_dataset_meta_hist = await process_workflow(
-             workflow=workflow,
-             input_paths=input_paths,
-             output_path=output_path,
-             input_metadata=input_dataset.meta,
-             input_history=input_dataset.history,
-             slurm_user=slurm_user,
-             slurm_account=job.slurm_account,
-             user_cache_dir=user_cache_dir,
-             workflow_dir=WORKFLOW_DIR,
-             workflow_dir_user=WORKFLOW_DIR_USER,
-             logger_name=logger_name,
-             worker_init=worker_init,
-             first_task_index=job.first_task_index,
-             last_task_index=job.last_task_index,
-         )
-
-         logger.info(
-             f'End execution of workflow "{workflow.name}"; '
-             f"more logs at {str(log_file_path)}"
-         )
-         logger.debug(f'END workflow "{workflow.name}"')
-
-         # Replace output_dataset.meta and output_dataset.history with their
-         # up-to-date versions, obtained within process_workflow
-         output_dataset.history = output_dataset_meta_hist.pop("history")
-         output_dataset.meta = output_dataset_meta_hist.pop("metadata")
-
-         db_sync.merge(output_dataset)
-
-         # Update job DB entry
-         job.status = JobStatusType.DONE
-         job.end_timestamp = get_timestamp()
-         with log_file_path.open("r") as f:
-             logs = f.read()
-         job.log = logs
-         db_sync.merge(job)
-         close_job_logger(logger)
-         db_sync.commit()
-
-     except TaskExecutionError as e:
-
-         logger.debug(f'FAILED workflow "{workflow.name}", TaskExecutionError.')
-         logger.info(f'Workflow "{workflow.name}" failed (TaskExecutionError).')
-
-         # Assemble output_dataset.meta based on the last successful task, i.e.
-         # based on METADATA_FILENAME
-         output_dataset.meta = assemble_meta_failed_job(job, output_dataset)
-
-         # Assemble new history and assign it to output_dataset.meta
-         failed_wftask = db_sync.get(WorkflowTask, e.workflow_task_id)
-         output_dataset.history = assemble_history_failed_job(
-             job,
-             output_dataset,
-             workflow,
-             logger,
-             failed_wftask=failed_wftask,
-         )
-
-         db_sync.merge(output_dataset)
-
-         job.status = JobStatusType.FAILED
-         job.end_timestamp = get_timestamp()
-
-         exception_args_string = "\n".join(e.args)
-         job.log = (
-             f"TASK ERROR: "
-             f"Task name: {e.task_name}, "
-             f"position in Workflow: {e.workflow_task_order=}\n"
-             f"TRACEBACK:\n{exception_args_string}"
-         )
-         db_sync.merge(job)
-         close_job_logger(logger)
-         db_sync.commit()
-
-     except JobExecutionError as e:
-
-         logger.debug(f'FAILED workflow "{workflow.name}", JobExecutionError.')
-         logger.info(f'Workflow "{workflow.name}" failed (JobExecutionError).')
-
-         # Assemble output_dataset.meta based on the last successful task, i.e.
-         # based on METADATA_FILENAME
-         output_dataset.meta = assemble_meta_failed_job(job, output_dataset)
-
-         # Assemble new history and assign it to output_dataset.meta
-         output_dataset.history = assemble_history_failed_job(
-             job,
-             output_dataset,
-             workflow,
-             logger,
-         )
-
-         db_sync.merge(output_dataset)
-
-         job.status = JobStatusType.FAILED
-         job.end_timestamp = get_timestamp()
-         error = e.assemble_error()
-         job.log = f"JOB ERROR in Fractal job {job.id}:\nTRACEBACK:\n{error}"
-         db_sync.merge(job)
-         close_job_logger(logger)
-         db_sync.commit()
-
-     except Exception:
-
-         logger.debug(f'FAILED workflow "{workflow.name}", unknown error.')
-         logger.info(f'Workflow "{workflow.name}" failed (unkwnon error).')
-
-         current_traceback = traceback.format_exc()
-
-         # Assemble output_dataset.meta based on the last successful task, i.e.
-         # based on METADATA_FILENAME
-         output_dataset.meta = assemble_meta_failed_job(job, output_dataset)
-
-         # Assemble new history and assign it to output_dataset.meta
-         output_dataset.history = assemble_history_failed_job(
-             job,
-             output_dataset,
-             workflow,
-             logger,
-         )
-
-         db_sync.merge(output_dataset)
-
-         job.status = JobStatusType.FAILED
-         job.end_timestamp = get_timestamp()
-         job.log = (
-             f"UNKNOWN ERROR in Fractal job {job.id}\n"
-             f"TRACEBACK:\n{current_traceback}"
-         )
-         db_sync.merge(job)
-         close_job_logger(logger)
-         db_sync.commit()
-     finally:
-         db_sync.close()
fractal_server/app/runner/async_wrap.py
@@ -0,0 +1,27 @@
+ import asyncio
+ from functools import partial
+ from functools import wraps
+ from typing import Callable
+
+
+ def async_wrap(func: Callable) -> Callable:
+     """
+     Wrap a synchronous callable in an async task
+
+     Ref: [issue #140](https://github.com/fractal-analytics-platform/fractal-server/issues/140)
+     and [this StackOverflow answer](https://stackoverflow.com/q/43241221/19085332).
+
+     Returns:
+         async_wrapper:
+             A factory that allows wrapping a blocking callable within a
+             coroutine.
+     """ # noqa: E501
+
+     @wraps(func)
+     async def async_wrapper(*args, loop=None, executor=None, **kwargs):
+         if loop is None:
+             loop = asyncio.get_event_loop()
+         pfunc = partial(func, *args, **kwargs)
+         return await loop.run_in_executor(executor, pfunc)
+
+     return async_wrapper
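A minimal usage sketch for the new async_wrap helper (illustrative only, not part of the diff; the blocking function and its argument are made up):

import asyncio
import time

from fractal_server.app.runner.async_wrap import async_wrap


def blocking_step(seconds: float) -> str:
    # Stand-in for a long-running synchronous operation
    time.sleep(seconds)
    return f"done after {seconds}s"


async def main() -> None:
    # async_wrap(func) returns a coroutine factory; awaiting it runs the
    # blocking callable in the default thread-pool executor, so the event
    # loop is not blocked while it runs
    result = await async_wrap(blocking_step)(0.1)
    print(result)


asyncio.run(main())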
fractal_server/app/runner/components.py
@@ -0,0 +1,5 @@
+ def _index_to_component(ind: int) -> str:
+     return f"{ind:07d}"
+
+
+ _COMPONENT_KEY_ = "__FRACTAL_PARALLEL_COMPONENT__"
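A tiny illustration of the new component helpers (not part of the diff):

from fractal_server.app.runner.components import _COMPONENT_KEY_
from fractal_server.app.runner.components import _index_to_component

# Zero-padded labels identify parallel units of work within a task
print(_index_to_component(3))  # "0000003"
print({_COMPONENT_KEY_: _index_to_component(3)})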
fractal_server/app/runner/exceptions.py
@@ -0,0 +1,129 @@
+ import os
+ from typing import Optional
+
+
+ class TaskExecutionError(RuntimeError):
+     """
+     Forwards errors occurred during the execution of a task
+
+     This error wraps and forwards errors occurred during the execution of
+     tasks, when the exit code is larger than 0 (i.e. the error took place
+     within the task). This error also adds information that is useful to track
+     down and debug the failing task within a workflow.
+
+     Attributes:
+         workflow_task_id:
+             ID of the workflow task that failed.
+         workflow_task_order:
+             Order of the task within the workflow.
+         task_name:
+             Human readable name of the failing task.
+     """
+
+     workflow_task_id: Optional[int] = None
+     workflow_task_order: Optional[int] = None
+     task_name: Optional[str] = None
+
+     def __init__(
+         self,
+         *args,
+         workflow_task_id: Optional[int] = None,
+         workflow_task_order: Optional[int] = None,
+         task_name: Optional[str] = None,
+     ):
+         super().__init__(*args)
+         self.workflow_task_id = workflow_task_id
+         self.workflow_task_order = workflow_task_order
+         self.task_name = task_name
+
+
+ class JobExecutionError(RuntimeError):
+     """
+     Forwards errors in the execution of a task that are due to external factors
+
+     This error wraps and forwards errors occurred during the execution of
+     tasks, but related to external factors like:
+
+     1. A negative exit code (e.g. because the task received a TERM or KILL
+        signal);
+     2. An error on the executor side (e.g. the SLURM executor could not
+        find the pickled file with task output).
+
+     This error also adds information that is useful to track down and debug the
+     failing task within a workflow.
+
+     Attributes:
+         info:
+             A free field for additional information
+         cmd_file:
+             Path to the file of the command that was executed (e.g. a SLURM
+             submission script).
+         stdout_file:
+             Path to the file with the command stdout
+         stderr_file:
+             Path to the file with the command stderr
+     """
+
+     cmd_file: Optional[str] = None
+     stdout_file: Optional[str] = None
+     stderr_file: Optional[str] = None
+     info: Optional[str] = None
+
+     def __init__(
+         self,
+         *args,
+         cmd_file: Optional[str] = None,
+         stdout_file: Optional[str] = None,
+         stderr_file: Optional[str] = None,
+         info: Optional[str] = None,
+     ):
+         super().__init__(*args)
+         self.cmd_file = cmd_file
+         self.stdout_file = stdout_file
+         self.stderr_file = stderr_file
+         self.info = info
+
+     def _read_file(self, filepath: str) -> str:
+         """
+         Return the content of a text file, and handle the cases where it is
+         empty or missing
+         """
+         if os.path.exists(filepath):
+             with open(filepath, "r") as f:
+                 content = f.read()
+             if content:
+                 return f"Content of {filepath}:\n{content}"
+             else:
+                 return f"File {filepath} is empty\n"
+         else:
+             return f"File {filepath} is missing\n"
+
+     def assemble_error(self) -> str:
+         """
+         Read the files that are specified in attributes, and combine them in an
+         error message.
+         """
+         if self.cmd_file:
+             content = self._read_file(self.cmd_file)
+             cmd_content = f"COMMAND:\n{content}\n\n"
+         else:
+             cmd_content = ""
+         if self.stdout_file:
+             content = self._read_file(self.stdout_file)
+             out_content = f"STDOUT:\n{content}\n\n"
+         else:
+             out_content = ""
+         if self.stderr_file:
+             content = self._read_file(self.stderr_file)
+             err_content = f"STDERR:\n{content}\n\n"
+         else:
+             err_content = ""
+
+         content = f"{cmd_content}{out_content}{err_content}"
+         if self.info:
+             content = f"{content}ADDITIONAL INFO:\n{self.info}\n\n"
+
+         if not content:
+             content = str(self)
+         message = f"JobExecutionError\n\n{content}"
+         return message
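A short sketch of how the two new exception types can be instantiated and reported (illustrative only; the task name and file paths are hypothetical):

from fractal_server.app.runner.exceptions import JobExecutionError
from fractal_server.app.runner.exceptions import TaskExecutionError

# Task-level failure: the task exited with a positive return code
task_err = TaskExecutionError(
    "Traceback (most recent call last): ...",
    workflow_task_id=42,
    workflow_task_order=3,
    task_name="my_task",
)
print(task_err.task_name, task_err.workflow_task_order)

# Job-level failure: the error is external to the task; assemble_error()
# combines the command/stdout/stderr files (missing files are reported
# as such) into a single message
job_err = JobExecutionError(
    cmd_file="/tmp/job/submit.sbatch",
    stderr_file="/tmp/job/slurm.err",
    info="SLURM job was cancelled",
)
print(job_err.assemble_error())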
File without changes
fractal_server/app/runner/executors/slurm/__init__.py
@@ -0,0 +1,3 @@
+ from .executor import SlurmExecutor
+
+ __all__ = ["SlurmExecutor"]
fractal_server/app/runner/{_slurm → executors/slurm}/_batching.py
@@ -14,7 +14,7 @@ Submodule to determine the number of total/parallel tasks per SLURM job.
  import math
  from typing import Optional

- from ....logger import set_logger
+ from .....logger import set_logger

  logger = set_logger(__name__)

fractal_server/app/runner/executors/slurm/_check_jobs_status.py
@@ -0,0 +1,72 @@
+ from subprocess import run # nosec
+
+ from cfut.slurm import STATES_FINISHED
+
+ from .....logger import set_logger
+
+
+ logger = set_logger(__name__)
+
+
+ def run_squeue(job_ids):
+     res = run( # nosec
+         [
+             "squeue",
+             "--noheader",
+             "--format=%i %T",
+             "--jobs",
+             ",".join([str(j) for j in job_ids]),
+             "--states=all",
+         ],
+         capture_output=True,
+         encoding="utf-8",
+         check=False,
+     )
+     if res.returncode != 0:
+         logger.warning(
+             f"squeue command with {job_ids}"
+             f" failed with:\n{res.stderr=}\n{res.stdout=}"
+         )
+
+     return res
+
+
+ def _jobs_finished(job_ids) -> set[str]:
+     """
+     Check which ones of the given Slurm jobs already finished
+
+     The function is based on the `_jobs_finished` function from
+     clusterfutures (version 0.5).
+     Original Copyright: 2022 Adrian Sampson
+     (released under the MIT licence)
+     """
+
+     # If there is no Slurm job to check, return right away
+     if not job_ids:
+         return set()
+     id_to_state = dict()
+
+     res = run_squeue(job_ids)
+     if res.returncode == 0:
+         id_to_state = {
+             out.split()[0]: out.split()[1] for out in res.stdout.splitlines()
+         }
+     else:
+         id_to_state = dict()
+         for j in job_ids:
+             res = run_squeue([j])
+             if res.returncode != 0:
+                 logger.info(f"Job {j} not found. Marked it as completed")
+                 id_to_state.update({str(j): "COMPLETED"})
+             else:
+                 id_to_state.update(
+                     {res.stdout.split()[0]: res.stdout.split()[1]}
+                 )
+
+     # Finished jobs only stay in squeue for a few mins (configurable). If
+     # a job ID isn't there, we'll assume it's finished.
+     return {
+         j
+         for j in job_ids
+         if id_to_state.get(j, "COMPLETED") in STATES_FINISHED
+     }
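A usage sketch for the new status-check helper (illustrative only; it assumes a host where squeue is available, and the job IDs are hypothetical):

from fractal_server.app.runner.executors.slurm._check_jobs_status import (
    _jobs_finished,
)

# Poll a set of SLURM job IDs; IDs that squeue reports in a finished
# state, or that squeue no longer knows about, are returned as completed
pending = {"123456", "123457"}
done = _jobs_finished(pending)
pending -= done
print(f"still waiting for: {pending}")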
fractal_server/app/runner/{_slurm → executors/slurm}/_executor_wait_thread.py
@@ -6,10 +6,9 @@ from typing import Callable
  from typing import Optional

  from cfut import FileWaitThread
- from cfut import slurm
-
- from ....logger import set_logger

+ from .....logger import set_logger
+ from ._check_jobs_status import _jobs_finished

  logger = set_logger(__name__)

@@ -121,7 +120,7 @@ class FractalSlurmWaitThread(FractalFileWaitThread):
          super().check(i)
          if i % (self.slurm_poll_interval // self.interval) == 0:
              try:
-                 finished_jobs = slurm.jobs_finished(self.waiting.values())
+                 finished_jobs = _jobs_finished(self.waiting.values())
              except Exception:
                  # Don't abandon completion checking if jobs_finished errors
                  traceback.print_exc()