fractal-server 2.13.1__py3-none-any.whl → 2.14.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (119)
  1. fractal_server/__init__.py +1 -1
  2. fractal_server/__main__.py +3 -1
  3. fractal_server/app/models/linkusergroup.py +6 -2
  4. fractal_server/app/models/v2/__init__.py +7 -1
  5. fractal_server/app/models/v2/dataset.py +1 -11
  6. fractal_server/app/models/v2/history.py +78 -0
  7. fractal_server/app/models/v2/job.py +10 -3
  8. fractal_server/app/models/v2/task_group.py +2 -2
  9. fractal_server/app/models/v2/workflow.py +1 -1
  10. fractal_server/app/models/v2/workflowtask.py +1 -1
  11. fractal_server/app/routes/admin/v2/accounting.py +18 -28
  12. fractal_server/app/routes/admin/v2/task.py +1 -1
  13. fractal_server/app/routes/admin/v2/task_group.py +0 -17
  14. fractal_server/app/routes/api/__init__.py +1 -1
  15. fractal_server/app/routes/api/v2/__init__.py +8 -2
  16. fractal_server/app/routes/api/v2/_aux_functions.py +66 -0
  17. fractal_server/app/routes/api/v2/_aux_functions_history.py +166 -0
  18. fractal_server/app/routes/api/v2/dataset.py +0 -17
  19. fractal_server/app/routes/api/v2/history.py +544 -0
  20. fractal_server/app/routes/api/v2/images.py +31 -43
  21. fractal_server/app/routes/api/v2/job.py +30 -0
  22. fractal_server/app/routes/api/v2/project.py +1 -53
  23. fractal_server/app/routes/api/v2/{status.py → status_legacy.py} +6 -6
  24. fractal_server/app/routes/api/v2/submit.py +16 -14
  25. fractal_server/app/routes/api/v2/task.py +3 -10
  26. fractal_server/app/routes/api/v2/task_collection_custom.py +4 -9
  27. fractal_server/app/routes/api/v2/task_group.py +0 -17
  28. fractal_server/app/routes/api/v2/verify_image_types.py +61 -0
  29. fractal_server/app/routes/api/v2/workflow.py +28 -69
  30. fractal_server/app/routes/api/v2/workflowtask.py +53 -50
  31. fractal_server/app/routes/auth/group.py +0 -16
  32. fractal_server/app/routes/auth/oauth.py +5 -3
  33. fractal_server/app/routes/pagination.py +47 -0
  34. fractal_server/app/runner/components.py +0 -3
  35. fractal_server/app/runner/compress_folder.py +57 -29
  36. fractal_server/app/runner/exceptions.py +4 -0
  37. fractal_server/app/runner/executors/base_runner.py +157 -0
  38. fractal_server/app/runner/{v2/_local/_local_config.py → executors/local/get_local_config.py} +7 -9
  39. fractal_server/app/runner/executors/local/runner.py +248 -0
  40. fractal_server/app/runner/executors/{slurm → slurm_common}/_batching.py +1 -1
  41. fractal_server/app/runner/executors/{slurm → slurm_common}/_slurm_config.py +9 -7
  42. fractal_server/app/runner/executors/slurm_common/base_slurm_runner.py +868 -0
  43. fractal_server/app/runner/{v2/_slurm_common → executors/slurm_common}/get_slurm_config.py +48 -17
  44. fractal_server/app/runner/executors/{slurm → slurm_common}/remote.py +36 -47
  45. fractal_server/app/runner/executors/slurm_common/slurm_job_task_models.py +134 -0
  46. fractal_server/app/runner/executors/slurm_ssh/runner.py +268 -0
  47. fractal_server/app/runner/executors/slurm_sudo/__init__.py +0 -0
  48. fractal_server/app/runner/executors/{slurm/sudo → slurm_sudo}/_subprocess_run_as_user.py +2 -83
  49. fractal_server/app/runner/executors/slurm_sudo/runner.py +193 -0
  50. fractal_server/app/runner/extract_archive.py +1 -3
  51. fractal_server/app/runner/task_files.py +134 -87
  52. fractal_server/app/runner/v2/__init__.py +0 -399
  53. fractal_server/app/runner/v2/_local.py +88 -0
  54. fractal_server/app/runner/v2/{_slurm_ssh/__init__.py → _slurm_ssh.py} +20 -19
  55. fractal_server/app/runner/v2/{_slurm_sudo/__init__.py → _slurm_sudo.py} +17 -15
  56. fractal_server/app/runner/v2/db_tools.py +119 -0
  57. fractal_server/app/runner/v2/runner.py +206 -95
  58. fractal_server/app/runner/v2/runner_functions.py +488 -187
  59. fractal_server/app/runner/v2/runner_functions_low_level.py +40 -43
  60. fractal_server/app/runner/v2/submit_workflow.py +358 -0
  61. fractal_server/app/runner/v2/task_interface.py +31 -0
  62. fractal_server/app/schemas/_validators.py +13 -24
  63. fractal_server/app/schemas/user.py +10 -7
  64. fractal_server/app/schemas/user_settings.py +9 -21
  65. fractal_server/app/schemas/v2/__init__.py +9 -1
  66. fractal_server/app/schemas/v2/dataset.py +12 -94
  67. fractal_server/app/schemas/v2/dumps.py +26 -9
  68. fractal_server/app/schemas/v2/history.py +80 -0
  69. fractal_server/app/schemas/v2/job.py +15 -8
  70. fractal_server/app/schemas/v2/manifest.py +14 -7
  71. fractal_server/app/schemas/v2/project.py +9 -7
  72. fractal_server/app/schemas/v2/status_legacy.py +35 -0
  73. fractal_server/app/schemas/v2/task.py +72 -77
  74. fractal_server/app/schemas/v2/task_collection.py +14 -32
  75. fractal_server/app/schemas/v2/task_group.py +10 -9
  76. fractal_server/app/schemas/v2/workflow.py +10 -11
  77. fractal_server/app/schemas/v2/workflowtask.py +2 -21
  78. fractal_server/app/security/__init__.py +3 -3
  79. fractal_server/app/security/signup_email.py +2 -2
  80. fractal_server/config.py +41 -46
  81. fractal_server/images/tools.py +23 -0
  82. fractal_server/migrations/versions/47351f8c7ebc_drop_dataset_filters.py +50 -0
  83. fractal_server/migrations/versions/9db60297b8b2_set_ondelete.py +250 -0
  84. fractal_server/migrations/versions/c90a7c76e996_job_id_in_history_run.py +41 -0
  85. fractal_server/migrations/versions/e81103413827_add_job_type_filters.py +36 -0
  86. fractal_server/migrations/versions/f37aceb45062_make_historyunit_logfile_required.py +39 -0
  87. fractal_server/migrations/versions/fbce16ff4e47_new_history_items.py +120 -0
  88. fractal_server/ssh/_fabric.py +28 -14
  89. fractal_server/tasks/v2/local/collect.py +2 -2
  90. fractal_server/tasks/v2/ssh/collect.py +2 -2
  91. fractal_server/tasks/v2/templates/2_pip_install.sh +1 -1
  92. fractal_server/tasks/v2/templates/4_pip_show.sh +1 -1
  93. fractal_server/tasks/v2/utils_background.py +0 -19
  94. fractal_server/tasks/v2/utils_database.py +30 -17
  95. fractal_server/tasks/v2/utils_templates.py +6 -0
  96. {fractal_server-2.13.1.dist-info → fractal_server-2.14.0.dist-info}/METADATA +4 -4
  97. {fractal_server-2.13.1.dist-info → fractal_server-2.14.0.dist-info}/RECORD +106 -96
  98. {fractal_server-2.13.1.dist-info → fractal_server-2.14.0.dist-info}/WHEEL +1 -1
  99. fractal_server/app/runner/executors/slurm/ssh/_executor_wait_thread.py +0 -126
  100. fractal_server/app/runner/executors/slurm/ssh/_slurm_job.py +0 -116
  101. fractal_server/app/runner/executors/slurm/ssh/executor.py +0 -1386
  102. fractal_server/app/runner/executors/slurm/sudo/_check_jobs_status.py +0 -71
  103. fractal_server/app/runner/executors/slurm/sudo/_executor_wait_thread.py +0 -130
  104. fractal_server/app/runner/executors/slurm/sudo/executor.py +0 -1281
  105. fractal_server/app/runner/v2/_local/__init__.py +0 -132
  106. fractal_server/app/runner/v2/_local/_submit_setup.py +0 -52
  107. fractal_server/app/runner/v2/_local/executor.py +0 -100
  108. fractal_server/app/runner/v2/_slurm_ssh/_submit_setup.py +0 -83
  109. fractal_server/app/runner/v2/_slurm_sudo/_submit_setup.py +0 -83
  110. fractal_server/app/runner/v2/handle_failed_job.py +0 -59
  111. fractal_server/app/schemas/v2/status.py +0 -16
  112. /fractal_server/app/{runner/executors/slurm → history}/__init__.py +0 -0
  113. /fractal_server/app/runner/executors/{slurm/ssh → local}/__init__.py +0 -0
  114. /fractal_server/app/runner/executors/{slurm/sudo → slurm_common}/__init__.py +0 -0
  115. /fractal_server/app/runner/executors/{_job_states.py → slurm_common/_job_states.py} +0 -0
  116. /fractal_server/app/runner/executors/{slurm → slurm_common}/utils_executors.py +0 -0
  117. /fractal_server/app/runner/{v2/_slurm_common → executors/slurm_ssh}/__init__.py +0 -0
  118. {fractal_server-2.13.1.dist-info → fractal_server-2.14.0.dist-info}/LICENSE +0 -0
  119. {fractal_server-2.13.1.dist-info → fractal_server-2.14.0.dist-info}/entry_points.txt +0 -0
fractal_server/app/runner/executors/slurm_common/base_slurm_runner.py (new file)
@@ -0,0 +1,868 @@
+ import json
+ import math
+ import sys
+ import time
+ from pathlib import Path
+ from typing import Any
+ from typing import Literal
+ from typing import Optional
+
+ import cloudpickle
+
+ from ..slurm_common._slurm_config import SlurmConfig
+ from ..slurm_common.slurm_job_task_models import SlurmJob
+ from ..slurm_common.slurm_job_task_models import SlurmTask
+ from ._job_states import STATES_FINISHED
+ from fractal_server import __VERSION__
+ from fractal_server.app.db import get_sync_db
+ from fractal_server.app.models.v2 import AccountingRecordSlurm
+ from fractal_server.app.runner.exceptions import JobExecutionError
+ from fractal_server.app.runner.exceptions import TaskExecutionError
+ from fractal_server.app.runner.executors.base_runner import BaseRunner
+ from fractal_server.app.runner.filenames import SHUTDOWN_FILENAME
+ from fractal_server.app.runner.task_files import TaskFiles
+ from fractal_server.app.runner.v2.db_tools import (
+     bulk_update_status_of_history_unit,
+ )
+ from fractal_server.app.runner.v2.db_tools import update_status_of_history_unit
+ from fractal_server.app.schemas.v2 import HistoryUnitStatus
+ from fractal_server.config import get_settings
+ from fractal_server.logger import set_logger
+ from fractal_server.syringe import Inject
+
+ SHUTDOWN_ERROR_MESSAGE = "Failed due to job-execution shutdown."
+ SHUTDOWN_EXCEPTION = JobExecutionError(SHUTDOWN_ERROR_MESSAGE)
+
+ logger = set_logger(__name__)
+
+
+ def create_accounting_record_slurm(
+     *,
+     user_id: int,
+     slurm_job_ids: list[int],
+ ) -> None:
+     with next(get_sync_db()) as db:
+         db.add(
+             AccountingRecordSlurm(
+                 user_id=user_id,
+                 slurm_job_ids=slurm_job_ids,
+             )
+         )
+         db.commit()
+
+
+ class BaseSlurmRunner(BaseRunner):
+     shutdown_file: Path
+     common_script_lines: list[str]
+     user_cache_dir: str
+     root_dir_local: Path
+     root_dir_remote: Path
+     poll_interval: int
+     poll_interval_internal: float
+     jobs: dict[str, SlurmJob]
+     python_worker_interpreter: str
+     slurm_runner_type: Literal["ssh", "sudo"]
+
+     def __init__(
+         self,
+         root_dir_local: Path,
+         root_dir_remote: Path,
+         slurm_runner_type: Literal["ssh", "sudo"],
+         python_worker_interpreter: str,
+         common_script_lines: Optional[list[str]] = None,
+         user_cache_dir: Optional[str] = None,
+         poll_interval: Optional[int] = None,
+     ):
+         self.slurm_runner_type = slurm_runner_type
+         self.root_dir_local = root_dir_local
+         self.root_dir_remote = root_dir_remote
+         self.common_script_lines = common_script_lines or []
+         self._check_slurm_account()
+         self.user_cache_dir = user_cache_dir
+         self.python_worker_interpreter = python_worker_interpreter
+
+         settings = Inject(get_settings)
+
+         self.poll_interval = (
+             poll_interval or settings.FRACTAL_SLURM_POLL_INTERVAL
+         )
+         self.poll_interval_internal = self.poll_interval / 10.0
+
+         self.check_fractal_server_versions()
+
+         # Create job folders. Note that the local one may or may not exist
+         # depending on whether it is a test or an actual run
+         try:
+             if not self.root_dir_local.is_dir():
+                 self._mkdir_local_folder(self.root_dir_local.as_posix())
+             self._mkdir_remote_folder(self.root_dir_remote.as_posix())
+         except Exception as e:
+             error_msg = (
+                 f"Could not mkdir {self.root_dir_local.as_posix()} or "
+                 f"{self.root_dir_remote.as_posix()}. "
+                 f"Original error: {str(e)}."
+             )
+             logger.error(error_msg)
+             raise RuntimeError(error_msg)
+
+         self.shutdown_file = self.root_dir_local / SHUTDOWN_FILENAME
+         self.jobs = {}
+
+     def __enter__(self):
+         return self
+
+     def __exit__(self, exc_type, exc_val, exc_tb):
+         return False
+
+     def _run_remote_cmd(self, cmd: str) -> str:
+         raise NotImplementedError("Implement in child class.")
+
+     def run_squeue(self, *, job_ids: list[str], **kwargs) -> str:
+         raise NotImplementedError("Implement in child class.")
+
+     def _get_finished_jobs(self, job_ids: list[str]) -> set[str]:
+
+         # If there is no Slurm job to check, return right away
+         if not job_ids:
+             return set()
+
+         try:
+             stdout = self.run_squeue(job_ids=job_ids)
+             slurm_statuses = {
+                 out.split()[0]: out.split()[1] for out in stdout.splitlines()
+             }
+         except Exception as e:
+             logger.warning(
+                 "[_get_finished_jobs] `squeue` failed, "
+                 "retry with individual job IDs. "
+                 f"Original error: {str(e)}."
+             )
+             slurm_statuses = dict()
+             for job_id in job_ids:
+                 try:
+                     stdout = self.run_squeue(job_ids=[job_id])
+                     slurm_statuses.update(
+                         {stdout.split()[0]: stdout.split()[1]}
+                     )
+                 except Exception as e:
+                     logger.warning(
+                         "[_get_finished_jobs] `squeue` failed for "
+                         f"{job_id=}, mark job as completed. "
+                         f"Original error: {str(e)}."
+                     )
+                     slurm_statuses.update({str(job_id): "COMPLETED"})
+
+         # If a job is not in `squeue` output, mark it as completed.
+         finished_jobs = {
+             job_id
+             for job_id in job_ids
+             if slurm_statuses.get(job_id, "COMPLETED") in STATES_FINISHED
+         }
+         return finished_jobs
+
+     def _mkdir_local_folder(self, folder: str) -> None:
+         raise NotImplementedError("Implement in child class.")
+
+     def _mkdir_remote_folder(self, folder: str) -> None:
+         raise NotImplementedError("Implement in child class.")
+
+     def _submit_single_sbatch(
+         self,
+         func,
+         slurm_job: SlurmJob,
+         slurm_config: SlurmConfig,
+     ) -> str:
+         logger.debug("[_submit_single_sbatch] START")
+         # Prepare input pickle(s)
+         versions = dict(
+             python=sys.version_info[:3],
+             cloudpickle=cloudpickle.__version__,
+             fractal_server=__VERSION__,
+         )
+         for task in slurm_job.tasks:
+             # Write input pickle
+             _args = []
+             _kwargs = dict(
+                 parameters=task.parameters,
+                 remote_files=task.task_files.remote_files_dict,
+             )
+             funcser = cloudpickle.dumps((versions, func, _args, _kwargs))
+             with open(task.input_pickle_file_local, "wb") as f:
+                 f.write(funcser)
+             logger.debug(
+                 "[_submit_single_sbatch] Written "
+                 f"{task.input_pickle_file_local=}"
+             )
+
+             if self.slurm_runner_type == "ssh":
+                 # Send input pickle (only relevant for SSH)
+                 self.fractal_ssh.send_file(
+                     local=task.input_pickle_file_local,
+                     remote=task.input_pickle_file_remote,
+                 )
+                 logger.debug(
+                     "[_submit_single_sbatch] Transferred "
+                     f"{task.input_pickle_file_local=}"
+                 )
+
+         # Prepare commands to be included in SLURM submission script
+         cmdlines = []
+         for task in slurm_job.tasks:
+             if self.slurm_runner_type == "ssh":
+                 input_pickle_file = task.input_pickle_file_remote
+             else:
+                 input_pickle_file = task.input_pickle_file_local
+             output_pickle_file = task.output_pickle_file_remote
+             cmdlines.append(
+                 (
+                     f"{self.python_worker_interpreter}"
+                     " -m fractal_server.app.runner."
+                     "executors.slurm_common.remote "
+                     f"--input-file {input_pickle_file} "
+                     f"--output-file {output_pickle_file}"
+                 )
+             )
+
+         # Set ntasks
+         num_tasks_max_running = slurm_config.parallel_tasks_per_job
+         ntasks = min(len(cmdlines), num_tasks_max_running)
+         slurm_config.parallel_tasks_per_job = ntasks
+
+         # Prepare SLURM preamble based on SlurmConfig object
+         script_lines = slurm_config.to_sbatch_preamble(
+             remote_export_dir=self.user_cache_dir
+         )
+
+         # Extend SLURM preamble with variable which are not in SlurmConfig, and
+         # fix their order
+         script_lines.extend(
+             [
+                 f"#SBATCH --err={slurm_job.slurm_stderr_remote}",
+                 f"#SBATCH --out={slurm_job.slurm_stdout_remote}",
+                 f"#SBATCH -D {slurm_job.workdir_remote}",
+             ]
+         )
+         script_lines = slurm_config.sort_script_lines(script_lines)
+         logger.debug(script_lines)
+
+         # Always print output of `uname -n` and `pwd`
+         script_lines.append('\necho "Hostname: $(uname -n)"')
+         script_lines.append('echo "Current directory: $(pwd)"')
+         script_lines.append(
+             'echo "Start time: $(date +"%Y-%m-%dT%H:%M:%S%z")"'
+         )
+
+         # Complete script preamble
+         script_lines.append("\n")
+
+         # Include command lines
+         mem_per_task_MB = slurm_config.mem_per_task_MB
+         for cmd in cmdlines:
+             script_lines.append(
+                 "srun --ntasks=1 --cpus-per-task=$SLURM_CPUS_PER_TASK "
+                 f"--mem={mem_per_task_MB}MB "
+                 f"{cmd} &"
+             )
+         script_lines.append("wait\n")
+         script = "\n".join(script_lines)
+         script_lines.append(
+             'echo "End time: $(date +"%Y-%m-%dT%H:%M:%S%z")"'
+         )
+
+         # Write submission script
+         with open(slurm_job.slurm_submission_script_local, "w") as f:
+             f.write(script)
+         logger.debug(
+             "[_submit_single_sbatch] Written "
+             f"{slurm_job.slurm_submission_script_local=}"
+         )
+
+         if self.slurm_runner_type == "ssh":
+             self.fractal_ssh.send_file(
+                 local=slurm_job.slurm_submission_script_local,
+                 remote=slurm_job.slurm_submission_script_remote,
+             )
+             submit_command = (
+                 "sbatch --parsable "
+                 f"{slurm_job.slurm_submission_script_remote}"
+             )
+         else:
+             submit_command = (
+                 "sbatch --parsable "
+                 f"{slurm_job.slurm_submission_script_local}"
+             )
+         # Run sbatch
+         pre_submission_cmds = slurm_config.pre_submission_commands
+         if len(pre_submission_cmds) == 0:
+             logger.debug(f"Now run {submit_command=}")
+             sbatch_stdout = self._run_remote_cmd(submit_command)
+         else:
+             logger.debug(f"Now using {pre_submission_cmds=}")
+             script_lines = pre_submission_cmds + [submit_command]
+             wrapper_script_contents = "\n".join(script_lines)
+             wrapper_script_contents = f"{wrapper_script_contents}\n"
+             if self.slurm_runner_type == "ssh":
+                 wrapper_script = (
+                     f"{slurm_job.slurm_submission_script_remote}_wrapper.sh"
+                 )
+                 self.fractal_ssh.write_remote_file(
+                     path=wrapper_script, content=wrapper_script_contents
+                 )
+             else:
+                 wrapper_script = (
+                     f"{slurm_job.slurm_submission_script_local}_wrapper.sh"
+                 )
+                 with open(wrapper_script, "w") as f:
+                     f.write(wrapper_script_contents)
+             logger.debug(f"Now run {wrapper_script=}")
+             sbatch_stdout = self._run_remote_cmd(f"bash {wrapper_script}")
+
+         # Submit SLURM job and retrieve job ID
+         logger.info(f"[_submit_single_sbatch] {sbatch_stdout=}")
+         stdout = sbatch_stdout.strip("\n")
+         submitted_job_id = int(stdout)
+         slurm_job.slurm_job_id = str(submitted_job_id)
+
+         # Add job to self.jobs
+         self.jobs[slurm_job.slurm_job_id] = slurm_job
+         logger.debug(
+             "[_submit_single_sbatch] Added "
+             f"{slurm_job.slurm_job_id} to self.jobs."
+         )
+         logger.debug("[_submit_single_sbatch] END")
+
+     def _fetch_artifacts(
+         self,
+         finished_slurm_jobs: list[SlurmJob],
+     ) -> None:
+         raise NotImplementedError("Implement in child class.")
+
+     def _check_slurm_account(self) -> None:
+         """
+         Check that SLURM account is not set here in `common_script_lines`.
+         """
+         try:
+             invalid_line = next(
+                 line
+                 for line in self.common_script_lines
+                 if line.startswith("#SBATCH --account=")
+             )
+             raise RuntimeError(
+                 "Invalid line in `common_script_lines`: "
+                 f"'{invalid_line}'.\n"
+                 "SLURM account must be set via the request body of the "
+                 "apply-workflow endpoint, or by modifying the user properties."
+             )
+         except StopIteration:
+             pass
+
+     def _postprocess_single_task(
+         self,
+         *,
+         task: SlurmTask,
+         was_job_scancelled: bool = False,
+     ) -> tuple[Any, Exception]:
+         try:
+             with open(task.output_pickle_file_local, "rb") as f:
+                 outdata = f.read()
+             success, output = cloudpickle.loads(outdata)
+             if success:
+                 # Task succeeded
+                 result = output
+                 return (result, None)
+             else:
+                 # Task failed in a controlled way, and produced an `output`
+                 # object which is a dictionary with required keys
+                 # `exc_type_name` and `traceback_string` and with optional
+                 # keys `workflow_task_order`, `workflow_task_id` and
+                 # `task_name`.
+                 exc_type_name = output.get("exc_type_name")
+                 logger.debug(
+                     f"Output pickle contains a '{exc_type_name}' exception."
+                 )
+                 traceback_string = output.get("traceback_string")
+                 kwargs = {
+                     key: output[key]
+                     for key in [
+                         "workflow_task_order",
+                         "workflow_task_id",
+                         "task_name",
+                     ]
+                     if key in output.keys()
+                 }
+                 exception = TaskExecutionError(traceback_string, **kwargs)
+                 return (None, exception)
+
+         except Exception as e:
+             exception = JobExecutionError(f"ERROR, {str(e)}")
+             # If job was scancelled and task failed, replace
+             # exception with a shutdown-related one.
+             if was_job_scancelled:
+                 logger.debug(
+                     "Replacing exception with a shutdown-related one, "
+                     f"for {task.index=}."
+                 )
+                 exception = SHUTDOWN_EXCEPTION
+             return (None, exception)
+         finally:
+             Path(task.input_pickle_file_local).unlink(missing_ok=True)
+             Path(task.output_pickle_file_local).unlink(missing_ok=True)
+
+     def is_shutdown(self) -> bool:
+         return self.shutdown_file.exists()
+
+     @property
+     def job_ids(self) -> list[str]:
+         return list(self.jobs.keys())
+
+     def wait_and_check_shutdown(self) -> list[str]:
+         """
+         Wait at most `self.poll_interval`, while also checking for shutdown.
+         """
+         # Sleep for `self.poll_interval`, but keep checking for shutdowns
+         start_time = time.perf_counter()
+         # Always wait at least 0.2 (note: this is for cases where
+         # `poll_interval=0`).
+         waiting_time = max(self.poll_interval, 0.2)
+         max_time = start_time + waiting_time
+         logger.debug(
+             "[wait_and_check_shutdown] "
+             f"I will wait at most {self.poll_interval} s, "
+             f"in blocks of {self.poll_interval_internal} s."
+         )
+
+         while time.perf_counter() < max_time:
+             if self.is_shutdown():
+                 logger.info("[wait_and_check_shutdown] Shutdown file detected")
+                 scancelled_job_ids = self.scancel_jobs()
+                 logger.info(f"[wait_and_check_shutdown] {scancelled_job_ids=}")
+                 return scancelled_job_ids
+             time.sleep(self.poll_interval_internal)
+
+         logger.debug("[wait_and_check_shutdown] No shutdown file detected")
+         return []
+
+     def _check_no_active_jobs(self):
+         if self.jobs != {}:
+             raise JobExecutionError(
+                 "Unexpected branch: jobs must be empty before new "
+                 "submissions."
+             )
+
+     def submit(
+         self,
+         func: callable,
+         parameters: dict[str, Any],
+         history_unit_id: int,
+         task_files: TaskFiles,
+         config: SlurmConfig,
+         task_type: Literal[
+             "non_parallel",
+             "converter_non_parallel",
+             "compound",
+             "converter_compound",
+         ],
+         user_id: int,
+     ) -> tuple[Any, Exception]:
+         logger.debug("[submit] START")
+         try:
+             workdir_local = task_files.wftask_subfolder_local
+             workdir_remote = task_files.wftask_subfolder_remote
+
+             if self.is_shutdown():
+                 with next(get_sync_db()) as db:
+                     update_status_of_history_unit(
+                         history_unit_id=history_unit_id,
+                         status=HistoryUnitStatus.FAILED,
+                         db_sync=db,
+                     )
+
+                 return None, SHUTDOWN_EXCEPTION
+
+             self._check_no_active_jobs()
+
+             # Validation phase
+             self.validate_submit_parameters(
+                 parameters=parameters,
+                 task_type=task_type,
+             )
+
+             # Create task subfolder
+             logger.debug("[submit] Create local/remote folders - START")
+             self._mkdir_local_folder(folder=workdir_local.as_posix())
+             self._mkdir_remote_folder(folder=workdir_remote.as_posix())
+             logger.debug("[submit] Create local/remote folders - END")
+
+             # Submission phase
+             slurm_job = SlurmJob(
+                 prefix=task_files.prefix,
+                 workdir_local=workdir_local,
+                 workdir_remote=workdir_remote,
+                 tasks=[
+                     SlurmTask(
+                         prefix=task_files.prefix,
+                         index=0,
+                         component=task_files.component,
+                         parameters=parameters,
+                         workdir_remote=workdir_remote,
+                         workdir_local=workdir_local,
+                         task_files=task_files,
+                     )
+                 ],
+             )
+
+             config.parallel_tasks_per_job = 1
+             self._submit_single_sbatch(
+                 func,
+                 slurm_job=slurm_job,
+                 slurm_config=config,
+             )
+             logger.debug(f"[submit] END submission phase, {self.job_ids=}")
+
+             create_accounting_record_slurm(
+                 user_id=user_id,
+                 slurm_job_ids=self.job_ids,
+             )
+
+             # NOTE: see issue 2444
+             settings = Inject(get_settings)
+             sleep_time = settings.FRACTAL_SLURM_INTERVAL_BEFORE_RETRIEVAL
+             logger.warning(f"[submit] Now sleep {sleep_time} seconds.")
+             time.sleep(sleep_time)
+
+             # Retrieval phase
+             logger.debug("[submit] START retrieval phase")
+             scancelled_job_ids = []
+             while len(self.jobs) > 0:
+                 # Look for finished jobs
+                 finished_job_ids = self._get_finished_jobs(
+                     job_ids=self.job_ids
+                 )
+                 logger.debug(f"[submit] {finished_job_ids=}")
+                 finished_jobs = [
+                     self.jobs[_slurm_job_id]
+                     for _slurm_job_id in finished_job_ids
+                 ]
+                 self._fetch_artifacts(finished_jobs)
+                 with next(get_sync_db()) as db:
+                     for slurm_job_id in finished_job_ids:
+                         logger.debug(f"[submit] Now process {slurm_job_id=}")
+                         slurm_job = self.jobs.pop(slurm_job_id)
+                         was_job_scancelled = slurm_job_id in scancelled_job_ids
+                         result, exception = self._postprocess_single_task(
+                             task=slurm_job.tasks[0],
+                             was_job_scancelled=was_job_scancelled,
+                         )
+
+                         if exception is not None:
+                             update_status_of_history_unit(
+                                 history_unit_id=history_unit_id,
+                                 status=HistoryUnitStatus.FAILED,
+                                 db_sync=db,
+                             )
+                         else:
+                             if task_type not in [
+                                 "compound",
+                                 "converter_compound",
+                             ]:
+                                 update_status_of_history_unit(
+                                     history_unit_id=history_unit_id,
+                                     status=HistoryUnitStatus.DONE,
+                                     db_sync=db,
+                                 )
+
+                 if len(self.jobs) > 0:
+                     scancelled_job_ids = self.wait_and_check_shutdown()
+
+             logger.debug("[submit] END")
+             return result, exception
+
+         except Exception as e:
+             logger.error(
+                 f"[submit] Unexpected exception. Original error: {str(e)}"
+             )
+             with next(get_sync_db()) as db:
+                 update_status_of_history_unit(
+                     history_unit_id=history_unit_id,
+                     status=HistoryUnitStatus.FAILED,
+                     db_sync=db,
+                 )
+             self.scancel_jobs()
+             return None, e
+
+     def multisubmit(
+         self,
+         func: callable,
+         list_parameters: list[dict],
+         history_unit_ids: list[int],
+         list_task_files: list[TaskFiles],
+         task_type: Literal["parallel", "compound", "converter_compound"],
+         config: SlurmConfig,
+         user_id: int,
+     ) -> tuple[dict[int, Any], dict[int, BaseException]]:
+         """
+         Note: `list_parameters`, `list_task_files` and `history_unit_ids`
+         have the same size. For parallel tasks, this is also the number of
+         input images, while for compound tasks these can differ.
+         """
+
+         logger.debug(f"[multisubmit] START, {len(list_parameters)=}")
+         try:
+
+             if self.is_shutdown():
+                 if task_type == "parallel":
+                     with next(get_sync_db()) as db:
+                         bulk_update_status_of_history_unit(
+                             history_unit_ids=history_unit_ids,
+                             status=HistoryUnitStatus.FAILED,
+                             db_sync=db,
+                         )
+                 results = {}
+                 exceptions = {
+                     ind: SHUTDOWN_EXCEPTION
+                     for ind in range(len(list_parameters))
+                 }
+                 return results, exceptions
+
+             self._check_no_active_jobs()
+             self.validate_multisubmit_parameters(
+                 list_parameters=list_parameters,
+                 task_type=task_type,
+                 list_task_files=list_task_files,
+                 history_unit_ids=history_unit_ids,
+             )
+
+             workdir_local = list_task_files[0].wftask_subfolder_local
+             workdir_remote = list_task_files[0].wftask_subfolder_remote
+
+             # Create local&remote task subfolders
+             if task_type == "parallel":
+                 self._mkdir_local_folder(workdir_local.as_posix())
+                 self._mkdir_remote_folder(folder=workdir_remote.as_posix())
+
+             results: dict[int, Any] = {}
+             exceptions: dict[int, BaseException] = {}
+
+             # NOTE: chunking has already taken place in `get_slurm_config`,
+             # so that `config.tasks_per_job` is now set.
+
+             # Divide arguments in batches of `tasks_per_job` tasks each
+             tot_tasks = len(list_parameters)
+             args_batches = []
+             batch_size = config.tasks_per_job
+             for ind_chunk in range(0, tot_tasks, batch_size):
+                 args_batches.append(
+                     list_parameters[ind_chunk : ind_chunk + batch_size]  # noqa
+                 )
+             if len(args_batches) != math.ceil(
+                 tot_tasks / config.tasks_per_job
+             ):
+                 raise RuntimeError("Something wrong here while batching tasks")
+
+             # Part 1/3: Iterate over chunks, prepare SlurmJob objects
+             logger.debug("[multisubmit] Prepare `SlurmJob`s.")
+             jobs_to_submit = []
+             for ind_batch, chunk in enumerate(args_batches):
+                 # Read prefix based on the first task of this batch
+                 prefix = list_task_files[ind_batch * batch_size].prefix
+                 tasks = []
+                 for ind_chunk, parameters in enumerate(chunk):
+                     index = (ind_batch * batch_size) + ind_chunk
+                     tasks.append(
+                         SlurmTask(
+                             prefix=prefix,
+                             index=index,
+                             component=list_task_files[index].component,
+                             workdir_local=workdir_local,
+                             workdir_remote=workdir_remote,
+                             parameters=parameters,
+                             zarr_url=parameters["zarr_url"],
+                             task_files=list_task_files[index],
+                         ),
+                     )
+                 jobs_to_submit.append(
+                     SlurmJob(
+                         prefix=prefix,
+                         workdir_local=workdir_local,
+                         workdir_remote=workdir_remote,
+                         tasks=tasks,
+                     )
+                 )
+
+             # NOTE: see issue 2431
+             logger.debug("[multisubmit] Transfer files and submit jobs.")
+             for slurm_job in jobs_to_submit:
+                 self._submit_single_sbatch(
+                     func,
+                     slurm_job=slurm_job,
+                     slurm_config=config,
+                 )
+
+             logger.info(f"[multisubmit] END submission phase, {self.job_ids=}")
+
+             create_accounting_record_slurm(
+                 user_id=user_id,
+                 slurm_job_ids=self.job_ids,
+             )
+
+             settings = Inject(get_settings)
+             sleep_time = settings.FRACTAL_SLURM_INTERVAL_BEFORE_RETRIEVAL
+             logger.warning(f"[multisubmit] Now sleep {sleep_time} seconds.")
+             time.sleep(sleep_time)
+         except Exception as e:
+             logger.error(
+                 "[multisubmit] Unexpected exception during submission."
+                 f" Original error {str(e)}"
+             )
+             self.scancel_jobs()
+             if task_type == "parallel":
+                 with next(get_sync_db()) as db:
+                     bulk_update_status_of_history_unit(
+                         history_unit_ids=history_unit_ids,
+                         status=HistoryUnitStatus.FAILED,
+                         db_sync=db,
+                     )
+             results = {}
+             exceptions = {ind: e for ind in range(len(list_parameters))}
+             return results, exceptions
+
+         # Retrieval phase
+         logger.debug("[multisubmit] START retrieval phase")
+         scancelled_job_ids = []
+         while len(self.jobs) > 0:
+             # Look for finished jobs
+             finished_job_ids = self._get_finished_jobs(job_ids=self.job_ids)
+             logger.debug(f"[multisubmit] {finished_job_ids=}")
+             finished_jobs = [
+                 self.jobs[_slurm_job_id] for _slurm_job_id in finished_job_ids
+             ]
+             fetch_artifacts_exception = None
+             try:
+                 self._fetch_artifacts(finished_jobs)
+             except Exception as e:
+                 logger.error(
+                     "[multisubmit] Unexpected exception in "
+                     "`_fetch_artifacts`. "
+                     f"Original error: {str(e)}"
+                 )
+                 fetch_artifacts_exception = e
+
+             with next(get_sync_db()) as db:
+                 for slurm_job_id in finished_job_ids:
+                     logger.debug(f"[multisubmit] Now process {slurm_job_id=}")
+                     slurm_job = self.jobs.pop(slurm_job_id)
+                     for task in slurm_job.tasks:
+                         logger.debug(
+                             f"[multisubmit] Now process {task.index=}"
+                         )
+                         was_job_scancelled = slurm_job_id in scancelled_job_ids
+                         if fetch_artifacts_exception is not None:
+                             result = None
+                             exception = fetch_artifacts_exception
+                         else:
+                             try:
+                                 (
+                                     result,
+                                     exception,
+                                 ) = self._postprocess_single_task(
+                                     task=task,
+                                     was_job_scancelled=was_job_scancelled,
+                                 )
+                             except Exception as e:
+                                 logger.error(
+                                     "[multisubmit] Unexpected exception in "
+                                     "`_postprocess_single_task`. "
+                                     f"Original error: {str(e)}"
+                                 )
+                                 result = None
+                                 exception = e
+                         # Note: the relevant done/failed check is based on
+                         # whether `exception is None`. The fact that
+                         # `result is None` is not relevant for this purpose.
+                         if exception is not None:
+                             exceptions[task.index] = exception
+                             if task_type == "parallel":
+                                 update_status_of_history_unit(
+                                     history_unit_id=history_unit_ids[
+                                         task.index
+                                     ],
+                                     status=HistoryUnitStatus.FAILED,
+                                     db_sync=db,
+                                 )
+                         else:
+                             results[task.index] = result
+                             if task_type == "parallel":
+                                 update_status_of_history_unit(
+                                     history_unit_id=history_unit_ids[
+                                         task.index
+                                     ],
+                                     status=HistoryUnitStatus.DONE,
+                                     db_sync=db,
+                                 )
+
+             if len(self.jobs) > 0:
+                 scancelled_job_ids = self.wait_and_check_shutdown()
+
+         logger.debug("[multisubmit] END")
+         return results, exceptions
+
+     def check_fractal_server_versions(self) -> None:
+         """
+         Compare fractal-server versions of local/remote Python interpreters.
+         """
+
+         # Skip check when the local and remote interpreters are the same
+         # (notably for some sudo-slurm deployments)
+         if self.python_worker_interpreter == sys.executable:
+             return
+
+         # Fetch remote fractal-server version
+         cmd = (
+             f"{self.python_worker_interpreter} "
+             "-m fractal_server.app.runner.versions"
+         )
+         stdout = self._run_remote_cmd(cmd)
+         remote_version = json.loads(stdout.strip("\n"))["fractal_server"]
+
+         # Verify local/remote version match
+         if remote_version != __VERSION__:
+             error_msg = (
+                 "Fractal-server version mismatch.\n"
+                 "Local interpreter: "
+                 f"({sys.executable}): {__VERSION__}.\n"
+                 "Remote interpreter: "
+                 f"({self.python_worker_interpreter}): {remote_version}."
+             )
+             logger.error(error_msg)
+             raise RuntimeError(error_msg)
+
+     def scancel_jobs(self) -> list[str]:
+         logger.info("[scancel_jobs] START")
+         scancelled_job_ids = self.job_ids
+         if self.jobs:
+             scancel_string = " ".join(scancelled_job_ids)
+             scancel_cmd = f"scancel {scancel_string}"
+             logger.warning(f"[scancel_jobs] {scancel_string}")
+             try:
+                 self._run_remote_cmd(scancel_cmd)
+             except Exception as e:
+                 logger.error(
+                     "[scancel_jobs] `scancel` command failed. "
+                     f"Original error:\n{str(e)}"
+                 )
+         logger.info("[scancel_jobs] END")
+         return scancelled_job_ids
+
+     def validate_slurm_jobs_workdirs(
+         self,
+         slurm_jobs: list[SlurmJob],
+     ) -> None:
+         """
+         Check that a list of `SlurmJob`s have homogeneous working folders.
+         """
+         set_workdir_local = set(_job.workdir_local for _job in slurm_jobs)
+         set_workdir_remote = set(_job.workdir_remote for _job in slurm_jobs)
+         if len(set_workdir_local) > 1:
+             raise ValueError(f"Non-unique values in {set_workdir_local=}.")
+         if len(set_workdir_remote) > 1:
+             raise ValueError(f"Non-unique values in {set_workdir_remote=}.")
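
The `_postprocess_single_task` method in the hunk above unpickles each task's output as a `(success, output)` pair: on success `output` is the task result, on controlled failure it is a dictionary with the required keys `exc_type_name` and `traceback_string` (plus optional `workflow_task_order`, `workflow_task_id`, `task_name`). The snippet below is only a minimal sketch of a worker honoring that convention; the helper name `write_output_pickle` is illustrative and is not the released worker in `executors/slurm_common/remote.py`.

import traceback

import cloudpickle


def write_output_pickle(output_file: str, func, args, kwargs) -> None:
    # Hypothetical sketch: produce the (success, output) pair that
    # BaseSlurmRunner._postprocess_single_task expects to unpickle.
    try:
        result = func(*args, **kwargs)
        payload = (True, result)
    except Exception as e:
        payload = (
            False,
            {
                "exc_type_name": type(e).__name__,
                "traceback_string": traceback.format_exc(),
            },
        )
    with open(output_file, "wb") as f:
        f.write(cloudpickle.dumps(payload))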
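
`BaseSlurmRunner` leaves the transport-specific pieces (`_run_remote_cmd`, `run_squeue`, `_mkdir_local_folder`, `_mkdir_remote_folder`, `_fetch_artifacts`) as `NotImplementedError` hooks; in this release they are filled in by `executors/slurm_sudo/runner.py` and `executors/slurm_ssh/runner.py`. The class below is only a rough sketch of such a subclass, assuming a single-host setup where local and "remote" folders coincide and where plain `subprocess` calls stand in for the package's sudo/SSH machinery; the `squeue` format string is an assumption chosen to match the `job-id state` parsing in `_get_finished_jobs`.

import shlex
import subprocess
import sys
from pathlib import Path

from fractal_server.app.runner.executors.slurm_common.base_slurm_runner import (
    BaseSlurmRunner,
)


class SingleHostSlurmRunner(BaseSlurmRunner):
    # Hypothetical subclass, not part of the released package.

    def __init__(self, root_dir_local: Path, root_dir_remote: Path, **kwargs):
        super().__init__(
            root_dir_local=root_dir_local,
            root_dir_remote=root_dir_remote,
            slurm_runner_type="sudo",
            # Using the current interpreter skips the remote-version check.
            python_worker_interpreter=sys.executable,
            **kwargs,
        )

    def _mkdir_local_folder(self, folder: str) -> None:
        Path(folder).mkdir(parents=True, exist_ok=True)

    def _mkdir_remote_folder(self, folder: str) -> None:
        # Local and "remote" folders coincide on a single host.
        Path(folder).mkdir(parents=True, exist_ok=True)

    def _run_remote_cmd(self, cmd: str) -> str:
        res = subprocess.run(
            shlex.split(cmd), capture_output=True, text=True, check=True
        )
        return res.stdout

    def run_squeue(self, *, job_ids: list[str], **kwargs) -> str:
        # "%i %T" yields "job-id state" lines, matching _get_finished_jobs.
        cmd = (
            "squeue --noheader --format='%i %T' "
            f"--jobs={','.join(job_ids)} --states=all"
        )
        return self._run_remote_cmd(cmd)

    def _fetch_artifacts(self, finished_slurm_jobs) -> None:
        # Nothing to transfer when there is no separate remote host.
        pass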