fractal-server 2.13.1__py3-none-any.whl → 2.14.0a1__py3-none-any.whl

This diff compares the contents of two publicly available package versions, as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in those registries.
Files changed (60)
  1. fractal_server/__init__.py +1 -1
  2. fractal_server/app/history/__init__.py +4 -0
  3. fractal_server/app/history/image_updates.py +142 -0
  4. fractal_server/app/history/status_enum.py +16 -0
  5. fractal_server/app/models/v2/__init__.py +5 -1
  6. fractal_server/app/models/v2/history.py +53 -0
  7. fractal_server/app/routes/api/v2/__init__.py +2 -2
  8. fractal_server/app/routes/api/v2/_aux_functions.py +78 -0
  9. fractal_server/app/routes/api/v2/dataset.py +12 -9
  10. fractal_server/app/routes/api/v2/history.py +247 -0
  11. fractal_server/app/routes/api/v2/project.py +25 -0
  12. fractal_server/app/routes/api/v2/workflow.py +18 -3
  13. fractal_server/app/routes/api/v2/workflowtask.py +22 -0
  14. fractal_server/app/runner/executors/base_runner.py +114 -0
  15. fractal_server/app/runner/{v2/_local → executors/local}/_local_config.py +3 -3
  16. fractal_server/app/runner/executors/local/_submit_setup.py +54 -0
  17. fractal_server/app/runner/executors/local/runner.py +200 -0
  18. fractal_server/app/runner/executors/{slurm → slurm_common}/_batching.py +1 -1
  19. fractal_server/app/runner/executors/{slurm → slurm_common}/_slurm_config.py +3 -3
  20. fractal_server/app/runner/{v2/_slurm_ssh → executors/slurm_common}/_submit_setup.py +13 -12
  21. fractal_server/app/runner/{v2/_slurm_common → executors/slurm_common}/get_slurm_config.py +9 -15
  22. fractal_server/app/runner/executors/{slurm/ssh → slurm_ssh}/_executor_wait_thread.py +1 -1
  23. fractal_server/app/runner/executors/{slurm/ssh → slurm_ssh}/_slurm_job.py +1 -1
  24. fractal_server/app/runner/executors/{slurm/ssh → slurm_ssh}/executor.py +13 -14
  25. fractal_server/app/runner/executors/{slurm/sudo → slurm_sudo}/_check_jobs_status.py +11 -9
  26. fractal_server/app/runner/executors/{slurm/sudo → slurm_sudo}/_executor_wait_thread.py +3 -3
  27. fractal_server/app/runner/executors/{slurm/sudo → slurm_sudo}/_subprocess_run_as_user.py +2 -68
  28. fractal_server/app/runner/executors/slurm_sudo/runner.py +632 -0
  29. fractal_server/app/runner/task_files.py +70 -96
  30. fractal_server/app/runner/v2/__init__.py +5 -19
  31. fractal_server/app/runner/v2/_local.py +84 -0
  32. fractal_server/app/runner/v2/{_slurm_ssh/__init__.py → _slurm_ssh.py} +10 -13
  33. fractal_server/app/runner/v2/{_slurm_sudo/__init__.py → _slurm_sudo.py} +10 -12
  34. fractal_server/app/runner/v2/runner.py +93 -28
  35. fractal_server/app/runner/v2/runner_functions.py +85 -62
  36. fractal_server/app/runner/v2/runner_functions_low_level.py +20 -20
  37. fractal_server/app/schemas/v2/dataset.py +0 -17
  38. fractal_server/app/schemas/v2/history.py +23 -0
  39. fractal_server/config.py +2 -2
  40. fractal_server/migrations/versions/8223fcef886c_image_status.py +63 -0
  41. fractal_server/migrations/versions/87cd72a537a2_add_historyitem_table.py +68 -0
  42. {fractal_server-2.13.1.dist-info → fractal_server-2.14.0a1.dist-info}/METADATA +1 -1
  43. {fractal_server-2.13.1.dist-info → fractal_server-2.14.0a1.dist-info}/RECORD +53 -47
  44. fractal_server/app/routes/api/v2/status.py +0 -168
  45. fractal_server/app/runner/executors/slurm/sudo/executor.py +0 -1281
  46. fractal_server/app/runner/v2/_local/__init__.py +0 -132
  47. fractal_server/app/runner/v2/_local/_submit_setup.py +0 -52
  48. fractal_server/app/runner/v2/_local/executor.py +0 -100
  49. fractal_server/app/runner/v2/_slurm_sudo/_submit_setup.py +0 -83
  50. fractal_server/app/runner/v2/handle_failed_job.py +0 -59
  51. /fractal_server/app/runner/executors/{slurm → local}/__init__.py +0 -0
  52. /fractal_server/app/runner/executors/{slurm/ssh → slurm_common}/__init__.py +0 -0
  53. /fractal_server/app/runner/executors/{_job_states.py → slurm_common/_job_states.py} +0 -0
  54. /fractal_server/app/runner/executors/{slurm → slurm_common}/remote.py +0 -0
  55. /fractal_server/app/runner/executors/{slurm → slurm_common}/utils_executors.py +0 -0
  56. /fractal_server/app/runner/executors/{slurm/sudo → slurm_ssh}/__init__.py +0 -0
  57. /fractal_server/app/runner/{v2/_slurm_common → executors/slurm_sudo}/__init__.py +0 -0
  58. {fractal_server-2.13.1.dist-info → fractal_server-2.14.0a1.dist-info}/LICENSE +0 -0
  59. {fractal_server-2.13.1.dist-info → fractal_server-2.14.0a1.dist-info}/WHEEL +0 -0
  60. {fractal_server-2.13.1.dist-info → fractal_server-2.14.0a1.dist-info}/entry_points.txt +0 -0
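
This release reorganizes the executor packages: `executors/slurm/{ssh,sudo}` is split into `executors/{slurm_common,slurm_ssh,slurm_sudo}`, the local backend moves under `executors/local`, and new `app/history` helpers are introduced. For downstream code, the renames above translate directly into new import paths; a minimal illustration, restricted to paths that actually appear in this diff:

from fractal_server.app.runner.executors.base_runner import BaseRunner
from fractal_server.app.runner.executors.slurm_common._slurm_config import (
    SlurmConfig,  # formerly under fractal_server.app.runner.executors.slurm
)
from fractal_server.app.history import HistoryItemImageStatus
from fractal_server.app.history import update_all_images
from fractal_server.app.history import update_single_image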
fractal_server/app/runner/executors/slurm_sudo/runner.py (new file)
@@ -0,0 +1,632 @@
+import json
+import logging
+import shlex
+import subprocess  # nosec
+import sys
+import time
+from pathlib import Path
+from typing import Any
+from typing import Optional
+
+import cloudpickle
+from pydantic import BaseModel
+from pydantic import ConfigDict
+
+from ._check_jobs_status import get_finished_jobs
+from ._subprocess_run_as_user import _mkdir_as_user
+from ._subprocess_run_as_user import _run_command_as_user
+from fractal_server import __VERSION__
+from fractal_server.app.history import HistoryItemImageStatus
+from fractal_server.app.history import update_all_images
+from fractal_server.app.history import update_single_image
+from fractal_server.app.runner.components import _COMPONENT_KEY_
+from fractal_server.app.runner.exceptions import JobExecutionError
+from fractal_server.app.runner.exceptions import TaskExecutionError
+from fractal_server.app.runner.executors.base_runner import BaseRunner
+from fractal_server.app.runner.executors.slurm_common._slurm_config import (
+    SlurmConfig,
+)
+from fractal_server.app.runner.filenames import SHUTDOWN_FILENAME
+from fractal_server.app.runner.task_files import TaskFiles
+from fractal_server.config import get_settings
+from fractal_server.logger import set_logger
+from fractal_server.syringe import Inject
+
+
+logger = set_logger(__name__)
+
+
+def _handle_exception_proxy(proxy):  # FIXME
+    if proxy.exc_type_name == "JobExecutionError":
+        return JobExecutionError(str(proxy))
+    else:
+        kwargs = {}
+        for key in [
+            "workflow_task_id",
+            "workflow_task_order",
+            "task_name",
+        ]:
+            if key in proxy.kwargs.keys():
+                kwargs[key] = proxy.kwargs[key]
+        return TaskExecutionError(proxy.tb, **kwargs)
+
+
+class SlurmTask(BaseModel):
+    model_config = ConfigDict(arbitrary_types_allowed=True)
+    component: str
+    workdir_local: Path
+    workdir_remote: Path
+    zarr_url: Optional[str] = None
+    task_files: TaskFiles
+
+    @property
+    def input_pickle_file_local(self) -> str:
+        return (
+            self.workdir_local / f"{self.component}-input.pickle"
+        ).as_posix()
+
+    @property
+    def output_pickle_file_local(self) -> str:
+        return (
+            self.workdir_local / f"{self.component}-output.pickle"
+        ).as_posix()
+
+    @property
+    def input_pickle_file_remote(self) -> str:
+        return (
+            self.workdir_remote / f"{self.component}-input.pickle"
+        ).as_posix()
+
+    @property
+    def output_pickle_file_remote(self) -> str:
+        return (
+            self.workdir_remote / f"{self.component}-output.pickle"
+        ).as_posix()
+
+
+class SlurmJob(BaseModel):
+    slurm_job_id: Optional[str] = None
+    label: str
+    workdir_local: Path
+    workdir_remote: Path
+    # Variable-length tuple (single-task jobs for now, batching is a TODO)
+    tasks: tuple[SlurmTask, ...]
+
+    @property
+    def slurm_log_file_local(self) -> str:
+        if self.slurm_job_id:
+            return (
+                self.workdir_local
+                / f"slurm-{self.label}-{self.slurm_job_id}.log"
+            ).as_posix()
+        else:
+            return (
+                self.workdir_local / f"slurm-{self.label}-%j.log"
+            ).as_posix()
+
+    @property
+    def slurm_log_file_remote(self) -> str:
+        if self.slurm_job_id:
+            return (
+                self.workdir_remote
+                / f"slurm-{self.label}-{self.slurm_job_id}.log"
+            ).as_posix()
+        else:
+            return (
+                self.workdir_remote / f"slurm-{self.label}-%j.log"
+            ).as_posix()
+
+    @property
+    def slurm_submission_script_local(self) -> str:
+        return (
+            self.workdir_local / f"slurm-{self.label}-submit.sh"
+        ).as_posix()
+
+    @property
+    def slurm_submission_script_remote(self) -> str:
+        return (
+            self.workdir_remote / f"slurm-{self.label}-submit.sh"
+        ).as_posix()
+
+    @property
+    def log_files_local(self) -> list[str]:
+        return [task.task_files.log_file_local for task in self.tasks]
+
+
+def _subprocess_run_or_raise(
+    full_command: str,
+) -> Optional[subprocess.CompletedProcess]:
+    try:
+        output = subprocess.run(  # nosec
+            shlex.split(full_command),
+            capture_output=True,
+            check=True,
+            encoding="utf-8",
+        )
+        return output
+    except subprocess.CalledProcessError as e:
+        error_msg = (
+            f"Submit command `{full_command}` failed. "
+            f"Original error:\n{str(e)}\n"
+            f"Original stdout:\n{e.stdout}\n"
+            f"Original stderr:\n{e.stderr}\n"
+        )
+        logging.error(error_msg)
+        raise JobExecutionError(info=error_msg)
+
+
+class RunnerSlurmSudo(BaseRunner):
+    slurm_user: str
+    shutdown_file: Path
+    common_script_lines: list[str]
+    user_cache_dir: str
+    root_dir_local: Path
+    root_dir_remote: Path
+    slurm_account: Optional[str] = None
+    slurm_poll_interval: int
+    python_worker_interpreter: str
+    jobs: dict[str, SlurmJob]
+
+    def __init__(
+        self,
+        *,
+        slurm_user: str,
+        root_dir_local: Path,
+        root_dir_remote: Path,
+        slurm_account: Optional[str] = None,
+        common_script_lines: Optional[list[str]] = None,
+        user_cache_dir: Optional[str] = None,
+        slurm_poll_interval: Optional[int] = None,
+    ) -> None:
+        """
+        Set parameters that are the same for different Fractal tasks and for
+        different SLURM jobs/tasks.
+        """
+
+        self.slurm_user = slurm_user
+        self.slurm_account = slurm_account
+        self.common_script_lines = common_script_lines or []
+
+        # Check that SLURM account is not set here
+        # FIXME: move to little method
+        try:
+            invalid_line = next(
+                line
+                for line in self.common_script_lines
+                if line.startswith("#SBATCH --account=")
+            )
+            raise RuntimeError(
+                "Invalid line in `RunnerSlurmSudo.common_script_lines`: "
+                f"'{invalid_line}'.\n"
+                "SLURM account must be set via the request body of the "
+                "apply-workflow endpoint, or by modifying the user properties."
+            )
+        except StopIteration:
+            pass
+
+        # Check Python versions
+        settings = Inject(get_settings)
+        if settings.FRACTAL_SLURM_WORKER_PYTHON is not None:
+            self.check_remote_python_interpreter()
+
+        self.root_dir_local = root_dir_local
+        self.root_dir_remote = root_dir_remote
+
+        # Create folders
+        self.root_dir_local.mkdir(parents=True, exist_ok=True)
+        _mkdir_as_user(
+            folder=self.root_dir_remote.as_posix(),
+            user=self.slurm_user,
+        )
+
+        self.user_cache_dir = user_cache_dir
+
+        self.slurm_poll_interval = (
+            slurm_poll_interval or settings.FRACTAL_SLURM_POLL_INTERVAL
+        )
+
+        self.shutdown_file = self.root_dir_local / SHUTDOWN_FILENAME
+
+        self.python_worker_interpreter = (
+            settings.FRACTAL_SLURM_WORKER_PYTHON or sys.executable
+        )
+
+        self.jobs = {}
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        return False
+
+    def is_shutdown(self) -> bool:
+        return self.shutdown_file.exists()
+
+    def scancel_if_shutdown(self) -> None:
+
+        logger.debug("[scancel_if_shutdown] START")
+
+        if self.jobs:
+            scancel_string = " ".join(self.job_ids)
+            scancel_cmd = f"scancel {scancel_string}"
+            logger.warning(f"Now scancel-ing SLURM jobs {scancel_string}")
+            try:
+                _run_command_as_user(
+                    cmd=scancel_cmd,
+                    user=self.slurm_user,
+                    check=True,
+                )
+            except RuntimeError as e:
+                logger.warning(
+                    "[scancel_if_shutdown] `scancel` command failed. "
+                    f"Original error:\n{str(e)}"
+                )
+
+        logger.debug("[scancel_if_shutdown] END")
+
+    def _submit_single_sbatch(
+        self,
+        func,
+        parameters,  # FIXME this should be per-task
+        slurm_job: SlurmJob,
+        slurm_config: Optional[SlurmConfig] = None,
+    ) -> None:
+
+        if len(slurm_job.tasks) > 1:
+            raise NotImplementedError()
+
+        # Prepare input pickle(s)
+        versions = dict(
+            python=sys.version_info[:3],
+            cloudpickle=cloudpickle.__version__,
+            fractal_server=__VERSION__,
+        )
+        for task in slurm_job.tasks:
+            _args = []
+            # TODO: make parameters task-dependent
+            _kwargs = dict(
+                parameters=parameters
+            )  # FIXME: this should be per-task
+            funcser = cloudpickle.dumps((versions, func, _args, _kwargs))
+            with open(task.input_pickle_file_local, "wb") as f:
+                f.write(funcser)
+
+        # Prepare commands to be included in SLURM submission script
+
+        preamble_lines = [
+            "#!/bin/bash",
+            "#SBATCH --partition=main",
+            "#SBATCH --ntasks=1",
+            "#SBATCH --cpus-per-task=1",
+            "#SBATCH --mem=10M",
+            f"#SBATCH --err={slurm_job.slurm_log_file_remote}",
+            f"#SBATCH --out={slurm_job.slurm_log_file_remote}",
+            f"#SBATCH -D {slurm_job.workdir_remote}",
+            "#SBATCH --job-name=test",
+            "\n",
+        ]
+
+        cmdlines = []
+        for task in slurm_job.tasks:
+            cmd = (
+                f"{self.python_worker_interpreter}"
+                " -m fractal_server.app.runner.executors.slurm_common.remote "
+                f"--input-file {task.input_pickle_file_local} "
+                f"--output-file {task.output_pickle_file_remote}"
+            )
+            cmdlines.append("whoami")
+            cmdlines.append(
+                f"srun --ntasks=1 --cpus-per-task=1 --mem=10MB {cmd} &"
+            )
+            cmdlines.append("wait\n")
+
+        # Write submission script
+        submission_script_contents = "\n".join(preamble_lines + cmdlines)
+        with open(slurm_job.slurm_submission_script_local, "w") as f:
+            f.write(submission_script_contents)
+
+        # Run sbatch
+        pre_command = f"sudo --set-home --non-interactive -u {self.slurm_user}"
+        submit_command = (
+            f"sbatch --parsable {slurm_job.slurm_submission_script_local}"
+        )
+        full_command = f"{pre_command} {submit_command}"
+
+        # Submit SLURM job and retrieve job ID
+        res = _subprocess_run_or_raise(full_command)
+        submitted_job_id = int(res.stdout)
+        slurm_job.slurm_job_id = str(submitted_job_id)
+
+        # Add job to self.jobs
+        self.jobs[slurm_job.slurm_job_id] = slurm_job
+
+    @property
+    def job_ids(self) -> list[str]:
+        return list(self.jobs.keys())
+
+    def _copy_files_from_remote_to_local(self, job: SlurmJob) -> None:
+        """
+        Note: this would differ for SSH
+        """
+        source_target_list = [
+            (job.slurm_log_file_remote, job.slurm_log_file_local)
+        ]
+        for task in job.tasks:
+            # `list.append` takes a single item; collect the per-task
+            # (remote, local) pairs with `extend`
+            source_target_list.extend(
+                [
+                    (
+                        task.output_pickle_file_remote,
+                        task.output_pickle_file_local,
+                    ),
+                    (
+                        task.task_files.log_file_remote,
+                        task.task_files.log_file_local,
+                    ),
+                    (
+                        task.task_files.args_file_remote,
+                        task.task_files.args_file_local,
+                    ),
+                    (
+                        task.task_files.metadiff_file_remote,
+                        task.task_files.metadiff_file_local,
+                    ),
+                ]
+            )
+
+        for source, target in source_target_list:
+            # NOTE: By setting encoding=None, we read/write bytes instead
+            # of strings; this is needed to also handle pickle files.
+            try:
+                res = _run_command_as_user(
+                    cmd=f"cat {source}",
+                    user=self.slurm_user,
+                    encoding=None,
+                    check=True,
+                )
+                # Write local file
+                with open(target, "wb") as f:
+                    f.write(res.stdout)
+                logger.debug(f"Copied {source} into {target}")
+            except RuntimeError as e:
+                logger.warning(
+                    f"SKIP copy {source} into {target}. "
+                    f"Original error: {str(e)}"
+                )
+
+    def _postprocess_single_task(
+        self, *, task: SlurmTask
+    ) -> tuple[Any, Exception]:
+        try:
+            with open(task.output_pickle_file_local, "rb") as f:
+                outdata = f.read()
+            success, output = cloudpickle.loads(outdata)
+            if success:
+                result = output
+                return result, None
+            else:
+                exception = _handle_exception_proxy(output)
+                return None, exception
+        except Exception as e:
+            exception = JobExecutionError(f"ERROR, {str(e)}")
+            return None, exception
+        finally:
+            Path(task.input_pickle_file_local).unlink(missing_ok=True)
+            Path(task.output_pickle_file_local).unlink(missing_ok=True)
+
+    def submit(
+        self,
+        func: callable,
+        parameters: dict[str, Any],
+        history_item_id: int,
+        task_files: TaskFiles,
+        in_compound_task: bool = False,
+        slurm_config: Optional[SlurmConfig] = None,
+        **kwargs,
+    ) -> tuple[Any, Exception]:
+
+        workdir_local = task_files.wftask_subfolder_local
+        workdir_remote = task_files.wftask_subfolder_remote
+
+        task_files = TaskFiles(
+            **task_files.model_dump(
+                exclude={"component"},
+            ),
+            component=parameters[_COMPONENT_KEY_],
+        )
+
+        if self.jobs != {}:
+            if not in_compound_task:
+                update_all_images(
+                    history_item_id=history_item_id,
+                    status=HistoryItemImageStatus.FAILED,
+                )
+            raise JobExecutionError("Unexpected branch: jobs should be empty.")
+
+        if self.is_shutdown():
+            if not in_compound_task:
+                update_all_images(
+                    history_item_id=history_item_id,
+                    status=HistoryItemImageStatus.FAILED,
+                )
+            raise JobExecutionError("Cannot continue after shutdown.")
+
+        # Validation phase
+        self.validate_submit_parameters(parameters)
+
+        # Create task subfolder
+        workdir_local.mkdir(parents=True, exist_ok=True)
+        _mkdir_as_user(
+            folder=workdir_remote.as_posix(),
+            user=self.slurm_user,
+        )
+
+        # Submission phase
+        slurm_job = SlurmJob(
+            label="0",
+            workdir_local=workdir_local,
+            workdir_remote=workdir_remote,
+            tasks=[
+                SlurmTask(
+                    component="0",
+                    workdir_remote=workdir_remote,
+                    workdir_local=workdir_local,
+                    task_files=task_files,
+                )
+            ],
+        )  # TODO: replace with actual values (BASED ON TASKFILES)
+        self._submit_single_sbatch(
+            func,
+            parameters=parameters,
+            slurm_job=slurm_job,
+            slurm_config=slurm_config,
+        )
+
+        LOGFILE = task_files.log_file_local
+
+        # Retrieval phase
+        while len(self.jobs) > 0:
+            if self.is_shutdown():
+                self.scancel_if_shutdown()
+            finished_job_ids = get_finished_jobs(job_ids=self.job_ids)
+            for slurm_job_id in finished_job_ids:
+                slurm_job = self.jobs.pop(slurm_job_id)
+                self._copy_files_from_remote_to_local(slurm_job)
+                result, exception = self._postprocess_single_task(
+                    task=slurm_job.tasks[0]
+                )
+            time.sleep(self.slurm_poll_interval)
+
+        if not in_compound_task:
+            if exception is None:
+                update_all_images(
+                    history_item_id=history_item_id,
+                    status=HistoryItemImageStatus.DONE,
+                    logfile=LOGFILE,
+                )
+            else:
+                update_all_images(
+                    history_item_id=history_item_id,
+                    status=HistoryItemImageStatus.FAILED,
+                    logfile=LOGFILE,
+                )
+
+        return result, exception
+
+    def multisubmit(
+        self,
+        func: callable,
+        list_parameters: list[dict],
+        history_item_id: int,
+        task_files: TaskFiles,
+        in_compound_task: bool = False,
+        **kwargs,
+    ):
+        self.scancel_if_shutdown()
+
+        self.validate_multisubmit_parameters(
+            list_parameters=list_parameters,
+            in_compound_task=in_compound_task,
+        )
+
+        workdir_local = task_files.wftask_subfolder_local
+        workdir_remote = task_files.wftask_subfolder_remote
+
+        # Create folders
+        workdir_local.mkdir(parents=True, exist_ok=True)
+        _mkdir_as_user(
+            folder=workdir_remote.as_posix(),
+            user=self.slurm_user,
+        )
+
+        # Execute tasks, in chunks of size `parallel_tasks_per_job`
+        # TODO Pick a data structure for results and exceptions, or review the
+        # interface
+        results = []
+        exceptions = []
+        jobs: dict[str, SlurmJob] = {}
+
+        original_task_files = task_files
+        # TODO: Add batching
+        for ind, parameters in enumerate(list_parameters):
+            # TODO: replace with actual values
+
+            component = parameters[_COMPONENT_KEY_]
+            slurm_job = SlurmJob(
+                label=f"{ind:06d}",
+                workdir_local=workdir_local,
+                workdir_remote=workdir_remote,
+                tasks=[
+                    SlurmTask(
+                        component=component,
+                        workdir_local=workdir_local,
+                        workdir_remote=workdir_remote,
+                        zarr_url=parameters["zarr_url"],
+                        task_files=TaskFiles(
+                            **original_task_files.model_dump(
+                                exclude={"component"},
+                            ),
+                            component=component,
+                        ),
+                    )
+                ],
+            )
+            # `_submit_single_sbatch` assigns `slurm_job.slurm_job_id` and
+            # registers the job in `self.jobs`
+            self._submit_single_sbatch(
+                func,
+                parameters=parameters,
+                slurm_job=slurm_job,
+            )
+            jobs[slurm_job.slurm_job_id] = slurm_job
+
+        # Retrieval phase
+        while len(jobs) > 0:
+            if self.is_shutdown():
+                self.scancel_if_shutdown()
+            remaining_jobs = list(self.job_ids)
+            finished_jobs = get_finished_jobs(job_ids=remaining_jobs)
+            for slurm_job_id in finished_jobs:
+                slurm_job = jobs.pop(slurm_job_id)
+                # Keep `self.jobs` in sync with the local `jobs` dict
+                self.jobs.pop(slurm_job_id, None)
+                self._copy_files_from_remote_to_local(slurm_job)
+                for task in slurm_job.tasks:
+                    result, exception = self._postprocess_single_task(
+                        task=task
+                    )
+                    if not in_compound_task:
+                        if exception is None:
+                            update_single_image(
+                                zarr_url=task.zarr_url,
+                                history_item_id=history_item_id,
+                                status=HistoryItemImageStatus.DONE,
+                                logfile=task.task_files.log_file_local,
+                            )
+                        else:
+                            update_single_image(
+                                zarr_url=task.zarr_url,
+                                history_item_id=history_item_id,
+                                status=HistoryItemImageStatus.FAILED,
+                                logfile=task.task_files.log_file_local,
+                            )
+                    # TODO: Now just appending, but this should be done better
+                    results.append(result)
+                    exceptions.append(exception)
+            time.sleep(self.slurm_poll_interval)
+        return results, exceptions
+
+    def check_remote_python_interpreter(self):
+        """
+        Check fractal-server version on the _remote_ Python interpreter.
+        """
+        settings = Inject(get_settings)
+        output = _subprocess_run_or_raise(
+            (
+                f"{settings.FRACTAL_SLURM_WORKER_PYTHON} "
+                "-m fractal_server.app.runner.versions"
+            )
+        )
+        runner_version = json.loads(output.stdout.strip("\n"))[
+            "fractal_server"
+        ]
+        if runner_version != __VERSION__:
+            error_msg = (
+                "Fractal-server version mismatch.\n"
+                "Local interpreter: "
+                f"({sys.executable}): {__VERSION__}.\n"
+                "Remote interpreter: "
+                f"({settings.FRACTAL_SLURM_WORKER_PYTHON}): {runner_version}."
+            )
+            logger.error(error_msg)
+            raise RuntimeError(error_msg)
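
For orientation, a minimal sketch of how the new runner could be driven, based only on the `__init__` and `submit` signatures above. The task function, the `TaskFiles` instance, and all literal values are hypothetical placeholders (the `TaskFiles` constructor fields are not shown in this diff), and `_COMPONENT_KEY_` is the key that `submit` reads from `parameters`:

from pathlib import Path

from fractal_server.app.runner.components import _COMPONENT_KEY_
from fractal_server.app.runner.executors.slurm_sudo.runner import RunnerSlurmSudo
from fractal_server.app.runner.task_files import TaskFiles


def my_task(parameters: dict) -> dict:
    # Hypothetical task function; real tasks are shipped as cloudpickle payloads
    return {}


# Hypothetical TaskFiles instance, assumed to be built by the caller
my_task_files: TaskFiles = ...

with RunnerSlurmSudo(
    slurm_user="worker",                        # hypothetical SLURM user
    root_dir_local=Path("/srv/fractal/jobs"),   # server-side dir (hypothetical)
    root_dir_remote=Path("/home/worker/jobs"),  # user-side dir, created via sudo
) as runner:
    # submit() writes the input pickle, sbatch-es a single-task job as
    # `slurm_user`, polls until completion, and updates the history images
    result, exception = runner.submit(
        func=my_task,
        parameters={_COMPONENT_KEY_: "0"},      # key required by submit()
        history_item_id=1,                      # hypothetical HistoryItem id
        task_files=my_task_files,
    )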