fractal-server 2.14.0a10__py3-none-any.whl → 2.14.0a11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (30)
  1. fractal_server/__init__.py +1 -1
  2. fractal_server/app/routes/api/v2/submit.py +1 -1
  3. fractal_server/app/runner/components.py +0 -3
  4. fractal_server/app/runner/exceptions.py +4 -0
  5. fractal_server/app/runner/executors/base_runner.py +16 -17
  6. fractal_server/app/runner/executors/local/{_local_config.py → get_local_config.py} +0 -7
  7. fractal_server/app/runner/executors/local/runner.py +117 -58
  8. fractal_server/app/runner/executors/slurm_common/_check_jobs_status.py +4 -0
  9. fractal_server/app/runner/executors/slurm_ssh/executor.py +7 -5
  10. fractal_server/app/runner/executors/slurm_ssh/runner.py +6 -10
  11. fractal_server/app/runner/executors/slurm_sudo/runner.py +201 -96
  12. fractal_server/app/runner/task_files.py +8 -0
  13. fractal_server/app/runner/v2/__init__.py +0 -366
  14. fractal_server/app/runner/v2/_local.py +2 -2
  15. fractal_server/app/runner/v2/_slurm_ssh.py +2 -2
  16. fractal_server/app/runner/v2/_slurm_sudo.py +2 -2
  17. fractal_server/app/runner/v2/db_tools.py +87 -0
  18. fractal_server/app/runner/v2/runner.py +77 -81
  19. fractal_server/app/runner/v2/runner_functions.py +274 -436
  20. fractal_server/app/runner/v2/runner_functions_low_level.py +37 -39
  21. fractal_server/app/runner/v2/submit_workflow.py +366 -0
  22. fractal_server/app/runner/v2/task_interface.py +31 -0
  23. {fractal_server-2.14.0a10.dist-info → fractal_server-2.14.0a11.dist-info}/METADATA +1 -1
  24. {fractal_server-2.14.0a10.dist-info → fractal_server-2.14.0a11.dist-info}/RECORD +27 -28
  25. fractal_server/app/runner/executors/local/_submit_setup.py +0 -46
  26. fractal_server/app/runner/executors/slurm_common/_submit_setup.py +0 -84
  27. fractal_server/app/runner/v2/_db_tools.py +0 -48
  28. {fractal_server-2.14.0a10.dist-info → fractal_server-2.14.0a11.dist-info}/LICENSE +0 -0
  29. {fractal_server-2.14.0a10.dist-info → fractal_server-2.14.0a11.dist-info}/WHEEL +0 -0
  30. {fractal_server-2.14.0a10.dist-info → fractal_server-2.14.0a11.dist-info}/entry_points.txt +0 -0
@@ -1 +1 @@
- __VERSION__ = "2.14.0a10"
+ __VERSION__ = "2.14.0a11"
@@ -30,7 +30,7 @@ from fractal_server.app.routes.aux.validate_user_settings import (
  from fractal_server.app.runner.set_start_and_last_task_index import (
      set_start_and_last_task_index,
  )
- from fractal_server.app.runner.v2 import submit_workflow
+ from fractal_server.app.runner.v2.submit_workflow import submit_workflow
  from fractal_server.app.schemas.v2 import JobCreateV2
  from fractal_server.app.schemas.v2 import JobReadV2
  from fractal_server.app.schemas.v2 import JobStatusTypeV2
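For downstream code, only the import path changes: `submit_workflow` moved out of the `fractal_server.app.runner.v2` package `__init__` (files 13 and 21 in the list above) into its own module. A minimal before/after sketch:

    # Old (2.14.0a10), re-exported from the package __init__:
    #   from fractal_server.app.runner.v2 import submit_workflow

    # New (2.14.0a11), from the dedicated module:
    from fractal_server.app.runner.v2.submit_workflow import submit_workflow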
@@ -1,5 +1,2 @@
  def _index_to_component(ind: int) -> str:
      return f"{ind:07d}"
-
-
- _COMPONENT_KEY_ = "__FRACTAL_PARALLEL_COMPONENT__"
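With `_COMPONENT_KEY_` gone, only the zero-padding helper survives in `components.py`; for instance:

    from fractal_server.app.runner.components import _index_to_component

    assert _index_to_component(3) == "0000003"  # f"{3:07d}"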
@@ -37,6 +37,10 @@ class TaskExecutionError(RuntimeError):
          self.task_name = task_name


+ class TaskOutputValidationError(ValueError):
+     pass
+
+
  class JobExecutionError(RuntimeError):
      """
      Forwards errors in the execution of a task that are due to external factors
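The new `TaskOutputValidationError` is a plain `ValueError` subclass; `task_interface.py` (+31 lines in this release) is a plausible consumer. A hedged sketch of the intended usage, with a hypothetical `TaskOutput` model and `parse_task_output` helper that are not part of this diff:

    from pydantic import BaseModel, ValidationError

    from fractal_server.app.runner.exceptions import TaskOutputValidationError


    class TaskOutput(BaseModel):  # hypothetical stand-in model
        image_list_updates: list[dict] = []


    def parse_task_output(raw: dict) -> TaskOutput:
        # Re-raise Pydantic validation failures as the runner-specific
        # error, so callers can tell malformed task output apart from
        # generic task/job execution failures.
        try:
            return TaskOutput(**raw)
        except ValidationError as e:
            raise TaskOutputValidationError(str(e))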
@@ -1,6 +1,6 @@
  from typing import Any

- from fractal_server.app.runner.components import _COMPONENT_KEY_
+ from fractal_server.app.runner.task_files import TaskFiles
  from fractal_server.app.schemas.v2.task import TaskTypeType


@@ -29,9 +29,10 @@ class BaseRunner(object):
          self,
          func: callable,
          parameters: dict[str, Any],
-         history_item_id: int,
+         history_unit_id: int,
+         task_files: TaskFiles,
          task_type: TaskTypeType,
-         **kwargs,
+         config: Any,
      ) -> tuple[Any, BaseException]:
          """
          Run a single fractal task.
@@ -45,7 +46,7 @@
              history_item_id:
                  Database ID of the corresponding `HistoryItemV2` entry.
              task_type: Task type.
-             kwargs: Runner-specific parameters.
+             config: Runner-specific parameters.
          """
          raise NotImplementedError()

@@ -53,9 +54,10 @@
          self,
          func: callable,
          list_parameters: list[dict[str, Any]],
-         history_item_id: int,
+         history_unit_ids: list[int],
+         list_task_files: list[TaskFiles],
          task_type: TaskTypeType,
-         **kwargs,
+         config: Any,
      ) -> tuple[dict[int, Any], dict[int, BaseException]]:
          """
          Run a parallel fractal task.
@@ -70,7 +72,7 @@
              history_item_id:
                  Database ID of the corresponding `HistoryItemV2` entry.
              task_type: Task type.
-             kwargs: Runner-specific parameters.
+             config: Runner-specific parameters.
          """
          raise NotImplementedError()

@@ -101,15 +103,11 @@
                  f"Forbidden 'zarr_urls' key in {list(parameters.keys())}"
              )

-         if _COMPONENT_KEY_ not in parameters.keys():
-             raise ValueError(
-                 f"No '{_COMPONENT_KEY_}' key in in {list(parameters.keys())}"
-             )
-
      def validate_multisubmit_parameters(
          self,
          list_parameters: list[dict[str, Any]],
          task_type: TaskTypeType,
+         list_task_files: list[TaskFiles],
      ) -> None:
          """
          Validate parameters for `multi_submit` method
@@ -121,6 +119,12 @@
          if task_type not in TASK_TYPES_MULTISUBMIT:
              raise ValueError(f"Invalid {task_type=} for `multisubmit`.")

+         subfolders = set(
+             task_file.wftask_subfolder_local for task_file in list_task_files
+         )
+         if len(subfolders) != 1:
+             raise ValueError(f"More than one subfolders: {subfolders}.")
+
          if not isinstance(list_parameters, list):
              raise ValueError("`parameters` must be a list.")

@@ -131,11 +135,6 @@
                  raise ValueError(
                      f"No 'zarr_url' key in in {list(single_kwargs.keys())}"
                  )
-             if _COMPONENT_KEY_ not in single_kwargs.keys():
-                 raise ValueError(
-                     f"No '{_COMPONENT_KEY_}' key "
-                     f"in {list(single_kwargs.keys())}"
-                 )
          if task_type == "parallel":
              zarr_urls = [kwargs["zarr_url"] for kwargs in list_parameters]
              if len(zarr_urls) != len(set(zarr_urls)):
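Taken together, the `BaseRunner` changes replace the old `history_item_id` plus `**kwargs` interface with explicit `history_unit_id(s)`, `task_files`, and `config` arguments, and drop the `_COMPONENT_KEY_` bookkeeping. A hedged sketch of the new call shapes; `runner`, `run_task`, `parameters`, `task_files`, and `backend_config` are assumed to exist and are not defined in this diff:

    # Single (non-parallel) task:
    result, exception = runner.submit(
        func=run_task,
        parameters=parameters,
        history_unit_id=42,          # was: history_item_id
        task_files=task_files,       # new explicit TaskFiles argument
        task_type="non_parallel",
        config=backend_config,       # was: **kwargs
    )

    # Parallel task, one HistoryUnit and one TaskFiles per element:
    results, exceptions = runner.multisubmit(
        func=run_task,
        list_parameters=list_parameters,
        history_unit_ids=history_unit_ids,
        list_task_files=list_task_files,
        task_type="parallel",
        config=backend_config,
    )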
@@ -48,13 +48,6 @@ class LocalBackendConfig(BaseModel):
      parallel_tasks_per_job: Optional[int] = None


- def get_default_local_backend_config():
-     """
-     Return a default `LocalBackendConfig` configuration object
-     """
-     return LocalBackendConfig(parallel_tasks_per_job=None)
-
-
  def get_local_backend_config(
      wftask: WorkflowTaskV2,
      which_type: Literal["non_parallel", "parallel"],
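With `get_default_local_backend_config()` removed (and `_local_config.py` renamed to `get_local_config.py`, file 6 above), a default config is now a direct instantiation:

    from fractal_server.app.runner.executors.local.get_local_config import (
        LocalBackendConfig,
    )

    # Equivalent to the removed helper; `parallel_tasks_per_job=None` makes
    # multisubmit fall back to running all elements in a single chunk.
    config = LocalBackendConfig(parallel_tasks_per_job=None)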
@@ -2,14 +2,14 @@ from concurrent.futures import Future
  from concurrent.futures import ThreadPoolExecutor
  from pathlib import Path
  from typing import Any
- from typing import Optional
+ from typing import Literal

- from ._local_config import get_default_local_backend_config
- from ._local_config import LocalBackendConfig
- from fractal_server.app.runner.components import _COMPONENT_KEY_
+ from .get_local_config import LocalBackendConfig
+ from fractal_server.app.db import get_sync_db
  from fractal_server.app.runner.executors.base_runner import BaseRunner
  from fractal_server.app.runner.task_files import TaskFiles
- from fractal_server.app.schemas.v2.task import TaskTypeType
+ from fractal_server.app.runner.v2.db_tools import update_status_of_history_unit
+ from fractal_server.app.schemas.v2 import HistoryUnitStatus
  from fractal_server.logger import set_logger

  logger = set_logger(__name__)
@@ -49,67 +49,106 @@ class LocalRunner(BaseRunner):
          self,
          func: callable,
          parameters: dict[str, Any],
+         history_unit_id: int,
          task_files: TaskFiles,
-         task_type: TaskTypeType,
-         local_backend_config: Optional[LocalBackendConfig] = None,
+         task_type: Literal[
+             "non_parallel",
+             "converter_non_parallel",
+             "compound",
+             "converter_compound",
+         ],
+         config: LocalBackendConfig,
      ) -> tuple[Any, Exception]:
          logger.debug("[submit] START")

-         current_task_files = TaskFiles(
-             **task_files.model_dump(
-                 exclude={"component"},
-             ),
-             component=parameters[_COMPONENT_KEY_],
-         )
-
          self.validate_submit_parameters(parameters, task_type=task_type)
-         workdir_local = current_task_files.wftask_subfolder_local
+         workdir_local = task_files.wftask_subfolder_local
          workdir_local.mkdir()

          # SUBMISSION PHASE
-         future = self.executor.submit(func, parameters=parameters)
+         future = self.executor.submit(
+             func,
+             parameters=parameters,
+             remote_files=task_files.remote_files_dict,
+         )

          # RETRIEVAL PHASE
-         try:
-             result = future.result()
-             logger.debug(f"[submit] END {result=}")
-             return result, None
-         except Exception as e:
-             exception = e
-             logger.debug(f"[submit] END {exception=}")
-             return None, exception
+         with next(get_sync_db()) as db:
+             try:
+                 result = future.result()
+                 logger.debug("[submit] END with result")
+                 if task_type not in ["compound", "converter_compound"]:
+                     update_status_of_history_unit(
+                         history_unit_id=history_unit_id,
+                         status=HistoryUnitStatus.DONE,
+                         db_sync=db,
+                     )
+                 return result, None
+             except Exception as e:
+                 exception = e
+                 logger.debug("[submit] END with exception")
+                 update_status_of_history_unit(
+                     history_unit_id=history_unit_id,
+                     status=HistoryUnitStatus.FAILED,
+                     db_sync=db,
+                 )
+
+                 return None, exception

      def multisubmit(
          self,
          func: callable,
          list_parameters: list[dict],
-         task_files: TaskFiles,
-         task_type: TaskTypeType,
-         local_backend_config: Optional[LocalBackendConfig] = None,
+         history_unit_ids: list[int],
+         list_task_files: list[TaskFiles],
+         task_type: Literal["parallel", "compound", "converter_compound"],
+         config: LocalBackendConfig,
      ):
+         """
+         Note:
+
+         1. The number of sruns and futures is equal to `len(list_parameters)`.
+         2. The number of `HistoryUnit`s is equal to `len(history_unit_ids)`.
+         3. For compound tasks, these two numbers are not the same.
+
+         For this reason, we defer database updates to the caller function,
+         when we are in one of the "compound" cases
+
+         """
+         # FIXME: De-duplicate this check
+         if task_type in ["compound", "converter_compound"]:
+             if len(history_unit_ids) != 1:
+                 raise NotImplementedError(
+                     "We are breaking the assumption that compound/multisubmit "
+                     "is associated to a single HistoryUnit. This is not "
+                     "supported."
+                 )
+         elif task_type == "parallel" and len(history_unit_ids) != len(
+             list_parameters
+         ):
+             raise ValueError(
+                 f"{len(history_unit_ids)=} differs from "
+                 f"{len(list_parameters)=}."
+             )
+
          logger.debug(f"[multisubmit] START, {len(list_parameters)=}")

          self.validate_multisubmit_parameters(
              list_parameters=list_parameters,
              task_type=task_type,
+             list_task_files=list_task_files,
          )

-         workdir_local = task_files.wftask_subfolder_local
-         if task_type not in ["compound", "converter_compound"]:
+         workdir_local = list_task_files[0].wftask_subfolder_local
+         if task_type == "parallel":
              workdir_local.mkdir()

-         # Get local_backend_config
-         if local_backend_config is None:
-             local_backend_config = get_default_local_backend_config()
-
          # Set `n_elements` and `parallel_tasks_per_job`
          n_elements = len(list_parameters)
-         parallel_tasks_per_job = local_backend_config.parallel_tasks_per_job
+         parallel_tasks_per_job = config.parallel_tasks_per_job
          if parallel_tasks_per_job is None:
              parallel_tasks_per_job = n_elements

-         original_task_files = task_files
-
          # Execute tasks, in chunks of size `parallel_tasks_per_job`
          results: dict[int, Any] = {}
          exceptions: dict[int, BaseException] = {}
@@ -119,37 +158,57 @@ class LocalRunner(BaseRunner):
              ]

              active_futures: dict[int, Future] = {}
-             active_task_files: dict[int, TaskFiles] = {}
              for ind_within_chunk, kwargs in enumerate(list_parameters_chunk):
                  positional_index = ind_chunk + ind_within_chunk
-                 component = kwargs[_COMPONENT_KEY_]
-                 future = self.executor.submit(func, parameters=kwargs)
-                 active_futures[positional_index] = future
-                 active_task_files[positional_index] = TaskFiles(
-                     **original_task_files.model_dump(exclude={"component"}),
-                     component=component,
+                 future = self.executor.submit(
+                     func,
+                     parameters=kwargs,
+                     remote_files=list_task_files[
+                         positional_index
+                     ].remote_files_dict,
                  )
+                 active_futures[positional_index] = future

              while active_futures:
                  # FIXME: add shutdown detection
                  # if file exists: cancel all futures, and raise
                  finished_futures = [
-                     keyval
-                     for keyval in active_futures.items()
-                     if not keyval[1].running()
+                     index_and_future
+                     for index_and_future in active_futures.items()
+                     if not index_and_future[1].running()
                  ]
-                 for positional_index, fut in finished_futures:
-                     active_futures.pop(positional_index)
-                     # current_task_files = active_task_files.pop(
-                     #     positional_index
-                     # )
-                     zarr_url = list_parameters[positional_index]["zarr_url"]
-                     try:
-                         results[positional_index] = fut.result()
-                         print(f"Mark {zarr_url=} as done, {kwargs}")
-                     except Exception as e:
-                         print(f"Mark {zarr_url=} as failed, {kwargs} - {e}")
-                         exceptions[positional_index] = e
+                 if len(finished_futures) == 0:
+                     continue
+
+                 with next(get_sync_db()) as db:
+                     for positional_index, fut in finished_futures:
+                         active_futures.pop(positional_index)
+                         if task_type == "parallel":
+                             current_history_unit_id = history_unit_ids[
+                                 positional_index
+                             ]
+
+                         try:
+                             results[positional_index] = fut.result()
+                             if task_type == "parallel":
+                                 update_status_of_history_unit(
+                                     history_unit_id=current_history_unit_id,
+                                     status=HistoryUnitStatus.DONE,
+                                     db_sync=db,
+                                 )
+
+                         except Exception as e:
+                             exceptions[positional_index] = e
+                             if task_type == "parallel":
+                                 update_status_of_history_unit(
+                                     history_unit_id=current_history_unit_id,
+                                     status=HistoryUnitStatus.FAILED,
+                                     db_sync=db,
+                                 )
+
+                             # FIXME: what should happen here? Option 1: stop
+                             # all existing tasks and shutdown runner (for the
+                             # compound-task case)

          logger.debug(f"[multisubmit] END, {results=}, {exceptions=}")

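A hedged usage sketch of the reworked `LocalRunner.multisubmit` for a "parallel" task; `runner`, `run_single_task`, and `list_task_files` are illustrative placeholders rather than code from this release:

    from fractal_server.app.runner.executors.local.get_local_config import (
        LocalBackendConfig,
    )

    list_parameters = [{"zarr_url": f"/data/plate.zarr/{i}"} for i in range(4)]
    results, exceptions = runner.multisubmit(
        func=run_single_task,
        list_parameters=list_parameters,
        history_unit_ids=[101, 102, 103, 104],  # one per "parallel" element
        list_task_files=list_task_files,        # one TaskFiles per element
        task_type="parallel",
        # Elements run in chunks of two; each HistoryUnit row is set to DONE
        # or FAILED as its future completes.
        config=LocalBackendConfig(parallel_tasks_per_job=2),
    )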
@@ -32,6 +32,10 @@ def run_squeue(job_ids: list[str]) -> subprocess.CompletedProcess:
      return res


+ def are_all_jobs_on_squeue(job_ids: list[str]) -> bool:
+     pass
+
+
  def get_finished_jobs(job_ids: list[str]) -> set[str]:
      """
      Check which ones of the given Slurm jobs already finished
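The new `are_all_jobs_on_squeue` ships as a stub (its body is `pass`). A possible implementation sketch, assuming `run_squeue` captures text output with one job ID as the first token of each line; this is not the released code:

    def are_all_jobs_on_squeue(job_ids: list[str]) -> bool:
        # Compare requested job IDs against those currently listed by squeue.
        res = run_squeue(job_ids)
        listed = {
            line.split()[0]
            for line in res.stdout.splitlines()
            if line.strip()
        }
        return set(job_ids) <= listed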
@@ -24,7 +24,6 @@ from ..slurm_common.utils_executors import get_pickle_file_path
  from ..slurm_common.utils_executors import get_slurm_file_path
  from ..slurm_common.utils_executors import get_slurm_script_file_path
  from ._executor_wait_thread import FractalSlurmSSHWaitThread
- from fractal_server.app.runner.components import _COMPONENT_KEY_
  from fractal_server.app.runner.compress_folder import compress_folder
  from fractal_server.app.runner.exceptions import JobExecutionError
  from fractal_server.app.runner.exceptions import TaskExecutionError
@@ -526,10 +525,13 @@ class FractalSlurmSSHExecutor(Executor):
          # `component = {"zarr_url": "/something", "param": 1}``). The
          # try/except covers the case of e.g. `executor.map([1, 2])`,
          # which is useful for testing.
-         try:
-             actual_component = component.get(_COMPONENT_KEY_, None)
-         except AttributeError:
-             actual_component = str(component)
+
+         # FIXME: the use of _COMPONENT_KEY_ is now deprecated
+         # try:
+         #     actual_component = component.get(_COMPONENT_KEY_, None)
+         # except AttributeError:
+         #     actual_component = str(component)
+         actual_component = "FAKE_INVALID_VALUE_FIXME"

          _task_file_paths = TaskFiles(
              root_dir_local=task_files.workflow_dir_local,
@@ -13,7 +13,6 @@ from pydantic import ConfigDict

  from ._check_job_status_ssh import get_finished_jobs_ssh
  from fractal_server import __VERSION__
- from fractal_server.app.runner.components import _COMPONENT_KEY_
  from fractal_server.app.runner.exceptions import JobExecutionError
  from fractal_server.app.runner.exceptions import TaskExecutionError
  from fractal_server.app.runner.executors.base_runner import BaseRunner
@@ -31,11 +30,6 @@ from fractal_server.logger import set_logger
  from fractal_server.ssh._fabric import FractalSSH
  from fractal_server.syringe import Inject

- # from fractal_server.app.history import ImageStatus
- # from fractal_server.app.history import update_all_images
- # from fractal_server.app.history import update_single_image
- # from fractal_server.app.history import update_single_image_logfile
-

  logger = set_logger(__name__)

@@ -500,7 +494,9 @@ class RunnerSlurmSSH(BaseRunner):
              **task_files.model_dump(
                  exclude={"component"},
              ),
-             component=parameters[_COMPONENT_KEY_],
+             # FIXME _COMPONENT_KEY_ is deprecated
+             component="FIXME_INVALID_FAKE_VALUE",
+             # component=parameters[_COMPONENT_KEY_],
          )

          if self.jobs != {}:
@@ -546,8 +542,6 @@
              slurm_config=slurm_config,
          )

-         # LOGFILE = task_files.log_file_local
-
          # Retrieval phase
          while len(self.jobs) > 0:
              if self.is_shutdown():
@@ -638,7 +632,9 @@
              # TODO: replace with actual values
              tasks = []
              for ind_chunk, parameters in enumerate(chunk):
-                 component = parameters[_COMPONENT_KEY_]
+                 # FIXME: _COMPONENT_KEY_ is deprecated
+                 # component = parameters[_COMPONENT_KEY_]
+                 component = "INVALID_FAKE_VALUE_FIXME"
                  tasks.append(
                      SlurmTask(
                          index=(ind_batch * batch_size) + ind_chunk,
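The surviving `index` arithmetic gives each task a global position across batches; a quick check of the formula:

    # With batch_size=3, batch 0 yields indices 0-2 and batch 1 yields 3-5.
    batch_size = 3
    indices = [
        (ind_batch * batch_size) + ind_chunk
        for ind_batch in range(2)
        for ind_chunk in range(batch_size)
    ]
    assert indices == [0, 1, 2, 3, 4, 5]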