fractal-server 2.14.0a32__py3-none-any.whl → 2.14.0a34__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fractal_server/__init__.py +1 -1
- fractal_server/app/runner/executors/local/runner.py +95 -35
- fractal_server/app/runner/executors/slurm_common/base_slurm_runner.py +283 -237
- fractal_server/app/runner/executors/slurm_ssh/runner.py +60 -0
- fractal_server/app/runner/executors/slurm_sudo/runner.py +13 -1
- fractal_server/app/runner/v2/runner_functions.py +12 -1
- fractal_server/ssh/_fabric.py +24 -12
- {fractal_server-2.14.0a32.dist-info → fractal_server-2.14.0a34.dist-info}/METADATA +1 -1
- {fractal_server-2.14.0a32.dist-info → fractal_server-2.14.0a34.dist-info}/RECORD +12 -12
- {fractal_server-2.14.0a32.dist-info → fractal_server-2.14.0a34.dist-info}/LICENSE +0 -0
- {fractal_server-2.14.0a32.dist-info → fractal_server-2.14.0a34.dist-info}/WHEEL +0 -0
- {fractal_server-2.14.0a32.dist-info → fractal_server-2.14.0a34.dist-info}/entry_points.txt +0 -0
fractal_server/__init__.py
CHANGED
@@ -1 +1 @@
-__VERSION__ = "2.14.0a32"
+__VERSION__ = "2.14.0a34"
fractal_server/app/runner/executors/local/runner.py
CHANGED
@@ -9,6 +9,9 @@ from fractal_server.app.db import get_sync_db
 from fractal_server.app.runner.exceptions import TaskExecutionError
 from fractal_server.app.runner.executors.base_runner import BaseRunner
 from fractal_server.app.runner.task_files import TaskFiles
+from fractal_server.app.runner.v2.db_tools import (
+    bulk_update_status_of_history_unit,
+)
 from fractal_server.app.runner.v2.db_tools import update_status_of_history_unit
 from fractal_server.app.schemas.v2 import HistoryUnitStatus
 from fractal_server.logger import set_logger
@@ -58,16 +61,31 @@ class LocalRunner(BaseRunner):
     ) -> tuple[Any, Exception]:
         logger.debug("[submit] START")
 
-        self.validate_submit_parameters(parameters, task_type=task_type)
-        workdir_local = task_files.wftask_subfolder_local
-        workdir_local.mkdir()
+        try:
+            self.validate_submit_parameters(parameters, task_type=task_type)
+            workdir_local = task_files.wftask_subfolder_local
+            workdir_local.mkdir()
 
-        # SUBMISSION PHASE
-        future = self.executor.submit(
-            func,
-            parameters=parameters,
-            remote_files=task_files.remote_files_dict,
-        )
+            # SUBMISSION PHASE
+            future = self.executor.submit(
+                func,
+                parameters=parameters,
+                remote_files=task_files.remote_files_dict,
+            )
+        except Exception as e:
+            logger.error(
+                "[submit] Unexpected exception during submission. "
+                f"Original error {str(e)}"
+            )
+            result = None
+            exception = TaskExecutionError(str(e))
+            with next(get_sync_db()) as db:
+                update_status_of_history_unit(
+                    history_unit_id=history_unit_id,
+                    status=HistoryUnitStatus.FAILED,
+                    db_sync=db,
+                )
+            return None, exception
 
         # RETRIEVAL PHASE
         with next(get_sync_db()) as db:
@@ -105,29 +123,50 @@
         input images, while for compound tasks these can differ.
         """
 
-        self.validate_multisubmit_parameters(
-            list_parameters=list_parameters,
-            task_type=task_type,
-            list_task_files=list_task_files,
-            history_unit_ids=history_unit_ids,
-        )
-
         logger.debug(f"[multisubmit] START, {len(list_parameters)=}")
+        results: dict[int, Any] = {}
+        exceptions: dict[int, BaseException] = {}
 
-        workdir_local = list_task_files[0].wftask_subfolder_local
-        if task_type == "parallel":
-            workdir_local.mkdir()
-
-        # Set `n_elements` and `parallel_tasks_per_job`
-        n_elements = len(list_parameters)
-        parallel_tasks_per_job = config.parallel_tasks_per_job
-        if parallel_tasks_per_job is None:
-            parallel_tasks_per_job = n_elements
+        try:
+
+            self.validate_multisubmit_parameters(
+                list_parameters=list_parameters,
+                task_type=task_type,
+                list_task_files=list_task_files,
+                history_unit_ids=history_unit_ids,
+            )
+
+            workdir_local = list_task_files[0].wftask_subfolder_local
+            if task_type == "parallel":
+                workdir_local.mkdir()
+
+            # Set `n_elements` and `parallel_tasks_per_job`
+            n_elements = len(list_parameters)
+            parallel_tasks_per_job = config.parallel_tasks_per_job
+            if parallel_tasks_per_job is None:
+                parallel_tasks_per_job = n_elements
+
+        except Exception as e:
+            logger.error(
+                "[multisubmit] Unexpected exception during preliminary phase. "
+                f"Original error {str(e)}"
+            )
+            exception = TaskExecutionError(str(e))
+            exceptions = {
+                ind: exception for ind in range(len(list_parameters))
+            }
+            if task_type == "parallel":
+                with next(get_sync_db()) as db:
+                    bulk_update_status_of_history_unit(
+                        history_unit_ids=history_unit_ids,
+                        status=HistoryUnitStatus.FAILED,
+                        db_sync=db,
+                    )
+            return results, exceptions
 
         # Execute tasks, in chunks of size `parallel_tasks_per_job`
-        results: dict[int, Any] = {}
-        exceptions: dict[int, BaseException] = {}
         for ind_chunk in range(0, n_elements, parallel_tasks_per_job):
+
             list_parameters_chunk = list_parameters[
                 ind_chunk : ind_chunk + parallel_tasks_per_job
             ]
@@ -135,15 +174,31 @@
             active_futures: dict[int, Future] = {}
             for ind_within_chunk, kwargs in enumerate(list_parameters_chunk):
                 positional_index = ind_chunk + ind_within_chunk
-                future = self.executor.submit(
-                    func,
-                    parameters=kwargs,
-                    remote_files=list_task_files[
+                try:
+                    future = self.executor.submit(
+                        func,
+                        parameters=kwargs,
+                        remote_files=list_task_files[
+                            positional_index
+                        ].remote_files_dict,
+                    )
+                    active_futures[positional_index] = future
+                except Exception as e:
+                    logger.error(
+                        "[multisubmit] Unexpected exception during submission."
+                        f" Original error {str(e)}"
+                    )
+                    current_history_unit_id = history_unit_ids[
                         positional_index
-                    ].remote_files_dict,
-                )
-                active_futures[positional_index] = future
-
+                    ]
+                    exceptions[positional_index] = TaskExecutionError(str(e))
+                    if task_type == "parallel":
+                        with next(get_sync_db()) as db:
+                            update_status_of_history_unit(
+                                history_unit_id=current_history_unit_id,
+                                status=HistoryUnitStatus.FAILED,
+                                db_sync=db,
+                            )
             while active_futures:
                 finished_futures = [
                     index_and_future
@@ -171,6 +226,11 @@
                         )
 
                     except Exception as e:
+                        logger.debug(
+                            "Multisubmit failed in retrieval "
+                            "phase with the following error "
+                            f"{str(e)}"
+                        )
                         exceptions[positional_index] = TaskExecutionError(
                             str(e)
                         )
fractal_server/app/runner/executors/slurm_common/base_slurm_runner.py
CHANGED
@@ -100,65 +100,51 @@ class BaseSlurmRunner(BaseRunner):
     def __exit__(self, exc_type, exc_val, exc_tb):
         return False
 
-    def _run_local_cmd(self, cmd: str) -> str:
-        raise NotImplementedError("Implement in child class.")
-
     def _run_remote_cmd(self, cmd: str) -> str:
         raise NotImplementedError("Implement in child class.")
 
-    def run_squeue(self, job_ids: list[str]) ->
-
-
-        if len(job_ids) == 0:
-            return (False, "")
-
-        job_id_single_str = ",".join([str(j) for j in job_ids])
-        cmd = (
-            f"squeue --noheader --format='%i %T' --jobs {job_id_single_str}"
-            " --states=all"
-        )
-
-        try:
-            if self.slurm_runner_type == "sudo":
-                stdout = self._run_local_cmd(cmd)
-            else:
-                stdout = self._run_remote_cmd(cmd)
-            return (True, stdout)
-        except Exception as e:
-            logger.info(f"{cmd=} failed with {str(e)}")
-            return (False, "")
+    def run_squeue(self, *, job_ids: list[str], **kwargs) -> str:
+        raise NotImplementedError("Implement in child class.")
 
     def _get_finished_jobs(self, job_ids: list[str]) -> set[str]:
-        # If there is no Slurm job to check, return right away
 
+        # If there is no Slurm job to check, return right away
         if not job_ids:
            return set()
-        id_to_state = dict()
 
-
-
-
+        try:
+            stdout = self.run_squeue(job_ids=job_ids)
+            slurm_statuses = {
                 out.split()[0]: out.split()[1] for out in stdout.splitlines()
             }
-
-
-
-
-
-
-
-
-
-
+        except Exception as e:
+            logger.warning(
+                "[_get_finished_jobs] `squeue` failed, "
+                "retry with individual job IDs. "
+                f"Original error: {str(e)}."
+            )
+            slurm_statuses = dict()
+            for job_id in job_ids:
+                try:
+                    stdout = self.run_squeue(job_ids=[job_id])
+                    slurm_statuses.update(
+                        {stdout.split()[0]: stdout.split()[1]}
                     )
+                except Exception as e:
+                    logger.warning(
+                        "[_get_finished_jobs] `squeue` failed for "
+                        f"{job_id=}, mark job as completed. "
+                        f"Original error: {str(e)}."
+                    )
+                    slurm_statuses.update({str(job_id): "COMPLETED"})
 
-        #
-
-
-
-
-        if id_to_state.get(j, "COMPLETED") in STATES_FINISHED
+        # If a job is not in `squeue` output, mark it as completed.
+        finished_jobs = {
+            job_id
+            for job_id in job_ids
+            if slurm_statuses.get(job_id, "COMPLETED") in STATES_FINISHED
         }
+        return finished_jobs
 
     def _mkdir_local_folder(self, folder: str) -> None:
         raise NotImplementedError("Implement in child class.")
@@ -421,27 +407,34 @@ class BaseSlurmRunner(BaseRunner):
         """
         # Sleep for `self.poll_interval`, but keep checking for shutdowns
         start_time = time.perf_counter()
-
-
+        # Always wait at least 0.2 (note: this is for cases where
+        # `poll_interval=0`).
+        waiting_time = max(self.poll_interval, 0.2)
+        max_time = start_time + waiting_time
         logger.debug(
             "[wait_and_check_shutdown] "
             f"I will wait at most {self.poll_interval} s, "
            f"in blocks of {self.poll_interval_internal} s."
         )
 
-        while
-            # Handle shutdown
+        while time.perf_counter() < max_time:
            if self.is_shutdown():
                 logger.info("[wait_and_check_shutdown] Shutdown file detected")
                 scancelled_job_ids = self.scancel_jobs()
                 logger.info(f"[wait_and_check_shutdown] {scancelled_job_ids=}")
                 return scancelled_job_ids
-            can_return = True
             time.sleep(self.poll_interval_internal)
 
         logger.debug("[wait_and_check_shutdown] No shutdown file detected")
         return []
 
+    def _check_no_active_jobs(self):
+        if self.jobs != {}:
+            raise JobExecutionError(
+                "Unexpected branch: jobs must be empty before new "
+                "submissions."
+            )
+
     def submit(
         self,
         func: callable,
@@ -457,107 +450,125 @@ class BaseSlurmRunner(BaseRunner):
         ],
     ) -> tuple[Any, Exception]:
         logger.info("[submit] START")
+        try:
+            workdir_local = task_files.wftask_subfolder_local
+            workdir_remote = task_files.wftask_subfolder_remote
 
-
-
-
-
-
-
-
-            with next(get_sync_db()) as db:
-                update_status_of_history_unit(
-                    history_unit_id=history_unit_id,
-                    status=HistoryUnitStatus.FAILED,
-                    db_sync=db,
-                )
+            if self.is_shutdown():
+                with next(get_sync_db()) as db:
+                    update_status_of_history_unit(
+                        history_unit_id=history_unit_id,
+                        status=HistoryUnitStatus.FAILED,
+                        db_sync=db,
+                    )
 
-
+                return None, SHUTDOWN_EXCEPTION
 
-
-        self.validate_submit_parameters(
-            parameters=parameters,
-            task_type=task_type,
-        )
+            self._check_no_active_jobs()
 
-
-
-
-
-
-
-        # Submission phase
-        slurm_job = SlurmJob(
-            prefix=task_files.prefix,
-            workdir_local=workdir_local,
-            workdir_remote=workdir_remote,
-            tasks=[
-                SlurmTask(
-                    prefix=task_files.prefix,
-                    index=0,
-                    component=task_files.component,
-                    parameters=parameters,
-                    workdir_remote=workdir_remote,
-                    workdir_local=workdir_local,
-                    task_files=task_files,
-                )
-            ],
-        )
-
-        config.parallel_tasks_per_job = 1
-        self._submit_single_sbatch(
-            func,
-            slurm_job=slurm_job,
-            slurm_config=config,
-        )
-        logger.info(f"[submit] END submission phase, {self.job_ids=}")
-
-        # NOTE: see issue 2444
-        settings = Inject(get_settings)
-        sleep_time = settings.FRACTAL_SLURM_INTERVAL_BEFORE_RETRIEVAL
-        logger.warning(f"[submit] Now sleep {sleep_time} seconds.")
-        time.sleep(sleep_time)
+            # Validation phase
+            self.validate_submit_parameters(
+                parameters=parameters,
+                task_type=task_type,
+            )
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+            # Create task subfolder
+            logger.info("[submit] Create local/remote folders - START")
+            self._mkdir_local_folder(folder=workdir_local.as_posix())
+            self._mkdir_remote_folder(folder=workdir_remote.as_posix())
+            logger.info("[submit] Create local/remote folders - END")
+
+            # Submission phase
+            slurm_job = SlurmJob(
+                prefix=task_files.prefix,
+                workdir_local=workdir_local,
+                workdir_remote=workdir_remote,
+                tasks=[
+                    SlurmTask(
+                        prefix=task_files.prefix,
+                        index=0,
+                        component=task_files.component,
+                        parameters=parameters,
+                        workdir_remote=workdir_remote,
+                        workdir_local=workdir_local,
+                        task_files=task_files,
                    )
+                ],
+            )
 
-
-
-
-
-
+            config.parallel_tasks_per_job = 1
+            self._submit_single_sbatch(
+                func,
+                slurm_job=slurm_job,
+                slurm_config=config,
+            )
+            logger.info(f"[submit] END submission phase, {self.job_ids=}")
+
+            # NOTE: see issue 2444
+            settings = Inject(get_settings)
+            sleep_time = settings.FRACTAL_SLURM_INTERVAL_BEFORE_RETRIEVAL
+            logger.warning(f"[submit] Now sleep {sleep_time} seconds.")
+            time.sleep(sleep_time)
+
+            # Retrieval phase
+            logger.info("[submit] START retrieval phase")
+            scancelled_job_ids = []
+            while len(self.jobs) > 0:
+                # Look for finished jobs
+                finished_job_ids = self._get_finished_jobs(
+                    job_ids=self.job_ids
+                )
+                logger.debug(f"[submit] {finished_job_ids=}")
+                finished_jobs = [
+                    self.jobs[_slurm_job_id]
+                    for _slurm_job_id in finished_job_ids
+                ]
+                self._fetch_artifacts(finished_jobs)
+                with next(get_sync_db()) as db:
+                    for slurm_job_id in finished_job_ids:
+                        logger.debug(f"[submit] Now process {slurm_job_id=}")
+                        slurm_job = self.jobs.pop(slurm_job_id)
+                        was_job_scancelled = slurm_job_id in scancelled_job_ids
+                        result, exception = self._postprocess_single_task(
+                            task=slurm_job.tasks[0],
+                            was_job_scancelled=was_job_scancelled,
                        )
-
-                        if
+
+                        if exception is not None:
                             update_status_of_history_unit(
                                 history_unit_id=history_unit_id,
-                status=HistoryUnitStatus.
+                                status=HistoryUnitStatus.FAILED,
                                 db_sync=db,
                             )
+                        else:
+                            if task_type not in [
+                                "compound",
+                                "converter_compound",
+                            ]:
+                                update_status_of_history_unit(
+                                    history_unit_id=history_unit_id,
+                                    status=HistoryUnitStatus.DONE,
+                                    db_sync=db,
+                                )
 
-
-
+                if len(self.jobs) > 0:
+                    scancelled_job_ids = self.wait_and_check_shutdown()
+
+            logger.info("[submit] END")
+            return result, exception
 
-
-
+        except Exception as e:
+            logger.error(
+                f"[submit] Unexpected exception. Original error: {str(e)}"
+            )
+            with next(get_sync_db()) as db:
+                update_status_of_history_unit(
+                    history_unit_id=history_unit_id,
+                    status=HistoryUnitStatus.FAILED,
+                    db_sync=db,
+                )
+            self.scancel_jobs()
+            return None, e
 
     def multisubmit(
         self,
@@ -574,108 +585,120 @@ class BaseSlurmRunner(BaseRunner):
         input images, while for compound tasks these can differ.
         """
 
-        if len(self.jobs) > 0:
-            raise RuntimeError(
-                f"Cannot run `multisubmit` when {len(self.jobs)=}"
-            )
-
-        if self.is_shutdown():
-            if task_type == "parallel":
-                with next(get_sync_db()) as db:
-                    bulk_update_status_of_history_unit(
-                        history_unit_ids=history_unit_ids,
-                        status=HistoryUnitStatus.FAILED,
-                        db_sync=db,
-                    )
-            results = {}
-            exceptions = {
-                ind: SHUTDOWN_EXCEPTION for ind in range(len(list_parameters))
-            }
-            return results, exceptions
-
-        self.validate_multisubmit_parameters(
-            list_parameters=list_parameters,
-            task_type=task_type,
-            list_task_files=list_task_files,
-            history_unit_ids=history_unit_ids,
-        )
-
         logger.info(f"[multisubmit] START, {len(list_parameters)=}")
+        try:
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+            if self.is_shutdown():
+                if task_type == "parallel":
+                    with next(get_sync_db()) as db:
+                        bulk_update_status_of_history_unit(
+                            history_unit_ids=history_unit_ids,
+                            status=HistoryUnitStatus.FAILED,
+                            db_sync=db,
+                        )
+                results = {}
+                exceptions = {
+                    ind: SHUTDOWN_EXCEPTION
+                    for ind in range(len(list_parameters))
+                }
+                return results, exceptions
+
+            self._check_no_active_jobs()
+            self.validate_multisubmit_parameters(
+                list_parameters=list_parameters,
+                task_type=task_type,
+                list_task_files=list_task_files,
+                history_unit_ids=history_unit_ids,
+            )
 
-
-
+            workdir_local = list_task_files[0].wftask_subfolder_local
+            workdir_remote = list_task_files[0].wftask_subfolder_remote
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+            # Create local&remote task subfolders
+            if task_type == "parallel":
+                self._mkdir_local_folder(workdir_local.as_posix())
+                self._mkdir_remote_folder(folder=workdir_remote.as_posix())
+
+            results: dict[int, Any] = {}
+            exceptions: dict[int, BaseException] = {}
+
+            # NOTE: chunking has already taken place in `get_slurm_config`,
+            # so that `config.tasks_per_job` is now set.
+
+            # Divide arguments in batches of `tasks_per_job` tasks each
+            tot_tasks = len(list_parameters)
+            args_batches = []
+            batch_size = config.tasks_per_job
+            for ind_chunk in range(0, tot_tasks, batch_size):
+                args_batches.append(
+                    list_parameters[ind_chunk : ind_chunk + batch_size]  # noqa
+                )
+            if len(args_batches) != math.ceil(
+                tot_tasks / config.tasks_per_job
+            ):
+                raise RuntimeError("Something wrong here while batching tasks")
+
+            # Part 1/3: Iterate over chunks, prepare SlurmJob objects
+            logger.info("[multisubmit] Prepare `SlurmJob`s.")
+            jobs_to_submit = []
+            for ind_batch, chunk in enumerate(args_batches):
+                # Read prefix based on the first task of this batch
+                prefix = list_task_files[ind_batch * batch_size].prefix
+                tasks = []
+                for ind_chunk, parameters in enumerate(chunk):
+                    index = (ind_batch * batch_size) + ind_chunk
+                    tasks.append(
+                        SlurmTask(
+                            prefix=prefix,
+                            index=index,
+                            component=list_task_files[index].component,
+                            workdir_local=workdir_local,
+                            workdir_remote=workdir_remote,
+                            parameters=parameters,
+                            zarr_url=parameters["zarr_url"],
+                            task_files=list_task_files[index],
+                        ),
+                    )
+                jobs_to_submit.append(
+                    SlurmJob(
                        prefix=prefix,
-                        index=index,
-                        component=list_task_files[index].component,
                        workdir_local=workdir_local,
                        workdir_remote=workdir_remote,
-
-
-                        task_files=list_task_files[index],
-                    ),
-                )
-            jobs_to_submit.append(
-                SlurmJob(
-                    prefix=prefix,
-                    workdir_local=workdir_local,
-                    workdir_remote=workdir_remote,
-                    tasks=tasks,
+                        tasks=tasks,
+                    )
                )
-            )
 
-
-
-
-
-
-
-
-
+            # NOTE: see issue 2431
+            logger.info("[multisubmit] Transfer files and submit jobs.")
+            for slurm_job in jobs_to_submit:
+                self._submit_single_sbatch(
+                    func,
+                    slurm_job=slurm_job,
+                    slurm_config=config,
+                )
 
-
+            logger.info(f"END submission phase, {self.job_ids=}")
 
-
-
-
-
+            settings = Inject(get_settings)
+            sleep_time = settings.FRACTAL_SLURM_INTERVAL_BEFORE_RETRIEVAL
+            logger.warning(f"[submit] Now sleep {sleep_time} seconds.")
+            time.sleep(sleep_time)
+        except Exception as e:
+            logger.error(
+                "[multisubmit] Unexpected exception during submission."
+                f" Original error {str(e)}"
+            )
+            self.scancel_jobs()
+            if task_type == "parallel":
+                with next(get_sync_db()) as db:
+                    bulk_update_status_of_history_unit(
+                        history_unit_ids=history_unit_ids,
+                        status=HistoryUnitStatus.FAILED,
+                        db_sync=db,
+                    )
+            results = {}
+            exceptions = {ind: e for ind in range(len(list_parameters))}
+            return results, exceptions
 
         # Retrieval phase
         logger.info("[multisubmit] START retrieval phase")
@@ -687,7 +710,16 @@ class BaseSlurmRunner(BaseRunner):
             finished_jobs = [
                 self.jobs[_slurm_job_id] for _slurm_job_id in finished_job_ids
             ]
-            self._fetch_artifacts(finished_jobs)
+            fetch_artifacts_exception = None
+            try:
+                self._fetch_artifacts(finished_jobs)
+            except Exception as e:
+                logger.error(
+                    "[multisubmit] Unexpected exception in "
+                    "`_fetch_artifacts`. "
+                    f"Original error: {str(e)}"
+                )
+                fetch_artifacts_exception = e
 
             with next(get_sync_db()) as db:
                 for slurm_job_id in finished_job_ids:
@@ -696,11 +728,26 @@ class BaseSlurmRunner(BaseRunner):
                     for task in slurm_job.tasks:
                         logger.info(f"[multisubmit] Now process {task.index=}")
                         was_job_scancelled = slurm_job_id in scancelled_job_ids
-
-
-
-
-
+                        if fetch_artifacts_exception is not None:
+                            result = None
+                            exception = fetch_artifacts_exception
+                        else:
+                            try:
+                                (
+                                    result,
+                                    exception,
+                                ) = self._postprocess_single_task(
+                                    task=task,
+                                    was_job_scancelled=was_job_scancelled,
+                                )
+                            except Exception as e:
+                                logger.error(
+                                    "[multisubmit] Unexpected exception in "
+                                    "`_postprocess_single_task`. "
+                                    f"Original error: {str(e)}"
+                                )
+                                result = None
+                                exception = e
                         # Note: the relevant done/failed check is based on
                         # whether `exception is None`. The fact that
                         # `result is None` is not relevant for this purpose.
@@ -763,16 +810,15 @@ class BaseSlurmRunner(BaseRunner):
 
     def scancel_jobs(self) -> list[str]:
         logger.info("[scancel_jobs] START")
-
+        scancelled_job_ids = self.job_ids
         if self.jobs:
-            scancelled_job_ids = self.job_ids
             scancel_string = " ".join(scancelled_job_ids)
             scancel_cmd = f"scancel {scancel_string}"
             logger.warning(f"Now scancel-ing SLURM jobs {scancel_string}")
             try:
                 self._run_remote_cmd(scancel_cmd)
             except Exception as e:
-                logger.
+                logger.error(
                     "[scancel_jobs] `scancel` command failed. "
                     f"Original error:\n{str(e)}"
                 )
fractal_server/app/runner/executors/slurm_ssh/runner.py
CHANGED
@@ -9,6 +9,8 @@ from fractal_server.app.runner.extract_archive import extract_archive
 from fractal_server.config import get_settings
 from fractal_server.logger import set_logger
 from fractal_server.ssh._fabric import FractalSSH
+from fractal_server.ssh._fabric import FractalSSHCommandError
+from fractal_server.ssh._fabric import FractalSSHTimeoutError
 from fractal_server.syringe import Inject
 
 
@@ -206,3 +208,61 @@ class SlurmSSHRunner(BaseSlurmRunner):
     def _run_remote_cmd(self, cmd: str) -> str:
         stdout = self.fractal_ssh.run_command(cmd=cmd)
         return stdout
+
+    def run_squeue(
+        self,
+        *,
+        job_ids: list[str],
+        base_interval: float = 2.0,
+        max_attempts: int = 7,
+    ) -> str:
+        """
+        Run `squeue` for a set of SLURM job IDs.
+
+        Different scenarios:
+
+        1. When `squeue -j` succeeds (with exit code 0), return its stdout.
+        2. When `squeue -j` fails (typical example:
+           `squeue -j {invalid_job_id}` fails with exit code 1), re-raise.
+           The error will be handled upstream.
+        3. When the SSH command fails because another thread is keeping the
+           lock of the `FractalSSH` object for a long time, mock the standard
+           output of the `squeue` command so that it looks like jobs are not
+           completed yet.
+        4. When the SSH command fails for other reasons, despite a forgiving
+           setup (7 connection attempts with base waiting interval of 2
+           seconds, with a cumulative timeout of 126 seconds), return an empty
+           string. This will be treated upstream as an empty `squeu` output,
+           indirectly resulting in marking the job as completed.
+        """
+
+        if len(job_ids) == 0:
+            return ""
+
+        job_id_single_str = ",".join([str(j) for j in job_ids])
+        cmd = (
+            "squeue --noheader --format='%i %T' --states=all "
+            f"--jobs={job_id_single_str}"
+        )
+
+        try:
+            stdout = self.fractal_ssh.run_command(
+                cmd=cmd,
+                base_interval=base_interval,
+                max_attempts=max_attempts,
+            )
+            return stdout
+        except FractalSSHCommandError as e:
+            raise e
+        except FractalSSHTimeoutError:
+            logger.warning(
+                "[run_squeue] Could not acquire lock, use stdout placeholder."
+            )
+            FAKE_STATUS = "FRACTAL_STATUS_PLACEHOLDER"
+            placeholder_stdout = "\n".join(
+                [f"{job_id} {FAKE_STATUS}" for job_id in job_ids]
+            )
+            return placeholder_stdout
+        except Exception as e:
+            logger.error(f"Ignoring `squeue` command failure {e}")
+            return ""
fractal_server/app/runner/executors/slurm_sudo/runner.py
CHANGED
@@ -176,6 +176,18 @@ class SudoSlurmRunner(BaseSlurmRunner):
         )
         return res.stdout
 
-    def
+    def run_squeue(self, job_ids: list[str]) -> str:
+        """
+        Run `squeue` for a set of SLURM job IDs.
+        """
+
+        if len(job_ids) == 0:
+            return ""
+
+        job_id_single_str = ",".join([str(j) for j in job_ids])
+        cmd = (
+            "squeue --noheader --format='%i %T' --states=all "
+            f"--jobs {job_id_single_str}"
+        )
         res = _subprocess_run_or_raise(cmd)
         return res.stdout
fractal_server/app/runner/v2/runner_functions.py
CHANGED
@@ -229,6 +229,9 @@ def run_v2_task_non_parallel(
             exception=exception,
         )
     }
+    # NOTE: Here we don't have to handle the
+    # `outcome[0].exception is not None` branch, since for non_parallel
+    # tasks it was already handled within submit
    if outcome[0].invalid_output:
        with next(get_sync_db()) as db:
            update_status_of_history_unit(
@@ -356,6 +359,9 @@ def run_v2_task_parallel(
             result=results.get(ind, None),
             exception=exceptions.get(ind, None),
         )
+        # NOTE: Here we don't have to handle the
+        # `outcome[ind].exception is not None` branch, since for parallel
+        # tasks it was already handled within multisubmit
        if outcome[ind].invalid_output:
            with next(get_sync_db()) as db:
                update_status_of_history_unit(
@@ -576,7 +582,12 @@ def run_v2_task_compound(
             result=results.get(ind, None),
             exception=exceptions.get(ind, None),
         )
-
+        # NOTE: For compound task, `multisubmit` did not handle the
+        # `exception is not None` branch, therefore we have to include it here.
+        if (
+            compute_outcomes[ind].exception is not None
+            or compute_outcomes[ind].invalid_output
+        ):
             failure = True
 
     # NOTE: For compound tasks, we update `HistoryUnit.status` from here,
fractal_server/ssh/_fabric.py
CHANGED
@@ -23,6 +23,18 @@ class FractalSSHTimeoutError(RuntimeError):
     pass
 
 
+class FractalSSHConnectionError(RuntimeError):
+    pass
+
+
+class FractalSSHCommandError(RuntimeError):
+    pass
+
+
+class FractalSSHUnknownError(RuntimeError):
+    pass
+
+
 logger = set_logger(__name__)
 
 
@@ -170,7 +182,6 @@ class FractalSSH(object):
             label="read_remote_json_file",
             timeout=self.default_lock_timeout,
         ):
-
             try:
                 with self._sftp_unsafe().open(filepath, "r") as f:
                     data = json.load(f)
@@ -263,7 +274,7 @@
         cmd: str,
         allow_char: Optional[str] = None,
         max_attempts: Optional[int] = None,
-        base_interval: Optional[
+        base_interval: Optional[float] = None,
         lock_timeout: Optional[int] = None,
     ) -> str:
         """
@@ -311,7 +322,7 @@
                 t_1 = time.perf_counter()
                 self.logger.info(
                     f"{prefix} END running '{cmd}' over SSH, "
-                    f"elapsed {t_1-t_0:.3f}"
+                    f"elapsed {t_1 - t_0:.3f}"
                 )
                 self.logger.debug("STDOUT:")
                 self.logger.debug(res.stdout)
@@ -329,12 +340,16 @@
                     sleeptime = actual_base_interval**ind_attempt
                     self.logger.warning(
                         f"{prefix} Now sleep {sleeptime:.3f} "
-                        "seconds and
+                        "seconds and retry."
                     )
                     time.sleep(sleeptime)
                 else:
                     self.logger.error(f"{prefix} Reached last attempt")
-
+                    raise FractalSSHConnectionError(
+                        f"Reached last attempt "
+                        f"({max_attempts=}) for running "
+                        f"'{cmd}' over SSH"
+                    )
             except UnexpectedExit as e:
                 # Case 3: Command fails with an actual error
                 error_msg = (
@@ -342,18 +357,15 @@
                     f"Original error:\n{str(e)}."
                 )
                 self.logger.error(error_msg)
-                raise
+                raise FractalSSHCommandError(error_msg)
+            except FractalSSHTimeoutError as e:
+                raise e
             except Exception as e:
                 self.logger.error(
                     f"Running command `{cmd}` over SSH failed.\n"
                     f"Original Error:\n{str(e)}."
                 )
-                raise e
-
-        raise RuntimeError(
-            f"Reached last attempt ({max_attempts=}) for running "
-            f"'{cmd}' over SSH"
-        )
+                raise FractalSSHUnknownError(f"{type(e)}: {str(e)}")
 
     def send_file(
         self,
{fractal_server-2.14.0a32.dist-info → fractal_server-2.14.0a34.dist-info}/RECORD
CHANGED
@@ -1,4 +1,4 @@
-fractal_server/__init__.py,sha256=
+fractal_server/__init__.py,sha256=B5mHrNKBuCS1_dfqSKK7a3mM57rWv7Sf9ODhxz6f23g,26
 fractal_server/__main__.py,sha256=rkM8xjY1KeS3l63irB8yCrlVobR-73uDapC4wvrIlxI,6957
 fractal_server/alembic.ini,sha256=MWwi7GzjzawI9cCAK1LW7NxIBQDUqD12-ptJoq5JpP0,3153
 fractal_server/app/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -73,21 +73,21 @@ fractal_server/app/runner/executors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQ
 fractal_server/app/runner/executors/base_runner.py,sha256=knWOERUwRLhsd9eq5GwGxH2ZVsvPOZRRjQPGbiExqcU,5052
 fractal_server/app/runner/executors/local/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 fractal_server/app/runner/executors/local/get_local_config.py,sha256=KiakXxOahaLgWvQJ1LVGYGXht6DMGR9x8Xu-TuT9aY4,3628
-fractal_server/app/runner/executors/local/runner.py,sha256=
+fractal_server/app/runner/executors/local/runner.py,sha256=dPEpjIfJQu-st_tYiaI8VhH3y1uvK6DgfQ2cXU0vhOU,9543
 fractal_server/app/runner/executors/slurm_common/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 fractal_server/app/runner/executors/slurm_common/_batching.py,sha256=ZY020JZlDS5mfpgpWTChQkyHU7iLE5kx2HVd57_C6XA,8850
 fractal_server/app/runner/executors/slurm_common/_job_states.py,sha256=nuV-Zba38kDrRESOVB3gaGbrSPZc4q7YGichQaeqTW0,238
 fractal_server/app/runner/executors/slurm_common/_slurm_config.py,sha256=_feRRnVVnvQa3AsOQqfULfOgaoj2o6Ze0-fwXwic8p4,15795
-fractal_server/app/runner/executors/slurm_common/base_slurm_runner.py,sha256=
+fractal_server/app/runner/executors/slurm_common/base_slurm_runner.py,sha256=S9BdLz7Enqx6hjH154LYas38b-t52mved0TUWCbMTyo,33118
 fractal_server/app/runner/executors/slurm_common/get_slurm_config.py,sha256=BW6fDpPyB0VH5leVxvwzkVH3r3hC7DuSyoWmRzHITWg,7305
 fractal_server/app/runner/executors/slurm_common/remote.py,sha256=EB2uASKjrBIr25oc13XvSwf8x-TpTBr9WuaLMwNr2y4,5850
 fractal_server/app/runner/executors/slurm_common/slurm_job_task_models.py,sha256=RoxHLKOn0_wGjnY0Sv0a9nDSiqxYZHKRoMkT3p9_G1E,3607
 fractal_server/app/runner/executors/slurm_common/utils_executors.py,sha256=naPyJI0I3lD-sYHbSXbMFGUBK4h_SggA5V91Z1Ch1Xg,1416
 fractal_server/app/runner/executors/slurm_ssh/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-fractal_server/app/runner/executors/slurm_ssh/runner.py,sha256=
+fractal_server/app/runner/executors/slurm_ssh/runner.py,sha256=5ppdV5D1N6v3T2QUGBn1Q7dswcUKIpI6ZjX_yIO_Z9A,9439
 fractal_server/app/runner/executors/slurm_sudo/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 fractal_server/app/runner/executors/slurm_sudo/_subprocess_run_as_user.py,sha256=O1bNg1DiSDJmQE0RmOk2Ii47DagiXp5ryd0R6KxO2OM,3177
-fractal_server/app/runner/executors/slurm_sudo/runner.py,sha256=
+fractal_server/app/runner/executors/slurm_sudo/runner.py,sha256=lPWkRT499mChP3dNLrdDjMT-nw7-LWv6g58kdF_sMRw,6290
 fractal_server/app/runner/extract_archive.py,sha256=I7UGIHXXuFvlgVPsP7GMWPu2-DiS1EiyBs7a1bvgkxI,2458
 fractal_server/app/runner/filenames.py,sha256=lPnxKHtdRizr6FqG3zOdjDPyWA7GoaJGTtiuJV0gA8E,70
 fractal_server/app/runner/run_subprocess.py,sha256=c3JbYXq3hX2aaflQU19qJ5Xs6J6oXGNvnTEoAfv2bxc,959
@@ -102,7 +102,7 @@ fractal_server/app/runner/v2/db_tools.py,sha256=du5dKhMMFMErQXbGIgu9JvO_vtMensod
 fractal_server/app/runner/v2/deduplicate_list.py,sha256=IVTE4abBU1bUprFTkxrTfYKnvkNTanWQ-KWh_etiT08,645
 fractal_server/app/runner/v2/merge_outputs.py,sha256=D1L4Taieq9i71SPQyNc1kMokgHh-sV_MqF3bv7QMDBc,907
 fractal_server/app/runner/v2/runner.py,sha256=B4kAF1S-zHf2PbyHedfuiaNpu4oslVDp33KgXYcoXIk,15706
-fractal_server/app/runner/v2/runner_functions.py,sha256=
+fractal_server/app/runner/v2/runner_functions.py,sha256=2W6CFkezUsQ_k8YuC2oOEMtB_-7M9ensyhwCFvlS2No,19096
 fractal_server/app/runner/v2/runner_functions_low_level.py,sha256=_h_OOffq3d7V0uHa8Uvs0mj31y1GSZBUXjDDF3WjVjY,3620
 fractal_server/app/runner/v2/submit_workflow.py,sha256=QywUGIoHAHnrWgfnyX8W9kVqKY-RvVyNLpzrbsXZOZ4,13075
 fractal_server/app/runner/v2/task_interface.py,sha256=IXdQTI8rXFgXv1Ez0js4CjKFf3QwO2GCHRTuwiFtiTQ,2891
@@ -179,7 +179,7 @@ fractal_server/migrations/versions/f384e1c0cf5d_drop_task_default_args_columns.p
 fractal_server/migrations/versions/fbce16ff4e47_new_history_items.py,sha256=TDWCaIoM0Q4SpRWmR9zr_rdp3lJXhCfBPTMhtrP5xYE,3950
 fractal_server/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 fractal_server/ssh/__init__.py,sha256=sVUmzxf7_DuXG1xoLQ1_00fo5NPhi2LJipSmU5EAkPs,124
-fractal_server/ssh/_fabric.py,sha256=
+fractal_server/ssh/_fabric.py,sha256=Do7wX1xsV3Pjmwqg-Z_X1_QM05RN5-sAowO_Hh7-9bk,23324
 fractal_server/string_tools.py,sha256=niViRrrZAOo0y6pEFI9L_eUYS1PoOiQZUBtngiLc2_k,1877
 fractal_server/syringe.py,sha256=3qSMW3YaMKKnLdgnooAINOPxnCOxP7y2jeAQYB21Gdo,2786
 fractal_server/tasks/__init__.py,sha256=kadmVUoIghl8s190_Tt-8f-WBqMi8u8oU4Pvw39NHE8,23
@@ -209,8 +209,8 @@ fractal_server/tasks/v2/utils_templates.py,sha256=Kc_nSzdlV6KIsO0CQSPs1w70zLyENP
 fractal_server/urls.py,sha256=QjIKAC1a46bCdiPMu3AlpgFbcv6a4l3ABcd5xz190Og,471
 fractal_server/utils.py,sha256=PMwrxWFxRTQRl1b9h-NRIbFGPKqpH_hXnkAT3NfZdpY,3571
 fractal_server/zip_tools.py,sha256=GjDgo_sf6V_DDg6wWeBlZu5zypIxycn_l257p_YVKGc,4876
-fractal_server-2.14.
-fractal_server-2.14.
-fractal_server-2.14.
-fractal_server-2.14.
-fractal_server-2.14.
+fractal_server-2.14.0a34.dist-info/LICENSE,sha256=QKAharUuhxL58kSoLizKJeZE3mTCBnX6ucmz8W0lxlk,1576
+fractal_server-2.14.0a34.dist-info/METADATA,sha256=YljYi9W71066fSXY2MIAuZQ_P1AqIhfyTECxk78i4og,4563
+fractal_server-2.14.0a34.dist-info/WHEEL,sha256=7dDg4QLnNKTvwIDR9Ac8jJaAmBC_owJrckbC0jjThyA,88
+fractal_server-2.14.0a34.dist-info/entry_points.txt,sha256=8tV2kynvFkjnhbtDnxAqImL6HMVKsopgGfew0DOp5UY,58
+fractal_server-2.14.0a34.dist-info/RECORD,,
{fractal_server-2.14.0a32.dist-info → fractal_server-2.14.0a34.dist-info}/LICENSE
File without changes
{fractal_server-2.14.0a32.dist-info → fractal_server-2.14.0a34.dist-info}/WHEEL
File without changes
{fractal_server-2.14.0a32.dist-info → fractal_server-2.14.0a34.dist-info}/entry_points.txt
File without changes