PyPI - executorlib - Versions diffs - 1.6.2__tar.gz → 1.7.0__tar.gz - Mend

executorlib 1.6.2__tar.gz → 1.7.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (52) hide show

{executorlib-1.6.2 → executorlib-1.7.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: executorlib
-Version: 1.6.2
+Version: 1.7.0
 Summary: Up-scale python functions for high performance computing (HPC) with executorlib.
 Project-URL: Homepage, https://github.com/pyiron/executorlib
 Project-URL: Documentation, https://executorlib.readthedocs.io

{executorlib-1.6.2 → executorlib-1.7.0}/executorlib/__init__.py RENAMED Viewed

@@ -55,7 +55,7 @@ def terminate_tasks_in_cache(
         config_directory (str, optional): path to the config directory.
         backend (str, optional): name of the backend used to spawn tasks ["slurm", "flux"].
     """
-    from executorlib.task_scheduler.file.queue_spawner import terminate_tasks_in_cache
+    from executorlib.task_scheduler.file.spawner_pysqa import terminate_tasks_in_cache
     return terminate_tasks_in_cache(
         cache_directory=cache_directory,

{executorlib-1.6.2 → executorlib-1.7.0}/executorlib/_version.py RENAMED Viewed

@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
 commit_id: COMMIT_ID
 __commit_id__: COMMIT_ID
-__version__ = version = '1.6.2'
-__version_tuple__ = version_tuple = (1, 6, 2)
+__version__ = version = '1.7.0'
+__version_tuple__ = version_tuple = (1, 7, 0)
 __commit_id__ = commit_id = None

{executorlib-1.6.2 → executorlib-1.7.0}/executorlib/backend/interactive_parallel.py RENAMED Viewed

@@ -97,7 +97,23 @@ def main() -> None:
             and "args" in input_dict
             and "kwargs" in input_dict
         ):
-            memory.update(call_funct(input_dict=input_dict, funct=None, memory=memory))
+            try:
+                memory.update(
+                    call_funct(input_dict=input_dict, funct=None, memory=memory)
+                )
+            except Exception as error:
+                if mpi_rank_zero:
+                    interface_send(
+                        socket=socket,
+                        result_dict={"error": error},
+                    )
+                    backend_write_error_file(
+                        error=error,
+                        apply_dict=input_dict,
+                    )
+            else:
+                if mpi_rank_zero:
+                    interface_send(socket=socket, result_dict={"result": True})
 if __name__ == "__main__":

{executorlib-1.6.2 → executorlib-1.7.0}/executorlib/backend/interactive_serial.py RENAMED Viewed

@@ -72,7 +72,21 @@ def main(argument_lst: Optional[list[str]] = None):
             and "args" in input_dict
             and "kwargs" in input_dict
         ):
-            memory.update(call_funct(input_dict=input_dict, funct=None, memory=memory))
+            try:
+                memory.update(
+                    call_funct(input_dict=input_dict, funct=None, memory=memory)
+                )
+            except Exception as error:
+                interface_send(
+                    socket=socket,
+                    result_dict={"error": error},
+                )
+                backend_write_error_file(
+                    error=error,
+                    apply_dict=input_dict,
+                )
+            else:
+                interface_send(socket=socket, result_dict={"result": True})
 if __name__ == "__main__":

{executorlib-1.6.2 → executorlib-1.7.0}/executorlib/executor/flux.py RENAMED Viewed

@@ -43,6 +43,7 @@ class FluxJobExecutor(BaseExecutor):
                                                   compute notes. Defaults to False.
                               - error_log_file (str): Name of the error log file to use for storing exceptions raised
                                                       by the Python functions submitted to the Executor.
+                              - restart_limit (int): The maximum number of restarting worker processes. Default: 0
         pmi_mode (str): PMI interface to use (OpenMPI v5 requires pmix) default is None
         flux_executor (flux.job.FluxExecutor): Flux Python interface to submit the workers to flux
         flux_executor_nesting (bool): Provide hierarchically nested Flux job scheduler inside the submitted function.
@@ -357,28 +358,48 @@ class FluxClusterExecutor(BaseExecutor):
         if not plot_dependency_graph:
             import pysqa  # noqa
-            from executorlib.task_scheduler.file.task_scheduler import (
-                create_file_executor,
-            )
+            if block_allocation:
+                from executorlib.task_scheduler.interactive.spawner_pysqa import (
+                    create_pysqa_block_allocation_scheduler,
+                )
-            super().__init__(
-                executor=create_file_executor(
-                    max_workers=max_workers,
-                    backend="flux",
-                    max_cores=max_cores,
-                    cache_directory=cache_directory,
-                    resource_dict=resource_dict,
-                    flux_executor=None,
-                    pmi_mode=pmi_mode,
-                    flux_executor_nesting=False,
-                    flux_log_files=False,
-                    pysqa_config_directory=pysqa_config_directory,
-                    hostname_localhost=hostname_localhost,
-                    block_allocation=block_allocation,
-                    init_function=init_function,
-                    disable_dependencies=disable_dependencies,
+                super().__init__(
+                    executor=create_pysqa_block_allocation_scheduler(
+                        max_cores=max_cores,
+                        cache_directory=cache_directory,
+                        hostname_localhost=hostname_localhost,
+                        log_obj_size=log_obj_size,
+                        pmi_mode=pmi_mode,
+                        init_function=init_function,
+                        max_workers=max_workers,
+                        resource_dict=resource_dict,
+                        pysqa_config_directory=pysqa_config_directory,
+                        backend="flux",
+                    )
+                )
+            else:
+                from executorlib.task_scheduler.file.task_scheduler import (
+                    create_file_executor,
+                )
+                super().__init__(
+                    executor=create_file_executor(
+                        max_workers=max_workers,
+                        backend="flux",
+                        max_cores=max_cores,
+                        cache_directory=cache_directory,
+                        resource_dict=resource_dict,
+                        flux_executor=None,
+                        pmi_mode=pmi_mode,
+                        flux_executor_nesting=False,
+                        flux_log_files=False,
+                        pysqa_config_directory=pysqa_config_directory,
+                        hostname_localhost=hostname_localhost,
+                        block_allocation=block_allocation,
+                        init_function=init_function,
+                        disable_dependencies=disable_dependencies,
+                    )
                 )
-            )
         else:
             super().__init__(
                 executor=DependencyTaskScheduler(
@@ -458,7 +479,7 @@ def create_flux_executor(
     Returns:
         InteractiveStepExecutor/ InteractiveExecutor
     """
-    from executorlib.task_scheduler.interactive.fluxspawner import (
+    from executorlib.task_scheduler.interactive.spawner_flux import (
         FluxPythonSpawner,
         validate_max_workers,
     )

{executorlib-1.6.2 → executorlib-1.7.0}/executorlib/executor/single.py RENAMED Viewed

@@ -120,6 +120,7 @@ class SingleNodeExecutor(BaseExecutor):
                                                            only)
                                   - error_log_file (str): Name of the error log file to use for storing exceptions
                                                           raised by the Python functions submitted to the Executor.
+                                  - restart_limit (int): The maximum number of restarting worker processes. Default: 0
             hostname_localhost (boolean): use localhost instead of the hostname to establish the zmq connection. In the
                                       context of an HPC cluster this essential to be able to communicate to an
                                       Executor running on a different compute node within the same allocation. And
@@ -314,7 +315,7 @@ class TestClusterExecutor(BaseExecutor):
             {k: v for k, v in default_resource_dict.items() if k not in resource_dict}
         )
         if not plot_dependency_graph:
-            from executorlib.task_scheduler.file.subprocess_spawner import (
+            from executorlib.task_scheduler.file.spawner_subprocess import (
                 execute_in_subprocess,
             )
             from executorlib.task_scheduler.file.task_scheduler import (

{executorlib-1.6.2 → executorlib-1.7.0}/executorlib/executor/slurm.py RENAMED Viewed

@@ -13,7 +13,7 @@ from executorlib.task_scheduler.interactive.blockallocation import (
 )
 from executorlib.task_scheduler.interactive.dependency import DependencyTaskScheduler
 from executorlib.task_scheduler.interactive.onetoone import OneProcessTaskScheduler
-from executorlib.task_scheduler.interactive.slurmspawner import (
+from executorlib.task_scheduler.interactive.spawner_slurm import (
     SrunSpawner,
     validate_max_workers,
 )
@@ -43,6 +43,7 @@ class SlurmClusterExecutor(BaseExecutor):
                               - slurm_cmd_args (list): Additional command line arguments for the srun call (SLURM only)
                               - error_log_file (str): Name of the error log file to use for storing exceptions raised
                                                       by the Python functions submitted to the Executor.
+                              - restart_limit (int): The maximum number of restarting worker processes. Default: 0
         pysqa_config_directory (str, optional): path to the pysqa config directory (only for pysqa based backend).
         pmi_mode (str): PMI interface to use (OpenMPI v5 requires pmix) default is None
         hostname_localhost (boolean): use localhost instead of the hostname to establish the zmq connection. In the
@@ -165,28 +166,49 @@ class SlurmClusterExecutor(BaseExecutor):
         if not plot_dependency_graph:
             import pysqa  # noqa
-            from executorlib.task_scheduler.file.task_scheduler import (
-                create_file_executor,
-            )
+            if block_allocation:
+                from executorlib.task_scheduler.interactive.spawner_pysqa import (
+                    create_pysqa_block_allocation_scheduler,
+                )
-            super().__init__(
-                executor=create_file_executor(
-                    max_workers=max_workers,
-                    backend="slurm",
-                    max_cores=max_cores,
-                    cache_directory=cache_directory,
-                    resource_dict=resource_dict,
-                    pmi_mode=pmi_mode,
-                    flux_executor=None,
-                    flux_executor_nesting=False,
-                    flux_log_files=False,
-                    pysqa_config_directory=pysqa_config_directory,
-                    hostname_localhost=hostname_localhost,
-                    block_allocation=block_allocation,
-                    init_function=init_function,
-                    disable_dependencies=disable_dependencies,
+                super().__init__(
+                    executor=create_pysqa_block_allocation_scheduler(
+                        max_cores=max_cores,
+                        cache_directory=cache_directory,
+                        hostname_localhost=hostname_localhost,
+                        log_obj_size=log_obj_size,
+                        pmi_mode=pmi_mode,
+                        init_function=init_function,
+                        max_workers=max_workers,
+                        resource_dict=resource_dict,
+                        pysqa_config_directory=pysqa_config_directory,
+                        backend="slurm",
+                    ),
+                )
+            else:
+                from executorlib.task_scheduler.file.task_scheduler import (
+                    create_file_executor,
+                )
+                super().__init__(
+                    executor=create_file_executor(
+                        max_workers=max_workers,
+                        backend="slurm",
+                        max_cores=max_cores,
+                        cache_directory=cache_directory,
+                        resource_dict=resource_dict,
+                        pmi_mode=pmi_mode,
+                        flux_executor=None,
+                        flux_executor_nesting=False,
+                        flux_log_files=False,
+                        pysqa_config_directory=pysqa_config_directory,
+                        hostname_localhost=hostname_localhost,
+                        block_allocation=block_allocation,
+                        init_function=init_function,
+                        disable_dependencies=disable_dependencies,
+                    )
                 )
-            )
         else:
             super().__init__(
                 executor=DependencyTaskScheduler(

{executorlib-1.6.2 → executorlib-1.7.0}/executorlib/standalone/command.py RENAMED Viewed

@@ -3,6 +3,8 @@ import os
 import sys
 from typing import Optional
+SLURM_COMMAND = "srun"
 def get_command_path(executable: str) -> str:
     """
@@ -112,3 +114,51 @@ def get_interactive_execute_command(
     else:
         command_lst += [get_command_path(executable="interactive_serial.py")]
     return command_lst
+def generate_slurm_command(
+    cores: int,
+    cwd: Optional[str],
+    threads_per_core: int = 1,
+    gpus_per_core: int = 0,
+    num_nodes: Optional[int] = None,
+    exclusive: bool = False,
+    openmpi_oversubscribe: bool = False,
+    slurm_cmd_args: Optional[list[str]] = None,
+    pmi_mode: Optional[str] = None,
+) -> list[str]:
+    """
+    Generate the command list for the SLURM interface.
+    Args:
+        cores (int): The number of cores.
+        cwd (str): The current working directory.
+        threads_per_core (int, optional): The number of threads per core. Defaults to 1.
+        gpus_per_core (int, optional): The number of GPUs per core. Defaults to 0.
+        num_nodes (int, optional): The number of compute nodes to use for executing the task. Defaults to None.
+        exclusive (bool): Whether to exclusively reserve the compute nodes, or allow sharing compute notes. Defaults to False.
+        openmpi_oversubscribe (bool, optional): Whether to oversubscribe the cores. Defaults to False.
+        slurm_cmd_args (list[str], optional): Additional command line arguments. Defaults to [].
+        pmi_mode (str): PMI interface to use (OpenMPI v5 requires pmix) default is None
+    Returns:
+        list[str]: The generated command list.
+    """
+    command_prepend_lst = [SLURM_COMMAND, "-n", str(cores)]
+    if cwd is not None:
+        command_prepend_lst += ["-D", cwd]
+    if pmi_mode is not None:
+        command_prepend_lst += ["--mpi=" + pmi_mode]
+    if num_nodes is not None:
+        command_prepend_lst += ["-N", str(num_nodes)]
+    if threads_per_core > 1:
+        command_prepend_lst += ["--cpus-per-task=" + str(threads_per_core)]
+    if gpus_per_core > 0:
+        command_prepend_lst += ["--gpus-per-task=" + str(gpus_per_core)]
+    if exclusive:
+        command_prepend_lst += ["--exact"]
+    if openmpi_oversubscribe:
+        command_prepend_lst += ["--oversubscribe"]
+    if slurm_cmd_args is not None and len(slurm_cmd_args) > 0:
+        command_prepend_lst += slurm_cmd_args
+    return command_prepend_lst

{executorlib-1.6.2 → executorlib-1.7.0}/executorlib/standalone/interactive/communication.py RENAMED Viewed

@@ -1,12 +1,16 @@
 import logging
 import sys
 from socket import gethostname
-from typing import Optional
+from typing import Any, Callable, Optional
 import cloudpickle
 import zmq
+class ExecutorlibSocketError(RuntimeError):
+    pass
 class SocketInterface:
     """
     The SocketInterface is an abstraction layer on top of the zero message queue.
@@ -14,23 +18,41 @@ class SocketInterface:
     Args:
         spawner (executorlib.shared.spawner.BaseSpawner): Interface for starting the parallel process
         log_obj_size (boolean): Enable debug mode which reports the size of the communicated objects.
+        time_out_ms (int): Time out for waiting for a message on socket in milliseconds.
     """
-    def __init__(self, spawner=None, log_obj_size=False):
+    def __init__(
+        self, spawner=None, log_obj_size: bool = False, time_out_ms: int = 1000
+    ):
         """
         Initialize the SocketInterface.
         Args:
             spawner (executorlib.shared.spawner.BaseSpawner): Interface for starting the parallel process
+            log_obj_size (boolean): Enable debug mode which reports the size of the communicated objects.
+            time_out_ms (int): Time out for waiting for a message on socket in milliseconds.
         """
         self._context = zmq.Context()
         self._socket = self._context.socket(zmq.PAIR)
+        self._poller = zmq.Poller()
+        self._poller.register(self._socket, zmq.POLLIN)
         self._process = None
+        self._time_out_ms = time_out_ms
+        self._logger: Optional[logging.Logger] = None
         if log_obj_size:
             self._logger = logging.getLogger("executorlib")
-        else:
-            self._logger = None
         self._spawner = spawner
+        self._command_lst: list[str] = []
+        self._booted_sucessfully: bool = False
+        self._stop_function: Optional[Callable] = None
+    @property
+    def status(self) -> bool:
+        return self._booted_sucessfully
+    @status.setter
+    def status(self, status: bool):
+        self._booted_sucessfully = status
     def send_dict(self, input_dict: dict):
         """
@@ -52,7 +74,14 @@ class SocketInterface:
         Returns:
             dict: dictionary with response received from the connected client
         """
-        data = self._socket.recv()
+        response_lst: list[tuple[Any, int]] = []
+        while len(response_lst) == 0:
+            response_lst = self._poller.poll(self._time_out_ms)
+            if not self._spawner.poll():
+                raise ExecutorlibSocketError(
+                    "SocketInterface crashed during execution."
+                )
+        data = self._socket.recv(zmq.NOBLOCK)
         if self._logger is not None:
             self._logger.warning(
                 "Received dictionary of size: " + str(sys.getsizeof(data))
@@ -69,7 +98,7 @@ class SocketInterface:
         Args:
             input_dict (dict): dictionary of commands to be communicated. The key "shutdown" is reserved to stop the
-                connected client from listening.
+                               connected client from listening.
         Returns:
             dict: dictionary with response received from the connected client
@@ -89,17 +118,30 @@ class SocketInterface:
     def bootup(
         self,
-        command_lst: list[str],
+        command_lst: Optional[list[str]] = None,
+        stop_function: Optional[Callable] = None,
     ):
         """
         Boot up the client process to connect to the SocketInterface.
         Args:
             command_lst (list): list of strings to start the client process
+            stop_function (Callable): Function to stop the interface.
         """
-        self._spawner.bootup(
-            command_lst=command_lst,
-        )
+        if command_lst is not None:
+            self._command_lst = command_lst
+        if stop_function is not None:
+            self._stop_function = stop_function
+        if len(self._command_lst) == 0:
+            raise ValueError("No command defined to boot up SocketInterface.")
+        if not self._spawner.bootup(
+            command_lst=self._command_lst,
+            stop_function=self._stop_function,
+        ):
+            self._reset_socket()
+            self._booted_sucessfully = False
+        else:
+            self._booted_sucessfully = True
     def shutdown(self, wait: bool = True):
         """
@@ -114,6 +156,13 @@ class SocketInterface:
                 input_dict={"shutdown": True, "wait": wait}
             )
             self._spawner.shutdown(wait=wait)
+        self._reset_socket()
+        return result
+    def _reset_socket(self):
+        """
+        Reset the socket and context of the SocketInterface instance.
+        """
         if self._socket is not None:
             self._socket.close()
         if self._context is not None:
@@ -121,7 +170,6 @@ class SocketInterface:
         self._process = None
         self._socket = None
         self._context = None
-        return result
     def __del__(self):
         """
@@ -137,6 +185,7 @@ def interface_bootup(
     hostname_localhost: Optional[bool] = None,
     log_obj_size: bool = False,
     worker_id: Optional[int] = None,
+    stop_function: Optional[Callable] = None,
 ) -> SocketInterface:
     """
     Start interface for ZMQ communication
@@ -155,13 +204,12 @@ def interface_bootup(
         log_obj_size (boolean): Enable debug mode which reports the size of the communicated objects.
         worker_id (int): Communicate the worker which ID was assigned to it for future reference and resource
                          distribution.
+        stop_function (Callable): Function to stop the interface.
     Returns:
          executorlib.shared.communication.SocketInterface: socket interface for zmq communication
     """
-    if hostname_localhost is None and sys.platform == "darwin":
-        hostname_localhost = True
-    elif hostname_localhost is None:
+    if hostname_localhost is None and sys.platform != "darwin":
         hostname_localhost = False
     if not hostname_localhost:
         command_lst += [
@@ -180,6 +228,7 @@ def interface_bootup(
     ]
     interface.bootup(
         command_lst=command_lst,
+        stop_function=stop_function,
     )
     return interface

{executorlib-1.6.2 → executorlib-1.7.0}/executorlib/standalone/interactive/spawner.py RENAMED Viewed

@@ -1,7 +1,7 @@
 import os
 import subprocess
 from abc import ABC, abstractmethod
-from typing import Optional
+from typing import Callable, Optional
 MPI_COMMAND = "mpiexec"
@@ -29,12 +29,17 @@ class BaseSpawner(ABC):
     def bootup(
         self,
         command_lst: list[str],
-    ):
+        stop_function: Optional[Callable] = None,
+    ) -> bool:
         """
         Method to start the interface.
         Args:
             command_lst (list[str]): The command list to execute.
+            stop_function (Callable): Function to stop the interface.
+        Returns:
+            bool: Whether the interface was successfully started.
         """
         raise NotImplementedError
@@ -87,12 +92,17 @@ class SubprocessSpawner(BaseSpawner):
     def bootup(
         self,
         command_lst: list[str],
-    ):
+        stop_function: Optional[Callable] = None,
+    ) -> bool:
         """
         Method to start the subprocess interface.
         Args:
             command_lst (list[str]): The command list to execute.
+            stop_function (Callable): Function to stop the interface.
+        Returns:
+            bool: Whether the interface was successfully started.
         """
         if self._cwd is not None:
             os.makedirs(self._cwd, exist_ok=True)
@@ -101,6 +111,7 @@ class SubprocessSpawner(BaseSpawner):
             cwd=self._cwd,
             stdin=subprocess.DEVNULL,
         )
+        return self.poll()
     def generate_command(self, command_lst: list[str]) -> list[str]:
         """

{executorlib-1.6.2 → executorlib-1.7.0}/executorlib/task_scheduler/file/shared.py RENAMED Viewed

@@ -7,7 +7,7 @@ from typing import Any, Callable, Optional
 from executorlib.standalone.command import get_cache_execute_command
 from executorlib.standalone.hdf import get_cache_files, get_output
 from executorlib.standalone.serialize import serialize_funct
-from executorlib.task_scheduler.file.subprocess_spawner import terminate_subprocess
+from executorlib.task_scheduler.file.spawner_subprocess import terminate_subprocess
 class FutureItem:

{executorlib-1.6.2 → executorlib-1.7.0}/executorlib/task_scheduler/file/task_scheduler.py RENAMED Viewed

@@ -11,14 +11,14 @@ from executorlib.standalone.inputcheck import (
 )
 from executorlib.task_scheduler.base import TaskSchedulerBase
 from executorlib.task_scheduler.file.shared import execute_tasks_h5
-from executorlib.task_scheduler.file.subprocess_spawner import (
+from executorlib.task_scheduler.file.spawner_subprocess import (
     execute_in_subprocess,
     terminate_subprocess,
 )
 try:
     from executorlib.standalone.scheduler import terminate_with_pysqa
-    from executorlib.task_scheduler.file.queue_spawner import execute_with_pysqa
+    from executorlib.task_scheduler.file.spawner_pysqa import execute_with_pysqa
 except ImportError:
     # If pysqa is not available fall back to executing tasks in a subprocess
     execute_with_pysqa = execute_in_subprocess  # type: ignore

executorlib 1.6.2__tar.gz → 1.7.0__tar.gz

executorlib 1.6.2__tar.gz → 1.7.0__tar.gz