parsl 2024.2.12__py3-none-any.whl → 2024.2.26__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52)
  1. parsl/channels/errors.py +1 -4
  2. parsl/configs/{comet.py → expanse.py} +5 -5
  3. parsl/dataflow/dflow.py +12 -12
  4. parsl/executors/flux/executor.py +5 -3
  5. parsl/executors/high_throughput/executor.py +56 -10
  6. parsl/executors/high_throughput/mpi_prefix_composer.py +137 -0
  7. parsl/executors/high_throughput/mpi_resource_management.py +217 -0
  8. parsl/executors/high_throughput/process_worker_pool.py +65 -9
  9. parsl/executors/radical/executor.py +6 -3
  10. parsl/executors/radical/rpex_worker.py +2 -2
  11. parsl/jobs/states.py +5 -5
  12. parsl/monitoring/db_manager.py +2 -1
  13. parsl/monitoring/monitoring.py +7 -4
  14. parsl/multiprocessing.py +3 -4
  15. parsl/providers/cobalt/cobalt.py +6 -0
  16. parsl/providers/pbspro/pbspro.py +18 -4
  17. parsl/providers/pbspro/template.py +2 -2
  18. parsl/providers/slurm/slurm.py +17 -4
  19. parsl/providers/slurm/template.py +2 -2
  20. parsl/serialize/__init__.py +7 -2
  21. parsl/serialize/facade.py +32 -1
  22. parsl/tests/test_error_handling/test_resource_spec.py +6 -0
  23. parsl/tests/test_htex/test_htex.py +66 -3
  24. parsl/tests/test_monitoring/test_incomplete_futures.py +65 -0
  25. parsl/tests/test_mpi_apps/__init__.py +0 -0
  26. parsl/tests/test_mpi_apps/test_bad_mpi_config.py +41 -0
  27. parsl/tests/test_mpi_apps/test_mpi_mode_disabled.py +51 -0
  28. parsl/tests/test_mpi_apps/test_mpi_mode_enabled.py +171 -0
  29. parsl/tests/test_mpi_apps/test_mpi_prefix.py +71 -0
  30. parsl/tests/test_mpi_apps/test_mpi_scheduler.py +158 -0
  31. parsl/tests/test_mpi_apps/test_resource_spec.py +145 -0
  32. parsl/tests/test_providers/test_cobalt_deprecation_warning.py +16 -0
  33. parsl/tests/test_providers/test_pbspro_template.py +28 -0
  34. parsl/tests/test_providers/test_slurm_template.py +29 -0
  35. parsl/tests/test_radical/test_mpi_funcs.py +1 -0
  36. parsl/tests/test_scaling/test_scale_down.py +6 -5
  37. parsl/tests/test_serialization/test_htex_code_cache.py +57 -0
  38. parsl/tests/test_serialization/test_pack_resource_spec.py +22 -0
  39. parsl/usage_tracking/usage.py +29 -55
  40. parsl/utils.py +12 -35
  41. parsl/version.py +1 -1
  42. {parsl-2024.2.12.data → parsl-2024.2.26.data}/scripts/process_worker_pool.py +65 -9
  43. {parsl-2024.2.12.dist-info → parsl-2024.2.26.dist-info}/METADATA +2 -2
  44. {parsl-2024.2.12.dist-info → parsl-2024.2.26.dist-info}/RECORD +50 -37
  45. parsl/configs/cooley.py +0 -29
  46. parsl/configs/theta.py +0 -33
  47. {parsl-2024.2.12.data → parsl-2024.2.26.data}/scripts/exec_parsl_function.py +0 -0
  48. {parsl-2024.2.12.data → parsl-2024.2.26.data}/scripts/parsl_coprocess.py +0 -0
  49. {parsl-2024.2.12.dist-info → parsl-2024.2.26.dist-info}/LICENSE +0 -0
  50. {parsl-2024.2.12.dist-info → parsl-2024.2.26.dist-info}/WHEEL +0 -0
  51. {parsl-2024.2.12.dist-info → parsl-2024.2.26.dist-info}/entry_points.txt +0 -0
  52. {parsl-2024.2.12.dist-info → parsl-2024.2.26.dist-info}/top_level.txt +0 -0
parsl/channels/errors.py CHANGED
@@ -14,11 +14,8 @@ class ChannelError(ParslError):
         self.e = e
         self.hostname = hostname
 
-    def __repr__(self) -> str:
-        return "Hostname:{0}, Reason:{1}".format(self.hostname, self.reason)
-
     def __str__(self) -> str:
-        return self.__repr__()
+        return "Hostname:{0}, Reason:{1}".format(self.hostname, self.reason)
 
 
 class BadHostKeyException(ChannelError):
parsl/configs/{comet.py → expanse.py} RENAMED
@@ -7,11 +7,11 @@ from parsl.executors import HighThroughputExecutor
 config = Config(
     executors=[
         HighThroughputExecutor(
-            label='Comet_HTEX_multinode',
-            worker_logdir_root='YOUR_LOGDIR_ON_COMET',
-            max_workers=2,
+            label='Expanse_CPU_Multinode',
+            max_workers=32,
             provider=SlurmProvider(
-                'debug',
+                'compute',
+                account='YOUR_ALLOCATION_ON_EXPANSE',
                 launcher=SrunLauncher(),
                 # string to prepend to #SBATCH blocks in the submit
                 # script to the scheduler
@@ -19,7 +19,7 @@ config = Config(
                 # Command to be run before starting a worker, such as:
                 # 'module load Anaconda; source activate parsl_env'.
                 worker_init='',
-                walltime='00:10:00',
+                walltime='01:00:00',
                 init_blocks=1,
                 max_blocks=1,
                 nodes_per_block=2,
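
For reference, the renamed Expanse config assembles to roughly the following. This is a sketch built only from the hunks shown above; lines not visible in this diff (for example the scheduler_options entry suggested by the "#SBATCH" comment) are omitted.

from parsl.config import Config
from parsl.executors import HighThroughputExecutor
from parsl.launchers import SrunLauncher
from parsl.providers import SlurmProvider

config = Config(
    executors=[
        HighThroughputExecutor(
            label='Expanse_CPU_Multinode',
            max_workers=32,
            provider=SlurmProvider(
                'compute',
                account='YOUR_ALLOCATION_ON_EXPANSE',
                launcher=SrunLauncher(),
                worker_init='',
                walltime='01:00:00',
                init_blocks=1,
                max_blocks=1,
                nodes_per_block=2,
            ),
        )
    ]
)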
parsl/dataflow/dflow.py CHANGED
@@ -113,7 +113,7 @@ class DataFlowKernel:
             if self.monitoring.logdir is None:
                 self.monitoring.logdir = self.run_dir
             self.hub_address = self.monitoring.hub_address
-            self.hub_interchange_port = self.monitoring.start(self.run_id, self.run_dir)
+            self.hub_interchange_port = self.monitoring.start(self.run_id, self.run_dir, self.config.run_dir)
 
         self.time_began = datetime.datetime.now()
         self.time_completed: Optional[datetime.datetime] = None
@@ -678,10 +678,10 @@ class DataFlowKernel:
             task_record : The task record
 
         Returns:
-            Future that tracks the execution of the submitted executable
+            Future that tracks the execution of the submitted function
         """
         task_id = task_record['id']
-        executable = task_record['func']
+        function = task_record['func']
         args = task_record['args']
         kwargs = task_record['kwargs']
 
@@ -706,17 +706,17 @@
 
         if self.monitoring is not None and self.monitoring.resource_monitoring_enabled:
             wrapper_logging_level = logging.DEBUG if self.monitoring.monitoring_debug else logging.INFO
-            (executable, args, kwargs) = self.monitoring.monitor_wrapper(executable, args, kwargs, try_id, task_id,
-                                                                          self.monitoring.monitoring_hub_url,
-                                                                          self.run_id,
-                                                                          wrapper_logging_level,
-                                                                          self.monitoring.resource_monitoring_interval,
-                                                                          executor.radio_mode,
-                                                                          executor.monitor_resources(),
-                                                                          self.run_dir)
+            (function, args, kwargs) = self.monitoring.monitor_wrapper(function, args, kwargs, try_id, task_id,
+                                                                        self.monitoring.monitoring_hub_url,
+                                                                        self.run_id,
+                                                                        wrapper_logging_level,
+                                                                        self.monitoring.resource_monitoring_interval,
+                                                                        executor.radio_mode,
+                                                                        executor.monitor_resources(),
+                                                                        self.run_dir)
 
         with self.submitter_lock:
-            exec_fu = executor.submit(executable, task_record['resource_specification'], *args, **kwargs)
+            exec_fu = executor.submit(function, task_record['resource_specification'], *args, **kwargs)
             self.update_task_state(task_record, States.launched)
 
         self._send_task_log_info(task_record)
parsl/executors/flux/executor.py CHANGED
@@ -24,7 +24,7 @@ from parsl.executors.flux.flux_instance_manager import __file__ as _MANAGER_PATH
 from parsl.executors.errors import ScalingFailed
 from parsl.providers import LocalProvider
 from parsl.providers.base import ExecutionProvider
-from parsl.serialize import pack_apply_message, deserialize
+from parsl.serialize import deserialize, pack_res_spec_apply_message
 from parsl.serialize.errors import SerializationError
 from parsl.app.errors import AppException
 
@@ -284,8 +284,10 @@ class FluxExecutor(ParslExecutor, RepresentationMixin):
         infile = os.path.join(self.working_dir, f"{task_id}_in{os.extsep}pkl")
         outfile = os.path.join(self.working_dir, f"{task_id}_out{os.extsep}pkl")
         try:
-            fn_buf = pack_apply_message(
-                func, args, kwargs, buffer_threshold=1024 * 1024
+            fn_buf = pack_res_spec_apply_message(
+                func, args, kwargs,
+                resource_specification={},
+                buffer_threshold=1024 * 1024
             )
         except TypeError:
             raise SerializationError(func.__name__)
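
The switch from pack_apply_message to pack_res_spec_apply_message threads a resource_specification dict through the serialized task buffer (an empty dict here, since FluxExecutor manages resources itself). A minimal round-trip sketch, assuming the signatures implied by the hunks above and by mpi_resource_management.py below:

from parsl.serialize import (pack_res_spec_apply_message,
                             unpack_res_spec_apply_message)

def double(x):
    return 2 * x

# pack the function, its arguments and a resource spec into one buffer
buf = pack_res_spec_apply_message(double, (21,), {},
                                  resource_specification={"num_nodes": "1"},
                                  buffer_threshold=1024 * 1024)
# unpack recovers all four pieces on the worker side
fn, args, kwargs, spec = unpack_res_spec_apply_message(buf, {}, copy=False)
print(fn(*args, **kwargs), spec)   # 42 {'num_nodes': '1'}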
parsl/executors/high_throughput/executor.py CHANGED
@@ -6,12 +6,13 @@ import threading
 import queue
 import datetime
 import pickle
-from multiprocessing import Queue
+from multiprocessing import Process, Queue
 from typing import Dict, Sequence
 from typing import List, Optional, Tuple, Union, Callable
 import math
 
-from parsl.serialize import pack_apply_message, deserialize
+import parsl.launchers
+from parsl.serialize import pack_res_spec_apply_message, deserialize
 from parsl.serialize.errors import SerializationError, DeserializationError
 from parsl.app.errors import RemoteExceptionWrapper
 from parsl.jobs.states import JobStatus, JobState
@@ -19,7 +20,10 @@ from parsl.executors.high_throughput import zmq_pipes
 from parsl.executors.high_throughput import interchange
 from parsl.executors.errors import (
     BadMessage, ScalingFailed,
-    UnsupportedFeatureError
+)
+from parsl.executors.high_throughput.mpi_prefix_composer import (
+    VALID_LAUNCHERS,
+    validate_resource_spec
 )
 
 from parsl import curvezmq
@@ -50,6 +54,8 @@ DEFAULT_LAUNCH_CMD = ("process_worker_pool.py {debug} {max_workers} "
                       "{address_probe_timeout_string} "
                       "--hb_threshold={heartbeat_threshold} "
                       "--cpu-affinity {cpu_affinity} "
+                      "{enable_mpi_mode} "
+                      "--mpi-launcher={mpi_launcher} "
                       "--available-accelerators {accelerators}")
 
 
@@ -193,6 +199,17 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin):
     worker_logdir_root : string
         In case of a remote file system, specify the path to where logs will be kept.
 
+    enable_mpi_mode: bool
+        If enabled, MPI launch prefixes will be composed for the batch scheduler based on
+        the nodes available in each batch job and the resource_specification dict passed
+        from the app. This is an experimental feature, please refer to the following doc section
+        before use: https://parsl.readthedocs.io/en/stable/userguide/mpi_apps.html
+
+    mpi_launcher: str
+        This field is only used if enable_mpi_mode is set. Select one from the
+        list of supported MPI launchers = ("srun", "aprun", "mpiexec").
+        default: "mpiexec"
+
     encrypted : bool
         Flag to enable/disable encryption (CurveZMQ). Default is False.
     """
@@ -220,6 +237,8 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin):
                  poll_period: int = 10,
                  address_probe_timeout: Optional[int] = None,
                  worker_logdir_root: Optional[str] = None,
+                 enable_mpi_mode: bool = False,
+                 mpi_launcher: str = "mpiexec",
                  block_error_handler: Union[bool, Callable[[BlockProviderExecutor, Dict[str, JobStatus]], None]] = True,
                  encrypted: bool = False):
 
@@ -271,6 +290,7 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin):
         self.hub_port = None  # set to the correct hub port in dfk
         self.worker_ports = worker_ports
         self.worker_port_range = worker_port_range
+        self.interchange_proc: Optional[Process] = None
         self.interchange_port_range = interchange_port_range
         self.heartbeat_threshold = heartbeat_threshold
         self.heartbeat_period = heartbeat_period
@@ -281,6 +301,15 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin):
         self.encrypted = encrypted
         self.cert_dir = None
 
+        self.enable_mpi_mode = enable_mpi_mode
+        assert mpi_launcher in VALID_LAUNCHERS, \
+            f"mpi_launcher must be set to one of {VALID_LAUNCHERS}"
+        if self.enable_mpi_mode:
+            assert isinstance(self.provider.launcher, parsl.launchers.SingleNodeLauncher), \
+                "mpi_mode requires the provider to be configured to use a SingleNodeLauncher"
+
+        self.mpi_launcher = mpi_launcher
+
         if not launch_cmd:
             launch_cmd = DEFAULT_LAUNCH_CMD
         self.launch_cmd = launch_cmd
@@ -302,6 +331,7 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin):
         """
         debug_opts = "--debug" if self.worker_debug else ""
         max_workers = "" if self.max_workers == float('inf') else "--max_workers={}".format(self.max_workers)
+        enable_mpi_opts = "--enable_mpi_mode " if self.enable_mpi_mode else ""
 
         address_probe_timeout_string = ""
         if self.address_probe_timeout:
@@ -323,6 +353,8 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin):
                                        cert_dir=self.cert_dir,
                                        logdir=self.worker_logdir,
                                        cpu_affinity=self.cpu_affinity,
+                                       enable_mpi_mode=enable_mpi_opts,
+                                       mpi_launcher=self.mpi_launcher,
                                        accelerators=" ".join(self.available_accelerators))
         self.launch_cmd = l_cmd
         logger.debug("Launch command: {}".format(self.launch_cmd))
@@ -584,10 +616,7 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin):
         Returns:
               Future
         """
-        if resource_specification:
-            logger.error("Ignoring the call specification. "
-                         "Parsl call specification is not supported in HighThroughput Executor.")
-            raise UnsupportedFeatureError('resource specification', 'HighThroughput Executor', None)
+        validate_resource_spec(resource_specification)
 
         if self.bad_state_is_set:
             raise self.executor_exception
@@ -605,8 +634,9 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin):
         self.tasks[task_id] = fut
 
         try:
-            fn_buf = pack_apply_message(func, args, kwargs,
-                                        buffer_threshold=1024 * 1024)
+            fn_buf = pack_res_spec_apply_message(func, args, kwargs,
+                                                 resource_specification=resource_specification,
+                                                 buffer_threshold=1024 * 1024)
         except TypeError:
             raise SerializationError(func.__name__)
 
@@ -737,12 +767,28 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin):
             )
         return job_status
 
-    def shutdown(self):
+    def shutdown(self, timeout: float = 10.0):
        """Shutdown the executor, including the interchange. This does not
        shut down any workers directly - workers should be terminated by the
        scaling mechanism or by heartbeat timeout.
+
+        Parameters
+        ----------
+
+        timeout : float
+            Amount of time to wait for the Interchange process to terminate before
+            we forcefully kill it.
        """
+        if self.interchange_proc is None:
+            logger.info("HighThroughputExecutor has not started; skipping shutdown")
+            return
 
         logger.info("Attempting HighThroughputExecutor shutdown")
+
         self.interchange_proc.terminate()
+        self.interchange_proc.join(timeout=timeout)
+        if self.interchange_proc.is_alive():
+            logger.info("Unable to terminate Interchange process; sending SIGKILL")
+            self.interchange_proc.kill()
+
         logger.info("Finished HighThroughputExecutor shutdown attempt")
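
Taken together, these executor changes add an opt-in MPI mode: enable_mpi_mode and mpi_launcher are forwarded to the worker pool command line, submit() now validates and packs the per-task resource specification instead of raising UnsupportedFeatureError, and shutdown() gains a timeout. Below is a minimal usage sketch, not taken from this diff; the SlurmProvider settings, the partition name and the parsl_resource_specification keyword are assumptions, so consult https://parsl.readthedocs.io/en/stable/userguide/mpi_apps.html for the supported interface.

import parsl
from parsl import bash_app
from parsl.config import Config
from parsl.executors import HighThroughputExecutor
from parsl.launchers import SingleNodeLauncher
from parsl.providers import SlurmProvider

config = Config(
    executors=[
        HighThroughputExecutor(
            label="htex_mpi",
            enable_mpi_mode=True,               # new option introduced in this release
            mpi_launcher="srun",                # must be one of VALID_LAUNCHERS: srun, aprun, mpiexec
            provider=SlurmProvider(
                'compute',                      # hypothetical partition name
                nodes_per_block=2,
                launcher=SingleNodeLauncher(),  # asserted above when enable_mpi_mode is set
            ),
        )
    ]
)

@bash_app
def mpi_hello(parsl_resource_specification=None):
    # PARSL_MPI_PREFIX is composed per task by mpi_prefix_composer from the spec
    return '$PARSL_MPI_PREFIX hostname'

parsl.load(config)
spec = {"num_nodes": 2, "ranks_per_node": 2}    # num_ranks is derived by validate_resource_spec
future = mpi_hello(parsl_resource_specification=spec)
print(future.result())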
parsl/executors/high_throughput/mpi_prefix_composer.py ADDED
@@ -0,0 +1,137 @@
+import logging
+from typing import Dict, List, Tuple, Set
+
+logger = logging.getLogger(__name__)
+
+VALID_LAUNCHERS = ('srun',
+                   'aprun',
+                   'mpiexec')
+
+
+class InvalidResourceSpecification(Exception):
+    """Exception raised when Invalid keys are supplied via resource specification"""
+
+    def __init__(self, invalid_keys: Set[str]):
+        self.invalid_keys = invalid_keys
+
+    def __str__(self):
+        return f"Invalid resource specification options supplied: {self.invalid_keys}"
+
+
+def validate_resource_spec(resource_spec: Dict[str, str]):
+    """Basic validation of keys in the resource_spec
+
+    Raises: InvalidResourceSpecification if the resource_spec
+    is invalid (e.g, contains invalid keys)
+    """
+    user_keys = set(resource_spec.keys())
+    legal_keys = set(("ranks_per_node",
+                      "num_nodes",
+                      "num_ranks",
+                      "launcher_options",
+                      ))
+    invalid_keys = user_keys - legal_keys
+    if invalid_keys:
+        raise InvalidResourceSpecification(invalid_keys)
+    if "num_nodes" in resource_spec:
+        if not resource_spec.get("num_ranks") and resource_spec.get("ranks_per_node"):
+            resource_spec["num_ranks"] = str(int(resource_spec["num_nodes"]) * int(resource_spec["ranks_per_node"]))
+        if not resource_spec.get("ranks_per_node") and resource_spec.get("num_ranks"):
+            resource_spec["ranks_per_node"] = str(int(resource_spec["num_ranks"]) / int(resource_spec["num_nodes"]))
+    return
+
+
+def compose_mpiexec_launch_cmd(
+    resource_spec: Dict, node_hostnames: List[str]
+) -> Tuple[str, str]:
+    """Compose mpiexec launch command prefix"""
+
+    node_str = ",".join(node_hostnames)
+    args = [
+        "mpiexec",
+        "-n",
+        resource_spec.get("num_ranks"),
+        "-ppn",
+        resource_spec.get("ranks_per_node"),
+        "-hosts",
+        node_str,
+        resource_spec.get("launcher_options", ""),
+    ]
+    prefix = " ".join(str(arg) for arg in args)
+    return "PARSL_MPIEXEC_PREFIX", prefix
+
+
+def compose_srun_launch_cmd(
+    resource_spec: Dict, node_hostnames: List[str]
+) -> Tuple[str, str]:
+    """Compose srun launch command prefix"""
+
+    num_nodes = str(len(node_hostnames))
+    args = [
+        "srun",
+        "--ntasks",
+        resource_spec.get("num_ranks"),
+        "--ntasks-per-node",
+        resource_spec.get("ranks_per_node"),
+        "--nodelist",
+        ",".join(node_hostnames),
+        "--nodes",
+        num_nodes,
+        resource_spec.get("launcher_options", ""),
+    ]
+
+    prefix = " ".join(str(arg) for arg in args)
+    return "PARSL_SRUN_PREFIX", prefix
+
+
+def compose_aprun_launch_cmd(
+    resource_spec: Dict, node_hostnames: List[str]
+) -> Tuple[str, str]:
+    """Compose aprun launch command prefix"""
+
+    node_str = ",".join(node_hostnames)
+    args = [
+        "aprun",
+        "-n",
+        resource_spec.get("num_ranks"),
+        "-N",
+        resource_spec.get("ranks_per_node"),
+        "-node-list",
+        node_str,
+        resource_spec.get("launcher_options", ""),
+    ]
+    prefix = " ".join(str(arg) for arg in args)
+    return "PARSL_APRUN_PREFIX", prefix
+
+
+def compose_all(
+    mpi_launcher: str, resource_spec: Dict, node_hostnames: List[str]
+) -> Dict[str, str]:
+    """Compose all launch command prefixes and set the default"""
+
+    all_prefixes = {}
+    composers = [
+        compose_aprun_launch_cmd,
+        compose_srun_launch_cmd,
+        compose_mpiexec_launch_cmd,
+    ]
+    for composer in composers:
+        try:
+            key, prefix = composer(resource_spec, node_hostnames)
+            all_prefixes[key] = prefix
+        except Exception:
+            logging.exception(
+                f"Failed to compose launch prefix with {composer} from {resource_spec}"
+            )
+            pass
+
+    if mpi_launcher == "srun":
+        all_prefixes["PARSL_MPI_PREFIX"] = all_prefixes["PARSL_SRUN_PREFIX"]
+    elif mpi_launcher == "aprun":
+        all_prefixes["PARSL_MPI_PREFIX"] = all_prefixes["PARSL_APRUN_PREFIX"]
+    elif mpi_launcher == "mpiexec":
+        all_prefixes["PARSL_MPI_PREFIX"] = all_prefixes["PARSL_MPIEXEC_PREFIX"]
+    else:
+        raise RuntimeError(f"Unknown mpi_launcher:{mpi_launcher}")
+
+    return all_prefixes
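
A small illustration of the new composer module in isolation; the hostnames here are invented, and in normal operation these functions are driven by the worker pool rather than by user code.

from parsl.executors.high_throughput.mpi_prefix_composer import (
    compose_all,
    validate_resource_spec,
)

spec = {"num_nodes": "2", "ranks_per_node": "4"}
validate_resource_spec(spec)   # fills in num_ranks="8"; unknown keys raise InvalidResourceSpecification

prefixes = compose_all("mpiexec", spec, node_hostnames=["nid001", "nid002"])
print(prefixes["PARSL_MPI_PREFIX"])
# -> mpiexec -n 8 -ppn 4 -hosts nid001,nid002
# prefixes also carries PARSL_SRUN_PREFIX, PARSL_APRUN_PREFIX and PARSL_MPIEXEC_PREFIX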
parsl/executors/high_throughput/mpi_resource_management.py ADDED
@@ -0,0 +1,217 @@
+import logging
+import multiprocessing
+import os
+import pickle
+import queue
+import subprocess
+from enum import Enum
+from typing import Dict, List
+
+from parsl.multiprocessing import SpawnContext
+from parsl.serialize import (pack_res_spec_apply_message,
+                             unpack_res_spec_apply_message)
+
+logger = logging.getLogger(__name__)
+
+
+class Scheduler(Enum):
+    Unknown = 0
+    Slurm = 1
+    PBS = 2
+    Cobalt = 3
+
+
+def get_slurm_hosts_list() -> List[str]:
+    """Get list of slurm hosts from scontrol"""
+    cmd = "scontrol show hostname $SLURM_NODELIST"
+    b_output = subprocess.check_output(
+        cmd, stderr=subprocess.STDOUT, shell=True
+    )  # bytes
+    output = b_output.decode().strip().split()
+    return output
+
+
+def get_pbs_hosts_list() -> List[str]:
+    """Get list of PBS hosts from envvar: PBS_NODEFILE"""
+    nodefile_name = os.environ["PBS_NODEFILE"]
+    with open(nodefile_name) as f:
+        return [line.strip() for line in f.readlines()]
+
+
+def get_cobalt_hosts_list() -> List[str]:
+    """Get list of COBALT hosts from envvar: COBALT_NODEFILE"""
+    nodefile_name = os.environ["COBALT_NODEFILE"]
+    with open(nodefile_name) as f:
+        return [line.strip() for line in f.readlines()]
+
+
+def get_nodes_in_batchjob(scheduler: Scheduler) -> List[str]:
+    """Get nodelist from all supported schedulers"""
+    nodelist = []
+    if scheduler == Scheduler.Slurm:
+        nodelist = get_slurm_hosts_list()
+    elif scheduler == Scheduler.PBS:
+        nodelist = get_pbs_hosts_list()
+    elif scheduler == Scheduler.Cobalt:
+        nodelist = get_cobalt_hosts_list()
+    else:
+        raise RuntimeError(f"mpi_mode does not support scheduler:{scheduler}")
+    return nodelist
+
+
+def identify_scheduler() -> Scheduler:
+    """Use envvars to determine batch scheduler"""
+    if os.environ.get("SLURM_NODELIST"):
+        return Scheduler.Slurm
+    elif os.environ.get("PBS_NODEFILE"):
+        return Scheduler.PBS
+    elif os.environ.get("COBALT_NODEFILE"):
+        return Scheduler.Cobalt
+    else:
+        return Scheduler.Unknown
+
+
+class MPINodesUnavailable(Exception):
+    """Raised if there are no free nodes available for an MPI request"""
+
+    def __init__(self, requested: int, available: int):
+        self.requested = requested
+        self.available = available
+
+    def __str__(self):
+        return f"MPINodesUnavailable(requested={self.requested} available={self.available})"
+
+
+class TaskScheduler:
+    """Default TaskScheduler that does no taskscheduling
+
+    This class simply acts as an abstraction over the task_q and result_q
+    that can be extended to implement more complex task scheduling logic
+    """
+    def __init__(
+        self,
+        pending_task_q: multiprocessing.Queue,
+        pending_result_q: multiprocessing.Queue,
+    ):
+        self.pending_task_q = pending_task_q
+        self.pending_result_q = pending_result_q
+
+    def put_task(self, task) -> None:
+        return self.pending_task_q.put(task)
+
+    def get_result(self, block: bool, timeout: float):
+        return self.pending_result_q.get(block, timeout=timeout)
+
+
+class MPITaskScheduler(TaskScheduler):
+    """Extends TaskScheduler to schedule MPI functions over provisioned nodes
+    The MPITaskScheduler runs on a Manager on the lead node of a batch job, as
+    such it is expected to control task placement over this single batch job.
+
+    The MPITaskScheduler adds the following functionality:
+    1) Determine list of nodes attached to current batch job
+    2) put_task for execution onto workers:
+       a) if resources are available attach resource list
+       b) if unavailable place tasks into backlog
+    3) get_result will fetch a result and relinquish nodes,
+       and attempt to schedule tasks in backlog if any.
+    """
+    def __init__(
+        self,
+        pending_task_q: multiprocessing.Queue,
+        pending_result_q: multiprocessing.Queue,
+    ):
+        super().__init__(pending_task_q, pending_result_q)
+        self.scheduler = identify_scheduler()
+        # PriorityQueue is threadsafe
+        self._backlog_queue: queue.PriorityQueue = queue.PriorityQueue()
+        self._map_tasks_to_nodes: Dict[str, List[str]] = {}
+        self.available_nodes = get_nodes_in_batchjob(self.scheduler)
+        self._free_node_counter = SpawnContext.Value("i", len(self.available_nodes))
+        # mp.Value has issues with mypy
+        # issue https://github.com/python/typeshed/issues/8799
+        # from mypy 0.981 onwards
+        self.nodes_q: queue.Queue = queue.Queue()
+        for node in self.available_nodes:
+            self.nodes_q.put(node)
+
+        logger.info(
+            f"Starting MPITaskScheduler with {len(self.available_nodes)}"
+        )
+
+    def _get_nodes(self, num_nodes: int) -> List[str]:
+        """Thread safe method to acquire num_nodes from free resources
+
+        Raises: MPINodesUnavailable if there aren't enough resources
+        Returns: List of nodenames:str
+        """
+        logger.debug(
+            f"Requesting : {num_nodes=} we have {self._free_node_counter}"
+        )
+        acquired_nodes = []
+        with self._free_node_counter.get_lock():
+            if num_nodes <= self._free_node_counter.value:  # type: ignore[attr-defined]
+                self._free_node_counter.value -= num_nodes  # type: ignore[attr-defined]
+            else:
+                raise MPINodesUnavailable(
+                    requested=num_nodes, available=self._free_node_counter.value  # type: ignore[attr-defined]
+                )
+
+        for i in range(num_nodes):
+            node = self.nodes_q.get()
+            acquired_nodes.append(node)
+        return acquired_nodes
+
+    def _return_nodes(self, nodes: List[str]) -> None:
+        """Threadsafe method to return a list of nodes"""
+        for node in nodes:
+            self.nodes_q.put(node)
+        with self._free_node_counter.get_lock():
+            self._free_node_counter.value += len(nodes)  # type: ignore[attr-defined]
+
+    def put_task(self, task_package: dict):
+        """Schedule task if resources are available otherwise backlog the task"""
+        user_ns = locals()
+        user_ns.update({"__builtins__": __builtins__})
+        _f, _args, _kwargs, resource_spec = unpack_res_spec_apply_message(
+            task_package["buffer"], user_ns, copy=False
+        )
+
+        nodes_needed = resource_spec.get("num_nodes")
+        if nodes_needed:
+            try:
+                allocated_nodes = self._get_nodes(nodes_needed)
+            except MPINodesUnavailable:
+                logger.warning("Not enough resources, placing task into backlog")
+                self._backlog_queue.put((nodes_needed, task_package))
+                return
+            else:
+                resource_spec["MPI_NODELIST"] = ",".join(allocated_nodes)
+                self._map_tasks_to_nodes[task_package["task_id"]] = allocated_nodes
+                buffer = pack_res_spec_apply_message(_f, _args, _kwargs, resource_spec)
+                task_package["buffer"] = buffer
+
+        self.pending_task_q.put(task_package)
+
+    def _schedule_backlog_tasks(self):
+        """Attempt to schedule backlogged tasks"""
+        try:
+            _nodes_requested, task_package = self._backlog_queue.get(block=False)
+            self.put_task(task_package)
+        except queue.Empty:
+            return
+        else:
+            # Keep attempting to schedule tasks till we are out of resources
+            self._schedule_backlog_tasks()
+
+    def get_result(self, block: bool, timeout: float):
+        """Return result and relinquish provisioned nodes"""
+        result_pkl = self.pending_result_q.get(block, timeout=timeout)
+        result_dict = pickle.loads(result_pkl)
+        if result_dict["type"] == "result":
+            task_id = result_dict["task_id"]
+            nodes_to_reallocate = self._map_tasks_to_nodes[task_id]
+            self._return_nodes(nodes_to_reallocate)
+            self._schedule_backlog_tasks()
+
+        return result_pkl
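
The node-discovery helpers above can be exercised directly from inside a batch job; a short sketch follows (the environment is whatever the scheduler provides, e.g. SLURM_NODELIST or PBS_NODEFILE, so outside an allocation this reports Unknown).

from parsl.executors.high_throughput.mpi_resource_management import (
    Scheduler,
    get_nodes_in_batchjob,
    identify_scheduler,
)

scheduler = identify_scheduler()
if scheduler is Scheduler.Unknown:
    print("Not running inside a recognised Slurm/PBS/Cobalt batch job")
else:
    nodes = get_nodes_in_batchjob(scheduler)   # e.g. ['nid001', 'nid002', ...]
    print(f"{scheduler.name}: {len(nodes)} nodes available for MPI tasks")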