parsl 2024.4.8__py3-none-any.whl → 2024.4.22__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- parsl/addresses.py +2 -2
- parsl/app/bash.py +10 -2
- parsl/app/errors.py +3 -5
- parsl/data_provider/data_manager.py +2 -1
- parsl/data_provider/zip.py +104 -0
- parsl/dataflow/dflow.py +92 -43
- parsl/dataflow/futures.py +26 -12
- parsl/executors/base.py +28 -9
- parsl/executors/high_throughput/executor.py +14 -19
- parsl/executors/high_throughput/process_worker_pool.py +3 -1
- parsl/executors/status_handling.py +81 -1
- parsl/executors/taskvine/executor.py +13 -2
- parsl/executors/workqueue/executor.py +14 -3
- parsl/jobs/job_status_poller.py +19 -113
- parsl/jobs/strategy.py +22 -27
- parsl/monitoring/monitoring.py +29 -23
- parsl/monitoring/radios.py +15 -0
- parsl/monitoring/router.py +7 -6
- parsl/providers/local/local.py +1 -1
- parsl/tests/configs/htex_local_alternate.py +2 -1
- parsl/tests/configs/taskvine_ex.py +1 -2
- parsl/tests/configs/workqueue_ex.py +1 -2
- parsl/tests/conftest.py +6 -7
- parsl/tests/test_bash_apps/test_basic.py +7 -4
- parsl/tests/test_bash_apps/test_error_codes.py +0 -3
- parsl/tests/test_bash_apps/test_kwarg_storage.py +0 -1
- parsl/tests/test_bash_apps/test_memoize.py +0 -2
- parsl/tests/test_bash_apps/test_memoize_ignore_args.py +0 -1
- parsl/tests/test_bash_apps/test_memoize_ignore_args_regr.py +0 -1
- parsl/tests/test_bash_apps/test_multiline.py +0 -1
- parsl/tests/test_bash_apps/test_stdout.py +11 -6
- parsl/tests/test_checkpointing/test_task_exit.py +1 -1
- parsl/tests/test_htex/test_zmq_binding.py +1 -0
- parsl/tests/test_monitoring/test_basic.py +46 -21
- parsl/tests/test_monitoring/test_fuzz_zmq.py +10 -1
- parsl/tests/test_monitoring/test_stdouterr.py +137 -0
- parsl/tests/test_python_apps/test_context_manager.py +3 -3
- parsl/tests/test_python_apps/test_outputs.py +0 -1
- parsl/tests/test_scaling/test_regression_1621.py +11 -11
- parsl/tests/test_scaling/test_scale_down_htex_unregistered.py +74 -0
- parsl/tests/test_staging/test_staging_stdout.py +61 -0
- parsl/tests/test_staging/test_zip_out.py +113 -0
- parsl/utils.py +11 -2
- parsl/version.py +1 -1
- {parsl-2024.4.8.data → parsl-2024.4.22.data}/scripts/process_worker_pool.py +3 -1
- {parsl-2024.4.8.dist-info → parsl-2024.4.22.dist-info}/METADATA +5 -4
- {parsl-2024.4.8.dist-info → parsl-2024.4.22.dist-info}/RECORD +53 -48
- {parsl-2024.4.8.data → parsl-2024.4.22.data}/scripts/exec_parsl_function.py +0 -0
- {parsl-2024.4.8.data → parsl-2024.4.22.data}/scripts/parsl_coprocess.py +0 -0
- {parsl-2024.4.8.dist-info → parsl-2024.4.22.dist-info}/LICENSE +0 -0
- {parsl-2024.4.8.dist-info → parsl-2024.4.22.dist-info}/WHEEL +0 -0
- {parsl-2024.4.8.dist-info → parsl-2024.4.22.dist-info}/entry_points.txt +0 -0
- {parsl-2024.4.8.dist-info → parsl-2024.4.22.dist-info}/top_level.txt +0 -0
parsl/executors/status_handling.py
CHANGED
@@ -1,15 +1,18 @@
 from __future__ import annotations
+import datetime
 import logging
 import threading
+import time
 from itertools import compress
 from abc import abstractmethod, abstractproperty
 from concurrent.futures import Future
-from typing import List, Any, Dict, Optional, Tuple, Union, Callable
+from typing import List, Any, Dict, Optional, Sequence, Tuple, Union, Callable
 
 from parsl.executors.base import ParslExecutor
 from parsl.executors.errors import BadStateException, ScalingFailed
 from parsl.jobs.states import JobStatus, JobState
 from parsl.jobs.error_handlers import simple_error_handler, noop_error_handler
+from parsl.monitoring.message_type import MessageType
 from parsl.providers.base import ExecutionProvider
 from parsl.utils import AtomicIDCounter
 
@@ -71,6 +74,9 @@ class BlockProviderExecutor(ParslExecutor):
         self.blocks_to_job_id = {}  # type: Dict[str, str]
         self.job_ids_to_block = {}  # type: Dict[str, str]
 
+        self._last_poll_time = 0.0
+        self._status = {}  # type: Dict[str, JobStatus]
+
     def _make_status_dict(self, block_ids: List[str], status_list: List[JobStatus]) -> Dict[str, JobStatus]:
         """Given a list of block ids and a list of corresponding status strings,
         returns a dictionary mapping each block id to the corresponding status
@@ -234,3 +240,77 @@ class BlockProviderExecutor(ParslExecutor):
     @abstractproperty
     def workers_per_node(self) -> Union[int, float]:
         pass
+
+    def send_monitoring_info(self, status: Dict) -> None:
+        # Send monitoring info for HTEX when monitoring enabled
+        if self.monitoring_radio:
+            msg = self.create_monitoring_info(status)
+            logger.debug("Sending message {} to hub from job status poller".format(msg))
+            self.monitoring_radio.send((MessageType.BLOCK_INFO, msg))
+
+    def create_monitoring_info(self, status: Dict[str, JobStatus]) -> Sequence[object]:
+        """Create a monitoring message for each block based on the poll status.
+        """
+        msg = []
+        for bid, s in status.items():
+            d: Dict[str, Any] = {}
+            d['run_id'] = self.run_id
+            d['status'] = s.status_name
+            d['timestamp'] = datetime.datetime.now()
+            d['executor_label'] = self.label
+            d['job_id'] = self.blocks_to_job_id.get(bid, None)
+            d['block_id'] = bid
+            msg.append(d)
+        return msg
+
+    def poll_facade(self) -> None:
+        now = time.time()
+        if now >= self._last_poll_time + self.status_polling_interval:
+            previous_status = self._status
+            self._status = self.status()
+            self._last_poll_time = now
+            delta_status = {}
+            for block_id in self._status:
+                if block_id not in previous_status \
+                   or previous_status[block_id].state != self._status[block_id].state:
+                    delta_status[block_id] = self._status[block_id]
+
+            if delta_status:
+                self.send_monitoring_info(delta_status)
+
+    @property
+    def status_facade(self) -> Dict[str, JobStatus]:
+        """Return the status of all jobs/blocks of the executor of this poller.
+
+        :return: a dictionary mapping block ids (in string) to job status
+        """
+        return self._status
+
+    def scale_in_facade(self, n: int, max_idletime: Optional[float] = None) -> List[str]:
+
+        if max_idletime is None:
+            block_ids = self.scale_in(n)
+        else:
+            # This is a HighThroughputExecutor-specific interface violation.
+            # This code hopes, through pan-codebase reasoning, that this
+            # scale_in method really does come from HighThroughputExecutor,
+            # and so does have an extra max_idletime parameter not present
+            # in the executor interface.
+            block_ids = self.scale_in(n, max_idletime=max_idletime)  # type: ignore[call-arg]
+        if block_ids is not None:
+            new_status = {}
+            for block_id in block_ids:
+                new_status[block_id] = JobStatus(JobState.CANCELLED)
+                del self._status[block_id]
+            self.send_monitoring_info(new_status)
+        return block_ids
+
+    def scale_out_facade(self, n: int) -> List[str]:
+        block_ids = self.scale_out(n)
+        if block_ids is not None:
+            new_status = {}
+            for block_id in block_ids:
+                new_status[block_id] = JobStatus(JobState.PENDING)
+            self.send_monitoring_info(new_status)
+            self._status.update(new_status)
+        return block_ids
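The poll_facade method added above only forwards blocks whose JobState changed since the previous poll. A minimal standalone sketch of that delta computation, using a stand-in Status class instead of parsl's JobStatus (illustrative only, not code from the package):

from dataclasses import dataclass
from typing import Dict

@dataclass
class Status:
    """Stand-in for parsl's JobStatus; only the state field matters here."""
    state: str

def changed_blocks(previous: Dict[str, Status], current: Dict[str, Status]) -> Dict[str, Status]:
    # Keep blocks that are new or whose state differs from the last poll,
    # mirroring the delta_status loop in poll_facade.
    return {block_id: status for block_id, status in current.items()
            if block_id not in previous or previous[block_id].state != status.state}

# Only block "1" changed state, so only it would be reported to monitoring.
prev = {"0": Status("RUNNING"), "1": Status("PENDING")}
curr = {"0": Status("RUNNING"), "1": Status("RUNNING")}
assert changed_blocks(prev, curr) == {"1": Status("RUNNING")}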
parsl/executors/taskvine/executor.py
CHANGED
@@ -596,7 +596,7 @@ class TaskVineExecutor(BlockProviderExecutor, putils.RepresentationMixin):
     def workers_per_node(self) -> Union[int, float]:
         return 1
 
-    def scale_in(self, count):
+    def scale_in(self, count: int) -> List[str]:
         """Scale in method. Cancel a given number of blocks
         """
         # Obtain list of blocks to kill
@@ -605,9 +605,14 @@
 
         # Cancel the blocks provisioned
         if self.provider:
-            self.provider.cancel(kill_ids)
+            logger.info(f"Scaling in jobs: {kill_ids}")
+            r = self.provider.cancel(kill_ids)
+            job_ids = self._filter_scale_in_ids(kill_ids, r)
+            block_ids_killed = [self.job_ids_to_block[jid] for jid in job_ids]
+            return block_ids_killed
         else:
             logger.error("No execution provider available to scale")
+            return []
 
     def shutdown(self, *args, **kwargs):
         """Shutdown the executor. Sets flag to cancel the submit process and
@@ -639,6 +644,12 @@
         logger.debug("Joining on factory process")
         self._factory_process.join()
 
+        # Shutdown multiprocessing queues
+        self._ready_task_queue.close()
+        self._ready_task_queue.join_thread()
+        self._finished_task_queue.close()
+        self._finished_task_queue.join_thread()
+
         self._is_shutdown = True
         logger.debug("TaskVine shutdown completed")
 
parsl/executors/workqueue/executor.py
CHANGED
@@ -691,7 +691,7 @@ class WorkQueueExecutor(BlockProviderExecutor, putils.RepresentationMixin):
     def workers_per_node(self) -> Union[int, float]:
         return 1
 
-    def scale_in(self, count):
+    def scale_in(self, count: int) -> List[str]:
         """Scale in method.
         """
         # Obtain list of blocks to kill
@@ -700,9 +700,14 @@
 
         # Cancel the blocks provisioned
        if self.provider:
-            self.provider.cancel(kill_ids)
+            logger.info(f"Scaling in jobs: {kill_ids}")
+            r = self.provider.cancel(kill_ids)
+            job_ids = self._filter_scale_in_ids(kill_ids, r)
+            block_ids_killed = [self.job_ids_to_block[jid] for jid in job_ids]
+            return block_ids_killed
         else:
-            logger.error("No execution provider available to scale")
+            logger.error("No execution provider available to scale in")
+            return []
 
     def shutdown(self, *args, **kwargs):
         """Shutdown the executor. Sets flag to cancel the submit process and
@@ -730,6 +735,12 @@
         logger.debug("Joining on collector thread")
         self.collector_thread.join()
 
+        logger.debug("Closing multiprocessing queues")
+        self.task_queue.close()
+        self.task_queue.join_thread()
+        self.collector_queue.close()
+        self.collector_queue.join_thread()
+
         self.is_shutdown = True
         logger.debug("Work Queue shutdown completed")
 
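Both executor shutdown paths above now close their multiprocessing queues explicitly. An illustrative sketch of that close()/join_thread() pattern on a plain multiprocessing.Queue (not parsl code; the queue here is hypothetical):

from multiprocessing import Queue

q = Queue()
q.put("final message")
q.get()            # consumer drains the queue first

q.close()          # no further puts from this process
q.join_thread()    # wait for the queue's feeder thread to flush and exit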
parsl/jobs/job_status_poller.py
CHANGED
@@ -1,13 +1,9 @@
 import logging
 import parsl
-import time
-import zmq
-from typing import Dict, List, Sequence, Optional, Union
+from typing import List, Sequence, Optional, Union
 
-from parsl.jobs.states import JobStatus, JobState
 from parsl.jobs.strategy import Strategy
 from parsl.executors.status_handling import BlockProviderExecutor
-from parsl.monitoring.message_type import MessageType
 
 
 from parsl.utils import Timer
@@ -16,137 +12,47 @@ from parsl.utils import Timer
 logger = logging.getLogger(__name__)
 
 
-class PolledExecutorFacade:
-    def __init__(self, executor: BlockProviderExecutor, dfk: Optional["parsl.dataflow.dflow.DataFlowKernel"] = None):
-        self._executor = executor
-        self._interval = executor.status_polling_interval
-        self._last_poll_time = 0.0
-        self._status = {}  # type: Dict[str, JobStatus]
-
-        # Create a ZMQ channel to send poll status to monitoring
-        self.monitoring_enabled = False
-        if dfk and dfk.monitoring is not None:
-            self.monitoring_enabled = True
-            hub_address = dfk.hub_address
-            hub_port = dfk.hub_zmq_port
-            context = zmq.Context()
-            self.hub_channel = context.socket(zmq.DEALER)
-            self.hub_channel.set_hwm(0)
-            self.hub_channel.connect("tcp://{}:{}".format(hub_address, hub_port))
-            logger.info("Monitoring enabled on job status poller")
-
-    def _should_poll(self, now: float) -> bool:
-        return now >= self._last_poll_time + self._interval
-
-    def poll(self, now: float) -> None:
-        if self._should_poll(now):
-            previous_status = self._status
-            self._status = self._executor.status()
-            self._last_poll_time = now
-            delta_status = {}
-            for block_id in self._status:
-                if block_id not in previous_status \
-                   or previous_status[block_id].state != self._status[block_id].state:
-                    delta_status[block_id] = self._status[block_id]
-
-            if delta_status:
-                self.send_monitoring_info(delta_status)
-
-    def send_monitoring_info(self, status: Dict) -> None:
-        # Send monitoring info for HTEX when monitoring enabled
-        if self.monitoring_enabled:
-            msg = self._executor.create_monitoring_info(status)
-            logger.debug("Sending message {} to hub from job status poller".format(msg))
-            self.hub_channel.send_pyobj((MessageType.BLOCK_INFO, msg))
-
-    @property
-    def status(self) -> Dict[str, JobStatus]:
-        """Return the status of all jobs/blocks of the executor of this poller.
-
-        :return: a dictionary mapping block ids (in string) to job status
-        """
-        return self._status
-
-    @property
-    def executor(self) -> BlockProviderExecutor:
-        return self._executor
-
-    def scale_in(self, n: int, max_idletime: Optional[float] = None) -> List[str]:
-
-        if max_idletime is None:
-            block_ids = self._executor.scale_in(n)
-        else:
-            # This is a HighThroughputExecutor-specific interface violation.
-            # This code hopes, through pan-codebase reasoning, that this
-            # scale_in method really does come from HighThroughputExecutor,
-            # and so does have an extra max_idletime parameter not present
-            # in the executor interface.
-            block_ids = self._executor.scale_in(n, max_idletime=max_idletime)  # type: ignore[call-arg]
-        if block_ids is not None:
-            new_status = {}
-            for block_id in block_ids:
-                new_status[block_id] = JobStatus(JobState.CANCELLED)
-                del self._status[block_id]
-            self.send_monitoring_info(new_status)
-        return block_ids
-
-    def scale_out(self, n: int) -> List[str]:
-        block_ids = self._executor.scale_out(n)
-        if block_ids is not None:
-            new_status = {}
-            for block_id in block_ids:
-                new_status[block_id] = JobStatus(JobState.PENDING)
-            self.send_monitoring_info(new_status)
-            self._status.update(new_status)
-        return block_ids
-
-    def __repr__(self) -> str:
-        return self._status.__repr__()
-
-
 class JobStatusPoller(Timer):
     def __init__(self, *, strategy: Optional[str], max_idletime: float,
                  strategy_period: Union[float, int],
-                 dfk: Optional["parsl.dataflow.dflow.DataFlowKernel"] = None) -> None:
-        self._poll_items = []  # type: List[PolledExecutorFacade]
-        self.dfk = dfk
+                 monitoring: Optional["parsl.monitoring.radios.MonitoringRadio"] = None) -> None:
+        self._executors = []  # type: List[BlockProviderExecutor]
         self._strategy = Strategy(strategy=strategy,
                                   max_idletime=max_idletime)
         super().__init__(self.poll, interval=strategy_period, name="JobStatusPoller")
 
     def poll(self) -> None:
         self._update_state()
-        self._run_error_handlers(self._poll_items)
-        self._strategy.strategize(self._poll_items)
+        self._run_error_handlers(self._executors)
+        self._strategy.strategize(self._executors)
 
-    def _run_error_handlers(self, status: List[PolledExecutorFacade]) -> None:
-        for es in status:
-            es.executor.handle_errors(es.status)
+    def _run_error_handlers(self, executors: List[BlockProviderExecutor]) -> None:
+        for e in executors:
+            e.handle_errors(e.status_facade)
 
     def _update_state(self) -> None:
-        now = time.time()
-        for item in self._poll_items:
-            item.poll(now)
+        for item in self._executors:
+            item.poll_facade()
 
     def add_executors(self, executors: Sequence[BlockProviderExecutor]) -> None:
         for executor in executors:
             if executor.status_polling_interval > 0:
                 logger.debug("Adding executor {}".format(executor.label))
-                self._poll_items.append(PolledExecutorFacade(executor, self.dfk))
+                self._executors.append(executor)
         self._strategy.add_executors(executors)
 
-    def close(self):
-        super().close()
-        for executor_facade in self._poll_items:
-            if not executor_facade.executor.bad_state_is_set:
-                logger.info(f"Scaling in executor {executor_facade.executor.label}")
+    def close(self, timeout: Optional[float] = None) -> None:
+        super().close(timeout)
+        for executor in self._executors:
+            if not executor.bad_state_is_set:
+                logger.info(f"Scaling in executor {executor.label}")
 
                 # this code needs to be at least as many blocks as need
                 # cancelling, but it is safe to be more, as the scaling
                 # code will cope with being asked to cancel more blocks
                 # than exist.
-                block_count = len(executor_facade.status)
-                executor_facade.scale_in(block_count)
+                block_count = len(executor.status_facade)
+                executor.scale_in_facade(block_count)
 
             else:  # and bad_state_is_set
-                logger.warning(f"Not scaling in executor {executor_facade.executor.label} because it is in bad state")
+                logger.warning(f"Not scaling in executor {executor.label} because it is in bad state")
parsl/jobs/strategy.py
CHANGED
@@ -5,8 +5,6 @@ import math
 import warnings
 from typing import Dict, List, Optional, Sequence, TypedDict
 
-import parsl.jobs.job_status_poller as jsp
-
 from parsl.executors import HighThroughputExecutor
 from parsl.executors.base import ParslExecutor
 from parsl.executors.status_handling import BlockProviderExecutor
@@ -150,22 +148,21 @@
         for executor in executors:
             self.executors[executor.label] = {'idle_since': None, 'first': True}
 
-    def _strategy_init_only(self, executor_facades: List[jsp.PolledExecutorFacade]) -> None:
+    def _strategy_init_only(self, executors: List[BlockProviderExecutor]) -> None:
         """Scale up to init_blocks at the start, then nothing more.
         """
-        for ef in executor_facades:
-            executor = ef.executor
+        for executor in executors:
            if self.executors[executor.label]['first']:
                 logger.debug(f"strategy_init_only: scaling out {executor.provider.init_blocks} initial blocks for {executor.label}")
-                ef.scale_out(executor.provider.init_blocks)
+                executor.scale_out_facade(executor.provider.init_blocks)
                 self.executors[executor.label]['first'] = False
             else:
                 logger.debug("strategy_init_only: doing nothing")
 
-    def _strategy_simple(self, executor_facades: List[jsp.PolledExecutorFacade]) -> None:
-        self._general_strategy(executor_facades, strategy_type='simple')
+    def _strategy_simple(self, executors: List[BlockProviderExecutor]) -> None:
+        self._general_strategy(executors, strategy_type='simple')
 
-    def _strategy_htex_auto_scale(self, executor_facades: List[jsp.PolledExecutorFacade]) -> None:
+    def _strategy_htex_auto_scale(self, executors: List[BlockProviderExecutor]) -> None:
         """HTEX specific auto scaling strategy
 
         This strategy works only for HTEX. This strategy will scale out by
@@ -180,30 +177,25 @@
         expected to scale in effectively only when # of workers, or tasks executing
         per block is close to 1.
         """
-        self._general_strategy(executor_facades, strategy_type='htex')
+        self._general_strategy(executors, strategy_type='htex')
 
     @wrap_with_logs
-    def _general_strategy(self, executor_facades: List[jsp.PolledExecutorFacade], *, strategy_type: str) -> None:
-        logger.debug(f"general strategy starting with strategy_type {strategy_type} for {len(executor_facades)} executors")
+    def _general_strategy(self, executors: List[BlockProviderExecutor], *, strategy_type: str) -> None:
+        logger.debug(f"general strategy starting with strategy_type {strategy_type} for {len(executors)} executors")
 
-        for ef in executor_facades:
-            executor = ef.executor
+        for executor in executors:
             label = executor.label
-            if not isinstance(executor, BlockProviderExecutor):
-                logger.debug(f"Not strategizing for executor {label} because scaling not enabled")
-                continue
             logger.debug(f"Strategizing for executor {label}")
 
             if self.executors[label]['first']:
-                executor = ef.executor
                 logger.debug(f"Scaling out {executor.provider.init_blocks} initial blocks for {label}")
-                ef.scale_out(executor.provider.init_blocks)
+                executor.scale_out_facade(executor.provider.init_blocks)
                 self.executors[label]['first'] = False
 
             # Tasks that are either pending completion
             active_tasks = executor.outstanding
 
-            status = ef.status
+            status = executor.status_facade
 
             # FIXME we need to handle case where provider does not define these
             # FIXME probably more of this logic should be moved to the provider
@@ -247,23 +239,26 @@
                 else:
                     # We want to make sure that max_idletime is reached
                     # before killing off resources
-                    logger.debug(f"Strategy case 1b: Executor has no active tasks, and more ({active_blocks}) than minimum blocks ({min_blocks})")
+                    logger.debug(f"Strategy case 1b: Executor has no active tasks, and more ({active_blocks})"
+                                 f" than minimum blocks ({min_blocks})")
 
                     if not self.executors[executor.label]['idle_since']:
                         logger.debug(f"Starting idle timer for executor. If idle time exceeds {self.max_idletime}s, blocks will be scaled in")
                         self.executors[executor.label]['idle_since'] = time.time()
-
                     idle_since = self.executors[executor.label]['idle_since']
+                    assert idle_since is not None, "The `if` statement above this assert should have forced idle time to be not-None"
+
                     idle_duration = time.time() - idle_since
                     if idle_duration > self.max_idletime:
                         # We have resources idle for the max duration,
                         # we have to scale_in now.
                         logger.debug(f"Idle time has reached {self.max_idletime}s for executor {label}; scaling in")
-                        ef.scale_in(active_blocks - min_blocks)
+                        executor.scale_in_facade(active_blocks - min_blocks)
 
                     else:
                         logger.debug(
-                            f"Idle time {idle_duration}s is less than max_idletime {self.max_idletime}s for executor {label}; not scaling in")
+                            f"Idle time {idle_duration}s is less than max_idletime {self.max_idletime}s"
+                            f" for executor {label}; not scaling in")
 
             # Case 2
             # More tasks than the available slots.
@@ -282,7 +277,7 @@
                     excess_blocks = math.ceil(float(excess_slots) / (tasks_per_node * nodes_per_block))
                     excess_blocks = min(excess_blocks, max_blocks - active_blocks)
                     logger.debug(f"Requesting {excess_blocks} more blocks")
-                    ef.scale_out(excess_blocks)
+                    executor.scale_out_facade(excess_blocks)
 
             elif active_slots == 0 and active_tasks > 0:
                 logger.debug("Strategy case 4a: No active slots but some active tasks - could scale out by a single block")
@@ -291,7 +286,7 @@
                 if active_blocks < max_blocks:
                     logger.debug("Requesting single block")
 
-                    ef.scale_out(1)
+                    executor.scale_out_facade(1)
                 else:
                     logger.debug("Not requesting single block, because at maxblocks already")
 
@@ -307,7 +302,7 @@
                     excess_blocks = math.ceil(float(excess_slots) / (tasks_per_node * nodes_per_block))
                     excess_blocks = min(excess_blocks, active_blocks - min_blocks)
                     logger.debug(f"Requesting scaling in by {excess_blocks} blocks with idle time {self.max_idletime}s")
-                    ef.scale_in(excess_blocks, max_idletime=self.max_idletime)
+                    executor.scale_in_facade(excess_blocks, max_idletime=self.max_idletime)
                 else:
                     logger.error("This strategy does not support scaling in except for HighThroughputExecutor - taking no action")
             else:
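For reference, the scale-out arithmetic retained in _general_strategy rounds excess task slots up to whole blocks and caps the request by max_blocks. A worked sketch with made-up numbers (how excess_slots is derived is outside these hunks):

import math

# Hypothetical values, for illustration only.
excess_slots = 34       # outstanding tasks beyond currently provisioned slots
tasks_per_node = 8
nodes_per_block = 2
active_blocks = 1
max_blocks = 4

excess_blocks = math.ceil(float(excess_slots) / (tasks_per_node * nodes_per_block))  # ceil(34 / 16) == 3
excess_blocks = min(excess_blocks, max_blocks - active_blocks)                       # capped at 4 - 1 == 3
print(excess_blocks)  # 3 -> this count is passed to executor.scale_out_facade(excess_blocks)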
parsl/monitoring/monitoring.py
CHANGED
@@ -3,13 +3,14 @@ from __future__ import annotations
 import os
 import time
 import logging
+import multiprocessing.synchronize as ms
 import typeguard
-import zmq
 
 import queue
 
 from parsl.multiprocessing import ForkProcess, SizedQueue
 from multiprocessing import Process
+from multiprocessing import Event
 from multiprocessing.queues import Queue
 from parsl.log_utils import set_file_logger
 from parsl.utils import RepresentationMixin
@@ -18,6 +19,7 @@ from parsl.utils import setproctitle
 
 from parsl.serialize import deserialize
 
+from parsl.monitoring.radios import MultiprocessingQueueRadio
 from parsl.monitoring.router import router_starter
 from parsl.monitoring.message_type import MessageType
 from parsl.monitoring.types import AddressedMonitoringMessage
@@ -90,12 +92,6 @@ class MonitoringHub(RepresentationMixin):
              Default: 30 seconds
         """
 
-        # Any is used to disable typechecking on uses of _dfk_channel,
-        # because it is used in the code as if it points to a channel, but
-        # the static type is that it can also be None. The code relies on
-        # .start() being called and initialising this to a real channel.
-        self._dfk_channel = None  # type: Any
-
         if _db_manager_excepts:
             raise _db_manager_excepts
 
@@ -157,8 +153,12 @@
         self.block_msgs: Queue[AddressedMonitoringMessage]
         self.block_msgs = SizedQueue()
 
+        self.router_exit_event: ms.Event
+        self.router_exit_event = Event()
+
         self.router_proc = ForkProcess(target=router_starter,
-                                       args=(comm_q, self.exception_q, self.priority_msgs, self.node_msgs, self.block_msgs, self.resource_msgs),
+                                       args=(comm_q, self.exception_q, self.priority_msgs, self.node_msgs,
+                                             self.block_msgs, self.resource_msgs, self.router_exit_event),
                                        kwargs={"hub_address": self.hub_address,
                                                "udp_port": self.hub_port,
                                                "zmq_port_range": self.hub_port_range,
@@ -191,8 +191,12 @@
         self.filesystem_proc.start()
         logger.info(f"Started filesystem radio receiver process {self.filesystem_proc.pid}")
 
+        self.radio = MultiprocessingQueueRadio(self.block_msgs)
+
         try:
             comm_q_result = comm_q.get(block=True, timeout=120)
+            comm_q.close()
+            comm_q.join_thread()
         except queue.Empty:
             logger.error("Hub has not completed initialization in 120s. Aborting")
             raise Exception("Hub failed to start")
@@ -205,14 +209,6 @@
 
         self.monitoring_hub_url = "udp://{}:{}".format(self.hub_address, udp_port)
 
-        context = zmq.Context()
-        self.dfk_channel_timeout = 10000  # in milliseconds
-        self._dfk_channel = context.socket(zmq.DEALER)
-        self._dfk_channel.setsockopt(zmq.LINGER, 0)
-        self._dfk_channel.set_hwm(0)
-        self._dfk_channel.setsockopt(zmq.SNDTIMEO, self.dfk_channel_timeout)
-        self._dfk_channel.connect("tcp://{}:{}".format(self.hub_address, zmq_port))
-
         logger.info("Monitoring Hub initialized")
 
         return zmq_port
@@ -220,11 +216,7 @@
     # TODO: tighten the Any message format
     def send(self, mtype: MessageType, message: Any) -> None:
         logger.debug("Sending message type {}".format(mtype))
-        try:
-            self._dfk_channel.send_pyobj((mtype, message))
-        except zmq.Again:
-            logger.exception(
-                "The monitoring message sent from DFK to router timed-out after {}ms".format(self.dfk_channel_timeout))
+        self.radio.send((mtype, message))
 
     def close(self) -> None:
         logger.info("Terminating Monitoring Hub")
@@ -235,9 +227,8 @@
                 logger.error("There was a queued exception (Either router or DBM process got exception much earlier?)")
             except queue.Empty:
                 break
-        if self._dfk_channel and self.monitoring_hub_active:
+        if self.monitoring_hub_active:
             self.monitoring_hub_active = False
-            self._dfk_channel.close()
         if exception_msgs:
             for exception_msg in exception_msgs:
                 logger.error(
@@ -249,6 +240,8 @@
             self.router_proc.terminate()
             self.dbm_proc.terminate()
             self.filesystem_proc.terminate()
+        logger.info("Setting router termination event")
+        self.router_exit_event.set()
         logger.info("Waiting for router to terminate")
         self.router_proc.join()
         logger.debug("Finished waiting for router termination")
@@ -267,6 +260,19 @@
         self.filesystem_proc.terminate()
         self.filesystem_proc.join()
 
+        logger.info("Closing monitoring multiprocessing queues")
+        self.exception_q.close()
+        self.exception_q.join_thread()
+        self.priority_msgs.close()
+        self.priority_msgs.join_thread()
+        self.resource_msgs.close()
+        self.resource_msgs.join_thread()
+        self.node_msgs.close()
+        self.node_msgs.join_thread()
+        self.block_msgs.close()
+        self.block_msgs.join_thread()
+        logger.info("Closed monitoring multiprocessing queues")
+
 
 @wrap_with_logs
 def filesystem_receiver(logdir: str, q: "queue.Queue[AddressedMonitoringMessage]", run_dir: str) -> None:
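MonitoringHub now signals the router with a multiprocessing Event rather than only terminating it. A generic sketch of that exit-event pattern (illustrative; the names here are made up and this is not the router's real API):

import time
from multiprocessing import Event, Process

def router_loop(exit_event) -> None:
    # Keep servicing messages until the parent asks us to stop.
    while not exit_event.is_set():
        time.sleep(0.1)  # stand-in for one poll of the monitoring channels

if __name__ == "__main__":
    exit_event = Event()
    proc = Process(target=router_loop, args=(exit_event,))
    proc.start()
    # ... workflow runs ...
    exit_event.set()   # signal shutdown, as MonitoringHub.close() does with router_exit_event
    proc.join()        # then wait for the router process to exit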
parsl/monitoring/radios.py
CHANGED
@@ -6,6 +6,7 @@ import logging
 
 from abc import ABCMeta, abstractmethod
 
+from multiprocessing.queues import Queue
 from typing import Optional
 
 from parsl.serialize import serialize
@@ -173,3 +174,17 @@ class UDPRadio(MonitoringRadio):
             logging.error("Could not send message within timeout limit")
             return
         return
+
+
+class MultiprocessingQueueRadio(MonitoringRadio):
+    """A monitoring radio intended which connects over a multiprocessing Queue.
+    This radio is intended to be used on the submit side, where components
+    in the submit process, or processes launched by multiprocessing, will have
+    access to a Queue shared with the monitoring database code (bypassing the
+    monitoring router).
+    """
+    def __init__(self, queue: Queue) -> None:
+        self.queue = queue
+
+    def send(self, message: object) -> None:
+        self.queue.put((message, 0))
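A small usage sketch of the new MultiprocessingQueueRadio (requires parsl 2024.4.22 or later; the message payload below is made up, any picklable object works):

from multiprocessing import Queue

from parsl.monitoring.radios import MultiprocessingQueueRadio

q = Queue()
radio = MultiprocessingQueueRadio(q)
radio.send({"example_field": 123})

# The consumer side (the database manager in this design) reads the same queue;
# send() wraps each message in a (message, 0) tuple.
message, _ = q.get()
print(message)  # {'example_field': 123}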
|