parsl 2024.7.22__py3-none-any.whl → 2024.8.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- parsl/dataflow/dflow.py +4 -10
- parsl/executors/base.py +8 -8
- parsl/executors/flux/executor.py +7 -7
- parsl/executors/high_throughput/executor.py +55 -55
- parsl/executors/high_throughput/interchange.py +37 -37
- parsl/executors/high_throughput/manager_record.py +1 -0
- parsl/executors/high_throughput/manager_selector.py +25 -0
- parsl/executors/high_throughput/process_worker_pool.py +2 -0
- parsl/executors/status_handling.py +52 -21
- parsl/executors/taskvine/executor.py +0 -18
- parsl/executors/workqueue/executor.py +0 -18
- parsl/monitoring/errors.py +6 -0
- parsl/monitoring/monitoring.py +6 -5
- parsl/monitoring/radios.py +23 -7
- parsl/monitoring/remote.py +12 -12
- parsl/monitoring/router.py +71 -30
- parsl/tests/test_htex/test_disconnected_blocks_failing_provider.py +71 -0
- parsl/tests/test_htex/test_htex.py +28 -19
- parsl/tests/test_htex/test_zmq_binding.py +2 -0
- parsl/tests/test_monitoring/test_basic.py +14 -1
- parsl/tests/test_monitoring/test_fuzz_zmq.py +2 -2
- parsl/tests/test_mpi_apps/test_mpiex.py +1 -1
- parsl/version.py +1 -1
- {parsl-2024.7.22.data → parsl-2024.8.5.data}/scripts/interchange.py +37 -37
- {parsl-2024.7.22.data → parsl-2024.8.5.data}/scripts/process_worker_pool.py +2 -0
- parsl-2024.8.5.dist-info/METADATA +101 -0
- {parsl-2024.7.22.dist-info → parsl-2024.8.5.dist-info}/RECORD +33 -30
- {parsl-2024.7.22.dist-info → parsl-2024.8.5.dist-info}/WHEEL +1 -1
- parsl-2024.7.22.dist-info/METADATA +0 -101
- {parsl-2024.7.22.data → parsl-2024.8.5.data}/scripts/exec_parsl_function.py +0 -0
- {parsl-2024.7.22.data → parsl-2024.8.5.data}/scripts/parsl_coprocess.py +0 -0
- {parsl-2024.7.22.dist-info → parsl-2024.8.5.dist-info}/LICENSE +0 -0
- {parsl-2024.7.22.dist-info → parsl-2024.8.5.dist-info}/entry_points.txt +0 -0
- {parsl-2024.7.22.dist-info → parsl-2024.8.5.dist-info}/top_level.txt +0 -0
parsl/executors/status_handling.py
CHANGED
@@ -12,7 +12,7 @@ from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple, Union
 from parsl.executors.base import ParslExecutor
 from parsl.executors.errors import BadStateException, ScalingFailed
 from parsl.jobs.error_handlers import noop_error_handler, simple_error_handler
-from parsl.jobs.states import JobState, JobStatus
+from parsl.jobs.states import TERMINAL_STATES, JobState, JobStatus
 from parsl.monitoring.message_type import MessageType
 from parsl.providers.base import ExecutionProvider
 from parsl.utils import AtomicIDCounter
@@ -167,41 +167,82 @@ class BlockProviderExecutor(ParslExecutor):
     def provider(self):
         return self._provider
 
-    def _filter_scale_in_ids(self, to_kill, killed):
+    def _filter_scale_in_ids(self, to_kill: Sequence[Any], killed: Sequence[bool]) -> Sequence[Any]:
         """ Filter out job id's that were not killed
         """
         assert len(to_kill) == len(killed)
+
+        if False in killed:
+            killed_job_ids = [jid for jid, k in zip(to_kill, killed) if k]
+            not_killed_job_ids = [jid for jid, k in zip(to_kill, killed) if not k]
+            logger.warning("Some jobs were not killed successfully: "
+                           f"killed jobs: {killed_job_ids}, "
+                           f"not-killed jobs: {not_killed_job_ids}")
+
         # Filters first iterable by bool values in second
         return list(compress(to_kill, killed))
 
-    def _scale_out(self, blocks: int = 1) -> List[str]:
+    def scale_out_facade(self, n: int) -> List[str]:
         """Scales out the number of blocks by "blocks"
         """
         if not self.provider:
             raise ScalingFailed(self, "No execution provider available")
         block_ids = []
-        logger.info(f"Scaling out by {blocks} blocks")
-        for _ in range(blocks):
+        monitoring_status_changes = {}
+        logger.info(f"Scaling out by {n} blocks")
+        for _ in range(n):
             block_id = str(self._block_id_counter.get_id())
             logger.info(f"Allocated block ID {block_id}")
             try:
                 job_id = self._launch_block(block_id)
+
+                pending_status = JobStatus(JobState.PENDING)
+
                 self.blocks_to_job_id[block_id] = job_id
                 self.job_ids_to_block[job_id] = block_id
+                self._status[block_id] = pending_status
+
+                monitoring_status_changes[block_id] = pending_status
                 block_ids.append(block_id)
+
             except Exception as ex:
-                self._simulated_status[block_id] = JobStatus(JobState.FAILED, "Failed to start block {}: {}".format(block_id, ex))
+                failed_status = JobStatus(JobState.FAILED, "Failed to start block {}: {}".format(block_id, ex))
+                self._simulated_status[block_id] = failed_status
+                self._status[block_id] = failed_status
+
+        self.send_monitoring_info(monitoring_status_changes)
         return block_ids
 
-    @abstractmethod
     def scale_in(self, blocks: int) -> List[str]:
         """Scale in method.
 
         Cause the executor to reduce the number of blocks by count.
 
+        The default implementation will kill blocks without regard to their
+        status or whether they are executing tasks. Executors with more
+        nuanced scaling strategies might overload this method to work with
+        that strategy - see the HighThroughputExecutor for an example of that.
+
         :return: A list of block ids corresponding to the blocks that were removed.
         """
-        pass
+
+        active_blocks = [block_id for block_id, status in self._status.items()
+                         if status.state not in TERMINAL_STATES]
+
+        block_ids_to_kill = active_blocks[:blocks]
+
+        job_ids_to_kill = [self.blocks_to_job_id[block] for block in block_ids_to_kill]
+
+        # Cancel the blocks provisioned
+        if self.provider:
+            logger.info(f"Scaling in jobs: {job_ids_to_kill}")
+            r = self.provider.cancel(job_ids_to_kill)
+            job_ids = self._filter_scale_in_ids(job_ids_to_kill, r)
+            block_ids_killed = [self.job_ids_to_block[job_id] for job_id in job_ids]
+            return block_ids_killed
+        else:
+            logger.error("No execution provider available to scale in")
+            return []
 
     def _launch_block(self, block_id: str) -> Any:
         launch_cmd = self._get_launch_command(block_id)
@@ -235,10 +276,10 @@ class BlockProviderExecutor(ParslExecutor):
 
     def send_monitoring_info(self, status: Dict) -> None:
         # Send monitoring info for HTEX when monitoring enabled
-        if self.monitoring_radio:
+        if self.submit_monitoring_radio:
             msg = self.create_monitoring_info(status)
-            logger.debug("Sending message {} to hub from job status poller".format(msg))
-            self.monitoring_radio.send((MessageType.BLOCK_INFO, msg))
+            logger.debug("Sending block monitoring message: %r", msg)
+            self.submit_monitoring_radio.send((MessageType.BLOCK_INFO, msg))
 
     def create_monitoring_info(self, status: Dict[str, JobStatus]) -> Sequence[object]:
         """Create a monitoring message for each block based on the poll status.
@@ -310,13 +351,3 @@ class BlockProviderExecutor(ParslExecutor):
             del self._status[block_id]
         self.send_monitoring_info(new_status)
         return block_ids
-
-    def scale_out_facade(self, n: int) -> List[str]:
-        block_ids = self._scale_out(n)
-        if block_ids is not None:
-            new_status = {}
-            for block_id in block_ids:
-                new_status[block_id] = JobStatus(JobState.PENDING)
-            self.send_monitoring_info(new_status)
-            self._status.update(new_status)
-        return block_ids
parsl/executors/taskvine/executor.py
CHANGED
@@ -573,24 +573,6 @@ class TaskVineExecutor(BlockProviderExecutor, putils.RepresentationMixin):
     def workers_per_node(self) -> Union[int, float]:
         return 1
 
-    def scale_in(self, count: int) -> List[str]:
-        """Scale in method. Cancel a given number of blocks
-        """
-        # Obtain list of blocks to kill
-        to_kill = list(self.blocks_to_job_id.keys())[:count]
-        kill_ids = [self.blocks_to_job_id[block] for block in to_kill]
-
-        # Cancel the blocks provisioned
-        if self.provider:
-            logger.info(f"Scaling in jobs: {kill_ids}")
-            r = self.provider.cancel(kill_ids)
-            job_ids = self._filter_scale_in_ids(kill_ids, r)
-            block_ids_killed = [self.job_ids_to_block[jid] for jid in job_ids]
-            return block_ids_killed
-        else:
-            logger.error("No execution provider available to scale")
-            return []
-
     def shutdown(self, *args, **kwargs):
         """Shutdown the executor. Sets flag to cancel the submit process and
         collector thread, which shuts down the TaskVine system submission.
parsl/executors/workqueue/executor.py
CHANGED
@@ -689,24 +689,6 @@ class WorkQueueExecutor(BlockProviderExecutor, putils.RepresentationMixin):
     def workers_per_node(self) -> Union[int, float]:
         return self.scaling_cores_per_worker
 
-    def scale_in(self, count: int) -> List[str]:
-        """Scale in method.
-        """
-        # Obtain list of blocks to kill
-        to_kill = list(self.blocks_to_job_id.keys())[:count]
-        kill_ids = [self.blocks_to_job_id[block] for block in to_kill]
-
-        # Cancel the blocks provisioned
-        if self.provider:
-            logger.info(f"Scaling in jobs: {kill_ids}")
-            r = self.provider.cancel(kill_ids)
-            job_ids = self._filter_scale_in_ids(kill_ids, r)
-            block_ids_killed = [self.job_ids_to_block[jid] for jid in job_ids]
-            return block_ids_killed
-        else:
-            logger.error("No execution provider available to scale in")
-            return []
-
     def shutdown(self, *args, **kwargs):
         """Shutdown the executor. Sets flag to cancel the submit process and
         collector thread, which shuts down the Work Queue system submission.
parsl/monitoring/monitoring.py
CHANGED
@@ -12,8 +12,9 @@ from typing import TYPE_CHECKING, Any, Optional, Tuple, Union, cast
 import typeguard
 
 from parsl.log_utils import set_file_logger
+from parsl.monitoring.errors import MonitoringHubStartError
 from parsl.monitoring.message_type import MessageType
-from parsl.monitoring.radios import MultiprocessingQueueRadio
+from parsl.monitoring.radios import MultiprocessingQueueRadioSender
 from parsl.monitoring.router import router_starter
 from parsl.monitoring.types import AddressedMonitoringMessage
 from parsl.multiprocessing import ForkProcess, SizedQueue
@@ -105,7 +106,7 @@ class MonitoringHub(RepresentationMixin):
         self.resource_monitoring_enabled = resource_monitoring_enabled
         self.resource_monitoring_interval = resource_monitoring_interval
 
-    def start(self, run_id: str, dfk_run_dir: str, config_run_dir: Union[str, os.PathLike]) -> int:
+    def start(self, run_id: str, dfk_run_dir: str, config_run_dir: Union[str, os.PathLike]) -> None:
 
         logger.debug("Starting MonitoringHub")
 
@@ -187,7 +188,7 @@ class MonitoringHub(RepresentationMixin):
         self.filesystem_proc.start()
         logger.info(f"Started filesystem radio receiver process {self.filesystem_proc.pid}")
 
-        self.radio = MultiprocessingQueueRadio(self.block_msgs)
+        self.radio = MultiprocessingQueueRadioSender(self.block_msgs)
 
         try:
             comm_q_result = comm_q.get(block=True, timeout=120)
@@ -195,7 +196,7 @@ class MonitoringHub(RepresentationMixin):
             comm_q.join_thread()
         except queue.Empty:
             logger.error("Hub has not completed initialization in 120s. Aborting")
-            raise Exception("Hub failed to start")
+            raise MonitoringHubStartError()
 
         if isinstance(comm_q_result, str):
             logger.error(f"MonitoringRouter sent an error message: {comm_q_result}")
@@ -207,7 +208,7 @@ class MonitoringHub(RepresentationMixin):
 
         logger.info("Monitoring Hub initialized")
 
-        return zmq_port
+        self.hub_zmq_port = zmq_port
 
     # TODO: tighten the Any message format
     def send(self, mtype: MessageType, message: Any) -> None:
parsl/monitoring/radios.py
CHANGED
@@ -7,6 +7,8 @@ from abc import ABCMeta, abstractmethod
 from multiprocessing.queues import Queue
 from typing import Optional
 
+import zmq
+
 from parsl.serialize import serialize
 
 _db_manager_excepts: Optional[Exception]
@@ -15,14 +17,14 @@ _db_manager_excepts: Optional[Exception]
 logger = logging.getLogger(__name__)
 
 
-class MonitoringRadio(metaclass=ABCMeta):
+class MonitoringRadioSender(metaclass=ABCMeta):
     @abstractmethod
     def send(self, message: object) -> None:
         pass
 
 
-class FilesystemRadio(MonitoringRadio):
-    """A MonitoringRadio that sends messages over a shared filesystem.
+class FilesystemRadioSender(MonitoringRadioSender):
+    """A MonitoringRadioSender that sends messages over a shared filesystem.
 
     The messsage directory structure is based on maildir,
     https://en.wikipedia.org/wiki/Maildir
@@ -36,7 +38,7 @@ class FilesystemRadio(MonitoringRadio):
     This avoids a race condition of reading partially written messages.
 
     This radio is likely to give higher shared filesystem load compared to
-    the UDPRadio, but should be much more reliable.
+    the UDP radio, but should be much more reliable.
     """
 
     def __init__(self, *, monitoring_url: str, source_id: int, timeout: int = 10, run_dir: str):
@@ -66,7 +68,7 @@ class FilesystemRadio(MonitoringRadio):
         os.rename(tmp_filename, new_filename)
 
 
-class HTEXRadio(MonitoringRadio):
+class HTEXRadioSender(MonitoringRadioSender):
 
     def __init__(self, monitoring_url: str, source_id: int, timeout: int = 10):
         """
@@ -120,7 +122,7 @@ class HTEXRadio(MonitoringRadio):
             return
 
 
-class UDPRadio(MonitoringRadio):
+class UDPRadioSender(MonitoringRadioSender):
 
     def __init__(self, monitoring_url: str, source_id: int, timeout: int = 10):
         """
@@ -174,7 +176,7 @@ class UDPRadio(MonitoringRadio):
         return
 
 
-class MultiprocessingQueueRadio(MonitoringRadio):
+class MultiprocessingQueueRadioSender(MonitoringRadioSender):
     """A monitoring radio which connects over a multiprocessing Queue.
     This radio is intended to be used on the submit side, where components
     in the submit process, or processes launched by multiprocessing, will have
@@ -186,3 +188,17 @@ class MultiprocessingQueueRadio(MonitoringRadio):
 
     def send(self, message: object) -> None:
         self.queue.put((message, 0))
+
+
+class ZMQRadioSender(MonitoringRadioSender):
+    """A monitoring radio which connects over ZMQ. This radio is not
+    thread-safe, because its use of ZMQ is not thread-safe.
+    """
+
+    def __init__(self, hub_address: str, hub_zmq_port: int) -> None:
+        self._hub_channel = zmq.Context().socket(zmq.DEALER)
+        self._hub_channel.set_hwm(0)
+        self._hub_channel.connect(f"tcp://{hub_address}:{hub_zmq_port}")
+
+    def send(self, message: object) -> None:
+        self._hub_channel.send_pyobj(message)
parsl/monitoring/remote.py
CHANGED
@@ -8,10 +8,10 @@ from typing import Any, Callable, Dict, List, Sequence, Tuple
 
 from parsl.monitoring.message_type import MessageType
 from parsl.monitoring.radios import (
-    FilesystemRadio,
-    HTEXRadio,
-    MonitoringRadio,
-    UDPRadio,
+    FilesystemRadioSender,
+    HTEXRadioSender,
+    MonitoringRadioSender,
+    UDPRadioSender,
 )
 from parsl.multiprocessing import ForkProcess
 from parsl.process_loggers import wrap_with_logs
@@ -100,17 +100,17 @@ def monitor_wrapper(*,
     return (wrapped, args, new_kwargs)
 
 
-def get_radio(radio_mode: str, monitoring_hub_url: str, task_id: int, run_dir: str) -> MonitoringRadio:
-    radio: MonitoringRadio
+def get_radio(radio_mode: str, monitoring_hub_url: str, task_id: int, run_dir: str) -> MonitoringRadioSender:
+    radio: MonitoringRadioSender
     if radio_mode == "udp":
-        radio = UDPRadio(monitoring_hub_url,
-                         source_id=task_id)
+        radio = UDPRadioSender(monitoring_hub_url,
+                               source_id=task_id)
     elif radio_mode == "htex":
-        radio = HTEXRadio(monitoring_hub_url,
-                          source_id=task_id)
+        radio = HTEXRadioSender(monitoring_hub_url,
+                                source_id=task_id)
     elif radio_mode == "filesystem":
-        radio = FilesystemRadio(monitoring_url=monitoring_hub_url,
-                                source_id=task_id, run_dir=run_dir)
+        radio = FilesystemRadioSender(monitoring_url=monitoring_hub_url,
+                                      source_id=task_id, run_dir=run_dir)
     else:
         raise RuntimeError(f"Unknown radio mode: {radio_mode}")
     return radio
parsl/monitoring/router.py
CHANGED
@@ -5,6 +5,7 @@ import os
 import pickle
 import queue
 import socket
+import threading
 import time
 from multiprocessing.synchronize import Event
 from typing import Optional, Tuple, Union
@@ -32,7 +33,12 @@ class MonitoringRouter:
                  logdir: str = ".",
                  run_id: str,
                  logging_level: int = logging.INFO,
-                 atexit_timeout: int = 3
+                 atexit_timeout: int = 3,   # in seconds
+                 priority_msgs: "queue.Queue[AddressedMonitoringMessage]",
+                 node_msgs: "queue.Queue[AddressedMonitoringMessage]",
+                 block_msgs: "queue.Queue[AddressedMonitoringMessage]",
+                 resource_msgs: "queue.Queue[AddressedMonitoringMessage]",
+                 exit_event: Event,
                  ):
         """ Initializes a monitoring configuration class.
 
@@ -51,7 +57,11 @@ class MonitoringRouter:
             Logging level as defined in the logging module. Default: logging.INFO
         atexit_timeout : float, optional
             The amount of time in seconds to terminate the hub without receiving any messages, after the last dfk workflow message is received.
+        *_msgs : Queue
+            Four multiprocessing queues to receive messages, routed by type tag, and sometimes modified according to type tag.
 
+        exit_event : Event
+            An event that the main Parsl process will set to signal that the monitoring router should shut down.
         """
         os.makedirs(logdir, exist_ok=True)
         self.logger = set_file_logger("{}/monitoring_router.log".format(logdir),
@@ -93,22 +103,60 @@ class MonitoringRouter:
                                                        min_port=zmq_port_range[0],
                                                        max_port=zmq_port_range[1])
 
-    def start(self,
-              priority_msgs: "queue.Queue[AddressedMonitoringMessage]",
-              node_msgs: "queue.Queue[AddressedMonitoringMessage]",
-              block_msgs: "queue.Queue[AddressedMonitoringMessage]",
-              resource_msgs: "queue.Queue[AddressedMonitoringMessage]",
-              exit_event: Event) -> None:
+        self.priority_msgs = priority_msgs
+        self.node_msgs = node_msgs
+        self.block_msgs = block_msgs
+        self.resource_msgs = resource_msgs
+        self.exit_event = exit_event
+
+    @wrap_with_logs(target="monitoring_router")
+    def start(self) -> None:
+        self.logger.info("Starting UDP listener thread")
+        udp_radio_receiver_thread = threading.Thread(target=self.start_udp_listener, daemon=True)
+        udp_radio_receiver_thread.start()
+
+        self.logger.info("Starting ZMQ listener thread")
+        zmq_radio_receiver_thread = threading.Thread(target=self.start_zmq_listener, daemon=True)
+        zmq_radio_receiver_thread.start()
+
+        self.logger.info("Joining on ZMQ listener thread")
+        zmq_radio_receiver_thread.join()
+        self.logger.info("Joining on UDP listener thread")
+        udp_radio_receiver_thread.join()
+        self.logger.info("Joined on both ZMQ and UDP listener threads")
+
+    @wrap_with_logs(target="monitoring_router")
+    def start_udp_listener(self) -> None:
         try:
-            while not exit_event.is_set():
+            while not self.exit_event.is_set():
                 try:
                     data, addr = self.udp_sock.recvfrom(2048)
                     resource_msg = pickle.loads(data)
                     self.logger.debug("Got UDP Message from {}: {}".format(addr, resource_msg))
-                    resource_msgs.put((resource_msg, addr))
+                    self.resource_msgs.put((resource_msg, addr))
                 except socket.timeout:
                     pass
 
+            self.logger.info("UDP listener draining")
+            last_msg_received_time = time.time()
+            while time.time() - last_msg_received_time < self.atexit_timeout:
+                try:
+                    data, addr = self.udp_sock.recvfrom(2048)
+                    msg = pickle.loads(data)
+                    self.logger.debug("Got UDP Message from {}: {}".format(addr, msg))
+                    self.resource_msgs.put((msg, addr))
+                    last_msg_received_time = time.time()
+                except socket.timeout:
+                    pass
+
+            self.logger.info("UDP listener finishing normally")
+        finally:
+            self.logger.info("UDP listener finished")
+
+    @wrap_with_logs(target="monitoring_router")
+    def start_zmq_listener(self) -> None:
+        try:
+            while not self.exit_event.is_set():
                 try:
                     dfk_loop_start = time.time()
                     while time.time() - dfk_loop_start < 1.0:  # TODO make configurable
@@ -125,15 +173,15 @@ class MonitoringRouter:
 
                     if msg[0] == MessageType.NODE_INFO:
                         msg[1]['run_id'] = self.run_id
-                        node_msgs.put(msg_0)
+                        self.node_msgs.put(msg_0)
                     elif msg[0] == MessageType.RESOURCE_INFO:
-                        resource_msgs.put(msg_0)
+                        self.resource_msgs.put(msg_0)
                     elif msg[0] == MessageType.BLOCK_INFO:
-                        block_msgs.put(msg_0)
+                        self.block_msgs.put(msg_0)
                     elif msg[0] == MessageType.TASK_INFO:
-                        priority_msgs.put(msg_0)
+                        self.priority_msgs.put(msg_0)
                     elif msg[0] == MessageType.WORKFLOW_INFO:
-                        priority_msgs.put(msg_0)
+                        self.priority_msgs.put(msg_0)
                     else:
                         # There is a type: ignore here because if msg[0]
                         # is of the correct type, this code is unreachable,
@@ -151,21 +199,9 @@ class MonitoringRouter:
                     # thing to do.
                     self.logger.warning("Failure processing a ZMQ message", exc_info=True)
 
-            self.logger.info("Monitoring router draining")
-            last_msg_received_time = time.time()
-            while time.time() - last_msg_received_time < self.atexit_timeout:
-                try:
-                    data, addr = self.udp_sock.recvfrom(2048)
-                    msg = pickle.loads(data)
-                    self.logger.debug("Got UDP Message from {}: {}".format(addr, msg))
-                    resource_msgs.put((msg, addr))
-                    last_msg_received_time = time.time()
-                except socket.timeout:
-                    pass
-
-            self.logger.info("Monitoring router finishing normally")
+            self.logger.info("ZMQ listener finishing normally")
         finally:
-            self.logger.info("Monitoring router finished")
+            self.logger.info("ZMQ listener finished")
 
 
     @wrap_with_logs
@@ -191,7 +227,12 @@ def router_starter(comm_q: "queue.Queue[Union[Tuple[int, int], str]]",
                                   zmq_port_range=zmq_port_range,
                                   logdir=logdir,
                                   logging_level=logging_level,
-                                  run_id=run_id)
+                                  run_id=run_id,
+                                  priority_msgs=priority_msgs,
+                                  node_msgs=node_msgs,
+                                  block_msgs=block_msgs,
+                                  resource_msgs=resource_msgs,
+                                  exit_event=exit_event)
     except Exception as e:
         logger.error("MonitoringRouter construction failed.", exc_info=True)
         comm_q.put(f"Monitoring router construction failed: {e}")
@@ -200,7 +241,7 @@ def router_starter(comm_q: "queue.Queue[Union[Tuple[int, int], str]]",
 
     router.logger.info("Starting MonitoringRouter in router_starter")
     try:
-        router.start(priority_msgs, node_msgs, block_msgs, resource_msgs, exit_event)
+        router.start()
     except Exception as e:
         router.logger.exception("router.start exception")
         exception_q.put(('Hub', str(e)))
parsl/tests/test_htex/test_disconnected_blocks_failing_provider.py
ADDED
@@ -0,0 +1,71 @@
+import logging
+
+import pytest
+
+import parsl
+from parsl import Config
+from parsl.executors import HighThroughputExecutor
+from parsl.executors.errors import BadStateException
+from parsl.jobs.states import JobState, JobStatus
+from parsl.providers import LocalProvider
+
+
+class FailingProvider(LocalProvider):
+    def submit(*args, **kwargs):
+        raise RuntimeError("Deliberate failure of provider.submit")
+
+
+def local_config():
+    """Config to simulate failing blocks without connecting"""
+    return Config(
+        executors=[
+            HighThroughputExecutor(
+                label="HTEX",
+                heartbeat_period=1,
+                heartbeat_threshold=2,
+                poll_period=100,
+                max_workers_per_node=1,
+                provider=FailingProvider(
+                    init_blocks=0,
+                    max_blocks=2,
+                    min_blocks=0,
+                ),
+            )
+        ],
+        max_idletime=0.5,
+        strategy='htex_auto_scale',
+        strategy_period=0.1
+        # this strategy period needs to be a few times smaller than the
+        # status_polling_interval of FailingProvider, which is 5s at
+        # time of writing
+    )
+
+
+@parsl.python_app
+def double(x):
+    return x * 2
+
+
+@pytest.mark.local
+def test_disconnected_blocks():
+    """Test reporting of blocks that fail to connect from HTEX"""
+    dfk = parsl.dfk()
+    executor = dfk.executors["HTEX"]
+
+    connected_blocks = executor.connected_blocks()
+    assert not connected_blocks, "Expected 0 blocks"
+
+    future = double(5)
+    with pytest.raises(BadStateException):
+        future.result()
+
+    assert isinstance(future.exception(), BadStateException)
+
+    status_dict = executor.status()
+    assert len(status_dict) == 1, "Expected exactly 1 block"
+    for status in status_dict.values():
+        assert isinstance(status, JobStatus)
+        assert status.state == JobState.MISSING
+
+    connected_blocks = executor.connected_blocks()
+    assert connected_blocks == [], "Expected exactly 0 connected blocks"