parsl 2024.3.18__py3-none-any.whl → 2024.4.1__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
- parsl/dataflow/dflow.py +35 -36
- parsl/executors/base.py +11 -1
- parsl/executors/high_throughput/executor.py +8 -20
- parsl/executors/high_throughput/process_worker_pool.py +5 -2
- parsl/executors/status_handling.py +8 -15
- parsl/executors/taskvine/executor.py +35 -11
- parsl/executors/workqueue/executor.py +33 -11
- parsl/jobs/error_handlers.py +1 -1
- parsl/jobs/job_status_poller.py +12 -11
- parsl/jobs/strategy.py +31 -18
- parsl/monitoring/monitoring.py +27 -237
- parsl/monitoring/router.py +208 -0
- parsl/tests/site_tests/test_provider.py +1 -1
- parsl/tests/test_htex/test_disconnected_blocks.py +0 -1
- parsl/tests/test_htex/test_drain.py +1 -0
- parsl/tests/test_monitoring/test_fuzz_zmq.py +2 -2
- parsl/tests/test_monitoring/test_htex_init_blocks_vs_monitoring.py +85 -0
- parsl/tests/test_python_apps/test_context_manager.py +40 -0
- parsl/tests/test_scaling/test_shutdown_scalein.py +78 -0
- parsl/tests/test_shutdown/test_kill_monitoring.py +65 -0
- parsl/version.py +1 -1
- {parsl-2024.3.18.data → parsl-2024.4.1.data}/scripts/process_worker_pool.py +5 -2
- {parsl-2024.3.18.dist-info → parsl-2024.4.1.dist-info}/METADATA +4 -4
- {parsl-2024.3.18.dist-info → parsl-2024.4.1.dist-info}/RECORD +35 -30
- /parsl/tests/{test_data → test_shutdown}/__init__.py +0 -0
- /parsl/tests/{test_data → test_staging}/test_file.py +0 -0
- /parsl/tests/{test_data → test_staging}/test_file_apps.py +0 -0
- /parsl/tests/{test_data → test_staging}/test_file_staging.py +0 -0
- /parsl/tests/{test_data → test_staging}/test_output_chain_filenames.py +0 -0
- {parsl-2024.3.18.data → parsl-2024.4.1.data}/scripts/exec_parsl_function.py +0 -0
- {parsl-2024.3.18.data → parsl-2024.4.1.data}/scripts/parsl_coprocess.py +0 -0
- {parsl-2024.3.18.dist-info → parsl-2024.4.1.dist-info}/LICENSE +0 -0
- {parsl-2024.3.18.dist-info → parsl-2024.4.1.dist-info}/WHEEL +0 -0
- {parsl-2024.3.18.dist-info → parsl-2024.4.1.dist-info}/entry_points.txt +0 -0
- {parsl-2024.3.18.dist-info → parsl-2024.4.1.dist-info}/top_level.txt +0 -0
parsl/jobs/strategy.py
CHANGED
@@ -129,8 +129,8 @@ class Strategy:
         self.executors = {}
         self.max_idletime = max_idletime

-        self.strategies = {None: self.
-                           'none': self.
+        self.strategies = {None: self._strategy_init_only,
+                           'none': self._strategy_init_only,
                            'simple': self._strategy_simple,
                            'htex_auto_scale': self._strategy_htex_auto_scale}

@@ -146,15 +146,22 @@ class Strategy:
         for executor in executors:
             self.executors[executor.label] = {'idle_since': None}

-    def
-        """
+    def _strategy_init_only(self, executor_facades: List[jsp.PolledExecutorFacade]) -> None:
+        """Scale up to init_blocks at the start, then nothing more.
         """
-
+        for ef in executor_facades:
+            if ef.first:
+                executor = ef.executor
+                logger.debug(f"strategy_init_only: scaling out {executor.provider.init_blocks} initial blocks for {executor.label}")
+                ef.scale_out(executor.provider.init_blocks)
+                ef.first = False
+            else:
+                logger.debug("strategy_init_only: doing nothing")

-    def _strategy_simple(self,
-        self._general_strategy(
+    def _strategy_simple(self, executor_facades: List[jsp.PolledExecutorFacade]) -> None:
+        self._general_strategy(executor_facades, strategy_type='simple')

-    def _strategy_htex_auto_scale(self,
+    def _strategy_htex_auto_scale(self, executor_facades: List[jsp.PolledExecutorFacade]) -> None:
         """HTEX specific auto scaling strategy

         This strategy works only for HTEX. This strategy will scale out by
@@ -169,24 +176,30 @@ class Strategy:
         expected to scale in effectively only when # of workers, or tasks executing
         per block is close to 1.
         """
-        self._general_strategy(
+        self._general_strategy(executor_facades, strategy_type='htex')

     @wrap_with_logs
-    def _general_strategy(self,
-        logger.debug(f"general strategy starting with strategy_type {strategy_type} for {len(
+    def _general_strategy(self, executor_facades, *, strategy_type):
+        logger.debug(f"general strategy starting with strategy_type {strategy_type} for {len(executor_facades)} executors")

-        for
-            executor =
+        for ef in executor_facades:
+            executor = ef.executor
             label = executor.label
             if not isinstance(executor, BlockProviderExecutor):
                 logger.debug(f"Not strategizing for executor {label} because scaling not enabled")
                 continue
             logger.debug(f"Strategizing for executor {label}")

+            if ef.first:
+                executor = ef.executor
+                logger.debug(f"Scaling out {executor.provider.init_blocks} initial blocks for {label}")
+                ef.scale_out(executor.provider.init_blocks)
+                ef.first = False
+
             # Tasks that are either pending completion
             active_tasks = executor.outstanding

-            status =
+            status = ef.status

             # FIXME we need to handle case where provider does not define these
             # FIXME probably more of this logic should be moved to the provider
@@ -242,7 +255,7 @@ class Strategy:
                         # We have resources idle for the max duration,
                         # we have to scale_in now.
                         logger.debug(f"Idle time has reached {self.max_idletime}s for executor {label}; scaling in")
-
+                        ef.scale_in(active_blocks - min_blocks)

                     else:
                         logger.debug(
@@ -265,7 +278,7 @@
                     excess_blocks = math.ceil(float(excess_slots) / (tasks_per_node * nodes_per_block))
                     excess_blocks = min(excess_blocks, max_blocks - active_blocks)
                     logger.debug(f"Requesting {excess_blocks} more blocks")
-
+                    ef.scale_out(excess_blocks)

             elif active_slots == 0 and active_tasks > 0:
                 logger.debug("Strategy case 4a: No active slots but some active tasks - could scale out by a single block")
@@ -274,7 +287,7 @@
                 if active_blocks < max_blocks:
                     logger.debug("Requesting single block")

-
+                    ef.scale_out(1)
                 else:
                     logger.debug("Not requesting single block, because at maxblocks already")

@@ -290,7 +303,7 @@
                     excess_blocks = math.ceil(float(excess_slots) / (tasks_per_node * nodes_per_block))
                     excess_blocks = min(excess_blocks, active_blocks - min_blocks)
                     logger.debug(f"Requesting scaling in by {excess_blocks} blocks with idle time {self.max_idletime}s")
-
+                    ef.scale_in(excess_blocks, max_idletime=self.max_idletime)
                 else:
                     logger.error("This strategy does not support scaling in except for HighThroughputExecutor - taking no action")
             else:
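
The strategy names registered in self.strategies above correspond to the strategy option of a Parsl Config, so after this change the None/'none' strategies still provision init_blocks once and then hold steady instead of doing nothing at all. A minimal sketch of selecting that behavior (the executor and provider values here are illustrative, not taken from this diff):

from parsl.config import Config
from parsl.executors import HighThroughputExecutor
from parsl.providers import LocalProvider

config = Config(
    executors=[
        HighThroughputExecutor(
            label="htex_local",
            provider=LocalProvider(
                init_blocks=2,   # scaled out once by _strategy_init_only
                min_blocks=0,
                max_blocks=4,
            ),
        ),
    ],
    strategy="none",  # maps to _strategy_init_only after this change
)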
parsl/monitoring/monitoring.py
CHANGED
@@ -1,17 +1,13 @@
 from __future__ import annotations

 import os
-import socket
 import time
-import pickle
 import logging
 import typeguard
 import zmq

 import queue

-import parsl.monitoring.remote
-
 from parsl.multiprocessing import ForkProcess, SizedQueue
 from multiprocessing import Process
 from multiprocessing.queues import Queue
@@ -22,9 +18,10 @@ from parsl.utils import setproctitle

 from parsl.serialize import deserialize

+from parsl.monitoring.router import router_starter
 from parsl.monitoring.message_type import MessageType
-from parsl.monitoring.types import AddressedMonitoringMessage
-from typing import cast, Any,
+from parsl.monitoring.types import AddressedMonitoringMessage
+from typing import cast, Any, Optional, Tuple, Union, TYPE_CHECKING

 _db_manager_excepts: Optional[Exception]

@@ -93,8 +90,6 @@ class MonitoringHub(RepresentationMixin):
              Default: 30 seconds
         """

-        self.logger = logger
-
         # Any is used to disable typechecking on uses of _dfk_channel,
         # because it is used in the code as if it points to a channel, but
         # the static type is that it can also be None. The code relies on
@@ -120,6 +115,8 @@ class MonitoringHub(RepresentationMixin):

     def start(self, run_id: str, dfk_run_dir: str, config_run_dir: Union[str, os.PathLike]) -> int:

+        logger.debug("Starting MonitoringHub")
+
         if self.logdir is None:
             self.logdir = "."

@@ -128,9 +125,6 @@ class MonitoringHub(RepresentationMixin):

         os.makedirs(self.logdir, exist_ok=True)

-        # Initialize the ZMQ pipe to the Parsl Client
-
-        self.logger.debug("Initializing ZMQ Pipes to client")
         self.monitoring_hub_active = True

         # This annotation is incompatible with typeguard 4.x instrumentation
@@ -166,8 +160,8 @@ class MonitoringHub(RepresentationMixin):
         self.router_proc = ForkProcess(target=router_starter,
                                        args=(comm_q, self.exception_q, self.priority_msgs, self.node_msgs, self.block_msgs, self.resource_msgs),
                                        kwargs={"hub_address": self.hub_address,
-                                               "
-                                               "
+                                               "udp_port": self.hub_port,
+                                               "zmq_port_range": self.hub_port_range,
                                                "logdir": self.logdir,
                                                "logging_level": logging.DEBUG if self.monitoring_debug else logging.INFO,
                                                "run_id": run_id
@@ -187,7 +181,7 @@ class MonitoringHub(RepresentationMixin):
                                     daemon=True,
                                     )
         self.dbm_proc.start()
-
+        logger.info("Started the router process {} and DBM process {}".format(self.router_proc.pid, self.dbm_proc.pid))

         self.filesystem_proc = Process(target=filesystem_receiver,
                                        args=(self.logdir, self.resource_msgs, dfk_run_dir),
@@ -195,19 +189,19 @@ class MonitoringHub(RepresentationMixin):
                                        daemon=True
                                        )
         self.filesystem_proc.start()
-
+        logger.info(f"Started filesystem radio receiver process {self.filesystem_proc.pid}")

         try:
             comm_q_result = comm_q.get(block=True, timeout=120)
         except queue.Empty:
-
+            logger.error("Hub has not completed initialization in 120s. Aborting")
             raise Exception("Hub failed to start")

         if isinstance(comm_q_result, str):
-
+            logger.error(f"MonitoringRouter sent an error message: {comm_q_result}")
             raise RuntimeError(f"MonitoringRouter failed to start: {comm_q_result}")

-        udp_port,
+        udp_port, zmq_port = comm_q_result

         self.monitoring_hub_url = "udp://{}:{}".format(self.hub_address, udp_port)

@@ -217,28 +211,28 @@ class MonitoringHub(RepresentationMixin):
         self._dfk_channel.setsockopt(zmq.LINGER, 0)
         self._dfk_channel.set_hwm(0)
         self._dfk_channel.setsockopt(zmq.SNDTIMEO, self.dfk_channel_timeout)
-        self._dfk_channel.connect("tcp://{}:{}".format(self.hub_address,
+        self._dfk_channel.connect("tcp://{}:{}".format(self.hub_address, zmq_port))

-
+        logger.info("Monitoring Hub initialized")

-        return
+        return zmq_port

     # TODO: tighten the Any message format
     def send(self, mtype: MessageType, message: Any) -> None:
-
+        logger.debug("Sending message type {}".format(mtype))
         try:
             self._dfk_channel.send_pyobj((mtype, message))
         except zmq.Again:
-
+            logger.exception(
                 "The monitoring message sent from DFK to router timed-out after {}ms".format(self.dfk_channel_timeout))

     def close(self) -> None:
-
+        logger.info("Terminating Monitoring Hub")
         exception_msgs = []
         while True:
             try:
                 exception_msgs.append(self.exception_q.get(block=False))
-
+                logger.error("There was a queued exception (Either router or DBM process got exception much earlier?)")
             except queue.Empty:
                 break
         if self._dfk_channel and self.monitoring_hub_active:
@@ -246,7 +240,7 @@
             self._dfk_channel.close()
         if exception_msgs:
             for exception_msg in exception_msgs:
-
+                logger.error(
                     "{} process delivered an exception: {}. Terminating all monitoring processes immediately.".format(
                         exception_msg[0],
                         exception_msg[1]
@@ -255,41 +249,24 @@
             self.router_proc.terminate()
             self.dbm_proc.terminate()
             self.filesystem_proc.terminate()
-
+        logger.info("Waiting for router to terminate")
         self.router_proc.join()
-
+        logger.debug("Finished waiting for router termination")
         if len(exception_msgs) == 0:
-
+            logger.debug("Sending STOP to DBM")
             self.priority_msgs.put(("STOP", 0))
         else:
-
-
+            logger.debug("Not sending STOP to DBM, because there were DBM exceptions")
+        logger.debug("Waiting for DB termination")
         self.dbm_proc.join()
-
+        logger.debug("Finished waiting for DBM termination")

         # should this be message based? it probably doesn't need to be if
         # we believe we've received all messages
-
+        logger.info("Terminating filesystem radio receiver process")
         self.filesystem_proc.terminate()
         self.filesystem_proc.join()

-    @staticmethod
-    def monitor_wrapper(f: Any,
-                        args: Sequence,
-                        kwargs: Dict,
-                        try_id: int,
-                        task_id: int,
-                        monitoring_hub_url: str,
-                        run_id: str,
-                        logging_level: int,
-                        sleep_dur: float,
-                        radio_mode: str,
-                        monitor_resources: bool,
-                        run_dir: str) -> Tuple[Callable, Sequence, Dict]:
-        return parsl.monitoring.remote.monitor_wrapper(f, args, kwargs, try_id, task_id, monitoring_hub_url,
-                                                       run_id, logging_level, sleep_dur, radio_mode,
-                                                       monitor_resources, run_dir)
-

 @wrap_with_logs
 def filesystem_receiver(logdir: str, q: "queue.Queue[AddressedMonitoringMessage]", run_dir: str) -> None:
@@ -325,190 +302,3 @@ def filesystem_receiver(logdir: str, q: "queue.Queue[AddressedMonitoringMessage]
             logger.exception(f"Exception processing {filename} - probably will be retried next iteration")

         time.sleep(1)  # whats a good time for this poll?
-
-
-class MonitoringRouter:
-
-    def __init__(self,
-                 *,
-                 hub_address: str,
-                 hub_port: Optional[int] = None,
-                 hub_port_range: Tuple[int, int] = (55050, 56000),
-
-                 monitoring_hub_address: str = "127.0.0.1",
-                 logdir: str = ".",
-                 run_id: str,
-                 logging_level: int = logging.INFO,
-                 atexit_timeout: int = 3    # in seconds
-                 ):
-        """ Initializes a monitoring configuration class.
-
-        Parameters
-        ----------
-        hub_address : str
-             The ip address at which the workers will be able to reach the Hub.
-        hub_port : int
-             The specific port at which workers will be able to reach the Hub via UDP. Default: None
-        hub_port_range : tuple(int, int)
-             The MonitoringHub picks ports at random from the range which will be used by Hub.
-             This is overridden when the hub_port option is set. Default: (55050, 56000)
-        logdir : str
-             Parsl log directory paths. Logs and temp files go here. Default: '.'
-        logging_level : int
-             Logging level as defined in the logging module. Default: logging.INFO
-        atexit_timeout : float, optional
-            The amount of time in seconds to terminate the hub without receiving any messages, after the last dfk workflow message is received.

-        """
-        os.makedirs(logdir, exist_ok=True)
-        self.logger = set_file_logger("{}/monitoring_router.log".format(logdir),
-                                      name="monitoring_router",
-                                      level=logging_level)
-        self.logger.debug("Monitoring router starting")
-
-        self.hub_address = hub_address
-        self.atexit_timeout = atexit_timeout
-        self.run_id = run_id
-
-        self.loop_freq = 10.0  # milliseconds
-
-        # Initialize the UDP socket
-        self.sock = socket.socket(socket.AF_INET,
-                                  socket.SOCK_DGRAM,
-                                  socket.IPPROTO_UDP)
-
-        # We are trying to bind to all interfaces with 0.0.0.0
-        if not hub_port:
-            self.sock.bind(('0.0.0.0', 0))
-            self.hub_port = self.sock.getsockname()[1]
-        else:
-            self.hub_port = hub_port
-            try:
-                self.sock.bind(('0.0.0.0', self.hub_port))
-            except Exception as e:
-                raise RuntimeError(f"Could not bind to hub_port {hub_port} because: {e}")
-        self.sock.settimeout(self.loop_freq / 1000)
-        self.logger.info("Initialized the UDP socket on 0.0.0.0:{}".format(self.hub_port))
-
-        self._context = zmq.Context()
-        self.ic_channel = self._context.socket(zmq.DEALER)
-        self.ic_channel.setsockopt(zmq.LINGER, 0)
-        self.ic_channel.set_hwm(0)
-        self.ic_channel.RCVTIMEO = int(self.loop_freq)  # in milliseconds
-        self.logger.debug("hub_address: {}. hub_port_range {}".format(hub_address, hub_port_range))
-        self.ic_port = self.ic_channel.bind_to_random_port("tcp://*",
-                                                           min_port=hub_port_range[0],
-                                                           max_port=hub_port_range[1])
-
-    def start(self,
-              priority_msgs: "queue.Queue[AddressedMonitoringMessage]",
-              node_msgs: "queue.Queue[AddressedMonitoringMessage]",
-              block_msgs: "queue.Queue[AddressedMonitoringMessage]",
-              resource_msgs: "queue.Queue[AddressedMonitoringMessage]") -> None:
-        try:
-            router_keep_going = True
-            while router_keep_going:
-                try:
-                    data, addr = self.sock.recvfrom(2048)
-                    resource_msg = pickle.loads(data)
-                    self.logger.debug("Got UDP Message from {}: {}".format(addr, resource_msg))
-                    resource_msgs.put((resource_msg, addr))
-                except socket.timeout:
-                    pass
-
-                try:
-                    dfk_loop_start = time.time()
-                    while time.time() - dfk_loop_start < 1.0:  # TODO make configurable
-                        # note that nothing checks that msg really is of the annotated type
-                        msg: TaggedMonitoringMessage
-                        msg = self.ic_channel.recv_pyobj()
-
-                        assert isinstance(msg, tuple), "IC Channel expects only tuples, got {}".format(msg)
-                        assert len(msg) >= 1, "IC Channel expects tuples of length at least 1, got {}".format(msg)
-                        assert len(msg) == 2, "IC Channel expects message tuples of exactly length 2, got {}".format(msg)
-
-                        msg_0: AddressedMonitoringMessage
-                        msg_0 = (msg, 0)
-
-                        if msg[0] == MessageType.NODE_INFO:
-                            msg[1]['run_id'] = self.run_id
-                            node_msgs.put(msg_0)
-                        elif msg[0] == MessageType.RESOURCE_INFO:
-                            resource_msgs.put(msg_0)
-                        elif msg[0] == MessageType.BLOCK_INFO:
-                            block_msgs.put(msg_0)
-                        elif msg[0] == MessageType.TASK_INFO:
-                            priority_msgs.put(msg_0)
-                        elif msg[0] == MessageType.WORKFLOW_INFO:
-                            priority_msgs.put(msg_0)
-                            if 'exit_now' in msg[1] and msg[1]['exit_now']:
-                                router_keep_going = False
-                        else:
-                            # There is a type: ignore here because if msg[0]
-                            # is of the correct type, this code is unreachable,
-                            # but there is no verification that the message
-                            # received from ic_channel.recv_pyobj() is actually
-                            # of that type.
-                            self.logger.error("Discarding message "  # type: ignore[unreachable]
-                                              f"from interchange with unknown type {msg[0].value}")
-                except zmq.Again:
-                    pass
-                except Exception:
-                    # This will catch malformed messages. What happens if the
-                    # channel is broken in such a way that it always raises
-                    # an exception? Looping on this would maybe be the wrong
-                    # thing to do.
-                    self.logger.warning("Failure processing a ZMQ message", exc_info=True)
-
-            self.logger.info("Monitoring router draining")
-            last_msg_received_time = time.time()
-            while time.time() - last_msg_received_time < self.atexit_timeout:
-                try:
-                    data, addr = self.sock.recvfrom(2048)
-                    msg = pickle.loads(data)
-                    self.logger.debug("Got UDP Message from {}: {}".format(addr, msg))
-                    resource_msgs.put((msg, addr))
-                    last_msg_received_time = time.time()
-                except socket.timeout:
-                    pass
-
-            self.logger.info("Monitoring router finishing normally")
-        finally:
-            self.logger.info("Monitoring router finished")
-
-
-@wrap_with_logs
-def router_starter(comm_q: "queue.Queue[Union[Tuple[int, int], str]]",
-                   exception_q: "queue.Queue[Tuple[str, str]]",
-                   priority_msgs: "queue.Queue[AddressedMonitoringMessage]",
-                   node_msgs: "queue.Queue[AddressedMonitoringMessage]",
-                   block_msgs: "queue.Queue[AddressedMonitoringMessage]",
-                   resource_msgs: "queue.Queue[AddressedMonitoringMessage]",
-
-                   hub_address: str,
-                   hub_port: Optional[int],
-                   hub_port_range: Tuple[int, int],
-
-                   logdir: str,
-                   logging_level: int,
-                   run_id: str) -> None:
-    setproctitle("parsl: monitoring router")
-    try:
-        router = MonitoringRouter(hub_address=hub_address,
-                                  hub_port=hub_port,
-                                  hub_port_range=hub_port_range,
-                                  logdir=logdir,
-                                  logging_level=logging_level,
-                                  run_id=run_id)
-    except Exception as e:
-        logger.error("MonitoringRouter construction failed.", exc_info=True)
-        comm_q.put(f"Monitoring router construction failed: {e}")
-    else:
-        comm_q.put((router.hub_port, router.ic_port))
-
-        router.logger.info("Starting MonitoringRouter in router_starter")
-        try:
-            router.start(priority_msgs, node_msgs, block_msgs, resource_msgs)
-        except Exception as e:
-            router.logger.exception("router.start exception")
-            exception_q.put(('Hub', str(e)))