parsl 2024.3.11__py3-none-any.whl → 2024.3.25__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- parsl/addresses.py +3 -1
- parsl/config.py +4 -0
- parsl/dataflow/dflow.py +14 -5
- parsl/executors/base.py +10 -0
- parsl/executors/high_throughput/executor.py +12 -0
- parsl/executors/high_throughput/interchange.py +30 -8
- parsl/executors/high_throughput/manager_record.py +1 -0
- parsl/executors/high_throughput/process_worker_pool.py +41 -5
- parsl/executors/status_handling.py +2 -9
- parsl/executors/taskvine/executor.py +24 -3
- parsl/executors/taskvine/manager.py +1 -0
- parsl/executors/taskvine/manager_config.py +3 -4
- parsl/executors/workqueue/executor.py +19 -0
- parsl/jobs/error_handlers.py +1 -1
- parsl/jobs/job_status_poller.py +8 -7
- parsl/launchers/launchers.py +6 -6
- parsl/log_utils.py +8 -4
- parsl/monitoring/db_manager.py +4 -2
- parsl/monitoring/monitoring.py +30 -264
- parsl/monitoring/router.py +208 -0
- parsl/monitoring/visualization/plots/default/workflow_plots.py +3 -0
- parsl/monitoring/visualization/views.py +2 -1
- parsl/providers/cluster_provider.py +1 -3
- parsl/tests/configs/user_opts.py +2 -1
- parsl/tests/test_htex/test_drain.py +78 -0
- parsl/tests/test_monitoring/test_app_names.py +86 -0
- parsl/tests/test_monitoring/test_fuzz_zmq.py +2 -2
- parsl/tests/test_monitoring/test_htex_init_blocks_vs_monitoring.py +82 -0
- parsl/tests/test_python_apps/test_context_manager.py +40 -0
- parsl/tests/test_scaling/test_scale_down_htex_auto_scale.py +1 -10
- parsl/tests/test_shutdown/__init__.py +0 -0
- parsl/tests/test_shutdown/test_kill_monitoring.py +65 -0
- parsl/utils.py +2 -2
- parsl/version.py +1 -1
- {parsl-2024.3.11.data → parsl-2024.3.25.data}/scripts/process_worker_pool.py +41 -5
- {parsl-2024.3.11.dist-info → parsl-2024.3.25.dist-info}/METADATA +4 -4
- {parsl-2024.3.11.dist-info → parsl-2024.3.25.dist-info}/RECORD +43 -36
- {parsl-2024.3.11.data → parsl-2024.3.25.data}/scripts/exec_parsl_function.py +0 -0
- {parsl-2024.3.11.data → parsl-2024.3.25.data}/scripts/parsl_coprocess.py +0 -0
- {parsl-2024.3.11.dist-info → parsl-2024.3.25.dist-info}/LICENSE +0 -0
- {parsl-2024.3.11.dist-info → parsl-2024.3.25.dist-info}/WHEEL +0 -0
- {parsl-2024.3.11.dist-info → parsl-2024.3.25.dist-info}/entry_points.txt +0 -0
- {parsl-2024.3.11.dist-info → parsl-2024.3.25.dist-info}/top_level.txt +0 -0
parsl/addresses.py
CHANGED
@@ -81,7 +81,9 @@ def address_by_hostname() -> str:
 def address_by_interface(ifname: str) -> str:
     """Returns the IP address of the given interface name, e.g. 'eth0'
 
-    This is from a Stack Overflow answer: https://stackoverflow.com/questions/24196932/how-can-i-get-the-ip-address-of-eth0-in-python#24196955
+    This is taken from a Stack Overflow answer:
+    https://stackoverflow.com/questions/24196932/how-can-i-get-the-ip-address-of-eth0-in-python#24196955
+
 
     Parameters
     ----------
parsl/config.py
CHANGED
@@ -55,6 +55,8 @@ class Config(RepresentationMixin):
         or `None`.
         If 'none' or `None`, dynamic scaling will be disabled. Default is 'simple'. The literal value `None` is
         deprecated.
+    strategy_period : float or int, optional
+        How often the scaling strategy should be executed. Default is 5 seconds.
     max_idletime : float, optional
         The maximum idle time allowed for an executor before strategy could shut down unused blocks. Default is 120.0 seconds.
     usage_tracking : bool, optional
@@ -88,6 +90,7 @@ class Config(RepresentationMixin):
                  retry_handler: Optional[Callable[[Exception, TaskRecord], float]] = None,
                  run_dir: str = 'runinfo',
                  strategy: Optional[str] = 'simple',
+                 strategy_period: Union[float, int] = 5,
                  max_idletime: float = 120.0,
                  monitoring: Optional[MonitoringHub] = None,
                  usage_tracking: bool = False,
@@ -121,6 +124,7 @@ class Config(RepresentationMixin):
         self.retry_handler = retry_handler
         self.run_dir = run_dir
         self.strategy = strategy
+        self.strategy_period = strategy_period
         self.max_idletime = max_idletime
         self.usage_tracking = usage_tracking
         self.initialize_logging = initialize_logging
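The docstring above documents the new `strategy_period` option. A minimal usage sketch, assuming a local HighThroughputExecutor; the one-second period is illustrative only:

```python
import parsl
from parsl.config import Config
from parsl.executors import HighThroughputExecutor

# Run the 'simple' scaling strategy every second instead of the default 5 s.
config = Config(
    executors=[HighThroughputExecutor(label="htex_local")],
    strategy='simple',
    strategy_period=1,
)
parsl.load(config)
```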
parsl/dataflow/dflow.py
CHANGED
@@ -108,12 +108,12 @@ class DataFlowKernel:
 
         # hub address and port for interchange to connect
         self.hub_address = None  # type: Optional[str]
-        self.
+        self.hub_zmq_port = None  # type: Optional[int]
         if self.monitoring:
             if self.monitoring.logdir is None:
                 self.monitoring.logdir = self.run_dir
             self.hub_address = self.monitoring.hub_address
-            self.
+            self.hub_zmq_port = self.monitoring.start(self.run_id, self.run_dir, self.config.run_dir)
 
         self.time_began = datetime.datetime.now()
         self.time_completed: Optional[datetime.datetime] = None
@@ -178,6 +178,7 @@ class DataFlowKernel:
         # this must be set before executors are added since add_executors calls
         # job_status_poller.add_executors.
         self.job_status_poller = JobStatusPoller(strategy=self.config.strategy,
+                                                 strategy_period=self.config.strategy_period,
                                                  max_idletime=self.config.max_idletime,
                                                  dfk=self)
 
@@ -205,6 +206,13 @@ class DataFlowKernel:
 
         atexit.register(self.atexit_cleanup)
 
+    def __enter__(self):
+        pass
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        logger.debug("Exiting the context manager, calling cleanup for DFK")
+        self.cleanup()
+
     def _send_task_log_info(self, task_record: TaskRecord) -> None:
         if self.monitoring:
             task_log_info = self._create_task_log_info(task_record)
@@ -1114,12 +1122,12 @@ class DataFlowKernel:
 
             channel.makedirs(channel.script_dir, exist_ok=True)
 
-    def add_executors(self, executors):
+    def add_executors(self, executors: Sequence[ParslExecutor]) -> None:
         for executor in executors:
             executor.run_id = self.run_id
             executor.run_dir = self.run_dir
             executor.hub_address = self.hub_address
-            executor.hub_port = self.
+            executor.hub_port = self.hub_zmq_port
             if hasattr(executor, 'provider'):
                 if hasattr(executor.provider, 'script_dir'):
                     executor.provider.script_dir = os.path.join(self.run_dir, 'submit_scripts')
@@ -1170,7 +1178,8 @@ class DataFlowKernel:
             fut = task_record['app_fu']
             if not fut.done():
                 fut.exception()
-            # now app future is done, poll until DFK state is final: a
+            # now app future is done, poll until DFK state is final: a
+            # DFK state being final and the app future being done do not imply each other.
            while task_record['status'] not in FINAL_STATES:
                 time.sleep(0.1)
 
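The new `__enter__`/`__exit__` pair lets the DataFlowKernel be used as a context manager, with `cleanup()` called on exit (the new test_context_manager.py exercises this). A minimal sketch, assuming `parsl.load()` returns the DFK as usual; the app and executor choice are illustrative:

```python
import parsl
from parsl import python_app
from parsl.config import Config
from parsl.executors import ThreadPoolExecutor

@python_app
def double(x):
    return 2 * x

# Leaving the with-block triggers DataFlowKernel.__exit__, which calls cleanup().
with parsl.load(Config(executors=[ThreadPoolExecutor()])):
    assert double(21).result() == 42
```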
parsl/executors/base.py
CHANGED
@@ -106,6 +106,16 @@ class ParslExecutor(metaclass=ABCMeta):
     def run_dir(self, value: str) -> None:
         self._run_dir = value
 
+    @property
+    def run_id(self) -> Optional[str]:
+        """UUID for the enclosing DFK.
+        """
+        return self._run_id
+
+    @run_id.setter
+    def run_id(self, value: Optional[str]) -> None:
+        self._run_id = value
+
     @property
     def hub_address(self) -> Optional[str]:
         """Address to the Hub for monitoring.
parsl/executors/high_throughput/executor.py
CHANGED
@@ -55,6 +55,7 @@ DEFAULT_LAUNCH_CMD = ("process_worker_pool.py {debug} {max_workers_per_node} "
                       "--hb_period={heartbeat_period} "
                       "{address_probe_timeout_string} "
                       "--hb_threshold={heartbeat_threshold} "
+                      "--drain_period={drain_period} "
                       "--cpu-affinity {cpu_affinity} "
                       "{enable_mpi_mode} "
                       "--mpi-launcher={mpi_launcher} "
@@ -201,6 +202,14 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin):
         Timeout period to be used by the executor components in milliseconds. Increasing poll_periods
         trades performance for cpu efficiency. Default: 10ms
 
+    drain_period : int
+        The number of seconds after start when workers will begin to drain
+        and then exit. Set this to a time that is slightly less than the
+        maximum walltime of batch jobs to avoid killing tasks while they
+        execute. For example, you could set this to the walltime minus a grace
+        period for the batch job to start the workers, minus the expected
+        maximum length of an individual task.
+
     worker_logdir_root : string
         In case of a remote file system, specify the path to where logs will be kept.
 
@@ -240,6 +249,7 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin):
                  prefetch_capacity: int = 0,
                  heartbeat_threshold: int = 120,
                  heartbeat_period: int = 30,
+                 drain_period: Optional[int] = None,
                  poll_period: int = 10,
                  address_probe_timeout: Optional[int] = None,
                  worker_logdir_root: Optional[str] = None,
@@ -303,6 +313,7 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin):
         self.interchange_port_range = interchange_port_range
         self.heartbeat_threshold = heartbeat_threshold
         self.heartbeat_period = heartbeat_period
+        self.drain_period = drain_period
         self.poll_period = poll_period
         self.run_dir = '.'
         self.worker_logdir_root = worker_logdir_root
@@ -376,6 +387,7 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin):
                            nodes_per_block=self.provider.nodes_per_block,
                            heartbeat_period=self.heartbeat_period,
                            heartbeat_threshold=self.heartbeat_threshold,
+                           drain_period=self.drain_period,
                            poll_period=self.poll_period,
                            cert_dir=self.cert_dir,
                            logdir=self.worker_logdir,
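A hedged sketch of how the new `drain_period` knob might be combined with a walltime-limited provider; the SlurmProvider settings and timings below are illustrative assumptions, not part of the diff:

```python
from parsl.config import Config
from parsl.executors import HighThroughputExecutor
from parsl.providers import SlurmProvider

htex = HighThroughputExecutor(
    label="htex_slurm",
    provider=SlurmProvider(walltime="00:30:00"),
    # 30 min walltime, minus a few minutes for startup and for the longest
    # expected task, so workers stop accepting work before the job is killed.
    drain_period=25 * 60,
)
config = Config(executors=[htex])
```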
parsl/executors/high_throughput/interchange.py
CHANGED
@@ -28,6 +28,7 @@ from parsl.process_loggers import wrap_with_logs
 
 
 PKL_HEARTBEAT_CODE = pickle.dumps((2 ** 32) - 1)
+PKL_DRAINED_CODE = pickle.dumps((2 ** 32) - 2)
 
 LOGGER_NAME = "interchange"
 logger = logging.getLogger(LOGGER_NAME)
@@ -101,12 +102,12 @@ class Interchange:
         This is overridden when the worker_ports option is set. Default: (54000, 55000)
 
     hub_address : str
-        The
-
+        The IP address at which the interchange can send info about managers to when monitoring is enabled.
+        Default: None (meaning monitoring disabled)
 
     hub_port : str
         The port at which the interchange can send info about managers to when monitoring is enabled.
-
+        Default: None (meaning monitoring disabled)
 
     heartbeat_threshold : int
         Number of seconds since the last heartbeat after which worker is considered lost.
@@ -244,19 +245,19 @@ class Interchange:
 
     def _create_monitoring_channel(self) -> Optional[zmq.Socket]:
         if self.hub_address and self.hub_port:
-            logger.info("Connecting to
+            logger.info("Connecting to MonitoringHub")
             # This is a one-off because monitoring is unencrypted
             hub_channel = zmq.Context().socket(zmq.DEALER)
             hub_channel.set_hwm(0)
             hub_channel.connect("tcp://{}:{}".format(self.hub_address, self.hub_port))
-            logger.info("
+            logger.info("Connected to MonitoringHub")
             return hub_channel
         else:
             return None
 
     def _send_monitoring_info(self, hub_channel: Optional[zmq.Socket], manager: ManagerRecord) -> None:
         if hub_channel:
-            logger.info("Sending message {} to
+            logger.info("Sending message {} to MonitoringHub".format(manager))
 
             d: Dict = cast(Dict, manager.copy())
             d['timestamp'] = datetime.datetime.now()
@@ -308,7 +309,8 @@ class Interchange:
                             'worker_count': m['worker_count'],
                             'tasks': len(m['tasks']),
                             'idle_duration': idle_duration,
-                            'active': m['active']
+                            'active': m['active'],
+                            'draining': m['draining']}
                     reply.append(resp)
 
             elif command_req.startswith("HOLD_WORKER"):
@@ -385,6 +387,7 @@ class Interchange:
             self.process_task_outgoing_incoming(interesting_managers, hub_channel, kill_event)
             self.process_results_incoming(interesting_managers, hub_channel)
             self.expire_bad_managers(interesting_managers, hub_channel)
+            self.expire_drained_managers(interesting_managers, hub_channel)
             self.process_tasks_to_send(interesting_managers)
 
         self.zmq_context.destroy()
@@ -431,6 +434,7 @@ class Interchange:
                     'max_capacity': 0,
                     'worker_count': 0,
                     'active': True,
+                    'draining': False,
                     'tasks': []}
                 self.connected_block_history.append(msg['block_id'])
 
@@ -469,10 +473,28 @@ class Interchange:
                 self._ready_managers[manager_id]['last_heartbeat'] = time.time()
                 logger.debug("Manager {!r} sent heartbeat via tasks connection".format(manager_id))
                 self.task_outgoing.send_multipart([manager_id, b'', PKL_HEARTBEAT_CODE])
+            elif msg['type'] == 'drain':
+                self._ready_managers[manager_id]['draining'] = True
+                logger.debug(f"Manager {manager_id!r} requested drain")
             else:
                 logger.error(f"Unexpected message type received from manager: {msg['type']}")
         logger.debug("leaving task_outgoing section")
 
+    def expire_drained_managers(self, interesting_managers: Set[bytes], hub_channel: Optional[zmq.Socket]) -> None:
+
+        for manager_id in list(interesting_managers):
+            # is it always true that a draining manager will be in interesting managers?
+            # i think so because it will have outstanding capacity?
+            m = self._ready_managers[manager_id]
+            if m['draining'] and len(m['tasks']) == 0:
+                logger.info(f"Manager {manager_id!r} is drained - sending drained message to manager")
+                self.task_outgoing.send_multipart([manager_id, b'', PKL_DRAINED_CODE])
+                interesting_managers.remove(manager_id)
+                self._ready_managers.pop(manager_id)
+
+                m['active'] = False
+                self._send_monitoring_info(hub_channel, m)
+
     def process_tasks_to_send(self, interesting_managers: Set[bytes]) -> None:
         # Check if there are tasks that could be sent to managers
 
@@ -490,7 +512,7 @@ class Interchange:
                 tasks_inflight = len(m['tasks'])
                 real_capacity = m['max_capacity'] - tasks_inflight
 
-                if (real_capacity and m['active']):
+                if (real_capacity and m['active'] and not m['draining']):
                     tasks = self.get_tasks(real_capacity)
                     if tasks:
                         self.task_outgoing.send_multipart([manager_id, b'', pickle.dumps(tasks)])
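The `draining` field added to the MANAGERS reply can be observed from the executor side. A hedged sketch, assuming `htex` is a started HighThroughputExecutor with `drain_period` set and that `connected_managers()` (which issues the MANAGERS command) is available:

```python
# Each entry mirrors the reply dict built by the interchange above.
for m in htex.connected_managers():
    print(m['worker_count'], m['active'], m['draining'])
```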
parsl/executors/high_throughput/process_worker_pool.py
CHANGED
@@ -36,6 +36,7 @@ from parsl.executors.high_throughput.mpi_resource_management import (
 from parsl.executors.high_throughput.mpi_prefix_composer import compose_all, VALID_LAUNCHERS
 
 HEARTBEAT_CODE = (2 ** 32) - 1
+DRAINED_CODE = (2 ** 32) - 2
 
 
 class Manager:
@@ -73,7 +74,8 @@ class Manager:
                  enable_mpi_mode: bool = False,
                  mpi_launcher: str = "mpiexec",
                  available_accelerators: Sequence[str],
-                 cert_dir: Optional[str]
+                 cert_dir: Optional[str],
+                 drain_period: Optional[int]):
         """
         Parameters
         ----------
@@ -138,6 +140,9 @@ class Manager:
 
         cert_dir : str | None
             Path to the certificate directory.
+
+        drain_period: int | None
+            Number of seconds to drain after TODO: could be a nicer timespec involving m,s,h qualifiers for user friendliness?
         """
 
         logger.info("Manager initializing")
@@ -227,6 +232,14 @@ class Manager:
         self.heartbeat_period = heartbeat_period
         self.heartbeat_threshold = heartbeat_threshold
         self.poll_period = poll_period
+
+        self.drain_time: float
+        if drain_period:
+            self.drain_time = self._start_time + drain_period
+            logger.info(f"Will request drain at {self.drain_time}")
+        else:
+            self.drain_time = float('inf')
+
         self.cpu_affinity = cpu_affinity
 
         # Define accelerator available, adjust worker count accordingly
@@ -262,10 +275,19 @@ class Manager:
         """ Send heartbeat to the incoming task queue
         """
         msg = {'type': 'heartbeat'}
+        # don't need to dumps and encode this every time - could do as a global on import?
         b_msg = json.dumps(msg).encode('utf-8')
         self.task_incoming.send(b_msg)
         logger.debug("Sent heartbeat")
 
+    def drain_to_incoming(self):
+        """ Send heartbeat to the incoming task queue
+        """
+        msg = {'type': 'drain'}
+        b_msg = json.dumps(msg).encode('utf-8')
+        self.task_incoming.send(b_msg)
+        logger.debug("Sent drain")
+
     @wrap_with_logs
     def pull_tasks(self, kill_event):
         """ Pull tasks from the incoming tasks zmq pipe onto the internal
@@ -298,6 +320,7 @@ class Manager:
             # time here are correctly copy-pasted from the relevant if
             # statements.
             next_interesting_event_time = min(last_beat + self.heartbeat_period,
+                                              self.drain_time,
                                               last_interchange_contact + self.heartbeat_threshold)
             try:
                 pending_task_count = self.pending_task_queue.qsize()
@@ -312,6 +335,14 @@ class Manager:
                 self.heartbeat_to_incoming()
                 last_beat = time.time()
 
+            if self.drain_time and time.time() > self.drain_time:
+                logger.info("Requesting drain")
+                self.drain_to_incoming()
+                self.drain_time = None
+                # This will start the pool draining...
+                # Drained exit behaviour does not happen here. It will be
+                # driven by the interchange sending a DRAINED_CODE message.
+
             poll_duration_s = max(0, next_interesting_event_time - time.time())
             socks = dict(poller.poll(timeout=poll_duration_s * 1000))
 
@@ -322,7 +353,9 @@ class Manager:
 
                 if tasks == HEARTBEAT_CODE:
                     logger.debug("Got heartbeat from interchange")
-
+                elif tasks == DRAINED_CODE:
+                    logger.info("Got fulled drained message from interchange - setting kill flag")
+                    kill_event.set()
                 else:
                     task_recv_counter += len(tasks)
                     logger.debug("Got executor tasks: {}, cumulative count of tasks: {}".format([t['task_id'] for t in tasks], task_recv_counter))
@@ -490,9 +523,8 @@ class Manager:
         self._worker_watchdog_thread.start()
         self._monitoring_handler_thread.start()
 
-        logger.info("
+        logger.info("Manager threads started")
 
-        # TODO : Add mechanism in this loop to stop the worker pool
         # This might need a multiprocessing event to signal back.
         self._kill_event.wait()
         logger.critical("Received kill event, terminating worker processes")
@@ -804,6 +836,8 @@ if __name__ == "__main__":
                         help="Heartbeat period in seconds. Uses manager default unless set")
     parser.add_argument("--hb_threshold", default=120,
                         help="Heartbeat threshold in seconds. Uses manager default unless set")
+    parser.add_argument("--drain_period", default=None,
+                        help="Drain this pool after specified number of seconds. By default, does not drain.")
     parser.add_argument("--address_probe_timeout", default=30,
                         help="Timeout to probe for viable address to interchange. Default: 30s")
     parser.add_argument("--poll", default=10,
@@ -824,7 +858,7 @@ if __name__ == "__main__":
                         required=True,
                         help="Whether/how workers should control CPU affinity.")
     parser.add_argument("--available-accelerators", type=str, nargs="*",
-                        help="Names of available accelerators")
+                        help="Names of available accelerators, if not given assumed to be zero accelerators available", default=[])
     parser.add_argument("--enable_mpi_mode", action='store_true',
                         help="Enable MPI mode")
     parser.add_argument("--mpi-launcher", type=str, choices=VALID_LAUNCHERS,
@@ -856,6 +890,7 @@ if __name__ == "__main__":
     logger.info("Prefetch capacity: {}".format(args.prefetch_capacity))
     logger.info("Heartbeat threshold: {}".format(args.hb_threshold))
     logger.info("Heartbeat period: {}".format(args.hb_period))
+    logger.info("Drain period: {}".format(args.drain_period))
     logger.info("CPU affinity: {}".format(args.cpu_affinity))
     logger.info("Accelerators: {}".format(" ".join(args.available_accelerators)))
     logger.info("enable_mpi_mode: {}".format(args.enable_mpi_mode))
@@ -876,6 +911,7 @@ if __name__ == "__main__":
                       prefetch_capacity=int(args.prefetch_capacity),
                       heartbeat_threshold=int(args.hb_threshold),
                       heartbeat_period=int(args.hb_period),
+                      drain_period=None if args.drain_period == "None" else int(args.drain_period),
                       poll_period=int(args.poll),
                       cpu_affinity=args.cpu_affinity,
                       enable_mpi_mode=args.enable_mpi_mode,
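The drain logic above folds the drain deadline into the pool's existing poll timeout. A standalone sketch of that computation (variable names mirror the diff; this is an illustration, not the worker pool itself):

```python
import time

def next_wakeup(last_beat, heartbeat_period, drain_time,
                last_interchange_contact, heartbeat_threshold):
    # Earliest of: next heartbeat, the drain deadline, and the point at
    # which the interchange would be considered lost.
    return min(last_beat + heartbeat_period,
               drain_time,
               last_interchange_contact + heartbeat_threshold)

now = time.time()
print(next_wakeup(now, 30, float('inf'), now, 120) - now)  # ~30: drain disabled
print(next_wakeup(now, 30, now + 10, now, 120) - now)      # ~10: drain due first
```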
parsl/executors/status_handling.py
CHANGED
@@ -61,7 +61,7 @@ class BlockProviderExecutor(ParslExecutor):
         # errors can happen during the submit call to the provider; this is used
         # to keep track of such errors so that they can be handled in one place
         # together with errors reported by status()
-        self._simulated_status: Dict[
+        self._simulated_status: Dict[str, JobStatus] = {}
         self._executor_bad_state = threading.Event()
         self._executor_exception: Optional[Exception] = None
 
@@ -102,13 +102,10 @@ class BlockProviderExecutor(ParslExecutor):
         else:
             return self._provider.status_polling_interval
 
-    def _fail_job_async(self, block_id:
+    def _fail_job_async(self, block_id: str, message: str):
         """Marks a job that has failed to start but would not otherwise be included in status()
         as failed and report it in status()
         """
-        if block_id is None:
-            block_id = str(self._block_id_counter.get_id())
-            logger.info(f"Allocated block ID {block_id} for simulated failure")
         self._simulated_status[block_id] = JobStatus(JobState.FAILED, message)
 
     @abstractproperty
@@ -211,10 +208,6 @@ class BlockProviderExecutor(ParslExecutor):
 
         Cause the executor to reduce the number of blocks by count.
 
-        We should have the scale in method simply take resource object
-        which will have the scaling methods, scale_in itself should be a coroutine, since
-        scaling tasks can be slow.
-
         :return: A list of block ids corresponding to the blocks that were removed.
         """
         pass
parsl/executors/taskvine/executor.py
CHANGED
@@ -4,6 +4,7 @@ high-throughput system for delegating Parsl tasks to thousands of remote machines
 """
 
 # Import Python built-in libraries
+import atexit
 import threading
 import multiprocessing
 import logging
@@ -171,7 +172,7 @@ class TaskVineExecutor(BlockProviderExecutor, putils.RepresentationMixin):
         # Path to directory that holds all tasks' data and results.
         self._function_data_dir = ""
 
-        #
+        # Helper scripts to prepare package tarballs for Parsl apps
         self._package_analyze_script = shutil.which("poncho_package_analyze")
         self._package_create_script = shutil.which("poncho_package_create")
         if self._package_analyze_script is None or self._package_create_script is None:
@@ -179,6 +180,18 @@ class TaskVineExecutor(BlockProviderExecutor, putils.RepresentationMixin):
         else:
             self._poncho_available = True
 
+        # Register atexit handler to cleanup when Python shuts down
+        atexit.register(self.atexit_cleanup)
+
+        # Attribute indicating whether this executor was started to shut it down properly.
+        # This safeguards cases where an object of this executor is created but
+        # the executor never starts, so it shouldn't be shutdowned.
+        self._started = False
+
+    def atexit_cleanup(self):
+        # Calls this executor's shutdown method upon Python exiting the process.
+        self.shutdown()
+
     def _get_launch_command(self, block_id):
         # Implements BlockProviderExecutor's abstract method.
         # This executor uses different terminology for worker/launch
@@ -196,8 +209,9 @@ class TaskVineExecutor(BlockProviderExecutor, putils.RepresentationMixin):
         if self.manager_config.port == 0 and self.manager_config.project_name is None:
             self.manager_config.project_name = "parsl-vine-" + str(uuid.uuid4())
 
-        # guess the host name if the project name is not given
-
+        # guess the host name if the project name is not given and none has been supplied
+        # explicitly in the manager config.
+        if not self.manager_config.project_name and self.manager_config.address is None:
             self.manager_config.address = get_any_address()
 
         # Factory communication settings are overridden by manager communication settings.
@@ -237,6 +251,9 @@ class TaskVineExecutor(BlockProviderExecutor, putils.RepresentationMixin):
         retrieve Parsl tasks within the TaskVine system.
         """
 
+        # Mark this executor object as started
+        self._started = True
+
         # Synchronize connection and communication settings between the manager and factory
         self.__synchronize_manager_factory_comm_settings()
 
@@ -597,6 +614,10 @@ class TaskVineExecutor(BlockProviderExecutor, putils.RepresentationMixin):
         """Shutdown the executor. Sets flag to cancel the submit process and
         collector thread, which shuts down the TaskVine system submission.
         """
+        if not self._started:
+            # Don't shutdown if the executor never starts.
+            return
+
         logger.debug("TaskVine shutdown started")
         self._should_stop.set()
 
parsl/executors/taskvine/manager.py
CHANGED
@@ -376,6 +376,7 @@ def _taskvine_submit_wait(ready_task_queue=None,
                     task_out_file = parsl_file_name_to_vine_file[spec.parsl_name]
                 else:
                     task_out_file = m.declare_file(spec.parsl_name, cache=spec.cache, peer_transfer=True)
+                    parsl_file_name_to_vine_file[spec.parsl_name] = task_out_file
                 t.add_output(task_out_file, spec.parsl_name)
 
             # Submit the task to the TaskVine object
parsl/executors/taskvine/manager_config.py
CHANGED
@@ -1,4 +1,3 @@
-import socket
 from dataclasses import dataclass
 from typing import Optional
 
@@ -23,9 +22,9 @@ class TaskVineManagerConfig:
         A value of 0 means TaskVine chooses any available port.
         Default is VINE_DEFAULT_PORT.
 
-    address: str
+    address: Optional[str]
         Address of the local machine.
-
+        If None, socket.gethostname() will be used to determine the address.
 
     project_name: Optional[str]
         If given, TaskVine will periodically report its status and performance
@@ -161,7 +160,7 @@ class TaskVineManagerConfig:
 
     # Connection and communication settings
     port: int = VINE_DEFAULT_PORT
-    address: str =
+    address: Optional[str] = None
    project_name: Optional[str] = None
    project_password_file: Optional[str] = None
 
parsl/executors/workqueue/executor.py
CHANGED
@@ -3,6 +3,7 @@ Cooperative Computing Lab (CCL) at Notre Dame to provide a fault-tolerant,
 high-throughput system for delegating Parsl tasks to thousands of remote machines
 """
 
+import atexit
 import threading
 import multiprocessing
 import logging
@@ -298,6 +299,18 @@ class WorkQueueExecutor(BlockProviderExecutor, putils.RepresentationMixin):
         if self.init_command != "":
             self.launch_cmd = self.init_command + "; " + self.launch_cmd
 
+        # register atexit handler to cleanup when Python shuts down
+        atexit.register(self.atexit_cleanup)
+
+        # Attribute indicating whether this executor was started to shut it down properly.
+        # This safeguards cases where an object of this executor is created but
+        # the executor never starts, so it shouldn't be shutdowned.
+        self.started = False
+
+    def atexit_cleanup(self):
+        # Calls this executor's shutdown method upon Python exiting the process.
+        self.shutdown()
+
     def _get_launch_command(self, block_id):
         # this executor uses different terminology for worker/launch
         # commands than in htex
@@ -307,6 +320,8 @@ class WorkQueueExecutor(BlockProviderExecutor, putils.RepresentationMixin):
         """Create submit process and collector thread to create, send, and
         retrieve Parsl tasks within the Work Queue system.
         """
+        # Mark this executor object as started
+        self.started = True
         self.tasks_lock = threading.Lock()
 
         # Create directories for data and results
@@ -695,6 +710,10 @@ class WorkQueueExecutor(BlockProviderExecutor, putils.RepresentationMixin):
         """Shutdown the executor. Sets flag to cancel the submit process and
         collector thread, which shuts down the Work Queue system submission.
         """
+        if not self.started:
+            # Don't shutdown if the executor never starts.
+            return
+
         logger.debug("Work Queue shutdown started")
         self.should_stop.value = True
 
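Both the TaskVine and Work Queue changes follow the same pattern: register an atexit hook at construction, but make shutdown a no-op until start() has run. A generic sketch of that pattern (the class and method bodies here are illustrative, not the executors' actual code):

```python
import atexit

class GuardedExecutor:
    def __init__(self):
        self.started = False
        # Ask Python to attempt cleanup when the interpreter exits.
        atexit.register(self.atexit_cleanup)

    def start(self):
        self.started = True
        # ... start submit process and collector thread ...

    def shutdown(self):
        if not self.started:
            # Nothing was started, so there is nothing to tear down.
            return
        # ... signal threads/processes to stop and join them ...

    def atexit_cleanup(self):
        self.shutdown()
```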
parsl/jobs/error_handlers.py
CHANGED
@@ -20,7 +20,7 @@ def simple_error_handler(executor: status_handling.BlockProviderExecutor, status
     executor.set_bad_state_and_fail_all(_get_error(status))
 
 
-def windowed_error_handler(executor: status_handling.BlockProviderExecutor, status: Dict[str, JobStatus], threshold: int = 3):
+def windowed_error_handler(executor: status_handling.BlockProviderExecutor, status: Dict[str, JobStatus], threshold: int = 3) -> None:
     sorted_status = [(key, status[key]) for key in sorted(status, key=lambda x: int(x))]
     current_window = dict(sorted_status[-threshold:])
     total, failed = _count_jobs(current_window)