parsl 2025.3.10__py3-none-any.whl → 2025.3.24__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35)
  1. parsl/dataflow/dflow.py +1 -3
  2. parsl/executors/base.py +13 -37
  3. parsl/executors/flux/executor.py +1 -0
  4. parsl/executors/globus_compute.py +1 -1
  5. parsl/executors/high_throughput/executor.py +18 -0
  6. parsl/executors/high_throughput/mpi_resource_management.py +2 -0
  7. parsl/executors/high_throughput/process_worker_pool.py +89 -82
  8. parsl/executors/radical/executor.py +1 -0
  9. parsl/executors/status_handling.py +8 -0
  10. parsl/executors/taskvine/executor.py +1 -0
  11. parsl/executors/workqueue/executor.py +1 -0
  12. parsl/monitoring/db_manager.py +16 -10
  13. parsl/monitoring/errors.py +5 -0
  14. parsl/monitoring/monitoring.py +61 -117
  15. parsl/monitoring/radios/filesystem_router.py +4 -2
  16. parsl/monitoring/radios/udp_router.py +1 -3
  17. parsl/monitoring/radios/zmq_router.py +80 -25
  18. parsl/multiprocessing.py +42 -2
  19. parsl/tests/test_monitoring/test_exit_helper.py +54 -0
  20. parsl/tests/test_monitoring/test_fuzz_zmq.py +1 -1
  21. parsl/tests/test_monitoring/test_radio_zmq.py +27 -0
  22. parsl/tests/test_monitoring/test_stdouterr.py +3 -0
  23. parsl/tests/test_shutdown/test_kill_monitoring.py +1 -1
  24. parsl/usage_tracking/usage.py +2 -2
  25. parsl/version.py +1 -1
  26. {parsl-2025.3.10.data → parsl-2025.3.24.data}/scripts/process_worker_pool.py +89 -82
  27. {parsl-2025.3.10.dist-info → parsl-2025.3.24.dist-info}/METADATA +4 -4
  28. {parsl-2025.3.10.dist-info → parsl-2025.3.24.dist-info}/RECORD +35 -33
  29. {parsl-2025.3.10.data → parsl-2025.3.24.data}/scripts/exec_parsl_function.py +0 -0
  30. {parsl-2025.3.10.data → parsl-2025.3.24.data}/scripts/interchange.py +0 -0
  31. {parsl-2025.3.10.data → parsl-2025.3.24.data}/scripts/parsl_coprocess.py +0 -0
  32. {parsl-2025.3.10.dist-info → parsl-2025.3.24.dist-info}/LICENSE +0 -0
  33. {parsl-2025.3.10.dist-info → parsl-2025.3.24.dist-info}/WHEEL +0 -0
  34. {parsl-2025.3.10.dist-info → parsl-2025.3.24.dist-info}/entry_points.txt +0 -0
  35. {parsl-2025.3.10.dist-info → parsl-2025.3.24.dist-info}/top_level.txt +0 -0
parsl/dataflow/dflow.py CHANGED
@@ -1128,9 +1128,7 @@ class DataFlowKernel:
             executor.run_id = self.run_id
             executor.run_dir = self.run_dir
             if self.monitoring:
-                executor.hub_address = self.monitoring.hub_address
-                executor.hub_zmq_port = self.monitoring.hub_zmq_port
-                executor.submit_monitoring_radio = self.monitoring.radio
+                executor.monitoring_messages = self.monitoring.resource_msgs
             if hasattr(executor, 'provider'):
                 if hasattr(executor.provider, 'script_dir'):
                     executor.provider.script_dir = os.path.join(self.run_dir, 'submit_scripts')
parsl/executors/base.py CHANGED
@@ -1,11 +1,14 @@
+from __future__ import annotations
+
 import os
 from abc import ABCMeta, abstractmethod
 from concurrent.futures import Future
+from multiprocessing.queues import Queue
 from typing import Any, Callable, Dict, Optional
 
 from typing_extensions import Literal, Self
 
-from parsl.monitoring.radios.base import MonitoringRadioSender
+from parsl.monitoring.types import TaggedMonitoringMessage
 
 
 class ParslExecutor(metaclass=ABCMeta):
@@ -42,6 +45,13 @@ class ParslExecutor(metaclass=ABCMeta):
     invariant, not co-variant, and it looks like @typeguard cannot be
     persuaded otherwise. So if you're implementing an executor and want to
     @typeguard the constructor, you'll have to use List[Any] here.
+
+    The DataFlowKernel will set this attribute before calling .start(),
+    if monitoring is enabled:
+
+    monitoring_messages: Optional[Queue[TaggedMonitoringMessage]] - an executor
+        can send messages to the monitoring hub by putting them into
+        this queue.
     """
 
     label: str = "undefined"
@@ -50,15 +60,11 @@ class ParslExecutor(metaclass=ABCMeta):
     def __init__(
         self,
         *,
-        hub_address: Optional[str] = None,
-        hub_zmq_port: Optional[int] = None,
-        submit_monitoring_radio: Optional[MonitoringRadioSender] = None,
+        monitoring_messages: Optional[Queue[TaggedMonitoringMessage]] = None,
         run_dir: str = ".",
         run_id: Optional[str] = None,
     ):
-        self.hub_address = hub_address
-        self.hub_zmq_port = hub_zmq_port
-        self.submit_monitoring_radio = submit_monitoring_radio
+        self.monitoring_messages = monitoring_messages
        self.run_dir = os.path.abspath(run_dir)
        self.run_id = run_id
 
@@ -125,33 +131,3 @@ class ParslExecutor(metaclass=ABCMeta):
     @run_id.setter
     def run_id(self, value: Optional[str]) -> None:
         self._run_id = value
-
-    @property
-    def hub_address(self) -> Optional[str]:
-        """Address to the Hub for monitoring.
-        """
-        return self._hub_address
-
-    @hub_address.setter
-    def hub_address(self, value: Optional[str]) -> None:
-        self._hub_address = value
-
-    @property
-    def hub_zmq_port(self) -> Optional[int]:
-        """Port to the Hub for monitoring.
-        """
-        return self._hub_zmq_port
-
-    @hub_zmq_port.setter
-    def hub_zmq_port(self, value: Optional[int]) -> None:
-        self._hub_zmq_port = value
-
-    @property
-    def submit_monitoring_radio(self) -> Optional[MonitoringRadioSender]:
-        """Local radio for sending monitoring messages
-        """
-        return self._submit_monitoring_radio
-
-    @submit_monitoring_radio.setter
-    def submit_monitoring_radio(self, value: Optional[MonitoringRadioSender]) -> None:
-        self._submit_monitoring_radio = value
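
Taken together, the dflow.py and base.py changes replace the per-executor hub address, port, and radio attributes with a single monitoring_messages queue that the DataFlowKernel assigns before calling start(). A minimal, hypothetical sketch of that handshake (the ToyExecutor class, tag, and payload are illustrative and not taken from this diff):

    import multiprocessing

    class ToyExecutor:
        """Stand-in for a ParslExecutor subclass; not part of parsl."""

        def __init__(self):
            self.monitoring_messages = None  # the DFK assigns this before start()

        def start(self):
            if self.monitoring_messages is not None:
                # Tagged-message shape is (tag, payload-dict); the tag and payload
                # here are made up for illustration.
                self.monitoring_messages.put(("TOY_EVENT", {"executor": "toy", "state": "started"}))

    if __name__ == "__main__":
        q = multiprocessing.get_context("spawn").Queue()
        ex = ToyExecutor()
        ex.monitoring_messages = q   # what dflow.py now does via self.monitoring.resource_msgs
        ex.start()
        print(q.get(timeout=5))      # ('TOY_EVENT', {'executor': 'toy', 'state': 'started'})
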
parsl/executors/flux/executor.py CHANGED
@@ -231,6 +231,7 @@ class FluxExecutor(ParslExecutor, RepresentationMixin):
 
     def start(self):
         """Called when DFK starts the executor when the config is loaded."""
+        super().start()
         os.makedirs(self.working_dir, exist_ok=True)
         self._submission_thread.start()
 
parsl/executors/globus_compute.py CHANGED
@@ -67,7 +67,7 @@ class GlobusComputeExecutor(ParslExecutor, RepresentationMixin):
 
     def start(self) -> None:
         """ Start the Globus Compute Executor """
-        pass
+        super().start()
 
     def submit(self, func: Callable, resource_specification: Dict[str, Any], *args: Any, **kwargs: Any) -> Future:
         """ Submit func to globus-compute
parsl/executors/high_throughput/executor.py CHANGED
@@ -29,6 +29,7 @@ from parsl.executors.high_throughput.manager_selector import (
 )
 from parsl.executors.status_handling import BlockProviderExecutor
 from parsl.jobs.states import TERMINAL_STATES, JobState, JobStatus
+from parsl.monitoring.radios.zmq_router import ZMQRadioReceiver, start_zmq_receiver
 from parsl.process_loggers import wrap_with_logs
 from parsl.providers import LocalProvider
 from parsl.providers.base import ExecutionProvider
@@ -334,6 +335,10 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin, UsageIn
         self._result_queue_thread_exit = threading.Event()
         self._result_queue_thread: Optional[threading.Thread] = None
 
+        self.zmq_monitoring: Optional[ZMQRadioReceiver]
+        self.zmq_monitoring = None
+        self.hub_zmq_port = None
+
     radio_mode = "htex"
     enable_mpi_mode: bool = False
     mpi_launcher: str = "mpiexec"
@@ -407,6 +412,7 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin, UsageIn
     def start(self):
         """Create the Interchange process and connect to it.
         """
+        super().start()
         if self.encrypted and self.cert_dir is None:
             logger.debug("Creating CurveZMQ certificates")
             self.cert_dir = curvezmq.create_certificates(self.logdir)
@@ -427,6 +433,15 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin, UsageIn
             self.loopback_address, self.interchange_port_range, self.cert_dir
         )
 
+        if self.monitoring_messages is not None:
+            self.zmq_monitoring = start_zmq_receiver(monitoring_messages=self.monitoring_messages,
+                                                     loopback_address=self.loopback_address,
+                                                     port_range=self.interchange_port_range,
+                                                     logdir=self.logdir,
+                                                     worker_debug=self.worker_debug,
+                                                     )
+            self.hub_zmq_port = self.zmq_monitoring.port
+
         self._result_queue_thread = None
         self._start_result_queue_thread()
         self._start_local_interchange_process()
@@ -861,6 +876,9 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin, UsageIn
         if self._result_queue_thread:
             self._result_queue_thread.join()
 
+        if self.zmq_monitoring:
+            self.zmq_monitoring.close()
+
         logger.info("Finished HighThroughputExecutor shutdown attempt")
 
     def get_usage_information(self):
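
With this change the HighThroughputExecutor starts its own ZMQ receiver when monitoring is enabled and republishes the bound port as hub_zmq_port for the interchange. The sketch below only illustrates the general pattern of binding a socket to an ephemeral port within a range and reporting it; it is not parsl's start_zmq_receiver or ZMQRadioReceiver, whose socket types and wiring may differ:

    import zmq

    def toy_zmq_receiver(loopback_address: str, port_range: tuple) -> tuple:
        # Bind a PULL socket somewhere in the allowed range and report which
        # port was chosen; this mirrors the role of hub_zmq_port without
        # reproducing parsl's actual receiver classes.
        context = zmq.Context()
        sock = context.socket(zmq.PULL)
        port = sock.bind_to_random_port(f"tcp://{loopback_address}",
                                        min_port=port_range[0],
                                        max_port=port_range[1])
        return sock, port

    if __name__ == "__main__":
        sock, port = toy_zmq_receiver("127.0.0.1", (55000, 56000))
        print(f"toy monitoring receiver listening on port {port}")
        sock.close()
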
parsl/executors/high_throughput/mpi_resource_management.py CHANGED
@@ -203,6 +203,8 @@ class MPITaskScheduler(TaskScheduler):
     def get_result(self, block: bool = True, timeout: Optional[float] = None):
         """Return result and relinquish provisioned nodes"""
         result_pkl = self.pending_result_q.get(block, timeout)
+        if result_pkl is None:
+            return None
         result_dict = pickle.loads(result_pkl)
         # TODO (wardlt): If the task did not request nodes, it won't be in `self._map_tasks_to_nodes`.
         # Causes Parsl to hang. See Issue #3427
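
The manager (see the process_worker_pool.py changes below) now pushes a None sentinel through the pending result queue at shutdown, and pickle.loads would fail on None, so the scheduler passes the sentinel straight through. A small self-contained sketch of the same guard (the ToyScheduler class is made up for illustration):

    import pickle
    import queue
    from typing import Optional

    class ToyScheduler:
        def __init__(self) -> None:
            self.pending_result_q: "queue.Queue" = queue.Queue()

        def get_result(self, block: bool = True, timeout: Optional[float] = None):
            result_pkl = self.pending_result_q.get(block, timeout)
            if result_pkl is None:      # shutdown sentinel: don't try to unpickle
                return None
            return pickle.loads(result_pkl)

    if __name__ == "__main__":
        s = ToyScheduler()
        s.pending_result_q.put(pickle.dumps({"task_id": 1, "result": 42}))
        s.pending_result_q.put(None)
        print(s.get_result())   # {'task_id': 1, 'result': 42}
        print(s.get_result())   # None -> caller keeps looping or shuts down
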
parsl/executors/high_throughput/process_worker_pool.py CHANGED
@@ -15,6 +15,7 @@ import threading
 import time
 import uuid
 from importlib.metadata import distributions
+from multiprocessing.context import SpawnProcess
 from multiprocessing.managers import DictProxy
 from multiprocessing.sharedctypes import Synchronized
 from typing import Dict, List, Optional, Sequence
@@ -403,52 +404,34 @@
         result_outgoing.connect(self._result_q_url)
         logger.info("Manager result pipe connected to interchange")
 
-        push_poll_period = max(10, self.poll_period) / 1000  # push_poll_period must be atleast 10 ms
-        logger.debug("push poll period: {}".format(push_poll_period))
-
-        last_beat = time.time()
-        last_result_beat = time.time()
-        items = []
-
         while not self._stop_event.is_set():
+            logger.debug("Starting pending_result_queue get")
             try:
-                logger.debug("Starting pending_result_queue get")
-                r = self.task_scheduler.get_result(block=True, timeout=push_poll_period)
-                logger.debug("Got a result item")
-                items.append(r)
-            except queue.Empty:
-                logger.debug("pending_result_queue get timeout without result item")
-            except Exception as e:
-                logger.exception("Got an exception: {}".format(e))
-
-            if time.time() > last_result_beat + self.heartbeat_period:
-                heartbeat_message = f"last_result_beat={last_result_beat} heartbeat_period={self.heartbeat_period} seconds"
-                logger.info(f"Sending heartbeat via results connection: {heartbeat_message}")
-                last_result_beat = time.time()
-                items.append(pickle.dumps({'type': 'heartbeat'}))
-
-            if len(items) >= self.max_queue_size or time.time() > last_beat + push_poll_period:
-                last_beat = time.time()
-                if items:
-                    logger.debug(f"Result send: Pushing {len(items)} items")
-                    result_outgoing.send_multipart(items)
-                    logger.debug("Result send: Pushed")
-                    items = []
-                else:
-                    logger.debug("Result send: No items to push")
-            else:
-                logger.debug(f"Result send: check condition not met - deferring {len(items)} result items")
+                r = self.task_scheduler.get_result()
+                if r is None:
+                    continue
+                logger.debug("Result received from worker: %s", id(r))
+                result_outgoing.send(r)
+                logger.debug("Result sent to interchange: %s", id(r))
+            except Exception:
+                logger.exception("Failed to send result to interchange")
 
         result_outgoing.close()
-        logger.info("Exiting")
+        logger.debug("Exiting")
 
     @wrap_with_logs
-    def worker_watchdog(self):
+    def heartbeater(self):
+        while not self._stop_event.wait(self.heartbeat_period):
+            heartbeat_message = f"heartbeat_period={self.heartbeat_period} seconds"
+            logger.info(f"Sending heartbeat via results connection: {heartbeat_message}")
+            self.pending_result_queue.put(pickle.dumps({'type': 'heartbeat'}))
+
+    def worker_watchdog(self, procs: dict[int, SpawnProcess]):
         """Keeps workers alive."""
         logger.debug("Starting worker watchdog")
 
         while not self._stop_event.wait(self.heartbeat_period):
-            for worker_id, p in self.procs.items():
+            for worker_id, p in procs.items():
                if not p.is_alive():
                    logger.error("Worker {} has died".format(worker_id))
                    try:
@@ -466,11 +449,10 @@
                    except KeyError:
                        logger.info("Worker {} was not busy when it died".format(worker_id))
 
-                    p = self._start_worker(worker_id)
-                    self.procs[worker_id] = p
+                    procs[worker_id] = self._start_worker(worker_id)
                    logger.info("Worker {} has been restarted".format(worker_id))
 
-        logger.critical("Exiting")
+        logger.debug("Exiting")
 
     @wrap_with_logs
     def handle_monitoring_messages(self):
@@ -485,32 +467,28 @@
         """
         logger.debug("Starting monitoring handler thread")
 
-        poll_period_s = max(10, self.poll_period) / 1000  # Must be at least 10 ms
-
         while not self._stop_event.is_set():
             try:
                 logger.debug("Starting monitor_queue.get()")
-                msg = self.monitoring_queue.get(block=True, timeout=poll_period_s)
-            except queue.Empty:
-                logger.debug("monitoring_queue.get() has timed out")
-            except Exception as e:
-                logger.exception(f"Got an exception: {e}")
-            else:
+                msg = self.monitoring_queue.get(block=True)
+                if msg is None:
+                    continue
                 logger.debug("Got a monitoring message")
                 self.pending_result_queue.put(msg)
                 logger.debug("Put monitoring message on pending_result_queue")
+            except Exception:
+                logger.exception("Failed to forward monitoring message")
 
-        logger.critical("Exiting")
+        logger.debug("Exiting")
 
     def start(self):
         """ Start the worker processes.
 
         TODO: Move task receiving to a thread
         """
-        self.procs = {}
+        procs: dict[int, SpawnProcess] = {}
         for worker_id in range(self.worker_count):
-            p = self._start_worker(worker_id)
-            self.procs[worker_id] = p
+            procs[worker_id] = self._start_worker(worker_id)
 
         logger.debug("Workers started")
 
@@ -519,40 +497,69 @@
             target=self.push_results, name="Result-Pusher"
         )
         thr_worker_watchdog = threading.Thread(
-            target=self.worker_watchdog, name="worker-watchdog"
+            target=self.worker_watchdog, args=(procs,), name="worker-watchdog"
         )
         thr_monitoring_handler = threading.Thread(
             target=self.handle_monitoring_messages, name="Monitoring-Handler"
         )
+        thr_heartbeater = threading.Thread(target=self.heartbeater, name="Heartbeater")
 
         thr_task_puller.start()
         thr_result_pusher.start()
         thr_worker_watchdog.start()
         thr_monitoring_handler.start()
+        thr_heartbeater.start()
 
         logger.info("Manager threads started")
 
         # This might need a multiprocessing event to signal back.
         self._stop_event.wait()
-        logger.critical("Received kill event, terminating worker processes")
+        logger.info("Stop event set; terminating worker processes")
+
+        # Invite blocking threads to quit
+        self.monitoring_queue.put(None)
+        self.pending_result_queue.put(None)
 
+        thr_heartbeater.join()
         thr_task_puller.join()
         thr_result_pusher.join()
         thr_worker_watchdog.join()
         thr_monitoring_handler.join()
-        for proc_id in self.procs:
-            self.procs[proc_id].terminate()
-            logger.critical("Terminating worker {}: is_alive()={}".format(self.procs[proc_id],
-                                                                           self.procs[proc_id].is_alive()))
-            self.procs[proc_id].join()
-            logger.debug("Worker {} joined successfully".format(self.procs[proc_id]))
+
+        for worker_id in procs:
+            p = procs[worker_id]
+            proc_info = f"(PID: {p.pid}, Worker ID: {worker_id})"
+            logger.debug(f"Signaling worker {p.name} (TERM). {proc_info}")
+            p.terminate()
 
         self.zmq_context.term()
+
+        # give processes 1 second to gracefully shut themselves down, based on the
+        # SIGTERM (.terminate()) just sent; after then, we pull the plug.
+        force_child_shutdown_at = time.monotonic() + 1
+        while procs:
+            worker_id, p = procs.popitem()
+            timeout = max(force_child_shutdown_at - time.monotonic(), 0.000001)
+            p.join(timeout=timeout)
+            proc_info = f"(PID: {p.pid}, Worker ID: {worker_id})"
+            if p.exitcode is not None:
+                logger.debug(
+                    "Worker joined successfully. %s (exitcode: %s)", proc_info, p.exitcode
+                )
+
+            else:
+                logger.warning(
+                    f"Worker {p.name} ({worker_id}) failed to terminate in a timely"
+                    f" manner; sending KILL signal to process. {proc_info}"
+                )
+                p.kill()
+                p.join()
+            p.close()
+
         delta = time.time() - self._start_time
         logger.info("process_worker_pool ran for {} seconds".format(delta))
-        return
 
-    def _start_worker(self, worker_id: int):
+    def _start_worker(self, worker_id: int) -> SpawnProcess:
         p = SpawnContext.Process(
             target=worker,
             args=(
@@ -939,27 +946,27 @@ if __name__ == "__main__":
     )
     logger.info(
         f"\n Python version: {sys.version}"
-        f" Debug logging: {args.debug}"
-        f" Certificates dir: {args.cert_dir}"
-        f" Log dir: {args.logdir}"
-        f" Manager ID: {args.uid}"
-        f" Block ID: {args.block_id}"
-        f" cores_per_worker: {args.cores_per_worker}"
-        f" mem_per_worker: {args.mem_per_worker}"
-        f" task_port: {args.task_port}"
-        f" result_port: {args.result_port}"
-        f" addresses: {args.addresses}"
-        f" max_workers_per_node: {args.max_workers_per_node}"
-        f" poll_period: {args.poll}"
-        f" address_probe_timeout: {args.address_probe_timeout}"
-        f" Prefetch capacity: {args.prefetch_capacity}"
-        f" Heartbeat threshold: {args.hb_threshold}"
-        f" Heartbeat period: {args.hb_period}"
-        f" Drain period: {args.drain_period}"
-        f" CPU affinity: {args.cpu_affinity}"
-        f" Accelerators: {' '.join(args.available_accelerators)}"
-        f" enable_mpi_mode: {args.enable_mpi_mode}"
-        f" mpi_launcher: {args.mpi_launcher}"
+        f"\n Debug logging: {args.debug}"
+        f"\n Certificates dir: {args.cert_dir}"
+        f"\n Log dir: {args.logdir}"
+        f"\n Manager ID: {args.uid}"
+        f"\n Block ID: {args.block_id}"
+        f"\n cores_per_worker: {args.cores_per_worker}"
+        f"\n mem_per_worker: {args.mem_per_worker}"
+        f"\n task_port: {args.task_port}"
+        f"\n result_port: {args.result_port}"
+        f"\n addresses: {args.addresses}"
+        f"\n max_workers_per_node: {args.max_workers_per_node}"
+        f"\n poll_period: {args.poll}"
+        f"\n address_probe_timeout: {args.address_probe_timeout}"
+        f"\n Prefetch capacity: {args.prefetch_capacity}"
+        f"\n Heartbeat threshold: {args.hb_threshold}"
+        f"\n Heartbeat period: {args.hb_period}"
+        f"\n Drain period: {args.drain_period}"
+        f"\n CPU affinity: {args.cpu_affinity}"
+        f"\n Accelerators: {' '.join(args.available_accelerators)}"
+        f"\n enable_mpi_mode: {args.enable_mpi_mode}"
+        f"\n mpi_launcher: {args.mpi_launcher}"
     )
     try:
         manager = Manager(task_port=args.task_port,
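
The overall shutdown pattern in the reworked manager: the result-pusher and monitoring-handler threads now block indefinitely on queue gets, and shutdown "invites" them to exit by putting a None sentinel on each queue rather than relying on polling timeouts. A generic, self-contained sketch of that pattern (names are illustrative, not parsl's):

    import threading
    import queue

    def consumer(q: "queue.Queue") -> None:
        while True:
            item = q.get()          # blocks; no polling timeout needed
            if item is None:        # sentinel: time to exit
                break
            print("forwarding", item)

    if __name__ == "__main__":
        q: "queue.Queue" = queue.Queue()
        t = threading.Thread(target=consumer, args=(q,), name="Monitoring-Handler")
        t.start()
        q.put({"type": "heartbeat"})
        q.put(None)                 # unblock and stop the thread
        t.join()
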
parsl/executors/radical/executor.py CHANGED
@@ -215,6 +215,7 @@ class RadicalPilotExecutor(ParslExecutor, RepresentationMixin):
         """Create the Pilot component and pass it.
         """
         logger.info("starting RadicalPilotExecutor")
+        super().start()
         logger.info('Parsl: {0}'.format(parsl.__version__))
         logger.info('RADICAL pilot: {0}'.format(rp.version))
         self.session = rp.Session(cfg={'base': self.run_dir},
parsl/executors/status_handling.py CHANGED
@@ -14,6 +14,7 @@ from parsl.executors.errors import BadStateException, ScalingFailed
 from parsl.jobs.error_handlers import noop_error_handler, simple_error_handler
 from parsl.jobs.states import TERMINAL_STATES, JobState, JobStatus
 from parsl.monitoring.message_type import MessageType
+from parsl.monitoring.radios.multiprocessing import MultiprocessingQueueRadioSender
 from parsl.providers.base import ExecutionProvider
 from parsl.utils import AtomicIDCounter
 
@@ -83,6 +84,13 @@ class BlockProviderExecutor(ParslExecutor):
         # of pending, active and recently terminated blocks
         self._status = {}  # type: Dict[str, JobStatus]
 
+        self.submit_monitoring_radio: Optional[MultiprocessingQueueRadioSender] = None
+
+    def start(self):
+        super().start()
+        if self.monitoring_messages:
+            self.submit_monitoring_radio = MultiprocessingQueueRadioSender(self.monitoring_messages)
+
     def _make_status_dict(self, block_ids: List[str], status_list: List[JobStatus]) -> Dict[str, JobStatus]:
         """Given a list of block ids and a list of corresponding status strings,
         returns a dictionary mapping each block id to the corresponding status
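
BlockProviderExecutor now builds its submit-side radio from the queue at start() time. A MultiprocessingQueueRadioSender is presumably little more than a thin wrapper whose send() enqueues the message; the toy class below sketches that idea and is not parsl's actual implementation:

    import queue
    from typing import Any

    class QueueRadioSender:
        """Illustrative stand-in: send() simply enqueues the message for the
        submit-side drain loop."""

        def __init__(self, q: "queue.Queue") -> None:
            self._q = q

        def send(self, message: Any) -> None:
            self._q.put(message)

    if __name__ == "__main__":
        q: "queue.Queue" = queue.Queue()
        QueueRadioSender(q).send(("BLOCK_INFO", {"block_id": "0"}))  # illustrative payload
        print(q.get())
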
parsl/executors/taskvine/executor.py CHANGED
@@ -239,6 +239,7 @@ class TaskVineExecutor(BlockProviderExecutor, putils.RepresentationMixin):
         retrieve Parsl tasks within the TaskVine system.
         """
 
+        super().start()
         # Synchronize connection and communication settings between the manager and factory
         self.__synchronize_manager_factory_comm_settings()
 
parsl/executors/workqueue/executor.py CHANGED
@@ -314,6 +314,7 @@ class WorkQueueExecutor(BlockProviderExecutor, putils.RepresentationMixin):
         """Create submit process and collector thread to create, send, and
         retrieve Parsl tasks within the Work Queue system.
         """
+        super().start()
         self.tasks_lock = threading.Lock()
 
         # Create directories for data and results
parsl/monitoring/db_manager.py CHANGED
@@ -1,6 +1,7 @@
 import datetime
 import logging
 import multiprocessing.queues as mpq
+import multiprocessing.synchronize as mpe
 import os
 import queue
 import threading
@@ -278,11 +279,13 @@ class Database:
 
 class DatabaseManager:
     def __init__(self,
+                 *,
                  db_url: str = 'sqlite:///runinfo/monitoring.db',
                  run_dir: str = '.',
                  logging_level: int = logging.INFO,
                  batching_interval: float = 1,
                  batching_threshold: float = 99999,
+                 exit_event: mpe.Event
                  ):
 
         self.workflow_end = False
@@ -307,6 +310,8 @@ class DatabaseManager:
         self.pending_block_queue: queue.Queue[MonitoringMessage] = queue.Queue()
         self.pending_resource_queue: queue.Queue[MonitoringMessage] = queue.Queue()
 
+        self.external_exit_event = exit_event
+
     def start(self,
               resource_queue: mpq.Queue) -> None:
 
@@ -555,15 +560,16 @@ class DatabaseManager:
         while not kill_event.is_set() or logs_queue.qsize() != 0:
             logger.debug("Checking STOP conditions: kill event: %s, queue has entries: %s",
                          kill_event.is_set(), logs_queue.qsize() != 0)
+
+            if self.external_exit_event.is_set():
+                self.close()
+
             try:
                 x = logs_queue.get(timeout=0.1)
             except queue.Empty:
                 continue
             else:
-                if x == 'STOP':
-                    self.close()
-                else:
-                    self._dispatch_to_internal(x)
+                self._dispatch_to_internal(x)
 
     def _dispatch_to_internal(self, x: Tuple) -> None:
         assert isinstance(x, tuple)
@@ -678,11 +684,11 @@
 
 @wrap_with_logs(target="database_manager")
 @typeguard.typechecked
-def dbm_starter(exception_q: mpq.Queue,
-                resource_msgs: mpq.Queue,
+def dbm_starter(resource_msgs: mpq.Queue,
                 db_url: str,
                 run_dir: str,
-                logging_level: int) -> None:
+                logging_level: int,
+                exit_event: mpe.Event) -> None:
     """Start the database manager process
 
     The DFK should start this function. The args, kwargs match that of the monitoring config
@@ -693,16 +699,16 @@ def dbm_starter(exception_q: mpq.Queue,
     try:
         dbm = DatabaseManager(db_url=db_url,
                               run_dir=run_dir,
-                              logging_level=logging_level)
+                              logging_level=logging_level,
+                              exit_event=exit_event)
         logger.info("Starting dbm in dbm starter")
         dbm.start(resource_msgs)
     except KeyboardInterrupt:
         logger.exception("KeyboardInterrupt signal caught")
         dbm.close()
         raise
-    except Exception as e:
+    except Exception:
         logger.exception("dbm.start exception")
-        exception_q.put(("DBM", str(e)))
         dbm.close()
 
     logger.info("End of dbm_starter")
parsl/monitoring/errors.py CHANGED
@@ -4,3 +4,8 @@ from parsl.errors import ParslError
 class MonitoringHubStartError(ParslError):
     def __str__(self) -> str:
         return "Hub failed to start"
+
+
+class MonitoringRouterStartError(ParslError):
+    def __str__(self) -> str:
+        return "Monitoring router failed to start"