parsl-2025.6.23-py3-none-any.whl → parsl-2025.6.30-py3-none-any.whl

This diff shows the contents of publicly released package versions as published to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in that public registry.
Files changed (44)
  1. parsl/configs/osg.py +1 -1
  2. parsl/dataflow/dflow.py +14 -4
  3. parsl/executors/base.py +14 -6
  4. parsl/executors/high_throughput/executor.py +20 -15
  5. parsl/executors/high_throughput/interchange.py +173 -191
  6. parsl/executors/high_throughput/mpi_executor.py +7 -4
  7. parsl/executors/high_throughput/probe.py +4 -4
  8. parsl/executors/high_throughput/process_worker_pool.py +88 -94
  9. parsl/executors/taskvine/executor.py +9 -3
  10. parsl/executors/taskvine/manager.py +3 -1
  11. parsl/executors/threads.py +8 -1
  12. parsl/executors/workqueue/executor.py +9 -3
  13. parsl/monitoring/errors.py +5 -0
  14. parsl/monitoring/monitoring.py +25 -42
  15. parsl/monitoring/radios/base.py +63 -2
  16. parsl/monitoring/radios/filesystem.py +18 -3
  17. parsl/monitoring/radios/filesystem_router.py +13 -26
  18. parsl/monitoring/radios/htex.py +22 -13
  19. parsl/monitoring/radios/multiprocessing.py +22 -2
  20. parsl/monitoring/radios/udp.py +57 -19
  21. parsl/monitoring/radios/udp_router.py +49 -15
  22. parsl/monitoring/remote.py +19 -40
  23. parsl/providers/local/local.py +12 -13
  24. parsl/tests/configs/htex_local_alternate.py +0 -1
  25. parsl/tests/test_htex/test_interchange_exit_bad_registration.py +5 -7
  26. parsl/tests/test_htex/test_zmq_binding.py +5 -6
  27. parsl/tests/test_monitoring/test_basic.py +12 -10
  28. parsl/tests/test_monitoring/test_htex_init_blocks_vs_monitoring.py +0 -1
  29. parsl/tests/test_monitoring/test_radio_filesystem.py +7 -9
  30. parsl/tests/test_monitoring/test_radio_multiprocessing.py +44 -0
  31. parsl/tests/test_monitoring/test_radio_udp.py +163 -12
  32. parsl/tests/test_monitoring/test_stdouterr.py +1 -3
  33. parsl/tests/test_scaling/test_worker_interchange_bad_messages_3262.py +3 -7
  34. parsl/version.py +1 -1
  35. {parsl-2025.6.23.data → parsl-2025.6.30.data}/scripts/interchange.py +173 -191
  36. {parsl-2025.6.23.data → parsl-2025.6.30.data}/scripts/process_worker_pool.py +88 -94
  37. {parsl-2025.6.23.dist-info → parsl-2025.6.30.dist-info}/METADATA +2 -2
  38. {parsl-2025.6.23.dist-info → parsl-2025.6.30.dist-info}/RECORD +44 -43
  39. {parsl-2025.6.23.data → parsl-2025.6.30.data}/scripts/exec_parsl_function.py +0 -0
  40. {parsl-2025.6.23.data → parsl-2025.6.30.data}/scripts/parsl_coprocess.py +0 -0
  41. {parsl-2025.6.23.dist-info → parsl-2025.6.30.dist-info}/LICENSE +0 -0
  42. {parsl-2025.6.23.dist-info → parsl-2025.6.30.dist-info}/WHEEL +0 -0
  43. {parsl-2025.6.23.dist-info → parsl-2025.6.30.dist-info}/entry_points.txt +0 -0
  44. {parsl-2025.6.23.dist-info → parsl-2025.6.30.dist-info}/top_level.txt +0 -0
@@ -1,7 +1,6 @@
  #!/usr/bin/env python3
 
  import argparse
- import json
  import logging
  import math
  import multiprocessing
@@ -66,8 +65,7 @@ class Manager:
  def __init__(self, *,
  addresses,
  address_probe_timeout,
- task_port,
- result_port,
+ port,
  cores_per_worker,
  mem_per_worker,
  max_workers_per_node,
@@ -157,15 +155,13 @@ class Manager:
  self._start_time = time.time()
 
  try:
- ix_address = probe_addresses(addresses.split(','), task_port, timeout=address_probe_timeout)
+ ix_address = probe_addresses(addresses.split(','), port, timeout=address_probe_timeout)
  if not ix_address:
  raise Exception("No viable address found")
  else:
- logger.info("Connection to Interchange successful on {}".format(ix_address))
- task_q_url = tcp_url(ix_address, task_port)
- result_q_url = tcp_url(ix_address, result_port)
- logger.info("Task url : {}".format(task_q_url))
- logger.info("Result url : {}".format(result_q_url))
+ logger.info(f"Connection to Interchange successful on {ix_address}")
+ ix_url = tcp_url(ix_address, port)
+ logger.info(f"Interchange url: {ix_url}")
  except Exception:
  logger.exception("Caught exception while trying to determine viable address to interchange")
  print("Failed to find a viable address to connect to interchange. Exiting")
@@ -174,8 +170,7 @@ class Manager:
  self.cert_dir = cert_dir
  self.zmq_context = curvezmq.ClientContext(self.cert_dir)
 
- self._task_q_url = task_q_url
- self._result_q_url = result_q_url
+ self._ix_url = ix_url
 
  self.uid = uid
  self.block_id = block_id
@@ -250,37 +245,37 @@ class Manager:
  self.worker_count = min(len(self.available_accelerators), self.worker_count)
  logger.info("Manager will spawn {} workers".format(self.worker_count))
 
- def create_reg_message(self):
+ def create_reg_message(self) -> dict:
  """ Creates a registration message to identify the worker to the interchange
  """
- msg = {'type': 'registration',
- 'parsl_v': PARSL_VERSION,
- 'python_v': "{}.{}.{}".format(sys.version_info.major,
- sys.version_info.minor,
- sys.version_info.micro),
- 'packages': {dist.metadata['Name']: dist.version for dist in distributions()},
- 'worker_count': self.worker_count,
- 'uid': self.uid,
- 'block_id': self.block_id,
- 'start_time': self.start_time,
- 'prefetch_capacity': self.prefetch_capacity,
- 'max_capacity': self.worker_count + self.prefetch_capacity,
- 'os': platform.system(),
- 'hostname': platform.node(),
- 'dir': os.getcwd(),
- 'cpu_count': psutil.cpu_count(logical=False),
- 'total_memory': psutil.virtual_memory().total,
- }
- b_msg = json.dumps(msg).encode('utf-8')
- return b_msg
+ return {
+ 'type': 'registration',
+ 'parsl_v': PARSL_VERSION,
+ 'python_v': "{}.{}.{}".format(
+ sys.version_info.major,
+ sys.version_info.minor,
+ sys.version_info.micro
+ ),
+ 'packages': {d.metadata['Name']: d.version for d in distributions()},
+ 'worker_count': self.worker_count,
+ 'uid': self.uid,
+ 'block_id': self.block_id,
+ 'start_time': self.start_time,
+ 'prefetch_capacity': self.prefetch_capacity,
+ 'max_capacity': self.worker_count + self.prefetch_capacity,
+ 'os': platform.system(),
+ 'hostname': platform.node(),
+ 'dir': os.getcwd(),
+ 'cpu_count': psutil.cpu_count(logical=False),
+ 'total_memory': psutil.virtual_memory().total,
+ }
 
  @staticmethod
  def heartbeat_to_incoming(task_incoming: zmq.Socket) -> None:
  """ Send heartbeat to the incoming task queue
  """
- msg = {'type': 'heartbeat'}
  # don't need to dumps and encode this every time - could do as a global on import?
- b_msg = json.dumps(msg).encode('utf-8')
+ b_msg = pickle.dumps({'type': 'heartbeat'})
  task_incoming.send(b_msg)
  logger.debug("Sent heartbeat")
 
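Manager-to-interchange control messages (registration, heartbeat, drain) are now pickled dicts rather than JSON-encoded bytes. A minimal sketch of that framing, independent of Parsl itself:

```python
import pickle

# The manager now serializes control messages with pickle, e.g. the heartbeat
# built by heartbeat_to_incoming() above, instead of json.dumps(...).encode('utf-8').
wire_bytes = pickle.dumps({'type': 'heartbeat'})

# The receiving side recovers the dict and can dispatch on its 'type' key.
msg = pickle.loads(wire_bytes)
assert msg['type'] == 'heartbeat'
```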
@@ -289,32 +284,38 @@ class Manager:
  """ Send heartbeat to the incoming task queue
  """
  msg = {'type': 'drain'}
- b_msg = json.dumps(msg).encode('utf-8')
+ b_msg = pickle.dumps(msg)
  task_incoming.send(b_msg)
  logger.debug("Sent drain")
 
  @wrap_with_logs
- def pull_tasks(self):
+ def interchange_communicator(self, pair_setup: threading.Event):
  """ Pull tasks from the incoming tasks zmq pipe onto the internal
  pending task queue
  """
  logger.info("starting")
 
+ results_sock = self.zmq_context.socket(zmq.PAIR)
+ results_sock.setsockopt(zmq.LINGER, 0)
+ results_sock.bind("inproc://results")
+ pair_setup.set()
+
  # Linger is set to 0, so that the manager can exit even when there might be
  # messages in the pipe
- task_incoming = self.zmq_context.socket(zmq.DEALER)
- task_incoming.setsockopt(zmq.IDENTITY, self.uid.encode('utf-8'))
- task_incoming.setsockopt(zmq.LINGER, 0)
- task_incoming.connect(self._task_q_url)
+ ix_sock = self.zmq_context.socket(zmq.DEALER)
+ ix_sock.setsockopt(zmq.IDENTITY, self.uid.encode('utf-8'))
+ ix_sock.setsockopt(zmq.LINGER, 0)
+ ix_sock.connect(self._ix_url)
  logger.info("Manager task pipe connected to interchange")
 
  poller = zmq.Poller()
- poller.register(task_incoming, zmq.POLLIN)
+ poller.register(results_sock, zmq.POLLIN)
+ poller.register(ix_sock, zmq.POLLIN)
 
  # Send a registration message
  msg = self.create_reg_message()
- logger.debug("Sending registration message: {}".format(msg))
- task_incoming.send(msg)
+ logger.debug("Sending registration message: %s", msg)
+ ix_sock.send(pickle.dumps(msg))
  last_beat = time.time()
  last_interchange_contact = time.time()
  task_recv_counter = 0
@@ -335,18 +336,21 @@ class Manager:
  pending_task_count = self.pending_task_queue.qsize()
  except NotImplementedError:
  # Ref: https://github.com/python/cpython/blob/6d5e0dc0e330f4009e8dc3d1642e46b129788877/Lib/multiprocessing/queues.py#L125
- pending_task_count = f"pending task count is not available on {platform.system()}"
+ pending_task_count = f"pending task count is not available on {platform.system()}" # type: ignore[assignment]
 
- logger.debug("ready workers: {}, pending tasks: {}".format(self.ready_worker_count.value,
- pending_task_count))
+ logger.debug(
+ 'ready workers: %d, pending tasks: %d',
+ self.ready_worker_count.value, # type: ignore[attr-defined]
+ pending_task_count,
+ )
 
  if time.time() >= last_beat + self.heartbeat_period:
- self.heartbeat_to_incoming(task_incoming)
+ self.heartbeat_to_incoming(ix_sock)
  last_beat = time.time()
 
  if time.time() > self.drain_time:
  logger.info("Requesting drain")
- self.drain_to_incoming(task_incoming)
+ self.drain_to_incoming(ix_sock)
  # This will start the pool draining...
  # Drained exit behaviour does not happen here. It will be
  # driven by the interchange sending a DRAINED_CODE message.
@@ -358,8 +362,8 @@ class Manager:
  poll_duration_s = max(0, next_interesting_event_time - time.time())
  socks = dict(poller.poll(timeout=poll_duration_s * 1000))
 
- if socks.get(task_incoming) == zmq.POLLIN:
- _, pkl_msg = task_incoming.recv_multipart()
+ if socks.get(ix_sock) == zmq.POLLIN:
+ pkl_msg = ix_sock.recv()
  tasks = pickle.loads(pkl_msg)
  last_interchange_contact = time.time()
 
@@ -377,6 +381,11 @@ class Manager:
  for task in tasks:
  self.task_scheduler.put_task(task)
 
+ elif socks.get(results_sock) == zmq.POLLIN:
+ meta_b = pickle.dumps({'type': 'result'})
+ ix_sock.send_multipart([meta_b, results_sock.recv()])
+ logger.debug("Result sent to interchange")
+
  else:
  logger.debug("No incoming tasks")
 
@@ -387,45 +396,36 @@ class Manager:
  logger.critical("Exiting")
  break
 
- task_incoming.close()
+ ix_sock.close()
  logger.info("Exiting")
 
  @wrap_with_logs
- def push_results(self):
- """ Listens on the pending_result_queue and sends out results via zmq
+ def ferry_result(self, may_connect: threading.Event):
+ """ Listens on the pending_result_queue and ferries results to the interchange
+ connected thread
  """
- logger.debug("Starting result push thread")
+ logger.debug("Begin")
 
  # Linger is set to 0, so that the manager can exit even when there might be
  # messages in the pipe
- result_outgoing = self.zmq_context.socket(zmq.DEALER)
- result_outgoing.setsockopt(zmq.IDENTITY, self.uid.encode('utf-8'))
- result_outgoing.setsockopt(zmq.LINGER, 0)
- result_outgoing.connect(self._result_q_url)
- logger.info("Manager result pipe connected to interchange")
+ notify_sock = self.zmq_context.socket(zmq.PAIR)
+ notify_sock.setsockopt(zmq.LINGER, 0)
+ may_connect.wait()
+ notify_sock.connect("inproc://results")
 
  while not self._stop_event.is_set():
- logger.debug("Starting pending_result_queue get")
  try:
  r = self.task_scheduler.get_result()
  if r is None:
  continue
- logger.debug("Result received from worker: %s", id(r))
- result_outgoing.send(r)
- logger.debug("Result sent to interchange: %s", id(r))
+ logger.debug("Result received from worker")
+ notify_sock.send(r)
  except Exception:
  logger.exception("Failed to send result to interchange")
 
- result_outgoing.close()
+ notify_sock.close()
  logger.debug("Exiting")
 
- @wrap_with_logs
- def heartbeater(self):
- while not self._stop_event.wait(self.heartbeat_period):
- heartbeat_message = f"heartbeat_period={self.heartbeat_period} seconds"
- logger.info(f"Sending heartbeat via results connection: {heartbeat_message}")
- self.pending_result_queue.put(pickle.dumps({'type': 'heartbeat'}))
-
  def worker_watchdog(self, procs: dict[int, SpawnProcess]):
  """Keeps workers alive."""
  logger.debug("Starting worker watchdog")
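In process_worker_pool.py, the two DEALER connections to the interchange (tasks and results) collapse into one: ferry_result hands each result over an in-process ZeroMQ PAIR socket to the thread that owns the single interchange connection, which forwards it as a multipart message. A self-contained sketch of that inproc PAIR pattern (the "inproc://results" endpoint and the bind-before-connect Event mirror the diff; the rest is illustrative, not Parsl code):

```python
import threading
import zmq

ctx = zmq.Context()
ready = threading.Event()

def communicator():
    # Owns the inproc endpoint; for inproc transports the bind side must be
    # set up before any peer connects, hence the Event handshake.
    sock = ctx.socket(zmq.PAIR)
    sock.bind("inproc://results")
    ready.set()
    print("communicator got:", sock.recv())
    sock.close()

def ferry():
    sock = ctx.socket(zmq.PAIR)
    ready.wait()                      # plays the role of pair_setup / may_connect
    sock.connect("inproc://results")
    sock.send(b"a result payload")    # stands in for a pickled task result
    sock.close()

t1 = threading.Thread(target=communicator)
t2 = threading.Thread(target=ferry)
t1.start(); t2.start()
t1.join(); t2.join()
ctx.term()
```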
@@ -492,23 +492,26 @@ class Manager:
 
  logger.debug("Workers started")
 
- thr_task_puller = threading.Thread(target=self.pull_tasks, name="Task-Puller")
- thr_result_pusher = threading.Thread(
- target=self.push_results, name="Result-Pusher"
+ pair_setup = threading.Event()
+
+ thr_task_puller = threading.Thread(
+ target=self.interchange_communicator,
+ args=(pair_setup,),
+ name="Interchange-Communicator",
  )
+ thr_result_ferry = threading.Thread(
+ target=self.ferry_result, args=(pair_setup,), name="Result-Shovel")
  thr_worker_watchdog = threading.Thread(
  target=self.worker_watchdog, args=(procs,), name="worker-watchdog"
  )
  thr_monitoring_handler = threading.Thread(
  target=self.handle_monitoring_messages, name="Monitoring-Handler"
  )
- thr_heartbeater = threading.Thread(target=self.heartbeater, name="Heartbeater")
 
  thr_task_puller.start()
- thr_result_pusher.start()
+ thr_result_ferry.start()
  thr_worker_watchdog.start()
  thr_monitoring_handler.start()
- thr_heartbeater.start()
 
  logger.info("Manager threads started")
 
@@ -520,11 +523,10 @@ class Manager:
  self.monitoring_queue.put(None)
  self.pending_result_queue.put(None)
 
- thr_heartbeater.join()
- thr_task_puller.join()
- thr_result_pusher.join()
- thr_worker_watchdog.join()
  thr_monitoring_handler.join()
+ thr_worker_watchdog.join()
+ thr_result_ferry.join()
+ thr_task_puller.join()
 
  for worker_id in procs:
  p = procs[worker_id]
@@ -862,10 +864,10 @@ def get_arg_parser() -> argparse.ArgumentParser:
  help="GB of memory assigned to each worker process. Default=0, no assignment",
  )
  parser.add_argument(
- "-t",
- "--task_port",
+ "-P",
+ "--port",
  required=True,
- help="Task port for receiving tasks from the interchange",
+ help="Port for communication with the interchange",
  )
  parser.add_argument(
  "--max_workers_per_node",
@@ -901,12 +903,6 @@ def get_arg_parser() -> argparse.ArgumentParser:
  parser.add_argument(
  "--poll", default=10, help="Poll period used in milliseconds"
  )
- parser.add_argument(
- "-r",
- "--result_port",
- required=True,
- help="Result port for posting results to the interchange",
- )
  parser.add_argument(
  "--cpu-affinity",
  type=strategyorlist,
@@ -953,8 +949,7 @@ if __name__ == "__main__":
  f"\n Block ID: {args.block_id}"
  f"\n cores_per_worker: {args.cores_per_worker}"
  f"\n mem_per_worker: {args.mem_per_worker}"
- f"\n task_port: {args.task_port}"
- f"\n result_port: {args.result_port}"
+ f"\n Interchange port: {args.port}"
  f"\n addresses: {args.addresses}"
  f"\n max_workers_per_node: {args.max_workers_per_node}"
  f"\n poll_period: {args.poll}"
@@ -969,8 +964,7 @@ if __name__ == "__main__":
  f"\n mpi_launcher: {args.mpi_launcher}"
  )
  try:
- manager = Manager(task_port=args.task_port,
- result_port=args.result_port,
+ manager = Manager(port=args.port,
  addresses=args.addresses,
  address_probe_timeout=int(args.address_probe_timeout),
  uid=args.uid,
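Anything that builds a worker-pool command line by hand needs the matching change: the separate task/result port flags are gone and a single port flag remains. A minimal argparse sketch covering only the flag shown in these hunks (all other options omitted):

```python
import argparse

parser = argparse.ArgumentParser()
# 2025.6.23: -t/--task_port and -r/--result_port were both required.
# 2025.6.30: one -P/--port carries all traffic to the interchange.
parser.add_argument("-P", "--port", required=True,
                    help="Port for communication with the interchange")

args = parser.parse_args(["--port", "54321"])
print(args.port)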
@@ -40,6 +40,8 @@ from parsl.executors.taskvine.factory_config import TaskVineFactoryConfig
  from parsl.executors.taskvine.manager import _taskvine_submit_wait
  from parsl.executors.taskvine.manager_config import TaskVineManagerConfig
  from parsl.executors.taskvine.utils import ParslFileToVine, ParslTaskToVine
+ from parsl.monitoring.radios.base import RadioConfig
+ from parsl.monitoring.radios.filesystem import FilesystemRadio
  from parsl.multiprocessing import SpawnContext
  from parsl.process_loggers import wrap_with_logs
  from parsl.providers import CondorProvider, LocalProvider
@@ -98,8 +100,6 @@ class TaskVineExecutor(BlockProviderExecutor, putils.RepresentationMixin):
  Default is None.
  """
 
- radio_mode = "filesystem"
-
  @typeguard.typechecked
  def __init__(self,
  label: str = "TaskVineExecutor",
@@ -108,7 +108,8 @@ class TaskVineExecutor(BlockProviderExecutor, putils.RepresentationMixin):
  manager_config: TaskVineManagerConfig = TaskVineManagerConfig(),
  factory_config: TaskVineFactoryConfig = TaskVineFactoryConfig(),
  provider: Optional[ExecutionProvider] = LocalProvider(init_blocks=1),
- storage_access: Optional[List[Staging]] = None):
+ storage_access: Optional[List[Staging]] = None,
+ remote_monitoring_radio: Optional[RadioConfig] = None):
 
  # Set worker launch option for this executor
  if worker_launch_method == 'factory' or worker_launch_method == 'manual':
@@ -134,6 +135,11 @@ class TaskVineExecutor(BlockProviderExecutor, putils.RepresentationMixin):
  self.factory_config = factory_config
  self.storage_access = storage_access
 
+ if remote_monitoring_radio is not None:
+ self.remote_monitoring_radio = remote_monitoring_radio
+ else:
+ self.remote_monitoring_radio = FilesystemRadio()
+
  # Queue to send ready tasks from TaskVine executor process to TaskVine manager process
  self._ready_task_queue: multiprocessing.Queue = SpawnContext.Queue()
 
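The same change lands in several executors in this release: the class-level radio_mode string gives way to an instance-level RadioConfig object chosen at construction time. A standalone sketch of that selection idiom, using stand-in classes rather than Parsl's real ones:

```python
from typing import Optional


class RadioConfig:
    """Stand-in for parsl.monitoring.radios.base.RadioConfig."""


class FilesystemRadio(RadioConfig):
    """Stand-in for an executor's preferred default radio."""


class SomeExecutor:
    def __init__(self, remote_monitoring_radio: Optional[RadioConfig] = None):
        # An explicitly supplied radio wins; otherwise fall back to the
        # executor-specific default (FilesystemRadio for TaskVine and Work Queue).
        if remote_monitoring_radio is not None:
            self.remote_monitoring_radio = remote_monitoring_radio
        else:
            self.remote_monitoring_radio = FilesystemRadio()


assert isinstance(SomeExecutor().remote_monitoring_radio, FilesystemRadio)
```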
@@ -6,6 +6,7 @@ import shutil
  import subprocess
  import uuid
 
+ import parsl
  from parsl.executors.taskvine import exec_parsl_function
  from parsl.executors.taskvine.utils import VineTaskToParsl, run_parsl_function
  from parsl.process_loggers import wrap_with_logs
@@ -255,7 +256,8 @@ def _taskvine_submit_wait(ready_task_queue=None,
  run_parsl_function,
  poncho_env=poncho_env_path,
  init_command=manager_config.init_command,
- add_env=add_env)
+ add_env=add_env,
+ hoisting_modules=[parsl.serialize, run_parsl_function])
 
  # Configure the library if provided
  if manager_config.library_config:
@@ -7,6 +7,8 @@ import typeguard
  from parsl.data_provider.staging import Staging
  from parsl.executors.base import ParslExecutor
  from parsl.executors.errors import InvalidResourceSpecification
+ from parsl.monitoring.radios.base import RadioConfig
+ from parsl.monitoring.radios.multiprocessing import MultiprocessingQueueRadio
  from parsl.utils import RepresentationMixin
 
  logger = logging.getLogger(__name__)
@@ -28,7 +30,7 @@ class ThreadPoolExecutor(ParslExecutor, RepresentationMixin):
  @typeguard.typechecked
  def __init__(self, label: str = 'threads', max_threads: Optional[int] = 2,
  thread_name_prefix: str = '', storage_access: Optional[List[Staging]] = None,
- working_dir: Optional[str] = None):
+ working_dir: Optional[str] = None, remote_monitoring_radio: Optional[RadioConfig] = None):
  ParslExecutor.__init__(self)
  self.label = label
  self.max_threads = max_threads
@@ -40,6 +42,11 @@ class ThreadPoolExecutor(ParslExecutor, RepresentationMixin):
  self.storage_access = storage_access
  self.working_dir = working_dir
 
+ if remote_monitoring_radio is not None:
+ self.remote_monitoring_radio = remote_monitoring_radio
+ else:
+ self.remote_monitoring_radio = MultiprocessingQueueRadio()
+
  def start(self):
  self.executor = cf.ThreadPoolExecutor(max_workers=self.max_threads,
  thread_name_prefix=self.thread_name_prefix)
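A hedged configuration sketch of the new executor parameter, using only names that appear in this diff; the surrounding Config plumbing is the usual Parsl pattern and may need adjusting for a real run:

```python
from parsl.config import Config
from parsl.executors.threads import ThreadPoolExecutor
from parsl.monitoring.radios.multiprocessing import MultiprocessingQueueRadio

# Explicitly choosing the radio; leaving remote_monitoring_radio unset gives
# this executor a MultiprocessingQueueRadio() by default.
config = Config(
    executors=[
        ThreadPoolExecutor(
            label="threads",
            remote_monitoring_radio=MultiprocessingQueueRadio(),
        )
    ]
)
```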
@@ -31,6 +31,8 @@ from parsl.errors import OptionalModuleMissing
  from parsl.executors.errors import ExecutorError, InvalidResourceSpecification
  from parsl.executors.status_handling import BlockProviderExecutor
  from parsl.executors.workqueue import exec_parsl_function
+ from parsl.monitoring.radios.base import RadioConfig
+ from parsl.monitoring.radios.filesystem import FilesystemRadio
  from parsl.multiprocessing import SpawnContext, SpawnProcess
  from parsl.process_loggers import wrap_with_logs
  from parsl.providers import CondorProvider, LocalProvider
@@ -227,8 +229,6 @@ class WorkQueueExecutor(BlockProviderExecutor, putils.RepresentationMixin):
  specifiation for each task).
  """
 
- radio_mode = "filesystem"
-
  @typeguard.typechecked
  def __init__(self,
  label: str = "WorkQueueExecutor",
@@ -255,7 +255,8 @@ class WorkQueueExecutor(BlockProviderExecutor, putils.RepresentationMixin):
  worker_executable: str = 'work_queue_worker',
  function_dir: Optional[str] = None,
  coprocess: bool = False,
- scaling_cores_per_worker: int = 1):
+ scaling_cores_per_worker: int = 1,
+ remote_monitoring_radio: Optional[RadioConfig] = None):
  BlockProviderExecutor.__init__(self, provider=provider,
  block_error_handler=True)
  if not _work_queue_enabled:
@@ -308,6 +309,11 @@ class WorkQueueExecutor(BlockProviderExecutor, putils.RepresentationMixin):
  if self.init_command != "":
  self.launch_cmd = self.init_command + "; " + self.launch_cmd
 
+ if remote_monitoring_radio is not None:
+ self.remote_monitoring_radio = remote_monitoring_radio
+ else:
+ self.remote_monitoring_radio = FilesystemRadio()
+
  def _get_launch_command(self, block_id):
  # this executor uses different terminology for worker/launch
  # commands than in htex
@@ -4,3 +4,8 @@ from parsl.errors import ParslError
  class MonitoringRouterStartError(ParslError):
  def __str__(self) -> str:
  return "Monitoring router failed to start"
+
+
+ class RadioRequiredError(ParslError):
+ def __str__(self) -> str:
+ return "A radio must be configured for remote task monitoring"
@@ -9,8 +9,6 @@ from typing import Any, Optional, Union
 
  import typeguard
 
- from parsl.monitoring.radios.filesystem_router import start_filesystem_receiver
- from parsl.monitoring.radios.udp_router import start_udp_receiver
  from parsl.monitoring.types import TaggedMonitoringMessage
  from parsl.multiprocessing import (
  SizedQueue,
@@ -36,9 +34,9 @@ logger = logging.getLogger(__name__)
  @typeguard.typechecked
  class MonitoringHub(RepresentationMixin):
  def __init__(self,
- hub_address: str,
- hub_port: Optional[int] = None,
- hub_port_range: Any = None,
+ hub_address: Any = None, # unused, so no type enforcement
+ hub_port_range: Any = None, # unused, so no type enforcement
+ hub_port: Any = None, # unused, so no type enforcement
 
  workflow_name: Optional[str] = None,
  workflow_version: Optional[str] = None,
@@ -49,16 +47,14 @@ class MonitoringHub(RepresentationMixin):
  """
  Parameters
  ----------
- hub_address : str
- The ip address at which the workers will be able to reach the Hub.
- hub_port : int
- The UDP port to which workers will be able to deliver messages to
- the monitoring router.
- Note that despite the similar name, this is not related to
- hub_port_range.
- Default: None
+ hub_address : unused
+ hub_port : unused
+ Unused, but probably retained until 2026-06-01 to give deprecation warning.
+ These two values previously configured UDP parameters when UDP was used
+ for monitoring messages from workers. These are now configured on the
+ relevant UDPRadio.
  hub_port_range : unused
- Unused, but retained until 2025-09-14 to avoid configuration errors.
+ Unused, but probably retained until 2026-06-01 to give deprecation warning.
  This value previously configured one ZMQ channel inside the
  HighThroughputExecutor. That ZMQ channel is now configured by the
  interchange_port_range parameter of HighThroughputExecutor.
@@ -86,15 +82,27 @@ class MonitoringHub(RepresentationMixin):
  if _db_manager_excepts:
  raise _db_manager_excepts
 
+ # The following three parameters need to exist as attributes to be
+ # output by RepresentationMixin.
+ if hub_address is not None:
+ message = "Instead of MonitoringHub.hub_address, specify UDPRadio(address=...)"
+ warnings.warn(message, DeprecationWarning)
+ logger.warning(message)
+
  self.hub_address = hub_address
+
+ if hub_port is not None:
+ message = "Instead of MonitoringHub.hub_port, specify UDPRadio(port=...)"
+ warnings.warn(message, DeprecationWarning)
+ logger.warning(message)
+
  self.hub_port = hub_port
 
  if hub_port_range is not None:
  message = "Instead of MonitoringHub.hub_port_range, Use HighThroughputExecutor.interchange_port_range"
  warnings.warn(message, DeprecationWarning)
  logger.warning(message)
- # This is used by RepresentationMixin so needs to exist as an attribute
- # even though now it is otherwise unused.
+
  self.hub_port_range = hub_port_range
 
  self.logging_endpoint = logging_endpoint
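For configurations that passed hub_address and hub_port, the deprecation messages above point to the per-executor radio instead. A hedged migration sketch: UDPRadio(address=..., port=...) and its module path are taken from this diff, while the remote_monitoring_radio keyword on HighThroughputExecutor is assumed to follow the same pattern added to the other executors here; exact signatures may differ.

```python
from parsl.config import Config
from parsl.executors import HighThroughputExecutor
from parsl.monitoring.monitoring import MonitoringHub
from parsl.monitoring.radios.udp import UDPRadio

# Before: MonitoringHub(hub_address="10.0.0.1", hub_port=55055)
# After: the UDP endpoint is configured on the radio attached to the executor,
# and MonitoringHub itself no longer takes address/port arguments.
config = Config(
    executors=[
        HighThroughputExecutor(
            label="htex",
            remote_monitoring_radio=UDPRadio(address="10.0.0.1", port=55055),
        )
    ],
    monitoring=MonitoringHub(),
)
```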
@@ -120,12 +128,6 @@ class MonitoringHub(RepresentationMixin):
  self.resource_msgs: Queue[TaggedMonitoringMessage]
  self.resource_msgs = SizedQueue()
 
- self.udp_receiver = start_udp_receiver(debug=self.monitoring_debug,
- logdir=dfk_run_dir,
- monitoring_messages=self.resource_msgs,
- port=self.hub_port
- )
-
  self.dbm_exit_event: ms.Event
  self.dbm_exit_event = SpawnEvent()
 
@@ -140,37 +142,18 @@ class MonitoringHub(RepresentationMixin):
  daemon=True,
  )
  self.dbm_proc.start()
- logger.info("Started UDP router process %s and DBM process %s",
- self.udp_receiver.process.pid, self.dbm_proc.pid)
-
- self.filesystem_receiver = start_filesystem_receiver(debug=self.monitoring_debug,
- logdir=dfk_run_dir,
- monitoring_messages=self.resource_msgs
- )
- logger.info("Started filesystem radio receiver process %s", self.filesystem_receiver.process.pid)
-
- self.monitoring_hub_url = "udp://{}:{}".format(self.hub_address, self.udp_receiver.port)
-
+ logger.info("Started DBM process %s", self.dbm_proc.pid)
  logger.info("Monitoring Hub initialized")
 
  def close(self) -> None:
  logger.info("Terminating Monitoring Hub")
  if self.monitoring_hub_active:
  self.monitoring_hub_active = False
- logger.info("Setting router termination event")
-
- logger.info("Waiting for UDP router to terminate")
- self.udp_receiver.close()
-
- logger.debug("Finished waiting for router termination")
  logger.debug("Waiting for DB termination")
  self.dbm_exit_event.set()
  join_terminate_close_proc(self.dbm_proc)
  logger.debug("Finished waiting for DBM termination")
 
- logger.info("Terminating filesystem radio receiver process")
- self.filesystem_receiver.close()
-
  logger.info("Closing monitoring multiprocessing queues")
  self.resource_msgs.close()
  self.resource_msgs.join_thread()