parsl 2025.6.16__py3-none-any.whl → 2025.6.30__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53)
  1. parsl/configs/osg.py +1 -1
  2. parsl/dataflow/dflow.py +14 -4
  3. parsl/executors/base.py +19 -9
  4. parsl/executors/flux/executor.py +2 -0
  5. parsl/executors/globus_compute.py +2 -0
  6. parsl/executors/high_throughput/executor.py +22 -15
  7. parsl/executors/high_throughput/interchange.py +173 -191
  8. parsl/executors/high_throughput/mpi_executor.py +14 -4
  9. parsl/executors/high_throughput/probe.py +4 -4
  10. parsl/executors/high_throughput/process_worker_pool.py +88 -94
  11. parsl/executors/radical/executor.py +3 -0
  12. parsl/executors/taskvine/executor.py +11 -3
  13. parsl/executors/taskvine/manager.py +3 -1
  14. parsl/executors/threads.py +19 -3
  15. parsl/executors/workqueue/executor.py +11 -3
  16. parsl/monitoring/errors.py +4 -4
  17. parsl/monitoring/monitoring.py +26 -88
  18. parsl/monitoring/radios/base.py +63 -2
  19. parsl/monitoring/radios/filesystem.py +19 -4
  20. parsl/monitoring/radios/filesystem_router.py +22 -3
  21. parsl/monitoring/radios/htex.py +22 -13
  22. parsl/monitoring/radios/multiprocessing.py +22 -2
  23. parsl/monitoring/radios/udp.py +57 -19
  24. parsl/monitoring/radios/udp_router.py +119 -25
  25. parsl/monitoring/radios/zmq_router.py +9 -10
  26. parsl/monitoring/remote.py +19 -40
  27. parsl/providers/local/local.py +12 -13
  28. parsl/tests/configs/htex_local_alternate.py +0 -1
  29. parsl/tests/conftest.py +7 -4
  30. parsl/tests/test_htex/test_interchange_exit_bad_registration.py +5 -7
  31. parsl/tests/test_htex/test_zmq_binding.py +5 -6
  32. parsl/tests/test_monitoring/test_basic.py +12 -10
  33. parsl/tests/test_monitoring/{test_fuzz_zmq.py → test_htex_fuzz_zmq.py} +7 -2
  34. parsl/tests/test_monitoring/test_htex_init_blocks_vs_monitoring.py +0 -1
  35. parsl/tests/test_monitoring/test_radio_filesystem.py +48 -0
  36. parsl/tests/test_monitoring/test_radio_multiprocessing.py +44 -0
  37. parsl/tests/test_monitoring/test_radio_udp.py +204 -0
  38. parsl/tests/test_monitoring/test_stdouterr.py +1 -3
  39. parsl/tests/test_scaling/test_worker_interchange_bad_messages_3262.py +3 -7
  40. parsl/tests/test_shutdown/test_kill_monitoring.py +1 -1
  41. parsl/version.py +1 -1
  42. {parsl-2025.6.16.data → parsl-2025.6.30.data}/scripts/interchange.py +173 -191
  43. {parsl-2025.6.16.data → parsl-2025.6.30.data}/scripts/process_worker_pool.py +88 -94
  44. {parsl-2025.6.16.dist-info → parsl-2025.6.30.dist-info}/METADATA +2 -2
  45. {parsl-2025.6.16.dist-info → parsl-2025.6.30.dist-info}/RECORD +51 -50
  46. parsl/tests/configs/local_threads_monitoring.py +0 -10
  47. parsl/tests/manual_tests/test_udp_simple.py +0 -51
  48. {parsl-2025.6.16.data → parsl-2025.6.30.data}/scripts/exec_parsl_function.py +0 -0
  49. {parsl-2025.6.16.data → parsl-2025.6.30.data}/scripts/parsl_coprocess.py +0 -0
  50. {parsl-2025.6.16.dist-info → parsl-2025.6.30.dist-info}/LICENSE +0 -0
  51. {parsl-2025.6.16.dist-info → parsl-2025.6.30.dist-info}/WHEEL +0 -0
  52. {parsl-2025.6.16.dist-info → parsl-2025.6.30.dist-info}/entry_points.txt +0 -0
  53. {parsl-2025.6.16.dist-info → parsl-2025.6.30.dist-info}/top_level.txt +0 -0
@@ -1,7 +1,6 @@
1
1
  #!/usr/bin/env python3
2
2
 
3
3
  import argparse
4
- import json
5
4
  import logging
6
5
  import math
7
6
  import multiprocessing
@@ -66,8 +65,7 @@ class Manager:
66
65
  def __init__(self, *,
67
66
  addresses,
68
67
  address_probe_timeout,
69
- task_port,
70
- result_port,
68
+ port,
71
69
  cores_per_worker,
72
70
  mem_per_worker,
73
71
  max_workers_per_node,
@@ -157,15 +155,13 @@ class Manager:
157
155
  self._start_time = time.time()
158
156
 
159
157
  try:
160
- ix_address = probe_addresses(addresses.split(','), task_port, timeout=address_probe_timeout)
158
+ ix_address = probe_addresses(addresses.split(','), port, timeout=address_probe_timeout)
161
159
  if not ix_address:
162
160
  raise Exception("No viable address found")
163
161
  else:
164
- logger.info("Connection to Interchange successful on {}".format(ix_address))
165
- task_q_url = tcp_url(ix_address, task_port)
166
- result_q_url = tcp_url(ix_address, result_port)
167
- logger.info("Task url : {}".format(task_q_url))
168
- logger.info("Result url : {}".format(result_q_url))
162
+ logger.info(f"Connection to Interchange successful on {ix_address}")
163
+ ix_url = tcp_url(ix_address, port)
164
+ logger.info(f"Interchange url: {ix_url}")
169
165
  except Exception:
170
166
  logger.exception("Caught exception while trying to determine viable address to interchange")
171
167
  print("Failed to find a viable address to connect to interchange. Exiting")
@@ -174,8 +170,7 @@ class Manager:
174
170
  self.cert_dir = cert_dir
175
171
  self.zmq_context = curvezmq.ClientContext(self.cert_dir)
176
172
 
177
- self._task_q_url = task_q_url
178
- self._result_q_url = result_q_url
173
+ self._ix_url = ix_url
179
174
 
180
175
  self.uid = uid
181
176
  self.block_id = block_id
@@ -250,37 +245,37 @@ class Manager:
250
245
  self.worker_count = min(len(self.available_accelerators), self.worker_count)
251
246
  logger.info("Manager will spawn {} workers".format(self.worker_count))
252
247
 
253
- def create_reg_message(self):
248
+ def create_reg_message(self) -> dict:
254
249
  """ Creates a registration message to identify the worker to the interchange
255
250
  """
256
- msg = {'type': 'registration',
257
- 'parsl_v': PARSL_VERSION,
258
- 'python_v': "{}.{}.{}".format(sys.version_info.major,
259
- sys.version_info.minor,
260
- sys.version_info.micro),
261
- 'packages': {dist.metadata['Name']: dist.version for dist in distributions()},
262
- 'worker_count': self.worker_count,
263
- 'uid': self.uid,
264
- 'block_id': self.block_id,
265
- 'start_time': self.start_time,
266
- 'prefetch_capacity': self.prefetch_capacity,
267
- 'max_capacity': self.worker_count + self.prefetch_capacity,
268
- 'os': platform.system(),
269
- 'hostname': platform.node(),
270
- 'dir': os.getcwd(),
271
- 'cpu_count': psutil.cpu_count(logical=False),
272
- 'total_memory': psutil.virtual_memory().total,
273
- }
274
- b_msg = json.dumps(msg).encode('utf-8')
275
- return b_msg
251
+ return {
252
+ 'type': 'registration',
253
+ 'parsl_v': PARSL_VERSION,
254
+ 'python_v': "{}.{}.{}".format(
255
+ sys.version_info.major,
256
+ sys.version_info.minor,
257
+ sys.version_info.micro
258
+ ),
259
+ 'packages': {d.metadata['Name']: d.version for d in distributions()},
260
+ 'worker_count': self.worker_count,
261
+ 'uid': self.uid,
262
+ 'block_id': self.block_id,
263
+ 'start_time': self.start_time,
264
+ 'prefetch_capacity': self.prefetch_capacity,
265
+ 'max_capacity': self.worker_count + self.prefetch_capacity,
266
+ 'os': platform.system(),
267
+ 'hostname': platform.node(),
268
+ 'dir': os.getcwd(),
269
+ 'cpu_count': psutil.cpu_count(logical=False),
270
+ 'total_memory': psutil.virtual_memory().total,
271
+ }
276
272
 
277
273
  @staticmethod
278
274
  def heartbeat_to_incoming(task_incoming: zmq.Socket) -> None:
279
275
  """ Send heartbeat to the incoming task queue
280
276
  """
281
- msg = {'type': 'heartbeat'}
282
277
  # don't need to dumps and encode this every time - could do as a global on import?
283
- b_msg = json.dumps(msg).encode('utf-8')
278
+ b_msg = pickle.dumps({'type': 'heartbeat'})
284
279
  task_incoming.send(b_msg)
285
280
  logger.debug("Sent heartbeat")
286
281
 
@@ -289,32 +284,38 @@ class Manager:
289
284
  """ Send heartbeat to the incoming task queue
290
285
  """
291
286
  msg = {'type': 'drain'}
292
- b_msg = json.dumps(msg).encode('utf-8')
287
+ b_msg = pickle.dumps(msg)
293
288
  task_incoming.send(b_msg)
294
289
  logger.debug("Sent drain")
295
290
 
296
291
  @wrap_with_logs
297
- def pull_tasks(self):
292
+ def interchange_communicator(self, pair_setup: threading.Event):
298
293
  """ Pull tasks from the incoming tasks zmq pipe onto the internal
299
294
  pending task queue
300
295
  """
301
296
  logger.info("starting")
302
297
 
298
+ results_sock = self.zmq_context.socket(zmq.PAIR)
299
+ results_sock.setsockopt(zmq.LINGER, 0)
300
+ results_sock.bind("inproc://results")
301
+ pair_setup.set()
302
+
303
303
  # Linger is set to 0, so that the manager can exit even when there might be
304
304
  # messages in the pipe
305
- task_incoming = self.zmq_context.socket(zmq.DEALER)
306
- task_incoming.setsockopt(zmq.IDENTITY, self.uid.encode('utf-8'))
307
- task_incoming.setsockopt(zmq.LINGER, 0)
308
- task_incoming.connect(self._task_q_url)
305
+ ix_sock = self.zmq_context.socket(zmq.DEALER)
306
+ ix_sock.setsockopt(zmq.IDENTITY, self.uid.encode('utf-8'))
307
+ ix_sock.setsockopt(zmq.LINGER, 0)
308
+ ix_sock.connect(self._ix_url)
309
309
  logger.info("Manager task pipe connected to interchange")
310
310
 
311
311
  poller = zmq.Poller()
312
- poller.register(task_incoming, zmq.POLLIN)
312
+ poller.register(results_sock, zmq.POLLIN)
313
+ poller.register(ix_sock, zmq.POLLIN)
313
314
 
314
315
  # Send a registration message
315
316
  msg = self.create_reg_message()
316
- logger.debug("Sending registration message: {}".format(msg))
317
- task_incoming.send(msg)
317
+ logger.debug("Sending registration message: %s", msg)
318
+ ix_sock.send(pickle.dumps(msg))
318
319
  last_beat = time.time()
319
320
  last_interchange_contact = time.time()
320
321
  task_recv_counter = 0
@@ -335,18 +336,21 @@ class Manager:
335
336
  pending_task_count = self.pending_task_queue.qsize()
336
337
  except NotImplementedError:
337
338
  # Ref: https://github.com/python/cpython/blob/6d5e0dc0e330f4009e8dc3d1642e46b129788877/Lib/multiprocessing/queues.py#L125
338
- pending_task_count = f"pending task count is not available on {platform.system()}"
339
+ pending_task_count = f"pending task count is not available on {platform.system()}" # type: ignore[assignment]
339
340
 
340
- logger.debug("ready workers: {}, pending tasks: {}".format(self.ready_worker_count.value,
341
- pending_task_count))
341
+ logger.debug(
342
+ 'ready workers: %d, pending tasks: %d',
343
+ self.ready_worker_count.value, # type: ignore[attr-defined]
344
+ pending_task_count,
345
+ )
342
346
 
343
347
  if time.time() >= last_beat + self.heartbeat_period:
344
- self.heartbeat_to_incoming(task_incoming)
348
+ self.heartbeat_to_incoming(ix_sock)
345
349
  last_beat = time.time()
346
350
 
347
351
  if time.time() > self.drain_time:
348
352
  logger.info("Requesting drain")
349
- self.drain_to_incoming(task_incoming)
353
+ self.drain_to_incoming(ix_sock)
350
354
  # This will start the pool draining...
351
355
  # Drained exit behaviour does not happen here. It will be
352
356
  # driven by the interchange sending a DRAINED_CODE message.
@@ -358,8 +362,8 @@ class Manager:
358
362
  poll_duration_s = max(0, next_interesting_event_time - time.time())
359
363
  socks = dict(poller.poll(timeout=poll_duration_s * 1000))
360
364
 
361
- if socks.get(task_incoming) == zmq.POLLIN:
362
- _, pkl_msg = task_incoming.recv_multipart()
365
+ if socks.get(ix_sock) == zmq.POLLIN:
366
+ pkl_msg = ix_sock.recv()
363
367
  tasks = pickle.loads(pkl_msg)
364
368
  last_interchange_contact = time.time()
365
369
 
@@ -377,6 +381,11 @@ class Manager:
377
381
  for task in tasks:
378
382
  self.task_scheduler.put_task(task)
379
383
 
384
+ elif socks.get(results_sock) == zmq.POLLIN:
385
+ meta_b = pickle.dumps({'type': 'result'})
386
+ ix_sock.send_multipart([meta_b, results_sock.recv()])
387
+ logger.debug("Result sent to interchange")
388
+
380
389
  else:
381
390
  logger.debug("No incoming tasks")
382
391
 
@@ -387,45 +396,36 @@ class Manager:
387
396
  logger.critical("Exiting")
388
397
  break
389
398
 
390
- task_incoming.close()
399
+ ix_sock.close()
391
400
  logger.info("Exiting")
392
401
 
393
402
  @wrap_with_logs
394
- def push_results(self):
395
- """ Listens on the pending_result_queue and sends out results via zmq
403
+ def ferry_result(self, may_connect: threading.Event):
404
+ """ Listens on the pending_result_queue and ferries results to the interchange
405
+ connected thread
396
406
  """
397
- logger.debug("Starting result push thread")
407
+ logger.debug("Begin")
398
408
 
399
409
  # Linger is set to 0, so that the manager can exit even when there might be
400
410
  # messages in the pipe
401
- result_outgoing = self.zmq_context.socket(zmq.DEALER)
402
- result_outgoing.setsockopt(zmq.IDENTITY, self.uid.encode('utf-8'))
403
- result_outgoing.setsockopt(zmq.LINGER, 0)
404
- result_outgoing.connect(self._result_q_url)
405
- logger.info("Manager result pipe connected to interchange")
411
+ notify_sock = self.zmq_context.socket(zmq.PAIR)
412
+ notify_sock.setsockopt(zmq.LINGER, 0)
413
+ may_connect.wait()
414
+ notify_sock.connect("inproc://results")
406
415
 
407
416
  while not self._stop_event.is_set():
408
- logger.debug("Starting pending_result_queue get")
409
417
  try:
410
418
  r = self.task_scheduler.get_result()
411
419
  if r is None:
412
420
  continue
413
- logger.debug("Result received from worker: %s", id(r))
414
- result_outgoing.send(r)
415
- logger.debug("Result sent to interchange: %s", id(r))
421
+ logger.debug("Result received from worker")
422
+ notify_sock.send(r)
416
423
  except Exception:
417
424
  logger.exception("Failed to send result to interchange")
418
425
 
419
- result_outgoing.close()
426
+ notify_sock.close()
420
427
  logger.debug("Exiting")
421
428
 
422
- @wrap_with_logs
423
- def heartbeater(self):
424
- while not self._stop_event.wait(self.heartbeat_period):
425
- heartbeat_message = f"heartbeat_period={self.heartbeat_period} seconds"
426
- logger.info(f"Sending heartbeat via results connection: {heartbeat_message}")
427
- self.pending_result_queue.put(pickle.dumps({'type': 'heartbeat'}))
428
-
429
429
  def worker_watchdog(self, procs: dict[int, SpawnProcess]):
430
430
  """Keeps workers alive."""
431
431
  logger.debug("Starting worker watchdog")
@@ -492,23 +492,26 @@ class Manager:
492
492
 
493
493
  logger.debug("Workers started")
494
494
 
495
- thr_task_puller = threading.Thread(target=self.pull_tasks, name="Task-Puller")
496
- thr_result_pusher = threading.Thread(
497
- target=self.push_results, name="Result-Pusher"
495
+ pair_setup = threading.Event()
496
+
497
+ thr_task_puller = threading.Thread(
498
+ target=self.interchange_communicator,
499
+ args=(pair_setup,),
500
+ name="Interchange-Communicator",
498
501
  )
502
+ thr_result_ferry = threading.Thread(
503
+ target=self.ferry_result, args=(pair_setup,), name="Result-Shovel")
499
504
  thr_worker_watchdog = threading.Thread(
500
505
  target=self.worker_watchdog, args=(procs,), name="worker-watchdog"
501
506
  )
502
507
  thr_monitoring_handler = threading.Thread(
503
508
  target=self.handle_monitoring_messages, name="Monitoring-Handler"
504
509
  )
505
- thr_heartbeater = threading.Thread(target=self.heartbeater, name="Heartbeater")
506
510
 
507
511
  thr_task_puller.start()
508
- thr_result_pusher.start()
512
+ thr_result_ferry.start()
509
513
  thr_worker_watchdog.start()
510
514
  thr_monitoring_handler.start()
511
- thr_heartbeater.start()
512
515
 
513
516
  logger.info("Manager threads started")
514
517
 
@@ -520,11 +523,10 @@ class Manager:
520
523
  self.monitoring_queue.put(None)
521
524
  self.pending_result_queue.put(None)
522
525
 
523
- thr_heartbeater.join()
524
- thr_task_puller.join()
525
- thr_result_pusher.join()
526
- thr_worker_watchdog.join()
527
526
  thr_monitoring_handler.join()
527
+ thr_worker_watchdog.join()
528
+ thr_result_ferry.join()
529
+ thr_task_puller.join()
528
530
 
529
531
  for worker_id in procs:
530
532
  p = procs[worker_id]
@@ -862,10 +864,10 @@ def get_arg_parser() -> argparse.ArgumentParser:
862
864
  help="GB of memory assigned to each worker process. Default=0, no assignment",
863
865
  )
864
866
  parser.add_argument(
865
- "-t",
866
- "--task_port",
867
+ "-P",
868
+ "--port",
867
869
  required=True,
868
- help="Task port for receiving tasks from the interchange",
870
+ help="Port for communication with the interchange",
869
871
  )
870
872
  parser.add_argument(
871
873
  "--max_workers_per_node",
@@ -901,12 +903,6 @@ def get_arg_parser() -> argparse.ArgumentParser:
901
903
  parser.add_argument(
902
904
  "--poll", default=10, help="Poll period used in milliseconds"
903
905
  )
904
- parser.add_argument(
905
- "-r",
906
- "--result_port",
907
- required=True,
908
- help="Result port for posting results to the interchange",
909
- )
910
906
  parser.add_argument(
911
907
  "--cpu-affinity",
912
908
  type=strategyorlist,
@@ -953,8 +949,7 @@ if __name__ == "__main__":
953
949
  f"\n Block ID: {args.block_id}"
954
950
  f"\n cores_per_worker: {args.cores_per_worker}"
955
951
  f"\n mem_per_worker: {args.mem_per_worker}"
956
- f"\n task_port: {args.task_port}"
957
- f"\n result_port: {args.result_port}"
952
+ f"\n Interchange port: {args.port}"
958
953
  f"\n addresses: {args.addresses}"
959
954
  f"\n max_workers_per_node: {args.max_workers_per_node}"
960
955
  f"\n poll_period: {args.poll}"
@@ -969,8 +964,7 @@ if __name__ == "__main__":
969
964
  f"\n mpi_launcher: {args.mpi_launcher}"
970
965
  )
971
966
  try:
972
- manager = Manager(task_port=args.task_port,
973
- result_port=args.result_port,
967
+ manager = Manager(port=args.port,
974
968
  addresses=args.addresses,
975
969
  address_probe_timeout=int(args.address_probe_timeout),
976
970
  uid=args.uid,
@@ -601,6 +601,9 @@ class RadicalPilotExecutor(ParslExecutor, RepresentationMixin):
601
601
  self._bulk_thread.join()
602
602
 
603
603
  self.session.close(download=True)
604
+
605
+ super().shutdown()
606
+
604
607
  logger.info("RadicalPilotExecutor is terminated.")
605
608
 
606
609
  return True
@@ -40,6 +40,8 @@ from parsl.executors.taskvine.factory_config import TaskVineFactoryConfig
40
40
  from parsl.executors.taskvine.manager import _taskvine_submit_wait
41
41
  from parsl.executors.taskvine.manager_config import TaskVineManagerConfig
42
42
  from parsl.executors.taskvine.utils import ParslFileToVine, ParslTaskToVine
43
+ from parsl.monitoring.radios.base import RadioConfig
44
+ from parsl.monitoring.radios.filesystem import FilesystemRadio
43
45
  from parsl.multiprocessing import SpawnContext
44
46
  from parsl.process_loggers import wrap_with_logs
45
47
  from parsl.providers import CondorProvider, LocalProvider
@@ -98,8 +100,6 @@ class TaskVineExecutor(BlockProviderExecutor, putils.RepresentationMixin):
98
100
  Default is None.
99
101
  """
100
102
 
101
- radio_mode = "filesystem"
102
-
103
103
  @typeguard.typechecked
104
104
  def __init__(self,
105
105
  label: str = "TaskVineExecutor",
@@ -108,7 +108,8 @@ class TaskVineExecutor(BlockProviderExecutor, putils.RepresentationMixin):
108
108
  manager_config: TaskVineManagerConfig = TaskVineManagerConfig(),
109
109
  factory_config: TaskVineFactoryConfig = TaskVineFactoryConfig(),
110
110
  provider: Optional[ExecutionProvider] = LocalProvider(init_blocks=1),
111
- storage_access: Optional[List[Staging]] = None):
111
+ storage_access: Optional[List[Staging]] = None,
112
+ remote_monitoring_radio: Optional[RadioConfig] = None):
112
113
 
113
114
  # Set worker launch option for this executor
114
115
  if worker_launch_method == 'factory' or worker_launch_method == 'manual':
@@ -134,6 +135,11 @@ class TaskVineExecutor(BlockProviderExecutor, putils.RepresentationMixin):
134
135
  self.factory_config = factory_config
135
136
  self.storage_access = storage_access
136
137
 
138
+ if remote_monitoring_radio is not None:
139
+ self.remote_monitoring_radio = remote_monitoring_radio
140
+ else:
141
+ self.remote_monitoring_radio = FilesystemRadio()
142
+
137
143
  # Queue to send ready tasks from TaskVine executor process to TaskVine manager process
138
144
  self._ready_task_queue: multiprocessing.Queue = SpawnContext.Queue()
139
145
 
@@ -601,6 +607,8 @@ class TaskVineExecutor(BlockProviderExecutor, putils.RepresentationMixin):
601
607
  self._finished_task_queue.close()
602
608
  self._finished_task_queue.join_thread()
603
609
 
610
+ super().shutdown()
611
+
604
612
  logger.debug("TaskVine shutdown completed")
605
613
 
606
614
  @wrap_with_logs
@@ -6,6 +6,7 @@ import shutil
6
6
  import subprocess
7
7
  import uuid
8
8
 
9
+ import parsl
9
10
  from parsl.executors.taskvine import exec_parsl_function
10
11
  from parsl.executors.taskvine.utils import VineTaskToParsl, run_parsl_function
11
12
  from parsl.process_loggers import wrap_with_logs
@@ -255,7 +256,8 @@ def _taskvine_submit_wait(ready_task_queue=None,
255
256
  run_parsl_function,
256
257
  poncho_env=poncho_env_path,
257
258
  init_command=manager_config.init_command,
258
- add_env=add_env)
259
+ add_env=add_env,
260
+ hoisting_modules=[parsl.serialize, run_parsl_function])
259
261
 
260
262
  # Configure the library if provided
261
263
  if manager_config.library_config:
@@ -7,6 +7,8 @@ import typeguard
7
7
  from parsl.data_provider.staging import Staging
8
8
  from parsl.executors.base import ParslExecutor
9
9
  from parsl.executors.errors import InvalidResourceSpecification
10
+ from parsl.monitoring.radios.base import RadioConfig
11
+ from parsl.monitoring.radios.multiprocessing import MultiprocessingQueueRadio
10
12
  from parsl.utils import RepresentationMixin
11
13
 
12
14
  logger = logging.getLogger(__name__)
@@ -28,7 +30,7 @@ class ThreadPoolExecutor(ParslExecutor, RepresentationMixin):
28
30
  @typeguard.typechecked
29
31
  def __init__(self, label: str = 'threads', max_threads: Optional[int] = 2,
30
32
  thread_name_prefix: str = '', storage_access: Optional[List[Staging]] = None,
31
- working_dir: Optional[str] = None):
33
+ working_dir: Optional[str] = None, remote_monitoring_radio: Optional[RadioConfig] = None):
32
34
  ParslExecutor.__init__(self)
33
35
  self.label = label
34
36
  self.max_threads = max_threads
@@ -40,6 +42,11 @@ class ThreadPoolExecutor(ParslExecutor, RepresentationMixin):
40
42
  self.storage_access = storage_access
41
43
  self.working_dir = working_dir
42
44
 
45
+ if remote_monitoring_radio is not None:
46
+ self.remote_monitoring_radio = remote_monitoring_radio
47
+ else:
48
+ self.remote_monitoring_radio = MultiprocessingQueueRadio()
49
+
43
50
  def start(self):
44
51
  self.executor = cf.ThreadPoolExecutor(max_workers=self.max_threads,
45
52
  thread_name_prefix=self.thread_name_prefix)
@@ -73,9 +80,18 @@ class ThreadPoolExecutor(ParslExecutor, RepresentationMixin):
73
80
  """
74
81
  logger.debug("Shutting down executor, which involves waiting for running tasks to complete")
75
82
  self.executor.shutdown(wait=block)
83
+
84
+ super().shutdown()
85
+
76
86
  logger.debug("Done with executor shutdown")
77
87
 
78
88
  def monitor_resources(self):
79
- """Resource monitoring sometimes deadlocks when using threads, so this function
80
- returns false to disable it."""
89
+ """Resource monitoring does not make sense when using the
90
+ ThreadPoolExecutor, as there is no per-task process tree: all tasks
91
+ run inside the same single submitting process.
92
+
93
+ In addition, the use of fork-based multiprocessing in the remote
94
+ wrapper in parsl/monitoring/remote.py was especially prone to deadlock
95
+ with this executor.
96
+ """
81
97
  return False
@@ -31,6 +31,8 @@ from parsl.errors import OptionalModuleMissing
31
31
  from parsl.executors.errors import ExecutorError, InvalidResourceSpecification
32
32
  from parsl.executors.status_handling import BlockProviderExecutor
33
33
  from parsl.executors.workqueue import exec_parsl_function
34
+ from parsl.monitoring.radios.base import RadioConfig
35
+ from parsl.monitoring.radios.filesystem import FilesystemRadio
34
36
  from parsl.multiprocessing import SpawnContext, SpawnProcess
35
37
  from parsl.process_loggers import wrap_with_logs
36
38
  from parsl.providers import CondorProvider, LocalProvider
@@ -227,8 +229,6 @@ class WorkQueueExecutor(BlockProviderExecutor, putils.RepresentationMixin):
227
229
  specifiation for each task).
228
230
  """
229
231
 
230
- radio_mode = "filesystem"
231
-
232
232
  @typeguard.typechecked
233
233
  def __init__(self,
234
234
  label: str = "WorkQueueExecutor",
@@ -255,7 +255,8 @@ class WorkQueueExecutor(BlockProviderExecutor, putils.RepresentationMixin):
255
255
  worker_executable: str = 'work_queue_worker',
256
256
  function_dir: Optional[str] = None,
257
257
  coprocess: bool = False,
258
- scaling_cores_per_worker: int = 1):
258
+ scaling_cores_per_worker: int = 1,
259
+ remote_monitoring_radio: Optional[RadioConfig] = None):
259
260
  BlockProviderExecutor.__init__(self, provider=provider,
260
261
  block_error_handler=True)
261
262
  if not _work_queue_enabled:
@@ -308,6 +309,11 @@ class WorkQueueExecutor(BlockProviderExecutor, putils.RepresentationMixin):
308
309
  if self.init_command != "":
309
310
  self.launch_cmd = self.init_command + "; " + self.launch_cmd
310
311
 
312
+ if remote_monitoring_radio is not None:
313
+ self.remote_monitoring_radio = remote_monitoring_radio
314
+ else:
315
+ self.remote_monitoring_radio = FilesystemRadio()
316
+
311
317
  def _get_launch_command(self, block_id):
312
318
  # this executor uses different terminology for worker/launch
313
319
  # commands than in htex
@@ -714,6 +720,8 @@ class WorkQueueExecutor(BlockProviderExecutor, putils.RepresentationMixin):
714
720
  self.collector_queue.close()
715
721
  self.collector_queue.join_thread()
716
722
 
723
+ super().shutdown()
724
+
717
725
  logger.debug("Work Queue shutdown completed")
718
726
 
719
727
  @wrap_with_logs
@@ -1,11 +1,11 @@
1
1
  from parsl.errors import ParslError
2
2
 
3
3
 
4
- class MonitoringHubStartError(ParslError):
4
+ class MonitoringRouterStartError(ParslError):
5
5
  def __str__(self) -> str:
6
- return "Hub failed to start"
6
+ return "Monitoring router failed to start"
7
7
 
8
8
 
9
- class MonitoringRouterStartError(ParslError):
9
+ class RadioRequiredError(ParslError):
10
10
  def __str__(self) -> str:
11
- return "Monitoring router failed to start"
11
+ return "A radio must be configured for remote task monitoring"