parsl 2024.7.29__py3-none-any.whl → 2024.8.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -5,6 +5,7 @@ import os
5
5
  import pickle
6
6
  import queue
7
7
  import socket
8
+ import threading
8
9
  import time
9
10
  from multiprocessing.synchronize import Event
10
11
  from typing import Optional, Tuple, Union
@@ -32,7 +33,12 @@ class MonitoringRouter:
32
33
  logdir: str = ".",
33
34
  run_id: str,
34
35
  logging_level: int = logging.INFO,
35
- atexit_timeout: int = 3 # in seconds
36
+ atexit_timeout: int = 3, # in seconds
37
+ priority_msgs: "queue.Queue[AddressedMonitoringMessage]",
38
+ node_msgs: "queue.Queue[AddressedMonitoringMessage]",
39
+ block_msgs: "queue.Queue[AddressedMonitoringMessage]",
40
+ resource_msgs: "queue.Queue[AddressedMonitoringMessage]",
41
+ exit_event: Event,
36
42
  ):
37
43
  """ Initializes a monitoring configuration class.
38
44
 
@@ -51,7 +57,11 @@ class MonitoringRouter:
51
57
  Logging level as defined in the logging module. Default: logging.INFO
52
58
  atexit_timeout : int, optional
53
59
  The amount of time in seconds to terminate the hub without receiving any messages, after the last dfk workflow message is received.
60
+ *_msgs : Queue
61
+ Four multiprocessing queues to receive messages, routed by type tag, and sometimes modified according to type tag.
54
62
 
63
+ exit_event : Event
64
+ An event that the main Parsl process will set to signal that the monitoring router should shut down.
55
65
  """
56
66
  os.makedirs(logdir, exist_ok=True)
57
67
  self.logger = set_file_logger("{}/monitoring_router.log".format(logdir),
@@ -93,22 +103,60 @@ class MonitoringRouter:
93
103
  min_port=zmq_port_range[0],
94
104
  max_port=zmq_port_range[1])
95
105
 
96
- def start(self,
97
- priority_msgs: "queue.Queue[AddressedMonitoringMessage]",
98
- node_msgs: "queue.Queue[AddressedMonitoringMessage]",
99
- block_msgs: "queue.Queue[AddressedMonitoringMessage]",
100
- resource_msgs: "queue.Queue[AddressedMonitoringMessage]",
101
- exit_event: Event) -> None:
106
+ self.priority_msgs = priority_msgs
107
+ self.node_msgs = node_msgs
108
+ self.block_msgs = block_msgs
109
+ self.resource_msgs = resource_msgs
110
+ self.exit_event = exit_event
111
+
112
+ @wrap_with_logs(target="monitoring_router")
113
+ def start(self) -> None:
114
+ self.logger.info("Starting UDP listener thread")
115
+ udp_radio_receiver_thread = threading.Thread(target=self.start_udp_listener, daemon=True)
116
+ udp_radio_receiver_thread.start()
117
+
118
+ self.logger.info("Starting ZMQ listener thread")
119
+ zmq_radio_receiver_thread = threading.Thread(target=self.start_zmq_listener, daemon=True)
120
+ zmq_radio_receiver_thread.start()
121
+
122
+ self.logger.info("Joining on ZMQ listener thread")
123
+ zmq_radio_receiver_thread.join()
124
+ self.logger.info("Joining on UDP listener thread")
125
+ udp_radio_receiver_thread.join()
126
+ self.logger.info("Joined on both ZMQ and UDP listener threads")
127
+
128
+ @wrap_with_logs(target="monitoring_router")
129
+ def start_udp_listener(self) -> None:
102
130
  try:
103
- while not exit_event.is_set():
131
+ while not self.exit_event.is_set():
104
132
  try:
105
133
  data, addr = self.udp_sock.recvfrom(2048)
106
134
  resource_msg = pickle.loads(data)
107
135
  self.logger.debug("Got UDP Message from {}: {}".format(addr, resource_msg))
108
- resource_msgs.put((resource_msg, addr))
136
+ self.resource_msgs.put((resource_msg, addr))
109
137
  except socket.timeout:
110
138
  pass
111
139
 
140
+ self.logger.info("UDP listener draining")
141
+ last_msg_received_time = time.time()
142
+ while time.time() - last_msg_received_time < self.atexit_timeout:
143
+ try:
144
+ data, addr = self.udp_sock.recvfrom(2048)
145
+ msg = pickle.loads(data)
146
+ self.logger.debug("Got UDP Message from {}: {}".format(addr, msg))
147
+ self.resource_msgs.put((msg, addr))
148
+ last_msg_received_time = time.time()
149
+ except socket.timeout:
150
+ pass
151
+
152
+ self.logger.info("UDP listener finishing normally")
153
+ finally:
154
+ self.logger.info("UDP listener finished")
155
+
156
+ @wrap_with_logs(target="monitoring_router")
157
+ def start_zmq_listener(self) -> None:
158
+ try:
159
+ while not self.exit_event.is_set():
112
160
  try:
113
161
  dfk_loop_start = time.time()
114
162
  while time.time() - dfk_loop_start < 1.0: # TODO make configurable
@@ -125,15 +173,15 @@ class MonitoringRouter:
125
173
 
126
174
  if msg[0] == MessageType.NODE_INFO:
127
175
  msg[1]['run_id'] = self.run_id
128
- node_msgs.put(msg_0)
176
+ self.node_msgs.put(msg_0)
129
177
  elif msg[0] == MessageType.RESOURCE_INFO:
130
- resource_msgs.put(msg_0)
178
+ self.resource_msgs.put(msg_0)
131
179
  elif msg[0] == MessageType.BLOCK_INFO:
132
- block_msgs.put(msg_0)
180
+ self.block_msgs.put(msg_0)
133
181
  elif msg[0] == MessageType.TASK_INFO:
134
- priority_msgs.put(msg_0)
182
+ self.priority_msgs.put(msg_0)
135
183
  elif msg[0] == MessageType.WORKFLOW_INFO:
136
- priority_msgs.put(msg_0)
184
+ self.priority_msgs.put(msg_0)
137
185
  else:
138
186
  # There is a type: ignore here because if msg[0]
139
187
  # is of the correct type, this code is unreachable,
@@ -151,21 +199,9 @@ class MonitoringRouter:
151
199
  # thing to do.
152
200
  self.logger.warning("Failure processing a ZMQ message", exc_info=True)
153
201
 
154
- self.logger.info("Monitoring router draining")
155
- last_msg_received_time = time.time()
156
- while time.time() - last_msg_received_time < self.atexit_timeout:
157
- try:
158
- data, addr = self.udp_sock.recvfrom(2048)
159
- msg = pickle.loads(data)
160
- self.logger.debug("Got UDP Message from {}: {}".format(addr, msg))
161
- resource_msgs.put((msg, addr))
162
- last_msg_received_time = time.time()
163
- except socket.timeout:
164
- pass
165
-
166
- self.logger.info("Monitoring router finishing normally")
202
+ self.logger.info("ZMQ listener finishing normally")
167
203
  finally:
168
- self.logger.info("Monitoring router finished")
204
+ self.logger.info("ZMQ listener finished")
169
205
 
170
206
 
171
207
  @wrap_with_logs
@@ -191,7 +227,12 @@ def router_starter(comm_q: "queue.Queue[Union[Tuple[int, int], str]]",
191
227
  zmq_port_range=zmq_port_range,
192
228
  logdir=logdir,
193
229
  logging_level=logging_level,
194
- run_id=run_id)
230
+ run_id=run_id,
231
+ priority_msgs=priority_msgs,
232
+ node_msgs=node_msgs,
233
+ block_msgs=block_msgs,
234
+ resource_msgs=resource_msgs,
235
+ exit_event=exit_event)
195
236
  except Exception as e:
196
237
  logger.error("MonitoringRouter construction failed.", exc_info=True)
197
238
  comm_q.put(f"Monitoring router construction failed: {e}")
@@ -200,7 +241,7 @@ def router_starter(comm_q: "queue.Queue[Union[Tuple[int, int], str]]",
200
241
 
201
242
  router.logger.info("Starting MonitoringRouter in router_starter")
202
243
  try:
203
- router.start(priority_msgs, node_msgs, block_msgs, resource_msgs, exit_event)
244
+ router.start()
204
245
  except Exception as e:
205
246
  router.logger.exception("router.start exception")
206
247
  exception_q.put(('Hub', str(e)))
@@ -0,0 +1,71 @@
1
+ import logging
2
+
3
+ import pytest
4
+
5
+ import parsl
6
+ from parsl import Config
7
+ from parsl.executors import HighThroughputExecutor
8
+ from parsl.executors.errors import BadStateException
9
+ from parsl.jobs.states import JobState, JobStatus
10
+ from parsl.providers import LocalProvider
11
+
12
+
13
+ class FailingProvider(LocalProvider):
14
+ def submit(*args, **kwargs):
15
+ raise RuntimeError("Deliberate failure of provider.submit")
16
+
17
+
18
+ def local_config():
19
+ """Config to simulate failing blocks without connecting"""
20
+ return Config(
21
+ executors=[
22
+ HighThroughputExecutor(
23
+ label="HTEX",
24
+ heartbeat_period=1,
25
+ heartbeat_threshold=2,
26
+ poll_period=100,
27
+ max_workers_per_node=1,
28
+ provider=FailingProvider(
29
+ init_blocks=0,
30
+ max_blocks=2,
31
+ min_blocks=0,
32
+ ),
33
+ )
34
+ ],
35
+ max_idletime=0.5,
36
+ strategy='htex_auto_scale',
37
+ strategy_period=0.1
38
+ # this strategy period needs to be a few times smaller than the
39
+ # status_polling_interval of FailingProvider, which is 5s at
40
+ # time of writing
41
+ )
42
+
43
+
44
+ @parsl.python_app
45
+ def double(x):
46
+ return x * 2
47
+
48
+
49
+ @pytest.mark.local
50
+ def test_disconnected_blocks():
51
+ """Test reporting of blocks that fail to connect from HTEX"""
52
+ dfk = parsl.dfk()
53
+ executor = dfk.executors["HTEX"]
54
+
55
+ connected_blocks = executor.connected_blocks()
56
+ assert not connected_blocks, "Expected 0 blocks"
57
+
58
+ future = double(5)
59
+ with pytest.raises(BadStateException):
60
+ future.result()
61
+
62
+ assert isinstance(future.exception(), BadStateException)
63
+
64
+ status_dict = executor.status()
65
+ assert len(status_dict) == 1, "Expected exactly 1 block"
66
+ for status in status_dict.values():
67
+ assert isinstance(status, JobStatus)
68
+ assert status.state == JobState.MISSING
69
+
70
+ connected_blocks = executor.connected_blocks()
71
+ assert connected_blocks == [], "Expected exactly 0 connected blocks"
@@ -1,6 +1,7 @@
1
+ import logging
1
2
  import pathlib
2
- import warnings
3
3
  from subprocess import Popen, TimeoutExpired
4
+ from typing import Optional, Sequence
4
5
  from unittest import mock
5
6
 
6
7
  import pytest
@@ -71,12 +72,11 @@ def test_htex_start_encrypted(
71
72
  @pytest.mark.local
72
73
  @pytest.mark.parametrize("started", (True, False))
73
74
  @pytest.mark.parametrize("timeout_expires", (True, False))
74
- @mock.patch(f"{_MOCK_BASE}.logger")
75
75
  def test_htex_shutdown(
76
- mock_logger: mock.MagicMock,
77
76
  started: bool,
78
77
  timeout_expires: bool,
79
78
  htex: HighThroughputExecutor,
79
+ caplog
80
80
  ):
81
81
  mock_ix_proc = mock.Mock(spec=Popen)
82
82
 
@@ -108,22 +108,22 @@ def test_htex_shutdown(
108
108
 
109
109
  mock_ix_proc.terminate.side_effect = kill_interchange
110
110
 
111
- htex.shutdown()
111
+ with caplog.at_level(logging.INFO):
112
+ htex.shutdown()
112
113
 
113
- mock_logs = mock_logger.info.call_args_list
114
114
  if started:
115
115
  assert mock_ix_proc.terminate.called
116
116
  assert mock_ix_proc.wait.called
117
117
  assert {"timeout": 10} == mock_ix_proc.wait.call_args[1]
118
118
  if timeout_expires:
119
- assert "Unable to terminate Interchange" in mock_logs[1][0][0]
119
+ assert "Unable to terminate Interchange" in caplog.text
120
120
  assert mock_ix_proc.kill.called
121
- assert "Attempting" in mock_logs[0][0][0]
122
- assert "Finished" in mock_logs[-1][0][0]
121
+ assert "Attempting HighThroughputExecutor shutdown" in caplog.text
122
+ assert "Finished HighThroughputExecutor shutdown" in caplog.text
123
123
  else:
124
124
  assert not mock_ix_proc.terminate.called
125
125
  assert not mock_ix_proc.wait.called
126
- assert "has not started" in mock_logs[0][0][0]
126
+ assert "HighThroughputExecutor has not started" in caplog.text
127
127
 
128
128
 
129
129
  @pytest.mark.local
@@ -139,13 +139,22 @@ def test_max_workers_per_node():
139
139
 
140
140
 
141
141
  @pytest.mark.local
142
- def test_htex_launch_cmd():
143
- htex = HighThroughputExecutor()
144
- assert htex.launch_cmd.startswith("process_worker_pool.py")
145
- assert htex.interchange_launch_cmd == "interchange.py"
146
-
147
- launch_cmd = "custom-launch-cmd"
148
- ix_launch_cmd = "custom-ix-launch-cmd"
149
- htex = HighThroughputExecutor(launch_cmd=launch_cmd, interchange_launch_cmd=ix_launch_cmd)
150
- assert htex.launch_cmd == launch_cmd
151
- assert htex.interchange_launch_cmd == ix_launch_cmd
142
+ @pytest.mark.parametrize("cmd", (None, "custom-launch-cmd"))
143
+ def test_htex_worker_pool_launch_cmd(cmd: Optional[str]):
144
+ if cmd:
145
+ htex = HighThroughputExecutor(launch_cmd=cmd)
146
+ assert htex.launch_cmd == cmd
147
+ else:
148
+ htex = HighThroughputExecutor()
149
+ assert htex.launch_cmd.startswith("process_worker_pool.py")
150
+
151
+
152
+ @pytest.mark.local
153
+ @pytest.mark.parametrize("cmd", (None, ["custom", "launch", "cmd"]))
154
+ def test_htex_interchange_launch_cmd(cmd: Optional[Sequence[str]]):
155
+ if cmd:
156
+ htex = HighThroughputExecutor(interchange_launch_cmd=cmd)
157
+ assert htex.interchange_launch_cmd == cmd
158
+ else:
159
+ htex = HighThroughputExecutor()
160
+ assert htex.interchange_launch_cmd == ["interchange.py"]
@@ -9,6 +9,7 @@ import zmq
9
9
 
10
10
  from parsl import curvezmq
11
11
  from parsl.executors.high_throughput.interchange import Interchange
12
+ from parsl.executors.high_throughput.manager_selector import RandomManagerSelector
12
13
 
13
14
 
14
15
  def make_interchange(*, interchange_address: Optional[str], cert_dir: Optional[str]) -> Interchange:
@@ -23,6 +24,7 @@ def make_interchange(*, interchange_address: Optional[str], cert_dir: Optional[s
23
24
  heartbeat_threshold=60,
24
25
  logdir=".",
25
26
  logging_level=logging.INFO,
27
+ manager_selector=RandomManagerSelector(),
26
28
  poll_period=10)
27
29
 
28
30
 
@@ -25,10 +25,23 @@ def this_app():
25
25
  # a configuration that is suitably configured for monitoring.
26
26
 
27
27
  def htex_config():
28
+ """This config will use htex's default htex-specific monitoring radio mode"""
28
29
  from parsl.tests.configs.htex_local_alternate import fresh_config
29
30
  return fresh_config()
30
31
 
31
32
 
33
+ def htex_udp_config():
34
+ """This config will force UDP"""
35
+ from parsl.tests.configs.htex_local_alternate import fresh_config
36
+ c = fresh_config()
37
+ assert len(c.executors) == 1
38
+
39
+ assert c.executors[0].radio_mode == "htex", "precondition: htex has a radio mode attribute, configured for htex radio"
40
+ c.executors[0].radio_mode = "udp"
41
+
42
+ return c
43
+
44
+
32
45
  def workqueue_config():
33
46
  from parsl.tests.configs.workqueue_ex import fresh_config
34
47
  c = fresh_config()
@@ -48,7 +61,7 @@ def taskvine_config():
48
61
 
49
62
 
50
63
  @pytest.mark.local
51
- @pytest.mark.parametrize("fresh_config", [htex_config, workqueue_config, taskvine_config])
64
+ @pytest.mark.parametrize("fresh_config", [htex_config, htex_udp_config, workqueue_config, taskvine_config])
52
65
  def test_row_counts(tmpd_cwd, fresh_config):
53
66
  # this is imported here rather than at module level because
54
67
  # it isn't available in a plain parsl install, so this module
@@ -44,7 +44,7 @@ def test_init():
44
44
 
45
45
  new_kwargs = {'max_workers_per_block'}
46
46
  excluded_kwargs = {'available_accelerators', 'enable_mpi_mode', 'cores_per_worker', 'max_workers_per_node',
47
- 'mem_per_worker', 'cpu_affinity', 'max_workers'}
47
+ 'mem_per_worker', 'cpu_affinity', 'max_workers', 'manager_selector'}
48
48
 
49
49
  # Get the kwargs from both HTEx and MPIEx
50
50
  htex_kwargs = set(signature(HighThroughputExecutor.__init__).parameters)
parsl/version.py CHANGED
@@ -3,4 +3,4 @@
3
3
  Year.Month.Day[alpha/beta/..]
4
4
  Alphas will be numbered like this -> 2024.12.10a0
5
5
  """
6
- VERSION = '2024.07.29'
6
+ VERSION = '2024.08.05'
@@ -6,7 +6,6 @@ import os
6
6
  import pickle
7
7
  import platform
8
8
  import queue
9
- import random
10
9
  import signal
11
10
  import sys
12
11
  import threading
@@ -19,7 +18,9 @@ from parsl import curvezmq
19
18
  from parsl.app.errors import RemoteExceptionWrapper
20
19
  from parsl.executors.high_throughput.errors import ManagerLost, VersionMismatch
21
20
  from parsl.executors.high_throughput.manager_record import ManagerRecord
21
+ from parsl.executors.high_throughput.manager_selector import ManagerSelector
22
22
  from parsl.monitoring.message_type import MessageType
23
+ from parsl.monitoring.radios import MonitoringRadioSender, ZMQRadioSender
23
24
  from parsl.process_loggers import wrap_with_logs
24
25
  from parsl.serialize import serialize as serialize_object
25
26
  from parsl.utils import setproctitle
@@ -53,6 +54,7 @@ class Interchange:
53
54
  logging_level: int,
54
55
  poll_period: int,
55
56
  cert_dir: Optional[str],
57
+ manager_selector: ManagerSelector,
56
58
  ) -> None:
57
59
  """
58
60
  Parameters
@@ -160,6 +162,8 @@ class Interchange:
160
162
 
161
163
  self.heartbeat_threshold = heartbeat_threshold
162
164
 
165
+ self.manager_selector = manager_selector
166
+
163
167
  self.current_platform = {'parsl_v': PARSL_VERSION,
164
168
  'python_v': "{}.{}.{}".format(sys.version_info.major,
165
169
  sys.version_info.minor,
@@ -216,27 +220,15 @@ class Interchange:
216
220
  task_counter += 1
217
221
  logger.debug(f"Fetched {task_counter} tasks so far")
218
222
 
219
- def _create_monitoring_channel(self) -> Optional[zmq.Socket]:
220
- if self.hub_address and self.hub_zmq_port:
221
- logger.info("Connecting to MonitoringHub")
222
- # This is a one-off because monitoring is unencrypted
223
- hub_channel = zmq.Context().socket(zmq.DEALER)
224
- hub_channel.set_hwm(0)
225
- hub_channel.connect("tcp://{}:{}".format(self.hub_address, self.hub_zmq_port))
226
- logger.info("Connected to MonitoringHub")
227
- return hub_channel
228
- else:
229
- return None
230
-
231
- def _send_monitoring_info(self, hub_channel: Optional[zmq.Socket], manager: ManagerRecord) -> None:
232
- if hub_channel:
223
+ def _send_monitoring_info(self, monitoring_radio: Optional[MonitoringRadioSender], manager: ManagerRecord) -> None:
224
+ if monitoring_radio:
233
225
  logger.info("Sending message {} to MonitoringHub".format(manager))
234
226
 
235
227
  d: Dict = cast(Dict, manager.copy())
236
228
  d['timestamp'] = datetime.datetime.now()
237
229
  d['last_heartbeat'] = datetime.datetime.fromtimestamp(d['last_heartbeat'])
238
230
 
239
- hub_channel.send_pyobj((MessageType.NODE_INFO, d))
231
+ monitoring_radio.send((MessageType.NODE_INFO, d))
240
232
 
241
233
  @wrap_with_logs(target="interchange")
242
234
  def _command_server(self) -> NoReturn:
@@ -244,8 +236,11 @@ class Interchange:
244
236
  """
245
237
  logger.debug("Command Server Starting")
246
238
 
247
- # Need to create a new ZMQ socket for command server thread
248
- hub_channel = self._create_monitoring_channel()
239
+ if self.hub_address is not None and self.hub_zmq_port is not None:
240
+ logger.debug("Creating monitoring radio to %s:%s", self.hub_address, self.hub_zmq_port)
241
+ monitoring_radio = ZMQRadioSender(self.hub_address, self.hub_zmq_port)
242
+ else:
243
+ monitoring_radio = None
249
244
 
250
245
  reply: Any # the type of reply depends on the command_req received (aka this needs dependent types...)
251
246
 
@@ -295,7 +290,7 @@ class Interchange:
295
290
  if manager_id in self._ready_managers:
296
291
  m = self._ready_managers[manager_id]
297
292
  m['active'] = False
298
- self._send_monitoring_info(hub_channel, m)
293
+ self._send_monitoring_info(monitoring_radio, m)
299
294
  else:
300
295
  logger.warning("Worker to hold was not in ready managers list")
301
296
 
@@ -330,9 +325,14 @@ class Interchange:
330
325
  # parent-process-inheritance problems.
331
326
  signal.signal(signal.SIGTERM, signal.SIG_DFL)
332
327
 
333
- logger.info("Incoming ports bound")
328
+ logger.info("Starting main interchange method")
334
329
 
335
- hub_channel = self._create_monitoring_channel()
330
+ if self.hub_address is not None and self.hub_zmq_port is not None:
331
+ logger.debug("Creating monitoring radio to %s:%s", self.hub_address, self.hub_zmq_port)
332
+ monitoring_radio = ZMQRadioSender(self.hub_address, self.hub_zmq_port)
333
+ logger.debug("Created monitoring radio")
334
+ else:
335
+ monitoring_radio = None
336
336
 
337
337
  poll_period = self.poll_period
338
338
 
@@ -363,10 +363,10 @@ class Interchange:
363
363
  while not kill_event.is_set():
364
364
  self.socks = dict(poller.poll(timeout=poll_period))
365
365
 
366
- self.process_task_outgoing_incoming(interesting_managers, hub_channel, kill_event)
367
- self.process_results_incoming(interesting_managers, hub_channel)
368
- self.expire_bad_managers(interesting_managers, hub_channel)
369
- self.expire_drained_managers(interesting_managers, hub_channel)
366
+ self.process_task_outgoing_incoming(interesting_managers, monitoring_radio, kill_event)
367
+ self.process_results_incoming(interesting_managers, monitoring_radio)
368
+ self.expire_bad_managers(interesting_managers, monitoring_radio)
369
+ self.expire_drained_managers(interesting_managers, monitoring_radio)
370
370
  self.process_tasks_to_send(interesting_managers)
371
371
 
372
372
  self.zmq_context.destroy()
@@ -377,7 +377,7 @@ class Interchange:
377
377
  def process_task_outgoing_incoming(
378
378
  self,
379
379
  interesting_managers: Set[bytes],
380
- hub_channel: Optional[zmq.Socket],
380
+ monitoring_radio: Optional[MonitoringRadioSender],
381
381
  kill_event: threading.Event
382
382
  ) -> None:
383
383
  """Process one message from manager on the task_outgoing channel.
@@ -431,7 +431,7 @@ class Interchange:
431
431
  m.update(msg) # type: ignore[typeddict-item]
432
432
 
433
433
  logger.info("Registration info for manager {!r}: {}".format(manager_id, msg))
434
- self._send_monitoring_info(hub_channel, m)
434
+ self._send_monitoring_info(monitoring_radio, m)
435
435
 
436
436
  if (msg['python_v'].rsplit(".", 1)[0] != self.current_platform['python_v'].rsplit(".", 1)[0] or
437
437
  msg['parsl_v'] != self.current_platform['parsl_v']):
@@ -462,7 +462,7 @@ class Interchange:
462
462
  logger.error(f"Unexpected message type received from manager: {msg['type']}")
463
463
  logger.debug("leaving task_outgoing section")
464
464
 
465
- def expire_drained_managers(self, interesting_managers: Set[bytes], hub_channel: Optional[zmq.Socket]) -> None:
465
+ def expire_drained_managers(self, interesting_managers: Set[bytes], monitoring_radio: Optional[MonitoringRadioSender]) -> None:
466
466
 
467
467
  for manager_id in list(interesting_managers):
468
468
  # is it always true that a draining manager will be in interesting managers?
@@ -475,7 +475,7 @@ class Interchange:
475
475
  self._ready_managers.pop(manager_id)
476
476
 
477
477
  m['active'] = False
478
- self._send_monitoring_info(hub_channel, m)
478
+ self._send_monitoring_info(monitoring_radio, m)
479
479
 
480
480
  def process_tasks_to_send(self, interesting_managers: Set[bytes]) -> None:
481
481
  # Check if there are tasks that could be sent to managers
@@ -485,8 +485,7 @@ class Interchange:
485
485
  interesting=len(interesting_managers)))
486
486
 
487
487
  if interesting_managers and not self.pending_task_queue.empty():
488
- shuffled_managers = list(interesting_managers)
489
- random.shuffle(shuffled_managers)
488
+ shuffled_managers = self.manager_selector.sort_managers(self._ready_managers, interesting_managers)
490
489
 
491
490
  while shuffled_managers and not self.pending_task_queue.empty(): # cf. the if statement above...
492
491
  manager_id = shuffled_managers.pop()
@@ -519,7 +518,7 @@ class Interchange:
519
518
  else:
520
519
  logger.debug("either no interesting managers or no tasks, so skipping manager pass")
521
520
 
522
- def process_results_incoming(self, interesting_managers: Set[bytes], hub_channel: Optional[zmq.Socket]) -> None:
521
+ def process_results_incoming(self, interesting_managers: Set[bytes], monitoring_radio: Optional[MonitoringRadioSender]) -> None:
523
522
  # Receive any results and forward to client
524
523
  if self.results_incoming in self.socks and self.socks[self.results_incoming] == zmq.POLLIN:
525
524
  logger.debug("entering results_incoming section")
@@ -539,11 +538,11 @@ class Interchange:
539
538
  elif r['type'] == 'monitoring':
540
539
  # the monitoring code makes the assumption that no
541
540
  # monitoring messages will be received if monitoring
542
- # is not configured, and that hub_channel will only
541
+ # is not configured, and that monitoring_radio will only
543
542
  # be None when monitoring is not configured.
544
- assert hub_channel is not None
543
+ assert monitoring_radio is not None
545
544
 
546
- hub_channel.send_pyobj(r['payload'])
545
+ monitoring_radio.send(r['payload'])
547
546
  elif r['type'] == 'heartbeat':
548
547
  logger.debug(f"Manager {manager_id!r} sent heartbeat via results connection")
549
548
  b_messages.append((p_message, r))
@@ -587,7 +586,7 @@ class Interchange:
587
586
  interesting_managers.add(manager_id)
588
587
  logger.debug("leaving results_incoming section")
589
588
 
590
- def expire_bad_managers(self, interesting_managers: Set[bytes], hub_channel: Optional[zmq.Socket]) -> None:
589
+ def expire_bad_managers(self, interesting_managers: Set[bytes], monitoring_radio: Optional[MonitoringRadioSender]) -> None:
591
590
  bad_managers = [(manager_id, m) for (manager_id, m) in self._ready_managers.items() if
592
591
  time.time() - m['last_heartbeat'] > self.heartbeat_threshold]
593
592
  for (manager_id, m) in bad_managers:
@@ -595,7 +594,7 @@ class Interchange:
595
594
  logger.warning(f"Too many heartbeats missed for manager {manager_id!r} - removing manager")
596
595
  if m['active']:
597
596
  m['active'] = False
598
- self._send_monitoring_info(hub_channel, m)
597
+ self._send_monitoring_info(monitoring_radio, m)
599
598
 
600
599
  logger.warning(f"Cancelling htex tasks {m['tasks']} on removed manager")
601
600
  for tid in m['tasks']: