parsl 2024.7.29__py3-none-any.whl → 2024.8.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
parsl/dataflow/dflow.py CHANGED
@@ -1180,7 +1180,7 @@ class DataFlowKernel:
1180
1180
  if self.monitoring:
1181
1181
  executor.hub_address = self.monitoring.hub_address
1182
1182
  executor.hub_zmq_port = self.monitoring.hub_zmq_port
1183
- executor.monitoring_radio = self.monitoring.radio
1183
+ executor.submit_monitoring_radio = self.monitoring.radio
1184
1184
  if hasattr(executor, 'provider'):
1185
1185
  if hasattr(executor.provider, 'script_dir'):
1186
1186
  executor.provider.script_dir = os.path.join(self.run_dir, 'submit_scripts')
parsl/executors/base.py CHANGED
@@ -52,13 +52,13 @@ class ParslExecutor(metaclass=ABCMeta):
52
52
  *,
53
53
  hub_address: Optional[str] = None,
54
54
  hub_zmq_port: Optional[int] = None,
55
- monitoring_radio: Optional[MonitoringRadioSender] = None,
55
+ submit_monitoring_radio: Optional[MonitoringRadioSender] = None,
56
56
  run_dir: str = ".",
57
57
  run_id: Optional[str] = None,
58
58
  ):
59
59
  self.hub_address = hub_address
60
60
  self.hub_zmq_port = hub_zmq_port
61
- self.monitoring_radio = monitoring_radio
61
+ self.submit_monitoring_radio = submit_monitoring_radio
62
62
  self.run_dir = os.path.abspath(run_dir)
63
63
  self.run_id = run_id
64
64
 
@@ -147,11 +147,11 @@ class ParslExecutor(metaclass=ABCMeta):
147
147
  self._hub_zmq_port = value
148
148
 
149
149
  @property
150
- def monitoring_radio(self) -> Optional[MonitoringRadioSender]:
150
+ def submit_monitoring_radio(self) -> Optional[MonitoringRadioSender]:
151
151
  """Local radio for sending monitoring messages
152
152
  """
153
- return self._monitoring_radio
153
+ return self._submit_monitoring_radio
154
154
 
155
- @monitoring_radio.setter
156
- def monitoring_radio(self, value: Optional[MonitoringRadioSender]) -> None:
157
- self._monitoring_radio = value
155
+ @submit_monitoring_radio.setter
156
+ def submit_monitoring_radio(self, value: Optional[MonitoringRadioSender]) -> None:
157
+ self._submit_monitoring_radio = value
parsl/executors/high_throughput/executor.py CHANGED
@@ -20,6 +20,10 @@ from parsl.data_provider.staging import Staging
20
20
  from parsl.executors.errors import BadMessage, ScalingFailed
21
21
  from parsl.executors.high_throughput import zmq_pipes
22
22
  from parsl.executors.high_throughput.errors import CommandClientTimeoutError
23
+ from parsl.executors.high_throughput.manager_selector import (
24
+ ManagerSelector,
25
+ RandomManagerSelector,
26
+ )
23
27
  from parsl.executors.high_throughput.mpi_prefix_composer import (
24
28
  VALID_LAUNCHERS,
25
29
  validate_resource_spec,
@@ -56,7 +60,7 @@ DEFAULT_LAUNCH_CMD = ("process_worker_pool.py {debug} {max_workers_per_node} "
56
60
  "--mpi-launcher={mpi_launcher} "
57
61
  "--available-accelerators {accelerators}")
58
62
 
59
- DEFAULT_INTERCHANGE_LAUNCH_CMD = "interchange.py"
63
+ DEFAULT_INTERCHANGE_LAUNCH_CMD = ["interchange.py"]
60
64
 
61
65
  GENERAL_HTEX_PARAM_DOCS = """provider : :class:`~parsl.providers.base.ExecutionProvider`
62
66
  Provider to access computation resources. Can be one of :class:`~parsl.providers.aws.aws.EC2Provider`,
@@ -78,9 +82,9 @@ GENERAL_HTEX_PARAM_DOCS = """provider : :class:`~parsl.providers.base.ExecutionP
78
82
  cores_per_worker, nodes_per_block, heartbeat_period ,heartbeat_threshold, logdir). For example:
79
83
  launch_cmd="process_worker_pool.py {debug} -c {cores_per_worker} --task_url={task_url} --result_url={result_url}"
80
84
 
81
- interchange_launch_cmd : str
82
- Custom command line string to launch the interchange process from the executor. If undefined,
83
- the executor will use the default "interchange.py" command.
85
+ interchange_launch_cmd : Sequence[str]
86
+ Custom sequence of command line tokens to launch the interchange process from the executor. If
87
+ undefined, the executor will use the default "interchange.py" command.
84
88
 
85
89
  address : string
86
90
  An address to connect to the main Parsl process which is reachable from the network in which
@@ -238,7 +242,7 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin, UsageIn
238
242
  label: str = 'HighThroughputExecutor',
239
243
  provider: ExecutionProvider = LocalProvider(),
240
244
  launch_cmd: Optional[str] = None,
241
- interchange_launch_cmd: Optional[str] = None,
245
+ interchange_launch_cmd: Optional[Sequence[str]] = None,
242
246
  address: Optional[str] = None,
243
247
  worker_ports: Optional[Tuple[int, int]] = None,
244
248
  worker_port_range: Optional[Tuple[int, int]] = (54000, 55000),
@@ -261,6 +265,7 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin, UsageIn
261
265
  worker_logdir_root: Optional[str] = None,
262
266
  enable_mpi_mode: bool = False,
263
267
  mpi_launcher: str = "mpiexec",
268
+ manager_selector: ManagerSelector = RandomManagerSelector(),
264
269
  block_error_handler: Union[bool, Callable[[BlockProviderExecutor, Dict[str, JobStatus]], None]] = True,
265
270
  encrypted: bool = False):
266
271
 
@@ -276,6 +281,7 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin, UsageIn
276
281
  self.prefetch_capacity = prefetch_capacity
277
282
  self.address = address
278
283
  self.address_probe_timeout = address_probe_timeout
284
+ self.manager_selector = manager_selector
279
285
  if self.address:
280
286
  self.all_addresses = address
281
287
  else:
@@ -544,11 +550,12 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin, UsageIn
544
550
  "poll_period": self.poll_period,
545
551
  "logging_level": logging.DEBUG if self.worker_debug else logging.INFO,
546
552
  "cert_dir": self.cert_dir,
553
+ "manager_selector": self.manager_selector,
547
554
  }
548
555
 
549
556
  config_pickle = pickle.dumps(interchange_config)
550
557
 
551
- self.interchange_proc = subprocess.Popen(self.interchange_launch_cmd.encode("utf-8"), stdin=subprocess.PIPE)
558
+ self.interchange_proc = subprocess.Popen(self.interchange_launch_cmd, stdin=subprocess.PIPE)
552
559
  stdin = self.interchange_proc.stdin
553
560
  assert stdin is not None, "Popen should have created an IO object (vs default None) because of PIPE mode"
554
561
 
parsl/executors/high_throughput/interchange.py CHANGED
@@ -6,7 +6,6 @@ import os
6
6
  import pickle
7
7
  import platform
8
8
  import queue
9
- import random
10
9
  import signal
11
10
  import sys
12
11
  import threading
@@ -19,7 +18,9 @@ from parsl import curvezmq
19
18
  from parsl.app.errors import RemoteExceptionWrapper
20
19
  from parsl.executors.high_throughput.errors import ManagerLost, VersionMismatch
21
20
  from parsl.executors.high_throughput.manager_record import ManagerRecord
21
+ from parsl.executors.high_throughput.manager_selector import ManagerSelector
22
22
  from parsl.monitoring.message_type import MessageType
23
+ from parsl.monitoring.radios import MonitoringRadioSender, ZMQRadioSender
23
24
  from parsl.process_loggers import wrap_with_logs
24
25
  from parsl.serialize import serialize as serialize_object
25
26
  from parsl.utils import setproctitle
@@ -53,6 +54,7 @@ class Interchange:
53
54
  logging_level: int,
54
55
  poll_period: int,
55
56
  cert_dir: Optional[str],
57
+ manager_selector: ManagerSelector,
56
58
  ) -> None:
57
59
  """
58
60
  Parameters
@@ -160,6 +162,8 @@ class Interchange:
160
162
 
161
163
  self.heartbeat_threshold = heartbeat_threshold
162
164
 
165
+ self.manager_selector = manager_selector
166
+
163
167
  self.current_platform = {'parsl_v': PARSL_VERSION,
164
168
  'python_v': "{}.{}.{}".format(sys.version_info.major,
165
169
  sys.version_info.minor,
@@ -216,27 +220,15 @@ class Interchange:
216
220
  task_counter += 1
217
221
  logger.debug(f"Fetched {task_counter} tasks so far")
218
222
 
219
- def _create_monitoring_channel(self) -> Optional[zmq.Socket]:
220
- if self.hub_address and self.hub_zmq_port:
221
- logger.info("Connecting to MonitoringHub")
222
- # This is a one-off because monitoring is unencrypted
223
- hub_channel = zmq.Context().socket(zmq.DEALER)
224
- hub_channel.set_hwm(0)
225
- hub_channel.connect("tcp://{}:{}".format(self.hub_address, self.hub_zmq_port))
226
- logger.info("Connected to MonitoringHub")
227
- return hub_channel
228
- else:
229
- return None
230
-
231
- def _send_monitoring_info(self, hub_channel: Optional[zmq.Socket], manager: ManagerRecord) -> None:
232
- if hub_channel:
223
+ def _send_monitoring_info(self, monitoring_radio: Optional[MonitoringRadioSender], manager: ManagerRecord) -> None:
224
+ if monitoring_radio:
233
225
  logger.info("Sending message {} to MonitoringHub".format(manager))
234
226
 
235
227
  d: Dict = cast(Dict, manager.copy())
236
228
  d['timestamp'] = datetime.datetime.now()
237
229
  d['last_heartbeat'] = datetime.datetime.fromtimestamp(d['last_heartbeat'])
238
230
 
239
- hub_channel.send_pyobj((MessageType.NODE_INFO, d))
231
+ monitoring_radio.send((MessageType.NODE_INFO, d))
240
232
 
241
233
  @wrap_with_logs(target="interchange")
242
234
  def _command_server(self) -> NoReturn:
@@ -244,8 +236,11 @@ class Interchange:
244
236
  """
245
237
  logger.debug("Command Server Starting")
246
238
 
247
- # Need to create a new ZMQ socket for command server thread
248
- hub_channel = self._create_monitoring_channel()
239
+ if self.hub_address is not None and self.hub_zmq_port is not None:
240
+ logger.debug("Creating monitoring radio to %s:%s", self.hub_address, self.hub_zmq_port)
241
+ monitoring_radio = ZMQRadioSender(self.hub_address, self.hub_zmq_port)
242
+ else:
243
+ monitoring_radio = None
249
244
 
250
245
  reply: Any # the type of reply depends on the command_req received (aka this needs dependent types...)
251
246
 
@@ -295,7 +290,7 @@ class Interchange:
295
290
  if manager_id in self._ready_managers:
296
291
  m = self._ready_managers[manager_id]
297
292
  m['active'] = False
298
- self._send_monitoring_info(hub_channel, m)
293
+ self._send_monitoring_info(monitoring_radio, m)
299
294
  else:
300
295
  logger.warning("Worker to hold was not in ready managers list")
301
296
 
@@ -330,9 +325,14 @@ class Interchange:
330
325
  # parent-process-inheritance problems.
331
326
  signal.signal(signal.SIGTERM, signal.SIG_DFL)
332
327
 
333
- logger.info("Incoming ports bound")
328
+ logger.info("Starting main interchange method")
334
329
 
335
- hub_channel = self._create_monitoring_channel()
330
+ if self.hub_address is not None and self.hub_zmq_port is not None:
331
+ logger.debug("Creating monitoring radio to %s:%s", self.hub_address, self.hub_zmq_port)
332
+ monitoring_radio = ZMQRadioSender(self.hub_address, self.hub_zmq_port)
333
+ logger.debug("Created monitoring radio")
334
+ else:
335
+ monitoring_radio = None
336
336
 
337
337
  poll_period = self.poll_period
338
338
 
@@ -363,10 +363,10 @@ class Interchange:
363
363
  while not kill_event.is_set():
364
364
  self.socks = dict(poller.poll(timeout=poll_period))
365
365
 
366
- self.process_task_outgoing_incoming(interesting_managers, hub_channel, kill_event)
367
- self.process_results_incoming(interesting_managers, hub_channel)
368
- self.expire_bad_managers(interesting_managers, hub_channel)
369
- self.expire_drained_managers(interesting_managers, hub_channel)
366
+ self.process_task_outgoing_incoming(interesting_managers, monitoring_radio, kill_event)
367
+ self.process_results_incoming(interesting_managers, monitoring_radio)
368
+ self.expire_bad_managers(interesting_managers, monitoring_radio)
369
+ self.expire_drained_managers(interesting_managers, monitoring_radio)
370
370
  self.process_tasks_to_send(interesting_managers)
371
371
 
372
372
  self.zmq_context.destroy()
@@ -377,7 +377,7 @@ class Interchange:
377
377
  def process_task_outgoing_incoming(
378
378
  self,
379
379
  interesting_managers: Set[bytes],
380
- hub_channel: Optional[zmq.Socket],
380
+ monitoring_radio: Optional[MonitoringRadioSender],
381
381
  kill_event: threading.Event
382
382
  ) -> None:
383
383
  """Process one message from manager on the task_outgoing channel.
@@ -431,7 +431,7 @@ class Interchange:
431
431
  m.update(msg) # type: ignore[typeddict-item]
432
432
 
433
433
  logger.info("Registration info for manager {!r}: {}".format(manager_id, msg))
434
- self._send_monitoring_info(hub_channel, m)
434
+ self._send_monitoring_info(monitoring_radio, m)
435
435
 
436
436
  if (msg['python_v'].rsplit(".", 1)[0] != self.current_platform['python_v'].rsplit(".", 1)[0] or
437
437
  msg['parsl_v'] != self.current_platform['parsl_v']):
@@ -462,7 +462,7 @@ class Interchange:
462
462
  logger.error(f"Unexpected message type received from manager: {msg['type']}")
463
463
  logger.debug("leaving task_outgoing section")
464
464
 
465
- def expire_drained_managers(self, interesting_managers: Set[bytes], hub_channel: Optional[zmq.Socket]) -> None:
465
+ def expire_drained_managers(self, interesting_managers: Set[bytes], monitoring_radio: Optional[MonitoringRadioSender]) -> None:
466
466
 
467
467
  for manager_id in list(interesting_managers):
468
468
  # is it always true that a draining manager will be in interesting managers?
@@ -475,7 +475,7 @@ class Interchange:
475
475
  self._ready_managers.pop(manager_id)
476
476
 
477
477
  m['active'] = False
478
- self._send_monitoring_info(hub_channel, m)
478
+ self._send_monitoring_info(monitoring_radio, m)
479
479
 
480
480
  def process_tasks_to_send(self, interesting_managers: Set[bytes]) -> None:
481
481
  # Check if there are tasks that could be sent to managers
@@ -485,8 +485,7 @@ class Interchange:
485
485
  interesting=len(interesting_managers)))
486
486
 
487
487
  if interesting_managers and not self.pending_task_queue.empty():
488
- shuffled_managers = list(interesting_managers)
489
- random.shuffle(shuffled_managers)
488
+ shuffled_managers = self.manager_selector.sort_managers(self._ready_managers, interesting_managers)
490
489
 
491
490
  while shuffled_managers and not self.pending_task_queue.empty(): # cf. the if statement above...
492
491
  manager_id = shuffled_managers.pop()
@@ -519,7 +518,7 @@ class Interchange:
519
518
  else:
520
519
  logger.debug("either no interesting managers or no tasks, so skipping manager pass")
521
520
 
522
- def process_results_incoming(self, interesting_managers: Set[bytes], hub_channel: Optional[zmq.Socket]) -> None:
521
+ def process_results_incoming(self, interesting_managers: Set[bytes], monitoring_radio: Optional[MonitoringRadioSender]) -> None:
523
522
  # Receive any results and forward to client
524
523
  if self.results_incoming in self.socks and self.socks[self.results_incoming] == zmq.POLLIN:
525
524
  logger.debug("entering results_incoming section")
@@ -539,11 +538,11 @@ class Interchange:
539
538
  elif r['type'] == 'monitoring':
540
539
  # the monitoring code makes the assumption that no
541
540
  # monitoring messages will be received if monitoring
542
- # is not configured, and that hub_channel will only
541
+ # is not configured, and that monitoring_radio will only
543
542
  # be None when monitoring is not configurated.
544
- assert hub_channel is not None
543
+ assert monitoring_radio is not None
545
544
 
546
- hub_channel.send_pyobj(r['payload'])
545
+ monitoring_radio.send(r['payload'])
547
546
  elif r['type'] == 'heartbeat':
548
547
  logger.debug(f"Manager {manager_id!r} sent heartbeat via results connection")
549
548
  b_messages.append((p_message, r))
@@ -587,7 +586,7 @@ class Interchange:
587
586
  interesting_managers.add(manager_id)
588
587
  logger.debug("leaving results_incoming section")
589
588
 
590
- def expire_bad_managers(self, interesting_managers: Set[bytes], hub_channel: Optional[zmq.Socket]) -> None:
589
+ def expire_bad_managers(self, interesting_managers: Set[bytes], monitoring_radio: Optional[MonitoringRadioSender]) -> None:
591
590
  bad_managers = [(manager_id, m) for (manager_id, m) in self._ready_managers.items() if
592
591
  time.time() - m['last_heartbeat'] > self.heartbeat_threshold]
593
592
  for (manager_id, m) in bad_managers:
@@ -595,7 +594,7 @@ class Interchange:
595
594
  logger.warning(f"Too many heartbeats missed for manager {manager_id!r} - removing manager")
596
595
  if m['active']:
597
596
  m['active'] = False
598
- self._send_monitoring_info(hub_channel, m)
597
+ self._send_monitoring_info(monitoring_radio, m)
599
598
 
600
599
  logger.warning(f"Cancelling htex tasks {m['tasks']} on removed manager")
601
600
  for tid in m['tasks']:
parsl/executors/high_throughput/manager_selector.py ADDED
@@ -0,0 +1,25 @@
1
+ import random
2
+ from abc import ABCMeta, abstractmethod
3
+ from typing import Dict, List, Set
4
+
5
+ from parsl.executors.high_throughput.manager_record import ManagerRecord
6
+
7
+
8
+ class ManagerSelector(metaclass=ABCMeta):
9
+
10
+ @abstractmethod
11
+ def sort_managers(self, ready_managers: Dict[bytes, ManagerRecord], manager_list: Set[bytes]) -> List[bytes]:
12
+ """ Sort a given list of managers.
13
+
14
+ Any operations pertaining to the sorting and rearrangement of the
15
+ interesting_managers Set should be performed here.
16
+ """
17
+ pass
18
+
19
+
20
+ class RandomManagerSelector(ManagerSelector):
21
+
22
+ def sort_managers(self, ready_managers: Dict[bytes, ManagerRecord], manager_list: Set[bytes]) -> List[bytes]:
23
+ c_manager_list = list(manager_list)
24
+ random.shuffle(c_manager_list)
25
+ return c_manager_list
parsl/executors/status_handling.py CHANGED
@@ -12,7 +12,7 @@ from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple, Union
12
12
  from parsl.executors.base import ParslExecutor
13
13
  from parsl.executors.errors import BadStateException, ScalingFailed
14
14
  from parsl.jobs.error_handlers import noop_error_handler, simple_error_handler
15
- from parsl.jobs.states import JobState, JobStatus
15
+ from parsl.jobs.states import TERMINAL_STATES, JobState, JobStatus
16
16
  from parsl.monitoring.message_type import MessageType
17
17
  from parsl.providers.base import ExecutionProvider
18
18
  from parsl.utils import AtomicIDCounter
@@ -167,40 +167,50 @@ class BlockProviderExecutor(ParslExecutor):
167
167
  def provider(self):
168
168
  return self._provider
169
169
 
170
- def _filter_scale_in_ids(self, to_kill, killed):
170
+ def _filter_scale_in_ids(self, to_kill: Sequence[Any], killed: Sequence[bool]) -> Sequence[Any]:
171
171
  """ Filter out job id's that were not killed
172
172
  """
173
173
  assert len(to_kill) == len(killed)
174
+
175
+ if False in killed:
176
+ killed_job_ids = [jid for jid, k in zip(to_kill, killed) if k]
177
+ not_killed_job_ids = [jid for jid, k in zip(to_kill, killed) if not k]
178
+ logger.warning("Some jobs were not killed successfully: "
179
+ f"killed jobs: {killed_job_ids}, "
180
+ f"not-killed jobs: {not_killed_job_ids}")
181
+
174
182
  # Filters first iterable by bool values in second
175
183
  return list(compress(to_kill, killed))
176
184
 
177
185
  def scale_out_facade(self, n: int) -> List[str]:
178
- block_ids = self._scale_out(n)
179
- if block_ids is not None:
180
- new_status = {}
181
- for block_id in block_ids:
182
- new_status[block_id] = JobStatus(JobState.PENDING)
183
- self.send_monitoring_info(new_status)
184
- self._status.update(new_status)
185
- return block_ids
186
-
187
- def _scale_out(self, blocks: int = 1) -> List[str]:
188
186
  """Scales out the number of blocks by "blocks"
189
187
  """
190
188
  if not self.provider:
191
189
  raise ScalingFailed(self, "No execution provider available")
192
190
  block_ids = []
193
- logger.info(f"Scaling out by {blocks} blocks")
194
- for _ in range(blocks):
191
+ monitoring_status_changes = {}
192
+ logger.info(f"Scaling out by {n} blocks")
193
+ for _ in range(n):
195
194
  block_id = str(self._block_id_counter.get_id())
196
195
  logger.info(f"Allocated block ID {block_id}")
197
196
  try:
198
197
  job_id = self._launch_block(block_id)
198
+
199
+ pending_status = JobStatus(JobState.PENDING)
200
+
199
201
  self.blocks_to_job_id[block_id] = job_id
200
202
  self.job_ids_to_block[job_id] = block_id
203
+ self._status[block_id] = pending_status
204
+
205
+ monitoring_status_changes[block_id] = pending_status
201
206
  block_ids.append(block_id)
207
+
202
208
  except Exception as ex:
203
- self._simulated_status[block_id] = JobStatus(JobState.FAILED, "Failed to start block {}: {}".format(block_id, ex))
209
+ failed_status = JobStatus(JobState.FAILED, "Failed to start block {}: {}".format(block_id, ex))
210
+ self._simulated_status[block_id] = failed_status
211
+ self._status[block_id] = failed_status
212
+
213
+ self.send_monitoring_info(monitoring_status_changes)
204
214
  return block_ids
205
215
 
206
216
  def scale_in(self, blocks: int) -> List[str]:
@@ -215,16 +225,20 @@ class BlockProviderExecutor(ParslExecutor):
215
225
 
216
226
  :return: A list of block ids corresponding to the blocks that were removed.
217
227
  """
218
- # Obtain list of blocks to kill
219
- to_kill = list(self.blocks_to_job_id.keys())[:blocks]
220
- kill_ids = [self.blocks_to_job_id[block] for block in to_kill]
228
+
229
+ active_blocks = [block_id for block_id, status in self._status.items()
230
+ if status.state not in TERMINAL_STATES]
231
+
232
+ block_ids_to_kill = active_blocks[:blocks]
233
+
234
+ job_ids_to_kill = [self.blocks_to_job_id[block] for block in block_ids_to_kill]
221
235
 
222
236
  # Cancel the blocks provisioned
223
237
  if self.provider:
224
- logger.info(f"Scaling in jobs: {kill_ids}")
225
- r = self.provider.cancel(kill_ids)
226
- job_ids = self._filter_scale_in_ids(kill_ids, r)
227
- block_ids_killed = [self.job_ids_to_block[jid] for jid in job_ids]
238
+ logger.info(f"Scaling in jobs: {job_ids_to_kill}")
239
+ r = self.provider.cancel(job_ids_to_kill)
240
+ job_ids = self._filter_scale_in_ids(job_ids_to_kill, r)
241
+ block_ids_killed = [self.job_ids_to_block[job_id] for job_id in job_ids]
228
242
  return block_ids_killed
229
243
  else:
230
244
  logger.error("No execution provider available to scale in")
@@ -262,10 +276,10 @@ class BlockProviderExecutor(ParslExecutor):
262
276
 
263
277
  def send_monitoring_info(self, status: Dict) -> None:
264
278
  # Send monitoring info for HTEX when monitoring enabled
265
- if self.monitoring_radio:
279
+ if self.submit_monitoring_radio:
266
280
  msg = self.create_monitoring_info(status)
267
281
  logger.debug("Sending block monitoring message: %r", msg)
268
- self.monitoring_radio.send((MessageType.BLOCK_INFO, msg))
282
+ self.submit_monitoring_radio.send((MessageType.BLOCK_INFO, msg))
269
283
 
270
284
  def create_monitoring_info(self, status: Dict[str, JobStatus]) -> Sequence[object]:
271
285
  """Create a monitoring message for each block based on the poll status.
parsl/monitoring/errors.py ADDED
@@ -0,0 +1,6 @@
1
+ from parsl.errors import ParslError
2
+
3
+
4
+ class MonitoringHubStartError(ParslError):
5
+ def __str__(self) -> str:
6
+ return "Hub failed to start"
parsl/monitoring/monitoring.py CHANGED
@@ -12,6 +12,7 @@ from typing import TYPE_CHECKING, Any, Optional, Tuple, Union, cast
12
12
  import typeguard
13
13
 
14
14
  from parsl.log_utils import set_file_logger
15
+ from parsl.monitoring.errors import MonitoringHubStartError
15
16
  from parsl.monitoring.message_type import MessageType
16
17
  from parsl.monitoring.radios import MultiprocessingQueueRadioSender
17
18
  from parsl.monitoring.router import router_starter
@@ -195,7 +196,7 @@ class MonitoringHub(RepresentationMixin):
195
196
  comm_q.join_thread()
196
197
  except queue.Empty:
197
198
  logger.error("Hub has not completed initialization in 120s. Aborting")
198
- raise Exception("Hub failed to start")
199
+ raise MonitoringHubStartError()
199
200
 
200
201
  if isinstance(comm_q_result, str):
201
202
  logger.error(f"MonitoringRouter sent an error message: {comm_q_result}")
parsl/monitoring/radios.py CHANGED
@@ -7,6 +7,8 @@ from abc import ABCMeta, abstractmethod
7
7
  from multiprocessing.queues import Queue
8
8
  from typing import Optional
9
9
 
10
+ import zmq
11
+
10
12
  from parsl.serialize import serialize
11
13
 
12
14
  _db_manager_excepts: Optional[Exception]
@@ -186,3 +188,17 @@ class MultiprocessingQueueRadioSender(MonitoringRadioSender):
186
188
 
187
189
  def send(self, message: object) -> None:
188
190
  self.queue.put((message, 0))
191
+
192
+
193
+ class ZMQRadioSender(MonitoringRadioSender):
194
+ """A monitoring radio which connects over ZMQ. This radio is not
195
+ thread-safe, because its use of ZMQ is not thread-safe.
196
+ """
197
+
198
+ def __init__(self, hub_address: str, hub_zmq_port: int) -> None:
199
+ self._hub_channel = zmq.Context().socket(zmq.DEALER)
200
+ self._hub_channel.set_hwm(0)
201
+ self._hub_channel.connect(f"tcp://{hub_address}:{hub_zmq_port}")
202
+
203
+ def send(self, message: object) -> None:
204
+ self._hub_channel.send_pyobj(message)