parsl 2024.7.22__py3-none-any.whl → 2024.8.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34)
  1. parsl/dataflow/dflow.py +4 -10
  2. parsl/executors/base.py +8 -8
  3. parsl/executors/flux/executor.py +7 -7
  4. parsl/executors/high_throughput/executor.py +55 -55
  5. parsl/executors/high_throughput/interchange.py +37 -37
  6. parsl/executors/high_throughput/manager_record.py +1 -0
  7. parsl/executors/high_throughput/manager_selector.py +25 -0
  8. parsl/executors/high_throughput/process_worker_pool.py +2 -0
  9. parsl/executors/status_handling.py +52 -21
  10. parsl/executors/taskvine/executor.py +0 -18
  11. parsl/executors/workqueue/executor.py +0 -18
  12. parsl/monitoring/errors.py +6 -0
  13. parsl/monitoring/monitoring.py +6 -5
  14. parsl/monitoring/radios.py +23 -7
  15. parsl/monitoring/remote.py +12 -12
  16. parsl/monitoring/router.py +71 -30
  17. parsl/tests/test_htex/test_disconnected_blocks_failing_provider.py +71 -0
  18. parsl/tests/test_htex/test_htex.py +28 -19
  19. parsl/tests/test_htex/test_zmq_binding.py +2 -0
  20. parsl/tests/test_monitoring/test_basic.py +14 -1
  21. parsl/tests/test_monitoring/test_fuzz_zmq.py +2 -2
  22. parsl/tests/test_mpi_apps/test_mpiex.py +1 -1
  23. parsl/version.py +1 -1
  24. {parsl-2024.7.22.data → parsl-2024.8.5.data}/scripts/interchange.py +37 -37
  25. {parsl-2024.7.22.data → parsl-2024.8.5.data}/scripts/process_worker_pool.py +2 -0
  26. parsl-2024.8.5.dist-info/METADATA +101 -0
  27. {parsl-2024.7.22.dist-info → parsl-2024.8.5.dist-info}/RECORD +33 -30
  28. {parsl-2024.7.22.dist-info → parsl-2024.8.5.dist-info}/WHEEL +1 -1
  29. parsl-2024.7.22.dist-info/METADATA +0 -101
  30. {parsl-2024.7.22.data → parsl-2024.8.5.data}/scripts/exec_parsl_function.py +0 -0
  31. {parsl-2024.7.22.data → parsl-2024.8.5.data}/scripts/parsl_coprocess.py +0 -0
  32. {parsl-2024.7.22.dist-info → parsl-2024.8.5.dist-info}/LICENSE +0 -0
  33. {parsl-2024.7.22.dist-info → parsl-2024.8.5.dist-info}/entry_points.txt +0 -0
  34. {parsl-2024.7.22.dist-info → parsl-2024.8.5.dist-info}/top_level.txt +0 -0
parsl/dataflow/dflow.py CHANGED
@@ -113,14 +113,10 @@ class DataFlowKernel:
         self.monitoring: Optional[MonitoringHub]
         self.monitoring = config.monitoring
 
-        # hub address and port for interchange to connect
-        self.hub_address = None  # type: Optional[str]
-        self.hub_zmq_port = None  # type: Optional[int]
         if self.monitoring:
             if self.monitoring.logdir is None:
                 self.monitoring.logdir = self.run_dir
-            self.hub_address = self.monitoring.hub_address
-            self.hub_zmq_port = self.monitoring.start(self.run_id, self.run_dir, self.config.run_dir)
+            self.monitoring.start(self.run_id, self.run_dir, self.config.run_dir)
 
         self.time_began = datetime.datetime.now()
         self.time_completed: Optional[datetime.datetime] = None
@@ -1181,10 +1177,10 @@ class DataFlowKernel:
         for executor in executors:
             executor.run_id = self.run_id
             executor.run_dir = self.run_dir
-            executor.hub_address = self.hub_address
-            executor.hub_zmq_port = self.hub_zmq_port
             if self.monitoring:
-                executor.monitoring_radio = self.monitoring.radio
+                executor.hub_address = self.monitoring.hub_address
+                executor.hub_zmq_port = self.monitoring.hub_zmq_port
+                executor.submit_monitoring_radio = self.monitoring.radio
             if hasattr(executor, 'provider'):
                 if hasattr(executor.provider, 'script_dir'):
                     executor.provider.script_dir = os.path.join(self.run_dir, 'submit_scripts')
@@ -1460,8 +1456,6 @@ class DataFlowKernel:
         Returns:
             - dict containing, hashed -> future mappings
         """
-        self.memo_lookup_table = None
-
         if checkpointDirs:
             return self._load_checkpoints(checkpointDirs)
         else:
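With this change, the hub address and ZMQ port are no longer mirrored onto the DataFlowKernel; they are read from the MonitoringHub after start() and copied to each executor only when monitoring is configured. A minimal usage sketch, assuming a local hub address (the address value is illustrative):

import parsl
from parsl.config import Config
from parsl.executors import HighThroughputExecutor
from parsl.monitoring.monitoring import MonitoringHub

config = Config(
    executors=[HighThroughputExecutor(label="htex")],
    # hub_zmq_port is determined when the DataFlowKernel calls monitoring.start()
    monitoring=MonitoringHub(hub_address="127.0.0.1"),
)
parsl.load(config)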
parsl/executors/base.py CHANGED
@@ -5,7 +5,7 @@ from typing import Any, Callable, Dict, Optional
 
 from typing_extensions import Literal, Self
 
-from parsl.monitoring.radios import MonitoringRadio
+from parsl.monitoring.radios import MonitoringRadioSender
 
 
 class ParslExecutor(metaclass=ABCMeta):
@@ -52,13 +52,13 @@ class ParslExecutor(metaclass=ABCMeta):
             *,
             hub_address: Optional[str] = None,
             hub_zmq_port: Optional[int] = None,
-            monitoring_radio: Optional[MonitoringRadio] = None,
+            submit_monitoring_radio: Optional[MonitoringRadioSender] = None,
             run_dir: str = ".",
             run_id: Optional[str] = None,
     ):
         self.hub_address = hub_address
         self.hub_zmq_port = hub_zmq_port
-        self.monitoring_radio = monitoring_radio
+        self.submit_monitoring_radio = submit_monitoring_radio
         self.run_dir = os.path.abspath(run_dir)
         self.run_id = run_id
 
@@ -147,11 +147,11 @@ class ParslExecutor(metaclass=ABCMeta):
         self._hub_zmq_port = value
 
     @property
-    def monitoring_radio(self) -> Optional[MonitoringRadio]:
+    def submit_monitoring_radio(self) -> Optional[MonitoringRadioSender]:
         """Local radio for sending monitoring messages
         """
-        return self._monitoring_radio
+        return self._submit_monitoring_radio
 
-    @monitoring_radio.setter
-    def monitoring_radio(self, value: Optional[MonitoringRadio]) -> None:
-        self._monitoring_radio = value
+    @submit_monitoring_radio.setter
+    def submit_monitoring_radio(self, value: Optional[MonitoringRadioSender]) -> None:
+        self._submit_monitoring_radio = value
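The rename from monitoring_radio to submit_monitoring_radio makes explicit that this radio sends monitoring messages from the submit side. As a rough sketch, assuming MonitoringRadioSender requires only a send() method (as its use elsewhere in this diff suggests), a custom sender could be attached like this (LoggingRadioSender is illustrative, not part of parsl):

from parsl.monitoring.radios import MonitoringRadioSender


class LoggingRadioSender(MonitoringRadioSender):
    """Illustrative sender that prints each monitoring message."""

    def send(self, message) -> None:
        print(f"monitoring message: {message!r}")


executor.submit_monitoring_radio = LoggingRadioSender()  # executor: any ParslExecutor instance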
parsl/executors/flux/executor.py CHANGED
@@ -200,7 +200,6 @@ class FluxExecutor(ParslExecutor, RepresentationMixin):
             raise EnvironmentError("Cannot find Flux installation in PATH")
         self.flux_path = os.path.abspath(flux_path)
         self._task_id_counter = itertools.count()
-        self._socket = zmq.Context().socket(zmq.REP)
         # Assumes a launch command cannot be None or empty
         self.launch_cmd = launch_cmd or self.DEFAULT_LAUNCH_CMD
         self._submission_queue: queue.Queue = queue.Queue()
@@ -213,7 +212,6 @@ class FluxExecutor(ParslExecutor, RepresentationMixin):
             args=(
                 self._submission_queue,
                 self._stop_event,
-                self._socket,
                 self.working_dir,
                 self.flux_executor_kwargs,
                 self.provider,
@@ -306,11 +304,13 @@ def _submit_wrapper(
 
     If an exception is thrown, error out all submitted tasks.
     """
-    try:
-        _submit_flux_jobs(submission_queue, stop_event, *args, **kwargs)
-    except Exception as exc:
-        _error_out_jobs(submission_queue, stop_event, exc)
-        raise
+    with zmq.Context() as ctx:
+        with ctx.socket(zmq.REP) as socket:
+            try:
+                _submit_flux_jobs(submission_queue, stop_event, socket, *args, **kwargs)
+            except Exception as exc:
+                _error_out_jobs(submission_queue, stop_event, exc)
+                raise
 
 
 def _error_out_jobs(
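The REP socket is no longer created in the executor constructor and handed across threads; the submission thread now owns its own context and socket, using pyzmq's context-manager support so both are cleaned up when the thread exits. A minimal standalone sketch of the pattern:

import zmq

# In pyzmq, both Context and Socket work as context managers:
# the socket is closed and the context terminated on exit.
with zmq.Context() as ctx:
    with ctx.socket(zmq.REP) as sock:
        sock.bind("tcp://127.0.0.1:5555")  # illustrative endpoint
        # ... handle requests on sock ...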
parsl/executors/high_throughput/executor.py CHANGED
@@ -20,6 +20,10 @@ from parsl.data_provider.staging import Staging
 from parsl.executors.errors import BadMessage, ScalingFailed
 from parsl.executors.high_throughput import zmq_pipes
 from parsl.executors.high_throughput.errors import CommandClientTimeoutError
+from parsl.executors.high_throughput.manager_selector import (
+    ManagerSelector,
+    RandomManagerSelector,
+)
 from parsl.executors.high_throughput.mpi_prefix_composer import (
     VALID_LAUNCHERS,
     validate_resource_spec,
@@ -56,7 +60,7 @@ DEFAULT_LAUNCH_CMD = ("process_worker_pool.py {debug} {max_workers_per_node} "
                       "--mpi-launcher={mpi_launcher} "
                       "--available-accelerators {accelerators}")
 
-DEFAULT_INTERCHANGE_LAUNCH_CMD = "interchange.py"
+DEFAULT_INTERCHANGE_LAUNCH_CMD = ["interchange.py"]
 
 GENERAL_HTEX_PARAM_DOCS = """provider : :class:`~parsl.providers.base.ExecutionProvider`
        Provider to access computation resources. Can be one of :class:`~parsl.providers.aws.aws.EC2Provider`,
@@ -78,9 +82,9 @@ GENERAL_HTEX_PARAM_DOCS = """provider : :class:`~parsl.providers.base.ExecutionP
        cores_per_worker, nodes_per_block, heartbeat_period ,heartbeat_threshold, logdir). For example:
        launch_cmd="process_worker_pool.py {debug} -c {cores_per_worker} --task_url={task_url} --result_url={result_url}"
 
-    interchange_launch_cmd : str
-        Custom command line string to launch the interchange process from the executor. If undefined,
-        the executor will use the default "interchange.py" command.
+    interchange_launch_cmd : Sequence[str]
+        Custom sequence of command line tokens to launch the interchange process from the executor. If
+        undefined, the executor will use the default "interchange.py" command.
 
    address : string
        An address to connect to the main Parsl process which is reachable from the network in which
@@ -238,7 +242,7 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin, UsageIn
                  label: str = 'HighThroughputExecutor',
                  provider: ExecutionProvider = LocalProvider(),
                  launch_cmd: Optional[str] = None,
-                 interchange_launch_cmd: Optional[str] = None,
+                 interchange_launch_cmd: Optional[Sequence[str]] = None,
                  address: Optional[str] = None,
                  worker_ports: Optional[Tuple[int, int]] = None,
                  worker_port_range: Optional[Tuple[int, int]] = (54000, 55000),
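Because interchange_launch_cmd is now a sequence of argv tokens rather than a single string, wrapper commands can be expressed without shell quoting. A hedged sketch (the profiling wrapper is illustrative, not a parsl default):

from parsl.executors import HighThroughputExecutor

htex = HighThroughputExecutor(
    label="htex",
    # Each list element becomes one argv entry of the interchange process.
    interchange_launch_cmd=["python", "-m", "cProfile", "-o", "interchange.prof", "interchange.py"],
)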
@@ -261,6 +265,7 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin, UsageIn
                  worker_logdir_root: Optional[str] = None,
                  enable_mpi_mode: bool = False,
                  mpi_launcher: str = "mpiexec",
+                 manager_selector: ManagerSelector = RandomManagerSelector(),
                  block_error_handler: Union[bool, Callable[[BlockProviderExecutor, Dict[str, JobStatus]], None]] = True,
                  encrypted: bool = False):
 
@@ -276,6 +281,7 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin, UsageIn
         self.prefetch_capacity = prefetch_capacity
         self.address = address
         self.address_probe_timeout = address_probe_timeout
+        self.manager_selector = manager_selector
         if self.address:
             self.all_addresses = address
         else:
@@ -456,8 +462,6 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin, UsageIn
               "task_id" : <task_id>
               "exception" : serialized exception object, on failure
            }
-
-        The `None` message is a die request.
         """
         logger.debug("Result queue worker starting")
 
@@ -475,58 +479,53 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin, UsageIn
 
             else:
 
-                if msgs is None:
-                    logger.debug("Got None, exiting")
-                    return
+                for serialized_msg in msgs:
+                    try:
+                        msg = pickle.loads(serialized_msg)
+                    except pickle.UnpicklingError:
+                        raise BadMessage("Message received could not be unpickled")
 
-                else:
-                    for serialized_msg in msgs:
+                    if msg['type'] == 'heartbeat':
+                        continue
+                    elif msg['type'] == 'result':
                         try:
-                            msg = pickle.loads(serialized_msg)
-                        except pickle.UnpicklingError:
-                            raise BadMessage("Message received could not be unpickled")
+                            tid = msg['task_id']
+                        except Exception:
+                            raise BadMessage("Message received does not contain 'task_id' field")
+
+                        if tid == -1 and 'exception' in msg:
+                            logger.warning("Executor shutting down due to exception from interchange")
+                            exception = deserialize(msg['exception'])
+                            self.set_bad_state_and_fail_all(exception)
+                            break
+
+                        task_fut = self.tasks.pop(tid)
+
+                        if 'result' in msg:
+                            result = deserialize(msg['result'])
+                            task_fut.set_result(result)
 
-                        if msg['type'] == 'heartbeat':
-                            continue
-                        elif msg['type'] == 'result':
+                        elif 'exception' in msg:
                             try:
-                                tid = msg['task_id']
-                            except Exception:
-                                raise BadMessage("Message received does not contain 'task_id' field")
-
-                            if tid == -1 and 'exception' in msg:
-                                logger.warning("Executor shutting down due to exception from interchange")
-                                exception = deserialize(msg['exception'])
-                                self.set_bad_state_and_fail_all(exception)
-                                break
-
-                            task_fut = self.tasks.pop(tid)
-
-                            if 'result' in msg:
-                                result = deserialize(msg['result'])
-                                task_fut.set_result(result)
-
-                            elif 'exception' in msg:
-                                try:
-                                    s = deserialize(msg['exception'])
-                                    # s should be a RemoteExceptionWrapper... so we can reraise it
-                                    if isinstance(s, RemoteExceptionWrapper):
-                                        try:
-                                            s.reraise()
-                                        except Exception as e:
-                                            task_fut.set_exception(e)
-                                    elif isinstance(s, Exception):
-                                        task_fut.set_exception(s)
-                                    else:
-                                        raise ValueError("Unknown exception-like type received: {}".format(type(s)))
-                                except Exception as e:
-                                    # TODO could be a proper wrapped exception?
-                                    task_fut.set_exception(
-                                        DeserializationError("Received exception, but handling also threw an exception: {}".format(e)))
-                            else:
-                                raise BadMessage("Message received is neither result or exception")
+                                s = deserialize(msg['exception'])
+                                # s should be a RemoteExceptionWrapper... so we can reraise it
+                                if isinstance(s, RemoteExceptionWrapper):
+                                    try:
+                                        s.reraise()
+                                    except Exception as e:
+                                        task_fut.set_exception(e)
+                                elif isinstance(s, Exception):
+                                    task_fut.set_exception(s)
+                                else:
+                                    raise ValueError("Unknown exception-like type received: {}".format(type(s)))
+                            except Exception as e:
+                                # TODO could be a proper wrapped exception?
+                                task_fut.set_exception(
+                                    DeserializationError("Received exception, but handling also threw an exception: {}".format(e)))
                         else:
-                            raise BadMessage("Message received with unknown type {}".format(msg['type']))
+                            raise BadMessage("Message received is neither result or exception")
+                    else:
+                        raise BadMessage("Message received with unknown type {}".format(msg['type']))
 
         logger.info("Result queue worker finished")
 
@@ -551,11 +550,12 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin, UsageIn
                                "poll_period": self.poll_period,
                                "logging_level": logging.DEBUG if self.worker_debug else logging.INFO,
                                "cert_dir": self.cert_dir,
+                               "manager_selector": self.manager_selector,
                                }
 
         config_pickle = pickle.dumps(interchange_config)
 
-        self.interchange_proc = subprocess.Popen(self.interchange_launch_cmd.encode("utf-8"), stdin=subprocess.PIPE)
+        self.interchange_proc = subprocess.Popen(self.interchange_launch_cmd, stdin=subprocess.PIPE)
         stdin = self.interchange_proc.stdin
         assert stdin is not None, "Popen should have created an IO object (vs default None) because of PIPE mode"
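The matching Popen call drops the .encode("utf-8") workaround: given a list, subprocess executes the program directly and passes each element to the child verbatim, with no shell parsing. A minimal sketch of the semantics:

import subprocess

# List argv: arguments containing spaces need no quoting because
# no shell is involved.
proc = subprocess.Popen(["echo", "hello world"])
proc.wait()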
 
parsl/executors/high_throughput/interchange.py CHANGED
@@ -6,7 +6,6 @@ import os
 import pickle
 import platform
 import queue
-import random
 import signal
 import sys
 import threading
@@ -19,7 +18,9 @@ from parsl import curvezmq
 from parsl.app.errors import RemoteExceptionWrapper
 from parsl.executors.high_throughput.errors import ManagerLost, VersionMismatch
 from parsl.executors.high_throughput.manager_record import ManagerRecord
+from parsl.executors.high_throughput.manager_selector import ManagerSelector
 from parsl.monitoring.message_type import MessageType
+from parsl.monitoring.radios import MonitoringRadioSender, ZMQRadioSender
 from parsl.process_loggers import wrap_with_logs
 from parsl.serialize import serialize as serialize_object
 from parsl.utils import setproctitle
@@ -53,6 +54,7 @@ class Interchange:
                  logging_level: int,
                  poll_period: int,
                  cert_dir: Optional[str],
+                 manager_selector: ManagerSelector,
                  ) -> None:
         """
         Parameters
@@ -160,6 +162,8 @@ class Interchange:
 
         self.heartbeat_threshold = heartbeat_threshold
 
+        self.manager_selector = manager_selector
+
         self.current_platform = {'parsl_v': PARSL_VERSION,
                                  'python_v': "{}.{}.{}".format(sys.version_info.major,
                                                                sys.version_info.minor,
@@ -216,27 +220,15 @@ class Interchange:
             task_counter += 1
             logger.debug(f"Fetched {task_counter} tasks so far")
 
-    def _create_monitoring_channel(self) -> Optional[zmq.Socket]:
-        if self.hub_address and self.hub_zmq_port:
-            logger.info("Connecting to MonitoringHub")
-            # This is a one-off because monitoring is unencrypted
-            hub_channel = zmq.Context().socket(zmq.DEALER)
-            hub_channel.set_hwm(0)
-            hub_channel.connect("tcp://{}:{}".format(self.hub_address, self.hub_zmq_port))
-            logger.info("Connected to MonitoringHub")
-            return hub_channel
-        else:
-            return None
-
-    def _send_monitoring_info(self, hub_channel: Optional[zmq.Socket], manager: ManagerRecord) -> None:
-        if hub_channel:
+    def _send_monitoring_info(self, monitoring_radio: Optional[MonitoringRadioSender], manager: ManagerRecord) -> None:
+        if monitoring_radio:
             logger.info("Sending message {} to MonitoringHub".format(manager))
 
             d: Dict = cast(Dict, manager.copy())
             d['timestamp'] = datetime.datetime.now()
             d['last_heartbeat'] = datetime.datetime.fromtimestamp(d['last_heartbeat'])
 
-            hub_channel.send_pyobj((MessageType.NODE_INFO, d))
+            monitoring_radio.send((MessageType.NODE_INFO, d))
 
     @wrap_with_logs(target="interchange")
     def _command_server(self) -> NoReturn:
@@ -244,8 +236,11 @@ class Interchange:
         """
         logger.debug("Command Server Starting")
 
-        # Need to create a new ZMQ socket for command server thread
-        hub_channel = self._create_monitoring_channel()
+        if self.hub_address is not None and self.hub_zmq_port is not None:
+            logger.debug("Creating monitoring radio to %s:%s", self.hub_address, self.hub_zmq_port)
+            monitoring_radio = ZMQRadioSender(self.hub_address, self.hub_zmq_port)
+        else:
+            monitoring_radio = None
 
         reply: Any  # the type of reply depends on the command_req received (aka this needs dependent types...)
 
@@ -295,7 +290,7 @@ class Interchange:
                     if manager_id in self._ready_managers:
                         m = self._ready_managers[manager_id]
                         m['active'] = False
-                        self._send_monitoring_info(hub_channel, m)
+                        self._send_monitoring_info(monitoring_radio, m)
                     else:
                         logger.warning("Worker to hold was not in ready managers list")
 
@@ -330,9 +325,14 @@ class Interchange:
         # parent-process-inheritance problems.
         signal.signal(signal.SIGTERM, signal.SIG_DFL)
 
-        logger.info("Incoming ports bound")
+        logger.info("Starting main interchange method")
 
-        hub_channel = self._create_monitoring_channel()
+        if self.hub_address is not None and self.hub_zmq_port is not None:
+            logger.debug("Creating monitoring radio to %s:%s", self.hub_address, self.hub_zmq_port)
+            monitoring_radio = ZMQRadioSender(self.hub_address, self.hub_zmq_port)
+            logger.debug("Created monitoring radio")
+        else:
+            monitoring_radio = None
 
         poll_period = self.poll_period
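Both the command-server thread and the main loop now construct their own ZMQRadioSender instead of hand-building DEALER sockets, keeping connection details inside the radios module. A hedged usage sketch, with illustrative address, port, and payload (the constructor arguments and send() call mirror their use in this diff):

from parsl.monitoring.message_type import MessageType
from parsl.monitoring.radios import ZMQRadioSender

radio = ZMQRadioSender("127.0.0.1", 55055)  # hub address and ZMQ port
radio.send((MessageType.NODE_INFO, {"block_id": "0"}))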
 
@@ -363,10 +363,10 @@ class Interchange:
         while not kill_event.is_set():
             self.socks = dict(poller.poll(timeout=poll_period))
 
-            self.process_task_outgoing_incoming(interesting_managers, hub_channel, kill_event)
-            self.process_results_incoming(interesting_managers, hub_channel)
-            self.expire_bad_managers(interesting_managers, hub_channel)
-            self.expire_drained_managers(interesting_managers, hub_channel)
+            self.process_task_outgoing_incoming(interesting_managers, monitoring_radio, kill_event)
+            self.process_results_incoming(interesting_managers, monitoring_radio)
+            self.expire_bad_managers(interesting_managers, monitoring_radio)
+            self.expire_drained_managers(interesting_managers, monitoring_radio)
             self.process_tasks_to_send(interesting_managers)
 
         self.zmq_context.destroy()
@@ -377,7 +377,7 @@ class Interchange:
     def process_task_outgoing_incoming(
             self,
             interesting_managers: Set[bytes],
-            hub_channel: Optional[zmq.Socket],
+            monitoring_radio: Optional[MonitoringRadioSender],
             kill_event: threading.Event
     ) -> None:
         """Process one message from manager on the task_outgoing channel.
@@ -410,6 +410,7 @@ class Interchange:
                 self._ready_managers[manager_id] = {'last_heartbeat': time.time(),
                                                     'idle_since': time.time(),
                                                     'block_id': None,
+                                                    'start_time': msg['start_time'],
                                                     'max_capacity': 0,
                                                     'worker_count': 0,
                                                     'active': True,
@@ -430,7 +431,7 @@ class Interchange:
                 m.update(msg)  # type: ignore[typeddict-item]
 
                 logger.info("Registration info for manager {!r}: {}".format(manager_id, msg))
-                self._send_monitoring_info(hub_channel, m)
+                self._send_monitoring_info(monitoring_radio, m)
 
                 if (msg['python_v'].rsplit(".", 1)[0] != self.current_platform['python_v'].rsplit(".", 1)[0] or
                     msg['parsl_v'] != self.current_platform['parsl_v']):
@@ -461,7 +462,7 @@ class Interchange:
             logger.error(f"Unexpected message type received from manager: {msg['type']}")
         logger.debug("leaving task_outgoing section")
 
-    def expire_drained_managers(self, interesting_managers: Set[bytes], hub_channel: Optional[zmq.Socket]) -> None:
+    def expire_drained_managers(self, interesting_managers: Set[bytes], monitoring_radio: Optional[MonitoringRadioSender]) -> None:
 
         for manager_id in list(interesting_managers):
             # is it always true that a draining manager will be in interesting managers?
@@ -474,7 +475,7 @@ class Interchange:
                 self._ready_managers.pop(manager_id)
 
                 m['active'] = False
-                self._send_monitoring_info(hub_channel, m)
+                self._send_monitoring_info(monitoring_radio, m)
 
     def process_tasks_to_send(self, interesting_managers: Set[bytes]) -> None:
         # Check if there are tasks that could be sent to managers
@@ -484,8 +485,7 @@ class Interchange:
                        interesting=len(interesting_managers)))
 
         if interesting_managers and not self.pending_task_queue.empty():
-            shuffled_managers = list(interesting_managers)
-            random.shuffle(shuffled_managers)
+            shuffled_managers = self.manager_selector.sort_managers(self._ready_managers, interesting_managers)
 
             while shuffled_managers and not self.pending_task_queue.empty():  # cf. the if statement above...
                 manager_id = shuffled_managers.pop()
@@ -518,7 +518,7 @@ class Interchange:
         else:
             logger.debug("either no interesting managers or no tasks, so skipping manager pass")
 
-    def process_results_incoming(self, interesting_managers: Set[bytes], hub_channel: Optional[zmq.Socket]) -> None:
+    def process_results_incoming(self, interesting_managers: Set[bytes], monitoring_radio: Optional[MonitoringRadioSender]) -> None:
         # Receive any results and forward to client
         if self.results_incoming in self.socks and self.socks[self.results_incoming] == zmq.POLLIN:
             logger.debug("entering results_incoming section")
@@ -538,11 +538,11 @@ class Interchange:
                 elif r['type'] == 'monitoring':
                     # the monitoring code makes the assumption that no
                     # monitoring messages will be received if monitoring
-                    # is not configured, and that hub_channel will only
+                    # is not configured, and that monitoring_radio will only
                     # be None when monitoring is not configurated.
-                    assert hub_channel is not None
+                    assert monitoring_radio is not None
 
-                    hub_channel.send_pyobj(r['payload'])
+                    monitoring_radio.send(r['payload'])
                 elif r['type'] == 'heartbeat':
                     logger.debug(f"Manager {manager_id!r} sent heartbeat via results connection")
                     b_messages.append((p_message, r))
@@ -586,7 +586,7 @@ class Interchange:
                 interesting_managers.add(manager_id)
         logger.debug("leaving results_incoming section")
 
-    def expire_bad_managers(self, interesting_managers: Set[bytes], hub_channel: Optional[zmq.Socket]) -> None:
+    def expire_bad_managers(self, interesting_managers: Set[bytes], monitoring_radio: Optional[MonitoringRadioSender]) -> None:
         bad_managers = [(manager_id, m) for (manager_id, m) in self._ready_managers.items() if
                         time.time() - m['last_heartbeat'] > self.heartbeat_threshold]
         for (manager_id, m) in bad_managers:
@@ -594,7 +594,7 @@ class Interchange:
             logger.warning(f"Too many heartbeats missed for manager {manager_id!r} - removing manager")
             if m['active']:
                 m['active'] = False
-                self._send_monitoring_info(hub_channel, m)
+                self._send_monitoring_info(monitoring_radio, m)
 
             logger.warning(f"Cancelling htex tasks {m['tasks']} on removed manager")
             for tid in m['tasks']:
parsl/executors/high_throughput/manager_record.py CHANGED
@@ -6,6 +6,7 @@ from typing_extensions import TypedDict
 
 class ManagerRecord(TypedDict, total=False):
     block_id: Optional[str]
+    start_time: float
     tasks: List[Any]
     worker_count: int
     max_capacity: int
parsl/executors/high_throughput/manager_selector.py ADDED
@@ -0,0 +1,25 @@
+import random
+from abc import ABCMeta, abstractmethod
+from typing import Dict, List, Set
+
+from parsl.executors.high_throughput.manager_record import ManagerRecord
+
+
+class ManagerSelector(metaclass=ABCMeta):
+
+    @abstractmethod
+    def sort_managers(self, ready_managers: Dict[bytes, ManagerRecord], manager_list: Set[bytes]) -> List[bytes]:
+        """ Sort a given list of managers.
+
+        Any operations pertaining to the sorting and rearrangement of the
+        interesting_managers Set should be performed here.
+        """
+        pass
+
+
+class RandomManagerSelector(ManagerSelector):
+
+    def sort_managers(self, ready_managers: Dict[bytes, ManagerRecord], manager_list: Set[bytes]) -> List[bytes]:
+        c_manager_list = list(manager_list)
+        random.shuffle(c_manager_list)
+        return c_manager_list
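This new module makes the interchange's task-routing policy pluggable: process_tasks_to_send() calls sort_managers() and then pops managers off the end of the returned list. A hedged sketch of a custom policy built on the new start_time field (OldestFirstManagerSelector is illustrative, not shipped with parsl):

from typing import Dict, List, Set

from parsl.executors import HighThroughputExecutor
from parsl.executors.high_throughput.manager_record import ManagerRecord
from parsl.executors.high_throughput.manager_selector import ManagerSelector


class OldestFirstManagerSelector(ManagerSelector):
    """Illustrative: send tasks to the longest-running managers first.

    The interchange pops from the end of the returned list, so the
    preferred (oldest) managers are sorted to the back.
    """

    def sort_managers(self, ready_managers: Dict[bytes, ManagerRecord], manager_list: Set[bytes]) -> List[bytes]:
        return sorted(manager_list, key=lambda m: ready_managers[m]['start_time'], reverse=True)


htex = HighThroughputExecutor(label="htex", manager_selector=OldestFirstManagerSelector())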
parsl/executors/high_throughput/process_worker_pool.py CHANGED
@@ -184,6 +184,7 @@ class Manager:
 
         self.uid = uid
         self.block_id = block_id
+        self.start_time = time.time()
 
         self.enable_mpi_mode = enable_mpi_mode
         self.mpi_launcher = mpi_launcher
@@ -263,6 +264,7 @@ class Manager:
                'worker_count': self.worker_count,
                'uid': self.uid,
                'block_id': self.block_id,
+               'start_time': self.start_time,
                'prefetch_capacity': self.prefetch_capacity,
                'max_capacity': self.worker_count + self.prefetch_capacity,
                'os': platform.system(),