parsl 2024.7.22__py3-none-any.whl → 2024.8.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34)
  1. parsl/dataflow/dflow.py +4 -10
  2. parsl/executors/base.py +8 -8
  3. parsl/executors/flux/executor.py +7 -7
  4. parsl/executors/high_throughput/executor.py +55 -55
  5. parsl/executors/high_throughput/interchange.py +37 -37
  6. parsl/executors/high_throughput/manager_record.py +1 -0
  7. parsl/executors/high_throughput/manager_selector.py +25 -0
  8. parsl/executors/high_throughput/process_worker_pool.py +2 -0
  9. parsl/executors/status_handling.py +52 -21
  10. parsl/executors/taskvine/executor.py +0 -18
  11. parsl/executors/workqueue/executor.py +0 -18
  12. parsl/monitoring/errors.py +6 -0
  13. parsl/monitoring/monitoring.py +6 -5
  14. parsl/monitoring/radios.py +23 -7
  15. parsl/monitoring/remote.py +12 -12
  16. parsl/monitoring/router.py +71 -30
  17. parsl/tests/test_htex/test_disconnected_blocks_failing_provider.py +71 -0
  18. parsl/tests/test_htex/test_htex.py +28 -19
  19. parsl/tests/test_htex/test_zmq_binding.py +2 -0
  20. parsl/tests/test_monitoring/test_basic.py +14 -1
  21. parsl/tests/test_monitoring/test_fuzz_zmq.py +2 -2
  22. parsl/tests/test_mpi_apps/test_mpiex.py +1 -1
  23. parsl/version.py +1 -1
  24. {parsl-2024.7.22.data → parsl-2024.8.5.data}/scripts/interchange.py +37 -37
  25. {parsl-2024.7.22.data → parsl-2024.8.5.data}/scripts/process_worker_pool.py +2 -0
  26. parsl-2024.8.5.dist-info/METADATA +101 -0
  27. {parsl-2024.7.22.dist-info → parsl-2024.8.5.dist-info}/RECORD +33 -30
  28. {parsl-2024.7.22.dist-info → parsl-2024.8.5.dist-info}/WHEEL +1 -1
  29. parsl-2024.7.22.dist-info/METADATA +0 -101
  30. {parsl-2024.7.22.data → parsl-2024.8.5.data}/scripts/exec_parsl_function.py +0 -0
  31. {parsl-2024.7.22.data → parsl-2024.8.5.data}/scripts/parsl_coprocess.py +0 -0
  32. {parsl-2024.7.22.dist-info → parsl-2024.8.5.dist-info}/LICENSE +0 -0
  33. {parsl-2024.7.22.dist-info → parsl-2024.8.5.dist-info}/entry_points.txt +0 -0
  34. {parsl-2024.7.22.dist-info → parsl-2024.8.5.dist-info}/top_level.txt +0 -0
parsl/executors/status_handling.py

@@ -12,7 +12,7 @@ from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple, Union
 from parsl.executors.base import ParslExecutor
 from parsl.executors.errors import BadStateException, ScalingFailed
 from parsl.jobs.error_handlers import noop_error_handler, simple_error_handler
-from parsl.jobs.states import JobState, JobStatus
+from parsl.jobs.states import TERMINAL_STATES, JobState, JobStatus
 from parsl.monitoring.message_type import MessageType
 from parsl.providers.base import ExecutionProvider
 from parsl.utils import AtomicIDCounter
@@ -167,41 +167,82 @@ class BlockProviderExecutor(ParslExecutor):
     def provider(self):
         return self._provider
 
-    def _filter_scale_in_ids(self, to_kill, killed):
+    def _filter_scale_in_ids(self, to_kill: Sequence[Any], killed: Sequence[bool]) -> Sequence[Any]:
         """ Filter out job id's that were not killed
         """
         assert len(to_kill) == len(killed)
+
+        if False in killed:
+            killed_job_ids = [jid for jid, k in zip(to_kill, killed) if k]
+            not_killed_job_ids = [jid for jid, k in zip(to_kill, killed) if not k]
+            logger.warning("Some jobs were not killed successfully: "
+                           f"killed jobs: {killed_job_ids}, "
+                           f"not-killed jobs: {not_killed_job_ids}")
+
         # Filters first iterable by bool values in second
         return list(compress(to_kill, killed))
 
-    def _scale_out(self, blocks: int = 1) -> List[str]:
+    def scale_out_facade(self, n: int) -> List[str]:
         """Scales out the number of blocks by "blocks"
         """
         if not self.provider:
             raise ScalingFailed(self, "No execution provider available")
         block_ids = []
-        logger.info(f"Scaling out by {blocks} blocks")
-        for _ in range(blocks):
+        monitoring_status_changes = {}
+        logger.info(f"Scaling out by {n} blocks")
+        for _ in range(n):
             block_id = str(self._block_id_counter.get_id())
             logger.info(f"Allocated block ID {block_id}")
             try:
                 job_id = self._launch_block(block_id)
+
+                pending_status = JobStatus(JobState.PENDING)
+
                 self.blocks_to_job_id[block_id] = job_id
                 self.job_ids_to_block[job_id] = block_id
+                self._status[block_id] = pending_status
+
+                monitoring_status_changes[block_id] = pending_status
                 block_ids.append(block_id)
+
             except Exception as ex:
-                self._simulated_status[block_id] = JobStatus(JobState.FAILED, "Failed to start block {}: {}".format(block_id, ex))
+                failed_status = JobStatus(JobState.FAILED, "Failed to start block {}: {}".format(block_id, ex))
+                self._simulated_status[block_id] = failed_status
+                self._status[block_id] = failed_status
+
+        self.send_monitoring_info(monitoring_status_changes)
         return block_ids
 
-    @abstractmethod
     def scale_in(self, blocks: int) -> List[str]:
         """Scale in method.
 
         Cause the executor to reduce the number of blocks by count.
 
+        The default implementation will kill blocks without regard to their
+        status or whether they are executing tasks. Executors with more
+        nuanced scaling strategies might overload this method to work with
+        that strategy - see the HighThroughputExecutor for an example of that.
+
         :return: A list of block ids corresponding to the blocks that were removed.
         """
-        pass
+
+        active_blocks = [block_id for block_id, status in self._status.items()
+                         if status.state not in TERMINAL_STATES]
+
+        block_ids_to_kill = active_blocks[:blocks]
+
+        job_ids_to_kill = [self.blocks_to_job_id[block] for block in block_ids_to_kill]
+
+        # Cancel the blocks provisioned
+        if self.provider:
+            logger.info(f"Scaling in jobs: {job_ids_to_kill}")
+            r = self.provider.cancel(job_ids_to_kill)
+            job_ids = self._filter_scale_in_ids(job_ids_to_kill, r)
+            block_ids_killed = [self.job_ids_to_block[job_id] for job_id in job_ids]
+            return block_ids_killed
+        else:
+            logger.error("No execution provider available to scale in")
+            return []
 
     def _launch_block(self, block_id: str) -> Any:
         launch_cmd = self._get_launch_command(block_id)
@@ -235,10 +276,10 @@ class BlockProviderExecutor(ParslExecutor):
 
     def send_monitoring_info(self, status: Dict) -> None:
         # Send monitoring info for HTEX when monitoring enabled
-        if self.monitoring_radio:
+        if self.submit_monitoring_radio:
             msg = self.create_monitoring_info(status)
-            logger.debug("Sending message {} to hub from job status poller".format(msg))
-            self.monitoring_radio.send((MessageType.BLOCK_INFO, msg))
+            logger.debug("Sending block monitoring message: %r", msg)
+            self.submit_monitoring_radio.send((MessageType.BLOCK_INFO, msg))
 
     def create_monitoring_info(self, status: Dict[str, JobStatus]) -> Sequence[object]:
         """Create a monitoring message for each block based on the poll status.
@@ -310,13 +351,3 @@ class BlockProviderExecutor(ParslExecutor):
             del self._status[block_id]
         self.send_monitoring_info(new_status)
         return block_ids
-
-    def scale_out_facade(self, n: int) -> List[str]:
-        block_ids = self._scale_out(n)
-        if block_ids is not None:
-            new_status = {}
-            for block_id in block_ids:
-                new_status[block_id] = JobStatus(JobState.PENDING)
-            self.send_monitoring_info(new_status)
-            self._status.update(new_status)
-        return block_ids
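
Note: the retained comment in the hunk above ("Filters first iterable by bool values in second") describes itertools.compress. A minimal sketch of the pattern _filter_scale_in_ids relies on, with made-up job ids and cancel results:

from itertools import compress

to_kill = ["job-1", "job-2", "job-3"]   # hypothetical provider job ids
killed = [True, False, True]            # per-job results from provider.cancel

# Jobs whose cancellation succeeded -- what _filter_scale_in_ids returns:
assert list(compress(to_kill, killed)) == ["job-1", "job-3"]

# The new warning path partitions the same two lists:
not_killed = [jid for jid, k in zip(to_kill, killed) if not k]
assert not_killed == ["job-2"]
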
parsl/executors/taskvine/executor.py

@@ -573,24 +573,6 @@ class TaskVineExecutor(BlockProviderExecutor, putils.RepresentationMixin):
     def workers_per_node(self) -> Union[int, float]:
         return 1
 
-    def scale_in(self, count: int) -> List[str]:
-        """Scale in method. Cancel a given number of blocks
-        """
-        # Obtain list of blocks to kill
-        to_kill = list(self.blocks_to_job_id.keys())[:count]
-        kill_ids = [self.blocks_to_job_id[block] for block in to_kill]
-
-        # Cancel the blocks provisioned
-        if self.provider:
-            logger.info(f"Scaling in jobs: {kill_ids}")
-            r = self.provider.cancel(kill_ids)
-            job_ids = self._filter_scale_in_ids(kill_ids, r)
-            block_ids_killed = [self.job_ids_to_block[jid] for jid in job_ids]
-            return block_ids_killed
-        else:
-            logger.error("No execution provider available to scale")
-            return []
-
     def shutdown(self, *args, **kwargs):
         """Shutdown the executor. Sets flag to cancel the submit process and
         collector thread, which shuts down the TaskVine system submission.
parsl/executors/workqueue/executor.py

@@ -689,24 +689,6 @@ class WorkQueueExecutor(BlockProviderExecutor, putils.RepresentationMixin):
     def workers_per_node(self) -> Union[int, float]:
         return self.scaling_cores_per_worker
 
-    def scale_in(self, count: int) -> List[str]:
-        """Scale in method.
-        """
-        # Obtain list of blocks to kill
-        to_kill = list(self.blocks_to_job_id.keys())[:count]
-        kill_ids = [self.blocks_to_job_id[block] for block in to_kill]
-
-        # Cancel the blocks provisioned
-        if self.provider:
-            logger.info(f"Scaling in jobs: {kill_ids}")
-            r = self.provider.cancel(kill_ids)
-            job_ids = self._filter_scale_in_ids(kill_ids, r)
-            block_ids_killed = [self.job_ids_to_block[jid] for jid in job_ids]
-            return block_ids_killed
-        else:
-            logger.error("No execution provider available to scale in")
-            return []
-
     def shutdown(self, *args, **kwargs):
         """Shutdown the executor. Sets flag to cancel the submit process and
         collector thread, which shuts down the Work Queue system submission.
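
Note: the TaskVine and Work Queue scale_in methods deleted above were near-identical copies of the logic now living in BlockProviderExecutor, so both executors fall through to the inherited implementation. A minimal sketch of the refactor shape, using hypothetical stand-in classes rather than the real executors:

class Base:
    def scale_in(self, blocks):
        return f"base scales in {blocks} block(s)"

class TaskVineLike(Base):   # duplicate override deleted; inherits from Base
    pass

class WorkQueueLike(Base):  # duplicate override deleted; inherits from Base
    pass

assert TaskVineLike().scale_in(2) == "base scales in 2 block(s)"
assert WorkQueueLike.scale_in is Base.scale_in
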
parsl/monitoring/errors.py

@@ -0,0 +1,6 @@
+from parsl.errors import ParslError
+
+
+class MonitoringHubStartError(ParslError):
+    def __str__(self) -> str:
+        return "Hub failed to start"
parsl/monitoring/monitoring.py

@@ -12,8 +12,9 @@ from typing import TYPE_CHECKING, Any, Optional, Tuple, Union, cast
 import typeguard
 
 from parsl.log_utils import set_file_logger
+from parsl.monitoring.errors import MonitoringHubStartError
 from parsl.monitoring.message_type import MessageType
-from parsl.monitoring.radios import MultiprocessingQueueRadio
+from parsl.monitoring.radios import MultiprocessingQueueRadioSender
 from parsl.monitoring.router import router_starter
 from parsl.monitoring.types import AddressedMonitoringMessage
 from parsl.multiprocessing import ForkProcess, SizedQueue
@@ -105,7 +106,7 @@ class MonitoringHub(RepresentationMixin):
         self.resource_monitoring_enabled = resource_monitoring_enabled
         self.resource_monitoring_interval = resource_monitoring_interval
 
-    def start(self, run_id: str, dfk_run_dir: str, config_run_dir: Union[str, os.PathLike]) -> int:
+    def start(self, run_id: str, dfk_run_dir: str, config_run_dir: Union[str, os.PathLike]) -> None:
 
         logger.debug("Starting MonitoringHub")
 
@@ -187,7 +188,7 @@ class MonitoringHub(RepresentationMixin):
         self.filesystem_proc.start()
         logger.info(f"Started filesystem radio receiver process {self.filesystem_proc.pid}")
 
-        self.radio = MultiprocessingQueueRadio(self.block_msgs)
+        self.radio = MultiprocessingQueueRadioSender(self.block_msgs)
 
         try:
             comm_q_result = comm_q.get(block=True, timeout=120)
@@ -195,7 +196,7 @@ class MonitoringHub(RepresentationMixin):
             comm_q.join_thread()
         except queue.Empty:
             logger.error("Hub has not completed initialization in 120s. Aborting")
-            raise Exception("Hub failed to start")
+            raise MonitoringHubStartError()
 
         if isinstance(comm_q_result, str):
             logger.error(f"MonitoringRouter sent an error message: {comm_q_result}")
@@ -207,7 +208,7 @@ class MonitoringHub(RepresentationMixin):
 
         logger.info("Monitoring Hub initialized")
 
-        return zmq_port
+        self.hub_zmq_port = zmq_port
 
     # TODO: tighten the Any message format
     def send(self, mtype: MessageType, message: Any) -> None:
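
Note: the -> None change and the new hub_zmq_port attribute alter the calling convention for MonitoringHub.start. An illustrative call-site adjustment, assuming hub, run_id, dfk_run_dir and config_run_dir are already in scope:

hub.start(run_id, dfk_run_dir, config_run_dir)   # now returns None
zmq_port = hub.hub_zmq_port                      # previously: zmq_port = hub.start(...)
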
parsl/monitoring/radios.py

@@ -7,6 +7,8 @@ from abc import ABCMeta, abstractmethod
 from multiprocessing.queues import Queue
 from typing import Optional
 
+import zmq
+
 from parsl.serialize import serialize
 
 _db_manager_excepts: Optional[Exception]
@@ -15,14 +17,14 @@ _db_manager_excepts: Optional[Exception]
 logger = logging.getLogger(__name__)
 
 
-class MonitoringRadio(metaclass=ABCMeta):
+class MonitoringRadioSender(metaclass=ABCMeta):
     @abstractmethod
     def send(self, message: object) -> None:
         pass
 
 
-class FilesystemRadio(MonitoringRadio):
-    """A MonitoringRadio that sends messages over a shared filesystem.
+class FilesystemRadioSender(MonitoringRadioSender):
+    """A MonitoringRadioSender that sends messages over a shared filesystem.
 
     The messsage directory structure is based on maildir,
     https://en.wikipedia.org/wiki/Maildir
@@ -36,7 +38,7 @@ class FilesystemRadio(MonitoringRadio):
     This avoids a race condition of reading partially written messages.
 
     This radio is likely to give higher shared filesystem load compared to
-    the UDPRadio, but should be much more reliable.
+    the UDP radio, but should be much more reliable.
     """
 
     def __init__(self, *, monitoring_url: str, source_id: int, timeout: int = 10, run_dir: str):
@@ -66,7 +68,7 @@ class FilesystemRadio(MonitoringRadio):
         os.rename(tmp_filename, new_filename)
 
 
-class HTEXRadio(MonitoringRadio):
+class HTEXRadioSender(MonitoringRadioSender):
 
     def __init__(self, monitoring_url: str, source_id: int, timeout: int = 10):
         """
@@ -120,7 +122,7 @@ class HTEXRadio(MonitoringRadio):
         return
 
 
-class UDPRadio(MonitoringRadio):
+class UDPRadioSender(MonitoringRadioSender):
 
     def __init__(self, monitoring_url: str, source_id: int, timeout: int = 10):
         """
@@ -174,7 +176,7 @@ class UDPRadio(MonitoringRadio):
         return
 
 
-class MultiprocessingQueueRadio(MonitoringRadio):
+class MultiprocessingQueueRadioSender(MonitoringRadioSender):
     """A monitoring radio which connects over a multiprocessing Queue.
     This radio is intended to be used on the submit side, where components
     in the submit process, or processes launched by multiprocessing, will have
@@ -186,3 +188,17 @@ class MultiprocessingQueueRadio(MonitoringRadio):
 
     def send(self, message: object) -> None:
         self.queue.put((message, 0))
+
+
+class ZMQRadioSender(MonitoringRadioSender):
+    """A monitoring radio which connects over ZMQ. This radio is not
+    thread-safe, because its use of ZMQ is not thread-safe.
+    """
+
+    def __init__(self, hub_address: str, hub_zmq_port: int) -> None:
+        self._hub_channel = zmq.Context().socket(zmq.DEALER)
+        self._hub_channel.set_hwm(0)
+        self._hub_channel.connect(f"tcp://{hub_address}:{hub_zmq_port}")
+
+    def send(self, message: object) -> None:
+        self._hub_channel.send_pyobj(message)
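
Note: a hedged usage sketch for the new ZMQRadioSender; the address and port below are placeholders, and messages are only delivered if a router ZMQ listener is receiving at that endpoint:

from parsl.monitoring.radios import ZMQRadioSender

sender = ZMQRadioSender("127.0.0.1", 55055)      # hub_address, hub_zmq_port
sender.send(("example-tag", {"block_id": "0"}))  # any picklable object

# Per the docstring above, this sender is not thread-safe: create one
# sender per thread rather than sharing a single instance.
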
parsl/monitoring/remote.py

@@ -8,10 +8,10 @@ from typing import Any, Callable, Dict, List, Sequence, Tuple
 
 from parsl.monitoring.message_type import MessageType
 from parsl.monitoring.radios import (
-    FilesystemRadio,
-    HTEXRadio,
-    MonitoringRadio,
-    UDPRadio,
+    FilesystemRadioSender,
+    HTEXRadioSender,
+    MonitoringRadioSender,
+    UDPRadioSender,
 )
 from parsl.multiprocessing import ForkProcess
 from parsl.process_loggers import wrap_with_logs
@@ -100,17 +100,17 @@ def monitor_wrapper(*,
     return (wrapped, args, new_kwargs)
 
 
-def get_radio(radio_mode: str, monitoring_hub_url: str, task_id: int, run_dir: str) -> MonitoringRadio:
-    radio: MonitoringRadio
+def get_radio(radio_mode: str, monitoring_hub_url: str, task_id: int, run_dir: str) -> MonitoringRadioSender:
+    radio: MonitoringRadioSender
     if radio_mode == "udp":
-        radio = UDPRadio(monitoring_hub_url,
-                         source_id=task_id)
+        radio = UDPRadioSender(monitoring_hub_url,
+                               source_id=task_id)
     elif radio_mode == "htex":
-        radio = HTEXRadio(monitoring_hub_url,
-                          source_id=task_id)
+        radio = HTEXRadioSender(monitoring_hub_url,
+                                source_id=task_id)
     elif radio_mode == "filesystem":
-        radio = FilesystemRadio(monitoring_url=monitoring_hub_url,
-                                source_id=task_id, run_dir=run_dir)
+        radio = FilesystemRadioSender(monitoring_url=monitoring_hub_url,
+                                      source_id=task_id, run_dir=run_dir)
     else:
         raise RuntimeError(f"Unknown radio mode: {radio_mode}")
     return radio
parsl/monitoring/router.py

@@ -5,6 +5,7 @@ import os
 import pickle
 import queue
 import socket
+import threading
 import time
 from multiprocessing.synchronize import Event
 from typing import Optional, Tuple, Union
@@ -32,7 +33,12 @@ class MonitoringRouter:
                  logdir: str = ".",
                  run_id: str,
                  logging_level: int = logging.INFO,
-                 atexit_timeout: int = 3  # in seconds
+                 atexit_timeout: int = 3,  # in seconds
+                 priority_msgs: "queue.Queue[AddressedMonitoringMessage]",
+                 node_msgs: "queue.Queue[AddressedMonitoringMessage]",
+                 block_msgs: "queue.Queue[AddressedMonitoringMessage]",
+                 resource_msgs: "queue.Queue[AddressedMonitoringMessage]",
+                 exit_event: Event,
                  ):
         """ Initializes a monitoring configuration class.
 
@@ -51,7 +57,11 @@ class MonitoringRouter:
             Logging level as defined in the logging module. Default: logging.INFO
         atexit_timeout : float, optional
             The amount of time in seconds to terminate the hub without receiving any messages, after the last dfk workflow message is received.
+        *_msgs : Queue
+            Four multiprocessing queues to receive messages, routed by type tag, and sometimes modified according to type tag.
 
+        exit_event : Event
+            An event that the main Parsl process will set to signal that the monitoring router should shut down.
         """
         os.makedirs(logdir, exist_ok=True)
         self.logger = set_file_logger("{}/monitoring_router.log".format(logdir),
@@ -93,22 +103,60 @@ class MonitoringRouter:
                                                        min_port=zmq_port_range[0],
                                                        max_port=zmq_port_range[1])
 
-    def start(self,
-              priority_msgs: "queue.Queue[AddressedMonitoringMessage]",
-              node_msgs: "queue.Queue[AddressedMonitoringMessage]",
-              block_msgs: "queue.Queue[AddressedMonitoringMessage]",
-              resource_msgs: "queue.Queue[AddressedMonitoringMessage]",
-              exit_event: Event) -> None:
+        self.priority_msgs = priority_msgs
+        self.node_msgs = node_msgs
+        self.block_msgs = block_msgs
+        self.resource_msgs = resource_msgs
+        self.exit_event = exit_event
+
+    @wrap_with_logs(target="monitoring_router")
+    def start(self) -> None:
+        self.logger.info("Starting UDP listener thread")
+        udp_radio_receiver_thread = threading.Thread(target=self.start_udp_listener, daemon=True)
+        udp_radio_receiver_thread.start()
+
+        self.logger.info("Starting ZMQ listener thread")
+        zmq_radio_receiver_thread = threading.Thread(target=self.start_zmq_listener, daemon=True)
+        zmq_radio_receiver_thread.start()
+
+        self.logger.info("Joining on ZMQ listener thread")
+        zmq_radio_receiver_thread.join()
+        self.logger.info("Joining on UDP listener thread")
+        udp_radio_receiver_thread.join()
+        self.logger.info("Joined on both ZMQ and UDP listener threads")
+
+    @wrap_with_logs(target="monitoring_router")
+    def start_udp_listener(self) -> None:
         try:
-            while not exit_event.is_set():
+            while not self.exit_event.is_set():
                 try:
                     data, addr = self.udp_sock.recvfrom(2048)
                     resource_msg = pickle.loads(data)
                     self.logger.debug("Got UDP Message from {}: {}".format(addr, resource_msg))
-                    resource_msgs.put((resource_msg, addr))
+                    self.resource_msgs.put((resource_msg, addr))
                 except socket.timeout:
                     pass
 
+            self.logger.info("UDP listener draining")
+            last_msg_received_time = time.time()
+            while time.time() - last_msg_received_time < self.atexit_timeout:
+                try:
+                    data, addr = self.udp_sock.recvfrom(2048)
+                    msg = pickle.loads(data)
+                    self.logger.debug("Got UDP Message from {}: {}".format(addr, msg))
+                    self.resource_msgs.put((msg, addr))
+                    last_msg_received_time = time.time()
+                except socket.timeout:
+                    pass
+
+            self.logger.info("UDP listener finishing normally")
+        finally:
+            self.logger.info("UDP listener finished")
+
+    @wrap_with_logs(target="monitoring_router")
+    def start_zmq_listener(self) -> None:
+        try:
+            while not self.exit_event.is_set():
                 try:
                     dfk_loop_start = time.time()
                     while time.time() - dfk_loop_start < 1.0:  # TODO make configurable
@@ -125,15 +173,15 @@ class MonitoringRouter:
 
                         if msg[0] == MessageType.NODE_INFO:
                             msg[1]['run_id'] = self.run_id
-                            node_msgs.put(msg_0)
+                            self.node_msgs.put(msg_0)
                         elif msg[0] == MessageType.RESOURCE_INFO:
-                            resource_msgs.put(msg_0)
+                            self.resource_msgs.put(msg_0)
                         elif msg[0] == MessageType.BLOCK_INFO:
-                            block_msgs.put(msg_0)
+                            self.block_msgs.put(msg_0)
                         elif msg[0] == MessageType.TASK_INFO:
-                            priority_msgs.put(msg_0)
+                            self.priority_msgs.put(msg_0)
                         elif msg[0] == MessageType.WORKFLOW_INFO:
-                            priority_msgs.put(msg_0)
+                            self.priority_msgs.put(msg_0)
                         else:
                             # There is a type: ignore here because if msg[0]
                             # is of the correct type, this code is unreachable,
@@ -151,21 +199,9 @@ class MonitoringRouter:
                     # thing to do.
                     self.logger.warning("Failure processing a ZMQ message", exc_info=True)
 
-            self.logger.info("Monitoring router draining")
-            last_msg_received_time = time.time()
-            while time.time() - last_msg_received_time < self.atexit_timeout:
-                try:
-                    data, addr = self.udp_sock.recvfrom(2048)
-                    msg = pickle.loads(data)
-                    self.logger.debug("Got UDP Message from {}: {}".format(addr, msg))
-                    resource_msgs.put((msg, addr))
-                    last_msg_received_time = time.time()
-                except socket.timeout:
-                    pass
-
-            self.logger.info("Monitoring router finishing normally")
+            self.logger.info("ZMQ listener finishing normally")
         finally:
-            self.logger.info("Monitoring router finished")
+            self.logger.info("ZMQ listener finished")
 
 
 @wrap_with_logs
@@ -191,7 +227,12 @@ def router_starter(comm_q: "queue.Queue[Union[Tuple[int, int], str]]",
                                   zmq_port_range=zmq_port_range,
                                   logdir=logdir,
                                   logging_level=logging_level,
-                                  run_id=run_id)
+                                  run_id=run_id,
+                                  priority_msgs=priority_msgs,
+                                  node_msgs=node_msgs,
+                                  block_msgs=block_msgs,
+                                  resource_msgs=resource_msgs,
+                                  exit_event=exit_event)
     except Exception as e:
         logger.error("MonitoringRouter construction failed.", exc_info=True)
         comm_q.put(f"Monitoring router construction failed: {e}")
@@ -200,7 +241,7 @@ def router_starter(comm_q: "queue.Queue[Union[Tuple[int, int], str]]",
 
     router.logger.info("Starting MonitoringRouter in router_starter")
     try:
-        router.start(priority_msgs, node_msgs, block_msgs, resource_msgs, exit_event)
+        router.start()
    except Exception as e:
         router.logger.exception("router.start exception")
         exception_q.put(('Hub', str(e)))
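
Note: a minimal generic sketch (not parsl code) of the pattern start() now uses: two daemon listener threads share one exit event, and start() blocks joining them:

import threading

exit_event = threading.Event()

def listener(name: str) -> None:
    while not exit_event.is_set():
        exit_event.wait(0.1)   # stand-in for a recvfrom/ZMQ poll with timeout
    print(f"{name} listener finished")

threads = [threading.Thread(target=listener, args=(name,), daemon=True)
           for name in ("udp", "zmq")]
for t in threads:
    t.start()
exit_event.set()               # in parsl, set by the main process at shutdown
for t in threads:
    t.join()
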
parsl/tests/test_htex/test_disconnected_blocks_failing_provider.py

@@ -0,0 +1,71 @@
+import logging
+
+import pytest
+
+import parsl
+from parsl import Config
+from parsl.executors import HighThroughputExecutor
+from parsl.executors.errors import BadStateException
+from parsl.jobs.states import JobState, JobStatus
+from parsl.providers import LocalProvider
+
+
+class FailingProvider(LocalProvider):
+    def submit(*args, **kwargs):
+        raise RuntimeError("Deliberate failure of provider.submit")
+
+
+def local_config():
+    """Config to simulate failing blocks without connecting"""
+    return Config(
+        executors=[
+            HighThroughputExecutor(
+                label="HTEX",
+                heartbeat_period=1,
+                heartbeat_threshold=2,
+                poll_period=100,
+                max_workers_per_node=1,
+                provider=FailingProvider(
+                    init_blocks=0,
+                    max_blocks=2,
+                    min_blocks=0,
+                ),
+            )
+        ],
+        max_idletime=0.5,
+        strategy='htex_auto_scale',
+        strategy_period=0.1
+        # this strategy period needs to be a few times smaller than the
+        # status_polling_interval of FailingProvider, which is 5s at
+        # time of writing
+    )
+
+
+@parsl.python_app
+def double(x):
+    return x * 2
+
+
+@pytest.mark.local
+def test_disconnected_blocks():
+    """Test reporting of blocks that fail to connect from HTEX"""
+    dfk = parsl.dfk()
+    executor = dfk.executors["HTEX"]
+
+    connected_blocks = executor.connected_blocks()
+    assert not connected_blocks, "Expected 0 blocks"
+
+    future = double(5)
+    with pytest.raises(BadStateException):
+        future.result()
+
+    assert isinstance(future.exception(), BadStateException)
+
+    status_dict = executor.status()
+    assert len(status_dict) == 1, "Expected exactly 1 block"
+    for status in status_dict.values():
+        assert isinstance(status, JobStatus)
+        assert status.state == JobState.MISSING
+
+    connected_blocks = executor.connected_blocks()
+    assert connected_blocks == [], "Expected exactly 0 connected blocks"