parsl 2024.3.18__py3-none-any.whl → 2024.4.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35)
  1. parsl/dataflow/dflow.py +35 -36
  2. parsl/executors/base.py +11 -1
  3. parsl/executors/high_throughput/executor.py +8 -20
  4. parsl/executors/high_throughput/process_worker_pool.py +5 -2
  5. parsl/executors/status_handling.py +8 -15
  6. parsl/executors/taskvine/executor.py +35 -11
  7. parsl/executors/workqueue/executor.py +33 -11
  8. parsl/jobs/error_handlers.py +1 -1
  9. parsl/jobs/job_status_poller.py +12 -11
  10. parsl/jobs/strategy.py +31 -18
  11. parsl/monitoring/monitoring.py +27 -237
  12. parsl/monitoring/router.py +208 -0
  13. parsl/tests/site_tests/test_provider.py +1 -1
  14. parsl/tests/test_htex/test_disconnected_blocks.py +0 -1
  15. parsl/tests/test_htex/test_drain.py +1 -0
  16. parsl/tests/test_monitoring/test_fuzz_zmq.py +2 -2
  17. parsl/tests/test_monitoring/test_htex_init_blocks_vs_monitoring.py +85 -0
  18. parsl/tests/test_python_apps/test_context_manager.py +40 -0
  19. parsl/tests/test_scaling/test_shutdown_scalein.py +78 -0
  20. parsl/tests/test_shutdown/test_kill_monitoring.py +65 -0
  21. parsl/version.py +1 -1
  22. {parsl-2024.3.18.data → parsl-2024.4.1.data}/scripts/process_worker_pool.py +5 -2
  23. {parsl-2024.3.18.dist-info → parsl-2024.4.1.dist-info}/METADATA +4 -4
  24. {parsl-2024.3.18.dist-info → parsl-2024.4.1.dist-info}/RECORD +35 -30
  25. /parsl/tests/{test_data → test_shutdown}/__init__.py +0 -0
  26. /parsl/tests/{test_data → test_staging}/test_file.py +0 -0
  27. /parsl/tests/{test_data → test_staging}/test_file_apps.py +0 -0
  28. /parsl/tests/{test_data → test_staging}/test_file_staging.py +0 -0
  29. /parsl/tests/{test_data → test_staging}/test_output_chain_filenames.py +0 -0
  30. {parsl-2024.3.18.data → parsl-2024.4.1.data}/scripts/exec_parsl_function.py +0 -0
  31. {parsl-2024.3.18.data → parsl-2024.4.1.data}/scripts/parsl_coprocess.py +0 -0
  32. {parsl-2024.3.18.dist-info → parsl-2024.4.1.dist-info}/LICENSE +0 -0
  33. {parsl-2024.3.18.dist-info → parsl-2024.4.1.dist-info}/WHEEL +0 -0
  34. {parsl-2024.3.18.dist-info → parsl-2024.4.1.dist-info}/entry_points.txt +0 -0
  35. {parsl-2024.3.18.dist-info → parsl-2024.4.1.dist-info}/top_level.txt +0 -0
parsl/jobs/strategy.py CHANGED
@@ -129,8 +129,8 @@ class Strategy:
         self.executors = {}
         self.max_idletime = max_idletime
 
-        self.strategies = {None: self._strategy_noop,
-                           'none': self._strategy_noop,
+        self.strategies = {None: self._strategy_init_only,
+                           'none': self._strategy_init_only,
                            'simple': self._strategy_simple,
                            'htex_auto_scale': self._strategy_htex_auto_scale}
 
@@ -146,15 +146,22 @@ class Strategy:
         for executor in executors:
             self.executors[executor.label] = {'idle_since': None}
 
-    def _strategy_noop(self, status: List[jsp.PollItem]) -> None:
-        """Do nothing.
+    def _strategy_init_only(self, executor_facades: List[jsp.PolledExecutorFacade]) -> None:
+        """Scale up to init_blocks at the start, then nothing more.
         """
-        logger.debug("strategy_noop: doing nothing")
+        for ef in executor_facades:
+            if ef.first:
+                executor = ef.executor
+                logger.debug(f"strategy_init_only: scaling out {executor.provider.init_blocks} initial blocks for {executor.label}")
+                ef.scale_out(executor.provider.init_blocks)
+                ef.first = False
+            else:
+                logger.debug("strategy_init_only: doing nothing")
 
-    def _strategy_simple(self, status_list: List[jsp.PollItem]) -> None:
-        self._general_strategy(status_list, strategy_type='simple')
+    def _strategy_simple(self, executor_facades: List[jsp.PolledExecutorFacade]) -> None:
+        self._general_strategy(executor_facades, strategy_type='simple')
 
-    def _strategy_htex_auto_scale(self, status_list: List[jsp.PollItem]) -> None:
+    def _strategy_htex_auto_scale(self, executor_facades: List[jsp.PolledExecutorFacade]) -> None:
         """HTEX specific auto scaling strategy
 
         This strategy works only for HTEX. This strategy will scale out by
@@ -169,24 +176,30 @@ class Strategy:
         expected to scale in effectively only when # of workers, or tasks executing
         per block is close to 1.
         """
-        self._general_strategy(status_list, strategy_type='htex')
+        self._general_strategy(executor_facades, strategy_type='htex')
 
     @wrap_with_logs
-    def _general_strategy(self, status_list, *, strategy_type):
-        logger.debug(f"general strategy starting with strategy_type {strategy_type} for {len(status_list)} executors")
+    def _general_strategy(self, executor_facades, *, strategy_type):
+        logger.debug(f"general strategy starting with strategy_type {strategy_type} for {len(executor_facades)} executors")
 
-        for exec_status in status_list:
-            executor = exec_status.executor
+        for ef in executor_facades:
+            executor = ef.executor
             label = executor.label
             if not isinstance(executor, BlockProviderExecutor):
                 logger.debug(f"Not strategizing for executor {label} because scaling not enabled")
                 continue
             logger.debug(f"Strategizing for executor {label}")
 
+            if ef.first:
+                executor = ef.executor
+                logger.debug(f"Scaling out {executor.provider.init_blocks} initial blocks for {label}")
+                ef.scale_out(executor.provider.init_blocks)
+                ef.first = False
+
             # Tasks that are either pending completion
             active_tasks = executor.outstanding
 
-            status = exec_status.status
+            status = ef.status
 
             # FIXME we need to handle case where provider does not define these
             # FIXME probably more of this logic should be moved to the provider
@@ -242,7 +255,7 @@ class Strategy:
                         # We have resources idle for the max duration,
                         # we have to scale_in now.
                         logger.debug(f"Idle time has reached {self.max_idletime}s for executor {label}; scaling in")
-                        exec_status.scale_in(active_blocks - min_blocks)
+                        ef.scale_in(active_blocks - min_blocks)
 
                     else:
                         logger.debug(
@@ -265,7 +278,7 @@ class Strategy:
                     excess_blocks = math.ceil(float(excess_slots) / (tasks_per_node * nodes_per_block))
                     excess_blocks = min(excess_blocks, max_blocks - active_blocks)
                     logger.debug(f"Requesting {excess_blocks} more blocks")
-                    exec_status.scale_out(excess_blocks)
+                    ef.scale_out(excess_blocks)
 
             elif active_slots == 0 and active_tasks > 0:
                 logger.debug("Strategy case 4a: No active slots but some active tasks - could scale out by a single block")
@@ -274,7 +287,7 @@ class Strategy:
                 if active_blocks < max_blocks:
                     logger.debug("Requesting single block")
 
-                    exec_status.scale_out(1)
+                    ef.scale_out(1)
                 else:
                     logger.debug("Not requesting single block, because at maxblocks already")
 
@@ -290,7 +303,7 @@ class Strategy:
                             excess_blocks = math.ceil(float(excess_slots) / (tasks_per_node * nodes_per_block))
                             excess_blocks = min(excess_blocks, active_blocks - min_blocks)
                             logger.debug(f"Requesting scaling in by {excess_blocks} blocks with idle time {self.max_idletime}s")
-                            exec_status.scale_in(excess_blocks, max_idletime=self.max_idletime)
+                            ef.scale_in(excess_blocks, max_idletime=self.max_idletime)
                     else:
                         logger.error("This strategy does not support scaling in except for HighThroughputExecutor - taking no action")
                 else:
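
Note (added commentary, not part of the upstream diff): under this change the 'none' strategy is no longer a pure no-op. On the first poll, _strategy_init_only scales each scalable executor out to its provider's init_blocks, and _general_strategy does the same before applying the simple/htex rules, so initial block submission is now driven by the strategy poller. A minimal sketch of a configuration that exercises the init-only path; the executor label and block counts are illustrative and not taken from this diff:

import parsl
from parsl.config import Config
from parsl.executors import HighThroughputExecutor
from parsl.providers import LocalProvider

config = Config(
    executors=[HighThroughputExecutor(
        label="htex_local",  # illustrative label
        provider=LocalProvider(init_blocks=2, min_blocks=0, max_blocks=4),
    )],
    strategy='none',  # now handled by _strategy_init_only: scale to init_blocks, then hold
)

parsl.load(config)
# ... submit and wait on apps; the pool stays at the 2 init_blocks for the whole run ...
parsl.dfk().cleanup()
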
parsl/monitoring/monitoring.py CHANGED
@@ -1,17 +1,13 @@
 from __future__ import annotations
 
 import os
-import socket
 import time
-import pickle
 import logging
 import typeguard
 import zmq
 
 import queue
 
-import parsl.monitoring.remote
-
 from parsl.multiprocessing import ForkProcess, SizedQueue
 from multiprocessing import Process
 from multiprocessing.queues import Queue
@@ -22,9 +18,10 @@ from parsl.utils import setproctitle
 
 from parsl.serialize import deserialize
 
+from parsl.monitoring.router import router_starter
 from parsl.monitoring.message_type import MessageType
-from parsl.monitoring.types import AddressedMonitoringMessage, TaggedMonitoringMessage
-from typing import cast, Any, Callable, Dict, Optional, Sequence, Tuple, Union, TYPE_CHECKING
+from parsl.monitoring.types import AddressedMonitoringMessage
+from typing import cast, Any, Optional, Tuple, Union, TYPE_CHECKING
 
 _db_manager_excepts: Optional[Exception]
 
@@ -93,8 +90,6 @@ class MonitoringHub(RepresentationMixin):
              Default: 30 seconds
         """
 
-        self.logger = logger
-
         # Any is used to disable typechecking on uses of _dfk_channel,
         # because it is used in the code as if it points to a channel, but
         # the static type is that it can also be None. The code relies on
@@ -120,6 +115,8 @@ class MonitoringHub(RepresentationMixin):
 
     def start(self, run_id: str, dfk_run_dir: str, config_run_dir: Union[str, os.PathLike]) -> int:
 
+        logger.debug("Starting MonitoringHub")
+
         if self.logdir is None:
             self.logdir = "."
 
@@ -128,9 +125,6 @@ class MonitoringHub(RepresentationMixin):
 
         os.makedirs(self.logdir, exist_ok=True)
 
-        # Initialize the ZMQ pipe to the Parsl Client
-
-        self.logger.debug("Initializing ZMQ Pipes to client")
         self.monitoring_hub_active = True
 
         # This annotation is incompatible with typeguard 4.x instrumentation
@@ -166,8 +160,8 @@ class MonitoringHub(RepresentationMixin):
         self.router_proc = ForkProcess(target=router_starter,
                                        args=(comm_q, self.exception_q, self.priority_msgs, self.node_msgs, self.block_msgs, self.resource_msgs),
                                        kwargs={"hub_address": self.hub_address,
-                                               "hub_port": self.hub_port,
-                                               "hub_port_range": self.hub_port_range,
+                                               "udp_port": self.hub_port,
+                                               "zmq_port_range": self.hub_port_range,
                                                "logdir": self.logdir,
                                                "logging_level": logging.DEBUG if self.monitoring_debug else logging.INFO,
                                                "run_id": run_id
@@ -187,7 +181,7 @@ class MonitoringHub(RepresentationMixin):
                                     daemon=True,
                                     )
         self.dbm_proc.start()
-        self.logger.info("Started the router process {} and DBM process {}".format(self.router_proc.pid, self.dbm_proc.pid))
+        logger.info("Started the router process {} and DBM process {}".format(self.router_proc.pid, self.dbm_proc.pid))
 
         self.filesystem_proc = Process(target=filesystem_receiver,
                                        args=(self.logdir, self.resource_msgs, dfk_run_dir),
@@ -195,19 +189,19 @@ class MonitoringHub(RepresentationMixin):
                                        daemon=True
                                        )
         self.filesystem_proc.start()
-        self.logger.info(f"Started filesystem radio receiver process {self.filesystem_proc.pid}")
+        logger.info(f"Started filesystem radio receiver process {self.filesystem_proc.pid}")
 
         try:
             comm_q_result = comm_q.get(block=True, timeout=120)
         except queue.Empty:
-            self.logger.error("Hub has not completed initialization in 120s. Aborting")
+            logger.error("Hub has not completed initialization in 120s. Aborting")
             raise Exception("Hub failed to start")
 
         if isinstance(comm_q_result, str):
-            self.logger.error(f"MonitoringRouter sent an error message: {comm_q_result}")
+            logger.error(f"MonitoringRouter sent an error message: {comm_q_result}")
             raise RuntimeError(f"MonitoringRouter failed to start: {comm_q_result}")
 
-        udp_port, ic_port = comm_q_result
+        udp_port, zmq_port = comm_q_result
 
         self.monitoring_hub_url = "udp://{}:{}".format(self.hub_address, udp_port)
 
@@ -217,28 +211,28 @@ class MonitoringHub(RepresentationMixin):
         self._dfk_channel.setsockopt(zmq.LINGER, 0)
         self._dfk_channel.set_hwm(0)
         self._dfk_channel.setsockopt(zmq.SNDTIMEO, self.dfk_channel_timeout)
-        self._dfk_channel.connect("tcp://{}:{}".format(self.hub_address, ic_port))
+        self._dfk_channel.connect("tcp://{}:{}".format(self.hub_address, zmq_port))
 
-        self.logger.info("Monitoring Hub initialized")
+        logger.info("Monitoring Hub initialized")
 
-        return ic_port
+        return zmq_port
 
     # TODO: tighten the Any message format
     def send(self, mtype: MessageType, message: Any) -> None:
-        self.logger.debug("Sending message type {}".format(mtype))
+        logger.debug("Sending message type {}".format(mtype))
         try:
             self._dfk_channel.send_pyobj((mtype, message))
         except zmq.Again:
-            self.logger.exception(
+            logger.exception(
                 "The monitoring message sent from DFK to router timed-out after {}ms".format(self.dfk_channel_timeout))
 
     def close(self) -> None:
-        self.logger.info("Terminating Monitoring Hub")
+        logger.info("Terminating Monitoring Hub")
         exception_msgs = []
         while True:
             try:
                 exception_msgs.append(self.exception_q.get(block=False))
-                self.logger.error("There was a queued exception (Either router or DBM process got exception much earlier?)")
+                logger.error("There was a queued exception (Either router or DBM process got exception much earlier?)")
             except queue.Empty:
                 break
         if self._dfk_channel and self.monitoring_hub_active:
@@ -246,7 +240,7 @@ class MonitoringHub(RepresentationMixin):
             self._dfk_channel.close()
         if exception_msgs:
             for exception_msg in exception_msgs:
-                self.logger.error(
+                logger.error(
                     "{} process delivered an exception: {}. Terminating all monitoring processes immediately.".format(
                         exception_msg[0],
                         exception_msg[1]
@@ -255,41 +249,24 @@ class MonitoringHub(RepresentationMixin):
             self.router_proc.terminate()
             self.dbm_proc.terminate()
             self.filesystem_proc.terminate()
-        self.logger.info("Waiting for router to terminate")
+        logger.info("Waiting for router to terminate")
         self.router_proc.join()
-        self.logger.debug("Finished waiting for router termination")
+        logger.debug("Finished waiting for router termination")
         if len(exception_msgs) == 0:
-            self.logger.debug("Sending STOP to DBM")
+            logger.debug("Sending STOP to DBM")
             self.priority_msgs.put(("STOP", 0))
         else:
-            self.logger.debug("Not sending STOP to DBM, because there were DBM exceptions")
-        self.logger.debug("Waiting for DB termination")
+            logger.debug("Not sending STOP to DBM, because there were DBM exceptions")
+        logger.debug("Waiting for DB termination")
         self.dbm_proc.join()
-        self.logger.debug("Finished waiting for DBM termination")
+        logger.debug("Finished waiting for DBM termination")
 
         # should this be message based? it probably doesn't need to be if
         # we believe we've received all messages
-        self.logger.info("Terminating filesystem radio receiver process")
+        logger.info("Terminating filesystem radio receiver process")
         self.filesystem_proc.terminate()
         self.filesystem_proc.join()
 
-    @staticmethod
-    def monitor_wrapper(f: Any,
-                        args: Sequence,
-                        kwargs: Dict,
-                        try_id: int,
-                        task_id: int,
-                        monitoring_hub_url: str,
-                        run_id: str,
-                        logging_level: int,
-                        sleep_dur: float,
-                        radio_mode: str,
-                        monitor_resources: bool,
-                        run_dir: str) -> Tuple[Callable, Sequence, Dict]:
-        return parsl.monitoring.remote.monitor_wrapper(f, args, kwargs, try_id, task_id, monitoring_hub_url,
-                                                        run_id, logging_level, sleep_dur, radio_mode,
-                                                        monitor_resources, run_dir)
-
 
 @wrap_with_logs
 def filesystem_receiver(logdir: str, q: "queue.Queue[AddressedMonitoringMessage]", run_dir: str) -> None:
@@ -325,190 +302,3 @@ def filesystem_receiver(logdir: str, q: "queue.Queue[AddressedMonitoringMessage]
             logger.exception(f"Exception processing {filename} - probably will be retried next iteration")
 
         time.sleep(1)  # whats a good time for this poll?
-
-
-class MonitoringRouter:
-
-    def __init__(self,
-                 *,
-                 hub_address: str,
-                 hub_port: Optional[int] = None,
-                 hub_port_range: Tuple[int, int] = (55050, 56000),
-
-                 monitoring_hub_address: str = "127.0.0.1",
-                 logdir: str = ".",
-                 run_id: str,
-                 logging_level: int = logging.INFO,
-                 atexit_timeout: int = 3  # in seconds
-                 ):
-        """ Initializes a monitoring configuration class.
-
-        Parameters
-        ----------
-        hub_address : str
-             The ip address at which the workers will be able to reach the Hub.
-        hub_port : int
-             The specific port at which workers will be able to reach the Hub via UDP. Default: None
-        hub_port_range : tuple(int, int)
-             The MonitoringHub picks ports at random from the range which will be used by Hub.
-             This is overridden when the hub_port option is set. Default: (55050, 56000)
-        logdir : str
-             Parsl log directory paths. Logs and temp files go here. Default: '.'
-        logging_level : int
-             Logging level as defined in the logging module. Default: logging.INFO
-        atexit_timeout : float, optional
-            The amount of time in seconds to terminate the hub without receiving any messages, after the last dfk workflow message is received.
-
-        """
-        os.makedirs(logdir, exist_ok=True)
-        self.logger = set_file_logger("{}/monitoring_router.log".format(logdir),
-                                      name="monitoring_router",
-                                      level=logging_level)
-        self.logger.debug("Monitoring router starting")
-
-        self.hub_address = hub_address
-        self.atexit_timeout = atexit_timeout
-        self.run_id = run_id
-
-        self.loop_freq = 10.0  # milliseconds
-
-        # Initialize the UDP socket
-        self.sock = socket.socket(socket.AF_INET,
-                                  socket.SOCK_DGRAM,
-                                  socket.IPPROTO_UDP)
-
-        # We are trying to bind to all interfaces with 0.0.0.0
-        if not hub_port:
-            self.sock.bind(('0.0.0.0', 0))
-            self.hub_port = self.sock.getsockname()[1]
-        else:
-            self.hub_port = hub_port
-            try:
-                self.sock.bind(('0.0.0.0', self.hub_port))
-            except Exception as e:
-                raise RuntimeError(f"Could not bind to hub_port {hub_port} because: {e}")
-        self.sock.settimeout(self.loop_freq / 1000)
-        self.logger.info("Initialized the UDP socket on 0.0.0.0:{}".format(self.hub_port))
-
-        self._context = zmq.Context()
-        self.ic_channel = self._context.socket(zmq.DEALER)
-        self.ic_channel.setsockopt(zmq.LINGER, 0)
-        self.ic_channel.set_hwm(0)
-        self.ic_channel.RCVTIMEO = int(self.loop_freq)  # in milliseconds
-        self.logger.debug("hub_address: {}. hub_port_range {}".format(hub_address, hub_port_range))
-        self.ic_port = self.ic_channel.bind_to_random_port("tcp://*",
-                                                           min_port=hub_port_range[0],
-                                                           max_port=hub_port_range[1])
-
-    def start(self,
-              priority_msgs: "queue.Queue[AddressedMonitoringMessage]",
-              node_msgs: "queue.Queue[AddressedMonitoringMessage]",
-              block_msgs: "queue.Queue[AddressedMonitoringMessage]",
-              resource_msgs: "queue.Queue[AddressedMonitoringMessage]") -> None:
-        try:
-            router_keep_going = True
-            while router_keep_going:
-                try:
-                    data, addr = self.sock.recvfrom(2048)
-                    resource_msg = pickle.loads(data)
-                    self.logger.debug("Got UDP Message from {}: {}".format(addr, resource_msg))
-                    resource_msgs.put((resource_msg, addr))
-                except socket.timeout:
-                    pass
-
-                try:
-                    dfk_loop_start = time.time()
-                    while time.time() - dfk_loop_start < 1.0:  # TODO make configurable
-                        # note that nothing checks that msg really is of the annotated type
-                        msg: TaggedMonitoringMessage
-                        msg = self.ic_channel.recv_pyobj()
-
-                        assert isinstance(msg, tuple), "IC Channel expects only tuples, got {}".format(msg)
-                        assert len(msg) >= 1, "IC Channel expects tuples of length at least 1, got {}".format(msg)
-                        assert len(msg) == 2, "IC Channel expects message tuples of exactly length 2, got {}".format(msg)
-
-                        msg_0: AddressedMonitoringMessage
-                        msg_0 = (msg, 0)
-
-                        if msg[0] == MessageType.NODE_INFO:
-                            msg[1]['run_id'] = self.run_id
-                            node_msgs.put(msg_0)
-                        elif msg[0] == MessageType.RESOURCE_INFO:
-                            resource_msgs.put(msg_0)
-                        elif msg[0] == MessageType.BLOCK_INFO:
-                            block_msgs.put(msg_0)
-                        elif msg[0] == MessageType.TASK_INFO:
-                            priority_msgs.put(msg_0)
-                        elif msg[0] == MessageType.WORKFLOW_INFO:
-                            priority_msgs.put(msg_0)
-                            if 'exit_now' in msg[1] and msg[1]['exit_now']:
-                                router_keep_going = False
-                        else:
-                            # There is a type: ignore here because if msg[0]
-                            # is of the correct type, this code is unreachable,
-                            # but there is no verification that the message
-                            # received from ic_channel.recv_pyobj() is actually
-                            # of that type.
-                            self.logger.error("Discarding message "  # type: ignore[unreachable]
-                                              f"from interchange with unknown type {msg[0].value}")
-                except zmq.Again:
-                    pass
-                except Exception:
-                    # This will catch malformed messages. What happens if the
-                    # channel is broken in such a way that it always raises
-                    # an exception? Looping on this would maybe be the wrong
-                    # thing to do.
-                    self.logger.warning("Failure processing a ZMQ message", exc_info=True)
-
-            self.logger.info("Monitoring router draining")
-            last_msg_received_time = time.time()
-            while time.time() - last_msg_received_time < self.atexit_timeout:
-                try:
-                    data, addr = self.sock.recvfrom(2048)
-                    msg = pickle.loads(data)
-                    self.logger.debug("Got UDP Message from {}: {}".format(addr, msg))
-                    resource_msgs.put((msg, addr))
-                    last_msg_received_time = time.time()
-                except socket.timeout:
-                    pass
-
-            self.logger.info("Monitoring router finishing normally")
-        finally:
-            self.logger.info("Monitoring router finished")
-
-
-@wrap_with_logs
-def router_starter(comm_q: "queue.Queue[Union[Tuple[int, int], str]]",
-                   exception_q: "queue.Queue[Tuple[str, str]]",
-                   priority_msgs: "queue.Queue[AddressedMonitoringMessage]",
-                   node_msgs: "queue.Queue[AddressedMonitoringMessage]",
-                   block_msgs: "queue.Queue[AddressedMonitoringMessage]",
-                   resource_msgs: "queue.Queue[AddressedMonitoringMessage]",
-
-                   hub_address: str,
-                   hub_port: Optional[int],
-                   hub_port_range: Tuple[int, int],
-
-                   logdir: str,
-                   logging_level: int,
-                   run_id: str) -> None:
-    setproctitle("parsl: monitoring router")
-    try:
-        router = MonitoringRouter(hub_address=hub_address,
-                                  hub_port=hub_port,
-                                  hub_port_range=hub_port_range,
-                                  logdir=logdir,
-                                  logging_level=logging_level,
-                                  run_id=run_id)
-    except Exception as e:
-        logger.error("MonitoringRouter construction failed.", exc_info=True)
-        comm_q.put(f"Monitoring router construction failed: {e}")
-    else:
-        comm_q.put((router.hub_port, router.ic_port))
-
-    router.logger.info("Starting MonitoringRouter in router_starter")
-    try:
-        router.start(priority_msgs, node_msgs, block_msgs, resource_msgs)
-    except Exception as e:
-        router.logger.exception("router.start exception")
-        exception_q.put(('Hub', str(e)))
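
Note (added commentary, not part of the upstream diff): the MonitoringRouter class and router_starter deleted above now live in the new parsl/monitoring/router.py (file 12 in the list, +208 lines), and MonitoringHub logs through the module-level logger instead of a self.logger attribute. router_starter's keyword arguments change from hub_port/hub_port_range to udp_port/zmq_port_range, but the user-facing MonitoringHub options keep their old names. A hedged sketch of a monitoring-enabled configuration; the address and port values below are illustrative only:

import parsl
from parsl.config import Config
from parsl.executors import HighThroughputExecutor
from parsl.monitoring.monitoring import MonitoringHub

config = Config(
    executors=[HighThroughputExecutor(label="htex_local")],  # illustrative executor
    monitoring=MonitoringHub(
        hub_address="127.0.0.1",        # address workers use to reach the hub
        hub_port=55055,                 # UDP port; forwarded to the router as udp_port
        hub_port_range=(55050, 56000),  # ZMQ port range; forwarded as zmq_port_range
    ),
)

parsl.load(config)
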