parsl 2025.3.3__py3-none-any.whl → 2025.3.17__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -3,23 +3,22 @@ from __future__ import annotations
  import logging
  import multiprocessing.synchronize as ms
  import os
- import pickle
  import queue
- import time
  from multiprocessing import Event
+ from multiprocessing.context import ForkProcess as ForkProcessType
  from multiprocessing.queues import Queue
- from typing import TYPE_CHECKING, Literal, Optional, Tuple, Union, cast
+ from typing import TYPE_CHECKING, Optional, Tuple, Union

  import typeguard

- from parsl.log_utils import set_file_logger
  from parsl.monitoring.errors import MonitoringHubStartError
+ from parsl.monitoring.radios.filesystem_router import filesystem_router_starter
  from parsl.monitoring.radios.multiprocessing import MultiprocessingQueueRadioSender
- from parsl.monitoring.router import router_starter
+ from parsl.monitoring.radios.udp_router import udp_router_starter
+ from parsl.monitoring.radios.zmq_router import zmq_router_starter
  from parsl.monitoring.types import TaggedMonitoringMessage
  from parsl.multiprocessing import ForkProcess, SizedQueue
- from parsl.process_loggers import wrap_with_logs
- from parsl.utils import RepresentationMixin, setproctitle
+ from parsl.utils import RepresentationMixin

  _db_manager_excepts: Optional[Exception]

@@ -121,51 +120,68 @@ class MonitoringHub(RepresentationMixin):
  # in the future, Queue will allow runtime subscripts.

  if TYPE_CHECKING:
- comm_q: Queue[Union[Tuple[int, int], str]]
+ zmq_comm_q: Queue[Union[int, str]]
+ udp_comm_q: Queue[Union[int, str]]
  else:
- comm_q: Queue
+ zmq_comm_q: Queue
+ udp_comm_q: Queue

- comm_q = SizedQueue(maxsize=10)
+ zmq_comm_q = SizedQueue(maxsize=10)
+ udp_comm_q = SizedQueue(maxsize=10)

- self.exception_q: Queue[Tuple[str, str]]
- self.exception_q = SizedQueue(maxsize=10)
-
- self.resource_msgs: Queue[Union[TaggedMonitoringMessage, Literal["STOP"]]]
+ self.resource_msgs: Queue[TaggedMonitoringMessage]
  self.resource_msgs = SizedQueue()

  self.router_exit_event: ms.Event
  self.router_exit_event = Event()

- self.router_proc = ForkProcess(target=router_starter,
- kwargs={"comm_q": comm_q,
- "exception_q": self.exception_q,
- "resource_msgs": self.resource_msgs,
- "exit_event": self.router_exit_event,
- "hub_address": self.hub_address,
- "udp_port": self.hub_port,
- "zmq_port_range": self.hub_port_range,
- "run_dir": dfk_run_dir,
- "logging_level": logging.DEBUG if self.monitoring_debug else logging.INFO,
- },
- name="Monitoring-Router-Process",
- daemon=True,
- )
- self.router_proc.start()
+ self.zmq_router_proc = ForkProcess(target=zmq_router_starter,
+ kwargs={"comm_q": zmq_comm_q,
+ "resource_msgs": self.resource_msgs,
+ "exit_event": self.router_exit_event,
+ "hub_address": self.hub_address,
+ "zmq_port_range": self.hub_port_range,
+ "run_dir": dfk_run_dir,
+ "logging_level": logging.DEBUG if self.monitoring_debug else logging.INFO,
+ },
+ name="Monitoring-ZMQ-Router-Process",
+ daemon=True,
+ )
+ self.zmq_router_proc.start()
+
+ self.udp_router_proc = ForkProcess(target=udp_router_starter,
+ kwargs={"comm_q": udp_comm_q,
+ "resource_msgs": self.resource_msgs,
+ "exit_event": self.router_exit_event,
+ "hub_address": self.hub_address,
+ "udp_port": self.hub_port,
+ "run_dir": dfk_run_dir,
+ "logging_level": logging.DEBUG if self.monitoring_debug else logging.INFO,
+ },
+ name="Monitoring-UDP-Router-Process",
+ daemon=True,
+ )
+ self.udp_router_proc.start()
+
+ self.dbm_exit_event: ms.Event
+ self.dbm_exit_event = Event()

  self.dbm_proc = ForkProcess(target=dbm_starter,
- args=(self.exception_q, self.resource_msgs,),
+ args=(self.resource_msgs,),
  kwargs={"run_dir": dfk_run_dir,
  "logging_level": logging.DEBUG if self.monitoring_debug else logging.INFO,
  "db_url": self.logging_endpoint,
+ "exit_event": self.dbm_exit_event,
  },
  name="Monitoring-DBM-Process",
  daemon=True,
  )
  self.dbm_proc.start()
- logger.info("Started the router process %s and DBM process %s", self.router_proc.pid, self.dbm_proc.pid)
+ logger.info("Started ZMQ router process %s, UDP router process %s and DBM process %s",
+ self.zmq_router_proc.pid, self.udp_router_proc.pid, self.dbm_proc.pid)

- self.filesystem_proc = ForkProcess(target=filesystem_receiver,
- args=(self.resource_msgs, dfk_run_dir),
+ self.filesystem_proc = ForkProcess(target=filesystem_router_starter,
+ args=(self.resource_msgs, dfk_run_dir, self.router_exit_event),
  name="Monitoring-Filesystem-Process",
  daemon=True
  )
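
The hub now forks one process per radio receiver (ZMQ, UDP, filesystem) instead of a single combined router, and each router process reports either its bound port (an int) or an error string back to the parent through its own SizedQueue before the hub continues. A minimal sketch of that start-up handshake pattern follows; the names (toy_router_starter, the hard-coded port) are invented purely for illustration and are not part of the diff.

    # Sketch: a child process reports a port or an error string on a queue;
    # the parent blocks on that queue with a timeout before proceeding.
    from multiprocessing import Process, Queue

    def toy_router_starter(comm_q: Queue) -> None:
        try:
            port = 55055  # stand-in for binding a real socket and learning its port
        except Exception as e:
            comm_q.put(f"router construction failed: {e}")
        else:
            comm_q.put(port)
            # ... the real router would now enter its receive loop ...

    if __name__ == "__main__":
        comm_q: Queue = Queue(maxsize=1)
        proc = Process(target=toy_router_starter, args=(comm_q,), daemon=True)
        proc.start()
        result = comm_q.get(timeout=120)  # parent waits for the port report
        if isinstance(result, str):
            raise RuntimeError(result)
        print("router listening on port", result)
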
@@ -175,114 +191,98 @@ class MonitoringHub(RepresentationMixin):
  self.radio = MultiprocessingQueueRadioSender(self.resource_msgs)

  try:
- comm_q_result = comm_q.get(block=True, timeout=120)
- comm_q.close()
- comm_q.join_thread()
+ zmq_comm_q_result = zmq_comm_q.get(block=True, timeout=120)
+ zmq_comm_q.close()
+ zmq_comm_q.join_thread()
  except queue.Empty:
- logger.error("Hub has not completed initialization in 120s. Aborting")
+ logger.error("Monitoring ZMQ Router has not reported port in 120s. Aborting")
  raise MonitoringHubStartError()

- if isinstance(comm_q_result, str):
- logger.error("MonitoringRouter sent an error message: %s", comm_q_result)
- raise RuntimeError(f"MonitoringRouter failed to start: {comm_q_result}")
+ if isinstance(zmq_comm_q_result, str):
+ logger.error("MonitoringRouter sent an error message: %s", zmq_comm_q_result)
+ raise RuntimeError(f"MonitoringRouter failed to start: {zmq_comm_q_result}")

- udp_port, zmq_port = comm_q_result
+ self.hub_zmq_port = zmq_comm_q_result

+ try:
+ udp_comm_q_result = udp_comm_q.get(block=True, timeout=120)
+ udp_comm_q.close()
+ udp_comm_q.join_thread()
+ except queue.Empty:
+ logger.error("Monitoring UDP router has not reported port in 120s. Aborting")
+ raise MonitoringHubStartError()
+
+ if isinstance(udp_comm_q_result, str):
+ logger.error("MonitoringRouter sent an error message: %s", udp_comm_q_result)
+ raise RuntimeError(f"MonitoringRouter failed to start: {udp_comm_q_result}")
+
+ udp_port = udp_comm_q_result
  self.monitoring_hub_url = "udp://{}:{}".format(self.hub_address, udp_port)

  logger.info("Monitoring Hub initialized")

- self.hub_zmq_port = zmq_port
-
  def send(self, message: TaggedMonitoringMessage) -> None:
  logger.debug("Sending message type %s", message[0])
  self.radio.send(message)

  def close(self) -> None:
  logger.info("Terminating Monitoring Hub")
- exception_msgs = []
- while True:
- try:
- exception_msgs.append(self.exception_q.get(block=False))
- logger.error("There was a queued exception (Either router or DBM process got exception much earlier?)")
- except queue.Empty:
- break
  if self.monitoring_hub_active:
  self.monitoring_hub_active = False
- if exception_msgs:
- for exception_msg in exception_msgs:
- logger.error(
- "%s process delivered an exception: %s. Terminating all monitoring processes immediately.",
- exception_msg[0],
- exception_msg[1]
- )
- self.router_proc.terminate()
- self.dbm_proc.terminate()
- self.filesystem_proc.terminate()
  logger.info("Setting router termination event")
  self.router_exit_event.set()
- logger.info("Waiting for router to terminate")
- self.router_proc.join()
- self.router_proc.close()
+
+ logger.info("Waiting for ZMQ router to terminate")
+ join_terminate_close_proc(self.zmq_router_proc)
+
+ logger.info("Waiting for UDP router to terminate")
+ join_terminate_close_proc(self.udp_router_proc)
+
  logger.debug("Finished waiting for router termination")
- if len(exception_msgs) == 0:
- logger.debug("Sending STOP to DBM")
- self.resource_msgs.put("STOP")
- else:
- logger.debug("Not sending STOP to DBM, because there were DBM exceptions")
  logger.debug("Waiting for DB termination")
- self.dbm_proc.join()
- self.dbm_proc.close()
+ self.dbm_exit_event.set()
+ join_terminate_close_proc(self.dbm_proc)
  logger.debug("Finished waiting for DBM termination")

- # should this be message based? it probably doesn't need to be if
- # we believe we've received all messages
  logger.info("Terminating filesystem radio receiver process")
- self.filesystem_proc.terminate()
- self.filesystem_proc.join()
- self.filesystem_proc.close()
+ join_terminate_close_proc(self.filesystem_proc)

  logger.info("Closing monitoring multiprocessing queues")
- self.exception_q.close()
- self.exception_q.join_thread()
  self.resource_msgs.close()
  self.resource_msgs.join_thread()
  logger.info("Closed monitoring multiprocessing queues")


- @wrap_with_logs
- def filesystem_receiver(q: Queue[TaggedMonitoringMessage], run_dir: str) -> None:
- logger = set_file_logger(f"{run_dir}/monitoring_filesystem_radio.log",
- name="monitoring_filesystem_radio",
- level=logging.INFO)
-
- logger.info("Starting filesystem radio receiver")
- setproctitle("parsl: monitoring filesystem receiver")
- base_path = f"{run_dir}/monitor-fs-radio/"
- tmp_dir = f"{base_path}/tmp/"
- new_dir = f"{base_path}/new/"
- logger.debug("Creating new and tmp paths under %s", base_path)
-
- target_radio = MultiprocessingQueueRadioSender(q)
-
- os.makedirs(tmp_dir, exist_ok=True)
- os.makedirs(new_dir, exist_ok=True)
-
- while True: # this loop will end on process termination
- logger.debug("Start filesystem radio receiver loop")
-
- # iterate over files in new_dir
- for filename in os.listdir(new_dir):
- try:
- logger.info("Processing filesystem radio file %s", filename)
- full_path_filename = f"{new_dir}/{filename}"
- with open(full_path_filename, "rb") as f:
- message = pickle.load(f)
- logger.debug("Message received is: %s", message)
- assert isinstance(message, tuple)
- target_radio.send(cast(TaggedMonitoringMessage, message))
- os.remove(full_path_filename)
- except Exception:
- logger.exception("Exception processing %s - probably will be retried next iteration", filename)
-
- time.sleep(1) # whats a good time for this poll?
+ def join_terminate_close_proc(process: ForkProcessType, *, timeout: int = 30) -> None:
+ """Increasingly aggressively terminate a process.
+
+ This function assumes that the process is likely to exit before
+ the join timeout, driven by some other means, such as the
+ MonitoringHub router_exit_event. If the process does not exit, then
+ first terminate() and then kill() will be used to end the process.
+
+ In the case of a very mis-behaving process, this function might take
+ up to 3*timeout to exhaust all termination methods and return.
+ """
+ logger.debug("Joining process")
+ process.join(timeout)
+
+ # run a sequence of increasingly aggressive steps to shut down the process.
+ if process.is_alive():
+ logger.error("Process did not join. Terminating.")
+ process.terminate()
+ process.join(timeout)
+ if process.is_alive():
+ logger.error("Process did not join after terminate. Killing.")
+ process.kill()
+ process.join(timeout)
+ # This kill should not be caught by any signal handlers so it is
+ # unlikely that this join will timeout. If it does, there isn't
+ # anything further to do except log an error in the next if-block.
+
+ if process.is_alive():
+ logger.error("Process failed to end")
+ # don't call close if the process hasn't ended:
+ # process.close() doesn't work on a running process.
+ else:
+ process.close()
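
Shutdown in close() is now cooperative first and forceful only as a fallback: set the exit event that each child polls, then let join_terminate_close_proc() escalate from join() to terminate() to kill(). A hedged sketch of the same pattern applied to a plain multiprocessing.Process, with an invented worker purely for illustration:

    # Sketch: cooperative shutdown via an Event, escalating only if the
    # child does not exit within the join timeout.
    import time
    from multiprocessing import Event, Process

    def toy_worker(exit_event) -> None:
        while not exit_event.is_set():  # child polls the event, like the routers
            time.sleep(0.1)

    if __name__ == "__main__":
        exit_event = Event()
        proc = Process(target=toy_worker, args=(exit_event,), daemon=True)
        proc.start()
        exit_event.set()        # ask the child to stop...
        proc.join(30)           # ...and give it time to comply
        if proc.is_alive():
            proc.terminate()    # then SIGTERM
            proc.join(30)
        if proc.is_alive():
            proc.kill()         # then SIGKILL as a last resort
            proc.join(30)
        if not proc.is_alive():
            proc.close()        # close() is only valid once the process has exited
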
@@ -0,0 +1,54 @@
+ from __future__ import annotations
+
+ import logging
+ import os
+ import pickle
+ import time
+ from multiprocessing.queues import Queue
+ from multiprocessing.synchronize import Event
+ from typing import cast
+
+ from parsl.log_utils import set_file_logger
+ from parsl.monitoring.radios.multiprocessing import MultiprocessingQueueRadioSender
+ from parsl.monitoring.types import TaggedMonitoringMessage
+ from parsl.process_loggers import wrap_with_logs
+ from parsl.utils import setproctitle
+
+
+ @wrap_with_logs
+ def filesystem_router_starter(q: Queue[TaggedMonitoringMessage], run_dir: str, exit_event: Event) -> None:
+ logger = set_file_logger(f"{run_dir}/monitoring_filesystem_radio.log",
+ name="monitoring_filesystem_radio",
+ level=logging.INFO)
+
+ logger.info("Starting filesystem radio receiver")
+ setproctitle("parsl: monitoring filesystem receiver")
+ base_path = f"{run_dir}/monitor-fs-radio/"
+ tmp_dir = f"{base_path}/tmp/"
+ new_dir = f"{base_path}/new/"
+ logger.debug("Creating new and tmp paths under %s", base_path)
+
+ target_radio = MultiprocessingQueueRadioSender(q)
+
+ os.makedirs(tmp_dir, exist_ok=True)
+ os.makedirs(new_dir, exist_ok=True)
+
+ while not exit_event.is_set():
+ logger.debug("Start filesystem radio receiver loop")
+
+ # iterate over files in new_dir
+ for filename in os.listdir(new_dir):
+ try:
+ logger.info("Processing filesystem radio file %s", filename)
+ full_path_filename = f"{new_dir}/{filename}"
+ with open(full_path_filename, "rb") as f:
+ message = pickle.load(f)
+ logger.debug("Message received is: %s", message)
+ assert isinstance(message, tuple)
+ target_radio.send(cast(TaggedMonitoringMessage, message))
+ os.remove(full_path_filename)
+ except Exception:
+ logger.exception("Exception processing %s - probably will be retried next iteration", filename)
+
+ time.sleep(1) # whats a good time for this poll?
+ logger.info("Ending filesystem radio receiver")
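
The filesystem receiver above polls new/ for pickled message files and forwards them onto the multiprocessing queue; the tmp/ directory it also creates suggests a write-then-rename protocol on the sending side, so readers never see a half-written file. The sender itself lives elsewhere in parsl and is not part of this diff; the following is only a hypothetical sketch of that protocol, with an invented helper name:

    # Hypothetical sender sketch: pickle the message under tmp/, then rename
    # it into new/ so the receiver loop above picks it up atomically.
    import os
    import pickle
    import uuid

    def send_filesystem_message(run_dir: str, message: tuple) -> None:
        base_path = f"{run_dir}/monitor-fs-radio/"
        filename = str(uuid.uuid4())
        tmp_path = f"{base_path}/tmp/{filename}"
        new_path = f"{base_path}/new/{filename}"
        with open(tmp_path, "wb") as f:
            pickle.dump(message, f)
        os.rename(tmp_path, new_path)  # atomic on the same filesystem
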
@@ -5,17 +5,14 @@ import multiprocessing.queues as mpq
  import os
  import pickle
  import socket
- import threading
  import time
  from multiprocessing.synchronize import Event
- from typing import Optional, Tuple
+ from typing import Optional

  import typeguard
- import zmq

  from parsl.log_utils import set_file_logger
  from parsl.monitoring.radios.multiprocessing import MultiprocessingQueueRadioSender
- from parsl.monitoring.types import TaggedMonitoringMessage
  from parsl.process_loggers import wrap_with_logs
  from parsl.utils import setproctitle

@@ -28,7 +25,6 @@ class MonitoringRouter:
  *,
  hub_address: str,
  udp_port: Optional[int] = None,
- zmq_port_range: Tuple[int, int] = (55050, 56000),

  monitoring_hub_address: str = "127.0.0.1",
  run_dir: str = ".",
@@ -45,9 +41,6 @@
  The ip address at which the workers will be able to reach the Hub.
  udp_port : int
  The specific port at which workers will be able to reach the Hub via UDP. Default: None
- zmq_port_range : tuple(int, int)
- The MonitoringHub picks ports at random from the range which will be used by Hub.
- Default: (55050, 56000)
  run_dir : str
  Parsl log directory paths. Logs and temp files go here. Default: '.'
  logging_level : int
@@ -60,7 +53,7 @@
  An event that the main Parsl process will set to signal that the monitoring router should shut down.
  """
  os.makedirs(run_dir, exist_ok=True)
- self.logger = set_file_logger(f"{run_dir}/monitoring_router.log",
+ self.logger = set_file_logger(f"{run_dir}/monitoring_udp_router.log",
  name="monitoring_router",
  level=logging_level)
  self.logger.debug("Monitoring router starting")
@@ -88,37 +81,12 @@
  self.udp_sock.settimeout(self.loop_freq / 1000)
  self.logger.info("Initialized the UDP socket on 0.0.0.0:{}".format(self.udp_port))

- self._context = zmq.Context()
- self.zmq_receiver_channel = self._context.socket(zmq.DEALER)
- self.zmq_receiver_channel.setsockopt(zmq.LINGER, 0)
- self.zmq_receiver_channel.set_hwm(0)
- self.zmq_receiver_channel.RCVTIMEO = int(self.loop_freq) # in milliseconds
- self.logger.debug("hub_address: {}. zmq_port_range {}".format(hub_address, zmq_port_range))
- self.zmq_receiver_port = self.zmq_receiver_channel.bind_to_random_port("tcp://*",
- min_port=zmq_port_range[0],
- max_port=zmq_port_range[1])
-
  self.target_radio = MultiprocessingQueueRadioSender(resource_msgs)
  self.exit_event = exit_event

  @wrap_with_logs(target="monitoring_router")
  def start(self) -> None:
- self.logger.info("Starting UDP listener thread")
- udp_radio_receiver_thread = threading.Thread(target=self.start_udp_listener, daemon=True)
- udp_radio_receiver_thread.start()
-
- self.logger.info("Starting ZMQ listener thread")
- zmq_radio_receiver_thread = threading.Thread(target=self.start_zmq_listener, daemon=True)
- zmq_radio_receiver_thread.start()
-
- self.logger.info("Joining on ZMQ listener thread")
- zmq_radio_receiver_thread.join()
- self.logger.info("Joining on UDP listener thread")
- udp_radio_receiver_thread.join()
- self.logger.info("Joined on both ZMQ and UDP listener threads")
-
- @wrap_with_logs(target="monitoring_router")
- def start_udp_listener(self) -> None:
+ self.logger.info("Starting UDP listener")
  try:
  while not self.exit_event.is_set():
  try:
@@ -145,55 +113,23 @@
  finally:
  self.logger.info("UDP listener finished")

- @wrap_with_logs(target="monitoring_router")
- def start_zmq_listener(self) -> None:
- try:
- while not self.exit_event.is_set():
- try:
- dfk_loop_start = time.time()
- while time.time() - dfk_loop_start < 1.0: # TODO make configurable
- # note that nothing checks that msg really is of the annotated type
- msg: TaggedMonitoringMessage
- msg = self.zmq_receiver_channel.recv_pyobj()
-
- assert isinstance(msg, tuple), "ZMQ Receiver expects only tuples, got {}".format(msg)
- assert len(msg) >= 1, "ZMQ Receiver expects tuples of length at least 1, got {}".format(msg)
- assert len(msg) == 2, "ZMQ Receiver expects message tuples of exactly length 2, got {}".format(msg)
-
- self.target_radio.send(msg)
- except zmq.Again:
- pass
- except Exception:
- # This will catch malformed messages. What happens if the
- # channel is broken in such a way that it always raises
- # an exception? Looping on this would maybe be the wrong
- # thing to do.
- self.logger.warning("Failure processing a ZMQ message", exc_info=True)
-
- self.logger.info("ZMQ listener finishing normally")
- finally:
- self.logger.info("ZMQ listener finished")
-

  @wrap_with_logs
  @typeguard.typechecked
- def router_starter(*,
- comm_q: mpq.Queue,
- exception_q: mpq.Queue,
- resource_msgs: mpq.Queue,
- exit_event: Event,
-
- hub_address: str,
- udp_port: Optional[int],
- zmq_port_range: Tuple[int, int],
-
- run_dir: str,
- logging_level: int) -> None:
- setproctitle("parsl: monitoring router")
+ def udp_router_starter(*,
+ comm_q: mpq.Queue,
+ resource_msgs: mpq.Queue,
+ exit_event: Event,
+
+ hub_address: str,
+ udp_port: Optional[int],
+
+ run_dir: str,
+ logging_level: int) -> None:
+ setproctitle("parsl: monitoring UDP router")
  try:
  router = MonitoringRouter(hub_address=hub_address,
  udp_port=udp_port,
- zmq_port_range=zmq_port_range,
  run_dir=run_dir,
  logging_level=logging_level,
  resource_msgs=resource_msgs,
@@ -202,11 +138,10 @@ def router_starter(*,
  logger.error("MonitoringRouter construction failed.", exc_info=True)
  comm_q.put(f"Monitoring router construction failed: {e}")
  else:
- comm_q.put((router.udp_port, router.zmq_receiver_port))
+ comm_q.put(router.udp_port)

  router.logger.info("Starting MonitoringRouter in router_starter")
  try:
  router.start()
- except Exception as e:
- router.logger.exception("router.start exception")
- exception_q.put(('Hub', str(e)))
+ except Exception:
+ router.logger.exception("UDP router start exception")
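
With the ZMQ path split out, this router is now UDP-only: it loops on the exit event, reading datagrams from a plain socket and forwarding them onto the resource_msgs queue. For orientation only, here is a hypothetical sender sketch; the real worker-side UDP radio lives elsewhere in parsl, and the assumption that the router unpickles each datagram is not shown in this hunk:

    # Hypothetical sender sketch: pickle a (tag, payload) tuple and send it
    # as a single UDP datagram to the port the router reported via comm_q.
    import pickle
    import socket

    def send_udp_message(hub_address: str, udp_port: int, message: tuple) -> None:
        payload = pickle.dumps(message)  # assumes the router unpickles datagrams
        sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
        try:
            sock.sendto(payload, (hub_address, udp_port))
        finally:
            sock.close()
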
@@ -0,0 +1,131 @@
+ from __future__ import annotations
+
+ import logging
+ import multiprocessing.queues as mpq
+ import os
+ import time
+ from multiprocessing.synchronize import Event
+ from typing import Tuple
+
+ import typeguard
+ import zmq
+
+ from parsl.log_utils import set_file_logger
+ from parsl.monitoring.radios.multiprocessing import MultiprocessingQueueRadioSender
+ from parsl.monitoring.types import TaggedMonitoringMessage
+ from parsl.process_loggers import wrap_with_logs
+ from parsl.utils import setproctitle
+
+ logger = logging.getLogger(__name__)
+
+
+ class MonitoringRouter:
+
+ def __init__(self,
+ *,
+ hub_address: str,
+ zmq_port_range: Tuple[int, int] = (55050, 56000),
+
+ run_dir: str = ".",
+ logging_level: int = logging.INFO,
+ resource_msgs: mpq.Queue,
+ exit_event: Event,
+ ):
+ """ Initializes a monitoring configuration class.
+
+ Parameters
+ ----------
+ hub_address : str
+ The ip address at which the workers will be able to reach the Hub.
+ zmq_port_range : tuple(int, int)
+ The MonitoringHub picks ports at random from the range which will be used by Hub.
+ Default: (55050, 56000)
+ run_dir : str
+ Parsl log directory paths. Logs and temp files go here. Default: '.'
+ logging_level : int
+ Logging level as defined in the logging module. Default: logging.INFO
+ resource_msgs : multiprocessing.Queue
+ A multiprocessing queue to receive messages to be routed onwards to the database process
+ exit_event : Event
+ An event that the main Parsl process will set to signal that the monitoring router should shut down.
+ """
+ os.makedirs(run_dir, exist_ok=True)
+ self.logger = set_file_logger(f"{run_dir}/monitoring_zmq_router.log",
+ name="monitoring_router",
+ level=logging_level)
+ self.logger.debug("Monitoring router starting")
+
+ self.hub_address = hub_address
+
+ self.loop_freq = 10.0 # milliseconds
+
+ self._context = zmq.Context()
+ self.zmq_receiver_channel = self._context.socket(zmq.DEALER)
+ self.zmq_receiver_channel.setsockopt(zmq.LINGER, 0)
+ self.zmq_receiver_channel.set_hwm(0)
+ self.zmq_receiver_channel.RCVTIMEO = int(self.loop_freq) # in milliseconds
+ self.logger.debug("hub_address: {}. zmq_port_range {}".format(hub_address, zmq_port_range))
+ self.zmq_receiver_port = self.zmq_receiver_channel.bind_to_random_port("tcp://*",
+ min_port=zmq_port_range[0],
+ max_port=zmq_port_range[1])
+
+ self.target_radio = MultiprocessingQueueRadioSender(resource_msgs)
+ self.exit_event = exit_event
+
+ @wrap_with_logs(target="monitoring_router")
+ def start(self) -> None:
+ self.logger.info("Starting ZMQ listener")
+ try:
+ while not self.exit_event.is_set():
+ try:
+ dfk_loop_start = time.time()
+ while time.time() - dfk_loop_start < 1.0: # TODO make configurable
+ # note that nothing checks that msg really is of the annotated type
+ msg: TaggedMonitoringMessage
+ msg = self.zmq_receiver_channel.recv_pyobj()
+
+ assert isinstance(msg, tuple), "ZMQ Receiver expects only tuples, got {}".format(msg)
+ assert len(msg) >= 1, "ZMQ Receiver expects tuples of length at least 1, got {}".format(msg)
+ assert len(msg) == 2, "ZMQ Receiver expects message tuples of exactly length 2, got {}".format(msg)
+
+ self.target_radio.send(msg)
+ except zmq.Again:
+ pass
+ except Exception:
+ # This will catch malformed messages. What happens if the
+ # channel is broken in such a way that it always raises
+ # an exception? Looping on this would maybe be the wrong
+ # thing to do.
+ self.logger.warning("Failure processing a ZMQ message", exc_info=True)
+
+ self.logger.info("ZMQ listener finishing normally")
+ finally:
+ self.logger.info("ZMQ listener finished")
+
+
+ @wrap_with_logs
+ @typeguard.typechecked
+ def zmq_router_starter(*,
+ comm_q: mpq.Queue,
+ resource_msgs: mpq.Queue,
+ exit_event: Event,
+
+ hub_address: str,
+ zmq_port_range: Tuple[int, int],
+
+ run_dir: str,
+ logging_level: int) -> None:
+ setproctitle("parsl: monitoring zmq router")
+ try:
+ router = MonitoringRouter(hub_address=hub_address,
+ zmq_port_range=zmq_port_range,
+ run_dir=run_dir,
+ logging_level=logging_level,
+ resource_msgs=resource_msgs,
+ exit_event=exit_event)
+ except Exception as e:
+ logger.error("MonitoringRouter construction failed.", exc_info=True)
+ comm_q.put(f"Monitoring router construction failed: {e}")
+ else:
+ comm_q.put(router.zmq_receiver_port)
+ router.start()
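
The new ZMQ router binds a DEALER socket to a random port in zmq_port_range, reports that port through comm_q, and then loops on recv_pyobj(), asserting that each message is a 2-tuple before forwarding it to the resource_msgs queue. A matching sender therefore only needs to connect and send a picklable 2-tuple; the sketch below is illustrative only (the real worker-side ZMQ radio is elsewhere in parsl), with an invented helper name:

    # Hypothetical sender sketch: connect a DEALER socket to the router's
    # reported port and send a (tag, payload) tuple with send_pyobj().
    import zmq

    def send_zmq_message(hub_address: str, zmq_port: int, message: tuple) -> None:
        context = zmq.Context()
        sock = context.socket(zmq.DEALER)
        sock.connect(f"tcp://{hub_address}:{zmq_port}")
        sock.send_pyobj(message)  # message should be a 2-tuple, per the asserts above
        sock.close()
        context.term()
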