parsl 2025.3.3__py3-none-any.whl → 2025.3.17__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- parsl/executors/high_throughput/executor.py +1 -1
- parsl/executors/high_throughput/mpi_resource_management.py +15 -4
- parsl/executors/high_throughput/process_worker_pool.py +89 -82
- parsl/monitoring/db_manager.py +16 -10
- parsl/monitoring/monitoring.py +113 -113
- parsl/monitoring/radios/filesystem_router.py +54 -0
- parsl/monitoring/{router.py → radios/udp_router.py} +17 -82
- parsl/monitoring/radios/zmq_router.py +131 -0
- parsl/tests/test_monitoring/test_exit_helper.py +55 -0
- parsl/tests/test_mpi_apps/test_mpi_scheduler.py +25 -0
- parsl/tests/test_scaling/test_shutdown_scalein.py +4 -1
- parsl/tests/test_shutdown/test_kill_monitoring.py +1 -1
- parsl/version.py +1 -1
- {parsl-2025.3.3.data → parsl-2025.3.17.data}/scripts/process_worker_pool.py +89 -82
- {parsl-2025.3.3.dist-info → parsl-2025.3.17.dist-info}/METADATA +4 -4
- {parsl-2025.3.3.dist-info → parsl-2025.3.17.dist-info}/RECORD +23 -20
- {parsl-2025.3.3.data → parsl-2025.3.17.data}/scripts/exec_parsl_function.py +0 -0
- {parsl-2025.3.3.data → parsl-2025.3.17.data}/scripts/interchange.py +0 -0
- {parsl-2025.3.3.data → parsl-2025.3.17.data}/scripts/parsl_coprocess.py +0 -0
- {parsl-2025.3.3.dist-info → parsl-2025.3.17.dist-info}/LICENSE +0 -0
- {parsl-2025.3.3.dist-info → parsl-2025.3.17.dist-info}/WHEEL +0 -0
- {parsl-2025.3.3.dist-info → parsl-2025.3.17.dist-info}/entry_points.txt +0 -0
- {parsl-2025.3.3.dist-info → parsl-2025.3.17.dist-info}/top_level.txt +0 -0
parsl/monitoring/monitoring.py
CHANGED
@@ -3,23 +3,22 @@ from __future__ import annotations
|
|
3
3
|
import logging
|
4
4
|
import multiprocessing.synchronize as ms
|
5
5
|
import os
|
6
|
-
import pickle
|
7
6
|
import queue
|
8
|
-
import time
|
9
7
|
from multiprocessing import Event
|
8
|
+
from multiprocessing.context import ForkProcess as ForkProcessType
|
10
9
|
from multiprocessing.queues import Queue
|
11
|
-
from typing import TYPE_CHECKING,
|
10
|
+
from typing import TYPE_CHECKING, Optional, Tuple, Union
|
12
11
|
|
13
12
|
import typeguard
|
14
13
|
|
15
|
-
from parsl.log_utils import set_file_logger
|
16
14
|
from parsl.monitoring.errors import MonitoringHubStartError
|
15
|
+
from parsl.monitoring.radios.filesystem_router import filesystem_router_starter
|
17
16
|
from parsl.monitoring.radios.multiprocessing import MultiprocessingQueueRadioSender
|
18
|
-
from parsl.monitoring.
|
17
|
+
from parsl.monitoring.radios.udp_router import udp_router_starter
|
18
|
+
from parsl.monitoring.radios.zmq_router import zmq_router_starter
|
19
19
|
from parsl.monitoring.types import TaggedMonitoringMessage
|
20
20
|
from parsl.multiprocessing import ForkProcess, SizedQueue
|
21
|
-
from parsl.
|
22
|
-
from parsl.utils import RepresentationMixin, setproctitle
|
21
|
+
from parsl.utils import RepresentationMixin
|
23
22
|
|
24
23
|
_db_manager_excepts: Optional[Exception]
|
25
24
|
|
@@ -121,51 +120,68 @@ class MonitoringHub(RepresentationMixin):
|
|
121
120
|
# in the future, Queue will allow runtime subscripts.
|
122
121
|
|
123
122
|
if TYPE_CHECKING:
|
124
|
-
|
123
|
+
zmq_comm_q: Queue[Union[int, str]]
|
124
|
+
udp_comm_q: Queue[Union[int, str]]
|
125
125
|
else:
|
126
|
-
|
126
|
+
zmq_comm_q: Queue
|
127
|
+
udp_comm_q: Queue
|
127
128
|
|
128
|
-
|
129
|
+
zmq_comm_q = SizedQueue(maxsize=10)
|
130
|
+
udp_comm_q = SizedQueue(maxsize=10)
|
129
131
|
|
130
|
-
self.
|
131
|
-
self.exception_q = SizedQueue(maxsize=10)
|
132
|
-
|
133
|
-
self.resource_msgs: Queue[Union[TaggedMonitoringMessage, Literal["STOP"]]]
|
132
|
+
self.resource_msgs: Queue[TaggedMonitoringMessage]
|
134
133
|
self.resource_msgs = SizedQueue()
|
135
134
|
|
136
135
|
self.router_exit_event: ms.Event
|
137
136
|
self.router_exit_event = Event()
|
138
137
|
|
139
|
-
self.
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
self.
|
138
|
+
self.zmq_router_proc = ForkProcess(target=zmq_router_starter,
|
139
|
+
kwargs={"comm_q": zmq_comm_q,
|
140
|
+
"resource_msgs": self.resource_msgs,
|
141
|
+
"exit_event": self.router_exit_event,
|
142
|
+
"hub_address": self.hub_address,
|
143
|
+
"zmq_port_range": self.hub_port_range,
|
144
|
+
"run_dir": dfk_run_dir,
|
145
|
+
"logging_level": logging.DEBUG if self.monitoring_debug else logging.INFO,
|
146
|
+
},
|
147
|
+
name="Monitoring-ZMQ-Router-Process",
|
148
|
+
daemon=True,
|
149
|
+
)
|
150
|
+
self.zmq_router_proc.start()
|
151
|
+
|
152
|
+
self.udp_router_proc = ForkProcess(target=udp_router_starter,
|
153
|
+
kwargs={"comm_q": udp_comm_q,
|
154
|
+
"resource_msgs": self.resource_msgs,
|
155
|
+
"exit_event": self.router_exit_event,
|
156
|
+
"hub_address": self.hub_address,
|
157
|
+
"udp_port": self.hub_port,
|
158
|
+
"run_dir": dfk_run_dir,
|
159
|
+
"logging_level": logging.DEBUG if self.monitoring_debug else logging.INFO,
|
160
|
+
},
|
161
|
+
name="Monitoring-UDP-Router-Process",
|
162
|
+
daemon=True,
|
163
|
+
)
|
164
|
+
self.udp_router_proc.start()
|
165
|
+
|
166
|
+
self.dbm_exit_event: ms.Event
|
167
|
+
self.dbm_exit_event = Event()
|
154
168
|
|
155
169
|
self.dbm_proc = ForkProcess(target=dbm_starter,
|
156
|
-
args=(self.
|
170
|
+
args=(self.resource_msgs,),
|
157
171
|
kwargs={"run_dir": dfk_run_dir,
|
158
172
|
"logging_level": logging.DEBUG if self.monitoring_debug else logging.INFO,
|
159
173
|
"db_url": self.logging_endpoint,
|
174
|
+
"exit_event": self.dbm_exit_event,
|
160
175
|
},
|
161
176
|
name="Monitoring-DBM-Process",
|
162
177
|
daemon=True,
|
163
178
|
)
|
164
179
|
self.dbm_proc.start()
|
165
|
-
logger.info("Started
|
180
|
+
logger.info("Started ZMQ router process %s, UDP router process %s and DBM process %s",
|
181
|
+
self.zmq_router_proc.pid, self.udp_router_proc.pid, self.dbm_proc.pid)
|
166
182
|
|
167
|
-
self.filesystem_proc = ForkProcess(target=
|
168
|
-
args=(self.resource_msgs, dfk_run_dir),
|
183
|
+
self.filesystem_proc = ForkProcess(target=filesystem_router_starter,
|
184
|
+
args=(self.resource_msgs, dfk_run_dir, self.router_exit_event),
|
169
185
|
name="Monitoring-Filesystem-Process",
|
170
186
|
daemon=True
|
171
187
|
)
|
@@ -175,114 +191,98 @@ class MonitoringHub(RepresentationMixin):
|
|
175
191
|
self.radio = MultiprocessingQueueRadioSender(self.resource_msgs)
|
176
192
|
|
177
193
|
try:
|
178
|
-
|
179
|
-
|
180
|
-
|
194
|
+
zmq_comm_q_result = zmq_comm_q.get(block=True, timeout=120)
|
195
|
+
zmq_comm_q.close()
|
196
|
+
zmq_comm_q.join_thread()
|
181
197
|
except queue.Empty:
|
182
|
-
logger.error("
|
198
|
+
logger.error("Monitoring ZMQ Router has not reported port in 120s. Aborting")
|
183
199
|
raise MonitoringHubStartError()
|
184
200
|
|
185
|
-
if isinstance(
|
186
|
-
logger.error("MonitoringRouter sent an error message: %s",
|
187
|
-
raise RuntimeError(f"MonitoringRouter failed to start: {
|
201
|
+
if isinstance(zmq_comm_q_result, str):
|
202
|
+
logger.error("MonitoringRouter sent an error message: %s", zmq_comm_q_result)
|
203
|
+
raise RuntimeError(f"MonitoringRouter failed to start: {zmq_comm_q_result}")
|
188
204
|
|
189
|
-
|
205
|
+
self.hub_zmq_port = zmq_comm_q_result
|
190
206
|
|
207
|
+
try:
|
208
|
+
udp_comm_q_result = udp_comm_q.get(block=True, timeout=120)
|
209
|
+
udp_comm_q.close()
|
210
|
+
udp_comm_q.join_thread()
|
211
|
+
except queue.Empty:
|
212
|
+
logger.error("Monitoring UDP router has not reported port in 120s. Aborting")
|
213
|
+
raise MonitoringHubStartError()
|
214
|
+
|
215
|
+
if isinstance(udp_comm_q_result, str):
|
216
|
+
logger.error("MonitoringRouter sent an error message: %s", udp_comm_q_result)
|
217
|
+
raise RuntimeError(f"MonitoringRouter failed to start: {udp_comm_q_result}")
|
218
|
+
|
219
|
+
udp_port = udp_comm_q_result
|
191
220
|
self.monitoring_hub_url = "udp://{}:{}".format(self.hub_address, udp_port)
|
192
221
|
|
193
222
|
logger.info("Monitoring Hub initialized")
|
194
223
|
|
195
|
-
self.hub_zmq_port = zmq_port
|
196
|
-
|
197
224
|
def send(self, message: TaggedMonitoringMessage) -> None:
|
198
225
|
logger.debug("Sending message type %s", message[0])
|
199
226
|
self.radio.send(message)
|
200
227
|
|
201
228
|
def close(self) -> None:
|
202
229
|
logger.info("Terminating Monitoring Hub")
|
203
|
-
exception_msgs = []
|
204
|
-
while True:
|
205
|
-
try:
|
206
|
-
exception_msgs.append(self.exception_q.get(block=False))
|
207
|
-
logger.error("There was a queued exception (Either router or DBM process got exception much earlier?)")
|
208
|
-
except queue.Empty:
|
209
|
-
break
|
210
230
|
if self.monitoring_hub_active:
|
211
231
|
self.monitoring_hub_active = False
|
212
|
-
if exception_msgs:
|
213
|
-
for exception_msg in exception_msgs:
|
214
|
-
logger.error(
|
215
|
-
"%s process delivered an exception: %s. Terminating all monitoring processes immediately.",
|
216
|
-
exception_msg[0],
|
217
|
-
exception_msg[1]
|
218
|
-
)
|
219
|
-
self.router_proc.terminate()
|
220
|
-
self.dbm_proc.terminate()
|
221
|
-
self.filesystem_proc.terminate()
|
222
232
|
logger.info("Setting router termination event")
|
223
233
|
self.router_exit_event.set()
|
224
|
-
|
225
|
-
|
226
|
-
self.
|
234
|
+
|
235
|
+
logger.info("Waiting for ZMQ router to terminate")
|
236
|
+
join_terminate_close_proc(self.zmq_router_proc)
|
237
|
+
|
238
|
+
logger.info("Waiting for UDP router to terminate")
|
239
|
+
join_terminate_close_proc(self.udp_router_proc)
|
240
|
+
|
227
241
|
logger.debug("Finished waiting for router termination")
|
228
|
-
if len(exception_msgs) == 0:
|
229
|
-
logger.debug("Sending STOP to DBM")
|
230
|
-
self.resource_msgs.put("STOP")
|
231
|
-
else:
|
232
|
-
logger.debug("Not sending STOP to DBM, because there were DBM exceptions")
|
233
242
|
logger.debug("Waiting for DB termination")
|
234
|
-
self.
|
235
|
-
self.dbm_proc
|
243
|
+
self.dbm_exit_event.set()
|
244
|
+
join_terminate_close_proc(self.dbm_proc)
|
236
245
|
logger.debug("Finished waiting for DBM termination")
|
237
246
|
|
238
|
-
# should this be message based? it probably doesn't need to be if
|
239
|
-
# we believe we've received all messages
|
240
247
|
logger.info("Terminating filesystem radio receiver process")
|
241
|
-
self.filesystem_proc
|
242
|
-
self.filesystem_proc.join()
|
243
|
-
self.filesystem_proc.close()
|
248
|
+
join_terminate_close_proc(self.filesystem_proc)
|
244
249
|
|
245
250
|
logger.info("Closing monitoring multiprocessing queues")
|
246
|
-
self.exception_q.close()
|
247
|
-
self.exception_q.join_thread()
|
248
251
|
self.resource_msgs.close()
|
249
252
|
self.resource_msgs.join_thread()
|
250
253
|
logger.info("Closed monitoring multiprocessing queues")
|
251
254
|
|
252
255
|
|
253
|
-
|
254
|
-
|
255
|
-
|
256
|
-
|
257
|
-
|
258
|
-
|
259
|
-
|
260
|
-
|
261
|
-
|
262
|
-
|
263
|
-
|
264
|
-
logger.debug("
|
265
|
-
|
266
|
-
|
267
|
-
|
268
|
-
|
269
|
-
|
270
|
-
|
271
|
-
|
272
|
-
|
273
|
-
|
274
|
-
|
275
|
-
|
276
|
-
|
277
|
-
|
278
|
-
|
279
|
-
|
280
|
-
|
281
|
-
|
282
|
-
|
283
|
-
|
284
|
-
|
285
|
-
|
286
|
-
logger.exception("Exception processing %s - probably will be retried next iteration", filename)
|
287
|
-
|
288
|
-
time.sleep(1) # whats a good time for this poll?
|
256
|
+
def join_terminate_close_proc(process: ForkProcessType, *, timeout: int = 30) -> None:
|
257
|
+
"""Increasingly aggressively terminate a process.
|
258
|
+
|
259
|
+
This function assumes that the process is likely to exit before
|
260
|
+
the join timeout, driven by some other means, such as the
|
261
|
+
MonitoringHub router_exit_event. If the process does not exit, then
|
262
|
+
first terminate() and then kill() will be used to end the process.
|
263
|
+
|
264
|
+
In the case of a very mis-behaving process, this function might take
|
265
|
+
up to 3*timeout to exhaust all termination methods and return.
|
266
|
+
"""
|
267
|
+
logger.debug("Joining process")
|
268
|
+
process.join(timeout)
|
269
|
+
|
270
|
+
# run a sequence of increasingly aggressive steps to shut down the process.
|
271
|
+
if process.is_alive():
|
272
|
+
logger.error("Process did not join. Terminating.")
|
273
|
+
process.terminate()
|
274
|
+
process.join(timeout)
|
275
|
+
if process.is_alive():
|
276
|
+
logger.error("Process did not join after terminate. Killing.")
|
277
|
+
process.kill()
|
278
|
+
process.join(timeout)
|
279
|
+
# This kill should not be caught by any signal handlers so it is
|
280
|
+
# unlikely that this join will timeout. If it does, there isn't
|
281
|
+
# anything further to do except log an error in the next if-block.
|
282
|
+
|
283
|
+
if process.is_alive():
|
284
|
+
logger.error("Process failed to end")
|
285
|
+
# don't call close if the process hasn't ended:
|
286
|
+
# process.close() doesn't work on a running process.
|
287
|
+
else:
|
288
|
+
process.close()
|
@@ -0,0 +1,54 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
|
3
|
+
import logging
|
4
|
+
import os
|
5
|
+
import pickle
|
6
|
+
import time
|
7
|
+
from multiprocessing.queues import Queue
|
8
|
+
from multiprocessing.synchronize import Event
|
9
|
+
from typing import cast
|
10
|
+
|
11
|
+
from parsl.log_utils import set_file_logger
|
12
|
+
from parsl.monitoring.radios.multiprocessing import MultiprocessingQueueRadioSender
|
13
|
+
from parsl.monitoring.types import TaggedMonitoringMessage
|
14
|
+
from parsl.process_loggers import wrap_with_logs
|
15
|
+
from parsl.utils import setproctitle
|
16
|
+
|
17
|
+
|
18
|
+
@wrap_with_logs
|
19
|
+
def filesystem_router_starter(q: Queue[TaggedMonitoringMessage], run_dir: str, exit_event: Event) -> None:
|
20
|
+
logger = set_file_logger(f"{run_dir}/monitoring_filesystem_radio.log",
|
21
|
+
name="monitoring_filesystem_radio",
|
22
|
+
level=logging.INFO)
|
23
|
+
|
24
|
+
logger.info("Starting filesystem radio receiver")
|
25
|
+
setproctitle("parsl: monitoring filesystem receiver")
|
26
|
+
base_path = f"{run_dir}/monitor-fs-radio/"
|
27
|
+
tmp_dir = f"{base_path}/tmp/"
|
28
|
+
new_dir = f"{base_path}/new/"
|
29
|
+
logger.debug("Creating new and tmp paths under %s", base_path)
|
30
|
+
|
31
|
+
target_radio = MultiprocessingQueueRadioSender(q)
|
32
|
+
|
33
|
+
os.makedirs(tmp_dir, exist_ok=True)
|
34
|
+
os.makedirs(new_dir, exist_ok=True)
|
35
|
+
|
36
|
+
while not exit_event.is_set():
|
37
|
+
logger.debug("Start filesystem radio receiver loop")
|
38
|
+
|
39
|
+
# iterate over files in new_dir
|
40
|
+
for filename in os.listdir(new_dir):
|
41
|
+
try:
|
42
|
+
logger.info("Processing filesystem radio file %s", filename)
|
43
|
+
full_path_filename = f"{new_dir}/{filename}"
|
44
|
+
with open(full_path_filename, "rb") as f:
|
45
|
+
message = pickle.load(f)
|
46
|
+
logger.debug("Message received is: %s", message)
|
47
|
+
assert isinstance(message, tuple)
|
48
|
+
target_radio.send(cast(TaggedMonitoringMessage, message))
|
49
|
+
os.remove(full_path_filename)
|
50
|
+
except Exception:
|
51
|
+
logger.exception("Exception processing %s - probably will be retried next iteration", filename)
|
52
|
+
|
53
|
+
time.sleep(1) # whats a good time for this poll?
|
54
|
+
logger.info("Ending filesystem radio receiver")
|
@@ -5,17 +5,14 @@ import multiprocessing.queues as mpq
|
|
5
5
|
import os
|
6
6
|
import pickle
|
7
7
|
import socket
|
8
|
-
import threading
|
9
8
|
import time
|
10
9
|
from multiprocessing.synchronize import Event
|
11
|
-
from typing import Optional
|
10
|
+
from typing import Optional
|
12
11
|
|
13
12
|
import typeguard
|
14
|
-
import zmq
|
15
13
|
|
16
14
|
from parsl.log_utils import set_file_logger
|
17
15
|
from parsl.monitoring.radios.multiprocessing import MultiprocessingQueueRadioSender
|
18
|
-
from parsl.monitoring.types import TaggedMonitoringMessage
|
19
16
|
from parsl.process_loggers import wrap_with_logs
|
20
17
|
from parsl.utils import setproctitle
|
21
18
|
|
@@ -28,7 +25,6 @@ class MonitoringRouter:
|
|
28
25
|
*,
|
29
26
|
hub_address: str,
|
30
27
|
udp_port: Optional[int] = None,
|
31
|
-
zmq_port_range: Tuple[int, int] = (55050, 56000),
|
32
28
|
|
33
29
|
monitoring_hub_address: str = "127.0.0.1",
|
34
30
|
run_dir: str = ".",
|
@@ -45,9 +41,6 @@ class MonitoringRouter:
|
|
45
41
|
The ip address at which the workers will be able to reach the Hub.
|
46
42
|
udp_port : int
|
47
43
|
The specific port at which workers will be able to reach the Hub via UDP. Default: None
|
48
|
-
zmq_port_range : tuple(int, int)
|
49
|
-
The MonitoringHub picks ports at random from the range which will be used by Hub.
|
50
|
-
Default: (55050, 56000)
|
51
44
|
run_dir : str
|
52
45
|
Parsl log directory paths. Logs and temp files go here. Default: '.'
|
53
46
|
logging_level : int
|
@@ -60,7 +53,7 @@ class MonitoringRouter:
|
|
60
53
|
An event that the main Parsl process will set to signal that the monitoring router should shut down.
|
61
54
|
"""
|
62
55
|
os.makedirs(run_dir, exist_ok=True)
|
63
|
-
self.logger = set_file_logger(f"{run_dir}/
|
56
|
+
self.logger = set_file_logger(f"{run_dir}/monitoring_udp_router.log",
|
64
57
|
name="monitoring_router",
|
65
58
|
level=logging_level)
|
66
59
|
self.logger.debug("Monitoring router starting")
|
@@ -88,37 +81,12 @@ class MonitoringRouter:
|
|
88
81
|
self.udp_sock.settimeout(self.loop_freq / 1000)
|
89
82
|
self.logger.info("Initialized the UDP socket on 0.0.0.0:{}".format(self.udp_port))
|
90
83
|
|
91
|
-
self._context = zmq.Context()
|
92
|
-
self.zmq_receiver_channel = self._context.socket(zmq.DEALER)
|
93
|
-
self.zmq_receiver_channel.setsockopt(zmq.LINGER, 0)
|
94
|
-
self.zmq_receiver_channel.set_hwm(0)
|
95
|
-
self.zmq_receiver_channel.RCVTIMEO = int(self.loop_freq) # in milliseconds
|
96
|
-
self.logger.debug("hub_address: {}. zmq_port_range {}".format(hub_address, zmq_port_range))
|
97
|
-
self.zmq_receiver_port = self.zmq_receiver_channel.bind_to_random_port("tcp://*",
|
98
|
-
min_port=zmq_port_range[0],
|
99
|
-
max_port=zmq_port_range[1])
|
100
|
-
|
101
84
|
self.target_radio = MultiprocessingQueueRadioSender(resource_msgs)
|
102
85
|
self.exit_event = exit_event
|
103
86
|
|
104
87
|
@wrap_with_logs(target="monitoring_router")
|
105
88
|
def start(self) -> None:
|
106
|
-
self.logger.info("Starting UDP listener
|
107
|
-
udp_radio_receiver_thread = threading.Thread(target=self.start_udp_listener, daemon=True)
|
108
|
-
udp_radio_receiver_thread.start()
|
109
|
-
|
110
|
-
self.logger.info("Starting ZMQ listener thread")
|
111
|
-
zmq_radio_receiver_thread = threading.Thread(target=self.start_zmq_listener, daemon=True)
|
112
|
-
zmq_radio_receiver_thread.start()
|
113
|
-
|
114
|
-
self.logger.info("Joining on ZMQ listener thread")
|
115
|
-
zmq_radio_receiver_thread.join()
|
116
|
-
self.logger.info("Joining on UDP listener thread")
|
117
|
-
udp_radio_receiver_thread.join()
|
118
|
-
self.logger.info("Joined on both ZMQ and UDP listener threads")
|
119
|
-
|
120
|
-
@wrap_with_logs(target="monitoring_router")
|
121
|
-
def start_udp_listener(self) -> None:
|
89
|
+
self.logger.info("Starting UDP listener")
|
122
90
|
try:
|
123
91
|
while not self.exit_event.is_set():
|
124
92
|
try:
|
@@ -145,55 +113,23 @@ class MonitoringRouter:
|
|
145
113
|
finally:
|
146
114
|
self.logger.info("UDP listener finished")
|
147
115
|
|
148
|
-
@wrap_with_logs(target="monitoring_router")
|
149
|
-
def start_zmq_listener(self) -> None:
|
150
|
-
try:
|
151
|
-
while not self.exit_event.is_set():
|
152
|
-
try:
|
153
|
-
dfk_loop_start = time.time()
|
154
|
-
while time.time() - dfk_loop_start < 1.0: # TODO make configurable
|
155
|
-
# note that nothing checks that msg really is of the annotated type
|
156
|
-
msg: TaggedMonitoringMessage
|
157
|
-
msg = self.zmq_receiver_channel.recv_pyobj()
|
158
|
-
|
159
|
-
assert isinstance(msg, tuple), "ZMQ Receiver expects only tuples, got {}".format(msg)
|
160
|
-
assert len(msg) >= 1, "ZMQ Receiver expects tuples of length at least 1, got {}".format(msg)
|
161
|
-
assert len(msg) == 2, "ZMQ Receiver expects message tuples of exactly length 2, got {}".format(msg)
|
162
|
-
|
163
|
-
self.target_radio.send(msg)
|
164
|
-
except zmq.Again:
|
165
|
-
pass
|
166
|
-
except Exception:
|
167
|
-
# This will catch malformed messages. What happens if the
|
168
|
-
# channel is broken in such a way that it always raises
|
169
|
-
# an exception? Looping on this would maybe be the wrong
|
170
|
-
# thing to do.
|
171
|
-
self.logger.warning("Failure processing a ZMQ message", exc_info=True)
|
172
|
-
|
173
|
-
self.logger.info("ZMQ listener finishing normally")
|
174
|
-
finally:
|
175
|
-
self.logger.info("ZMQ listener finished")
|
176
|
-
|
177
116
|
|
178
117
|
@wrap_with_logs
|
179
118
|
@typeguard.typechecked
|
180
|
-
def
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
|
185
|
-
|
186
|
-
|
187
|
-
|
188
|
-
|
189
|
-
|
190
|
-
|
191
|
-
logging_level: int) -> None:
|
192
|
-
setproctitle("parsl: monitoring router")
|
119
|
+
def udp_router_starter(*,
|
120
|
+
comm_q: mpq.Queue,
|
121
|
+
resource_msgs: mpq.Queue,
|
122
|
+
exit_event: Event,
|
123
|
+
|
124
|
+
hub_address: str,
|
125
|
+
udp_port: Optional[int],
|
126
|
+
|
127
|
+
run_dir: str,
|
128
|
+
logging_level: int) -> None:
|
129
|
+
setproctitle("parsl: monitoring UDP router")
|
193
130
|
try:
|
194
131
|
router = MonitoringRouter(hub_address=hub_address,
|
195
132
|
udp_port=udp_port,
|
196
|
-
zmq_port_range=zmq_port_range,
|
197
133
|
run_dir=run_dir,
|
198
134
|
logging_level=logging_level,
|
199
135
|
resource_msgs=resource_msgs,
|
@@ -202,11 +138,10 @@ def router_starter(*,
|
|
202
138
|
logger.error("MonitoringRouter construction failed.", exc_info=True)
|
203
139
|
comm_q.put(f"Monitoring router construction failed: {e}")
|
204
140
|
else:
|
205
|
-
comm_q.put(
|
141
|
+
comm_q.put(router.udp_port)
|
206
142
|
|
207
143
|
router.logger.info("Starting MonitoringRouter in router_starter")
|
208
144
|
try:
|
209
145
|
router.start()
|
210
|
-
except Exception
|
211
|
-
router.logger.exception("router
|
212
|
-
exception_q.put(('Hub', str(e)))
|
146
|
+
except Exception:
|
147
|
+
router.logger.exception("UDP router start exception")
|
@@ -0,0 +1,131 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
|
3
|
+
import logging
|
4
|
+
import multiprocessing.queues as mpq
|
5
|
+
import os
|
6
|
+
import time
|
7
|
+
from multiprocessing.synchronize import Event
|
8
|
+
from typing import Tuple
|
9
|
+
|
10
|
+
import typeguard
|
11
|
+
import zmq
|
12
|
+
|
13
|
+
from parsl.log_utils import set_file_logger
|
14
|
+
from parsl.monitoring.radios.multiprocessing import MultiprocessingQueueRadioSender
|
15
|
+
from parsl.monitoring.types import TaggedMonitoringMessage
|
16
|
+
from parsl.process_loggers import wrap_with_logs
|
17
|
+
from parsl.utils import setproctitle
|
18
|
+
|
19
|
+
logger = logging.getLogger(__name__)
|
20
|
+
|
21
|
+
|
22
|
+
class MonitoringRouter:
|
23
|
+
|
24
|
+
def __init__(self,
|
25
|
+
*,
|
26
|
+
hub_address: str,
|
27
|
+
zmq_port_range: Tuple[int, int] = (55050, 56000),
|
28
|
+
|
29
|
+
run_dir: str = ".",
|
30
|
+
logging_level: int = logging.INFO,
|
31
|
+
resource_msgs: mpq.Queue,
|
32
|
+
exit_event: Event,
|
33
|
+
):
|
34
|
+
""" Initializes a monitoring configuration class.
|
35
|
+
|
36
|
+
Parameters
|
37
|
+
----------
|
38
|
+
hub_address : str
|
39
|
+
The ip address at which the workers will be able to reach the Hub.
|
40
|
+
zmq_port_range : tuple(int, int)
|
41
|
+
The MonitoringHub picks ports at random from the range which will be used by Hub.
|
42
|
+
Default: (55050, 56000)
|
43
|
+
run_dir : str
|
44
|
+
Parsl log directory paths. Logs and temp files go here. Default: '.'
|
45
|
+
logging_level : int
|
46
|
+
Logging level as defined in the logging module. Default: logging.INFO
|
47
|
+
resource_msgs : multiprocessing.Queue
|
48
|
+
A multiprocessing queue to receive messages to be routed onwards to the database process
|
49
|
+
exit_event : Event
|
50
|
+
An event that the main Parsl process will set to signal that the monitoring router should shut down.
|
51
|
+
"""
|
52
|
+
os.makedirs(run_dir, exist_ok=True)
|
53
|
+
self.logger = set_file_logger(f"{run_dir}/monitoring_zmq_router.log",
|
54
|
+
name="monitoring_router",
|
55
|
+
level=logging_level)
|
56
|
+
self.logger.debug("Monitoring router starting")
|
57
|
+
|
58
|
+
self.hub_address = hub_address
|
59
|
+
|
60
|
+
self.loop_freq = 10.0 # milliseconds
|
61
|
+
|
62
|
+
self._context = zmq.Context()
|
63
|
+
self.zmq_receiver_channel = self._context.socket(zmq.DEALER)
|
64
|
+
self.zmq_receiver_channel.setsockopt(zmq.LINGER, 0)
|
65
|
+
self.zmq_receiver_channel.set_hwm(0)
|
66
|
+
self.zmq_receiver_channel.RCVTIMEO = int(self.loop_freq) # in milliseconds
|
67
|
+
self.logger.debug("hub_address: {}. zmq_port_range {}".format(hub_address, zmq_port_range))
|
68
|
+
self.zmq_receiver_port = self.zmq_receiver_channel.bind_to_random_port("tcp://*",
|
69
|
+
min_port=zmq_port_range[0],
|
70
|
+
max_port=zmq_port_range[1])
|
71
|
+
|
72
|
+
self.target_radio = MultiprocessingQueueRadioSender(resource_msgs)
|
73
|
+
self.exit_event = exit_event
|
74
|
+
|
75
|
+
@wrap_with_logs(target="monitoring_router")
|
76
|
+
def start(self) -> None:
|
77
|
+
self.logger.info("Starting ZMQ listener")
|
78
|
+
try:
|
79
|
+
while not self.exit_event.is_set():
|
80
|
+
try:
|
81
|
+
dfk_loop_start = time.time()
|
82
|
+
while time.time() - dfk_loop_start < 1.0: # TODO make configurable
|
83
|
+
# note that nothing checks that msg really is of the annotated type
|
84
|
+
msg: TaggedMonitoringMessage
|
85
|
+
msg = self.zmq_receiver_channel.recv_pyobj()
|
86
|
+
|
87
|
+
assert isinstance(msg, tuple), "ZMQ Receiver expects only tuples, got {}".format(msg)
|
88
|
+
assert len(msg) >= 1, "ZMQ Receiver expects tuples of length at least 1, got {}".format(msg)
|
89
|
+
assert len(msg) == 2, "ZMQ Receiver expects message tuples of exactly length 2, got {}".format(msg)
|
90
|
+
|
91
|
+
self.target_radio.send(msg)
|
92
|
+
except zmq.Again:
|
93
|
+
pass
|
94
|
+
except Exception:
|
95
|
+
# This will catch malformed messages. What happens if the
|
96
|
+
# channel is broken in such a way that it always raises
|
97
|
+
# an exception? Looping on this would maybe be the wrong
|
98
|
+
# thing to do.
|
99
|
+
self.logger.warning("Failure processing a ZMQ message", exc_info=True)
|
100
|
+
|
101
|
+
self.logger.info("ZMQ listener finishing normally")
|
102
|
+
finally:
|
103
|
+
self.logger.info("ZMQ listener finished")
|
104
|
+
|
105
|
+
|
106
|
+
@wrap_with_logs
|
107
|
+
@typeguard.typechecked
|
108
|
+
def zmq_router_starter(*,
|
109
|
+
comm_q: mpq.Queue,
|
110
|
+
resource_msgs: mpq.Queue,
|
111
|
+
exit_event: Event,
|
112
|
+
|
113
|
+
hub_address: str,
|
114
|
+
zmq_port_range: Tuple[int, int],
|
115
|
+
|
116
|
+
run_dir: str,
|
117
|
+
logging_level: int) -> None:
|
118
|
+
setproctitle("parsl: monitoring zmq router")
|
119
|
+
try:
|
120
|
+
router = MonitoringRouter(hub_address=hub_address,
|
121
|
+
zmq_port_range=zmq_port_range,
|
122
|
+
run_dir=run_dir,
|
123
|
+
logging_level=logging_level,
|
124
|
+
resource_msgs=resource_msgs,
|
125
|
+
exit_event=exit_event)
|
126
|
+
except Exception as e:
|
127
|
+
logger.error("MonitoringRouter construction failed.", exc_info=True)
|
128
|
+
comm_q.put(f"Monitoring router construction failed: {e}")
|
129
|
+
else:
|
130
|
+
comm_q.put(router.zmq_receiver_port)
|
131
|
+
router.start()
|