parsl 2025.3.10__py3-none-any.whl → 2025.3.24__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- parsl/dataflow/dflow.py +1 -3
- parsl/executors/base.py +13 -37
- parsl/executors/flux/executor.py +1 -0
- parsl/executors/globus_compute.py +1 -1
- parsl/executors/high_throughput/executor.py +18 -0
- parsl/executors/high_throughput/mpi_resource_management.py +2 -0
- parsl/executors/high_throughput/process_worker_pool.py +89 -82
- parsl/executors/radical/executor.py +1 -0
- parsl/executors/status_handling.py +8 -0
- parsl/executors/taskvine/executor.py +1 -0
- parsl/executors/workqueue/executor.py +1 -0
- parsl/monitoring/db_manager.py +16 -10
- parsl/monitoring/errors.py +5 -0
- parsl/monitoring/monitoring.py +61 -117
- parsl/monitoring/radios/filesystem_router.py +4 -2
- parsl/monitoring/radios/udp_router.py +1 -3
- parsl/monitoring/radios/zmq_router.py +80 -25
- parsl/multiprocessing.py +42 -2
- parsl/tests/test_monitoring/test_exit_helper.py +54 -0
- parsl/tests/test_monitoring/test_fuzz_zmq.py +1 -1
- parsl/tests/test_monitoring/test_radio_zmq.py +27 -0
- parsl/tests/test_monitoring/test_stdouterr.py +3 -0
- parsl/tests/test_shutdown/test_kill_monitoring.py +1 -1
- parsl/usage_tracking/usage.py +2 -2
- parsl/version.py +1 -1
- {parsl-2025.3.10.data → parsl-2025.3.24.data}/scripts/process_worker_pool.py +89 -82
- {parsl-2025.3.10.dist-info → parsl-2025.3.24.dist-info}/METADATA +4 -4
- {parsl-2025.3.10.dist-info → parsl-2025.3.24.dist-info}/RECORD +35 -33
- {parsl-2025.3.10.data → parsl-2025.3.24.data}/scripts/exec_parsl_function.py +0 -0
- {parsl-2025.3.10.data → parsl-2025.3.24.data}/scripts/interchange.py +0 -0
- {parsl-2025.3.10.data → parsl-2025.3.24.data}/scripts/parsl_coprocess.py +0 -0
- {parsl-2025.3.10.dist-info → parsl-2025.3.24.dist-info}/LICENSE +0 -0
- {parsl-2025.3.10.dist-info → parsl-2025.3.24.dist-info}/WHEEL +0 -0
- {parsl-2025.3.10.dist-info → parsl-2025.3.24.dist-info}/entry_points.txt +0 -0
- {parsl-2025.3.10.dist-info → parsl-2025.3.24.dist-info}/top_level.txt +0 -0
parsl/monitoring/monitoring.py
CHANGED
@@ -4,9 +4,9 @@ import logging
|
|
4
4
|
import multiprocessing.synchronize as ms
|
5
5
|
import os
|
6
6
|
import queue
|
7
|
-
|
7
|
+
import warnings
|
8
8
|
from multiprocessing.queues import Queue
|
9
|
-
from typing import TYPE_CHECKING,
|
9
|
+
from typing import TYPE_CHECKING, Any, Optional, Union
|
10
10
|
|
11
11
|
import typeguard
|
12
12
|
|
@@ -14,9 +14,13 @@ from parsl.monitoring.errors import MonitoringHubStartError
|
|
14
14
|
from parsl.monitoring.radios.filesystem_router import filesystem_router_starter
|
15
15
|
from parsl.monitoring.radios.multiprocessing import MultiprocessingQueueRadioSender
|
16
16
|
from parsl.monitoring.radios.udp_router import udp_router_starter
|
17
|
-
from parsl.monitoring.radios.zmq_router import zmq_router_starter
|
18
17
|
from parsl.monitoring.types import TaggedMonitoringMessage
|
19
|
-
from parsl.multiprocessing import
|
18
|
+
from parsl.multiprocessing import (
|
19
|
+
SizedQueue,
|
20
|
+
SpawnEvent,
|
21
|
+
SpawnProcess,
|
22
|
+
join_terminate_close_proc,
|
23
|
+
)
|
20
24
|
from parsl.utils import RepresentationMixin
|
21
25
|
|
22
26
|
_db_manager_excepts: Optional[Exception]
|
@@ -37,7 +41,7 @@ class MonitoringHub(RepresentationMixin):
|
|
37
41
|
def __init__(self,
|
38
42
|
hub_address: str,
|
39
43
|
hub_port: Optional[int] = None,
|
40
|
-
hub_port_range:
|
44
|
+
hub_port_range: Any = None,
|
41
45
|
|
42
46
|
workflow_name: Optional[str] = None,
|
43
47
|
workflow_version: Optional[str] = None,
|
@@ -56,12 +60,11 @@ class MonitoringHub(RepresentationMixin):
|
|
56
60
|
Note that despite the similar name, this is not related to
|
57
61
|
hub_port_range.
|
58
62
|
Default: None
|
59
|
-
hub_port_range :
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
Default: (55050, 56000)
|
63
|
+
hub_port_range : unused
|
64
|
+
Unused, but retained until 2025-09-14 to avoid configuration errors.
|
65
|
+
This value previously configured one ZMQ channel inside the
|
66
|
+
HighThroughputExecutor. That ZMQ channel is now configured by the
|
67
|
+
interchange_port_range parameter of HighThroughputExecutor.
|
65
68
|
workflow_name : str
|
66
69
|
The name for the workflow. Default to the name of the parsl script
|
67
70
|
workflow_version : str
|
@@ -88,6 +91,13 @@ class MonitoringHub(RepresentationMixin):
|
|
88
91
|
|
89
92
|
self.hub_address = hub_address
|
90
93
|
self.hub_port = hub_port
|
94
|
+
|
95
|
+
if hub_port_range is not None:
|
96
|
+
message = "Instead of MonitoringHub.hub_port_range, Use HighThroughputExecutor.interchange_port_range"
|
97
|
+
warnings.warn(message, DeprecationWarning)
|
98
|
+
logger.warning(message)
|
99
|
+
# This is used by RepresentationMixin so needs to exist as an attribute
|
100
|
+
# even though now it is otherwise unused.
|
91
101
|
self.hub_port_range = hub_port_range
|
92
102
|
|
93
103
|
self.logging_endpoint = logging_endpoint
|
@@ -119,91 +129,59 @@ class MonitoringHub(RepresentationMixin):
|
|
119
129
|
# in the future, Queue will allow runtime subscripts.
|
120
130
|
|
121
131
|
if TYPE_CHECKING:
|
122
|
-
zmq_comm_q: Queue[Union[int, str]]
|
123
132
|
udp_comm_q: Queue[Union[int, str]]
|
124
133
|
else:
|
125
|
-
zmq_comm_q: Queue
|
126
134
|
udp_comm_q: Queue
|
127
135
|
|
128
|
-
zmq_comm_q = SizedQueue(maxsize=10)
|
129
136
|
udp_comm_q = SizedQueue(maxsize=10)
|
130
137
|
|
131
|
-
self.
|
132
|
-
self.exception_q = SizedQueue(maxsize=10)
|
133
|
-
|
134
|
-
self.resource_msgs: Queue[Union[TaggedMonitoringMessage, Literal["STOP"]]]
|
138
|
+
self.resource_msgs: Queue[TaggedMonitoringMessage]
|
135
139
|
self.resource_msgs = SizedQueue()
|
136
140
|
|
137
141
|
self.router_exit_event: ms.Event
|
138
|
-
self.router_exit_event =
|
139
|
-
|
140
|
-
self.
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
)
|
153
|
-
self.zmq_router_proc.start()
|
154
|
-
|
155
|
-
self.udp_router_proc = ForkProcess(target=udp_router_starter,
|
156
|
-
kwargs={"comm_q": udp_comm_q,
|
157
|
-
"exception_q": self.exception_q,
|
158
|
-
"resource_msgs": self.resource_msgs,
|
159
|
-
"exit_event": self.router_exit_event,
|
160
|
-
"hub_address": self.hub_address,
|
161
|
-
"udp_port": self.hub_port,
|
162
|
-
"run_dir": dfk_run_dir,
|
163
|
-
"logging_level": logging.DEBUG if self.monitoring_debug else logging.INFO,
|
164
|
-
},
|
165
|
-
name="Monitoring-UDP-Router-Process",
|
166
|
-
daemon=True,
|
167
|
-
)
|
142
|
+
self.router_exit_event = SpawnEvent()
|
143
|
+
|
144
|
+
self.udp_router_proc = SpawnProcess(target=udp_router_starter,
|
145
|
+
kwargs={"comm_q": udp_comm_q,
|
146
|
+
"resource_msgs": self.resource_msgs,
|
147
|
+
"exit_event": self.router_exit_event,
|
148
|
+
"hub_address": self.hub_address,
|
149
|
+
"udp_port": self.hub_port,
|
150
|
+
"run_dir": dfk_run_dir,
|
151
|
+
"logging_level": logging.DEBUG if self.monitoring_debug else logging.INFO,
|
152
|
+
},
|
153
|
+
name="Monitoring-UDP-Router-Process",
|
154
|
+
daemon=True,
|
155
|
+
)
|
168
156
|
self.udp_router_proc.start()
|
169
157
|
|
170
|
-
self.
|
171
|
-
|
172
|
-
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
|
178
|
-
|
158
|
+
self.dbm_exit_event: ms.Event
|
159
|
+
self.dbm_exit_event = SpawnEvent()
|
160
|
+
|
161
|
+
self.dbm_proc = SpawnProcess(target=dbm_starter,
|
162
|
+
args=(self.resource_msgs,),
|
163
|
+
kwargs={"run_dir": dfk_run_dir,
|
164
|
+
"logging_level": logging.DEBUG if self.monitoring_debug else logging.INFO,
|
165
|
+
"db_url": self.logging_endpoint,
|
166
|
+
"exit_event": self.dbm_exit_event,
|
167
|
+
},
|
168
|
+
name="Monitoring-DBM-Process",
|
169
|
+
daemon=True,
|
170
|
+
)
|
179
171
|
self.dbm_proc.start()
|
180
|
-
logger.info("Started
|
181
|
-
self.
|
182
|
-
|
183
|
-
self.filesystem_proc =
|
184
|
-
|
185
|
-
|
186
|
-
|
187
|
-
|
172
|
+
logger.info("Started UDP router process %s and DBM process %s",
|
173
|
+
self.udp_router_proc.pid, self.dbm_proc.pid)
|
174
|
+
|
175
|
+
self.filesystem_proc = SpawnProcess(target=filesystem_router_starter,
|
176
|
+
args=(self.resource_msgs, dfk_run_dir, self.router_exit_event),
|
177
|
+
name="Monitoring-Filesystem-Process",
|
178
|
+
daemon=True
|
179
|
+
)
|
188
180
|
self.filesystem_proc.start()
|
189
181
|
logger.info("Started filesystem radio receiver process %s", self.filesystem_proc.pid)
|
190
182
|
|
191
183
|
self.radio = MultiprocessingQueueRadioSender(self.resource_msgs)
|
192
184
|
|
193
|
-
try:
|
194
|
-
zmq_comm_q_result = zmq_comm_q.get(block=True, timeout=120)
|
195
|
-
zmq_comm_q.close()
|
196
|
-
zmq_comm_q.join_thread()
|
197
|
-
except queue.Empty:
|
198
|
-
logger.error("Monitoring ZMQ Router has not reported port in 120s. Aborting")
|
199
|
-
raise MonitoringHubStartError()
|
200
|
-
|
201
|
-
if isinstance(zmq_comm_q_result, str):
|
202
|
-
logger.error("MonitoringRouter sent an error message: %s", zmq_comm_q_result)
|
203
|
-
raise RuntimeError(f"MonitoringRouter failed to start: {zmq_comm_q_result}")
|
204
|
-
|
205
|
-
self.hub_zmq_port = zmq_comm_q_result
|
206
|
-
|
207
185
|
try:
|
208
186
|
udp_comm_q_result = udp_comm_q.get(block=True, timeout=120)
|
209
187
|
udp_comm_q.close()
|
@@ -227,58 +205,24 @@ class MonitoringHub(RepresentationMixin):
|
|
227
205
|
|
228
206
|
def close(self) -> None:
|
229
207
|
logger.info("Terminating Monitoring Hub")
|
230
|
-
exception_msgs = []
|
231
|
-
while True:
|
232
|
-
try:
|
233
|
-
exception_msgs.append(self.exception_q.get(block=False))
|
234
|
-
logger.error("There was a queued exception (Either router or DBM process got exception much earlier?)")
|
235
|
-
except queue.Empty:
|
236
|
-
break
|
237
208
|
if self.monitoring_hub_active:
|
238
209
|
self.monitoring_hub_active = False
|
239
|
-
if exception_msgs:
|
240
|
-
for exception_msg in exception_msgs:
|
241
|
-
logger.error(
|
242
|
-
"%s process delivered an exception: %s. Terminating all monitoring processes immediately.",
|
243
|
-
exception_msg[0],
|
244
|
-
exception_msg[1]
|
245
|
-
)
|
246
|
-
self.zmq_router_proc.terminate()
|
247
|
-
self.udp_router_proc.terminate()
|
248
|
-
self.dbm_proc.terminate()
|
249
|
-
self.filesystem_proc.terminate()
|
250
210
|
logger.info("Setting router termination event")
|
251
211
|
self.router_exit_event.set()
|
252
212
|
|
253
|
-
logger.info("Waiting for ZMQ router to terminate")
|
254
|
-
self.zmq_router_proc.join()
|
255
|
-
self.zmq_router_proc.close()
|
256
|
-
|
257
213
|
logger.info("Waiting for UDP router to terminate")
|
258
|
-
self.udp_router_proc
|
259
|
-
self.udp_router_proc.close()
|
214
|
+
join_terminate_close_proc(self.udp_router_proc)
|
260
215
|
|
261
216
|
logger.debug("Finished waiting for router termination")
|
262
|
-
if len(exception_msgs) == 0:
|
263
|
-
logger.debug("Sending STOP to DBM")
|
264
|
-
self.resource_msgs.put("STOP")
|
265
|
-
else:
|
266
|
-
logger.debug("Not sending STOP to DBM, because there were DBM exceptions")
|
267
217
|
logger.debug("Waiting for DB termination")
|
268
|
-
self.
|
269
|
-
self.dbm_proc
|
218
|
+
self.dbm_exit_event.set()
|
219
|
+
join_terminate_close_proc(self.dbm_proc)
|
270
220
|
logger.debug("Finished waiting for DBM termination")
|
271
221
|
|
272
|
-
# should this be message based? it probably doesn't need to be if
|
273
|
-
# we believe we've received all messages
|
274
222
|
logger.info("Terminating filesystem radio receiver process")
|
275
|
-
self.filesystem_proc
|
276
|
-
self.filesystem_proc.join()
|
277
|
-
self.filesystem_proc.close()
|
223
|
+
join_terminate_close_proc(self.filesystem_proc)
|
278
224
|
|
279
225
|
logger.info("Closing monitoring multiprocessing queues")
|
280
|
-
self.exception_q.close()
|
281
|
-
self.exception_q.join_thread()
|
282
226
|
self.resource_msgs.close()
|
283
227
|
self.resource_msgs.join_thread()
|
284
228
|
logger.info("Closed monitoring multiprocessing queues")
|
@@ -5,6 +5,7 @@ import os
|
|
5
5
|
import pickle
|
6
6
|
import time
|
7
7
|
from multiprocessing.queues import Queue
|
8
|
+
from multiprocessing.synchronize import Event
|
8
9
|
from typing import cast
|
9
10
|
|
10
11
|
from parsl.log_utils import set_file_logger
|
@@ -15,7 +16,7 @@ from parsl.utils import setproctitle
|
|
15
16
|
|
16
17
|
|
17
18
|
@wrap_with_logs
|
18
|
-
def filesystem_router_starter(q: Queue[TaggedMonitoringMessage], run_dir: str) -> None:
|
19
|
+
def filesystem_router_starter(q: Queue[TaggedMonitoringMessage], run_dir: str, exit_event: Event) -> None:
|
19
20
|
logger = set_file_logger(f"{run_dir}/monitoring_filesystem_radio.log",
|
20
21
|
name="monitoring_filesystem_radio",
|
21
22
|
level=logging.INFO)
|
@@ -32,7 +33,7 @@ def filesystem_router_starter(q: Queue[TaggedMonitoringMessage], run_dir: str) -
|
|
32
33
|
os.makedirs(tmp_dir, exist_ok=True)
|
33
34
|
os.makedirs(new_dir, exist_ok=True)
|
34
35
|
|
35
|
-
while
|
36
|
+
while not exit_event.is_set():
|
36
37
|
logger.debug("Start filesystem radio receiver loop")
|
37
38
|
|
38
39
|
# iterate over files in new_dir
|
@@ -50,3 +51,4 @@ def filesystem_router_starter(q: Queue[TaggedMonitoringMessage], run_dir: str) -
|
|
50
51
|
logger.exception("Exception processing %s - probably will be retried next iteration", filename)
|
51
52
|
|
52
53
|
time.sleep(1) # whats a good time for this poll?
|
54
|
+
logger.info("Ending filesystem radio receiver")
|
@@ -118,7 +118,6 @@ class MonitoringRouter:
|
|
118
118
|
@typeguard.typechecked
|
119
119
|
def udp_router_starter(*,
|
120
120
|
comm_q: mpq.Queue,
|
121
|
-
exception_q: mpq.Queue,
|
122
121
|
resource_msgs: mpq.Queue,
|
123
122
|
exit_event: Event,
|
124
123
|
|
@@ -144,6 +143,5 @@ def udp_router_starter(*,
|
|
144
143
|
router.logger.info("Starting MonitoringRouter in router_starter")
|
145
144
|
try:
|
146
145
|
router.start()
|
147
|
-
except Exception
|
146
|
+
except Exception:
|
148
147
|
router.logger.exception("UDP router start exception")
|
149
|
-
exception_q.put(('Hub', str(e)))
|
@@ -3,16 +3,27 @@ from __future__ import annotations
|
|
3
3
|
import logging
|
4
4
|
import multiprocessing.queues as mpq
|
5
5
|
import os
|
6
|
+
import queue
|
6
7
|
import time
|
7
|
-
from multiprocessing.
|
8
|
+
from multiprocessing.context import SpawnProcess as SpawnProcessType
|
9
|
+
from multiprocessing.queues import Queue as QueueType
|
10
|
+
from multiprocessing.synchronize import Event as EventType
|
8
11
|
from typing import Tuple
|
9
12
|
|
10
13
|
import typeguard
|
11
14
|
import zmq
|
12
15
|
|
16
|
+
from parsl.addresses import tcp_url
|
13
17
|
from parsl.log_utils import set_file_logger
|
18
|
+
from parsl.monitoring.errors import MonitoringRouterStartError
|
14
19
|
from parsl.monitoring.radios.multiprocessing import MultiprocessingQueueRadioSender
|
15
20
|
from parsl.monitoring.types import TaggedMonitoringMessage
|
21
|
+
from parsl.multiprocessing import (
|
22
|
+
SizedQueue,
|
23
|
+
SpawnEvent,
|
24
|
+
SpawnProcess,
|
25
|
+
join_terminate_close_proc,
|
26
|
+
)
|
16
27
|
from parsl.process_loggers import wrap_with_logs
|
17
28
|
from parsl.utils import setproctitle
|
18
29
|
|
@@ -23,21 +34,21 @@ class MonitoringRouter:
|
|
23
34
|
|
24
35
|
def __init__(self,
|
25
36
|
*,
|
26
|
-
|
27
|
-
|
37
|
+
address: str,
|
38
|
+
port_range: Tuple[int, int] = (55050, 56000),
|
28
39
|
|
29
40
|
run_dir: str = ".",
|
30
41
|
logging_level: int = logging.INFO,
|
31
42
|
resource_msgs: mpq.Queue,
|
32
|
-
exit_event:
|
43
|
+
exit_event: EventType,
|
33
44
|
):
|
34
45
|
""" Initializes a monitoring configuration class.
|
35
46
|
|
36
47
|
Parameters
|
37
48
|
----------
|
38
|
-
|
49
|
+
address : str
|
39
50
|
The ip address at which the workers will be able to reach the Hub.
|
40
|
-
|
51
|
+
port_range : tuple(int, int)
|
41
52
|
The MonitoringHub picks ports at random from the range which will be used by Hub.
|
42
53
|
Default: (55050, 56000)
|
43
54
|
run_dir : str
|
@@ -51,11 +62,11 @@ class MonitoringRouter:
|
|
51
62
|
"""
|
52
63
|
os.makedirs(run_dir, exist_ok=True)
|
53
64
|
self.logger = set_file_logger(f"{run_dir}/monitoring_zmq_router.log",
|
54
|
-
name="
|
65
|
+
name="zmq_monitoring_router",
|
55
66
|
level=logging_level)
|
56
67
|
self.logger.debug("Monitoring router starting")
|
57
68
|
|
58
|
-
self.
|
69
|
+
self.address = address
|
59
70
|
|
60
71
|
self.loop_freq = 10.0 # milliseconds
|
61
72
|
|
@@ -64,15 +75,15 @@ class MonitoringRouter:
|
|
64
75
|
self.zmq_receiver_channel.setsockopt(zmq.LINGER, 0)
|
65
76
|
self.zmq_receiver_channel.set_hwm(0)
|
66
77
|
self.zmq_receiver_channel.RCVTIMEO = int(self.loop_freq) # in milliseconds
|
67
|
-
self.logger.debug("
|
68
|
-
self.zmq_receiver_port = self.zmq_receiver_channel.bind_to_random_port(
|
69
|
-
min_port=
|
70
|
-
max_port=
|
78
|
+
self.logger.debug("address: {}. port_range {}".format(address, port_range))
|
79
|
+
self.zmq_receiver_port = self.zmq_receiver_channel.bind_to_random_port(tcp_url(address),
|
80
|
+
min_port=port_range[0],
|
81
|
+
max_port=port_range[1])
|
71
82
|
|
72
83
|
self.target_radio = MultiprocessingQueueRadioSender(resource_msgs)
|
73
84
|
self.exit_event = exit_event
|
74
85
|
|
75
|
-
@wrap_with_logs(target="
|
86
|
+
@wrap_with_logs(target="zmq_monitoring_router")
|
76
87
|
def start(self) -> None:
|
77
88
|
self.logger.info("Starting ZMQ listener")
|
78
89
|
try:
|
@@ -107,19 +118,18 @@ class MonitoringRouter:
|
|
107
118
|
@typeguard.typechecked
|
108
119
|
def zmq_router_starter(*,
|
109
120
|
comm_q: mpq.Queue,
|
110
|
-
exception_q: mpq.Queue,
|
111
121
|
resource_msgs: mpq.Queue,
|
112
|
-
exit_event:
|
122
|
+
exit_event: EventType,
|
113
123
|
|
114
|
-
|
115
|
-
|
124
|
+
address: str,
|
125
|
+
port_range: Tuple[int, int],
|
116
126
|
|
117
127
|
run_dir: str,
|
118
128
|
logging_level: int) -> None:
|
119
129
|
setproctitle("parsl: monitoring zmq router")
|
120
130
|
try:
|
121
|
-
router = MonitoringRouter(
|
122
|
-
|
131
|
+
router = MonitoringRouter(address=address,
|
132
|
+
port_range=port_range,
|
123
133
|
run_dir=run_dir,
|
124
134
|
logging_level=logging_level,
|
125
135
|
resource_msgs=resource_msgs,
|
@@ -129,10 +139,55 @@ def zmq_router_starter(*,
|
|
129
139
|
comm_q.put(f"Monitoring router construction failed: {e}")
|
130
140
|
else:
|
131
141
|
comm_q.put(router.zmq_receiver_port)
|
142
|
+
router.start()
|
132
143
|
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
144
|
+
|
145
|
+
class ZMQRadioReceiver():
|
146
|
+
def __init__(self, *, process: SpawnProcessType, exit_event: EventType, port: int) -> None:
|
147
|
+
self.process = process
|
148
|
+
self.exit_event = exit_event
|
149
|
+
self.port = port
|
150
|
+
|
151
|
+
def close(self) -> None:
|
152
|
+
self.exit_event.set()
|
153
|
+
join_terminate_close_proc(self.process)
|
154
|
+
|
155
|
+
|
156
|
+
def start_zmq_receiver(*,
|
157
|
+
monitoring_messages: QueueType,
|
158
|
+
loopback_address: str,
|
159
|
+
port_range: Tuple[int, int],
|
160
|
+
logdir: str,
|
161
|
+
worker_debug: bool) -> ZMQRadioReceiver:
|
162
|
+
comm_q = SizedQueue(maxsize=10)
|
163
|
+
|
164
|
+
router_exit_event = SpawnEvent()
|
165
|
+
|
166
|
+
router_proc = SpawnProcess(target=zmq_router_starter,
|
167
|
+
kwargs={"comm_q": comm_q,
|
168
|
+
"resource_msgs": monitoring_messages,
|
169
|
+
"exit_event": router_exit_event,
|
170
|
+
"address": loopback_address,
|
171
|
+
"port_range": port_range,
|
172
|
+
"run_dir": logdir,
|
173
|
+
"logging_level": logging.DEBUG if worker_debug else logging.INFO,
|
174
|
+
},
|
175
|
+
name="Monitoring-ZMQ-Router-Process",
|
176
|
+
daemon=True,
|
177
|
+
)
|
178
|
+
router_proc.start()
|
179
|
+
|
180
|
+
try:
|
181
|
+
logger.debug("Waiting for router process to report port")
|
182
|
+
comm_q_result = comm_q.get(block=True, timeout=120)
|
183
|
+
comm_q.close()
|
184
|
+
comm_q.join_thread()
|
185
|
+
except queue.Empty:
|
186
|
+
logger.error("Monitoring ZMQ Router has not reported port in 120s")
|
187
|
+
raise MonitoringRouterStartError()
|
188
|
+
|
189
|
+
if isinstance(comm_q_result, str):
|
190
|
+
logger.error("MonitoringRouter sent an error message: %s", comm_q_result)
|
191
|
+
raise RuntimeError(f"MonitoringRouter failed to start: {comm_q_result}")
|
192
|
+
|
193
|
+
return ZMQRadioReceiver(process=router_proc, exit_event=router_exit_event, port=comm_q_result)
|
parsl/multiprocessing.py
CHANGED
@@ -6,6 +6,7 @@ import multiprocessing
|
|
6
6
|
import multiprocessing.queues
|
7
7
|
import platform
|
8
8
|
from multiprocessing.context import ForkProcess as ForkProcessType
|
9
|
+
from multiprocessing.context import SpawnProcess as SpawnProcessType
|
9
10
|
from typing import Callable
|
10
11
|
|
11
12
|
logger = logging.getLogger(__name__)
|
@@ -14,6 +15,10 @@ ForkContext = multiprocessing.get_context("fork")
|
|
14
15
|
SpawnContext = multiprocessing.get_context("spawn")
|
15
16
|
|
16
17
|
ForkProcess: Callable[..., ForkProcessType] = ForkContext.Process
|
18
|
+
SpawnProcess: Callable[..., SpawnProcessType] = SpawnContext.Process
|
19
|
+
|
20
|
+
SpawnEvent = SpawnContext.Event
|
21
|
+
SpawnQueue = SpawnContext.Queue
|
17
22
|
|
18
23
|
|
19
24
|
class MacSafeQueue(multiprocessing.queues.Queue):
|
@@ -26,7 +31,7 @@ class MacSafeQueue(multiprocessing.queues.Queue):
|
|
26
31
|
|
27
32
|
def __init__(self, *args, **kwargs):
|
28
33
|
if 'ctx' not in kwargs:
|
29
|
-
kwargs['ctx'] = multiprocessing.get_context()
|
34
|
+
kwargs['ctx'] = multiprocessing.get_context('spawn')
|
30
35
|
super().__init__(*args, **kwargs)
|
31
36
|
self._counter = multiprocessing.Value('i', 0)
|
32
37
|
|
@@ -59,6 +64,41 @@ SizedQueue: Callable[..., multiprocessing.Queue]
|
|
59
64
|
|
60
65
|
if platform.system() != 'Darwin':
|
61
66
|
import multiprocessing
|
62
|
-
SizedQueue =
|
67
|
+
SizedQueue = SpawnQueue
|
63
68
|
else:
|
64
69
|
SizedQueue = MacSafeQueue
|
70
|
+
|
71
|
+
|
72
|
+
def join_terminate_close_proc(process: SpawnProcessType, *, timeout: int = 30) -> None:
|
73
|
+
"""Increasingly aggressively terminate a process.
|
74
|
+
|
75
|
+
This function assumes that the process is likely to exit before
|
76
|
+
the join timeout, driven by some other means, such as the
|
77
|
+
MonitoringHub router_exit_event. If the process does not exit, then
|
78
|
+
first terminate() and then kill() will be used to end the process.
|
79
|
+
|
80
|
+
In the case of a very mis-behaving process, this function might take
|
81
|
+
up to 3*timeout to exhaust all termination methods and return.
|
82
|
+
"""
|
83
|
+
logger.debug("Joining process")
|
84
|
+
process.join(timeout)
|
85
|
+
|
86
|
+
# run a sequence of increasingly aggressive steps to shut down the process.
|
87
|
+
if process.is_alive():
|
88
|
+
logger.error("Process did not join. Terminating.")
|
89
|
+
process.terminate()
|
90
|
+
process.join(timeout)
|
91
|
+
if process.is_alive():
|
92
|
+
logger.error("Process did not join after terminate. Killing.")
|
93
|
+
process.kill()
|
94
|
+
process.join(timeout)
|
95
|
+
# This kill should not be caught by any signal handlers so it is
|
96
|
+
# unlikely that this join will timeout. If it does, there isn't
|
97
|
+
# anything further to do except log an error in the next if-block.
|
98
|
+
|
99
|
+
if process.is_alive():
|
100
|
+
logger.error("Process failed to end")
|
101
|
+
# don't call close if the process hasn't ended:
|
102
|
+
# process.close() doesn't work on a running process.
|
103
|
+
else:
|
104
|
+
process.close()
|
@@ -0,0 +1,54 @@
|
|
1
|
+
import multiprocessing
|
2
|
+
import signal
|
3
|
+
|
4
|
+
import psutil
|
5
|
+
import pytest
|
6
|
+
|
7
|
+
from parsl.multiprocessing import SpawnEvent, SpawnProcess, join_terminate_close_proc
|
8
|
+
|
9
|
+
|
10
|
+
def noop():
|
11
|
+
pass
|
12
|
+
|
13
|
+
|
14
|
+
@pytest.mark.local
|
15
|
+
def test_end_process_already_exited():
|
16
|
+
p = SpawnProcess(target=noop)
|
17
|
+
p.start()
|
18
|
+
p.join()
|
19
|
+
join_terminate_close_proc(p)
|
20
|
+
|
21
|
+
|
22
|
+
def hang():
|
23
|
+
while True:
|
24
|
+
pass
|
25
|
+
|
26
|
+
|
27
|
+
@pytest.mark.local
|
28
|
+
def test_end_hung_process():
|
29
|
+
"""Test calling against a process that will not exit itself."""
|
30
|
+
p = SpawnProcess(target=hang)
|
31
|
+
p.start()
|
32
|
+
pid = p.pid
|
33
|
+
join_terminate_close_proc(p, timeout=1)
|
34
|
+
assert not psutil.pid_exists(pid), "process should not exist any more"
|
35
|
+
|
36
|
+
|
37
|
+
def hang_no_sigint(e):
|
38
|
+
def s(*args, **kwargs):
|
39
|
+
e.set()
|
40
|
+
signal.signal(signal.SIGTERM, s)
|
41
|
+
while True:
|
42
|
+
pass
|
43
|
+
|
44
|
+
|
45
|
+
@pytest.mark.local
|
46
|
+
def test_end_hung_process_no_sigint():
|
47
|
+
"""Test calling against a process that will not exit itself."""
|
48
|
+
e = SpawnEvent()
|
49
|
+
p = SpawnProcess(target=hang_no_sigint, args=(e,))
|
50
|
+
p.start()
|
51
|
+
pid = p.pid
|
52
|
+
join_terminate_close_proc(p, timeout=2)
|
53
|
+
assert not psutil.pid_exists(pid), "process should not exist any more"
|
54
|
+
assert e.is_set(), "hung process should have set event on signal"
|
@@ -45,7 +45,7 @@ def test_row_counts():
|
|
45
45
|
|
46
46
|
# dig out the interchange port...
|
47
47
|
hub_address = parsl.dfk().monitoring.hub_address
|
48
|
-
hub_zmq_port = parsl.dfk().
|
48
|
+
hub_zmq_port = parsl.dfk().executors["htex_Local"].hub_zmq_port
|
49
49
|
|
50
50
|
# this will send a string to a new socket connection
|
51
51
|
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
|
@@ -0,0 +1,27 @@
|
|
1
|
+
import pytest
|
2
|
+
|
3
|
+
from parsl.monitoring.radios.zmq import ZMQRadioSender
|
4
|
+
from parsl.monitoring.radios.zmq_router import start_zmq_receiver
|
5
|
+
from parsl.multiprocessing import SpawnQueue
|
6
|
+
|
7
|
+
|
8
|
+
@pytest.mark.local
|
9
|
+
def test_send_recv_message(tmpd_cwd, try_assert):
|
10
|
+
q = SpawnQueue()
|
11
|
+
loopback = "127.0.0.1"
|
12
|
+
r = start_zmq_receiver(monitoring_messages=q,
|
13
|
+
loopback_address=loopback,
|
14
|
+
port_range=(49152, 65535),
|
15
|
+
logdir=str(tmpd_cwd),
|
16
|
+
worker_debug=False)
|
17
|
+
|
18
|
+
s = ZMQRadioSender(loopback, r.port)
|
19
|
+
|
20
|
+
test_msg = ("test", {})
|
21
|
+
s.send(test_msg)
|
22
|
+
|
23
|
+
assert q.get() == test_msg
|
24
|
+
|
25
|
+
assert r.process.is_alive()
|
26
|
+
r.exit_event.set()
|
27
|
+
try_assert(lambda: not r.process.is_alive())
|
@@ -102,6 +102,9 @@ def test_stdstream_to_monitoring(stdx, expected_stdx, stream, tmpd_cwd, caplog):
|
|
102
102
|
kwargs = {stream: stdx}
|
103
103
|
stdapp(**kwargs).result()
|
104
104
|
|
105
|
+
for record in caplog.records:
|
106
|
+
assert record.levelno < logging.ERROR
|
107
|
+
|
105
108
|
engine = sqlalchemy.create_engine(c.monitoring.logging_endpoint)
|
106
109
|
with engine.begin() as connection:
|
107
110
|
|
@@ -30,7 +30,7 @@ def test_no_kills():
|
|
30
30
|
|
31
31
|
@pytest.mark.local
|
32
32
|
@pytest.mark.parametrize("sig", [signal.SIGINT, signal.SIGTERM, signal.SIGKILL, signal.SIGQUIT])
|
33
|
-
@pytest.mark.parametrize("process_attr", ["
|
33
|
+
@pytest.mark.parametrize("process_attr", ["udp_router_proc", "dbm_proc", "filesystem_proc"])
|
34
34
|
def test_kill_monitoring_helper_process(sig, process_attr, try_assert):
|
35
35
|
"""This tests that we can kill a monitoring process and still have successful shutdown.
|
36
36
|
SIGINT emulates some racy behaviour when ctrl-C is pressed: that
|