parsl 2025.3.17__py3-none-any.whl → 2025.3.31__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- parsl/dataflow/dflow.py +18 -15
- parsl/executors/base.py +13 -37
- parsl/executors/flux/executor.py +1 -0
- parsl/executors/globus_compute.py +13 -2
- parsl/executors/high_throughput/executor.py +18 -0
- parsl/executors/high_throughput/interchange.py +26 -36
- parsl/executors/radical/executor.py +1 -0
- parsl/executors/status_handling.py +20 -12
- parsl/executors/taskvine/executor.py +13 -11
- parsl/executors/workqueue/executor.py +9 -7
- parsl/monitoring/errors.py +5 -0
- parsl/monitoring/monitoring.py +55 -122
- parsl/monitoring/radios/zmq_router.py +80 -18
- parsl/multiprocessing.py +42 -2
- parsl/tests/test_monitoring/test_basic.py +1 -1
- parsl/tests/test_monitoring/test_exit_helper.py +6 -7
- parsl/tests/test_monitoring/test_fuzz_zmq.py +1 -1
- parsl/tests/test_monitoring/test_radio_zmq.py +27 -0
- parsl/tests/test_monitoring/test_stdouterr.py +3 -0
- parsl/tests/test_shutdown/test_kill_monitoring.py +1 -1
- parsl/usage_tracking/usage.py +2 -2
- parsl/version.py +1 -1
- {parsl-2025.3.17.data → parsl-2025.3.31.data}/scripts/interchange.py +26 -36
- {parsl-2025.3.17.dist-info → parsl-2025.3.31.dist-info}/METADATA +2 -2
- {parsl-2025.3.17.dist-info → parsl-2025.3.31.dist-info}/RECORD +32 -31
- {parsl-2025.3.17.data → parsl-2025.3.31.data}/scripts/exec_parsl_function.py +0 -0
- {parsl-2025.3.17.data → parsl-2025.3.31.data}/scripts/parsl_coprocess.py +0 -0
- {parsl-2025.3.17.data → parsl-2025.3.31.data}/scripts/process_worker_pool.py +0 -0
- {parsl-2025.3.17.dist-info → parsl-2025.3.31.dist-info}/LICENSE +0 -0
- {parsl-2025.3.17.dist-info → parsl-2025.3.31.dist-info}/WHEEL +0 -0
- {parsl-2025.3.17.dist-info → parsl-2025.3.31.dist-info}/entry_points.txt +0 -0
- {parsl-2025.3.17.dist-info → parsl-2025.3.31.dist-info}/top_level.txt +0 -0
parsl/monitoring/monitoring.py
CHANGED
(Note: an ellipsis "…" marks text that the diff rendering truncated.)
@@ -4,20 +4,22 @@ import logging
 import multiprocessing.synchronize as ms
 import os
 import queue
-…
-from multiprocessing.context import ForkProcess as ForkProcessType
+import warnings
 from multiprocessing.queues import Queue
-from typing import TYPE_CHECKING, …
+from typing import TYPE_CHECKING, Any, Optional, Union
 
 import typeguard
 
 from parsl.monitoring.errors import MonitoringHubStartError
 from parsl.monitoring.radios.filesystem_router import filesystem_router_starter
-from parsl.monitoring.radios.multiprocessing import MultiprocessingQueueRadioSender
 from parsl.monitoring.radios.udp_router import udp_router_starter
-from parsl.monitoring.radios.zmq_router import zmq_router_starter
 from parsl.monitoring.types import TaggedMonitoringMessage
-from parsl.multiprocessing import …
+from parsl.multiprocessing import (
+    SizedQueue,
+    SpawnEvent,
+    SpawnProcess,
+    join_terminate_close_proc,
+)
 from parsl.utils import RepresentationMixin
 
 _db_manager_excepts: Optional[Exception]
@@ -38,7 +40,7 @@ class MonitoringHub(RepresentationMixin):
     def __init__(self,
                  hub_address: str,
                  hub_port: Optional[int] = None,
-                 hub_port_range: …
+                 hub_port_range: Any = None,
 
                  workflow_name: Optional[str] = None,
                  workflow_version: Optional[str] = None,
@@ -57,12 +59,11 @@
             Note that despite the similar name, this is not related to
             hub_port_range.
             Default: None
-        hub_port_range : …
-             …
-             Default: (55050, 56000)
+        hub_port_range : unused
+             Unused, but retained until 2025-09-14 to avoid configuration errors.
+             This value previously configured one ZMQ channel inside the
+             HighThroughputExecutor. That ZMQ channel is now configured by the
+             interchange_port_range parameter of HighThroughputExecutor.
        workflow_name : str
             The name for the workflow. Default to the name of the parsl script
        workflow_version : str
@@ -89,6 +90,13 @@
 
         self.hub_address = hub_address
         self.hub_port = hub_port
+
+        if hub_port_range is not None:
+            message = "Instead of MonitoringHub.hub_port_range, Use HighThroughputExecutor.interchange_port_range"
+            warnings.warn(message, DeprecationWarning)
+            logger.warning(message)
+        # This is used by RepresentationMixin so needs to exist as an attribute
+        # even though now it is otherwise unused.
         self.hub_port_range = hub_port_range
 
         self.logging_endpoint = logging_endpoint
@@ -120,90 +128,57 @@
         # in the future, Queue will allow runtime subscripts.
 
         if TYPE_CHECKING:
-            zmq_comm_q: Queue[Union[int, str]]
             udp_comm_q: Queue[Union[int, str]]
         else:
-            zmq_comm_q: Queue
             udp_comm_q: Queue
 
-        zmq_comm_q = SizedQueue(maxsize=10)
         udp_comm_q = SizedQueue(maxsize=10)
 
         self.resource_msgs: Queue[TaggedMonitoringMessage]
         self.resource_msgs = SizedQueue()
 
         self.router_exit_event: ms.Event
-        self.router_exit_event = …
-        …
-        self.…
-        …
-        self.zmq_router_proc.start()
-
-        self.udp_router_proc = ForkProcess(target=udp_router_starter,
-                                           kwargs={"comm_q": udp_comm_q,
-                                                   "resource_msgs": self.resource_msgs,
-                                                   "exit_event": self.router_exit_event,
-                                                   "hub_address": self.hub_address,
-                                                   "udp_port": self.hub_port,
-                                                   "run_dir": dfk_run_dir,
-                                                   "logging_level": logging.DEBUG if self.monitoring_debug else logging.INFO,
-                                                   },
-                                           name="Monitoring-UDP-Router-Process",
-                                           daemon=True,
-                                           )
+        self.router_exit_event = SpawnEvent()
+
+        self.udp_router_proc = SpawnProcess(target=udp_router_starter,
+                                            kwargs={"comm_q": udp_comm_q,
+                                                    "resource_msgs": self.resource_msgs,
+                                                    "exit_event": self.router_exit_event,
+                                                    "hub_address": self.hub_address,
+                                                    "udp_port": self.hub_port,
+                                                    "run_dir": dfk_run_dir,
+                                                    "logging_level": logging.DEBUG if self.monitoring_debug else logging.INFO,
+                                                    },
+                                            name="Monitoring-UDP-Router-Process",
+                                            daemon=True,
+                                            )
         self.udp_router_proc.start()
 
         self.dbm_exit_event: ms.Event
-        self.dbm_exit_event = …
-
-        self.dbm_proc = …
-        …
+        self.dbm_exit_event = SpawnEvent()
+
+        self.dbm_proc = SpawnProcess(target=dbm_starter,
+                                     args=(self.resource_msgs,),
+                                     kwargs={"run_dir": dfk_run_dir,
+                                             "logging_level": logging.DEBUG if self.monitoring_debug else logging.INFO,
+                                             "db_url": self.logging_endpoint,
+                                             "exit_event": self.dbm_exit_event,
+                                             },
+                                     name="Monitoring-DBM-Process",
+                                     daemon=True,
+                                     )
         self.dbm_proc.start()
-        logger.info("Started …
-        self.…
-
-        self.filesystem_proc = …
-        …
+        logger.info("Started UDP router process %s and DBM process %s",
+                    self.udp_router_proc.pid, self.dbm_proc.pid)
+
+        self.filesystem_proc = SpawnProcess(target=filesystem_router_starter,
+                                            args=(self.resource_msgs, dfk_run_dir, self.router_exit_event),
+                                            name="Monitoring-Filesystem-Process",
+                                            daemon=True
+                                            )
         self.filesystem_proc.start()
         logger.info("Started filesystem radio receiver process %s", self.filesystem_proc.pid)
 
-        self.radio = MultiprocessingQueueRadioSender(self.resource_msgs)
-
-        try:
-            zmq_comm_q_result = zmq_comm_q.get(block=True, timeout=120)
-            zmq_comm_q.close()
-            zmq_comm_q.join_thread()
-        except queue.Empty:
-            logger.error("Monitoring ZMQ Router has not reported port in 120s. Aborting")
-            raise MonitoringHubStartError()
-
-        if isinstance(zmq_comm_q_result, str):
-            logger.error("MonitoringRouter sent an error message: %s", zmq_comm_q_result)
-            raise RuntimeError(f"MonitoringRouter failed to start: {zmq_comm_q_result}")
-
-        self.hub_zmq_port = zmq_comm_q_result
-
         try:
             udp_comm_q_result = udp_comm_q.get(block=True, timeout=120)
             udp_comm_q.close()
@@ -221,10 +196,6 @@
 
         logger.info("Monitoring Hub initialized")
 
-    def send(self, message: TaggedMonitoringMessage) -> None:
-        logger.debug("Sending message type %s", message[0])
-        self.radio.send(message)
-
     def close(self) -> None:
         logger.info("Terminating Monitoring Hub")
         if self.monitoring_hub_active:
@@ -232,9 +203,6 @@
             logger.info("Setting router termination event")
             self.router_exit_event.set()
 
-            logger.info("Waiting for ZMQ router to terminate")
-            join_terminate_close_proc(self.zmq_router_proc)
-
             logger.info("Waiting for UDP router to terminate")
             join_terminate_close_proc(self.udp_router_proc)
 
@@ -251,38 +219,3 @@
         self.resource_msgs.close()
         self.resource_msgs.join_thread()
         logger.info("Closed monitoring multiprocessing queues")
-
-
-def join_terminate_close_proc(process: ForkProcessType, *, timeout: int = 30) -> None:
-    """Increasingly aggressively terminate a process.
-
-    This function assumes that the process is likely to exit before
-    the join timeout, driven by some other means, such as the
-    MonitoringHub router_exit_event. If the process does not exit, then
-    first terminate() and then kill() will be used to end the process.
-
-    In the case of a very mis-behaving process, this function might take
-    up to 3*timeout to exhaust all termination methods and return.
-    """
-    logger.debug("Joining process")
-    process.join(timeout)
-
-    # run a sequence of increasingly aggressive steps to shut down the process.
-    if process.is_alive():
-        logger.error("Process did not join. Terminating.")
-        process.terminate()
-        process.join(timeout)
-        if process.is_alive():
-            logger.error("Process did not join after terminate. Killing.")
-            process.kill()
-            process.join(timeout)
-            # This kill should not be caught by any signal handlers so it is
-            # unlikely that this join will timeout. If it does, there isn't
-            # anything further to do except log an error in the next if-block.
-
-    if process.is_alive():
-        logger.error("Process failed to end")
-        # don't call close if the process hasn't ended:
-        # process.close() doesn't work on a running process.
-    else:
-        process.close()
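Migration note: MonitoringHub.hub_port_range is now accepted but ignored, and the replacement setting lives on the executor. A minimal configuration sketch under that reading; the executor arguments shown here are illustrative, not taken from this diff:

from parsl.config import Config
from parsl.executors import HighThroughputExecutor
from parsl.monitoring import MonitoringHub

config = Config(
    executors=[
        # interchange_port_range now owns the port range that
        # MonitoringHub.hub_port_range used to configure
        HighThroughputExecutor(interchange_port_range=(55050, 56000)),
    ],
    monitoring=MonitoringHub(hub_address="127.0.0.1"),  # no hub_port_range
)

Passing hub_port_range still works until 2025-09-14 but now emits the DeprecationWarning added above.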
parsl/monitoring/radios/zmq_router.py
CHANGED
@@ -3,16 +3,27 @@ from __future__ import annotations
 import logging
 import multiprocessing.queues as mpq
 import os
+import queue
 import time
-from multiprocessing.…
+from multiprocessing.context import SpawnProcess as SpawnProcessType
+from multiprocessing.queues import Queue as QueueType
+from multiprocessing.synchronize import Event as EventType
 from typing import Tuple
 
 import typeguard
 import zmq
 
+from parsl.addresses import tcp_url
 from parsl.log_utils import set_file_logger
+from parsl.monitoring.errors import MonitoringRouterStartError
 from parsl.monitoring.radios.multiprocessing import MultiprocessingQueueRadioSender
 from parsl.monitoring.types import TaggedMonitoringMessage
+from parsl.multiprocessing import (
+    SizedQueue,
+    SpawnEvent,
+    SpawnProcess,
+    join_terminate_close_proc,
+)
 from parsl.process_loggers import wrap_with_logs
 from parsl.utils import setproctitle
 
@@ -23,21 +34,21 @@ class MonitoringRouter:
 
     def __init__(self,
                  *,
-                 …
-                 …
+                 address: str,
+                 port_range: Tuple[int, int] = (55050, 56000),
 
                  run_dir: str = ".",
                  logging_level: int = logging.INFO,
                  resource_msgs: mpq.Queue,
-                 exit_event: …
+                 exit_event: EventType,
                  ):
         """ Initializes a monitoring configuration class.
 
         Parameters
         ----------
-        …
+        address : str
             The ip address at which the workers will be able to reach the Hub.
-        …
+        port_range : tuple(int, int)
             The MonitoringHub picks ports at random from the range which will be used by Hub.
             Default: (55050, 56000)
        run_dir : str
@@ -51,11 +62,11 @@
        """
        os.makedirs(run_dir, exist_ok=True)
        self.logger = set_file_logger(f"{run_dir}/monitoring_zmq_router.log",
-                                     name="…
+                                     name="zmq_monitoring_router",
                                      level=logging_level)
        self.logger.debug("Monitoring router starting")
 
-       self.…
+       self.address = address
 
        self.loop_freq = 10.0  # milliseconds
 
@@ -64,15 +75,15 @@
        self.zmq_receiver_channel.setsockopt(zmq.LINGER, 0)
        self.zmq_receiver_channel.set_hwm(0)
        self.zmq_receiver_channel.RCVTIMEO = int(self.loop_freq)  # in milliseconds
-       self.logger.debug("…
-       self.zmq_receiver_port = self.zmq_receiver_channel.bind_to_random_port(…
-                                                                              min_port=…
-                                                                              max_port=…
+       self.logger.debug("address: {}. port_range {}".format(address, port_range))
+       self.zmq_receiver_port = self.zmq_receiver_channel.bind_to_random_port(tcp_url(address),
+                                                                              min_port=port_range[0],
+                                                                              max_port=port_range[1])
 
        self.target_radio = MultiprocessingQueueRadioSender(resource_msgs)
        self.exit_event = exit_event
 
-   @wrap_with_logs(target="…
+   @wrap_with_logs(target="zmq_monitoring_router")
    def start(self) -> None:
        self.logger.info("Starting ZMQ listener")
        try:
@@ -108,17 +119,17 @@
 def zmq_router_starter(*,
                        comm_q: mpq.Queue,
                        resource_msgs: mpq.Queue,
-                       exit_event: …
+                       exit_event: EventType,
 
-                       …
-                       …
+                       address: str,
+                       port_range: Tuple[int, int],
 
                        run_dir: str,
                        logging_level: int) -> None:
     setproctitle("parsl: monitoring zmq router")
     try:
-        router = MonitoringRouter(…
-                                  …
+        router = MonitoringRouter(address=address,
+                                  port_range=port_range,
                                   run_dir=run_dir,
                                   logging_level=logging_level,
                                   resource_msgs=resource_msgs,
@@ -129,3 +140,54 @@ def zmq_router_starter(*,
     else:
         comm_q.put(router.zmq_receiver_port)
     router.start()
+
+
+class ZMQRadioReceiver():
+    def __init__(self, *, process: SpawnProcessType, exit_event: EventType, port: int) -> None:
+        self.process = process
+        self.exit_event = exit_event
+        self.port = port
+
+    def close(self) -> None:
+        self.exit_event.set()
+        join_terminate_close_proc(self.process)
+
+
+def start_zmq_receiver(*,
+                       monitoring_messages: QueueType,
+                       loopback_address: str,
+                       port_range: Tuple[int, int],
+                       logdir: str,
+                       worker_debug: bool) -> ZMQRadioReceiver:
+    comm_q = SizedQueue(maxsize=10)
+
+    router_exit_event = SpawnEvent()
+
+    router_proc = SpawnProcess(target=zmq_router_starter,
+                               kwargs={"comm_q": comm_q,
+                                       "resource_msgs": monitoring_messages,
+                                       "exit_event": router_exit_event,
+                                       "address": loopback_address,
+                                       "port_range": port_range,
+                                       "run_dir": logdir,
+                                       "logging_level": logging.DEBUG if worker_debug else logging.INFO,
+                                       },
+                               name="Monitoring-ZMQ-Router-Process",
+                               daemon=True,
+                               )
+    router_proc.start()
+
+    try:
+        logger.debug("Waiting for router process to report port")
+        comm_q_result = comm_q.get(block=True, timeout=120)
+        comm_q.close()
+        comm_q.join_thread()
+    except queue.Empty:
+        logger.error("Monitoring ZMQ Router has not reported port in 120s")
+        raise MonitoringRouterStartError()
+
+    if isinstance(comm_q_result, str):
+        logger.error("MonitoringRouter sent an error message: %s", comm_q_result)
+        raise RuntimeError(f"MonitoringRouter failed to start: {comm_q_result}")
+
+    return ZMQRadioReceiver(process=router_proc, exit_event=router_exit_event, port=comm_q_result)
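The new start_zmq_receiver/ZMQRadioReceiver pair gives a caller (such as the HighThroughputExecutor, also changed in this release) a handle on the router process, its exit event, and the bound port. A minimal lifecycle sketch with illustrative values; test_radio_zmq.py later in this diff exercises the same path:

from parsl.monitoring.radios.zmq_router import start_zmq_receiver
from parsl.multiprocessing import SpawnQueue

msgs = SpawnQueue()
receiver = start_zmq_receiver(monitoring_messages=msgs,
                              loopback_address="127.0.0.1",
                              port_range=(55050, 56000),
                              logdir=".",
                              worker_debug=False)
print(receiver.port)  # port picked by bind_to_random_port, reported over comm_q
receiver.close()      # sets the exit event, then join_terminate_close_proc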
parsl/multiprocessing.py
CHANGED
@@ -6,6 +6,7 @@ import multiprocessing
 import multiprocessing.queues
 import platform
 from multiprocessing.context import ForkProcess as ForkProcessType
+from multiprocessing.context import SpawnProcess as SpawnProcessType
 from typing import Callable
 
 logger = logging.getLogger(__name__)
@@ -14,6 +15,10 @@ ForkContext = multiprocessing.get_context("fork")
 SpawnContext = multiprocessing.get_context("spawn")
 
 ForkProcess: Callable[..., ForkProcessType] = ForkContext.Process
+SpawnProcess: Callable[..., SpawnProcessType] = SpawnContext.Process
+
+SpawnEvent = SpawnContext.Event
+SpawnQueue = SpawnContext.Queue
 
 
 class MacSafeQueue(multiprocessing.queues.Queue):
@@ -26,7 +31,7 @@ class MacSafeQueue(multiprocessing.queues.Queue):
 
     def __init__(self, *args, **kwargs):
         if 'ctx' not in kwargs:
-            kwargs['ctx'] = multiprocessing.get_context()
+            kwargs['ctx'] = multiprocessing.get_context('spawn')
         super().__init__(*args, **kwargs)
         self._counter = multiprocessing.Value('i', 0)
 
@@ -59,6 +64,41 @@ SizedQueue: Callable[..., multiprocessing.Queue]
 
 if platform.system() != 'Darwin':
     import multiprocessing
-    SizedQueue = …
+    SizedQueue = SpawnQueue
 else:
     SizedQueue = MacSafeQueue
+
+
+def join_terminate_close_proc(process: SpawnProcessType, *, timeout: int = 30) -> None:
+    """Increasingly aggressively terminate a process.
+
+    This function assumes that the process is likely to exit before
+    the join timeout, driven by some other means, such as the
+    MonitoringHub router_exit_event. If the process does not exit, then
+    first terminate() and then kill() will be used to end the process.
+
+    In the case of a very mis-behaving process, this function might take
+    up to 3*timeout to exhaust all termination methods and return.
+    """
+    logger.debug("Joining process")
+    process.join(timeout)
+
+    # run a sequence of increasingly aggressive steps to shut down the process.
+    if process.is_alive():
+        logger.error("Process did not join. Terminating.")
+        process.terminate()
+        process.join(timeout)
+        if process.is_alive():
+            logger.error("Process did not join after terminate. Killing.")
+            process.kill()
+            process.join(timeout)
+            # This kill should not be caught by any signal handlers so it is
+            # unlikely that this join will timeout. If it does, there isn't
+            # anything further to do except log an error in the next if-block.
+
+    if process.is_alive():
+        logger.error("Process failed to end")
+        # don't call close if the process hasn't ended:
+        # process.close() doesn't work on a running process.
+    else:
+        process.close()
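A minimal usage sketch for the relocated join_terminate_close_proc, mirroring the pattern in test_exit_helper.py below; slow_exit is a hypothetical target function, not part of parsl:

from parsl.multiprocessing import SpawnProcess, join_terminate_close_proc

def slow_exit() -> None:
    pass  # normally exits on its own, e.g. when an exit event is set

if __name__ == "__main__":  # required for spawn-started processes
    p = SpawnProcess(target=slow_exit, daemon=True)
    p.start()
    # join; escalate to terminate(), then kill(), if the join times out
    join_terminate_close_proc(p, timeout=30)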
parsl/tests/test_monitoring/test_basic.py
CHANGED
@@ -120,7 +120,7 @@ def test_row_counts(tmpd_cwd, fresh_config):
         # Two entries: one showing manager active, one inactive
         result = connection.execute(text("SELECT COUNT(*) FROM node"))
         (c, ) = result.first()
-        assert c == …
+        assert c == 4
 
         # There should be one block polling status
         # local provider has a status_polling_interval of 5s
parsl/tests/test_monitoring/test_exit_helper.py
CHANGED
@@ -4,8 +4,7 @@ import signal
 import psutil
 import pytest
 
-from parsl.…
-from parsl.multiprocessing import ForkProcess
+from parsl.multiprocessing import SpawnEvent, SpawnProcess, join_terminate_close_proc
 
 
 def noop():
@@ -14,7 +13,7 @@ def noop():
 
 @pytest.mark.local
 def test_end_process_already_exited():
-    p = …
+    p = SpawnProcess(target=noop)
     p.start()
     p.join()
     join_terminate_close_proc(p)
@@ -28,7 +27,7 @@ def hang():
 @pytest.mark.local
 def test_end_hung_process():
     """Test calling against a process that will not exit itself."""
-    p = …
+    p = SpawnProcess(target=hang)
     p.start()
     pid = p.pid
     join_terminate_close_proc(p, timeout=1)
@@ -46,10 +45,10 @@ def hang_no_sigint(e):
 @pytest.mark.local
 def test_end_hung_process_no_sigint():
     """Test calling against a process that will not exit itself."""
-    e = …
-    p = …
+    e = SpawnEvent()
+    p = SpawnProcess(target=hang_no_sigint, args=(e,))
     p.start()
     pid = p.pid
-    join_terminate_close_proc(p, timeout=…
+    join_terminate_close_proc(p, timeout=2)
     assert not psutil.pid_exists(pid), "process should not exist any more"
     assert e.is_set(), "hung process should have set event on signal"
parsl/tests/test_monitoring/test_fuzz_zmq.py
CHANGED
@@ -45,7 +45,7 @@ def test_row_counts():
 
     # dig out the interchange port...
     hub_address = parsl.dfk().monitoring.hub_address
-    hub_zmq_port = parsl.dfk().…
+    hub_zmq_port = parsl.dfk().executors["htex_Local"].hub_zmq_port
 
     # this will send a string to a new socket connection
     with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
parsl/tests/test_monitoring/test_radio_zmq.py
ADDED
@@ -0,0 +1,27 @@
+import pytest
+
+from parsl.monitoring.radios.zmq import ZMQRadioSender
+from parsl.monitoring.radios.zmq_router import start_zmq_receiver
+from parsl.multiprocessing import SpawnQueue
+
+
+@pytest.mark.local
+def test_send_recv_message(tmpd_cwd, try_assert):
+    q = SpawnQueue()
+    loopback = "127.0.0.1"
+    r = start_zmq_receiver(monitoring_messages=q,
+                           loopback_address=loopback,
+                           port_range=(49152, 65535),
+                           logdir=str(tmpd_cwd),
+                           worker_debug=False)
+
+    s = ZMQRadioSender(loopback, r.port)
+
+    test_msg = ("test", {})
+    s.send(test_msg)
+
+    assert q.get() == test_msg
+
+    assert r.process.is_alive()
+    r.exit_event.set()
+    try_assert(lambda: not r.process.is_alive())
parsl/tests/test_monitoring/test_stdouterr.py
CHANGED
@@ -102,6 +102,9 @@ def test_stdstream_to_monitoring(stdx, expected_stdx, stream, tmpd_cwd, caplog):
     kwargs = {stream: stdx}
     stdapp(**kwargs).result()
 
+    for record in caplog.records:
+        assert record.levelno < logging.ERROR
+
     engine = sqlalchemy.create_engine(c.monitoring.logging_endpoint)
     with engine.begin() as connection:
 
parsl/tests/test_shutdown/test_kill_monitoring.py
CHANGED
@@ -30,7 +30,7 @@ def test_no_kills():
 
 @pytest.mark.local
 @pytest.mark.parametrize("sig", [signal.SIGINT, signal.SIGTERM, signal.SIGKILL, signal.SIGQUIT])
-@pytest.mark.parametrize("process_attr", ["…
+@pytest.mark.parametrize("process_attr", ["udp_router_proc", "dbm_proc", "filesystem_proc"])
 def test_kill_monitoring_helper_process(sig, process_attr, try_assert):
     """This tests that we can kill a monitoring process and still have successful shutdown.
     SIGINT emulates some racy behaviour when ctrl-C is pressed: that
parsl/usage_tracking/usage.py
CHANGED
@@ -8,7 +8,7 @@ import uuid
 
 from parsl.dataflow.states import States
 from parsl.errors import ConfigurationError
-from parsl.multiprocessing import …
+from parsl.multiprocessing import SpawnProcess
 from parsl.usage_tracking.api import get_parsl_usage
 from parsl.usage_tracking.levels import DISABLED as USAGE_TRACKING_DISABLED
 from parsl.usage_tracking.levels import LEVEL_3 as USAGE_TRACKING_LEVEL_3
@@ -35,7 +35,7 @@ def async_process(fn: Callable[P, None]) -> Callable[P, None]:
     """ Decorator function to launch a function as a separate process """
 
     def run(*args, **kwargs):
-        proc = …
+        proc = SpawnProcess(target=fn, args=args, kwargs=kwargs, name="Usage-Tracking")
         proc.start()
         return proc
 
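For context, async_process wraps a function so each call runs in its own spawned process. A hypothetical sketch of a caller inside this module (report is not a real parsl function):

@async_process
def report(payload: dict) -> None:
    ...  # body executes in a separate SpawnProcess

proc = report({"site": "example"})  # run() starts and returns the process
proc.join()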
parsl/version.py
CHANGED