parsl 2025.3.10__py3-none-any.whl → 2025.3.24__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35)
  1. parsl/dataflow/dflow.py +1 -3
  2. parsl/executors/base.py +13 -37
  3. parsl/executors/flux/executor.py +1 -0
  4. parsl/executors/globus_compute.py +1 -1
  5. parsl/executors/high_throughput/executor.py +18 -0
  6. parsl/executors/high_throughput/mpi_resource_management.py +2 -0
  7. parsl/executors/high_throughput/process_worker_pool.py +89 -82
  8. parsl/executors/radical/executor.py +1 -0
  9. parsl/executors/status_handling.py +8 -0
  10. parsl/executors/taskvine/executor.py +1 -0
  11. parsl/executors/workqueue/executor.py +1 -0
  12. parsl/monitoring/db_manager.py +16 -10
  13. parsl/monitoring/errors.py +5 -0
  14. parsl/monitoring/monitoring.py +61 -117
  15. parsl/monitoring/radios/filesystem_router.py +4 -2
  16. parsl/monitoring/radios/udp_router.py +1 -3
  17. parsl/monitoring/radios/zmq_router.py +80 -25
  18. parsl/multiprocessing.py +42 -2
  19. parsl/tests/test_monitoring/test_exit_helper.py +54 -0
  20. parsl/tests/test_monitoring/test_fuzz_zmq.py +1 -1
  21. parsl/tests/test_monitoring/test_radio_zmq.py +27 -0
  22. parsl/tests/test_monitoring/test_stdouterr.py +3 -0
  23. parsl/tests/test_shutdown/test_kill_monitoring.py +1 -1
  24. parsl/usage_tracking/usage.py +2 -2
  25. parsl/version.py +1 -1
  26. {parsl-2025.3.10.data → parsl-2025.3.24.data}/scripts/process_worker_pool.py +89 -82
  27. {parsl-2025.3.10.dist-info → parsl-2025.3.24.dist-info}/METADATA +4 -4
  28. {parsl-2025.3.10.dist-info → parsl-2025.3.24.dist-info}/RECORD +35 -33
  29. {parsl-2025.3.10.data → parsl-2025.3.24.data}/scripts/exec_parsl_function.py +0 -0
  30. {parsl-2025.3.10.data → parsl-2025.3.24.data}/scripts/interchange.py +0 -0
  31. {parsl-2025.3.10.data → parsl-2025.3.24.data}/scripts/parsl_coprocess.py +0 -0
  32. {parsl-2025.3.10.dist-info → parsl-2025.3.24.dist-info}/LICENSE +0 -0
  33. {parsl-2025.3.10.dist-info → parsl-2025.3.24.dist-info}/WHEEL +0 -0
  34. {parsl-2025.3.10.dist-info → parsl-2025.3.24.dist-info}/entry_points.txt +0 -0
  35. {parsl-2025.3.10.dist-info → parsl-2025.3.24.dist-info}/top_level.txt +0 -0
parsl/monitoring/monitoring.py CHANGED
@@ -4,9 +4,9 @@ import logging
4
4
  import multiprocessing.synchronize as ms
5
5
  import os
6
6
  import queue
7
- from multiprocessing import Event
7
+ import warnings
8
8
  from multiprocessing.queues import Queue
9
- from typing import TYPE_CHECKING, Literal, Optional, Tuple, Union
9
+ from typing import TYPE_CHECKING, Any, Optional, Union
10
10
 
11
11
  import typeguard
12
12
 
@@ -14,9 +14,13 @@ from parsl.monitoring.errors import MonitoringHubStartError
14
14
  from parsl.monitoring.radios.filesystem_router import filesystem_router_starter
15
15
  from parsl.monitoring.radios.multiprocessing import MultiprocessingQueueRadioSender
16
16
  from parsl.monitoring.radios.udp_router import udp_router_starter
17
- from parsl.monitoring.radios.zmq_router import zmq_router_starter
18
17
  from parsl.monitoring.types import TaggedMonitoringMessage
19
- from parsl.multiprocessing import ForkProcess, SizedQueue
18
+ from parsl.multiprocessing import (
19
+ SizedQueue,
20
+ SpawnEvent,
21
+ SpawnProcess,
22
+ join_terminate_close_proc,
23
+ )
20
24
  from parsl.utils import RepresentationMixin
21
25
 
22
26
  _db_manager_excepts: Optional[Exception]
@@ -37,7 +41,7 @@ class MonitoringHub(RepresentationMixin):
37
41
  def __init__(self,
38
42
  hub_address: str,
39
43
  hub_port: Optional[int] = None,
40
- hub_port_range: Tuple[int, int] = (55050, 56000),
44
+ hub_port_range: Any = None,
41
45
 
42
46
  workflow_name: Optional[str] = None,
43
47
  workflow_version: Optional[str] = None,
@@ -56,12 +60,11 @@ class MonitoringHub(RepresentationMixin):
56
60
  Note that despite the similar name, this is not related to
57
61
  hub_port_range.
58
62
  Default: None
59
- hub_port_range : tuple(int, int)
60
- The port range for a ZMQ channel from an executor process
61
- (for example, the interchange in the High Throughput Executor)
62
- to deliver monitoring messages to the monitoring router.
63
- Note that despite the similar name, this is not related to hub_port.
64
- Default: (55050, 56000)
63
+ hub_port_range : unused
64
+ Unused, but retained until 2025-09-14 to avoid configuration errors.
65
+ This value previously configured one ZMQ channel inside the
66
+ HighThroughputExecutor. That ZMQ channel is now configured by the
67
+ interchange_port_range parameter of HighThroughputExecutor.
65
68
  workflow_name : str
66
69
  The name for the workflow. Default to the name of the parsl script
67
70
  workflow_version : str
@@ -88,6 +91,13 @@ class MonitoringHub(RepresentationMixin):
88
91
 
89
92
  self.hub_address = hub_address
90
93
  self.hub_port = hub_port
94
+
95
+ if hub_port_range is not None:
96
+ message = "Instead of MonitoringHub.hub_port_range, Use HighThroughputExecutor.interchange_port_range"
97
+ warnings.warn(message, DeprecationWarning)
98
+ logger.warning(message)
99
+ # This is used by RepresentationMixin so needs to exist as an attribute
100
+ # even though now it is otherwise unused.
91
101
  self.hub_port_range = hub_port_range
92
102
 
93
103
  self.logging_endpoint = logging_endpoint
@@ -119,91 +129,59 @@ class MonitoringHub(RepresentationMixin):
119
129
  # in the future, Queue will allow runtime subscripts.
120
130
 
121
131
  if TYPE_CHECKING:
122
- zmq_comm_q: Queue[Union[int, str]]
123
132
  udp_comm_q: Queue[Union[int, str]]
124
133
  else:
125
- zmq_comm_q: Queue
126
134
  udp_comm_q: Queue
127
135
 
128
- zmq_comm_q = SizedQueue(maxsize=10)
129
136
  udp_comm_q = SizedQueue(maxsize=10)
130
137
 
131
- self.exception_q: Queue[Tuple[str, str]]
132
- self.exception_q = SizedQueue(maxsize=10)
133
-
134
- self.resource_msgs: Queue[Union[TaggedMonitoringMessage, Literal["STOP"]]]
138
+ self.resource_msgs: Queue[TaggedMonitoringMessage]
135
139
  self.resource_msgs = SizedQueue()
136
140
 
137
141
  self.router_exit_event: ms.Event
138
- self.router_exit_event = Event()
139
-
140
- self.zmq_router_proc = ForkProcess(target=zmq_router_starter,
141
- kwargs={"comm_q": zmq_comm_q,
142
- "exception_q": self.exception_q,
143
- "resource_msgs": self.resource_msgs,
144
- "exit_event": self.router_exit_event,
145
- "hub_address": self.hub_address,
146
- "zmq_port_range": self.hub_port_range,
147
- "run_dir": dfk_run_dir,
148
- "logging_level": logging.DEBUG if self.monitoring_debug else logging.INFO,
149
- },
150
- name="Monitoring-ZMQ-Router-Process",
151
- daemon=True,
152
- )
153
- self.zmq_router_proc.start()
154
-
155
- self.udp_router_proc = ForkProcess(target=udp_router_starter,
156
- kwargs={"comm_q": udp_comm_q,
157
- "exception_q": self.exception_q,
158
- "resource_msgs": self.resource_msgs,
159
- "exit_event": self.router_exit_event,
160
- "hub_address": self.hub_address,
161
- "udp_port": self.hub_port,
162
- "run_dir": dfk_run_dir,
163
- "logging_level": logging.DEBUG if self.monitoring_debug else logging.INFO,
164
- },
165
- name="Monitoring-UDP-Router-Process",
166
- daemon=True,
167
- )
142
+ self.router_exit_event = SpawnEvent()
143
+
144
+ self.udp_router_proc = SpawnProcess(target=udp_router_starter,
145
+ kwargs={"comm_q": udp_comm_q,
146
+ "resource_msgs": self.resource_msgs,
147
+ "exit_event": self.router_exit_event,
148
+ "hub_address": self.hub_address,
149
+ "udp_port": self.hub_port,
150
+ "run_dir": dfk_run_dir,
151
+ "logging_level": logging.DEBUG if self.monitoring_debug else logging.INFO,
152
+ },
153
+ name="Monitoring-UDP-Router-Process",
154
+ daemon=True,
155
+ )
168
156
  self.udp_router_proc.start()
169
157
 
170
- self.dbm_proc = ForkProcess(target=dbm_starter,
171
- args=(self.exception_q, self.resource_msgs,),
172
- kwargs={"run_dir": dfk_run_dir,
173
- "logging_level": logging.DEBUG if self.monitoring_debug else logging.INFO,
174
- "db_url": self.logging_endpoint,
175
- },
176
- name="Monitoring-DBM-Process",
177
- daemon=True,
178
- )
158
+ self.dbm_exit_event: ms.Event
159
+ self.dbm_exit_event = SpawnEvent()
160
+
161
+ self.dbm_proc = SpawnProcess(target=dbm_starter,
162
+ args=(self.resource_msgs,),
163
+ kwargs={"run_dir": dfk_run_dir,
164
+ "logging_level": logging.DEBUG if self.monitoring_debug else logging.INFO,
165
+ "db_url": self.logging_endpoint,
166
+ "exit_event": self.dbm_exit_event,
167
+ },
168
+ name="Monitoring-DBM-Process",
169
+ daemon=True,
170
+ )
179
171
  self.dbm_proc.start()
180
- logger.info("Started ZMQ router process %s, UDP router process %s and DBM process %s",
181
- self.zmq_router_proc.pid, self.udp_router_proc.pid, self.dbm_proc.pid)
182
-
183
- self.filesystem_proc = ForkProcess(target=filesystem_router_starter,
184
- args=(self.resource_msgs, dfk_run_dir),
185
- name="Monitoring-Filesystem-Process",
186
- daemon=True
187
- )
172
+ logger.info("Started UDP router process %s and DBM process %s",
173
+ self.udp_router_proc.pid, self.dbm_proc.pid)
174
+
175
+ self.filesystem_proc = SpawnProcess(target=filesystem_router_starter,
176
+ args=(self.resource_msgs, dfk_run_dir, self.router_exit_event),
177
+ name="Monitoring-Filesystem-Process",
178
+ daemon=True
179
+ )
188
180
  self.filesystem_proc.start()
189
181
  logger.info("Started filesystem radio receiver process %s", self.filesystem_proc.pid)
190
182
 
191
183
  self.radio = MultiprocessingQueueRadioSender(self.resource_msgs)
192
184
 
193
- try:
194
- zmq_comm_q_result = zmq_comm_q.get(block=True, timeout=120)
195
- zmq_comm_q.close()
196
- zmq_comm_q.join_thread()
197
- except queue.Empty:
198
- logger.error("Monitoring ZMQ Router has not reported port in 120s. Aborting")
199
- raise MonitoringHubStartError()
200
-
201
- if isinstance(zmq_comm_q_result, str):
202
- logger.error("MonitoringRouter sent an error message: %s", zmq_comm_q_result)
203
- raise RuntimeError(f"MonitoringRouter failed to start: {zmq_comm_q_result}")
204
-
205
- self.hub_zmq_port = zmq_comm_q_result
206
-
207
185
  try:
208
186
  udp_comm_q_result = udp_comm_q.get(block=True, timeout=120)
209
187
  udp_comm_q.close()
@@ -227,58 +205,24 @@ class MonitoringHub(RepresentationMixin):
227
205
 
228
206
  def close(self) -> None:
229
207
  logger.info("Terminating Monitoring Hub")
230
- exception_msgs = []
231
- while True:
232
- try:
233
- exception_msgs.append(self.exception_q.get(block=False))
234
- logger.error("There was a queued exception (Either router or DBM process got exception much earlier?)")
235
- except queue.Empty:
236
- break
237
208
  if self.monitoring_hub_active:
238
209
  self.monitoring_hub_active = False
239
- if exception_msgs:
240
- for exception_msg in exception_msgs:
241
- logger.error(
242
- "%s process delivered an exception: %s. Terminating all monitoring processes immediately.",
243
- exception_msg[0],
244
- exception_msg[1]
245
- )
246
- self.zmq_router_proc.terminate()
247
- self.udp_router_proc.terminate()
248
- self.dbm_proc.terminate()
249
- self.filesystem_proc.terminate()
250
210
  logger.info("Setting router termination event")
251
211
  self.router_exit_event.set()
252
212
 
253
- logger.info("Waiting for ZMQ router to terminate")
254
- self.zmq_router_proc.join()
255
- self.zmq_router_proc.close()
256
-
257
213
  logger.info("Waiting for UDP router to terminate")
258
- self.udp_router_proc.join()
259
- self.udp_router_proc.close()
214
+ join_terminate_close_proc(self.udp_router_proc)
260
215
 
261
216
  logger.debug("Finished waiting for router termination")
262
- if len(exception_msgs) == 0:
263
- logger.debug("Sending STOP to DBM")
264
- self.resource_msgs.put("STOP")
265
- else:
266
- logger.debug("Not sending STOP to DBM, because there were DBM exceptions")
267
217
  logger.debug("Waiting for DB termination")
268
- self.dbm_proc.join()
269
- self.dbm_proc.close()
218
+ self.dbm_exit_event.set()
219
+ join_terminate_close_proc(self.dbm_proc)
270
220
  logger.debug("Finished waiting for DBM termination")
271
221
 
272
- # should this be message based? it probably doesn't need to be if
273
- # we believe we've received all messages
274
222
  logger.info("Terminating filesystem radio receiver process")
275
- self.filesystem_proc.terminate()
276
- self.filesystem_proc.join()
277
- self.filesystem_proc.close()
223
+ join_terminate_close_proc(self.filesystem_proc)
278
224
 
279
225
  logger.info("Closing monitoring multiprocessing queues")
280
- self.exception_q.close()
281
- self.exception_q.join_thread()
282
226
  self.resource_msgs.close()
283
227
  self.resource_msgs.join_thread()
284
228
  logger.info("Closed monitoring multiprocessing queues")
parsl/monitoring/radios/filesystem_router.py CHANGED
@@ -5,6 +5,7 @@ import os
5
5
  import pickle
6
6
  import time
7
7
  from multiprocessing.queues import Queue
8
+ from multiprocessing.synchronize import Event
8
9
  from typing import cast
9
10
 
10
11
  from parsl.log_utils import set_file_logger
@@ -15,7 +16,7 @@ from parsl.utils import setproctitle
15
16
 
16
17
 
17
18
  @wrap_with_logs
18
- def filesystem_router_starter(q: Queue[TaggedMonitoringMessage], run_dir: str) -> None:
19
+ def filesystem_router_starter(q: Queue[TaggedMonitoringMessage], run_dir: str, exit_event: Event) -> None:
19
20
  logger = set_file_logger(f"{run_dir}/monitoring_filesystem_radio.log",
20
21
  name="monitoring_filesystem_radio",
21
22
  level=logging.INFO)
@@ -32,7 +33,7 @@ def filesystem_router_starter(q: Queue[TaggedMonitoringMessage], run_dir: str) -
32
33
  os.makedirs(tmp_dir, exist_ok=True)
33
34
  os.makedirs(new_dir, exist_ok=True)
34
35
 
35
- while True: # this loop will end on process termination
36
+ while not exit_event.is_set():
36
37
  logger.debug("Start filesystem radio receiver loop")
37
38
 
38
39
  # iterate over files in new_dir
@@ -50,3 +51,4 @@ def filesystem_router_starter(q: Queue[TaggedMonitoringMessage], run_dir: str) -
50
51
  logger.exception("Exception processing %s - probably will be retried next iteration", filename)
51
52
 
52
53
  time.sleep(1) # whats a good time for this poll?
54
+ logger.info("Ending filesystem radio receiver")
parsl/monitoring/radios/udp_router.py CHANGED
@@ -118,7 +118,6 @@ class MonitoringRouter:
118
118
  @typeguard.typechecked
119
119
  def udp_router_starter(*,
120
120
  comm_q: mpq.Queue,
121
- exception_q: mpq.Queue,
122
121
  resource_msgs: mpq.Queue,
123
122
  exit_event: Event,
124
123
 
@@ -144,6 +143,5 @@ def udp_router_starter(*,
144
143
  router.logger.info("Starting MonitoringRouter in router_starter")
145
144
  try:
146
145
  router.start()
147
- except Exception as e:
146
+ except Exception:
148
147
  router.logger.exception("UDP router start exception")
149
- exception_q.put(('Hub', str(e)))
parsl/monitoring/radios/zmq_router.py CHANGED
@@ -3,16 +3,27 @@ from __future__ import annotations
3
3
  import logging
4
4
  import multiprocessing.queues as mpq
5
5
  import os
6
+ import queue
6
7
  import time
7
- from multiprocessing.synchronize import Event
8
+ from multiprocessing.context import SpawnProcess as SpawnProcessType
9
+ from multiprocessing.queues import Queue as QueueType
10
+ from multiprocessing.synchronize import Event as EventType
8
11
  from typing import Tuple
9
12
 
10
13
  import typeguard
11
14
  import zmq
12
15
 
16
+ from parsl.addresses import tcp_url
13
17
  from parsl.log_utils import set_file_logger
18
+ from parsl.monitoring.errors import MonitoringRouterStartError
14
19
  from parsl.monitoring.radios.multiprocessing import MultiprocessingQueueRadioSender
15
20
  from parsl.monitoring.types import TaggedMonitoringMessage
21
+ from parsl.multiprocessing import (
22
+ SizedQueue,
23
+ SpawnEvent,
24
+ SpawnProcess,
25
+ join_terminate_close_proc,
26
+ )
16
27
  from parsl.process_loggers import wrap_with_logs
17
28
  from parsl.utils import setproctitle
18
29
 
@@ -23,21 +34,21 @@ class MonitoringRouter:
23
34
 
24
35
  def __init__(self,
25
36
  *,
26
- hub_address: str,
27
- zmq_port_range: Tuple[int, int] = (55050, 56000),
37
+ address: str,
38
+ port_range: Tuple[int, int] = (55050, 56000),
28
39
 
29
40
  run_dir: str = ".",
30
41
  logging_level: int = logging.INFO,
31
42
  resource_msgs: mpq.Queue,
32
- exit_event: Event,
43
+ exit_event: EventType,
33
44
  ):
34
45
  """ Initializes a monitoring configuration class.
35
46
 
36
47
  Parameters
37
48
  ----------
38
- hub_address : str
49
+ address : str
39
50
  The ip address at which the workers will be able to reach the Hub.
40
- zmq_port_range : tuple(int, int)
51
+ port_range : tuple(int, int)
41
52
  The MonitoringHub picks ports at random from the range which will be used by Hub.
42
53
  Default: (55050, 56000)
43
54
  run_dir : str
@@ -51,11 +62,11 @@ class MonitoringRouter:
51
62
  """
52
63
  os.makedirs(run_dir, exist_ok=True)
53
64
  self.logger = set_file_logger(f"{run_dir}/monitoring_zmq_router.log",
54
- name="monitoring_router",
65
+ name="zmq_monitoring_router",
55
66
  level=logging_level)
56
67
  self.logger.debug("Monitoring router starting")
57
68
 
58
- self.hub_address = hub_address
69
+ self.address = address
59
70
 
60
71
  self.loop_freq = 10.0 # milliseconds
61
72
 
@@ -64,15 +75,15 @@ class MonitoringRouter:
64
75
  self.zmq_receiver_channel.setsockopt(zmq.LINGER, 0)
65
76
  self.zmq_receiver_channel.set_hwm(0)
66
77
  self.zmq_receiver_channel.RCVTIMEO = int(self.loop_freq) # in milliseconds
67
- self.logger.debug("hub_address: {}. zmq_port_range {}".format(hub_address, zmq_port_range))
68
- self.zmq_receiver_port = self.zmq_receiver_channel.bind_to_random_port("tcp://*",
69
- min_port=zmq_port_range[0],
70
- max_port=zmq_port_range[1])
78
+ self.logger.debug("address: {}. port_range {}".format(address, port_range))
79
+ self.zmq_receiver_port = self.zmq_receiver_channel.bind_to_random_port(tcp_url(address),
80
+ min_port=port_range[0],
81
+ max_port=port_range[1])
71
82
 
72
83
  self.target_radio = MultiprocessingQueueRadioSender(resource_msgs)
73
84
  self.exit_event = exit_event
74
85
 
75
- @wrap_with_logs(target="monitoring_router")
86
+ @wrap_with_logs(target="zmq_monitoring_router")
76
87
  def start(self) -> None:
77
88
  self.logger.info("Starting ZMQ listener")
78
89
  try:
@@ -107,19 +118,18 @@ class MonitoringRouter:
107
118
  @typeguard.typechecked
108
119
  def zmq_router_starter(*,
109
120
  comm_q: mpq.Queue,
110
- exception_q: mpq.Queue,
111
121
  resource_msgs: mpq.Queue,
112
- exit_event: Event,
122
+ exit_event: EventType,
113
123
 
114
- hub_address: str,
115
- zmq_port_range: Tuple[int, int],
124
+ address: str,
125
+ port_range: Tuple[int, int],
116
126
 
117
127
  run_dir: str,
118
128
  logging_level: int) -> None:
119
129
  setproctitle("parsl: monitoring zmq router")
120
130
  try:
121
- router = MonitoringRouter(hub_address=hub_address,
122
- zmq_port_range=zmq_port_range,
131
+ router = MonitoringRouter(address=address,
132
+ port_range=port_range,
123
133
  run_dir=run_dir,
124
134
  logging_level=logging_level,
125
135
  resource_msgs=resource_msgs,
@@ -129,10 +139,55 @@ def zmq_router_starter(*,
129
139
  comm_q.put(f"Monitoring router construction failed: {e}")
130
140
  else:
131
141
  comm_q.put(router.zmq_receiver_port)
142
+ router.start()
132
143
 
133
- router.logger.info("Starting MonitoringRouter in router_starter")
134
- try:
135
- router.start()
136
- except Exception as e:
137
- router.logger.exception("ZMQ router start exception")
138
- exception_q.put(('Hub', str(e)))
144
+
145
+ class ZMQRadioReceiver():
146
+ def __init__(self, *, process: SpawnProcessType, exit_event: EventType, port: int) -> None:
147
+ self.process = process
148
+ self.exit_event = exit_event
149
+ self.port = port
150
+
151
+ def close(self) -> None:
152
+ self.exit_event.set()
153
+ join_terminate_close_proc(self.process)
154
+
155
+
156
+ def start_zmq_receiver(*,
157
+ monitoring_messages: QueueType,
158
+ loopback_address: str,
159
+ port_range: Tuple[int, int],
160
+ logdir: str,
161
+ worker_debug: bool) -> ZMQRadioReceiver:
162
+ comm_q = SizedQueue(maxsize=10)
163
+
164
+ router_exit_event = SpawnEvent()
165
+
166
+ router_proc = SpawnProcess(target=zmq_router_starter,
167
+ kwargs={"comm_q": comm_q,
168
+ "resource_msgs": monitoring_messages,
169
+ "exit_event": router_exit_event,
170
+ "address": loopback_address,
171
+ "port_range": port_range,
172
+ "run_dir": logdir,
173
+ "logging_level": logging.DEBUG if worker_debug else logging.INFO,
174
+ },
175
+ name="Monitoring-ZMQ-Router-Process",
176
+ daemon=True,
177
+ )
178
+ router_proc.start()
179
+
180
+ try:
181
+ logger.debug("Waiting for router process to report port")
182
+ comm_q_result = comm_q.get(block=True, timeout=120)
183
+ comm_q.close()
184
+ comm_q.join_thread()
185
+ except queue.Empty:
186
+ logger.error("Monitoring ZMQ Router has not reported port in 120s")
187
+ raise MonitoringRouterStartError()
188
+
189
+ if isinstance(comm_q_result, str):
190
+ logger.error("MonitoringRouter sent an error message: %s", comm_q_result)
191
+ raise RuntimeError(f"MonitoringRouter failed to start: {comm_q_result}")
192
+
193
+ return ZMQRadioReceiver(process=router_proc, exit_event=router_exit_event, port=comm_q_result)
parsl/multiprocessing.py CHANGED
@@ -6,6 +6,7 @@ import multiprocessing
6
6
  import multiprocessing.queues
7
7
  import platform
8
8
  from multiprocessing.context import ForkProcess as ForkProcessType
9
+ from multiprocessing.context import SpawnProcess as SpawnProcessType
9
10
  from typing import Callable
10
11
 
11
12
  logger = logging.getLogger(__name__)
@@ -14,6 +15,10 @@ ForkContext = multiprocessing.get_context("fork")
14
15
  SpawnContext = multiprocessing.get_context("spawn")
15
16
 
16
17
  ForkProcess: Callable[..., ForkProcessType] = ForkContext.Process
18
+ SpawnProcess: Callable[..., SpawnProcessType] = SpawnContext.Process
19
+
20
+ SpawnEvent = SpawnContext.Event
21
+ SpawnQueue = SpawnContext.Queue
17
22
 
18
23
 
19
24
  class MacSafeQueue(multiprocessing.queues.Queue):
@@ -26,7 +31,7 @@ class MacSafeQueue(multiprocessing.queues.Queue):
26
31
 
27
32
  def __init__(self, *args, **kwargs):
28
33
  if 'ctx' not in kwargs:
29
- kwargs['ctx'] = multiprocessing.get_context()
34
+ kwargs['ctx'] = multiprocessing.get_context('spawn')
30
35
  super().__init__(*args, **kwargs)
31
36
  self._counter = multiprocessing.Value('i', 0)
32
37
 
@@ -59,6 +64,41 @@ SizedQueue: Callable[..., multiprocessing.Queue]
59
64
 
60
65
  if platform.system() != 'Darwin':
61
66
  import multiprocessing
62
- SizedQueue = multiprocessing.Queue
67
+ SizedQueue = SpawnQueue
63
68
  else:
64
69
  SizedQueue = MacSafeQueue
70
+
71
+
72
+ def join_terminate_close_proc(process: SpawnProcessType, *, timeout: int = 30) -> None:
73
+ """Increasingly aggressively terminate a process.
74
+
75
+ This function assumes that the process is likely to exit before
76
+ the join timeout, driven by some other means, such as the
77
+ MonitoringHub router_exit_event. If the process does not exit, then
78
+ first terminate() and then kill() will be used to end the process.
79
+
80
+ In the case of a very mis-behaving process, this function might take
81
+ up to 3*timeout to exhaust all termination methods and return.
82
+ """
83
+ logger.debug("Joining process")
84
+ process.join(timeout)
85
+
86
+ # run a sequence of increasingly aggressive steps to shut down the process.
87
+ if process.is_alive():
88
+ logger.error("Process did not join. Terminating.")
89
+ process.terminate()
90
+ process.join(timeout)
91
+ if process.is_alive():
92
+ logger.error("Process did not join after terminate. Killing.")
93
+ process.kill()
94
+ process.join(timeout)
95
+ # This kill should not be caught by any signal handlers so it is
96
+ # unlikely that this join will timeout. If it does, there isn't
97
+ # anything further to do except log an error in the next if-block.
98
+
99
+ if process.is_alive():
100
+ logger.error("Process failed to end")
101
+ # don't call close if the process hasn't ended:
102
+ # process.close() doesn't work on a running process.
103
+ else:
104
+ process.close()
parsl/tests/test_monitoring/test_exit_helper.py ADDED
@@ -0,0 +1,54 @@
1
+ import multiprocessing
2
+ import signal
3
+
4
+ import psutil
5
+ import pytest
6
+
7
+ from parsl.multiprocessing import SpawnEvent, SpawnProcess, join_terminate_close_proc
8
+
9
+
10
+ def noop():
11
+ pass
12
+
13
+
14
+ @pytest.mark.local
15
+ def test_end_process_already_exited():
16
+ p = SpawnProcess(target=noop)
17
+ p.start()
18
+ p.join()
19
+ join_terminate_close_proc(p)
20
+
21
+
22
+ def hang():
23
+ while True:
24
+ pass
25
+
26
+
27
+ @pytest.mark.local
28
+ def test_end_hung_process():
29
+ """Test calling against a process that will not exit itself."""
30
+ p = SpawnProcess(target=hang)
31
+ p.start()
32
+ pid = p.pid
33
+ join_terminate_close_proc(p, timeout=1)
34
+ assert not psutil.pid_exists(pid), "process should not exist any more"
35
+
36
+
37
+ def hang_no_sigint(e):
38
+ def s(*args, **kwargs):
39
+ e.set()
40
+ signal.signal(signal.SIGTERM, s)
41
+ while True:
42
+ pass
43
+
44
+
45
+ @pytest.mark.local
46
+ def test_end_hung_process_no_sigint():
47
+ """Test calling against a process that will not exit itself."""
48
+ e = SpawnEvent()
49
+ p = SpawnProcess(target=hang_no_sigint, args=(e,))
50
+ p.start()
51
+ pid = p.pid
52
+ join_terminate_close_proc(p, timeout=2)
53
+ assert not psutil.pid_exists(pid), "process should not exist any more"
54
+ assert e.is_set(), "hung process should have set event on signal"
parsl/tests/test_monitoring/test_fuzz_zmq.py CHANGED
@@ -45,7 +45,7 @@ def test_row_counts():
45
45
 
46
46
  # dig out the interchange port...
47
47
  hub_address = parsl.dfk().monitoring.hub_address
48
- hub_zmq_port = parsl.dfk().monitoring.hub_zmq_port
48
+ hub_zmq_port = parsl.dfk().executors["htex_Local"].hub_zmq_port
49
49
 
50
50
  # this will send a string to a new socket connection
51
51
  with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
parsl/tests/test_monitoring/test_radio_zmq.py ADDED
@@ -0,0 +1,27 @@
1
+ import pytest
2
+
3
+ from parsl.monitoring.radios.zmq import ZMQRadioSender
4
+ from parsl.monitoring.radios.zmq_router import start_zmq_receiver
5
+ from parsl.multiprocessing import SpawnQueue
6
+
7
+
8
+ @pytest.mark.local
9
+ def test_send_recv_message(tmpd_cwd, try_assert):
10
+ q = SpawnQueue()
11
+ loopback = "127.0.0.1"
12
+ r = start_zmq_receiver(monitoring_messages=q,
13
+ loopback_address=loopback,
14
+ port_range=(49152, 65535),
15
+ logdir=str(tmpd_cwd),
16
+ worker_debug=False)
17
+
18
+ s = ZMQRadioSender(loopback, r.port)
19
+
20
+ test_msg = ("test", {})
21
+ s.send(test_msg)
22
+
23
+ assert q.get() == test_msg
24
+
25
+ assert r.process.is_alive()
26
+ r.exit_event.set()
27
+ try_assert(lambda: not r.process.is_alive())
parsl/tests/test_monitoring/test_stdouterr.py CHANGED
@@ -102,6 +102,9 @@ def test_stdstream_to_monitoring(stdx, expected_stdx, stream, tmpd_cwd, caplog):
102
102
  kwargs = {stream: stdx}
103
103
  stdapp(**kwargs).result()
104
104
 
105
+ for record in caplog.records:
106
+ assert record.levelno < logging.ERROR
107
+
105
108
  engine = sqlalchemy.create_engine(c.monitoring.logging_endpoint)
106
109
  with engine.begin() as connection:
107
110
 
parsl/tests/test_shutdown/test_kill_monitoring.py CHANGED
@@ -30,7 +30,7 @@ def test_no_kills():
30
30
 
31
31
  @pytest.mark.local
32
32
  @pytest.mark.parametrize("sig", [signal.SIGINT, signal.SIGTERM, signal.SIGKILL, signal.SIGQUIT])
33
- @pytest.mark.parametrize("process_attr", ["zmq_router_proc", "udp_router_proc", "dbm_proc", "filesystem_proc"])
33
+ @pytest.mark.parametrize("process_attr", ["udp_router_proc", "dbm_proc", "filesystem_proc"])
34
34
  def test_kill_monitoring_helper_process(sig, process_attr, try_assert):
35
35
  """This tests that we can kill a monitoring process and still have successful shutdown.
36
36
  SIGINT emulates some racy behaviour when ctrl-C is pressed: that