parsl 2025.6.16__py3-none-any.whl → 2025.6.30__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- parsl/configs/osg.py +1 -1
- parsl/dataflow/dflow.py +14 -4
- parsl/executors/base.py +19 -9
- parsl/executors/flux/executor.py +2 -0
- parsl/executors/globus_compute.py +2 -0
- parsl/executors/high_throughput/executor.py +22 -15
- parsl/executors/high_throughput/interchange.py +173 -191
- parsl/executors/high_throughput/mpi_executor.py +14 -4
- parsl/executors/high_throughput/probe.py +4 -4
- parsl/executors/high_throughput/process_worker_pool.py +88 -94
- parsl/executors/radical/executor.py +3 -0
- parsl/executors/taskvine/executor.py +11 -3
- parsl/executors/taskvine/manager.py +3 -1
- parsl/executors/threads.py +19 -3
- parsl/executors/workqueue/executor.py +11 -3
- parsl/monitoring/errors.py +4 -4
- parsl/monitoring/monitoring.py +26 -88
- parsl/monitoring/radios/base.py +63 -2
- parsl/monitoring/radios/filesystem.py +19 -4
- parsl/monitoring/radios/filesystem_router.py +22 -3
- parsl/monitoring/radios/htex.py +22 -13
- parsl/monitoring/radios/multiprocessing.py +22 -2
- parsl/monitoring/radios/udp.py +57 -19
- parsl/monitoring/radios/udp_router.py +119 -25
- parsl/monitoring/radios/zmq_router.py +9 -10
- parsl/monitoring/remote.py +19 -40
- parsl/providers/local/local.py +12 -13
- parsl/tests/configs/htex_local_alternate.py +0 -1
- parsl/tests/conftest.py +7 -4
- parsl/tests/test_htex/test_interchange_exit_bad_registration.py +5 -7
- parsl/tests/test_htex/test_zmq_binding.py +5 -6
- parsl/tests/test_monitoring/test_basic.py +12 -10
- parsl/tests/test_monitoring/{test_fuzz_zmq.py → test_htex_fuzz_zmq.py} +7 -2
- parsl/tests/test_monitoring/test_htex_init_blocks_vs_monitoring.py +0 -1
- parsl/tests/test_monitoring/test_radio_filesystem.py +48 -0
- parsl/tests/test_monitoring/test_radio_multiprocessing.py +44 -0
- parsl/tests/test_monitoring/test_radio_udp.py +204 -0
- parsl/tests/test_monitoring/test_stdouterr.py +1 -3
- parsl/tests/test_scaling/test_worker_interchange_bad_messages_3262.py +3 -7
- parsl/tests/test_shutdown/test_kill_monitoring.py +1 -1
- parsl/version.py +1 -1
- {parsl-2025.6.16.data → parsl-2025.6.30.data}/scripts/interchange.py +173 -191
- {parsl-2025.6.16.data → parsl-2025.6.30.data}/scripts/process_worker_pool.py +88 -94
- {parsl-2025.6.16.dist-info → parsl-2025.6.30.dist-info}/METADATA +2 -2
- {parsl-2025.6.16.dist-info → parsl-2025.6.30.dist-info}/RECORD +51 -50
- parsl/tests/configs/local_threads_monitoring.py +0 -10
- parsl/tests/manual_tests/test_udp_simple.py +0 -51
- {parsl-2025.6.16.data → parsl-2025.6.30.data}/scripts/exec_parsl_function.py +0 -0
- {parsl-2025.6.16.data → parsl-2025.6.30.data}/scripts/parsl_coprocess.py +0 -0
- {parsl-2025.6.16.dist-info → parsl-2025.6.30.dist-info}/LICENSE +0 -0
- {parsl-2025.6.16.dist-info → parsl-2025.6.30.dist-info}/WHEEL +0 -0
- {parsl-2025.6.16.dist-info → parsl-2025.6.30.dist-info}/entry_points.txt +0 -0
- {parsl-2025.6.16.dist-info → parsl-2025.6.30.dist-info}/top_level.txt +0 -0
```diff
@@ -1,18 +1,31 @@
 from __future__ import annotations
 
+import hmac
 import logging
 import multiprocessing.queues as mpq
 import os
 import pickle
+import queue
 import socket
 import time
+from multiprocessing.context import SpawnProcess as SpawnProcessType
+from multiprocessing.queues import Queue
 from multiprocessing.synchronize import Event
-from typing import Optional
+from multiprocessing.synchronize import Event as EventType
+from typing import Optional, Union
 
 import typeguard
 
 from parsl.log_utils import set_file_logger
+from parsl.monitoring.errors import MonitoringRouterStartError
+from parsl.monitoring.radios.base import MonitoringRadioReceiver
 from parsl.monitoring.radios.multiprocessing import MultiprocessingQueueRadioSender
+from parsl.multiprocessing import (
+    SizedQueue,
+    SpawnEvent,
+    SpawnProcess,
+    join_terminate_close_proc,
+)
 from parsl.process_loggers import wrap_with_logs
 from parsl.utils import setproctitle
 
@@ -26,9 +39,11 @@ class MonitoringRouter:
                  udp_port: Optional[int] = None,
                  run_dir: str = ".",
                  logging_level: int = logging.INFO,
-                 atexit_timeout: int
+                 atexit_timeout: int,  # in seconds
                  resource_msgs: mpq.Queue,
                  exit_event: Event,
+                 hmac_key: bytes,
+                 hmac_digest: str,
                  ):
         """ Initializes a monitoring configuration class.
 
@@ -48,13 +63,15 @@ class MonitoringRouter:
             An event that the main Parsl process will set to signal that the monitoring router should shut down.
         """
         os.makedirs(run_dir, exist_ok=True)
-
-
-
-        self.logger.debug("Monitoring router starting")
+        set_file_logger(f"{run_dir}/monitoring_udp_router.log",
+                        level=logging_level)
+        logger.debug("Monitoring router starting")
 
         self.atexit_timeout = atexit_timeout
 
+        self.hmac_key = hmac_key
+        self.hmac_digest = hmac_digest
+
         self.loop_freq = 10.0  # milliseconds
 
         # Initialize the UDP socket
@@ -73,39 +90,55 @@ class MonitoringRouter:
         except Exception as e:
             raise RuntimeError(f"Could not bind to udp_port {udp_port} because: {e}")
         self.udp_sock.settimeout(self.loop_freq / 1000)
-
+        logger.info("Initialized the UDP socket on 0.0.0.0:{}".format(self.udp_port))
 
         self.target_radio = MultiprocessingQueueRadioSender(resource_msgs)
         self.exit_event = exit_event
 
-    @wrap_with_logs
+    @wrap_with_logs
     def start(self) -> None:
-
+        logger.info("Starting UDP listener")
         try:
             while not self.exit_event.is_set():
                 try:
-
-                    resource_msg = pickle.loads(data)
-                    self.logger.debug("Got UDP Message from {}: {}".format(addr, resource_msg))
-                    self.target_radio.send(resource_msg)
+                    self.process_message()
                 except socket.timeout:
                     pass
 
-
+            logger.info("UDP listener draining")
             last_msg_received_time = time.time()
             while time.time() - last_msg_received_time < self.atexit_timeout:
                 try:
-
-                    msg = pickle.loads(data)
-                    self.logger.debug("Got UDP Message from {}: {}".format(addr, msg))
-                    self.target_radio.send(msg)
+                    self.process_message()
                     last_msg_received_time = time.time()
                 except socket.timeout:
                     pass
 
-
+            logger.info("UDP listener finishing normally")
         finally:
-
+            logger.info("UDP listener finished")
+
+    def process_message(self) -> None:
+        hmdata, addr = self.udp_sock.recvfrom(2048)
+        h = hmac.HMAC(key=self.hmac_key, digestmod=self.hmac_digest)
+        origin_hmac = hmdata[0:h.digest_size]
+        h.update(hmdata[h.digest_size:])
+        data = hmdata[h.digest_size:]
+
+        # Check hmac before pickle load.
+        # If data is wrong, do not log it because it is suspect,
+        # but it should be safe to log the addr, at error level.
+
+        recomputed_hmac = h.digest()
+
+        if not hmac.compare_digest(origin_hmac, recomputed_hmac):
+            logger.error("HMAC does not match on received message")
+            # No exception, because this can be arbitrary network noise
+            # that shouldn't break the receiver.
+        else:
+            resource_msg = pickle.loads(data)
+            logger.debug("Got UDP Message from {}: {}".format(addr, resource_msg))
+            self.target_radio.send(resource_msg)
 
 
 @wrap_with_logs
@@ -116,24 +149,85 @@ def udp_router_starter(*,
                        exit_event: Event,
 
                        udp_port: Optional[int],
-
+                       hmac_key: bytes,
                        run_dir: str,
-                       logging_level: int
+                       logging_level: int,
+                       atexit_timeout: int,
+                       hmac_digest: str) -> None:
     setproctitle("parsl: monitoring UDP router")
     try:
         router = MonitoringRouter(udp_port=udp_port,
                                   run_dir=run_dir,
                                   logging_level=logging_level,
                                   resource_msgs=resource_msgs,
-                                  exit_event=exit_event
+                                  exit_event=exit_event,
+                                  atexit_timeout=atexit_timeout,
+                                  hmac_key=hmac_key,
+                                  hmac_digest=hmac_digest)
     except Exception as e:
         logger.error("MonitoringRouter construction failed.", exc_info=True)
         comm_q.put(f"Monitoring router construction failed: {e}")
     else:
         comm_q.put(router.udp_port)
 
-
+        logger.info("Starting MonitoringRouter in router_starter")
         try:
             router.start()
         except Exception:
-
+            logger.exception("UDP router start exception")
+
+
+class UDPRadioReceiver(MonitoringRadioReceiver):
+    def __init__(self, *, process: SpawnProcessType, exit_event: EventType, port: int) -> None:
+        self.process = process
+        self.exit_event = exit_event
+        self.port = port
+
+    def shutdown(self) -> None:
+        self.exit_event.set()
+        join_terminate_close_proc(self.process)
+
+
+def start_udp_receiver(*,
+                       monitoring_messages: Queue,
+                       port: Optional[int],
+                       logdir: str,
+                       debug: bool,
+                       atexit_timeout: int,
+                       hmac_key: bytes,
+                       hmac_digest: str) -> UDPRadioReceiver:
+
+    udp_comm_q: Queue[Union[int, str]]
+    udp_comm_q = SizedQueue(maxsize=10)
+
+    router_exit_event = SpawnEvent()
+
+    router_proc = SpawnProcess(target=udp_router_starter,
+                               kwargs={"comm_q": udp_comm_q,
+                                       "resource_msgs": monitoring_messages,
+                                       "exit_event": router_exit_event,
+                                       "udp_port": port,
+                                       "run_dir": logdir,
+                                       "logging_level": logging.DEBUG if debug else logging.INFO,
+                                       "atexit_timeout": atexit_timeout,
+                                       "hmac_key": hmac_key,
+                                       "hmac_digest": hmac_digest,
+                                       },
+                               name="Monitoring-UDP-Router-Process",
+                               daemon=True,
+                               )
+    router_proc.start()
+
+    try:
+        udp_comm_q_result = udp_comm_q.get(block=True, timeout=120)
+        udp_comm_q.close()
+        udp_comm_q.join_thread()
+    except queue.Empty:
+        logger.error("Monitoring UDP router has not reported port in 120s. Aborting")
+        raise MonitoringRouterStartError()
+
+    if isinstance(udp_comm_q_result, str):
+        logger.error("MonitoringRouter sent an error message: %s", udp_comm_q_result)
+        raise RuntimeError(f"MonitoringRouter failed to start: {udp_comm_q_result}")
+
+    return UDPRadioReceiver(process=router_proc, exit_event=router_exit_event, port=udp_comm_q_result)
```
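The new `process_message()` treats each datagram as an HMAC digest followed by a pickled payload, and verifies the digest with `hmac.compare_digest` before unpickling. As a rough sketch of the framing this receiver expects (this is not parsl's actual `UDPRadioSender`; the function name and parameters here are illustrative), a matching sender would look something like:

```python
import hmac
import pickle
import socket


def send_hmac_framed(msg: object, *, address: str, port: int,
                     hmac_key: bytes, hmac_digest: str) -> None:
    """Prepend an HMAC digest to the pickled payload, in the
    digest-then-payload layout that process_message() splits and verifies."""
    payload = pickle.dumps(msg)
    digest = hmac.new(hmac_key, payload, hmac_digest).digest()
    with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as sock:
        sock.sendto(digest + payload, (address, port))
```

The key and digest algorithm must match what the router was constructed with; a mismatched or missing prefix is logged at error level and dropped rather than raising, since arbitrary network noise can reach the UDP port.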
parsl/monitoring/radios/zmq_router.py CHANGED

```diff
@@ -61,10 +61,9 @@ class MonitoringRouter:
             An event that the main Parsl process will set to signal that the monitoring router should shut down.
         """
         os.makedirs(run_dir, exist_ok=True)
-
-
-
-        self.logger.debug("Monitoring router starting")
+        set_file_logger(f"{run_dir}/monitoring_zmq_router.log",
+                        level=logging_level)
+        logger.debug("Monitoring router starting")
 
         self.address = address
 
@@ -75,7 +74,7 @@ class MonitoringRouter:
         self.zmq_receiver_channel.setsockopt(zmq.LINGER, 0)
         self.zmq_receiver_channel.set_hwm(0)
         self.zmq_receiver_channel.RCVTIMEO = int(self.loop_freq)  # in milliseconds
-
+        logger.debug("address: {}. port_range {}".format(address, port_range))
         self.zmq_receiver_port = self.zmq_receiver_channel.bind_to_random_port(tcp_url(address),
                                                                                min_port=port_range[0],
                                                                                max_port=port_range[1])
@@ -83,9 +82,9 @@ class MonitoringRouter:
         self.target_radio = MultiprocessingQueueRadioSender(resource_msgs)
         self.exit_event = exit_event
 
-    @wrap_with_logs
+    @wrap_with_logs
     def start(self) -> None:
-
+        logger.info("Starting ZMQ listener")
         try:
             while not self.exit_event.is_set():
                 try:
@@ -107,11 +106,11 @@ class MonitoringRouter:
                     # channel is broken in such a way that it always raises
                     # an exception? Looping on this would maybe be the wrong
                     # thing to do.
-
+                    logger.warning("Failure processing a ZMQ message", exc_info=True)
 
-
+            logger.info("ZMQ listener finishing normally")
         finally:
-
+            logger.info("ZMQ listener finished")
 
 
 @wrap_with_logs
```
parsl/monitoring/remote.py CHANGED

```diff
@@ -7,10 +7,7 @@ from multiprocessing import Event
 from typing import Any, Callable, Dict, List, Sequence, Tuple
 
 from parsl.monitoring.message_type import MessageType
-from parsl.monitoring.radios.base import MonitoringRadioSender
-from parsl.monitoring.radios.filesystem import FilesystemRadioSender
-from parsl.monitoring.radios.htex import HTEXRadioSender
-from parsl.monitoring.radios.udp import UDPRadioSender
+from parsl.monitoring.radios.base import RadioConfig
 from parsl.multiprocessing import ForkProcess
 from parsl.process_loggers import wrap_with_logs
 
@@ -23,11 +20,10 @@ def monitor_wrapper(*,
                     kwargs: Dict,  # per invocation
                     x_try_id: int,  # per invocation
                     x_task_id: int,  # per invocation
-
+                    radio_config: RadioConfig,  # per executor
                     run_id: str,  # per workflow
                     logging_level: int,  # per workflow
                     sleep_dur: float,  # per workflow
-                    radio_mode: str,  # per executor
                     monitor_resources: bool,  # per workflow
                     run_dir: str) -> Tuple[Callable, Sequence, Dict]:
     """Wrap the Parsl app with a function that will call the monitor function and point it at the correct pid when the task begins.
@@ -41,9 +37,8 @@ def monitor_wrapper(*,
         # Send first message to monitoring router
         send_first_message(try_id,
                            task_id,
-
+                           radio_config,
                            run_id,
-                           radio_mode,
                            run_dir)
 
         if monitor_resources and sleep_dur > 0:
@@ -52,9 +47,8 @@ def monitor_wrapper(*,
                 args=(os.getpid(),
                       try_id,
                       task_id,
-
+                      radio_config,
                       run_id,
-                      radio_mode,
                       logging_level,
                       sleep_dur,
                       run_dir,
@@ -87,9 +81,9 @@ def monitor_wrapper(*,
 
         send_last_message(try_id,
                           task_id,
-
+                          radio_config,
                           run_id,
-
+                          run_dir)
 
         new_kwargs = kwargs.copy()
         new_kwargs['_parsl_monitoring_task_id'] = x_task_id
@@ -98,47 +92,33 @@ def monitor_wrapper(*,
     return (wrapped, args, new_kwargs)
 
 
-def get_radio(radio_mode: str, monitoring_hub_url: str, task_id: int, run_dir: str) -> MonitoringRadioSender:
-    radio: MonitoringRadioSender
-    if radio_mode == "udp":
-        radio = UDPRadioSender(monitoring_hub_url)
-    elif radio_mode == "htex":
-        radio = HTEXRadioSender(monitoring_hub_url)
-    elif radio_mode == "filesystem":
-        radio = FilesystemRadioSender(monitoring_url=monitoring_hub_url,
-                                      run_dir=run_dir)
-    else:
-        raise RuntimeError(f"Unknown radio mode: {radio_mode}")
-    return radio
-
-
 @wrap_with_logs
 def send_first_message(try_id: int,
                        task_id: int,
-
-                       run_id: str,
-    send_first_last_message(try_id, task_id,
-
+                       radio_config: RadioConfig,
+                       run_id: str, run_dir: str) -> None:
+    send_first_last_message(try_id, task_id, radio_config, run_id,
+                            run_dir, False)
 
 
 @wrap_with_logs
 def send_last_message(try_id: int,
                       task_id: int,
-
-                      run_id: str,
-    send_first_last_message(try_id, task_id,
-
+                      radio_config: RadioConfig,
+                      run_id: str, run_dir: str) -> None:
+    send_first_last_message(try_id, task_id, radio_config, run_id,
+                            run_dir, True)
 
 
 def send_first_last_message(try_id: int,
                             task_id: int,
-
-                            run_id: str,
+                            radio_config: RadioConfig,
+                            run_id: str, run_dir: str,
                             is_last: bool) -> None:
     import os
     import platform
 
-    radio =
+    radio = radio_config.create_sender()
 
     msg = (MessageType.RESOURCE_INFO,
            {'run_id': run_id,
@@ -158,9 +138,8 @@ def send_first_last_message(try_id: int,
 def monitor(pid: int,
             try_id: int,
             task_id: int,
-
+            radio_config: RadioConfig,
             run_id: str,
-            radio_mode: str,
             logging_level: int,
             sleep_dur: float,
             run_dir: str,
@@ -184,7 +163,7 @@ def monitor(pid: int,
 
     setproctitle("parsl: task resource monitor")
 
-    radio =
+    radio = radio_config.create_sender()
 
     logging.debug("start of monitor")
 
```
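These remote.py changes replace the old `get_radio()` dispatch on a `radio_mode` string with a per-executor `RadioConfig` object whose `create_sender()` is called on the worker side. Purely as an illustration of that plug-point shape (the classes below are hypothetical and not part of parsl), a configuration object that sends monitoring messages by appending pickled records to a file might look like this:

```python
import pickle


class AppendFileSender:
    """Hypothetical sender: appends one pickled record per message."""

    def __init__(self, path: str) -> None:
        self.path = path

    def send(self, message: object) -> None:
        with open(self.path, "ab") as f:
            f.write(pickle.dumps(message))


class AppendFileRadio:
    """Hypothetical stand-in for a RadioConfig-style object: it carries the
    configuration across to the task and builds a sender there on demand."""

    def __init__(self, path: str) -> None:
        self.path = path

    def create_sender(self) -> AppendFileSender:
        return AppendFileSender(self.path)
```

The design point visible in the diff is that only the config object travels with the wrapped task; the sender itself is constructed lazily via `create_sender()` inside the task and inside the resource monitor process.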
parsl/providers/local/local.py CHANGED

```diff
@@ -114,17 +114,15 @@ class LocalProvider(ExecutionProvider, RepresentationMixin):
 
         return [self.resources[jid]['status'] for jid in job_ids]
 
-
-
-
-
-
-
-
-
-
-        else:
-            return False
+    @staticmethod
+    def _is_alive(job_dict) -> bool:
+        try:
+            os.kill(job_dict['remote_pid'], 0)
+        except ProcessLookupError:
+            return False
+        except PermissionError:
+            pass  # exists; just no permissions to send signal
+        return True
 
     def _job_file_path(self, script_path: str, suffix: str) -> str:
         path = '{0}{1}'.format(script_path, suffix)
@@ -230,8 +228,9 @@ class LocalProvider(ExecutionProvider, RepresentationMixin):
                                                stdout, stderr)
         for line in stdout.split('\n'):
             if line.startswith("PID:"):
-
-
+                job_id = line.split("PID:")[1].strip()
+                remote_pid = int(job_id)
+                break
         if job_id is None:
             raise SubmitException(job_name, "Channel failed to start remote command/retrieve PID")
 
```
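The rewritten `_is_alive` probes liveness with `os.kill(pid, 0)`: on POSIX systems, signal 0 delivers nothing but still performs the existence and permission checks. A standalone sketch of the same idiom (the helper name is illustrative):

```python
import os


def pid_is_alive(pid: int) -> bool:
    """Signal 0 sends nothing, but raises if the PID does not exist."""
    try:
        os.kill(pid, 0)
    except ProcessLookupError:
        return False     # no such process
    except PermissionError:
        return True      # process exists but is owned by another user
    return True


assert pid_is_alive(os.getpid())  # the current process is always alive
```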
parsl/tests/conftest.py CHANGED

```diff
@@ -406,13 +406,15 @@ def try_assert():
             timeout_ms: float = 5000,
             attempts: int = 0,
             check_period_ms: int = 20,
+            factor: float = 2,
     ):
         tb = create_traceback(start=1)
         check_period_s = abs(check_period_ms) / 1000.0
         if attempts > 0:
             for _attempt_no in range(attempts):
-
-                check_period_s
+                fraction = random.random()
+                time.sleep(fraction * check_period_s)  # jitter
+                check_period_s *= factor ** fraction
                 if test_func():
                     return
             else:
@@ -427,9 +429,10 @@ def try_assert():
             timeout_s = abs(timeout_ms) / 1000.0
             end = time.monotonic() + timeout_s
             while time.monotonic() < end:
-
+                fraction = random.random()
+                wait_for = fraction * check_period_s  # jitter
                 time.sleep(min(wait_for, end - time.monotonic()))
-                check_period_s *=
+                check_period_s *= factor ** fraction
                 if test_func():
                     return
             att_fail = (
```
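The reworked `try_assert` polling now uses jittered exponential backoff: each round sleeps a random fraction of the current period and scales the period by `factor ** fraction`, so the growth rate per round varies between 1x and `factor`. A self-contained sketch of the same loop outside the fixture (names here are illustrative, not parsl API):

```python
import random
import time


def wait_until(predicate, *, timeout_s: float = 5.0,
               initial_period_s: float = 0.02, factor: float = 2.0) -> bool:
    """Poll predicate() until it returns True or timeout_s elapses,
    using jittered exponential backoff between checks."""
    period = initial_period_s
    deadline = time.monotonic() + timeout_s
    while time.monotonic() < deadline:
        fraction = random.random()
        time.sleep(min(fraction * period, max(deadline - time.monotonic(), 0.0)))
        period *= factor ** fraction
        if predicate():
            return True
    return predicate()  # one last check at the deadline
```

The jitter avoids many concurrent tests polling in lockstep, while the randomized exponent keeps the average backoff growth below a fixed doubling.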
parsl/tests/test_htex/test_interchange_exit_bad_registration.py CHANGED

```diff
@@ -40,7 +40,7 @@ def test_exit_with_bad_registration(tmpd_cwd, try_assert):
                               incoming_q.port,
                               command_client.port),
             "interchange_address": "127.0.0.1",
-            "
+            "worker_port": None,
             "worker_port_range": (50000, 60000),
             "hub_address": None,
             "hub_zmq_port": None,
@@ -67,7 +67,7 @@ def test_exit_with_bad_registration(tmpd_cwd, try_assert):
     # responsive. if the interchange process didn't start enough to get the command
     # thread running, this will time out.
 
-
+    worker_port = command_client.run("WORKER_BINDS", timeout_s=120)
 
     # now we'll assume that if the interchange command thread is responding,
     # then the worker polling code is also running and that the interchange has
@@ -80,7 +80,7 @@ def test_exit_with_bad_registration(tmpd_cwd, try_assert):
 
     msg = {'type': 'registration',
            'parsl_v': PARSL_VERSION,
-           'python_v': "
+           'python_v': "1.1.1",  # this is the bad bit
            'worker_count': 1,
            'uid': 'testuid',
            'block_id': 0,
@@ -104,11 +104,9 @@ def test_exit_with_bad_registration(tmpd_cwd, try_assert):
 
     task_channel.set_hwm(0)
     task_channel.setsockopt(zmq.SNDTIMEO, channel_timeout)
-    task_channel.connect(f"tcp://127.0.0.1:{
+    task_channel.connect(f"tcp://127.0.0.1:{worker_port}")
 
-
-
-    task_channel.send(b_msg)
+    task_channel.send(pickle.dumps(msg))
 
     # check that the interchange exits within some reasonable time
     try_assert(lambda: interchange_proc.poll() is not None, "Interchange did not exit after killing watched client process", timeout_ms=5000)
```
parsl/tests/test_htex/test_zmq_binding.py CHANGED

```diff
@@ -15,12 +15,12 @@ from parsl.executors.high_throughput.manager_selector import RandomManagerSelector
 def make_interchange(*,
                      interchange_address: Optional[str],
                      cert_dir: Optional[str],
-
+                     worker_port: Optional[int] = None) -> Interchange:
     return Interchange(interchange_address=interchange_address,
                        cert_dir=cert_dir,
                        client_address="127.0.0.1",
                        client_ports=(50055, 50056, 50057),
-
+                       worker_port=worker_port,
                        worker_port_range=(54000, 55000),
                        hub_address=None,
                        hub_zmq_port=None,
@@ -56,7 +56,7 @@ def test_interchange_curvezmq_sockets(
     ix = make_interchange(interchange_address=address, cert_dir=cert_dir)
     assert isinstance(ix.zmq_context, curvezmq.ServerContext)
     assert ix.zmq_context.encrypted is encrypted
-    assert mock_socket.call_count ==
+    assert mock_socket.call_count == 4
 
 
 @pytest.mark.local
@@ -100,11 +100,10 @@ def test_limited_interface_binding(cert_dir: Optional[str]):
     """When address is specified the worker_port would be bound to it rather than to 0.0.0.0"""
     address = "127.0.0.1"
     ix = make_interchange(interchange_address=address, cert_dir=cert_dir)
-    ix.worker_result_port
     proc = psutil.Process()
     conns = proc.connections(kind="tcp")
 
-    matched_conns = [conn for conn in conns if conn.laddr.port == ix.
+    matched_conns = [conn for conn in conns if conn.laddr.port == ix.worker_port]
     assert len(matched_conns) == 1
     # laddr.ip can return ::ffff:127.0.0.1 when using IPv6
     assert address in matched_conns[0].laddr.ip
@@ -113,5 +112,5 @@ def test_limited_interface_binding(cert_dir: Optional[str]):
 @pytest.mark.local
 @pytest.mark.parametrize("encrypted", (True, False), indirect=True)
 def test_fixed_ports(cert_dir: Optional[str]):
-    ix = make_interchange(interchange_address=None, cert_dir=cert_dir,
+    ix = make_interchange(interchange_address=None, cert_dir=cert_dir, worker_port=51117)
     assert ix.interchange_address == "*"
```
parsl/tests/test_monitoring/test_basic.py CHANGED

```diff
@@ -8,6 +8,9 @@ from parsl import HighThroughputExecutor, ThreadPoolExecutor
 from parsl.config import Config
 from parsl.executors.status_handling import BlockProviderExecutor
 from parsl.monitoring import MonitoringHub
+from parsl.monitoring.radios.filesystem import FilesystemRadio
+from parsl.monitoring.radios.htex import HTEXRadio
+from parsl.monitoring.radios.udp import UDPRadio
 
 
 @parsl.python_app
@@ -25,9 +28,8 @@ def this_app():
 # a configuration that is suitably configured for monitoring.
 
 def thread_config():
-    c = Config(executors=[ThreadPoolExecutor()],
-               monitoring=MonitoringHub(
-                   resource_monitoring_interval=0))
+    c = Config(executors=[ThreadPoolExecutor(remote_monitoring_radio=UDPRadio(address="localhost", atexit_timeout=0))],
+               monitoring=MonitoringHub(resource_monitoring_interval=0))
     return c
 
 
@@ -42,9 +44,10 @@ def htex_udp_config():
     from parsl.tests.configs.htex_local_alternate import fresh_config
     c = fresh_config()
     assert len(c.executors) == 1
+    ex = c.executors[0]
 
-    assert
-
+    assert isinstance(ex.remote_monitoring_radio, HTEXRadio), "precondition: htex is configured for the HTEXRadio"
+    ex.remote_monitoring_radio = UDPRadio(address="localhost", atexit_timeout=0)
 
     return c
 
@@ -54,9 +57,10 @@ def htex_filesystem_config():
     from parsl.tests.configs.htex_local_alternate import fresh_config
     c = fresh_config()
     assert len(c.executors) == 1
+    ex = c.executors[0]
 
-    assert
-
+    assert isinstance(ex.remote_monitoring_radio, HTEXRadio), "precondition: htex is configured for the HTEXRadio"
+    ex.remote_monitoring_radio = FilesystemRadio()
 
     return c
 
@@ -65,7 +69,6 @@ def workqueue_config():
     from parsl.tests.configs.workqueue_ex import fresh_config
     c = fresh_config()
     c.monitoring = MonitoringHub(
-        hub_address="localhost",
         resource_monitoring_interval=1)
     return c
 
@@ -76,8 +79,7 @@ def taskvine_config():
                                                  worker_launch_method='provider')],
                strategy_period=0.5,
 
-               monitoring=MonitoringHub(
-                   resource_monitoring_interval=1))
+               monitoring=MonitoringHub(resource_monitoring_interval=1))
     return c
 
 
```
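These test fixtures show the new per-executor radio selection: instead of an implicit radio mode, an executor carries a `remote_monitoring_radio` such as `UDPRadio(...)` or `FilesystemRadio()`. A minimal configuration sketch mirroring the `thread_config()` fixture above (argument values are taken directly from the test, not presented as recommended settings):

```python
# Selects the UDP monitoring radio for a ThreadPoolExecutor, as in the test.
from parsl import ThreadPoolExecutor
from parsl.config import Config
from parsl.monitoring import MonitoringHub
from parsl.monitoring.radios.udp import UDPRadio

config = Config(
    executors=[
        ThreadPoolExecutor(
            remote_monitoring_radio=UDPRadio(address="localhost",
                                             atexit_timeout=0),
        ),
    ],
    monitoring=MonitoringHub(resource_monitoring_interval=0),
)
```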
parsl/tests/test_monitoring/{test_fuzz_zmq.py → test_htex_fuzz_zmq.py} RENAMED

```diff
@@ -17,7 +17,12 @@ def this_app():
 
 
 @pytest.mark.local
-def
+def test_fuzz():
+    """This test sends fuzz into the ZMQ radio receiver that HTEX starts
+    for receiving monitoring messages from the interchange, and checks
+    that monitoring still records things ok.
+    """
+
     import sqlalchemy
     from sqlalchemy import text
 
@@ -44,7 +49,7 @@ def test_row_counts():
     # the latter is what i'm most suspicious of in my present investigation
 
     # dig out the interchange port...
-    hub_address = parsl.dfk().
+    hub_address = parsl.dfk().executors["htex_Local"].loopback_address
     hub_zmq_port = parsl.dfk().executors["htex_Local"].hub_zmq_port
 
     # this will send a string to a new socket connection
```
|