parsl 2024.7.29__py3-none-any.whl → 2024.8.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- parsl/dataflow/dflow.py +1 -1
- parsl/executors/base.py +7 -7
- parsl/executors/high_throughput/executor.py +13 -6
- parsl/executors/high_throughput/interchange.py +36 -37
- parsl/executors/high_throughput/manager_selector.py +25 -0
- parsl/executors/status_handling.py +38 -24
- parsl/monitoring/errors.py +6 -0
- parsl/monitoring/monitoring.py +2 -1
- parsl/monitoring/radios.py +16 -0
- parsl/monitoring/router.py +71 -30
- parsl/tests/test_htex/test_disconnected_blocks_failing_provider.py +71 -0
- parsl/tests/test_htex/test_htex.py +28 -19
- parsl/tests/test_htex/test_zmq_binding.py +2 -0
- parsl/tests/test_monitoring/test_basic.py +14 -1
- parsl/tests/test_mpi_apps/test_mpiex.py +1 -1
- parsl/version.py +1 -1
- {parsl-2024.7.29.data → parsl-2024.8.5.data}/scripts/interchange.py +36 -37
- parsl-2024.8.5.dist-info/METADATA +101 -0
- {parsl-2024.7.29.dist-info → parsl-2024.8.5.dist-info}/RECORD +26 -23
- {parsl-2024.7.29.dist-info → parsl-2024.8.5.dist-info}/WHEEL +1 -1
- parsl-2024.7.29.dist-info/METADATA +0 -101
- {parsl-2024.7.29.data → parsl-2024.8.5.data}/scripts/exec_parsl_function.py +0 -0
- {parsl-2024.7.29.data → parsl-2024.8.5.data}/scripts/parsl_coprocess.py +0 -0
- {parsl-2024.7.29.data → parsl-2024.8.5.data}/scripts/process_worker_pool.py +0 -0
- {parsl-2024.7.29.dist-info → parsl-2024.8.5.dist-info}/LICENSE +0 -0
- {parsl-2024.7.29.dist-info → parsl-2024.8.5.dist-info}/entry_points.txt +0 -0
- {parsl-2024.7.29.dist-info → parsl-2024.8.5.dist-info}/top_level.txt +0 -0
parsl/monitoring/router.py
CHANGED
@@ -5,6 +5,7 @@ import os
 import pickle
 import queue
 import socket
+import threading
 import time
 from multiprocessing.synchronize import Event
 from typing import Optional, Tuple, Union
@@ -32,7 +33,12 @@ class MonitoringRouter:
                  logdir: str = ".",
                  run_id: str,
                  logging_level: int = logging.INFO,
-                 atexit_timeout: int = 3
+                 atexit_timeout: int = 3,  # in seconds
+                 priority_msgs: "queue.Queue[AddressedMonitoringMessage]",
+                 node_msgs: "queue.Queue[AddressedMonitoringMessage]",
+                 block_msgs: "queue.Queue[AddressedMonitoringMessage]",
+                 resource_msgs: "queue.Queue[AddressedMonitoringMessage]",
+                 exit_event: Event,
                  ):
         """ Initializes a monitoring configuration class.

@@ -51,7 +57,11 @@ class MonitoringRouter:
            Logging level as defined in the logging module. Default: logging.INFO
        atexit_timeout : float, optional
            The amount of time in seconds to terminate the hub without receiving any messages, after the last dfk workflow message is received.
+       *_msgs : Queue
+           Four multiprocessing queues to receive messages, routed by type tag, and sometimes modified according to type tag.

+       exit_event : Event
+           An event that the main Parsl process will set to signal that the monitoring router should shut down.
        """
        os.makedirs(logdir, exist_ok=True)
        self.logger = set_file_logger("{}/monitoring_router.log".format(logdir),
@@ -93,22 +103,60 @@ class MonitoringRouter:
                                               min_port=zmq_port_range[0],
                                               max_port=zmq_port_range[1])

-    def start(self,
-              priority_msgs: "queue.Queue[AddressedMonitoringMessage]",
-              node_msgs: "queue.Queue[AddressedMonitoringMessage]",
-              block_msgs: "queue.Queue[AddressedMonitoringMessage]",
-              resource_msgs: "queue.Queue[AddressedMonitoringMessage]",
-              exit_event: Event) -> None:
+        self.priority_msgs = priority_msgs
+        self.node_msgs = node_msgs
+        self.block_msgs = block_msgs
+        self.resource_msgs = resource_msgs
+        self.exit_event = exit_event
+
+    @wrap_with_logs(target="monitoring_router")
+    def start(self) -> None:
+        self.logger.info("Starting UDP listener thread")
+        udp_radio_receiver_thread = threading.Thread(target=self.start_udp_listener, daemon=True)
+        udp_radio_receiver_thread.start()
+
+        self.logger.info("Starting ZMQ listener thread")
+        zmq_radio_receiver_thread = threading.Thread(target=self.start_zmq_listener, daemon=True)
+        zmq_radio_receiver_thread.start()
+
+        self.logger.info("Joining on ZMQ listener thread")
+        zmq_radio_receiver_thread.join()
+        self.logger.info("Joining on UDP listener thread")
+        udp_radio_receiver_thread.join()
+        self.logger.info("Joined on both ZMQ and UDP listener threads")
+
+    @wrap_with_logs(target="monitoring_router")
+    def start_udp_listener(self) -> None:
         try:
-            while not exit_event.is_set():
+            while not self.exit_event.is_set():
                 try:
                     data, addr = self.udp_sock.recvfrom(2048)
                     resource_msg = pickle.loads(data)
                     self.logger.debug("Got UDP Message from {}: {}".format(addr, resource_msg))
-                    resource_msgs.put((resource_msg, addr))
+                    self.resource_msgs.put((resource_msg, addr))
                 except socket.timeout:
                     pass

+            self.logger.info("UDP listener draining")
+            last_msg_received_time = time.time()
+            while time.time() - last_msg_received_time < self.atexit_timeout:
+                try:
+                    data, addr = self.udp_sock.recvfrom(2048)
+                    msg = pickle.loads(data)
+                    self.logger.debug("Got UDP Message from {}: {}".format(addr, msg))
+                    self.resource_msgs.put((msg, addr))
+                    last_msg_received_time = time.time()
+                except socket.timeout:
+                    pass
+
+            self.logger.info("UDP listener finishing normally")
+        finally:
+            self.logger.info("UDP listener finished")
+
+    @wrap_with_logs(target="monitoring_router")
+    def start_zmq_listener(self) -> None:
+        try:
+            while not self.exit_event.is_set():
                 try:
                     dfk_loop_start = time.time()
                     while time.time() - dfk_loop_start < 1.0:  # TODO make configurable
@@ -125,15 +173,15 @@ class MonitoringRouter:

                     if msg[0] == MessageType.NODE_INFO:
                         msg[1]['run_id'] = self.run_id
-                        node_msgs.put(msg_0)
+                        self.node_msgs.put(msg_0)
                     elif msg[0] == MessageType.RESOURCE_INFO:
-                        resource_msgs.put(msg_0)
+                        self.resource_msgs.put(msg_0)
                     elif msg[0] == MessageType.BLOCK_INFO:
-                        block_msgs.put(msg_0)
+                        self.block_msgs.put(msg_0)
                     elif msg[0] == MessageType.TASK_INFO:
-                        priority_msgs.put(msg_0)
+                        self.priority_msgs.put(msg_0)
                     elif msg[0] == MessageType.WORKFLOW_INFO:
-                        priority_msgs.put(msg_0)
+                        self.priority_msgs.put(msg_0)
                     else:
                         # There is a type: ignore here because if msg[0]
                         # is of the correct type, this code is unreachable,
@@ -151,21 +199,9 @@ class MonitoringRouter:
                 # thing to do.
                 self.logger.warning("Failure processing a ZMQ message", exc_info=True)

-            self.logger.info("Monitoring router draining")
-            last_msg_received_time = time.time()
-            while time.time() - last_msg_received_time < self.atexit_timeout:
-                try:
-                    data, addr = self.udp_sock.recvfrom(2048)
-                    msg = pickle.loads(data)
-                    self.logger.debug("Got UDP Message from {}: {}".format(addr, msg))
-                    resource_msgs.put((msg, addr))
-                    last_msg_received_time = time.time()
-                except socket.timeout:
-                    pass
-
-            self.logger.info("Monitoring router finishing normally")
+            self.logger.info("ZMQ listener finishing normally")
         finally:
-            self.logger.info("Monitoring router finished")
+            self.logger.info("ZMQ listener finished")


 @wrap_with_logs
@@ -191,7 +227,12 @@ def router_starter(comm_q: "queue.Queue[Union[Tuple[int, int], str]]",
                                  zmq_port_range=zmq_port_range,
                                  logdir=logdir,
                                  logging_level=logging_level,
-                                 run_id=run_id)
+                                 run_id=run_id,
+                                 priority_msgs=priority_msgs,
+                                 node_msgs=node_msgs,
+                                 block_msgs=block_msgs,
+                                 resource_msgs=resource_msgs,
+                                 exit_event=exit_event)
    except Exception as e:
        logger.error("MonitoringRouter construction failed.", exc_info=True)
        comm_q.put(f"Monitoring router construction failed: {e}")
@@ -200,7 +241,7 @@

    router.logger.info("Starting MonitoringRouter in router_starter")
    try:
-       router.start(priority_msgs, node_msgs, block_msgs, resource_msgs, exit_event)
+       router.start()
    except Exception as e:
        router.logger.exception("router.start exception")
        exception_q.put(('Hub', str(e)))
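
The net effect of the router.py changes: the four message queues and the exit event move from start() arguments to constructor arguments, and the single combined listener loop is split into separate UDP and ZMQ daemon threads. A minimal sketch of how the refactored router is now driven, mirroring the wiring router_starter does in the diff above; the address, run_id and queue setup here are illustrative, not taken from parsl internals:

    # Sketch only: wiring up the refactored MonitoringRouter after this change.
    import multiprocessing

    from parsl.monitoring.router import MonitoringRouter

    mp_ctx = multiprocessing.get_context("spawn")
    exit_event = mp_ctx.Event()
    priority_msgs = mp_ctx.Queue()
    node_msgs = mp_ctx.Queue()
    block_msgs = mp_ctx.Queue()
    resource_msgs = mp_ctx.Queue()

    router = MonitoringRouter(hub_address="127.0.0.1",
                              run_id="example-run-id",
                              priority_msgs=priority_msgs,
                              node_msgs=node_msgs,
                              block_msgs=block_msgs,
                              resource_msgs=resource_msgs,
                              exit_event=exit_event)

    # start() now spawns the UDP and ZMQ listeners as daemon threads and joins
    # both, so it blocks until exit_event is set and the atexit drain finishes.
    router.start()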
parsl/tests/test_htex/test_disconnected_blocks_failing_provider.py
ADDED
@@ -0,0 +1,71 @@
+import logging
+
+import pytest
+
+import parsl
+from parsl import Config
+from parsl.executors import HighThroughputExecutor
+from parsl.executors.errors import BadStateException
+from parsl.jobs.states import JobState, JobStatus
+from parsl.providers import LocalProvider
+
+
+class FailingProvider(LocalProvider):
+    def submit(*args, **kwargs):
+        raise RuntimeError("Deliberate failure of provider.submit")
+
+
+def local_config():
+    """Config to simulate failing blocks without connecting"""
+    return Config(
+        executors=[
+            HighThroughputExecutor(
+                label="HTEX",
+                heartbeat_period=1,
+                heartbeat_threshold=2,
+                poll_period=100,
+                max_workers_per_node=1,
+                provider=FailingProvider(
+                    init_blocks=0,
+                    max_blocks=2,
+                    min_blocks=0,
+                ),
+            )
+        ],
+        max_idletime=0.5,
+        strategy='htex_auto_scale',
+        strategy_period=0.1
+        # this strategy period needs to be a few times smaller than the
+        # status_polling_interval of FailingProvider, which is 5s at
+        # time of writing
+    )
+
+
+@parsl.python_app
+def double(x):
+    return x * 2
+
+
+@pytest.mark.local
+def test_disconnected_blocks():
+    """Test reporting of blocks that fail to connect from HTEX"""
+    dfk = parsl.dfk()
+    executor = dfk.executors["HTEX"]
+
+    connected_blocks = executor.connected_blocks()
+    assert not connected_blocks, "Expected 0 blocks"
+
+    future = double(5)
+    with pytest.raises(BadStateException):
+        future.result()
+
+    assert isinstance(future.exception(), BadStateException)
+
+    status_dict = executor.status()
+    assert len(status_dict) == 1, "Expected exactly 1 block"
+    for status in status_dict.values():
+        assert isinstance(status, JobStatus)
+        assert status.state == JobState.MISSING
+
+    connected_blocks = executor.connected_blocks()
+    assert connected_blocks == [], "Expected exactly 0 connected blocks"
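
This new test relies on parsl's pytest harness to pick up local_config() before the test body runs. A rough standalone equivalent using only public parsl API (illustrative only, not part of this diff):

    # Rough standalone equivalent of what the pytest harness does with
    # local_config() above.
    import parsl

    parsl.load(local_config())
    try:
        future = double(5)
        try:
            future.result()
        except Exception as exc:
            # FailingProvider makes every scale-out attempt fail, so the
            # task surfaces a BadStateException instead of a result.
            print(type(exc).__name__)
    finally:
        parsl.dfk().cleanup()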
parsl/tests/test_htex/test_htex.py
CHANGED
@@ -1,6 +1,7 @@
+import logging
 import pathlib
-import warnings
 from subprocess import Popen, TimeoutExpired
+from typing import Optional, Sequence
 from unittest import mock

 import pytest
@@ -71,12 +72,11 @@ def test_htex_start_encrypted(
 @pytest.mark.local
 @pytest.mark.parametrize("started", (True, False))
 @pytest.mark.parametrize("timeout_expires", (True, False))
-@mock.patch(f"{_MOCK_BASE}.logger")
 def test_htex_shutdown(
-    mock_logger: mock.MagicMock,
    started: bool,
    timeout_expires: bool,
    htex: HighThroughputExecutor,
+    caplog
 ):
    mock_ix_proc = mock.Mock(spec=Popen)

@@ -108,22 +108,22 @@ def test_htex_shutdown(

    mock_ix_proc.terminate.side_effect = kill_interchange

-    htex.shutdown()
+    with caplog.at_level(logging.INFO):
+        htex.shutdown()

-    mock_logs = mock_logger.info.call_args_list
    if started:
        assert mock_ix_proc.terminate.called
        assert mock_ix_proc.wait.called
        assert {"timeout": 10} == mock_ix_proc.wait.call_args[1]
        if timeout_expires:
-            assert "Unable to terminate Interchange" in mock_logs[1][0][0]
+            assert "Unable to terminate Interchange" in caplog.text
            assert mock_ix_proc.kill.called
-        assert "Attempting" in mock_logs[0][0][0]
-        assert "Finished" in mock_logs[-1][0][0]
+        assert "Attempting HighThroughputExecutor shutdown" in caplog.text
+        assert "Finished HighThroughputExecutor shutdown" in caplog.text
    else:
        assert not mock_ix_proc.terminate.called
        assert not mock_ix_proc.wait.called
-        assert "has not started" in mock_logs[0][0][0]
+        assert "HighThroughputExecutor has not started" in caplog.text


 @pytest.mark.local
@@ -139,13 +139,22 @@ def test_max_workers_per_node():


 @pytest.mark.local
-def test_htex_launch_cmd():
-    htex = HighThroughputExecutor(launch_cmd="custom-launch-cmd")
-    assert htex.launch_cmd == "custom-launch-cmd"
-
-
-@pytest.mark.local
-def test_htex_launch_cmd_default():
-    htex = HighThroughputExecutor()
-    assert htex.launch_cmd.startswith("process_worker_pool.py")
-
+@pytest.mark.parametrize("cmd", (None, "custom-launch-cmd"))
+def test_htex_worker_pool_launch_cmd(cmd: Optional[str]):
+    if cmd:
+        htex = HighThroughputExecutor(launch_cmd=cmd)
+        assert htex.launch_cmd == cmd
+    else:
+        htex = HighThroughputExecutor()
+        assert htex.launch_cmd.startswith("process_worker_pool.py")
+
+
+@pytest.mark.local
+@pytest.mark.parametrize("cmd", (None, ["custom", "launch", "cmd"]))
+def test_htex_interchange_launch_cmd(cmd: Optional[Sequence[str]]):
+    if cmd:
+        htex = HighThroughputExecutor(interchange_launch_cmd=cmd)
+        assert htex.interchange_launch_cmd == cmd
+    else:
+        htex = HighThroughputExecutor()
+        assert htex.interchange_launch_cmd == ["interchange.py"]
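
The shutdown test above swaps a mock.patch on the module logger for pytest's built-in caplog fixture, which captures records from the real logging tree and exposes them as caplog.text. The technique in isolation (generic pytest, nothing parsl-specific; names are illustrative):

    # Generic illustration of the caplog technique adopted above: assert on
    # captured log text instead of a mocked logger's call list.
    import logging

    logger = logging.getLogger(__name__)


    def do_shutdown():
        logger.info("Attempting shutdown")
        logger.info("Finished shutdown")


    def test_do_shutdown_logs(caplog):
        with caplog.at_level(logging.INFO):
            do_shutdown()
        assert "Attempting shutdown" in caplog.text
        assert "Finished shutdown" in caplog.text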
parsl/tests/test_htex/test_zmq_binding.py
CHANGED
@@ -9,6 +9,7 @@ import zmq

 from parsl import curvezmq
 from parsl.executors.high_throughput.interchange import Interchange
+from parsl.executors.high_throughput.manager_selector import RandomManagerSelector


 def make_interchange(*, interchange_address: Optional[str], cert_dir: Optional[str]) -> Interchange:
@@ -23,6 +24,7 @@ def make_interchange(*, interchange_address: Optional[str], cert_dir: Optional[s
                        heartbeat_threshold=60,
                        logdir=".",
                        logging_level=logging.INFO,
+                       manager_selector=RandomManagerSelector(),
                        poll_period=10)
parsl/tests/test_monitoring/test_basic.py
CHANGED
@@ -25,10 +25,23 @@ def this_app():
 # a configuration that is suitably configured for monitoring.

 def htex_config():
+    """This config will use htex's default htex-specific monitoring radio mode"""
    from parsl.tests.configs.htex_local_alternate import fresh_config
    return fresh_config()


+def htex_udp_config():
+    """This config will force UDP"""
+    from parsl.tests.configs.htex_local_alternate import fresh_config
+    c = fresh_config()
+    assert len(c.executors) == 1
+
+    assert c.executors[0].radio_mode == "htex", "precondition: htex has a radio mode attribute, configured for htex radio"
+    c.executors[0].radio_mode = "udp"
+
+    return c
+
+
 def workqueue_config():
    from parsl.tests.configs.workqueue_ex import fresh_config
    c = fresh_config()
@@ -48,7 +61,7 @@ def taskvine_config():


 @pytest.mark.local
-@pytest.mark.parametrize("fresh_config", [htex_config, workqueue_config, taskvine_config])
+@pytest.mark.parametrize("fresh_config", [htex_config, htex_udp_config, workqueue_config, taskvine_config])
 def test_row_counts(tmpd_cwd, fresh_config):
    # this is imported here rather than at module level because
    # it isn't available in a plain parsl install, so this module
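
The new htex_udp_config exercises the same path a user takes when overriding an executor's radio_mode attribute. A minimal sketch of that override in a user config, mirroring the test (fresh_config comes from parsl's test configs; everything else is standard parsl configuration):

    # Minimal sketch mirroring htex_udp_config above: take an existing config
    # and force the UDP monitoring radio in place of the htex default.
    from parsl.tests.configs.htex_local_alternate import fresh_config

    config = fresh_config()
    config.executors[0].radio_mode = "udp"  # htex executors default to "htex"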
parsl/tests/test_mpi_apps/test_mpiex.py
CHANGED
@@ -44,7 +44,7 @@ def test_init():

    new_kwargs = {'max_workers_per_block'}
    excluded_kwargs = {'available_accelerators', 'enable_mpi_mode', 'cores_per_worker', 'max_workers_per_node',
-                       'mem_per_worker', 'cpu_affinity', 'max_workers'}
+                       'mem_per_worker', 'cpu_affinity', 'max_workers', 'manager_selector'}

    # Get the kwargs from both HTEx and MPIEx
    htex_kwargs = set(signature(HighThroughputExecutor.__init__).parameters)
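
The new manager_selector kwarg is deliberately excluded from MPIExecutor, and test_init enforces that by diffing constructor signatures with inspect.signature. The underlying technique in a self-contained form (Base/Derived are placeholders, not parsl classes):

    # Self-contained illustration of the signature-diff technique used by
    # test_init above.
    from inspect import signature

    class Base:
        def __init__(self, cores: int = 1, legacy_opt: bool = False):
            pass

    class Derived(Base):
        def __init__(self, cores: int = 1, new_opt: bool = True):
            super().__init__(cores=cores)

    base_kwargs = set(signature(Base.__init__).parameters)
    derived_kwargs = set(signature(Derived.__init__).parameters)

    assert base_kwargs - derived_kwargs == {'legacy_opt'}  # excluded kwargs
    assert derived_kwargs - base_kwargs == {'new_opt'}     # new kwargs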
parsl/executors/high_throughput/interchange.py
CHANGED
@@ -6,7 +6,6 @@ import os
 import pickle
 import platform
 import queue
-import random
 import signal
 import sys
 import threading
@@ -19,7 +18,9 @@ from parsl import curvezmq
 from parsl.app.errors import RemoteExceptionWrapper
 from parsl.executors.high_throughput.errors import ManagerLost, VersionMismatch
 from parsl.executors.high_throughput.manager_record import ManagerRecord
+from parsl.executors.high_throughput.manager_selector import ManagerSelector
 from parsl.monitoring.message_type import MessageType
+from parsl.monitoring.radios import MonitoringRadioSender, ZMQRadioSender
 from parsl.process_loggers import wrap_with_logs
 from parsl.serialize import serialize as serialize_object
 from parsl.utils import setproctitle
@@ -53,6 +54,7 @@ class Interchange:
                 logging_level: int,
                 poll_period: int,
                 cert_dir: Optional[str],
+                 manager_selector: ManagerSelector,
                 ) -> None:
        """
        Parameters
@@ -160,6 +162,8 @@ class Interchange:

        self.heartbeat_threshold = heartbeat_threshold

+        self.manager_selector = manager_selector
+
        self.current_platform = {'parsl_v': PARSL_VERSION,
                                 'python_v': "{}.{}.{}".format(sys.version_info.major,
                                                               sys.version_info.minor,
@@ -216,27 +220,15 @@ class Interchange:
                task_counter += 1
                logger.debug(f"Fetched {task_counter} tasks so far")

-    def _create_monitoring_channel(self) -> Optional[zmq.Socket]:
-        if self.hub_address and self.hub_zmq_port:
-            logger.info("Connecting to MonitoringHub")
-            # This is a one-off because monitoring is unencrypted
-            hub_channel = zmq.Context().socket(zmq.DEALER)
-            hub_channel.set_hwm(0)
-            hub_channel.connect("tcp://{}:{}".format(self.hub_address, self.hub_zmq_port))
-            logger.info("Connected to MonitoringHub")
-            return hub_channel
-        else:
-            return None
-
-    def _send_monitoring_info(self, hub_channel: Optional[zmq.Socket], manager: ManagerRecord) -> None:
-        if hub_channel:
+    def _send_monitoring_info(self, monitoring_radio: Optional[MonitoringRadioSender], manager: ManagerRecord) -> None:
+        if monitoring_radio:
            logger.info("Sending message {} to MonitoringHub".format(manager))

            d: Dict = cast(Dict, manager.copy())
            d['timestamp'] = datetime.datetime.now()
            d['last_heartbeat'] = datetime.datetime.fromtimestamp(d['last_heartbeat'])

-            hub_channel.send_pyobj((MessageType.NODE_INFO, d))
+            monitoring_radio.send((MessageType.NODE_INFO, d))

    @wrap_with_logs(target="interchange")
    def _command_server(self) -> NoReturn:
@@ -244,8 +236,11 @@ class Interchange:
        """
        logger.debug("Command Server Starting")

-        # Need to create a new ZMQ socket for command server thread
-        hub_channel = self._create_monitoring_channel()
+        if self.hub_address is not None and self.hub_zmq_port is not None:
+            logger.debug("Creating monitoring radio to %s:%s", self.hub_address, self.hub_zmq_port)
+            monitoring_radio = ZMQRadioSender(self.hub_address, self.hub_zmq_port)
+        else:
+            monitoring_radio = None

        reply: Any  # the type of reply depends on the command_req received (aka this needs dependent types...)

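
The deleted _create_monitoring_channel shows exactly what the new ZMQRadioSender must wrap. parsl/monitoring/radios.py gains 16 lines in this release but is not shown in this view, so the following is a plausible sketch inferred from the removed hub_channel code, not from radios.py itself:

    # Plausible sketch of ZMQRadioSender, inferred from the hub_channel code
    # deleted above; in parsl this implements MonitoringRadioSender.
    import zmq

    class ZMQRadioSender:
        def __init__(self, hub_address: str, hub_zmq_port: int) -> None:
            # Mirrors the deleted _create_monitoring_channel: a DEALER socket
            # with no high-water mark, connected to the monitoring hub
            # (monitoring traffic is unencrypted).
            self._hub_channel = zmq.Context().socket(zmq.DEALER)
            self._hub_channel.set_hwm(0)
            self._hub_channel.connect(f"tcp://{hub_address}:{hub_zmq_port}")

        def send(self, message: object) -> None:
            self._hub_channel.send_pyobj(message)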
@@ -295,7 +290,7 @@ class Interchange:
            if manager_id in self._ready_managers:
                m = self._ready_managers[manager_id]
                m['active'] = False
-                self._send_monitoring_info(hub_channel, m)
+                self._send_monitoring_info(monitoring_radio, m)
            else:
                logger.warning("Worker to hold was not in ready managers list")

@@ -330,9 +325,14 @@ class Interchange:
        # parent-process-inheritance problems.
        signal.signal(signal.SIGTERM, signal.SIG_DFL)

-        logger.info("Incoming ports bound")
+        logger.info("Starting main interchange method")

-        hub_channel = self._create_monitoring_channel()
+        if self.hub_address is not None and self.hub_zmq_port is not None:
+            logger.debug("Creating monitoring radio to %s:%s", self.hub_address, self.hub_zmq_port)
+            monitoring_radio = ZMQRadioSender(self.hub_address, self.hub_zmq_port)
+            logger.debug("Created monitoring radio")
+        else:
+            monitoring_radio = None

        poll_period = self.poll_period

@@ -363,10 +363,10 @@ class Interchange:
        while not kill_event.is_set():
            self.socks = dict(poller.poll(timeout=poll_period))

-            self.process_task_outgoing_incoming(interesting_managers, hub_channel, kill_event)
-            self.process_results_incoming(interesting_managers, hub_channel)
-            self.expire_bad_managers(interesting_managers, hub_channel)
-            self.expire_drained_managers(interesting_managers, hub_channel)
+            self.process_task_outgoing_incoming(interesting_managers, monitoring_radio, kill_event)
+            self.process_results_incoming(interesting_managers, monitoring_radio)
+            self.expire_bad_managers(interesting_managers, monitoring_radio)
+            self.expire_drained_managers(interesting_managers, monitoring_radio)
            self.process_tasks_to_send(interesting_managers)

        self.zmq_context.destroy()
@@ -377,7 +377,7 @@ class Interchange:
    def process_task_outgoing_incoming(
            self,
            interesting_managers: Set[bytes],
-            hub_channel: Optional[zmq.Socket],
+            monitoring_radio: Optional[MonitoringRadioSender],
            kill_event: threading.Event
    ) -> None:
        """Process one message from manager on the task_outgoing channel.
@@ -431,7 +431,7 @@ class Interchange:
                    m.update(msg)  # type: ignore[typeddict-item]

                    logger.info("Registration info for manager {!r}: {}".format(manager_id, msg))
-                    self._send_monitoring_info(hub_channel, m)
+                    self._send_monitoring_info(monitoring_radio, m)

                    if (msg['python_v'].rsplit(".", 1)[0] != self.current_platform['python_v'].rsplit(".", 1)[0] or
                        msg['parsl_v'] != self.current_platform['parsl_v']):
@@ -462,7 +462,7 @@ class Interchange:
            logger.error(f"Unexpected message type received from manager: {msg['type']}")
        logger.debug("leaving task_outgoing section")

-    def expire_drained_managers(self, interesting_managers: Set[bytes], hub_channel: Optional[zmq.Socket]) -> None:
+    def expire_drained_managers(self, interesting_managers: Set[bytes], monitoring_radio: Optional[MonitoringRadioSender]) -> None:

        for manager_id in list(interesting_managers):
            # is it always true that a draining manager will be in interesting managers?
@@ -475,7 +475,7 @@ class Interchange:
                self._ready_managers.pop(manager_id)

                m['active'] = False
-                self._send_monitoring_info(hub_channel, m)
+                self._send_monitoring_info(monitoring_radio, m)

    def process_tasks_to_send(self, interesting_managers: Set[bytes]) -> None:
        # Check if there are tasks that could be sent to managers
@@ -485,8 +485,7 @@ class Interchange:
                         interesting=len(interesting_managers)))

        if interesting_managers and not self.pending_task_queue.empty():
-            shuffled_managers = list(interesting_managers)
-            random.shuffle(shuffled_managers)
+            shuffled_managers = self.manager_selector.sort_managers(self._ready_managers, interesting_managers)

            while shuffled_managers and not self.pending_task_queue.empty():  # cf. the if statement above...
                manager_id = shuffled_managers.pop()
@@ -519,7 +518,7 @@ class Interchange:
            else:
                logger.debug("either no interesting managers or no tasks, so skipping manager pass")

-    def process_results_incoming(self, interesting_managers: Set[bytes], hub_channel: Optional[zmq.Socket]) -> None:
+    def process_results_incoming(self, interesting_managers: Set[bytes], monitoring_radio: Optional[MonitoringRadioSender]) -> None:
        # Receive any results and forward to client
        if self.results_incoming in self.socks and self.socks[self.results_incoming] == zmq.POLLIN:
            logger.debug("entering results_incoming section")
@@ -539,11 +538,11 @@ class Interchange:
                elif r['type'] == 'monitoring':
                    # the monitoring code makes the assumption that no
                    # monitoring messages will be received if monitoring
-                    # is not configured, and that hub_channel will only
+                    # is not configured, and that monitoring_radio will only
                    # be None when monitoring is not configurated.
-                    assert hub_channel is not None
+                    assert monitoring_radio is not None

-                    hub_channel.send_pyobj(r['payload'])
+                    monitoring_radio.send(r['payload'])
                elif r['type'] == 'heartbeat':
                    logger.debug(f"Manager {manager_id!r} sent heartbeat via results connection")
                    b_messages.append((p_message, r))
@@ -587,7 +586,7 @@ class Interchange:
                    interesting_managers.add(manager_id)
            logger.debug("leaving results_incoming section")

-    def expire_bad_managers(self, interesting_managers: Set[bytes], hub_channel: Optional[zmq.Socket]) -> None:
+    def expire_bad_managers(self, interesting_managers: Set[bytes], monitoring_radio: Optional[MonitoringRadioSender]) -> None:
        bad_managers = [(manager_id, m) for (manager_id, m) in self._ready_managers.items() if
                        time.time() - m['last_heartbeat'] > self.heartbeat_threshold]
        for (manager_id, m) in bad_managers:
@@ -595,7 +594,7 @@ class Interchange:
            logger.warning(f"Too many heartbeats missed for manager {manager_id!r} - removing manager")
            if m['active']:
                m['active'] = False
-                self._send_monitoring_info(hub_channel, m)
+                self._send_monitoring_info(monitoring_radio, m)

            logger.warning(f"Cancelling htex tasks {m['tasks']} on removed manager")
            for tid in m['tasks']: