parsl 2024.7.29__py3-none-any.whl → 2024.8.12__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- parsl/channels/__init__.py +1 -4
- parsl/channels/oauth_ssh/oauth_ssh.py +2 -2
- parsl/channels/ssh/ssh.py +1 -1
- parsl/channels/ssh_il/ssh_il.py +2 -2
- parsl/dataflow/dflow.py +2 -2
- parsl/executors/base.py +7 -7
- parsl/executors/high_throughput/executor.py +15 -7
- parsl/executors/high_throughput/interchange.py +40 -37
- parsl/executors/high_throughput/manager_selector.py +25 -0
- parsl/executors/status_handling.py +38 -24
- parsl/executors/taskvine/executor.py +2 -0
- parsl/executors/workqueue/executor.py +2 -0
- parsl/monitoring/db_manager.py +10 -10
- parsl/monitoring/errors.py +6 -0
- parsl/monitoring/monitoring.py +3 -3
- parsl/monitoring/radios.py +16 -0
- parsl/monitoring/remote.py +4 -4
- parsl/monitoring/router.py +71 -35
- parsl/providers/__init__.py +0 -4
- parsl/providers/ad_hoc/ad_hoc.py +6 -2
- parsl/tests/configs/local_adhoc.py +2 -2
- parsl/tests/test_htex/test_disconnected_blocks_failing_provider.py +71 -0
- parsl/tests/test_htex/test_htex.py +28 -19
- parsl/tests/test_htex/test_zmq_binding.py +4 -1
- parsl/tests/test_monitoring/test_basic.py +14 -1
- parsl/tests/test_mpi_apps/test_mpiex.py +1 -1
- parsl/tests/test_providers/test_local_provider.py +6 -5
- parsl/version.py +1 -1
- {parsl-2024.7.29.data → parsl-2024.8.12.data}/scripts/interchange.py +40 -37
- parsl-2024.8.12.dist-info/METADATA +101 -0
- {parsl-2024.7.29.dist-info → parsl-2024.8.12.dist-info}/RECORD +38 -46
- {parsl-2024.7.29.dist-info → parsl-2024.8.12.dist-info}/WHEEL +1 -1
- parsl/configs/ad_hoc.py +0 -38
- parsl/tests/configs/ad_hoc_cluster_htex.py +0 -35
- parsl/tests/configs/htex_ad_hoc_cluster.py +0 -26
- parsl/tests/configs/swan_htex.py +0 -43
- parsl/tests/integration/test_channels/test_scp_1.py +0 -45
- parsl/tests/integration/test_channels/test_ssh_1.py +0 -40
- parsl/tests/integration/test_channels/test_ssh_errors.py +0 -46
- parsl/tests/integration/test_channels/test_ssh_file_transport.py +0 -41
- parsl/tests/integration/test_channels/test_ssh_interactive.py +0 -24
- parsl/tests/manual_tests/test_ad_hoc_htex.py +0 -49
- parsl/tests/manual_tests/test_oauth_ssh.py +0 -13
- parsl-2024.7.29.dist-info/METADATA +0 -101
- {parsl-2024.7.29.data → parsl-2024.8.12.data}/scripts/exec_parsl_function.py +0 -0
- {parsl-2024.7.29.data → parsl-2024.8.12.data}/scripts/parsl_coprocess.py +0 -0
- {parsl-2024.7.29.data → parsl-2024.8.12.data}/scripts/process_worker_pool.py +0 -0
- {parsl-2024.7.29.dist-info → parsl-2024.8.12.dist-info}/LICENSE +0 -0
- {parsl-2024.7.29.dist-info → parsl-2024.8.12.dist-info}/entry_points.txt +0 -0
- {parsl-2024.7.29.dist-info → parsl-2024.8.12.dist-info}/top_level.txt +0 -0
parsl/monitoring/monitoring.py
CHANGED
@@ -12,6 +12,7 @@ from typing import TYPE_CHECKING, Any, Optional, Tuple, Union, cast
 import typeguard
 
 from parsl.log_utils import set_file_logger
+from parsl.monitoring.errors import MonitoringHubStartError
 from parsl.monitoring.message_type import MessageType
 from parsl.monitoring.radios import MultiprocessingQueueRadioSender
 from parsl.monitoring.router import router_starter
@@ -105,7 +106,7 @@ class MonitoringHub(RepresentationMixin):
         self.resource_monitoring_enabled = resource_monitoring_enabled
         self.resource_monitoring_interval = resource_monitoring_interval
 
-    def start(self, run_id: str, dfk_run_dir: str, config_run_dir: Union[str, os.PathLike]) -> None:
+    def start(self, dfk_run_dir: str, config_run_dir: Union[str, os.PathLike]) -> None:
 
         logger.debug("Starting MonitoringHub")
 
@@ -160,7 +161,6 @@ class MonitoringHub(RepresentationMixin):
                 "zmq_port_range": self.hub_port_range,
                 "logdir": self.logdir,
                 "logging_level": logging.DEBUG if self.monitoring_debug else logging.INFO,
-                "run_id": run_id
             },
             name="Monitoring-Router-Process",
             daemon=True,
@@ -195,7 +195,7 @@ class MonitoringHub(RepresentationMixin):
             comm_q.join_thread()
         except queue.Empty:
             logger.error("Hub has not completed initialization in 120s. Aborting")
-            raise
+            raise MonitoringHubStartError()
 
         if isinstance(comm_q_result, str):
             logger.error(f"MonitoringRouter sent an error message: {comm_q_result}")
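A note on the monitoring.py change above: a hub that fails to initialize within 120s now raises the dedicated MonitoringHubStartError (from the new parsl/monitoring/errors.py) instead of re-raising queue.Empty. A minimal sketch of catching it, where the hub object and run-directory arguments are illustrative placeholders rather than parsl's real call site:

    from parsl.monitoring.errors import MonitoringHubStartError

    try:
        # dfk_run_dir / config_run_dir values here are made up
        hub.start(dfk_run_dir="runinfo/000", config_run_dir="runinfo/000")
    except MonitoringHubStartError:
        # startup failed or timed out; fail the workflow cleanly
        raise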
parsl/monitoring/radios.py
CHANGED
@@ -7,6 +7,8 @@ from abc import ABCMeta, abstractmethod
 from multiprocessing.queues import Queue
 from typing import Optional
 
+import zmq
+
 from parsl.serialize import serialize
 
 _db_manager_excepts: Optional[Exception]
@@ -186,3 +188,17 @@ class MultiprocessingQueueRadioSender(MonitoringRadioSender):
 
     def send(self, message: object) -> None:
         self.queue.put((message, 0))
+
+
+class ZMQRadioSender(MonitoringRadioSender):
+    """A monitoring radio which connects over ZMQ. This radio is not
+    thread-safe, because its use of ZMQ is not thread-safe.
+    """
+
+    def __init__(self, hub_address: str, hub_zmq_port: int) -> None:
+        self._hub_channel = zmq.Context().socket(zmq.DEALER)
+        self._hub_channel.set_hwm(0)
+        self._hub_channel.connect(f"tcp://{hub_address}:{hub_zmq_port}")
+
+    def send(self, message: object) -> None:
+        self._hub_channel.send_pyobj(message)
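A minimal usage sketch for the new ZMQRadioSender, assuming a monitoring hub is already listening at the given address and port (both values and the message payload below are made up; per the docstring, keep each sender on a single thread):

    from parsl.monitoring.radios import ZMQRadioSender

    sender = ZMQRadioSender(hub_address="127.0.0.1", hub_zmq_port=55055)
    sender.send(("example-message", {"detail": 1}))  # any picklable object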
parsl/monitoring/remote.py
CHANGED
@@ -199,10 +199,10 @@ def monitor(pid: int,
 
     pm = psutil.Process(pid)
 
-    children_user_time = {}  # type: Dict[int, float]
-    children_system_time = {}  # type: Dict[int, float]
-    children_num_ctx_switches_voluntary = {}  # type: Dict[int, float]
-    children_num_ctx_switches_involuntary = {}  # type: Dict[int, float]
+    children_user_time: Dict[int, float] = {}
+    children_system_time: Dict[int, float] = {}
+    children_num_ctx_switches_voluntary: Dict[int, float] = {}
+    children_num_ctx_switches_involuntary: Dict[int, float] = {}
 
     def accumulate_and_prepare() -> Dict[str, Any]:
         d = {"psutil_process_" + str(k): v for k, v in pm.as_dict().items() if k in simple}
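The remote.py change converts type comments into PEP 526 variable annotations: both tell a type checker the element types of an empty dict literal that it cannot otherwise infer, but annotations are real syntax rather than comments. A standalone illustration (not parsl code):

    from typing import Dict

    # old style, a comment the parser never checks:
    #     children_user_time = {}  # type: Dict[int, float]
    # new style, equivalent for mypy but checked as syntax:
    children_user_time: Dict[int, float] = {}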
parsl/monitoring/router.py
CHANGED
@@ -5,6 +5,7 @@ import os
 import pickle
 import queue
 import socket
+import threading
 import time
 from multiprocessing.synchronize import Event
 from typing import Optional, Tuple, Union
@@ -30,9 +31,13 @@ class MonitoringRouter:
 
                 monitoring_hub_address: str = "127.0.0.1",
                 logdir: str = ".",
-                run_id: str,
                 logging_level: int = logging.INFO,
-                atexit_timeout: int = 3
+                atexit_timeout: int = 3,  # in seconds
+                priority_msgs: "queue.Queue[AddressedMonitoringMessage]",
+                node_msgs: "queue.Queue[AddressedMonitoringMessage]",
+                block_msgs: "queue.Queue[AddressedMonitoringMessage]",
+                resource_msgs: "queue.Queue[AddressedMonitoringMessage]",
+                exit_event: Event,
                 ):
        """ Initializes a monitoring configuration class.
 
@@ -51,7 +56,11 @@ class MonitoringRouter:
            Logging level as defined in the logging module. Default: logging.INFO
        atexit_timeout : float, optional
            The amount of time in seconds to terminate the hub without receiving any messages, after the last dfk workflow message is received.
+       *_msgs : Queue
+           Four multiprocessing queues to receive messages, routed by type tag, and sometimes modified according to type tag.
 
+       exit_event : Event
+           An event that the main Parsl process will set to signal that the monitoring router should shut down.
        """
        os.makedirs(logdir, exist_ok=True)
        self.logger = set_file_logger("{}/monitoring_router.log".format(logdir),
@@ -61,7 +70,6 @@ class MonitoringRouter:
 
        self.hub_address = hub_address
        self.atexit_timeout = atexit_timeout
-       self.run_id = run_id
 
        self.loop_freq = 10.0  # milliseconds
 
@@ -93,22 +101,60 @@ class MonitoringRouter:
                                               min_port=zmq_port_range[0],
                                               max_port=zmq_port_range[1])
 
-    def start(self,
-              priority_msgs: "queue.Queue[AddressedMonitoringMessage]",
-              node_msgs: "queue.Queue[AddressedMonitoringMessage]",
-              block_msgs: "queue.Queue[AddressedMonitoringMessage]",
-              resource_msgs: "queue.Queue[AddressedMonitoringMessage]",
-              exit_event: Event) -> None:
+        self.priority_msgs = priority_msgs
+        self.node_msgs = node_msgs
+        self.block_msgs = block_msgs
+        self.resource_msgs = resource_msgs
+        self.exit_event = exit_event
+
+    @wrap_with_logs(target="monitoring_router")
+    def start(self) -> None:
+        self.logger.info("Starting UDP listener thread")
+        udp_radio_receiver_thread = threading.Thread(target=self.start_udp_listener, daemon=True)
+        udp_radio_receiver_thread.start()
+
+        self.logger.info("Starting ZMQ listener thread")
+        zmq_radio_receiver_thread = threading.Thread(target=self.start_zmq_listener, daemon=True)
+        zmq_radio_receiver_thread.start()
+
+        self.logger.info("Joining on ZMQ listener thread")
+        zmq_radio_receiver_thread.join()
+        self.logger.info("Joining on UDP listener thread")
+        udp_radio_receiver_thread.join()
+        self.logger.info("Joined on both ZMQ and UDP listener threads")
+
+    @wrap_with_logs(target="monitoring_router")
+    def start_udp_listener(self) -> None:
        try:
-            while not exit_event.is_set():
+            while not self.exit_event.is_set():
                try:
                    data, addr = self.udp_sock.recvfrom(2048)
                    resource_msg = pickle.loads(data)
                    self.logger.debug("Got UDP Message from {}: {}".format(addr, resource_msg))
-                    resource_msgs.put((resource_msg, addr))
+                    self.resource_msgs.put((resource_msg, addr))
                except socket.timeout:
                    pass
 
+            self.logger.info("UDP listener draining")
+            last_msg_received_time = time.time()
+            while time.time() - last_msg_received_time < self.atexit_timeout:
+                try:
+                    data, addr = self.udp_sock.recvfrom(2048)
+                    msg = pickle.loads(data)
+                    self.logger.debug("Got UDP Message from {}: {}".format(addr, msg))
+                    self.resource_msgs.put((msg, addr))
+                    last_msg_received_time = time.time()
+                except socket.timeout:
+                    pass
+
+            self.logger.info("UDP listener finishing normally")
+        finally:
+            self.logger.info("UDP listener finished")
+
+    @wrap_with_logs(target="monitoring_router")
+    def start_zmq_listener(self) -> None:
+        try:
+            while not self.exit_event.is_set():
                try:
                    dfk_loop_start = time.time()
                    while time.time() - dfk_loop_start < 1.0:  # TODO make configurable
@@ -124,16 +170,15 @@ class MonitoringRouter:
                        msg_0 = (msg, 0)
 
                        if msg[0] == MessageType.NODE_INFO:
-                            msg[1]['run_id'] = self.run_id
-                            node_msgs.put(msg_0)
+                            self.node_msgs.put(msg_0)
                        elif msg[0] == MessageType.RESOURCE_INFO:
-                            resource_msgs.put(msg_0)
+                            self.resource_msgs.put(msg_0)
                        elif msg[0] == MessageType.BLOCK_INFO:
-                            block_msgs.put(msg_0)
+                            self.block_msgs.put(msg_0)
                        elif msg[0] == MessageType.TASK_INFO:
-                            priority_msgs.put(msg_0)
+                            self.priority_msgs.put(msg_0)
                        elif msg[0] == MessageType.WORKFLOW_INFO:
-                            priority_msgs.put(msg_0)
+                            self.priority_msgs.put(msg_0)
                        else:
                            # There is a type: ignore here because if msg[0]
                            # is of the correct type, this code is unreachable,
@@ -151,21 +196,9 @@ class MonitoringRouter:
                # thing to do.
                self.logger.warning("Failure processing a ZMQ message", exc_info=True)
 
-            self.logger.info("Monitoring router draining")
-            last_msg_received_time = time.time()
-            while time.time() - last_msg_received_time < self.atexit_timeout:
-                try:
-                    data, addr = self.udp_sock.recvfrom(2048)
-                    msg = pickle.loads(data)
-                    self.logger.debug("Got UDP Message from {}: {}".format(addr, msg))
-                    resource_msgs.put((msg, addr))
-                    last_msg_received_time = time.time()
-                except socket.timeout:
-                    pass
-
-            self.logger.info("Monitoring router finishing normally")
+            self.logger.info("ZMQ listener finishing normally")
        finally:
-            self.logger.info("Monitoring router finished")
+            self.logger.info("ZMQ listener finished")
 
 
 @wrap_with_logs
@@ -182,8 +215,7 @@ def router_starter(comm_q: "queue.Queue[Union[Tuple[int, int], str]]",
                   zmq_port_range: Tuple[int, int],
 
                   logdir: str,
-                   logging_level: int,
-                   run_id: str) -> None:
+                   logging_level: int) -> None:
    setproctitle("parsl: monitoring router")
    try:
        router = MonitoringRouter(hub_address=hub_address,
@@ -191,7 +223,11 @@ def router_starter(comm_q: "queue.Queue[Union[Tuple[int, int], str]]",
                                  zmq_port_range=zmq_port_range,
                                  logdir=logdir,
                                  logging_level=logging_level,
-                                  run_id=run_id)
+                                  priority_msgs=priority_msgs,
+                                  node_msgs=node_msgs,
+                                  block_msgs=block_msgs,
+                                  resource_msgs=resource_msgs,
+                                  exit_event=exit_event)
    except Exception as e:
        logger.error("MonitoringRouter construction failed.", exc_info=True)
        comm_q.put(f"Monitoring router construction failed: {e}")
@@ -200,7 +236,7 @@ def router_starter(comm_q: "queue.Queue[Union[Tuple[int, int], str]]",
 
    router.logger.info("Starting MonitoringRouter in router_starter")
    try:
-        router.start(priority_msgs, node_msgs, block_msgs, resource_msgs, exit_event)
+        router.start()
    except Exception as e:
        router.logger.exception("router.start exception")
        exception_q.put(('Hub', str(e)))
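The structural change in router.py is that one loop which interleaved UDP receives, ZMQ receives, and a final UDP drain becomes two daemon listener threads coordinated by start(). A stripped-down sketch of that pattern with no parsl-specific assumptions (all names here are illustrative):

    import threading

    class TwoListenerRouter:
        def __init__(self, exit_event: threading.Event) -> None:
            self.exit_event = exit_event

        def start(self) -> None:
            # daemon threads cannot keep the process alive on their own
            udp_t = threading.Thread(target=self.listen_udp, daemon=True)
            zmq_t = threading.Thread(target=self.listen_zmq, daemon=True)
            udp_t.start()
            zmq_t.start()
            # start() returns only once both listeners have seen exit_event
            zmq_t.join()
            udp_t.join()

        def listen_udp(self) -> None:
            while not self.exit_event.is_set():
                self.exit_event.wait(0.1)  # real code: recv with timeout, route message

        def listen_zmq(self) -> None:
            while not self.exit_event.is_set():
                self.exit_event.wait(0.1)  # real code: poll ZMQ socket, route message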
parsl/providers/__init__.py
CHANGED
@@ -1,6 +1,3 @@
-# Workstation Provider
-from parsl.providers.ad_hoc.ad_hoc import AdHocProvider
-
 # Cloud Providers
 from parsl.providers.aws.aws import AWSProvider
 from parsl.providers.azure.azure import AzureProvider
@@ -24,7 +21,6 @@ __all__ = ['LocalProvider',
            'SlurmProvider',
            'TorqueProvider',
            'LSFProvider',
-           'AdHocProvider',
            'PBSProProvider',
            'AWSProvider',
            'GoogleCloudProvider',
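Because parsl.providers no longer re-exports the ad-hoc provider, any remaining caller must import the renamed class by its full module path, exactly as the local_adhoc.py test config later in this diff does:

    from parsl.providers.ad_hoc.ad_hoc import DeprecatedAdHocProvider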
parsl/providers/ad_hoc/ad_hoc.py
CHANGED
@@ -12,8 +12,12 @@ from parsl.utils import RepresentationMixin
 logger = logging.getLogger(__name__)
 
 
-class AdHocProvider(ExecutionProvider, RepresentationMixin):
-    """ Ad-hoc execution provider
+class DeprecatedAdHocProvider(ExecutionProvider, RepresentationMixin):
+    """ Deprecated ad-hoc execution provider
+
+    The (former) AdHocProvider is deprecated. See
+    `issue #3515 <https://github.com/Parsl/parsl/issues/3515>`_
+    for further discussion.
 
     This provider is used to provision execution resources over one or more ad hoc nodes
     that are each accessible over a Channel (say, ssh) but otherwise lack a cluster scheduler.
parsl/tests/configs/local_adhoc.py
CHANGED
@@ -1,7 +1,7 @@
 from parsl.channels import LocalChannel
 from parsl.config import Config
 from parsl.executors import HighThroughputExecutor
-from parsl.providers import AdHocProvider
+from parsl.providers.ad_hoc.ad_hoc import DeprecatedAdHocProvider
 
 
 def fresh_config():
@@ -10,7 +10,7 @@ def fresh_config():
         HighThroughputExecutor(
             label='AdHoc',
             encrypted=True,
-            provider=AdHocProvider(
+            provider=DeprecatedAdHocProvider(
                 channels=[LocalChannel(), LocalChannel()]
             )
         )
parsl/tests/test_htex/test_disconnected_blocks_failing_provider.py
ADDED
@@ -0,0 +1,71 @@
+import logging
+
+import pytest
+
+import parsl
+from parsl import Config
+from parsl.executors import HighThroughputExecutor
+from parsl.executors.errors import BadStateException
+from parsl.jobs.states import JobState, JobStatus
+from parsl.providers import LocalProvider
+
+
+class FailingProvider(LocalProvider):
+    def submit(*args, **kwargs):
+        raise RuntimeError("Deliberate failure of provider.submit")
+
+
+def local_config():
+    """Config to simulate failing blocks without connecting"""
+    return Config(
+        executors=[
+            HighThroughputExecutor(
+                label="HTEX",
+                heartbeat_period=1,
+                heartbeat_threshold=2,
+                poll_period=100,
+                max_workers_per_node=1,
+                provider=FailingProvider(
+                    init_blocks=0,
+                    max_blocks=2,
+                    min_blocks=0,
+                ),
+            )
+        ],
+        max_idletime=0.5,
+        strategy='htex_auto_scale',
+        strategy_period=0.1
+        # this strategy period needs to be a few times smaller than the
+        # status_polling_interval of FailingProvider, which is 5s at
+        # time of writing
+    )
+
+
+@parsl.python_app
+def double(x):
+    return x * 2
+
+
+@pytest.mark.local
+def test_disconnected_blocks():
+    """Test reporting of blocks that fail to connect from HTEX"""
+    dfk = parsl.dfk()
+    executor = dfk.executors["HTEX"]
+
+    connected_blocks = executor.connected_blocks()
+    assert not connected_blocks, "Expected 0 blocks"
+
+    future = double(5)
+    with pytest.raises(BadStateException):
+        future.result()
+
+    assert isinstance(future.exception(), BadStateException)
+
+    status_dict = executor.status()
+    assert len(status_dict) == 1, "Expected exactly 1 block"
+    for status in status_dict.values():
+        assert isinstance(status, JobStatus)
+        assert status.state == JobState.MISSING
+
+    connected_blocks = executor.connected_blocks()
+    assert connected_blocks == [], "Expected exactly 0 connected blocks"
parsl/tests/test_htex/test_htex.py
CHANGED
@@ -1,6 +1,7 @@
+import logging
 import pathlib
-import warnings
 from subprocess import Popen, TimeoutExpired
+from typing import Optional, Sequence
 from unittest import mock
 
 import pytest
@@ -71,12 +72,11 @@ def test_htex_start_encrypted(
 @pytest.mark.local
 @pytest.mark.parametrize("started", (True, False))
 @pytest.mark.parametrize("timeout_expires", (True, False))
-@mock.patch(f"{_MOCK_BASE}.logger")
 def test_htex_shutdown(
-    mock_logger: mock.MagicMock,
     started: bool,
     timeout_expires: bool,
     htex: HighThroughputExecutor,
+    caplog
 ):
     mock_ix_proc = mock.Mock(spec=Popen)
 
@@ -108,22 +108,22 @@ def test_htex_shutdown(
 
     mock_ix_proc.terminate.side_effect = kill_interchange
 
-    htex.shutdown()
+    with caplog.at_level(logging.INFO):
+        htex.shutdown()
 
-    mock_logs = mock_logger.info.call_args_list
     if started:
         assert mock_ix_proc.terminate.called
         assert mock_ix_proc.wait.called
         assert {"timeout": 10} == mock_ix_proc.wait.call_args[1]
         if timeout_expires:
-            assert "Unable to terminate Interchange" in mock_logs[1][0][0]
+            assert "Unable to terminate Interchange" in caplog.text
             assert mock_ix_proc.kill.called
-        assert "Attempting" in mock_logs[0][0][0]
-        assert "Finished" in mock_logs[-1][0][0]
+        assert "Attempting HighThroughputExecutor shutdown" in caplog.text
+        assert "Finished HighThroughputExecutor shutdown" in caplog.text
     else:
         assert not mock_ix_proc.terminate.called
         assert not mock_ix_proc.wait.called
-        assert "has not started" in mock_logs[0][0][0]
+        assert "HighThroughputExecutor has not started" in caplog.text
 
 
 @pytest.mark.local
|
@@ -139,13 +139,22 @@ def test_max_workers_per_node():
|
|
139
139
|
|
140
140
|
|
141
141
|
@pytest.mark.local
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
142
|
+
@pytest.mark.parametrize("cmd", (None, "custom-launch-cmd"))
|
143
|
+
def test_htex_worker_pool_launch_cmd(cmd: Optional[str]):
|
144
|
+
if cmd:
|
145
|
+
htex = HighThroughputExecutor(launch_cmd=cmd)
|
146
|
+
assert htex.launch_cmd == cmd
|
147
|
+
else:
|
148
|
+
htex = HighThroughputExecutor()
|
149
|
+
assert htex.launch_cmd.startswith("process_worker_pool.py")
|
150
|
+
|
151
|
+
|
152
|
+
@pytest.mark.local
|
153
|
+
@pytest.mark.parametrize("cmd", (None, ["custom", "launch", "cmd"]))
|
154
|
+
def test_htex_interchange_launch_cmd(cmd: Optional[Sequence[str]]):
|
155
|
+
if cmd:
|
156
|
+
htex = HighThroughputExecutor(interchange_launch_cmd=cmd)
|
157
|
+
assert htex.interchange_launch_cmd == cmd
|
158
|
+
else:
|
159
|
+
htex = HighThroughputExecutor()
|
160
|
+
assert htex.interchange_launch_cmd == ["interchange.py"]
|
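The shutdown test now relies on pytest's built-in caplog fixture instead of patching the module logger, asserting on rendered log text rather than on mock call structure. A self-contained example of the same mechanics (unrelated to parsl):

    import logging

    def test_logs_at_info(caplog):
        with caplog.at_level(logging.INFO):
            logging.getLogger("demo").info("Attempting shutdown")
        assert "Attempting shutdown" in caplog.text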
parsl/tests/test_htex/test_zmq_binding.py
CHANGED
@@ -9,6 +9,7 @@ import zmq
 
 from parsl import curvezmq
 from parsl.executors.high_throughput.interchange import Interchange
+from parsl.executors.high_throughput.manager_selector import RandomManagerSelector
 
 
 def make_interchange(*, interchange_address: Optional[str], cert_dir: Optional[str]) -> Interchange:
@@ -23,7 +24,9 @@ def make_interchange(*, interchange_address: Optional[str], cert_dir: Optional[s
                        heartbeat_threshold=60,
                        logdir=".",
                        logging_level=logging.INFO,
-                       poll_period=10)
+                       manager_selector=RandomManagerSelector(),
+                       poll_period=10,
+                       run_id="test_run_id")
 
 
 @pytest.fixture
parsl/tests/test_monitoring/test_basic.py
CHANGED
@@ -25,10 +25,23 @@ def this_app():
 # a configuration that is suitably configured for monitoring.
 
 def htex_config():
+    """This config will use htex's default htex-specific monitoring radio mode"""
     from parsl.tests.configs.htex_local_alternate import fresh_config
     return fresh_config()
 
 
+def htex_udp_config():
+    """This config will force UDP"""
+    from parsl.tests.configs.htex_local_alternate import fresh_config
+    c = fresh_config()
+    assert len(c.executors) == 1
+
+    assert c.executors[0].radio_mode == "htex", "precondition: htex has a radio mode attribute, configured for htex radio"
+    c.executors[0].radio_mode = "udp"
+
+    return c
+
+
 def workqueue_config():
     from parsl.tests.configs.workqueue_ex import fresh_config
     c = fresh_config()
@@ -48,7 +61,7 @@ def taskvine_config():
 
 
 @pytest.mark.local
-@pytest.mark.parametrize("fresh_config", [htex_config, workqueue_config, taskvine_config])
+@pytest.mark.parametrize("fresh_config", [htex_config, htex_udp_config, workqueue_config, taskvine_config])
 def test_row_counts(tmpd_cwd, fresh_config):
     # this is imported here rather than at module level because
     # it isn't available in a plain parsl install, so this module
parsl/tests/test_mpi_apps/test_mpiex.py
CHANGED
@@ -44,7 +44,7 @@ def test_init():
 
     new_kwargs = {'max_workers_per_block'}
     excluded_kwargs = {'available_accelerators', 'enable_mpi_mode', 'cores_per_worker', 'max_workers_per_node',
-                       'mem_per_worker', 'cpu_affinity', 'max_workers'}
+                       'mem_per_worker', 'cpu_affinity', 'max_workers', 'manager_selector'}
 
     # Get the kwargs from both HTEx and MPIEx
     htex_kwargs = set(signature(HighThroughputExecutor.__init__).parameters)
parsl/tests/test_providers/test_local_provider.py
CHANGED
@@ -11,7 +11,8 @@ import time
 
 import pytest
 
-from parsl.channels import LocalChannel, SSHChannel
+from parsl.channels import LocalChannel
+from parsl.channels.ssh.ssh import DeprecatedSSHChannel
 from parsl.jobs.states import JobState
 from parsl.launchers import SingleNodeLauncher
 from parsl.providers import LocalProvider
@@ -92,10 +93,10 @@ def test_ssh_channel():
     # already exist, so create it here.
     pathlib.Path('{}/known.hosts'.format(config_dir)).touch(mode=0o600)
     script_dir = tempfile.mkdtemp()
-    channel = SSHChannel('127.0.0.1', port=server_port,
-                         script_dir=remote_script_dir,
-                         host_keys_filename='{}/known.hosts'.format(config_dir),
-                         key_filename=priv_key)
+    channel = DeprecatedSSHChannel('127.0.0.1', port=server_port,
+                                   script_dir=remote_script_dir,
+                                   host_keys_filename='{}/known.hosts'.format(config_dir),
+                                   key_filename=priv_key)
     try:
         p = LocalProvider(channel=channel,
                           launcher=SingleNodeLauncher(debug=False))
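The channel change mirrors the providers change earlier in this diff: parsl/channels/__init__.py (+1 -4 above) stops re-exporting the SSH channels, so the deprecated implementation is imported by its full module path:

    from parsl.channels.ssh.ssh import DeprecatedSSHChannel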
parsl/version.py
CHANGED