parsl 2024.7.22__py3-none-any.whl → 2024.8.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- parsl/dataflow/dflow.py +4 -10
- parsl/executors/base.py +8 -8
- parsl/executors/flux/executor.py +7 -7
- parsl/executors/high_throughput/executor.py +55 -55
- parsl/executors/high_throughput/interchange.py +37 -37
- parsl/executors/high_throughput/manager_record.py +1 -0
- parsl/executors/high_throughput/manager_selector.py +25 -0
- parsl/executors/high_throughput/process_worker_pool.py +2 -0
- parsl/executors/status_handling.py +52 -21
- parsl/executors/taskvine/executor.py +0 -18
- parsl/executors/workqueue/executor.py +0 -18
- parsl/monitoring/errors.py +6 -0
- parsl/monitoring/monitoring.py +6 -5
- parsl/monitoring/radios.py +23 -7
- parsl/monitoring/remote.py +12 -12
- parsl/monitoring/router.py +71 -30
- parsl/tests/test_htex/test_disconnected_blocks_failing_provider.py +71 -0
- parsl/tests/test_htex/test_htex.py +28 -19
- parsl/tests/test_htex/test_zmq_binding.py +2 -0
- parsl/tests/test_monitoring/test_basic.py +14 -1
- parsl/tests/test_monitoring/test_fuzz_zmq.py +2 -2
- parsl/tests/test_mpi_apps/test_mpiex.py +1 -1
- parsl/version.py +1 -1
- {parsl-2024.7.22.data → parsl-2024.8.5.data}/scripts/interchange.py +37 -37
- {parsl-2024.7.22.data → parsl-2024.8.5.data}/scripts/process_worker_pool.py +2 -0
- parsl-2024.8.5.dist-info/METADATA +101 -0
- {parsl-2024.7.22.dist-info → parsl-2024.8.5.dist-info}/RECORD +33 -30
- {parsl-2024.7.22.dist-info → parsl-2024.8.5.dist-info}/WHEEL +1 -1
- parsl-2024.7.22.dist-info/METADATA +0 -101
- {parsl-2024.7.22.data → parsl-2024.8.5.data}/scripts/exec_parsl_function.py +0 -0
- {parsl-2024.7.22.data → parsl-2024.8.5.data}/scripts/parsl_coprocess.py +0 -0
- {parsl-2024.7.22.dist-info → parsl-2024.8.5.dist-info}/LICENSE +0 -0
- {parsl-2024.7.22.dist-info → parsl-2024.8.5.dist-info}/entry_points.txt +0 -0
- {parsl-2024.7.22.dist-info → parsl-2024.8.5.dist-info}/top_level.txt +0 -0
parsl/executors/status_handling.py
CHANGED
@@ -12,7 +12,7 @@ from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple, Union
 from parsl.executors.base import ParslExecutor
 from parsl.executors.errors import BadStateException, ScalingFailed
 from parsl.jobs.error_handlers import noop_error_handler, simple_error_handler
-from parsl.jobs.states import JobState, JobStatus
+from parsl.jobs.states import TERMINAL_STATES, JobState, JobStatus
 from parsl.monitoring.message_type import MessageType
 from parsl.providers.base import ExecutionProvider
 from parsl.utils import AtomicIDCounter
@@ -167,41 +167,82 @@ class BlockProviderExecutor(ParslExecutor):
     def provider(self):
         return self._provider
 
-    def _filter_scale_in_ids(self, to_kill, killed):
+    def _filter_scale_in_ids(self, to_kill: Sequence[Any], killed: Sequence[bool]) -> Sequence[Any]:
         """ Filter out job id's that were not killed
         """
         assert len(to_kill) == len(killed)
+
+        if False in killed:
+            killed_job_ids = [jid for jid, k in zip(to_kill, killed) if k]
+            not_killed_job_ids = [jid for jid, k in zip(to_kill, killed) if not k]
+            logger.warning("Some jobs were not killed successfully: "
+                           f"killed jobs: {killed_job_ids}, "
+                           f"not-killed jobs: {not_killed_job_ids}")
+
         # Filters first iterable by bool values in second
         return list(compress(to_kill, killed))
 
-    def _scale_out(self, blocks: int = 1) -> List[str]:
+    def scale_out_facade(self, n: int) -> List[str]:
         """Scales out the number of blocks by "blocks"
         """
         if not self.provider:
             raise ScalingFailed(self, "No execution provider available")
         block_ids = []
-        logger.info(f"Scaling out by {blocks} blocks")
-        for _ in range(blocks):
+        monitoring_status_changes = {}
+        logger.info(f"Scaling out by {n} blocks")
+        for _ in range(n):
             block_id = str(self._block_id_counter.get_id())
             logger.info(f"Allocated block ID {block_id}")
             try:
                 job_id = self._launch_block(block_id)
+
+                pending_status = JobStatus(JobState.PENDING)
+
                 self.blocks_to_job_id[block_id] = job_id
                 self.job_ids_to_block[job_id] = block_id
+                self._status[block_id] = pending_status
+
+                monitoring_status_changes[block_id] = pending_status
                 block_ids.append(block_id)
+
             except Exception as ex:
-                self._simulated_status[block_id] = JobStatus(JobState.FAILED, "Failed to start block {}: {}".format(block_id, ex))
+                failed_status = JobStatus(JobState.FAILED, "Failed to start block {}: {}".format(block_id, ex))
+                self._simulated_status[block_id] = failed_status
+                self._status[block_id] = failed_status
+
+        self.send_monitoring_info(monitoring_status_changes)
         return block_ids
 
-    @abstractmethod
     def scale_in(self, blocks: int) -> List[str]:
         """Scale in method.
 
         Cause the executor to reduce the number of blocks by count.
 
+        The default implementation will kill blocks without regard to their
+        status or whether they are executing tasks. Executors with more
+        nuanced scaling strategies might overload this method to work with
+        that strategy - see the HighThroughputExecutor for an example of that.
+
         :return: A list of block ids corresponding to the blocks that were removed.
         """
-        pass
+
+        active_blocks = [block_id for block_id, status in self._status.items()
+                         if status.state not in TERMINAL_STATES]
+
+        block_ids_to_kill = active_blocks[:blocks]
+
+        job_ids_to_kill = [self.blocks_to_job_id[block] for block in block_ids_to_kill]
+
+        # Cancel the blocks provisioned
+        if self.provider:
+            logger.info(f"Scaling in jobs: {job_ids_to_kill}")
+            r = self.provider.cancel(job_ids_to_kill)
+            job_ids = self._filter_scale_in_ids(job_ids_to_kill, r)
+            block_ids_killed = [self.job_ids_to_block[job_id] for job_id in job_ids]
+            return block_ids_killed
+        else:
+            logger.error("No execution provider available to scale in")
+            return []
 
     def _launch_block(self, block_id: str) -> Any:
         launch_cmd = self._get_launch_command(block_id)
@@ -235,10 +276,10 @@ class BlockProviderExecutor(ParslExecutor):
 
     def send_monitoring_info(self, status: Dict) -> None:
         # Send monitoring info for HTEX when monitoring enabled
-        if self.monitoring_radio:
+        if self.submit_monitoring_radio:
             msg = self.create_monitoring_info(status)
-            logger.debug("Sending message {} to hub from job status poller".format(msg))
-            self.monitoring_radio.send((MessageType.BLOCK_INFO, msg))
+            logger.debug("Sending block monitoring message: %r", msg)
+            self.submit_monitoring_radio.send((MessageType.BLOCK_INFO, msg))
 
     def create_monitoring_info(self, status: Dict[str, JobStatus]) -> Sequence[object]:
         """Create a monitoring message for each block based on the poll status.
@@ -310,13 +351,3 @@ class BlockProviderExecutor(ParslExecutor):
             del self._status[block_id]
         self.send_monitoring_info(new_status)
         return block_ids
-
-    def scale_out_facade(self, n: int) -> List[str]:
-        block_ids = self._scale_out(n)
-        if block_ids is not None:
-            new_status = {}
-            for block_id in block_ids:
-                new_status[block_id] = JobStatus(JobState.PENDING)
-            self.send_monitoring_info(new_status)
-            self._status.update(new_status)
-        return block_ids
parsl/executors/taskvine/executor.py
CHANGED
@@ -573,24 +573,6 @@ class TaskVineExecutor(BlockProviderExecutor, putils.RepresentationMixin):
     def workers_per_node(self) -> Union[int, float]:
         return 1
 
-    def scale_in(self, count: int) -> List[str]:
-        """Scale in method. Cancel a given number of blocks
-        """
-        # Obtain list of blocks to kill
-        to_kill = list(self.blocks_to_job_id.keys())[:count]
-        kill_ids = [self.blocks_to_job_id[block] for block in to_kill]
-
-        # Cancel the blocks provisioned
-        if self.provider:
-            logger.info(f"Scaling in jobs: {kill_ids}")
-            r = self.provider.cancel(kill_ids)
-            job_ids = self._filter_scale_in_ids(kill_ids, r)
-            block_ids_killed = [self.job_ids_to_block[jid] for jid in job_ids]
-            return block_ids_killed
-        else:
-            logger.error("No execution provider available to scale")
-            return []
-
     def shutdown(self, *args, **kwargs):
         """Shutdown the executor. Sets flag to cancel the submit process and
         collector thread, which shuts down the TaskVine system submission.
parsl/executors/workqueue/executor.py
CHANGED
@@ -689,24 +689,6 @@ class WorkQueueExecutor(BlockProviderExecutor, putils.RepresentationMixin):
     def workers_per_node(self) -> Union[int, float]:
         return self.scaling_cores_per_worker
 
-    def scale_in(self, count: int) -> List[str]:
-        """Scale in method.
-        """
-        # Obtain list of blocks to kill
-        to_kill = list(self.blocks_to_job_id.keys())[:count]
-        kill_ids = [self.blocks_to_job_id[block] for block in to_kill]
-
-        # Cancel the blocks provisioned
-        if self.provider:
-            logger.info(f"Scaling in jobs: {kill_ids}")
-            r = self.provider.cancel(kill_ids)
-            job_ids = self._filter_scale_in_ids(kill_ids, r)
-            block_ids_killed = [self.job_ids_to_block[jid] for jid in job_ids]
-            return block_ids_killed
-        else:
-            logger.error("No execution provider available to scale in")
-            return []
-
     def shutdown(self, *args, **kwargs):
         """Shutdown the executor. Sets flag to cancel the submit process and
         collector thread, which shuts down the Work Queue system submission.
parsl/monitoring/monitoring.py
CHANGED
@@ -12,8 +12,9 @@ from typing import TYPE_CHECKING, Any, Optional, Tuple, Union, cast
 import typeguard
 
 from parsl.log_utils import set_file_logger
+from parsl.monitoring.errors import MonitoringHubStartError
 from parsl.monitoring.message_type import MessageType
-from parsl.monitoring.radios import MultiprocessingQueueRadio
+from parsl.monitoring.radios import MultiprocessingQueueRadioSender
 from parsl.monitoring.router import router_starter
 from parsl.monitoring.types import AddressedMonitoringMessage
 from parsl.multiprocessing import ForkProcess, SizedQueue
@@ -105,7 +106,7 @@ class MonitoringHub(RepresentationMixin):
         self.resource_monitoring_enabled = resource_monitoring_enabled
         self.resource_monitoring_interval = resource_monitoring_interval
 
-    def start(self, run_id: str, dfk_run_dir: str, config_run_dir: Union[str, os.PathLike]) -> int:
+    def start(self, run_id: str, dfk_run_dir: str, config_run_dir: Union[str, os.PathLike]) -> None:
 
         logger.debug("Starting MonitoringHub")
 
@@ -187,7 +188,7 @@ class MonitoringHub(RepresentationMixin):
         self.filesystem_proc.start()
         logger.info(f"Started filesystem radio receiver process {self.filesystem_proc.pid}")
 
-        self.radio = MultiprocessingQueueRadio(self.block_msgs)
+        self.radio = MultiprocessingQueueRadioSender(self.block_msgs)
 
         try:
             comm_q_result = comm_q.get(block=True, timeout=120)
@@ -195,7 +196,7 @@ class MonitoringHub(RepresentationMixin):
             comm_q.join_thread()
         except queue.Empty:
             logger.error("Hub has not completed initialization in 120s. Aborting")
-            raise Exception("Hub failed to start")
+            raise MonitoringHubStartError()
 
         if isinstance(comm_q_result, str):
             logger.error(f"MonitoringRouter sent an error message: {comm_q_result}")
@@ -207,7 +208,7 @@ class MonitoringHub(RepresentationMixin):
 
         logger.info("Monitoring Hub initialized")
 
-        return zmq_port
+        self.hub_zmq_port = zmq_port
 
     # TODO: tighten the Any message format
     def send(self, mtype: MessageType, message: Any) -> None:
parsl/monitoring/radios.py
CHANGED
@@ -7,6 +7,8 @@ from abc import ABCMeta, abstractmethod
 from multiprocessing.queues import Queue
 from typing import Optional
 
+import zmq
+
 from parsl.serialize import serialize
 
 _db_manager_excepts: Optional[Exception]
@@ -15,14 +17,14 @@ _db_manager_excepts: Optional[Exception]
 logger = logging.getLogger(__name__)
 
 
-class MonitoringRadio(metaclass=ABCMeta):
+class MonitoringRadioSender(metaclass=ABCMeta):
     @abstractmethod
     def send(self, message: object) -> None:
         pass
 
 
-class FilesystemRadio(MonitoringRadio):
-    """A MonitoringRadio that sends messages over a shared filesystem.
+class FilesystemRadioSender(MonitoringRadioSender):
+    """A MonitoringRadioSender that sends messages over a shared filesystem.
 
     The messsage directory structure is based on maildir,
     https://en.wikipedia.org/wiki/Maildir
@@ -36,7 +38,7 @@ class FilesystemRadio(MonitoringRadio):
     This avoids a race condition of reading partially written messages.
 
     This radio is likely to give higher shared filesystem load compared to
-    the UDPRadio, but should be much more reliable.
+    the UDP radio, but should be much more reliable.
     """
 
     def __init__(self, *, monitoring_url: str, source_id: int, timeout: int = 10, run_dir: str):
@@ -66,7 +68,7 @@ class FilesystemRadio(MonitoringRadio):
         os.rename(tmp_filename, new_filename)
 
 
-class HTEXRadio(MonitoringRadio):
+class HTEXRadioSender(MonitoringRadioSender):
 
     def __init__(self, monitoring_url: str, source_id: int, timeout: int = 10):
         """
@@ -120,7 +122,7 @@ class HTEXRadio(MonitoringRadio):
             return
 
 
-class UDPRadio(MonitoringRadio):
+class UDPRadioSender(MonitoringRadioSender):
 
     def __init__(self, monitoring_url: str, source_id: int, timeout: int = 10):
         """
@@ -174,7 +176,7 @@ class UDPRadio(MonitoringRadio):
         return
 
 
-class MultiprocessingQueueRadio(MonitoringRadio):
+class MultiprocessingQueueRadioSender(MonitoringRadioSender):
     """A monitoring radio which connects over a multiprocessing Queue.
     This radio is intended to be used on the submit side, where components
     in the submit process, or processes launched by multiprocessing, will have
@@ -186,3 +188,17 @@ class MultiprocessingQueueRadio(MonitoringRadio):
 
     def send(self, message: object) -> None:
         self.queue.put((message, 0))
+
+
+class ZMQRadioSender(MonitoringRadioSender):
+    """A monitoring radio which connects over ZMQ. This radio is not
+    thread-safe, because its use of ZMQ is not thread-safe.
+    """
+
+    def __init__(self, hub_address: str, hub_zmq_port: int) -> None:
+        self._hub_channel = zmq.Context().socket(zmq.DEALER)
+        self._hub_channel.set_hwm(0)
+        self._hub_channel.connect(f"tcp://{hub_address}:{hub_zmq_port}")
+
+    def send(self, message: object) -> None:
+        self._hub_channel.send_pyobj(message)
parsl/monitoring/remote.py
CHANGED
@@ -8,10 +8,10 @@ from typing import Any, Callable, Dict, List, Sequence, Tuple
 
 from parsl.monitoring.message_type import MessageType
 from parsl.monitoring.radios import (
-    FilesystemRadio,
-    HTEXRadio,
-    MonitoringRadio,
-    UDPRadio,
+    FilesystemRadioSender,
+    HTEXRadioSender,
+    MonitoringRadioSender,
+    UDPRadioSender,
 )
 from parsl.multiprocessing import ForkProcess
 from parsl.process_loggers import wrap_with_logs
@@ -100,17 +100,17 @@ def monitor_wrapper(*,
     return (wrapped, args, new_kwargs)
 
 
-def get_radio(radio_mode: str, monitoring_hub_url: str, task_id: int, run_dir: str) -> MonitoringRadio:
-    radio: MonitoringRadio
+def get_radio(radio_mode: str, monitoring_hub_url: str, task_id: int, run_dir: str) -> MonitoringRadioSender:
+    radio: MonitoringRadioSender
     if radio_mode == "udp":
-        radio = UDPRadio(monitoring_hub_url,
-                         source_id=task_id)
+        radio = UDPRadioSender(monitoring_hub_url,
+                               source_id=task_id)
     elif radio_mode == "htex":
-        radio = HTEXRadio(monitoring_hub_url,
-                          source_id=task_id)
+        radio = HTEXRadioSender(monitoring_hub_url,
+                                source_id=task_id)
     elif radio_mode == "filesystem":
-        radio = FilesystemRadio(monitoring_url=monitoring_hub_url,
-                                source_id=task_id, run_dir=run_dir)
+        radio = FilesystemRadioSender(monitoring_url=monitoring_hub_url,
+                                      source_id=task_id, run_dir=run_dir)
     else:
         raise RuntimeError(f"Unknown radio mode: {radio_mode}")
     return radio
parsl/monitoring/router.py
CHANGED
@@ -5,6 +5,7 @@ import os
 import pickle
 import queue
 import socket
+import threading
 import time
 from multiprocessing.synchronize import Event
 from typing import Optional, Tuple, Union
@@ -32,7 +33,12 @@ class MonitoringRouter:
                  logdir: str = ".",
                  run_id: str,
                  logging_level: int = logging.INFO,
-                 atexit_timeout: int = 3
+                 atexit_timeout: int = 3,   # in seconds
+                 priority_msgs: "queue.Queue[AddressedMonitoringMessage]",
+                 node_msgs: "queue.Queue[AddressedMonitoringMessage]",
+                 block_msgs: "queue.Queue[AddressedMonitoringMessage]",
+                 resource_msgs: "queue.Queue[AddressedMonitoringMessage]",
+                 exit_event: Event,
                  ):
         """ Initializes a monitoring configuration class.
 
@@ -51,7 +57,11 @@ class MonitoringRouter:
             Logging level as defined in the logging module. Default: logging.INFO
         atexit_timeout : float, optional
             The amount of time in seconds to terminate the hub without receiving any messages, after the last dfk workflow message is received.
+        *_msgs : Queue
+            Four multiprocessing queues to receive messages, routed by type tag, and sometimes modified according to type tag.
 
+        exit_event : Event
+            An event that the main Parsl process will set to signal that the monitoring router should shut down.
         """
         os.makedirs(logdir, exist_ok=True)
         self.logger = set_file_logger("{}/monitoring_router.log".format(logdir),
@@ -93,22 +103,60 @@ class MonitoringRouter:
                                                        min_port=zmq_port_range[0],
                                                        max_port=zmq_port_range[1])
 
-    def start(self,
-              priority_msgs: "queue.Queue[AddressedMonitoringMessage]",
-              node_msgs: "queue.Queue[AddressedMonitoringMessage]",
-              block_msgs: "queue.Queue[AddressedMonitoringMessage]",
-              resource_msgs: "queue.Queue[AddressedMonitoringMessage]",
-              exit_event: Event) -> None:
+        self.priority_msgs = priority_msgs
+        self.node_msgs = node_msgs
+        self.block_msgs = block_msgs
+        self.resource_msgs = resource_msgs
+        self.exit_event = exit_event
+
+    @wrap_with_logs(target="monitoring_router")
+    def start(self) -> None:
+        self.logger.info("Starting UDP listener thread")
+        udp_radio_receiver_thread = threading.Thread(target=self.start_udp_listener, daemon=True)
+        udp_radio_receiver_thread.start()
+
+        self.logger.info("Starting ZMQ listener thread")
+        zmq_radio_receiver_thread = threading.Thread(target=self.start_zmq_listener, daemon=True)
+        zmq_radio_receiver_thread.start()
+
+        self.logger.info("Joining on ZMQ listener thread")
+        zmq_radio_receiver_thread.join()
+        self.logger.info("Joining on UDP listener thread")
+        udp_radio_receiver_thread.join()
+        self.logger.info("Joined on both ZMQ and UDP listener threads")
+
+    @wrap_with_logs(target="monitoring_router")
+    def start_udp_listener(self) -> None:
         try:
-            while not exit_event.is_set():
+            while not self.exit_event.is_set():
                 try:
                     data, addr = self.udp_sock.recvfrom(2048)
                     resource_msg = pickle.loads(data)
                     self.logger.debug("Got UDP Message from {}: {}".format(addr, resource_msg))
-                    resource_msgs.put((resource_msg, addr))
+                    self.resource_msgs.put((resource_msg, addr))
                 except socket.timeout:
                     pass
 
+            self.logger.info("UDP listener draining")
+            last_msg_received_time = time.time()
+            while time.time() - last_msg_received_time < self.atexit_timeout:
+                try:
+                    data, addr = self.udp_sock.recvfrom(2048)
+                    msg = pickle.loads(data)
+                    self.logger.debug("Got UDP Message from {}: {}".format(addr, msg))
+                    self.resource_msgs.put((msg, addr))
+                    last_msg_received_time = time.time()
+                except socket.timeout:
+                    pass
+
+            self.logger.info("UDP listener finishing normally")
+        finally:
+            self.logger.info("UDP listener finished")
+
+    @wrap_with_logs(target="monitoring_router")
+    def start_zmq_listener(self) -> None:
+        try:
+            while not self.exit_event.is_set():
                 try:
                     dfk_loop_start = time.time()
                     while time.time() - dfk_loop_start < 1.0:  # TODO make configurable
@@ -125,15 +173,15 @@ class MonitoringRouter:
 
                     if msg[0] == MessageType.NODE_INFO:
                         msg[1]['run_id'] = self.run_id
-                        node_msgs.put(msg_0)
+                        self.node_msgs.put(msg_0)
                     elif msg[0] == MessageType.RESOURCE_INFO:
-                        resource_msgs.put(msg_0)
+                        self.resource_msgs.put(msg_0)
                     elif msg[0] == MessageType.BLOCK_INFO:
-                        block_msgs.put(msg_0)
+                        self.block_msgs.put(msg_0)
                     elif msg[0] == MessageType.TASK_INFO:
-                        priority_msgs.put(msg_0)
+                        self.priority_msgs.put(msg_0)
                     elif msg[0] == MessageType.WORKFLOW_INFO:
-                        priority_msgs.put(msg_0)
+                        self.priority_msgs.put(msg_0)
                     else:
                         # There is a type: ignore here because if msg[0]
                         # is of the correct type, this code is unreachable,
@@ -151,21 +199,9 @@ class MonitoringRouter:
                     # thing to do.
                     self.logger.warning("Failure processing a ZMQ message", exc_info=True)
 
-            self.logger.info("Monitoring router draining")
-            last_msg_received_time = time.time()
-            while time.time() - last_msg_received_time < self.atexit_timeout:
-                try:
-                    data, addr = self.udp_sock.recvfrom(2048)
-                    msg = pickle.loads(data)
-                    self.logger.debug("Got UDP Message from {}: {}".format(addr, msg))
-                    resource_msgs.put((msg, addr))
-                    last_msg_received_time = time.time()
-                except socket.timeout:
-                    pass
-
-            self.logger.info("Monitoring router finishing normally")
+            self.logger.info("ZMQ listener finishing normally")
         finally:
-            self.logger.info("Monitoring router finished")
+            self.logger.info("ZMQ listener finished")
 
 
     @wrap_with_logs
@@ -191,7 +227,12 @@ def router_starter(comm_q: "queue.Queue[Union[Tuple[int, int], str]]",
                                   zmq_port_range=zmq_port_range,
                                   logdir=logdir,
                                   logging_level=logging_level,
-                                  run_id=run_id)
+                                  run_id=run_id,
+                                  priority_msgs=priority_msgs,
+                                  node_msgs=node_msgs,
+                                  block_msgs=block_msgs,
+                                  resource_msgs=resource_msgs,
+                                  exit_event=exit_event)
     except Exception as e:
         logger.error("MonitoringRouter construction failed.", exc_info=True)
         comm_q.put(f"Monitoring router construction failed: {e}")
@@ -200,7 +241,7 @@ def router_starter(comm_q: "queue.Queue[Union[Tuple[int, int], str]]",
 
     router.logger.info("Starting MonitoringRouter in router_starter")
     try:
-        router.start(priority_msgs, node_msgs, block_msgs, resource_msgs, exit_event)
+        router.start()
     except Exception as e:
         router.logger.exception("router.start exception")
         exception_q.put(('Hub', str(e)))
parsl/tests/test_htex/test_disconnected_blocks_failing_provider.py
ADDED
@@ -0,0 +1,71 @@
+import logging
+
+import pytest
+
+import parsl
+from parsl import Config
+from parsl.executors import HighThroughputExecutor
+from parsl.executors.errors import BadStateException
+from parsl.jobs.states import JobState, JobStatus
+from parsl.providers import LocalProvider
+
+
+class FailingProvider(LocalProvider):
+    def submit(*args, **kwargs):
+        raise RuntimeError("Deliberate failure of provider.submit")
+
+
+def local_config():
+    """Config to simulate failing blocks without connecting"""
+    return Config(
+        executors=[
+            HighThroughputExecutor(
+                label="HTEX",
+                heartbeat_period=1,
+                heartbeat_threshold=2,
+                poll_period=100,
+                max_workers_per_node=1,
+                provider=FailingProvider(
+                    init_blocks=0,
+                    max_blocks=2,
+                    min_blocks=0,
+                ),
+            )
+        ],
+        max_idletime=0.5,
+        strategy='htex_auto_scale',
+        strategy_period=0.1
+        # this strategy period needs to be a few times smaller than the
+        # status_polling_interval of FailingProvider, which is 5s at
+        # time of writing
+    )
+
+
+@parsl.python_app
+def double(x):
+    return x * 2
+
+
+@pytest.mark.local
+def test_disconnected_blocks():
+    """Test reporting of blocks that fail to connect from HTEX"""
+    dfk = parsl.dfk()
+    executor = dfk.executors["HTEX"]
+
+    connected_blocks = executor.connected_blocks()
+    assert not connected_blocks, "Expected 0 blocks"
+
+    future = double(5)
+    with pytest.raises(BadStateException):
+        future.result()
+
+    assert isinstance(future.exception(), BadStateException)
+
+    status_dict = executor.status()
+    assert len(status_dict) == 1, "Expected exactly 1 block"
+    for status in status_dict.values():
+        assert isinstance(status, JobStatus)
+        assert status.state == JobState.MISSING
+
+    connected_blocks = executor.connected_blocks()
+    assert connected_blocks == [], "Expected exactly 0 connected blocks"