parsl 2025.6.23__py3-none-any.whl → 2025.6.30__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- parsl/configs/osg.py +1 -1
- parsl/dataflow/dflow.py +14 -4
- parsl/executors/base.py +14 -6
- parsl/executors/high_throughput/executor.py +20 -15
- parsl/executors/high_throughput/interchange.py +173 -191
- parsl/executors/high_throughput/mpi_executor.py +7 -4
- parsl/executors/high_throughput/probe.py +4 -4
- parsl/executors/high_throughput/process_worker_pool.py +88 -94
- parsl/executors/taskvine/executor.py +9 -3
- parsl/executors/taskvine/manager.py +3 -1
- parsl/executors/threads.py +8 -1
- parsl/executors/workqueue/executor.py +9 -3
- parsl/monitoring/errors.py +5 -0
- parsl/monitoring/monitoring.py +25 -42
- parsl/monitoring/radios/base.py +63 -2
- parsl/monitoring/radios/filesystem.py +18 -3
- parsl/monitoring/radios/filesystem_router.py +13 -26
- parsl/monitoring/radios/htex.py +22 -13
- parsl/monitoring/radios/multiprocessing.py +22 -2
- parsl/monitoring/radios/udp.py +57 -19
- parsl/monitoring/radios/udp_router.py +49 -15
- parsl/monitoring/remote.py +19 -40
- parsl/providers/local/local.py +12 -13
- parsl/tests/configs/htex_local_alternate.py +0 -1
- parsl/tests/test_htex/test_interchange_exit_bad_registration.py +5 -7
- parsl/tests/test_htex/test_zmq_binding.py +5 -6
- parsl/tests/test_monitoring/test_basic.py +12 -10
- parsl/tests/test_monitoring/test_htex_init_blocks_vs_monitoring.py +0 -1
- parsl/tests/test_monitoring/test_radio_filesystem.py +7 -9
- parsl/tests/test_monitoring/test_radio_multiprocessing.py +44 -0
- parsl/tests/test_monitoring/test_radio_udp.py +163 -12
- parsl/tests/test_monitoring/test_stdouterr.py +1 -3
- parsl/tests/test_scaling/test_worker_interchange_bad_messages_3262.py +3 -7
- parsl/version.py +1 -1
- {parsl-2025.6.23.data → parsl-2025.6.30.data}/scripts/interchange.py +173 -191
- {parsl-2025.6.23.data → parsl-2025.6.30.data}/scripts/process_worker_pool.py +88 -94
- {parsl-2025.6.23.dist-info → parsl-2025.6.30.dist-info}/METADATA +2 -2
- {parsl-2025.6.23.dist-info → parsl-2025.6.30.dist-info}/RECORD +44 -43
- {parsl-2025.6.23.data → parsl-2025.6.30.data}/scripts/exec_parsl_function.py +0 -0
- {parsl-2025.6.23.data → parsl-2025.6.30.data}/scripts/parsl_coprocess.py +0 -0
- {parsl-2025.6.23.dist-info → parsl-2025.6.30.dist-info}/LICENSE +0 -0
- {parsl-2025.6.23.dist-info → parsl-2025.6.30.dist-info}/WHEEL +0 -0
- {parsl-2025.6.23.dist-info → parsl-2025.6.30.dist-info}/entry_points.txt +0 -0
- {parsl-2025.6.23.dist-info → parsl-2025.6.30.dist-info}/top_level.txt +0 -0
parsl/monitoring/radios/base.py
CHANGED
@@ -1,10 +1,71 @@
|
|
1
|
-
import logging
|
2
1
|
from abc import ABCMeta, abstractmethod
|
2
|
+
from multiprocessing.queues import Queue
|
3
3
|
|
4
|
-
|
4
|
+
|
5
|
+
class MonitoringRadioReceiver(metaclass=ABCMeta):
|
6
|
+
@abstractmethod
|
7
|
+
def shutdown(self) -> None:
|
8
|
+
pass
|
5
9
|
|
6
10
|
|
7
11
|
class MonitoringRadioSender(metaclass=ABCMeta):
|
8
12
|
@abstractmethod
|
9
13
|
def send(self, message: object) -> None:
|
10
14
|
pass
|
15
|
+
|
16
|
+
|
17
|
+
class RadioConfig(metaclass=ABCMeta):
|
18
|
+
"""Base class for radio plugin configuration.
|
19
|
+
|
20
|
+
This provides the configuration for a particular way of sending monitoring
|
21
|
+
messages from a source of monitoring messages into the submit side
|
22
|
+
monitoring database.
|
23
|
+
|
24
|
+
This uses staged initialization like lots of Parsl configuration, but in
|
25
|
+
a slightly different form.
|
26
|
+
|
27
|
+
A RadioConfig object must be pickleable, because it will be sent to remote
|
28
|
+
workers to configure senders. The MonitoringRadioSender and
|
29
|
+
MonitoringRadioReceiver objects do not need to be pickleable (and often
|
30
|
+
will not be - for example, when they hold references to other processes).
|
31
|
+
|
32
|
+
The RadioConfig object will be used by Parsl in this sequence:
|
33
|
+
|
34
|
+
* A user creates a RadioConfig object from the appropriate subclass for
|
35
|
+
the radio mechanism they want to use, and specifies it as part of their
|
36
|
+
executor configuration.
|
37
|
+
|
38
|
+
Methods on the RadioConfig will then be invoked by Parsl like this:
|
39
|
+
|
40
|
+
* one create_receiver call, on the submit side
|
41
|
+
- this call can modify the state of the RadioConfig to contain information
|
42
|
+
about how a sender can connect back to the receiver. For example,
|
43
|
+
after binding to a particular port, can store that port so that the
|
44
|
+
sender knows which port to connect to.
|
45
|
+
|
46
|
+
* Possibly many serializations to get the RadioConfig to remote workers
|
47
|
+
|
48
|
+
* Many (0 or more) create_sender calls, possibly on remote workers, to
|
49
|
+
create the sending side of the radio (MonitoringRadioSender instances)
|
50
|
+
|
51
|
+
* Those senders are used to send messages
|
52
|
+
|
53
|
+
* At executor shutdown, the receiver is shut down.
|
54
|
+
|
55
|
+
This object cannot be re-used across parsl configurations - like many other
|
56
|
+
pieces of parsl config it is single use in that respect.
|
57
|
+
"""
|
58
|
+
|
59
|
+
@abstractmethod
|
60
|
+
def create_receiver(self, *, run_dir: str, resource_msgs: Queue) -> MonitoringRadioReceiver:
|
61
|
+
"""Create a receiver for this RadioConfig, and update this RadioConfig
|
62
|
+
with enough context to create senders.
|
63
|
+
"""
|
64
|
+
pass
|
65
|
+
|
66
|
+
@abstractmethod
|
67
|
+
def create_sender(self) -> MonitoringRadioSender:
|
68
|
+
"""Create a sender to connect to the receiver created by an
|
69
|
+
earlier call to create_receiver.
|
70
|
+
"""
|
71
|
+
pass
|
@@ -2,13 +2,19 @@ import logging
|
|
2
2
|
import os
|
3
3
|
import pickle
|
4
4
|
import uuid
|
5
|
+
from multiprocessing.queues import Queue
|
5
6
|
|
6
|
-
from parsl.monitoring.radios.base import
|
7
|
+
from parsl.monitoring.radios.base import (
|
8
|
+
MonitoringRadioReceiver,
|
9
|
+
MonitoringRadioSender,
|
10
|
+
RadioConfig,
|
11
|
+
)
|
12
|
+
from parsl.monitoring.radios.filesystem_router import FilesystemRadioReceiver
|
7
13
|
|
8
14
|
logger = logging.getLogger(__name__)
|
9
15
|
|
10
16
|
|
11
|
-
class
|
17
|
+
class FilesystemRadio(RadioConfig):
|
12
18
|
"""A MonitoringRadioSender that sends messages over a shared filesystem.
|
13
19
|
|
14
20
|
The message directory structure is based on maildir,
|
@@ -26,7 +32,16 @@ class FilesystemRadioSender(MonitoringRadioSender):
|
|
26
32
|
the UDP radio, but should be much more reliable.
|
27
33
|
"""
|
28
34
|
|
29
|
-
def
|
35
|
+
def create_sender(self) -> MonitoringRadioSender:
|
36
|
+
return FilesystemRadioSender(run_dir=self.run_dir)
|
37
|
+
|
38
|
+
def create_receiver(self, *, run_dir: str, resource_msgs: Queue) -> MonitoringRadioReceiver:
|
39
|
+
self.run_dir = run_dir
|
40
|
+
return FilesystemRadioReceiver(resource_msgs, run_dir)
|
41
|
+
|
42
|
+
|
43
|
+
class FilesystemRadioSender(MonitoringRadioSender):
|
44
|
+
def __init__(self, *, run_dir: str):
|
30
45
|
logger.info("filesystem based monitoring radio initializing")
|
31
46
|
self.base_path = f"{run_dir}/monitor-fs-radio/"
|
32
47
|
self.tmp_path = f"{self.base_path}/tmp"
|
@@ -4,15 +4,15 @@ import logging
|
|
4
4
|
import os
|
5
5
|
import pickle
|
6
6
|
import time
|
7
|
-
from multiprocessing.context import SpawnProcess
|
8
7
|
from multiprocessing.queues import Queue
|
9
8
|
from multiprocessing.synchronize import Event
|
10
9
|
from typing import cast
|
11
10
|
|
12
11
|
from parsl.log_utils import set_file_logger
|
12
|
+
from parsl.monitoring.radios.base import MonitoringRadioReceiver
|
13
13
|
from parsl.monitoring.radios.multiprocessing import MultiprocessingQueueRadioSender
|
14
14
|
from parsl.monitoring.types import TaggedMonitoringMessage
|
15
|
-
from parsl.multiprocessing import SpawnEvent, join_terminate_close_proc
|
15
|
+
from parsl.multiprocessing import SpawnEvent, SpawnProcess, join_terminate_close_proc
|
16
16
|
from parsl.process_loggers import wrap_with_logs
|
17
17
|
from parsl.utils import setproctitle
|
18
18
|
|
@@ -57,30 +57,17 @@ def filesystem_router_starter(*, q: Queue[TaggedMonitoringMessage], run_dir: str
|
|
57
57
|
logger.info("Ending filesystem radio receiver")
|
58
58
|
|
59
59
|
|
60
|
-
class FilesystemRadioReceiver():
|
61
|
-
def __init__(self,
|
62
|
-
self.
|
63
|
-
self.
|
60
|
+
class FilesystemRadioReceiver(MonitoringRadioReceiver):
|
61
|
+
def __init__(self, resource_msgs: Queue, run_dir: str) -> None:
|
62
|
+
self.exit_event = SpawnEvent()
|
63
|
+
self.process = SpawnProcess(target=filesystem_router_starter,
|
64
|
+
kwargs={"q": resource_msgs, "run_dir": run_dir, "exit_event": self.exit_event},
|
65
|
+
name="Monitoring-Filesystem-Process",
|
66
|
+
daemon=True
|
67
|
+
)
|
68
|
+
self.process.start()
|
69
|
+
logger.info("Started filesystem radio receiver process %s", self.process.pid)
|
64
70
|
|
65
|
-
def
|
71
|
+
def shutdown(self) -> None:
|
66
72
|
self.exit_event.set()
|
67
73
|
join_terminate_close_proc(self.process)
|
68
|
-
|
69
|
-
|
70
|
-
def start_filesystem_receiver(*,
|
71
|
-
monitoring_messages: Queue,
|
72
|
-
logdir: str,
|
73
|
-
debug: bool) -> FilesystemRadioReceiver:
|
74
|
-
|
75
|
-
router_exit_event = SpawnEvent()
|
76
|
-
|
77
|
-
filesystem_proc = SpawnProcess(target=filesystem_router_starter,
|
78
|
-
kwargs={"q": monitoring_messages,
|
79
|
-
"run_dir": logdir,
|
80
|
-
"exit_event": router_exit_event},
|
81
|
-
name="Monitoring-Filesystem-Process",
|
82
|
-
daemon=True
|
83
|
-
)
|
84
|
-
filesystem_proc.start()
|
85
|
-
|
86
|
-
return FilesystemRadioReceiver(process=filesystem_proc, exit_event=router_exit_event)
|
parsl/monitoring/radios/htex.py
CHANGED
@@ -1,24 +1,29 @@
|
|
1
1
|
import logging
|
2
2
|
import pickle
|
3
|
+
from multiprocessing.queues import Queue
|
3
4
|
|
4
|
-
from parsl.monitoring.radios.base import
|
5
|
+
from parsl.monitoring.radios.base import (
|
6
|
+
MonitoringRadioReceiver,
|
7
|
+
MonitoringRadioSender,
|
8
|
+
RadioConfig,
|
9
|
+
)
|
5
10
|
|
6
11
|
logger = logging.getLogger(__name__)
|
7
12
|
|
8
13
|
|
9
|
-
class
|
14
|
+
class HTEXRadio(RadioConfig):
|
15
|
+
def create_sender(self) -> MonitoringRadioSender:
|
16
|
+
return HTEXRadioSender()
|
10
17
|
|
11
|
-
def
|
12
|
-
|
13
|
-
Parameters
|
14
|
-
----------
|
18
|
+
def create_receiver(self, *, run_dir: str, resource_msgs: Queue) -> MonitoringRadioReceiver:
|
19
|
+
return HTEXRadioReceiver()
|
15
20
|
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
21
|
+
|
22
|
+
class HTEXRadioSender(MonitoringRadioSender):
|
23
|
+
|
24
|
+
def __init__(self) -> None:
|
25
|
+
# there is nothing to initialize
|
26
|
+
pass
|
22
27
|
|
23
28
|
def send(self, message: object) -> None:
|
24
29
|
""" Sends a message to the UDP receiver
|
@@ -54,4 +59,8 @@ class HTEXRadioSender(MonitoringRadioSender):
|
|
54
59
|
else:
|
55
60
|
logger.error("result_queue is uninitialized - cannot put monitoring message")
|
56
61
|
|
57
|
-
|
62
|
+
|
63
|
+
class HTEXRadioReceiver(MonitoringRadioReceiver):
|
64
|
+
def shutdown(self) -> None:
|
65
|
+
# there is nothing to shut down
|
66
|
+
pass
|
@@ -1,6 +1,10 @@
|
|
1
|
-
from multiprocessing
|
1
|
+
from multiprocessing import Queue
|
2
2
|
|
3
|
-
from parsl.monitoring.radios.base import
|
3
|
+
from parsl.monitoring.radios.base import (
|
4
|
+
MonitoringRadioReceiver,
|
5
|
+
MonitoringRadioSender,
|
6
|
+
RadioConfig,
|
7
|
+
)
|
4
8
|
|
5
9
|
|
6
10
|
class MultiprocessingQueueRadioSender(MonitoringRadioSender):
|
@@ -15,3 +19,19 @@ class MultiprocessingQueueRadioSender(MonitoringRadioSender):
|
|
15
19
|
|
16
20
|
def send(self, message: object) -> None:
|
17
21
|
self.queue.put(message)
|
22
|
+
|
23
|
+
|
24
|
+
class MultiprocessingQueueRadio(RadioConfig):
|
25
|
+
def create_sender(self) -> MonitoringRadioSender:
|
26
|
+
return MultiprocessingQueueRadioSender(self._queue)
|
27
|
+
|
28
|
+
def create_receiver(self, *, run_dir: str, resource_msgs: Queue) -> MonitoringRadioReceiver:
|
29
|
+
# This object is only for use with an in-process thread-pool so it
|
30
|
+
# is fine to store a reference to the message queue directly.
|
31
|
+
self._queue = resource_msgs
|
32
|
+
return MultiprocessingQueueRadioReceiver()
|
33
|
+
|
34
|
+
|
35
|
+
class MultiprocessingQueueRadioReceiver(MonitoringRadioReceiver):
|
36
|
+
def shutdown(self) -> None:
|
37
|
+
pass
|
parsl/monitoring/radios/udp.py
CHANGED
@@ -1,29 +1,63 @@
|
|
1
|
+
import hashlib
|
2
|
+
import hmac
|
1
3
|
import logging
|
2
4
|
import pickle
|
5
|
+
import secrets
|
3
6
|
import socket
|
7
|
+
from multiprocessing.queues import Queue
|
8
|
+
from typing import Optional
|
4
9
|
|
5
|
-
from parsl.monitoring.radios.base import
|
10
|
+
from parsl.monitoring.radios.base import (
|
11
|
+
MonitoringRadioReceiver,
|
12
|
+
MonitoringRadioSender,
|
13
|
+
RadioConfig,
|
14
|
+
)
|
15
|
+
from parsl.monitoring.radios.udp_router import start_udp_receiver
|
6
16
|
|
17
|
+
logger = logging.getLogger(__name__)
|
7
18
|
|
8
|
-
class UDPRadioSender(MonitoringRadioSender):
|
9
19
|
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
20
|
+
class UDPRadio(RadioConfig):
|
21
|
+
def __init__(self, *, port: Optional[int] = None, atexit_timeout: int = 3, address: str, debug: bool = False, hmac_digest: str = 'sha512'):
|
22
|
+
self.port = port
|
23
|
+
self.atexit_timeout = atexit_timeout
|
24
|
+
self.address = address
|
25
|
+
self.debug = debug
|
26
|
+
self.hmac_digest = hmac_digest
|
14
27
|
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
28
|
+
def create_sender(self) -> MonitoringRadioSender:
|
29
|
+
assert self.port is not None, "self.port should have been initialized by create_receiver"
|
30
|
+
return UDPRadioSender(self.address, self.port, self.hmac_key, self.hmac_digest)
|
31
|
+
|
32
|
+
def create_receiver(self, run_dir: str, resource_msgs: Queue) -> MonitoringRadioReceiver:
|
33
|
+
# RFC 2104 section 2 recommends that the key length be at
|
34
|
+
# least as long as the hash output (64 bytes in the case of SHA512).
|
35
|
+
# RFC 2104 section 3 talks about periodic key refreshing. This key is
|
36
|
+
# not refreshed inside a workflow run, but each separate workflow run
|
37
|
+
# uses a new key.
|
38
|
+
keysize = hashlib.new(self.hmac_digest).digest_size
|
39
|
+
self.hmac_key = secrets.token_bytes(keysize)
|
40
|
+
|
41
|
+
udp_receiver = start_udp_receiver(logdir=run_dir,
|
42
|
+
monitoring_messages=resource_msgs,
|
43
|
+
port=self.port,
|
44
|
+
debug=self.debug,
|
45
|
+
atexit_timeout=self.atexit_timeout,
|
46
|
+
hmac_key=self.hmac_key,
|
47
|
+
hmac_digest=self.hmac_digest
|
48
|
+
)
|
49
|
+
self.port = udp_receiver.port
|
50
|
+
return udp_receiver
|
51
|
+
|
52
|
+
|
53
|
+
class UDPRadioSender(MonitoringRadioSender):
|
54
|
+
|
55
|
+
def __init__(self, address: str, port: int, hmac_key: bytes, hmac_digest: str, *, timeout: int = 10) -> None:
|
21
56
|
self.sock_timeout = timeout
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
raise Exception("Failed to parse monitoring url: {}".format(monitoring_url))
|
57
|
+
self.address = address
|
58
|
+
self.port = port
|
59
|
+
self.hmac_key = hmac_key
|
60
|
+
self.hmac_digest = hmac_digest
|
27
61
|
|
28
62
|
self.sock = socket.socket(socket.AF_INET,
|
29
63
|
socket.SOCK_DGRAM,
|
@@ -42,15 +76,19 @@ class UDPRadioSender(MonitoringRadioSender):
|
|
42
76
|
Returns:
|
43
77
|
None
|
44
78
|
"""
|
79
|
+
logger.info("Starting UDP radio message send")
|
45
80
|
try:
|
46
|
-
|
81
|
+
data = pickle.dumps(message)
|
82
|
+
origin_hmac = hmac.digest(self.hmac_key, data, self.hmac_digest)
|
83
|
+
buffer = origin_hmac + data
|
47
84
|
except Exception:
|
48
85
|
logging.exception("Exception during pickling", exc_info=True)
|
49
86
|
return
|
50
87
|
|
51
88
|
try:
|
52
|
-
self.sock.sendto(buffer, (self.
|
89
|
+
self.sock.sendto(buffer, (self.address, self.port))
|
53
90
|
except socket.timeout:
|
54
91
|
logging.error("Could not send message within timeout limit")
|
55
92
|
return
|
93
|
+
logger.info("Normal ending for UDP radio message send")
|
56
94
|
return
|
@@ -1,5 +1,6 @@
|
|
1
1
|
from __future__ import annotations
|
2
2
|
|
3
|
+
import hmac
|
3
4
|
import logging
|
4
5
|
import multiprocessing.queues as mpq
|
5
6
|
import os
|
@@ -17,6 +18,7 @@ import typeguard
|
|
17
18
|
|
18
19
|
from parsl.log_utils import set_file_logger
|
19
20
|
from parsl.monitoring.errors import MonitoringRouterStartError
|
21
|
+
from parsl.monitoring.radios.base import MonitoringRadioReceiver
|
20
22
|
from parsl.monitoring.radios.multiprocessing import MultiprocessingQueueRadioSender
|
21
23
|
from parsl.multiprocessing import (
|
22
24
|
SizedQueue,
|
@@ -37,9 +39,11 @@ class MonitoringRouter:
|
|
37
39
|
udp_port: Optional[int] = None,
|
38
40
|
run_dir: str = ".",
|
39
41
|
logging_level: int = logging.INFO,
|
40
|
-
atexit_timeout: int
|
42
|
+
atexit_timeout: int, # in seconds
|
41
43
|
resource_msgs: mpq.Queue,
|
42
44
|
exit_event: Event,
|
45
|
+
hmac_key: bytes,
|
46
|
+
hmac_digest: str,
|
43
47
|
):
|
44
48
|
""" Initializes a monitoring configuration class.
|
45
49
|
|
@@ -65,6 +69,9 @@ class MonitoringRouter:
|
|
65
69
|
|
66
70
|
self.atexit_timeout = atexit_timeout
|
67
71
|
|
72
|
+
self.hmac_key = hmac_key
|
73
|
+
self.hmac_digest = hmac_digest
|
74
|
+
|
68
75
|
self.loop_freq = 10.0 # milliseconds
|
69
76
|
|
70
77
|
# Initialize the UDP socket
|
@@ -94,10 +101,7 @@ class MonitoringRouter:
|
|
94
101
|
try:
|
95
102
|
while not self.exit_event.is_set():
|
96
103
|
try:
|
97
|
-
|
98
|
-
resource_msg = pickle.loads(data)
|
99
|
-
logger.debug("Got UDP Message from {}: {}".format(addr, resource_msg))
|
100
|
-
self.target_radio.send(resource_msg)
|
104
|
+
self.process_message()
|
101
105
|
except socket.timeout:
|
102
106
|
pass
|
103
107
|
|
@@ -105,10 +109,7 @@ class MonitoringRouter:
|
|
105
109
|
last_msg_received_time = time.time()
|
106
110
|
while time.time() - last_msg_received_time < self.atexit_timeout:
|
107
111
|
try:
|
108
|
-
|
109
|
-
msg = pickle.loads(data)
|
110
|
-
logger.debug("Got UDP Message from {}: {}".format(addr, msg))
|
111
|
-
self.target_radio.send(msg)
|
112
|
+
self.process_message()
|
112
113
|
last_msg_received_time = time.time()
|
113
114
|
except socket.timeout:
|
114
115
|
pass
|
@@ -117,6 +118,28 @@ class MonitoringRouter:
|
|
117
118
|
finally:
|
118
119
|
logger.info("UDP listener finished")
|
119
120
|
|
121
|
+
def process_message(self) -> None:
|
122
|
+
hmdata, addr = self.udp_sock.recvfrom(2048)
|
123
|
+
h = hmac.HMAC(key=self.hmac_key, digestmod=self.hmac_digest)
|
124
|
+
origin_hmac = hmdata[0:h.digest_size]
|
125
|
+
h.update(hmdata[h.digest_size:])
|
126
|
+
data = hmdata[h.digest_size:]
|
127
|
+
|
128
|
+
# Check hmac before pickle load.
|
129
|
+
# If data is wrong, do not log it because it is suspect,
|
130
|
+
# but it should be safe to log the addr, at error level.
|
131
|
+
|
132
|
+
recomputed_hmac = h.digest()
|
133
|
+
|
134
|
+
if not hmac.compare_digest(origin_hmac, recomputed_hmac):
|
135
|
+
logger.error("HMAC does not match on received message")
|
136
|
+
# No exception, because this can be arbitrary network noise
|
137
|
+
# that shouldn't break the receiver.
|
138
|
+
else:
|
139
|
+
resource_msg = pickle.loads(data)
|
140
|
+
logger.debug("Got UDP Message from {}: {}".format(addr, resource_msg))
|
141
|
+
self.target_radio.send(resource_msg)
|
142
|
+
|
120
143
|
|
121
144
|
@wrap_with_logs
|
122
145
|
@typeguard.typechecked
|
@@ -126,16 +149,21 @@ def udp_router_starter(*,
|
|
126
149
|
exit_event: Event,
|
127
150
|
|
128
151
|
udp_port: Optional[int],
|
129
|
-
|
152
|
+
hmac_key: bytes,
|
130
153
|
run_dir: str,
|
131
|
-
logging_level: int
|
154
|
+
logging_level: int,
|
155
|
+
atexit_timeout: int,
|
156
|
+
hmac_digest: str) -> None:
|
132
157
|
setproctitle("parsl: monitoring UDP router")
|
133
158
|
try:
|
134
159
|
router = MonitoringRouter(udp_port=udp_port,
|
135
160
|
run_dir=run_dir,
|
136
161
|
logging_level=logging_level,
|
137
162
|
resource_msgs=resource_msgs,
|
138
|
-
exit_event=exit_event
|
163
|
+
exit_event=exit_event,
|
164
|
+
atexit_timeout=atexit_timeout,
|
165
|
+
hmac_key=hmac_key,
|
166
|
+
hmac_digest=hmac_digest)
|
139
167
|
except Exception as e:
|
140
168
|
logger.error("MonitoringRouter construction failed.", exc_info=True)
|
141
169
|
comm_q.put(f"Monitoring router construction failed: {e}")
|
@@ -149,13 +177,13 @@ def udp_router_starter(*,
|
|
149
177
|
logger.exception("UDP router start exception")
|
150
178
|
|
151
179
|
|
152
|
-
class UDPRadioReceiver():
|
180
|
+
class UDPRadioReceiver(MonitoringRadioReceiver):
|
153
181
|
def __init__(self, *, process: SpawnProcessType, exit_event: EventType, port: int) -> None:
|
154
182
|
self.process = process
|
155
183
|
self.exit_event = exit_event
|
156
184
|
self.port = port
|
157
185
|
|
158
|
-
def
|
186
|
+
def shutdown(self) -> None:
|
159
187
|
self.exit_event.set()
|
160
188
|
join_terminate_close_proc(self.process)
|
161
189
|
|
@@ -164,7 +192,10 @@ def start_udp_receiver(*,
|
|
164
192
|
monitoring_messages: Queue,
|
165
193
|
port: Optional[int],
|
166
194
|
logdir: str,
|
167
|
-
debug: bool
|
195
|
+
debug: bool,
|
196
|
+
atexit_timeout: int,
|
197
|
+
hmac_key: bytes,
|
198
|
+
hmac_digest: str) -> UDPRadioReceiver:
|
168
199
|
|
169
200
|
udp_comm_q: Queue[Union[int, str]]
|
170
201
|
udp_comm_q = SizedQueue(maxsize=10)
|
@@ -178,6 +209,9 @@ def start_udp_receiver(*,
|
|
178
209
|
"udp_port": port,
|
179
210
|
"run_dir": logdir,
|
180
211
|
"logging_level": logging.DEBUG if debug else logging.INFO,
|
212
|
+
"atexit_timeout": atexit_timeout,
|
213
|
+
"hmac_key": hmac_key,
|
214
|
+
"hmac_digest": hmac_digest,
|
181
215
|
},
|
182
216
|
name="Monitoring-UDP-Router-Process",
|
183
217
|
daemon=True,
|
parsl/monitoring/remote.py
CHANGED
@@ -7,10 +7,7 @@ from multiprocessing import Event
|
|
7
7
|
from typing import Any, Callable, Dict, List, Sequence, Tuple
|
8
8
|
|
9
9
|
from parsl.monitoring.message_type import MessageType
|
10
|
-
from parsl.monitoring.radios.base import
|
11
|
-
from parsl.monitoring.radios.filesystem import FilesystemRadioSender
|
12
|
-
from parsl.monitoring.radios.htex import HTEXRadioSender
|
13
|
-
from parsl.monitoring.radios.udp import UDPRadioSender
|
10
|
+
from parsl.monitoring.radios.base import RadioConfig
|
14
11
|
from parsl.multiprocessing import ForkProcess
|
15
12
|
from parsl.process_loggers import wrap_with_logs
|
16
13
|
|
@@ -23,11 +20,10 @@ def monitor_wrapper(*,
|
|
23
20
|
kwargs: Dict, # per invocation
|
24
21
|
x_try_id: int, # per invocation
|
25
22
|
x_task_id: int, # per invocation
|
26
|
-
|
23
|
+
radio_config: RadioConfig, # per executor
|
27
24
|
run_id: str, # per workflow
|
28
25
|
logging_level: int, # per workflow
|
29
26
|
sleep_dur: float, # per workflow
|
30
|
-
radio_mode: str, # per executor
|
31
27
|
monitor_resources: bool, # per workflow
|
32
28
|
run_dir: str) -> Tuple[Callable, Sequence, Dict]:
|
33
29
|
"""Wrap the Parsl app with a function that will call the monitor function and point it at the correct pid when the task begins.
|
@@ -41,9 +37,8 @@ def monitor_wrapper(*,
|
|
41
37
|
# Send first message to monitoring router
|
42
38
|
send_first_message(try_id,
|
43
39
|
task_id,
|
44
|
-
|
40
|
+
radio_config,
|
45
41
|
run_id,
|
46
|
-
radio_mode,
|
47
42
|
run_dir)
|
48
43
|
|
49
44
|
if monitor_resources and sleep_dur > 0:
|
@@ -52,9 +47,8 @@ def monitor_wrapper(*,
|
|
52
47
|
args=(os.getpid(),
|
53
48
|
try_id,
|
54
49
|
task_id,
|
55
|
-
|
50
|
+
radio_config,
|
56
51
|
run_id,
|
57
|
-
radio_mode,
|
58
52
|
logging_level,
|
59
53
|
sleep_dur,
|
60
54
|
run_dir,
|
@@ -87,9 +81,9 @@ def monitor_wrapper(*,
|
|
87
81
|
|
88
82
|
send_last_message(try_id,
|
89
83
|
task_id,
|
90
|
-
|
84
|
+
radio_config,
|
91
85
|
run_id,
|
92
|
-
|
86
|
+
run_dir)
|
93
87
|
|
94
88
|
new_kwargs = kwargs.copy()
|
95
89
|
new_kwargs['_parsl_monitoring_task_id'] = x_task_id
|
@@ -98,47 +92,33 @@ def monitor_wrapper(*,
|
|
98
92
|
return (wrapped, args, new_kwargs)
|
99
93
|
|
100
94
|
|
101
|
-
def get_radio(radio_mode: str, monitoring_hub_url: str, task_id: int, run_dir: str) -> MonitoringRadioSender:
|
102
|
-
radio: MonitoringRadioSender
|
103
|
-
if radio_mode == "udp":
|
104
|
-
radio = UDPRadioSender(monitoring_hub_url)
|
105
|
-
elif radio_mode == "htex":
|
106
|
-
radio = HTEXRadioSender(monitoring_hub_url)
|
107
|
-
elif radio_mode == "filesystem":
|
108
|
-
radio = FilesystemRadioSender(monitoring_url=monitoring_hub_url,
|
109
|
-
run_dir=run_dir)
|
110
|
-
else:
|
111
|
-
raise RuntimeError(f"Unknown radio mode: {radio_mode}")
|
112
|
-
return radio
|
113
|
-
|
114
|
-
|
115
95
|
@wrap_with_logs
|
116
96
|
def send_first_message(try_id: int,
|
117
97
|
task_id: int,
|
118
|
-
|
119
|
-
run_id: str,
|
120
|
-
send_first_last_message(try_id, task_id,
|
121
|
-
|
98
|
+
radio_config: RadioConfig,
|
99
|
+
run_id: str, run_dir: str) -> None:
|
100
|
+
send_first_last_message(try_id, task_id, radio_config, run_id,
|
101
|
+
run_dir, False)
|
122
102
|
|
123
103
|
|
124
104
|
@wrap_with_logs
|
125
105
|
def send_last_message(try_id: int,
|
126
106
|
task_id: int,
|
127
|
-
|
128
|
-
run_id: str,
|
129
|
-
send_first_last_message(try_id, task_id,
|
130
|
-
|
107
|
+
radio_config: RadioConfig,
|
108
|
+
run_id: str, run_dir: str) -> None:
|
109
|
+
send_first_last_message(try_id, task_id, radio_config, run_id,
|
110
|
+
run_dir, True)
|
131
111
|
|
132
112
|
|
133
113
|
def send_first_last_message(try_id: int,
|
134
114
|
task_id: int,
|
135
|
-
|
136
|
-
run_id: str,
|
115
|
+
radio_config: RadioConfig,
|
116
|
+
run_id: str, run_dir: str,
|
137
117
|
is_last: bool) -> None:
|
138
118
|
import os
|
139
119
|
import platform
|
140
120
|
|
141
|
-
radio =
|
121
|
+
radio = radio_config.create_sender()
|
142
122
|
|
143
123
|
msg = (MessageType.RESOURCE_INFO,
|
144
124
|
{'run_id': run_id,
|
@@ -158,9 +138,8 @@ def send_first_last_message(try_id: int,
|
|
158
138
|
def monitor(pid: int,
|
159
139
|
try_id: int,
|
160
140
|
task_id: int,
|
161
|
-
|
141
|
+
radio_config: RadioConfig,
|
162
142
|
run_id: str,
|
163
|
-
radio_mode: str,
|
164
143
|
logging_level: int,
|
165
144
|
sleep_dur: float,
|
166
145
|
run_dir: str,
|
@@ -184,7 +163,7 @@ def monitor(pid: int,
|
|
184
163
|
|
185
164
|
setproctitle("parsl: task resource monitor")
|
186
165
|
|
187
|
-
radio =
|
166
|
+
radio = radio_config.create_sender()
|
188
167
|
|
189
168
|
logging.debug("start of monitor")
|
190
169
|
|