parsl 2024.3.11__py3-none-any.whl → 2024.3.25__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- parsl/addresses.py +3 -1
- parsl/config.py +4 -0
- parsl/dataflow/dflow.py +14 -5
- parsl/executors/base.py +10 -0
- parsl/executors/high_throughput/executor.py +12 -0
- parsl/executors/high_throughput/interchange.py +30 -8
- parsl/executors/high_throughput/manager_record.py +1 -0
- parsl/executors/high_throughput/process_worker_pool.py +41 -5
- parsl/executors/status_handling.py +2 -9
- parsl/executors/taskvine/executor.py +24 -3
- parsl/executors/taskvine/manager.py +1 -0
- parsl/executors/taskvine/manager_config.py +3 -4
- parsl/executors/workqueue/executor.py +19 -0
- parsl/jobs/error_handlers.py +1 -1
- parsl/jobs/job_status_poller.py +8 -7
- parsl/launchers/launchers.py +6 -6
- parsl/log_utils.py +8 -4
- parsl/monitoring/db_manager.py +4 -2
- parsl/monitoring/monitoring.py +30 -264
- parsl/monitoring/router.py +208 -0
- parsl/monitoring/visualization/plots/default/workflow_plots.py +3 -0
- parsl/monitoring/visualization/views.py +2 -1
- parsl/providers/cluster_provider.py +1 -3
- parsl/tests/configs/user_opts.py +2 -1
- parsl/tests/test_htex/test_drain.py +78 -0
- parsl/tests/test_monitoring/test_app_names.py +86 -0
- parsl/tests/test_monitoring/test_fuzz_zmq.py +2 -2
- parsl/tests/test_monitoring/test_htex_init_blocks_vs_monitoring.py +82 -0
- parsl/tests/test_python_apps/test_context_manager.py +40 -0
- parsl/tests/test_scaling/test_scale_down_htex_auto_scale.py +1 -10
- parsl/tests/test_shutdown/__init__.py +0 -0
- parsl/tests/test_shutdown/test_kill_monitoring.py +65 -0
- parsl/utils.py +2 -2
- parsl/version.py +1 -1
- {parsl-2024.3.11.data → parsl-2024.3.25.data}/scripts/process_worker_pool.py +41 -5
- {parsl-2024.3.11.dist-info → parsl-2024.3.25.dist-info}/METADATA +4 -4
- {parsl-2024.3.11.dist-info → parsl-2024.3.25.dist-info}/RECORD +43 -36
- {parsl-2024.3.11.data → parsl-2024.3.25.data}/scripts/exec_parsl_function.py +0 -0
- {parsl-2024.3.11.data → parsl-2024.3.25.data}/scripts/parsl_coprocess.py +0 -0
- {parsl-2024.3.11.dist-info → parsl-2024.3.25.dist-info}/LICENSE +0 -0
- {parsl-2024.3.11.dist-info → parsl-2024.3.25.dist-info}/WHEEL +0 -0
- {parsl-2024.3.11.dist-info → parsl-2024.3.25.dist-info}/entry_points.txt +0 -0
- {parsl-2024.3.11.dist-info → parsl-2024.3.25.dist-info}/top_level.txt +0 -0
parsl/jobs/job_status_poller.py
CHANGED
@@ -2,7 +2,7 @@ import logging
|
|
2
2
|
import parsl
|
3
3
|
import time
|
4
4
|
import zmq
|
5
|
-
from typing import Dict, List, Sequence, Optional
|
5
|
+
from typing import Dict, List, Sequence, Optional, Union
|
6
6
|
|
7
7
|
from parsl.jobs.states import JobStatus, JobState
|
8
8
|
from parsl.jobs.strategy import Strategy
|
@@ -29,7 +29,7 @@ class PollItem:
|
|
29
29
|
if self._dfk and self._dfk.monitoring is not None:
|
30
30
|
self.monitoring_enabled = True
|
31
31
|
hub_address = self._dfk.hub_address
|
32
|
-
hub_port = self._dfk.
|
32
|
+
hub_port = self._dfk.hub_zmq_port
|
33
33
|
context = zmq.Context()
|
34
34
|
self.hub_channel = context.socket(zmq.DEALER)
|
35
35
|
self.hub_channel.set_hwm(0)
|
@@ -72,7 +72,7 @@ class PollItem:
|
|
72
72
|
def executor(self) -> BlockProviderExecutor:
|
73
73
|
return self._executor
|
74
74
|
|
75
|
-
def scale_in(self, n, max_idletime=None):
|
75
|
+
def scale_in(self, n: int, max_idletime: Optional[float] = None) -> List[str]:
|
76
76
|
|
77
77
|
if max_idletime is None:
|
78
78
|
block_ids = self._executor.scale_in(n)
|
@@ -82,7 +82,7 @@ class PollItem:
|
|
82
82
|
# scale_in method really does come from HighThroughputExecutor,
|
83
83
|
# and so does have an extra max_idletime parameter not present
|
84
84
|
# in the executor interface.
|
85
|
-
block_ids = self._executor.scale_in(n, max_idletime=max_idletime)
|
85
|
+
block_ids = self._executor.scale_in(n, max_idletime=max_idletime) # type: ignore[call-arg]
|
86
86
|
if block_ids is not None:
|
87
87
|
new_status = {}
|
88
88
|
for block_id in block_ids:
|
@@ -91,7 +91,7 @@ class PollItem:
|
|
91
91
|
self.send_monitoring_info(new_status)
|
92
92
|
return block_ids
|
93
93
|
|
94
|
-
def scale_out(self, n):
|
94
|
+
def scale_out(self, n: int) -> List[str]:
|
95
95
|
block_ids = self._executor.scale_out(n)
|
96
96
|
if block_ids is not None:
|
97
97
|
new_status = {}
|
@@ -106,13 +106,14 @@ class PollItem:
|
|
106
106
|
|
107
107
|
|
108
108
|
class JobStatusPoller(Timer):
|
109
|
-
def __init__(self, strategy: Optional[str]
|
109
|
+
def __init__(self, *, strategy: Optional[str], max_idletime: float,
|
110
|
+
strategy_period: Union[float, int],
|
110
111
|
dfk: Optional["parsl.dataflow.dflow.DataFlowKernel"] = None) -> None:
|
111
112
|
self._poll_items = [] # type: List[PollItem]
|
112
113
|
self.dfk = dfk
|
113
114
|
self._strategy = Strategy(strategy=strategy,
|
114
115
|
max_idletime=max_idletime)
|
115
|
-
super().__init__(self.poll, interval=
|
116
|
+
super().__init__(self.poll, interval=strategy_period, name="JobStatusPoller")
|
116
117
|
|
117
118
|
def poll(self) -> None:
|
118
119
|
self._update_state()
|
parsl/launchers/launchers.py
CHANGED
@@ -8,16 +8,16 @@ logger = logging.getLogger(__name__)
|
|
8
8
|
class SimpleLauncher(Launcher):
|
9
9
|
""" Does no wrapping. Just returns the command as-is
|
10
10
|
"""
|
11
|
-
def
|
11
|
+
def __init__(self, debug: bool = True) -> None:
|
12
12
|
super().__init__(debug=debug)
|
13
13
|
|
14
14
|
def __call__(self, command: str, tasks_per_node: int, nodes_per_block: int) -> str:
|
15
|
-
"""
|
16
|
-
Args:
|
17
|
-
- command (string): The command string to be launched
|
18
|
-
- task_block (string) : bash evaluated string.
|
19
15
|
|
20
|
-
|
16
|
+
if nodes_per_block > 1:
|
17
|
+
logger.warning('Simple Launcher only supports single node per block. '
|
18
|
+
f'Requested nodes: {nodes_per_block}. '
|
19
|
+
'You may be getting fewer workers than expected')
|
20
|
+
|
21
21
|
return command
|
22
22
|
|
23
23
|
|
parsl/log_utils.py
CHANGED
@@ -28,7 +28,7 @@ DEFAULT_FORMAT = (
|
|
28
28
|
def set_stream_logger(name: str = 'parsl',
|
29
29
|
level: int = logging.DEBUG,
|
30
30
|
format_string: Optional[str] = None,
|
31
|
-
stream: Optional[io.TextIOWrapper] = None) -> None:
|
31
|
+
stream: Optional[io.TextIOWrapper] = None) -> logging.Logger:
|
32
32
|
"""Add a stream log handler.
|
33
33
|
|
34
34
|
Args:
|
@@ -39,7 +39,7 @@ def set_stream_logger(name: str = 'parsl',
|
|
39
39
|
If not specified, the default stream for logging.StreamHandler is used.
|
40
40
|
|
41
41
|
Returns:
|
42
|
-
- None
|
42
|
+
- logger for specified name
|
43
43
|
"""
|
44
44
|
if format_string is None:
|
45
45
|
# format_string = "%(asctime)s %(name)s [%(levelname)s] Thread:%(thread)d %(message)s"
|
@@ -59,12 +59,14 @@ def set_stream_logger(name: str = 'parsl',
|
|
59
59
|
futures_logger = logging.getLogger("concurrent.futures")
|
60
60
|
futures_logger.addHandler(handler)
|
61
61
|
|
62
|
+
return logger
|
63
|
+
|
62
64
|
|
63
65
|
@typeguard.typechecked
|
64
66
|
def set_file_logger(filename: str,
|
65
67
|
name: str = 'parsl',
|
66
68
|
level: int = logging.DEBUG,
|
67
|
-
format_string: Optional[str] = None) -> None:
|
69
|
+
format_string: Optional[str] = None) -> logging.Logger:
|
68
70
|
"""Add a file log handler.
|
69
71
|
|
70
72
|
Args:
|
@@ -74,7 +76,7 @@ def set_file_logger(filename: str,
|
|
74
76
|
- format_string (string): Set the format string
|
75
77
|
|
76
78
|
Returns:
|
77
|
-
- None
|
79
|
+
- logger for specified name
|
78
80
|
"""
|
79
81
|
if format_string is None:
|
80
82
|
format_string = DEFAULT_FORMAT
|
@@ -91,3 +93,5 @@ def set_file_logger(filename: str,
|
|
91
93
|
# concurrent.futures
|
92
94
|
futures_logger = logging.getLogger("concurrent.futures")
|
93
95
|
futures_logger.addHandler(handler)
|
96
|
+
|
97
|
+
return logger
|
parsl/monitoring/db_manager.py
CHANGED
@@ -633,7 +633,8 @@ class DatabaseManager:
|
|
633
633
|
# if retried - for example, the database being locked because someone else is readying
|
634
634
|
# the tables we are trying to write to. If that assumption is wrong, then this loop
|
635
635
|
# may go on forever.
|
636
|
-
logger.warning("Got a database OperationalError. Ignoring and retrying on the assumption that it is recoverable: {}".format(e))
|
636
|
+
logger.warning("Got a database OperationalError. "
|
637
|
+
"Ignoring and retrying on the assumption that it is recoverable: {}".format(e))
|
637
638
|
self.db.rollback()
|
638
639
|
time.sleep(1) # hard coded 1s wait - this should be configurable or exponential backoff or something
|
639
640
|
|
@@ -660,7 +661,8 @@ class DatabaseManager:
|
|
660
661
|
done = True
|
661
662
|
except sa.exc.OperationalError as e:
|
662
663
|
# hoping that this is a database locked error during _update, not some other problem
|
663
|
-
logger.warning("Got a database OperationalError. Ignoring and retrying on the assumption that it is recoverable: {}".format(e))
|
664
|
+
logger.warning("Got a database OperationalError. "
|
665
|
+
"Ignoring and retrying on the assumption that it is recoverable: {}".format(e))
|
664
666
|
self.db.rollback()
|
665
667
|
time.sleep(1) # hard coded 1s wait - this should be configurable or exponential backoff or something
|
666
668
|
except KeyboardInterrupt:
|
parsl/monitoring/monitoring.py
CHANGED
@@ -1,9 +1,7 @@
|
|
1
1
|
from __future__ import annotations
|
2
2
|
|
3
3
|
import os
|
4
|
-
import socket
|
5
4
|
import time
|
6
|
-
import pickle
|
7
5
|
import logging
|
8
6
|
import typeguard
|
9
7
|
import zmq
|
@@ -15,14 +13,16 @@ import parsl.monitoring.remote
|
|
15
13
|
from parsl.multiprocessing import ForkProcess, SizedQueue
|
16
14
|
from multiprocessing import Process
|
17
15
|
from multiprocessing.queues import Queue
|
16
|
+
from parsl.log_utils import set_file_logger
|
18
17
|
from parsl.utils import RepresentationMixin
|
19
18
|
from parsl.process_loggers import wrap_with_logs
|
20
19
|
from parsl.utils import setproctitle
|
21
20
|
|
22
21
|
from parsl.serialize import deserialize
|
23
22
|
|
23
|
+
from parsl.monitoring.router import router_starter
|
24
24
|
from parsl.monitoring.message_type import MessageType
|
25
|
-
from parsl.monitoring.types import AddressedMonitoringMessage
|
25
|
+
from parsl.monitoring.types import AddressedMonitoringMessage
|
26
26
|
from typing import cast, Any, Callable, Dict, Optional, Sequence, Tuple, Union, TYPE_CHECKING
|
27
27
|
|
28
28
|
_db_manager_excepts: Optional[Exception]
|
@@ -38,40 +38,6 @@ else:
|
|
38
38
|
logger = logging.getLogger(__name__)
|
39
39
|
|
40
40
|
|
41
|
-
def start_file_logger(filename: str, name: str = 'monitoring', level: int = logging.DEBUG, format_string: Optional[str] = None) -> logging.Logger:
|
42
|
-
"""Add a stream log handler.
|
43
|
-
|
44
|
-
Parameters
|
45
|
-
---------
|
46
|
-
|
47
|
-
filename: string
|
48
|
-
Name of the file to write logs to. Required.
|
49
|
-
name: string
|
50
|
-
Logger name.
|
51
|
-
level: logging.LEVEL
|
52
|
-
Set the logging level. Default=logging.DEBUG
|
53
|
-
- format_string (string): Set the format string
|
54
|
-
format_string: string
|
55
|
-
Format string to use.
|
56
|
-
|
57
|
-
Returns
|
58
|
-
-------
|
59
|
-
None.
|
60
|
-
"""
|
61
|
-
if format_string is None:
|
62
|
-
format_string = "%(asctime)s.%(msecs)03d %(name)s:%(lineno)d [%(levelname)s] %(message)s"
|
63
|
-
|
64
|
-
logger = logging.getLogger(name)
|
65
|
-
logger.setLevel(level)
|
66
|
-
logger.propagate = False
|
67
|
-
handler = logging.FileHandler(filename)
|
68
|
-
handler.setLevel(level)
|
69
|
-
formatter = logging.Formatter(format_string, datefmt='%Y-%m-%d %H:%M:%S')
|
70
|
-
handler.setFormatter(formatter)
|
71
|
-
logger.addHandler(handler)
|
72
|
-
return logger
|
73
|
-
|
74
|
-
|
75
41
|
@typeguard.typechecked
|
76
42
|
class MonitoringHub(RepresentationMixin):
|
77
43
|
def __init__(self,
|
@@ -79,9 +45,6 @@ class MonitoringHub(RepresentationMixin):
|
|
79
45
|
hub_port: Optional[int] = None,
|
80
46
|
hub_port_range: Tuple[int, int] = (55050, 56000),
|
81
47
|
|
82
|
-
client_address: str = "127.0.0.1",
|
83
|
-
client_port_range: Tuple[int, int] = (55000, 56000),
|
84
|
-
|
85
48
|
workflow_name: Optional[str] = None,
|
86
49
|
workflow_version: Optional[str] = None,
|
87
50
|
logging_endpoint: Optional[str] = None,
|
@@ -106,11 +69,6 @@ class MonitoringHub(RepresentationMixin):
|
|
106
69
|
to deliver monitoring messages to the monitoring router.
|
107
70
|
Note that despite the similar name, this is not related to hub_port.
|
108
71
|
Default: (55050, 56000)
|
109
|
-
client_address : str
|
110
|
-
The ip address at which the dfk will be able to reach Hub. Default: "127.0.0.1"
|
111
|
-
client_port_range : tuple(int, int)
|
112
|
-
The MonitoringHub picks ports at random from the range which will be used by Hub.
|
113
|
-
Default: (55000, 56000)
|
114
72
|
workflow_name : str
|
115
73
|
The name for the workflow. Default to the name of the parsl script
|
116
74
|
workflow_version : str
|
@@ -134,8 +92,6 @@ class MonitoringHub(RepresentationMixin):
|
|
134
92
|
Default: 30 seconds
|
135
93
|
"""
|
136
94
|
|
137
|
-
self.logger = logger
|
138
|
-
|
139
95
|
# Any is used to disable typechecking on uses of _dfk_channel,
|
140
96
|
# because it is used in the code as if it points to a channel, but
|
141
97
|
# the static type is that it can also be None. The code relies on
|
@@ -145,9 +101,6 @@ class MonitoringHub(RepresentationMixin):
|
|
145
101
|
if _db_manager_excepts:
|
146
102
|
raise _db_manager_excepts
|
147
103
|
|
148
|
-
self.client_address = client_address
|
149
|
-
self.client_port_range = client_port_range
|
150
|
-
|
151
104
|
self.hub_address = hub_address
|
152
105
|
self.hub_port = hub_port
|
153
106
|
self.hub_port_range = hub_port_range
|
@@ -164,6 +117,8 @@ class MonitoringHub(RepresentationMixin):
|
|
164
117
|
|
165
118
|
def start(self, run_id: str, dfk_run_dir: str, config_run_dir: Union[str, os.PathLike]) -> int:
|
166
119
|
|
120
|
+
logger.debug("Starting MonitoringHub")
|
121
|
+
|
167
122
|
if self.logdir is None:
|
168
123
|
self.logdir = "."
|
169
124
|
|
@@ -172,9 +127,6 @@ class MonitoringHub(RepresentationMixin):
|
|
172
127
|
|
173
128
|
os.makedirs(self.logdir, exist_ok=True)
|
174
129
|
|
175
|
-
# Initialize the ZMQ pipe to the Parsl Client
|
176
|
-
|
177
|
-
self.logger.debug("Initializing ZMQ Pipes to client")
|
178
130
|
self.monitoring_hub_active = True
|
179
131
|
|
180
132
|
# This annotation is incompatible with typeguard 4.x instrumentation
|
@@ -210,8 +162,8 @@ class MonitoringHub(RepresentationMixin):
|
|
210
162
|
self.router_proc = ForkProcess(target=router_starter,
|
211
163
|
args=(comm_q, self.exception_q, self.priority_msgs, self.node_msgs, self.block_msgs, self.resource_msgs),
|
212
164
|
kwargs={"hub_address": self.hub_address,
|
213
|
-
"hub_port": self.hub_port,
|
214
|
-
"hub_port_range": self.hub_port_range,
|
165
|
+
"udp_port": self.hub_port,
|
166
|
+
"zmq_port_range": self.hub_port_range,
|
215
167
|
"logdir": self.logdir,
|
216
168
|
"logging_level": logging.DEBUG if self.monitoring_debug else logging.INFO,
|
217
169
|
"run_id": run_id
|
@@ -231,7 +183,7 @@ class MonitoringHub(RepresentationMixin):
|
|
231
183
|
daemon=True,
|
232
184
|
)
|
233
185
|
self.dbm_proc.start()
|
234
|
-
|
186
|
+
logger.info("Started the router process {} and DBM process {}".format(self.router_proc.pid, self.dbm_proc.pid))
|
235
187
|
|
236
188
|
self.filesystem_proc = Process(target=filesystem_receiver,
|
237
189
|
args=(self.logdir, self.resource_msgs, dfk_run_dir),
|
@@ -239,19 +191,19 @@ class MonitoringHub(RepresentationMixin):
|
|
239
191
|
daemon=True
|
240
192
|
)
|
241
193
|
self.filesystem_proc.start()
|
242
|
-
|
194
|
+
logger.info(f"Started filesystem radio receiver process {self.filesystem_proc.pid}")
|
243
195
|
|
244
196
|
try:
|
245
197
|
comm_q_result = comm_q.get(block=True, timeout=120)
|
246
198
|
except queue.Empty:
|
247
|
-
|
199
|
+
logger.error("Hub has not completed initialization in 120s. Aborting")
|
248
200
|
raise Exception("Hub failed to start")
|
249
201
|
|
250
202
|
if isinstance(comm_q_result, str):
|
251
|
-
|
203
|
+
logger.error(f"MonitoringRouter sent an error message: {comm_q_result}")
|
252
204
|
raise RuntimeError(f"MonitoringRouter failed to start: {comm_q_result}")
|
253
205
|
|
254
|
-
udp_port, ic_port = comm_q_result
|
206
|
+
udp_port, zmq_port = comm_q_result
|
255
207
|
|
256
208
|
self.monitoring_hub_url = "udp://{}:{}".format(self.hub_address, udp_port)
|
257
209
|
|
@@ -261,28 +213,28 @@ class MonitoringHub(RepresentationMixin):
|
|
261
213
|
self._dfk_channel.setsockopt(zmq.LINGER, 0)
|
262
214
|
self._dfk_channel.set_hwm(0)
|
263
215
|
self._dfk_channel.setsockopt(zmq.SNDTIMEO, self.dfk_channel_timeout)
|
264
|
-
self._dfk_channel.connect("tcp://{}:{}".format(self.hub_address, ic_port))
|
216
|
+
self._dfk_channel.connect("tcp://{}:{}".format(self.hub_address, zmq_port))
|
265
217
|
|
266
|
-
|
218
|
+
logger.info("Monitoring Hub initialized")
|
267
219
|
|
268
|
-
return ic_port
|
220
|
+
return zmq_port
|
269
221
|
|
270
222
|
# TODO: tighten the Any message format
|
271
223
|
def send(self, mtype: MessageType, message: Any) -> None:
|
272
|
-
|
224
|
+
logger.debug("Sending message type {}".format(mtype))
|
273
225
|
try:
|
274
226
|
self._dfk_channel.send_pyobj((mtype, message))
|
275
227
|
except zmq.Again:
|
276
|
-
|
228
|
+
logger.exception(
|
277
229
|
"The monitoring message sent from DFK to router timed-out after {}ms".format(self.dfk_channel_timeout))
|
278
230
|
|
279
231
|
def close(self) -> None:
|
280
|
-
|
232
|
+
logger.info("Terminating Monitoring Hub")
|
281
233
|
exception_msgs = []
|
282
234
|
while True:
|
283
235
|
try:
|
284
236
|
exception_msgs.append(self.exception_q.get(block=False))
|
285
|
-
|
237
|
+
logger.error("There was a queued exception (Either router or DBM process got exception much earlier?)")
|
286
238
|
except queue.Empty:
|
287
239
|
break
|
288
240
|
if self._dfk_channel and self.monitoring_hub_active:
|
@@ -290,7 +242,7 @@ class MonitoringHub(RepresentationMixin):
|
|
290
242
|
self._dfk_channel.close()
|
291
243
|
if exception_msgs:
|
292
244
|
for exception_msg in exception_msgs:
|
293
|
-
|
245
|
+
logger.error(
|
294
246
|
"{} process delivered an exception: {}. Terminating all monitoring processes immediately.".format(
|
295
247
|
exception_msg[0],
|
296
248
|
exception_msg[1]
|
@@ -299,21 +251,21 @@ class MonitoringHub(RepresentationMixin):
|
|
299
251
|
self.router_proc.terminate()
|
300
252
|
self.dbm_proc.terminate()
|
301
253
|
self.filesystem_proc.terminate()
|
302
|
-
|
254
|
+
logger.info("Waiting for router to terminate")
|
303
255
|
self.router_proc.join()
|
304
|
-
|
256
|
+
logger.debug("Finished waiting for router termination")
|
305
257
|
if len(exception_msgs) == 0:
|
306
|
-
|
258
|
+
logger.debug("Sending STOP to DBM")
|
307
259
|
self.priority_msgs.put(("STOP", 0))
|
308
260
|
else:
|
309
|
-
|
310
|
-
|
261
|
+
logger.debug("Not sending STOP to DBM, because there were DBM exceptions")
|
262
|
+
logger.debug("Waiting for DB termination")
|
311
263
|
self.dbm_proc.join()
|
312
|
-
|
264
|
+
logger.debug("Finished waiting for DBM termination")
|
313
265
|
|
314
266
|
# should this be message based? it probably doesn't need to be if
|
315
267
|
# we believe we've received all messages
|
316
|
-
|
268
|
+
logger.info("Terminating filesystem radio receiver process")
|
317
269
|
self.filesystem_proc.terminate()
|
318
270
|
self.filesystem_proc.join()
|
319
271
|
|
@@ -337,9 +289,9 @@ class MonitoringHub(RepresentationMixin):
|
|
337
289
|
|
338
290
|
@wrap_with_logs
|
339
291
|
def filesystem_receiver(logdir: str, q: "queue.Queue[AddressedMonitoringMessage]", run_dir: str) -> None:
|
340
|
-
logger = start_file_logger("{}/monitoring_filesystem_radio.log".format(logdir),
|
341
|
-
|
342
|
-
|
292
|
+
logger = set_file_logger("{}/monitoring_filesystem_radio.log".format(logdir),
|
293
|
+
name="monitoring_filesystem_radio",
|
294
|
+
level=logging.INFO)
|
343
295
|
|
344
296
|
logger.info("Starting filesystem radio receiver")
|
345
297
|
setproctitle("parsl: monitoring filesystem receiver")
|
@@ -369,189 +321,3 @@ def filesystem_receiver(logdir: str, q: "queue.Queue[AddressedMonitoringMessage]
|
|
369
321
|
logger.exception(f"Exception processing {filename} - probably will be retried next iteration")
|
370
322
|
|
371
323
|
time.sleep(1) # whats a good time for this poll?
|
372
|
-
|
373
|
-
|
374
|
-
class MonitoringRouter:
|
375
|
-
|
376
|
-
def __init__(self,
|
377
|
-
*,
|
378
|
-
hub_address: str,
|
379
|
-
hub_port: Optional[int] = None,
|
380
|
-
hub_port_range: Tuple[int, int] = (55050, 56000),
|
381
|
-
|
382
|
-
monitoring_hub_address: str = "127.0.0.1",
|
383
|
-
logdir: str = ".",
|
384
|
-
run_id: str,
|
385
|
-
logging_level: int = logging.INFO,
|
386
|
-
atexit_timeout: int = 3 # in seconds
|
387
|
-
):
|
388
|
-
""" Initializes a monitoring configuration class.
|
389
|
-
|
390
|
-
Parameters
|
391
|
-
----------
|
392
|
-
hub_address : str
|
393
|
-
The ip address at which the workers will be able to reach the Hub.
|
394
|
-
hub_port : int
|
395
|
-
The specific port at which workers will be able to reach the Hub via UDP. Default: None
|
396
|
-
hub_port_range : tuple(int, int)
|
397
|
-
The MonitoringHub picks ports at random from the range which will be used by Hub.
|
398
|
-
This is overridden when the hub_port option is set. Default: (55050, 56000)
|
399
|
-
logdir : str
|
400
|
-
Parsl log directory paths. Logs and temp files go here. Default: '.'
|
401
|
-
logging_level : int
|
402
|
-
Logging level as defined in the logging module. Default: logging.INFO
|
403
|
-
atexit_timeout : float, optional
|
404
|
-
The amount of time in seconds to terminate the hub without receiving any messages, after the last dfk workflow message is received.
|
405
|
-
|
406
|
-
"""
|
407
|
-
os.makedirs(logdir, exist_ok=True)
|
408
|
-
self.logger = start_file_logger("{}/monitoring_router.log".format(logdir),
|
409
|
-
name="monitoring_router",
|
410
|
-
level=logging_level)
|
411
|
-
self.logger.debug("Monitoring router starting")
|
412
|
-
|
413
|
-
self.hub_address = hub_address
|
414
|
-
self.atexit_timeout = atexit_timeout
|
415
|
-
self.run_id = run_id
|
416
|
-
|
417
|
-
self.loop_freq = 10.0 # milliseconds
|
418
|
-
|
419
|
-
# Initialize the UDP socket
|
420
|
-
self.sock = socket.socket(socket.AF_INET,
|
421
|
-
socket.SOCK_DGRAM,
|
422
|
-
socket.IPPROTO_UDP)
|
423
|
-
|
424
|
-
# We are trying to bind to all interfaces with 0.0.0.0
|
425
|
-
if not hub_port:
|
426
|
-
self.sock.bind(('0.0.0.0', 0))
|
427
|
-
self.hub_port = self.sock.getsockname()[1]
|
428
|
-
else:
|
429
|
-
self.hub_port = hub_port
|
430
|
-
try:
|
431
|
-
self.sock.bind(('0.0.0.0', self.hub_port))
|
432
|
-
except Exception as e:
|
433
|
-
raise RuntimeError(f"Could not bind to hub_port {hub_port} because: {e}")
|
434
|
-
self.sock.settimeout(self.loop_freq / 1000)
|
435
|
-
self.logger.info("Initialized the UDP socket on 0.0.0.0:{}".format(self.hub_port))
|
436
|
-
|
437
|
-
self._context = zmq.Context()
|
438
|
-
self.ic_channel = self._context.socket(zmq.DEALER)
|
439
|
-
self.ic_channel.setsockopt(zmq.LINGER, 0)
|
440
|
-
self.ic_channel.set_hwm(0)
|
441
|
-
self.ic_channel.RCVTIMEO = int(self.loop_freq) # in milliseconds
|
442
|
-
self.logger.debug("hub_address: {}. hub_port_range {}".format(hub_address, hub_port_range))
|
443
|
-
self.ic_port = self.ic_channel.bind_to_random_port("tcp://*",
|
444
|
-
min_port=hub_port_range[0],
|
445
|
-
max_port=hub_port_range[1])
|
446
|
-
|
447
|
-
def start(self,
|
448
|
-
priority_msgs: "queue.Queue[AddressedMonitoringMessage]",
|
449
|
-
node_msgs: "queue.Queue[AddressedMonitoringMessage]",
|
450
|
-
block_msgs: "queue.Queue[AddressedMonitoringMessage]",
|
451
|
-
resource_msgs: "queue.Queue[AddressedMonitoringMessage]") -> None:
|
452
|
-
try:
|
453
|
-
router_keep_going = True
|
454
|
-
while router_keep_going:
|
455
|
-
try:
|
456
|
-
data, addr = self.sock.recvfrom(2048)
|
457
|
-
resource_msg = pickle.loads(data)
|
458
|
-
self.logger.debug("Got UDP Message from {}: {}".format(addr, resource_msg))
|
459
|
-
resource_msgs.put((resource_msg, addr))
|
460
|
-
except socket.timeout:
|
461
|
-
pass
|
462
|
-
|
463
|
-
try:
|
464
|
-
dfk_loop_start = time.time()
|
465
|
-
while time.time() - dfk_loop_start < 1.0: # TODO make configurable
|
466
|
-
# note that nothing checks that msg really is of the annotated type
|
467
|
-
msg: TaggedMonitoringMessage
|
468
|
-
msg = self.ic_channel.recv_pyobj()
|
469
|
-
|
470
|
-
assert isinstance(msg, tuple), "IC Channel expects only tuples, got {}".format(msg)
|
471
|
-
assert len(msg) >= 1, "IC Channel expects tuples of length at least 1, got {}".format(msg)
|
472
|
-
assert len(msg) == 2, "IC Channel expects message tuples of exactly length 2, got {}".format(msg)
|
473
|
-
|
474
|
-
msg_0: AddressedMonitoringMessage
|
475
|
-
msg_0 = (msg, 0)
|
476
|
-
|
477
|
-
if msg[0] == MessageType.NODE_INFO:
|
478
|
-
msg[1]['run_id'] = self.run_id
|
479
|
-
node_msgs.put(msg_0)
|
480
|
-
elif msg[0] == MessageType.RESOURCE_INFO:
|
481
|
-
resource_msgs.put(msg_0)
|
482
|
-
elif msg[0] == MessageType.BLOCK_INFO:
|
483
|
-
block_msgs.put(msg_0)
|
484
|
-
elif msg[0] == MessageType.TASK_INFO:
|
485
|
-
priority_msgs.put(msg_0)
|
486
|
-
elif msg[0] == MessageType.WORKFLOW_INFO:
|
487
|
-
priority_msgs.put(msg_0)
|
488
|
-
if 'exit_now' in msg[1] and msg[1]['exit_now']:
|
489
|
-
router_keep_going = False
|
490
|
-
else:
|
491
|
-
# There is a type: ignore here because if msg[0]
|
492
|
-
# is of the correct type, this code is unreachable,
|
493
|
-
# but there is no verification that the message
|
494
|
-
# received from ic_channel.recv_pyobj() is actually
|
495
|
-
# of that type.
|
496
|
-
self.logger.error(f"Discarding message from interchange with unknown type {msg[0].value}") # type: ignore[unreachable]
|
497
|
-
except zmq.Again:
|
498
|
-
pass
|
499
|
-
except Exception:
|
500
|
-
# This will catch malformed messages. What happens if the
|
501
|
-
# channel is broken in such a way that it always raises
|
502
|
-
# an exception? Looping on this would maybe be the wrong
|
503
|
-
# thing to do.
|
504
|
-
self.logger.warning("Failure processing a ZMQ message", exc_info=True)
|
505
|
-
|
506
|
-
self.logger.info("Monitoring router draining")
|
507
|
-
last_msg_received_time = time.time()
|
508
|
-
while time.time() - last_msg_received_time < self.atexit_timeout:
|
509
|
-
try:
|
510
|
-
data, addr = self.sock.recvfrom(2048)
|
511
|
-
msg = pickle.loads(data)
|
512
|
-
self.logger.debug("Got UDP Message from {}: {}".format(addr, msg))
|
513
|
-
resource_msgs.put((msg, addr))
|
514
|
-
last_msg_received_time = time.time()
|
515
|
-
except socket.timeout:
|
516
|
-
pass
|
517
|
-
|
518
|
-
self.logger.info("Monitoring router finishing normally")
|
519
|
-
finally:
|
520
|
-
self.logger.info("Monitoring router finished")
|
521
|
-
|
522
|
-
|
523
|
-
@wrap_with_logs
|
524
|
-
def router_starter(comm_q: "queue.Queue[Union[Tuple[int, int], str]]",
|
525
|
-
exception_q: "queue.Queue[Tuple[str, str]]",
|
526
|
-
priority_msgs: "queue.Queue[AddressedMonitoringMessage]",
|
527
|
-
node_msgs: "queue.Queue[AddressedMonitoringMessage]",
|
528
|
-
block_msgs: "queue.Queue[AddressedMonitoringMessage]",
|
529
|
-
resource_msgs: "queue.Queue[AddressedMonitoringMessage]",
|
530
|
-
|
531
|
-
hub_address: str,
|
532
|
-
hub_port: Optional[int],
|
533
|
-
hub_port_range: Tuple[int, int],
|
534
|
-
|
535
|
-
logdir: str,
|
536
|
-
logging_level: int,
|
537
|
-
run_id: str) -> None:
|
538
|
-
setproctitle("parsl: monitoring router")
|
539
|
-
try:
|
540
|
-
router = MonitoringRouter(hub_address=hub_address,
|
541
|
-
hub_port=hub_port,
|
542
|
-
hub_port_range=hub_port_range,
|
543
|
-
logdir=logdir,
|
544
|
-
logging_level=logging_level,
|
545
|
-
run_id=run_id)
|
546
|
-
except Exception as e:
|
547
|
-
logger.error("MonitoringRouter construction failed.", exc_info=True)
|
548
|
-
comm_q.put(f"Monitoring router construction failed: {e}")
|
549
|
-
else:
|
550
|
-
comm_q.put((router.hub_port, router.ic_port))
|
551
|
-
|
552
|
-
router.logger.info("Starting MonitoringRouter in router_starter")
|
553
|
-
try:
|
554
|
-
router.start(priority_msgs, node_msgs, block_msgs, resource_msgs)
|
555
|
-
except Exception as e:
|
556
|
-
router.logger.exception("router.start exception")
|
557
|
-
exception_q.put(('Hub', str(e)))
|