parsl 2024.3.18__py3-none-any.whl → 2024.4.1__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
- parsl/dataflow/dflow.py +35 -36
- parsl/executors/base.py +11 -1
- parsl/executors/high_throughput/executor.py +8 -20
- parsl/executors/high_throughput/process_worker_pool.py +5 -2
- parsl/executors/status_handling.py +8 -15
- parsl/executors/taskvine/executor.py +35 -11
- parsl/executors/workqueue/executor.py +33 -11
- parsl/jobs/error_handlers.py +1 -1
- parsl/jobs/job_status_poller.py +12 -11
- parsl/jobs/strategy.py +31 -18
- parsl/monitoring/monitoring.py +27 -237
- parsl/monitoring/router.py +208 -0
- parsl/tests/site_tests/test_provider.py +1 -1
- parsl/tests/test_htex/test_disconnected_blocks.py +0 -1
- parsl/tests/test_htex/test_drain.py +1 -0
- parsl/tests/test_monitoring/test_fuzz_zmq.py +2 -2
- parsl/tests/test_monitoring/test_htex_init_blocks_vs_monitoring.py +85 -0
- parsl/tests/test_python_apps/test_context_manager.py +40 -0
- parsl/tests/test_scaling/test_shutdown_scalein.py +78 -0
- parsl/tests/test_shutdown/test_kill_monitoring.py +65 -0
- parsl/version.py +1 -1
- {parsl-2024.3.18.data → parsl-2024.4.1.data}/scripts/process_worker_pool.py +5 -2
- {parsl-2024.3.18.dist-info → parsl-2024.4.1.dist-info}/METADATA +4 -4
- {parsl-2024.3.18.dist-info → parsl-2024.4.1.dist-info}/RECORD +35 -30
- /parsl/tests/{test_data → test_shutdown}/__init__.py +0 -0
- /parsl/tests/{test_data → test_staging}/test_file.py +0 -0
- /parsl/tests/{test_data → test_staging}/test_file_apps.py +0 -0
- /parsl/tests/{test_data → test_staging}/test_file_staging.py +0 -0
- /parsl/tests/{test_data → test_staging}/test_output_chain_filenames.py +0 -0
- {parsl-2024.3.18.data → parsl-2024.4.1.data}/scripts/exec_parsl_function.py +0 -0
- {parsl-2024.3.18.data → parsl-2024.4.1.data}/scripts/parsl_coprocess.py +0 -0
- {parsl-2024.3.18.dist-info → parsl-2024.4.1.dist-info}/LICENSE +0 -0
- {parsl-2024.3.18.dist-info → parsl-2024.4.1.dist-info}/WHEEL +0 -0
- {parsl-2024.3.18.dist-info → parsl-2024.4.1.dist-info}/entry_points.txt +0 -0
- {parsl-2024.3.18.dist-info → parsl-2024.4.1.dist-info}/top_level.txt +0 -0
parsl/jobs/strategy.py
CHANGED
@@ -129,8 +129,8 @@ class Strategy:
         self.executors = {}
         self.max_idletime = max_idletime

-        self.strategies = {None: self.
-                           'none': self.
+        self.strategies = {None: self._strategy_init_only,
+                           'none': self._strategy_init_only,
                            'simple': self._strategy_simple,
                            'htex_auto_scale': self._strategy_htex_auto_scale}

@@ -146,15 +146,22 @@ class Strategy:
         for executor in executors:
             self.executors[executor.label] = {'idle_since': None}

-    def
-        """
+    def _strategy_init_only(self, executor_facades: List[jsp.PolledExecutorFacade]) -> None:
+        """Scale up to init_blocks at the start, then nothing more.
         """
-
+        for ef in executor_facades:
+            if ef.first:
+                executor = ef.executor
+                logger.debug(f"strategy_init_only: scaling out {executor.provider.init_blocks} initial blocks for {executor.label}")
+                ef.scale_out(executor.provider.init_blocks)
+                ef.first = False
+            else:
+                logger.debug("strategy_init_only: doing nothing")

-    def _strategy_simple(self,
-        self._general_strategy(
+    def _strategy_simple(self, executor_facades: List[jsp.PolledExecutorFacade]) -> None:
+        self._general_strategy(executor_facades, strategy_type='simple')

-    def _strategy_htex_auto_scale(self,
+    def _strategy_htex_auto_scale(self, executor_facades: List[jsp.PolledExecutorFacade]) -> None:
         """HTEX specific auto scaling strategy

         This strategy works only for HTEX. This strategy will scale out by
@@ -169,24 +176,30 @@ class Strategy:
         expected to scale in effectively only when # of workers, or tasks executing
         per block is close to 1.
         """
-        self._general_strategy(
+        self._general_strategy(executor_facades, strategy_type='htex')

     @wrap_with_logs
-    def _general_strategy(self,
-        logger.debug(f"general strategy starting with strategy_type {strategy_type} for {len(
+    def _general_strategy(self, executor_facades, *, strategy_type):
+        logger.debug(f"general strategy starting with strategy_type {strategy_type} for {len(executor_facades)} executors")

-        for
-            executor =
+        for ef in executor_facades:
+            executor = ef.executor
             label = executor.label
             if not isinstance(executor, BlockProviderExecutor):
                 logger.debug(f"Not strategizing for executor {label} because scaling not enabled")
                 continue
             logger.debug(f"Strategizing for executor {label}")

+            if ef.first:
+                executor = ef.executor
+                logger.debug(f"Scaling out {executor.provider.init_blocks} initial blocks for {label}")
+                ef.scale_out(executor.provider.init_blocks)
+                ef.first = False
+
             # Tasks that are either pending completion
             active_tasks = executor.outstanding

-            status =
+            status = ef.status

             # FIXME we need to handle case where provider does not define these
             # FIXME probably more of this logic should be moved to the provider
@@ -242,7 +255,7 @@ class Strategy:
                         # We have resources idle for the max duration,
                         # we have to scale_in now.
                         logger.debug(f"Idle time has reached {self.max_idletime}s for executor {label}; scaling in")
-
+                        ef.scale_in(active_blocks - min_blocks)

                     else:
                         logger.debug(
@@ -265,7 +278,7 @@
                     excess_blocks = math.ceil(float(excess_slots) / (tasks_per_node * nodes_per_block))
                     excess_blocks = min(excess_blocks, max_blocks - active_blocks)
                     logger.debug(f"Requesting {excess_blocks} more blocks")
-
+                    ef.scale_out(excess_blocks)

             elif active_slots == 0 and active_tasks > 0:
                 logger.debug("Strategy case 4a: No active slots but some active tasks - could scale out by a single block")
@@ -274,7 +287,7 @@
                 if active_blocks < max_blocks:
                     logger.debug("Requesting single block")

-
+                    ef.scale_out(1)
                 else:
                     logger.debug("Not requesting single block, because at maxblocks already")

@@ -290,7 +303,7 @@
                     excess_blocks = math.ceil(float(excess_slots) / (tasks_per_node * nodes_per_block))
                     excess_blocks = min(excess_blocks, active_blocks - min_blocks)
                     logger.debug(f"Requesting scaling in by {excess_blocks} blocks with idle time {self.max_idletime}s")
-
+                    ef.scale_in(excess_blocks, max_idletime=self.max_idletime)
                 else:
                     logger.error("This strategy does not support scaling in except for HighThroughputExecutor - taking no action")
             else:
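
The strategy names registered in self.strategies above correspond to the strategy option of a Parsl Config, so after this change the None/'none' strategies still provision init_blocks once and then hold steady instead of doing nothing at all. A minimal sketch of selecting that behavior (the executor and provider values here are illustrative, not taken from this diff):

from parsl.config import Config
from parsl.executors import HighThroughputExecutor
from parsl.providers import LocalProvider

config = Config(
    executors=[
        HighThroughputExecutor(
            label="htex_local",
            provider=LocalProvider(
                init_blocks=2,   # scaled out once by _strategy_init_only
                min_blocks=0,
                max_blocks=4,
            ),
        ),
    ],
    strategy="none",  # maps to _strategy_init_only after this change
)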
parsl/monitoring/monitoring.py
CHANGED
@@ -1,17 +1,13 @@
 from __future__ import annotations

 import os
-import socket
 import time
-import pickle
 import logging
 import typeguard
 import zmq

 import queue

-import parsl.monitoring.remote
-
 from parsl.multiprocessing import ForkProcess, SizedQueue
 from multiprocessing import Process
 from multiprocessing.queues import Queue
@@ -22,9 +18,10 @@ from parsl.utils import setproctitle

 from parsl.serialize import deserialize

+from parsl.monitoring.router import router_starter
 from parsl.monitoring.message_type import MessageType
-from parsl.monitoring.types import AddressedMonitoringMessage
-from typing import cast, Any,
+from parsl.monitoring.types import AddressedMonitoringMessage
+from typing import cast, Any, Optional, Tuple, Union, TYPE_CHECKING

 _db_manager_excepts: Optional[Exception]

@@ -93,8 +90,6 @@ class MonitoringHub(RepresentationMixin):
              Default: 30 seconds
         """

-        self.logger = logger
-
         # Any is used to disable typechecking on uses of _dfk_channel,
         # because it is used in the code as if it points to a channel, but
         # the static type is that it can also be None. The code relies on
@@ -120,6 +115,8 @@ class MonitoringHub(RepresentationMixin):

     def start(self, run_id: str, dfk_run_dir: str, config_run_dir: Union[str, os.PathLike]) -> int:

+        logger.debug("Starting MonitoringHub")
+
         if self.logdir is None:
             self.logdir = "."

@@ -128,9 +125,6 @@ class MonitoringHub(RepresentationMixin):

         os.makedirs(self.logdir, exist_ok=True)

-        # Initialize the ZMQ pipe to the Parsl Client
-
-        self.logger.debug("Initializing ZMQ Pipes to client")
         self.monitoring_hub_active = True

         # This annotation is incompatible with typeguard 4.x instrumentation
@@ -166,8 +160,8 @@ class MonitoringHub(RepresentationMixin):
         self.router_proc = ForkProcess(target=router_starter,
                                        args=(comm_q, self.exception_q, self.priority_msgs, self.node_msgs, self.block_msgs, self.resource_msgs),
                                        kwargs={"hub_address": self.hub_address,
-                                               "
-                                               "
+                                               "udp_port": self.hub_port,
+                                               "zmq_port_range": self.hub_port_range,
                                                "logdir": self.logdir,
                                                "logging_level": logging.DEBUG if self.monitoring_debug else logging.INFO,
                                                "run_id": run_id
@@ -187,7 +181,7 @@ class MonitoringHub(RepresentationMixin):
                                     daemon=True,
                                     )
         self.dbm_proc.start()
-
+        logger.info("Started the router process {} and DBM process {}".format(self.router_proc.pid, self.dbm_proc.pid))

         self.filesystem_proc = Process(target=filesystem_receiver,
                                        args=(self.logdir, self.resource_msgs, dfk_run_dir),
@@ -195,19 +189,19 @@ class MonitoringHub(RepresentationMixin):
                                        daemon=True
                                        )
         self.filesystem_proc.start()
-
+        logger.info(f"Started filesystem radio receiver process {self.filesystem_proc.pid}")

         try:
             comm_q_result = comm_q.get(block=True, timeout=120)
         except queue.Empty:
-
+            logger.error("Hub has not completed initialization in 120s. Aborting")
             raise Exception("Hub failed to start")

         if isinstance(comm_q_result, str):
-
+            logger.error(f"MonitoringRouter sent an error message: {comm_q_result}")
             raise RuntimeError(f"MonitoringRouter failed to start: {comm_q_result}")

-        udp_port,
+        udp_port, zmq_port = comm_q_result

         self.monitoring_hub_url = "udp://{}:{}".format(self.hub_address, udp_port)

@@ -217,28 +211,28 @@ class MonitoringHub(RepresentationMixin):
         self._dfk_channel.setsockopt(zmq.LINGER, 0)
         self._dfk_channel.set_hwm(0)
         self._dfk_channel.setsockopt(zmq.SNDTIMEO, self.dfk_channel_timeout)
-        self._dfk_channel.connect("tcp://{}:{}".format(self.hub_address,
+        self._dfk_channel.connect("tcp://{}:{}".format(self.hub_address, zmq_port))

-
+        logger.info("Monitoring Hub initialized")

-        return
+        return zmq_port

     # TODO: tighten the Any message format
     def send(self, mtype: MessageType, message: Any) -> None:
-
+        logger.debug("Sending message type {}".format(mtype))
         try:
             self._dfk_channel.send_pyobj((mtype, message))
         except zmq.Again:
-
+            logger.exception(
                 "The monitoring message sent from DFK to router timed-out after {}ms".format(self.dfk_channel_timeout))

     def close(self) -> None:
-
+        logger.info("Terminating Monitoring Hub")
         exception_msgs = []
         while True:
             try:
                 exception_msgs.append(self.exception_q.get(block=False))
-
+                logger.error("There was a queued exception (Either router or DBM process got exception much earlier?)")
             except queue.Empty:
                 break
         if self._dfk_channel and self.monitoring_hub_active:
@@ -246,7 +240,7 @@
             self._dfk_channel.close()
         if exception_msgs:
             for exception_msg in exception_msgs:
-
+                logger.error(
                     "{} process delivered an exception: {}. Terminating all monitoring processes immediately.".format(
                         exception_msg[0],
                         exception_msg[1]
@@ -255,41 +249,24 @@
             self.router_proc.terminate()
             self.dbm_proc.terminate()
             self.filesystem_proc.terminate()
-
+        logger.info("Waiting for router to terminate")
         self.router_proc.join()
-
+        logger.debug("Finished waiting for router termination")
         if len(exception_msgs) == 0:
-
+            logger.debug("Sending STOP to DBM")
             self.priority_msgs.put(("STOP", 0))
         else:
-
-
+            logger.debug("Not sending STOP to DBM, because there were DBM exceptions")
+        logger.debug("Waiting for DB termination")
         self.dbm_proc.join()
-
+        logger.debug("Finished waiting for DBM termination")

         # should this be message based? it probably doesn't need to be if
         # we believe we've received all messages
-
+        logger.info("Terminating filesystem radio receiver process")
         self.filesystem_proc.terminate()
         self.filesystem_proc.join()

-    @staticmethod
-    def monitor_wrapper(f: Any,
-                        args: Sequence,
-                        kwargs: Dict,
-                        try_id: int,
-                        task_id: int,
-                        monitoring_hub_url: str,
-                        run_id: str,
-                        logging_level: int,
-                        sleep_dur: float,
-                        radio_mode: str,
-                        monitor_resources: bool,
-                        run_dir: str) -> Tuple[Callable, Sequence, Dict]:
-        return parsl.monitoring.remote.monitor_wrapper(f, args, kwargs, try_id, task_id, monitoring_hub_url,
-                                                       run_id, logging_level, sleep_dur, radio_mode,
-                                                       monitor_resources, run_dir)
-

 @wrap_with_logs
 def filesystem_receiver(logdir: str, q: "queue.Queue[AddressedMonitoringMessage]", run_dir: str) -> None:
@@ -325,190 +302,3 @@ def filesystem_receiver(logdir: str, q: "queue.Queue[AddressedMonitoringMessage]
             logger.exception(f"Exception processing {filename} - probably will be retried next iteration")

         time.sleep(1)  # whats a good time for this poll?
-
-
-class MonitoringRouter:
-
-    def __init__(self,
-                 *,
-                 hub_address: str,
-                 hub_port: Optional[int] = None,
-                 hub_port_range: Tuple[int, int] = (55050, 56000),
-
-                 monitoring_hub_address: str = "127.0.0.1",
-                 logdir: str = ".",
-                 run_id: str,
-                 logging_level: int = logging.INFO,
-                 atexit_timeout: int = 3    # in seconds
-                 ):
-        """ Initializes a monitoring configuration class.
-
-        Parameters
-        ----------
-        hub_address : str
-             The ip address at which the workers will be able to reach the Hub.
-        hub_port : int
-             The specific port at which workers will be able to reach the Hub via UDP. Default: None
-        hub_port_range : tuple(int, int)
-             The MonitoringHub picks ports at random from the range which will be used by Hub.
-             This is overridden when the hub_port option is set. Default: (55050, 56000)
-        logdir : str
-             Parsl log directory paths. Logs and temp files go here. Default: '.'
-        logging_level : int
-             Logging level as defined in the logging module. Default: logging.INFO
-        atexit_timeout : float, optional
-            The amount of time in seconds to terminate the hub without receiving any messages, after the last dfk workflow message is received.

-        """
-        os.makedirs(logdir, exist_ok=True)
-        self.logger = set_file_logger("{}/monitoring_router.log".format(logdir),
-                                      name="monitoring_router",
-                                      level=logging_level)
-        self.logger.debug("Monitoring router starting")
-
-        self.hub_address = hub_address
-        self.atexit_timeout = atexit_timeout
-        self.run_id = run_id
-
-        self.loop_freq = 10.0  # milliseconds
-
-        # Initialize the UDP socket
-        self.sock = socket.socket(socket.AF_INET,
-                                  socket.SOCK_DGRAM,
-                                  socket.IPPROTO_UDP)
-
-        # We are trying to bind to all interfaces with 0.0.0.0
-        if not hub_port:
-            self.sock.bind(('0.0.0.0', 0))
-            self.hub_port = self.sock.getsockname()[1]
-        else:
-            self.hub_port = hub_port
-            try:
-                self.sock.bind(('0.0.0.0', self.hub_port))
-            except Exception as e:
-                raise RuntimeError(f"Could not bind to hub_port {hub_port} because: {e}")
-        self.sock.settimeout(self.loop_freq / 1000)
-        self.logger.info("Initialized the UDP socket on 0.0.0.0:{}".format(self.hub_port))
-
-        self._context = zmq.Context()
-        self.ic_channel = self._context.socket(zmq.DEALER)
-        self.ic_channel.setsockopt(zmq.LINGER, 0)
-        self.ic_channel.set_hwm(0)
-        self.ic_channel.RCVTIMEO = int(self.loop_freq)  # in milliseconds
-        self.logger.debug("hub_address: {}. hub_port_range {}".format(hub_address, hub_port_range))
-        self.ic_port = self.ic_channel.bind_to_random_port("tcp://*",
-                                                           min_port=hub_port_range[0],
-                                                           max_port=hub_port_range[1])
-
-    def start(self,
-              priority_msgs: "queue.Queue[AddressedMonitoringMessage]",
-              node_msgs: "queue.Queue[AddressedMonitoringMessage]",
-              block_msgs: "queue.Queue[AddressedMonitoringMessage]",
-              resource_msgs: "queue.Queue[AddressedMonitoringMessage]") -> None:
-        try:
-            router_keep_going = True
-            while router_keep_going:
-                try:
-                    data, addr = self.sock.recvfrom(2048)
-                    resource_msg = pickle.loads(data)
-                    self.logger.debug("Got UDP Message from {}: {}".format(addr, resource_msg))
-                    resource_msgs.put((resource_msg, addr))
-                except socket.timeout:
-                    pass
-
-                try:
-                    dfk_loop_start = time.time()
-                    while time.time() - dfk_loop_start < 1.0:  # TODO make configurable
-                        # note that nothing checks that msg really is of the annotated type
-                        msg: TaggedMonitoringMessage
-                        msg = self.ic_channel.recv_pyobj()
-
-                        assert isinstance(msg, tuple), "IC Channel expects only tuples, got {}".format(msg)
-                        assert len(msg) >= 1, "IC Channel expects tuples of length at least 1, got {}".format(msg)
-                        assert len(msg) == 2, "IC Channel expects message tuples of exactly length 2, got {}".format(msg)
-
-                        msg_0: AddressedMonitoringMessage
-                        msg_0 = (msg, 0)
-
-                        if msg[0] == MessageType.NODE_INFO:
-                            msg[1]['run_id'] = self.run_id
-                            node_msgs.put(msg_0)
-                        elif msg[0] == MessageType.RESOURCE_INFO:
-                            resource_msgs.put(msg_0)
-                        elif msg[0] == MessageType.BLOCK_INFO:
-                            block_msgs.put(msg_0)
-                        elif msg[0] == MessageType.TASK_INFO:
-                            priority_msgs.put(msg_0)
-                        elif msg[0] == MessageType.WORKFLOW_INFO:
-                            priority_msgs.put(msg_0)
-                            if 'exit_now' in msg[1] and msg[1]['exit_now']:
-                                router_keep_going = False
-                        else:
-                            # There is a type: ignore here because if msg[0]
-                            # is of the correct type, this code is unreachable,
-                            # but there is no verification that the message
-                            # received from ic_channel.recv_pyobj() is actually
-                            # of that type.
-                            self.logger.error("Discarding message "  # type: ignore[unreachable]
-                                              f"from interchange with unknown type {msg[0].value}")
-                except zmq.Again:
-                    pass
-                except Exception:
-                    # This will catch malformed messages. What happens if the
-                    # channel is broken in such a way that it always raises
-                    # an exception? Looping on this would maybe be the wrong
-                    # thing to do.
-                    self.logger.warning("Failure processing a ZMQ message", exc_info=True)
-
-            self.logger.info("Monitoring router draining")
-            last_msg_received_time = time.time()
-            while time.time() - last_msg_received_time < self.atexit_timeout:
-                try:
-                    data, addr = self.sock.recvfrom(2048)
-                    msg = pickle.loads(data)
-                    self.logger.debug("Got UDP Message from {}: {}".format(addr, msg))
-                    resource_msgs.put((msg, addr))
-                    last_msg_received_time = time.time()
-                except socket.timeout:
-                    pass
-
-            self.logger.info("Monitoring router finishing normally")
-        finally:
-            self.logger.info("Monitoring router finished")
-
-
-@wrap_with_logs
-def router_starter(comm_q: "queue.Queue[Union[Tuple[int, int], str]]",
-                   exception_q: "queue.Queue[Tuple[str, str]]",
-                   priority_msgs: "queue.Queue[AddressedMonitoringMessage]",
-                   node_msgs: "queue.Queue[AddressedMonitoringMessage]",
-                   block_msgs: "queue.Queue[AddressedMonitoringMessage]",
-                   resource_msgs: "queue.Queue[AddressedMonitoringMessage]",
-
-                   hub_address: str,
-                   hub_port: Optional[int],
-                   hub_port_range: Tuple[int, int],
-
-                   logdir: str,
-                   logging_level: int,
-                   run_id: str) -> None:
-    setproctitle("parsl: monitoring router")
-    try:
-        router = MonitoringRouter(hub_address=hub_address,
-                                  hub_port=hub_port,
-                                  hub_port_range=hub_port_range,
-                                  logdir=logdir,
-                                  logging_level=logging_level,
-                                  run_id=run_id)
-    except Exception as e:
-        logger.error("MonitoringRouter construction failed.", exc_info=True)
-        comm_q.put(f"Monitoring router construction failed: {e}")
-    else:
-        comm_q.put((router.hub_port, router.ic_port))
-
-        router.logger.info("Starting MonitoringRouter in router_starter")
-        try:
-            router.start(priority_msgs, node_msgs, block_msgs, resource_msgs)
-        except Exception as e:
-            router.logger.exception("router.start exception")
-            exception_q.put(('Hub', str(e)))