parsl 2024.7.22__py3-none-any.whl → 2024.8.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- parsl/dataflow/dflow.py +4 -10
- parsl/executors/base.py +8 -8
- parsl/executors/flux/executor.py +7 -7
- parsl/executors/high_throughput/executor.py +55 -55
- parsl/executors/high_throughput/interchange.py +37 -37
- parsl/executors/high_throughput/manager_record.py +1 -0
- parsl/executors/high_throughput/manager_selector.py +25 -0
- parsl/executors/high_throughput/process_worker_pool.py +2 -0
- parsl/executors/status_handling.py +52 -21
- parsl/executors/taskvine/executor.py +0 -18
- parsl/executors/workqueue/executor.py +0 -18
- parsl/monitoring/errors.py +6 -0
- parsl/monitoring/monitoring.py +6 -5
- parsl/monitoring/radios.py +23 -7
- parsl/monitoring/remote.py +12 -12
- parsl/monitoring/router.py +71 -30
- parsl/tests/test_htex/test_disconnected_blocks_failing_provider.py +71 -0
- parsl/tests/test_htex/test_htex.py +28 -19
- parsl/tests/test_htex/test_zmq_binding.py +2 -0
- parsl/tests/test_monitoring/test_basic.py +14 -1
- parsl/tests/test_monitoring/test_fuzz_zmq.py +2 -2
- parsl/tests/test_mpi_apps/test_mpiex.py +1 -1
- parsl/version.py +1 -1
- {parsl-2024.7.22.data → parsl-2024.8.5.data}/scripts/interchange.py +37 -37
- {parsl-2024.7.22.data → parsl-2024.8.5.data}/scripts/process_worker_pool.py +2 -0
- parsl-2024.8.5.dist-info/METADATA +101 -0
- {parsl-2024.7.22.dist-info → parsl-2024.8.5.dist-info}/RECORD +33 -30
- {parsl-2024.7.22.dist-info → parsl-2024.8.5.dist-info}/WHEEL +1 -1
- parsl-2024.7.22.dist-info/METADATA +0 -101
- {parsl-2024.7.22.data → parsl-2024.8.5.data}/scripts/exec_parsl_function.py +0 -0
- {parsl-2024.7.22.data → parsl-2024.8.5.data}/scripts/parsl_coprocess.py +0 -0
- {parsl-2024.7.22.dist-info → parsl-2024.8.5.dist-info}/LICENSE +0 -0
- {parsl-2024.7.22.dist-info → parsl-2024.8.5.dist-info}/entry_points.txt +0 -0
- {parsl-2024.7.22.dist-info → parsl-2024.8.5.dist-info}/top_level.txt +0 -0
parsl/dataflow/dflow.py
CHANGED
@@ -113,14 +113,10 @@ class DataFlowKernel:
         self.monitoring: Optional[MonitoringHub]
         self.monitoring = config.monitoring
 
-        # hub address and port for interchange to connect
-        self.hub_address = None  # type: Optional[str]
-        self.hub_zmq_port = None  # type: Optional[int]
         if self.monitoring:
             if self.monitoring.logdir is None:
                 self.monitoring.logdir = self.run_dir
-            self.hub_address = self.monitoring.hub_address
-            self.hub_zmq_port = self.monitoring.start(self.run_id, self.run_dir, self.config.run_dir)
+            self.monitoring.start(self.run_id, self.run_dir, self.config.run_dir)
 
         self.time_began = datetime.datetime.now()
         self.time_completed: Optional[datetime.datetime] = None
@@ -1181,10 +1177,10 @@ class DataFlowKernel:
         for executor in executors:
             executor.run_id = self.run_id
             executor.run_dir = self.run_dir
-            executor.hub_address = self.hub_address
-            executor.hub_zmq_port = self.hub_zmq_port
             if self.monitoring:
-                executor.monitoring_radio = self.monitoring.radio
+                executor.hub_address = self.monitoring.hub_address
+                executor.hub_zmq_port = self.monitoring.hub_zmq_port
+                executor.submit_monitoring_radio = self.monitoring.radio
             if hasattr(executor, 'provider'):
                 if hasattr(executor.provider, 'script_dir'):
                     executor.provider.script_dir = os.path.join(self.run_dir, 'submit_scripts')
@@ -1460,8 +1456,6 @@
         Returns:
             - dict containing, hashed -> future mappings
         """
-        self.memo_lookup_table = None
-
         if checkpointDirs:
             return self._load_checkpoints(checkpointDirs)
         else:
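With these changes, MonitoringHub.start() no longer returns the hub's ZMQ port, and the DataFlowKernel stops caching hub coordinates itself: executors read them straight off the configured hub. A rough sketch of the resulting wiring, using the attribute names visible in the hunks above (dfk and executor are illustrative stand-ins, not names from the file):

    # Sketch only: mirrors the executor wiring shown above.
    if dfk.monitoring:
        executor.hub_address = dfk.monitoring.hub_address        # published by the hub itself
        executor.hub_zmq_port = dfk.monitoring.hub_zmq_port      # no longer returned by start()
        executor.submit_monitoring_radio = dfk.monitoring.radio  # renamed from monitoring_radio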
parsl/executors/base.py
CHANGED
@@ -5,7 +5,7 @@ from typing import Any, Callable, Dict, Optional
 
 from typing_extensions import Literal, Self
 
-from parsl.monitoring.radios import MonitoringRadio
+from parsl.monitoring.radios import MonitoringRadioSender
 
 
 class ParslExecutor(metaclass=ABCMeta):
@@ -52,13 +52,13 @@ class ParslExecutor(metaclass=ABCMeta):
         *,
         hub_address: Optional[str] = None,
         hub_zmq_port: Optional[int] = None,
-        monitoring_radio: Optional[MonitoringRadio] = None,
+        submit_monitoring_radio: Optional[MonitoringRadioSender] = None,
         run_dir: str = ".",
         run_id: Optional[str] = None,
     ):
         self.hub_address = hub_address
         self.hub_zmq_port = hub_zmq_port
-        self.monitoring_radio = monitoring_radio
+        self.submit_monitoring_radio = submit_monitoring_radio
         self.run_dir = os.path.abspath(run_dir)
         self.run_id = run_id
 
@@ -147,11 +147,11 @@ class ParslExecutor(metaclass=ABCMeta):
         self._hub_zmq_port = value
 
     @property
-    def monitoring_radio(self) -> Optional[MonitoringRadio]:
+    def submit_monitoring_radio(self) -> Optional[MonitoringRadioSender]:
         """Local radio for sending monitoring messages
         """
-        return self._monitoring_radio
+        return self._submit_monitoring_radio
 
-    @monitoring_radio.setter
-    def monitoring_radio(self, value: Optional[MonitoringRadio]) -> None:
-        self._monitoring_radio = value
+    @submit_monitoring_radio.setter
+    def submit_monitoring_radio(self, value: Optional[MonitoringRadioSender]) -> None:
+        self._submit_monitoring_radio = value
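The rename means code that previously read executor.monitoring_radio should now use executor.submit_monitoring_radio. A minimal, hedged sketch of sending through the radio, with the message shape borrowed from the interchange changes later in this diff (the payload dict here is invented for illustration):

    from parsl.monitoring.message_type import MessageType

    # Sketch: `executor` is any ParslExecutor wired up by the DataFlowKernel.
    if executor.submit_monitoring_radio is not None:
        # Monitoring messages are (MessageType, payload) pairs.
        executor.submit_monitoring_radio.send((MessageType.NODE_INFO, {"example": "payload"}))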
parsl/executors/flux/executor.py
CHANGED
@@ -200,7 +200,6 @@ class FluxExecutor(ParslExecutor, RepresentationMixin):
             raise EnvironmentError("Cannot find Flux installation in PATH")
         self.flux_path = os.path.abspath(flux_path)
         self._task_id_counter = itertools.count()
-        self._socket = zmq.Context().socket(zmq.REP)
         # Assumes a launch command cannot be None or empty
         self.launch_cmd = launch_cmd or self.DEFAULT_LAUNCH_CMD
         self._submission_queue: queue.Queue = queue.Queue()
@@ -213,7 +212,6 @@ class FluxExecutor(ParslExecutor, RepresentationMixin):
             args=(
                 self._submission_queue,
                 self._stop_event,
-                self._socket,
                 self.working_dir,
                 self.flux_executor_kwargs,
                 self.provider,
@@ -306,11 +304,13 @@ def _submit_wrapper(
 
     If an exception is thrown, error out all submitted tasks.
     """
-    try:
-        _submit_flux_jobs(submission_queue, stop_event, *args, **kwargs)
-    except Exception as exc:
-        _error_out_jobs(submission_queue, stop_event, exc)
-        raise
+    with zmq.Context() as ctx:
+        with ctx.socket(zmq.REP) as socket:
+            try:
+                _submit_flux_jobs(submission_queue, stop_event, socket, *args, **kwargs)
+            except Exception as exc:
+                _error_out_jobs(submission_queue, stop_event, exc)
+                raise
 
 
 def _error_out_jobs(
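The effect of this change is that the REP socket is created, used, and closed entirely inside the submission thread, rather than being constructed in __init__ and handed across a thread boundary. A minimal sketch of the lifecycle pattern the new code relies on (endpoint and names are illustrative; pyzmq contexts and sockets are context managers, as the new code itself demonstrates):

    import zmq

    def thread_main() -> None:
        # Create ZMQ resources on the thread that uses them: pyzmq sockets
        # are not safe to share between threads.
        with zmq.Context() as ctx:                 # terminated on exit
            with ctx.socket(zmq.REP) as sock:      # closed on exit
                sock.bind("tcp://127.0.0.1:5555")  # illustrative endpoint
                request = sock.recv()
                sock.send(b"ack: " + request)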
parsl/executors/high_throughput/executor.py
CHANGED
@@ -20,6 +20,10 @@ from parsl.data_provider.staging import Staging
 from parsl.executors.errors import BadMessage, ScalingFailed
 from parsl.executors.high_throughput import zmq_pipes
 from parsl.executors.high_throughput.errors import CommandClientTimeoutError
+from parsl.executors.high_throughput.manager_selector import (
+    ManagerSelector,
+    RandomManagerSelector,
+)
 from parsl.executors.high_throughput.mpi_prefix_composer import (
     VALID_LAUNCHERS,
     validate_resource_spec,
@@ -56,7 +60,7 @@ DEFAULT_LAUNCH_CMD = ("process_worker_pool.py {debug} {max_workers_per_node} "
                       "--mpi-launcher={mpi_launcher} "
                       "--available-accelerators {accelerators}")
 
-DEFAULT_INTERCHANGE_LAUNCH_CMD = "interchange.py"
+DEFAULT_INTERCHANGE_LAUNCH_CMD = ["interchange.py"]
 
 GENERAL_HTEX_PARAM_DOCS = """provider : :class:`~parsl.providers.base.ExecutionProvider`
        Provider to access computation resources. Can be one of :class:`~parsl.providers.aws.aws.EC2Provider`,
@@ -78,9 +82,9 @@ GENERAL_HTEX_PARAM_DOCS = """provider : :class:`~parsl.providers.base.ExecutionProvider`
        cores_per_worker, nodes_per_block, heartbeat_period ,heartbeat_threshold, logdir). For example:
        launch_cmd="process_worker_pool.py {debug} -c {cores_per_worker} --task_url={task_url} --result_url={result_url}"
 
-    interchange_launch_cmd : str
-        Custom command line string to launch the interchange process from the executor. If undefined,
-        the executor will use the default "interchange.py" command.
+    interchange_launch_cmd : Sequence[str]
+        Custom sequence of command line tokens to launch the interchange process from the executor. If
+        undefined, the executor will use the default "interchange.py" command.
 
     address : string
         An address to connect to the main Parsl process which is reachable from the network in which
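Since interchange_launch_cmd is now a sequence of tokens rather than a single shell string, wrapping the interchange in another command no longer involves shell quoting. A hedged example of the new parameter; the wrapper tokens are illustrative only:

    from parsl.executors import HighThroughputExecutor

    # Sketch: each token in the sequence is passed to the interchange
    # process as-is, with no shell interpretation.
    htex = HighThroughputExecutor(
        label="htex_wrapped_interchange",
        interchange_launch_cmd=["valgrind", "--quiet", "interchange.py"],
    )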
@@ -238,7 +242,7 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin, UsageInformation):
                 label: str = 'HighThroughputExecutor',
                 provider: ExecutionProvider = LocalProvider(),
                 launch_cmd: Optional[str] = None,
-                interchange_launch_cmd: Optional[str] = None,
+                interchange_launch_cmd: Optional[Sequence[str]] = None,
                 address: Optional[str] = None,
                 worker_ports: Optional[Tuple[int, int]] = None,
                 worker_port_range: Optional[Tuple[int, int]] = (54000, 55000),
@@ -261,6 +265,7 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin, UsageInformation):
                 worker_logdir_root: Optional[str] = None,
                 enable_mpi_mode: bool = False,
                 mpi_launcher: str = "mpiexec",
+                manager_selector: ManagerSelector = RandomManagerSelector(),
                 block_error_handler: Union[bool, Callable[[BlockProviderExecutor, Dict[str, JobStatus]], None]] = True,
                 encrypted: bool = False):
 
@@ -276,6 +281,7 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin, UsageInformation):
         self.prefetch_capacity = prefetch_capacity
         self.address = address
         self.address_probe_timeout = address_probe_timeout
+        self.manager_selector = manager_selector
         if self.address:
             self.all_addresses = address
         else:
@@ -456,8 +462,6 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin, UsageInformation):
               "task_id" : <task_id>
               "exception" : serialized exception object, on failure
            }
-
-        The `None` message is a die request.
         """
         logger.debug("Result queue worker starting")
 
@@ -475,58 +479,53 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin, UsageInformation):
 
             else:
 
-                if msgs is None:
-                    logger.debug("Got None, exiting")
-                    return
+                for serialized_msg in msgs:
+                    try:
+                        msg = pickle.loads(serialized_msg)
+                    except pickle.UnpicklingError:
+                        raise BadMessage("Message received could not be unpickled")
 
-                else:
-                    for serialized_msg in msgs:
+                    if msg['type'] == 'heartbeat':
+                        continue
+                    elif msg['type'] == 'result':
                         try:
-                            msg = pickle.loads(serialized_msg)
-                        except pickle.UnpicklingError:
-                            raise BadMessage("Message received could not be unpickled")
+                            tid = msg['task_id']
+                        except Exception:
+                            raise BadMessage("Message received does not contain 'task_id' field")
+
+                        if tid == -1 and 'exception' in msg:
+                            logger.warning("Executor shutting down due to exception from interchange")
+                            exception = deserialize(msg['exception'])
+                            self.set_bad_state_and_fail_all(exception)
+                            break
+
+                        task_fut = self.tasks.pop(tid)
+
+                        if 'result' in msg:
+                            result = deserialize(msg['result'])
+                            task_fut.set_result(result)
 
-                        if msg['type'] == 'heartbeat':
-                            continue
-                        elif msg['type'] == 'result':
+                        elif 'exception' in msg:
                             try:
-                                tid = msg['task_id']
-                            except Exception:
-                                raise BadMessage("Message received does not contain 'task_id' field")
-
-                            if tid == -1 and 'exception' in msg:
-                                logger.warning("Executor shutting down due to exception from interchange")
-                                exception = deserialize(msg['exception'])
-                                self.set_bad_state_and_fail_all(exception)
-                                break
-
-                            task_fut = self.tasks.pop(tid)
-
-                            if 'result' in msg:
-                                result = deserialize(msg['result'])
-                                task_fut.set_result(result)
-
-                            elif 'exception' in msg:
-                                try:
-                                    s = deserialize(msg['exception'])
-                                    # s should be a RemoteExceptionWrapper... so we can reraise it
-                                    if isinstance(s, RemoteExceptionWrapper):
-                                        try:
-                                            s.reraise()
-                                        except Exception as e:
-                                            task_fut.set_exception(e)
-                                    elif isinstance(s, Exception):
-                                        task_fut.set_exception(s)
-                                    else:
-                                        raise ValueError("Unknown exception-like type received: {}".format(type(s)))
-                                except Exception as e:
-                                    # TODO could be a proper wrapped exception?
-                                    task_fut.set_exception(
-                                        DeserializationError("Received exception, but handling also threw an exception: {}".format(e)))
-                            else:
-                                raise BadMessage("Message received is neither result or exception")
+                                s = deserialize(msg['exception'])
+                                # s should be a RemoteExceptionWrapper... so we can reraise it
+                                if isinstance(s, RemoteExceptionWrapper):
+                                    try:
+                                        s.reraise()
+                                    except Exception as e:
+                                        task_fut.set_exception(e)
+                                elif isinstance(s, Exception):
+                                    task_fut.set_exception(s)
+                                else:
+                                    raise ValueError("Unknown exception-like type received: {}".format(type(s)))
+                            except Exception as e:
+                                # TODO could be a proper wrapped exception?
+                                task_fut.set_exception(
+                                    DeserializationError("Received exception, but handling also threw an exception: {}".format(e)))
                         else:
-                            raise BadMessage("Message received with unknown type {}".format(msg['type']))
+                            raise BadMessage("Message received is neither result or exception")
+                    else:
+                        raise BadMessage("Message received with unknown type {}".format(msg['type']))
 
         logger.info("Result queue worker finished")
 
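For reference, the messages this worker handles are pickled dicts tagged with a 'type' field, as described in the docstring hunk above. A hedged sketch of well-formed messages, using only field names that appear in the code (parsl.serialize.serialize stands in for whatever serializer produced the payload):

    import pickle

    from parsl.serialize import serialize

    heartbeat_msg = pickle.dumps({'type': 'heartbeat'})
    result_msg = pickle.dumps({
        'type': 'result',
        'task_id': 42,                 # task_id == -1 signals a fatal interchange exception
        'result': serialize("hello"),  # or 'exception': a serialized exception object
    })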
@@ -551,11 +550,12 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin, UsageInformation):
             "poll_period": self.poll_period,
             "logging_level": logging.DEBUG if self.worker_debug else logging.INFO,
             "cert_dir": self.cert_dir,
+            "manager_selector": self.manager_selector,
         }
 
         config_pickle = pickle.dumps(interchange_config)
 
-        self.interchange_proc = subprocess.Popen(self.interchange_launch_cmd…
+        self.interchange_proc = subprocess.Popen(self.interchange_launch_cmd, stdin=subprocess.PIPE)
         stdin = self.interchange_proc.stdin
         assert stdin is not None, "Popen should have created an IO object (vs default None) because of PIPE mode"
parsl/executors/high_throughput/interchange.py
CHANGED
@@ -6,7 +6,6 @@ import os
 import pickle
 import platform
 import queue
-import random
 import signal
 import sys
 import threading
@@ -19,7 +18,9 @@ from parsl import curvezmq
 from parsl.app.errors import RemoteExceptionWrapper
 from parsl.executors.high_throughput.errors import ManagerLost, VersionMismatch
 from parsl.executors.high_throughput.manager_record import ManagerRecord
+from parsl.executors.high_throughput.manager_selector import ManagerSelector
 from parsl.monitoring.message_type import MessageType
+from parsl.monitoring.radios import MonitoringRadioSender, ZMQRadioSender
 from parsl.process_loggers import wrap_with_logs
 from parsl.serialize import serialize as serialize_object
 from parsl.utils import setproctitle
@@ -53,6 +54,7 @@ class Interchange:
                 logging_level: int,
                 poll_period: int,
                 cert_dir: Optional[str],
+                manager_selector: ManagerSelector,
                 ) -> None:
         """
         Parameters
@@ -160,6 +162,8 @@ class Interchange:
 
         self.heartbeat_threshold = heartbeat_threshold
 
+        self.manager_selector = manager_selector
+
         self.current_platform = {'parsl_v': PARSL_VERSION,
                                  'python_v': "{}.{}.{}".format(sys.version_info.major,
                                                                sys.version_info.minor,
@@ -216,27 +220,15 @@ class Interchange:
             task_counter += 1
             logger.debug(f"Fetched {task_counter} tasks so far")
 
-    def _create_monitoring_channel(self) -> Optional[zmq.Socket]:
-        if self.hub_address and self.hub_zmq_port:
-            logger.info("Connecting to MonitoringHub")
-            # This is a one-off because monitoring is unencrypted
-            hub_channel = zmq.Context().socket(zmq.DEALER)
-            hub_channel.set_hwm(0)
-            hub_channel.connect("tcp://{}:{}".format(self.hub_address, self.hub_zmq_port))
-            logger.info("Connected to MonitoringHub")
-            return hub_channel
-        else:
-            return None
-
-    def _send_monitoring_info(self, hub_channel: Optional[zmq.Socket], manager: ManagerRecord) -> None:
-        if hub_channel:
+    def _send_monitoring_info(self, monitoring_radio: Optional[MonitoringRadioSender], manager: ManagerRecord) -> None:
+        if monitoring_radio:
             logger.info("Sending message {} to MonitoringHub".format(manager))
 
             d: Dict = cast(Dict, manager.copy())
             d['timestamp'] = datetime.datetime.now()
             d['last_heartbeat'] = datetime.datetime.fromtimestamp(d['last_heartbeat'])
 
-            hub_channel.send_pyobj((MessageType.NODE_INFO, d))
+            monitoring_radio.send((MessageType.NODE_INFO, d))
 
     @wrap_with_logs(target="interchange")
     def _command_server(self) -> NoReturn:
@@ -244,8 +236,11 @@ class Interchange:
         """
         logger.debug("Command Server Starting")
 
-        # Need to create a new ZMQ socket for command server thread
-        hub_channel = self._create_monitoring_channel()
+        if self.hub_address is not None and self.hub_zmq_port is not None:
+            logger.debug("Creating monitoring radio to %s:%s", self.hub_address, self.hub_zmq_port)
+            monitoring_radio = ZMQRadioSender(self.hub_address, self.hub_zmq_port)
+        else:
+            monitoring_radio = None
 
         reply: Any  # the type of reply depends on the command_req received (aka this needs dependent types...)
 
@@ -295,7 +290,7 @@ class Interchange:
                 if manager_id in self._ready_managers:
                     m = self._ready_managers[manager_id]
                     m['active'] = False
-                    self._send_monitoring_info(hub_channel, m)
+                    self._send_monitoring_info(monitoring_radio, m)
                 else:
                     logger.warning("Worker to hold was not in ready managers list")
 
@@ -330,9 +325,14 @@ class Interchange:
         # parent-process-inheritance problems.
         signal.signal(signal.SIGTERM, signal.SIG_DFL)
 
-        logger.info("Incoming ports bound")
+        logger.info("Starting main interchange method")
 
-        hub_channel = self._create_monitoring_channel()
+        if self.hub_address is not None and self.hub_zmq_port is not None:
+            logger.debug("Creating monitoring radio to %s:%s", self.hub_address, self.hub_zmq_port)
+            monitoring_radio = ZMQRadioSender(self.hub_address, self.hub_zmq_port)
+            logger.debug("Created monitoring radio")
+        else:
+            monitoring_radio = None
 
         poll_period = self.poll_period
 
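Both the command-server thread and the main loop now construct their own ZMQRadioSender instead of sharing a DEALER socket built by a helper method, keeping each ZMQ socket on the thread that uses it. A sketch of the sender in isolation, with hypothetical hub coordinates and a payload shaped like the NODE_INFO messages in this file:

    from parsl.monitoring.message_type import MessageType
    from parsl.monitoring.radios import ZMQRadioSender

    # Hypothetical coordinates; the interchange uses self.hub_address
    # and self.hub_zmq_port.
    radio = ZMQRadioSender("127.0.0.1", 55055)
    radio.send((MessageType.NODE_INFO, {"block_id": "0", "active": True}))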
@@ -363,10 +363,10 @@ class Interchange:
         while not kill_event.is_set():
             self.socks = dict(poller.poll(timeout=poll_period))
 
-            self.process_task_outgoing_incoming(interesting_managers, hub_channel, kill_event)
-            self.process_results_incoming(interesting_managers, hub_channel)
-            self.expire_bad_managers(interesting_managers, hub_channel)
-            self.expire_drained_managers(interesting_managers, hub_channel)
+            self.process_task_outgoing_incoming(interesting_managers, monitoring_radio, kill_event)
+            self.process_results_incoming(interesting_managers, monitoring_radio)
+            self.expire_bad_managers(interesting_managers, monitoring_radio)
+            self.expire_drained_managers(interesting_managers, monitoring_radio)
             self.process_tasks_to_send(interesting_managers)
 
         self.zmq_context.destroy()
@@ -377,7 +377,7 @@ class Interchange:
     def process_task_outgoing_incoming(
             self,
             interesting_managers: Set[bytes],
-            hub_channel: Optional[zmq.Socket],
+            monitoring_radio: Optional[MonitoringRadioSender],
            kill_event: threading.Event
     ) -> None:
         """Process one message from manager on the task_outgoing channel.
@@ -410,6 +410,7 @@ class Interchange:
                 self._ready_managers[manager_id] = {'last_heartbeat': time.time(),
                                                     'idle_since': time.time(),
                                                     'block_id': None,
+                                                    'start_time': msg['start_time'],
                                                     'max_capacity': 0,
                                                     'worker_count': 0,
                                                     'active': True,
@@ -430,7 +431,7 @@ class Interchange:
                 m.update(msg)  # type: ignore[typeddict-item]
 
                 logger.info("Registration info for manager {!r}: {}".format(manager_id, msg))
-                self._send_monitoring_info(hub_channel, m)
+                self._send_monitoring_info(monitoring_radio, m)
 
                 if (msg['python_v'].rsplit(".", 1)[0] != self.current_platform['python_v'].rsplit(".", 1)[0] or
                     msg['parsl_v'] != self.current_platform['parsl_v']):
@@ -461,7 +462,7 @@ class Interchange:
             logger.error(f"Unexpected message type received from manager: {msg['type']}")
         logger.debug("leaving task_outgoing section")
 
-    def expire_drained_managers(self, interesting_managers: Set[bytes], hub_channel: Optional[zmq.Socket]) -> None:
+    def expire_drained_managers(self, interesting_managers: Set[bytes], monitoring_radio: Optional[MonitoringRadioSender]) -> None:
 
         for manager_id in list(interesting_managers):
             # is it always true that a draining manager will be in interesting managers?
@@ -474,7 +475,7 @@ class Interchange:
                 self._ready_managers.pop(manager_id)
 
                 m['active'] = False
-                self._send_monitoring_info(hub_channel, m)
+                self._send_monitoring_info(monitoring_radio, m)
 
     def process_tasks_to_send(self, interesting_managers: Set[bytes]) -> None:
         # Check if there are tasks that could be sent to managers
@@ -484,8 +485,7 @@ class Interchange:
                 interesting=len(interesting_managers)))
 
         if interesting_managers and not self.pending_task_queue.empty():
-            shuffled_managers = list(interesting_managers)
-            random.shuffle(shuffled_managers)
+            shuffled_managers = self.manager_selector.sort_managers(self._ready_managers, interesting_managers)
 
             while shuffled_managers and not self.pending_task_queue.empty():  # cf. the if statement above...
                 manager_id = shuffled_managers.pop()
@@ -518,7 +518,7 @@ class Interchange:
         else:
             logger.debug("either no interesting managers or no tasks, so skipping manager pass")
 
-    def process_results_incoming(self, interesting_managers: Set[bytes], hub_channel: Optional[zmq.Socket]) -> None:
+    def process_results_incoming(self, interesting_managers: Set[bytes], monitoring_radio: Optional[MonitoringRadioSender]) -> None:
         # Receive any results and forward to client
         if self.results_incoming in self.socks and self.socks[self.results_incoming] == zmq.POLLIN:
             logger.debug("entering results_incoming section")
@@ -538,11 +538,11 @@ class Interchange:
                 elif r['type'] == 'monitoring':
                     # the monitoring code makes the assumption that no
                     # monitoring messages will be received if monitoring
-                    # is not configured, and that hub_channel will only
+                    # is not configured, and that monitoring_radio will only
                     # be None when monitoring is not configurated.
-                    assert hub_channel is not None
+                    assert monitoring_radio is not None
 
-                    hub_channel.send_pyobj(r['payload'])
+                    monitoring_radio.send(r['payload'])
                 elif r['type'] == 'heartbeat':
                     logger.debug(f"Manager {manager_id!r} sent heartbeat via results connection")
                     b_messages.append((p_message, r))
@@ -586,7 +586,7 @@ class Interchange:
                     interesting_managers.add(manager_id)
             logger.debug("leaving results_incoming section")
 
-    def expire_bad_managers(self, interesting_managers: Set[bytes], hub_channel: Optional[zmq.Socket]) -> None:
+    def expire_bad_managers(self, interesting_managers: Set[bytes], monitoring_radio: Optional[MonitoringRadioSender]) -> None:
         bad_managers = [(manager_id, m) for (manager_id, m) in self._ready_managers.items() if
                         time.time() - m['last_heartbeat'] > self.heartbeat_threshold]
         for (manager_id, m) in bad_managers:
@@ -594,7 +594,7 @@ class Interchange:
             logger.warning(f"Too many heartbeats missed for manager {manager_id!r} - removing manager")
             if m['active']:
                 m['active'] = False
-                self._send_monitoring_info(hub_channel, m)
+                self._send_monitoring_info(monitoring_radio, m)
 
             logger.warning(f"Cancelling htex tasks {m['tasks']} on removed manager")
             for tid in m['tasks']:
parsl/executors/high_throughput/manager_selector.py
ADDED
@@ -0,0 +1,25 @@
+import random
+from abc import ABCMeta, abstractmethod
+from typing import Dict, List, Set
+
+from parsl.executors.high_throughput.manager_record import ManagerRecord
+
+
+class ManagerSelector(metaclass=ABCMeta):
+
+    @abstractmethod
+    def sort_managers(self, ready_managers: Dict[bytes, ManagerRecord], manager_list: Set[bytes]) -> List[bytes]:
+        """ Sort a given list of managers.
+
+        Any operations pertaining to the sorting and rearrangement of the
+        interesting_managers Set should be performed here.
+        """
+        pass
+
+
+class RandomManagerSelector(ManagerSelector):
+
+    def sort_managers(self, ready_managers: Dict[bytes, ManagerRecord], manager_list: Set[bytes]) -> List[bytes]:
+        c_manager_list = list(manager_list)
+        random.shuffle(c_manager_list)
+        return c_manager_list
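Because process_tasks_to_send pops managers from the end of the list this hook returns, a custom selector can implement a placement policy. A hedged sketch of one possibility, preferring the longest-running managers and relying on the 'start_time' field that this same release adds to the manager record (the class name is invented for illustration):

    from typing import Dict, List, Set

    from parsl.executors.high_throughput.manager_record import ManagerRecord
    from parsl.executors.high_throughput.manager_selector import ManagerSelector


    class OldestFirstManagerSelector(ManagerSelector):
        """Illustrative only: dispatch to the oldest manager first.

        The interchange pops from the end of the returned list, so sorting
        newest-first places the oldest manager at the popped end.
        """

        def sort_managers(self, ready_managers: Dict[bytes, ManagerRecord], manager_list: Set[bytes]) -> List[bytes]:
            return sorted(manager_list, key=lambda mid: ready_managers[mid]['start_time'], reverse=True)

    # Hypothetical usage:
    #   HighThroughputExecutor(manager_selector=OldestFirstManagerSelector())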
parsl/executors/high_throughput/process_worker_pool.py
CHANGED
@@ -184,6 +184,7 @@ class Manager:
 
         self.uid = uid
         self.block_id = block_id
+        self.start_time = time.time()
 
         self.enable_mpi_mode = enable_mpi_mode
         self.mpi_launcher = mpi_launcher
@@ -263,6 +264,7 @@ class Manager:
                'worker_count': self.worker_count,
                'uid': self.uid,
                'block_id': self.block_id,
+               'start_time': self.start_time,
                'prefetch_capacity': self.prefetch_capacity,
                'max_capacity': self.worker_count + self.prefetch_capacity,
                'os': platform.system(),