parsl 2024.7.29__py3-none-any.whl → 2024.8.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- parsl/dataflow/dflow.py +1 -1
- parsl/executors/base.py +7 -7
- parsl/executors/high_throughput/executor.py +13 -6
- parsl/executors/high_throughput/interchange.py +36 -37
- parsl/executors/high_throughput/manager_selector.py +25 -0
- parsl/executors/status_handling.py +38 -24
- parsl/monitoring/errors.py +6 -0
- parsl/monitoring/monitoring.py +2 -1
- parsl/monitoring/radios.py +16 -0
- parsl/monitoring/router.py +71 -30
- parsl/tests/test_htex/test_disconnected_blocks_failing_provider.py +71 -0
- parsl/tests/test_htex/test_htex.py +28 -19
- parsl/tests/test_htex/test_zmq_binding.py +2 -0
- parsl/tests/test_monitoring/test_basic.py +14 -1
- parsl/tests/test_mpi_apps/test_mpiex.py +1 -1
- parsl/version.py +1 -1
- {parsl-2024.7.29.data → parsl-2024.8.5.data}/scripts/interchange.py +36 -37
- parsl-2024.8.5.dist-info/METADATA +101 -0
- {parsl-2024.7.29.dist-info → parsl-2024.8.5.dist-info}/RECORD +26 -23
- {parsl-2024.7.29.dist-info → parsl-2024.8.5.dist-info}/WHEEL +1 -1
- parsl-2024.7.29.dist-info/METADATA +0 -101
- {parsl-2024.7.29.data → parsl-2024.8.5.data}/scripts/exec_parsl_function.py +0 -0
- {parsl-2024.7.29.data → parsl-2024.8.5.data}/scripts/parsl_coprocess.py +0 -0
- {parsl-2024.7.29.data → parsl-2024.8.5.data}/scripts/process_worker_pool.py +0 -0
- {parsl-2024.7.29.dist-info → parsl-2024.8.5.dist-info}/LICENSE +0 -0
- {parsl-2024.7.29.dist-info → parsl-2024.8.5.dist-info}/entry_points.txt +0 -0
- {parsl-2024.7.29.dist-info → parsl-2024.8.5.dist-info}/top_level.txt +0 -0
parsl/dataflow/dflow.py
CHANGED
@@ -1180,7 +1180,7 @@ class DataFlowKernel:
 if self.monitoring:
     executor.hub_address = self.monitoring.hub_address
     executor.hub_zmq_port = self.monitoring.hub_zmq_port
-    executor.
+    executor.submit_monitoring_radio = self.monitoring.radio
 if hasattr(executor, 'provider'):
     if hasattr(executor.provider, 'script_dir'):
         executor.provider.script_dir = os.path.join(self.run_dir, 'submit_scripts')
parsl/executors/base.py
CHANGED
@@ -52,13 +52,13 @@ class ParslExecutor(metaclass=ABCMeta):
         *,
         hub_address: Optional[str] = None,
         hub_zmq_port: Optional[int] = None,
-
+        submit_monitoring_radio: Optional[MonitoringRadioSender] = None,
         run_dir: str = ".",
         run_id: Optional[str] = None,
     ):
         self.hub_address = hub_address
         self.hub_zmq_port = hub_zmq_port
-        self.
+        self.submit_monitoring_radio = submit_monitoring_radio
         self.run_dir = os.path.abspath(run_dir)
         self.run_id = run_id
 
@@ -147,11 +147,11 @@ class ParslExecutor(metaclass=ABCMeta):
         self._hub_zmq_port = value
 
     @property
-    def
+    def submit_monitoring_radio(self) -> Optional[MonitoringRadioSender]:
         """Local radio for sending monitoring messages
         """
-        return self.
+        return self._submit_monitoring_radio
 
-    @
-    def
-        self.
+    @submit_monitoring_radio.setter
+    def submit_monitoring_radio(self, value: Optional[MonitoringRadioSender]) -> None:
+        self._submit_monitoring_radio = value
parsl/executors/high_throughput/executor.py
CHANGED
@@ -20,6 +20,10 @@ from parsl.data_provider.staging import Staging
 from parsl.executors.errors import BadMessage, ScalingFailed
 from parsl.executors.high_throughput import zmq_pipes
 from parsl.executors.high_throughput.errors import CommandClientTimeoutError
+from parsl.executors.high_throughput.manager_selector import (
+    ManagerSelector,
+    RandomManagerSelector,
+)
 from parsl.executors.high_throughput.mpi_prefix_composer import (
     VALID_LAUNCHERS,
     validate_resource_spec,
@@ -56,7 +60,7 @@ DEFAULT_LAUNCH_CMD = ("process_worker_pool.py {debug} {max_workers_per_node} "
 "--mpi-launcher={mpi_launcher} "
 "--available-accelerators {accelerators}")
 
-DEFAULT_INTERCHANGE_LAUNCH_CMD = "interchange.py"
+DEFAULT_INTERCHANGE_LAUNCH_CMD = ["interchange.py"]
 
 GENERAL_HTEX_PARAM_DOCS = """provider : :class:`~parsl.providers.base.ExecutionProvider`
     Provider to access computation resources. Can be one of :class:`~parsl.providers.aws.aws.EC2Provider`,
@@ -78,9 +82,9 @@ GENERAL_HTEX_PARAM_DOCS = """provider : :class:`~parsl.providers.base.ExecutionP
     cores_per_worker, nodes_per_block, heartbeat_period ,heartbeat_threshold, logdir). For example:
     launch_cmd="process_worker_pool.py {debug} -c {cores_per_worker} --task_url={task_url} --result_url={result_url}"
 
-interchange_launch_cmd : str
-    Custom command line
-    the executor will use the default "interchange.py" command.
+interchange_launch_cmd : Sequence[str]
+    Custom sequence of command line tokens to launch the interchange process from the executor. If
+    undefined, the executor will use the default "interchange.py" command.
 
 address : string
     An address to connect to the main Parsl process which is reachable from the network in which
@@ -238,7 +242,7 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin, UsageIn
 label: str = 'HighThroughputExecutor',
 provider: ExecutionProvider = LocalProvider(),
 launch_cmd: Optional[str] = None,
-interchange_launch_cmd: Optional[str] = None,
+interchange_launch_cmd: Optional[Sequence[str]] = None,
 address: Optional[str] = None,
 worker_ports: Optional[Tuple[int, int]] = None,
 worker_port_range: Optional[Tuple[int, int]] = (54000, 55000),
@@ -261,6 +265,7 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin, UsageIn
 worker_logdir_root: Optional[str] = None,
 enable_mpi_mode: bool = False,
 mpi_launcher: str = "mpiexec",
+manager_selector: ManagerSelector = RandomManagerSelector(),
 block_error_handler: Union[bool, Callable[[BlockProviderExecutor, Dict[str, JobStatus]], None]] = True,
 encrypted: bool = False):
 
@@ -276,6 +281,7 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin, UsageIn
 self.prefetch_capacity = prefetch_capacity
 self.address = address
 self.address_probe_timeout = address_probe_timeout
+self.manager_selector = manager_selector
 if self.address:
     self.all_addresses = address
 else:
@@ -544,11 +550,12 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin, UsageIn
     "poll_period": self.poll_period,
     "logging_level": logging.DEBUG if self.worker_debug else logging.INFO,
     "cert_dir": self.cert_dir,
+    "manager_selector": self.manager_selector,
 }
 
 config_pickle = pickle.dumps(interchange_config)
 
-self.interchange_proc = subprocess.Popen(self.interchange_launch_cmd
+self.interchange_proc = subprocess.Popen(self.interchange_launch_cmd, stdin=subprocess.PIPE)
 stdin = self.interchange_proc.stdin
 assert stdin is not None, "Popen should have created an IO object (vs default None) because of PIPE mode"
 
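Example (not part of the diff): a minimal sketch of how the reworked executor options above might be passed; the label value is a placeholder.

    from parsl.executors import HighThroughputExecutor
    from parsl.executors.high_throughput.manager_selector import RandomManagerSelector

    htex = HighThroughputExecutor(
        label="htex_example",                       # placeholder label
        interchange_launch_cmd=["interchange.py"],  # now a sequence of argv tokens rather than a single string
        manager_selector=RandomManagerSelector(),   # new pluggable manager-selection strategy
    )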
parsl/executors/high_throughput/interchange.py
CHANGED
@@ -6,7 +6,6 @@ import os
 import pickle
 import platform
 import queue
-import random
 import signal
 import sys
 import threading
@@ -19,7 +18,9 @@ from parsl import curvezmq
 from parsl.app.errors import RemoteExceptionWrapper
 from parsl.executors.high_throughput.errors import ManagerLost, VersionMismatch
 from parsl.executors.high_throughput.manager_record import ManagerRecord
+from parsl.executors.high_throughput.manager_selector import ManagerSelector
 from parsl.monitoring.message_type import MessageType
+from parsl.monitoring.radios import MonitoringRadioSender, ZMQRadioSender
 from parsl.process_loggers import wrap_with_logs
 from parsl.serialize import serialize as serialize_object
 from parsl.utils import setproctitle
@@ -53,6 +54,7 @@ class Interchange:
 logging_level: int,
 poll_period: int,
 cert_dir: Optional[str],
+manager_selector: ManagerSelector,
 ) -> None:
 """
 Parameters
@@ -160,6 +162,8 @@ class Interchange:
 
 self.heartbeat_threshold = heartbeat_threshold
 
+self.manager_selector = manager_selector
+
 self.current_platform = {'parsl_v': PARSL_VERSION,
                          'python_v': "{}.{}.{}".format(sys.version_info.major,
                                                        sys.version_info.minor,
@@ -216,27 +220,15 @@ class Interchange:
 task_counter += 1
 logger.debug(f"Fetched {task_counter} tasks so far")
 
-def
-    if
-        logger.info("Connecting to MonitoringHub")
-        # This is a one-off because monitoring is unencrypted
-        hub_channel = zmq.Context().socket(zmq.DEALER)
-        hub_channel.set_hwm(0)
-        hub_channel.connect("tcp://{}:{}".format(self.hub_address, self.hub_zmq_port))
-        logger.info("Connected to MonitoringHub")
-        return hub_channel
-    else:
-        return None
-
-def _send_monitoring_info(self, hub_channel: Optional[zmq.Socket], manager: ManagerRecord) -> None:
-    if hub_channel:
+def _send_monitoring_info(self, monitoring_radio: Optional[MonitoringRadioSender], manager: ManagerRecord) -> None:
+    if monitoring_radio:
         logger.info("Sending message {} to MonitoringHub".format(manager))
 
         d: Dict = cast(Dict, manager.copy())
         d['timestamp'] = datetime.datetime.now()
         d['last_heartbeat'] = datetime.datetime.fromtimestamp(d['last_heartbeat'])
 
-
+        monitoring_radio.send((MessageType.NODE_INFO, d))
 
 @wrap_with_logs(target="interchange")
 def _command_server(self) -> NoReturn:
@@ -244,8 +236,11 @@ class Interchange:
 """
 logger.debug("Command Server Starting")
 
-
-
+if self.hub_address is not None and self.hub_zmq_port is not None:
+    logger.debug("Creating monitoring radio to %s:%s", self.hub_address, self.hub_zmq_port)
+    monitoring_radio = ZMQRadioSender(self.hub_address, self.hub_zmq_port)
+else:
+    monitoring_radio = None
 
 reply: Any  # the type of reply depends on the command_req received (aka this needs dependent types...)
 
@@ -295,7 +290,7 @@ class Interchange:
 if manager_id in self._ready_managers:
     m = self._ready_managers[manager_id]
     m['active'] = False
-    self._send_monitoring_info(
+    self._send_monitoring_info(monitoring_radio, m)
 else:
     logger.warning("Worker to hold was not in ready managers list")
 
@@ -330,9 +325,14 @@ class Interchange:
 # parent-process-inheritance problems.
 signal.signal(signal.SIGTERM, signal.SIG_DFL)
 
-logger.info("
+logger.info("Starting main interchange method")
 
-
+if self.hub_address is not None and self.hub_zmq_port is not None:
+    logger.debug("Creating monitoring radio to %s:%s", self.hub_address, self.hub_zmq_port)
+    monitoring_radio = ZMQRadioSender(self.hub_address, self.hub_zmq_port)
+    logger.debug("Created monitoring radio")
+else:
+    monitoring_radio = None
 
 poll_period = self.poll_period
 
@@ -363,10 +363,10 @@ class Interchange:
 while not kill_event.is_set():
     self.socks = dict(poller.poll(timeout=poll_period))
 
-    self.process_task_outgoing_incoming(interesting_managers,
-    self.process_results_incoming(interesting_managers,
-    self.expire_bad_managers(interesting_managers,
-    self.expire_drained_managers(interesting_managers,
+    self.process_task_outgoing_incoming(interesting_managers, monitoring_radio, kill_event)
+    self.process_results_incoming(interesting_managers, monitoring_radio)
+    self.expire_bad_managers(interesting_managers, monitoring_radio)
+    self.expire_drained_managers(interesting_managers, monitoring_radio)
     self.process_tasks_to_send(interesting_managers)
 
 self.zmq_context.destroy()
@@ -377,7 +377,7 @@ class Interchange:
 def process_task_outgoing_incoming(
         self,
         interesting_managers: Set[bytes],
-
+        monitoring_radio: Optional[MonitoringRadioSender],
         kill_event: threading.Event
 ) -> None:
     """Process one message from manager on the task_outgoing channel.
@@ -431,7 +431,7 @@ class Interchange:
 m.update(msg)  # type: ignore[typeddict-item]
 
 logger.info("Registration info for manager {!r}: {}".format(manager_id, msg))
-self._send_monitoring_info(
+self._send_monitoring_info(monitoring_radio, m)
 
 if (msg['python_v'].rsplit(".", 1)[0] != self.current_platform['python_v'].rsplit(".", 1)[0] or
         msg['parsl_v'] != self.current_platform['parsl_v']):
@@ -462,7 +462,7 @@ class Interchange:
         logger.error(f"Unexpected message type received from manager: {msg['type']}")
     logger.debug("leaving task_outgoing section")
 
-def expire_drained_managers(self, interesting_managers: Set[bytes],
+def expire_drained_managers(self, interesting_managers: Set[bytes], monitoring_radio: Optional[MonitoringRadioSender]) -> None:
 
     for manager_id in list(interesting_managers):
         # is it always true that a draining manager will be in interesting managers?
@@ -475,7 +475,7 @@ class Interchange:
 self._ready_managers.pop(manager_id)
 
 m['active'] = False
-self._send_monitoring_info(
+self._send_monitoring_info(monitoring_radio, m)
 
 def process_tasks_to_send(self, interesting_managers: Set[bytes]) -> None:
     # Check if there are tasks that could be sent to managers
@@ -485,8 +485,7 @@ class Interchange:
                      interesting=len(interesting_managers)))
 
 if interesting_managers and not self.pending_task_queue.empty():
-    shuffled_managers =
-    random.shuffle(shuffled_managers)
+    shuffled_managers = self.manager_selector.sort_managers(self._ready_managers, interesting_managers)
 
     while shuffled_managers and not self.pending_task_queue.empty():  # cf. the if statement above...
         manager_id = shuffled_managers.pop()
@@ -519,7 +518,7 @@ class Interchange:
 else:
     logger.debug("either no interesting managers or no tasks, so skipping manager pass")
 
-def process_results_incoming(self, interesting_managers: Set[bytes],
+def process_results_incoming(self, interesting_managers: Set[bytes], monitoring_radio: Optional[MonitoringRadioSender]) -> None:
     # Receive any results and forward to client
     if self.results_incoming in self.socks and self.socks[self.results_incoming] == zmq.POLLIN:
         logger.debug("entering results_incoming section")
@@ -539,11 +538,11 @@ class Interchange:
 elif r['type'] == 'monitoring':
     # the monitoring code makes the assumption that no
     # monitoring messages will be received if monitoring
-    # is not configured, and that
+    # is not configured, and that monitoring_radio will only
     # be None when monitoring is not configurated.
-    assert
+    assert monitoring_radio is not None
 
-
+    monitoring_radio.send(r['payload'])
 elif r['type'] == 'heartbeat':
     logger.debug(f"Manager {manager_id!r} sent heartbeat via results connection")
     b_messages.append((p_message, r))
@@ -587,7 +586,7 @@ class Interchange:
         interesting_managers.add(manager_id)
     logger.debug("leaving results_incoming section")
 
-def expire_bad_managers(self, interesting_managers: Set[bytes],
+def expire_bad_managers(self, interesting_managers: Set[bytes], monitoring_radio: Optional[MonitoringRadioSender]) -> None:
     bad_managers = [(manager_id, m) for (manager_id, m) in self._ready_managers.items() if
                     time.time() - m['last_heartbeat'] > self.heartbeat_threshold]
     for (manager_id, m) in bad_managers:
@@ -595,7 +594,7 @@ class Interchange:
 logger.warning(f"Too many heartbeats missed for manager {manager_id!r} - removing manager")
 if m['active']:
     m['active'] = False
-    self._send_monitoring_info(
+    self._send_monitoring_info(monitoring_radio, m)
 
 logger.warning(f"Cancelling htex tasks {m['tasks']} on removed manager")
 for tid in m['tasks']:
parsl/executors/high_throughput/manager_selector.py
ADDED
@@ -0,0 +1,25 @@
+import random
+from abc import ABCMeta, abstractmethod
+from typing import Dict, List, Set
+
+from parsl.executors.high_throughput.manager_record import ManagerRecord
+
+
+class ManagerSelector(metaclass=ABCMeta):
+
+    @abstractmethod
+    def sort_managers(self, ready_managers: Dict[bytes, ManagerRecord], manager_list: Set[bytes]) -> List[bytes]:
+        """ Sort a given list of managers.
+
+        Any operations pertaining to the sorting and rearrangement of the
+        interesting_managers Set should be performed here.
+        """
+        pass
+
+
+class RandomManagerSelector(ManagerSelector):
+
+    def sort_managers(self, ready_managers: Dict[bytes, ManagerRecord], manager_list: Set[bytes]) -> List[bytes]:
+        c_manager_list = list(manager_list)
+        random.shuffle(c_manager_list)
+        return c_manager_list
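Example (hypothetical, not part of the package): a selector showing how this new extension point could be used. It hands managers out in a fixed, reproducible order instead of shuffling, and would be plugged in via HighThroughputExecutor(manager_selector=SortedManagerSelector()).

    from typing import Dict, List, Set

    from parsl.executors.high_throughput.manager_record import ManagerRecord
    from parsl.executors.high_throughput.manager_selector import ManagerSelector


    class SortedManagerSelector(ManagerSelector):
        """Hypothetical selector: deterministic ordering by manager id bytes."""

        def sort_managers(self, ready_managers: Dict[bytes, ManagerRecord], manager_list: Set[bytes]) -> List[bytes]:
            # ready_managers is ignored in this toy; only the ids are ordered.
            return sorted(manager_list)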
parsl/executors/status_handling.py
CHANGED
@@ -12,7 +12,7 @@ from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple, Union
 from parsl.executors.base import ParslExecutor
 from parsl.executors.errors import BadStateException, ScalingFailed
 from parsl.jobs.error_handlers import noop_error_handler, simple_error_handler
-from parsl.jobs.states import JobState, JobStatus
+from parsl.jobs.states import TERMINAL_STATES, JobState, JobStatus
 from parsl.monitoring.message_type import MessageType
 from parsl.providers.base import ExecutionProvider
 from parsl.utils import AtomicIDCounter
@@ -167,40 +167,50 @@ class BlockProviderExecutor(ParslExecutor):
 def provider(self):
     return self._provider
 
-def _filter_scale_in_ids(self, to_kill, killed):
+def _filter_scale_in_ids(self, to_kill: Sequence[Any], killed: Sequence[bool]) -> Sequence[Any]:
     """ Filter out job id's that were not killed
     """
     assert len(to_kill) == len(killed)
+
+    if False in killed:
+        killed_job_ids = [jid for jid, k in zip(to_kill, killed) if k]
+        not_killed_job_ids = [jid for jid, k in zip(to_kill, killed) if not k]
+        logger.warning("Some jobs were not killed successfully: "
+                       f"killed jobs: {killed_job_ids}, "
+                       f"not-killed jobs: {not_killed_job_ids}")
+
     # Filters first iterable by bool values in second
     return list(compress(to_kill, killed))
 
 def scale_out_facade(self, n: int) -> List[str]:
-    block_ids = self._scale_out(n)
-    if block_ids is not None:
-        new_status = {}
-        for block_id in block_ids:
-            new_status[block_id] = JobStatus(JobState.PENDING)
-        self.send_monitoring_info(new_status)
-        self._status.update(new_status)
-    return block_ids
-
-def _scale_out(self, blocks: int = 1) -> List[str]:
     """Scales out the number of blocks by "blocks"
     """
     if not self.provider:
         raise ScalingFailed(self, "No execution provider available")
     block_ids = []
-
-
+    monitoring_status_changes = {}
+    logger.info(f"Scaling out by {n} blocks")
+    for _ in range(n):
         block_id = str(self._block_id_counter.get_id())
         logger.info(f"Allocated block ID {block_id}")
         try:
             job_id = self._launch_block(block_id)
+
+            pending_status = JobStatus(JobState.PENDING)
+
             self.blocks_to_job_id[block_id] = job_id
             self.job_ids_to_block[job_id] = block_id
+            self._status[block_id] = pending_status
+
+            monitoring_status_changes[block_id] = pending_status
             block_ids.append(block_id)
+
         except Exception as ex:
-
+            failed_status = JobStatus(JobState.FAILED, "Failed to start block {}: {}".format(block_id, ex))
+            self._simulated_status[block_id] = failed_status
+            self._status[block_id] = failed_status
+
+    self.send_monitoring_info(monitoring_status_changes)
     return block_ids
 
 def scale_in(self, blocks: int) -> List[str]:
@@ -215,16 +225,20 @@ class BlockProviderExecutor(ParslExecutor):
 
 :return: A list of block ids corresponding to the blocks that were removed.
 """
-
-
-
+
+active_blocks = [block_id for block_id, status in self._status.items()
+                 if status.state not in TERMINAL_STATES]
+
+block_ids_to_kill = active_blocks[:blocks]
+
+job_ids_to_kill = [self.blocks_to_job_id[block] for block in block_ids_to_kill]
 
 # Cancel the blocks provisioned
 if self.provider:
-    logger.info(f"Scaling in jobs: {
-    r = self.provider.cancel(
-    job_ids = self._filter_scale_in_ids(
-    block_ids_killed = [self.job_ids_to_block[
+    logger.info(f"Scaling in jobs: {job_ids_to_kill}")
+    r = self.provider.cancel(job_ids_to_kill)
+    job_ids = self._filter_scale_in_ids(job_ids_to_kill, r)
+    block_ids_killed = [self.job_ids_to_block[job_id] for job_id in job_ids]
     return block_ids_killed
 else:
     logger.error("No execution provider available to scale in")
@@ -262,10 +276,10 @@ class BlockProviderExecutor(ParslExecutor):
 
 def send_monitoring_info(self, status: Dict) -> None:
     # Send monitoring info for HTEX when monitoring enabled
-    if self.
+    if self.submit_monitoring_radio:
         msg = self.create_monitoring_info(status)
         logger.debug("Sending block monitoring message: %r", msg)
-        self.
+        self.submit_monitoring_radio.send((MessageType.BLOCK_INFO, msg))
 
 def create_monitoring_info(self, status: Dict[str, JobStatus]) -> Sequence[object]:
     """Create a monitoring message for each block based on the poll status.
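A toy illustration (values are made up) of the filtering that _filter_scale_in_ids performs: only the job ids whose cancel attempt reported success are kept, using itertools.compress.

    from itertools import compress

    to_kill = ["job-1", "job-2", "job-3"]
    killed = [True, False, True]

    # "job-2" is dropped and, with this release, is also reported in the warning log.
    assert list(compress(to_kill, killed)) == ["job-1", "job-3"]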
parsl/monitoring/monitoring.py
CHANGED
@@ -12,6 +12,7 @@ from typing import TYPE_CHECKING, Any, Optional, Tuple, Union, cast
 import typeguard
 
 from parsl.log_utils import set_file_logger
+from parsl.monitoring.errors import MonitoringHubStartError
 from parsl.monitoring.message_type import MessageType
 from parsl.monitoring.radios import MultiprocessingQueueRadioSender
 from parsl.monitoring.router import router_starter
@@ -195,7 +196,7 @@ class MonitoringHub(RepresentationMixin):
     comm_q.join_thread()
 except queue.Empty:
     logger.error("Hub has not completed initialization in 120s. Aborting")
-    raise
+    raise MonitoringHubStartError()
 
 if isinstance(comm_q_result, str):
     logger.error(f"MonitoringRouter sent an error message: {comm_q_result}")
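The new parsl/monitoring/errors.py module (+6 lines, not expanded in this view) supplies the MonitoringHubStartError imported above. A minimal sketch of such an exception class, for orientation only:

    class MonitoringHubStartError(Exception):
        """Raised when the MonitoringHub does not start within its timeout."""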
parsl/monitoring/radios.py
CHANGED
@@ -7,6 +7,8 @@ from abc import ABCMeta, abstractmethod
 from multiprocessing.queues import Queue
 from typing import Optional
 
+import zmq
+
 from parsl.serialize import serialize
 
 _db_manager_excepts: Optional[Exception]
@@ -186,3 +188,17 @@ class MultiprocessingQueueRadioSender(MonitoringRadioSender):
 
     def send(self, message: object) -> None:
         self.queue.put((message, 0))
+
+
+class ZMQRadioSender(MonitoringRadioSender):
+    """A monitoring radio which connects over ZMQ. This radio is not
+    thread-safe, because its use of ZMQ is not thread-safe.
+    """
+
+    def __init__(self, hub_address: str, hub_zmq_port: int) -> None:
+        self._hub_channel = zmq.Context().socket(zmq.DEALER)
+        self._hub_channel.set_hwm(0)
+        self._hub_channel.connect(f"tcp://{hub_address}:{hub_zmq_port}")
+
+    def send(self, message: object) -> None:
+        self._hub_channel.send_pyobj(message)
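A minimal usage sketch (address, port and payload contents are placeholders): the interchange constructs one of these when a hub address and port are configured, then sends (MessageType, dict) tuples to the MonitoringHub.

    from parsl.monitoring.message_type import MessageType
    from parsl.monitoring.radios import ZMQRadioSender

    # Placeholder hub location; in practice this comes from hub_address / hub_zmq_port.
    radio = ZMQRadioSender("127.0.0.1", 55055)

    # Same message shape the interchange uses for node info (dict contents illustrative).
    radio.send((MessageType.NODE_INFO, {"hostname": "node001"}))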