parsl 2025.3.10__py3-none-any.whl → 2025.3.24__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- parsl/dataflow/dflow.py +1 -3
- parsl/executors/base.py +13 -37
- parsl/executors/flux/executor.py +1 -0
- parsl/executors/globus_compute.py +1 -1
- parsl/executors/high_throughput/executor.py +18 -0
- parsl/executors/high_throughput/mpi_resource_management.py +2 -0
- parsl/executors/high_throughput/process_worker_pool.py +89 -82
- parsl/executors/radical/executor.py +1 -0
- parsl/executors/status_handling.py +8 -0
- parsl/executors/taskvine/executor.py +1 -0
- parsl/executors/workqueue/executor.py +1 -0
- parsl/monitoring/db_manager.py +16 -10
- parsl/monitoring/errors.py +5 -0
- parsl/monitoring/monitoring.py +61 -117
- parsl/monitoring/radios/filesystem_router.py +4 -2
- parsl/monitoring/radios/udp_router.py +1 -3
- parsl/monitoring/radios/zmq_router.py +80 -25
- parsl/multiprocessing.py +42 -2
- parsl/tests/test_monitoring/test_exit_helper.py +54 -0
- parsl/tests/test_monitoring/test_fuzz_zmq.py +1 -1
- parsl/tests/test_monitoring/test_radio_zmq.py +27 -0
- parsl/tests/test_monitoring/test_stdouterr.py +3 -0
- parsl/tests/test_shutdown/test_kill_monitoring.py +1 -1
- parsl/usage_tracking/usage.py +2 -2
- parsl/version.py +1 -1
- {parsl-2025.3.10.data → parsl-2025.3.24.data}/scripts/process_worker_pool.py +89 -82
- {parsl-2025.3.10.dist-info → parsl-2025.3.24.dist-info}/METADATA +4 -4
- {parsl-2025.3.10.dist-info → parsl-2025.3.24.dist-info}/RECORD +35 -33
- {parsl-2025.3.10.data → parsl-2025.3.24.data}/scripts/exec_parsl_function.py +0 -0
- {parsl-2025.3.10.data → parsl-2025.3.24.data}/scripts/interchange.py +0 -0
- {parsl-2025.3.10.data → parsl-2025.3.24.data}/scripts/parsl_coprocess.py +0 -0
- {parsl-2025.3.10.dist-info → parsl-2025.3.24.dist-info}/LICENSE +0 -0
- {parsl-2025.3.10.dist-info → parsl-2025.3.24.dist-info}/WHEEL +0 -0
- {parsl-2025.3.10.dist-info → parsl-2025.3.24.dist-info}/entry_points.txt +0 -0
- {parsl-2025.3.10.dist-info → parsl-2025.3.24.dist-info}/top_level.txt +0 -0
parsl/dataflow/dflow.py
CHANGED
@@ -1128,9 +1128,7 @@ class DataFlowKernel:
             executor.run_id = self.run_id
             executor.run_dir = self.run_dir
             if self.monitoring:
-                executor.hub_address = self.monitoring.hub_address
-                executor.hub_zmq_port = self.monitoring.hub_zmq_port
-                executor.submit_monitoring_radio = self.monitoring.radio
+                executor.monitoring_messages = self.monitoring.resource_msgs
             if hasattr(executor, 'provider'):
                 if hasattr(executor.provider, 'script_dir'):
                     executor.provider.script_dir = os.path.join(self.run_dir, 'submit_scripts')
parsl/executors/base.py
CHANGED
@@ -1,11 +1,14 @@
+from __future__ import annotations
+
 import os
 from abc import ABCMeta, abstractmethod
 from concurrent.futures import Future
+from multiprocessing.queues import Queue
 from typing import Any, Callable, Dict, Optional

 from typing_extensions import Literal, Self

-from parsl.monitoring.radios.base import MonitoringRadioSender
+from parsl.monitoring.types import TaggedMonitoringMessage


 class ParslExecutor(metaclass=ABCMeta):
@@ -42,6 +45,13 @@ class ParslExecutor(metaclass=ABCMeta):
     invariant, not co-variant, and it looks like @typeguard cannot be
     persuaded otherwise. So if you're implementing an executor and want to
     @typeguard the constructor, you'll have to use List[Any] here.
+
+    The DataFlowKernel will set this attribute before calling .start(),
+    if monitoring is enabled:
+
+    monitoring_messages: Optional[Queue[TaggedMonitoringMessage]] - an executor
+        can send messages to the monitoring hub by putting them into
+        this queue.
     """

     label: str = "undefined"
@@ -50,15 +60,11 @@ class ParslExecutor(metaclass=ABCMeta):
     def __init__(
         self,
         *,
-        hub_address: Optional[str] = None,
-        hub_zmq_port: Optional[int] = None,
-        submit_monitoring_radio: Optional[MonitoringRadioSender] = None,
+        monitoring_messages: Optional[Queue[TaggedMonitoringMessage]] = None,
         run_dir: str = ".",
         run_id: Optional[str] = None,
     ):
-        self.hub_address = hub_address
-        self.hub_zmq_port = hub_zmq_port
-        self.submit_monitoring_radio = submit_monitoring_radio
+        self.monitoring_messages = monitoring_messages
         self.run_dir = os.path.abspath(run_dir)
         self.run_id = run_id

@@ -125,33 +131,3 @@ class ParslExecutor(metaclass=ABCMeta):
     @run_id.setter
     def run_id(self, value: Optional[str]) -> None:
         self._run_id = value
-
-    @property
-    def hub_address(self) -> Optional[str]:
-        """Address to the Hub for monitoring.
-        """
-        return self._hub_address
-
-    @hub_address.setter
-    def hub_address(self, value: Optional[str]) -> None:
-        self._hub_address = value
-
-    @property
-    def hub_zmq_port(self) -> Optional[int]:
-        """Port to the Hub for monitoring.
-        """
-        return self._hub_zmq_port
-
-    @hub_zmq_port.setter
-    def hub_zmq_port(self, value: Optional[int]) -> None:
-        self._hub_zmq_port = value
-
-    @property
-    def submit_monitoring_radio(self) -> Optional[MonitoringRadioSender]:
-        """Local radio for sending monitoring messages
-        """
-        return self._submit_monitoring_radio
-
-    @submit_monitoring_radio.setter
-    def submit_monitoring_radio(self, value: Optional[MonitoringRadioSender]) -> None:
-        self._submit_monitoring_radio = value
parsl/executors/flux/executor.py
CHANGED
@@ -231,6 +231,7 @@ class FluxExecutor(ParslExecutor, RepresentationMixin):

     def start(self):
         """Called when DFK starts the executor when the config is loaded."""
+        super().start()
         os.makedirs(self.working_dir, exist_ok=True)
         self._submission_thread.start()

parsl/executors/globus_compute.py
CHANGED
@@ -67,7 +67,7 @@ class GlobusComputeExecutor(ParslExecutor, RepresentationMixin):

     def start(self) -> None:
         """ Start the Globus Compute Executor """
-
+        super().start()

     def submit(self, func: Callable, resource_specification: Dict[str, Any], *args: Any, **kwargs: Any) -> Future:
         """ Submit func to globus-compute
parsl/executors/high_throughput/executor.py
CHANGED
@@ -29,6 +29,7 @@ from parsl.executors.high_throughput.manager_selector import (
 )
 from parsl.executors.status_handling import BlockProviderExecutor
 from parsl.jobs.states import TERMINAL_STATES, JobState, JobStatus
+from parsl.monitoring.radios.zmq_router import ZMQRadioReceiver, start_zmq_receiver
 from parsl.process_loggers import wrap_with_logs
 from parsl.providers import LocalProvider
 from parsl.providers.base import ExecutionProvider
@@ -334,6 +335,10 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin, UsageInformation):
         self._result_queue_thread_exit = threading.Event()
         self._result_queue_thread: Optional[threading.Thread] = None

+        self.zmq_monitoring: Optional[ZMQRadioReceiver]
+        self.zmq_monitoring = None
+        self.hub_zmq_port = None
+
     radio_mode = "htex"
     enable_mpi_mode: bool = False
     mpi_launcher: str = "mpiexec"
@@ -407,6 +412,7 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin, UsageInformation):
     def start(self):
         """Create the Interchange process and connect to it.
         """
+        super().start()
         if self.encrypted and self.cert_dir is None:
             logger.debug("Creating CurveZMQ certificates")
             self.cert_dir = curvezmq.create_certificates(self.logdir)
@@ -427,6 +433,15 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin, UsageInformation):
             self.loopback_address, self.interchange_port_range, self.cert_dir
         )

+        if self.monitoring_messages is not None:
+            self.zmq_monitoring = start_zmq_receiver(monitoring_messages=self.monitoring_messages,
+                                                     loopback_address=self.loopback_address,
+                                                     port_range=self.interchange_port_range,
+                                                     logdir=self.logdir,
+                                                     worker_debug=self.worker_debug,
+                                                     )
+            self.hub_zmq_port = self.zmq_monitoring.port
+
         self._result_queue_thread = None
         self._start_result_queue_thread()
         self._start_local_interchange_process()
@@ -861,6 +876,9 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin, UsageInformation):
         if self._result_queue_thread:
             self._result_queue_thread.join()

+        if self.zmq_monitoring:
+            self.zmq_monitoring.close()
+
         logger.info("Finished HighThroughputExecutor shutdown attempt")

     def get_usage_information(self):
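The HighThroughputExecutor now starts its own ZMQ receiver (start_zmq_receiver) when a monitoring_messages queue is present, records the chosen port in hub_zmq_port, and closes the receiver on shutdown. The following is a rough standalone sketch of the underlying idea (a socket whose traffic is forwarded onto a multiprocessing queue), using a plain PULL/PUSH pair and pickle; it is not parsl's ZMQRadioReceiver and its names are illustrative.

```python
# Rough sketch of a receiver thread that forwards pickled ZMQ messages onto a
# multiprocessing queue. Parsl's zmq_router is more involved; this only shows
# the forwarding idea and the "bind first, then report the port" step.
import pickle
import threading
import multiprocessing

import zmq


def forward(sock, msgs, stop_event):
    # Forward every pickled message from the socket onto the queue.
    sock.RCVTIMEO = 100                      # wake up regularly to check stop_event
    while not stop_event.is_set():
        try:
            msgs.put(pickle.loads(sock.recv()))
        except zmq.Again:
            continue


if __name__ == "__main__":
    msgs = multiprocessing.get_context("spawn").Queue()
    stop_event = threading.Event()

    ctx = zmq.Context()
    receiver = ctx.socket(zmq.PULL)
    port = receiver.bind_to_random_port("tcp://127.0.0.1")   # analogous to zmq_monitoring.port
    t = threading.Thread(target=forward, args=(receiver, msgs, stop_event), daemon=True)
    t.start()

    sender = ctx.socket(zmq.PUSH)
    sender.connect(f"tcp://127.0.0.1:{port}")
    sender.send(pickle.dumps(("RESOURCE_INFO", {"event": "hello"})))
    print(msgs.get())

    stop_event.set()
    t.join()
    ctx.destroy(linger=0)
```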
parsl/executors/high_throughput/mpi_resource_management.py
CHANGED
@@ -203,6 +203,8 @@ class MPITaskScheduler(TaskScheduler):
     def get_result(self, block: bool = True, timeout: Optional[float] = None):
         """Return result and relinquish provisioned nodes"""
         result_pkl = self.pending_result_q.get(block, timeout)
+        if result_pkl is None:
+            return None
         result_dict = pickle.loads(result_pkl)
         # TODO (wardlt): If the task did not request nodes, it won't be in `self._map_tasks_to_nodes`.
         # Causes Parsl to hang. See Issue #3427
parsl/executors/high_throughput/process_worker_pool.py
CHANGED
@@ -15,6 +15,7 @@ import threading
 import time
 import uuid
 from importlib.metadata import distributions
+from multiprocessing.context import SpawnProcess
 from multiprocessing.managers import DictProxy
 from multiprocessing.sharedctypes import Synchronized
 from typing import Dict, List, Optional, Sequence
@@ -403,52 +404,34 @@ class Manager:
         result_outgoing.connect(self._result_q_url)
         logger.info("Manager result pipe connected to interchange")

-        push_poll_period = max(10, self.poll_period) / 1000  # push_poll_period must be atleast 10 ms
-        logger.debug("push poll period: {}".format(push_poll_period))
-
-        last_beat = time.time()
-        last_result_beat = time.time()
-        items = []
-
         while not self._stop_event.is_set():
+            logger.debug("Starting pending_result_queue get")
             try:
-                …
-                r…
-                …
-                …
-                …
-                logger.debug("…
-            except Exception…
-                logger.exception("…
-
-            if time.time() > last_result_beat + self.heartbeat_period:
-                heartbeat_message = f"last_result_beat={last_result_beat} heartbeat_period={self.heartbeat_period} seconds"
-                logger.info(f"Sending heartbeat via results connection: {heartbeat_message}")
-                last_result_beat = time.time()
-                items.append(pickle.dumps({'type': 'heartbeat'}))
-
-            if len(items) >= self.max_queue_size or time.time() > last_beat + push_poll_period:
-                last_beat = time.time()
-                if items:
-                    logger.debug(f"Result send: Pushing {len(items)} items")
-                    result_outgoing.send_multipart(items)
-                    logger.debug("Result send: Pushed")
-                    items = []
-                else:
-                    logger.debug("Result send: No items to push")
-            else:
-                logger.debug(f"Result send: check condition not met - deferring {len(items)} result items")
+                r = self.task_scheduler.get_result()
+                if r is None:
+                    continue
+                logger.debug("Result received from worker: %s", id(r))
+                result_outgoing.send(r)
+                logger.debug("Result sent to interchange: %s", id(r))
+            except Exception:
+                logger.exception("Failed to send result to interchange")

         result_outgoing.close()
-        logger.…
+        logger.debug("Exiting")

     @wrap_with_logs
-    def …
+    def heartbeater(self):
+        while not self._stop_event.wait(self.heartbeat_period):
+            heartbeat_message = f"heartbeat_period={self.heartbeat_period} seconds"
+            logger.info(f"Sending heartbeat via results connection: {heartbeat_message}")
+            self.pending_result_queue.put(pickle.dumps({'type': 'heartbeat'}))
+
+    def worker_watchdog(self, procs: dict[int, SpawnProcess]):
         """Keeps workers alive."""
         logger.debug("Starting worker watchdog")

         while not self._stop_event.wait(self.heartbeat_period):
-            for worker_id, p in self.procs.items():
+            for worker_id, p in procs.items():
                 if not p.is_alive():
                     logger.error("Worker {} has died".format(worker_id))
                     try:
@@ -466,11 +449,10 @@ class Manager:
                     except KeyError:
                         logger.info("Worker {} was not busy when it died".format(worker_id))

-                    p = self._start_worker(worker_id)
-                    self.procs[worker_id] = p
+                    procs[worker_id] = self._start_worker(worker_id)
                     logger.info("Worker {} has been restarted".format(worker_id))

-        logger.…
+        logger.debug("Exiting")

     @wrap_with_logs
     def handle_monitoring_messages(self):
@@ -485,32 +467,28 @@ class Manager:
         """
         logger.debug("Starting monitoring handler thread")

-        poll_period_s = max(10, self.poll_period) / 1000  # Must be at least 10 ms
-
         while not self._stop_event.is_set():
             try:
                 logger.debug("Starting monitor_queue.get()")
-                msg = self.monitoring_queue.get(block=True, timeout=poll_period_s)
-            …
-                …
-            except Exception as e:
-                logger.exception(f"Got an exception: {e}")
-            else:
+                msg = self.monitoring_queue.get(block=True)
+                if msg is None:
+                    continue
                 logger.debug("Got a monitoring message")
                 self.pending_result_queue.put(msg)
                 logger.debug("Put monitoring message on pending_result_queue")
+            except Exception:
+                logger.exception("Failed to forward monitoring message")

-        logger.…
+        logger.debug("Exiting")

     def start(self):
         """ Start the worker processes.

         TODO: Move task receiving to a thread
         """
-        …
+        procs: dict[int, SpawnProcess] = {}
         for worker_id in range(self.worker_count):
-            p = self._start_worker(worker_id)
-            self.procs[worker_id] = p
+            procs[worker_id] = self._start_worker(worker_id)

         logger.debug("Workers started")

@@ -519,40 +497,69 @@ class Manager:
             target=self.push_results, name="Result-Pusher"
         )
         thr_worker_watchdog = threading.Thread(
-            target=self.worker_watchdog, name="worker-watchdog"
+            target=self.worker_watchdog, args=(procs,), name="worker-watchdog"
         )
         thr_monitoring_handler = threading.Thread(
             target=self.handle_monitoring_messages, name="Monitoring-Handler"
         )
+        thr_heartbeater = threading.Thread(target=self.heartbeater, name="Heartbeater")

         thr_task_puller.start()
         thr_result_pusher.start()
         thr_worker_watchdog.start()
         thr_monitoring_handler.start()
+        thr_heartbeater.start()

         logger.info("Manager threads started")

         # This might need a multiprocessing event to signal back.
         self._stop_event.wait()
-        logger.…
+        logger.info("Stop event set; terminating worker processes")
+
+        # Invite blocking threads to quit
+        self.monitoring_queue.put(None)
+        self.pending_result_queue.put(None)

+        thr_heartbeater.join()
         thr_task_puller.join()
         thr_result_pusher.join()
         thr_worker_watchdog.join()
         thr_monitoring_handler.join()
-        …
-        …
-        …
-        …
-        …
-        …
+
+        for worker_id in procs:
+            p = procs[worker_id]
+            proc_info = f"(PID: {p.pid}, Worker ID: {worker_id})"
+            logger.debug(f"Signaling worker {p.name} (TERM). {proc_info}")
+            p.terminate()

         self.zmq_context.term()
+
+        # give processes 1 second to gracefully shut themselves down, based on the
+        # SIGTERM (.terminate()) just sent; after then, we pull the plug.
+        force_child_shutdown_at = time.monotonic() + 1
+        while procs:
+            worker_id, p = procs.popitem()
+            timeout = max(force_child_shutdown_at - time.monotonic(), 0.000001)
+            p.join(timeout=timeout)
+            proc_info = f"(PID: {p.pid}, Worker ID: {worker_id})"
+            if p.exitcode is not None:
+                logger.debug(
+                    "Worker joined successfully. %s (exitcode: %s)", proc_info, p.exitcode
+                )
+
+            else:
+                logger.warning(
+                    f"Worker {p.name} ({worker_id}) failed to terminate in a timely"
+                    f" manner; sending KILL signal to process. {proc_info}"
+                )
+                p.kill()
+                p.join()
+            p.close()
+
         delta = time.time() - self._start_time
         logger.info("process_worker_pool ran for {} seconds".format(delta))
-        return

-    def _start_worker(self, worker_id: int):
+    def _start_worker(self, worker_id: int) -> SpawnProcess:
         p = SpawnContext.Process(
             target=worker,
             args=(
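The rewritten Manager.start() shutdown path asks every worker to exit with terminate(), then gives the whole group a single one-second grace period before escalating to kill(). A reduced, self-contained sketch of that terminate-then-kill pattern, using an illustrative sleep workload rather than parsl's worker function:

```python
# Reduced sketch of the terminate-then-kill pattern used above: one shared
# deadline for all children, escalate to kill() only for stragglers.
import time
import multiprocessing

SpawnContext = multiprocessing.get_context("spawn")


def sleepy(seconds: float) -> None:
    time.sleep(seconds)


if __name__ == "__main__":
    procs = {i: SpawnContext.Process(target=sleepy, args=(30,)) for i in range(4)}
    for p in procs.values():
        p.start()

    for p in procs.values():
        p.terminate()                        # polite request (SIGTERM on POSIX)

    force_child_shutdown_at = time.monotonic() + 1   # shared grace period
    while procs:
        worker_id, p = procs.popitem()
        p.join(timeout=max(force_child_shutdown_at - time.monotonic(), 0.000001))
        if p.exitcode is None:               # still running after the deadline
            p.kill()
            p.join()
        p.close()
```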
@@ -939,27 +946,27 @@ if __name__ == "__main__":
     )
     logger.info(
         f"\n Python version: {sys.version}"
-        f" Debug logging: {args.debug}"
-        f" Certificates dir: {args.cert_dir}"
-        f" Log dir: {args.logdir}"
-        f" Manager ID: {args.uid}"
-        f" Block ID: {args.block_id}"
-        f" cores_per_worker: {args.cores_per_worker}"
-        f" mem_per_worker: {args.mem_per_worker}"
-        f" task_port: {args.task_port}"
-        f" result_port: {args.result_port}"
-        f" addresses: {args.addresses}"
-        f" max_workers_per_node: {args.max_workers_per_node}"
-        f" poll_period: {args.poll}"
-        f" address_probe_timeout: {args.address_probe_timeout}"
-        f" Prefetch capacity: {args.prefetch_capacity}"
-        f" Heartbeat threshold: {args.hb_threshold}"
-        f" Heartbeat period: {args.hb_period}"
-        f" Drain period: {args.drain_period}"
-        f" CPU affinity: {args.cpu_affinity}"
-        f" Accelerators: {' '.join(args.available_accelerators)}"
-        f" enable_mpi_mode: {args.enable_mpi_mode}"
-        f" mpi_launcher: {args.mpi_launcher}"
+        f"\n Debug logging: {args.debug}"
+        f"\n Certificates dir: {args.cert_dir}"
+        f"\n Log dir: {args.logdir}"
+        f"\n Manager ID: {args.uid}"
+        f"\n Block ID: {args.block_id}"
+        f"\n cores_per_worker: {args.cores_per_worker}"
+        f"\n mem_per_worker: {args.mem_per_worker}"
+        f"\n task_port: {args.task_port}"
+        f"\n result_port: {args.result_port}"
+        f"\n addresses: {args.addresses}"
+        f"\n max_workers_per_node: {args.max_workers_per_node}"
+        f"\n poll_period: {args.poll}"
+        f"\n address_probe_timeout: {args.address_probe_timeout}"
+        f"\n Prefetch capacity: {args.prefetch_capacity}"
+        f"\n Heartbeat threshold: {args.hb_threshold}"
+        f"\n Heartbeat period: {args.hb_period}"
+        f"\n Drain period: {args.drain_period}"
+        f"\n CPU affinity: {args.cpu_affinity}"
+        f"\n Accelerators: {' '.join(args.available_accelerators)}"
+        f"\n enable_mpi_mode: {args.enable_mpi_mode}"
+        f"\n mpi_launcher: {args.mpi_launcher}"
     )
     try:
         manager = Manager(task_port=args.task_port,
parsl/executors/radical/executor.py
CHANGED
@@ -215,6 +215,7 @@ class RadicalPilotExecutor(ParslExecutor, RepresentationMixin):
         """Create the Pilot component and pass it.
         """
         logger.info("starting RadicalPilotExecutor")
+        super().start()
         logger.info('Parsl: {0}'.format(parsl.__version__))
         logger.info('RADICAL pilot: {0}'.format(rp.version))
         self.session = rp.Session(cfg={'base': self.run_dir},
parsl/executors/status_handling.py
CHANGED
@@ -14,6 +14,7 @@ from parsl.executors.errors import BadStateException, ScalingFailed
 from parsl.jobs.error_handlers import noop_error_handler, simple_error_handler
 from parsl.jobs.states import TERMINAL_STATES, JobState, JobStatus
 from parsl.monitoring.message_type import MessageType
+from parsl.monitoring.radios.multiprocessing import MultiprocessingQueueRadioSender
 from parsl.providers.base import ExecutionProvider
 from parsl.utils import AtomicIDCounter

@@ -83,6 +84,13 @@ class BlockProviderExecutor(ParslExecutor):
         # of pending, active and recently terminated blocks
         self._status = {} # type: Dict[str, JobStatus]

+        self.submit_monitoring_radio: Optional[MultiprocessingQueueRadioSender] = None
+
+    def start(self):
+        super().start()
+        if self.monitoring_messages:
+            self.submit_monitoring_radio = MultiprocessingQueueRadioSender(self.monitoring_messages)
+
     def _make_status_dict(self, block_ids: List[str], status_list: List[JobStatus]) -> Dict[str, JobStatus]:
         """Given a list of block ids and a list of corresponding status strings,
         returns a dictionary mapping each block id to the corresponding status
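BlockProviderExecutor.start() now wraps the monitoring_messages queue in a MultiprocessingQueueRadioSender, so the rest of the class can send monitoring messages through a single submit_monitoring_radio object. A standalone sketch of that wrapper idea follows; MiniQueueRadioSender and the message payload are illustrative, not parsl's classes.

```python
# Sketch of the "radio sender wrapping a queue" idea used above: callers see a
# send() method and do not care that messages land on a multiprocessing queue.
import multiprocessing


class MiniQueueRadioSender:
    def __init__(self, queue) -> None:
        self._queue = queue

    def send(self, message) -> None:
        self._queue.put(message)


if __name__ == "__main__":
    ctx = multiprocessing.get_context("spawn")
    resource_msgs = ctx.Queue()
    radio = MiniQueueRadioSender(resource_msgs)        # analogous to submit_monitoring_radio
    radio.send(("BLOCK_INFO", {"block_id": "0", "status": "PENDING"}))
    print(resource_msgs.get())
```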
parsl/executors/taskvine/executor.py
CHANGED
@@ -239,6 +239,7 @@ class TaskVineExecutor(BlockProviderExecutor, putils.RepresentationMixin):
         retrieve Parsl tasks within the TaskVine system.
         """

+        super().start()
         # Synchronize connection and communication settings between the manager and factory
         self.__synchronize_manager_factory_comm_settings()

parsl/executors/workqueue/executor.py
CHANGED
@@ -314,6 +314,7 @@ class WorkQueueExecutor(BlockProviderExecutor, putils.RepresentationMixin):
         """Create submit process and collector thread to create, send, and
         retrieve Parsl tasks within the Work Queue system.
         """
+        super().start()
         self.tasks_lock = threading.Lock()

         # Create directories for data and results
parsl/monitoring/db_manager.py
CHANGED
@@ -1,6 +1,7 @@
 import datetime
 import logging
 import multiprocessing.queues as mpq
+import multiprocessing.synchronize as mpe
 import os
 import queue
 import threading
@@ -278,11 +279,13 @@

 class DatabaseManager:
     def __init__(self,
+                 *,
                 db_url: str = 'sqlite:///runinfo/monitoring.db',
                 run_dir: str = '.',
                 logging_level: int = logging.INFO,
                 batching_interval: float = 1,
                 batching_threshold: float = 99999,
+                 exit_event: mpe.Event
                 ):

         self.workflow_end = False
@@ -307,6 +310,8 @@ class DatabaseManager:
         self.pending_block_queue: queue.Queue[MonitoringMessage] = queue.Queue()
         self.pending_resource_queue: queue.Queue[MonitoringMessage] = queue.Queue()

+        self.external_exit_event = exit_event
+
     def start(self,
               resource_queue: mpq.Queue) -> None:

@@ -555,15 +560,16 @@ class DatabaseManager:
         while not kill_event.is_set() or logs_queue.qsize() != 0:
             logger.debug("Checking STOP conditions: kill event: %s, queue has entries: %s",
                          kill_event.is_set(), logs_queue.qsize() != 0)
+
+            if self.external_exit_event.is_set():
+                self.close()
+
             try:
                 x = logs_queue.get(timeout=0.1)
             except queue.Empty:
                 continue
             else:
-                …
-                    self.close()
-                else:
-                    self._dispatch_to_internal(x)
+                self._dispatch_to_internal(x)

     def _dispatch_to_internal(self, x: Tuple) -> None:
         assert isinstance(x, tuple)
@@ -678,11 +684,11 @@

 @wrap_with_logs(target="database_manager")
 @typeguard.typechecked
-def dbm_starter(exception_q: mpq.Queue,
-                resource_msgs: mpq.Queue,
+def dbm_starter(resource_msgs: mpq.Queue,
                 db_url: str,
                 run_dir: str,
-                logging_level: int
+                logging_level: int,
+                exit_event: mpe.Event) -> None:
     """Start the database manager process

     The DFK should start this function. The args, kwargs match that of the monitoring config
@@ -693,16 +699,16 @@ def dbm_starter(exception_q: mpq.Queue,
     try:
         dbm = DatabaseManager(db_url=db_url,
                               run_dir=run_dir,
-                              logging_level=logging_level
+                              logging_level=logging_level,
+                              exit_event=exit_event)
         logger.info("Starting dbm in dbm starter")
         dbm.start(resource_msgs)
     except KeyboardInterrupt:
         logger.exception("KeyboardInterrupt signal caught")
         dbm.close()
         raise
-    except Exception as e:
+    except Exception:
         logger.exception("dbm.start exception")
-        exception_q.put(("DBM", str(e)))
         dbm.close()

     logger.info("End of dbm_starter")
parsl/monitoring/errors.py
CHANGED