parsl 2025.3.3__py3-none-any.whl → 2025.3.17__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- parsl/executors/high_throughput/executor.py +1 -1
- parsl/executors/high_throughput/mpi_resource_management.py +15 -4
- parsl/executors/high_throughput/process_worker_pool.py +89 -82
- parsl/monitoring/db_manager.py +16 -10
- parsl/monitoring/monitoring.py +113 -113
- parsl/monitoring/radios/filesystem_router.py +54 -0
- parsl/monitoring/{router.py → radios/udp_router.py} +17 -82
- parsl/monitoring/radios/zmq_router.py +131 -0
- parsl/tests/test_monitoring/test_exit_helper.py +55 -0
- parsl/tests/test_mpi_apps/test_mpi_scheduler.py +25 -0
- parsl/tests/test_scaling/test_shutdown_scalein.py +4 -1
- parsl/tests/test_shutdown/test_kill_monitoring.py +1 -1
- parsl/version.py +1 -1
- {parsl-2025.3.3.data → parsl-2025.3.17.data}/scripts/process_worker_pool.py +89 -82
- {parsl-2025.3.3.dist-info → parsl-2025.3.17.dist-info}/METADATA +4 -4
- {parsl-2025.3.3.dist-info → parsl-2025.3.17.dist-info}/RECORD +23 -20
- {parsl-2025.3.3.data → parsl-2025.3.17.data}/scripts/exec_parsl_function.py +0 -0
- {parsl-2025.3.3.data → parsl-2025.3.17.data}/scripts/interchange.py +0 -0
- {parsl-2025.3.3.data → parsl-2025.3.17.data}/scripts/parsl_coprocess.py +0 -0
- {parsl-2025.3.3.dist-info → parsl-2025.3.17.dist-info}/LICENSE +0 -0
- {parsl-2025.3.3.dist-info → parsl-2025.3.17.dist-info}/WHEEL +0 -0
- {parsl-2025.3.3.dist-info → parsl-2025.3.17.dist-info}/entry_points.txt +0 -0
- {parsl-2025.3.3.dist-info → parsl-2025.3.17.dist-info}/top_level.txt +0 -0
parsl/executors/high_throughput/executor.py
CHANGED
@@ -536,7 +536,7 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin, UsageIn
         "interchange_address": self.address,
         "worker_ports": self.worker_ports,
         "worker_port_range": self.worker_port_range,
-        "hub_address": self.
+        "hub_address": self.loopback_address,
         "hub_zmq_port": self.hub_zmq_port,
         "logdir": self.logdir,
         "heartbeat_threshold": self.heartbeat_threshold,
parsl/executors/high_throughput/mpi_resource_management.py
CHANGED
@@ -4,6 +4,7 @@ import os
 import pickle
 import queue
 import subprocess
+from dataclasses import dataclass, field
 from enum import Enum
 from typing import Dict, List, Optional
 
@@ -69,6 +70,14 @@ class MPINodesUnavailable(Exception):
         return f"MPINodesUnavailable(requested={self.requested} available={self.available})"
 
 
+@dataclass(order=True)
+class PrioritizedTask:
+    # Comparing dict will fail since they are unhashable
+    # This dataclass limits comparison to the priority field
+    priority: int
+    task: Dict = field(compare=False)
+
+
 class TaskScheduler:
     """Default TaskScheduler that does no taskscheduling
 
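The comments in the added dataclass point at the underlying problem: `queue.PriorityQueue` orders its entries with `<`, and the old `(nodes_needed, task_package)` tuples fall back to comparing the task dicts whenever two priorities are equal, which raises `TypeError`. A standalone sketch of the difference (the task payloads here are invented for illustration):

```python
import queue
from dataclasses import dataclass, field
from typing import Dict


@dataclass(order=True)
class PrioritizedTask:
    # Only `priority` participates in the generated ordering methods;
    # the task payload is excluded from comparison.
    priority: int
    task: Dict = field(compare=False)


pq: queue.PriorityQueue = queue.PriorityQueue()

# Plain tuples break as soon as two priorities tie, because tuple comparison
# falls through to the dict payloads:
#   pq.put((2, {"task_id": 1}))
#   pq.put((2, {"task_id": 2}))   # TypeError: '<' not supported between dicts

# With the dataclass, ties are fine because `task` is never compared.
pq.put(PrioritizedTask(2, {"task_id": 1}))
pq.put(PrioritizedTask(2, {"task_id": 2}))
pq.put(PrioritizedTask(1, {"task_id": 3}))

while not pq.empty():
    item = pq.get()
    print(item.priority, item.task)   # lowest priority value comes out first
```

`field(compare=False)` keeps the payload out of the ordering, so equal priorities simply compare as equal instead of failing.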
@@ -111,7 +120,7 @@ class MPITaskScheduler(TaskScheduler):
         super().__init__(pending_task_q, pending_result_q)
         self.scheduler = identify_scheduler()
         # PriorityQueue is threadsafe
-        self._backlog_queue: queue.PriorityQueue = queue.PriorityQueue()
+        self._backlog_queue: queue.PriorityQueue[PrioritizedTask] = queue.PriorityQueue()
         self._map_tasks_to_nodes: Dict[str, List[str]] = {}
         self.available_nodes = get_nodes_in_batchjob(self.scheduler)
         self._free_node_counter = SpawnContext.Value("i", len(self.available_nodes))
@@ -169,7 +178,7 @@ class MPITaskScheduler(TaskScheduler):
             allocated_nodes = self._get_nodes(nodes_needed)
         except MPINodesUnavailable:
             logger.info(f"Not enough resources, placing task {tid} into backlog")
-            self._backlog_queue.put((nodes_needed, task_package))
+            self._backlog_queue.put(PrioritizedTask(nodes_needed, task_package))
             return
         else:
             resource_spec["MPI_NODELIST"] = ",".join(allocated_nodes)
@@ -183,8 +192,8 @@ class MPITaskScheduler(TaskScheduler):
     def _schedule_backlog_tasks(self):
         """Attempt to schedule backlogged tasks"""
         try:
-
-            self.put_task(
+            prioritized_task = self._backlog_queue.get(block=False)
+            self.put_task(prioritized_task.task)
         except queue.Empty:
             return
         else:
@@ -194,6 +203,8 @@ class MPITaskScheduler(TaskScheduler):
     def get_result(self, block: bool = True, timeout: Optional[float] = None):
         """Return result and relinquish provisioned nodes"""
         result_pkl = self.pending_result_q.get(block, timeout)
+        if result_pkl is None:
+            return None
         result_dict = pickle.loads(result_pkl)
         # TODO (wardlt): If the task did not request nodes, it won't be in `self._map_tasks_to_nodes`.
         # Causes Parsl to hang. See Issue #3427
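The new `None` check pairs with the shutdown change to `Manager.start()` later in this diff, which puts a `None` sentinel on `pending_result_queue` so blocked consumers wake up and exit instead of waiting forever. A minimal sketch of that sentinel pattern with generic names (not Parsl APIs):

```python
import queue
import threading


def consumer(q: queue.Queue) -> None:
    # Block on the queue; a None sentinel tells the consumer to stop
    # without needing a timeout-and-poll loop.
    while True:
        item = q.get()
        if item is None:          # sentinel: producer asked us to exit
            break
        print("processing", item)


q: queue.Queue = queue.Queue()
t = threading.Thread(target=consumer, args=(q,))
t.start()

q.put(b"result-1")
q.put(b"result-2")
q.put(None)                        # unblocks the consumer and ends its loop
t.join()
```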
parsl/executors/high_throughput/process_worker_pool.py
CHANGED
@@ -15,6 +15,7 @@ import threading
 import time
 import uuid
 from importlib.metadata import distributions
+from multiprocessing.context import SpawnProcess
 from multiprocessing.managers import DictProxy
 from multiprocessing.sharedctypes import Synchronized
 from typing import Dict, List, Optional, Sequence
@@ -403,52 +404,34 @@ class Manager:
         result_outgoing.connect(self._result_q_url)
         logger.info("Manager result pipe connected to interchange")
 
-        push_poll_period = max(10, self.poll_period) / 1000  # push_poll_period must be atleast 10 ms
-        logger.debug("push poll period: {}".format(push_poll_period))
-
-        last_beat = time.time()
-        last_result_beat = time.time()
-        items = []
-
         while not self._stop_event.is_set():
+            logger.debug("Starting pending_result_queue get")
             try:
-
-                r
-
-
-
-                logger.debug("
-            except Exception
-                logger.exception("
-
-            if time.time() > last_result_beat + self.heartbeat_period:
-                heartbeat_message = f"last_result_beat={last_result_beat} heartbeat_period={self.heartbeat_period} seconds"
-                logger.info(f"Sending heartbeat via results connection: {heartbeat_message}")
-                last_result_beat = time.time()
-                items.append(pickle.dumps({'type': 'heartbeat'}))
-
-            if len(items) >= self.max_queue_size or time.time() > last_beat + push_poll_period:
-                last_beat = time.time()
-                if items:
-                    logger.debug(f"Result send: Pushing {len(items)} items")
-                    result_outgoing.send_multipart(items)
-                    logger.debug("Result send: Pushed")
-                    items = []
-                else:
-                    logger.debug("Result send: No items to push")
-            else:
-                logger.debug(f"Result send: check condition not met - deferring {len(items)} result items")
+                r = self.task_scheduler.get_result()
+                if r is None:
+                    continue
+                logger.debug("Result received from worker: %s", id(r))
+                result_outgoing.send(r)
+                logger.debug("Result sent to interchange: %s", id(r))
+            except Exception:
+                logger.exception("Failed to send result to interchange")
 
         result_outgoing.close()
-        logger.
+        logger.debug("Exiting")
 
     @wrap_with_logs
-    def
+    def heartbeater(self):
+        while not self._stop_event.wait(self.heartbeat_period):
+            heartbeat_message = f"heartbeat_period={self.heartbeat_period} seconds"
+            logger.info(f"Sending heartbeat via results connection: {heartbeat_message}")
+            self.pending_result_queue.put(pickle.dumps({'type': 'heartbeat'}))
+
+    def worker_watchdog(self, procs: dict[int, SpawnProcess]):
         """Keeps workers alive."""
         logger.debug("Starting worker watchdog")
 
         while not self._stop_event.wait(self.heartbeat_period):
-            for worker_id, p in
+            for worker_id, p in procs.items():
                 if not p.is_alive():
                     logger.error("Worker {} has died".format(worker_id))
                     try:
@@ -466,11 +449,10 @@ class Manager:
                 except KeyError:
                     logger.info("Worker {} was not busy when it died".format(worker_id))
 
-
-                self.procs[worker_id] = p
+                procs[worker_id] = self._start_worker(worker_id)
                 logger.info("Worker {} has been restarted".format(worker_id))
 
-        logger.
+        logger.debug("Exiting")
 
     @wrap_with_logs
     def handle_monitoring_messages(self):
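The watchdog now receives the worker table as an argument and replaces dead entries with the handle returned by `_start_worker`, instead of writing back to a `self.procs` attribute. A runnable sketch of that restart loop under hypothetical names (the worker function and timings are made up):

```python
import multiprocessing as mp
import threading
import time

SpawnContext = mp.get_context("spawn")


def worker(worker_id: int) -> None:
    # Illustrative worker: worker 0 exits quickly so the watchdog
    # has something to restart.
    time.sleep(0.2 if worker_id == 0 else 2.0)


def start_worker(worker_id: int):
    p = SpawnContext.Process(target=worker, args=(worker_id,), daemon=True)
    p.start()
    return p


def worker_watchdog(procs: dict, stop_event: threading.Event, period: float = 0.3) -> None:
    # Same shape as the Manager method: poll each worker every period,
    # restart dead ones, and store the new handle back into the mapping
    # that was passed in.
    while not stop_event.wait(period):
        for worker_id, p in procs.items():
            if not p.is_alive():
                procs[worker_id] = start_worker(worker_id)


if __name__ == "__main__":
    stop_event = threading.Event()
    procs = {i: start_worker(i) for i in range(2)}
    t = threading.Thread(target=worker_watchdog, args=(procs, stop_event))
    t.start()
    time.sleep(1.0)
    stop_event.set()
    t.join()
    for p in procs.values():
        p.terminate()
        p.join()
```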
@@ -485,32 +467,28 @@ class Manager:
         """
         logger.debug("Starting monitoring handler thread")
 
-        poll_period_s = max(10, self.poll_period) / 1000  # Must be at least 10 ms
-
         while not self._stop_event.is_set():
             try:
                 logger.debug("Starting monitor_queue.get()")
-                msg = self.monitoring_queue.get(block=True
-
-
-            except Exception as e:
-                logger.exception(f"Got an exception: {e}")
-            else:
+                msg = self.monitoring_queue.get(block=True)
+                if msg is None:
+                    continue
                 logger.debug("Got a monitoring message")
                 self.pending_result_queue.put(msg)
                 logger.debug("Put monitoring message on pending_result_queue")
+            except Exception:
+                logger.exception("Failed to forward monitoring message")
 
-        logger.
+        logger.debug("Exiting")
 
     def start(self):
         """ Start the worker processes.
 
         TODO: Move task receiving to a thread
         """
-
+        procs: dict[int, SpawnProcess] = {}
         for worker_id in range(self.worker_count):
-
-            self.procs[worker_id] = p
+            procs[worker_id] = self._start_worker(worker_id)
 
         logger.debug("Workers started")
 
@@ -519,40 +497,69 @@ class Manager:
             target=self.push_results, name="Result-Pusher"
         )
         thr_worker_watchdog = threading.Thread(
-            target=self.worker_watchdog, name="worker-watchdog"
+            target=self.worker_watchdog, args=(procs,), name="worker-watchdog"
         )
         thr_monitoring_handler = threading.Thread(
             target=self.handle_monitoring_messages, name="Monitoring-Handler"
         )
+        thr_heartbeater = threading.Thread(target=self.heartbeater, name="Heartbeater")
 
         thr_task_puller.start()
         thr_result_pusher.start()
         thr_worker_watchdog.start()
         thr_monitoring_handler.start()
+        thr_heartbeater.start()
 
         logger.info("Manager threads started")
 
         # This might need a multiprocessing event to signal back.
         self._stop_event.wait()
-        logger.
+        logger.info("Stop event set; terminating worker processes")
+
+        # Invite blocking threads to quit
+        self.monitoring_queue.put(None)
+        self.pending_result_queue.put(None)
 
+        thr_heartbeater.join()
         thr_task_puller.join()
         thr_result_pusher.join()
         thr_worker_watchdog.join()
         thr_monitoring_handler.join()
-
-
-
-
-
-
+
+        for worker_id in procs:
+            p = procs[worker_id]
+            proc_info = f"(PID: {p.pid}, Worker ID: {worker_id})"
+            logger.debug(f"Signaling worker {p.name} (TERM). {proc_info}")
+            p.terminate()
 
         self.zmq_context.term()
+
+        # give processes 1 second to gracefully shut themselves down, based on the
+        # SIGTERM (.terminate()) just sent; after then, we pull the plug.
+        force_child_shutdown_at = time.monotonic() + 1
+        while procs:
+            worker_id, p = procs.popitem()
+            timeout = max(force_child_shutdown_at - time.monotonic(), 0.000001)
+            p.join(timeout=timeout)
+            proc_info = f"(PID: {p.pid}, Worker ID: {worker_id})"
+            if p.exitcode is not None:
+                logger.debug(
+                    "Worker joined successfully. %s (exitcode: %s)", proc_info, p.exitcode
+                )
+
+            else:
+                logger.warning(
+                    f"Worker {p.name} ({worker_id}) failed to terminate in a timely"
+                    f" manner; sending KILL signal to process. {proc_info}"
+                )
+                p.kill()
+                p.join()
+            p.close()
+
         delta = time.time() - self._start_time
         logger.info("process_worker_pool ran for {} seconds".format(delta))
-        return
 
-    def _start_worker(self, worker_id: int):
+    def _start_worker(self, worker_id: int) -> SpawnProcess:
         p = SpawnContext.Process(
             target=worker,
             args=(
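The new shutdown path signals every worker with SIGTERM first, then gives the whole group a single one-second grace budget: each `join` uses only the time remaining on a shared `time.monotonic()` deadline, and any process still alive afterwards gets `kill()`. A compact sketch of that deadline-sharing idiom (the 1-second budget mirrors the diff; the worker function and names are illustrative):

```python
import multiprocessing as mp
import signal
import time


def stubborn_worker() -> None:
    # Ignore SIGTERM so the graceful path fails and the kill path is exercised.
    signal.signal(signal.SIGTERM, signal.SIG_IGN)
    time.sleep(10)


if __name__ == "__main__":
    procs = {i: mp.Process(target=stubborn_worker) for i in range(3)}
    for p in procs.values():
        p.start()

    # Ask every child to stop (SIGTERM on POSIX).
    for p in procs.values():
        p.terminate()

    # One shared deadline for the whole group: each join gets only the time
    # that is left, so total shutdown latency stays around one second.
    force_child_shutdown_at = time.monotonic() + 1
    while procs:
        worker_id, p = procs.popitem()
        timeout = max(force_child_shutdown_at - time.monotonic(), 0.000001)
        p.join(timeout=timeout)
        if p.exitcode is None:   # still alive after the deadline
            p.kill()             # SIGKILL; cannot be ignored
            p.join()
        p.close()
```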
@@ -939,27 +946,27 @@ if __name__ == "__main__":
     )
     logger.info(
         f"\n Python version: {sys.version}"
-        f" Debug logging: {args.debug}"
-        f" Certificates dir: {args.cert_dir}"
-        f" Log dir: {args.logdir}"
-        f" Manager ID: {args.uid}"
-        f" Block ID: {args.block_id}"
-        f" cores_per_worker: {args.cores_per_worker}"
-        f" mem_per_worker: {args.mem_per_worker}"
-        f" task_port: {args.task_port}"
-        f" result_port: {args.result_port}"
-        f" addresses: {args.addresses}"
-        f" max_workers_per_node: {args.max_workers_per_node}"
-        f" poll_period: {args.poll}"
-        f" address_probe_timeout: {args.address_probe_timeout}"
-        f" Prefetch capacity: {args.prefetch_capacity}"
-        f" Heartbeat threshold: {args.hb_threshold}"
-        f" Heartbeat period: {args.hb_period}"
-        f" Drain period: {args.drain_period}"
-        f" CPU affinity: {args.cpu_affinity}"
-        f" Accelerators: {' '.join(args.available_accelerators)}"
-        f" enable_mpi_mode: {args.enable_mpi_mode}"
-        f" mpi_launcher: {args.mpi_launcher}"
+        f"\n Debug logging: {args.debug}"
+        f"\n Certificates dir: {args.cert_dir}"
+        f"\n Log dir: {args.logdir}"
+        f"\n Manager ID: {args.uid}"
+        f"\n Block ID: {args.block_id}"
+        f"\n cores_per_worker: {args.cores_per_worker}"
+        f"\n mem_per_worker: {args.mem_per_worker}"
+        f"\n task_port: {args.task_port}"
+        f"\n result_port: {args.result_port}"
+        f"\n addresses: {args.addresses}"
+        f"\n max_workers_per_node: {args.max_workers_per_node}"
+        f"\n poll_period: {args.poll}"
+        f"\n address_probe_timeout: {args.address_probe_timeout}"
+        f"\n Prefetch capacity: {args.prefetch_capacity}"
+        f"\n Heartbeat threshold: {args.hb_threshold}"
+        f"\n Heartbeat period: {args.hb_period}"
+        f"\n Drain period: {args.drain_period}"
+        f"\n CPU affinity: {args.cpu_affinity}"
+        f"\n Accelerators: {' '.join(args.available_accelerators)}"
+        f"\n enable_mpi_mode: {args.enable_mpi_mode}"
+        f"\n mpi_launcher: {args.mpi_launcher}"
     )
     try:
         manager = Manager(task_port=args.task_port,
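The banner change is purely cosmetic: the old fragments were concatenated with no separator, so the whole startup report rendered as one long line; each fragment now begins with `\n`. A tiny reproduction with made-up values:

```python
debug, logdir = True, "runinfo/000/block-0"   # stand-in values

# Old style: adjacent f-strings concatenate with no separator -> one long line.
print(f"Debug logging: {debug}" f" Log dir: {logdir}")

# New style: a leading \n on each fragment puts every field on its own line.
print(f"\n Debug logging: {debug}" f"\n Log dir: {logdir}")
```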
parsl/monitoring/db_manager.py
CHANGED
@@ -1,6 +1,7 @@
 import datetime
 import logging
 import multiprocessing.queues as mpq
+import multiprocessing.synchronize as mpe
 import os
 import queue
 import threading
@@ -278,11 +279,13 @@ class Database:
 
 
 class DatabaseManager:
     def __init__(self,
+                 *,
                  db_url: str = 'sqlite:///runinfo/monitoring.db',
                  run_dir: str = '.',
                  logging_level: int = logging.INFO,
                  batching_interval: float = 1,
                  batching_threshold: float = 99999,
+                 exit_event: mpe.Event
                  ):
 
         self.workflow_end = False
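The bare `*` added to `DatabaseManager.__init__` makes every parameter after it keyword-only, so callers such as `dbm_starter` must name `db_url=`, `exit_event=`, and the rest explicitly rather than relying on argument order. The same mechanism on a hypothetical function:

```python
def make_manager(*, db_url: str = "sqlite:///runinfo/monitoring.db", exit_event=None):
    # Everything after the bare * must be passed by keyword.
    return db_url, exit_event


make_manager(db_url="sqlite:///test.db")   # OK: arguments are named
# make_manager("sqlite:///test.db")        # TypeError: takes 0 positional arguments
```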
@@ -307,6 +310,8 @@
         self.pending_block_queue: queue.Queue[MonitoringMessage] = queue.Queue()
         self.pending_resource_queue: queue.Queue[MonitoringMessage] = queue.Queue()
 
+        self.external_exit_event = exit_event
+
     def start(self,
               resource_queue: mpq.Queue) -> None:
 
@@ -555,15 +560,16 @@
         while not kill_event.is_set() or logs_queue.qsize() != 0:
             logger.debug("Checking STOP conditions: kill event: %s, queue has entries: %s",
                          kill_event.is_set(), logs_queue.qsize() != 0)
+
+            if self.external_exit_event.is_set():
+                self.close()
+
             try:
                 x = logs_queue.get(timeout=0.1)
             except queue.Empty:
                 continue
             else:
-
-                    self.close()
-                else:
-                    self._dispatch_to_internal(x)
+                self._dispatch_to_internal(x)
 
     def _dispatch_to_internal(self, x: Tuple) -> None:
         assert isinstance(x, tuple)
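With `exit_event` threaded through `dbm_starter` into the manager, the parent process can request shutdown by setting a `multiprocessing` `Event` that the queue-draining loop checks on every pass, instead of relying on an in-band control message. A rough sketch of that arrangement (queue payloads and names are invented; the real loop additionally tracks its own kill event and batching):

```python
import multiprocessing as mp
import queue
import time


def db_manager_loop(logs_queue, exit_event) -> None:
    # Keep draining until the parent has asked us to exit AND the queue looks
    # empty, mirroring the "kill event set or queue has entries" condition.
    while not exit_event.is_set() or not logs_queue.empty():
        try:
            msg = logs_queue.get(timeout=0.1)
        except queue.Empty:
            continue
        print("dispatching", msg)


if __name__ == "__main__":
    ctx = mp.get_context("spawn")
    logs_queue = ctx.Queue()
    exit_event = ctx.Event()
    proc = ctx.Process(target=db_manager_loop, args=(logs_queue, exit_event))
    proc.start()

    for i in range(3):
        logs_queue.put(("RESOURCE_INFO", {"try_id": i}))

    time.sleep(0.5)      # let the child drain the queue before signalling exit
    exit_event.set()
    proc.join()
```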
@@ -678,11 +684,11 @@
 
 @wrap_with_logs(target="database_manager")
 @typeguard.typechecked
-def dbm_starter(
-                resource_msgs: mpq.Queue,
+def dbm_starter(resource_msgs: mpq.Queue,
                 db_url: str,
                 run_dir: str,
-                logging_level: int
+                logging_level: int,
+                exit_event: mpe.Event) -> None:
     """Start the database manager process
 
     The DFK should start this function. The args, kwargs match that of the monitoring config
@@ -693,16 +699,16 @@ def dbm_starter(exception_q: mpq.Queue,
     try:
         dbm = DatabaseManager(db_url=db_url,
                               run_dir=run_dir,
-                              logging_level=logging_level
+                              logging_level=logging_level,
+                              exit_event=exit_event)
         logger.info("Starting dbm in dbm starter")
         dbm.start(resource_msgs)
     except KeyboardInterrupt:
         logger.exception("KeyboardInterrupt signal caught")
         dbm.close()
         raise
-    except Exception
+    except Exception:
         logger.exception("dbm.start exception")
-        exception_q.put(("DBM", str(e)))
         dbm.close()
 
     logger.info("End of dbm_starter")