parsl 2025.3.3__py3-none-any.whl → 2025.3.17__py3-none-any.whl

This diff shows the changes between publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
@@ -536,7 +536,7 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin, UsageIn
         "interchange_address": self.address,
         "worker_ports": self.worker_ports,
         "worker_port_range": self.worker_port_range,
-        "hub_address": self.hub_address,
+        "hub_address": self.loopback_address,
         "hub_zmq_port": self.hub_zmq_port,
         "logdir": self.logdir,
         "heartbeat_threshold": self.heartbeat_threshold,
@@ -4,6 +4,7 @@ import os
 import pickle
 import queue
 import subprocess
+from dataclasses import dataclass, field
 from enum import Enum
 from typing import Dict, List, Optional
 
@@ -69,6 +70,14 @@ class MPINodesUnavailable(Exception):
         return f"MPINodesUnavailable(requested={self.requested} available={self.available})"
 
 
+@dataclass(order=True)
+class PrioritizedTask:
+    # Comparing dict will fail since they are unhashable
+    # This dataclass limits comparison to the priority field
+    priority: int
+    task: Dict = field(compare=False)
+
+
 class TaskScheduler:
     """Default TaskScheduler that does no taskscheduling
 
@@ -111,7 +120,7 @@ class MPITaskScheduler(TaskScheduler):
         super().__init__(pending_task_q, pending_result_q)
         self.scheduler = identify_scheduler()
         # PriorityQueue is threadsafe
-        self._backlog_queue: queue.PriorityQueue = queue.PriorityQueue()
+        self._backlog_queue: queue.PriorityQueue[PrioritizedTask] = queue.PriorityQueue()
         self._map_tasks_to_nodes: Dict[str, List[str]] = {}
         self.available_nodes = get_nodes_in_batchjob(self.scheduler)
         self._free_node_counter = SpawnContext.Value("i", len(self.available_nodes))
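
Why the PrioritizedTask wrapper above matters: PriorityQueue orders its entries with <, and with the old bare (priority, task_dict) tuples two entries that tie on priority fall through to comparing the dicts, which do not define an ordering, so the comparison raises TypeError. The dataclass with order=True and the task field marked compare=False limits comparison to the priority field. A minimal standalone sketch of the pattern (illustrative only, not parsl code):

import queue
from dataclasses import dataclass, field
from typing import Dict


@dataclass(order=True)
class PrioritizedTask:
    priority: int
    task: Dict = field(compare=False)  # never consulted by <, so ties are safe


pq: "queue.PriorityQueue[PrioritizedTask]" = queue.PriorityQueue()
pq.put(PrioritizedTask(2, {"task_id": "a"}))
pq.put(PrioritizedTask(2, {"task_id": "b"}))  # tie on priority: the dicts are not compared
pq.put(PrioritizedTask(1, {"task_id": "c"}))
print(pq.get().task)  # {'task_id': 'c'} -- the smallest priority value comes out first

# With bare tuples, the same tie would attempt dict < dict and raise:
# TypeError: '<' not supported between instances of 'dict' and 'dict'
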
@@ -169,7 +178,7 @@ class MPITaskScheduler(TaskScheduler):
             allocated_nodes = self._get_nodes(nodes_needed)
         except MPINodesUnavailable:
             logger.info(f"Not enough resources, placing task {tid} into backlog")
-            self._backlog_queue.put((nodes_needed, task_package))
+            self._backlog_queue.put(PrioritizedTask(nodes_needed, task_package))
             return
         else:
             resource_spec["MPI_NODELIST"] = ",".join(allocated_nodes)
@@ -183,8 +192,8 @@ class MPITaskScheduler(TaskScheduler):
     def _schedule_backlog_tasks(self):
         """Attempt to schedule backlogged tasks"""
         try:
-            _nodes_requested, task_package = self._backlog_queue.get(block=False)
-            self.put_task(task_package)
+            prioritized_task = self._backlog_queue.get(block=False)
+            self.put_task(prioritized_task.task)
         except queue.Empty:
             return
         else:
@@ -194,6 +203,8 @@ class MPITaskScheduler(TaskScheduler):
     def get_result(self, block: bool = True, timeout: Optional[float] = None):
         """Return result and relinquish provisioned nodes"""
         result_pkl = self.pending_result_q.get(block, timeout)
+        if result_pkl is None:
+            return None
         result_dict = pickle.loads(result_pkl)
         # TODO (wardlt): If the task did not request nodes, it won't be in `self._map_tasks_to_nodes`.
         # Causes Parsl to hang. See Issue #3427
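
The two added lines make get_result pass a None sentinel through untouched instead of handing it to pickle.loads. Together with the manager putting None on the result queue at shutdown (see the start() hunk further down), this lets a consumer blocked on the queue wake up and re-check its stop flag. A self-contained sketch of that pattern, with illustrative names only:

import pickle
import queue
import threading

results: "queue.Queue[bytes | None]" = queue.Queue()
stop = threading.Event()


def get_result():
    item = results.get()   # blocks until a result or the shutdown sentinel arrives
    if item is None:
        return None        # sentinel: let the caller re-check its stop event
    return pickle.loads(item)


def pusher():
    while not stop.is_set():
        r = get_result()
        if r is None:
            continue       # loop around; the while condition sees the stop event
        print("forwarding", r)


t = threading.Thread(target=pusher, name="Result-Pusher")
t.start()
results.put(pickle.dumps({"task_id": 1}))
stop.set()
results.put(None)          # wake the blocked get() so the thread can exit
t.join()
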
@@ -15,6 +15,7 @@ import threading
 import time
 import uuid
 from importlib.metadata import distributions
+from multiprocessing.context import SpawnProcess
 from multiprocessing.managers import DictProxy
 from multiprocessing.sharedctypes import Synchronized
 from typing import Dict, List, Optional, Sequence
@@ -403,52 +404,34 @@ class Manager:
         result_outgoing.connect(self._result_q_url)
         logger.info("Manager result pipe connected to interchange")
 
-        push_poll_period = max(10, self.poll_period) / 1000  # push_poll_period must be atleast 10 ms
-        logger.debug("push poll period: {}".format(push_poll_period))
-
-        last_beat = time.time()
-        last_result_beat = time.time()
-        items = []
-
         while not self._stop_event.is_set():
+            logger.debug("Starting pending_result_queue get")
             try:
-                logger.debug("Starting pending_result_queue get")
-                r = self.task_scheduler.get_result(block=True, timeout=push_poll_period)
-                logger.debug("Got a result item")
-                items.append(r)
-            except queue.Empty:
-                logger.debug("pending_result_queue get timeout without result item")
-            except Exception as e:
-                logger.exception("Got an exception: {}".format(e))
-
-            if time.time() > last_result_beat + self.heartbeat_period:
-                heartbeat_message = f"last_result_beat={last_result_beat} heartbeat_period={self.heartbeat_period} seconds"
-                logger.info(f"Sending heartbeat via results connection: {heartbeat_message}")
-                last_result_beat = time.time()
-                items.append(pickle.dumps({'type': 'heartbeat'}))
-
-            if len(items) >= self.max_queue_size or time.time() > last_beat + push_poll_period:
-                last_beat = time.time()
-                if items:
-                    logger.debug(f"Result send: Pushing {len(items)} items")
-                    result_outgoing.send_multipart(items)
-                    logger.debug("Result send: Pushed")
-                    items = []
-                else:
-                    logger.debug("Result send: No items to push")
-            else:
-                logger.debug(f"Result send: check condition not met - deferring {len(items)} result items")
+                r = self.task_scheduler.get_result()
+                if r is None:
+                    continue
+                logger.debug("Result received from worker: %s", id(r))
+                result_outgoing.send(r)
+                logger.debug("Result sent to interchange: %s", id(r))
+            except Exception:
+                logger.exception("Failed to send result to interchange")
 
         result_outgoing.close()
-        logger.info("Exiting")
+        logger.debug("Exiting")
 
     @wrap_with_logs
-    def worker_watchdog(self):
+    def heartbeater(self):
+        while not self._stop_event.wait(self.heartbeat_period):
+            heartbeat_message = f"heartbeat_period={self.heartbeat_period} seconds"
+            logger.info(f"Sending heartbeat via results connection: {heartbeat_message}")
+            self.pending_result_queue.put(pickle.dumps({'type': 'heartbeat'}))
+
+    def worker_watchdog(self, procs: dict[int, SpawnProcess]):
         """Keeps workers alive."""
         logger.debug("Starting worker watchdog")
 
         while not self._stop_event.wait(self.heartbeat_period):
-            for worker_id, p in self.procs.items():
+            for worker_id, p in procs.items():
                 if not p.is_alive():
                     logger.error("Worker {} has died".format(worker_id))
                     try:
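
Two ideas in the rewritten code above: push_results now blocks on get_result() and relies on the None sentinel rather than a poll timer and batching, and the periodic heartbeat moves into a dedicated heartbeater thread that uses the stop event's wait() as an interruptible sleep, so shutdown never has to wait out a full heartbeat period. A small sketch of the Event.wait() timer idiom (illustrative names, not parsl code):

import threading
import time

stop = threading.Event()


def heartbeater(period: float = 1.0) -> None:
    # wait() returns False when the timeout elapses (send a beat)
    # and True as soon as the event is set (exit immediately).
    while not stop.wait(period):
        print("heartbeat at", time.monotonic())


t = threading.Thread(target=heartbeater, name="Heartbeater")
t.start()
time.sleep(3.5)   # let a few beats fire
stop.set()        # the in-progress wait() returns right away
t.join()
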
@@ -466,11 +449,10 @@ class Manager:
                     except KeyError:
                         logger.info("Worker {} was not busy when it died".format(worker_id))
 
-                    p = self._start_worker(worker_id)
-                    self.procs[worker_id] = p
+                    procs[worker_id] = self._start_worker(worker_id)
                     logger.info("Worker {} has been restarted".format(worker_id))
 
-        logger.critical("Exiting")
+        logger.debug("Exiting")
 
     @wrap_with_logs
     def handle_monitoring_messages(self):
@@ -485,32 +467,28 @@ class Manager:
         """
         logger.debug("Starting monitoring handler thread")
 
-        poll_period_s = max(10, self.poll_period) / 1000  # Must be at least 10 ms
-
         while not self._stop_event.is_set():
             try:
                 logger.debug("Starting monitor_queue.get()")
-                msg = self.monitoring_queue.get(block=True, timeout=poll_period_s)
-            except queue.Empty:
-                logger.debug("monitoring_queue.get() has timed out")
-            except Exception as e:
-                logger.exception(f"Got an exception: {e}")
-            else:
+                msg = self.monitoring_queue.get(block=True)
+                if msg is None:
+                    continue
                 logger.debug("Got a monitoring message")
                 self.pending_result_queue.put(msg)
                 logger.debug("Put monitoring message on pending_result_queue")
+            except Exception:
+                logger.exception("Failed to forward monitoring message")
 
-        logger.critical("Exiting")
+        logger.debug("Exiting")
 
     def start(self):
         """ Start the worker processes.
 
        TODO: Move task receiving to a thread
        """
-        self.procs = {}
+        procs: dict[int, SpawnProcess] = {}
         for worker_id in range(self.worker_count):
-            p = self._start_worker(worker_id)
-            self.procs[worker_id] = p
+            procs[worker_id] = self._start_worker(worker_id)
 
         logger.debug("Workers started")
 
@@ -519,40 +497,69 @@ class Manager:
             target=self.push_results, name="Result-Pusher"
         )
         thr_worker_watchdog = threading.Thread(
-            target=self.worker_watchdog, name="worker-watchdog"
+            target=self.worker_watchdog, args=(procs,), name="worker-watchdog"
         )
         thr_monitoring_handler = threading.Thread(
             target=self.handle_monitoring_messages, name="Monitoring-Handler"
         )
+        thr_heartbeater = threading.Thread(target=self.heartbeater, name="Heartbeater")
 
         thr_task_puller.start()
         thr_result_pusher.start()
         thr_worker_watchdog.start()
         thr_monitoring_handler.start()
+        thr_heartbeater.start()
 
         logger.info("Manager threads started")
 
         # This might need a multiprocessing event to signal back.
         self._stop_event.wait()
-        logger.critical("Received kill event, terminating worker processes")
+        logger.info("Stop event set; terminating worker processes")
+
+        # Invite blocking threads to quit
+        self.monitoring_queue.put(None)
+        self.pending_result_queue.put(None)
 
+        thr_heartbeater.join()
         thr_task_puller.join()
         thr_result_pusher.join()
         thr_worker_watchdog.join()
         thr_monitoring_handler.join()
-        for proc_id in self.procs:
-            self.procs[proc_id].terminate()
-            logger.critical("Terminating worker {}: is_alive()={}".format(self.procs[proc_id],
-                                                                          self.procs[proc_id].is_alive()))
-            self.procs[proc_id].join()
-            logger.debug("Worker {} joined successfully".format(self.procs[proc_id]))
+
+        for worker_id in procs:
+            p = procs[worker_id]
+            proc_info = f"(PID: {p.pid}, Worker ID: {worker_id})"
+            logger.debug(f"Signaling worker {p.name} (TERM). {proc_info}")
+            p.terminate()
 
         self.zmq_context.term()
+
+        # give processes 1 second to gracefully shut themselves down, based on the
+        # SIGTERM (.terminate()) just sent; after then, we pull the plug.
+        force_child_shutdown_at = time.monotonic() + 1
+        while procs:
+            worker_id, p = procs.popitem()
+            timeout = max(force_child_shutdown_at - time.monotonic(), 0.000001)
+            p.join(timeout=timeout)
+            proc_info = f"(PID: {p.pid}, Worker ID: {worker_id})"
+            if p.exitcode is not None:
+                logger.debug(
+                    "Worker joined successfully. %s (exitcode: %s)", proc_info, p.exitcode
+                )
+
+            else:
+                logger.warning(
+                    f"Worker {p.name} ({worker_id}) failed to terminate in a timely"
+                    f" manner; sending KILL signal to process. {proc_info}"
+                )
+                p.kill()
+                p.join()
+            p.close()
+
         delta = time.time() - self._start_time
         logger.info("process_worker_pool ran for {} seconds".format(delta))
-        return
 
-    def _start_worker(self, worker_id: int):
+    def _start_worker(self, worker_id: int) -> SpawnProcess:
         p = SpawnContext.Process(
             target=worker,
             args=(
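
The shutdown sequence above replaces terminate-then-join-one-at-a-time with a two-phase scheme: send SIGTERM to every worker first, then join them against a single shared one-second deadline, escalating to SIGKILL only for processes that ignore the grace period, and finally close() each Process handle. A compact sketch of that scheme under illustrative names (not parsl code):

import multiprocessing
import time


def child() -> None:
    while True:
        time.sleep(0.1)


if __name__ == "__main__":
    ctx = multiprocessing.get_context("spawn")
    procs = {i: ctx.Process(target=child, name=f"worker-{i}") for i in range(2)}
    for p in procs.values():
        p.start()

    # Phase 1: signal every worker (SIGTERM on POSIX) before waiting on any of them.
    for p in procs.values():
        p.terminate()

    # Phase 2: share one deadline across all joins, then pull the plug on stragglers.
    force_shutdown_at = time.monotonic() + 1
    while procs:
        worker_id, p = procs.popitem()
        p.join(timeout=max(force_shutdown_at - time.monotonic(), 0.000001))
        if p.exitcode is None:   # still alive after the grace period
            p.kill()             # SIGKILL; cannot be ignored
            p.join()
        p.close()                # release the Process object's resources
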
@@ -939,27 +946,27 @@ if __name__ == "__main__":
     )
     logger.info(
         f"\n Python version: {sys.version}"
-        f" Debug logging: {args.debug}"
-        f" Certificates dir: {args.cert_dir}"
-        f" Log dir: {args.logdir}"
-        f" Manager ID: {args.uid}"
-        f" Block ID: {args.block_id}"
-        f" cores_per_worker: {args.cores_per_worker}"
-        f" mem_per_worker: {args.mem_per_worker}"
-        f" task_port: {args.task_port}"
-        f" result_port: {args.result_port}"
-        f" addresses: {args.addresses}"
-        f" max_workers_per_node: {args.max_workers_per_node}"
-        f" poll_period: {args.poll}"
-        f" address_probe_timeout: {args.address_probe_timeout}"
-        f" Prefetch capacity: {args.prefetch_capacity}"
-        f" Heartbeat threshold: {args.hb_threshold}"
-        f" Heartbeat period: {args.hb_period}"
-        f" Drain period: {args.drain_period}"
-        f" CPU affinity: {args.cpu_affinity}"
-        f" Accelerators: {' '.join(args.available_accelerators)}"
-        f" enable_mpi_mode: {args.enable_mpi_mode}"
-        f" mpi_launcher: {args.mpi_launcher}"
+        f"\n Debug logging: {args.debug}"
+        f"\n Certificates dir: {args.cert_dir}"
+        f"\n Log dir: {args.logdir}"
+        f"\n Manager ID: {args.uid}"
+        f"\n Block ID: {args.block_id}"
+        f"\n cores_per_worker: {args.cores_per_worker}"
+        f"\n mem_per_worker: {args.mem_per_worker}"
+        f"\n task_port: {args.task_port}"
+        f"\n result_port: {args.result_port}"
+        f"\n addresses: {args.addresses}"
+        f"\n max_workers_per_node: {args.max_workers_per_node}"
+        f"\n poll_period: {args.poll}"
+        f"\n address_probe_timeout: {args.address_probe_timeout}"
+        f"\n Prefetch capacity: {args.prefetch_capacity}"
+        f"\n Heartbeat threshold: {args.hb_threshold}"
+        f"\n Heartbeat period: {args.hb_period}"
+        f"\n Drain period: {args.drain_period}"
+        f"\n CPU affinity: {args.cpu_affinity}"
+        f"\n Accelerators: {' '.join(args.available_accelerators)}"
+        f"\n enable_mpi_mode: {args.enable_mpi_mode}"
+        f"\n mpi_launcher: {args.mpi_launcher}"
     )
     try:
         manager = Manager(task_port=args.task_port,
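
The banner change above is purely about adjacent f-string literals: Python concatenates them with no separator, so without a leading \n every setting ran together on a single log line. For example:

debug, logdir = True, "runinfo"

print(f"Debug logging: {debug}" f" Log dir: {logdir}")
# Debug logging: True Log dir: runinfo      <- one run-on line

print(f"\n Debug logging: {debug}" f"\n Log dir: {logdir}")
#
#  Debug logging: True
#  Log dir: runinfo
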
@@ -1,6 +1,7 @@
 import datetime
 import logging
 import multiprocessing.queues as mpq
+import multiprocessing.synchronize as mpe
 import os
 import queue
 import threading
@@ -278,11 +279,13 @@ class Database:
 
 class DatabaseManager:
     def __init__(self,
+                 *,
                  db_url: str = 'sqlite:///runinfo/monitoring.db',
                  run_dir: str = '.',
                  logging_level: int = logging.INFO,
                  batching_interval: float = 1,
                  batching_threshold: float = 99999,
+                 exit_event: mpe.Event
                 ):
 
         self.workflow_end = False
@@ -307,6 +310,8 @@ class DatabaseManager:
         self.pending_block_queue: queue.Queue[MonitoringMessage] = queue.Queue()
         self.pending_resource_queue: queue.Queue[MonitoringMessage] = queue.Queue()
 
+        self.external_exit_event = exit_event
+
     def start(self,
               resource_queue: mpq.Queue) -> None:
 
@@ -555,15 +560,16 @@ class DatabaseManager:
         while not kill_event.is_set() or logs_queue.qsize() != 0:
             logger.debug("Checking STOP conditions: kill event: %s, queue has entries: %s",
                          kill_event.is_set(), logs_queue.qsize() != 0)
+
+            if self.external_exit_event.is_set():
+                self.close()
+
             try:
                 x = logs_queue.get(timeout=0.1)
             except queue.Empty:
                 continue
             else:
-                if x == 'STOP':
-                    self.close()
-                else:
-                    self._dispatch_to_internal(x)
+                self._dispatch_to_internal(x)
 
     def _dispatch_to_internal(self, x: Tuple) -> None:
         assert isinstance(x, tuple)
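
Previously the DatabaseManager shut down when an in-band 'STOP' string arrived on the logs queue; the change above has it watch a multiprocessing Event handed down from the parent instead, so shutdown no longer depends on a sentinel message surviving the data stream. A sketch of signalling a child process through a shared Event (illustrative names, not parsl code):

import multiprocessing
import queue
import time


def manager_loop(msgs, exit_event) -> None:
    while True:
        if exit_event.is_set():   # checked on every pass, independent of queue traffic
            print("exit event set; closing")
            return
        try:
            msg = msgs.get(timeout=0.1)
        except queue.Empty:
            continue
        print("dispatching", msg)


if __name__ == "__main__":
    ctx = multiprocessing.get_context("spawn")
    exit_event = ctx.Event()
    msgs = ctx.Queue()
    p = ctx.Process(target=manager_loop, args=(msgs, exit_event))
    p.start()
    msgs.put(("resource_info", {"task_id": 1}))
    time.sleep(0.5)
    exit_event.set()              # no sentinel needed on the data queue
    p.join()
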
@@ -678,11 +684,11 @@ class DatabaseManager:
 
 @wrap_with_logs(target="database_manager")
 @typeguard.typechecked
-def dbm_starter(exception_q: mpq.Queue,
-                resource_msgs: mpq.Queue,
+def dbm_starter(resource_msgs: mpq.Queue,
                 db_url: str,
                 run_dir: str,
-                logging_level: int) -> None:
+                logging_level: int,
+                exit_event: mpe.Event) -> None:
     """Start the database manager process
 
     The DFK should start this function. The args, kwargs match that of the monitoring config
@@ -693,16 +699,16 @@ def dbm_starter(exception_q: mpq.Queue,
     try:
         dbm = DatabaseManager(db_url=db_url,
                               run_dir=run_dir,
-                              logging_level=logging_level)
+                              logging_level=logging_level,
+                              exit_event=exit_event)
         logger.info("Starting dbm in dbm starter")
         dbm.start(resource_msgs)
     except KeyboardInterrupt:
         logger.exception("KeyboardInterrupt signal caught")
         dbm.close()
         raise
-    except Exception as e:
+    except Exception:
         logger.exception("dbm.start exception")
-        exception_q.put(("DBM", str(e)))
         dbm.close()
 
     logger.info("End of dbm_starter")