parsl 2025.3.10__py3-none-any.whl → 2025.3.17__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- parsl/executors/high_throughput/mpi_resource_management.py +2 -0
- parsl/executors/high_throughput/process_worker_pool.py +89 -82
- parsl/monitoring/db_manager.py +16 -10
- parsl/monitoring/monitoring.py +49 -45
- parsl/monitoring/radios/filesystem_router.py +4 -2
- parsl/monitoring/radios/udp_router.py +1 -3
- parsl/monitoring/radios/zmq_router.py +1 -8
- parsl/tests/test_monitoring/test_exit_helper.py +55 -0
- parsl/version.py +1 -1
- {parsl-2025.3.10.data → parsl-2025.3.17.data}/scripts/process_worker_pool.py +89 -82
- {parsl-2025.3.10.dist-info → parsl-2025.3.17.dist-info}/METADATA +4 -4
- {parsl-2025.3.10.dist-info → parsl-2025.3.17.dist-info}/RECORD +19 -18
- {parsl-2025.3.10.data → parsl-2025.3.17.data}/scripts/exec_parsl_function.py +0 -0
- {parsl-2025.3.10.data → parsl-2025.3.17.data}/scripts/interchange.py +0 -0
- {parsl-2025.3.10.data → parsl-2025.3.17.data}/scripts/parsl_coprocess.py +0 -0
- {parsl-2025.3.10.dist-info → parsl-2025.3.17.dist-info}/LICENSE +0 -0
- {parsl-2025.3.10.dist-info → parsl-2025.3.17.dist-info}/WHEEL +0 -0
- {parsl-2025.3.10.dist-info → parsl-2025.3.17.dist-info}/entry_points.txt +0 -0
- {parsl-2025.3.10.dist-info → parsl-2025.3.17.dist-info}/top_level.txt +0 -0
parsl/executors/high_throughput/mpi_resource_management.py
CHANGED
@@ -203,6 +203,8 @@ class MPITaskScheduler(TaskScheduler):
     def get_result(self, block: bool = True, timeout: Optional[float] = None):
         """Return result and relinquish provisioned nodes"""
         result_pkl = self.pending_result_q.get(block, timeout)
+        if result_pkl is None:
+            return None
         result_dict = pickle.loads(result_pkl)
         # TODO (wardlt): If the task did not request nodes, it won't be in `self._map_tasks_to_nodes`.
         # Causes Parsl to hang. See Issue #3427
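The early return above lets a consumer of pending_result_q treat a None entry as "nothing to unpickle" rather than crashing in pickle.loads. A minimal standalone sketch of that sentinel pattern (illustrative class and queue names, not parsl's actual MPITaskScheduler):

import pickle
import queue
from typing import Optional

class ResultSource:
    """Illustrative stand-in for a scheduler that drains a result queue."""
    def __init__(self) -> None:
        self.pending_result_q: "queue.Queue[Optional[bytes]]" = queue.Queue()

    def get_result(self, block: bool = True, timeout: Optional[float] = None):
        result_pkl = self.pending_result_q.get(block, timeout)
        if result_pkl is None:          # sentinel: nothing to unpickle, caller decides what to do
            return None
        return pickle.loads(result_pkl)

src = ResultSource()
src.pending_result_q.put(pickle.dumps({"task_id": 1, "result": 42}))
src.pending_result_q.put(None)          # e.g. a wake-up / shutdown placeholder
print(src.get_result())                 # {'task_id': 1, 'result': 42}
print(src.get_result())                 # None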
parsl/executors/high_throughput/process_worker_pool.py
CHANGED
@@ -15,6 +15,7 @@ import threading
 import time
 import uuid
 from importlib.metadata import distributions
+from multiprocessing.context import SpawnProcess
 from multiprocessing.managers import DictProxy
 from multiprocessing.sharedctypes import Synchronized
 from typing import Dict, List, Optional, Sequence
@@ -403,52 +404,34 @@ class Manager:
         result_outgoing.connect(self._result_q_url)
         logger.info("Manager result pipe connected to interchange")
 
-        push_poll_period = max(10, self.poll_period) / 1000  # push_poll_period must be atleast 10 ms
-        logger.debug("push poll period: {}".format(push_poll_period))
-
-        last_beat = time.time()
-        last_result_beat = time.time()
-        items = []
-
         while not self._stop_event.is_set():
+            logger.debug("Starting pending_result_queue get")
             try:
-
-                r
-
-
-
-                logger.debug("
-            except Exception
-                logger.exception("
-
-            if time.time() > last_result_beat + self.heartbeat_period:
-                heartbeat_message = f"last_result_beat={last_result_beat} heartbeat_period={self.heartbeat_period} seconds"
-                logger.info(f"Sending heartbeat via results connection: {heartbeat_message}")
-                last_result_beat = time.time()
-                items.append(pickle.dumps({'type': 'heartbeat'}))
-
-            if len(items) >= self.max_queue_size or time.time() > last_beat + push_poll_period:
-                last_beat = time.time()
-                if items:
-                    logger.debug(f"Result send: Pushing {len(items)} items")
-                    result_outgoing.send_multipart(items)
-                    logger.debug("Result send: Pushed")
-                    items = []
-                else:
-                    logger.debug("Result send: No items to push")
-            else:
-                logger.debug(f"Result send: check condition not met - deferring {len(items)} result items")
+                r = self.task_scheduler.get_result()
+                if r is None:
+                    continue
+                logger.debug("Result received from worker: %s", id(r))
+                result_outgoing.send(r)
+                logger.debug("Result sent to interchange: %s", id(r))
+            except Exception:
+                logger.exception("Failed to send result to interchange")
 
         result_outgoing.close()
-        logger.
+        logger.debug("Exiting")
 
     @wrap_with_logs
-    def
+    def heartbeater(self):
+        while not self._stop_event.wait(self.heartbeat_period):
+            heartbeat_message = f"heartbeat_period={self.heartbeat_period} seconds"
+            logger.info(f"Sending heartbeat via results connection: {heartbeat_message}")
+            self.pending_result_queue.put(pickle.dumps({'type': 'heartbeat'}))
+
+    def worker_watchdog(self, procs: dict[int, SpawnProcess]):
         """Keeps workers alive."""
         logger.debug("Starting worker watchdog")
 
         while not self._stop_event.wait(self.heartbeat_period):
-            for worker_id, p in
+            for worker_id, p in procs.items():
                 if not p.is_alive():
                     logger.error("Worker {} has died".format(worker_id))
                     try:
@@ -466,11 +449,10 @@ class Manager:
                     except KeyError:
                         logger.info("Worker {} was not busy when it died".format(worker_id))
 
-
-                    self.procs[worker_id] = p
+                    procs[worker_id] = self._start_worker(worker_id)
                     logger.info("Worker {} has been restarted".format(worker_id))
 
-        logger.
+        logger.debug("Exiting")
 
     @wrap_with_logs
     def handle_monitoring_messages(self):
@@ -485,32 +467,28 @@ class Manager:
         """
         logger.debug("Starting monitoring handler thread")
 
-        poll_period_s = max(10, self.poll_period) / 1000  # Must be at least 10 ms
-
         while not self._stop_event.is_set():
             try:
                 logger.debug("Starting monitor_queue.get()")
-                msg = self.monitoring_queue.get(block=True
-
-
-            except Exception as e:
-                logger.exception(f"Got an exception: {e}")
-            else:
+                msg = self.monitoring_queue.get(block=True)
+                if msg is None:
+                    continue
                 logger.debug("Got a monitoring message")
                 self.pending_result_queue.put(msg)
                 logger.debug("Put monitoring message on pending_result_queue")
+            except Exception:
+                logger.exception("Failed to forward monitoring message")
 
-        logger.
+        logger.debug("Exiting")
 
     def start(self):
         """ Start the worker processes.
 
         TODO: Move task receiving to a thread
         """
-
+        procs: dict[int, SpawnProcess] = {}
         for worker_id in range(self.worker_count):
-
-            self.procs[worker_id] = p
+            procs[worker_id] = self._start_worker(worker_id)
 
         logger.debug("Workers started")
 
@@ -519,40 +497,69 @@ class Manager:
             target=self.push_results, name="Result-Pusher"
         )
         thr_worker_watchdog = threading.Thread(
-            target=self.worker_watchdog, name="worker-watchdog"
+            target=self.worker_watchdog, args=(procs,), name="worker-watchdog"
         )
         thr_monitoring_handler = threading.Thread(
             target=self.handle_monitoring_messages, name="Monitoring-Handler"
         )
+        thr_heartbeater = threading.Thread(target=self.heartbeater, name="Heartbeater")
 
         thr_task_puller.start()
         thr_result_pusher.start()
         thr_worker_watchdog.start()
         thr_monitoring_handler.start()
+        thr_heartbeater.start()
 
         logger.info("Manager threads started")
 
         # This might need a multiprocessing event to signal back.
         self._stop_event.wait()
-        logger.
+        logger.info("Stop event set; terminating worker processes")
+
+        # Invite blocking threads to quit
+        self.monitoring_queue.put(None)
+        self.pending_result_queue.put(None)
 
+        thr_heartbeater.join()
         thr_task_puller.join()
         thr_result_pusher.join()
         thr_worker_watchdog.join()
         thr_monitoring_handler.join()
-
-
-
-
-
-
+
+        for worker_id in procs:
+            p = procs[worker_id]
+            proc_info = f"(PID: {p.pid}, Worker ID: {worker_id})"
+            logger.debug(f"Signaling worker {p.name} (TERM). {proc_info}")
+            p.terminate()
 
         self.zmq_context.term()
+
+        # give processes 1 second to gracefully shut themselves down, based on the
+        # SIGTERM (.terminate()) just sent; after then, we pull the plug.
+        force_child_shutdown_at = time.monotonic() + 1
+        while procs:
+            worker_id, p = procs.popitem()
+            timeout = max(force_child_shutdown_at - time.monotonic(), 0.000001)
+            p.join(timeout=timeout)
+            proc_info = f"(PID: {p.pid}, Worker ID: {worker_id})"
+            if p.exitcode is not None:
+                logger.debug(
+                    "Worker joined successfully. %s (exitcode: %s)", proc_info, p.exitcode
+                )
+
+            else:
+                logger.warning(
+                    f"Worker {p.name} ({worker_id}) failed to terminate in a timely"
+                    f" manner; sending KILL signal to process. {proc_info}"
+                )
+                p.kill()
+                p.join()
+            p.close()
+
         delta = time.time() - self._start_time
         logger.info("process_worker_pool ran for {} seconds".format(delta))
-        return
 
-    def _start_worker(self, worker_id: int):
+    def _start_worker(self, worker_id: int) -> SpawnProcess:
         p = SpawnContext.Process(
             target=worker,
             args=(
@@ -939,27 +946,27 @@ if __name__ == "__main__":
     )
     logger.info(
         f"\n Python version: {sys.version}"
-        f" Debug logging: {args.debug}"
-        f" Certificates dir: {args.cert_dir}"
-        f" Log dir: {args.logdir}"
-        f" Manager ID: {args.uid}"
-        f" Block ID: {args.block_id}"
-        f" cores_per_worker: {args.cores_per_worker}"
-        f" mem_per_worker: {args.mem_per_worker}"
-        f" task_port: {args.task_port}"
-        f" result_port: {args.result_port}"
-        f" addresses: {args.addresses}"
-        f" max_workers_per_node: {args.max_workers_per_node}"
-        f" poll_period: {args.poll}"
-        f" address_probe_timeout: {args.address_probe_timeout}"
-        f" Prefetch capacity: {args.prefetch_capacity}"
-        f" Heartbeat threshold: {args.hb_threshold}"
-        f" Heartbeat period: {args.hb_period}"
-        f" Drain period: {args.drain_period}"
-        f" CPU affinity: {args.cpu_affinity}"
-        f" Accelerators: {' '.join(args.available_accelerators)}"
-        f" enable_mpi_mode: {args.enable_mpi_mode}"
-        f" mpi_launcher: {args.mpi_launcher}"
+        f"\n Debug logging: {args.debug}"
+        f"\n Certificates dir: {args.cert_dir}"
+        f"\n Log dir: {args.logdir}"
+        f"\n Manager ID: {args.uid}"
+        f"\n Block ID: {args.block_id}"
+        f"\n cores_per_worker: {args.cores_per_worker}"
+        f"\n mem_per_worker: {args.mem_per_worker}"
+        f"\n task_port: {args.task_port}"
+        f"\n result_port: {args.result_port}"
+        f"\n addresses: {args.addresses}"
+        f"\n max_workers_per_node: {args.max_workers_per_node}"
+        f"\n poll_period: {args.poll}"
+        f"\n address_probe_timeout: {args.address_probe_timeout}"
+        f"\n Prefetch capacity: {args.prefetch_capacity}"
+        f"\n Heartbeat threshold: {args.hb_threshold}"
+        f"\n Heartbeat period: {args.hb_period}"
+        f"\n Drain period: {args.drain_period}"
+        f"\n CPU affinity: {args.cpu_affinity}"
+        f"\n Accelerators: {' '.join(args.available_accelerators)}"
+        f"\n enable_mpi_mode: {args.enable_mpi_mode}"
+        f"\n mpi_launcher: {args.mpi_launcher}"
     )
     try:
         manager = Manager(task_port=args.task_port,
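Several Manager threads block indefinitely on Queue.get(); the reworked shutdown path above wakes them by putting a None sentinel on each queue, and the reading loops skip the sentinel and re-check the stop event. A small thread-level sketch of the same idea (illustrative names, not the Manager class itself):

import queue
import threading

def pusher(pending: "queue.Queue", stop: threading.Event) -> None:
    """Forward items until stopped; a None sentinel just wakes the blocking get()."""
    while not stop.is_set():
        item = pending.get(block=True)   # blocks until an item (or the sentinel) arrives
        if item is None:
            continue                     # woken up; the loop re-checks the stop event
        print("forwarding", item)

pending: "queue.Queue" = queue.Queue()
stop = threading.Event()
t = threading.Thread(target=pusher, args=(pending, stop), name="Result-Pusher")
t.start()
pending.put("result-1")
stop.set()            # request shutdown...
pending.put(None)     # ...and unblock the get() so the loop can observe it
t.join()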
parsl/monitoring/db_manager.py
CHANGED
@@ -1,6 +1,7 @@
 import datetime
 import logging
 import multiprocessing.queues as mpq
+import multiprocessing.synchronize as mpe
 import os
 import queue
 import threading
@@ -278,11 +279,13 @@ class Database:
 
 class DatabaseManager:
     def __init__(self,
+                 *,
                  db_url: str = 'sqlite:///runinfo/monitoring.db',
                  run_dir: str = '.',
                  logging_level: int = logging.INFO,
                  batching_interval: float = 1,
                  batching_threshold: float = 99999,
+                 exit_event: mpe.Event
                  ):
 
         self.workflow_end = False
@@ -307,6 +310,8 @@ class DatabaseManager:
         self.pending_block_queue: queue.Queue[MonitoringMessage] = queue.Queue()
         self.pending_resource_queue: queue.Queue[MonitoringMessage] = queue.Queue()
 
+        self.external_exit_event = exit_event
+
     def start(self,
               resource_queue: mpq.Queue) -> None:
 
@@ -555,15 +560,16 @@
         while not kill_event.is_set() or logs_queue.qsize() != 0:
             logger.debug("Checking STOP conditions: kill event: %s, queue has entries: %s",
                          kill_event.is_set(), logs_queue.qsize() != 0)
+
+            if self.external_exit_event.is_set():
+                self.close()
+
             try:
                 x = logs_queue.get(timeout=0.1)
             except queue.Empty:
                 continue
             else:
-
-                    self.close()
-                else:
-                    self._dispatch_to_internal(x)
+                self._dispatch_to_internal(x)
 
     def _dispatch_to_internal(self, x: Tuple) -> None:
         assert isinstance(x, tuple)
@@ -678,11 +684,11 @@
 
 @wrap_with_logs(target="database_manager")
 @typeguard.typechecked
-def dbm_starter(
-                resource_msgs: mpq.Queue,
+def dbm_starter(resource_msgs: mpq.Queue,
                 db_url: str,
                 run_dir: str,
-                logging_level: int
+                logging_level: int,
+                exit_event: mpe.Event) -> None:
     """Start the database manager process
 
     The DFK should start this function. The args, kwargs match that of the monitoring config
@@ -693,16 +699,16 @@ def dbm_starter(exception_q: mpq.Queue,
     try:
         dbm = DatabaseManager(db_url=db_url,
                               run_dir=run_dir,
-                              logging_level=logging_level
+                              logging_level=logging_level,
+                              exit_event=exit_event)
         logger.info("Starting dbm in dbm starter")
         dbm.start(resource_msgs)
     except KeyboardInterrupt:
         logger.exception("KeyboardInterrupt signal caught")
         dbm.close()
         raise
-    except Exception
+    except Exception:
         logger.exception("dbm.start exception")
-        exception_q.put(("DBM", str(e)))
         dbm.close()
 
     logger.info("End of dbm_starter")
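The DatabaseManager now receives a multiprocessing Event from its parent instead of a "STOP" message on the resource queue, so shutdown can be requested out-of-band while the child keeps draining its queue. A rough sketch of wiring an exit Event into a child process this way (illustrative names only, not parsl's DatabaseManager):

import multiprocessing
import queue
import time

def consumer(msgs, exit_event):
    """Drain msgs until the parent sets exit_event and the queue is empty."""
    while not (exit_event.is_set() and msgs.empty()):
        try:
            item = msgs.get(timeout=0.1)
        except queue.Empty:
            continue
        print("processed", item)

if __name__ == "__main__":
    msgs = multiprocessing.Queue()
    exit_event = multiprocessing.Event()
    proc = multiprocessing.Process(target=consumer, args=(msgs, exit_event))
    proc.start()
    msgs.put({"kind": "resource", "value": 1})
    time.sleep(0.5)
    exit_event.set()       # ask the child to finish once its queue is drained
    proc.join()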
parsl/monitoring/monitoring.py
CHANGED
@@ -5,8 +5,9 @@ import multiprocessing.synchronize as ms
 import os
 import queue
 from multiprocessing import Event
+from multiprocessing.context import ForkProcess as ForkProcessType
 from multiprocessing.queues import Queue
-from typing import TYPE_CHECKING,
+from typing import TYPE_CHECKING, Optional, Tuple, Union
 
 import typeguard
 
@@ -128,10 +129,7 @@ class MonitoringHub(RepresentationMixin):
         zmq_comm_q = SizedQueue(maxsize=10)
         udp_comm_q = SizedQueue(maxsize=10)
 
-        self.
-        self.exception_q = SizedQueue(maxsize=10)
-
-        self.resource_msgs: Queue[Union[TaggedMonitoringMessage, Literal["STOP"]]]
+        self.resource_msgs: Queue[TaggedMonitoringMessage]
         self.resource_msgs = SizedQueue()
 
         self.router_exit_event: ms.Event
@@ -139,7 +137,6 @@ class MonitoringHub(RepresentationMixin):
 
         self.zmq_router_proc = ForkProcess(target=zmq_router_starter,
                                            kwargs={"comm_q": zmq_comm_q,
-                                                   "exception_q": self.exception_q,
                                                    "resource_msgs": self.resource_msgs,
                                                    "exit_event": self.router_exit_event,
                                                    "hub_address": self.hub_address,
@@ -154,7 +151,6 @@ class MonitoringHub(RepresentationMixin):
 
         self.udp_router_proc = ForkProcess(target=udp_router_starter,
                                            kwargs={"comm_q": udp_comm_q,
-                                                   "exception_q": self.exception_q,
                                                    "resource_msgs": self.resource_msgs,
                                                    "exit_event": self.router_exit_event,
                                                    "hub_address": self.hub_address,
@@ -167,11 +163,15 @@ class MonitoringHub(RepresentationMixin):
                                            )
         self.udp_router_proc.start()
 
+        self.dbm_exit_event: ms.Event
+        self.dbm_exit_event = Event()
+
         self.dbm_proc = ForkProcess(target=dbm_starter,
-                                    args=(self.
+                                    args=(self.resource_msgs,),
                                     kwargs={"run_dir": dfk_run_dir,
                                             "logging_level": logging.DEBUG if self.monitoring_debug else logging.INFO,
                                             "db_url": self.logging_endpoint,
+                                            "exit_event": self.dbm_exit_event,
                                             },
                                     name="Monitoring-DBM-Process",
                                     daemon=True,
@@ -181,7 +181,7 @@ class MonitoringHub(RepresentationMixin):
                     self.zmq_router_proc.pid, self.udp_router_proc.pid, self.dbm_proc.pid)
 
         self.filesystem_proc = ForkProcess(target=filesystem_router_starter,
-                                           args=(self.resource_msgs, dfk_run_dir),
+                                           args=(self.resource_msgs, dfk_run_dir, self.router_exit_event),
                                            name="Monitoring-Filesystem-Process",
                                            daemon=True
                                            )
@@ -227,58 +227,62 @@ class MonitoringHub(RepresentationMixin):
 
     def close(self) -> None:
         logger.info("Terminating Monitoring Hub")
-        exception_msgs = []
-        while True:
-            try:
-                exception_msgs.append(self.exception_q.get(block=False))
-                logger.error("There was a queued exception (Either router or DBM process got exception much earlier?)")
-            except queue.Empty:
-                break
         if self.monitoring_hub_active:
             self.monitoring_hub_active = False
-            if exception_msgs:
-                for exception_msg in exception_msgs:
-                    logger.error(
-                        "%s process delivered an exception: %s. Terminating all monitoring processes immediately.",
-                        exception_msg[0],
-                        exception_msg[1]
-                    )
-                self.zmq_router_proc.terminate()
-                self.udp_router_proc.terminate()
-                self.dbm_proc.terminate()
-                self.filesystem_proc.terminate()
             logger.info("Setting router termination event")
             self.router_exit_event.set()
 
             logger.info("Waiting for ZMQ router to terminate")
-            self.zmq_router_proc
-            self.zmq_router_proc.close()
+            join_terminate_close_proc(self.zmq_router_proc)
 
             logger.info("Waiting for UDP router to terminate")
-            self.udp_router_proc
-            self.udp_router_proc.close()
+            join_terminate_close_proc(self.udp_router_proc)
 
             logger.debug("Finished waiting for router termination")
-            if len(exception_msgs) == 0:
-                logger.debug("Sending STOP to DBM")
-                self.resource_msgs.put("STOP")
-            else:
-                logger.debug("Not sending STOP to DBM, because there were DBM exceptions")
             logger.debug("Waiting for DB termination")
-            self.
-            self.dbm_proc
+            self.dbm_exit_event.set()
+            join_terminate_close_proc(self.dbm_proc)
             logger.debug("Finished waiting for DBM termination")
 
-            # should this be message based? it probably doesn't need to be if
-            # we believe we've received all messages
             logger.info("Terminating filesystem radio receiver process")
-            self.filesystem_proc
-            self.filesystem_proc.join()
-            self.filesystem_proc.close()
+            join_terminate_close_proc(self.filesystem_proc)
 
             logger.info("Closing monitoring multiprocessing queues")
-            self.exception_q.close()
-            self.exception_q.join_thread()
             self.resource_msgs.close()
             self.resource_msgs.join_thread()
             logger.info("Closed monitoring multiprocessing queues")
+
+
+def join_terminate_close_proc(process: ForkProcessType, *, timeout: int = 30) -> None:
+    """Increasingly aggressively terminate a process.
+
+    This function assumes that the process is likely to exit before
+    the join timeout, driven by some other means, such as the
+    MonitoringHub router_exit_event. If the process does not exit, then
+    first terminate() and then kill() will be used to end the process.
+
+    In the case of a very mis-behaving process, this function might take
+    up to 3*timeout to exhaust all termination methods and return.
+    """
+    logger.debug("Joining process")
+    process.join(timeout)
+
+    # run a sequence of increasingly aggressive steps to shut down the process.
+    if process.is_alive():
+        logger.error("Process did not join. Terminating.")
+        process.terminate()
+        process.join(timeout)
+        if process.is_alive():
+            logger.error("Process did not join after terminate. Killing.")
+            process.kill()
+            process.join(timeout)
+            # This kill should not be caught by any signal handlers so it is
+            # unlikely that this join will timeout. If it does, there isn't
+            # anything further to do except log an error in the next if-block.
+
+    if process.is_alive():
+        logger.error("Process failed to end")
+        # don't call close if the process hasn't ended:
+        # process.close() doesn't work on a running process.
+    else:
+        process.close()
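The new join_terminate_close_proc helper centralises the join/terminate/kill escalation that MonitoringHub.close() now applies to each monitoring child process. A standalone sketch of the same escalation using only the standard library (the end_process name is hypothetical; parsl's helper additionally logs each escalation step):

import multiprocessing
import time

def end_process(process: multiprocessing.Process, timeout: float = 30) -> None:
    """Join a child process, escalating to terminate() and then kill() if it lingers."""
    process.join(timeout)                  # normally an exit event has already ended it
    if process.is_alive():
        process.terminate()                # escalate: SIGTERM
        process.join(timeout)
        if process.is_alive():
            process.kill()                 # last resort: SIGKILL, which cannot be trapped
            process.join(timeout)
    if not process.is_alive():
        process.close()                    # close() only works once the process has ended

if __name__ == "__main__":
    p = multiprocessing.Process(target=time.sleep, args=(300,))
    p.start()
    end_process(p, timeout=1)              # the sleeper does not handle SIGTERM specially, so terminate() ends it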
parsl/monitoring/radios/filesystem_router.py
CHANGED
@@ -5,6 +5,7 @@ import os
 import pickle
 import time
 from multiprocessing.queues import Queue
+from multiprocessing.synchronize import Event
 from typing import cast
 
 from parsl.log_utils import set_file_logger
@@ -15,7 +16,7 @@ from parsl.utils import setproctitle
 
 
 @wrap_with_logs
-def filesystem_router_starter(q: Queue[TaggedMonitoringMessage], run_dir: str) -> None:
+def filesystem_router_starter(q: Queue[TaggedMonitoringMessage], run_dir: str, exit_event: Event) -> None:
     logger = set_file_logger(f"{run_dir}/monitoring_filesystem_radio.log",
                              name="monitoring_filesystem_radio",
                              level=logging.INFO)
@@ -32,7 +33,7 @@ def filesystem_router_starter(q: Queue[TaggedMonitoringMessage], run_dir: str) -> None:
     os.makedirs(tmp_dir, exist_ok=True)
     os.makedirs(new_dir, exist_ok=True)
 
-    while
+    while not exit_event.is_set():
         logger.debug("Start filesystem radio receiver loop")
 
         # iterate over files in new_dir
@@ -50,3 +51,4 @@ def filesystem_router_starter(q: Queue[TaggedMonitoringMessage], run_dir: str) -> None:
                 logger.exception("Exception processing %s - probably will be retried next iteration", filename)
 
         time.sleep(1)  # whats a good time for this poll?
+    logger.info("Ending filesystem radio receiver")
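With the new exit_event parameter, the filesystem radio receiver polls its spool directory until MonitoringHub sets router_exit_event, instead of looping forever. A simplified, self-contained sketch of such an event-controlled directory poller (the directory path and message format here are hypothetical, not parsl's actual on-disk protocol):

import os
import pickle
import time
from multiprocessing import Event, Process, Queue

def fs_receiver(new_dir: str, q: Queue, exit_event) -> None:
    """Poll new_dir for pickled message files until exit_event is set."""
    os.makedirs(new_dir, exist_ok=True)
    while not exit_event.is_set():
        for filename in sorted(os.listdir(new_dir)):
            path = os.path.join(new_dir, filename)
            try:
                with open(path, "rb") as f:
                    q.put(pickle.load(f))
                os.remove(path)
            except Exception:
                pass           # leave the file in place; it is retried on the next pass
        time.sleep(1)          # poll period

if __name__ == "__main__":
    q: Queue = Queue()
    exit_event = Event()
    p = Process(target=fs_receiver, args=("/tmp/monitoring-new", q, exit_event))
    p.start()
    time.sleep(2)
    exit_event.set()           # the loop notices this on its next iteration and returns
    p.join()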
parsl/monitoring/radios/udp_router.py
CHANGED
@@ -118,7 +118,6 @@ class MonitoringRouter:
 @typeguard.typechecked
 def udp_router_starter(*,
                        comm_q: mpq.Queue,
-                       exception_q: mpq.Queue,
                        resource_msgs: mpq.Queue,
                        exit_event: Event,
 
@@ -144,6 +143,5 @@ def udp_router_starter(*,
     router.logger.info("Starting MonitoringRouter in router_starter")
     try:
         router.start()
-    except Exception
+    except Exception:
         router.logger.exception("UDP router start exception")
-        exception_q.put(('Hub', str(e)))
parsl/monitoring/radios/zmq_router.py
CHANGED
@@ -107,7 +107,6 @@ class MonitoringRouter:
 @typeguard.typechecked
 def zmq_router_starter(*,
                        comm_q: mpq.Queue,
-                       exception_q: mpq.Queue,
                        resource_msgs: mpq.Queue,
                        exit_event: Event,
 
@@ -129,10 +128,4 @@ def zmq_router_starter(*,
         comm_q.put(f"Monitoring router construction failed: {e}")
     else:
         comm_q.put(router.zmq_receiver_port)
-
-    router.logger.info("Starting MonitoringRouter in router_starter")
-    try:
-        router.start()
-    except Exception as e:
-        router.logger.exception("ZMQ router start exception")
-        exception_q.put(('Hub', str(e)))
+        router.start()
parsl/tests/test_monitoring/test_exit_helper.py
ADDED
@@ -0,0 +1,55 @@
+import multiprocessing
+import signal
+
+import psutil
+import pytest
+
+from parsl.monitoring.monitoring import join_terminate_close_proc
+from parsl.multiprocessing import ForkProcess
+
+
+def noop():
+    pass
+
+
+@pytest.mark.local
+def test_end_process_already_exited():
+    p = ForkProcess(target=noop)
+    p.start()
+    p.join()
+    join_terminate_close_proc(p)
+
+
+def hang():
+    while True:
+        pass
+
+
+@pytest.mark.local
+def test_end_hung_process():
+    """Test calling against a process that will not exit itself."""
+    p = ForkProcess(target=hang)
+    p.start()
+    pid = p.pid
+    join_terminate_close_proc(p, timeout=1)
+    assert not psutil.pid_exists(pid), "process should not exist any more"
+
+
+def hang_no_sigint(e):
+    def s(*args, **kwargs):
+        e.set()
+    signal.signal(signal.SIGTERM, s)
+    while True:
+        pass
+
+
+@pytest.mark.local
+def test_end_hung_process_no_sigint():
+    """Test calling against a process that will not exit itself."""
+    e = multiprocessing.Event()
+    p = ForkProcess(target=hang_no_sigint, args=(e,))
+    p.start()
+    pid = p.pid
+    join_terminate_close_proc(p, timeout=1)
+    assert not psutil.pid_exists(pid), "process should not exist any more"
+    assert e.is_set(), "hung process should have set event on signal"
parsl/version.py
CHANGED
(+1 -1: the version string is bumped from the 2025.3.10 release to 2025.3.17)
{parsl-2025.3.10.data → parsl-2025.3.17.data}/scripts/process_worker_pool.py
CHANGED
(same +89 -82 changes as parsl/executors/high_throughput/process_worker_pool.py above; the wheel ships this script as a separate copy)
{parsl-2025.3.10.dist-info → parsl-2025.3.17.dist-info}/METADATA
CHANGED
@@ -1,9 +1,9 @@
 Metadata-Version: 2.1
 Name: parsl
-Version: 2025.3.10
+Version: 2025.3.17
 Summary: Simple data dependent workflows in Python
 Home-page: https://github.com/Parsl/parsl
-Download-URL: https://github.com/Parsl/parsl/archive/2025.03.10.tar.gz
+Download-URL: https://github.com/Parsl/parsl/archive/2025.03.17.tar.gz
 Author: The Parsl Team
 Author-email: parsl@googlegroups.com
 License: Apache 2.0
@@ -40,7 +40,7 @@ Requires-Dist: boto3; extra == "all"
 Requires-Dist: kubernetes; extra == "all"
 Requires-Dist: ipython<=8.6.0; extra == "all"
 Requires-Dist: nbsphinx; extra == "all"
-Requires-Dist: sphinx<
+Requires-Dist: sphinx<8,>=7.4; extra == "all"
 Requires-Dist: sphinx-rtd-theme; extra == "all"
 Requires-Dist: google-auth; extra == "all"
 Requires-Dist: google-api-python-client; extra == "all"
@@ -63,7 +63,7 @@ Requires-Dist: msrestazure; extra == "azure"
 Provides-Extra: docs
 Requires-Dist: ipython<=8.6.0; extra == "docs"
 Requires-Dist: nbsphinx; extra == "docs"
-Requires-Dist: sphinx<
+Requires-Dist: sphinx<8,>=7.4; extra == "docs"
 Requires-Dist: sphinx-rtd-theme; extra == "docs"
 Provides-Extra: flux
 Requires-Dist: pyyaml; extra == "flux"
{parsl-2025.3.10.dist-info → parsl-2025.3.17.dist-info}/RECORD
CHANGED
@@ -8,7 +8,7 @@ parsl/multiprocessing.py,sha256=MyaEcEq-Qf860u7V98u-PZrPNdtzOZL_NW6EhIJnmfQ,1937
 parsl/process_loggers.py,sha256=uQ7Gd0W72Jz7rrcYlOMfLsAEhkRltxXJL2MgdduJjEw,1136
 parsl/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 parsl/utils.py,sha256=codTX6_KLhgeTwNkRzc1lo4bgc1M93eJ-lkqOO98fvk,14331
-parsl/version.py,sha256=
+parsl/version.py,sha256=_yYxGBkoJMDKADe5yJ2dAkIINmdZgyRTRJnodIasABw,131
 parsl/app/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 parsl/app/app.py,sha256=0gbM4AH2OtFOLsv07I5nglpElcwMSOi-FzdZZfrk7So,8532
 parsl/app/bash.py,sha256=jm2AvePlCT9DZR7H_4ANDWxatp5dN_22FUlT_gWhZ-g,5528
@@ -80,9 +80,9 @@ parsl/executors/high_throughput/manager_selector.py,sha256=UKcUE6v0tO7PDMTThpKSK
 parsl/executors/high_throughput/monitoring_info.py,sha256=HC0drp6nlXQpAop5PTUKNjdXMgtZVvrBL0JzZJebPP4,298
 parsl/executors/high_throughput/mpi_executor.py,sha256=U-aatbLF_Mu1p6lP0HmT7Yn1Swn3cc7hPmDfuUb9TpI,4797
 parsl/executors/high_throughput/mpi_prefix_composer.py,sha256=DmpKugANNa1bdYlqQBLHkrFc15fJpefPPhW9hkAlh1s,4308
-parsl/executors/high_throughput/mpi_resource_management.py,sha256=
+parsl/executors/high_throughput/mpi_resource_management.py,sha256=73bTW2ZbHRfcrPN318cyjiqDN50AM1cOCQqUGJDIlBg,8199
 parsl/executors/high_throughput/probe.py,sha256=QOEaliO3x5cB6ltMOZMsZQ-ath9AAuFqXcBzRgWOM60,2754
-parsl/executors/high_throughput/process_worker_pool.py,sha256=
+parsl/executors/high_throughput/process_worker_pool.py,sha256=Q7FN0MdXIAOouxDarim6etYVHEgbXFiaMhBahC2ZtIQ,41137
 parsl/executors/high_throughput/zmq_pipes.py,sha256=NUK25IEh0UkxzdqQQyM8tMtuZmjSiTeWu1DzkkAIOhA,8980
 parsl/executors/radical/__init__.py,sha256=CKbtV2numw5QvgIBq1htMUrt9TqDCIC2zifyf2svTNU,186
 parsl/executors/radical/executor.py,sha256=en2TKzZnJYU_juojkM_aZUdWhbAgutAYn_EL6HGpfSY,22835
@@ -114,10 +114,10 @@ parsl/launchers/base.py,sha256=CblcvPTJiu-MNLWaRtFe29SZQ0BpTOlaY8CGcHdlHIE,538
 parsl/launchers/errors.py,sha256=8YMV_CHpBNVa4eXkGE4x5DaFQlZkDCRCHmBktYcY6TA,467
 parsl/launchers/launchers.py,sha256=cQsNsHuCOL_nQTjPXf0--YsgsDoMoJ77bO1Wt4ncLjs,15134
 parsl/monitoring/__init__.py,sha256=0ywNz6i0lM1xo_7_BIxhETDGeVd2C_0wwD7qgeaMR4c,83
-parsl/monitoring/db_manager.py,sha256=
+parsl/monitoring/db_manager.py,sha256=L0c5S9ockq0UIchT2bjmkSAWXS-t0G-Q_neOIBfLbm0,33444
 parsl/monitoring/errors.py,sha256=D6jpYzEzp0d6FmVKGqhvjAxr4ztZfJX2s-aXemH9bBU,148
 parsl/monitoring/message_type.py,sha256=Khn88afNxcOIciKiCK4GLnn90I5BlRTiOL3zK-P07yQ,401
-parsl/monitoring/monitoring.py,sha256=
+parsl/monitoring/monitoring.py,sha256=PspFFtf3Iaj5tl23ITRRdHrBDAocSOSvP2IVP_pmW-Y,13134
 parsl/monitoring/remote.py,sha256=t0qCTUMCzeJ_JOARFpjqlTNrAWdEb20BxhmZh9X7kEM,13728
 parsl/monitoring/types.py,sha256=oOCrzv-ab-_rv4pb8o58Sdb8G_RGp1aZriRbdf9zBEk,339
 parsl/monitoring/queries/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -125,13 +125,13 @@ parsl/monitoring/queries/pandas.py,sha256=0Z2r0rjTKCemf0eaDkF1irvVHn5g7KC5SYETvQ
 parsl/monitoring/radios/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 parsl/monitoring/radios/base.py,sha256=Ep5kHf07Sm-ApMBJVudRhoWRyuiu0udjO4NvEir5LEk,291
 parsl/monitoring/radios/filesystem.py,sha256=ioZ3jOKX5Qf0DYRtWmpCEorfuMVbS58OMS_QV7DOFOs,1765
-parsl/monitoring/radios/filesystem_router.py,sha256=
+parsl/monitoring/radios/filesystem_router.py,sha256=kQkinktSpsVwfNESfUggSzBlRZ5JgwjM7IDN-jARAhM,2146
 parsl/monitoring/radios/htex.py,sha256=qBu4O5NYnSETHX0ptdwxSpqa2Pp3Z_V6a6lb3TbjKm4,1643
 parsl/monitoring/radios/multiprocessing.py,sha256=fsfaaoMDp6VJv1DSAl-P0R2ofO6jp13byx6NsPItV3Y,655
 parsl/monitoring/radios/udp.py,sha256=bTpt7JYp-5hyBBLzgiLj1_BlSTn28UVp39OYgVGLXCw,1613
-parsl/monitoring/radios/udp_router.py,sha256=
+parsl/monitoring/radios/udp_router.py,sha256=LEiHZVhw3lVFhqUK1FAFFtpvNOWbB6RNRBK8FaMvtDw,5771
 parsl/monitoring/radios/zmq.py,sha256=fhoHp9ylhf-D3eTJb2aSHRsuic8-FJ_oRNGnniGkCAI,592
-parsl/monitoring/radios/zmq_router.py,sha256=
+parsl/monitoring/radios/zmq_router.py,sha256=pYhol8-SV8FThv7YIjqc5tv149E4ktDLb-l7-ot4nfg,5579
 parsl/monitoring/visualization/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 parsl/monitoring/visualization/app.py,sha256=xMeRlAnzl5lHddAOdSBcqY3D5lmOYw3Z3Z2_YyoVwnw,1425
 parsl/monitoring/visualization/models.py,sha256=C7CcF6w6PhtrdvDX9VgDH-aSrpLfvYU1fJ4-HDUeFVQ,5138
@@ -339,6 +339,7 @@ parsl/tests/test_monitoring/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMp
 parsl/tests/test_monitoring/test_app_names.py,sha256=A-mOMCVhZDnUyJp32fsTUkHdcyval8o7WPEWacDkbD4,2208
 parsl/tests/test_monitoring/test_basic.py,sha256=VdF6JHfqsEOIMg-ysIAREgygZIjHWNDVLNVQ7jhWxmQ,4592
 parsl/tests/test_monitoring/test_db_locks.py,sha256=3s3c1xhKo230ZZIJ3f1Ca4U7LcEdXnanOGVXQyNlk2U,2895
+parsl/tests/test_monitoring/test_exit_helper.py,sha256=FsMcQ1GF70vPXEfexDyo674_c5cglJBrLXKBzAYIfOk,1266
 parsl/tests/test_monitoring/test_fuzz_zmq.py,sha256=--3-pQUvXXbkr8v_BEJoPvVvNly1oXvrD2nJh6yl_0M,3436
 parsl/tests/test_monitoring/test_htex_init_blocks_vs_monitoring.py,sha256=_QV8zjBKVF_qBbBnhT0C3X9AmfS7IKLcOnEw_cU6HeM,2622
 parsl/tests/test_monitoring/test_incomplete_futures.py,sha256=ZnO1sFSwlWUBHX64C_zwfTVRVC_UFNlU4h0POgx6NEo,2005
@@ -457,13 +458,13 @@ parsl/usage_tracking/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hS
 parsl/usage_tracking/api.py,sha256=iaCY58Dc5J4UM7_dJzEEs871P1p1HdxBMtNGyVdzc9g,1821
 parsl/usage_tracking/levels.py,sha256=xbfzYEsd55KiZJ-mzNgPebvOH4rRHum04hROzEf41tU,291
 parsl/usage_tracking/usage.py,sha256=f9k6QcpbQxkGyP5WTC9PVyv0CA05s9NDpRe5wwRdBTM,9163
-parsl-2025.3.
-parsl-2025.3.
-parsl-2025.3.
-parsl-2025.3.
-parsl-2025.3.
-parsl-2025.3.
-parsl-2025.3.
-parsl-2025.3.
-parsl-2025.3.
-parsl-2025.3.
+parsl-2025.3.17.data/scripts/exec_parsl_function.py,sha256=YXKVVIa4zXmOtz-0Ca4E_5nQfN_3S2bh2tB75uZZB4w,7774
+parsl-2025.3.17.data/scripts/interchange.py,sha256=17MrOc7-FXxKBWTwkzIbUoa8fvvDfPelfjByd3ZD2Wk,29446
+parsl-2025.3.17.data/scripts/parsl_coprocess.py,sha256=zrVjEqQvFOHxsLufPi00xzMONagjVwLZbavPM7bbjK4,5722
+parsl-2025.3.17.data/scripts/process_worker_pool.py,sha256=__gFeFQJpV5moRofj3WKQCnKp6gmzieXjzkmzVuTmX4,41123
+parsl-2025.3.17.dist-info/LICENSE,sha256=tAkwu8-AdEyGxGoSvJ2gVmQdcicWw3j1ZZueVV74M-E,11357
+parsl-2025.3.17.dist-info/METADATA,sha256=d_WFIKY6wmq4VQQcz-BCh0yhu9i3i627EjutSTqSNH4,4023
+parsl-2025.3.17.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
+parsl-2025.3.17.dist-info/entry_points.txt,sha256=XqnsWDYoEcLbsMcpnYGKLEnSBmaIe1YoM5YsBdJG2tI,176
+parsl-2025.3.17.dist-info/top_level.txt,sha256=PIheYoUFQtF2icLsgOykgU-Cjuwr2Oi6On2jo5RYgRM,6
+parsl-2025.3.17.dist-info/RECORD,,
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
|