parsl 2023.12.4__py3-none-any.whl → 2023.12.18__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- parsl/dataflow/dflow.py +6 -3
- parsl/dataflow/errors.py +5 -3
- parsl/executors/high_throughput/process_worker_pool.py +136 -58
- parsl/executors/high_throughput/zmq_pipes.py +9 -2
- parsl/multiprocessing.py +5 -2
- parsl/providers/local/local.py +3 -2
- parsl/providers/slurm/slurm.py +13 -0
- parsl/tests/test_error_handling/test_htex_manager_failure.py +52 -0
- parsl/tests/test_python_apps/test_dep_standard_futures.py +9 -1
- parsl/tests/test_python_apps/test_depfail_propagation.py +4 -0
- parsl/version.py +1 -1
- {parsl-2023.12.4.data → parsl-2023.12.18.data}/scripts/process_worker_pool.py +136 -58
- {parsl-2023.12.4.dist-info → parsl-2023.12.18.dist-info}/METADATA +2 -2
- {parsl-2023.12.4.dist-info → parsl-2023.12.18.dist-info}/RECORD +20 -19
- {parsl-2023.12.4.data → parsl-2023.12.18.data}/scripts/exec_parsl_function.py +0 -0
- {parsl-2023.12.4.data → parsl-2023.12.18.data}/scripts/parsl_coprocess.py +0 -0
- {parsl-2023.12.4.dist-info → parsl-2023.12.18.dist-info}/LICENSE +0 -0
- {parsl-2023.12.4.dist-info → parsl-2023.12.18.dist-info}/WHEEL +0 -0
- {parsl-2023.12.4.dist-info → parsl-2023.12.18.dist-info}/entry_points.txt +0 -0
- {parsl-2023.12.4.dist-info → parsl-2023.12.18.dist-info}/top_level.txt +0 -0
parsl/dataflow/dflow.py
CHANGED

```diff
@@ -854,10 +854,13 @@ class DataFlowKernel:
                 try:
                     new_args.extend([dep.result()])
                 except Exception as e:
-
-
+                    # If this Future is associated with a task inside this DFK,
+                    # then refer to the task ID.
+                    # Otherwise make a repr of the Future object.
+                    if hasattr(dep, 'task_record') and dep.task_record['dfk'] == self:
+                        tid = "task " + repr(dep.task_record['id'])
                     else:
-                        tid =
+                        tid = repr(dep)
                     dep_failures.extend([(e, tid)])
             else:
                 new_args.extend([dep])
```
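The new branch picks a human-readable identifier for each failed dependency: futures created by this DFK carry a `task_record`, everything else falls back to `repr`. A runnable sketch of that selection (illustrative only; the `dep.task_record['dfk'] == self` identity check is omitted here, and this is not parsl code):

```python
from concurrent.futures import Future

dep = Future()
dep.set_exception(ValueError("failed dependency"))

# Simplified: the real code also checks dep.task_record['dfk'] == self
if hasattr(dep, 'task_record'):
    tid = "task " + repr(dep.task_record['id'])
else:
    tid = repr(dep)  # e.g. "<Future at 0x7f... state=finished raised ValueError>"

print(tid)
```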
parsl/dataflow/errors.py
CHANGED

```diff
@@ -36,7 +36,9 @@ class DependencyError(DataFlowException):
        in a dependency.

    Args:
-       - dependent_exceptions_tids: List of
+       - dependent_exceptions_tids: List of exceptions and identifiers for
+         dependencies which failed. The identifier might be a task ID or
+         the repr of a non-DFK Future.
       - task_id: Task ID of the task that failed because of the dependency error
    """

@@ -45,8 +47,8 @@ class DependencyError(DataFlowException):
        self.task_id = task_id

    def __str__(self) -> str:
-
-        return "Dependency failure for task {} with failed dependencies from
+        deps = ", ".join(tid for _exc, tid in self.dependent_exceptions_tids)
+        return f"Dependency failure for task {self.task_id} with failed dependencies from {deps}"


class JoinError(DataFlowException):
```
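Combined with the dflow.py change above, `__str__` now joins those identifiers into one message. A sketch of the resulting format, using a stand-in class that copies only the new `__str__` (the rest of the class is assumed, not taken from parsl):

```python
class DependencyError(Exception):
    """Stand-in carrying just the two attributes __str__ needs."""
    def __init__(self, dependent_exceptions_tids, task_id):
        self.dependent_exceptions_tids = dependent_exceptions_tids
        self.task_id = task_id

    def __str__(self) -> str:
        deps = ", ".join(tid for _exc, tid in self.dependent_exceptions_tids)
        return f"Dependency failure for task {self.task_id} with failed dependencies from {deps}"

err = DependencyError([(ValueError("boom"), "task 5"),
                       (RuntimeError("bust"), "<Future at 0x... state=finished>")], 9)
print(err)
# Dependency failure for task 9 with failed dependencies from task 5, <Future at 0x... state=finished>
```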
parsl/executors/high_throughput/process_worker_pool.py
CHANGED

```diff
@@ -17,6 +17,8 @@ import math
 import json
 import psutil
 import multiprocessing
+from multiprocessing.managers import DictProxy
+from multiprocessing.sharedctypes import Synchronized

 from parsl.process_loggers import wrap_with_logs

@@ -24,10 +26,7 @@ from parsl.version import VERSION as PARSL_VERSION
 from parsl.app.errors import RemoteExceptionWrapper
 from parsl.executors.high_throughput.errors import WorkerLost
 from parsl.executors.high_throughput.probe import probe_addresses
-from parsl.multiprocessing import
-
-from parsl.multiprocessing import SizedQueue as mpQueue
-
+from parsl.multiprocessing import SpawnContext
 from parsl.serialize import unpack_apply_message, serialize

 HEARTBEAT_CODE = (2 ** 32) - 1
```
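The two new imports are typing handles for the shared state introduced further down: `SpawnContext.Manager().dict()` yields a `DictProxy` and `SpawnContext.Value("i", 0)` yields a `Synchronized` wrapper. A quick stdlib check (assumed stdlib behavior, not parsl code):

```python
import multiprocessing
from multiprocessing.managers import DictProxy
from multiprocessing.sharedctypes import Synchronized

if __name__ == "__main__":
    ctx = multiprocessing.get_context("spawn")
    mgr = ctx.Manager()    # starts a manager server process
    d = mgr.dict()         # proxy object, usable across processes
    v = ctx.Value("i", 0)  # shared C int with an attached lock

    print(isinstance(d, DictProxy), isinstance(v, Synchronized))  # expect: True True
    mgr.shutdown()
```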
```diff
@@ -49,22 +48,22 @@ class Manager:
     | |         IPC-Qeueues
     |
     """
-    def __init__(self,
-                 addresses
-                 address_probe_timeout
-                 task_port
-                 result_port
-                 cores_per_worker
-                 mem_per_worker
-                 max_workers
-                 prefetch_capacity
-                 uid
-                 block_id
-                 heartbeat_threshold
-                 heartbeat_period
-                 poll_period
-                 cpu_affinity
-                 available_accelerators: Sequence[str]
+    def __init__(self, *,
+                 addresses,
+                 address_probe_timeout,
+                 task_port,
+                 result_port,
+                 cores_per_worker,
+                 mem_per_worker,
+                 max_workers,
+                 prefetch_capacity,
+                 uid,
+                 block_id,
+                 heartbeat_threshold,
+                 heartbeat_period,
+                 poll_period,
+                 cpu_affinity,
+                 available_accelerators: Sequence[str]):
        """
        Parameters
        ----------

@@ -73,7 +72,7 @@ class Manager:

        address_probe_timeout : int
            Timeout in seconds for the address probe to detect viable addresses
-            to the interchange.
+            to the interchange.

        uid : str
           string unique identifier

@@ -83,43 +82,41 @@ class Manager:

        cores_per_worker : float
            cores to be assigned to each worker. Oversubscription is possible
-            by setting cores_per_worker < 1.0.
+            by setting cores_per_worker < 1.0.

        mem_per_worker : float
            GB of memory required per worker. If this option is specified, the node manager
            will check the available memory at startup and limit the number of workers such that
            the there's sufficient memory for each worker. If set to None, memory on node is not
            considered in the determination of workers to be launched on node by the manager.
-            Default: None

        max_workers : int
            caps the maximum number of workers that can be launched.
-            default: infinity

        prefetch_capacity : int
            Number of tasks that could be prefetched over available worker capacity.
            When there are a few tasks (<100) or when tasks are long running, this option should
-            be set to 0 for better load balancing.
+            be set to 0 for better load balancing.

        heartbeat_threshold : int
            Seconds since the last message from the interchange after which the
-            interchange is assumed to be un-available, and the manager initiates shutdown.
+            interchange is assumed to be un-available, and the manager initiates shutdown.

            Number of seconds since the last message from the interchange after which the worker
-            assumes that the interchange is lost and the manager shuts down.
+            assumes that the interchange is lost and the manager shuts down.

        heartbeat_period : int
            Number of seconds after which a heartbeat message is sent to the interchange, and workers
            are checked for liveness.

        poll_period : int
-            Timeout period used by the manager in milliseconds.
+            Timeout period used by the manager in milliseconds.

        cpu_affinity : str
            Whether or how each worker should force its affinity to different CPUs

        available_accelerators: list of str
-            List of accelerators available to the workers.
+            List of accelerators available to the workers.

        """

```
```diff
@@ -160,7 +157,7 @@ class Manager:
         if os.environ.get('PARSL_CORES'):
             cores_on_node = int(os.environ['PARSL_CORES'])
         else:
-            cores_on_node =
+            cores_on_node = SpawnContext.cpu_count()

         if os.environ.get('PARSL_MEMORY_GB'):
             available_mem_on_node = float(os.environ['PARSL_MEMORY_GB'])

@@ -175,13 +172,16 @@ class Manager:
         if mem_per_worker and mem_per_worker > 0:
             mem_slots = math.floor(available_mem_on_node / mem_per_worker)

-        self.worker_count = min(max_workers,
-
-
+        self.worker_count: int = min(max_workers,
+                                     mem_slots,
+                                     math.floor(cores_on_node / cores_per_worker))

-        self.
-
-        self.
+        self._mp_manager = SpawnContext.Manager()  # Starts a server process
+
+        self.monitoring_queue = self._mp_manager.Queue()
+        self.pending_task_queue = SpawnContext.Queue()
+        self.pending_result_queue = SpawnContext.Queue()
+        self.ready_worker_count = SpawnContext.Value("i", 0)

         self.max_queue_size = self.prefetch_capacity + self.worker_count
```
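`ready_worker_count` replaces queue-based bookkeeping with a shared counter. Incrementing a `Synchronized` value is a read-modify-write, so it is only safe under `get_lock()`, which the worker loop further down always takes. A sketch of why the lock matters (illustrative, not parsl code):

```python
import multiprocessing

def bump(counter, n):
    for _ in range(n):
        with counter.get_lock():  # without this, concurrent increments can be lost
            counter.value += 1

if __name__ == "__main__":
    ctx = multiprocessing.get_context("spawn")
    counter = ctx.Value("i", 0)   # "i": C int, starts at 0
    procs = [ctx.Process(target=bump, args=(counter, 1000)) for _ in range(4)]
    for p in procs:
        p.start()
    for p in procs:
        p.join()
    print(counter.value)          # 4000, thanks to the lock
```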
```diff
@@ -252,10 +252,13 @@ class Manager:
         poll_timer = self.poll_period

         while not kill_event.is_set():
-
-
+            try:
+                pending_task_count = self.pending_task_queue.qsize()
+            except NotImplementedError:
+                # Ref: https://github.com/python/cpython/blob/6d5e0dc0e330f4009e8dc3d1642e46b129788877/Lib/multiprocessing/queues.py#L125
+                pending_task_count = f"pending task count is not available on {platform.system()}"

-            logger.debug("ready workers: {}, pending tasks: {}".format(ready_worker_count,
+            logger.debug("ready workers: {}, pending tasks: {}".format(self.ready_worker_count.value,
                                                                        pending_task_count))

            if time.time() > last_beat + self.heartbeat_period:
```
```diff
@@ -381,6 +384,36 @@ class Manager:

         logger.critical("Exiting")

+    @wrap_with_logs
+    def handle_monitoring_messages(self, kill_event: threading.Event):
+        """Transfer messages from the managed monitoring queue to the result queue.
+
+        We separate the queues so that the result queue does not rely on a manager
+        process, which adds overhead that causes slower queue operations but enables
+        use across processes started in fork and spawn contexts.
+
+        We transfer the messages to the result queue to reuse the ZMQ connection between
+        the manager and the interchange.
+        """
+        logger.debug("Starting monitoring handler thread")
+
+        poll_period_s = max(10, self.poll_period) / 1000  # Must be at least 10 ms
+
+        while not kill_event.is_set():
+            try:
+                logger.debug("Starting monitor_queue.get()")
+                msg = self.monitoring_queue.get(block=True, timeout=poll_period_s)
+            except queue.Empty:
+                logger.debug("monitoring_queue.get() has timed out")
+            except Exception as e:
+                logger.exception(f"Got an exception: {e}")
+            else:
+                logger.debug("Got a monitoring message")
+                self.pending_result_queue.put(msg)
+                logger.debug("Put monitoring message on pending_result_queue")
+
+        logger.critical("Exiting")
+
     def start(self):
         """ Start the worker processes.

```
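The docstring's tradeoff in miniature: a managed queue is a proxy to a separate server process, usable from children regardless of start method but paying an IPC round trip per operation, while a native context queue is pipe-based and faster. The handler thread bridges the two. A sketch (stdlib behavior, not parsl code):

```python
import multiprocessing

if __name__ == "__main__":
    ctx = multiprocessing.get_context("spawn")
    mgr = ctx.Manager()         # extra server process
    monitoring_q = mgr.Queue()  # proxied: works across fork/spawn children
    result_q = ctx.Queue()      # native: faster, used for the ZMQ-bound path

    monitoring_q.put({"kind": "monitoring", "payload": 1})
    result_q.put(monitoring_q.get())  # what the handler thread does per message
    print(result_q.get())
    mgr.shutdown()
```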
```diff
@@ -388,7 +421,7 @@ class Manager:
         """
         start = time.time()
         self._kill_event = threading.Event()
-        self._tasks_in_progress =
+        self._tasks_in_progress = self._mp_manager.dict()

         self.procs = {}
         for worker_id in range(self.worker_count):

@@ -406,9 +439,14 @@ class Manager:
         self._worker_watchdog_thread = threading.Thread(target=self.worker_watchdog,
                                                         args=(self._kill_event,),
                                                         name="worker-watchdog")
+        self._monitoring_handler_thread = threading.Thread(target=self.handle_monitoring_messages,
+                                                           args=(self._kill_event,),
+                                                           name="Monitoring-Handler")
+
         self._task_puller_thread.start()
         self._result_pusher_thread.start()
         self._worker_watchdog_thread.start()
+        self._monitoring_handler_thread.start()

         logger.info("Loop start")

@@ -420,6 +458,7 @@ class Manager:
         self._task_puller_thread.join()
         self._result_pusher_thread.join()
         self._worker_watchdog_thread.join()
+        self._monitoring_handler_thread.join()
         for proc_id in self.procs:
             self.procs[proc_id].terminate()
             logger.critical("Terminating worker {}: is_alive()={}".format(self.procs[proc_id],

@@ -435,7 +474,7 @@ class Manager:
         return

     def _start_worker(self, worker_id: int):
-        p =
+        p = SpawnContext.Process(
             target=worker,
             args=(
                 worker_id,

@@ -443,10 +482,16 @@ class Manager:
                 self.worker_count,
                 self.pending_task_queue,
                 self.pending_result_queue,
-                self.
+                self.monitoring_queue,
+                self.ready_worker_count,
                 self._tasks_in_progress,
                 self.cpu_affinity,
                 self.available_accelerators[worker_id] if self.accelerators_available else None,
+                self.block_id,
+                self.heartbeat_period,
+                os.getpid(),
+                args.logdir,
+                args.debug,
             ),
             name="HTEX-Worker-{}".format(worker_id),
         )
```
```diff
@@ -484,7 +529,23 @@ def execute_task(bufs):


 @wrap_with_logs(target="worker_log")
-def worker(
+def worker(
+        worker_id: int,
+        pool_id: str,
+        pool_size: int,
+        task_queue: multiprocessing.Queue,
+        result_queue: multiprocessing.Queue,
+        monitoring_queue: queue.Queue,
+        ready_worker_count: Synchronized,
+        tasks_in_progress: DictProxy,
+        cpu_affinity: str,
+        accelerator: Optional[str],
+        block_id: str,
+        task_queue_timeout: int,
+        manager_pid: int,
+        logdir: str,
+        debug: bool,
+):
     """

     Put request token into queue

@@ -496,23 +557,22 @@ def worker(worker_id, pool_id, pool_size, task_queue, result_queue, worker_queue
     # override the global logger inherited from the __main__ process (which
     # usually logs to manager.log) with one specific to this worker.
     global logger
-    logger = start_file_logger('{}/block-{}/{}/worker_{}.log'.format(
+    logger = start_file_logger('{}/block-{}/{}/worker_{}.log'.format(logdir, block_id, pool_id, worker_id),
                                worker_id,
                                name="worker_log",
-                               level=logging.DEBUG if
+                               level=logging.DEBUG if debug else logging.INFO)

     # Store worker ID as an environment variable
     os.environ['PARSL_WORKER_RANK'] = str(worker_id)
     os.environ['PARSL_WORKER_COUNT'] = str(pool_size)
     os.environ['PARSL_WORKER_POOL_ID'] = str(pool_id)
-    os.environ['PARSL_WORKER_BLOCK_ID'] = str(
+    os.environ['PARSL_WORKER_BLOCK_ID'] = str(block_id)

-    # share the result queue with monitoring code so it too can send results down that channel
     import parsl.executors.high_throughput.monitoring_info as mi
-    mi.result_queue =
+    mi.result_queue = monitoring_queue

     logger.info('Worker {} started'.format(worker_id))
-    if
+    if debug:
         logger.debug("Debug logging enabled")

     # If desired, set process affinity

@@ -553,20 +613,37 @@ def worker(worker_id, pool_id, pool_size, task_queue, result_queue, worker_queue

         logger.info(f'Pinned worker to accelerator: {accelerator}')

-
-
+    def manager_is_alive():
+        try:
+            # This does not kill the process, but instead raises
+            # an exception if the process doesn't exist
+            os.kill(manager_pid, 0)
+        except OSError:
+            logger.critical(f"Manager ({manager_pid}) died; worker {worker_id} shutting down")
+            return False
+        else:
+            return True
+
+    worker_enqueued = False
+    while manager_is_alive():
+        if not worker_enqueued:
+            with ready_worker_count.get_lock():
+                ready_worker_count.value += 1
+                worker_enqueued = True
+
+        try:
+            # The worker will receive {'task_id':<tid>, 'buffer':<buf>}
+            req = task_queue.get(timeout=task_queue_timeout)
+        except queue.Empty:
+            continue

-        # The worker will receive {'task_id':<tid>, 'buffer':<buf>}
-        req = task_queue.get()
         tasks_in_progress[worker_id] = req
         tid = req['task_id']
         logger.info("Received executor task {}".format(tid))

-
-
-
-            logger.warning("Worker ID: {} failed to remove itself from ready_worker_queue".format(worker_id))
-            pass
+        with ready_worker_count.get_lock():
+            ready_worker_count.value -= 1
+            worker_enqueued = False

         try:
             result = execute_task(req['buffer'])
```
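`manager_is_alive` uses a POSIX idiom: signal `0` performs the existence and permission checks without delivering anything. The probe as a standalone sketch (POSIX-only; illustrative, not parsl code):

```python
import os

def process_exists(pid: int) -> bool:
    try:
        os.kill(pid, 0)   # signal 0: check only, nothing is delivered
    except ProcessLookupError:
        return False      # no such pid
    except PermissionError:
        return True       # exists, but owned by another user
    return True

print(process_exists(os.getpid()))  # True: we certainly exist
```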
```diff
@@ -653,6 +730,7 @@ if __name__ == "__main__":
     parser.add_argument("-r", "--result_port", required=True,
                         help="REQUIRED: Result port for posting results to the interchange")
     parser.add_argument("--cpu-affinity", type=str, choices=["none", "block", "alternating", "block-reverse"],
+                        required=True,
                         help="Whether/how workers should control CPU affinity.")
     parser.add_argument("--available-accelerators", type=str, nargs="*",
                         help="Names of available accelerators")
```
parsl/executors/high_throughput/zmq_pipes.py
CHANGED

```diff
@@ -46,7 +46,6 @@ class CommandClient:
         """ This function needs to be fast at the same time aware of the possibility of
             ZMQ pipes overflowing.

-            The timeout increases slowly if contention is detected on ZMQ pipes.
             We could set copy=False and get slightly better latency but this results
             in ZMQ sockets reaching a broken state once there are ~10k tasks in flight.
             This issue can be magnified if each the serialized buffer itself is larger.

@@ -55,8 +54,11 @@ class CommandClient:
         with self._lock:
             for _ in range(max_retries):
                 try:
+                    logger.debug("Sending command client command")
                     self.zmq_socket.send_pyobj(message, copy=True)
+                    logger.debug("Waiting for command client response")
                     reply = self.zmq_socket.recv_pyobj()
+                    logger.debug("Received command client response")
                 except zmq.ZMQError:
                     logger.exception("Potential ZMQ REQ-REP deadlock caught")
                     logger.info("Trying to reestablish context")

@@ -115,7 +117,9 @@ class TasksOutgoing:
             socks = dict(self.poller.poll(timeout=timeout_ms))
             if self.zmq_socket in socks and socks[self.zmq_socket] == zmq.POLLOUT:
                 # The copy option adds latency but reduces the risk of ZMQ overflow
+                logger.debug("Sending TasksOutgoing message")
                 self.zmq_socket.send_pyobj(message, copy=True)
+                logger.debug("Sent TasksOutgoing message")
                 return
             else:
                 timeout_ms *= 2

@@ -149,7 +153,10 @@ class ResultsIncoming:
                                                             max_port=port_range[1])

     def get(self):
-
+        logger.debug("Waiting for ResultsIncoming message")
+        m = self.results_receiver.recv_multipart()
+        logger.debug("Received ResultsIncoming message")
+        return m

     def close(self):
         self.results_receiver.close()
```
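Context for the added debug logging: `CommandClient` wraps a ZMQ REQ socket, and REQ/REP pairs must strictly alternate send and receive, so a lost reply stalls the client; the logs bracket each step to make such stalls visible. A self-contained REQ/REP round trip in generic pyzmq (not parsl's actual sockets or endpoints):

```python
import zmq

ctx = zmq.Context.instance()
rep = ctx.socket(zmq.REP)
rep.bind("inproc://command")   # in-process endpoint for the demo
req = ctx.socket(zmq.REQ)
req.connect("inproc://command")

req.send_pyobj({"cmd": "WORKER_COUNT"})  # REQ step 1: send
print(rep.recv_pyobj())                  # REP receives the request
rep.send_pyobj(4)                        # REP answers
print(req.recv_pyobj())                  # REQ step 2: receive, or deadlock
```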
parsl/multiprocessing.py
CHANGED

```diff
@@ -1,4 +1,4 @@
-"""Helpers for cross-
+"""Helpers for cross-platform multiprocessing support.
 """

 import logging

@@ -10,9 +10,12 @@ from typing import Callable, Type

 logger = logging.getLogger(__name__)

+ForkContext = multiprocessing.get_context("fork")
+SpawnContext = multiprocessing.get_context("spawn")
+
 # maybe ForkProcess should be: Callable[..., Process] so as to make
 # it clear that it returns a Process always to the type checker?
-ForkProcess: Type =
+ForkProcess: Type = ForkContext.Process


 class MacSafeQueue(multiprocessing.queues.Queue):
```
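`multiprocessing.get_context` returns a context object whose `Process`, `Queue`, and `Value` all use the named start method without touching the process-global default; `SpawnContext` is what the worker pool now builds on. A minimal sketch (note the `fork` method is unavailable on Windows):

```python
import multiprocessing

SpawnContext = multiprocessing.get_context("spawn")

def greet(name: str):
    print(f"hello from the {name} child")

if __name__ == "__main__":  # required under spawn: the module is re-imported
    p = SpawnContext.Process(target=greet, args=("spawn-started",))
    p.start()
    p.join()
```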
parsl/providers/local/local.py
CHANGED

```diff
@@ -28,8 +28,9 @@ class LocalProvider(ExecutionProvider, RepresentationMixin):
         Ratio of provisioned task slots to active tasks. A parallelism value of 1 represents aggressive
         scaling where as many resources as possible are used; parallelism close to 0 represents
         the opposite situation in which as few resources as possible (i.e., min_blocks) are used.
-    move_files : Optional[Bool]
-
+    move_files : Optional[Bool]
+        Should files be moved? By default, Parsl will try to figure this out itself (= None).
+        If True, then will always move. If False, will never move.
     worker_init : str
         Command to be run before starting a worker, such as 'module load Anaconda; source activate env'.
     """
```
parsl/providers/slurm/slurm.py
CHANGED

```diff
@@ -47,6 +47,10 @@ class SlurmProvider(ClusterProvider, RepresentationMixin):
     account : str
         Slurm account to which to charge resources used by the job. If unspecified or ``None``, the job will use the
         user's default account.
+    qos : str
+        Slurm queue to place job in. If unspecified or ``None``, no queue slurm directive will be specified.
+    constraint : str
+        Slurm job constraint, often used to choose cpu or gpu type. If unspecified or ``None``, no constraint slurm directive will be added.
     channel : Channel
         Channel for accessing this provider. Possible channels include
         :class:`~parsl.channels.LocalChannel` (the default),

@@ -92,6 +96,8 @@ class SlurmProvider(ClusterProvider, RepresentationMixin):
     def __init__(self,
                  partition: Optional[str] = None,
                  account: Optional[str] = None,
+                 qos: Optional[str] = None,
+                 constraint: Optional[str] = None,
                  channel: Channel = LocalChannel(),
                  nodes_per_block: int = 1,
                  cores_per_node: Optional[int] = None,

@@ -126,6 +132,8 @@ class SlurmProvider(ClusterProvider, RepresentationMixin):
         self.exclusive = exclusive
         self.move_files = move_files
         self.account = account
+        self.qos = qos
+        self.constraint = constraint
         self.scheduler_options = scheduler_options + '\n'
         if exclusive:
             self.scheduler_options += "#SBATCH --exclusive\n"

@@ -133,6 +141,11 @@ class SlurmProvider(ClusterProvider, RepresentationMixin):
             self.scheduler_options += "#SBATCH --partition={}\n".format(partition)
         if account:
             self.scheduler_options += "#SBATCH --account={}\n".format(account)
+        if qos:
+            self.scheduler_options += "#SBATCH --qos={}\n".format(qos)
+        if constraint:
+            self.scheduler_options += "#SBATCH --constraint={}\n".format(constraint)
+
         self.regex_job_id = regex_job_id
         self.worker_init = worker_init + '\n'
```
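How the two new provider options would surface in a user configuration; the partition, qos, and constraint values below are made-up placeholders:

```python
from parsl.config import Config
from parsl.executors import HighThroughputExecutor
from parsl.providers import SlurmProvider

config = Config(
    executors=[
        HighThroughputExecutor(
            provider=SlurmProvider(
                partition="gpu",    # existing option
                qos="regular",      # new in this release
                constraint="a100",  # new in this release
            ),
        ),
    ],
)
# The provider prepends the matching directives to the submit script:
#   #SBATCH --qos=regular
#   #SBATCH --constraint=a100
```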
parsl/tests/test_error_handling/test_htex_manager_failure.py
ADDED

```diff
@@ -0,0 +1,52 @@
+import os
+import signal
+import time
+
+import pytest
+
+import parsl
+from parsl.app.app import python_app
+from parsl.tests.configs.htex_local import fresh_config
+
+
+@pytest.fixture(autouse=True, scope="function")
+def load_config():
+    config = fresh_config()
+    config.executors[0].poll_period = 1
+    config.executors[0].max_workers = 1
+    config.executors[0].heartbeat_period = 1
+
+    parsl.load(config)
+    yield
+
+    parsl.dfk().cleanup()
+    parsl.clear()
+
+
+@python_app
+def get_worker_pid():
+    import os
+    return os.getpid()
+
+
+@python_app
+def kill_manager(sig: int):
+    import os
+    os.kill(os.getppid(), sig)
+
+
+@pytest.mark.local
+@pytest.mark.parametrize("sig", [signal.SIGTERM, signal.SIGKILL])
+def test_htex_manager_failure_worker_shutdown(sig: int):
+    """Ensure that HTEX workers shut down when the Manager process dies."""
+    worker_pid = get_worker_pid().result()
+
+    kill_manager(sig)
+
+    with pytest.raises(OSError):
+        end = time.monotonic() + 5
+        while time.monotonic() < end:
+            # Raises an exception if the process
+            # does not exist
+            os.kill(worker_pid, 0)
+            time.sleep(.1)
```
parsl/tests/test_python_apps/test_dep_standard_futures.py
CHANGED

```diff
@@ -33,4 +33,12 @@ def test_future_fail_dependency():

     plain_fut.set_exception(ValueError("Plain failure"))

-
+    ex = parsl_fut.exception()
+
+    # check that what we got is a dependency error...
+    assert isinstance(ex, DependencyError)
+
+    # and that the dependency error string mentions the dependency
+    # Future, plain_fut, somewhere in its str
+
+    assert repr(plain_fut) in str(ex)
```
parsl/tests/test_python_apps/test_depfail_propagation.py
CHANGED

```diff
@@ -21,6 +21,10 @@ def test_depfail_once():
     assert not isinstance(f1.exception(), DependencyError)
     assert isinstance(f2.exception(), DependencyError)

+    # check that the task ID of the failing task is mentioned
+    # in the DependencyError message
+    assert ("task " + str(f1.task_record['id'])) in str(f2.exception())
+

 def test_depfail_chain():
     """Test that dependency failures chain"""
```
parsl/version.py
CHANGED

```diff
-VERSION = '2023.12.04'
+VERSION = '2023.12.18'
```
{parsl-2023.12.4.data → parsl-2023.12.18.data}/scripts/process_worker_pool.py
CHANGED

This installed-script copy carries the same +136 −58 diff as parsl/executors/high_throughput/process_worker_pool.py above.
{parsl-2023.12.4.dist-info → parsl-2023.12.18.dist-info}/METADATA
CHANGED

```diff
@@ -1,9 +1,9 @@
 Metadata-Version: 2.1
 Name: parsl
-Version: 2023.12.
+Version: 2023.12.18
 Summary: Simple data dependent workflows in Python
 Home-page: https://github.com/Parsl/parsl
-Download-URL: https://github.com/Parsl/parsl/archive/2023.12.
+Download-URL: https://github.com/Parsl/parsl/archive/2023.12.18.tar.gz
 Author: The Parsl Team
 Author-email: parsl@googlegroups.com
 License: Apache 2.0
```
{parsl-2023.12.4.dist-info → parsl-2023.12.18.dist-info}/RECORD
CHANGED

```diff
@@ -3,11 +3,11 @@ parsl/addresses.py,sha256=L4RjQ-jGY9RfT-hBpsGw1uCzWaIdrEKxcPWV-TkGJes,4767
 parsl/config.py,sha256=ysUWBfm9bygayHHdItaJbP4oozkHJJmVQVnWCt5igjE,6808
 parsl/errors.py,sha256=SzINzQFZDBDbj9l-DPQznD0TbGkNhHIRAPkcBCogf_A,1019
 parsl/log_utils.py,sha256=AGem-dhQs5TYUyJg6GKkRuHxAw8FHhYlWB_0s7_ROw4,3175
-parsl/multiprocessing.py,sha256=
+parsl/multiprocessing.py,sha256=w3t1pFkHo4oZpznc2KF6Ff-Jj8MvXqvjm-hoiRqZDDQ,1984
 parsl/process_loggers.py,sha256=1G3Rfrh5wuZNo2X03grG4kTYPGOxz7hHCyG6L_A3b0A,1137
 parsl/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 parsl/utils.py,sha256=_flbNpTu6IXHbzIyE5JkUbOBIK4poc1R1bjBtwJUVdo,11622
-parsl/version.py,sha256=
+parsl/version.py,sha256=TdZ44_l5ykDC_eoJiyMUVahV--8rnh2rNzC-MtW2rL4,131
 parsl/app/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 parsl/app/app.py,sha256=wAHchJetgnicT1pn0NJKDeDX0lV3vDFlG8cQd_Ciax4,8522
 parsl/app/bash.py,sha256=bx9x1XFwkOTpZZD3CPwnVL9SyNRDjbUGtOnuGLvxN_8,5396

@@ -62,8 +62,8 @@ parsl/data_provider/http.py,sha256=nDHTW7XmJqAukWJjPRQjyhUXt8r6GsQ36mX9mv_wOig,2
 parsl/data_provider/rsync.py,sha256=2-ZxqrT-hBj39x082NusJaBqsGW4Jd2qCW6JkVPpEl0,4254
 parsl/data_provider/staging.py,sha256=l-mAXFburs3BWPjkSmiQKuAgJpsxCG62yATPDbrafYI,4523
 parsl/dataflow/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-parsl/dataflow/dflow.py,sha256=
-parsl/dataflow/errors.py,sha256=
+parsl/dataflow/dflow.py,sha256=uuXY9pURFDBpL0w52J0DWGCOtorTQ5wFy5V0WwHS9L8,63909
+parsl/dataflow/errors.py,sha256=w2vOt_ymzG2dOqJUO4IDcmTlrCIHlMZL8nBVyVq0O_8,2176
 parsl/dataflow/futures.py,sha256=aVfEUTzp4-EdunDAtNcqVQf8l_A7ArDi2c82KZMwxfY,5256
 parsl/dataflow/memoization.py,sha256=AsJO6c6cRp2ac6H8uGn2USlEi78_nX3QWvpxYt4XdYE,9583
 parsl/dataflow/rundirs.py,sha256=XKmBZpBEIsGACBhYOkbbs2e5edC0pQegJcSlk4FWeag,1154

@@ -85,8 +85,8 @@ parsl/executors/high_throughput/interchange.py,sha256=tX_EvQf7WkSKMJG-TNmA-WADjh
 parsl/executors/high_throughput/manager_record.py,sha256=T8-JVMfDJU6SJfzJRooD0mO8AHGMXlcn3PBOM0m_vng,366
 parsl/executors/high_throughput/monitoring_info.py,sha256=3gQpwQjjNDEBz0cQqJZB6hRiwLiWwXs83zkQDmbOwxY,297
 parsl/executors/high_throughput/probe.py,sha256=lvnuf-vBv57tHvFh-J51F9sDYBES7jCgs6KYgWvmKRs,2749
-parsl/executors/high_throughput/process_worker_pool.py,sha256=
-parsl/executors/high_throughput/zmq_pipes.py,sha256=
+parsl/executors/high_throughput/process_worker_pool.py,sha256=l0l5F3mpJ60idMCN-d1AbdaogmOtO5eO3uGWogspNXg,34070
+parsl/executors/high_throughput/zmq_pipes.py,sha256=88VJz9QejOCQ_yyhaO5C1uQuDYZTovYEcnKn15WxHSU,6103
 parsl/executors/radical/__init__.py,sha256=CKbtV2numw5QvgIBq1htMUrt9TqDCIC2zifyf2svTNU,186
 parsl/executors/radical/executor.py,sha256=ZYycq58jXlBlhmIO1355JCK1xIJHkspiy62NN1XiMYQ,20729
 parsl/executors/radical/rpex_master.py,sha256=nMGxYWw3r-8_vZVnEwfB5eCfdTqXkeQDP5yvU0jXgc8,1368

@@ -174,7 +174,7 @@ parsl/providers/kubernetes/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJ
 parsl/providers/kubernetes/kube.py,sha256=uOr-sPgp73r1JFNc6wYhGhNGCvqkI8xBZznuJvfIfyk,12819
 parsl/providers/kubernetes/template.py,sha256=VsRz6cmNaII-y4OdMT6sCwzQy95SJX6NMB0hmmFBhX4,50
 parsl/providers/local/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-parsl/providers/local/local.py,sha256=
+parsl/providers/local/local.py,sha256=4X6Ds7PUuwFgkc6ZuzkEyDWTzjchb9eNzAgrwYfUKAs,11369
 parsl/providers/lsf/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 parsl/providers/lsf/lsf.py,sha256=AECVpjl_CTreE-APFQSjMVVIb3HheG6zculJn-zYtdM,11502
 parsl/providers/lsf/template.py,sha256=leQ_TpXv7ePMzbHfLaWvqMR0VORxlp-hjX5JxtkcwwU,269

@@ -182,7 +182,7 @@ parsl/providers/pbspro/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3
 parsl/providers/pbspro/pbspro.py,sha256=zXsb45LhgCkLEwwKXjdjsm2jv884j1fXHJ2hky9auD0,7789
 parsl/providers/pbspro/template.py,sha256=ozMbrx0HNsLnSoWbkZhy-55yJoTX5gpdRrDuVn6TFWA,369
 parsl/providers/slurm/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-parsl/providers/slurm/slurm.py,sha256=
+parsl/providers/slurm/slurm.py,sha256=qHTNI5crS90PzdcoDu_lzDrGrYNss8yY_1XuWU5S2bc,12330
 parsl/providers/slurm/template.py,sha256=cc-3l5wnThEWfqzpniPgi3FP6934Ni05UZ9r0A1RA8s,369
 parsl/providers/torque/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 parsl/providers/torque/template.py,sha256=4qfc2gmlEhRCAD7erFDOs4prJQ43I8s4E8DSUSVQx3A,358

@@ -326,6 +326,7 @@ parsl/tests/test_docs/test_workflow4.py,sha256=PfOVDx5v_NtwDvg-ccC3A3SVM-SF0Pcyb
 parsl/tests/test_error_handling/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 parsl/tests/test_error_handling/test_fail.py,sha256=xx4TGWfL7le4cQ9nvnUkrlmKQJkskhD0l_3W1xwZSEI,282
 parsl/tests/test_error_handling/test_htex_basic.py,sha256=VRP_-Ro2SYp8TqfjpG_zCBJOZWuVFFCr3E0WKN_blg8,455
+parsl/tests/test_error_handling/test_htex_manager_failure.py,sha256=5YsCS1z7wOfUcFCD7uzR7t_rD3x5toZnoaCKVrHaMb0,1152
 parsl/tests/test_error_handling/test_htex_missing_worker.py,sha256=Tux0Xla719eup7RdWj8LmxNH-CTscMN0NM4CPuPP1ng,967
 parsl/tests/test_error_handling/test_htex_worker_failure.py,sha256=KO3ZegC8C6tY62XI1-uiS4w4gEYoRZZOEUnALEdBU8c,594
 parsl/tests/test_error_handling/test_python_walltime.py,sha256=rdmGZHIkuann2Njt3i62odKJ0FaODGr7-L96rOXNVYg,950

@@ -350,9 +351,9 @@ parsl/tests/test_providers/test_local_provider.py,sha256=G6Fuko22SvAtD7xhfQv8k_8
 parsl/tests/test_python_apps/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 parsl/tests/test_python_apps/test_arg_input_types.py,sha256=JXpfHiu8lr9BN6u1OzqFvGwBhxzsGTPMewHx6Wdo-HI,670
 parsl/tests/test_python_apps/test_basic.py,sha256=lFqh4ugePbp_FRiHGUXxzV34iS7l8C5UkxTHuLcpnYs,855
-parsl/tests/test_python_apps/test_dep_standard_futures.py,sha256=
+parsl/tests/test_python_apps/test_dep_standard_futures.py,sha256=BloeaYBci0jS5al2d8Eqe3OfZ1tvolA5ZflOBQPR9Wo,859
 parsl/tests/test_python_apps/test_dependencies.py,sha256=IRiTI_lPoWBSFSFnaBlE6Bv08PKEaf-qj5dfqO2RjT0,272
-parsl/tests/test_python_apps/test_depfail_propagation.py,sha256=
+parsl/tests/test_python_apps/test_depfail_propagation.py,sha256=3q3HlVWrOixFtXWBvR_ypKtbdAHAJcKndXQ5drwrBQU,1488
 parsl/tests/test_python_apps/test_fail.py,sha256=0Gld8LS6NB0Io1bU82vVR73twkuL5nW0ifKbIUcsJcw,1671
 parsl/tests/test_python_apps/test_fibonacci_iterative.py,sha256=ly2s5HuB9R53Z2FM_zy0WWdOk01iVhgcwSpQyK6ErIY,573
 parsl/tests/test_python_apps/test_fibonacci_recursive.py,sha256=q7LMFcu_pJSNPdz8iY0UiRoIweEWIBGwMjQffHWAuDc,592

@@ -412,12 +413,12 @@ parsl/tests/test_threads/test_configs.py,sha256=QA9YjIMAtZ2jmkfOWqBzEfzQQcFVCDiz
 parsl/tests/test_threads/test_lazy_errors.py,sha256=nGhYfCMHFZYSy6YJ4gnAmiLl9SfYs0WVnuvj8DXQ9bw,560
 parsl/usage_tracking/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 parsl/usage_tracking/usage.py,sha256=TEuAIm_U_G2ojZxvd0bbVa6gZlU61_mVRa2yJC9mGiI,7555
-parsl-2023.12.
-parsl-2023.12.
-parsl-2023.12.
-parsl-2023.12.
-parsl-2023.12.
-parsl-2023.12.
-parsl-2023.12.
-parsl-2023.12.
-parsl-2023.12.
+parsl-2023.12.18.data/scripts/exec_parsl_function.py,sha256=NtWNeBvRqksej38eRPw8zPBJ1CeW6vgaitve0tfz_qc,7801
+parsl-2023.12.18.data/scripts/parsl_coprocess.py,sha256=kzX_1RI3V2KMKs6L-il4I1qkLNVodDKFXN_1FHB9fmM,6031
+parsl-2023.12.18.data/scripts/process_worker_pool.py,sha256=ytz3F8ZYeBr8tFqSRv2O9eZGdsID7oZRulBmmQmZaV8,34056
+parsl-2023.12.18.dist-info/LICENSE,sha256=tAkwu8-AdEyGxGoSvJ2gVmQdcicWw3j1ZZueVV74M-E,11357
+parsl-2023.12.18.dist-info/METADATA,sha256=hmDBTXvoQYLYqsFRJi8HWQYLk9bwf7MCNEtljOy_tOY,3818
+parsl-2023.12.18.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
+parsl-2023.12.18.dist-info/entry_points.txt,sha256=XqnsWDYoEcLbsMcpnYGKLEnSBmaIe1YoM5YsBdJG2tI,176
+parsl-2023.12.18.dist-info/top_level.txt,sha256=PIheYoUFQtF2icLsgOykgU-Cjuwr2Oi6On2jo5RYgRM,6
+parsl-2023.12.18.dist-info/RECORD,,
```
The six remaining files listed above with +0 −0 (exec_parsl_function.py, parsl_coprocess.py, LICENSE, WHEEL, entry_points.txt, top_level.txt) are unchanged apart from the version rename.