parsl 2025.6.16__py3-none-any.whl → 2025.6.30__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- parsl/configs/osg.py +1 -1
- parsl/dataflow/dflow.py +14 -4
- parsl/executors/base.py +19 -9
- parsl/executors/flux/executor.py +2 -0
- parsl/executors/globus_compute.py +2 -0
- parsl/executors/high_throughput/executor.py +22 -15
- parsl/executors/high_throughput/interchange.py +173 -191
- parsl/executors/high_throughput/mpi_executor.py +14 -4
- parsl/executors/high_throughput/probe.py +4 -4
- parsl/executors/high_throughput/process_worker_pool.py +88 -94
- parsl/executors/radical/executor.py +3 -0
- parsl/executors/taskvine/executor.py +11 -3
- parsl/executors/taskvine/manager.py +3 -1
- parsl/executors/threads.py +19 -3
- parsl/executors/workqueue/executor.py +11 -3
- parsl/monitoring/errors.py +4 -4
- parsl/monitoring/monitoring.py +26 -88
- parsl/monitoring/radios/base.py +63 -2
- parsl/monitoring/radios/filesystem.py +19 -4
- parsl/monitoring/radios/filesystem_router.py +22 -3
- parsl/monitoring/radios/htex.py +22 -13
- parsl/monitoring/radios/multiprocessing.py +22 -2
- parsl/monitoring/radios/udp.py +57 -19
- parsl/monitoring/radios/udp_router.py +119 -25
- parsl/monitoring/radios/zmq_router.py +9 -10
- parsl/monitoring/remote.py +19 -40
- parsl/providers/local/local.py +12 -13
- parsl/tests/configs/htex_local_alternate.py +0 -1
- parsl/tests/conftest.py +7 -4
- parsl/tests/test_htex/test_interchange_exit_bad_registration.py +5 -7
- parsl/tests/test_htex/test_zmq_binding.py +5 -6
- parsl/tests/test_monitoring/test_basic.py +12 -10
- parsl/tests/test_monitoring/{test_fuzz_zmq.py → test_htex_fuzz_zmq.py} +7 -2
- parsl/tests/test_monitoring/test_htex_init_blocks_vs_monitoring.py +0 -1
- parsl/tests/test_monitoring/test_radio_filesystem.py +48 -0
- parsl/tests/test_monitoring/test_radio_multiprocessing.py +44 -0
- parsl/tests/test_monitoring/test_radio_udp.py +204 -0
- parsl/tests/test_monitoring/test_stdouterr.py +1 -3
- parsl/tests/test_scaling/test_worker_interchange_bad_messages_3262.py +3 -7
- parsl/tests/test_shutdown/test_kill_monitoring.py +1 -1
- parsl/version.py +1 -1
- {parsl-2025.6.16.data → parsl-2025.6.30.data}/scripts/interchange.py +173 -191
- {parsl-2025.6.16.data → parsl-2025.6.30.data}/scripts/process_worker_pool.py +88 -94
- {parsl-2025.6.16.dist-info → parsl-2025.6.30.dist-info}/METADATA +2 -2
- {parsl-2025.6.16.dist-info → parsl-2025.6.30.dist-info}/RECORD +51 -50
- parsl/tests/configs/local_threads_monitoring.py +0 -10
- parsl/tests/manual_tests/test_udp_simple.py +0 -51
- {parsl-2025.6.16.data → parsl-2025.6.30.data}/scripts/exec_parsl_function.py +0 -0
- {parsl-2025.6.16.data → parsl-2025.6.30.data}/scripts/parsl_coprocess.py +0 -0
- {parsl-2025.6.16.dist-info → parsl-2025.6.30.dist-info}/LICENSE +0 -0
- {parsl-2025.6.16.dist-info → parsl-2025.6.30.dist-info}/WHEEL +0 -0
- {parsl-2025.6.16.dist-info → parsl-2025.6.30.dist-info}/entry_points.txt +0 -0
- {parsl-2025.6.16.dist-info → parsl-2025.6.30.dist-info}/top_level.txt +0 -0
parsl/executors/high_throughput/process_worker_pool.py
CHANGED

```diff
@@ -1,7 +1,6 @@
 #!/usr/bin/env python3
 
 import argparse
-import json
 import logging
 import math
 import multiprocessing
@@ -66,8 +65,7 @@ class Manager:
     def __init__(self, *,
                  addresses,
                  address_probe_timeout,
-                 task_port,
-                 result_port,
+                 port,
                  cores_per_worker,
                  mem_per_worker,
                  max_workers_per_node,
@@ -157,15 +155,13 @@ class Manager:
         self._start_time = time.time()
 
         try:
-            ix_address = probe_addresses(addresses.split(','), task_port, timeout=address_probe_timeout)
+            ix_address = probe_addresses(addresses.split(','), port, timeout=address_probe_timeout)
             if not ix_address:
                 raise Exception("No viable address found")
             else:
-                logger.info("Connection to Interchange successful on {}".format(ix_address))
-                task_q_url = tcp_url(ix_address, task_port)
-                result_q_url = tcp_url(ix_address, result_port)
-                logger.info("Task url : {}".format(task_q_url))
-                logger.info("Result url : {}".format(result_q_url))
+                logger.info(f"Connection to Interchange successful on {ix_address}")
+                ix_url = tcp_url(ix_address, port)
+                logger.info(f"Interchange url: {ix_url}")
         except Exception:
             logger.exception("Caught exception while trying to determine viable address to interchange")
             print("Failed to find a viable address to connect to interchange. Exiting")
```
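The manager now probes a single interchange port and derives one URL from it, where it previously probed the task port and built separate task and result URLs. As a rough sketch of what address probing looks like over plain TCP — names (`first_reachable`, `candidates`) are hypothetical; parsl's real `probe_addresses` lives in parsl/executors/high_throughput/probe.py, which this release also touches:

```python
import socket

def first_reachable(candidates, port, timeout=2.0):
    """Hypothetical stand-in for probe_addresses(): return the first
    address that accepts a TCP connection on `port`, else None."""
    for addr in candidates:
        try:
            # With one interchange port, a single successful probe is
            # enough to build the one URL the manager needs.
            with socket.create_connection((addr, port), timeout=timeout):
                return addr
        except OSError:
            continue
    return None

# Usage mirroring the worker pool's comma-separated address list:
# first_reachable("10.0.0.5,127.0.0.1".split(','), 54321)
```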
```diff
@@ -174,8 +170,7 @@ class Manager:
         self.cert_dir = cert_dir
         self.zmq_context = curvezmq.ClientContext(self.cert_dir)
 
-        self._task_q_url = task_q_url
-        self._result_q_url = result_q_url
+        self._ix_url = ix_url
 
         self.uid = uid
         self.block_id = block_id
@@ -250,37 +245,37 @@ class Manager:
             self.worker_count = min(len(self.available_accelerators), self.worker_count)
         logger.info("Manager will spawn {} workers".format(self.worker_count))
 
-    def create_reg_message(self):
+    def create_reg_message(self) -> dict:
         """ Creates a registration message to identify the worker to the interchange
         """
-        msg = {'type': 'registration',
-               'parsl_v': PARSL_VERSION,
-               'python_v': "{}.{}.{}".format(sys.version_info.major,
-                                             sys.version_info.minor,
-                                             sys.version_info.micro),
-               'packages': {d.metadata['Name']: d.version for d in distributions()},
-               'worker_count': self.worker_count,
-               'uid': self.uid,
-               'block_id': self.block_id,
-               'start_time': self.start_time,
-               'prefetch_capacity': self.prefetch_capacity,
-               'max_capacity': self.worker_count + self.prefetch_capacity,
-               'os': platform.system(),
-               'hostname': platform.node(),
-               'dir': os.getcwd(),
-               'cpu_count': psutil.cpu_count(logical=False),
-               'total_memory': psutil.virtual_memory().total,
-               }
-        b_msg = json.dumps(msg).encode('utf-8')
-        return b_msg
+        return {
+            'type': 'registration',
+            'parsl_v': PARSL_VERSION,
+            'python_v': "{}.{}.{}".format(
+                sys.version_info.major,
+                sys.version_info.minor,
+                sys.version_info.micro
+            ),
+            'packages': {d.metadata['Name']: d.version for d in distributions()},
+            'worker_count': self.worker_count,
+            'uid': self.uid,
+            'block_id': self.block_id,
+            'start_time': self.start_time,
+            'prefetch_capacity': self.prefetch_capacity,
+            'max_capacity': self.worker_count + self.prefetch_capacity,
+            'os': platform.system(),
+            'hostname': platform.node(),
+            'dir': os.getcwd(),
+            'cpu_count': psutil.cpu_count(logical=False),
+            'total_memory': psutil.virtual_memory().total,
+        }
 
     @staticmethod
     def heartbeat_to_incoming(task_incoming: zmq.Socket) -> None:
         """ Send heartbeat to the incoming task queue
         """
-        msg = {'type': 'heartbeat'}
         # don't need to dumps and encode this every time - could do as a global on import?
-        b_msg = json.dumps(msg).encode('utf-8')
+        b_msg = pickle.dumps({'type': 'heartbeat'})
         task_incoming.send(b_msg)
         logger.debug("Sent heartbeat")
 
```
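`create_reg_message` now returns a plain dict, and serialization moves to the send site, where the message is pickled like every other manager-to-interchange message; the removed `import json` at the top of the file is the other half of this change. A small sketch of the round trip, using a trimmed-down dict with illustrative fields only:

```python
import pickle
import platform
import sys

# Sender side: a registration-style message as a plain dict.
msg = {
    'type': 'registration',
    'python_v': "{}.{}.{}".format(*sys.version_info[:3]),
    'hostname': platform.node(),
}
wire = pickle.dumps(msg)  # one serializer for every message type

# Receiver side: pickle restores native Python types directly,
# so numeric fields need no post-decode conversion as JSON might.
assert pickle.loads(wire) == msg
```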
```diff
@@ -289,32 +284,38 @@ class Manager:
         """ Send heartbeat to the incoming task queue
         """
         msg = {'type': 'drain'}
-        b_msg = json.dumps(msg).encode('utf-8')
+        b_msg = pickle.dumps(msg)
         task_incoming.send(b_msg)
         logger.debug("Sent drain")
 
     @wrap_with_logs
-    def pull_tasks(self):
+    def interchange_communicator(self, pair_setup: threading.Event):
         """ Pull tasks from the incoming tasks zmq pipe onto the internal
         pending task queue
         """
         logger.info("starting")
 
+        results_sock = self.zmq_context.socket(zmq.PAIR)
+        results_sock.setsockopt(zmq.LINGER, 0)
+        results_sock.bind("inproc://results")
+        pair_setup.set()
+
         # Linger is set to 0, so that the manager can exit even when there might be
         # messages in the pipe
-        task_incoming = self.zmq_context.socket(zmq.DEALER)
-        task_incoming.setsockopt(zmq.IDENTITY, self.uid.encode('utf-8'))
-        task_incoming.setsockopt(zmq.LINGER, 0)
-        task_incoming.connect(self._task_q_url)
+        ix_sock = self.zmq_context.socket(zmq.DEALER)
+        ix_sock.setsockopt(zmq.IDENTITY, self.uid.encode('utf-8'))
+        ix_sock.setsockopt(zmq.LINGER, 0)
+        ix_sock.connect(self._ix_url)
         logger.info("Manager task pipe connected to interchange")
 
         poller = zmq.Poller()
-        poller.register(task_incoming, zmq.POLLIN)
+        poller.register(results_sock, zmq.POLLIN)
+        poller.register(ix_sock, zmq.POLLIN)
 
         # Send a registration message
         msg = self.create_reg_message()
-        logger.debug("Sending registration message: {}".format(msg))
-        task_incoming.send(msg)
+        logger.debug("Sending registration message: %s", msg)
+        ix_sock.send(pickle.dumps(msg))
         last_beat = time.time()
         last_interchange_contact = time.time()
         task_recv_counter = 0
```
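The new `interchange_communicator` thread owns two sockets: an inproc PAIR socket that receives results from the `ferry_result` thread, and a single DEALER socket to the interchange. The `pair_setup` event exists because an inproc endpoint should be bound before a peer connects (a hard requirement on older libzmq). A self-contained sketch of that sequencing pattern, with hypothetical names:

```python
import threading
import zmq

ctx = zmq.Context()
bound = threading.Event()

def owner():
    # The thread that owns the endpoint binds first, then signals.
    sock = ctx.socket(zmq.PAIR)
    sock.bind("inproc://results")
    bound.set()
    assert sock.recv() == b"payload"
    sock.close()

def sender():
    sock = ctx.socket(zmq.PAIR)
    bound.wait()  # connecting before the bind would fail on old libzmq
    sock.connect("inproc://results")
    sock.send(b"payload")
    sock.close()

threads = [threading.Thread(target=owner), threading.Thread(target=sender)]
for t in threads:
    t.start()
for t in threads:
    t.join()
ctx.term()
```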
```diff
@@ -335,18 +336,21 @@ class Manager:
                 pending_task_count = self.pending_task_queue.qsize()
             except NotImplementedError:
                 # Ref: https://github.com/python/cpython/blob/6d5e0dc0e330f4009e8dc3d1642e46b129788877/Lib/multiprocessing/queues.py#L125
-                pending_task_count = f"pending task count is not available on {platform.system()}"
+                pending_task_count = f"pending task count is not available on {platform.system()}"  # type: ignore[assignment]
 
-            logger.debug('ready workers: %d, pending tasks: %d',
-                         self.ready_worker_count.value, pending_task_count)
+            logger.debug(
+                'ready workers: %d, pending tasks: %d',
+                self.ready_worker_count.value,  # type: ignore[attr-defined]
+                pending_task_count,
+            )
 
             if time.time() >= last_beat + self.heartbeat_period:
-                self.heartbeat_to_incoming(task_incoming)
+                self.heartbeat_to_incoming(ix_sock)
                 last_beat = time.time()
 
             if time.time() > self.drain_time:
                 logger.info("Requesting drain")
-                self.drain_to_incoming(task_incoming)
+                self.drain_to_incoming(ix_sock)
                 # This will start the pool draining...
                 # Drained exit behaviour does not happen here. It will be
                 # driven by the interchange sending a DRAINED_CODE message.
@@ -358,8 +362,8 @@ class Manager:
             poll_duration_s = max(0, next_interesting_event_time - time.time())
             socks = dict(poller.poll(timeout=poll_duration_s * 1000))
 
-            if socks.get(task_incoming) == zmq.POLLIN:
-                pkl_msg = task_incoming.recv()
+            if socks.get(ix_sock) == zmq.POLLIN:
+                pkl_msg = ix_sock.recv()
                 tasks = pickle.loads(pkl_msg)
                 last_interchange_contact = time.time()
 
@@ -377,6 +381,11 @@ class Manager:
                 for task in tasks:
                     self.task_scheduler.put_task(task)
 
+            elif socks.get(results_sock) == zmq.POLLIN:
+                meta_b = pickle.dumps({'type': 'result'})
+                ix_sock.send_multipart([meta_b, results_sock.recv()])
+                logger.debug("Result sent to interchange")
+
             else:
                 logger.debug("No incoming tasks")
 
```
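Results now reach the interchange as a two-frame multipart message: a small pickled metadata frame followed by the opaque result bytes, so the receiver can dispatch on frame 0 without deserializing the payload. A sketch of the framing, with a hypothetical payload body:

```python
import pickle
import zmq

ctx = zmq.Context()
rx = ctx.socket(zmq.PAIR)
tx = ctx.socket(zmq.PAIR)
rx.bind("inproc://demo")
tx.connect("inproc://demo")

# Frame 0: routing metadata. Frame 1: opaque result payload.
meta = pickle.dumps({'type': 'result'})
payload = pickle.dumps({'task_id': 7, 'value': 42})  # hypothetical body
tx.send_multipart([meta, payload])

frames = rx.recv_multipart()
assert pickle.loads(frames[0])['type'] == 'result'  # dispatch on frame 0

tx.close()
rx.close()
ctx.term()
```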
```diff
@@ -387,45 +396,36 @@ class Manager:
                 logger.critical("Exiting")
                 break
 
-        task_incoming.close()
+        ix_sock.close()
         logger.info("Exiting")
 
     @wrap_with_logs
-    def push_results(self):
-        """ Listens on the pending_result_queue and sends out results via zmq
+    def ferry_result(self, may_connect: threading.Event):
+        """ Listens on the pending_result_queue and ferries results to the interchange
+        connected thread
         """
-        logger.debug("Starting result push thread")
+        logger.debug("Begin")
 
         # Linger is set to 0, so that the manager can exit even when there might be
         # messages in the pipe
-        result_outgoing = self.zmq_context.socket(zmq.DEALER)
-        result_outgoing.setsockopt(zmq.IDENTITY, self.uid.encode('utf-8'))
-        result_outgoing.setsockopt(zmq.LINGER, 0)
-        result_outgoing.connect(self._result_q_url)
-        logger.info("Manager result pipe connected to interchange")
+        notify_sock = self.zmq_context.socket(zmq.PAIR)
+        notify_sock.setsockopt(zmq.LINGER, 0)
+        may_connect.wait()
+        notify_sock.connect("inproc://results")
 
         while not self._stop_event.is_set():
-            logger.debug("Starting pending_result_queue get")
             try:
                 r = self.task_scheduler.get_result()
                 if r is None:
                     continue
-                logger.debug("Result received from worker: %s", id(r))
-                result_outgoing.send(r)
-                logger.debug("Result sent to interchange: %s", id(r))
+                logger.debug("Result received from worker")
+                notify_sock.send(r)
             except Exception:
                 logger.exception("Failed to send result to interchange")
 
-        result_outgoing.close()
+        notify_sock.close()
         logger.debug("Exiting")
 
-    @wrap_with_logs
-    def heartbeater(self):
-        while not self._stop_event.wait(self.heartbeat_period):
-            heartbeat_message = f"heartbeat_period={self.heartbeat_period} seconds"
-            logger.info(f"Sending heartbeat via results connection: {heartbeat_message}")
-            self.pending_result_queue.put(pickle.dumps({'type': 'heartbeat'}))
-
     def worker_watchdog(self, procs: dict[int, SpawnProcess]):
         """Keeps workers alive."""
         logger.debug("Starting worker watchdog")
```
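`ferry_result` blocks in `task_scheduler.get_result()`; shutdown wakes it by putting a `None` sentinel on `pending_result_queue`, which the loop skips with `continue` so the `while` condition can re-check `_stop_event`. The same sentinel pattern in isolation, with hypothetical names:

```python
import queue
import threading

stop = threading.Event()
results: queue.Queue = queue.Queue()

def ferry():
    while not stop.is_set():
        r = results.get()  # blocks until a result or a sentinel arrives
        if r is None:
            continue       # sentinel: loop around and re-check the flag
        print("forwarding", r)

t = threading.Thread(target=ferry, name="Result-Shovel")
t.start()

results.put(b"result-bytes")
stop.set()         # order matters: set the flag first,
results.put(None)  # then wake the blocked get()
t.join()
```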
```diff
@@ -492,23 +492,26 @@ class Manager:
 
         logger.debug("Workers started")
 
-        thr_task_puller = threading.Thread(target=self.pull_tasks, name="Task-Puller")
-        thr_result_pusher = threading.Thread(
-            target=self.push_results, name="Result-Pusher"
+        pair_setup = threading.Event()
+
+        thr_task_puller = threading.Thread(
+            target=self.interchange_communicator,
+            args=(pair_setup,),
+            name="Interchange-Communicator",
         )
+        thr_result_ferry = threading.Thread(
+            target=self.ferry_result, args=(pair_setup,), name="Result-Shovel")
         thr_worker_watchdog = threading.Thread(
             target=self.worker_watchdog, args=(procs,), name="worker-watchdog"
         )
         thr_monitoring_handler = threading.Thread(
             target=self.handle_monitoring_messages, name="Monitoring-Handler"
         )
-        thr_heartbeater = threading.Thread(target=self.heartbeater, name="Heartbeater")
 
         thr_task_puller.start()
-        thr_result_pusher.start()
+        thr_result_ferry.start()
         thr_worker_watchdog.start()
         thr_monitoring_handler.start()
-        thr_heartbeater.start()
 
         logger.info("Manager threads started")
 
@@ -520,11 +523,10 @@ class Manager:
         self.monitoring_queue.put(None)
         self.pending_result_queue.put(None)
 
-        thr_heartbeater.join()
-        thr_task_puller.join()
-        thr_result_pusher.join()
-        thr_worker_watchdog.join()
         thr_monitoring_handler.join()
+        thr_worker_watchdog.join()
+        thr_result_ferry.join()
+        thr_task_puller.join()
 
         for worker_id in procs:
             p = procs[worker_id]
@@ -862,10 +864,10 @@ def get_arg_parser() -> argparse.ArgumentParser:
         help="GB of memory assigned to each worker process. Default=0, no assignment",
     )
     parser.add_argument(
-        "-t",
-        "--task_port",
+        "-P",
+        "--port",
         required=True,
-        help="Task port for receiving tasks from the interchange",
+        help="Port for communication with the interchange",
     )
     parser.add_argument(
         "--max_workers_per_node",
@@ -901,12 +903,6 @@ def get_arg_parser() -> argparse.ArgumentParser:
     parser.add_argument(
         "--poll", default=10, help="Poll period used in milliseconds"
    )
-    parser.add_argument(
-        "-r",
-        "--result_port",
-        required=True,
-        help="Result port for posting results to the interchange",
-    )
     parser.add_argument(
         "--cpu-affinity",
         type=strategyorlist,
@@ -953,8 +949,7 @@ if __name__ == "__main__":
         f"\n Block ID: {args.block_id}"
         f"\n cores_per_worker: {args.cores_per_worker}"
         f"\n mem_per_worker: {args.mem_per_worker}"
-        f"\n task_port: {args.task_port}"
-        f"\n result_port: {args.result_port}"
+        f"\n Interchange port: {args.port}"
         f"\n addresses: {args.addresses}"
         f"\n max_workers_per_node: {args.max_workers_per_node}"
         f"\n poll_period: {args.poll}"
@@ -969,8 +964,7 @@ if __name__ == "__main__":
         f"\n mpi_launcher: {args.mpi_launcher}"
     )
     try:
-        manager = Manager(task_port=args.task_port,
-                          result_port=args.result_port,
+        manager = Manager(port=args.port,
                           addresses=args.addresses,
                           address_probe_timeout=int(args.address_probe_timeout),
                           uid=args.uid,
```
parsl/executors/taskvine/executor.py
CHANGED

```diff
@@ -40,6 +40,8 @@ from parsl.executors.taskvine.factory_config import TaskVineFactoryConfig
 from parsl.executors.taskvine.manager import _taskvine_submit_wait
 from parsl.executors.taskvine.manager_config import TaskVineManagerConfig
 from parsl.executors.taskvine.utils import ParslFileToVine, ParslTaskToVine
+from parsl.monitoring.radios.base import RadioConfig
+from parsl.monitoring.radios.filesystem import FilesystemRadio
 from parsl.multiprocessing import SpawnContext
 from parsl.process_loggers import wrap_with_logs
 from parsl.providers import CondorProvider, LocalProvider
@@ -98,8 +100,6 @@ class TaskVineExecutor(BlockProviderExecutor, putils.RepresentationMixin):
             Default is None.
     """
 
-    radio_mode = "filesystem"
-
     @typeguard.typechecked
     def __init__(self,
                  label: str = "TaskVineExecutor",
@@ -108,7 +108,8 @@ class TaskVineExecutor(BlockProviderExecutor, putils.RepresentationMixin):
                  manager_config: TaskVineManagerConfig = TaskVineManagerConfig(),
                  factory_config: TaskVineFactoryConfig = TaskVineFactoryConfig(),
                  provider: Optional[ExecutionProvider] = LocalProvider(init_blocks=1),
-                 storage_access: Optional[List[Staging]] = None):
+                 storage_access: Optional[List[Staging]] = None,
+                 remote_monitoring_radio: Optional[RadioConfig] = None):
 
         # Set worker launch option for this executor
         if worker_launch_method == 'factory' or worker_launch_method == 'manual':
@@ -134,6 +135,11 @@ class TaskVineExecutor(BlockProviderExecutor, putils.RepresentationMixin):
         self.factory_config = factory_config
         self.storage_access = storage_access
 
+        if remote_monitoring_radio is not None:
+            self.remote_monitoring_radio = remote_monitoring_radio
+        else:
+            self.remote_monitoring_radio = FilesystemRadio()
+
         # Queue to send ready tasks from TaskVine executor process to TaskVine manager process
         self._ready_task_queue: multiprocessing.Queue = SpawnContext.Queue()
 
@@ -601,6 +607,8 @@ class TaskVineExecutor(BlockProviderExecutor, putils.RepresentationMixin):
         self._finished_task_queue.close()
         self._finished_task_queue.join_thread()
 
+        super().shutdown()
+
         logger.debug("TaskVine shutdown completed")
 
     @wrap_with_logs
```
parsl/executors/taskvine/manager.py
CHANGED

```diff
@@ -6,6 +6,7 @@ import shutil
 import subprocess
 import uuid
 
+import parsl
 from parsl.executors.taskvine import exec_parsl_function
 from parsl.executors.taskvine.utils import VineTaskToParsl, run_parsl_function
 from parsl.process_loggers import wrap_with_logs
@@ -255,7 +256,8 @@ def _taskvine_submit_wait(ready_task_queue=None,
                 run_parsl_function,
                 poncho_env=poncho_env_path,
                 init_command=manager_config.init_command,
-                add_env=add_env)
+                add_env=add_env,
+                hoisting_modules=[parsl.serialize, run_parsl_function])
 
         # Configure the library if provided
         if manager_config.library_config:
```
parsl/executors/threads.py
CHANGED

```diff
@@ -7,6 +7,8 @@ import typeguard
 from parsl.data_provider.staging import Staging
 from parsl.executors.base import ParslExecutor
 from parsl.executors.errors import InvalidResourceSpecification
+from parsl.monitoring.radios.base import RadioConfig
+from parsl.monitoring.radios.multiprocessing import MultiprocessingQueueRadio
 from parsl.utils import RepresentationMixin
 
 logger = logging.getLogger(__name__)
@@ -28,7 +30,7 @@ class ThreadPoolExecutor(ParslExecutor, RepresentationMixin):
     @typeguard.typechecked
     def __init__(self, label: str = 'threads', max_threads: Optional[int] = 2,
                  thread_name_prefix: str = '', storage_access: Optional[List[Staging]] = None,
-                 working_dir: Optional[str] = None):
+                 working_dir: Optional[str] = None, remote_monitoring_radio: Optional[RadioConfig] = None):
         ParslExecutor.__init__(self)
         self.label = label
         self.max_threads = max_threads
@@ -40,6 +42,11 @@ class ThreadPoolExecutor(ParslExecutor, RepresentationMixin):
         self.storage_access = storage_access
         self.working_dir = working_dir
 
+        if remote_monitoring_radio is not None:
+            self.remote_monitoring_radio = remote_monitoring_radio
+        else:
+            self.remote_monitoring_radio = MultiprocessingQueueRadio()
+
     def start(self):
         self.executor = cf.ThreadPoolExecutor(max_workers=self.max_threads,
                                               thread_name_prefix=self.thread_name_prefix)
@@ -73,9 +80,18 @@ class ThreadPoolExecutor(ParslExecutor, RepresentationMixin):
         """
         logger.debug("Shutting down executor, which involves waiting for running tasks to complete")
         self.executor.shutdown(wait=block)
+
+        super().shutdown()
+
         logger.debug("Done with executor shutdown")
 
     def monitor_resources(self):
-        """Resource monitoring sometimes deadlocks when using threads, so
-        this function returns false to disable it."""
+        """Resource monitoring does not make sense when using the
+        ThreadPoolExecutor, as there is no per-task process tree: all tasks
+        run inside the same single submitting process.
+
+        In addition, the use of fork-based multiprocessing in the remote
+        wrapper in parsl/monitoring/remote.py was especially prone to deadlock
+        with this executor.
+        """
         return False
```
parsl/executors/workqueue/executor.py
CHANGED

```diff
@@ -31,6 +31,8 @@ from parsl.errors import OptionalModuleMissing
 from parsl.executors.errors import ExecutorError, InvalidResourceSpecification
 from parsl.executors.status_handling import BlockProviderExecutor
 from parsl.executors.workqueue import exec_parsl_function
+from parsl.monitoring.radios.base import RadioConfig
+from parsl.monitoring.radios.filesystem import FilesystemRadio
 from parsl.multiprocessing import SpawnContext, SpawnProcess
 from parsl.process_loggers import wrap_with_logs
 from parsl.providers import CondorProvider, LocalProvider
@@ -227,8 +229,6 @@ class WorkQueueExecutor(BlockProviderExecutor, putils.RepresentationMixin):
             specifiation for each task).
     """
 
-    radio_mode = "filesystem"
-
     @typeguard.typechecked
     def __init__(self,
                  label: str = "WorkQueueExecutor",
@@ -255,7 +255,8 @@ class WorkQueueExecutor(BlockProviderExecutor, putils.RepresentationMixin):
                  worker_executable: str = 'work_queue_worker',
                  function_dir: Optional[str] = None,
                  coprocess: bool = False,
-                 scaling_cores_per_worker: int = 1):
+                 scaling_cores_per_worker: int = 1,
+                 remote_monitoring_radio: Optional[RadioConfig] = None):
         BlockProviderExecutor.__init__(self, provider=provider,
                                        block_error_handler=True)
         if not _work_queue_enabled:
@@ -308,6 +309,11 @@ class WorkQueueExecutor(BlockProviderExecutor, putils.RepresentationMixin):
         if self.init_command != "":
             self.launch_cmd = self.init_command + "; " + self.launch_cmd
 
+        if remote_monitoring_radio is not None:
+            self.remote_monitoring_radio = remote_monitoring_radio
+        else:
+            self.remote_monitoring_radio = FilesystemRadio()
+
     def _get_launch_command(self, block_id):
         # this executor uses different terminology for worker/launch
         # commands than in htex
@@ -714,6 +720,8 @@ class WorkQueueExecutor(BlockProviderExecutor, putils.RepresentationMixin):
         self.collector_queue.close()
         self.collector_queue.join_thread()
 
+        super().shutdown()
+
         logger.debug("Work Queue shutdown completed")
 
     @wrap_with_logs
```
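The TaskVine, WorkQueue and ThreadPool executors all gain a `super().shutdown()` call in their shutdown paths, so base-class teardown runs after executor-specific cleanup — presumably tied to the `RadioConfig` changes in parsl/executors/base.py in this same release. The cooperative pattern, sketched with hypothetical classes:

```python
class BaseExecutor:
    def shutdown(self) -> None:
        # Base-class responsibilities, e.g. stopping monitoring plumbing.
        print("base teardown")

class QueueExecutor(BaseExecutor):
    def shutdown(self) -> None:
        print("drain queues")  # executor-specific cleanup first...
        super().shutdown()     # ...then the base class finishes the job

QueueExecutor().shutdown()  # prints "drain queues", then "base teardown"
```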
parsl/monitoring/errors.py
CHANGED
```diff
@@ -1,11 +1,11 @@
 from parsl.errors import ParslError
 
 
-class MonitoringHubStartError(ParslError):
+class MonitoringRouterStartError(ParslError):
     def __str__(self) -> str:
-        return "Hub failed to start"
+        return "Monitoring router failed to start"
 
 
-class MonitoringRouterStartError(ParslError):
+class RadioRequiredError(ParslError):
     def __str__(self) -> str:
-        return "Monitoring router failed to start"
+        return "A radio must be configured for remote task monitoring"
```