parsl 2025.6.23__py3-none-any.whl → 2025.7.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- parsl/configs/anvil.py +34 -0
- parsl/configs/delta.py +35 -0
- parsl/configs/osg.py +1 -1
- parsl/dataflow/dflow.py +14 -4
- parsl/executors/base.py +14 -6
- parsl/executors/high_throughput/executor.py +20 -15
- parsl/executors/high_throughput/interchange.py +177 -191
- parsl/executors/high_throughput/mpi_executor.py +7 -4
- parsl/executors/high_throughput/probe.py +61 -49
- parsl/executors/high_throughput/process_worker_pool.py +105 -103
- parsl/executors/taskvine/executor.py +9 -3
- parsl/executors/taskvine/manager.py +3 -1
- parsl/executors/threads.py +8 -1
- parsl/executors/workqueue/executor.py +9 -3
- parsl/monitoring/errors.py +5 -0
- parsl/monitoring/monitoring.py +25 -42
- parsl/monitoring/radios/base.py +63 -2
- parsl/monitoring/radios/filesystem.py +18 -3
- parsl/monitoring/radios/filesystem_router.py +13 -26
- parsl/monitoring/radios/htex.py +22 -13
- parsl/monitoring/radios/multiprocessing.py +22 -2
- parsl/monitoring/radios/udp.py +57 -19
- parsl/monitoring/radios/udp_router.py +49 -15
- parsl/monitoring/remote.py +19 -40
- parsl/providers/local/local.py +12 -13
- parsl/tests/configs/htex_local_alternate.py +0 -1
- parsl/tests/test_htex/test_interchange_exit_bad_registration.py +5 -7
- parsl/tests/test_htex/test_zmq_binding.py +5 -6
- parsl/tests/test_monitoring/test_basic.py +12 -10
- parsl/tests/test_monitoring/test_htex_init_blocks_vs_monitoring.py +0 -1
- parsl/tests/test_monitoring/test_radio_filesystem.py +7 -9
- parsl/tests/test_monitoring/test_radio_multiprocessing.py +44 -0
- parsl/tests/test_monitoring/test_radio_udp.py +163 -12
- parsl/tests/test_monitoring/test_stdouterr.py +1 -3
- parsl/tests/test_scaling/test_worker_interchange_bad_messages_3262.py +3 -7
- parsl/version.py +1 -1
- {parsl-2025.6.23.data → parsl-2025.7.7.data}/scripts/interchange.py +177 -191
- {parsl-2025.6.23.data → parsl-2025.7.7.data}/scripts/process_worker_pool.py +105 -103
- {parsl-2025.6.23.dist-info → parsl-2025.7.7.dist-info}/METADATA +2 -2
- {parsl-2025.6.23.dist-info → parsl-2025.7.7.dist-info}/RECORD +46 -43
- {parsl-2025.6.23.data → parsl-2025.7.7.data}/scripts/exec_parsl_function.py +0 -0
- {parsl-2025.6.23.data → parsl-2025.7.7.data}/scripts/parsl_coprocess.py +0 -0
- {parsl-2025.6.23.dist-info → parsl-2025.7.7.dist-info}/LICENSE +0 -0
- {parsl-2025.6.23.dist-info → parsl-2025.7.7.dist-info}/WHEEL +0 -0
- {parsl-2025.6.23.dist-info → parsl-2025.7.7.dist-info}/entry_points.txt +0 -0
- {parsl-2025.6.23.dist-info → parsl-2025.7.7.dist-info}/top_level.txt +0 -0
parsl/executors/high_throughput/process_worker_pool.py
CHANGED
@@ -1,7 +1,6 @@
 #!/usr/bin/env python3

 import argparse
-import json
 import logging
 import math
 import multiprocessing
@@ -66,8 +65,7 @@ class Manager:
     def __init__(self, *,
                  addresses,
                  address_probe_timeout,
-                 task_port,
-                 result_port,
+                 port,
                  cores_per_worker,
                  mem_per_worker,
                  max_workers_per_node,
@@ -156,26 +154,23 @@ class Manager:

         self._start_time = time.time()

-        try:
-            ix_address = probe_addresses(addresses.split(','), task_port, timeout=address_probe_timeout)
-            if not ix_address:
-                raise Exception("No viable address found")
-            else:
-                logger.info("Connection to Interchange successful on {}".format(ix_address))
-                task_q_url = tcp_url(ix_address, task_port)
-                result_q_url = tcp_url(ix_address, result_port)
-                logger.info("Task url : {}".format(task_q_url))
-                logger.info("Result url : {}".format(result_q_url))
-        except Exception:
-            logger.exception("Caught exception while trying to determine viable address to interchange")
-            print("Failed to find a viable address to connect to interchange. Exiting")
-            exit(5)
-
         self.cert_dir = cert_dir
         self.zmq_context = curvezmq.ClientContext(self.cert_dir)

-
-
+        addresses = ','.join(tcp_url(a, port) for a in addresses.split(','))
+        try:
+            self._ix_url = probe_addresses(
+                self.zmq_context,
+                addresses,
+                timeout_ms=1_000 * address_probe_timeout,
+                identity=uid.encode('utf-8'),
+            )
+        except ConnectionError:
+            addys = ", ".join(addresses.split(","))
+            logger.error(f"Unable to connect to interchange; attempted addresses: {addys}")
+            raise
+
+        logger.info(f"Probe discovered interchange url: {self._ix_url}")

         self.uid = uid
         self.block_id = block_id
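The worker-side address probe now runs through the manager's CurveZMQ client context, identifies itself with the manager UID, and takes a millisecond timeout, instead of testing a separate task port for raw reachability. The following is a minimal, self-contained sketch of that style of probing with plain pyzmq; it is not parsl's probe_addresses() implementation, and the helper name, payload, and reply handling are illustrative assumptions only.

import pickle

import zmq


def probe_candidates(ctx: zmq.Context, urls: list, timeout_ms: int, identity: bytes) -> str:
    """Return the first URL whose peer replies to a small probe message."""
    poller = zmq.Poller()
    candidates = {}
    for url in urls:
        s = ctx.socket(zmq.DEALER)
        s.setsockopt(zmq.IDENTITY, identity)   # lets a ROUTER peer route the reply back
        s.setsockopt(zmq.LINGER, 0)
        s.connect(url)
        s.send(pickle.dumps({"type": "connection_probe"}))
        poller.register(s, zmq.POLLIN)
        candidates[s] = url

    try:
        events = dict(poller.poll(timeout=timeout_ms))
        for sock, url in candidates.items():
            if events.get(sock) == zmq.POLLIN:
                sock.recv()                    # drain the reply; only reachability matters here
                return url
        raise ConnectionError("no candidate address answered: " + ", ".join(urls))
    finally:
        for sock in candidates:
            sock.close()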
@@ -250,37 +245,37 @@ class Manager:
             self.worker_count = min(len(self.available_accelerators), self.worker_count)
         logger.info("Manager will spawn {} workers".format(self.worker_count))

-    def create_reg_message(self):
+    def create_reg_message(self) -> dict:
         """ Creates a registration message to identify the worker to the interchange
         """
-        … (20 removed lines: the old registration-message body, not shown in this diff view)
+        return {
+            'type': 'registration',
+            'parsl_v': PARSL_VERSION,
+            'python_v': "{}.{}.{}".format(
+                sys.version_info.major,
+                sys.version_info.minor,
+                sys.version_info.micro
+            ),
+            'packages': {d.metadata['Name']: d.version for d in distributions()},
+            'worker_count': self.worker_count,
+            'uid': self.uid,
+            'block_id': self.block_id,
+            'start_time': self.start_time,
+            'prefetch_capacity': self.prefetch_capacity,
+            'max_capacity': self.worker_count + self.prefetch_capacity,
+            'os': platform.system(),
+            'hostname': platform.node(),
+            'dir': os.getcwd(),
+            'cpu_count': psutil.cpu_count(logical=False),
+            'total_memory': psutil.virtual_memory().total,
+        }

     @staticmethod
     def heartbeat_to_incoming(task_incoming: zmq.Socket) -> None:
         """ Send heartbeat to the incoming task queue
         """
-        msg = {'type': 'heartbeat'}
         # don't need to dumps and encode this every time - could do as a global on import?
-        b_msg = …
+        b_msg = pickle.dumps({'type': 'heartbeat'})
         task_incoming.send(b_msg)
         logger.debug("Sent heartbeat")

@@ -289,32 +284,46 @@ class Manager:
         """ Send heartbeat to the incoming task queue
         """
         msg = {'type': 'drain'}
-        b_msg = …
+        b_msg = pickle.dumps(msg)
         task_incoming.send(b_msg)
         logger.debug("Sent drain")

     @wrap_with_logs
-    def …
+    def interchange_communicator(self, pair_setup: threading.Event):
         """ Pull tasks from the incoming tasks zmq pipe onto the internal
         pending task queue
         """
         logger.info("starting")

+        results_sock = self.zmq_context.socket(zmq.PAIR)
+        results_sock.setsockopt(zmq.LINGER, 0)
+        results_sock.bind("inproc://results")
+        pair_setup.set()
+
         # Linger is set to 0, so that the manager can exit even when there might be
         # messages in the pipe
-        … (4 removed lines, content not shown in this diff view)
+        ix_sock = self.zmq_context.socket(zmq.DEALER)
+        ix_sock.setsockopt(zmq.IDENTITY, self.uid.encode('utf-8'))
+        ix_sock.setsockopt(zmq.LINGER, 0)
+        ix_sock.connect(self._ix_url)
         logger.info("Manager task pipe connected to interchange")

         poller = zmq.Poller()
-        poller.register(…
+        poller.register(results_sock, zmq.POLLIN)
+        poller.register(ix_sock, zmq.POLLIN)
+
+        ix_sock.send(pickle.dumps({"type": "connection_probe"}))
+        evts = dict(poller.poll(timeout=self.heartbeat_period))
+        if evts.get(ix_sock) is None:
+            logger.error(f"Failed to connect to interchange ({self._ix_url}")
+
+        ix_sock.recv()
+        logger.info(f"Successfully connected to interchange via URL: {self._ix_url}")

         # Send a registration message
         msg = self.create_reg_message()
-        logger.debug("Sending registration message: …
-        …
+        logger.debug("Sending registration message: %s", msg)
+        ix_sock.send(pickle.dumps(msg))
         last_beat = time.time()
         last_interchange_contact = time.time()
         task_recv_counter = 0
@@ -335,18 +344,21 @@ class Manager:
                pending_task_count = self.pending_task_queue.qsize()
            except NotImplementedError:
                # Ref: https://github.com/python/cpython/blob/6d5e0dc0e330f4009e8dc3d1642e46b129788877/Lib/multiprocessing/queues.py#L125
-               pending_task_count = f"pending task count is not available on {platform.system()}"
+               pending_task_count = f"pending task count is not available on {platform.system()}"  # type: ignore[assignment]

-           logger.debug(…
-           …
+           logger.debug(
+               'ready workers: %d, pending tasks: %d',
+               self.ready_worker_count.value,  # type: ignore[attr-defined]
+               pending_task_count,
+           )

            if time.time() >= last_beat + self.heartbeat_period:
-               self.heartbeat_to_incoming(…
+               self.heartbeat_to_incoming(ix_sock)
                last_beat = time.time()

            if time.time() > self.drain_time:
                logger.info("Requesting drain")
-               self.drain_to_incoming(…
+               self.drain_to_incoming(ix_sock)
                # This will start the pool draining...
                # Drained exit behaviour does not happen here. It will be
                # driven by the interchange sending a DRAINED_CODE message.
@@ -358,8 +370,8 @@ class Manager:
            poll_duration_s = max(0, next_interesting_event_time - time.time())
            socks = dict(poller.poll(timeout=poll_duration_s * 1000))

-           if socks.get(…
-           …
+           if socks.get(ix_sock) == zmq.POLLIN:
+               pkl_msg = ix_sock.recv()
                tasks = pickle.loads(pkl_msg)
                last_interchange_contact = time.time()

@@ -377,6 +389,11 @@ class Manager:
                    for task in tasks:
                        self.task_scheduler.put_task(task)

+           elif socks.get(results_sock) == zmq.POLLIN:
+               meta_b = pickle.dumps({'type': 'result'})
+               ix_sock.send_multipart([meta_b, results_sock.recv()])
+               logger.debug("Result sent to interchange")
+
            else:
                logger.debug("No incoming tasks")

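The hunks above collapse the manager's separate task and result connections into a single DEALER link to the interchange: a result-ferrying thread hands finished results to the communicator thread over an inproc PAIR socket, and the communicator multiplexes tasks, heartbeats, and results on the one socket, forwarding each result as a two-frame multipart message behind a small typed header. Below is a stripped-down sketch of that pattern with hypothetical names and a placeholder endpoint; it is not the Manager class itself, only an illustration of the Event-gated bind/connect handshake and the multipart forwarding.

import pickle
import threading

import zmq

ctx = zmq.Context()
pair_ready = threading.Event()


def result_producer() -> None:
    # Stand-in for the thread that drains a local result queue.
    push = ctx.socket(zmq.PAIR)
    push.setsockopt(zmq.LINGER, 0)
    pair_ready.wait()                      # connect only after the peer end is bound
    push.connect("inproc://results")
    push.send(b"pickled-result-bytes")     # placeholder payload
    push.close()


def communicator() -> None:
    # Stand-in for the thread that owns the single connection to the server.
    results = ctx.socket(zmq.PAIR)
    results.setsockopt(zmq.LINGER, 0)
    results.bind("inproc://results")       # bind first, then signal the producer
    pair_ready.set()

    server = ctx.socket(zmq.DEALER)
    server.setsockopt(zmq.LINGER, 0)
    server.connect("tcp://127.0.0.1:5999")  # assumed placeholder endpoint

    poller = zmq.Poller()
    poller.register(results, zmq.POLLIN)
    poller.register(server, zmq.POLLIN)

    events = dict(poller.poll(timeout=1000))
    if events.get(results) == zmq.POLLIN:
        # Forward the raw result as the second frame, preceded by a typed header.
        server.send_multipart([pickle.dumps({"type": "result"}), results.recv()])

    results.close()
    server.close()


producer = threading.Thread(target=result_producer)
producer.start()
communicator()
producer.join()
ctx.term()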
@@ -387,45 +404,36 @@ class Manager:
                logger.critical("Exiting")
                break

-        …
+        ix_sock.close()
         logger.info("Exiting")

     @wrap_with_logs
-    def …
-        """ Listens on the pending_result_queue and …
+    def ferry_result(self, may_connect: threading.Event):
+        """ Listens on the pending_result_queue and ferries results to the interchange
+        connected thread
         """
-        logger.debug("…
+        logger.debug("Begin")

         # Linger is set to 0, so that the manager can exit even when there might be
         # messages in the pipe
-        … (4 removed lines, content not shown in this diff view)
-        logger.info("Manager result pipe connected to interchange")
+        notify_sock = self.zmq_context.socket(zmq.PAIR)
+        notify_sock.setsockopt(zmq.LINGER, 0)
+        may_connect.wait()
+        notify_sock.connect("inproc://results")

         while not self._stop_event.is_set():
-            logger.debug("Starting pending_result_queue get")
            try:
                r = self.task_scheduler.get_result()
                if r is None:
                    continue
-               logger.debug("Result received from worker …
-               …
-               logger.debug("Result sent to interchange: %s", id(r))
+               logger.debug("Result received from worker")
+               notify_sock.send(r)
            except Exception:
                logger.exception("Failed to send result to interchange")

-        …
+        notify_sock.close()
         logger.debug("Exiting")

-    @wrap_with_logs
-    def heartbeater(self):
-        while not self._stop_event.wait(self.heartbeat_period):
-            heartbeat_message = f"heartbeat_period={self.heartbeat_period} seconds"
-            logger.info(f"Sending heartbeat via results connection: {heartbeat_message}")
-            self.pending_result_queue.put(pickle.dumps({'type': 'heartbeat'}))
-
     def worker_watchdog(self, procs: dict[int, SpawnProcess]):
         """Keeps workers alive."""
         logger.debug("Starting worker watchdog")
@@ -492,23 +500,26 @@ class Manager:

         logger.debug("Workers started")

-        … (3 removed lines, content not shown in this diff view)
+        pair_setup = threading.Event()
+
+        thr_task_puller = threading.Thread(
+            target=self.interchange_communicator,
+            args=(pair_setup,),
+            name="Interchange-Communicator",
         )
+        thr_result_ferry = threading.Thread(
+            target=self.ferry_result, args=(pair_setup,), name="Result-Shovel")
         thr_worker_watchdog = threading.Thread(
             target=self.worker_watchdog, args=(procs,), name="worker-watchdog"
         )
         thr_monitoring_handler = threading.Thread(
             target=self.handle_monitoring_messages, name="Monitoring-Handler"
         )
-        thr_heartbeater = threading.Thread(target=self.heartbeater, name="Heartbeater")

         thr_task_puller.start()
-        …
+        thr_result_ferry.start()
         thr_worker_watchdog.start()
         thr_monitoring_handler.start()
-        thr_heartbeater.start()

         logger.info("Manager threads started")

@@ -520,11 +531,10 @@ class Manager:
         self.monitoring_queue.put(None)
         self.pending_result_queue.put(None)

-        thr_heartbeater.join()
-        thr_task_puller.join()
-        thr_result_pusher.join()
-        thr_worker_watchdog.join()
         thr_monitoring_handler.join()
+        thr_worker_watchdog.join()
+        thr_result_ferry.join()
+        thr_task_puller.join()

         for worker_id in procs:
             p = procs[worker_id]
@@ -862,10 +872,10 @@ def get_arg_parser() -> argparse.ArgumentParser:
         help="GB of memory assigned to each worker process. Default=0, no assignment",
     )
     parser.add_argument(
-        "-…
-        "--…
+        "-P",
+        "--port",
         required=True,
-        help="…
+        help="Port for communication with the interchange",
     )
     parser.add_argument(
         "--max_workers_per_node",
@@ -901,12 +911,6 @@ def get_arg_parser() -> argparse.ArgumentParser:
     parser.add_argument(
         "--poll", default=10, help="Poll period used in milliseconds"
     )
-    parser.add_argument(
-        "-r",
-        "--result_port",
-        required=True,
-        help="Result port for posting results to the interchange",
-    )
     parser.add_argument(
         "--cpu-affinity",
         type=strategyorlist,
@@ -953,8 +957,7 @@ if __name__ == "__main__":
         f"\n Block ID: {args.block_id}"
         f"\n cores_per_worker: {args.cores_per_worker}"
         f"\n mem_per_worker: {args.mem_per_worker}"
-        f"\n …
-        f"\n result_port: {args.result_port}"
+        f"\n Interchange port: {args.port}"
         f"\n addresses: {args.addresses}"
         f"\n max_workers_per_node: {args.max_workers_per_node}"
         f"\n poll_period: {args.poll}"
@@ -969,8 +972,7 @@ if __name__ == "__main__":
         f"\n mpi_launcher: {args.mpi_launcher}"
     )
     try:
-        manager = Manager(…
-                          result_port=args.result_port,
+        manager = Manager(port=args.port,
                           addresses=args.addresses,
                           address_probe_timeout=int(args.address_probe_timeout),
                           uid=args.uid,
parsl/executors/taskvine/executor.py
CHANGED
@@ -40,6 +40,8 @@ from parsl.executors.taskvine.factory_config import TaskVineFactoryConfig
 from parsl.executors.taskvine.manager import _taskvine_submit_wait
 from parsl.executors.taskvine.manager_config import TaskVineManagerConfig
 from parsl.executors.taskvine.utils import ParslFileToVine, ParslTaskToVine
+from parsl.monitoring.radios.base import RadioConfig
+from parsl.monitoring.radios.filesystem import FilesystemRadio
 from parsl.multiprocessing import SpawnContext
 from parsl.process_loggers import wrap_with_logs
 from parsl.providers import CondorProvider, LocalProvider
@@ -98,8 +100,6 @@ class TaskVineExecutor(BlockProviderExecutor, putils.RepresentationMixin):
             Default is None.
     """

-    radio_mode = "filesystem"
-
     @typeguard.typechecked
     def __init__(self,
                  label: str = "TaskVineExecutor",
@@ -108,7 +108,8 @@ class TaskVineExecutor(BlockProviderExecutor, putils.RepresentationMixin):
                  manager_config: TaskVineManagerConfig = TaskVineManagerConfig(),
                  factory_config: TaskVineFactoryConfig = TaskVineFactoryConfig(),
                  provider: Optional[ExecutionProvider] = LocalProvider(init_blocks=1),
-                 storage_access: Optional[List[Staging]] = None):
+                 storage_access: Optional[List[Staging]] = None,
+                 remote_monitoring_radio: Optional[RadioConfig] = None):

         # Set worker launch option for this executor
         if worker_launch_method == 'factory' or worker_launch_method == 'manual':
@@ -134,6 +135,11 @@ class TaskVineExecutor(BlockProviderExecutor, putils.RepresentationMixin):
         self.factory_config = factory_config
         self.storage_access = storage_access

+        if remote_monitoring_radio is not None:
+            self.remote_monitoring_radio = remote_monitoring_radio
+        else:
+            self.remote_monitoring_radio = FilesystemRadio()
+
         # Queue to send ready tasks from TaskVine executor process to TaskVine manager process
         self._ready_task_queue: multiprocessing.Queue = SpawnContext.Queue()

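With the class-level radio_mode attribute removed, the monitoring radio is now selected per executor instance through the remote_monitoring_radio parameter. A short sketch of passing it explicitly follows; it mirrors the default applied in __init__ above, assumes the TaskVine (ndcctools) dependencies are installed, and passes no arguments to FilesystemRadio because none are shown in this diff.

from parsl.executors.taskvine import TaskVineExecutor
from parsl.monitoring.radios.filesystem import FilesystemRadio

# Equivalent to the default: monitoring messages from remote tasks travel
# over the filesystem radio unless another RadioConfig is supplied.
executor = TaskVineExecutor(
    label="taskvine",
    remote_monitoring_radio=FilesystemRadio(),
)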
parsl/executors/taskvine/manager.py
CHANGED
@@ -6,6 +6,7 @@ import shutil
 import subprocess
 import uuid

+import parsl
 from parsl.executors.taskvine import exec_parsl_function
 from parsl.executors.taskvine.utils import VineTaskToParsl, run_parsl_function
 from parsl.process_loggers import wrap_with_logs
@@ -255,7 +256,8 @@ def _taskvine_submit_wait(ready_task_queue=None,
             run_parsl_function,
             poncho_env=poncho_env_path,
             init_command=manager_config.init_command,
-            add_env=add_env)
+            add_env=add_env,
+            hoisting_modules=[parsl.serialize, run_parsl_function])

         # Configure the library if provided
         if manager_config.library_config:
parsl/executors/threads.py
CHANGED
@@ -7,6 +7,8 @@ import typeguard
 from parsl.data_provider.staging import Staging
 from parsl.executors.base import ParslExecutor
 from parsl.executors.errors import InvalidResourceSpecification
+from parsl.monitoring.radios.base import RadioConfig
+from parsl.monitoring.radios.multiprocessing import MultiprocessingQueueRadio
 from parsl.utils import RepresentationMixin

 logger = logging.getLogger(__name__)
@@ -28,7 +30,7 @@ class ThreadPoolExecutor(ParslExecutor, RepresentationMixin):
     @typeguard.typechecked
     def __init__(self, label: str = 'threads', max_threads: Optional[int] = 2,
                  thread_name_prefix: str = '', storage_access: Optional[List[Staging]] = None,
-                 working_dir: Optional[str] = None):
+                 working_dir: Optional[str] = None, remote_monitoring_radio: Optional[RadioConfig] = None):
         ParslExecutor.__init__(self)
         self.label = label
         self.max_threads = max_threads
@@ -40,6 +42,11 @@ class ThreadPoolExecutor(ParslExecutor, RepresentationMixin):
         self.storage_access = storage_access
         self.working_dir = working_dir

+        if remote_monitoring_radio is not None:
+            self.remote_monitoring_radio = remote_monitoring_radio
+        else:
+            self.remote_monitoring_radio = MultiprocessingQueueRadio()
+
     def start(self):
         self.executor = cf.ThreadPoolExecutor(max_workers=self.max_threads,
                                               thread_name_prefix=self.thread_name_prefix)
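The same pattern appears here with an in-process default: ThreadPoolExecutor runs tasks in the submitting process, so it falls back to MultiprocessingQueueRadio. A hedged configuration sketch follows; the executor and radio names come from the diff above, while the bare MonitoringHub() call and Config wiring are assumptions not taken from this diff.

from parsl.config import Config
from parsl.executors.threads import ThreadPoolExecutor
from parsl.monitoring import MonitoringHub
from parsl.monitoring.radios.multiprocessing import MultiprocessingQueueRadio

config = Config(
    executors=[
        ThreadPoolExecutor(
            label="threads",
            max_threads=4,
            # Explicit here, but identical to the default chosen in __init__ above.
            remote_monitoring_radio=MultiprocessingQueueRadio(),
        )
    ],
    monitoring=MonitoringHub(),
)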
parsl/executors/workqueue/executor.py
CHANGED
@@ -31,6 +31,8 @@ from parsl.errors import OptionalModuleMissing
 from parsl.executors.errors import ExecutorError, InvalidResourceSpecification
 from parsl.executors.status_handling import BlockProviderExecutor
 from parsl.executors.workqueue import exec_parsl_function
+from parsl.monitoring.radios.base import RadioConfig
+from parsl.monitoring.radios.filesystem import FilesystemRadio
 from parsl.multiprocessing import SpawnContext, SpawnProcess
 from parsl.process_loggers import wrap_with_logs
 from parsl.providers import CondorProvider, LocalProvider
@@ -227,8 +229,6 @@ class WorkQueueExecutor(BlockProviderExecutor, putils.RepresentationMixin):
             specifiation for each task).
     """

-    radio_mode = "filesystem"
-
     @typeguard.typechecked
     def __init__(self,
                  label: str = "WorkQueueExecutor",
@@ -255,7 +255,8 @@ class WorkQueueExecutor(BlockProviderExecutor, putils.RepresentationMixin):
                  worker_executable: str = 'work_queue_worker',
                  function_dir: Optional[str] = None,
                  coprocess: bool = False,
-                 scaling_cores_per_worker: int = 1):
+                 scaling_cores_per_worker: int = 1,
+                 remote_monitoring_radio: Optional[RadioConfig] = None):
         BlockProviderExecutor.__init__(self, provider=provider,
                                        block_error_handler=True)
         if not _work_queue_enabled:
@@ -308,6 +309,11 @@ class WorkQueueExecutor(BlockProviderExecutor, putils.RepresentationMixin):
         if self.init_command != "":
             self.launch_cmd = self.init_command + "; " + self.launch_cmd

+        if remote_monitoring_radio is not None:
+            self.remote_monitoring_radio = remote_monitoring_radio
+        else:
+            self.remote_monitoring_radio = FilesystemRadio()
+
     def _get_launch_command(self, block_id):
         # this executor uses different terminology for worker/launch
         # commands than in htex
parsl/monitoring/errors.py
CHANGED
@@ -4,3 +4,8 @@ from parsl.errors import ParslError
 class MonitoringRouterStartError(ParslError):
     def __str__(self) -> str:
         return "Monitoring router failed to start"
+
+
+class RadioRequiredError(ParslError):
+    def __str__(self) -> str:
+        return "A radio must be configured for remote task monitoring"
parsl/monitoring/monitoring.py
CHANGED
@@ -9,8 +9,6 @@ from typing import Any, Optional, Union

 import typeguard

-from parsl.monitoring.radios.filesystem_router import start_filesystem_receiver
-from parsl.monitoring.radios.udp_router import start_udp_receiver
 from parsl.monitoring.types import TaggedMonitoringMessage
 from parsl.multiprocessing import (
     SizedQueue,
@@ -36,9 +34,9 @@ logger = logging.getLogger(__name__)
 @typeguard.typechecked
 class MonitoringHub(RepresentationMixin):
     def __init__(self,
-                 hub_address: …
-                 …
-                 …
+                 hub_address: Any = None,  # unused, so no type enforcement
+                 hub_port_range: Any = None,  # unused, so no type enforcement
+                 hub_port: Any = None,  # unused, so no type enforcement

                  workflow_name: Optional[str] = None,
                  workflow_version: Optional[str] = None,
@@ -49,16 +47,14 @@ class MonitoringHub(RepresentationMixin):
        """
        Parameters
        ----------
-       hub_address : …
-           … (5 removed lines, content not shown in this diff view)
-           hub_port_range.
-           Default: None
+       hub_address : unused
+       hub_port : unused
+           Unused, but probably retained until 2026-06-01 to give deprecation warning.
+           These two values previously configured UDP parameters when UDP was used
+           for monitoring messages from workers. These are now configured on the
+           relevant UDPRadio.
        hub_port_range : unused
-           Unused, but retained until …
+           Unused, but probably retained until 2026-06-01 to give deprecation warning.
            This value previously configured one ZMQ channel inside the
            HighThroughputExecutor. That ZMQ channel is now configured by the
            interchange_port_range parameter of HighThroughputExecutor.
@@ -86,15 +82,27 @@ class MonitoringHub(RepresentationMixin):
         if _db_manager_excepts:
             raise _db_manager_excepts

+        # The following three parameters need to exist as attributes to be
+        # output by RepresentationMixin.
+        if hub_address is not None:
+            message = "Instead of MonitoringHub.hub_address, specify UDPRadio(address=...)"
+            warnings.warn(message, DeprecationWarning)
+            logger.warning(message)
+
         self.hub_address = hub_address
+
+        if hub_port is not None:
+            message = "Instead of MonitoringHub.hub_port, specify UDPRadio(port=...)"
+            warnings.warn(message, DeprecationWarning)
+            logger.warning(message)
+
         self.hub_port = hub_port

         if hub_port_range is not None:
             message = "Instead of MonitoringHub.hub_port_range, Use HighThroughputExecutor.interchange_port_range"
             warnings.warn(message, DeprecationWarning)
             logger.warning(message)
-        …
-            # even though now it is otherwise unused.
+
         self.hub_port_range = hub_port_range

         self.logging_endpoint = logging_endpoint
@@ -120,12 +128,6 @@ class MonitoringHub(RepresentationMixin):
         self.resource_msgs: Queue[TaggedMonitoringMessage]
         self.resource_msgs = SizedQueue()

-        self.udp_receiver = start_udp_receiver(debug=self.monitoring_debug,
-                                               logdir=dfk_run_dir,
-                                               monitoring_messages=self.resource_msgs,
-                                               port=self.hub_port
-                                               )
-
         self.dbm_exit_event: ms.Event
         self.dbm_exit_event = SpawnEvent()

@@ -140,37 +142,18 @@ class MonitoringHub(RepresentationMixin):
             daemon=True,
         )
         self.dbm_proc.start()
-        logger.info("Started …
-                    self.udp_receiver.process.pid, self.dbm_proc.pid)
-
-        self.filesystem_receiver = start_filesystem_receiver(debug=self.monitoring_debug,
-                                                             logdir=dfk_run_dir,
-                                                             monitoring_messages=self.resource_msgs
-                                                             )
-        logger.info("Started filesystem radio receiver process %s", self.filesystem_receiver.process.pid)
-
-        self.monitoring_hub_url = "udp://{}:{}".format(self.hub_address, self.udp_receiver.port)
-
+        logger.info("Started DBM process %s", self.dbm_proc.pid)
         logger.info("Monitoring Hub initialized")

     def close(self) -> None:
         logger.info("Terminating Monitoring Hub")
         if self.monitoring_hub_active:
             self.monitoring_hub_active = False
-            logger.info("Setting router termination event")
-            …
-            logger.info("Waiting for UDP router to terminate")
-            self.udp_receiver.close()
-
-            logger.debug("Finished waiting for router termination")
             logger.debug("Waiting for DB termination")
             self.dbm_exit_event.set()
             join_terminate_close_proc(self.dbm_proc)
             logger.debug("Finished waiting for DBM termination")

-            logger.info("Terminating filesystem radio receiver process")
-            self.filesystem_receiver.close()
-
             logger.info("Closing monitoring multiprocessing queues")
             self.resource_msgs.close()
             self.resource_msgs.join_thread()
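Taken together with the executor changes above, radio configuration moves off MonitoringHub and onto each executor's remote_monitoring_radio. The sketch below is an assumption-laden illustration: it presumes HighThroughputExecutor accepts the same remote_monitoring_radio parameter added to the executors shown in this diff (its own diff is not reproduced in this section), and only the address= argument of UDPRadio is taken from the deprecation message above.

from parsl.config import Config
from parsl.executors import HighThroughputExecutor
from parsl.monitoring import MonitoringHub
from parsl.monitoring.radios.udp import UDPRadio

config = Config(
    executors=[
        HighThroughputExecutor(
            label="htex",
            # Radio settings now live on the radio object, not on MonitoringHub;
            # passing hub_address= or hub_port= to MonitoringHub now only emits
            # a DeprecationWarning.
            remote_monitoring_radio=UDPRadio(address="10.0.0.1"),
        )
    ],
    monitoring=MonitoringHub(),
)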