parsl 2025.6.23__py3-none-any.whl → 2025.7.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- parsl/configs/anvil.py +34 -0
- parsl/configs/delta.py +35 -0
- parsl/configs/osg.py +1 -1
- parsl/dataflow/dflow.py +14 -4
- parsl/executors/base.py +14 -6
- parsl/executors/high_throughput/executor.py +20 -15
- parsl/executors/high_throughput/interchange.py +177 -191
- parsl/executors/high_throughput/mpi_executor.py +7 -4
- parsl/executors/high_throughput/probe.py +61 -49
- parsl/executors/high_throughput/process_worker_pool.py +105 -103
- parsl/executors/taskvine/executor.py +9 -3
- parsl/executors/taskvine/manager.py +3 -1
- parsl/executors/threads.py +8 -1
- parsl/executors/workqueue/executor.py +9 -3
- parsl/monitoring/errors.py +5 -0
- parsl/monitoring/monitoring.py +25 -42
- parsl/monitoring/radios/base.py +63 -2
- parsl/monitoring/radios/filesystem.py +18 -3
- parsl/monitoring/radios/filesystem_router.py +13 -26
- parsl/monitoring/radios/htex.py +22 -13
- parsl/monitoring/radios/multiprocessing.py +22 -2
- parsl/monitoring/radios/udp.py +57 -19
- parsl/monitoring/radios/udp_router.py +49 -15
- parsl/monitoring/remote.py +19 -40
- parsl/providers/local/local.py +12 -13
- parsl/tests/configs/htex_local_alternate.py +0 -1
- parsl/tests/test_htex/test_interchange_exit_bad_registration.py +5 -7
- parsl/tests/test_htex/test_zmq_binding.py +5 -6
- parsl/tests/test_monitoring/test_basic.py +12 -10
- parsl/tests/test_monitoring/test_htex_init_blocks_vs_monitoring.py +0 -1
- parsl/tests/test_monitoring/test_radio_filesystem.py +7 -9
- parsl/tests/test_monitoring/test_radio_multiprocessing.py +44 -0
- parsl/tests/test_monitoring/test_radio_udp.py +163 -12
- parsl/tests/test_monitoring/test_stdouterr.py +1 -3
- parsl/tests/test_scaling/test_worker_interchange_bad_messages_3262.py +3 -7
- parsl/version.py +1 -1
- {parsl-2025.6.23.data → parsl-2025.7.7.data}/scripts/interchange.py +177 -191
- {parsl-2025.6.23.data → parsl-2025.7.7.data}/scripts/process_worker_pool.py +105 -103
- {parsl-2025.6.23.dist-info → parsl-2025.7.7.dist-info}/METADATA +2 -2
- {parsl-2025.6.23.dist-info → parsl-2025.7.7.dist-info}/RECORD +46 -43
- {parsl-2025.6.23.data → parsl-2025.7.7.data}/scripts/exec_parsl_function.py +0 -0
- {parsl-2025.6.23.data → parsl-2025.7.7.data}/scripts/parsl_coprocess.py +0 -0
- {parsl-2025.6.23.dist-info → parsl-2025.7.7.dist-info}/LICENSE +0 -0
- {parsl-2025.6.23.dist-info → parsl-2025.7.7.dist-info}/WHEEL +0 -0
- {parsl-2025.6.23.dist-info → parsl-2025.7.7.dist-info}/entry_points.txt +0 -0
- {parsl-2025.6.23.dist-info → parsl-2025.7.7.dist-info}/top_level.txt +0 -0
@@ -1,6 +1,5 @@
|
|
1
1
|
#!/usr/bin/env python
|
2
2
|
import datetime
|
3
|
-
import json
|
4
3
|
import logging
|
5
4
|
import os
|
6
5
|
import pickle
|
@@ -46,7 +45,7 @@ class Interchange:
|
|
46
45
|
client_address: str,
|
47
46
|
interchange_address: Optional[str],
|
48
47
|
client_ports: Tuple[int, int, int],
|
49
|
-
|
48
|
+
worker_port: Optional[int],
|
50
49
|
worker_port_range: Tuple[int, int],
|
51
50
|
hub_address: Optional[str],
|
52
51
|
hub_zmq_port: Optional[int],
|
@@ -71,12 +70,12 @@ class Interchange:
|
|
71
70
|
client_ports : tuple(int, int, int)
|
72
71
|
The ports at which the client can be reached
|
73
72
|
|
74
|
-
|
75
|
-
The specific
|
73
|
+
worker_port : int
|
74
|
+
The specific port to which workers will connect to the Interchange.
|
76
75
|
|
77
76
|
worker_port_range : tuple(int, int)
|
78
77
|
The interchange picks ports at random from the range which will be used by workers.
|
79
|
-
This is overridden when the
|
78
|
+
This is overridden when the worker_port option is set.
|
80
79
|
|
81
80
|
hub_address : str
|
82
81
|
The IP address at which the interchange can send info about managers to when monitoring is enabled.
|
@@ -139,31 +138,23 @@ class Interchange:
|
|
139
138
|
# count of tasks that have been sent out to worker pools
|
140
139
|
self.count = 0
|
141
140
|
|
142
|
-
self.
|
143
|
-
self.
|
141
|
+
self.manager_sock = self.zmq_context.socket(zmq.ROUTER)
|
142
|
+
self.manager_sock.set_hwm(0)
|
144
143
|
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
self.results_incoming.set_hwm(0)
|
149
|
-
|
150
|
-
if self.worker_ports:
|
151
|
-
self.worker_task_port = self.worker_ports[0]
|
152
|
-
self.worker_result_port = self.worker_ports[1]
|
153
|
-
|
154
|
-
self.task_outgoing.bind(tcp_url(self.interchange_address, self.worker_task_port))
|
155
|
-
self.results_incoming.bind(tcp_url(self.interchange_address, self.worker_result_port))
|
144
|
+
if worker_port:
|
145
|
+
task_addy = tcp_url(self.interchange_address, worker_port)
|
146
|
+
self.manager_sock.bind(task_addy)
|
156
147
|
|
157
148
|
else:
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
|
163
|
-
|
149
|
+
worker_port = self.manager_sock.bind_to_random_port(
|
150
|
+
tcp_url(self.interchange_address),
|
151
|
+
min_port=worker_port_range[0],
|
152
|
+
max_port=worker_port_range[1],
|
153
|
+
max_tries=100,
|
154
|
+
)
|
155
|
+
self.worker_port = worker_port
|
164
156
|
|
165
|
-
logger.info("Bound to
|
166
|
-
self.worker_task_port, self.worker_result_port))
|
157
|
+
logger.info(f"Bound to port {worker_port} for incoming worker connections")
|
167
158
|
|
168
159
|
self._ready_managers: Dict[bytes, ManagerRecord] = {}
|
169
160
|
self.connected_block_history: List[str] = []
|
@@ -276,8 +267,8 @@ class Interchange:
|
|
276
267
|
|
277
268
|
reply = None
|
278
269
|
|
279
|
-
elif command_req == "
|
280
|
-
reply =
|
270
|
+
elif command_req == "WORKER_BINDS":
|
271
|
+
reply = self.worker_port
|
281
272
|
|
282
273
|
else:
|
283
274
|
logger.error(f"Received unknown command: {command_req}")
|
@@ -307,8 +298,7 @@ class Interchange:
|
|
307
298
|
kill_event = threading.Event()
|
308
299
|
|
309
300
|
poller = zmq.Poller()
|
310
|
-
poller.register(self.
|
311
|
-
poller.register(self.results_incoming, zmq.POLLIN)
|
301
|
+
poller.register(self.manager_sock, zmq.POLLIN)
|
312
302
|
poller.register(self.task_incoming, zmq.POLLIN)
|
313
303
|
poller.register(self.command_channel, zmq.POLLIN)
|
314
304
|
|
@@ -323,8 +313,7 @@ class Interchange:
|
|
323
313
|
|
324
314
|
self.process_command(monitoring_radio)
|
325
315
|
self.process_task_incoming()
|
326
|
-
self.
|
327
|
-
self.process_results_incoming(interesting_managers, monitoring_radio)
|
316
|
+
self.process_manager_socket_message(interesting_managers, monitoring_radio, kill_event)
|
328
317
|
self.expire_bad_managers(interesting_managers, monitoring_radio)
|
329
318
|
self.expire_drained_managers(interesting_managers, monitoring_radio)
|
330
319
|
self.process_tasks_to_send(interesting_managers, monitoring_radio)
|
@@ -353,116 +342,171 @@ class Interchange:
|
|
353
342
|
self.task_counter += 1
|
354
343
|
logger.debug(f"Fetched {self.task_counter} tasks so far")
|
355
344
|
|
356
|
-
def
|
357
|
-
|
358
|
-
|
359
|
-
|
360
|
-
|
345
|
+
def process_manager_socket_message(
|
346
|
+
self,
|
347
|
+
interesting_managers: Set[bytes],
|
348
|
+
monitoring_radio: Optional[MonitoringRadioSender],
|
349
|
+
kill_event: threading.Event,
|
361
350
|
) -> None:
|
362
|
-
"""Process one message from manager on the
|
363
|
-
|
364
|
-
|
365
|
-
|
366
|
-
|
367
|
-
|
368
|
-
|
369
|
-
manager_id =
|
370
|
-
|
371
|
-
|
372
|
-
|
373
|
-
|
374
|
-
|
375
|
-
|
376
|
-
|
377
|
-
|
378
|
-
|
379
|
-
|
380
|
-
|
381
|
-
|
382
|
-
|
383
|
-
|
384
|
-
|
385
|
-
|
386
|
-
|
387
|
-
|
388
|
-
|
389
|
-
|
390
|
-
|
391
|
-
|
392
|
-
|
393
|
-
|
394
|
-
|
395
|
-
|
396
|
-
|
397
|
-
|
398
|
-
|
399
|
-
|
400
|
-
|
401
|
-
|
402
|
-
|
403
|
-
|
351
|
+
"""Process one message from manager on the manager_sock channel."""
|
352
|
+
if not self.socks.get(self.manager_sock) == zmq.POLLIN:
|
353
|
+
return
|
354
|
+
|
355
|
+
logger.debug('starting worker message section')
|
356
|
+
msg_parts = self.manager_sock.recv_multipart()
|
357
|
+
try:
|
358
|
+
manager_id, meta_b, *msgs = msg_parts
|
359
|
+
meta = pickle.loads(meta_b)
|
360
|
+
mtype = meta['type']
|
361
|
+
except Exception as e:
|
362
|
+
logger.warning(
|
363
|
+
f'Failed to read manager message ([{type(e).__name__}] {e})'
|
364
|
+
)
|
365
|
+
logger.debug('Message:\n %r\n', msg_parts, exc_info=e)
|
366
|
+
return
|
367
|
+
|
368
|
+
logger.debug(
|
369
|
+
'Processing message type %r from manager %r', mtype, manager_id
|
370
|
+
)
|
371
|
+
|
372
|
+
if mtype == 'connection_probe':
|
373
|
+
self.manager_sock.send_multipart([manager_id, b''])
|
374
|
+
return
|
375
|
+
|
376
|
+
elif mtype == 'registration':
|
377
|
+
ix_minor_py = self.current_platform['python_v'].rsplit('.', 1)[0]
|
378
|
+
ix_parsl_v = self.current_platform['parsl_v']
|
379
|
+
mgr_minor_py = meta['python_v'].rsplit('.', 1)[0]
|
380
|
+
mgr_parsl_v = meta['parsl_v']
|
381
|
+
|
382
|
+
new_rec = ManagerRecord(
|
383
|
+
block_id=None,
|
384
|
+
start_time=meta['start_time'],
|
385
|
+
tasks=[],
|
386
|
+
worker_count=0,
|
387
|
+
max_capacity=0,
|
388
|
+
active=True,
|
389
|
+
draining=False,
|
390
|
+
last_heartbeat=time.time(),
|
391
|
+
idle_since=time.time(),
|
392
|
+
parsl_version=mgr_parsl_v,
|
393
|
+
python_version=meta['python_v'],
|
394
|
+
)
|
395
|
+
|
396
|
+
# m is a ManagerRecord, but meta is a dict[Any,Any] and so can
|
397
|
+
# contain arbitrary fields beyond those in ManagerRecord (and
|
398
|
+
# indeed does - for example, python_v) which are then ignored
|
399
|
+
# later.
|
400
|
+
new_rec.update(meta)
|
401
|
+
|
402
|
+
logger.info(f'Registration info for manager {manager_id!r}: {meta}')
|
403
|
+
self._send_monitoring_info(monitoring_radio, new_rec)
|
404
|
+
|
405
|
+
if (mgr_minor_py, mgr_parsl_v) != (ix_minor_py, ix_parsl_v):
|
406
|
+
kill_event.set()
|
407
|
+
vm_exc = VersionMismatch(
|
408
|
+
f"py.v={ix_minor_py} parsl.v={ix_parsl_v}",
|
409
|
+
f"py.v={mgr_minor_py} parsl.v={mgr_parsl_v}",
|
410
|
+
)
|
411
|
+
result_package = {
|
412
|
+
'type': 'result',
|
413
|
+
'task_id': -1,
|
414
|
+
'exception': serialize_object(vm_exc),
|
415
|
+
}
|
416
|
+
pkl_package = pickle.dumps(result_package)
|
417
|
+
self.results_outgoing.send(pkl_package)
|
418
|
+
logger.error(
|
419
|
+
'Manager has incompatible version info with the interchange;'
|
420
|
+
' sending failure reports and shutting down:'
|
421
|
+
f'\n Interchange: {vm_exc.interchange_version}'
|
422
|
+
f'\n Manager: {vm_exc.manager_version}'
|
404
423
|
)
|
405
424
|
|
406
|
-
|
407
|
-
#
|
408
|
-
#
|
409
|
-
|
410
|
-
m.update(msg) # type: ignore[typeddict-item]
|
425
|
+
else:
|
426
|
+
# We really should update the associated data structure; but not
|
427
|
+
# at this time. *kicks can down the road*
|
428
|
+
assert new_rec['block_id'] is not None, 'Verified externally'
|
411
429
|
|
412
|
-
|
413
|
-
self.
|
430
|
+
# set up entry only if we accept the registration
|
431
|
+
self._ready_managers[manager_id] = new_rec
|
432
|
+
self.connected_block_history.append(new_rec['block_id'])
|
414
433
|
|
415
|
-
|
416
|
-
|
417
|
-
|
418
|
-
|
419
|
-
|
420
|
-
|
421
|
-
|
422
|
-
|
423
|
-
|
424
|
-
|
425
|
-
|
426
|
-
|
427
|
-
|
434
|
+
interesting_managers.add(manager_id)
|
435
|
+
|
436
|
+
logger.info(
|
437
|
+
f"Registered manager {manager_id!r} (py{mgr_minor_py},"
|
438
|
+
f" {mgr_parsl_v}) and added to ready queue"
|
439
|
+
)
|
440
|
+
logger.debug("Manager %r -> %s", manager_id, new_rec)
|
441
|
+
|
442
|
+
return
|
443
|
+
|
444
|
+
if not (m := self._ready_managers.get(manager_id)):
|
445
|
+
logger.warning(f"Ignoring message from unknown manager: {manager_id!r}")
|
446
|
+
return
|
447
|
+
|
448
|
+
if mtype == 'result':
|
449
|
+
logger.debug("Number of results in batch: %d", len(msgs))
|
450
|
+
b_messages_to_send = []
|
451
|
+
|
452
|
+
for p_message in msgs:
|
453
|
+
r = pickle.loads(p_message)
|
454
|
+
r_type = r['type']
|
455
|
+
if r_type == 'result':
|
456
|
+
# process this for task ID and forward to executor
|
457
|
+
tid = r['task_id']
|
458
|
+
logger.debug("Removing task %s from manager", tid)
|
459
|
+
try:
|
460
|
+
m['tasks'].remove(tid)
|
461
|
+
b_messages_to_send.append(p_message)
|
462
|
+
except Exception:
|
463
|
+
logger.exception(
|
464
|
+
'Ignoring exception removing task_id %s from manager'
|
465
|
+
' task list %s',
|
466
|
+
tid,
|
467
|
+
m['tasks']
|
468
|
+
)
|
469
|
+
elif r_type == 'monitoring':
|
470
|
+
# the monitoring code makes the assumption that no
|
471
|
+
# monitoring messages will be received if monitoring
|
472
|
+
# is not configured, and that monitoring_radio will only
|
473
|
+
# be None when monitoring is not configurated.
|
474
|
+
assert monitoring_radio is not None
|
475
|
+
|
476
|
+
monitoring_radio.send(r['payload'])
|
477
|
+
|
478
|
+
else:
|
428
479
|
logger.error(
|
429
|
-
|
430
|
-
" sending failure reports and shutting down:"
|
431
|
-
f"\n Interchange: {e.interchange_version}"
|
432
|
-
f"\n Manager: {e.manager_version}"
|
480
|
+
f'Discarding result message of unknown type: {r_type}'
|
433
481
|
)
|
434
482
|
|
435
|
-
|
436
|
-
|
437
|
-
|
438
|
-
|
483
|
+
if b_messages_to_send:
|
484
|
+
logger.debug(
|
485
|
+
'Sending messages (%d) on results_outgoing',
|
486
|
+
len(b_messages_to_send),
|
487
|
+
)
|
488
|
+
self.results_outgoing.send_multipart(b_messages_to_send)
|
489
|
+
logger.debug('Sent messages on results_outgoing')
|
439
490
|
|
440
|
-
|
441
|
-
|
442
|
-
self.connected_block_history.append(m['block_id'])
|
491
|
+
# At least one result received, so manager now has idle capacity
|
492
|
+
interesting_managers.add(manager_id)
|
443
493
|
|
444
|
-
|
494
|
+
if len(m['tasks']) == 0 and m['idle_since'] is None:
|
495
|
+
m['idle_since'] = time.time()
|
445
496
|
|
446
|
-
|
447
|
-
|
448
|
-
|
449
|
-
|
450
|
-
|
451
|
-
|
452
|
-
|
453
|
-
|
454
|
-
|
455
|
-
|
456
|
-
|
457
|
-
|
458
|
-
|
459
|
-
logger.warning("Received heartbeat via tasks connection for not-registered manager %r", manager_id)
|
460
|
-
elif msg['type'] == 'drain':
|
461
|
-
self._ready_managers[manager_id]['draining'] = True
|
462
|
-
logger.debug("Manager %r requested drain", manager_id)
|
463
|
-
else:
|
464
|
-
logger.error(f"Unexpected message type received from manager: {msg['type']}")
|
465
|
-
logger.debug("leaving task_outgoing section")
|
497
|
+
self._send_monitoring_info(monitoring_radio, m)
|
498
|
+
|
499
|
+
elif mtype == 'heartbeat':
|
500
|
+
m['last_heartbeat'] = time.time()
|
501
|
+
self.manager_sock.send_multipart([manager_id, PKL_HEARTBEAT_CODE])
|
502
|
+
|
503
|
+
elif mtype == 'drain':
|
504
|
+
m['draining'] = True
|
505
|
+
|
506
|
+
else:
|
507
|
+
logger.error(f"Unexpected message type received from manager: {mtype}")
|
508
|
+
|
509
|
+
logger.debug("leaving worker message section")
|
466
510
|
|
467
511
|
def expire_drained_managers(self, interesting_managers: Set[bytes], monitoring_radio: Optional[MonitoringRadioSender]) -> None:
|
468
512
|
|
@@ -472,7 +516,7 @@ class Interchange:
|
|
472
516
|
m = self._ready_managers[manager_id]
|
473
517
|
if m['draining'] and len(m['tasks']) == 0:
|
474
518
|
logger.info(f"Manager {manager_id!r} is drained - sending drained message to manager")
|
475
|
-
self.
|
519
|
+
self.manager_sock.send_multipart([manager_id, PKL_DRAINED_CODE])
|
476
520
|
interesting_managers.remove(manager_id)
|
477
521
|
self._ready_managers.pop(manager_id)
|
478
522
|
|
@@ -500,7 +544,7 @@ class Interchange:
|
|
500
544
|
if real_capacity and m["active"] and not m["draining"]:
|
501
545
|
tasks = self.get_tasks(real_capacity)
|
502
546
|
if tasks:
|
503
|
-
self.
|
547
|
+
self.manager_sock.send_multipart([manager_id, pickle.dumps(tasks)])
|
504
548
|
task_count = len(tasks)
|
505
549
|
self.count += task_count
|
506
550
|
tids = [t['task_id'] for t in tasks]
|
@@ -520,64 +564,6 @@ class Interchange:
|
|
520
564
|
interesting_managers.remove(manager_id)
|
521
565
|
logger.debug("leaving _ready_managers section, with %s managers still interesting", len(interesting_managers))
|
522
566
|
|
523
|
-
def process_results_incoming(self, interesting_managers: Set[bytes], monitoring_radio: Optional[MonitoringRadioSender]) -> None:
|
524
|
-
# Receive any results and forward to client
|
525
|
-
if self.results_incoming in self.socks and self.socks[self.results_incoming] == zmq.POLLIN:
|
526
|
-
logger.debug("entering results_incoming section")
|
527
|
-
manager_id, *all_messages = self.results_incoming.recv_multipart()
|
528
|
-
if manager_id not in self._ready_managers:
|
529
|
-
logger.warning(f"Received a result from a un-registered manager: {manager_id!r}")
|
530
|
-
else:
|
531
|
-
logger.debug("Got %s result items in batch from manager %r", len(all_messages), manager_id)
|
532
|
-
|
533
|
-
m = self._ready_managers[manager_id]
|
534
|
-
b_messages_to_send = []
|
535
|
-
|
536
|
-
for p_message in all_messages:
|
537
|
-
r = pickle.loads(p_message)
|
538
|
-
if r['type'] == 'result':
|
539
|
-
# process this for task ID and forward to executor
|
540
|
-
logger.debug("Removing task %s from manager record %r", r["task_id"], manager_id)
|
541
|
-
try:
|
542
|
-
m['tasks'].remove(r['task_id'])
|
543
|
-
b_messages_to_send.append(p_message)
|
544
|
-
except Exception:
|
545
|
-
logger.exception(
|
546
|
-
"Ignoring exception removing task_id %s for manager %r with task list %s",
|
547
|
-
r['task_id'],
|
548
|
-
manager_id,
|
549
|
-
m["tasks"]
|
550
|
-
)
|
551
|
-
elif r['type'] == 'monitoring':
|
552
|
-
# the monitoring code makes the assumption that no
|
553
|
-
# monitoring messages will be received if monitoring
|
554
|
-
# is not configured, and that monitoring_radio will only
|
555
|
-
# be None when monitoring is not configurated.
|
556
|
-
assert monitoring_radio is not None
|
557
|
-
|
558
|
-
monitoring_radio.send(r['payload'])
|
559
|
-
elif r['type'] == 'heartbeat':
|
560
|
-
logger.debug("Manager %r sent heartbeat via results connection", manager_id)
|
561
|
-
else:
|
562
|
-
logger.error("Interchange discarding result_queue message of unknown type: %s", r["type"])
|
563
|
-
|
564
|
-
if b_messages_to_send:
|
565
|
-
logger.debug("Sending messages on results_outgoing")
|
566
|
-
self.results_outgoing.send_multipart(b_messages_to_send)
|
567
|
-
logger.debug("Sent messages on results_outgoing")
|
568
|
-
|
569
|
-
# At least one result received, so manager now has idle capacity
|
570
|
-
interesting_managers.add(manager_id)
|
571
|
-
|
572
|
-
if len(m['tasks']) == 0 and m['idle_since'] is None:
|
573
|
-
m['idle_since'] = time.time()
|
574
|
-
|
575
|
-
self._send_monitoring_info(monitoring_radio, m)
|
576
|
-
|
577
|
-
logger.debug("Current tasks on manager %r: %s", manager_id, m["tasks"])
|
578
|
-
|
579
|
-
logger.debug("leaving results_incoming section")
|
580
|
-
|
581
567
|
def expire_bad_managers(self, interesting_managers: Set[bytes], monitoring_radio: Optional[MonitoringRadioSender]) -> None:
|
582
568
|
bad_managers = [(manager_id, m) for (manager_id, m) in self._ready_managers.items() if
|
583
569
|
time.time() - m['last_heartbeat'] > self.heartbeat_threshold]
|
@@ -15,6 +15,7 @@ from parsl.executors.high_throughput.mpi_prefix_composer import (
|
|
15
15
|
from parsl.executors.status_handling import BlockProviderExecutor
|
16
16
|
from parsl.jobs.states import JobStatus
|
17
17
|
from parsl.launchers import SimpleLauncher
|
18
|
+
from parsl.monitoring.radios.base import RadioConfig
|
18
19
|
from parsl.providers import LocalProvider
|
19
20
|
from parsl.providers.base import ExecutionProvider
|
20
21
|
|
@@ -51,7 +52,7 @@ class MPIExecutor(HighThroughputExecutor):
|
|
51
52
|
interchange_launch_cmd: Optional[str] = None,
|
52
53
|
address: Optional[str] = None,
|
53
54
|
loopback_address: str = "127.0.0.1",
|
54
|
-
|
55
|
+
worker_port: Optional[int] = None,
|
55
56
|
worker_port_range: Optional[Tuple[int, int]] = (54000, 55000),
|
56
57
|
interchange_port_range: Optional[Tuple[int, int]] = (55000, 56000),
|
57
58
|
storage_access: Optional[List[Staging]] = None,
|
@@ -67,7 +68,8 @@ class MPIExecutor(HighThroughputExecutor):
|
|
67
68
|
worker_logdir_root: Optional[str] = None,
|
68
69
|
mpi_launcher: str = "mpiexec",
|
69
70
|
block_error_handler: Union[bool, Callable[[BlockProviderExecutor, Dict[str, JobStatus]], None]] = True,
|
70
|
-
encrypted: bool = False
|
71
|
+
encrypted: bool = False,
|
72
|
+
remote_monitoring_radio: Optional[RadioConfig] = None):
|
71
73
|
super().__init__(
|
72
74
|
# Hard-coded settings
|
73
75
|
cores_per_worker=1e-9, # Ensures there will be at least an absurd number of workers
|
@@ -80,7 +82,7 @@ class MPIExecutor(HighThroughputExecutor):
|
|
80
82
|
interchange_launch_cmd=interchange_launch_cmd,
|
81
83
|
address=address,
|
82
84
|
loopback_address=loopback_address,
|
83
|
-
|
85
|
+
worker_port=worker_port,
|
84
86
|
worker_port_range=worker_port_range,
|
85
87
|
interchange_port_range=interchange_port_range,
|
86
88
|
storage_access=storage_access,
|
@@ -94,7 +96,8 @@ class MPIExecutor(HighThroughputExecutor):
|
|
94
96
|
address_probe_timeout=address_probe_timeout,
|
95
97
|
worker_logdir_root=worker_logdir_root,
|
96
98
|
block_error_handler=block_error_handler,
|
97
|
-
encrypted=encrypted
|
99
|
+
encrypted=encrypted,
|
100
|
+
remote_monitoring_radio=remote_monitoring_radio
|
98
101
|
)
|
99
102
|
self.enable_mpi_mode = True
|
100
103
|
self.mpi_launcher = mpi_launcher
|
@@ -1,62 +1,70 @@
|
|
1
|
-
import argparse
|
2
1
|
import logging
|
3
|
-
import
|
4
|
-
import
|
2
|
+
import pickle
|
3
|
+
from contextlib import ExitStack
|
4
|
+
from typing import Optional
|
5
5
|
|
6
6
|
import zmq
|
7
|
-
from zmq.utils.monitor import recv_monitor_message
|
8
7
|
|
9
|
-
from parsl
|
8
|
+
from parsl import curvezmq
|
10
9
|
|
11
10
|
logger = logging.getLogger(__name__)
|
12
11
|
|
13
12
|
|
14
|
-
def probe_addresses(
|
13
|
+
def probe_addresses(
|
14
|
+
zmq_context: curvezmq.ClientContext,
|
15
|
+
addresses: str,
|
16
|
+
timeout_ms: int = 120_000,
|
17
|
+
identity: Optional[bytes] = None,
|
18
|
+
):
|
15
19
|
"""
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
20
|
+
Given a single-line CSV list of addresses, return the first proven valid address.
|
21
|
+
|
22
|
+
This function will connect to each address in ``addresses`` (a comma-separated
|
23
|
+
list of URLs) and attempt to send a CONNECTION_PROBE packet. Returns the first
|
24
|
+
address that receives a response.
|
25
|
+
|
26
|
+
If no address receives a response within the ``timeout_ms`` (specified in
|
27
|
+
milliseconds), then raise ``ConnectionError``.
|
28
|
+
|
29
|
+
:param zmq_context: A ZMQ Context; the call-site may provide an encrypted ZMQ
|
30
|
+
context for assurance that the returned address is the expected and correct
|
31
|
+
endpoint
|
32
|
+
:param addresses: a comma-separated string of addresses to attempt. Example:
|
33
|
+
``tcp://127.0.0.1:1234,tcp://[3812::03aa]:5678``
|
34
|
+
:param timeout_ms: how long to wait for a response from the probes. The probes
|
35
|
+
are initiated and await concurrently, so this timeout will be the total wall
|
36
|
+
time in the worst case of "no addresses are valid."
|
37
|
+
:param identity: a ZMQ connection identity; used for logging connection probes
|
38
|
+
at the interchange
|
39
|
+
:raises: ``ConnectionError`` if no addresses are determined valid
|
40
|
+
:returns: a single address, the first one that received a response
|
29
41
|
"""
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
for addr in addr_map:
|
57
|
-
addr_map[addr]['sock'].close()
|
58
|
-
|
59
|
-
return first_connected
|
42
|
+
if not addresses:
|
43
|
+
raise ValueError("No address to probe!")
|
44
|
+
|
45
|
+
sock_map = {}
|
46
|
+
urls = addresses.split(",")
|
47
|
+
with ExitStack() as stk:
|
48
|
+
poller = zmq.Poller()
|
49
|
+
for url in urls:
|
50
|
+
logger.debug("Testing ZMQ connection to url: %s", url)
|
51
|
+
s: zmq.Socket = stk.enter_context(zmq_context.socket(zmq.DEALER))
|
52
|
+
s.setsockopt(zmq.LINGER, 0)
|
53
|
+
s.setsockopt(zmq.IPV6, True)
|
54
|
+
if identity:
|
55
|
+
s.setsockopt(zmq.IDENTITY, identity)
|
56
|
+
stk.enter_context(s.connect(url))
|
57
|
+
poller.register(s, zmq.POLLIN)
|
58
|
+
sock_map[s] = url
|
59
|
+
|
60
|
+
s.send(pickle.dumps({'type': 'connection_probe'}))
|
61
|
+
|
62
|
+
for sock, evt in poller.poll(timeout=timeout_ms):
|
63
|
+
sock.recv() # clear the buffer for good netizenry
|
64
|
+
return sock_map.get(sock)
|
65
|
+
|
66
|
+
addys = ", ".join(urls) # just slightly more human friendly
|
67
|
+
raise ConnectionError(f"No viable ZMQ url from: {addys}")
|
60
68
|
|
61
69
|
|
62
70
|
class TestWorker:
|
@@ -84,6 +92,10 @@ class TestWorker:
|
|
84
92
|
|
85
93
|
|
86
94
|
if __name__ == "__main__":
|
95
|
+
import argparse
|
96
|
+
import uuid
|
97
|
+
|
98
|
+
from parsl.addresses import get_all_addresses, tcp_url
|
87
99
|
|
88
100
|
parser = argparse.ArgumentParser()
|
89
101
|
parser.add_argument("-p", "--port", required=True,
|