parsl 2025.2.3__py3-none-any.whl → 2025.2.17__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- parsl/executors/high_throughput/executor.py +6 -0
- parsl/executors/high_throughput/interchange.py +7 -0
- parsl/executors/high_throughput/manager_record.py +2 -1
- parsl/executors/high_throughput/mpi_resource_management.py +8 -7
- parsl/executors/high_throughput/process_worker_pool.py +220 -156
- parsl/monitoring/visualization/plots/default/workflow_plots.py +2 -2
- parsl/tests/test_htex/test_managers_command.py +18 -1
- parsl/version.py +1 -1
- {parsl-2025.2.3.data → parsl-2025.2.17.data}/scripts/interchange.py +7 -0
- {parsl-2025.2.3.data → parsl-2025.2.17.data}/scripts/process_worker_pool.py +220 -156
- {parsl-2025.2.3.dist-info → parsl-2025.2.17.dist-info}/METADATA +2 -2
- {parsl-2025.2.3.dist-info → parsl-2025.2.17.dist-info}/RECORD +18 -18
- {parsl-2025.2.3.data → parsl-2025.2.17.data}/scripts/exec_parsl_function.py +0 -0
- {parsl-2025.2.3.data → parsl-2025.2.17.data}/scripts/parsl_coprocess.py +0 -0
- {parsl-2025.2.3.dist-info → parsl-2025.2.17.dist-info}/LICENSE +0 -0
- {parsl-2025.2.3.dist-info → parsl-2025.2.17.dist-info}/WHEEL +0 -0
- {parsl-2025.2.3.dist-info → parsl-2025.2.17.dist-info}/entry_points.txt +0 -0
- {parsl-2025.2.3.dist-info → parsl-2025.2.17.dist-info}/top_level.txt +0 -0

parsl/executors/high_throughput/executor.py

@@ -617,6 +617,12 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin, UsageIn
         """
         return self.command_client.run("MANAGERS")

+    def connected_managers_packages(self) -> Dict[str, Dict[str, str]]:
+        """Returns a dict mapping each manager ID to a dict of installed
+        packages and their versions
+        """
+        return self.command_client.run("MANAGERS_PACKAGES")
+
     def connected_blocks(self) -> List[str]:
         """List of connected block ids"""
         return self.command_client.run("CONNECTED_BLOCKS")
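The executor side of this release adds a `MANAGERS_PACKAGES` command alongside the existing `MANAGERS` command. A minimal usage sketch (assuming `htex` names an already-started `HighThroughputExecutor`; the variable is illustrative, not part of the diff):

    # Hypothetical usage; `htex` is assumed to be a started HighThroughputExecutor.
    packages_by_manager = htex.connected_managers_packages()
    for manager_id, packages in packages_by_manager.items():
        # Each value is a {package name: version} dict reported by that manager.
        print(manager_id, packages.get("parsl"))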

parsl/executors/high_throughput/interchange.py

@@ -257,6 +257,13 @@ class Interchange:
                             'draining': m['draining']}
                     reply.append(resp)

+            elif command_req == "MANAGERS_PACKAGES":
+                reply = {}
+                for manager_id in self._ready_managers:
+                    m = self._ready_managers[manager_id]
+                    manager_id_str = manager_id.decode('utf-8')
+                    reply[manager_id_str] = m["packages"]
+
             elif command_req.startswith("HOLD_WORKER"):
                 cmd, s_manager = command_req.split(';')
                 manager_id = s_manager.encode('utf-8')
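The interchange handler mirrors the existing `MANAGERS` branch but builds a dict keyed by the decoded manager ID instead of a list. An illustrative reply shape (manager IDs and versions invented for the example):

    # Invented data; shows only the structure of a MANAGERS_PACKAGES reply.
    reply = {
        "7f3d2c1a9b8e": {"parsl": "2025.2.17", "dill": "0.3.9"},
        "0a1b2c3d4e5f": {"parsl": "2025.2.17", "dill": "0.3.9"},
    }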

parsl/executors/high_throughput/manager_record.py

@@ -1,5 +1,5 @@
 from datetime import datetime
-from typing import Any, List, Optional
+from typing import Any, Dict, List, Optional

 from typing_extensions import TypedDict

@@ -18,3 +18,4 @@ class ManagerRecord(TypedDict, total=False):
     timestamp: datetime
     parsl_version: str
     python_version: str
+    packages: Dict[str, str]
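Because `ManagerRecord` is declared with `total=False`, the new `packages` key is optional like the others. A generic sketch of the pattern (not parsl code):

    from typing import Dict
    from typing_extensions import TypedDict

    class Record(TypedDict, total=False):
        python_version: str
        packages: Dict[str, str]

    # total=False makes every key optional, so both records type-check.
    r1: Record = {"python_version": "3.12.4"}
    r2: Record = {"python_version": "3.12.4", "packages": {"parsl": "2025.2.17"}}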

parsl/executors/high_throughput/mpi_resource_management.py

@@ -5,7 +5,7 @@ import pickle
 import queue
 import subprocess
 from enum import Enum
-from typing import Dict, List
+from typing import Dict, List, Optional

 from parsl.multiprocessing import SpawnContext
 from parsl.serialize import pack_res_spec_apply_message, unpack_res_spec_apply_message

@@ -86,8 +86,8 @@ class TaskScheduler:
     def put_task(self, task) -> None:
         return self.pending_task_q.put(task)

-    def get_result(self, block: bool, timeout: float):
-        return self.pending_result_q.get(block, timeout
+    def get_result(self, block: bool = True, timeout: Optional[float] = None):
+        return self.pending_result_q.get(block, timeout)


 class MPITaskScheduler(TaskScheduler):
@@ -163,16 +163,17 @@ class MPITaskScheduler(TaskScheduler):
         _f, _args, _kwargs, resource_spec = unpack_res_spec_apply_message(task_package["buffer"])

         nodes_needed = resource_spec.get("num_nodes")
+        tid = task_package["task_id"]
         if nodes_needed:
             try:
                 allocated_nodes = self._get_nodes(nodes_needed)
             except MPINodesUnavailable:
-                logger.
+                logger.info(f"Not enough resources, placing task {tid} into backlog")
                 self._backlog_queue.put((nodes_needed, task_package))
                 return
             else:
                 resource_spec["MPI_NODELIST"] = ",".join(allocated_nodes)
-                self._map_tasks_to_nodes[
+                self._map_tasks_to_nodes[tid] = allocated_nodes
         buffer = pack_res_spec_apply_message(_f, _args, _kwargs, resource_spec)
         task_package["buffer"] = buffer
         task_package["resource_spec"] = resource_spec

@@ -190,9 +191,9 @@ class MPITaskScheduler(TaskScheduler):
         # Keep attempting to schedule tasks till we are out of resources
         self._schedule_backlog_tasks()

-    def get_result(self, block: bool, timeout: float):
+    def get_result(self, block: bool = True, timeout: Optional[float] = None):
         """Return result and relinquish provisioned nodes"""
-        result_pkl = self.pending_result_q.get(block, timeout
+        result_pkl = self.pending_result_q.get(block, timeout)
         result_dict = pickle.loads(result_pkl)
         # TODO (wardlt): If the task did not request nodes, it won't be in `self._map_tasks_to_nodes`.
         # Causes Parsl to hang. See Issue #3427
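Both `get_result` methods now default to `block=True, timeout=None`, matching the semantics of `queue.Queue.get`. A standalone illustration of the positional call form (not parsl code):

    import queue

    q: "queue.Queue[bytes]" = queue.Queue()
    q.put(b"result")
    # Positional form equivalent to q.get(block=True, timeout=None).
    item = q.get(True, None)
    assert item == b"result"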

parsl/executors/high_throughput/process_worker_pool.py

@@ -14,6 +14,7 @@ import sys
 import threading
 import time
 import uuid
+from importlib.metadata import distributions
 from multiprocessing.managers import DictProxy
 from multiprocessing.sharedctypes import Synchronized
 from typing import Dict, List, Optional, Sequence

@@ -171,18 +172,9 @@ class Manager:

         self.cert_dir = cert_dir
         self.zmq_context = curvezmq.ClientContext(self.cert_dir)
-        self.task_incoming = self.zmq_context.socket(zmq.DEALER)
-        self.task_incoming.setsockopt(zmq.IDENTITY, uid.encode('utf-8'))
-        # Linger is set to 0, so that the manager can exit even when there might be
-        # messages in the pipe
-        self.task_incoming.setsockopt(zmq.LINGER, 0)
-        self.task_incoming.connect(task_q_url)

-        self.result_outgoing = self.zmq_context.socket(zmq.DEALER)
-        self.result_outgoing.setsockopt(zmq.IDENTITY, uid.encode('utf-8'))
-        self.result_outgoing.setsockopt(zmq.LINGER, 0)
-        self.result_outgoing.connect(result_q_url)
-        logger.info("Manager connected to interchange")
+        self._task_q_url = task_q_url
+        self._result_q_url = result_q_url

         self.uid = uid
         self.block_id = block_id

@@ -214,6 +206,8 @@ class Manager:
                                      math.floor(cores_on_node / cores_per_worker))

         self._mp_manager = SpawnContext.Manager()  # Starts a server process
+        self._tasks_in_progress = self._mp_manager.dict()
+        self._stop_event = threading.Event()  # when set, will begin shutdown process

         self.monitoring_queue = self._mp_manager.Queue()
         self.pending_task_queue = SpawnContext.Queue()
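Shutdown is now coordinated through a single `threading.Event` created in `__init__` rather than per-thread kill events. A minimal standalone sketch of the `Event.wait()`-driven loop used by the watchdog below (not parsl code):

    import threading
    import time

    stop_event = threading.Event()

    def watchdog(period: float = 0.5) -> None:
        # Event.wait(period) returns False on timeout and True once the event is
        # set, so the loop both paces itself and notices shutdown promptly.
        while not stop_event.wait(period):
            pass  # periodic liveness checks would go here

    t = threading.Thread(target=watchdog, name="worker-watchdog")
    t.start()
    time.sleep(0.1)
    stop_event.set()
    t.join()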
@@ -263,6 +257,7 @@ class Manager:
                'python_v': "{}.{}.{}".format(sys.version_info.major,
                                              sys.version_info.minor,
                                              sys.version_info.micro),
+               'packages': {dist.metadata['Name']: dist.version for dist in distributions()},
                'worker_count': self.worker_count,
                'uid': self.uid,
                'block_id': self.block_id,
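The registration message now reports every distribution visible in the worker environment via `importlib.metadata.distributions()`. The same expression can be run on its own to see what a manager would send:

    from importlib.metadata import distributions

    # {distribution name: version} for every package importable in this environment.
    packages = {dist.metadata['Name']: dist.version for dist in distributions()}
    print(len(packages), packages.get('parsl', 'parsl not installed'))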
@@ -278,46 +273,52 @@ class Manager:
         b_msg = json.dumps(msg).encode('utf-8')
         return b_msg

-
+    @staticmethod
+    def heartbeat_to_incoming(task_incoming: zmq.Socket) -> None:
         """ Send heartbeat to the incoming task queue
         """
         msg = {'type': 'heartbeat'}
         # don't need to dumps and encode this every time - could do as a global on import?
         b_msg = json.dumps(msg).encode('utf-8')
-
+        task_incoming.send(b_msg)
         logger.debug("Sent heartbeat")

-
+    @staticmethod
+    def drain_to_incoming(task_incoming: zmq.Socket) -> None:
         """ Send heartbeat to the incoming task queue
         """
         msg = {'type': 'drain'}
         b_msg = json.dumps(msg).encode('utf-8')
-
+        task_incoming.send(b_msg)
         logger.debug("Sent drain")

     @wrap_with_logs
-    def pull_tasks(self
+    def pull_tasks(self):
         """ Pull tasks from the incoming tasks zmq pipe onto the internal
         pending task queue
-
-        Parameters:
-        -----------
-        kill_event : threading.Event
-              Event to let the thread know when it is time to die.
         """
         logger.info("starting")
+
+        # Linger is set to 0, so that the manager can exit even when there might be
+        # messages in the pipe
+        task_incoming = self.zmq_context.socket(zmq.DEALER)
+        task_incoming.setsockopt(zmq.IDENTITY, self.uid.encode('utf-8'))
+        task_incoming.setsockopt(zmq.LINGER, 0)
+        task_incoming.connect(self._task_q_url)
+        logger.info("Manager task pipe connected to interchange")
+
         poller = zmq.Poller()
-        poller.register(
+        poller.register(task_incoming, zmq.POLLIN)

         # Send a registration message
         msg = self.create_reg_message()
         logger.debug("Sending registration message: {}".format(msg))
-
+        task_incoming.send(msg)
         last_beat = time.time()
         last_interchange_contact = time.time()
         task_recv_counter = 0

-        while not
+        while not self._stop_event.is_set():

             # This loop will sit inside poller.poll until either a message
             # arrives or one of these event times is reached. This code
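The task socket is now created inside the thread that uses it instead of in `__init__`, which avoids sharing a ZeroMQ socket across threads (ZeroMQ sockets are not thread-safe). A minimal sketch of the pattern with pyzmq (endpoint URL and identity are placeholders):

    import threading
    import zmq

    ctx = zmq.Context()

    def puller(url: str) -> None:
        # Create, use, and close the DEALER socket entirely within this thread.
        sock = ctx.socket(zmq.DEALER)
        sock.setsockopt(zmq.IDENTITY, b"manager-1")
        sock.setsockopt(zmq.LINGER, 0)  # allow exit even with queued messages
        sock.connect(url)
        # ... a poll/recv loop would run here ...
        sock.close()

    t = threading.Thread(target=puller, args=("tcp://127.0.0.1:54000",), name="Task-Puller")
    t.start()
    t.join()
    ctx.term()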
@@ -339,12 +340,12 @@ class Manager:
                                                                        pending_task_count))

             if time.time() >= last_beat + self.heartbeat_period:
-                self.heartbeat_to_incoming()
+                self.heartbeat_to_incoming(task_incoming)
                 last_beat = time.time()

             if time.time() > self.drain_time:
                 logger.info("Requesting drain")
-                self.drain_to_incoming()
+                self.drain_to_incoming(task_incoming)
                 # This will start the pool draining...
                 # Drained exit behaviour does not happen here. It will be
                 # driven by the interchange sending a DRAINED_CODE message.

@@ -356,8 +357,8 @@ class Manager:
             poll_duration_s = max(0, next_interesting_event_time - time.time())
             socks = dict(poller.poll(timeout=poll_duration_s * 1000))

-            if
-                _, pkl_msg =
+            if socks.get(task_incoming) == zmq.POLLIN:
+                _, pkl_msg = task_incoming.recv_multipart()
                 tasks = pickle.loads(pkl_msg)
                 last_interchange_contact = time.time()


@@ -365,7 +366,7 @@ class Manager:
                     logger.debug("Got heartbeat from interchange")
                 elif tasks == DRAINED_CODE:
                     logger.info("Got fully drained message from interchange - setting kill flag")
-
+                    self._stop_event.set()
                 else:
                     task_recv_counter += len(tasks)
                     logger.debug("Got executor tasks: {}, cumulative count of tasks: {}".format(

@@ -381,22 +382,27 @@ class Manager:
                 # Only check if no messages were received.
                 if time.time() >= last_interchange_contact + self.heartbeat_threshold:
                     logger.critical("Missing contact with interchange beyond heartbeat_threshold")
-
+                    self._stop_event.set()
                     logger.critical("Exiting")
                     break

+        task_incoming.close()
+        logger.info("Exiting")
+
     @wrap_with_logs
-    def push_results(self
+    def push_results(self):
         """ Listens on the pending_result_queue and sends out results via zmq
-
-        Parameters:
-        -----------
-        kill_event : threading.Event
-              Event to let the thread know when it is time to die.
         """
-
         logger.debug("Starting result push thread")

+        # Linger is set to 0, so that the manager can exit even when there might be
+        # messages in the pipe
+        result_outgoing = self.zmq_context.socket(zmq.DEALER)
+        result_outgoing.setsockopt(zmq.IDENTITY, self.uid.encode('utf-8'))
+        result_outgoing.setsockopt(zmq.LINGER, 0)
+        result_outgoing.connect(self._result_q_url)
+        logger.info("Manager result pipe connected to interchange")
+
         push_poll_period = max(10, self.poll_period) / 1000  # push_poll_period must be atleast 10 ms
         logger.debug("push poll period: {}".format(push_poll_period))

@@ -404,7 +410,7 @@ class Manager:
         last_result_beat = time.time()
         items = []

-        while not
+        while not self._stop_event.is_set():
             try:
                 logger.debug("Starting pending_result_queue get")
                 r = self.task_scheduler.get_result(block=True, timeout=push_poll_period)

@@ -425,7 +431,7 @@ class Manager:
                 last_beat = time.time()
                 if items:
                     logger.debug(f"Result send: Pushing {len(items)} items")
-
+                    result_outgoing.send_multipart(items)
                     logger.debug("Result send: Pushed")
                     items = []
                 else:

@@ -433,21 +439,15 @@ class Manager:
             else:
                 logger.debug(f"Result send: check condition not met - deferring {len(items)} result items")

-
+        result_outgoing.close()
+        logger.info("Exiting")

     @wrap_with_logs
-    def worker_watchdog(self
-        """Keeps workers alive.
-
-        Parameters:
-        -----------
-        kill_event : threading.Event
-              Event to let the thread know when it is time to die.
-        """
-
+    def worker_watchdog(self):
+        """Keeps workers alive."""
         logger.debug("Starting worker watchdog")

-        while not
+        while not self._stop_event.wait(self.heartbeat_period):
             for worker_id, p in self.procs.items():
                 if not p.is_alive():
                     logger.error("Worker {} has died".format(worker_id))

@@ -473,7 +473,7 @@ class Manager:
         logger.critical("Exiting")

     @wrap_with_logs
-    def handle_monitoring_messages(self
+    def handle_monitoring_messages(self):
         """Transfer messages from the managed monitoring queue to the result queue.

         We separate the queues so that the result queue does not rely on a manager

@@ -487,7 +487,7 @@ class Manager:

         poll_period_s = max(10, self.poll_period) / 1000  # Must be at least 10 ms

-        while not
+        while not self._stop_event.is_set():
             try:
                 logger.debug("Starting monitor_queue.get()")
                 msg = self.monitoring_queue.get(block=True, timeout=poll_period_s)

@@ -507,9 +507,6 @@ class Manager:

         TODO: Move task receiving to a thread
         """
-        self._kill_event = threading.Event()
-        self._tasks_in_progress = self._mp_manager.dict()
-
         self.procs = {}
         for worker_id in range(self.worker_count):
             p = self._start_worker(worker_id)
@@ -517,34 +514,32 @@ class Manager:

         logger.debug("Workers started")

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        self._worker_watchdog_thread.start()
-        self._monitoring_handler_thread.start()
+        thr_task_puller = threading.Thread(target=self.pull_tasks, name="Task-Puller")
+        thr_result_pusher = threading.Thread(
+            target=self.push_results, name="Result-Pusher"
+        )
+        thr_worker_watchdog = threading.Thread(
+            target=self.worker_watchdog, name="worker-watchdog"
+        )
+        thr_monitoring_handler = threading.Thread(
+            target=self.handle_monitoring_messages, name="Monitoring-Handler"
+        )
+
+        thr_task_puller.start()
+        thr_result_pusher.start()
+        thr_worker_watchdog.start()
+        thr_monitoring_handler.start()

         logger.info("Manager threads started")

         # This might need a multiprocessing event to signal back.
-        self.
+        self._stop_event.wait()
         logger.critical("Received kill event, terminating worker processes")

-
-
-
-
+        thr_task_puller.join()
+        thr_result_pusher.join()
+        thr_worker_watchdog.join()
+        thr_monitoring_handler.join()
         for proc_id in self.procs:
             self.procs[proc_id].terminate()
             logger.critical("Terminating worker {}: is_alive()={}".format(self.procs[proc_id],

@@ -552,8 +547,6 @@ class Manager:
             self.procs[proc_id].join()
             logger.debug("Worker {} joined successfully".format(self.procs[proc_id]))

-        self.task_incoming.close()
-        self.result_outgoing.close()
         self.zmq_context.term()
         delta = time.time() - self._start_time
         logger.info("process_worker_pool ran for {} seconds".format(delta))
@@ -809,95 +802,166 @@ def start_file_logger(filename, rank, name='parsl', level=logging.DEBUG, format_
     return logger


-
-
-    parser = argparse.ArgumentParser()
-    parser.add_argument("-d", "--debug", action='store_true',
-                        help="Enable logging at DEBUG level")
-    parser.add_argument("-a", "--addresses", default='',
-                        help="Comma separated list of addresses at which the interchange could be reached")
-    parser.add_argument("--cert_dir", required=True,
-                        help="Path to certificate directory.")
-    parser.add_argument("-l", "--logdir", default="process_worker_pool_logs",
-                        help="Process worker pool log directory")
-    parser.add_argument("-u", "--uid", default=str(uuid.uuid4()).split('-')[-1],
-                        help="Unique identifier string for Manager")
-    parser.add_argument("-b", "--block_id", default=None,
-                        help="Block identifier for Manager")
-    parser.add_argument("-c", "--cores_per_worker", default="1.0",
-                        help="Number of cores assigned to each worker process. Default=1.0")
-    parser.add_argument("-m", "--mem_per_worker", default=0,
-                        help="GB of memory assigned to each worker process. Default=0, no assignment")
-    parser.add_argument("-t", "--task_port", required=True,
-                        help="REQUIRED: Task port for receiving tasks from the interchange")
-    parser.add_argument("--max_workers_per_node", default=float('inf'),
-                        help="Caps the maximum workers that can be launched, default:infinity")
-    parser.add_argument("-p", "--prefetch_capacity", default=0,
-                        help="Number of tasks that can be prefetched to the manager. Default is 0.")
-    parser.add_argument("--hb_period", default=30,
-                        help="Heartbeat period in seconds. Uses manager default unless set")
-    parser.add_argument("--hb_threshold", default=120,
-                        help="Heartbeat threshold in seconds. Uses manager default unless set")
-    parser.add_argument("--drain_period", default=None,
-                        help="Drain this pool after specified number of seconds. By default, does not drain.")
-    parser.add_argument("--address_probe_timeout", default=30,
-                        help="Timeout to probe for viable address to interchange. Default: 30s")
-    parser.add_argument("--poll", default=10,
-                        help="Poll period used in milliseconds")
-    parser.add_argument("-r", "--result_port", required=True,
-                        help="REQUIRED: Result port for posting results to the interchange")
+def get_arg_parser() -> argparse.ArgumentParser:

     def strategyorlist(s: str):
-
+        s = s.lower()
+        allowed_strategies = ("none", "block", "alternating", "block-reverse")
         if s in allowed_strategies:
             return s
         elif s[0:4] == "list":
             return s
-
-
-
-    parser.
-
-
-
-
-
-
-
-
+        err_msg = f"cpu-affinity must be one of {allowed_strategies} or a list format"
+        raise argparse.ArgumentTypeError(err_msg)
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "-d", "--debug", action='store_true', help="Enable logging at DEBUG level",
+    )
+    parser.add_argument(
+        "-a",
+        "--addresses",
+        required=True,
+        help="Comma separated list of addresses at which the interchange could be reached",
+    )
+    parser.add_argument(
+        "--cert_dir", required=True, help="Path to certificate directory."
+    )
+    parser.add_argument(
+        "-l",
+        "--logdir",
+        default="process_worker_pool_logs",
+        help="Process worker pool log directory",
+    )
+    parser.add_argument(
+        "-u",
+        "--uid",
+        default=str(uuid.uuid4()).split('-')[-1],
+        help="Unique identifier string for Manager",
+    )
+    parser.add_argument(
+        "-b", "--block_id", default=None, help="Block identifier for Manager"
+    )
+    parser.add_argument(
+        "-c",
+        "--cores_per_worker",
+        default="1.0",
+        help="Number of cores assigned to each worker process. Default=1.0",
+    )
+    parser.add_argument(
+        "-m",
+        "--mem_per_worker",
+        default=0,
+        help="GB of memory assigned to each worker process. Default=0, no assignment",
+    )
+    parser.add_argument(
+        "-t",
+        "--task_port",
+        required=True,
+        help="Task port for receiving tasks from the interchange",
+    )
+    parser.add_argument(
+        "--max_workers_per_node",
+        default=float('inf'),
+        help="Caps the maximum workers that can be launched, default:infinity",
+    )
+    parser.add_argument(
+        "-p",
+        "--prefetch_capacity",
+        default=0,
+        help="Number of tasks that can be prefetched to the manager. Default is 0.",
+    )
+    parser.add_argument(
+        "--hb_period",
+        default=30,
+        help="Heartbeat period in seconds. Uses manager default unless set",
+    )
+    parser.add_argument(
+        "--hb_threshold",
+        default=120,
+        help="Heartbeat threshold in seconds. Uses manager default unless set",
+    )
+    parser.add_argument(
+        "--drain_period",
+        default=None,
+        help="Drain this pool after specified number of seconds. By default, does not drain.",
+    )
+    parser.add_argument(
+        "--address_probe_timeout",
+        default=30,
+        help="Timeout to probe for viable address to interchange. Default: 30s",
+    )
+    parser.add_argument(
+        "--poll", default=10, help="Poll period used in milliseconds"
+    )
+    parser.add_argument(
+        "-r",
+        "--result_port",
+        required=True,
+        help="Result port for posting results to the interchange",
+    )
+    parser.add_argument(
+        "--cpu-affinity",
+        type=strategyorlist,
+        required=True,
+        help="Whether/how workers should control CPU affinity.",
+    )
+    parser.add_argument(
+        "--available-accelerators",
+        type=str,
+        nargs="*",
+        default=[],
+        help="Names of available accelerators, if not given assumed to be zero accelerators available",
+    )
+    parser.add_argument(
+        "--enable_mpi_mode", action='store_true', help="Enable MPI mode"
+    )
+    parser.add_argument(
+        "--mpi-launcher",
+        type=str,
+        choices=VALID_LAUNCHERS,
+        help="MPI launcher to use iff enable_mpi_mode=true",
+    )
+
+    return parser
+

+if __name__ == "__main__":
+    parser = get_arg_parser()
     args = parser.parse_args()

     os.makedirs(os.path.join(args.logdir, "block-{}".format(args.block_id), args.uid), exist_ok=True)

+    logger = start_file_logger(
+        f'{args.logdir}/block-{args.block_id}/{args.uid}/manager.log',
+        0,
+        level=logging.DEBUG if args.debug is True else logging.INFO
+    )
+    logger.info(
+        f"\n Python version: {sys.version}"
+        f" Debug logging: {args.debug}"
+        f" Certificates dir: {args.cert_dir}"
+        f" Log dir: {args.logdir}"
+        f" Manager ID: {args.uid}"
+        f" Block ID: {args.block_id}"
+        f" cores_per_worker: {args.cores_per_worker}"
+        f" mem_per_worker: {args.mem_per_worker}"
+        f" task_port: {args.task_port}"
+        f" result_port: {args.result_port}"
+        f" addresses: {args.addresses}"
+        f" max_workers_per_node: {args.max_workers_per_node}"
+        f" poll_period: {args.poll}"
+        f" address_probe_timeout: {args.address_probe_timeout}"
+        f" Prefetch capacity: {args.prefetch_capacity}"
+        f" Heartbeat threshold: {args.hb_threshold}"
+        f" Heartbeat period: {args.hb_period}"
+        f" Drain period: {args.drain_period}"
+        f" CPU affinity: {args.cpu_affinity}"
+        f" Accelerators: {' '.join(args.available_accelerators)}"
+        f" enable_mpi_mode: {args.enable_mpi_mode}"
+        f" mpi_launcher: {args.mpi_launcher}"
+    )
     try:
-        logger = start_file_logger('{}/block-{}/{}/manager.log'.format(args.logdir, args.block_id, args.uid),
-                                   0,
-                                   level=logging.DEBUG if args.debug is True else logging.INFO)
-
-        logger.info("Python version: {}".format(sys.version))
-        logger.info("Debug logging: {}".format(args.debug))
-        logger.info("Certificates dir: {}".format(args.cert_dir))
-        logger.info("Log dir: {}".format(args.logdir))
-        logger.info("Manager ID: {}".format(args.uid))
-        logger.info("Block ID: {}".format(args.block_id))
-        logger.info("cores_per_worker: {}".format(args.cores_per_worker))
-        logger.info("mem_per_worker: {}".format(args.mem_per_worker))
-        logger.info("task_port: {}".format(args.task_port))
-        logger.info("result_port: {}".format(args.result_port))
-        logger.info("addresses: {}".format(args.addresses))
-        logger.info("max_workers_per_node: {}".format(args.max_workers_per_node))
-        logger.info("poll_period: {}".format(args.poll))
-        logger.info("address_probe_timeout: {}".format(args.address_probe_timeout))
-        logger.info("Prefetch capacity: {}".format(args.prefetch_capacity))
-        logger.info("Heartbeat threshold: {}".format(args.hb_threshold))
-        logger.info("Heartbeat period: {}".format(args.hb_period))
-        logger.info("Drain period: {}".format(args.drain_period))
-        logger.info("CPU affinity: {}".format(args.cpu_affinity))
-        logger.info("Accelerators: {}".format(" ".join(args.available_accelerators)))
-        logger.info("enable_mpi_mode: {}".format(args.enable_mpi_mode))
-        logger.info("mpi_launcher: {}".format(args.mpi_launcher))
-
         manager = Manager(task_port=args.task_port,
                           result_port=args.result_port,
                           addresses=args.addresses,
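Moving parser construction into `get_arg_parser()` makes the CLI definition importable without starting a pool. A hedged sketch of exercising it (argument values are placeholders; assumes the installed package's module path):

    from parsl.executors.high_throughput.process_worker_pool import get_arg_parser

    parser = get_arg_parser()
    args = parser.parse_args([
        "--addresses", "127.0.0.1",
        "--cert_dir", "/tmp/certs",      # placeholder path
        "--task_port", "54000",
        "--result_port", "54001",
        "--cpu-affinity", "none",
    ])
    print(args.addresses, args.cpu_affinity, args.max_workers_per_node)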

parsl/monitoring/visualization/plots/default/workflow_plots.py

@@ -290,10 +290,10 @@ def workflow_dag_plot(df_tasks, group_by_apps=True):
     edge_trace['y'] += tuple([y0, y1, None])

     # Create figure:
+    title = go.layout.Title(text='Workflow DAG', font=dict(size=16))
     fig = go.Figure(data=[edge_trace] + node_traces,
                     layout=go.Layout(
-                    title='Workflow DAG',
-                    titlefont=dict(size=16),
+                    title=title,
                     showlegend=True,
                     hovermode='closest',
                     margin=dict(b=20, l=5, r=5, t=40),  # noqa: E741
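The `titlefont=` keyword is deprecated in recent plotly releases; wrapping the text and font in `go.layout.Title` keeps the same styling. A standalone sketch (assumes plotly is installed):

    import plotly.graph_objects as go

    # Equivalent of the old title= / titlefont= pair, expressed via go.layout.Title.
    title = go.layout.Title(text='Workflow DAG', font=dict(size=16))
    fig = go.Figure(layout=go.Layout(title=title, showlegend=True))
    print(fig.layout.title.text)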