parsl 2025.2.10__py3-none-any.whl → 2025.2.24__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -617,6 +617,12 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin, UsageIn
  """
  return self.command_client.run("MANAGERS")

+ def connected_managers_packages(self) -> Dict[str, Dict[str, str]]:
+ """Returns a dict mapping each manager ID to a dict of installed
+ packages and their versions
+ """
+ return self.command_client.run("MANAGERS_PACKAGES")
+
  def connected_blocks(self) -> List[str]:
  """List of connected block ids"""
  return self.command_client.run("CONNECTED_BLOCKS")
@@ -257,6 +257,13 @@ class Interchange:
  'draining': m['draining']}
  reply.append(resp)

+ elif command_req == "MANAGERS_PACKAGES":
+ reply = {}
+ for manager_id in self._ready_managers:
+ m = self._ready_managers[manager_id]
+ manager_id_str = manager_id.decode('utf-8')
+ reply[manager_id_str] = m["packages"]
+
  elif command_req.startswith("HOLD_WORKER"):
  cmd, s_manager = command_req.split(';')
  manager_id = s_manager.encode('utf-8')
@@ -1,5 +1,5 @@
  from datetime import datetime
- from typing import Any, List, Optional
+ from typing import Any, Dict, List, Optional

  from typing_extensions import TypedDict

@@ -18,3 +18,4 @@ class ManagerRecord(TypedDict, total=False):
  timestamp: datetime
  parsl_version: str
  python_version: str
+ packages: Dict[str, str]
@@ -5,7 +5,7 @@ import pickle
  import queue
  import subprocess
  from enum import Enum
- from typing import Dict, List
+ from typing import Dict, List, Optional

  from parsl.multiprocessing import SpawnContext
  from parsl.serialize import pack_res_spec_apply_message, unpack_res_spec_apply_message
@@ -86,8 +86,8 @@ class TaskScheduler:
  def put_task(self, task) -> None:
  return self.pending_task_q.put(task)

- def get_result(self, block: bool, timeout: float):
- return self.pending_result_q.get(block, timeout=timeout)
+ def get_result(self, block: bool = True, timeout: Optional[float] = None):
+ return self.pending_result_q.get(block, timeout)


  class MPITaskScheduler(TaskScheduler):
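The new defaults mirror the standard library queue API, where `Queue.get(block=True, timeout=None)` blocks until an item is available; a minimal sketch against a plain `queue.Queue` (not Parsl's actual pending-result queue):

    import queue

    q: "queue.Queue[bytes]" = queue.Queue()
    q.put(b"pickled-result")
    # Positional form matches queue.Queue.get(block=True, timeout=None)
    item = q.get(True, None)
    print(item)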
@@ -163,16 +163,17 @@ class MPITaskScheduler(TaskScheduler):
  _f, _args, _kwargs, resource_spec = unpack_res_spec_apply_message(task_package["buffer"])

  nodes_needed = resource_spec.get("num_nodes")
+ tid = task_package["task_id"]
  if nodes_needed:
  try:
  allocated_nodes = self._get_nodes(nodes_needed)
  except MPINodesUnavailable:
- logger.warning("Not enough resources, placing task into backlog")
+ logger.info(f"Not enough resources, placing task {tid} into backlog")
  self._backlog_queue.put((nodes_needed, task_package))
  return
  else:
  resource_spec["MPI_NODELIST"] = ",".join(allocated_nodes)
- self._map_tasks_to_nodes[task_package["task_id"]] = allocated_nodes
+ self._map_tasks_to_nodes[tid] = allocated_nodes
  buffer = pack_res_spec_apply_message(_f, _args, _kwargs, resource_spec)
  task_package["buffer"] = buffer
  task_package["resource_spec"] = resource_spec
@@ -190,9 +191,9 @@ class MPITaskScheduler(TaskScheduler):
  # Keep attempting to schedule tasks till we are out of resources
  self._schedule_backlog_tasks()

- def get_result(self, block: bool, timeout: float):
+ def get_result(self, block: bool = True, timeout: Optional[float] = None):
  """Return result and relinquish provisioned nodes"""
- result_pkl = self.pending_result_q.get(block, timeout=timeout)
+ result_pkl = self.pending_result_q.get(block, timeout)
  result_dict = pickle.loads(result_pkl)
  # TODO (wardlt): If the task did not request nodes, it won't be in `self._map_tasks_to_nodes`.
  # Causes Parsl to hang. See Issue #3427
@@ -14,6 +14,7 @@ import sys
  import threading
  import time
  import uuid
+ from importlib.metadata import distributions
  from multiprocessing.managers import DictProxy
  from multiprocessing.sharedctypes import Synchronized
  from typing import Dict, List, Optional, Sequence
@@ -171,18 +172,9 @@ class Manager:

  self.cert_dir = cert_dir
  self.zmq_context = curvezmq.ClientContext(self.cert_dir)
- self.task_incoming = self.zmq_context.socket(zmq.DEALER)
- self.task_incoming.setsockopt(zmq.IDENTITY, uid.encode('utf-8'))
- # Linger is set to 0, so that the manager can exit even when there might be
- # messages in the pipe
- self.task_incoming.setsockopt(zmq.LINGER, 0)
- self.task_incoming.connect(task_q_url)

- self.result_outgoing = self.zmq_context.socket(zmq.DEALER)
- self.result_outgoing.setsockopt(zmq.IDENTITY, uid.encode('utf-8'))
- self.result_outgoing.setsockopt(zmq.LINGER, 0)
- self.result_outgoing.connect(result_q_url)
- logger.info("Manager connected to interchange")
+ self._task_q_url = task_q_url
+ self._result_q_url = result_q_url

  self.uid = uid
  self.block_id = block_id
@@ -214,6 +206,8 @@ class Manager:
  math.floor(cores_on_node / cores_per_worker))

  self._mp_manager = SpawnContext.Manager() # Starts a server process
+ self._tasks_in_progress = self._mp_manager.dict()
+ self._stop_event = threading.Event() # when set, will begin shutdown process

  self.monitoring_queue = self._mp_manager.Queue()
  self.pending_task_queue = SpawnContext.Queue()
@@ -263,6 +257,7 @@ class Manager:
  'python_v': "{}.{}.{}".format(sys.version_info.major,
  sys.version_info.minor,
  sys.version_info.micro),
+ 'packages': {dist.metadata['Name']: dist.version for dist in distributions()},
  'worker_count': self.worker_count,
  'uid': self.uid,
  'block_id': self.block_id,
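The new 'packages' field is built with `importlib.metadata.distributions()` from the standard library; a minimal standalone sketch of the same mapping, run in whatever environment the worker pool uses:

    from importlib.metadata import distributions

    # distribution name -> installed version, as reported in the registration message
    packages = {dist.metadata['Name']: dist.version for dist in distributions()}
    print(len(packages), "distributions found")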
@@ -278,46 +273,52 @@ class Manager:
  b_msg = json.dumps(msg).encode('utf-8')
  return b_msg

- def heartbeat_to_incoming(self):
+ @staticmethod
+ def heartbeat_to_incoming(task_incoming: zmq.Socket) -> None:
  """ Send heartbeat to the incoming task queue
  """
  msg = {'type': 'heartbeat'}
  # don't need to dumps and encode this every time - could do as a global on import?
  b_msg = json.dumps(msg).encode('utf-8')
- self.task_incoming.send(b_msg)
+ task_incoming.send(b_msg)
  logger.debug("Sent heartbeat")

- def drain_to_incoming(self):
+ @staticmethod
+ def drain_to_incoming(task_incoming: zmq.Socket) -> None:
  """ Send heartbeat to the incoming task queue
  """
  msg = {'type': 'drain'}
  b_msg = json.dumps(msg).encode('utf-8')
- self.task_incoming.send(b_msg)
+ task_incoming.send(b_msg)
  logger.debug("Sent drain")

  @wrap_with_logs
- def pull_tasks(self, kill_event):
+ def pull_tasks(self):
  """ Pull tasks from the incoming tasks zmq pipe onto the internal
  pending task queue
-
- Parameters:
- -----------
- kill_event : threading.Event
- Event to let the thread know when it is time to die.
  """
  logger.info("starting")
+
+ # Linger is set to 0, so that the manager can exit even when there might be
+ # messages in the pipe
+ task_incoming = self.zmq_context.socket(zmq.DEALER)
+ task_incoming.setsockopt(zmq.IDENTITY, self.uid.encode('utf-8'))
+ task_incoming.setsockopt(zmq.LINGER, 0)
+ task_incoming.connect(self._task_q_url)
+ logger.info("Manager task pipe connected to interchange")
+
  poller = zmq.Poller()
- poller.register(self.task_incoming, zmq.POLLIN)
+ poller.register(task_incoming, zmq.POLLIN)

  # Send a registration message
  msg = self.create_reg_message()
  logger.debug("Sending registration message: {}".format(msg))
- self.task_incoming.send(msg)
+ task_incoming.send(msg)
  last_beat = time.time()
  last_interchange_contact = time.time()
  task_recv_counter = 0

- while not kill_event.is_set():
+ while not self._stop_event.is_set():

  # This loop will sit inside poller.poll until either a message
  # arrives or one of these event times is reached. This code
@@ -339,12 +340,12 @@ class Manager:
  pending_task_count))

  if time.time() >= last_beat + self.heartbeat_period:
- self.heartbeat_to_incoming()
+ self.heartbeat_to_incoming(task_incoming)
  last_beat = time.time()

  if time.time() > self.drain_time:
  logger.info("Requesting drain")
- self.drain_to_incoming()
+ self.drain_to_incoming(task_incoming)
  # This will start the pool draining...
  # Drained exit behaviour does not happen here. It will be
  # driven by the interchange sending a DRAINED_CODE message.
@@ -356,8 +357,8 @@ class Manager:
  poll_duration_s = max(0, next_interesting_event_time - time.time())
  socks = dict(poller.poll(timeout=poll_duration_s * 1000))

- if self.task_incoming in socks and socks[self.task_incoming] == zmq.POLLIN:
- _, pkl_msg = self.task_incoming.recv_multipart()
+ if socks.get(task_incoming) == zmq.POLLIN:
+ _, pkl_msg = task_incoming.recv_multipart()
  tasks = pickle.loads(pkl_msg)
  last_interchange_contact = time.time()

@@ -365,7 +366,7 @@ class Manager:
  logger.debug("Got heartbeat from interchange")
  elif tasks == DRAINED_CODE:
  logger.info("Got fully drained message from interchange - setting kill flag")
- kill_event.set()
+ self._stop_event.set()
  else:
  task_recv_counter += len(tasks)
  logger.debug("Got executor tasks: {}, cumulative count of tasks: {}".format(
@@ -381,22 +382,27 @@ class Manager:
  # Only check if no messages were received.
  if time.time() >= last_interchange_contact + self.heartbeat_threshold:
  logger.critical("Missing contact with interchange beyond heartbeat_threshold")
- kill_event.set()
+ self._stop_event.set()
  logger.critical("Exiting")
  break

+ task_incoming.close()
+ logger.info("Exiting")
+
  @wrap_with_logs
- def push_results(self, kill_event):
+ def push_results(self):
  """ Listens on the pending_result_queue and sends out results via zmq
-
- Parameters:
- -----------
- kill_event : threading.Event
- Event to let the thread know when it is time to die.
  """
-
  logger.debug("Starting result push thread")

+ # Linger is set to 0, so that the manager can exit even when there might be
+ # messages in the pipe
+ result_outgoing = self.zmq_context.socket(zmq.DEALER)
+ result_outgoing.setsockopt(zmq.IDENTITY, self.uid.encode('utf-8'))
+ result_outgoing.setsockopt(zmq.LINGER, 0)
+ result_outgoing.connect(self._result_q_url)
+ logger.info("Manager result pipe connected to interchange")
+
  push_poll_period = max(10, self.poll_period) / 1000 # push_poll_period must be atleast 10 ms
  logger.debug("push poll period: {}".format(push_poll_period))

@@ -404,7 +410,7 @@ class Manager:
  last_result_beat = time.time()
  items = []

- while not kill_event.is_set():
+ while not self._stop_event.is_set():
  try:
  logger.debug("Starting pending_result_queue get")
  r = self.task_scheduler.get_result(block=True, timeout=push_poll_period)
@@ -425,7 +431,7 @@ class Manager:
  last_beat = time.time()
  if items:
  logger.debug(f"Result send: Pushing {len(items)} items")
- self.result_outgoing.send_multipart(items)
+ result_outgoing.send_multipart(items)
  logger.debug("Result send: Pushed")
  items = []
  else:
@@ -433,21 +439,15 @@ class Manager:
  else:
  logger.debug(f"Result send: check condition not met - deferring {len(items)} result items")

- logger.critical("Exiting")
+ result_outgoing.close()
+ logger.info("Exiting")

  @wrap_with_logs
- def worker_watchdog(self, kill_event: threading.Event):
- """Keeps workers alive.
-
- Parameters:
- -----------
- kill_event : threading.Event
- Event to let the thread know when it is time to die.
- """
-
+ def worker_watchdog(self):
+ """Keeps workers alive."""
  logger.debug("Starting worker watchdog")

- while not kill_event.wait(self.heartbeat_period):
+ while not self._stop_event.wait(self.heartbeat_period):
  for worker_id, p in self.procs.items():
  if not p.is_alive():
  logger.error("Worker {} has died".format(worker_id))
@@ -473,7 +473,7 @@ class Manager:
  logger.critical("Exiting")

  @wrap_with_logs
- def handle_monitoring_messages(self, kill_event: threading.Event):
+ def handle_monitoring_messages(self):
  """Transfer messages from the managed monitoring queue to the result queue.

  We separate the queues so that the result queue does not rely on a manager
@@ -487,7 +487,7 @@ class Manager:

  poll_period_s = max(10, self.poll_period) / 1000 # Must be at least 10 ms

- while not kill_event.is_set():
+ while not self._stop_event.is_set():
  try:
  logger.debug("Starting monitor_queue.get()")
  msg = self.monitoring_queue.get(block=True, timeout=poll_period_s)
@@ -507,9 +507,6 @@ class Manager:

  TODO: Move task receiving to a thread
  """
- self._kill_event = threading.Event()
- self._tasks_in_progress = self._mp_manager.dict()
-
  self.procs = {}
  for worker_id in range(self.worker_count):
  p = self._start_worker(worker_id)
@@ -517,34 +514,32 @@ class Manager:

  logger.debug("Workers started")

- self._task_puller_thread = threading.Thread(target=self.pull_tasks,
- args=(self._kill_event,),
- name="Task-Puller")
- self._result_pusher_thread = threading.Thread(target=self.push_results,
- args=(self._kill_event,),
- name="Result-Pusher")
- self._worker_watchdog_thread = threading.Thread(target=self.worker_watchdog,
- args=(self._kill_event,),
- name="worker-watchdog")
- self._monitoring_handler_thread = threading.Thread(target=self.handle_monitoring_messages,
- args=(self._kill_event,),
- name="Monitoring-Handler")
-
- self._task_puller_thread.start()
- self._result_pusher_thread.start()
- self._worker_watchdog_thread.start()
- self._monitoring_handler_thread.start()
+ thr_task_puller = threading.Thread(target=self.pull_tasks, name="Task-Puller")
+ thr_result_pusher = threading.Thread(
+ target=self.push_results, name="Result-Pusher"
+ )
+ thr_worker_watchdog = threading.Thread(
+ target=self.worker_watchdog, name="worker-watchdog"
+ )
+ thr_monitoring_handler = threading.Thread(
+ target=self.handle_monitoring_messages, name="Monitoring-Handler"
+ )
+
+ thr_task_puller.start()
+ thr_result_pusher.start()
+ thr_worker_watchdog.start()
+ thr_monitoring_handler.start()

  logger.info("Manager threads started")

  # This might need a multiprocessing event to signal back.
- self._kill_event.wait()
+ self._stop_event.wait()
  logger.critical("Received kill event, terminating worker processes")

- self._task_puller_thread.join()
- self._result_pusher_thread.join()
- self._worker_watchdog_thread.join()
- self._monitoring_handler_thread.join()
+ thr_task_puller.join()
+ thr_result_pusher.join()
+ thr_worker_watchdog.join()
+ thr_monitoring_handler.join()
  for proc_id in self.procs:
  self.procs[proc_id].terminate()
  logger.critical("Terminating worker {}: is_alive()={}".format(self.procs[proc_id],
@@ -552,8 +547,6 @@ class Manager:
  self.procs[proc_id].join()
  logger.debug("Worker {} joined successfully".format(self.procs[proc_id]))

- self.task_incoming.close()
- self.result_outgoing.close()
  self.zmq_context.term()
  delta = time.time() - self._start_time
  logger.info("process_worker_pool ran for {} seconds".format(delta))
@@ -809,95 +802,166 @@ def start_file_logger(filename, rank, name='parsl', level=logging.DEBUG, format_
  return logger


- if __name__ == "__main__":
-
- parser = argparse.ArgumentParser()
- parser.add_argument("-d", "--debug", action='store_true',
- help="Enable logging at DEBUG level")
- parser.add_argument("-a", "--addresses", default='',
- help="Comma separated list of addresses at which the interchange could be reached")
- parser.add_argument("--cert_dir", required=True,
- help="Path to certificate directory.")
- parser.add_argument("-l", "--logdir", default="process_worker_pool_logs",
- help="Process worker pool log directory")
- parser.add_argument("-u", "--uid", default=str(uuid.uuid4()).split('-')[-1],
- help="Unique identifier string for Manager")
- parser.add_argument("-b", "--block_id", default=None,
- help="Block identifier for Manager")
- parser.add_argument("-c", "--cores_per_worker", default="1.0",
- help="Number of cores assigned to each worker process. Default=1.0")
- parser.add_argument("-m", "--mem_per_worker", default=0,
- help="GB of memory assigned to each worker process. Default=0, no assignment")
- parser.add_argument("-t", "--task_port", required=True,
- help="REQUIRED: Task port for receiving tasks from the interchange")
- parser.add_argument("--max_workers_per_node", default=float('inf'),
- help="Caps the maximum workers that can be launched, default:infinity")
- parser.add_argument("-p", "--prefetch_capacity", default=0,
- help="Number of tasks that can be prefetched to the manager. Default is 0.")
- parser.add_argument("--hb_period", default=30,
- help="Heartbeat period in seconds. Uses manager default unless set")
- parser.add_argument("--hb_threshold", default=120,
- help="Heartbeat threshold in seconds. Uses manager default unless set")
- parser.add_argument("--drain_period", default=None,
- help="Drain this pool after specified number of seconds. By default, does not drain.")
- parser.add_argument("--address_probe_timeout", default=30,
- help="Timeout to probe for viable address to interchange. Default: 30s")
- parser.add_argument("--poll", default=10,
- help="Poll period used in milliseconds")
- parser.add_argument("-r", "--result_port", required=True,
- help="REQUIRED: Result port for posting results to the interchange")
+ def get_arg_parser() -> argparse.ArgumentParser:

  def strategyorlist(s: str):
- allowed_strategies = ["none", "block", "alternating", "block-reverse"]
+ s = s.lower()
+ allowed_strategies = ("none", "block", "alternating", "block-reverse")
  if s in allowed_strategies:
  return s
  elif s[0:4] == "list":
  return s
- else:
- raise argparse.ArgumentTypeError("cpu-affinity must be one of {} or a list format".format(allowed_strategies))
-
- parser.add_argument("--cpu-affinity", type=strategyorlist,
- required=True,
- help="Whether/how workers should control CPU affinity.")
- parser.add_argument("--available-accelerators", type=str, nargs="*",
- help="Names of available accelerators, if not given assumed to be zero accelerators available", default=[])
- parser.add_argument("--enable_mpi_mode", action='store_true',
- help="Enable MPI mode")
- parser.add_argument("--mpi-launcher", type=str, choices=VALID_LAUNCHERS,
- help="MPI launcher to use iff enable_mpi_mode=true")
+ err_msg = f"cpu-affinity must be one of {allowed_strategies} or a list format"
+ raise argparse.ArgumentTypeError(err_msg)
+
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ "-d", "--debug", action='store_true', help="Enable logging at DEBUG level",
+ )
+ parser.add_argument(
+ "-a",
+ "--addresses",
+ required=True,
+ help="Comma separated list of addresses at which the interchange could be reached",
+ )
+ parser.add_argument(
+ "--cert_dir", required=True, help="Path to certificate directory."
+ )
+ parser.add_argument(
+ "-l",
+ "--logdir",
+ default="process_worker_pool_logs",
+ help="Process worker pool log directory",
+ )
+ parser.add_argument(
+ "-u",
+ "--uid",
+ default=str(uuid.uuid4()).split('-')[-1],
+ help="Unique identifier string for Manager",
+ )
+ parser.add_argument(
+ "-b", "--block_id", default=None, help="Block identifier for Manager"
+ )
+ parser.add_argument(
+ "-c",
+ "--cores_per_worker",
+ default="1.0",
+ help="Number of cores assigned to each worker process. Default=1.0",
+ )
+ parser.add_argument(
+ "-m",
+ "--mem_per_worker",
+ default=0,
+ help="GB of memory assigned to each worker process. Default=0, no assignment",
+ )
+ parser.add_argument(
+ "-t",
+ "--task_port",
+ required=True,
+ help="Task port for receiving tasks from the interchange",
+ )
+ parser.add_argument(
+ "--max_workers_per_node",
+ default=float('inf'),
+ help="Caps the maximum workers that can be launched, default:infinity",
+ )
+ parser.add_argument(
+ "-p",
+ "--prefetch_capacity",
+ default=0,
+ help="Number of tasks that can be prefetched to the manager. Default is 0.",
+ )
+ parser.add_argument(
+ "--hb_period",
+ default=30,
+ help="Heartbeat period in seconds. Uses manager default unless set",
+ )
+ parser.add_argument(
+ "--hb_threshold",
+ default=120,
+ help="Heartbeat threshold in seconds. Uses manager default unless set",
+ )
+ parser.add_argument(
+ "--drain_period",
+ default=None,
+ help="Drain this pool after specified number of seconds. By default, does not drain.",
+ )
+ parser.add_argument(
+ "--address_probe_timeout",
+ default=30,
+ help="Timeout to probe for viable address to interchange. Default: 30s",
+ )
+ parser.add_argument(
+ "--poll", default=10, help="Poll period used in milliseconds"
+ )
+ parser.add_argument(
+ "-r",
+ "--result_port",
+ required=True,
+ help="Result port for posting results to the interchange",
+ )
+ parser.add_argument(
+ "--cpu-affinity",
+ type=strategyorlist,
+ required=True,
+ help="Whether/how workers should control CPU affinity.",
+ )
+ parser.add_argument(
+ "--available-accelerators",
+ type=str,
+ nargs="*",
+ default=[],
+ help="Names of available accelerators, if not given assumed to be zero accelerators available",
+ )
+ parser.add_argument(
+ "--enable_mpi_mode", action='store_true', help="Enable MPI mode"
+ )
+ parser.add_argument(
+ "--mpi-launcher",
+ type=str,
+ choices=VALID_LAUNCHERS,
+ help="MPI launcher to use iff enable_mpi_mode=true",
+ )
+
+ return parser
+

+ if __name__ == "__main__":
+ parser = get_arg_parser()
  args = parser.parse_args()

  os.makedirs(os.path.join(args.logdir, "block-{}".format(args.block_id), args.uid), exist_ok=True)

+ logger = start_file_logger(
+ f'{args.logdir}/block-{args.block_id}/{args.uid}/manager.log',
+ 0,
+ level=logging.DEBUG if args.debug is True else logging.INFO
+ )
+ logger.info(
+ f"\n Python version: {sys.version}"
+ f" Debug logging: {args.debug}"
+ f" Certificates dir: {args.cert_dir}"
+ f" Log dir: {args.logdir}"
+ f" Manager ID: {args.uid}"
+ f" Block ID: {args.block_id}"
+ f" cores_per_worker: {args.cores_per_worker}"
+ f" mem_per_worker: {args.mem_per_worker}"
+ f" task_port: {args.task_port}"
+ f" result_port: {args.result_port}"
+ f" addresses: {args.addresses}"
+ f" max_workers_per_node: {args.max_workers_per_node}"
+ f" poll_period: {args.poll}"
+ f" address_probe_timeout: {args.address_probe_timeout}"
+ f" Prefetch capacity: {args.prefetch_capacity}"
+ f" Heartbeat threshold: {args.hb_threshold}"
+ f" Heartbeat period: {args.hb_period}"
+ f" Drain period: {args.drain_period}"
+ f" CPU affinity: {args.cpu_affinity}"
+ f" Accelerators: {' '.join(args.available_accelerators)}"
+ f" enable_mpi_mode: {args.enable_mpi_mode}"
+ f" mpi_launcher: {args.mpi_launcher}"
+ )
  try:
- logger = start_file_logger('{}/block-{}/{}/manager.log'.format(args.logdir, args.block_id, args.uid),
- 0,
- level=logging.DEBUG if args.debug is True else logging.INFO)
-
- logger.info("Python version: {}".format(sys.version))
- logger.info("Debug logging: {}".format(args.debug))
- logger.info("Certificates dir: {}".format(args.cert_dir))
- logger.info("Log dir: {}".format(args.logdir))
- logger.info("Manager ID: {}".format(args.uid))
- logger.info("Block ID: {}".format(args.block_id))
- logger.info("cores_per_worker: {}".format(args.cores_per_worker))
- logger.info("mem_per_worker: {}".format(args.mem_per_worker))
- logger.info("task_port: {}".format(args.task_port))
- logger.info("result_port: {}".format(args.result_port))
- logger.info("addresses: {}".format(args.addresses))
- logger.info("max_workers_per_node: {}".format(args.max_workers_per_node))
- logger.info("poll_period: {}".format(args.poll))
- logger.info("address_probe_timeout: {}".format(args.address_probe_timeout))
- logger.info("Prefetch capacity: {}".format(args.prefetch_capacity))
- logger.info("Heartbeat threshold: {}".format(args.hb_threshold))
- logger.info("Heartbeat period: {}".format(args.hb_period))
- logger.info("Drain period: {}".format(args.drain_period))
- logger.info("CPU affinity: {}".format(args.cpu_affinity))
- logger.info("Accelerators: {}".format(" ".join(args.available_accelerators)))
- logger.info("enable_mpi_mode: {}".format(args.enable_mpi_mode))
- logger.info("mpi_launcher: {}".format(args.mpi_launcher))
-
  manager = Manager(task_port=args.task_port,
  result_port=args.result_port,
  addresses=args.addresses,
@@ -290,10 +290,10 @@ def workflow_dag_plot(df_tasks, group_by_apps=True):
  edge_trace['y'] += tuple([y0, y1, None])

  # Create figure:
+ title = go.layout.Title(text='Workflow DAG', font=dict(size=16))
  fig = go.Figure(data=[edge_trace] + node_traces,
  layout=go.Layout(
- title='Workflow DAG',
- titlefont=dict(size=16),
+ title=title,
  showlegend=True,
  hovermode='closest',
  margin=dict(b=20, l=5, r=5, t=40), # noqa: E741