parsl 2024.3.11__py3-none-any.whl → 2024.3.25__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. parsl/addresses.py +3 -1
  2. parsl/config.py +4 -0
  3. parsl/dataflow/dflow.py +14 -5
  4. parsl/executors/base.py +10 -0
  5. parsl/executors/high_throughput/executor.py +12 -0
  6. parsl/executors/high_throughput/interchange.py +30 -8
  7. parsl/executors/high_throughput/manager_record.py +1 -0
  8. parsl/executors/high_throughput/process_worker_pool.py +41 -5
  9. parsl/executors/status_handling.py +2 -9
  10. parsl/executors/taskvine/executor.py +24 -3
  11. parsl/executors/taskvine/manager.py +1 -0
  12. parsl/executors/taskvine/manager_config.py +3 -4
  13. parsl/executors/workqueue/executor.py +19 -0
  14. parsl/jobs/error_handlers.py +1 -1
  15. parsl/jobs/job_status_poller.py +8 -7
  16. parsl/launchers/launchers.py +6 -6
  17. parsl/log_utils.py +8 -4
  18. parsl/monitoring/db_manager.py +4 -2
  19. parsl/monitoring/monitoring.py +30 -264
  20. parsl/monitoring/router.py +208 -0
  21. parsl/monitoring/visualization/plots/default/workflow_plots.py +3 -0
  22. parsl/monitoring/visualization/views.py +2 -1
  23. parsl/providers/cluster_provider.py +1 -3
  24. parsl/tests/configs/user_opts.py +2 -1
  25. parsl/tests/test_htex/test_drain.py +78 -0
  26. parsl/tests/test_monitoring/test_app_names.py +86 -0
  27. parsl/tests/test_monitoring/test_fuzz_zmq.py +2 -2
  28. parsl/tests/test_monitoring/test_htex_init_blocks_vs_monitoring.py +82 -0
  29. parsl/tests/test_python_apps/test_context_manager.py +40 -0
  30. parsl/tests/test_scaling/test_scale_down_htex_auto_scale.py +1 -10
  31. parsl/tests/test_shutdown/__init__.py +0 -0
  32. parsl/tests/test_shutdown/test_kill_monitoring.py +65 -0
  33. parsl/utils.py +2 -2
  34. parsl/version.py +1 -1
  35. {parsl-2024.3.11.data → parsl-2024.3.25.data}/scripts/process_worker_pool.py +41 -5
  36. {parsl-2024.3.11.dist-info → parsl-2024.3.25.dist-info}/METADATA +4 -4
  37. {parsl-2024.3.11.dist-info → parsl-2024.3.25.dist-info}/RECORD +43 -36
  38. {parsl-2024.3.11.data → parsl-2024.3.25.data}/scripts/exec_parsl_function.py +0 -0
  39. {parsl-2024.3.11.data → parsl-2024.3.25.data}/scripts/parsl_coprocess.py +0 -0
  40. {parsl-2024.3.11.dist-info → parsl-2024.3.25.dist-info}/LICENSE +0 -0
  41. {parsl-2024.3.11.dist-info → parsl-2024.3.25.dist-info}/WHEEL +0 -0
  42. {parsl-2024.3.11.dist-info → parsl-2024.3.25.dist-info}/entry_points.txt +0 -0
  43. {parsl-2024.3.11.dist-info → parsl-2024.3.25.dist-info}/top_level.txt +0 -0
parsl/addresses.py CHANGED
@@ -81,7 +81,9 @@ def address_by_hostname() -> str:
  def address_by_interface(ifname: str) -> str:
  """Returns the IP address of the given interface name, e.g. 'eth0'

- This is from a Stack Overflow answer: https://stackoverflow.com/questions/24196932/how-can-i-get-the-ip-address-of-eth0-in-python#24196955
+ This is taken from a Stack Overflow answer:
+ https://stackoverflow.com/questions/24196932/how-can-i-get-the-ip-address-of-eth0-in-python#24196955
+

  Parameters
  ----------
parsl/config.py CHANGED
@@ -55,6 +55,8 @@ class Config(RepresentationMixin):
  or `None`.
  If 'none' or `None`, dynamic scaling will be disabled. Default is 'simple'. The literal value `None` is
  deprecated.
+ strategy_period : float or int, optional
+ How often the scaling strategy should be executed. Default is 5 seconds.
  max_idletime : float, optional
  The maximum idle time allowed for an executor before strategy could shut down unused blocks. Default is 120.0 seconds.
  usage_tracking : bool, optional
@@ -88,6 +90,7 @@ class Config(RepresentationMixin):
  retry_handler: Optional[Callable[[Exception, TaskRecord], float]] = None,
  run_dir: str = 'runinfo',
  strategy: Optional[str] = 'simple',
+ strategy_period: Union[float, int] = 5,
  max_idletime: float = 120.0,
  monitoring: Optional[MonitoringHub] = None,
  usage_tracking: bool = False,
@@ -121,6 +124,7 @@ class Config(RepresentationMixin):
  self.retry_handler = retry_handler
  self.run_dir = run_dir
  self.strategy = strategy
+ self.strategy_period = strategy_period
  self.max_idletime = max_idletime
  self.usage_tracking = usage_tracking
  self.initialize_logging = initialize_logging
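The new strategy_period option controls how often the scaling strategy runs. A minimal, illustrative sketch of setting it alongside the existing options (the executor choice and the values are assumptions for the example, not taken from this diff):

    import parsl
    from parsl.config import Config
    from parsl.executors import HighThroughputExecutor

    config = Config(
        executors=[HighThroughputExecutor(label="htex_local")],
        strategy='simple',     # which scaling strategy to run (existing option)
        strategy_period=5,     # new in 2024.3.25: run that strategy every 5 seconds
        max_idletime=120.0,    # idle time before blocks may be scaled in (existing option)
    )
    parsl.load(config)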
parsl/dataflow/dflow.py CHANGED
@@ -108,12 +108,12 @@ class DataFlowKernel:

  # hub address and port for interchange to connect
  self.hub_address = None # type: Optional[str]
- self.hub_interchange_port = None # type: Optional[int]
+ self.hub_zmq_port = None # type: Optional[int]
  if self.monitoring:
  if self.monitoring.logdir is None:
  self.monitoring.logdir = self.run_dir
  self.hub_address = self.monitoring.hub_address
- self.hub_interchange_port = self.monitoring.start(self.run_id, self.run_dir, self.config.run_dir)
+ self.hub_zmq_port = self.monitoring.start(self.run_id, self.run_dir, self.config.run_dir)

  self.time_began = datetime.datetime.now()
  self.time_completed: Optional[datetime.datetime] = None
@@ -178,6 +178,7 @@ class DataFlowKernel:
  # this must be set before executors are added since add_executors calls
  # job_status_poller.add_executors.
  self.job_status_poller = JobStatusPoller(strategy=self.config.strategy,
+ strategy_period=self.config.strategy_period,
  max_idletime=self.config.max_idletime,
  dfk=self)

@@ -205,6 +206,13 @@ class DataFlowKernel:

  atexit.register(self.atexit_cleanup)

+ def __enter__(self):
+ pass
+
+ def __exit__(self, exc_type, exc_value, traceback):
+ logger.debug("Exiting the context manager, calling cleanup for DFK")
+ self.cleanup()
+
  def _send_task_log_info(self, task_record: TaskRecord) -> None:
  if self.monitoring:
  task_log_info = self._create_task_log_info(task_record)
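With __enter__ and __exit__ in place, the DataFlowKernel can be used as a context manager so that cleanup() runs automatically; the new test_context_manager.py test exercises this. A hedged usage sketch (the app and executor here are illustrative); note that __enter__ returns None in this version, so binding the with target is not useful:

    import parsl
    from parsl.config import Config
    from parsl.executors.threads import ThreadPoolExecutor

    @parsl.python_app
    def double(x):
        return 2 * x

    # parsl.load() returns the DataFlowKernel, which now works as a context manager.
    with parsl.load(Config(executors=[ThreadPoolExecutor()])):
        assert double(21).result() == 42
    # __exit__ has called cleanup() for us at this point.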
@@ -1114,12 +1122,12 @@ class DataFlowKernel:

  channel.makedirs(channel.script_dir, exist_ok=True)

- def add_executors(self, executors):
+ def add_executors(self, executors: Sequence[ParslExecutor]) -> None:
  for executor in executors:
  executor.run_id = self.run_id
  executor.run_dir = self.run_dir
  executor.hub_address = self.hub_address
- executor.hub_port = self.hub_interchange_port
+ executor.hub_port = self.hub_zmq_port
  if hasattr(executor, 'provider'):
  if hasattr(executor.provider, 'script_dir'):
  executor.provider.script_dir = os.path.join(self.run_dir, 'submit_scripts')
@@ -1170,7 +1178,8 @@ class DataFlowKernel:
  fut = task_record['app_fu']
  if not fut.done():
  fut.exception()
- # now app future is done, poll until DFK state is final: a DFK state being final and the app future being done do not imply each other.
+ # now app future is done, poll until DFK state is final: a
+ # DFK state being final and the app future being done do not imply each other.
  while task_record['status'] not in FINAL_STATES:
  time.sleep(0.1)
parsl/executors/base.py CHANGED
@@ -106,6 +106,16 @@ class ParslExecutor(metaclass=ABCMeta):
  def run_dir(self, value: str) -> None:
  self._run_dir = value

+ @property
+ def run_id(self) -> Optional[str]:
+ """UUID for the enclosing DFK.
+ """
+ return self._run_id
+
+ @run_id.setter
+ def run_id(self, value: Optional[str]) -> None:
+ self._run_id = value
+
  @property
  def hub_address(self) -> Optional[str]:
  """Address to the Hub for monitoring.
parsl/executors/high_throughput/executor.py CHANGED
@@ -55,6 +55,7 @@ DEFAULT_LAUNCH_CMD = ("process_worker_pool.py {debug} {max_workers_per_node} "
  "--hb_period={heartbeat_period} "
  "{address_probe_timeout_string} "
  "--hb_threshold={heartbeat_threshold} "
+ "--drain_period={drain_period} "
  "--cpu-affinity {cpu_affinity} "
  "{enable_mpi_mode} "
  "--mpi-launcher={mpi_launcher} "
@@ -201,6 +202,14 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin):
  Timeout period to be used by the executor components in milliseconds. Increasing poll_periods
  trades performance for cpu efficiency. Default: 10ms

+ drain_period : int
+ The number of seconds after start when workers will begin to drain
+ and then exit. Set this to a time that is slightly less than the
+ maximum walltime of batch jobs to avoid killing tasks while they
+ execute. For example, you could set this to the walltime minus a grace
+ period for the batch job to start the workers, minus the expected
+ maximum length of an individual task.
+
  worker_logdir_root : string
  In case of a remote file system, specify the path to where logs will be kept.

@@ -240,6 +249,7 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin):
  prefetch_capacity: int = 0,
  heartbeat_threshold: int = 120,
  heartbeat_period: int = 30,
+ drain_period: Optional[int] = None,
  poll_period: int = 10,
  address_probe_timeout: Optional[int] = None,
  worker_logdir_root: Optional[str] = None,
@@ -303,6 +313,7 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin):
  self.interchange_port_range = interchange_port_range
  self.heartbeat_threshold = heartbeat_threshold
  self.heartbeat_period = heartbeat_period
+ self.drain_period = drain_period
  self.poll_period = poll_period
  self.run_dir = '.'
  self.worker_logdir_root = worker_logdir_root
@@ -376,6 +387,7 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin):
  nodes_per_block=self.provider.nodes_per_block,
  heartbeat_period=self.heartbeat_period,
  heartbeat_threshold=self.heartbeat_threshold,
+ drain_period=self.drain_period,
  poll_period=self.poll_period,
  cert_dir=self.cert_dir,
  logdir=self.worker_logdir,
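Following the guidance in the new drain_period docstring, a worker pool can be told to start draining shortly before its batch job's walltime runs out. An illustrative sketch under assumed numbers (the provider, walltime, grace period and task length are not taken from this diff):

    from parsl.config import Config
    from parsl.executors import HighThroughputExecutor
    from parsl.providers import SlurmProvider

    walltime_s = 60 * 60       # assumed 1 hour batch jobs
    startup_grace_s = 5 * 60   # assumed time for the batch job to start its workers
    max_task_s = 10 * 60       # assumed longest individual task

    htex = HighThroughputExecutor(
        label="htex_draining",
        provider=SlurmProvider(walltime="01:00:00"),
        # Workers begin draining this many seconds after start, so running tasks
        # are not killed when the batch job reaches its walltime.
        drain_period=walltime_s - startup_grace_s - max_task_s,
    )

    config = Config(executors=[htex])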
parsl/executors/high_throughput/interchange.py CHANGED
@@ -28,6 +28,7 @@ from parsl.process_loggers import wrap_with_logs


  PKL_HEARTBEAT_CODE = pickle.dumps((2 ** 32) - 1)
+ PKL_DRAINED_CODE = pickle.dumps((2 ** 32) - 2)

  LOGGER_NAME = "interchange"
  logger = logging.getLogger(LOGGER_NAME)
@@ -101,12 +102,12 @@ class Interchange:
  This is overridden when the worker_ports option is set. Default: (54000, 55000)

  hub_address : str
- The ip address at which the interchange can send info about managers to when monitoring is enabled.
- This is passed via dfk and executor automatically. Default: None (meaning monitoring disabled)
+ The IP address at which the interchange can send info about managers to when monitoring is enabled.
+ Default: None (meaning monitoring disabled)

  hub_port : str
  The port at which the interchange can send info about managers to when monitoring is enabled.
- This is passed via dfk and executor automatically. Default: None (meaning monitoring disabled)
+ Default: None (meaning monitoring disabled)

  heartbeat_threshold : int
  Number of seconds since the last heartbeat after which worker is considered lost.
@@ -244,19 +245,19 @@

  def _create_monitoring_channel(self) -> Optional[zmq.Socket]:
  if self.hub_address and self.hub_port:
- logger.info("Connecting to monitoring")
+ logger.info("Connecting to MonitoringHub")
  # This is a one-off because monitoring is unencrypted
  hub_channel = zmq.Context().socket(zmq.DEALER)
  hub_channel.set_hwm(0)
  hub_channel.connect("tcp://{}:{}".format(self.hub_address, self.hub_port))
- logger.info("Monitoring enabled and connected to hub")
+ logger.info("Connected to MonitoringHub")
  return hub_channel
  else:
  return None

  def _send_monitoring_info(self, hub_channel: Optional[zmq.Socket], manager: ManagerRecord) -> None:
  if hub_channel:
- logger.info("Sending message {} to hub".format(manager))
+ logger.info("Sending message {} to MonitoringHub".format(manager))

  d: Dict = cast(Dict, manager.copy())
  d['timestamp'] = datetime.datetime.now()
@@ -308,7 +309,8 @@
  'worker_count': m['worker_count'],
  'tasks': len(m['tasks']),
  'idle_duration': idle_duration,
- 'active': m['active']}
+ 'active': m['active'],
+ 'draining': m['draining']}
  reply.append(resp)

  elif command_req.startswith("HOLD_WORKER"):
@@ -385,6 +387,7 @@
  self.process_task_outgoing_incoming(interesting_managers, hub_channel, kill_event)
  self.process_results_incoming(interesting_managers, hub_channel)
  self.expire_bad_managers(interesting_managers, hub_channel)
+ self.expire_drained_managers(interesting_managers, hub_channel)
  self.process_tasks_to_send(interesting_managers)

  self.zmq_context.destroy()
@@ -431,6 +434,7 @@
  'max_capacity': 0,
  'worker_count': 0,
  'active': True,
+ 'draining': False,
  'tasks': []}
  self.connected_block_history.append(msg['block_id'])

@@ -469,10 +473,28 @@
  self._ready_managers[manager_id]['last_heartbeat'] = time.time()
  logger.debug("Manager {!r} sent heartbeat via tasks connection".format(manager_id))
  self.task_outgoing.send_multipart([manager_id, b'', PKL_HEARTBEAT_CODE])
+ elif msg['type'] == 'drain':
+ self._ready_managers[manager_id]['draining'] = True
+ logger.debug(f"Manager {manager_id!r} requested drain")
  else:
  logger.error(f"Unexpected message type received from manager: {msg['type']}")
  logger.debug("leaving task_outgoing section")

+ def expire_drained_managers(self, interesting_managers: Set[bytes], hub_channel: Optional[zmq.Socket]) -> None:
+
+ for manager_id in list(interesting_managers):
+ # is it always true that a draining manager will be in interesting managers?
+ # i think so because it will have outstanding capacity?
+ m = self._ready_managers[manager_id]
+ if m['draining'] and len(m['tasks']) == 0:
+ logger.info(f"Manager {manager_id!r} is drained - sending drained message to manager")
+ self.task_outgoing.send_multipart([manager_id, b'', PKL_DRAINED_CODE])
+ interesting_managers.remove(manager_id)
+ self._ready_managers.pop(manager_id)
+
+ m['active'] = False
+ self._send_monitoring_info(hub_channel, m)
+
  def process_tasks_to_send(self, interesting_managers: Set[bytes]) -> None:
  # Check if there are tasks that could be sent to managers

@@ -490,7 +512,7 @@
  tasks_inflight = len(m['tasks'])
  real_capacity = m['max_capacity'] - tasks_inflight

- if (real_capacity and m['active']):
+ if (real_capacity and m['active'] and not m['draining']):
  tasks = self.get_tasks(real_capacity)
  if tasks:
  self.task_outgoing.send_multipart([manager_id, b'', pickle.dumps(tasks)])
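The interchange and the worker pool agree on a second pickled sentinel, next to the existing heartbeat code, to mean "fully drained, exit now". A small self-contained sketch of that convention (the values follow the diff; the ZMQ plumbing is omitted and the helper function is illustrative):

    import pickle

    HEARTBEAT_CODE = (2 ** 32) - 1
    DRAINED_CODE = (2 ** 32) - 2
    PKL_HEARTBEAT_CODE = pickle.dumps(HEARTBEAT_CODE)
    PKL_DRAINED_CODE = pickle.dumps(DRAINED_CODE)

    def classify(payload: bytes):
        # Mirrors the manager-side dispatch in pull_tasks(): a payload is either
        # a heartbeat, a drained signal, or a pickled list of tasks.
        tasks = pickle.loads(payload)
        if tasks == HEARTBEAT_CODE:
            return "heartbeat"
        elif tasks == DRAINED_CODE:
            return "drained"
        return tasks

    assert classify(PKL_DRAINED_CODE) == "drained"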
parsl/executors/high_throughput/manager_record.py CHANGED
@@ -9,6 +9,7 @@ class ManagerRecord(TypedDict, total=False):
  worker_count: int
  max_capacity: int
  active: bool
+ draining: bool
  hostname: str
  last_heartbeat: float
  idle_since: Optional[float]
parsl/executors/high_throughput/process_worker_pool.py CHANGED
@@ -36,6 +36,7 @@ from parsl.executors.high_throughput.mpi_resource_management import (
  from parsl.executors.high_throughput.mpi_prefix_composer import compose_all, VALID_LAUNCHERS

  HEARTBEAT_CODE = (2 ** 32) - 1
+ DRAINED_CODE = (2 ** 32) - 2


  class Manager:
@@ -73,7 +74,8 @@ class Manager:
  enable_mpi_mode: bool = False,
  mpi_launcher: str = "mpiexec",
  available_accelerators: Sequence[str],
- cert_dir: Optional[str]):
+ cert_dir: Optional[str],
+ drain_period: Optional[int]):
  """
  Parameters
  ----------
@@ -138,6 +140,9 @@

  cert_dir : str | None
  Path to the certificate directory.
+
+ drain_period: int | None
+ Number of seconds to drain after TODO: could be a nicer timespec involving m,s,h qualifiers for user friendliness?
  """

  logger.info("Manager initializing")
@@ -227,6 +232,14 @@
  self.heartbeat_period = heartbeat_period
  self.heartbeat_threshold = heartbeat_threshold
  self.poll_period = poll_period
+
+ self.drain_time: float
+ if drain_period:
+ self.drain_time = self._start_time + drain_period
+ logger.info(f"Will request drain at {self.drain_time}")
+ else:
+ self.drain_time = float('inf')
+
  self.cpu_affinity = cpu_affinity

  # Define accelerator available, adjust worker count accordingly
@@ -262,10 +275,19 @@
  """ Send heartbeat to the incoming task queue
  """
  msg = {'type': 'heartbeat'}
+ # don't need to dumps and encode this every time - could do as a global on import?
  b_msg = json.dumps(msg).encode('utf-8')
  self.task_incoming.send(b_msg)
  logger.debug("Sent heartbeat")

+ def drain_to_incoming(self):
+ """ Send heartbeat to the incoming task queue
+ """
+ msg = {'type': 'drain'}
+ b_msg = json.dumps(msg).encode('utf-8')
+ self.task_incoming.send(b_msg)
+ logger.debug("Sent drain")
+
  @wrap_with_logs
  def pull_tasks(self, kill_event):
  """ Pull tasks from the incoming tasks zmq pipe onto the internal
@@ -298,6 +320,7 @@
  # time here are correctly copy-pasted from the relevant if
  # statements.
  next_interesting_event_time = min(last_beat + self.heartbeat_period,
+ self.drain_time,
  last_interchange_contact + self.heartbeat_threshold)
  try:
  pending_task_count = self.pending_task_queue.qsize()
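The drain deadline becomes one more event in the manager's poll-timeout calculation, so the loop wakes up exactly when something needs doing. A simplified sketch of that computation (names follow the diff; the surrounding loop is omitted):

    import time

    def next_poll_timeout_s(last_beat, heartbeat_period, drain_time,
                            last_interchange_contact, heartbeat_threshold):
        # Earliest moment anything interesting happens: our next heartbeat is due,
        # the drain deadline passes, or the interchange is considered lost.
        next_interesting_event_time = min(last_beat + heartbeat_period,
                                          drain_time,
                                          last_interchange_contact + heartbeat_threshold)
        # Poll no longer than that, and never with a negative timeout.
        return max(0, next_interesting_event_time - time.time())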
@@ -312,6 +335,14 @@ class Manager:
  self.heartbeat_to_incoming()
  last_beat = time.time()

+ if self.drain_time and time.time() > self.drain_time:
+ logger.info("Requesting drain")
+ self.drain_to_incoming()
+ self.drain_time = None
+ # This will start the pool draining...
+ # Drained exit behaviour does not happen here. It will be
+ # driven by the interchange sending a DRAINED_CODE message.
+
  poll_duration_s = max(0, next_interesting_event_time - time.time())
  socks = dict(poller.poll(timeout=poll_duration_s * 1000))

@@ -322,7 +353,9 @@ class Manager:

  if tasks == HEARTBEAT_CODE:
  logger.debug("Got heartbeat from interchange")
-
+ elif tasks == DRAINED_CODE:
+ logger.info("Got fulled drained message from interchange - setting kill flag")
+ kill_event.set()
  else:
  task_recv_counter += len(tasks)
  logger.debug("Got executor tasks: {}, cumulative count of tasks: {}".format([t['task_id'] for t in tasks], task_recv_counter))
@@ -490,9 +523,8 @@
  self._worker_watchdog_thread.start()
  self._monitoring_handler_thread.start()

- logger.info("Loop start")
+ logger.info("Manager threads started")

- # TODO : Add mechanism in this loop to stop the worker pool
  # This might need a multiprocessing event to signal back.
  self._kill_event.wait()
  logger.critical("Received kill event, terminating worker processes")
@@ -804,6 +836,8 @@ if __name__ == "__main__":
  help="Heartbeat period in seconds. Uses manager default unless set")
  parser.add_argument("--hb_threshold", default=120,
  help="Heartbeat threshold in seconds. Uses manager default unless set")
+ parser.add_argument("--drain_period", default=None,
+ help="Drain this pool after specified number of seconds. By default, does not drain.")
  parser.add_argument("--address_probe_timeout", default=30,
  help="Timeout to probe for viable address to interchange. Default: 30s")
  parser.add_argument("--poll", default=10,
@@ -824,7 +858,7 @@ if __name__ == "__main__":
  required=True,
  help="Whether/how workers should control CPU affinity.")
  parser.add_argument("--available-accelerators", type=str, nargs="*",
- help="Names of available accelerators")
+ help="Names of available accelerators, if not given assumed to be zero accelerators available", default=[])
  parser.add_argument("--enable_mpi_mode", action='store_true',
  help="Enable MPI mode")
  parser.add_argument("--mpi-launcher", type=str, choices=VALID_LAUNCHERS,
@@ -856,6 +890,7 @@ if __name__ == "__main__":
  logger.info("Prefetch capacity: {}".format(args.prefetch_capacity))
  logger.info("Heartbeat threshold: {}".format(args.hb_threshold))
  logger.info("Heartbeat period: {}".format(args.hb_period))
+ logger.info("Drain period: {}".format(args.drain_period))
  logger.info("CPU affinity: {}".format(args.cpu_affinity))
  logger.info("Accelerators: {}".format(" ".join(args.available_accelerators)))
  logger.info("enable_mpi_mode: {}".format(args.enable_mpi_mode))
@@ -876,6 +911,7 @@ if __name__ == "__main__":
  prefetch_capacity=int(args.prefetch_capacity),
  heartbeat_threshold=int(args.hb_threshold),
  heartbeat_period=int(args.hb_period),
+ drain_period=None if args.drain_period == "None" else int(args.drain_period),
  poll_period=int(args.poll),
  cpu_affinity=args.cpu_affinity,
  enable_mpi_mode=args.enable_mpi_mode,
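On the worker-pool command line the feature appears as a new --drain_period flag; the executor's launch-command template always fills it in, so the string "None" stands for "do not drain". A hedged sketch of how that argument is parsed and converted (mirroring the diff's argparse definition and int() handling):

    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--drain_period", default=None,
                        help="Drain this pool after specified number of seconds. By default, does not drain.")

    # The launch command substitutes the executor's drain_period value,
    # e.g. "--drain_period=2700", or "--drain_period=None" when it is unset.
    args = parser.parse_args(["--drain_period", "2700"])
    drain_period = None if args.drain_period == "None" else int(args.drain_period)
    assert drain_period == 2700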
parsl/executors/status_handling.py CHANGED
@@ -61,7 +61,7 @@ class BlockProviderExecutor(ParslExecutor):
  # errors can happen during the submit call to the provider; this is used
  # to keep track of such errors so that they can be handled in one place
  # together with errors reported by status()
- self._simulated_status: Dict[Any, JobStatus] = {}
+ self._simulated_status: Dict[str, JobStatus] = {}
  self._executor_bad_state = threading.Event()
  self._executor_exception: Optional[Exception] = None

@@ -102,13 +102,10 @@ class BlockProviderExecutor(ParslExecutor):
  else:
  return self._provider.status_polling_interval

- def _fail_job_async(self, block_id: Any, message: str):
+ def _fail_job_async(self, block_id: str, message: str):
  """Marks a job that has failed to start but would not otherwise be included in status()
  as failed and report it in status()
  """
- if block_id is None:
- block_id = str(self._block_id_counter.get_id())
- logger.info(f"Allocated block ID {block_id} for simulated failure")
  self._simulated_status[block_id] = JobStatus(JobState.FAILED, message)

  @abstractproperty
@@ -211,10 +208,6 @@

  Cause the executor to reduce the number of blocks by count.

- We should have the scale in method simply take resource object
- which will have the scaling methods, scale_in itself should be a coroutine, since
- scaling tasks can be slow.
-
  :return: A list of block ids corresponding to the blocks that were removed.
  """
  pass
parsl/executors/taskvine/executor.py CHANGED
@@ -4,6 +4,7 @@ high-throughput system for delegating Parsl tasks to thousands of remote machine
  """

  # Import Python built-in libraries
+ import atexit
  import threading
  import multiprocessing
  import logging
@@ -171,7 +172,7 @@ class TaskVineExecutor(BlockProviderExecutor, putils.RepresentationMixin):
  # Path to directory that holds all tasks' data and results.
  self._function_data_dir = ""

- # helper scripts to prepare package tarballs for Parsl apps
+ # Helper scripts to prepare package tarballs for Parsl apps
  self._package_analyze_script = shutil.which("poncho_package_analyze")
  self._package_create_script = shutil.which("poncho_package_create")
  if self._package_analyze_script is None or self._package_create_script is None:
@@ -179,6 +180,18 @@ class TaskVineExecutor(BlockProviderExecutor, putils.RepresentationMixin):
  else:
  self._poncho_available = True

+ # Register atexit handler to cleanup when Python shuts down
+ atexit.register(self.atexit_cleanup)
+
+ # Attribute indicating whether this executor was started to shut it down properly.
+ # This safeguards cases where an object of this executor is created but
+ # the executor never starts, so it shouldn't be shutdowned.
+ self._started = False
+
+ def atexit_cleanup(self):
+ # Calls this executor's shutdown method upon Python exiting the process.
+ self.shutdown()
+
  def _get_launch_command(self, block_id):
  # Implements BlockProviderExecutor's abstract method.
  # This executor uses different terminology for worker/launch
@@ -196,8 +209,9 @@ class TaskVineExecutor(BlockProviderExecutor, putils.RepresentationMixin):
  if self.manager_config.port == 0 and self.manager_config.project_name is None:
  self.manager_config.project_name = "parsl-vine-" + str(uuid.uuid4())

- # guess the host name if the project name is not given
- if not self.manager_config.project_name:
+ # guess the host name if the project name is not given and none has been supplied
+ # explicitly in the manager config.
+ if not self.manager_config.project_name and self.manager_config.address is None:
  self.manager_config.address = get_any_address()

  # Factory communication settings are overridden by manager communication settings.
@@ -237,6 +251,9 @@ class TaskVineExecutor(BlockProviderExecutor, putils.RepresentationMixin):
  retrieve Parsl tasks within the TaskVine system.
  """

+ # Mark this executor object as started
+ self._started = True
+
  # Synchronize connection and communication settings between the manager and factory
  self.__synchronize_manager_factory_comm_settings()

@@ -597,6 +614,10 @@
  """Shutdown the executor. Sets flag to cancel the submit process and
  collector thread, which shuts down the TaskVine system submission.
  """
+ if not self._started:
+ # Don't shutdown if the executor never starts.
+ return
+
  logger.debug("TaskVine shutdown started")
  self._should_stop.set()

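Both the TaskVine and (further below) the Work Queue executors now register an atexit hook but guard it with a started flag, so that merely constructing an executor does not trigger a shutdown when the interpreter exits. A generic sketch of that pattern (the class and method names here are illustrative, not Parsl's):

    import atexit

    class ManagedService:
        def __init__(self):
            # Not started yet; shutdown() must be a no-op until start() runs.
            self._started = False
            atexit.register(self._atexit_cleanup)

        def start(self):
            self._started = True
            # ... start background threads or processes here ...

        def shutdown(self):
            if not self._started:
                return  # constructed but never started: nothing to tear down
            self._started = False
            # ... stop background threads or processes here ...

        def _atexit_cleanup(self):
            # Called by atexit when Python exits.
            self.shutdown()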
parsl/executors/taskvine/manager.py CHANGED
@@ -376,6 +376,7 @@ def _taskvine_submit_wait(ready_task_queue=None,
  task_out_file = parsl_file_name_to_vine_file[spec.parsl_name]
  else:
  task_out_file = m.declare_file(spec.parsl_name, cache=spec.cache, peer_transfer=True)
+ parsl_file_name_to_vine_file[spec.parsl_name] = task_out_file
  t.add_output(task_out_file, spec.parsl_name)

  # Submit the task to the TaskVine object
parsl/executors/taskvine/manager_config.py CHANGED
@@ -1,4 +1,3 @@
- import socket
  from dataclasses import dataclass
  from typing import Optional

@@ -23,9 +22,9 @@ class TaskVineManagerConfig:
  A value of 0 means TaskVine chooses any available port.
  Default is VINE_DEFAULT_PORT.

- address: str
+ address: Optional[str]
  Address of the local machine.
- Default is socket.gethostname().
+ If None, socket.gethostname() will be used to determine the address.

  project_name: Optional[str]
  If given, TaskVine will periodically report its status and performance
@@ -161,7 +160,7 @@ class TaskVineManagerConfig:

  # Connection and communication settings
  port: int = VINE_DEFAULT_PORT
- address: str = socket.gethostname()
+ address: Optional[str] = None
  project_name: Optional[str] = None
  project_password_file: Optional[str] = None

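With address now defaulting to None, the hostname is only guessed when neither an address nor a project name is supplied, instead of at import time. An illustrative sketch of both styles (assumes the TaskVine extras are installed; the pinned address is made up):

    from parsl.executors.taskvine import TaskVineManagerConfig

    # Let Parsl/TaskVine work out the address; None is the new default.
    auto_cfg = TaskVineManagerConfig()

    # Or pin the manager to a specific address explicitly.
    pinned_cfg = TaskVineManagerConfig(address="10.0.0.5")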
parsl/executors/workqueue/executor.py CHANGED
@@ -3,6 +3,7 @@ Cooperative Computing Lab (CCL) at Notre Dame to provide a fault-tolerant,
  high-throughput system for delegating Parsl tasks to thousands of remote machines
  """

+ import atexit
  import threading
  import multiprocessing
  import logging
@@ -298,6 +299,18 @@ class WorkQueueExecutor(BlockProviderExecutor, putils.RepresentationMixin):
  if self.init_command != "":
  self.launch_cmd = self.init_command + "; " + self.launch_cmd

+ # register atexit handler to cleanup when Python shuts down
+ atexit.register(self.atexit_cleanup)
+
+ # Attribute indicating whether this executor was started to shut it down properly.
+ # This safeguards cases where an object of this executor is created but
+ # the executor never starts, so it shouldn't be shutdowned.
+ self.started = False
+
+ def atexit_cleanup(self):
+ # Calls this executor's shutdown method upon Python exiting the process.
+ self.shutdown()
+
  def _get_launch_command(self, block_id):
  # this executor uses different terminology for worker/launch
  # commands than in htex
@@ -307,6 +320,8 @@ class WorkQueueExecutor(BlockProviderExecutor, putils.RepresentationMixin):
  """Create submit process and collector thread to create, send, and
  retrieve Parsl tasks within the Work Queue system.
  """
+ # Mark this executor object as started
+ self.started = True
  self.tasks_lock = threading.Lock()

  # Create directories for data and results
@@ -695,6 +710,10 @@ class WorkQueueExecutor(BlockProviderExecutor, putils.RepresentationMixin):
  """Shutdown the executor. Sets flag to cancel the submit process and
  collector thread, which shuts down the Work Queue system submission.
  """
+ if not self.started:
+ # Don't shutdown if the executor never starts.
+ return
+
  logger.debug("Work Queue shutdown started")
  self.should_stop.value = True

parsl/jobs/error_handlers.py CHANGED
@@ -20,7 +20,7 @@ def simple_error_handler(executor: status_handling.BlockProviderExecutor, status
  executor.set_bad_state_and_fail_all(_get_error(status))


- def windowed_error_handler(executor: status_handling.BlockProviderExecutor, status: Dict[str, JobStatus], threshold: int = 3):
+ def windowed_error_handler(executor: status_handling.BlockProviderExecutor, status: Dict[str, JobStatus], threshold: int = 3) -> None:
  sorted_status = [(key, status[key]) for key in sorted(status, key=lambda x: int(x))]
  current_window = dict(sorted_status[-threshold:])
  total, failed = _count_jobs(current_window)