parsl 2024.2.26__py3-none-any.whl → 2024.3.11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (72)
  1. parsl/addresses.py +1 -1
  2. parsl/configs/ASPIRE1.py +1 -1
  3. parsl/configs/ad_hoc.py +1 -1
  4. parsl/configs/bridges.py +1 -1
  5. parsl/configs/cc_in2p3.py +1 -1
  6. parsl/configs/expanse.py +1 -1
  7. parsl/configs/frontera.py +1 -1
  8. parsl/configs/kubernetes.py +1 -1
  9. parsl/configs/midway.py +1 -1
  10. parsl/configs/osg.py +1 -1
  11. parsl/configs/stampede2.py +1 -1
  12. parsl/dataflow/dflow.py +11 -6
  13. parsl/dataflow/taskrecord.py +3 -1
  14. parsl/executors/high_throughput/executor.py +69 -37
  15. parsl/executors/high_throughput/interchange.py +78 -59
  16. parsl/executors/high_throughput/process_worker_pool.py +40 -28
  17. parsl/executors/taskvine/executor.py +3 -1
  18. parsl/executors/workqueue/executor.py +5 -2
  19. parsl/executors/workqueue/parsl_coprocess.py +107 -95
  20. parsl/jobs/job_status_poller.py +9 -3
  21. parsl/jobs/strategy.py +4 -3
  22. parsl/monitoring/db_manager.py +25 -5
  23. parsl/monitoring/monitoring.py +6 -2
  24. parsl/monitoring/remote.py +29 -0
  25. parsl/monitoring/visualization/models.py +7 -0
  26. parsl/providers/slurm/slurm.py +13 -2
  27. parsl/tests/configs/ad_hoc_cluster_htex.py +1 -1
  28. parsl/tests/configs/bluewaters.py +1 -1
  29. parsl/tests/configs/bridges.py +1 -1
  30. parsl/tests/configs/cc_in2p3.py +1 -1
  31. parsl/tests/configs/comet.py +1 -1
  32. parsl/tests/configs/frontera.py +1 -1
  33. parsl/tests/configs/midway.py +1 -1
  34. parsl/tests/configs/nscc_singapore.py +1 -1
  35. parsl/tests/configs/osg_htex.py +1 -1
  36. parsl/tests/configs/petrelkube.py +1 -1
  37. parsl/tests/configs/summit.py +1 -1
  38. parsl/tests/configs/theta.py +1 -1
  39. parsl/tests/configs/user_opts.py +3 -1
  40. parsl/tests/manual_tests/test_ad_hoc_htex.py +1 -1
  41. parsl/tests/scaling_tests/htex_local.py +1 -1
  42. parsl/tests/sites/test_affinity.py +1 -1
  43. parsl/tests/sites/test_concurrent.py +1 -1
  44. parsl/tests/sites/test_dynamic_executor.py +1 -1
  45. parsl/tests/sites/test_worker_info.py +1 -1
  46. parsl/tests/test_htex/test_basic.py +1 -1
  47. parsl/tests/test_htex/test_connected_blocks.py +1 -1
  48. parsl/tests/test_htex/test_cpu_affinity_explicit.py +1 -1
  49. parsl/tests/test_htex/test_disconnected_blocks.py +1 -1
  50. parsl/tests/test_htex/test_htex.py +13 -0
  51. parsl/tests/test_htex/test_manager_failure.py +1 -1
  52. parsl/tests/test_htex/test_missing_worker.py +1 -1
  53. parsl/tests/test_htex/test_multiple_disconnected_blocks.py +1 -1
  54. parsl/tests/test_htex/test_worker_failure.py +1 -1
  55. parsl/tests/test_mpi_apps/test_mpi_mode_disabled.py +1 -1
  56. parsl/tests/test_mpi_apps/test_mpi_mode_enabled.py +1 -1
  57. parsl/tests/test_mpi_apps/test_resource_spec.py +1 -1
  58. parsl/tests/test_scaling/test_scale_down.py +2 -2
  59. parsl/tests/test_scaling/test_scale_down_htex_auto_scale.py +159 -0
  60. parsl/usage_tracking/usage.py +5 -9
  61. parsl/version.py +1 -1
  62. parsl-2024.3.11.data/scripts/parsl_coprocess.py +166 -0
  63. {parsl-2024.2.26.data → parsl-2024.3.11.data}/scripts/process_worker_pool.py +40 -28
  64. {parsl-2024.2.26.dist-info → parsl-2024.3.11.dist-info}/METADATA +2 -2
  65. {parsl-2024.2.26.dist-info → parsl-2024.3.11.dist-info}/RECORD +70 -70
  66. {parsl-2024.2.26.dist-info → parsl-2024.3.11.dist-info}/WHEEL +1 -1
  67. parsl/configs/bluewaters.py +0 -28
  68. parsl-2024.2.26.data/scripts/parsl_coprocess.py +0 -154
  69. {parsl-2024.2.26.data → parsl-2024.3.11.data}/scripts/exec_parsl_function.py +0 -0
  70. {parsl-2024.2.26.dist-info → parsl-2024.3.11.dist-info}/LICENSE +0 -0
  71. {parsl-2024.2.26.dist-info → parsl-2024.3.11.dist-info}/entry_points.txt +0 -0
  72. {parsl-2024.2.26.dist-info → parsl-2024.3.11.dist-info}/top_level.txt +0 -0
parsl/addresses.py CHANGED
@@ -81,7 +81,7 @@ def address_by_hostname() -> str:
 def address_by_interface(ifname: str) -> str:
     """Returns the IP address of the given interface name, e.g. 'eth0'
 
-    This is taken from a Stack Overflow answer: https://stackoverflow.com/questions/24196932/how-can-i-get-the-ip-address-of-eth0-in-python#24196955
+    This is from a Stack Overflow answer: https://stackoverflow.com/questions/24196932/how-can-i-get-the-ip-address-of-eth0-in-python#24196955
 
     Parameters
     ----------
parsl/configs/ASPIRE1.py CHANGED
@@ -12,7 +12,7 @@ config = Config(
             heartbeat_period=15,
             heartbeat_threshold=120,
             worker_debug=True,
-            max_workers=4,
+            max_workers_per_node=4,
             address=address_by_interface('ib0'),
             provider=PBSProProvider(
                 launcher=MpiRunLauncher(),
parsl/configs/ad_hoc.py CHANGED
@@ -17,7 +17,7 @@ config = Config(
     executors=[
         HighThroughputExecutor(
             label='remote_htex',
-            max_workers=2,
+            max_workers_per_node=2,
             worker_logdir_root=user_opts['adhoc']['script_dir'],
             provider=AdHocProvider(
                 # Command to be run before starting a worker, such as:
parsl/configs/bridges.py CHANGED
@@ -13,7 +13,7 @@ config = Config(
         HighThroughputExecutor(
             label='Bridges_HTEX_multinode',
             address=address_by_interface('ens3f0'),
-            max_workers=1,
+            max_workers_per_node=1,
             provider=SlurmProvider(
                 'YOUR_PARTITION_NAME',  # Specify Partition / QOS, for eg. RM-small
                 nodes_per_block=2,
parsl/configs/cc_in2p3.py CHANGED
@@ -7,7 +7,7 @@ config = Config(
     executors=[
         HighThroughputExecutor(
             label='cc_in2p3_htex',
-            max_workers=2,
+            max_workers_per_node=2,
             provider=GridEngineProvider(
                 channel=LocalChannel(),
                 nodes_per_block=1,
parsl/configs/expanse.py CHANGED
@@ -8,7 +8,7 @@ config = Config(
     executors=[
         HighThroughputExecutor(
             label='Expanse_CPU_Multinode',
-            max_workers=32,
+            max_workers_per_node=32,
             provider=SlurmProvider(
                 'compute',
                 account='YOUR_ALLOCATION_ON_EXPANSE',
parsl/configs/frontera.py CHANGED
@@ -12,7 +12,7 @@ config = Config(
     executors=[
         HighThroughputExecutor(
             label="frontera_htex",
-            max_workers=1,  # Set number of workers per node
+            max_workers_per_node=1,  # Set number of workers per node
             provider=SlurmProvider(
                 cmd_timeout=60,  # Add extra time for slow scheduler responses
                 channel=LocalChannel(),
parsl/configs/kubernetes.py CHANGED
@@ -9,7 +9,7 @@ config = Config(
         HighThroughputExecutor(
             label='kube-htex',
             cores_per_worker=1,
-            max_workers=1,
+            max_workers_per_node=1,
             worker_logdir_root='YOUR_WORK_DIR',
 
             # Address for the pod worker to connect back
parsl/configs/midway.py CHANGED
@@ -10,7 +10,7 @@ config = Config(
             label='Midway_HTEX_multinode',
             address=address_by_interface('bond0'),
             worker_debug=False,
-            max_workers=2,
+            max_workers_per_node=2,
             provider=SlurmProvider(
                 'YOUR_PARTITION',  # Partition name, e.g 'broadwl'
                 launcher=SrunLauncher(),
parsl/configs/osg.py CHANGED
@@ -6,7 +6,7 @@ config = Config(
     executors=[
         HighThroughputExecutor(
             label='OSG_HTEX',
-            max_workers=1,
+            max_workers_per_node=1,
             provider=CondorProvider(
                 nodes_per_block=1,
                 init_blocks=4,
parsl/configs/stampede2.py CHANGED
@@ -11,7 +11,7 @@ config = Config(
         HighThroughputExecutor(
             label='Stampede2_HTEX',
             address=address_by_interface('em3'),
-            max_workers=2,
+            max_workers_per_node=2,
             provider=SlurmProvider(
                 nodes_per_block=2,
                 init_blocks=1,
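Every configuration change above is the same mechanical rename: the executor keyword max_workers becomes max_workers_per_node with its value unchanged. A minimal sketch of an updated user config (the label and worker count below are illustrative, not taken from any file in this diff):

    from parsl.config import Config
    from parsl.executors import HighThroughputExecutor

    config = Config(
        executors=[
            HighThroughputExecutor(
                label='htex_example',       # illustrative label
                max_workers_per_node=2,     # was: max_workers=2
            ),
        ],
    )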
parsl/dataflow/dflow.py CHANGED
@@ -95,7 +95,7 @@ class DataFlowKernel:
         self.checkpoint_lock = threading.Lock()
 
         self.usage_tracker = UsageTracker(self)
-        self.usage_tracker.send_message()
+        self.usage_tracker.send_start_message()
 
         self.task_state_counts_lock = threading.Lock()
         self.task_state_counts = {state: 0 for state in States}
@@ -722,7 +722,10 @@ class DataFlowKernel:
         self._send_task_log_info(task_record)
 
         if hasattr(exec_fu, "parsl_executor_task_id"):
-            logger.info(f"Parsl task {task_id} try {try_id} launched on executor {executor.label} with executor id {exec_fu.parsl_executor_task_id}")
+            logger.info(
+                f"Parsl task {task_id} try {try_id} launched on executor {executor.label} "
+                f"with executor id {exec_fu.parsl_executor_task_id}")
+
         else:
             logger.info(f"Parsl task {task_id} try {try_id} launched on executor {executor.label}")
 
@@ -730,7 +733,8 @@ class DataFlowKernel:
 
         return exec_fu
 
-    def _add_input_deps(self, executor: str, args: Sequence[Any], kwargs: Dict[str, Any], func: Callable) -> Tuple[Sequence[Any], Dict[str, Any], Callable]:
+    def _add_input_deps(self, executor: str, args: Sequence[Any], kwargs: Dict[str, Any], func: Callable) -> Tuple[Sequence[Any], Dict[str, Any],
+                                                                                                                    Callable]:
         """Look for inputs of the app that are files. Give the data manager
         the opportunity to replace a file with a data future for that file,
         for example wrapping the result of a staging action.
@@ -1142,8 +1146,9 @@ class DataFlowKernel:
 
     def atexit_cleanup(self) -> None:
         if not self.cleanup_called:
-            logger.info("DFK cleanup because python process is exiting")
-            self.cleanup()
+            logger.warning("Python is exiting with a DFK still running. "
+                           "You should call parsl.dfk().cleanup() before "
+                           "exiting to release any resources")
         else:
            logger.info("python process is exiting, but DFK has already been cleaned up")
 
@@ -1200,7 +1205,7 @@ class DataFlowKernel:
             self._checkpoint_timer.close()
 
         # Send final stats
-        self.usage_tracker.send_message()
+        self.usage_tracker.send_end_message()
        self.usage_tracker.close()
 
         logger.info("Closing job status poller")
parsl/dataflow/taskrecord.py CHANGED
@@ -70,7 +70,9 @@ class TaskRecord(TypedDict, total=False):
     # these three could be more strongly typed perhaps but I'm not thinking about that now
     func: Callable
     fn_hash: str
-    args: Sequence[Any]  # in some places we uses a Tuple[Any, ...] and in some places a List[Any]. This is an attempt to correctly type both of those.
+    args: Sequence[Any]
+    # in some places we uses a Tuple[Any, ...] and in some places a List[Any].
+    # This is an attempt to correctly type both of those.
     kwargs: Dict[str, Any]
 
     time_invoked: Optional[datetime.datetime]
parsl/executors/high_throughput/executor.py CHANGED
@@ -6,10 +6,12 @@ import threading
 import queue
 import datetime
 import pickle
+from dataclasses import dataclass
 from multiprocessing import Process, Queue
 from typing import Dict, Sequence
 from typing import List, Optional, Tuple, Union, Callable
 import math
+import warnings
 
 import parsl.launchers
 from parsl.serialize import pack_res_spec_apply_message, deserialize
@@ -39,7 +41,7 @@ from parsl.providers import LocalProvider
 
 logger = logging.getLogger(__name__)
 
-DEFAULT_LAUNCH_CMD = ("process_worker_pool.py {debug} {max_workers} "
+DEFAULT_LAUNCH_CMD = ("process_worker_pool.py {debug} {max_workers_per_node} "
                       "-a {addresses} "
                       "-p {prefetch_capacity} "
                       "-c {cores_per_worker} "
@@ -154,7 +156,10 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin):
         the there's sufficient memory for each worker. Default: None
 
     max_workers : int
-        Caps the number of workers launched per node. Default: infinity
+        Deprecated. Please use max_workers_per_node instead.
+
+    max_workers_per_node : int
+        Caps the number of workers launched per node. Default: None
 
     cpu_affinity: string
         Whether or how each worker process sets thread affinity. Options include "none" to forgo
@@ -228,7 +233,8 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin):
                  worker_debug: bool = False,
                  cores_per_worker: float = 1.0,
                  mem_per_worker: Optional[float] = None,
-                 max_workers: Union[int, float] = float('inf'),
+                 max_workers: Optional[Union[int, float]] = None,
+                 max_workers_per_node: Optional[Union[int, float]] = None,
                  cpu_affinity: str = 'none',
                  available_accelerators: Union[int, Sequence[str]] = (),
                  prefetch_capacity: int = 0,
@@ -251,7 +257,6 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin):
         self.working_dir = working_dir
         self.cores_per_worker = cores_per_worker
         self.mem_per_worker = mem_per_worker
-        self.max_workers = max_workers
         self.prefetch_capacity = prefetch_capacity
         self.address = address
         self.address_probe_timeout = address_probe_timeout
@@ -260,8 +265,12 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin):
         else:
             self.all_addresses = ','.join(get_all_addresses())
 
-        mem_slots = max_workers
-        cpu_slots = max_workers
+        if max_workers:
+            self._warn_deprecated("max_workers", "max_workers_per_node")
+        self.max_workers_per_node = max_workers_per_node or max_workers or float("inf")
+
+        mem_slots = self.max_workers_per_node
+        cpu_slots = self.max_workers_per_node
         if hasattr(self.provider, 'mem_per_node') and \
            self.provider.mem_per_node is not None and \
            mem_per_worker is not None and \
@@ -278,7 +287,7 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin):
         self.available_accelerators = list(available_accelerators)
 
         # Determine the number of workers per node
-        self._workers_per_node = min(max_workers, mem_slots, cpu_slots)
+        self._workers_per_node = min(self.max_workers_per_node, mem_slots, cpu_slots)
         if len(self.available_accelerators) > 0:
             self._workers_per_node = min(self._workers_per_node, len(available_accelerators))
         if self._workers_per_node == float('inf'):
@@ -316,6 +325,24 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin):
 
     radio_mode = "htex"
 
+    def _warn_deprecated(self, old: str, new: str):
+        warnings.warn(
+            f"{old} is deprecated and will be removed in a future release. "
+            f"Please use {new} instead.",
+            DeprecationWarning,
+            stacklevel=2
+        )
+
+    @property
+    def max_workers(self):
+        self._warn_deprecated("max_workers", "max_workers_per_node")
+        return self.max_workers_per_node
+
+    @max_workers.setter
+    def max_workers(self, val: Union[int, float]):
+        self._warn_deprecated("max_workers", "max_workers_per_node")
+        self.max_workers_per_node = val
+
     @property
     def logdir(self):
         return "{}/{}".format(self.run_dir, self.label)
@@ -330,7 +357,7 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin):
         """Compose the launch command and scale out the initial blocks.
         """
         debug_opts = "--debug" if self.worker_debug else ""
-        max_workers = "" if self.max_workers == float('inf') else "--max_workers={}".format(self.max_workers)
+        max_workers_per_node = "" if self.max_workers_per_node == float('inf') else "--max_workers_per_node={}".format(self.max_workers_per_node)
         enable_mpi_opts = "--enable_mpi_mode " if self.enable_mpi_mode else ""
 
         address_probe_timeout_string = ""
@@ -345,7 +372,7 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin):
                                        result_port=self.worker_result_port,
                                        cores_per_worker=self.cores_per_worker,
                                        mem_per_worker=self.mem_per_worker,
-                                       max_workers=max_workers,
+                                       max_workers_per_node=max_workers_per_node,
                                        nodes_per_block=self.provider.nodes_per_block,
                                        heartbeat_period=self.heartbeat_period,
                                        heartbeat_threshold=self.heartbeat_threshold,
@@ -602,8 +629,8 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin):
         """Submits work to the outgoing_q.
 
         The outgoing_q is an external process listens on this
-        queue for new work. This method behaves like a
-        submit call as described here `Python docs: <https://docs.python.org/3/library/concurrent.futures.html#concurrent.futures.ThreadPoolExecutor>`_
+        queue for new work. This method behaves like a submit call as described here `Python docs: <https://docs.python.org/3/
+        library/concurrent.futures.html#concurrent.futures.ThreadPoolExecutor>`_
 
         Args:
             - func (callable) : Callable function
@@ -668,7 +695,7 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin):
     def workers_per_node(self) -> Union[int, float]:
         return self._workers_per_node
 
-    def scale_in(self, blocks, force=True, max_idletime=None):
+    def scale_in(self, blocks: int, max_idletime: Optional[float] = None) -> List[str]:
         """Scale in the number of active blocks by specified amount.
 
         The scale in method here is very rude. It doesn't give the workers
@@ -681,49 +708,54 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin):
         blocks : int
             Number of blocks to terminate and scale_in by
 
-        force : Bool
-            Used along with blocks to indicate whether blocks should be terminated by force.
-
-            When force = True, we will kill blocks regardless of the blocks being busy
+        max_idletime: float
+            A time to indicate how long a block should be idle to be a
+            candidate for scaling in.
 
-            When force = False, only idle blocks will be terminated. If the
-            number of idle blocks < ``blocks``, then fewer than ``blocks``
-            blocks will be terminated.
+            If None then blocks will be force scaled in even if they are busy.
 
-        max_idletime: float
-            A time to indicate how long a block can be idle.
-            Used along with force = False to kill blocks that have been idle for that long.
+            If a float, then only idle blocks will be terminated, which may be less than
+            the requested number.
 
         Returns
         -------
         List of block IDs scaled in
         """
         logger.debug(f"Scale in called, blocks={blocks}")
+
+        @dataclass
+        class BlockInfo:
+            tasks: int  # sum of tasks in this block
+            idle: float  # shortest idle time of any manager in this block
+
         managers = self.connected_managers()
-        block_info = {}  # block id -> list( tasks, idle duration )
+        block_info: Dict[str, BlockInfo] = {}
         for manager in managers:
             if not manager['active']:
                 continue
             b_id = manager['block_id']
             if b_id not in block_info:
-                block_info[b_id] = [0, float('inf')]
-            block_info[b_id][0] += manager['tasks']
-            block_info[b_id][1] = min(block_info[b_id][1], manager['idle_duration'])
+                block_info[b_id] = BlockInfo(tasks=0, idle=float('inf'))
+            block_info[b_id].tasks += manager['tasks']
+            block_info[b_id].idle = min(block_info[b_id].idle, manager['idle_duration'])
+
+        # The scaling policy is that longest idle blocks should be scaled down
+        # in preference to least idle (most recently used) blocks.
+        # Other policies could be implemented here.
+
+        sorted_blocks = sorted(block_info.items(), key=lambda item: (-item[1].idle, item[1].tasks))
 
-        sorted_blocks = sorted(block_info.items(), key=lambda item: (item[1][1], item[1][0]))
         logger.debug(f"Scale in selecting from {len(sorted_blocks)} blocks")
-        if force is True:
+        if max_idletime is None:
             block_ids_to_kill = [x[0] for x in sorted_blocks[:blocks]]
         else:
-            if not max_idletime:
-                block_ids_to_kill = [x[0] for x in sorted_blocks if x[1][0] == 0][:blocks]
-            else:
-                block_ids_to_kill = []
-                for x in sorted_blocks:
-                    if x[1][1] > max_idletime and x[1][0] == 0:
-                        block_ids_to_kill.append(x[0])
-                    if len(block_ids_to_kill) == blocks:
-                        break
+            block_ids_to_kill = []
+            for x in sorted_blocks:
+                if x[1].idle > max_idletime and x[1].tasks == 0:
+                    block_ids_to_kill.append(x[0])
+                if len(block_ids_to_kill) == blocks:
+                    break
+
         logger.debug("Selected idle block ids to kill: {}".format(
             block_ids_to_kill))
         if len(block_ids_to_kill) < blocks:
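The selection policy introduced above (prefer the longest-idle blocks; when max_idletime is given, take only blocks that are idle and past that threshold) can be seen in isolation in this standalone sketch, which mirrors the BlockInfo shape and sort key from the diff (block IDs and timings are invented):

    from dataclasses import dataclass

    @dataclass
    class BlockInfo:
        tasks: int   # sum of tasks across managers in the block
        idle: float  # shortest idle time of any manager in the block

    block_info = {'0': BlockInfo(tasks=0, idle=300.0),
                  '1': BlockInfo(tasks=5, idle=0.0),
                  '2': BlockInfo(tasks=0, idle=30.0)}

    # Longest-idle blocks sort first; busier blocks sort later.
    sorted_blocks = sorted(block_info.items(), key=lambda item: (-item[1].idle, item[1].tasks))

    max_idletime = 60.0
    to_kill = [b_id for b_id, info in sorted_blocks
               if info.idle > max_idletime and info.tasks == 0][:1]
    print(to_kill)   # ['0']: only block 0 has no tasks and has been idle longer than max_idletime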
parsl/executors/high_throughput/interchange.py CHANGED
@@ -27,7 +27,6 @@ from parsl.monitoring.message_type import MessageType
 from parsl.process_loggers import wrap_with_logs
 
 
-HEARTBEAT_CODE = (2 ** 32) - 1
 PKL_HEARTBEAT_CODE = pickle.dumps((2 ** 32) - 1)
 LOGGER_NAME = "interchange"
 
@@ -393,71 +392,85 @@ class Interchange:
         logger.info("Processed {} tasks in {} seconds".format(self.count, delta))
         logger.warning("Exiting")
 
-    def process_task_outgoing_incoming(self, interesting_managers: Set[bytes], hub_channel: Optional[zmq.Socket], kill_event: threading.Event) -> None:
-        # Listen for registrations and heartbeats
+    def process_task_outgoing_incoming(
+            self,
+            interesting_managers: Set[bytes],
+            hub_channel: Optional[zmq.Socket],
+            kill_event: threading.Event
+    ) -> None:
+        """Process one message from manager on the task_outgoing channel.
+        Note that this message flow is in contradiction to the name of the
+        channel - it is not an outgoing message and it is not a task.
+        """
         if self.task_outgoing in self.socks and self.socks[self.task_outgoing] == zmq.POLLIN:
             logger.debug("starting task_outgoing section")
             message = self.task_outgoing.recv_multipart()
             manager_id = message[0]
 
-            if manager_id not in self._ready_managers:
-                reg_flag = False
+            try:
+                msg = json.loads(message[1].decode('utf-8'))
+            except Exception:
+                logger.warning("Got Exception reading message from manager: {!r}".format(
+                    manager_id), exc_info=True)
+                logger.debug("Message: \n{!r}\n".format(message[1]))
+                return
+
+            # perform a bit of validation on the structure of the deserialized
+            # object, at least enough to behave like a deserialization error
+            # in obviously malformed cases
+            if not isinstance(msg, dict) or 'type' not in msg:
+                logger.error(f"JSON message was not correctly formatted from manager: {manager_id!r}")
+                logger.debug("Message: \n{!r}\n".format(message[1]))
+                return
+
+            if msg['type'] == 'registration':
+                # We set up an entry only if registration works correctly
+                self._ready_managers[manager_id] = {'last_heartbeat': time.time(),
+                                                    'idle_since': time.time(),
+                                                    'block_id': None,
+                                                    'max_capacity': 0,
+                                                    'worker_count': 0,
+                                                    'active': True,
+                                                    'tasks': []}
+                self.connected_block_history.append(msg['block_id'])
+
+                interesting_managers.add(manager_id)
+                logger.info("Adding manager: {!r} to ready queue".format(manager_id))
+                m = self._ready_managers[manager_id]
 
-                try:
-                    msg = json.loads(message[1].decode('utf-8'))
-                    reg_flag = True
-                except Exception:
-                    logger.warning("Got Exception reading registration message from manager: {!r}".format(
-                        manager_id), exc_info=True)
-                    logger.debug("Message: \n{!r}\n".format(message[1]))
-                else:
-                    # We set up an entry only if registration works correctly
-                    self._ready_managers[manager_id] = {'last_heartbeat': time.time(),
-                                                        'idle_since': time.time(),
-                                                        'block_id': None,
-                                                        'max_capacity': 0,
-                                                        'worker_count': 0,
-                                                        'active': True,
-                                                        'tasks': []}
-                    self.connected_block_history.append(msg['block_id'])
-                if reg_flag is True:
-                    interesting_managers.add(manager_id)
-                    logger.info("Adding manager: {!r} to ready queue".format(manager_id))
-                    m = self._ready_managers[manager_id]
-                    m.update(msg)
-                    logger.info("Registration info for manager {!r}: {}".format(manager_id, msg))
-                    self._send_monitoring_info(hub_channel, m)
-
-                    if (msg['python_v'].rsplit(".", 1)[0] != self.current_platform['python_v'].rsplit(".", 1)[0] or
-                            msg['parsl_v'] != self.current_platform['parsl_v']):
-                        logger.error("Manager {!r} has incompatible version info with the interchange".format(manager_id))
-                        logger.debug("Setting kill event")
-                        kill_event.set()
-                        e = VersionMismatch("py.v={} parsl.v={}".format(self.current_platform['python_v'].rsplit(".", 1)[0],
-                                                                        self.current_platform['parsl_v']),
-                                            "py.v={} parsl.v={}".format(msg['python_v'].rsplit(".", 1)[0],
-                                                                        msg['parsl_v'])
-                                            )
-                        result_package = {'type': 'result', 'task_id': -1, 'exception': serialize_object(e)}
-                        pkl_package = pickle.dumps(result_package)
-                        self.results_outgoing.send(pkl_package)
-                        logger.error("Sent failure reports, shutting down interchange")
-                    else:
-                        logger.info("Manager {!r} has compatible Parsl version {}".format(manager_id, msg['parsl_v']))
-                        logger.info("Manager {!r} has compatible Python version {}".format(manager_id,
-                                                                                           msg['python_v'].rsplit(".", 1)[0]))
-                else:
-                    # Registration has failed.
-                    logger.debug("Suppressing bad registration from manager: {!r}".format(manager_id))
+                # m is a ManagerRecord, but msg is a dict[Any,Any] and so can
+                # contain arbitrary fields beyond those in ManagerRecord (and
+                # indeed does - for example, python_v) which are then ignored
+                # later.
+                m.update(msg)  # type: ignore[typeddict-item]
 
-            else:
-                heartbeat = int.from_bytes(message[1], "little")
-                self._ready_managers[manager_id]['last_heartbeat'] = time.time()
-                if heartbeat == HEARTBEAT_CODE:
-                    logger.debug("Manager {!r} sent heartbeat via tasks connection".format(manager_id))
-                    self.task_outgoing.send_multipart([manager_id, b'', PKL_HEARTBEAT_CODE])
+                logger.info("Registration info for manager {!r}: {}".format(manager_id, msg))
+                self._send_monitoring_info(hub_channel, m)
+
+                if (msg['python_v'].rsplit(".", 1)[0] != self.current_platform['python_v'].rsplit(".", 1)[0] or
+                        msg['parsl_v'] != self.current_platform['parsl_v']):
+                    logger.error("Manager {!r} has incompatible version info with the interchange".format(manager_id))
+                    logger.debug("Setting kill event")
+                    kill_event.set()
+                    e = VersionMismatch("py.v={} parsl.v={}".format(self.current_platform['python_v'].rsplit(".", 1)[0],
+                                                                    self.current_platform['parsl_v']),
+                                        "py.v={} parsl.v={}".format(msg['python_v'].rsplit(".", 1)[0],
+                                                                    msg['parsl_v'])
+                                        )
+                    result_package = {'type': 'result', 'task_id': -1, 'exception': serialize_object(e)}
+                    pkl_package = pickle.dumps(result_package)
+                    self.results_outgoing.send(pkl_package)
+                    logger.error("Sent failure reports, shutting down interchange")
                 else:
-                    logger.error("Unexpected non-heartbeat message received from manager {}")
+                    logger.info("Manager {!r} has compatible Parsl version {}".format(manager_id, msg['parsl_v']))
+                    logger.info("Manager {!r} has compatible Python version {}".format(manager_id,
+                                                                                       msg['python_v'].rsplit(".", 1)[0]))
+            elif msg['type'] == 'heartbeat':
+                self._ready_managers[manager_id]['last_heartbeat'] = time.time()
+                logger.debug("Manager {!r} sent heartbeat via tasks connection".format(manager_id))
+                self.task_outgoing.send_multipart([manager_id, b'', PKL_HEARTBEAT_CODE])
+            else:
+                logger.error(f"Unexpected message type received from manager: {msg['type']}")
             logger.debug("leaving task_outgoing section")
 
     def process_tasks_to_send(self, interesting_managers: Set[bytes]) -> None:
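The rework above replaces the old magic-integer heartbeat with JSON messages dispatched on a 'type' field, after a deserialization and shape check. A minimal standalone sketch of that validate-then-dispatch pattern (the message payloads and return strings are invented for illustration):

    import json

    def dispatch(raw: bytes) -> str:
        try:
            msg = json.loads(raw.decode('utf-8'))
        except Exception:
            return "undeserializable message"
        if not isinstance(msg, dict) or 'type' not in msg:
            return "malformed message"
        if msg['type'] == 'registration':
            return "register manager"
        elif msg['type'] == 'heartbeat':
            return "refresh last_heartbeat"
        else:
            return "unexpected message type"

    print(dispatch(b'{"type": "heartbeat"}'))   # refresh last_heartbeat
    print(dispatch(b'not json'))                # undeserializable message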
@@ -613,7 +626,13 @@ def start_file_logger(filename: str, level: int = logging.DEBUG, format_string:
         None.
     """
     if format_string is None:
-        format_string = "%(asctime)s.%(msecs)03d %(name)s:%(lineno)d %(processName)s(%(process)d) %(threadName)s %(funcName)s [%(levelname)s] %(message)s"
+        format_string = (
+
+            "%(asctime)s.%(msecs)03d %(name)s:%(lineno)d "
+            "%(processName)s(%(process)d) %(threadName)s "
+            "%(funcName)s [%(levelname)s] %(message)s"
+
+        )
 
     global logger
     logger = logging.getLogger(LOGGER_NAME)
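The multi-line replacement relies on Python's implicit concatenation of adjacent string literals, so the logging format string is unchanged; a quick check:

    single = ("%(asctime)s.%(msecs)03d %(name)s:%(lineno)d %(processName)s(%(process)d) "
              "%(threadName)s %(funcName)s [%(levelname)s] %(message)s")
    multi = (
        "%(asctime)s.%(msecs)03d %(name)s:%(lineno)d "
        "%(processName)s(%(process)d) %(threadName)s "
        "%(funcName)s [%(levelname)s] %(message)s"
    )
    assert single == multi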