parsl 2024.5.13__py3-none-any.whl → 2024.5.27__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67)
  1. parsl/channels/base.py +2 -9
  2. parsl/channels/local/local.py +3 -6
  3. parsl/channels/oauth_ssh/oauth_ssh.py +2 -2
  4. parsl/channels/ssh/ssh.py +2 -2
  5. parsl/config.py +7 -1
  6. parsl/dataflow/dependency_resolvers.py +115 -0
  7. parsl/dataflow/dflow.py +45 -39
  8. parsl/executors/__init__.py +2 -0
  9. parsl/executors/base.py +7 -7
  10. parsl/executors/high_throughput/errors.py +10 -0
  11. parsl/executors/high_throughput/executor.py +85 -84
  12. parsl/executors/high_throughput/interchange.py +6 -5
  13. parsl/executors/high_throughput/mpi_executor.py +85 -0
  14. parsl/executors/high_throughput/mpi_prefix_composer.py +18 -2
  15. parsl/executors/high_throughput/mpi_resource_management.py +3 -0
  16. parsl/executors/high_throughput/zmq_pipes.py +36 -2
  17. parsl/executors/radical/rpex_resources.py +3 -7
  18. parsl/monitoring/remote.py +18 -24
  19. parsl/providers/local/local.py +1 -1
  20. parsl/tests/conftest.py +2 -2
  21. parsl/tests/sites/test_dynamic_executor.py +0 -1
  22. parsl/tests/test_bash_apps/test_std_uri.py +0 -6
  23. parsl/tests/test_checkpointing/test_periodic.py +2 -7
  24. parsl/tests/test_checkpointing/test_python_checkpoint_2.py +0 -1
  25. parsl/tests/test_checkpointing/test_python_checkpoint_3.py +0 -1
  26. parsl/tests/test_checkpointing/test_task_exit.py +0 -1
  27. parsl/tests/test_htex/test_basic.py +0 -1
  28. parsl/tests/test_htex/test_command_client_timeout.py +69 -0
  29. parsl/tests/test_htex/test_cpu_affinity_explicit.py +1 -8
  30. parsl/tests/test_htex/test_manager_failure.py +0 -1
  31. parsl/tests/test_htex/test_managers_command.py +2 -7
  32. parsl/tests/test_htex/test_missing_worker.py +2 -8
  33. parsl/tests/test_monitoring/test_app_names.py +0 -1
  34. parsl/tests/test_monitoring/test_basic.py +0 -2
  35. parsl/tests/test_monitoring/test_db_locks.py +0 -1
  36. parsl/tests/test_monitoring/test_fuzz_zmq.py +0 -1
  37. parsl/tests/test_monitoring/test_htex_init_blocks_vs_monitoring.py +0 -2
  38. parsl/tests/test_monitoring/test_incomplete_futures.py +0 -1
  39. parsl/tests/test_monitoring/test_memoization_representation.py +0 -1
  40. parsl/tests/test_monitoring/test_stdouterr.py +0 -2
  41. parsl/tests/test_mpi_apps/test_bad_mpi_config.py +6 -14
  42. parsl/tests/test_mpi_apps/test_mpi_mode_disabled.py +2 -8
  43. parsl/tests/test_mpi_apps/test_mpi_mode_enabled.py +10 -1
  44. parsl/tests/test_mpi_apps/test_mpiex.py +64 -0
  45. parsl/tests/test_mpi_apps/test_resource_spec.py +14 -9
  46. parsl/tests/test_python_apps/test_context_manager.py +1 -9
  47. parsl/tests/test_python_apps/test_lifted.py +10 -6
  48. parsl/tests/test_python_apps/test_pluggable_future_resolution.py +161 -0
  49. parsl/tests/test_scaling/test_regression_1621.py +0 -2
  50. parsl/tests/test_scaling/test_shutdown_scalein.py +0 -2
  51. parsl/tests/test_serialization/test_proxystore_configured.py +0 -1
  52. parsl/tests/test_shutdown/test_kill_monitoring.py +0 -2
  53. parsl/tests/test_staging/test_1316.py +0 -2
  54. parsl/tests/test_staging/test_elaborate_noop_file.py +0 -1
  55. parsl/tests/test_summary.py +0 -1
  56. parsl/tests/test_threads/test_configs.py +0 -1
  57. parsl/tests/test_threads/test_lazy_errors.py +0 -1
  58. parsl/version.py +1 -1
  59. {parsl-2024.5.13.dist-info → parsl-2024.5.27.dist-info}/METADATA +6 -4
  60. {parsl-2024.5.13.dist-info → parsl-2024.5.27.dist-info}/RECORD +67 -62
  61. {parsl-2024.5.13.data → parsl-2024.5.27.data}/scripts/exec_parsl_function.py +0 -0
  62. {parsl-2024.5.13.data → parsl-2024.5.27.data}/scripts/parsl_coprocess.py +0 -0
  63. {parsl-2024.5.13.data → parsl-2024.5.27.data}/scripts/process_worker_pool.py +0 -0
  64. {parsl-2024.5.13.dist-info → parsl-2024.5.27.dist-info}/LICENSE +0 -0
  65. {parsl-2024.5.13.dist-info → parsl-2024.5.27.dist-info}/WHEEL +0 -0
  66. {parsl-2024.5.13.dist-info → parsl-2024.5.27.dist-info}/entry_points.txt +0 -0
  67. {parsl-2024.5.13.dist-info → parsl-2024.5.27.dist-info}/top_level.txt +0 -0
parsl/channels/base.py CHANGED
@@ -89,15 +89,8 @@ class Channel(metaclass=ABCMeta):
89
89
  pass
90
90
 
91
91
  @abstractmethod
92
- def close(self) -> bool:
93
- ''' Closes the channel. Clean out any auth credentials.
94
-
95
- Args:
96
- None
97
-
98
- Returns:
99
- Bool
100
-
92
+ def close(self) -> None:
93
+ ''' Closes the channel.
101
94
  '''
102
95
  pass
103
96
 
@@ -107,13 +107,10 @@ class LocalChannel(Channel, RepresentationMixin):
107
107
  def pull_file(self, remote_source, local_dir):
108
108
  return self.push_file(remote_source, local_dir)
109
109
 
110
- def close(self):
111
- ''' There's nothing to close here, and this really doesn't do anything
112
-
113
- Returns:
114
- - False, because it really did not "close" this channel.
110
+ def close(self) -> None:
111
+ ''' There's nothing to close here, and so this doesn't do anything
115
112
  '''
116
- return False
113
+ pass
117
114
 
118
115
  def isdir(self, path):
119
116
  """Return true if the path refers to an existing directory.
@@ -106,5 +106,5 @@ class OAuthSSHChannel(SSHChannel):
106
106
 
107
107
  return exit_status, stdout, stderr
108
108
 
109
- def close(self):
110
- return self.transport.close()
109
+ def close(self) -> None:
110
+ self.transport.close()
parsl/channels/ssh/ssh.py CHANGED
@@ -217,9 +217,9 @@ class SSHChannel(Channel, RepresentationMixin):
217
217
 
218
218
  return local_dest
219
219
 
220
- def close(self):
220
+ def close(self) -> None:
221
221
  if self._is_connected():
222
- return self.ssh_client.close()
222
+ self.ssh_client.close()
223
223
 
224
224
  def isdir(self, path):
225
225
  """Return true if the path refers to an existing directory.
parsl/config.py CHANGED
@@ -5,6 +5,7 @@ from typing import Callable, Iterable, Optional, Sequence, Union
5
5
  from typing_extensions import Literal
6
6
 
7
7
  from parsl.utils import RepresentationMixin
8
+ from parsl.dataflow.dependency_resolvers import DependencyResolver
8
9
  from parsl.executors.base import ParslExecutor
9
10
  from parsl.executors.threads import ThreadPoolExecutor
10
11
  from parsl.errors import ConfigurationError
@@ -35,6 +36,8 @@ class Config(RepresentationMixin, UsageInformation):
35
36
  checkpoint_period : str, optional
36
37
  Time interval (in "HH:MM:SS") at which to checkpoint completed tasks. Only has an effect if
37
38
  ``checkpoint_mode='periodic'``.
39
+ dependency_resolver: plugin point for custom dependency resolvers. Default: only resolve Futures,
40
+ using the `SHALLOW_DEPENDENCY_RESOLVER`.
38
41
  garbage_collect : bool. optional.
39
42
  Delete task records from DFK when tasks have completed. Default: True
40
43
  internal_tasks_max_threads : int, optional
@@ -88,6 +91,7 @@ class Config(RepresentationMixin, UsageInformation):
88
91
  Literal['dfk_exit'],
89
92
  Literal['manual']] = None,
90
93
  checkpoint_period: Optional[str] = None,
94
+ dependency_resolver: Optional[DependencyResolver] = None,
91
95
  garbage_collect: bool = True,
92
96
  internal_tasks_max_threads: int = 10,
93
97
  retries: int = 0,
@@ -123,6 +127,7 @@ class Config(RepresentationMixin, UsageInformation):
123
127
  if checkpoint_mode == 'periodic' and checkpoint_period is None:
124
128
  checkpoint_period = "00:30:00"
125
129
  self.checkpoint_period = checkpoint_period
130
+ self.dependency_resolver = dependency_resolver
126
131
  self.garbage_collect = garbage_collect
127
132
  self.internal_tasks_max_threads = internal_tasks_max_threads
128
133
  self.retries = retries
@@ -152,4 +157,5 @@ class Config(RepresentationMixin, UsageInformation):
152
157
  ', '.join(['label={}'.format(repr(d)) for d in duplicates])))
153
158
 
154
159
  def get_usage_information(self):
155
- return {"executors_len": len(self.executors)}
160
+ return {"executors_len": len(self.executors),
161
+ "dependency_resolver": self.dependency_resolver is not None}
@@ -0,0 +1,115 @@
1
+ from concurrent.futures import Future
2
+ from dataclasses import dataclass
3
+ from functools import singledispatch
4
+ from typing import Callable, Sequence
5
+
6
+
7
+ @dataclass
8
+ class DependencyResolver:
9
+ """A DependencyResolver describes how app dependencies can be resolved.
10
+ It is specified as two functions: `traverse_to_gather` which turns an
11
+ app parameter into a sequence of futures which must be waited for before
12
+ the task can be executed (for example, in the case of
13
+ `DEEP_DEPENDENCY_RESOLVER` this traverses structures such as lists to
14
+ find every contained ``Future``), and `traverse_to_unwrap` which turns an
15
+ app parameter into its value to be passed to the app on execution
16
+ (for example in the case of `DEEP_DEPENDENCY_RESOLVER` this replaces a
17
+ list containing futures with a new list containing the values of those
18
+ resolved futures).
19
+
20
+ By default, Parsl will use `SHALLOW_DEPENDENCY_RESOLVER` which only
21
+ resolves Futures passed directly as arguments.
22
+ """
23
+ traverse_to_gather: Callable[[object], Sequence[Future]]
24
+ traverse_to_unwrap: Callable[[object], object]
25
+
26
+
27
+ @singledispatch
28
+ def shallow_traverse_to_gather(o):
29
+ # objects in general do not expose futures that we can see
30
+ return []
31
+
32
+
33
+ @singledispatch
34
+ def shallow_traverse_to_unwrap(o):
35
+ # objects in general unwrap to themselves
36
+ return o
37
+
38
+
39
+ @shallow_traverse_to_gather.register
40
+ def _(fut: Future):
41
+ return [fut]
42
+
43
+
44
+ @shallow_traverse_to_unwrap.register
45
+ @singledispatch
46
+ def _(fut: Future):
47
+ assert fut.done()
48
+ return fut.result()
49
+
50
+
51
+ @singledispatch
52
+ def deep_traverse_to_gather(o):
53
+ # objects in general do not expose futures that we can see
54
+ return []
55
+
56
+
57
+ @singledispatch
58
+ def deep_traverse_to_unwrap(o):
59
+ # objects in general unwrap to themselves
60
+ return o
61
+
62
+
63
+ @deep_traverse_to_gather.register
64
+ def _(fut: Future):
65
+ return [fut]
66
+
67
+
68
+ @deep_traverse_to_unwrap.register
69
+ @singledispatch
70
+ def _(fut: Future):
71
+ assert fut.done()
72
+ return fut.result()
73
+
74
+
75
+ @deep_traverse_to_gather.register(tuple)
76
+ @deep_traverse_to_gather.register(list)
77
+ @deep_traverse_to_gather.register(set)
78
+ def _(iterable):
79
+ return [e for v in iterable for e in deep_traverse_to_gather(v)]
80
+
81
+
82
+ @deep_traverse_to_unwrap.register(tuple)
83
+ @deep_traverse_to_unwrap.register(list)
84
+ @deep_traverse_to_unwrap.register(set)
85
+ @singledispatch
86
+ def _(iterable):
87
+
88
+ type_ = type(iterable)
89
+ return type_(map(deep_traverse_to_unwrap, iterable))
90
+
91
+
92
+ @deep_traverse_to_gather.register(dict)
93
+ def _(dictionary):
94
+ futures = []
95
+ for key, value in dictionary.items():
96
+ futures.extend(deep_traverse_to_gather(key))
97
+ futures.extend(deep_traverse_to_gather(value))
98
+ return futures
99
+
100
+
101
+ @deep_traverse_to_unwrap.register(dict)
102
+ def _(dictionary):
103
+ unwrapped_dict = {}
104
+ for key, value in dictionary.items():
105
+ key = deep_traverse_to_unwrap(key)
106
+ value = deep_traverse_to_unwrap(value)
107
+ unwrapped_dict[key] = value
108
+ return unwrapped_dict
109
+
110
+
111
+ DEEP_DEPENDENCY_RESOLVER = DependencyResolver(traverse_to_gather=deep_traverse_to_gather,
112
+ traverse_to_unwrap=deep_traverse_to_unwrap)
113
+
114
+ SHALLOW_DEPENDENCY_RESOLVER = DependencyResolver(traverse_to_gather=shallow_traverse_to_gather,
115
+ traverse_to_unwrap=shallow_traverse_to_unwrap)
parsl/dataflow/dflow.py CHANGED
@@ -26,6 +26,7 @@ from parsl.channels import Channel
26
26
  from parsl.config import Config
27
27
  from parsl.data_provider.data_manager import DataManager
28
28
  from parsl.data_provider.files import File
29
+ from parsl.dataflow.dependency_resolvers import SHALLOW_DEPENDENCY_RESOLVER
29
30
  from parsl.dataflow.errors import BadCheckpoint, DependencyError, JoinError
30
31
  from parsl.dataflow.futures import AppFuture
31
32
  from parsl.dataflow.memoization import Memoizer
@@ -203,6 +204,9 @@ class DataFlowKernel:
203
204
  self.tasks: Dict[int, TaskRecord] = {}
204
205
  self.submitter_lock = threading.Lock()
205
206
 
207
+ self.dependency_resolver = self.config.dependency_resolver if self.config.dependency_resolver is not None \
208
+ else SHALLOW_DEPENDENCY_RESOLVER
209
+
206
210
  atexit.register(self.atexit_cleanup)
207
211
 
208
212
  def __enter__(self):
@@ -852,8 +856,11 @@ class DataFlowKernel:
852
856
  depends: List[Future] = []
853
857
 
854
858
  def check_dep(d: Any) -> None:
855
- if isinstance(d, Future):
856
- depends.extend([d])
859
+ try:
860
+ depends.extend(self.dependency_resolver.traverse_to_gather(d))
861
+ except Exception:
862
+ logger.exception("Exception in dependency_resolver.traverse_to_gather")
863
+ raise
857
864
 
858
865
  # Check the positional args
859
866
  for dep in args:
@@ -870,7 +877,8 @@ class DataFlowKernel:
870
877
 
871
878
  return depends
872
879
 
873
- def _unwrap_futures(self, args, kwargs):
880
+ def _unwrap_futures(self, args: Sequence[Any], kwargs: Dict[str, Any]) \
881
+ -> Tuple[Sequence[Any], Dict[str, Any], Sequence[Tuple[Exception, str]]]:
874
882
  """This function should be called when all dependencies have completed.
875
883
 
876
884
  It will rewrite the arguments for that task, replacing each Future
@@ -891,53 +899,40 @@ class DataFlowKernel:
891
899
  """
892
900
  dep_failures = []
893
901
 
902
+ def append_failure(e: Exception, dep: Future) -> None:
903
+ # If this Future is associated with a task inside this DFK,
904
+ # then refer to the task ID.
905
+ # Otherwise make a repr of the Future object.
906
+ if hasattr(dep, 'task_record') and dep.task_record['dfk'] == self:
907
+ tid = "task " + repr(dep.task_record['id'])
908
+ else:
909
+ tid = repr(dep)
910
+ dep_failures.extend([(e, tid)])
911
+
894
912
  # Replace item in args
895
913
  new_args = []
896
914
  for dep in args:
897
- if isinstance(dep, Future):
898
- try:
899
- new_args.extend([dep.result()])
900
- except Exception as e:
901
- # If this Future is associated with a task inside this DFK,
902
- # then refer to the task ID.
903
- # Otherwise make a repr of the Future object.
904
- if hasattr(dep, 'task_record') and dep.task_record['dfk'] == self:
905
- tid = "task " + repr(dep.task_record['id'])
906
- else:
907
- tid = repr(dep)
908
- dep_failures.extend([(e, tid)])
909
- else:
910
- new_args.extend([dep])
915
+ try:
916
+ new_args.extend([self.dependency_resolver.traverse_to_unwrap(dep)])
917
+ except Exception as e:
918
+ append_failure(e, dep)
911
919
 
912
920
  # Check for explicit kwargs ex, fu_1=<fut>
913
921
  for key in kwargs:
914
922
  dep = kwargs[key]
915
- if isinstance(dep, Future):
916
- try:
917
- kwargs[key] = dep.result()
918
- except Exception as e:
919
- if hasattr(dep, 'task_record'):
920
- tid = dep.task_record['id']
921
- else:
922
- tid = None
923
- dep_failures.extend([(e, tid)])
923
+ try:
924
+ kwargs[key] = self.dependency_resolver.traverse_to_unwrap(dep)
925
+ except Exception as e:
926
+ append_failure(e, dep)
924
927
 
925
928
  # Check for futures in inputs=[<fut>...]
926
929
  if 'inputs' in kwargs:
927
930
  new_inputs = []
928
931
  for dep in kwargs['inputs']:
929
- if isinstance(dep, Future):
930
- try:
931
- new_inputs.extend([dep.result()])
932
- except Exception as e:
933
- if hasattr(dep, 'task_record'):
934
- tid = dep.task_record['id']
935
- else:
936
- tid = None
937
- dep_failures.extend([(e, tid)])
938
-
939
- else:
940
- new_inputs.extend([dep])
932
+ try:
933
+ new_inputs.extend([self.dependency_resolver.traverse_to_unwrap(dep)])
934
+ except Exception as e:
935
+ append_failure(e, dep)
941
936
  kwargs['inputs'] = new_inputs
942
937
 
943
938
  return new_args, kwargs, dep_failures
@@ -1042,6 +1037,8 @@ class DataFlowKernel:
1042
1037
 
1043
1038
  func = self._add_output_deps(executor, app_args, app_kwargs, app_fu, func)
1044
1039
 
1040
+ logger.debug("Added output dependencies")
1041
+
1045
1042
  # Replace the function invocation in the TaskRecord with whatever file-staging
1046
1043
  # substitutions have been made.
1047
1044
  task_record.update({
@@ -1053,8 +1050,10 @@ class DataFlowKernel:
1053
1050
 
1054
1051
  self.tasks[task_id] = task_record
1055
1052
 
1053
+ logger.debug("Gathering dependencies")
1056
1054
  # Get the list of dependencies for the task
1057
1055
  depends = self._gather_all_deps(app_args, app_kwargs)
1056
+ logger.debug("Gathered dependencies")
1058
1057
  task_record['depends'] = depends
1059
1058
 
1060
1059
  depend_descs = []
@@ -1156,7 +1155,7 @@ class DataFlowKernel:
1156
1155
  executor.run_id = self.run_id
1157
1156
  executor.run_dir = self.run_dir
1158
1157
  executor.hub_address = self.hub_address
1159
- executor.hub_port = self.hub_zmq_port
1158
+ executor.hub_zmq_port = self.hub_zmq_port
1160
1159
  if self.monitoring:
1161
1160
  executor.monitoring_radio = self.monitoring.radio
1162
1161
  if hasattr(executor, 'provider'):
@@ -1271,6 +1270,13 @@ class DataFlowKernel:
1271
1270
  atexit.unregister(self.atexit_cleanup)
1272
1271
  logger.info("Unregistered atexit hook")
1273
1272
 
1273
+ if DataFlowKernelLoader._dfk is self:
1274
+ logger.info("Unregistering default DFK")
1275
+ parsl.clear()
1276
+ logger.info("Unregistered default DFK")
1277
+ else:
1278
+ logger.debug("Cleaning up non-default DFK - not unregistering")
1279
+
1274
1280
  logger.info("DFK cleanup complete")
1275
1281
 
1276
1282
  def checkpoint(self, tasks: Optional[Sequence[TaskRecord]] = None) -> str:
@@ -1,9 +1,11 @@
1
1
  from parsl.executors.threads import ThreadPoolExecutor
2
2
  from parsl.executors.workqueue.executor import WorkQueueExecutor
3
3
  from parsl.executors.high_throughput.executor import HighThroughputExecutor
4
+ from parsl.executors.high_throughput.mpi_executor import MPIExecutor
4
5
  from parsl.executors.flux.executor import FluxExecutor
5
6
 
6
7
  __all__ = ['ThreadPoolExecutor',
7
8
  'HighThroughputExecutor',
9
+ 'MPIExecutor',
8
10
  'WorkQueueExecutor',
9
11
  'FluxExecutor']
parsl/executors/base.py CHANGED
@@ -50,13 +50,13 @@ class ParslExecutor(metaclass=ABCMeta):
50
50
  self,
51
51
  *,
52
52
  hub_address: Optional[str] = None,
53
- hub_port: Optional[int] = None,
53
+ hub_zmq_port: Optional[int] = None,
54
54
  monitoring_radio: Optional[MonitoringRadio] = None,
55
55
  run_dir: str = ".",
56
56
  run_id: Optional[str] = None,
57
57
  ):
58
58
  self.hub_address = hub_address
59
- self.hub_port = hub_port
59
+ self.hub_zmq_port = hub_zmq_port
60
60
  self.monitoring_radio = monitoring_radio
61
61
  self.run_dir = os.path.abspath(run_dir)
62
62
  self.run_id = run_id
@@ -136,14 +136,14 @@ class ParslExecutor(metaclass=ABCMeta):
136
136
  self._hub_address = value
137
137
 
138
138
  @property
139
- def hub_port(self) -> Optional[int]:
139
+ def hub_zmq_port(self) -> Optional[int]:
140
140
  """Port to the Hub for monitoring.
141
141
  """
142
- return self._hub_port
142
+ return self._hub_zmq_port
143
143
 
144
- @hub_port.setter
145
- def hub_port(self, value: Optional[int]) -> None:
146
- self._hub_port = value
144
+ @hub_zmq_port.setter
145
+ def hub_zmq_port(self, value: Optional[int]) -> None:
146
+ self._hub_zmq_port = value
147
147
 
148
148
  @property
149
149
  def monitoring_radio(self) -> Optional[MonitoringRadio]:
@@ -10,3 +10,13 @@ class WorkerLost(Exception):
10
10
 
11
11
  def __str__(self):
12
12
  return self.__repr__()
13
+
14
+
15
+ class CommandClientTimeoutError(Exception):
16
+ """Raised when the command client times out waiting for a response.
17
+ """
18
+
19
+
20
+ class CommandClientBadError(Exception):
21
+ """Raised when the command client is bad from an earlier timeout.
22
+ """
@@ -62,47 +62,7 @@ DEFAULT_LAUNCH_CMD = ("process_worker_pool.py {debug} {max_workers_per_node} "
62
62
  "--mpi-launcher={mpi_launcher} "
63
63
  "--available-accelerators {accelerators}")
64
64
 
65
-
66
- class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin, UsageInformation):
67
- """Executor designed for cluster-scale
68
-
69
- The HighThroughputExecutor system has the following components:
70
- 1. The HighThroughputExecutor instance which is run as part of the Parsl script.
71
- 2. The Interchange which acts as a load-balancing proxy between workers and Parsl
72
- 3. The multiprocessing based worker pool which coordinates task execution over several
73
- cores on a node.
74
- 4. ZeroMQ pipes connect the HighThroughputExecutor, Interchange and the process_worker_pool
75
-
76
- Here is a diagram
77
-
78
- .. code:: python
79
-
80
-
81
- | Data | Executor | Interchange | External Process(es)
82
- | Flow | | |
83
- Task | Kernel | | |
84
- +----->|-------->|------------>|->outgoing_q---|-> process_worker_pool
85
- | | | | batching | | |
86
- Parsl<---Fut-| | | load-balancing| result exception
87
- ^ | | | watchdogs | | |
88
- | | | Q_mngmnt | | V V
89
- | | | Thread<--|-incoming_q<---|--- +---------+
90
- | | | | | |
91
- | | | | | |
92
- +----update_fut-----+
93
-
94
-
95
- Each of the workers in each process_worker_pool has access to its local rank through
96
- an environmental variable, ``PARSL_WORKER_RANK``. The local rank is unique for each process
97
- and is an integer in the range from 0 to the number of workers per in the pool minus 1.
98
- The workers also have access to the ID of the worker pool as ``PARSL_WORKER_POOL_ID``
99
- and the size of the worker pool as ``PARSL_WORKER_COUNT``.
100
-
101
-
102
- Parameters
103
- ----------
104
-
105
- provider : :class:`~parsl.providers.base.ExecutionProvider`
65
+ GENERAL_HTEX_PARAM_DOCS = """provider : :class:`~parsl.providers.base.ExecutionProvider`
106
66
  Provider to access computation resources. Can be one of :class:`~parsl.providers.aws.aws.EC2Provider`,
107
67
  :class:`~parsl.providers.cobalt.cobalt.Cobalt`,
108
68
  :class:`~parsl.providers.condor.condor.Condor`,
@@ -148,39 +108,6 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin, UsageIn
148
108
  worker_debug : Bool
149
109
  Enables worker debug logging.
150
110
 
151
- cores_per_worker : float
152
- cores to be assigned to each worker. Oversubscription is possible
153
- by setting cores_per_worker < 1.0. Default=1
154
-
155
- mem_per_worker : float
156
- GB of memory required per worker. If this option is specified, the node manager
157
- will check the available memory at startup and limit the number of workers such that
158
- the there's sufficient memory for each worker. Default: None
159
-
160
- max_workers : int
161
- Deprecated. Please use max_workers_per_node instead.
162
-
163
- max_workers_per_node : int
164
- Caps the number of workers launched per node. Default: None
165
-
166
- cpu_affinity: string
167
- Whether or how each worker process sets thread affinity. Options include "none" to forgo
168
- any CPU affinity configuration, "block" to assign adjacent cores to workers
169
- (ex: assign 0-1 to worker 0, 2-3 to worker 1), and
170
- "alternating" to assign cores to workers in round-robin
171
- (ex: assign 0,2 to worker 0, 1,3 to worker 1).
172
- The "block-reverse" option assigns adjacent cores to workers, but assigns
173
- the CPUs with large indices to low index workers (ex: assign 2-3 to worker 1, 0,1 to worker 2)
174
-
175
- available_accelerators: int | list
176
- Accelerators available for workers to use. Each worker will be pinned to exactly one of the provided
177
- accelerators, and no more workers will be launched than the number of accelerators.
178
-
179
- Either provide the list of accelerator names or the number available. If a number is provided,
180
- Parsl will create names as integers starting with 0.
181
-
182
- default: empty list
183
-
184
111
  prefetch_capacity : int
185
112
  Number of tasks that could be prefetched over available worker capacity.
186
113
  When there are a few tasks (<100) or when tasks are long running, this option should
@@ -214,6 +141,85 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin, UsageIn
214
141
  worker_logdir_root : string
215
142
  In case of a remote file system, specify the path to where logs will be kept.
216
143
 
144
+ encrypted : bool
145
+ Flag to enable/disable encryption (CurveZMQ). Default is False.
146
+ """ # Documentation for params used by both HTEx and MPIEx
147
+
148
+
149
+ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin, UsageInformation):
150
+ __doc__ = f"""Executor designed for cluster-scale
151
+
152
+ The HighThroughputExecutor system has the following components:
153
+ 1. The HighThroughputExecutor instance which is run as part of the Parsl script.
154
+ 2. The Interchange which acts as a load-balancing proxy between workers and Parsl
155
+ 3. The multiprocessing based worker pool which coordinates task execution over several
156
+ cores on a node.
157
+ 4. ZeroMQ pipes connect the HighThroughputExecutor, Interchange and the process_worker_pool
158
+
159
+ Here is a diagram
160
+
161
+ .. code:: python
162
+
163
+
164
+ | Data | Executor | Interchange | External Process(es)
165
+ | Flow | | |
166
+ Task | Kernel | | |
167
+ +----->|-------->|------------>|->outgoing_q---|-> process_worker_pool
168
+ | | | | batching | | |
169
+ Parsl<---Fut-| | | load-balancing| result exception
170
+ ^ | | | watchdogs | | |
171
+ | | | Q_mngmnt | | V V
172
+ | | | Thread<--|-incoming_q<---|--- +---------+
173
+ | | | | | |
174
+ | | | | | |
175
+ +----update_fut-----+
176
+
177
+
178
+ Each of the workers in each process_worker_pool has access to its local rank through
179
+ an environmental variable, ``PARSL_WORKER_RANK``. The local rank is unique for each process
180
+ and is an integer in the range from 0 to the number of workers per in the pool minus 1.
181
+ The workers also have access to the ID of the worker pool as ``PARSL_WORKER_POOL_ID``
182
+ and the size of the worker pool as ``PARSL_WORKER_COUNT``.
183
+
184
+
185
+ Parameters
186
+ ----------
187
+
188
+ {GENERAL_HTEX_PARAM_DOCS}
189
+
190
+ cores_per_worker : float
191
+ cores to be assigned to each worker. Oversubscription is possible
192
+ by setting cores_per_worker < 1.0. Default=1
193
+
194
+ mem_per_worker : float
195
+ GB of memory required per worker. If this option is specified, the node manager
196
+ will check the available memory at startup and limit the number of workers such that
197
+ the there's sufficient memory for each worker. Default: None
198
+
199
+ max_workers : int
200
+ Deprecated. Please use max_workers_per_node instead.
201
+
202
+ max_workers_per_node : int
203
+ Caps the number of workers launched per node. Default: None
204
+
205
+ cpu_affinity: string
206
+ Whether or how each worker process sets thread affinity. Options include "none" to forgo
207
+ any CPU affinity configuration, "block" to assign adjacent cores to workers
208
+ (ex: assign 0-1 to worker 0, 2-3 to worker 1), and
209
+ "alternating" to assign cores to workers in round-robin
210
+ (ex: assign 0,2 to worker 0, 1,3 to worker 1).
211
+ The "block-reverse" option assigns adjacent cores to workers, but assigns
212
+ the CPUs with large indices to low index workers (ex: assign 2-3 to worker 1, 0,1 to worker 2)
213
+
214
+ available_accelerators: int | list
215
+ Accelerators available for workers to use. Each worker will be pinned to exactly one of the provided
216
+ accelerators, and no more workers will be launched than the number of accelerators.
217
+
218
+ Either provide the list of accelerator names or the number available. If a number is provided,
219
+ Parsl will create names as integers starting with 0.
220
+
221
+ default: empty list
222
+
217
223
  enable_mpi_mode: bool
218
224
  If enabled, MPI launch prefixes will be composed for the batch scheduler based on
219
225
  the nodes available in each batch job and the resource_specification dict passed
@@ -224,9 +230,6 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin, UsageIn
224
230
  This field is only used if enable_mpi_mode is set. Select one from the
225
231
  list of supported MPI launchers = ("srun", "aprun", "mpiexec").
226
232
  default: "mpiexec"
227
-
228
- encrypted : bool
229
- Flag to enable/disable encryption (CurveZMQ). Default is False.
230
233
  """
231
234
 
232
235
  @typeguard.typechecked
@@ -305,9 +308,6 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin, UsageIn
305
308
  self._workers_per_node = 1 # our best guess-- we do not have any provider hints
306
309
 
307
310
  self._task_counter = 0
308
- self.run_id = None # set to the correct run_id in dfk
309
- self.hub_address = None # set to the correct hub address in dfk
310
- self.hub_port = None # set to the correct hub port in dfk
311
311
  self.worker_ports = worker_ports
312
312
  self.worker_port_range = worker_port_range
313
313
  self.interchange_proc: Optional[Process] = None
@@ -326,8 +326,8 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin, UsageIn
326
326
  assert mpi_launcher in VALID_LAUNCHERS, \
327
327
  f"mpi_launcher must be set to one of {VALID_LAUNCHERS}"
328
328
  if self.enable_mpi_mode:
329
- assert isinstance(self.provider.launcher, parsl.launchers.SingleNodeLauncher), \
330
- "mpi_mode requires the provider to be configured to use a SingleNodeLauncher"
329
+ assert isinstance(self.provider.launcher, parsl.launchers.SimpleLauncher), \
330
+ "mpi_mode requires the provider to be configured to use a SimpleLauncher"
331
331
 
332
332
  self.mpi_launcher = mpi_launcher
333
333
 
@@ -541,7 +541,7 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin, UsageIn
541
541
  "worker_ports": self.worker_ports,
542
542
  "worker_port_range": self.worker_port_range,
543
543
  "hub_address": self.hub_address,
544
- "hub_port": self.hub_port,
544
+ "hub_zmq_port": self.hub_zmq_port,
545
545
  "logdir": self.logdir,
546
546
  "heartbeat_threshold": self.heartbeat_threshold,
547
547
  "poll_period": self.poll_period,
@@ -645,7 +645,8 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin, UsageIn
645
645
  Returns:
646
646
  Future
647
647
  """
648
- validate_resource_spec(resource_specification)
648
+
649
+ validate_resource_spec(resource_specification, self.enable_mpi_mode)
649
650
 
650
651
  if self.bad_state_is_set:
651
652
  raise self.executor_exception