parsl 2024.5.13__py3-none-any.whl → 2024.5.27__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- parsl/channels/base.py +2 -9
- parsl/channels/local/local.py +3 -6
- parsl/channels/oauth_ssh/oauth_ssh.py +2 -2
- parsl/channels/ssh/ssh.py +2 -2
- parsl/config.py +7 -1
- parsl/dataflow/dependency_resolvers.py +115 -0
- parsl/dataflow/dflow.py +45 -39
- parsl/executors/__init__.py +2 -0
- parsl/executors/base.py +7 -7
- parsl/executors/high_throughput/errors.py +10 -0
- parsl/executors/high_throughput/executor.py +85 -84
- parsl/executors/high_throughput/interchange.py +6 -5
- parsl/executors/high_throughput/mpi_executor.py +85 -0
- parsl/executors/high_throughput/mpi_prefix_composer.py +18 -2
- parsl/executors/high_throughput/mpi_resource_management.py +3 -0
- parsl/executors/high_throughput/zmq_pipes.py +36 -2
- parsl/executors/radical/rpex_resources.py +3 -7
- parsl/monitoring/remote.py +18 -24
- parsl/providers/local/local.py +1 -1
- parsl/tests/conftest.py +2 -2
- parsl/tests/sites/test_dynamic_executor.py +0 -1
- parsl/tests/test_bash_apps/test_std_uri.py +0 -6
- parsl/tests/test_checkpointing/test_periodic.py +2 -7
- parsl/tests/test_checkpointing/test_python_checkpoint_2.py +0 -1
- parsl/tests/test_checkpointing/test_python_checkpoint_3.py +0 -1
- parsl/tests/test_checkpointing/test_task_exit.py +0 -1
- parsl/tests/test_htex/test_basic.py +0 -1
- parsl/tests/test_htex/test_command_client_timeout.py +69 -0
- parsl/tests/test_htex/test_cpu_affinity_explicit.py +1 -8
- parsl/tests/test_htex/test_manager_failure.py +0 -1
- parsl/tests/test_htex/test_managers_command.py +2 -7
- parsl/tests/test_htex/test_missing_worker.py +2 -8
- parsl/tests/test_monitoring/test_app_names.py +0 -1
- parsl/tests/test_monitoring/test_basic.py +0 -2
- parsl/tests/test_monitoring/test_db_locks.py +0 -1
- parsl/tests/test_monitoring/test_fuzz_zmq.py +0 -1
- parsl/tests/test_monitoring/test_htex_init_blocks_vs_monitoring.py +0 -2
- parsl/tests/test_monitoring/test_incomplete_futures.py +0 -1
- parsl/tests/test_monitoring/test_memoization_representation.py +0 -1
- parsl/tests/test_monitoring/test_stdouterr.py +0 -2
- parsl/tests/test_mpi_apps/test_bad_mpi_config.py +6 -14
- parsl/tests/test_mpi_apps/test_mpi_mode_disabled.py +2 -8
- parsl/tests/test_mpi_apps/test_mpi_mode_enabled.py +10 -1
- parsl/tests/test_mpi_apps/test_mpiex.py +64 -0
- parsl/tests/test_mpi_apps/test_resource_spec.py +14 -9
- parsl/tests/test_python_apps/test_context_manager.py +1 -9
- parsl/tests/test_python_apps/test_lifted.py +10 -6
- parsl/tests/test_python_apps/test_pluggable_future_resolution.py +161 -0
- parsl/tests/test_scaling/test_regression_1621.py +0 -2
- parsl/tests/test_scaling/test_shutdown_scalein.py +0 -2
- parsl/tests/test_serialization/test_proxystore_configured.py +0 -1
- parsl/tests/test_shutdown/test_kill_monitoring.py +0 -2
- parsl/tests/test_staging/test_1316.py +0 -2
- parsl/tests/test_staging/test_elaborate_noop_file.py +0 -1
- parsl/tests/test_summary.py +0 -1
- parsl/tests/test_threads/test_configs.py +0 -1
- parsl/tests/test_threads/test_lazy_errors.py +0 -1
- parsl/version.py +1 -1
- {parsl-2024.5.13.dist-info → parsl-2024.5.27.dist-info}/METADATA +6 -4
- {parsl-2024.5.13.dist-info → parsl-2024.5.27.dist-info}/RECORD +67 -62
- {parsl-2024.5.13.data → parsl-2024.5.27.data}/scripts/exec_parsl_function.py +0 -0
- {parsl-2024.5.13.data → parsl-2024.5.27.data}/scripts/parsl_coprocess.py +0 -0
- {parsl-2024.5.13.data → parsl-2024.5.27.data}/scripts/process_worker_pool.py +0 -0
- {parsl-2024.5.13.dist-info → parsl-2024.5.27.dist-info}/LICENSE +0 -0
- {parsl-2024.5.13.dist-info → parsl-2024.5.27.dist-info}/WHEEL +0 -0
- {parsl-2024.5.13.dist-info → parsl-2024.5.27.dist-info}/entry_points.txt +0 -0
- {parsl-2024.5.13.dist-info → parsl-2024.5.27.dist-info}/top_level.txt +0 -0
parsl/channels/base.py
CHANGED
```diff
@@ -89,15 +89,8 @@ class Channel(metaclass=ABCMeta):
         pass
 
     @abstractmethod
-    def close(self) -> bool:
-        ''' Closes the channel.
-
-        Args:
-            None
-
-        Returns:
-            Bool
-
+    def close(self) -> None:
+        ''' Closes the channel.
         '''
         pass
 
```
parsl/channels/local/local.py
CHANGED
```diff
@@ -107,13 +107,10 @@ class LocalChannel(Channel, RepresentationMixin):
     def pull_file(self, remote_source, local_dir):
         return self.push_file(remote_source, local_dir)
 
-    def close(self):
-        ''' There's nothing to close here, and this
-
-        Returns:
-             - False, because it really did not "close" this channel.
+    def close(self) -> None:
+        ''' There's nothing to close here, and so this doesn't do anything
         '''
-        return False
+        pass
 
     def isdir(self, path):
         """Return true if the path refers to an existing directory.
```
parsl/channels/ssh/ssh.py
CHANGED
```diff
@@ -217,9 +217,9 @@ class SSHChannel(Channel, RepresentationMixin):
 
         return local_dest
 
-    def close(self):
+    def close(self) -> None:
         if self._is_connected():
-            return self.ssh_client.close()
+            self.ssh_client.close()
 
     def isdir(self, path):
         """Return true if the path refers to an existing directory.
```
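Taken together, the three channel diffs above tighten the `Channel.close` contract: implementations now return `None` and signal failure by raising, instead of returning a status bool that callers had to interpret. A minimal sketch of a conforming implementation (the class and client names here are invented for illustration):

```python
class ExampleChannel:
    """Hypothetical channel illustrating the revised close() contract."""

    def __init__(self, client=None):
        self._client = client  # e.g. a paramiko SSHClient; None for a local channel

    def close(self) -> None:
        # release resources if any are held; raise on failure rather than
        # returning False as the old LocalChannel docstring described
        if self._client is not None:
            self._client.close()
```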
parsl/config.py
CHANGED
```diff
@@ -5,6 +5,7 @@ from typing import Callable, Iterable, Optional, Sequence, Union
 from typing_extensions import Literal
 
 from parsl.utils import RepresentationMixin
+from parsl.dataflow.dependency_resolvers import DependencyResolver
 from parsl.executors.base import ParslExecutor
 from parsl.executors.threads import ThreadPoolExecutor
 from parsl.errors import ConfigurationError
@@ -35,6 +36,8 @@ class Config(RepresentationMixin, UsageInformation):
     checkpoint_period : str, optional
         Time interval (in "HH:MM:SS") at which to checkpoint completed tasks. Only has an effect if
         ``checkpoint_mode='periodic'``.
+    dependency_resolver: plugin point for custom dependency resolvers. Default: only resolve Futures,
+        using the `SHALLOW_DEPENDENCY_RESOLVER`.
     garbage_collect : bool. optional.
         Delete task records from DFK when tasks have completed. Default: True
     internal_tasks_max_threads : int, optional
@@ -88,6 +91,7 @@ class Config(RepresentationMixin, UsageInformation):
                                         Literal['dfk_exit'],
                                         Literal['manual']] = None,
                 checkpoint_period: Optional[str] = None,
+                dependency_resolver: Optional[DependencyResolver] = None,
                 garbage_collect: bool = True,
                 internal_tasks_max_threads: int = 10,
                 retries: int = 0,
@@ -123,6 +127,7 @@ class Config(RepresentationMixin, UsageInformation):
         if checkpoint_mode == 'periodic' and checkpoint_period is None:
             checkpoint_period = "00:30:00"
         self.checkpoint_period = checkpoint_period
+        self.dependency_resolver = dependency_resolver
         self.garbage_collect = garbage_collect
         self.internal_tasks_max_threads = internal_tasks_max_threads
         self.retries = retries
@@ -152,4 +157,5 @@ class Config(RepresentationMixin, UsageInformation):
                              ', '.join(['label={}'.format(repr(d)) for d in duplicates])))
 
     def get_usage_information(self):
-        return {"executors_len": len(self.executors)}
+        return {"executors_len": len(self.executors),
+                "dependency_resolver": self.dependency_resolver is not None}
```
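The new `dependency_resolver` option is the user-facing switch for the resolver machinery added in the next file. A short usage sketch (written for this summary, not taken from the diff), assuming the usual `parsl.load` lifecycle:

```python
import parsl
from parsl.config import Config
from parsl.dataflow.dependency_resolvers import DEEP_DEPENDENCY_RESOLVER
from parsl.executors.threads import ThreadPoolExecutor


@parsl.python_app
def increment(x):
    return x + 1


@parsl.python_app
def total(values):
    # with the deep resolver, `values` arrives as a plain list of ints
    return sum(values)


config = Config(executors=[ThreadPoolExecutor()],
                dependency_resolver=DEEP_DEPENDENCY_RESOLVER)

with parsl.load(config):
    futures = [increment(i) for i in range(3)]
    # the list is traversed to find the three futures (which become task
    # dependencies) and is rebuilt as [1, 2, 3] before total() executes
    print(total(futures).result())  # 6
```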
parsl/dataflow/dependency_resolvers.py
ADDED
```diff
@@ -0,0 +1,115 @@
+from concurrent.futures import Future
+from dataclasses import dataclass
+from functools import singledispatch
+from typing import Callable, Sequence
+
+
+@dataclass
+class DependencyResolver:
+    """A DependencyResolver describes how app dependencies can be resolved.
+    It is specified as two functions: `traverse_to_gather` which turns an
+    app parameter into a sequence of futures which must be waited for before
+    the task can be executed (for example, in the case of
+    `DEEP_DEPENDENCY_RESOLVER` this traverses structures such as lists to
+    find every contained ``Future``), and `traverse_to_unwrap` which turns an
+    app parameter into its value to be passed to the app on execution
+    (for example in the case of `DEEP_DEPENDENCY_RESOLVER` this replaces a
+    list containing futures with a new list containing the values of those
+    resolved futures).
+
+    By default, Parsl will use `SHALLOW_DEPENDENCY_RESOLVER` which only
+    resolves Futures passed directly as arguments.
+    """
+    traverse_to_gather: Callable[[object], Sequence[Future]]
+    traverse_to_unwrap: Callable[[object], object]
+
+
+@singledispatch
+def shallow_traverse_to_gather(o):
+    # objects in general do not expose futures that we can see
+    return []
+
+
+@singledispatch
+def shallow_traverse_to_unwrap(o):
+    # objects in general unwrap to themselves
+    return o
+
+
+@shallow_traverse_to_gather.register
+def _(fut: Future):
+    return [fut]
+
+
+@shallow_traverse_to_unwrap.register
+@singledispatch
+def _(fut: Future):
+    assert fut.done()
+    return fut.result()
+
+
+@singledispatch
+def deep_traverse_to_gather(o):
+    # objects in general do not expose futures that we can see
+    return []
+
+
+@singledispatch
+def deep_traverse_to_unwrap(o):
+    # objects in general unwrap to themselves
+    return o
+
+
+@deep_traverse_to_gather.register
+def _(fut: Future):
+    return [fut]
+
+
+@deep_traverse_to_unwrap.register
+@singledispatch
+def _(fut: Future):
+    assert fut.done()
+    return fut.result()
+
+
+@deep_traverse_to_gather.register(tuple)
+@deep_traverse_to_gather.register(list)
+@deep_traverse_to_gather.register(set)
+def _(iterable):
+    return [e for v in iterable for e in deep_traverse_to_gather(v)]
+
+
+@deep_traverse_to_unwrap.register(tuple)
+@deep_traverse_to_unwrap.register(list)
+@deep_traverse_to_unwrap.register(set)
+@singledispatch
+def _(iterable):
+
+    type_ = type(iterable)
+    return type_(map(deep_traverse_to_unwrap, iterable))
+
+
+@deep_traverse_to_gather.register(dict)
+def _(dictionary):
+    futures = []
+    for key, value in dictionary.items():
+        futures.extend(deep_traverse_to_gather(key))
+        futures.extend(deep_traverse_to_gather(value))
+    return futures
+
+
+@deep_traverse_to_unwrap.register(dict)
+def _(dictionary):
+    unwrapped_dict = {}
+    for key, value in dictionary.items():
+        key = deep_traverse_to_unwrap(key)
+        value = deep_traverse_to_unwrap(value)
+        unwrapped_dict[key] = value
+    return unwrapped_dict
+
+
+DEEP_DEPENDENCY_RESOLVER = DependencyResolver(traverse_to_gather=deep_traverse_to_gather,
+                                              traverse_to_unwrap=deep_traverse_to_unwrap)
+
+SHALLOW_DEPENDENCY_RESOLVER = DependencyResolver(traverse_to_gather=shallow_traverse_to_gather,
+                                                 traverse_to_unwrap=shallow_traverse_to_unwrap)
```
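As a quick demonstration of the two exported resolvers (a sketch written for this summary, not part of the package):

```python
from concurrent.futures import Future

from parsl.dataflow.dependency_resolvers import (
    DEEP_DEPENDENCY_RESOLVER,
    SHALLOW_DEPENDENCY_RESOLVER,
)

f = Future()
arg = {"data": [f, 41]}

# the deep resolver finds the Future nested inside the dict and list...
assert DEEP_DEPENDENCY_RESOLVER.traverse_to_gather(arg) == [f]
# ...while the default shallow resolver only sees Futures passed directly
assert SHALLOW_DEPENDENCY_RESOLVER.traverse_to_gather(arg) == []

f.set_result(1)
# once every gathered Future is done, unwrapping rebuilds the container
assert DEEP_DEPENDENCY_RESOLVER.traverse_to_unwrap(arg) == {"data": [1, 41]}
```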
parsl/dataflow/dflow.py
CHANGED
```diff
@@ -26,6 +26,7 @@ from parsl.channels import Channel
 from parsl.config import Config
 from parsl.data_provider.data_manager import DataManager
 from parsl.data_provider.files import File
+from parsl.dataflow.dependency_resolvers import SHALLOW_DEPENDENCY_RESOLVER
 from parsl.dataflow.errors import BadCheckpoint, DependencyError, JoinError
 from parsl.dataflow.futures import AppFuture
 from parsl.dataflow.memoization import Memoizer
@@ -203,6 +204,9 @@ class DataFlowKernel:
         self.tasks: Dict[int, TaskRecord] = {}
         self.submitter_lock = threading.Lock()
 
+        self.dependency_resolver = self.config.dependency_resolver if self.config.dependency_resolver is not None \
+            else SHALLOW_DEPENDENCY_RESOLVER
+
         atexit.register(self.atexit_cleanup)
 
     def __enter__(self):
@@ -852,8 +856,11 @@ class DataFlowKernel:
         depends: List[Future] = []
 
         def check_dep(d: Any) -> None:
-            if isinstance(d, Future):
-                depends.extend([d])
+            try:
+                depends.extend(self.dependency_resolver.traverse_to_gather(d))
+            except Exception:
+                logger.exception("Exception in dependency_resolver.traverse_to_gather")
+                raise
 
         # Check the positional args
         for dep in args:
@@ -870,7 +877,8 @@ class DataFlowKernel:
 
         return depends
 
-    def _unwrap_futures(self, args, kwargs):
+    def _unwrap_futures(self, args: Sequence[Any], kwargs: Dict[str, Any]) \
+            -> Tuple[Sequence[Any], Dict[str, Any], Sequence[Tuple[Exception, str]]]:
         """This function should be called when all dependencies have completed.
 
         It will rewrite the arguments for that task, replacing each Future
@@ -891,53 +899,40 @@ class DataFlowKernel:
         """
         dep_failures = []
 
+        def append_failure(e: Exception, dep: Future) -> None:
+            # If this Future is associated with a task inside this DFK,
+            # then refer to the task ID.
+            # Otherwise make a repr of the Future object.
+            if hasattr(dep, 'task_record') and dep.task_record['dfk'] == self:
+                tid = "task " + repr(dep.task_record['id'])
+            else:
+                tid = repr(dep)
+            dep_failures.extend([(e, tid)])
+
         # Replace item in args
         new_args = []
         for dep in args:
-            if isinstance(dep, Future):
-                try:
-                    new_args.extend([dep.result()])
-                except Exception as e:
-                    # If this Future is associated with a task inside this DFK,
-                    # then refer to the task ID.
-                    # Otherwise make a repr of the Future object.
-                    if hasattr(dep, 'task_record') and dep.task_record['dfk'] == self:
-                        tid = "task " + repr(dep.task_record['id'])
-                    else:
-                        tid = repr(dep)
-                    dep_failures.extend([(e, tid)])
-            else:
-                new_args.extend([dep])
+            try:
+                new_args.extend([self.dependency_resolver.traverse_to_unwrap(dep)])
+            except Exception as e:
+                append_failure(e, dep)
 
         # Check for explicit kwargs ex, fu_1=<fut>
         for key in kwargs:
             dep = kwargs[key]
-            if isinstance(dep, Future):
-                try:
-                    kwargs[key] = dep.result()
-                except Exception as e:
-                    if hasattr(dep, 'task_record'):
-                        tid = dep.task_record['id']
-                    else:
-                        tid = None
-                    dep_failures.extend([(e, tid)])
+            try:
+                kwargs[key] = self.dependency_resolver.traverse_to_unwrap(dep)
+            except Exception as e:
+                append_failure(e, dep)
 
         # Check for futures in inputs=[<fut>...]
         if 'inputs' in kwargs:
             new_inputs = []
             for dep in kwargs['inputs']:
-                if isinstance(dep, Future):
-                    try:
-                        new_inputs.extend([dep.result()])
-                    except Exception as e:
-                        if hasattr(dep, 'task_record'):
-                            tid = dep.task_record['id']
-                        else:
-                            tid = None
-                        dep_failures.extend([(e, tid)])
-
-                else:
-                    new_inputs.extend([dep])
+                try:
+                    new_inputs.extend([self.dependency_resolver.traverse_to_unwrap(dep)])
+                except Exception as e:
+                    append_failure(e, dep)
             kwargs['inputs'] = new_inputs
 
         return new_args, kwargs, dep_failures
@@ -1042,6 +1037,8 @@ class DataFlowKernel:
 
         func = self._add_output_deps(executor, app_args, app_kwargs, app_fu, func)
 
+        logger.debug("Added output dependencies")
+
         # Replace the function invocation in the TaskRecord with whatever file-staging
         # substitutions have been made.
         task_record.update({
@@ -1053,8 +1050,10 @@ class DataFlowKernel:
 
         self.tasks[task_id] = task_record
 
+        logger.debug("Gathering dependencies")
         # Get the list of dependencies for the task
         depends = self._gather_all_deps(app_args, app_kwargs)
+        logger.debug("Gathered dependencies")
         task_record['depends'] = depends
 
         depend_descs = []
@@ -1156,7 +1155,7 @@ class DataFlowKernel:
             executor.run_id = self.run_id
             executor.run_dir = self.run_dir
             executor.hub_address = self.hub_address
-            executor.hub_port = self.hub_zmq_port
+            executor.hub_zmq_port = self.hub_zmq_port
             if self.monitoring:
                 executor.monitoring_radio = self.monitoring.radio
             if hasattr(executor, 'provider'):
@@ -1271,6 +1270,13 @@ class DataFlowKernel:
         atexit.unregister(self.atexit_cleanup)
         logger.info("Unregistered atexit hook")
 
+        if DataFlowKernelLoader._dfk is self:
+            logger.info("Unregistering default DFK")
+            parsl.clear()
+            logger.info("Unregistered default DFK")
+        else:
+            logger.debug("Cleaning up non-default DFK - not unregistering")
+
         logger.info("DFK cleanup complete")
 
     def checkpoint(self, tasks: Optional[Sequence[TaskRecord]] = None) -> str:
```
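The cleanup hunk at the end of this file means that cleaning up the default DFK now also unregisters it. A rough sketch of the resulting behaviour (assumed, not shown in the diff):

```python
import parsl
from parsl.config import Config
from parsl.executors.threads import ThreadPoolExecutor

dfk = parsl.load(Config(executors=[ThreadPoolExecutor()]))
dfk.cleanup()  # now also calls parsl.clear() when this is the default DFK

# so a second load() should work without an explicit parsl.clear() in between
dfk2 = parsl.load(Config(executors=[ThreadPoolExecutor()]))
dfk2.cleanup()
```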
parsl/executors/__init__.py
CHANGED
```diff
@@ -1,9 +1,11 @@
 from parsl.executors.threads import ThreadPoolExecutor
 from parsl.executors.workqueue.executor import WorkQueueExecutor
 from parsl.executors.high_throughput.executor import HighThroughputExecutor
+from parsl.executors.high_throughput.mpi_executor import MPIExecutor
 from parsl.executors.flux.executor import FluxExecutor
 
 __all__ = ['ThreadPoolExecutor',
            'HighThroughputExecutor',
+           'MPIExecutor',
            'WorkQueueExecutor',
            'FluxExecutor']
```
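The newly exported `MPIExecutor` (its implementation, `parsl/executors/high_throughput/mpi_executor.py`, is listed above but not reproduced in this excerpt) can now be imported from the top-level `parsl.executors` package. A hypothetical configuration sketch follows; the constructor parameters shown are assumptions based on the HTEx options it wraps, not confirmed by this excerpt:

```python
from parsl.config import Config
from parsl.executors import MPIExecutor
from parsl.launchers import SimpleLauncher
from parsl.providers import SlurmProvider

config = Config(executors=[
    MPIExecutor(
        max_workers_per_block=2,  # assumed parameter: concurrent MPI tasks per block
        mpi_launcher="mpiexec",   # one of the supported launchers: srun, aprun, mpiexec
        # MPI mode requires the provider to use a SimpleLauncher, per the
        # assertion in executor.py below
        provider=SlurmProvider(launcher=SimpleLauncher()),
    ),
])
```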
parsl/executors/base.py
CHANGED
```diff
@@ -50,13 +50,13 @@ class ParslExecutor(metaclass=ABCMeta):
         self,
         *,
         hub_address: Optional[str] = None,
-        hub_port: Optional[int] = None,
+        hub_zmq_port: Optional[int] = None,
         monitoring_radio: Optional[MonitoringRadio] = None,
         run_dir: str = ".",
         run_id: Optional[str] = None,
     ):
         self.hub_address = hub_address
-        self.hub_port = hub_port
+        self.hub_zmq_port = hub_zmq_port
         self.monitoring_radio = monitoring_radio
         self.run_dir = os.path.abspath(run_dir)
         self.run_id = run_id
@@ -136,14 +136,14 @@ class ParslExecutor(metaclass=ABCMeta):
         self._hub_address = value
 
     @property
-    def hub_port(self) -> Optional[int]:
+    def hub_zmq_port(self) -> Optional[int]:
         """Port to the Hub for monitoring.
         """
-        return self._hub_port
+        return self._hub_zmq_port
 
-    @hub_port.setter
-    def hub_port(self, value: Optional[int]) -> None:
-        self._hub_port = value
+    @hub_zmq_port.setter
+    def hub_zmq_port(self, value: Optional[int]) -> None:
+        self._hub_zmq_port = value
 
     @property
     def monitoring_radio(self) -> Optional[MonitoringRadio]:
```
parsl/executors/high_throughput/errors.py
CHANGED
```diff
@@ -10,3 +10,13 @@ class WorkerLost(Exception):
 
     def __str__(self):
         return self.__repr__()
+
+
+class CommandClientTimeoutError(Exception):
+    """Raised when the command client times out waiting for a response.
+    """
+
+
+class CommandClientBadError(Exception):
+    """Raised when the command client is bad from an earlier timeout.
+    """
```
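These two exceptions suggest a protocol in which a command client that has timed out refuses further use, since a late reply could be mismatched against a later command (the real implementation lives in `parsl/executors/high_throughput/zmq_pipes.py`, changed elsewhere in this release but not shown here). An illustrative sketch only, not the parsl implementation:

```python
import queue

from parsl.executors.high_throughput.errors import (
    CommandClientBadError,
    CommandClientTimeoutError,
)


class SketchCommandClient:
    """Hypothetical client: after one timeout, the request/reply channel may
    be left mid-conversation, so the client marks itself bad rather than
    risk pairing later replies with the wrong command."""

    def __init__(self, replies: "queue.Queue[str]"):
        self._bad = False
        self._replies = replies  # stands in for the ZMQ reply channel

    def run(self, command: str, timeout_s: float = 10.0) -> str:
        if self._bad:
            raise CommandClientBadError()
        # a real client would send `command` over the socket here
        try:
            return self._replies.get(timeout=timeout_s)
        except queue.Empty:
            self._bad = True
            raise CommandClientTimeoutError()
```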
parsl/executors/high_throughput/executor.py
CHANGED
```diff
@@ -62,47 +62,7 @@ DEFAULT_LAUNCH_CMD = ("process_worker_pool.py {debug} {max_workers_per_node} "
                       "--mpi-launcher={mpi_launcher} "
                       "--available-accelerators {accelerators}")
 
-
-class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin, UsageInformation):
-    """Executor designed for cluster-scale
-
-    The HighThroughputExecutor system has the following components:
-      1. The HighThroughputExecutor instance which is run as part of the Parsl script.
-      2. The Interchange which acts as a load-balancing proxy between workers and Parsl
-      3. The multiprocessing based worker pool which coordinates task execution over several
-         cores on a node.
-      4. ZeroMQ pipes connect the HighThroughputExecutor, Interchange and the process_worker_pool
-
-    Here is a diagram
-
-    .. code:: python
-
-
-                        |  Data   |  Executor   |  Interchange  | External Process(es)
-                        |  Flow   |             |               |
-                   Task | Kernel  |             |               |
-                 +----->|-------->|------------>|->outgoing_q---|-> process_worker_pool
-                 |      |         |             | batching      |    |         |
-           Parsl<---Fut-|         |             | load-balancing|  result   exception
-                     ^  |         |             | watchdogs     |    |         |
-                     |  |         |   Q_mngmnt  |               |    V         V
-                     |  |         |    Thread<--|-incoming_q<---|--- +---------+
-                     |  |         |      |      |               |
-                     |  |         |      |      |               |
-                     +----update_fut-----+
-
-
-    Each of the workers in each process_worker_pool has access to its local rank through
-    an environmental variable, ``PARSL_WORKER_RANK``. The local rank is unique for each process
-    and is an integer in the range from 0 to the number of workers per in the pool minus 1.
-    The workers also have access to the ID of the worker pool as ``PARSL_WORKER_POOL_ID``
-    and the size of the worker pool as ``PARSL_WORKER_COUNT``.
-
-
-    Parameters
-    ----------
-
-    provider : :class:`~parsl.providers.base.ExecutionProvider`
+GENERAL_HTEX_PARAM_DOCS = """provider : :class:`~parsl.providers.base.ExecutionProvider`
        Provider to access computation resources. Can be one of :class:`~parsl.providers.aws.aws.EC2Provider`,
        :class:`~parsl.providers.cobalt.cobalt.Cobalt`,
        :class:`~parsl.providers.condor.condor.Condor`,
@@ -148,39 +108,6 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin, UsageInformation):
     worker_debug : Bool
         Enables worker debug logging.
 
-    cores_per_worker : float
-        cores to be assigned to each worker. Oversubscription is possible
-        by setting cores_per_worker < 1.0. Default=1
-
-    mem_per_worker : float
-        GB of memory required per worker. If this option is specified, the node manager
-        will check the available memory at startup and limit the number of workers such that
-        the there's sufficient memory for each worker. Default: None
-
-    max_workers : int
-        Deprecated. Please use max_workers_per_node instead.
-
-    max_workers_per_node : int
-        Caps the number of workers launched per node. Default: None
-
-    cpu_affinity: string
-        Whether or how each worker process sets thread affinity. Options include "none" to forgo
-        any CPU affinity configuration, "block" to assign adjacent cores to workers
-        (ex: assign 0-1 to worker 0, 2-3 to worker 1), and
-        "alternating" to assign cores to workers in round-robin
-        (ex: assign 0,2 to worker 0, 1,3 to worker 1).
-        The "block-reverse" option assigns adjacent cores to workers, but assigns
-        the CPUs with large indices to low index workers (ex: assign 2-3 to worker 1, 0,1 to worker 2)
-
-    available_accelerators: int | list
-        Accelerators available for workers to use. Each worker will be pinned to exactly one of the provided
-        accelerators, and no more workers will be launched than the number of accelerators.
-
-        Either provide the list of accelerator names or the number available. If a number is provided,
-        Parsl will create names as integers starting with 0.
-
-        default: empty list
-
     prefetch_capacity : int
         Number of tasks that could be prefetched over available worker capacity.
         When there are a few tasks (<100) or when tasks are long running, this option should
@@ -214,6 +141,85 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin, UsageInformation):
     worker_logdir_root : string
         In case of a remote file system, specify the path to where logs will be kept.
 
+    encrypted : bool
+        Flag to enable/disable encryption (CurveZMQ). Default is False.
+"""  # Documentation for params used by both HTEx and MPIEx
+
+
+class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin, UsageInformation):
+    __doc__ = f"""Executor designed for cluster-scale
+
+    The HighThroughputExecutor system has the following components:
+      1. The HighThroughputExecutor instance which is run as part of the Parsl script.
+      2. The Interchange which acts as a load-balancing proxy between workers and Parsl
+      3. The multiprocessing based worker pool which coordinates task execution over several
+         cores on a node.
+      4. ZeroMQ pipes connect the HighThroughputExecutor, Interchange and the process_worker_pool
+
+    Here is a diagram
+
+    .. code:: python
+
+
+                        |  Data   |  Executor   |  Interchange  | External Process(es)
+                        |  Flow   |             |               |
+                   Task | Kernel  |             |               |
+                 +----->|-------->|------------>|->outgoing_q---|-> process_worker_pool
+                 |      |         |             | batching      |    |         |
+           Parsl<---Fut-|         |             | load-balancing|  result   exception
+                     ^  |         |             | watchdogs     |    |         |
+                     |  |         |   Q_mngmnt  |               |    V         V
+                     |  |         |    Thread<--|-incoming_q<---|--- +---------+
+                     |  |         |      |      |               |
+                     |  |         |      |      |               |
+                     +----update_fut-----+
+
+
+    Each of the workers in each process_worker_pool has access to its local rank through
+    an environmental variable, ``PARSL_WORKER_RANK``. The local rank is unique for each process
+    and is an integer in the range from 0 to the number of workers per in the pool minus 1.
+    The workers also have access to the ID of the worker pool as ``PARSL_WORKER_POOL_ID``
+    and the size of the worker pool as ``PARSL_WORKER_COUNT``.
+
+
+    Parameters
+    ----------
+
+    {GENERAL_HTEX_PARAM_DOCS}
+
+    cores_per_worker : float
+        cores to be assigned to each worker. Oversubscription is possible
+        by setting cores_per_worker < 1.0. Default=1
+
+    mem_per_worker : float
+        GB of memory required per worker. If this option is specified, the node manager
+        will check the available memory at startup and limit the number of workers such that
+        the there's sufficient memory for each worker. Default: None
+
+    max_workers : int
+        Deprecated. Please use max_workers_per_node instead.
+
+    max_workers_per_node : int
+        Caps the number of workers launched per node. Default: None
+
+    cpu_affinity: string
+        Whether or how each worker process sets thread affinity. Options include "none" to forgo
+        any CPU affinity configuration, "block" to assign adjacent cores to workers
+        (ex: assign 0-1 to worker 0, 2-3 to worker 1), and
+        "alternating" to assign cores to workers in round-robin
+        (ex: assign 0,2 to worker 0, 1,3 to worker 1).
+        The "block-reverse" option assigns adjacent cores to workers, but assigns
+        the CPUs with large indices to low index workers (ex: assign 2-3 to worker 1, 0,1 to worker 2)
+
+    available_accelerators: int | list
+        Accelerators available for workers to use. Each worker will be pinned to exactly one of the provided
+        accelerators, and no more workers will be launched than the number of accelerators.
+
+        Either provide the list of accelerator names or the number available. If a number is provided,
+        Parsl will create names as integers starting with 0.
+
+        default: empty list
+
     enable_mpi_mode: bool
         If enabled, MPI launch prefixes will be composed for the batch scheduler based on
         the nodes available in each batch job and the resource_specification dict passed
@@ -224,9 +230,6 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin, UsageInformation):
         This field is only used if enable_mpi_mode is set. Select one from the
         list of supported MPI launchers = ("srun", "aprun", "mpiexec").
         default: "mpiexec"
-
-    encrypted : bool
-        Flag to enable/disable encryption (CurveZMQ). Default is False.
     """
 
     @typeguard.typechecked
@@ -305,9 +308,6 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin, UsageInformation):
         self._workers_per_node = 1  # our best guess-- we do not have any provider hints
 
         self._task_counter = 0
-        self.run_id = None  # set to the correct run_id in dfk
-        self.hub_address = None  # set to the correct hub address in dfk
-        self.hub_port = None  # set to the correct hub port in dfk
         self.worker_ports = worker_ports
         self.worker_port_range = worker_port_range
         self.interchange_proc: Optional[Process] = None
@@ -326,8 +326,8 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin, UsageInformation):
         assert mpi_launcher in VALID_LAUNCHERS, \
             f"mpi_launcher must be set to one of {VALID_LAUNCHERS}"
         if self.enable_mpi_mode:
-            assert isinstance(self.provider.launcher, parsl.launchers.
-                "mpi_mode requires the provider to be configured to use a
+            assert isinstance(self.provider.launcher, parsl.launchers.SimpleLauncher), \
+                "mpi_mode requires the provider to be configured to use a SimpleLauncher"
 
         self.mpi_launcher = mpi_launcher
 
@@ -541,7 +541,7 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin, UsageInformation):
                    "worker_ports": self.worker_ports,
                    "worker_port_range": self.worker_port_range,
                    "hub_address": self.hub_address,
-                   "hub_port": self.hub_port,
+                   "hub_zmq_port": self.hub_zmq_port,
                    "logdir": self.logdir,
                    "heartbeat_threshold": self.heartbeat_threshold,
                    "poll_period": self.poll_period,
@@ -645,7 +645,8 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin, UsageInformation):
         Returns:
               Future
         """
-        validate_resource_spec(resource_specification)
+
+        validate_resource_spec(resource_specification, self.enable_mpi_mode)
 
         if self.bad_state_is_set:
             raise self.executor_exception
```
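The large hunks above refactor the HTEx docstring so that the parameter documentation shared with the new MPIExecutor lives in a module-level string, interpolated into each class's `__doc__` with an f-string. A minimal standalone sketch of that pattern (names invented for illustration):

```python
# Shared parameter documentation, interpolated into each executor's
# __doc__ the same way GENERAL_HTEX_PARAM_DOCS is used above.
SHARED_PARAM_DOCS = """timeout : int
        Seconds to wait for a worker. Default: 10."""


class ExampleExecutor:
    __doc__ = f"""An example executor.

    Parameters
    ----------
    {SHARED_PARAM_DOCS}
    """


print(ExampleExecutor.__doc__)  # the shared parameter docs appear inline
```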