parsl 2024.5.20__py3-none-any.whl → 2024.5.27__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- parsl/config.py +7 -1
- parsl/dataflow/dependency_resolvers.py +115 -0
- parsl/dataflow/dflow.py +44 -38
- parsl/executors/high_throughput/errors.py +10 -0
- parsl/executors/high_throughput/executor.py +2 -1
- parsl/executors/high_throughput/mpi_executor.py +1 -1
- parsl/executors/high_throughput/mpi_prefix_composer.py +18 -2
- parsl/executors/high_throughput/zmq_pipes.py +36 -2
- parsl/executors/radical/rpex_resources.py +3 -7
- parsl/tests/conftest.py +2 -2
- parsl/tests/sites/test_dynamic_executor.py +0 -1
- parsl/tests/test_bash_apps/test_std_uri.py +0 -6
- parsl/tests/test_checkpointing/test_periodic.py +2 -7
- parsl/tests/test_checkpointing/test_python_checkpoint_2.py +0 -1
- parsl/tests/test_checkpointing/test_python_checkpoint_3.py +0 -1
- parsl/tests/test_checkpointing/test_task_exit.py +0 -1
- parsl/tests/test_htex/test_basic.py +0 -1
- parsl/tests/test_htex/test_command_client_timeout.py +69 -0
- parsl/tests/test_htex/test_cpu_affinity_explicit.py +1 -8
- parsl/tests/test_htex/test_manager_failure.py +0 -1
- parsl/tests/test_htex/test_managers_command.py +2 -7
- parsl/tests/test_htex/test_missing_worker.py +2 -8
- parsl/tests/test_monitoring/test_app_names.py +0 -1
- parsl/tests/test_monitoring/test_basic.py +0 -2
- parsl/tests/test_monitoring/test_db_locks.py +0 -1
- parsl/tests/test_monitoring/test_fuzz_zmq.py +0 -1
- parsl/tests/test_monitoring/test_htex_init_blocks_vs_monitoring.py +0 -2
- parsl/tests/test_monitoring/test_incomplete_futures.py +0 -1
- parsl/tests/test_monitoring/test_memoization_representation.py +0 -1
- parsl/tests/test_monitoring/test_stdouterr.py +0 -2
- parsl/tests/test_mpi_apps/test_mpi_mode_disabled.py +2 -7
- parsl/tests/test_mpi_apps/test_mpi_mode_enabled.py +10 -1
- parsl/tests/test_mpi_apps/test_resource_spec.py +14 -9
- parsl/tests/test_python_apps/test_context_manager.py +1 -9
- parsl/tests/test_python_apps/test_lifted.py +10 -6
- parsl/tests/test_python_apps/test_pluggable_future_resolution.py +161 -0
- parsl/tests/test_scaling/test_regression_1621.py +0 -2
- parsl/tests/test_scaling/test_shutdown_scalein.py +0 -2
- parsl/tests/test_serialization/test_proxystore_configured.py +0 -1
- parsl/tests/test_shutdown/test_kill_monitoring.py +0 -2
- parsl/tests/test_staging/test_1316.py +0 -2
- parsl/tests/test_staging/test_elaborate_noop_file.py +0 -1
- parsl/tests/test_summary.py +0 -1
- parsl/tests/test_threads/test_configs.py +0 -1
- parsl/tests/test_threads/test_lazy_errors.py +0 -1
- parsl/version.py +1 -1
- {parsl-2024.5.20.dist-info → parsl-2024.5.27.dist-info}/METADATA +6 -6
- {parsl-2024.5.20.dist-info → parsl-2024.5.27.dist-info}/RECORD +55 -52
- {parsl-2024.5.20.data → parsl-2024.5.27.data}/scripts/exec_parsl_function.py +0 -0
- {parsl-2024.5.20.data → parsl-2024.5.27.data}/scripts/parsl_coprocess.py +0 -0
- {parsl-2024.5.20.data → parsl-2024.5.27.data}/scripts/process_worker_pool.py +0 -0
- {parsl-2024.5.20.dist-info → parsl-2024.5.27.dist-info}/LICENSE +0 -0
- {parsl-2024.5.20.dist-info → parsl-2024.5.27.dist-info}/WHEEL +0 -0
- {parsl-2024.5.20.dist-info → parsl-2024.5.27.dist-info}/entry_points.txt +0 -0
- {parsl-2024.5.20.dist-info → parsl-2024.5.27.dist-info}/top_level.txt +0 -0
parsl/config.py
CHANGED
@@ -5,6 +5,7 @@ from typing import Callable, Iterable, Optional, Sequence, Union
|
|
5
5
|
from typing_extensions import Literal
|
6
6
|
|
7
7
|
from parsl.utils import RepresentationMixin
|
8
|
+
from parsl.dataflow.dependency_resolvers import DependencyResolver
|
8
9
|
from parsl.executors.base import ParslExecutor
|
9
10
|
from parsl.executors.threads import ThreadPoolExecutor
|
10
11
|
from parsl.errors import ConfigurationError
|
@@ -35,6 +36,8 @@ class Config(RepresentationMixin, UsageInformation):
|
|
35
36
|
checkpoint_period : str, optional
|
36
37
|
Time interval (in "HH:MM:SS") at which to checkpoint completed tasks. Only has an effect if
|
37
38
|
``checkpoint_mode='periodic'``.
|
39
|
+
dependency_resolver: plugin point for custom dependency resolvers. Default: only resolve Futures,
|
40
|
+
using the `SHALLOW_DEPENDENCY_RESOLVER`.
|
38
41
|
garbage_collect : bool. optional.
|
39
42
|
Delete task records from DFK when tasks have completed. Default: True
|
40
43
|
internal_tasks_max_threads : int, optional
|
@@ -88,6 +91,7 @@ class Config(RepresentationMixin, UsageInformation):
|
|
88
91
|
Literal['dfk_exit'],
|
89
92
|
Literal['manual']] = None,
|
90
93
|
checkpoint_period: Optional[str] = None,
|
94
|
+
dependency_resolver: Optional[DependencyResolver] = None,
|
91
95
|
garbage_collect: bool = True,
|
92
96
|
internal_tasks_max_threads: int = 10,
|
93
97
|
retries: int = 0,
|
@@ -123,6 +127,7 @@ class Config(RepresentationMixin, UsageInformation):
|
|
123
127
|
if checkpoint_mode == 'periodic' and checkpoint_period is None:
|
124
128
|
checkpoint_period = "00:30:00"
|
125
129
|
self.checkpoint_period = checkpoint_period
|
130
|
+
self.dependency_resolver = dependency_resolver
|
126
131
|
self.garbage_collect = garbage_collect
|
127
132
|
self.internal_tasks_max_threads = internal_tasks_max_threads
|
128
133
|
self.retries = retries
|
@@ -152,4 +157,5 @@ class Config(RepresentationMixin, UsageInformation):
|
|
152
157
|
', '.join(['label={}'.format(repr(d)) for d in duplicates])))
|
153
158
|
|
154
159
|
def get_usage_information(self):
|
155
|
-
return {"executors_len": len(self.executors)
|
160
|
+
return {"executors_len": len(self.executors),
|
161
|
+
"dependency_resolver": self.dependency_resolver is not None}
|
@@ -0,0 +1,115 @@
|
|
1
|
+
from concurrent.futures import Future
|
2
|
+
from dataclasses import dataclass
|
3
|
+
from functools import singledispatch
|
4
|
+
from typing import Callable, Sequence
|
5
|
+
|
6
|
+
|
7
|
+
@dataclass
|
8
|
+
class DependencyResolver:
|
9
|
+
"""A DependencyResolver describes how app dependencies can be resolved.
|
10
|
+
It is specified as two functions: `traverse_to_gather` which turns an
|
11
|
+
app parameter into a sequence of futures which must be waited for before
|
12
|
+
the task can be executed (for example, in the case of
|
13
|
+
`DEEP_DEPENDENCY_RESOLVER` this traverses structures such as lists to
|
14
|
+
find every contained ``Future``), and `traverse_to_unwrap` which turns an
|
15
|
+
app parameter into its value to be passed to the app on execution
|
16
|
+
(for example in the case of `DEEP_DEPENDENCY_RESOLVER` this replaces a
|
17
|
+
list containing futures with a new list containing the values of those
|
18
|
+
resolved futures).
|
19
|
+
|
20
|
+
By default, Parsl will use `SHALLOW_DEPENDENCY_RESOLVER` which only
|
21
|
+
resolves Futures passed directly as arguments.
|
22
|
+
"""
|
23
|
+
traverse_to_gather: Callable[[object], Sequence[Future]]
|
24
|
+
traverse_to_unwrap: Callable[[object], object]
|
25
|
+
|
26
|
+
|
27
|
+
@singledispatch
|
28
|
+
def shallow_traverse_to_gather(o):
|
29
|
+
# objects in general do not expose futures that we can see
|
30
|
+
return []
|
31
|
+
|
32
|
+
|
33
|
+
@singledispatch
|
34
|
+
def shallow_traverse_to_unwrap(o):
|
35
|
+
# objects in general unwrap to themselves
|
36
|
+
return o
|
37
|
+
|
38
|
+
|
39
|
+
@shallow_traverse_to_gather.register
|
40
|
+
def _(fut: Future):
|
41
|
+
return [fut]
|
42
|
+
|
43
|
+
|
44
|
+
@shallow_traverse_to_unwrap.register
|
45
|
+
@singledispatch
|
46
|
+
def _(fut: Future):
|
47
|
+
assert fut.done()
|
48
|
+
return fut.result()
|
49
|
+
|
50
|
+
|
51
|
+
@singledispatch
|
52
|
+
def deep_traverse_to_gather(o):
|
53
|
+
# objects in general do not expose futures that we can see
|
54
|
+
return []
|
55
|
+
|
56
|
+
|
57
|
+
@singledispatch
|
58
|
+
def deep_traverse_to_unwrap(o):
|
59
|
+
# objects in general unwrap to themselves
|
60
|
+
return o
|
61
|
+
|
62
|
+
|
63
|
+
@deep_traverse_to_gather.register
|
64
|
+
def _(fut: Future):
|
65
|
+
return [fut]
|
66
|
+
|
67
|
+
|
68
|
+
@deep_traverse_to_unwrap.register
|
69
|
+
@singledispatch
|
70
|
+
def _(fut: Future):
|
71
|
+
assert fut.done()
|
72
|
+
return fut.result()
|
73
|
+
|
74
|
+
|
75
|
+
@deep_traverse_to_gather.register(tuple)
|
76
|
+
@deep_traverse_to_gather.register(list)
|
77
|
+
@deep_traverse_to_gather.register(set)
|
78
|
+
def _(iterable):
|
79
|
+
return [e for v in iterable for e in deep_traverse_to_gather(v)]
|
80
|
+
|
81
|
+
|
82
|
+
@deep_traverse_to_unwrap.register(tuple)
|
83
|
+
@deep_traverse_to_unwrap.register(list)
|
84
|
+
@deep_traverse_to_unwrap.register(set)
|
85
|
+
@singledispatch
|
86
|
+
def _(iterable):
|
87
|
+
|
88
|
+
type_ = type(iterable)
|
89
|
+
return type_(map(deep_traverse_to_unwrap, iterable))
|
90
|
+
|
91
|
+
|
92
|
+
@deep_traverse_to_gather.register(dict)
|
93
|
+
def _(dictionary):
|
94
|
+
futures = []
|
95
|
+
for key, value in dictionary.items():
|
96
|
+
futures.extend(deep_traverse_to_gather(key))
|
97
|
+
futures.extend(deep_traverse_to_gather(value))
|
98
|
+
return futures
|
99
|
+
|
100
|
+
|
101
|
+
@deep_traverse_to_unwrap.register(dict)
|
102
|
+
def _(dictionary):
|
103
|
+
unwrapped_dict = {}
|
104
|
+
for key, value in dictionary.items():
|
105
|
+
key = deep_traverse_to_unwrap(key)
|
106
|
+
value = deep_traverse_to_unwrap(value)
|
107
|
+
unwrapped_dict[key] = value
|
108
|
+
return unwrapped_dict
|
109
|
+
|
110
|
+
|
111
|
+
DEEP_DEPENDENCY_RESOLVER = DependencyResolver(traverse_to_gather=deep_traverse_to_gather,
|
112
|
+
traverse_to_unwrap=deep_traverse_to_unwrap)
|
113
|
+
|
114
|
+
SHALLOW_DEPENDENCY_RESOLVER = DependencyResolver(traverse_to_gather=shallow_traverse_to_gather,
|
115
|
+
traverse_to_unwrap=shallow_traverse_to_unwrap)
|
parsl/dataflow/dflow.py
CHANGED
@@ -26,6 +26,7 @@ from parsl.channels import Channel
|
|
26
26
|
from parsl.config import Config
|
27
27
|
from parsl.data_provider.data_manager import DataManager
|
28
28
|
from parsl.data_provider.files import File
|
29
|
+
from parsl.dataflow.dependency_resolvers import SHALLOW_DEPENDENCY_RESOLVER
|
29
30
|
from parsl.dataflow.errors import BadCheckpoint, DependencyError, JoinError
|
30
31
|
from parsl.dataflow.futures import AppFuture
|
31
32
|
from parsl.dataflow.memoization import Memoizer
|
@@ -203,6 +204,9 @@ class DataFlowKernel:
|
|
203
204
|
self.tasks: Dict[int, TaskRecord] = {}
|
204
205
|
self.submitter_lock = threading.Lock()
|
205
206
|
|
207
|
+
self.dependency_resolver = self.config.dependency_resolver if self.config.dependency_resolver is not None \
|
208
|
+
else SHALLOW_DEPENDENCY_RESOLVER
|
209
|
+
|
206
210
|
atexit.register(self.atexit_cleanup)
|
207
211
|
|
208
212
|
def __enter__(self):
|
@@ -852,8 +856,11 @@ class DataFlowKernel:
|
|
852
856
|
depends: List[Future] = []
|
853
857
|
|
854
858
|
def check_dep(d: Any) -> None:
|
855
|
-
|
856
|
-
depends.extend(
|
859
|
+
try:
|
860
|
+
depends.extend(self.dependency_resolver.traverse_to_gather(d))
|
861
|
+
except Exception:
|
862
|
+
logger.exception("Exception in dependency_resolver.traverse_to_gather")
|
863
|
+
raise
|
857
864
|
|
858
865
|
# Check the positional args
|
859
866
|
for dep in args:
|
@@ -870,7 +877,8 @@ class DataFlowKernel:
|
|
870
877
|
|
871
878
|
return depends
|
872
879
|
|
873
|
-
def _unwrap_futures(self, args, kwargs)
|
880
|
+
def _unwrap_futures(self, args: Sequence[Any], kwargs: Dict[str, Any]) \
|
881
|
+
-> Tuple[Sequence[Any], Dict[str, Any], Sequence[Tuple[Exception, str]]]:
|
874
882
|
"""This function should be called when all dependencies have completed.
|
875
883
|
|
876
884
|
It will rewrite the arguments for that task, replacing each Future
|
@@ -891,53 +899,40 @@ class DataFlowKernel:
|
|
891
899
|
"""
|
892
900
|
dep_failures = []
|
893
901
|
|
902
|
+
def append_failure(e: Exception, dep: Future) -> None:
|
903
|
+
# If this Future is associated with a task inside this DFK,
|
904
|
+
# then refer to the task ID.
|
905
|
+
# Otherwise make a repr of the Future object.
|
906
|
+
if hasattr(dep, 'task_record') and dep.task_record['dfk'] == self:
|
907
|
+
tid = "task " + repr(dep.task_record['id'])
|
908
|
+
else:
|
909
|
+
tid = repr(dep)
|
910
|
+
dep_failures.extend([(e, tid)])
|
911
|
+
|
894
912
|
# Replace item in args
|
895
913
|
new_args = []
|
896
914
|
for dep in args:
|
897
|
-
|
898
|
-
|
899
|
-
|
900
|
-
|
901
|
-
# If this Future is associated with a task inside this DFK,
|
902
|
-
# then refer to the task ID.
|
903
|
-
# Otherwise make a repr of the Future object.
|
904
|
-
if hasattr(dep, 'task_record') and dep.task_record['dfk'] == self:
|
905
|
-
tid = "task " + repr(dep.task_record['id'])
|
906
|
-
else:
|
907
|
-
tid = repr(dep)
|
908
|
-
dep_failures.extend([(e, tid)])
|
909
|
-
else:
|
910
|
-
new_args.extend([dep])
|
915
|
+
try:
|
916
|
+
new_args.extend([self.dependency_resolver.traverse_to_unwrap(dep)])
|
917
|
+
except Exception as e:
|
918
|
+
append_failure(e, dep)
|
911
919
|
|
912
920
|
# Check for explicit kwargs ex, fu_1=<fut>
|
913
921
|
for key in kwargs:
|
914
922
|
dep = kwargs[key]
|
915
|
-
|
916
|
-
|
917
|
-
|
918
|
-
|
919
|
-
if hasattr(dep, 'task_record'):
|
920
|
-
tid = dep.task_record['id']
|
921
|
-
else:
|
922
|
-
tid = None
|
923
|
-
dep_failures.extend([(e, tid)])
|
923
|
+
try:
|
924
|
+
kwargs[key] = self.dependency_resolver.traverse_to_unwrap(dep)
|
925
|
+
except Exception as e:
|
926
|
+
append_failure(e, dep)
|
924
927
|
|
925
928
|
# Check for futures in inputs=[<fut>...]
|
926
929
|
if 'inputs' in kwargs:
|
927
930
|
new_inputs = []
|
928
931
|
for dep in kwargs['inputs']:
|
929
|
-
|
930
|
-
|
931
|
-
|
932
|
-
|
933
|
-
if hasattr(dep, 'task_record'):
|
934
|
-
tid = dep.task_record['id']
|
935
|
-
else:
|
936
|
-
tid = None
|
937
|
-
dep_failures.extend([(e, tid)])
|
938
|
-
|
939
|
-
else:
|
940
|
-
new_inputs.extend([dep])
|
932
|
+
try:
|
933
|
+
new_inputs.extend([self.dependency_resolver.traverse_to_unwrap(dep)])
|
934
|
+
except Exception as e:
|
935
|
+
append_failure(e, dep)
|
941
936
|
kwargs['inputs'] = new_inputs
|
942
937
|
|
943
938
|
return new_args, kwargs, dep_failures
|
@@ -1042,6 +1037,8 @@ class DataFlowKernel:
|
|
1042
1037
|
|
1043
1038
|
func = self._add_output_deps(executor, app_args, app_kwargs, app_fu, func)
|
1044
1039
|
|
1040
|
+
logger.debug("Added output dependencies")
|
1041
|
+
|
1045
1042
|
# Replace the function invocation in the TaskRecord with whatever file-staging
|
1046
1043
|
# substitutions have been made.
|
1047
1044
|
task_record.update({
|
@@ -1053,8 +1050,10 @@ class DataFlowKernel:
|
|
1053
1050
|
|
1054
1051
|
self.tasks[task_id] = task_record
|
1055
1052
|
|
1053
|
+
logger.debug("Gathering dependencies")
|
1056
1054
|
# Get the list of dependencies for the task
|
1057
1055
|
depends = self._gather_all_deps(app_args, app_kwargs)
|
1056
|
+
logger.debug("Gathered dependencies")
|
1058
1057
|
task_record['depends'] = depends
|
1059
1058
|
|
1060
1059
|
depend_descs = []
|
@@ -1271,6 +1270,13 @@ class DataFlowKernel:
|
|
1271
1270
|
atexit.unregister(self.atexit_cleanup)
|
1272
1271
|
logger.info("Unregistered atexit hook")
|
1273
1272
|
|
1273
|
+
if DataFlowKernelLoader._dfk is self:
|
1274
|
+
logger.info("Unregistering default DFK")
|
1275
|
+
parsl.clear()
|
1276
|
+
logger.info("Unregistered default DFK")
|
1277
|
+
else:
|
1278
|
+
logger.debug("Cleaning up non-default DFK - not unregistering")
|
1279
|
+
|
1274
1280
|
logger.info("DFK cleanup complete")
|
1275
1281
|
|
1276
1282
|
def checkpoint(self, tasks: Optional[Sequence[TaskRecord]] = None) -> str:
|
@@ -10,3 +10,13 @@ class WorkerLost(Exception):
|
|
10
10
|
|
11
11
|
def __str__(self):
|
12
12
|
return self.__repr__()
|
13
|
+
|
14
|
+
|
15
|
+
class CommandClientTimeoutError(Exception):
|
16
|
+
"""Raised when the command client times out waiting for a response.
|
17
|
+
"""
|
18
|
+
|
19
|
+
|
20
|
+
class CommandClientBadError(Exception):
|
21
|
+
"""Raised when the command client is bad from an earlier timeout.
|
22
|
+
"""
|
@@ -645,7 +645,8 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin, UsageIn
|
|
645
645
|
Returns:
|
646
646
|
Future
|
647
647
|
"""
|
648
|
-
|
648
|
+
|
649
|
+
validate_resource_spec(resource_specification, self.enable_mpi_mode)
|
649
650
|
|
650
651
|
if self.bad_state_is_set:
|
651
652
|
raise self.executor_exception
|
@@ -20,7 +20,7 @@ class MPIExecutor(HighThroughputExecutor):
|
|
20
20
|
to spawn multi-node tasks.
|
21
21
|
|
22
22
|
Specify the maximum number of multi-node tasks to run at once using ``max_workers_per_block``.
|
23
|
-
The
|
23
|
+
The value should be less than or equal to the ``nodes_per_block`` in the Provider.
|
24
24
|
|
25
25
|
Parameters
|
26
26
|
----------
|
@@ -8,8 +8,18 @@ VALID_LAUNCHERS = ('srun',
|
|
8
8
|
'mpiexec')
|
9
9
|
|
10
10
|
|
11
|
+
class MissingResourceSpecification(Exception):
|
12
|
+
"""Exception raised when input is not supplied a resource specification"""
|
13
|
+
|
14
|
+
def __init__(self, reason: str):
|
15
|
+
self.reason = reason
|
16
|
+
|
17
|
+
def __str__(self):
|
18
|
+
return f"Missing resource specification: {self.reason}"
|
19
|
+
|
20
|
+
|
11
21
|
class InvalidResourceSpecification(Exception):
|
12
|
-
"""Exception raised when Invalid
|
22
|
+
"""Exception raised when Invalid input is supplied via resource specification"""
|
13
23
|
|
14
24
|
def __init__(self, invalid_keys: Set[str]):
|
15
25
|
self.invalid_keys = invalid_keys
|
@@ -18,13 +28,19 @@ class InvalidResourceSpecification(Exception):
|
|
18
28
|
return f"Invalid resource specification options supplied: {self.invalid_keys}"
|
19
29
|
|
20
30
|
|
21
|
-
def validate_resource_spec(resource_spec: Dict[str, str]):
|
31
|
+
def validate_resource_spec(resource_spec: Dict[str, str], is_mpi_enabled: bool):
|
22
32
|
"""Basic validation of keys in the resource_spec
|
23
33
|
|
24
34
|
Raises: InvalidResourceSpecification if the resource_spec
|
25
35
|
is invalid (e.g, contains invalid keys)
|
26
36
|
"""
|
27
37
|
user_keys = set(resource_spec.keys())
|
38
|
+
|
39
|
+
# empty resource_spec when mpi_mode is set causes parsl to hang
|
40
|
+
# ref issue #3427
|
41
|
+
if is_mpi_enabled and len(user_keys) == 0:
|
42
|
+
raise MissingResourceSpecification('MPI mode requires optional parsl_resource_specification keyword argument to be configured')
|
43
|
+
|
28
44
|
legal_keys = set(("ranks_per_node",
|
29
45
|
"num_nodes",
|
30
46
|
"num_ranks",
|
@@ -3,8 +3,11 @@
|
|
3
3
|
import zmq
|
4
4
|
import logging
|
5
5
|
import threading
|
6
|
+
import time
|
6
7
|
|
7
8
|
from parsl import curvezmq
|
9
|
+
from parsl.errors import InternalConsistencyError
|
10
|
+
from parsl.executors.high_throughput.errors import CommandClientBadError, CommandClientTimeoutError
|
8
11
|
|
9
12
|
logger = logging.getLogger(__name__)
|
10
13
|
|
@@ -31,6 +34,7 @@ class CommandClient:
|
|
31
34
|
self.port = None
|
32
35
|
self.create_socket_and_bind()
|
33
36
|
self._lock = threading.Lock()
|
37
|
+
self.ok = True
|
34
38
|
|
35
39
|
def create_socket_and_bind(self):
|
36
40
|
""" Creates socket and binds to a port.
|
@@ -46,7 +50,7 @@ class CommandClient:
|
|
46
50
|
else:
|
47
51
|
self.zmq_socket.bind("tcp://{}:{}".format(self.ip_address, self.port))
|
48
52
|
|
49
|
-
def run(self, message, max_retries=3):
|
53
|
+
def run(self, message, max_retries=3, timeout_s=None):
|
50
54
|
""" This function needs to be fast at the same time aware of the possibility of
|
51
55
|
ZMQ pipes overflowing.
|
52
56
|
|
@@ -54,13 +58,43 @@ class CommandClient:
|
|
54
58
|
in ZMQ sockets reaching a broken state once there are ~10k tasks in flight.
|
55
59
|
This issue can be magnified if each the serialized buffer itself is larger.
|
56
60
|
"""
|
61
|
+
if not self.ok:
|
62
|
+
raise CommandClientBadError()
|
63
|
+
|
64
|
+
start_time_s = time.monotonic()
|
65
|
+
|
57
66
|
reply = '__PARSL_ZMQ_PIPES_MAGIC__'
|
58
67
|
with self._lock:
|
59
68
|
for _ in range(max_retries):
|
60
69
|
try:
|
61
70
|
logger.debug("Sending command client command")
|
71
|
+
|
72
|
+
if timeout_s is not None:
|
73
|
+
remaining_time_s = start_time_s + timeout_s - time.monotonic()
|
74
|
+
poll_result = self.zmq_socket.poll(timeout=remaining_time_s * 1000, flags=zmq.POLLOUT)
|
75
|
+
if poll_result == zmq.POLLOUT:
|
76
|
+
pass # this is OK, so continue
|
77
|
+
elif poll_result == 0:
|
78
|
+
raise CommandClientTimeoutError("Waiting for command channel to be ready for a command")
|
79
|
+
else:
|
80
|
+
raise InternalConsistencyError(f"ZMQ poll returned unexpected value: {poll_result}")
|
81
|
+
|
62
82
|
self.zmq_socket.send_pyobj(message, copy=True)
|
63
|
-
|
83
|
+
|
84
|
+
if timeout_s is not None:
|
85
|
+
logger.debug("Polling for command client response or timeout")
|
86
|
+
remaining_time_s = start_time_s + timeout_s - time.monotonic()
|
87
|
+
poll_result = self.zmq_socket.poll(timeout=remaining_time_s * 1000, flags=zmq.POLLIN)
|
88
|
+
if poll_result == zmq.POLLIN:
|
89
|
+
pass # this is OK, so continue
|
90
|
+
elif poll_result == 0:
|
91
|
+
logger.error("Command timed-out - command client is now bad forever")
|
92
|
+
self.ok = False
|
93
|
+
raise CommandClientTimeoutError("Waiting for a reply from command channel")
|
94
|
+
else:
|
95
|
+
raise InternalConsistencyError(f"ZMQ poll returned unexpected value: {poll_result}")
|
96
|
+
|
97
|
+
logger.debug("Receiving command client response")
|
64
98
|
reply = self.zmq_socket.recv_pyobj()
|
65
99
|
logger.debug("Received command client response")
|
66
100
|
except zmq.ZMQError:
|
@@ -3,15 +3,11 @@ import json
|
|
3
3
|
|
4
4
|
from typing import List
|
5
5
|
|
6
|
-
_setup_paths: List[str]
|
6
|
+
_setup_paths: List[str] = []
|
7
7
|
try:
|
8
8
|
import radical.pilot as rp
|
9
|
-
import radical.utils as ru
|
10
9
|
except ImportError:
|
11
|
-
|
12
|
-
else:
|
13
|
-
_setup_paths = [rp.sdist_path,
|
14
|
-
ru.sdist_path]
|
10
|
+
pass
|
15
11
|
|
16
12
|
|
17
13
|
MPI = "mpi"
|
@@ -77,7 +73,7 @@ class ResourceConfig:
|
|
77
73
|
|
78
74
|
pilot_env_setup : list
|
79
75
|
List of setup commands/packages for the pilot environment.
|
80
|
-
Default
|
76
|
+
Default is an empty list.
|
81
77
|
|
82
78
|
python_v : str
|
83
79
|
The Python version to be used in the pilot environment.
|
parsl/tests/conftest.py
CHANGED
@@ -201,7 +201,7 @@ def load_dfk_session(request, pytestconfig, tmpd_cwd_session):
|
|
201
201
|
if parsl.dfk() != dfk:
|
202
202
|
raise RuntimeError("DFK changed unexpectedly during test")
|
203
203
|
dfk.cleanup()
|
204
|
-
|
204
|
+
assert DataFlowKernelLoader._dfk is None
|
205
205
|
else:
|
206
206
|
yield
|
207
207
|
|
@@ -253,7 +253,7 @@ def load_dfk_local_module(request, pytestconfig, tmpd_cwd_session):
|
|
253
253
|
if parsl.dfk() != dfk:
|
254
254
|
raise RuntimeError("DFK changed unexpectedly during test")
|
255
255
|
dfk.cleanup()
|
256
|
-
|
256
|
+
assert DataFlowKernelLoader._dfk is None
|
257
257
|
|
258
258
|
else:
|
259
259
|
yield
|
@@ -35,8 +35,6 @@ def const_with_cpath(autopath_specifier, content_path, caplog):
|
|
35
35
|
for record in caplog.records:
|
36
36
|
assert record.levelno < logging.ERROR
|
37
37
|
|
38
|
-
parsl.clear()
|
39
|
-
|
40
38
|
|
41
39
|
@pytest.mark.local
|
42
40
|
def test_std_autopath_const_str(caplog, tmpd_cwd):
|
@@ -74,8 +72,6 @@ def test_std_autopath_fail(caplog):
|
|
74
72
|
with pytest.raises(URIFailError):
|
75
73
|
app_stdout()
|
76
74
|
|
77
|
-
parsl.clear()
|
78
|
-
|
79
75
|
|
80
76
|
@parsl.bash_app
|
81
77
|
def app_both(stdout=parsl.AUTO_LOGNAME, stderr=parsl.AUTO_LOGNAME):
|
@@ -124,5 +120,3 @@ def test_std_autopath_zip(caplog, tmpd_cwd):
|
|
124
120
|
|
125
121
|
for record in caplog.records:
|
126
122
|
assert record.levelno < logging.ERROR
|
127
|
-
|
128
|
-
parsl.clear()
|
@@ -9,12 +9,6 @@ def local_setup():
|
|
9
9
|
parsl.load(fresh_config())
|
10
10
|
|
11
11
|
|
12
|
-
def local_teardown():
|
13
|
-
# explicit clear without dfk.cleanup here, because the
|
14
|
-
# test does that already
|
15
|
-
parsl.clear()
|
16
|
-
|
17
|
-
|
18
12
|
@python_app(cache=True)
|
19
13
|
def slow_double(x, sleep_dur=1):
|
20
14
|
import time
|
@@ -39,9 +33,10 @@ def test_periodic():
|
|
39
33
|
with parsl.dfk():
|
40
34
|
futs = [slow_double(sleep_for) for _ in range(4)]
|
41
35
|
[f.result() for f in futs]
|
36
|
+
run_dir = parsl.dfk().run_dir
|
42
37
|
|
43
38
|
# Here we will check if the loglines came back with 5 seconds deltas
|
44
|
-
with open("{}/parsl.log".format(
|
39
|
+
with open("{}/parsl.log".format(run_dir)) as f:
|
45
40
|
log_lines = f.readlines()
|
46
41
|
expected_msg = " Done checkpointing"
|
47
42
|
expected_msg2 = " No tasks checkpointed in this pass"
|
@@ -0,0 +1,69 @@
|
|
1
|
+
import pytest
|
2
|
+
import threading
|
3
|
+
import time
|
4
|
+
import zmq
|
5
|
+
from parsl import curvezmq
|
6
|
+
from parsl.executors.high_throughput.zmq_pipes import CommandClient
|
7
|
+
from parsl.executors.high_throughput.errors import CommandClientTimeoutError, CommandClientBadError
|
8
|
+
|
9
|
+
|
10
|
+
# Time constant used for timeout tests: various delays and
|
11
|
+
# timeouts will be appropriate multiples of this, but the
|
12
|
+
# value of T itself should not matter too much as long as
|
13
|
+
# it is big enough for zmq connections to happen successfully.
|
14
|
+
T = 0.25
|
15
|
+
|
16
|
+
|
17
|
+
@pytest.mark.local
|
18
|
+
def test_command_not_sent() -> None:
|
19
|
+
"""Tests timeout on command send.
|
20
|
+
"""
|
21
|
+
ctx = curvezmq.ClientContext(None)
|
22
|
+
|
23
|
+
# RFC6335 ephemeral port range
|
24
|
+
cc = CommandClient(ctx, "127.0.0.1", (49152, 65535))
|
25
|
+
|
26
|
+
# cc will now wait for a connection, but we won't do anything to make the
|
27
|
+
# other side of the connection exist, so any command given to cc should
|
28
|
+
# timeout.
|
29
|
+
|
30
|
+
with pytest.raises(CommandClientTimeoutError):
|
31
|
+
cc.run("SOMECOMMAND", timeout_s=T)
|
32
|
+
|
33
|
+
cc.close()
|
34
|
+
|
35
|
+
|
36
|
+
@pytest.mark.local
|
37
|
+
def test_command_ignored() -> None:
|
38
|
+
"""Tests timeout on command response.
|
39
|
+
Tests that we timeout after a response and that the command client
|
40
|
+
sets itself into a bad state.
|
41
|
+
|
42
|
+
This only tests sequential access to the command client, even though
|
43
|
+
htex makes multithreaded use of the command client: see issue #3376 about
|
44
|
+
that lack of thread safety.
|
45
|
+
"""
|
46
|
+
ctx = curvezmq.ClientContext(None)
|
47
|
+
|
48
|
+
# RFC6335 ephemeral port range
|
49
|
+
cc = CommandClient(ctx, "127.0.0.1", (49152, 65535))
|
50
|
+
|
51
|
+
ic_ctx = curvezmq.ServerContext(None)
|
52
|
+
ic_channel = ic_ctx.socket(zmq.REP)
|
53
|
+
ic_channel.connect(f"tcp://127.0.0.1:{cc.port}")
|
54
|
+
|
55
|
+
with pytest.raises(CommandClientTimeoutError):
|
56
|
+
cc.run("SLOW_COMMAND", timeout_s=T)
|
57
|
+
|
58
|
+
req = ic_channel.recv_pyobj()
|
59
|
+
assert req == "SLOW_COMMAND", "Should have received command on interchange side"
|
60
|
+
assert not cc.ok, "CommandClient should have set itself to bad"
|
61
|
+
|
62
|
+
with pytest.raises(CommandClientBadError):
|
63
|
+
cc.run("ANOTHER_COMMAND")
|
64
|
+
|
65
|
+
cc.close()
|
66
|
+
ctx.term()
|
67
|
+
|
68
|
+
ic_channel.close()
|
69
|
+
ic_ctx.term()
|
@@ -37,16 +37,9 @@ def test_cpu_affinity_explicit():
|
|
37
37
|
config.executors[0].max_workers_per_node = 1
|
38
38
|
|
39
39
|
logger.debug(f"config: {config}")
|
40
|
-
# TODO: is there a `with` style for this, to properly deal with exceptions?
|
41
|
-
|
42
|
-
parsl.load(config)
|
43
|
-
try:
|
44
40
|
|
41
|
+
with parsl.load(config):
|
45
42
|
worker_affinity = my_affinity().result()
|
46
43
|
logger.debug(f"worker reported this affinity: {worker_affinity}")
|
47
44
|
assert len(worker_affinity) == 1
|
48
45
|
assert worker_affinity == set((single_core,))
|
49
|
-
|
50
|
-
finally:
|
51
|
-
parsl.dfk().cleanup()
|
52
|
-
parsl.clear()
|