parsl 2024.5.13__py3-none-any.whl → 2024.5.27__py3-none-any.whl
This diff compares the contents of two publicly released versions of this package, as published to their public registry. It is provided for informational purposes only.
- parsl/channels/base.py +2 -9
- parsl/channels/local/local.py +3 -6
- parsl/channels/oauth_ssh/oauth_ssh.py +2 -2
- parsl/channels/ssh/ssh.py +2 -2
- parsl/config.py +7 -1
- parsl/dataflow/dependency_resolvers.py +115 -0
- parsl/dataflow/dflow.py +45 -39
- parsl/executors/__init__.py +2 -0
- parsl/executors/base.py +7 -7
- parsl/executors/high_throughput/errors.py +10 -0
- parsl/executors/high_throughput/executor.py +85 -84
- parsl/executors/high_throughput/interchange.py +6 -5
- parsl/executors/high_throughput/mpi_executor.py +85 -0
- parsl/executors/high_throughput/mpi_prefix_composer.py +18 -2
- parsl/executors/high_throughput/mpi_resource_management.py +3 -0
- parsl/executors/high_throughput/zmq_pipes.py +36 -2
- parsl/executors/radical/rpex_resources.py +3 -7
- parsl/monitoring/remote.py +18 -24
- parsl/providers/local/local.py +1 -1
- parsl/tests/conftest.py +2 -2
- parsl/tests/sites/test_dynamic_executor.py +0 -1
- parsl/tests/test_bash_apps/test_std_uri.py +0 -6
- parsl/tests/test_checkpointing/test_periodic.py +2 -7
- parsl/tests/test_checkpointing/test_python_checkpoint_2.py +0 -1
- parsl/tests/test_checkpointing/test_python_checkpoint_3.py +0 -1
- parsl/tests/test_checkpointing/test_task_exit.py +0 -1
- parsl/tests/test_htex/test_basic.py +0 -1
- parsl/tests/test_htex/test_command_client_timeout.py +69 -0
- parsl/tests/test_htex/test_cpu_affinity_explicit.py +1 -8
- parsl/tests/test_htex/test_manager_failure.py +0 -1
- parsl/tests/test_htex/test_managers_command.py +2 -7
- parsl/tests/test_htex/test_missing_worker.py +2 -8
- parsl/tests/test_monitoring/test_app_names.py +0 -1
- parsl/tests/test_monitoring/test_basic.py +0 -2
- parsl/tests/test_monitoring/test_db_locks.py +0 -1
- parsl/tests/test_monitoring/test_fuzz_zmq.py +0 -1
- parsl/tests/test_monitoring/test_htex_init_blocks_vs_monitoring.py +0 -2
- parsl/tests/test_monitoring/test_incomplete_futures.py +0 -1
- parsl/tests/test_monitoring/test_memoization_representation.py +0 -1
- parsl/tests/test_monitoring/test_stdouterr.py +0 -2
- parsl/tests/test_mpi_apps/test_bad_mpi_config.py +6 -14
- parsl/tests/test_mpi_apps/test_mpi_mode_disabled.py +2 -8
- parsl/tests/test_mpi_apps/test_mpi_mode_enabled.py +10 -1
- parsl/tests/test_mpi_apps/test_mpiex.py +64 -0
- parsl/tests/test_mpi_apps/test_resource_spec.py +14 -9
- parsl/tests/test_python_apps/test_context_manager.py +1 -9
- parsl/tests/test_python_apps/test_lifted.py +10 -6
- parsl/tests/test_python_apps/test_pluggable_future_resolution.py +161 -0
- parsl/tests/test_scaling/test_regression_1621.py +0 -2
- parsl/tests/test_scaling/test_shutdown_scalein.py +0 -2
- parsl/tests/test_serialization/test_proxystore_configured.py +0 -1
- parsl/tests/test_shutdown/test_kill_monitoring.py +0 -2
- parsl/tests/test_staging/test_1316.py +0 -2
- parsl/tests/test_staging/test_elaborate_noop_file.py +0 -1
- parsl/tests/test_summary.py +0 -1
- parsl/tests/test_threads/test_configs.py +0 -1
- parsl/tests/test_threads/test_lazy_errors.py +0 -1
- parsl/version.py +1 -1
- {parsl-2024.5.13.dist-info → parsl-2024.5.27.dist-info}/METADATA +6 -4
- {parsl-2024.5.13.dist-info → parsl-2024.5.27.dist-info}/RECORD +67 -62
- {parsl-2024.5.13.data → parsl-2024.5.27.data}/scripts/exec_parsl_function.py +0 -0
- {parsl-2024.5.13.data → parsl-2024.5.27.data}/scripts/parsl_coprocess.py +0 -0
- {parsl-2024.5.13.data → parsl-2024.5.27.data}/scripts/process_worker_pool.py +0 -0
- {parsl-2024.5.13.dist-info → parsl-2024.5.27.dist-info}/LICENSE +0 -0
- {parsl-2024.5.13.dist-info → parsl-2024.5.27.dist-info}/WHEEL +0 -0
- {parsl-2024.5.13.dist-info → parsl-2024.5.27.dist-info}/entry_points.txt +0 -0
- {parsl-2024.5.13.dist-info → parsl-2024.5.27.dist-info}/top_level.txt +0 -0
parsl/executors/high_throughput/interchange.py
CHANGED

@@ -74,7 +74,7 @@ class Interchange:
                  worker_ports: Optional[Tuple[int, int]] = None,
                  worker_port_range: Tuple[int, int] = (54000, 55000),
                  hub_address: Optional[str] = None,
-                 hub_port: Optional[int] = None,
+                 hub_zmq_port: Optional[int] = None,
                  heartbeat_threshold: int = 60,
                  logdir: str = ".",
                  logging_level: int = logging.INFO,
@@ -105,7 +105,7 @@ class Interchange:
             The IP address at which the interchange can send info about managers to when monitoring is enabled.
             Default: None (meaning monitoring disabled)
 
-        hub_port : str
+        hub_zmq_port : str
             The port at which the interchange can send info about managers to when monitoring is enabled.
             Default: None (meaning monitoring disabled)
 
@@ -151,7 +151,7 @@ class Interchange:
         logger.info("Connected to client")
 
         self.hub_address = hub_address
-        self.hub_port = hub_port
+        self.hub_zmq_port = hub_zmq_port
 
         self.pending_task_queue: queue.Queue[Any] = queue.Queue(maxsize=10 ** 6)
         self.count = 0
@@ -244,12 +244,12 @@ class Interchange:
             logger.debug(f"Fetched {task_counter} tasks so far")
 
     def _create_monitoring_channel(self) -> Optional[zmq.Socket]:
-        if self.hub_address and self.hub_port:
+        if self.hub_address and self.hub_zmq_port:
             logger.info("Connecting to MonitoringHub")
             # This is a one-off because monitoring is unencrypted
             hub_channel = zmq.Context().socket(zmq.DEALER)
             hub_channel.set_hwm(0)
-            hub_channel.connect("tcp://{}:{}".format(self.hub_address, self.hub_port))
+            hub_channel.connect("tcp://{}:{}".format(self.hub_address, self.hub_zmq_port))
             logger.info("Connected to MonitoringHub")
             return hub_channel
         else:
@@ -329,6 +329,7 @@ class Interchange:
             reply = None
 
         else:
+            logger.error(f"Received unknown command: {command_req}")
             reply = None
 
         logger.debug("Reply: {}".format(reply))
parsl/executors/high_throughput/mpi_executor.py
ADDED

@@ -0,0 +1,85 @@
+"""A simplified interface for HTEx when running in MPI mode"""
+from typing import Optional, Tuple, List, Union, Callable, Dict
+
+import typeguard
+
+from parsl.data_provider.staging import Staging
+from parsl.executors.high_throughput.executor import HighThroughputExecutor, GENERAL_HTEX_PARAM_DOCS
+from parsl.executors.status_handling import BlockProviderExecutor
+from parsl.jobs.states import JobStatus
+from parsl.providers import LocalProvider
+from parsl.providers.base import ExecutionProvider
+
+
+class MPIExecutor(HighThroughputExecutor):
+    __doc__ = f"""A version of :class:`~parsl.HighThroughputExecutor` tuned for executing multi-node (e.g., MPI) tasks.
+
+    The Provider _must_ use the :class:`~parsl.launchers.SimpleLauncher`,
+    which places a single pool of workers on the first node of a block.
+    Each worker can then make system calls which use an MPI launcher (e.g., ``mpirun``, ``srun``)
+    to spawn multi-node tasks.
+
+    Specify the maximum number of multi-node tasks to run at once using ``max_workers_per_block``.
+    The value should be less than or equal to the ``nodes_per_block`` in the Provider.
+
+    Parameters
+    ----------
+    max_workers_per_block: int
+        Maximum number of MPI applications to run at once per block
+
+    {GENERAL_HTEX_PARAM_DOCS}
+    """
+
+    @typeguard.typechecked
+    def __init__(self,
+                 label: str = 'MPIExecutor',
+                 provider: ExecutionProvider = LocalProvider(),
+                 launch_cmd: Optional[str] = None,
+                 address: Optional[str] = None,
+                 worker_ports: Optional[Tuple[int, int]] = None,
+                 worker_port_range: Optional[Tuple[int, int]] = (54000, 55000),
+                 interchange_port_range: Optional[Tuple[int, int]] = (55000, 56000),
+                 storage_access: Optional[List[Staging]] = None,
+                 working_dir: Optional[str] = None,
+                 worker_debug: bool = False,
+                 max_workers_per_block: int = 1,
+                 prefetch_capacity: int = 0,
+                 heartbeat_threshold: int = 120,
+                 heartbeat_period: int = 30,
+                 drain_period: Optional[int] = None,
+                 poll_period: int = 10,
+                 address_probe_timeout: Optional[int] = None,
+                 worker_logdir_root: Optional[str] = None,
+                 mpi_launcher: str = "mpiexec",
+                 block_error_handler: Union[bool, Callable[[BlockProviderExecutor, Dict[str, JobStatus]], None]] = True,
+                 encrypted: bool = False):
+        super().__init__(
+            # Hard-coded settings
+            cores_per_worker=1e-9,  # Ensures there will be at least an absurd number of workers
+            enable_mpi_mode=True,
+            max_workers_per_node=max_workers_per_block,
+
+            # Everything else
+            label=label,
+            provider=provider,
+            launch_cmd=launch_cmd,
+            address=address,
+            worker_ports=worker_ports,
+            worker_port_range=worker_port_range,
+            interchange_port_range=interchange_port_range,
+            storage_access=storage_access,
+            working_dir=working_dir,
+            worker_debug=worker_debug,
+            prefetch_capacity=prefetch_capacity,
+            heartbeat_threshold=heartbeat_threshold,
+            heartbeat_period=heartbeat_period,
+            drain_period=drain_period,
+            poll_period=poll_period,
+            address_probe_timeout=address_probe_timeout,
+            worker_logdir_root=worker_logdir_root,
+            mpi_launcher=mpi_launcher,
+            block_error_handler=block_error_handler,
+            encrypted=encrypted
+        )
+
+        self.max_workers_per_block = max_workers_per_block
parsl/executors/high_throughput/mpi_prefix_composer.py
CHANGED

@@ -8,8 +8,18 @@ VALID_LAUNCHERS = ('srun',
                    'mpiexec')
 
 
+class MissingResourceSpecification(Exception):
+    """Exception raised when input is not supplied a resource specification"""
+
+    def __init__(self, reason: str):
+        self.reason = reason
+
+    def __str__(self):
+        return f"Missing resource specification: {self.reason}"
+
+
 class InvalidResourceSpecification(Exception):
-    """Exception raised when Invalid
+    """Exception raised when Invalid input is supplied via resource specification"""
 
     def __init__(self, invalid_keys: Set[str]):
         self.invalid_keys = invalid_keys
@@ -18,13 +28,19 @@ class InvalidResourceSpecification(Exception):
         return f"Invalid resource specification options supplied: {self.invalid_keys}"
 
 
-def validate_resource_spec(resource_spec: Dict[str, str]):
+def validate_resource_spec(resource_spec: Dict[str, str], is_mpi_enabled: bool):
     """Basic validation of keys in the resource_spec
 
     Raises: InvalidResourceSpecification if the resource_spec
     is invalid (e.g, contains invalid keys)
     """
     user_keys = set(resource_spec.keys())
+
+    # empty resource_spec when mpi_mode is set causes parsl to hang
+    # ref issue #3427
+    if is_mpi_enabled and len(user_keys) == 0:
+        raise MissingResourceSpecification('MPI mode requires optional parsl_resource_specification keyword argument to be configured')
+
     legal_keys = set(("ranks_per_node",
                       "num_nodes",
                       "num_ranks",
parsl/executors/high_throughput/mpi_resource_management.py
CHANGED

@@ -208,8 +208,11 @@ class MPITaskScheduler(TaskScheduler):
         """Return result and relinquish provisioned nodes"""
         result_pkl = self.pending_result_q.get(block, timeout=timeout)
         result_dict = pickle.loads(result_pkl)
+        # TODO (wardlt): If the task did not request nodes, it won't be in `self._map_tasks_to_nodes`.
+        #   Causes Parsl to hang. See Issue #3427
         if result_dict["type"] == "result":
             task_id = result_dict["task_id"]
+            assert task_id in self._map_tasks_to_nodes, "You are about to experience issue #3427"
             nodes_to_reallocate = self._map_tasks_to_nodes[task_id]
             self._return_nodes(nodes_to_reallocate)
             self._schedule_backlog_tasks()
parsl/executors/high_throughput/zmq_pipes.py
CHANGED

@@ -3,8 +3,11 @@
 import zmq
 import logging
 import threading
+import time
 
 from parsl import curvezmq
+from parsl.errors import InternalConsistencyError
+from parsl.executors.high_throughput.errors import CommandClientBadError, CommandClientTimeoutError
 
 logger = logging.getLogger(__name__)
 
@@ -31,6 +34,7 @@ class CommandClient:
         self.port = None
         self.create_socket_and_bind()
         self._lock = threading.Lock()
+        self.ok = True
 
     def create_socket_and_bind(self):
         """ Creates socket and binds to a port.
@@ -46,7 +50,7 @@ class CommandClient:
         else:
             self.zmq_socket.bind("tcp://{}:{}".format(self.ip_address, self.port))
 
-    def run(self, message, max_retries=3):
+    def run(self, message, max_retries=3, timeout_s=None):
         """ This function needs to be fast at the same time aware of the possibility of
         ZMQ pipes overflowing.
 
@@ -54,13 +58,43 @@ class CommandClient:
         in ZMQ sockets reaching a broken state once there are ~10k tasks in flight.
         This issue can be magnified if each the serialized buffer itself is larger.
         """
+        if not self.ok:
+            raise CommandClientBadError()
+
+        start_time_s = time.monotonic()
+
         reply = '__PARSL_ZMQ_PIPES_MAGIC__'
         with self._lock:
             for _ in range(max_retries):
                 try:
                     logger.debug("Sending command client command")
+
+                    if timeout_s is not None:
+                        remaining_time_s = start_time_s + timeout_s - time.monotonic()
+                        poll_result = self.zmq_socket.poll(timeout=remaining_time_s * 1000, flags=zmq.POLLOUT)
+                        if poll_result == zmq.POLLOUT:
+                            pass  # this is OK, so continue
+                        elif poll_result == 0:
+                            raise CommandClientTimeoutError("Waiting for command channel to be ready for a command")
+                        else:
+                            raise InternalConsistencyError(f"ZMQ poll returned unexpected value: {poll_result}")
+
                     self.zmq_socket.send_pyobj(message, copy=True)
-
+
+                    if timeout_s is not None:
+                        logger.debug("Polling for command client response or timeout")
+                        remaining_time_s = start_time_s + timeout_s - time.monotonic()
+                        poll_result = self.zmq_socket.poll(timeout=remaining_time_s * 1000, flags=zmq.POLLIN)
+                        if poll_result == zmq.POLLIN:
+                            pass  # this is OK, so continue
+                        elif poll_result == 0:
+                            logger.error("Command timed-out - command client is now bad forever")
+                            self.ok = False
+                            raise CommandClientTimeoutError("Waiting for a reply from command channel")
+                        else:
+                            raise InternalConsistencyError(f"ZMQ poll returned unexpected value: {poll_result}")
+
+                    logger.debug("Receiving command client response")
                     reply = self.zmq_socket.recv_pyobj()
                     logger.debug("Received command client response")
                 except zmq.ZMQError:
parsl/executors/radical/rpex_resources.py
CHANGED

@@ -3,15 +3,11 @@ import json
 
 from typing import List
 
-_setup_paths: List[str]
+_setup_paths: List[str] = []
 try:
     import radical.pilot as rp
-    import radical.utils as ru
 except ImportError:
-    _setup_paths = []
-else:
-    _setup_paths = [rp.sdist_path,
-                    ru.sdist_path]
+    pass
 
 
 MPI = "mpi"
@@ -77,7 +73,7 @@ class ResourceConfig:
 
     pilot_env_setup : list
         List of setup commands/packages for the pilot environment.
-        Default
+        Default is an empty list.
 
     python_v : str
         The Python version to be used in the pilot environment.
parsl/monitoring/remote.py
CHANGED
@@ -96,6 +96,22 @@ def monitor_wrapper(*,
     return (wrapped, args, new_kwargs)
 
 
+def get_radio(radio_mode: str, monitoring_hub_url: str, task_id: int, run_dir: str) -> MonitoringRadio:
+    radio: MonitoringRadio
+    if radio_mode == "udp":
+        radio = UDPRadio(monitoring_hub_url,
+                         source_id=task_id)
+    elif radio_mode == "htex":
+        radio = HTEXRadio(monitoring_hub_url,
+                          source_id=task_id)
+    elif radio_mode == "filesystem":
+        radio = FilesystemRadio(monitoring_url=monitoring_hub_url,
+                                source_id=task_id, run_dir=run_dir)
+    else:
+        raise RuntimeError(f"Unknown radio mode: {radio_mode}")
+    return radio
+
+
 @wrap_with_logs
 def send_first_message(try_id: int,
                        task_id: int,
@@ -122,18 +138,7 @@ def send_first_last_message(try_id: int,
     import platform
     import os
 
-    radio: MonitoringRadio
-    if radio_mode == "udp":
-        radio = UDPRadio(monitoring_hub_url,
-                         source_id=task_id)
-    elif radio_mode == "htex":
-        radio = HTEXRadio(monitoring_hub_url,
-                          source_id=task_id)
-    elif radio_mode == "filesystem":
-        radio = FilesystemRadio(monitoring_url=monitoring_hub_url,
-                                source_id=task_id, run_dir=run_dir)
-    else:
-        raise RuntimeError(f"Unknown radio mode: {radio_mode}")
+    radio = get_radio(radio_mode, monitoring_hub_url, task_id, run_dir)
 
     msg = (MessageType.RESOURCE_INFO,
            {'run_id': run_id,
@@ -178,18 +183,7 @@ def monitor(pid: int,
 
     setproctitle("parsl: task resource monitor")
 
-    radio: MonitoringRadio
-    if radio_mode == "udp":
-        radio = UDPRadio(monitoring_hub_url,
-                         source_id=task_id)
-    elif radio_mode == "htex":
-        radio = HTEXRadio(monitoring_hub_url,
-                          source_id=task_id)
-    elif radio_mode == "filesystem":
-        radio = FilesystemRadio(monitoring_url=monitoring_hub_url,
-                                source_id=task_id, run_dir=run_dir)
-    else:
-        raise RuntimeError(f"Unknown radio mode: {radio_mode}")
+    radio = get_radio(radio_mode, monitoring_hub_url, task_id, run_dir)
 
     logging.debug("start of monitor")
 
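The same if/elif chain previously appeared three times; both `send_first_last_message` and `monitor` now reduce to a single call. A sketch of the resulting pattern, with `msg` standing in for whatever message the caller built:

```python
# Radio selection is one call instead of a copied if/elif chain:
radio = get_radio(radio_mode, monitoring_hub_url, task_id, run_dir)
radio.send(msg)  # every radio class implements MonitoringRadio's send()
```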
parsl/providers/local/local.py
CHANGED
@@ -206,7 +206,7 @@ class LocalProvider(ExecutionProvider, RepresentationMixin):
         script_path = "{0}/{1}.sh".format(self.script_dir, job_name)
         script_path = os.path.abspath(script_path)
 
-        wrap_command = self.worker_init + f'\nexport JOBNAME
+        wrap_command = self.worker_init + f'\nexport JOBNAME={job_name}\n' + self.launcher(command, tasks_per_node, self.nodes_per_block)
 
         self._write_submit_script(wrap_command, script_path)
 
parsl/tests/conftest.py
CHANGED
@@ -201,7 +201,7 @@ def load_dfk_session(request, pytestconfig, tmpd_cwd_session):
         if parsl.dfk() != dfk:
             raise RuntimeError("DFK changed unexpectedly during test")
         dfk.cleanup()
-        parsl.clear()
+        assert DataFlowKernelLoader._dfk is None
     else:
         yield
 
@@ -253,7 +253,7 @@ def load_dfk_local_module(request, pytestconfig, tmpd_cwd_session):
         if parsl.dfk() != dfk:
             raise RuntimeError("DFK changed unexpectedly during test")
         dfk.cleanup()
-        parsl.clear()
+        assert DataFlowKernelLoader._dfk is None
 
     else:
         yield
parsl/tests/test_bash_apps/test_std_uri.py
CHANGED

@@ -35,8 +35,6 @@ def const_with_cpath(autopath_specifier, content_path, caplog):
     for record in caplog.records:
         assert record.levelno < logging.ERROR
 
-    parsl.clear()
-
 
 @pytest.mark.local
 def test_std_autopath_const_str(caplog, tmpd_cwd):
@@ -74,8 +72,6 @@ def test_std_autopath_fail(caplog):
     with pytest.raises(URIFailError):
         app_stdout()
 
-    parsl.clear()
-
 
 @parsl.bash_app
 def app_both(stdout=parsl.AUTO_LOGNAME, stderr=parsl.AUTO_LOGNAME):
@@ -124,5 +120,3 @@ def test_std_autopath_zip(caplog, tmpd_cwd):
 
     for record in caplog.records:
         assert record.levelno < logging.ERROR
-
-    parsl.clear()
parsl/tests/test_checkpointing/test_periodic.py
CHANGED

@@ -9,12 +9,6 @@ def local_setup():
     parsl.load(fresh_config())
 
 
-def local_teardown():
-    # explicit clear without dfk.cleanup here, because the
-    # test does that already
-    parsl.clear()
-
-
 @python_app(cache=True)
 def slow_double(x, sleep_dur=1):
     import time
@@ -39,9 +33,10 @@ def test_periodic():
     with parsl.dfk():
         futs = [slow_double(sleep_for) for _ in range(4)]
         [f.result() for f in futs]
+        run_dir = parsl.dfk().run_dir
 
     # Here we will check if the loglines came back with 5 seconds deltas
-    with open("{}/parsl.log".format(parsl.dfk().run_dir)) as f:
+    with open("{}/parsl.log".format(run_dir)) as f:
         log_lines = f.readlines()
         expected_msg = " Done checkpointing"
         expected_msg2 = " No tasks checkpointed in this pass"
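The test drives periodic checkpointing and now captures `run_dir` while the DFK is still alive, because `parsl.dfk()` is no longer available once the `with` block exits. A sketch of the pattern under a hypothetical config; the `ThreadPoolExecutor` and the 5-second period are illustrative:

```python
import parsl
from parsl.config import Config
from parsl.executors import ThreadPoolExecutor

config = Config(
    executors=[ThreadPoolExecutor()],
    checkpoint_mode="periodic",
    checkpoint_period="00:00:05",  # HH:MM:SS between checkpoint passes
)

with parsl.load(config):
    # ... submit cache=True apps and wait on their results ...
    run_dir = parsl.dfk().run_dir  # capture before cleanup at block exit

# parsl.dfk() would now raise, but the captured path remains usable:
log_path = f"{run_dir}/parsl.log"
```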
parsl/tests/test_htex/test_command_client_timeout.py
ADDED

@@ -0,0 +1,69 @@
+import pytest
+import threading
+import time
+import zmq
+from parsl import curvezmq
+from parsl.executors.high_throughput.zmq_pipes import CommandClient
+from parsl.executors.high_throughput.errors import CommandClientTimeoutError, CommandClientBadError
+
+
+# Time constant used for timeout tests: various delays and
+# timeouts will be appropriate multiples of this, but the
+# value of T itself should not matter too much as long as
+# it is big enough for zmq connections to happen successfully.
+T = 0.25
+
+
+@pytest.mark.local
+def test_command_not_sent() -> None:
+    """Tests timeout on command send.
+    """
+    ctx = curvezmq.ClientContext(None)
+
+    # RFC6335 ephemeral port range
+    cc = CommandClient(ctx, "127.0.0.1", (49152, 65535))
+
+    # cc will now wait for a connection, but we won't do anything to make the
+    # other side of the connection exist, so any command given to cc should
+    # timeout.
+
+    with pytest.raises(CommandClientTimeoutError):
+        cc.run("SOMECOMMAND", timeout_s=T)
+
+    cc.close()
+
+
+@pytest.mark.local
+def test_command_ignored() -> None:
+    """Tests timeout on command response.
+    Tests that we timeout after a response and that the command client
+    sets itself into a bad state.
+
+    This only tests sequential access to the command client, even though
+    htex makes multithreaded use of the command client: see issue #3376 about
+    that lack of thread safety.
+    """
+    ctx = curvezmq.ClientContext(None)
+
+    # RFC6335 ephemeral port range
+    cc = CommandClient(ctx, "127.0.0.1", (49152, 65535))
+
+    ic_ctx = curvezmq.ServerContext(None)
+    ic_channel = ic_ctx.socket(zmq.REP)
+    ic_channel.connect(f"tcp://127.0.0.1:{cc.port}")
+
+    with pytest.raises(CommandClientTimeoutError):
+        cc.run("SLOW_COMMAND", timeout_s=T)
+
+    req = ic_channel.recv_pyobj()
+    assert req == "SLOW_COMMAND", "Should have received command on interchange side"
+    assert not cc.ok, "CommandClient should have set itself to bad"
+
+    with pytest.raises(CommandClientBadError):
+        cc.run("ANOTHER_COMMAND")
+
+    cc.close()
+    ctx.term()
+
+    ic_channel.close()
+    ic_ctx.term()
parsl/tests/test_htex/test_cpu_affinity_explicit.py
CHANGED

@@ -37,16 +37,9 @@ def test_cpu_affinity_explicit():
     config.executors[0].max_workers_per_node = 1
 
     logger.debug(f"config: {config}")
-    # TODO: is there a `with` style for this, to properly deal with exceptions?
-
-    parsl.load(config)
-    try:
 
+    with parsl.load(config):
         worker_affinity = my_affinity().result()
         logger.debug(f"worker reported this affinity: {worker_affinity}")
         assert len(worker_affinity) == 1
         assert worker_affinity == set((single_core,))
-
-    finally:
-        parsl.dfk().cleanup()
-        parsl.clear()
parsl/tests/test_htex/test_managers_command.py
CHANGED

@@ -7,16 +7,11 @@ from parsl.app.app import python_app
 from parsl.tests.configs.htex_local import fresh_config
 
 
-def local_setup():
+def local_config():
     config = fresh_config()
     config.executors[0].poll_period = 1
     config.executors[0].max_workers_per_node = 1
-    parsl.load(config)
-
-
-def local_teardown():
-    parsl.dfk().cleanup()
-    parsl.clear()
+    return config
 
 
 @python_app
parsl/tests/test_htex/test_missing_worker.py
CHANGED

@@ -5,18 +5,12 @@ from parsl.app.app import python_app
 from parsl.tests.configs.htex_local import fresh_config
 
 
-def local_setup():
+def local_config():
     config = fresh_config()
     config.executors[0].poll_period = 1
     config.executors[0].max_workers_per_node = 1
     config.executors[0].launch_cmd = "executable_that_hopefully_does_not_exist_1030509.py"
-    parsl.load(config)
-
-
-def local_teardown():
-
-    parsl.dfk().cleanup()
-    parsl.clear()
+    return config
 
 
 @python_app
parsl/tests/test_monitoring/test_app_names.py
CHANGED

@@ -61,7 +61,6 @@ def test_app_name(get_app, expected_name, expected_result, tmpd_cwd):
     assert app().result() == expected_result
 
     parsl.dfk().cleanup()
-    parsl.clear()
 
     engine = sqlalchemy.create_engine(c.monitoring.logging_endpoint)
     with engine.begin() as connection:
parsl/tests/test_monitoring/test_incomplete_futures.py
CHANGED

@@ -52,7 +52,6 @@ def test_future_representation(tmpd_cwd):
     # seconds, with the assumption "data will arrive in the DB within
     # 30 seconds, but probably much sooner".
     parsl.dfk().cleanup()
-    parsl.clear()
 
     engine = sqlalchemy.create_engine(monitoring_url)
     with engine.begin() as connection:
|