parsl 2024.5.13__py3-none-any.whl → 2024.5.27__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67)
  1. parsl/channels/base.py +2 -9
  2. parsl/channels/local/local.py +3 -6
  3. parsl/channels/oauth_ssh/oauth_ssh.py +2 -2
  4. parsl/channels/ssh/ssh.py +2 -2
  5. parsl/config.py +7 -1
  6. parsl/dataflow/dependency_resolvers.py +115 -0
  7. parsl/dataflow/dflow.py +45 -39
  8. parsl/executors/__init__.py +2 -0
  9. parsl/executors/base.py +7 -7
  10. parsl/executors/high_throughput/errors.py +10 -0
  11. parsl/executors/high_throughput/executor.py +85 -84
  12. parsl/executors/high_throughput/interchange.py +6 -5
  13. parsl/executors/high_throughput/mpi_executor.py +85 -0
  14. parsl/executors/high_throughput/mpi_prefix_composer.py +18 -2
  15. parsl/executors/high_throughput/mpi_resource_management.py +3 -0
  16. parsl/executors/high_throughput/zmq_pipes.py +36 -2
  17. parsl/executors/radical/rpex_resources.py +3 -7
  18. parsl/monitoring/remote.py +18 -24
  19. parsl/providers/local/local.py +1 -1
  20. parsl/tests/conftest.py +2 -2
  21. parsl/tests/sites/test_dynamic_executor.py +0 -1
  22. parsl/tests/test_bash_apps/test_std_uri.py +0 -6
  23. parsl/tests/test_checkpointing/test_periodic.py +2 -7
  24. parsl/tests/test_checkpointing/test_python_checkpoint_2.py +0 -1
  25. parsl/tests/test_checkpointing/test_python_checkpoint_3.py +0 -1
  26. parsl/tests/test_checkpointing/test_task_exit.py +0 -1
  27. parsl/tests/test_htex/test_basic.py +0 -1
  28. parsl/tests/test_htex/test_command_client_timeout.py +69 -0
  29. parsl/tests/test_htex/test_cpu_affinity_explicit.py +1 -8
  30. parsl/tests/test_htex/test_manager_failure.py +0 -1
  31. parsl/tests/test_htex/test_managers_command.py +2 -7
  32. parsl/tests/test_htex/test_missing_worker.py +2 -8
  33. parsl/tests/test_monitoring/test_app_names.py +0 -1
  34. parsl/tests/test_monitoring/test_basic.py +0 -2
  35. parsl/tests/test_monitoring/test_db_locks.py +0 -1
  36. parsl/tests/test_monitoring/test_fuzz_zmq.py +0 -1
  37. parsl/tests/test_monitoring/test_htex_init_blocks_vs_monitoring.py +0 -2
  38. parsl/tests/test_monitoring/test_incomplete_futures.py +0 -1
  39. parsl/tests/test_monitoring/test_memoization_representation.py +0 -1
  40. parsl/tests/test_monitoring/test_stdouterr.py +0 -2
  41. parsl/tests/test_mpi_apps/test_bad_mpi_config.py +6 -14
  42. parsl/tests/test_mpi_apps/test_mpi_mode_disabled.py +2 -8
  43. parsl/tests/test_mpi_apps/test_mpi_mode_enabled.py +10 -1
  44. parsl/tests/test_mpi_apps/test_mpiex.py +64 -0
  45. parsl/tests/test_mpi_apps/test_resource_spec.py +14 -9
  46. parsl/tests/test_python_apps/test_context_manager.py +1 -9
  47. parsl/tests/test_python_apps/test_lifted.py +10 -6
  48. parsl/tests/test_python_apps/test_pluggable_future_resolution.py +161 -0
  49. parsl/tests/test_scaling/test_regression_1621.py +0 -2
  50. parsl/tests/test_scaling/test_shutdown_scalein.py +0 -2
  51. parsl/tests/test_serialization/test_proxystore_configured.py +0 -1
  52. parsl/tests/test_shutdown/test_kill_monitoring.py +0 -2
  53. parsl/tests/test_staging/test_1316.py +0 -2
  54. parsl/tests/test_staging/test_elaborate_noop_file.py +0 -1
  55. parsl/tests/test_summary.py +0 -1
  56. parsl/tests/test_threads/test_configs.py +0 -1
  57. parsl/tests/test_threads/test_lazy_errors.py +0 -1
  58. parsl/version.py +1 -1
  59. {parsl-2024.5.13.dist-info → parsl-2024.5.27.dist-info}/METADATA +6 -4
  60. {parsl-2024.5.13.dist-info → parsl-2024.5.27.dist-info}/RECORD +67 -62
  61. {parsl-2024.5.13.data → parsl-2024.5.27.data}/scripts/exec_parsl_function.py +0 -0
  62. {parsl-2024.5.13.data → parsl-2024.5.27.data}/scripts/parsl_coprocess.py +0 -0
  63. {parsl-2024.5.13.data → parsl-2024.5.27.data}/scripts/process_worker_pool.py +0 -0
  64. {parsl-2024.5.13.dist-info → parsl-2024.5.27.dist-info}/LICENSE +0 -0
  65. {parsl-2024.5.13.dist-info → parsl-2024.5.27.dist-info}/WHEEL +0 -0
  66. {parsl-2024.5.13.dist-info → parsl-2024.5.27.dist-info}/entry_points.txt +0 -0
  67. {parsl-2024.5.13.dist-info → parsl-2024.5.27.dist-info}/top_level.txt +0 -0

parsl/executors/high_throughput/interchange.py CHANGED
@@ -74,7 +74,7 @@ class Interchange:
                   worker_ports: Optional[Tuple[int, int]] = None,
                   worker_port_range: Tuple[int, int] = (54000, 55000),
                   hub_address: Optional[str] = None,
-                  hub_port: Optional[int] = None,
+                  hub_zmq_port: Optional[int] = None,
                   heartbeat_threshold: int = 60,
                   logdir: str = ".",
                   logging_level: int = logging.INFO,
@@ -105,7 +105,7 @@ class Interchange:
          The IP address at which the interchange can send info about managers to when monitoring is enabled.
          Default: None (meaning monitoring disabled)

-     hub_port : str
+     hub_zmq_port : str
          The port at which the interchange can send info about managers to when monitoring is enabled.
          Default: None (meaning monitoring disabled)

@@ -151,7 +151,7 @@ class Interchange:
          logger.info("Connected to client")

          self.hub_address = hub_address
-         self.hub_port = hub_port
+         self.hub_zmq_port = hub_zmq_port

          self.pending_task_queue: queue.Queue[Any] = queue.Queue(maxsize=10 ** 6)
          self.count = 0
@@ -244,12 +244,12 @@ class Interchange:
          logger.debug(f"Fetched {task_counter} tasks so far")

      def _create_monitoring_channel(self) -> Optional[zmq.Socket]:
-         if self.hub_address and self.hub_port:
+         if self.hub_address and self.hub_zmq_port:
              logger.info("Connecting to MonitoringHub")
              # This is a one-off because monitoring is unencrypted
              hub_channel = zmq.Context().socket(zmq.DEALER)
              hub_channel.set_hwm(0)
-             hub_channel.connect("tcp://{}:{}".format(self.hub_address, self.hub_port))
+             hub_channel.connect("tcp://{}:{}".format(self.hub_address, self.hub_zmq_port))
              logger.info("Connected to MonitoringHub")
              return hub_channel
          else:
@@ -329,6 +329,7 @@ class Interchange:
              reply = None

          else:
+             logger.error(f"Received unknown command: {command_req}")
              reply = None

          logger.debug("Reply: {}".format(reply))

parsl/executors/high_throughput/mpi_executor.py ADDED
@@ -0,0 +1,85 @@
+ """A simplified interface for HTEx when running in MPI mode"""
+ from typing import Optional, Tuple, List, Union, Callable, Dict
+
+ import typeguard
+
+ from parsl.data_provider.staging import Staging
+ from parsl.executors.high_throughput.executor import HighThroughputExecutor, GENERAL_HTEX_PARAM_DOCS
+ from parsl.executors.status_handling import BlockProviderExecutor
+ from parsl.jobs.states import JobStatus
+ from parsl.providers import LocalProvider
+ from parsl.providers.base import ExecutionProvider
+
+
+ class MPIExecutor(HighThroughputExecutor):
+     __doc__ = f"""A version of :class:`~parsl.HighThroughputExecutor` tuned for executing multi-node (e.g., MPI) tasks.
+
+     The Provider _must_ use the :class:`~parsl.launchers.SimpleLauncher`,
+     which places a single pool of workers on the first node of a block.
+     Each worker can then make system calls which use an MPI launcher (e.g., ``mpirun``, ``srun``)
+     to spawn multi-node tasks.
+
+     Specify the maximum number of multi-node tasks to run at once using ``max_workers_per_block``.
+     The value should be less than or equal to the ``nodes_per_block`` in the Provider.
+
+     Parameters
+     ----------
+     max_workers_per_block: int
+         Maximum number of MPI applications to run at once per block
+
+     {GENERAL_HTEX_PARAM_DOCS}
+     """
+
+     @typeguard.typechecked
+     def __init__(self,
+                  label: str = 'MPIExecutor',
+                  provider: ExecutionProvider = LocalProvider(),
+                  launch_cmd: Optional[str] = None,
+                  address: Optional[str] = None,
+                  worker_ports: Optional[Tuple[int, int]] = None,
+                  worker_port_range: Optional[Tuple[int, int]] = (54000, 55000),
+                  interchange_port_range: Optional[Tuple[int, int]] = (55000, 56000),
+                  storage_access: Optional[List[Staging]] = None,
+                  working_dir: Optional[str] = None,
+                  worker_debug: bool = False,
+                  max_workers_per_block: int = 1,
+                  prefetch_capacity: int = 0,
+                  heartbeat_threshold: int = 120,
+                  heartbeat_period: int = 30,
+                  drain_period: Optional[int] = None,
+                  poll_period: int = 10,
+                  address_probe_timeout: Optional[int] = None,
+                  worker_logdir_root: Optional[str] = None,
+                  mpi_launcher: str = "mpiexec",
+                  block_error_handler: Union[bool, Callable[[BlockProviderExecutor, Dict[str, JobStatus]], None]] = True,
+                  encrypted: bool = False):
+         super().__init__(
+             # Hard-coded settings
+             cores_per_worker=1e-9,  # Ensures there will be at least an absurd number of workers
+             enable_mpi_mode=True,
+             max_workers_per_node=max_workers_per_block,
+
+             # Everything else
+             label=label,
+             provider=provider,
+             launch_cmd=launch_cmd,
+             address=address,
+             worker_ports=worker_ports,
+             worker_port_range=worker_port_range,
+             interchange_port_range=interchange_port_range,
+             storage_access=storage_access,
+             working_dir=working_dir,
+             worker_debug=worker_debug,
+             prefetch_capacity=prefetch_capacity,
+             heartbeat_threshold=heartbeat_threshold,
+             heartbeat_period=heartbeat_period,
+             drain_period=drain_period,
+             poll_period=poll_period,
+             address_probe_timeout=address_probe_timeout,
+             worker_logdir_root=worker_logdir_root,
+             mpi_launcher=mpi_launcher,
+             block_error_handler=block_error_handler,
+             encrypted=encrypted
+         )
+
+         self.max_workers_per_block = max_workers_per_block
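
As a reading aid (not part of the package diff), here is a minimal configuration sketch for the new MPIExecutor; the SlurmProvider and its values are illustrative, and the provider's launcher must be SimpleLauncher as the docstring above requires:

    from parsl.config import Config
    from parsl.executors.high_throughput.mpi_executor import MPIExecutor
    from parsl.launchers import SimpleLauncher
    from parsl.providers import SlurmProvider

    config = Config(
        executors=[
            MPIExecutor(
                max_workers_per_block=2,  # at most two MPI applications at once per block
                mpi_launcher="srun",      # must be one of VALID_LAUNCHERS
                provider=SlurmProvider(
                    nodes_per_block=4,          # should be >= max_workers_per_block
                    launcher=SimpleLauncher(),  # required: one worker pool per block
                ),
            )
        ]
    )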

parsl/executors/high_throughput/mpi_prefix_composer.py CHANGED
@@ -8,8 +8,18 @@ VALID_LAUNCHERS = ('srun',
                     'mpiexec')


+ class MissingResourceSpecification(Exception):
+     """Exception raised when input is not supplied a resource specification"""
+
+     def __init__(self, reason: str):
+         self.reason = reason
+
+     def __str__(self):
+         return f"Missing resource specification: {self.reason}"
+
+
  class InvalidResourceSpecification(Exception):
-     """Exception raised when Invalid keys are supplied via resource specification"""
+     """Exception raised when Invalid input is supplied via resource specification"""

      def __init__(self, invalid_keys: Set[str]):
          self.invalid_keys = invalid_keys
@@ -18,13 +28,19 @@ class InvalidResourceSpecification(Exception):
          return f"Invalid resource specification options supplied: {self.invalid_keys}"


- def validate_resource_spec(resource_spec: Dict[str, str]):
+ def validate_resource_spec(resource_spec: Dict[str, str], is_mpi_enabled: bool):
      """Basic validation of keys in the resource_spec

      Raises: InvalidResourceSpecification if the resource_spec
      is invalid (e.g, contains invalid keys)
      """
      user_keys = set(resource_spec.keys())
+
+     # empty resource_spec when mpi_mode is set causes parsl to hang
+     # ref issue #3427
+     if is_mpi_enabled and len(user_keys) == 0:
+         raise MissingResourceSpecification('MPI mode requires optional parsl_resource_specification keyword argument to be configured')
+
      legal_keys = set(("ranks_per_node",
                        "num_nodes",
                        "num_ranks",

parsl/executors/high_throughput/mpi_resource_management.py CHANGED
@@ -208,8 +208,11 @@ class MPITaskScheduler(TaskScheduler):
          """Return result and relinquish provisioned nodes"""
          result_pkl = self.pending_result_q.get(block, timeout=timeout)
          result_dict = pickle.loads(result_pkl)
+         # TODO (wardlt): If the task did not request nodes, it won't be in `self._map_tasks_to_nodes`.
+         #  Causes Parsl to hang. See Issue #3427
          if result_dict["type"] == "result":
              task_id = result_dict["task_id"]
+             assert task_id in self._map_tasks_to_nodes, "You are about to experience issue #3427"
              nodes_to_reallocate = self._map_tasks_to_nodes[task_id]
              self._return_nodes(nodes_to_reallocate)
              self._schedule_backlog_tasks()

parsl/executors/high_throughput/zmq_pipes.py CHANGED
@@ -3,8 +3,11 @@
  import zmq
  import logging
  import threading
+ import time

  from parsl import curvezmq
+ from parsl.errors import InternalConsistencyError
+ from parsl.executors.high_throughput.errors import CommandClientBadError, CommandClientTimeoutError

  logger = logging.getLogger(__name__)

@@ -31,6 +34,7 @@ class CommandClient:
          self.port = None
          self.create_socket_and_bind()
          self._lock = threading.Lock()
+         self.ok = True

      def create_socket_and_bind(self):
          """ Creates socket and binds to a port.
@@ -46,7 +50,7 @@ class CommandClient:
          else:
              self.zmq_socket.bind("tcp://{}:{}".format(self.ip_address, self.port))

-     def run(self, message, max_retries=3):
+     def run(self, message, max_retries=3, timeout_s=None):
          """ This function needs to be fast at the same time aware of the possibility of
          ZMQ pipes overflowing.

@@ -54,13 +58,43 @@ class CommandClient:
          in ZMQ sockets reaching a broken state once there are ~10k tasks in flight.
          This issue can be magnified if each the serialized buffer itself is larger.
          """
+         if not self.ok:
+             raise CommandClientBadError()
+
+         start_time_s = time.monotonic()
+
          reply = '__PARSL_ZMQ_PIPES_MAGIC__'
          with self._lock:
              for _ in range(max_retries):
                  try:
                      logger.debug("Sending command client command")
+
+                     if timeout_s is not None:
+                         remaining_time_s = start_time_s + timeout_s - time.monotonic()
+                         poll_result = self.zmq_socket.poll(timeout=remaining_time_s * 1000, flags=zmq.POLLOUT)
+                         if poll_result == zmq.POLLOUT:
+                             pass  # this is OK, so continue
+                         elif poll_result == 0:
+                             raise CommandClientTimeoutError("Waiting for command channel to be ready for a command")
+                         else:
+                             raise InternalConsistencyError(f"ZMQ poll returned unexpected value: {poll_result}")
+
                      self.zmq_socket.send_pyobj(message, copy=True)
-                     logger.debug("Waiting for command client response")
+
+                     if timeout_s is not None:
+                         logger.debug("Polling for command client response or timeout")
+                         remaining_time_s = start_time_s + timeout_s - time.monotonic()
+                         poll_result = self.zmq_socket.poll(timeout=remaining_time_s * 1000, flags=zmq.POLLIN)
+                         if poll_result == zmq.POLLIN:
+                             pass  # this is OK, so continue
+                         elif poll_result == 0:
+                             logger.error("Command timed-out - command client is now bad forever")
+                             self.ok = False
+                             raise CommandClientTimeoutError("Waiting for a reply from command channel")
+                         else:
+                             raise InternalConsistencyError(f"ZMQ poll returned unexpected value: {poll_result}")
+
+                     logger.debug("Receiving command client response")
                      reply = self.zmq_socket.recv_pyobj()
                      logger.debug("Received command client response")
                  except zmq.ZMQError:
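
A standalone sketch (not part of the package diff) of the deadline pattern run() now uses: a single timeout_s budget is shared by the send-side POLLOUT poll and the receive-side POLLIN poll, so each poll only gets the time remaining since start_time_s. The helper name here is hypothetical:

    import time
    import zmq

    def poll_or_fail(socket: zmq.Socket, start_time_s: float,
                     timeout_s: float, flags: int) -> None:
        # zmq's poll() takes milliseconds and returns 0 if nothing became ready
        remaining_time_s = start_time_s + timeout_s - time.monotonic()
        if socket.poll(timeout=remaining_time_s * 1000, flags=flags) == 0:
            raise TimeoutError("socket not ready before the shared deadline")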

parsl/executors/radical/rpex_resources.py CHANGED
@@ -3,15 +3,11 @@ import json

  from typing import List

- _setup_paths: List[str]
+ _setup_paths: List[str] = []
  try:
      import radical.pilot as rp
-     import radical.utils as ru
  except ImportError:
-     _setup_paths = []
- else:
-     _setup_paths = [rp.sdist_path,
-                     ru.sdist_path]
+     pass


  MPI = "mpi"
@@ -77,7 +73,7 @@ class ResourceConfig:

      pilot_env_setup : list
          List of setup commands/packages for the pilot environment.
-         Default setup includes "parsl", rp.sdist_path, and ru.sdist_path.
+         Default is an empty list.

      python_v : str
          The Python version to be used in the pilot environment.

parsl/monitoring/remote.py CHANGED
@@ -96,6 +96,22 @@ def monitor_wrapper(*,
      return (wrapped, args, new_kwargs)


+ def get_radio(radio_mode: str, monitoring_hub_url: str, task_id: int, run_dir: str) -> MonitoringRadio:
+     radio: MonitoringRadio
+     if radio_mode == "udp":
+         radio = UDPRadio(monitoring_hub_url,
+                          source_id=task_id)
+     elif radio_mode == "htex":
+         radio = HTEXRadio(monitoring_hub_url,
+                           source_id=task_id)
+     elif radio_mode == "filesystem":
+         radio = FilesystemRadio(monitoring_url=monitoring_hub_url,
+                                 source_id=task_id, run_dir=run_dir)
+     else:
+         raise RuntimeError(f"Unknown radio mode: {radio_mode}")
+     return radio
+
+
  @wrap_with_logs
  def send_first_message(try_id: int,
                         task_id: int,
@@ -122,18 +138,7 @@ def send_first_last_message(try_id: int,
      import platform
      import os

-     radio: MonitoringRadio
-     if radio_mode == "udp":
-         radio = UDPRadio(monitoring_hub_url,
-                          source_id=task_id)
-     elif radio_mode == "htex":
-         radio = HTEXRadio(monitoring_hub_url,
-                           source_id=task_id)
-     elif radio_mode == "filesystem":
-         radio = FilesystemRadio(monitoring_url=monitoring_hub_url,
-                                 source_id=task_id, run_dir=run_dir)
-     else:
-         raise RuntimeError(f"Unknown radio mode: {radio_mode}")
+     radio = get_radio(radio_mode, monitoring_hub_url, task_id, run_dir)

      msg = (MessageType.RESOURCE_INFO,
             {'run_id': run_id,
@@ -178,18 +183,7 @@ def monitor(pid: int,

      setproctitle("parsl: task resource monitor")

-     radio: MonitoringRadio
-     if radio_mode == "udp":
-         radio = UDPRadio(monitoring_hub_url,
-                          source_id=task_id)
-     elif radio_mode == "htex":
-         radio = HTEXRadio(monitoring_hub_url,
-                           source_id=task_id)
-     elif radio_mode == "filesystem":
-         radio = FilesystemRadio(monitoring_url=monitoring_hub_url,
-                                 source_id=task_id, run_dir=run_dir)
-     else:
-         raise RuntimeError(f"Unknown radio mode: {radio_mode}")
+     radio = get_radio(radio_mode, monitoring_hub_url, task_id, run_dir)

      logging.debug("start of monitor")

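
For context (not part of the package diff): the three duplicated radio-selection blocks in send_first_last_message and monitor collapse into the new module-level helper, so each call site reduces to a single line. A hedged usage sketch with illustrative values:

    # get_radio is internal to parsl.monitoring.remote
    radio = get_radio("udp", monitoring_hub_url="udp://127.0.0.1:55055",
                      task_id=0, run_dir="runinfo/000")
    # returns a MonitoringRadio; an unknown mode still raises RuntimeError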

parsl/providers/local/local.py CHANGED
@@ -206,7 +206,7 @@ class LocalProvider(ExecutionProvider, RepresentationMixin):
          script_path = "{0}/{1}.sh".format(self.script_dir, job_name)
          script_path = os.path.abspath(script_path)

-         wrap_command = self.worker_init + f'\nexport JOBNAME=${job_name}\n' + self.launcher(command, tasks_per_node, self.nodes_per_block)
+         wrap_command = self.worker_init + f'\nexport JOBNAME={job_name}\n' + self.launcher(command, tasks_per_node, self.nodes_per_block)

          self._write_submit_script(wrap_command, script_path)

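
The one-character fix above removes a stray "$" inside an f-string; a quick illustration (not part of the package diff) of what each version of the line generates:

    job_name = "parsl.block-0"
    print(f'export JOBNAME=${job_name}')  # before: export JOBNAME=$parsl.block-0 (a shell variable reference)
    print(f'export JOBNAME={job_name}')   # after:  export JOBNAME=parsl.block-0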
parsl/tests/conftest.py CHANGED
@@ -201,7 +201,7 @@ def load_dfk_session(request, pytestconfig, tmpd_cwd_session):
          if parsl.dfk() != dfk:
              raise RuntimeError("DFK changed unexpectedly during test")
          dfk.cleanup()
-         parsl.clear()
+         assert DataFlowKernelLoader._dfk is None
      else:
          yield

@@ -253,7 +253,7 @@ def load_dfk_local_module(request, pytestconfig, tmpd_cwd_session):
          if parsl.dfk() != dfk:
              raise RuntimeError("DFK changed unexpectedly during test")
          dfk.cleanup()
-         parsl.clear()
+         assert DataFlowKernelLoader._dfk is None

      else:
          yield
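
These asserts reflect a behavior change elsewhere in this release: DataFlowKernel.cleanup() now de-registers the loaded kernel itself, which is why explicit parsl.clear() calls are removed throughout the test suite. A hedged sketch of the invariant (the config import is illustrative):

    import parsl
    from parsl.dataflow.dflow import DataFlowKernelLoader
    from parsl.tests.configs.local_threads import fresh_config

    dfk = parsl.load(fresh_config())
    dfk.cleanup()
    assert DataFlowKernelLoader._dfk is None  # cleanup cleared the loaded DFK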

parsl/tests/sites/test_dynamic_executor.py CHANGED
@@ -75,4 +75,3 @@ def test_dynamic_executor():
      print("Done testing")

      dfk.cleanup()
-     parsl.clear()

parsl/tests/test_bash_apps/test_std_uri.py CHANGED
@@ -35,8 +35,6 @@ def const_with_cpath(autopath_specifier, content_path, caplog):
      for record in caplog.records:
          assert record.levelno < logging.ERROR

-     parsl.clear()
-

  @pytest.mark.local
  def test_std_autopath_const_str(caplog, tmpd_cwd):
@@ -74,8 +72,6 @@ def test_std_autopath_fail(caplog):
      with pytest.raises(URIFailError):
          app_stdout()

-     parsl.clear()
-

  @parsl.bash_app
  def app_both(stdout=parsl.AUTO_LOGNAME, stderr=parsl.AUTO_LOGNAME):
@@ -124,5 +120,3 @@ def test_std_autopath_zip(caplog, tmpd_cwd):

      for record in caplog.records:
          assert record.levelno < logging.ERROR
-
-     parsl.clear()

parsl/tests/test_checkpointing/test_periodic.py CHANGED
@@ -9,12 +9,6 @@ def local_setup():
      parsl.load(fresh_config())


- def local_teardown():
-     # explicit clear without dfk.cleanup here, because the
-     # test does that already
-     parsl.clear()
-
-
  @python_app(cache=True)
  def slow_double(x, sleep_dur=1):
      import time
@@ -39,9 +33,10 @@ def test_periodic():
      with parsl.dfk():
          futs = [slow_double(sleep_for) for _ in range(4)]
          [f.result() for f in futs]
+         run_dir = parsl.dfk().run_dir

      # Here we will check if the loglines came back with 5 seconds deltas
-     with open("{}/parsl.log".format(parsl.dfk().run_dir)) as f:
+     with open("{}/parsl.log".format(run_dir)) as f:
          log_lines = f.readlines()
      expected_msg = " Done checkpointing"
      expected_msg2 = " No tasks checkpointed in this pass"

parsl/tests/test_checkpointing/test_python_checkpoint_2.py CHANGED
@@ -19,7 +19,6 @@ def parsl_configured(run_dir, **kw):
      yield dfk

      parsl.dfk().cleanup()
-     parsl.clear()


  @python_app(cache=True)

parsl/tests/test_checkpointing/test_python_checkpoint_3.py CHANGED
@@ -14,7 +14,6 @@ def local_setup():

  def local_teardown():
      parsl.dfk().cleanup()
-     parsl.clear()


  @python_app

parsl/tests/test_checkpointing/test_task_exit.py CHANGED
@@ -16,7 +16,6 @@ def local_setup():

  def local_teardown():
      parsl.dfk().cleanup()
-     parsl.clear()


  @python_app(cache=True)

parsl/tests/test_htex/test_basic.py CHANGED
@@ -14,7 +14,6 @@ def local_setup():

  def local_teardown():
      parsl.dfk().cleanup()
-     parsl.clear()


  @python_app

parsl/tests/test_htex/test_command_client_timeout.py ADDED
@@ -0,0 +1,69 @@
+ import pytest
+ import threading
+ import time
+ import zmq
+ from parsl import curvezmq
+ from parsl.executors.high_throughput.zmq_pipes import CommandClient
+ from parsl.executors.high_throughput.errors import CommandClientTimeoutError, CommandClientBadError
+
+
+ # Time constant used for timeout tests: various delays and
+ # timeouts will be appropriate multiples of this, but the
+ # value of T itself should not matter too much as long as
+ # it is big enough for zmq connections to happen successfully.
+ T = 0.25
+
+
+ @pytest.mark.local
+ def test_command_not_sent() -> None:
+     """Tests timeout on command send.
+     """
+     ctx = curvezmq.ClientContext(None)
+
+     # RFC6335 ephemeral port range
+     cc = CommandClient(ctx, "127.0.0.1", (49152, 65535))
+
+     # cc will now wait for a connection, but we won't do anything to make the
+     # other side of the connection exist, so any command given to cc should
+     # timeout.
+
+     with pytest.raises(CommandClientTimeoutError):
+         cc.run("SOMECOMMAND", timeout_s=T)
+
+     cc.close()
+
+
+ @pytest.mark.local
+ def test_command_ignored() -> None:
+     """Tests timeout on command response.
+     Tests that we timeout after a response and that the command client
+     sets itself into a bad state.
+
+     This only tests sequential access to the command client, even though
+     htex makes multithreaded use of the command client: see issue #3376 about
+     that lack of thread safety.
+     """
+     ctx = curvezmq.ClientContext(None)
+
+     # RFC6335 ephemeral port range
+     cc = CommandClient(ctx, "127.0.0.1", (49152, 65535))
+
+     ic_ctx = curvezmq.ServerContext(None)
+     ic_channel = ic_ctx.socket(zmq.REP)
+     ic_channel.connect(f"tcp://127.0.0.1:{cc.port}")
+
+     with pytest.raises(CommandClientTimeoutError):
+         cc.run("SLOW_COMMAND", timeout_s=T)
+
+     req = ic_channel.recv_pyobj()
+     assert req == "SLOW_COMMAND", "Should have received command on interchange side"
+     assert not cc.ok, "CommandClient should have set itself to bad"
+
+     with pytest.raises(CommandClientBadError):
+         cc.run("ANOTHER_COMMAND")
+
+     cc.close()
+     ctx.term()
+
+     ic_channel.close()
+     ic_ctx.term()

parsl/tests/test_htex/test_cpu_affinity_explicit.py CHANGED
@@ -37,16 +37,9 @@ def test_cpu_affinity_explicit():
      config.executors[0].max_workers_per_node = 1

      logger.debug(f"config: {config}")
-     # TODO: is there a `with` style for this, to properly deal with exceptions?
-
-     parsl.load(config)
-     try:

+     with parsl.load(config):
          worker_affinity = my_affinity().result()
          logger.debug(f"worker reported this affinity: {worker_affinity}")
          assert len(worker_affinity) == 1
          assert worker_affinity == set((single_core,))
-
-     finally:
-         parsl.dfk().cleanup()
-         parsl.clear()
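
The rewritten test relies on the DFK returned by parsl.load() acting as a context manager, so cleanup runs on exit even if an assertion fails; a minimal sketch (not part of the package diff, config import illustrative):

    import parsl
    from parsl.tests.configs.htex_local import fresh_config

    with parsl.load(fresh_config()):
        ...  # run apps here; cleanup happens on exit, no explicit clear() needed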

parsl/tests/test_htex/test_manager_failure.py CHANGED
@@ -20,7 +20,6 @@ def load_config():
      yield

      parsl.dfk().cleanup()
-     parsl.clear()


  @python_app

parsl/tests/test_htex/test_managers_command.py CHANGED
@@ -7,16 +7,11 @@ from parsl.app.app import python_app
  from parsl.tests.configs.htex_local import fresh_config


- def local_setup():
+ def local_config():
      config = fresh_config()
      config.executors[0].poll_period = 1
      config.executors[0].max_workers_per_node = 1
-     parsl.load(config)
-
-
- def local_teardown():
-     parsl.dfk().cleanup()
-     parsl.clear()
+     return config


  @python_app

parsl/tests/test_htex/test_missing_worker.py CHANGED
@@ -5,18 +5,12 @@ from parsl.app.app import python_app
  from parsl.tests.configs.htex_local import fresh_config


- def local_setup():
+ def local_config():
      config = fresh_config()
      config.executors[0].poll_period = 1
      config.executors[0].max_workers_per_node = 1
      config.executors[0].launch_cmd = "executable_that_hopefully_does_not_exist_1030509.py"
-     parsl.load(config)
-
-
- def local_teardown():
-
-     parsl.dfk().cleanup()
-     parsl.clear()
+     return config


  @python_app

parsl/tests/test_monitoring/test_app_names.py CHANGED
@@ -61,7 +61,6 @@ def test_app_name(get_app, expected_name, expected_result, tmpd_cwd):
      assert app().result() == expected_result

      parsl.dfk().cleanup()
-     parsl.clear()

      engine = sqlalchemy.create_engine(c.monitoring.logging_endpoint)
      with engine.begin() as connection:

parsl/tests/test_monitoring/test_basic.py CHANGED
@@ -66,8 +66,6 @@ def test_row_counts(tmpd_cwd, fresh_config):
      with parsl.load(config):
          assert this_app().result() == 5

-     parsl.clear()
-
      # at this point, we should find one row in the monitoring database.

      engine = sqlalchemy.create_engine(db_url)

parsl/tests/test_monitoring/test_db_locks.py CHANGED
@@ -63,7 +63,6 @@ def test_row_counts():

      logger.info("cleaning up parsl")
      parsl.dfk().cleanup()
-     parsl.clear()

      # at this point, we should find data consistent with executing one
      # task in the database.

parsl/tests/test_monitoring/test_fuzz_zmq.py CHANGED
@@ -83,7 +83,6 @@ def test_row_counts():

      logger.info("cleaning up parsl")
      parsl.dfk().cleanup()
-     parsl.clear()

      # at this point, we should find one row in the monitoring database.


parsl/tests/test_monitoring/test_htex_init_blocks_vs_monitoring.py CHANGED
@@ -65,8 +65,6 @@ def test_row_counts(tmpd_cwd, strategy):

      this_app().result()

-     parsl.clear()
-
      engine = sqlalchemy.create_engine(db_url)
      with engine.begin() as connection:


parsl/tests/test_monitoring/test_incomplete_futures.py CHANGED
@@ -52,7 +52,6 @@ def test_future_representation(tmpd_cwd):
      # seconds, with the assumption "data will arrive in the DB within
      # 30 seconds, but probably much sooner".
      parsl.dfk().cleanup()
-     parsl.clear()

      engine = sqlalchemy.create_engine(monitoring_url)
      with engine.begin() as connection: