parsl 2025.6.16__py3-none-any.whl → 2025.6.30__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53)
  1. parsl/configs/osg.py +1 -1
  2. parsl/dataflow/dflow.py +14 -4
  3. parsl/executors/base.py +19 -9
  4. parsl/executors/flux/executor.py +2 -0
  5. parsl/executors/globus_compute.py +2 -0
  6. parsl/executors/high_throughput/executor.py +22 -15
  7. parsl/executors/high_throughput/interchange.py +173 -191
  8. parsl/executors/high_throughput/mpi_executor.py +14 -4
  9. parsl/executors/high_throughput/probe.py +4 -4
  10. parsl/executors/high_throughput/process_worker_pool.py +88 -94
  11. parsl/executors/radical/executor.py +3 -0
  12. parsl/executors/taskvine/executor.py +11 -3
  13. parsl/executors/taskvine/manager.py +3 -1
  14. parsl/executors/threads.py +19 -3
  15. parsl/executors/workqueue/executor.py +11 -3
  16. parsl/monitoring/errors.py +4 -4
  17. parsl/monitoring/monitoring.py +26 -88
  18. parsl/monitoring/radios/base.py +63 -2
  19. parsl/monitoring/radios/filesystem.py +19 -4
  20. parsl/monitoring/radios/filesystem_router.py +22 -3
  21. parsl/monitoring/radios/htex.py +22 -13
  22. parsl/monitoring/radios/multiprocessing.py +22 -2
  23. parsl/monitoring/radios/udp.py +57 -19
  24. parsl/monitoring/radios/udp_router.py +119 -25
  25. parsl/monitoring/radios/zmq_router.py +9 -10
  26. parsl/monitoring/remote.py +19 -40
  27. parsl/providers/local/local.py +12 -13
  28. parsl/tests/configs/htex_local_alternate.py +0 -1
  29. parsl/tests/conftest.py +7 -4
  30. parsl/tests/test_htex/test_interchange_exit_bad_registration.py +5 -7
  31. parsl/tests/test_htex/test_zmq_binding.py +5 -6
  32. parsl/tests/test_monitoring/test_basic.py +12 -10
  33. parsl/tests/test_monitoring/{test_fuzz_zmq.py → test_htex_fuzz_zmq.py} +7 -2
  34. parsl/tests/test_monitoring/test_htex_init_blocks_vs_monitoring.py +0 -1
  35. parsl/tests/test_monitoring/test_radio_filesystem.py +48 -0
  36. parsl/tests/test_monitoring/test_radio_multiprocessing.py +44 -0
  37. parsl/tests/test_monitoring/test_radio_udp.py +204 -0
  38. parsl/tests/test_monitoring/test_stdouterr.py +1 -3
  39. parsl/tests/test_scaling/test_worker_interchange_bad_messages_3262.py +3 -7
  40. parsl/tests/test_shutdown/test_kill_monitoring.py +1 -1
  41. parsl/version.py +1 -1
  42. {parsl-2025.6.16.data → parsl-2025.6.30.data}/scripts/interchange.py +173 -191
  43. {parsl-2025.6.16.data → parsl-2025.6.30.data}/scripts/process_worker_pool.py +88 -94
  44. {parsl-2025.6.16.dist-info → parsl-2025.6.30.dist-info}/METADATA +2 -2
  45. {parsl-2025.6.16.dist-info → parsl-2025.6.30.dist-info}/RECORD +51 -50
  46. parsl/tests/configs/local_threads_monitoring.py +0 -10
  47. parsl/tests/manual_tests/test_udp_simple.py +0 -51
  48. {parsl-2025.6.16.data → parsl-2025.6.30.data}/scripts/exec_parsl_function.py +0 -0
  49. {parsl-2025.6.16.data → parsl-2025.6.30.data}/scripts/parsl_coprocess.py +0 -0
  50. {parsl-2025.6.16.dist-info → parsl-2025.6.30.dist-info}/LICENSE +0 -0
  51. {parsl-2025.6.16.dist-info → parsl-2025.6.30.dist-info}/WHEEL +0 -0
  52. {parsl-2025.6.16.dist-info → parsl-2025.6.30.dist-info}/entry_points.txt +0 -0
  53. {parsl-2025.6.16.dist-info → parsl-2025.6.30.dist-info}/top_level.txt +0 -0
@@ -1,18 +1,31 @@
1
1
  from __future__ import annotations
2
2
 
3
+ import hmac
3
4
  import logging
4
5
  import multiprocessing.queues as mpq
5
6
  import os
6
7
  import pickle
8
+ import queue
7
9
  import socket
8
10
  import time
11
+ from multiprocessing.context import SpawnProcess as SpawnProcessType
12
+ from multiprocessing.queues import Queue
9
13
  from multiprocessing.synchronize import Event
10
- from typing import Optional
14
+ from multiprocessing.synchronize import Event as EventType
15
+ from typing import Optional, Union
11
16
 
12
17
  import typeguard
13
18
 
14
19
  from parsl.log_utils import set_file_logger
20
+ from parsl.monitoring.errors import MonitoringRouterStartError
21
+ from parsl.monitoring.radios.base import MonitoringRadioReceiver
15
22
  from parsl.monitoring.radios.multiprocessing import MultiprocessingQueueRadioSender
23
+ from parsl.multiprocessing import (
24
+ SizedQueue,
25
+ SpawnEvent,
26
+ SpawnProcess,
27
+ join_terminate_close_proc,
28
+ )
16
29
  from parsl.process_loggers import wrap_with_logs
17
30
  from parsl.utils import setproctitle
18
31
 
@@ -26,9 +39,11 @@ class MonitoringRouter:
26
39
  udp_port: Optional[int] = None,
27
40
  run_dir: str = ".",
28
41
  logging_level: int = logging.INFO,
29
- atexit_timeout: int = 3, # in seconds
42
+ atexit_timeout: int, # in seconds
30
43
  resource_msgs: mpq.Queue,
31
44
  exit_event: Event,
45
+ hmac_key: bytes,
46
+ hmac_digest: str,
32
47
  ):
33
48
  """ Initializes a monitoring configuration class.
34
49
 
@@ -48,13 +63,15 @@ class MonitoringRouter:
48
63
  An event that the main Parsl process will set to signal that the monitoring router should shut down.
49
64
  """
50
65
  os.makedirs(run_dir, exist_ok=True)
51
- self.logger = set_file_logger(f"{run_dir}/monitoring_udp_router.log",
52
- name="monitoring_router",
53
- level=logging_level)
54
- self.logger.debug("Monitoring router starting")
66
+ set_file_logger(f"{run_dir}/monitoring_udp_router.log",
67
+ level=logging_level)
68
+ logger.debug("Monitoring router starting")
55
69
 
56
70
  self.atexit_timeout = atexit_timeout
57
71
 
72
+ self.hmac_key = hmac_key
73
+ self.hmac_digest = hmac_digest
74
+
58
75
  self.loop_freq = 10.0 # milliseconds
59
76
 
60
77
  # Initialize the UDP socket
@@ -73,39 +90,55 @@ class MonitoringRouter:
73
90
  except Exception as e:
74
91
  raise RuntimeError(f"Could not bind to udp_port {udp_port} because: {e}")
75
92
  self.udp_sock.settimeout(self.loop_freq / 1000)
76
- self.logger.info("Initialized the UDP socket on 0.0.0.0:{}".format(self.udp_port))
93
+ logger.info("Initialized the UDP socket on 0.0.0.0:{}".format(self.udp_port))
77
94
 
78
95
  self.target_radio = MultiprocessingQueueRadioSender(resource_msgs)
79
96
  self.exit_event = exit_event
80
97
 
81
- @wrap_with_logs(target="monitoring_router")
98
+ @wrap_with_logs
82
99
  def start(self) -> None:
83
- self.logger.info("Starting UDP listener")
100
+ logger.info("Starting UDP listener")
84
101
  try:
85
102
  while not self.exit_event.is_set():
86
103
  try:
87
- data, addr = self.udp_sock.recvfrom(2048)
88
- resource_msg = pickle.loads(data)
89
- self.logger.debug("Got UDP Message from {}: {}".format(addr, resource_msg))
90
- self.target_radio.send(resource_msg)
104
+ self.process_message()
91
105
  except socket.timeout:
92
106
  pass
93
107
 
94
- self.logger.info("UDP listener draining")
108
+ logger.info("UDP listener draining")
95
109
  last_msg_received_time = time.time()
96
110
  while time.time() - last_msg_received_time < self.atexit_timeout:
97
111
  try:
98
- data, addr = self.udp_sock.recvfrom(2048)
99
- msg = pickle.loads(data)
100
- self.logger.debug("Got UDP Message from {}: {}".format(addr, msg))
101
- self.target_radio.send(msg)
112
+ self.process_message()
102
113
  last_msg_received_time = time.time()
103
114
  except socket.timeout:
104
115
  pass
105
116
 
106
- self.logger.info("UDP listener finishing normally")
117
+ logger.info("UDP listener finishing normally")
107
118
  finally:
108
- self.logger.info("UDP listener finished")
119
+ logger.info("UDP listener finished")
120
+
121
+ def process_message(self) -> None:
122
+ hmdata, addr = self.udp_sock.recvfrom(2048)
123
+ h = hmac.HMAC(key=self.hmac_key, digestmod=self.hmac_digest)
124
+ origin_hmac = hmdata[0:h.digest_size]
125
+ h.update(hmdata[h.digest_size:])
126
+ data = hmdata[h.digest_size:]
127
+
128
+ # Check hmac before pickle load.
129
+ # If data is wrong, do not log it because it is suspect,
130
+ # but it should be safe to log the addr, at error level.
131
+
132
+ recomputed_hmac = h.digest()
133
+
134
+ if not hmac.compare_digest(origin_hmac, recomputed_hmac):
135
+ logger.error("HMAC does not match on received message")
136
+ # No exception, because this can be arbitrary network noise
137
+ # that shouldn't break the receiver.
138
+ else:
139
+ resource_msg = pickle.loads(data)
140
+ logger.debug("Got UDP Message from {}: {}".format(addr, resource_msg))
141
+ self.target_radio.send(resource_msg)
109
142
 
110
143
 
111
144
  @wrap_with_logs
@@ -116,24 +149,85 @@ def udp_router_starter(*,
116
149
  exit_event: Event,
117
150
 
118
151
  udp_port: Optional[int],
119
-
152
+ hmac_key: bytes,
120
153
  run_dir: str,
121
- logging_level: int) -> None:
154
+ logging_level: int,
155
+ atexit_timeout: int,
156
+ hmac_digest: str) -> None:
122
157
  setproctitle("parsl: monitoring UDP router")
123
158
  try:
124
159
  router = MonitoringRouter(udp_port=udp_port,
125
160
  run_dir=run_dir,
126
161
  logging_level=logging_level,
127
162
  resource_msgs=resource_msgs,
128
- exit_event=exit_event)
163
+ exit_event=exit_event,
164
+ atexit_timeout=atexit_timeout,
165
+ hmac_key=hmac_key,
166
+ hmac_digest=hmac_digest)
129
167
  except Exception as e:
130
168
  logger.error("MonitoringRouter construction failed.", exc_info=True)
131
169
  comm_q.put(f"Monitoring router construction failed: {e}")
132
170
  else:
133
171
  comm_q.put(router.udp_port)
134
172
 
135
- router.logger.info("Starting MonitoringRouter in router_starter")
173
+ logger.info("Starting MonitoringRouter in router_starter")
136
174
  try:
137
175
  router.start()
138
176
  except Exception:
139
- router.logger.exception("UDP router start exception")
177
+ logger.exception("UDP router start exception")
178
+
179
+
180
+ class UDPRadioReceiver(MonitoringRadioReceiver):
181
+ def __init__(self, *, process: SpawnProcessType, exit_event: EventType, port: int) -> None:
182
+ self.process = process
183
+ self.exit_event = exit_event
184
+ self.port = port
185
+
186
+ def shutdown(self) -> None:
187
+ self.exit_event.set()
188
+ join_terminate_close_proc(self.process)
189
+
190
+
191
+ def start_udp_receiver(*,
192
+ monitoring_messages: Queue,
193
+ port: Optional[int],
194
+ logdir: str,
195
+ debug: bool,
196
+ atexit_timeout: int,
197
+ hmac_key: bytes,
198
+ hmac_digest: str) -> UDPRadioReceiver:
199
+
200
+ udp_comm_q: Queue[Union[int, str]]
201
+ udp_comm_q = SizedQueue(maxsize=10)
202
+
203
+ router_exit_event = SpawnEvent()
204
+
205
+ router_proc = SpawnProcess(target=udp_router_starter,
206
+ kwargs={"comm_q": udp_comm_q,
207
+ "resource_msgs": monitoring_messages,
208
+ "exit_event": router_exit_event,
209
+ "udp_port": port,
210
+ "run_dir": logdir,
211
+ "logging_level": logging.DEBUG if debug else logging.INFO,
212
+ "atexit_timeout": atexit_timeout,
213
+ "hmac_key": hmac_key,
214
+ "hmac_digest": hmac_digest,
215
+ },
216
+ name="Monitoring-UDP-Router-Process",
217
+ daemon=True,
218
+ )
219
+ router_proc.start()
220
+
221
+ try:
222
+ udp_comm_q_result = udp_comm_q.get(block=True, timeout=120)
223
+ udp_comm_q.close()
224
+ udp_comm_q.join_thread()
225
+ except queue.Empty:
226
+ logger.error("Monitoring UDP router has not reported port in 120s. Aborting")
227
+ raise MonitoringRouterStartError()
228
+
229
+ if isinstance(udp_comm_q_result, str):
230
+ logger.error("MonitoringRouter sent an error message: %s", udp_comm_q_result)
231
+ raise RuntimeError(f"MonitoringRouter failed to start: {udp_comm_q_result}")
232
+
233
+ return UDPRadioReceiver(process=router_proc, exit_event=router_exit_event, port=udp_comm_q_result)
@@ -61,10 +61,9 @@ class MonitoringRouter:
61
61
  An event that the main Parsl process will set to signal that the monitoring router should shut down.
62
62
  """
63
63
  os.makedirs(run_dir, exist_ok=True)
64
- self.logger = set_file_logger(f"{run_dir}/monitoring_zmq_router.log",
65
- name="zmq_monitoring_router",
66
- level=logging_level)
67
- self.logger.debug("Monitoring router starting")
64
+ set_file_logger(f"{run_dir}/monitoring_zmq_router.log",
65
+ level=logging_level)
66
+ logger.debug("Monitoring router starting")
68
67
 
69
68
  self.address = address
70
69
 
@@ -75,7 +74,7 @@ class MonitoringRouter:
75
74
  self.zmq_receiver_channel.setsockopt(zmq.LINGER, 0)
76
75
  self.zmq_receiver_channel.set_hwm(0)
77
76
  self.zmq_receiver_channel.RCVTIMEO = int(self.loop_freq) # in milliseconds
78
- self.logger.debug("address: {}. port_range {}".format(address, port_range))
77
+ logger.debug("address: {}. port_range {}".format(address, port_range))
79
78
  self.zmq_receiver_port = self.zmq_receiver_channel.bind_to_random_port(tcp_url(address),
80
79
  min_port=port_range[0],
81
80
  max_port=port_range[1])
@@ -83,9 +82,9 @@ class MonitoringRouter:
83
82
  self.target_radio = MultiprocessingQueueRadioSender(resource_msgs)
84
83
  self.exit_event = exit_event
85
84
 
86
- @wrap_with_logs(target="zmq_monitoring_router")
85
+ @wrap_with_logs
87
86
  def start(self) -> None:
88
- self.logger.info("Starting ZMQ listener")
87
+ logger.info("Starting ZMQ listener")
89
88
  try:
90
89
  while not self.exit_event.is_set():
91
90
  try:
@@ -107,11 +106,11 @@ class MonitoringRouter:
107
106
  # channel is broken in such a way that it always raises
108
107
  # an exception? Looping on this would maybe be the wrong
109
108
  # thing to do.
110
- self.logger.warning("Failure processing a ZMQ message", exc_info=True)
109
+ logger.warning("Failure processing a ZMQ message", exc_info=True)
111
110
 
112
- self.logger.info("ZMQ listener finishing normally")
111
+ logger.info("ZMQ listener finishing normally")
113
112
  finally:
114
- self.logger.info("ZMQ listener finished")
113
+ logger.info("ZMQ listener finished")
115
114
 
116
115
 
117
116
  @wrap_with_logs
@@ -7,10 +7,7 @@ from multiprocessing import Event
7
7
  from typing import Any, Callable, Dict, List, Sequence, Tuple
8
8
 
9
9
  from parsl.monitoring.message_type import MessageType
10
- from parsl.monitoring.radios.base import MonitoringRadioSender
11
- from parsl.monitoring.radios.filesystem import FilesystemRadioSender
12
- from parsl.monitoring.radios.htex import HTEXRadioSender
13
- from parsl.monitoring.radios.udp import UDPRadioSender
10
+ from parsl.monitoring.radios.base import RadioConfig
14
11
  from parsl.multiprocessing import ForkProcess
15
12
  from parsl.process_loggers import wrap_with_logs
16
13
 
@@ -23,11 +20,10 @@ def monitor_wrapper(*,
23
20
  kwargs: Dict, # per invocation
24
21
  x_try_id: int, # per invocation
25
22
  x_task_id: int, # per invocation
26
- monitoring_hub_url: str, # per workflow
23
+ radio_config: RadioConfig, # per executor
27
24
  run_id: str, # per workflow
28
25
  logging_level: int, # per workflow
29
26
  sleep_dur: float, # per workflow
30
- radio_mode: str, # per executor
31
27
  monitor_resources: bool, # per workflow
32
28
  run_dir: str) -> Tuple[Callable, Sequence, Dict]:
33
29
  """Wrap the Parsl app with a function that will call the monitor function and point it at the correct pid when the task begins.
@@ -41,9 +37,8 @@ def monitor_wrapper(*,
41
37
  # Send first message to monitoring router
42
38
  send_first_message(try_id,
43
39
  task_id,
44
- monitoring_hub_url,
40
+ radio_config,
45
41
  run_id,
46
- radio_mode,
47
42
  run_dir)
48
43
 
49
44
  if monitor_resources and sleep_dur > 0:
@@ -52,9 +47,8 @@ def monitor_wrapper(*,
52
47
  args=(os.getpid(),
53
48
  try_id,
54
49
  task_id,
55
- monitoring_hub_url,
50
+ radio_config,
56
51
  run_id,
57
- radio_mode,
58
52
  logging_level,
59
53
  sleep_dur,
60
54
  run_dir,
@@ -87,9 +81,9 @@ def monitor_wrapper(*,
87
81
 
88
82
  send_last_message(try_id,
89
83
  task_id,
90
- monitoring_hub_url,
84
+ radio_config,
91
85
  run_id,
92
- radio_mode, run_dir)
86
+ run_dir)
93
87
 
94
88
  new_kwargs = kwargs.copy()
95
89
  new_kwargs['_parsl_monitoring_task_id'] = x_task_id
@@ -98,47 +92,33 @@ def monitor_wrapper(*,
98
92
  return (wrapped, args, new_kwargs)
99
93
 
100
94
 
101
- def get_radio(radio_mode: str, monitoring_hub_url: str, task_id: int, run_dir: str) -> MonitoringRadioSender:
102
- radio: MonitoringRadioSender
103
- if radio_mode == "udp":
104
- radio = UDPRadioSender(monitoring_hub_url)
105
- elif radio_mode == "htex":
106
- radio = HTEXRadioSender(monitoring_hub_url)
107
- elif radio_mode == "filesystem":
108
- radio = FilesystemRadioSender(monitoring_url=monitoring_hub_url,
109
- run_dir=run_dir)
110
- else:
111
- raise RuntimeError(f"Unknown radio mode: {radio_mode}")
112
- return radio
113
-
114
-
115
95
  @wrap_with_logs
116
96
  def send_first_message(try_id: int,
117
97
  task_id: int,
118
- monitoring_hub_url: str,
119
- run_id: str, radio_mode: str, run_dir: str) -> None:
120
- send_first_last_message(try_id, task_id, monitoring_hub_url, run_id,
121
- radio_mode, run_dir, False)
98
+ radio_config: RadioConfig,
99
+ run_id: str, run_dir: str) -> None:
100
+ send_first_last_message(try_id, task_id, radio_config, run_id,
101
+ run_dir, False)
122
102
 
123
103
 
124
104
  @wrap_with_logs
125
105
  def send_last_message(try_id: int,
126
106
  task_id: int,
127
- monitoring_hub_url: str,
128
- run_id: str, radio_mode: str, run_dir: str) -> None:
129
- send_first_last_message(try_id, task_id, monitoring_hub_url, run_id,
130
- radio_mode, run_dir, True)
107
+ radio_config: RadioConfig,
108
+ run_id: str, run_dir: str) -> None:
109
+ send_first_last_message(try_id, task_id, radio_config, run_id,
110
+ run_dir, True)
131
111
 
132
112
 
133
113
  def send_first_last_message(try_id: int,
134
114
  task_id: int,
135
- monitoring_hub_url: str,
136
- run_id: str, radio_mode: str, run_dir: str,
115
+ radio_config: RadioConfig,
116
+ run_id: str, run_dir: str,
137
117
  is_last: bool) -> None:
138
118
  import os
139
119
  import platform
140
120
 
141
- radio = get_radio(radio_mode, monitoring_hub_url, task_id, run_dir)
121
+ radio = radio_config.create_sender()
142
122
 
143
123
  msg = (MessageType.RESOURCE_INFO,
144
124
  {'run_id': run_id,
@@ -158,9 +138,8 @@ def send_first_last_message(try_id: int,
158
138
  def monitor(pid: int,
159
139
  try_id: int,
160
140
  task_id: int,
161
- monitoring_hub_url: str,
141
+ radio_config: RadioConfig,
162
142
  run_id: str,
163
- radio_mode: str,
164
143
  logging_level: int,
165
144
  sleep_dur: float,
166
145
  run_dir: str,
@@ -184,7 +163,7 @@ def monitor(pid: int,
184
163
 
185
164
  setproctitle("parsl: task resource monitor")
186
165
 
187
- radio = get_radio(radio_mode, monitoring_hub_url, task_id, run_dir)
166
+ radio = radio_config.create_sender()
188
167
 
189
168
  logging.debug("start of monitor")
190
169
 
@@ -114,17 +114,15 @@ class LocalProvider(ExecutionProvider, RepresentationMixin):
114
114
 
115
115
  return [self.resources[jid]['status'] for jid in job_ids]
116
116
 
117
- def _is_alive(self, job_dict):
118
- retcode, stdout, stderr = execute_wait(
119
- 'ps -p {} > /dev/null 2> /dev/null; echo "STATUS:$?" '.format(
120
- job_dict['remote_pid']), self.cmd_timeout)
121
- for line in stdout.split('\n'):
122
- if line.startswith("STATUS:"):
123
- status = line.split("STATUS:")[1].strip()
124
- if status == "0":
125
- return True
126
- else:
127
- return False
117
+ @staticmethod
118
+ def _is_alive(job_dict) -> bool:
119
+ try:
120
+ os.kill(job_dict['remote_pid'], 0)
121
+ except ProcessLookupError:
122
+ return False
123
+ except PermissionError:
124
+ pass # exists; just no permissions to send signal
125
+ return True
128
126
 
129
127
  def _job_file_path(self, script_path: str, suffix: str) -> str:
130
128
  path = '{0}{1}'.format(script_path, suffix)
@@ -230,8 +228,9 @@ class LocalProvider(ExecutionProvider, RepresentationMixin):
230
228
  stdout, stderr)
231
229
  for line in stdout.split('\n'):
232
230
  if line.startswith("PID:"):
233
- remote_pid = line.split("PID:")[1].strip()
234
- job_id = remote_pid
231
+ job_id = line.split("PID:")[1].strip()
232
+ remote_pid = int(job_id)
233
+ break
235
234
  if job_id is None:
236
235
  raise SubmitException(job_name, "Channel failed to start remote command/retrieve PID")
237
236
 
@@ -59,7 +59,6 @@ def fresh_config():
59
59
  app_cache=True, checkpoint_mode='task_exit',
60
60
  retries=2,
61
61
  monitoring=MonitoringHub(
62
- hub_address="localhost",
63
62
  monitoring_debug=False,
64
63
  resource_monitoring_interval=1,
65
64
  ),
parsl/tests/conftest.py CHANGED
@@ -406,13 +406,15 @@ def try_assert():
406
406
  timeout_ms: float = 5000,
407
407
  attempts: int = 0,
408
408
  check_period_ms: int = 20,
409
+ factor: float = 2,
409
410
  ):
410
411
  tb = create_traceback(start=1)
411
412
  check_period_s = abs(check_period_ms) / 1000.0
412
413
  if attempts > 0:
413
414
  for _attempt_no in range(attempts):
414
- time.sleep(random.random() * check_period_s) # jitter
415
- check_period_s *= 2
415
+ fraction = random.random()
416
+ time.sleep(fraction * check_period_s) # jitter
417
+ check_period_s *= factor ** fraction
416
418
  if test_func():
417
419
  return
418
420
  else:
@@ -427,9 +429,10 @@ def try_assert():
427
429
  timeout_s = abs(timeout_ms) / 1000.0
428
430
  end = time.monotonic() + timeout_s
429
431
  while time.monotonic() < end:
430
- wait_for = random.random() * check_period_s # jitter
432
+ fraction = random.random()
433
+ wait_for = fraction * check_period_s # jitter
431
434
  time.sleep(min(wait_for, end - time.monotonic()))
432
- check_period_s *= 2
435
+ check_period_s *= factor ** fraction
433
436
  if test_func():
434
437
  return
435
438
  att_fail = (
@@ -40,7 +40,7 @@ def test_exit_with_bad_registration(tmpd_cwd, try_assert):
40
40
  incoming_q.port,
41
41
  command_client.port),
42
42
  "interchange_address": "127.0.0.1",
43
- "worker_ports": None,
43
+ "worker_port": None,
44
44
  "worker_port_range": (50000, 60000),
45
45
  "hub_address": None,
46
46
  "hub_zmq_port": None,
@@ -67,7 +67,7 @@ def test_exit_with_bad_registration(tmpd_cwd, try_assert):
67
67
  # responsive. if the interchange process didn't start enough to get the command
68
68
  # thread running, this will time out.
69
69
 
70
- (task_port, result_port) = command_client.run("WORKER_PORTS", timeout_s=120)
70
+ worker_port = command_client.run("WORKER_BINDS", timeout_s=120)
71
71
 
72
72
  # now we'll assume that if the interchange command thread is responding,
73
73
  # then the worker polling code is also running and that the interchange has
@@ -80,7 +80,7 @@ def test_exit_with_bad_registration(tmpd_cwd, try_assert):
80
80
 
81
81
  msg = {'type': 'registration',
82
82
  'parsl_v': PARSL_VERSION,
83
- 'python_v': "{}.{}.{}".format(1, 1, 1), # this is the bad bit
83
+ 'python_v': "1.1.1", # this is the bad bit
84
84
  'worker_count': 1,
85
85
  'uid': 'testuid',
86
86
  'block_id': 0,
@@ -104,11 +104,9 @@ def test_exit_with_bad_registration(tmpd_cwd, try_assert):
104
104
 
105
105
  task_channel.set_hwm(0)
106
106
  task_channel.setsockopt(zmq.SNDTIMEO, channel_timeout)
107
- task_channel.connect(f"tcp://127.0.0.1:{task_port}")
107
+ task_channel.connect(f"tcp://127.0.0.1:{worker_port}")
108
108
 
109
- b_msg = json.dumps(msg).encode('utf-8')
110
-
111
- task_channel.send(b_msg)
109
+ task_channel.send(pickle.dumps(msg))
112
110
 
113
111
  # check that the interchange exits within some reasonable time
114
112
  try_assert(lambda: interchange_proc.poll() is not None, "Interchange did not exit after killing watched client process", timeout_ms=5000)
@@ -15,12 +15,12 @@ from parsl.executors.high_throughput.manager_selector import RandomManagerSelect
15
15
  def make_interchange(*,
16
16
  interchange_address: Optional[str],
17
17
  cert_dir: Optional[str],
18
- worker_ports: Optional[tuple[int, int]] = None) -> Interchange:
18
+ worker_port: Optional[int] = None) -> Interchange:
19
19
  return Interchange(interchange_address=interchange_address,
20
20
  cert_dir=cert_dir,
21
21
  client_address="127.0.0.1",
22
22
  client_ports=(50055, 50056, 50057),
23
- worker_ports=worker_ports,
23
+ worker_port=worker_port,
24
24
  worker_port_range=(54000, 55000),
25
25
  hub_address=None,
26
26
  hub_zmq_port=None,
@@ -56,7 +56,7 @@ def test_interchange_curvezmq_sockets(
56
56
  ix = make_interchange(interchange_address=address, cert_dir=cert_dir)
57
57
  assert isinstance(ix.zmq_context, curvezmq.ServerContext)
58
58
  assert ix.zmq_context.encrypted is encrypted
59
- assert mock_socket.call_count == 5
59
+ assert mock_socket.call_count == 4
60
60
 
61
61
 
62
62
  @pytest.mark.local
@@ -100,11 +100,10 @@ def test_limited_interface_binding(cert_dir: Optional[str]):
100
100
  """When address is specified the worker_port would be bound to it rather than to 0.0.0.0"""
101
101
  address = "127.0.0.1"
102
102
  ix = make_interchange(interchange_address=address, cert_dir=cert_dir)
103
- ix.worker_result_port
104
103
  proc = psutil.Process()
105
104
  conns = proc.connections(kind="tcp")
106
105
 
107
- matched_conns = [conn for conn in conns if conn.laddr.port == ix.worker_result_port]
106
+ matched_conns = [conn for conn in conns if conn.laddr.port == ix.worker_port]
108
107
  assert len(matched_conns) == 1
109
108
  # laddr.ip can return ::ffff:127.0.0.1 when using IPv6
110
109
  assert address in matched_conns[0].laddr.ip
@@ -113,5 +112,5 @@ def test_limited_interface_binding(cert_dir: Optional[str]):
113
112
  @pytest.mark.local
114
113
  @pytest.mark.parametrize("encrypted", (True, False), indirect=True)
115
114
  def test_fixed_ports(cert_dir: Optional[str]):
116
- ix = make_interchange(interchange_address=None, cert_dir=cert_dir, worker_ports=(51117, 51118))
115
+ ix = make_interchange(interchange_address=None, cert_dir=cert_dir, worker_port=51117)
117
116
  assert ix.interchange_address == "*"
@@ -8,6 +8,9 @@ from parsl import HighThroughputExecutor, ThreadPoolExecutor
8
8
  from parsl.config import Config
9
9
  from parsl.executors.status_handling import BlockProviderExecutor
10
10
  from parsl.monitoring import MonitoringHub
11
+ from parsl.monitoring.radios.filesystem import FilesystemRadio
12
+ from parsl.monitoring.radios.htex import HTEXRadio
13
+ from parsl.monitoring.radios.udp import UDPRadio
11
14
 
12
15
 
13
16
  @parsl.python_app
@@ -25,9 +28,8 @@ def this_app():
25
28
  # a configuration that is suitably configured for monitoring.
26
29
 
27
30
  def thread_config():
28
- c = Config(executors=[ThreadPoolExecutor()],
29
- monitoring=MonitoringHub(hub_address="localhost",
30
- resource_monitoring_interval=0))
31
+ c = Config(executors=[ThreadPoolExecutor(remote_monitoring_radio=UDPRadio(address="localhost", atexit_timeout=0))],
32
+ monitoring=MonitoringHub(resource_monitoring_interval=0))
31
33
  return c
32
34
 
33
35
 
@@ -42,9 +44,10 @@ def htex_udp_config():
42
44
  from parsl.tests.configs.htex_local_alternate import fresh_config
43
45
  c = fresh_config()
44
46
  assert len(c.executors) == 1
47
+ ex = c.executors[0]
45
48
 
46
- assert c.executors[0].radio_mode == "htex", "precondition: htex has a radio mode attribute, configured for htex radio"
47
- c.executors[0].radio_mode = "udp"
49
+ assert isinstance(ex.remote_monitoring_radio, HTEXRadio), "precondition: htex is configured for the HTEXRadio"
50
+ ex.remote_monitoring_radio = UDPRadio(address="localhost", atexit_timeout=0)
48
51
 
49
52
  return c
50
53
 
@@ -54,9 +57,10 @@ def htex_filesystem_config():
54
57
  from parsl.tests.configs.htex_local_alternate import fresh_config
55
58
  c = fresh_config()
56
59
  assert len(c.executors) == 1
60
+ ex = c.executors[0]
57
61
 
58
- assert c.executors[0].radio_mode == "htex", "precondition: htex has a radio mode attribute, configured for htex radio"
59
- c.executors[0].radio_mode = "filesystem"
62
+ assert isinstance(ex.remote_monitoring_radio, HTEXRadio), "precondition: htex is configured for the HTEXRadio"
63
+ ex.remote_monitoring_radio = FilesystemRadio()
60
64
 
61
65
  return c
62
66
 
@@ -65,7 +69,6 @@ def workqueue_config():
65
69
  from parsl.tests.configs.workqueue_ex import fresh_config
66
70
  c = fresh_config()
67
71
  c.monitoring = MonitoringHub(
68
- hub_address="localhost",
69
72
  resource_monitoring_interval=1)
70
73
  return c
71
74
 
@@ -76,8 +79,7 @@ def taskvine_config():
76
79
  worker_launch_method='provider')],
77
80
  strategy_period=0.5,
78
81
 
79
- monitoring=MonitoringHub(hub_address="localhost",
80
- resource_monitoring_interval=1))
82
+ monitoring=MonitoringHub(resource_monitoring_interval=1))
81
83
  return c
82
84
 
83
85
 
@@ -17,7 +17,12 @@ def this_app():
17
17
 
18
18
 
19
19
  @pytest.mark.local
20
- def test_row_counts():
20
+ def test_fuzz():
21
+ """This test sends fuzz into the ZMQ radio receiver that HTEX starts
22
+ for receiving monitoring messages from the interchange, and checks
23
+ that monitoring still records things ok.
24
+ """
25
+
21
26
  import sqlalchemy
22
27
  from sqlalchemy import text
23
28
 
@@ -44,7 +49,7 @@ def test_row_counts():
44
49
  # the latter is what i'm most suspicious of in my present investigation
45
50
 
46
51
  # dig out the interchange port...
47
- hub_address = parsl.dfk().monitoring.hub_address
52
+ hub_address = parsl.dfk().executors["htex_Local"].loopback_address
48
53
  hub_zmq_port = parsl.dfk().executors["htex_Local"].hub_zmq_port
49
54
 
50
55
  # this will send a string to a new socket connection