parsl 2024.3.11__py3-none-any.whl → 2024.3.25__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. parsl/addresses.py +3 -1
  2. parsl/config.py +4 -0
  3. parsl/dataflow/dflow.py +14 -5
  4. parsl/executors/base.py +10 -0
  5. parsl/executors/high_throughput/executor.py +12 -0
  6. parsl/executors/high_throughput/interchange.py +30 -8
  7. parsl/executors/high_throughput/manager_record.py +1 -0
  8. parsl/executors/high_throughput/process_worker_pool.py +41 -5
  9. parsl/executors/status_handling.py +2 -9
  10. parsl/executors/taskvine/executor.py +24 -3
  11. parsl/executors/taskvine/manager.py +1 -0
  12. parsl/executors/taskvine/manager_config.py +3 -4
  13. parsl/executors/workqueue/executor.py +19 -0
  14. parsl/jobs/error_handlers.py +1 -1
  15. parsl/jobs/job_status_poller.py +8 -7
  16. parsl/launchers/launchers.py +6 -6
  17. parsl/log_utils.py +8 -4
  18. parsl/monitoring/db_manager.py +4 -2
  19. parsl/monitoring/monitoring.py +30 -264
  20. parsl/monitoring/router.py +208 -0
  21. parsl/monitoring/visualization/plots/default/workflow_plots.py +3 -0
  22. parsl/monitoring/visualization/views.py +2 -1
  23. parsl/providers/cluster_provider.py +1 -3
  24. parsl/tests/configs/user_opts.py +2 -1
  25. parsl/tests/test_htex/test_drain.py +78 -0
  26. parsl/tests/test_monitoring/test_app_names.py +86 -0
  27. parsl/tests/test_monitoring/test_fuzz_zmq.py +2 -2
  28. parsl/tests/test_monitoring/test_htex_init_blocks_vs_monitoring.py +82 -0
  29. parsl/tests/test_python_apps/test_context_manager.py +40 -0
  30. parsl/tests/test_scaling/test_scale_down_htex_auto_scale.py +1 -10
  31. parsl/tests/test_shutdown/__init__.py +0 -0
  32. parsl/tests/test_shutdown/test_kill_monitoring.py +65 -0
  33. parsl/utils.py +2 -2
  34. parsl/version.py +1 -1
  35. {parsl-2024.3.11.data → parsl-2024.3.25.data}/scripts/process_worker_pool.py +41 -5
  36. {parsl-2024.3.11.dist-info → parsl-2024.3.25.dist-info}/METADATA +4 -4
  37. {parsl-2024.3.11.dist-info → parsl-2024.3.25.dist-info}/RECORD +43 -36
  38. {parsl-2024.3.11.data → parsl-2024.3.25.data}/scripts/exec_parsl_function.py +0 -0
  39. {parsl-2024.3.11.data → parsl-2024.3.25.data}/scripts/parsl_coprocess.py +0 -0
  40. {parsl-2024.3.11.dist-info → parsl-2024.3.25.dist-info}/LICENSE +0 -0
  41. {parsl-2024.3.11.dist-info → parsl-2024.3.25.dist-info}/WHEEL +0 -0
  42. {parsl-2024.3.11.dist-info → parsl-2024.3.25.dist-info}/entry_points.txt +0 -0
  43. {parsl-2024.3.11.dist-info → parsl-2024.3.25.dist-info}/top_level.txt +0 -0
parsl/jobs/job_status_poller.py CHANGED
@@ -2,7 +2,7 @@ import logging
  import parsl
  import time
  import zmq
- from typing import Dict, List, Sequence, Optional
+ from typing import Dict, List, Sequence, Optional, Union

  from parsl.jobs.states import JobStatus, JobState
  from parsl.jobs.strategy import Strategy
@@ -29,7 +29,7 @@ class PollItem:
  if self._dfk and self._dfk.monitoring is not None:
  self.monitoring_enabled = True
  hub_address = self._dfk.hub_address
- hub_port = self._dfk.hub_interchange_port
+ hub_port = self._dfk.hub_zmq_port
  context = zmq.Context()
  self.hub_channel = context.socket(zmq.DEALER)
  self.hub_channel.set_hwm(0)
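For orientation, the channel this rename touches is the per-executor monitoring socket: PollItem now reads the port from the DFK's renamed hub_zmq_port attribute (formerly hub_interchange_port) and opens a ZMQ DEALER socket to it. A minimal standalone sketch; the connect call and the literal address and port are illustrative assumptions, not lines from this diff:

    import zmq

    # Hypothetical stand-ins for self._dfk.hub_address / self._dfk.hub_zmq_port
    hub_address, hub_port = "127.0.0.1", 55055

    context = zmq.Context()
    hub_channel = context.socket(zmq.DEALER)
    hub_channel.set_hwm(0)  # unbounded high-water mark, as in PollItem
    hub_channel.connect("tcp://{}:{}".format(hub_address, hub_port))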
@@ -72,7 +72,7 @@ class PollItem:
  def executor(self) -> BlockProviderExecutor:
  return self._executor

- def scale_in(self, n, max_idletime=None):
+ def scale_in(self, n: int, max_idletime: Optional[float] = None) -> List[str]:

  if max_idletime is None:
  block_ids = self._executor.scale_in(n)
@@ -82,7 +82,7 @@ class PollItem:
  # scale_in method really does come from HighThroughputExecutor,
  # and so does have an extra max_idletime parameter not present
  # in the executor interface.
- block_ids = self._executor.scale_in(n, max_idletime=max_idletime)
+ block_ids = self._executor.scale_in(n, max_idletime=max_idletime) # type: ignore[call-arg]
  if block_ids is not None:
  new_status = {}
  for block_id in block_ids:
@@ -91,7 +91,7 @@ class PollItem:
  self.send_monitoring_info(new_status)
  return block_ids

- def scale_out(self, n):
+ def scale_out(self, n: int) -> List[str]:
  block_ids = self._executor.scale_out(n)
  if block_ids is not None:
  new_status = {}
@@ -106,13 +106,14 @@


  class JobStatusPoller(Timer):
- def __init__(self, strategy: Optional[str] = None, max_idletime: float = 0.0,
+ def __init__(self, *, strategy: Optional[str], max_idletime: float,
+ strategy_period: Union[float, int],
  dfk: Optional["parsl.dataflow.dflow.DataFlowKernel"] = None) -> None:
  self._poll_items = [] # type: List[PollItem]
  self.dfk = dfk
  self._strategy = Strategy(strategy=strategy,
  max_idletime=max_idletime)
- super().__init__(self.poll, interval=5, name="JobStatusPoller")
+ super().__init__(self.poll, interval=strategy_period, name="JobStatusPoller")

  def poll(self) -> None:
  self._update_state()
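The net effect of these hunks: JobStatusPoller's constructor arguments become keyword-only, and the polling interval, previously hard-coded to 5 seconds, is now supplied as strategy_period. A hedged sketch of how this surfaces to users, assuming strategy_period is the new Config parameter added in parsl/config.py (+4 above):

    import parsl
    from parsl.config import Config

    # strategy_period feeds JobStatusPoller's Timer interval;
    # 5.0 reproduces the old hard-coded behaviour.
    config = Config(strategy='simple',
                    strategy_period=5.0,
                    max_idletime=120.0)

    dfk = parsl.load(config)
    # ... run apps ...
    dfk.cleanup()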
parsl/launchers/launchers.py CHANGED
@@ -8,16 +8,16 @@ logger = logging.getLogger(__name__)
  class SimpleLauncher(Launcher):
  """ Does no wrapping. Just returns the command as-is
  """
- def __init_(self, debug: bool = True) -> None:
+ def __init__(self, debug: bool = True) -> None:
  super().__init__(debug=debug)

  def __call__(self, command: str, tasks_per_node: int, nodes_per_block: int) -> str:
- """
- Args:
- - command (string): The command string to be launched
- - task_block (string) : bash evaluated string.

- """
+ if nodes_per_block > 1:
+ logger.warning('Simple Launcher only supports single node per block. '
+ f'Requested nodes: {nodes_per_block}. '
+ 'You may be getting fewer workers than expected')
+
  return command

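Nothing changes for single-node blocks; the corrected __init__ name (previously the misspelled __init_ was never called) and the new warning only surface when a provider is configured with nodes_per_block > 1. A small usage sketch (the command string is a placeholder):

    import logging
    from parsl.launchers import SimpleLauncher

    logging.basicConfig(level=logging.WARNING)

    launcher = SimpleLauncher()
    # SimpleLauncher does no wrapping, so a 4-node block still launches the
    # command once; the warning makes that visible instead of silent.
    cmd = launcher("process_worker_pool.py", tasks_per_node=1, nodes_per_block=4)
    print(cmd)  # returned unchanged, with the warning logged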
parsl/log_utils.py CHANGED
@@ -28,7 +28,7 @@ DEFAULT_FORMAT = (
  def set_stream_logger(name: str = 'parsl',
  level: int = logging.DEBUG,
  format_string: Optional[str] = None,
- stream: Optional[io.TextIOWrapper] = None) -> None:
+ stream: Optional[io.TextIOWrapper] = None) -> logging.Logger:
  """Add a stream log handler.

  Args:
@@ -39,7 +39,7 @@ def set_stream_logger(name: str = 'parsl',
  If not specified, the default stream for logging.StreamHandler is used.

  Returns:
- - None
+ - logger for specified name
  """
  if format_string is None:
  # format_string = "%(asctime)s %(name)s [%(levelname)s] Thread:%(thread)d %(message)s"
@@ -59,12 +59,14 @@ def set_stream_logger(name: str = 'parsl',
  futures_logger = logging.getLogger("concurrent.futures")
  futures_logger.addHandler(handler)

+ return logger
+

  @typeguard.typechecked
  def set_file_logger(filename: str,
  name: str = 'parsl',
  level: int = logging.DEBUG,
- format_string: Optional[str] = None) -> None:
+ format_string: Optional[str] = None) -> logging.Logger:
  """Add a file log handler.

  Args:
@@ -74,7 +76,7 @@ def set_file_logger(filename: str,
  - format_string (string): Set the format string

  Returns:
- - None
+ - logger for specified name
  """
  if format_string is None:
  format_string = DEFAULT_FORMAT
@@ -91,3 +93,5 @@ def set_file_logger(filename: str,
  # concurrent.futures
  futures_logger = logging.getLogger("concurrent.futures")
  futures_logger.addHandler(handler)
+
+ return logger
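Both helpers previously returned None, so callers had to look the logger up again with logging.getLogger. A sketch of the new calling pattern:

    import logging
    from parsl.log_utils import set_stream_logger, set_file_logger

    # The configured logger is now returned directly.
    stream_logger = set_stream_logger(name='parsl', level=logging.INFO)
    stream_logger.info("stream handler attached")

    file_logger = set_file_logger('parsl.log', name='parsl', level=logging.DEBUG)
    file_logger.debug("file handler attached")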
parsl/monitoring/db_manager.py CHANGED
@@ -633,7 +633,8 @@ class DatabaseManager:
  # if retried - for example, the database being locked because someone else is readying
  # the tables we are trying to write to. If that assumption is wrong, then this loop
  # may go on forever.
- logger.warning("Got a database OperationalError. Ignoring and retrying on the assumption that it is recoverable: {}".format(e))
+ logger.warning("Got a database OperationalError. "
+ "Ignoring and retrying on the assumption that it is recoverable: {}".format(e))
  self.db.rollback()
  time.sleep(1) # hard coded 1s wait - this should be configurable or exponential backoff or something
@@ -660,7 +661,8 @@
  done = True
  except sa.exc.OperationalError as e:
  # hoping that this is a database locked error during _update, not some other problem
- logger.warning("Got a database OperationalError. Ignoring and retrying on the assumption that it is recoverable: {}".format(e))
+ logger.warning("Got a database OperationalError. "
+ "Ignoring and retrying on the assumption that it is recoverable: {}".format(e))
  self.db.rollback()
  time.sleep(1) # hard coded 1s wait - this should be configurable or exponential backoff or something
  except KeyboardInterrupt:
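For context, these reflowed warnings sit inside a retry-on-OperationalError loop with a fixed one-second backoff. A standalone sketch of that pattern, with a hypothetical commit_with_retry helper and session object standing in for DatabaseManager's internals:

    import time
    import logging
    import sqlalchemy as sa

    logger = logging.getLogger(__name__)

    def commit_with_retry(session: "sa.orm.Session") -> None:
        """Retry a commit on OperationalError (e.g. sqlite's 'database is
        locked'), assuming the error is transient and recoverable."""
        done = False
        while not done:
            try:
                session.commit()
                done = True
            except sa.exc.OperationalError as e:
                logger.warning("Got a database OperationalError. "
                               "Ignoring and retrying on the assumption that it is recoverable: {}".format(e))
                session.rollback()
                time.sleep(1)  # hard coded 1s wait, mirroring the code above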
parsl/monitoring/monitoring.py CHANGED
@@ -1,9 +1,7 @@
  from __future__ import annotations

  import os
- import socket
  import time
- import pickle
  import logging
  import typeguard
  import zmq
@@ -15,14 +13,16 @@ import parsl.monitoring.remote
  from parsl.multiprocessing import ForkProcess, SizedQueue
  from multiprocessing import Process
  from multiprocessing.queues import Queue
+ from parsl.log_utils import set_file_logger
  from parsl.utils import RepresentationMixin
  from parsl.process_loggers import wrap_with_logs
  from parsl.utils import setproctitle

  from parsl.serialize import deserialize

+ from parsl.monitoring.router import router_starter
  from parsl.monitoring.message_type import MessageType
- from parsl.monitoring.types import AddressedMonitoringMessage, TaggedMonitoringMessage
+ from parsl.monitoring.types import AddressedMonitoringMessage
  from typing import cast, Any, Callable, Dict, Optional, Sequence, Tuple, Union, TYPE_CHECKING

  _db_manager_excepts: Optional[Exception]
@@ -38,40 +38,6 @@ else:
  logger = logging.getLogger(__name__)


- def start_file_logger(filename: str, name: str = 'monitoring', level: int = logging.DEBUG, format_string: Optional[str] = None) -> logging.Logger:
- """Add a stream log handler.
-
- Parameters
- ---------
-
- filename: string
- Name of the file to write logs to. Required.
- name: string
- Logger name.
- level: logging.LEVEL
- Set the logging level. Default=logging.DEBUG
- - format_string (string): Set the format string
- format_string: string
- Format string to use.
-
- Returns
- -------
- None.
- """
- if format_string is None:
- format_string = "%(asctime)s.%(msecs)03d %(name)s:%(lineno)d [%(levelname)s] %(message)s"
-
- logger = logging.getLogger(name)
- logger.setLevel(level)
- logger.propagate = False
- handler = logging.FileHandler(filename)
- handler.setLevel(level)
- formatter = logging.Formatter(format_string, datefmt='%Y-%m-%d %H:%M:%S')
- handler.setFormatter(formatter)
- logger.addHandler(handler)
- return logger
-
-
  @typeguard.typechecked
  class MonitoringHub(RepresentationMixin):
  def __init__(self,
@@ -79,9 +45,6 @@ class MonitoringHub(RepresentationMixin):
  hub_port: Optional[int] = None,
  hub_port_range: Tuple[int, int] = (55050, 56000),

- client_address: str = "127.0.0.1",
- client_port_range: Tuple[int, int] = (55000, 56000),
-
  workflow_name: Optional[str] = None,
  workflow_version: Optional[str] = None,
  logging_endpoint: Optional[str] = None,
@@ -106,11 +69,6 @@ class MonitoringHub(RepresentationMixin):
  to deliver monitoring messages to the monitoring router.
  Note that despite the similar name, this is not related to hub_port.
  Default: (55050, 56000)
- client_address : str
- The ip address at which the dfk will be able to reach Hub. Default: "127.0.0.1"
- client_port_range : tuple(int, int)
- The MonitoringHub picks ports at random from the range which will be used by Hub.
- Default: (55000, 56000)
  workflow_name : str
  The name for the workflow. Default to the name of the parsl script
  workflow_version : str
@@ -134,8 +92,6 @@ class MonitoringHub(RepresentationMixin):
  Default: 30 seconds
  """

- self.logger = logger
-
  # Any is used to disable typechecking on uses of _dfk_channel,
  # because it is used in the code as if it points to a channel, but
  # the static type is that it can also be None. The code relies on
@@ -145,9 +101,6 @@ class MonitoringHub(RepresentationMixin):
  if _db_manager_excepts:
  raise _db_manager_excepts

- self.client_address = client_address
- self.client_port_range = client_port_range
-
  self.hub_address = hub_address
  self.hub_port = hub_port
  self.hub_port_range = hub_port_range
@@ -164,6 +117,8 @@ class MonitoringHub(RepresentationMixin):

  def start(self, run_id: str, dfk_run_dir: str, config_run_dir: Union[str, os.PathLike]) -> int:

+ logger.debug("Starting MonitoringHub")
+
  if self.logdir is None:
  self.logdir = "."

@@ -172,9 +127,6 @@

  os.makedirs(self.logdir, exist_ok=True)

- # Initialize the ZMQ pipe to the Parsl Client
-
- self.logger.debug("Initializing ZMQ Pipes to client")
  self.monitoring_hub_active = True

  # This annotation is incompatible with typeguard 4.x instrumentation
@@ -210,8 +162,8 @@
  self.router_proc = ForkProcess(target=router_starter,
  args=(comm_q, self.exception_q, self.priority_msgs, self.node_msgs, self.block_msgs, self.resource_msgs),
  kwargs={"hub_address": self.hub_address,
- "hub_port": self.hub_port,
- "hub_port_range": self.hub_port_range,
+ "udp_port": self.hub_port,
+ "zmq_port_range": self.hub_port_range,
  "logdir": self.logdir,
  "logging_level": logging.DEBUG if self.monitoring_debug else logging.INFO,
  "run_id": run_id
@@ -231,7 +183,7 @@
  daemon=True,
  )
  self.dbm_proc.start()
- self.logger.info("Started the router process {} and DBM process {}".format(self.router_proc.pid, self.dbm_proc.pid))
+ logger.info("Started the router process {} and DBM process {}".format(self.router_proc.pid, self.dbm_proc.pid))

  self.filesystem_proc = Process(target=filesystem_receiver,
  args=(self.logdir, self.resource_msgs, dfk_run_dir),
@@ -239,19 +191,19 @@
  daemon=True
  )
  self.filesystem_proc.start()
- self.logger.info(f"Started filesystem radio receiver process {self.filesystem_proc.pid}")
+ logger.info(f"Started filesystem radio receiver process {self.filesystem_proc.pid}")

  try:
  comm_q_result = comm_q.get(block=True, timeout=120)
  except queue.Empty:
- self.logger.error("Hub has not completed initialization in 120s. Aborting")
+ logger.error("Hub has not completed initialization in 120s. Aborting")
  raise Exception("Hub failed to start")

  if isinstance(comm_q_result, str):
- self.logger.error(f"MonitoringRouter sent an error message: {comm_q_result}")
+ logger.error(f"MonitoringRouter sent an error message: {comm_q_result}")
  raise RuntimeError(f"MonitoringRouter failed to start: {comm_q_result}")

- udp_port, ic_port = comm_q_result
+ udp_port, zmq_port = comm_q_result

  self.monitoring_hub_url = "udp://{}:{}".format(self.hub_address, udp_port)

@@ -261,28 +213,28 @@
  self._dfk_channel.setsockopt(zmq.LINGER, 0)
  self._dfk_channel.set_hwm(0)
  self._dfk_channel.setsockopt(zmq.SNDTIMEO, self.dfk_channel_timeout)
- self._dfk_channel.connect("tcp://{}:{}".format(self.hub_address, ic_port))
+ self._dfk_channel.connect("tcp://{}:{}".format(self.hub_address, zmq_port))

- self.logger.info("Monitoring Hub initialized")
+ logger.info("Monitoring Hub initialized")

- return ic_port
+ return zmq_port

  # TODO: tighten the Any message format
  def send(self, mtype: MessageType, message: Any) -> None:
- self.logger.debug("Sending message type {}".format(mtype))
+ logger.debug("Sending message type {}".format(mtype))
  try:
  self._dfk_channel.send_pyobj((mtype, message))
  except zmq.Again:
- self.logger.exception(
+ logger.exception(
  "The monitoring message sent from DFK to router timed-out after {}ms".format(self.dfk_channel_timeout))

  def close(self) -> None:
- self.logger.info("Terminating Monitoring Hub")
+ logger.info("Terminating Monitoring Hub")
  exception_msgs = []
  while True:
  try:
  exception_msgs.append(self.exception_q.get(block=False))
- self.logger.error("There was a queued exception (Either router or DBM process got exception much earlier?)")
+ logger.error("There was a queued exception (Either router or DBM process got exception much earlier?)")
  except queue.Empty:
  break
  if self._dfk_channel and self.monitoring_hub_active:
@@ -290,7 +242,7 @@
  self._dfk_channel.close()
  if exception_msgs:
  for exception_msg in exception_msgs:
- self.logger.error(
+ logger.error(
  "{} process delivered an exception: {}. Terminating all monitoring processes immediately.".format(
  exception_msg[0],
  exception_msg[1]
@@ -299,21 +251,21 @@
  self.router_proc.terminate()
  self.dbm_proc.terminate()
  self.filesystem_proc.terminate()
- self.logger.info("Waiting for router to terminate")
+ logger.info("Waiting for router to terminate")
  self.router_proc.join()
- self.logger.debug("Finished waiting for router termination")
+ logger.debug("Finished waiting for router termination")
  if len(exception_msgs) == 0:
- self.logger.debug("Sending STOP to DBM")
+ logger.debug("Sending STOP to DBM")
  self.priority_msgs.put(("STOP", 0))
  else:
- self.logger.debug("Not sending STOP to DBM, because there were DBM exceptions")
- self.logger.debug("Waiting for DB termination")
+ logger.debug("Not sending STOP to DBM, because there were DBM exceptions")
+ logger.debug("Waiting for DB termination")
  self.dbm_proc.join()
- self.logger.debug("Finished waiting for DBM termination")
+ logger.debug("Finished waiting for DBM termination")

  # should this be message based? it probably doesn't need to be if
  # we believe we've received all messages
- self.logger.info("Terminating filesystem radio receiver process")
+ logger.info("Terminating filesystem radio receiver process")
  self.filesystem_proc.terminate()
  self.filesystem_proc.join()

@@ -337,9 +289,9 @@

  @wrap_with_logs
  def filesystem_receiver(logdir: str, q: "queue.Queue[AddressedMonitoringMessage]", run_dir: str) -> None:
- logger = start_file_logger("{}/monitoring_filesystem_radio.log".format(logdir),
- name="monitoring_filesystem_radio",
- level=logging.INFO)
+ logger = set_file_logger("{}/monitoring_filesystem_radio.log".format(logdir),
+ name="monitoring_filesystem_radio",
+ level=logging.INFO)

  logger.info("Starting filesystem radio receiver")
  setproctitle("parsl: monitoring filesystem receiver")
@@ -369,189 +321,3 @@ def filesystem_receiver(logdir: str, q: "queue.Queue[AddressedMonitoringMessage]
  logger.exception(f"Exception processing {filename} - probably will be retried next iteration")

  time.sleep(1) # whats a good time for this poll?
-
-
- class MonitoringRouter:
-
- def __init__(self,
- *,
- hub_address: str,
- hub_port: Optional[int] = None,
- hub_port_range: Tuple[int, int] = (55050, 56000),
-
- monitoring_hub_address: str = "127.0.0.1",
- logdir: str = ".",
- run_id: str,
- logging_level: int = logging.INFO,
- atexit_timeout: int = 3 # in seconds
- ):
- """ Initializes a monitoring configuration class.
-
- Parameters
- ----------
- hub_address : str
- The ip address at which the workers will be able to reach the Hub.
- hub_port : int
- The specific port at which workers will be able to reach the Hub via UDP. Default: None
- hub_port_range : tuple(int, int)
- The MonitoringHub picks ports at random from the range which will be used by Hub.
- This is overridden when the hub_port option is set. Default: (55050, 56000)
- logdir : str
- Parsl log directory paths. Logs and temp files go here. Default: '.'
- logging_level : int
- Logging level as defined in the logging module. Default: logging.INFO
- atexit_timeout : float, optional
- The amount of time in seconds to terminate the hub without receiving any messages, after the last dfk workflow message is received.
-
- """
- os.makedirs(logdir, exist_ok=True)
- self.logger = start_file_logger("{}/monitoring_router.log".format(logdir),
- name="monitoring_router",
- level=logging_level)
- self.logger.debug("Monitoring router starting")
-
- self.hub_address = hub_address
- self.atexit_timeout = atexit_timeout
- self.run_id = run_id
-
- self.loop_freq = 10.0 # milliseconds
-
- # Initialize the UDP socket
- self.sock = socket.socket(socket.AF_INET,
- socket.SOCK_DGRAM,
- socket.IPPROTO_UDP)
-
- # We are trying to bind to all interfaces with 0.0.0.0
- if not hub_port:
- self.sock.bind(('0.0.0.0', 0))
- self.hub_port = self.sock.getsockname()[1]
- else:
- self.hub_port = hub_port
- try:
- self.sock.bind(('0.0.0.0', self.hub_port))
- except Exception as e:
- raise RuntimeError(f"Could not bind to hub_port {hub_port} because: {e}")
- self.sock.settimeout(self.loop_freq / 1000)
- self.logger.info("Initialized the UDP socket on 0.0.0.0:{}".format(self.hub_port))
-
- self._context = zmq.Context()
- self.ic_channel = self._context.socket(zmq.DEALER)
- self.ic_channel.setsockopt(zmq.LINGER, 0)
- self.ic_channel.set_hwm(0)
- self.ic_channel.RCVTIMEO = int(self.loop_freq) # in milliseconds
- self.logger.debug("hub_address: {}. hub_port_range {}".format(hub_address, hub_port_range))
- self.ic_port = self.ic_channel.bind_to_random_port("tcp://*",
- min_port=hub_port_range[0],
- max_port=hub_port_range[1])
-
- def start(self,
- priority_msgs: "queue.Queue[AddressedMonitoringMessage]",
- node_msgs: "queue.Queue[AddressedMonitoringMessage]",
- block_msgs: "queue.Queue[AddressedMonitoringMessage]",
- resource_msgs: "queue.Queue[AddressedMonitoringMessage]") -> None:
- try:
- router_keep_going = True
- while router_keep_going:
- try:
- data, addr = self.sock.recvfrom(2048)
- resource_msg = pickle.loads(data)
- self.logger.debug("Got UDP Message from {}: {}".format(addr, resource_msg))
- resource_msgs.put((resource_msg, addr))
- except socket.timeout:
- pass
-
- try:
- dfk_loop_start = time.time()
- while time.time() - dfk_loop_start < 1.0: # TODO make configurable
- # note that nothing checks that msg really is of the annotated type
- msg: TaggedMonitoringMessage
- msg = self.ic_channel.recv_pyobj()
-
- assert isinstance(msg, tuple), "IC Channel expects only tuples, got {}".format(msg)
- assert len(msg) >= 1, "IC Channel expects tuples of length at least 1, got {}".format(msg)
- assert len(msg) == 2, "IC Channel expects message tuples of exactly length 2, got {}".format(msg)
-
- msg_0: AddressedMonitoringMessage
- msg_0 = (msg, 0)
-
- if msg[0] == MessageType.NODE_INFO:
- msg[1]['run_id'] = self.run_id
- node_msgs.put(msg_0)
- elif msg[0] == MessageType.RESOURCE_INFO:
- resource_msgs.put(msg_0)
- elif msg[0] == MessageType.BLOCK_INFO:
- block_msgs.put(msg_0)
- elif msg[0] == MessageType.TASK_INFO:
- priority_msgs.put(msg_0)
- elif msg[0] == MessageType.WORKFLOW_INFO:
- priority_msgs.put(msg_0)
- if 'exit_now' in msg[1] and msg[1]['exit_now']:
- router_keep_going = False
- else:
- # There is a type: ignore here because if msg[0]
- # is of the correct type, this code is unreachable,
- # but there is no verification that the message
- # received from ic_channel.recv_pyobj() is actually
- # of that type.
- self.logger.error(f"Discarding message from interchange with unknown type {msg[0].value}") # type: ignore[unreachable]
- except zmq.Again:
- pass
- except Exception:
- # This will catch malformed messages. What happens if the
- # channel is broken in such a way that it always raises
- # an exception? Looping on this would maybe be the wrong
- # thing to do.
- self.logger.warning("Failure processing a ZMQ message", exc_info=True)
-
- self.logger.info("Monitoring router draining")
- last_msg_received_time = time.time()
- while time.time() - last_msg_received_time < self.atexit_timeout:
- try:
- data, addr = self.sock.recvfrom(2048)
- msg = pickle.loads(data)
- self.logger.debug("Got UDP Message from {}: {}".format(addr, msg))
- resource_msgs.put((msg, addr))
- last_msg_received_time = time.time()
- except socket.timeout:
- pass
-
- self.logger.info("Monitoring router finishing normally")
- finally:
- self.logger.info("Monitoring router finished")
-
-
- @wrap_with_logs
- def router_starter(comm_q: "queue.Queue[Union[Tuple[int, int], str]]",
- exception_q: "queue.Queue[Tuple[str, str]]",
- priority_msgs: "queue.Queue[AddressedMonitoringMessage]",
- node_msgs: "queue.Queue[AddressedMonitoringMessage]",
- block_msgs: "queue.Queue[AddressedMonitoringMessage]",
- resource_msgs: "queue.Queue[AddressedMonitoringMessage]",
-
- hub_address: str,
- hub_port: Optional[int],
- hub_port_range: Tuple[int, int],
-
- logdir: str,
- logging_level: int,
- run_id: str) -> None:
- setproctitle("parsl: monitoring router")
- try:
- router = MonitoringRouter(hub_address=hub_address,
- hub_port=hub_port,
- hub_port_range=hub_port_range,
- logdir=logdir,
- logging_level=logging_level,
- run_id=run_id)
- except Exception as e:
- logger.error("MonitoringRouter construction failed.", exc_info=True)
- comm_q.put(f"Monitoring router construction failed: {e}")
- else:
- comm_q.put((router.hub_port, router.ic_port))
-
- router.logger.info("Starting MonitoringRouter in router_starter")
- try:
- router.start(priority_msgs, node_msgs, block_msgs, resource_msgs)
- except Exception as e:
- router.logger.exception("router.start exception")
- exception_q.put(('Hub', str(e)))
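The MonitoringRouter class and router_starter removed here move to the new parsl/monitoring/router.py (+208 in the file list above). The startup handshake keeps its shape: the forked router reports back on comm_q either an error string on construction failure or, on success, a (udp_port, zmq_port) tuple, and MonitoringHub.start distinguishes the two by type. A minimal sketch of the consuming side, with a hypothetical helper name:

    import queue

    def wait_for_router_ports(comm_q, timeout=120):
        """Hypothetical helper mirroring MonitoringHub.start's handshake."""
        try:
            result = comm_q.get(block=True, timeout=timeout)
        except queue.Empty:
            raise Exception("Hub failed to start")
        if isinstance(result, str):
            raise RuntimeError(f"MonitoringRouter failed to start: {result}")
        udp_port, zmq_port = result  # workers use UDP; the DFK dials the ZMQ port
        return udp_port, zmq_port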