parsl 2024.4.8__py3-none-any.whl → 2024.4.22__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53)
  1. parsl/addresses.py +2 -2
  2. parsl/app/bash.py +10 -2
  3. parsl/app/errors.py +3 -5
  4. parsl/data_provider/data_manager.py +2 -1
  5. parsl/data_provider/zip.py +104 -0
  6. parsl/dataflow/dflow.py +92 -43
  7. parsl/dataflow/futures.py +26 -12
  8. parsl/executors/base.py +28 -9
  9. parsl/executors/high_throughput/executor.py +14 -19
  10. parsl/executors/high_throughput/process_worker_pool.py +3 -1
  11. parsl/executors/status_handling.py +81 -1
  12. parsl/executors/taskvine/executor.py +13 -2
  13. parsl/executors/workqueue/executor.py +14 -3
  14. parsl/jobs/job_status_poller.py +19 -113
  15. parsl/jobs/strategy.py +22 -27
  16. parsl/monitoring/monitoring.py +29 -23
  17. parsl/monitoring/radios.py +15 -0
  18. parsl/monitoring/router.py +7 -6
  19. parsl/providers/local/local.py +1 -1
  20. parsl/tests/configs/htex_local_alternate.py +2 -1
  21. parsl/tests/configs/taskvine_ex.py +1 -2
  22. parsl/tests/configs/workqueue_ex.py +1 -2
  23. parsl/tests/conftest.py +6 -7
  24. parsl/tests/test_bash_apps/test_basic.py +7 -4
  25. parsl/tests/test_bash_apps/test_error_codes.py +0 -3
  26. parsl/tests/test_bash_apps/test_kwarg_storage.py +0 -1
  27. parsl/tests/test_bash_apps/test_memoize.py +0 -2
  28. parsl/tests/test_bash_apps/test_memoize_ignore_args.py +0 -1
  29. parsl/tests/test_bash_apps/test_memoize_ignore_args_regr.py +0 -1
  30. parsl/tests/test_bash_apps/test_multiline.py +0 -1
  31. parsl/tests/test_bash_apps/test_stdout.py +11 -6
  32. parsl/tests/test_checkpointing/test_task_exit.py +1 -1
  33. parsl/tests/test_htex/test_zmq_binding.py +1 -0
  34. parsl/tests/test_monitoring/test_basic.py +46 -21
  35. parsl/tests/test_monitoring/test_fuzz_zmq.py +10 -1
  36. parsl/tests/test_monitoring/test_stdouterr.py +137 -0
  37. parsl/tests/test_python_apps/test_context_manager.py +3 -3
  38. parsl/tests/test_python_apps/test_outputs.py +0 -1
  39. parsl/tests/test_scaling/test_regression_1621.py +11 -11
  40. parsl/tests/test_scaling/test_scale_down_htex_unregistered.py +74 -0
  41. parsl/tests/test_staging/test_staging_stdout.py +61 -0
  42. parsl/tests/test_staging/test_zip_out.py +113 -0
  43. parsl/utils.py +11 -2
  44. parsl/version.py +1 -1
  45. {parsl-2024.4.8.data → parsl-2024.4.22.data}/scripts/process_worker_pool.py +3 -1
  46. {parsl-2024.4.8.dist-info → parsl-2024.4.22.dist-info}/METADATA +5 -4
  47. {parsl-2024.4.8.dist-info → parsl-2024.4.22.dist-info}/RECORD +53 -48
  48. {parsl-2024.4.8.data → parsl-2024.4.22.data}/scripts/exec_parsl_function.py +0 -0
  49. {parsl-2024.4.8.data → parsl-2024.4.22.data}/scripts/parsl_coprocess.py +0 -0
  50. {parsl-2024.4.8.dist-info → parsl-2024.4.22.dist-info}/LICENSE +0 -0
  51. {parsl-2024.4.8.dist-info → parsl-2024.4.22.dist-info}/WHEEL +0 -0
  52. {parsl-2024.4.8.dist-info → parsl-2024.4.22.dist-info}/entry_points.txt +0 -0
  53. {parsl-2024.4.8.dist-info → parsl-2024.4.22.dist-info}/top_level.txt +0 -0
parsl/executors/status_handling.py CHANGED
@@ -1,15 +1,18 @@
  from __future__ import annotations
+ import datetime
  import logging
  import threading
+ import time
  from itertools import compress
  from abc import abstractmethod, abstractproperty
  from concurrent.futures import Future
- from typing import List, Any, Dict, Optional, Tuple, Union, Callable
+ from typing import List, Any, Dict, Optional, Sequence, Tuple, Union, Callable

  from parsl.executors.base import ParslExecutor
  from parsl.executors.errors import BadStateException, ScalingFailed
  from parsl.jobs.states import JobStatus, JobState
  from parsl.jobs.error_handlers import simple_error_handler, noop_error_handler
+ from parsl.monitoring.message_type import MessageType
  from parsl.providers.base import ExecutionProvider
  from parsl.utils import AtomicIDCounter

@@ -71,6 +74,9 @@ class BlockProviderExecutor(ParslExecutor):
          self.blocks_to_job_id = {}  # type: Dict[str, str]
          self.job_ids_to_block = {}  # type: Dict[str, str]

+         self._last_poll_time = 0.0
+         self._status = {}  # type: Dict[str, JobStatus]
+
      def _make_status_dict(self, block_ids: List[str], status_list: List[JobStatus]) -> Dict[str, JobStatus]:
          """Given a list of block ids and a list of corresponding status strings,
          returns a dictionary mapping each block id to the corresponding status
@@ -234,3 +240,77 @@ class BlockProviderExecutor(ParslExecutor):
      @abstractproperty
      def workers_per_node(self) -> Union[int, float]:
          pass
+
+     def send_monitoring_info(self, status: Dict) -> None:
+         # Send monitoring info for HTEX when monitoring enabled
+         if self.monitoring_radio:
+             msg = self.create_monitoring_info(status)
+             logger.debug("Sending message {} to hub from job status poller".format(msg))
+             self.monitoring_radio.send((MessageType.BLOCK_INFO, msg))
+
+     def create_monitoring_info(self, status: Dict[str, JobStatus]) -> Sequence[object]:
+         """Create a monitoring message for each block based on the poll status.
+         """
+         msg = []
+         for bid, s in status.items():
+             d: Dict[str, Any] = {}
+             d['run_id'] = self.run_id
+             d['status'] = s.status_name
+             d['timestamp'] = datetime.datetime.now()
+             d['executor_label'] = self.label
+             d['job_id'] = self.blocks_to_job_id.get(bid, None)
+             d['block_id'] = bid
+             msg.append(d)
+         return msg
+
+     def poll_facade(self) -> None:
+         now = time.time()
+         if now >= self._last_poll_time + self.status_polling_interval:
+             previous_status = self._status
+             self._status = self.status()
+             self._last_poll_time = now
+             delta_status = {}
+             for block_id in self._status:
+                 if block_id not in previous_status \
+                    or previous_status[block_id].state != self._status[block_id].state:
+                     delta_status[block_id] = self._status[block_id]
+
+             if delta_status:
+                 self.send_monitoring_info(delta_status)
+
+     @property
+     def status_facade(self) -> Dict[str, JobStatus]:
+         """Return the status of all jobs/blocks of the executor of this poller.
+
+         :return: a dictionary mapping block ids (in string) to job status
+         """
+         return self._status
+
+     def scale_in_facade(self, n: int, max_idletime: Optional[float] = None) -> List[str]:
+
+         if max_idletime is None:
+             block_ids = self.scale_in(n)
+         else:
+             # This is a HighThroughputExecutor-specific interface violation.
+             # This code hopes, through pan-codebase reasoning, that this
+             # scale_in method really does come from HighThroughputExecutor,
+             # and so does have an extra max_idletime parameter not present
+             # in the executor interface.
+             block_ids = self.scale_in(n, max_idletime=max_idletime)  # type: ignore[call-arg]
+         if block_ids is not None:
+             new_status = {}
+             for block_id in block_ids:
+                 new_status[block_id] = JobStatus(JobState.CANCELLED)
+                 del self._status[block_id]
+             self.send_monitoring_info(new_status)
+         return block_ids
+
+     def scale_out_facade(self, n: int) -> List[str]:
+         block_ids = self.scale_out(n)
+         if block_ids is not None:
+             new_status = {}
+             for block_id in block_ids:
+                 new_status[block_id] = JobStatus(JobState.PENDING)
+             self.send_monitoring_info(new_status)
+             self._status.update(new_status)
+         return block_ids
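
The new facade methods move per-executor poll bookkeeping out of the job status poller and into BlockProviderExecutor itself. A rough sketch of the call pattern, assuming `executor` is a started BlockProviderExecutor subclass (illustrative only, not parsl's actual poller loop):

    import time

    def poll_loop(executor, period: float = 5.0) -> None:
        # Illustrative driver only: JobStatusPoller does this via a Timer.
        while not executor.bad_state_is_set:
            # poll_facade re-queries provider status at most once per
            # status_polling_interval and forwards only changed blocks
            # to the monitoring radio (if one is configured).
            executor.poll_facade()

            # status_facade returns the cached Dict[str, JobStatus]
            # from the most recent poll.
            for block_id, job_status in executor.status_facade.items():
                print(block_id, job_status.state)

            time.sleep(period)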
parsl/executors/taskvine/executor.py CHANGED
@@ -596,7 +596,7 @@ class TaskVineExecutor(BlockProviderExecutor, putils.RepresentationMixin):
      def workers_per_node(self) -> Union[int, float]:
          return 1

-     def scale_in(self, count):
+     def scale_in(self, count: int) -> List[str]:
          """Scale in method. Cancel a given number of blocks
          """
          # Obtain list of blocks to kill
@@ -605,9 +605,14 @@ class TaskVineExecutor(BlockProviderExecutor, putils.RepresentationMixin):

          # Cancel the blocks provisioned
          if self.provider:
-             self.provider.cancel(kill_ids)
+             logger.info(f"Scaling in jobs: {kill_ids}")
+             r = self.provider.cancel(kill_ids)
+             job_ids = self._filter_scale_in_ids(kill_ids, r)
+             block_ids_killed = [self.job_ids_to_block[jid] for jid in job_ids]
+             return block_ids_killed
          else:
              logger.error("No execution provider available to scale")
+             return []

      def shutdown(self, *args, **kwargs):
          """Shutdown the executor. Sets flag to cancel the submit process and
@@ -639,6 +644,12 @@ class TaskVineExecutor(BlockProviderExecutor, putils.RepresentationMixin):
          logger.debug("Joining on factory process")
          self._factory_process.join()

+         # Shutdown multiprocessing queues
+         self._ready_task_queue.close()
+         self._ready_task_queue.join_thread()
+         self._finished_task_queue.close()
+         self._finished_task_queue.join_thread()
+
          self._is_shutdown = True
          logger.debug("TaskVine shutdown completed")
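
scale_in now reports which blocks were actually cancelled: each cancel request is paired with the provider's per-job result via _filter_scale_in_ids, and the surviving job ids are mapped back to block ids through job_ids_to_block. A minimal sketch of that filtering idea, assuming provider.cancel returns one boolean per job id (illustrative; the real helper lives on BlockProviderExecutor):

    from itertools import compress
    from typing import List

    def filter_scale_in_ids(job_ids: List[str], cancel_results: List[bool]) -> List[str]:
        # Keep only the job ids whose cancel request succeeded.
        return list(compress(job_ids, cancel_results))

    # Example: two of three cancels succeed, so only those jobs' blocks
    # would be reported as scaled in.
    kill_ids = ["job-1", "job-2", "job-3"]
    assert filter_scale_in_ids(kill_ids, [True, False, True]) == ["job-1", "job-3"]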
 
parsl/executors/workqueue/executor.py CHANGED
@@ -691,7 +691,7 @@ class WorkQueueExecutor(BlockProviderExecutor, putils.RepresentationMixin):
      def workers_per_node(self) -> Union[int, float]:
          return 1

-     def scale_in(self, count):
+     def scale_in(self, count: int) -> List[str]:
          """Scale in method.
          """
          # Obtain list of blocks to kill
@@ -700,9 +700,14 @@ class WorkQueueExecutor(BlockProviderExecutor, putils.RepresentationMixin):

          # Cancel the blocks provisioned
          if self.provider:
-             self.provider.cancel(kill_ids)
+             logger.info(f"Scaling in jobs: {kill_ids}")
+             r = self.provider.cancel(kill_ids)
+             job_ids = self._filter_scale_in_ids(kill_ids, r)
+             block_ids_killed = [self.job_ids_to_block[jid] for jid in job_ids]
+             return block_ids_killed
          else:
-             logger.error("No execution provider available to scale")
+             logger.error("No execution provider available to scale in")
+             return []

      def shutdown(self, *args, **kwargs):
          """Shutdown the executor. Sets flag to cancel the submit process and
@@ -730,6 +735,12 @@ class WorkQueueExecutor(BlockProviderExecutor, putils.RepresentationMixin):
          logger.debug("Joining on collector thread")
          self.collector_thread.join()

+         logger.debug("Closing multiprocessing queues")
+         self.task_queue.close()
+         self.task_queue.join_thread()
+         self.collector_queue.close()
+         self.collector_queue.join_thread()
+
          self.is_shutdown = True
          logger.debug("Work Queue shutdown completed")
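
Both the TaskVine and Work Queue shutdown paths now close their multiprocessing queues explicitly. The close()/join_thread() pair is the standard multiprocessing.Queue cleanup: close() declares that this process will put no more data, and join_thread() waits for the queue's background feeder thread to flush. A standalone sketch of that pattern:

    from multiprocessing import Queue

    q: Queue = Queue()
    q.put("last message")

    # Drain whatever the consumer still needs before closing.
    print(q.get())

    q.close()        # no further puts from this process
    q.join_thread()  # wait for the feeder thread to finish flushing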
 
parsl/jobs/job_status_poller.py CHANGED
@@ -1,13 +1,9 @@
  import logging
  import parsl
- import time
- import zmq
- from typing import Dict, List, Sequence, Optional, Union
+ from typing import List, Sequence, Optional, Union

- from parsl.jobs.states import JobStatus, JobState
  from parsl.jobs.strategy import Strategy
  from parsl.executors.status_handling import BlockProviderExecutor
- from parsl.monitoring.message_type import MessageType


  from parsl.utils import Timer
@@ -16,137 +12,47 @@ from parsl.utils import Timer

  logger = logging.getLogger(__name__)


- class PolledExecutorFacade:
-     def __init__(self, executor: BlockProviderExecutor, dfk: Optional["parsl.dataflow.dflow.DataFlowKernel"] = None):
-         self._executor = executor
-         self._interval = executor.status_polling_interval
-         self._last_poll_time = 0.0
-         self._status = {}  # type: Dict[str, JobStatus]
-
-         # Create a ZMQ channel to send poll status to monitoring
-         self.monitoring_enabled = False
-         if dfk and dfk.monitoring is not None:
-             self.monitoring_enabled = True
-             hub_address = dfk.hub_address
-             hub_port = dfk.hub_zmq_port
-             context = zmq.Context()
-             self.hub_channel = context.socket(zmq.DEALER)
-             self.hub_channel.set_hwm(0)
-             self.hub_channel.connect("tcp://{}:{}".format(hub_address, hub_port))
-             logger.info("Monitoring enabled on job status poller")
-
-     def _should_poll(self, now: float) -> bool:
-         return now >= self._last_poll_time + self._interval
-
-     def poll(self, now: float) -> None:
-         if self._should_poll(now):
-             previous_status = self._status
-             self._status = self._executor.status()
-             self._last_poll_time = now
-             delta_status = {}
-             for block_id in self._status:
-                 if block_id not in previous_status \
-                    or previous_status[block_id].state != self._status[block_id].state:
-                     delta_status[block_id] = self._status[block_id]
-
-             if delta_status:
-                 self.send_monitoring_info(delta_status)
-
-     def send_monitoring_info(self, status: Dict) -> None:
-         # Send monitoring info for HTEX when monitoring enabled
-         if self.monitoring_enabled:
-             msg = self._executor.create_monitoring_info(status)
-             logger.debug("Sending message {} to hub from job status poller".format(msg))
-             self.hub_channel.send_pyobj((MessageType.BLOCK_INFO, msg))
-
-     @property
-     def status(self) -> Dict[str, JobStatus]:
-         """Return the status of all jobs/blocks of the executor of this poller.
-
-         :return: a dictionary mapping block ids (in string) to job status
-         """
-         return self._status
-
-     @property
-     def executor(self) -> BlockProviderExecutor:
-         return self._executor
-
-     def scale_in(self, n: int, max_idletime: Optional[float] = None) -> List[str]:
-
-         if max_idletime is None:
-             block_ids = self._executor.scale_in(n)
-         else:
-             # This is a HighThroughputExecutor-specific interface violation.
-             # This code hopes, through pan-codebase reasoning, that this
-             # scale_in method really does come from HighThroughputExecutor,
-             # and so does have an extra max_idletime parameter not present
-             # in the executor interface.
-             block_ids = self._executor.scale_in(n, max_idletime=max_idletime)  # type: ignore[call-arg]
-         if block_ids is not None:
-             new_status = {}
-             for block_id in block_ids:
-                 new_status[block_id] = JobStatus(JobState.CANCELLED)
-                 del self._status[block_id]
-             self.send_monitoring_info(new_status)
-         return block_ids
-
-     def scale_out(self, n: int) -> List[str]:
-         block_ids = self._executor.scale_out(n)
-         if block_ids is not None:
-             new_status = {}
-             for block_id in block_ids:
-                 new_status[block_id] = JobStatus(JobState.PENDING)
-             self.send_monitoring_info(new_status)
-             self._status.update(new_status)
-         return block_ids
-
-     def __repr__(self) -> str:
-         return self._status.__repr__()
-
-
  class JobStatusPoller(Timer):
      def __init__(self, *, strategy: Optional[str], max_idletime: float,
                   strategy_period: Union[float, int],
-                  dfk: Optional["parsl.dataflow.dflow.DataFlowKernel"] = None) -> None:
-         self._executor_facades = []  # type: List[PolledExecutorFacade]
-         self.dfk = dfk
+                  monitoring: Optional["parsl.monitoring.radios.MonitoringRadio"] = None) -> None:
+         self._executors = []  # type: List[BlockProviderExecutor]
          self._strategy = Strategy(strategy=strategy,
                                    max_idletime=max_idletime)
          super().__init__(self.poll, interval=strategy_period, name="JobStatusPoller")

      def poll(self) -> None:
          self._update_state()
-         self._run_error_handlers(self._executor_facades)
-         self._strategy.strategize(self._executor_facades)
+         self._run_error_handlers(self._executors)
+         self._strategy.strategize(self._executors)

-     def _run_error_handlers(self, status: List[PolledExecutorFacade]) -> None:
-         for es in status:
-             es.executor.handle_errors(es.status)
+     def _run_error_handlers(self, executors: List[BlockProviderExecutor]) -> None:
+         for e in executors:
+             e.handle_errors(e.status_facade)

      def _update_state(self) -> None:
-         now = time.time()
-         for item in self._executor_facades:
-             item.poll(now)
+         for item in self._executors:
+             item.poll_facade()

      def add_executors(self, executors: Sequence[BlockProviderExecutor]) -> None:
          for executor in executors:
              if executor.status_polling_interval > 0:
                  logger.debug("Adding executor {}".format(executor.label))
-                 self._executor_facades.append(PolledExecutorFacade(executor, self.dfk))
+                 self._executors.append(executor)
          self._strategy.add_executors(executors)

-     def close(self):
-         super().close()
-         for ef in self._executor_facades:
-             if not ef.executor.bad_state_is_set:
-                 logger.info(f"Scaling in executor {ef.executor.label}")
+     def close(self, timeout: Optional[float] = None) -> None:
+         super().close(timeout)
+         for executor in self._executors:
+             if not executor.bad_state_is_set:
+                 logger.info(f"Scaling in executor {executor.label}")

                  # this code needs to be at least as many blocks as need
                  # cancelling, but it is safe to be more, as the scaling
                  # code will cope with being asked to cancel more blocks
                  # than exist.
-                 block_count = len(ef.status)
-                 ef.scale_in(block_count)
+                 block_count = len(executor.status_facade)
+                 executor.scale_in_facade(block_count)

              else:  # and bad_state_is_set
-                 logger.warning(f"Not scaling in executor {ef.executor.label} because it is in bad state")
+                 logger.warning(f"Not scaling in executor {executor.label} because it is in bad state")
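
With PolledExecutorFacade gone, JobStatusPoller works directly on BlockProviderExecutor instances and delegates polling, status and scaling to the facade methods added in status_handling.py. A hedged usage sketch based only on the constructor signature shown above (htex is assumed to be an already-configured HighThroughputExecutor; argument values are illustrative):

    poller = JobStatusPoller(strategy="simple",
                             max_idletime=60.0,
                             strategy_period=5,
                             monitoring=None)

    # Executors with a positive status_polling_interval are registered;
    # each Timer tick calls executor.poll_facade() and then runs the strategy.
    poller.add_executors([htex])

    # close() scales in remaining blocks via executor.scale_in_facade(...)
    poller.close()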
parsl/jobs/strategy.py CHANGED
@@ -5,8 +5,6 @@ import math
  import warnings
  from typing import Dict, List, Optional, Sequence, TypedDict

- import parsl.jobs.job_status_poller as jsp
-
  from parsl.executors import HighThroughputExecutor
  from parsl.executors.base import ParslExecutor
  from parsl.executors.status_handling import BlockProviderExecutor
@@ -150,22 +148,21 @@ class Strategy:
          for executor in executors:
              self.executors[executor.label] = {'idle_since': None, 'first': True}

-     def _strategy_init_only(self, executor_facades: List[jsp.PolledExecutorFacade]) -> None:
+     def _strategy_init_only(self, executors: List[BlockProviderExecutor]) -> None:
          """Scale up to init_blocks at the start, then nothing more.
          """
-         for ef in executor_facades:
-             executor = ef.executor
+         for executor in executors:
              if self.executors[executor.label]['first']:
                  logger.debug(f"strategy_init_only: scaling out {executor.provider.init_blocks} initial blocks for {executor.label}")
-                 ef.scale_out(executor.provider.init_blocks)
+                 executor.scale_out_facade(executor.provider.init_blocks)
                  self.executors[executor.label]['first'] = False
              else:
                  logger.debug("strategy_init_only: doing nothing")

-     def _strategy_simple(self, executor_facades: List[jsp.PolledExecutorFacade]) -> None:
-         self._general_strategy(executor_facades, strategy_type='simple')
+     def _strategy_simple(self, executors: List[BlockProviderExecutor]) -> None:
+         self._general_strategy(executors, strategy_type='simple')

-     def _strategy_htex_auto_scale(self, executor_facades: List[jsp.PolledExecutorFacade]) -> None:
+     def _strategy_htex_auto_scale(self, executors: List[BlockProviderExecutor]) -> None:
          """HTEX specific auto scaling strategy

          This strategy works only for HTEX. This strategy will scale out by
@@ -180,30 +177,25 @@ class Strategy:
          expected to scale in effectively only when # of workers, or tasks executing
          per block is close to 1.
          """
-         self._general_strategy(executor_facades, strategy_type='htex')
+         self._general_strategy(executors, strategy_type='htex')

      @wrap_with_logs
-     def _general_strategy(self, executor_facades, *, strategy_type):
-         logger.debug(f"general strategy starting with strategy_type {strategy_type} for {len(executor_facades)} executors")
+     def _general_strategy(self, executors: List[BlockProviderExecutor], *, strategy_type: str) -> None:
+         logger.debug(f"general strategy starting with strategy_type {strategy_type} for {len(executors)} executors")

-         for ef in executor_facades:
-             executor = ef.executor
+         for executor in executors:
              label = executor.label
-             if not isinstance(executor, BlockProviderExecutor):
-                 logger.debug(f"Not strategizing for executor {label} because scaling not enabled")
-                 continue
              logger.debug(f"Strategizing for executor {label}")

              if self.executors[label]['first']:
-                 executor = ef.executor
                  logger.debug(f"Scaling out {executor.provider.init_blocks} initial blocks for {label}")
-                 ef.scale_out(executor.provider.init_blocks)
+                 executor.scale_out_facade(executor.provider.init_blocks)
                  self.executors[label]['first'] = False

              # Tasks that are either pending completion
              active_tasks = executor.outstanding

-             status = ef.status
+             status = executor.status_facade

              # FIXME we need to handle case where provider does not define these
              # FIXME probably more of this logic should be moved to the provider
@@ -247,23 +239,26 @@ class Strategy:
                  else:
                      # We want to make sure that max_idletime is reached
                      # before killing off resources
-                     logger.debug(f"Strategy case 1b: Executor has no active tasks, and more ({active_blocks}) than minimum blocks ({min_blocks})")
+                     logger.debug(f"Strategy case 1b: Executor has no active tasks, and more ({active_blocks})"
+                                  f" than minimum blocks ({min_blocks})")

                      if not self.executors[executor.label]['idle_since']:
                          logger.debug(f"Starting idle timer for executor. If idle time exceeds {self.max_idletime}s, blocks will be scaled in")
                          self.executors[executor.label]['idle_since'] = time.time()
-
                      idle_since = self.executors[executor.label]['idle_since']
+                     assert idle_since is not None, "The `if` statement above this assert should have forced idle time to be not-None"
+
                      idle_duration = time.time() - idle_since
                      if idle_duration > self.max_idletime:
                          # We have resources idle for the max duration,
                          # we have to scale_in now.
                          logger.debug(f"Idle time has reached {self.max_idletime}s for executor {label}; scaling in")
-                         ef.scale_in(active_blocks - min_blocks)
+                         executor.scale_in_facade(active_blocks - min_blocks)

                      else:
                          logger.debug(
-                             f"Idle time {idle_duration}s is less than max_idletime {self.max_idletime}s for executor {label}; not scaling in")
+                             f"Idle time {idle_duration}s is less than max_idletime {self.max_idletime}s"
+                             f" for executor {label}; not scaling in")

              # Case 2
              # More tasks than the available slots.
@@ -282,7 +277,7 @@ class Strategy:
                      excess_blocks = math.ceil(float(excess_slots) / (tasks_per_node * nodes_per_block))
                      excess_blocks = min(excess_blocks, max_blocks - active_blocks)
                      logger.debug(f"Requesting {excess_blocks} more blocks")
-                     ef.scale_out(excess_blocks)
+                     executor.scale_out_facade(excess_blocks)

              elif active_slots == 0 and active_tasks > 0:
                  logger.debug("Strategy case 4a: No active slots but some active tasks - could scale out by a single block")
@@ -291,7 +286,7 @@ class Strategy:
                  if active_blocks < max_blocks:
                      logger.debug("Requesting single block")

-                     ef.scale_out(1)
+                     executor.scale_out_facade(1)
                  else:
                      logger.debug("Not requesting single block, because at maxblocks already")

@@ -307,7 +302,7 @@ class Strategy:
                      excess_blocks = math.ceil(float(excess_slots) / (tasks_per_node * nodes_per_block))
                      excess_blocks = min(excess_blocks, active_blocks - min_blocks)
                      logger.debug(f"Requesting scaling in by {excess_blocks} blocks with idle time {self.max_idletime}s")
-                     ef.scale_in(excess_blocks, max_idletime=self.max_idletime)
+                     executor.scale_in_facade(excess_blocks, max_idletime=self.max_idletime)
                  else:
                      logger.error("This strategy does not support scaling in except for HighThroughputExecutor - taking no action")
              else:
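
The block arithmetic in the scale-out branches is unchanged by this refactor: the number of extra blocks is the ceiling of the slot shortfall divided by the slots one block provides, capped so the total never exceeds max_blocks. A small worked example with assumed numbers (the shortfall calculation here is simplified; only the two excess_blocks lines are taken from the code above):

    import math

    active_tasks = 50        # assumed outstanding tasks
    active_slots = 8         # assumed currently provisioned slots
    tasks_per_node = 4
    nodes_per_block = 2
    active_blocks = 1
    max_blocks = 10

    excess_slots = active_tasks - active_slots    # simplified shortfall: 42
    excess_blocks = math.ceil(float(excess_slots) / (tasks_per_node * nodes_per_block))
    excess_blocks = min(excess_blocks, max_blocks - active_blocks)
    print(excess_blocks)     # ceil(42 / 8) = 6 blocks requested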
parsl/monitoring/monitoring.py CHANGED
@@ -3,13 +3,14 @@ from __future__ import annotations
  import os
  import time
  import logging
+ import multiprocessing.synchronize as ms
  import typeguard
- import zmq

  import queue

  from parsl.multiprocessing import ForkProcess, SizedQueue
  from multiprocessing import Process
+ from multiprocessing import Event
  from multiprocessing.queues import Queue
  from parsl.log_utils import set_file_logger
  from parsl.utils import RepresentationMixin
@@ -18,6 +19,7 @@ from parsl.utils import setproctitle

  from parsl.serialize import deserialize

+ from parsl.monitoring.radios import MultiprocessingQueueRadio
  from parsl.monitoring.router import router_starter
  from parsl.monitoring.message_type import MessageType
  from parsl.monitoring.types import AddressedMonitoringMessage
@@ -90,12 +92,6 @@ class MonitoringHub(RepresentationMixin):
                 Default: 30 seconds
          """

-         # Any is used to disable typechecking on uses of _dfk_channel,
-         # because it is used in the code as if it points to a channel, but
-         # the static type is that it can also be None. The code relies on
-         # .start() being called and initialising this to a real channel.
-         self._dfk_channel = None  # type: Any
-
          if _db_manager_excepts:
              raise _db_manager_excepts

@@ -157,8 +153,12 @@ class MonitoringHub(RepresentationMixin):
          self.block_msgs: Queue[AddressedMonitoringMessage]
          self.block_msgs = SizedQueue()

+         self.router_exit_event: ms.Event
+         self.router_exit_event = Event()
+
          self.router_proc = ForkProcess(target=router_starter,
-                                        args=(comm_q, self.exception_q, self.priority_msgs, self.node_msgs, self.block_msgs, self.resource_msgs),
+                                        args=(comm_q, self.exception_q, self.priority_msgs, self.node_msgs,
+                                              self.block_msgs, self.resource_msgs, self.router_exit_event),
                                         kwargs={"hub_address": self.hub_address,
                                                 "udp_port": self.hub_port,
                                                 "zmq_port_range": self.hub_port_range,
@@ -191,8 +191,12 @@ class MonitoringHub(RepresentationMixin):
          self.filesystem_proc.start()
          logger.info(f"Started filesystem radio receiver process {self.filesystem_proc.pid}")

+         self.radio = MultiprocessingQueueRadio(self.block_msgs)
+
          try:
              comm_q_result = comm_q.get(block=True, timeout=120)
+             comm_q.close()
+             comm_q.join_thread()
          except queue.Empty:
              logger.error("Hub has not completed initialization in 120s. Aborting")
              raise Exception("Hub failed to start")
@@ -205,14 +209,6 @@ class MonitoringHub(RepresentationMixin):

          self.monitoring_hub_url = "udp://{}:{}".format(self.hub_address, udp_port)

-         context = zmq.Context()
-         self.dfk_channel_timeout = 10000  # in milliseconds
-         self._dfk_channel = context.socket(zmq.DEALER)
-         self._dfk_channel.setsockopt(zmq.LINGER, 0)
-         self._dfk_channel.set_hwm(0)
-         self._dfk_channel.setsockopt(zmq.SNDTIMEO, self.dfk_channel_timeout)
-         self._dfk_channel.connect("tcp://{}:{}".format(self.hub_address, zmq_port))
-
          logger.info("Monitoring Hub initialized")

          return zmq_port
@@ -220,11 +216,7 @@ class MonitoringHub(RepresentationMixin):
      # TODO: tighten the Any message format
      def send(self, mtype: MessageType, message: Any) -> None:
          logger.debug("Sending message type {}".format(mtype))
-         try:
-             self._dfk_channel.send_pyobj((mtype, message))
-         except zmq.Again:
-             logger.exception(
-                 "The monitoring message sent from DFK to router timed-out after {}ms".format(self.dfk_channel_timeout))
+         self.radio.send((mtype, message))

      def close(self) -> None:
          logger.info("Terminating Monitoring Hub")
@@ -235,9 +227,8 @@ class MonitoringHub(RepresentationMixin):
                  logger.error("There was a queued exception (Either router or DBM process got exception much earlier?)")
              except queue.Empty:
                  break
-         if self._dfk_channel and self.monitoring_hub_active:
+         if self.monitoring_hub_active:
              self.monitoring_hub_active = False
-             self._dfk_channel.close()
          if exception_msgs:
              for exception_msg in exception_msgs:
                  logger.error(
@@ -249,6 +240,8 @@ class MonitoringHub(RepresentationMixin):
                  self.router_proc.terminate()
                  self.dbm_proc.terminate()
                  self.filesystem_proc.terminate()
+         logger.info("Setting router termination event")
+         self.router_exit_event.set()
          logger.info("Waiting for router to terminate")
          self.router_proc.join()
          logger.debug("Finished waiting for router termination")
@@ -267,6 +260,19 @@ class MonitoringHub(RepresentationMixin):
          self.filesystem_proc.terminate()
          self.filesystem_proc.join()

+         logger.info("Closing monitoring multiprocessing queues")
+         self.exception_q.close()
+         self.exception_q.join_thread()
+         self.priority_msgs.close()
+         self.priority_msgs.join_thread()
+         self.resource_msgs.close()
+         self.resource_msgs.join_thread()
+         self.node_msgs.close()
+         self.node_msgs.join_thread()
+         self.block_msgs.close()
+         self.block_msgs.join_thread()
+         logger.info("Closed monitoring multiprocessing queues")
+

  @wrap_with_logs
  def filesystem_receiver(logdir: str, q: "queue.Queue[AddressedMonitoringMessage]", run_dir: str) -> None:
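
Router shutdown now relies on an explicit exit event plus queue cleanup instead of closing a ZMQ channel from the DFK side. A generic sketch of that event-driven shutdown pattern (names and structure are illustrative, not the router's actual internals):

    import multiprocessing
    import multiprocessing.synchronize as ms
    import time

    def worker(exit_event: ms.Event) -> None:
        # Loop until the parent signals shutdown via the shared Event.
        while not exit_event.is_set():
            time.sleep(0.1)

    if __name__ == "__main__":
        exit_event = multiprocessing.Event()
        proc = multiprocessing.Process(target=worker, args=(exit_event,))
        proc.start()

        exit_event.set()   # ask the child to exit cleanly...
        proc.join()        # ...then wait for it, as MonitoringHub.close() does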
parsl/monitoring/radios.py CHANGED
@@ -6,6 +6,7 @@ import logging

  from abc import ABCMeta, abstractmethod

+ from multiprocessing.queues import Queue
  from typing import Optional

  from parsl.serialize import serialize
@@ -173,3 +174,17 @@ class UDPRadio(MonitoringRadio):
              logging.error("Could not send message within timeout limit")
              return
          return
+
+
+ class MultiprocessingQueueRadio(MonitoringRadio):
+     """A monitoring radio which connects over a multiprocessing Queue.
+     This radio is intended to be used on the submit side, where components
+     in the submit process, or processes launched by multiprocessing, will have
+     access to a Queue shared with the monitoring database code (bypassing the
+     monitoring router).
+     """
+     def __init__(self, queue: Queue) -> None:
+         self.queue = queue
+
+     def send(self, message: object) -> None:
+         self.queue.put((message, 0))
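
A hedged usage sketch for the new radio: any multiprocessing queue works for illustration, although inside parsl the queue passed in is the hub's block_msgs SizedQueue. send() wraps each message as (message, 0), so the receiving side unpacks a 2-tuple:

    from multiprocessing import Queue

    from parsl.monitoring.radios import MultiprocessingQueueRadio

    q: Queue = Queue()
    radio = MultiprocessingQueueRadio(q)

    # Any picklable object can be sent; parsl sends (MessageType, payload) tuples.
    radio.send({"example": "message"})

    message, _addr = q.get()   # the radio tags every message with a 0 "address"
    print(message)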