parsl 2024.4.8__py3-none-any.whl → 2024.4.22__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- parsl/addresses.py +2 -2
- parsl/app/bash.py +10 -2
- parsl/app/errors.py +3 -5
- parsl/data_provider/data_manager.py +2 -1
- parsl/data_provider/zip.py +104 -0
- parsl/dataflow/dflow.py +92 -43
- parsl/dataflow/futures.py +26 -12
- parsl/executors/base.py +28 -9
- parsl/executors/high_throughput/executor.py +14 -19
- parsl/executors/high_throughput/process_worker_pool.py +3 -1
- parsl/executors/status_handling.py +81 -1
- parsl/executors/taskvine/executor.py +13 -2
- parsl/executors/workqueue/executor.py +14 -3
- parsl/jobs/job_status_poller.py +19 -113
- parsl/jobs/strategy.py +22 -27
- parsl/monitoring/monitoring.py +29 -23
- parsl/monitoring/radios.py +15 -0
- parsl/monitoring/router.py +7 -6
- parsl/providers/local/local.py +1 -1
- parsl/tests/configs/htex_local_alternate.py +2 -1
- parsl/tests/configs/taskvine_ex.py +1 -2
- parsl/tests/configs/workqueue_ex.py +1 -2
- parsl/tests/conftest.py +6 -7
- parsl/tests/test_bash_apps/test_basic.py +7 -4
- parsl/tests/test_bash_apps/test_error_codes.py +0 -3
- parsl/tests/test_bash_apps/test_kwarg_storage.py +0 -1
- parsl/tests/test_bash_apps/test_memoize.py +0 -2
- parsl/tests/test_bash_apps/test_memoize_ignore_args.py +0 -1
- parsl/tests/test_bash_apps/test_memoize_ignore_args_regr.py +0 -1
- parsl/tests/test_bash_apps/test_multiline.py +0 -1
- parsl/tests/test_bash_apps/test_stdout.py +11 -6
- parsl/tests/test_checkpointing/test_task_exit.py +1 -1
- parsl/tests/test_htex/test_zmq_binding.py +1 -0
- parsl/tests/test_monitoring/test_basic.py +46 -21
- parsl/tests/test_monitoring/test_fuzz_zmq.py +10 -1
- parsl/tests/test_monitoring/test_stdouterr.py +137 -0
- parsl/tests/test_python_apps/test_context_manager.py +3 -3
- parsl/tests/test_python_apps/test_outputs.py +0 -1
- parsl/tests/test_scaling/test_regression_1621.py +11 -11
- parsl/tests/test_scaling/test_scale_down_htex_unregistered.py +74 -0
- parsl/tests/test_staging/test_staging_stdout.py +61 -0
- parsl/tests/test_staging/test_zip_out.py +113 -0
- parsl/utils.py +11 -2
- parsl/version.py +1 -1
- {parsl-2024.4.8.data → parsl-2024.4.22.data}/scripts/process_worker_pool.py +3 -1
- {parsl-2024.4.8.dist-info → parsl-2024.4.22.dist-info}/METADATA +5 -4
- {parsl-2024.4.8.dist-info → parsl-2024.4.22.dist-info}/RECORD +53 -48
- {parsl-2024.4.8.data → parsl-2024.4.22.data}/scripts/exec_parsl_function.py +0 -0
- {parsl-2024.4.8.data → parsl-2024.4.22.data}/scripts/parsl_coprocess.py +0 -0
- {parsl-2024.4.8.dist-info → parsl-2024.4.22.dist-info}/LICENSE +0 -0
- {parsl-2024.4.8.dist-info → parsl-2024.4.22.dist-info}/WHEEL +0 -0
- {parsl-2024.4.8.dist-info → parsl-2024.4.22.dist-info}/entry_points.txt +0 -0
- {parsl-2024.4.8.dist-info → parsl-2024.4.22.dist-info}/top_level.txt +0 -0
parsl/executors/status_handling.py
CHANGED
@@ -1,15 +1,18 @@
 from __future__ import annotations
+import datetime
 import logging
 import threading
+import time
 from itertools import compress
 from abc import abstractmethod, abstractproperty
 from concurrent.futures import Future
-from typing import List, Any, Dict, Optional, Tuple, Union, Callable
+from typing import List, Any, Dict, Optional, Sequence, Tuple, Union, Callable
 
 from parsl.executors.base import ParslExecutor
 from parsl.executors.errors import BadStateException, ScalingFailed
 from parsl.jobs.states import JobStatus, JobState
 from parsl.jobs.error_handlers import simple_error_handler, noop_error_handler
+from parsl.monitoring.message_type import MessageType
 from parsl.providers.base import ExecutionProvider
 from parsl.utils import AtomicIDCounter
 
@@ -71,6 +74,9 @@ class BlockProviderExecutor(ParslExecutor):
         self.blocks_to_job_id = {}  # type: Dict[str, str]
         self.job_ids_to_block = {}  # type: Dict[str, str]
 
+        self._last_poll_time = 0.0
+        self._status = {}  # type: Dict[str, JobStatus]
+
     def _make_status_dict(self, block_ids: List[str], status_list: List[JobStatus]) -> Dict[str, JobStatus]:
         """Given a list of block ids and a list of corresponding status strings,
         returns a dictionary mapping each block id to the corresponding status
@@ -234,3 +240,77 @@ class BlockProviderExecutor(ParslExecutor):
     @abstractproperty
     def workers_per_node(self) -> Union[int, float]:
         pass
+
+    def send_monitoring_info(self, status: Dict) -> None:
+        # Send monitoring info for HTEX when monitoring enabled
+        if self.monitoring_radio:
+            msg = self.create_monitoring_info(status)
+            logger.debug("Sending message {} to hub from job status poller".format(msg))
+            self.monitoring_radio.send((MessageType.BLOCK_INFO, msg))
+
+    def create_monitoring_info(self, status: Dict[str, JobStatus]) -> Sequence[object]:
+        """Create a monitoring message for each block based on the poll status.
+        """
+        msg = []
+        for bid, s in status.items():
+            d: Dict[str, Any] = {}
+            d['run_id'] = self.run_id
+            d['status'] = s.status_name
+            d['timestamp'] = datetime.datetime.now()
+            d['executor_label'] = self.label
+            d['job_id'] = self.blocks_to_job_id.get(bid, None)
+            d['block_id'] = bid
+            msg.append(d)
+        return msg
+
+    def poll_facade(self) -> None:
+        now = time.time()
+        if now >= self._last_poll_time + self.status_polling_interval:
+            previous_status = self._status
+            self._status = self.status()
+            self._last_poll_time = now
+            delta_status = {}
+            for block_id in self._status:
+                if block_id not in previous_status \
+                   or previous_status[block_id].state != self._status[block_id].state:
+                    delta_status[block_id] = self._status[block_id]
+
+            if delta_status:
+                self.send_monitoring_info(delta_status)
+
+    @property
+    def status_facade(self) -> Dict[str, JobStatus]:
+        """Return the status of all jobs/blocks of the executor of this poller.
+
+        :return: a dictionary mapping block ids (in string) to job status
+        """
+        return self._status
+
+    def scale_in_facade(self, n: int, max_idletime: Optional[float] = None) -> List[str]:
+
+        if max_idletime is None:
+            block_ids = self.scale_in(n)
+        else:
+            # This is a HighThroughputExecutor-specific interface violation.
+            # This code hopes, through pan-codebase reasoning, that this
+            # scale_in method really does come from HighThroughputExecutor,
+            # and so does have an extra max_idletime parameter not present
+            # in the executor interface.
+            block_ids = self.scale_in(n, max_idletime=max_idletime)  # type: ignore[call-arg]
+        if block_ids is not None:
+            new_status = {}
+            for block_id in block_ids:
+                new_status[block_id] = JobStatus(JobState.CANCELLED)
+                del self._status[block_id]
+            self.send_monitoring_info(new_status)
+        return block_ids
+
+    def scale_out_facade(self, n: int) -> List[str]:
+        block_ids = self.scale_out(n)
+        if block_ids is not None:
+            new_status = {}
+            for block_id in block_ids:
+                new_status[block_id] = JobStatus(JobState.PENDING)
+            self.send_monitoring_info(new_status)
+            self._status.update(new_status)
+        return block_ids
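The poll_facade method added above only forwards blocks whose JobState changed since the previous poll. A minimal standalone sketch of that delta computation, using a stand-in Status class instead of parsl's JobStatus (illustrative only, not code from the package):

from dataclasses import dataclass
from typing import Dict

@dataclass
class Status:
    """Stand-in for parsl's JobStatus; only the state field matters here."""
    state: str

def changed_blocks(previous: Dict[str, Status], current: Dict[str, Status]) -> Dict[str, Status]:
    # Keep blocks that are new or whose state differs from the last poll,
    # mirroring the delta_status loop in poll_facade.
    return {block_id: status for block_id, status in current.items()
            if block_id not in previous or previous[block_id].state != status.state}

# Only block "1" changed state, so only it would be reported to monitoring.
prev = {"0": Status("RUNNING"), "1": Status("PENDING")}
curr = {"0": Status("RUNNING"), "1": Status("RUNNING")}
assert changed_blocks(prev, curr) == {"1": Status("RUNNING")}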
parsl/executors/taskvine/executor.py
CHANGED
@@ -596,7 +596,7 @@ class TaskVineExecutor(BlockProviderExecutor, putils.RepresentationMixin):
     def workers_per_node(self) -> Union[int, float]:
         return 1
 
-    def scale_in(self, count):
+    def scale_in(self, count: int) -> List[str]:
         """Scale in method. Cancel a given number of blocks
         """
         # Obtain list of blocks to kill
@@ -605,9 +605,14 @@
 
         # Cancel the blocks provisioned
         if self.provider:
-            self.provider.cancel(kill_ids)
+            logger.info(f"Scaling in jobs: {kill_ids}")
+            r = self.provider.cancel(kill_ids)
+            job_ids = self._filter_scale_in_ids(kill_ids, r)
+            block_ids_killed = [self.job_ids_to_block[jid] for jid in job_ids]
+            return block_ids_killed
         else:
             logger.error("No execution provider available to scale")
+            return []
 
     def shutdown(self, *args, **kwargs):
         """Shutdown the executor. Sets flag to cancel the submit process and
@@ -639,6 +644,12 @@
         logger.debug("Joining on factory process")
         self._factory_process.join()
 
+        # Shutdown multiprocessing queues
+        self._ready_task_queue.close()
+        self._ready_task_queue.join_thread()
+        self._finished_task_queue.close()
+        self._finished_task_queue.join_thread()
+
         self._is_shutdown = True
         logger.debug("TaskVine shutdown completed")
 
parsl/executors/workqueue/executor.py
CHANGED
@@ -691,7 +691,7 @@ class WorkQueueExecutor(BlockProviderExecutor, putils.RepresentationMixin):
     def workers_per_node(self) -> Union[int, float]:
         return 1
 
-    def scale_in(self, count):
+    def scale_in(self, count: int) -> List[str]:
         """Scale in method.
         """
         # Obtain list of blocks to kill
@@ -700,9 +700,14 @@
 
         # Cancel the blocks provisioned
        if self.provider:
-            self.provider.cancel(kill_ids)
+            logger.info(f"Scaling in jobs: {kill_ids}")
+            r = self.provider.cancel(kill_ids)
+            job_ids = self._filter_scale_in_ids(kill_ids, r)
+            block_ids_killed = [self.job_ids_to_block[jid] for jid in job_ids]
+            return block_ids_killed
         else:
-            logger.error("No execution provider available to scale")
+            logger.error("No execution provider available to scale in")
+            return []
 
     def shutdown(self, *args, **kwargs):
         """Shutdown the executor. Sets flag to cancel the submit process and
@@ -730,6 +735,12 @@
         logger.debug("Joining on collector thread")
         self.collector_thread.join()
 
+        logger.debug("Closing multiprocessing queues")
+        self.task_queue.close()
+        self.task_queue.join_thread()
+        self.collector_queue.close()
+        self.collector_queue.join_thread()
+
         self.is_shutdown = True
         logger.debug("Work Queue shutdown completed")
 
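Both executor shutdown paths above now close their multiprocessing queues explicitly. An illustrative sketch of that close()/join_thread() pattern on a plain multiprocessing.Queue (not parsl code; the queue here is hypothetical):

from multiprocessing import Queue

q = Queue()
q.put("final message")
q.get()            # consumer drains the queue first

q.close()          # no further puts from this process
q.join_thread()    # wait for the queue's feeder thread to flush and exit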
parsl/jobs/job_status_poller.py
CHANGED
@@ -1,13 +1,9 @@
 import logging
 import parsl
-import time
-import zmq
-from typing import Dict, List, Sequence, Optional, Union
+from typing import List, Sequence, Optional, Union
 
-from parsl.jobs.states import JobStatus, JobState
 from parsl.jobs.strategy import Strategy
 from parsl.executors.status_handling import BlockProviderExecutor
-from parsl.monitoring.message_type import MessageType
 
 
 from parsl.utils import Timer
@@ -16,137 +12,47 @@ from parsl.utils import Timer
 logger = logging.getLogger(__name__)
 
 
-class PolledExecutorFacade:
-    def __init__(self, executor: BlockProviderExecutor, dfk: Optional["parsl.dataflow.dflow.DataFlowKernel"] = None):
-        self._executor = executor
-        self._interval = executor.status_polling_interval
-        self._last_poll_time = 0.0
-        self._status = {}  # type: Dict[str, JobStatus]
-
-        # Create a ZMQ channel to send poll status to monitoring
-        self.monitoring_enabled = False
-        if dfk and dfk.monitoring is not None:
-            self.monitoring_enabled = True
-            hub_address = dfk.hub_address
-            hub_port = dfk.hub_zmq_port
-            context = zmq.Context()
-            self.hub_channel = context.socket(zmq.DEALER)
-            self.hub_channel.set_hwm(0)
-            self.hub_channel.connect("tcp://{}:{}".format(hub_address, hub_port))
-            logger.info("Monitoring enabled on job status poller")
-
-    def _should_poll(self, now: float) -> bool:
-        return now >= self._last_poll_time + self._interval
-
-    def poll(self, now: float) -> None:
-        if self._should_poll(now):
-            previous_status = self._status
-            self._status = self._executor.status()
-            self._last_poll_time = now
-            delta_status = {}
-            for block_id in self._status:
-                if block_id not in previous_status \
-                   or previous_status[block_id].state != self._status[block_id].state:
-                    delta_status[block_id] = self._status[block_id]
-
-            if delta_status:
-                self.send_monitoring_info(delta_status)
-
-    def send_monitoring_info(self, status: Dict) -> None:
-        # Send monitoring info for HTEX when monitoring enabled
-        if self.monitoring_enabled:
-            msg = self._executor.create_monitoring_info(status)
-            logger.debug("Sending message {} to hub from job status poller".format(msg))
-            self.hub_channel.send_pyobj((MessageType.BLOCK_INFO, msg))
-
-    @property
-    def status(self) -> Dict[str, JobStatus]:
-        """Return the status of all jobs/blocks of the executor of this poller.
-
-        :return: a dictionary mapping block ids (in string) to job status
-        """
-        return self._status
-
-    @property
-    def executor(self) -> BlockProviderExecutor:
-        return self._executor
-
-    def scale_in(self, n: int, max_idletime: Optional[float] = None) -> List[str]:
-
-        if max_idletime is None:
-            block_ids = self._executor.scale_in(n)
-        else:
-            # This is a HighThroughputExecutor-specific interface violation.
-            # This code hopes, through pan-codebase reasoning, that this
-            # scale_in method really does come from HighThroughputExecutor,
-            # and so does have an extra max_idletime parameter not present
-            # in the executor interface.
-            block_ids = self._executor.scale_in(n, max_idletime=max_idletime)  # type: ignore[call-arg]
-        if block_ids is not None:
-            new_status = {}
-            for block_id in block_ids:
-                new_status[block_id] = JobStatus(JobState.CANCELLED)
-                del self._status[block_id]
-            self.send_monitoring_info(new_status)
-        return block_ids
-
-    def scale_out(self, n: int) -> List[str]:
-        block_ids = self._executor.scale_out(n)
-        if block_ids is not None:
-            new_status = {}
-            for block_id in block_ids:
-                new_status[block_id] = JobStatus(JobState.PENDING)
-            self.send_monitoring_info(new_status)
-            self._status.update(new_status)
-        return block_ids
-
-    def __repr__(self) -> str:
-        return self._status.__repr__()
-
-
 class JobStatusPoller(Timer):
     def __init__(self, *, strategy: Optional[str], max_idletime: float,
                  strategy_period: Union[float, int],
-                 dfk: Optional["parsl.dataflow.dflow.DataFlowKernel"] = None) -> None:
-        self._poll_items = []  # type: List[PolledExecutorFacade]
-        self.dfk = dfk
+                 monitoring: Optional["parsl.monitoring.radios.MonitoringRadio"] = None) -> None:
+        self._executors = []  # type: List[BlockProviderExecutor]
         self._strategy = Strategy(strategy=strategy,
                                   max_idletime=max_idletime)
         super().__init__(self.poll, interval=strategy_period, name="JobStatusPoller")
 
     def poll(self) -> None:
         self._update_state()
-        self._run_error_handlers(self._poll_items)
-        self._strategy.strategize(self._poll_items)
+        self._run_error_handlers(self._executors)
+        self._strategy.strategize(self._executors)
 
-    def _run_error_handlers(self, status: List[PolledExecutorFacade]) -> None:
-        for es in status:
-            es.executor.handle_errors(es.status)
+    def _run_error_handlers(self, executors: List[BlockProviderExecutor]) -> None:
+        for e in executors:
+            e.handle_errors(e.status_facade)
 
     def _update_state(self) -> None:
-        now = time.time()
-        for item in self._poll_items:
-            item.poll(now)
+        for item in self._executors:
+            item.poll_facade()
 
     def add_executors(self, executors: Sequence[BlockProviderExecutor]) -> None:
         for executor in executors:
             if executor.status_polling_interval > 0:
                 logger.debug("Adding executor {}".format(executor.label))
-                self._poll_items.append(PolledExecutorFacade(executor, self.dfk))
+                self._executors.append(executor)
         self._strategy.add_executors(executors)
 
-    def close(self):
-        super().close()
-        for executor_facade in self._poll_items:
-            if not executor_facade.executor.bad_state_is_set:
-                logger.info(f"Scaling in executor {executor_facade.executor.label}")
+    def close(self, timeout: Optional[float] = None) -> None:
+        super().close(timeout)
+        for executor in self._executors:
+            if not executor.bad_state_is_set:
+                logger.info(f"Scaling in executor {executor.label}")
 
                 # this code needs to be at least as many blocks as need
                 # cancelling, but it is safe to be more, as the scaling
                 # code will cope with being asked to cancel more blocks
                 # than exist.
-                block_count = len(executor_facade.status)
-                executor_facade.scale_in(block_count)
+                block_count = len(executor.status_facade)
+                executor.scale_in_facade(block_count)
 
             else:  # and bad_state_is_set
-                logger.warning(f"Not scaling in executor {executor_facade.executor.label} because it is in bad state")
+                logger.warning(f"Not scaling in executor {executor.label} because it is in bad state")
parsl/jobs/strategy.py
CHANGED
@@ -5,8 +5,6 @@ import math
 import warnings
 from typing import Dict, List, Optional, Sequence, TypedDict
 
-import parsl.jobs.job_status_poller as jsp
-
 from parsl.executors import HighThroughputExecutor
 from parsl.executors.base import ParslExecutor
 from parsl.executors.status_handling import BlockProviderExecutor
@@ -150,22 +148,21 @@
         for executor in executors:
             self.executors[executor.label] = {'idle_since': None, 'first': True}
 
-    def _strategy_init_only(self, executor_facades: List[jsp.PolledExecutorFacade]) -> None:
+    def _strategy_init_only(self, executors: List[BlockProviderExecutor]) -> None:
         """Scale up to init_blocks at the start, then nothing more.
         """
-        for ef in executor_facades:
-            executor = ef.executor
+        for executor in executors:
            if self.executors[executor.label]['first']:
                 logger.debug(f"strategy_init_only: scaling out {executor.provider.init_blocks} initial blocks for {executor.label}")
-                ef.scale_out(executor.provider.init_blocks)
+                executor.scale_out_facade(executor.provider.init_blocks)
                 self.executors[executor.label]['first'] = False
             else:
                 logger.debug("strategy_init_only: doing nothing")
 
-    def _strategy_simple(self, executor_facades: List[jsp.PolledExecutorFacade]) -> None:
-        self._general_strategy(executor_facades, strategy_type='simple')
+    def _strategy_simple(self, executors: List[BlockProviderExecutor]) -> None:
+        self._general_strategy(executors, strategy_type='simple')
 
-    def _strategy_htex_auto_scale(self, executor_facades: List[jsp.PolledExecutorFacade]) -> None:
+    def _strategy_htex_auto_scale(self, executors: List[BlockProviderExecutor]) -> None:
         """HTEX specific auto scaling strategy
 
         This strategy works only for HTEX. This strategy will scale out by
@@ -180,30 +177,25 @@
         expected to scale in effectively only when # of workers, or tasks executing
         per block is close to 1.
         """
-        self._general_strategy(executor_facades, strategy_type='htex')
+        self._general_strategy(executors, strategy_type='htex')
 
     @wrap_with_logs
-    def _general_strategy(self, executor_facades: List[jsp.PolledExecutorFacade], *, strategy_type: str) -> None:
-        logger.debug(f"general strategy starting with strategy_type {strategy_type} for {len(executor_facades)} executors")
+    def _general_strategy(self, executors: List[BlockProviderExecutor], *, strategy_type: str) -> None:
+        logger.debug(f"general strategy starting with strategy_type {strategy_type} for {len(executors)} executors")
 
-        for ef in executor_facades:
-            executor = ef.executor
+        for executor in executors:
             label = executor.label
-            if not isinstance(executor, BlockProviderExecutor):
-                logger.debug(f"Not strategizing for executor {label} because scaling not enabled")
-                continue
             logger.debug(f"Strategizing for executor {label}")
 
             if self.executors[label]['first']:
-                executor = ef.executor
                 logger.debug(f"Scaling out {executor.provider.init_blocks} initial blocks for {label}")
-                ef.scale_out(executor.provider.init_blocks)
+                executor.scale_out_facade(executor.provider.init_blocks)
                 self.executors[label]['first'] = False
 
             # Tasks that are either pending completion
             active_tasks = executor.outstanding
 
-            status = ef.status
+            status = executor.status_facade
 
             # FIXME we need to handle case where provider does not define these
             # FIXME probably more of this logic should be moved to the provider
@@ -247,23 +239,26 @@
                 else:
                     # We want to make sure that max_idletime is reached
                     # before killing off resources
-                    logger.debug(f"Strategy case 1b: Executor has no active tasks, and more ({active_blocks}) than minimum blocks ({min_blocks})")
+                    logger.debug(f"Strategy case 1b: Executor has no active tasks, and more ({active_blocks})"
+                                 f" than minimum blocks ({min_blocks})")
 
                     if not self.executors[executor.label]['idle_since']:
                         logger.debug(f"Starting idle timer for executor. If idle time exceeds {self.max_idletime}s, blocks will be scaled in")
                         self.executors[executor.label]['idle_since'] = time.time()
-
                     idle_since = self.executors[executor.label]['idle_since']
+                    assert idle_since is not None, "The `if` statement above this assert should have forced idle time to be not-None"
+
                     idle_duration = time.time() - idle_since
                     if idle_duration > self.max_idletime:
                         # We have resources idle for the max duration,
                         # we have to scale_in now.
                         logger.debug(f"Idle time has reached {self.max_idletime}s for executor {label}; scaling in")
-                        ef.scale_in(active_blocks - min_blocks)
+                        executor.scale_in_facade(active_blocks - min_blocks)
 
                     else:
                         logger.debug(
-                            f"Idle time {idle_duration}s is less than max_idletime {self.max_idletime}s for executor {label}; not scaling in")
+                            f"Idle time {idle_duration}s is less than max_idletime {self.max_idletime}s"
+                            f" for executor {label}; not scaling in")
 
             # Case 2
             # More tasks than the available slots.
@@ -282,7 +277,7 @@
                     excess_blocks = math.ceil(float(excess_slots) / (tasks_per_node * nodes_per_block))
                     excess_blocks = min(excess_blocks, max_blocks - active_blocks)
                     logger.debug(f"Requesting {excess_blocks} more blocks")
-                    ef.scale_out(excess_blocks)
+                    executor.scale_out_facade(excess_blocks)
 
             elif active_slots == 0 and active_tasks > 0:
                 logger.debug("Strategy case 4a: No active slots but some active tasks - could scale out by a single block")
@@ -291,7 +286,7 @@
                 if active_blocks < max_blocks:
                     logger.debug("Requesting single block")
 
-                    ef.scale_out(1)
+                    executor.scale_out_facade(1)
                 else:
                     logger.debug("Not requesting single block, because at maxblocks already")
 
@@ -307,7 +302,7 @@
                     excess_blocks = math.ceil(float(excess_slots) / (tasks_per_node * nodes_per_block))
                     excess_blocks = min(excess_blocks, active_blocks - min_blocks)
                     logger.debug(f"Requesting scaling in by {excess_blocks} blocks with idle time {self.max_idletime}s")
-                    ef.scale_in(excess_blocks, max_idletime=self.max_idletime)
+                    executor.scale_in_facade(excess_blocks, max_idletime=self.max_idletime)
                 else:
                     logger.error("This strategy does not support scaling in except for HighThroughputExecutor - taking no action")
             else:
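For reference, the scale-out arithmetic retained in _general_strategy rounds excess task slots up to whole blocks and caps the request by max_blocks. A worked sketch with made-up numbers (how excess_slots is derived is outside these hunks):

import math

# Hypothetical values, for illustration only.
excess_slots = 34       # outstanding tasks beyond currently provisioned slots
tasks_per_node = 8
nodes_per_block = 2
active_blocks = 1
max_blocks = 4

excess_blocks = math.ceil(float(excess_slots) / (tasks_per_node * nodes_per_block))  # ceil(34 / 16) == 3
excess_blocks = min(excess_blocks, max_blocks - active_blocks)                       # capped at 4 - 1 == 3
print(excess_blocks)  # 3 -> this count is passed to executor.scale_out_facade(excess_blocks)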
parsl/monitoring/monitoring.py
CHANGED
@@ -3,13 +3,14 @@ from __future__ import annotations
 import os
 import time
 import logging
+import multiprocessing.synchronize as ms
 import typeguard
-import zmq
 
 import queue
 
 from parsl.multiprocessing import ForkProcess, SizedQueue
 from multiprocessing import Process
+from multiprocessing import Event
 from multiprocessing.queues import Queue
 from parsl.log_utils import set_file_logger
 from parsl.utils import RepresentationMixin
@@ -18,6 +19,7 @@ from parsl.utils import setproctitle
 
 from parsl.serialize import deserialize
 
+from parsl.monitoring.radios import MultiprocessingQueueRadio
 from parsl.monitoring.router import router_starter
 from parsl.monitoring.message_type import MessageType
 from parsl.monitoring.types import AddressedMonitoringMessage
@@ -90,12 +92,6 @@ class MonitoringHub(RepresentationMixin):
              Default: 30 seconds
         """
 
-        # Any is used to disable typechecking on uses of _dfk_channel,
-        # because it is used in the code as if it points to a channel, but
-        # the static type is that it can also be None. The code relies on
-        # .start() being called and initialising this to a real channel.
-        self._dfk_channel = None  # type: Any
-
         if _db_manager_excepts:
             raise _db_manager_excepts
 
@@ -157,8 +153,12 @@
         self.block_msgs: Queue[AddressedMonitoringMessage]
         self.block_msgs = SizedQueue()
 
+        self.router_exit_event: ms.Event
+        self.router_exit_event = Event()
+
         self.router_proc = ForkProcess(target=router_starter,
-                                       args=(comm_q, self.exception_q, self.priority_msgs, self.node_msgs, self.block_msgs, self.resource_msgs),
+                                       args=(comm_q, self.exception_q, self.priority_msgs, self.node_msgs,
+                                             self.block_msgs, self.resource_msgs, self.router_exit_event),
                                        kwargs={"hub_address": self.hub_address,
                                                "udp_port": self.hub_port,
                                                "zmq_port_range": self.hub_port_range,
@@ -191,8 +191,12 @@
         self.filesystem_proc.start()
         logger.info(f"Started filesystem radio receiver process {self.filesystem_proc.pid}")
 
+        self.radio = MultiprocessingQueueRadio(self.block_msgs)
+
         try:
             comm_q_result = comm_q.get(block=True, timeout=120)
+            comm_q.close()
+            comm_q.join_thread()
         except queue.Empty:
             logger.error("Hub has not completed initialization in 120s. Aborting")
             raise Exception("Hub failed to start")
@@ -205,14 +209,6 @@
 
         self.monitoring_hub_url = "udp://{}:{}".format(self.hub_address, udp_port)
 
-        context = zmq.Context()
-        self.dfk_channel_timeout = 10000  # in milliseconds
-        self._dfk_channel = context.socket(zmq.DEALER)
-        self._dfk_channel.setsockopt(zmq.LINGER, 0)
-        self._dfk_channel.set_hwm(0)
-        self._dfk_channel.setsockopt(zmq.SNDTIMEO, self.dfk_channel_timeout)
-        self._dfk_channel.connect("tcp://{}:{}".format(self.hub_address, zmq_port))
-
         logger.info("Monitoring Hub initialized")
 
         return zmq_port
@@ -220,11 +216,7 @@
     # TODO: tighten the Any message format
     def send(self, mtype: MessageType, message: Any) -> None:
         logger.debug("Sending message type {}".format(mtype))
-        try:
-            self._dfk_channel.send_pyobj((mtype, message))
-        except zmq.Again:
-            logger.exception(
-                "The monitoring message sent from DFK to router timed-out after {}ms".format(self.dfk_channel_timeout))
+        self.radio.send((mtype, message))
 
     def close(self) -> None:
         logger.info("Terminating Monitoring Hub")
@@ -235,9 +227,8 @@
                 logger.error("There was a queued exception (Either router or DBM process got exception much earlier?)")
             except queue.Empty:
                 break
-        if self._dfk_channel and self.monitoring_hub_active:
+        if self.monitoring_hub_active:
             self.monitoring_hub_active = False
-            self._dfk_channel.close()
         if exception_msgs:
             for exception_msg in exception_msgs:
                 logger.error(
@@ -249,6 +240,8 @@
             self.router_proc.terminate()
             self.dbm_proc.terminate()
             self.filesystem_proc.terminate()
+        logger.info("Setting router termination event")
+        self.router_exit_event.set()
         logger.info("Waiting for router to terminate")
         self.router_proc.join()
         logger.debug("Finished waiting for router termination")
@@ -267,6 +260,19 @@
         self.filesystem_proc.terminate()
         self.filesystem_proc.join()
 
+        logger.info("Closing monitoring multiprocessing queues")
+        self.exception_q.close()
+        self.exception_q.join_thread()
+        self.priority_msgs.close()
+        self.priority_msgs.join_thread()
+        self.resource_msgs.close()
+        self.resource_msgs.join_thread()
+        self.node_msgs.close()
+        self.node_msgs.join_thread()
+        self.block_msgs.close()
+        self.block_msgs.join_thread()
+        logger.info("Closed monitoring multiprocessing queues")
+
 
 @wrap_with_logs
 def filesystem_receiver(logdir: str, q: "queue.Queue[AddressedMonitoringMessage]", run_dir: str) -> None:
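MonitoringHub now signals the router with a multiprocessing Event rather than only terminating it. A generic sketch of that exit-event pattern (illustrative; the names here are made up and this is not the router's real API):

import time
from multiprocessing import Event, Process

def router_loop(exit_event) -> None:
    # Keep servicing messages until the parent asks us to stop.
    while not exit_event.is_set():
        time.sleep(0.1)  # stand-in for one poll of the monitoring channels

if __name__ == "__main__":
    exit_event = Event()
    proc = Process(target=router_loop, args=(exit_event,))
    proc.start()
    # ... workflow runs ...
    exit_event.set()   # signal shutdown, as MonitoringHub.close() does with router_exit_event
    proc.join()        # then wait for the router process to exit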
parsl/monitoring/radios.py
CHANGED
@@ -6,6 +6,7 @@ import logging
 
 from abc import ABCMeta, abstractmethod
 
+from multiprocessing.queues import Queue
 from typing import Optional
 
 from parsl.serialize import serialize
@@ -173,3 +174,17 @@ class UDPRadio(MonitoringRadio):
             logging.error("Could not send message within timeout limit")
             return
         return
+
+
+class MultiprocessingQueueRadio(MonitoringRadio):
+    """A monitoring radio intended which connects over a multiprocessing Queue.
+    This radio is intended to be used on the submit side, where components
+    in the submit process, or processes launched by multiprocessing, will have
+    access to a Queue shared with the monitoring database code (bypassing the
+    monitoring router).
+    """
+    def __init__(self, queue: Queue) -> None:
+        self.queue = queue
+
+    def send(self, message: object) -> None:
+        self.queue.put((message, 0))
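A small usage sketch of the new MultiprocessingQueueRadio (requires parsl 2024.4.22 or later; the message payload below is made up, any picklable object works):

from multiprocessing import Queue

from parsl.monitoring.radios import MultiprocessingQueueRadio

q = Queue()
radio = MultiprocessingQueueRadio(q)
radio.send({"example_field": 123})

# The consumer side (the database manager in this design) reads the same queue;
# send() wraps each message in a (message, 0) tuple.
message, _ = q.get()
print(message)  # {'example_field': 123}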
|