parsl 2024.4.8__py3-none-any.whl → 2024.4.15__py3-none-any.whl
This diff compares the contents of two package versions publicly released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
- parsl/data_provider/data_manager.py +2 -1
- parsl/data_provider/zip.py +104 -0
- parsl/dataflow/dflow.py +44 -25
- parsl/dataflow/futures.py +0 -7
- parsl/executors/base.py +12 -9
- parsl/executors/high_throughput/executor.py +14 -19
- parsl/executors/high_throughput/process_worker_pool.py +3 -1
- parsl/executors/status_handling.py +81 -1
- parsl/executors/taskvine/executor.py +7 -2
- parsl/executors/workqueue/executor.py +8 -3
- parsl/jobs/job_status_poller.py +19 -113
- parsl/jobs/strategy.py +22 -27
- parsl/monitoring/monitoring.py +14 -23
- parsl/monitoring/radios.py +15 -0
- parsl/monitoring/router.py +7 -6
- parsl/providers/local/local.py +1 -1
- parsl/tests/configs/htex_local_alternate.py +2 -1
- parsl/tests/configs/taskvine_ex.py +1 -2
- parsl/tests/configs/workqueue_ex.py +1 -2
- parsl/tests/conftest.py +6 -7
- parsl/tests/test_bash_apps/test_basic.py +5 -4
- parsl/tests/test_bash_apps/test_error_codes.py +0 -3
- parsl/tests/test_bash_apps/test_kwarg_storage.py +0 -1
- parsl/tests/test_bash_apps/test_memoize.py +0 -2
- parsl/tests/test_bash_apps/test_memoize_ignore_args.py +0 -1
- parsl/tests/test_bash_apps/test_memoize_ignore_args_regr.py +0 -1
- parsl/tests/test_bash_apps/test_multiline.py +0 -1
- parsl/tests/test_bash_apps/test_stdout.py +11 -6
- parsl/tests/test_monitoring/test_basic.py +46 -21
- parsl/tests/test_monitoring/test_fuzz_zmq.py +10 -1
- parsl/tests/test_python_apps/test_outputs.py +0 -1
- parsl/tests/test_scaling/test_scale_down_htex_unregistered.py +74 -0
- parsl/tests/test_staging/test_zip_out.py +113 -0
- parsl/version.py +1 -1
- {parsl-2024.4.8.data → parsl-2024.4.15.data}/scripts/process_worker_pool.py +3 -1
- {parsl-2024.4.8.dist-info → parsl-2024.4.15.dist-info}/METADATA +3 -2
- {parsl-2024.4.8.dist-info → parsl-2024.4.15.dist-info}/RECORD +43 -40
- {parsl-2024.4.8.data → parsl-2024.4.15.data}/scripts/exec_parsl_function.py +0 -0
- {parsl-2024.4.8.data → parsl-2024.4.15.data}/scripts/parsl_coprocess.py +0 -0
- {parsl-2024.4.8.dist-info → parsl-2024.4.15.dist-info}/LICENSE +0 -0
- {parsl-2024.4.8.dist-info → parsl-2024.4.15.dist-info}/WHEEL +0 -0
- {parsl-2024.4.8.dist-info → parsl-2024.4.15.dist-info}/entry_points.txt +0 -0
- {parsl-2024.4.8.dist-info → parsl-2024.4.15.dist-info}/top_level.txt +0 -0
parsl/jobs/job_status_poller.py
CHANGED
@@ -1,13 +1,9 @@
 import logging
 import parsl
-import time
-import zmq
-from typing import Dict, List, Sequence, Optional, Union
+from typing import List, Sequence, Optional, Union

-from parsl.jobs.states import JobStatus, JobState
 from parsl.jobs.strategy import Strategy
 from parsl.executors.status_handling import BlockProviderExecutor
-from parsl.monitoring.message_type import MessageType


 from parsl.utils import Timer
@@ -16,137 +12,47 @@ from parsl.utils import Timer
 logger = logging.getLogger(__name__)


-class PolledExecutorFacade:
-    def __init__(self, executor: BlockProviderExecutor, dfk: Optional["parsl.dataflow.dflow.DataFlowKernel"] = None):
-        self._executor = executor
-        self._interval = executor.status_polling_interval
-        self._last_poll_time = 0.0
-        self._status = {}  # type: Dict[str, JobStatus]
-
-        # Create a ZMQ channel to send poll status to monitoring
-        self.monitoring_enabled = False
-        if dfk and dfk.monitoring is not None:
-            self.monitoring_enabled = True
-            hub_address = dfk.hub_address
-            hub_port = dfk.hub_zmq_port
-            context = zmq.Context()
-            self.hub_channel = context.socket(zmq.DEALER)
-            self.hub_channel.set_hwm(0)
-            self.hub_channel.connect("tcp://{}:{}".format(hub_address, hub_port))
-            logger.info("Monitoring enabled on job status poller")
-
-    def _should_poll(self, now: float) -> bool:
-        return now >= self._last_poll_time + self._interval
-
-    def poll(self, now: float) -> None:
-        if self._should_poll(now):
-            previous_status = self._status
-            self._status = self._executor.status()
-            self._last_poll_time = now
-            delta_status = {}
-            for block_id in self._status:
-                if block_id not in previous_status \
-                        or previous_status[block_id].state != self._status[block_id].state:
-                    delta_status[block_id] = self._status[block_id]
-
-            if delta_status:
-                self.send_monitoring_info(delta_status)
-
-    def send_monitoring_info(self, status: Dict) -> None:
-        # Send monitoring info for HTEX when monitoring enabled
-        if self.monitoring_enabled:
-            msg = self._executor.create_monitoring_info(status)
-            logger.debug("Sending message {} to hub from job status poller".format(msg))
-            self.hub_channel.send_pyobj((MessageType.BLOCK_INFO, msg))
-
-    @property
-    def status(self) -> Dict[str, JobStatus]:
-        """Return the status of all jobs/blocks of the executor of this poller.
-
-        :return: a dictionary mapping block ids (in string) to job status
-        """
-        return self._status
-
-    @property
-    def executor(self) -> BlockProviderExecutor:
-        return self._executor
-
-    def scale_in(self, n: int, max_idletime: Optional[float] = None) -> List[str]:
-
-        if max_idletime is None:
-            block_ids = self._executor.scale_in(n)
-        else:
-            # This is a HighThroughputExecutor-specific interface violation.
-            # This code hopes, through pan-codebase reasoning, that this
-            # scale_in method really does come from HighThroughputExecutor,
-            # and so does have an extra max_idletime parameter not present
-            # in the executor interface.
-            block_ids = self._executor.scale_in(n, max_idletime=max_idletime)  # type: ignore[call-arg]
-        if block_ids is not None:
-            new_status = {}
-            for block_id in block_ids:
-                new_status[block_id] = JobStatus(JobState.CANCELLED)
-                del self._status[block_id]
-            self.send_monitoring_info(new_status)
-        return block_ids
-
-    def scale_out(self, n: int) -> List[str]:
-        block_ids = self._executor.scale_out(n)
-        if block_ids is not None:
-            new_status = {}
-            for block_id in block_ids:
-                new_status[block_id] = JobStatus(JobState.PENDING)
-            self.send_monitoring_info(new_status)
-            self._status.update(new_status)
-        return block_ids
-
-    def __repr__(self) -> str:
-        return self._status.__repr__()
-
-
 class JobStatusPoller(Timer):
     def __init__(self, *, strategy: Optional[str], max_idletime: float,
                  strategy_period: Union[float, int],
-                 dfk: Optional["parsl.dataflow.dflow.DataFlowKernel"] = None) -> None:
-        self._executor_facades = []  # type: List[PolledExecutorFacade]
-        self.dfk = dfk
+                 monitoring: Optional["parsl.monitoring.radios.MonitoringRadio"] = None) -> None:
+        self._executors = []  # type: List[BlockProviderExecutor]
         self._strategy = Strategy(strategy=strategy,
                                   max_idletime=max_idletime)
         super().__init__(self.poll, interval=strategy_period, name="JobStatusPoller")

     def poll(self) -> None:
         self._update_state()
-        self._run_error_handlers(self._executor_facades)
-        self._strategy.strategize(self._executor_facades)
+        self._run_error_handlers(self._executors)
+        self._strategy.strategize(self._executors)

-    def _run_error_handlers(self, status: List[PolledExecutorFacade]) -> None:
-        for es in status:
-            es.executor.handle_errors(es.status)
+    def _run_error_handlers(self, executors: List[BlockProviderExecutor]) -> None:
+        for e in executors:
+            e.handle_errors(e.status_facade)

     def _update_state(self) -> None:
-        now = time.time()
-        for item in self._executor_facades:
-            item.poll(now)
+        for item in self._executors:
+            item.poll_facade()

     def add_executors(self, executors: Sequence[BlockProviderExecutor]) -> None:
         for executor in executors:
             if executor.status_polling_interval > 0:
                 logger.debug("Adding executor {}".format(executor.label))
-                self._executor_facades.append(PolledExecutorFacade(executor, self.dfk))
+                self._executors.append(executor)
         self._strategy.add_executors(executors)

-    def close(self):
-        super().close()
-        for ef in self._executor_facades:
-            if not ef.executor.bad_state_is_set:
-                logger.info(f"Scaling in executor {ef.executor.label}")
+    def close(self, timeout: Optional[float] = None) -> None:
+        super().close(timeout)
+        for executor in self._executors:
+            if not executor.bad_state_is_set:
+                logger.info(f"Scaling in executor {executor.label}")

                 # this code needs to be at least as many blocks as need
                 # cancelling, but it is safe to be more, as the scaling
                 # code will cope with being asked to cancel more blocks
                 # than exist.
-                block_count = len(ef.status)
-                ef.scale_in(block_count)
+                block_count = len(executor.status_facade)
+                executor.scale_in_facade(block_count)

             else:  # and bad_state_is_set
-                logger.warning(f"Not scaling in executor {ef.executor.label} because it is in bad state")
+                logger.warning(f"Not scaling in executor {executor.label} because it is in bad state")
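This diff removes the PolledExecutorFacade wrapper entirely: JobStatusPoller now holds BlockProviderExecutor objects directly and drives them through facade methods (poll_facade, status_facade, scale_in_facade, scale_out_facade) that this release moves onto the executor itself (the +81 -1 change to parsl/executors/status_handling.py in the file list). A minimal sketch of the resulting control flow, using stand-in classes rather than parsl's real ones:

import time
from typing import Dict, List


class StubExecutor:
    """Illustrative stand-in for a BlockProviderExecutor after this change."""

    def __init__(self, label: str, interval: float) -> None:
        self.label = label
        self._interval = interval
        self._last_poll = 0.0
        self.status_facade: Dict[str, str] = {}  # block id -> job state name

    def poll_facade(self) -> None:
        # The rate limiting that PolledExecutorFacade._should_poll used to
        # do now lives on the executor: only hit the provider when the
        # polling interval has elapsed.
        now = time.time()
        if now >= self._last_poll + self._interval:
            self.status_facade = {"0": "RUNNING"}  # placeholder provider poll
            self._last_poll = now

    def scale_in_facade(self, n: int) -> List[str]:
        # Cancel up to n blocks and reflect that in the cached status.
        cancelled = list(self.status_facade)[:n]
        for block_id in cancelled:
            self.status_facade[block_id] = "CANCELLED"
        return cancelled


class SketchPoller:
    """Mirrors the slimmed-down JobStatusPoller loop in the diff above."""

    def __init__(self) -> None:
        self._executors: List[StubExecutor] = []

    def add_executor(self, executor: StubExecutor) -> None:
        self._executors.append(executor)

    def poll(self) -> None:
        for executor in self._executors:
            executor.poll_facade()


poller = SketchPoller()
poller.add_executor(StubExecutor("htex_local", interval=5.0))
poller.poll()

Because the poll rate limiting moves into the executor, the poller no longer needs to compute the current time and pass it down, which is why _update_state loses its time.time() call.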
parsl/jobs/strategy.py
CHANGED
@@ -5,8 +5,6 @@ import math
 import warnings
 from typing import Dict, List, Optional, Sequence, TypedDict

-import parsl.jobs.job_status_poller as jsp
-
 from parsl.executors import HighThroughputExecutor
 from parsl.executors.base import ParslExecutor
 from parsl.executors.status_handling import BlockProviderExecutor
@@ -150,22 +148,21 @@ class Strategy:
         for executor in executors:
             self.executors[executor.label] = {'idle_since': None, 'first': True}

-    def _strategy_init_only(self, executor_facades: List[jsp.PolledExecutorFacade]) -> None:
+    def _strategy_init_only(self, executors: List[BlockProviderExecutor]) -> None:
         """Scale up to init_blocks at the start, then nothing more.
         """
-        for ef in executor_facades:
-            executor = ef.executor
+        for executor in executors:
             if self.executors[executor.label]['first']:
                 logger.debug(f"strategy_init_only: scaling out {executor.provider.init_blocks} initial blocks for {executor.label}")
-                ef.scale_out(executor.provider.init_blocks)
+                executor.scale_out_facade(executor.provider.init_blocks)
                 self.executors[executor.label]['first'] = False
             else:
                 logger.debug("strategy_init_only: doing nothing")

-    def _strategy_simple(self, executor_facades: List[jsp.PolledExecutorFacade]) -> None:
-        self._general_strategy(executor_facades, strategy_type='simple')
+    def _strategy_simple(self, executors: List[BlockProviderExecutor]) -> None:
+        self._general_strategy(executors, strategy_type='simple')

-    def _strategy_htex_auto_scale(self, executor_facades: List[jsp.PolledExecutorFacade]) -> None:
+    def _strategy_htex_auto_scale(self, executors: List[BlockProviderExecutor]) -> None:
         """HTEX specific auto scaling strategy

         This strategy works only for HTEX. This strategy will scale out by
@@ -180,30 +177,25 @@ class Strategy:
         expected to scale in effectively only when # of workers, or tasks executing
         per block is close to 1.
         """
-        self._general_strategy(executor_facades, strategy_type='htex')
+        self._general_strategy(executors, strategy_type='htex')

     @wrap_with_logs
-    def _general_strategy(self, executor_facades: List[jsp.PolledExecutorFacade], *, strategy_type: str) -> None:
-        logger.debug(f"general strategy starting with strategy_type {strategy_type} for {len(executor_facades)} executor facades")
+    def _general_strategy(self, executors: List[BlockProviderExecutor], *, strategy_type: str) -> None:
+        logger.debug(f"general strategy starting with strategy_type {strategy_type} for {len(executors)} executors")

-        for ef in executor_facades:
-            executor = ef.executor
+        for executor in executors:
             label = executor.label
-            if not isinstance(executor, BlockProviderExecutor):
-                logger.debug(f"Not strategizing for executor {label} because scaling not enabled")
-                continue
             logger.debug(f"Strategizing for executor {label}")

             if self.executors[label]['first']:
-                executor = ef.executor
                 logger.debug(f"Scaling out {executor.provider.init_blocks} initial blocks for {label}")
-                ef.scale_out(executor.provider.init_blocks)
+                executor.scale_out_facade(executor.provider.init_blocks)
                 self.executors[label]['first'] = False

             # Tasks that are either pending completion
             active_tasks = executor.outstanding

-            status = ef.status
+            status = executor.status_facade

             # FIXME we need to handle case where provider does not define these
             # FIXME probably more of this logic should be moved to the provider
@@ -247,23 +239,26 @@ class Strategy:
             else:
                 # We want to make sure that max_idletime is reached
                 # before killing off resources
-                logger.debug(f"Strategy case 1b: Executor has no active tasks, and more ({active_blocks}) than minimum blocks ({min_blocks})")
+                logger.debug(f"Strategy case 1b: Executor has no active tasks, and more ({active_blocks})"
+                             f" than minimum blocks ({min_blocks})")

                 if not self.executors[executor.label]['idle_since']:
                     logger.debug(f"Starting idle timer for executor. If idle time exceeds {self.max_idletime}s, blocks will be scaled in")
                     self.executors[executor.label]['idle_since'] = time.time()
-
                 idle_since = self.executors[executor.label]['idle_since']
+                assert idle_since is not None, "The `if` statement above this assert should have forced idle time to be not-None"
+
                 idle_duration = time.time() - idle_since
                 if idle_duration > self.max_idletime:
                     # We have resources idle for the max duration,
                     # we have to scale_in now.
                     logger.debug(f"Idle time has reached {self.max_idletime}s for executor {label}; scaling in")
-                    ef.scale_in(active_blocks - min_blocks)
+                    executor.scale_in_facade(active_blocks - min_blocks)

                 else:
                     logger.debug(
-                        f"Idle time {idle_duration}s is less than max_idletime {self.max_idletime}s for executor {label}; not scaling in")
+                        f"Idle time {idle_duration}s is less than max_idletime {self.max_idletime}s"
+                        f" for executor {label}; not scaling in")

             # Case 2
             # More tasks than the available slots.
@@ -282,7 +277,7 @@ class Strategy:
                 excess_blocks = math.ceil(float(excess_slots) / (tasks_per_node * nodes_per_block))
                 excess_blocks = min(excess_blocks, max_blocks - active_blocks)
                 logger.debug(f"Requesting {excess_blocks} more blocks")
-                ef.scale_out(excess_blocks)
+                executor.scale_out_facade(excess_blocks)

             elif active_slots == 0 and active_tasks > 0:
                 logger.debug("Strategy case 4a: No active slots but some active tasks - could scale out by a single block")
@@ -291,7 +286,7 @@ class Strategy:
                 if active_blocks < max_blocks:
                     logger.debug("Requesting single block")

-                    ef.scale_out(1)
+                    executor.scale_out_facade(1)
                 else:
                     logger.debug("Not requesting single block, because at maxblocks already")

@@ -307,7 +302,7 @@ class Strategy:
                     excess_blocks = math.ceil(float(excess_slots) / (tasks_per_node * nodes_per_block))
                     excess_blocks = min(excess_blocks, active_blocks - min_blocks)
                     logger.debug(f"Requesting scaling in by {excess_blocks} blocks with idle time {self.max_idletime}s")
-                    ef.scale_in(excess_blocks, max_idletime=self.max_idletime)
+                    executor.scale_in_facade(excess_blocks, max_idletime=self.max_idletime)
                 else:
                     logger.error("This strategy does not support scaling in except for HighThroughputExecutor - taking no action")
             else:
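The block arithmetic in the cases above is unchanged by this diff; only the call targets move from the facade (ef.scale_out, ef.scale_in) to the executor (executor.scale_out_facade, executor.scale_in_facade). A worked example of that arithmetic, with all numbers hypothetical:

import math

# Hypothetical situation: 40 more task slots needed, 4 workers per node,
# 2 nodes per block, 5 of at most 7 blocks currently active.
excess_slots = 40
tasks_per_node = 4
nodes_per_block = 2
active_blocks = 5
max_blocks = 7

# Same computation as the strategy's scale-out case above.
excess_blocks = math.ceil(float(excess_slots) / (tasks_per_node * nodes_per_block))
excess_blocks = min(excess_blocks, max_blocks - active_blocks)

# ceil(40 / 8) = 5 blocks wanted, capped to 7 - 5 = 2 by max_blocks.
assert excess_blocks == 2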
parsl/monitoring/monitoring.py
CHANGED
@@ -3,13 +3,14 @@ from __future__ import annotations
 import os
 import time
 import logging
+import multiprocessing.synchronize as ms
 import typeguard
-import zmq

 import queue

 from parsl.multiprocessing import ForkProcess, SizedQueue
 from multiprocessing import Process
+from multiprocessing import Event
 from multiprocessing.queues import Queue
 from parsl.log_utils import set_file_logger
 from parsl.utils import RepresentationMixin
@@ -18,6 +19,7 @@ from parsl.utils import setproctitle

 from parsl.serialize import deserialize

+from parsl.monitoring.radios import MultiprocessingQueueRadio
 from parsl.monitoring.router import router_starter
 from parsl.monitoring.message_type import MessageType
 from parsl.monitoring.types import AddressedMonitoringMessage
@@ -90,12 +92,6 @@ class MonitoringHub(RepresentationMixin):
         Default: 30 seconds
         """

-        # Any is used to disable typechecking on uses of _dfk_channel,
-        # because it is used in the code as if it points to a channel, but
-        # the static type is that it can also be None. The code relies on
-        # .start() being called and initialising this to a real channel.
-        self._dfk_channel = None  # type: Any
-
         if _db_manager_excepts:
             raise _db_manager_excepts

@@ -157,8 +153,12 @@ class MonitoringHub(RepresentationMixin):
         self.block_msgs: Queue[AddressedMonitoringMessage]
         self.block_msgs = SizedQueue()

+        self.router_exit_event: ms.Event
+        self.router_exit_event = Event()
+
         self.router_proc = ForkProcess(target=router_starter,
-                                       args=(comm_q, self.exception_q, self.priority_msgs, self.node_msgs, self.block_msgs, self.resource_msgs),
+                                       args=(comm_q, self.exception_q, self.priority_msgs, self.node_msgs,
+                                             self.block_msgs, self.resource_msgs, self.router_exit_event),
                                        kwargs={"hub_address": self.hub_address,
                                                "udp_port": self.hub_port,
                                                "zmq_port_range": self.hub_port_range,
@@ -191,6 +191,8 @@ class MonitoringHub(RepresentationMixin):
         self.filesystem_proc.start()
         logger.info(f"Started filesystem radio receiver process {self.filesystem_proc.pid}")

+        self.radio = MultiprocessingQueueRadio(self.block_msgs)
+
         try:
             comm_q_result = comm_q.get(block=True, timeout=120)
         except queue.Empty:
@@ -205,14 +207,6 @@ class MonitoringHub(RepresentationMixin):

         self.monitoring_hub_url = "udp://{}:{}".format(self.hub_address, udp_port)

-        context = zmq.Context()
-        self.dfk_channel_timeout = 10000  # in milliseconds
-        self._dfk_channel = context.socket(zmq.DEALER)
-        self._dfk_channel.setsockopt(zmq.LINGER, 0)
-        self._dfk_channel.set_hwm(0)
-        self._dfk_channel.setsockopt(zmq.SNDTIMEO, self.dfk_channel_timeout)
-        self._dfk_channel.connect("tcp://{}:{}".format(self.hub_address, zmq_port))
-
         logger.info("Monitoring Hub initialized")

         return zmq_port
@@ -220,11 +214,7 @@ class MonitoringHub(RepresentationMixin):
     # TODO: tighten the Any message format
     def send(self, mtype: MessageType, message: Any) -> None:
         logger.debug("Sending message type {}".format(mtype))
-        try:
-            self._dfk_channel.send_pyobj((mtype, message))
-        except zmq.Again:
-            logger.exception(
-                "The monitoring message sent from DFK to router timed-out after {}ms".format(self.dfk_channel_timeout))
+        self.radio.send((mtype, message))

     def close(self) -> None:
         logger.info("Terminating Monitoring Hub")
@@ -235,9 +225,8 @@ class MonitoringHub(RepresentationMixin):
                 logger.error("There was a queued exception (Either router or DBM process got exception much earlier?)")
             except queue.Empty:
                 break
-        if self._dfk_channel and self.monitoring_hub_active:
+        if self.monitoring_hub_active:
             self.monitoring_hub_active = False
-            self._dfk_channel.close()
             if exception_msgs:
                 for exception_msg in exception_msgs:
                     logger.error(
@@ -249,6 +238,8 @@ class MonitoringHub(RepresentationMixin):
                 self.router_proc.terminate()
                 self.dbm_proc.terminate()
                 self.filesystem_proc.terminate()
+        logger.info("Setting router termination event")
+        self.router_exit_event.set()
         logger.info("Waiting for router to terminate")
         self.router_proc.join()
         logger.debug("Finished waiting for router termination")
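Two mechanisms change in this file: hub-to-router messages now travel through a radio over a shared queue instead of a per-hub ZMQ DEALER socket, and router shutdown is requested through the new router_exit_event rather than an in-band 'exit_now' message. A minimal sketch of the Event-based shutdown handshake in isolation; the worker function is illustrative, not parsl's router:

import time
from multiprocessing import Event, Process


def worker(exit_event) -> None:
    # Stand-in for the router loop: run until the parent sets the event.
    while not exit_event.is_set():
        time.sleep(0.1)  # placeholder for one receive/dispatch iteration


if __name__ == "__main__":
    exit_event = Event()
    proc = Process(target=worker, args=(exit_event,))
    proc.start()

    # The shutdown path, as in MonitoringHub.close() above:
    exit_event.set()  # ask the loop to finish its current iteration
    proc.join()       # then wait for the process to exit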
parsl/monitoring/radios.py
CHANGED
@@ -6,6 +6,7 @@ import logging

 from abc import ABCMeta, abstractmethod

+from multiprocessing.queues import Queue
 from typing import Optional

 from parsl.serialize import serialize
@@ -173,3 +174,17 @@
             logging.error("Could not send message within timeout limit")
             return
         return
+
+
+class MultiprocessingQueueRadio(MonitoringRadio):
+    """A monitoring radio intended which connects over a multiprocessing Queue.
+    This radio is intended to be used on the submit side, where components
+    in the submit process, or processes launched by multiprocessing, will have
+    access to a Queue shared with the monitoring database code (bypassing the
+    monitoring router).
+    """
+    def __init__(self, queue: Queue) -> None:
+        self.queue = queue
+
+    def send(self, message: object) -> None:
+        self.queue.put((message, 0))
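The new radio is a thin wrapper over a shared queue: send() pairs each message with a constant 0 in place of the sender address that normally accompanies an AddressedMonitoringMessage. A usage sketch reusing names that appear elsewhere in this diff (SizedQueue, MessageType); the payload dict is made up:

from parsl.multiprocessing import SizedQueue
from parsl.monitoring.message_type import MessageType
from parsl.monitoring.radios import MultiprocessingQueueRadio

block_msgs = SizedQueue()
radio = MultiprocessingQueueRadio(block_msgs)

# MonitoringHub.send() above does exactly this with its (mtype, message):
radio.send((MessageType.BLOCK_INFO, {"block_id": "0"}))  # hypothetical payload

# The consumer sees the message paired with the dummy address 0:
assert block_msgs.get() == ((MessageType.BLOCK_INFO, {"block_id": "0"}), 0)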
parsl/monitoring/router.py
CHANGED
@@ -15,6 +15,8 @@ from parsl.utils import setproctitle

 from parsl.monitoring.message_type import MessageType
 from parsl.monitoring.types import AddressedMonitoringMessage, TaggedMonitoringMessage
+
+from multiprocessing.synchronize import Event
 from typing import Optional, Tuple, Union


@@ -98,10 +100,10 @@ class MonitoringRouter:
               priority_msgs: "queue.Queue[AddressedMonitoringMessage]",
               node_msgs: "queue.Queue[AddressedMonitoringMessage]",
               block_msgs: "queue.Queue[AddressedMonitoringMessage]",
-              resource_msgs: "queue.Queue[AddressedMonitoringMessage]") -> None:
+              resource_msgs: "queue.Queue[AddressedMonitoringMessage]",
+              exit_event: Event) -> None:
         try:
-            router_keep_going = True
-            while router_keep_going:
+            while not exit_event.is_set():
                 try:
                     data, addr = self.udp_sock.recvfrom(2048)
                     resource_msg = pickle.loads(data)
@@ -135,8 +137,6 @@ class MonitoringRouter:
                         priority_msgs.put(msg_0)
                     elif msg[0] == MessageType.WORKFLOW_INFO:
                         priority_msgs.put(msg_0)
-                        if 'exit_now' in msg[1] and msg[1]['exit_now']:
-                            router_keep_going = False
                     else:
                         # There is a type: ignore here because if msg[0]
                         # is of the correct type, this code is unreachable,
@@ -178,6 +178,7 @@ def router_starter(comm_q: "queue.Queue[Union[Tuple[int, int], str]]",
                    node_msgs: "queue.Queue[AddressedMonitoringMessage]",
                    block_msgs: "queue.Queue[AddressedMonitoringMessage]",
                    resource_msgs: "queue.Queue[AddressedMonitoringMessage]",
+                   exit_event: Event,

                    hub_address: str,
                    udp_port: Optional[int],
@@ -202,7 +203,7 @@ def router_starter(comm_q: "queue.Queue[Union[Tuple[int, int], str]]",

     router.logger.info("Starting MonitoringRouter in router_starter")
     try:
-        router.start(priority_msgs, node_msgs, block_msgs, resource_msgs)
+        router.start(priority_msgs, node_msgs, block_msgs, resource_msgs, exit_event)
     except Exception as e:
         router.logger.exception("router.start exception")
         exception_q.put(('Hub', str(e)))
parsl/providers/local/local.py
CHANGED
@@ -266,7 +266,7 @@ class LocalProvider(ExecutionProvider, RepresentationMixin):
         for job in job_ids:
             job_dict = self.resources[job]
             job_dict['cancelled'] = True
-            logger.debug("Terminating job/proc_id: {0}".format(job))
+            logger.debug("Terminating job/process ID: {0}".format(job))
             cmd = "kill -- -$(ps -o pgid= {} | grep -o '[0-9]*')".format(job_dict['remote_pid'])
             retcode, stdout, stderr = self.channel.execute_wait(cmd, self.cmd_timeout)
             if retcode != 0:
parsl/tests/configs/htex_local_alternate.py
CHANGED
@@ -31,6 +31,7 @@ from parsl.executors import HighThroughputExecutor
 from parsl.data_provider.http import HTTPInTaskStaging
 from parsl.data_provider.ftp import FTPInTaskStaging
 from parsl.data_provider.file_noop import NoOpFileStaging
+from parsl.data_provider.zip import ZipFileStaging

 working_dir = os.getcwd() + "/" + "test_htex_alternate"

@@ -42,7 +43,7 @@ def fresh_config():
             address="127.0.0.1",
             label="htex_Local",
             working_dir=working_dir,
-            storage_access=[FTPInTaskStaging(), HTTPInTaskStaging(), NoOpFileStaging()],
+            storage_access=[ZipFileStaging(), FTPInTaskStaging(), HTTPInTaskStaging(), NoOpFileStaging()],
             worker_debug=True,
             cores_per_worker=1,
             heartbeat_period=2,
parsl/tests/configs/taskvine_ex.py
CHANGED
@@ -9,5 +9,4 @@ from parsl.data_provider.file_noop import NoOpFileStaging

 def fresh_config():
     return Config(executors=[TaskVineExecutor(manager_config=TaskVineManagerConfig(port=9000),
-                                              worker_launch_method='factory',
-                                              storage_access=[FTPInTaskStaging(), HTTPInTaskStaging(), NoOpFileStaging()])])
+                                              worker_launch_method='factory')])
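Together with the new parsl/data_provider/zip.py (+104 lines in the file list), the htex_local_alternate.py change above shows how zip staging is enabled: construct ZipFileStaging() with no arguments and add it to an executor's storage_access list. A trimmed sketch of a user-side config doing the same; everything except the storage_access line is illustrative:

from parsl.config import Config
from parsl.executors import HighThroughputExecutor
from parsl.data_provider.zip import ZipFileStaging
from parsl.data_provider.file_noop import NoOpFileStaging

config = Config(
    executors=[
        HighThroughputExecutor(
            label="htex_zip",  # hypothetical label
            storage_access=[ZipFileStaging(),  # new provider in this release
                            NoOpFileStaging()],
        )
    ]
)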
parsl/tests/conftest.py
CHANGED
@@ -135,28 +135,27 @@ def pytest_configure(config):
     )
     config.addinivalue_line(
         'markers',
-        'issue363: …'
+        'cleannet: Enable tests that require a clean network connection (such as for testing FTP)'
     )
-
     config.addinivalue_line(
         'markers',
-        'cleannet: Enable tests that require a clean network connection (such as for testing FTP)'
+        'staging_required: Marks tests that require a staging provider, when there is no sharedFS)'
     )
     config.addinivalue_line(
         'markers',
-        'staging_required: Marks tests that require a staging provider, when there is no sharedFS)'
+        'sshd_required: Marks tests that require a SSHD'
     )
     config.addinivalue_line(
         'markers',
-        'sshd_required: Marks tests that require a SSHD'
+        'multiple_cores_required: Marks tests that require multiple cores, such as htex affinity'
     )
     config.addinivalue_line(
         'markers',
-        'multiple_cores_required: Marks tests that require multiple cores, such as htex affinity'
+        'issue3328: Marks tests broken by issue #3328'
     )
     config.addinivalue_line(
         'markers',
-        'issue3328: Marks tests broken by issue #3328'
+        'executor_supports_std_stream_tuples: Marks tests that require tuple support for stdout/stderr'
     )


parsl/tests/test_bash_apps/test_basic.py
CHANGED
@@ -1,3 +1,4 @@
+import logging
 import os
 import random
 import re
@@ -23,7 +24,6 @@ def foo(x, y, z=10, stdout=None, label=None):
     return f"echo {x} {y} {z}"


-@pytest.mark.issue363
 def test_command_format_1(tmpd_cwd):
     """Testing command format for BashApps"""

@@ -38,8 +38,7 @@ def test_command_format_1(tmpd_cwd):
     assert so_content == "1 4 10"


-
-def test_auto_log_filename_format():
+def test_auto_log_filename_format(caplog):
     """Testing auto log filename format for BashApps
     """
     app_label = "label_test_auto_log_filename_format"
@@ -61,8 +60,10 @@ def test_auto_log_filename_format():
     assert contents == '1 {0} 10\n'.format(rand_int), \
         'Output does not match expected string "1 {0} 10", Got: "{1}"'.format(rand_int, contents)

+    for record in caplog.records:
+        assert record.levelno < logging.ERROR
+

-@pytest.mark.issue363
 def test_parallel_for(tmpd_cwd, n=3):
     """Testing a simple parallel for loop"""
     outdir = tmpd_cwd / "outputs/test_parallel"
parsl/tests/test_bash_apps/test_error_codes.py
CHANGED
@@ -76,7 +76,6 @@ def test_div_0(test_fn=div_0):
     os.remove('std.out')


-@pytest.mark.issue363
 def test_bash_misuse(test_fn=bash_misuse):
     err_code = test_matrix[test_fn]['exit_code']
     f = test_fn()
@@ -91,7 +90,6 @@ def test_bash_misuse(test_fn=bash_misuse):
     os.remove('std.out')


-@pytest.mark.issue363
 def test_command_not_found(test_fn=command_not_found):
     err_code = test_matrix[test_fn]['exit_code']
     f = test_fn()
@@ -108,7 +106,6 @@ def test_command_not_found(test_fn=command_not_found):
     return True


-@pytest.mark.issue363
 def test_not_executable(test_fn=not_executable):
     err_code = test_matrix[test_fn]['exit_code']
     f = test_fn()
parsl/tests/test_bash_apps/test_memoize.py
CHANGED
@@ -12,7 +12,6 @@ def fail_on_presence(outputs=()):
 # This test is an oddity that requires a shared-FS and simply
 # won't work if there's a staging provider.
 # @pytest.mark.sharedFS_required
-@pytest.mark.issue363
 def test_bash_memoization(tmpd_cwd, n=2):
     """Testing bash memoization
     """
@@ -33,7 +32,6 @@ def fail_on_presence_kw(outputs=(), foo=None):
 # This test is an oddity that requires a shared-FS and simply
 # won't work if there's a staging provider.
 # @pytest.mark.sharedFS_required
-@pytest.mark.issue363
 def test_bash_memoization_keywords(tmpd_cwd, n=2):
     """Testing bash memoization
     """
|