parsl 2024.4.1__py3-none-any.whl → 2024.4.15__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- parsl/data_provider/data_manager.py +2 -1
- parsl/data_provider/zip.py +104 -0
- parsl/dataflow/dflow.py +57 -48
- parsl/dataflow/futures.py +0 -7
- parsl/executors/base.py +12 -9
- parsl/executors/high_throughput/executor.py +14 -19
- parsl/executors/high_throughput/process_worker_pool.py +3 -1
- parsl/executors/status_handling.py +82 -9
- parsl/executors/taskvine/executor.py +7 -2
- parsl/executors/workqueue/executor.py +8 -3
- parsl/jobs/job_status_poller.py +27 -107
- parsl/jobs/strategy.py +31 -32
- parsl/monitoring/monitoring.py +14 -23
- parsl/monitoring/radios.py +15 -0
- parsl/monitoring/remote.py +2 -1
- parsl/monitoring/router.py +7 -6
- parsl/providers/local/local.py +1 -1
- parsl/tests/configs/htex_local_alternate.py +2 -1
- parsl/tests/configs/taskvine_ex.py +1 -2
- parsl/tests/configs/workqueue_ex.py +1 -2
- parsl/tests/conftest.py +6 -7
- parsl/tests/test_bash_apps/test_basic.py +5 -4
- parsl/tests/test_bash_apps/test_error_codes.py +0 -3
- parsl/tests/test_bash_apps/test_kwarg_storage.py +0 -1
- parsl/tests/test_bash_apps/test_memoize.py +0 -2
- parsl/tests/test_bash_apps/test_memoize_ignore_args.py +0 -1
- parsl/tests/test_bash_apps/test_memoize_ignore_args_regr.py +0 -1
- parsl/tests/test_bash_apps/test_multiline.py +0 -1
- parsl/tests/test_bash_apps/test_stdout.py +11 -6
- parsl/tests/test_monitoring/test_basic.py +46 -21
- parsl/tests/test_monitoring/test_fuzz_zmq.py +10 -1
- parsl/tests/test_python_apps/test_outputs.py +0 -1
- parsl/tests/test_scaling/test_scale_down_htex_unregistered.py +74 -0
- parsl/tests/test_staging/test_zip_out.py +113 -0
- parsl/version.py +1 -1
- {parsl-2024.4.1.data → parsl-2024.4.15.data}/scripts/process_worker_pool.py +3 -1
- {parsl-2024.4.1.dist-info → parsl-2024.4.15.dist-info}/METADATA +3 -2
- {parsl-2024.4.1.dist-info → parsl-2024.4.15.dist-info}/RECORD +44 -41
- {parsl-2024.4.1.data → parsl-2024.4.15.data}/scripts/exec_parsl_function.py +0 -0
- {parsl-2024.4.1.data → parsl-2024.4.15.data}/scripts/parsl_coprocess.py +0 -0
- {parsl-2024.4.1.dist-info → parsl-2024.4.15.dist-info}/LICENSE +0 -0
- {parsl-2024.4.1.dist-info → parsl-2024.4.15.dist-info}/WHEEL +0 -0
- {parsl-2024.4.1.dist-info → parsl-2024.4.15.dist-info}/entry_points.txt +0 -0
- {parsl-2024.4.1.dist-info → parsl-2024.4.15.dist-info}/top_level.txt +0 -0
parsl/jobs/job_status_poller.py
CHANGED
@@ -1,13 +1,9 @@
 import logging
 import parsl
-import time
-import zmq
-from typing import Dict, List, Sequence, Optional, Union
+from typing import List, Sequence, Optional, Union
 
-from parsl.jobs.states import JobStatus, JobState
 from parsl.jobs.strategy import Strategy
 from parsl.executors.status_handling import BlockProviderExecutor
-from parsl.monitoring.message_type import MessageType
 
 
 from parsl.utils import Timer
@@ -16,123 +12,47 @@ from parsl.utils import Timer
 
 logger = logging.getLogger(__name__)
 
 
-class PolledExecutorFacade:
-    def __init__(self, executor: BlockProviderExecutor, dfk: Optional["parsl.dataflow.dflow.DataFlowKernel"] = None):
-        self._executor = executor
-        self._dfk = dfk
-        self._interval = executor.status_polling_interval
-        self._last_poll_time = 0.0
-        self._status = {}  # type: Dict[str, JobStatus]
-        self.first = True
-
-        # Create a ZMQ channel to send poll status to monitoring
-        self.monitoring_enabled = False
-        if self._dfk and self._dfk.monitoring is not None:
-            self.monitoring_enabled = True
-            hub_address = self._dfk.hub_address
-            hub_port = self._dfk.hub_zmq_port
-            context = zmq.Context()
-            self.hub_channel = context.socket(zmq.DEALER)
-            self.hub_channel.set_hwm(0)
-            self.hub_channel.connect("tcp://{}:{}".format(hub_address, hub_port))
-            logger.info("Monitoring enabled on job status poller")
-
-    def _should_poll(self, now: float) -> bool:
-        return now >= self._last_poll_time + self._interval
-
-    def poll(self, now: float) -> None:
-        if self._should_poll(now):
-            previous_status = self._status
-            self._status = self._executor.status()
-            self._last_poll_time = now
-            delta_status = {}
-            for block_id in self._status:
-                if block_id not in previous_status \
-                   or previous_status[block_id].state != self._status[block_id].state:
-                    delta_status[block_id] = self._status[block_id]
-
-            if delta_status:
-                self.send_monitoring_info(delta_status)
-
-    def send_monitoring_info(self, status: Dict) -> None:
-        # Send monitoring info for HTEX when monitoring enabled
-        if self.monitoring_enabled:
-            msg = self._executor.create_monitoring_info(status)
-            logger.debug("Sending message {} to hub from job status poller".format(msg))
-            self.hub_channel.send_pyobj((MessageType.BLOCK_INFO, msg))
-
-    @property
-    def status(self) -> Dict[str, JobStatus]:
-        """Return the status of all jobs/blocks of the executor of this poller.
-
-        :return: a dictionary mapping block ids (in string) to job status
-        """
-        return self._status
-
-    @property
-    def executor(self) -> BlockProviderExecutor:
-        return self._executor
-
-    def scale_in(self, n: int, max_idletime: Optional[float] = None) -> List[str]:
-
-        if max_idletime is None:
-            block_ids = self._executor.scale_in(n)
-        else:
-            # This is a HighThroughputExecutor-specific interface violation.
-            # This code hopes, through pan-codebase reasoning, that this
-            # scale_in method really does come from HighThroughputExecutor,
-            # and so does have an extra max_idletime parameter not present
-            # in the executor interface.
-            block_ids = self._executor.scale_in(n, max_idletime=max_idletime)  # type: ignore[call-arg]
-        if block_ids is not None:
-            new_status = {}
-            for block_id in block_ids:
-                new_status[block_id] = JobStatus(JobState.CANCELLED)
-                del self._status[block_id]
-            self.send_monitoring_info(new_status)
-        return block_ids
-
-    def scale_out(self, n: int) -> List[str]:
-        block_ids = self._executor.scale_out(n)
-        if block_ids is not None:
-            new_status = {}
-            for block_id in block_ids:
-                new_status[block_id] = JobStatus(JobState.PENDING)
-            self.send_monitoring_info(new_status)
-            self._status.update(new_status)
-        return block_ids
-
-    def __repr__(self) -> str:
-        return self._status.__repr__()
-
-
 class JobStatusPoller(Timer):
     def __init__(self, *, strategy: Optional[str], max_idletime: float,
                  strategy_period: Union[float, int],
-                 dfk: Optional["parsl.dataflow.dflow.DataFlowKernel"] = None) -> None:
-        self._executor_facades = []  # type: List[PolledExecutorFacade]
-        self.dfk = dfk
+                 monitoring: Optional["parsl.monitoring.radios.MonitoringRadio"] = None) -> None:
+        self._executors = []  # type: List[BlockProviderExecutor]
         self._strategy = Strategy(strategy=strategy,
                                   max_idletime=max_idletime)
         super().__init__(self.poll, interval=strategy_period, name="JobStatusPoller")
 
     def poll(self) -> None:
         self._update_state()
-        self._run_error_handlers(self._executor_facades)
-        self._strategy.strategize(self._executor_facades)
+        self._run_error_handlers(self._executors)
+        self._strategy.strategize(self._executors)
 
-    def _run_error_handlers(self, status: List[PolledExecutorFacade]) -> None:
-        for es in status:
-            es.executor.handle_errors(es.status)
+    def _run_error_handlers(self, executors: List[BlockProviderExecutor]) -> None:
+        for e in executors:
+            e.handle_errors(e.status_facade)
 
     def _update_state(self) -> None:
-        now = time.time()
-        for item in self._executor_facades:
-            item.poll(now)
+        for item in self._executors:
+            item.poll_facade()
 
     def add_executors(self, executors: Sequence[BlockProviderExecutor]) -> None:
         for executor in executors:
             if executor.status_polling_interval > 0:
                 logger.debug("Adding executor {}".format(executor.label))
-                self._executor_facades.append(PolledExecutorFacade(executor, self.dfk))
+                self._executors.append(executor)
         self._strategy.add_executors(executors)
+
+    def close(self, timeout: Optional[float] = None) -> None:
+        super().close(timeout)
+        for executor in self._executors:
+            if not executor.bad_state_is_set:
+                logger.info(f"Scaling in executor {executor.label}")
+
+                # this code needs to be at least as many blocks as need
+                # cancelling, but it is safe to be more, as the scaling
+                # code will cope with being asked to cancel more blocks
+                # than exist.
+                block_count = len(executor.status_facade)
+                executor.scale_in_facade(block_count)
+
+            else:  # and bad_state_is_set
+                logger.warning(f"Not scaling in executor {executor.label} because it is in bad state")
parsl/jobs/strategy.py
CHANGED
@@ -5,8 +5,6 @@ import math
 import warnings
 from typing import Dict, List, Optional, Sequence, TypedDict
 
-import parsl.jobs.job_status_poller as jsp
-
 from parsl.executors import HighThroughputExecutor
 from parsl.executors.base import ParslExecutor
 from parsl.executors.status_handling import BlockProviderExecutor
@@ -26,6 +24,10 @@ class ExecutorState(TypedDict):
     If the executor is not idle, then None.
     """
 
+    first: bool
+    """True if this executor has not yet had a strategy poll.
+    """
+
 
 class Strategy:
     """Scaling strategy.
@@ -144,24 +146,23 @@
 
     def add_executors(self, executors: Sequence[ParslExecutor]) -> None:
         for executor in executors:
-            self.executors[executor.label] = {'idle_since': None}
+            self.executors[executor.label] = {'idle_since': None, 'first': True}
 
-    def _strategy_init_only(self, executor_facades: List[jsp.PolledExecutorFacade]) -> None:
+    def _strategy_init_only(self, executors: List[BlockProviderExecutor]) -> None:
         """Scale up to init_blocks at the start, then nothing more.
         """
-        for ef in executor_facades:
-            if ef.first:
-                executor = ef.executor
+        for executor in executors:
+            if self.executors[executor.label]['first']:
                 logger.debug(f"strategy_init_only: scaling out {executor.provider.init_blocks} initial blocks for {executor.label}")
-                ef.scale_out(executor.provider.init_blocks)
-                ef.first = False
+                executor.scale_out_facade(executor.provider.init_blocks)
+                self.executors[executor.label]['first'] = False
            else:
                logger.debug("strategy_init_only: doing nothing")
 
-    def _strategy_simple(self, executor_facades: List[jsp.PolledExecutorFacade]) -> None:
-        self._general_strategy(executor_facades, strategy_type='simple')
+    def _strategy_simple(self, executors: List[BlockProviderExecutor]) -> None:
+        self._general_strategy(executors, strategy_type='simple')
 
-    def _strategy_htex_auto_scale(self, executor_facades: List[jsp.PolledExecutorFacade]) -> None:
+    def _strategy_htex_auto_scale(self, executors: List[BlockProviderExecutor]) -> None:
         """HTEX specific auto scaling strategy
 
         This strategy works only for HTEX. This strategy will scale out by
@@ -176,30 +177,25 @@ class Strategy:
         expected to scale in effectively only when # of workers, or tasks executing
         per block is close to 1.
         """
-        self._general_strategy(executor_facades, strategy_type='htex')
+        self._general_strategy(executors, strategy_type='htex')
 
     @wrap_with_logs
-    def _general_strategy(self, executor_facades: List[jsp.PolledExecutorFacade], *, strategy_type: str) -> None:
-        logger.debug(f"general strategy starting with strategy_type {strategy_type} for {len(executor_facades)} executor facades")
+    def _general_strategy(self, executors: List[BlockProviderExecutor], *, strategy_type: str) -> None:
+        logger.debug(f"general strategy starting with strategy_type {strategy_type} for {len(executors)} executors")
 
-        for ef in executor_facades:
-            executor = ef.executor
+        for executor in executors:
             label = executor.label
-            if not isinstance(executor, BlockProviderExecutor):
-                logger.debug(f"Not strategizing for executor {label} because scaling not enabled")
-                continue
             logger.debug(f"Strategizing for executor {label}")
 
-            if ef.first:
-                executor = ef.executor
+            if self.executors[label]['first']:
                 logger.debug(f"Scaling out {executor.provider.init_blocks} initial blocks for {label}")
-                ef.scale_out(executor.provider.init_blocks)
-                ef.first = False
+                executor.scale_out_facade(executor.provider.init_blocks)
+                self.executors[label]['first'] = False
 
             # Tasks that are either pending completion
             active_tasks = executor.outstanding
 
-            status = ef.status
+            status = executor.status_facade
 
             # FIXME we need to handle case where provider does not define these
             # FIXME probably more of this logic should be moved to the provider
@@ -243,23 +239,26 @@ class Strategy:
             else:
                 # We want to make sure that max_idletime is reached
                 # before killing off resources
-                logger.debug(f"Strategy case 1b: Executor has no active tasks, and more ({active_blocks}) than minimum blocks ({min_blocks})")
+                logger.debug(f"Strategy case 1b: Executor has no active tasks, and more ({active_blocks})"
+                             f" than minimum blocks ({min_blocks})")
 
                 if not self.executors[executor.label]['idle_since']:
                     logger.debug(f"Starting idle timer for executor. If idle time exceeds {self.max_idletime}s, blocks will be scaled in")
                     self.executors[executor.label]['idle_since'] = time.time()
-
                 idle_since = self.executors[executor.label]['idle_since']
+                assert idle_since is not None, "The `if` statement above this assert should have forced idle time to be not-None"
+
                 idle_duration = time.time() - idle_since
                 if idle_duration > self.max_idletime:
                     # We have resources idle for the max duration,
                     # we have to scale_in now.
                     logger.debug(f"Idle time has reached {self.max_idletime}s for executor {label}; scaling in")
-                    ef.scale_in(active_blocks - min_blocks)
+                    executor.scale_in_facade(active_blocks - min_blocks)
 
                 else:
                     logger.debug(
-                        f"Idle time {idle_duration}s is less than max_idletime {self.max_idletime}s for executor {label}; not scaling in")
+                        f"Idle time {idle_duration}s is less than max_idletime {self.max_idletime}s"
+                        f" for executor {label}; not scaling in")
 
             # Case 2
             # More tasks than the available slots.
@@ -278,7 +277,7 @@ class Strategy:
                 excess_blocks = math.ceil(float(excess_slots) / (tasks_per_node * nodes_per_block))
                 excess_blocks = min(excess_blocks, max_blocks - active_blocks)
                 logger.debug(f"Requesting {excess_blocks} more blocks")
-                ef.scale_out(excess_blocks)
+                executor.scale_out_facade(excess_blocks)
 
             elif active_slots == 0 and active_tasks > 0:
                 logger.debug("Strategy case 4a: No active slots but some active tasks - could scale out by a single block")
@@ -287,7 +286,7 @@ class Strategy:
                 if active_blocks < max_blocks:
                     logger.debug("Requesting single block")
 
-                    ef.scale_out(1)
+                    executor.scale_out_facade(1)
                 else:
                     logger.debug("Not requesting single block, because at maxblocks already")
 
@@ -303,7 +302,7 @@ class Strategy:
                     excess_blocks = math.ceil(float(excess_slots) / (tasks_per_node * nodes_per_block))
                     excess_blocks = min(excess_blocks, active_blocks - min_blocks)
                     logger.debug(f"Requesting scaling in by {excess_blocks} blocks with idle time {self.max_idletime}s")
-                    ef.scale_in(excess_blocks, max_idletime=self.max_idletime)
+                    executor.scale_in_facade(excess_blocks, max_idletime=self.max_idletime)
                 else:
                     logger.error("This strategy does not support scaling in except for HighThroughputExecutor - taking no action")
             else:
parsl/monitoring/monitoring.py
CHANGED
@@ -3,13 +3,14 @@ from __future__ import annotations
 import os
 import time
 import logging
+import multiprocessing.synchronize as ms
 import typeguard
-import zmq
 
 import queue
 
 from parsl.multiprocessing import ForkProcess, SizedQueue
 from multiprocessing import Process
+from multiprocessing import Event
 from multiprocessing.queues import Queue
 from parsl.log_utils import set_file_logger
 from parsl.utils import RepresentationMixin
@@ -18,6 +19,7 @@ from parsl.utils import setproctitle
 
 from parsl.serialize import deserialize
 
+from parsl.monitoring.radios import MultiprocessingQueueRadio
 from parsl.monitoring.router import router_starter
 from parsl.monitoring.message_type import MessageType
 from parsl.monitoring.types import AddressedMonitoringMessage
@@ -90,12 +92,6 @@ class MonitoringHub(RepresentationMixin):
         Default: 30 seconds
         """
 
-        # Any is used to disable typechecking on uses of _dfk_channel,
-        # because it is used in the code as if it points to a channel, but
-        # the static type is that it can also be None. The code relies on
-        # .start() being called and initialising this to a real channel.
-        self._dfk_channel = None  # type: Any
-
         if _db_manager_excepts:
             raise _db_manager_excepts
 
@@ -157,8 +153,12 @@ class MonitoringHub(RepresentationMixin):
         self.block_msgs: Queue[AddressedMonitoringMessage]
         self.block_msgs = SizedQueue()
 
+        self.router_exit_event: ms.Event
+        self.router_exit_event = Event()
+
         self.router_proc = ForkProcess(target=router_starter,
-                                       args=(comm_q, self.exception_q, self.priority_msgs, self.node_msgs, self.block_msgs, self.resource_msgs),
+                                       args=(comm_q, self.exception_q, self.priority_msgs, self.node_msgs,
+                                             self.block_msgs, self.resource_msgs, self.router_exit_event),
                                        kwargs={"hub_address": self.hub_address,
                                                "udp_port": self.hub_port,
                                                "zmq_port_range": self.hub_port_range,
@@ -191,6 +191,8 @@ class MonitoringHub(RepresentationMixin):
         self.filesystem_proc.start()
         logger.info(f"Started filesystem radio receiver process {self.filesystem_proc.pid}")
 
+        self.radio = MultiprocessingQueueRadio(self.block_msgs)
+
         try:
             comm_q_result = comm_q.get(block=True, timeout=120)
         except queue.Empty:
@@ -205,14 +207,6 @@ class MonitoringHub(RepresentationMixin):
 
         self.monitoring_hub_url = "udp://{}:{}".format(self.hub_address, udp_port)
 
-        context = zmq.Context()
-        self.dfk_channel_timeout = 10000  # in milliseconds
-        self._dfk_channel = context.socket(zmq.DEALER)
-        self._dfk_channel.setsockopt(zmq.LINGER, 0)
-        self._dfk_channel.set_hwm(0)
-        self._dfk_channel.setsockopt(zmq.SNDTIMEO, self.dfk_channel_timeout)
-        self._dfk_channel.connect("tcp://{}:{}".format(self.hub_address, zmq_port))
-
         logger.info("Monitoring Hub initialized")
 
         return zmq_port
@@ -220,11 +214,7 @@ class MonitoringHub(RepresentationMixin):
     # TODO: tighten the Any message format
     def send(self, mtype: MessageType, message: Any) -> None:
         logger.debug("Sending message type {}".format(mtype))
-        try:
-            self._dfk_channel.send_pyobj((mtype, message))
-        except zmq.Again:
-            logger.exception(
-                "The monitoring message sent from DFK to router timed-out after {}ms".format(self.dfk_channel_timeout))
+        self.radio.send((mtype, message))
 
     def close(self) -> None:
         logger.info("Terminating Monitoring Hub")
@@ -235,9 +225,8 @@ class MonitoringHub(RepresentationMixin):
                 logger.error("There was a queued exception (Either router or DBM process got exception much earlier?)")
             except queue.Empty:
                 break
-        if self._dfk_channel and self.monitoring_hub_active:
+        if self.monitoring_hub_active:
             self.monitoring_hub_active = False
-            self._dfk_channel.close()
             if exception_msgs:
                 for exception_msg in exception_msgs:
                     logger.error(
@@ -249,6 +238,8 @@ class MonitoringHub(RepresentationMixin):
             self.router_proc.terminate()
             self.dbm_proc.terminate()
             self.filesystem_proc.terminate()
+        logger.info("Setting router termination event")
+        self.router_exit_event.set()
         logger.info("Waiting for router to terminate")
         self.router_proc.join()
         logger.debug("Finished waiting for router termination")
parsl/monitoring/radios.py
CHANGED
@@ -6,6 +6,7 @@ import logging
 
 from abc import ABCMeta, abstractmethod
 
+from multiprocessing.queues import Queue
 from typing import Optional
 
 from parsl.serialize import serialize
@@ -173,3 +174,17 @@ class UDPRadio(MonitoringRadio):
             logging.error("Could not send message within timeout limit")
             return
         return
+
+
+class MultiprocessingQueueRadio(MonitoringRadio):
+    """A monitoring radio intended which connects over a multiprocessing Queue.
+    This radio is intended to be used on the submit side, where components
+    in the submit process, or processes launched by multiprocessing, will have
+    access to a Queue shared with the monitoring database code (bypassing the
+    monitoring router).
+    """
+    def __init__(self, queue: Queue) -> None:
+        self.queue = queue
+
+    def send(self, message: object) -> None:
+        self.queue.put((message, 0))
parsl/monitoring/remote.py
CHANGED
@@ -15,7 +15,8 @@ from typing import Any, Callable, Dict, List, Sequence, Tuple
 logger = logging.getLogger(__name__)
 
 
-def monitor_wrapper(f: Any,  # per app
+def monitor_wrapper(*,
+                    f: Any,  # per app
                     args: Sequence,  # per invocation
                     kwargs: Dict,  # per invocation
                     x_try_id: int,  # per invocation
parsl/monitoring/router.py
CHANGED
@@ -15,6 +15,8 @@ from parsl.utils import setproctitle
 
 from parsl.monitoring.message_type import MessageType
 from parsl.monitoring.types import AddressedMonitoringMessage, TaggedMonitoringMessage
+
+from multiprocessing.synchronize import Event
 from typing import Optional, Tuple, Union
 
 
@@ -98,10 +100,10 @@ class MonitoringRouter:
              priority_msgs: "queue.Queue[AddressedMonitoringMessage]",
              node_msgs: "queue.Queue[AddressedMonitoringMessage]",
              block_msgs: "queue.Queue[AddressedMonitoringMessage]",
-             resource_msgs: "queue.Queue[AddressedMonitoringMessage]") -> None:
+             resource_msgs: "queue.Queue[AddressedMonitoringMessage]",
+             exit_event: Event) -> None:
         try:
-            router_keep_going = True
-            while router_keep_going:
+            while not exit_event.is_set():
                 try:
                     data, addr = self.udp_sock.recvfrom(2048)
                     resource_msg = pickle.loads(data)
@@ -135,8 +137,6 @@ class MonitoringRouter:
                             priority_msgs.put(msg_0)
                         elif msg[0] == MessageType.WORKFLOW_INFO:
                             priority_msgs.put(msg_0)
-                            if 'exit_now' in msg[1] and msg[1]['exit_now']:
-                                router_keep_going = False
                         else:
                             # There is a type: ignore here because if msg[0]
                             # is of the correct type, this code is unreachable,
@@ -178,6 +178,7 @@ def router_starter(comm_q: "queue.Queue[Union[Tuple[int, int], str]]",
                    node_msgs: "queue.Queue[AddressedMonitoringMessage]",
                    block_msgs: "queue.Queue[AddressedMonitoringMessage]",
                    resource_msgs: "queue.Queue[AddressedMonitoringMessage]",
+                   exit_event: Event,
 
                    hub_address: str,
                    udp_port: Optional[int],
@@ -202,7 +203,7 @@ def router_starter(comm_q: "queue.Queue[Union[Tuple[int, int], str]]",
 
     router.logger.info("Starting MonitoringRouter in router_starter")
     try:
-        router.start(priority_msgs, node_msgs, block_msgs, resource_msgs)
+        router.start(priority_msgs, node_msgs, block_msgs, resource_msgs, exit_event)
     except Exception as e:
         router.logger.exception("router.start exception")
         exception_q.put(('Hub', str(e)))
parsl/providers/local/local.py
CHANGED
@@ -266,7 +266,7 @@ class LocalProvider(ExecutionProvider, RepresentationMixin):
         for job in job_ids:
             job_dict = self.resources[job]
             job_dict['cancelled'] = True
-            logger.debug("Terminating job/proc_id: {0}".format(job))
+            logger.debug("Terminating job/process ID: {0}".format(job))
             cmd = "kill -- -$(ps -o pgid= {} | grep -o '[0-9]*')".format(job_dict['remote_pid'])
             retcode, stdout, stderr = self.channel.execute_wait(cmd, self.cmd_timeout)
             if retcode != 0:
parsl/tests/configs/htex_local_alternate.py
CHANGED
@@ -31,6 +31,7 @@ from parsl.executors import HighThroughputExecutor
 from parsl.data_provider.http import HTTPInTaskStaging
 from parsl.data_provider.ftp import FTPInTaskStaging
 from parsl.data_provider.file_noop import NoOpFileStaging
+from parsl.data_provider.zip import ZipFileStaging
 
 working_dir = os.getcwd() + "/" + "test_htex_alternate"
 
@@ -42,7 +43,7 @@ def fresh_config():
             address="127.0.0.1",
             label="htex_Local",
             working_dir=working_dir,
-            storage_access=[FTPInTaskStaging(), HTTPInTaskStaging(), NoOpFileStaging()],
+            storage_access=[ZipFileStaging(), FTPInTaskStaging(), HTTPInTaskStaging(), NoOpFileStaging()],
             worker_debug=True,
             cores_per_worker=1,
             heartbeat_period=2,
parsl/tests/configs/taskvine_ex.py
CHANGED
@@ -9,5 +9,4 @@ from parsl.data_provider.file_noop import NoOpFileStaging
 
 def fresh_config():
     return Config(executors=[TaskVineExecutor(manager_config=TaskVineManagerConfig(port=9000),
-                                              worker_launch_method='factory',
-                                              storage_access=[FTPInTaskStaging(), HTTPInTaskStaging(), NoOpFileStaging()])])
+                                              worker_launch_method='factory')])
parsl/tests/conftest.py
CHANGED
@@ -135,28 +135,27 @@ def pytest_configure(config):
     )
     config.addinivalue_line(
         'markers',
-        'issue363: Marks tests broken by issue #363'
+        'cleannet: Enable tests that require a clean network connection (such as for testing FTP)'
     )
-
     config.addinivalue_line(
         'markers',
-        'cleannet: Enable tests that require a clean network connection (such as for testing FTP)'
+        'staging_required: Marks tests that require a staging provider, when there is no sharedFS)'
     )
     config.addinivalue_line(
         'markers',
-        'staging_required: Marks tests that require a staging provider, when there is no sharedFS)'
+        'sshd_required: Marks tests that require a SSHD'
     )
     config.addinivalue_line(
         'markers',
-        'sshd_required: Marks tests that require a SSHD'
+        'multiple_cores_required: Marks tests that require multiple cores, such as htex affinity'
     )
     config.addinivalue_line(
         'markers',
-        'multiple_cores_required: Marks tests that require multiple cores, such as htex affinity'
+        'issue3328: Marks tests broken by issue #3328'
     )
     config.addinivalue_line(
         'markers',
-        'issue3328: Marks tests broken by issue #3328'
+        'executor_supports_std_stream_tuples: Marks tests that require tuple support for stdout/stderr'
     )
 
 
parsl/tests/test_bash_apps/test_basic.py
CHANGED
@@ -1,3 +1,4 @@
+import logging
 import os
 import random
 import re
@@ -23,7 +24,6 @@ def foo(x, y, z=10, stdout=None, label=None):
     return f"echo {x} {y} {z}"
 
 
-@pytest.mark.issue363
 def test_command_format_1(tmpd_cwd):
     """Testing command format for BashApps"""
 
@@ -38,8 +38,7 @@ def test_command_format_1(tmpd_cwd):
     assert so_content == "1 4 10"
 
 
-@pytest.mark.issue363
-def test_auto_log_filename_format():
+def test_auto_log_filename_format(caplog):
     """Testing auto log filename format for BashApps
     """
     app_label = "label_test_auto_log_filename_format"
@@ -61,8 +60,10 @@ def test_auto_log_filename_format():
     assert contents == '1 {0} 10\n'.format(rand_int), \
         'Output does not match expected string "1 {0} 10", Got: "{1}"'.format(rand_int, contents)
 
+    for record in caplog.records:
+        assert record.levelno < logging.ERROR
+
 
-@pytest.mark.issue363
 def test_parallel_for(tmpd_cwd, n=3):
     """Testing a simple parallel for loop"""
     outdir = tmpd_cwd / "outputs/test_parallel"
parsl/tests/test_bash_apps/test_error_codes.py
CHANGED
@@ -76,7 +76,6 @@ def test_div_0(test_fn=div_0):
     os.remove('std.out')
 
 
-@pytest.mark.issue363
 def test_bash_misuse(test_fn=bash_misuse):
     err_code = test_matrix[test_fn]['exit_code']
     f = test_fn()
@@ -91,7 +90,6 @@ def test_bash_misuse(test_fn=bash_misuse):
     os.remove('std.out')
 
 
-@pytest.mark.issue363
 def test_command_not_found(test_fn=command_not_found):
     err_code = test_matrix[test_fn]['exit_code']
     f = test_fn()
@@ -108,7 +106,6 @@ def test_command_not_found(test_fn=command_not_found):
     return True
 
 
-@pytest.mark.issue363
 def test_not_executable(test_fn=not_executable):
     err_code = test_matrix[test_fn]['exit_code']
     f = test_fn()