parsl 2024.3.18__py3-none-any.whl → 2024.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- parsl/dataflow/dflow.py +35 -36
- parsl/executors/base.py +11 -1
- parsl/executors/high_throughput/executor.py +8 -20
- parsl/executors/high_throughput/process_worker_pool.py +5 -2
- parsl/executors/status_handling.py +8 -15
- parsl/executors/taskvine/executor.py +35 -11
- parsl/executors/workqueue/executor.py +33 -11
- parsl/jobs/error_handlers.py +1 -1
- parsl/jobs/job_status_poller.py +12 -11
- parsl/jobs/strategy.py +31 -18
- parsl/monitoring/monitoring.py +27 -237
- parsl/monitoring/router.py +208 -0
- parsl/tests/site_tests/test_provider.py +1 -1
- parsl/tests/test_htex/test_disconnected_blocks.py +0 -1
- parsl/tests/test_htex/test_drain.py +1 -0
- parsl/tests/test_monitoring/test_fuzz_zmq.py +2 -2
- parsl/tests/test_monitoring/test_htex_init_blocks_vs_monitoring.py +85 -0
- parsl/tests/test_python_apps/test_context_manager.py +40 -0
- parsl/tests/test_scaling/test_shutdown_scalein.py +78 -0
- parsl/tests/test_shutdown/test_kill_monitoring.py +65 -0
- parsl/version.py +1 -1
- {parsl-2024.3.18.data → parsl-2024.4.1.data}/scripts/process_worker_pool.py +5 -2
- {parsl-2024.3.18.dist-info → parsl-2024.4.1.dist-info}/METADATA +4 -4
- {parsl-2024.3.18.dist-info → parsl-2024.4.1.dist-info}/RECORD +35 -30
- /parsl/tests/{test_data → test_shutdown}/__init__.py +0 -0
- /parsl/tests/{test_data → test_staging}/test_file.py +0 -0
- /parsl/tests/{test_data → test_staging}/test_file_apps.py +0 -0
- /parsl/tests/{test_data → test_staging}/test_file_staging.py +0 -0
- /parsl/tests/{test_data → test_staging}/test_output_chain_filenames.py +0 -0
- {parsl-2024.3.18.data → parsl-2024.4.1.data}/scripts/exec_parsl_function.py +0 -0
- {parsl-2024.3.18.data → parsl-2024.4.1.data}/scripts/parsl_coprocess.py +0 -0
- {parsl-2024.3.18.dist-info → parsl-2024.4.1.dist-info}/LICENSE +0 -0
- {parsl-2024.3.18.dist-info → parsl-2024.4.1.dist-info}/WHEEL +0 -0
- {parsl-2024.3.18.dist-info → parsl-2024.4.1.dist-info}/entry_points.txt +0 -0
- {parsl-2024.3.18.dist-info → parsl-2024.4.1.dist-info}/top_level.txt +0 -0
parsl/monitoring/router.py
ADDED
@@ -0,0 +1,208 @@
+from __future__ import annotations
+
+import os
+import socket
+import time
+import pickle
+import logging
+import zmq
+
+import queue
+
+from parsl.log_utils import set_file_logger
+from parsl.process_loggers import wrap_with_logs
+from parsl.utils import setproctitle
+
+from parsl.monitoring.message_type import MessageType
+from parsl.monitoring.types import AddressedMonitoringMessage, TaggedMonitoringMessage
+from typing import Optional, Tuple, Union
+
+
+logger = logging.getLogger(__name__)
+
+
+class MonitoringRouter:
+
+    def __init__(self,
+                 *,
+                 hub_address: str,
+                 udp_port: Optional[int] = None,
+                 zmq_port_range: Tuple[int, int] = (55050, 56000),
+
+                 monitoring_hub_address: str = "127.0.0.1",
+                 logdir: str = ".",
+                 run_id: str,
+                 logging_level: int = logging.INFO,
+                 atexit_timeout: int = 3    # in seconds
+                 ):
+        """ Initializes a monitoring configuration class.
+
+        Parameters
+        ----------
+        hub_address : str
+             The ip address at which the workers will be able to reach the Hub.
+        udp_port : int
+             The specific port at which workers will be able to reach the Hub via UDP. Default: None
+        zmq_port_range : tuple(int, int)
+             The MonitoringHub picks ports at random from the range which will be used by Hub.
+             Default: (55050, 56000)
+        logdir : str
+             Parsl log directory paths. Logs and temp files go here. Default: '.'
+        logging_level : int
+             Logging level as defined in the logging module. Default: logging.INFO
+        atexit_timeout : float, optional
+            The amount of time in seconds to terminate the hub without receiving any messages, after the last dfk workflow message is received.
+
+        """
+        os.makedirs(logdir, exist_ok=True)
+        self.logger = set_file_logger("{}/monitoring_router.log".format(logdir),
+                                      name="monitoring_router",
+                                      level=logging_level)
+        self.logger.debug("Monitoring router starting")
+
+        self.hub_address = hub_address
+        self.atexit_timeout = atexit_timeout
+        self.run_id = run_id
+
+        self.loop_freq = 10.0  # milliseconds
+
+        # Initialize the UDP socket
+        self.udp_sock = socket.socket(socket.AF_INET,
+                                      socket.SOCK_DGRAM,
+                                      socket.IPPROTO_UDP)
+
+        # We are trying to bind to all interfaces with 0.0.0.0
+        if not udp_port:
+            self.udp_sock.bind(('0.0.0.0', 0))
+            self.udp_port = self.udp_sock.getsockname()[1]
+        else:
+            self.udp_port = udp_port
+            try:
+                self.udp_sock.bind(('0.0.0.0', self.udp_port))
+            except Exception as e:
+                raise RuntimeError(f"Could not bind to udp_port {udp_port} because: {e}")
+        self.udp_sock.settimeout(self.loop_freq / 1000)
+        self.logger.info("Initialized the UDP socket on 0.0.0.0:{}".format(self.udp_port))
+
+        self._context = zmq.Context()
+        self.zmq_receiver_channel = self._context.socket(zmq.DEALER)
+        self.zmq_receiver_channel.setsockopt(zmq.LINGER, 0)
+        self.zmq_receiver_channel.set_hwm(0)
+        self.zmq_receiver_channel.RCVTIMEO = int(self.loop_freq)  # in milliseconds
+        self.logger.debug("hub_address: {}. zmq_port_range {}".format(hub_address, zmq_port_range))
+        self.zmq_receiver_port = self.zmq_receiver_channel.bind_to_random_port("tcp://*",
+                                                                               min_port=zmq_port_range[0],
+                                                                               max_port=zmq_port_range[1])
+
+    def start(self,
+              priority_msgs: "queue.Queue[AddressedMonitoringMessage]",
+              node_msgs: "queue.Queue[AddressedMonitoringMessage]",
+              block_msgs: "queue.Queue[AddressedMonitoringMessage]",
+              resource_msgs: "queue.Queue[AddressedMonitoringMessage]") -> None:
+        try:
+            router_keep_going = True
+            while router_keep_going:
+                try:
+                    data, addr = self.udp_sock.recvfrom(2048)
+                    resource_msg = pickle.loads(data)
+                    self.logger.debug("Got UDP Message from {}: {}".format(addr, resource_msg))
+                    resource_msgs.put((resource_msg, addr))
+                except socket.timeout:
+                    pass
+
+                try:
+                    dfk_loop_start = time.time()
+                    while time.time() - dfk_loop_start < 1.0:  # TODO make configurable
+                        # note that nothing checks that msg really is of the annotated type
+                        msg: TaggedMonitoringMessage
+                        msg = self.zmq_receiver_channel.recv_pyobj()
+
+                        assert isinstance(msg, tuple), "ZMQ Receiver expects only tuples, got {}".format(msg)
+                        assert len(msg) >= 1, "ZMQ Receiver expects tuples of length at least 1, got {}".format(msg)
+                        assert len(msg) == 2, "ZMQ Receiver expects message tuples of exactly length 2, got {}".format(msg)
+
+                        msg_0: AddressedMonitoringMessage
+                        msg_0 = (msg, 0)
+
+                        if msg[0] == MessageType.NODE_INFO:
+                            msg[1]['run_id'] = self.run_id
+                            node_msgs.put(msg_0)
+                        elif msg[0] == MessageType.RESOURCE_INFO:
+                            resource_msgs.put(msg_0)
+                        elif msg[0] == MessageType.BLOCK_INFO:
+                            block_msgs.put(msg_0)
+                        elif msg[0] == MessageType.TASK_INFO:
+                            priority_msgs.put(msg_0)
+                        elif msg[0] == MessageType.WORKFLOW_INFO:
+                            priority_msgs.put(msg_0)
+                            if 'exit_now' in msg[1] and msg[1]['exit_now']:
+                                router_keep_going = False
+                        else:
+                            # There is a type: ignore here because if msg[0]
+                            # is of the correct type, this code is unreachable,
+                            # but there is no verification that the message
+                            # received from zmq_receiver_channel.recv_pyobj() is actually
+                            # of that type.
+                            self.logger.error("Discarding message "  # type: ignore[unreachable]
+                                              f"from interchange with unknown type {msg[0].value}")
+                except zmq.Again:
+                    pass
+                except Exception:
+                    # This will catch malformed messages. What happens if the
+                    # channel is broken in such a way that it always raises
+                    # an exception? Looping on this would maybe be the wrong
+                    # thing to do.
+                    self.logger.warning("Failure processing a ZMQ message", exc_info=True)
+
+            self.logger.info("Monitoring router draining")
+            last_msg_received_time = time.time()
+            while time.time() - last_msg_received_time < self.atexit_timeout:
+                try:
+                    data, addr = self.udp_sock.recvfrom(2048)
+                    msg = pickle.loads(data)
+                    self.logger.debug("Got UDP Message from {}: {}".format(addr, msg))
+                    resource_msgs.put((msg, addr))
+                    last_msg_received_time = time.time()
+                except socket.timeout:
+                    pass
+
+            self.logger.info("Monitoring router finishing normally")
+        finally:
+            self.logger.info("Monitoring router finished")
+
+
+@wrap_with_logs
+def router_starter(comm_q: "queue.Queue[Union[Tuple[int, int], str]]",
+                   exception_q: "queue.Queue[Tuple[str, str]]",
+                   priority_msgs: "queue.Queue[AddressedMonitoringMessage]",
+                   node_msgs: "queue.Queue[AddressedMonitoringMessage]",
+                   block_msgs: "queue.Queue[AddressedMonitoringMessage]",
+                   resource_msgs: "queue.Queue[AddressedMonitoringMessage]",
+
+                   hub_address: str,
+                   udp_port: Optional[int],
+                   zmq_port_range: Tuple[int, int],
+
+                   logdir: str,
+                   logging_level: int,
+                   run_id: str) -> None:
+    setproctitle("parsl: monitoring router")
+    try:
+        router = MonitoringRouter(hub_address=hub_address,
+                                  udp_port=udp_port,
+                                  zmq_port_range=zmq_port_range,
+                                  logdir=logdir,
+                                  logging_level=logging_level,
+                                  run_id=run_id)
+    except Exception as e:
+        logger.error("MonitoringRouter construction failed.", exc_info=True)
+        comm_q.put(f"Monitoring router construction failed: {e}")
+    else:
+        comm_q.put((router.udp_port, router.zmq_receiver_port))
+
+        router.logger.info("Starting MonitoringRouter in router_starter")
+        try:
+            router.start(priority_msgs, node_msgs, block_msgs, resource_msgs)
+        except Exception as e:
+            router.logger.exception("router.start exception")
+            exception_q.put(('Hub', str(e)))
parsl/tests/site_tests/test_provider.py
CHANGED
@@ -58,7 +58,7 @@ def test_provider():
         logger.info("Job in terminal state")
 
     _, current_jobs = executor._get_block_and_job_ids()
-    # PR 1952 stoped removing scale_in blocks from self.
+    # PR 1952 stoped removing scale_in blocks from self.blocks_to_job_id
     # A new PR will handle removing blocks from self.block
     # this includes failed/completed/canceled blocks
     assert len(current_jobs) == 1, "Expected current_jobs == 1"
parsl/tests/test_monitoring/test_fuzz_zmq.py
CHANGED
@@ -41,11 +41,11 @@ def test_row_counts():
 
     # dig out the interchange port...
     hub_address = parsl.dfk().hub_address
-
+    hub_zmq_port = parsl.dfk().hub_zmq_port
 
     # this will send a string to a new socket connection
     with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
-        s.connect((hub_address,
+        s.connect((hub_address, hub_zmq_port))
         s.sendall(b'fuzzing\r')
 
     # this will send a non-object down the DFK's existing ZMQ connection
parsl/tests/test_monitoring/test_htex_init_blocks_vs_monitoring.py
ADDED
@@ -0,0 +1,85 @@
+import logging
+import os
+import parsl
+import pytest
+import time
+
+from parsl.providers import LocalProvider
+from parsl.channels import LocalChannel
+from parsl.launchers import SimpleLauncher
+
+from parsl.config import Config
+from parsl.executors import HighThroughputExecutor
+from parsl.monitoring import MonitoringHub
+
+
+def fresh_config(run_dir, strategy, db_url):
+    return Config(
+        run_dir=os.fspath(run_dir),
+        executors=[
+            HighThroughputExecutor(
+                label="htex_local",
+                cores_per_worker=1,
+                encrypted=True,
+                provider=LocalProvider(
+                    channel=LocalChannel(),
+                    init_blocks=1,
+                    # min and max are set to 0 to ensure that we don't get
+                    # a block from ongoing strategy scaling, only from
+                    # init_blocks
+                    min_blocks=0,
+                    max_blocks=0,
+                    launcher=SimpleLauncher(),
+                ),
+            )
+        ],
+        strategy=strategy,
+        strategy_period=0.1,
+        monitoring=MonitoringHub(
+            hub_address="localhost",
+            hub_port=55055,
+            logging_endpoint=db_url
+        )
+    )
+
+
+@parsl.python_app
+def this_app():
+    pass
+
+
+@pytest.mark.local
+@pytest.mark.parametrize("strategy", ('none', 'simple', 'htex_auto_scale'))
+def test_row_counts(tmpd_cwd, strategy):
+    # this is imported here rather than at module level because
+    # it isn't available in a plain parsl install, so this module
+    # would otherwise fail to import and break even a basic test
+    # run.
+    import sqlalchemy
+    from sqlalchemy import text
+
+    db_url = f"sqlite:///{tmpd_cwd}/monitoring.db"
+    with parsl.load(fresh_config(tmpd_cwd, strategy, db_url)):
+        dfk = parsl.dfk()
+        run_id = dfk.run_id
+
+        this_app().result()
+
+    parsl.clear()
+
+    engine = sqlalchemy.create_engine(db_url)
+    with engine.begin() as connection:
+
+        binds = {"run_id": run_id}
+
+        result = connection.execute(text("SELECT COUNT(DISTINCT block_id) FROM block WHERE run_id = :run_id"), binds)
+        (c, ) = result.first()
+        assert c == 1, "We should see a single block in this database"
+
+        result = connection.execute(text("SELECT COUNT(*) FROM block WHERE block_id = 0 AND status = 'PENDING' AND run_id = :run_id"), binds)
+        (c, ) = result.first()
+        assert c == 1, "There should be a single pending status"
+
+        result = connection.execute(text("SELECT COUNT(*) FROM block WHERE block_id = 0 AND status = 'CANCELLED' AND run_id = :run_id"), binds)
+        (c, ) = result.first()
+        assert c == 1, "There should be a single cancelled status"
parsl/tests/test_python_apps/test_context_manager.py
ADDED
@@ -0,0 +1,40 @@
+import parsl
+from parsl.tests.configs.local_threads import fresh_config
+import pytest
+from parsl.errors import NoDataFlowKernelError
+from parsl.dataflow.dflow import DataFlowKernel
+
+
+@parsl.python_app
+def square(x):
+    return x * x
+
+
+@parsl.bash_app
+def foo(x, stdout='foo.stdout'):
+    return f"echo {x + 1}"
+
+
+def local_setup():
+    pass
+
+
+def local_teardown():
+    parsl.clear()
+
+
+@pytest.mark.local
+def test_within_context_manger():
+    config = fresh_config()
+    with parsl.load(config=config) as dfk:
+        assert isinstance(dfk, DataFlowKernel)
+
+        bash_future = foo(1)
+        assert bash_future.result() == 0
+
+        with open('foo.stdout', 'r') as f:
+            assert f.read() == "2\n"
+
+    with pytest.raises(NoDataFlowKernelError) as excinfo:
+        square(2).result()
+    assert str(excinfo.value) == "Cannot submit to a DFK that has been cleaned up"
parsl/tests/test_scaling/test_shutdown_scalein.py
ADDED
@@ -0,0 +1,78 @@
+import threading
+
+import pytest
+
+import parsl
+from parsl.channels import LocalChannel
+from parsl.config import Config
+from parsl.executors import HighThroughputExecutor
+from parsl.launchers import SimpleLauncher
+from parsl.providers import LocalProvider
+
+import random
+
+# we need some blocks, but it doesn't matter too much how many, as long
+# as they can all start up and get registered within the try_assert
+# timeout later on.
+BLOCK_COUNT = 3
+
+
+class AccumulatingLocalProvider(LocalProvider):
+    def __init__(self, *args, **kwargs):
+        # Use a list for submitted job IDs because if there are multiple
+        # submissions returning the same job ID, this test should count
+        # those...
+        self.submit_job_ids = []
+
+        # ... but there's no requirement, I think, that cancel must be called
+        # only once per job id. What matters here is that each job ID is
+        # cancelled at least once.
+        self.cancel_job_ids = set()
+
+        super().__init__(*args, **kwargs)
+
+    def submit(self, *args, **kwargs):
+        job_id = super().submit(*args, **kwargs)
+        self.submit_job_ids.append(job_id)
+        return job_id
+
+    def cancel(self, job_ids):
+        self.cancel_job_ids.update(job_ids)
+        return super().cancel(job_ids)
+
+
+@pytest.mark.local
+def test_shutdown_scalein_blocks(tmpd_cwd, try_assert):
+    """
+    This test scales up several blocks, and then checks that they are all
+    scaled in at DFK shutdown.
+    """
+    accumulating_provider = AccumulatingLocalProvider(
+        channel=LocalChannel(),
+        init_blocks=BLOCK_COUNT,
+        min_blocks=0,
+        max_blocks=0,
+        launcher=SimpleLauncher(),
+    )
+
+    htex = HighThroughputExecutor(
+        label="htex_local",
+        cores_per_worker=1,
+        provider=accumulating_provider
+    )
+
+    config = Config(
+        executors=[htex],
+        strategy='none',
+        strategy_period=0.1,
+        run_dir=str(tmpd_cwd)
+    )
+
+    with parsl.load(config):
+        # this will wait for everything to be scaled out fully
+        try_assert(lambda: len(htex.connected_managers()) == BLOCK_COUNT)
+
+    parsl.clear()
+
+    assert len(accumulating_provider.submit_job_ids) == BLOCK_COUNT, f"Exactly {BLOCK_COUNT} blocks should have been launched"
+    assert len(accumulating_provider.cancel_job_ids) == BLOCK_COUNT, f"Exactly {BLOCK_COUNT} blocks should have been scaled in"
parsl/tests/test_shutdown/test_kill_monitoring.py
ADDED
@@ -0,0 +1,65 @@
+import os
+import parsl
+import pytest
+import signal
+import time
+
+from parsl.tests.configs.htex_local_alternate import fresh_config
+
+# This is a very generous upper bound on expected shutdown time of target
+# process after receiving a signal, measured in seconds.
+PERMITTED_SHUTDOWN_TIME_S = 60
+
+
+@parsl.python_app
+def simple_app():
+    return True
+
+
+@pytest.mark.local
+def test_no_kills():
+    """This tests that we can create a monitoring-enabled DFK and shut it down."""
+
+    parsl.load(fresh_config())
+
+    assert parsl.dfk().monitoring is not None, "This test requires monitoring"
+
+    parsl.dfk().cleanup()
+    parsl.clear()
+
+
+@pytest.mark.local
+@pytest.mark.parametrize("sig", [signal.SIGINT, signal.SIGTERM, signal.SIGKILL, signal.SIGQUIT])
+@pytest.mark.parametrize("process_attr", ["router_proc", "dbm_proc"])
+def test_kill_monitoring_helper_process(sig, process_attr, try_assert):
+    """This tests that we can kill a monitoring process and still have successful shutdown.
+    SIGINT emulates some racy behaviour when ctrl-C is pressed: that
+    monitoring processes receive a ctrl-C too, and so the other processes
+    need to be tolerant to monitoring processes arbitrarily exiting.
+    """
+
+    parsl.load(fresh_config())
+
+    dfk = parsl.dfk()
+
+    assert dfk.monitoring is not None, "Monitoring required"
+
+    target_proc = getattr(dfk.monitoring, process_attr)
+
+    assert target_proc is not None, "prereq: target process must exist"
+    assert target_proc.is_alive(), "prereq: target process must be alive"
+
+    target_pid = target_proc.pid
+    assert target_pid is not None, "prereq: target process must have a pid"
+
+    os.kill(target_pid, sig)
+
+    try_assert(lambda: not target_proc.is_alive(), timeout_ms=PERMITTED_SHUTDOWN_TIME_S * 1000)
+
+    # now we have broken one piece of the monitoring system, do some app
+    # execution and then shut down.
+
+    simple_app().result()
+
+    parsl.dfk().cleanup()
+    parsl.clear()
parsl/version.py
CHANGED

parsl/executors/high_throughput/process_worker_pool.py
CHANGED
@@ -335,14 +335,17 @@ class Manager:
                     self.heartbeat_to_incoming()
                     last_beat = time.time()
 
-                if
+                if time.time() > self.drain_time:
                     logger.info("Requesting drain")
                     self.drain_to_incoming()
-                    self.drain_time = None
                     # This will start the pool draining...
                     # Drained exit behaviour does not happen here. It will be
                    # driven by the interchange sending a DRAINED_CODE message.
 
+                    # now set drain time to the far future so we don't send a drain
+                    # message every iteration.
+                    self.drain_time = float('inf')
+
                 poll_duration_s = max(0, next_interesting_event_time - time.time())
                 socks = dict(poller.poll(timeout=poll_duration_s * 1000))
 
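The change above sets `self.drain_time` to `float('inf')` after a drain request instead of clearing it to `None`, so the `time.time() > self.drain_time` comparison keeps working on later iterations and the drain message is not re-sent every loop. A minimal sketch of that one-shot-deadline pattern, with illustrative names only (nothing here is parsl code):

```python
import time

# One-shot deadline inside a polling loop, mirroring the drain change above:
# once the deadline passes, act a single time, then push the deadline to
# infinity so later iterations never trigger it again.
drain_deadline = time.time() + 2.0       # illustrative 2-second drain delay

for _ in range(50):                      # stand-in for the worker pool's poll loop
    if time.time() > drain_deadline:
        print("requesting drain")        # stand-in for self.drain_to_incoming()
        drain_deadline = float('inf')    # comparison stays valid, never fires again
    time.sleep(0.1)
```
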
{parsl-2024.3.18.dist-info → parsl-2024.4.1.dist-info}/METADATA
CHANGED
@@ -1,9 +1,9 @@
 Metadata-Version: 2.1
 Name: parsl
-Version: 2024.
+Version: 2024.4.1
 Summary: Simple data dependent workflows in Python
 Home-page: https://github.com/Parsl/parsl
-Download-URL: https://github.com/Parsl/parsl/archive/2024.
+Download-URL: https://github.com/Parsl/parsl/archive/2024.04.01.tar.gz
 Author: The Parsl Team
 Author-email: parsl@googlegroups.com
 License: Apache 2.0
@@ -54,7 +54,7 @@ Requires-Dist: pyyaml ; extra == 'all'
 Requires-Dist: cffi ; extra == 'all'
 Requires-Dist: jsonschema ; extra == 'all'
 Requires-Dist: proxystore ; extra == 'all'
-Requires-Dist: radical.pilot ; extra == 'all'
+Requires-Dist: radical.pilot ==1.47 ; extra == 'all'
 Provides-Extra: aws
 Requires-Dist: boto3 ; extra == 'aws'
 Provides-Extra: azure
@@ -83,7 +83,7 @@ Requires-Dist: oauth-ssh >=0.9 ; extra == 'oauth_ssh'
 Provides-Extra: proxystore
 Requires-Dist: proxystore ; extra == 'proxystore'
 Provides-Extra: radical-pilot
-Requires-Dist: radical.pilot ; extra == 'radical-pilot'
+Requires-Dist: radical.pilot ==1.47 ; extra == 'radical-pilot'
 Provides-Extra: visualization
 Requires-Dist: pydot ; extra == 'visualization'
 Requires-Dist: networkx <2.6,>=2.5 ; extra == 'visualization'