parsl 2024.3.11__py3-none-any.whl → 2024.3.25__py3-none-any.whl
This diff compares the contents of two publicly released versions of this package, as published to a supported public registry. It is provided for informational purposes only.
- parsl/addresses.py +3 -1
- parsl/config.py +4 -0
- parsl/dataflow/dflow.py +14 -5
- parsl/executors/base.py +10 -0
- parsl/executors/high_throughput/executor.py +12 -0
- parsl/executors/high_throughput/interchange.py +30 -8
- parsl/executors/high_throughput/manager_record.py +1 -0
- parsl/executors/high_throughput/process_worker_pool.py +41 -5
- parsl/executors/status_handling.py +2 -9
- parsl/executors/taskvine/executor.py +24 -3
- parsl/executors/taskvine/manager.py +1 -0
- parsl/executors/taskvine/manager_config.py +3 -4
- parsl/executors/workqueue/executor.py +19 -0
- parsl/jobs/error_handlers.py +1 -1
- parsl/jobs/job_status_poller.py +8 -7
- parsl/launchers/launchers.py +6 -6
- parsl/log_utils.py +8 -4
- parsl/monitoring/db_manager.py +4 -2
- parsl/monitoring/monitoring.py +30 -264
- parsl/monitoring/router.py +208 -0
- parsl/monitoring/visualization/plots/default/workflow_plots.py +3 -0
- parsl/monitoring/visualization/views.py +2 -1
- parsl/providers/cluster_provider.py +1 -3
- parsl/tests/configs/user_opts.py +2 -1
- parsl/tests/test_htex/test_drain.py +78 -0
- parsl/tests/test_monitoring/test_app_names.py +86 -0
- parsl/tests/test_monitoring/test_fuzz_zmq.py +2 -2
- parsl/tests/test_monitoring/test_htex_init_blocks_vs_monitoring.py +82 -0
- parsl/tests/test_python_apps/test_context_manager.py +40 -0
- parsl/tests/test_scaling/test_scale_down_htex_auto_scale.py +1 -10
- parsl/tests/test_shutdown/__init__.py +0 -0
- parsl/tests/test_shutdown/test_kill_monitoring.py +65 -0
- parsl/utils.py +2 -2
- parsl/version.py +1 -1
- {parsl-2024.3.11.data → parsl-2024.3.25.data}/scripts/process_worker_pool.py +41 -5
- {parsl-2024.3.11.dist-info → parsl-2024.3.25.dist-info}/METADATA +4 -4
- {parsl-2024.3.11.dist-info → parsl-2024.3.25.dist-info}/RECORD +43 -36
- {parsl-2024.3.11.data → parsl-2024.3.25.data}/scripts/exec_parsl_function.py +0 -0
- {parsl-2024.3.11.data → parsl-2024.3.25.data}/scripts/parsl_coprocess.py +0 -0
- {parsl-2024.3.11.dist-info → parsl-2024.3.25.dist-info}/LICENSE +0 -0
- {parsl-2024.3.11.dist-info → parsl-2024.3.25.dist-info}/WHEEL +0 -0
- {parsl-2024.3.11.dist-info → parsl-2024.3.25.dist-info}/entry_points.txt +0 -0
- {parsl-2024.3.11.dist-info → parsl-2024.3.25.dist-info}/top_level.txt +0 -0
parsl/monitoring/router.py
ADDED
@@ -0,0 +1,208 @@
+from __future__ import annotations
+
+import os
+import socket
+import time
+import pickle
+import logging
+import zmq
+
+import queue
+
+from parsl.log_utils import set_file_logger
+from parsl.process_loggers import wrap_with_logs
+from parsl.utils import setproctitle
+
+from parsl.monitoring.message_type import MessageType
+from parsl.monitoring.types import AddressedMonitoringMessage, TaggedMonitoringMessage
+from typing import Optional, Tuple, Union
+
+
+logger = logging.getLogger(__name__)
+
+
+class MonitoringRouter:
+
+    def __init__(self,
+                 *,
+                 hub_address: str,
+                 udp_port: Optional[int] = None,
+                 zmq_port_range: Tuple[int, int] = (55050, 56000),
+
+                 monitoring_hub_address: str = "127.0.0.1",
+                 logdir: str = ".",
+                 run_id: str,
+                 logging_level: int = logging.INFO,
+                 atexit_timeout: int = 3  # in seconds
+                 ):
+        """ Initializes a monitoring configuration class.
+
+        Parameters
+        ----------
+        hub_address : str
+             The ip address at which the workers will be able to reach the Hub.
+        udp_port : int
+             The specific port at which workers will be able to reach the Hub via UDP. Default: None
+        zmq_port_range : tuple(int, int)
+             The MonitoringHub picks ports at random from the range which will be used by Hub.
+             Default: (55050, 56000)
+        logdir : str
+             Parsl log directory paths. Logs and temp files go here. Default: '.'
+        logging_level : int
+             Logging level as defined in the logging module. Default: logging.INFO
+        atexit_timeout : float, optional
+             The amount of time in seconds to terminate the hub without receiving any messages, after the last dfk workflow message is received.
+
+        """
+        os.makedirs(logdir, exist_ok=True)
+        self.logger = set_file_logger("{}/monitoring_router.log".format(logdir),
+                                      name="monitoring_router",
+                                      level=logging_level)
+        self.logger.debug("Monitoring router starting")
+
+        self.hub_address = hub_address
+        self.atexit_timeout = atexit_timeout
+        self.run_id = run_id
+
+        self.loop_freq = 10.0  # milliseconds
+
+        # Initialize the UDP socket
+        self.udp_sock = socket.socket(socket.AF_INET,
+                                      socket.SOCK_DGRAM,
+                                      socket.IPPROTO_UDP)
+
+        # We are trying to bind to all interfaces with 0.0.0.0
+        if not udp_port:
+            self.udp_sock.bind(('0.0.0.0', 0))
+            self.udp_port = self.udp_sock.getsockname()[1]
+        else:
+            self.udp_port = udp_port
+            try:
+                self.udp_sock.bind(('0.0.0.0', self.udp_port))
+            except Exception as e:
+                raise RuntimeError(f"Could not bind to udp_port {udp_port} because: {e}")
+        self.udp_sock.settimeout(self.loop_freq / 1000)
+        self.logger.info("Initialized the UDP socket on 0.0.0.0:{}".format(self.udp_port))
+
+        self._context = zmq.Context()
+        self.zmq_receiver_channel = self._context.socket(zmq.DEALER)
+        self.zmq_receiver_channel.setsockopt(zmq.LINGER, 0)
+        self.zmq_receiver_channel.set_hwm(0)
+        self.zmq_receiver_channel.RCVTIMEO = int(self.loop_freq)  # in milliseconds
+        self.logger.debug("hub_address: {}. zmq_port_range {}".format(hub_address, zmq_port_range))
+        self.zmq_receiver_port = self.zmq_receiver_channel.bind_to_random_port("tcp://*",
+                                                                               min_port=zmq_port_range[0],
+                                                                               max_port=zmq_port_range[1])
+
+    def start(self,
+              priority_msgs: "queue.Queue[AddressedMonitoringMessage]",
+              node_msgs: "queue.Queue[AddressedMonitoringMessage]",
+              block_msgs: "queue.Queue[AddressedMonitoringMessage]",
+              resource_msgs: "queue.Queue[AddressedMonitoringMessage]") -> None:
+        try:
+            router_keep_going = True
+            while router_keep_going:
+                try:
+                    data, addr = self.udp_sock.recvfrom(2048)
+                    resource_msg = pickle.loads(data)
+                    self.logger.debug("Got UDP Message from {}: {}".format(addr, resource_msg))
+                    resource_msgs.put((resource_msg, addr))
+                except socket.timeout:
+                    pass
+
+                try:
+                    dfk_loop_start = time.time()
+                    while time.time() - dfk_loop_start < 1.0:  # TODO make configurable
+                        # note that nothing checks that msg really is of the annotated type
+                        msg: TaggedMonitoringMessage
+                        msg = self.zmq_receiver_channel.recv_pyobj()
+
+                        assert isinstance(msg, tuple), "ZMQ Receiver expects only tuples, got {}".format(msg)
+                        assert len(msg) >= 1, "ZMQ Receiver expects tuples of length at least 1, got {}".format(msg)
+                        assert len(msg) == 2, "ZMQ Receiver expects message tuples of exactly length 2, got {}".format(msg)
+
+                        msg_0: AddressedMonitoringMessage
+                        msg_0 = (msg, 0)
+
+                        if msg[0] == MessageType.NODE_INFO:
+                            msg[1]['run_id'] = self.run_id
+                            node_msgs.put(msg_0)
+                        elif msg[0] == MessageType.RESOURCE_INFO:
+                            resource_msgs.put(msg_0)
+                        elif msg[0] == MessageType.BLOCK_INFO:
+                            block_msgs.put(msg_0)
+                        elif msg[0] == MessageType.TASK_INFO:
+                            priority_msgs.put(msg_0)
+                        elif msg[0] == MessageType.WORKFLOW_INFO:
+                            priority_msgs.put(msg_0)
+                            if 'exit_now' in msg[1] and msg[1]['exit_now']:
+                                router_keep_going = False
+                        else:
+                            # There is a type: ignore here because if msg[0]
+                            # is of the correct type, this code is unreachable,
+                            # but there is no verification that the message
+                            # received from zmq_receiver_channel.recv_pyobj() is actually
+                            # of that type.
+                            self.logger.error("Discarding message "  # type: ignore[unreachable]
+                                              f"from interchange with unknown type {msg[0].value}")
+                except zmq.Again:
+                    pass
+                except Exception:
+                    # This will catch malformed messages. What happens if the
+                    # channel is broken in such a way that it always raises
+                    # an exception? Looping on this would maybe be the wrong
+                    # thing to do.
+                    self.logger.warning("Failure processing a ZMQ message", exc_info=True)
+
+            self.logger.info("Monitoring router draining")
+            last_msg_received_time = time.time()
+            while time.time() - last_msg_received_time < self.atexit_timeout:
+                try:
+                    data, addr = self.udp_sock.recvfrom(2048)
+                    msg = pickle.loads(data)
+                    self.logger.debug("Got UDP Message from {}: {}".format(addr, msg))
+                    resource_msgs.put((msg, addr))
+                    last_msg_received_time = time.time()
+                except socket.timeout:
+                    pass
+
+            self.logger.info("Monitoring router finishing normally")
+        finally:
+            self.logger.info("Monitoring router finished")
+
+
+@wrap_with_logs
+def router_starter(comm_q: "queue.Queue[Union[Tuple[int, int], str]]",
+                   exception_q: "queue.Queue[Tuple[str, str]]",
+                   priority_msgs: "queue.Queue[AddressedMonitoringMessage]",
+                   node_msgs: "queue.Queue[AddressedMonitoringMessage]",
+                   block_msgs: "queue.Queue[AddressedMonitoringMessage]",
+                   resource_msgs: "queue.Queue[AddressedMonitoringMessage]",
+
+                   hub_address: str,
+                   udp_port: Optional[int],
+                   zmq_port_range: Tuple[int, int],
+
+                   logdir: str,
+                   logging_level: int,
+                   run_id: str) -> None:
+    setproctitle("parsl: monitoring router")
+    try:
+        router = MonitoringRouter(hub_address=hub_address,
+                                  udp_port=udp_port,
+                                  zmq_port_range=zmq_port_range,
+                                  logdir=logdir,
+                                  logging_level=logging_level,
+                                  run_id=run_id)
+    except Exception as e:
+        logger.error("MonitoringRouter construction failed.", exc_info=True)
+        comm_q.put(f"Monitoring router construction failed: {e}")
+    else:
+        comm_q.put((router.udp_port, router.zmq_receiver_port))
+
+        router.logger.info("Starting MonitoringRouter in router_starter")
+        try:
+            router.start(priority_msgs, node_msgs, block_msgs, resource_msgs)
+        except Exception as e:
+            router.logger.exception("router.start exception")
+            exception_q.put(('Hub', str(e)))
parsl/monitoring/visualization/plots/default/workflow_plots.py
CHANGED
@@ -27,6 +27,9 @@ gantt_colors = {'unsched': 'rgb(240, 240, 240)',
 
 def task_gantt_plot(df_task, df_status, time_completed=None):
 
+    if df_task.empty:
+        return None
+
     # if the workflow is not recorded as completed, then assume
     # that tasks should continue in their last state until now,
     # rather than the workflow end time.
parsl/monitoring/visualization/views.py
CHANGED
@@ -8,7 +8,8 @@ from parsl.monitoring.visualization.models import Workflow, Task, Status, db
 
 from parsl.monitoring.visualization.plots.default.workflow_plots import task_gantt_plot, task_per_app_plot, workflow_dag_plot
 from parsl.monitoring.visualization.plots.default.task_plots import time_series_memory_per_task_plot
-from parsl.monitoring.visualization.plots.default.workflow_resource_plots import resource_distribution_plot, resource_efficiency, worker_efficiency
+from parsl.monitoring.visualization.plots.default.workflow_resource_plots import (resource_distribution_plot,
+                                                                                  resource_efficiency, worker_efficiency)
 
 dummy = True
 
parsl/providers/cluster_provider.py
CHANGED
@@ -91,7 +91,7 @@ class ClusterProvider(ExecutionProvider):
               - configs (dict) : configs that get pushed into the template
 
         Returns:
-              -
+              - None
 
         Raises:
               SchedulerMissingArgs : If template is missing args
@@ -117,8 +117,6 @@ class ClusterProvider(ExecutionProvider):
             logger.error("Uncategorized error: %s", e)
             raise e
 
-        return True
-
     @abstractmethod
     def _status(self):
         pass
parsl/tests/configs/user_opts.py
CHANGED
@@ -60,7 +60,8 @@ user_opts = {
     #     'username': OSG_USERNAME,
     #     'script_dir': '/home/{}/parsl_scripts'.format(OSG_USERNAME),
     #     'scheduler_options': "",
-    #     'worker_init' : 'module load python/3.5.2; python3 -m venv parsl_env; source parsl_env/bin/activate; python3 -m pip install parsl==0.5.2'
+    #     'worker_init' : 'module load python/3.5.2; python3 -m venv parsl_env;
+    #                      source parsl_env/bin/activate; python3 -m pip install parsl==0.5.2'
     # },
     # 'swan': {
     #     'username': SWAN_USERNAME,
parsl/tests/test_htex/test_drain.py
ADDED
@@ -0,0 +1,78 @@
+import parsl
+import pytest
+import time
+
+from parsl.providers import LocalProvider
+from parsl.channels import LocalChannel
+from parsl.launchers import SimpleLauncher
+
+from parsl.config import Config
+from parsl.executors import HighThroughputExecutor
+
+# this constant is used to scale some durations that happen
+# based around the expected drain period: the drain period
+# is TIME_CONST seconds, and the single executed task will
+# last twice that many number of seconds.
+TIME_CONST = 1
+
+
+def local_config():
+    return Config(
+        executors=[
+            HighThroughputExecutor(
+                label="htex_local",
+                drain_period=TIME_CONST,
+                worker_debug=True,
+                cores_per_worker=1,
+                encrypted=True,
+                provider=LocalProvider(
+                    channel=LocalChannel(),
+                    init_blocks=1,
+                    min_blocks=0,
+                    max_blocks=0,
+                    launcher=SimpleLauncher(),
+                ),
+            )
+        ],
+        strategy='none',
+    )
+
+
+@parsl.python_app
+def f(n):
+    import time
+    time.sleep(n)
+
+
+@pytest.mark.local
+def test_drain(try_assert):
+
+    htex = parsl.dfk().executors['htex_local']
+
+    # wait till we have a block running...
+
+    try_assert(lambda: len(htex.connected_managers()) == 1)
+
+    managers = htex.connected_managers()
+    assert managers[0]['active'], "The manager should be active"
+    assert not managers[0]['draining'], "The manager should not be draining"
+
+    fut = f(TIME_CONST * 2)
+
+    time.sleep(TIME_CONST)
+
+    # this assert should happen *very fast* after the above delay...
+    try_assert(lambda: htex.connected_managers()[0]['draining'], timeout_ms=500)
+
+    # and the test task should still be running...
+    assert not fut.done(), "The test task should still be running"
+
+    fut.result()
+
+    # and now we should see the manager disappear...
+    # ... with strategy='none', this should be coming from draining but
+    # that information isn't immediately obvious from the absence in
+    # connected managers.
+    # As with the above draining assert, this should happen very fast after
+    # the task ends.
+    try_assert(lambda: len(htex.connected_managers()) == 0, timeout_ms=500)
parsl/tests/test_monitoring/test_app_names.py
ADDED
@@ -0,0 +1,86 @@
+"""Tests monitoring records app name under various decoration patterns.
+"""
+
+import os
+import parsl
+import pytest
+import time
+
+from parsl.tests.configs.htex_local_alternate import fresh_config
+
+
+@parsl.python_app
+def regular_decorated_app():
+    return 5
+
+
+@pytest.mark.local
+def get_regular_decorated_app():
+    return regular_decorated_app
+
+
+def for_decoration_later():
+    return 77
+
+
+def get_for_decoration_later():
+    return parsl.python_app(for_decoration_later)
+
+
+def get_decorated_closure():
+
+    r = 53
+
+    @parsl.python_app
+    def decorated_closure():
+        return r
+
+    return decorated_closure
+
+
+@pytest.mark.local
+@pytest.mark.parametrize("get_app,expected_name,expected_result",
+                         [(get_regular_decorated_app, "regular_decorated_app", 5),
+                          (get_for_decoration_later, "for_decoration_later", 77),
+                          (get_decorated_closure, "decorated_closure", 53)
+                          ])
+def test_app_name(get_app, expected_name, expected_result, tmpd_cwd):
+
+    # this is imported here rather than at module level because
+    # it isn't available in a plain parsl install, so this module
+    # would otherwise fail to import and break even a basic test
+    # run.
+    import sqlalchemy
+
+    c = fresh_config()
+    c.run_dir = tmpd_cwd
+    c.monitoring.logging_endpoint = f"sqlite:///{tmpd_cwd}/monitoring.db"
+    parsl.load(c)
+
+    app = get_app()
+    assert app().result() == expected_result
+
+    parsl.dfk().cleanup()
+    parsl.clear()
+
+    engine = sqlalchemy.create_engine(c.monitoring.logging_endpoint)
+    with engine.begin() as connection:
+
+        def count_rows(table: str):
+            result = connection.execute(f"SELECT COUNT(*) FROM {table}")
+            (c, ) = result.first()
+            return c
+
+        # one workflow...
+        assert count_rows("workflow") == 1
+
+        # ... with one task ...
+        assert count_rows("task") == 1
+
+        # ... that was tried once ...
+        assert count_rows("try") == 1
+
+        # ... and has the expected name.
+        result = connection.execute("SELECT task_func_name FROM task")
+        (c, ) = result.first()
+        assert c == expected_name
parsl/tests/test_monitoring/test_fuzz_zmq.py
CHANGED
@@ -41,11 +41,11 @@ def test_row_counts():
 
     # dig out the interchange port...
     hub_address = parsl.dfk().hub_address
-    hub_interchange_port = parsl.dfk().hub_interchange_port
+    hub_zmq_port = parsl.dfk().hub_zmq_port
 
     # this will send a string to a new socket connection
     with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
-        s.connect((hub_address, hub_interchange_port))
+        s.connect((hub_address, hub_zmq_port))
         s.sendall(b'fuzzing\r')
 
     # this will send a non-object down the DFK's existing ZMQ connection
parsl/tests/test_monitoring/test_htex_init_blocks_vs_monitoring.py
ADDED
@@ -0,0 +1,82 @@
+import logging
+import os
+import parsl
+import pytest
+import time
+
+from parsl.providers import LocalProvider
+from parsl.channels import LocalChannel
+from parsl.launchers import SimpleLauncher
+
+from parsl.config import Config
+from parsl.executors import HighThroughputExecutor
+from parsl.monitoring import MonitoringHub
+
+
+def fresh_config(run_dir, strategy, db_url):
+    return Config(
+        run_dir=os.fspath(run_dir),
+        executors=[
+            HighThroughputExecutor(
+                label="htex_local",
+                cores_per_worker=1,
+                encrypted=True,
+                provider=LocalProvider(
+                    channel=LocalChannel(),
+                    init_blocks=1,
+                    # min and max are set to 0 to ensure that we don't get
+                    # a block from ongoing strategy scaling, only from
+                    # init_blocks
+                    min_blocks=0,
+                    max_blocks=0,
+                    launcher=SimpleLauncher(),
+                ),
+            )
+        ],
+        strategy=strategy,
+        strategy_period=0.1,
+        monitoring=MonitoringHub(
+            hub_address="localhost",
+            hub_port=55055,
+            logging_endpoint=db_url
+        )
+    )
+
+
+@parsl.python_app
+def this_app():
+    pass
+
+
+@pytest.mark.local
+@pytest.mark.parametrize("strategy", ('none', 'simple', 'htex_auto_scale'))
+def test_row_counts(tmpd_cwd, strategy):
+    # this is imported here rather than at module level because
+    # it isn't available in a plain parsl install, so this module
+    # would otherwise fail to import and break even a basic test
+    # run.
+    import sqlalchemy
+    from sqlalchemy import text
+
+    db_url = f"sqlite:///{tmpd_cwd}/monitoring.db"
+    parsl.load(fresh_config(tmpd_cwd, strategy, db_url))
+
+    this_app().result()
+
+    parsl.dfk().cleanup()
+    parsl.clear()
+
+    engine = sqlalchemy.create_engine(db_url)
+    with engine.begin() as connection:
+
+        result = connection.execute(text("SELECT COUNT(DISTINCT block_id) FROM block"))
+        (c, ) = result.first()
+        assert c == 1, "We should see a single block in this database"
+
+        result = connection.execute(text("SELECT COUNT(*) FROM block WHERE block_id = 0 AND status = 'PENDING'"))
+        (c, ) = result.first()
+        assert c == 1, "There should be a single pending status"
+
+        result = connection.execute(text("SELECT COUNT(*) FROM block WHERE block_id = 0 AND status = 'CANCELLED'"))
+        (c, ) = result.first()
+        assert c == 1, "There should be a single cancelled status"
parsl/tests/test_python_apps/test_context_manager.py
ADDED
@@ -0,0 +1,40 @@
+import parsl
+from parsl.tests.configs.local_threads import fresh_config
+import pytest
+from parsl.errors import NoDataFlowKernelError
+
+
+@parsl.python_app
+def square(x):
+    return x * x
+
+
+@parsl.bash_app
+def foo(x, stdout='foo.stdout'):
+    return f"echo {x + 1}"
+
+
+def local_setup():
+    pass
+
+
+def local_teardown():
+    parsl.clear()
+
+
+@pytest.mark.local
+def test_within_context_manger():
+    config = fresh_config()
+    with parsl.load(config=config):
+        py_future = square(2)
+        assert py_future.result() == 4
+
+        bash_future = foo(1)
+        assert bash_future.result() == 0
+
+        with open('foo.stdout', 'r') as f:
+            assert f.read() == "2\n"
+
+    with pytest.raises(NoDataFlowKernelError) as excinfo:
+        square(2).result()
+    assert str(excinfo.value) == "Cannot submit to a DFK that has been cleaned up"
parsl/tests/test_scaling/test_scale_down_htex_auto_scale.py
CHANGED
@@ -37,6 +37,7 @@ def local_config():
         ],
         max_idletime=0.5,
         strategy='htex_auto_scale',
+        strategy_period=0.1
     )
 
 
@@ -62,16 +63,6 @@ def waiting_app(ident: int, outputs=(), inputs=()):
 def test_scale_out(tmpd_cwd, try_assert):
     dfk = parsl.dfk()
 
-    # reconfigure scaling strategy to run faster than usual. This allows
-    # this test to complete faster - at time of writing 27s with default
-    # 5s strategy, vs XXXX with 0.5s strategy.
-
-    # check this attribute still exists, in the presence of ongoing
-    # development, so we have some belief that setting it will not be
-    # setting a now-ignored parameter.
-    assert hasattr(dfk.job_status_poller, 'interval')
-    dfk.job_status_poller.interval = 0.1
-
     num_managers = len(dfk.executors['htex_local'].connected_managers())
 
     assert num_managers == 0, "Expected 0 managers at start"
parsl/tests/test_shutdown/__init__.py
File without changes
parsl/tests/test_shutdown/test_kill_monitoring.py
ADDED
@@ -0,0 +1,65 @@
+import os
+import parsl
+import pytest
+import signal
+import time
+
+from parsl.tests.configs.htex_local_alternate import fresh_config
+
+# This is a very generous upper bound on expected shutdown time of target
+# process after receiving a signal, measured in seconds.
+PERMITTED_SHUTDOWN_TIME_S = 60
+
+
+@parsl.python_app
+def simple_app():
+    return True
+
+
+@pytest.mark.local
+def test_no_kills():
+    """This tests that we can create a monitoring-enabled DFK and shut it down."""
+
+    parsl.load(fresh_config())
+
+    assert parsl.dfk().monitoring is not None, "This test requires monitoring"
+
+    parsl.dfk().cleanup()
+    parsl.clear()
+
+
+@pytest.mark.local
+@pytest.mark.parametrize("sig", [signal.SIGINT, signal.SIGTERM, signal.SIGKILL, signal.SIGQUIT])
+@pytest.mark.parametrize("process_attr", ["router_proc", "dbm_proc"])
+def test_kill_monitoring_helper_process(sig, process_attr, try_assert):
+    """This tests that we can kill a monitoring process and still have successful shutdown.
+    SIGINT emulates some racy behaviour when ctrl-C is pressed: that
+    monitoring processes receive a ctrl-C too, and so the other processes
+    need to be tolerant to monitoring processes arbitrarily exiting.
+    """
+
+    parsl.load(fresh_config())
+
+    dfk = parsl.dfk()
+
+    assert dfk.monitoring is not None, "Monitoring required"
+
+    target_proc = getattr(dfk.monitoring, process_attr)
+
+    assert target_proc is not None, "prereq: target process must exist"
+    assert target_proc.is_alive(), "prereq: target process must be alive"
+
+    target_pid = target_proc.pid
+    assert target_pid is not None, "prereq: target process must have a pid"
+
+    os.kill(target_pid, sig)
+
+    try_assert(lambda: not target_proc.is_alive(), timeout_ms=PERMITTED_SHUTDOWN_TIME_S * 1000)
+
+    # now we have broken one piece of the monitoring system, do some app
+    # execution and then shut down.
+
+    simple_app().result()
+
+    parsl.dfk().cleanup()
+    parsl.clear()
parsl/utils.py
CHANGED
@@ -296,12 +296,12 @@ class Timer:
 
     """
 
-    def __init__(self, callback: Callable, *args: Any, interval: int = 5, name: Optional[str] = None) -> None:
+    def __init__(self, callback: Callable, *args: Any, interval: Union[float, int] = 5, name: Optional[str] = None) -> None:
         """Initialize the Timer object.
         We start the timer thread here
 
         KWargs:
-             - interval (int) : number of seconds between callback events
+             - interval (int or float) : number of seconds between callback events
             - name (str) : a base name to use when naming the started thread
        """
 
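With `interval` widened to `Union[float, int]`, `Timer` callbacks can fire at sub-second periods, which is what lets the tests above set `strategy_period=0.1`. A small sketch (tick counts are approximate, since `Timer` is a simple polling thread):

```python
import time

from parsl.utils import Timer

ticks = []
# the callback is invoked roughly every `interval` seconds on a
# background thread until close() is called
t = Timer(lambda: ticks.append(time.time()), interval=0.25, name="demo")
time.sleep(1.0)
t.close()
print(f"fired {len(ticks)} times")  # roughly four ticks at a 0.25s interval
```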
parsl/version.py
CHANGED