westpa 2022.10 (cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl)
This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- westpa/__init__.py +14 -0
- westpa/_version.py +21 -0
- westpa/analysis/__init__.py +5 -0
- westpa/analysis/core.py +746 -0
- westpa/analysis/statistics.py +27 -0
- westpa/analysis/trajectories.py +360 -0
- westpa/cli/__init__.py +0 -0
- westpa/cli/core/__init__.py +0 -0
- westpa/cli/core/w_fork.py +152 -0
- westpa/cli/core/w_init.py +230 -0
- westpa/cli/core/w_run.py +77 -0
- westpa/cli/core/w_states.py +212 -0
- westpa/cli/core/w_succ.py +99 -0
- westpa/cli/core/w_truncate.py +59 -0
- westpa/cli/tools/__init__.py +0 -0
- westpa/cli/tools/ploterr.py +506 -0
- westpa/cli/tools/plothist.py +706 -0
- westpa/cli/tools/w_assign.py +596 -0
- westpa/cli/tools/w_bins.py +166 -0
- westpa/cli/tools/w_crawl.py +119 -0
- westpa/cli/tools/w_direct.py +547 -0
- westpa/cli/tools/w_dumpsegs.py +94 -0
- westpa/cli/tools/w_eddist.py +506 -0
- westpa/cli/tools/w_fluxanl.py +378 -0
- westpa/cli/tools/w_ipa.py +833 -0
- westpa/cli/tools/w_kinavg.py +127 -0
- westpa/cli/tools/w_kinetics.py +96 -0
- westpa/cli/tools/w_multi_west.py +414 -0
- westpa/cli/tools/w_ntop.py +213 -0
- westpa/cli/tools/w_pdist.py +515 -0
- westpa/cli/tools/w_postanalysis_matrix.py +82 -0
- westpa/cli/tools/w_postanalysis_reweight.py +53 -0
- westpa/cli/tools/w_red.py +486 -0
- westpa/cli/tools/w_reweight.py +780 -0
- westpa/cli/tools/w_select.py +226 -0
- westpa/cli/tools/w_stateprobs.py +111 -0
- westpa/cli/tools/w_trace.py +599 -0
- westpa/core/__init__.py +0 -0
- westpa/core/_rc.py +673 -0
- westpa/core/binning/__init__.py +55 -0
- westpa/core/binning/_assign.cpython-312-x86_64-linux-gnu.so +0 -0
- westpa/core/binning/assign.py +449 -0
- westpa/core/binning/binless.py +96 -0
- westpa/core/binning/binless_driver.py +54 -0
- westpa/core/binning/binless_manager.py +190 -0
- westpa/core/binning/bins.py +47 -0
- westpa/core/binning/mab.py +427 -0
- westpa/core/binning/mab_driver.py +54 -0
- westpa/core/binning/mab_manager.py +198 -0
- westpa/core/data_manager.py +1694 -0
- westpa/core/extloader.py +74 -0
- westpa/core/h5io.py +995 -0
- westpa/core/kinetics/__init__.py +24 -0
- westpa/core/kinetics/_kinetics.cpython-312-x86_64-linux-gnu.so +0 -0
- westpa/core/kinetics/events.py +147 -0
- westpa/core/kinetics/matrates.py +156 -0
- westpa/core/kinetics/rate_averaging.py +266 -0
- westpa/core/progress.py +218 -0
- westpa/core/propagators/__init__.py +54 -0
- westpa/core/propagators/executable.py +715 -0
- westpa/core/reweight/__init__.py +14 -0
- westpa/core/reweight/_reweight.cpython-312-x86_64-linux-gnu.so +0 -0
- westpa/core/reweight/matrix.py +126 -0
- westpa/core/segment.py +119 -0
- westpa/core/sim_manager.py +830 -0
- westpa/core/states.py +359 -0
- westpa/core/systems.py +93 -0
- westpa/core/textio.py +74 -0
- westpa/core/trajectory.py +330 -0
- westpa/core/we_driver.py +908 -0
- westpa/core/wm_ops.py +43 -0
- westpa/core/yamlcfg.py +391 -0
- westpa/fasthist/__init__.py +34 -0
- westpa/fasthist/__main__.py +110 -0
- westpa/fasthist/_fasthist.cpython-312-x86_64-linux-gnu.so +0 -0
- westpa/mclib/__init__.py +264 -0
- westpa/mclib/__main__.py +28 -0
- westpa/mclib/_mclib.cpython-312-x86_64-linux-gnu.so +0 -0
- westpa/oldtools/__init__.py +4 -0
- westpa/oldtools/aframe/__init__.py +35 -0
- westpa/oldtools/aframe/atool.py +75 -0
- westpa/oldtools/aframe/base_mixin.py +26 -0
- westpa/oldtools/aframe/binning.py +178 -0
- westpa/oldtools/aframe/data_reader.py +560 -0
- westpa/oldtools/aframe/iter_range.py +200 -0
- westpa/oldtools/aframe/kinetics.py +117 -0
- westpa/oldtools/aframe/mcbs.py +146 -0
- westpa/oldtools/aframe/output.py +39 -0
- westpa/oldtools/aframe/plotting.py +90 -0
- westpa/oldtools/aframe/trajwalker.py +126 -0
- westpa/oldtools/aframe/transitions.py +469 -0
- westpa/oldtools/cmds/__init__.py +0 -0
- westpa/oldtools/cmds/w_ttimes.py +358 -0
- westpa/oldtools/files.py +34 -0
- westpa/oldtools/miscfn.py +23 -0
- westpa/oldtools/stats/__init__.py +4 -0
- westpa/oldtools/stats/accumulator.py +35 -0
- westpa/oldtools/stats/edfs.py +129 -0
- westpa/oldtools/stats/mcbs.py +89 -0
- westpa/tools/__init__.py +33 -0
- westpa/tools/binning.py +472 -0
- westpa/tools/core.py +340 -0
- westpa/tools/data_reader.py +159 -0
- westpa/tools/dtypes.py +31 -0
- westpa/tools/iter_range.py +198 -0
- westpa/tools/kinetics_tool.py +340 -0
- westpa/tools/plot.py +283 -0
- westpa/tools/progress.py +17 -0
- westpa/tools/selected_segs.py +154 -0
- westpa/tools/wipi.py +751 -0
- westpa/trajtree/__init__.py +4 -0
- westpa/trajtree/_trajtree.cpython-312-x86_64-linux-gnu.so +0 -0
- westpa/trajtree/trajtree.py +117 -0
- westpa/westext/__init__.py +0 -0
- westpa/westext/adaptvoronoi/__init__.py +3 -0
- westpa/westext/adaptvoronoi/adaptVor_driver.py +214 -0
- westpa/westext/hamsm_restarting/__init__.py +3 -0
- westpa/westext/hamsm_restarting/example_overrides.py +35 -0
- westpa/westext/hamsm_restarting/restart_driver.py +1165 -0
- westpa/westext/stringmethod/__init__.py +11 -0
- westpa/westext/stringmethod/fourier_fitting.py +69 -0
- westpa/westext/stringmethod/string_driver.py +253 -0
- westpa/westext/stringmethod/string_method.py +306 -0
- westpa/westext/weed/BinCluster.py +180 -0
- westpa/westext/weed/ProbAdjustEquil.py +100 -0
- westpa/westext/weed/UncertMath.py +247 -0
- westpa/westext/weed/__init__.py +10 -0
- westpa/westext/weed/weed_driver.py +182 -0
- westpa/westext/wess/ProbAdjust.py +101 -0
- westpa/westext/wess/__init__.py +6 -0
- westpa/westext/wess/wess_driver.py +207 -0
- westpa/work_managers/__init__.py +57 -0
- westpa/work_managers/core.py +396 -0
- westpa/work_managers/environment.py +134 -0
- westpa/work_managers/mpi.py +318 -0
- westpa/work_managers/processes.py +187 -0
- westpa/work_managers/serial.py +28 -0
- westpa/work_managers/threads.py +79 -0
- westpa/work_managers/zeromq/__init__.py +20 -0
- westpa/work_managers/zeromq/core.py +641 -0
- westpa/work_managers/zeromq/node.py +131 -0
- westpa/work_managers/zeromq/work_manager.py +526 -0
- westpa/work_managers/zeromq/worker.py +320 -0
- westpa-2022.10.dist-info/AUTHORS +22 -0
- westpa-2022.10.dist-info/LICENSE +21 -0
- westpa-2022.10.dist-info/METADATA +183 -0
- westpa-2022.10.dist-info/RECORD +150 -0
- westpa-2022.10.dist-info/WHEEL +6 -0
- westpa-2022.10.dist-info/entry_points.txt +29 -0
- westpa-2022.10.dist-info/top_level.txt +1 -0
westpa/work_managers/zeromq/node.py
@@ -0,0 +1,131 @@
+'''
+Created on Jun 11, 2015
+
+@author: mzwier
+'''
+
+import logging
+
+log = logging.getLogger(__name__)
+
+from .core import ZMQCore, Message, PassiveMultiTimer, IsNode
+
+import zmq
+from zmq.devices import ThreadProxy
+
+
+class ZMQNode(ZMQCore, IsNode):
+    def __init__(self, upstream_rr_endpoint, upstream_ann_endpoint, n_local_workers=None):
+        ZMQCore.__init__(self)
+        IsNode.__init__(self, n_local_workers)
+
+        self.upstream_rr_endpoint = upstream_rr_endpoint
+        self.upstream_ann_endpoint = upstream_ann_endpoint
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_traceback):
+        return False
+
+    def run(self):
+        self.startup()
+
+    @property
+    def is_master(self):
+        return False
+
+    def comm_loop(self):
+        self.context = zmq.Context.instance()
+        # or else the proxies create sockets in a different context
+
+        self.context.linger = 100
+        # So we don't have to destroy the context at the end of the loop
+
+        rr_proxy = ThreadProxy(zmq.ROUTER, zmq.DEALER)
+
+        # We use push/pull so (1) we don't miss any announcements
+        # and (2) we don't have to deal with subscription messages
+        ann_proxy = ThreadProxy(zmq.SUB, zmq.PUB, zmq.PUSH)
+        ann_monitor = self.context.socket(zmq.PULL)
+
+        # Not monitoring request/reply streams for two reasons:
+        # (1) we'd need to strip identity frames to interpret the messages
+        # (2) interpreting the messages means we'd have to decode (unpickle) and then re-encode
+        #     all of the data flying through here, which seems like a waste just to see if
+        #     clients start up. We miss the edge failure case where one node's workers
+        #     start up but another's fail. Seems much less likely than all workers
+        #     failing to start up, which would be caught by the master
+
+        ann_mon_endpoint = 'inproc://{:x}'.format(id(ann_monitor))
+        ann_monitor.bind(ann_mon_endpoint)
+
+        rr_proxy.bind_in(self.downstream_rr_endpoint)
+        if self.local_rr_endpoint:
+            rr_proxy.bind_in(self.local_rr_endpoint)
+        self.log.debug('connecting upstream_rr_endpoint = {!r}'.format(self.upstream_rr_endpoint))
+        rr_proxy.connect_out(self.upstream_rr_endpoint)
+
+        ann_proxy.bind_out(self.downstream_ann_endpoint)
+        if self.local_ann_endpoint:
+            ann_proxy.bind_out(self.local_ann_endpoint)
+        ann_proxy.connect_in(self.upstream_ann_endpoint)
+        self.log.debug('connecting upstream_ann_endpoint = {!r}'.format(self.upstream_ann_endpoint))
+        ann_proxy.setsockopt_in(zmq.SUBSCRIBE, b'')
+        ann_proxy.connect_mon(ann_mon_endpoint)
+
+        rr_proxy.start()
+        ann_proxy.start()
+
+        ann_monitor.connect(ann_mon_endpoint)
+
+        inproc_socket = self.context.socket(zmq.SUB)
+        inproc_socket.setsockopt(zmq.SUBSCRIBE, b'')
+        inproc_socket.bind(self.inproc_endpoint)
+
+        timers = PassiveMultiTimer()
+        timers.add_timer('master_beacon', self.master_beacon_period)
+        timers.add_timer('startup_timeout', self.startup_timeout)
+        timers.reset()
+
+        self.log.debug('master beacon period: {!r}'.format(self.master_beacon_period))
+        self.log.debug('startup timeout: {!r}'.format(self.startup_timeout))
+
+        peer_found = False
+
+        poller = zmq.Poller()
+        poller.register(ann_monitor, zmq.POLLIN)
+        poller.register(inproc_socket, zmq.POLLIN)
+        try:
+            while True:
+                poll_results = dict(poller.poll((timers.next_expiration_in() or 0.001) * 1000))
+
+                if inproc_socket in poll_results:
+                    msgs = self.recv_all(ann_monitor, validate=False)
+                    if Message.SHUTDOWN in (msg.message for msg in msgs):
+                        self.log.debug('shutdown received')
+                        break
+
+                if ann_monitor in poll_results:
+                    msgs = self.recv_all(ann_monitor, validate=False)
+                    message_tags = {msg.message for msg in msgs}
+                    if Message.SHUTDOWN in message_tags:
+                        self.log.debug('shutdown received')
+                        break
+                    if not peer_found and (Message.MASTER_BEACON in message_tags or Message.TASKS_AVAILABLE in message_tags):
+                        peer_found = True
+                        timers.remove_timer('startup_timeout')
+
+                if not peer_found and timers.expired('startup_timeout'):
+                    self.log.error('startup phase elapsed with no contact from peer; shutting down')
+                    break
+
+        finally:
+            self.log.debug('exiting')
+            self.context = None
+            self.remove_ipc_endpoints()
+            IsNode.shutdown(self)
+
+    def startup(self):
+        IsNode.startup(self)
+        super().startup()
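For orientation, here is a minimal sketch (not part of the diff) of how this class might be driven standalone, using only the constructor and run() shown above. The endpoint strings and worker count are hypothetical; in a real WESTPA run they are supplied by ZMQWorkManager.from_environ (see the next file), which reads them from the master's host-info file (written when the zmq_write_host_info option is set) and also assigns the downstream endpoints before startup.

# Hypothetical standalone launch of a communications node (illustrative sketch).
from westpa.work_managers.zeromq.node import ZMQNode

node = ZMQNode(
    upstream_rr_endpoint='tcp://10.0.0.1:23811',   # assumed master task/result endpoint
    upstream_ann_endpoint='tcp://10.0.0.1:23812',  # assumed master announcement endpoint
    n_local_workers=4,                             # spawn four local workers on this host
)
# from_environ normally picks these; assumed values shown for completeness
node.downstream_rr_endpoint = 'tcp://*:23813'
node.downstream_ann_endpoint = 'tcp://*:23814'
node.run()  # run() calls startup(), which launches the local workers and the comm loop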
westpa/work_managers/zeromq/work_manager.py
@@ -0,0 +1,526 @@
+import json
+import logging
+import multiprocessing
+import socket
+
+from .core import ZMQCore, Message, Task, Result, ZMQWorkerMissing, ZMQWMEnvironmentError, IsNode
+from .core import PassiveMultiTimer
+from .core import randport
+from .worker import ZMQWorker
+from .node import ZMQNode
+
+import westpa.work_managers as work_managers
+from westpa.work_managers import WorkManager, WMFuture
+
+import zmq
+
+from collections import deque
+
+log = logging.getLogger(__name__)
+
+
+class ZMQWorkManager(ZMQCore, WorkManager, IsNode):
+    @classmethod
+    def add_wm_args(cls, parser, wmenv=None):
+        if wmenv is None:
+            wmenv = work_managers.environment.default_env
+
+        wm_group = parser.add_argument_group('options for ZeroMQ ("zmq") work manager (master or node)')
+
+        wm_group.add_argument(
+            wmenv.arg_flag('zmq_mode'),
+            metavar='MODE',
+            choices=('master', 'node', 'server', 'client'),
+            help='Operate as a master (server) or a node (workers/client). '
+            + '"server" is a deprecated synonym for "master" and "client" is a '
+            + 'deprecated synonym for "node".',
+        )
+        wm_group.add_argument(
+            wmenv.arg_flag('zmq_comm_mode'),
+            metavar='COMM_MODE',
+            choices=('ipc', 'tcp'),
+            help='Use the given communication mode -- TCP or IPC (Unix-domain) -- sockets '
+            + 'for communication within a node. IPC (the default) may be more '
+            + 'efficient but is not available on (exceptionally rare) systems '
+            + 'without node-local storage (e.g. /tmp); on such systems, TCP may be used instead.',
+        )
+        wm_group.add_argument(
+            wmenv.arg_flag('zmq_write_host_info'),
+            metavar='INFO_FILE',
+            help='Store hostname and port information needed to connect to this instance '
+            + 'in INFO_FILE. This allows the master and nodes assisting in '
+            + 'coordinating the communication of other nodes to choose ports '
+            + 'randomly. Downstream nodes read this file with '
+            + wmenv.arg_flag('zmq_read_host_info')
+            + ' and know where how to connect.',
+        )
+        wm_group.add_argument(
+            wmenv.arg_flag('zmq_read_host_info'),
+            metavar='INFO_FILE',
+            help='Read hostname and port information needed to connect to the master '
+            + '(or other coordinating node) from INFO_FILE. '
+            + 'This allows the master and nodes assisting in '
+            + 'coordinating the communication of other nodes to choose ports '
+            + 'randomly, writing that information with '
+            + wmenv.arg_flag('zmq_write_host_info')
+            + ' for this instance to read.',
+        )
+        wm_group.add_argument(
+            wmenv.arg_flag('zmq_upstream_rr_endpoint'),
+            metavar='ENDPOINT',
+            help='ZeroMQ endpoint to which to send request/response (task and result) ' + 'traffic toward the master.',
+        )
+        wm_group.add_argument(
+            wmenv.arg_flag('zmq_upstream_ann_endpoint'),
+            metavar='ENDPOINT',
+            help='ZeroMQ endpoint on which to receive announcement '
+            + '(heartbeat and shutdown notification) traffic from the master.',
+        )
+        wm_group.add_argument(
+            wmenv.arg_flag('zmq_downstream_rr_endpoint'),
+            metavar='ENDPOINT',
+            help='ZeroMQ endpoint on which to listen for request/response ' + '(task and result) traffic from subsidiary workers.',
+        )
+        wm_group.add_argument(
+            wmenv.arg_flag('zmq_downstream_ann_endpoint'),
+            metavar='ENDPOINT',
+            help='ZeroMQ endpoint on which to send announcement ' + '(heartbeat and shutdown notification) traffic toward workers.',
+        )
+        wm_group.add_argument(
+            wmenv.arg_flag('zmq_master_heartbeat'),
+            metavar='MASTER_HEARTBEAT',
+            type=float,
+            help='Every MASTER_HEARTBEAT seconds, the master announces its presence ' + 'to workers.',
+        )
+        wm_group.add_argument(
+            wmenv.arg_flag('zmq_worker_heartbeat'),
+            metavar='WORKER_HEARTBEAT',
+            type=float,
+            help='Every WORKER_HEARTBEAT seconds, workers announce their presence ' + 'to the master.',
+        )
+        wm_group.add_argument(
+            wmenv.arg_flag('zmq_timeout_factor'),
+            metavar='FACTOR',
+            type=float,
+            help='Scaling factor for heartbeat timeouts. '
+            + "If the master doesn't hear from a worker in WORKER_HEARTBEAT*FACTOR, "
+            + "the worker is assumed to have crashed. If a worker doesn't hear from "
+            + "the master in MASTER_HEARTBEAT*FACTOR seconds, the master is assumed "
+            + "to have crashed. Both cases result in shutdown. ",
+        )
+        wm_group.add_argument(
+            wmenv.arg_flag('zmq_startup_timeout'),
+            metavar='STARTUP_TIMEOUT',
+            type=float,
+            help='Amount of time (in seconds) to wait for communication between '
+            + 'the master and at least one worker. This may need to be changed '
+            + 'on very large, heavily-loaded computer systems that start all processes '
+            + 'simultaneously. ',
+        )
+        wm_group.add_argument(
+            wmenv.arg_flag('zmq_shutdown_timeout'),
+            metavar='SHUTDOWN_TIMEOUT',
+            type=float,
+            help='Amount of time (in seconds) to wait for workers to shut down.',
+        )
+
+    @classmethod
+    def from_environ(cls, wmenv=None):
+        if wmenv is None:
+            wmenv = work_managers.environment.default_env
+
+        # determine mode
+        mode = wmenv.get_val('zmq_mode', 'master').lower()
+        if mode in {'master', 'server'}:
+            mode = 'master'
+        elif mode in {'node', 'client'}:
+            mode = 'node'
+        else:
+            raise ValueError('invalid ZMQ work manager mode {!r}'.format(mode))
+
+        # determine number of workers
+        # 0 with mode=='master' is a dedicated master
+        # 0 with mode=='node' is a dedicated communications process (former ZMQRouter)
+        n_workers = wmenv.get_val('n_workers', multiprocessing.cpu_count(), int)
+
+        # We set this at the class level, because outside of testing, a node either
+        # can support IPC or it can't, and there is no obvious need (currently)
+        # to support both modes on an instance-by-instance basis
+        comm_mode = wmenv.get_val('zmq_comm_mode', cls.default_comm_mode)
+        ZMQWorkManager.internal_transport = comm_mode
+        ZMQWorker.internal_transport = comm_mode
+        ZMQNode.internal_transport = comm_mode
+
+        write_host_info = wmenv.get_val('zmq_write_host_info')
+        read_host_info = wmenv.get_val('zmq_read_host_info')
+        master_heartbeat = wmenv.get_val('zmq_master_heartbeat', cls.default_master_heartbeat, float)
+        worker_heartbeat = wmenv.get_val('zmq_worker_heartbeat', cls.default_worker_heartbeat, float)
+        timeout_factor = wmenv.get_val('zmq_timeout_factor', cls.default_timeout_factor, float)
+        startup_timeout = wmenv.get_val('zmq_startup_timeout', cls.default_startup_timeout, float)
+
+        if mode == 'master':
+            instance = ZMQWorkManager(n_workers)
+        else:  # mode =='node'
+            upstream_info = {}
+            if read_host_info:
+                upstream_info.update(cls.read_host_info(read_host_info))
+            log.debug('upstream_info: {!r}'.format(upstream_info))
+
+            upstream_rr_endpoint = wmenv.get_val('zmq_upstream_rr_endpoint', upstream_info.get('rr_endpoint'))
+            upstream_ann_endpoint = wmenv.get_val('zmq_upstream_ann_endpoint', upstream_info.get('ann_endpoint'))
+
+            if not (upstream_rr_endpoint and upstream_ann_endpoint):
+                raise ZMQWMEnvironmentError('at least one upstream endpoint unspecified')
+
+            # expand hostnames, if present, to IP addresses
+            # reject wildcard hostnames, which is a logic error (can't connect to a host
+            # without specifying an address)
+            upstream_rr_endpoint = cls.canonicalize_endpoint(upstream_rr_endpoint, allow_wildcard_host=False)
+            upstream_ann_endpoint = cls.canonicalize_endpoint(upstream_ann_endpoint, allow_wildcard_host=False)
+
+            log.debug('upstream_rr_endpoint = {}'.format(upstream_rr_endpoint))
+            log.debug('upstream_ann_endpoint = {}'.format(upstream_ann_endpoint))
+
+            instance = ZMQNode(
+                upstream_ann_endpoint=upstream_ann_endpoint, upstream_rr_endpoint=upstream_rr_endpoint, n_local_workers=n_workers
+            )
+
+        # Both server and node bind downstream endpoints, so that users get fan-out communications
+        # "for free" when starting up a computational node
+        downstream_rr_endpoint = cls.canonicalize_endpoint(
+            wmenv.get_val('zmq_downstream_rr_endpoint', 'tcp://*:{}'.format(randport()))
+        )
+        downstream_ann_endpoint = cls.canonicalize_endpoint(
+            wmenv.get_val('zmq_downstream_ann_endpoint', 'tcp://*:{}'.format(randport()))
+        )
+        instance.downstream_rr_endpoint = downstream_rr_endpoint
+        instance.downstream_ann_endpoint = downstream_ann_endpoint
+
+        instance.master_beacon_period = master_heartbeat
+        instance.worker_beacon_period = worker_heartbeat
+        instance.timeout_factor = timeout_factor
+        instance.startup_timeout = startup_timeout
+
+        assert isinstance(instance, IsNode)
+        for worker in instance.local_workers:
+            worker.master_beacon_period = master_heartbeat
+            worker.worker_beacon_period = worker_heartbeat
+            worker.timeout_factor = timeout_factor
+            worker.startup_timeout = startup_timeout
+
+        # We always write host info (since we are always either master or node)
+        # we choose not to in the special case that read_host_info is '' but not None
+        # (None implies nothing found on command line or in environment variables, but ''
+        # implies that it was found somewhere but it is empty)
+        if write_host_info is not None and write_host_info != '':
+            instance.write_host_info(write_host_info)
+
+        log.debug('prepared {!r} with:'.format(instance))
+        log.debug('n_workers = {}'.format(n_workers))
+        for attr in (
+            'master_beacon_period',
+            'worker_beacon_period',
+            'startup_timeout',
+            'timeout_factor',
+            'downstream_rr_endpoint',
+            'downstream_ann_endpoint',
+        ):
+            log.debug('{} = {!r}'.format(attr, getattr(instance, attr)))
+
+        return instance
+
+    @classmethod
+    def read_host_info(cls, filename):
+        return json.load(open(filename, 'rt'))
+
+    @classmethod
+    def canonicalize_endpoint(cls, endpoint, allow_wildcard_host=True):
+        if endpoint.startswith('ipc://'):
+            return endpoint
+        elif endpoint.startswith('tcp://'):
+            fields = endpoint[6:].split(':')
+
+            # get IP address
+            if fields[0] != '*':
+                ipaddr = socket.gethostbyname(fields[0])
+            else:
+                if allow_wildcard_host:
+                    ipaddr = '*'
+                else:
+                    raise ValueError('wildcard host not permitted')
+
+            # get/generate port
+            try:
+                port = fields[1]
+            except IndexError:
+                # no port given; select one
+                port = randport()
+            else:
+                port = int(fields[1])
+
+            return 'tcp://{}:{}'.format(ipaddr, port)
+        else:
+            raise ValueError('unrecognized/unsupported endpoint: {!r}'.format(endpoint))
+
+    def __init__(self, n_local_workers=1):
+        ZMQCore.__init__(self)
+        WorkManager.__init__(self)
+        IsNode.__init__(self, n_local_workers)
+
+        # Futures indexed by task ID
+        self.futures = dict()
+
+        # Tasks pending distribution
+        self.outgoing_tasks = deque()
+
+        # Tasks being processed by workers (indexed by worker_id)
+        self.assigned_tasks = dict()
+
+        # Identity information and last contact from workers
+        self.worker_information = dict()  # indexed by worker_id
+        self.worker_timeouts = PassiveMultiTimer()  # indexed by worker_id
+
+        # Number of seconds between checks to see which workers have timed out
+        self.worker_timeout_check = 5.0
+
+        # Amount of time to wait for stray requests to arrive so that workers shut down properly
+        self.shutdown_timeout = 0.5
+
+        self.master_id = self.node_id
+
+    @property
+    def n_workers(self):
+        return len(self.worker_information)
+
+    def submit(self, fn, args=None, kwargs=None):
+        if self.futures is None:
+            # We are shutting down
+            raise ZMQWMEnvironmentError('work manager is shutting down')
+        future = WMFuture()
+        task = Task(fn, args or (), kwargs or {}, task_id=future.task_id)
+        self.futures[task.task_id] = future
+        self.outgoing_tasks.append(task)
+        # Wake up the communications loop (if necessary) to announce new tasks
+        self.send_inproc_message(Message.TASKS_AVAILABLE)
+        return future
+
+    def submit_many(self, tasks):
+        if self.futures is None:
+            # We are shutting down
+            raise ZMQWMEnvironmentError('work manager is shutting down')
+        futures = []
+        for fn, args, kwargs in tasks:
+            future = WMFuture()
+            task = Task(fn, args, kwargs, task_id=future.task_id)
+            self.futures[task.task_id] = future
+            self.outgoing_tasks.append(task)
+            futures.append(future)
+        # Wake up the communications loop (if necessary) to announce new tasks
+        self.send_inproc_message(Message.TASKS_AVAILABLE)
+        return futures
+
+    def send_message(self, socket, message, payload=None, flags=0):
+        message = Message(message, payload)
+        message.master_id = self.node_id
+        super().send_message(socket, message, payload, flags)
+
+    def handle_result(self, socket, msg):
+        self.send_ack(socket, msg)
+        with self.message_validation(msg):
+            assert msg.message == Message.RESULT
+            assert isinstance(msg.payload, Result)
+            assert msg.payload.task_id in self.futures
+            assert self.assigned_tasks[msg.src_id].task_id == msg.payload.task_id
+
+        result = msg.payload
+
+        future = self.futures.pop(result.task_id)
+        del self.assigned_tasks[msg.src_id]
+        if result.exception is not None:
+            future._set_exception(result.exception, result.traceback)
+        else:
+            future._set_result(result.result)
+
+    def handle_task_request(self, socket, msg):
+        if not self.outgoing_tasks:
+            # No tasks available
+            self.send_nak(socket, msg)
+        else:
+            task = self.outgoing_tasks.popleft()
+
+            worker_id = msg.src_id
+            self.assigned_tasks[worker_id] = task
+
+            self.send_message(socket, Message.TASK, task)
+
+    def update_worker_information(self, msg):
+        if msg.message == Message.IDENTIFY:
+            with self.message_validation(msg):
+                assert isinstance(msg.payload, dict)
+                self.worker_information[msg.src_id] = msg.payload
+        else:
+            self.worker_information[msg.src_id] = {}
+
+        try:
+            self.worker_timeouts.reset(msg.src_id)
+        except KeyError:
+            self.worker_timeouts.add_timer(msg.src_id, self.worker_beacon_period * self.timeout_factor)
+
+    def check_workers(self):
+        expired_worker_ids = self.worker_timeouts.which_expired()
+        for expired_worker_id in expired_worker_ids:
+            try:
+                worker_description = '{!s} ({!s})'.format(
+                    expired_worker_id, self.worker_information[expired_worker_id]['description']
+                )
+            except KeyError:
+                worker_description = str(expired_worker_id)
+
+            self.log.error('no contact from worker {}: {}'.format(expired_worker_id, worker_description))
+
+            self.remove_worker(expired_worker_id)
+
+    def remove_worker(self, worker_id):
+        try:
+            expired_task = self.assigned_tasks.pop(worker_id)
+        except KeyError:
+            pass
+        else:
+            self.log.error('aborting task {!r} running on expired worker {!s}'.format(expired_task, worker_id))
+            future = self.futures.pop(expired_task.task_id)
+            future._set_exception(ZMQWorkerMissing('worker running this task disappeared'))
+        del self.worker_information[worker_id]
+
+    def shutdown_clear_tasks(self):
+        '''Abort pending tasks with error on shutdown.'''
+        while self.futures:
+            task_id, future = self.futures.popitem()
+            future._set_exception(ZMQWMEnvironmentError('work manager shut down during task'))
+        self.futures = None
+
+    def comm_loop(self):
+        self.context = zmq.Context()
+
+        rr_socket = self.context.socket(zmq.REP)
+        ann_socket = self.context.socket(zmq.PUB)
+
+        for endpoint in (self.local_rr_endpoint, self.downstream_rr_endpoint):
+            if endpoint:
+                rr_socket.bind(endpoint)
+
+        for endpoint in (self.local_ann_endpoint, self.downstream_ann_endpoint):
+            if endpoint:
+                ann_socket.bind(endpoint)
+
+        inproc_socket = self.context.socket(zmq.SUB)
+        inproc_socket.setsockopt(zmq.SUBSCRIBE, b'')
+        inproc_socket.bind(self.inproc_endpoint)
+
+        poller = zmq.Poller()
+        poller.register(inproc_socket, zmq.POLLIN)
+        poller.register(rr_socket, zmq.POLLIN)
+
+        timers = PassiveMultiTimer()
+        timers.add_timer('tasks_avail', self.master_beacon_period)
+        timers.add_timer('master_beacon', self.master_beacon_period)
+        timers.add_timer('worker_timeout_check', self.worker_beacon_period * self.timeout_factor)
+        timers.add_timer('startup_timeout', self.startup_timeout)
+        timers.reset()
+
+        self.log.debug('master beacon period: {!r}'.format(self.master_beacon_period))
+        self.log.debug('startup timeout: {!r}'.format(self.startup_timeout))
+
+        peer_found = False
+
+        try:
+            # Send a master alive message immediately; it will get discarded if necessary
+            self.send_message(ann_socket, Message.MASTER_BEACON)
+
+            while True:
+                # If a timer is already expired, next_expiration_in() will return 0, which
+                # zeromq interprets as infinite wait; so instead we select a 1 ms wait in this
+                # case.
+                timeout = (timers.next_expiration_in() or 0.001) * 1000
+                # Wake up every second to check for signals
+                timeout = min(timeout, 1000)
+                poll_results = dict(poller.poll(timeout))
+
+                if inproc_socket in poll_results:
+                    msgs = self.recv_all(inproc_socket, validate=False)
+                    # Check for shutdown; do nothing else if shutdown is signalled
+                    if Message.SHUTDOWN in (msg.message for msg in msgs):
+                        self.log.debug('shutdown received')
+                        break
+                    # Check for any other wake-up messages
+                    for msg in msgs:
+                        if msg.message == Message.TASKS_AVAILABLE:
+                            self.send_message(ann_socket, Message.TASKS_AVAILABLE)
+
+                if rr_socket in poll_results:
+                    msg = self.recv_message(rr_socket)
+                    self.update_worker_information(msg)
+
+                    if msg.message == Message.TASK_REQUEST:
+                        self.handle_task_request(rr_socket, msg)
+                    elif msg.message == Message.RESULT:
+                        self.handle_result(rr_socket, msg)
+                    else:
+                        self.send_ack(rr_socket, msg)
+
+                if self.worker_information:
+                    peer_found = True
+
+                if timers.expired('tasks_avail'):
+                    if self.outgoing_tasks:
+                        self.send_message(ann_socket, Message.TASKS_AVAILABLE)
+                    timers.reset('tasks_avail')
+
+                if timers.expired('master_beacon'):
+                    self.send_message(ann_socket, Message.MASTER_BEACON)
+                    timers.reset('master_beacon')
+
+                if peer_found and timers.expired('worker_timeout_check'):
+                    self.check_workers()
+                    if not self.worker_information:
+                        self.log.error('all workers disappeared; exiting')
+                        break
+                    timers.reset('worker_timeout_check')
+
+                if not peer_found and timers.expired('startup_timeout'):
+                    self.log.error('startup phase elapsed with no contact from workers; shutting down')
+                    while self.futures:
+                        future = self.futures.popitem()[1]
+                        future._set_exception(ZMQWorkerMissing('no workers available'))
+                    break
+
+            # Post a shutdown message
+            self.log.debug('sending shutdown on ann_socket')
+            self.send_message(ann_socket, Message.SHUTDOWN)
+            poller.unregister(inproc_socket)
+
+            # Clear tasks
+            self.shutdown_clear_tasks()
+
+            # Clear incoming queue of requests, to let clients exit request/reply states gracefully
+            # (clients will still timeout in these states if necessary)
+            timers.add_timer('shutdown', self.shutdown_timeout)
+            while not timers.expired('shutdown'):
+                poll_results = dict(poller.poll(self.shutdown_timeout / 10 * 1000))
+                if rr_socket in poll_results:
+                    msg = self.recv_message(rr_socket)
+                    self.send_nak(rr_socket, msg)
+
+        finally:
+            self.context.destroy(linger=1)
+            self.context = None
+            self.remove_ipc_endpoints()
+
+    def startup(self):
+        IsNode.startup(self)
+        super().startup()
+
+    def shutdown(self):
+        self.signal_shutdown()
+        IsNode.shutdown(self)
+        self.join()
+        super().shutdown()
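For comparison, a minimal master-side sketch (not part of the diff) using only the API visible in this hunk: the constructor, startup(), submit_many(), and shutdown(). The task function and worker count are hypothetical, and WMFuture.get_result() is assumed from westpa.work_managers; in production the instance would normally be built by ZMQWorkManager.from_environ() so that endpoints, heartbeats, and host-info files are configured consistently.

# Hypothetical master-side usage (illustrative sketch; assumes default configuration).
from westpa.work_managers.zeromq.work_manager import ZMQWorkManager

def square(x):  # hypothetical task function; must be picklable to cross process boundaries
    return x * x

wm = ZMQWorkManager(n_local_workers=2)  # master with two local workers
wm.startup()
try:
    # submit_many() takes (fn, args, kwargs) triples and returns one WMFuture per task
    futures = wm.submit_many([(square, (i,), {}) for i in range(10)])
    results = [future.get_result() for future in futures]  # get_result() assumed on WMFuture
    print(results)
finally:
    wm.shutdown()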