westpa 2022.12__cp313-cp313-macosx_10_13_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of westpa might be problematic. Click here for more details.
- westpa/__init__.py +14 -0
- westpa/_version.py +21 -0
- westpa/analysis/__init__.py +5 -0
- westpa/analysis/core.py +746 -0
- westpa/analysis/statistics.py +27 -0
- westpa/analysis/trajectories.py +360 -0
- westpa/cli/__init__.py +0 -0
- westpa/cli/core/__init__.py +0 -0
- westpa/cli/core/w_fork.py +152 -0
- westpa/cli/core/w_init.py +230 -0
- westpa/cli/core/w_run.py +77 -0
- westpa/cli/core/w_states.py +212 -0
- westpa/cli/core/w_succ.py +99 -0
- westpa/cli/core/w_truncate.py +68 -0
- westpa/cli/tools/__init__.py +0 -0
- westpa/cli/tools/ploterr.py +506 -0
- westpa/cli/tools/plothist.py +706 -0
- westpa/cli/tools/w_assign.py +596 -0
- westpa/cli/tools/w_bins.py +166 -0
- westpa/cli/tools/w_crawl.py +119 -0
- westpa/cli/tools/w_direct.py +547 -0
- westpa/cli/tools/w_dumpsegs.py +94 -0
- westpa/cli/tools/w_eddist.py +506 -0
- westpa/cli/tools/w_fluxanl.py +376 -0
- westpa/cli/tools/w_ipa.py +833 -0
- westpa/cli/tools/w_kinavg.py +127 -0
- westpa/cli/tools/w_kinetics.py +96 -0
- westpa/cli/tools/w_multi_west.py +414 -0
- westpa/cli/tools/w_ntop.py +213 -0
- westpa/cli/tools/w_pdist.py +515 -0
- westpa/cli/tools/w_postanalysis_matrix.py +82 -0
- westpa/cli/tools/w_postanalysis_reweight.py +53 -0
- westpa/cli/tools/w_red.py +491 -0
- westpa/cli/tools/w_reweight.py +780 -0
- westpa/cli/tools/w_select.py +226 -0
- westpa/cli/tools/w_stateprobs.py +111 -0
- westpa/cli/tools/w_trace.py +599 -0
- westpa/core/__init__.py +0 -0
- westpa/core/_rc.py +673 -0
- westpa/core/binning/__init__.py +55 -0
- westpa/core/binning/_assign.cpython-313-darwin.so +0 -0
- westpa/core/binning/assign.py +455 -0
- westpa/core/binning/binless.py +96 -0
- westpa/core/binning/binless_driver.py +54 -0
- westpa/core/binning/binless_manager.py +190 -0
- westpa/core/binning/bins.py +47 -0
- westpa/core/binning/mab.py +506 -0
- westpa/core/binning/mab_driver.py +54 -0
- westpa/core/binning/mab_manager.py +198 -0
- westpa/core/data_manager.py +1694 -0
- westpa/core/extloader.py +74 -0
- westpa/core/h5io.py +995 -0
- westpa/core/kinetics/__init__.py +24 -0
- westpa/core/kinetics/_kinetics.cpython-313-darwin.so +0 -0
- westpa/core/kinetics/events.py +147 -0
- westpa/core/kinetics/matrates.py +156 -0
- westpa/core/kinetics/rate_averaging.py +266 -0
- westpa/core/progress.py +218 -0
- westpa/core/propagators/__init__.py +54 -0
- westpa/core/propagators/executable.py +719 -0
- westpa/core/reweight/__init__.py +14 -0
- westpa/core/reweight/_reweight.cpython-313-darwin.so +0 -0
- westpa/core/reweight/matrix.py +126 -0
- westpa/core/segment.py +119 -0
- westpa/core/sim_manager.py +835 -0
- westpa/core/states.py +359 -0
- westpa/core/systems.py +93 -0
- westpa/core/textio.py +74 -0
- westpa/core/trajectory.py +330 -0
- westpa/core/we_driver.py +910 -0
- westpa/core/wm_ops.py +43 -0
- westpa/core/yamlcfg.py +391 -0
- westpa/fasthist/__init__.py +34 -0
- westpa/fasthist/_fasthist.cpython-313-darwin.so +0 -0
- westpa/mclib/__init__.py +271 -0
- westpa/mclib/__main__.py +28 -0
- westpa/mclib/_mclib.cpython-313-darwin.so +0 -0
- westpa/oldtools/__init__.py +4 -0
- westpa/oldtools/aframe/__init__.py +35 -0
- westpa/oldtools/aframe/atool.py +75 -0
- westpa/oldtools/aframe/base_mixin.py +26 -0
- westpa/oldtools/aframe/binning.py +178 -0
- westpa/oldtools/aframe/data_reader.py +560 -0
- westpa/oldtools/aframe/iter_range.py +200 -0
- westpa/oldtools/aframe/kinetics.py +117 -0
- westpa/oldtools/aframe/mcbs.py +153 -0
- westpa/oldtools/aframe/output.py +39 -0
- westpa/oldtools/aframe/plotting.py +90 -0
- westpa/oldtools/aframe/trajwalker.py +126 -0
- westpa/oldtools/aframe/transitions.py +469 -0
- westpa/oldtools/cmds/__init__.py +0 -0
- westpa/oldtools/cmds/w_ttimes.py +361 -0
- westpa/oldtools/files.py +34 -0
- westpa/oldtools/miscfn.py +23 -0
- westpa/oldtools/stats/__init__.py +4 -0
- westpa/oldtools/stats/accumulator.py +35 -0
- westpa/oldtools/stats/edfs.py +129 -0
- westpa/oldtools/stats/mcbs.py +96 -0
- westpa/tools/__init__.py +33 -0
- westpa/tools/binning.py +472 -0
- westpa/tools/core.py +340 -0
- westpa/tools/data_reader.py +159 -0
- westpa/tools/dtypes.py +31 -0
- westpa/tools/iter_range.py +198 -0
- westpa/tools/kinetics_tool.py +340 -0
- westpa/tools/plot.py +283 -0
- westpa/tools/progress.py +17 -0
- westpa/tools/selected_segs.py +154 -0
- westpa/tools/wipi.py +751 -0
- westpa/trajtree/__init__.py +4 -0
- westpa/trajtree/_trajtree.cpython-313-darwin.so +0 -0
- westpa/trajtree/trajtree.py +117 -0
- westpa/westext/__init__.py +0 -0
- westpa/westext/adaptvoronoi/__init__.py +3 -0
- westpa/westext/adaptvoronoi/adaptVor_driver.py +214 -0
- westpa/westext/hamsm_restarting/__init__.py +3 -0
- westpa/westext/hamsm_restarting/example_overrides.py +35 -0
- westpa/westext/hamsm_restarting/restart_driver.py +1165 -0
- westpa/westext/stringmethod/__init__.py +11 -0
- westpa/westext/stringmethod/fourier_fitting.py +69 -0
- westpa/westext/stringmethod/string_driver.py +253 -0
- westpa/westext/stringmethod/string_method.py +306 -0
- westpa/westext/weed/BinCluster.py +180 -0
- westpa/westext/weed/ProbAdjustEquil.py +100 -0
- westpa/westext/weed/UncertMath.py +247 -0
- westpa/westext/weed/__init__.py +10 -0
- westpa/westext/weed/weed_driver.py +192 -0
- westpa/westext/wess/ProbAdjust.py +101 -0
- westpa/westext/wess/__init__.py +6 -0
- westpa/westext/wess/wess_driver.py +217 -0
- westpa/work_managers/__init__.py +57 -0
- westpa/work_managers/core.py +396 -0
- westpa/work_managers/environment.py +134 -0
- westpa/work_managers/mpi.py +318 -0
- westpa/work_managers/processes.py +187 -0
- westpa/work_managers/serial.py +28 -0
- westpa/work_managers/threads.py +79 -0
- westpa/work_managers/zeromq/__init__.py +20 -0
- westpa/work_managers/zeromq/core.py +641 -0
- westpa/work_managers/zeromq/node.py +131 -0
- westpa/work_managers/zeromq/work_manager.py +526 -0
- westpa/work_managers/zeromq/worker.py +320 -0
- westpa-2022.12.dist-info/AUTHORS +22 -0
- westpa-2022.12.dist-info/LICENSE +21 -0
- westpa-2022.12.dist-info/METADATA +193 -0
- westpa-2022.12.dist-info/RECORD +149 -0
- westpa-2022.12.dist-info/WHEEL +6 -0
- westpa-2022.12.dist-info/entry_points.txt +29 -0
- westpa-2022.12.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,641 @@
|
|
|
1
|
+
'''
|
|
2
|
+
Created on May 29, 2015
|
|
3
|
+
|
|
4
|
+
@author: mzwier
|
|
5
|
+
'''
|
|
6
|
+
|
|
7
|
+
import collections
|
|
8
|
+
import contextlib
|
|
9
|
+
import errno
|
|
10
|
+
import logging
|
|
11
|
+
import json
|
|
12
|
+
import multiprocessing
|
|
13
|
+
import os
|
|
14
|
+
import re
|
|
15
|
+
import signal
|
|
16
|
+
import socket
|
|
17
|
+
import sys
|
|
18
|
+
import tempfile
|
|
19
|
+
import threading
|
|
20
|
+
import time
|
|
21
|
+
import traceback
|
|
22
|
+
import uuid
|
|
23
|
+
|
|
24
|
+
import zmq
|
|
25
|
+
import numpy as np
|
|
26
|
+
|
|
27
|
+
# The master asks its workers for a status report at this interval (seconds);
# the request doubles as notice to the workers that the master is still alive.
DEFAULT_STATUS_POLL = 10

# Silence from the master or from a worker (as appropriate) lasting longer
# than these intervals is taken to mean a crash, and we shut down.
MASTER_CRASH_TIMEOUT = 6 * DEFAULT_STATUS_POLL
WORKER_CRASH_TIMEOUT = 3 * DEFAULT_STATUS_POLL

log = logging.getLogger(__name__)

# Signal number -> signal name. Names are visited in reverse alphabetical order
# and later entries overwrite earlier ones, so when several names share one
# number the alphabetically first name is the one kept.
signames = {
    number: signame
    for signame, number in sorted(vars(signal).items(), key=lambda entry: entry[0], reverse=True)
    if signame.startswith('SIG') and not signame.startswith('SIG_')
}


DEFAULT_LINGER = 1
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def randport(address='127.0.0.1'):
    '''Select a random unused TCP port number on the given address.

    A socket is bound to port 0 on ``address`` so that the operating system
    picks a free port; the port number is read back and the socket is closed
    again before returning. The socket is closed even if ``bind()`` fails.

    Returns the port number as an integer. Errors from ``bind()`` propagate
    to the caller unchanged.'''
    # The port is only known to have been free at the time of the call; nothing
    # holds it open once this function returns.
    with socket.socket() as sock:
        sock.bind((address, 0))
        return sock.getsockname()[1]
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
class ZMQWMError(RuntimeError):
    '''Root of the exception hierarchy for failures of the ZeroMQ work manager itself.'''
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
class ZMQWorkerMissing(ZMQWMError):
    '''Raised to indicate that a worker died or disappeared while processing a task.'''
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
class ZMQWMEnvironmentError(ZMQWMError):
    '''An error in the environment in which the ZeroMQ work manager is running,
    such as a master/worker ID mismatch.'''
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
class ZMQWMTimeout(ZMQWMEnvironmentError):
    '''A timeout of a sort that indicates that a master or worker has failed or never started.'''
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
class Message:
    '''A single message exchanged between nodes of the ZeroMQ work manager.

    A message carries four attributes: ``message`` (one of the string
    identifiers defined on this class), ``payload`` (arbitrary data going with
    it), ``master_id`` and ``src_id``.'''

    SHUTDOWN = 'shutdown'

    ACK = 'ok'
    NAK = 'no'
    IDENTIFY = 'identify'  # Two-way identification (a reply must be an IDENTIFY message)
    TASKS_AVAILABLE = 'tasks_available'
    TASK_REQUEST = 'task_request'

    MASTER_BEACON = 'master_alive'
    RECONFIGURE_TIMEOUT = 'reconfigure_timeout'

    TASK = 'task'
    RESULT = 'result'

    # Announcements that are coalesced on the message identifier alone
    # (payload ignored) by coalesce_announcements().
    idempotent_announcement_messages = {SHUTDOWN, TASKS_AVAILABLE, MASTER_BEACON}

    def __init__(self, message=None, payload=None, master_id=None, src_id=None):
        '''Build a message, or copy one.

        If ``message`` is itself a ``Message``, all four attributes are copied
        from it and the other arguments are ignored. Otherwise ``message`` is
        the message identifier and the remaining arguments are stored as given.'''
        if isinstance(message, Message):
            self.message = message.message
            self.payload = message.payload
            self.master_id = message.master_id
            self.src_id = message.src_id
        else:
            self.master_id = master_id
            self.src_id = src_id
            self.message = message
            self.payload = payload

    def __repr__(self):
        return '<{!s} master_id={master_id!s} src_id={src_id!s} message={message!r} payload={payload!r}>'.format(
            self.__class__.__name__, **self.__dict__
        )

    @classmethod
    def coalesce_announcements(cls, messages):
        '''Collapse duplicate announcements, returning a list of messages.

        Messages whose identifier is in ``idempotent_announcement_messages``
        are keyed on the identifier alone; all others are keyed on
        ``(identifier, payload)``, so their payloads must be hashable. For
        each key the last message seen is kept, at the position where that
        key first appeared. An empty input yields an empty list.'''
        d = collections.OrderedDict()
        for msg in messages:
            if msg.message in cls.idempotent_announcement_messages:
                key = msg.message
            else:
                key = (msg.message, msg.payload)
            d[key] = msg
        # The surviving messages are the values of the dict, not of the loop variable.
        coalesced = list(d.values())
        log.debug('coalesced {} announcements into {}'.format(len(messages), len(coalesced)))
        return coalesced
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
# String identifiers for the two kinds of timeout. They are not used elsewhere
# in this module; presumably they serve as timer keys in the master/worker
# code -- TODO confirm against callers.
TIMEOUT_MASTER_BEACON = 'master_beacon'
TIMEOUT_WORKER_CONTACT = 'worker_contact'
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
class Task:
    '''A unit of work: a callable plus the arguments to call it with.

    ``task_id`` identifies the task; a random UUID is generated when no
    (truthy) ID is supplied. Tasks hash on their ``task_id``.'''

    def __init__(self, fn, args, kwargs, task_id=None):
        self.task_id = task_id or uuid.uuid4()
        self.fn = fn
        self.args = args
        self.kwargs = kwargs

    def __repr__(self):
        try:
            return '<{} {task_id!s} {fn!r} {:d} args {:d} kwargs>'.format(
                self.__class__.__name__, len(self.args), len(self.kwargs), **self.__dict__
            )
        except TypeError:
            # args and/or kwargs have no length (e.g. None); omit the counts
            return '<{} {task_id!s} {fn!r}>'.format(self.__class__.__name__, **self.__dict__)

    def __hash__(self):
        return hash(self.task_id)

    def execute(self):
        '''Run this task, returning a Result object.

        Nothing raised by the task propagates from here: any exception is
        stored on the returned Result as ``exception``, together with the
        formatted traceback text as ``traceback``. On success the return value
        is stored as ``result``.'''
        rsl = Result(task_id=self.task_id)
        try:
            rsl.result = self.fn(*self.args, **self.kwargs)
        except BaseException as e:
            # NOTE(review): BaseException also captures KeyboardInterrupt and
            # SystemExit raised inside the task -- confirm this is intended.
            rsl.exception = e
            rsl.traceback = traceback.format_exc()
        return rsl
|
|
158
|
+
|
|
159
|
+
|
|
160
|
+
class Result:
    '''Outcome of a task, identified by ``task_id``.

    Holds the task's return value in ``result``, or the exception it raised
    (and the accompanying traceback text) in ``exception`` / ``traceback``.
    Results hash on their ``task_id``.'''

    def __init__(self, task_id, result=None, exception=None, traceback=None):
        self.task_id, self.result = task_id, result
        self.exception, self.traceback = exception, traceback

    def __repr__(self):
        # A result counts as an exception outcome whenever an exception is recorded.
        outcome = 'exception' if self.exception is not None else 'result'
        return '<{} {!s} ({})>'.format(self.__class__.__name__, self.task_id, outcome)

    def __hash__(self):
        return hash(self.task_id)
|
|
174
|
+
|
|
175
|
+
|
|
176
|
+
class PassiveTimer:
    '''A timer that does nothing on its own; it only records when it was
    started and for how long it runs, and answers queries by comparing those
    against a clock reading (``time.time()`` unless stated otherwise).'''

    __slots__ = {'started', 'duration'}

    def __init__(self, duration, started=None):
        '''Create a timer lasting ``duration`` seconds, started at ``started``
        (default: now).'''
        if started is None:
            started = time.time()
        self.started = started
        self.duration = duration

    @property
    def expired(self, at=None):
        '''True if more than ``duration`` has elapsed since ``started``.

        As a property this is always evaluated with the current time; ``at``
        can only be supplied by calling the underlying getter directly.'''
        if at is None:
            at = time.time()
        return (at - self.started) > self.duration

    @property
    def expires_in(self):
        '''Time remaining until expiration; negative once the timer has expired.'''
        at = time.time()
        return self.started + self.duration - at

    def reset(self, at=None):
        '''Restart the timer at time ``at`` (default: now).'''
        # Test against None so that an explicit timestamp of 0 is honoured,
        # matching how __init__ treats ``started``.
        self.started = time.time() if at is None else at

    start = reset
|
|
199
|
+
|
|
200
|
+
|
|
201
|
+
class PassiveMultiTimer:
    '''A collection of passive timers, each keyed by a hashable identifier.

    Start times and durations are held in parallel NumPy arrays so that
    expiration queries over all timers are single array expressions.
    ``_indices`` maps each identifier to its position in those arrays.'''

    def __init__(self):
        self._identifiers = np.empty((0,), np.object_)
        self._durations = np.empty((0,), float)
        self._started = np.empty((0,), float)
        self._indices = {}  # indexes into durations/started, keyed by identifier

    @staticmethod
    def _extended(array, value):
        '''Return a copy of 1-D ``array`` with ``value`` stored in one extra trailing slot.'''
        grown = np.empty((len(array) + 1,), dtype=array.dtype)
        grown[:-1] = array
        grown[-1] = value
        return grown

    def add_timer(self, identifier, duration):
        '''Add a timer lasting ``duration`` seconds, started now.

        Raises KeyError if a timer with this identifier is already present.'''
        if identifier in self._indices:
            raise KeyError('timer {!r} already present'.format(identifier))

        new_idx = len(self._identifiers)

        # Build new arrays rather than resizing in place: an in-place resize
        # fails whenever anything else holds a reference to the array.
        durations = self._extended(self._durations, duration)
        started = self._extended(self._started, time.time())
        identifiers = self._extended(self._identifiers, identifier)

        self._durations = durations
        self._started = started
        self._identifiers = identifiers
        self._indices[identifier] = new_idx

    def remove_timer(self, identifier):
        '''Remove the timer with the given identifier.

        Raises KeyError if there is no such timer.'''
        idx = self._indices.pop(identifier)
        self._durations = np.delete(self._durations, idx)
        self._started = np.delete(self._started, idx)
        self._identifiers = np.delete(self._identifiers, idx)

        # Every timer stored after the removed slot has moved down by one;
        # keep the identifier -> position map in step with the arrays.
        for other, position in self._indices.items():
            if position > idx:
                self._indices[other] = position - 1

    def change_duration(self, identifier, duration):
        '''Set the duration of an existing timer without touching its start time.'''
        idx = self._indices[identifier]
        self._durations[idx] = duration

    def reset(self, identifier=None, at=None):
        '''Restart one timer, or all timers if ``identifier`` is None, at time
        ``at`` (default: now).'''
        if at is None:
            at = time.time()
        if identifier is None:
            # reset all timers
            self._started.fill(at)
        else:
            self._started[self._indices[identifier]] = at

    def expired(self, identifier, at=None):
        '''True if the given timer's duration has been exceeded at time ``at``
        (default: now).'''
        if at is None:
            at = time.time()
        idx = self._indices[identifier]
        return (at - self._started[idx]) > self._durations[idx]

    def next_expiration(self):
        '''Identifier of the timer with the least time remaining.

        With no timers present, NumPy's ``argmin`` raises ValueError.'''
        at = time.time()
        idx = (self._started + self._durations - at).argmin()
        return self._identifiers[idx]

    def next_expiration_in(self):
        '''Time until the next expiration, or 0 if a timer has already expired.

        With no timers present, NumPy's ``argmin`` raises ValueError.'''
        at = time.time()
        idx = (self._started + self._durations - at).argmin()
        next_at = self._started[idx] + self._durations[idx] - at
        return next_at if next_at > 0 else 0

    def which_expired(self, at=None):
        '''Array of the identifiers of all timers expired at time ``at``
        (default: now).'''
        if at is None:
            at = time.time()
        expired_indices = (at - self._started) > self._durations
        return self._identifiers[expired_indices]
|
|
264
|
+
|
|
265
|
+
|
|
266
|
+
class ZMQCore:
    '''Plumbing shared by the nodes of the ZeroMQ work manager: endpoint
    creation and cleanup, message sending/receiving/validation, shutdown
    signalling, and management of the communication thread.

    ``startup()`` runs ``self.comm_loop`` in a thread; ``comm_loop`` is not
    defined in this class and must be supplied by a subclass.'''

    # The overall communication topology (socket layout, etc)
    # Cannot be updated without updating configuration files, command-line parameters,
    # etc. (Changes break user scripts.)
    PROTOCOL_MAJOR = 3

    # The set of messages and replies in use.
    # Cannot be updated without changing existing communications logic. (Changes break
    # the ZMQ WM library.)
    PROTOCOL_MINOR = 0

    # Minor updates and additions to the protocol.
    # Changes do not break the ZMQ WM library, but only add new
    # functionality/code paths without changing existing code paths.
    PROTOCOL_UPDATE = 0

    PROTOCOL_VERSION = (PROTOCOL_MAJOR, PROTOCOL_MINOR, PROTOCOL_UPDATE)

    # The default transport for "internal" (inter-thread/-process) communication
    # IPC should work except on really odd systems with no local storage
    internal_transport = 'ipc'

    default_comm_mode = 'ipc'
    default_master_heartbeat = 20.0
    default_worker_heartbeat = 20.0
    default_timeout_factor = 5.0
    default_startup_timeout = 120.0
    default_shutdown_timeout = 5.0

    # Class-level list, shared by all instances, of IPC endpoints whose
    # backing files remove_ipc_endpoints() should unlink.
    _ipc_endpoints_to_delete = []

    @classmethod
    def make_ipc_endpoint(cls):
        '''Create a temporary file, register it for later removal, and return
        an ``ipc://`` endpoint naming it.'''
        (fd, socket_path) = tempfile.mkstemp()
        os.close(fd)
        endpoint = 'ipc://{}'.format(socket_path)
        cls._ipc_endpoints_to_delete.append(endpoint)
        return endpoint

    @classmethod
    def remove_ipc_endpoints(cls):
        '''Unlink the files behind every endpoint made by ``make_ipc_endpoint()``.

        A file that is already gone is ignored silently; any other failure to
        unlink is logged at debug level and otherwise ignored.'''
        while cls._ipc_endpoints_to_delete:
            endpoint = cls._ipc_endpoints_to_delete.pop()
            assert endpoint.startswith('ipc://')
            socket_path = endpoint[6:]
            try:
                os.unlink(socket_path)
            except OSError as e:
                if e.errno != errno.ENOENT:
                    log.debug('could not unlink IPC endpoint {!r}: {}'.format(socket_path, e))
            else:
                log.debug('unlinked IPC endpoint {!r}'.format(socket_path))

    @classmethod
    def make_tcp_endpoint(cls, address='127.0.0.1'):
        '''Return a ``tcp://`` endpoint on ``address`` using a port chosen by ``randport()``.'''
        return 'tcp://{}:{}'.format(address, randport(address))

    @classmethod
    def make_internal_endpoint(cls):
        '''Return a new endpoint of the kind selected by ``internal_transport``.'''
        assert cls.internal_transport in {'ipc', 'tcp'}
        if cls.internal_transport == 'ipc':
            return cls.make_ipc_endpoint()
        else:  # cls.internal_transport == 'tcp'
            return cls.make_tcp_endpoint()

    def __init__(self):
        # Unique identifier of this ZMQ node
        self.node_id = uuid.uuid4()

        # Identifier of the task distribution network (work manager)
        self.network_id = None

        # Beacons
        # Workers expect to hear from the master at least every master_beacon_period
        # Master expects to hear from the workers at least every worker_beacon_period
        # If more than {master,worker}_beacon_period*timeout_factor elapses, the
        # master/worker is considered missing.

        self.worker_beacon_period = self.default_worker_heartbeat
        self.master_beacon_period = self.default_master_heartbeat
        self.timeout_factor = self.default_timeout_factor

        # These should allow for some fuzz, and should ratchet up as more and
        # more workers become available (maybe order 1 s for 100 workers?) This
        # should also account appropriately for startup delay on difficult
        # systems.

        # Number of seconds to allow first contact between at least one worker
        # and the master.
        self.startup_timeout = self.default_startup_timeout

        # A friendlier description for logging
        self.node_description = '{!s} on {!s} at PID {:d}'.format(self.__class__.__name__, socket.gethostname(), os.getpid())

        self.validation_fail_action = 'exit'  # other options are 'raise' and 'warn'

        self.log = logging.getLogger(__name__ + '.' + self.__class__.__name__ + '.' + str(self.node_id))

        # ZeroMQ context
        self.context = None

        # External communication endpoints
        self.rr_endpoint = None
        self.ann_endpoint = None

        self.inproc_endpoint = 'inproc://{!s}'.format(self.node_id)

        # Sockets
        self.rr_socket = None
        self.ann_socket = None

        # This is the main-thread end of this
        self._inproc_socket = None

        self.master_id = None

        # When enabled through the environment, every message sent or received is logged.
        if os.environ.get('WWMGR_ZMQ_DEBUG_MESSAGES', 'n').upper() in {'Y', 'YES', '1', 'T', 'TRUE'}:
            self._super_debug = True
        else:
            self._super_debug = None

    def __repr__(self):
        return '<{!s} {!s}>'.format(self.__class__.__name__, self.node_id)

    def get_identification(self):
        '''Return a dictionary describing this node (IDs, class name,
        description, hostname and PID).'''
        return {
            'node_id': self.node_id,
            'master_id': self.master_id,
            'class': self.__class__.__name__,
            'description': self.node_description,
            'hostname': socket.gethostname(),
            'pid': os.getpid(),
        }

    def validate_message(self, message):
        '''Validate incoming message. Raises an exception if the message is improperly formatted (TypeError)
        or does not correspond to the appropriate master (ZMQWMEnvironmentError).'''
        # Cooperate with any other class in the MRO that also validates messages.
        try:
            super_validator = super().validate_message
        except AttributeError:
            pass
        else:
            super_validator(message)

        if not isinstance(message, Message):
            raise TypeError('message is not an instance of core.Message')
        if message.src_id is None:
            raise ZMQWMEnvironmentError('message src_id is not set')
        if self.master_id is not None and message.master_id is not None and message.master_id != self.master_id:
            raise ZMQWMEnvironmentError(
                'incoming message associated with another master (this={!s}, incoming={!s})'.format(
                    self.master_id, message.master_id
                )
            )

    @contextlib.contextmanager
    def message_validation(self, msg):
        '''A context manager for message validation. The instance variable ``validation_fail_action``
        controls the behavior of this context manager:
          * 'raise': re-raise the exception that indicated failed validation. Useful for development.
          * 'exit' (default): report the error and exit the program.
          * 'warn': report the error and continue.

        With any other value, a validation failure is swallowed without a report.'''
        try:
            yield
        except Exception as e:
            if self.validation_fail_action == 'raise':
                self.log.exception('message validation failed for {!r}'.format(msg))
                raise
            elif self.validation_fail_action == 'exit':
                self.log.error('message validation failed: {!s}'.format(e))
                sys.exit(1)
            elif self.validation_fail_action == 'warn':
                self.log.warning('message validation failed: {!s}'.format(e))

    def recv_message(self, socket, flags=0, validate=True, timeout=None):
        '''Receive a message object from the given socket, using the given flags.
        Message validation is performed if ``validate`` is true.
        If ``timeout`` is given, then it is the number of milliseconds to wait
        prior to raising a ZMQWMTimeout exception. ``timeout`` is ignored if
        ``flags`` includes ``zmq.NOBLOCK``.'''

        if timeout is None or flags & zmq.NOBLOCK:
            message = socket.recv_pyobj(flags)
        else:
            poller = zmq.Poller()
            poller.register(socket, zmq.POLLIN)
            try:
                poll_results = dict(poller.poll(timeout=timeout))
                if socket in poll_results:
                    message = socket.recv_pyobj(flags)
                else:
                    raise ZMQWMTimeout('recv timed out')
            finally:
                poller.unregister(socket)

        if self._super_debug:
            self.log.debug('received {!r}'.format(message))
        if validate:
            with self.message_validation(message):
                self.validate_message(message)
        return message

    def recv_all(self, socket, flags=0, validate=True):
        '''Receive all messages currently available from the given socket.'''
        messages = []
        while True:
            try:
                messages.append(self.recv_message(socket, flags | zmq.NOBLOCK, validate))
            except zmq.Again:
                # nothing more is waiting on the socket
                return messages

    def recv_ack(self, socket, flags=0, validate=True, timeout=None):
        '''Receive a message that is expected to be an ACK or a NAK.

        Arguments are as for ``recv_message()``. When ``validate`` is true, a
        message of any other kind is a validation failure, handled according
        to ``validation_fail_action``. The received message is returned.'''
        msg = self.recv_message(socket, flags, validate, timeout)
        if validate:
            with self.message_validation(msg):
                # An explicit raise (rather than assert) so that the check
                # still runs when Python is started with -O.
                if msg.message not in (Message.ACK, Message.NAK):
                    raise ZMQWMEnvironmentError('expected ACK or NAK, received {!r}'.format(msg.message))
        return msg

    def send_message(self, socket, message, payload=None, flags=0):
        '''Send a message object. Subclasses may override this to
        decorate the message with appropriate IDs, then delegate upward to actually send
        the message. ``message`` may either be a pre-constructed ``Message`` object or
        a message identifier, in which (latter) case ``payload`` will become the message payload.
        ``payload`` is ignored if ``message`` is a ``Message`` object.'''

        # Always works on a fresh Message, so a caller's object is never modified.
        message = Message(message, payload)
        if message.master_id is None:
            message.master_id = self.master_id
        message.src_id = self.node_id

        if self._super_debug:
            self.log.debug('sending {!r}'.format(message))
        socket.send_pyobj(message, flags)

    def send_reply(self, socket, original_message, reply=Message.ACK, payload=None, flags=0):
        '''Send a reply to ``original_message`` on ``socket``. The reply message
        is a Message object or a message identifier. The reply master_id is
        set from ``original_message``, unless that is not set, in which case it is
        set from self.master_id. The reply src_id is this node's ID, as for
        every message sent through ``send_message()``.

        Raises ZMQWMEnvironmentError if ``original_message`` has no src_id.'''
        reply = Message(reply, payload)
        reply.master_id = original_message.master_id or self.master_id
        # Message objects carry the sender in ``src_id`` (there is no
        # ``worker_id`` attribute); this should have been caught by validation
        # prior to this.
        if original_message.src_id is None:
            raise ZMQWMEnvironmentError('message src_id is not set')
        self.send_message(socket, reply, flags=flags)

    def send_ack(self, socket, original_message):
        '''Send an acknowledgement message, which is mostly just to respect REQ/REP
        recv/send patterns.'''
        self.send_message(socket, Message(Message.ACK, master_id=original_message.master_id or self.master_id, src_id=self.node_id))

    def send_nak(self, socket, original_message):
        '''Send a negative acknowledgement message.'''
        self.send_message(socket, Message(Message.NAK, master_id=original_message.master_id or self.master_id, src_id=self.node_id))

    def send_inproc_message(self, message, payload=None, flags=0):
        '''Send a message to this node's own inproc endpoint through a newly
        created PUB socket.

        Raises AttributeError if ``self.context`` is None.'''
        inproc_socket = self.context.socket(zmq.PUB)
        inproc_socket.connect(self.inproc_endpoint)
        # annoying wait for sockets to settle
        time.sleep(0.01)
        self.send_message(inproc_socket, message, payload, flags)
        # used to be a close with linger here, but it was cutting off messages

    def signal_shutdown(self):
        '''Send a SHUTDOWN message over the inproc endpoint, ignoring any failure to do so.'''
        try:
            self.send_inproc_message(Message.SHUTDOWN)
        except AttributeError:
            # this is expected if self.context has been set to None (i.e. it has already been destroyed)
            pass
        except Exception as e:
            self.log.debug('ignoring exception {!r} in signal_shutdown()'.format(e))

    def shutdown_handler(self, signal=None, frame=None):
        '''Log the shutdown and signal it; usable directly as a signal handler.'''
        if signal is None:
            self.log.info('shutting down')
        else:
            self.log.info('shutting down on signal {!s}'.format(signames.get(signal, signal)))
        self.signal_shutdown()

    def install_signal_handlers(self, signals=None):
        '''Install ``shutdown_handler`` for each of ``signals`` (default:
        SIGINT, SIGQUIT and SIGTERM).'''
        if not signals:
            signals = {signal.SIGINT, signal.SIGQUIT, signal.SIGTERM}

        for sig in signals:
            signal.signal(sig, self.shutdown_handler)

    def install_sigint_handler(self):
        '''Install the default set of signal handlers (see ``install_signal_handlers()``).'''
        self.install_signal_handlers()

    def startup(self):
        '''Create the ZeroMQ context and start ``self.comm_loop`` in a new thread.'''
        self.context = zmq.Context()
        self.comm_thread = threading.Thread(target=self.comm_loop)
        self.comm_thread.start()

        # self.install_signal_handlers()

    def shutdown(self):
        '''Signal shutdown (does not wait for the communication thread; see ``join()``).'''
        self.shutdown_handler()

    def join(self):
        '''Block until the communication thread has finished.'''
        # Join in short slices rather than with one indefinite join().
        while True:
            self.comm_thread.join(0.1)
            if not self.comm_thread.is_alive():
                break
|
|
569
|
+
|
|
570
|
+
|
|
571
|
+
def shutdown_process(process, timeout=1.0):
    '''Wait for ``process`` to end, escalating if it does not.

    The process is first given ``timeout`` seconds to finish by itself. If it
    is still alive it is sent SIGINT and given another ``timeout`` seconds;
    if it survives that too, it is sent SIGKILL and waited for without a
    time limit. The exit code is logged in every case.'''
    process.join(timeout)
    if not process.is_alive():
        log.debug('worker process {:d} terminated gracefully with code {:d}'.format(process.pid, process.exitcode))
    else:
        # First escalation: ask politely.
        log.debug('sending SIGINT to process {:d}'.format(process.pid))
        os.kill(process.pid, signal.SIGINT)
        process.join(timeout)

        # Second escalation: no further patience.
        if process.is_alive():
            log.warning('sending SIGKILL to worker process {:d}'.format(process.pid))
            os.kill(process.pid, signal.SIGKILL)
            process.join()

        log.debug('process {:d} terminated with code {:d}'.format(process.pid, process.exitcode))
    assert not process.is_alive()
|
|
586
|
+
|
|
587
|
+
|
|
588
|
+
class IsNode:
    '''Behaviour of a node that runs local worker processes and can publish
    its downstream endpoints in a host-info file.

    This class reads ``self.node_id`` and calls ``self.make_internal_endpoint()``
    without defining either; presumably it is meant to be combined with
    ZMQCore in a subclass -- TODO confirm.'''

    def __init__(self, n_local_workers=None):
        '''Prepare (but do not start) ``n_local_workers`` local worker
        processes; the default is one per CPU as reported by
        ``multiprocessing.cpu_count()``. With zero workers no local endpoints
        are created.'''
        from westpa.work_managers.zeromq.worker import ZMQWorker

        if n_local_workers is None:
            n_local_workers = multiprocessing.cpu_count()

        self.downstream_rr_endpoint = None
        self.downstream_ann_endpoint = None

        if n_local_workers:
            self.local_ann_endpoint = self.make_internal_endpoint()
            self.local_rr_endpoint = self.make_internal_endpoint()
            self.local_workers = [ZMQWorker(self.local_rr_endpoint, self.local_ann_endpoint) for _n in range(n_local_workers)]
        else:
            self.local_ann_endpoint = None
            self.local_rr_endpoint = None
            self.local_workers = []

        self.local_worker_processes = [
            multiprocessing.Process(target=worker.startup, args=(n,)) for (n, worker) in enumerate(self.local_workers)
        ]

        # Host-info files written so far, removed again by shutdown()
        self.host_info_files = []

    def write_host_info(self, filename=None):
        '''Write the downstream endpoints to a JSON file.

        The file holds the keys ``rr_endpoint`` and ``ann_endpoint``; every
        ``*`` in an endpoint is replaced by this machine's hostname, and an
        unset endpoint is written as an empty string. ``filename`` defaults to
        ``zmq_host_info_<node id in hex>.json``. The file name is remembered so
        that ``shutdown()`` can delete the file.'''
        filename = filename or 'zmq_host_info_{}.json'.format(self.node_id.hex)
        hostname = socket.gethostname()

        # Plain string replacement: the hostname is literal text, not a regular
        # expression replacement template.
        info = {
            'rr_endpoint': (self.downstream_rr_endpoint or '').replace('*', hostname),
            'ann_endpoint': (self.downstream_ann_endpoint or '').replace('*', hostname),
        }

        with open(filename, 'wt') as infofile:
            json.dump(info, infofile)
        self.host_info_files.append(filename)

    def startup(self):
        '''Start every local worker process.'''
        for process in self.local_worker_processes:
            process.start()

    def shutdown(self):
        '''Shut down every local worker process, then delete the host-info files.

        Each process is given ``self.shutdown_timeout`` seconds per shutdown
        stage, or 1.0 if that attribute is not set.'''
        shutdown_timeout = getattr(self, 'shutdown_timeout', 1.0)

        for process in self.local_worker_processes:
            shutdown_process(process, shutdown_timeout)

        for host_info_file in self.host_info_files:
            try:
                os.unlink(host_info_file)
            except OSError:
                # best-effort cleanup: a file that cannot be removed is left behind
                pass
|