westpa-2022.10-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl

This diff reflects the changes between publicly available package versions as they appear in their respective public registries, and is provided for informational purposes only.

Potentially problematic release: this version of westpa has been flagged for review.

Files changed (150)
  1. westpa/__init__.py +14 -0
  2. westpa/_version.py +21 -0
  3. westpa/analysis/__init__.py +5 -0
  4. westpa/analysis/core.py +746 -0
  5. westpa/analysis/statistics.py +27 -0
  6. westpa/analysis/trajectories.py +360 -0
  7. westpa/cli/__init__.py +0 -0
  8. westpa/cli/core/__init__.py +0 -0
  9. westpa/cli/core/w_fork.py +152 -0
  10. westpa/cli/core/w_init.py +230 -0
  11. westpa/cli/core/w_run.py +77 -0
  12. westpa/cli/core/w_states.py +212 -0
  13. westpa/cli/core/w_succ.py +99 -0
  14. westpa/cli/core/w_truncate.py +59 -0
  15. westpa/cli/tools/__init__.py +0 -0
  16. westpa/cli/tools/ploterr.py +506 -0
  17. westpa/cli/tools/plothist.py +706 -0
  18. westpa/cli/tools/w_assign.py +596 -0
  19. westpa/cli/tools/w_bins.py +166 -0
  20. westpa/cli/tools/w_crawl.py +119 -0
  21. westpa/cli/tools/w_direct.py +547 -0
  22. westpa/cli/tools/w_dumpsegs.py +94 -0
  23. westpa/cli/tools/w_eddist.py +506 -0
  24. westpa/cli/tools/w_fluxanl.py +378 -0
  25. westpa/cli/tools/w_ipa.py +833 -0
  26. westpa/cli/tools/w_kinavg.py +127 -0
  27. westpa/cli/tools/w_kinetics.py +96 -0
  28. westpa/cli/tools/w_multi_west.py +414 -0
  29. westpa/cli/tools/w_ntop.py +213 -0
  30. westpa/cli/tools/w_pdist.py +515 -0
  31. westpa/cli/tools/w_postanalysis_matrix.py +82 -0
  32. westpa/cli/tools/w_postanalysis_reweight.py +53 -0
  33. westpa/cli/tools/w_red.py +486 -0
  34. westpa/cli/tools/w_reweight.py +780 -0
  35. westpa/cli/tools/w_select.py +226 -0
  36. westpa/cli/tools/w_stateprobs.py +111 -0
  37. westpa/cli/tools/w_trace.py +599 -0
  38. westpa/core/__init__.py +0 -0
  39. westpa/core/_rc.py +673 -0
  40. westpa/core/binning/__init__.py +55 -0
  41. westpa/core/binning/_assign.cpython-312-x86_64-linux-gnu.so +0 -0
  42. westpa/core/binning/assign.py +449 -0
  43. westpa/core/binning/binless.py +96 -0
  44. westpa/core/binning/binless_driver.py +54 -0
  45. westpa/core/binning/binless_manager.py +190 -0
  46. westpa/core/binning/bins.py +47 -0
  47. westpa/core/binning/mab.py +427 -0
  48. westpa/core/binning/mab_driver.py +54 -0
  49. westpa/core/binning/mab_manager.py +198 -0
  50. westpa/core/data_manager.py +1694 -0
  51. westpa/core/extloader.py +74 -0
  52. westpa/core/h5io.py +995 -0
  53. westpa/core/kinetics/__init__.py +24 -0
  54. westpa/core/kinetics/_kinetics.cpython-312-x86_64-linux-gnu.so +0 -0
  55. westpa/core/kinetics/events.py +147 -0
  56. westpa/core/kinetics/matrates.py +156 -0
  57. westpa/core/kinetics/rate_averaging.py +266 -0
  58. westpa/core/progress.py +218 -0
  59. westpa/core/propagators/__init__.py +54 -0
  60. westpa/core/propagators/executable.py +715 -0
  61. westpa/core/reweight/__init__.py +14 -0
  62. westpa/core/reweight/_reweight.cpython-312-x86_64-linux-gnu.so +0 -0
  63. westpa/core/reweight/matrix.py +126 -0
  64. westpa/core/segment.py +119 -0
  65. westpa/core/sim_manager.py +830 -0
  66. westpa/core/states.py +359 -0
  67. westpa/core/systems.py +93 -0
  68. westpa/core/textio.py +74 -0
  69. westpa/core/trajectory.py +330 -0
  70. westpa/core/we_driver.py +908 -0
  71. westpa/core/wm_ops.py +43 -0
  72. westpa/core/yamlcfg.py +391 -0
  73. westpa/fasthist/__init__.py +34 -0
  74. westpa/fasthist/__main__.py +110 -0
  75. westpa/fasthist/_fasthist.cpython-312-x86_64-linux-gnu.so +0 -0
  76. westpa/mclib/__init__.py +264 -0
  77. westpa/mclib/__main__.py +28 -0
  78. westpa/mclib/_mclib.cpython-312-x86_64-linux-gnu.so +0 -0
  79. westpa/oldtools/__init__.py +4 -0
  80. westpa/oldtools/aframe/__init__.py +35 -0
  81. westpa/oldtools/aframe/atool.py +75 -0
  82. westpa/oldtools/aframe/base_mixin.py +26 -0
  83. westpa/oldtools/aframe/binning.py +178 -0
  84. westpa/oldtools/aframe/data_reader.py +560 -0
  85. westpa/oldtools/aframe/iter_range.py +200 -0
  86. westpa/oldtools/aframe/kinetics.py +117 -0
  87. westpa/oldtools/aframe/mcbs.py +146 -0
  88. westpa/oldtools/aframe/output.py +39 -0
  89. westpa/oldtools/aframe/plotting.py +90 -0
  90. westpa/oldtools/aframe/trajwalker.py +126 -0
  91. westpa/oldtools/aframe/transitions.py +469 -0
  92. westpa/oldtools/cmds/__init__.py +0 -0
  93. westpa/oldtools/cmds/w_ttimes.py +358 -0
  94. westpa/oldtools/files.py +34 -0
  95. westpa/oldtools/miscfn.py +23 -0
  96. westpa/oldtools/stats/__init__.py +4 -0
  97. westpa/oldtools/stats/accumulator.py +35 -0
  98. westpa/oldtools/stats/edfs.py +129 -0
  99. westpa/oldtools/stats/mcbs.py +89 -0
  100. westpa/tools/__init__.py +33 -0
  101. westpa/tools/binning.py +472 -0
  102. westpa/tools/core.py +340 -0
  103. westpa/tools/data_reader.py +159 -0
  104. westpa/tools/dtypes.py +31 -0
  105. westpa/tools/iter_range.py +198 -0
  106. westpa/tools/kinetics_tool.py +340 -0
  107. westpa/tools/plot.py +283 -0
  108. westpa/tools/progress.py +17 -0
  109. westpa/tools/selected_segs.py +154 -0
  110. westpa/tools/wipi.py +751 -0
  111. westpa/trajtree/__init__.py +4 -0
  112. westpa/trajtree/_trajtree.cpython-312-x86_64-linux-gnu.so +0 -0
  113. westpa/trajtree/trajtree.py +117 -0
  114. westpa/westext/__init__.py +0 -0
  115. westpa/westext/adaptvoronoi/__init__.py +3 -0
  116. westpa/westext/adaptvoronoi/adaptVor_driver.py +214 -0
  117. westpa/westext/hamsm_restarting/__init__.py +3 -0
  118. westpa/westext/hamsm_restarting/example_overrides.py +35 -0
  119. westpa/westext/hamsm_restarting/restart_driver.py +1165 -0
  120. westpa/westext/stringmethod/__init__.py +11 -0
  121. westpa/westext/stringmethod/fourier_fitting.py +69 -0
  122. westpa/westext/stringmethod/string_driver.py +253 -0
  123. westpa/westext/stringmethod/string_method.py +306 -0
  124. westpa/westext/weed/BinCluster.py +180 -0
  125. westpa/westext/weed/ProbAdjustEquil.py +100 -0
  126. westpa/westext/weed/UncertMath.py +247 -0
  127. westpa/westext/weed/__init__.py +10 -0
  128. westpa/westext/weed/weed_driver.py +182 -0
  129. westpa/westext/wess/ProbAdjust.py +101 -0
  130. westpa/westext/wess/__init__.py +6 -0
  131. westpa/westext/wess/wess_driver.py +207 -0
  132. westpa/work_managers/__init__.py +57 -0
  133. westpa/work_managers/core.py +396 -0
  134. westpa/work_managers/environment.py +134 -0
  135. westpa/work_managers/mpi.py +318 -0
  136. westpa/work_managers/processes.py +187 -0
  137. westpa/work_managers/serial.py +28 -0
  138. westpa/work_managers/threads.py +79 -0
  139. westpa/work_managers/zeromq/__init__.py +20 -0
  140. westpa/work_managers/zeromq/core.py +641 -0
  141. westpa/work_managers/zeromq/node.py +131 -0
  142. westpa/work_managers/zeromq/work_manager.py +526 -0
  143. westpa/work_managers/zeromq/worker.py +320 -0
  144. westpa-2022.10.dist-info/AUTHORS +22 -0
  145. westpa-2022.10.dist-info/LICENSE +21 -0
  146. westpa-2022.10.dist-info/METADATA +183 -0
  147. westpa-2022.10.dist-info/RECORD +150 -0
  148. westpa-2022.10.dist-info/WHEEL +6 -0
  149. westpa-2022.10.dist-info/entry_points.txt +29 -0
  150. westpa-2022.10.dist-info/top_level.txt +1 -0
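
To reproduce the file listing above locally, the following is a minimal sketch using only the Python standard library; it assumes the wheel has already been downloaded into the working directory (e.g. with pip download westpa==2022.10):

    # List the contents of the wheel; a wheel is an ordinary zip archive.
    import zipfile

    wheel = 'westpa-2022.10-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl'
    with zipfile.ZipFile(wheel) as zf:
        for name in zf.namelist():
            print(name)
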
--- /dev/null
+++ b/westpa/work_managers/zeromq/node.py
@@ -0,0 +1,131 @@
+'''
+Created on Jun 11, 2015
+
+@author: mzwier
+'''
+
+import logging
+
+log = logging.getLogger(__name__)
+
+from .core import ZMQCore, Message, PassiveMultiTimer, IsNode
+
+import zmq
+from zmq.devices import ThreadProxy
+
+
+class ZMQNode(ZMQCore, IsNode):
+    def __init__(self, upstream_rr_endpoint, upstream_ann_endpoint, n_local_workers=None):
+        ZMQCore.__init__(self)
+        IsNode.__init__(self, n_local_workers)
+
+        self.upstream_rr_endpoint = upstream_rr_endpoint
+        self.upstream_ann_endpoint = upstream_ann_endpoint
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_traceback):
+        return False
+
+    def run(self):
+        self.startup()
+
+    @property
+    def is_master(self):
+        return False
+
+    def comm_loop(self):
+        self.context = zmq.Context.instance()
+        # or else the proxies create sockets in a different context
+
+        self.context.linger = 100
+        # So we don't have to destroy the context at the end of the loop
+
+        rr_proxy = ThreadProxy(zmq.ROUTER, zmq.DEALER)
+
+        # We use push/pull so (1) we don't miss any announcements
+        # and (2) we don't have to deal with subscription messages
+        ann_proxy = ThreadProxy(zmq.SUB, zmq.PUB, zmq.PUSH)
+        ann_monitor = self.context.socket(zmq.PULL)
+
+        # Not monitoring request/reply streams for two reasons:
+        # (1) we'd need to strip identity frames to interpret the messages
+        # (2) interpreting the messages means we'd have to decode (unpickle) and then re-encode
+        #     all of the data flying through here, which seems like a waste just to see if
+        #     clients start up. We miss the edge failure case where one node's workers
+        #     start up but another's fail. Seems much less likely than all workers
+        #     failing to start up, which would be caught by the master
+
+        ann_mon_endpoint = 'inproc://{:x}'.format(id(ann_monitor))
+        ann_monitor.bind(ann_mon_endpoint)
+
+        rr_proxy.bind_in(self.downstream_rr_endpoint)
+        if self.local_rr_endpoint:
+            rr_proxy.bind_in(self.local_rr_endpoint)
+        self.log.debug('connecting upstream_rr_endpoint = {!r}'.format(self.upstream_rr_endpoint))
+        rr_proxy.connect_out(self.upstream_rr_endpoint)
+
+        ann_proxy.bind_out(self.downstream_ann_endpoint)
+        if self.local_ann_endpoint:
+            ann_proxy.bind_out(self.local_ann_endpoint)
+        ann_proxy.connect_in(self.upstream_ann_endpoint)
+        self.log.debug('connecting upstream_ann_endpoint = {!r}'.format(self.upstream_ann_endpoint))
+        ann_proxy.setsockopt_in(zmq.SUBSCRIBE, b'')
+        ann_proxy.connect_mon(ann_mon_endpoint)
+
+        rr_proxy.start()
+        ann_proxy.start()
+
+        ann_monitor.connect(ann_mon_endpoint)
+
+        inproc_socket = self.context.socket(zmq.SUB)
+        inproc_socket.setsockopt(zmq.SUBSCRIBE, b'')
+        inproc_socket.bind(self.inproc_endpoint)
+
+        timers = PassiveMultiTimer()
+        timers.add_timer('master_beacon', self.master_beacon_period)
+        timers.add_timer('startup_timeout', self.startup_timeout)
+        timers.reset()
+
+        self.log.debug('master beacon period: {!r}'.format(self.master_beacon_period))
+        self.log.debug('startup timeout: {!r}'.format(self.startup_timeout))
+
+        peer_found = False
+
+        poller = zmq.Poller()
+        poller.register(ann_monitor, zmq.POLLIN)
+        poller.register(inproc_socket, zmq.POLLIN)
+        try:
+            while True:
+                poll_results = dict(poller.poll((timers.next_expiration_in() or 0.001) * 1000))
+
+                if inproc_socket in poll_results:
+                    msgs = self.recv_all(ann_monitor, validate=False)
+                    if Message.SHUTDOWN in (msg.message for msg in msgs):
+                        self.log.debug('shutdown received')
+                        break
+
+                if ann_monitor in poll_results:
+                    msgs = self.recv_all(ann_monitor, validate=False)
+                    message_tags = {msg.message for msg in msgs}
+                    if Message.SHUTDOWN in message_tags:
+                        self.log.debug('shutdown received')
+                        break
+                    if not peer_found and (Message.MASTER_BEACON in message_tags or Message.TASKS_AVAILABLE in message_tags):
+                        peer_found = True
+                        timers.remove_timer('startup_timeout')
+
+                if not peer_found and timers.expired('startup_timeout'):
+                    self.log.error('startup phase elapsed with no contact from peer; shutting down')
+                    break
+
+        finally:
+            self.log.debug('exiting')
+            self.context = None
+            self.remove_ipc_endpoints()
+            IsNode.shutdown(self)
+
+    def startup(self):
+        IsNode.startup(self)
+        super().startup()
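
The hunk above adds westpa/work_managers/zeromq/node.py: a ZMQNode is a pure forwarder that relays request/reply and announcement traffic between a master and downstream workers through two ThreadProxy devices. As an illustration only, a node might be driven roughly as follows; the endpoint addresses are placeholders, and in practice ZMQWorkManager.from_environ (next hunk) assigns the downstream endpoints and heartbeat settings rather than user code:

    # Hypothetical sketch; endpoints are placeholders, not values from this diff.
    from westpa.work_managers.zeromq.node import ZMQNode

    node = ZMQNode(
        upstream_rr_endpoint='tcp://192.168.0.10:23811',   # master's task/result socket
        upstream_ann_endpoint='tcp://192.168.0.10:23812',  # master's announcement socket
        n_local_workers=4,
    )
    # comm_loop() binds these unconditionally, so they must be set before startup:
    node.downstream_rr_endpoint = 'tcp://*:23813'
    node.downstream_ann_endpoint = 'tcp://*:23814'
    with node:        # __enter__/__exit__ are defined in the diff above
        node.run()    # run() calls startup(), launching local workers and the comm loop
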
--- /dev/null
+++ b/westpa/work_managers/zeromq/work_manager.py
@@ -0,0 +1,526 @@
+import json
+import logging
+import multiprocessing
+import socket
+
+from .core import ZMQCore, Message, Task, Result, ZMQWorkerMissing, ZMQWMEnvironmentError, IsNode
+from .core import PassiveMultiTimer
+from .core import randport
+from .worker import ZMQWorker
+from .node import ZMQNode
+
+import westpa.work_managers as work_managers
+from westpa.work_managers import WorkManager, WMFuture
+
+import zmq
+
+from collections import deque
+
+log = logging.getLogger(__name__)
+
+
+class ZMQWorkManager(ZMQCore, WorkManager, IsNode):
+    @classmethod
+    def add_wm_args(cls, parser, wmenv=None):
+        if wmenv is None:
+            wmenv = work_managers.environment.default_env
+
+        wm_group = parser.add_argument_group('options for ZeroMQ ("zmq") work manager (master or node)')
+
+        wm_group.add_argument(
+            wmenv.arg_flag('zmq_mode'),
+            metavar='MODE',
+            choices=('master', 'node', 'server', 'client'),
+            help='Operate as a master (server) or a node (workers/client). '
+            + '"server" is a deprecated synonym for "master" and "client" is a '
+            + 'deprecated synonym for "node".',
+        )
+        wm_group.add_argument(
+            wmenv.arg_flag('zmq_comm_mode'),
+            metavar='COMM_MODE',
+            choices=('ipc', 'tcp'),
+            help='Use the given communication mode -- TCP or IPC (Unix-domain) -- sockets '
+            + 'for communication within a node. IPC (the default) may be more '
+            + 'efficient but is not available on (exceptionally rare) systems '
+            + 'without node-local storage (e.g. /tmp); on such systems, TCP may be used instead.',
+        )
+        wm_group.add_argument(
+            wmenv.arg_flag('zmq_write_host_info'),
+            metavar='INFO_FILE',
+            help='Store hostname and port information needed to connect to this instance '
+            + 'in INFO_FILE. This allows the master and nodes assisting in '
+            + 'coordinating the communication of other nodes to choose ports '
+            + 'randomly. Downstream nodes read this file with '
+            + wmenv.arg_flag('zmq_read_host_info')
+            + ' and know where to connect.',
+        )
+        wm_group.add_argument(
+            wmenv.arg_flag('zmq_read_host_info'),
+            metavar='INFO_FILE',
+            help='Read hostname and port information needed to connect to the master '
+            + '(or other coordinating node) from INFO_FILE. '
+            + 'This allows the master and nodes assisting in '
+            + 'coordinating the communication of other nodes to choose ports '
+            + 'randomly, writing that information with '
+            + wmenv.arg_flag('zmq_write_host_info')
+            + ' for this instance to read.',
+        )
+        wm_group.add_argument(
+            wmenv.arg_flag('zmq_upstream_rr_endpoint'),
+            metavar='ENDPOINT',
+            help='ZeroMQ endpoint to which to send request/response (task and result) ' + 'traffic toward the master.',
+        )
+        wm_group.add_argument(
+            wmenv.arg_flag('zmq_upstream_ann_endpoint'),
+            metavar='ENDPOINT',
+            help='ZeroMQ endpoint on which to receive announcement '
+            + '(heartbeat and shutdown notification) traffic from the master.',
+        )
+        wm_group.add_argument(
+            wmenv.arg_flag('zmq_downstream_rr_endpoint'),
+            metavar='ENDPOINT',
+            help='ZeroMQ endpoint on which to listen for request/response ' + '(task and result) traffic from subsidiary workers.',
+        )
+        wm_group.add_argument(
+            wmenv.arg_flag('zmq_downstream_ann_endpoint'),
+            metavar='ENDPOINT',
+            help='ZeroMQ endpoint on which to send announcement ' + '(heartbeat and shutdown notification) traffic toward workers.',
+        )
+        wm_group.add_argument(
+            wmenv.arg_flag('zmq_master_heartbeat'),
+            metavar='MASTER_HEARTBEAT',
+            type=float,
+            help='Every MASTER_HEARTBEAT seconds, the master announces its presence ' + 'to workers.',
+        )
+        wm_group.add_argument(
+            wmenv.arg_flag('zmq_worker_heartbeat'),
+            metavar='WORKER_HEARTBEAT',
+            type=float,
+            help='Every WORKER_HEARTBEAT seconds, workers announce their presence ' + 'to the master.',
+        )
+        wm_group.add_argument(
+            wmenv.arg_flag('zmq_timeout_factor'),
+            metavar='FACTOR',
+            type=float,
+            help='Scaling factor for heartbeat timeouts. '
+            + "If the master doesn't hear from a worker in WORKER_HEARTBEAT*FACTOR, "
+            + "the worker is assumed to have crashed. If a worker doesn't hear from "
+            + "the master in MASTER_HEARTBEAT*FACTOR seconds, the master is assumed "
+            + "to have crashed. Both cases result in shutdown. ",
+        )
+        wm_group.add_argument(
+            wmenv.arg_flag('zmq_startup_timeout'),
+            metavar='STARTUP_TIMEOUT',
+            type=float,
+            help='Amount of time (in seconds) to wait for communication between '
+            + 'the master and at least one worker. This may need to be changed '
+            + 'on very large, heavily-loaded computer systems that start all processes '
+            + 'simultaneously. ',
+        )
+        wm_group.add_argument(
+            wmenv.arg_flag('zmq_shutdown_timeout'),
+            metavar='SHUTDOWN_TIMEOUT',
+            type=float,
+            help='Amount of time (in seconds) to wait for workers to shut down.',
+        )
+
+    @classmethod
+    def from_environ(cls, wmenv=None):
+        if wmenv is None:
+            wmenv = work_managers.environment.default_env
+
+        # determine mode
+        mode = wmenv.get_val('zmq_mode', 'master').lower()
+        if mode in {'master', 'server'}:
+            mode = 'master'
+        elif mode in {'node', 'client'}:
+            mode = 'node'
+        else:
+            raise ValueError('invalid ZMQ work manager mode {!r}'.format(mode))
+
+        # determine number of workers
+        # 0 with mode=='master' is a dedicated master
+        # 0 with mode=='node' is a dedicated communications process (former ZMQRouter)
+        n_workers = wmenv.get_val('n_workers', multiprocessing.cpu_count(), int)
+
+        # We set this at the class level, because outside of testing, a node either
+        # can support IPC or it can't, and there is no obvious need (currently)
+        # to support both modes on an instance-by-instance basis
+        comm_mode = wmenv.get_val('zmq_comm_mode', cls.default_comm_mode)
+        ZMQWorkManager.internal_transport = comm_mode
+        ZMQWorker.internal_transport = comm_mode
+        ZMQNode.internal_transport = comm_mode
+
+        write_host_info = wmenv.get_val('zmq_write_host_info')
+        read_host_info = wmenv.get_val('zmq_read_host_info')
+        master_heartbeat = wmenv.get_val('zmq_master_heartbeat', cls.default_master_heartbeat, float)
+        worker_heartbeat = wmenv.get_val('zmq_worker_heartbeat', cls.default_worker_heartbeat, float)
+        timeout_factor = wmenv.get_val('zmq_timeout_factor', cls.default_timeout_factor, float)
+        startup_timeout = wmenv.get_val('zmq_startup_timeout', cls.default_startup_timeout, float)
+
+        if mode == 'master':
+            instance = ZMQWorkManager(n_workers)
+        else:  # mode == 'node'
+            upstream_info = {}
+            if read_host_info:
+                upstream_info.update(cls.read_host_info(read_host_info))
+            log.debug('upstream_info: {!r}'.format(upstream_info))
+
+            upstream_rr_endpoint = wmenv.get_val('zmq_upstream_rr_endpoint', upstream_info.get('rr_endpoint'))
+            upstream_ann_endpoint = wmenv.get_val('zmq_upstream_ann_endpoint', upstream_info.get('ann_endpoint'))
+
+            if not (upstream_rr_endpoint and upstream_ann_endpoint):
+                raise ZMQWMEnvironmentError('at least one upstream endpoint unspecified')
+
+            # expand hostnames, if present, to IP addresses
+            # reject wildcard hostnames, which is a logic error (can't connect to a host
+            # without specifying an address)
+            upstream_rr_endpoint = cls.canonicalize_endpoint(upstream_rr_endpoint, allow_wildcard_host=False)
+            upstream_ann_endpoint = cls.canonicalize_endpoint(upstream_ann_endpoint, allow_wildcard_host=False)
+
+            log.debug('upstream_rr_endpoint = {}'.format(upstream_rr_endpoint))
+            log.debug('upstream_ann_endpoint = {}'.format(upstream_ann_endpoint))
+
+            instance = ZMQNode(
+                upstream_ann_endpoint=upstream_ann_endpoint, upstream_rr_endpoint=upstream_rr_endpoint, n_local_workers=n_workers
+            )
+
+        # Both server and node bind downstream endpoints, so that users get fan-out communications
+        # "for free" when starting up a computational node
+        downstream_rr_endpoint = cls.canonicalize_endpoint(
+            wmenv.get_val('zmq_downstream_rr_endpoint', 'tcp://*:{}'.format(randport()))
+        )
+        downstream_ann_endpoint = cls.canonicalize_endpoint(
+            wmenv.get_val('zmq_downstream_ann_endpoint', 'tcp://*:{}'.format(randport()))
+        )
+        instance.downstream_rr_endpoint = downstream_rr_endpoint
+        instance.downstream_ann_endpoint = downstream_ann_endpoint
+
+        instance.master_beacon_period = master_heartbeat
+        instance.worker_beacon_period = worker_heartbeat
+        instance.timeout_factor = timeout_factor
+        instance.startup_timeout = startup_timeout
+
+        assert isinstance(instance, IsNode)
+        for worker in instance.local_workers:
+            worker.master_beacon_period = master_heartbeat
+            worker.worker_beacon_period = worker_heartbeat
+            worker.timeout_factor = timeout_factor
+            worker.startup_timeout = startup_timeout
+
+        # We always write host info (since we are always either master or node)
+        # we choose not to in the special case that read_host_info is '' but not None
+        # (None implies nothing found on command line or in environment variables, but ''
+        # implies that it was found somewhere but it is empty)
+        if write_host_info is not None and write_host_info != '':
+            instance.write_host_info(write_host_info)
+
+        log.debug('prepared {!r} with:'.format(instance))
+        log.debug('n_workers = {}'.format(n_workers))
+        for attr in (
+            'master_beacon_period',
+            'worker_beacon_period',
+            'startup_timeout',
+            'timeout_factor',
+            'downstream_rr_endpoint',
+            'downstream_ann_endpoint',
+        ):
+            log.debug('{} = {!r}'.format(attr, getattr(instance, attr)))
+
+        return instance
+
+    @classmethod
+    def read_host_info(cls, filename):
+        return json.load(open(filename, 'rt'))
+
+    @classmethod
+    def canonicalize_endpoint(cls, endpoint, allow_wildcard_host=True):
+        if endpoint.startswith('ipc://'):
+            return endpoint
+        elif endpoint.startswith('tcp://'):
+            fields = endpoint[6:].split(':')
+
+            # get IP address
+            if fields[0] != '*':
+                ipaddr = socket.gethostbyname(fields[0])
+            else:
+                if allow_wildcard_host:
+                    ipaddr = '*'
+                else:
+                    raise ValueError('wildcard host not permitted')
+
+            # get/generate port
+            try:
+                port = fields[1]
+            except IndexError:
+                # no port given; select one
+                port = randport()
+            else:
+                port = int(fields[1])
+
+            return 'tcp://{}:{}'.format(ipaddr, port)
+        else:
+            raise ValueError('unrecognized/unsupported endpoint: {!r}'.format(endpoint))
+
+    def __init__(self, n_local_workers=1):
+        ZMQCore.__init__(self)
+        WorkManager.__init__(self)
+        IsNode.__init__(self, n_local_workers)
+
+        # Futures indexed by task ID
+        self.futures = dict()
+
+        # Tasks pending distribution
+        self.outgoing_tasks = deque()
+
+        # Tasks being processed by workers (indexed by worker_id)
+        self.assigned_tasks = dict()
+
+        # Identity information and last contact from workers
+        self.worker_information = dict()  # indexed by worker_id
+        self.worker_timeouts = PassiveMultiTimer()  # indexed by worker_id
+
+        # Number of seconds between checks to see which workers have timed out
+        self.worker_timeout_check = 5.0
+
+        # Amount of time to wait for stray requests to arrive so that workers shut down properly
+        self.shutdown_timeout = 0.5
+
+        self.master_id = self.node_id
+
+    @property
+    def n_workers(self):
+        return len(self.worker_information)
+
+    def submit(self, fn, args=None, kwargs=None):
+        if self.futures is None:
+            # We are shutting down
+            raise ZMQWMEnvironmentError('work manager is shutting down')
+        future = WMFuture()
+        task = Task(fn, args or (), kwargs or {}, task_id=future.task_id)
+        self.futures[task.task_id] = future
+        self.outgoing_tasks.append(task)
+        # Wake up the communications loop (if necessary) to announce new tasks
+        self.send_inproc_message(Message.TASKS_AVAILABLE)
+        return future
+
+    def submit_many(self, tasks):
+        if self.futures is None:
+            # We are shutting down
+            raise ZMQWMEnvironmentError('work manager is shutting down')
+        futures = []
+        for fn, args, kwargs in tasks:
+            future = WMFuture()
+            task = Task(fn, args, kwargs, task_id=future.task_id)
+            self.futures[task.task_id] = future
+            self.outgoing_tasks.append(task)
+            futures.append(future)
+        # Wake up the communications loop (if necessary) to announce new tasks
+        self.send_inproc_message(Message.TASKS_AVAILABLE)
+        return futures
+
+    def send_message(self, socket, message, payload=None, flags=0):
+        message = Message(message, payload)
+        message.master_id = self.node_id
+        super().send_message(socket, message, payload, flags)
+
+    def handle_result(self, socket, msg):
+        self.send_ack(socket, msg)
+        with self.message_validation(msg):
+            assert msg.message == Message.RESULT
+            assert isinstance(msg.payload, Result)
+            assert msg.payload.task_id in self.futures
+            assert self.assigned_tasks[msg.src_id].task_id == msg.payload.task_id
+
+        result = msg.payload
+
+        future = self.futures.pop(result.task_id)
+        del self.assigned_tasks[msg.src_id]
+        if result.exception is not None:
+            future._set_exception(result.exception, result.traceback)
+        else:
+            future._set_result(result.result)
+
+    def handle_task_request(self, socket, msg):
+        if not self.outgoing_tasks:
+            # No tasks available
+            self.send_nak(socket, msg)
+        else:
+            task = self.outgoing_tasks.popleft()
+
+            worker_id = msg.src_id
+            self.assigned_tasks[worker_id] = task
+
+            self.send_message(socket, Message.TASK, task)
+
+    def update_worker_information(self, msg):
+        if msg.message == Message.IDENTIFY:
+            with self.message_validation(msg):
+                assert isinstance(msg.payload, dict)
+                self.worker_information[msg.src_id] = msg.payload
+        else:
+            self.worker_information[msg.src_id] = {}
+
+        try:
+            self.worker_timeouts.reset(msg.src_id)
+        except KeyError:
+            self.worker_timeouts.add_timer(msg.src_id, self.worker_beacon_period * self.timeout_factor)
+
+    def check_workers(self):
+        expired_worker_ids = self.worker_timeouts.which_expired()
+        for expired_worker_id in expired_worker_ids:
+            try:
+                worker_description = '{!s} ({!s})'.format(
+                    expired_worker_id, self.worker_information[expired_worker_id]['description']
+                )
+            except KeyError:
+                worker_description = str(expired_worker_id)
+
+            self.log.error('no contact from worker {}: {}'.format(expired_worker_id, worker_description))
+
+            self.remove_worker(expired_worker_id)
+
+    def remove_worker(self, worker_id):
+        try:
+            expired_task = self.assigned_tasks.pop(worker_id)
+        except KeyError:
+            pass
+        else:
+            self.log.error('aborting task {!r} running on expired worker {!s}'.format(expired_task, worker_id))
+            future = self.futures.pop(expired_task.task_id)
+            future._set_exception(ZMQWorkerMissing('worker running this task disappeared'))
+        del self.worker_information[worker_id]
+
+    def shutdown_clear_tasks(self):
+        '''Abort pending tasks with error on shutdown.'''
+        while self.futures:
+            task_id, future = self.futures.popitem()
+            future._set_exception(ZMQWMEnvironmentError('work manager shut down during task'))
+        self.futures = None
+
+    def comm_loop(self):
+        self.context = zmq.Context()
+
+        rr_socket = self.context.socket(zmq.REP)
+        ann_socket = self.context.socket(zmq.PUB)
+
+        for endpoint in (self.local_rr_endpoint, self.downstream_rr_endpoint):
+            if endpoint:
+                rr_socket.bind(endpoint)
+
+        for endpoint in (self.local_ann_endpoint, self.downstream_ann_endpoint):
+            if endpoint:
+                ann_socket.bind(endpoint)
+
+        inproc_socket = self.context.socket(zmq.SUB)
+        inproc_socket.setsockopt(zmq.SUBSCRIBE, b'')
+        inproc_socket.bind(self.inproc_endpoint)
+
+        poller = zmq.Poller()
+        poller.register(inproc_socket, zmq.POLLIN)
+        poller.register(rr_socket, zmq.POLLIN)
+
+        timers = PassiveMultiTimer()
+        timers.add_timer('tasks_avail', self.master_beacon_period)
+        timers.add_timer('master_beacon', self.master_beacon_period)
+        timers.add_timer('worker_timeout_check', self.worker_beacon_period * self.timeout_factor)
+        timers.add_timer('startup_timeout', self.startup_timeout)
+        timers.reset()
+
+        self.log.debug('master beacon period: {!r}'.format(self.master_beacon_period))
+        self.log.debug('startup timeout: {!r}'.format(self.startup_timeout))
+
+        peer_found = False
+
+        try:
+            # Send a master alive message immediately; it will get discarded if necessary
+            self.send_message(ann_socket, Message.MASTER_BEACON)
+
+            while True:
+                # If a timer is already expired, next_expiration_in() will return 0, which
+                # zeromq interprets as infinite wait; so instead we select a 1 ms wait in this
+                # case.
+                timeout = (timers.next_expiration_in() or 0.001) * 1000
+                # Wake up every second to check for signals
+                timeout = min(timeout, 1000)
+                poll_results = dict(poller.poll(timeout))
+
+                if inproc_socket in poll_results:
+                    msgs = self.recv_all(inproc_socket, validate=False)
+                    # Check for shutdown; do nothing else if shutdown is signalled
+                    if Message.SHUTDOWN in (msg.message for msg in msgs):
+                        self.log.debug('shutdown received')
+                        break
+                    # Check for any other wake-up messages
+                    for msg in msgs:
+                        if msg.message == Message.TASKS_AVAILABLE:
+                            self.send_message(ann_socket, Message.TASKS_AVAILABLE)
+
+                if rr_socket in poll_results:
+                    msg = self.recv_message(rr_socket)
+                    self.update_worker_information(msg)
+
+                    if msg.message == Message.TASK_REQUEST:
+                        self.handle_task_request(rr_socket, msg)
+                    elif msg.message == Message.RESULT:
+                        self.handle_result(rr_socket, msg)
+                    else:
+                        self.send_ack(rr_socket, msg)
+
+                if self.worker_information:
+                    peer_found = True
+
+                if timers.expired('tasks_avail'):
+                    if self.outgoing_tasks:
+                        self.send_message(ann_socket, Message.TASKS_AVAILABLE)
+                    timers.reset('tasks_avail')
+
+                if timers.expired('master_beacon'):
+                    self.send_message(ann_socket, Message.MASTER_BEACON)
+                    timers.reset('master_beacon')
+
+                if peer_found and timers.expired('worker_timeout_check'):
+                    self.check_workers()
+                    if not self.worker_information:
+                        self.log.error('all workers disappeared; exiting')
+                        break
+                    timers.reset('worker_timeout_check')
+
+                if not peer_found and timers.expired('startup_timeout'):
+                    self.log.error('startup phase elapsed with no contact from workers; shutting down')
+                    while self.futures:
+                        future = self.futures.popitem()[1]
+                        future._set_exception(ZMQWorkerMissing('no workers available'))
+                    break
+
+            # Post a shutdown message
+            self.log.debug('sending shutdown on ann_socket')
+            self.send_message(ann_socket, Message.SHUTDOWN)
+            poller.unregister(inproc_socket)
+
+            # Clear tasks
+            self.shutdown_clear_tasks()
+
+            # Clear incoming queue of requests, to let clients exit request/reply states gracefully
+            # (clients will still timeout in these states if necessary)
+            timers.add_timer('shutdown', self.shutdown_timeout)
+            while not timers.expired('shutdown'):
+                poll_results = dict(poller.poll(self.shutdown_timeout / 10 * 1000))
+                if rr_socket in poll_results:
+                    msg = self.recv_message(rr_socket)
+                    self.send_nak(rr_socket, msg)
+
+        finally:
+            self.context.destroy(linger=1)
+            self.context = None
+            self.remove_ipc_endpoints()
+
+    def startup(self):
+        IsNode.startup(self)
+        super().startup()
+
+    def shutdown(self):
+        self.signal_shutdown()
+        IsNode.shutdown(self)
+        self.join()
+        super().shutdown()
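
Taken together, from_environ() builds either a master (ZMQWorkManager) or a forwarding node (ZMQNode) from command-line and environment settings; submit() and submit_many() queue Task objects that the communications loop announces to workers, and results come back through WMFuture objects. The INFO_FILE named by the zmq_read_host_info / zmq_write_host_info options is plain JSON containing at least the keys rr_endpoint and ann_endpoint (see read_host_info() and the upstream_info.get() calls above). A minimal master-side sketch, assuming WorkManager's context-manager protocol and WMFuture.get_result() from westpa.work_managers.core behave as their names suggest:

    # Hypothetical usage sketch, not part of this diff.
    from westpa.work_managers.zeromq.work_manager import ZMQWorkManager

    def square(x):
        return x * x

    wm = ZMQWorkManager(n_local_workers=2)   # master with two local worker processes
    with wm:                                 # startup() spawns workers and the comm loop
        futures = wm.submit_many([(square, (i,), {}) for i in range(8)])
        print(sorted(f.get_result() for f in futures))
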