vortex-nwp 2.0.0b1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vortex/__init__.py +135 -0
- vortex/algo/__init__.py +12 -0
- vortex/algo/components.py +2136 -0
- vortex/algo/mpitools.py +1648 -0
- vortex/algo/mpitools_templates/envelope_wrapper_default.tpl +27 -0
- vortex/algo/mpitools_templates/envelope_wrapper_mpiauto.tpl +29 -0
- vortex/algo/mpitools_templates/wrapstd_wrapper_default.tpl +18 -0
- vortex/algo/serversynctools.py +170 -0
- vortex/config.py +115 -0
- vortex/data/__init__.py +13 -0
- vortex/data/abstractstores.py +1572 -0
- vortex/data/containers.py +780 -0
- vortex/data/contents.py +596 -0
- vortex/data/executables.py +284 -0
- vortex/data/flow.py +113 -0
- vortex/data/geometries.ini +2689 -0
- vortex/data/geometries.py +703 -0
- vortex/data/handlers.py +1021 -0
- vortex/data/outflow.py +67 -0
- vortex/data/providers.py +465 -0
- vortex/data/resources.py +201 -0
- vortex/data/stores.py +1271 -0
- vortex/gloves.py +282 -0
- vortex/layout/__init__.py +27 -0
- vortex/layout/appconf.py +109 -0
- vortex/layout/contexts.py +511 -0
- vortex/layout/dataflow.py +1069 -0
- vortex/layout/jobs.py +1276 -0
- vortex/layout/monitor.py +833 -0
- vortex/layout/nodes.py +1424 -0
- vortex/layout/subjobs.py +464 -0
- vortex/nwp/__init__.py +11 -0
- vortex/nwp/algo/__init__.py +12 -0
- vortex/nwp/algo/assim.py +483 -0
- vortex/nwp/algo/clim.py +920 -0
- vortex/nwp/algo/coupling.py +609 -0
- vortex/nwp/algo/eda.py +632 -0
- vortex/nwp/algo/eps.py +613 -0
- vortex/nwp/algo/forecasts.py +745 -0
- vortex/nwp/algo/fpserver.py +927 -0
- vortex/nwp/algo/ifsnaming.py +403 -0
- vortex/nwp/algo/ifsroot.py +311 -0
- vortex/nwp/algo/monitoring.py +202 -0
- vortex/nwp/algo/mpitools.py +554 -0
- vortex/nwp/algo/odbtools.py +974 -0
- vortex/nwp/algo/oopsroot.py +735 -0
- vortex/nwp/algo/oopstests.py +186 -0
- vortex/nwp/algo/request.py +579 -0
- vortex/nwp/algo/stdpost.py +1285 -0
- vortex/nwp/data/__init__.py +12 -0
- vortex/nwp/data/assim.py +392 -0
- vortex/nwp/data/boundaries.py +261 -0
- vortex/nwp/data/climfiles.py +539 -0
- vortex/nwp/data/configfiles.py +149 -0
- vortex/nwp/data/consts.py +929 -0
- vortex/nwp/data/ctpini.py +133 -0
- vortex/nwp/data/diagnostics.py +181 -0
- vortex/nwp/data/eda.py +148 -0
- vortex/nwp/data/eps.py +383 -0
- vortex/nwp/data/executables.py +1039 -0
- vortex/nwp/data/fields.py +96 -0
- vortex/nwp/data/gridfiles.py +308 -0
- vortex/nwp/data/logs.py +551 -0
- vortex/nwp/data/modelstates.py +334 -0
- vortex/nwp/data/monitoring.py +220 -0
- vortex/nwp/data/namelists.py +644 -0
- vortex/nwp/data/obs.py +748 -0
- vortex/nwp/data/oopsexec.py +72 -0
- vortex/nwp/data/providers.py +182 -0
- vortex/nwp/data/query.py +217 -0
- vortex/nwp/data/stores.py +147 -0
- vortex/nwp/data/surfex.py +338 -0
- vortex/nwp/syntax/__init__.py +9 -0
- vortex/nwp/syntax/stdattrs.py +375 -0
- vortex/nwp/tools/__init__.py +10 -0
- vortex/nwp/tools/addons.py +35 -0
- vortex/nwp/tools/agt.py +55 -0
- vortex/nwp/tools/bdap.py +48 -0
- vortex/nwp/tools/bdcp.py +38 -0
- vortex/nwp/tools/bdm.py +21 -0
- vortex/nwp/tools/bdmp.py +49 -0
- vortex/nwp/tools/conftools.py +1311 -0
- vortex/nwp/tools/drhook.py +62 -0
- vortex/nwp/tools/grib.py +268 -0
- vortex/nwp/tools/gribdiff.py +99 -0
- vortex/nwp/tools/ifstools.py +163 -0
- vortex/nwp/tools/igastuff.py +249 -0
- vortex/nwp/tools/mars.py +56 -0
- vortex/nwp/tools/odb.py +548 -0
- vortex/nwp/tools/partitioning.py +234 -0
- vortex/nwp/tools/satrad.py +56 -0
- vortex/nwp/util/__init__.py +6 -0
- vortex/nwp/util/async.py +184 -0
- vortex/nwp/util/beacon.py +40 -0
- vortex/nwp/util/diffpygram.py +359 -0
- vortex/nwp/util/ens.py +198 -0
- vortex/nwp/util/hooks.py +128 -0
- vortex/nwp/util/taskdeco.py +81 -0
- vortex/nwp/util/usepygram.py +591 -0
- vortex/nwp/util/usetnt.py +87 -0
- vortex/proxy.py +6 -0
- vortex/sessions.py +341 -0
- vortex/syntax/__init__.py +9 -0
- vortex/syntax/stdattrs.py +628 -0
- vortex/syntax/stddeco.py +176 -0
- vortex/toolbox.py +982 -0
- vortex/tools/__init__.py +11 -0
- vortex/tools/actions.py +457 -0
- vortex/tools/addons.py +297 -0
- vortex/tools/arm.py +76 -0
- vortex/tools/compression.py +322 -0
- vortex/tools/date.py +20 -0
- vortex/tools/ddhpack.py +10 -0
- vortex/tools/delayedactions.py +672 -0
- vortex/tools/env.py +513 -0
- vortex/tools/folder.py +663 -0
- vortex/tools/grib.py +559 -0
- vortex/tools/lfi.py +746 -0
- vortex/tools/listings.py +354 -0
- vortex/tools/names.py +575 -0
- vortex/tools/net.py +1790 -0
- vortex/tools/odb.py +10 -0
- vortex/tools/parallelism.py +336 -0
- vortex/tools/prestaging.py +186 -0
- vortex/tools/rawfiles.py +10 -0
- vortex/tools/schedulers.py +413 -0
- vortex/tools/services.py +871 -0
- vortex/tools/storage.py +1061 -0
- vortex/tools/surfex.py +61 -0
- vortex/tools/systems.py +3396 -0
- vortex/tools/targets.py +384 -0
- vortex/util/__init__.py +9 -0
- vortex/util/config.py +1071 -0
- vortex/util/empty.py +24 -0
- vortex/util/helpers.py +184 -0
- vortex/util/introspection.py +63 -0
- vortex/util/iosponge.py +76 -0
- vortex/util/roles.py +51 -0
- vortex/util/storefunctions.py +103 -0
- vortex/util/structs.py +26 -0
- vortex/util/worker.py +150 -0
- vortex_nwp-2.0.0b1.dist-info/LICENSE +517 -0
- vortex_nwp-2.0.0b1.dist-info/METADATA +50 -0
- vortex_nwp-2.0.0b1.dist-info/RECORD +146 -0
- vortex_nwp-2.0.0b1.dist-info/WHEEL +5 -0
- vortex_nwp-2.0.0b1.dist-info/top_level.txt +1 -0
vortex/layout/subjobs.py
ADDED
|
@@ -0,0 +1,464 @@
|
|
|
1
|
+
"""
|
|
2
|
+
This module defines classes in charge of launching sub-jobs. This allows for a
|
|
3
|
+
rough parallelisation at job's level.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
import collections
|
|
7
|
+
import contextlib
|
|
8
|
+
import locale
|
|
9
|
+
import re
|
|
10
|
+
import sys
|
|
11
|
+
import time
|
|
12
|
+
|
|
13
|
+
from bronx.datagrip import datastore
|
|
14
|
+
from bronx.fancies import loggers
|
|
15
|
+
from bronx.syntax.parsing import xlist_strings
|
|
16
|
+
from bronx.patterns import observer
|
|
17
|
+
import footprints as fp
|
|
18
|
+
|
|
19
|
+
logger = loggers.getLogger(__name__)
|
|
20
|
+
|
|
21
|
+
_LOG_CAPTURE_START = '>>>> subjob stdeo start <<<<\n'
|
|
22
|
+
_LOG_CAPTURE_END = '>>>> subjob stdeo end <<<<\n'
|
|
23
|
+
|
|
24
|
+
_DSTORE_IN = '{:s}_datastore.in'
|
|
25
|
+
_DSTORE_OUT = '{:s}_{:s}_datastore.out'
|
|
26
|
+
_JOB_STDEO = '{:s}_{:s}.stdeo'
|
|
27
|
+
|
|
28
|
+
_SUBJOB_NODES_CHATTER = 'subjob_nodes_observerboard_chatter'
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
class NodesObserverboardRecorder(observer.Observer):
    """Listen on the 'Layout-Nodes' observer board and record everything."""

    def __init__(self, observer_tag):
        """
        :param observer_tag: The name of the observer board
        """
        board = observer.get(tag=observer_tag)
        board.register(self)
        self._obsboard = board
        self._messages = []

    def stop_listening(self):
        """Stop listening on the observer board."""
        self._obsboard.unregister(self)

    def updobsitem(self, item, info):
        """Store the observer board messages."""
        # Keep everything except the board's own bookkeeping entry.
        recorded = {key: value
                    for key, value in info.items()
                    if key != 'observerboard'}
        self._messages.append(recorded)

    @property
    def messages(self):
        """The list of collected messages."""
        return self._messages
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
@contextlib.contextmanager
def subjob_handling(node, observer_tag):
    """
    Frame the "useful" part of stdout with markup strings and record the
    Layout-Nodes observer board while the managed block runs.
    """
    sys.stdout.write(_LOG_CAPTURE_START)
    sys.stdout.flush()
    nodes_recorder = NodesObserverboardRecorder(observer_tag)
    try:
        yield
    finally:
        # Always close the capture frame and persist the recorded chatter,
        # even if the managed block raised.
        nodes_recorder.stop_listening()
        sys.stdout.flush()
        sys.stdout.write(_LOG_CAPTURE_END)
        sys.stdout.flush()
        node.ticket.datastore.insert(_SUBJOB_NODES_CHATTER,
                                     dict(tag=node.tag),
                                     nodes_recorder.messages)
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
class SubJobLauncherError(Exception):
    """Raised whenever an error occurred in at least one of the subjobs."""
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
class AbstractSubJobLauncher(fp.FootprintBase):
    """Abstract subjob launcher class.

    Concrete subclasses implement :meth:`_actual_launch` and
    :meth:`_actual_wait`.  This base class does the bookkeeping: it tracks
    running subjobs against the parallelism limit, dumps each subjob's
    captured stdout, and merges the subjob's datastore dump back into the
    current session when the subjob finishes.
    """

    _collector = ('subjobslauncher',)
    _abstract = True
    _footprint = dict(
        info = 'Abstract SubJob launcher.',
        attr = dict(
            kind = dict(
            ),
            nodes_obsboard_tag = dict(
                info = "The name of the Layout-Nodes observer board.",
            ),
            limit = dict(
                info = "The maximum number of parallel subjobs.",
                type = int,
                optional = True
            ),
            scriptpath = dict(
                info = "The path to the current job script.",
            )
        ),
    )

    def __init__(self, *kargs, **kwargs):
        super().__init__(*kargs, **kwargs)
        self._ticket = None
        # Number of subjobs currently running (checked against actual_limit).
        self._watermark = 0
        # Mapping: subjob tag -> the sub-tags it is responsible for.
        self._running = dict()
        logger.info('"%s" subjob launcher created (limit=%s, scriptpath=%s)',
                    self.kind, str(self.limit), self.scriptpath)

    @property
    def actual_limit(self):
        """The maximum number of subjobs allowed in parallel."""
        return self.limit

    def _new_ticket_hook(self, t):
        """Any additional actions to be performed when a next session ticket is provided."""
        pass

    def _set_ticket(self, t):
        # Setting the ticket also freezes ``fsid`` (the prefix shared by all
        # subjob exchange files) and dumps the current datastore so that
        # subjobs can load it at startup.
        self._ticket = t
        self.fsid = self.ticket.sh.path.join(self.ticket.sh.pwd(), 'sjob_fs')
        self._ticket.datastore.pickle_dump(_DSTORE_IN.format(self.fsid))
        self._new_ticket_hook(t)

    def _get_ticket(self):
        # A ticket must have been provided before any launch/wait action.
        assert self._ticket is not None
        return self._ticket

    ticket = property(_get_ticket, _set_ticket, doc="The current Session's ticket")

    def __call__(self, tag, subtags):
        """Launch the subjob that will be in charge of processing the **tag** node."""
        # Block until a slot is free when the parallelism limit is reached.
        if self.actual_limit is not None and self._watermark == self.actual_limit:
            logger.info("The subjobs limit is reached (%d). Waiting for some subjobs to finish.",
                        self.actual_limit)
            self.wait()
        self._running[tag] = subtags
        self._watermark += 1
        # The subjob detects it is a subjob through this environment variable.
        with self.ticket.env.delta_context(VORTEX_SUBJOB_ACTIVATED='{:s}:{:s}'.format(tag, self.fsid)):
            logger.info("Launching subjob with VORTEX_SUBJOB_ACTIVATED='%s'",
                        self.ticket.env.VORTEX_SUBJOB_ACTIVATED)
            self._actual_launch(tag)

    def wait(self):
        """Wait for at least one subjob to complete.

        For each finished subjob: dump its captured stdout, merge its
        datastore back into the session, and free its slot.
        Returns the ``(done, ko)`` sets of tags.
        """
        done, ko = self._actual_wait()
        for tag in done | ko:
            self._stdeo_dump(tag, 'succeeded' if tag in done else 'failed')
            self._update_context(tag)
            del self._running[tag]
            self._watermark -= 1
        return done, ko

    def waitall(self):
        """Wait for all subjob to complete.

        Returns the accumulated ``(done, ko)`` sets of tags.
        """
        logger.info("Waiting for all subjobs to terminate.")
        done = set()
        ko = set()
        while self._running:
            new_done, new_ko = self.wait()
            done.update(new_done)
            ko.update(new_ko)
        return done, ko

    def _stdeo_dump(self, tag, outcome='succeeded', ignore_end=False):
        """Dump the standard output of the subjob referred to by **tag**.

        Only the portion framed by the start/end capture markers is echoed
        (see :func:`subjob_handling`).

        :param tag: The subjob's tag
        :param outcome: Some indication on how the subjob ended
        :param ignore_end: Print the entire standard output (useful for debugging)
        """
        plocale = locale.getlocale()[1] or 'ascii'
        self.ticket.sh.title('subjob "{:s}" {:s}. Here is the output:'.format(tag, outcome))
        with open(_JOB_STDEO.format(self.fsid, tag), encoding=plocale) as fhst:
            started = False
            for lst in fhst:
                if started:
                    if lst == _LOG_CAPTURE_END:
                        if not ignore_end:
                            break
                    else:
                        sys.stdout.write(lst)
                else:
                    # Skip everything until the start marker is seen.
                    started = lst == _LOG_CAPTURE_START
        print()
        # NOTE(review): this duplicates the _JOB_STDEO template by hand —
        # keep both in sync.
        print('Full Log available at: {:s}_{:s}.stdeo'.format(self.fsid, tag))
        print()

    def _update_context(self, tag):
        """Update the context using the **tag** subjob datastore's dump."""
        stags = self._running[tag]
        ds = datastore.DataStore()
        ds.pickle_load(_DSTORE_OUT.format(self.fsid, tag))
        for k in ds.keys():
            # Context entries: overwrite or insert the matching entry of the
            # current session's datastore.
            if re.match('context_', k.kind):
                xpath = k.extras.get('path', '').split('/')
                for stag in stags:
                    # Only update relevant entries
                    if xpath[-1] == stag:
                        logger.info('Updating "%s, path=%s" in the datastore',
                                    k.kind, '/'.join(xpath))
                        curv = self.ticket.datastore.get(k.kind, k.extras)
                        if hasattr(curv, 'datastore_inplace_overwrite'):
                            # In-place update keeps existing references valid.
                            curv.datastore_inplace_overwrite(ds.get(k.kind, k.extras))
                        else:
                            self.ticket.datastore.insert(k.kind, k.extras,
                                                         ds.get(k.kind, k.extras))
                        break
            # Replay the observer-board messages recorded inside the subjob
            # on the local Layout-Nodes observer board.
            if k.kind == _SUBJOB_NODES_CHATTER and k.extras['tag'] == tag:
                messages = ds.get(k.kind, k.extras)
                if messages:
                    oboard = observer.get(tag=self.nodes_obsboard_tag)
                    oboard.notify_new(self, dict(tag=tag, subjob_replay=True))
                    for message in messages:
                        message['subjob_replay'] = True
                        oboard.notify_upd(self, message)
                        logger.debug('Relaying status change: %s', message)
                    oboard.notify_del(self, dict(tag=tag, subjob_replay=False))

    def _actual_launch(self, tag):
        """Launch the **tag** subjob: to be overwritten in the subclass!"""
        raise NotImplementedError()

    def _actual_wait(self):
        """Wait for the **tag** subjob: to be overwritten in the subclass!"""
        raise NotImplementedError()
|
|
231
|
+
|
|
232
|
+
|
|
233
|
+
class SpawnSubJobLauncher(AbstractSubJobLauncher):
    """A very simple subjob launcher: just starts a new process on the local host."""

    _footprint = dict(
        attr = dict(
            kind = dict(
                values = ['spawn', ]
            )
        )
    )

    def __init__(self, *kargs, **kwargs):
        super().__init__(*kargs, **kwargs)
        self._outfhs = dict()       # tag -> open file object capturing stdout/stderr
        self._processes = dict()    # tag -> process object returned by sh.popen
        self._nvcores = None        # auto-detected cores count (used when limit is None)

    @property
    def actual_limit(self):
        """The maximum number of subjobs allowed in parallel.

        :raises ValueError: if no explicit limit was given and the number of
            cores could not be detected.
        """
        if self.limit is not None:
            return self.limit
        elif self._nvcores is not None:
            return self._nvcores
        else:
            raise ValueError('No limit could be found for the number of subprocesses.')

    def _new_ticket_hook(self, t):
        """Tries to find out the number of cores available on this host."""
        super()._new_ticket_hook(t)
        if self.limit is None and t.sh.cpus_info is not None:
            # NOTE(review): the log message says "virtual cores" but
            # ``nphysical_cores`` is read here — confirm which is intended.
            self._nvcores = t.sh.cpus_info.nphysical_cores
            logger.info('"spawn" subjob launcher set to %d (i.e the number of virtual cores)',
                        self._nvcores)

    def _actual_launch(self, tag):
        """Just launch the subjob using a subprocess... easy!"""
        sh = self.ticket.sh
        # Capture stdout/stderr in the per-subjob log file; the handle is
        # kept open until the subjob terminates (see _actual_wait).
        ofh = open(_JOB_STDEO.format(self.fsid, tag), mode='wb')
        p = sh.popen([sys.executable, self.scriptpath], stdout=ofh, stderr=ofh)
        self._outfhs[tag] = ofh
        self._processes[tag] = p

    def _actual_wait(self):
        """Wait for at least one subprocess to terminate.

        Returns a pair ``(oktags, kotags)``.  As soon as at least one subjob
        succeeds (and none failed) the successful tags are returned; if any
        subjob fails, every remaining subjob is awaited first and the failed
        tags are reported in ``kotags``.
        """
        sh = self.ticket.sh
        oktags = set()
        kotags = set()
        while self._processes and (not oktags or kotags):
            for tag, p in self._processes.items():
                if p.poll() is not None:
                    if sh.pclose(p):
                        oktags.add(tag)
                    else:
                        kotags.add(tag)
            for tag in oktags | kotags:
                if tag in self._processes:
                    del self._processes[tag]
                    # Bugfix: close the capture file instead of merely dropping
                    # the reference (which leaked the file descriptor).
                    self._outfhs.pop(tag).close()
            if self._processes and (not oktags or kotags):
                # Polling loop: nap briefly before re-checking.
                time.sleep(0.5)
        return oktags, kotags
|
|
300
|
+
|
|
301
|
+
|
|
302
|
+
class AbstractSshSubJobLauncher(AbstractSubJobLauncher):
    """Use SSH to launch a remote subjob.

    Concrete subclasses provide :meth:`_find_raw_nodes_list` (the target
    hostnames) and may customise the exported environment.
    """

    _abstract = True
    _footprint = dict(
        attr = dict(
            taskspn = dict(
                info = "The number of tasks launched on each node.",
                type = int,
                default = 1,
                optional = True,
            ),
        )
    )

    def __init__(self, *kargs, **kwargs):
        super().__init__(*kargs, **kwargs)
        self.nodes = list()                   # every usable (node x task) slot
        self._avnodes = collections.deque()   # slots currently free
        self._outfhs = dict()                 # tag -> capture file object
        self._hosts = dict()                  # tag -> hostname running the subjob
        self._processes = dict()              # tag -> background SSH process

    @property
    def actual_limit(self):
        """The maximum number of subjobs allowed in parallel."""
        if self.limit is None:
            return len(self.nodes)
        else:
            return self.limit

    def _find_raw_nodes_list(self, t):
        """This method should be overwritten and return a list of target hostnames."""
        raise NotImplementedError()

    def _env_variables_iterator(self, t):
        """Return the environment variables that will be exported to the subjob."""
        return t.topenv.items()

    def _new_ticket_hook(self, t):
        """Common initialisations."""
        super()._new_ticket_hook(t)
        self.nodes = self._find_raw_nodes_list(t)
        # Several tasks may be launched on a single node
        self.nodes = self.nodes * self.taskspn
        if self.limit is not None and len(self.nodes) < self.limit:
            # Bugfix: the original message used '{:s}' on an int (len(...)),
            # which raised a ValueError instead of the intended RuntimeError.
            raise RuntimeError('There are not enough compute nodes (available={:d}/requested={:d})'
                               .format(len(self.nodes), self.limit))
        # Summary
        logger.info("%d task(s) per node will be launched. Nodes list: %s",
                    self.taskspn, ",".join(sorted(set(self.nodes))))
        # Available nodes
        self._avnodes = collections.deque(self.nodes)
        # Freeze the root environment in a wrapper
        self._lwrapper = '{:s}.wrap.sh'.format(self.fsid)
        with open(self._lwrapper, 'w', encoding='utf-8') as fhwrap:
            fhwrap.write('#! /bin/bash\n')
            fhwrap.write('set -x\n')
            fhwrap.write('set -e\n')
            fhwrap.write('echo "I am running on $(hostname) !"\n')
            for k, v in self._env_variables_iterator(t):
                if re.match(r'[-_\w]+$', k):  # Get rid of weird variable names
                    fhwrap.write("export {:s}='{!s}'\n".format(k.upper(), v))
            # Bugfix: use "$@" instead of $* so that arguments containing
            # whitespace are forwarded unmangled to the wrapped command.
            fhwrap.write('exec "$@"\n')
        t.sh.xperm(self._lwrapper, force=True)
        # Put the script on the parallel file-system (otherwise it won't be accessible
        # from other nodes)
        self._pfs_scriptpath = '{:s}.script.py'.format(self.fsid)
        t.sh.cp(self.scriptpath, self._pfs_scriptpath)

    def _env_lastminute_update(self, tag, thost):
        """Add some lastminute environment variables (none by default)."""
        return dict().items()

    def _actual_launch(self, tag):
        """Just launch the subjob using an SSH command."""
        sh = self.ticket.sh
        thost = self._avnodes.popleft()
        logger.info('"%s" will be used (through SSH).', thost)
        ofh = open(_JOB_STDEO.format(self.fsid, tag), mode='wb')
        cmd = "export VORTEX_SUBJOB_ACTIVATED='{:s}'; ".format(sh.env.VORTEX_SUBJOB_ACTIVATED)
        cmd += ' '.join(["export {:s}='{!s}'; ".format(k, v)
                         for k, v in self._env_lastminute_update(tag, thost)])
        cmd += ' '.join([self._lwrapper, sys.executable, self._pfs_scriptpath])
        print(cmd)
        p = sh.ssh(thost).background_execute(cmd, sshopts='-o CheckHostIP=no',
                                             stdout=ofh, stderr=ofh)
        self._outfhs[tag] = ofh
        self._hosts[tag] = thost
        self._processes[tag] = p

    def _actual_wait(self):
        """Wait for at least one SSH-launched subjob to terminate.

        Returns a pair ``(oktags, kotags)``.  As soon as at least one subjob
        succeeds (and none failed) the successful tags are returned; if any
        subjob fails, every remaining subjob is awaited first and the failed
        tags are reported in ``kotags``.
        """
        sh = self.ticket.sh
        oktags = set()
        kotags = set()
        while self._processes and (not oktags or kotags):
            for tag, p in self._processes.items():
                if p.poll() is not None:
                    if sh.pclose(p):
                        oktags.add(tag)
                    else:
                        kotags.add(tag)
            for tag in oktags | kotags:
                if tag in self._processes:
                    # Put the node back in the pool for subsequent subjobs.
                    self._avnodes.append(self._hosts.pop(tag))
                    del self._processes[tag]
                    # Bugfix: close the capture file instead of merely dropping
                    # the reference (which leaked the file descriptor).
                    self._outfhs.pop(tag).close()
            if self._processes and (not oktags or kotags):
                # Polling loop: nap briefly before re-checking.
                time.sleep(0.5)
        return oktags, kotags
|
|
417
|
+
|
|
418
|
+
|
|
419
|
+
class SlurmSshSubJobLauncher(AbstractSshSubJobLauncher):
    """
    Query SLURM for the list of available compute nodes and use SSH to
    launch subjobs.
    """

    _footprint = dict(
        attr = dict(
            kind = dict(
                values = ['slurm:ssh', ]
            ),
        )
    )

    def _env_variables_iterator(self, t):
        """Yield the environment variables that will be used.

        Node-list/node-count SLURM variables are dropped, and SLURM values
        carrying a node-count suffix are rewritten so that the subjob sees
        itself as a single-node job.
        """
        skipped = {"SLURM_{:s}".format(suffix)
                   for suffix in ('NNODES', 'JOB_NNODES', 'JOB_NUM_NODES',
                                  'NODELIST', 'JOB_NODELIST')}
        nodes_count_re = re.compile(r'(\(x\d+\))$')
        for varname, value in t.topenv.items():
            if varname in skipped:
                continue
            if varname.startswith('SLURM') and nodes_count_re.search(value):
                value = nodes_count_re.sub('(x1)', value)
            yield varname, value
        for varname in ('SLURM_NNODES', 'SLURM_JOB_NNODES', 'SLURM_JOB_NUM_NODES'):
            yield varname, "1"

    def _env_lastminute_update(self, tag, thost):  # @UnusedVariable
        """Add some lastminute environment variables."""
        overrides = dict(SLURM_NODELIST=thost,
                         SLURM_JOB_NODELIST=thost)
        return overrides.items()

    def _find_raw_nodes_list(self, t):
        """Find out what is the nodes list.

        To do so, the SLURM_JOB_NODELIST environment variable is processed.
        """
        raw_nodes = t.env.get('SLURM_JOB_NODELIST',
                              t.env.get('SLURM_NODELIST', None))
        if not raw_nodes:
            raise RuntimeError('The "SLURM_JOB_NODELIST" environment variable is not defined.')
        return xlist_strings(raw_nodes)
|
vortex/nwp/__init__.py
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
"""
|
|
2
|
+
AlgoComponents for NWP
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
# Recursive inclusion of packages with potential FootprintBase classes
|
|
6
|
+
from . import forecasts, fpserver, coupling, mpitools, odbtools, stdpost, assim, \
|
|
7
|
+
eps, eda, request, monitoring, clim
|
|
8
|
+
from . import oopsroot, oopstests
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
#: No automatic export
|
|
12
|
+
__all__ = []
|