vortex-nwp 2.0.0b1__py3-none-any.whl → 2.1.0__py3-none-any.whl
- vortex/__init__.py +75 -47
- vortex/algo/__init__.py +3 -2
- vortex/algo/components.py +944 -618
- vortex/algo/mpitools.py +802 -497
- vortex/algo/mpitools_templates/__init__.py +1 -0
- vortex/algo/serversynctools.py +34 -33
- vortex/config.py +19 -22
- vortex/data/__init__.py +9 -3
- vortex/data/abstractstores.py +593 -655
- vortex/data/containers.py +217 -162
- vortex/data/contents.py +65 -39
- vortex/data/executables.py +93 -102
- vortex/data/flow.py +40 -34
- vortex/data/geometries.py +228 -132
- vortex/data/handlers.py +436 -227
- vortex/data/outflow.py +15 -15
- vortex/data/providers.py +185 -163
- vortex/data/resources.py +48 -42
- vortex/data/stores.py +540 -417
- vortex/data/sync_templates/__init__.py +0 -0
- vortex/gloves.py +114 -87
- vortex/layout/__init__.py +1 -8
- vortex/layout/contexts.py +150 -84
- vortex/layout/dataflow.py +353 -202
- vortex/layout/monitor.py +264 -128
- vortex/nwp/__init__.py +5 -2
- vortex/nwp/algo/__init__.py +14 -5
- vortex/nwp/algo/assim.py +205 -151
- vortex/nwp/algo/clim.py +683 -517
- vortex/nwp/algo/coupling.py +447 -225
- vortex/nwp/algo/eda.py +437 -229
- vortex/nwp/algo/eps.py +403 -231
- vortex/nwp/algo/forecasts.py +416 -275
- vortex/nwp/algo/fpserver.py +683 -307
- vortex/nwp/algo/ifsnaming.py +205 -145
- vortex/nwp/algo/ifsroot.py +215 -122
- vortex/nwp/algo/monitoring.py +137 -76
- vortex/nwp/algo/mpitools.py +330 -190
- vortex/nwp/algo/odbtools.py +637 -353
- vortex/nwp/algo/oopsroot.py +454 -273
- vortex/nwp/algo/oopstests.py +90 -56
- vortex/nwp/algo/request.py +287 -206
- vortex/nwp/algo/stdpost.py +878 -522
- vortex/nwp/data/__init__.py +22 -4
- vortex/nwp/data/assim.py +125 -137
- vortex/nwp/data/boundaries.py +121 -68
- vortex/nwp/data/climfiles.py +193 -211
- vortex/nwp/data/configfiles.py +73 -69
- vortex/nwp/data/consts.py +426 -401
- vortex/nwp/data/ctpini.py +59 -43
- vortex/nwp/data/diagnostics.py +94 -66
- vortex/nwp/data/eda.py +50 -51
- vortex/nwp/data/eps.py +195 -146
- vortex/nwp/data/executables.py +440 -434
- vortex/nwp/data/fields.py +63 -48
- vortex/nwp/data/gridfiles.py +183 -111
- vortex/nwp/data/logs.py +250 -217
- vortex/nwp/data/modelstates.py +180 -151
- vortex/nwp/data/monitoring.py +72 -99
- vortex/nwp/data/namelists.py +254 -202
- vortex/nwp/data/obs.py +400 -308
- vortex/nwp/data/oopsexec.py +22 -20
- vortex/nwp/data/providers.py +90 -65
- vortex/nwp/data/query.py +71 -82
- vortex/nwp/data/stores.py +49 -36
- vortex/nwp/data/surfex.py +136 -137
- vortex/nwp/syntax/__init__.py +1 -1
- vortex/nwp/syntax/stdattrs.py +173 -111
- vortex/nwp/tools/__init__.py +2 -2
- vortex/nwp/tools/addons.py +22 -17
- vortex/nwp/tools/agt.py +24 -12
- vortex/nwp/tools/bdap.py +16 -5
- vortex/nwp/tools/bdcp.py +4 -1
- vortex/nwp/tools/bdm.py +3 -0
- vortex/nwp/tools/bdmp.py +14 -9
- vortex/nwp/tools/conftools.py +728 -378
- vortex/nwp/tools/drhook.py +12 -8
- vortex/nwp/tools/grib.py +65 -39
- vortex/nwp/tools/gribdiff.py +22 -17
- vortex/nwp/tools/ifstools.py +82 -42
- vortex/nwp/tools/igastuff.py +167 -143
- vortex/nwp/tools/mars.py +14 -2
- vortex/nwp/tools/odb.py +234 -125
- vortex/nwp/tools/partitioning.py +61 -37
- vortex/nwp/tools/satrad.py +27 -12
- vortex/nwp/util/async.py +83 -55
- vortex/nwp/util/beacon.py +10 -10
- vortex/nwp/util/diffpygram.py +174 -86
- vortex/nwp/util/ens.py +144 -63
- vortex/nwp/util/hooks.py +30 -19
- vortex/nwp/util/taskdeco.py +28 -24
- vortex/nwp/util/usepygram.py +278 -172
- vortex/nwp/util/usetnt.py +31 -17
- vortex/sessions.py +72 -39
- vortex/syntax/__init__.py +1 -1
- vortex/syntax/stdattrs.py +410 -171
- vortex/syntax/stddeco.py +31 -22
- vortex/toolbox.py +327 -192
- vortex/tools/__init__.py +11 -2
- vortex/tools/actions.py +110 -121
- vortex/tools/addons.py +111 -92
- vortex/tools/arm.py +42 -22
- vortex/tools/compression.py +72 -69
- vortex/tools/date.py +11 -4
- vortex/tools/delayedactions.py +242 -132
- vortex/tools/env.py +75 -47
- vortex/tools/folder.py +342 -171
- vortex/tools/grib.py +341 -162
- vortex/tools/lfi.py +423 -216
- vortex/tools/listings.py +109 -40
- vortex/tools/names.py +218 -156
- vortex/tools/net.py +655 -299
- vortex/tools/parallelism.py +93 -61
- vortex/tools/prestaging.py +55 -31
- vortex/tools/schedulers.py +172 -105
- vortex/tools/services.py +403 -334
- vortex/tools/storage.py +293 -358
- vortex/tools/surfex.py +24 -24
- vortex/tools/systems.py +1234 -643
- vortex/tools/targets.py +156 -100
- vortex/util/__init__.py +1 -1
- vortex/util/config.py +378 -327
- vortex/util/empty.py +2 -2
- vortex/util/helpers.py +56 -24
- vortex/util/introspection.py +18 -12
- vortex/util/iosponge.py +8 -4
- vortex/util/roles.py +4 -6
- vortex/util/storefunctions.py +39 -13
- vortex/util/structs.py +3 -3
- vortex/util/worker.py +29 -17
- vortex_nwp-2.1.0.dist-info/METADATA +67 -0
- vortex_nwp-2.1.0.dist-info/RECORD +144 -0
- {vortex_nwp-2.0.0b1.dist-info → vortex_nwp-2.1.0.dist-info}/WHEEL +1 -1
- vortex/layout/appconf.py +0 -109
- vortex/layout/jobs.py +0 -1276
- vortex/layout/nodes.py +0 -1424
- vortex/layout/subjobs.py +0 -464
- vortex_nwp-2.0.0b1.dist-info/METADATA +0 -50
- vortex_nwp-2.0.0b1.dist-info/RECORD +0 -146
- {vortex_nwp-2.0.0b1.dist-info → vortex_nwp-2.1.0.dist-info/licenses}/LICENSE +0 -0
- {vortex_nwp-2.0.0b1.dist-info → vortex_nwp-2.1.0.dist-info}/top_level.txt +0 -0
vortex/layout/nodes.py
DELETED
|
@@ -1,1424 +0,0 @@
|
|
|
1
|
-
"""
|
|
2
|
-
This modules defines the base nodes of the logical layout
|
|
3
|
-
for any :mod:`vortex` experiment.
|
|
4
|
-
|
|
5
|
-
The documentation of this module is probably not enough to understand all the
|
|
6
|
-
features of :class:`Node` and :class:`Driver` objects. The examples provided
|
|
7
|
-
with the Vortex source code (see :ref:`examples_jobs`) may shed some light on
|
|
8
|
-
interesting features.
|
|
9
|
-
"""
|
|
10
|
-
|
|
11
|
-
import collections
|
|
12
|
-
import contextlib
|
|
13
|
-
import re
|
|
14
|
-
import sys
|
|
15
|
-
import traceback
|
|
16
|
-
|
|
17
|
-
from bronx.fancies import loggers
|
|
18
|
-
from bronx.patterns import getbytag, observer
|
|
19
|
-
from bronx.syntax.iterators import izip_pcn
|
|
20
|
-
from bronx.system.interrupt import SignalInterruptError
|
|
21
|
-
from footprints import proxy as fpx
|
|
22
|
-
from footprints.stdtypes import FPDict
|
|
23
|
-
from vortex import toolbox, VortexForceComplete
|
|
24
|
-
from vortex.layout.appconf import ConfigSet
|
|
25
|
-
from vortex.layout.subjobs import subjob_handling, SubJobLauncherError
|
|
26
|
-
from vortex.syntax.stdattrs import Namespace
|
|
27
|
-
from vortex.util.config import GenericConfigParser
|
|
28
|
-
|
|
29
|
-
logger = loggers.getLogger(__name__)
|
|
30
|
-
|
|
31
|
-
#: Export real nodes.
|
|
32
|
-
__all__ = ['Driver', 'Task', 'Family']
|
|
33
|
-
|
|
34
|
-
OBSERVER_TAG = 'Layout-Nodes'
|
|
35
|
-
|
|
36
|
-
#: Definition of a named tuple for Node Statuses
|
|
37
|
-
_NodeStatusTuple = collections.namedtuple('_NodeStatusTuple',
|
|
38
|
-
['CREATED', 'READY', 'RUNNING', 'DONE', 'FAILED'])
|
|
39
|
-
|
|
40
|
-
#: Predefined Node Status values
|
|
41
|
-
NODE_STATUS = _NodeStatusTuple(CREATED='created',
|
|
42
|
-
READY='ready to start',
|
|
43
|
-
RUNNING='running',
|
|
44
|
-
DONE='done',
|
|
45
|
-
FAILED='FAILED')
|
|
46
|
-
|
|
47
|
-
#: Definition of a named tuple for Node on_error behaviour
|
|
48
|
-
_NodeOnErrorTuple = collections.namedtuple('_NodeOnErrorTuple',
|
|
49
|
-
['FAIL', 'DELAYED_FAIL', 'CONTINUE'])
|
|
50
|
-
|
|
51
|
-
#: Predefined Node Status values
|
|
52
|
-
NODE_ON_ERROR = _NodeOnErrorTuple(FAIL='fail',
|
|
53
|
-
DELAYED_FAIL='delayed_fail',
|
|
54
|
-
CONTINUE='continue')
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
class PreviousFailureError(RuntimeError):
|
|
58
|
-
"""This exception is raised in multistep jobs (when a failure already occurred)."""
|
|
59
|
-
pass
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
class RequestedFailureError(RuntimeError):
|
|
63
|
-
"""
|
|
64
|
-
This exception is raised, when a Node finishes, if the `fail_at_the_end`
|
|
65
|
-
property is True.
|
|
66
|
-
"""
|
|
67
|
-
pass
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
class NiceLayout(observer.Observer):
|
|
71
|
-
"""Some nice method to share between layout items."""
|
|
72
|
-
|
|
73
|
-
@property
|
|
74
|
-
def tag(self):
|
|
75
|
-
"""Abstract property: have to be defined later on"""
|
|
76
|
-
raise NotImplementedError
|
|
77
|
-
|
|
78
|
-
@property
|
|
79
|
-
def ticket(self):
|
|
80
|
-
"""Abstract property: have to be defined later on"""
|
|
81
|
-
raise NotImplementedError
|
|
82
|
-
|
|
83
|
-
@property
|
|
84
|
-
def sh(self):
|
|
85
|
-
"""Abstract property: have to be defined later on"""
|
|
86
|
-
raise NotImplementedError
|
|
87
|
-
|
|
88
|
-
@property
|
|
89
|
-
def contents(self):
|
|
90
|
-
"""Abstract property: have to be defined later on"""
|
|
91
|
-
raise NotImplementedError
|
|
92
|
-
|
|
93
|
-
def highlight(self, *args, **kw):
|
|
94
|
-
"""Proxy to :meth:`~vortex.tools.systems.subtitle` method."""
|
|
95
|
-
return self.sh.highlight(*args, bchar=' #', bline0=False, **kw)
|
|
96
|
-
|
|
97
|
-
def subtitle(self, *args, **kw):
|
|
98
|
-
"""Proxy to :meth:`~vortex.tools.systems.subtitle` method."""
|
|
99
|
-
return self.sh.subtitle(*args, **kw)
|
|
100
|
-
|
|
101
|
-
def header(self, *args, **kw):
|
|
102
|
-
"""Proxy to :meth:`~vortex.tools.systems.header` method."""
|
|
103
|
-
return self.sh.header(*args, **kw)
|
|
104
|
-
|
|
105
|
-
def nicedump(self, msg, titlecallback=None, **kw):
|
|
106
|
-
"""Simple dump of the dict contents with ``msg`` as header."""
|
|
107
|
-
titlecallback = titlecallback or self.header
|
|
108
|
-
titlecallback(msg)
|
|
109
|
-
if kw:
|
|
110
|
-
maxlen = max([len(x) for x in kw.keys()])
|
|
111
|
-
for k, v in sorted(kw.items()):
|
|
112
|
-
print(' +', k.ljust(maxlen), '=', str(v))
|
|
113
|
-
print()
|
|
114
|
-
else:
|
|
115
|
-
print(" + ...\n")
|
|
116
|
-
|
|
117
|
-
def _print_traceback(self):
|
|
118
|
-
exc_type, exc_value, exc_traceback = sys.exc_info()
|
|
119
|
-
print('Exception type: {!s}'.format(exc_type))
|
|
120
|
-
print('Exception values: {!s}'.format(exc_value))
|
|
121
|
-
self.header('Traceback Error / BEGIN')
|
|
122
|
-
print("\n".join(traceback.format_tb(exc_traceback)))
|
|
123
|
-
self.header('Traceback Error / END')
|
|
124
|
-
|
|
125
|
-
@property
|
|
126
|
-
def _ds_extra(self):
|
|
127
|
-
return {'tag': self.tag, 'class': self.__class__.__name__}
|
|
128
|
-
|
|
129
|
-
def _nicelayout_init(self, kw):
|
|
130
|
-
"""Initialise generic stuff."""
|
|
131
|
-
self._on_error = kw.get('on_error', NODE_ON_ERROR.FAIL)
|
|
132
|
-
if self._on_error not in NODE_ON_ERROR:
|
|
133
|
-
raise ValueError('Erroneous value for on_error: {!s}'.format(self._on_error))
|
|
134
|
-
self._obs_board = observer.get(tag=OBSERVER_TAG)
|
|
135
|
-
self._obs_board.notify_new(self, dict(tag=self.tag, typename=type(self).__name__,
|
|
136
|
-
status=self.status,
|
|
137
|
-
on_error=self.on_error))
|
|
138
|
-
self._obs_board.register(self)
|
|
139
|
-
# Increment the mstep counter
|
|
140
|
-
self.ticket.datastore.insert('layout_mstep_counter', self._ds_extra,
|
|
141
|
-
self.mstep_counter + 1, readonly=False)
|
|
142
|
-
|
|
143
|
-
def updobsitem(self, item, info):
|
|
144
|
-
if info.get('observerboard', '') == OBSERVER_TAG:
|
|
145
|
-
o_id = (info['tag'], info['typename'])
|
|
146
|
-
if info.get('subjob_replay', False):
|
|
147
|
-
# If the status/delayed_error_flag chatter is being replayed,
|
|
148
|
-
# deal with it
|
|
149
|
-
if o_id == (self.tag, type(self).__name__):
|
|
150
|
-
if 'new_status' in info:
|
|
151
|
-
self._store_status(info['new_status'])
|
|
152
|
-
if info.get('delayed_error_flag', False):
|
|
153
|
-
self._store_delayed_error_flag(True)
|
|
154
|
-
else:
|
|
155
|
-
if (self.status != NODE_STATUS.CREATED and
|
|
156
|
-
any([o_id == (k.tag, type(k).__name__) for k in self.contents])):
|
|
157
|
-
# We are only interested in child nodes
|
|
158
|
-
if info.get('new_status', None) == NODE_STATUS.FAILED:
|
|
159
|
-
# On kid failure, update my own status
|
|
160
|
-
if info['on_error'] == NODE_ON_ERROR.FAIL:
|
|
161
|
-
self.status = NODE_STATUS.FAILED
|
|
162
|
-
if 'delayed_error_flag' in info:
|
|
163
|
-
# Propagate the delayed error flag
|
|
164
|
-
self.delayed_error_flag = True
|
|
165
|
-
|
|
166
|
-
@property
|
|
167
|
-
def mstep_counter(self):
|
|
168
|
-
"""Count how many times this object was created."""
|
|
169
|
-
return self.ticket.datastore.get('layout_mstep_counter', self._ds_extra,
|
|
170
|
-
default_payload=0, readonly=False)
|
|
171
|
-
|
|
172
|
-
@property
|
|
173
|
-
def on_error(self):
|
|
174
|
-
"""How to react on error."""
|
|
175
|
-
return self._on_error
|
|
176
|
-
|
|
177
|
-
@property
|
|
178
|
-
def delayed_error_flag(self):
|
|
179
|
-
"""Return the delayed error flag."""
|
|
180
|
-
return self.ticket.datastore.get('layout_delayed_error_flag', self._ds_extra,
|
|
181
|
-
default_payload=False, readonly=False)
|
|
182
|
-
|
|
183
|
-
def _store_delayed_error_flag(self, value):
|
|
184
|
-
self.ticket.datastore.insert('layout_delayed_error_flag', self._ds_extra,
|
|
185
|
-
value, readonly=False)
|
|
186
|
-
|
|
187
|
-
@delayed_error_flag.setter
|
|
188
|
-
def delayed_error_flag(self, value):
|
|
189
|
-
"""Set the status of the current Node/Driver."""
|
|
190
|
-
if not bool(value):
|
|
191
|
-
raise ValueError('True is the only possible value for delayed_error_flag')
|
|
192
|
-
if not self.delayed_error_flag:
|
|
193
|
-
self._obs_board.notify_upd(self, dict(tag=self.tag, typename=type(self).__name__,
|
|
194
|
-
delayed_error_flag=True))
|
|
195
|
-
self._store_delayed_error_flag(True)
|
|
196
|
-
|
|
197
|
-
@property
|
|
198
|
-
def status(self):
|
|
199
|
-
"""Return the status of the current Node/Driver."""
|
|
200
|
-
return self.ticket.datastore.get('layout_status', self._ds_extra,
|
|
201
|
-
default_payload=NODE_STATUS.CREATED, readonly=False)
|
|
202
|
-
|
|
203
|
-
@property
|
|
204
|
-
def status_mstep_counter(self):
|
|
205
|
-
"""Return the number of the multi-step that last updated the status."""
|
|
206
|
-
return self.ticket.datastore.get('layout_status_mstep', self._ds_extra,
|
|
207
|
-
default_payload=0, readonly=False)
|
|
208
|
-
|
|
209
|
-
def _store_status(self, value):
|
|
210
|
-
self.ticket.datastore.insert('layout_status', self._ds_extra,
|
|
211
|
-
value, readonly=False)
|
|
212
|
-
self._store_status_mstep_counter()
|
|
213
|
-
|
|
214
|
-
def _store_status_mstep_counter(self):
|
|
215
|
-
self.ticket.datastore.insert('layout_status_mstep', self._ds_extra,
|
|
216
|
-
self.mstep_counter, readonly=False)
|
|
217
|
-
|
|
218
|
-
@status.setter
|
|
219
|
-
def status(self, value):
|
|
220
|
-
"""Set the status of the current Node/Driver."""
|
|
221
|
-
if value not in NODE_STATUS:
|
|
222
|
-
raise ValueError('Erroneous value for the node status: {!s}'.format(value))
|
|
223
|
-
if value != self.status:
|
|
224
|
-
self._obs_board.notify_upd(self, dict(tag=self.tag, typename=type(self).__name__,
|
|
225
|
-
previous_status=self.status,
|
|
226
|
-
new_status=value,
|
|
227
|
-
on_error=self.on_error))
|
|
228
|
-
if value == NODE_STATUS.FAILED and self.on_error == NODE_ON_ERROR.DELAYED_FAIL:
|
|
229
|
-
self.delayed_error_flag = True
|
|
230
|
-
self._store_status(value)
|
|
231
|
-
else:
|
|
232
|
-
self._store_status_mstep_counter()
|
|
233
|
-
|
|
234
|
-
@property
|
|
235
|
-
def any_failure(self):
|
|
236
|
-
"""Return True if self or any of the subnodes failed."""
|
|
237
|
-
failure = self.status == NODE_STATUS.FAILED
|
|
238
|
-
return failure or any([k.any_failure for k in self.contents])
|
|
239
|
-
|
|
240
|
-
@property
|
|
241
|
-
def any_currently_running(self):
|
|
242
|
-
"""Return True if self or any of the subnodes is running."""
|
|
243
|
-
running = (self.status == NODE_STATUS.RUNNING and
|
|
244
|
-
self.status_mstep_counter == self.mstep_counter)
|
|
245
|
-
return running or any([k.any_currently_running for k in self.contents])
|
|
246
|
-
|
|
247
|
-
def tree_str(self, statuses_filter=(), with_conf=False):
|
|
248
|
-
"""Print the node's tree."""
|
|
249
|
-
# Kids contribution
|
|
250
|
-
filtered_kids = [k for k in self.contents
|
|
251
|
-
if not statuses_filter or k.status in statuses_filter]
|
|
252
|
-
kids_str = ['\n'.join('{:s}{:s} {:s}'.format('|' if i == 0 or ikid < len(filtered_kids) - 1 else ' ',
|
|
253
|
-
'--' if i == 0 else ' ',
|
|
254
|
-
line)
|
|
255
|
-
for i, line in enumerate(kid.tree_str(statuses_filter=statuses_filter,
|
|
256
|
-
with_conf=with_conf).split('\n')))
|
|
257
|
-
for ikid, kid in enumerate(filtered_kids)]
|
|
258
|
-
# Myself
|
|
259
|
-
tree = []
|
|
260
|
-
if not statuses_filter or self.status in statuses_filter:
|
|
261
|
-
if len(statuses_filter) != 1:
|
|
262
|
-
me_fmt = '{tag:s} ({what:s}) -> {status:s}'
|
|
263
|
-
else:
|
|
264
|
-
me_fmt = '{tag:s} ({what:s})'
|
|
265
|
-
x_status = self.status
|
|
266
|
-
if x_status == NODE_STATUS.RUNNING and self.status_mstep_counter < self.mstep_counter:
|
|
267
|
-
x_status = "interrupted because of others errors"
|
|
268
|
-
me = me_fmt.format(tag=self.tag, what=self.__class__.__name__, status=x_status)
|
|
269
|
-
if self.status == NODE_STATUS.FAILED and self.on_error != NODE_ON_ERROR.FAIL:
|
|
270
|
-
me += ' (but {:s})'.format(self.on_error)
|
|
271
|
-
tree.append(me)
|
|
272
|
-
if with_conf:
|
|
273
|
-
cd = self.confdiff
|
|
274
|
-
if cd:
|
|
275
|
-
tree.extend(['{:s} {:s}={!s}'.format('|' if self.contents else ' ', k, v)
|
|
276
|
-
for k, v in sorted(cd.items())])
|
|
277
|
-
# Myself + kids
|
|
278
|
-
tree.extend(kids_str)
|
|
279
|
-
return '\n'.join(tree)
|
|
280
|
-
|
|
281
|
-
def __str__(self):
|
|
282
|
-
"""Print the node's tree."""
|
|
283
|
-
return self.tree_str()
|
|
284
|
-
|
|
285
|
-
|
|
286
|
-
class Node(getbytag.GetByTag, NiceLayout):
|
|
287
|
-
"""Base class type for any element in the logical layout.
|
|
288
|
-
|
|
289
|
-
:param str tag: The node's tag (must be unique !)
|
|
290
|
-
:param Ticket ticket: The session's ticket that will be used
|
|
291
|
-
:param str config_tag: The configuration's file section name that will be used
|
|
292
|
-
to setup this node (default: ``self.tag``)
|
|
293
|
-
:param active_callback: Some function or lambda that will be called with
|
|
294
|
-
``self`` as first argument in order to determine if
|
|
295
|
-
the current not should be used (default: ``None``.
|
|
296
|
-
i.e. The node is active).
|
|
297
|
-
:param str special_prefix: The prefix of any environment variable that should
|
|
298
|
-
be exported into ``self.conf``
|
|
299
|
-
:param str register_cycle_prefix: The callback function used to initialise
|
|
300
|
-
Genv's cycles
|
|
301
|
-
:param JobAssistant jobassistant: the jobassistant object that might
|
|
302
|
-
be used to find out the **special_prefix**
|
|
303
|
-
and **register_cycle_prefix** callback.
|
|
304
|
-
:param str on_error: How to react when a failure occurs (default is "fail",
|
|
305
|
-
alternatives are "delayed_fail" and "continue")
|
|
306
|
-
:param dict kw: Any other attributes that will be added to ``self.options``
|
|
307
|
-
(that will eventually be added to ``self.conf``)
|
|
308
|
-
"""
|
|
309
|
-
|
|
310
|
-
def __init__(self, kw):
|
|
311
|
-
logger.debug('Node initialisation %s', repr(self))
|
|
312
|
-
self.options = dict()
|
|
313
|
-
self.play = kw.pop('play', False)
|
|
314
|
-
self._ticket = kw.pop('ticket', None)
|
|
315
|
-
if self._ticket is None:
|
|
316
|
-
raise ValueError("The session's ticket must be provided (using a `ticket` argument)")
|
|
317
|
-
self._configtag = kw.pop('config_tag', self.tag)
|
|
318
|
-
self._active_cb = kw.pop('active_callback', None)
|
|
319
|
-
if self._active_cb is not None and not callable(self._active_cb):
|
|
320
|
-
raise ValueError("If provided, active_callback must be a callable")
|
|
321
|
-
self._locprefix = kw.pop('special_prefix', 'OP_').upper()
|
|
322
|
-
self._subjobok = kw.pop('subjob_allowed', True)
|
|
323
|
-
self._subjobtag = kw.pop('subjob_tag', None)
|
|
324
|
-
self._cycle_cb = kw.pop('register_cycle_prefix', None)
|
|
325
|
-
j_assist = kw.pop('jobassistant', None)
|
|
326
|
-
if j_assist is not None:
|
|
327
|
-
self._locprefix = j_assist.special_prefix.upper()
|
|
328
|
-
self._cycle_cb = j_assist.register_cycle
|
|
329
|
-
self._subjobok = j_assist.subjob_allowed
|
|
330
|
-
self._subjobtag = j_assist.subjob_tag
|
|
331
|
-
self._mstep_job_last = kw.pop('mstep_job_last', True)
|
|
332
|
-
self._dryrun = kw.pop('dryrun', False)
|
|
333
|
-
self._conf = None
|
|
334
|
-
self._parentconf = None
|
|
335
|
-
self._activenode = None
|
|
336
|
-
self._contents = list()
|
|
337
|
-
self._nicelayout_init(kw)
|
|
338
|
-
|
|
339
|
-
def _args_loopclone(self, tagsuffix, extras): # @UnusedVariable
|
|
340
|
-
"""All the necessary arguments to build a copy of this object."""
|
|
341
|
-
argsdict = dict(play=self.play,
|
|
342
|
-
ticket=self.ticket,
|
|
343
|
-
config_tag=self.config_tag,
|
|
344
|
-
active_callback=self._active_cb,
|
|
345
|
-
special_prefix=self._locprefix,
|
|
346
|
-
register_cycle_prefix=self._cycle_cb,
|
|
347
|
-
subjob_tag=self._subjobtag,
|
|
348
|
-
subjob_allowed=self._subjobok,
|
|
349
|
-
mstep_job_last=self._mstep_job_last,
|
|
350
|
-
dryrun=self._dryrun
|
|
351
|
-
)
|
|
352
|
-
argsdict.update(self.options)
|
|
353
|
-
return argsdict
|
|
354
|
-
|
|
355
|
-
def loopclone(self, tagsuffix, extras):
|
|
356
|
-
"""Create a copy of the present object by adding a suffix to the tag.
|
|
357
|
-
|
|
358
|
-
**extras** items can be added to the copy's options.
|
|
359
|
-
"""
|
|
360
|
-
kwargs = self._args_loopclone(tagsuffix, extras)
|
|
361
|
-
kwargs.update(**extras)
|
|
362
|
-
return self.__class__(tag=self.tag + tagsuffix, **kwargs)
|
|
363
|
-
|
|
364
|
-
@classmethod
|
|
365
|
-
def tag_clean(cls, tag):
|
|
366
|
-
"""Lower case, space-free and underscore-free tag."""
|
|
367
|
-
return tag.lower().replace(' ', '')
|
|
368
|
-
|
|
369
|
-
@property
|
|
370
|
-
def ticket(self):
|
|
371
|
-
return self._ticket
|
|
372
|
-
|
|
373
|
-
@property
|
|
374
|
-
def config_tag(self):
|
|
375
|
-
return self._configtag
|
|
376
|
-
|
|
377
|
-
@property
|
|
378
|
-
def conf(self):
|
|
379
|
-
return self._conf
|
|
380
|
-
|
|
381
|
-
@property
|
|
382
|
-
def confdiff(self):
|
|
383
|
-
cs = ConfigSet()
|
|
384
|
-
cs.update({k: v for k, v in self._conf.items()
|
|
385
|
-
if k not in self._parentconf or self._parentconf[k] != v})
|
|
386
|
-
return cs
|
|
387
|
-
|
|
388
|
-
@property
|
|
389
|
-
def activenode(self):
|
|
390
|
-
if self._activenode is None:
|
|
391
|
-
if self.conf is None:
|
|
392
|
-
raise RuntimeError('Setup the configuration object before calling activenode !')
|
|
393
|
-
self._activenode = self._active_cb is None or self._active_cb(self)
|
|
394
|
-
return self._activenode
|
|
395
|
-
|
|
396
|
-
@property
|
|
397
|
-
def sh(self):
|
|
398
|
-
return self.ticket.sh
|
|
399
|
-
|
|
400
|
-
@property
|
|
401
|
-
def env(self):
|
|
402
|
-
return self.ticket.env
|
|
403
|
-
|
|
404
|
-
@property
|
|
405
|
-
def contents(self):
|
|
406
|
-
return self._contents
|
|
407
|
-
|
|
408
|
-
def clear(self):
|
|
409
|
-
"""Clear actual contents."""
|
|
410
|
-
self._contents[:] = []
|
|
411
|
-
|
|
412
|
-
def __iter__(self):
|
|
413
|
-
yield from self.contents
|
|
414
|
-
|
|
415
|
-
@property
|
|
416
|
-
def fail_at_the_end(self):
|
|
417
|
-
"""Tells whether the Node should fail when it reaches the end of 'run'."""
|
|
418
|
-
return self.ticket.datastore.get('layout_fail_at_the_end', self._ds_extra,
|
|
419
|
-
default_payload=False, readonly=False)
|
|
420
|
-
|
|
421
|
-
@fail_at_the_end.setter
|
|
422
|
-
def fail_at_the_end(self, value):
|
|
423
|
-
"""Tells whether the Node should fail when it reaches the end of 'run'."""
|
|
424
|
-
self.ticket.datastore.insert('layout_fail_at_the_end', self._ds_extra,
|
|
425
|
-
bool(value), readonly=False)
|
|
426
|
-
|
|
427
|
-
def build_context(self):
|
|
428
|
-
"""Build the context and subcontexts of the current node."""
|
|
429
|
-
if self.activenode:
|
|
430
|
-
oldctx = self.ticket.context
|
|
431
|
-
ctx = self.ticket.context.newcontext(self.tag, focus=True)
|
|
432
|
-
if not self._dryrun:
|
|
433
|
-
ctx.cocoon()
|
|
434
|
-
self._setup_context(ctx)
|
|
435
|
-
oldctx.activate()
|
|
436
|
-
if self.status == NODE_STATUS.CREATED:
|
|
437
|
-
self.status = NODE_STATUS.READY
|
|
438
|
-
|
|
439
|
-
def _setup_context(self, ctx):
|
|
440
|
-
"""Setup the newly created context."""
|
|
441
|
-
pass
|
|
442
|
-
|
|
443
|
-
@contextlib.contextmanager
|
|
444
|
-
def isolate(self):
|
|
445
|
-
"""Deal with any events related to the actual run."""
|
|
446
|
-
if self.activenode:
|
|
447
|
-
with self._context_isolation():
|
|
448
|
-
if self._subjobtag == self.tag:
|
|
449
|
-
with subjob_handling(self, OBSERVER_TAG):
|
|
450
|
-
with self._status_isolation(extra_verbose=True):
|
|
451
|
-
yield
|
|
452
|
-
else:
|
|
453
|
-
with self._status_isolation():
|
|
454
|
-
yield
|
|
455
|
-
else:
|
|
456
|
-
yield
|
|
457
|
-
|
|
458
|
-
@contextlib.contextmanager
|
|
459
|
-
def _context_isolation(self):
|
|
460
|
-
"""Handle context switching properly."""
|
|
461
|
-
self._oldctx = self.ticket.context
|
|
462
|
-
ctx = self.ticket.context.switch(self.tag)
|
|
463
|
-
ctx.cocoon()
|
|
464
|
-
logger.debug('Node context directory <%s>', self.sh.getcwd())
|
|
465
|
-
try:
|
|
466
|
-
yield
|
|
467
|
-
finally:
|
|
468
|
-
ctx.free_resources()
|
|
469
|
-
logger.debug('Exit context directory <%s>', self.sh.getcwd())
|
|
470
|
-
self._oldctx.activate()
|
|
471
|
-
self.ticket.context.cocoon()
|
|
472
|
-
|
|
473
|
-
@contextlib.contextmanager
|
|
474
|
-
def _status_isolation(self, extra_verbose=False):
|
|
475
|
-
"""Handle the Node's status updates."""
|
|
476
|
-
if self.status in (NODE_STATUS.READY, NODE_STATUS.RUNNING):
|
|
477
|
-
self.status = NODE_STATUS.RUNNING
|
|
478
|
-
try:
|
|
479
|
-
yield
|
|
480
|
-
except Exception:
|
|
481
|
-
self.status = NODE_STATUS.FAILED
|
|
482
|
-
if extra_verbose or self.on_error != NODE_ON_ERROR.FAIL:
|
|
483
|
-
# Mask the exception
|
|
484
|
-
self.subtitle('An exception occurred (on_error={:s})'.format(self.on_error))
|
|
485
|
-
self._print_traceback()
|
|
486
|
-
if self.on_error == NODE_ON_ERROR.FAIL:
|
|
487
|
-
raise
|
|
488
|
-
else:
|
|
489
|
-
if self.status == NODE_STATUS.RUNNING and self._mstep_job_last:
|
|
490
|
-
self.status = NODE_STATUS.DONE
|
|
491
|
-
|
|
492
|
-
def setconf(self, conf_local, conf_global):
|
|
493
|
-
"""Build a new conf object for the actual node."""
|
|
494
|
-
|
|
495
|
-
# The parent conf is the default configuration
|
|
496
|
-
if isinstance(conf_local, ConfigSet):
|
|
497
|
-
self._conf = conf_local.copy()
|
|
498
|
-
else:
|
|
499
|
-
self._conf = ConfigSet()
|
|
500
|
-
self._conf.update(conf_local)
|
|
501
|
-
self._parentconf = self._conf.copy()
|
|
502
|
-
self._active = None
|
|
503
|
-
|
|
504
|
-
# This configuration is updated with any section with the current tag name
|
|
505
|
-
updconf = conf_global.get(self.config_tag, dict())
|
|
506
|
-
if self.mstep_counter <= 1:
|
|
507
|
-
self.nicedump(' '.join(('Configuration for', self.realkind, self.tag)), **updconf)
|
|
508
|
-
self.conf.update(updconf)
|
|
509
|
-
|
|
510
|
-
# Add exported local variables
|
|
511
|
-
self.local2conf()
|
|
512
|
-
|
|
513
|
-
# Add potential options
|
|
514
|
-
if self.options:
|
|
515
|
-
if self.mstep_counter <= 1:
|
|
516
|
-
self.nicedump('Update conf with last minute arguments',
|
|
517
|
-
titlecallback=self.highlight, **self.options)
|
|
518
|
-
self.conf.update(self.options)
|
|
519
|
-
|
|
520
|
-
if self.activenode:
|
|
521
|
-
# Then we broadcast the current configuration to the kids
|
|
522
|
-
for node in self.contents:
|
|
523
|
-
node.setconf(self.conf, conf_global)
|
|
524
|
-
else:
|
|
525
|
-
logger.info('Under present conditions/configuration, this node will not be activated.')
|
|
526
|
-
|
|
527
|
-
def localenv(self):
|
|
528
|
-
"""Dump the actual env variables."""
|
|
529
|
-
self.header('ENV catalog')
|
|
530
|
-
self.env.mydump()
|
|
531
|
-
|
|
532
|
-
def local2conf(self):
|
|
533
|
-
"""Set some parameters if defined in environment but not in actual conf."""
|
|
534
|
-
autoconf = dict()
|
|
535
|
-
localstrip = len(self._locprefix)
|
|
536
|
-
for localvar in sorted([x for x in self.env.keys() if x.startswith(self._locprefix)]):
|
|
537
|
-
if (localvar[localstrip:] not in self.conf or
|
|
538
|
-
(localvar[localstrip:] not in ('rundate', ) and
|
|
539
|
-
self.env[localvar] is not None and
|
|
540
|
-
self.env[localvar] != self.conf[localvar[localstrip:]])):
|
|
541
|
-
autoconf[localvar[localstrip:].lower()] = self.env[localvar]
|
|
542
|
-
if autoconf:
|
|
543
|
-
if self.mstep_counter <= 1:
|
|
544
|
-
self.nicedump('Populate conf with local variables',
|
|
545
|
-
titlecallback=self.highlight, **autoconf)
|
|
546
|
-
self.conf.update(autoconf)
|
|
547
|
-
|
|
548
|
-
def conf2io(self):
|
|
549
|
-
"""Abstract method."""
|
|
550
|
-
pass
|
|
551
|
-
|
|
552
|
-
def xp2conf(self):
|
|
553
|
-
"""Set the actual experiment value -- Could be the name of the op suite if any."""
|
|
554
|
-
if 'xpid' not in self.conf:
|
|
555
|
-
self.conf.xpid = self.conf.get('suite', self.env.VORTEX_XPID)
|
|
556
|
-
if self.conf.xpid is None:
|
|
557
|
-
raise ValueError('Could not set a proper experiment id.')
|
|
558
|
-
|
|
559
|
-
def register_cycle(self, cyclename):
|
|
560
|
-
"""Adds a new cycle to genv if a proper callback is defined."""
|
|
561
|
-
if self._cycle_cb is not None:
|
|
562
|
-
self._cycle_cb(cyclename)
|
|
563
|
-
else:
|
|
564
|
-
raise NotImplementedError()
|
|
565
|
-
|
|
566
|
-
def cycles(self):
|
|
567
|
-
"""Update and register some configuration cycles."""
|
|
568
|
-
|
|
569
|
-
other_cycles = [x for x in self.conf.keys() if x.endswith('_cycle')]
|
|
570
|
-
if 'cycle' in self.conf or other_cycles:
|
|
571
|
-
self.header("Registering cycles")
|
|
572
|
-
|
|
573
|
-
# At least, look for the main cycle
|
|
574
|
-
if 'cycle' in self.conf:
|
|
575
|
-
self.register_cycle(self.conf.cycle)
|
|
576
|
-
|
|
577
|
-
# Have a look to other cycles
|
|
578
|
-
for other in other_cycles:
|
|
579
|
-
self.register_cycle(self.conf.get(other))
|
|
580
|
-
|
|
581
|
-
def geometries(self):
|
|
582
|
-
"""Setup geometries according to actual tag."""
|
|
583
|
-
thisgeo = self.tag + '_geometry'
|
|
584
|
-
if thisgeo in self.conf:
|
|
585
|
-
self.conf.geometry = self.conf.get(thisgeo)
|
|
586
|
-
if 'geometry' not in self.conf:
|
|
587
|
-
logger.warning('No default geometry defined !')
|
|
588
|
-
|
|
589
|
-
def defaults(self, extras):
|
|
590
|
-
"""Set toolbox defaults, extended with actual arguments ``extras``."""
|
|
591
|
-
t = self.ticket
|
|
592
|
-
toolbox.defaults(
|
|
593
|
-
model=t.glove.vapp,
|
|
594
|
-
namespace=self.conf.get('namespace', Namespace('vortex.cache.fr')),
|
|
595
|
-
gnamespace=self.conf.get('gnamespace', Namespace('gco.multi.fr')),
|
|
596
|
-
)
|
|
597
|
-
|
|
598
|
-
if 'rundate' in self.conf:
|
|
599
|
-
toolbox.defaults['date'] = self.conf.rundate
|
|
600
|
-
|
|
601
|
-
for optk in ('cutoff', 'geometry', 'cycle', 'model'):
|
|
602
|
-
if optk in self.conf:
|
|
603
|
-
value = self.conf.get(optk)
|
|
604
|
-
if isinstance(value, dict):
|
|
605
|
-
value = FPDict(value)
|
|
606
|
-
toolbox.defaults[optk] = self.conf.get(optk)
|
|
607
|
-
|
|
608
|
-
toolbox.defaults(**extras)
|
|
609
|
-
self.header('Toolbox defaults')
|
|
610
|
-
toolbox.defaults.show()
|
|
611
|
-
|
|
612
|
-
def setup(self, **kw):
|
|
613
|
-
"""A methodic way to build the conf of the node."""
|
|
614
|
-
self.subtitle(self.realkind.upper() + ' setup')
|
|
615
|
-
self.localenv()
|
|
616
|
-
self.local2conf()
|
|
617
|
-
self.conf2io()
|
|
618
|
-
self.xp2conf()
|
|
619
|
-
if kw:
|
|
620
|
-
if self.mstep_counter <= 1:
|
|
621
|
-
self.nicedump('Update conf with last minute arguments', **kw)
|
|
622
|
-
self.conf.update(kw)
|
|
623
|
-
self.cycles()
|
|
624
|
-
self.geometries()
|
|
625
|
-
self.defaults(kw.get('defaults', dict()))
|
|
626
|
-
|
|
627
|
-
def summary(self):
|
|
628
|
-
"""Dump actual parameters of the configuration."""
|
|
629
|
-
if self.mstep_counter <= 1:
|
|
630
|
-
self.nicedump('Complete parameters', **self.conf)
|
|
631
|
-
else:
|
|
632
|
-
self.header('Complete parameters')
|
|
633
|
-
print("Silent Node' setup: please refer to the first job step for more details")
|
|
634
|
-
|
|
635
|
-
def complete(self):
|
|
636
|
-
"""Some cleaning and completion status."""
|
|
637
|
-
pass
|
|
638
|
-
|
|
639
|
-
def _actual_run(self, sjob_activated=True):
|
|
640
|
-
"""Abstract method: the actual job to do."""
|
|
641
|
-
pass
|
|
642
|
-
|
|
643
|
-
def run(self, sjob_activated=True):
|
|
644
|
-
"""Execution driver: setup, run, complete... (if needed)."""
|
|
645
|
-
if self._dryrun:
|
|
646
|
-
raise RuntimeError('This Node was initialised with "dryrun". ' +
|
|
647
|
-
'It is not allowed to call run().')
|
|
648
|
-
if self.activenode:
|
|
649
|
-
try:
|
|
650
|
-
self._actual_run(sjob_activated)
|
|
651
|
-
except Exception:
|
|
652
|
-
self.fail_at_the_end = False
|
|
653
|
-
raise
|
|
654
|
-
else:
|
|
655
|
-
if self.fail_at_the_end:
|
|
656
|
-
raise RequestedFailureError(
|
|
657
|
-
'An error occurred in {:s}. '.format(self.tag) +
|
|
658
|
-
'Please dive into the present log to understand why.'
|
|
659
|
-
)
|
|
660
|
-
|
|
661
|
-
def filter_execution_error(self, exc): # @UnusedVariable
|
|
662
|
-
"""
|
|
663
|
-
May be overwritten if exceptions generated by the AlgoComponent needs
|
|
664
|
-
to be filtered.
|
|
665
|
-
|
|
666
|
-
:param Exception exc: The exception that triggered the call
|
|
667
|
-
|
|
668
|
-
:return: Two elements. The first item (boolean) tells whether or not
|
|
669
|
-
a delayed exception error is to be masked. The second item is a
|
|
670
|
-
(possibly empty) dictionary that gives some extra information
|
|
671
|
-
about the warning/error (such information could be used to
|
|
672
|
-
generate a meaningful alert email).
|
|
673
|
-
|
|
674
|
-
:note: Do not re-raised the **exc** exception in this method.
|
|
675
|
-
"""
|
|
676
|
-
return False, dict()
|
|
677
|
-
|
|
678
|
-
def report_execution_warning(self, exc, **kw_infos): # @UnusedVariable
|
|
679
|
-
"""
|
|
680
|
-
May be overwritten if a report needs to be sent when a filtered
|
|
681
|
-
execution error occurs.
|
|
682
|
-
|
|
683
|
-
:param Exception exc: The exception that triggered the call
|
|
684
|
-
:param dict kw_infos: Any kind of extra informations provided by the
|
|
685
|
-
:meth:`filter_execution_error`.
|
|
686
|
-
|
|
687
|
-
:note: Do not re-raised the **exc** exception in this method.
|
|
688
|
-
"""
|
|
689
|
-
pass
|
|
690
|
-
|
|
691
|
-
def report_execution_error(self, exc, **kw_infos): # @UnusedVariable
|
|
692
|
-
"""
|
|
693
|
-
May be overwritten if a report needs to be sent when an un-filtered
|
|
694
|
-
execution error occurs.
|
|
695
|
-
|
|
696
|
-
:param Exception exc: The exception that triggered the call
|
|
697
|
-
:param dict kw_infos: Any kind of extra informations provided by the
|
|
698
|
-
:meth:`filter_execution_error`.
|
|
699
|
-
|
|
700
|
-
:note: Do not re-raised the **exc** exception in this method.
|
|
701
|
-
"""
|
|
702
|
-
pass
|
|
703
|
-
|
|
704
|
-
def delay_execution_error(self, exc, **kw_infos): # @UnusedVariable
|
|
705
|
-
"""
|
|
706
|
-
Tells whether the execution error needs to be ignored temporarily
|
|
707
|
-
(an exception will still be raised when the Node exits).
|
|
708
|
-
|
|
709
|
-
:param Exception exc: The exception that triggered the call
|
|
710
|
-
:param dict kw_infos: Any kind of extra informations provided by the
|
|
711
|
-
:meth:`filter_execution_error`.
|
|
712
|
-
|
|
713
|
-
:note: Do not re-raised the **exc** exception in this method.
|
|
714
|
-
"""
|
|
715
|
-
return self.conf.get('delay_component_errors', False)
|
|
716
|
-
|
|
717
|
-
def component_runner(self, tbalgo, tbx=(None,), **kwargs):
|
|
718
|
-
"""Run the binaries listed in tbx using the tbalgo algo component.
|
|
719
|
-
|
|
720
|
-
This is a helper method that maybe useful (its use is not mandatory).
|
|
721
|
-
"""
|
|
722
|
-
# it may be necessary to setup a default value for OpenMP...
|
|
723
|
-
env_update = dict()
|
|
724
|
-
if 'openmp' not in self.conf or not isinstance(self.conf.openmp, (list, tuple)):
|
|
725
|
-
env_update['OMP_NUM_THREADS'] = int(self.conf.get('openmp', 1))
|
|
726
|
-
|
|
727
|
-
# If some mpiopts are in the config file, use them...
|
|
728
|
-
mpiopts = kwargs.pop('mpiopts', dict())
|
|
729
|
-
mpiopts_map = dict(nnodes='nn', ntasks='nnp', nprocs='np', proc='np')
|
|
730
|
-
for stuff in [s
|
|
731
|
-
for s in ('proc', 'nprocs', 'nnodes', 'ntasks', 'openmp',
|
|
732
|
-
'prefixcommand', 'envelope')
|
|
733
|
-
if s in mpiopts or s in self.conf]:
|
|
734
|
-
mpiopts[mpiopts_map.get(stuff, stuff)] = mpiopts.pop(stuff, self.conf[stuff])
|
|
735
|
-
|
|
736
|
-
# if the prefix command is missing in the configuration file, look in the input sequence
|
|
737
|
-
if 'prefixcommand' not in mpiopts:
|
|
738
|
-
prefixes = self.ticket.context.sequence.effective_inputs(role=re.compile('Prefixcommand'))
|
|
739
|
-
if len(prefixes) > 1:
|
|
740
|
-
raise RuntimeError("Only one prefix command can be used...")
|
|
741
|
-
for sec in prefixes:
|
|
742
|
-
prefixpath = sec.rh.container.actualpath()
|
|
743
|
-
logger.info('The following MPI prefix command will be used: %s', prefixpath)
|
|
744
|
-
mpiopts['prefixcommand'] = prefixpath
|
|
745
|
-
|
|
746
|
-
# Ensure that some of the mpiopts are integers
|
|
747
|
-
for stuff in [s for s in ('nn', 'nnp', 'openmp', 'np') if s in mpiopts]:
|
|
748
|
-
if isinstance(mpiopts[stuff], (list, tuple)):
|
|
749
|
-
mpiopts[stuff] = [int(v) for v in mpiopts[stuff]]
|
|
750
|
-
else:
|
|
751
|
-
mpiopts[stuff] = int(mpiopts[stuff])
|
|
752
|
-
|
|
753
|
-
# Read the configuration file for some extra configuration
|
|
754
|
-
allowed_conf_extras = ('launcher', 'opts', 'wrapstd', 'bind_topology')
|
|
755
|
-
for k, v in self.conf.items():
|
|
756
|
-
if (k not in kwargs and '_mpi' in k and
|
|
757
|
-
any([k.endswith('_mpi' + a) for a in allowed_conf_extras])):
|
|
758
|
-
kwargs[k] = v
|
|
759
|
-
|
|
760
|
-
# When multiple list of binaries are given (i.e several binaries are launched
|
|
761
|
-
# by the same MPI command).
|
|
762
|
-
if tbx and isinstance(tbx[0], (list, tuple)):
|
|
763
|
-
tbx = zip(*tbx)
|
|
764
|
-
with self.env.delta_context(**env_update):
|
|
765
|
-
with self.sh.default_target.algo_run_context(self.ticket, self.conf):
|
|
766
|
-
for binary in tbx:
|
|
767
|
-
try:
|
|
768
|
-
tbalgo.run(binary, mpiopts=mpiopts, **kwargs)
|
|
769
|
-
except (Exception, SignalInterruptError, KeyboardInterrupt) as e:
|
|
770
|
-
mask_delayed, f_infos = self.filter_execution_error(e)
|
|
771
|
-
if isinstance(e, Exception) and mask_delayed:
|
|
772
|
-
logger.warning("The delayed exception is masked:\n%s", str(f_infos))
|
|
773
|
-
self.report_execution_warning(e, **f_infos)
|
|
774
|
-
else:
|
|
775
|
-
logger.error("Un-filtered execution error:\n%s", str(f_infos))
|
|
776
|
-
self.report_execution_error(e, **f_infos)
|
|
777
|
-
if isinstance(e, Exception) and self.delay_execution_error(e, **f_infos):
|
|
778
|
-
self.subtitle(
|
|
779
|
-
'An exception occurred but the crash is delayed until the end of the Node'
|
|
780
|
-
)
|
|
781
|
-
self._print_traceback()
|
|
782
|
-
# Actually delay the crash
|
|
783
|
-
self.fail_at_the_end = True
|
|
784
|
-
print()
|
|
785
|
-
else:
|
|
786
|
-
raise
|
|
787
|
-
|
|
788
|
-
|
|
789
|
-
class Family(Node):
|
|
790
|
-
"""Logical group of :class:`Family` or :class:`Task`.
|
|
791
|
-
|
|
792
|
-
Compared to the usual :class:`Node` class, additional attributes are:
|
|
793
|
-
|
|
794
|
-
:param nodes: The list of :class:`Family` or :class:`Task` objects that
|
|
795
|
-
are members of this family
|
|
796
|
-
"""
|
|
797
|
-
|
|
798
|
-
def __init__(self, **kw):
|
|
799
|
-
logger.debug('Family init %s', repr(self))
|
|
800
|
-
super().__init__(kw)
|
|
801
|
-
nodes = kw.pop('nodes', list())
|
|
802
|
-
self.options = kw.copy()
|
|
803
|
-
|
|
804
|
-
# Build the nodes sequence
|
|
805
|
-
fcount = 0
|
|
806
|
-
for x in nodes:
|
|
807
|
-
if isinstance(x, Node):
|
|
808
|
-
self._contents.append(x)
|
|
809
|
-
else:
|
|
810
|
-
fcount += 1
|
|
811
|
-
self._contents.append(
|
|
812
|
-
Family(
|
|
813
|
-
tag='{:s}.f{:02d}'.format(self.tag, fcount),
|
|
814
|
-
ticket=self.ticket,
|
|
815
|
-
nodes=x,
|
|
816
|
-
**kw
|
|
817
|
-
)
|
|
818
|
-
)
|
|
819
|
-
|
|
820
|
-
@property
|
|
821
|
-
def realkind(self):
|
|
822
|
-
return 'family'
|
|
823
|
-
|
|
824
|
-
def _args_loopclone(self, tagsuffix, extras): # @UnusedVariable
|
|
825
|
-
baseargs = super()._args_loopclone(tagsuffix, extras)
|
|
826
|
-
baseargs['nodes'] = [node.loopclone(tagsuffix, extras) for node in self._contents]
|
|
827
|
-
return baseargs
|
|
828
|
-
|
|
829
|
-
def _setup_context(self, ctx):
|
|
830
|
-
"""Build the contexts of all the nodes contained by this family."""
|
|
831
|
-
for node in self.contents:
|
|
832
|
-
node.build_context()
|
|
833
|
-
|
|
834
|
-
def localenv(self):
|
|
835
|
-
"""No env dump in families (it is enough to dump it in Tasks)."""
|
|
836
|
-
pass
|
|
837
|
-
|
|
838
|
-
def summary(self):
|
|
839
|
-
"""No parameters dump in families (it is enough to dump it in Tasks)."""
|
|
840
|
-
pass
|
|
841
|
-
|
|
842
|
-
@property
|
|
843
|
-
def _parallel_launchtool(self):
|
|
844
|
-
"""Create a launchtool for parallel runs (if sensible only)."""
|
|
845
|
-
if self._subjobok and self._subjobtag is None and 'paralleljobs_kind' in self.conf:
|
|
846
|
-
# Subjob are allowed and I'am the main job (because self._subjobtag is None) :
|
|
847
|
-
# => Run the family's content using subjobs
|
|
848
|
-
|
|
849
|
-
# Create the subjob launcher
|
|
850
|
-
launcher_opts = {k[len('paralleljobs_'):]: self.conf[k]
|
|
851
|
-
for k in self.conf if k.startswith('paralleljobs_')}
|
|
852
|
-
launchtool = fpx.subjobslauncher(scriptpath=sys.argv[0],
|
|
853
|
-
nodes_obsboard_tag=OBSERVER_TAG,
|
|
854
|
-
** launcher_opts)
|
|
855
|
-
if launchtool is None:
|
|
856
|
-
raise RuntimeError('No subjob launcher could be found: check "paralleljobs_kind".')
|
|
857
|
-
launchtool.ticket = self.ticket
|
|
858
|
-
return launchtool
|
|
859
|
-
else:
|
|
860
|
-
return None
|
|
861
|
-
|
|
862
|
-
def _actual_run(self, sjob_activated=True):
|
|
863
|
-
"""Execution driver: setup, run kids, complete."""
|
|
864
|
-
launchtool = self._parallel_launchtool
|
|
865
|
-
if launchtool:
|
|
866
|
-
self.ticket.sh.title(' '.join(('Build', self.realkind, self.tag, '(using subjobs)')))
|
|
867
|
-
|
|
868
|
-
def node_recurse(some_node):
|
|
869
|
-
"""Recursively find tags."""
|
|
870
|
-
o_set = {some_node.tag}
|
|
871
|
-
for snode in some_node.contents:
|
|
872
|
-
o_set = o_set | node_recurse(snode)
|
|
873
|
-
return o_set
|
|
874
|
-
|
|
875
|
-
# Launch each family's member
|
|
876
|
-
for node in self.contents:
|
|
877
|
-
launchtool(node.tag, node_recurse(node))
|
|
878
|
-
# Wait for everybody to complete
|
|
879
|
-
done, ko = launchtool.waitall()
|
|
880
|
-
if ko:
|
|
881
|
-
raise SubJobLauncherError("Execution failed for some subjobs: {:s}"
|
|
882
|
-
.format(','.join(ko)))
|
|
883
|
-
else:
|
|
884
|
-
# No subjobs configured or allowed: run the usual way...
|
|
885
|
-
sjob_activated = sjob_activated or self._subjobtag == self.tag
|
|
886
|
-
try:
|
|
887
|
-
self.ticket.sh.title(' '.join(('Build', self.realkind, self.tag)))
|
|
888
|
-
self.setup()
|
|
889
|
-
self.summary()
|
|
890
|
-
for node in self.contents:
|
|
891
|
-
with node.isolate():
|
|
892
|
-
node.run(sjob_activated=sjob_activated)
|
|
893
|
-
finally:
|
|
894
|
-
self.complete()
|
|
895
|
-
|
|
896
|
-
|
|
897
|
-
class LoopFamily(Family):
|
|
898
|
-
"""
|
|
899
|
-
Loop on the Family's content according to a variable taken from ``self.conf``.
|
|
900
|
-
|
|
901
|
-
Compared to the usual :class:`Family` class, additional attributes are:
|
|
902
|
-
|
|
903
|
-
:param str loopconf: The name of the ``self.conf`` entry to loop on
|
|
904
|
-
:param str loopvariable: The name of the loop control variable (that is
|
|
905
|
-
automatically added to the child's self.conf).
|
|
906
|
-
By default, **loopconf** without trailing ``s`` is
|
|
907
|
-
used.
|
|
908
|
-
:param str loopsuffix: The suffix that will be added to the child's tag.
|
|
909
|
-
By default '+loopvariable{!s}' (where {!s} will be
|
|
910
|
-
replaced by the loop control variable's value).
|
|
911
|
-
:param bool loopneedprev: Ensure that the previous value is available
|
|
912
|
-
:param bool loopneednext: Ensure that the next value is available
|
|
913
|
-
"""
|
|
914
|
-
|
|
915
|
-
def __init__(self, **kw):
|
|
916
|
-
logger.debug('LoopFamily init %s', repr(self))
|
|
917
|
-
# On what should we iterate ?
|
|
918
|
-
self._loopconf = kw.pop('loopconf', None)
|
|
919
|
-
if not self._loopconf:
|
|
920
|
-
raise ValueError('The "loopconf" named argument must be given')
|
|
921
|
-
else:
|
|
922
|
-
self._loopconf = self._loopconf.split(',')
|
|
923
|
-
# Find the loop's variable names
|
|
924
|
-
self._loopvariable = kw.pop('loopvariable', None)
|
|
925
|
-
if self._loopvariable is None:
|
|
926
|
-
self._loopvariable = [s.rstrip('s') for s in self._loopconf]
|
|
927
|
-
else:
|
|
928
|
-
self._loopvariable = self._loopvariable.split(',')
|
|
929
|
-
if len(self._loopvariable) != len(self._loopconf):
|
|
930
|
-
raise ValueError('Inconsistent size between loopconf and loopvariable')
|
|
931
|
-
# Find the loop suffixes
|
|
932
|
-
self._loopsuffix = kw.pop('loopsuffix', None)
|
|
933
|
-
if self._loopsuffix is None:
|
|
934
|
-
self._loopsuffix = '+' + self._loopvariable[0] + '{0!s}'
|
|
935
|
-
# Prev/Next
|
|
936
|
-
self._loopneedprev = kw.pop('loopneedprev', False)
|
|
937
|
-
self._loopneednext = kw.pop('loopneednext', False)
|
|
938
|
-
# Generic init...
|
|
939
|
-
super().__init__(**kw)
|
|
940
|
-
# Initialisation stuff
|
|
941
|
-
self._actual_content = None
|
|
942
|
-
|
|
943
|
-
def _args_loopclone(self, tagsuffix, extras): # @UnusedVariable
|
|
944
|
-
baseargs = super()._args_loopclone(tagsuffix, extras)
|
|
945
|
-
baseargs['loopconf'] = ','.join(self._loopconf)
|
|
946
|
-
baseargs['loopvariable'] = ','.join(self._loopvariable)
|
|
947
|
-
baseargs['loopsuffix'] = self._loopsuffix
|
|
948
|
-
baseargs['loopneedprev'] = self._loopneedprev
|
|
949
|
-
baseargs['loopneednext'] = self._loopneednext
|
|
950
|
-
return baseargs
|
|
951
|
-
|
|
952
|
-
@property
|
|
953
|
-
def contents(self):
|
|
954
|
-
if self._actual_content is None:
|
|
955
|
-
self._actual_content = list()
|
|
956
|
-
for pvars, cvars, nvars in izip_pcn(*[self.conf.get(lc) for lc in self._loopconf]):
|
|
957
|
-
if self._loopneedprev and all([v is None for v in pvars]):
|
|
958
|
-
continue
|
|
959
|
-
if self._loopneednext and all([v is None for v in nvars]):
|
|
960
|
-
continue
|
|
961
|
-
extras = {v: x for v, x in zip(self._loopvariable, cvars)}
|
|
962
|
-
extras.update({v + '_prev': x for v, x in zip(self._loopvariable, pvars)})
|
|
963
|
-
extras.update({v + '_next': x for v, x in zip(self._loopvariable, nvars)})
|
|
964
|
-
suffix = self._loopsuffix.format(*cvars)
|
|
965
|
-
for node in self._contents:
|
|
966
|
-
self._actual_content.append(node.loopclone(suffix, extras))
|
|
967
|
-
return self._actual_content
|
|
968
|
-
|
|
969
|
-
|
|
970
|
-
class WorkshareFamily(Family):
|
|
971
|
-
"""
|
|
972
|
-
Loop on the Family's content according to a list taken from ``self.conf``.
|
|
973
|
-
|
|
974
|
-
The list taken from ``self.conf`` is sliced, and each iteration of the
|
|
975
|
-
loop works on its slice of the list. That's why it's called a workshare...
|
|
976
|
-
|
|
977
|
-
Compared to the usual :class:`Family` class, additional attributes are:
|
|
978
|
-
|
|
979
|
-
:param str workshareconf: The name of the ``self.conf`` entry to slice
|
|
980
|
-
:param str worksharename: The name of the slice control variable (that is
|
|
981
|
-
automatically added to the childs' ``self.conf``).
|
|
982
|
-
:param int worksharesize: The minimum number of items in each workshare (default=1)
|
|
983
|
-
:param worksharesize: The maximum number of workshares (it might
|
|
984
|
-
be an integer or a name referring to an entry
|
|
985
|
-
``in self.conf`` (default: None. e.g. no limit)
|
|
986
|
-
"""
|
|
987
|
-
|
|
988
|
-
def __init__(self, **kw):
|
|
989
|
-
logger.debug('WorkshareFamily init %s', repr(self))
|
|
990
|
-
# On what should we build the workshare ?
|
|
991
|
-
self._workshareconf = kw.pop('workshareconf', None)
|
|
992
|
-
if not self._workshareconf:
|
|
993
|
-
raise ValueError('The "workshareconf" named argument must be given')
|
|
994
|
-
else:
|
|
995
|
-
self._workshareconf = self._workshareconf.split(',')
|
|
996
|
-
# Find the loop's variable names
|
|
997
|
-
self._worksharename = kw.pop('worksharename', None)
|
|
998
|
-
if not self._worksharename:
|
|
999
|
-
raise ValueError('The "worksharename" named argument must be given')
|
|
1000
|
-
else:
|
|
1001
|
-
self._worksharename = self._worksharename.split(',')
|
|
1002
|
-
if len(self._worksharename) != len(self._workshareconf):
|
|
1003
|
-
raise ValueError('Inconsistent size between workshareconf and worksharename')
|
|
1004
|
-
# Minimum size for a workshare
|
|
1005
|
-
self._worksharesize = int(kw.pop('worksharesize', 1))
|
|
1006
|
-
# Maximum number of workshares
|
|
1007
|
-
self._worksharelimit = kw.pop('worksharelimit', None)
|
|
1008
|
-
# Generic init
|
|
1009
|
-
super().__init__(**kw)
|
|
1010
|
-
# Initialisation stuff
|
|
1011
|
-
self._actual_content = None
|
|
1012
|
-
|
|
1013
|
-
def _args_loopclone(self, tagsuffix, extras): # @UnusedVariable
|
|
1014
|
-
baseargs = super()._args_loopclone(tagsuffix, extras)
|
|
1015
|
-
baseargs['workshareconf'] = ','.join(self._workshareconf)
|
|
1016
|
-
baseargs['worksharename'] = ','.join(self._worksharename)
|
|
1017
|
-
baseargs['worksharesize'] = self._worksharesize
|
|
1018
|
-
baseargs['worksharelimit'] = self._worksharelimit
|
|
1019
|
-
return baseargs
|
|
1020
|
-
|
|
1021
|
-
@property
|
|
1022
|
-
def contents(self):
|
|
1023
|
-
if self._actual_content is None:
|
|
1024
|
-
# Find the population sizes and workshares size/number
|
|
1025
|
-
populations = [self.conf.get(lc) for lc in self._workshareconf]
|
|
1026
|
-
n_population = {len(p) for p in populations}
|
|
1027
|
-
if not (len(n_population) == 1):
|
|
1028
|
-
raise RuntimeError('Inconsistent sizes in "workshareconf" lists')
|
|
1029
|
-
n_population = n_population.pop()
|
|
1030
|
-
# Number of workshares if worksharesize alone is considered
|
|
1031
|
-
sb_ws_number = n_population // self._worksharesize
|
|
1032
|
-
# Workshare limit
|
|
1033
|
-
if isinstance(self._worksharelimit, str):
|
|
1034
|
-
lb_ws_number = int(self.conf.get(self._worksharelimit))
|
|
1035
|
-
else:
|
|
1036
|
-
lb_ws_number = self._worksharelimit or sb_ws_number
|
|
1037
|
-
# Final result
|
|
1038
|
-
ws_number = max(min([sb_ws_number, lb_ws_number]), 1)
|
|
1039
|
-
# Find out the workshares sizes
|
|
1040
|
-
floorsize = n_population // ws_number
|
|
1041
|
-
ws_sizes = [floorsize, ] * ws_number
|
|
1042
|
-
for i in range(n_population - ws_number * floorsize):
|
|
1043
|
-
ws_sizes[i % ws_number] += 1
|
|
1044
|
-
# Build de family's content
|
|
1045
|
-
self._actual_content = list()
|
|
1046
|
-
ws_start = 0
|
|
1047
|
-
for i, ws_size in enumerate(ws_sizes):
|
|
1048
|
-
ws_slice = slice(ws_start, ws_start + ws_size)
|
|
1049
|
-
extras = {v: x[ws_slice] for v, x in zip(self._worksharename, populations)}
|
|
1050
|
-
ws_start += ws_size
|
|
1051
|
-
ws_suffix = '_ws{:03d}'.format(i + 1)
|
|
1052
|
-
for node in self._contents:
|
|
1053
|
-
self._actual_content.append(node.loopclone(ws_suffix, extras))
|
|
1054
|
-
return self._actual_content
|
|
1055
|
-
|
|
1056
|
-
|
|
1057
|
-
class Task(Node):
|
|
1058
|
-
"""Terminal node including a :class:`Sequence`."""
|
|
1059
|
-
|
|
1060
|
-
def __init__(self, **kw):
|
|
1061
|
-
logger.debug('Task init %s', repr(self))
|
|
1062
|
-
super().__init__(kw)
|
|
1063
|
-
self.steps = kw.pop('steps', tuple())
|
|
1064
|
-
self.fetch = kw.pop('fetch', 'fetch')
|
|
1065
|
-
self.compute = kw.pop('compute', 'compute')
|
|
1066
|
-
self.backup = kw.pop('backup', 'backup')
|
|
1067
|
-
self.options = kw.copy()
|
|
1068
|
-
if isinstance(self.steps, str):
|
|
1069
|
-
self.steps = tuple(self.steps.replace(' ', '').split(','))
|
|
1070
|
-
|
|
1071
|
-
@property
|
|
1072
|
-
def realkind(self):
|
|
1073
|
-
return 'task'
|
|
1074
|
-
|
|
1075
|
-
def _args_loopclone(self, tagsuffix, extras): # @UnusedVariable
|
|
1076
|
-
baseargs = super()._args_loopclone(tagsuffix, extras)
|
|
1077
|
-
baseargs['steps'] = self.steps
|
|
1078
|
-
baseargs['fetch'] = self.fetch
|
|
1079
|
-
baseargs['compute'] = self.compute
|
|
1080
|
-
baseargs['backup'] = self.backup
|
|
1081
|
-
return baseargs
|
|
1082
|
-
|
|
1083
|
-
@property
|
|
1084
|
-
def ctx(self):
|
|
1085
|
-
return self.ticket.context
|
|
1086
|
-
|
|
1087
|
-
def build(self):
|
|
1088
|
-
"""Switch to rundir and check the active steps."""
|
|
1089
|
-
|
|
1090
|
-
t = self.ticket
|
|
1091
|
-
t.sh.title(' '.join(('Build', self.realkind, self.tag)))
|
|
1092
|
-
|
|
1093
|
-
# Change actual rundir if specified
|
|
1094
|
-
rundir = self.options.get('rundir', None)
|
|
1095
|
-
if rundir:
|
|
1096
|
-
t.env.RUNDIR = rundir
|
|
1097
|
-
t.sh.cd(rundir, create=True)
|
|
1098
|
-
t.rundir = t.sh.getcwd()
|
|
1099
|
-
print('The current directory is: {}'.format(t.sh.getcwd()))
|
|
1100
|
-
|
|
1101
|
-
# Some attempt to find the current active steps
|
|
1102
|
-
if not self.steps:
|
|
1103
|
-
new_steps = []
|
|
1104
|
-
if (self.env.get(self._locprefix + 'WARMSTART')
|
|
1105
|
-
or self.conf.get('warmstart', False)):
|
|
1106
|
-
new_steps.append('warmstart')
|
|
1107
|
-
if (self.env.get(self._locprefix + 'REFILL')
|
|
1108
|
-
or self.conf.get('refill', False)):
|
|
1109
|
-
new_steps.append('refill')
|
|
1110
|
-
if new_steps:
|
|
1111
|
-
self.steps = tuple(new_steps)
|
|
1112
|
-
else:
|
|
1113
|
-
if self.play:
|
|
1114
|
-
self.steps = ('early-{:s}'.format(self.fetch), self.fetch,
|
|
1115
|
-
self.compute,
|
|
1116
|
-
self.backup, 'late-{:s}'.format(self.backup))
|
|
1117
|
-
else:
|
|
1118
|
-
self.steps = ('early-{:s}'.format(self.fetch), self.fetch)
|
|
1119
|
-
self.header('Active steps: ' + ' '.join(self.steps))
|
|
1120
|
-
|
|
1121
|
-
def conf2io(self):
|
|
1122
|
-
"""Broadcast IO SERVER configuration values to environment."""
|
|
1123
|
-
t = self.ticket
|
|
1124
|
-
triggered = any([i in self.conf
|
|
1125
|
-
for i in ('io_nodes', 'io_companions', 'io_incore_tasks',
|
|
1126
|
-
'io_openmp')])
|
|
1127
|
-
if 'io_nodes' in self.conf:
|
|
1128
|
-
t.env.default(VORTEX_IOSERVER_NODES=self.conf.io_nodes)
|
|
1129
|
-
if 'io_tasks' in self.conf:
|
|
1130
|
-
t.env.default(VORTEX_IOSERVER_TASKS=self.conf.io_tasks)
|
|
1131
|
-
elif 'io_companions' in self.conf:
|
|
1132
|
-
t.env.default(VORTEX_IOSERVER_COMPANION_TASKS=self.conf.io_companions)
|
|
1133
|
-
elif 'io_incore_tasks' in self.conf:
|
|
1134
|
-
t.env.default(VORTEX_IOSERVER_INCORE_TASKS=self.conf.io_incore_tasks)
|
|
1135
|
-
if 'io_incore_fixer' in self.conf:
|
|
1136
|
-
t.env.default(VORTEX_IOSERVER_INCORE_FIXER=self.conf.io_incore_fixer)
|
|
1137
|
-
if 'io_incore_dist' in self.conf:
|
|
1138
|
-
t.env.default(VORTEX_IOSERVER_INCORE_DIST=self.conf.io_incore_dist)
|
|
1139
|
-
if 'io_openmp' in self.conf:
|
|
1140
|
-
t.env.default(VORTEX_IOSERVER_OPENMP=self.conf.io_openmp)
|
|
1141
|
-
if triggered and self.mstep_counter <= 1:
|
|
1142
|
-
self.nicedump('IOSERVER Environment', **{k: v for k, v in t.env.items()
|
|
1143
|
-
if k.startswith('VORTEX_IOSERVER_')})
|
|
1144
|
-
|
|
1145
|
-
-    def io_poll(self, prefix=None):
-        """Complete the polling of data produced by the execution step."""
-        sh = self.sh
-        if prefix and sh.path.exists('io_poll.todo'):
-            for iopr in prefix:
-                sh.header('IO poll <' + iopr + '>')
-                rc = sh.io_poll(iopr)
-                print(rc)
-                print(rc.result)
-            sh.header('Post-IO Poll directory listing')
-            sh.ll(output=False, fatal=False)
-
-    def warmstart(self, **kw):
-        """Populates the vortex cache with expected input flow data.
-
-        This is useful when someone wants to restart an experiment from
-        another one.
-
-        The warmstart method is systematically called when a task is run. However,
-        the warmstart is not always desirable, hence the if statement that checks the
-        self.steps attribute's content.
-        """
-        # This method acts as an example: if a warmstart is actually needed,
-        # it should be overwritten.
-        if 'warmstart' in self.steps:
-            pass
-
-    def refill(self, **kw):
-        """Populates the vortex cache with external input data.
-
-        The refill method is systematically called when a task is run. However,
-        the refill is not always desirable, hence the if statement that checks the
-        self.steps attribute's content.
-        """
-        # This method acts as an example: if a refill is actually needed,
-        # it should be overwritten.
-        if 'refill' in self.steps:
-            pass
-
-    def process(self):
-        """Abstract method: perform the task to do."""
-        # This method acts as an example: it should be overwritten.
-
-        if 'early-fetch' in self.steps or 'fetch' in self.steps:
-            # In a multi-step job (MTOOL, ...), this step will be run on a
-            # transfer node. Consequently, data that may be missing from the
-            # local cache must be fetched here (e.g. GCO's genv, data from the
-            # mass archive system, ...). Note: most of the data should be
-            # retrieved here since using a transfer node is costless.
-            pass
-
-        if 'fetch' in self.steps:
-            # In a multi-step job (MTOOL, ...), this step will be run, on a
-            # compute node, just before the beginning of computations. It is the
-            # appropriate place to fetch data produced by a previous task (the
-            # so-called previous task will have to use the 'backup' step, see
-            # below, in order to make such data available in the local cache).
-            pass
-
-        if 'compute' in self.steps:
-            # The actual computations... (usually a call to the run method of an
-            # AlgoComponent)
-            pass
-
-        if 'backup' in self.steps or 'late-backup' in self.steps:
-            # In a multi-step job (MTOOL, ...), this step will be run, on a
-            # compute node, just after the computations. It is the appropriate
-            # place to put data in the local cache in order to make it available
-            # to a subsequent step.
-            pass
-
-        if 'late-backup' in self.steps:
-            # In a multi-step job (MTOOL, ...), this step will be run on a
-            # transfer node. Consequently, most of the data should be archived
-            # here.
-            pass
-
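The five guards in process() are the skeleton that every concrete task fills in. A minimal sketch of such an override is given below; the base-class import path, the toolbox helpers and every resource description are assumptions made for illustration and are not taken from this diff:

    # Hypothetical task sketch: import paths, toolbox usage and the *_desc
    # configuration entries are assumed, not read from the package.
    from vortex import toolbox
    from vortex.layout.nodes import Task  # assumed location of the Task node class

    class DemoTask(Task):

        def process(self):
            if 'early-fetch' in self.steps or 'fetch' in self.steps:
                # Cheap on a transfer node: fetch archived or remote inputs.
                toolbox.input(**self.conf.input_desc)            # placeholder description

            if 'compute' in self.steps:
                # On the compute node: run the algo component on the executable.
                tbx = toolbox.executable(**self.conf.exec_desc)  # placeholder description
                tbalgo = toolbox.algo(**self.conf.algo_desc)     # placeholder description
                tbalgo.run(tbx[0])

            if 'backup' in self.steps or 'late-backup' in self.steps:
                # Make the outputs available to later steps, then archive them.
                toolbox.output(**self.conf.output_desc)          # placeholder description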
-    def _actual_run(self, sjob_activated=True):
-        """Execution driver: build, setup, refill, process, complete."""
-        sjob_activated = sjob_activated or self._subjobtag == self.tag
-        if sjob_activated:
-            if (self.status == NODE_STATUS.RUNNING or
-                    (self.status == NODE_STATUS.FAILED and self.fail_at_the_end)):
-                try:
-                    self.build()
-                    self.setup()
-                    self.summary()
-                    self.warmstart()
-                    self.refill()
-                    self.process()
-                except VortexForceComplete:
-                    self.sh.title('Force complete')
-                finally:
-                    self.complete()
-            else:
-                self.build()
-                self.subtitle('This task will not run since it failed in a previous step.')
-                raise PreviousFailureError(
-                    'Previous error re-raised from tag={:s}'.format(self.tag)
-                )
-
-
-class Driver(getbytag.GetByTag, NiceLayout):
-    """Iterable object for a simple scheduling of :class:`Application` objects."""
-
-    _tag_default = 'pilot'
-
-    def __init__(self, ticket, nodes=(), rundate=None, iniconf=None,
-                 jobname=None, options=None, iniencoding=None):
-        """Set up default argument values and read the job config file."""
-        self._ticket = t = ticket
-        self._conf = None
-
-        # Set default parameters for the actual job
-        self._options = dict() if options is None else options
-        self._special_prefix = self._options.get('special_prefix', 'OP_').upper()
-        self._subjob_tag = self._options.get('subjob_tag', None)
-        j_assist = self._options.get('jobassistant', None)
-        if j_assist is not None:
-            self._special_prefix = j_assist.special_prefix.upper()
-            self._subjob_tag = j_assist.subjob_tag
-        self._mstep_job_last = self._options.get('mstep_job_last', True)
-        self._dryrun = self._options.get('dryrun', False)
-        self._iniconf = iniconf or t.env.get('{:s}INICONF'.format(self._special_prefix))
-        self._iniencoding = iniencoding or t.env.get('{:s}INIENCODING'.format(self._special_prefix), None)
-        self._jobname = jobname or t.env.get('{:s}JOBNAME'.format(self._special_prefix)) or 'void'
-        self._rundate = rundate or t.env.get('{:s}RUNDATE'.format(self._special_prefix))
-        self._nicelayout_init(dict())
-
-        # Build the tree to schedule
-        self._contents = list()
-        fcount = 0
-        for x in nodes:
-            if isinstance(x, Node):
-                self._contents.append(x)
-            else:
-                fcount += 1
-                self._contents.append(
-                    Family(
-                        tag='{:s}.f{:02d}'.format(self.tag, fcount),
-                        ticket=self.ticket,
-                        nodes=x,
-                        **dict(self._options)
-                    )
-                )
-
-    @property
-    def ticket(self):
-        return self._ticket
-
-    @property
-    def conf(self):
-        return self._conf
-
-    @property
-    def confdiff(self):
-        return self.conf
-
-    @property
-    def sh(self):
-        return self.ticket.sh
-
-    @property
-    def env(self):
-        return self.ticket.env
-
-    @property
-    def iniconf(self):
-        return self._iniconf
-
-    @property
-    def iniencoding(self):
-        return self._iniencoding
-
-    @property
-    def jobconf(self):
-        return self._jobconf
-
-    @property
-    def contents(self):
-        return self._contents
-
-    @property
-    def jobname(self):
-        return self._jobname
-
-    @property
-    def rundate(self):
-        return self._rundate
-
-    def read_config(self, inifile=None, iniencoding=None):
-        """Read the specified ``inifile`` initialisation file."""
-        if inifile is None:
-            inifile = self.iniconf
-        if iniencoding is None:
-            iniencoding = self.iniencoding
-        try:
-            iniparser = GenericConfigParser(inifile, encoding=iniencoding)
-            thisconf = iniparser.as_dict(merged=False)
-        except Exception:
-            logger.critical('Could not read config %s', inifile)
-            raise
-        return thisconf
-
-    def setup(self, name=None, date=None, verbose=True):
-        """Top-level setup of the current configuration, including at least one name."""
-
-        jobname = name or self.jobname
-
-        rundate = date or self.rundate
-        if rundate is None:
-            logger.info('No date provided for this run.')
-
-        if verbose:
-            if rundate is None:
-                self.sh.title(['Starting job', '', jobname])
-            else:
-                self.sh.title(['Starting job', '', jobname, '', 'date ' + rundate.isoformat()])
-
-        # Read the job configuration file once and for all
-        if self.iniconf is None:
-            logger.warning('This driver does not have any configuration file')
-            self._jobconf = dict()
-        else:
-            self._jobconf = self.read_config(self.iniconf, self.iniencoding)
-
-        self._conf = ConfigSet()
-        updconf = self.jobconf.get('defaults', dict())
-        updconf.update(self.jobconf.get(self.jobname, dict()))
-        if self.mstep_counter <= 1:
-            self.nicedump('Configuration for job ' + self.jobname, **updconf)
-        else:
-            print("Silent Driver setup: please refer to the first job step for more details")
-        self.conf.update(updconf)
-
-        # Recursively set the configuration tree and contexts
-        if rundate is not None:
-            self.conf.rundate = rundate
-        for node in self.contents:
-            node.setconf(self.conf, self.jobconf)
-            node.build_context()
-
-        if self.mstep_counter <= 1:
-            self.status = NODE_STATUS.READY
-        if not self._dryrun:
-            self.header('The various nodes were configured. Here is a Tree-View of the Driver:')
-            print(self)
-
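As read_config() and setup() show, the job configuration is an INI-style file whose 'defaults' section is merged with a section named after the job, the job section taking precedence. A minimal illustration of that merge using only the standard library (the section contents are invented for the example; GenericConfigParser is replaced by the stdlib parser):

    # Illustration of the defaults/job-section merge performed in setup().
    import configparser
    import textwrap

    demo_ini = textwrap.dedent("""
        [defaults]
        rundir = /tmp/demo_run
        warmstart = False

        [myjob]
        warmstart = True
    """)

    cp = configparser.ConfigParser()
    cp.read_string(demo_ini)
    updconf = dict(cp['defaults'])
    updconf.update(cp['myjob'])   # the job-specific section overrides the defaults
    print(updconf)                # {'rundir': '/tmp/demo_run', 'warmstart': 'True'}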
-    def run(self):
-        """Recursively call the `run` method of all nodes."""
-        if self._dryrun:
-            raise RuntimeError('This Driver was initialised with "dryrun". ' +
-                               'It is not allowed to call run().')
-        self.status = NODE_STATUS.RUNNING
-        try:
-            for node in self.contents:
-                with node.isolate():
-                    node.run(sjob_activated=self._subjob_tag is None)
-            if self._mstep_job_last:
-                self.status = NODE_STATUS.DONE
-        except Exception:
-            if not self._mstep_job_last and self.any_currently_running:
-                self.sh.title("Handling of the job failure in a multi-job context.")
-                self._print_traceback()
-                print()
-                print("Since it is not the last step of this multi-step job, " +
-                      "the job failure is ignored... for now.")
-            else:
-                raise
-        else:
-            if self.delayed_error_flag and self._subjob_tag is None and self._mstep_job_last:
-                # Test on _subjob_tag because we do not want to crash in subjobs
-                raise RuntimeError("One or several errors occurred during the Driver execution. " +
-                                   "The exceptions were delayed, but now that the Driver has ended, let's crash!")
-        finally:
-            if self.any_failure:
-                self.sh.title('An error occurred during the job...')
-                print('Here is the tree-view of the present Driver:')
-                print(self)
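Finally, this is roughly how a Driver is assembled and executed. The sketch below is hedged: the import paths and the way nodes are built are assumptions; only the Driver(), setup() and run() signatures come from the code above:

    # Hypothetical usage sketch; vortex.ticket() and the import path of Driver
    # are assumed, while the constructor arguments match the code above.
    import vortex
    from vortex.layout.nodes import Driver  # assumed location of the Driver class

    t = vortex.ticket()             # session ticket providing sh, env and the context
    driver = Driver(
        ticket=t,
        jobname='myjob',
        iniconf='conf/myjob.ini',   # read by read_config() during setup()
        nodes=[],                   # Node instances; plain iterables get wrapped in a Family
    )
    driver.setup()                  # read the configuration and configure every node
    driver.run()                    # recursively run the node tree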