vortex-nwp 2.0.0b1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (146)
  1. vortex/__init__.py +135 -0
  2. vortex/algo/__init__.py +12 -0
  3. vortex/algo/components.py +2136 -0
  4. vortex/algo/mpitools.py +1648 -0
  5. vortex/algo/mpitools_templates/envelope_wrapper_default.tpl +27 -0
  6. vortex/algo/mpitools_templates/envelope_wrapper_mpiauto.tpl +29 -0
  7. vortex/algo/mpitools_templates/wrapstd_wrapper_default.tpl +18 -0
  8. vortex/algo/serversynctools.py +170 -0
  9. vortex/config.py +115 -0
  10. vortex/data/__init__.py +13 -0
  11. vortex/data/abstractstores.py +1572 -0
  12. vortex/data/containers.py +780 -0
  13. vortex/data/contents.py +596 -0
  14. vortex/data/executables.py +284 -0
  15. vortex/data/flow.py +113 -0
  16. vortex/data/geometries.ini +2689 -0
  17. vortex/data/geometries.py +703 -0
  18. vortex/data/handlers.py +1021 -0
  19. vortex/data/outflow.py +67 -0
  20. vortex/data/providers.py +465 -0
  21. vortex/data/resources.py +201 -0
  22. vortex/data/stores.py +1271 -0
  23. vortex/gloves.py +282 -0
  24. vortex/layout/__init__.py +27 -0
  25. vortex/layout/appconf.py +109 -0
  26. vortex/layout/contexts.py +511 -0
  27. vortex/layout/dataflow.py +1069 -0
  28. vortex/layout/jobs.py +1276 -0
  29. vortex/layout/monitor.py +833 -0
  30. vortex/layout/nodes.py +1424 -0
  31. vortex/layout/subjobs.py +464 -0
  32. vortex/nwp/__init__.py +11 -0
  33. vortex/nwp/algo/__init__.py +12 -0
  34. vortex/nwp/algo/assim.py +483 -0
  35. vortex/nwp/algo/clim.py +920 -0
  36. vortex/nwp/algo/coupling.py +609 -0
  37. vortex/nwp/algo/eda.py +632 -0
  38. vortex/nwp/algo/eps.py +613 -0
  39. vortex/nwp/algo/forecasts.py +745 -0
  40. vortex/nwp/algo/fpserver.py +927 -0
  41. vortex/nwp/algo/ifsnaming.py +403 -0
  42. vortex/nwp/algo/ifsroot.py +311 -0
  43. vortex/nwp/algo/monitoring.py +202 -0
  44. vortex/nwp/algo/mpitools.py +554 -0
  45. vortex/nwp/algo/odbtools.py +974 -0
  46. vortex/nwp/algo/oopsroot.py +735 -0
  47. vortex/nwp/algo/oopstests.py +186 -0
  48. vortex/nwp/algo/request.py +579 -0
  49. vortex/nwp/algo/stdpost.py +1285 -0
  50. vortex/nwp/data/__init__.py +12 -0
  51. vortex/nwp/data/assim.py +392 -0
  52. vortex/nwp/data/boundaries.py +261 -0
  53. vortex/nwp/data/climfiles.py +539 -0
  54. vortex/nwp/data/configfiles.py +149 -0
  55. vortex/nwp/data/consts.py +929 -0
  56. vortex/nwp/data/ctpini.py +133 -0
  57. vortex/nwp/data/diagnostics.py +181 -0
  58. vortex/nwp/data/eda.py +148 -0
  59. vortex/nwp/data/eps.py +383 -0
  60. vortex/nwp/data/executables.py +1039 -0
  61. vortex/nwp/data/fields.py +96 -0
  62. vortex/nwp/data/gridfiles.py +308 -0
  63. vortex/nwp/data/logs.py +551 -0
  64. vortex/nwp/data/modelstates.py +334 -0
  65. vortex/nwp/data/monitoring.py +220 -0
  66. vortex/nwp/data/namelists.py +644 -0
  67. vortex/nwp/data/obs.py +748 -0
  68. vortex/nwp/data/oopsexec.py +72 -0
  69. vortex/nwp/data/providers.py +182 -0
  70. vortex/nwp/data/query.py +217 -0
  71. vortex/nwp/data/stores.py +147 -0
  72. vortex/nwp/data/surfex.py +338 -0
  73. vortex/nwp/syntax/__init__.py +9 -0
  74. vortex/nwp/syntax/stdattrs.py +375 -0
  75. vortex/nwp/tools/__init__.py +10 -0
  76. vortex/nwp/tools/addons.py +35 -0
  77. vortex/nwp/tools/agt.py +55 -0
  78. vortex/nwp/tools/bdap.py +48 -0
  79. vortex/nwp/tools/bdcp.py +38 -0
  80. vortex/nwp/tools/bdm.py +21 -0
  81. vortex/nwp/tools/bdmp.py +49 -0
  82. vortex/nwp/tools/conftools.py +1311 -0
  83. vortex/nwp/tools/drhook.py +62 -0
  84. vortex/nwp/tools/grib.py +268 -0
  85. vortex/nwp/tools/gribdiff.py +99 -0
  86. vortex/nwp/tools/ifstools.py +163 -0
  87. vortex/nwp/tools/igastuff.py +249 -0
  88. vortex/nwp/tools/mars.py +56 -0
  89. vortex/nwp/tools/odb.py +548 -0
  90. vortex/nwp/tools/partitioning.py +234 -0
  91. vortex/nwp/tools/satrad.py +56 -0
  92. vortex/nwp/util/__init__.py +6 -0
  93. vortex/nwp/util/async.py +184 -0
  94. vortex/nwp/util/beacon.py +40 -0
  95. vortex/nwp/util/diffpygram.py +359 -0
  96. vortex/nwp/util/ens.py +198 -0
  97. vortex/nwp/util/hooks.py +128 -0
  98. vortex/nwp/util/taskdeco.py +81 -0
  99. vortex/nwp/util/usepygram.py +591 -0
  100. vortex/nwp/util/usetnt.py +87 -0
  101. vortex/proxy.py +6 -0
  102. vortex/sessions.py +341 -0
  103. vortex/syntax/__init__.py +9 -0
  104. vortex/syntax/stdattrs.py +628 -0
  105. vortex/syntax/stddeco.py +176 -0
  106. vortex/toolbox.py +982 -0
  107. vortex/tools/__init__.py +11 -0
  108. vortex/tools/actions.py +457 -0
  109. vortex/tools/addons.py +297 -0
  110. vortex/tools/arm.py +76 -0
  111. vortex/tools/compression.py +322 -0
  112. vortex/tools/date.py +20 -0
  113. vortex/tools/ddhpack.py +10 -0
  114. vortex/tools/delayedactions.py +672 -0
  115. vortex/tools/env.py +513 -0
  116. vortex/tools/folder.py +663 -0
  117. vortex/tools/grib.py +559 -0
  118. vortex/tools/lfi.py +746 -0
  119. vortex/tools/listings.py +354 -0
  120. vortex/tools/names.py +575 -0
  121. vortex/tools/net.py +1790 -0
  122. vortex/tools/odb.py +10 -0
  123. vortex/tools/parallelism.py +336 -0
  124. vortex/tools/prestaging.py +186 -0
  125. vortex/tools/rawfiles.py +10 -0
  126. vortex/tools/schedulers.py +413 -0
  127. vortex/tools/services.py +871 -0
  128. vortex/tools/storage.py +1061 -0
  129. vortex/tools/surfex.py +61 -0
  130. vortex/tools/systems.py +3396 -0
  131. vortex/tools/targets.py +384 -0
  132. vortex/util/__init__.py +9 -0
  133. vortex/util/config.py +1071 -0
  134. vortex/util/empty.py +24 -0
  135. vortex/util/helpers.py +184 -0
  136. vortex/util/introspection.py +63 -0
  137. vortex/util/iosponge.py +76 -0
  138. vortex/util/roles.py +51 -0
  139. vortex/util/storefunctions.py +103 -0
  140. vortex/util/structs.py +26 -0
  141. vortex/util/worker.py +150 -0
  142. vortex_nwp-2.0.0b1.dist-info/LICENSE +517 -0
  143. vortex_nwp-2.0.0b1.dist-info/METADATA +50 -0
  144. vortex_nwp-2.0.0b1.dist-info/RECORD +146 -0
  145. vortex_nwp-2.0.0b1.dist-info/WHEEL +5 -0
  146. vortex_nwp-2.0.0b1.dist-info/top_level.txt +1 -0
vortex/layout/nodes.py ADDED
@@ -0,0 +1,1424 @@
1
+ """
2
+ This modules defines the base nodes of the logical layout
3
+ for any :mod:`vortex` experiment.
4
+
5
+ The documentation of this module is probably not enough to understand all the
6
+ features of :class:`Node` and :class:`Driver` objects. The examples provided
7
+ with the Vortex source code (see :ref:`examples_jobs`) may shed some light on
8
+ interesting features.
9
+ """
10
+
11
+ import collections
12
+ import contextlib
13
+ import re
14
+ import sys
15
+ import traceback
16
+
17
+ from bronx.fancies import loggers
18
+ from bronx.patterns import getbytag, observer
19
+ from bronx.syntax.iterators import izip_pcn
20
+ from bronx.system.interrupt import SignalInterruptError
21
+ from footprints import proxy as fpx
22
+ from footprints.stdtypes import FPDict
23
+ from vortex import toolbox, VortexForceComplete
24
+ from vortex.layout.appconf import ConfigSet
25
+ from vortex.layout.subjobs import subjob_handling, SubJobLauncherError
26
+ from vortex.syntax.stdattrs import Namespace
27
+ from vortex.util.config import GenericConfigParser
28
+
29
logger = loggers.getLogger(__name__)

#: Export real nodes.
__all__ = ['Driver', 'Task', 'Family']

#: Tag of the observer board used to broadcast node status changes.
OBSERVER_TAG = 'Layout-Nodes'

#: Definition of a named tuple for Node Statuses
_NodeStatusTuple = collections.namedtuple('_NodeStatusTuple',
                                          ['CREATED', 'READY', 'RUNNING', 'DONE', 'FAILED'])

#: Predefined Node Status values
NODE_STATUS = _NodeStatusTuple(CREATED='created',
                               READY='ready to start',
                               RUNNING='running',
                               DONE='done',
                               FAILED='FAILED')

#: Definition of a named tuple for Node on_error behaviour
_NodeOnErrorTuple = collections.namedtuple('_NodeOnErrorTuple',
                                           ['FAIL', 'DELAYED_FAIL', 'CONTINUE'])

#: Predefined Node on_error values (how a node failure propagates upward)
NODE_ON_ERROR = _NodeOnErrorTuple(FAIL='fail',
                                  DELAYED_FAIL='delayed_fail',
                                  CONTINUE='continue')
55
+
56
+
57
class PreviousFailureError(RuntimeError):
    """Raised in multi-step jobs when a failure already occurred in a previous step."""
60
+
61
+
62
class RequestedFailureError(RuntimeError):
    """Raised when a Node finishes while its `fail_at_the_end` property is True."""
68
+
69
+
70
class NiceLayout(observer.Observer):
    """Some nice methods to share between layout items (Nodes and Drivers).

    Concrete subclasses must provide the :attr:`tag`, :attr:`ticket`,
    :attr:`sh` and :attr:`contents` properties. Statuses, error flags and
    step counters are persisted in the session's datastore so that they
    survive across the steps of a multi-step job.
    """

    @property
    def tag(self):
        """Abstract property: have to be defined later on"""
        raise NotImplementedError

    @property
    def ticket(self):
        """Abstract property: have to be defined later on"""
        raise NotImplementedError

    @property
    def sh(self):
        """Abstract property: have to be defined later on"""
        raise NotImplementedError

    @property
    def contents(self):
        """Abstract property: have to be defined later on"""
        raise NotImplementedError

    def highlight(self, *args, **kw):
        """Proxy to :meth:`~vortex.tools.systems.highlight` method."""
        return self.sh.highlight(*args, bchar=' #', bline0=False, **kw)

    def subtitle(self, *args, **kw):
        """Proxy to :meth:`~vortex.tools.systems.subtitle` method."""
        return self.sh.subtitle(*args, **kw)

    def header(self, *args, **kw):
        """Proxy to :meth:`~vortex.tools.systems.header` method."""
        return self.sh.header(*args, **kw)

    def nicedump(self, msg, titlecallback=None, **kw):
        """Simple dump of the dict contents with ``msg`` as header."""
        titlecallback = titlecallback or self.header
        titlecallback(msg)
        if kw:
            # Align the "=" signs on the longest key
            maxlen = max([len(x) for x in kw.keys()])
            for k, v in sorted(kw.items()):
                print(' +', k.ljust(maxlen), '=', str(v))
            print()
        else:
            print(" + ...\n")

    def _print_traceback(self):
        # Pretty-print the exception currently being handled (uses sys.exc_info,
        # so it must be called from within an except clause)
        exc_type, exc_value, exc_traceback = sys.exc_info()
        print('Exception type: {!s}'.format(exc_type))
        print('Exception values: {!s}'.format(exc_value))
        self.header('Traceback Error / BEGIN')
        print("\n".join(traceback.format_tb(exc_traceback)))
        self.header('Traceback Error / END')

    @property
    def _ds_extra(self):
        # The key identifying this object's entries in the session's datastore
        return {'tag': self.tag, 'class': self.__class__.__name__}

    def _nicelayout_init(self, kw):
        """Initialise generic stuff."""
        self._on_error = kw.get('on_error', NODE_ON_ERROR.FAIL)
        if self._on_error not in NODE_ON_ERROR:
            raise ValueError('Erroneous value for on_error: {!s}'.format(self._on_error))
        self._obs_board = observer.get(tag=OBSERVER_TAG)
        # Advertise this object's creation on the observer board...
        self._obs_board.notify_new(self, dict(tag=self.tag, typename=type(self).__name__,
                                              status=self.status,
                                              on_error=self.on_error))
        # ... and start listening to other nodes' notifications (see updobsitem)
        self._obs_board.register(self)
        # Increment the mstep counter
        self.ticket.datastore.insert('layout_mstep_counter', self._ds_extra,
                                     self.mstep_counter + 1, readonly=False)

    def updobsitem(self, item, info):
        """Observer callback: react to status notifications from other nodes."""
        if info.get('observerboard', '') == OBSERVER_TAG:
            o_id = (info['tag'], info['typename'])
            if info.get('subjob_replay', False):
                # If the status/delayed_error_flag chatter is being replayed,
                # deal with it
                if o_id == (self.tag, type(self).__name__):
                    if 'new_status' in info:
                        self._store_status(info['new_status'])
                    if info.get('delayed_error_flag', False):
                        self._store_delayed_error_flag(True)
            else:
                if (self.status != NODE_STATUS.CREATED and
                        any([o_id == (k.tag, type(k).__name__) for k in self.contents])):
                    # We are only interested in child nodes
                    if info.get('new_status', None) == NODE_STATUS.FAILED:
                        # On kid failure, update my own status
                        if info['on_error'] == NODE_ON_ERROR.FAIL:
                            self.status = NODE_STATUS.FAILED
                    if 'delayed_error_flag' in info:
                        # Propagate the delayed error flag
                        self.delayed_error_flag = True

    @property
    def mstep_counter(self):
        """Count how many times this object was created."""
        return self.ticket.datastore.get('layout_mstep_counter', self._ds_extra,
                                         default_payload=0, readonly=False)

    @property
    def on_error(self):
        """How to react on error."""
        return self._on_error

    @property
    def delayed_error_flag(self):
        """Return the delayed error flag."""
        return self.ticket.datastore.get('layout_delayed_error_flag', self._ds_extra,
                                         default_payload=False, readonly=False)

    def _store_delayed_error_flag(self, value):
        # Persist the flag in the datastore (shared across job steps)
        self.ticket.datastore.insert('layout_delayed_error_flag', self._ds_extra,
                                     value, readonly=False)

    @delayed_error_flag.setter
    def delayed_error_flag(self, value):
        """Set the delayed error flag of the current Node/Driver (True only)."""
        if not bool(value):
            raise ValueError('True is the only possible value for delayed_error_flag')
        if not self.delayed_error_flag:
            # Notify only on the False -> True transition
            self._obs_board.notify_upd(self, dict(tag=self.tag, typename=type(self).__name__,
                                                  delayed_error_flag=True))
            self._store_delayed_error_flag(True)

    @property
    def status(self):
        """Return the status of the current Node/Driver."""
        return self.ticket.datastore.get('layout_status', self._ds_extra,
                                         default_payload=NODE_STATUS.CREATED, readonly=False)

    @property
    def status_mstep_counter(self):
        """Return the number of the multi-step that last updated the status."""
        return self.ticket.datastore.get('layout_status_mstep', self._ds_extra,
                                         default_payload=0, readonly=False)

    def _store_status(self, value):
        # Persist the status and record which job step wrote it
        self.ticket.datastore.insert('layout_status', self._ds_extra,
                                     value, readonly=False)
        self._store_status_mstep_counter()

    def _store_status_mstep_counter(self):
        # Remember that the status was (re)asserted during the current job step
        self.ticket.datastore.insert('layout_status_mstep', self._ds_extra,
                                     self.mstep_counter, readonly=False)

    @status.setter
    def status(self, value):
        """Set the status of the current Node/Driver."""
        if value not in NODE_STATUS:
            raise ValueError('Erroneous value for the node status: {!s}'.format(value))
        if value != self.status:
            # Broadcast the transition so that parent nodes can react (see updobsitem)
            self._obs_board.notify_upd(self, dict(tag=self.tag, typename=type(self).__name__,
                                                  previous_status=self.status,
                                                  new_status=value,
                                                  on_error=self.on_error))
            if value == NODE_STATUS.FAILED and self.on_error == NODE_ON_ERROR.DELAYED_FAIL:
                self.delayed_error_flag = True
            self._store_status(value)
        else:
            # Same status: just refresh the "last updated by" step counter
            self._store_status_mstep_counter()

    @property
    def any_failure(self):
        """Return True if self or any of the subnodes failed."""
        failure = self.status == NODE_STATUS.FAILED
        return failure or any([k.any_failure for k in self.contents])

    @property
    def any_currently_running(self):
        """Return True if self or any of the subnodes is running."""
        # "running" only counts if it was asserted during the current job step
        running = (self.status == NODE_STATUS.RUNNING and
                   self.status_mstep_counter == self.mstep_counter)
        return running or any([k.any_currently_running for k in self.contents])

    def tree_str(self, statuses_filter=(), with_conf=False):
        """Print the node's tree.

        :param statuses_filter: when not empty, only nodes whose status is in
            this collection are displayed.
        :param bool with_conf: also display each node's configuration delta
            (see :attr:`confdiff`).
        """
        # Kids contribution
        filtered_kids = [k for k in self.contents
                         if not statuses_filter or k.status in statuses_filter]
        kids_str = ['\n'.join('{:s}{:s} {:s}'.format('|' if i == 0 or ikid < len(filtered_kids) - 1 else ' ',
                                                     '--' if i == 0 else ' ',
                                                     line)
                              for i, line in enumerate(kid.tree_str(statuses_filter=statuses_filter,
                                                                    with_conf=with_conf).split('\n')))
                    for ikid, kid in enumerate(filtered_kids)]
        # Myself
        tree = []
        if not statuses_filter or self.status in statuses_filter:
            if len(statuses_filter) != 1:
                me_fmt = '{tag:s} ({what:s}) -> {status:s}'
            else:
                # With a single-status filter, displaying the status is redundant
                me_fmt = '{tag:s} ({what:s})'
            x_status = self.status
            if x_status == NODE_STATUS.RUNNING and self.status_mstep_counter < self.mstep_counter:
                # Still flagged "running" but not by the current job step:
                # it was interrupted because of another node's failure
                x_status = "interrupted because of others errors"
            me = me_fmt.format(tag=self.tag, what=self.__class__.__name__, status=x_status)
            if self.status == NODE_STATUS.FAILED and self.on_error != NODE_ON_ERROR.FAIL:
                me += ' (but {:s})'.format(self.on_error)
            tree.append(me)
            if with_conf:
                cd = self.confdiff
                if cd:
                    tree.extend(['{:s} {:s}={!s}'.format('|' if self.contents else ' ', k, v)
                                 for k, v in sorted(cd.items())])
        # Myself + kids
        tree.extend(kids_str)
        return '\n'.join(tree)

    def __str__(self):
        """Print the node's tree."""
        return self.tree_str()
284
+
285
+
286
+ class Node(getbytag.GetByTag, NiceLayout):
287
+ """Base class type for any element in the logical layout.
288
+
289
+ :param str tag: The node's tag (must be unique !)
290
+ :param Ticket ticket: The session's ticket that will be used
291
+ :param str config_tag: The configuration's file section name that will be used
292
+ to setup this node (default: ``self.tag``)
293
+ :param active_callback: Some function or lambda that will be called with
294
+ ``self`` as first argument in order to determine if
295
+ the current not should be used (default: ``None``.
296
+ i.e. The node is active).
297
+ :param str special_prefix: The prefix of any environment variable that should
298
+ be exported into ``self.conf``
299
+ :param str register_cycle_prefix: The callback function used to initialise
300
+ Genv's cycles
301
+ :param JobAssistant jobassistant: the jobassistant object that might
302
+ be used to find out the **special_prefix**
303
+ and **register_cycle_prefix** callback.
304
+ :param str on_error: How to react when a failure occurs (default is "fail",
305
+ alternatives are "delayed_fail" and "continue")
306
+ :param dict kw: Any other attributes that will be added to ``self.options``
307
+ (that will eventually be added to ``self.conf``)
308
+ """
309
+
310
    def __init__(self, kw):
        # NOTE: **kw** is consumed in place — every known option is popped
        # from it and whatever remains is handed over to _nicelayout_init()
        # (see the class docstring for the description of the options).
        logger.debug('Node initialisation %s', repr(self))
        self.options = dict()
        self.play = kw.pop('play', False)
        self._ticket = kw.pop('ticket', None)
        if self._ticket is None:
            raise ValueError("The session's ticket must be provided (using a `ticket` argument)")
        self._configtag = kw.pop('config_tag', self.tag)
        self._active_cb = kw.pop('active_callback', None)
        if self._active_cb is not None and not callable(self._active_cb):
            raise ValueError("If provided, active_callback must be a callable")
        self._locprefix = kw.pop('special_prefix', 'OP_').upper()
        self._subjobok = kw.pop('subjob_allowed', True)
        self._subjobtag = kw.pop('subjob_tag', None)
        self._cycle_cb = kw.pop('register_cycle_prefix', None)
        j_assist = kw.pop('jobassistant', None)
        if j_assist is not None:
            # A JobAssistant object supersedes the individual settings above
            self._locprefix = j_assist.special_prefix.upper()
            self._cycle_cb = j_assist.register_cycle
            self._subjobok = j_assist.subjob_allowed
            self._subjobtag = j_assist.subjob_tag
        self._mstep_job_last = kw.pop('mstep_job_last', True)
        self._dryrun = kw.pop('dryrun', False)
        self._conf = None  # set up later by setconf()
        self._parentconf = None
        self._activenode = None  # lazily computed by the "activenode" property
        self._contents = list()
        self._nicelayout_init(kw)
338
+
339
+ def _args_loopclone(self, tagsuffix, extras): # @UnusedVariable
340
+ """All the necessary arguments to build a copy of this object."""
341
+ argsdict = dict(play=self.play,
342
+ ticket=self.ticket,
343
+ config_tag=self.config_tag,
344
+ active_callback=self._active_cb,
345
+ special_prefix=self._locprefix,
346
+ register_cycle_prefix=self._cycle_cb,
347
+ subjob_tag=self._subjobtag,
348
+ subjob_allowed=self._subjobok,
349
+ mstep_job_last=self._mstep_job_last,
350
+ dryrun=self._dryrun
351
+ )
352
+ argsdict.update(self.options)
353
+ return argsdict
354
+
355
+ def loopclone(self, tagsuffix, extras):
356
+ """Create a copy of the present object by adding a suffix to the tag.
357
+
358
+ **extras** items can be added to the copy's options.
359
+ """
360
+ kwargs = self._args_loopclone(tagsuffix, extras)
361
+ kwargs.update(**extras)
362
+ return self.__class__(tag=self.tag + tagsuffix, **kwargs)
363
+
364
    @classmethod
    def tag_clean(cls, tag):
        """Lower case, space-free and underscore-free tag."""
        # NOTE(review): despite the docstring, underscores are NOT removed
        # here — only spaces are stripped. TODO confirm intended behaviour.
        return tag.lower().replace(' ', '')

    @property
    def ticket(self):
        # The vortex session's Ticket provided at creation time
        return self._ticket

    @property
    def config_tag(self):
        # Name of the configuration file section used to set up this node
        return self._configtag

    @property
    def conf(self):
        # The node's configuration set (None until setconf() is called)
        return self._conf

    @property
    def confdiff(self):
        """Return a ConfigSet holding only the keys that differ from the parent's conf."""
        cs = ConfigSet()
        cs.update({k: v for k, v in self._conf.items()
                   if k not in self._parentconf or self._parentconf[k] != v})
        return cs

    @property
    def activenode(self):
        """Tell whether this node is active (evaluates active_callback once, then caches)."""
        if self._activenode is None:
            if self.conf is None:
                raise RuntimeError('Setup the configuration object before calling activenode !')
            # No callback means the node is always active
            self._activenode = self._active_cb is None or self._active_cb(self)
        return self._activenode

    @property
    def sh(self):
        # Shortcut to the session's System-like object
        return self.ticket.sh

    @property
    def env(self):
        # Shortcut to the session's Environment object
        return self.ticket.env

    @property
    def contents(self):
        # The list of child nodes
        return self._contents

    def clear(self):
        """Clear actual contents."""
        self._contents[:] = []

    def __iter__(self):
        # Iterate over the child nodes
        yield from self.contents

    @property
    def fail_at_the_end(self):
        """Tells whether the Node should fail when it reaches the end of 'run'."""
        return self.ticket.datastore.get('layout_fail_at_the_end', self._ds_extra,
                                         default_payload=False, readonly=False)

    @fail_at_the_end.setter
    def fail_at_the_end(self, value):
        """Tells whether the Node should fail when it reaches the end of 'run'."""
        self.ticket.datastore.insert('layout_fail_at_the_end', self._ds_extra,
                                     bool(value), readonly=False)
426
+
427
    def build_context(self):
        """Build the context and subcontexts of the current node."""
        if self.activenode:
            oldctx = self.ticket.context
            # Create a brand new subcontext named after this node's tag
            ctx = self.ticket.context.newcontext(self.tag, focus=True)
            if not self._dryrun:
                ctx.cocoon()
            self._setup_context(ctx)
            # Restore the previously active context
            oldctx.activate()
        if self.status == NODE_STATUS.CREATED:
            self.status = NODE_STATUS.READY

    def _setup_context(self, ctx):
        """Setup the newly created context."""
        # Hook for subclasses (e.g. to populate the context's sequence)
        pass
442
+
443
    @contextlib.contextmanager
    def isolate(self):
        """Deal with any events related to the actual run.

        Wraps the node's execution within context switching and status
        handling; inactive nodes are simply passed through.
        """
        if self.activenode:
            with self._context_isolation():
                if self._subjobtag == self.tag:
                    # This node runs as a subjob: presumably subjob_handling
                    # forwards the status chatter to the parent job — see
                    # vortex.layout.subjobs for details.
                    with subjob_handling(self, OBSERVER_TAG):
                        with self._status_isolation(extra_verbose=True):
                            yield
                else:
                    with self._status_isolation():
                        yield
        else:
            # Inactive node: no context switch, no status update
            yield

    @contextlib.contextmanager
    def _context_isolation(self):
        """Handle context switching properly."""
        self._oldctx = self.ticket.context
        ctx = self.ticket.context.switch(self.tag)
        ctx.cocoon()
        logger.debug('Node context directory <%s>', self.sh.getcwd())
        try:
            yield
        finally:
            # Always restore the previous context, even on error
            ctx.free_resources()
            logger.debug('Exit context directory <%s>', self.sh.getcwd())
            self._oldctx.activate()
            self.ticket.context.cocoon()

    @contextlib.contextmanager
    def _status_isolation(self, extra_verbose=False):
        """Handle the Node's status updates.

        :param bool extra_verbose: print the traceback even when the
            exception will be re-raised (on_error == "fail").
        """
        if self.status in (NODE_STATUS.READY, NODE_STATUS.RUNNING):
            self.status = NODE_STATUS.RUNNING
        try:
            yield
        except Exception:
            self.status = NODE_STATUS.FAILED
            if extra_verbose or self.on_error != NODE_ON_ERROR.FAIL:
                # Mask the exception
                self.subtitle('An exception occurred (on_error={:s})'.format(self.on_error))
                self._print_traceback()
            if self.on_error == NODE_ON_ERROR.FAIL:
                raise
        else:
            # Mark DONE only during the last step of a multi-step job
            if self.status == NODE_STATUS.RUNNING and self._mstep_job_last:
                self.status = NODE_STATUS.DONE
491
+
492
+ def setconf(self, conf_local, conf_global):
493
+ """Build a new conf object for the actual node."""
494
+
495
+ # The parent conf is the default configuration
496
+ if isinstance(conf_local, ConfigSet):
497
+ self._conf = conf_local.copy()
498
+ else:
499
+ self._conf = ConfigSet()
500
+ self._conf.update(conf_local)
501
+ self._parentconf = self._conf.copy()
502
+ self._active = None
503
+
504
+ # This configuration is updated with any section with the current tag name
505
+ updconf = conf_global.get(self.config_tag, dict())
506
+ if self.mstep_counter <= 1:
507
+ self.nicedump(' '.join(('Configuration for', self.realkind, self.tag)), **updconf)
508
+ self.conf.update(updconf)
509
+
510
+ # Add exported local variables
511
+ self.local2conf()
512
+
513
+ # Add potential options
514
+ if self.options:
515
+ if self.mstep_counter <= 1:
516
+ self.nicedump('Update conf with last minute arguments',
517
+ titlecallback=self.highlight, **self.options)
518
+ self.conf.update(self.options)
519
+
520
+ if self.activenode:
521
+ # Then we broadcast the current configuration to the kids
522
+ for node in self.contents:
523
+ node.setconf(self.conf, conf_global)
524
+ else:
525
+ logger.info('Under present conditions/configuration, this node will not be activated.')
526
+
527
    def localenv(self):
        """Dump the actual env variables."""
        self.header('ENV catalog')
        self.env.mydump()

    def local2conf(self):
        """Set some parameters if defined in environment but not in actual conf."""
        autoconf = dict()
        localstrip = len(self._locprefix)
        # Scan every environment variable starting with the special prefix
        # (e.g. "OP_"); the prefix is stripped to obtain the conf key.
        for localvar in sorted([x for x in self.env.keys() if x.startswith(self._locprefix)]):
            # Import the variable when it is missing from the conf, or when
            # it differs from the conf value (except for 'rundate' which is
            # never overwritten from the environment).
            if (localvar[localstrip:] not in self.conf or
                    (localvar[localstrip:] not in ('rundate', ) and
                     self.env[localvar] is not None and
                     self.env[localvar] != self.conf[localvar[localstrip:]])):
                autoconf[localvar[localstrip:].lower()] = self.env[localvar]
        if autoconf:
            if self.mstep_counter <= 1:
                self.nicedump('Populate conf with local variables',
                              titlecallback=self.highlight, **autoconf)
            self.conf.update(autoconf)
547
+
548
    def conf2io(self):
        """Abstract method."""
        # Hook for subclasses willing to tune I/O settings from the conf
        pass

    def xp2conf(self):
        """Set the actual experiment value -- Could be the name of the op suite if any."""
        if 'xpid' not in self.conf:
            # Fall back on the 'suite' conf key, then the VORTEX_XPID env variable
            self.conf.xpid = self.conf.get('suite', self.env.VORTEX_XPID)
        if self.conf.xpid is None:
            raise ValueError('Could not set a proper experiment id.')

    def register_cycle(self, cyclename):
        """Adds a new cycle to genv if a proper callback is defined."""
        if self._cycle_cb is not None:
            self._cycle_cb(cyclename)
        else:
            raise NotImplementedError()

    def cycles(self):
        """Update and register some configuration cycles."""

        # Any conf key ending with "_cycle" is considered an extra cycle
        other_cycles = [x for x in self.conf.keys() if x.endswith('_cycle')]
        if 'cycle' in self.conf or other_cycles:
            self.header("Registering cycles")

        # At least, look for the main cycle
        if 'cycle' in self.conf:
            self.register_cycle(self.conf.cycle)

        # Have a look to other cycles
        for other in other_cycles:
            self.register_cycle(self.conf.get(other))

    def geometries(self):
        """Setup geometries according to actual tag."""
        thisgeo = self.tag + '_geometry'
        if thisgeo in self.conf:
            # A tag-specific geometry takes precedence over the default one
            self.conf.geometry = self.conf.get(thisgeo)
        if 'geometry' not in self.conf:
            logger.warning('No default geometry defined !')
588
+
589
+ def defaults(self, extras):
590
+ """Set toolbox defaults, extended with actual arguments ``extras``."""
591
+ t = self.ticket
592
+ toolbox.defaults(
593
+ model=t.glove.vapp,
594
+ namespace=self.conf.get('namespace', Namespace('vortex.cache.fr')),
595
+ gnamespace=self.conf.get('gnamespace', Namespace('gco.multi.fr')),
596
+ )
597
+
598
+ if 'rundate' in self.conf:
599
+ toolbox.defaults['date'] = self.conf.rundate
600
+
601
+ for optk in ('cutoff', 'geometry', 'cycle', 'model'):
602
+ if optk in self.conf:
603
+ value = self.conf.get(optk)
604
+ if isinstance(value, dict):
605
+ value = FPDict(value)
606
+ toolbox.defaults[optk] = self.conf.get(optk)
607
+
608
+ toolbox.defaults(**extras)
609
+ self.header('Toolbox defaults')
610
+ toolbox.defaults.show()
611
+
612
    def setup(self, **kw):
        """A methodic way to build the conf of the node.

        :param dict kw: Last-minute configuration updates (the special key
            ``defaults`` is forwarded to :meth:`defaults`).
        """
        self.subtitle(self.realkind.upper() + ' setup')
        self.localenv()
        self.local2conf()
        self.conf2io()
        self.xp2conf()
        if kw:
            if self.mstep_counter <= 1:
                self.nicedump('Update conf with last minute arguments', **kw)
            self.conf.update(kw)
        self.cycles()
        self.geometries()
        self.defaults(kw.get('defaults', dict()))
626
+
627
    def summary(self):
        """Dump actual parameters of the configuration."""
        if self.mstep_counter <= 1:
            self.nicedump('Complete parameters', **self.conf)
        else:
            # Subsequent job steps stay quiet to keep the logs readable
            self.header('Complete parameters')
            print("Silent Node' setup: please refer to the first job step for more details")

    def complete(self):
        """Some cleaning and completion status."""
        # Hook for subclasses: called when the node's execution is over
        pass

    def _actual_run(self, sjob_activated=True):
        """Abstract method: the actual job to do."""
        pass
642
+
643
    def run(self, sjob_activated=True):
        """Execution driver: setup, run, complete... (if needed).

        :raises RuntimeError: when the node was created with ``dryrun=True``.
        :raises RequestedFailureError: when :attr:`fail_at_the_end` is set
            once the actual run is over.
        """
        if self._dryrun:
            raise RuntimeError('This Node was initialised with "dryrun". ' +
                               'It is not allowed to call run().')
        if self.activenode:
            try:
                self._actual_run(sjob_activated)
            except Exception:
                # A real exception supersedes any "fail at the end" request
                self.fail_at_the_end = False
                raise
            else:
                if self.fail_at_the_end:
                    raise RequestedFailureError(
                        'An error occurred in {:s}. '.format(self.tag) +
                        'Please dive into the present log to understand why.'
                    )
660
+
661
    def filter_execution_error(self, exc):  # @UnusedVariable
        """
        May be overwritten if exceptions generated by the AlgoComponent needs
        to be filtered.

        :param Exception exc: The exception that triggered the call

        :return: Two elements. The first item (boolean) tells whether or not
            a delayed exception error is to be masked. The second item is a
            (possibly empty) dictionary that gives some extra information
            about the warning/error (such information could be used to
            generate a meaningful alert email).

        :note: Do not re-raise the **exc** exception in this method.
        """
        # Default: never mask, no extra information
        return False, dict()

    def report_execution_warning(self, exc, **kw_infos):  # @UnusedVariable
        """
        May be overwritten if a report needs to be sent when a filtered
        execution error occurs.

        :param Exception exc: The exception that triggered the call
        :param dict kw_infos: Any kind of extra information provided by the
            :meth:`filter_execution_error`.

        :note: Do not re-raise the **exc** exception in this method.
        """
        pass

    def report_execution_error(self, exc, **kw_infos):  # @UnusedVariable
        """
        May be overwritten if a report needs to be sent when an un-filtered
        execution error occurs.

        :param Exception exc: The exception that triggered the call
        :param dict kw_infos: Any kind of extra information provided by the
            :meth:`filter_execution_error`.

        :note: Do not re-raise the **exc** exception in this method.
        """
        pass

    def delay_execution_error(self, exc, **kw_infos):  # @UnusedVariable
        """
        Tells whether the execution error needs to be ignored temporarily
        (an exception will still be raised when the Node exits).

        :param Exception exc: The exception that triggered the call
        :param dict kw_infos: Any kind of extra information provided by the
            :meth:`filter_execution_error`.

        :note: Do not re-raise the **exc** exception in this method.
        """
        return self.conf.get('delay_component_errors', False)
716
+
717
    def component_runner(self, tbalgo, tbx=(None,), **kwargs):
        """Run the binaries listed in tbx using the tbalgo algo component.

        This is a helper method that may be useful (its use is not mandatory).

        :param tbalgo: The AlgoComponent whose ``run`` method will be called.
        :param tbx: The sequence of binaries to run (a tuple of lists/tuples
            may be given when several binaries are launched by the same MPI
            command).
        :param kwargs: Any extra keyword argument passed on to ``tbalgo.run``.
        """
        # it may be necessary to setup a default value for OpenMP...
        # (only when "openmp" is scalar or absent: a list/tuple means a
        # per-binary setting that cannot be exported as a single env variable)
        env_update = dict()
        if 'openmp' not in self.conf or not isinstance(self.conf.openmp, (list, tuple)):
            env_update['OMP_NUM_THREADS'] = int(self.conf.get('openmp', 1))

        # If some mpiopts are in the config file, use them...
        # Explicit entries in **kwargs**' "mpiopts" take precedence over self.conf;
        # names are remapped to the short forms expected by the MPI tools layer.
        mpiopts = kwargs.pop('mpiopts', dict())
        mpiopts_map = dict(nnodes='nn', ntasks='nnp', nprocs='np', proc='np')
        for stuff in [s
                      for s in ('proc', 'nprocs', 'nnodes', 'ntasks', 'openmp',
                                'prefixcommand', 'envelope')
                      if s in mpiopts or s in self.conf]:
            mpiopts[mpiopts_map.get(stuff, stuff)] = mpiopts.pop(stuff, self.conf[stuff])

        # if the prefix command is missing in the configuration file, look in the input sequence
        if 'prefixcommand' not in mpiopts:
            prefixes = self.ticket.context.sequence.effective_inputs(role=re.compile('Prefixcommand'))
            if len(prefixes) > 1:
                raise RuntimeError("Only one prefix command can be used...")
            for sec in prefixes:
                prefixpath = sec.rh.container.actualpath()
                logger.info('The following MPI prefix command will be used: %s', prefixpath)
                mpiopts['prefixcommand'] = prefixpath

        # Ensure that some of the mpiopts are integers
        # (values may be lists when several binaries share one MPI command)
        for stuff in [s for s in ('nn', 'nnp', 'openmp', 'np') if s in mpiopts]:
            if isinstance(mpiopts[stuff], (list, tuple)):
                mpiopts[stuff] = [int(v) for v in mpiopts[stuff]]
            else:
                mpiopts[stuff] = int(mpiopts[stuff])

        # Read the configuration file for some extra configuration
        # (only keys ending in _mpilauncher/_mpiopts/_mpiwrapstd/_mpibind_topology
        # are forwarded, and explicit **kwargs always win)
        allowed_conf_extras = ('launcher', 'opts', 'wrapstd', 'bind_topology')
        for k, v in self.conf.items():
            if (k not in kwargs and '_mpi' in k and
                    any([k.endswith('_mpi' + a) for a in allowed_conf_extras])):
                kwargs[k] = v

        # When multiple list of binaries are given (i.e several binaries are launched
        # by the same MPI command).
        if tbx and isinstance(tbx[0], (list, tuple)):
            tbx = zip(*tbx)
        with self.env.delta_context(**env_update):
            with self.sh.default_target.algo_run_context(self.ticket, self.conf):
                for binary in tbx:
                    try:
                        tbalgo.run(binary, mpiopts=mpiopts, **kwargs)
                    except (Exception, SignalInterruptError, KeyboardInterrupt) as e:
                        # Give subclasses a chance to mask or annotate the error
                        mask_delayed, f_infos = self.filter_execution_error(e)
                        if isinstance(e, Exception) and mask_delayed:
                            # Filtered: just warn and keep going
                            logger.warning("The delayed exception is masked:\n%s", str(f_infos))
                            self.report_execution_warning(e, **f_infos)
                        else:
                            logger.error("Un-filtered execution error:\n%s", str(f_infos))
                            self.report_execution_error(e, **f_infos)
                            # Signals/KeyboardInterrupt are never delayed (isinstance check)
                            if isinstance(e, Exception) and self.delay_execution_error(e, **f_infos):
                                self.subtitle(
                                    'An exception occurred but the crash is delayed until the end of the Node'
                                )
                                self._print_traceback()
                                # Actually delay the crash
                                self.fail_at_the_end = True
                                print()
                            else:
                                raise
789
class Family(Node):
    """Logical group of :class:`Family` or :class:`Task`.

    Compared to the usual :class:`Node` class, additional attributes are:

    :param nodes: The list of :class:`Family` or :class:`Task` objects that
        are members of this family (a plain list item is automatically
        wrapped into an anonymous sub-:class:`Family`)
    """

    def __init__(self, **kw):
        logger.debug('Family init %s', repr(self))
        # NOTE(review): the Node base class receives the kw dict itself (not
        # **kw) and apparently consumes some options in place — confirm
        # against Node.__init__ before changing this call.
        super().__init__(kw)
        nodes = kw.pop('nodes', list())
        # Whatever remains in kw is kept as generic options
        self.options = kw.copy()

        # Build the nodes sequence
        fcount = 0
        for x in nodes:
            if isinstance(x, Node):
                self._contents.append(x)
            else:
                # Not a Node: wrap it into an anonymous sub-family with a
                # generated tag ("<this-tag>.fNN")
                fcount += 1
                self._contents.append(
                    Family(
                        tag='{:s}.f{:02d}'.format(self.tag, fcount),
                        ticket=self.ticket,
                        nodes=x,
                        **kw
                    )
                )

    @property
    def realkind(self):
        return 'family'

    def _args_loopclone(self, tagsuffix, extras):  # @UnusedVariable
        # Clone the whole sub-tree when this family is looped upon
        baseargs = super()._args_loopclone(tagsuffix, extras)
        baseargs['nodes'] = [node.loopclone(tagsuffix, extras) for node in self._contents]
        return baseargs

    def _setup_context(self, ctx):
        """Build the contexts of all the nodes contained by this family."""
        for node in self.contents:
            node.build_context()

    def localenv(self):
        """No env dump in families (it is enough to dump it in Tasks)."""
        pass

    def summary(self):
        """No parameters dump in families (it is enough to dump it in Tasks)."""
        pass

    @property
    def _parallel_launchtool(self):
        """Create a launchtool for parallel runs (if sensible only)."""
        if self._subjobok and self._subjobtag is None and 'paralleljobs_kind' in self.conf:
            # Subjobs are allowed and I am the main job (because self._subjobtag is None):
            # => Run the family's content using subjobs

            # Create the subjob launcher (options are the "paralleljobs_*"
            # configuration keys, with the prefix stripped)
            launcher_opts = {k[len('paralleljobs_'):]: self.conf[k]
                             for k in self.conf if k.startswith('paralleljobs_')}
            launchtool = fpx.subjobslauncher(scriptpath=sys.argv[0],
                                             nodes_obsboard_tag=OBSERVER_TAG,
                                             **launcher_opts)
            if launchtool is None:
                raise RuntimeError('No subjob launcher could be found: check "paralleljobs_kind".')
            launchtool.ticket = self.ticket
            return launchtool
        else:
            return None

    def _actual_run(self, sjob_activated=True):
        """Execution driver: setup, run kids, complete."""
        launchtool = self._parallel_launchtool
        if launchtool:
            self.ticket.sh.title(' '.join(('Build', self.realkind, self.tag, '(using subjobs)')))

            def node_recurse(some_node):
                """Recursively find tags."""
                o_set = {some_node.tag}
                for snode in some_node.contents:
                    o_set = o_set | node_recurse(snode)
                return o_set

            # Launch each family's member
            for node in self.contents:
                launchtool(node.tag, node_recurse(node))
            # Wait for everybody to complete
            done, ko = launchtool.waitall()
            if ko:
                raise SubJobLauncherError("Execution failed for some subjobs: {:s}"
                                          .format(','.join(ko)))
        else:
            # No subjobs configured or allowed: run the usual way...
            sjob_activated = sjob_activated or self._subjobtag == self.tag
            try:
                self.ticket.sh.title(' '.join(('Build', self.realkind, self.tag)))
                self.setup()
                self.summary()
                for node in self.contents:
                    with node.isolate():
                        node.run(sjob_activated=sjob_activated)
            finally:
                # Always complete, even if a child node crashed
                self.complete()
897
class LoopFamily(Family):
    """
    Loop on the Family's content according to a variable taken from ``self.conf``.

    Compared to the usual :class:`Family` class, additional attributes are:

    :param str loopconf: The name of the ``self.conf`` entry to loop on
        (several comma-separated names may be given).
    :param str loopvariable: The name of the loop control variable (that is
        automatically added to the child's self.conf). By default,
        **loopconf** without trailing ``s`` is used.
    :param str loopsuffix: The suffix that will be added to the child's tag.
        By default '+loopvariable{!s}' (where {!s} will be replaced by the
        loop control variable's value).
    :param bool loopneedprev: Ensure that the previous value is available
    :param bool loopneednext: Ensure that the next value is available
    """

    def __init__(self, **kw):
        logger.debug('LoopFamily init %s', repr(self))
        # What should we iterate on?
        loopconf = kw.pop('loopconf', None)
        if not loopconf:
            raise ValueError('The "loopconf" named argument must be given')
        self._loopconf = loopconf.split(',')
        # The loop control variable name(s)
        loopvariable = kw.pop('loopvariable', None)
        if loopvariable is None:
            # Default: the conf entry name(s), without any trailing "s"
            self._loopvariable = [entry.rstrip('s') for entry in self._loopconf]
        else:
            self._loopvariable = loopvariable.split(',')
        if len(self._loopvariable) != len(self._loopconf):
            raise ValueError('Inconsistent size between loopconf and loopvariable')
        # The tag suffix template for each child
        self._loopsuffix = kw.pop('loopsuffix', None)
        if self._loopsuffix is None:
            self._loopsuffix = '+' + self._loopvariable[0] + '{0!s}'
        # Prev/Next availability requirements
        self._loopneedprev = kw.pop('loopneedprev', False)
        self._loopneednext = kw.pop('loopneednext', False)
        # Generic init...
        super().__init__(**kw)
        # The expanded content is built lazily (see the contents property)
        self._actual_content = None

    def _args_loopclone(self, tagsuffix, extras):  # @UnusedVariable
        baseargs = super()._args_loopclone(tagsuffix, extras)
        baseargs.update(
            loopconf=','.join(self._loopconf),
            loopvariable=','.join(self._loopvariable),
            loopsuffix=self._loopsuffix,
            loopneedprev=self._loopneedprev,
            loopneednext=self._loopneednext,
        )
        return baseargs

    @property
    def contents(self):
        if self._actual_content is None:
            self._actual_content = expanded = list()
            loopseries = [self.conf.get(entry) for entry in self._loopconf]
            for pvars, cvars, nvars in izip_pcn(*loopseries):
                # Skip iterations lacking a required previous/next value
                if self._loopneedprev and all(value is None for value in pvars):
                    continue
                if self._loopneednext and all(value is None for value in nvars):
                    continue
                # Export current/prev/next values to the child's conf
                extras = dict(zip(self._loopvariable, cvars))
                extras.update((name + '_prev', value)
                              for name, value in zip(self._loopvariable, pvars))
                extras.update((name + '_next', value)
                              for name, value in zip(self._loopvariable, nvars))
                suffix = self._loopsuffix.format(*cvars)
                expanded.extend(node.loopclone(suffix, extras)
                                for node in self._contents)
        return self._actual_content
970
class WorkshareFamily(Family):
    """
    Loop on the Family's content according to a list taken from ``self.conf``.

    The list taken from ``self.conf`` is sliced, and each iteration of the
    loop works on its slice of the list. That's why it's called a workshare...

    Compared to the usual :class:`Family` class, additional attributes are:

    :param str workshareconf: The name of the ``self.conf`` entry to slice
    :param str worksharename: The name of the slice control variable (that is
        automatically added to the childs' ``self.conf``).
    :param int worksharesize: The minimum number of items in each workshare
        (default=1)
    :param worksharelimit: The maximum number of workshares (it might be an
        integer or a name referring to an entry in ``self.conf``
        (default: None, i.e. no limit)
    """

    def __init__(self, **kw):
        logger.debug('WorkshareFamily init %s', repr(self))
        # What should we build the workshare on?
        wsconf = kw.pop('workshareconf', None)
        if not wsconf:
            raise ValueError('The "workshareconf" named argument must be given')
        self._workshareconf = wsconf.split(',')
        # The slice control variable name(s)
        wsname = kw.pop('worksharename', None)
        if not wsname:
            raise ValueError('The "worksharename" named argument must be given')
        self._worksharename = wsname.split(',')
        if len(self._worksharename) != len(self._workshareconf):
            raise ValueError('Inconsistent size between workshareconf and worksharename')
        # Minimum size for a workshare
        self._worksharesize = int(kw.pop('worksharesize', 1))
        # Maximum number of workshares
        self._worksharelimit = kw.pop('worksharelimit', None)
        # Generic init
        super().__init__(**kw)
        # The expanded content is built lazily (see the contents property)
        self._actual_content = None

    def _args_loopclone(self, tagsuffix, extras):  # @UnusedVariable
        baseargs = super()._args_loopclone(tagsuffix, extras)
        baseargs.update(
            workshareconf=','.join(self._workshareconf),
            worksharename=','.join(self._worksharename),
            worksharesize=self._worksharesize,
            worksharelimit=self._worksharelimit,
        )
        return baseargs

    @property
    def contents(self):
        if self._actual_content is None:
            # Fetch the population lists and check their consistency
            populations = [self.conf.get(entry) for entry in self._workshareconf]
            all_sizes = {len(pop) for pop in populations}
            if not (len(all_sizes) == 1):
                raise RuntimeError('Inconsistent sizes in "workshareconf" lists')
            n_population = all_sizes.pop()
            # Number of workshares if worksharesize alone is considered
            sb_ws_number = n_population // self._worksharesize
            # Workshare limit (read from conf when given as a string)
            if isinstance(self._worksharelimit, str):
                lb_ws_number = int(self.conf.get(self._worksharelimit))
            else:
                lb_ws_number = self._worksharelimit or sb_ws_number
            # Final number of workshares (at least one)
            ws_number = max(min(sb_ws_number, lb_ws_number), 1)
            # Spread the population as evenly as possible: the first
            # "remainder" shares get one extra item each
            floorsize, remainder = divmod(n_population, ws_number)
            ws_sizes = [floorsize + (1 if k < remainder else 0)
                        for k in range(ws_number)]
            # Build the family's content
            self._actual_content = expanded = list()
            offset = 0
            for k, share in enumerate(ws_sizes):
                selection = slice(offset, offset + share)
                extras = {name: pop[selection]
                          for name, pop in zip(self._worksharename, populations)}
                offset += share
                ws_suffix = '_ws{:03d}'.format(k + 1)
                expanded.extend(node.loopclone(ws_suffix, extras)
                                for node in self._contents)
        return self._actual_content
1057
class Task(Node):
    """Terminal node including a :class:`Sequence`.

    Compared to the usual :class:`Node` class, additional attributes are:

    :param steps: The active steps (a tuple, or a comma-separated string).
        When empty, the active steps are guessed in :meth:`build`.
    :param str fetch: The name of the "fetch" step (default: ``fetch``)
    :param str compute: The name of the "compute" step (default: ``compute``)
    :param str backup: The name of the "backup" step (default: ``backup``)
    """

    def __init__(self, **kw):
        logger.debug('Task init %s', repr(self))
        # NOTE(review): the Node base class receives the kw dict itself (not
        # **kw) — consistent with Family.__init__; confirm against
        # Node.__init__ before changing this call.
        super().__init__(kw)
        self.steps = kw.pop('steps', tuple())
        self.fetch = kw.pop('fetch', 'fetch')
        self.compute = kw.pop('compute', 'compute')
        self.backup = kw.pop('backup', 'backup')
        # Whatever remains in kw is kept as generic options
        self.options = kw.copy()
        if isinstance(self.steps, str):
            # "a, b,c" -> ('a', 'b', 'c')
            self.steps = tuple(self.steps.replace(' ', '').split(','))

    @property
    def realkind(self):
        return 'task'

    def _args_loopclone(self, tagsuffix, extras):  # @UnusedVariable
        baseargs = super()._args_loopclone(tagsuffix, extras)
        baseargs['steps'] = self.steps
        baseargs['fetch'] = self.fetch
        baseargs['compute'] = self.compute
        baseargs['backup'] = self.backup
        return baseargs

    @property
    def ctx(self):
        # Shortcut to the current ticket's context
        return self.ticket.context

    def build(self):
        """Switch to rundir and check the active steps."""

        t = self.ticket
        t.sh.title(' '.join(('Build', self.realkind, self.tag)))

        # Change actual rundir if specified
        rundir = self.options.get('rundir', None)
        if rundir:
            t.env.RUNDIR = rundir
            t.sh.cd(rundir, create=True)
        t.rundir = t.sh.getcwd()
        print('The current directory is: {}'.format(t.sh.getcwd()))

        # Some attempt to find the current active steps
        if not self.steps:
            new_steps = []
            # Special warmstart/refill steps may be requested through the
            # environment or the configuration
            if (self.env.get(self._locprefix + 'WARMSTART')
                    or self.conf.get('warmstart', False)):
                new_steps.append('warmstart')
            if (self.env.get(self._locprefix + 'REFILL')
                    or self.conf.get('refill', False)):
                new_steps.append('refill')
            if new_steps:
                self.steps = tuple(new_steps)
            else:
                # Default steps: everything when "playing", fetch-only otherwise
                if self.play:
                    self.steps = ('early-{:s}'.format(self.fetch), self.fetch,
                                  self.compute,
                                  self.backup, 'late-{:s}'.format(self.backup))
                else:
                    self.steps = ('early-{:s}'.format(self.fetch), self.fetch)
        self.header('Active steps: ' + ' '.join(self.steps))

    def conf2io(self):
        """Broadcast IO SERVER configuration values to environment."""
        t = self.ticket
        # Only dump the resulting environment if at least one io_* key is set
        triggered = any([i in self.conf
                         for i in ('io_nodes', 'io_companions', 'io_incore_tasks',
                                   'io_openmp')])
        if 'io_nodes' in self.conf:
            t.env.default(VORTEX_IOSERVER_NODES=self.conf.io_nodes)
        # io_tasks, io_companions and io_incore_tasks are mutually exclusive
        # (first one found wins)
        if 'io_tasks' in self.conf:
            t.env.default(VORTEX_IOSERVER_TASKS=self.conf.io_tasks)
        elif 'io_companions' in self.conf:
            t.env.default(VORTEX_IOSERVER_COMPANION_TASKS=self.conf.io_companions)
        elif 'io_incore_tasks' in self.conf:
            t.env.default(VORTEX_IOSERVER_INCORE_TASKS=self.conf.io_incore_tasks)
        if 'io_incore_fixer' in self.conf:
            t.env.default(VORTEX_IOSERVER_INCORE_FIXER=self.conf.io_incore_fixer)
        if 'io_incore_dist' in self.conf:
            t.env.default(VORTEX_IOSERVER_INCORE_DIST=self.conf.io_incore_dist)
        if 'io_openmp' in self.conf:
            t.env.default(VORTEX_IOSERVER_OPENMP=self.conf.io_openmp)
        if triggered and self.mstep_counter <= 1:
            self.nicedump('IOSERVER Environment', **{k: v for k, v in t.env.items()
                                                     if k.startswith('VORTEX_IOSERVER_')})

    def io_poll(self, prefix=None):
        """Complete the polling of data produced by the execution step.

        :param prefix: The sequence of prefixes to poll (nothing is done
            when it is empty/None or when no 'io_poll.todo' file exists).
        """
        sh = self.sh
        if prefix and sh.path.exists('io_poll.todo'):
            for iopr in prefix:
                sh.header('IO poll <' + iopr + '>')
                rc = sh.io_poll(iopr)
                print(rc)
                print(rc.result)
            sh.header('Post-IO Poll directory listing')
            sh.ll(output=False, fatal=False)

    def warmstart(self, **kw):
        """Populates the vortex cache with expected input flow data.

        This is useful when someone wants to restart an experiment from
        another one.

        The warmstart method is systematically called when a task is run. However,
        the warmstart is not always desirable hence the if statement that checks the
        self.steps attribute's content.
        """
        # This method acts as an example: if a warmstart is actually needed,
        # it should be overwritten.
        if 'warmstart' in self.steps:
            pass

    def refill(self, **kw):
        """Populates the vortex cache with external input data.

        The refill method is systematically called when a task is run. However,
        the refill is not always desirable hence the if statement that checks the
        self.steps attribute's content.
        """
        # This method acts as an example: if a refill is actually needed,
        # it should be overwritten.
        if 'refill' in self.steps:
            pass

    def process(self):
        """Abstract method: perform the task to do."""
        # This method acts as an example: it should be overwritten.

        if 'early-fetch' in self.steps or 'fetch' in self.steps:
            # In a multi step job (MTOOL, ...), this step will be run on a
            # transfer node. Consequently, data that may be missing from the
            # local cache must be fetched here. (e.g. GCO's genv, data from the
            # mass archive system, ...). Note: most of the data should be
            # retrieved here since the use of transfer node is costless.
            pass

        if 'fetch' in self.steps:
            # In a multi step job (MTOOL, ...), this step will be run, on a
            # compute node, just before the beginning of computations. It is the
            # appropriate place to fetch data produced by a previous task (the
            # so-called previous task will have to use the 'backup' step
            # (see the later explanations) in order to make such data available
            # in the local cache).
            pass

        if 'compute' in self.steps:
            # The actual computations... (usually a call to the run method of an
            # AlgoComponent)
            pass

        if 'backup' in self.steps or 'late-backup' in self.steps:
            # In a multi step job (MTOOL, ...), this step will be run, on a
            # compute node, just after the computations. It is the appropriate
            # place to put data in the local cache in order to make it available
            # to a subsequent step.
            pass

        if 'late-backup' in self.steps:
            # In a multi step job (MTOOL, ...), this step will be run on a
            # transfer node. Consequently, most of the data should be archived
            # here.
            pass

    def _actual_run(self, sjob_activated=True):
        """Execution driver: build, setup, refill, process, complete."""
        sjob_activated = sjob_activated or self._subjobtag == self.tag
        if sjob_activated:
            if (self.status == NODE_STATUS.RUNNING or
                    (self.status == NODE_STATUS.FAILED and self.fail_at_the_end)):
                try:
                    self.build()
                    self.setup()
                    self.summary()
                    self.warmstart()
                    self.refill()
                    self.process()
                except VortexForceComplete:
                    # A component requested an early but clean completion
                    self.sh.title('Force complete')
                finally:
                    # Always complete, even on error
                    self.complete()
            else:
                # The task already failed in a previous step: do not re-run it
                self.build()
                self.subtitle('This task will not run since it failed in a previous step.')
                raise PreviousFailureError(
                    'Previous error re-raised from tag={:s}'.format(self.tag)
                )
1248
+ class Driver(getbytag.GetByTag, NiceLayout):
1249
+ """Iterable object for a simple scheduling of :class:`Application` objects."""
1250
+
1251
+ _tag_default = 'pilot'
1252
+
1253
    def __init__(self, ticket, nodes=(), rundate=None, iniconf=None,
                 jobname=None, options=None, iniencoding=None):
        """Setup default args value and read config file job.

        :param ticket: The vortex session's ticket
        :param nodes: The sequence of :class:`Node` objects to schedule (a
            non-Node item is wrapped into an anonymous :class:`Family`)
        :param rundate: The run date (may also come from the environment)
        :param iniconf: Path to the configuration file (may also come from
            the environment)
        :param jobname: The job's name (may also come from the environment;
            defaults to 'void')
        :param options: A dictionary of extra options
        :param iniencoding: The configuration file's encoding (may also come
            from the environment)
        """
        self._ticket = t = ticket
        self._conf = None

        # Set default parameters for the actual job
        self._options = dict() if options is None else options
        self._special_prefix = self._options.get('special_prefix', 'OP_').upper()
        self._subjob_tag = self._options.get('subjob_tag', None)
        # A jobassistant object, when provided, overrides prefix and subjob tag
        j_assist = self._options.get('jobassistant', None)
        if j_assist is not None:
            self._special_prefix = j_assist.special_prefix.upper()
            self._subjob_tag = j_assist.subjob_tag
        self._mstep_job_last = self._options.get('mstep_job_last', True)
        self._dryrun = self._options.get('dryrun', False)
        # Fallback to "<prefix>XXX" environment variables for missing arguments
        self._iniconf = iniconf or t.env.get('{:s}INICONF'.format(self._special_prefix))
        self._iniencoding = iniencoding or t.env.get('{:s}INIENCODING'.format(self._special_prefix), None)
        self._jobname = jobname or t.env.get('{:s}JOBNAME'.format(self._special_prefix)) or 'void'
        self._rundate = rundate or t.env.get('{:s}RUNDATE'.format(self._special_prefix))
        self._nicelayout_init(dict())

        # Build the tree to schedule
        self._contents = list()
        fcount = 0
        for x in nodes:
            if isinstance(x, Node):
                self._contents.append(x)
            else:
                # Not a Node: wrap it into an anonymous family ("<tag>.fNN")
                fcount += 1
                self._contents.append(
                    Family(
                        tag='{:s}.f{:02d}'.format(self.tag, fcount),
                        ticket=self.ticket,
                        nodes=x,
                        **dict(self._options)
                    )
                )
1292
    @property
    def ticket(self):
        """The vortex session's ticket given at initialisation time."""
        return self._ticket

    @property
    def conf(self):
        """The Driver's configuration set (``None`` until :meth:`setup` is called)."""
        return self._conf

    @property
    def confdiff(self):
        """Currently simply returns :attr:`conf`."""
        return self.conf

    @property
    def sh(self):
        """Shortcut to the ticket's ``sh`` (system) object."""
        return self.ticket.sh

    @property
    def env(self):
        """Shortcut to the ticket's ``env`` (environment) object."""
        return self.ticket.env

    @property
    def iniconf(self):
        """The configuration file's location."""
        return self._iniconf

    @property
    def iniencoding(self):
        """The configuration file's encoding (may be ``None``)."""
        return self._iniencoding

    @property
    def jobconf(self):
        """The raw job configuration data (set by :meth:`setup`)."""
        return self._jobconf

    @property
    def contents(self):
        """The list of first-level :class:`Node` objects handled by this Driver."""
        return self._contents

    @property
    def jobname(self):
        """The job's name."""
        return self._jobname

    @property
    def rundate(self):
        """The run date (may be ``None``)."""
        return self._rundate
1336
+ def read_config(self, inifile=None, iniencoding=None):
1337
+ """Read specified ``inifile`` initialisation file."""
1338
+ if inifile is None:
1339
+ inifile = self.iniconf
1340
+ if iniencoding is None:
1341
+ iniencoding = self.iniencoding
1342
+ try:
1343
+ iniparser = GenericConfigParser(inifile, encoding=iniencoding)
1344
+ thisconf = iniparser.as_dict(merged=False)
1345
+ except Exception:
1346
+ logger.critical('Could not read config %s', inifile)
1347
+ raise
1348
+ return thisconf
1349
+
1350
    def setup(self, name=None, date=None, verbose=True):
        """Top setup of the current configuration, including at least one name.

        :param name: The job's name (defaults to :attr:`jobname`)
        :param date: The run date (defaults to :attr:`rundate`; may be ``None``)
        :param bool verbose: Print a title banner when True
        """

        jobname = name or self.jobname

        rundate = date or self.rundate
        if rundate is None:
            logger.info('No date provided for this run.')

        if verbose:
            if rundate is None:
                self.sh.title(['Starting job', '', jobname, ])
            else:
                self.sh.title(['Starting job', '', jobname, '', 'date ' + rundate.isoformat()])

        # Read once for all the job configuration file
        if self.iniconf is None:
            logger.warning('This driver does not have any configuration file')
            self._jobconf = dict()
        else:
            self._jobconf = self.read_config(self.iniconf, self.iniencoding)

        # The effective conf is [defaults] updated by the job-specific section
        self._conf = ConfigSet()
        updconf = self.jobconf.get('defaults', dict())
        updconf.update(self.jobconf.get(self.jobname, dict()))
        # Only be verbose during the first step of a multi-step job
        if self.mstep_counter <= 1:
            self.nicedump('Configuration for job ' + self.jobname, **updconf)
        else:
            print("Silent Driver' setup: please refer to the first job step for more details")
        self.conf.update(updconf)

        # Recursively set the configuration tree and contexts
        if rundate is not None:
            self.conf.rundate = rundate
        for node in self.contents:
            node.setconf(self.conf, self.jobconf)
            node.build_context()

        if self.mstep_counter <= 1:
            self.status = NODE_STATUS.READY
            if not self._dryrun:
                self.header('The various nodes were configured. Here is a Tree-View of the Driver:')
                print(self)
1394
+ def run(self):
1395
+ """Assume recursion of nodes `run` methods."""
1396
+ if self._dryrun:
1397
+ raise RuntimeError('This Driver was initialised with "dryrun". ' +
1398
+ 'It is not allowed to call run().')
1399
+ self.status = NODE_STATUS.RUNNING
1400
+ try:
1401
+ for node in self.contents:
1402
+ with node.isolate():
1403
+ node.run(sjob_activated=self._subjob_tag is None)
1404
+ if self._mstep_job_last:
1405
+ self.status = NODE_STATUS.DONE
1406
+ except Exception:
1407
+ if not self._mstep_job_last and self.any_currently_running:
1408
+ self.sh.title("Handling of the job failure in a multi-job context.")
1409
+ self._print_traceback()
1410
+ print()
1411
+ print("Since it is not the last step of this multi-step job, " +
1412
+ "the job failure is ignored... for now.")
1413
+ else:
1414
+ raise
1415
+ else:
1416
+ if self.delayed_error_flag and self._subjob_tag is None and self._mstep_job_last:
1417
+ # Test on _subjob_tag because we do not want to crash in subjobs
1418
+ raise RuntimeError("One or several error occurred during the Driver execution. " +
1419
+ "The exceptions were delayed but now that the Driver ended let's crash !")
1420
+ finally:
1421
+ if self.any_failure:
1422
+ self.sh.title('An error occurred during job...')
1423
+ print('Here is the tree-view of the present Driver:')
1424
+ print(self)