scipion-pyworkflow 3.11.0__py3-none-any.whl → 3.11.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (104)
  1. pyworkflow/apps/__init__.py +29 -0
  2. pyworkflow/apps/pw_manager.py +37 -0
  3. pyworkflow/apps/pw_plot.py +51 -0
  4. pyworkflow/apps/pw_project.py +130 -0
  5. pyworkflow/apps/pw_protocol_list.py +143 -0
  6. pyworkflow/apps/pw_protocol_run.py +51 -0
  7. pyworkflow/apps/pw_run_tests.py +268 -0
  8. pyworkflow/apps/pw_schedule_run.py +322 -0
  9. pyworkflow/apps/pw_sleep.py +37 -0
  10. pyworkflow/apps/pw_sync_data.py +440 -0
  11. pyworkflow/apps/pw_viewer.py +78 -0
  12. pyworkflow/constants.py +1 -1
  13. pyworkflow/gui/__init__.py +36 -0
  14. pyworkflow/gui/browser.py +768 -0
  15. pyworkflow/gui/canvas.py +1190 -0
  16. pyworkflow/gui/dialog.py +981 -0
  17. pyworkflow/gui/form.py +2727 -0
  18. pyworkflow/gui/graph.py +247 -0
  19. pyworkflow/gui/graph_layout.py +271 -0
  20. pyworkflow/gui/gui.py +571 -0
  21. pyworkflow/gui/matplotlib_image.py +233 -0
  22. pyworkflow/gui/plotter.py +247 -0
  23. pyworkflow/gui/project/__init__.py +25 -0
  24. pyworkflow/gui/project/base.py +193 -0
  25. pyworkflow/gui/project/constants.py +139 -0
  26. pyworkflow/gui/project/labels.py +205 -0
  27. pyworkflow/gui/project/project.py +491 -0
  28. pyworkflow/gui/project/searchprotocol.py +240 -0
  29. pyworkflow/gui/project/searchrun.py +181 -0
  30. pyworkflow/gui/project/steps.py +171 -0
  31. pyworkflow/gui/project/utils.py +332 -0
  32. pyworkflow/gui/project/variables.py +179 -0
  33. pyworkflow/gui/project/viewdata.py +472 -0
  34. pyworkflow/gui/project/viewprojects.py +519 -0
  35. pyworkflow/gui/project/viewprotocols.py +2141 -0
  36. pyworkflow/gui/project/viewprotocols_extra.py +562 -0
  37. pyworkflow/gui/text.py +774 -0
  38. pyworkflow/gui/tooltip.py +185 -0
  39. pyworkflow/gui/tree.py +684 -0
  40. pyworkflow/gui/widgets.py +307 -0
  41. pyworkflow/mapper/__init__.py +26 -0
  42. pyworkflow/mapper/mapper.py +226 -0
  43. pyworkflow/mapper/sqlite.py +1583 -0
  44. pyworkflow/mapper/sqlite_db.py +145 -0
  45. pyworkflow/object.py +1 -0
  46. pyworkflow/plugin.py +4 -4
  47. pyworkflow/project/__init__.py +31 -0
  48. pyworkflow/project/config.py +454 -0
  49. pyworkflow/project/manager.py +180 -0
  50. pyworkflow/project/project.py +2095 -0
  51. pyworkflow/project/usage.py +165 -0
  52. pyworkflow/protocol/__init__.py +38 -0
  53. pyworkflow/protocol/bibtex.py +48 -0
  54. pyworkflow/protocol/constants.py +87 -0
  55. pyworkflow/protocol/executor.py +515 -0
  56. pyworkflow/protocol/hosts.py +318 -0
  57. pyworkflow/protocol/launch.py +277 -0
  58. pyworkflow/protocol/package.py +42 -0
  59. pyworkflow/protocol/params.py +781 -0
  60. pyworkflow/protocol/protocol.py +2712 -0
  61. pyworkflow/resources/protlabels.xcf +0 -0
  62. pyworkflow/resources/sprites.png +0 -0
  63. pyworkflow/resources/sprites.xcf +0 -0
  64. pyworkflow/template.py +1 -1
  65. pyworkflow/tests/__init__.py +29 -0
  66. pyworkflow/tests/test_utils.py +25 -0
  67. pyworkflow/tests/tests.py +342 -0
  68. pyworkflow/utils/__init__.py +38 -0
  69. pyworkflow/utils/dataset.py +414 -0
  70. pyworkflow/utils/echo.py +104 -0
  71. pyworkflow/utils/graph.py +169 -0
  72. pyworkflow/utils/log.py +293 -0
  73. pyworkflow/utils/path.py +528 -0
  74. pyworkflow/utils/process.py +154 -0
  75. pyworkflow/utils/profiler.py +92 -0
  76. pyworkflow/utils/progressbar.py +154 -0
  77. pyworkflow/utils/properties.py +618 -0
  78. pyworkflow/utils/reflection.py +129 -0
  79. pyworkflow/utils/utils.py +880 -0
  80. pyworkflow/utils/which.py +229 -0
  81. pyworkflow/webservices/__init__.py +8 -0
  82. pyworkflow/webservices/config.py +8 -0
  83. pyworkflow/webservices/notifier.py +152 -0
  84. pyworkflow/webservices/repository.py +59 -0
  85. pyworkflow/webservices/workflowhub.py +86 -0
  86. pyworkflowtests/tests/__init__.py +0 -0
  87. pyworkflowtests/tests/test_canvas.py +72 -0
  88. pyworkflowtests/tests/test_domain.py +45 -0
  89. pyworkflowtests/tests/test_logs.py +74 -0
  90. pyworkflowtests/tests/test_mappers.py +392 -0
  91. pyworkflowtests/tests/test_object.py +507 -0
  92. pyworkflowtests/tests/test_project.py +42 -0
  93. pyworkflowtests/tests/test_protocol_execution.py +146 -0
  94. pyworkflowtests/tests/test_protocol_export.py +78 -0
  95. pyworkflowtests/tests/test_protocol_output.py +158 -0
  96. pyworkflowtests/tests/test_streaming.py +47 -0
  97. pyworkflowtests/tests/test_utils.py +210 -0
  98. {scipion_pyworkflow-3.11.0.dist-info → scipion_pyworkflow-3.11.2.dist-info}/METADATA +2 -2
  99. scipion_pyworkflow-3.11.2.dist-info/RECORD +162 -0
  100. scipion_pyworkflow-3.11.0.dist-info/RECORD +0 -71
  101. {scipion_pyworkflow-3.11.0.dist-info → scipion_pyworkflow-3.11.2.dist-info}/WHEEL +0 -0
  102. {scipion_pyworkflow-3.11.0.dist-info → scipion_pyworkflow-3.11.2.dist-info}/entry_points.txt +0 -0
  103. {scipion_pyworkflow-3.11.0.dist-info → scipion_pyworkflow-3.11.2.dist-info}/licenses/LICENSE.txt +0 -0
  104. {scipion_pyworkflow-3.11.0.dist-info → scipion_pyworkflow-3.11.2.dist-info}/top_level.txt +0 -0
pyworkflow/protocol/executor.py
@@ -0,0 +1,515 @@
+ # **************************************************************************
+ # *
+ # * Authors: J.M. De la Rosa Trevin (jmdelarosa@cnb.csic.es)
+ # *
+ # * Unidad de Bioinformatica of Centro Nacional de Biotecnologia, CSIC
+ # *
+ # * This program is free software; you can redistribute it and/or modify
+ # * it under the terms of the GNU General Public License as published by
+ # * the Free Software Foundation; either version 3 of the License, or
+ # * (at your option) any later version.
+ # *
+ # * This program is distributed in the hope that it will be useful,
+ # * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ # * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ # * GNU General Public License for more details.
+ # *
+ # * You should have received a copy of the GNU General Public License
+ # * along with this program; if not, write to the Free Software
+ # * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ # * 02111-1307 USA
+ # *
+ # * All comments concerning this program package may be sent to the
+ # * e-mail address 'scipion@cnb.csic.es'
+ # *
+ # **************************************************************************
+ """
+ This module contains the classes for the execution of protocol steps.
+ The basic one runs steps serially, one by one, each after the previous
+ has completed. There is one based on threads, to execute steps in
+ parallel using different threads, and the last one uses MPI processes.
+ """
+ 
+ import logging
+ 
+ logger = logging.getLogger(__name__)
+ import time
+ import datetime
+ import threading
+ import os
+ 
+ import pyworkflow.utils.process as process
+ from pyworkflow.utils.path import getParentFolder, removeExt
+ from pyworkflow.constants import PLUGIN_MODULE_VAR, RUN_JOB_GPU_PARAM_SEARCH
+ from . import constants as cts
+ 
+ from .launch import _submit, UNKNOWN_JOBID, _checkJobStatus
+ 
+ 
+ class StepExecutor:
+     """ Run a list of Protocol steps. """
+ 
+     def __init__(self, hostConfig, **kwargs):
+         self.hostConfig = hostConfig
+         self.gpuList = kwargs.get(cts.GPU_LIST, None)
+         self.protocol = None
+ 
+     def getGpuList(self):
+         """ Return the GPU list assigned to the current thread. """
+         return self.gpuList
+ 
+     def getGpuListStr(self, sep=" "):
+         """ Return the GPU list assigned to the current thread as a string.
+ 
+         :param sep: defaults to a space. Pass "," or another string to use a different separator.
+         """
+         return sep.join(map(str, self.getGpuList()))
+ 
+     def setCudaVisibleDevices(self):
+         """ Set CUDA_VISIBLE_DEVICES in the environment to the GPUs available to the current thread. """
+         # https://developer.nvidia.com/blog/cuda-pro-tip-control-gpu-visibility-cuda_visible_devices/
+         os.environ["CUDA_VISIBLE_DEVICES"] = self.getGpuListStr(",")
+ 
+     def setProtocol(self, protocol):
+         """ Set the protocol, to append active jobs to its jobIds. """
+         self.protocol = protocol
+ 
+     def runJob(self, log, programName, params,
+                numberOfMpi=1, numberOfThreads=1,
+                env=None, cwd=None, executable=None):
+         """ This function is a wrapper around process.runJob,
+         providing the host configuration.
+         """
+         process.runJob(log, programName, params,
+                        numberOfMpi, numberOfThreads,
+                        self.hostConfig,
+                        env=env, cwd=cwd,
+                        gpuList=self._getGPUListForCommand(programName, params),
+                        executable=executable,
+                        context=self.protocol.getSubmitDict())
+ 
+     def _getGPUListForCommand(self, program, params):
+         """ Return the list of GPUs if the program or the params have the GPU placeholder %(GPU)s. """
+         if RUN_JOB_GPU_PARAM_SEARCH in params or RUN_JOB_GPU_PARAM_SEARCH in program:
+             return self.getGpuList()
+         else:
+             return []
+ 
+     def _getRunnable(self, steps, n=1):
+         """ Return up to n steps that are 'new' and whose dependencies
+         have all finished. The returned list is empty if none are ready.
+         """
+         rs = []  # return a list of runnable steps
+ 
+         for s in steps:
+             if (s.getStatus() == cts.STATUS_NEW and
+                     all(steps[i - 1].isFinished() for i in s._prerequisites)):
+ 
+                 if self._isStepRunnable(s):
+                     rs.append(s)
+                     if len(rs) == n:
+                         break
+         return rs
+ 
+     def _isStepRunnable(self, step):
+         """ May be overridden by subclasses to check extra conditions. """
+         return True
+ 
+     def _arePending(self, steps):
+         """ Return True if there are pending steps (either running or waiting)
+         that can be done and thus enable other steps to be executed.
+         """
+         return any(s.isRunning() or s.isWaiting() for s in steps)
+ 
+     def runSteps(self, steps,
+                  stepStartedCallback,
+                  stepFinishedCallback,
+                  stepsCheckCallback,
+                  stepsCheckSecs=3):
+         # Even if this runs the steps in a single thread,
+         # let's follow an approach similar to the parallel one.
+         # In this way we can take into account the dependencies in
+         # the steps graph, and also the case when using streaming.
+ 
+         delta = datetime.timedelta(seconds=stepsCheckSecs)
+         lastCheck = datetime.datetime.now()
+ 
+         while True:
+             # Get a step to run, if there is any
+             runnableSteps = self._getRunnable(steps)
+ 
+             if runnableSteps:
+                 step = runnableSteps[0]
+                 # We found a step to work on, so run it
+                 # (serially, in this same thread).
+                 step.setRunning()
+                 stepStartedCallback(step)
+                 step.run()
+                 doContinue = stepFinishedCallback(step)
+ 
+                 if not doContinue:
+                     break
+ 
+             elif self._arePending(steps):
+                 # We have not found any runnable step, but some are
+                 # still running or waiting for their dependencies.
+                 # So, let's wait a bit to check if something changes.
+                 time.sleep(3)
+             else:
+                 # No steps to run, none running or waiting.
+                 # So, we are done, either failed or finished :)
+                 break
+ 
+             now = datetime.datetime.now()
+             if now - lastCheck > delta:
+                 stepsCheckCallback()
+                 lastCheck = now
+ 
+         stepsCheckCallback()  # one last check to finalize stuff
+ 
+ 
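Note: the serial executor above already honors the steps' dependency graph. As a standalone illustration (not part of the package), the sketch below mimics the selection rule of _getRunnable() and the serial loop of runSteps(); DummyStep and run_serially are hypothetical stand-ins, and the real Step API is richer.

    # Hypothetical stand-ins to illustrate the runSteps()/_getRunnable() contract.
    STATUS_NEW, STATUS_RUNNING, STATUS_FINISHED = "new", "running", "finished"

    class DummyStep:
        def __init__(self, objId, prerequisites=()):
            self._objId = objId
            self._prerequisites = prerequisites  # 1-based indices, as in _getRunnable()
            self._status = STATUS_NEW

        def getObjId(self): return self._objId
        def getStatus(self): return self._status
        def isFinished(self): return self._status == STATUS_FINISHED
        def setRunning(self): self._status = STATUS_RUNNING
        def run(self): self._status = STATUS_FINISHED  # real Steps do actual work here

    def run_serially(steps):
        # Simplified serial loop: repeatedly pick a 'new' step whose
        # prerequisites (1-based indices into the list) have all finished.
        while True:
            ready = [s for s in steps
                     if s.getStatus() == STATUS_NEW
                     and all(steps[i - 1].isFinished() for i in s._prerequisites)]
            if not ready:
                break
            step = ready[0]
            step.setRunning()
            step.run()
            print("finished step", step.getObjId())

    run_serially([DummyStep(1), DummyStep(2, prerequisites=[1])])
    # -> finished step 1
    # -> finished step 2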
+ class StepThread(threading.Thread):
+     """ Thread to run Steps in parallel. """
+ 
+     def __init__(self, step, lock):
+         threading.Thread.__init__(self)
+         self.thId = step.getObjId()
+         self.step = step
+         self.lock = lock
+ 
+     def needsGPU(self):
+         return self.step.needsGPU()
+ 
+     def run(self):
+         error = None
+         try:
+             self.step._run()  # not self.step.run(), to avoid race conditions
+         except Exception as e:
+             error = str(e)
+             logger.error("Couldn't run the code in a thread.", exc_info=e)
+         finally:
+             with self.lock:
+                 if error is None:
+                     self.step.setFinished()
+                 else:
+                     self.step.setFailed(error)
+ 
+ 
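The pattern above (do the work in try/except, record the outcome under a shared lock in finally) is what lets the scheduler below poll step states safely. A minimal standalone sketch of the same pattern, unrelated to the pyworkflow API:

    import threading

    results = {}
    lock = threading.Lock()

    def work(step_id, fn):
        # Same shape as StepThread.run(): catch everything, then record the
        # outcome under the shared lock, so a step is never left unresolved.
        error = None
        try:
            fn()
        except Exception as e:
            error = str(e)
        finally:
            with lock:
                results[step_id] = "finished" if error is None else "failed: " + error

    threads = [threading.Thread(target=work, args=(1, lambda: None), daemon=True),
               threading.Thread(target=work, args=(2, lambda: 1 / 0), daemon=True)]
    for t in threads:
        t.start()
    for t in threads:
        t.join()
    print(results)  # e.g. {1: 'finished', 2: 'failed: division by zero'}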
+ class ThreadStepExecutor(StepExecutor):
+     """ Run steps in parallel using threads. """
+ 
+     def __init__(self, hostConfig, nThreads, **kwargs):
+         StepExecutor.__init__(self, hostConfig, **kwargs)
+         self.numberOfProcs = nThreads
+         # If the gpuList was specified, we need to distribute the GPUs
+         # among all the threads
+         self.gpuDict = {}
+ 
+         self._assignGPUperNode()
+ 
+     def _assignGPUperNode(self):
+         # If we have GPUs
+         if self.gpuList:
+ 
+             nThreads = self.numberOfProcs
+ 
+             # Nodes: one per concurrent step (thread)
+             nodes = range(1, nThreads + 1)
+ 
+             # Number of GPUs
+             nGpu = len(self.gpuList)
+ 
+             # If more GPUs than threads
+             if nGpu > nThreads:
+ 
+                 # Get the ratio: 2 GPUs per thread? 3 GPUs per thread?
+                 # 3 GPUs and 2 threads rounds down to 1 (flooring)
+                 step = int(nGpu / nThreads)
+                 spare = nGpu % nThreads
+                 fromPos = 0
+                 # For each node (concurrent thread)
+                 for node in nodes:
+                     # Store the GPUs per thread:
+                     # GPUs: 0 1 2
+                     # Threads: 2 (step 1)
+                     # Node 1: GPUs 0 1
+                     # Node 2: GPU 2
+ 
+                     extraGpu = 1 if spare > 0 else 0
+                     toPos = fromPos + step + extraGpu
+                     gpusForNode = list(self.gpuList[fromPos:toPos])
+ 
+                     newGpusForNode = self.cleanVoidGPUs(gpusForNode)
+                     if len(newGpusForNode) == 0:
+                         logger.info("GPU slot cancelled: all were void GPUs -> %s" % gpusForNode)
+                     else:
+                         logger.info("GPUs %s assigned to node %s" % (newGpusForNode, node))
+                         self.gpuDict[-node] = newGpusForNode
+ 
+                     fromPos = toPos
+                     spare -= 1
+ 
+             else:
+                 # The gpuList is no longer expanded by repetition to reach nThreads items
+                 if nThreads > nGpu:
+                     logger.warning("GPUs are no longer extended to match the number of threads. "
+                                    "If you want all threads to get a GPU, repeat as many GPUs as threads.")
+                     # newList = self.gpuList * (int(nThreads / nGpu) + 1)
+                     # self.gpuList = newList[:nThreads]
+ 
+                 for index, gpu in enumerate(self.gpuList):
+ 
+                     if gpu == cts.VOID_GPU:
+                         logger.info("Void GPU (%s) found in the list. Skipping the slot." % cts.VOID_GPU)
+                     else:
+                         logger.info("GPU slot for gpu %s." % gpu)
+                         # A negative key means a free GPU slot. It can't be 0
+                         # (since -0 == 0), hence the -index - 1.
+                         self.gpuDict[-index - 1] = [gpu]
+ 
+     def cleanVoidGPUs(self, gpuList):
+         newGPUList = []
+         for gpuid in gpuList:
+             if gpuid == cts.VOID_GPU:
+                 logger.info("Void GPU detected in %s" % gpuList)
+             else:
+                 newGPUList.append(gpuid)
+         return newGPUList
+ 
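A worked example of the split performed above when there are more GPUs than threads (assuming no VOID_GPU entries): divmod() reproduces the flooring plus the spare handling, and negative keys mark free slots. The helper name distribute is hypothetical.

    def distribute(gpus, nThreads):
        # Mirror of _assignGPUperNode() for the nGpu > nThreads case:
        # floor(nGpu / nThreads) GPUs per node; the first (nGpu % nThreads)
        # nodes receive one extra GPU. Negative keys mean 'free slot'.
        step, spare = divmod(len(gpus), nThreads)
        slots, pos = {}, 0
        for node in range(1, nThreads + 1):
            extra = 1 if spare > 0 else 0
            slots[-node] = gpus[pos:pos + step + extra]
            pos += step + extra
            spare -= 1
        return slots

    print(distribute([0, 1, 2], 2))        # {-1: [0, 1], -2: [2]}
    print(distribute([0, 1, 2, 3, 4], 2))  # {-1: [0, 1, 2], -2: [3, 4]}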
+     def getCurrentStepThread(self) -> StepThread:
+         return threading.current_thread()
+ 
+     def getGpuList(self):
+         """ Return the GPU list assigned to the current thread,
+         or an empty list if not using GPUs. """
+ 
+         # Does this thread have GPUs assigned?
+         stepThread = self.getCurrentStepThread()
+ 
+         # If the step does not need the GPU...
+         if not stepThread.needsGPU():
+             # ...return an empty list
+             return []
+         else:
+             nodeId = stepThread.thId
+             if nodeId in self.gpuDict:
+                 gpus = self.gpuDict.get(nodeId)
+                 logger.info("Reusing GPUs (%s) slot for %s" % (gpus, nodeId))
+                 return gpus
+             else:
+                 gpus = self.getFreeGpuSlot(nodeId)
+                 if gpus is None:
+                     logger.warning("Step on node %s is requesting GPUs but there isn't any available. "
+                                    "Review the configuration of threads/GPUs. Returning an empty list." % nodeId)
+                     return []
+                 else:
+                     return gpus
+ 
+     def getFreeGpuSlot(self, stepId=None):
+         """ Return a free GPU slot, or None if there is none available.
+         If stepId is passed, the slot is also booked for that step.
+ 
+         :param stepId: step id to book the GPUs for
+         """
+         for node in self.gpuDict.keys():
+             # A negative key is a free slot. Book it.
+             if node < 0:
+                 gpus = self.gpuDict[node]
+ 
+                 if stepId is not None:
+                     self.gpuDict.pop(node)
+                     self.gpuDict[stepId] = gpus
+                     logger.info("GPUs %s assigned to step %s" % (gpus, stepId))
+                 else:
+                     logger.info("Free GPU slot found at %s" % node)
+                 return gpus
+ 
+         return None
+ 
+     def freeGpusSlot(self, node):
+         gpus = self.gpuDict.get(node, None)
+ 
+         # Some steps/threads do not use GPUs, so they may not be booked
+         # and therefore not in the dictionary
+         if gpus is not None:
+             self.gpuDict.pop(node)
+             self.gpuDict[-node] = gpus
+             logger.info("GPUs %s freed from step %s" % (gpus, node))
+         else:
+             logger.debug("Step id %s not found in GPU slots" % node)
+ 
+     def _isStepRunnable(self, step):
+         """ Overrides the base method to also check GPU availability. """
+ 
+         if self.gpuList and step.needsGPU() and self.getFreeGpuSlot(step.getObjId()) is None:
+             logger.info("Can't run step %s. It needs GPUs and there are no free GPU slots" % step)
+             return False
+ 
+         return True
+ 
+     def runSteps(self, steps,
+                  stepStartedCallback,
+                  stepFinishedCallback,
+                  stepsCheckCallback,
+                  stepsCheckSecs=5):
+         """
+         Create threads and synchronize the steps execution.
+ 
+         :param steps: list of steps to run
+         :param stepStartedCallback: callback to be called before starting each step
+         :param stepFinishedCallback: callback to be called when a step finishes;
+             it should return False to stop the execution
+         :param stepsCheckCallback: callback to check if there are new steps to add (streaming)
+         :param stepsCheckSecs: seconds between stepsCheckCallback calls
+         """
+ 
+         delta = datetime.timedelta(seconds=stepsCheckSecs)
+         lastCheck = datetime.datetime.now()
+ 
+         sharedLock = threading.Lock()
+ 
+         runningSteps = {}  # currently running step in each node ({node: step})
+         freeNodes = list(range(1, self.numberOfProcs + 1))  # available nodes to send jobs
+         logger.info("Execution threads: %s" % freeNodes)
+         logger.info("Running steps using %s threads. 1 thread is used for this main process." % self.numberOfProcs)
+ 
+         while True:
+             # See which of the runningSteps are not really running anymore.
+             # Update them and freeNodes, and call the final callback for each step.
+             with sharedLock:
+                 nodesFinished = [node for node, step in runningSteps.items()
+                                  if not step.isRunning()]
+                 doContinue = True
+                 for node in nodesFinished:
+                     step = runningSteps.pop(node)  # remove entry from runningSteps
+                     freeNodes.append(node)  # the node is available now
+                     self.freeGpusSlot(step.getObjId())
+                     # Notify the step termination and check if we should continue
+                     doContinue = stepFinishedCallback(step)
+                     if not doContinue:
+                         break
+ 
+             if not doContinue:
+                 break
+ 
+             anyLaunched = False
+             # If there are available nodes, send the next runnable steps.
+             with sharedLock:
+                 if freeNodes:
+                     runnableSteps = self._getRunnable(steps, len(freeNodes))
+ 
+                     for step in runnableSteps:
+                         # We found a step to work on, so let's start a new
+                         # thread to do the job and book it.
+                         anyLaunched = True
+                         step.setRunning()
+                         stepStartedCallback(step)
+                         node = freeNodes.pop(0)  # take an available node
+                         runningSteps[node] = step
+                         logger.info("Running step %s on node %s" % (step, node))
+                         t = StepThread(step, sharedLock)
+                         # won't keep the process up if the main thread ends
+                         t.daemon = True
+                         t.start()
+ 
+             anyPending = self._arePending(steps)
+ 
+             if not anyLaunched:
+                 logger.debug("Nothing launched in this loop")
+                 if anyPending:  # nothing running
+                     logger.debug("There are steps pending. Waiting 3 secs")
+                     time.sleep(3)
+                 else:
+                     logger.info("Nothing pending. Breaking the loop.")
+                     break  # yeah, we are done, either failed or finished :)
+ 
+             now = datetime.datetime.now()
+             if now - lastCheck > delta:
+                 stepsCheckCallback()
+                 lastCheck = now
+ 
+         stepsCheckCallback()
+ 
+         # Wait for all threads now.
+         for t in threading.enumerate():
+             if t is not threading.current_thread():
+                 t.join()
+ 
+     def _arePending(self, steps):
+         """ Return True if there are pending steps: either running,
+         waiting, or new (not yet executed). """
+         for s in steps:
+             if s.isRunning() or s.isWaiting() or s.isNew():
+                 return True
+ 
+         return False
+ 
+ 
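The gpuDict bookkeeping used by getFreeGpuSlot()/freeGpusSlot() above can be summarized on a plain dict: a negative key is a free slot and a positive step id marks a booked one. Note that a freed slot is re-keyed as -stepId rather than its original node index. A simplified sketch, with hypothetical book/free helpers:

    gpuDict = {-1: [0, 1], -2: [2]}   # two free slots, as left by _assignGPUperNode()

    def book(stepId):
        for node in list(gpuDict):
            if node < 0:              # free slot found: rebind it to the step
                gpuDict[stepId] = gpuDict.pop(node)
                return gpuDict[stepId]
        return None                   # no free slot: the step is not runnable yet

    def free(stepId):
        if stepId in gpuDict:         # release: the key becomes negative again
            gpuDict[-stepId] = gpuDict.pop(stepId)

    print(book(7), gpuDict)  # [0, 1] {-2: [2], 7: [0, 1]}
    free(7)
    print(gpuDict)           # {-2: [2], -7: [0, 1]}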
+ class QueueStepExecutor(ThreadStepExecutor):
+     def __init__(self, hostConfig, submitDict, nThreads, **kwargs):
+         ThreadStepExecutor.__init__(self, hostConfig, nThreads, **kwargs)
+         self.submitDict = submitDict
+         # Command counter per thread
+         self.threadCommands = {}
+ 
+         if nThreads > 1:
+             self.runJobs = ThreadStepExecutor.runSteps
+         else:
+             self.runJobs = StepExecutor.runSteps
+ 
+         self.renameGpuIds()
+ 
+     def renameGpuIds(self):
+         """ Renumber the GPU ids starting from 0, since the queue engine
+         is the one assigning them.
+         https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars """
+         for threadId, gpuList in self.gpuDict.items():
+             for i in range(len(gpuList)):
+                 self.gpuDict[threadId][i] = i
+ 
+         logger.debug("Updated GPU ids, rebased starting from 0: %s per thread" % self.gpuDict)
+ 
+     def getThreadJobId(self, stepId):
+         """ Return the job id extension assigned to each thread/step. """
+         if stepId not in self.threadCommands:
+             self.threadCommands[stepId] = 0
+ 
+         self.threadCommands[stepId] += 1
+ 
+         return self.threadCommands[stepId]
+ 
+     def runJob(self, log, programName, params, numberOfMpi=1, numberOfThreads=1, env=None, cwd=None, executable=None):
+         threadId = threading.current_thread().thId
+         submitDict = dict(self.hostConfig.getQueuesDefault())
+         submitDict.update(self.submitDict)
+         threadJobId = self.getThreadJobId(threadId)
+         subthreadId = '-%s-%s' % (threadId, threadJobId)
+         submitDict['JOB_NAME'] = submitDict['JOB_NAME'] + subthreadId
+         submitDict['JOB_SCRIPT'] = os.path.abspath(removeExt(submitDict['JOB_SCRIPT']) + subthreadId + ".job")
+         submitDict['JOB_LOGS'] = os.path.join(getParentFolder(submitDict['JOB_SCRIPT']), submitDict['JOB_NAME'])
+ 
+         logger.debug("Variables available for replacement in the submission command are: %s" % submitDict)
+ 
+         submitDict['JOB_COMMAND'] = process.buildRunCommand(programName, params, numberOfMpi,
+                                                             self.hostConfig, env,
+                                                             gpuList=self._getGPUListForCommand(programName, params),
+                                                             context=submitDict)
+ 
+         jobid = _submit(self.hostConfig, submitDict, cwd, env)
+         self.protocol.appendJobId(jobid)  # append active jobs
+         self.protocol._store(self.protocol._jobId)
+ 
+         if (jobid is None) or (jobid == UNKNOWN_JOBID):
+             errorMsg = ("Failed to submit to queue: the returned JOBID (%s) is not valid. "
+                         "There's probably an error interacting with the queue." % jobid)
+             logger.info(errorMsg)
+             raise Exception(errorMsg)
+ 
+         status = cts.STATUS_RUNNING
+         wait = 3
+ 
+         # Check the status while the job is running.
+         # REVIEW this to minimize the time overhead introduced by this delay check
+         while _checkJobStatus(self.hostConfig, jobid) == cts.STATUS_RUNNING:
+             time.sleep(wait)
+             if wait < 300:
+                 wait += 3
+ 
+         self.protocol.removeJobId(jobid)  # After completion, remove inactive jobs.
+         self.protocol._store(self.protocol._jobId)
+ 
+         return status
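One consequence of the incremental wait above (start at 3 s, grow by 3 s per poll, capped at 300 s) is that long queue jobs are polled far less often than with a fixed delay. A small sketch of the cadence, using the same constants (pollsUntil is a hypothetical helper, not part of the package):

    def pollsUntil(seconds):
        # Count the status queries issued for a job that runs for `seconds`,
        # with the wait growing by 3 s per poll up to the 300 s cap.
        wait, elapsed, polls = 3, 0, 0
        while elapsed < seconds:
            elapsed += wait
            polls += 1
            if wait < 300:
                wait += 3
        return polls

    print(pollsUntil(3600))  # 49 queries for a ~1 h job (vs. 1200 at a fixed 3 s)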