hillclimber 0.1.6__cp314-cp314t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
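For illustration, here is a minimal sketch (not part of the package) of how such a file-level "files changed" listing could be reproduced locally with Python's standard zipfile module. The earlier-version wheel filename below is a hypothetical placeholder for whatever previous hillclimber release is being compared against; this page itself only shows the 0.1.6 wheel.

import zipfile

def wheel_files(path: str) -> set[str]:
    # A wheel is a zip archive; return the names of the files it contains.
    with zipfile.ZipFile(path) as zf:
        return set(zf.namelist())

# Hypothetical earlier release, used only for comparison.
old = wheel_files("hillclimber-0.1.5-cp314-cp314t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl")
new = wheel_files("hillclimber-0.1.6-cp314-cp314t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl")

for name in sorted(new - old):
    print("added:  ", name)
for name in sorted(old - new):
    print("removed:", name)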
Files changed (475)
  1. hillclimber/__init__.py +41 -0
  2. hillclimber/actions.py +53 -0
  3. hillclimber/analysis.py +590 -0
  4. hillclimber/biases.py +293 -0
  5. hillclimber/calc.py +22 -0
  6. hillclimber/cvs.py +1070 -0
  7. hillclimber/interfaces.py +133 -0
  8. hillclimber/metadynamics.py +416 -0
  9. hillclimber/nodes.py +6 -0
  10. hillclimber/opes.py +359 -0
  11. hillclimber/pycv.py +362 -0
  12. hillclimber/selectors.py +230 -0
  13. hillclimber/virtual_atoms.py +341 -0
  14. hillclimber-0.1.6.dist-info/METADATA +325 -0
  15. hillclimber-0.1.6.dist-info/RECORD +475 -0
  16. hillclimber-0.1.6.dist-info/WHEEL +6 -0
  17. hillclimber-0.1.6.dist-info/entry_points.txt +8 -0
  18. hillclimber-0.1.6.dist-info/licenses/LICENSE +165 -0
  19. hillclimber-0.1.6.dist-info/sboms/auditwheel.cdx.json +1 -0
  20. hillclimber.libs/libgomp-a49a47f9.so.1.0.0 +0 -0
  21. plumed/__init__.py +104 -0
  22. plumed/_lib/bin/plumed +0 -0
  23. plumed/_lib/bin/plumed-config +9 -0
  24. plumed/_lib/bin/plumed-patch +9 -0
  25. plumed/_lib/include/plumed/adjmat/AdjacencyMatrixBase.h +659 -0
  26. plumed/_lib/include/plumed/adjmat/ContactMatrix.h +59 -0
  27. plumed/_lib/include/plumed/asmjit/arch.h +228 -0
  28. plumed/_lib/include/plumed/asmjit/arm.h +43 -0
  29. plumed/_lib/include/plumed/asmjit/asmjit.h +69 -0
  30. plumed/_lib/include/plumed/asmjit/asmjit_apibegin.h +143 -0
  31. plumed/_lib/include/plumed/asmjit/asmjit_apiend.h +93 -0
  32. plumed/_lib/include/plumed/asmjit/asmjit_build.h +971 -0
  33. plumed/_lib/include/plumed/asmjit/assembler.h +183 -0
  34. plumed/_lib/include/plumed/asmjit/base.h +56 -0
  35. plumed/_lib/include/plumed/asmjit/codebuilder.h +944 -0
  36. plumed/_lib/include/plumed/asmjit/codecompiler.h +767 -0
  37. plumed/_lib/include/plumed/asmjit/codeemitter.h +528 -0
  38. plumed/_lib/include/plumed/asmjit/codeholder.h +777 -0
  39. plumed/_lib/include/plumed/asmjit/constpool.h +286 -0
  40. plumed/_lib/include/plumed/asmjit/cpuinfo.h +402 -0
  41. plumed/_lib/include/plumed/asmjit/func.h +1327 -0
  42. plumed/_lib/include/plumed/asmjit/globals.h +370 -0
  43. plumed/_lib/include/plumed/asmjit/inst.h +137 -0
  44. plumed/_lib/include/plumed/asmjit/logging.h +317 -0
  45. plumed/_lib/include/plumed/asmjit/misc_p.h +103 -0
  46. plumed/_lib/include/plumed/asmjit/moved_string.h +318 -0
  47. plumed/_lib/include/plumed/asmjit/operand.h +1599 -0
  48. plumed/_lib/include/plumed/asmjit/osutils.h +207 -0
  49. plumed/_lib/include/plumed/asmjit/regalloc_p.h +597 -0
  50. plumed/_lib/include/plumed/asmjit/runtime.h +227 -0
  51. plumed/_lib/include/plumed/asmjit/simdtypes.h +1104 -0
  52. plumed/_lib/include/plumed/asmjit/utils.h +1387 -0
  53. plumed/_lib/include/plumed/asmjit/vmem.h +183 -0
  54. plumed/_lib/include/plumed/asmjit/x86.h +45 -0
  55. plumed/_lib/include/plumed/asmjit/x86assembler.h +125 -0
  56. plumed/_lib/include/plumed/asmjit/x86builder.h +117 -0
  57. plumed/_lib/include/plumed/asmjit/x86compiler.h +322 -0
  58. plumed/_lib/include/plumed/asmjit/x86emitter.h +5149 -0
  59. plumed/_lib/include/plumed/asmjit/x86globals.h +535 -0
  60. plumed/_lib/include/plumed/asmjit/x86inst.h +2547 -0
  61. plumed/_lib/include/plumed/asmjit/x86instimpl_p.h +74 -0
  62. plumed/_lib/include/plumed/asmjit/x86internal_p.h +108 -0
  63. plumed/_lib/include/plumed/asmjit/x86logging_p.h +92 -0
  64. plumed/_lib/include/plumed/asmjit/x86misc.h +417 -0
  65. plumed/_lib/include/plumed/asmjit/x86operand.h +1133 -0
  66. plumed/_lib/include/plumed/asmjit/x86regalloc_p.h +734 -0
  67. plumed/_lib/include/plumed/asmjit/zone.h +1157 -0
  68. plumed/_lib/include/plumed/bias/Bias.h +82 -0
  69. plumed/_lib/include/plumed/bias/ReweightBase.h +58 -0
  70. plumed/_lib/include/plumed/blas/blas.h +253 -0
  71. plumed/_lib/include/plumed/blas/def_external.h +61 -0
  72. plumed/_lib/include/plumed/blas/def_internal.h +97 -0
  73. plumed/_lib/include/plumed/blas/real.h +49 -0
  74. plumed/_lib/include/plumed/cltools/CLTool.h +32 -0
  75. plumed/_lib/include/plumed/clusters/ClusteringBase.h +70 -0
  76. plumed/_lib/include/plumed/colvar/Colvar.h +32 -0
  77. plumed/_lib/include/plumed/colvar/ColvarInput.h +68 -0
  78. plumed/_lib/include/plumed/colvar/ColvarShortcut.h +81 -0
  79. plumed/_lib/include/plumed/colvar/CoordinationBase.h +52 -0
  80. plumed/_lib/include/plumed/colvar/MultiColvarTemplate.h +333 -0
  81. plumed/_lib/include/plumed/colvar/PathMSDBase.h +101 -0
  82. plumed/_lib/include/plumed/colvar/RMSDVector.h +78 -0
  83. plumed/_lib/include/plumed/config/Config.h +118 -0
  84. plumed/_lib/include/plumed/config/version.h +9 -0
  85. plumed/_lib/include/plumed/contour/ContourFindingObject.h +87 -0
  86. plumed/_lib/include/plumed/contour/DistanceFromContourBase.h +82 -0
  87. plumed/_lib/include/plumed/contour/FindContour.h +67 -0
  88. plumed/_lib/include/plumed/core/Action.h +540 -0
  89. plumed/_lib/include/plumed/core/ActionAnyorder.h +48 -0
  90. plumed/_lib/include/plumed/core/ActionAtomistic.h +343 -0
  91. plumed/_lib/include/plumed/core/ActionForInterface.h +99 -0
  92. plumed/_lib/include/plumed/core/ActionPilot.h +57 -0
  93. plumed/_lib/include/plumed/core/ActionRegister.h +124 -0
  94. plumed/_lib/include/plumed/core/ActionSet.h +163 -0
  95. plumed/_lib/include/plumed/core/ActionSetup.h +48 -0
  96. plumed/_lib/include/plumed/core/ActionShortcut.h +73 -0
  97. plumed/_lib/include/plumed/core/ActionToGetData.h +59 -0
  98. plumed/_lib/include/plumed/core/ActionToPutData.h +101 -0
  99. plumed/_lib/include/plumed/core/ActionWithArguments.h +140 -0
  100. plumed/_lib/include/plumed/core/ActionWithMatrix.h +87 -0
  101. plumed/_lib/include/plumed/core/ActionWithValue.h +258 -0
  102. plumed/_lib/include/plumed/core/ActionWithVector.h +94 -0
  103. plumed/_lib/include/plumed/core/ActionWithVirtualAtom.h +123 -0
  104. plumed/_lib/include/plumed/core/CLTool.h +177 -0
  105. plumed/_lib/include/plumed/core/CLToolMain.h +102 -0
  106. plumed/_lib/include/plumed/core/CLToolRegister.h +108 -0
  107. plumed/_lib/include/plumed/core/Colvar.h +115 -0
  108. plumed/_lib/include/plumed/core/DataPassingObject.h +94 -0
  109. plumed/_lib/include/plumed/core/DataPassingTools.h +54 -0
  110. plumed/_lib/include/plumed/core/DomainDecomposition.h +120 -0
  111. plumed/_lib/include/plumed/core/ExchangePatterns.h +47 -0
  112. plumed/_lib/include/plumed/core/FlexibleBin.h +63 -0
  113. plumed/_lib/include/plumed/core/GREX.h +61 -0
  114. plumed/_lib/include/plumed/core/GenericMolInfo.h +89 -0
  115. plumed/_lib/include/plumed/core/Group.h +41 -0
  116. plumed/_lib/include/plumed/core/ModuleMap.h +30 -0
  117. plumed/_lib/include/plumed/core/ParallelTaskManager.h +1023 -0
  118. plumed/_lib/include/plumed/core/PbcAction.h +61 -0
  119. plumed/_lib/include/plumed/core/PlumedMain.h +632 -0
  120. plumed/_lib/include/plumed/core/PlumedMainInitializer.h +118 -0
  121. plumed/_lib/include/plumed/core/RegisterBase.h +340 -0
  122. plumed/_lib/include/plumed/core/TargetDist.h +48 -0
  123. plumed/_lib/include/plumed/core/Value.h +547 -0
  124. plumed/_lib/include/plumed/core/WithCmd.h +93 -0
  125. plumed/_lib/include/plumed/dimred/SMACOF.h +55 -0
  126. plumed/_lib/include/plumed/drr/DRR.h +383 -0
  127. plumed/_lib/include/plumed/drr/colvar_UIestimator.h +777 -0
  128. plumed/_lib/include/plumed/fisst/legendre_rule_fast.h +44 -0
  129. plumed/_lib/include/plumed/function/Custom.h +54 -0
  130. plumed/_lib/include/plumed/function/Function.h +85 -0
  131. plumed/_lib/include/plumed/function/FunctionOfMatrix.h +368 -0
  132. plumed/_lib/include/plumed/function/FunctionOfScalar.h +135 -0
  133. plumed/_lib/include/plumed/function/FunctionOfVector.h +296 -0
  134. plumed/_lib/include/plumed/function/FunctionSetup.h +180 -0
  135. plumed/_lib/include/plumed/function/FunctionShortcut.h +130 -0
  136. plumed/_lib/include/plumed/function/FunctionWithSingleArgument.h +165 -0
  137. plumed/_lib/include/plumed/gridtools/ActionWithGrid.h +43 -0
  138. plumed/_lib/include/plumed/gridtools/EvaluateGridFunction.h +99 -0
  139. plumed/_lib/include/plumed/gridtools/FunctionOfGrid.h +295 -0
  140. plumed/_lib/include/plumed/gridtools/GridCoordinatesObject.h +179 -0
  141. plumed/_lib/include/plumed/gridtools/GridSearch.h +135 -0
  142. plumed/_lib/include/plumed/gridtools/Interpolator.h +45 -0
  143. plumed/_lib/include/plumed/gridtools/KDE.h +455 -0
  144. plumed/_lib/include/plumed/gridtools/RDF.h +40 -0
  145. plumed/_lib/include/plumed/gridtools/SumOfKernels.h +219 -0
  146. plumed/_lib/include/plumed/isdb/MetainferenceBase.h +398 -0
  147. plumed/_lib/include/plumed/lapack/def_external.h +207 -0
  148. plumed/_lib/include/plumed/lapack/def_internal.h +388 -0
  149. plumed/_lib/include/plumed/lapack/lapack.h +899 -0
  150. plumed/_lib/include/plumed/lapack/lapack_limits.h +79 -0
  151. plumed/_lib/include/plumed/lapack/real.h +50 -0
  152. plumed/_lib/include/plumed/lepton/CompiledExpression.h +164 -0
  153. plumed/_lib/include/plumed/lepton/CustomFunction.h +143 -0
  154. plumed/_lib/include/plumed/lepton/Exception.h +93 -0
  155. plumed/_lib/include/plumed/lepton/ExpressionProgram.h +137 -0
  156. plumed/_lib/include/plumed/lepton/ExpressionTreeNode.h +145 -0
  157. plumed/_lib/include/plumed/lepton/Lepton.h +85 -0
  158. plumed/_lib/include/plumed/lepton/MSVC_erfc.h +123 -0
  159. plumed/_lib/include/plumed/lepton/Operation.h +1302 -0
  160. plumed/_lib/include/plumed/lepton/ParsedExpression.h +165 -0
  161. plumed/_lib/include/plumed/lepton/Parser.h +111 -0
  162. plumed/_lib/include/plumed/lepton/windowsIncludes.h +73 -0
  163. plumed/_lib/include/plumed/mapping/Path.h +44 -0
  164. plumed/_lib/include/plumed/mapping/PathProjectionCalculator.h +57 -0
  165. plumed/_lib/include/plumed/matrixtools/MatrixOperationBase.h +54 -0
  166. plumed/_lib/include/plumed/matrixtools/MatrixTimesMatrix.h +309 -0
  167. plumed/_lib/include/plumed/matrixtools/MatrixTimesVectorBase.h +365 -0
  168. plumed/_lib/include/plumed/matrixtools/OuterProduct.h +238 -0
  169. plumed/_lib/include/plumed/maze/Core.h +65 -0
  170. plumed/_lib/include/plumed/maze/Loss.h +86 -0
  171. plumed/_lib/include/plumed/maze/Member.h +66 -0
  172. plumed/_lib/include/plumed/maze/Memetic.h +799 -0
  173. plumed/_lib/include/plumed/maze/Optimizer.h +357 -0
  174. plumed/_lib/include/plumed/maze/Random_MT.h +156 -0
  175. plumed/_lib/include/plumed/maze/Tools.h +183 -0
  176. plumed/_lib/include/plumed/metatomic/vesin.h +188 -0
  177. plumed/_lib/include/plumed/molfile/Gromacs.h +2013 -0
  178. plumed/_lib/include/plumed/molfile/endianswap.h +217 -0
  179. plumed/_lib/include/plumed/molfile/fastio.h +683 -0
  180. plumed/_lib/include/plumed/molfile/largefiles.h +78 -0
  181. plumed/_lib/include/plumed/molfile/libmolfile_plugin.h +77 -0
  182. plumed/_lib/include/plumed/molfile/molfile_plugin.h +1034 -0
  183. plumed/_lib/include/plumed/molfile/periodic_table.h +248 -0
  184. plumed/_lib/include/plumed/molfile/readpdb.h +447 -0
  185. plumed/_lib/include/plumed/molfile/vmdplugin.h +236 -0
  186. plumed/_lib/include/plumed/multicolvar/MultiColvarShortcuts.h +45 -0
  187. plumed/_lib/include/plumed/opes/ExpansionCVs.h +79 -0
  188. plumed/_lib/include/plumed/sasa/Sasa.h +32 -0
  189. plumed/_lib/include/plumed/secondarystructure/SecondaryStructureBase.h +372 -0
  190. plumed/_lib/include/plumed/setup/ActionSetup.h +25 -0
  191. plumed/_lib/include/plumed/small_vector/small_vector.h +6114 -0
  192. plumed/_lib/include/plumed/symfunc/CoordinationNumbers.h +41 -0
  193. plumed/_lib/include/plumed/tools/Angle.h +52 -0
  194. plumed/_lib/include/plumed/tools/AtomDistribution.h +138 -0
  195. plumed/_lib/include/plumed/tools/AtomNumber.h +152 -0
  196. plumed/_lib/include/plumed/tools/BiasRepresentation.h +106 -0
  197. plumed/_lib/include/plumed/tools/BitmaskEnum.h +167 -0
  198. plumed/_lib/include/plumed/tools/Brent1DRootSearch.h +159 -0
  199. plumed/_lib/include/plumed/tools/CheckInRange.h +44 -0
  200. plumed/_lib/include/plumed/tools/Citations.h +74 -0
  201. plumed/_lib/include/plumed/tools/ColvarOutput.h +118 -0
  202. plumed/_lib/include/plumed/tools/Communicator.h +316 -0
  203. plumed/_lib/include/plumed/tools/ConjugateGradient.h +80 -0
  204. plumed/_lib/include/plumed/tools/DLLoader.h +79 -0
  205. plumed/_lib/include/plumed/tools/ERMSD.h +73 -0
  206. plumed/_lib/include/plumed/tools/Exception.h +406 -0
  207. plumed/_lib/include/plumed/tools/File.h +28 -0
  208. plumed/_lib/include/plumed/tools/FileBase.h +153 -0
  209. plumed/_lib/include/plumed/tools/FileTools.h +37 -0
  210. plumed/_lib/include/plumed/tools/ForwardDecl.h +54 -0
  211. plumed/_lib/include/plumed/tools/Grid.h +638 -0
  212. plumed/_lib/include/plumed/tools/HistogramBead.h +136 -0
  213. plumed/_lib/include/plumed/tools/IFile.h +117 -0
  214. plumed/_lib/include/plumed/tools/KernelFunctions.h +113 -0
  215. plumed/_lib/include/plumed/tools/Keywords.h +380 -0
  216. plumed/_lib/include/plumed/tools/LatticeReduction.h +66 -0
  217. plumed/_lib/include/plumed/tools/LeptonCall.h +64 -0
  218. plumed/_lib/include/plumed/tools/LinkCells.h +126 -0
  219. plumed/_lib/include/plumed/tools/Log.h +41 -0
  220. plumed/_lib/include/plumed/tools/LoopUnroller.h +163 -0
  221. plumed/_lib/include/plumed/tools/Matrix.h +721 -0
  222. plumed/_lib/include/plumed/tools/MatrixSquareBracketsAccess.h +138 -0
  223. plumed/_lib/include/plumed/tools/MergeVectorTools.h +153 -0
  224. plumed/_lib/include/plumed/tools/Minimise1DBrent.h +244 -0
  225. plumed/_lib/include/plumed/tools/MinimiseBase.h +120 -0
  226. plumed/_lib/include/plumed/tools/MolDataClass.h +51 -0
  227. plumed/_lib/include/plumed/tools/NeighborList.h +112 -0
  228. plumed/_lib/include/plumed/tools/OFile.h +286 -0
  229. plumed/_lib/include/plumed/tools/OpenACC.h +180 -0
  230. plumed/_lib/include/plumed/tools/OpenMP.h +75 -0
  231. plumed/_lib/include/plumed/tools/PDB.h +154 -0
  232. plumed/_lib/include/plumed/tools/Pbc.h +139 -0
  233. plumed/_lib/include/plumed/tools/PlumedHandle.h +105 -0
  234. plumed/_lib/include/plumed/tools/RMSD.h +493 -0
  235. plumed/_lib/include/plumed/tools/Random.h +80 -0
  236. plumed/_lib/include/plumed/tools/RootFindingBase.h +79 -0
  237. plumed/_lib/include/plumed/tools/Stopwatch.h +475 -0
  238. plumed/_lib/include/plumed/tools/Subprocess.h +142 -0
  239. plumed/_lib/include/plumed/tools/SwitchingFunction.h +208 -0
  240. plumed/_lib/include/plumed/tools/Tensor.h +724 -0
  241. plumed/_lib/include/plumed/tools/TokenizedLine.h +123 -0
  242. plumed/_lib/include/plumed/tools/Tools.h +638 -0
  243. plumed/_lib/include/plumed/tools/Torsion.h +55 -0
  244. plumed/_lib/include/plumed/tools/TrajectoryParser.h +118 -0
  245. plumed/_lib/include/plumed/tools/Tree.h +61 -0
  246. plumed/_lib/include/plumed/tools/TypesafePtr.h +463 -0
  247. plumed/_lib/include/plumed/tools/Units.h +167 -0
  248. plumed/_lib/include/plumed/tools/Vector.h +433 -0
  249. plumed/_lib/include/plumed/tools/View.h +296 -0
  250. plumed/_lib/include/plumed/tools/View2D.h +100 -0
  251. plumed/_lib/include/plumed/tools/h36.h +39 -0
  252. plumed/_lib/include/plumed/vatom/ActionWithVirtualAtom.h +32 -0
  253. plumed/_lib/include/plumed/ves/BasisFunctions.h +380 -0
  254. plumed/_lib/include/plumed/ves/CoeffsBase.h +310 -0
  255. plumed/_lib/include/plumed/ves/CoeffsMatrix.h +220 -0
  256. plumed/_lib/include/plumed/ves/CoeffsVector.h +251 -0
  257. plumed/_lib/include/plumed/ves/FermiSwitchingFunction.h +74 -0
  258. plumed/_lib/include/plumed/ves/GridIntegrationWeights.h +50 -0
  259. plumed/_lib/include/plumed/ves/GridLinearInterpolation.h +81 -0
  260. plumed/_lib/include/plumed/ves/GridProjWeights.h +61 -0
  261. plumed/_lib/include/plumed/ves/LinearBasisSetExpansion.h +303 -0
  262. plumed/_lib/include/plumed/ves/Optimizer.h +444 -0
  263. plumed/_lib/include/plumed/ves/TargetDistModifer.h +53 -0
  264. plumed/_lib/include/plumed/ves/TargetDistribution.h +266 -0
  265. plumed/_lib/include/plumed/ves/VesBias.h +545 -0
  266. plumed/_lib/include/plumed/ves/VesTools.h +142 -0
  267. plumed/_lib/include/plumed/ves/WaveletGrid.h +75 -0
  268. plumed/_lib/include/plumed/volumes/ActionVolume.h +268 -0
  269. plumed/_lib/include/plumed/volumes/VolumeShortcut.h +147 -0
  270. plumed/_lib/include/plumed/wrapper/Plumed.h +5025 -0
  271. plumed/_lib/include/plumed/xdrfile/xdrfile.h +663 -0
  272. plumed/_lib/include/plumed/xdrfile/xdrfile_trr.h +89 -0
  273. plumed/_lib/include/plumed/xdrfile/xdrfile_xtc.h +90 -0
  274. plumed/_lib/lib/PythonCVInterface.so +0 -0
  275. plumed/_lib/lib/libplumed.a +0 -0
  276. plumed/_lib/lib/libplumed.so +0 -0
  277. plumed/_lib/lib/libplumedKernel.so +0 -0
  278. plumed/_lib/lib/libplumedWrapper.a +0 -0
  279. plumed/_lib/lib/pkgconfig/plumed.pc +13 -0
  280. plumed/_lib/lib/pkgconfig/plumedInternals.pc +13 -0
  281. plumed/_lib/lib/pkgconfig/plumedWrapper.pc +13 -0
  282. plumed/_lib/lib/plumed/fortran/plumed.f90 +879 -0
  283. plumed/_lib/lib/plumed/fortran/plumed_f08.f90 +2625 -0
  284. plumed/_lib/lib/plumed/modulefile +69 -0
  285. plumed/_lib/lib/plumed/patches/gromacs-2022.5.config +43 -0
  286. plumed/_lib/lib/plumed/patches/gromacs-2022.5.diff/src/gromacs/CMakeLists.txt +543 -0
  287. plumed/_lib/lib/plumed/patches/gromacs-2022.5.diff/src/gromacs/CMakeLists.txt.preplumed +540 -0
  288. plumed/_lib/lib/plumed/patches/gromacs-2022.5.diff/src/gromacs/mdlib/expanded.cpp +1628 -0
  289. plumed/_lib/lib/plumed/patches/gromacs-2022.5.diff/src/gromacs/mdlib/expanded.cpp.preplumed +1590 -0
  290. plumed/_lib/lib/plumed/patches/gromacs-2022.5.diff/src/gromacs/mdlib/expanded.h +103 -0
  291. plumed/_lib/lib/plumed/patches/gromacs-2022.5.diff/src/gromacs/mdlib/expanded.h.preplumed +99 -0
  292. plumed/_lib/lib/plumed/patches/gromacs-2022.5.diff/src/gromacs/mdlib/sim_util.cpp +2527 -0
  293. plumed/_lib/lib/plumed/patches/gromacs-2022.5.diff/src/gromacs/mdlib/sim_util.cpp.preplumed +2513 -0
  294. plumed/_lib/lib/plumed/patches/gromacs-2022.5.diff/src/gromacs/mdrun/legacymdrunoptions.cpp +208 -0
  295. plumed/_lib/lib/plumed/patches/gromacs-2022.5.diff/src/gromacs/mdrun/legacymdrunoptions.cpp.preplumed +175 -0
  296. plumed/_lib/lib/plumed/patches/gromacs-2022.5.diff/src/gromacs/mdrun/legacymdrunoptions.h +408 -0
  297. plumed/_lib/lib/plumed/patches/gromacs-2022.5.diff/src/gromacs/mdrun/legacymdrunoptions.h.preplumed +394 -0
  298. plumed/_lib/lib/plumed/patches/gromacs-2022.5.diff/src/gromacs/mdrun/md.cpp +2348 -0
  299. plumed/_lib/lib/plumed/patches/gromacs-2022.5.diff/src/gromacs/mdrun/md.cpp.preplumed +2091 -0
  300. plumed/_lib/lib/plumed/patches/gromacs-2022.5.diff/src/gromacs/mdrun/minimize.cpp +3573 -0
  301. plumed/_lib/lib/plumed/patches/gromacs-2022.5.diff/src/gromacs/mdrun/minimize.cpp.preplumed +3495 -0
  302. plumed/_lib/lib/plumed/patches/gromacs-2022.5.diff/src/gromacs/mdrun/replicaexchange.cpp +1506 -0
  303. plumed/_lib/lib/plumed/patches/gromacs-2022.5.diff/src/gromacs/mdrun/replicaexchange.cpp.preplumed +1402 -0
  304. plumed/_lib/lib/plumed/patches/gromacs-2022.5.diff/src/gromacs/mdrun/replicaexchange.h +114 -0
  305. plumed/_lib/lib/plumed/patches/gromacs-2022.5.diff/src/gromacs/mdrun/replicaexchange.h.preplumed +106 -0
  306. plumed/_lib/lib/plumed/patches/gromacs-2022.5.diff/src/gromacs/mdrun/rerun.cpp +997 -0
  307. plumed/_lib/lib/plumed/patches/gromacs-2022.5.diff/src/gromacs/mdrun/rerun.cpp.preplumed +906 -0
  308. plumed/_lib/lib/plumed/patches/gromacs-2022.5.diff/src/gromacs/mdrun/runner.cpp +2780 -0
  309. plumed/_lib/lib/plumed/patches/gromacs-2022.5.diff/src/gromacs/mdrun/runner.cpp.preplumed +2738 -0
  310. plumed/_lib/lib/plumed/patches/gromacs-2022.5.diff/src/gromacs/modularsimulator/expandedensembleelement.cpp +224 -0
  311. plumed/_lib/lib/plumed/patches/gromacs-2022.5.diff/src/gromacs/modularsimulator/expandedensembleelement.cpp.preplumed +222 -0
  312. plumed/_lib/lib/plumed/patches/gromacs-2023.5.config +43 -0
  313. plumed/_lib/lib/plumed/patches/gromacs-2023.5.diff/src/gromacs/CMakeLists.txt +549 -0
  314. plumed/_lib/lib/plumed/patches/gromacs-2023.5.diff/src/gromacs/CMakeLists.txt.preplumed +546 -0
  315. plumed/_lib/lib/plumed/patches/gromacs-2023.5.diff/src/gromacs/mdlib/expanded.cpp +1632 -0
  316. plumed/_lib/lib/plumed/patches/gromacs-2023.5.diff/src/gromacs/mdlib/expanded.cpp.preplumed +1594 -0
  317. plumed/_lib/lib/plumed/patches/gromacs-2023.5.diff/src/gromacs/mdlib/expanded.h +104 -0
  318. plumed/_lib/lib/plumed/patches/gromacs-2023.5.diff/src/gromacs/mdlib/expanded.h.preplumed +100 -0
  319. plumed/_lib/lib/plumed/patches/gromacs-2023.5.diff/src/gromacs/mdlib/sim_util.cpp +2624 -0
  320. plumed/_lib/lib/plumed/patches/gromacs-2023.5.diff/src/gromacs/mdlib/sim_util.cpp.preplumed +2610 -0
  321. plumed/_lib/lib/plumed/patches/gromacs-2023.5.diff/src/gromacs/mdrun/legacymdrunoptions.cpp +208 -0
  322. plumed/_lib/lib/plumed/patches/gromacs-2023.5.diff/src/gromacs/mdrun/legacymdrunoptions.cpp.preplumed +175 -0
  323. plumed/_lib/lib/plumed/patches/gromacs-2023.5.diff/src/gromacs/mdrun/legacymdrunoptions.h +409 -0
  324. plumed/_lib/lib/plumed/patches/gromacs-2023.5.diff/src/gromacs/mdrun/legacymdrunoptions.h.preplumed +395 -0
  325. plumed/_lib/lib/plumed/patches/gromacs-2023.5.diff/src/gromacs/mdrun/md.cpp +2419 -0
  326. plumed/_lib/lib/plumed/patches/gromacs-2023.5.diff/src/gromacs/mdrun/md.cpp.preplumed +2164 -0
  327. plumed/_lib/lib/plumed/patches/gromacs-2023.5.diff/src/gromacs/mdrun/minimize.cpp +3546 -0
  328. plumed/_lib/lib/plumed/patches/gromacs-2023.5.diff/src/gromacs/mdrun/minimize.cpp.preplumed +3468 -0
  329. plumed/_lib/lib/plumed/patches/gromacs-2023.5.diff/src/gromacs/mdrun/replicaexchange.cpp +1513 -0
  330. plumed/_lib/lib/plumed/patches/gromacs-2023.5.diff/src/gromacs/mdrun/replicaexchange.cpp.preplumed +1409 -0
  331. plumed/_lib/lib/plumed/patches/gromacs-2023.5.diff/src/gromacs/mdrun/replicaexchange.h +114 -0
  332. plumed/_lib/lib/plumed/patches/gromacs-2023.5.diff/src/gromacs/mdrun/replicaexchange.h.preplumed +106 -0
  333. plumed/_lib/lib/plumed/patches/gromacs-2023.5.diff/src/gromacs/mdrun/rerun.cpp +991 -0
  334. plumed/_lib/lib/plumed/patches/gromacs-2023.5.diff/src/gromacs/mdrun/rerun.cpp.preplumed +900 -0
  335. plumed/_lib/lib/plumed/patches/gromacs-2023.5.diff/src/gromacs/mdrun/runner.cpp +2895 -0
  336. plumed/_lib/lib/plumed/patches/gromacs-2023.5.diff/src/gromacs/mdrun/runner.cpp.preplumed +2849 -0
  337. plumed/_lib/lib/plumed/patches/gromacs-2023.5.diff/src/gromacs/modularsimulator/expandedensembleelement.cpp +224 -0
  338. plumed/_lib/lib/plumed/patches/gromacs-2023.5.diff/src/gromacs/modularsimulator/expandedensembleelement.cpp.preplumed +222 -0
  339. plumed/_lib/lib/plumed/patches/gromacs-2023.5.diff/src/gromacs/taskassignment/decidegpuusage.cpp +886 -0
  340. plumed/_lib/lib/plumed/patches/gromacs-2023.5.diff/src/gromacs/taskassignment/decidegpuusage.cpp.preplumed +880 -0
  341. plumed/_lib/lib/plumed/patches/gromacs-2023.5.diff/src/gromacs/taskassignment/include/gromacs/taskassignment/decidegpuusage.h +347 -0
  342. plumed/_lib/lib/plumed/patches/gromacs-2023.5.diff/src/gromacs/taskassignment/include/gromacs/taskassignment/decidegpuusage.h.preplumed +345 -0
  343. plumed/_lib/lib/plumed/patches/gromacs-2024.3.config +43 -0
  344. plumed/_lib/lib/plumed/patches/gromacs-2024.3.diff/src/gromacs/CMakeLists.txt +575 -0
  345. plumed/_lib/lib/plumed/patches/gromacs-2024.3.diff/src/gromacs/CMakeLists.txt.preplumed +572 -0
  346. plumed/_lib/lib/plumed/patches/gromacs-2024.3.diff/src/gromacs/mdlib/expanded.cpp +1632 -0
  347. plumed/_lib/lib/plumed/patches/gromacs-2024.3.diff/src/gromacs/mdlib/expanded.cpp.preplumed +1594 -0
  348. plumed/_lib/lib/plumed/patches/gromacs-2024.3.diff/src/gromacs/mdlib/expanded.h +104 -0
  349. plumed/_lib/lib/plumed/patches/gromacs-2024.3.diff/src/gromacs/mdlib/expanded.h.preplumed +100 -0
  350. plumed/_lib/lib/plumed/patches/gromacs-2024.3.diff/src/gromacs/mdlib/sim_util.cpp +2564 -0
  351. plumed/_lib/lib/plumed/patches/gromacs-2024.3.diff/src/gromacs/mdlib/sim_util.cpp.preplumed +2550 -0
  352. plumed/_lib/lib/plumed/patches/gromacs-2024.3.diff/src/gromacs/mdrun/legacymdrunoptions.cpp +208 -0
  353. plumed/_lib/lib/plumed/patches/gromacs-2024.3.diff/src/gromacs/mdrun/legacymdrunoptions.cpp.preplumed +175 -0
  354. plumed/_lib/lib/plumed/patches/gromacs-2024.3.diff/src/gromacs/mdrun/legacymdrunoptions.h +410 -0
  355. plumed/_lib/lib/plumed/patches/gromacs-2024.3.diff/src/gromacs/mdrun/legacymdrunoptions.h.preplumed +396 -0
  356. plumed/_lib/lib/plumed/patches/gromacs-2024.3.diff/src/gromacs/mdrun/md.cpp +2435 -0
  357. plumed/_lib/lib/plumed/patches/gromacs-2024.3.diff/src/gromacs/mdrun/md.cpp.preplumed +2187 -0
  358. plumed/_lib/lib/plumed/patches/gromacs-2024.3.diff/src/gromacs/mdrun/minimize.cpp +3592 -0
  359. plumed/_lib/lib/plumed/patches/gromacs-2024.3.diff/src/gromacs/mdrun/minimize.cpp.preplumed +3514 -0
  360. plumed/_lib/lib/plumed/patches/gromacs-2024.3.diff/src/gromacs/mdrun/replicaexchange.cpp +1513 -0
  361. plumed/_lib/lib/plumed/patches/gromacs-2024.3.diff/src/gromacs/mdrun/replicaexchange.cpp.preplumed +1409 -0
  362. plumed/_lib/lib/plumed/patches/gromacs-2024.3.diff/src/gromacs/mdrun/replicaexchange.h +114 -0
  363. plumed/_lib/lib/plumed/patches/gromacs-2024.3.diff/src/gromacs/mdrun/replicaexchange.h.preplumed +106 -0
  364. plumed/_lib/lib/plumed/patches/gromacs-2024.3.diff/src/gromacs/mdrun/rerun.cpp +958 -0
  365. plumed/_lib/lib/plumed/patches/gromacs-2024.3.diff/src/gromacs/mdrun/rerun.cpp.preplumed +929 -0
  366. plumed/_lib/lib/plumed/patches/gromacs-2024.3.diff/src/gromacs/mdrun/runner.cpp +2987 -0
  367. plumed/_lib/lib/plumed/patches/gromacs-2024.3.diff/src/gromacs/mdrun/runner.cpp.preplumed +2941 -0
  368. plumed/_lib/lib/plumed/patches/gromacs-2024.3.diff/src/gromacs/modularsimulator/expandedensembleelement.cpp +224 -0
  369. plumed/_lib/lib/plumed/patches/gromacs-2024.3.diff/src/gromacs/modularsimulator/expandedensembleelement.cpp.preplumed +222 -0
  370. plumed/_lib/lib/plumed/patches/gromacs-2024.3.diff/src/gromacs/taskassignment/decidegpuusage.cpp +904 -0
  371. plumed/_lib/lib/plumed/patches/gromacs-2024.3.diff/src/gromacs/taskassignment/decidegpuusage.cpp.preplumed +898 -0
  372. plumed/_lib/lib/plumed/patches/gromacs-2024.3.diff/src/gromacs/taskassignment/include/gromacs/taskassignment/decidegpuusage.h +353 -0
  373. plumed/_lib/lib/plumed/patches/gromacs-2024.3.diff/src/gromacs/taskassignment/include/gromacs/taskassignment/decidegpuusage.h.preplumed +351 -0
  374. plumed/_lib/lib/plumed/patches/gromacs-2025.0.config +39 -0
  375. plumed/_lib/lib/plumed/patches/gromacs-2025.0.diff/cmake/gmxManagePlumed.cmake +82 -0
  376. plumed/_lib/lib/plumed/patches/gromacs-2025.0.diff/cmake/gmxManagePlumed.cmake.preplumed +82 -0
  377. plumed/_lib/lib/plumed/patches/gromacs-2025.0.diff/src/gromacs/applied_forces/plumed/plumedMDModule.cpp +162 -0
  378. plumed/_lib/lib/plumed/patches/gromacs-2025.0.diff/src/gromacs/applied_forces/plumed/plumedMDModule.cpp.preplumed +154 -0
  379. plumed/_lib/lib/plumed/patches/gromacs-2025.0.diff/src/gromacs/applied_forces/plumed/plumedOptions.cpp +107 -0
  380. plumed/_lib/lib/plumed/patches/gromacs-2025.0.diff/src/gromacs/applied_forces/plumed/plumedOptions.cpp.preplumed +99 -0
  381. plumed/_lib/lib/plumed/patches/gromacs-2025.0.diff/src/gromacs/applied_forces/plumed/plumedOptions.h +120 -0
  382. plumed/_lib/lib/plumed/patches/gromacs-2025.0.diff/src/gromacs/applied_forces/plumed/plumedOptions.h.preplumed +111 -0
  383. plumed/_lib/lib/plumed/patches/gromacs-2025.0.diff/src/gromacs/applied_forces/plumed/plumedforceprovider.cpp +215 -0
  384. plumed/_lib/lib/plumed/patches/gromacs-2025.0.diff/src/gromacs/applied_forces/plumed/plumedforceprovider.cpp.preplumed +197 -0
  385. plumed/_lib/lib/plumed/patches/gromacs-2025.0.diff/src/gromacs/applied_forces/plumed/plumedforceprovider.h +87 -0
  386. plumed/_lib/lib/plumed/patches/gromacs-2025.0.diff/src/gromacs/applied_forces/plumed/plumedforceprovider.h.preplumed +86 -0
  387. plumed/_lib/lib/plumed/patches/gromacs-2025.0.diff/src/gromacs/mdrun/runner.cpp +2971 -0
  388. plumed/_lib/lib/plumed/patches/gromacs-2025.0.diff/src/gromacs/mdrun/runner.cpp.preplumed +2970 -0
  389. plumed/_lib/lib/plumed/patches/gromacs-2025.0.diff/src/gromacs/mdrunutility/mdmodulesnotifiers.h +430 -0
  390. plumed/_lib/lib/plumed/patches/gromacs-2025.0.diff/src/gromacs/mdrunutility/mdmodulesnotifiers.h.preplumed +429 -0
  391. plumed/_lib/lib/plumed/patches/namd-2.12.config +30 -0
  392. plumed/_lib/lib/plumed/patches/namd-2.12.diff +267 -0
  393. plumed/_lib/lib/plumed/patches/namd-2.13.config +30 -0
  394. plumed/_lib/lib/plumed/patches/namd-2.13.diff +267 -0
  395. plumed/_lib/lib/plumed/patches/namd-2.14.config +30 -0
  396. plumed/_lib/lib/plumed/patches/namd-2.14.diff +268 -0
  397. plumed/_lib/lib/plumed/patches/patch.sh +500 -0
  398. plumed/_lib/lib/plumed/patches/qespresso-5.0.2.config +25 -0
  399. plumed/_lib/lib/plumed/patches/qespresso-5.0.2.diff/PW/src/forces.f90 +368 -0
  400. plumed/_lib/lib/plumed/patches/qespresso-5.0.2.diff/PW/src/forces.f90.preplumed +366 -0
  401. plumed/_lib/lib/plumed/patches/qespresso-5.0.2.diff/PW/src/plugin_forces.f90 +71 -0
  402. plumed/_lib/lib/plumed/patches/qespresso-5.0.2.diff/PW/src/plugin_forces.f90.preplumed +24 -0
  403. plumed/_lib/lib/plumed/patches/qespresso-5.0.2.diff/PW/src/plugin_initialization.f90 +62 -0
  404. plumed/_lib/lib/plumed/patches/qespresso-5.0.2.diff/PW/src/plugin_initialization.f90.preplumed +21 -0
  405. plumed/_lib/lib/plumed/patches/qespresso-5.0.2.diff/PW/src/pwscf.f90 +189 -0
  406. plumed/_lib/lib/plumed/patches/qespresso-5.0.2.diff/PW/src/pwscf.f90.preplumed +185 -0
  407. plumed/_lib/lib/plumed/patches/qespresso-6.2.config +26 -0
  408. plumed/_lib/lib/plumed/patches/qespresso-6.2.diff/PW/src/forces.f90 +422 -0
  409. plumed/_lib/lib/plumed/patches/qespresso-6.2.diff/PW/src/forces.f90.preplumed +420 -0
  410. plumed/_lib/lib/plumed/patches/qespresso-6.2.diff/PW/src/plugin_ext_forces.f90 +70 -0
  411. plumed/_lib/lib/plumed/patches/qespresso-6.2.diff/PW/src/plugin_ext_forces.f90.preplumed +23 -0
  412. plumed/_lib/lib/plumed/patches/qespresso-6.2.diff/PW/src/plugin_initialization.f90 +62 -0
  413. plumed/_lib/lib/plumed/patches/qespresso-6.2.diff/PW/src/plugin_initialization.f90.preplumed +21 -0
  414. plumed/_lib/lib/plumed/patches/qespresso-6.2.diff/PW/src/run_pwscf.f90 +233 -0
  415. plumed/_lib/lib/plumed/patches/qespresso-6.2.diff/PW/src/run_pwscf.f90.preplumed +230 -0
  416. plumed/_lib/lib/plumed/patches/qespresso-7.0.config +28 -0
  417. plumed/_lib/lib/plumed/patches/qespresso-7.0.diff/Modules/Makefile +175 -0
  418. plumed/_lib/lib/plumed/patches/qespresso-7.0.diff/Modules/Makefile.preplumed +171 -0
  419. plumed/_lib/lib/plumed/patches/qespresso-7.0.diff/PW/src/forces.f90 +486 -0
  420. plumed/_lib/lib/plumed/patches/qespresso-7.0.diff/PW/src/forces.f90.preplumed +484 -0
  421. plumed/_lib/lib/plumed/patches/qespresso-7.0.diff/PW/src/plugin_ext_forces.f90 +74 -0
  422. plumed/_lib/lib/plumed/patches/qespresso-7.0.diff/PW/src/plugin_ext_forces.f90.preplumed +23 -0
  423. plumed/_lib/lib/plumed/patches/qespresso-7.0.diff/PW/src/plugin_initialization.f90 +64 -0
  424. plumed/_lib/lib/plumed/patches/qespresso-7.0.diff/PW/src/plugin_initialization.f90.preplumed +21 -0
  425. plumed/_lib/lib/plumed/patches/qespresso-7.0.diff/PW/src/run_pwscf.f90 +532 -0
  426. plumed/_lib/lib/plumed/patches/qespresso-7.0.diff/PW/src/run_pwscf.f90.preplumed +518 -0
  427. plumed/_lib/lib/plumed/patches/qespresso-7.2.config +28 -0
  428. plumed/_lib/lib/plumed/patches/qespresso-7.2.diff/Modules/Makefile +249 -0
  429. plumed/_lib/lib/plumed/patches/qespresso-7.2.diff/Modules/Makefile.preplumed +244 -0
  430. plumed/_lib/lib/plumed/patches/qespresso-7.2.diff/PW/src/forces.f90 +532 -0
  431. plumed/_lib/lib/plumed/patches/qespresso-7.2.diff/PW/src/forces.f90.preplumed +535 -0
  432. plumed/_lib/lib/plumed/patches/qespresso-7.2.diff/PW/src/plugin_ext_forces.f90 +74 -0
  433. plumed/_lib/lib/plumed/patches/qespresso-7.2.diff/PW/src/plugin_ext_forces.f90.preplumed +23 -0
  434. plumed/_lib/lib/plumed/patches/qespresso-7.2.diff/PW/src/plugin_initialization.f90 +64 -0
  435. plumed/_lib/lib/plumed/patches/qespresso-7.2.diff/PW/src/plugin_initialization.f90.preplumed +21 -0
  436. plumed/_lib/lib/plumed/patches/qespresso-7.2.diff/PW/src/run_pwscf.f90 +569 -0
  437. plumed/_lib/lib/plumed/patches/qespresso-7.2.diff/PW/src/run_pwscf.f90.preplumed +560 -0
  438. plumed/_lib/lib/plumed/plumed-config +9 -0
  439. plumed/_lib/lib/plumed/plumed-mklib +9 -0
  440. plumed/_lib/lib/plumed/plumed-newcv +9 -0
  441. plumed/_lib/lib/plumed/plumed-partial_tempering +9 -0
  442. plumed/_lib/lib/plumed/plumed-patch +9 -0
  443. plumed/_lib/lib/plumed/plumed-runtime +0 -0
  444. plumed/_lib/lib/plumed/plumed-selector +9 -0
  445. plumed/_lib/lib/plumed/plumed-vim2html +9 -0
  446. plumed/_lib/lib/plumed/scripts/config.sh +126 -0
  447. plumed/_lib/lib/plumed/scripts/mklib.sh +175 -0
  448. plumed/_lib/lib/plumed/scripts/newcv.sh +26 -0
  449. plumed/_lib/lib/plumed/scripts/partial_tempering.sh +319 -0
  450. plumed/_lib/lib/plumed/scripts/patch.sh +4 -0
  451. plumed/_lib/lib/plumed/scripts/selector.sh +234 -0
  452. plumed/_lib/lib/plumed/scripts/vim2html.sh +190 -0
  453. plumed/_lib/lib/plumed/src/colvar/Template.cpp +116 -0
  454. plumed/_lib/lib/plumed/src/config/compile_options.sh +3 -0
  455. plumed/_lib/lib/plumed/src/config/config.txt +179 -0
  456. plumed/_lib/lib/plumed/src/lib/Plumed.cmake +8 -0
  457. plumed/_lib/lib/plumed/src/lib/Plumed.cmake.runtime +5 -0
  458. plumed/_lib/lib/plumed/src/lib/Plumed.cmake.shared +5 -0
  459. plumed/_lib/lib/plumed/src/lib/Plumed.cmake.static +5 -0
  460. plumed/_lib/lib/plumed/src/lib/Plumed.inc +8 -0
  461. plumed/_lib/lib/plumed/src/lib/Plumed.inc.runtime +5 -0
  462. plumed/_lib/lib/plumed/src/lib/Plumed.inc.shared +5 -0
  463. plumed/_lib/lib/plumed/src/lib/Plumed.inc.static +5 -0
  464. plumed/_lib/lib/plumed/vim/scripts.vim +6 -0
  465. plumed/_plumed_core.cpython-311-aarch64-linux-gnu.so +0 -0
  466. plumed/_plumed_core.cpython-312-aarch64-linux-gnu.so +0 -0
  467. plumed/_plumed_core.cpython-313-aarch64-linux-gnu.so +0 -0
  468. plumed/_plumed_core.cpython-314-aarch64-linux-gnu.so +0 -0
  469. plumed/_plumed_core.cpython-314t-aarch64-linux-gnu.so +0 -0
  470. plumedCommunications.cpython-311-aarch64-linux-gnu.so +0 -0
  471. plumedCommunications.cpython-312-aarch64-linux-gnu.so +0 -0
  472. plumedCommunications.cpython-313-aarch64-linux-gnu.so +0 -0
  473. plumedCommunications.cpython-314-aarch64-linux-gnu.so +0 -0
  474. plumedCommunications.cpython-314t-aarch64-linux-gnu.so +0 -0
  475. plumedCommunications.pyi +431 -0
@@ -0,0 +1,2527 @@
+ /*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright 1991- The GROMACS Authors
+ * and the project initiators Erik Lindahl, Berk Hess and David van der Spoel.
+ * Consult the AUTHORS/COPYING files and https://www.gromacs.org for details.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * https://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at https://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out https://www.gromacs.org.
+ */
+ #include "gmxpre.h"
+
+ #include "config.h"
+
+ #include <cmath>
+ #include <cstdint>
+ #include <cstdio>
+ #include <cstring>
+
+ #include <array>
+ #include <optional>
+
+ #include "gromacs/applied_forces/awh/awh.h"
+ #include "gromacs/domdec/dlbtiming.h"
+ #include "gromacs/domdec/domdec.h"
+ #include "gromacs/domdec/domdec_struct.h"
+ #include "gromacs/domdec/gpuhaloexchange.h"
+ #include "gromacs/domdec/partition.h"
+ #include "gromacs/essentialdynamics/edsam.h"
+ #include "gromacs/ewald/pme.h"
+ #include "gromacs/ewald/pme_coordinate_receiver_gpu.h"
+ #include "gromacs/ewald/pme_pp.h"
+ #include "gromacs/ewald/pme_pp_comm_gpu.h"
+ #include "gromacs/gmxlib/network.h"
+ #include "gromacs/gmxlib/nonbonded/nb_free_energy.h"
+ #include "gromacs/gmxlib/nonbonded/nonbonded.h"
+ #include "gromacs/gmxlib/nrnb.h"
+ #include "gromacs/gpu_utils/gpu_utils.h"
+ #include "gromacs/imd/imd.h"
+ #include "gromacs/listed_forces/disre.h"
+ #include "gromacs/listed_forces/listed_forces_gpu.h"
+ #include "gromacs/listed_forces/listed_forces.h"
+ #include "gromacs/listed_forces/orires.h"
+ #include "gromacs/math/arrayrefwithpadding.h"
+ #include "gromacs/math/functions.h"
+ #include "gromacs/math/units.h"
+ #include "gromacs/math/vec.h"
+ #include "gromacs/math/vecdump.h"
+ #include "gromacs/mdlib/calcmu.h"
+ #include "gromacs/mdlib/calcvir.h"
+ #include "gromacs/mdlib/constr.h"
+ #include "gromacs/mdlib/dispersioncorrection.h"
+ #include "gromacs/mdlib/enerdata_utils.h"
+ #include "gromacs/mdlib/force.h"
+ #include "gromacs/mdlib/force_flags.h"
+ #include "gromacs/mdlib/forcerec.h"
+ #include "gromacs/mdlib/gmx_omp_nthreads.h"
+ #include "gromacs/mdlib/update.h"
+ #include "gromacs/mdlib/vsite.h"
+ #include "gromacs/mdlib/wall.h"
+ #include "gromacs/mdlib/wholemoleculetransform.h"
+ #include "gromacs/mdtypes/commrec.h"
+ #include "gromacs/mdtypes/enerdata.h"
+ #include "gromacs/mdtypes/forcebuffers.h"
+ #include "gromacs/mdtypes/forceoutput.h"
+ #include "gromacs/mdtypes/forcerec.h"
+ #include "gromacs/mdtypes/iforceprovider.h"
+ #include "gromacs/mdtypes/inputrec.h"
+ #include "gromacs/mdtypes/md_enums.h"
+ #include "gromacs/mdtypes/mdatom.h"
+ #include "gromacs/mdtypes/multipletimestepping.h"
+ #include "gromacs/mdtypes/simulation_workload.h"
+ #include "gromacs/mdtypes/state.h"
+ #include "gromacs/mdtypes/state_propagator_data_gpu.h"
+ #include "gromacs/nbnxm/gpu_data_mgmt.h"
+ #include "gromacs/nbnxm/nbnxm.h"
+ #include "gromacs/nbnxm/nbnxm_gpu.h"
+ #include "gromacs/pbcutil/ishift.h"
+ #include "gromacs/pbcutil/pbc.h"
+ #include "gromacs/pulling/pull.h"
+ #include "gromacs/pulling/pull_rotation.h"
+ #include "gromacs/timing/cyclecounter.h"
+ #include "gromacs/timing/gpu_timing.h"
+ #include "gromacs/timing/wallcycle.h"
+ #include "gromacs/timing/wallcyclereporting.h"
+ #include "gromacs/timing/walltime_accounting.h"
+ #include "gromacs/topology/topology.h"
+ #include "gromacs/utility/arrayref.h"
+ #include "gromacs/utility/basedefinitions.h"
+ #include "gromacs/utility/cstringutil.h"
+ #include "gromacs/utility/exceptions.h"
+ #include "gromacs/utility/fatalerror.h"
+ #include "gromacs/utility/fixedcapacityvector.h"
+ #include "gromacs/utility/gmxassert.h"
+ #include "gromacs/utility/gmxmpi.h"
+ #include "gromacs/utility/logger.h"
+ #include "gromacs/utility/smalloc.h"
+ #include "gromacs/utility/strconvert.h"
+ #include "gromacs/utility/stringutil.h"
+ #include "gromacs/utility/sysinfo.h"
+
+ #include "gpuforcereduction.h"
+
+ using gmx::ArrayRef;
+ using gmx::AtomLocality;
+ using gmx::DomainLifetimeWorkload;
+ using gmx::ForceOutputs;
+ using gmx::ForceWithShiftForces;
+ using gmx::InteractionLocality;
+ using gmx::RVec;
+ using gmx::SimulationWorkload;
+ using gmx::StepWorkload;
+
+ /* PLUMED */
+ #include "../../../Plumed.h"
+ extern int plumedswitch;
+ extern plumed plumedmain;
+ /* END PLUMED */
+
+ // TODO: this environment variable allows us to verify before release
+ // that on less common architectures the total cost of polling is not larger than
+ // a blocking wait (so polling does not introduce overhead when the static
+ // PME-first ordering would suffice).
+ static const bool c_disableAlternatingWait = (getenv("GMX_DISABLE_ALTERNATING_GPU_WAIT") != nullptr);
+
+ static void sum_forces(ArrayRef<RVec> f, ArrayRef<const RVec> forceToAdd)
+ {
+ GMX_ASSERT(f.size() >= forceToAdd.size(), "Accumulation buffer should be sufficiently large");
+ const int end = forceToAdd.size();
+
+ int gmx_unused nt = gmx_omp_nthreads_get(ModuleMultiThread::Default);
+ #pragma omp parallel for num_threads(nt) schedule(static)
+ for (int i = 0; i < end; i++)
+ {
+ rvec_inc(f[i], forceToAdd[i]);
+ }
+ }
+
+ static void calc_virial(int start,
+ int homenr,
+ const rvec x[],
+ const gmx::ForceWithShiftForces& forceWithShiftForces,
+ tensor vir_part,
+ const matrix box,
+ t_nrnb* nrnb,
+ const t_forcerec* fr,
+ PbcType pbcType)
+ {
+ /* The short-range virial from surrounding boxes */
+ const rvec* fshift = as_rvec_array(forceWithShiftForces.shiftForces().data());
+ const rvec* shiftVecPointer = as_rvec_array(fr->shift_vec.data());
+ calc_vir(gmx::c_numShiftVectors, shiftVecPointer, fshift, vir_part, pbcType == PbcType::Screw, box);
+ inc_nrnb(nrnb, eNR_VIRIAL, gmx::c_numShiftVectors);
+
+ /* Calculate partial virial, for local atoms only, based on short range.
+ * Total virial is computed in global_stat, called from do_md
+ */
+ const rvec* f = as_rvec_array(forceWithShiftForces.force().data());
+ f_calc_vir(start, start + homenr, x, f, vir_part, box);
+ inc_nrnb(nrnb, eNR_VIRIAL, homenr);
+
+ if (debug)
+ {
+ pr_rvecs(debug, 0, "vir_part", vir_part, DIM);
+ }
+ }
+
+ static void pull_potential_wrapper(const t_commrec* cr,
+ const t_inputrec& ir,
+ const matrix box,
+ gmx::ArrayRef<const gmx::RVec> x,
+ const t_mdatoms* mdatoms,
+ gmx_enerdata_t* enerd,
+ pull_t* pull_work,
+ const real* lambda,
+ double t,
+ gmx_wallcycle* wcycle)
+ {
+ t_pbc pbc;
+ real dvdl;
+
+ /* Calculate the center of mass forces, this requires communication,
+ * which is why pull_potential is called close to other communication.
+ */
+ wallcycle_start(wcycle, WallCycleCounter::PullPot);
+ set_pbc(&pbc, ir.pbcType, box);
+ dvdl = 0;
+ enerd->term[F_COM_PULL] +=
+ pull_potential(pull_work,
+ gmx::arrayRefFromArray(mdatoms->massT, mdatoms->nr),
+ pbc,
+ cr,
+ t,
+ lambda[static_cast<int>(FreeEnergyPerturbationCouplingType::Restraint)],
+ x,
+ &dvdl);
+ enerd->dvdl_lin[FreeEnergyPerturbationCouplingType::Restraint] += dvdl;
+ wallcycle_stop(wcycle, WallCycleCounter::PullPot);
+ }
+ static void pme_receive_force_ener(t_forcerec* fr,
+ const t_commrec* cr,
+ gmx::ForceWithVirial* forceWithVirial,
+ gmx_enerdata_t* enerd,
+ bool useGpuPmePpComms,
+ bool receivePmeForceToGpu,
+ gmx_wallcycle* wcycle)
+ {
+ real e_q, e_lj, dvdl_q, dvdl_lj;
+ float cycles_ppdpme, cycles_seppme;
+
+ cycles_ppdpme = wallcycle_stop(wcycle, WallCycleCounter::PpDuringPme);
+ dd_cycles_add(cr->dd, cycles_ppdpme, ddCyclPPduringPME);
+
+ /* In case of node-splitting, the PP nodes receive the long-range
+ * forces, virial and energy from the PME nodes here.
+ */
+ wallcycle_start(wcycle, WallCycleCounter::PpPmeWaitRecvF);
+ dvdl_q = 0;
+ dvdl_lj = 0;
+ gmx_pme_receive_f(fr->pmePpCommGpu.get(),
+ cr,
+ forceWithVirial,
+ &e_q,
+ &e_lj,
+ &dvdl_q,
+ &dvdl_lj,
+ useGpuPmePpComms,
+ receivePmeForceToGpu,
+ &cycles_seppme);
+ enerd->term[F_COUL_RECIP] += e_q;
+ enerd->term[F_LJ_RECIP] += e_lj;
+ enerd->dvdl_lin[FreeEnergyPerturbationCouplingType::Coul] += dvdl_q;
+ enerd->dvdl_lin[FreeEnergyPerturbationCouplingType::Vdw] += dvdl_lj;
+
+ if (wcycle)
+ {
+ dd_cycles_add(cr->dd, cycles_seppme, ddCyclPME);
+ }
+ wallcycle_stop(wcycle, WallCycleCounter::PpPmeWaitRecvF);
+ }
+
+ static void print_large_forces(FILE* fp,
+ const t_mdatoms* md,
+ const t_commrec* cr,
+ int64_t step,
+ real forceTolerance,
+ ArrayRef<const RVec> x,
+ ArrayRef<const RVec> f)
+ {
+ real force2Tolerance = gmx::square(forceTolerance);
+ gmx::index numNonFinite = 0;
+ for (int i = 0; i < md->homenr; i++)
+ {
+ real force2 = norm2(f[i]);
+ bool nonFinite = !std::isfinite(force2);
+ if (force2 >= force2Tolerance || nonFinite)
+ {
+ fprintf(fp,
+ "step %" PRId64 " atom %6d x %8.3f %8.3f %8.3f force %12.5e\n",
+ step,
+ ddglatnr(cr->dd, i),
+ x[i][XX],
+ x[i][YY],
+ x[i][ZZ],
+ std::sqrt(force2));
+ }
+ if (nonFinite)
+ {
+ numNonFinite++;
+ }
+ }
+ if (numNonFinite > 0)
+ {
+ /* Note that with MPI this fatal call on one rank might interrupt
+ * the printing on other ranks. But we can only avoid that with
+ * an expensive MPI barrier that we would need at each step.
+ */
+ gmx_fatal(FARGS, "At step %" PRId64 " detected non-finite forces on %td atoms", step, numNonFinite);
+ }
+ }
+
+ //! When necessary, spreads forces on vsites and computes the virial for \p forceOutputs->forceWithShiftForces()
+ static void postProcessForceWithShiftForces(t_nrnb* nrnb,
+ gmx_wallcycle* wcycle,
+ const matrix box,
+ ArrayRef<const RVec> x,
+ ForceOutputs* forceOutputs,
+ tensor vir_force,
+ const t_mdatoms& mdatoms,
+ const t_forcerec& fr,
+ gmx::VirtualSitesHandler* vsite,
+ const StepWorkload& stepWork)
+ {
+ ForceWithShiftForces& forceWithShiftForces = forceOutputs->forceWithShiftForces();
+
+ /* If we have NoVirSum forces, but we do not calculate the virial,
+ * we later sum the forceWithShiftForces buffer together with
+ * the noVirSum buffer and spread the combined vsite forces at once.
+ */
+ if (vsite && (!forceOutputs->haveForceWithVirial() || stepWork.computeVirial))
+ {
+ using VirialHandling = gmx::VirtualSitesHandler::VirialHandling;
+
+ auto f = forceWithShiftForces.force();
+ auto fshift = forceWithShiftForces.shiftForces();
+ const VirialHandling virialHandling =
+ (stepWork.computeVirial ? VirialHandling::Pbc : VirialHandling::None);
+ vsite->spreadForces(x, f, virialHandling, fshift, nullptr, nrnb, box, wcycle);
+ forceWithShiftForces.haveSpreadVsiteForces() = true;
+ }
+
+ if (stepWork.computeVirial)
+ {
+ /* Calculation of the virial must be done after vsites! */
+ calc_virial(
+ 0, mdatoms.homenr, as_rvec_array(x.data()), forceWithShiftForces, vir_force, box, nrnb, &fr, fr.pbcType);
+ }
+ }
+
+ //! Spread, compute virial for and sum forces, when necessary
+ static void postProcessForces(const t_commrec* cr,
+ int64_t step,
+ t_nrnb* nrnb,
+ gmx_wallcycle* wcycle,
+ const matrix box,
+ ArrayRef<const RVec> x,
+ ForceOutputs* forceOutputs,
+ tensor vir_force,
+ const t_mdatoms* mdatoms,
+ const t_forcerec* fr,
+ gmx::VirtualSitesHandler* vsite,
+ const StepWorkload& stepWork)
+ {
+ // Extract the final output force buffer, which is also the buffer for forces with shift forces
+ ArrayRef<RVec> f = forceOutputs->forceWithShiftForces().force();
+
+ if (forceOutputs->haveForceWithVirial())
+ {
+ auto& forceWithVirial = forceOutputs->forceWithVirial();
+
+ if (vsite)
+ {
+ /* Spread the mesh force on virtual sites to the other particles...
+ * This is parallellized. MPI communication is performed
+ * if the constructing atoms aren't local.
+ */
+ GMX_ASSERT(!stepWork.computeVirial || f.data() != forceWithVirial.force_.data(),
+ "We need separate force buffers for shift and virial forces when "
+ "computing the virial");
+ GMX_ASSERT(!stepWork.computeVirial
+ || forceOutputs->forceWithShiftForces().haveSpreadVsiteForces(),
+ "We should spread the force with shift forces separately when computing "
+ "the virial");
+ const gmx::VirtualSitesHandler::VirialHandling virialHandling =
+ (stepWork.computeVirial ? gmx::VirtualSitesHandler::VirialHandling::NonLinear
+ : gmx::VirtualSitesHandler::VirialHandling::None);
+ matrix virial = { { 0 } };
+ vsite->spreadForces(x, forceWithVirial.force_, virialHandling, {}, virial, nrnb, box, wcycle);
+ forceWithVirial.addVirialContribution(virial);
+ }
+
+ if (stepWork.computeVirial)
+ {
+ /* Now add the forces, this is local */
+ sum_forces(f, forceWithVirial.force_);
+
+ /* Add the direct virial contributions */
+ GMX_ASSERT(
+ forceWithVirial.computeVirial_,
+ "forceWithVirial should request virial computation when we request the virial");
+ m_add(vir_force, forceWithVirial.getVirial(), vir_force);
+
+ if (debug)
+ {
+ pr_rvecs(debug, 0, "vir_force", vir_force, DIM);
+ }
+ }
+ }
+ else
+ {
+ GMX_ASSERT(vsite == nullptr || forceOutputs->forceWithShiftForces().haveSpreadVsiteForces(),
+ "We should have spread the vsite forces (earlier)");
+ }
+
+ if (fr->print_force >= 0)
+ {
+ print_large_forces(stderr, mdatoms, cr, step, fr->print_force, x, f);
+ }
+ }
+
+ static void do_nb_verlet(t_forcerec* fr,
+ const interaction_const_t* ic,
+ gmx_enerdata_t* enerd,
+ const StepWorkload& stepWork,
+ const InteractionLocality ilocality,
+ const int clearF,
+ const int64_t step,
+ t_nrnb* nrnb,
+ gmx_wallcycle* wcycle)
+ {
+ if (!stepWork.computeNonbondedForces)
+ {
+ /* skip non-bonded calculation */
+ return;
+ }
+
+ nonbonded_verlet_t* nbv = fr->nbv.get();
+
+ /* GPU kernel launch overhead is already timed separately */
+ if (!nbv->useGpu())
+ {
+ /* When dynamic pair-list pruning is requested, we need to prune
+ * at nstlistPrune steps.
+ */
+ if (nbv->isDynamicPruningStepCpu(step))
+ {
+ /* Prune the pair-list beyond fr->ic->rlistPrune using
+ * the current coordinates of the atoms.
+ */
+ wallcycle_sub_start(wcycle, WallCycleSubCounter::NonbondedPruning);
+ nbv->dispatchPruneKernelCpu(ilocality, fr->shift_vec);
+ wallcycle_sub_stop(wcycle, WallCycleSubCounter::NonbondedPruning);
+ }
+ }
+
+ nbv->dispatchNonbondedKernel(
+ ilocality,
+ *ic,
+ stepWork,
+ clearF,
+ fr->shift_vec,
+ enerd->grpp.energyGroupPairTerms[fr->haveBuckingham ? NonBondedEnergyTerms::BuckinghamSR
+ : NonBondedEnergyTerms::LJSR],
+ enerd->grpp.energyGroupPairTerms[NonBondedEnergyTerms::CoulombSR],
+ nrnb);
+ }
+
+ static inline void clearRVecs(ArrayRef<RVec> v, const bool useOpenmpThreading)
+ {
+ int nth = gmx_omp_nthreads_get_simple_rvec_task(ModuleMultiThread::Default, v.ssize());
+
+ /* Note that we would like to avoid this conditional by putting it
+ * into the omp pragma instead, but then we still take the full
+ * omp parallel for overhead (at least with gcc5).
+ */
+ if (!useOpenmpThreading || nth == 1)
+ {
+ for (RVec& elem : v)
+ {
+ clear_rvec(elem);
+ }
+ }
+ else
+ {
+ #pragma omp parallel for num_threads(nth) schedule(static)
+ for (gmx::index i = 0; i < v.ssize(); i++)
+ {
+ clear_rvec(v[i]);
+ }
+ }
+ }
+
+ /*! \brief Return an estimate of the average kinetic energy or 0 when unreliable
+ *
+ * \param groupOptions Group options, containing T-coupling options
+ */
+ static real averageKineticEnergyEstimate(const t_grpopts& groupOptions)
+ {
+ real nrdfCoupled = 0;
+ real nrdfUncoupled = 0;
+ real kineticEnergy = 0;
+ for (int g = 0; g < groupOptions.ngtc; g++)
+ {
+ if (groupOptions.tau_t[g] >= 0)
+ {
+ nrdfCoupled += groupOptions.nrdf[g];
+ kineticEnergy += groupOptions.nrdf[g] * 0.5 * groupOptions.ref_t[g] * gmx::c_boltz;
+ }
+ else
+ {
+ nrdfUncoupled += groupOptions.nrdf[g];
+ }
+ }
+
+ /* This conditional with > also catches nrdf=0 */
+ if (nrdfCoupled > nrdfUncoupled)
+ {
+ return kineticEnergy * (nrdfCoupled + nrdfUncoupled) / nrdfCoupled;
+ }
+ else
+ {
+ return 0;
+ }
+ }
+
+ /*! \brief This routine checks that the potential energy is finite.
+ *
+ * Always checks that the potential energy is finite. If step equals
+ * inputrec.init_step also checks that the magnitude of the potential energy
+ * is reasonable. Terminates with a fatal error when a check fails.
+ * Note that passing this check does not guarantee finite forces,
+ * since those use slightly different arithmetics. But in most cases
+ * there is just a narrow coordinate range where forces are not finite
+ * and energies are finite.
+ *
+ * \param[in] step The step number, used for checking and printing
+ * \param[in] enerd The energy data; the non-bonded group energies need to be added to
+ * \c enerd.term[F_EPOT] before calling this routine
+ * \param[in] inputrec The input record
+ */
+ static void checkPotentialEnergyValidity(int64_t step, const gmx_enerdata_t& enerd, const t_inputrec& inputrec)
+ {
+ /* Threshold valid for comparing absolute potential energy against
+ * the kinetic energy. Normally one should not consider absolute
+ * potential energy values, but with a factor of one million
+ * we should never get false positives.
+ */
+ constexpr real c_thresholdFactor = 1e6;
+
+ bool energyIsNotFinite = !std::isfinite(enerd.term[F_EPOT]);
+ real averageKineticEnergy = 0;
+ /* We only check for large potential energy at the initial step,
+ * because that is by far the most likely step for this too occur
+ * and because computing the average kinetic energy is not free.
+ * Note: nstcalcenergy >> 1 often does not allow to catch large energies
+ * before they become NaN.
+ */
+ if (step == inputrec.init_step && EI_DYNAMICS(inputrec.eI))
+ {
+ averageKineticEnergy = averageKineticEnergyEstimate(inputrec.opts);
+ }
+
+ if (energyIsNotFinite
+ || (averageKineticEnergy > 0 && enerd.term[F_EPOT] > c_thresholdFactor * averageKineticEnergy))
+ {
+ GMX_THROW(gmx::InternalError(gmx::formatString(
+ "Step %" PRId64
+ ": The total potential energy is %g, which is %s. The LJ and electrostatic "
+ "contributions to the energy are %g and %g, respectively. A %s potential energy "
+ "can be caused by overlapping interactions in bonded interactions or very large%s "
+ "coordinate values. Usually this is caused by a badly- or non-equilibrated initial "
+ "configuration, incorrect interactions or parameters in the topology.",
+ step,
+ enerd.term[F_EPOT],
+ energyIsNotFinite ? "not finite" : "extremely high",
+ enerd.term[F_LJ],
+ enerd.term[F_COUL_SR],
+ energyIsNotFinite ? "non-finite" : "very high",
+ energyIsNotFinite ? " or Nan" : "")));
+ }
+ }
+
+ /*! \brief Return true if there are special forces computed this step.
+ *
+ * The conditionals exactly correspond to those in computeSpecialForces().
+ */
+ static bool haveSpecialForces(const t_inputrec& inputrec,
+ const gmx::ForceProviders& forceProviders,
+ const pull_t* pull_work,
+ const bool computeForces,
+ const gmx_edsam* ed)
+ {
+
+ return ((computeForces && forceProviders.hasForceProvider()) || // forceProviders
+ (inputrec.bPull && pull_have_potential(*pull_work)) || // pull
+ inputrec.bRot || // enforced rotation
+ (ed != nullptr) || // flooding
+ (inputrec.bIMD && computeForces)); // IMD
+ }
+
+ /*! \brief Compute forces and/or energies for special algorithms
+ *
+ * The intention is to collect all calls to algorithms that compute
+ * forces on local atoms only and that do not contribute to the local
+ * virial sum (but add their virial contribution separately).
+ * Eventually these should likely all become ForceProviders.
+ * Within this function the intention is to have algorithms that do
+ * global communication at the end, so global barriers within the MD loop
+ * are as close together as possible.
+ *
+ * \param[in] fplog The log file
+ * \param[in] cr The communication record
+ * \param[in] inputrec The input record
+ * \param[in] awh The Awh module (nullptr if none in use).
+ * \param[in] enforcedRotation Enforced rotation module.
+ * \param[in] imdSession The IMD session
+ * \param[in] pull_work The pull work structure.
+ * \param[in] step The current MD step
+ * \param[in] t The current time
+ * \param[in,out] wcycle Wallcycle accounting struct
+ * \param[in,out] forceProviders Pointer to a list of force providers
+ * \param[in] box The unit cell
+ * \param[in] x The coordinates
+ * \param[in] mdatoms Per atom properties
+ * \param[in] lambda Array of free-energy lambda values
+ * \param[in] stepWork Step schedule flags
+ * \param[in,out] forceWithVirialMtsLevel0 Force and virial for MTS level0 forces
+ * \param[in,out] forceWithVirialMtsLevel1 Force and virial for MTS level1 forces, can be nullptr
+ * \param[in,out] enerd Energy buffer
+ * \param[in,out] ed Essential dynamics pointer
+ * \param[in] didNeighborSearch Tells if we did neighbor searching this step, used for ED sampling
+ *
+ * \todo Remove didNeighborSearch, which is used incorrectly.
+ * \todo Convert all other algorithms called here to ForceProviders.
+ */
+ static void computeSpecialForces(FILE* fplog,
+ const t_commrec* cr,
+ const t_inputrec& inputrec,
+ gmx::Awh* awh,
+ gmx_enfrot* enforcedRotation,
+ gmx::ImdSession* imdSession,
+ pull_t* pull_work,
+ int64_t step,
+ double t,
+ gmx_wallcycle* wcycle,
+ gmx::ForceProviders* forceProviders,
+ const matrix box,
+ gmx::ArrayRef<const gmx::RVec> x,
+ const t_mdatoms* mdatoms,
+ gmx::ArrayRef<const real> lambda,
+ const StepWorkload& stepWork,
+ gmx::ForceWithVirial* forceWithVirialMtsLevel0,
+ gmx::ForceWithVirial* forceWithVirialMtsLevel1,
+ gmx_enerdata_t* enerd,
+ gmx_edsam* ed,
+ bool didNeighborSearch)
+ {
+ /* NOTE: Currently all ForceProviders only provide forces.
+ * When they also provide energies, remove this conditional.
+ */
+ if (stepWork.computeForces)
+ {
+ gmx::ForceProviderInput forceProviderInput(
+ x,
+ mdatoms->homenr,
+ gmx::arrayRefFromArray(mdatoms->chargeA, mdatoms->homenr),
+ gmx::arrayRefFromArray(mdatoms->massT, mdatoms->homenr),
+ t,
+ step,
664
+ box,
665
+ *cr);
666
+ gmx::ForceProviderOutput forceProviderOutput(forceWithVirialMtsLevel0, enerd);
667
+
668
+ /* Collect forces from modules */
669
+ forceProviders->calculateForces(forceProviderInput, &forceProviderOutput);
670
+ }
671
+
672
+ const int pullMtsLevel = forceGroupMtsLevel(inputrec.mtsLevels, gmx::MtsForceGroups::Pull);
673
+ const bool doPulling = (inputrec.bPull && pull_have_potential(*pull_work)
674
+ && (pullMtsLevel == 0 || stepWork.computeSlowForces));
675
+
676
+ /* pull_potential_wrapper(), awh->applyBiasForcesAndUpdateBias(), pull_apply_forces()
677
+ * have to be called in this order
678
+ */
679
+ if (doPulling)
680
+ {
681
+ pull_potential_wrapper(cr, inputrec, box, x, mdatoms, enerd, pull_work, lambda.data(), t, wcycle);
682
+ }
683
+ if (awh && (pullMtsLevel == 0 || stepWork.computeSlowForces))
684
+ {
685
+ const bool needForeignEnergyDifferences = awh->needForeignEnergyDifferences(step);
686
+ std::vector<double> foreignLambdaDeltaH, foreignLambdaDhDl;
687
+ if (needForeignEnergyDifferences)
688
+ {
689
+ enerd->foreignLambdaTerms.finalizePotentialContributions(
690
+ enerd->dvdl_lin, lambda, *inputrec.fepvals);
691
+ std::tie(foreignLambdaDeltaH, foreignLambdaDhDl) = enerd->foreignLambdaTerms.getTerms(cr);
692
+ }
693
+
694
+ enerd->term[F_COM_PULL] += awh->applyBiasForcesAndUpdateBias(
695
+ inputrec.pbcType, foreignLambdaDeltaH, foreignLambdaDhDl, box, t, step, wcycle, fplog);
696
+ }
697
+ if (doPulling)
698
+ {
699
+ wallcycle_start_nocount(wcycle, WallCycleCounter::PullPot);
700
+ auto& forceWithVirial = (pullMtsLevel == 0) ? forceWithVirialMtsLevel0 : forceWithVirialMtsLevel1;
701
+ pull_apply_forces(
702
+ pull_work, gmx::arrayRefFromArray(mdatoms->massT, mdatoms->nr), cr, forceWithVirial);
703
+ wallcycle_stop(wcycle, WallCycleCounter::PullPot);
704
+ }
705
+
706
+ /* Add the forces from enforced rotation potentials (if any) */
707
+ if (inputrec.bRot)
708
+ {
709
+ wallcycle_start(wcycle, WallCycleCounter::RotAdd);
710
+ enerd->term[F_COM_PULL] +=
711
+ add_rot_forces(enforcedRotation, forceWithVirialMtsLevel0->force_, cr, step, t);
712
+ wallcycle_stop(wcycle, WallCycleCounter::RotAdd);
713
+ }
714
+
715
+ if (ed)
716
+ {
717
+ /* Note that since init_edsam() is called after the initialization
718
+ * of forcerec, edsam doesn't request the noVirSum force buffer.
719
+ * Thus if no other algorithm (e.g. PME) requires it, the forces
720
+ * here will contribute to the virial.
721
+ */
722
+ do_flood(cr, inputrec, x, forceWithVirialMtsLevel0->force_, ed, box, step, didNeighborSearch);
723
+ }
724
+
725
+ /* Add forces from interactive molecular dynamics (IMD), if any */
726
+ if (inputrec.bIMD && stepWork.computeForces)
727
+ {
728
+ imdSession->applyForces(forceWithVirialMtsLevel0->force_);
729
+ }
730
+ }
731
+
732
+ /*! \brief Launch the prepare_step and spread stages of PME GPU.
733
+ *
734
+ * \param[in] pmedata The PME structure
735
+ * \param[in] box The box matrix
736
+ * \param[in] stepWork Step schedule flags
737
+ * \param[in] xReadyOnDevice Event synchronizer indicating that the coordinates are ready in the device memory.
738
+ * \param[in] lambdaQ The Coulomb lambda of the current state.
739
+ * \param[in] wcycle The wallcycle structure
740
+ */
741
+ static inline void launchPmeGpuSpread(gmx_pme_t* pmedata,
742
+ const matrix box,
743
+ const StepWorkload& stepWork,
744
+ GpuEventSynchronizer* xReadyOnDevice,
745
+ const real lambdaQ,
746
+ gmx_wallcycle* wcycle)
747
+ {
748
+ pme_gpu_prepare_computation(pmedata, box, wcycle, stepWork);
749
+ bool useGpuDirectComm = false;
750
+ gmx::PmeCoordinateReceiverGpu* pmeCoordinateReceiverGpu = nullptr;
751
+ pme_gpu_launch_spread(
752
+ pmedata, xReadyOnDevice, wcycle, lambdaQ, useGpuDirectComm, pmeCoordinateReceiverGpu);
753
+ }
754
+
755
+ /*! \brief Launch the FFT and gather stages of PME GPU
756
+ *
757
+ * This function only implements setting the output forces (no accumulation).
758
+ *
759
+ * \param[in] pmedata The PME structure
760
+ * \param[in] lambdaQ The Coulomb lambda of the current system state.
761
+ * \param[in] wcycle The wallcycle structure
762
+ * \param[in] stepWork Step schedule flags
763
+ */
764
+ static void launchPmeGpuFftAndGather(gmx_pme_t* pmedata,
765
+ const real lambdaQ,
766
+ gmx_wallcycle* wcycle,
767
+ const gmx::StepWorkload& stepWork)
768
+ {
769
+ pme_gpu_launch_complex_transforms(pmedata, wcycle, stepWork);
770
+ pme_gpu_launch_gather(pmedata, wcycle, lambdaQ);
771
+ }
772
+
773
+ /*! \brief
774
+ * Polling wait for either of the PME or nonbonded GPU tasks.
775
+ *
776
+ * Instead of a static order in waiting for GPU tasks, this function
777
+ * polls to check which of the two tasks completes first, and does the
778
+ * associated force buffer reduction overlapped with the other task.
779
+ * By doing that, unlike static scheduling order, it can always overlap
780
+ * one of the reductions, regardless of the GPU task completion order.
781
+ *
782
+ * \param[in] nbv Nonbonded verlet structure
783
+ * \param[in,out] pmedata PME module data
784
+ * \param[in,out] forceOutputsNonbonded Force outputs for the non-bonded forces and shift forces
785
+ * \param[in,out] forceOutputsPme Force outputs for the PME forces and virial
786
+ * \param[in,out] enerd Energy data structure results are reduced into
787
+ * \param[in] lambdaQ The Coulomb lambda of the current system state.
788
+ * \param[in] stepWork Step schedule flags
789
+ * \param[in] wcycle The wallcycle structure
790
+ */
791
+ static void alternatePmeNbGpuWaitReduce(nonbonded_verlet_t* nbv,
792
+ gmx_pme_t* pmedata,
793
+ gmx::ForceOutputs* forceOutputsNonbonded,
794
+ gmx::ForceOutputs* forceOutputsPme,
795
+ gmx_enerdata_t* enerd,
796
+ const real lambdaQ,
797
+ const StepWorkload& stepWork,
798
+ gmx_wallcycle* wcycle)
799
+ {
800
+ bool isPmeGpuDone = false;
801
+ bool isNbGpuDone = false;
802
+
803
+ gmx::ArrayRef<const gmx::RVec> pmeGpuForces;
804
+
805
+ while (!isPmeGpuDone || !isNbGpuDone)
806
+ {
807
+ if (!isPmeGpuDone)
808
+ {
809
+ GpuTaskCompletion completionType =
810
+ (isNbGpuDone) ? GpuTaskCompletion::Wait : GpuTaskCompletion::Check;
811
+ isPmeGpuDone = pme_gpu_try_finish_task(
812
+ pmedata, stepWork, wcycle, &forceOutputsPme->forceWithVirial(), enerd, lambdaQ, completionType);
813
+ }
814
+
815
+ if (!isNbGpuDone)
816
+ {
817
+ auto& forceBuffersNonbonded = forceOutputsNonbonded->forceWithShiftForces();
818
+ GpuTaskCompletion completionType =
819
+ (isPmeGpuDone) ? GpuTaskCompletion::Wait : GpuTaskCompletion::Check;
820
+ isNbGpuDone = Nbnxm::gpu_try_finish_task(
821
+ nbv->gpu_nbv,
822
+ stepWork,
823
+ AtomLocality::Local,
824
+ enerd->grpp.energyGroupPairTerms[NonBondedEnergyTerms::LJSR].data(),
825
+ enerd->grpp.energyGroupPairTerms[NonBondedEnergyTerms::CoulombSR].data(),
826
+ forceBuffersNonbonded.shiftForces(),
827
+ completionType,
828
+ wcycle);
829
+
830
+ if (isNbGpuDone)
831
+ {
832
+ nbv->atomdata_add_nbat_f_to_f(AtomLocality::Local, forceBuffersNonbonded.force());
833
+ }
834
+ }
835
+ }
836
+ }
837
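The function above avoids a fixed wait order by polling both GPU tasks and reducing whichever finishes first. A generic sketch of that pattern using std::async in place of the GPU task queries (the tasks and "reductions" here are placeholders, not GROMACS calls):

#include <chrono>
#include <cstdio>
#include <future>

int main()
{
    auto taskA = std::async(std::launch::async, [] { return 1; }); // stands in for the PME task
    auto taskB = std::async(std::launch::async, [] { return 2; }); // stands in for the nonbonded task

    bool doneA = false;
    bool doneB = false;
    while (!doneA || !doneB)
    {
        // Poll (zero timeout) while the other task is still pending; wait briefly once it is done.
        if (!doneA
            && taskA.wait_for(std::chrono::milliseconds(doneB ? 10 : 0)) == std::future_status::ready)
        {
            std::printf("reduce result of A: %d\n", taskA.get()); // overlaps with B if B is still running
            doneA = true;
        }
        if (!doneB
            && taskB.wait_for(std::chrono::milliseconds(doneA ? 10 : 0)) == std::future_status::ready)
        {
            std::printf("reduce result of B: %d\n", taskB.get());
            doneB = true;
        }
    }
    return 0;
}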
+
838
+ /*! \brief Set up the different force buffers; also does clearing.
839
+ *
840
+ * \param[in] forceHelperBuffers Helper force buffers
841
+ * \param[in] force force array
842
+ * \param[in] domainWork Domain lifetime workload flags
843
+ * \param[in] stepWork Step schedule flags
844
+ * \param[in] havePpDomainDecomposition Whether we have a PP domain decomposition
845
+ * \param[out] wcycle wallcycle recording structure
846
+ *
847
+ * \returns Cleared force output structure
848
+ */
849
+ static ForceOutputs setupForceOutputs(ForceHelperBuffers* forceHelperBuffers,
850
+ gmx::ArrayRefWithPadding<gmx::RVec> force,
851
+ const DomainLifetimeWorkload& domainWork,
852
+ const StepWorkload& stepWork,
853
+ const bool havePpDomainDecomposition,
854
+ gmx_wallcycle* wcycle)
855
+ {
856
+ wallcycle_sub_start(wcycle, WallCycleSubCounter::ClearForceBuffer);
857
+
858
+ /* NOTE: We assume fr->shiftForces is all zeros here */
859
+ gmx::ForceWithShiftForces forceWithShiftForces(
860
+ force, stepWork.computeVirial, forceHelperBuffers->shiftForces());
861
+
862
+ if (stepWork.computeForces
863
+ && (domainWork.haveCpuLocalForceWork || !stepWork.useGpuFBufferOps
864
+ || (havePpDomainDecomposition && !stepWork.useGpuFHalo)))
865
+ {
866
+ /* Clear the short- and long-range forces */
867
+ clearRVecs(forceWithShiftForces.force(), true);
868
+
869
+ /* Clear the shift forces */
870
+ clearRVecs(forceWithShiftForces.shiftForces(), false);
871
+ }
872
+
873
+ /* If we need to compute the virial, we might need a separate
874
+ * force buffer for algorithms for which the virial is calculated
875
+ * directly, such as PME. Otherwise, forceWithVirial uses the
876
+ * same force (f in legacy calls) buffer as other algorithms.
877
+ */
878
+ const bool useSeparateForceWithVirialBuffer =
879
+ (stepWork.computeForces
880
+ && (stepWork.computeVirial && forceHelperBuffers->haveDirectVirialContributions()));
881
+ /* forceWithVirial uses the local atom range only */
882
+ gmx::ForceWithVirial forceWithVirial(
883
+ useSeparateForceWithVirialBuffer ? forceHelperBuffers->forceBufferForDirectVirialContributions()
884
+ : force.unpaddedArrayRef(),
885
+ stepWork.computeVirial);
886
+
887
+ if (useSeparateForceWithVirialBuffer)
888
+ {
889
+ /* TODO: update comment
890
+ * We only compute forces on local atoms. Note that vsites can
891
+ * spread to non-local atoms, but that part of the buffer is
892
+ * cleared separately in the vsite spreading code.
893
+ */
894
+ clearRVecs(forceWithVirial.force_, true);
895
+ }
896
+
897
+ wallcycle_sub_stop(wcycle, WallCycleSubCounter::ClearForceBuffer);
898
+
899
+ return ForceOutputs(
900
+ forceWithShiftForces, forceHelperBuffers->haveDirectVirialContributions(), forceWithVirial);
901
+ }
902
+
903
+
904
+ /*! \brief Set up flags that have the lifetime of the domain, indicating what type of work there is to compute.
905
+ */
906
+ static DomainLifetimeWorkload setupDomainLifetimeWorkload(const t_inputrec& inputrec,
907
+ const t_forcerec& fr,
908
+ const pull_t* pull_work,
909
+ const gmx_edsam* ed,
910
+ const t_mdatoms& mdatoms,
911
+ const SimulationWorkload& simulationWork,
912
+ const StepWorkload& stepWork)
913
+ {
914
+ DomainLifetimeWorkload domainWork;
915
+ // Note that haveSpecialForces is constant over the whole run
916
+ domainWork.haveSpecialForces =
917
+ haveSpecialForces(inputrec, *fr.forceProviders, pull_work, stepWork.computeForces, ed);
918
+ domainWork.haveCpuListedForceWork = false;
919
+ domainWork.haveCpuBondedWork = false;
920
+ for (const auto& listedForces : fr.listedForces)
921
+ {
922
+ if (listedForces.haveCpuListedForces(*fr.fcdata))
923
+ {
924
+ domainWork.haveCpuListedForceWork = true;
925
+ }
926
+ if (listedForces.haveCpuBondeds())
927
+ {
928
+ domainWork.haveCpuBondedWork = true;
929
+ }
930
+ }
931
+ domainWork.haveGpuBondedWork =
932
+ ((fr.listedForcesGpu != nullptr) && fr.listedForcesGpu->haveInteractions());
933
+ // Note that haveFreeEnergyWork is constant over the whole run
934
+ domainWork.haveFreeEnergyWork =
935
+ (fr.efep != FreeEnergyPerturbationType::No && mdatoms.nPerturbed != 0);
936
+ // We assume we have local force work if there are CPU
937
+ // force tasks including PME or nonbondeds.
938
+ domainWork.haveCpuLocalForceWork =
939
+ domainWork.haveSpecialForces || domainWork.haveCpuListedForceWork
940
+ || domainWork.haveFreeEnergyWork || simulationWork.useCpuNonbonded || simulationWork.useCpuPme
941
+ || simulationWork.haveEwaldSurfaceContribution || inputrec.nwall > 0;
942
+ domainWork.haveLocalForceContribInCpuBuffer =
943
+ domainWork.haveCpuLocalForceWork || simulationWork.havePpDomainDecomposition;
944
+ domainWork.haveNonLocalForceContribInCpuBuffer =
945
+ domainWork.haveCpuBondedWork || domainWork.haveFreeEnergyWork;
946
+
947
+ return domainWork;
948
+ }
949
+
950
+ /*! \brief Set up force flag struct from the force bitmask.
951
+ *
952
+ * \param[in] legacyFlags Force bitmask flags used to construct the new flags
953
+ * \param[in] mtsLevels The multiple time-stepping levels, either empty or 2 levels
954
+ * \param[in] step The current MD step
955
+ * \param[in] simulationWork Simulation workload description.
956
+ *
957
+ * \returns New StepWorkload description.
958
+ */
959
+ static StepWorkload setupStepWorkload(const int legacyFlags,
960
+ ArrayRef<const gmx::MtsLevel> mtsLevels,
961
+ const int64_t step,
962
+ const SimulationWorkload& simulationWork)
963
+ {
964
+ GMX_ASSERT(mtsLevels.empty() || mtsLevels.size() == 2, "Expect 0 or 2 MTS levels");
965
+ const bool computeSlowForces = (mtsLevels.empty() || step % mtsLevels[1].stepFactor == 0);
966
+
967
+ StepWorkload flags;
968
+ flags.stateChanged = ((legacyFlags & GMX_FORCE_STATECHANGED) != 0);
969
+ flags.haveDynamicBox = ((legacyFlags & GMX_FORCE_DYNAMICBOX) != 0);
970
+ flags.doNeighborSearch = ((legacyFlags & GMX_FORCE_NS) != 0);
971
+ flags.computeSlowForces = computeSlowForces;
972
+ flags.computeVirial = ((legacyFlags & GMX_FORCE_VIRIAL) != 0);
973
+ flags.computeEnergy = ((legacyFlags & GMX_FORCE_ENERGY) != 0);
974
+ flags.computeForces = ((legacyFlags & GMX_FORCE_FORCES) != 0);
975
+ flags.useOnlyMtsCombinedForceBuffer = ((legacyFlags & GMX_FORCE_DO_NOT_NEED_NORMAL_FORCE) != 0);
976
+ flags.computeListedForces = ((legacyFlags & GMX_FORCE_LISTED) != 0);
977
+ flags.computeNonbondedForces =
978
+ ((legacyFlags & GMX_FORCE_NONBONDED) != 0) && simulationWork.computeNonbonded
979
+ && !(simulationWork.computeNonbondedAtMtsLevel1 && !computeSlowForces);
980
+ flags.computeDhdl = ((legacyFlags & GMX_FORCE_DHDL) != 0);
981
+
982
+ if (simulationWork.useGpuXBufferOps || simulationWork.useGpuFBufferOps)
983
+ {
984
+ GMX_ASSERT(simulationWork.useGpuNonbonded,
985
+ "Can only offload buffer ops if nonbonded computation is also offloaded");
986
+ }
987
+ flags.useGpuXBufferOps = simulationWork.useGpuXBufferOps && !flags.doNeighborSearch;
988
+ // on virial steps the CPU reduction path is taken
989
+ flags.useGpuFBufferOps = simulationWork.useGpuFBufferOps && !flags.computeVirial;
990
+ const bool rankHasGpuPmeTask = simulationWork.useGpuPme && !simulationWork.haveSeparatePmeRank;
991
+ flags.useGpuPmeFReduction = flags.computeSlowForces && flags.useGpuFBufferOps
992
+ && (rankHasGpuPmeTask || simulationWork.useGpuPmePpCommunication);
993
+ flags.useGpuXHalo = simulationWork.useGpuHaloExchange && !flags.doNeighborSearch;
994
+ flags.useGpuFHalo = simulationWork.useGpuHaloExchange && flags.useGpuFBufferOps;
995
+ flags.haveGpuPmeOnThisRank = rankHasGpuPmeTask && flags.computeSlowForces;
996
+ flags.computePmeOnSeparateRank = simulationWork.haveSeparatePmeRank && flags.computeSlowForces;
997
+ flags.combineMtsForcesBeforeHaloExchange =
998
+ (flags.computeForces && simulationWork.useMts && flags.computeSlowForces
999
+ && flags.useOnlyMtsCombinedForceBuffer
1000
+ && !(flags.computeVirial || simulationWork.useGpuNonbonded || flags.haveGpuPmeOnThisRank));
1001
+
1002
+ return flags;
1003
+ }
1004
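Two details of the flag setup above are easy to check in isolation: slow (MTS level-1) forces are only computed on steps that are a multiple of the level-1 step factor, and each boolean step flag is just a test of one bit in the legacy mask. A small self-contained sketch; the flag values and step factor are made up for illustration, not the real GMX_FORCE_* constants:

#include <cstdio>

// Hypothetical stand-ins for the legacy bit flags.
constexpr int kForceEnergy = 1 << 0;
constexpr int kForceVirial = 1 << 1;

int main()
{
    const int mtsStepFactor = 4; // assumed level-1 step factor
    for (long step = 0; step < 6; step++)
    {
        const bool computeSlowForces = (step % mtsStepFactor == 0);
        std::printf("step %ld: slow forces %s\n", step, computeSlowForces ? "yes" : "no");
    }

    const int  legacyFlags   = kForceEnergy; // energy requested, virial not
    const bool computeEnergy = (legacyFlags & kForceEnergy) != 0;
    const bool computeVirial = (legacyFlags & kForceVirial) != 0;
    std::printf("computeEnergy=%d computeVirial=%d\n", computeEnergy, computeVirial);
    return 0;
}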
+
1005
+
1006
+ /*! \brief Launch end-of-step GPU tasks: buffer clearing and rolling pruning.
1007
+ *
1008
+ */
1009
+ static void launchGpuEndOfStepTasks(nonbonded_verlet_t* nbv,
1010
+ gmx::ListedForcesGpu* listedForcesGpu,
1011
+ gmx_pme_t* pmedata,
1012
+ gmx_enerdata_t* enerd,
1013
+ const gmx::MdrunScheduleWorkload& runScheduleWork,
1014
+ int64_t step,
1015
+ gmx_wallcycle* wcycle)
1016
+ {
1017
+ if (runScheduleWork.simulationWork.useGpuNonbonded && runScheduleWork.stepWork.computeNonbondedForces)
1018
+ {
1019
+ /* Launch pruning before buffer clearing because the API overhead of the
1020
+ * clear kernel launches can leave the GPU idle while it could be running
1021
+ * the prune kernel.
1022
+ */
1023
+ if (nbv->isDynamicPruningStepGpu(step))
1024
+ {
1025
+ nbv->dispatchPruneKernelGpu(step);
1026
+ }
1027
+
1028
+ /* now clear the GPU outputs while we finish the step on the CPU */
1029
+ wallcycle_start_nocount(wcycle, WallCycleCounter::LaunchGpu);
1030
+ wallcycle_sub_start_nocount(wcycle, WallCycleSubCounter::LaunchGpuNonBonded);
1031
+ Nbnxm::gpu_clear_outputs(nbv->gpu_nbv, runScheduleWork.stepWork.computeVirial);
1032
+ wallcycle_sub_stop(wcycle, WallCycleSubCounter::LaunchGpuNonBonded);
1033
+ wallcycle_stop(wcycle, WallCycleCounter::LaunchGpu);
1034
+ }
1035
+
1036
+ if (runScheduleWork.stepWork.haveGpuPmeOnThisRank)
1037
+ {
1038
+ pme_gpu_reinit_computation(pmedata, wcycle);
1039
+ }
1040
+
1041
+ if (runScheduleWork.domainWork.haveGpuBondedWork && runScheduleWork.stepWork.computeEnergy)
1042
+ {
1043
+ // in principle this should be included in the DD balancing region,
1044
+ // but generally it is infrequent so we'll omit it for the sake of
1045
+ // simpler code
1046
+ listedForcesGpu->waitAccumulateEnergyTerms(enerd);
1047
+
1048
+ listedForcesGpu->clearEnergies();
1049
+ }
1050
+ }
1051
+
1052
+ /*! \brief Compute the number of times the "local coordinates ready on device" GPU event will be used as a synchronization point.
1053
+ *
1054
+ * When some work is offloaded to GPU, force calculation should wait for the atom coordinates to
1055
+ * be ready on the device. The coordinates can come either from H2D copy at the beginning of the step,
1056
+ * or from the GPU integration at the end of the previous step.
1057
+ *
1058
+ * In GROMACS, we usually follow the "mark once - wait once" approach. But this event is "consumed"
1059
+ * (that is, waited upon either on host or on the device) multiple times, since many tasks
1060
+ * in different streams depend on the coordinates.
1061
+ *
1062
+ * This function returns the number of times the event will be consumed based on this step's workload.
1063
+ *
1064
+ * \param simulationWork Simulation workload flags.
1065
+ * \param stepWork Step workload flags.
1066
+ * \param pmeSendCoordinatesFromGpu Whether peer-to-peer communication is used for PME coordinates.
1067
+ * \return The number of times the event will be consumed this step.
1068
+ */
1069
+ static int getExpectedLocalXReadyOnDeviceConsumptionCount(gmx_used_in_debug const SimulationWorkload& simulationWork,
1070
+ const StepWorkload& stepWork,
1071
+ bool pmeSendCoordinatesFromGpu)
1072
+ {
1073
+ int result = 0;
1074
+ if (stepWork.computeSlowForces)
1075
+ {
1076
+ if (pmeSendCoordinatesFromGpu)
1077
+ {
1078
+ GMX_ASSERT(simulationWork.haveSeparatePmeRank,
1079
+ "GPU PME PP communications require having a separate PME rank");
1080
+ // Event is consumed by gmx_pme_send_coordinates for GPU PME PP Communications
1081
+ result++;
1082
+ }
1083
+ if (stepWork.haveGpuPmeOnThisRank)
1084
+ {
1085
+ // Event is consumed by launchPmeGpuSpread
1086
+ result++;
1087
+ }
1088
+ if (stepWork.computeNonbondedForces && stepWork.useGpuXBufferOps)
1089
+ {
1090
+ // Event is consumed by convertCoordinatesGpu
1091
+ result++;
1092
+ }
1093
+ }
1094
+ if (stepWork.useGpuXHalo)
1095
+ {
1096
+ // Event is consumed by communicateGpuHaloCoordinates
1097
+ result++;
1098
+ if (GMX_THREAD_MPI) // Issue #4262
1099
+ {
1100
+ result++;
1101
+ }
1102
+ }
1103
+ return result;
1104
+ }
1105
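The count computed above feeds an event that is marked once but consumed by several dependent tasks. A minimal sketch of such "mark once, consume a declared number of times" bookkeeping; this is a hypothetical helper class for illustration, not the GpuEventSynchronizer API:

#include <cassert>
#include <cstdio>

class CountedEvent
{
public:
    void mark(int expectedConsumptions)
    {
        remaining_ = expectedConsumptions;
        marked_    = true;
    }
    void consume()
    {
        assert(marked_ && remaining_ > 0 && "event consumed more times than expected");
        remaining_--;
    }
    bool fullyConsumed() const { return marked_ && remaining_ == 0; }

private:
    bool marked_    = false;
    int  remaining_ = 0;
};

int main()
{
    CountedEvent xReadyOnDevice;
    xReadyOnDevice.mark(/*expectedConsumptions=*/3); // e.g. PME spread, buffer ops, halo exchange
    xReadyOnDevice.consume();
    xReadyOnDevice.consume();
    xReadyOnDevice.consume();
    std::printf("fully consumed: %d\n", xReadyOnDevice.fullyConsumed());
    return 0;
}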
+
1106
+ //! \brief Data structure to hold dipole-related data and staging arrays
1107
+ struct DipoleData
1108
+ {
1109
+ //! Dipole staging for fast summing over MPI
1110
+ gmx::DVec muStaging[2] = { { 0.0, 0.0, 0.0 } };
1111
+ //! Dipole staging for states A and B (index 0 and 1 resp.)
1112
+ gmx::RVec muStateAB[2] = { { 0.0_real, 0.0_real, 0.0_real } };
1113
+ };
1114
+
1115
+
1116
+ static void reduceAndUpdateMuTot(DipoleData* dipoleData,
1117
+ const t_commrec* cr,
1118
+ const bool haveFreeEnergy,
1119
+ gmx::ArrayRef<const real> lambda,
1120
+ rvec muTotal,
1121
+ const DDBalanceRegionHandler& ddBalanceRegionHandler)
1122
+ {
1123
+ if (PAR(cr))
1124
+ {
1125
+ gmx_sumd(2 * DIM, dipoleData->muStaging[0], cr);
1126
+ ddBalanceRegionHandler.reopenRegionCpu();
1127
+ }
1128
+ for (int i = 0; i < 2; i++)
1129
+ {
1130
+ for (int j = 0; j < DIM; j++)
1131
+ {
1132
+ dipoleData->muStateAB[i][j] = dipoleData->muStaging[i][j];
1133
+ }
1134
+ }
1135
+
1136
+ if (!haveFreeEnergy)
1137
+ {
1138
+ copy_rvec(dipoleData->muStateAB[0], muTotal);
1139
+ }
1140
+ else
1141
+ {
1142
+ for (int j = 0; j < DIM; j++)
1143
+ {
1144
+ muTotal[j] = (1.0 - lambda[static_cast<int>(FreeEnergyPerturbationCouplingType::Coul)])
1145
+ * dipoleData->muStateAB[0][j]
1146
+ + lambda[static_cast<int>(FreeEnergyPerturbationCouplingType::Coul)]
1147
+ * dipoleData->muStateAB[1][j];
1148
+ }
1149
+ }
1150
+ }
1151
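With free-energy perturbation, the total dipole above is the Coulomb-lambda interpolation of the state A and state B dipoles, mu_total = (1 - lambda)*mu_A + lambda*mu_B per Cartesian component. A tiny standalone illustration with made-up numbers:

#include <array>
#include <cstdio>

int main()
{
    const std::array<double, 3> muA = { 1.0, 0.0, 0.0 };
    const std::array<double, 3> muB = { 0.0, 2.0, 0.0 };
    const double lambdaCoul = 0.25; // hypothetical Coulomb coupling value

    std::array<double, 3> muTotal{};
    for (int j = 0; j < 3; j++)
    {
        muTotal[j] = (1.0 - lambdaCoul) * muA[j] + lambdaCoul * muB[j];
    }
    std::printf("muTotal = (%g, %g, %g)\n", muTotal[0], muTotal[1], muTotal[2]);
    return 0;
}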
+
1152
+ /*! \brief Combines MTS level0 and level1 force buffers into a full and MTS-combined force buffer.
1153
+ *
1154
+ * \param[in] numAtoms The number of atoms to combine forces for
1155
+ * \param[in,out] forceMtsLevel0 Input: F_level0, output: F_level0 + F_level1
1156
+ * \param[in,out] forceMts Input: F_level1, output: F_level0 + mtsFactor * F_level1
1157
+ * \param[in] mtsFactor The factor between the level0 and level1 time step
1158
+ */
1159
+ static void combineMtsForces(const int numAtoms,
1160
+ ArrayRef<RVec> forceMtsLevel0,
1161
+ ArrayRef<RVec> forceMts,
1162
+ const real mtsFactor)
1163
+ {
1164
+ const int gmx_unused numThreads = gmx_omp_nthreads_get(ModuleMultiThread::Default);
1165
+ #pragma omp parallel for num_threads(numThreads) schedule(static)
1166
+ for (int i = 0; i < numAtoms; i++)
1167
+ {
1168
+ const RVec forceMtsLevel0Tmp = forceMtsLevel0[i];
1169
+ forceMtsLevel0[i] += forceMts[i];
1170
+ forceMts[i] = forceMtsLevel0Tmp + mtsFactor * forceMts[i];
1171
+ }
1172
+ }
1173
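combineMtsForces() above writes two outputs per atom: the full force F0 + F1 and the MTS-combined force F0 + mtsFactor*F1, where the slow (level-1) force is scaled by the number of fast steps per slow step. A one-component numeric illustration (the values are made up):

#include <cstdio>

int main()
{
    const double f0        = 1.0;  // level-0 (fast) force component
    const double f1        = 0.25; // level-1 (slow) force component
    const double mtsFactor = 4.0;  // slow forces are evaluated every 4th step

    const double fullForce     = f0 + f1;             // what forceMtsLevel0[i] becomes
    const double combinedForce = f0 + mtsFactor * f1; // what forceMts[i] becomes on a slow step
    std::printf("full = %g, MTS-combined = %g\n", fullForce, combinedForce);
    return 0;
}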
+
1174
+ /*! \brief Setup for the local GPU force reduction:
1175
+ * reinitialization plus the registration of forces and dependencies.
1176
+ *
1177
+ * \param [in] runScheduleWork Schedule workload flag structure
1178
+ * \param [in] nbv Non-bonded Verlet object
1179
+ * \param [in] stateGpu GPU state propagator object
1180
+ * \param [in] gpuForceReduction GPU force reduction object
1181
+ * \param [in] pmePpCommGpu PME-PP GPU communication object
1182
+ * \param [in] pmedata PME data object
1183
+ * \param [in] dd Domain decomposition object
1184
+ */
1185
+ static void setupLocalGpuForceReduction(const gmx::MdrunScheduleWorkload* runScheduleWork,
1186
+ const nonbonded_verlet_t* nbv,
1187
+ gmx::StatePropagatorDataGpu* stateGpu,
1188
+ gmx::GpuForceReduction* gpuForceReduction,
1189
+ gmx::PmePpCommGpu* pmePpCommGpu,
1190
+ const gmx_pme_t* pmedata,
1191
+ const gmx_domdec_t* dd)
1192
+ {
1193
+ GMX_ASSERT(!runScheduleWork->simulationWork.useMts,
1194
+ "GPU force reduction is not compatible with MTS");
1195
+
1196
+ // (re-)initialize local GPU force reduction
1197
+ const bool accumulate = runScheduleWork->domainWork.haveCpuLocalForceWork
1198
+ || runScheduleWork->simulationWork.havePpDomainDecomposition;
1199
+ const int atomStart = 0;
1200
+ gpuForceReduction->reinit(stateGpu->getForces(),
1201
+ nbv->getNumAtoms(AtomLocality::Local),
1202
+ nbv->getGridIndices(),
1203
+ atomStart,
1204
+ accumulate,
1205
+ stateGpu->fReducedOnDevice(AtomLocality::Local));
1206
+
1207
+ // register forces and add dependencies
1208
+ gpuForceReduction->registerNbnxmForce(Nbnxm::gpu_get_f(nbv->gpu_nbv));
1209
+
1210
+ DeviceBuffer<gmx::RVec> pmeForcePtr;
1211
+ GpuEventSynchronizer* pmeSynchronizer = nullptr;
1212
+ bool havePmeContribution = false;
1213
+
1214
+ if (runScheduleWork->simulationWork.useGpuPme && !runScheduleWork->simulationWork.haveSeparatePmeRank)
1215
+ {
1216
+ pmeForcePtr = pme_gpu_get_device_f(pmedata);
1217
+ if (pmeForcePtr)
1218
+ {
1219
+ pmeSynchronizer = pme_gpu_get_f_ready_synchronizer(pmedata);
1220
+ havePmeContribution = true;
1221
+ }
1222
+ }
1223
+ else if (runScheduleWork->simulationWork.useGpuPmePpCommunication)
1224
+ {
1225
+ pmeForcePtr = pmePpCommGpu->getGpuForceStagingPtr();
1226
+ if (pmeForcePtr)
1227
+ {
1228
+ if (GMX_THREAD_MPI)
1229
+ {
1230
+ pmeSynchronizer = pmePpCommGpu->getForcesReadySynchronizer();
1231
+ }
1232
+ havePmeContribution = true;
1233
+ }
1234
+ }
1235
+
1236
+ if (havePmeContribution)
1237
+ {
1238
+ gpuForceReduction->registerRvecForce(pmeForcePtr);
1239
+ if (!runScheduleWork->simulationWork.useGpuPmePpCommunication || GMX_THREAD_MPI)
1240
+ {
1241
+ GMX_ASSERT(pmeSynchronizer != nullptr, "PME force ready cuda event should not be NULL");
1242
+ gpuForceReduction->addDependency(pmeSynchronizer);
1243
+ }
1244
+ }
1245
+
1246
+ if (runScheduleWork->domainWork.haveCpuLocalForceWork
1247
+ || (runScheduleWork->simulationWork.havePpDomainDecomposition
1248
+ && !runScheduleWork->simulationWork.useGpuHaloExchange))
1249
+ {
1250
+ gpuForceReduction->addDependency(stateGpu->fReadyOnDevice(AtomLocality::Local));
1251
+ }
1252
+
1253
+ if (runScheduleWork->simulationWork.useGpuHaloExchange)
1254
+ {
1255
+ gpuForceReduction->addDependency(dd->gpuHaloExchange[0][0]->getForcesReadyOnDeviceEvent());
1256
+ }
1257
+ }
1258
+
1259
+ /*! \brief Setup for the non-local GPU force reduction:
1260
+ * reinitialization plus the registration of forces and dependencies.
1261
+ *
1262
+ * \param [in] runScheduleWork Schedule workload flag structure
1263
+ * \param [in] nbv Non-bonded Verlet object
1264
+ * \param [in] stateGpu GPU state propagator object
1265
+ * \param [in] gpuForceReduction GPU force reduction object
1266
+ * \param [in] dd Domain decomposition object
1267
+ */
1268
+ static void setupNonLocalGpuForceReduction(const gmx::MdrunScheduleWorkload* runScheduleWork,
1269
+ const nonbonded_verlet_t* nbv,
1270
+ gmx::StatePropagatorDataGpu* stateGpu,
1271
+ gmx::GpuForceReduction* gpuForceReduction,
1272
+ const gmx_domdec_t* dd)
1273
+ {
1274
+ // (re-)initialize non-local GPU force reduction
1275
+ const bool accumulate = runScheduleWork->domainWork.haveCpuBondedWork
1276
+ || runScheduleWork->domainWork.haveFreeEnergyWork;
1277
+ const int atomStart = dd_numHomeAtoms(*dd);
1278
+ gpuForceReduction->reinit(stateGpu->getForces(),
1279
+ nbv->getNumAtoms(AtomLocality::NonLocal),
1280
+ nbv->getGridIndices(),
1281
+ atomStart,
1282
+ accumulate,
1283
+ stateGpu->fReducedOnDevice(AtomLocality::NonLocal));
1284
+
1285
+ // register forces and add dependencies
1286
+ gpuForceReduction->registerNbnxmForce(Nbnxm::gpu_get_f(nbv->gpu_nbv));
1287
+
1288
+ if (runScheduleWork->domainWork.haveNonLocalForceContribInCpuBuffer)
1289
+ {
1290
+ gpuForceReduction->addDependency(stateGpu->fReadyOnDevice(AtomLocality::NonLocal));
1291
+ }
1292
+ }
1293
+
1294
+
1295
+ /*! \brief Return the number of local atoms.
1296
+ */
1297
+ static int getLocalAtomCount(const gmx_domdec_t* dd, const t_mdatoms& mdatoms, bool havePPDomainDecomposition)
1298
+ {
1299
+ GMX_ASSERT(!(havePPDomainDecomposition && (dd == nullptr)),
1300
+ "Can't have PP decomposition with dd uninitialized!");
1301
+ return havePPDomainDecomposition ? dd_numAtomsZones(*dd) : mdatoms.homenr;
1302
+ }
1303
+
1304
+
1305
+ void do_force(FILE* fplog,
1306
+ const t_commrec* cr,
1307
+ const gmx_multisim_t* ms,
1308
+ const t_inputrec& inputrec,
1309
+ gmx::Awh* awh,
1310
+ gmx_enfrot* enforcedRotation,
1311
+ gmx::ImdSession* imdSession,
1312
+ pull_t* pull_work,
1313
+ int64_t step,
1314
+ t_nrnb* nrnb,
1315
+ gmx_wallcycle* wcycle,
1316
+ const gmx_localtop_t* top,
1317
+ const matrix box,
1318
+ gmx::ArrayRefWithPadding<gmx::RVec> x,
1319
+ const history_t* hist,
1320
+ gmx::ForceBuffersView* forceView,
1321
+ tensor vir_force,
1322
+ const t_mdatoms* mdatoms,
1323
+ gmx_enerdata_t* enerd,
1324
+ gmx::ArrayRef<const real> lambda,
1325
+ t_forcerec* fr,
1326
+ gmx::MdrunScheduleWorkload* runScheduleWork,
1327
+ gmx::VirtualSitesHandler* vsite,
1328
+ rvec muTotal,
1329
+ double t,
1330
+ gmx_edsam* ed,
1331
+ CpuPpLongRangeNonbondeds* longRangeNonbondeds,
1332
+ int legacyFlags,
1333
+ const DDBalanceRegionHandler& ddBalanceRegionHandler)
1334
+ {
1335
+ auto force = forceView->forceWithPadding();
1336
+ GMX_ASSERT(force.unpaddedArrayRef().ssize() >= fr->natoms_force_constr,
1337
+ "The size of the force buffer should be at least the number of atoms to compute "
1338
+ "forces for");
1339
+
1340
+ nonbonded_verlet_t* nbv = fr->nbv.get();
1341
+ interaction_const_t* ic = fr->ic.get();
1342
+
1343
+ gmx::StatePropagatorDataGpu* stateGpu = fr->stateGpu;
1344
+
1345
+ const SimulationWorkload& simulationWork = runScheduleWork->simulationWork;
1346
+
1347
+ runScheduleWork->stepWork = setupStepWorkload(legacyFlags, inputrec.mtsLevels, step, simulationWork);
1348
+ const StepWorkload& stepWork = runScheduleWork->stepWork;
1349
+
1350
+ if (stepWork.doNeighborSearch && gmx::needStateGpu(simulationWork))
1351
+ {
1352
+ // TODO refactor this to do_md, after partitioning.
1353
+ stateGpu->reinit(mdatoms->homenr,
1354
+ getLocalAtomCount(cr->dd, *mdatoms, simulationWork.havePpDomainDecomposition));
1355
+ if (stepWork.haveGpuPmeOnThisRank)
1356
+ {
1357
+ // TODO: This should be moved into PME setup function ( pme_gpu_prepare_computation(...) )
1358
+ pme_gpu_set_device_x(fr->pmedata, stateGpu->getCoordinates());
1359
+ }
1360
+ }
1361
+
1362
+ if (stepWork.useGpuFHalo && !runScheduleWork->domainWork.haveCpuLocalForceWork && !stepWork.doNeighborSearch)
1363
+ {
1364
+ // GPU Force halo exchange will set a subset of local atoms with remote non-local data
1365
+ // First clear local portion of force array, so that untouched atoms are zero.
1366
+ // The dependency for this is that forces from previous timestep have been consumed,
1367
+ // which is satisfied when getCoordinatesReadyOnDeviceEvent has been marked.
1368
+ // On NS steps, the buffer could have already been cleared in stateGpu->reinit.
1369
+ stateGpu->clearForcesOnGpu(AtomLocality::Local,
1370
+ stateGpu->getCoordinatesReadyOnDeviceEvent(
1371
+ AtomLocality::Local, simulationWork, stepWork));
1372
+ }
1373
+
1374
+ /* At a search step we need to start the first balancing region
1375
+ * somewhere early inside the step after communication during domain
1376
+ * decomposition (and not during the previous step as usual).
1377
+ */
1378
+ if (stepWork.doNeighborSearch)
1379
+ {
1380
+ ddBalanceRegionHandler.openBeforeForceComputationCpu(DdAllowBalanceRegionReopen::yes);
1381
+ }
1382
+
1383
+ clear_mat(vir_force);
1384
+
1385
+ if (fr->pbcType != PbcType::No)
1386
+ {
1387
+ /* Compute shift vectors every step,
1388
+ * because of pressure coupling or box deformation!
1389
+ */
1390
+ if (stepWork.haveDynamicBox && stepWork.stateChanged)
1391
+ {
1392
+ calc_shifts(box, fr->shift_vec);
1393
+ }
1394
+
1395
+ const bool fillGrid = (stepWork.doNeighborSearch && stepWork.stateChanged);
1396
+ const bool calcCGCM = (fillGrid && !haveDDAtomOrdering(*cr));
1397
+ if (calcCGCM)
1398
+ {
1399
+ put_atoms_in_box_omp(fr->pbcType,
1400
+ box,
1401
+ x.unpaddedArrayRef().subArray(0, mdatoms->homenr),
1402
+ gmx_omp_nthreads_get(ModuleMultiThread::Default));
1403
+ inc_nrnb(nrnb, eNR_SHIFTX, mdatoms->homenr);
1404
+ }
1405
+ }
1406
+
1407
+ nbnxn_atomdata_copy_shiftvec(stepWork.haveDynamicBox, fr->shift_vec, nbv->nbat.get());
1408
+
1409
+ const bool pmeSendCoordinatesFromGpu =
1410
+ simulationWork.useGpuPmePpCommunication && !(stepWork.doNeighborSearch);
1411
+ const bool reinitGpuPmePpComms =
1412
+ simulationWork.useGpuPmePpCommunication && (stepWork.doNeighborSearch);
1413
+
1414
+ auto* localXReadyOnDevice = (stepWork.haveGpuPmeOnThisRank || simulationWork.useGpuXBufferOps)
1415
+ ? stateGpu->getCoordinatesReadyOnDeviceEvent(
1416
+ AtomLocality::Local, simulationWork, stepWork)
1417
+ : nullptr;
1418
+
1419
+ GMX_ASSERT(simulationWork.useGpuHaloExchange
1420
+ == ((cr->dd != nullptr) && (!cr->dd->gpuHaloExchange[0].empty())),
1421
+ "The GPU halo exchange is active, but it has not been constructed.");
1422
+
1423
+ bool gmx_used_in_debug haveCopiedXFromGpu = false;
1424
+ // Copy coordinate from the GPU if update is on the GPU and there
1425
+ // are forces to be computed on the CPU, or for the computation of
1426
+ // virial, or if host-side data will be transferred from this task
1427
+ // to a remote task for halo exchange or PME-PP communication. At
1428
+ // search steps the current coordinates are already on the host,
1429
+ // hence copy is not needed.
1430
+ if (simulationWork.useGpuUpdate && !stepWork.doNeighborSearch
1431
+ && (runScheduleWork->domainWork.haveCpuLocalForceWork || stepWork.computeVirial
1432
+ || simulationWork.useCpuPmePpCommunication || simulationWork.useCpuHaloExchange
1433
+ || simulationWork.computeMuTot))
1434
+ {
1435
+ stateGpu->copyCoordinatesFromGpu(x.unpaddedArrayRef(), AtomLocality::Local);
1436
+ haveCopiedXFromGpu = true;
1437
+ }
1438
+
1439
+ // Coordinates on the device are needed if PME or BufferOps are offloaded.
1440
+ // The local coordinates can be copied right away.
1441
+ // NOTE: Consider moving this copy to right after they are updated and constrained,
1442
+ // if the later is not offloaded.
1443
+ if (stepWork.haveGpuPmeOnThisRank || stepWork.useGpuXBufferOps)
1444
+ {
1445
+ GMX_ASSERT(stateGpu != nullptr, "stateGpu should not be null");
1446
+ const int expectedLocalXReadyOnDeviceConsumptionCount =
1447
+ getExpectedLocalXReadyOnDeviceConsumptionCount(
1448
+ simulationWork, stepWork, pmeSendCoordinatesFromGpu);
1449
+
1450
+ // We need to copy coordinates when:
1451
+ // 1. Update is not offloaded
1452
+ // 2. The buffers were reinitialized on search step
1453
+ if (!simulationWork.useGpuUpdate || stepWork.doNeighborSearch)
1454
+ {
1455
+ stateGpu->copyCoordinatesToGpu(x.unpaddedArrayRef(),
1456
+ AtomLocality::Local,
1457
+ expectedLocalXReadyOnDeviceConsumptionCount);
1458
+ }
1459
+ else if (simulationWork.useGpuUpdate)
1460
+ {
1461
+ stateGpu->setXUpdatedOnDeviceEventExpectedConsumptionCount(
1462
+ expectedLocalXReadyOnDeviceConsumptionCount);
1463
+ }
1464
+ }
1465
+
1466
+ if (stepWork.computePmeOnSeparateRank)
1467
+ {
1468
+ /* Send particle coordinates to the pme nodes */
1469
+ if (!pmeSendCoordinatesFromGpu && !stepWork.doNeighborSearch && simulationWork.useGpuUpdate)
1470
+ {
1471
+ GMX_ASSERT(haveCopiedXFromGpu,
1472
+ "a wait should only be triggered if copy has been scheduled");
1473
+ stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
1474
+ }
1475
+
1476
+ gmx_pme_send_coordinates(fr,
1477
+ cr,
1478
+ box,
1479
+ x.unpaddedArrayRef(),
1480
+ lambda[static_cast<int>(FreeEnergyPerturbationCouplingType::Coul)],
1481
+ lambda[static_cast<int>(FreeEnergyPerturbationCouplingType::Vdw)],
1482
+ (stepWork.computeVirial || stepWork.computeEnergy),
1483
+ step,
1484
+ simulationWork.useGpuPmePpCommunication,
1485
+ reinitGpuPmePpComms,
1486
+ pmeSendCoordinatesFromGpu,
1487
+ stepWork.useGpuPmeFReduction,
1488
+ localXReadyOnDevice,
1489
+ wcycle);
1490
+ }
1491
+
1492
+ if (stepWork.haveGpuPmeOnThisRank)
1493
+ {
1494
+ launchPmeGpuSpread(fr->pmedata,
1495
+ box,
1496
+ stepWork,
1497
+ localXReadyOnDevice,
1498
+ lambda[static_cast<int>(FreeEnergyPerturbationCouplingType::Coul)],
1499
+ wcycle);
1500
+ }
1501
+
1502
+ const gmx::DomainLifetimeWorkload& domainWork = runScheduleWork->domainWork;
1503
+
1504
+ /* do gridding for pair search */
1505
+ if (stepWork.doNeighborSearch)
1506
+ {
1507
+ if (fr->wholeMoleculeTransform && stepWork.stateChanged)
1508
+ {
1509
+ fr->wholeMoleculeTransform->updateForAtomPbcJumps(x.unpaddedArrayRef(), box);
1510
+ }
1511
+
1512
+ wallcycle_start(wcycle, WallCycleCounter::NS);
1513
+ if (!haveDDAtomOrdering(*cr))
1514
+ {
1515
+ const rvec vzero = { 0.0_real, 0.0_real, 0.0_real };
1516
+ const rvec boxDiagonal = { box[XX][XX], box[YY][YY], box[ZZ][ZZ] };
1517
+ wallcycle_sub_start(wcycle, WallCycleSubCounter::NBSGridLocal);
1518
+ nbnxn_put_on_grid(nbv,
1519
+ box,
1520
+ 0,
1521
+ vzero,
1522
+ boxDiagonal,
1523
+ nullptr,
1524
+ { 0, mdatoms->homenr },
1525
+ -1,
1526
+ fr->atomInfo,
1527
+ x.unpaddedArrayRef(),
1528
+ 0,
1529
+ nullptr);
1530
+ wallcycle_sub_stop(wcycle, WallCycleSubCounter::NBSGridLocal);
1531
+ }
1532
+ else
1533
+ {
1534
+ wallcycle_sub_start(wcycle, WallCycleSubCounter::NBSGridNonLocal);
1535
+ nbnxn_put_on_grid_nonlocal(nbv, domdec_zones(cr->dd), fr->atomInfo, x.unpaddedArrayRef());
1536
+ wallcycle_sub_stop(wcycle, WallCycleSubCounter::NBSGridNonLocal);
1537
+ }
1538
+
1539
+ nbv->setAtomProperties(gmx::constArrayRefFromArray(mdatoms->typeA, mdatoms->nr),
1540
+ gmx::constArrayRefFromArray(mdatoms->chargeA, mdatoms->nr),
1541
+ fr->atomInfo);
1542
+
1543
+ wallcycle_stop(wcycle, WallCycleCounter::NS);
1544
+
1545
+ /* initialize the GPU nbnxm atom data and bonded data structures */
1546
+ if (simulationWork.useGpuNonbonded)
1547
+ {
1548
+ // Note: cycle counting only nonbondeds, GPU listed forces counts internally
1549
+ wallcycle_start_nocount(wcycle, WallCycleCounter::LaunchGpu);
1550
+ wallcycle_sub_start_nocount(wcycle, WallCycleSubCounter::LaunchGpuNonBonded);
1551
+ Nbnxm::gpu_init_atomdata(nbv->gpu_nbv, nbv->nbat.get());
1552
+ wallcycle_sub_stop(wcycle, WallCycleSubCounter::LaunchGpuNonBonded);
1553
+ wallcycle_stop(wcycle, WallCycleCounter::LaunchGpu);
1554
+
1555
+ if (fr->listedForcesGpu)
1556
+ {
1557
+ /* Now we put all atoms on the grid, we can assign bonded
1558
+ * interactions to the GPU, where the grid order is
1559
+ * needed. Also the xq, f and fshift device buffers have
1560
+ * been reallocated if needed, so the bonded code can
1561
+ * learn about them. */
1562
+ // TODO the xq, f, and fshift buffers are now shared
1563
+ // resources, so they should be maintained by a
1564
+ // higher-level object than the nb module.
1565
+ fr->listedForcesGpu->updateInteractionListsAndDeviceBuffers(
1566
+ nbv->getGridIndices(),
1567
+ top->idef,
1568
+ Nbnxm::gpu_get_xq(nbv->gpu_nbv),
1569
+ Nbnxm::gpu_get_f(nbv->gpu_nbv),
1570
+ Nbnxm::gpu_get_fshift(nbv->gpu_nbv));
1571
+ }
1572
+ }
1573
+
1574
+ // Need to run after the GPU-offload bonded interaction lists
1575
+ // are set up to be able to determine whether there is bonded work.
1576
+ runScheduleWork->domainWork = setupDomainLifetimeWorkload(
1577
+ inputrec, *fr, pull_work, ed, *mdatoms, simulationWork, stepWork);
1578
+
1579
+ wallcycle_start_nocount(wcycle, WallCycleCounter::NS);
1580
+ wallcycle_sub_start(wcycle, WallCycleSubCounter::NBSSearchLocal);
1581
+ /* Note that with a GPU the launch overhead of the list transfer is not timed separately */
1582
+ nbv->constructPairlist(InteractionLocality::Local, top->excls, step, nrnb);
1583
+
1584
+ nbv->setupGpuShortRangeWork(fr->listedForcesGpu.get(), InteractionLocality::Local);
1585
+
1586
+ wallcycle_sub_stop(wcycle, WallCycleSubCounter::NBSSearchLocal);
1587
+ wallcycle_stop(wcycle, WallCycleCounter::NS);
1588
+
1589
+ if (simulationWork.useGpuXBufferOps)
1590
+ {
1591
+ nbv->atomdata_init_copy_x_to_nbat_x_gpu();
1592
+ }
1593
+
1594
+ if (simulationWork.useGpuFBufferOps)
1595
+ {
1596
+ setupLocalGpuForceReduction(runScheduleWork,
1597
+ fr->nbv.get(),
1598
+ stateGpu,
1599
+ fr->gpuForceReduction[gmx::AtomLocality::Local].get(),
1600
+ fr->pmePpCommGpu.get(),
1601
+ fr->pmedata,
1602
+ cr->dd);
1603
+ if (runScheduleWork->simulationWork.havePpDomainDecomposition)
1604
+ {
1605
+ setupNonLocalGpuForceReduction(runScheduleWork,
1606
+ fr->nbv.get(),
1607
+ stateGpu,
1608
+ fr->gpuForceReduction[gmx::AtomLocality::NonLocal].get(),
1609
+ cr->dd);
1610
+ }
1611
+ }
1612
+ }
1613
+ else if (!EI_TPI(inputrec.eI) && stepWork.computeNonbondedForces)
1614
+ {
1615
+ if (stepWork.useGpuXBufferOps)
1616
+ {
1617
+ GMX_ASSERT(stateGpu, "stateGpu should be valid when buffer ops are offloaded");
1618
+ nbv->convertCoordinatesGpu(AtomLocality::Local, stateGpu->getCoordinates(), localXReadyOnDevice);
1619
+ }
1620
+ else
1621
+ {
1622
+ if (simulationWork.useGpuUpdate)
1623
+ {
1624
+ GMX_ASSERT(stateGpu, "need a valid stateGpu object");
1625
+ GMX_ASSERT(haveCopiedXFromGpu,
1626
+ "a wait should only be triggered if copy has been scheduled");
1627
+ stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
1628
+ }
1629
+ nbv->convertCoordinates(AtomLocality::Local, x.unpaddedArrayRef());
1630
+ }
1631
+ }
1632
+
1633
+ if (simulationWork.useGpuNonbonded && (stepWork.computeNonbondedForces || domainWork.haveGpuBondedWork))
1634
+ {
1635
+ ddBalanceRegionHandler.openBeforeForceComputationGpu();
1636
+
1637
+ wallcycle_start(wcycle, WallCycleCounter::LaunchGpu);
1638
+ wallcycle_sub_start(wcycle, WallCycleSubCounter::LaunchGpuNonBonded);
1639
+ Nbnxm::gpu_upload_shiftvec(nbv->gpu_nbv, nbv->nbat.get());
1640
+ if (!stepWork.useGpuXBufferOps)
1641
+ {
1642
+ Nbnxm::gpu_copy_xq_to_gpu(nbv->gpu_nbv, nbv->nbat.get(), AtomLocality::Local);
1643
+ }
1644
+ wallcycle_sub_stop(wcycle, WallCycleSubCounter::LaunchGpuNonBonded);
1645
+ wallcycle_stop(wcycle, WallCycleCounter::LaunchGpu);
1646
+ // with X buffer ops offloaded to the GPU on all but the search steps
1647
+
1648
+ // bonded work not split into separate local and non-local, so with DD
1649
+ // we can only launch the kernel after non-local coordinates have been received.
1650
+ if (domainWork.haveGpuBondedWork && !simulationWork.havePpDomainDecomposition)
1651
+ {
1652
+ fr->listedForcesGpu->setPbcAndlaunchKernel(fr->pbcType, box, fr->bMolPBC, stepWork);
1653
+ }
1654
+
1655
+ /* launch local nonbonded work on GPU */
1656
+ wallcycle_start_nocount(wcycle, WallCycleCounter::LaunchGpu);
1657
+ wallcycle_sub_start_nocount(wcycle, WallCycleSubCounter::LaunchGpuNonBonded);
1658
+ do_nb_verlet(fr, ic, enerd, stepWork, InteractionLocality::Local, enbvClearFNo, step, nrnb, wcycle);
1659
+ wallcycle_sub_stop(wcycle, WallCycleSubCounter::LaunchGpuNonBonded);
1660
+ wallcycle_stop(wcycle, WallCycleCounter::LaunchGpu);
1661
+ }
1662
+
1663
+ if (stepWork.haveGpuPmeOnThisRank)
1664
+ {
1665
+ // In PME GPU and mixed mode we launch FFT / gather after the
1666
+ // X copy/transform to allow overlap as well as after the GPU NB
1667
+ // launch to avoid FFT launch overhead hijacking the CPU and delaying
1668
+ // the nonbonded kernel.
1669
+ launchPmeGpuFftAndGather(fr->pmedata,
1670
+ lambda[static_cast<int>(FreeEnergyPerturbationCouplingType::Coul)],
1671
+ wcycle,
1672
+ stepWork);
1673
+ }
1674
+
1675
+ /* Communicate coordinates and sum dipole if necessary +
1676
+ do non-local pair search */
1677
+ if (simulationWork.havePpDomainDecomposition)
1678
+ {
1679
+ if (stepWork.doNeighborSearch)
1680
+ {
1681
+ // TODO: fuse this branch with the above large stepWork.doNeighborSearch block
1682
+ wallcycle_start_nocount(wcycle, WallCycleCounter::NS);
1683
+ wallcycle_sub_start(wcycle, WallCycleSubCounter::NBSSearchNonLocal);
1684
+ /* Note that with a GPU the launch overhead of the list transfer is not timed separately */
1685
+ nbv->constructPairlist(InteractionLocality::NonLocal, top->excls, step, nrnb);
1686
+
1687
+ nbv->setupGpuShortRangeWork(fr->listedForcesGpu.get(), InteractionLocality::NonLocal);
1688
+ wallcycle_sub_stop(wcycle, WallCycleSubCounter::NBSSearchNonLocal);
1689
+ wallcycle_stop(wcycle, WallCycleCounter::NS);
1690
+ // TODO refactor this GPU halo exchange re-initialisation
1691
+ // to location in do_md where GPU halo exchange is
1692
+ // constructed at partitioning, after above stateGpu
1693
+ // re-initialization has similarly been refactored
1694
+ if (simulationWork.useGpuHaloExchange)
1695
+ {
1696
+ reinitGpuHaloExchange(*cr, stateGpu->getCoordinates(), stateGpu->getForces());
1697
+ }
1698
+ }
1699
+ else
1700
+ {
1701
+ GpuEventSynchronizer* gpuCoordinateHaloLaunched = nullptr;
1702
+ if (stepWork.useGpuXHalo)
1703
+ {
1704
+ // The following must be called after local setCoordinates (which records an event
1705
+ // when the coordinate data has been copied to the device).
1706
+ gpuCoordinateHaloLaunched = communicateGpuHaloCoordinates(*cr, box, localXReadyOnDevice);
1707
+
1708
+ if (domainWork.haveCpuBondedWork || domainWork.haveFreeEnergyWork)
1709
+ {
1710
+ // non-local part of coordinate buffer must be copied back to host for CPU work
1711
+ stateGpu->copyCoordinatesFromGpu(
1712
+ x.unpaddedArrayRef(), AtomLocality::NonLocal, gpuCoordinateHaloLaunched);
1713
+ }
1714
+ }
1715
+ else
1716
+ {
1717
+ if (simulationWork.useGpuUpdate)
1718
+ {
1719
+ GMX_ASSERT(haveCopiedXFromGpu,
1720
+ "a wait should only be triggered if copy has been scheduled");
1721
+ const bool haveAlreadyWaited =
1722
+ (stepWork.computePmeOnSeparateRank && !pmeSendCoordinatesFromGpu);
1723
+ if (!haveAlreadyWaited)
1724
+ {
1725
+ stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
1726
+ }
1727
+ }
1728
+ dd_move_x(cr->dd, box, x.unpaddedArrayRef(), wcycle);
1729
+ }
1730
+
1731
+ if (stepWork.useGpuXBufferOps)
1732
+ {
1733
+ if (!stepWork.useGpuXHalo)
1734
+ {
1735
+ stateGpu->copyCoordinatesToGpu(x.unpaddedArrayRef(), AtomLocality::NonLocal);
1736
+ }
1737
+ nbv->convertCoordinatesGpu(
1738
+ AtomLocality::NonLocal,
1739
+ stateGpu->getCoordinates(),
1740
+ stateGpu->getCoordinatesReadyOnDeviceEvent(
1741
+ AtomLocality::NonLocal, simulationWork, stepWork, gpuCoordinateHaloLaunched));
1742
+ }
1743
+ else
1744
+ {
1745
+ nbv->convertCoordinates(AtomLocality::NonLocal, x.unpaddedArrayRef());
1746
+ }
1747
+ }
1748
+
1749
+ if (simulationWork.useGpuNonbonded)
1750
+ {
1751
+
1752
+ if (!stepWork.useGpuXBufferOps)
1753
+ {
1754
+ wallcycle_start(wcycle, WallCycleCounter::LaunchGpu);
1755
+ wallcycle_sub_start(wcycle, WallCycleSubCounter::LaunchGpuNonBonded);
1756
+ Nbnxm::gpu_copy_xq_to_gpu(nbv->gpu_nbv, nbv->nbat.get(), AtomLocality::NonLocal);
1757
+ wallcycle_sub_stop(wcycle, WallCycleSubCounter::LaunchGpuNonBonded);
1758
+ wallcycle_stop(wcycle, WallCycleCounter::LaunchGpu);
1759
+ }
1760
+
1761
+ if (domainWork.haveGpuBondedWork)
1762
+ {
1763
+ fr->listedForcesGpu->setPbcAndlaunchKernel(fr->pbcType, box, fr->bMolPBC, stepWork);
1764
+ }
1765
+
1766
+ /* launch non-local nonbonded tasks on GPU */
1767
+ wallcycle_start_nocount(wcycle, WallCycleCounter::LaunchGpu);
1768
+ wallcycle_sub_start(wcycle, WallCycleSubCounter::LaunchGpuNonBonded);
1769
+ do_nb_verlet(fr, ic, enerd, stepWork, InteractionLocality::NonLocal, enbvClearFNo, step, nrnb, wcycle);
1770
+ wallcycle_sub_stop(wcycle, WallCycleSubCounter::LaunchGpuNonBonded);
1771
+ wallcycle_stop(wcycle, WallCycleCounter::LaunchGpu);
1772
+ }
1773
+ }
1774
+
1775
+ // With FEP we set up the reduction over threads for local+non-local simultaneously,
1776
+ // so we need to do that here after the local and non-local pairlist construction.
1777
+ if (stepWork.doNeighborSearch && fr->efep != FreeEnergyPerturbationType::No)
1778
+ {
1779
+ wallcycle_sub_start(wcycle, WallCycleSubCounter::NonbondedFep);
1780
+ nbv->setupFepThreadedForceBuffer(fr->natoms_force_constr);
1781
+ wallcycle_sub_stop(wcycle, WallCycleSubCounter::NonbondedFep);
1782
+ }
1783
+
1784
+ if (simulationWork.useGpuNonbonded && stepWork.computeNonbondedForces)
1785
+ {
1786
+ /* launch D2H copy-back F */
1787
+ wallcycle_start_nocount(wcycle, WallCycleCounter::LaunchGpu);
1788
+ wallcycle_sub_start_nocount(wcycle, WallCycleSubCounter::LaunchGpuNonBonded);
1789
+
1790
+ if (simulationWork.havePpDomainDecomposition)
1791
+ {
1792
+ Nbnxm::gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat.get(), stepWork, AtomLocality::NonLocal);
1793
+ }
1794
+ Nbnxm::gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat.get(), stepWork, AtomLocality::Local);
1795
+ wallcycle_sub_stop(wcycle, WallCycleSubCounter::LaunchGpuNonBonded);
1796
+
1797
+ if (domainWork.haveGpuBondedWork && stepWork.computeEnergy)
1798
+ {
1799
+ fr->listedForcesGpu->launchEnergyTransfer();
1800
+ }
1801
+ wallcycle_stop(wcycle, WallCycleCounter::LaunchGpu);
1802
+ }
1803
+
1804
+ gmx::ArrayRef<const gmx::RVec> xWholeMolecules;
1805
+ if (fr->wholeMoleculeTransform)
1806
+ {
1807
+ xWholeMolecules = fr->wholeMoleculeTransform->wholeMoleculeCoordinates(x.unpaddedArrayRef(), box);
1808
+ }
1809
+
1810
+ // For the rest of the CPU tasks that depend on GPU-update produced coordinates,
1811
+ // this wait ensures that the D2H transfer is complete.
1812
+ if (simulationWork.useGpuUpdate && !stepWork.doNeighborSearch)
1813
+ {
1814
+ const bool needCoordsOnHost = (runScheduleWork->domainWork.haveCpuLocalForceWork
1815
+ || stepWork.computeVirial || simulationWork.computeMuTot);
1816
+ const bool haveAlreadyWaited =
1817
+ simulationWork.useCpuHaloExchange
1818
+ || (stepWork.computePmeOnSeparateRank && !pmeSendCoordinatesFromGpu);
1819
+ if (needCoordsOnHost && !haveAlreadyWaited)
1820
+ {
1821
+ GMX_ASSERT(haveCopiedXFromGpu,
1822
+ "a wait should only be triggered if copy has been scheduled");
1823
+ stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
1824
+ }
1825
+ }
1826
+
1827
+ DipoleData dipoleData;
1828
+
1829
+ if (simulationWork.computeMuTot)
1830
+ {
1831
+ const int start = 0;
1832
+
1833
+ /* Calculate total (local) dipole moment in a temporary common array.
1834
+ * This makes it possible to sum them over nodes faster.
1835
+ */
1836
+ gmx::ArrayRef<const gmx::RVec> xRef =
1837
+ (xWholeMolecules.empty() ? x.unpaddedArrayRef() : xWholeMolecules);
1838
+ calc_mu(start,
1839
+ mdatoms->homenr,
1840
+ xRef,
1841
+ mdatoms->chargeA ? gmx::arrayRefFromArray(mdatoms->chargeA, mdatoms->nr)
1842
+ : gmx::ArrayRef<real>{},
1843
+ mdatoms->chargeB ? gmx::arrayRefFromArray(mdatoms->chargeB, mdatoms->nr)
1844
+ : gmx::ArrayRef<real>{},
1845
+ mdatoms->nChargePerturbed != 0,
1846
+ dipoleData.muStaging[0],
1847
+ dipoleData.muStaging[1]);
1848
+
1849
+ reduceAndUpdateMuTot(
1850
+ &dipoleData, cr, (fr->efep != FreeEnergyPerturbationType::No), lambda, muTotal, ddBalanceRegionHandler);
1851
+ }
1852
+
1853
+ /* Reset energies */
1854
+ reset_enerdata(enerd);
1855
+
1856
+ if (haveDDAtomOrdering(*cr) && simulationWork.haveSeparatePmeRank)
1857
+ {
1858
+ wallcycle_start(wcycle, WallCycleCounter::PpDuringPme);
1859
+ dd_force_flop_start(cr->dd, nrnb);
1860
+ }
1861
+
1862
+ if (inputrec.bRot)
1863
+ {
1864
+ wallcycle_start(wcycle, WallCycleCounter::Rot);
1865
+ do_rotation(cr, enforcedRotation, box, x.unpaddedConstArrayRef(), t, step, stepWork.doNeighborSearch);
1866
+ wallcycle_stop(wcycle, WallCycleCounter::Rot);
1867
+ }
1868
+
1869
+ /* Start the force cycle counter.
1870
+ * Note that a different counter is used for dynamic load balancing.
1871
+ */
1872
+ wallcycle_start(wcycle, WallCycleCounter::Force);
1873
+
1874
+ /* Set up and clear force outputs:
1875
+ * forceOutMtsLevel0: everything except what is in the other two outputs
1876
+ * forceOutMtsLevel1: PME-mesh and listed-forces group 1
1877
+ * forceOutNonbonded: non-bonded forces
1878
+ * Without multiple time stepping all point to the same object.
1879
+ * With multiple time-stepping the use is different for MTS fast (level0 only) and slow steps.
1880
+ */
1881
+ ForceOutputs forceOutMtsLevel0 = setupForceOutputs(
1882
+ &fr->forceHelperBuffers[0], force, domainWork, stepWork, simulationWork.havePpDomainDecomposition, wcycle);
1883
+
1884
+ // Force output for MTS combined forces, only set at level1 MTS steps
1885
+ std::optional<ForceOutputs> forceOutMts =
1886
+ (simulationWork.useMts && stepWork.computeSlowForces)
1887
+ ? std::optional(setupForceOutputs(&fr->forceHelperBuffers[1],
1888
+ forceView->forceMtsCombinedWithPadding(),
1889
+ domainWork,
1890
+ stepWork,
1891
+ simulationWork.havePpDomainDecomposition,
1892
+ wcycle))
1893
+ : std::nullopt;
1894
+
1895
+ ForceOutputs* forceOutMtsLevel1 =
1896
+ simulationWork.useMts ? (stepWork.computeSlowForces ? &forceOutMts.value() : nullptr)
1897
+ : &forceOutMtsLevel0;
1898
+
1899
+ const bool nonbondedAtMtsLevel1 = runScheduleWork->simulationWork.computeNonbondedAtMtsLevel1;
1900
+
1901
+ ForceOutputs* forceOutNonbonded = nonbondedAtMtsLevel1 ? forceOutMtsLevel1 : &forceOutMtsLevel0;
1902
+
1903
+ if (inputrec.bPull && pull_have_constraint(*pull_work))
1904
+ {
1905
+ clear_pull_forces(pull_work);
1906
+ }
1907
+
1908
+ /* We calculate the non-bonded forces, when done on the CPU, here.
1909
+ * We do this before calling do_force_lowlevel, because in that
1910
+ * function, the listed forces are calculated before PME, which
1911
+ * does communication. With this order, non-bonded and listed
1912
+ * force calculation imbalance can be balanced out by the domain
1913
+ * decomposition load balancing.
1914
+ */
1915
+
1916
+ const bool useOrEmulateGpuNb = simulationWork.useGpuNonbonded || fr->nbv->emulateGpu();
1917
+
1918
+ if (!useOrEmulateGpuNb)
1919
+ {
1920
+ do_nb_verlet(fr, ic, enerd, stepWork, InteractionLocality::Local, enbvClearFYes, step, nrnb, wcycle);
1921
+ }
1922
+
1923
+ // TODO Force flags should include haveFreeEnergyWork for this domain
1924
+ if (stepWork.useGpuXHalo && (domainWork.haveCpuBondedWork || domainWork.haveFreeEnergyWork))
1925
+ {
1926
+ wallcycle_stop(wcycle, WallCycleCounter::Force);
1927
+ /* Wait for non-local coordinate data to be copied from device */
1928
+ stateGpu->waitCoordinatesReadyOnHost(AtomLocality::NonLocal);
1929
+ wallcycle_start_nocount(wcycle, WallCycleCounter::Force);
1930
+ }
1931
+
1932
+ if (fr->efep != FreeEnergyPerturbationType::No && stepWork.computeNonbondedForces)
1933
+ {
1934
+ /* Calculate the local and non-local free energy interactions here.
1935
+ * Happens here on the CPU both with and without GPU.
1936
+ */
1937
+ nbv->dispatchFreeEnergyKernels(
1938
+ x,
1939
+ &forceOutNonbonded->forceWithShiftForces(),
1940
+ fr->use_simd_kernels,
1941
+ fr->ntype,
1942
+ fr->rlist,
1943
+ max_cutoff2(inputrec.pbcType, box),
1944
+ *fr->ic,
1945
+ fr->shift_vec,
1946
+ fr->nbfp,
1947
+ fr->ljpme_c6grid,
1948
+ mdatoms->chargeA ? gmx::arrayRefFromArray(mdatoms->chargeA, mdatoms->nr)
1949
+ : gmx::ArrayRef<real>{},
1950
+ mdatoms->chargeB ? gmx::arrayRefFromArray(mdatoms->chargeB, mdatoms->nr)
1951
+ : gmx::ArrayRef<real>{},
1952
+ mdatoms->typeA ? gmx::arrayRefFromArray(mdatoms->typeA, mdatoms->nr)
1953
+ : gmx::ArrayRef<int>{},
1954
+ mdatoms->typeB ? gmx::arrayRefFromArray(mdatoms->typeB, mdatoms->nr)
1955
+ : gmx::ArrayRef<int>{},
1956
+ inputrec.fepvals.get(),
1957
+ lambda,
1958
+ enerd,
1959
+ stepWork,
1960
+ nrnb);
1961
+ }
1962
+
1963
+ if (stepWork.computeNonbondedForces && !useOrEmulateGpuNb)
1964
+ {
1965
+ if (simulationWork.havePpDomainDecomposition)
1966
+ {
1967
+ do_nb_verlet(fr, ic, enerd, stepWork, InteractionLocality::NonLocal, enbvClearFNo, step, nrnb, wcycle);
1968
+ }
1969
+
1970
+ if (stepWork.computeForces)
1971
+ {
1972
+ /* Add all the non-bonded forces to the normal force array.
1973
+ * This can be split into a local and a non-local part when overlapping
1974
+ * communication with computation under domain decomposition.
1975
+ */
1976
+ wallcycle_stop(wcycle, WallCycleCounter::Force);
1977
+ nbv->atomdata_add_nbat_f_to_f(AtomLocality::All,
1978
+ forceOutNonbonded->forceWithShiftForces().force());
1979
+ wallcycle_start_nocount(wcycle, WallCycleCounter::Force);
1980
+ }
1981
+
1982
+ /* If there are multiple fshift output buffers we need to reduce them */
1983
+ if (stepWork.computeVirial)
1984
+ {
1985
+ /* This is not in a subcounter because it takes a
1986
+ negligible and constant-sized amount of time */
1987
+ nbnxn_atomdata_add_nbat_fshift_to_fshift(
1988
+ *nbv->nbat, forceOutNonbonded->forceWithShiftForces().shiftForces());
1989
+ }
1990
+ }
1991
+
1992
+ // Compute wall interactions, when present.
1993
+ // Note: should be moved to special forces.
1994
+ if (inputrec.nwall && stepWork.computeNonbondedForces)
1995
+ {
1996
+ /* foreign lambda component for walls */
1997
+ real dvdl_walls = do_walls(inputrec,
1998
+ *fr,
1999
+ box,
2000
+ mdatoms->typeA ? gmx::arrayRefFromArray(mdatoms->typeA, mdatoms->nr)
2001
+ : gmx::ArrayRef<int>{},
2002
+ mdatoms->typeB ? gmx::arrayRefFromArray(mdatoms->typeB, mdatoms->nr)
2003
+ : gmx::ArrayRef<int>{},
2004
+ mdatoms->cENER ? gmx::arrayRefFromArray(mdatoms->cENER, mdatoms->nr)
2005
+ : gmx::ArrayRef<unsigned short>{},
2006
+ mdatoms->homenr,
2007
+ mdatoms->nPerturbed,
2008
+ x.unpaddedConstArrayRef(),
2009
+ &forceOutMtsLevel0.forceWithVirial(),
2010
+ lambda[static_cast<int>(FreeEnergyPerturbationCouplingType::Vdw)],
2011
+ enerd->grpp.energyGroupPairTerms[NonBondedEnergyTerms::LJSR],
2012
+ nrnb);
2013
+ enerd->dvdl_lin[FreeEnergyPerturbationCouplingType::Vdw] += dvdl_walls;
2014
+ }
2015
+
2016
+ if (stepWork.computeListedForces)
2017
+ {
2018
+ /* Check whether we need to take into account PBC in listed interactions */
2019
+ bool needMolPbc = false;
2020
+ for (const auto& listedForces : fr->listedForces)
2021
+ {
2022
+ if (listedForces.haveCpuListedForces(*fr->fcdata))
2023
+ {
2024
+ needMolPbc = fr->bMolPBC;
2025
+ }
2026
+ }
2027
+
2028
+ t_pbc pbc;
2029
+
2030
+ if (needMolPbc)
2031
+ {
2032
+ /* Since all atoms are in the rectangular or triclinic unit-cell,
2033
+ * only single box vector shifts (2 in x) are required.
2034
+ */
2035
+ set_pbc_dd(&pbc, fr->pbcType, haveDDAtomOrdering(*cr) ? cr->dd->numCells : nullptr, TRUE, box);
2036
+ }
2037
+
2038
+ for (int mtsIndex = 0; mtsIndex < (simulationWork.useMts && stepWork.computeSlowForces ? 2 : 1);
2039
+ mtsIndex++)
2040
+ {
2041
+ ListedForces& listedForces = fr->listedForces[mtsIndex];
2042
+ ForceOutputs& forceOut = (mtsIndex == 0 ? forceOutMtsLevel0 : *forceOutMtsLevel1);
2043
+ listedForces.calculate(wcycle,
2044
+ box,
2045
+ inputrec.fepvals.get(),
2046
+ cr,
2047
+ ms,
2048
+ x,
2049
+ xWholeMolecules,
2050
+ fr->fcdata.get(),
2051
+ hist,
2052
+ &forceOut,
2053
+ fr,
2054
+ &pbc,
2055
+ enerd,
2056
+ nrnb,
2057
+ lambda,
2058
+ mdatoms,
2059
+ haveDDAtomOrdering(*cr) ? cr->dd->globalAtomIndices.data() : nullptr,
2060
+ stepWork);
2061
+ }
2062
+ }
2063
+
2064
+ if (stepWork.computeSlowForces)
2065
+ {
2066
+ longRangeNonbondeds->calculate(fr->pmedata,
2067
+ cr,
2068
+ x.unpaddedConstArrayRef(),
2069
+ &forceOutMtsLevel1->forceWithVirial(),
2070
+ enerd,
2071
+ box,
2072
+ lambda,
2073
+ dipoleData.muStateAB,
2074
+ stepWork,
2075
+ ddBalanceRegionHandler);
2076
+ }
2077
+
2078
+ /* PLUMED */
2079
+ if(plumedswitch){
2080
+ int plumedNeedsEnergy;
2081
+ plumed_cmd(plumedmain,"isEnergyNeeded",&plumedNeedsEnergy);
2082
+ if(!plumedNeedsEnergy) plumed_cmd(plumedmain,"performCalc",nullptr);
2083
+ }
2084
+ /* END PLUMED */
2085
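
The hunk above is where the PLUMED patch hooks into the force schedule: GROMACS asks the PLUMED kernel whether it still needs the potential energy this step ("isEnergyNeeded") and, if not, triggers the bias evaluation right away ("performCalc"). The same command interface is also exposed through PLUMED's Python wrapper (`import plumed`); the following is a minimal sketch of driving it directly from Python, with a made-up two-atom system, timestep and input lines used purely for illustration:

    import numpy as np
    import plumed  # PLUMED's Python wrapper

    p = plumed.Plumed()                    # create a PLUMED kernel instance
    p.cmd("setMDEngine", "python")
    p.cmd("setNatoms", 2)                  # illustrative two-atom system
    p.cmd("setTimestep", 0.002)            # ps, arbitrary example value
    p.cmd("init")
    p.cmd("readInputLine", "d: DISTANCE ATOMS=1,2")
    p.cmd("readInputLine", "PRINT ARG=d FILE=COLVAR")

    # Per-step data that an MD engine would normally provide
    positions = np.array([[0.0, 0.0, 0.0], [0.1, 0.0, 0.0]])
    forces    = np.zeros((2, 3))
    masses    = np.array([12.0, 12.0])
    box       = np.diag([3.0, 3.0, 3.0])
    virial    = np.zeros((3, 3))

    p.cmd("setStep", 0)
    p.cmd("setBox", box)
    p.cmd("setMasses", masses)
    p.cmd("setPositions", positions)
    p.cmd("setForces", forces)
    p.cmd("setVirial", virial)
    p.cmd("calc")                          # prepareCalc + performCalc in one call
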
+
2086
+ wallcycle_stop(wcycle, WallCycleCounter::Force);
2087
+
2088
+ // VdW dispersion correction, only computed on master rank to avoid double counting
2089
+ if ((stepWork.computeEnergy || stepWork.computeVirial) && fr->dispersionCorrection && MASTER(cr))
2090
+ {
2091
+ // Calculate long range corrections to pressure and energy
2092
+ const DispersionCorrection::Correction correction = fr->dispersionCorrection->calculate(
2093
+ box, lambda[static_cast<int>(FreeEnergyPerturbationCouplingType::Vdw)]);
2094
+
2095
+ if (stepWork.computeEnergy)
2096
+ {
2097
+ enerd->term[F_DISPCORR] = correction.energy;
2098
+ enerd->term[F_DVDL_VDW] += correction.dvdl;
2099
+ enerd->dvdl_lin[FreeEnergyPerturbationCouplingType::Vdw] += correction.dvdl;
2100
+ }
2101
+ if (stepWork.computeVirial)
2102
+ {
2103
+ correction.correctVirial(vir_force);
2104
+ enerd->term[F_PDISPCORR] = correction.pressure;
2105
+ }
2106
+ }
2107
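
For reference, the energy part of this correction is the familiar homogeneous-fluid tail integral: assuming g(r) ~ 1 beyond the cutoff, U_disp ~ -(2/3) * pi * N * rho * <C6> * rc^-3, with matching virial/pressure and dC6/dlambda terms, which is why the block above also fills F_PDISPCORR and the Vdw dvdl accumulator. This is only the textbook form; the DispersionCorrection class handles the exact per-topology averaging.
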
+
2108
+ const bool needToReceivePmeResultsFromSeparateRank = (PAR(cr) && stepWork.computePmeOnSeparateRank);
2109
+ const bool needToReceivePmeResults =
2110
+ (stepWork.haveGpuPmeOnThisRank || needToReceivePmeResultsFromSeparateRank);
2111
+
2112
+ /* When running free energy perturbations steered by AWH and doing PME calculations on the
2113
+ * GPU we must wait for the PME calculation (dhdl) results to finish before sampling the
2114
+ * FEP dimension with AWH. */
2115
+ const bool needEarlyPmeResults = (awh != nullptr && awh->hasFepLambdaDimension() && needToReceivePmeResults
2116
+ && stepWork.computeEnergy && stepWork.computeSlowForces);
2117
+ if (needEarlyPmeResults)
2118
+ {
2119
+ if (stepWork.haveGpuPmeOnThisRank)
2120
+ {
2121
+ pme_gpu_wait_and_reduce(fr->pmedata,
2122
+ stepWork,
2123
+ wcycle,
2124
+ &forceOutMtsLevel1->forceWithVirial(),
2125
+ enerd,
2126
+ lambda[static_cast<int>(FreeEnergyPerturbationCouplingType::Coul)]);
2127
+ }
2128
+ else if (needToReceivePmeResultsFromSeparateRank)
2129
+ {
2130
+ /* In case of node-splitting, the PP nodes receive the long-range
2131
+ * forces, virial and energy from the PME nodes here.
2132
+ */
2133
+ pme_receive_force_ener(fr,
2134
+ cr,
2135
+ &forceOutMtsLevel1->forceWithVirial(),
2136
+ enerd,
2137
+ simulationWork.useGpuPmePpCommunication,
2138
+ stepWork.useGpuPmeFReduction,
2139
+ wcycle);
2140
+ }
2141
+ }
2142
+
2143
+ computeSpecialForces(fplog,
2144
+ cr,
2145
+ inputrec,
2146
+ awh,
2147
+ enforcedRotation,
2148
+ imdSession,
2149
+ pull_work,
2150
+ step,
2151
+ t,
2152
+ wcycle,
2153
+ fr->forceProviders,
2154
+ box,
2155
+ x.unpaddedArrayRef(),
2156
+ mdatoms,
2157
+ lambda,
2158
+ stepWork,
2159
+ &forceOutMtsLevel0.forceWithVirial(),
2160
+ forceOutMtsLevel1 ? &forceOutMtsLevel1->forceWithVirial() : nullptr,
2161
+ enerd,
2162
+ ed,
2163
+ stepWork.doNeighborSearch);
2164
+
2165
+ if (simulationWork.havePpDomainDecomposition && stepWork.computeForces && stepWork.useGpuFHalo
2166
+ && domainWork.haveCpuLocalForceWork)
2167
+ {
2168
+ stateGpu->copyForcesToGpu(forceOutMtsLevel0.forceWithShiftForces().force(), AtomLocality::Local);
2169
+ }
2170
+
2171
+ GMX_ASSERT(!(nonbondedAtMtsLevel1 && stepWork.useGpuFBufferOps),
2172
+ "The schedule below does not allow for nonbonded MTS with GPU buffer ops");
2173
+ GMX_ASSERT(!(nonbondedAtMtsLevel1 && stepWork.useGpuFHalo),
2174
+ "The schedule below does not allow for nonbonded MTS with GPU halo exchange");
2175
+ // Will store the number of cycles spent waiting for the GPU, which
2176
+ // will later be used in the DLB accounting.
2177
+ float cycles_wait_gpu = 0;
2178
+ if (useOrEmulateGpuNb && stepWork.computeNonbondedForces)
2179
+ {
2180
+ auto& forceWithShiftForces = forceOutNonbonded->forceWithShiftForces();
2181
+
2182
+ /* wait for non-local forces (or calculate in emulation mode) */
2183
+ if (simulationWork.havePpDomainDecomposition)
2184
+ {
2185
+ if (simulationWork.useGpuNonbonded)
2186
+ {
2187
+ cycles_wait_gpu += Nbnxm::gpu_wait_finish_task(
2188
+ nbv->gpu_nbv,
2189
+ stepWork,
2190
+ AtomLocality::NonLocal,
2191
+ enerd->grpp.energyGroupPairTerms[NonBondedEnergyTerms::LJSR].data(),
2192
+ enerd->grpp.energyGroupPairTerms[NonBondedEnergyTerms::CoulombSR].data(),
2193
+ forceWithShiftForces.shiftForces(),
2194
+ wcycle);
2195
+ }
2196
+ else
2197
+ {
2198
+ wallcycle_start_nocount(wcycle, WallCycleCounter::Force);
2199
+ do_nb_verlet(
2200
+ fr, ic, enerd, stepWork, InteractionLocality::NonLocal, enbvClearFYes, step, nrnb, wcycle);
2201
+ wallcycle_stop(wcycle, WallCycleCounter::Force);
2202
+ }
2203
+
2204
+ if (stepWork.useGpuFBufferOps)
2205
+ {
2206
+ if (domainWork.haveNonLocalForceContribInCpuBuffer)
2207
+ {
2208
+ stateGpu->copyForcesToGpu(forceOutMtsLevel0.forceWithShiftForces().force(),
2209
+ AtomLocality::NonLocal);
2210
+ }
2211
+
2212
+
2213
+ fr->gpuForceReduction[gmx::AtomLocality::NonLocal]->execute();
2214
+
2215
+ if (!stepWork.useGpuFHalo)
2216
+ {
2217
+ /* We don't explicitly wait for the forces to be reduced on device,
2218
+ * but wait for them to finish copying to CPU instead.
2219
+ * So, we manually consume the event, see Issue #3988. */
2220
+ stateGpu->consumeForcesReducedOnDeviceEvent(AtomLocality::NonLocal);
2221
+ // copy from GPU input for dd_move_f()
2222
+ stateGpu->copyForcesFromGpu(forceOutMtsLevel0.forceWithShiftForces().force(),
2223
+ AtomLocality::NonLocal);
2224
+ }
2225
+ }
2226
+ else
2227
+ {
2228
+ nbv->atomdata_add_nbat_f_to_f(AtomLocality::NonLocal, forceWithShiftForces.force());
2229
+ }
2230
+
2231
+ if (fr->nbv->emulateGpu() && stepWork.computeVirial)
2232
+ {
2233
+ nbnxn_atomdata_add_nbat_fshift_to_fshift(*nbv->nbat, forceWithShiftForces.shiftForces());
2234
+ }
2235
+ }
2236
+ }
2237
+
2238
+ /* Combining the forces for multiple time stepping before the halo exchange, when possible,
2239
+ * avoids an extra halo exchange (when DD is used) and post-processing step.
2240
+ */
2241
+ if (stepWork.combineMtsForcesBeforeHaloExchange)
2242
+ {
2243
+ combineMtsForces(getLocalAtomCount(cr->dd, *mdatoms, simulationWork.havePpDomainDecomposition),
2244
+ force.unpaddedArrayRef(),
2245
+ forceView->forceMtsCombined(),
2246
+ inputrec.mtsLevels[1].stepFactor);
2247
+ }
2248
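
combineMtsForces folds the level-1 (slow) contribution into the fast force buffer before the halo exchange, scaled by the MTS step factor so that, averaged over one fast/slow cycle, the impulse of the slow forces is preserved. A rough Python sketch of that scaling (array names are illustrative, not the GROMACS API):

    def combine_mts_forces(force_fast, force_mts_level1, step_factor):
        # On an MTS slow step the level-1 forces are applied with a multiplier
        # equal to the step factor, compensating for the fast steps on which
        # they are not evaluated at all.
        for i in range(len(force_fast)):
            force_fast[i] = force_fast[i] + step_factor * force_mts_level1[i]
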
+
2249
+ if (simulationWork.havePpDomainDecomposition)
2250
+ {
2251
+ /* We are done with the CPU compute.
2252
+ * We will now communicate the non-local forces.
2253
+ * If we use a GPU this will overlap with GPU work, so in that case
2254
+ * we do not close the DD force balancing region here.
2255
+ */
2256
+ ddBalanceRegionHandler.closeAfterForceComputationCpu();
2257
+
2258
+ if (stepWork.computeForces)
2259
+ {
2260
+
2261
+ if (stepWork.useGpuFHalo)
2262
+ {
2263
+ // If there exist CPU forces, data from halo exchange should accumulate into these
2264
+ bool accumulateForces = domainWork.haveCpuLocalForceWork;
2265
+ gmx::FixedCapacityVector<GpuEventSynchronizer*, 2> gpuForceHaloDependencies;
2266
+ gpuForceHaloDependencies.push_back(stateGpu->fReadyOnDevice(AtomLocality::Local));
2267
+ gpuForceHaloDependencies.push_back(stateGpu->fReducedOnDevice(AtomLocality::NonLocal));
2268
+
2269
+ communicateGpuHaloForces(*cr, accumulateForces, &gpuForceHaloDependencies);
2270
+ }
2271
+ else
2272
+ {
2273
+ if (stepWork.useGpuFBufferOps)
2274
+ {
2275
+ stateGpu->waitForcesReadyOnHost(AtomLocality::NonLocal);
2276
+ }
2277
+
2278
+ // Without MTS or with MTS at slow steps with uncombined forces we need to
2279
+ // communicate the fast forces
2280
+ if (!simulationWork.useMts || !stepWork.combineMtsForcesBeforeHaloExchange)
2281
+ {
2282
+ dd_move_f(cr->dd, &forceOutMtsLevel0.forceWithShiftForces(), wcycle);
2283
+ }
2284
+ // With MTS we need to communicate the slow or combined (in forceOutMtsLevel1) forces
2285
+ if (simulationWork.useMts && stepWork.computeSlowForces)
2286
+ {
2287
+ dd_move_f(cr->dd, &forceOutMtsLevel1->forceWithShiftForces(), wcycle);
2288
+ }
2289
+ }
2290
+ }
2291
+ }
2292
+
2293
+ // With both nonbonded and PME offloaded to a GPU on the same rank, we use
2294
+ // an alternating wait/reduction scheme.
2295
+ // When running free energy perturbations steered by AWH and calculating PME on GPU,
2296
+ // i.e. if needEarlyPmeResults == true, the PME results have already been reduced above.
2297
+ bool alternateGpuWait = (!c_disableAlternatingWait && stepWork.haveGpuPmeOnThisRank
2298
+ && simulationWork.useGpuNonbonded && !simulationWork.havePpDomainDecomposition
2299
+ && !stepWork.useGpuFBufferOps && !needEarlyPmeResults);
2300
+ if (alternateGpuWait)
2301
+ {
2302
+ alternatePmeNbGpuWaitReduce(fr->nbv.get(),
2303
+ fr->pmedata,
2304
+ forceOutNonbonded,
2305
+ forceOutMtsLevel1,
2306
+ enerd,
2307
+ lambda[static_cast<int>(FreeEnergyPerturbationCouplingType::Coul)],
2308
+ stepWork,
2309
+ wcycle);
2310
+ }
2311
+
2312
+ if (!alternateGpuWait && stepWork.haveGpuPmeOnThisRank && !needEarlyPmeResults)
2313
+ {
2314
+ pme_gpu_wait_and_reduce(fr->pmedata,
2315
+ stepWork,
2316
+ wcycle,
2317
+ &forceOutMtsLevel1->forceWithVirial(),
2318
+ enerd,
2319
+ lambda[static_cast<int>(FreeEnergyPerturbationCouplingType::Coul)]);
2320
+ }
2321
+
2322
+ /* Wait for local GPU NB outputs on the non-alternating wait path */
2323
+ if (!alternateGpuWait && stepWork.computeNonbondedForces && simulationWork.useGpuNonbonded)
2324
+ {
2325
+ /* Measured overhead on CUDA and OpenCL with(out) GPU sharing
2326
+ * is between 0.5 and 1.5 Mcycles. So 2 MCycles is an overestimate,
2327
+ * but even with a step of 0.1 ms the difference is less than 1%
2328
+ * of the step time.
2329
+ */
2330
+ const float gpuWaitApiOverheadMargin = 2e6F; /* cycles */
2331
+ const float waitCycles = Nbnxm::gpu_wait_finish_task(
2332
+ nbv->gpu_nbv,
2333
+ stepWork,
2334
+ AtomLocality::Local,
2335
+ enerd->grpp.energyGroupPairTerms[NonBondedEnergyTerms::LJSR].data(),
2336
+ enerd->grpp.energyGroupPairTerms[NonBondedEnergyTerms::CoulombSR].data(),
2337
+ forceOutNonbonded->forceWithShiftForces().shiftForces(),
2338
+ wcycle);
2339
+
2340
+ if (ddBalanceRegionHandler.useBalancingRegion())
2341
+ {
2342
+ DdBalanceRegionWaitedForGpu waitedForGpu = DdBalanceRegionWaitedForGpu::yes;
2343
+ if (stepWork.computeForces && waitCycles <= gpuWaitApiOverheadMargin)
2344
+ {
2345
+ /* We measured few cycles, it could be that the kernel
2346
+ * and transfer finished earlier and there was no actual
2347
+ * wait time, only API call overhead.
2348
+ * Then the actual time could be anywhere between 0 and
2349
+ * cycles_wait_est. We will use half of cycles_wait_est.
2350
+ */
2351
+ waitedForGpu = DdBalanceRegionWaitedForGpu::no;
2352
+ }
2353
+ ddBalanceRegionHandler.closeAfterForceComputationGpu(cycles_wait_gpu, waitedForGpu);
2354
+ }
2355
+ }
2356
+
2357
+ if (fr->nbv->emulateGpu())
2358
+ {
2359
+ // NOTE: emulation kernel is not included in the balancing region,
2360
+ // but emulation mode does not target performance anyway
2361
+ wallcycle_start_nocount(wcycle, WallCycleCounter::Force);
2362
+ do_nb_verlet(fr,
2363
+ ic,
2364
+ enerd,
2365
+ stepWork,
2366
+ InteractionLocality::Local,
2367
+ haveDDAtomOrdering(*cr) ? enbvClearFNo : enbvClearFYes,
2368
+ step,
2369
+ nrnb,
2370
+ wcycle);
2371
+ wallcycle_stop(wcycle, WallCycleCounter::Force);
2372
+ }
2373
+
2374
+ // If on GPU PME-PP comms path, receive forces from PME before GPU buffer ops
2375
+ // TODO refactor this and unify with below default-path call to the same function
2376
+ // When running free energy perturbations steered by AWH and calculating PME on GPU,
2377
+ // i.e. if needEarlyPmeResults == true, the PME results have already been reduced above.
2378
+ if (needToReceivePmeResultsFromSeparateRank && simulationWork.useGpuPmePpCommunication && !needEarlyPmeResults)
2379
+ {
2380
+ /* In case of node-splitting, the PP nodes receive the long-range
2381
+ * forces, virial and energy from the PME nodes here.
2382
+ */
2383
+ pme_receive_force_ener(fr,
2384
+ cr,
2385
+ &forceOutMtsLevel1->forceWithVirial(),
2386
+ enerd,
2387
+ simulationWork.useGpuPmePpCommunication,
2388
+ stepWork.useGpuPmeFReduction,
2389
+ wcycle);
2390
+ }
2391
+
2392
+
2393
+ /* Do the nonbonded GPU (or emulation) force buffer reduction
2394
+ * on the non-alternating path. */
2395
+ GMX_ASSERT(!(nonbondedAtMtsLevel1 && stepWork.useGpuFBufferOps),
2396
+ "The schedule below does not allow for nonbonded MTS with GPU buffer ops");
2397
+ if (useOrEmulateGpuNb && !alternateGpuWait)
2398
+ {
2399
+ if (stepWork.useGpuFBufferOps)
2400
+ {
2401
+ ArrayRef<gmx::RVec> forceWithShift = forceOutNonbonded->forceWithShiftForces().force();
2402
+
2403
+ // TODO: move these steps as early as possible:
2404
+ // - CPU f H2D should be as soon as all CPU-side forces are done
2405
+ // - wait for force reduction does not need to block host (at least not here, it's sufficient to wait
2406
+ // before the next CPU task that consumes the forces: vsite spread or update)
2407
+ // - copy is not performed if GPU force halo exchange is active, because it would overwrite the result
2408
+ // of the halo exchange. In that case the copy is instead performed above, before the exchange.
2409
+ // These should be unified.
2410
+ if (domainWork.haveLocalForceContribInCpuBuffer && !stepWork.useGpuFHalo)
2411
+ {
2412
+ stateGpu->copyForcesToGpu(forceWithShift, AtomLocality::Local);
2413
+ }
2414
+
2415
+ if (stepWork.computeNonbondedForces)
2416
+ {
2417
+ fr->gpuForceReduction[gmx::AtomLocality::Local]->execute();
2418
+ }
2419
+
2420
+ // Copy forces to host if they are needed for update or if virtual sites are enabled.
2421
+ // If there are vsites, we need to copy forces every step to spread vsite forces on host.
2422
+ // TODO: When the output flags will be included in step workload, this copy can be combined with the
2423
+ // copy call done in sim_utils(...) for the output.
2424
+ // NOTE: If there are virtual sites, the forces are modified on host after this D2H copy. Hence,
2425
+ // they should not be copied in do_md(...) for the output.
2426
+ if (!simulationWork.useGpuUpdate
2427
+ || (simulationWork.useGpuUpdate && haveDDAtomOrdering(*cr) && simulationWork.useCpuPmePpCommunication)
2428
+ || vsite)
2429
+ {
2430
+ if (stepWork.computeNonbondedForces)
2431
+ {
2432
+ /* We have previously issued force reduction on the GPU, but we will
2433
+ * not use this event, instead relying on the stream being in-order.
2434
+ * Issue #3988. */
2435
+ stateGpu->consumeForcesReducedOnDeviceEvent(AtomLocality::Local);
2436
+ }
2437
+ stateGpu->copyForcesFromGpu(forceWithShift, AtomLocality::Local);
2438
+ stateGpu->waitForcesReadyOnHost(AtomLocality::Local);
2439
+ }
2440
+ }
2441
+ else if (stepWork.computeNonbondedForces)
2442
+ {
2443
+ ArrayRef<gmx::RVec> forceWithShift = forceOutNonbonded->forceWithShiftForces().force();
2444
+ nbv->atomdata_add_nbat_f_to_f(AtomLocality::Local, forceWithShift);
2445
+ }
2446
+ }
2447
+
2448
+ launchGpuEndOfStepTasks(
2449
+ nbv, fr->listedForcesGpu.get(), fr->pmedata, enerd, *runScheduleWork, step, wcycle);
2450
+
2451
+ if (haveDDAtomOrdering(*cr))
2452
+ {
2453
+ dd_force_flop_stop(cr->dd, nrnb);
2454
+ }
2455
+
2456
+ const bool haveCombinedMtsForces = (stepWork.computeForces && simulationWork.useMts && stepWork.computeSlowForces
2457
+ && stepWork.combineMtsForcesBeforeHaloExchange);
2458
+ if (stepWork.computeForces)
2459
+ {
2460
+ postProcessForceWithShiftForces(
2461
+ nrnb, wcycle, box, x.unpaddedArrayRef(), &forceOutMtsLevel0, vir_force, *mdatoms, *fr, vsite, stepWork);
2462
+
2463
+ if (simulationWork.useMts && stepWork.computeSlowForces && !haveCombinedMtsForces)
2464
+ {
2465
+ postProcessForceWithShiftForces(
2466
+ nrnb, wcycle, box, x.unpaddedArrayRef(), forceOutMtsLevel1, vir_force, *mdatoms, *fr, vsite, stepWork);
2467
+ }
2468
+ }
2469
+
2470
+ // TODO refactor this and unify with above GPU PME-PP / GPU update path call to the same function
2471
+ // When running free energy perturbations steered by AWH and calculating PME on GPU,
2472
+ // i.e. if needEarlyPmeResults == true, the PME results have already been reduced above.
2473
+ if (needToReceivePmeResultsFromSeparateRank && simulationWork.useCpuPmePpCommunication && !needEarlyPmeResults)
2474
+ {
2475
+ /* In case of node-splitting, the PP nodes receive the long-range
2476
+ * forces, virial and energy from the PME nodes here.
2477
+ */
2478
+ pme_receive_force_ener(fr,
2479
+ cr,
2480
+ &forceOutMtsLevel1->forceWithVirial(),
2481
+ enerd,
2482
+ simulationWork.useGpuPmePpCommunication,
2483
+ false,
2484
+ wcycle);
2485
+ }
2486
+
2487
+ if (stepWork.computeForces)
2488
+ {
2489
+ /* If we don't use MTS or if we already combined the MTS forces before, we only
2490
+ * need to post-process one ForceOutputs object here, called forceOutCombined,
2491
+ * otherwise we have to post-process two outputs and then combine them.
2492
+ */
2493
+ ForceOutputs& forceOutCombined = (haveCombinedMtsForces ? forceOutMts.value() : forceOutMtsLevel0);
2494
+ postProcessForces(
2495
+ cr, step, nrnb, wcycle, box, x.unpaddedArrayRef(), &forceOutCombined, vir_force, mdatoms, fr, vsite, stepWork);
2496
+
2497
+ if (simulationWork.useMts && stepWork.computeSlowForces && !haveCombinedMtsForces)
2498
+ {
2499
+ postProcessForces(
2500
+ cr, step, nrnb, wcycle, box, x.unpaddedArrayRef(), forceOutMtsLevel1, vir_force, mdatoms, fr, vsite, stepWork);
2501
+
2502
+ combineMtsForces(mdatoms->homenr,
2503
+ force.unpaddedArrayRef(),
2504
+ forceView->forceMtsCombined(),
2505
+ inputrec.mtsLevels[1].stepFactor);
2506
+ }
2507
+ }
2508
+
2509
+ if (stepWork.computeEnergy)
2510
+ {
2511
+ /* Compute the final potential energy terms */
2512
+ accumulatePotentialEnergies(enerd, lambda, inputrec.fepvals.get());
2513
+
2514
+ if (!EI_TPI(inputrec.eI))
2515
+ {
2516
+ checkPotentialEnergyValidity(step, *enerd, inputrec);
2517
+ }
2518
+ }
2519
+
2520
+ /* In case we don't have constraints and are using GPUs, the next balancing
2521
+ * region starts here.
2522
+ * Some "special" work at the end of do_force_cuts?, such as vsite spread,
2523
+ * virial calculation and COM pulling, is thus not included in
2524
+ * the balance timing, which is ok as most tasks do communication.
2525
+ */
2526
+ ddBalanceRegionHandler.openBeforeForceComputationCpu(DdAllowBalanceRegionReopen::no);
2527
+ }