hillclimber-0.1.6-cp313-cp313-macosx_15_0_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (468)
  1. hillclimber/__init__.py +41 -0
  2. hillclimber/actions.py +53 -0
  3. hillclimber/analysis.py +590 -0
  4. hillclimber/biases.py +293 -0
  5. hillclimber/calc.py +22 -0
  6. hillclimber/cvs.py +1070 -0
  7. hillclimber/interfaces.py +133 -0
  8. hillclimber/metadynamics.py +416 -0
  9. hillclimber/nodes.py +6 -0
  10. hillclimber/opes.py +359 -0
  11. hillclimber/pycv.py +362 -0
  12. hillclimber/selectors.py +230 -0
  13. hillclimber/virtual_atoms.py +341 -0
  14. hillclimber-0.1.6.dist-info/METADATA +325 -0
  15. hillclimber-0.1.6.dist-info/RECORD +468 -0
  16. hillclimber-0.1.6.dist-info/WHEEL +6 -0
  17. hillclimber-0.1.6.dist-info/entry_points.txt +8 -0
  18. hillclimber-0.1.6.dist-info/licenses/LICENSE +165 -0
  19. plumed/__init__.py +104 -0
  20. plumed/_lib/bin/plumed +0 -0
  21. plumed/_lib/bin/plumed-config +9 -0
  22. plumed/_lib/bin/plumed-patch +9 -0
  23. plumed/_lib/include/plumed/adjmat/AdjacencyMatrixBase.h +659 -0
  24. plumed/_lib/include/plumed/adjmat/ContactMatrix.h +59 -0
  25. plumed/_lib/include/plumed/asmjit/arch.h +228 -0
  26. plumed/_lib/include/plumed/asmjit/arm.h +43 -0
  27. plumed/_lib/include/plumed/asmjit/asmjit.h +69 -0
  28. plumed/_lib/include/plumed/asmjit/asmjit_apibegin.h +143 -0
  29. plumed/_lib/include/plumed/asmjit/asmjit_apiend.h +93 -0
  30. plumed/_lib/include/plumed/asmjit/asmjit_build.h +971 -0
  31. plumed/_lib/include/plumed/asmjit/assembler.h +183 -0
  32. plumed/_lib/include/plumed/asmjit/base.h +56 -0
  33. plumed/_lib/include/plumed/asmjit/codebuilder.h +944 -0
  34. plumed/_lib/include/plumed/asmjit/codecompiler.h +767 -0
  35. plumed/_lib/include/plumed/asmjit/codeemitter.h +528 -0
  36. plumed/_lib/include/plumed/asmjit/codeholder.h +777 -0
  37. plumed/_lib/include/plumed/asmjit/constpool.h +286 -0
  38. plumed/_lib/include/plumed/asmjit/cpuinfo.h +402 -0
  39. plumed/_lib/include/plumed/asmjit/func.h +1327 -0
  40. plumed/_lib/include/plumed/asmjit/globals.h +370 -0
  41. plumed/_lib/include/plumed/asmjit/inst.h +137 -0
  42. plumed/_lib/include/plumed/asmjit/logging.h +317 -0
  43. plumed/_lib/include/plumed/asmjit/misc_p.h +103 -0
  44. plumed/_lib/include/plumed/asmjit/moved_string.h +318 -0
  45. plumed/_lib/include/plumed/asmjit/operand.h +1599 -0
  46. plumed/_lib/include/plumed/asmjit/osutils.h +207 -0
  47. plumed/_lib/include/plumed/asmjit/regalloc_p.h +597 -0
  48. plumed/_lib/include/plumed/asmjit/runtime.h +227 -0
  49. plumed/_lib/include/plumed/asmjit/simdtypes.h +1104 -0
  50. plumed/_lib/include/plumed/asmjit/utils.h +1387 -0
  51. plumed/_lib/include/plumed/asmjit/vmem.h +183 -0
  52. plumed/_lib/include/plumed/asmjit/x86.h +45 -0
  53. plumed/_lib/include/plumed/asmjit/x86assembler.h +125 -0
  54. plumed/_lib/include/plumed/asmjit/x86builder.h +117 -0
  55. plumed/_lib/include/plumed/asmjit/x86compiler.h +322 -0
  56. plumed/_lib/include/plumed/asmjit/x86emitter.h +5149 -0
  57. plumed/_lib/include/plumed/asmjit/x86globals.h +535 -0
  58. plumed/_lib/include/plumed/asmjit/x86inst.h +2547 -0
  59. plumed/_lib/include/plumed/asmjit/x86instimpl_p.h +74 -0
  60. plumed/_lib/include/plumed/asmjit/x86internal_p.h +108 -0
  61. plumed/_lib/include/plumed/asmjit/x86logging_p.h +92 -0
  62. plumed/_lib/include/plumed/asmjit/x86misc.h +417 -0
  63. plumed/_lib/include/plumed/asmjit/x86operand.h +1133 -0
  64. plumed/_lib/include/plumed/asmjit/x86regalloc_p.h +734 -0
  65. plumed/_lib/include/plumed/asmjit/zone.h +1157 -0
  66. plumed/_lib/include/plumed/bias/Bias.h +82 -0
  67. plumed/_lib/include/plumed/bias/ReweightBase.h +58 -0
  68. plumed/_lib/include/plumed/blas/blas.h +253 -0
  69. plumed/_lib/include/plumed/blas/def_external.h +61 -0
  70. plumed/_lib/include/plumed/blas/def_internal.h +97 -0
  71. plumed/_lib/include/plumed/blas/real.h +49 -0
  72. plumed/_lib/include/plumed/cltools/CLTool.h +32 -0
  73. plumed/_lib/include/plumed/clusters/ClusteringBase.h +70 -0
  74. plumed/_lib/include/plumed/colvar/Colvar.h +32 -0
  75. plumed/_lib/include/plumed/colvar/ColvarInput.h +68 -0
  76. plumed/_lib/include/plumed/colvar/ColvarShortcut.h +81 -0
  77. plumed/_lib/include/plumed/colvar/CoordinationBase.h +52 -0
  78. plumed/_lib/include/plumed/colvar/MultiColvarTemplate.h +333 -0
  79. plumed/_lib/include/plumed/colvar/PathMSDBase.h +101 -0
  80. plumed/_lib/include/plumed/colvar/RMSDVector.h +78 -0
  81. plumed/_lib/include/plumed/config/Config.h +118 -0
  82. plumed/_lib/include/plumed/config/version.h +9 -0
  83. plumed/_lib/include/plumed/contour/ContourFindingObject.h +87 -0
  84. plumed/_lib/include/plumed/contour/DistanceFromContourBase.h +82 -0
  85. plumed/_lib/include/plumed/contour/FindContour.h +67 -0
  86. plumed/_lib/include/plumed/core/Action.h +540 -0
  87. plumed/_lib/include/plumed/core/ActionAnyorder.h +48 -0
  88. plumed/_lib/include/plumed/core/ActionAtomistic.h +343 -0
  89. plumed/_lib/include/plumed/core/ActionForInterface.h +99 -0
  90. plumed/_lib/include/plumed/core/ActionPilot.h +57 -0
  91. plumed/_lib/include/plumed/core/ActionRegister.h +124 -0
  92. plumed/_lib/include/plumed/core/ActionSet.h +163 -0
  93. plumed/_lib/include/plumed/core/ActionSetup.h +48 -0
  94. plumed/_lib/include/plumed/core/ActionShortcut.h +73 -0
  95. plumed/_lib/include/plumed/core/ActionToGetData.h +59 -0
  96. plumed/_lib/include/plumed/core/ActionToPutData.h +101 -0
  97. plumed/_lib/include/plumed/core/ActionWithArguments.h +140 -0
  98. plumed/_lib/include/plumed/core/ActionWithMatrix.h +87 -0
  99. plumed/_lib/include/plumed/core/ActionWithValue.h +258 -0
  100. plumed/_lib/include/plumed/core/ActionWithVector.h +94 -0
  101. plumed/_lib/include/plumed/core/ActionWithVirtualAtom.h +123 -0
  102. plumed/_lib/include/plumed/core/CLTool.h +177 -0
  103. plumed/_lib/include/plumed/core/CLToolMain.h +102 -0
  104. plumed/_lib/include/plumed/core/CLToolRegister.h +108 -0
  105. plumed/_lib/include/plumed/core/Colvar.h +115 -0
  106. plumed/_lib/include/plumed/core/DataPassingObject.h +94 -0
  107. plumed/_lib/include/plumed/core/DataPassingTools.h +54 -0
  108. plumed/_lib/include/plumed/core/DomainDecomposition.h +120 -0
  109. plumed/_lib/include/plumed/core/ExchangePatterns.h +47 -0
  110. plumed/_lib/include/plumed/core/FlexibleBin.h +63 -0
  111. plumed/_lib/include/plumed/core/GREX.h +61 -0
  112. plumed/_lib/include/plumed/core/GenericMolInfo.h +89 -0
  113. plumed/_lib/include/plumed/core/Group.h +41 -0
  114. plumed/_lib/include/plumed/core/ModuleMap.h +30 -0
  115. plumed/_lib/include/plumed/core/ParallelTaskManager.h +1023 -0
  116. plumed/_lib/include/plumed/core/PbcAction.h +61 -0
  117. plumed/_lib/include/plumed/core/PlumedMain.h +632 -0
  118. plumed/_lib/include/plumed/core/PlumedMainInitializer.h +118 -0
  119. plumed/_lib/include/plumed/core/RegisterBase.h +340 -0
  120. plumed/_lib/include/plumed/core/TargetDist.h +48 -0
  121. plumed/_lib/include/plumed/core/Value.h +547 -0
  122. plumed/_lib/include/plumed/core/WithCmd.h +93 -0
  123. plumed/_lib/include/plumed/dimred/SMACOF.h +55 -0
  124. plumed/_lib/include/plumed/drr/DRR.h +383 -0
  125. plumed/_lib/include/plumed/drr/colvar_UIestimator.h +777 -0
  126. plumed/_lib/include/plumed/fisst/legendre_rule_fast.h +44 -0
  127. plumed/_lib/include/plumed/function/Custom.h +54 -0
  128. plumed/_lib/include/plumed/function/Function.h +85 -0
  129. plumed/_lib/include/plumed/function/FunctionOfMatrix.h +368 -0
  130. plumed/_lib/include/plumed/function/FunctionOfScalar.h +135 -0
  131. plumed/_lib/include/plumed/function/FunctionOfVector.h +296 -0
  132. plumed/_lib/include/plumed/function/FunctionSetup.h +180 -0
  133. plumed/_lib/include/plumed/function/FunctionShortcut.h +130 -0
  134. plumed/_lib/include/plumed/function/FunctionWithSingleArgument.h +165 -0
  135. plumed/_lib/include/plumed/gridtools/ActionWithGrid.h +43 -0
  136. plumed/_lib/include/plumed/gridtools/EvaluateGridFunction.h +99 -0
  137. plumed/_lib/include/plumed/gridtools/FunctionOfGrid.h +295 -0
  138. plumed/_lib/include/plumed/gridtools/GridCoordinatesObject.h +179 -0
  139. plumed/_lib/include/plumed/gridtools/GridSearch.h +135 -0
  140. plumed/_lib/include/plumed/gridtools/Interpolator.h +45 -0
  141. plumed/_lib/include/plumed/gridtools/KDE.h +455 -0
  142. plumed/_lib/include/plumed/gridtools/RDF.h +40 -0
  143. plumed/_lib/include/plumed/gridtools/SumOfKernels.h +219 -0
  144. plumed/_lib/include/plumed/isdb/MetainferenceBase.h +398 -0
  145. plumed/_lib/include/plumed/lapack/def_external.h +207 -0
  146. plumed/_lib/include/plumed/lapack/def_internal.h +388 -0
  147. plumed/_lib/include/plumed/lapack/lapack.h +899 -0
  148. plumed/_lib/include/plumed/lapack/lapack_limits.h +79 -0
  149. plumed/_lib/include/plumed/lapack/real.h +50 -0
  150. plumed/_lib/include/plumed/lepton/CompiledExpression.h +164 -0
  151. plumed/_lib/include/plumed/lepton/CustomFunction.h +143 -0
  152. plumed/_lib/include/plumed/lepton/Exception.h +93 -0
  153. plumed/_lib/include/plumed/lepton/ExpressionProgram.h +137 -0
  154. plumed/_lib/include/plumed/lepton/ExpressionTreeNode.h +145 -0
  155. plumed/_lib/include/plumed/lepton/Lepton.h +85 -0
  156. plumed/_lib/include/plumed/lepton/MSVC_erfc.h +123 -0
  157. plumed/_lib/include/plumed/lepton/Operation.h +1302 -0
  158. plumed/_lib/include/plumed/lepton/ParsedExpression.h +165 -0
  159. plumed/_lib/include/plumed/lepton/Parser.h +111 -0
  160. plumed/_lib/include/plumed/lepton/windowsIncludes.h +73 -0
  161. plumed/_lib/include/plumed/mapping/Path.h +44 -0
  162. plumed/_lib/include/plumed/mapping/PathProjectionCalculator.h +57 -0
  163. plumed/_lib/include/plumed/matrixtools/MatrixOperationBase.h +54 -0
  164. plumed/_lib/include/plumed/matrixtools/MatrixTimesMatrix.h +309 -0
  165. plumed/_lib/include/plumed/matrixtools/MatrixTimesVectorBase.h +365 -0
  166. plumed/_lib/include/plumed/matrixtools/OuterProduct.h +238 -0
  167. plumed/_lib/include/plumed/maze/Core.h +65 -0
  168. plumed/_lib/include/plumed/maze/Loss.h +86 -0
  169. plumed/_lib/include/plumed/maze/Member.h +66 -0
  170. plumed/_lib/include/plumed/maze/Memetic.h +799 -0
  171. plumed/_lib/include/plumed/maze/Optimizer.h +357 -0
  172. plumed/_lib/include/plumed/maze/Random_MT.h +156 -0
  173. plumed/_lib/include/plumed/maze/Tools.h +183 -0
  174. plumed/_lib/include/plumed/metatomic/vesin.h +188 -0
  175. plumed/_lib/include/plumed/molfile/Gromacs.h +2013 -0
  176. plumed/_lib/include/plumed/molfile/endianswap.h +217 -0
  177. plumed/_lib/include/plumed/molfile/fastio.h +683 -0
  178. plumed/_lib/include/plumed/molfile/largefiles.h +78 -0
  179. plumed/_lib/include/plumed/molfile/libmolfile_plugin.h +77 -0
  180. plumed/_lib/include/plumed/molfile/molfile_plugin.h +1034 -0
  181. plumed/_lib/include/plumed/molfile/periodic_table.h +248 -0
  182. plumed/_lib/include/plumed/molfile/readpdb.h +447 -0
  183. plumed/_lib/include/plumed/molfile/vmdplugin.h +236 -0
  184. plumed/_lib/include/plumed/multicolvar/MultiColvarShortcuts.h +45 -0
  185. plumed/_lib/include/plumed/opes/ExpansionCVs.h +79 -0
  186. plumed/_lib/include/plumed/sasa/Sasa.h +32 -0
  187. plumed/_lib/include/plumed/secondarystructure/SecondaryStructureBase.h +372 -0
  188. plumed/_lib/include/plumed/setup/ActionSetup.h +25 -0
  189. plumed/_lib/include/plumed/small_vector/small_vector.h +6114 -0
  190. plumed/_lib/include/plumed/symfunc/CoordinationNumbers.h +41 -0
  191. plumed/_lib/include/plumed/tools/Angle.h +52 -0
  192. plumed/_lib/include/plumed/tools/AtomDistribution.h +138 -0
  193. plumed/_lib/include/plumed/tools/AtomNumber.h +152 -0
  194. plumed/_lib/include/plumed/tools/BiasRepresentation.h +106 -0
  195. plumed/_lib/include/plumed/tools/BitmaskEnum.h +167 -0
  196. plumed/_lib/include/plumed/tools/Brent1DRootSearch.h +159 -0
  197. plumed/_lib/include/plumed/tools/CheckInRange.h +44 -0
  198. plumed/_lib/include/plumed/tools/Citations.h +74 -0
  199. plumed/_lib/include/plumed/tools/ColvarOutput.h +118 -0
  200. plumed/_lib/include/plumed/tools/Communicator.h +316 -0
  201. plumed/_lib/include/plumed/tools/ConjugateGradient.h +80 -0
  202. plumed/_lib/include/plumed/tools/DLLoader.h +79 -0
  203. plumed/_lib/include/plumed/tools/ERMSD.h +73 -0
  204. plumed/_lib/include/plumed/tools/Exception.h +406 -0
  205. plumed/_lib/include/plumed/tools/File.h +28 -0
  206. plumed/_lib/include/plumed/tools/FileBase.h +153 -0
  207. plumed/_lib/include/plumed/tools/FileTools.h +37 -0
  208. plumed/_lib/include/plumed/tools/ForwardDecl.h +54 -0
  209. plumed/_lib/include/plumed/tools/Grid.h +638 -0
  210. plumed/_lib/include/plumed/tools/HistogramBead.h +136 -0
  211. plumed/_lib/include/plumed/tools/IFile.h +117 -0
  212. plumed/_lib/include/plumed/tools/KernelFunctions.h +113 -0
  213. plumed/_lib/include/plumed/tools/Keywords.h +380 -0
  214. plumed/_lib/include/plumed/tools/LatticeReduction.h +66 -0
  215. plumed/_lib/include/plumed/tools/LeptonCall.h +64 -0
  216. plumed/_lib/include/plumed/tools/LinkCells.h +126 -0
  217. plumed/_lib/include/plumed/tools/Log.h +41 -0
  218. plumed/_lib/include/plumed/tools/LoopUnroller.h +163 -0
  219. plumed/_lib/include/plumed/tools/Matrix.h +721 -0
  220. plumed/_lib/include/plumed/tools/MatrixSquareBracketsAccess.h +138 -0
  221. plumed/_lib/include/plumed/tools/MergeVectorTools.h +153 -0
  222. plumed/_lib/include/plumed/tools/Minimise1DBrent.h +244 -0
  223. plumed/_lib/include/plumed/tools/MinimiseBase.h +120 -0
  224. plumed/_lib/include/plumed/tools/MolDataClass.h +51 -0
  225. plumed/_lib/include/plumed/tools/NeighborList.h +112 -0
  226. plumed/_lib/include/plumed/tools/OFile.h +286 -0
  227. plumed/_lib/include/plumed/tools/OpenACC.h +180 -0
  228. plumed/_lib/include/plumed/tools/OpenMP.h +75 -0
  229. plumed/_lib/include/plumed/tools/PDB.h +154 -0
  230. plumed/_lib/include/plumed/tools/Pbc.h +139 -0
  231. plumed/_lib/include/plumed/tools/PlumedHandle.h +105 -0
  232. plumed/_lib/include/plumed/tools/RMSD.h +493 -0
  233. plumed/_lib/include/plumed/tools/Random.h +80 -0
  234. plumed/_lib/include/plumed/tools/RootFindingBase.h +79 -0
  235. plumed/_lib/include/plumed/tools/Stopwatch.h +475 -0
  236. plumed/_lib/include/plumed/tools/Subprocess.h +142 -0
  237. plumed/_lib/include/plumed/tools/SwitchingFunction.h +208 -0
  238. plumed/_lib/include/plumed/tools/Tensor.h +724 -0
  239. plumed/_lib/include/plumed/tools/TokenizedLine.h +123 -0
  240. plumed/_lib/include/plumed/tools/Tools.h +638 -0
  241. plumed/_lib/include/plumed/tools/Torsion.h +55 -0
  242. plumed/_lib/include/plumed/tools/TrajectoryParser.h +118 -0
  243. plumed/_lib/include/plumed/tools/Tree.h +61 -0
  244. plumed/_lib/include/plumed/tools/TypesafePtr.h +463 -0
  245. plumed/_lib/include/plumed/tools/Units.h +167 -0
  246. plumed/_lib/include/plumed/tools/Vector.h +433 -0
  247. plumed/_lib/include/plumed/tools/View.h +296 -0
  248. plumed/_lib/include/plumed/tools/View2D.h +100 -0
  249. plumed/_lib/include/plumed/tools/h36.h +39 -0
  250. plumed/_lib/include/plumed/vatom/ActionWithVirtualAtom.h +32 -0
  251. plumed/_lib/include/plumed/ves/BasisFunctions.h +380 -0
  252. plumed/_lib/include/plumed/ves/CoeffsBase.h +310 -0
  253. plumed/_lib/include/plumed/ves/CoeffsMatrix.h +220 -0
  254. plumed/_lib/include/plumed/ves/CoeffsVector.h +251 -0
  255. plumed/_lib/include/plumed/ves/FermiSwitchingFunction.h +74 -0
  256. plumed/_lib/include/plumed/ves/GridIntegrationWeights.h +50 -0
  257. plumed/_lib/include/plumed/ves/GridLinearInterpolation.h +81 -0
  258. plumed/_lib/include/plumed/ves/GridProjWeights.h +61 -0
  259. plumed/_lib/include/plumed/ves/LinearBasisSetExpansion.h +303 -0
  260. plumed/_lib/include/plumed/ves/Optimizer.h +444 -0
  261. plumed/_lib/include/plumed/ves/TargetDistModifer.h +53 -0
  262. plumed/_lib/include/plumed/ves/TargetDistribution.h +266 -0
  263. plumed/_lib/include/plumed/ves/VesBias.h +545 -0
  264. plumed/_lib/include/plumed/ves/VesTools.h +142 -0
  265. plumed/_lib/include/plumed/ves/WaveletGrid.h +75 -0
  266. plumed/_lib/include/plumed/volumes/ActionVolume.h +268 -0
  267. plumed/_lib/include/plumed/volumes/VolumeShortcut.h +147 -0
  268. plumed/_lib/include/plumed/wrapper/Plumed.h +5025 -0
  269. plumed/_lib/include/plumed/xdrfile/xdrfile.h +663 -0
  270. plumed/_lib/include/plumed/xdrfile/xdrfile_trr.h +89 -0
  271. plumed/_lib/include/plumed/xdrfile/xdrfile_xtc.h +90 -0
  272. plumed/_lib/lib/PythonCVInterface.dylib +0 -0
  273. plumed/_lib/lib/libplumed.dylib +0 -0
  274. plumed/_lib/lib/libplumedKernel.dylib +0 -0
  275. plumed/_lib/lib/libplumedWrapper.a +0 -0
  276. plumed/_lib/lib/pkgconfig/plumed.pc +13 -0
  277. plumed/_lib/lib/pkgconfig/plumedInternals.pc +13 -0
  278. plumed/_lib/lib/pkgconfig/plumedWrapper.pc +13 -0
  279. plumed/_lib/lib/plumed/fortran/plumed.f90 +879 -0
  280. plumed/_lib/lib/plumed/fortran/plumed_f08.f90 +2625 -0
  281. plumed/_lib/lib/plumed/modulefile +69 -0
  282. plumed/_lib/lib/plumed/patches/gromacs-2022.5.config +43 -0
  283. plumed/_lib/lib/plumed/patches/gromacs-2022.5.diff/src/gromacs/CMakeLists.txt +543 -0
  284. plumed/_lib/lib/plumed/patches/gromacs-2022.5.diff/src/gromacs/CMakeLists.txt.preplumed +540 -0
  285. plumed/_lib/lib/plumed/patches/gromacs-2022.5.diff/src/gromacs/mdlib/expanded.cpp +1628 -0
  286. plumed/_lib/lib/plumed/patches/gromacs-2022.5.diff/src/gromacs/mdlib/expanded.cpp.preplumed +1590 -0
  287. plumed/_lib/lib/plumed/patches/gromacs-2022.5.diff/src/gromacs/mdlib/expanded.h +103 -0
  288. plumed/_lib/lib/plumed/patches/gromacs-2022.5.diff/src/gromacs/mdlib/expanded.h.preplumed +99 -0
  289. plumed/_lib/lib/plumed/patches/gromacs-2022.5.diff/src/gromacs/mdlib/sim_util.cpp +2527 -0
  290. plumed/_lib/lib/plumed/patches/gromacs-2022.5.diff/src/gromacs/mdlib/sim_util.cpp.preplumed +2513 -0
  291. plumed/_lib/lib/plumed/patches/gromacs-2022.5.diff/src/gromacs/mdrun/legacymdrunoptions.cpp +208 -0
  292. plumed/_lib/lib/plumed/patches/gromacs-2022.5.diff/src/gromacs/mdrun/legacymdrunoptions.cpp.preplumed +175 -0
  293. plumed/_lib/lib/plumed/patches/gromacs-2022.5.diff/src/gromacs/mdrun/legacymdrunoptions.h +408 -0
  294. plumed/_lib/lib/plumed/patches/gromacs-2022.5.diff/src/gromacs/mdrun/legacymdrunoptions.h.preplumed +394 -0
  295. plumed/_lib/lib/plumed/patches/gromacs-2022.5.diff/src/gromacs/mdrun/md.cpp +2348 -0
  296. plumed/_lib/lib/plumed/patches/gromacs-2022.5.diff/src/gromacs/mdrun/md.cpp.preplumed +2091 -0
  297. plumed/_lib/lib/plumed/patches/gromacs-2022.5.diff/src/gromacs/mdrun/minimize.cpp +3573 -0
  298. plumed/_lib/lib/plumed/patches/gromacs-2022.5.diff/src/gromacs/mdrun/minimize.cpp.preplumed +3495 -0
  299. plumed/_lib/lib/plumed/patches/gromacs-2022.5.diff/src/gromacs/mdrun/replicaexchange.cpp +1506 -0
  300. plumed/_lib/lib/plumed/patches/gromacs-2022.5.diff/src/gromacs/mdrun/replicaexchange.cpp.preplumed +1402 -0
  301. plumed/_lib/lib/plumed/patches/gromacs-2022.5.diff/src/gromacs/mdrun/replicaexchange.h +114 -0
  302. plumed/_lib/lib/plumed/patches/gromacs-2022.5.diff/src/gromacs/mdrun/replicaexchange.h.preplumed +106 -0
  303. plumed/_lib/lib/plumed/patches/gromacs-2022.5.diff/src/gromacs/mdrun/rerun.cpp +997 -0
  304. plumed/_lib/lib/plumed/patches/gromacs-2022.5.diff/src/gromacs/mdrun/rerun.cpp.preplumed +906 -0
  305. plumed/_lib/lib/plumed/patches/gromacs-2022.5.diff/src/gromacs/mdrun/runner.cpp +2780 -0
  306. plumed/_lib/lib/plumed/patches/gromacs-2022.5.diff/src/gromacs/mdrun/runner.cpp.preplumed +2738 -0
  307. plumed/_lib/lib/plumed/patches/gromacs-2022.5.diff/src/gromacs/modularsimulator/expandedensembleelement.cpp +224 -0
  308. plumed/_lib/lib/plumed/patches/gromacs-2022.5.diff/src/gromacs/modularsimulator/expandedensembleelement.cpp.preplumed +222 -0
  309. plumed/_lib/lib/plumed/patches/gromacs-2023.5.config +43 -0
  310. plumed/_lib/lib/plumed/patches/gromacs-2023.5.diff/src/gromacs/CMakeLists.txt +549 -0
  311. plumed/_lib/lib/plumed/patches/gromacs-2023.5.diff/src/gromacs/CMakeLists.txt.preplumed +546 -0
  312. plumed/_lib/lib/plumed/patches/gromacs-2023.5.diff/src/gromacs/mdlib/expanded.cpp +1632 -0
  313. plumed/_lib/lib/plumed/patches/gromacs-2023.5.diff/src/gromacs/mdlib/expanded.cpp.preplumed +1594 -0
  314. plumed/_lib/lib/plumed/patches/gromacs-2023.5.diff/src/gromacs/mdlib/expanded.h +104 -0
  315. plumed/_lib/lib/plumed/patches/gromacs-2023.5.diff/src/gromacs/mdlib/expanded.h.preplumed +100 -0
  316. plumed/_lib/lib/plumed/patches/gromacs-2023.5.diff/src/gromacs/mdlib/sim_util.cpp +2624 -0
  317. plumed/_lib/lib/plumed/patches/gromacs-2023.5.diff/src/gromacs/mdlib/sim_util.cpp.preplumed +2610 -0
  318. plumed/_lib/lib/plumed/patches/gromacs-2023.5.diff/src/gromacs/mdrun/legacymdrunoptions.cpp +208 -0
  319. plumed/_lib/lib/plumed/patches/gromacs-2023.5.diff/src/gromacs/mdrun/legacymdrunoptions.cpp.preplumed +175 -0
  320. plumed/_lib/lib/plumed/patches/gromacs-2023.5.diff/src/gromacs/mdrun/legacymdrunoptions.h +409 -0
  321. plumed/_lib/lib/plumed/patches/gromacs-2023.5.diff/src/gromacs/mdrun/legacymdrunoptions.h.preplumed +395 -0
  322. plumed/_lib/lib/plumed/patches/gromacs-2023.5.diff/src/gromacs/mdrun/md.cpp +2419 -0
  323. plumed/_lib/lib/plumed/patches/gromacs-2023.5.diff/src/gromacs/mdrun/md.cpp.preplumed +2164 -0
  324. plumed/_lib/lib/plumed/patches/gromacs-2023.5.diff/src/gromacs/mdrun/minimize.cpp +3546 -0
  325. plumed/_lib/lib/plumed/patches/gromacs-2023.5.diff/src/gromacs/mdrun/minimize.cpp.preplumed +3468 -0
  326. plumed/_lib/lib/plumed/patches/gromacs-2023.5.diff/src/gromacs/mdrun/replicaexchange.cpp +1513 -0
  327. plumed/_lib/lib/plumed/patches/gromacs-2023.5.diff/src/gromacs/mdrun/replicaexchange.cpp.preplumed +1409 -0
  328. plumed/_lib/lib/plumed/patches/gromacs-2023.5.diff/src/gromacs/mdrun/replicaexchange.h +114 -0
  329. plumed/_lib/lib/plumed/patches/gromacs-2023.5.diff/src/gromacs/mdrun/replicaexchange.h.preplumed +106 -0
  330. plumed/_lib/lib/plumed/patches/gromacs-2023.5.diff/src/gromacs/mdrun/rerun.cpp +991 -0
  331. plumed/_lib/lib/plumed/patches/gromacs-2023.5.diff/src/gromacs/mdrun/rerun.cpp.preplumed +900 -0
  332. plumed/_lib/lib/plumed/patches/gromacs-2023.5.diff/src/gromacs/mdrun/runner.cpp +2895 -0
  333. plumed/_lib/lib/plumed/patches/gromacs-2023.5.diff/src/gromacs/mdrun/runner.cpp.preplumed +2849 -0
  334. plumed/_lib/lib/plumed/patches/gromacs-2023.5.diff/src/gromacs/modularsimulator/expandedensembleelement.cpp +224 -0
  335. plumed/_lib/lib/plumed/patches/gromacs-2023.5.diff/src/gromacs/modularsimulator/expandedensembleelement.cpp.preplumed +222 -0
  336. plumed/_lib/lib/plumed/patches/gromacs-2023.5.diff/src/gromacs/taskassignment/decidegpuusage.cpp +886 -0
  337. plumed/_lib/lib/plumed/patches/gromacs-2023.5.diff/src/gromacs/taskassignment/decidegpuusage.cpp.preplumed +880 -0
  338. plumed/_lib/lib/plumed/patches/gromacs-2023.5.diff/src/gromacs/taskassignment/include/gromacs/taskassignment/decidegpuusage.h +347 -0
  339. plumed/_lib/lib/plumed/patches/gromacs-2023.5.diff/src/gromacs/taskassignment/include/gromacs/taskassignment/decidegpuusage.h.preplumed +345 -0
  340. plumed/_lib/lib/plumed/patches/gromacs-2024.3.config +43 -0
  341. plumed/_lib/lib/plumed/patches/gromacs-2024.3.diff/src/gromacs/CMakeLists.txt +575 -0
  342. plumed/_lib/lib/plumed/patches/gromacs-2024.3.diff/src/gromacs/CMakeLists.txt.preplumed +572 -0
  343. plumed/_lib/lib/plumed/patches/gromacs-2024.3.diff/src/gromacs/mdlib/expanded.cpp +1632 -0
  344. plumed/_lib/lib/plumed/patches/gromacs-2024.3.diff/src/gromacs/mdlib/expanded.cpp.preplumed +1594 -0
  345. plumed/_lib/lib/plumed/patches/gromacs-2024.3.diff/src/gromacs/mdlib/expanded.h +104 -0
  346. plumed/_lib/lib/plumed/patches/gromacs-2024.3.diff/src/gromacs/mdlib/expanded.h.preplumed +100 -0
  347. plumed/_lib/lib/plumed/patches/gromacs-2024.3.diff/src/gromacs/mdlib/sim_util.cpp +2564 -0
  348. plumed/_lib/lib/plumed/patches/gromacs-2024.3.diff/src/gromacs/mdlib/sim_util.cpp.preplumed +2550 -0
  349. plumed/_lib/lib/plumed/patches/gromacs-2024.3.diff/src/gromacs/mdrun/legacymdrunoptions.cpp +208 -0
  350. plumed/_lib/lib/plumed/patches/gromacs-2024.3.diff/src/gromacs/mdrun/legacymdrunoptions.cpp.preplumed +175 -0
  351. plumed/_lib/lib/plumed/patches/gromacs-2024.3.diff/src/gromacs/mdrun/legacymdrunoptions.h +410 -0
  352. plumed/_lib/lib/plumed/patches/gromacs-2024.3.diff/src/gromacs/mdrun/legacymdrunoptions.h.preplumed +396 -0
  353. plumed/_lib/lib/plumed/patches/gromacs-2024.3.diff/src/gromacs/mdrun/md.cpp +2435 -0
  354. plumed/_lib/lib/plumed/patches/gromacs-2024.3.diff/src/gromacs/mdrun/md.cpp.preplumed +2187 -0
  355. plumed/_lib/lib/plumed/patches/gromacs-2024.3.diff/src/gromacs/mdrun/minimize.cpp +3592 -0
  356. plumed/_lib/lib/plumed/patches/gromacs-2024.3.diff/src/gromacs/mdrun/minimize.cpp.preplumed +3514 -0
  357. plumed/_lib/lib/plumed/patches/gromacs-2024.3.diff/src/gromacs/mdrun/replicaexchange.cpp +1513 -0
  358. plumed/_lib/lib/plumed/patches/gromacs-2024.3.diff/src/gromacs/mdrun/replicaexchange.cpp.preplumed +1409 -0
  359. plumed/_lib/lib/plumed/patches/gromacs-2024.3.diff/src/gromacs/mdrun/replicaexchange.h +114 -0
  360. plumed/_lib/lib/plumed/patches/gromacs-2024.3.diff/src/gromacs/mdrun/replicaexchange.h.preplumed +106 -0
  361. plumed/_lib/lib/plumed/patches/gromacs-2024.3.diff/src/gromacs/mdrun/rerun.cpp +958 -0
  362. plumed/_lib/lib/plumed/patches/gromacs-2024.3.diff/src/gromacs/mdrun/rerun.cpp.preplumed +929 -0
  363. plumed/_lib/lib/plumed/patches/gromacs-2024.3.diff/src/gromacs/mdrun/runner.cpp +2987 -0
  364. plumed/_lib/lib/plumed/patches/gromacs-2024.3.diff/src/gromacs/mdrun/runner.cpp.preplumed +2941 -0
  365. plumed/_lib/lib/plumed/patches/gromacs-2024.3.diff/src/gromacs/modularsimulator/expandedensembleelement.cpp +224 -0
  366. plumed/_lib/lib/plumed/patches/gromacs-2024.3.diff/src/gromacs/modularsimulator/expandedensembleelement.cpp.preplumed +222 -0
  367. plumed/_lib/lib/plumed/patches/gromacs-2024.3.diff/src/gromacs/taskassignment/decidegpuusage.cpp +904 -0
  368. plumed/_lib/lib/plumed/patches/gromacs-2024.3.diff/src/gromacs/taskassignment/decidegpuusage.cpp.preplumed +898 -0
  369. plumed/_lib/lib/plumed/patches/gromacs-2024.3.diff/src/gromacs/taskassignment/include/gromacs/taskassignment/decidegpuusage.h +353 -0
  370. plumed/_lib/lib/plumed/patches/gromacs-2024.3.diff/src/gromacs/taskassignment/include/gromacs/taskassignment/decidegpuusage.h.preplumed +351 -0
  371. plumed/_lib/lib/plumed/patches/gromacs-2025.0.config +39 -0
  372. plumed/_lib/lib/plumed/patches/gromacs-2025.0.diff/cmake/gmxManagePlumed.cmake +82 -0
  373. plumed/_lib/lib/plumed/patches/gromacs-2025.0.diff/cmake/gmxManagePlumed.cmake.preplumed +82 -0
  374. plumed/_lib/lib/plumed/patches/gromacs-2025.0.diff/src/gromacs/applied_forces/plumed/plumedMDModule.cpp +162 -0
  375. plumed/_lib/lib/plumed/patches/gromacs-2025.0.diff/src/gromacs/applied_forces/plumed/plumedMDModule.cpp.preplumed +154 -0
  376. plumed/_lib/lib/plumed/patches/gromacs-2025.0.diff/src/gromacs/applied_forces/plumed/plumedOptions.cpp +107 -0
  377. plumed/_lib/lib/plumed/patches/gromacs-2025.0.diff/src/gromacs/applied_forces/plumed/plumedOptions.cpp.preplumed +99 -0
  378. plumed/_lib/lib/plumed/patches/gromacs-2025.0.diff/src/gromacs/applied_forces/plumed/plumedOptions.h +120 -0
  379. plumed/_lib/lib/plumed/patches/gromacs-2025.0.diff/src/gromacs/applied_forces/plumed/plumedOptions.h.preplumed +111 -0
  380. plumed/_lib/lib/plumed/patches/gromacs-2025.0.diff/src/gromacs/applied_forces/plumed/plumedforceprovider.cpp +215 -0
  381. plumed/_lib/lib/plumed/patches/gromacs-2025.0.diff/src/gromacs/applied_forces/plumed/plumedforceprovider.cpp.preplumed +197 -0
  382. plumed/_lib/lib/plumed/patches/gromacs-2025.0.diff/src/gromacs/applied_forces/plumed/plumedforceprovider.h +87 -0
  383. plumed/_lib/lib/plumed/patches/gromacs-2025.0.diff/src/gromacs/applied_forces/plumed/plumedforceprovider.h.preplumed +86 -0
  384. plumed/_lib/lib/plumed/patches/gromacs-2025.0.diff/src/gromacs/mdrun/runner.cpp +2971 -0
  385. plumed/_lib/lib/plumed/patches/gromacs-2025.0.diff/src/gromacs/mdrun/runner.cpp.preplumed +2970 -0
  386. plumed/_lib/lib/plumed/patches/gromacs-2025.0.diff/src/gromacs/mdrunutility/mdmodulesnotifiers.h +430 -0
  387. plumed/_lib/lib/plumed/patches/gromacs-2025.0.diff/src/gromacs/mdrunutility/mdmodulesnotifiers.h.preplumed +429 -0
  388. plumed/_lib/lib/plumed/patches/namd-2.12.config +30 -0
  389. plumed/_lib/lib/plumed/patches/namd-2.12.diff +267 -0
  390. plumed/_lib/lib/plumed/patches/namd-2.13.config +30 -0
  391. plumed/_lib/lib/plumed/patches/namd-2.13.diff +267 -0
  392. plumed/_lib/lib/plumed/patches/namd-2.14.config +30 -0
  393. plumed/_lib/lib/plumed/patches/namd-2.14.diff +268 -0
  394. plumed/_lib/lib/plumed/patches/patch.sh +500 -0
  395. plumed/_lib/lib/plumed/patches/qespresso-5.0.2.config +25 -0
  396. plumed/_lib/lib/plumed/patches/qespresso-5.0.2.diff/PW/src/forces.f90 +368 -0
  397. plumed/_lib/lib/plumed/patches/qespresso-5.0.2.diff/PW/src/forces.f90.preplumed +366 -0
  398. plumed/_lib/lib/plumed/patches/qespresso-5.0.2.diff/PW/src/plugin_forces.f90 +71 -0
  399. plumed/_lib/lib/plumed/patches/qespresso-5.0.2.diff/PW/src/plugin_forces.f90.preplumed +24 -0
  400. plumed/_lib/lib/plumed/patches/qespresso-5.0.2.diff/PW/src/plugin_initialization.f90 +62 -0
  401. plumed/_lib/lib/plumed/patches/qespresso-5.0.2.diff/PW/src/plugin_initialization.f90.preplumed +21 -0
  402. plumed/_lib/lib/plumed/patches/qespresso-5.0.2.diff/PW/src/pwscf.f90 +189 -0
  403. plumed/_lib/lib/plumed/patches/qespresso-5.0.2.diff/PW/src/pwscf.f90.preplumed +185 -0
  404. plumed/_lib/lib/plumed/patches/qespresso-6.2.config +26 -0
  405. plumed/_lib/lib/plumed/patches/qespresso-6.2.diff/PW/src/forces.f90 +422 -0
  406. plumed/_lib/lib/plumed/patches/qespresso-6.2.diff/PW/src/forces.f90.preplumed +420 -0
  407. plumed/_lib/lib/plumed/patches/qespresso-6.2.diff/PW/src/plugin_ext_forces.f90 +70 -0
  408. plumed/_lib/lib/plumed/patches/qespresso-6.2.diff/PW/src/plugin_ext_forces.f90.preplumed +23 -0
  409. plumed/_lib/lib/plumed/patches/qespresso-6.2.diff/PW/src/plugin_initialization.f90 +62 -0
  410. plumed/_lib/lib/plumed/patches/qespresso-6.2.diff/PW/src/plugin_initialization.f90.preplumed +21 -0
  411. plumed/_lib/lib/plumed/patches/qespresso-6.2.diff/PW/src/run_pwscf.f90 +233 -0
  412. plumed/_lib/lib/plumed/patches/qespresso-6.2.diff/PW/src/run_pwscf.f90.preplumed +230 -0
  413. plumed/_lib/lib/plumed/patches/qespresso-7.0.config +28 -0
  414. plumed/_lib/lib/plumed/patches/qespresso-7.0.diff/Modules/Makefile +175 -0
  415. plumed/_lib/lib/plumed/patches/qespresso-7.0.diff/Modules/Makefile.preplumed +171 -0
  416. plumed/_lib/lib/plumed/patches/qespresso-7.0.diff/PW/src/forces.f90 +486 -0
  417. plumed/_lib/lib/plumed/patches/qespresso-7.0.diff/PW/src/forces.f90.preplumed +484 -0
  418. plumed/_lib/lib/plumed/patches/qespresso-7.0.diff/PW/src/plugin_ext_forces.f90 +74 -0
  419. plumed/_lib/lib/plumed/patches/qespresso-7.0.diff/PW/src/plugin_ext_forces.f90.preplumed +23 -0
  420. plumed/_lib/lib/plumed/patches/qespresso-7.0.diff/PW/src/plugin_initialization.f90 +64 -0
  421. plumed/_lib/lib/plumed/patches/qespresso-7.0.diff/PW/src/plugin_initialization.f90.preplumed +21 -0
  422. plumed/_lib/lib/plumed/patches/qespresso-7.0.diff/PW/src/run_pwscf.f90 +532 -0
  423. plumed/_lib/lib/plumed/patches/qespresso-7.0.diff/PW/src/run_pwscf.f90.preplumed +518 -0
  424. plumed/_lib/lib/plumed/patches/qespresso-7.2.config +28 -0
  425. plumed/_lib/lib/plumed/patches/qespresso-7.2.diff/Modules/Makefile +249 -0
  426. plumed/_lib/lib/plumed/patches/qespresso-7.2.diff/Modules/Makefile.preplumed +244 -0
  427. plumed/_lib/lib/plumed/patches/qespresso-7.2.diff/PW/src/forces.f90 +532 -0
  428. plumed/_lib/lib/plumed/patches/qespresso-7.2.diff/PW/src/forces.f90.preplumed +535 -0
  429. plumed/_lib/lib/plumed/patches/qespresso-7.2.diff/PW/src/plugin_ext_forces.f90 +74 -0
  430. plumed/_lib/lib/plumed/patches/qespresso-7.2.diff/PW/src/plugin_ext_forces.f90.preplumed +23 -0
  431. plumed/_lib/lib/plumed/patches/qespresso-7.2.diff/PW/src/plugin_initialization.f90 +64 -0
  432. plumed/_lib/lib/plumed/patches/qespresso-7.2.diff/PW/src/plugin_initialization.f90.preplumed +21 -0
  433. plumed/_lib/lib/plumed/patches/qespresso-7.2.diff/PW/src/run_pwscf.f90 +569 -0
  434. plumed/_lib/lib/plumed/patches/qespresso-7.2.diff/PW/src/run_pwscf.f90.preplumed +560 -0
  435. plumed/_lib/lib/plumed/plumed-config +9 -0
  436. plumed/_lib/lib/plumed/plumed-mklib +9 -0
  437. plumed/_lib/lib/plumed/plumed-newcv +9 -0
  438. plumed/_lib/lib/plumed/plumed-partial_tempering +9 -0
  439. plumed/_lib/lib/plumed/plumed-patch +9 -0
  440. plumed/_lib/lib/plumed/plumed-runtime +0 -0
  441. plumed/_lib/lib/plumed/plumed-selector +9 -0
  442. plumed/_lib/lib/plumed/plumed-vim2html +9 -0
  443. plumed/_lib/lib/plumed/scripts/config.sh +126 -0
  444. plumed/_lib/lib/plumed/scripts/mklib.sh +175 -0
  445. plumed/_lib/lib/plumed/scripts/newcv.sh +26 -0
  446. plumed/_lib/lib/plumed/scripts/partial_tempering.sh +319 -0
  447. plumed/_lib/lib/plumed/scripts/patch.sh +4 -0
  448. plumed/_lib/lib/plumed/scripts/selector.sh +234 -0
  449. plumed/_lib/lib/plumed/scripts/vim2html.sh +190 -0
  450. plumed/_lib/lib/plumed/src/colvar/Template.cpp +116 -0
  451. plumed/_lib/lib/plumed/src/config/compile_options.sh +3 -0
  452. plumed/_lib/lib/plumed/src/config/config.txt +181 -0
  453. plumed/_lib/lib/plumed/src/lib/Plumed.cmake +6 -0
  454. plumed/_lib/lib/plumed/src/lib/Plumed.cmake.runtime +5 -0
  455. plumed/_lib/lib/plumed/src/lib/Plumed.cmake.shared +5 -0
  456. plumed/_lib/lib/plumed/src/lib/Plumed.cmake.static +3 -0
  457. plumed/_lib/lib/plumed/src/lib/Plumed.inc +6 -0
  458. plumed/_lib/lib/plumed/src/lib/Plumed.inc.runtime +5 -0
  459. plumed/_lib/lib/plumed/src/lib/Plumed.inc.shared +5 -0
  460. plumed/_lib/lib/plumed/src/lib/Plumed.inc.static +3 -0
  461. plumed/_lib/lib/plumed/vim/scripts.vim +6 -0
  462. plumed/_plumed_core.cpython-311-darwin.so +0 -0
  463. plumed/_plumed_core.cpython-312-darwin.so +0 -0
  464. plumed/_plumed_core.cpython-313-darwin.so +0 -0
  465. plumedCommunications.cpython-311-darwin.so +0 -0
  466. plumedCommunications.cpython-312-darwin.so +0 -0
  467. plumedCommunications.cpython-313-darwin.so +0 -0
  468. plumedCommunications.pyi +431 -0
@@ -0,0 +1,2550 @@
+ /*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright 1991- The GROMACS Authors
+ * and the project initiators Erik Lindahl, Berk Hess and David van der Spoel.
+ * Consult the AUTHORS/COPYING files and https://www.gromacs.org for details.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * https://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at https://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out https://www.gromacs.org.
+ */
+ #include "gmxpre.h"
+
+ #include "config.h"
+
+ #include <cmath>
+ #include <cstdint>
+ #include <cstdio>
+ #include <cstring>
+
+ #include <array>
+ #include <optional>
+
+ #include "gromacs/applied_forces/awh/awh.h"
+ #include "gromacs/domdec/dlbtiming.h"
+ #include "gromacs/domdec/domdec.h"
+ #include "gromacs/domdec/domdec_struct.h"
+ #include "gromacs/domdec/gpuhaloexchange.h"
+ #include "gromacs/domdec/partition.h"
+ #include "gromacs/essentialdynamics/edsam.h"
+ #include "gromacs/ewald/pme.h"
+ #include "gromacs/ewald/pme_coordinate_receiver_gpu.h"
+ #include "gromacs/ewald/pme_pp.h"
+ #include "gromacs/ewald/pme_pp_comm_gpu.h"
+ #include "gromacs/gmxlib/network.h"
+ #include "gromacs/gmxlib/nonbonded/nb_free_energy.h"
+ #include "gromacs/gmxlib/nonbonded/nonbonded.h"
+ #include "gromacs/gmxlib/nrnb.h"
+ #include "gromacs/gpu_utils/gpu_utils.h"
+ #include "gromacs/imd/imd.h"
+ #include "gromacs/listed_forces/disre.h"
+ #include "gromacs/listed_forces/listed_forces.h"
+ #include "gromacs/listed_forces/listed_forces_gpu.h"
+ #include "gromacs/listed_forces/orires.h"
+ #include "gromacs/math/arrayrefwithpadding.h"
+ #include "gromacs/math/functions.h"
+ #include "gromacs/math/units.h"
+ #include "gromacs/math/vec.h"
+ #include "gromacs/math/vecdump.h"
+ #include "gromacs/mdlib/calcmu.h"
+ #include "gromacs/mdlib/calcvir.h"
+ #include "gromacs/mdlib/constr.h"
+ #include "gromacs/mdlib/dispersioncorrection.h"
+ #include "gromacs/mdlib/enerdata_utils.h"
+ #include "gromacs/mdlib/force.h"
+ #include "gromacs/mdlib/force_flags.h"
+ #include "gromacs/mdlib/forcerec.h"
+ #include "gromacs/mdlib/gmx_omp_nthreads.h"
+ #include "gromacs/mdlib/update.h"
+ #include "gromacs/mdlib/vsite.h"
+ #include "gromacs/mdlib/wall.h"
+ #include "gromacs/mdlib/wholemoleculetransform.h"
+ #include "gromacs/mdrunutility/mdmodulesnotifiers.h"
+ #include "gromacs/mdtypes/commrec.h"
+ #include "gromacs/mdtypes/enerdata.h"
+ #include "gromacs/mdtypes/forcebuffers.h"
+ #include "gromacs/mdtypes/forceoutput.h"
+ #include "gromacs/mdtypes/forcerec.h"
+ #include "gromacs/mdtypes/iforceprovider.h"
+ #include "gromacs/mdtypes/inputrec.h"
+ #include "gromacs/mdtypes/md_enums.h"
+ #include "gromacs/mdtypes/mdatom.h"
+ #include "gromacs/mdtypes/multipletimestepping.h"
+ #include "gromacs/mdtypes/simulation_workload.h"
+ #include "gromacs/mdtypes/state.h"
+ #include "gromacs/mdtypes/state_propagator_data_gpu.h"
+ #include "gromacs/nbnxm/gpu_data_mgmt.h"
+ #include "gromacs/nbnxm/nbnxm.h"
+ #include "gromacs/nbnxm/nbnxm_gpu.h"
+ #include "gromacs/pbcutil/ishift.h"
+ #include "gromacs/pbcutil/pbc.h"
+ #include "gromacs/pulling/pull.h"
+ #include "gromacs/pulling/pull_rotation.h"
+ #include "gromacs/timing/cyclecounter.h"
+ #include "gromacs/timing/gpu_timing.h"
+ #include "gromacs/timing/wallcycle.h"
+ #include "gromacs/timing/wallcyclereporting.h"
+ #include "gromacs/timing/walltime_accounting.h"
+ #include "gromacs/topology/topology.h"
+ #include "gromacs/utility/arrayref.h"
+ #include "gromacs/utility/basedefinitions.h"
+ #include "gromacs/utility/cstringutil.h"
+ #include "gromacs/utility/exceptions.h"
+ #include "gromacs/utility/fatalerror.h"
+ #include "gromacs/utility/fixedcapacityvector.h"
+ #include "gromacs/utility/gmxassert.h"
+ #include "gromacs/utility/gmxmpi.h"
+ #include "gromacs/utility/logger.h"
+ #include "gromacs/utility/smalloc.h"
+ #include "gromacs/utility/strconvert.h"
+ #include "gromacs/utility/stringutil.h"
+ #include "gromacs/utility/sysinfo.h"
+
+ #include "gpuforcereduction.h"
+
+ using gmx::ArrayRef;
+ using gmx::AtomLocality;
+ using gmx::DomainLifetimeWorkload;
+ using gmx::ForceOutputs;
+ using gmx::ForceWithShiftForces;
+ using gmx::InteractionLocality;
+ using gmx::RVec;
+ using gmx::SimulationWorkload;
+ using gmx::StepWorkload;
+
+ // TODO: this environment variable allows us to verify before release
+ // that on less common architectures the total cost of polling is not larger than
+ // a blocking wait (so polling does not introduce overhead when the static
+ // PME-first ordering would suffice).
+ static const bool c_disableAlternatingWait = (getenv("GMX_DISABLE_ALTERNATING_GPU_WAIT") != nullptr);
+
+ static void sum_forces(ArrayRef<RVec> f, ArrayRef<const RVec> forceToAdd)
+ {
+ GMX_ASSERT(f.size() >= forceToAdd.size(), "Accumulation buffer should be sufficiently large");
+ const int end = forceToAdd.size();
+
+ int gmx_unused nt = gmx_omp_nthreads_get(ModuleMultiThread::Default);
+ #pragma omp parallel for num_threads(nt) schedule(static)
+ for (int i = 0; i < end; i++)
+ {
+ rvec_inc(f[i], forceToAdd[i]);
+ }
+ }
+
+ static void calc_virial(int start,
+ int homenr,
+ const rvec x[],
+ const gmx::ForceWithShiftForces& forceWithShiftForces,
+ tensor vir_part,
+ const matrix box,
+ t_nrnb* nrnb,
+ const t_forcerec* fr,
+ PbcType pbcType)
+ {
+ /* The short-range virial from surrounding boxes */
+ const rvec* fshift = as_rvec_array(forceWithShiftForces.shiftForces().data());
+ const rvec* shiftVecPointer = as_rvec_array(fr->shift_vec.data());
+ calc_vir(gmx::c_numShiftVectors, shiftVecPointer, fshift, vir_part, pbcType == PbcType::Screw, box);
+ inc_nrnb(nrnb, eNR_VIRIAL, gmx::c_numShiftVectors);
+
+ /* Calculate partial virial, for local atoms only, based on short range.
+ * Total virial is computed in global_stat, called from do_md
+ */
+ const rvec* f = as_rvec_array(forceWithShiftForces.force().data());
+ f_calc_vir(start, start + homenr, x, f, vir_part, box);
+ inc_nrnb(nrnb, eNR_VIRIAL, homenr);
+
+ if (debug)
+ {
+ pr_rvecs(debug, 0, "vir_part", vir_part, DIM);
+ }
+ }
+
+ static void pull_potential_wrapper(const t_commrec* cr,
+ const t_inputrec& ir,
+ const matrix box,
+ gmx::ArrayRef<const gmx::RVec> x,
+ const t_mdatoms* mdatoms,
+ gmx_enerdata_t* enerd,
+ pull_t* pull_work,
+ const real* lambda,
+ double t,
+ gmx_wallcycle* wcycle)
+ {
+ t_pbc pbc;
+ real dvdl;
+
+ /* Calculate the center of mass forces, this requires communication,
+ * which is why pull_potential is called close to other communication.
+ */
+ wallcycle_start(wcycle, WallCycleCounter::PullPot);
+ set_pbc(&pbc, ir.pbcType, box);
+ dvdl = 0;
+ enerd->term[F_COM_PULL] +=
+ pull_potential(pull_work,
+ mdatoms->massT,
+ pbc,
+ cr,
+ t,
+ lambda[static_cast<int>(FreeEnergyPerturbationCouplingType::Restraint)],
+ x,
+ &dvdl);
+ enerd->dvdl_lin[FreeEnergyPerturbationCouplingType::Restraint] += dvdl;
+ wallcycle_stop(wcycle, WallCycleCounter::PullPot);
+ }
+
+ static void pme_receive_force_ener(t_forcerec* fr,
+ const t_commrec* cr,
+ gmx::ForceWithVirial* forceWithVirial,
+ gmx_enerdata_t* enerd,
+ bool useGpuPmePpComms,
+ bool receivePmeForceToGpu,
+ gmx_wallcycle* wcycle)
+ {
+ real e_q, e_lj, dvdl_q, dvdl_lj;
+ float cycles_ppdpme, cycles_seppme;
+
+ cycles_ppdpme = wallcycle_stop(wcycle, WallCycleCounter::PpDuringPme);
+ dd_cycles_add(cr->dd, cycles_ppdpme, ddCyclPPduringPME);
+
+ /* In case of node-splitting, the PP nodes receive the long-range
+ * forces, virial and energy from the PME nodes here.
+ */
+ wallcycle_start(wcycle, WallCycleCounter::PpPmeWaitRecvF);
+ dvdl_q = 0;
+ dvdl_lj = 0;
+ gmx_pme_receive_f(fr->pmePpCommGpu.get(),
+ cr,
+ forceWithVirial,
+ &e_q,
+ &e_lj,
+ &dvdl_q,
+ &dvdl_lj,
+ useGpuPmePpComms,
+ receivePmeForceToGpu,
+ &cycles_seppme);
+ enerd->term[F_COUL_RECIP] += e_q;
+ enerd->term[F_LJ_RECIP] += e_lj;
+ enerd->dvdl_lin[FreeEnergyPerturbationCouplingType::Coul] += dvdl_q;
+ enerd->dvdl_lin[FreeEnergyPerturbationCouplingType::Vdw] += dvdl_lj;
+
+ if (wcycle)
+ {
+ dd_cycles_add(cr->dd, cycles_seppme, ddCyclPME);
+ }
+ wallcycle_stop(wcycle, WallCycleCounter::PpPmeWaitRecvF);
+ }
+
+ static void print_large_forces(FILE* fp,
+ const t_mdatoms* md,
+ const t_commrec* cr,
+ int64_t step,
+ real forceTolerance,
+ ArrayRef<const RVec> x,
+ ArrayRef<const RVec> f)
+ {
+ real force2Tolerance = gmx::square(forceTolerance);
+ gmx::Index numNonFinite = 0;
+ for (int i = 0; i < md->homenr; i++)
+ {
+ real force2 = norm2(f[i]);
+ bool nonFinite = !std::isfinite(force2);
+ if (force2 >= force2Tolerance || nonFinite)
+ {
+ fprintf(fp,
+ "step %" PRId64 " atom %6d x %8.3f %8.3f %8.3f force %12.5e\n",
+ step,
+ ddglatnr(cr->dd, i),
+ x[i][XX],
+ x[i][YY],
+ x[i][ZZ],
+ std::sqrt(force2));
+ }
+ if (nonFinite)
+ {
+ numNonFinite++;
+ }
+ }
+ if (numNonFinite > 0)
+ {
+ /* Note that with MPI this fatal call on one rank might interrupt
+ * the printing on other ranks. But we can only avoid that with
+ * an expensive MPI barrier that we would need at each step.
+ */
+ gmx_fatal(FARGS, "At step %" PRId64 " detected non-finite forces on %td atoms", step, numNonFinite);
+ }
+ }
+
+ //! When necessary, spreads forces on vsites and computes the virial for \p forceOutputs->forceWithShiftForces()
+ static void postProcessForceWithShiftForces(t_nrnb* nrnb,
+ gmx_wallcycle* wcycle,
+ const matrix box,
+ ArrayRef<const RVec> x,
+ ForceOutputs* forceOutputs,
+ tensor vir_force,
+ const t_mdatoms& mdatoms,
+ const t_forcerec& fr,
+ gmx::VirtualSitesHandler* vsite,
+ const StepWorkload& stepWork)
+ {
+ ForceWithShiftForces& forceWithShiftForces = forceOutputs->forceWithShiftForces();
+
+ /* If we have NoVirSum forces, but we do not calculate the virial,
+ * we later sum the forceWithShiftForces buffer together with
+ * the noVirSum buffer and spread the combined vsite forces at once.
+ */
+ if (vsite && (!forceOutputs->haveForceWithVirial() || stepWork.computeVirial))
+ {
+ using VirialHandling = gmx::VirtualSitesHandler::VirialHandling;
+
+ auto f = forceWithShiftForces.force();
+ auto fshift = forceWithShiftForces.shiftForces();
+ const VirialHandling virialHandling =
+ (stepWork.computeVirial ? VirialHandling::Pbc : VirialHandling::None);
+ vsite->spreadForces(x, f, virialHandling, fshift, nullptr, nrnb, box, wcycle);
+ forceWithShiftForces.haveSpreadVsiteForces() = true;
+ }
+
+ if (stepWork.computeVirial)
+ {
+ /* Calculation of the virial must be done after vsites! */
+ calc_virial(
+ 0, mdatoms.homenr, as_rvec_array(x.data()), forceWithShiftForces, vir_force, box, nrnb, &fr, fr.pbcType);
+ }
+ }
+
+ //! Spread, compute virial for and sum forces, when necessary
+ static void postProcessForces(const t_commrec* cr,
+ int64_t step,
+ t_nrnb* nrnb,
+ gmx_wallcycle* wcycle,
+ const matrix box,
+ ArrayRef<const RVec> x,
+ ForceOutputs* forceOutputs,
+ tensor vir_force,
+ const t_mdatoms* mdatoms,
+ const t_forcerec* fr,
+ gmx::VirtualSitesHandler* vsite,
+ const StepWorkload& stepWork)
+ {
+ // Extract the final output force buffer, which is also the buffer for forces with shift forces
+ ArrayRef<RVec> f = forceOutputs->forceWithShiftForces().force();
+
+ if (forceOutputs->haveForceWithVirial())
+ {
+ auto& forceWithVirial = forceOutputs->forceWithVirial();
+
+ if (vsite)
+ {
+ /* Spread the mesh force on virtual sites to the other particles...
+ * This is parallellized. MPI communication is performed
+ * if the constructing atoms aren't local.
+ */
+ GMX_ASSERT(!stepWork.computeVirial || f.data() != forceWithVirial.force_.data(),
+ "We need separate force buffers for shift and virial forces when "
+ "computing the virial");
+ GMX_ASSERT(!stepWork.computeVirial
+ || forceOutputs->forceWithShiftForces().haveSpreadVsiteForces(),
+ "We should spread the force with shift forces separately when computing "
+ "the virial");
+ const gmx::VirtualSitesHandler::VirialHandling virialHandling =
+ (stepWork.computeVirial ? gmx::VirtualSitesHandler::VirialHandling::NonLinear
+ : gmx::VirtualSitesHandler::VirialHandling::None);
+ matrix virial = { { 0 } };
+ vsite->spreadForces(x, forceWithVirial.force_, virialHandling, {}, virial, nrnb, box, wcycle);
+ forceWithVirial.addVirialContribution(virial);
+ }
+
+ if (stepWork.computeVirial)
+ {
+ /* Now add the forces, this is local */
+ sum_forces(f, forceWithVirial.force_);
+
+ /* Add the direct virial contributions */
+ GMX_ASSERT(
+ forceWithVirial.computeVirial_,
+ "forceWithVirial should request virial computation when we request the virial");
+ m_add(vir_force, forceWithVirial.getVirial(), vir_force);
+
+ if (debug)
+ {
+ pr_rvecs(debug, 0, "vir_force", vir_force, DIM);
+ }
+ }
+ }
+ else
+ {
+ GMX_ASSERT(vsite == nullptr || forceOutputs->forceWithShiftForces().haveSpreadVsiteForces(),
+ "We should have spread the vsite forces (earlier)");
+ }
+
+ if (fr->print_force >= 0)
+ {
+ print_large_forces(stderr, mdatoms, cr, step, fr->print_force, x, f);
+ }
+ }
+
+ static void do_nb_verlet(t_forcerec* fr,
+ const interaction_const_t* ic,
+ gmx_enerdata_t* enerd,
+ const StepWorkload& stepWork,
+ const InteractionLocality ilocality,
+ const int clearF,
+ const int64_t step,
+ t_nrnb* nrnb,
+ gmx_wallcycle* wcycle)
+ {
+ if (!stepWork.computeNonbondedForces)
+ {
+ /* skip non-bonded calculation */
+ return;
+ }
+
+ nonbonded_verlet_t* nbv = fr->nbv.get();
+
+ /* GPU kernel launch overhead is already timed separately */
+ if (!nbv->useGpu())
+ {
+ /* When dynamic pair-list pruning is requested, we need to prune
+ * at nstlistPrune steps.
+ */
+ if (nbv->isDynamicPruningStepCpu(step))
+ {
+ /* Prune the pair-list beyond fr->ic->rlistPrune using
+ * the current coordinates of the atoms.
+ */
+ wallcycle_sub_start(wcycle, WallCycleSubCounter::NonbondedPruning);
+ nbv->dispatchPruneKernelCpu(ilocality, fr->shift_vec);
+ wallcycle_sub_stop(wcycle, WallCycleSubCounter::NonbondedPruning);
+ }
+ }
+
+ nbv->dispatchNonbondedKernel(
+ ilocality,
+ *ic,
+ stepWork,
+ clearF,
+ fr->shift_vec,
+ enerd->grpp.energyGroupPairTerms[fr->haveBuckingham ? NonBondedEnergyTerms::BuckinghamSR
+ : NonBondedEnergyTerms::LJSR],
+ enerd->grpp.energyGroupPairTerms[NonBondedEnergyTerms::CoulombSR],
+ nrnb);
+ }
+
+ static inline void clearRVecs(ArrayRef<RVec> v, const bool useOpenmpThreading)
+ {
+ int nth = gmx_omp_nthreads_get_simple_rvec_task(ModuleMultiThread::Default, v.ssize());
+
+ /* Note that we would like to avoid this conditional by putting it
+ * into the omp pragma instead, but then we still take the full
+ * omp parallel for overhead (at least with gcc5).
+ */
+ if (!useOpenmpThreading || nth == 1)
+ {
+ for (RVec& elem : v)
+ {
+ clear_rvec(elem);
+ }
+ }
+ else
+ {
+ #pragma omp parallel for num_threads(nth) schedule(static)
+ for (gmx::Index i = 0; i < v.ssize(); i++)
+ {
+ clear_rvec(v[i]);
+ }
+ }
+ }
+
+ /*! \brief Return an estimate of the average kinetic energy or 0 when unreliable
+ *
+ * \param groupOptions Group options, containing T-coupling options
+ */
+ static real averageKineticEnergyEstimate(const t_grpopts& groupOptions)
+ {
+ real nrdfCoupled = 0;
+ real nrdfUncoupled = 0;
+ real kineticEnergy = 0;
+ for (int g = 0; g < groupOptions.ngtc; g++)
+ {
+ if (groupOptions.tau_t[g] >= 0)
+ {
+ nrdfCoupled += groupOptions.nrdf[g];
+ kineticEnergy += groupOptions.nrdf[g] * 0.5 * groupOptions.ref_t[g] * gmx::c_boltz;
+ }
+ else
+ {
+ nrdfUncoupled += groupOptions.nrdf[g];
+ }
+ }
+
+ /* This conditional with > also catches nrdf=0 */
+ if (nrdfCoupled > nrdfUncoupled)
+ {
+ return kineticEnergy * (nrdfCoupled + nrdfUncoupled) / nrdfCoupled;
+ }
+ else
+ {
+ return 0;
+ }
+ }
+
+ /*! \brief This routine checks that the potential energy is finite.
+ *
+ * Always checks that the potential energy is finite. If step equals
+ * inputrec.init_step also checks that the magnitude of the potential energy
+ * is reasonable. Terminates with a fatal error when a check fails.
+ * Note that passing this check does not guarantee finite forces,
+ * since those use slightly different arithmetics. But in most cases
+ * there is just a narrow coordinate range where forces are not finite
+ * and energies are finite.
+ *
+ * \param[in] step The step number, used for checking and printing
+ * \param[in] enerd The energy data; the non-bonded group energies need to be added to
+ * \c enerd.term[F_EPOT] before calling this routine
+ * \param[in] inputrec The input record
+ */
+ static void checkPotentialEnergyValidity(int64_t step, const gmx_enerdata_t& enerd, const t_inputrec& inputrec)
+ {
+ /* Threshold valid for comparing absolute potential energy against
+ * the kinetic energy. Normally one should not consider absolute
+ * potential energy values, but with a factor of one million
+ * we should never get false positives.
+ */
+ constexpr real c_thresholdFactor = 1e6;
+
+ bool energyIsNotFinite = !std::isfinite(enerd.term[F_EPOT]);
+ real averageKineticEnergy = 0;
+ /* We only check for large potential energy at the initial step,
+ * because that is by far the most likely step for this too occur
+ * and because computing the average kinetic energy is not free.
+ * Note: nstcalcenergy >> 1 often does not allow to catch large energies
+ * before they become NaN.
+ */
+ if (step == inputrec.init_step && EI_DYNAMICS(inputrec.eI))
+ {
+ averageKineticEnergy = averageKineticEnergyEstimate(inputrec.opts);
+ }
+
+ if (energyIsNotFinite
+ || (averageKineticEnergy > 0 && enerd.term[F_EPOT] > c_thresholdFactor * averageKineticEnergy))
+ {
+ GMX_THROW(gmx::InternalError(gmx::formatString(
+ "Step %" PRId64
+ ": The total potential energy is %g, which is %s. The LJ and electrostatic "
+ "contributions to the energy are %g and %g, respectively. A %s potential energy "
+ "can be caused by overlapping interactions in bonded interactions or very large%s "
+ "coordinate values. Usually this is caused by a badly- or non-equilibrated initial "
+ "configuration, incorrect interactions or parameters in the topology.",
+ step,
+ enerd.term[F_EPOT],
+ energyIsNotFinite ? "not finite" : "extremely high",
+ enerd.term[F_LJ],
+ enerd.term[F_COUL_SR],
+ energyIsNotFinite ? "non-finite" : "very high",
+ energyIsNotFinite ? " or Nan" : "")));
+ }
+ }
+
+ /*! \brief Compute forces and/or energies for special algorithms
+ *
+ * The intention is to collect all calls to algorithms that compute
+ * forces on local atoms only and that do not contribute to the local
+ * virial sum (but add their virial contribution separately).
+ * Eventually these should likely all become ForceProviders.
+ * Within this function the intention is to have algorithms that do
+ * global communication at the end, so global barriers within the MD loop
+ * are as close together as possible.
+ *
+ * \param[in] fplog The log file
+ * \param[in] cr The communication record
+ * \param[in] inputrec The input record
+ * \param[in] awh The Awh module (nullptr if none in use).
+ * \param[in] enforcedRotation Enforced rotation module.
+ * \param[in] imdSession The IMD session
+ * \param[in] pull_work The pull work structure.
+ * \param[in] step The current MD step
+ * \param[in] t The current time
+ * \param[in,out] wcycle Wallcycle accounting struct
+ * \param[in,out] forceProviders Pointer to a list of force providers
+ * \param[in] box The unit cell
+ * \param[in] x The coordinates
+ * \param[in] mdatoms Per atom properties
+ * \param[in] lambda Array of free-energy lambda values
+ * \param[in] stepWork Step schedule flags
+ * \param[in,out] forceWithVirialMtsLevel0 Force and virial for MTS level0 forces
+ * \param[in,out] forceWithVirialMtsLevel1 Force and virial for MTS level1 forces, can be nullptr
+ * \param[in,out] enerd Energy buffer
+ * \param[in,out] ed Essential dynamics pointer
+ * \param[in] didNeighborSearch Tells if we did neighbor searching this step, used for ED sampling
+ *
+ * \todo Remove didNeighborSearch, which is used incorrectly.
+ * \todo Convert all other algorithms called here to ForceProviders.
+ */
607
+ static void computeSpecialForces(FILE* fplog,
608
+ const t_commrec* cr,
609
+ const t_inputrec& inputrec,
610
+ gmx::Awh* awh,
611
+ gmx_enfrot* enforcedRotation,
612
+ gmx::ImdSession* imdSession,
613
+ pull_t* pull_work,
614
+ int64_t step,
615
+ double t,
616
+ gmx_wallcycle* wcycle,
617
+ gmx::ForceProviders* forceProviders,
618
+ const matrix box,
619
+ gmx::ArrayRef<const gmx::RVec> x,
620
+ const t_mdatoms* mdatoms,
621
+ gmx::ArrayRef<const real> lambda,
622
+ const StepWorkload& stepWork,
623
+ gmx::ForceWithVirial* forceWithVirialMtsLevel0,
624
+ gmx::ForceWithVirial* forceWithVirialMtsLevel1,
625
+ gmx_enerdata_t* enerd,
626
+ gmx_edsam* ed,
627
+ bool didNeighborSearch)
628
+ {
629
+ /* NOTE: Currently all ForceProviders only provide forces.
630
+ * When they also provide energies, remove this conditional.
631
+ */
632
+ if (stepWork.computeForces)
633
+ {
634
+ gmx::ForceProviderInput forceProviderInput(
635
+ x,
636
+ mdatoms->homenr,
637
+ gmx::makeArrayRef(mdatoms->chargeA).subArray(0, mdatoms->homenr),
638
+ gmx::makeArrayRef(mdatoms->massT).subArray(0, mdatoms->homenr),
639
+ t,
640
+ step,
641
+ box,
642
+ *cr);
643
+ gmx::ForceProviderOutput forceProviderOutput(forceWithVirialMtsLevel0, enerd);
644
+
645
+ /* Collect forces from modules */
646
+ forceProviders->calculateForces(forceProviderInput, &forceProviderOutput);
647
+ }
648
+
649
+ const int pullMtsLevel = forceGroupMtsLevel(inputrec.mtsLevels, gmx::MtsForceGroups::Pull);
650
+ const bool doPulling = (inputrec.bPull && pull_have_potential(*pull_work)
651
+ && (pullMtsLevel == 0 || stepWork.computeSlowForces));
652
+
653
+ /* pull_potential_wrapper(), awh->applyBiasForcesAndUpdateBias(), pull_apply_forces()
654
+ * have to be called in this order
655
+ */
656
+ if (doPulling)
657
+ {
658
+ pull_potential_wrapper(cr, inputrec, box, x, mdatoms, enerd, pull_work, lambda.data(), t, wcycle);
659
+ }
660
+ if (awh && (pullMtsLevel == 0 || stepWork.computeSlowForces))
661
+ {
662
+ const bool needForeignEnergyDifferences = awh->needForeignEnergyDifferences(step);
663
+ std::vector<double> foreignLambdaDeltaH, foreignLambdaDhDl;
664
+ if (needForeignEnergyDifferences)
665
+ {
666
+ enerd->foreignLambdaTerms.finalizePotentialContributions(
667
+ enerd->dvdl_lin, lambda, *inputrec.fepvals);
668
+ std::tie(foreignLambdaDeltaH, foreignLambdaDhDl) = enerd->foreignLambdaTerms.getTerms(cr);
669
+ }
670
+
671
+ enerd->term[F_COM_PULL] += awh->applyBiasForcesAndUpdateBias(
672
+ inputrec.pbcType, foreignLambdaDeltaH, foreignLambdaDhDl, box, t, step, wcycle, fplog);
673
+ }
674
+ if (doPulling)
675
+ {
676
+ wallcycle_start_nocount(wcycle, WallCycleCounter::PullPot);
677
+ auto& forceWithVirial = (pullMtsLevel == 0) ? forceWithVirialMtsLevel0 : forceWithVirialMtsLevel1;
678
+ pull_apply_forces(pull_work, mdatoms->massT, cr, forceWithVirial);
679
+ wallcycle_stop(wcycle, WallCycleCounter::PullPot);
680
+ }
681
+
682
+ /* Add the forces from enforced rotation potentials (if any) */
683
+ if (inputrec.bRot)
684
+ {
685
+ wallcycle_start(wcycle, WallCycleCounter::RotAdd);
686
+ enerd->term[F_COM_PULL] +=
687
+ add_rot_forces(enforcedRotation, forceWithVirialMtsLevel0->force_, cr, step, t);
688
+ wallcycle_stop(wcycle, WallCycleCounter::RotAdd);
689
+ }
690
+
691
+ if (ed)
692
+ {
693
+ /* Note that since init_edsam() is called after the initialization
694
+ * of forcerec, edsam doesn't request the noVirSum force buffer.
695
+ * Thus if no other algorithm (e.g. PME) requires it, the forces
696
+ * here will contribute to the virial.
697
+ */
698
+ do_flood(cr, inputrec, x, forceWithVirialMtsLevel0->force_, ed, box, step, didNeighborSearch);
699
+ }
700
+
701
+ /* Add forces from interactive molecular dynamics (IMD), if any */
702
+ if (inputrec.bIMD && stepWork.computeForces)
703
+ {
704
+ imdSession->applyForces(forceWithVirialMtsLevel0->force_);
705
+ }
706
+ }
707
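
Editorial note: the comment above says these algorithms should eventually become ForceProviders, i.e. objects whose contributions are collected through the single forceProviders->calculateForces(forceProviderInput, &forceProviderOutput) call at the top of computeSpecialForces(). A skeleton of such a provider is sketched below; the interface shape is inferred from that call, the exact members of ForceProviderInput/Output are not shown in this hunk, and the class MyRestraint is hypothetical.

    // Editorial sketch, not part of the diff: skeleton of a force provider whose
    // contribution would be collected by forceProviders->calculateForces(...) above.
    class MyRestraint final : public gmx::IForceProvider
    {
    public:
        void calculateForces(const gmx::ForceProviderInput& input,
                             gmx::ForceProviderOutput*      output) override
        {
            // Read coordinates, charges and masses from 'input' and accumulate this
            // provider's forces (and, once providers also report energies, an energy
            // term) into 'output'. Intentionally left empty in this sketch.
            GMX_UNUSED_VALUE(input);
            GMX_UNUSED_VALUE(output);
        }
    };
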
+
708
+ /*! \brief Launch the prepare_step and spread stages of PME GPU.
709
+ *
710
+ * \param[in] pmedata The PME structure
711
+ * \param[in] box The box matrix
712
+ * \param[in] stepWork Step schedule flags
713
+ * \param[in] xReadyOnDevice Event synchronizer indicating that the coordinates are ready in the device memory.
714
+ * \param[in] lambdaQ The Coulomb lambda of the current state.
715
+ * \param[in] useMdGpuGraph Whether MD GPU Graph is in use.
716
+ * \param[in] wcycle The wallcycle structure
717
+ */
718
+ static inline void launchPmeGpuSpread(gmx_pme_t* pmedata,
719
+ const matrix box,
720
+ const StepWorkload& stepWork,
721
+ GpuEventSynchronizer* xReadyOnDevice,
722
+ const real lambdaQ,
723
+ bool useMdGpuGraph,
724
+ gmx_wallcycle* wcycle)
725
+ {
726
+ wallcycle_start(wcycle, WallCycleCounter::PmeGpuMesh);
727
+ pme_gpu_prepare_computation(pmedata, box, wcycle, stepWork);
728
+ bool useGpuDirectComm = false;
729
+ gmx::PmeCoordinateReceiverGpu* pmeCoordinateReceiverGpu = nullptr;
730
+ pme_gpu_launch_spread(
731
+ pmedata, xReadyOnDevice, wcycle, lambdaQ, useGpuDirectComm, pmeCoordinateReceiverGpu, useMdGpuGraph);
732
+ wallcycle_stop(wcycle, WallCycleCounter::PmeGpuMesh);
733
+ }
734
+
735
+ /*! \brief Launch the FFT and gather stages of PME GPU
736
+ *
737
+ * This function only implements setting the output forces (no accumulation).
738
+ *
739
+ * \param[in] pmedata The PME structure
740
+ * \param[in] lambdaQ The Coulomb lambda of the current system state.
741
+ * \param[in] wcycle The wallcycle structure
742
+ * \param[in] stepWork Step schedule flags
743
+ */
744
+ static void launchPmeGpuFftAndGather(gmx_pme_t* pmedata,
745
+ const real lambdaQ,
746
+ gmx_wallcycle* wcycle,
747
+ const gmx::StepWorkload& stepWork)
748
+ {
749
+ wallcycle_start_nocount(wcycle, WallCycleCounter::PmeGpuMesh);
750
+ pme_gpu_launch_complex_transforms(pmedata, wcycle, stepWork);
751
+ pme_gpu_launch_gather(pmedata, wcycle, lambdaQ, stepWork.computeVirial);
752
+ wallcycle_stop(wcycle, WallCycleCounter::PmeGpuMesh);
753
+ }
754
+
755
+ /*! \brief
756
+ * Blocks until PME GPU tasks are completed, and gets the output forces and virial/energy
757
+ * (if they were to be computed).
758
+ *
759
+ * \param[in] pme The PME data structure.
760
+ * \param[in] stepWork The required work for this simulation step
761
+ * \param[in] wcycle The wallclock counter.
762
+ * \param[out] forceWithVirial The output force and virial
763
+ * \param[out] enerd The output energies
764
+ * \param[in] lambdaQ The Coulomb lambda to use when calculating the results.
765
+ */
766
+ static void pmeGpuWaitAndReduce(gmx_pme_t* pme,
767
+ const gmx::StepWorkload& stepWork,
768
+ gmx_wallcycle* wcycle,
769
+ gmx::ForceWithVirial* forceWithVirial,
770
+ gmx_enerdata_t* enerd,
771
+ const real lambdaQ)
772
+ {
773
+ wallcycle_start_nocount(wcycle, WallCycleCounter::PmeGpuMesh);
774
+
775
+ pme_gpu_wait_and_reduce(pme, stepWork, wcycle, forceWithVirial, enerd, lambdaQ);
776
+
777
+ wallcycle_stop(wcycle, WallCycleCounter::PmeGpuMesh);
778
+ }
779
+
780
+ /*! \brief
781
+ * Polling wait for either of the PME or nonbonded GPU tasks.
782
+ *
783
+ * Instead of a static order in waiting for GPU tasks, this function
784
+ * polls to check which of the two tasks completes first, and does the
785
+ * associated force buffer reduction overlapped with the other task.
786
+ * By doing that, unlike a static scheduling order, it can always overlap
787
+ * one of the reductions, regardless of the GPU task completion order.
788
+ *
789
+ * \param[in] nbv Nonbonded verlet structure
790
+ * \param[in,out] pmedata PME module data
791
+ * \param[in,out] forceOutputsNonbonded Force outputs for the non-bonded forces and shift forces
792
+ * \param[in,out] forceOutputsPme Force outputs for the PME forces and virial
793
+ * \param[in,out] enerd Energy data structure results are reduced into
794
+ * \param[in] lambdaQ The Coulomb lambda of the current system state.
795
+ * \param[in] stepWork Step schedule flags
796
+ * \param[in] wcycle The wallcycle structure
797
+ */
798
+ static void alternatePmeNbGpuWaitReduce(nonbonded_verlet_t* nbv,
799
+ gmx_pme_t* pmedata,
800
+ gmx::ForceOutputs* forceOutputsNonbonded,
801
+ gmx::ForceOutputs* forceOutputsPme,
802
+ gmx_enerdata_t* enerd,
803
+ const real lambdaQ,
804
+ const StepWorkload& stepWork,
805
+ gmx_wallcycle* wcycle)
806
+ {
807
+ bool isPmeGpuDone = false;
808
+ bool isNbGpuDone = false;
809
+
810
+ gmx::ArrayRef<const gmx::RVec> pmeGpuForces;
811
+
812
+ while (!isPmeGpuDone || !isNbGpuDone)
813
+ {
814
+ if (!isPmeGpuDone)
815
+ {
816
+ wallcycle_start_nocount(wcycle, WallCycleCounter::PmeGpuMesh);
817
+ GpuTaskCompletion completionType =
818
+ (isNbGpuDone) ? GpuTaskCompletion::Wait : GpuTaskCompletion::Check;
819
+ isPmeGpuDone = pme_gpu_try_finish_task(
820
+ pmedata, stepWork, wcycle, &forceOutputsPme->forceWithVirial(), enerd, lambdaQ, completionType);
821
+ wallcycle_stop(wcycle, WallCycleCounter::PmeGpuMesh);
822
+ }
823
+
824
+ if (!isNbGpuDone)
825
+ {
826
+ auto& forceBuffersNonbonded = forceOutputsNonbonded->forceWithShiftForces();
827
+ GpuTaskCompletion completionType =
828
+ (isPmeGpuDone) ? GpuTaskCompletion::Wait : GpuTaskCompletion::Check;
829
+ // To get the wcycle call count right, when in GpuTaskCompletion::Check mode,
830
+ // we start without counting, and only when the task has finished do we issue a
831
+ // start/stop to increment.
832
+ // In GpuTaskCompletion::Wait mode the timing is expected to be done in the caller.
833
+ wallcycle_start_nocount(wcycle, WallCycleCounter::WaitGpuNbL);
834
+ isNbGpuDone = Nbnxm::gpu_try_finish_task(
835
+ nbv->gpuNbv(),
836
+ stepWork,
837
+ AtomLocality::Local,
838
+ enerd->grpp.energyGroupPairTerms[NonBondedEnergyTerms::LJSR].data(),
839
+ enerd->grpp.energyGroupPairTerms[NonBondedEnergyTerms::CoulombSR].data(),
840
+ forceBuffersNonbonded.shiftForces(),
841
+ completionType);
842
+ wallcycle_stop(wcycle, WallCycleCounter::WaitGpuNbL);
843
+
844
+ if (isNbGpuDone)
845
+ {
846
+ wallcycle_increment_event_count(wcycle, WallCycleCounter::WaitGpuNbL);
847
+ nbv->atomdata_add_nbat_f_to_f(AtomLocality::Local, forceBuffersNonbonded.force());
848
+ }
849
+ }
850
+ }
851
+ }
852
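
Editorial note: the control flow above is an instance of a generic alternating poll-wait pattern. A self-contained sketch of that pattern follows; the tryFinish/reduce callables are placeholders, not GROMACS calls.

    // Editorial sketch of the alternating poll-wait pattern: poll both tasks, and
    // once one of them is known to be done, turn the remaining poll into a blocking
    // wait so its reduction always overlaps with the other task.
    #include <functional>

    enum class Completion
    {
        Check,
        Wait
    };

    void alternateWaitAndReduce(const std::function<bool(Completion)>& tryFinishA,
                                const std::function<void()>&           reduceA,
                                const std::function<bool(Completion)>& tryFinishB,
                                const std::function<void()>&           reduceB)
    {
        bool aDone = false;
        bool bDone = false;
        while (!aDone || !bDone)
        {
            if (!aDone && (aDone = tryFinishA(bDone ? Completion::Wait : Completion::Check)))
            {
                reduceA(); // reduction of A overlaps with B if B is still running
            }
            if (!bDone && (bDone = tryFinishB(aDone ? Completion::Wait : Completion::Check)))
            {
                reduceB(); // reduction of B overlaps with A if A is still running
            }
        }
    }
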
+
853
+ /*! \brief Set up the different force buffers; also does clearing.
854
+ *
855
+ * \param[in] forceHelperBuffers Helper force buffers
856
+ * \param[in] force force array
857
+ * \param[in] domainWork Domain lifetime workload flags
858
+ * \param[in] stepWork Step schedule flags
859
+ * \param[in] havePpDomainDecomposition Whether we have a PP domain decomposition
860
+ * \param[out] wcycle wallcycle recording structure
861
+ *
862
+ * \returns Cleared force output structure
863
+ */
864
+ static ForceOutputs setupForceOutputs(ForceHelperBuffers* forceHelperBuffers,
865
+ gmx::ArrayRefWithPadding<gmx::RVec> force,
866
+ const DomainLifetimeWorkload& domainWork,
867
+ const StepWorkload& stepWork,
868
+ const bool havePpDomainDecomposition,
869
+ gmx_wallcycle* wcycle)
870
+ {
871
+ /* NOTE: We assume fr->shiftForces is all zeros here */
872
+ gmx::ForceWithShiftForces forceWithShiftForces(
873
+ force, stepWork.computeVirial, forceHelperBuffers->shiftForces());
874
+
875
+ if (stepWork.computeForces
876
+ && (domainWork.haveCpuLocalForceWork || !stepWork.useGpuFBufferOps
877
+ || (havePpDomainDecomposition && !stepWork.useGpuFHalo)))
878
+ {
879
+ wallcycle_sub_start(wcycle, WallCycleSubCounter::ClearForceBuffer);
880
+ /* Clear the short- and long-range forces */
881
+ clearRVecs(forceWithShiftForces.force(), true);
882
+
883
+ /* Clear the shift forces */
884
+ clearRVecs(forceWithShiftForces.shiftForces(), false);
885
+ wallcycle_sub_stop(wcycle, WallCycleSubCounter::ClearForceBuffer);
886
+ }
887
+
888
+ /* If we need to compute the virial, we might need a separate
889
+ * force buffer for algorithms for which the virial is calculated
890
+ * directly, such as PME. Otherwise, forceWithVirial uses
891
+ * the same force (f in legacy calls) buffer as other algorithms.
892
+ */
893
+ const bool useSeparateForceWithVirialBuffer =
894
+ (stepWork.computeForces
895
+ && (stepWork.computeVirial && forceHelperBuffers->haveDirectVirialContributions()));
896
+ /* forceWithVirial uses the local atom range only */
897
+ gmx::ForceWithVirial forceWithVirial(
898
+ useSeparateForceWithVirialBuffer ? forceHelperBuffers->forceBufferForDirectVirialContributions()
899
+ : force.unpaddedArrayRef(),
900
+ stepWork.computeVirial);
901
+
902
+ if (useSeparateForceWithVirialBuffer)
903
+ {
904
+ wallcycle_sub_start_nocount(wcycle, WallCycleSubCounter::ClearForceBuffer);
905
+ /* TODO: update comment
906
+ * We only compute forces on local atoms. Note that vsites can
907
+ * spread to non-local atoms, but that part of the buffer is
908
+ * cleared separately in the vsite spreading code.
909
+ */
910
+ clearRVecs(forceWithVirial.force_, true);
911
+ wallcycle_sub_stop(wcycle, WallCycleSubCounter::ClearForceBuffer);
912
+ }
913
+
914
+
915
+ return ForceOutputs(
916
+ forceWithShiftForces, forceHelperBuffers->haveDirectVirialContributions(), forceWithVirial);
917
+ }
918
+
919
+ /*! \brief Launch end-of-step GPU tasks: buffer clearing and rolling pruning.
920
+ *
921
+ */
922
+ static void launchGpuEndOfStepTasks(nonbonded_verlet_t* nbv,
923
+ gmx::ListedForcesGpu* listedForcesGpu,
924
+ gmx_pme_t* pmedata,
925
+ gmx_enerdata_t* enerd,
926
+ const gmx::MdrunScheduleWorkload& runScheduleWork,
927
+ int64_t step,
928
+ gmx_wallcycle* wcycle)
929
+ {
930
+ if (runScheduleWork.simulationWork.useGpuNonbonded && runScheduleWork.stepWork.computeNonbondedForces)
931
+ {
932
+ /* Launch pruning before buffer clearing because the API overhead of the
933
+ * clear kernel launches can leave the GPU idle while it could be running
934
+ * the prune kernel.
935
+ */
936
+ if (nbv->isDynamicPruningStepGpu(step))
937
+ {
938
+ nbv->dispatchPruneKernelGpu(step);
939
+ }
940
+
941
+ /* now clear the GPU outputs while we finish the step on the CPU */
942
+ wallcycle_start_nocount(wcycle, WallCycleCounter::LaunchGpuPp);
943
+ wallcycle_sub_start_nocount(wcycle, WallCycleSubCounter::LaunchGpuNonBonded);
944
+ Nbnxm::gpu_clear_outputs(nbv->gpuNbv(), runScheduleWork.stepWork.computeVirial);
945
+ wallcycle_sub_stop(wcycle, WallCycleSubCounter::LaunchGpuNonBonded);
946
+ wallcycle_stop(wcycle, WallCycleCounter::LaunchGpuPp);
947
+ }
948
+
949
+ if (runScheduleWork.stepWork.haveGpuPmeOnThisRank)
950
+ {
951
+ wallcycle_start_nocount(wcycle, WallCycleCounter::PmeGpuMesh);
952
+ bool gpuGraphWithSeparatePmeRank = false;
953
+ pme_gpu_reinit_computation(pmedata, gpuGraphWithSeparatePmeRank, wcycle);
954
+ wallcycle_stop(wcycle, WallCycleCounter::PmeGpuMesh);
955
+ }
956
+
957
+ if (runScheduleWork.domainWork.haveGpuBondedWork && runScheduleWork.stepWork.computeEnergy)
958
+ {
959
+ // in principle this should be included in the DD balancing region,
960
+ // but generally it is infrequent so we'll omit it for the sake of
961
+ // simpler code
962
+ listedForcesGpu->waitAccumulateEnergyTerms(enerd);
963
+
964
+ listedForcesGpu->clearEnergies();
965
+ }
966
+ }
967
+
968
+ /*! \brief Compute the number of times the "local coordinates ready on device" GPU event will be used as a synchronization point.
969
+ *
970
+ * When some work is offloaded to GPU, force calculation should wait for the atom coordinates to
971
+ * be ready on the device. The coordinates can come either from H2D copy at the beginning of the step,
972
+ * or from the GPU integration at the end of the previous step.
973
+ *
974
+ * In GROMACS, we usually follow the "mark once - wait once" approach. But this event is "consumed"
975
+ * (that is, waited upon either on host or on the device) multiple times, since many tasks
976
+ * in different streams depend on the coordinates.
977
+ *
978
+ * This function returns the number of times the event will be consumed based on this step's workload.
979
+ *
980
+ * \param simulationWork Simulation workload flags.
981
+ * \param stepWork Step workload flags.
982
+ * \param pmeSendCoordinatesFromGpu Whether peer-to-peer communication is used for PME coordinates.
983
+ * \return The number of times the event will be consumed on this step.
984
+ */
985
+ static int getExpectedLocalXReadyOnDeviceConsumptionCount(gmx_used_in_debug const SimulationWorkload& simulationWork,
986
+ const StepWorkload& stepWork,
987
+ bool pmeSendCoordinatesFromGpu)
988
+ {
989
+ int result = 0;
990
+ if (stepWork.computeSlowForces)
991
+ {
992
+ if (pmeSendCoordinatesFromGpu)
993
+ {
994
+ GMX_ASSERT(simulationWork.haveSeparatePmeRank,
995
+ "GPU PME PP communications require having a separate PME rank");
996
+ // Event is consumed by gmx_pme_send_coordinates for GPU PME PP Communications
997
+ result++;
998
+ }
999
+ if (stepWork.haveGpuPmeOnThisRank)
1000
+ {
1001
+ // Event is consumed by launchPmeGpuSpread
1002
+ result++;
1003
+ }
1004
+ if (stepWork.computeNonbondedForces && stepWork.useGpuXBufferOps)
1005
+ {
1006
+ // Event is consumed by convertCoordinatesGpu
1007
+ result++;
1008
+ }
1009
+ }
1010
+ if (stepWork.useGpuXHalo)
1011
+ {
1012
+ // Event is consumed by communicateGpuHaloCoordinates
1013
+ result++;
1014
+ if (GMX_THREAD_MPI) // Issue #4262
1015
+ {
1016
+ result++;
1017
+ }
1018
+ }
1019
+ if (stepWork.clearGpuFBufferEarly && simulationWork.useGpuUpdate)
1020
+ {
1021
+ // Event is consumed by force clearing which waits for the update to complete
1022
+ result++;
1023
+ }
1024
+ return result;
1025
+ }
1026
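
Editorial note: the returned count feeds an event that deviates from the usual "mark once - wait once" rule. The toy wrapper below illustrates the bookkeeping such a count enables; it is a generic sketch, not the GpuEventSynchronizer API.

    // Editorial sketch (not GROMACS API): a counted synchronization point where the
    // producer marks once per step and each dependent task consumes once; the
    // expected count must match what the function above computed for this step.
    #include <cassert>

    class CountedEvent
    {
    public:
        void setExpectedConsumptionCount(int n)
        {
            expected_ = n;
            consumed_ = 0;
            marked_   = false;
        }
        void mark() { marked_ = true; }
        void consume()
        {
            // A consumer may only wait on an event that was marked and not over-consumed.
            assert(marked_ && consumed_ < expected_);
            ++consumed_;
        }

    private:
        int  expected_ = 1;
        int  consumed_ = 0;
        bool marked_   = false;
    };
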
+
1027
+ /*! \brief Compute the number of times the "local forces ready on device" GPU event will be used as a synchronization point.
1028
+ *
1029
+ * In GROMACS, we usually follow the "mark once - wait once" approach. But this event is "consumed"
1030
+ * (that is, waited upon either on host or on the device) multiple times, since many tasks
1031
+ * in different streams depend on the local forces.
1032
+ *
1033
+ * \param simulationWork Simulation workload flags.
1034
+ * \param domainWork Domain workload flags.
1035
+ * \param stepWork Step workload flags.
1036
+ * \param useOrEmulateGpuNb Whether GPU non-bonded calculations are used or emulated.
1037
+ * \param alternateGpuWait Whether alternating wait/reduce scheme is used.
1038
+ * \return The number of times the event will be consumed based on this step's workload.
1039
+ */
1040
+ static int getExpectedLocalFReadyOnDeviceConsumptionCount(const SimulationWorkload& simulationWork,
1041
+ const DomainLifetimeWorkload& domainWork,
1042
+ const StepWorkload& stepWork,
1043
+ bool useOrEmulateGpuNb,
1044
+ bool alternateGpuWait)
1045
+ {
1046
+ int counter = 0;
1047
+ bool eventUsedInGpuForceReduction =
1048
+ (domainWork.haveCpuLocalForceWork
1049
+ || (simulationWork.havePpDomainDecomposition && !simulationWork.useGpuHaloExchange));
1050
+ bool gpuForceReductionUsed = useOrEmulateGpuNb && !alternateGpuWait && stepWork.useGpuFBufferOps
1051
+ && stepWork.computeNonbondedForces;
1052
+ if (gpuForceReductionUsed && eventUsedInGpuForceReduction)
1053
+ {
1054
+ counter++;
1055
+ }
1056
+ bool gpuForceHaloUsed = simulationWork.havePpDomainDecomposition && stepWork.computeForces
1057
+ && stepWork.useGpuFHalo;
1058
+ if (gpuForceHaloUsed)
1059
+ {
1060
+ counter++;
1061
+ }
1062
+ return counter;
1063
+ }
1064
+
1065
+ //! \brief Data structure to hold dipole-related data and staging arrays
1066
+ struct DipoleData
1067
+ {
1068
+ //! Dipole staging for fast summing over MPI
1069
+ gmx::DVec muStaging[2] = { { 0.0, 0.0, 0.0 } };
1070
+ //! Dipoles for states A and B (index 0 and 1, respectively)
1071
+ gmx::RVec muStateAB[2] = { { 0.0_real, 0.0_real, 0.0_real } };
1072
+ };
1073
+
1074
+
1075
+ static void reduceAndUpdateMuTot(DipoleData* dipoleData,
1076
+ const t_commrec* cr,
1077
+ const bool haveFreeEnergy,
1078
+ gmx::ArrayRef<const real> lambda,
1079
+ rvec muTotal,
1080
+ const DDBalanceRegionHandler& ddBalanceRegionHandler)
1081
+ {
1082
+ if (PAR(cr))
1083
+ {
1084
+ gmx_sumd(2 * DIM, dipoleData->muStaging[0], cr);
1085
+ ddBalanceRegionHandler.reopenRegionCpu();
1086
+ }
1087
+ for (int i = 0; i < 2; i++)
1088
+ {
1089
+ for (int j = 0; j < DIM; j++)
1090
+ {
1091
+ dipoleData->muStateAB[i][j] = dipoleData->muStaging[i][j];
1092
+ }
1093
+ }
1094
+
1095
+ if (!haveFreeEnergy)
1096
+ {
1097
+ copy_rvec(dipoleData->muStateAB[0], muTotal);
1098
+ }
1099
+ else
1100
+ {
1101
+ for (int j = 0; j < DIM; j++)
1102
+ {
1103
+ muTotal[j] = (1.0 - lambda[static_cast<int>(FreeEnergyPerturbationCouplingType::Coul)])
1104
+ * dipoleData->muStateAB[0][j]
1105
+ + lambda[static_cast<int>(FreeEnergyPerturbationCouplingType::Coul)]
1106
+ * dipoleData->muStateAB[1][j];
1107
+ }
1108
+ }
1109
+ }
1110
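
Editorial note: with free-energy perturbation active, the branch above interpolates the state A and state B dipoles with the Coulomb coupling parameter, i.e.

\[ \boldsymbol{\mu}_{\mathrm{total}} \;=\; (1-\lambda_{\mathrm{coul}})\,\boldsymbol{\mu}_A \;+\; \lambda_{\mathrm{coul}}\,\boldsymbol{\mu}_B . \]
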
+
1111
+ /*! \brief Combines MTS level0 and level1 force buffers into a full and MTS-combined force buffer.
1112
+ *
1113
+ * \param[in] numAtoms The number of atoms to combine forces for
1114
+ * \param[in,out] forceMtsLevel0 Input: F_level0, output: F_level0 + F_level1
1115
+ * \param[in,out] forceMts Input: F_level1, output: F_level0 + mtsFactor * F_level1
1116
+ * \param[in] mtsFactor The factor between the level0 and level1 time step
1117
+ */
1118
+ static void combineMtsForces(const int numAtoms,
1119
+ ArrayRef<RVec> forceMtsLevel0,
1120
+ ArrayRef<RVec> forceMts,
1121
+ const real mtsFactor)
1122
+ {
1123
+ const int gmx_unused numThreads = gmx_omp_nthreads_get(ModuleMultiThread::Default);
1124
+ #pragma omp parallel for num_threads(numThreads) schedule(static)
1125
+ for (int i = 0; i < numAtoms; i++)
1126
+ {
1127
+ const RVec forceMtsLevel0Tmp = forceMtsLevel0[i];
1128
+ forceMtsLevel0[i] += forceMts[i];
1129
+ forceMts[i] = forceMtsLevel0Tmp + mtsFactor * forceMts[i];
1130
+ }
1131
+ }
1132
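
Editorial note: writing the fast and slow contributions as F^(0) and F^(1) and the level0/level1 step ratio as m (mtsFactor), the loop above produces

\[ \mathbf{F}^{\mathrm{level0}}_i = \mathbf{F}^{(0)}_i + \mathbf{F}^{(1)}_i, \qquad \mathbf{F}^{\mathrm{MTS}}_i = \mathbf{F}^{(0)}_i + m\,\mathbf{F}^{(1)}_i , \]

so that applying the MTS-combined force once every m fast steps imparts the same impulse as applying F^(1) on every fast step.
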
+
1133
+ /*! \brief Setup for the local GPU force reduction:
1134
+ * reinitialization plus the registration of forces and dependencies.
1135
+ *
1136
+ * \param [in] runScheduleWork Schedule workload flag structure
1137
+ * \param [in] nbv Non-bonded Verlet object
1138
+ * \param [in] stateGpu GPU state propagator object
1139
+ * \param [in] gpuForceReduction GPU force reduction object
1140
+ * \param [in] pmePpCommGpu PME-PP GPU communication object
1141
+ * \param [in] pmedata PME data object
1142
+ * \param [in] dd Domain decomposition object
1143
+ */
1144
+ static void setupLocalGpuForceReduction(const gmx::MdrunScheduleWorkload& runScheduleWork,
1145
+ nonbonded_verlet_t* nbv,
1146
+ gmx::StatePropagatorDataGpu* stateGpu,
1147
+ gmx::GpuForceReduction* gpuForceReduction,
1148
+ gmx::PmePpCommGpu* pmePpCommGpu,
1149
+ const gmx_pme_t* pmedata,
1150
+ const gmx_domdec_t* dd)
1151
+ {
1152
+ GMX_ASSERT(!runScheduleWork.simulationWork.useMts,
1153
+ "GPU force reduction is not compatible with MTS");
1154
+
1155
+ // (re-)initialize local GPU force reduction
1156
+ const bool accumulate = runScheduleWork.domainWork.haveCpuLocalForceWork
1157
+ || runScheduleWork.simulationWork.havePpDomainDecomposition;
1158
+ const int atomStart = 0;
1159
+ gpuForceReduction->reinit(stateGpu->getForces(),
1160
+ nbv->getNumAtoms(AtomLocality::Local),
1161
+ nbv->getGridIndices(),
1162
+ atomStart,
1163
+ accumulate,
1164
+ stateGpu->fReducedOnDevice(AtomLocality::Local));
1165
+
1166
+ // register forces and add dependencies
1167
+ gpuForceReduction->registerNbnxmForce(Nbnxm::gpu_get_f(nbv->gpuNbv()));
1168
+
1169
+ DeviceBuffer<gmx::RVec> pmeForcePtr;
1170
+ GpuEventSynchronizer* pmeSynchronizer = nullptr;
1171
+ bool havePmeContribution = false;
1172
+
1173
+ if (runScheduleWork.simulationWork.haveGpuPmeOnPpRank())
1174
+ {
1175
+ pmeForcePtr = pme_gpu_get_device_f(pmedata);
1176
+ if (pmeForcePtr)
1177
+ {
1178
+ pmeSynchronizer = pme_gpu_get_f_ready_synchronizer(pmedata);
1179
+ havePmeContribution = true;
1180
+ }
1181
+ }
1182
+ else if (runScheduleWork.simulationWork.useGpuPmePpCommunication)
1183
+ {
1184
+ pmeForcePtr = pmePpCommGpu->getGpuForceStagingPtr();
1185
+ GMX_ASSERT(pmeForcePtr, "PME force for reduction has no data");
1186
+ if (GMX_THREAD_MPI)
1187
+ {
1188
+ pmeSynchronizer = pmePpCommGpu->getForcesReadySynchronizer();
1189
+ }
1190
+ havePmeContribution = true;
1191
+ }
1192
+
1193
+ if (havePmeContribution)
1194
+ {
1195
+ gpuForceReduction->registerRvecForce(pmeForcePtr);
1196
+ if (runScheduleWork.simulationWork.useNvshmem)
1197
+ {
1198
+ DeviceBuffer<uint64_t> forcesReadyNvshmemFlags = pmePpCommGpu->getGpuForcesSyncObj();
1199
+ gpuForceReduction->registerForcesReadyNvshmemFlags(forcesReadyNvshmemFlags);
1200
+ }
1201
+
1202
+ if (!runScheduleWork.simulationWork.useGpuPmePpCommunication || GMX_THREAD_MPI)
1203
+ {
1204
+ GMX_ASSERT(pmeSynchronizer != nullptr, "PME force ready cuda event should not be NULL");
1205
+ gpuForceReduction->addDependency(pmeSynchronizer);
1206
+ }
1207
+ }
1208
+
1209
+ if (runScheduleWork.domainWork.haveCpuLocalForceWork
1210
+ || (runScheduleWork.simulationWork.havePpDomainDecomposition
1211
+ && !runScheduleWork.simulationWork.useGpuHaloExchange))
1212
+ {
1213
+ gpuForceReduction->addDependency(stateGpu->fReadyOnDevice(AtomLocality::Local));
1214
+ }
1215
+
1216
+ if (runScheduleWork.simulationWork.useGpuHaloExchange)
1217
+ {
1218
+ gpuForceReduction->addDependency(dd->gpuHaloExchange[0][0]->getForcesReadyOnDeviceEvent());
1219
+ }
1220
+ }
1221
+
1222
+ /*! \brief Setup for the non-local GPU force reduction:
1223
+ * reinitialization plus the registration of forces and dependencies.
1224
+ *
1225
+ * \param [in] runScheduleWork Schedule workload flag structure
1226
+ * \param [in] nbv Non-bonded Verlet object
1227
+ * \param [in] stateGpu GPU state propagator object
1228
+ * \param [in] gpuForceReduction GPU force reduction object
1229
+ * \param [in] dd Domain decomposition object
1230
+ */
1231
+ static void setupNonLocalGpuForceReduction(const gmx::MdrunScheduleWorkload& runScheduleWork,
1232
+ nonbonded_verlet_t* nbv,
1233
+ gmx::StatePropagatorDataGpu* stateGpu,
1234
+ gmx::GpuForceReduction* gpuForceReduction,
1235
+ const gmx_domdec_t* dd)
1236
+ {
1237
+ // (re-)initialize non-local GPU force reduction
1238
+ const bool accumulate = runScheduleWork.domainWork.haveCpuNonLocalForceWork;
1239
+ const int atomStart = dd_numHomeAtoms(*dd);
1240
+ gpuForceReduction->reinit(stateGpu->getForces(),
1241
+ nbv->getNumAtoms(AtomLocality::NonLocal),
1242
+ nbv->getGridIndices(),
1243
+ atomStart,
1244
+ accumulate,
1245
+ stateGpu->fReducedOnDevice(AtomLocality::NonLocal));
1246
+
1247
+ // register forces and add dependencies
1248
+ gpuForceReduction->registerNbnxmForce(Nbnxm::gpu_get_f(nbv->gpuNbv()));
1249
+
1250
+ if (runScheduleWork.domainWork.haveCpuNonLocalForceWork)
1251
+ {
1252
+ gpuForceReduction->addDependency(stateGpu->fReadyOnDevice(AtomLocality::NonLocal));
1253
+ }
1254
+ }
1255
+
1256
+
1257
+ /*! \brief Return the number of local atoms.
1258
+ */
1259
+ static int getLocalAtomCount(const gmx_domdec_t* dd, const t_mdatoms& mdatoms, bool havePPDomainDecomposition)
1260
+ {
1261
+ GMX_ASSERT(!(havePPDomainDecomposition && (dd == nullptr)),
1262
+ "Can't have PP decomposition with dd uninitialized!");
1263
+ return havePPDomainDecomposition ? dd_numAtomsZones(*dd) : mdatoms.homenr;
1264
+ }
1265
+
1266
+ /*! \brief Does pair search and closely related activities required on search steps.
1267
+ */
1268
+ static void doPairSearch(const t_commrec* cr,
1269
+ const t_inputrec& inputrec,
1270
+ const gmx::MDModulesNotifiers& mdModulesNotifiers,
1271
+ int64_t step,
1272
+ t_nrnb* nrnb,
1273
+ gmx_wallcycle* wcycle,
1274
+ const gmx_localtop_t& top,
1275
+ const matrix box,
1276
+ gmx::ArrayRefWithPadding<gmx::RVec> x,
1277
+ gmx::ArrayRef<gmx::RVec> v,
1278
+ const t_mdatoms& mdatoms,
1279
+ t_forcerec* fr,
1280
+ const gmx::MdrunScheduleWorkload& runScheduleWork)
1281
+ {
1282
+ nonbonded_verlet_t* nbv = fr->nbv.get();
1283
+
1284
+ gmx::StatePropagatorDataGpu* stateGpu = fr->stateGpu;
1285
+
1286
+ const SimulationWorkload& simulationWork = runScheduleWork.simulationWork;
1287
+ const StepWorkload& stepWork = runScheduleWork.stepWork;
1288
+
1289
+ if (gmx::needStateGpu(simulationWork))
1290
+ {
1291
+ // TODO refactor this to do_md, after partitioning.
1292
+ stateGpu->reinit(mdatoms.homenr,
1293
+ getLocalAtomCount(cr->dd, mdatoms, simulationWork.havePpDomainDecomposition));
1294
+ }
1295
+
1296
+ if (simulationWork.haveGpuPmeOnPpRank())
1297
+ {
1298
+ GMX_ASSERT(gmx::needStateGpu(simulationWork), "StatePropagatorDataGpu is needed");
1299
+ // TODO: This should be moved into PME setup function ( pme_gpu_prepare_computation(...) )
1300
+ pme_gpu_set_device_x(fr->pmedata, stateGpu->getCoordinates());
1301
+ }
1302
+
1303
+ if (fr->pbcType != PbcType::No)
1304
+ {
1305
+ const bool calcCGCM = (stepWork.stateChanged && !haveDDAtomOrdering(*cr));
1306
+ if (calcCGCM)
1307
+ {
1308
+ put_atoms_in_box_omp(fr->pbcType,
1309
+ box,
1310
+ fr->haveBoxDeformation,
1311
+ inputrec.deform,
1312
+ x.unpaddedArrayRef().subArray(0, mdatoms.homenr),
1313
+ v.empty() ? ArrayRef<RVec>() : v.subArray(0, mdatoms.homenr),
1314
+ gmx_omp_nthreads_get(ModuleMultiThread::Default));
1315
+ inc_nrnb(nrnb, eNR_SHIFTX, mdatoms.homenr);
1316
+ }
1317
+
1318
+ if (!haveDDAtomOrdering(*cr))
1319
+ {
1320
+ // Atoms might have changed periodic image; signal MDModules
1321
+ gmx::MDModulesAtomsRedistributedSignal mdModulesAtomsRedistributedSignal(
1322
+ box, x.unpaddedArrayRef().subArray(0, mdatoms.homenr));
1323
+ mdModulesNotifiers.simulationSetupNotifier_.notify(mdModulesAtomsRedistributedSignal);
1324
+ }
1325
+ }
1326
+
1327
+ if (fr->wholeMoleculeTransform && stepWork.stateChanged)
1328
+ {
1329
+ fr->wholeMoleculeTransform->updateForAtomPbcJumps(x.unpaddedArrayRef(), box);
1330
+ }
1331
+
1332
+ wallcycle_start(wcycle, WallCycleCounter::NS);
1333
+ if (!haveDDAtomOrdering(*cr))
1334
+ {
1335
+ const rvec vzero = { 0.0_real, 0.0_real, 0.0_real };
1336
+ const rvec boxDiagonal = { box[XX][XX], box[YY][YY], box[ZZ][ZZ] };
1337
+ wallcycle_sub_start(wcycle, WallCycleSubCounter::NBSGridLocal);
1338
+ nbv->putAtomsOnGrid(
1339
+ box, 0, vzero, boxDiagonal, nullptr, { 0, mdatoms.homenr }, -1, fr->atomInfo, x.unpaddedArrayRef(), 0, nullptr);
1340
+ wallcycle_sub_stop(wcycle, WallCycleSubCounter::NBSGridLocal);
1341
+ }
1342
+ else
1343
+ {
1344
+ wallcycle_sub_start(wcycle, WallCycleSubCounter::NBSGridNonLocal);
1345
+ nbnxn_put_on_grid_nonlocal(nbv, domdec_zones(cr->dd), fr->atomInfo, x.unpaddedArrayRef());
1346
+ wallcycle_sub_stop(wcycle, WallCycleSubCounter::NBSGridNonLocal);
1347
+ }
1348
+
1349
+ nbv->setAtomProperties(mdatoms.typeA, mdatoms.chargeA, fr->atomInfo);
1350
+
1351
+ wallcycle_stop(wcycle, WallCycleCounter::NS);
1352
+
1353
+ /* initialize the GPU nbnxm atom data and bonded data structures */
1354
+ if (simulationWork.useGpuNonbonded)
1355
+ {
1356
+ // Note: cycle counting only covers non-bondeds; GPU listed forces count internally
1357
+ wallcycle_start_nocount(wcycle, WallCycleCounter::LaunchGpuPp);
1358
+ wallcycle_sub_start_nocount(wcycle, WallCycleSubCounter::LaunchGpuNonBonded);
1359
+ Nbnxm::gpu_init_atomdata(nbv->gpuNbv(), &nbv->nbat());
1360
+ wallcycle_sub_stop(wcycle, WallCycleSubCounter::LaunchGpuNonBonded);
1361
+ wallcycle_stop(wcycle, WallCycleCounter::LaunchGpuPp);
1362
+
1363
+ if (fr->listedForcesGpu)
1364
+ {
1365
+ /* Now that all atoms are on the grid, we can assign bonded
1366
+ * interactions to the GPU, where the grid order is
1367
+ * needed. Also the xq, f and fshift device buffers have
1368
+ * been reallocated if needed, so the bonded code can
1369
+ * learn about them. */
1370
+ // TODO the xq, f, and fshift buffers are now shared
1371
+ // resources, so they should be maintained by a
1372
+ // higher-level object than the nb module.
1373
+ fr->listedForcesGpu->updateInteractionListsAndDeviceBuffers(
1374
+ nbv->getGridIndices(), top.idef, Nbnxm::gpuGetNBAtomData(nbv->gpuNbv()));
1375
+ }
1376
+ }
1377
+
1378
+ wallcycle_start_nocount(wcycle, WallCycleCounter::NS);
1379
+ wallcycle_sub_start(wcycle, WallCycleSubCounter::NBSSearchLocal);
1380
+ /* Note that with a GPU the launch overhead of the list transfer is not timed separately */
1381
+ nbv->constructPairlist(InteractionLocality::Local, top.excls, step, nrnb);
1382
+
1383
+ nbv->setupGpuShortRangeWork(fr->listedForcesGpu.get(), InteractionLocality::Local);
1384
+
1385
+ wallcycle_sub_stop(wcycle, WallCycleSubCounter::NBSSearchLocal);
1386
+ wallcycle_stop(wcycle, WallCycleCounter::NS);
1387
+
1388
+ if (simulationWork.useGpuXBufferOpsWhenAllowed)
1389
+ {
1390
+ nbv->atomdata_init_copy_x_to_nbat_x_gpu();
1391
+ }
1392
+
1393
+ if (simulationWork.useGpuFBufferOpsWhenAllowed)
1394
+ {
1395
+ // with MPI, direct GPU communication, and separate PME ranks we need
1396
+ // gmx_pme_send_coordinates() to be called before we can set up force reduction
1397
+ bool delaySetupLocalGpuForceReduction = GMX_MPI && simulationWork.useGpuPmePpCommunication;
1398
+ if (!delaySetupLocalGpuForceReduction)
1399
+ {
1400
+ setupLocalGpuForceReduction(runScheduleWork,
1401
+ nbv,
1402
+ stateGpu,
1403
+ fr->gpuForceReduction[gmx::AtomLocality::Local].get(),
1404
+ fr->pmePpCommGpu.get(),
1405
+ fr->pmedata,
1406
+ cr->dd);
1407
+ }
1408
+
1409
+ if (simulationWork.havePpDomainDecomposition)
1410
+ {
1411
+ setupNonLocalGpuForceReduction(runScheduleWork,
1412
+ nbv,
1413
+ stateGpu,
1414
+ fr->gpuForceReduction[gmx::AtomLocality::NonLocal].get(),
1415
+ cr->dd);
1416
+ }
1417
+ }
1418
+
1419
+ /* do non-local pair search */
1420
+ if (simulationWork.havePpDomainDecomposition)
1421
+ {
1422
+ wallcycle_start_nocount(wcycle, WallCycleCounter::NS);
1423
+ wallcycle_sub_start(wcycle, WallCycleSubCounter::NBSSearchNonLocal);
1424
+ /* Note that with a GPU the launch overhead of the list transfer is not timed separately */
1425
+ nbv->constructPairlist(InteractionLocality::NonLocal, top.excls, step, nrnb);
1426
+
1427
+ nbv->setupGpuShortRangeWork(fr->listedForcesGpu.get(), InteractionLocality::NonLocal);
1428
+ wallcycle_sub_stop(wcycle, WallCycleSubCounter::NBSSearchNonLocal);
1429
+ wallcycle_stop(wcycle, WallCycleCounter::NS);
1430
+ // TODO refactor this GPU halo exchange re-initialisation
1431
+ // to location in do_md where GPU halo exchange is
1432
+ // constructed at partitioning, after above stateGpu
1433
+ // re-initialization has similarly been refactored
1434
+ if (simulationWork.useGpuHaloExchange)
1435
+ {
1436
+ reinitGpuHaloExchange(*cr, stateGpu->getCoordinates(), stateGpu->getForces());
1437
+ }
1438
+ }
1439
+
1440
+ // With FEP we set up the reduction over threads for local+non-local simultaneously,
1441
+ // so we need to do that here after the local and non-local pairlist construction.
1442
+ if (fr->efep != FreeEnergyPerturbationType::No)
1443
+ {
1444
+ wallcycle_sub_start(wcycle, WallCycleSubCounter::NonbondedFep);
1445
+ nbv->setupFepThreadedForceBuffer(fr->natoms_force_constr);
1446
+ wallcycle_sub_stop(wcycle, WallCycleSubCounter::NonbondedFep);
1447
+ }
1448
+ }
1449
+
1450
+ void do_force(FILE* fplog,
1451
+ const t_commrec* cr,
1452
+ const gmx_multisim_t* ms,
1453
+ const t_inputrec& inputrec,
1454
+ const gmx::MDModulesNotifiers& mdModulesNotifiers,
1455
+ gmx::Awh* awh,
1456
+ gmx_enfrot* enforcedRotation,
1457
+ gmx::ImdSession* imdSession,
1458
+ pull_t* pull_work,
1459
+ int64_t step,
1460
+ t_nrnb* nrnb,
1461
+ gmx_wallcycle* wcycle,
1462
+ const gmx_localtop_t* top,
1463
+ const matrix box,
1464
+ gmx::ArrayRefWithPadding<gmx::RVec> x,
1465
+ gmx::ArrayRef<gmx::RVec> v,
1466
+ const history_t* hist,
1467
+ gmx::ForceBuffersView* forceView,
1468
+ tensor vir_force,
1469
+ const t_mdatoms* mdatoms,
1470
+ gmx_enerdata_t* enerd,
1471
+ gmx::ArrayRef<const real> lambda,
1472
+ t_forcerec* fr,
1473
+ const gmx::MdrunScheduleWorkload& runScheduleWork,
1474
+ gmx::VirtualSitesHandler* vsite,
1475
+ rvec muTotal,
1476
+ double t,
1477
+ gmx_edsam* ed,
1478
+ CpuPpLongRangeNonbondeds* longRangeNonbondeds,
1479
+ const DDBalanceRegionHandler& ddBalanceRegionHandler)
1480
+ {
1481
+ auto force = forceView->forceWithPadding();
1482
+ GMX_ASSERT(force.unpaddedArrayRef().ssize() >= fr->natoms_force_constr,
1483
+ "The size of the force buffer should be at least the number of atoms to compute "
1484
+ "forces for");
1485
+
1486
+ nonbonded_verlet_t* nbv = fr->nbv.get();
1487
+ interaction_const_t* ic = fr->ic.get();
1488
+
1489
+ gmx::StatePropagatorDataGpu* stateGpu = fr->stateGpu;
1490
+
1491
+ const SimulationWorkload& simulationWork = runScheduleWork.simulationWork;
1492
+
1493
+ const gmx::DomainLifetimeWorkload& domainWork = runScheduleWork.domainWork;
1494
+
1495
+ const StepWorkload& stepWork = runScheduleWork.stepWork;
1496
+
1497
+ if (stepWork.doNeighborSearch)
1498
+ {
1499
+ doPairSearch(cr, inputrec, mdModulesNotifiers, step, nrnb, wcycle, *top, box, x, v, *mdatoms, fr, runScheduleWork);
1500
+
1501
+ /* At a search step we need to start the first balancing region
1502
+ * somewhere early inside the step after communication during domain
1503
+ * decomposition (and not during the previous step as usual).
1504
+ */
1505
+ ddBalanceRegionHandler.openBeforeForceComputationCpu(DdAllowBalanceRegionReopen::yes);
1506
+ }
1507
+
1508
+ const bool pmeSendCoordinatesFromGpu =
1509
+ simulationWork.useGpuPmePpCommunication && !stepWork.doNeighborSearch;
1510
+ auto* localXReadyOnDevice = (stepWork.haveGpuPmeOnThisRank || stepWork.useGpuXBufferOps
1511
+ || simulationWork.useGpuUpdate || pmeSendCoordinatesFromGpu)
1512
+ ? stateGpu->getCoordinatesReadyOnDeviceEvent(
1513
+ AtomLocality::Local, simulationWork, stepWork)
1514
+ : nullptr;
1515
+
1516
+ if (stepWork.clearGpuFBufferEarly)
1517
+ {
1518
+ // GPU Force halo exchange will set a subset of local atoms with remote non-local data.
1519
+ // First clear local portion of force array, so that untouched atoms are zero.
1520
+ // The dependency for this is that forces from previous timestep have been consumed,
1521
+ // which is satisfied when localXReadyOnDevice has been marked for GPU update case.
1522
+ // For CPU update, the forces are consumed by the beginning of the step, so no extra sync needed.
1523
+ GpuEventSynchronizer* dependency = simulationWork.useGpuUpdate ? localXReadyOnDevice : nullptr;
1524
+ stateGpu->clearForcesOnGpu(AtomLocality::Local, dependency);
1525
+ }
1526
+
1527
+ clear_mat(vir_force);
1528
+
1529
+ if (fr->pbcType != PbcType::No)
1530
+ {
1531
+ /* Compute shift vectors every step,
1532
+ * because of pressure coupling or box deformation!
1533
+ */
1534
+ if (stepWork.haveDynamicBox && stepWork.stateChanged)
1535
+ {
1536
+ calc_shifts(box, fr->shift_vec);
1537
+ }
1538
+ }
1539
+ nbnxn_atomdata_copy_shiftvec(stepWork.haveDynamicBox, fr->shift_vec, &nbv->nbat());
1540
+
1541
+
1542
+ GMX_ASSERT(simulationWork.useGpuHaloExchange
1543
+ == ((cr->dd != nullptr) && (!cr->dd->gpuHaloExchange[0].empty())),
1544
+ "The GPU halo exchange is active, but it has not been constructed.");
1545
+
1546
+ bool gmx_used_in_debug haveCopiedXFromGpu = false;
1547
+ // Copy coordinate from the GPU if update is on the GPU and there
1548
+ // are forces to be computed on the CPU, or for the computation of
1549
+ // virial, or if host-side data will be transferred from this task
1550
+ // to a remote task for halo exchange or PME-PP communication. At
1551
+ // search steps the current coordinates are already on the host,
1552
+ // hence copy is not needed.
1553
+ if (simulationWork.useGpuUpdate && !stepWork.doNeighborSearch
1554
+ && (runScheduleWork.domainWork.haveCpuLocalForceWork || stepWork.computeVirial
1555
+ || simulationWork.useCpuPmePpCommunication || simulationWork.useCpuHaloExchange
1556
+ || simulationWork.computeMuTot))
1557
+ {
1558
+ stateGpu->copyCoordinatesFromGpu(x.unpaddedArrayRef(), AtomLocality::Local);
1559
+ haveCopiedXFromGpu = true;
1560
+ }
1561
+
1562
+ // Coordinates on the device are needed if PME or BufferOps are offloaded.
1563
+ // The local coordinates can be copied right away.
1564
+ // NOTE: Consider moving this copy to right after they are updated and constrained,
1565
+ // if the latter is not offloaded.
1566
+ if (stepWork.haveGpuPmeOnThisRank || stepWork.useGpuXBufferOps || pmeSendCoordinatesFromGpu)
1567
+ {
1568
+ GMX_ASSERT(stateGpu != nullptr, "stateGpu should not be null");
1569
+ const int expectedLocalXReadyOnDeviceConsumptionCount =
1570
+ getExpectedLocalXReadyOnDeviceConsumptionCount(
1571
+ simulationWork, stepWork, pmeSendCoordinatesFromGpu);
1572
+
1573
+ // We need to copy coordinates when:
1574
+ // 1. Update is not offloaded
1575
+ // 2. The buffers were reinitialized on search step
1576
+ if (!simulationWork.useGpuUpdate || stepWork.doNeighborSearch)
1577
+ {
1578
+ stateGpu->copyCoordinatesToGpu(x.unpaddedArrayRef(),
1579
+ AtomLocality::Local,
1580
+ expectedLocalXReadyOnDeviceConsumptionCount);
1581
+ }
1582
+ else if (simulationWork.useGpuUpdate)
1583
+ {
1584
+ stateGpu->setXUpdatedOnDeviceEventExpectedConsumptionCount(
1585
+ expectedLocalXReadyOnDeviceConsumptionCount);
1586
+ }
1587
+ }
1588
+
1589
+ if (stepWork.computePmeOnSeparateRank)
1590
+ {
1591
+ /* Send particle coordinates to the pme nodes */
1592
+ if (!pmeSendCoordinatesFromGpu && !stepWork.doNeighborSearch && simulationWork.useGpuUpdate)
1593
+ {
1594
+ GMX_ASSERT(haveCopiedXFromGpu,
1595
+ "a wait should only be triggered if copy has been scheduled");
1596
+ stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
1597
+ }
1598
+
1599
+ const bool reinitGpuPmePpComms =
1600
+ simulationWork.useGpuPmePpCommunication && stepWork.doNeighborSearch;
1601
+ gmx_pme_send_coordinates(fr,
1602
+ cr,
1603
+ box,
1604
+ x.unpaddedArrayRef(),
1605
+ lambda[static_cast<int>(FreeEnergyPerturbationCouplingType::Coul)],
1606
+ lambda[static_cast<int>(FreeEnergyPerturbationCouplingType::Vdw)],
1607
+ (stepWork.computeVirial || stepWork.computeEnergy),
1608
+ step,
1609
+ simulationWork.useGpuPmePpCommunication,
1610
+ reinitGpuPmePpComms,
1611
+ pmeSendCoordinatesFromGpu,
1612
+ stepWork.useGpuPmeFReduction,
1613
+ pmeSendCoordinatesFromGpu ? localXReadyOnDevice : nullptr,
1614
+ simulationWork.useMdGpuGraph,
1615
+ wcycle);
1616
+ }
1617
+
1618
+ if (simulationWork.useGpuFBufferOpsWhenAllowed && stepWork.doNeighborSearch)
1619
+ {
1620
+ // with MPI, direct GPU communication, and separate PME ranks we need
1621
+ // gmx_pme_send_coordinates() to be called before we can set up force reduction
1622
+ bool doSetupLocalGpuForceReduction = GMX_MPI && simulationWork.useGpuPmePpCommunication;
1623
+ if (doSetupLocalGpuForceReduction)
1624
+ {
1625
+ setupLocalGpuForceReduction(runScheduleWork,
1626
+ fr->nbv.get(),
1627
+ stateGpu,
1628
+ fr->gpuForceReduction[gmx::AtomLocality::Local].get(),
1629
+ fr->pmePpCommGpu.get(),
1630
+ fr->pmedata,
1631
+ cr->dd);
1632
+ }
1633
+ }
1634
+
1635
+ if (stepWork.haveGpuPmeOnThisRank)
1636
+ {
1637
+ launchPmeGpuSpread(fr->pmedata,
1638
+ box,
1639
+ stepWork,
1640
+ localXReadyOnDevice,
1641
+ lambda[static_cast<int>(FreeEnergyPerturbationCouplingType::Coul)],
1642
+ simulationWork.useMdGpuGraph,
1643
+ wcycle);
1644
+ }
1645
+
1646
+
1647
+ if (!stepWork.doNeighborSearch && !EI_TPI(inputrec.eI) && stepWork.computeNonbondedForces)
1648
+ {
1649
+ if (stepWork.useGpuXBufferOps)
1650
+ {
1651
+ GMX_ASSERT(stateGpu, "stateGpu should be valid when buffer ops are offloaded");
1652
+ nbv->convertCoordinatesGpu(AtomLocality::Local, stateGpu->getCoordinates(), localXReadyOnDevice);
1653
+ }
1654
+ else
1655
+ {
1656
+ if (simulationWork.useGpuUpdate)
1657
+ {
1658
+ GMX_ASSERT(stateGpu, "need a valid stateGpu object");
1659
+ GMX_ASSERT(haveCopiedXFromGpu,
1660
+ "a wait should only be triggered if copy has been scheduled");
1661
+ stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
1662
+ }
1663
+ nbv->convertCoordinates(AtomLocality::Local, x.unpaddedArrayRef());
1664
+ }
1665
+ }
1666
+
1667
+ if (simulationWork.useGpuNonbonded && (stepWork.computeNonbondedForces || domainWork.haveGpuBondedWork))
1668
+ {
1669
+ ddBalanceRegionHandler.openBeforeForceComputationGpu();
1670
+
1671
+ wallcycle_start(wcycle, WallCycleCounter::LaunchGpuPp);
1672
+ wallcycle_sub_start(wcycle, WallCycleSubCounter::LaunchGpuNonBonded);
1673
+ Nbnxm::gpu_upload_shiftvec(nbv->gpuNbv(), &nbv->nbat());
1674
+ if (!stepWork.useGpuXBufferOps)
1675
+ {
1676
+ Nbnxm::gpu_copy_xq_to_gpu(nbv->gpuNbv(), &nbv->nbat(), AtomLocality::Local);
1677
+ }
1678
+ wallcycle_sub_stop(wcycle, WallCycleSubCounter::LaunchGpuNonBonded);
1679
+ wallcycle_stop(wcycle, WallCycleCounter::LaunchGpuPp);
1680
+ // with X buffer ops offloaded to the GPU on all but the search steps
1681
+
1682
+ // bonded work not split into separate local and non-local, so with DD
1683
+ // we can only launch the kernel after non-local coordinates have been received.
1684
+ if (domainWork.haveGpuBondedWork && !simulationWork.havePpDomainDecomposition)
1685
+ {
1686
+ fr->listedForcesGpu->setPbcAndlaunchKernel(fr->pbcType, box, fr->bMolPBC, stepWork);
1687
+ }
1688
+
1689
+ /* launch local nonbonded work on GPU */
1690
+ wallcycle_start_nocount(wcycle, WallCycleCounter::LaunchGpuPp);
1691
+ wallcycle_sub_start_nocount(wcycle, WallCycleSubCounter::LaunchGpuNonBonded);
1692
+ do_nb_verlet(fr, ic, enerd, stepWork, InteractionLocality::Local, enbvClearFNo, step, nrnb, wcycle);
1693
+ wallcycle_sub_stop(wcycle, WallCycleSubCounter::LaunchGpuNonBonded);
1694
+ wallcycle_stop(wcycle, WallCycleCounter::LaunchGpuPp);
1695
+ }
1696
+
1697
+ if (stepWork.haveGpuPmeOnThisRank)
1698
+ {
1699
+ // In PME GPU and mixed mode we launch FFT / gather after the
1700
+ // X copy/transform to allow overlap as well as after the GPU NB
1701
+ // launch to avoid FFT launch overhead hijacking the CPU and delaying
1702
+ // the nonbonded kernel.
1703
+ launchPmeGpuFftAndGather(fr->pmedata,
1704
+ lambda[static_cast<int>(FreeEnergyPerturbationCouplingType::Coul)],
1705
+ wcycle,
1706
+ stepWork);
1707
+ }
1708
+
1709
+ /* Communicate coordinates and sum dipole if necessary */
1710
+ if (simulationWork.havePpDomainDecomposition)
1711
+ {
1712
+ if (!stepWork.doNeighborSearch)
1713
+ {
1714
+ GpuEventSynchronizer* gpuCoordinateHaloLaunched = nullptr;
1715
+ if (stepWork.useGpuXHalo)
1716
+ {
1717
+ // The following must be called after local setCoordinates (which records an event
1718
+ // when the coordinate data has been copied to the device).
1719
+ gpuCoordinateHaloLaunched = communicateGpuHaloCoordinates(*cr, box, localXReadyOnDevice);
1720
+
1721
+ if (domainWork.haveCpuNonLocalForceWork)
1722
+ {
1723
+ // non-local part of coordinate buffer must be copied back to host for CPU work
1724
+ stateGpu->copyCoordinatesFromGpu(
1725
+ x.unpaddedArrayRef(), AtomLocality::NonLocal, gpuCoordinateHaloLaunched);
1726
+ }
1727
+ }
1728
+ else
1729
+ {
1730
+ if (simulationWork.useGpuUpdate)
1731
+ {
1732
+ GMX_ASSERT(haveCopiedXFromGpu,
1733
+ "a wait should only be triggered if copy has been scheduled");
1734
+ const bool haveAlreadyWaited =
1735
+ (stepWork.computePmeOnSeparateRank && !pmeSendCoordinatesFromGpu);
1736
+ if (!haveAlreadyWaited)
1737
+ {
1738
+ stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
1739
+ }
1740
+ }
1741
+ dd_move_x(cr->dd, box, x.unpaddedArrayRef(), wcycle);
1742
+ }
1743
+
1744
+ if (stepWork.useGpuXBufferOps)
1745
+ {
1746
+ if (!stepWork.useGpuXHalo)
1747
+ {
1748
+ stateGpu->copyCoordinatesToGpu(x.unpaddedArrayRef(), AtomLocality::NonLocal);
1749
+ }
1750
+ GpuEventSynchronizer* xReadyOnDeviceEvent = stateGpu->getCoordinatesReadyOnDeviceEvent(
1751
+ AtomLocality::NonLocal, simulationWork, stepWork, gpuCoordinateHaloLaunched);
1752
+ if (stepWork.useGpuXHalo && domainWork.haveCpuNonLocalForceWork)
1753
+ {
1754
+ /* We already enqueued an event for Gpu Halo exchange completion into the
1755
+ * NonLocal stream when D2H copying the coordinates. */
1756
+ xReadyOnDeviceEvent = nullptr;
1757
+ }
1758
+ nbv->convertCoordinatesGpu(
1759
+ AtomLocality::NonLocal, stateGpu->getCoordinates(), xReadyOnDeviceEvent);
1760
+ }
1761
+ else
1762
+ {
1763
+ nbv->convertCoordinates(AtomLocality::NonLocal, x.unpaddedArrayRef());
1764
+ }
1765
+ }
1766
+
1767
+ if (simulationWork.useGpuNonbonded)
1768
+ {
1769
+
1770
+ if (!stepWork.useGpuXBufferOps)
1771
+ {
1772
+ wallcycle_start(wcycle, WallCycleCounter::LaunchGpuPp);
1773
+ wallcycle_sub_start(wcycle, WallCycleSubCounter::LaunchGpuNonBonded);
1774
+ Nbnxm::gpu_copy_xq_to_gpu(nbv->gpuNbv(), &nbv->nbat(), AtomLocality::NonLocal);
1775
+ wallcycle_sub_stop(wcycle, WallCycleSubCounter::LaunchGpuNonBonded);
1776
+ wallcycle_stop(wcycle, WallCycleCounter::LaunchGpuPp);
1777
+ }
1778
+
1779
+ if (domainWork.haveGpuBondedWork)
1780
+ {
1781
+ fr->listedForcesGpu->setPbcAndlaunchKernel(fr->pbcType, box, fr->bMolPBC, stepWork);
1782
+ }
1783
+
1784
+ /* launch non-local nonbonded tasks on GPU */
1785
+ wallcycle_start_nocount(wcycle, WallCycleCounter::LaunchGpuPp);
1786
+ wallcycle_sub_start(wcycle, WallCycleSubCounter::LaunchGpuNonBonded);
1787
+ do_nb_verlet(fr, ic, enerd, stepWork, InteractionLocality::NonLocal, enbvClearFNo, step, nrnb, wcycle);
1788
+ wallcycle_sub_stop(wcycle, WallCycleSubCounter::LaunchGpuNonBonded);
1789
+ wallcycle_stop(wcycle, WallCycleCounter::LaunchGpuPp);
1790
+ }
1791
+ }
1792
+
1793
+ if (simulationWork.useGpuNonbonded && stepWork.computeNonbondedForces)
1794
+ {
1795
+ /* launch D2H copy-back F */
1796
+ wallcycle_start_nocount(wcycle, WallCycleCounter::LaunchGpuPp);
1797
+ wallcycle_sub_start_nocount(wcycle, WallCycleSubCounter::LaunchGpuNonBonded);
1798
+
1799
+ if (simulationWork.havePpDomainDecomposition)
1800
+ {
1801
+ Nbnxm::gpu_launch_cpyback(nbv->gpuNbv(), &nbv->nbat(), stepWork, AtomLocality::NonLocal);
1802
+ }
1803
+ Nbnxm::gpu_launch_cpyback(nbv->gpuNbv(), &nbv->nbat(), stepWork, AtomLocality::Local);
1804
+ wallcycle_sub_stop(wcycle, WallCycleSubCounter::LaunchGpuNonBonded);
1805
+
1806
+ if (domainWork.haveGpuBondedWork && stepWork.computeEnergy)
1807
+ {
1808
+ fr->listedForcesGpu->launchEnergyTransfer();
1809
+ }
1810
+ wallcycle_stop(wcycle, WallCycleCounter::LaunchGpuPp);
1811
+ }
1812
+
1813
+ gmx::ArrayRef<const gmx::RVec> xWholeMolecules;
1814
+ if (fr->wholeMoleculeTransform)
1815
+ {
1816
+ xWholeMolecules = fr->wholeMoleculeTransform->wholeMoleculeCoordinates(x.unpaddedArrayRef(), box);
1817
+ }
1818
+
1819
+ // For the rest of the CPU tasks that depend on GPU-update produced coordinates,
1820
+ // this wait ensures that the D2H transfer is complete.
1821
+ if (simulationWork.useGpuUpdate && !stepWork.doNeighborSearch)
1822
+ {
1823
+ const bool needCoordsOnHost = (runScheduleWork.domainWork.haveCpuLocalForceWork
1824
+ || stepWork.computeVirial || simulationWork.computeMuTot);
1825
+ const bool haveAlreadyWaited =
1826
+ simulationWork.useCpuHaloExchange
1827
+ || (stepWork.computePmeOnSeparateRank && !pmeSendCoordinatesFromGpu);
1828
+ if (needCoordsOnHost && !haveAlreadyWaited)
1829
+ {
1830
+ GMX_ASSERT(haveCopiedXFromGpu,
1831
+ "a wait should only be triggered if copy has been scheduled");
1832
+ stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
1833
+ }
1834
+ }
1835
+
1836
+ DipoleData dipoleData;
1837
+
1838
+ if (simulationWork.computeMuTot)
1839
+ {
1840
+ const int start = 0;
1841
+
1842
+ /* Calculate total (local) dipole moment in a temporary common array.
1843
+ * This makes it possible to sum them over nodes faster.
1844
+ */
1845
+ gmx::ArrayRef<const gmx::RVec> xRef =
1846
+ (xWholeMolecules.empty() ? x.unpaddedArrayRef() : xWholeMolecules);
1847
+ calc_mu(start,
1848
+ mdatoms->homenr,
1849
+ xRef,
1850
+ mdatoms->chargeA,
1851
+ mdatoms->chargeB,
1852
+ mdatoms->nChargePerturbed != 0,
1853
+ dipoleData.muStaging[0],
1854
+ dipoleData.muStaging[1]);
1855
+
1856
+ reduceAndUpdateMuTot(
1857
+ &dipoleData, cr, (fr->efep != FreeEnergyPerturbationType::No), lambda, muTotal, ddBalanceRegionHandler);
1858
+ }
1859
+
1860
+ /* Reset energies */
1861
+ reset_enerdata(enerd);
1862
+
1863
+ if (haveDDAtomOrdering(*cr) && simulationWork.haveSeparatePmeRank)
1864
+ {
1865
+ wallcycle_start(wcycle, WallCycleCounter::PpDuringPme);
1866
+ dd_force_flop_start(cr->dd, nrnb);
1867
+ }
1868
+
1869
+ if (inputrec.bRot)
1870
+ {
1871
+ wallcycle_start(wcycle, WallCycleCounter::Rot);
1872
+ do_rotation(cr, enforcedRotation, box, x.unpaddedConstArrayRef(), t, step, stepWork.doNeighborSearch);
1873
+ wallcycle_stop(wcycle, WallCycleCounter::Rot);
1874
+ }
1875
+
1876
+ /* Start the force cycle counter.
1877
+ * Note that a different counter is used for dynamic load balancing.
1878
+ */
1879
+ wallcycle_start(wcycle, WallCycleCounter::Force);
1880
+
1881
+ /* Set up and clear force outputs:
1882
+ * forceOutMtsLevel0: everything except what is in the other two outputs
1883
+ * forceOutMtsLevel1: PME-mesh and listed-forces group 1
1884
+ * forceOutNonbonded: non-bonded forces
1885
+ * Without multiple time stepping all point to the same object.
1886
+ * With multiple time-stepping the use is different for MTS fast (level0 only) and slow steps.
1887
+ *
1888
+ * Note that CPU force buffer clearing needs to happen after the completion of the
1889
+ * previous step's CPU force H2D transfer (prior to force reduction).
1890
+ * In the current code this is ensured by the earlier waitCoordinatesReadyOnHost()
1891
+ * which is sufficient, but it is suboptimal as it prevents overlap of the force clearing
1892
+ * with independent GPU work (integration/constraints, x D2H copy).
1893
+ */
1894
+ ForceOutputs forceOutMtsLevel0 = setupForceOutputs(
1895
+ &fr->forceHelperBuffers[0], force, domainWork, stepWork, simulationWork.havePpDomainDecomposition, wcycle);
1896
+
1897
+ // Force output for MTS combined forces, only set at level1 MTS steps
1898
+ std::optional<ForceOutputs> forceOutMts =
1899
+ (simulationWork.useMts && stepWork.computeSlowForces)
1900
+ ? std::optional(setupForceOutputs(&fr->forceHelperBuffers[1],
1901
+ forceView->forceMtsCombinedWithPadding(),
1902
+ domainWork,
1903
+ stepWork,
1904
+ simulationWork.havePpDomainDecomposition,
1905
+ wcycle))
1906
+ : std::nullopt;
1907
+
1908
+ ForceOutputs* forceOutMtsLevel1 =
1909
+ simulationWork.useMts ? (stepWork.computeSlowForces ? &forceOutMts.value() : nullptr)
1910
+ : &forceOutMtsLevel0;
1911
+
1912
+ const bool nonbondedAtMtsLevel1 = runScheduleWork.simulationWork.computeNonbondedAtMtsLevel1;
1913
+
1914
+ ForceOutputs* forceOutNonbonded = nonbondedAtMtsLevel1 ? forceOutMtsLevel1 : &forceOutMtsLevel0;
1915
+
1916
+ if (inputrec.bPull && pull_have_constraint(*pull_work))
1917
+ {
1918
+ clear_pull_forces(pull_work);
1919
+ }
1920
+
1921
+ wallcycle_stop(wcycle, WallCycleCounter::Force);
1922
+
+ /* We calculate the non-bonded forces, when done on the CPU, here.
+ * We do this before calling do_force_lowlevel, because in that
+ * function, the listed forces are calculated before PME, which
+ * does communication. With this order, non-bonded and listed
+ * force calculation imbalance can be balanced out by the domain
+ * decomposition load balancing.
+ */
+
+ const bool useOrEmulateGpuNb = simulationWork.useGpuNonbonded || fr->nbv->emulateGpu();
+
+ if (!useOrEmulateGpuNb)
+ {
+ wallcycle_start_nocount(wcycle, WallCycleCounter::Force);
+ do_nb_verlet(fr, ic, enerd, stepWork, InteractionLocality::Local, enbvClearFYes, step, nrnb, wcycle);
+ wallcycle_stop(wcycle, WallCycleCounter::Force);
+ }
+
+ if (stepWork.useGpuXHalo && domainWork.haveCpuNonLocalForceWork)
+ {
+ /* Wait for non-local coordinate data to be copied from device */
+ stateGpu->waitCoordinatesReadyOnHost(AtomLocality::NonLocal);
+ }
+
+ wallcycle_start_nocount(wcycle, WallCycleCounter::Force);
+ if (fr->efep != FreeEnergyPerturbationType::No && stepWork.computeNonbondedForces)
+ {
+ /* Calculate the local and non-local free energy interactions here.
+ * Happens here on the CPU both with and without GPU.
+ */
+ nbv->dispatchFreeEnergyKernels(x,
+ &forceOutNonbonded->forceWithShiftForces(),
+ fr->use_simd_kernels,
+ fr->ntype,
+ *fr->ic,
+ fr->shift_vec,
+ fr->nbfp,
+ fr->ljpme_c6grid,
+ mdatoms->chargeA,
+ mdatoms->chargeB,
+ mdatoms->typeA,
+ mdatoms->typeB,
+ lambda,
+ enerd,
+ stepWork,
+ nrnb);
+ }
+
+ if (stepWork.computeNonbondedForces && !useOrEmulateGpuNb)
+ {
+ if (simulationWork.havePpDomainDecomposition)
+ {
+ do_nb_verlet(fr, ic, enerd, stepWork, InteractionLocality::NonLocal, enbvClearFNo, step, nrnb, wcycle);
+ }
+
+ if (stepWork.computeForces)
+ {
+ /* Add all the non-bonded force to the normal force array.
+ * This can be split into a local and a non-local part when overlapping
+ * communication with calculation with domain decomposition.
+ */
+ wallcycle_stop(wcycle, WallCycleCounter::Force);
+ nbv->atomdata_add_nbat_f_to_f(AtomLocality::All,
+ forceOutNonbonded->forceWithShiftForces().force());
+ wallcycle_start_nocount(wcycle, WallCycleCounter::Force);
+ }
+
+ /* If there are multiple fshift output buffers we need to reduce them */
+ if (stepWork.computeVirial)
+ {
+ /* This is not in a subcounter because it takes a
+ negligible and constant-sized amount of time */
+ nbnxn_atomdata_add_nbat_fshift_to_fshift(
+ nbv->nbat(), forceOutNonbonded->forceWithShiftForces().shiftForces());
+ }
+ }
+
+ // Compute wall interactions, when present.
+ // Note: should be moved to special forces.
+ if (inputrec.nwall && stepWork.computeNonbondedForces)
+ {
+ /* foreign lambda component for walls */
+ real dvdl_walls = do_walls(inputrec,
+ *fr,
+ box,
+ mdatoms->typeA,
+ mdatoms->typeB,
+ mdatoms->cENER,
+ mdatoms->homenr,
+ mdatoms->nPerturbed,
+ x.unpaddedConstArrayRef(),
+ &forceOutMtsLevel0.forceWithVirial(),
+ lambda[static_cast<int>(FreeEnergyPerturbationCouplingType::Vdw)],
+ enerd->grpp.energyGroupPairTerms[NonBondedEnergyTerms::LJSR],
+ nrnb);
+ enerd->dvdl_lin[FreeEnergyPerturbationCouplingType::Vdw] += dvdl_walls;
+ }
+
+ if (stepWork.computeListedForces)
+ {
+ /* Check whether we need to take into account PBC in listed interactions */
+ bool needMolPbc = false;
+ for (const auto& listedForces : fr->listedForces)
+ {
+ if (listedForces.haveCpuListedForces(*fr->fcdata))
+ {
+ needMolPbc = fr->bMolPBC;
+ }
+ }
+
+ t_pbc pbc;
+
+ if (needMolPbc)
+ {
+ /* Since all atoms are in the rectangular or triclinic unit-cell,
+ * only single box vector shifts (2 in x) are required.
+ */
+ set_pbc_dd(&pbc, fr->pbcType, haveDDAtomOrdering(*cr) ? cr->dd->numCells : nullptr, TRUE, box);
+ }
+
+ for (int mtsIndex = 0; mtsIndex < (simulationWork.useMts && stepWork.computeSlowForces ? 2 : 1);
+ mtsIndex++)
+ {
+ ListedForces& listedForces = fr->listedForces[mtsIndex];
+ ForceOutputs& forceOut = (mtsIndex == 0 ? forceOutMtsLevel0 : *forceOutMtsLevel1);
+ listedForces.calculate(wcycle,
+ box,
+ cr,
+ ms,
+ x,
+ xWholeMolecules,
+ fr->fcdata.get(),
+ hist,
+ &forceOut,
+ fr,
+ &pbc,
+ enerd,
+ nrnb,
+ lambda,
+ mdatoms->chargeA,
+ mdatoms->chargeB,
+ makeConstArrayRef(mdatoms->bPerturbed),
+ mdatoms->cENER,
+ mdatoms->nPerturbed,
+ haveDDAtomOrdering(*cr) ? cr->dd->globalAtomIndices.data() : nullptr,
+ stepWork);
+ }
+ }
+
+ if (stepWork.computeSlowForces)
+ {
+ longRangeNonbondeds->calculate(fr->pmedata,
+ cr,
+ x.unpaddedConstArrayRef(),
+ &forceOutMtsLevel1->forceWithVirial(),
+ enerd,
+ box,
+ lambda,
+ dipoleData.muStateAB,
+ stepWork,
+ ddBalanceRegionHandler);
+ }
+
+ wallcycle_stop(wcycle, WallCycleCounter::Force);
+
+ // VdW dispersion correction, only computed on main rank to avoid double counting
+ if ((stepWork.computeEnergy || stepWork.computeVirial) && fr->dispersionCorrection && MAIN(cr))
+ {
+ // Calculate long range corrections to pressure and energy
+ const DispersionCorrection::Correction correction = fr->dispersionCorrection->calculate(
+ box, lambda[static_cast<int>(FreeEnergyPerturbationCouplingType::Vdw)]);
+
+ if (stepWork.computeEnergy)
+ {
+ enerd->term[F_DISPCORR] = correction.energy;
+ enerd->term[F_DVDL_VDW] += correction.dvdl;
+ enerd->dvdl_lin[FreeEnergyPerturbationCouplingType::Vdw] += correction.dvdl;
+ }
+ if (stepWork.computeVirial)
+ {
+ correction.correctVirial(vir_force);
+ enerd->term[F_PDISPCORR] = correction.pressure;
+ }
+ }
+
+ const bool needToReceivePmeResultsFromSeparateRank = (PAR(cr) && stepWork.computePmeOnSeparateRank);
+ const bool needToReceivePmeResults =
+ (stepWork.haveGpuPmeOnThisRank || needToReceivePmeResultsFromSeparateRank);
+
+ /* When running free energy perturbations steered by AWH and doing PME calculations on the
+ * GPU we must wait for the PME calculation (dhdl) results to finish before sampling the
+ * FEP dimension with AWH. */
+ const bool needEarlyPmeResults = (awh != nullptr && awh->hasFepLambdaDimension() && needToReceivePmeResults
+ && stepWork.computeEnergy && stepWork.computeSlowForces);
+ if (needEarlyPmeResults)
+ {
+ if (stepWork.haveGpuPmeOnThisRank)
+ {
+ pmeGpuWaitAndReduce(fr->pmedata,
+ stepWork,
+ wcycle,
+ &forceOutMtsLevel1->forceWithVirial(),
+ enerd,
+ lambda[static_cast<int>(FreeEnergyPerturbationCouplingType::Coul)]);
+ }
+ else if (needToReceivePmeResultsFromSeparateRank)
+ {
+ /* In case of node-splitting, the PP nodes receive the long-range
+ * forces, virial and energy from the PME nodes here.
+ */
+ pme_receive_force_ener(fr,
+ cr,
+ &forceOutMtsLevel1->forceWithVirial(),
+ enerd,
+ simulationWork.useGpuPmePpCommunication,
+ stepWork.useGpuPmeFReduction,
+ wcycle);
+ }
+ }
+
+ computeSpecialForces(fplog,
+ cr,
+ inputrec,
+ awh,
+ enforcedRotation,
+ imdSession,
+ pull_work,
+ step,
+ t,
+ wcycle,
+ fr->forceProviders,
+ box,
+ x.unpaddedArrayRef(),
+ mdatoms,
+ lambda,
+ stepWork,
+ &forceOutMtsLevel0.forceWithVirial(),
+ forceOutMtsLevel1 ? &forceOutMtsLevel1->forceWithVirial() : nullptr,
+ enerd,
+ ed,
+ stepWork.doNeighborSearch);
+
+ if (simulationWork.havePpDomainDecomposition && stepWork.computeForces && stepWork.useGpuFHalo
+ && domainWork.haveCpuLocalForceWork)
+ {
+ stateGpu->copyForcesToGpu(forceOutMtsLevel0.forceWithShiftForces().force(), AtomLocality::Local);
+ }
+
+ GMX_ASSERT(!(nonbondedAtMtsLevel1 && stepWork.useGpuFBufferOps),
+ "The schedule below does not allow for nonbonded MTS with GPU buffer ops");
+ GMX_ASSERT(!(nonbondedAtMtsLevel1 && stepWork.useGpuFHalo),
+ "The schedule below does not allow for nonbonded MTS with GPU halo exchange");
+ // Will store the amount of cycles spent waiting for the GPU that
+ // will be later used in the DLB accounting.
+ float cycles_wait_gpu = 0;
+ if (useOrEmulateGpuNb && stepWork.computeNonbondedForces)
+ {
+ auto& forceWithShiftForces = forceOutNonbonded->forceWithShiftForces();
+
+ /* wait for non-local forces (or calculate in emulation mode) */
+ if (simulationWork.havePpDomainDecomposition)
+ {
+ if (simulationWork.useGpuNonbonded)
+ {
+ cycles_wait_gpu += Nbnxm::gpu_wait_finish_task(
+ nbv->gpuNbv(),
+ stepWork,
+ AtomLocality::NonLocal,
+ enerd->grpp.energyGroupPairTerms[NonBondedEnergyTerms::LJSR].data(),
+ enerd->grpp.energyGroupPairTerms[NonBondedEnergyTerms::CoulombSR].data(),
+ forceWithShiftForces.shiftForces(),
+ wcycle);
+ }
+ else
+ {
+ wallcycle_start_nocount(wcycle, WallCycleCounter::Force);
+ do_nb_verlet(
+ fr, ic, enerd, stepWork, InteractionLocality::NonLocal, enbvClearFYes, step, nrnb, wcycle);
+ wallcycle_stop(wcycle, WallCycleCounter::Force);
+ }
+
+ if (stepWork.useGpuFBufferOps)
+ {
+ if (domainWork.haveCpuNonLocalForceWork)
+ {
+ stateGpu->copyForcesToGpu(forceOutMtsLevel0.forceWithShiftForces().force(),
+ AtomLocality::NonLocal);
+ }
+
+
+ fr->gpuForceReduction[gmx::AtomLocality::NonLocal]->execute();
+
+ if (!stepWork.useGpuFHalo)
+ {
+ /* We don't explicitly wait for the forces to be reduced on device,
+ * but wait for them to finish copying to CPU instead.
+ * So, we manually consume the event, see Issue #3988. */
+ stateGpu->consumeForcesReducedOnDeviceEvent(AtomLocality::NonLocal);
+ // copy from GPU input for dd_move_f()
+ stateGpu->copyForcesFromGpu(forceOutMtsLevel0.forceWithShiftForces().force(),
+ AtomLocality::NonLocal);
+ }
+ }
+ else
+ {
+ nbv->atomdata_add_nbat_f_to_f(AtomLocality::NonLocal, forceWithShiftForces.force());
+ }
+
+ if (fr->nbv->emulateGpu() && stepWork.computeVirial)
+ {
+ nbnxn_atomdata_add_nbat_fshift_to_fshift(nbv->nbat(), forceWithShiftForces.shiftForces());
+ }
+ }
+ }
+
+ /* Combining the forces for multiple time stepping before the halo exchange, when possible,
+ * avoids an extra halo exchange (when DD is used) and post-processing step.
+ */
+ if (stepWork.combineMtsForcesBeforeHaloExchange)
+ {
+ wallcycle_start_nocount(wcycle, WallCycleCounter::Force);
+ combineMtsForces(getLocalAtomCount(cr->dd, *mdatoms, simulationWork.havePpDomainDecomposition),
+ force.unpaddedArrayRef(),
+ forceView->forceMtsCombined(),
+ inputrec.mtsLevels[1].stepFactor);
+ wallcycle_stop(wcycle, WallCycleCounter::Force);
+ }
+
+ // With both nonbonded and PME offloaded to a GPU on the same rank, we use
+ // an alternating wait/reduction scheme.
+ // When running free energy perturbations steered by AWH and calculating PME on GPU,
+ // i.e. if needEarlyPmeResults == true, the PME results have already been reduced above.
+ const bool alternateGpuWait = (!c_disableAlternatingWait && stepWork.haveGpuPmeOnThisRank
+ && simulationWork.useGpuNonbonded && !simulationWork.havePpDomainDecomposition
+ && !stepWork.useGpuFBufferOps && !needEarlyPmeResults);
+
+
+ const int expectedLocalFReadyOnDeviceConsumptionCount = getExpectedLocalFReadyOnDeviceConsumptionCount(
+ simulationWork, domainWork, stepWork, useOrEmulateGpuNb, alternateGpuWait);
+ // If expectedLocalFReadyOnDeviceConsumptionCount == 0, stateGpu can be uninitialized
+ if (expectedLocalFReadyOnDeviceConsumptionCount > 0)
+ {
+ stateGpu->setFReadyOnDeviceEventExpectedConsumptionCount(
+ AtomLocality::Local, expectedLocalFReadyOnDeviceConsumptionCount);
+ }
+
+ if (simulationWork.havePpDomainDecomposition)
+ {
+ /* We are done with the CPU compute.
+ * We will now communicate the non-local forces.
+ * If we use a GPU this will overlap with GPU work, so in that case
+ * we do not close the DD force balancing region here.
+ */
+ ddBalanceRegionHandler.closeAfterForceComputationCpu();
+
+ if (stepWork.computeForces)
+ {
+
+ if (stepWork.useGpuFHalo)
+ {
+ // If there exist CPU forces, data from halo exchange should accumulate into these
+ bool accumulateForces = domainWork.haveCpuLocalForceWork;
+ gmx::FixedCapacityVector<GpuEventSynchronizer*, 2> gpuForceHaloDependencies;
+ // completion of both H2D copy and clearing is signaled by fReadyOnDevice
+ if (domainWork.haveCpuLocalForceWork || stepWork.clearGpuFBufferEarly)
+ {
+ gpuForceHaloDependencies.push_back(stateGpu->fReadyOnDevice(AtomLocality::Local));
+ }
+ gpuForceHaloDependencies.push_back(stateGpu->fReducedOnDevice(AtomLocality::NonLocal));
+
+ communicateGpuHaloForces(*cr, accumulateForces, &gpuForceHaloDependencies);
+ }
+ else
+ {
+ if (stepWork.useGpuFBufferOps)
+ {
+ stateGpu->waitForcesReadyOnHost(AtomLocality::NonLocal);
+ }
+
+ // Without MTS or with MTS at slow steps with uncombined forces we need to
+ // communicate the fast forces
+ if (!simulationWork.useMts || !stepWork.combineMtsForcesBeforeHaloExchange)
+ {
+ dd_move_f(cr->dd, &forceOutMtsLevel0.forceWithShiftForces(), wcycle);
+ }
+ // With MTS we need to communicate the slow or combined (in forceOutMtsLevel1) forces
+ if (simulationWork.useMts && stepWork.computeSlowForces)
+ {
+ dd_move_f(cr->dd, &forceOutMtsLevel1->forceWithShiftForces(), wcycle);
+ }
+ }
+ }
+ }
+
+ if (alternateGpuWait)
+ {
+ alternatePmeNbGpuWaitReduce(fr->nbv.get(),
+ fr->pmedata,
+ forceOutNonbonded,
+ forceOutMtsLevel1,
+ enerd,
+ lambda[static_cast<int>(FreeEnergyPerturbationCouplingType::Coul)],
+ stepWork,
+ wcycle);
+ }
+
+ if (!alternateGpuWait && stepWork.haveGpuPmeOnThisRank && !needEarlyPmeResults)
+ {
+ pmeGpuWaitAndReduce(fr->pmedata,
+ stepWork,
+ wcycle,
+ &forceOutMtsLevel1->forceWithVirial(),
+ enerd,
+ lambda[static_cast<int>(FreeEnergyPerturbationCouplingType::Coul)]);
+ }
+
+ /* Wait for local GPU NB outputs on the non-alternating wait path */
+ if (!alternateGpuWait && stepWork.computeNonbondedForces && simulationWork.useGpuNonbonded)
+ {
+ /* Measured overhead on CUDA and OpenCL with(out) GPU sharing
+ * is between 0.5 and 1.5 Mcycles. So 2 MCycles is an overestimate,
+ * but even with a step of 0.1 ms the difference is less than 1%
+ * of the step time.
+ */
+ const float gpuWaitApiOverheadMargin = 2e6F; /* cycles */
+ const float waitCycles = Nbnxm::gpu_wait_finish_task(
+ nbv->gpuNbv(),
+ stepWork,
+ AtomLocality::Local,
+ enerd->grpp.energyGroupPairTerms[NonBondedEnergyTerms::LJSR].data(),
+ enerd->grpp.energyGroupPairTerms[NonBondedEnergyTerms::CoulombSR].data(),
+ forceOutNonbonded->forceWithShiftForces().shiftForces(),
+ wcycle);
+
+ if (ddBalanceRegionHandler.useBalancingRegion())
+ {
+ DdBalanceRegionWaitedForGpu waitedForGpu = DdBalanceRegionWaitedForGpu::yes;
+ if (stepWork.computeForces && waitCycles <= gpuWaitApiOverheadMargin)
+ {
+ /* We measured few cycles, it could be that the kernel
+ * and transfer finished earlier and there was no actual
+ * wait time, only API call overhead.
+ * Then the actual time could be anywhere between 0 and
+ * cycles_wait_est. We will use half of cycles_wait_est.
+ */
+ waitedForGpu = DdBalanceRegionWaitedForGpu::no;
+ }
+ ddBalanceRegionHandler.closeAfterForceComputationGpu(cycles_wait_gpu, waitedForGpu);
+ }
+ }
+
+ if (fr->nbv->emulateGpu())
+ {
+ // NOTE: emulation kernel is not included in the balancing region,
+ // but emulation mode does not target performance anyway
+ wallcycle_start_nocount(wcycle, WallCycleCounter::Force);
+ do_nb_verlet(fr,
+ ic,
+ enerd,
+ stepWork,
+ InteractionLocality::Local,
+ haveDDAtomOrdering(*cr) ? enbvClearFNo : enbvClearFYes,
+ step,
+ nrnb,
+ wcycle);
+ wallcycle_stop(wcycle, WallCycleCounter::Force);
+ }
+
+ // If on GPU PME-PP comms path, receive forces from PME before GPU buffer ops
+ // TODO refactor this and unify with below default-path call to the same function
+ // When running free energy perturbations steered by AWH and calculating PME on GPU,
+ // i.e. if needEarlyPmeResults == true, the PME results have already been reduced above.
+ if (needToReceivePmeResultsFromSeparateRank && simulationWork.useGpuPmePpCommunication && !needEarlyPmeResults)
+ {
+ /* In case of node-splitting, the PP nodes receive the long-range
+ * forces, virial and energy from the PME nodes here.
+ */
+ pme_receive_force_ener(fr,
+ cr,
+ &forceOutMtsLevel1->forceWithVirial(),
+ enerd,
+ simulationWork.useGpuPmePpCommunication,
+ stepWork.useGpuPmeFReduction,
+ wcycle);
+ }
+
+
+ /* Do the nonbonded GPU (or emulation) force buffer reduction
+ * on the non-alternating path. */
+ GMX_ASSERT(!(nonbondedAtMtsLevel1 && stepWork.useGpuFBufferOps),
+ "The schedule below does not allow for nonbonded MTS with GPU buffer ops");
+ if (useOrEmulateGpuNb && !alternateGpuWait)
+ {
+ if (stepWork.useGpuFBufferOps)
+ {
+ ArrayRef<gmx::RVec> forceWithShift = forceOutNonbonded->forceWithShiftForces().force();
+
+ // TODO: move these steps as early as possible:
+ // - CPU f H2D should be as soon as all CPU-side forces are done
+ // - wait for force reduction does not need to block host (at least not here, it's sufficient to wait
+ // before the next CPU task that consumes the forces: vsite spread or update)
+ // - copy is not performed if GPU force halo exchange is active, because it would overwrite the result
+ // of the halo exchange. In that case the copy is instead performed above, before the exchange.
+ // These should be unified.
+ if (domainWork.haveLocalForceContribInCpuBuffer && !stepWork.useGpuFHalo)
+ {
+ stateGpu->copyForcesToGpu(forceWithShift, AtomLocality::Local);
+ }
+
+ if (stepWork.computeNonbondedForces)
+ {
+ fr->gpuForceReduction[gmx::AtomLocality::Local]->execute();
+ }
+
+ // Copy forces to host if they are needed for update or if virtual sites are enabled.
+ // If there are vsites, we need to copy forces every step to spread vsite forces on host.
+ // TODO: When the output flags will be included in step workload, this copy can be combined with the
+ // copy call done in sim_utils(...) for the output.
+ // NOTE: If there are virtual sites, the forces are modified on host after this D2H copy. Hence,
+ // they should not be copied in do_md(...) for the output.
+ if (!simulationWork.useGpuUpdate
+ || (simulationWork.useGpuUpdate && haveDDAtomOrdering(*cr) && simulationWork.useCpuPmePpCommunication)
+ || vsite)
+ {
+ if (stepWork.computeNonbondedForces)
+ {
+ /* We have previously issued force reduction on the GPU, but we will
+ * not use this event, instead relying on the stream being in-order.
+ * Issue #3988. */
+ stateGpu->consumeForcesReducedOnDeviceEvent(AtomLocality::Local);
+ }
+ stateGpu->copyForcesFromGpu(forceWithShift, AtomLocality::Local);
+ stateGpu->waitForcesReadyOnHost(AtomLocality::Local);
+ }
+ }
+ else if (stepWork.computeNonbondedForces)
+ {
+ ArrayRef<gmx::RVec> forceWithShift = forceOutNonbonded->forceWithShiftForces().force();
+ nbv->atomdata_add_nbat_f_to_f(AtomLocality::Local, forceWithShift);
+ }
+ }
+
+ if (expectedLocalFReadyOnDeviceConsumptionCount > 0)
+ {
+ /* The same fReadyOnDevice device synchronizer is later used to track buffer clearing,
+ * so we reset the expected consumption value back to the default (1). */
+ stateGpu->setFReadyOnDeviceEventExpectedConsumptionCount(AtomLocality::Local, 1);
+ }
+
+ launchGpuEndOfStepTasks(
+ nbv, fr->listedForcesGpu.get(), fr->pmedata, enerd, runScheduleWork, step, wcycle);
+
+ if (haveDDAtomOrdering(*cr))
+ {
+ dd_force_flop_stop(cr->dd, nrnb);
+ }
+
+ const bool haveCombinedMtsForces = (stepWork.computeForces && simulationWork.useMts && stepWork.computeSlowForces
+ && stepWork.combineMtsForcesBeforeHaloExchange);
+ if (stepWork.computeForces)
+ {
+ postProcessForceWithShiftForces(
+ nrnb, wcycle, box, x.unpaddedArrayRef(), &forceOutMtsLevel0, vir_force, *mdatoms, *fr, vsite, stepWork);
+
+ if (simulationWork.useMts && stepWork.computeSlowForces && !haveCombinedMtsForces)
+ {
+ postProcessForceWithShiftForces(
+ nrnb, wcycle, box, x.unpaddedArrayRef(), forceOutMtsLevel1, vir_force, *mdatoms, *fr, vsite, stepWork);
+ }
+ }
+
+ // TODO refactor this and unify with above GPU PME-PP / GPU update path call to the same function
+ // When running free energy perturbations steered by AWH and calculating PME on GPU,
+ // i.e. if needEarlyPmeResults == true, the PME results have already been reduced above.
+ if (needToReceivePmeResultsFromSeparateRank && simulationWork.useCpuPmePpCommunication && !needEarlyPmeResults)
+ {
+ /* In case of node-splitting, the PP nodes receive the long-range
+ * forces, virial and energy from the PME nodes here.
+ */
+ pme_receive_force_ener(fr,
+ cr,
+ &forceOutMtsLevel1->forceWithVirial(),
+ enerd,
+ simulationWork.useGpuPmePpCommunication,
+ false,
+ wcycle);
+ }
+
+ if (stepWork.computeForces)
+ {
+ /* If we don't use MTS or if we already combined the MTS forces before, we only
+ * need to post-process one ForceOutputs object here, called forceOutCombined,
+ * otherwise we have to post-process two outputs and then combine them.
+ */
+ ForceOutputs& forceOutCombined = (haveCombinedMtsForces ? forceOutMts.value() : forceOutMtsLevel0);
+ postProcessForces(
+ cr, step, nrnb, wcycle, box, x.unpaddedArrayRef(), &forceOutCombined, vir_force, mdatoms, fr, vsite, stepWork);
+
+ if (simulationWork.useMts && stepWork.computeSlowForces && !haveCombinedMtsForces)
+ {
+ postProcessForces(
+ cr, step, nrnb, wcycle, box, x.unpaddedArrayRef(), forceOutMtsLevel1, vir_force, mdatoms, fr, vsite, stepWork);
+
+ combineMtsForces(mdatoms->homenr,
+ force.unpaddedArrayRef(),
+ forceView->forceMtsCombined(),
+ inputrec.mtsLevels[1].stepFactor);
+ }
+ }
+
+ if (stepWork.computeEnergy)
+ {
+ /* Compute the final potential energy terms */
+ accumulatePotentialEnergies(enerd, lambda, inputrec.fepvals.get());
+
+ if (!EI_TPI(inputrec.eI))
+ {
+ checkPotentialEnergyValidity(step, *enerd, inputrec);
+ }
+ }
+
+ /* In case we don't have constraints and are using GPUs, the next balancing
+ * region starts here.
+ * Some "special" work at the end of do_force_cuts?, such as vsite spread,
+ * virial calculation and COM pulling, is thus not included in
+ * the balance timing, which is ok as most tasks do communication.
+ */
+ ddBalanceRegionHandler.openBeforeForceComputationCpu(DdAllowBalanceRegionReopen::no);
+ }
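For context on the combineMtsForces calls in the schedule above: the multiple-time-stepping (MTS) combination can be read as adding the slow (level-1) force buffer, scaled by the level-1 step factor, into the fast-force buffer for each local atom. The following is a minimal illustrative sketch under that assumption; the names are hypothetical and it is not the GROMACS API or part of the patched file.

// Illustrative only: assumed per-atom semantics of the MTS force combination,
// fast[i] += stepFactor * slow[i], applied to the local atoms of this rank.
#include <cstddef>
#include <vector>

struct Vec3 { double x, y, z; };

void combineMtsForcesSketch(std::size_t numLocalAtoms,
                            std::vector<Vec3>* fastForces,        // modified in place
                            const std::vector<Vec3>& slowForces,  // MTS level-1 contribution
                            double stepFactor)                    // ratio of slow to fast time step
{
    for (std::size_t i = 0; i < numLocalAtoms; ++i)
    {
        (*fastForces)[i].x += stepFactor * slowForces[i].x;
        (*fastForces)[i].y += stepFactor * slowForces[i].y;
        (*fastForces)[i].z += stepFactor * slowForces[i].z;
    }
}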