hillclimber-0.1.6-cp313-cp313-macosx_15_0_x86_64.whl

This diff represents the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (468)
  1. hillclimber/__init__.py +41 -0
  2. hillclimber/actions.py +53 -0
  3. hillclimber/analysis.py +590 -0
  4. hillclimber/biases.py +293 -0
  5. hillclimber/calc.py +22 -0
  6. hillclimber/cvs.py +1070 -0
  7. hillclimber/interfaces.py +133 -0
  8. hillclimber/metadynamics.py +416 -0
  9. hillclimber/nodes.py +6 -0
  10. hillclimber/opes.py +359 -0
  11. hillclimber/pycv.py +362 -0
  12. hillclimber/selectors.py +230 -0
  13. hillclimber/virtual_atoms.py +341 -0
  14. hillclimber-0.1.6.dist-info/METADATA +325 -0
  15. hillclimber-0.1.6.dist-info/RECORD +468 -0
  16. hillclimber-0.1.6.dist-info/WHEEL +6 -0
  17. hillclimber-0.1.6.dist-info/entry_points.txt +8 -0
  18. hillclimber-0.1.6.dist-info/licenses/LICENSE +165 -0
  19. plumed/__init__.py +104 -0
  20. plumed/_lib/bin/plumed +0 -0
  21. plumed/_lib/bin/plumed-config +9 -0
  22. plumed/_lib/bin/plumed-patch +9 -0
  23. plumed/_lib/include/plumed/adjmat/AdjacencyMatrixBase.h +659 -0
  24. plumed/_lib/include/plumed/adjmat/ContactMatrix.h +59 -0
  25. plumed/_lib/include/plumed/asmjit/arch.h +228 -0
  26. plumed/_lib/include/plumed/asmjit/arm.h +43 -0
  27. plumed/_lib/include/plumed/asmjit/asmjit.h +69 -0
  28. plumed/_lib/include/plumed/asmjit/asmjit_apibegin.h +143 -0
  29. plumed/_lib/include/plumed/asmjit/asmjit_apiend.h +93 -0
  30. plumed/_lib/include/plumed/asmjit/asmjit_build.h +971 -0
  31. plumed/_lib/include/plumed/asmjit/assembler.h +183 -0
  32. plumed/_lib/include/plumed/asmjit/base.h +56 -0
  33. plumed/_lib/include/plumed/asmjit/codebuilder.h +944 -0
  34. plumed/_lib/include/plumed/asmjit/codecompiler.h +767 -0
  35. plumed/_lib/include/plumed/asmjit/codeemitter.h +528 -0
  36. plumed/_lib/include/plumed/asmjit/codeholder.h +777 -0
  37. plumed/_lib/include/plumed/asmjit/constpool.h +286 -0
  38. plumed/_lib/include/plumed/asmjit/cpuinfo.h +402 -0
  39. plumed/_lib/include/plumed/asmjit/func.h +1327 -0
  40. plumed/_lib/include/plumed/asmjit/globals.h +370 -0
  41. plumed/_lib/include/plumed/asmjit/inst.h +137 -0
  42. plumed/_lib/include/plumed/asmjit/logging.h +317 -0
  43. plumed/_lib/include/plumed/asmjit/misc_p.h +103 -0
  44. plumed/_lib/include/plumed/asmjit/moved_string.h +318 -0
  45. plumed/_lib/include/plumed/asmjit/operand.h +1599 -0
  46. plumed/_lib/include/plumed/asmjit/osutils.h +207 -0
  47. plumed/_lib/include/plumed/asmjit/regalloc_p.h +597 -0
  48. plumed/_lib/include/plumed/asmjit/runtime.h +227 -0
  49. plumed/_lib/include/plumed/asmjit/simdtypes.h +1104 -0
  50. plumed/_lib/include/plumed/asmjit/utils.h +1387 -0
  51. plumed/_lib/include/plumed/asmjit/vmem.h +183 -0
  52. plumed/_lib/include/plumed/asmjit/x86.h +45 -0
  53. plumed/_lib/include/plumed/asmjit/x86assembler.h +125 -0
  54. plumed/_lib/include/plumed/asmjit/x86builder.h +117 -0
  55. plumed/_lib/include/plumed/asmjit/x86compiler.h +322 -0
  56. plumed/_lib/include/plumed/asmjit/x86emitter.h +5149 -0
  57. plumed/_lib/include/plumed/asmjit/x86globals.h +535 -0
  58. plumed/_lib/include/plumed/asmjit/x86inst.h +2547 -0
  59. plumed/_lib/include/plumed/asmjit/x86instimpl_p.h +74 -0
  60. plumed/_lib/include/plumed/asmjit/x86internal_p.h +108 -0
  61. plumed/_lib/include/plumed/asmjit/x86logging_p.h +92 -0
  62. plumed/_lib/include/plumed/asmjit/x86misc.h +417 -0
  63. plumed/_lib/include/plumed/asmjit/x86operand.h +1133 -0
  64. plumed/_lib/include/plumed/asmjit/x86regalloc_p.h +734 -0
  65. plumed/_lib/include/plumed/asmjit/zone.h +1157 -0
  66. plumed/_lib/include/plumed/bias/Bias.h +82 -0
  67. plumed/_lib/include/plumed/bias/ReweightBase.h +58 -0
  68. plumed/_lib/include/plumed/blas/blas.h +253 -0
  69. plumed/_lib/include/plumed/blas/def_external.h +61 -0
  70. plumed/_lib/include/plumed/blas/def_internal.h +97 -0
  71. plumed/_lib/include/plumed/blas/real.h +49 -0
  72. plumed/_lib/include/plumed/cltools/CLTool.h +32 -0
  73. plumed/_lib/include/plumed/clusters/ClusteringBase.h +70 -0
  74. plumed/_lib/include/plumed/colvar/Colvar.h +32 -0
  75. plumed/_lib/include/plumed/colvar/ColvarInput.h +68 -0
  76. plumed/_lib/include/plumed/colvar/ColvarShortcut.h +81 -0
  77. plumed/_lib/include/plumed/colvar/CoordinationBase.h +52 -0
  78. plumed/_lib/include/plumed/colvar/MultiColvarTemplate.h +333 -0
  79. plumed/_lib/include/plumed/colvar/PathMSDBase.h +101 -0
  80. plumed/_lib/include/plumed/colvar/RMSDVector.h +78 -0
  81. plumed/_lib/include/plumed/config/Config.h +118 -0
  82. plumed/_lib/include/plumed/config/version.h +9 -0
  83. plumed/_lib/include/plumed/contour/ContourFindingObject.h +87 -0
  84. plumed/_lib/include/plumed/contour/DistanceFromContourBase.h +82 -0
  85. plumed/_lib/include/plumed/contour/FindContour.h +67 -0
  86. plumed/_lib/include/plumed/core/Action.h +540 -0
  87. plumed/_lib/include/plumed/core/ActionAnyorder.h +48 -0
  88. plumed/_lib/include/plumed/core/ActionAtomistic.h +343 -0
  89. plumed/_lib/include/plumed/core/ActionForInterface.h +99 -0
  90. plumed/_lib/include/plumed/core/ActionPilot.h +57 -0
  91. plumed/_lib/include/plumed/core/ActionRegister.h +124 -0
  92. plumed/_lib/include/plumed/core/ActionSet.h +163 -0
  93. plumed/_lib/include/plumed/core/ActionSetup.h +48 -0
  94. plumed/_lib/include/plumed/core/ActionShortcut.h +73 -0
  95. plumed/_lib/include/plumed/core/ActionToGetData.h +59 -0
  96. plumed/_lib/include/plumed/core/ActionToPutData.h +101 -0
  97. plumed/_lib/include/plumed/core/ActionWithArguments.h +140 -0
  98. plumed/_lib/include/plumed/core/ActionWithMatrix.h +87 -0
  99. plumed/_lib/include/plumed/core/ActionWithValue.h +258 -0
  100. plumed/_lib/include/plumed/core/ActionWithVector.h +94 -0
  101. plumed/_lib/include/plumed/core/ActionWithVirtualAtom.h +123 -0
  102. plumed/_lib/include/plumed/core/CLTool.h +177 -0
  103. plumed/_lib/include/plumed/core/CLToolMain.h +102 -0
  104. plumed/_lib/include/plumed/core/CLToolRegister.h +108 -0
  105. plumed/_lib/include/plumed/core/Colvar.h +115 -0
  106. plumed/_lib/include/plumed/core/DataPassingObject.h +94 -0
  107. plumed/_lib/include/plumed/core/DataPassingTools.h +54 -0
  108. plumed/_lib/include/plumed/core/DomainDecomposition.h +120 -0
  109. plumed/_lib/include/plumed/core/ExchangePatterns.h +47 -0
  110. plumed/_lib/include/plumed/core/FlexibleBin.h +63 -0
  111. plumed/_lib/include/plumed/core/GREX.h +61 -0
  112. plumed/_lib/include/plumed/core/GenericMolInfo.h +89 -0
  113. plumed/_lib/include/plumed/core/Group.h +41 -0
  114. plumed/_lib/include/plumed/core/ModuleMap.h +30 -0
  115. plumed/_lib/include/plumed/core/ParallelTaskManager.h +1023 -0
  116. plumed/_lib/include/plumed/core/PbcAction.h +61 -0
  117. plumed/_lib/include/plumed/core/PlumedMain.h +632 -0
  118. plumed/_lib/include/plumed/core/PlumedMainInitializer.h +118 -0
  119. plumed/_lib/include/plumed/core/RegisterBase.h +340 -0
  120. plumed/_lib/include/plumed/core/TargetDist.h +48 -0
  121. plumed/_lib/include/plumed/core/Value.h +547 -0
  122. plumed/_lib/include/plumed/core/WithCmd.h +93 -0
  123. plumed/_lib/include/plumed/dimred/SMACOF.h +55 -0
  124. plumed/_lib/include/plumed/drr/DRR.h +383 -0
  125. plumed/_lib/include/plumed/drr/colvar_UIestimator.h +777 -0
  126. plumed/_lib/include/plumed/fisst/legendre_rule_fast.h +44 -0
  127. plumed/_lib/include/plumed/function/Custom.h +54 -0
  128. plumed/_lib/include/plumed/function/Function.h +85 -0
  129. plumed/_lib/include/plumed/function/FunctionOfMatrix.h +368 -0
  130. plumed/_lib/include/plumed/function/FunctionOfScalar.h +135 -0
  131. plumed/_lib/include/plumed/function/FunctionOfVector.h +296 -0
  132. plumed/_lib/include/plumed/function/FunctionSetup.h +180 -0
  133. plumed/_lib/include/plumed/function/FunctionShortcut.h +130 -0
  134. plumed/_lib/include/plumed/function/FunctionWithSingleArgument.h +165 -0
  135. plumed/_lib/include/plumed/gridtools/ActionWithGrid.h +43 -0
  136. plumed/_lib/include/plumed/gridtools/EvaluateGridFunction.h +99 -0
  137. plumed/_lib/include/plumed/gridtools/FunctionOfGrid.h +295 -0
  138. plumed/_lib/include/plumed/gridtools/GridCoordinatesObject.h +179 -0
  139. plumed/_lib/include/plumed/gridtools/GridSearch.h +135 -0
  140. plumed/_lib/include/plumed/gridtools/Interpolator.h +45 -0
  141. plumed/_lib/include/plumed/gridtools/KDE.h +455 -0
  142. plumed/_lib/include/plumed/gridtools/RDF.h +40 -0
  143. plumed/_lib/include/plumed/gridtools/SumOfKernels.h +219 -0
  144. plumed/_lib/include/plumed/isdb/MetainferenceBase.h +398 -0
  145. plumed/_lib/include/plumed/lapack/def_external.h +207 -0
  146. plumed/_lib/include/plumed/lapack/def_internal.h +388 -0
  147. plumed/_lib/include/plumed/lapack/lapack.h +899 -0
  148. plumed/_lib/include/plumed/lapack/lapack_limits.h +79 -0
  149. plumed/_lib/include/plumed/lapack/real.h +50 -0
  150. plumed/_lib/include/plumed/lepton/CompiledExpression.h +164 -0
  151. plumed/_lib/include/plumed/lepton/CustomFunction.h +143 -0
  152. plumed/_lib/include/plumed/lepton/Exception.h +93 -0
  153. plumed/_lib/include/plumed/lepton/ExpressionProgram.h +137 -0
  154. plumed/_lib/include/plumed/lepton/ExpressionTreeNode.h +145 -0
  155. plumed/_lib/include/plumed/lepton/Lepton.h +85 -0
  156. plumed/_lib/include/plumed/lepton/MSVC_erfc.h +123 -0
  157. plumed/_lib/include/plumed/lepton/Operation.h +1302 -0
  158. plumed/_lib/include/plumed/lepton/ParsedExpression.h +165 -0
  159. plumed/_lib/include/plumed/lepton/Parser.h +111 -0
  160. plumed/_lib/include/plumed/lepton/windowsIncludes.h +73 -0
  161. plumed/_lib/include/plumed/mapping/Path.h +44 -0
  162. plumed/_lib/include/plumed/mapping/PathProjectionCalculator.h +57 -0
  163. plumed/_lib/include/plumed/matrixtools/MatrixOperationBase.h +54 -0
  164. plumed/_lib/include/plumed/matrixtools/MatrixTimesMatrix.h +309 -0
  165. plumed/_lib/include/plumed/matrixtools/MatrixTimesVectorBase.h +365 -0
  166. plumed/_lib/include/plumed/matrixtools/OuterProduct.h +238 -0
  167. plumed/_lib/include/plumed/maze/Core.h +65 -0
  168. plumed/_lib/include/plumed/maze/Loss.h +86 -0
  169. plumed/_lib/include/plumed/maze/Member.h +66 -0
  170. plumed/_lib/include/plumed/maze/Memetic.h +799 -0
  171. plumed/_lib/include/plumed/maze/Optimizer.h +357 -0
  172. plumed/_lib/include/plumed/maze/Random_MT.h +156 -0
  173. plumed/_lib/include/plumed/maze/Tools.h +183 -0
  174. plumed/_lib/include/plumed/metatomic/vesin.h +188 -0
  175. plumed/_lib/include/plumed/molfile/Gromacs.h +2013 -0
  176. plumed/_lib/include/plumed/molfile/endianswap.h +217 -0
  177. plumed/_lib/include/plumed/molfile/fastio.h +683 -0
  178. plumed/_lib/include/plumed/molfile/largefiles.h +78 -0
  179. plumed/_lib/include/plumed/molfile/libmolfile_plugin.h +77 -0
  180. plumed/_lib/include/plumed/molfile/molfile_plugin.h +1034 -0
  181. plumed/_lib/include/plumed/molfile/periodic_table.h +248 -0
  182. plumed/_lib/include/plumed/molfile/readpdb.h +447 -0
  183. plumed/_lib/include/plumed/molfile/vmdplugin.h +236 -0
  184. plumed/_lib/include/plumed/multicolvar/MultiColvarShortcuts.h +45 -0
  185. plumed/_lib/include/plumed/opes/ExpansionCVs.h +79 -0
  186. plumed/_lib/include/plumed/sasa/Sasa.h +32 -0
  187. plumed/_lib/include/plumed/secondarystructure/SecondaryStructureBase.h +372 -0
  188. plumed/_lib/include/plumed/setup/ActionSetup.h +25 -0
  189. plumed/_lib/include/plumed/small_vector/small_vector.h +6114 -0
  190. plumed/_lib/include/plumed/symfunc/CoordinationNumbers.h +41 -0
  191. plumed/_lib/include/plumed/tools/Angle.h +52 -0
  192. plumed/_lib/include/plumed/tools/AtomDistribution.h +138 -0
  193. plumed/_lib/include/plumed/tools/AtomNumber.h +152 -0
  194. plumed/_lib/include/plumed/tools/BiasRepresentation.h +106 -0
  195. plumed/_lib/include/plumed/tools/BitmaskEnum.h +167 -0
  196. plumed/_lib/include/plumed/tools/Brent1DRootSearch.h +159 -0
  197. plumed/_lib/include/plumed/tools/CheckInRange.h +44 -0
  198. plumed/_lib/include/plumed/tools/Citations.h +74 -0
  199. plumed/_lib/include/plumed/tools/ColvarOutput.h +118 -0
  200. plumed/_lib/include/plumed/tools/Communicator.h +316 -0
  201. plumed/_lib/include/plumed/tools/ConjugateGradient.h +80 -0
  202. plumed/_lib/include/plumed/tools/DLLoader.h +79 -0
  203. plumed/_lib/include/plumed/tools/ERMSD.h +73 -0
  204. plumed/_lib/include/plumed/tools/Exception.h +406 -0
  205. plumed/_lib/include/plumed/tools/File.h +28 -0
  206. plumed/_lib/include/plumed/tools/FileBase.h +153 -0
  207. plumed/_lib/include/plumed/tools/FileTools.h +37 -0
  208. plumed/_lib/include/plumed/tools/ForwardDecl.h +54 -0
  209. plumed/_lib/include/plumed/tools/Grid.h +638 -0
  210. plumed/_lib/include/plumed/tools/HistogramBead.h +136 -0
  211. plumed/_lib/include/plumed/tools/IFile.h +117 -0
  212. plumed/_lib/include/plumed/tools/KernelFunctions.h +113 -0
  213. plumed/_lib/include/plumed/tools/Keywords.h +380 -0
  214. plumed/_lib/include/plumed/tools/LatticeReduction.h +66 -0
  215. plumed/_lib/include/plumed/tools/LeptonCall.h +64 -0
  216. plumed/_lib/include/plumed/tools/LinkCells.h +126 -0
  217. plumed/_lib/include/plumed/tools/Log.h +41 -0
  218. plumed/_lib/include/plumed/tools/LoopUnroller.h +163 -0
  219. plumed/_lib/include/plumed/tools/Matrix.h +721 -0
  220. plumed/_lib/include/plumed/tools/MatrixSquareBracketsAccess.h +138 -0
  221. plumed/_lib/include/plumed/tools/MergeVectorTools.h +153 -0
  222. plumed/_lib/include/plumed/tools/Minimise1DBrent.h +244 -0
  223. plumed/_lib/include/plumed/tools/MinimiseBase.h +120 -0
  224. plumed/_lib/include/plumed/tools/MolDataClass.h +51 -0
  225. plumed/_lib/include/plumed/tools/NeighborList.h +112 -0
  226. plumed/_lib/include/plumed/tools/OFile.h +286 -0
  227. plumed/_lib/include/plumed/tools/OpenACC.h +180 -0
  228. plumed/_lib/include/plumed/tools/OpenMP.h +75 -0
  229. plumed/_lib/include/plumed/tools/PDB.h +154 -0
  230. plumed/_lib/include/plumed/tools/Pbc.h +139 -0
  231. plumed/_lib/include/plumed/tools/PlumedHandle.h +105 -0
  232. plumed/_lib/include/plumed/tools/RMSD.h +493 -0
  233. plumed/_lib/include/plumed/tools/Random.h +80 -0
  234. plumed/_lib/include/plumed/tools/RootFindingBase.h +79 -0
  235. plumed/_lib/include/plumed/tools/Stopwatch.h +475 -0
  236. plumed/_lib/include/plumed/tools/Subprocess.h +142 -0
  237. plumed/_lib/include/plumed/tools/SwitchingFunction.h +208 -0
  238. plumed/_lib/include/plumed/tools/Tensor.h +724 -0
  239. plumed/_lib/include/plumed/tools/TokenizedLine.h +123 -0
  240. plumed/_lib/include/plumed/tools/Tools.h +638 -0
  241. plumed/_lib/include/plumed/tools/Torsion.h +55 -0
  242. plumed/_lib/include/plumed/tools/TrajectoryParser.h +118 -0
  243. plumed/_lib/include/plumed/tools/Tree.h +61 -0
  244. plumed/_lib/include/plumed/tools/TypesafePtr.h +463 -0
  245. plumed/_lib/include/plumed/tools/Units.h +167 -0
  246. plumed/_lib/include/plumed/tools/Vector.h +433 -0
  247. plumed/_lib/include/plumed/tools/View.h +296 -0
  248. plumed/_lib/include/plumed/tools/View2D.h +100 -0
  249. plumed/_lib/include/plumed/tools/h36.h +39 -0
  250. plumed/_lib/include/plumed/vatom/ActionWithVirtualAtom.h +32 -0
  251. plumed/_lib/include/plumed/ves/BasisFunctions.h +380 -0
  252. plumed/_lib/include/plumed/ves/CoeffsBase.h +310 -0
  253. plumed/_lib/include/plumed/ves/CoeffsMatrix.h +220 -0
  254. plumed/_lib/include/plumed/ves/CoeffsVector.h +251 -0
  255. plumed/_lib/include/plumed/ves/FermiSwitchingFunction.h +74 -0
  256. plumed/_lib/include/plumed/ves/GridIntegrationWeights.h +50 -0
  257. plumed/_lib/include/plumed/ves/GridLinearInterpolation.h +81 -0
  258. plumed/_lib/include/plumed/ves/GridProjWeights.h +61 -0
  259. plumed/_lib/include/plumed/ves/LinearBasisSetExpansion.h +303 -0
  260. plumed/_lib/include/plumed/ves/Optimizer.h +444 -0
  261. plumed/_lib/include/plumed/ves/TargetDistModifer.h +53 -0
  262. plumed/_lib/include/plumed/ves/TargetDistribution.h +266 -0
  263. plumed/_lib/include/plumed/ves/VesBias.h +545 -0
  264. plumed/_lib/include/plumed/ves/VesTools.h +142 -0
  265. plumed/_lib/include/plumed/ves/WaveletGrid.h +75 -0
  266. plumed/_lib/include/plumed/volumes/ActionVolume.h +268 -0
  267. plumed/_lib/include/plumed/volumes/VolumeShortcut.h +147 -0
  268. plumed/_lib/include/plumed/wrapper/Plumed.h +5025 -0
  269. plumed/_lib/include/plumed/xdrfile/xdrfile.h +663 -0
  270. plumed/_lib/include/plumed/xdrfile/xdrfile_trr.h +89 -0
  271. plumed/_lib/include/plumed/xdrfile/xdrfile_xtc.h +90 -0
  272. plumed/_lib/lib/PythonCVInterface.dylib +0 -0
  273. plumed/_lib/lib/libplumed.dylib +0 -0
  274. plumed/_lib/lib/libplumedKernel.dylib +0 -0
  275. plumed/_lib/lib/libplumedWrapper.a +0 -0
  276. plumed/_lib/lib/pkgconfig/plumed.pc +13 -0
  277. plumed/_lib/lib/pkgconfig/plumedInternals.pc +13 -0
  278. plumed/_lib/lib/pkgconfig/plumedWrapper.pc +13 -0
  279. plumed/_lib/lib/plumed/fortran/plumed.f90 +879 -0
  280. plumed/_lib/lib/plumed/fortran/plumed_f08.f90 +2625 -0
  281. plumed/_lib/lib/plumed/modulefile +69 -0
  282. plumed/_lib/lib/plumed/patches/gromacs-2022.5.config +43 -0
  283. plumed/_lib/lib/plumed/patches/gromacs-2022.5.diff/src/gromacs/CMakeLists.txt +543 -0
  284. plumed/_lib/lib/plumed/patches/gromacs-2022.5.diff/src/gromacs/CMakeLists.txt.preplumed +540 -0
  285. plumed/_lib/lib/plumed/patches/gromacs-2022.5.diff/src/gromacs/mdlib/expanded.cpp +1628 -0
  286. plumed/_lib/lib/plumed/patches/gromacs-2022.5.diff/src/gromacs/mdlib/expanded.cpp.preplumed +1590 -0
  287. plumed/_lib/lib/plumed/patches/gromacs-2022.5.diff/src/gromacs/mdlib/expanded.h +103 -0
  288. plumed/_lib/lib/plumed/patches/gromacs-2022.5.diff/src/gromacs/mdlib/expanded.h.preplumed +99 -0
  289. plumed/_lib/lib/plumed/patches/gromacs-2022.5.diff/src/gromacs/mdlib/sim_util.cpp +2527 -0
  290. plumed/_lib/lib/plumed/patches/gromacs-2022.5.diff/src/gromacs/mdlib/sim_util.cpp.preplumed +2513 -0
  291. plumed/_lib/lib/plumed/patches/gromacs-2022.5.diff/src/gromacs/mdrun/legacymdrunoptions.cpp +208 -0
  292. plumed/_lib/lib/plumed/patches/gromacs-2022.5.diff/src/gromacs/mdrun/legacymdrunoptions.cpp.preplumed +175 -0
  293. plumed/_lib/lib/plumed/patches/gromacs-2022.5.diff/src/gromacs/mdrun/legacymdrunoptions.h +408 -0
  294. plumed/_lib/lib/plumed/patches/gromacs-2022.5.diff/src/gromacs/mdrun/legacymdrunoptions.h.preplumed +394 -0
  295. plumed/_lib/lib/plumed/patches/gromacs-2022.5.diff/src/gromacs/mdrun/md.cpp +2348 -0
  296. plumed/_lib/lib/plumed/patches/gromacs-2022.5.diff/src/gromacs/mdrun/md.cpp.preplumed +2091 -0
  297. plumed/_lib/lib/plumed/patches/gromacs-2022.5.diff/src/gromacs/mdrun/minimize.cpp +3573 -0
  298. plumed/_lib/lib/plumed/patches/gromacs-2022.5.diff/src/gromacs/mdrun/minimize.cpp.preplumed +3495 -0
  299. plumed/_lib/lib/plumed/patches/gromacs-2022.5.diff/src/gromacs/mdrun/replicaexchange.cpp +1506 -0
  300. plumed/_lib/lib/plumed/patches/gromacs-2022.5.diff/src/gromacs/mdrun/replicaexchange.cpp.preplumed +1402 -0
  301. plumed/_lib/lib/plumed/patches/gromacs-2022.5.diff/src/gromacs/mdrun/replicaexchange.h +114 -0
  302. plumed/_lib/lib/plumed/patches/gromacs-2022.5.diff/src/gromacs/mdrun/replicaexchange.h.preplumed +106 -0
  303. plumed/_lib/lib/plumed/patches/gromacs-2022.5.diff/src/gromacs/mdrun/rerun.cpp +997 -0
  304. plumed/_lib/lib/plumed/patches/gromacs-2022.5.diff/src/gromacs/mdrun/rerun.cpp.preplumed +906 -0
  305. plumed/_lib/lib/plumed/patches/gromacs-2022.5.diff/src/gromacs/mdrun/runner.cpp +2780 -0
  306. plumed/_lib/lib/plumed/patches/gromacs-2022.5.diff/src/gromacs/mdrun/runner.cpp.preplumed +2738 -0
  307. plumed/_lib/lib/plumed/patches/gromacs-2022.5.diff/src/gromacs/modularsimulator/expandedensembleelement.cpp +224 -0
  308. plumed/_lib/lib/plumed/patches/gromacs-2022.5.diff/src/gromacs/modularsimulator/expandedensembleelement.cpp.preplumed +222 -0
  309. plumed/_lib/lib/plumed/patches/gromacs-2023.5.config +43 -0
  310. plumed/_lib/lib/plumed/patches/gromacs-2023.5.diff/src/gromacs/CMakeLists.txt +549 -0
  311. plumed/_lib/lib/plumed/patches/gromacs-2023.5.diff/src/gromacs/CMakeLists.txt.preplumed +546 -0
  312. plumed/_lib/lib/plumed/patches/gromacs-2023.5.diff/src/gromacs/mdlib/expanded.cpp +1632 -0
  313. plumed/_lib/lib/plumed/patches/gromacs-2023.5.diff/src/gromacs/mdlib/expanded.cpp.preplumed +1594 -0
  314. plumed/_lib/lib/plumed/patches/gromacs-2023.5.diff/src/gromacs/mdlib/expanded.h +104 -0
  315. plumed/_lib/lib/plumed/patches/gromacs-2023.5.diff/src/gromacs/mdlib/expanded.h.preplumed +100 -0
  316. plumed/_lib/lib/plumed/patches/gromacs-2023.5.diff/src/gromacs/mdlib/sim_util.cpp +2624 -0
  317. plumed/_lib/lib/plumed/patches/gromacs-2023.5.diff/src/gromacs/mdlib/sim_util.cpp.preplumed +2610 -0
  318. plumed/_lib/lib/plumed/patches/gromacs-2023.5.diff/src/gromacs/mdrun/legacymdrunoptions.cpp +208 -0
  319. plumed/_lib/lib/plumed/patches/gromacs-2023.5.diff/src/gromacs/mdrun/legacymdrunoptions.cpp.preplumed +175 -0
  320. plumed/_lib/lib/plumed/patches/gromacs-2023.5.diff/src/gromacs/mdrun/legacymdrunoptions.h +409 -0
  321. plumed/_lib/lib/plumed/patches/gromacs-2023.5.diff/src/gromacs/mdrun/legacymdrunoptions.h.preplumed +395 -0
  322. plumed/_lib/lib/plumed/patches/gromacs-2023.5.diff/src/gromacs/mdrun/md.cpp +2419 -0
  323. plumed/_lib/lib/plumed/patches/gromacs-2023.5.diff/src/gromacs/mdrun/md.cpp.preplumed +2164 -0
  324. plumed/_lib/lib/plumed/patches/gromacs-2023.5.diff/src/gromacs/mdrun/minimize.cpp +3546 -0
  325. plumed/_lib/lib/plumed/patches/gromacs-2023.5.diff/src/gromacs/mdrun/minimize.cpp.preplumed +3468 -0
  326. plumed/_lib/lib/plumed/patches/gromacs-2023.5.diff/src/gromacs/mdrun/replicaexchange.cpp +1513 -0
  327. plumed/_lib/lib/plumed/patches/gromacs-2023.5.diff/src/gromacs/mdrun/replicaexchange.cpp.preplumed +1409 -0
  328. plumed/_lib/lib/plumed/patches/gromacs-2023.5.diff/src/gromacs/mdrun/replicaexchange.h +114 -0
  329. plumed/_lib/lib/plumed/patches/gromacs-2023.5.diff/src/gromacs/mdrun/replicaexchange.h.preplumed +106 -0
  330. plumed/_lib/lib/plumed/patches/gromacs-2023.5.diff/src/gromacs/mdrun/rerun.cpp +991 -0
  331. plumed/_lib/lib/plumed/patches/gromacs-2023.5.diff/src/gromacs/mdrun/rerun.cpp.preplumed +900 -0
  332. plumed/_lib/lib/plumed/patches/gromacs-2023.5.diff/src/gromacs/mdrun/runner.cpp +2895 -0
  333. plumed/_lib/lib/plumed/patches/gromacs-2023.5.diff/src/gromacs/mdrun/runner.cpp.preplumed +2849 -0
  334. plumed/_lib/lib/plumed/patches/gromacs-2023.5.diff/src/gromacs/modularsimulator/expandedensembleelement.cpp +224 -0
  335. plumed/_lib/lib/plumed/patches/gromacs-2023.5.diff/src/gromacs/modularsimulator/expandedensembleelement.cpp.preplumed +222 -0
  336. plumed/_lib/lib/plumed/patches/gromacs-2023.5.diff/src/gromacs/taskassignment/decidegpuusage.cpp +886 -0
  337. plumed/_lib/lib/plumed/patches/gromacs-2023.5.diff/src/gromacs/taskassignment/decidegpuusage.cpp.preplumed +880 -0
  338. plumed/_lib/lib/plumed/patches/gromacs-2023.5.diff/src/gromacs/taskassignment/include/gromacs/taskassignment/decidegpuusage.h +347 -0
  339. plumed/_lib/lib/plumed/patches/gromacs-2023.5.diff/src/gromacs/taskassignment/include/gromacs/taskassignment/decidegpuusage.h.preplumed +345 -0
  340. plumed/_lib/lib/plumed/patches/gromacs-2024.3.config +43 -0
  341. plumed/_lib/lib/plumed/patches/gromacs-2024.3.diff/src/gromacs/CMakeLists.txt +575 -0
  342. plumed/_lib/lib/plumed/patches/gromacs-2024.3.diff/src/gromacs/CMakeLists.txt.preplumed +572 -0
  343. plumed/_lib/lib/plumed/patches/gromacs-2024.3.diff/src/gromacs/mdlib/expanded.cpp +1632 -0
  344. plumed/_lib/lib/plumed/patches/gromacs-2024.3.diff/src/gromacs/mdlib/expanded.cpp.preplumed +1594 -0
  345. plumed/_lib/lib/plumed/patches/gromacs-2024.3.diff/src/gromacs/mdlib/expanded.h +104 -0
  346. plumed/_lib/lib/plumed/patches/gromacs-2024.3.diff/src/gromacs/mdlib/expanded.h.preplumed +100 -0
  347. plumed/_lib/lib/plumed/patches/gromacs-2024.3.diff/src/gromacs/mdlib/sim_util.cpp +2564 -0
  348. plumed/_lib/lib/plumed/patches/gromacs-2024.3.diff/src/gromacs/mdlib/sim_util.cpp.preplumed +2550 -0
  349. plumed/_lib/lib/plumed/patches/gromacs-2024.3.diff/src/gromacs/mdrun/legacymdrunoptions.cpp +208 -0
  350. plumed/_lib/lib/plumed/patches/gromacs-2024.3.diff/src/gromacs/mdrun/legacymdrunoptions.cpp.preplumed +175 -0
  351. plumed/_lib/lib/plumed/patches/gromacs-2024.3.diff/src/gromacs/mdrun/legacymdrunoptions.h +410 -0
  352. plumed/_lib/lib/plumed/patches/gromacs-2024.3.diff/src/gromacs/mdrun/legacymdrunoptions.h.preplumed +396 -0
  353. plumed/_lib/lib/plumed/patches/gromacs-2024.3.diff/src/gromacs/mdrun/md.cpp +2435 -0
  354. plumed/_lib/lib/plumed/patches/gromacs-2024.3.diff/src/gromacs/mdrun/md.cpp.preplumed +2187 -0
  355. plumed/_lib/lib/plumed/patches/gromacs-2024.3.diff/src/gromacs/mdrun/minimize.cpp +3592 -0
  356. plumed/_lib/lib/plumed/patches/gromacs-2024.3.diff/src/gromacs/mdrun/minimize.cpp.preplumed +3514 -0
  357. plumed/_lib/lib/plumed/patches/gromacs-2024.3.diff/src/gromacs/mdrun/replicaexchange.cpp +1513 -0
  358. plumed/_lib/lib/plumed/patches/gromacs-2024.3.diff/src/gromacs/mdrun/replicaexchange.cpp.preplumed +1409 -0
  359. plumed/_lib/lib/plumed/patches/gromacs-2024.3.diff/src/gromacs/mdrun/replicaexchange.h +114 -0
  360. plumed/_lib/lib/plumed/patches/gromacs-2024.3.diff/src/gromacs/mdrun/replicaexchange.h.preplumed +106 -0
  361. plumed/_lib/lib/plumed/patches/gromacs-2024.3.diff/src/gromacs/mdrun/rerun.cpp +958 -0
  362. plumed/_lib/lib/plumed/patches/gromacs-2024.3.diff/src/gromacs/mdrun/rerun.cpp.preplumed +929 -0
  363. plumed/_lib/lib/plumed/patches/gromacs-2024.3.diff/src/gromacs/mdrun/runner.cpp +2987 -0
  364. plumed/_lib/lib/plumed/patches/gromacs-2024.3.diff/src/gromacs/mdrun/runner.cpp.preplumed +2941 -0
  365. plumed/_lib/lib/plumed/patches/gromacs-2024.3.diff/src/gromacs/modularsimulator/expandedensembleelement.cpp +224 -0
  366. plumed/_lib/lib/plumed/patches/gromacs-2024.3.diff/src/gromacs/modularsimulator/expandedensembleelement.cpp.preplumed +222 -0
  367. plumed/_lib/lib/plumed/patches/gromacs-2024.3.diff/src/gromacs/taskassignment/decidegpuusage.cpp +904 -0
  368. plumed/_lib/lib/plumed/patches/gromacs-2024.3.diff/src/gromacs/taskassignment/decidegpuusage.cpp.preplumed +898 -0
  369. plumed/_lib/lib/plumed/patches/gromacs-2024.3.diff/src/gromacs/taskassignment/include/gromacs/taskassignment/decidegpuusage.h +353 -0
  370. plumed/_lib/lib/plumed/patches/gromacs-2024.3.diff/src/gromacs/taskassignment/include/gromacs/taskassignment/decidegpuusage.h.preplumed +351 -0
  371. plumed/_lib/lib/plumed/patches/gromacs-2025.0.config +39 -0
  372. plumed/_lib/lib/plumed/patches/gromacs-2025.0.diff/cmake/gmxManagePlumed.cmake +82 -0
  373. plumed/_lib/lib/plumed/patches/gromacs-2025.0.diff/cmake/gmxManagePlumed.cmake.preplumed +82 -0
  374. plumed/_lib/lib/plumed/patches/gromacs-2025.0.diff/src/gromacs/applied_forces/plumed/plumedMDModule.cpp +162 -0
  375. plumed/_lib/lib/plumed/patches/gromacs-2025.0.diff/src/gromacs/applied_forces/plumed/plumedMDModule.cpp.preplumed +154 -0
  376. plumed/_lib/lib/plumed/patches/gromacs-2025.0.diff/src/gromacs/applied_forces/plumed/plumedOptions.cpp +107 -0
  377. plumed/_lib/lib/plumed/patches/gromacs-2025.0.diff/src/gromacs/applied_forces/plumed/plumedOptions.cpp.preplumed +99 -0
  378. plumed/_lib/lib/plumed/patches/gromacs-2025.0.diff/src/gromacs/applied_forces/plumed/plumedOptions.h +120 -0
  379. plumed/_lib/lib/plumed/patches/gromacs-2025.0.diff/src/gromacs/applied_forces/plumed/plumedOptions.h.preplumed +111 -0
  380. plumed/_lib/lib/plumed/patches/gromacs-2025.0.diff/src/gromacs/applied_forces/plumed/plumedforceprovider.cpp +215 -0
  381. plumed/_lib/lib/plumed/patches/gromacs-2025.0.diff/src/gromacs/applied_forces/plumed/plumedforceprovider.cpp.preplumed +197 -0
  382. plumed/_lib/lib/plumed/patches/gromacs-2025.0.diff/src/gromacs/applied_forces/plumed/plumedforceprovider.h +87 -0
  383. plumed/_lib/lib/plumed/patches/gromacs-2025.0.diff/src/gromacs/applied_forces/plumed/plumedforceprovider.h.preplumed +86 -0
  384. plumed/_lib/lib/plumed/patches/gromacs-2025.0.diff/src/gromacs/mdrun/runner.cpp +2971 -0
  385. plumed/_lib/lib/plumed/patches/gromacs-2025.0.diff/src/gromacs/mdrun/runner.cpp.preplumed +2970 -0
  386. plumed/_lib/lib/plumed/patches/gromacs-2025.0.diff/src/gromacs/mdrunutility/mdmodulesnotifiers.h +430 -0
  387. plumed/_lib/lib/plumed/patches/gromacs-2025.0.diff/src/gromacs/mdrunutility/mdmodulesnotifiers.h.preplumed +429 -0
  388. plumed/_lib/lib/plumed/patches/namd-2.12.config +30 -0
  389. plumed/_lib/lib/plumed/patches/namd-2.12.diff +267 -0
  390. plumed/_lib/lib/plumed/patches/namd-2.13.config +30 -0
  391. plumed/_lib/lib/plumed/patches/namd-2.13.diff +267 -0
  392. plumed/_lib/lib/plumed/patches/namd-2.14.config +30 -0
  393. plumed/_lib/lib/plumed/patches/namd-2.14.diff +268 -0
  394. plumed/_lib/lib/plumed/patches/patch.sh +500 -0
  395. plumed/_lib/lib/plumed/patches/qespresso-5.0.2.config +25 -0
  396. plumed/_lib/lib/plumed/patches/qespresso-5.0.2.diff/PW/src/forces.f90 +368 -0
  397. plumed/_lib/lib/plumed/patches/qespresso-5.0.2.diff/PW/src/forces.f90.preplumed +366 -0
  398. plumed/_lib/lib/plumed/patches/qespresso-5.0.2.diff/PW/src/plugin_forces.f90 +71 -0
  399. plumed/_lib/lib/plumed/patches/qespresso-5.0.2.diff/PW/src/plugin_forces.f90.preplumed +24 -0
  400. plumed/_lib/lib/plumed/patches/qespresso-5.0.2.diff/PW/src/plugin_initialization.f90 +62 -0
  401. plumed/_lib/lib/plumed/patches/qespresso-5.0.2.diff/PW/src/plugin_initialization.f90.preplumed +21 -0
  402. plumed/_lib/lib/plumed/patches/qespresso-5.0.2.diff/PW/src/pwscf.f90 +189 -0
  403. plumed/_lib/lib/plumed/patches/qespresso-5.0.2.diff/PW/src/pwscf.f90.preplumed +185 -0
  404. plumed/_lib/lib/plumed/patches/qespresso-6.2.config +26 -0
  405. plumed/_lib/lib/plumed/patches/qespresso-6.2.diff/PW/src/forces.f90 +422 -0
  406. plumed/_lib/lib/plumed/patches/qespresso-6.2.diff/PW/src/forces.f90.preplumed +420 -0
  407. plumed/_lib/lib/plumed/patches/qespresso-6.2.diff/PW/src/plugin_ext_forces.f90 +70 -0
  408. plumed/_lib/lib/plumed/patches/qespresso-6.2.diff/PW/src/plugin_ext_forces.f90.preplumed +23 -0
  409. plumed/_lib/lib/plumed/patches/qespresso-6.2.diff/PW/src/plugin_initialization.f90 +62 -0
  410. plumed/_lib/lib/plumed/patches/qespresso-6.2.diff/PW/src/plugin_initialization.f90.preplumed +21 -0
  411. plumed/_lib/lib/plumed/patches/qespresso-6.2.diff/PW/src/run_pwscf.f90 +233 -0
  412. plumed/_lib/lib/plumed/patches/qespresso-6.2.diff/PW/src/run_pwscf.f90.preplumed +230 -0
  413. plumed/_lib/lib/plumed/patches/qespresso-7.0.config +28 -0
  414. plumed/_lib/lib/plumed/patches/qespresso-7.0.diff/Modules/Makefile +175 -0
  415. plumed/_lib/lib/plumed/patches/qespresso-7.0.diff/Modules/Makefile.preplumed +171 -0
  416. plumed/_lib/lib/plumed/patches/qespresso-7.0.diff/PW/src/forces.f90 +486 -0
  417. plumed/_lib/lib/plumed/patches/qespresso-7.0.diff/PW/src/forces.f90.preplumed +484 -0
  418. plumed/_lib/lib/plumed/patches/qespresso-7.0.diff/PW/src/plugin_ext_forces.f90 +74 -0
  419. plumed/_lib/lib/plumed/patches/qespresso-7.0.diff/PW/src/plugin_ext_forces.f90.preplumed +23 -0
  420. plumed/_lib/lib/plumed/patches/qespresso-7.0.diff/PW/src/plugin_initialization.f90 +64 -0
  421. plumed/_lib/lib/plumed/patches/qespresso-7.0.diff/PW/src/plugin_initialization.f90.preplumed +21 -0
  422. plumed/_lib/lib/plumed/patches/qespresso-7.0.diff/PW/src/run_pwscf.f90 +532 -0
  423. plumed/_lib/lib/plumed/patches/qespresso-7.0.diff/PW/src/run_pwscf.f90.preplumed +518 -0
  424. plumed/_lib/lib/plumed/patches/qespresso-7.2.config +28 -0
  425. plumed/_lib/lib/plumed/patches/qespresso-7.2.diff/Modules/Makefile +249 -0
  426. plumed/_lib/lib/plumed/patches/qespresso-7.2.diff/Modules/Makefile.preplumed +244 -0
  427. plumed/_lib/lib/plumed/patches/qespresso-7.2.diff/PW/src/forces.f90 +532 -0
  428. plumed/_lib/lib/plumed/patches/qespresso-7.2.diff/PW/src/forces.f90.preplumed +535 -0
  429. plumed/_lib/lib/plumed/patches/qespresso-7.2.diff/PW/src/plugin_ext_forces.f90 +74 -0
  430. plumed/_lib/lib/plumed/patches/qespresso-7.2.diff/PW/src/plugin_ext_forces.f90.preplumed +23 -0
  431. plumed/_lib/lib/plumed/patches/qespresso-7.2.diff/PW/src/plugin_initialization.f90 +64 -0
  432. plumed/_lib/lib/plumed/patches/qespresso-7.2.diff/PW/src/plugin_initialization.f90.preplumed +21 -0
  433. plumed/_lib/lib/plumed/patches/qespresso-7.2.diff/PW/src/run_pwscf.f90 +569 -0
  434. plumed/_lib/lib/plumed/patches/qespresso-7.2.diff/PW/src/run_pwscf.f90.preplumed +560 -0
  435. plumed/_lib/lib/plumed/plumed-config +9 -0
  436. plumed/_lib/lib/plumed/plumed-mklib +9 -0
  437. plumed/_lib/lib/plumed/plumed-newcv +9 -0
  438. plumed/_lib/lib/plumed/plumed-partial_tempering +9 -0
  439. plumed/_lib/lib/plumed/plumed-patch +9 -0
  440. plumed/_lib/lib/plumed/plumed-runtime +0 -0
  441. plumed/_lib/lib/plumed/plumed-selector +9 -0
  442. plumed/_lib/lib/plumed/plumed-vim2html +9 -0
  443. plumed/_lib/lib/plumed/scripts/config.sh +126 -0
  444. plumed/_lib/lib/plumed/scripts/mklib.sh +175 -0
  445. plumed/_lib/lib/plumed/scripts/newcv.sh +26 -0
  446. plumed/_lib/lib/plumed/scripts/partial_tempering.sh +319 -0
  447. plumed/_lib/lib/plumed/scripts/patch.sh +4 -0
  448. plumed/_lib/lib/plumed/scripts/selector.sh +234 -0
  449. plumed/_lib/lib/plumed/scripts/vim2html.sh +190 -0
  450. plumed/_lib/lib/plumed/src/colvar/Template.cpp +116 -0
  451. plumed/_lib/lib/plumed/src/config/compile_options.sh +3 -0
  452. plumed/_lib/lib/plumed/src/config/config.txt +181 -0
  453. plumed/_lib/lib/plumed/src/lib/Plumed.cmake +6 -0
  454. plumed/_lib/lib/plumed/src/lib/Plumed.cmake.runtime +5 -0
  455. plumed/_lib/lib/plumed/src/lib/Plumed.cmake.shared +5 -0
  456. plumed/_lib/lib/plumed/src/lib/Plumed.cmake.static +3 -0
  457. plumed/_lib/lib/plumed/src/lib/Plumed.inc +6 -0
  458. plumed/_lib/lib/plumed/src/lib/Plumed.inc.runtime +5 -0
  459. plumed/_lib/lib/plumed/src/lib/Plumed.inc.shared +5 -0
  460. plumed/_lib/lib/plumed/src/lib/Plumed.inc.static +3 -0
  461. plumed/_lib/lib/plumed/vim/scripts.vim +6 -0
  462. plumed/_plumed_core.cpython-311-darwin.so +0 -0
  463. plumed/_plumed_core.cpython-312-darwin.so +0 -0
  464. plumed/_plumed_core.cpython-313-darwin.so +0 -0
  465. plumedCommunications.cpython-311-darwin.so +0 -0
  466. plumedCommunications.cpython-312-darwin.so +0 -0
  467. plumedCommunications.cpython-313-darwin.so +0 -0
  468. plumedCommunications.pyi +431 -0
@@ -0,0 +1,2564 @@
+ /*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright 1991- The GROMACS Authors
+ * and the project initiators Erik Lindahl, Berk Hess and David van der Spoel.
+ * Consult the AUTHORS/COPYING files and https://www.gromacs.org for details.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * https://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at https://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out https://www.gromacs.org.
+ */
+ #include "gmxpre.h"
+
+ #include "config.h"
+
+ #include <cmath>
+ #include <cstdint>
+ #include <cstdio>
+ #include <cstring>
+
+ #include <array>
+ #include <optional>
+
+ #include "gromacs/applied_forces/awh/awh.h"
+ #include "gromacs/domdec/dlbtiming.h"
+ #include "gromacs/domdec/domdec.h"
+ #include "gromacs/domdec/domdec_struct.h"
+ #include "gromacs/domdec/gpuhaloexchange.h"
+ #include "gromacs/domdec/partition.h"
+ #include "gromacs/essentialdynamics/edsam.h"
+ #include "gromacs/ewald/pme.h"
+ #include "gromacs/ewald/pme_coordinate_receiver_gpu.h"
+ #include "gromacs/ewald/pme_pp.h"
+ #include "gromacs/ewald/pme_pp_comm_gpu.h"
+ #include "gromacs/gmxlib/network.h"
+ #include "gromacs/gmxlib/nonbonded/nb_free_energy.h"
+ #include "gromacs/gmxlib/nonbonded/nonbonded.h"
+ #include "gromacs/gmxlib/nrnb.h"
+ #include "gromacs/gpu_utils/gpu_utils.h"
+ #include "gromacs/imd/imd.h"
+ #include "gromacs/listed_forces/disre.h"
+ #include "gromacs/listed_forces/listed_forces.h"
+ #include "gromacs/listed_forces/listed_forces_gpu.h"
+ #include "gromacs/listed_forces/orires.h"
+ #include "gromacs/math/arrayrefwithpadding.h"
+ #include "gromacs/math/functions.h"
+ #include "gromacs/math/units.h"
+ #include "gromacs/math/vec.h"
+ #include "gromacs/math/vecdump.h"
+ #include "gromacs/mdlib/calcmu.h"
+ #include "gromacs/mdlib/calcvir.h"
+ #include "gromacs/mdlib/constr.h"
+ #include "gromacs/mdlib/dispersioncorrection.h"
+ #include "gromacs/mdlib/enerdata_utils.h"
+ #include "gromacs/mdlib/force.h"
+ #include "gromacs/mdlib/force_flags.h"
+ #include "gromacs/mdlib/forcerec.h"
+ #include "gromacs/mdlib/gmx_omp_nthreads.h"
+ #include "gromacs/mdlib/update.h"
+ #include "gromacs/mdlib/vsite.h"
+ #include "gromacs/mdlib/wall.h"
+ #include "gromacs/mdlib/wholemoleculetransform.h"
+ #include "gromacs/mdrunutility/mdmodulesnotifiers.h"
+ #include "gromacs/mdtypes/commrec.h"
+ #include "gromacs/mdtypes/enerdata.h"
+ #include "gromacs/mdtypes/forcebuffers.h"
+ #include "gromacs/mdtypes/forceoutput.h"
+ #include "gromacs/mdtypes/forcerec.h"
+ #include "gromacs/mdtypes/iforceprovider.h"
+ #include "gromacs/mdtypes/inputrec.h"
+ #include "gromacs/mdtypes/md_enums.h"
+ #include "gromacs/mdtypes/mdatom.h"
+ #include "gromacs/mdtypes/multipletimestepping.h"
+ #include "gromacs/mdtypes/simulation_workload.h"
+ #include "gromacs/mdtypes/state.h"
+ #include "gromacs/mdtypes/state_propagator_data_gpu.h"
+ #include "gromacs/nbnxm/gpu_data_mgmt.h"
+ #include "gromacs/nbnxm/nbnxm.h"
+ #include "gromacs/nbnxm/nbnxm_gpu.h"
+ #include "gromacs/pbcutil/ishift.h"
+ #include "gromacs/pbcutil/pbc.h"
+ #include "gromacs/pulling/pull.h"
+ #include "gromacs/pulling/pull_rotation.h"
+ #include "gromacs/timing/cyclecounter.h"
+ #include "gromacs/timing/gpu_timing.h"
+ #include "gromacs/timing/wallcycle.h"
+ #include "gromacs/timing/wallcyclereporting.h"
+ #include "gromacs/timing/walltime_accounting.h"
+ #include "gromacs/topology/topology.h"
+ #include "gromacs/utility/arrayref.h"
+ #include "gromacs/utility/basedefinitions.h"
+ #include "gromacs/utility/cstringutil.h"
+ #include "gromacs/utility/exceptions.h"
+ #include "gromacs/utility/fatalerror.h"
+ #include "gromacs/utility/fixedcapacityvector.h"
+ #include "gromacs/utility/gmxassert.h"
+ #include "gromacs/utility/gmxmpi.h"
+ #include "gromacs/utility/logger.h"
+ #include "gromacs/utility/smalloc.h"
+ #include "gromacs/utility/strconvert.h"
+ #include "gromacs/utility/stringutil.h"
+ #include "gromacs/utility/sysinfo.h"
+
+ #include "gpuforcereduction.h"
+
+ using gmx::ArrayRef;
+ using gmx::AtomLocality;
+ using gmx::DomainLifetimeWorkload;
+ using gmx::ForceOutputs;
+ using gmx::ForceWithShiftForces;
+ using gmx::InteractionLocality;
+ using gmx::RVec;
+ using gmx::SimulationWorkload;
+ using gmx::StepWorkload;
+
+ /* PLUMED */
+ #include "../../../Plumed.h"
+ extern int plumedswitch;
+ extern plumed plumedmain;
+ /* END PLUMED */
+
+ // TODO: this environment variable allows us to verify before release
+ // that on less common architectures the total cost of polling is not larger than
+ // a blocking wait (so polling does not introduce overhead when the static
+ // PME-first ordering would suffice).
+ static const bool c_disableAlternatingWait = (getenv("GMX_DISABLE_ALTERNATING_GPU_WAIT") != nullptr);
+
+ static void sum_forces(ArrayRef<RVec> f, ArrayRef<const RVec> forceToAdd)
+ {
+ GMX_ASSERT(f.size() >= forceToAdd.size(), "Accumulation buffer should be sufficiently large");
+ const int end = forceToAdd.size();
+
+ int gmx_unused nt = gmx_omp_nthreads_get(ModuleMultiThread::Default);
+ #pragma omp parallel for num_threads(nt) schedule(static)
+ for (int i = 0; i < end; i++)
+ {
+ rvec_inc(f[i], forceToAdd[i]);
+ }
+ }
+
+ static void calc_virial(int start,
+ int homenr,
+ const rvec x[],
+ const gmx::ForceWithShiftForces& forceWithShiftForces,
+ tensor vir_part,
+ const matrix box,
+ t_nrnb* nrnb,
+ const t_forcerec* fr,
+ PbcType pbcType)
+ {
+ /* The short-range virial from surrounding boxes */
+ const rvec* fshift = as_rvec_array(forceWithShiftForces.shiftForces().data());
+ const rvec* shiftVecPointer = as_rvec_array(fr->shift_vec.data());
+ calc_vir(gmx::c_numShiftVectors, shiftVecPointer, fshift, vir_part, pbcType == PbcType::Screw, box);
+ inc_nrnb(nrnb, eNR_VIRIAL, gmx::c_numShiftVectors);
+
+ /* Calculate partial virial, for local atoms only, based on short range.
+ * Total virial is computed in global_stat, called from do_md
+ */
+ const rvec* f = as_rvec_array(forceWithShiftForces.force().data());
+ f_calc_vir(start, start + homenr, x, f, vir_part, box);
+ inc_nrnb(nrnb, eNR_VIRIAL, homenr);
+
+ if (debug)
+ {
+ pr_rvecs(debug, 0, "vir_part", vir_part, DIM);
+ }
+ }
+
+ static void pull_potential_wrapper(const t_commrec* cr,
+ const t_inputrec& ir,
+ const matrix box,
+ gmx::ArrayRef<const gmx::RVec> x,
+ const t_mdatoms* mdatoms,
+ gmx_enerdata_t* enerd,
+ pull_t* pull_work,
+ const real* lambda,
+ double t,
+ gmx_wallcycle* wcycle)
+ {
+ t_pbc pbc;
+ real dvdl;
+
+ /* Calculate the center of mass forces, this requires communication,
+ * which is why pull_potential is called close to other communication.
+ */
+ wallcycle_start(wcycle, WallCycleCounter::PullPot);
+ set_pbc(&pbc, ir.pbcType, box);
+ dvdl = 0;
+ enerd->term[F_COM_PULL] +=
+ pull_potential(pull_work,
+ mdatoms->massT,
+ pbc,
+ cr,
+ t,
+ lambda[static_cast<int>(FreeEnergyPerturbationCouplingType::Restraint)],
+ x,
+ &dvdl);
+ enerd->dvdl_lin[FreeEnergyPerturbationCouplingType::Restraint] += dvdl;
+ wallcycle_stop(wcycle, WallCycleCounter::PullPot);
+ }
+
+ static void pme_receive_force_ener(t_forcerec* fr,
+ const t_commrec* cr,
+ gmx::ForceWithVirial* forceWithVirial,
+ gmx_enerdata_t* enerd,
+ bool useGpuPmePpComms,
+ bool receivePmeForceToGpu,
+ gmx_wallcycle* wcycle)
+ {
+ real e_q, e_lj, dvdl_q, dvdl_lj;
+ float cycles_ppdpme, cycles_seppme;
+
+ cycles_ppdpme = wallcycle_stop(wcycle, WallCycleCounter::PpDuringPme);
+ dd_cycles_add(cr->dd, cycles_ppdpme, ddCyclPPduringPME);
+
+ /* In case of node-splitting, the PP nodes receive the long-range
+ * forces, virial and energy from the PME nodes here.
+ */
+ wallcycle_start(wcycle, WallCycleCounter::PpPmeWaitRecvF);
+ dvdl_q = 0;
+ dvdl_lj = 0;
+ gmx_pme_receive_f(fr->pmePpCommGpu.get(),
+ cr,
+ forceWithVirial,
+ &e_q,
+ &e_lj,
+ &dvdl_q,
+ &dvdl_lj,
+ useGpuPmePpComms,
+ receivePmeForceToGpu,
+ &cycles_seppme);
+ enerd->term[F_COUL_RECIP] += e_q;
+ enerd->term[F_LJ_RECIP] += e_lj;
+ enerd->dvdl_lin[FreeEnergyPerturbationCouplingType::Coul] += dvdl_q;
+ enerd->dvdl_lin[FreeEnergyPerturbationCouplingType::Vdw] += dvdl_lj;
+
+ if (wcycle)
+ {
+ dd_cycles_add(cr->dd, cycles_seppme, ddCyclPME);
+ }
+ wallcycle_stop(wcycle, WallCycleCounter::PpPmeWaitRecvF);
+ }
+
+ static void print_large_forces(FILE* fp,
+ const t_mdatoms* md,
+ const t_commrec* cr,
+ int64_t step,
+ real forceTolerance,
+ ArrayRef<const RVec> x,
+ ArrayRef<const RVec> f)
+ {
+ real force2Tolerance = gmx::square(forceTolerance);
+ gmx::Index numNonFinite = 0;
+ for (int i = 0; i < md->homenr; i++)
+ {
+ real force2 = norm2(f[i]);
+ bool nonFinite = !std::isfinite(force2);
+ if (force2 >= force2Tolerance || nonFinite)
+ {
+ fprintf(fp,
+ "step %" PRId64 " atom %6d x %8.3f %8.3f %8.3f force %12.5e\n",
+ step,
+ ddglatnr(cr->dd, i),
+ x[i][XX],
+ x[i][YY],
+ x[i][ZZ],
+ std::sqrt(force2));
+ }
+ if (nonFinite)
+ {
+ numNonFinite++;
+ }
+ }
+ if (numNonFinite > 0)
+ {
+ /* Note that with MPI this fatal call on one rank might interrupt
+ * the printing on other ranks. But we can only avoid that with
+ * an expensive MPI barrier that we would need at each step.
+ */
+ gmx_fatal(FARGS, "At step %" PRId64 " detected non-finite forces on %td atoms", step, numNonFinite);
+ }
+ }
+
+ //! When necessary, spreads forces on vsites and computes the virial for \p forceOutputs->forceWithShiftForces()
+ static void postProcessForceWithShiftForces(t_nrnb* nrnb,
+ gmx_wallcycle* wcycle,
+ const matrix box,
+ ArrayRef<const RVec> x,
+ ForceOutputs* forceOutputs,
+ tensor vir_force,
+ const t_mdatoms& mdatoms,
+ const t_forcerec& fr,
+ gmx::VirtualSitesHandler* vsite,
+ const StepWorkload& stepWork)
+ {
+ ForceWithShiftForces& forceWithShiftForces = forceOutputs->forceWithShiftForces();
+
+ /* If we have NoVirSum forces, but we do not calculate the virial,
+ * we later sum the forceWithShiftForces buffer together with
+ * the noVirSum buffer and spread the combined vsite forces at once.
+ */
+ if (vsite && (!forceOutputs->haveForceWithVirial() || stepWork.computeVirial))
+ {
+ using VirialHandling = gmx::VirtualSitesHandler::VirialHandling;
+
+ auto f = forceWithShiftForces.force();
+ auto fshift = forceWithShiftForces.shiftForces();
+ const VirialHandling virialHandling =
+ (stepWork.computeVirial ? VirialHandling::Pbc : VirialHandling::None);
+ vsite->spreadForces(x, f, virialHandling, fshift, nullptr, nrnb, box, wcycle);
+ forceWithShiftForces.haveSpreadVsiteForces() = true;
+ }
+
+ if (stepWork.computeVirial)
+ {
+ /* Calculation of the virial must be done after vsites! */
+ calc_virial(
+ 0, mdatoms.homenr, as_rvec_array(x.data()), forceWithShiftForces, vir_force, box, nrnb, &fr, fr.pbcType);
+ }
+ }
+
+ //! Spread, compute virial for and sum forces, when necessary
+ static void postProcessForces(const t_commrec* cr,
+ int64_t step,
+ t_nrnb* nrnb,
+ gmx_wallcycle* wcycle,
+ const matrix box,
+ ArrayRef<const RVec> x,
+ ForceOutputs* forceOutputs,
+ tensor vir_force,
+ const t_mdatoms* mdatoms,
+ const t_forcerec* fr,
+ gmx::VirtualSitesHandler* vsite,
+ const StepWorkload& stepWork)
+ {
+ // Extract the final output force buffer, which is also the buffer for forces with shift forces
+ ArrayRef<RVec> f = forceOutputs->forceWithShiftForces().force();
+
+ if (forceOutputs->haveForceWithVirial())
+ {
+ auto& forceWithVirial = forceOutputs->forceWithVirial();
+
+ if (vsite)
+ {
+ /* Spread the mesh force on virtual sites to the other particles...
+ * This is parallellized. MPI communication is performed
+ * if the constructing atoms aren't local.
+ */
+ GMX_ASSERT(!stepWork.computeVirial || f.data() != forceWithVirial.force_.data(),
+ "We need separate force buffers for shift and virial forces when "
+ "computing the virial");
+ GMX_ASSERT(!stepWork.computeVirial
+ || forceOutputs->forceWithShiftForces().haveSpreadVsiteForces(),
+ "We should spread the force with shift forces separately when computing "
+ "the virial");
+ const gmx::VirtualSitesHandler::VirialHandling virialHandling =
+ (stepWork.computeVirial ? gmx::VirtualSitesHandler::VirialHandling::NonLinear
+ : gmx::VirtualSitesHandler::VirialHandling::None);
+ matrix virial = { { 0 } };
+ vsite->spreadForces(x, forceWithVirial.force_, virialHandling, {}, virial, nrnb, box, wcycle);
+ forceWithVirial.addVirialContribution(virial);
+ }
+
+ if (stepWork.computeVirial)
+ {
+ /* Now add the forces, this is local */
+ sum_forces(f, forceWithVirial.force_);
+
+ /* Add the direct virial contributions */
+ GMX_ASSERT(
+ forceWithVirial.computeVirial_,
+ "forceWithVirial should request virial computation when we request the virial");
+ m_add(vir_force, forceWithVirial.getVirial(), vir_force);
+
+ if (debug)
+ {
+ pr_rvecs(debug, 0, "vir_force", vir_force, DIM);
+ }
+ }
+ }
+ else
+ {
+ GMX_ASSERT(vsite == nullptr || forceOutputs->forceWithShiftForces().haveSpreadVsiteForces(),
+ "We should have spread the vsite forces (earlier)");
+ }
+
+ if (fr->print_force >= 0)
+ {
+ print_large_forces(stderr, mdatoms, cr, step, fr->print_force, x, f);
+ }
+ }
+
+ static void do_nb_verlet(t_forcerec* fr,
+ const interaction_const_t* ic,
+ gmx_enerdata_t* enerd,
+ const StepWorkload& stepWork,
+ const InteractionLocality ilocality,
+ const int clearF,
+ const int64_t step,
+ t_nrnb* nrnb,
+ gmx_wallcycle* wcycle)
+ {
+ if (!stepWork.computeNonbondedForces)
+ {
+ /* skip non-bonded calculation */
+ return;
+ }
+
+ nonbonded_verlet_t* nbv = fr->nbv.get();
+
+ /* GPU kernel launch overhead is already timed separately */
+ if (!nbv->useGpu())
+ {
+ /* When dynamic pair-list pruning is requested, we need to prune
+ * at nstlistPrune steps.
+ */
+ if (nbv->isDynamicPruningStepCpu(step))
+ {
+ /* Prune the pair-list beyond fr->ic->rlistPrune using
+ * the current coordinates of the atoms.
+ */
+ wallcycle_sub_start(wcycle, WallCycleSubCounter::NonbondedPruning);
+ nbv->dispatchPruneKernelCpu(ilocality, fr->shift_vec);
+ wallcycle_sub_stop(wcycle, WallCycleSubCounter::NonbondedPruning);
+ }
+ }
+
+ nbv->dispatchNonbondedKernel(
+ ilocality,
+ *ic,
+ stepWork,
+ clearF,
+ fr->shift_vec,
+ enerd->grpp.energyGroupPairTerms[fr->haveBuckingham ? NonBondedEnergyTerms::BuckinghamSR
+ : NonBondedEnergyTerms::LJSR],
+ enerd->grpp.energyGroupPairTerms[NonBondedEnergyTerms::CoulombSR],
+ nrnb);
+ }
+
+ static inline void clearRVecs(ArrayRef<RVec> v, const bool useOpenmpThreading)
+ {
+ int nth = gmx_omp_nthreads_get_simple_rvec_task(ModuleMultiThread::Default, v.ssize());
+
+ /* Note that we would like to avoid this conditional by putting it
+ * into the omp pragma instead, but then we still take the full
+ * omp parallel for overhead (at least with gcc5).
+ */
+ if (!useOpenmpThreading || nth == 1)
+ {
+ for (RVec& elem : v)
+ {
+ clear_rvec(elem);
+ }
+ }
+ else
+ {
+ #pragma omp parallel for num_threads(nth) schedule(static)
+ for (gmx::Index i = 0; i < v.ssize(); i++)
+ {
+ clear_rvec(v[i]);
+ }
+ }
+ }
+
+ /*! \brief Return an estimate of the average kinetic energy or 0 when unreliable
+ *
+ * \param groupOptions Group options, containing T-coupling options
+ */
+ static real averageKineticEnergyEstimate(const t_grpopts& groupOptions)
+ {
+ real nrdfCoupled = 0;
+ real nrdfUncoupled = 0;
+ real kineticEnergy = 0;
+ for (int g = 0; g < groupOptions.ngtc; g++)
+ {
+ if (groupOptions.tau_t[g] >= 0)
+ {
+ nrdfCoupled += groupOptions.nrdf[g];
+ kineticEnergy += groupOptions.nrdf[g] * 0.5 * groupOptions.ref_t[g] * gmx::c_boltz;
+ }
+ else
+ {
+ nrdfUncoupled += groupOptions.nrdf[g];
+ }
+ }
+
+ /* This conditional with > also catches nrdf=0 */
+ if (nrdfCoupled > nrdfUncoupled)
+ {
+ return kineticEnergy * (nrdfCoupled + nrdfUncoupled) / nrdfCoupled;
+ }
+ else
+ {
+ return 0;
+ }
+ }
+
+ /*! \brief This routine checks that the potential energy is finite.
+ *
+ * Always checks that the potential energy is finite. If step equals
+ * inputrec.init_step also checks that the magnitude of the potential energy
+ * is reasonable. Terminates with a fatal error when a check fails.
+ * Note that passing this check does not guarantee finite forces,
+ * since those use slightly different arithmetics. But in most cases
+ * there is just a narrow coordinate range where forces are not finite
+ * and energies are finite.
+ *
+ * \param[in] step The step number, used for checking and printing
+ * \param[in] enerd The energy data; the non-bonded group energies need to be added to
+ * \c enerd.term[F_EPOT] before calling this routine
+ * \param[in] inputrec The input record
+ */
+ static void checkPotentialEnergyValidity(int64_t step, const gmx_enerdata_t& enerd, const t_inputrec& inputrec)
+ {
+ /* Threshold valid for comparing absolute potential energy against
+ * the kinetic energy. Normally one should not consider absolute
+ * potential energy values, but with a factor of one million
+ * we should never get false positives.
+ */
+ constexpr real c_thresholdFactor = 1e6;
+
+ bool energyIsNotFinite = !std::isfinite(enerd.term[F_EPOT]);
+ real averageKineticEnergy = 0;
+ /* We only check for large potential energy at the initial step,
+ * because that is by far the most likely step for this too occur
+ * and because computing the average kinetic energy is not free.
+ * Note: nstcalcenergy >> 1 often does not allow to catch large energies
+ * before they become NaN.
+ */
+ if (step == inputrec.init_step && EI_DYNAMICS(inputrec.eI))
+ {
+ averageKineticEnergy = averageKineticEnergyEstimate(inputrec.opts);
+ }
+
+ if (energyIsNotFinite
+ || (averageKineticEnergy > 0 && enerd.term[F_EPOT] > c_thresholdFactor * averageKineticEnergy))
+ {
+ GMX_THROW(gmx::InternalError(gmx::formatString(
+ "Step %" PRId64
+ ": The total potential energy is %g, which is %s. The LJ and electrostatic "
+ "contributions to the energy are %g and %g, respectively. A %s potential energy "
+ "can be caused by overlapping interactions in bonded interactions or very large%s "
+ "coordinate values. Usually this is caused by a badly- or non-equilibrated initial "
+ "configuration, incorrect interactions or parameters in the topology.",
+ step,
+ enerd.term[F_EPOT],
+ energyIsNotFinite ? "not finite" : "extremely high",
+ enerd.term[F_LJ],
+ enerd.term[F_COUL_SR],
+ energyIsNotFinite ? "non-finite" : "very high",
+ energyIsNotFinite ? " or Nan" : "")));
+ }
+ }
+
+ /*! \brief Compute forces and/or energies for special algorithms
+ *
+ * The intention is to collect all calls to algorithms that compute
+ * forces on local atoms only and that do not contribute to the local
+ * virial sum (but add their virial contribution separately).
+ * Eventually these should likely all become ForceProviders.
+ * Within this function the intention is to have algorithms that do
+ * global communication at the end, so global barriers within the MD loop
+ * are as close together as possible.
+ *
+ * \param[in] fplog The log file
+ * \param[in] cr The communication record
+ * \param[in] inputrec The input record
+ * \param[in] awh The Awh module (nullptr if none in use).
+ * \param[in] enforcedRotation Enforced rotation module.
+ * \param[in] imdSession The IMD session
+ * \param[in] pull_work The pull work structure.
+ * \param[in] step The current MD step
+ * \param[in] t The current time
+ * \param[in,out] wcycle Wallcycle accounting struct
+ * \param[in,out] forceProviders Pointer to a list of force providers
+ * \param[in] box The unit cell
+ * \param[in] x The coordinates
+ * \param[in] mdatoms Per atom properties
+ * \param[in] lambda Array of free-energy lambda values
+ * \param[in] stepWork Step schedule flags
+ * \param[in,out] forceWithVirialMtsLevel0 Force and virial for MTS level0 forces
+ * \param[in,out] forceWithVirialMtsLevel1 Force and virial for MTS level1 forces, can be nullptr
+ * \param[in,out] enerd Energy buffer
+ * \param[in,out] ed Essential dynamics pointer
+ * \param[in] didNeighborSearch Tells if we did neighbor searching this step, used for ED sampling
+ *
+ * \todo Remove didNeighborSearch, which is used incorrectly.
+ * \todo Convert all other algorithms called here to ForceProviders.
+ */
+ static void computeSpecialForces(FILE* fplog,
+ const t_commrec* cr,
+ const t_inputrec& inputrec,
+ gmx::Awh* awh,
+ gmx_enfrot* enforcedRotation,
+ gmx::ImdSession* imdSession,
+ pull_t* pull_work,
+ int64_t step,
+ double t,
+ gmx_wallcycle* wcycle,
+ gmx::ForceProviders* forceProviders,
+ const matrix box,
+ gmx::ArrayRef<const gmx::RVec> x,
+ const t_mdatoms* mdatoms,
+ gmx::ArrayRef<const real> lambda,
+ const StepWorkload& stepWork,
+ gmx::ForceWithVirial* forceWithVirialMtsLevel0,
+ gmx::ForceWithVirial* forceWithVirialMtsLevel1,
+ gmx_enerdata_t* enerd,
+ gmx_edsam* ed,
+ bool didNeighborSearch)
+ {
+ /* NOTE: Currently all ForceProviders only provide forces.
+ * When they also provide energies, remove this conditional.
+ */
+ if (stepWork.computeForces)
+ {
+ gmx::ForceProviderInput forceProviderInput(
+ x,
+ mdatoms->homenr,
+ gmx::makeArrayRef(mdatoms->chargeA).subArray(0, mdatoms->homenr),
+ gmx::makeArrayRef(mdatoms->massT).subArray(0, mdatoms->homenr),
+ t,
+ step,
+ box,
+ *cr);
+ gmx::ForceProviderOutput forceProviderOutput(forceWithVirialMtsLevel0, enerd);
+
+ /* Collect forces from modules */
+ forceProviders->calculateForces(forceProviderInput, &forceProviderOutput);
+ }
+
+ const int pullMtsLevel = forceGroupMtsLevel(inputrec.mtsLevels, gmx::MtsForceGroups::Pull);
+ const bool doPulling = (inputrec.bPull && pull_have_potential(*pull_work)
+ && (pullMtsLevel == 0 || stepWork.computeSlowForces));
+
+ /* pull_potential_wrapper(), awh->applyBiasForcesAndUpdateBias(), pull_apply_forces()
+ * have to be called in this order
+ */
+ if (doPulling)
+ {
+ pull_potential_wrapper(cr, inputrec, box, x, mdatoms, enerd, pull_work, lambda.data(), t, wcycle);
+ }
+ if (awh && (pullMtsLevel == 0 || stepWork.computeSlowForces))
+ {
+ const bool needForeignEnergyDifferences = awh->needForeignEnergyDifferences(step);
+ std::vector<double> foreignLambdaDeltaH, foreignLambdaDhDl;
+ if (needForeignEnergyDifferences)
+ {
+ enerd->foreignLambdaTerms.finalizePotentialContributions(
673
+ enerd->dvdl_lin, lambda, *inputrec.fepvals);
674
+ std::tie(foreignLambdaDeltaH, foreignLambdaDhDl) = enerd->foreignLambdaTerms.getTerms(cr);
675
+ }
676
+
677
+ enerd->term[F_COM_PULL] += awh->applyBiasForcesAndUpdateBias(
678
+ inputrec.pbcType, foreignLambdaDeltaH, foreignLambdaDhDl, box, t, step, wcycle, fplog);
679
+ }
680
+ if (doPulling)
681
+ {
682
+ wallcycle_start_nocount(wcycle, WallCycleCounter::PullPot);
683
+ auto& forceWithVirial = (pullMtsLevel == 0) ? forceWithVirialMtsLevel0 : forceWithVirialMtsLevel1;
684
+ pull_apply_forces(pull_work, mdatoms->massT, cr, forceWithVirial);
685
+ wallcycle_stop(wcycle, WallCycleCounter::PullPot);
686
+ }
687
+
688
+ /* Add the forces from enforced rotation potentials (if any) */
689
+ if (inputrec.bRot)
690
+ {
691
+ wallcycle_start(wcycle, WallCycleCounter::RotAdd);
692
+ enerd->term[F_COM_PULL] +=
693
+ add_rot_forces(enforcedRotation, forceWithVirialMtsLevel0->force_, cr, step, t);
694
+ wallcycle_stop(wcycle, WallCycleCounter::RotAdd);
695
+ }
696
+
697
+ if (ed)
698
+ {
699
+ /* Note that since init_edsam() is called after the initialization
700
+ * of forcerec, edsam doesn't request the noVirSum force buffer.
701
+ * Thus if no other algorithm (e.g. PME) requires it, the forces
702
+ * here will contribute to the virial.
703
+ */
704
+ do_flood(cr, inputrec, x, forceWithVirialMtsLevel0->force_, ed, box, step, didNeighborSearch);
705
+ }
706
+
707
+ /* Add forces from interactive molecular dynamics (IMD), if any */
708
+ if (inputrec.bIMD && stepWork.computeForces)
709
+ {
710
+ imdSession->applyForces(forceWithVirialMtsLevel0->force_);
711
+ }
712
+ }
713
+
714
+ /*! \brief Launch the prepare_step and spread stages of PME GPU.
715
+ *
716
+ * \param[in] pmedata The PME structure
717
+ * \param[in] box The box matrix
718
+ * \param[in] stepWork Step schedule flags
719
+ * \param[in] xReadyOnDevice Event synchronizer indicating that the coordinates are ready in the device memory.
720
+ * \param[in] lambdaQ The Coulomb lambda of the current state.
721
+ * \param[in] useMdGpuGraph Whether MD GPU Graph is in use.
722
+ * \param[in] wcycle The wallcycle structure
723
+ */
724
+ static inline void launchPmeGpuSpread(gmx_pme_t* pmedata,
725
+ const matrix box,
726
+ const StepWorkload& stepWork,
727
+ GpuEventSynchronizer* xReadyOnDevice,
728
+ const real lambdaQ,
729
+ bool useMdGpuGraph,
730
+ gmx_wallcycle* wcycle)
731
+ {
732
+ wallcycle_start(wcycle, WallCycleCounter::PmeGpuMesh);
733
+ pme_gpu_prepare_computation(pmedata, box, wcycle, stepWork);
734
+ bool useGpuDirectComm = false;
735
+ gmx::PmeCoordinateReceiverGpu* pmeCoordinateReceiverGpu = nullptr;
736
+ pme_gpu_launch_spread(
737
+ pmedata, xReadyOnDevice, wcycle, lambdaQ, useGpuDirectComm, pmeCoordinateReceiverGpu, useMdGpuGraph);
738
+ wallcycle_stop(wcycle, WallCycleCounter::PmeGpuMesh);
739
+ }
740
+
741
+ /*! \brief Launch the FFT and gather stages of PME GPU
742
+ *
743
+ * This function only implements setting the output forces (no accumulation).
744
+ *
745
+ * \param[in] pmedata The PME structure
746
+ * \param[in] lambdaQ The Coulomb lambda of the current system state.
747
+ * \param[in] wcycle The wallcycle structure
748
+ * \param[in] stepWork Step schedule flags
749
+ */
750
+ static void launchPmeGpuFftAndGather(gmx_pme_t* pmedata,
751
+ const real lambdaQ,
752
+ gmx_wallcycle* wcycle,
753
+ const gmx::StepWorkload& stepWork)
754
+ {
755
+ wallcycle_start_nocount(wcycle, WallCycleCounter::PmeGpuMesh);
756
+ pme_gpu_launch_complex_transforms(pmedata, wcycle, stepWork);
757
+ pme_gpu_launch_gather(pmedata, wcycle, lambdaQ, stepWork.computeVirial);
758
+ wallcycle_stop(wcycle, WallCycleCounter::PmeGpuMesh);
759
+ }
760
+
761
+ /*! \brief
762
+ * Blocks until PME GPU tasks are completed, and gets the output forces and virial/energy
763
+ * (if they were to be computed).
764
+ *
765
+ * \param[in] pme The PME data structure.
766
+ * \param[in] stepWork The required work for this simulation step
767
+ * \param[in] wcycle The wallclock counter.
768
+ * \param[out] forceWithVirial The output force and virial
769
+ * \param[out] enerd The output energies
770
+ * \param[in] lambdaQ The Coulomb lambda to use when calculating the results.
771
+ */
772
+ static void pmeGpuWaitAndReduce(gmx_pme_t* pme,
773
+ const gmx::StepWorkload& stepWork,
774
+ gmx_wallcycle* wcycle,
775
+ gmx::ForceWithVirial* forceWithVirial,
776
+ gmx_enerdata_t* enerd,
777
+ const real lambdaQ)
778
+ {
779
+ wallcycle_start_nocount(wcycle, WallCycleCounter::PmeGpuMesh);
780
+
781
+ pme_gpu_wait_and_reduce(pme, stepWork, wcycle, forceWithVirial, enerd, lambdaQ);
782
+
783
+ wallcycle_stop(wcycle, WallCycleCounter::PmeGpuMesh);
784
+ }
785
+
786
+ /*! \brief
787
+ * Polling wait for either of the PME or nonbonded GPU tasks.
788
+ *
789
+ * Instead of a static order in waiting for GPU tasks, this function
790
+ * polls to check which of the two tasks completes first, and does the
791
+ * associated force buffer reduction overlapped with the other task.
792
+ * By doing that, unlike a static scheduling order, it can always overlap
793
+ * one of the reductions, regardless of the GPU task completion order.
794
+ *
795
+ * \param[in] nbv Nonbonded verlet structure
796
+ * \param[in,out] pmedata PME module data
797
+ * \param[in,out] forceOutputsNonbonded Force outputs for the non-bonded forces and shift forces
798
+ * \param[in,out] forceOutputsPme Force outputs for the PME forces and virial
799
+ * \param[in,out] enerd Energy data structure results are reduced into
800
+ * \param[in] lambdaQ The Coulomb lambda of the current system state.
801
+ * \param[in] stepWork Step schedule flags
802
+ * \param[in] wcycle The wallcycle structure
803
+ */
804
+ static void alternatePmeNbGpuWaitReduce(nonbonded_verlet_t* nbv,
805
+ gmx_pme_t* pmedata,
806
+ gmx::ForceOutputs* forceOutputsNonbonded,
807
+ gmx::ForceOutputs* forceOutputsPme,
808
+ gmx_enerdata_t* enerd,
809
+ const real lambdaQ,
810
+ const StepWorkload& stepWork,
811
+ gmx_wallcycle* wcycle)
812
+ {
813
+ bool isPmeGpuDone = false;
814
+ bool isNbGpuDone = false;
815
+
816
+ gmx::ArrayRef<const gmx::RVec> pmeGpuForces;
817
+
818
+ while (!isPmeGpuDone || !isNbGpuDone)
819
+ {
820
+ if (!isPmeGpuDone)
821
+ {
822
+ wallcycle_start_nocount(wcycle, WallCycleCounter::PmeGpuMesh);
823
+ GpuTaskCompletion completionType =
824
+ (isNbGpuDone) ? GpuTaskCompletion::Wait : GpuTaskCompletion::Check;
825
+ isPmeGpuDone = pme_gpu_try_finish_task(
826
+ pmedata, stepWork, wcycle, &forceOutputsPme->forceWithVirial(), enerd, lambdaQ, completionType);
827
+ wallcycle_stop(wcycle, WallCycleCounter::PmeGpuMesh);
828
+ }
829
+
830
+ if (!isNbGpuDone)
831
+ {
832
+ auto& forceBuffersNonbonded = forceOutputsNonbonded->forceWithShiftForces();
833
+ GpuTaskCompletion completionType =
834
+ (isPmeGpuDone) ? GpuTaskCompletion::Wait : GpuTaskCompletion::Check;
835
+ // To get the wcycle call count right, when in GpuTaskCompletion::Check mode,
836
+ // we start without counting and only when the task has finished do we issue a
837
+ // start/stop to increment.
838
+ // In GpuTaskCompletion::Wait mode the timing is expected to be done in the caller.
839
+ wallcycle_start_nocount(wcycle, WallCycleCounter::WaitGpuNbL);
840
+ isNbGpuDone = Nbnxm::gpu_try_finish_task(
841
+ nbv->gpuNbv(),
842
+ stepWork,
843
+ AtomLocality::Local,
844
+ enerd->grpp.energyGroupPairTerms[NonBondedEnergyTerms::LJSR].data(),
845
+ enerd->grpp.energyGroupPairTerms[NonBondedEnergyTerms::CoulombSR].data(),
846
+ forceBuffersNonbonded.shiftForces(),
847
+ completionType);
848
+ wallcycle_stop(wcycle, WallCycleCounter::WaitGpuNbL);
849
+
850
+ if (isNbGpuDone)
851
+ {
852
+ wallcycle_increment_event_count(wcycle, WallCycleCounter::WaitGpuNbL);
853
+ nbv->atomdata_add_nbat_f_to_f(AtomLocality::Local, forceBuffersNonbonded.force());
854
+ }
855
+ }
856
+ }
857
+ }
858
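
The scheme above boils down to a small polling loop: keep checking whichever task is still pending, switch to a blocking wait once the other one has finished, and run each force reduction as soon as its producer completes, so one reduction always overlaps the remaining task. A minimal sketch of that control flow, with the task-completion and reduction callables as placeholders rather than GROMACS API:

    #include <functional>

    // done(wait) returns whether the asynchronous task has completed; when wait is
    // true it blocks until completion (mirroring GpuTaskCompletion::Wait vs ::Check).
    static void alternateWaitReduce(const std::function<bool(bool)>& pmeDone,
                                    const std::function<bool(bool)>& nbDone,
                                    const std::function<void()>&     reducePme,
                                    const std::function<void()>&     reduceNb)
    {
        bool pmeFinished = false;
        bool nbFinished  = false;
        while (!pmeFinished || !nbFinished)
        {
            if (!pmeFinished && pmeDone(/* wait = */ nbFinished))
            {
                pmeFinished = true;
                reducePme(); // overlaps with the still-running non-bonded task
            }
            if (!nbFinished && nbDone(/* wait = */ pmeFinished))
            {
                nbFinished = true;
                reduceNb(); // overlaps with the still-running PME task
            }
        }
    }
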
+
859
+ /*! \brief Set up the different force buffers; also does clearing.
860
+ *
861
+ * \param[in] forceHelperBuffers Helper force buffers
862
+ * \param[in] force force array
863
+ * \param[in] domainWork Domain lifetime workload flags
864
+ * \param[in] stepWork Step schedule flags
865
+ * \param[in] havePpDomainDecomposition Whether we have a PP domain decomposition
866
+ * \param[out] wcycle wallcycle recording structure
867
+ *
868
+ * \returns Cleared force output structure
869
+ */
870
+ static ForceOutputs setupForceOutputs(ForceHelperBuffers* forceHelperBuffers,
871
+ gmx::ArrayRefWithPadding<gmx::RVec> force,
872
+ const DomainLifetimeWorkload& domainWork,
873
+ const StepWorkload& stepWork,
874
+ const bool havePpDomainDecomposition,
875
+ gmx_wallcycle* wcycle)
876
+ {
877
+ /* NOTE: We assume fr->shiftForces is all zeros here */
878
+ gmx::ForceWithShiftForces forceWithShiftForces(
879
+ force, stepWork.computeVirial, forceHelperBuffers->shiftForces());
880
+
881
+ if (stepWork.computeForces
882
+ && (domainWork.haveCpuLocalForceWork || !stepWork.useGpuFBufferOps
883
+ || (havePpDomainDecomposition && !stepWork.useGpuFHalo)))
884
+ {
885
+ wallcycle_sub_start(wcycle, WallCycleSubCounter::ClearForceBuffer);
886
+ /* Clear the short- and long-range forces */
887
+ clearRVecs(forceWithShiftForces.force(), true);
888
+
889
+ /* Clear the shift forces */
890
+ clearRVecs(forceWithShiftForces.shiftForces(), false);
891
+ wallcycle_sub_stop(wcycle, WallCycleSubCounter::ClearForceBuffer);
892
+ }
893
+
894
+ /* If we need to compute the virial, we might need a separate
895
+ * force buffer for algorithms for which the virial is calculated
896
+ * directly, such as PME. Otherwise, forceWithVirial uses the
897
+ * same force (f in legacy calls) buffer as other algorithms.
898
+ */
899
+ const bool useSeparateForceWithVirialBuffer =
900
+ (stepWork.computeForces
901
+ && (stepWork.computeVirial && forceHelperBuffers->haveDirectVirialContributions()));
902
+ /* forceWithVirial uses the local atom range only */
903
+ gmx::ForceWithVirial forceWithVirial(
904
+ useSeparateForceWithVirialBuffer ? forceHelperBuffers->forceBufferForDirectVirialContributions()
905
+ : force.unpaddedArrayRef(),
906
+ stepWork.computeVirial);
907
+
908
+ if (useSeparateForceWithVirialBuffer)
909
+ {
910
+ wallcycle_sub_start_nocount(wcycle, WallCycleSubCounter::ClearForceBuffer);
911
+ /* TODO: update comment
912
+ * We only compute forces on local atoms. Note that vsites can
913
+ * spread to non-local atoms, but that part of the buffer is
914
+ * cleared separately in the vsite spreading code.
915
+ */
916
+ clearRVecs(forceWithVirial.force_, true);
917
+ wallcycle_sub_stop(wcycle, WallCycleSubCounter::ClearForceBuffer);
918
+ }
919
+
920
+
921
+ return ForceOutputs(
922
+ forceWithShiftForces, forceHelperBuffers->haveDirectVirialContributions(), forceWithVirial);
923
+ }
924
+
925
+ /*! \brief Launch end-of-step GPU tasks: buffer clearing and rolling pruning.
926
+ *
927
+ */
928
+ static void launchGpuEndOfStepTasks(nonbonded_verlet_t* nbv,
929
+ gmx::ListedForcesGpu* listedForcesGpu,
930
+ gmx_pme_t* pmedata,
931
+ gmx_enerdata_t* enerd,
932
+ const gmx::MdrunScheduleWorkload& runScheduleWork,
933
+ int64_t step,
934
+ gmx_wallcycle* wcycle)
935
+ {
936
+ if (runScheduleWork.simulationWork.useGpuNonbonded && runScheduleWork.stepWork.computeNonbondedForces)
937
+ {
938
+ /* Launch pruning before buffer clearing because the API overhead of the
939
+ * clear kernel launches can leave the GPU idle while it could be running
940
+ * the prune kernel.
941
+ */
942
+ if (nbv->isDynamicPruningStepGpu(step))
943
+ {
944
+ nbv->dispatchPruneKernelGpu(step);
945
+ }
946
+
947
+ /* now clear the GPU outputs while we finish the step on the CPU */
948
+ wallcycle_start_nocount(wcycle, WallCycleCounter::LaunchGpuPp);
949
+ wallcycle_sub_start_nocount(wcycle, WallCycleSubCounter::LaunchGpuNonBonded);
950
+ Nbnxm::gpu_clear_outputs(nbv->gpuNbv(), runScheduleWork.stepWork.computeVirial);
951
+ wallcycle_sub_stop(wcycle, WallCycleSubCounter::LaunchGpuNonBonded);
952
+ wallcycle_stop(wcycle, WallCycleCounter::LaunchGpuPp);
953
+ }
954
+
955
+ if (runScheduleWork.stepWork.haveGpuPmeOnThisRank)
956
+ {
957
+ wallcycle_start_nocount(wcycle, WallCycleCounter::PmeGpuMesh);
958
+ bool gpuGraphWithSeparatePmeRank = false;
959
+ pme_gpu_reinit_computation(pmedata, gpuGraphWithSeparatePmeRank, wcycle);
960
+ wallcycle_stop(wcycle, WallCycleCounter::PmeGpuMesh);
961
+ }
962
+
963
+ if (runScheduleWork.domainWork.haveGpuBondedWork && runScheduleWork.stepWork.computeEnergy)
964
+ {
965
+ // in principle this should be included in the DD balancing region,
966
+ // but generally it is infrequent so we'll omit it for the sake of
967
+ // simpler code
968
+ listedForcesGpu->waitAccumulateEnergyTerms(enerd);
969
+
970
+ listedForcesGpu->clearEnergies();
971
+ }
972
+ }
973
+
974
+ /*! \brief Compute the number of times the "local coordinates ready on device" GPU event will be used as a synchronization point.
975
+ *
976
+ * When some work is offloaded to GPU, force calculation should wait for the atom coordinates to
977
+ * be ready on the device. The coordinates can come either from H2D copy at the beginning of the step,
978
+ * or from the GPU integration at the end of the previous step.
979
+ *
980
+ * In GROMACS, we usually follow the "mark once - wait once" approach. But this event is "consumed"
981
+ * (that is, waited upon either on host or on the device) multiple times, since many tasks
982
+ * in different streams depend on the coordinates.
983
+ *
984
+ * This function returns the number of times the event will be consumed based on this step's workload.
985
+ *
986
+ * \param simulationWork Simulation workload flags.
987
+ * \param stepWork Step workload flags.
988
+ * \param pmeSendCoordinatesFromGpu Whether peer-to-peer communication is used for PME coordinates.
989
+ * \return The expected number of consumers of the event for this step's workload.
990
+ */
991
+ static int getExpectedLocalXReadyOnDeviceConsumptionCount(gmx_used_in_debug const SimulationWorkload& simulationWork,
992
+ const StepWorkload& stepWork,
993
+ bool pmeSendCoordinatesFromGpu)
994
+ {
995
+ int result = 0;
996
+ if (stepWork.computeSlowForces)
997
+ {
998
+ if (pmeSendCoordinatesFromGpu)
999
+ {
1000
+ GMX_ASSERT(simulationWork.haveSeparatePmeRank,
1001
+ "GPU PME PP communications require having a separate PME rank");
1002
+ // Event is consumed by gmx_pme_send_coordinates for GPU PME PP Communications
1003
+ result++;
1004
+ }
1005
+ if (stepWork.haveGpuPmeOnThisRank)
1006
+ {
1007
+ // Event is consumed by launchPmeGpuSpread
1008
+ result++;
1009
+ }
1010
+ if (stepWork.computeNonbondedForces && stepWork.useGpuXBufferOps)
1011
+ {
1012
+ // Event is consumed by convertCoordinatesGpu
1013
+ result++;
1014
+ }
1015
+ }
1016
+ if (stepWork.useGpuXHalo)
1017
+ {
1018
+ // Event is consumed by communicateGpuHaloCoordinates
1019
+ result++;
1020
+ if (GMX_THREAD_MPI) // Issue #4262
1021
+ {
1022
+ result++;
1023
+ }
1024
+ }
1025
+ if (stepWork.clearGpuFBufferEarly && simulationWork.useGpuUpdate)
1026
+ {
1027
+ // Event is consumed by force clearing which waits for the update to complete
1028
+ result++;
1029
+ }
1030
+ return result;
1031
+ }
1032
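
As a concrete reading of the counting above: a single-rank slow-force step with PME on this rank and GPU X buffer ops, but without GPU halo exchange or early force clearing, yields a count of 2 (one consumer in launchPmeGpuSpread, one in convertCoordinatesGpu). A restatement with plain booleans, as a hypothetical helper that simply mirrors the flags consulted above:

    // Hypothetical restatement, not part of the change; the flags mirror the
    // StepWorkload/SimulationWorkload fields used above.
    static int expectedXReadyConsumers(bool computeSlowForces,
                                       bool pmeSendCoordinatesFromGpu,
                                       bool haveGpuPmeOnThisRank,
                                       bool nbWithGpuXBufferOps, // computeNonbondedForces && useGpuXBufferOps
                                       bool useGpuXHalo,
                                       bool threadMpi,
                                       bool clearFBufferEarlyWithGpuUpdate)
    {
        int n = 0;
        if (computeSlowForces)
        {
            n += pmeSendCoordinatesFromGpu ? 1 : 0;  // gmx_pme_send_coordinates
            n += haveGpuPmeOnThisRank ? 1 : 0;       // launchPmeGpuSpread
            n += nbWithGpuXBufferOps ? 1 : 0;        // convertCoordinatesGpu
        }
        if (useGpuXHalo)
        {
            n += threadMpi ? 2 : 1;                  // communicateGpuHaloCoordinates
        }
        n += clearFBufferEarlyWithGpuUpdate ? 1 : 0; // early force clearing
        return n;
    }
    // expectedXReadyConsumers(true, false, true, true, false, false, false) == 2
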
+
1033
+ /*! \brief Compute the number of times the "local forces ready on device" GPU event will be used as a synchronization point.
1034
+ *
1035
+ * In GROMACS, we usually follow the "mark once - wait once" approach. But this event is "consumed"
1036
+ * (that is, waited upon either on host or on the device) multiple times, since many tasks
1037
+ * in different streams depend on the local forces.
1038
+ *
1039
+ * \param simulationWork Simulation workload flags.
1040
+ * \param domainWork Domain workload flags.
1041
+ * \param stepWork Step workload flags.
1042
+ * \param useOrEmulateGpuNb Whether GPU non-bonded calculations are used or emulated.
1043
+ * \param alternateGpuWait Whether alternating wait/reduce scheme is used.
1044
+ * \return The number of times the event will be consumed based on this step's workload.
1045
+ */
1046
+ static int getExpectedLocalFReadyOnDeviceConsumptionCount(const SimulationWorkload& simulationWork,
1047
+ const DomainLifetimeWorkload& domainWork,
1048
+ const StepWorkload& stepWork,
1049
+ bool useOrEmulateGpuNb,
1050
+ bool alternateGpuWait)
1051
+ {
1052
+ int counter = 0;
1053
+ bool eventUsedInGpuForceReduction =
1054
+ (domainWork.haveCpuLocalForceWork
1055
+ || (simulationWork.havePpDomainDecomposition && !simulationWork.useGpuHaloExchange));
1056
+ bool gpuForceReductionUsed = useOrEmulateGpuNb && !alternateGpuWait && stepWork.useGpuFBufferOps
1057
+ && stepWork.computeNonbondedForces;
1058
+ if (gpuForceReductionUsed && eventUsedInGpuForceReduction)
1059
+ {
1060
+ counter++;
1061
+ }
1062
+ bool gpuForceHaloUsed = simulationWork.havePpDomainDecomposition && stepWork.computeForces
1063
+ && stepWork.useGpuFHalo;
1064
+ if (gpuForceHaloUsed)
1065
+ {
1066
+ counter++;
1067
+ }
1068
+ return counter;
1069
+ }
1070
+
1071
+ //! \brief Data structure to hold dipole-related data and staging arrays
1072
+ struct DipoleData
1073
+ {
1074
+ //! Dipole staging for fast summing over MPI
1075
+ gmx::DVec muStaging[2] = { { 0.0, 0.0, 0.0 } };
1076
+ //! Dipole staging for states A and B (index 0 and 1 resp.)
1077
+ gmx::RVec muStateAB[2] = { { 0.0_real, 0.0_real, 0.0_real } };
1078
+ };
1079
+
1080
+
1081
+ static void reduceAndUpdateMuTot(DipoleData* dipoleData,
1082
+ const t_commrec* cr,
1083
+ const bool haveFreeEnergy,
1084
+ gmx::ArrayRef<const real> lambda,
1085
+ rvec muTotal,
1086
+ const DDBalanceRegionHandler& ddBalanceRegionHandler)
1087
+ {
1088
+ if (PAR(cr))
1089
+ {
1090
+ gmx_sumd(2 * DIM, dipoleData->muStaging[0], cr);
1091
+ ddBalanceRegionHandler.reopenRegionCpu();
1092
+ }
1093
+ for (int i = 0; i < 2; i++)
1094
+ {
1095
+ for (int j = 0; j < DIM; j++)
1096
+ {
1097
+ dipoleData->muStateAB[i][j] = dipoleData->muStaging[i][j];
1098
+ }
1099
+ }
1100
+
1101
+ if (!haveFreeEnergy)
1102
+ {
1103
+ copy_rvec(dipoleData->muStateAB[0], muTotal);
1104
+ }
1105
+ else
1106
+ {
1107
+ for (int j = 0; j < DIM; j++)
1108
+ {
1109
+ muTotal[j] = (1.0 - lambda[static_cast<int>(FreeEnergyPerturbationCouplingType::Coul)])
1110
+ * dipoleData->muStateAB[0][j]
1111
+ + lambda[static_cast<int>(FreeEnergyPerturbationCouplingType::Coul)]
1112
+ * dipoleData->muStateAB[1][j];
1113
+ }
1114
+ }
1115
+ }
1116
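
With free-energy perturbation active, the branch above is a linear interpolation of the two end-state dipoles at the Coulomb lambda, mu_total = (1 - lambda) * mu_A + lambda * mu_B. A minimal standalone sketch using plain arrays instead of the GROMACS vector types:

    #include <array>

    using Vec3 = std::array<double, 3>;

    // Interpolate the total dipole between states A and B at the Coulomb lambda.
    static Vec3 interpolateDipole(const Vec3& muA, const Vec3& muB, double lambdaCoul)
    {
        Vec3 mu{};
        for (int d = 0; d < 3; d++)
        {
            mu[d] = (1.0 - lambdaCoul) * muA[d] + lambdaCoul * muB[d];
        }
        return mu;
    }
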
+
1117
+ /*! \brief Combines MTS level0 and level1 force buffers into a full and MTS-combined force buffer.
1118
+ *
1119
+ * \param[in] numAtoms The number of atoms to combine forces for
1120
+ * \param[in,out] forceMtsLevel0 Input: F_level0, output: F_level0 + F_level1
1121
+ * \param[in,out] forceMts Input: F_level1, output: F_level0 + mtsFactor * F_level1
1122
+ * \param[in] mtsFactor The factor between the level0 and level1 time step
1123
+ */
1124
+ static void combineMtsForces(const int numAtoms,
1125
+ ArrayRef<RVec> forceMtsLevel0,
1126
+ ArrayRef<RVec> forceMts,
1127
+ const real mtsFactor)
1128
+ {
1129
+ const int gmx_unused numThreads = gmx_omp_nthreads_get(ModuleMultiThread::Default);
1130
+ #pragma omp parallel for num_threads(numThreads) schedule(static)
1131
+ for (int i = 0; i < numAtoms; i++)
1132
+ {
1133
+ const RVec forceMtsLevel0Tmp = forceMtsLevel0[i];
1134
+ forceMtsLevel0[i] += forceMts[i];
1135
+ forceMts[i] = forceMtsLevel0Tmp + mtsFactor * forceMts[i];
1136
+ }
1137
+ }
1138
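
A small worked example of the combination above, assuming mtsFactor = 2 and x-components F_level0 = 1 and F_level1 = 3: the loop leaves forceMtsLevel0 at 4 (= F_level0 + F_level1) and forceMts at 7 (= F_level0 + mtsFactor * F_level1), the MTS-combined value:

    int main()
    {
        const double mtsFactor = 2.0;      // assumed level0/level1 time-step ratio
        double level0 = 1.0, level1 = 3.0; // assumed x-components of F_level0, F_level1
        const double tmp = level0;
        level0 += level1;                  // 4.0 == F_level0 + F_level1
        level1 = tmp + mtsFactor * level1; // 7.0 == F_level0 + mtsFactor * F_level1
        return 0;
    }
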
+
1139
+ /*! \brief Setup for the local GPU force reduction:
1140
+ * reinitialization plus the registration of forces and dependencies.
1141
+ *
1142
+ * \param [in] runScheduleWork Schedule workload flag structure
1143
+ * \param [in] nbv Non-bonded Verlet object
1144
+ * \param [in] stateGpu GPU state propagator object
1145
+ * \param [in] gpuForceReduction GPU force reduction object
1146
+ * \param [in] pmePpCommGpu PME-PP GPU communication object
1147
+ * \param [in] pmedata PME data object
1148
+ * \param [in] dd Domain decomposition object
1149
+ */
1150
+ static void setupLocalGpuForceReduction(const gmx::MdrunScheduleWorkload& runScheduleWork,
1151
+ nonbonded_verlet_t* nbv,
1152
+ gmx::StatePropagatorDataGpu* stateGpu,
1153
+ gmx::GpuForceReduction* gpuForceReduction,
1154
+ gmx::PmePpCommGpu* pmePpCommGpu,
1155
+ const gmx_pme_t* pmedata,
1156
+ const gmx_domdec_t* dd)
1157
+ {
1158
+ GMX_ASSERT(!runScheduleWork.simulationWork.useMts,
1159
+ "GPU force reduction is not compatible with MTS");
1160
+
1161
+ // (re-)initialize local GPU force reduction
1162
+ const bool accumulate = runScheduleWork.domainWork.haveCpuLocalForceWork
1163
+ || runScheduleWork.simulationWork.havePpDomainDecomposition;
1164
+ const int atomStart = 0;
1165
+ gpuForceReduction->reinit(stateGpu->getForces(),
1166
+ nbv->getNumAtoms(AtomLocality::Local),
1167
+ nbv->getGridIndices(),
1168
+ atomStart,
1169
+ accumulate,
1170
+ stateGpu->fReducedOnDevice(AtomLocality::Local));
1171
+
1172
+ // register forces and add dependencies
1173
+ gpuForceReduction->registerNbnxmForce(Nbnxm::gpu_get_f(nbv->gpuNbv()));
1174
+
1175
+ DeviceBuffer<gmx::RVec> pmeForcePtr;
1176
+ GpuEventSynchronizer* pmeSynchronizer = nullptr;
1177
+ bool havePmeContribution = false;
1178
+
1179
+ if (runScheduleWork.simulationWork.haveGpuPmeOnPpRank())
1180
+ {
1181
+ pmeForcePtr = pme_gpu_get_device_f(pmedata);
1182
+ if (pmeForcePtr)
1183
+ {
1184
+ pmeSynchronizer = pme_gpu_get_f_ready_synchronizer(pmedata);
1185
+ havePmeContribution = true;
1186
+ }
1187
+ }
1188
+ else if (runScheduleWork.simulationWork.useGpuPmePpCommunication)
1189
+ {
1190
+ pmeForcePtr = pmePpCommGpu->getGpuForceStagingPtr();
1191
+ GMX_ASSERT(pmeForcePtr, "PME force for reduction has no data");
1192
+ if (GMX_THREAD_MPI)
1193
+ {
1194
+ pmeSynchronizer = pmePpCommGpu->getForcesReadySynchronizer();
1195
+ }
1196
+ havePmeContribution = true;
1197
+ }
1198
+
1199
+ if (havePmeContribution)
1200
+ {
1201
+ gpuForceReduction->registerRvecForce(pmeForcePtr);
1202
+ if (runScheduleWork.simulationWork.useNvshmem)
1203
+ {
1204
+ DeviceBuffer<uint64_t> forcesReadyNvshmemFlags = pmePpCommGpu->getGpuForcesSyncObj();
1205
+ gpuForceReduction->registerForcesReadyNvshmemFlags(forcesReadyNvshmemFlags);
1206
+ }
1207
+
1208
+ if (!runScheduleWork.simulationWork.useGpuPmePpCommunication || GMX_THREAD_MPI)
1209
+ {
1210
+ GMX_ASSERT(pmeSynchronizer != nullptr, "PME force ready cuda event should not be NULL");
1211
+ gpuForceReduction->addDependency(pmeSynchronizer);
1212
+ }
1213
+ }
1214
+
1215
+ if (runScheduleWork.domainWork.haveCpuLocalForceWork
1216
+ || (runScheduleWork.simulationWork.havePpDomainDecomposition
1217
+ && !runScheduleWork.simulationWork.useGpuHaloExchange))
1218
+ {
1219
+ gpuForceReduction->addDependency(stateGpu->fReadyOnDevice(AtomLocality::Local));
1220
+ }
1221
+
1222
+ if (runScheduleWork.simulationWork.useGpuHaloExchange)
1223
+ {
1224
+ gpuForceReduction->addDependency(dd->gpuHaloExchange[0][0]->getForcesReadyOnDeviceEvent());
1225
+ }
1226
+ }
1227
+
1228
+ /*! \brief Setup for the non-local GPU force reduction:
1229
+ * reinitialization plus the registration of forces and dependencies.
1230
+ *
1231
+ * \param [in] runScheduleWork Schedule workload flag structure
1232
+ * \param [in] nbv Non-bonded Verlet object
1233
+ * \param [in] stateGpu GPU state propagator object
1234
+ * \param [in] gpuForceReduction GPU force reduction object
1235
+ * \param [in] dd Domain decomposition object
1236
+ */
1237
+ static void setupNonLocalGpuForceReduction(const gmx::MdrunScheduleWorkload& runScheduleWork,
1238
+ nonbonded_verlet_t* nbv,
1239
+ gmx::StatePropagatorDataGpu* stateGpu,
1240
+ gmx::GpuForceReduction* gpuForceReduction,
1241
+ const gmx_domdec_t* dd)
1242
+ {
1243
+ // (re-)initialize non-local GPU force reduction
1244
+ const bool accumulate = runScheduleWork.domainWork.haveCpuNonLocalForceWork;
1245
+ const int atomStart = dd_numHomeAtoms(*dd);
1246
+ gpuForceReduction->reinit(stateGpu->getForces(),
1247
+ nbv->getNumAtoms(AtomLocality::NonLocal),
1248
+ nbv->getGridIndices(),
1249
+ atomStart,
1250
+ accumulate,
1251
+ stateGpu->fReducedOnDevice(AtomLocality::NonLocal));
1252
+
1253
+ // register forces and add dependencies
1254
+ gpuForceReduction->registerNbnxmForce(Nbnxm::gpu_get_f(nbv->gpuNbv()));
1255
+
1256
+ if (runScheduleWork.domainWork.haveCpuNonLocalForceWork)
1257
+ {
1258
+ gpuForceReduction->addDependency(stateGpu->fReadyOnDevice(AtomLocality::NonLocal));
1259
+ }
1260
+ }
1261
+
1262
+
1263
+ /*! \brief Return the number of local atoms.
1264
+ */
1265
+ static int getLocalAtomCount(const gmx_domdec_t* dd, const t_mdatoms& mdatoms, bool havePPDomainDecomposition)
1266
+ {
1267
+ GMX_ASSERT(!(havePPDomainDecomposition && (dd == nullptr)),
1268
+ "Can't have PP decomposition with dd uninitialized!");
1269
+ return havePPDomainDecomposition ? dd_numAtomsZones(*dd) : mdatoms.homenr;
1270
+ }
1271
+
1272
+ /*! \brief Does pair search and closely related activities required on search steps.
1273
+ */
1274
+ static void doPairSearch(const t_commrec* cr,
1275
+ const t_inputrec& inputrec,
1276
+ const gmx::MDModulesNotifiers& mdModulesNotifiers,
1277
+ int64_t step,
1278
+ t_nrnb* nrnb,
1279
+ gmx_wallcycle* wcycle,
1280
+ const gmx_localtop_t& top,
1281
+ const matrix box,
1282
+ gmx::ArrayRefWithPadding<gmx::RVec> x,
1283
+ gmx::ArrayRef<gmx::RVec> v,
1284
+ const t_mdatoms& mdatoms,
1285
+ t_forcerec* fr,
1286
+ const gmx::MdrunScheduleWorkload& runScheduleWork)
1287
+ {
1288
+ nonbonded_verlet_t* nbv = fr->nbv.get();
1289
+
1290
+ gmx::StatePropagatorDataGpu* stateGpu = fr->stateGpu;
1291
+
1292
+ const SimulationWorkload& simulationWork = runScheduleWork.simulationWork;
1293
+ const StepWorkload& stepWork = runScheduleWork.stepWork;
1294
+
1295
+ if (gmx::needStateGpu(simulationWork))
1296
+ {
1297
+ // TODO refactor this to do_md, after partitioning.
1298
+ stateGpu->reinit(mdatoms.homenr,
1299
+ getLocalAtomCount(cr->dd, mdatoms, simulationWork.havePpDomainDecomposition));
1300
+ }
1301
+
1302
+ if (simulationWork.haveGpuPmeOnPpRank())
1303
+ {
1304
+ GMX_ASSERT(gmx::needStateGpu(simulationWork), "StatePropagatorDataGpu is needed");
1305
+ // TODO: This should be moved into PME setup function ( pme_gpu_prepare_computation(...) )
1306
+ pme_gpu_set_device_x(fr->pmedata, stateGpu->getCoordinates());
1307
+ }
1308
+
1309
+ if (fr->pbcType != PbcType::No)
1310
+ {
1311
+ const bool calcCGCM = (stepWork.stateChanged && !haveDDAtomOrdering(*cr));
1312
+ if (calcCGCM)
1313
+ {
1314
+ put_atoms_in_box_omp(fr->pbcType,
1315
+ box,
1316
+ fr->haveBoxDeformation,
1317
+ inputrec.deform,
1318
+ x.unpaddedArrayRef().subArray(0, mdatoms.homenr),
1319
+ v.empty() ? ArrayRef<RVec>() : v.subArray(0, mdatoms.homenr),
1320
+ gmx_omp_nthreads_get(ModuleMultiThread::Default));
1321
+ inc_nrnb(nrnb, eNR_SHIFTX, mdatoms.homenr);
1322
+ }
1323
+
1324
+ if (!haveDDAtomOrdering(*cr))
1325
+ {
1326
+ // Atoms might have changed periodic image, signal MDModules
1327
+ gmx::MDModulesAtomsRedistributedSignal mdModulesAtomsRedistributedSignal(
1328
+ box, x.unpaddedArrayRef().subArray(0, mdatoms.homenr));
1329
+ mdModulesNotifiers.simulationSetupNotifier_.notify(mdModulesAtomsRedistributedSignal);
1330
+ }
1331
+ }
1332
+
1333
+ if (fr->wholeMoleculeTransform && stepWork.stateChanged)
1334
+ {
1335
+ fr->wholeMoleculeTransform->updateForAtomPbcJumps(x.unpaddedArrayRef(), box);
1336
+ }
1337
+
1338
+ wallcycle_start(wcycle, WallCycleCounter::NS);
1339
+ if (!haveDDAtomOrdering(*cr))
1340
+ {
1341
+ const rvec vzero = { 0.0_real, 0.0_real, 0.0_real };
1342
+ const rvec boxDiagonal = { box[XX][XX], box[YY][YY], box[ZZ][ZZ] };
1343
+ wallcycle_sub_start(wcycle, WallCycleSubCounter::NBSGridLocal);
1344
+ nbv->putAtomsOnGrid(
1345
+ box, 0, vzero, boxDiagonal, nullptr, { 0, mdatoms.homenr }, -1, fr->atomInfo, x.unpaddedArrayRef(), 0, nullptr);
1346
+ wallcycle_sub_stop(wcycle, WallCycleSubCounter::NBSGridLocal);
1347
+ }
1348
+ else
1349
+ {
1350
+ wallcycle_sub_start(wcycle, WallCycleSubCounter::NBSGridNonLocal);
1351
+ nbnxn_put_on_grid_nonlocal(nbv, domdec_zones(cr->dd), fr->atomInfo, x.unpaddedArrayRef());
1352
+ wallcycle_sub_stop(wcycle, WallCycleSubCounter::NBSGridNonLocal);
1353
+ }
1354
+
1355
+ nbv->setAtomProperties(mdatoms.typeA, mdatoms.chargeA, fr->atomInfo);
1356
+
1357
+ wallcycle_stop(wcycle, WallCycleCounter::NS);
1358
+
1359
+ /* initialize the GPU nbnxm atom data and bonded data structures */
1360
+ if (simulationWork.useGpuNonbonded)
1361
+ {
1362
+ // Note: cycle counting covers only the non-bondeds; the GPU listed-forces module counts internally
1363
+ wallcycle_start_nocount(wcycle, WallCycleCounter::LaunchGpuPp);
1364
+ wallcycle_sub_start_nocount(wcycle, WallCycleSubCounter::LaunchGpuNonBonded);
1365
+ Nbnxm::gpu_init_atomdata(nbv->gpuNbv(), &nbv->nbat());
1366
+ wallcycle_sub_stop(wcycle, WallCycleSubCounter::LaunchGpuNonBonded);
1367
+ wallcycle_stop(wcycle, WallCycleCounter::LaunchGpuPp);
1368
+
1369
+ if (fr->listedForcesGpu)
1370
+ {
1371
+ /* Now that all atoms are on the grid, we can assign bonded
1372
+ * interactions to the GPU, where the grid order is
1373
+ * needed. Also the xq, f and fshift device buffers have
1374
+ * been reallocated if needed, so the bonded code can
1375
+ * learn about them. */
1376
+ // TODO the xq, f, and fshift buffers are now shared
1377
+ // resources, so they should be maintained by a
1378
+ // higher-level object than the nb module.
1379
+ fr->listedForcesGpu->updateInteractionListsAndDeviceBuffers(
1380
+ nbv->getGridIndices(), top.idef, Nbnxm::gpuGetNBAtomData(nbv->gpuNbv()));
1381
+ }
1382
+ }
1383
+
1384
+ wallcycle_start_nocount(wcycle, WallCycleCounter::NS);
1385
+ wallcycle_sub_start(wcycle, WallCycleSubCounter::NBSSearchLocal);
1386
+ /* Note that with a GPU the launch overhead of the list transfer is not timed separately */
1387
+ nbv->constructPairlist(InteractionLocality::Local, top.excls, step, nrnb);
1388
+
1389
+ nbv->setupGpuShortRangeWork(fr->listedForcesGpu.get(), InteractionLocality::Local);
1390
+
1391
+ wallcycle_sub_stop(wcycle, WallCycleSubCounter::NBSSearchLocal);
1392
+ wallcycle_stop(wcycle, WallCycleCounter::NS);
1393
+
1394
+ if (simulationWork.useGpuXBufferOpsWhenAllowed)
1395
+ {
1396
+ nbv->atomdata_init_copy_x_to_nbat_x_gpu();
1397
+ }
1398
+
1399
+ if (simulationWork.useGpuFBufferOpsWhenAllowed)
1400
+ {
1401
+ // with MPI, direct GPU communication, and separate PME ranks we need
1402
+ // gmx_pme_send_coordinates() to be called before we can set up force reduction
1403
+ bool delaySetupLocalGpuForceReduction = GMX_MPI && simulationWork.useGpuPmePpCommunication;
1404
+ if (!delaySetupLocalGpuForceReduction)
1405
+ {
1406
+ setupLocalGpuForceReduction(runScheduleWork,
1407
+ nbv,
1408
+ stateGpu,
1409
+ fr->gpuForceReduction[gmx::AtomLocality::Local].get(),
1410
+ fr->pmePpCommGpu.get(),
1411
+ fr->pmedata,
1412
+ cr->dd);
1413
+ }
1414
+
1415
+ if (simulationWork.havePpDomainDecomposition)
1416
+ {
1417
+ setupNonLocalGpuForceReduction(runScheduleWork,
1418
+ nbv,
1419
+ stateGpu,
1420
+ fr->gpuForceReduction[gmx::AtomLocality::NonLocal].get(),
1421
+ cr->dd);
1422
+ }
1423
+ }
1424
+
1425
+ /* do non-local pair search */
1426
+ if (simulationWork.havePpDomainDecomposition)
1427
+ {
1428
+ wallcycle_start_nocount(wcycle, WallCycleCounter::NS);
1429
+ wallcycle_sub_start(wcycle, WallCycleSubCounter::NBSSearchNonLocal);
1430
+ /* Note that with a GPU the launch overhead of the list transfer is not timed separately */
1431
+ nbv->constructPairlist(InteractionLocality::NonLocal, top.excls, step, nrnb);
1432
+
1433
+ nbv->setupGpuShortRangeWork(fr->listedForcesGpu.get(), InteractionLocality::NonLocal);
1434
+ wallcycle_sub_stop(wcycle, WallCycleSubCounter::NBSSearchNonLocal);
1435
+ wallcycle_stop(wcycle, WallCycleCounter::NS);
1436
+ // TODO refactor this GPU halo exchange re-initialisation
1437
+ // to the location in do_md where GPU halo exchange is
1438
+ // constructed at partitioning, after above stateGpu
1439
+ // re-initialization has similarly been refactored
1440
+ if (simulationWork.useGpuHaloExchange)
1441
+ {
1442
+ reinitGpuHaloExchange(*cr, stateGpu->getCoordinates(), stateGpu->getForces());
1443
+ }
1444
+ }
1445
+
1446
+ // With FEP we set up the reduction over threads for local+non-local simultaneously,
1447
+ // so we need to do that here after the local and non-local pairlist construction.
1448
+ if (fr->efep != FreeEnergyPerturbationType::No)
1449
+ {
1450
+ wallcycle_sub_start(wcycle, WallCycleSubCounter::NonbondedFep);
1451
+ nbv->setupFepThreadedForceBuffer(fr->natoms_force_constr);
1452
+ wallcycle_sub_stop(wcycle, WallCycleSubCounter::NonbondedFep);
1453
+ }
1454
+ }
1455
+
1456
+ void do_force(FILE* fplog,
1457
+ const t_commrec* cr,
1458
+ const gmx_multisim_t* ms,
1459
+ const t_inputrec& inputrec,
1460
+ const gmx::MDModulesNotifiers& mdModulesNotifiers,
1461
+ gmx::Awh* awh,
1462
+ gmx_enfrot* enforcedRotation,
1463
+ gmx::ImdSession* imdSession,
1464
+ pull_t* pull_work,
1465
+ int64_t step,
1466
+ t_nrnb* nrnb,
1467
+ gmx_wallcycle* wcycle,
1468
+ const gmx_localtop_t* top,
1469
+ const matrix box,
1470
+ gmx::ArrayRefWithPadding<gmx::RVec> x,
1471
+ gmx::ArrayRef<gmx::RVec> v,
1472
+ const history_t* hist,
1473
+ gmx::ForceBuffersView* forceView,
1474
+ tensor vir_force,
1475
+ const t_mdatoms* mdatoms,
1476
+ gmx_enerdata_t* enerd,
1477
+ gmx::ArrayRef<const real> lambda,
1478
+ t_forcerec* fr,
1479
+ const gmx::MdrunScheduleWorkload& runScheduleWork,
1480
+ gmx::VirtualSitesHandler* vsite,
1481
+ rvec muTotal,
1482
+ double t,
1483
+ gmx_edsam* ed,
1484
+ CpuPpLongRangeNonbondeds* longRangeNonbondeds,
1485
+ const DDBalanceRegionHandler& ddBalanceRegionHandler)
1486
+ {
1487
+ auto force = forceView->forceWithPadding();
1488
+ GMX_ASSERT(force.unpaddedArrayRef().ssize() >= fr->natoms_force_constr,
1489
+ "The size of the force buffer should be at least the number of atoms to compute "
1490
+ "forces for");
1491
+
1492
+ nonbonded_verlet_t* nbv = fr->nbv.get();
1493
+ interaction_const_t* ic = fr->ic.get();
1494
+
1495
+ gmx::StatePropagatorDataGpu* stateGpu = fr->stateGpu;
1496
+
1497
+ const SimulationWorkload& simulationWork = runScheduleWork.simulationWork;
1498
+
1499
+ const gmx::DomainLifetimeWorkload& domainWork = runScheduleWork.domainWork;
1500
+
1501
+ const StepWorkload& stepWork = runScheduleWork.stepWork;
1502
+
1503
+ if (stepWork.doNeighborSearch)
1504
+ {
1505
+ doPairSearch(cr, inputrec, mdModulesNotifiers, step, nrnb, wcycle, *top, box, x, v, *mdatoms, fr, runScheduleWork);
1506
+
1507
+ /* At a search step we need to start the first balancing region
1508
+ * somewhere early inside the step after communication during domain
1509
+ * decomposition (and not during the previous step as usual).
1510
+ */
1511
+ ddBalanceRegionHandler.openBeforeForceComputationCpu(DdAllowBalanceRegionReopen::yes);
1512
+ }
1513
+
1514
+ const bool pmeSendCoordinatesFromGpu =
1515
+ simulationWork.useGpuPmePpCommunication && !stepWork.doNeighborSearch;
1516
+ auto* localXReadyOnDevice = (stepWork.haveGpuPmeOnThisRank || stepWork.useGpuXBufferOps
1517
+ || simulationWork.useGpuUpdate || pmeSendCoordinatesFromGpu)
1518
+ ? stateGpu->getCoordinatesReadyOnDeviceEvent(
1519
+ AtomLocality::Local, simulationWork, stepWork)
1520
+ : nullptr;
1521
+
1522
+ if (stepWork.clearGpuFBufferEarly)
1523
+ {
1524
+ // GPU Force halo exchange will set a subset of local atoms with remote non-local data.
1525
+ // First clear local portion of force array, so that untouched atoms are zero.
1526
+ // The dependency for this is that forces from previous timestep have been consumed,
1527
+ // which is satisfied when localXReadyOnDevice has been marked for GPU update case.
1528
+ // For CPU update, the forces are consumed by the beginning of the step, so no extra sync needed.
1529
+ GpuEventSynchronizer* dependency = simulationWork.useGpuUpdate ? localXReadyOnDevice : nullptr;
1530
+ stateGpu->clearForcesOnGpu(AtomLocality::Local, dependency);
1531
+ }
1532
+
1533
+ clear_mat(vir_force);
1534
+
1535
+ if (fr->pbcType != PbcType::No)
1536
+ {
1537
+ /* Compute shift vectors every step,
1538
+ * because of pressure coupling or box deformation!
1539
+ */
1540
+ if (stepWork.haveDynamicBox && stepWork.stateChanged)
1541
+ {
1542
+ calc_shifts(box, fr->shift_vec);
1543
+ }
1544
+ }
1545
+ nbnxn_atomdata_copy_shiftvec(stepWork.haveDynamicBox, fr->shift_vec, &nbv->nbat());
1546
+
1547
+
1548
+ GMX_ASSERT(simulationWork.useGpuHaloExchange
1549
+ == ((cr->dd != nullptr) && (!cr->dd->gpuHaloExchange[0].empty())),
1550
+ "The GPU halo exchange is active, but it has not been constructed.");
1551
+
1552
+ bool gmx_used_in_debug haveCopiedXFromGpu = false;
1553
+ // Copy coordinate from the GPU if update is on the GPU and there
1554
+ // are forces to be computed on the CPU, or for the computation of
1555
+ // virial, or if host-side data will be transferred from this task
1556
+ // to a remote task for halo exchange or PME-PP communication. At
1557
+ // search steps the current coordinates are already on the host,
1558
+ // hence copy is not needed.
1559
+ if (simulationWork.useGpuUpdate && !stepWork.doNeighborSearch
1560
+ && (runScheduleWork.domainWork.haveCpuLocalForceWork || stepWork.computeVirial
1561
+ || simulationWork.useCpuPmePpCommunication || simulationWork.useCpuHaloExchange
1562
+ || simulationWork.computeMuTot))
1563
+ {
1564
+ stateGpu->copyCoordinatesFromGpu(x.unpaddedArrayRef(), AtomLocality::Local);
1565
+ haveCopiedXFromGpu = true;
1566
+ }
1567
+
1568
+ // Coordinates on the device are needed if PME or BufferOps are offloaded.
1569
+ // The local coordinates can be copied right away.
1570
+ // NOTE: Consider moving this copy to right after they are updated and constrained,
1571
+ // if the latter is not offloaded.
1572
+ if (stepWork.haveGpuPmeOnThisRank || stepWork.useGpuXBufferOps || pmeSendCoordinatesFromGpu)
1573
+ {
1574
+ GMX_ASSERT(stateGpu != nullptr, "stateGpu should not be null");
1575
+ const int expectedLocalXReadyOnDeviceConsumptionCount =
1576
+ getExpectedLocalXReadyOnDeviceConsumptionCount(
1577
+ simulationWork, stepWork, pmeSendCoordinatesFromGpu);
1578
+
1579
+ // We need to copy coordinates when:
1580
+ // 1. Update is not offloaded
1581
+ // 2. The buffers were reinitialized on search step
1582
+ if (!simulationWork.useGpuUpdate || stepWork.doNeighborSearch)
1583
+ {
1584
+ stateGpu->copyCoordinatesToGpu(x.unpaddedArrayRef(),
1585
+ AtomLocality::Local,
1586
+ expectedLocalXReadyOnDeviceConsumptionCount);
1587
+ }
1588
+ else if (simulationWork.useGpuUpdate)
1589
+ {
1590
+ stateGpu->setXUpdatedOnDeviceEventExpectedConsumptionCount(
1591
+ expectedLocalXReadyOnDeviceConsumptionCount);
1592
+ }
1593
+ }
1594
+
1595
+ if (stepWork.computePmeOnSeparateRank)
1596
+ {
1597
+ /* Send particle coordinates to the pme nodes */
1598
+ if (!pmeSendCoordinatesFromGpu && !stepWork.doNeighborSearch && simulationWork.useGpuUpdate)
1599
+ {
1600
+ GMX_ASSERT(haveCopiedXFromGpu,
1601
+ "a wait should only be triggered if copy has been scheduled");
1602
+ stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
1603
+ }
1604
+
1605
+ const bool reinitGpuPmePpComms =
1606
+ simulationWork.useGpuPmePpCommunication && stepWork.doNeighborSearch;
1607
+ gmx_pme_send_coordinates(fr,
1608
+ cr,
1609
+ box,
1610
+ x.unpaddedArrayRef(),
1611
+ lambda[static_cast<int>(FreeEnergyPerturbationCouplingType::Coul)],
1612
+ lambda[static_cast<int>(FreeEnergyPerturbationCouplingType::Vdw)],
1613
+ (stepWork.computeVirial || stepWork.computeEnergy),
1614
+ step,
1615
+ simulationWork.useGpuPmePpCommunication,
1616
+ reinitGpuPmePpComms,
1617
+ pmeSendCoordinatesFromGpu,
1618
+ stepWork.useGpuPmeFReduction,
1619
+ pmeSendCoordinatesFromGpu ? localXReadyOnDevice : nullptr,
1620
+ simulationWork.useMdGpuGraph,
1621
+ wcycle);
1622
+ }
1623
+
1624
+ if (simulationWork.useGpuFBufferOpsWhenAllowed && stepWork.doNeighborSearch)
1625
+ {
1626
+ // with MPI, direct GPU communication, and separate PME ranks we need
1627
+ // gmx_pme_send_coordinates() to be called before we can set up force reduction
1628
+ bool doSetupLocalGpuForceReduction = GMX_MPI && simulationWork.useGpuPmePpCommunication;
1629
+ if (doSetupLocalGpuForceReduction)
1630
+ {
1631
+ setupLocalGpuForceReduction(runScheduleWork,
1632
+ fr->nbv.get(),
1633
+ stateGpu,
1634
+ fr->gpuForceReduction[gmx::AtomLocality::Local].get(),
1635
+ fr->pmePpCommGpu.get(),
1636
+ fr->pmedata,
1637
+ cr->dd);
1638
+ }
1639
+ }
1640
+
1641
+ if (stepWork.haveGpuPmeOnThisRank)
1642
+ {
1643
+ launchPmeGpuSpread(fr->pmedata,
1644
+ box,
1645
+ stepWork,
1646
+ localXReadyOnDevice,
1647
+ lambda[static_cast<int>(FreeEnergyPerturbationCouplingType::Coul)],
1648
+ simulationWork.useMdGpuGraph,
1649
+ wcycle);
1650
+ }
1651
+
1652
+
1653
+ if (!stepWork.doNeighborSearch && !EI_TPI(inputrec.eI) && stepWork.computeNonbondedForces)
1654
+ {
1655
+ if (stepWork.useGpuXBufferOps)
1656
+ {
1657
+ GMX_ASSERT(stateGpu, "stateGpu should be valid when buffer ops are offloaded");
1658
+ nbv->convertCoordinatesGpu(AtomLocality::Local, stateGpu->getCoordinates(), localXReadyOnDevice);
1659
+ }
1660
+ else
1661
+ {
1662
+ if (simulationWork.useGpuUpdate)
1663
+ {
1664
+ GMX_ASSERT(stateGpu, "need a valid stateGpu object");
1665
+ GMX_ASSERT(haveCopiedXFromGpu,
1666
+ "a wait should only be triggered if copy has been scheduled");
1667
+ stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
1668
+ }
1669
+ nbv->convertCoordinates(AtomLocality::Local, x.unpaddedArrayRef());
1670
+ }
1671
+ }
1672
+
1673
+ if (simulationWork.useGpuNonbonded && (stepWork.computeNonbondedForces || domainWork.haveGpuBondedWork))
1674
+ {
1675
+ ddBalanceRegionHandler.openBeforeForceComputationGpu();
1676
+
1677
+ wallcycle_start(wcycle, WallCycleCounter::LaunchGpuPp);
1678
+ wallcycle_sub_start(wcycle, WallCycleSubCounter::LaunchGpuNonBonded);
1679
+ Nbnxm::gpu_upload_shiftvec(nbv->gpuNbv(), &nbv->nbat());
1680
+ if (!stepWork.useGpuXBufferOps)
1681
+ {
1682
+ Nbnxm::gpu_copy_xq_to_gpu(nbv->gpuNbv(), &nbv->nbat(), AtomLocality::Local);
1683
+ }
1684
+ wallcycle_sub_stop(wcycle, WallCycleSubCounter::LaunchGpuNonBonded);
1685
+ wallcycle_stop(wcycle, WallCycleCounter::LaunchGpuPp);
1686
+ // (with X buffer ops offloaded to the GPU on all but the search steps, the copy above is skipped)
1687
+
1688
+ // bonded work not split into separate local and non-local, so with DD
1689
+ // we can only launch the kernel after non-local coordinates have been received.
1690
+ if (domainWork.haveGpuBondedWork && !simulationWork.havePpDomainDecomposition)
1691
+ {
1692
+ fr->listedForcesGpu->setPbcAndlaunchKernel(fr->pbcType, box, fr->bMolPBC, stepWork);
1693
+ }
1694
+
1695
+ /* launch local nonbonded work on GPU */
1696
+ wallcycle_start_nocount(wcycle, WallCycleCounter::LaunchGpuPp);
1697
+ wallcycle_sub_start_nocount(wcycle, WallCycleSubCounter::LaunchGpuNonBonded);
1698
+ do_nb_verlet(fr, ic, enerd, stepWork, InteractionLocality::Local, enbvClearFNo, step, nrnb, wcycle);
1699
+ wallcycle_sub_stop(wcycle, WallCycleSubCounter::LaunchGpuNonBonded);
1700
+ wallcycle_stop(wcycle, WallCycleCounter::LaunchGpuPp);
1701
+ }
1702
+
1703
+ if (stepWork.haveGpuPmeOnThisRank)
1704
+ {
1705
+ // In PME GPU and mixed mode we launch FFT / gather after the
1706
+ // X copy/transform to allow overlap as well as after the GPU NB
1707
+ // launch to avoid FFT launch overhead hijacking the CPU and delaying
1708
+ // the nonbonded kernel.
1709
+ launchPmeGpuFftAndGather(fr->pmedata,
1710
+ lambda[static_cast<int>(FreeEnergyPerturbationCouplingType::Coul)],
1711
+ wcycle,
1712
+ stepWork);
1713
+ }
1714
+
1715
+ /* Communicate coordinates and sum dipole if necessary */
1716
+ if (simulationWork.havePpDomainDecomposition)
1717
+ {
1718
+ if (!stepWork.doNeighborSearch)
1719
+ {
1720
+ GpuEventSynchronizer* gpuCoordinateHaloLaunched = nullptr;
1721
+ if (stepWork.useGpuXHalo)
1722
+ {
1723
+ // The following must be called after local setCoordinates (which records an event
1724
+ // when the coordinate data has been copied to the device).
1725
+ gpuCoordinateHaloLaunched = communicateGpuHaloCoordinates(*cr, box, localXReadyOnDevice);
1726
+
1727
+ if (domainWork.haveCpuNonLocalForceWork)
1728
+ {
1729
+ // non-local part of coordinate buffer must be copied back to host for CPU work
1730
+ stateGpu->copyCoordinatesFromGpu(
1731
+ x.unpaddedArrayRef(), AtomLocality::NonLocal, gpuCoordinateHaloLaunched);
1732
+ }
1733
+ }
1734
+ else
1735
+ {
1736
+ if (simulationWork.useGpuUpdate)
1737
+ {
1738
+ GMX_ASSERT(haveCopiedXFromGpu,
1739
+ "a wait should only be triggered if copy has been scheduled");
1740
+ const bool haveAlreadyWaited =
1741
+ (stepWork.computePmeOnSeparateRank && !pmeSendCoordinatesFromGpu);
1742
+ if (!haveAlreadyWaited)
1743
+ {
1744
+ stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
1745
+ }
1746
+ }
1747
+ dd_move_x(cr->dd, box, x.unpaddedArrayRef(), wcycle);
1748
+ }
1749
+
1750
+ if (stepWork.useGpuXBufferOps)
1751
+ {
1752
+ if (!stepWork.useGpuXHalo)
1753
+ {
1754
+ stateGpu->copyCoordinatesToGpu(x.unpaddedArrayRef(), AtomLocality::NonLocal);
1755
+ }
1756
+ GpuEventSynchronizer* xReadyOnDeviceEvent = stateGpu->getCoordinatesReadyOnDeviceEvent(
1757
+ AtomLocality::NonLocal, simulationWork, stepWork, gpuCoordinateHaloLaunched);
1758
+ if (stepWork.useGpuXHalo && domainWork.haveCpuNonLocalForceWork)
1759
+ {
1760
+ /* We already enqueued an event for Gpu Halo exchange completion into the
1761
+ * NonLocal stream when D2H copying the coordinates. */
1762
+ xReadyOnDeviceEvent = nullptr;
1763
+ }
1764
+ nbv->convertCoordinatesGpu(
1765
+ AtomLocality::NonLocal, stateGpu->getCoordinates(), xReadyOnDeviceEvent);
1766
+ }
1767
+ else
1768
+ {
1769
+ nbv->convertCoordinates(AtomLocality::NonLocal, x.unpaddedArrayRef());
1770
+ }
1771
+ }
1772
+
1773
+ if (simulationWork.useGpuNonbonded)
1774
+ {
1775
+
1776
+ if (!stepWork.useGpuXBufferOps)
1777
+ {
1778
+ wallcycle_start(wcycle, WallCycleCounter::LaunchGpuPp);
1779
+ wallcycle_sub_start(wcycle, WallCycleSubCounter::LaunchGpuNonBonded);
1780
+ Nbnxm::gpu_copy_xq_to_gpu(nbv->gpuNbv(), &nbv->nbat(), AtomLocality::NonLocal);
1781
+ wallcycle_sub_stop(wcycle, WallCycleSubCounter::LaunchGpuNonBonded);
1782
+ wallcycle_stop(wcycle, WallCycleCounter::LaunchGpuPp);
1783
+ }
1784
+
1785
+ if (domainWork.haveGpuBondedWork)
1786
+ {
1787
+ fr->listedForcesGpu->setPbcAndlaunchKernel(fr->pbcType, box, fr->bMolPBC, stepWork);
1788
+ }
1789
+
1790
+ /* launch non-local nonbonded tasks on GPU */
1791
+ wallcycle_start_nocount(wcycle, WallCycleCounter::LaunchGpuPp);
1792
+ wallcycle_sub_start(wcycle, WallCycleSubCounter::LaunchGpuNonBonded);
1793
+ do_nb_verlet(fr, ic, enerd, stepWork, InteractionLocality::NonLocal, enbvClearFNo, step, nrnb, wcycle);
1794
+ wallcycle_sub_stop(wcycle, WallCycleSubCounter::LaunchGpuNonBonded);
1795
+ wallcycle_stop(wcycle, WallCycleCounter::LaunchGpuPp);
1796
+ }
1797
+ }
1798
+
1799
+ if (simulationWork.useGpuNonbonded && stepWork.computeNonbondedForces)
1800
+ {
1801
+ /* launch D2H copy-back F */
1802
+ wallcycle_start_nocount(wcycle, WallCycleCounter::LaunchGpuPp);
1803
+ wallcycle_sub_start_nocount(wcycle, WallCycleSubCounter::LaunchGpuNonBonded);
1804
+
1805
+ if (simulationWork.havePpDomainDecomposition)
1806
+ {
1807
+ Nbnxm::gpu_launch_cpyback(nbv->gpuNbv(), &nbv->nbat(), stepWork, AtomLocality::NonLocal);
1808
+ }
1809
+ Nbnxm::gpu_launch_cpyback(nbv->gpuNbv(), &nbv->nbat(), stepWork, AtomLocality::Local);
1810
+ wallcycle_sub_stop(wcycle, WallCycleSubCounter::LaunchGpuNonBonded);
1811
+
1812
+ if (domainWork.haveGpuBondedWork && stepWork.computeEnergy)
1813
+ {
1814
+ fr->listedForcesGpu->launchEnergyTransfer();
1815
+ }
1816
+ wallcycle_stop(wcycle, WallCycleCounter::LaunchGpuPp);
1817
+ }
1818
+
1819
+ gmx::ArrayRef<const gmx::RVec> xWholeMolecules;
1820
+ if (fr->wholeMoleculeTransform)
1821
+ {
1822
+ xWholeMolecules = fr->wholeMoleculeTransform->wholeMoleculeCoordinates(x.unpaddedArrayRef(), box);
1823
+ }
1824
+
1825
+ // For the rest of the CPU tasks that depend on GPU-update produced coordinates,
1826
+ // this wait ensures that the D2H transfer is complete.
1827
+ if (simulationWork.useGpuUpdate && !stepWork.doNeighborSearch)
1828
+ {
1829
+ const bool needCoordsOnHost = (runScheduleWork.domainWork.haveCpuLocalForceWork
1830
+ || stepWork.computeVirial || simulationWork.computeMuTot);
1831
+ const bool haveAlreadyWaited =
1832
+ simulationWork.useCpuHaloExchange
1833
+ || (stepWork.computePmeOnSeparateRank && !pmeSendCoordinatesFromGpu);
1834
+ if (needCoordsOnHost && !haveAlreadyWaited)
1835
+ {
1836
+ GMX_ASSERT(haveCopiedXFromGpu,
1837
+ "a wait should only be triggered if copy has been scheduled");
1838
+ stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
1839
+ }
1840
+ }
1841
+
1842
+ DipoleData dipoleData;
1843
+
1844
+ if (simulationWork.computeMuTot)
1845
+ {
1846
+ const int start = 0;
1847
+
1848
+ /* Calculate total (local) dipole moment in a temporary common array.
1849
+ * This makes it possible to sum them over nodes faster.
1850
+ */
1851
+ gmx::ArrayRef<const gmx::RVec> xRef =
1852
+ (xWholeMolecules.empty() ? x.unpaddedArrayRef() : xWholeMolecules);
1853
+ calc_mu(start,
1854
+ mdatoms->homenr,
1855
+ xRef,
1856
+ mdatoms->chargeA,
1857
+ mdatoms->chargeB,
1858
+ mdatoms->nChargePerturbed != 0,
1859
+ dipoleData.muStaging[0],
1860
+ dipoleData.muStaging[1]);
1861
+
1862
+ reduceAndUpdateMuTot(
1863
+ &dipoleData, cr, (fr->efep != FreeEnergyPerturbationType::No), lambda, muTotal, ddBalanceRegionHandler);
1864
+ }
1865
+
1866
+ /* Reset energies */
1867
+ reset_enerdata(enerd);
1868
+
1869
+ if (haveDDAtomOrdering(*cr) && simulationWork.haveSeparatePmeRank)
1870
+ {
1871
+ wallcycle_start(wcycle, WallCycleCounter::PpDuringPme);
1872
+ dd_force_flop_start(cr->dd, nrnb);
1873
+ }
1874
+
1875
+ if (inputrec.bRot)
1876
+ {
1877
+ wallcycle_start(wcycle, WallCycleCounter::Rot);
1878
+ do_rotation(cr, enforcedRotation, box, x.unpaddedConstArrayRef(), t, step, stepWork.doNeighborSearch);
1879
+ wallcycle_stop(wcycle, WallCycleCounter::Rot);
1880
+ }
1881
+
1882
+ /* Start the force cycle counter.
1883
+ * Note that a different counter is used for dynamic load balancing.
1884
+ */
1885
+ wallcycle_start(wcycle, WallCycleCounter::Force);
1886
+
1887
+ /* Set up and clear force outputs:
1888
+ * forceOutMtsLevel0: everything except what is in the other two outputs
1889
+ * forceOutMtsLevel1: PME-mesh and listed-forces group 1
1890
+ * forceOutNonbonded: non-bonded forces
1891
+ * Without multiple time stepping all point to the same object.
1892
+ * With multiple time-stepping the use is different for MTS fast (level0 only) and slow steps.
1893
+ *
1894
+ * Note that CPU force buffer clearing needs to happen after the completion of the
1895
+ * previous step's CPU force H2D transfer (prior to force reduction).
1896
+ * In the current code this is ensured by the earlier waitCoordinatesReadyOnHost()
1897
+ * which is sufficient, but it is suboptimal as it prevents overlap of the force clearing
1898
+ * with independent GPU work (integration/constraints, x D2H copy).
1899
+ */
1900
+ ForceOutputs forceOutMtsLevel0 = setupForceOutputs(
1901
+ &fr->forceHelperBuffers[0], force, domainWork, stepWork, simulationWork.havePpDomainDecomposition, wcycle);
1902
+
1903
+ // Force output for MTS combined forces, only set at level1 MTS steps
1904
+ std::optional<ForceOutputs> forceOutMts =
1905
+ (simulationWork.useMts && stepWork.computeSlowForces)
1906
+ ? std::optional(setupForceOutputs(&fr->forceHelperBuffers[1],
1907
+ forceView->forceMtsCombinedWithPadding(),
1908
+ domainWork,
1909
+ stepWork,
1910
+ simulationWork.havePpDomainDecomposition,
1911
+ wcycle))
1912
+ : std::nullopt;
1913
+
1914
+ ForceOutputs* forceOutMtsLevel1 =
1915
+ simulationWork.useMts ? (stepWork.computeSlowForces ? &forceOutMts.value() : nullptr)
1916
+ : &forceOutMtsLevel0;
1917
+
1918
+ const bool nonbondedAtMtsLevel1 = runScheduleWork.simulationWork.computeNonbondedAtMtsLevel1;
1919
+
1920
+ ForceOutputs* forceOutNonbonded = nonbondedAtMtsLevel1 ? forceOutMtsLevel1 : &forceOutMtsLevel0;
1921
+
1922
+ if (inputrec.bPull && pull_have_constraint(*pull_work))
1923
+ {
1924
+ clear_pull_forces(pull_work);
1925
+ }
1926
+
1927
+ wallcycle_stop(wcycle, WallCycleCounter::Force);
1928
+
1929
+ /* We calculate the non-bonded forces, when done on the CPU, here.
1930
+ * We do this before calling do_force_lowlevel, because in that
1931
+ * function, the listed forces are calculated before PME, which
1932
+ * does communication. With this order, non-bonded and listed
1933
+ * force calculation imbalance can be balanced out by the domain
1934
+ * decomposition load balancing.
1935
+ */
1936
+
1937
+ const bool useOrEmulateGpuNb = simulationWork.useGpuNonbonded || fr->nbv->emulateGpu();
1938
+
1939
+ if (!useOrEmulateGpuNb)
1940
+ {
1941
+ wallcycle_start_nocount(wcycle, WallCycleCounter::Force);
1942
+ do_nb_verlet(fr, ic, enerd, stepWork, InteractionLocality::Local, enbvClearFYes, step, nrnb, wcycle);
1943
+ wallcycle_stop(wcycle, WallCycleCounter::Force);
1944
+ }
1945
+
1946
+ if (stepWork.useGpuXHalo && domainWork.haveCpuNonLocalForceWork)
1947
+ {
1948
+ /* Wait for non-local coordinate data to be copied from device */
1949
+ stateGpu->waitCoordinatesReadyOnHost(AtomLocality::NonLocal);
1950
+ }
1951
+
1952
+ wallcycle_start_nocount(wcycle, WallCycleCounter::Force);
1953
+ if (fr->efep != FreeEnergyPerturbationType::No && stepWork.computeNonbondedForces)
1954
+ {
1955
+ /* Calculate the local and non-local free energy interactions here.
1956
+ * Happens here on the CPU both with and without GPU.
1957
+ */
1958
+ nbv->dispatchFreeEnergyKernels(x,
1959
+ &forceOutNonbonded->forceWithShiftForces(),
1960
+ fr->use_simd_kernels,
1961
+ fr->ntype,
1962
+ *fr->ic,
1963
+ fr->shift_vec,
1964
+ fr->nbfp,
1965
+ fr->ljpme_c6grid,
1966
+ mdatoms->chargeA,
1967
+ mdatoms->chargeB,
1968
+ mdatoms->typeA,
1969
+ mdatoms->typeB,
1970
+ lambda,
1971
+ enerd,
1972
+ stepWork,
1973
+ nrnb);
1974
+ }
1975
+
1976
+ if (stepWork.computeNonbondedForces && !useOrEmulateGpuNb)
1977
+ {
1978
+ if (simulationWork.havePpDomainDecomposition)
1979
+ {
1980
+ do_nb_verlet(fr, ic, enerd, stepWork, InteractionLocality::NonLocal, enbvClearFNo, step, nrnb, wcycle);
1981
+ }
1982
+
1983
+ if (stepWork.computeForces)
1984
+ {
1985
+                 /* Add all the non-bonded forces to the normal force array.
1986
+ * This can be split into a local and a non-local part when overlapping
1987
+                  * communication with computation when using domain decomposition.
1988
+ */
1989
+ wallcycle_stop(wcycle, WallCycleCounter::Force);
1990
+ nbv->atomdata_add_nbat_f_to_f(AtomLocality::All,
1991
+ forceOutNonbonded->forceWithShiftForces().force());
1992
+ wallcycle_start_nocount(wcycle, WallCycleCounter::Force);
1993
+ }
1994
+
1995
+ /* If there are multiple fshift output buffers we need to reduce them */
1996
+ if (stepWork.computeVirial)
1997
+ {
1998
+ /* This is not in a subcounter because it takes a
1999
+ negligible and constant-sized amount of time */
2000
+ nbnxn_atomdata_add_nbat_fshift_to_fshift(
2001
+ nbv->nbat(), forceOutNonbonded->forceWithShiftForces().shiftForces());
2002
+ }
2003
+ }
2004
+
2005
+ // Compute wall interactions, when present.
2006
+ // Note: should be moved to special forces.
2007
+ if (inputrec.nwall && stepWork.computeNonbondedForces)
2008
+ {
2009
+ /* foreign lambda component for walls */
2010
+ real dvdl_walls = do_walls(inputrec,
2011
+ *fr,
2012
+ box,
2013
+ mdatoms->typeA,
2014
+ mdatoms->typeB,
2015
+ mdatoms->cENER,
2016
+ mdatoms->homenr,
2017
+ mdatoms->nPerturbed,
2018
+ x.unpaddedConstArrayRef(),
2019
+ &forceOutMtsLevel0.forceWithVirial(),
2020
+ lambda[static_cast<int>(FreeEnergyPerturbationCouplingType::Vdw)],
2021
+ enerd->grpp.energyGroupPairTerms[NonBondedEnergyTerms::LJSR],
2022
+ nrnb);
2023
+ enerd->dvdl_lin[FreeEnergyPerturbationCouplingType::Vdw] += dvdl_walls;
2024
+ }
2025
+
2026
+ if (stepWork.computeListedForces)
2027
+ {
2028
+ /* Check whether we need to take into account PBC in listed interactions */
2029
+ bool needMolPbc = false;
2030
+ for (const auto& listedForces : fr->listedForces)
2031
+ {
2032
+ if (listedForces.haveCpuListedForces(*fr->fcdata))
2033
+ {
2034
+ needMolPbc = fr->bMolPBC;
2035
+ }
2036
+ }
2037
+
2038
+ t_pbc pbc;
2039
+
2040
+ if (needMolPbc)
2041
+ {
2042
+ /* Since all atoms are in the rectangular or triclinic unit-cell,
2043
+ * only single box vector shifts (2 in x) are required.
2044
+ */
2045
+ set_pbc_dd(&pbc, fr->pbcType, haveDDAtomOrdering(*cr) ? cr->dd->numCells : nullptr, TRUE, box);
2046
+ }
2047
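As an illustrative aside (not part of the patched source): the set_pbc_dd() call above prepares the periodic-boundary information that the listed-force kernels use to take minimum-image distances. Independently of the GROMACS t_pbc API, the rectangular-box case the comment refers to reduces to shifting each displacement component by at most one box vector, as in this standalone sketch:

    #include <array>
    #include <cmath>

    // Editorial sketch: minimum-image displacement for a rectangular box.
    // Triclinic boxes additionally require combinations of box vectors,
    // which is what the t_pbc setup above encapsulates.
    std::array<double, 3> minimumImage(std::array<double, 3> dx,
                                       const std::array<double, 3>& boxDiagonal)
    {
        for (int d = 0; d < 3; d++)
        {
            dx[d] -= boxDiagonal[d] * std::round(dx[d] / boxDiagonal[d]);
        }
        return dx;
    }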
+
2048
+ for (int mtsIndex = 0; mtsIndex < (simulationWork.useMts && stepWork.computeSlowForces ? 2 : 1);
2049
+ mtsIndex++)
2050
+ {
2051
+ ListedForces& listedForces = fr->listedForces[mtsIndex];
2052
+ ForceOutputs& forceOut = (mtsIndex == 0 ? forceOutMtsLevel0 : *forceOutMtsLevel1);
2053
+ listedForces.calculate(wcycle,
2054
+ box,
2055
+ cr,
2056
+ ms,
2057
+ x,
2058
+ xWholeMolecules,
2059
+ fr->fcdata.get(),
2060
+ hist,
2061
+ &forceOut,
2062
+ fr,
2063
+ &pbc,
2064
+ enerd,
2065
+ nrnb,
2066
+ lambda,
2067
+ mdatoms->chargeA,
2068
+ mdatoms->chargeB,
2069
+ makeConstArrayRef(mdatoms->bPerturbed),
2070
+ mdatoms->cENER,
2071
+ mdatoms->nPerturbed,
2072
+ haveDDAtomOrdering(*cr) ? cr->dd->globalAtomIndices.data() : nullptr,
2073
+ stepWork);
2074
+ }
2075
+ }
2076
+
2077
+ if (stepWork.computeSlowForces)
2078
+ {
2079
+ longRangeNonbondeds->calculate(fr->pmedata,
2080
+ cr,
2081
+ x.unpaddedConstArrayRef(),
2082
+ &forceOutMtsLevel1->forceWithVirial(),
2083
+ enerd,
2084
+ box,
2085
+ lambda,
2086
+ dipoleData.muStateAB,
2087
+ stepWork,
2088
+ ddBalanceRegionHandler);
2089
+ }
2090
+
2091
+ /* PLUMED */
2092
+ if(plumedswitch){
2093
+ int plumedNeedsEnergy;
2094
+ plumed_cmd(plumedmain,"isEnergyNeeded",&plumedNeedsEnergy);
2095
+ if(!plumedNeedsEnergy) plumed_cmd(plumedmain,"performCalc",nullptr);
2096
+ }
2097
+ /* END PLUMED */
2098
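As an illustrative aside (not part of the patched source): the two plumed_cmd() calls above are the PLUMED hook at this point of the force schedule; the kernel is asked whether it needs the potential energy, and if not, the bias calculation is triggered immediately (otherwise it is deferred until the energy terms are available). A hedged sketch of the same command-string interface driven from a standalone program is shown below; the header location and the input file name are assumptions, and the keys used are the common ones documented by PLUMED.

    #include <vector>
    #include "Plumed.h" // assumed location of the PLUMED C/C++ wrapper header

    int main()
    {
        plumed p = plumed_create();

        int    natoms   = 2;
        double timestep = 0.002;
        plumed_cmd(p, "setNatoms", &natoms);
        plumed_cmd(p, "setMDEngine", "sketch");
        plumed_cmd(p, "setTimestep", &timestep);
        plumed_cmd(p, "setPlumedDat", "plumed.dat"); // hypothetical input file
        plumed_cmd(p, "init", nullptr);

        std::vector<double> pos(3 * natoms, 0.0), forces(3 * natoms, 0.0), masses(natoms, 1.0);
        double box[3][3]    = { { 3, 0, 0 }, { 0, 3, 0 }, { 0, 0, 3 } };
        double virial[3][3] = { { 0 } };
        int    step         = 0;

        plumed_cmd(p, "setStep", &step);
        plumed_cmd(p, "setPositions", pos.data());
        plumed_cmd(p, "setMasses", masses.data());
        plumed_cmd(p, "setBox", &box[0][0]);
        plumed_cmd(p, "setForces", forces.data());
        plumed_cmd(p, "setVirial", &virial[0][0]);
        plumed_cmd(p, "calc", nullptr); // bias forces are accumulated into 'forces'

        plumed_finalize(p);
        return 0;
    }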
+
2099
+ wallcycle_stop(wcycle, WallCycleCounter::Force);
2100
+
2101
+ // VdW dispersion correction, only computed on main rank to avoid double counting
2102
+ if ((stepWork.computeEnergy || stepWork.computeVirial) && fr->dispersionCorrection && MAIN(cr))
2103
+ {
2104
+ // Calculate long range corrections to pressure and energy
2105
+ const DispersionCorrection::Correction correction = fr->dispersionCorrection->calculate(
2106
+ box, lambda[static_cast<int>(FreeEnergyPerturbationCouplingType::Vdw)]);
2107
+
2108
+ if (stepWork.computeEnergy)
2109
+ {
2110
+ enerd->term[F_DISPCORR] = correction.energy;
2111
+ enerd->term[F_DVDL_VDW] += correction.dvdl;
2112
+ enerd->dvdl_lin[FreeEnergyPerturbationCouplingType::Vdw] += correction.dvdl;
2113
+ }
2114
+ if (stepWork.computeVirial)
2115
+ {
2116
+ correction.correctVirial(vir_force);
2117
+ enerd->term[F_PDISPCORR] = correction.pressure;
2118
+ }
2119
+ }
2120
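As an illustrative aside (not part of the patched source): in its simplest homogeneous-fluid form, the long-range dispersion ("tail") correction computed above comes from integrating the attractive -C6/r^6 term beyond the cutoff. The sketch below is the textbook energy expression only, not the DispersionCorrection implementation, which also produces pressure and dH/dlambda terms and handles modified potentials.

    // Editorial sketch: energy tail correction for a pure -C6/r^6 interaction
    // with cutoff rc, assuming a homogeneous system of numAtoms particles in
    // volume V with an average C6 coefficient:
    //   E_tail = -(2*pi/3) * N^2/V * <C6> / rc^3
    double dispersionEnergyTail(double numAtoms, double volume, double averageC6, double rc)
    {
        const double pi = 3.141592653589793;
        return -(2.0 * pi / 3.0) * numAtoms * numAtoms / volume * averageC6 / (rc * rc * rc);
    }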
+
2121
+ const bool needToReceivePmeResultsFromSeparateRank = (PAR(cr) && stepWork.computePmeOnSeparateRank);
2122
+ const bool needToReceivePmeResults =
2123
+ (stepWork.haveGpuPmeOnThisRank || needToReceivePmeResultsFromSeparateRank);
2124
+
2125
+ /* When running free energy perturbations steered by AWH and doing PME calculations on the
2126
+ * GPU we must wait for the PME calculation (dhdl) results to finish before sampling the
2127
+ * FEP dimension with AWH. */
2128
+ const bool needEarlyPmeResults = (awh != nullptr && awh->hasFepLambdaDimension() && needToReceivePmeResults
2129
+ && stepWork.computeEnergy && stepWork.computeSlowForces);
2130
+ if (needEarlyPmeResults)
2131
+ {
2132
+ if (stepWork.haveGpuPmeOnThisRank)
2133
+ {
2134
+ pmeGpuWaitAndReduce(fr->pmedata,
2135
+ stepWork,
2136
+ wcycle,
2137
+ &forceOutMtsLevel1->forceWithVirial(),
2138
+ enerd,
2139
+ lambda[static_cast<int>(FreeEnergyPerturbationCouplingType::Coul)]);
2140
+ }
2141
+ else if (needToReceivePmeResultsFromSeparateRank)
2142
+ {
2143
+ /* In case of node-splitting, the PP nodes receive the long-range
2144
+ * forces, virial and energy from the PME nodes here.
2145
+ */
2146
+ pme_receive_force_ener(fr,
2147
+ cr,
2148
+ &forceOutMtsLevel1->forceWithVirial(),
2149
+ enerd,
2150
+ simulationWork.useGpuPmePpCommunication,
2151
+ stepWork.useGpuPmeFReduction,
2152
+ wcycle);
2153
+ }
2154
+ }
2155
+
2156
+ computeSpecialForces(fplog,
2157
+ cr,
2158
+ inputrec,
2159
+ awh,
2160
+ enforcedRotation,
2161
+ imdSession,
2162
+ pull_work,
2163
+ step,
2164
+ t,
2165
+ wcycle,
2166
+ fr->forceProviders,
2167
+ box,
2168
+ x.unpaddedArrayRef(),
2169
+ mdatoms,
2170
+ lambda,
2171
+ stepWork,
2172
+ &forceOutMtsLevel0.forceWithVirial(),
2173
+ forceOutMtsLevel1 ? &forceOutMtsLevel1->forceWithVirial() : nullptr,
2174
+ enerd,
2175
+ ed,
2176
+ stepWork.doNeighborSearch);
2177
+
2178
+ if (simulationWork.havePpDomainDecomposition && stepWork.computeForces && stepWork.useGpuFHalo
2179
+ && domainWork.haveCpuLocalForceWork)
2180
+ {
2181
+ stateGpu->copyForcesToGpu(forceOutMtsLevel0.forceWithShiftForces().force(), AtomLocality::Local);
2182
+ }
2183
+
2184
+ GMX_ASSERT(!(nonbondedAtMtsLevel1 && stepWork.useGpuFBufferOps),
2185
+ "The schedule below does not allow for nonbonded MTS with GPU buffer ops");
2186
+ GMX_ASSERT(!(nonbondedAtMtsLevel1 && stepWork.useGpuFHalo),
2187
+ "The schedule below does not allow for nonbonded MTS with GPU halo exchange");
2188
+     // Will store the number of cycles spent waiting for the GPU, which
2189
+     // will later be used in the DLB accounting.
2190
+ float cycles_wait_gpu = 0;
2191
+ if (useOrEmulateGpuNb && stepWork.computeNonbondedForces)
2192
+ {
2193
+ auto& forceWithShiftForces = forceOutNonbonded->forceWithShiftForces();
2194
+
2195
+ /* wait for non-local forces (or calculate in emulation mode) */
2196
+ if (simulationWork.havePpDomainDecomposition)
2197
+ {
2198
+ if (simulationWork.useGpuNonbonded)
2199
+ {
2200
+ cycles_wait_gpu += Nbnxm::gpu_wait_finish_task(
2201
+ nbv->gpuNbv(),
2202
+ stepWork,
2203
+ AtomLocality::NonLocal,
2204
+ enerd->grpp.energyGroupPairTerms[NonBondedEnergyTerms::LJSR].data(),
2205
+ enerd->grpp.energyGroupPairTerms[NonBondedEnergyTerms::CoulombSR].data(),
2206
+ forceWithShiftForces.shiftForces(),
2207
+ wcycle);
2208
+ }
2209
+ else
2210
+ {
2211
+ wallcycle_start_nocount(wcycle, WallCycleCounter::Force);
2212
+ do_nb_verlet(
2213
+ fr, ic, enerd, stepWork, InteractionLocality::NonLocal, enbvClearFYes, step, nrnb, wcycle);
2214
+ wallcycle_stop(wcycle, WallCycleCounter::Force);
2215
+ }
2216
+
2217
+ if (stepWork.useGpuFBufferOps)
2218
+ {
2219
+ if (domainWork.haveCpuNonLocalForceWork)
2220
+ {
2221
+ stateGpu->copyForcesToGpu(forceOutMtsLevel0.forceWithShiftForces().force(),
2222
+ AtomLocality::NonLocal);
2223
+ }
2224
+
2225
+
2226
+ fr->gpuForceReduction[gmx::AtomLocality::NonLocal]->execute();
2227
+
2228
+ if (!stepWork.useGpuFHalo)
2229
+ {
2230
+ /* We don't explicitly wait for the forces to be reduced on device,
2231
+ * but wait for them to finish copying to CPU instead.
2232
+ * So, we manually consume the event, see Issue #3988. */
2233
+ stateGpu->consumeForcesReducedOnDeviceEvent(AtomLocality::NonLocal);
2234
+ // copy from GPU input for dd_move_f()
2235
+ stateGpu->copyForcesFromGpu(forceOutMtsLevel0.forceWithShiftForces().force(),
2236
+ AtomLocality::NonLocal);
2237
+ }
2238
+ }
2239
+ else
2240
+ {
2241
+ nbv->atomdata_add_nbat_f_to_f(AtomLocality::NonLocal, forceWithShiftForces.force());
2242
+ }
2243
+
2244
+ if (fr->nbv->emulateGpu() && stepWork.computeVirial)
2245
+ {
2246
+ nbnxn_atomdata_add_nbat_fshift_to_fshift(nbv->nbat(), forceWithShiftForces.shiftForces());
2247
+ }
2248
+ }
2249
+ }
2250
+
2251
+ /* Combining the forces for multiple time stepping before the halo exchange, when possible,
2252
+ * avoids an extra halo exchange (when DD is used) and post-processing step.
2253
+ */
2254
+ if (stepWork.combineMtsForcesBeforeHaloExchange)
2255
+ {
2256
+ wallcycle_start_nocount(wcycle, WallCycleCounter::Force);
2257
+ combineMtsForces(getLocalAtomCount(cr->dd, *mdatoms, simulationWork.havePpDomainDecomposition),
2258
+ force.unpaddedArrayRef(),
2259
+ forceView->forceMtsCombined(),
2260
+ inputrec.mtsLevels[1].stepFactor);
2261
+ wallcycle_stop(wcycle, WallCycleCounter::Force);
2262
+ }
2263
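As an illustrative aside (not part of the patched source): combineMtsForces() above merges the two force levels before the halo exchange. A hedged sketch of the kind of combination this implies is given below, assuming the slow (level-1) forces are applied only every stepFactor-th step and therefore enter scaled by stepFactor so that their time-averaged impulse is preserved; the real routine works on GROMACS RVec buffers and handles threading.

    #include <array>
    #include <cstddef>
    #include <vector>

    using Rvec = std::array<double, 3>;

    // Editorial sketch:  f <- f_fast + stepFactor * f_slow, per atom component.
    void combineMtsForcesSketch(std::vector<Rvec>&       forceFast,
                                const std::vector<Rvec>& forceSlow,
                                double                   stepFactor)
    {
        for (std::size_t i = 0; i < forceFast.size(); i++)
        {
            for (int d = 0; d < 3; d++)
            {
                forceFast[i][d] += stepFactor * forceSlow[i][d];
            }
        }
    }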
+
2264
+     // With both nonbonded and PME offloaded to a GPU on the same rank, we use
2265
+ // an alternating wait/reduction scheme.
2266
+ // When running free energy perturbations steered by AWH and calculating PME on GPU,
2267
+ // i.e. if needEarlyPmeResults == true, the PME results have already been reduced above.
2268
+ const bool alternateGpuWait = (!c_disableAlternatingWait && stepWork.haveGpuPmeOnThisRank
2269
+ && simulationWork.useGpuNonbonded && !simulationWork.havePpDomainDecomposition
2270
+ && !stepWork.useGpuFBufferOps && !needEarlyPmeResults);
2271
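As an illustrative aside (not part of the patched source): when alternateGpuWait is set, the local nonbonded and PME GPU tasks are polled and their outputs reduced in whichever order they finish, rather than being waited on in a fixed order. The standalone sketch below only conveys that polling idea; the callables are stand-ins, and the real code queries task completion through the GPU runtime.

    #include <functional>

    // Editorial sketch: reduce whichever of two asynchronous GPU tasks
    // completes first, instead of waiting on them in a fixed order.
    void alternatingWaitSketch(const std::function<bool()>& nbDoneQuery,
                               const std::function<bool()>& pmeDoneQuery,
                               const std::function<void()>& reduceNbOutput,
                               const std::function<void()>& reducePmeOutput)
    {
        bool nbReduced = false, pmeReduced = false;
        while (!nbReduced || !pmeReduced)
        {
            if (!nbReduced && nbDoneQuery())
            {
                reduceNbOutput();
                nbReduced = true;
            }
            if (!pmeReduced && pmeDoneQuery())
            {
                reducePmeOutput();
                pmeReduced = true;
            }
        }
    }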
+
2272
+
2273
+ const int expectedLocalFReadyOnDeviceConsumptionCount = getExpectedLocalFReadyOnDeviceConsumptionCount(
2274
+ simulationWork, domainWork, stepWork, useOrEmulateGpuNb, alternateGpuWait);
2275
+ // If expectedLocalFReadyOnDeviceConsumptionCount == 0, stateGpu can be uninitialized
2276
+ if (expectedLocalFReadyOnDeviceConsumptionCount > 0)
2277
+ {
2278
+ stateGpu->setFReadyOnDeviceEventExpectedConsumptionCount(
2279
+ AtomLocality::Local, expectedLocalFReadyOnDeviceConsumptionCount);
2280
+ }
2281
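As an illustrative aside (not part of the patched source): setFReadyOnDeviceEventExpectedConsumptionCount() above tells the GPU state propagator how many downstream tasks will consume the local "forces ready on device" event before it can be reused. The sketch below illustrates that bookkeeping pattern only; it is not the GpuEventSynchronizer API.

    #include <cassert>

    // Editorial sketch: an event that must be consumed a known number of
    // times before it is fully drained and can be re-armed.
    class CountedEventSketch
    {
    public:
        void setExpectedConsumptionCount(int count) { remaining_ = count; }
        void consume()
        {
            assert(remaining_ > 0);
            remaining_--;
        }
        bool fullyConsumed() const { return remaining_ == 0; }

    private:
        int remaining_ = 1; // default: a single consumer
    };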
+
2282
+ if (simulationWork.havePpDomainDecomposition)
2283
+ {
2284
+ /* We are done with the CPU compute.
2285
+ * We will now communicate the non-local forces.
2286
+ * If we use a GPU this will overlap with GPU work, so in that case
2287
+ * we do not close the DD force balancing region here.
2288
+ */
2289
+ ddBalanceRegionHandler.closeAfterForceComputationCpu();
2290
+
2291
+ if (stepWork.computeForces)
2292
+ {
2293
+
2294
+ if (stepWork.useGpuFHalo)
2295
+ {
2296
+                 // If CPU forces exist, data from the halo exchange should accumulate into them
2297
+ bool accumulateForces = domainWork.haveCpuLocalForceWork;
2298
+ gmx::FixedCapacityVector<GpuEventSynchronizer*, 2> gpuForceHaloDependencies;
2299
+ // completion of both H2D copy and clearing is signaled by fReadyOnDevice
2300
+ if (domainWork.haveCpuLocalForceWork || stepWork.clearGpuFBufferEarly)
2301
+ {
2302
+ gpuForceHaloDependencies.push_back(stateGpu->fReadyOnDevice(AtomLocality::Local));
2303
+ }
2304
+ gpuForceHaloDependencies.push_back(stateGpu->fReducedOnDevice(AtomLocality::NonLocal));
2305
+
2306
+ communicateGpuHaloForces(*cr, accumulateForces, &gpuForceHaloDependencies);
2307
+ }
2308
+ else
2309
+ {
2310
+ if (stepWork.useGpuFBufferOps)
2311
+ {
2312
+ stateGpu->waitForcesReadyOnHost(AtomLocality::NonLocal);
2313
+ }
2314
+
2315
+ // Without MTS or with MTS at slow steps with uncombined forces we need to
2316
+ // communicate the fast forces
2317
+ if (!simulationWork.useMts || !stepWork.combineMtsForcesBeforeHaloExchange)
2318
+ {
2319
+ dd_move_f(cr->dd, &forceOutMtsLevel0.forceWithShiftForces(), wcycle);
2320
+ }
2321
+ // With MTS we need to communicate the slow or combined (in forceOutMtsLevel1) forces
2322
+ if (simulationWork.useMts && stepWork.computeSlowForces)
2323
+ {
2324
+ dd_move_f(cr->dd, &forceOutMtsLevel1->forceWithShiftForces(), wcycle);
2325
+ }
2326
+ }
2327
+ }
2328
+ }
2329
+
2330
+ if (alternateGpuWait)
2331
+ {
2332
+ alternatePmeNbGpuWaitReduce(fr->nbv.get(),
2333
+ fr->pmedata,
2334
+ forceOutNonbonded,
2335
+ forceOutMtsLevel1,
2336
+ enerd,
2337
+ lambda[static_cast<int>(FreeEnergyPerturbationCouplingType::Coul)],
2338
+ stepWork,
2339
+ wcycle);
2340
+ }
2341
+
2342
+ if (!alternateGpuWait && stepWork.haveGpuPmeOnThisRank && !needEarlyPmeResults)
2343
+ {
2344
+ pmeGpuWaitAndReduce(fr->pmedata,
2345
+ stepWork,
2346
+ wcycle,
2347
+ &forceOutMtsLevel1->forceWithVirial(),
2348
+ enerd,
2349
+ lambda[static_cast<int>(FreeEnergyPerturbationCouplingType::Coul)]);
2350
+ }
2351
+
2352
+ /* Wait for local GPU NB outputs on the non-alternating wait path */
2353
+ if (!alternateGpuWait && stepWork.computeNonbondedForces && simulationWork.useGpuNonbonded)
2354
+ {
2355
+ /* Measured overhead on CUDA and OpenCL with(out) GPU sharing
2356
+          * is between 0.5 and 1.5 Mcycles. So 2 Mcycles is an overestimate,
2357
+ * but even with a step of 0.1 ms the difference is less than 1%
2358
+ * of the step time.
2359
+ */
2360
+ const float gpuWaitApiOverheadMargin = 2e6F; /* cycles */
2361
+ const float waitCycles = Nbnxm::gpu_wait_finish_task(
2362
+ nbv->gpuNbv(),
2363
+ stepWork,
2364
+ AtomLocality::Local,
2365
+ enerd->grpp.energyGroupPairTerms[NonBondedEnergyTerms::LJSR].data(),
2366
+ enerd->grpp.energyGroupPairTerms[NonBondedEnergyTerms::CoulombSR].data(),
2367
+ forceOutNonbonded->forceWithShiftForces().shiftForces(),
2368
+ wcycle);
2369
+
2370
+ if (ddBalanceRegionHandler.useBalancingRegion())
2371
+ {
2372
+ DdBalanceRegionWaitedForGpu waitedForGpu = DdBalanceRegionWaitedForGpu::yes;
2373
+ if (stepWork.computeForces && waitCycles <= gpuWaitApiOverheadMargin)
2374
+ {
2375
+                 /* We measured only a few cycles, so it could be that the kernel
2376
+ * and transfer finished earlier and there was no actual
2377
+ * wait time, only API call overhead.
2378
+ * Then the actual time could be anywhere between 0 and
2379
+ * cycles_wait_est. We will use half of cycles_wait_est.
2380
+ */
2381
+ waitedForGpu = DdBalanceRegionWaitedForGpu::no;
2382
+ }
2383
+ ddBalanceRegionHandler.closeAfterForceComputationGpu(cycles_wait_gpu, waitedForGpu);
2384
+ }
2385
+ }
2386
+
2387
+ if (fr->nbv->emulateGpu())
2388
+ {
2389
+ // NOTE: emulation kernel is not included in the balancing region,
2390
+ // but emulation mode does not target performance anyway
2391
+ wallcycle_start_nocount(wcycle, WallCycleCounter::Force);
2392
+ do_nb_verlet(fr,
2393
+ ic,
2394
+ enerd,
2395
+ stepWork,
2396
+ InteractionLocality::Local,
2397
+ haveDDAtomOrdering(*cr) ? enbvClearFNo : enbvClearFYes,
2398
+ step,
2399
+ nrnb,
2400
+ wcycle);
2401
+ wallcycle_stop(wcycle, WallCycleCounter::Force);
2402
+ }
2403
+
2404
+ // If on GPU PME-PP comms path, receive forces from PME before GPU buffer ops
2405
+ // TODO refactor this and unify with below default-path call to the same function
2406
+ // When running free energy perturbations steered by AWH and calculating PME on GPU,
2407
+ // i.e. if needEarlyPmeResults == true, the PME results have already been reduced above.
2408
+ if (needToReceivePmeResultsFromSeparateRank && simulationWork.useGpuPmePpCommunication && !needEarlyPmeResults)
2409
+ {
2410
+ /* In case of node-splitting, the PP nodes receive the long-range
2411
+ * forces, virial and energy from the PME nodes here.
2412
+ */
2413
+ pme_receive_force_ener(fr,
2414
+ cr,
2415
+ &forceOutMtsLevel1->forceWithVirial(),
2416
+ enerd,
2417
+ simulationWork.useGpuPmePpCommunication,
2418
+ stepWork.useGpuPmeFReduction,
2419
+ wcycle);
2420
+ }
2421
+
2422
+
2423
+ /* Do the nonbonded GPU (or emulation) force buffer reduction
2424
+ * on the non-alternating path. */
2425
+ GMX_ASSERT(!(nonbondedAtMtsLevel1 && stepWork.useGpuFBufferOps),
2426
+ "The schedule below does not allow for nonbonded MTS with GPU buffer ops");
2427
+ if (useOrEmulateGpuNb && !alternateGpuWait)
2428
+ {
2429
+ if (stepWork.useGpuFBufferOps)
2430
+ {
2431
+ ArrayRef<gmx::RVec> forceWithShift = forceOutNonbonded->forceWithShiftForces().force();
2432
+
2433
+ // TODO: move these steps as early as possible:
2434
+ // - CPU f H2D should be as soon as all CPU-side forces are done
2435
+ // - wait for force reduction does not need to block host (at least not here, it's sufficient to wait
2436
+ // before the next CPU task that consumes the forces: vsite spread or update)
2437
+             //   - copy is not performed if GPU force halo exchange is active, because it would overwrite the result
2438
+ // of the halo exchange. In that case the copy is instead performed above, before the exchange.
2439
+ // These should be unified.
2440
+ if (domainWork.haveLocalForceContribInCpuBuffer && !stepWork.useGpuFHalo)
2441
+ {
2442
+ stateGpu->copyForcesToGpu(forceWithShift, AtomLocality::Local);
2443
+ }
2444
+
2445
+ if (stepWork.computeNonbondedForces)
2446
+ {
2447
+ fr->gpuForceReduction[gmx::AtomLocality::Local]->execute();
2448
+ }
2449
+
2450
+ // Copy forces to host if they are needed for update or if virtual sites are enabled.
2451
+ // If there are vsites, we need to copy forces every step to spread vsite forces on host.
2452
+ // TODO: When the output flags will be included in step workload, this copy can be combined with the
2453
+ // copy call done in sim_utils(...) for the output.
2454
+ // NOTE: If there are virtual sites, the forces are modified on host after this D2H copy. Hence,
2455
+ // they should not be copied in do_md(...) for the output.
2456
+ if (!simulationWork.useGpuUpdate
2457
+ || (simulationWork.useGpuUpdate && haveDDAtomOrdering(*cr) && simulationWork.useCpuPmePpCommunication)
2458
+ || vsite)
2459
+ {
2460
+ if (stepWork.computeNonbondedForces)
2461
+ {
2462
+ /* We have previously issued force reduction on the GPU, but we will
2463
+ * not use this event, instead relying on the stream being in-order.
2464
+ * Issue #3988. */
2465
+ stateGpu->consumeForcesReducedOnDeviceEvent(AtomLocality::Local);
2466
+ }
2467
+ stateGpu->copyForcesFromGpu(forceWithShift, AtomLocality::Local);
2468
+ stateGpu->waitForcesReadyOnHost(AtomLocality::Local);
2469
+ }
2470
+ }
2471
+ else if (stepWork.computeNonbondedForces)
2472
+ {
2473
+ ArrayRef<gmx::RVec> forceWithShift = forceOutNonbonded->forceWithShiftForces().force();
2474
+ nbv->atomdata_add_nbat_f_to_f(AtomLocality::Local, forceWithShift);
2475
+ }
2476
+ }
2477
+
2478
+ if (expectedLocalFReadyOnDeviceConsumptionCount > 0)
2479
+ {
2480
+ /* The same fReadyOnDevice device synchronizer is later used to track buffer clearing,
2481
+ * so we reset the expected consumption value back to the default (1). */
2482
+ stateGpu->setFReadyOnDeviceEventExpectedConsumptionCount(AtomLocality::Local, 1);
2483
+ }
2484
+
2485
+ launchGpuEndOfStepTasks(
2486
+ nbv, fr->listedForcesGpu.get(), fr->pmedata, enerd, runScheduleWork, step, wcycle);
2487
+
2488
+ if (haveDDAtomOrdering(*cr))
2489
+ {
2490
+ dd_force_flop_stop(cr->dd, nrnb);
2491
+ }
2492
+
2493
+ const bool haveCombinedMtsForces = (stepWork.computeForces && simulationWork.useMts && stepWork.computeSlowForces
2494
+ && stepWork.combineMtsForcesBeforeHaloExchange);
2495
+ if (stepWork.computeForces)
2496
+ {
2497
+ postProcessForceWithShiftForces(
2498
+ nrnb, wcycle, box, x.unpaddedArrayRef(), &forceOutMtsLevel0, vir_force, *mdatoms, *fr, vsite, stepWork);
2499
+
2500
+ if (simulationWork.useMts && stepWork.computeSlowForces && !haveCombinedMtsForces)
2501
+ {
2502
+ postProcessForceWithShiftForces(
2503
+ nrnb, wcycle, box, x.unpaddedArrayRef(), forceOutMtsLevel1, vir_force, *mdatoms, *fr, vsite, stepWork);
2504
+ }
2505
+ }
2506
+
2507
+ // TODO refactor this and unify with above GPU PME-PP / GPU update path call to the same function
2508
+ // When running free energy perturbations steered by AWH and calculating PME on GPU,
2509
+ // i.e. if needEarlyPmeResults == true, the PME results have already been reduced above.
2510
+ if (needToReceivePmeResultsFromSeparateRank && simulationWork.useCpuPmePpCommunication && !needEarlyPmeResults)
2511
+ {
2512
+ /* In case of node-splitting, the PP nodes receive the long-range
2513
+ * forces, virial and energy from the PME nodes here.
2514
+ */
2515
+ pme_receive_force_ener(fr,
2516
+ cr,
2517
+ &forceOutMtsLevel1->forceWithVirial(),
2518
+ enerd,
2519
+ simulationWork.useGpuPmePpCommunication,
2520
+ false,
2521
+ wcycle);
2522
+ }
2523
+
2524
+ if (stepWork.computeForces)
2525
+ {
2526
+ /* If we don't use MTS or if we already combined the MTS forces before, we only
2527
+ * need to post-process one ForceOutputs object here, called forceOutCombined,
2528
+ * otherwise we have to post-process two outputs and then combine them.
2529
+ */
2530
+ ForceOutputs& forceOutCombined = (haveCombinedMtsForces ? forceOutMts.value() : forceOutMtsLevel0);
2531
+ postProcessForces(
2532
+ cr, step, nrnb, wcycle, box, x.unpaddedArrayRef(), &forceOutCombined, vir_force, mdatoms, fr, vsite, stepWork);
2533
+
2534
+ if (simulationWork.useMts && stepWork.computeSlowForces && !haveCombinedMtsForces)
2535
+ {
2536
+ postProcessForces(
2537
+ cr, step, nrnb, wcycle, box, x.unpaddedArrayRef(), forceOutMtsLevel1, vir_force, mdatoms, fr, vsite, stepWork);
2538
+
2539
+ combineMtsForces(mdatoms->homenr,
2540
+ force.unpaddedArrayRef(),
2541
+ forceView->forceMtsCombined(),
2542
+ inputrec.mtsLevels[1].stepFactor);
2543
+ }
2544
+ }
2545
+
2546
+ if (stepWork.computeEnergy)
2547
+ {
2548
+ /* Compute the final potential energy terms */
2549
+ accumulatePotentialEnergies(enerd, lambda, inputrec.fepvals.get());
2550
+
2551
+ if (!EI_TPI(inputrec.eI))
2552
+ {
2553
+ checkPotentialEnergyValidity(step, *enerd, inputrec);
2554
+ }
2555
+ }
2556
+
2557
+ /* In case we don't have constraints and are using GPUs, the next balancing
2558
+ * region starts here.
2559
+ * Some "special" work at the end of do_force_cuts?, such as vsite spread,
2560
+      * virial calculation and COM pulling, is thus not included in
2561
+ * the balance timing, which is ok as most tasks do communication.
2562
+ */
2563
+ ddBalanceRegionHandler.openBeforeForceComputationCpu(DdAllowBalanceRegionReopen::no);
2564
+ }