hillclimber-0.1.6-cp313-cp313-macosx_15_0_x86_64.whl

This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as published in their respective public registries.
Files changed (468)
  1. hillclimber/__init__.py +41 -0
  2. hillclimber/actions.py +53 -0
  3. hillclimber/analysis.py +590 -0
  4. hillclimber/biases.py +293 -0
  5. hillclimber/calc.py +22 -0
  6. hillclimber/cvs.py +1070 -0
  7. hillclimber/interfaces.py +133 -0
  8. hillclimber/metadynamics.py +416 -0
  9. hillclimber/nodes.py +6 -0
  10. hillclimber/opes.py +359 -0
  11. hillclimber/pycv.py +362 -0
  12. hillclimber/selectors.py +230 -0
  13. hillclimber/virtual_atoms.py +341 -0
  14. hillclimber-0.1.6.dist-info/METADATA +325 -0
  15. hillclimber-0.1.6.dist-info/RECORD +468 -0
  16. hillclimber-0.1.6.dist-info/WHEEL +6 -0
  17. hillclimber-0.1.6.dist-info/entry_points.txt +8 -0
  18. hillclimber-0.1.6.dist-info/licenses/LICENSE +165 -0
  19. plumed/__init__.py +104 -0
  20. plumed/_lib/bin/plumed +0 -0
  21. plumed/_lib/bin/plumed-config +9 -0
  22. plumed/_lib/bin/plumed-patch +9 -0
  23. plumed/_lib/include/plumed/adjmat/AdjacencyMatrixBase.h +659 -0
  24. plumed/_lib/include/plumed/adjmat/ContactMatrix.h +59 -0
  25. plumed/_lib/include/plumed/asmjit/arch.h +228 -0
  26. plumed/_lib/include/plumed/asmjit/arm.h +43 -0
  27. plumed/_lib/include/plumed/asmjit/asmjit.h +69 -0
  28. plumed/_lib/include/plumed/asmjit/asmjit_apibegin.h +143 -0
  29. plumed/_lib/include/plumed/asmjit/asmjit_apiend.h +93 -0
  30. plumed/_lib/include/plumed/asmjit/asmjit_build.h +971 -0
  31. plumed/_lib/include/plumed/asmjit/assembler.h +183 -0
  32. plumed/_lib/include/plumed/asmjit/base.h +56 -0
  33. plumed/_lib/include/plumed/asmjit/codebuilder.h +944 -0
  34. plumed/_lib/include/plumed/asmjit/codecompiler.h +767 -0
  35. plumed/_lib/include/plumed/asmjit/codeemitter.h +528 -0
  36. plumed/_lib/include/plumed/asmjit/codeholder.h +777 -0
  37. plumed/_lib/include/plumed/asmjit/constpool.h +286 -0
  38. plumed/_lib/include/plumed/asmjit/cpuinfo.h +402 -0
  39. plumed/_lib/include/plumed/asmjit/func.h +1327 -0
  40. plumed/_lib/include/plumed/asmjit/globals.h +370 -0
  41. plumed/_lib/include/plumed/asmjit/inst.h +137 -0
  42. plumed/_lib/include/plumed/asmjit/logging.h +317 -0
  43. plumed/_lib/include/plumed/asmjit/misc_p.h +103 -0
  44. plumed/_lib/include/plumed/asmjit/moved_string.h +318 -0
  45. plumed/_lib/include/plumed/asmjit/operand.h +1599 -0
  46. plumed/_lib/include/plumed/asmjit/osutils.h +207 -0
  47. plumed/_lib/include/plumed/asmjit/regalloc_p.h +597 -0
  48. plumed/_lib/include/plumed/asmjit/runtime.h +227 -0
  49. plumed/_lib/include/plumed/asmjit/simdtypes.h +1104 -0
  50. plumed/_lib/include/plumed/asmjit/utils.h +1387 -0
  51. plumed/_lib/include/plumed/asmjit/vmem.h +183 -0
  52. plumed/_lib/include/plumed/asmjit/x86.h +45 -0
  53. plumed/_lib/include/plumed/asmjit/x86assembler.h +125 -0
  54. plumed/_lib/include/plumed/asmjit/x86builder.h +117 -0
  55. plumed/_lib/include/plumed/asmjit/x86compiler.h +322 -0
  56. plumed/_lib/include/plumed/asmjit/x86emitter.h +5149 -0
  57. plumed/_lib/include/plumed/asmjit/x86globals.h +535 -0
  58. plumed/_lib/include/plumed/asmjit/x86inst.h +2547 -0
  59. plumed/_lib/include/plumed/asmjit/x86instimpl_p.h +74 -0
  60. plumed/_lib/include/plumed/asmjit/x86internal_p.h +108 -0
  61. plumed/_lib/include/plumed/asmjit/x86logging_p.h +92 -0
  62. plumed/_lib/include/plumed/asmjit/x86misc.h +417 -0
  63. plumed/_lib/include/plumed/asmjit/x86operand.h +1133 -0
  64. plumed/_lib/include/plumed/asmjit/x86regalloc_p.h +734 -0
  65. plumed/_lib/include/plumed/asmjit/zone.h +1157 -0
  66. plumed/_lib/include/plumed/bias/Bias.h +82 -0
  67. plumed/_lib/include/plumed/bias/ReweightBase.h +58 -0
  68. plumed/_lib/include/plumed/blas/blas.h +253 -0
  69. plumed/_lib/include/plumed/blas/def_external.h +61 -0
  70. plumed/_lib/include/plumed/blas/def_internal.h +97 -0
  71. plumed/_lib/include/plumed/blas/real.h +49 -0
  72. plumed/_lib/include/plumed/cltools/CLTool.h +32 -0
  73. plumed/_lib/include/plumed/clusters/ClusteringBase.h +70 -0
  74. plumed/_lib/include/plumed/colvar/Colvar.h +32 -0
  75. plumed/_lib/include/plumed/colvar/ColvarInput.h +68 -0
  76. plumed/_lib/include/plumed/colvar/ColvarShortcut.h +81 -0
  77. plumed/_lib/include/plumed/colvar/CoordinationBase.h +52 -0
  78. plumed/_lib/include/plumed/colvar/MultiColvarTemplate.h +333 -0
  79. plumed/_lib/include/plumed/colvar/PathMSDBase.h +101 -0
  80. plumed/_lib/include/plumed/colvar/RMSDVector.h +78 -0
  81. plumed/_lib/include/plumed/config/Config.h +118 -0
  82. plumed/_lib/include/plumed/config/version.h +9 -0
  83. plumed/_lib/include/plumed/contour/ContourFindingObject.h +87 -0
  84. plumed/_lib/include/plumed/contour/DistanceFromContourBase.h +82 -0
  85. plumed/_lib/include/plumed/contour/FindContour.h +67 -0
  86. plumed/_lib/include/plumed/core/Action.h +540 -0
  87. plumed/_lib/include/plumed/core/ActionAnyorder.h +48 -0
  88. plumed/_lib/include/plumed/core/ActionAtomistic.h +343 -0
  89. plumed/_lib/include/plumed/core/ActionForInterface.h +99 -0
  90. plumed/_lib/include/plumed/core/ActionPilot.h +57 -0
  91. plumed/_lib/include/plumed/core/ActionRegister.h +124 -0
  92. plumed/_lib/include/plumed/core/ActionSet.h +163 -0
  93. plumed/_lib/include/plumed/core/ActionSetup.h +48 -0
  94. plumed/_lib/include/plumed/core/ActionShortcut.h +73 -0
  95. plumed/_lib/include/plumed/core/ActionToGetData.h +59 -0
  96. plumed/_lib/include/plumed/core/ActionToPutData.h +101 -0
  97. plumed/_lib/include/plumed/core/ActionWithArguments.h +140 -0
  98. plumed/_lib/include/plumed/core/ActionWithMatrix.h +87 -0
  99. plumed/_lib/include/plumed/core/ActionWithValue.h +258 -0
  100. plumed/_lib/include/plumed/core/ActionWithVector.h +94 -0
  101. plumed/_lib/include/plumed/core/ActionWithVirtualAtom.h +123 -0
  102. plumed/_lib/include/plumed/core/CLTool.h +177 -0
  103. plumed/_lib/include/plumed/core/CLToolMain.h +102 -0
  104. plumed/_lib/include/plumed/core/CLToolRegister.h +108 -0
  105. plumed/_lib/include/plumed/core/Colvar.h +115 -0
  106. plumed/_lib/include/plumed/core/DataPassingObject.h +94 -0
  107. plumed/_lib/include/plumed/core/DataPassingTools.h +54 -0
  108. plumed/_lib/include/plumed/core/DomainDecomposition.h +120 -0
  109. plumed/_lib/include/plumed/core/ExchangePatterns.h +47 -0
  110. plumed/_lib/include/plumed/core/FlexibleBin.h +63 -0
  111. plumed/_lib/include/plumed/core/GREX.h +61 -0
  112. plumed/_lib/include/plumed/core/GenericMolInfo.h +89 -0
  113. plumed/_lib/include/plumed/core/Group.h +41 -0
  114. plumed/_lib/include/plumed/core/ModuleMap.h +30 -0
  115. plumed/_lib/include/plumed/core/ParallelTaskManager.h +1023 -0
  116. plumed/_lib/include/plumed/core/PbcAction.h +61 -0
  117. plumed/_lib/include/plumed/core/PlumedMain.h +632 -0
  118. plumed/_lib/include/plumed/core/PlumedMainInitializer.h +118 -0
  119. plumed/_lib/include/plumed/core/RegisterBase.h +340 -0
  120. plumed/_lib/include/plumed/core/TargetDist.h +48 -0
  121. plumed/_lib/include/plumed/core/Value.h +547 -0
  122. plumed/_lib/include/plumed/core/WithCmd.h +93 -0
  123. plumed/_lib/include/plumed/dimred/SMACOF.h +55 -0
  124. plumed/_lib/include/plumed/drr/DRR.h +383 -0
  125. plumed/_lib/include/plumed/drr/colvar_UIestimator.h +777 -0
  126. plumed/_lib/include/plumed/fisst/legendre_rule_fast.h +44 -0
  127. plumed/_lib/include/plumed/function/Custom.h +54 -0
  128. plumed/_lib/include/plumed/function/Function.h +85 -0
  129. plumed/_lib/include/plumed/function/FunctionOfMatrix.h +368 -0
  130. plumed/_lib/include/plumed/function/FunctionOfScalar.h +135 -0
  131. plumed/_lib/include/plumed/function/FunctionOfVector.h +296 -0
  132. plumed/_lib/include/plumed/function/FunctionSetup.h +180 -0
  133. plumed/_lib/include/plumed/function/FunctionShortcut.h +130 -0
  134. plumed/_lib/include/plumed/function/FunctionWithSingleArgument.h +165 -0
  135. plumed/_lib/include/plumed/gridtools/ActionWithGrid.h +43 -0
  136. plumed/_lib/include/plumed/gridtools/EvaluateGridFunction.h +99 -0
  137. plumed/_lib/include/plumed/gridtools/FunctionOfGrid.h +295 -0
  138. plumed/_lib/include/plumed/gridtools/GridCoordinatesObject.h +179 -0
  139. plumed/_lib/include/plumed/gridtools/GridSearch.h +135 -0
  140. plumed/_lib/include/plumed/gridtools/Interpolator.h +45 -0
  141. plumed/_lib/include/plumed/gridtools/KDE.h +455 -0
  142. plumed/_lib/include/plumed/gridtools/RDF.h +40 -0
  143. plumed/_lib/include/plumed/gridtools/SumOfKernels.h +219 -0
  144. plumed/_lib/include/plumed/isdb/MetainferenceBase.h +398 -0
  145. plumed/_lib/include/plumed/lapack/def_external.h +207 -0
  146. plumed/_lib/include/plumed/lapack/def_internal.h +388 -0
  147. plumed/_lib/include/plumed/lapack/lapack.h +899 -0
  148. plumed/_lib/include/plumed/lapack/lapack_limits.h +79 -0
  149. plumed/_lib/include/plumed/lapack/real.h +50 -0
  150. plumed/_lib/include/plumed/lepton/CompiledExpression.h +164 -0
  151. plumed/_lib/include/plumed/lepton/CustomFunction.h +143 -0
  152. plumed/_lib/include/plumed/lepton/Exception.h +93 -0
  153. plumed/_lib/include/plumed/lepton/ExpressionProgram.h +137 -0
  154. plumed/_lib/include/plumed/lepton/ExpressionTreeNode.h +145 -0
  155. plumed/_lib/include/plumed/lepton/Lepton.h +85 -0
  156. plumed/_lib/include/plumed/lepton/MSVC_erfc.h +123 -0
  157. plumed/_lib/include/plumed/lepton/Operation.h +1302 -0
  158. plumed/_lib/include/plumed/lepton/ParsedExpression.h +165 -0
  159. plumed/_lib/include/plumed/lepton/Parser.h +111 -0
  160. plumed/_lib/include/plumed/lepton/windowsIncludes.h +73 -0
  161. plumed/_lib/include/plumed/mapping/Path.h +44 -0
  162. plumed/_lib/include/plumed/mapping/PathProjectionCalculator.h +57 -0
  163. plumed/_lib/include/plumed/matrixtools/MatrixOperationBase.h +54 -0
  164. plumed/_lib/include/plumed/matrixtools/MatrixTimesMatrix.h +309 -0
  165. plumed/_lib/include/plumed/matrixtools/MatrixTimesVectorBase.h +365 -0
  166. plumed/_lib/include/plumed/matrixtools/OuterProduct.h +238 -0
  167. plumed/_lib/include/plumed/maze/Core.h +65 -0
  168. plumed/_lib/include/plumed/maze/Loss.h +86 -0
  169. plumed/_lib/include/plumed/maze/Member.h +66 -0
  170. plumed/_lib/include/plumed/maze/Memetic.h +799 -0
  171. plumed/_lib/include/plumed/maze/Optimizer.h +357 -0
  172. plumed/_lib/include/plumed/maze/Random_MT.h +156 -0
  173. plumed/_lib/include/plumed/maze/Tools.h +183 -0
  174. plumed/_lib/include/plumed/metatomic/vesin.h +188 -0
  175. plumed/_lib/include/plumed/molfile/Gromacs.h +2013 -0
  176. plumed/_lib/include/plumed/molfile/endianswap.h +217 -0
  177. plumed/_lib/include/plumed/molfile/fastio.h +683 -0
  178. plumed/_lib/include/plumed/molfile/largefiles.h +78 -0
  179. plumed/_lib/include/plumed/molfile/libmolfile_plugin.h +77 -0
  180. plumed/_lib/include/plumed/molfile/molfile_plugin.h +1034 -0
  181. plumed/_lib/include/plumed/molfile/periodic_table.h +248 -0
  182. plumed/_lib/include/plumed/molfile/readpdb.h +447 -0
  183. plumed/_lib/include/plumed/molfile/vmdplugin.h +236 -0
  184. plumed/_lib/include/plumed/multicolvar/MultiColvarShortcuts.h +45 -0
  185. plumed/_lib/include/plumed/opes/ExpansionCVs.h +79 -0
  186. plumed/_lib/include/plumed/sasa/Sasa.h +32 -0
  187. plumed/_lib/include/plumed/secondarystructure/SecondaryStructureBase.h +372 -0
  188. plumed/_lib/include/plumed/setup/ActionSetup.h +25 -0
  189. plumed/_lib/include/plumed/small_vector/small_vector.h +6114 -0
  190. plumed/_lib/include/plumed/symfunc/CoordinationNumbers.h +41 -0
  191. plumed/_lib/include/plumed/tools/Angle.h +52 -0
  192. plumed/_lib/include/plumed/tools/AtomDistribution.h +138 -0
  193. plumed/_lib/include/plumed/tools/AtomNumber.h +152 -0
  194. plumed/_lib/include/plumed/tools/BiasRepresentation.h +106 -0
  195. plumed/_lib/include/plumed/tools/BitmaskEnum.h +167 -0
  196. plumed/_lib/include/plumed/tools/Brent1DRootSearch.h +159 -0
  197. plumed/_lib/include/plumed/tools/CheckInRange.h +44 -0
  198. plumed/_lib/include/plumed/tools/Citations.h +74 -0
  199. plumed/_lib/include/plumed/tools/ColvarOutput.h +118 -0
  200. plumed/_lib/include/plumed/tools/Communicator.h +316 -0
  201. plumed/_lib/include/plumed/tools/ConjugateGradient.h +80 -0
  202. plumed/_lib/include/plumed/tools/DLLoader.h +79 -0
  203. plumed/_lib/include/plumed/tools/ERMSD.h +73 -0
  204. plumed/_lib/include/plumed/tools/Exception.h +406 -0
  205. plumed/_lib/include/plumed/tools/File.h +28 -0
  206. plumed/_lib/include/plumed/tools/FileBase.h +153 -0
  207. plumed/_lib/include/plumed/tools/FileTools.h +37 -0
  208. plumed/_lib/include/plumed/tools/ForwardDecl.h +54 -0
  209. plumed/_lib/include/plumed/tools/Grid.h +638 -0
  210. plumed/_lib/include/plumed/tools/HistogramBead.h +136 -0
  211. plumed/_lib/include/plumed/tools/IFile.h +117 -0
  212. plumed/_lib/include/plumed/tools/KernelFunctions.h +113 -0
  213. plumed/_lib/include/plumed/tools/Keywords.h +380 -0
  214. plumed/_lib/include/plumed/tools/LatticeReduction.h +66 -0
  215. plumed/_lib/include/plumed/tools/LeptonCall.h +64 -0
  216. plumed/_lib/include/plumed/tools/LinkCells.h +126 -0
  217. plumed/_lib/include/plumed/tools/Log.h +41 -0
  218. plumed/_lib/include/plumed/tools/LoopUnroller.h +163 -0
  219. plumed/_lib/include/plumed/tools/Matrix.h +721 -0
  220. plumed/_lib/include/plumed/tools/MatrixSquareBracketsAccess.h +138 -0
  221. plumed/_lib/include/plumed/tools/MergeVectorTools.h +153 -0
  222. plumed/_lib/include/plumed/tools/Minimise1DBrent.h +244 -0
  223. plumed/_lib/include/plumed/tools/MinimiseBase.h +120 -0
  224. plumed/_lib/include/plumed/tools/MolDataClass.h +51 -0
  225. plumed/_lib/include/plumed/tools/NeighborList.h +112 -0
  226. plumed/_lib/include/plumed/tools/OFile.h +286 -0
  227. plumed/_lib/include/plumed/tools/OpenACC.h +180 -0
  228. plumed/_lib/include/plumed/tools/OpenMP.h +75 -0
  229. plumed/_lib/include/plumed/tools/PDB.h +154 -0
  230. plumed/_lib/include/plumed/tools/Pbc.h +139 -0
  231. plumed/_lib/include/plumed/tools/PlumedHandle.h +105 -0
  232. plumed/_lib/include/plumed/tools/RMSD.h +493 -0
  233. plumed/_lib/include/plumed/tools/Random.h +80 -0
  234. plumed/_lib/include/plumed/tools/RootFindingBase.h +79 -0
  235. plumed/_lib/include/plumed/tools/Stopwatch.h +475 -0
  236. plumed/_lib/include/plumed/tools/Subprocess.h +142 -0
  237. plumed/_lib/include/plumed/tools/SwitchingFunction.h +208 -0
  238. plumed/_lib/include/plumed/tools/Tensor.h +724 -0
  239. plumed/_lib/include/plumed/tools/TokenizedLine.h +123 -0
  240. plumed/_lib/include/plumed/tools/Tools.h +638 -0
  241. plumed/_lib/include/plumed/tools/Torsion.h +55 -0
  242. plumed/_lib/include/plumed/tools/TrajectoryParser.h +118 -0
  243. plumed/_lib/include/plumed/tools/Tree.h +61 -0
  244. plumed/_lib/include/plumed/tools/TypesafePtr.h +463 -0
  245. plumed/_lib/include/plumed/tools/Units.h +167 -0
  246. plumed/_lib/include/plumed/tools/Vector.h +433 -0
  247. plumed/_lib/include/plumed/tools/View.h +296 -0
  248. plumed/_lib/include/plumed/tools/View2D.h +100 -0
  249. plumed/_lib/include/plumed/tools/h36.h +39 -0
  250. plumed/_lib/include/plumed/vatom/ActionWithVirtualAtom.h +32 -0
  251. plumed/_lib/include/plumed/ves/BasisFunctions.h +380 -0
  252. plumed/_lib/include/plumed/ves/CoeffsBase.h +310 -0
  253. plumed/_lib/include/plumed/ves/CoeffsMatrix.h +220 -0
  254. plumed/_lib/include/plumed/ves/CoeffsVector.h +251 -0
  255. plumed/_lib/include/plumed/ves/FermiSwitchingFunction.h +74 -0
  256. plumed/_lib/include/plumed/ves/GridIntegrationWeights.h +50 -0
  257. plumed/_lib/include/plumed/ves/GridLinearInterpolation.h +81 -0
  258. plumed/_lib/include/plumed/ves/GridProjWeights.h +61 -0
  259. plumed/_lib/include/plumed/ves/LinearBasisSetExpansion.h +303 -0
  260. plumed/_lib/include/plumed/ves/Optimizer.h +444 -0
  261. plumed/_lib/include/plumed/ves/TargetDistModifer.h +53 -0
  262. plumed/_lib/include/plumed/ves/TargetDistribution.h +266 -0
  263. plumed/_lib/include/plumed/ves/VesBias.h +545 -0
  264. plumed/_lib/include/plumed/ves/VesTools.h +142 -0
  265. plumed/_lib/include/plumed/ves/WaveletGrid.h +75 -0
  266. plumed/_lib/include/plumed/volumes/ActionVolume.h +268 -0
  267. plumed/_lib/include/plumed/volumes/VolumeShortcut.h +147 -0
  268. plumed/_lib/include/plumed/wrapper/Plumed.h +5025 -0
  269. plumed/_lib/include/plumed/xdrfile/xdrfile.h +663 -0
  270. plumed/_lib/include/plumed/xdrfile/xdrfile_trr.h +89 -0
  271. plumed/_lib/include/plumed/xdrfile/xdrfile_xtc.h +90 -0
  272. plumed/_lib/lib/PythonCVInterface.dylib +0 -0
  273. plumed/_lib/lib/libplumed.dylib +0 -0
  274. plumed/_lib/lib/libplumedKernel.dylib +0 -0
  275. plumed/_lib/lib/libplumedWrapper.a +0 -0
  276. plumed/_lib/lib/pkgconfig/plumed.pc +13 -0
  277. plumed/_lib/lib/pkgconfig/plumedInternals.pc +13 -0
  278. plumed/_lib/lib/pkgconfig/plumedWrapper.pc +13 -0
  279. plumed/_lib/lib/plumed/fortran/plumed.f90 +879 -0
  280. plumed/_lib/lib/plumed/fortran/plumed_f08.f90 +2625 -0
  281. plumed/_lib/lib/plumed/modulefile +69 -0
  282. plumed/_lib/lib/plumed/patches/gromacs-2022.5.config +43 -0
  283. plumed/_lib/lib/plumed/patches/gromacs-2022.5.diff/src/gromacs/CMakeLists.txt +543 -0
  284. plumed/_lib/lib/plumed/patches/gromacs-2022.5.diff/src/gromacs/CMakeLists.txt.preplumed +540 -0
  285. plumed/_lib/lib/plumed/patches/gromacs-2022.5.diff/src/gromacs/mdlib/expanded.cpp +1628 -0
  286. plumed/_lib/lib/plumed/patches/gromacs-2022.5.diff/src/gromacs/mdlib/expanded.cpp.preplumed +1590 -0
  287. plumed/_lib/lib/plumed/patches/gromacs-2022.5.diff/src/gromacs/mdlib/expanded.h +103 -0
  288. plumed/_lib/lib/plumed/patches/gromacs-2022.5.diff/src/gromacs/mdlib/expanded.h.preplumed +99 -0
  289. plumed/_lib/lib/plumed/patches/gromacs-2022.5.diff/src/gromacs/mdlib/sim_util.cpp +2527 -0
  290. plumed/_lib/lib/plumed/patches/gromacs-2022.5.diff/src/gromacs/mdlib/sim_util.cpp.preplumed +2513 -0
  291. plumed/_lib/lib/plumed/patches/gromacs-2022.5.diff/src/gromacs/mdrun/legacymdrunoptions.cpp +208 -0
  292. plumed/_lib/lib/plumed/patches/gromacs-2022.5.diff/src/gromacs/mdrun/legacymdrunoptions.cpp.preplumed +175 -0
  293. plumed/_lib/lib/plumed/patches/gromacs-2022.5.diff/src/gromacs/mdrun/legacymdrunoptions.h +408 -0
  294. plumed/_lib/lib/plumed/patches/gromacs-2022.5.diff/src/gromacs/mdrun/legacymdrunoptions.h.preplumed +394 -0
  295. plumed/_lib/lib/plumed/patches/gromacs-2022.5.diff/src/gromacs/mdrun/md.cpp +2348 -0
  296. plumed/_lib/lib/plumed/patches/gromacs-2022.5.diff/src/gromacs/mdrun/md.cpp.preplumed +2091 -0
  297. plumed/_lib/lib/plumed/patches/gromacs-2022.5.diff/src/gromacs/mdrun/minimize.cpp +3573 -0
  298. plumed/_lib/lib/plumed/patches/gromacs-2022.5.diff/src/gromacs/mdrun/minimize.cpp.preplumed +3495 -0
  299. plumed/_lib/lib/plumed/patches/gromacs-2022.5.diff/src/gromacs/mdrun/replicaexchange.cpp +1506 -0
  300. plumed/_lib/lib/plumed/patches/gromacs-2022.5.diff/src/gromacs/mdrun/replicaexchange.cpp.preplumed +1402 -0
  301. plumed/_lib/lib/plumed/patches/gromacs-2022.5.diff/src/gromacs/mdrun/replicaexchange.h +114 -0
  302. plumed/_lib/lib/plumed/patches/gromacs-2022.5.diff/src/gromacs/mdrun/replicaexchange.h.preplumed +106 -0
  303. plumed/_lib/lib/plumed/patches/gromacs-2022.5.diff/src/gromacs/mdrun/rerun.cpp +997 -0
  304. plumed/_lib/lib/plumed/patches/gromacs-2022.5.diff/src/gromacs/mdrun/rerun.cpp.preplumed +906 -0
  305. plumed/_lib/lib/plumed/patches/gromacs-2022.5.diff/src/gromacs/mdrun/runner.cpp +2780 -0
  306. plumed/_lib/lib/plumed/patches/gromacs-2022.5.diff/src/gromacs/mdrun/runner.cpp.preplumed +2738 -0
  307. plumed/_lib/lib/plumed/patches/gromacs-2022.5.diff/src/gromacs/modularsimulator/expandedensembleelement.cpp +224 -0
  308. plumed/_lib/lib/plumed/patches/gromacs-2022.5.diff/src/gromacs/modularsimulator/expandedensembleelement.cpp.preplumed +222 -0
  309. plumed/_lib/lib/plumed/patches/gromacs-2023.5.config +43 -0
  310. plumed/_lib/lib/plumed/patches/gromacs-2023.5.diff/src/gromacs/CMakeLists.txt +549 -0
  311. plumed/_lib/lib/plumed/patches/gromacs-2023.5.diff/src/gromacs/CMakeLists.txt.preplumed +546 -0
  312. plumed/_lib/lib/plumed/patches/gromacs-2023.5.diff/src/gromacs/mdlib/expanded.cpp +1632 -0
  313. plumed/_lib/lib/plumed/patches/gromacs-2023.5.diff/src/gromacs/mdlib/expanded.cpp.preplumed +1594 -0
  314. plumed/_lib/lib/plumed/patches/gromacs-2023.5.diff/src/gromacs/mdlib/expanded.h +104 -0
  315. plumed/_lib/lib/plumed/patches/gromacs-2023.5.diff/src/gromacs/mdlib/expanded.h.preplumed +100 -0
  316. plumed/_lib/lib/plumed/patches/gromacs-2023.5.diff/src/gromacs/mdlib/sim_util.cpp +2624 -0
  317. plumed/_lib/lib/plumed/patches/gromacs-2023.5.diff/src/gromacs/mdlib/sim_util.cpp.preplumed +2610 -0
  318. plumed/_lib/lib/plumed/patches/gromacs-2023.5.diff/src/gromacs/mdrun/legacymdrunoptions.cpp +208 -0
  319. plumed/_lib/lib/plumed/patches/gromacs-2023.5.diff/src/gromacs/mdrun/legacymdrunoptions.cpp.preplumed +175 -0
  320. plumed/_lib/lib/plumed/patches/gromacs-2023.5.diff/src/gromacs/mdrun/legacymdrunoptions.h +409 -0
  321. plumed/_lib/lib/plumed/patches/gromacs-2023.5.diff/src/gromacs/mdrun/legacymdrunoptions.h.preplumed +395 -0
  322. plumed/_lib/lib/plumed/patches/gromacs-2023.5.diff/src/gromacs/mdrun/md.cpp +2419 -0
  323. plumed/_lib/lib/plumed/patches/gromacs-2023.5.diff/src/gromacs/mdrun/md.cpp.preplumed +2164 -0
  324. plumed/_lib/lib/plumed/patches/gromacs-2023.5.diff/src/gromacs/mdrun/minimize.cpp +3546 -0
  325. plumed/_lib/lib/plumed/patches/gromacs-2023.5.diff/src/gromacs/mdrun/minimize.cpp.preplumed +3468 -0
  326. plumed/_lib/lib/plumed/patches/gromacs-2023.5.diff/src/gromacs/mdrun/replicaexchange.cpp +1513 -0
  327. plumed/_lib/lib/plumed/patches/gromacs-2023.5.diff/src/gromacs/mdrun/replicaexchange.cpp.preplumed +1409 -0
  328. plumed/_lib/lib/plumed/patches/gromacs-2023.5.diff/src/gromacs/mdrun/replicaexchange.h +114 -0
  329. plumed/_lib/lib/plumed/patches/gromacs-2023.5.diff/src/gromacs/mdrun/replicaexchange.h.preplumed +106 -0
  330. plumed/_lib/lib/plumed/patches/gromacs-2023.5.diff/src/gromacs/mdrun/rerun.cpp +991 -0
  331. plumed/_lib/lib/plumed/patches/gromacs-2023.5.diff/src/gromacs/mdrun/rerun.cpp.preplumed +900 -0
  332. plumed/_lib/lib/plumed/patches/gromacs-2023.5.diff/src/gromacs/mdrun/runner.cpp +2895 -0
  333. plumed/_lib/lib/plumed/patches/gromacs-2023.5.diff/src/gromacs/mdrun/runner.cpp.preplumed +2849 -0
  334. plumed/_lib/lib/plumed/patches/gromacs-2023.5.diff/src/gromacs/modularsimulator/expandedensembleelement.cpp +224 -0
  335. plumed/_lib/lib/plumed/patches/gromacs-2023.5.diff/src/gromacs/modularsimulator/expandedensembleelement.cpp.preplumed +222 -0
  336. plumed/_lib/lib/plumed/patches/gromacs-2023.5.diff/src/gromacs/taskassignment/decidegpuusage.cpp +886 -0
  337. plumed/_lib/lib/plumed/patches/gromacs-2023.5.diff/src/gromacs/taskassignment/decidegpuusage.cpp.preplumed +880 -0
  338. plumed/_lib/lib/plumed/patches/gromacs-2023.5.diff/src/gromacs/taskassignment/include/gromacs/taskassignment/decidegpuusage.h +347 -0
  339. plumed/_lib/lib/plumed/patches/gromacs-2023.5.diff/src/gromacs/taskassignment/include/gromacs/taskassignment/decidegpuusage.h.preplumed +345 -0
  340. plumed/_lib/lib/plumed/patches/gromacs-2024.3.config +43 -0
  341. plumed/_lib/lib/plumed/patches/gromacs-2024.3.diff/src/gromacs/CMakeLists.txt +575 -0
  342. plumed/_lib/lib/plumed/patches/gromacs-2024.3.diff/src/gromacs/CMakeLists.txt.preplumed +572 -0
  343. plumed/_lib/lib/plumed/patches/gromacs-2024.3.diff/src/gromacs/mdlib/expanded.cpp +1632 -0
  344. plumed/_lib/lib/plumed/patches/gromacs-2024.3.diff/src/gromacs/mdlib/expanded.cpp.preplumed +1594 -0
  345. plumed/_lib/lib/plumed/patches/gromacs-2024.3.diff/src/gromacs/mdlib/expanded.h +104 -0
  346. plumed/_lib/lib/plumed/patches/gromacs-2024.3.diff/src/gromacs/mdlib/expanded.h.preplumed +100 -0
  347. plumed/_lib/lib/plumed/patches/gromacs-2024.3.diff/src/gromacs/mdlib/sim_util.cpp +2564 -0
  348. plumed/_lib/lib/plumed/patches/gromacs-2024.3.diff/src/gromacs/mdlib/sim_util.cpp.preplumed +2550 -0
  349. plumed/_lib/lib/plumed/patches/gromacs-2024.3.diff/src/gromacs/mdrun/legacymdrunoptions.cpp +208 -0
  350. plumed/_lib/lib/plumed/patches/gromacs-2024.3.diff/src/gromacs/mdrun/legacymdrunoptions.cpp.preplumed +175 -0
  351. plumed/_lib/lib/plumed/patches/gromacs-2024.3.diff/src/gromacs/mdrun/legacymdrunoptions.h +410 -0
  352. plumed/_lib/lib/plumed/patches/gromacs-2024.3.diff/src/gromacs/mdrun/legacymdrunoptions.h.preplumed +396 -0
  353. plumed/_lib/lib/plumed/patches/gromacs-2024.3.diff/src/gromacs/mdrun/md.cpp +2435 -0
  354. plumed/_lib/lib/plumed/patches/gromacs-2024.3.diff/src/gromacs/mdrun/md.cpp.preplumed +2187 -0
  355. plumed/_lib/lib/plumed/patches/gromacs-2024.3.diff/src/gromacs/mdrun/minimize.cpp +3592 -0
  356. plumed/_lib/lib/plumed/patches/gromacs-2024.3.diff/src/gromacs/mdrun/minimize.cpp.preplumed +3514 -0
  357. plumed/_lib/lib/plumed/patches/gromacs-2024.3.diff/src/gromacs/mdrun/replicaexchange.cpp +1513 -0
  358. plumed/_lib/lib/plumed/patches/gromacs-2024.3.diff/src/gromacs/mdrun/replicaexchange.cpp.preplumed +1409 -0
  359. plumed/_lib/lib/plumed/patches/gromacs-2024.3.diff/src/gromacs/mdrun/replicaexchange.h +114 -0
  360. plumed/_lib/lib/plumed/patches/gromacs-2024.3.diff/src/gromacs/mdrun/replicaexchange.h.preplumed +106 -0
  361. plumed/_lib/lib/plumed/patches/gromacs-2024.3.diff/src/gromacs/mdrun/rerun.cpp +958 -0
  362. plumed/_lib/lib/plumed/patches/gromacs-2024.3.diff/src/gromacs/mdrun/rerun.cpp.preplumed +929 -0
  363. plumed/_lib/lib/plumed/patches/gromacs-2024.3.diff/src/gromacs/mdrun/runner.cpp +2987 -0
  364. plumed/_lib/lib/plumed/patches/gromacs-2024.3.diff/src/gromacs/mdrun/runner.cpp.preplumed +2941 -0
  365. plumed/_lib/lib/plumed/patches/gromacs-2024.3.diff/src/gromacs/modularsimulator/expandedensembleelement.cpp +224 -0
  366. plumed/_lib/lib/plumed/patches/gromacs-2024.3.diff/src/gromacs/modularsimulator/expandedensembleelement.cpp.preplumed +222 -0
  367. plumed/_lib/lib/plumed/patches/gromacs-2024.3.diff/src/gromacs/taskassignment/decidegpuusage.cpp +904 -0
  368. plumed/_lib/lib/plumed/patches/gromacs-2024.3.diff/src/gromacs/taskassignment/decidegpuusage.cpp.preplumed +898 -0
  369. plumed/_lib/lib/plumed/patches/gromacs-2024.3.diff/src/gromacs/taskassignment/include/gromacs/taskassignment/decidegpuusage.h +353 -0
  370. plumed/_lib/lib/plumed/patches/gromacs-2024.3.diff/src/gromacs/taskassignment/include/gromacs/taskassignment/decidegpuusage.h.preplumed +351 -0
  371. plumed/_lib/lib/plumed/patches/gromacs-2025.0.config +39 -0
  372. plumed/_lib/lib/plumed/patches/gromacs-2025.0.diff/cmake/gmxManagePlumed.cmake +82 -0
  373. plumed/_lib/lib/plumed/patches/gromacs-2025.0.diff/cmake/gmxManagePlumed.cmake.preplumed +82 -0
  374. plumed/_lib/lib/plumed/patches/gromacs-2025.0.diff/src/gromacs/applied_forces/plumed/plumedMDModule.cpp +162 -0
  375. plumed/_lib/lib/plumed/patches/gromacs-2025.0.diff/src/gromacs/applied_forces/plumed/plumedMDModule.cpp.preplumed +154 -0
  376. plumed/_lib/lib/plumed/patches/gromacs-2025.0.diff/src/gromacs/applied_forces/plumed/plumedOptions.cpp +107 -0
  377. plumed/_lib/lib/plumed/patches/gromacs-2025.0.diff/src/gromacs/applied_forces/plumed/plumedOptions.cpp.preplumed +99 -0
  378. plumed/_lib/lib/plumed/patches/gromacs-2025.0.diff/src/gromacs/applied_forces/plumed/plumedOptions.h +120 -0
  379. plumed/_lib/lib/plumed/patches/gromacs-2025.0.diff/src/gromacs/applied_forces/plumed/plumedOptions.h.preplumed +111 -0
  380. plumed/_lib/lib/plumed/patches/gromacs-2025.0.diff/src/gromacs/applied_forces/plumed/plumedforceprovider.cpp +215 -0
  381. plumed/_lib/lib/plumed/patches/gromacs-2025.0.diff/src/gromacs/applied_forces/plumed/plumedforceprovider.cpp.preplumed +197 -0
  382. plumed/_lib/lib/plumed/patches/gromacs-2025.0.diff/src/gromacs/applied_forces/plumed/plumedforceprovider.h +87 -0
  383. plumed/_lib/lib/plumed/patches/gromacs-2025.0.diff/src/gromacs/applied_forces/plumed/plumedforceprovider.h.preplumed +86 -0
  384. plumed/_lib/lib/plumed/patches/gromacs-2025.0.diff/src/gromacs/mdrun/runner.cpp +2971 -0
  385. plumed/_lib/lib/plumed/patches/gromacs-2025.0.diff/src/gromacs/mdrun/runner.cpp.preplumed +2970 -0
  386. plumed/_lib/lib/plumed/patches/gromacs-2025.0.diff/src/gromacs/mdrunutility/mdmodulesnotifiers.h +430 -0
  387. plumed/_lib/lib/plumed/patches/gromacs-2025.0.diff/src/gromacs/mdrunutility/mdmodulesnotifiers.h.preplumed +429 -0
  388. plumed/_lib/lib/plumed/patches/namd-2.12.config +30 -0
  389. plumed/_lib/lib/plumed/patches/namd-2.12.diff +267 -0
  390. plumed/_lib/lib/plumed/patches/namd-2.13.config +30 -0
  391. plumed/_lib/lib/plumed/patches/namd-2.13.diff +267 -0
  392. plumed/_lib/lib/plumed/patches/namd-2.14.config +30 -0
  393. plumed/_lib/lib/plumed/patches/namd-2.14.diff +268 -0
  394. plumed/_lib/lib/plumed/patches/patch.sh +500 -0
  395. plumed/_lib/lib/plumed/patches/qespresso-5.0.2.config +25 -0
  396. plumed/_lib/lib/plumed/patches/qespresso-5.0.2.diff/PW/src/forces.f90 +368 -0
  397. plumed/_lib/lib/plumed/patches/qespresso-5.0.2.diff/PW/src/forces.f90.preplumed +366 -0
  398. plumed/_lib/lib/plumed/patches/qespresso-5.0.2.diff/PW/src/plugin_forces.f90 +71 -0
  399. plumed/_lib/lib/plumed/patches/qespresso-5.0.2.diff/PW/src/plugin_forces.f90.preplumed +24 -0
  400. plumed/_lib/lib/plumed/patches/qespresso-5.0.2.diff/PW/src/plugin_initialization.f90 +62 -0
  401. plumed/_lib/lib/plumed/patches/qespresso-5.0.2.diff/PW/src/plugin_initialization.f90.preplumed +21 -0
  402. plumed/_lib/lib/plumed/patches/qespresso-5.0.2.diff/PW/src/pwscf.f90 +189 -0
  403. plumed/_lib/lib/plumed/patches/qespresso-5.0.2.diff/PW/src/pwscf.f90.preplumed +185 -0
  404. plumed/_lib/lib/plumed/patches/qespresso-6.2.config +26 -0
  405. plumed/_lib/lib/plumed/patches/qespresso-6.2.diff/PW/src/forces.f90 +422 -0
  406. plumed/_lib/lib/plumed/patches/qespresso-6.2.diff/PW/src/forces.f90.preplumed +420 -0
  407. plumed/_lib/lib/plumed/patches/qespresso-6.2.diff/PW/src/plugin_ext_forces.f90 +70 -0
  408. plumed/_lib/lib/plumed/patches/qespresso-6.2.diff/PW/src/plugin_ext_forces.f90.preplumed +23 -0
  409. plumed/_lib/lib/plumed/patches/qespresso-6.2.diff/PW/src/plugin_initialization.f90 +62 -0
  410. plumed/_lib/lib/plumed/patches/qespresso-6.2.diff/PW/src/plugin_initialization.f90.preplumed +21 -0
  411. plumed/_lib/lib/plumed/patches/qespresso-6.2.diff/PW/src/run_pwscf.f90 +233 -0
  412. plumed/_lib/lib/plumed/patches/qespresso-6.2.diff/PW/src/run_pwscf.f90.preplumed +230 -0
  413. plumed/_lib/lib/plumed/patches/qespresso-7.0.config +28 -0
  414. plumed/_lib/lib/plumed/patches/qespresso-7.0.diff/Modules/Makefile +175 -0
  415. plumed/_lib/lib/plumed/patches/qespresso-7.0.diff/Modules/Makefile.preplumed +171 -0
  416. plumed/_lib/lib/plumed/patches/qespresso-7.0.diff/PW/src/forces.f90 +486 -0
  417. plumed/_lib/lib/plumed/patches/qespresso-7.0.diff/PW/src/forces.f90.preplumed +484 -0
  418. plumed/_lib/lib/plumed/patches/qespresso-7.0.diff/PW/src/plugin_ext_forces.f90 +74 -0
  419. plumed/_lib/lib/plumed/patches/qespresso-7.0.diff/PW/src/plugin_ext_forces.f90.preplumed +23 -0
  420. plumed/_lib/lib/plumed/patches/qespresso-7.0.diff/PW/src/plugin_initialization.f90 +64 -0
  421. plumed/_lib/lib/plumed/patches/qespresso-7.0.diff/PW/src/plugin_initialization.f90.preplumed +21 -0
  422. plumed/_lib/lib/plumed/patches/qespresso-7.0.diff/PW/src/run_pwscf.f90 +532 -0
  423. plumed/_lib/lib/plumed/patches/qespresso-7.0.diff/PW/src/run_pwscf.f90.preplumed +518 -0
  424. plumed/_lib/lib/plumed/patches/qespresso-7.2.config +28 -0
  425. plumed/_lib/lib/plumed/patches/qespresso-7.2.diff/Modules/Makefile +249 -0
  426. plumed/_lib/lib/plumed/patches/qespresso-7.2.diff/Modules/Makefile.preplumed +244 -0
  427. plumed/_lib/lib/plumed/patches/qespresso-7.2.diff/PW/src/forces.f90 +532 -0
  428. plumed/_lib/lib/plumed/patches/qespresso-7.2.diff/PW/src/forces.f90.preplumed +535 -0
  429. plumed/_lib/lib/plumed/patches/qespresso-7.2.diff/PW/src/plugin_ext_forces.f90 +74 -0
  430. plumed/_lib/lib/plumed/patches/qespresso-7.2.diff/PW/src/plugin_ext_forces.f90.preplumed +23 -0
  431. plumed/_lib/lib/plumed/patches/qespresso-7.2.diff/PW/src/plugin_initialization.f90 +64 -0
  432. plumed/_lib/lib/plumed/patches/qespresso-7.2.diff/PW/src/plugin_initialization.f90.preplumed +21 -0
  433. plumed/_lib/lib/plumed/patches/qespresso-7.2.diff/PW/src/run_pwscf.f90 +569 -0
  434. plumed/_lib/lib/plumed/patches/qespresso-7.2.diff/PW/src/run_pwscf.f90.preplumed +560 -0
  435. plumed/_lib/lib/plumed/plumed-config +9 -0
  436. plumed/_lib/lib/plumed/plumed-mklib +9 -0
  437. plumed/_lib/lib/plumed/plumed-newcv +9 -0
  438. plumed/_lib/lib/plumed/plumed-partial_tempering +9 -0
  439. plumed/_lib/lib/plumed/plumed-patch +9 -0
  440. plumed/_lib/lib/plumed/plumed-runtime +0 -0
  441. plumed/_lib/lib/plumed/plumed-selector +9 -0
  442. plumed/_lib/lib/plumed/plumed-vim2html +9 -0
  443. plumed/_lib/lib/plumed/scripts/config.sh +126 -0
  444. plumed/_lib/lib/plumed/scripts/mklib.sh +175 -0
  445. plumed/_lib/lib/plumed/scripts/newcv.sh +26 -0
  446. plumed/_lib/lib/plumed/scripts/partial_tempering.sh +319 -0
  447. plumed/_lib/lib/plumed/scripts/patch.sh +4 -0
  448. plumed/_lib/lib/plumed/scripts/selector.sh +234 -0
  449. plumed/_lib/lib/plumed/scripts/vim2html.sh +190 -0
  450. plumed/_lib/lib/plumed/src/colvar/Template.cpp +116 -0
  451. plumed/_lib/lib/plumed/src/config/compile_options.sh +3 -0
  452. plumed/_lib/lib/plumed/src/config/config.txt +181 -0
  453. plumed/_lib/lib/plumed/src/lib/Plumed.cmake +6 -0
  454. plumed/_lib/lib/plumed/src/lib/Plumed.cmake.runtime +5 -0
  455. plumed/_lib/lib/plumed/src/lib/Plumed.cmake.shared +5 -0
  456. plumed/_lib/lib/plumed/src/lib/Plumed.cmake.static +3 -0
  457. plumed/_lib/lib/plumed/src/lib/Plumed.inc +6 -0
  458. plumed/_lib/lib/plumed/src/lib/Plumed.inc.runtime +5 -0
  459. plumed/_lib/lib/plumed/src/lib/Plumed.inc.shared +5 -0
  460. plumed/_lib/lib/plumed/src/lib/Plumed.inc.static +3 -0
  461. plumed/_lib/lib/plumed/vim/scripts.vim +6 -0
  462. plumed/_plumed_core.cpython-311-darwin.so +0 -0
  463. plumed/_plumed_core.cpython-312-darwin.so +0 -0
  464. plumed/_plumed_core.cpython-313-darwin.so +0 -0
  465. plumedCommunications.cpython-311-darwin.so +0 -0
  466. plumedCommunications.cpython-312-darwin.so +0 -0
  467. plumedCommunications.cpython-313-darwin.so +0 -0
  468. plumedCommunications.pyi +431 -0
plumed/_lib/lib/plumed/patches/gromacs-2022.5.diff/src/gromacs/mdlib/sim_util.cpp.preplumed
@@ -0,0 +1,2513 @@
+ /*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright 1991- The GROMACS Authors
+ * and the project initiators Erik Lindahl, Berk Hess and David van der Spoel.
+ * Consult the AUTHORS/COPYING files and https://www.gromacs.org for details.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * https://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at https://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out https://www.gromacs.org.
+ */
+ #include "gmxpre.h"
+
+ #include "config.h"
+
+ #include <cmath>
+ #include <cstdint>
+ #include <cstdio>
+ #include <cstring>
+
+ #include <array>
+ #include <optional>
+
+ #include "gromacs/applied_forces/awh/awh.h"
+ #include "gromacs/domdec/dlbtiming.h"
+ #include "gromacs/domdec/domdec.h"
+ #include "gromacs/domdec/domdec_struct.h"
+ #include "gromacs/domdec/gpuhaloexchange.h"
+ #include "gromacs/domdec/partition.h"
+ #include "gromacs/essentialdynamics/edsam.h"
+ #include "gromacs/ewald/pme.h"
+ #include "gromacs/ewald/pme_coordinate_receiver_gpu.h"
+ #include "gromacs/ewald/pme_pp.h"
+ #include "gromacs/ewald/pme_pp_comm_gpu.h"
+ #include "gromacs/gmxlib/network.h"
+ #include "gromacs/gmxlib/nonbonded/nb_free_energy.h"
+ #include "gromacs/gmxlib/nonbonded/nonbonded.h"
+ #include "gromacs/gmxlib/nrnb.h"
+ #include "gromacs/gpu_utils/gpu_utils.h"
+ #include "gromacs/imd/imd.h"
+ #include "gromacs/listed_forces/disre.h"
+ #include "gromacs/listed_forces/listed_forces_gpu.h"
+ #include "gromacs/listed_forces/listed_forces.h"
+ #include "gromacs/listed_forces/orires.h"
+ #include "gromacs/math/arrayrefwithpadding.h"
+ #include "gromacs/math/functions.h"
+ #include "gromacs/math/units.h"
+ #include "gromacs/math/vec.h"
+ #include "gromacs/math/vecdump.h"
+ #include "gromacs/mdlib/calcmu.h"
+ #include "gromacs/mdlib/calcvir.h"
+ #include "gromacs/mdlib/constr.h"
+ #include "gromacs/mdlib/dispersioncorrection.h"
+ #include "gromacs/mdlib/enerdata_utils.h"
+ #include "gromacs/mdlib/force.h"
+ #include "gromacs/mdlib/force_flags.h"
+ #include "gromacs/mdlib/forcerec.h"
+ #include "gromacs/mdlib/gmx_omp_nthreads.h"
+ #include "gromacs/mdlib/update.h"
+ #include "gromacs/mdlib/vsite.h"
+ #include "gromacs/mdlib/wall.h"
+ #include "gromacs/mdlib/wholemoleculetransform.h"
+ #include "gromacs/mdtypes/commrec.h"
+ #include "gromacs/mdtypes/enerdata.h"
+ #include "gromacs/mdtypes/forcebuffers.h"
+ #include "gromacs/mdtypes/forceoutput.h"
+ #include "gromacs/mdtypes/forcerec.h"
+ #include "gromacs/mdtypes/iforceprovider.h"
+ #include "gromacs/mdtypes/inputrec.h"
+ #include "gromacs/mdtypes/md_enums.h"
+ #include "gromacs/mdtypes/mdatom.h"
+ #include "gromacs/mdtypes/multipletimestepping.h"
+ #include "gromacs/mdtypes/simulation_workload.h"
+ #include "gromacs/mdtypes/state.h"
+ #include "gromacs/mdtypes/state_propagator_data_gpu.h"
+ #include "gromacs/nbnxm/gpu_data_mgmt.h"
+ #include "gromacs/nbnxm/nbnxm.h"
+ #include "gromacs/nbnxm/nbnxm_gpu.h"
+ #include "gromacs/pbcutil/ishift.h"
+ #include "gromacs/pbcutil/pbc.h"
+ #include "gromacs/pulling/pull.h"
+ #include "gromacs/pulling/pull_rotation.h"
+ #include "gromacs/timing/cyclecounter.h"
+ #include "gromacs/timing/gpu_timing.h"
+ #include "gromacs/timing/wallcycle.h"
+ #include "gromacs/timing/wallcyclereporting.h"
+ #include "gromacs/timing/walltime_accounting.h"
+ #include "gromacs/topology/topology.h"
+ #include "gromacs/utility/arrayref.h"
+ #include "gromacs/utility/basedefinitions.h"
+ #include "gromacs/utility/cstringutil.h"
+ #include "gromacs/utility/exceptions.h"
+ #include "gromacs/utility/fatalerror.h"
+ #include "gromacs/utility/fixedcapacityvector.h"
+ #include "gromacs/utility/gmxassert.h"
+ #include "gromacs/utility/gmxmpi.h"
+ #include "gromacs/utility/logger.h"
+ #include "gromacs/utility/smalloc.h"
+ #include "gromacs/utility/strconvert.h"
+ #include "gromacs/utility/stringutil.h"
+ #include "gromacs/utility/sysinfo.h"
+
+ #include "gpuforcereduction.h"
+
+ using gmx::ArrayRef;
+ using gmx::AtomLocality;
+ using gmx::DomainLifetimeWorkload;
+ using gmx::ForceOutputs;
+ using gmx::ForceWithShiftForces;
+ using gmx::InteractionLocality;
+ using gmx::RVec;
+ using gmx::SimulationWorkload;
+ using gmx::StepWorkload;
+
+ // TODO: this environment variable allows us to verify before release
+ // that on less common architectures the total cost of polling is not larger than
+ // a blocking wait (so polling does not introduce overhead when the static
+ // PME-first ordering would suffice).
+ static const bool c_disableAlternatingWait = (getenv("GMX_DISABLE_ALTERNATING_GPU_WAIT") != nullptr);
+
+ static void sum_forces(ArrayRef<RVec> f, ArrayRef<const RVec> forceToAdd)
+ {
+ GMX_ASSERT(f.size() >= forceToAdd.size(), "Accumulation buffer should be sufficiently large");
+ const int end = forceToAdd.size();
+
+ int gmx_unused nt = gmx_omp_nthreads_get(ModuleMultiThread::Default);
+ #pragma omp parallel for num_threads(nt) schedule(static)
+ for (int i = 0; i < end; i++)
+ {
+ rvec_inc(f[i], forceToAdd[i]);
+ }
+ }
+
+ static void calc_virial(int start,
+ int homenr,
+ const rvec x[],
+ const gmx::ForceWithShiftForces& forceWithShiftForces,
+ tensor vir_part,
+ const matrix box,
+ t_nrnb* nrnb,
+ const t_forcerec* fr,
+ PbcType pbcType)
+ {
+ /* The short-range virial from surrounding boxes */
+ const rvec* fshift = as_rvec_array(forceWithShiftForces.shiftForces().data());
+ const rvec* shiftVecPointer = as_rvec_array(fr->shift_vec.data());
+ calc_vir(gmx::c_numShiftVectors, shiftVecPointer, fshift, vir_part, pbcType == PbcType::Screw, box);
+ inc_nrnb(nrnb, eNR_VIRIAL, gmx::c_numShiftVectors);
+
+ /* Calculate partial virial, for local atoms only, based on short range.
+ * Total virial is computed in global_stat, called from do_md
+ */
+ const rvec* f = as_rvec_array(forceWithShiftForces.force().data());
+ f_calc_vir(start, start + homenr, x, f, vir_part, box);
+ inc_nrnb(nrnb, eNR_VIRIAL, homenr);
+
+ if (debug)
+ {
+ pr_rvecs(debug, 0, "vir_part", vir_part, DIM);
+ }
+ }
+
+ static void pull_potential_wrapper(const t_commrec* cr,
+ const t_inputrec& ir,
+ const matrix box,
+ gmx::ArrayRef<const gmx::RVec> x,
+ const t_mdatoms* mdatoms,
+ gmx_enerdata_t* enerd,
+ pull_t* pull_work,
+ const real* lambda,
+ double t,
+ gmx_wallcycle* wcycle)
+ {
+ t_pbc pbc;
+ real dvdl;
+
+ /* Calculate the center of mass forces, this requires communication,
+ * which is why pull_potential is called close to other communication.
+ */
+ wallcycle_start(wcycle, WallCycleCounter::PullPot);
+ set_pbc(&pbc, ir.pbcType, box);
+ dvdl = 0;
+ enerd->term[F_COM_PULL] +=
+ pull_potential(pull_work,
+ gmx::arrayRefFromArray(mdatoms->massT, mdatoms->nr),
+ pbc,
+ cr,
+ t,
+ lambda[static_cast<int>(FreeEnergyPerturbationCouplingType::Restraint)],
+ x,
+ &dvdl);
+ enerd->dvdl_lin[FreeEnergyPerturbationCouplingType::Restraint] += dvdl;
+ wallcycle_stop(wcycle, WallCycleCounter::PullPot);
+ }
+
+ static void pme_receive_force_ener(t_forcerec* fr,
+ const t_commrec* cr,
+ gmx::ForceWithVirial* forceWithVirial,
+ gmx_enerdata_t* enerd,
+ bool useGpuPmePpComms,
+ bool receivePmeForceToGpu,
+ gmx_wallcycle* wcycle)
+ {
+ real e_q, e_lj, dvdl_q, dvdl_lj;
+ float cycles_ppdpme, cycles_seppme;
+
+ cycles_ppdpme = wallcycle_stop(wcycle, WallCycleCounter::PpDuringPme);
+ dd_cycles_add(cr->dd, cycles_ppdpme, ddCyclPPduringPME);
+
+ /* In case of node-splitting, the PP nodes receive the long-range
+ * forces, virial and energy from the PME nodes here.
+ */
+ wallcycle_start(wcycle, WallCycleCounter::PpPmeWaitRecvF);
+ dvdl_q = 0;
+ dvdl_lj = 0;
+ gmx_pme_receive_f(fr->pmePpCommGpu.get(),
+ cr,
+ forceWithVirial,
+ &e_q,
+ &e_lj,
+ &dvdl_q,
+ &dvdl_lj,
+ useGpuPmePpComms,
+ receivePmeForceToGpu,
+ &cycles_seppme);
+ enerd->term[F_COUL_RECIP] += e_q;
+ enerd->term[F_LJ_RECIP] += e_lj;
+ enerd->dvdl_lin[FreeEnergyPerturbationCouplingType::Coul] += dvdl_q;
+ enerd->dvdl_lin[FreeEnergyPerturbationCouplingType::Vdw] += dvdl_lj;
+
+ if (wcycle)
+ {
+ dd_cycles_add(cr->dd, cycles_seppme, ddCyclPME);
+ }
+ wallcycle_stop(wcycle, WallCycleCounter::PpPmeWaitRecvF);
+ }
+
+ static void print_large_forces(FILE* fp,
+ const t_mdatoms* md,
+ const t_commrec* cr,
+ int64_t step,
+ real forceTolerance,
+ ArrayRef<const RVec> x,
+ ArrayRef<const RVec> f)
+ {
+ real force2Tolerance = gmx::square(forceTolerance);
+ gmx::index numNonFinite = 0;
+ for (int i = 0; i < md->homenr; i++)
+ {
+ real force2 = norm2(f[i]);
+ bool nonFinite = !std::isfinite(force2);
+ if (force2 >= force2Tolerance || nonFinite)
+ {
+ fprintf(fp,
+ "step %" PRId64 " atom %6d x %8.3f %8.3f %8.3f force %12.5e\n",
+ step,
+ ddglatnr(cr->dd, i),
+ x[i][XX],
+ x[i][YY],
+ x[i][ZZ],
+ std::sqrt(force2));
+ }
+ if (nonFinite)
+ {
+ numNonFinite++;
+ }
+ }
+ if (numNonFinite > 0)
+ {
+ /* Note that with MPI this fatal call on one rank might interrupt
+ * the printing on other ranks. But we can only avoid that with
+ * an expensive MPI barrier that we would need at each step.
+ */
+ gmx_fatal(FARGS, "At step %" PRId64 " detected non-finite forces on %td atoms", step, numNonFinite);
+ }
+ }
+
+ //! When necessary, spreads forces on vsites and computes the virial for \p forceOutputs->forceWithShiftForces()
+ static void postProcessForceWithShiftForces(t_nrnb* nrnb,
+ gmx_wallcycle* wcycle,
+ const matrix box,
+ ArrayRef<const RVec> x,
+ ForceOutputs* forceOutputs,
+ tensor vir_force,
+ const t_mdatoms& mdatoms,
+ const t_forcerec& fr,
+ gmx::VirtualSitesHandler* vsite,
+ const StepWorkload& stepWork)
+ {
+ ForceWithShiftForces& forceWithShiftForces = forceOutputs->forceWithShiftForces();
+
+ /* If we have NoVirSum forces, but we do not calculate the virial,
+ * we later sum the forceWithShiftForces buffer together with
+ * the noVirSum buffer and spread the combined vsite forces at once.
+ */
+ if (vsite && (!forceOutputs->haveForceWithVirial() || stepWork.computeVirial))
+ {
+ using VirialHandling = gmx::VirtualSitesHandler::VirialHandling;
+
+ auto f = forceWithShiftForces.force();
+ auto fshift = forceWithShiftForces.shiftForces();
+ const VirialHandling virialHandling =
+ (stepWork.computeVirial ? VirialHandling::Pbc : VirialHandling::None);
+ vsite->spreadForces(x, f, virialHandling, fshift, nullptr, nrnb, box, wcycle);
+ forceWithShiftForces.haveSpreadVsiteForces() = true;
+ }
+
+ if (stepWork.computeVirial)
+ {
+ /* Calculation of the virial must be done after vsites! */
+ calc_virial(
+ 0, mdatoms.homenr, as_rvec_array(x.data()), forceWithShiftForces, vir_force, box, nrnb, &fr, fr.pbcType);
+ }
+ }
+
+ //! Spread, compute virial for and sum forces, when necessary
+ static void postProcessForces(const t_commrec* cr,
+ int64_t step,
+ t_nrnb* nrnb,
+ gmx_wallcycle* wcycle,
+ const matrix box,
+ ArrayRef<const RVec> x,
+ ForceOutputs* forceOutputs,
+ tensor vir_force,
+ const t_mdatoms* mdatoms,
+ const t_forcerec* fr,
+ gmx::VirtualSitesHandler* vsite,
+ const StepWorkload& stepWork)
+ {
+ // Extract the final output force buffer, which is also the buffer for forces with shift forces
+ ArrayRef<RVec> f = forceOutputs->forceWithShiftForces().force();
+
+ if (forceOutputs->haveForceWithVirial())
+ {
+ auto& forceWithVirial = forceOutputs->forceWithVirial();
+
+ if (vsite)
+ {
+ /* Spread the mesh force on virtual sites to the other particles...
+ * This is parallellized. MPI communication is performed
+ * if the constructing atoms aren't local.
+ */
+ GMX_ASSERT(!stepWork.computeVirial || f.data() != forceWithVirial.force_.data(),
+ "We need separate force buffers for shift and virial forces when "
+ "computing the virial");
+ GMX_ASSERT(!stepWork.computeVirial
+ || forceOutputs->forceWithShiftForces().haveSpreadVsiteForces(),
+ "We should spread the force with shift forces separately when computing "
+ "the virial");
+ const gmx::VirtualSitesHandler::VirialHandling virialHandling =
+ (stepWork.computeVirial ? gmx::VirtualSitesHandler::VirialHandling::NonLinear
+ : gmx::VirtualSitesHandler::VirialHandling::None);
+ matrix virial = { { 0 } };
+ vsite->spreadForces(x, forceWithVirial.force_, virialHandling, {}, virial, nrnb, box, wcycle);
+ forceWithVirial.addVirialContribution(virial);
+ }
+
+ if (stepWork.computeVirial)
+ {
+ /* Now add the forces, this is local */
+ sum_forces(f, forceWithVirial.force_);
+
+ /* Add the direct virial contributions */
+ GMX_ASSERT(
+ forceWithVirial.computeVirial_,
+ "forceWithVirial should request virial computation when we request the virial");
+ m_add(vir_force, forceWithVirial.getVirial(), vir_force);
+
+ if (debug)
+ {
+ pr_rvecs(debug, 0, "vir_force", vir_force, DIM);
+ }
+ }
+ }
+ else
+ {
+ GMX_ASSERT(vsite == nullptr || forceOutputs->forceWithShiftForces().haveSpreadVsiteForces(),
+ "We should have spread the vsite forces (earlier)");
+ }
+
+ if (fr->print_force >= 0)
+ {
+ print_large_forces(stderr, mdatoms, cr, step, fr->print_force, x, f);
+ }
+ }
+
+ static void do_nb_verlet(t_forcerec* fr,
+ const interaction_const_t* ic,
+ gmx_enerdata_t* enerd,
+ const StepWorkload& stepWork,
+ const InteractionLocality ilocality,
+ const int clearF,
+ const int64_t step,
+ t_nrnb* nrnb,
+ gmx_wallcycle* wcycle)
+ {
+ if (!stepWork.computeNonbondedForces)
+ {
+ /* skip non-bonded calculation */
+ return;
+ }
+
+ nonbonded_verlet_t* nbv = fr->nbv.get();
+
+ /* GPU kernel launch overhead is already timed separately */
+ if (!nbv->useGpu())
+ {
+ /* When dynamic pair-list pruning is requested, we need to prune
+ * at nstlistPrune steps.
+ */
+ if (nbv->isDynamicPruningStepCpu(step))
+ {
+ /* Prune the pair-list beyond fr->ic->rlistPrune using
+ * the current coordinates of the atoms.
+ */
+ wallcycle_sub_start(wcycle, WallCycleSubCounter::NonbondedPruning);
+ nbv->dispatchPruneKernelCpu(ilocality, fr->shift_vec);
+ wallcycle_sub_stop(wcycle, WallCycleSubCounter::NonbondedPruning);
+ }
+ }
+
+ nbv->dispatchNonbondedKernel(
+ ilocality,
+ *ic,
+ stepWork,
+ clearF,
+ fr->shift_vec,
+ enerd->grpp.energyGroupPairTerms[fr->haveBuckingham ? NonBondedEnergyTerms::BuckinghamSR
+ : NonBondedEnergyTerms::LJSR],
+ enerd->grpp.energyGroupPairTerms[NonBondedEnergyTerms::CoulombSR],
+ nrnb);
+ }
+
+ static inline void clearRVecs(ArrayRef<RVec> v, const bool useOpenmpThreading)
+ {
+ int nth = gmx_omp_nthreads_get_simple_rvec_task(ModuleMultiThread::Default, v.ssize());
+
+ /* Note that we would like to avoid this conditional by putting it
+ * into the omp pragma instead, but then we still take the full
+ * omp parallel for overhead (at least with gcc5).
+ */
+ if (!useOpenmpThreading || nth == 1)
+ {
+ for (RVec& elem : v)
+ {
+ clear_rvec(elem);
+ }
+ }
+ else
+ {
+ #pragma omp parallel for num_threads(nth) schedule(static)
+ for (gmx::index i = 0; i < v.ssize(); i++)
+ {
+ clear_rvec(v[i]);
+ }
+ }
+ }
+
+ /*! \brief Return an estimate of the average kinetic energy or 0 when unreliable
+ *
+ * \param groupOptions Group options, containing T-coupling options
+ */
+ static real averageKineticEnergyEstimate(const t_grpopts& groupOptions)
+ {
+ real nrdfCoupled = 0;
+ real nrdfUncoupled = 0;
+ real kineticEnergy = 0;
+ for (int g = 0; g < groupOptions.ngtc; g++)
+ {
+ if (groupOptions.tau_t[g] >= 0)
+ {
+ nrdfCoupled += groupOptions.nrdf[g];
+ kineticEnergy += groupOptions.nrdf[g] * 0.5 * groupOptions.ref_t[g] * gmx::c_boltz;
+ }
+ else
+ {
+ nrdfUncoupled += groupOptions.nrdf[g];
+ }
+ }
+
+ /* This conditional with > also catches nrdf=0 */
+ if (nrdfCoupled > nrdfUncoupled)
+ {
+ return kineticEnergy * (nrdfCoupled + nrdfUncoupled) / nrdfCoupled;
+ }
+ else
+ {
+ return 0;
+ }
+ }
+
+ /*! \brief This routine checks that the potential energy is finite.
+ *
+ * Always checks that the potential energy is finite. If step equals
+ * inputrec.init_step also checks that the magnitude of the potential energy
+ * is reasonable. Terminates with a fatal error when a check fails.
+ * Note that passing this check does not guarantee finite forces,
+ * since those use slightly different arithmetics. But in most cases
+ * there is just a narrow coordinate range where forces are not finite
+ * and energies are finite.
+ *
+ * \param[in] step The step number, used for checking and printing
+ * \param[in] enerd The energy data; the non-bonded group energies need to be added to
+ * \c enerd.term[F_EPOT] before calling this routine
+ * \param[in] inputrec The input record
+ */
+ static void checkPotentialEnergyValidity(int64_t step, const gmx_enerdata_t& enerd, const t_inputrec& inputrec)
+ {
+ /* Threshold valid for comparing absolute potential energy against
+ * the kinetic energy. Normally one should not consider absolute
+ * potential energy values, but with a factor of one million
+ * we should never get false positives.
+ */
+ constexpr real c_thresholdFactor = 1e6;
+
+ bool energyIsNotFinite = !std::isfinite(enerd.term[F_EPOT]);
+ real averageKineticEnergy = 0;
+ /* We only check for large potential energy at the initial step,
+ * because that is by far the most likely step for this too occur
+ * and because computing the average kinetic energy is not free.
+ * Note: nstcalcenergy >> 1 often does not allow to catch large energies
+ * before they become NaN.
+ */
+ if (step == inputrec.init_step && EI_DYNAMICS(inputrec.eI))
+ {
+ averageKineticEnergy = averageKineticEnergyEstimate(inputrec.opts);
+ }
+
+ if (energyIsNotFinite
+ || (averageKineticEnergy > 0 && enerd.term[F_EPOT] > c_thresholdFactor * averageKineticEnergy))
+ {
+ GMX_THROW(gmx::InternalError(gmx::formatString(
+ "Step %" PRId64
+ ": The total potential energy is %g, which is %s. The LJ and electrostatic "
+ "contributions to the energy are %g and %g, respectively. A %s potential energy "
+ "can be caused by overlapping interactions in bonded interactions or very large%s "
+ "coordinate values. Usually this is caused by a badly- or non-equilibrated initial "
+ "configuration, incorrect interactions or parameters in the topology.",
+ step,
+ enerd.term[F_EPOT],
+ energyIsNotFinite ? "not finite" : "extremely high",
+ enerd.term[F_LJ],
+ enerd.term[F_COUL_SR],
+ energyIsNotFinite ? "non-finite" : "very high",
+ energyIsNotFinite ? " or Nan" : "")));
+ }
+ }
+
+ /*! \brief Return true if there are special forces computed this step.
+ *
+ * The conditionals exactly correspond to those in computeSpecialForces().
+ */
+ static bool haveSpecialForces(const t_inputrec& inputrec,
+ const gmx::ForceProviders& forceProviders,
+ const pull_t* pull_work,
+ const bool computeForces,
+ const gmx_edsam* ed)
+ {
+
+ return ((computeForces && forceProviders.hasForceProvider()) || // forceProviders
+ (inputrec.bPull && pull_have_potential(*pull_work)) || // pull
+ inputrec.bRot || // enforced rotation
+ (ed != nullptr) || // flooding
+ (inputrec.bIMD && computeForces)); // IMD
+ }
+
+ /*! \brief Compute forces and/or energies for special algorithms
+ *
+ * The intention is to collect all calls to algorithms that compute
+ * forces on local atoms only and that do not contribute to the local
+ * virial sum (but add their virial contribution separately).
+ * Eventually these should likely all become ForceProviders.
+ * Within this function the intention is to have algorithms that do
+ * global communication at the end, so global barriers within the MD loop
+ * are as close together as possible.
+ *
+ * \param[in] fplog The log file
+ * \param[in] cr The communication record
+ * \param[in] inputrec The input record
+ * \param[in] awh The Awh module (nullptr if none in use).
+ * \param[in] enforcedRotation Enforced rotation module.
+ * \param[in] imdSession The IMD session
+ * \param[in] pull_work The pull work structure.
+ * \param[in] step The current MD step
+ * \param[in] t The current time
+ * \param[in,out] wcycle Wallcycle accounting struct
+ * \param[in,out] forceProviders Pointer to a list of force providers
+ * \param[in] box The unit cell
+ * \param[in] x The coordinates
+ * \param[in] mdatoms Per atom properties
+ * \param[in] lambda Array of free-energy lambda values
+ * \param[in] stepWork Step schedule flags
+ * \param[in,out] forceWithVirialMtsLevel0 Force and virial for MTS level0 forces
+ * \param[in,out] forceWithVirialMtsLevel1 Force and virial for MTS level1 forces, can be nullptr
+ * \param[in,out] enerd Energy buffer
+ * \param[in,out] ed Essential dynamics pointer
+ * \param[in] didNeighborSearch Tells if we did neighbor searching this step, used for ED sampling
+ *
+ * \todo Remove didNeighborSearch, which is used incorrectly.
+ * \todo Convert all other algorithms called here to ForceProviders.
+ */
+ static void computeSpecialForces(FILE* fplog,
+ const t_commrec* cr,
+ const t_inputrec& inputrec,
+ gmx::Awh* awh,
+ gmx_enfrot* enforcedRotation,
+ gmx::ImdSession* imdSession,
+ pull_t* pull_work,
+ int64_t step,
+ double t,
+ gmx_wallcycle* wcycle,
+ gmx::ForceProviders* forceProviders,
+ const matrix box,
+ gmx::ArrayRef<const gmx::RVec> x,
+ const t_mdatoms* mdatoms,
+ gmx::ArrayRef<const real> lambda,
+ const StepWorkload& stepWork,
+ gmx::ForceWithVirial* forceWithVirialMtsLevel0,
+ gmx::ForceWithVirial* forceWithVirialMtsLevel1,
+ gmx_enerdata_t* enerd,
+ gmx_edsam* ed,
+ bool didNeighborSearch)
+ {
+ /* NOTE: Currently all ForceProviders only provide forces.
+ * When they also provide energies, remove this conditional.
+ */
+ if (stepWork.computeForces)
+ {
+ gmx::ForceProviderInput forceProviderInput(
+ x,
+ mdatoms->homenr,
+ gmx::arrayRefFromArray(mdatoms->chargeA, mdatoms->homenr),
+ gmx::arrayRefFromArray(mdatoms->massT, mdatoms->homenr),
+ t,
+ step,
+ box,
+ *cr);
+ gmx::ForceProviderOutput forceProviderOutput(forceWithVirialMtsLevel0, enerd);
+
+ /* Collect forces from modules */
+ forceProviders->calculateForces(forceProviderInput, &forceProviderOutput);
+ }
+
+ const int pullMtsLevel = forceGroupMtsLevel(inputrec.mtsLevels, gmx::MtsForceGroups::Pull);
+ const bool doPulling = (inputrec.bPull && pull_have_potential(*pull_work)
+ && (pullMtsLevel == 0 || stepWork.computeSlowForces));
+
+ /* pull_potential_wrapper(), awh->applyBiasForcesAndUpdateBias(), pull_apply_forces()
+ * have to be called in this order
672
+ */
673
+ if (doPulling)
674
+ {
675
+ pull_potential_wrapper(cr, inputrec, box, x, mdatoms, enerd, pull_work, lambda.data(), t, wcycle);
676
+ }
677
+ if (awh && (pullMtsLevel == 0 || stepWork.computeSlowForces))
678
+ {
679
+ const bool needForeignEnergyDifferences = awh->needForeignEnergyDifferences(step);
680
+ std::vector<double> foreignLambdaDeltaH, foreignLambdaDhDl;
681
+ if (needForeignEnergyDifferences)
682
+ {
683
+ enerd->foreignLambdaTerms.finalizePotentialContributions(
684
+ enerd->dvdl_lin, lambda, *inputrec.fepvals);
685
+ std::tie(foreignLambdaDeltaH, foreignLambdaDhDl) = enerd->foreignLambdaTerms.getTerms(cr);
686
+ }
687
+
688
+ enerd->term[F_COM_PULL] += awh->applyBiasForcesAndUpdateBias(
689
+ inputrec.pbcType, foreignLambdaDeltaH, foreignLambdaDhDl, box, t, step, wcycle, fplog);
690
+ }
691
+ if (doPulling)
692
+ {
693
+ wallcycle_start_nocount(wcycle, WallCycleCounter::PullPot);
694
+ auto& forceWithVirial = (pullMtsLevel == 0) ? forceWithVirialMtsLevel0 : forceWithVirialMtsLevel1;
695
+ pull_apply_forces(
696
+ pull_work, gmx::arrayRefFromArray(mdatoms->massT, mdatoms->nr), cr, forceWithVirial);
697
+ wallcycle_stop(wcycle, WallCycleCounter::PullPot);
698
+ }
699
+
700
+ /* Add the forces from enforced rotation potentials (if any) */
701
+ if (inputrec.bRot)
702
+ {
703
+ wallcycle_start(wcycle, WallCycleCounter::RotAdd);
704
+ enerd->term[F_COM_PULL] +=
705
+ add_rot_forces(enforcedRotation, forceWithVirialMtsLevel0->force_, cr, step, t);
706
+ wallcycle_stop(wcycle, WallCycleCounter::RotAdd);
707
+ }
708
+
709
+ if (ed)
710
+ {
711
+ /* Note that since init_edsam() is called after the initialization
712
+ * of forcerec, edsam doesn't request the noVirSum force buffer.
713
+ * Thus if no other algorithm (e.g. PME) requires it, the forces
714
+ * here will contribute to the virial.
715
+ */
716
+ do_flood(cr, inputrec, x, forceWithVirialMtsLevel0->force_, ed, box, step, didNeighborSearch);
717
+ }
718
+
719
+ /* Add forces from interactive molecular dynamics (IMD), if any */
720
+ if (inputrec.bIMD && stepWork.computeForces)
721
+ {
722
+ imdSession->applyForces(forceWithVirialMtsLevel0->force_);
723
+ }
724
+ }
725
+
726
+ /*! \brief Launch the prepare_step and spread stages of PME GPU.
727
+ *
728
+ * \param[in] pmedata The PME structure
729
+ * \param[in] box The box matrix
730
+ * \param[in] stepWork Step schedule flags
731
+ * \param[in] xReadyOnDevice Event synchronizer indicating that the coordinates are ready in the device memory.
732
+ * \param[in] lambdaQ The Coulomb lambda of the current state.
733
+ * \param[in] wcycle The wallcycle structure
734
+ */
735
+ static inline void launchPmeGpuSpread(gmx_pme_t* pmedata,
736
+ const matrix box,
737
+ const StepWorkload& stepWork,
738
+ GpuEventSynchronizer* xReadyOnDevice,
739
+ const real lambdaQ,
740
+ gmx_wallcycle* wcycle)
741
+ {
742
+ pme_gpu_prepare_computation(pmedata, box, wcycle, stepWork);
743
+ bool useGpuDirectComm = false;
744
+ gmx::PmeCoordinateReceiverGpu* pmeCoordinateReceiverGpu = nullptr;
745
+ pme_gpu_launch_spread(
746
+ pmedata, xReadyOnDevice, wcycle, lambdaQ, useGpuDirectComm, pmeCoordinateReceiverGpu);
747
+ }
748
+
749
+ /*! \brief Launch the FFT and gather stages of PME GPU
750
+ *
751
+ * This function only implements setting the output forces (no accumulation).
752
+ *
753
+ * \param[in] pmedata The PME structure
754
+ * \param[in] lambdaQ The Coulomb lambda of the current system state.
755
+ * \param[in] wcycle The wallcycle structure
756
+ * \param[in] stepWork Step schedule flags
757
+ */
758
+ static void launchPmeGpuFftAndGather(gmx_pme_t* pmedata,
759
+ const real lambdaQ,
760
+ gmx_wallcycle* wcycle,
761
+ const gmx::StepWorkload& stepWork)
762
+ {
763
+ pme_gpu_launch_complex_transforms(pmedata, wcycle, stepWork);
764
+ pme_gpu_launch_gather(pmedata, wcycle, lambdaQ);
765
+ }
766
+
767
+ /*! \brief
768
+ * Polling wait for either of the PME or nonbonded GPU tasks.
769
+ *
770
+ * Instead of waiting for the GPU tasks in a static order, this function
+ * polls to check which of the two tasks completes first, and does the
+ * associated force buffer reduction overlapped with the other task.
+ * Unlike a static scheduling order, this can always overlap one of the
+ * reductions, regardless of the GPU task completion order.
775
+ *
776
+ * \param[in] nbv Nonbonded verlet structure
777
+ * \param[in,out] pmedata PME module data
778
+ * \param[in,out] forceOutputsNonbonded Force outputs for the non-bonded forces and shift forces
779
+ * \param[in,out] forceOutputsPme Force outputs for the PME forces and virial
780
+ * \param[in,out] enerd Energy data structure results are reduced into
781
+ * \param[in] lambdaQ The Coulomb lambda of the current system state.
782
+ * \param[in] stepWork Step schedule flags
783
+ * \param[in] wcycle The wallcycle structure
784
+ */
785
+ static void alternatePmeNbGpuWaitReduce(nonbonded_verlet_t* nbv,
786
+ gmx_pme_t* pmedata,
787
+ gmx::ForceOutputs* forceOutputsNonbonded,
788
+ gmx::ForceOutputs* forceOutputsPme,
789
+ gmx_enerdata_t* enerd,
790
+ const real lambdaQ,
791
+ const StepWorkload& stepWork,
792
+ gmx_wallcycle* wcycle)
793
+ {
794
+ bool isPmeGpuDone = false;
795
+ bool isNbGpuDone = false;
796
+
797
+ gmx::ArrayRef<const gmx::RVec> pmeGpuForces;
798
+
799
+ while (!isPmeGpuDone || !isNbGpuDone)
800
+ {
801
+ if (!isPmeGpuDone)
802
+ {
803
+ GpuTaskCompletion completionType =
804
+ (isNbGpuDone) ? GpuTaskCompletion::Wait : GpuTaskCompletion::Check;
805
+ isPmeGpuDone = pme_gpu_try_finish_task(
806
+ pmedata, stepWork, wcycle, &forceOutputsPme->forceWithVirial(), enerd, lambdaQ, completionType);
807
+ }
808
+
809
+ if (!isNbGpuDone)
810
+ {
811
+ auto& forceBuffersNonbonded = forceOutputsNonbonded->forceWithShiftForces();
812
+ GpuTaskCompletion completionType =
813
+ (isPmeGpuDone) ? GpuTaskCompletion::Wait : GpuTaskCompletion::Check;
814
+ isNbGpuDone = Nbnxm::gpu_try_finish_task(
815
+ nbv->gpu_nbv,
816
+ stepWork,
817
+ AtomLocality::Local,
818
+ enerd->grpp.energyGroupPairTerms[NonBondedEnergyTerms::LJSR].data(),
819
+ enerd->grpp.energyGroupPairTerms[NonBondedEnergyTerms::CoulombSR].data(),
820
+ forceBuffersNonbonded.shiftForces(),
821
+ completionType,
822
+ wcycle);
823
+
824
+ if (isNbGpuDone)
825
+ {
826
+ nbv->atomdata_add_nbat_f_to_f(AtomLocality::Local, forceBuffersNonbonded.force());
827
+ }
828
+ }
829
+ }
830
+ }
831
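+
+ // Editorial sketch (not part of the upstream source): the polling pattern above, reduced
+ // to two generic "try to finish" calls. Names and signatures here are illustrative
+ // assumptions, not GROMACS API. Each try-call returns true once its task is done; the
+ // bool argument requests a blocking wait instead of a poll (used once the other task is done).
+ #include <functional>
+ static void pollTwoGpuTasksSketch(const std::function<bool(bool)>& tryFinishA,
+ const std::function<bool(bool)>& tryFinishB,
+ const std::function<void()>& reduceA,
+ const std::function<void()>& reduceB)
+ {
+ bool aDone = false;
+ bool bDone = false;
+ while (!aDone || !bDone)
+ {
+ if (!aDone && (aDone = tryFinishA(/*blocking=*/bDone)))
+ {
+ reduceA(); // overlaps with the still-running task B, if any
+ }
+ if (!bDone && (bDone = tryFinishB(/*blocking=*/aDone)))
+ {
+ reduceB(); // overlaps with the still-running task A, if any
+ }
+ }
+ }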
+
832
+ /*! \brief Set up the different force buffers; also does clearing.
833
+ *
834
+ * \param[in] forceHelperBuffers Helper force buffers
835
+ * \param[in] force force array
836
+ * \param[in] domainWork Domain lifetime workload flags
837
+ * \param[in] stepWork Step schedule flags
838
+ * \param[in] havePpDomainDecomposition Whether we have a PP domain decomposition
839
+ * \param[out] wcycle wallcycle recording structure
840
+ *
841
+ * \returns Cleared force output structure
842
+ */
843
+ static ForceOutputs setupForceOutputs(ForceHelperBuffers* forceHelperBuffers,
844
+ gmx::ArrayRefWithPadding<gmx::RVec> force,
845
+ const DomainLifetimeWorkload& domainWork,
846
+ const StepWorkload& stepWork,
847
+ const bool havePpDomainDecomposition,
848
+ gmx_wallcycle* wcycle)
849
+ {
850
+ wallcycle_sub_start(wcycle, WallCycleSubCounter::ClearForceBuffer);
851
+
852
+ /* NOTE: We assume fr->shiftForces is all zeros here */
853
+ gmx::ForceWithShiftForces forceWithShiftForces(
854
+ force, stepWork.computeVirial, forceHelperBuffers->shiftForces());
855
+
856
+ if (stepWork.computeForces
857
+ && (domainWork.haveCpuLocalForceWork || !stepWork.useGpuFBufferOps
858
+ || (havePpDomainDecomposition && !stepWork.useGpuFHalo)))
859
+ {
860
+ /* Clear the short- and long-range forces */
861
+ clearRVecs(forceWithShiftForces.force(), true);
862
+
863
+ /* Clear the shift forces */
864
+ clearRVecs(forceWithShiftForces.shiftForces(), false);
865
+ }
866
+
867
+ /* If we need to compute the virial, we might need a separate
868
+ * force buffer for algorithms for which the virial is calculated
869
+ * directly, such as PME. Otherwise, forceWithVirial uses
+ * the same force (f in legacy calls) buffer as other algorithms.
871
+ */
872
+ const bool useSeparateForceWithVirialBuffer =
873
+ (stepWork.computeForces
874
+ && (stepWork.computeVirial && forceHelperBuffers->haveDirectVirialContributions()));
875
+ /* forceWithVirial uses the local atom range only */
876
+ gmx::ForceWithVirial forceWithVirial(
877
+ useSeparateForceWithVirialBuffer ? forceHelperBuffers->forceBufferForDirectVirialContributions()
878
+ : force.unpaddedArrayRef(),
879
+ stepWork.computeVirial);
880
+
881
+ if (useSeparateForceWithVirialBuffer)
882
+ {
883
+ /* TODO: update comment
884
+ * We only compute forces on local atoms. Note that vsites can
885
+ * spread to non-local atoms, but that part of the buffer is
886
+ * cleared separately in the vsite spreading code.
887
+ */
888
+ clearRVecs(forceWithVirial.force_, true);
889
+ }
890
+
891
+ wallcycle_sub_stop(wcycle, WallCycleSubCounter::ClearForceBuffer);
892
+
893
+ return ForceOutputs(
894
+ forceWithShiftForces, forceHelperBuffers->haveDirectVirialContributions(), forceWithVirial);
895
+ }
896
+
897
+
898
+ /*! \brief Set up flags that have the lifetime of the domain indicating what type of work is there to compute.
899
+ */
900
+ static DomainLifetimeWorkload setupDomainLifetimeWorkload(const t_inputrec& inputrec,
901
+ const t_forcerec& fr,
902
+ const pull_t* pull_work,
903
+ const gmx_edsam* ed,
904
+ const t_mdatoms& mdatoms,
905
+ const SimulationWorkload& simulationWork,
906
+ const StepWorkload& stepWork)
907
+ {
908
+ DomainLifetimeWorkload domainWork;
909
+ // Note that haveSpecialForces is constant over the whole run
910
+ domainWork.haveSpecialForces =
911
+ haveSpecialForces(inputrec, *fr.forceProviders, pull_work, stepWork.computeForces, ed);
912
+ domainWork.haveCpuListedForceWork = false;
913
+ domainWork.haveCpuBondedWork = false;
914
+ for (const auto& listedForces : fr.listedForces)
915
+ {
916
+ if (listedForces.haveCpuListedForces(*fr.fcdata))
917
+ {
918
+ domainWork.haveCpuListedForceWork = true;
919
+ }
920
+ if (listedForces.haveCpuBondeds())
921
+ {
922
+ domainWork.haveCpuBondedWork = true;
923
+ }
924
+ }
925
+ domainWork.haveGpuBondedWork =
926
+ ((fr.listedForcesGpu != nullptr) && fr.listedForcesGpu->haveInteractions());
927
+ // Note that haveFreeEnergyWork is constant over the whole run
928
+ domainWork.haveFreeEnergyWork =
929
+ (fr.efep != FreeEnergyPerturbationType::No && mdatoms.nPerturbed != 0);
930
+ // We assume we have local force work if there are CPU
931
+ // force tasks including PME or nonbondeds.
932
+ domainWork.haveCpuLocalForceWork =
933
+ domainWork.haveSpecialForces || domainWork.haveCpuListedForceWork
934
+ || domainWork.haveFreeEnergyWork || simulationWork.useCpuNonbonded || simulationWork.useCpuPme
935
+ || simulationWork.haveEwaldSurfaceContribution || inputrec.nwall > 0;
936
+ domainWork.haveLocalForceContribInCpuBuffer =
937
+ domainWork.haveCpuLocalForceWork || simulationWork.havePpDomainDecomposition;
938
+ domainWork.haveNonLocalForceContribInCpuBuffer =
939
+ domainWork.haveCpuBondedWork || domainWork.haveFreeEnergyWork;
940
+
941
+ return domainWork;
942
+ }
943
+
944
+ /*! \brief Set up the force flag struct from the force bitmask.
945
+ *
946
+ * \param[in] legacyFlags Force bitmask flags used to construct the new flags
947
+ * \param[in] mtsLevels The multiple time-stepping levels, either empty or 2 levels
948
+ * \param[in] step The current MD step
949
+ * \param[in] simulationWork Simulation workload description.
950
+ *
951
+ * \returns New StepWorkload description.
952
+ */
953
+ static StepWorkload setupStepWorkload(const int legacyFlags,
954
+ ArrayRef<const gmx::MtsLevel> mtsLevels,
955
+ const int64_t step,
956
+ const SimulationWorkload& simulationWork)
957
+ {
958
+ GMX_ASSERT(mtsLevels.empty() || mtsLevels.size() == 2, "Expect 0 or 2 MTS levels");
959
+ const bool computeSlowForces = (mtsLevels.empty() || step % mtsLevels[1].stepFactor == 0);
960
+
961
+ StepWorkload flags;
962
+ flags.stateChanged = ((legacyFlags & GMX_FORCE_STATECHANGED) != 0);
963
+ flags.haveDynamicBox = ((legacyFlags & GMX_FORCE_DYNAMICBOX) != 0);
964
+ flags.doNeighborSearch = ((legacyFlags & GMX_FORCE_NS) != 0);
965
+ flags.computeSlowForces = computeSlowForces;
966
+ flags.computeVirial = ((legacyFlags & GMX_FORCE_VIRIAL) != 0);
967
+ flags.computeEnergy = ((legacyFlags & GMX_FORCE_ENERGY) != 0);
968
+ flags.computeForces = ((legacyFlags & GMX_FORCE_FORCES) != 0);
969
+ flags.useOnlyMtsCombinedForceBuffer = ((legacyFlags & GMX_FORCE_DO_NOT_NEED_NORMAL_FORCE) != 0);
970
+ flags.computeListedForces = ((legacyFlags & GMX_FORCE_LISTED) != 0);
971
+ flags.computeNonbondedForces =
972
+ ((legacyFlags & GMX_FORCE_NONBONDED) != 0) && simulationWork.computeNonbonded
973
+ && !(simulationWork.computeNonbondedAtMtsLevel1 && !computeSlowForces);
974
+ flags.computeDhdl = ((legacyFlags & GMX_FORCE_DHDL) != 0);
975
+
976
+ if (simulationWork.useGpuXBufferOps || simulationWork.useGpuFBufferOps)
977
+ {
978
+ GMX_ASSERT(simulationWork.useGpuNonbonded,
979
+ "Can only offload buffer ops if nonbonded computation is also offloaded");
980
+ }
981
+ flags.useGpuXBufferOps = simulationWork.useGpuXBufferOps && !flags.doNeighborSearch;
982
+ // on virial steps the CPU reduction path is taken
983
+ flags.useGpuFBufferOps = simulationWork.useGpuFBufferOps && !flags.computeVirial;
984
+ const bool rankHasGpuPmeTask = simulationWork.useGpuPme && !simulationWork.haveSeparatePmeRank;
985
+ flags.useGpuPmeFReduction = flags.computeSlowForces && flags.useGpuFBufferOps
986
+ && (rankHasGpuPmeTask || simulationWork.useGpuPmePpCommunication);
987
+ flags.useGpuXHalo = simulationWork.useGpuHaloExchange && !flags.doNeighborSearch;
988
+ flags.useGpuFHalo = simulationWork.useGpuHaloExchange && flags.useGpuFBufferOps;
989
+ flags.haveGpuPmeOnThisRank = rankHasGpuPmeTask && flags.computeSlowForces;
990
+ flags.computePmeOnSeparateRank = simulationWork.haveSeparatePmeRank && flags.computeSlowForces;
991
+ flags.combineMtsForcesBeforeHaloExchange =
992
+ (flags.computeForces && simulationWork.useMts && flags.computeSlowForces
993
+ && flags.useOnlyMtsCombinedForceBuffer
994
+ && !(flags.computeVirial || simulationWork.useGpuNonbonded || flags.haveGpuPmeOnThisRank));
995
+
996
+ return flags;
997
+ }
998
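+
+ // Editorial sketch (not part of the upstream source): the flag translation above follows a
+ // simple bitmask-to-struct pattern. The constants and struct below are hypothetical stand-ins
+ // for illustration only; the real GMX_FORCE_* values and StepWorkload live in the GROMACS headers.
+ namespace sketch
+ {
+ constexpr int ForceEnergyBit = 1 << 0;
+ constexpr int ForceVirialBit = 1 << 1;
+ struct MiniStepWork
+ {
+ bool computeEnergy = false;
+ bool computeVirial = false;
+ };
+ inline MiniStepWork decodeLegacyFlags(const int legacyFlags)
+ {
+ MiniStepWork w;
+ w.computeEnergy = ((legacyFlags & ForceEnergyBit) != 0);
+ w.computeVirial = ((legacyFlags & ForceVirialBit) != 0);
+ return w;
+ }
+ } // namespace sketch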
+
999
+
1000
+ /*! \brief Launch end-of-step GPU tasks: buffer clearing and rolling pruning.
1001
+ *
1002
+ */
1003
+ static void launchGpuEndOfStepTasks(nonbonded_verlet_t* nbv,
1004
+ gmx::ListedForcesGpu* listedForcesGpu,
1005
+ gmx_pme_t* pmedata,
1006
+ gmx_enerdata_t* enerd,
1007
+ const gmx::MdrunScheduleWorkload& runScheduleWork,
1008
+ int64_t step,
1009
+ gmx_wallcycle* wcycle)
1010
+ {
1011
+ if (runScheduleWork.simulationWork.useGpuNonbonded && runScheduleWork.stepWork.computeNonbondedForces)
1012
+ {
1013
+ /* Launch pruning before buffer clearing because the API overhead of the
1014
+ * clear kernel launches can leave the GPU idle while it could be running
1015
+ * the prune kernel.
1016
+ */
1017
+ if (nbv->isDynamicPruningStepGpu(step))
1018
+ {
1019
+ nbv->dispatchPruneKernelGpu(step);
1020
+ }
1021
+
1022
+ /* now clear the GPU outputs while we finish the step on the CPU */
1023
+ wallcycle_start_nocount(wcycle, WallCycleCounter::LaunchGpu);
1024
+ wallcycle_sub_start_nocount(wcycle, WallCycleSubCounter::LaunchGpuNonBonded);
1025
+ Nbnxm::gpu_clear_outputs(nbv->gpu_nbv, runScheduleWork.stepWork.computeVirial);
1026
+ wallcycle_sub_stop(wcycle, WallCycleSubCounter::LaunchGpuNonBonded);
1027
+ wallcycle_stop(wcycle, WallCycleCounter::LaunchGpu);
1028
+ }
1029
+
1030
+ if (runScheduleWork.stepWork.haveGpuPmeOnThisRank)
1031
+ {
1032
+ pme_gpu_reinit_computation(pmedata, wcycle);
1033
+ }
1034
+
1035
+ if (runScheduleWork.domainWork.haveGpuBondedWork && runScheduleWork.stepWork.computeEnergy)
1036
+ {
1037
+ // in principle this should be included in the DD balancing region,
1038
+ // but generally it is infrequent so we'll omit it for the sake of
1039
+ // simpler code
1040
+ listedForcesGpu->waitAccumulateEnergyTerms(enerd);
1041
+
1042
+ listedForcesGpu->clearEnergies();
1043
+ }
1044
+ }
1045
+
1046
+ /*! \brief Compute the number of times the "local coordinates ready on device" GPU event will be used as a synchronization point.
1047
+ *
1048
+ * When some work is offloaded to GPU, force calculation should wait for the atom coordinates to
1049
+ * be ready on the device. The coordinates can come either from H2D copy at the beginning of the step,
1050
+ * or from the GPU integration at the end of the previous step.
1051
+ *
1052
+ * In GROMACS, we usually follow the "mark once - wait once" approach. But this event is "consumed"
1053
+ * (that is, waited upon either on host or on the device) multiple times, since many tasks
1054
+ * in different streams depend on the coordinates.
1055
+ *
1056
+ * This function returns the number of times the event will be consumed, based on this step's workload.
1057
+ *
1058
+ * \param simulationWork Simulation workload flags.
1059
+ * \param stepWork Step workload flags.
1060
+ * \param pmeSendCoordinatesFromGpu Whether peer-to-peer communication is used for PME coordinates.
1061
+ * \return The number of times the event is expected to be consumed during this step.
1062
+ */
1063
+ static int getExpectedLocalXReadyOnDeviceConsumptionCount(gmx_used_in_debug const SimulationWorkload& simulationWork,
1064
+ const StepWorkload& stepWork,
1065
+ bool pmeSendCoordinatesFromGpu)
1066
+ {
1067
+ int result = 0;
1068
+ if (stepWork.computeSlowForces)
1069
+ {
1070
+ if (pmeSendCoordinatesFromGpu)
1071
+ {
1072
+ GMX_ASSERT(simulationWork.haveSeparatePmeRank,
1073
+ "GPU PME PP communications require having a separate PME rank");
1074
+ // Event is consumed by gmx_pme_send_coordinates for GPU PME PP Communications
1075
+ result++;
1076
+ }
1077
+ if (stepWork.haveGpuPmeOnThisRank)
1078
+ {
1079
+ // Event is consumed by launchPmeGpuSpread
1080
+ result++;
1081
+ }
1082
+ if (stepWork.computeNonbondedForces && stepWork.useGpuXBufferOps)
1083
+ {
1084
+ // Event is consumed by convertCoordinatesGpu
1085
+ result++;
1086
+ }
1087
+ }
1088
+ if (stepWork.useGpuXHalo)
1089
+ {
1090
+ // Event is consumed by communicateGpuHaloCoordinates
1091
+ result++;
1092
+ if (GMX_THREAD_MPI) // Issue #4262
1093
+ {
1094
+ result++;
1095
+ }
1096
+ }
1097
+ return result;
1098
+ }
1099
+
1100
+ //! \brief Data structure to hold dipole-related data and staging arrays
1101
+ struct DipoleData
1102
+ {
1103
+ //! Dipole staging for fast summing over MPI
1104
+ gmx::DVec muStaging[2] = { { 0.0, 0.0, 0.0 } };
1105
+ //! Dipole staging for states A and B (index 0 and 1 resp.)
1106
+ gmx::RVec muStateAB[2] = { { 0.0_real, 0.0_real, 0.0_real } };
1107
+ };
1108
+
1109
+
1110
+ static void reduceAndUpdateMuTot(DipoleData* dipoleData,
1111
+ const t_commrec* cr,
1112
+ const bool haveFreeEnergy,
1113
+ gmx::ArrayRef<const real> lambda,
1114
+ rvec muTotal,
1115
+ const DDBalanceRegionHandler& ddBalanceRegionHandler)
1116
+ {
1117
+ if (PAR(cr))
1118
+ {
1119
+ gmx_sumd(2 * DIM, dipoleData->muStaging[0], cr);
1120
+ ddBalanceRegionHandler.reopenRegionCpu();
1121
+ }
1122
+ for (int i = 0; i < 2; i++)
1123
+ {
1124
+ for (int j = 0; j < DIM; j++)
1125
+ {
1126
+ dipoleData->muStateAB[i][j] = dipoleData->muStaging[i][j];
1127
+ }
1128
+ }
1129
+
1130
+ if (!haveFreeEnergy)
1131
+ {
1132
+ copy_rvec(dipoleData->muStateAB[0], muTotal);
1133
+ }
1134
+ else
1135
+ {
1136
+ for (int j = 0; j < DIM; j++)
1137
+ {
1138
+ muTotal[j] = (1.0 - lambda[static_cast<int>(FreeEnergyPerturbationCouplingType::Coul)])
1139
+ * dipoleData->muStateAB[0][j]
1140
+ + lambda[static_cast<int>(FreeEnergyPerturbationCouplingType::Coul)]
1141
+ * dipoleData->muStateAB[1][j];
1142
+ }
1143
+ }
1144
+ }
1145
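+
+ // Editorial sketch (not part of the upstream source): with free-energy coupling, the total
+ // dipole above is the linear interpolation mu = (1 - lambda_coul) * mu_A + lambda_coul * mu_B,
+ // applied per Cartesian component. The helper below is a hypothetical standalone analogue.
+ #include <array>
+ static std::array<double, 3> interpolateDipoleSketch(const std::array<double, 3>& muA,
+ const std::array<double, 3>& muB,
+ const double lambdaCoul)
+ {
+ std::array<double, 3> mu{};
+ for (int d = 0; d < 3; d++)
+ {
+ mu[d] = (1.0 - lambdaCoul) * muA[d] + lambdaCoul * muB[d];
+ }
+ return mu;
+ }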
+
1146
+ /*! \brief Combines MTS level0 and level1 force buffers into a full and MTS-combined force buffer.
1147
+ *
1148
+ * \param[in] numAtoms The number of atoms to combine forces for
1149
+ * \param[in,out] forceMtsLevel0 Input: F_level0, output: F_level0 + F_level1
1150
+ * \param[in,out] forceMts Input: F_level1, output: F_level0 + mtsFactor * F_level1
1151
+ * \param[in] mtsFactor The factor between the level0 and level1 time step
1152
+ */
1153
+ static void combineMtsForces(const int numAtoms,
1154
+ ArrayRef<RVec> forceMtsLevel0,
1155
+ ArrayRef<RVec> forceMts,
1156
+ const real mtsFactor)
1157
+ {
1158
+ const int gmx_unused numThreads = gmx_omp_nthreads_get(ModuleMultiThread::Default);
1159
+ #pragma omp parallel for num_threads(numThreads) schedule(static)
1160
+ for (int i = 0; i < numAtoms; i++)
1161
+ {
1162
+ const RVec forceMtsLevel0Tmp = forceMtsLevel0[i];
1163
+ forceMtsLevel0[i] += forceMts[i];
1164
+ forceMts[i] = forceMtsLevel0Tmp + mtsFactor * forceMts[i];
1165
+ }
1166
+ }
1167
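+
+ // Editorial sketch (not part of the upstream source): a tiny worked example of the combination
+ // rule above. With mtsFactor = 2, per-component level-0 and level-1 forces of 1.0 and 0.5 combine
+ // to a full force of 1.5 and an MTS-combined force of 1.0 + 2*0.5 = 2.0.
+ #include <array>
+ #include <cassert>
+ static void combineMtsForcesExample()
+ {
+ std::array<double, 3> f0{ 1.0, 1.0, 1.0 }; // level-0 forces for one atom
+ std::array<double, 3> f1{ 0.5, 0.5, 0.5 }; // level-1 forces for one atom
+ const double mtsFactor = 2.0;
+ for (int d = 0; d < 3; d++)
+ {
+ const double f0Old = f0[d];
+ f0[d] += f1[d]; // full force: F_level0 + F_level1
+ f1[d] = f0Old + mtsFactor * f1[d]; // MTS-combined: F_level0 + mtsFactor * F_level1
+ }
+ assert(f0[0] == 1.5 && f1[0] == 2.0);
+ }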
+
1168
+ /*! \brief Setup for the local GPU force reduction:
1169
+ * reinitialization plus the registration of forces and dependencies.
1170
+ *
1171
+ * \param [in] runScheduleWork Schedule workload flag structure
1172
+ * \param [in] nbv Non-bonded Verlet object
1173
+ * \param [in] stateGpu GPU state propagator object
1174
+ * \param [in] gpuForceReduction GPU force reduction object
1175
+ * \param [in] pmePpCommGpu PME-PP GPU communication object
1176
+ * \param [in] pmedata PME data object
1177
+ * \param [in] dd Domain decomposition object
1178
+ */
1179
+ static void setupLocalGpuForceReduction(const gmx::MdrunScheduleWorkload* runScheduleWork,
1180
+ const nonbonded_verlet_t* nbv,
1181
+ gmx::StatePropagatorDataGpu* stateGpu,
1182
+ gmx::GpuForceReduction* gpuForceReduction,
1183
+ gmx::PmePpCommGpu* pmePpCommGpu,
1184
+ const gmx_pme_t* pmedata,
1185
+ const gmx_domdec_t* dd)
1186
+ {
1187
+ GMX_ASSERT(!runScheduleWork->simulationWork.useMts,
1188
+ "GPU force reduction is not compatible with MTS");
1189
+
1190
+ // (re-)initialize local GPU force reduction
1191
+ const bool accumulate = runScheduleWork->domainWork.haveCpuLocalForceWork
1192
+ || runScheduleWork->simulationWork.havePpDomainDecomposition;
1193
+ const int atomStart = 0;
1194
+ gpuForceReduction->reinit(stateGpu->getForces(),
1195
+ nbv->getNumAtoms(AtomLocality::Local),
1196
+ nbv->getGridIndices(),
1197
+ atomStart,
1198
+ accumulate,
1199
+ stateGpu->fReducedOnDevice(AtomLocality::Local));
1200
+
1201
+ // register forces and add dependencies
1202
+ gpuForceReduction->registerNbnxmForce(Nbnxm::gpu_get_f(nbv->gpu_nbv));
1203
+
1204
+ DeviceBuffer<gmx::RVec> pmeForcePtr;
1205
+ GpuEventSynchronizer* pmeSynchronizer = nullptr;
1206
+ bool havePmeContribution = false;
1207
+
1208
+ if (runScheduleWork->simulationWork.useGpuPme && !runScheduleWork->simulationWork.haveSeparatePmeRank)
1209
+ {
1210
+ pmeForcePtr = pme_gpu_get_device_f(pmedata);
1211
+ if (pmeForcePtr)
1212
+ {
1213
+ pmeSynchronizer = pme_gpu_get_f_ready_synchronizer(pmedata);
1214
+ havePmeContribution = true;
1215
+ }
1216
+ }
1217
+ else if (runScheduleWork->simulationWork.useGpuPmePpCommunication)
1218
+ {
1219
+ pmeForcePtr = pmePpCommGpu->getGpuForceStagingPtr();
1220
+ if (pmeForcePtr)
1221
+ {
1222
+ if (GMX_THREAD_MPI)
1223
+ {
1224
+ pmeSynchronizer = pmePpCommGpu->getForcesReadySynchronizer();
1225
+ }
1226
+ havePmeContribution = true;
1227
+ }
1228
+ }
1229
+
1230
+ if (havePmeContribution)
1231
+ {
1232
+ gpuForceReduction->registerRvecForce(pmeForcePtr);
1233
+ if (!runScheduleWork->simulationWork.useGpuPmePpCommunication || GMX_THREAD_MPI)
1234
+ {
1235
+ GMX_ASSERT(pmeSynchronizer != nullptr, "PME force ready cuda event should not be NULL");
1236
+ gpuForceReduction->addDependency(pmeSynchronizer);
1237
+ }
1238
+ }
1239
+
1240
+ if (runScheduleWork->domainWork.haveCpuLocalForceWork
1241
+ || (runScheduleWork->simulationWork.havePpDomainDecomposition
1242
+ && !runScheduleWork->simulationWork.useGpuHaloExchange))
1243
+ {
1244
+ gpuForceReduction->addDependency(stateGpu->fReadyOnDevice(AtomLocality::Local));
1245
+ }
1246
+
1247
+ if (runScheduleWork->simulationWork.useGpuHaloExchange)
1248
+ {
1249
+ gpuForceReduction->addDependency(dd->gpuHaloExchange[0][0]->getForcesReadyOnDeviceEvent());
1250
+ }
1251
+ }
1252
+
1253
+ /*! \brief Setup for the non-local GPU force reduction:
1254
+ * reinitialization plus the registration of forces and dependencies.
1255
+ *
1256
+ * \param [in] runScheduleWork Schedule workload flag structure
1257
+ * \param [in] nbv Non-bonded Verlet object
1258
+ * \param [in] stateGpu GPU state propagator object
1259
+ * \param [in] gpuForceReduction GPU force reduction object
1260
+ * \param [in] dd Domain decomposition object
1261
+ */
1262
+ static void setupNonLocalGpuForceReduction(const gmx::MdrunScheduleWorkload* runScheduleWork,
1263
+ const nonbonded_verlet_t* nbv,
1264
+ gmx::StatePropagatorDataGpu* stateGpu,
1265
+ gmx::GpuForceReduction* gpuForceReduction,
1266
+ const gmx_domdec_t* dd)
1267
+ {
1268
+ // (re-)initialize non-local GPU force reduction
1269
+ const bool accumulate = runScheduleWork->domainWork.haveCpuBondedWork
1270
+ || runScheduleWork->domainWork.haveFreeEnergyWork;
1271
+ const int atomStart = dd_numHomeAtoms(*dd);
1272
+ gpuForceReduction->reinit(stateGpu->getForces(),
1273
+ nbv->getNumAtoms(AtomLocality::NonLocal),
1274
+ nbv->getGridIndices(),
1275
+ atomStart,
1276
+ accumulate,
1277
+ stateGpu->fReducedOnDevice(AtomLocality::NonLocal));
1278
+
1279
+ // register forces and add dependencies
1280
+ gpuForceReduction->registerNbnxmForce(Nbnxm::gpu_get_f(nbv->gpu_nbv));
1281
+
1282
+ if (runScheduleWork->domainWork.haveNonLocalForceContribInCpuBuffer)
1283
+ {
1284
+ gpuForceReduction->addDependency(stateGpu->fReadyOnDevice(AtomLocality::NonLocal));
1285
+ }
1286
+ }
1287
+
1288
+
1289
+ /*! \brief Return the number of local atoms.
1290
+ */
1291
+ static int getLocalAtomCount(const gmx_domdec_t* dd, const t_mdatoms& mdatoms, bool havePPDomainDecomposition)
1292
+ {
1293
+ GMX_ASSERT(!(havePPDomainDecomposition && (dd == nullptr)),
1294
+ "Can't have PP decomposition with dd uninitialized!");
1295
+ return havePPDomainDecomposition ? dd_numAtomsZones(*dd) : mdatoms.homenr;
1296
+ }
1297
+
1298
+
1299
+ void do_force(FILE* fplog,
1300
+ const t_commrec* cr,
1301
+ const gmx_multisim_t* ms,
1302
+ const t_inputrec& inputrec,
1303
+ gmx::Awh* awh,
1304
+ gmx_enfrot* enforcedRotation,
1305
+ gmx::ImdSession* imdSession,
1306
+ pull_t* pull_work,
1307
+ int64_t step,
1308
+ t_nrnb* nrnb,
1309
+ gmx_wallcycle* wcycle,
1310
+ const gmx_localtop_t* top,
1311
+ const matrix box,
1312
+ gmx::ArrayRefWithPadding<gmx::RVec> x,
1313
+ const history_t* hist,
1314
+ gmx::ForceBuffersView* forceView,
1315
+ tensor vir_force,
1316
+ const t_mdatoms* mdatoms,
1317
+ gmx_enerdata_t* enerd,
1318
+ gmx::ArrayRef<const real> lambda,
1319
+ t_forcerec* fr,
1320
+ gmx::MdrunScheduleWorkload* runScheduleWork,
1321
+ gmx::VirtualSitesHandler* vsite,
1322
+ rvec muTotal,
1323
+ double t,
1324
+ gmx_edsam* ed,
1325
+ CpuPpLongRangeNonbondeds* longRangeNonbondeds,
1326
+ int legacyFlags,
1327
+ const DDBalanceRegionHandler& ddBalanceRegionHandler)
1328
+ {
1329
+ auto force = forceView->forceWithPadding();
1330
+ GMX_ASSERT(force.unpaddedArrayRef().ssize() >= fr->natoms_force_constr,
1331
+ "The size of the force buffer should be at least the number of atoms to compute "
1332
+ "forces for");
1333
+
1334
+ nonbonded_verlet_t* nbv = fr->nbv.get();
1335
+ interaction_const_t* ic = fr->ic.get();
1336
+
1337
+ gmx::StatePropagatorDataGpu* stateGpu = fr->stateGpu;
1338
+
1339
+ const SimulationWorkload& simulationWork = runScheduleWork->simulationWork;
1340
+
1341
+ runScheduleWork->stepWork = setupStepWorkload(legacyFlags, inputrec.mtsLevels, step, simulationWork);
1342
+ const StepWorkload& stepWork = runScheduleWork->stepWork;
1343
+
1344
+ if (stepWork.doNeighborSearch && gmx::needStateGpu(simulationWork))
1345
+ {
1346
+ // TODO refactor this to do_md, after partitioning.
1347
+ stateGpu->reinit(mdatoms->homenr,
1348
+ getLocalAtomCount(cr->dd, *mdatoms, simulationWork.havePpDomainDecomposition));
1349
+ if (stepWork.haveGpuPmeOnThisRank)
1350
+ {
1351
+ // TODO: This should be moved into PME setup function ( pme_gpu_prepare_computation(...) )
1352
+ pme_gpu_set_device_x(fr->pmedata, stateGpu->getCoordinates());
1353
+ }
1354
+ }
1355
+
1356
+ if (stepWork.useGpuFHalo && !runScheduleWork->domainWork.haveCpuLocalForceWork && !stepWork.doNeighborSearch)
1357
+ {
1358
+ // GPU Force halo exchange will set a subset of local atoms with remote non-local data
1359
+ // First clear the local portion of the force array, so that untouched atoms are zero.
+ // The dependency for this is that the forces from the previous timestep have been consumed,
+ // which is satisfied when getCoordinatesReadyOnDeviceEvent has been marked.
+ // On NS steps, the buffer could already have been cleared in stateGpu->reinit.
1363
+ stateGpu->clearForcesOnGpu(AtomLocality::Local,
1364
+ stateGpu->getCoordinatesReadyOnDeviceEvent(
1365
+ AtomLocality::Local, simulationWork, stepWork));
1366
+ }
1367
+
1368
+ /* At a search step we need to start the first balancing region
1369
+ * somewhere early inside the step after communication during domain
1370
+ * decomposition (and not during the previous step as usual).
1371
+ */
1372
+ if (stepWork.doNeighborSearch)
1373
+ {
1374
+ ddBalanceRegionHandler.openBeforeForceComputationCpu(DdAllowBalanceRegionReopen::yes);
1375
+ }
1376
+
1377
+ clear_mat(vir_force);
1378
+
1379
+ if (fr->pbcType != PbcType::No)
1380
+ {
1381
+ /* Compute shift vectors every step,
1382
+ * because of pressure coupling or box deformation!
1383
+ */
1384
+ if (stepWork.haveDynamicBox && stepWork.stateChanged)
1385
+ {
1386
+ calc_shifts(box, fr->shift_vec);
1387
+ }
1388
+
1389
+ const bool fillGrid = (stepWork.doNeighborSearch && stepWork.stateChanged);
1390
+ const bool calcCGCM = (fillGrid && !haveDDAtomOrdering(*cr));
1391
+ if (calcCGCM)
1392
+ {
1393
+ put_atoms_in_box_omp(fr->pbcType,
1394
+ box,
1395
+ x.unpaddedArrayRef().subArray(0, mdatoms->homenr),
1396
+ gmx_omp_nthreads_get(ModuleMultiThread::Default));
1397
+ inc_nrnb(nrnb, eNR_SHIFTX, mdatoms->homenr);
1398
+ }
1399
+ }
1400
+
1401
+ nbnxn_atomdata_copy_shiftvec(stepWork.haveDynamicBox, fr->shift_vec, nbv->nbat.get());
1402
+
1403
+ const bool pmeSendCoordinatesFromGpu =
1404
+ simulationWork.useGpuPmePpCommunication && !(stepWork.doNeighborSearch);
1405
+ const bool reinitGpuPmePpComms =
1406
+ simulationWork.useGpuPmePpCommunication && (stepWork.doNeighborSearch);
1407
+
1408
+ auto* localXReadyOnDevice = (stepWork.haveGpuPmeOnThisRank || simulationWork.useGpuXBufferOps)
1409
+ ? stateGpu->getCoordinatesReadyOnDeviceEvent(
1410
+ AtomLocality::Local, simulationWork, stepWork)
1411
+ : nullptr;
1412
+
1413
+ GMX_ASSERT(simulationWork.useGpuHaloExchange
1414
+ == ((cr->dd != nullptr) && (!cr->dd->gpuHaloExchange[0].empty())),
1415
+ "The GPU halo exchange is active, but it has not been constructed.");
1416
+
1417
+ bool gmx_used_in_debug haveCopiedXFromGpu = false;
1418
+ // Copy coordinate from the GPU if update is on the GPU and there
1419
+ // are forces to be computed on the CPU, or for the computation of
1420
+ // virial, or if host-side data will be transferred from this task
1421
+ // to a remote task for halo exchange or PME-PP communication. At
1422
+ // search steps the current coordinates are already on the host,
1423
+ // hence copy is not needed.
1424
+ if (simulationWork.useGpuUpdate && !stepWork.doNeighborSearch
1425
+ && (runScheduleWork->domainWork.haveCpuLocalForceWork || stepWork.computeVirial
1426
+ || simulationWork.useCpuPmePpCommunication || simulationWork.useCpuHaloExchange
1427
+ || simulationWork.computeMuTot))
1428
+ {
1429
+ stateGpu->copyCoordinatesFromGpu(x.unpaddedArrayRef(), AtomLocality::Local);
1430
+ haveCopiedXFromGpu = true;
1431
+ }
1432
+
1433
+ // Coordinates on the device are needed if PME or BufferOps are offloaded.
1434
+ // The local coordinates can be copied right away.
1435
+ // NOTE: Consider moving this copy to right after they are updated and constrained,
1436
+ // if the latter is not offloaded.
1437
+ if (stepWork.haveGpuPmeOnThisRank || stepWork.useGpuXBufferOps)
1438
+ {
1439
+ GMX_ASSERT(stateGpu != nullptr, "stateGpu should not be null");
1440
+ const int expectedLocalXReadyOnDeviceConsumptionCount =
1441
+ getExpectedLocalXReadyOnDeviceConsumptionCount(
1442
+ simulationWork, stepWork, pmeSendCoordinatesFromGpu);
1443
+
1444
+ // We need to copy coordinates when:
1445
+ // 1. Update is not offloaded
1446
+ // 2. The buffers were reinitialized on search step
1447
+ if (!simulationWork.useGpuUpdate || stepWork.doNeighborSearch)
1448
+ {
1449
+ stateGpu->copyCoordinatesToGpu(x.unpaddedArrayRef(),
1450
+ AtomLocality::Local,
1451
+ expectedLocalXReadyOnDeviceConsumptionCount);
1452
+ }
1453
+ else if (simulationWork.useGpuUpdate)
1454
+ {
1455
+ stateGpu->setXUpdatedOnDeviceEventExpectedConsumptionCount(
1456
+ expectedLocalXReadyOnDeviceConsumptionCount);
1457
+ }
1458
+ }
1459
+
1460
+ if (stepWork.computePmeOnSeparateRank)
1461
+ {
1462
+ /* Send particle coordinates to the pme nodes */
1463
+ if (!pmeSendCoordinatesFromGpu && !stepWork.doNeighborSearch && simulationWork.useGpuUpdate)
1464
+ {
1465
+ GMX_ASSERT(haveCopiedXFromGpu,
1466
+ "a wait should only be triggered if copy has been scheduled");
1467
+ stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
1468
+ }
1469
+
1470
+ gmx_pme_send_coordinates(fr,
1471
+ cr,
1472
+ box,
1473
+ x.unpaddedArrayRef(),
1474
+ lambda[static_cast<int>(FreeEnergyPerturbationCouplingType::Coul)],
1475
+ lambda[static_cast<int>(FreeEnergyPerturbationCouplingType::Vdw)],
1476
+ (stepWork.computeVirial || stepWork.computeEnergy),
1477
+ step,
1478
+ simulationWork.useGpuPmePpCommunication,
1479
+ reinitGpuPmePpComms,
1480
+ pmeSendCoordinatesFromGpu,
1481
+ stepWork.useGpuPmeFReduction,
1482
+ localXReadyOnDevice,
1483
+ wcycle);
1484
+ }
1485
+
1486
+ if (stepWork.haveGpuPmeOnThisRank)
1487
+ {
1488
+ launchPmeGpuSpread(fr->pmedata,
1489
+ box,
1490
+ stepWork,
1491
+ localXReadyOnDevice,
1492
+ lambda[static_cast<int>(FreeEnergyPerturbationCouplingType::Coul)],
1493
+ wcycle);
1494
+ }
1495
+
1496
+ const gmx::DomainLifetimeWorkload& domainWork = runScheduleWork->domainWork;
1497
+
1498
+ /* do gridding for pair search */
1499
+ if (stepWork.doNeighborSearch)
1500
+ {
1501
+ if (fr->wholeMoleculeTransform && stepWork.stateChanged)
1502
+ {
1503
+ fr->wholeMoleculeTransform->updateForAtomPbcJumps(x.unpaddedArrayRef(), box);
1504
+ }
1505
+
1506
+ wallcycle_start(wcycle, WallCycleCounter::NS);
1507
+ if (!haveDDAtomOrdering(*cr))
1508
+ {
1509
+ const rvec vzero = { 0.0_real, 0.0_real, 0.0_real };
1510
+ const rvec boxDiagonal = { box[XX][XX], box[YY][YY], box[ZZ][ZZ] };
1511
+ wallcycle_sub_start(wcycle, WallCycleSubCounter::NBSGridLocal);
1512
+ nbnxn_put_on_grid(nbv,
1513
+ box,
1514
+ 0,
1515
+ vzero,
1516
+ boxDiagonal,
1517
+ nullptr,
1518
+ { 0, mdatoms->homenr },
1519
+ -1,
1520
+ fr->atomInfo,
1521
+ x.unpaddedArrayRef(),
1522
+ 0,
1523
+ nullptr);
1524
+ wallcycle_sub_stop(wcycle, WallCycleSubCounter::NBSGridLocal);
1525
+ }
1526
+ else
1527
+ {
1528
+ wallcycle_sub_start(wcycle, WallCycleSubCounter::NBSGridNonLocal);
1529
+ nbnxn_put_on_grid_nonlocal(nbv, domdec_zones(cr->dd), fr->atomInfo, x.unpaddedArrayRef());
1530
+ wallcycle_sub_stop(wcycle, WallCycleSubCounter::NBSGridNonLocal);
1531
+ }
1532
+
1533
+ nbv->setAtomProperties(gmx::constArrayRefFromArray(mdatoms->typeA, mdatoms->nr),
1534
+ gmx::constArrayRefFromArray(mdatoms->chargeA, mdatoms->nr),
1535
+ fr->atomInfo);
1536
+
1537
+ wallcycle_stop(wcycle, WallCycleCounter::NS);
1538
+
1539
+ /* initialize the GPU nbnxm atom data and bonded data structures */
1540
+ if (simulationWork.useGpuNonbonded)
1541
+ {
1542
+ // Note: cycle counting covers only the nonbondeds here; GPU listed forces count internally
1543
+ wallcycle_start_nocount(wcycle, WallCycleCounter::LaunchGpu);
1544
+ wallcycle_sub_start_nocount(wcycle, WallCycleSubCounter::LaunchGpuNonBonded);
1545
+ Nbnxm::gpu_init_atomdata(nbv->gpu_nbv, nbv->nbat.get());
1546
+ wallcycle_sub_stop(wcycle, WallCycleSubCounter::LaunchGpuNonBonded);
1547
+ wallcycle_stop(wcycle, WallCycleCounter::LaunchGpu);
1548
+
1549
+ if (fr->listedForcesGpu)
1550
+ {
1551
+ /* Now that all atoms are on the grid, we can assign bonded
+ * interactions to the GPU, where the grid order is
1553
+ * needed. Also the xq, f and fshift device buffers have
1554
+ * been reallocated if needed, so the bonded code can
1555
+ * learn about them. */
1556
+ // TODO the xq, f, and fshift buffers are now shared
1557
+ // resources, so they should be maintained by a
1558
+ // higher-level object than the nb module.
1559
+ fr->listedForcesGpu->updateInteractionListsAndDeviceBuffers(
1560
+ nbv->getGridIndices(),
1561
+ top->idef,
1562
+ Nbnxm::gpu_get_xq(nbv->gpu_nbv),
1563
+ Nbnxm::gpu_get_f(nbv->gpu_nbv),
1564
+ Nbnxm::gpu_get_fshift(nbv->gpu_nbv));
1565
+ }
1566
+ }
1567
+
1568
+ // Need to run after the GPU-offload bonded interaction lists
1569
+ // are set up to be able to determine whether there is bonded work.
1570
+ runScheduleWork->domainWork = setupDomainLifetimeWorkload(
1571
+ inputrec, *fr, pull_work, ed, *mdatoms, simulationWork, stepWork);
1572
+
1573
+ wallcycle_start_nocount(wcycle, WallCycleCounter::NS);
1574
+ wallcycle_sub_start(wcycle, WallCycleSubCounter::NBSSearchLocal);
1575
+ /* Note that with a GPU the launch overhead of the list transfer is not timed separately */
1576
+ nbv->constructPairlist(InteractionLocality::Local, top->excls, step, nrnb);
1577
+
1578
+ nbv->setupGpuShortRangeWork(fr->listedForcesGpu.get(), InteractionLocality::Local);
1579
+
1580
+ wallcycle_sub_stop(wcycle, WallCycleSubCounter::NBSSearchLocal);
1581
+ wallcycle_stop(wcycle, WallCycleCounter::NS);
1582
+
1583
+ if (simulationWork.useGpuXBufferOps)
1584
+ {
1585
+ nbv->atomdata_init_copy_x_to_nbat_x_gpu();
1586
+ }
1587
+
1588
+ if (simulationWork.useGpuFBufferOps)
1589
+ {
1590
+ setupLocalGpuForceReduction(runScheduleWork,
1591
+ fr->nbv.get(),
1592
+ stateGpu,
1593
+ fr->gpuForceReduction[gmx::AtomLocality::Local].get(),
1594
+ fr->pmePpCommGpu.get(),
1595
+ fr->pmedata,
1596
+ cr->dd);
1597
+ if (runScheduleWork->simulationWork.havePpDomainDecomposition)
1598
+ {
1599
+ setupNonLocalGpuForceReduction(runScheduleWork,
1600
+ fr->nbv.get(),
1601
+ stateGpu,
1602
+ fr->gpuForceReduction[gmx::AtomLocality::NonLocal].get(),
1603
+ cr->dd);
1604
+ }
1605
+ }
1606
+ }
1607
+ else if (!EI_TPI(inputrec.eI) && stepWork.computeNonbondedForces)
1608
+ {
1609
+ if (stepWork.useGpuXBufferOps)
1610
+ {
1611
+ GMX_ASSERT(stateGpu, "stateGpu should be valid when buffer ops are offloaded");
1612
+ nbv->convertCoordinatesGpu(AtomLocality::Local, stateGpu->getCoordinates(), localXReadyOnDevice);
1613
+ }
1614
+ else
1615
+ {
1616
+ if (simulationWork.useGpuUpdate)
1617
+ {
1618
+ GMX_ASSERT(stateGpu, "need a valid stateGpu object");
1619
+ GMX_ASSERT(haveCopiedXFromGpu,
1620
+ "a wait should only be triggered if copy has been scheduled");
1621
+ stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
1622
+ }
1623
+ nbv->convertCoordinates(AtomLocality::Local, x.unpaddedArrayRef());
1624
+ }
1625
+ }
1626
+
1627
+ if (simulationWork.useGpuNonbonded && (stepWork.computeNonbondedForces || domainWork.haveGpuBondedWork))
1628
+ {
1629
+ ddBalanceRegionHandler.openBeforeForceComputationGpu();
1630
+
1631
+ wallcycle_start(wcycle, WallCycleCounter::LaunchGpu);
1632
+ wallcycle_sub_start(wcycle, WallCycleSubCounter::LaunchGpuNonBonded);
1633
+ Nbnxm::gpu_upload_shiftvec(nbv->gpu_nbv, nbv->nbat.get());
1634
+ if (!stepWork.useGpuXBufferOps)
1635
+ {
1636
+ Nbnxm::gpu_copy_xq_to_gpu(nbv->gpu_nbv, nbv->nbat.get(), AtomLocality::Local);
1637
+ }
1638
+ wallcycle_sub_stop(wcycle, WallCycleSubCounter::LaunchGpuNonBonded);
1639
+ wallcycle_stop(wcycle, WallCycleCounter::LaunchGpu);
1640
+ // with X buffer ops offloaded to the GPU on all but the search steps
1641
+
1642
+ // bonded work not split into separate local and non-local, so with DD
1643
+ // we can only launch the kernel after non-local coordinates have been received.
1644
+ if (domainWork.haveGpuBondedWork && !simulationWork.havePpDomainDecomposition)
1645
+ {
1646
+ fr->listedForcesGpu->setPbcAndlaunchKernel(fr->pbcType, box, fr->bMolPBC, stepWork);
1647
+ }
1648
+
1649
+ /* launch local nonbonded work on GPU */
1650
+ wallcycle_start_nocount(wcycle, WallCycleCounter::LaunchGpu);
1651
+ wallcycle_sub_start_nocount(wcycle, WallCycleSubCounter::LaunchGpuNonBonded);
1652
+ do_nb_verlet(fr, ic, enerd, stepWork, InteractionLocality::Local, enbvClearFNo, step, nrnb, wcycle);
1653
+ wallcycle_sub_stop(wcycle, WallCycleSubCounter::LaunchGpuNonBonded);
1654
+ wallcycle_stop(wcycle, WallCycleCounter::LaunchGpu);
1655
+ }
1656
+
1657
+ if (stepWork.haveGpuPmeOnThisRank)
1658
+ {
1659
+ // In PME GPU and mixed mode we launch FFT / gather after the
1660
+ // X copy/transform to allow overlap as well as after the GPU NB
1661
+ // launch to avoid FFT launch overhead hijacking the CPU and delaying
1662
+ // the nonbonded kernel.
1663
+ launchPmeGpuFftAndGather(fr->pmedata,
1664
+ lambda[static_cast<int>(FreeEnergyPerturbationCouplingType::Coul)],
1665
+ wcycle,
1666
+ stepWork);
1667
+ }
1668
+
1669
+ /* Communicate coordinates and sum dipole if necessary +
1670
+ do non-local pair search */
1671
+ if (simulationWork.havePpDomainDecomposition)
1672
+ {
1673
+ if (stepWork.doNeighborSearch)
1674
+ {
1675
+ // TODO: fuse this branch with the above large stepWork.doNeighborSearch block
1676
+ wallcycle_start_nocount(wcycle, WallCycleCounter::NS);
1677
+ wallcycle_sub_start(wcycle, WallCycleSubCounter::NBSSearchNonLocal);
1678
+ /* Note that with a GPU the launch overhead of the list transfer is not timed separately */
1679
+ nbv->constructPairlist(InteractionLocality::NonLocal, top->excls, step, nrnb);
1680
+
1681
+ nbv->setupGpuShortRangeWork(fr->listedForcesGpu.get(), InteractionLocality::NonLocal);
1682
+ wallcycle_sub_stop(wcycle, WallCycleSubCounter::NBSSearchNonLocal);
1683
+ wallcycle_stop(wcycle, WallCycleCounter::NS);
1684
+ // TODO refactor this GPU halo exchange re-initialisation
1685
+ // to location in do_md where GPU halo exchange is
1686
+ // constructed at partitioning, after above stateGpu
1687
+ // re-initialization has similarly been refactored
1688
+ if (simulationWork.useGpuHaloExchange)
1689
+ {
1690
+ reinitGpuHaloExchange(*cr, stateGpu->getCoordinates(), stateGpu->getForces());
1691
+ }
1692
+ }
1693
+ else
1694
+ {
1695
+ GpuEventSynchronizer* gpuCoordinateHaloLaunched = nullptr;
1696
+ if (stepWork.useGpuXHalo)
1697
+ {
1698
+ // The following must be called after local setCoordinates (which records an event
1699
+ // when the coordinate data has been copied to the device).
1700
+ gpuCoordinateHaloLaunched = communicateGpuHaloCoordinates(*cr, box, localXReadyOnDevice);
1701
+
1702
+ if (domainWork.haveCpuBondedWork || domainWork.haveFreeEnergyWork)
1703
+ {
1704
+ // non-local part of coordinate buffer must be copied back to host for CPU work
1705
+ stateGpu->copyCoordinatesFromGpu(
1706
+ x.unpaddedArrayRef(), AtomLocality::NonLocal, gpuCoordinateHaloLaunched);
1707
+ }
1708
+ }
1709
+ else
1710
+ {
1711
+ if (simulationWork.useGpuUpdate)
1712
+ {
1713
+ GMX_ASSERT(haveCopiedXFromGpu,
1714
+ "a wait should only be triggered if copy has been scheduled");
1715
+ const bool haveAlreadyWaited =
1716
+ (stepWork.computePmeOnSeparateRank && !pmeSendCoordinatesFromGpu);
1717
+ if (!haveAlreadyWaited)
1718
+ {
1719
+ stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
1720
+ }
1721
+ }
1722
+ dd_move_x(cr->dd, box, x.unpaddedArrayRef(), wcycle);
1723
+ }
1724
+
1725
+ if (stepWork.useGpuXBufferOps)
1726
+ {
1727
+ if (!stepWork.useGpuXHalo)
1728
+ {
1729
+ stateGpu->copyCoordinatesToGpu(x.unpaddedArrayRef(), AtomLocality::NonLocal);
1730
+ }
1731
+ nbv->convertCoordinatesGpu(
1732
+ AtomLocality::NonLocal,
1733
+ stateGpu->getCoordinates(),
1734
+ stateGpu->getCoordinatesReadyOnDeviceEvent(
1735
+ AtomLocality::NonLocal, simulationWork, stepWork, gpuCoordinateHaloLaunched));
1736
+ }
1737
+ else
1738
+ {
1739
+ nbv->convertCoordinates(AtomLocality::NonLocal, x.unpaddedArrayRef());
1740
+ }
1741
+ }
1742
+
1743
+ if (simulationWork.useGpuNonbonded)
1744
+ {
1745
+
1746
+ if (!stepWork.useGpuXBufferOps)
1747
+ {
1748
+ wallcycle_start(wcycle, WallCycleCounter::LaunchGpu);
1749
+ wallcycle_sub_start(wcycle, WallCycleSubCounter::LaunchGpuNonBonded);
1750
+ Nbnxm::gpu_copy_xq_to_gpu(nbv->gpu_nbv, nbv->nbat.get(), AtomLocality::NonLocal);
1751
+ wallcycle_sub_stop(wcycle, WallCycleSubCounter::LaunchGpuNonBonded);
1752
+ wallcycle_stop(wcycle, WallCycleCounter::LaunchGpu);
1753
+ }
1754
+
1755
+ if (domainWork.haveGpuBondedWork)
1756
+ {
1757
+ fr->listedForcesGpu->setPbcAndlaunchKernel(fr->pbcType, box, fr->bMolPBC, stepWork);
1758
+ }
1759
+
1760
+ /* launch non-local nonbonded tasks on GPU */
1761
+ wallcycle_start_nocount(wcycle, WallCycleCounter::LaunchGpu);
1762
+ wallcycle_sub_start(wcycle, WallCycleSubCounter::LaunchGpuNonBonded);
1763
+ do_nb_verlet(fr, ic, enerd, stepWork, InteractionLocality::NonLocal, enbvClearFNo, step, nrnb, wcycle);
1764
+ wallcycle_sub_stop(wcycle, WallCycleSubCounter::LaunchGpuNonBonded);
1765
+ wallcycle_stop(wcycle, WallCycleCounter::LaunchGpu);
1766
+ }
1767
+ }
1768
+
1769
+ // With FEP we set up the reduction over threads for local+non-local simultaneously,
1770
+ // so we need to do that here after the local and non-local pairlist construction.
1771
+ if (stepWork.doNeighborSearch && fr->efep != FreeEnergyPerturbationType::No)
1772
+ {
1773
+ wallcycle_sub_start(wcycle, WallCycleSubCounter::NonbondedFep);
1774
+ nbv->setupFepThreadedForceBuffer(fr->natoms_force_constr);
1775
+ wallcycle_sub_stop(wcycle, WallCycleSubCounter::NonbondedFep);
1776
+ }
1777
+
1778
+ if (simulationWork.useGpuNonbonded && stepWork.computeNonbondedForces)
1779
+ {
1780
+ /* launch D2H copy-back F */
1781
+ wallcycle_start_nocount(wcycle, WallCycleCounter::LaunchGpu);
1782
+ wallcycle_sub_start_nocount(wcycle, WallCycleSubCounter::LaunchGpuNonBonded);
1783
+
1784
+ if (simulationWork.havePpDomainDecomposition)
1785
+ {
1786
+ Nbnxm::gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat.get(), stepWork, AtomLocality::NonLocal);
1787
+ }
1788
+ Nbnxm::gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat.get(), stepWork, AtomLocality::Local);
1789
+ wallcycle_sub_stop(wcycle, WallCycleSubCounter::LaunchGpuNonBonded);
1790
+
1791
+ if (domainWork.haveGpuBondedWork && stepWork.computeEnergy)
1792
+ {
1793
+ fr->listedForcesGpu->launchEnergyTransfer();
1794
+ }
1795
+ wallcycle_stop(wcycle, WallCycleCounter::LaunchGpu);
1796
+ }
1797
+
1798
+ gmx::ArrayRef<const gmx::RVec> xWholeMolecules;
1799
+ if (fr->wholeMoleculeTransform)
1800
+ {
1801
+ xWholeMolecules = fr->wholeMoleculeTransform->wholeMoleculeCoordinates(x.unpaddedArrayRef(), box);
1802
+ }
1803
+
1804
+ // For the rest of the CPU tasks that depend on GPU-update produced coordinates,
1805
+ // this wait ensures that the D2H transfer is complete.
1806
+ if (simulationWork.useGpuUpdate && !stepWork.doNeighborSearch)
1807
+ {
1808
+ const bool needCoordsOnHost = (runScheduleWork->domainWork.haveCpuLocalForceWork
1809
+ || stepWork.computeVirial || simulationWork.computeMuTot);
1810
+ const bool haveAlreadyWaited =
1811
+ simulationWork.useCpuHaloExchange
1812
+ || (stepWork.computePmeOnSeparateRank && !pmeSendCoordinatesFromGpu);
1813
+ if (needCoordsOnHost && !haveAlreadyWaited)
1814
+ {
1815
+ GMX_ASSERT(haveCopiedXFromGpu,
1816
+ "a wait should only be triggered if copy has been scheduled");
1817
+ stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
1818
+ }
1819
+ }
1820
+
1821
+ DipoleData dipoleData;
1822
+
1823
+ if (simulationWork.computeMuTot)
1824
+ {
1825
+ const int start = 0;
1826
+
1827
+ /* Calculate total (local) dipole moment in a temporary common array.
1828
+ * This makes it possible to sum them over nodes faster.
1829
+ */
1830
+ gmx::ArrayRef<const gmx::RVec> xRef =
1831
+ (xWholeMolecules.empty() ? x.unpaddedArrayRef() : xWholeMolecules);
1832
+ calc_mu(start,
1833
+ mdatoms->homenr,
1834
+ xRef,
1835
+ mdatoms->chargeA ? gmx::arrayRefFromArray(mdatoms->chargeA, mdatoms->nr)
1836
+ : gmx::ArrayRef<real>{},
1837
+ mdatoms->chargeB ? gmx::arrayRefFromArray(mdatoms->chargeB, mdatoms->nr)
1838
+ : gmx::ArrayRef<real>{},
1839
+ mdatoms->nChargePerturbed != 0,
1840
+ dipoleData.muStaging[0],
1841
+ dipoleData.muStaging[1]);
1842
+
1843
+ reduceAndUpdateMuTot(
1844
+ &dipoleData, cr, (fr->efep != FreeEnergyPerturbationType::No), lambda, muTotal, ddBalanceRegionHandler);
1845
+ }
1846
+
1847
+ /* Reset energies */
1848
+ reset_enerdata(enerd);
1849
+
1850
+ if (haveDDAtomOrdering(*cr) && simulationWork.haveSeparatePmeRank)
1851
+ {
1852
+ wallcycle_start(wcycle, WallCycleCounter::PpDuringPme);
1853
+ dd_force_flop_start(cr->dd, nrnb);
1854
+ }
1855
+
1856
+ if (inputrec.bRot)
1857
+ {
1858
+ wallcycle_start(wcycle, WallCycleCounter::Rot);
1859
+ do_rotation(cr, enforcedRotation, box, x.unpaddedConstArrayRef(), t, step, stepWork.doNeighborSearch);
1860
+ wallcycle_stop(wcycle, WallCycleCounter::Rot);
1861
+ }
1862
+
1863
+ /* Start the force cycle counter.
1864
+ * Note that a different counter is used for dynamic load balancing.
1865
+ */
1866
+ wallcycle_start(wcycle, WallCycleCounter::Force);
1867
+
1868
+ /* Set up and clear force outputs:
1869
+ * forceOutMtsLevel0: everything except what is in the other two outputs
1870
+ * forceOutMtsLevel1: PME-mesh and listed-forces group 1
1871
+ * forceOutNonbonded: non-bonded forces
1872
+ * Without multiple time stepping all point to the same object.
1873
+ * With multiple time-stepping the use is different for MTS fast (level0 only) and slow steps.
1874
+ */
1875
+ ForceOutputs forceOutMtsLevel0 = setupForceOutputs(
1876
+ &fr->forceHelperBuffers[0], force, domainWork, stepWork, simulationWork.havePpDomainDecomposition, wcycle);
1877
+
1878
+ // Force output for MTS combined forces, only set at level1 MTS steps
1879
+ std::optional<ForceOutputs> forceOutMts =
1880
+ (simulationWork.useMts && stepWork.computeSlowForces)
1881
+ ? std::optional(setupForceOutputs(&fr->forceHelperBuffers[1],
1882
+ forceView->forceMtsCombinedWithPadding(),
1883
+ domainWork,
1884
+ stepWork,
1885
+ simulationWork.havePpDomainDecomposition,
1886
+ wcycle))
1887
+ : std::nullopt;
1888
+
1889
+ ForceOutputs* forceOutMtsLevel1 =
1890
+ simulationWork.useMts ? (stepWork.computeSlowForces ? &forceOutMts.value() : nullptr)
1891
+ : &forceOutMtsLevel0;
1892
+
1893
+ const bool nonbondedAtMtsLevel1 = runScheduleWork->simulationWork.computeNonbondedAtMtsLevel1;
1894
+
1895
+ ForceOutputs* forceOutNonbonded = nonbondedAtMtsLevel1 ? forceOutMtsLevel1 : &forceOutMtsLevel0;
1896
+
1897
+ if (inputrec.bPull && pull_have_constraint(*pull_work))
1898
+ {
1899
+ clear_pull_forces(pull_work);
1900
+ }
1901
+
1902
+ /* We calculate the non-bonded forces, when done on the CPU, here.
1903
+ * We do this before calling do_force_lowlevel, because in that
1904
+ * function, the listed forces are calculated before PME, which
1905
+ * does communication. With this order, non-bonded and listed
1906
+ * force calculation imbalance can be balanced out by the domain
1907
+ * decomposition load balancing.
1908
+ */
1909
+
1910
+ const bool useOrEmulateGpuNb = simulationWork.useGpuNonbonded || fr->nbv->emulateGpu();
1911
+
1912
+ if (!useOrEmulateGpuNb)
1913
+ {
1914
+ do_nb_verlet(fr, ic, enerd, stepWork, InteractionLocality::Local, enbvClearFYes, step, nrnb, wcycle);
1915
+ }
1916
+
1917
+ // TODO Force flags should include haveFreeEnergyWork for this domain
1918
+ if (stepWork.useGpuXHalo && (domainWork.haveCpuBondedWork || domainWork.haveFreeEnergyWork))
1919
+ {
1920
+ wallcycle_stop(wcycle, WallCycleCounter::Force);
1921
+ /* Wait for non-local coordinate data to be copied from device */
1922
+ stateGpu->waitCoordinatesReadyOnHost(AtomLocality::NonLocal);
1923
+ wallcycle_start_nocount(wcycle, WallCycleCounter::Force);
1924
+ }
1925
+
1926
+ if (fr->efep != FreeEnergyPerturbationType::No && stepWork.computeNonbondedForces)
1927
+ {
1928
+ /* Calculate the local and non-local free energy interactions here.
1929
+ * Happens here on the CPU both with and without GPU.
1930
+ */
1931
+ nbv->dispatchFreeEnergyKernels(
1932
+ x,
1933
+ &forceOutNonbonded->forceWithShiftForces(),
1934
+ fr->use_simd_kernels,
1935
+ fr->ntype,
1936
+ fr->rlist,
1937
+ max_cutoff2(inputrec.pbcType, box),
1938
+ *fr->ic,
1939
+ fr->shift_vec,
1940
+ fr->nbfp,
1941
+ fr->ljpme_c6grid,
1942
+ mdatoms->chargeA ? gmx::arrayRefFromArray(mdatoms->chargeA, mdatoms->nr)
1943
+ : gmx::ArrayRef<real>{},
1944
+ mdatoms->chargeB ? gmx::arrayRefFromArray(mdatoms->chargeB, mdatoms->nr)
1945
+ : gmx::ArrayRef<real>{},
1946
+ mdatoms->typeA ? gmx::arrayRefFromArray(mdatoms->typeA, mdatoms->nr)
1947
+ : gmx::ArrayRef<int>{},
1948
+ mdatoms->typeB ? gmx::arrayRefFromArray(mdatoms->typeB, mdatoms->nr)
1949
+ : gmx::ArrayRef<int>{},
1950
+ inputrec.fepvals.get(),
1951
+ lambda,
1952
+ enerd,
1953
+ stepWork,
1954
+ nrnb);
1955
+ }
1956
+
1957
+ if (stepWork.computeNonbondedForces && !useOrEmulateGpuNb)
+ {
+ if (simulationWork.havePpDomainDecomposition)
+ {
+ do_nb_verlet(fr, ic, enerd, stepWork, InteractionLocality::NonLocal, enbvClearFNo, step, nrnb, wcycle);
+ }
+
+ if (stepWork.computeForces)
+ {
+ /* Add all the non-bonded forces to the normal force array.
+ * This can be split into a local and a non-local part when overlapping
+ * communication with computation under domain decomposition.
+ */
+ wallcycle_stop(wcycle, WallCycleCounter::Force);
+ nbv->atomdata_add_nbat_f_to_f(AtomLocality::All,
+ forceOutNonbonded->forceWithShiftForces().force());
+ wallcycle_start_nocount(wcycle, WallCycleCounter::Force);
+ }
+
+ /* If there are multiple fshift output buffers we need to reduce them */
+ if (stepWork.computeVirial)
+ {
+ /* This is not in a subcounter because it takes a
+ negligible and constant-sized amount of time */
+ nbnxn_atomdata_add_nbat_fshift_to_fshift(
+ *nbv->nbat, forceOutNonbonded->forceWithShiftForces().shiftForces());
+ }
+ }
+
+ // Compute wall interactions, when present.
+ // Note: should be moved to special forces.
+ if (inputrec.nwall && stepWork.computeNonbondedForces)
+ {
+ /* foreign lambda component for walls */
+ real dvdl_walls = do_walls(inputrec,
+ *fr,
+ box,
+ mdatoms->typeA ? gmx::arrayRefFromArray(mdatoms->typeA, mdatoms->nr)
+ : gmx::ArrayRef<int>{},
+ mdatoms->typeB ? gmx::arrayRefFromArray(mdatoms->typeB, mdatoms->nr)
+ : gmx::ArrayRef<int>{},
+ mdatoms->cENER ? gmx::arrayRefFromArray(mdatoms->cENER, mdatoms->nr)
+ : gmx::ArrayRef<unsigned short>{},
+ mdatoms->homenr,
+ mdatoms->nPerturbed,
+ x.unpaddedConstArrayRef(),
+ &forceOutMtsLevel0.forceWithVirial(),
+ lambda[static_cast<int>(FreeEnergyPerturbationCouplingType::Vdw)],
+ enerd->grpp.energyGroupPairTerms[NonBondedEnergyTerms::LJSR],
+ nrnb);
+ enerd->dvdl_lin[FreeEnergyPerturbationCouplingType::Vdw] += dvdl_walls;
+ }
+
+ if (stepWork.computeListedForces)
+ {
+ /* Check whether we need to take into account PBC in listed interactions */
+ bool needMolPbc = false;
+ for (const auto& listedForces : fr->listedForces)
+ {
+ if (listedForces.haveCpuListedForces(*fr->fcdata))
+ {
+ needMolPbc = fr->bMolPBC;
+ }
+ }
+
+ t_pbc pbc;
+
+ if (needMolPbc)
+ {
+ /* Since all atoms are in the rectangular or triclinic unit-cell,
+ * only single box vector shifts (2 in x) are required.
+ */
+ set_pbc_dd(&pbc, fr->pbcType, haveDDAtomOrdering(*cr) ? cr->dd->numCells : nullptr, TRUE, box);
+ }
+
+ for (int mtsIndex = 0; mtsIndex < (simulationWork.useMts && stepWork.computeSlowForces ? 2 : 1);
+ mtsIndex++)
+ {
+ ListedForces& listedForces = fr->listedForces[mtsIndex];
+ ForceOutputs& forceOut = (mtsIndex == 0 ? forceOutMtsLevel0 : *forceOutMtsLevel1);
+ listedForces.calculate(wcycle,
+ box,
+ inputrec.fepvals.get(),
+ cr,
+ ms,
+ x,
+ xWholeMolecules,
+ fr->fcdata.get(),
+ hist,
+ &forceOut,
+ fr,
+ &pbc,
+ enerd,
+ nrnb,
+ lambda,
+ mdatoms,
+ haveDDAtomOrdering(*cr) ? cr->dd->globalAtomIndices.data() : nullptr,
+ stepWork);
+ }
+ }
+
+ if (stepWork.computeSlowForces)
+ {
+ longRangeNonbondeds->calculate(fr->pmedata,
+ cr,
+ x.unpaddedConstArrayRef(),
+ &forceOutMtsLevel1->forceWithVirial(),
+ enerd,
+ box,
+ lambda,
+ dipoleData.muStateAB,
+ stepWork,
+ ddBalanceRegionHandler);
+ }
+
+ wallcycle_stop(wcycle, WallCycleCounter::Force);
+
+ // VdW dispersion correction, only computed on master rank to avoid double counting
+ if ((stepWork.computeEnergy || stepWork.computeVirial) && fr->dispersionCorrection && MASTER(cr))
+ {
+ // Calculate long range corrections to pressure and energy
+ const DispersionCorrection::Correction correction = fr->dispersionCorrection->calculate(
+ box, lambda[static_cast<int>(FreeEnergyPerturbationCouplingType::Vdw)]);
+
+ if (stepWork.computeEnergy)
+ {
+ enerd->term[F_DISPCORR] = correction.energy;
+ enerd->term[F_DVDL_VDW] += correction.dvdl;
+ enerd->dvdl_lin[FreeEnergyPerturbationCouplingType::Vdw] += correction.dvdl;
+ }
+ if (stepWork.computeVirial)
+ {
+ correction.correctVirial(vir_force);
+ enerd->term[F_PDISPCORR] = correction.pressure;
+ }
+ }
+
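The dispersion correction applied in the block above accounts for the Lennard-Jones attraction lost beyond the finite cut-off. As a point of reference only, the sketch below evaluates the textbook tail corrections to the energy and pressure for a homogeneous fluid, assuming g(r) = 1 beyond the cut-off; all numbers are placeholders and the formulas are the standard analytic ones, not the routine used by this code.

    // Sketch: analytic Lennard-Jones dispersion (tail) corrections for a
    // homogeneous fluid with pair dispersion -C6/r^6 and g(r) = 1 beyond rc:
    //   E_tail = -(2*pi/3) * N * rho   * C6 / rc^3
    //   P_tail = -(4*pi/3) * rho * rho * C6 / rc^3
    #include <cmath>
    #include <cstdio>

    int main()
    {
        const double pi     = std::acos(-1.0);
        const double nAtoms = 1000.0; // placeholder particle count
        const double volume = 27.0;   // placeholder volume, nm^3
        const double c6     = 6.0e-3; // placeholder C6, kJ mol^-1 nm^6
        const double rc     = 1.0;    // placeholder cut-off, nm

        const double rho   = nAtoms / volume;
        const double rc3   = rc * rc * rc;
        const double eTail = -(2.0 * pi / 3.0) * nAtoms * rho * c6 / rc3;
        const double pTail = -(4.0 * pi / 3.0) * rho * rho * c6 / rc3;

        std::printf("E_tail = %g kJ/mol, P_tail = %g kJ mol^-1 nm^-3\n", eTail, pTail);
        return 0;
    }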
+ const bool needToReceivePmeResultsFromSeparateRank = (PAR(cr) && stepWork.computePmeOnSeparateRank);
+ const bool needToReceivePmeResults =
+ (stepWork.haveGpuPmeOnThisRank || needToReceivePmeResultsFromSeparateRank);
+
+ /* When running free energy perturbations steered by AWH and doing PME calculations on the
+ * GPU we must wait for the PME calculation (dhdl) results to finish before sampling the
+ * FEP dimension with AWH. */
+ const bool needEarlyPmeResults = (awh != nullptr && awh->hasFepLambdaDimension() && needToReceivePmeResults
+ && stepWork.computeEnergy && stepWork.computeSlowForces);
+ if (needEarlyPmeResults)
+ {
+ if (stepWork.haveGpuPmeOnThisRank)
+ {
+ pme_gpu_wait_and_reduce(fr->pmedata,
+ stepWork,
+ wcycle,
+ &forceOutMtsLevel1->forceWithVirial(),
+ enerd,
+ lambda[static_cast<int>(FreeEnergyPerturbationCouplingType::Coul)]);
+ }
+ else if (needToReceivePmeResultsFromSeparateRank)
+ {
+ /* In case of node-splitting, the PP nodes receive the long-range
+ * forces, virial and energy from the PME nodes here.
+ */
+ pme_receive_force_ener(fr,
+ cr,
+ &forceOutMtsLevel1->forceWithVirial(),
+ enerd,
+ simulationWork.useGpuPmePpCommunication,
+ stepWork.useGpuPmeFReduction,
+ wcycle);
+ }
+ }
+
+ computeSpecialForces(fplog,
+ cr,
+ inputrec,
+ awh,
+ enforcedRotation,
+ imdSession,
+ pull_work,
+ step,
+ t,
+ wcycle,
+ fr->forceProviders,
+ box,
+ x.unpaddedArrayRef(),
+ mdatoms,
+ lambda,
+ stepWork,
+ &forceOutMtsLevel0.forceWithVirial(),
+ forceOutMtsLevel1 ? &forceOutMtsLevel1->forceWithVirial() : nullptr,
+ enerd,
+ ed,
+ stepWork.doNeighborSearch);
+
+ if (simulationWork.havePpDomainDecomposition && stepWork.computeForces && stepWork.useGpuFHalo
+ && domainWork.haveCpuLocalForceWork)
+ {
+ stateGpu->copyForcesToGpu(forceOutMtsLevel0.forceWithShiftForces().force(), AtomLocality::Local);
+ }
+
+ GMX_ASSERT(!(nonbondedAtMtsLevel1 && stepWork.useGpuFBufferOps),
+ "The schedule below does not allow for nonbonded MTS with GPU buffer ops");
+ GMX_ASSERT(!(nonbondedAtMtsLevel1 && stepWork.useGpuFHalo),
+ "The schedule below does not allow for nonbonded MTS with GPU halo exchange");
+ // Will store the number of cycles spent waiting for the GPU that
+ // will later be used in the DLB accounting.
+ float cycles_wait_gpu = 0;
+ if (useOrEmulateGpuNb && stepWork.computeNonbondedForces)
+ {
+ auto& forceWithShiftForces = forceOutNonbonded->forceWithShiftForces();
+
+ /* wait for non-local forces (or calculate in emulation mode) */
+ if (simulationWork.havePpDomainDecomposition)
+ {
+ if (simulationWork.useGpuNonbonded)
+ {
+ cycles_wait_gpu += Nbnxm::gpu_wait_finish_task(
+ nbv->gpu_nbv,
+ stepWork,
+ AtomLocality::NonLocal,
+ enerd->grpp.energyGroupPairTerms[NonBondedEnergyTerms::LJSR].data(),
+ enerd->grpp.energyGroupPairTerms[NonBondedEnergyTerms::CoulombSR].data(),
+ forceWithShiftForces.shiftForces(),
+ wcycle);
+ }
+ else
+ {
+ wallcycle_start_nocount(wcycle, WallCycleCounter::Force);
+ do_nb_verlet(
+ fr, ic, enerd, stepWork, InteractionLocality::NonLocal, enbvClearFYes, step, nrnb, wcycle);
+ wallcycle_stop(wcycle, WallCycleCounter::Force);
+ }
+
+ if (stepWork.useGpuFBufferOps)
+ {
+ if (domainWork.haveNonLocalForceContribInCpuBuffer)
+ {
+ stateGpu->copyForcesToGpu(forceOutMtsLevel0.forceWithShiftForces().force(),
+ AtomLocality::NonLocal);
+ }
+
+
+ fr->gpuForceReduction[gmx::AtomLocality::NonLocal]->execute();
+
+ if (!stepWork.useGpuFHalo)
+ {
+ /* We don't explicitly wait for the forces to be reduced on device,
+ * but wait for them to finish copying to CPU instead.
+ * So, we manually consume the event, see Issue #3988. */
+ stateGpu->consumeForcesReducedOnDeviceEvent(AtomLocality::NonLocal);
+ // copy from GPU input for dd_move_f()
+ stateGpu->copyForcesFromGpu(forceOutMtsLevel0.forceWithShiftForces().force(),
+ AtomLocality::NonLocal);
+ }
+ }
+ else
+ {
+ nbv->atomdata_add_nbat_f_to_f(AtomLocality::NonLocal, forceWithShiftForces.force());
+ }
+
+ if (fr->nbv->emulateGpu() && stepWork.computeVirial)
+ {
+ nbnxn_atomdata_add_nbat_fshift_to_fshift(*nbv->nbat, forceWithShiftForces.shiftForces());
+ }
+ }
+ }
+
+ /* Combining the forces for multiple time stepping before the halo exchange, when possible,
+ * avoids an extra halo exchange (when DD is used) and post-processing step.
+ */
+ if (stepWork.combineMtsForcesBeforeHaloExchange)
+ {
+ combineMtsForces(getLocalAtomCount(cr->dd, *mdatoms, simulationWork.havePpDomainDecomposition),
+ force.unpaddedArrayRef(),
+ forceView->forceMtsCombined(),
+ inputrec.mtsLevels[1].stepFactor);
+ }
+
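The force combination performed just above follows the usual multiple-time-stepping (impulse) idea: at a slow step the slow, level-1 forces are scaled by the step factor so that their impulse averaged over an MTS cycle matches the true force. A minimal sketch of that combination rule follows; the function and type names are illustrative and not those of this code.

    // Sketch: combine fast (level 0) and slow (level 1) forces at an MTS slow
    // step; the slow forces, evaluated only every 'stepFactor' steps, are
    // scaled up so their average impulse is preserved.
    #include <array>
    #include <cstddef>
    #include <cstdio>
    #include <vector>

    using RVec = std::array<double, 3>;

    static void combineForces(std::vector<RVec>*       combined,
                              const std::vector<RVec>& fast,
                              const std::vector<RVec>& slow,
                              int                      stepFactor)
    {
        for (std::size_t i = 0; i < combined->size(); ++i)
        {
            for (int d = 0; d < 3; ++d)
            {
                (*combined)[i][d] = fast[i][d] + stepFactor * slow[i][d];
            }
        }
    }

    int main()
    {
        const std::vector<RVec> fast{ RVec{ 1.0, 0.0, 0.0 } };
        const std::vector<RVec> slow{ RVec{ 0.1, 0.2, 0.0 } };
        std::vector<RVec>       combined(1);

        combineForces(&combined, fast, slow, 4); // e.g. slow forces every 4th step

        std::printf("combined = (%g, %g, %g)\n", combined[0][0], combined[0][1], combined[0][2]);
        return 0;
    }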
+ if (simulationWork.havePpDomainDecomposition)
+ {
+ /* We are done with the CPU compute.
+ * We will now communicate the non-local forces.
+ * If we use a GPU this will overlap with GPU work, so in that case
+ * we do not close the DD force balancing region here.
+ */
+ ddBalanceRegionHandler.closeAfterForceComputationCpu();
+
+ if (stepWork.computeForces)
+ {
+
+ if (stepWork.useGpuFHalo)
+ {
+ // If there exist CPU forces, data from halo exchange should accumulate into these
+ bool accumulateForces = domainWork.haveCpuLocalForceWork;
+ gmx::FixedCapacityVector<GpuEventSynchronizer*, 2> gpuForceHaloDependencies;
+ gpuForceHaloDependencies.push_back(stateGpu->fReadyOnDevice(AtomLocality::Local));
+ gpuForceHaloDependencies.push_back(stateGpu->fReducedOnDevice(AtomLocality::NonLocal));
+
+ communicateGpuHaloForces(*cr, accumulateForces, &gpuForceHaloDependencies);
+ }
+ else
+ {
+ if (stepWork.useGpuFBufferOps)
+ {
+ stateGpu->waitForcesReadyOnHost(AtomLocality::NonLocal);
+ }
+
+ // Without MTS or with MTS at slow steps with uncombined forces we need to
+ // communicate the fast forces
+ if (!simulationWork.useMts || !stepWork.combineMtsForcesBeforeHaloExchange)
+ {
+ dd_move_f(cr->dd, &forceOutMtsLevel0.forceWithShiftForces(), wcycle);
+ }
+ // With MTS we need to communicate the slow or combined (in forceOutMtsLevel1) forces
+ if (simulationWork.useMts && stepWork.computeSlowForces)
+ {
+ dd_move_f(cr->dd, &forceOutMtsLevel1->forceWithShiftForces(), wcycle);
+ }
+ }
+ }
+ }
+
+ // With both nonbonded and PME offloaded to a GPU on the same rank, we use
+ // an alternating wait/reduction scheme.
+ // When running free energy perturbations steered by AWH and calculating PME on GPU,
+ // i.e. if needEarlyPmeResults == true, the PME results have already been reduced above.
+ bool alternateGpuWait = (!c_disableAlternatingWait && stepWork.haveGpuPmeOnThisRank
+ && simulationWork.useGpuNonbonded && !simulationWork.havePpDomainDecomposition
+ && !stepWork.useGpuFBufferOps && !needEarlyPmeResults);
+ if (alternateGpuWait)
+ {
+ alternatePmeNbGpuWaitReduce(fr->nbv.get(),
+ fr->pmedata,
+ forceOutNonbonded,
+ forceOutMtsLevel1,
+ enerd,
+ lambda[static_cast<int>(FreeEnergyPerturbationCouplingType::Coul)],
+ stepWork,
+ wcycle);
+ }
+
+ if (!alternateGpuWait && stepWork.haveGpuPmeOnThisRank && !needEarlyPmeResults)
+ {
+ pme_gpu_wait_and_reduce(fr->pmedata,
+ stepWork,
+ wcycle,
+ &forceOutMtsLevel1->forceWithVirial(),
+ enerd,
+ lambda[static_cast<int>(FreeEnergyPerturbationCouplingType::Coul)]);
+ }
+
+ /* Wait for local GPU NB outputs on the non-alternating wait path */
+ if (!alternateGpuWait && stepWork.computeNonbondedForces && simulationWork.useGpuNonbonded)
+ {
+ /* Measured overhead on CUDA and OpenCL with(out) GPU sharing
+ * is between 0.5 and 1.5 Mcycles. So 2 Mcycles is an overestimate,
+ * but even with a step of 0.1 ms the difference is less than 1%
+ * of the step time.
+ */
+ const float gpuWaitApiOverheadMargin = 2e6F; /* cycles */
+ const float waitCycles = Nbnxm::gpu_wait_finish_task(
+ nbv->gpu_nbv,
+ stepWork,
+ AtomLocality::Local,
+ enerd->grpp.energyGroupPairTerms[NonBondedEnergyTerms::LJSR].data(),
+ enerd->grpp.energyGroupPairTerms[NonBondedEnergyTerms::CoulombSR].data(),
+ forceOutNonbonded->forceWithShiftForces().shiftForces(),
+ wcycle);
+
+ if (ddBalanceRegionHandler.useBalancingRegion())
+ {
+ DdBalanceRegionWaitedForGpu waitedForGpu = DdBalanceRegionWaitedForGpu::yes;
+ if (stepWork.computeForces && waitCycles <= gpuWaitApiOverheadMargin)
+ {
+ /* We measured few cycles, it could be that the kernel
+ * and transfer finished earlier and there was no actual
+ * wait time, only API call overhead.
+ * Then the actual time could be anywhere between 0 and
+ * cycles_wait_est. We will use half of cycles_wait_est.
+ */
+ waitedForGpu = DdBalanceRegionWaitedForGpu::no;
+ }
+ ddBalanceRegionHandler.closeAfterForceComputationGpu(cycles_wait_gpu, waitedForGpu);
+ }
+ }
+
+ if (fr->nbv->emulateGpu())
+ {
+ // NOTE: emulation kernel is not included in the balancing region,
+ // but emulation mode does not target performance anyway
+ wallcycle_start_nocount(wcycle, WallCycleCounter::Force);
+ do_nb_verlet(fr,
+ ic,
+ enerd,
+ stepWork,
+ InteractionLocality::Local,
+ haveDDAtomOrdering(*cr) ? enbvClearFNo : enbvClearFYes,
+ step,
+ nrnb,
+ wcycle);
+ wallcycle_stop(wcycle, WallCycleCounter::Force);
+ }
+
+ // If on GPU PME-PP comms path, receive forces from PME before GPU buffer ops
+ // TODO refactor this and unify with below default-path call to the same function
+ // When running free energy perturbations steered by AWH and calculating PME on GPU,
+ // i.e. if needEarlyPmeResults == true, the PME results have already been reduced above.
+ if (needToReceivePmeResultsFromSeparateRank && simulationWork.useGpuPmePpCommunication && !needEarlyPmeResults)
+ {
+ /* In case of node-splitting, the PP nodes receive the long-range
+ * forces, virial and energy from the PME nodes here.
+ */
+ pme_receive_force_ener(fr,
+ cr,
+ &forceOutMtsLevel1->forceWithVirial(),
+ enerd,
+ simulationWork.useGpuPmePpCommunication,
+ stepWork.useGpuPmeFReduction,
+ wcycle);
+ }
+
+
+ /* Do the nonbonded GPU (or emulation) force buffer reduction
+ * on the non-alternating path. */
+ GMX_ASSERT(!(nonbondedAtMtsLevel1 && stepWork.useGpuFBufferOps),
+ "The schedule below does not allow for nonbonded MTS with GPU buffer ops");
+ if (useOrEmulateGpuNb && !alternateGpuWait)
+ {
+ if (stepWork.useGpuFBufferOps)
+ {
+ ArrayRef<gmx::RVec> forceWithShift = forceOutNonbonded->forceWithShiftForces().force();
+
+ // TODO: move these steps as early as possible:
+ // - CPU f H2D should be as soon as all CPU-side forces are done
+ // - wait for force reduction does not need to block host (at least not here, it's sufficient to wait
+ // before the next CPU task that consumes the forces: vsite spread or update)
+ // - copy is not performed if GPU force halo exchange is active, because it would overwrite the result
+ // of the halo exchange. In that case the copy is instead performed above, before the exchange.
+ // These should be unified.
+ if (domainWork.haveLocalForceContribInCpuBuffer && !stepWork.useGpuFHalo)
+ {
+ stateGpu->copyForcesToGpu(forceWithShift, AtomLocality::Local);
+ }
+
+ if (stepWork.computeNonbondedForces)
+ {
+ fr->gpuForceReduction[gmx::AtomLocality::Local]->execute();
+ }
+
+ // Copy forces to host if they are needed for update or if virtual sites are enabled.
+ // If there are vsites, we need to copy forces every step to spread vsite forces on host.
+ // TODO: Once the output flags are included in the step workload, this copy can be combined with the
+ // copy call done in sim_utils(...) for the output.
+ // NOTE: If there are virtual sites, the forces are modified on host after this D2H copy. Hence,
+ // they should not be copied in do_md(...) for the output.
+ if (!simulationWork.useGpuUpdate
+ || (simulationWork.useGpuUpdate && haveDDAtomOrdering(*cr) && simulationWork.useCpuPmePpCommunication)
+ || vsite)
+ {
+ if (stepWork.computeNonbondedForces)
+ {
+ /* We have previously issued force reduction on the GPU, but we will
+ * not use this event, instead relying on the stream being in-order.
+ * Issue #3988. */
+ stateGpu->consumeForcesReducedOnDeviceEvent(AtomLocality::Local);
+ }
+ stateGpu->copyForcesFromGpu(forceWithShift, AtomLocality::Local);
+ stateGpu->waitForcesReadyOnHost(AtomLocality::Local);
+ }
+ }
+ else if (stepWork.computeNonbondedForces)
+ {
+ ArrayRef<gmx::RVec> forceWithShift = forceOutNonbonded->forceWithShiftForces().force();
+ nbv->atomdata_add_nbat_f_to_f(AtomLocality::Local, forceWithShift);
+ }
+ }
+
+ launchGpuEndOfStepTasks(
+ nbv, fr->listedForcesGpu.get(), fr->pmedata, enerd, *runScheduleWork, step, wcycle);
+
+ if (haveDDAtomOrdering(*cr))
+ {
+ dd_force_flop_stop(cr->dd, nrnb);
+ }
+
+ const bool haveCombinedMtsForces = (stepWork.computeForces && simulationWork.useMts && stepWork.computeSlowForces
+ && stepWork.combineMtsForcesBeforeHaloExchange);
+ if (stepWork.computeForces)
+ {
+ postProcessForceWithShiftForces(
+ nrnb, wcycle, box, x.unpaddedArrayRef(), &forceOutMtsLevel0, vir_force, *mdatoms, *fr, vsite, stepWork);
+
+ if (simulationWork.useMts && stepWork.computeSlowForces && !haveCombinedMtsForces)
+ {
+ postProcessForceWithShiftForces(
+ nrnb, wcycle, box, x.unpaddedArrayRef(), forceOutMtsLevel1, vir_force, *mdatoms, *fr, vsite, stepWork);
+ }
+ }
+
+ // TODO refactor this and unify with above GPU PME-PP / GPU update path call to the same function
+ // When running free energy perturbations steered by AWH and calculating PME on GPU,
+ // i.e. if needEarlyPmeResults == true, the PME results have already been reduced above.
+ if (needToReceivePmeResultsFromSeparateRank && simulationWork.useCpuPmePpCommunication && !needEarlyPmeResults)
+ {
+ /* In case of node-splitting, the PP nodes receive the long-range
+ * forces, virial and energy from the PME nodes here.
+ */
+ pme_receive_force_ener(fr,
+ cr,
+ &forceOutMtsLevel1->forceWithVirial(),
+ enerd,
+ simulationWork.useGpuPmePpCommunication,
+ false,
+ wcycle);
+ }
+
+ if (stepWork.computeForces)
+ {
+ /* If we don't use MTS or if we already combined the MTS forces before, we only
+ * need to post-process one ForceOutputs object here, called forceOutCombined,
+ * otherwise we have to post-process two outputs and then combine them.
+ */
+ ForceOutputs& forceOutCombined = (haveCombinedMtsForces ? forceOutMts.value() : forceOutMtsLevel0);
+ postProcessForces(
+ cr, step, nrnb, wcycle, box, x.unpaddedArrayRef(), &forceOutCombined, vir_force, mdatoms, fr, vsite, stepWork);
+
+ if (simulationWork.useMts && stepWork.computeSlowForces && !haveCombinedMtsForces)
+ {
+ postProcessForces(
+ cr, step, nrnb, wcycle, box, x.unpaddedArrayRef(), forceOutMtsLevel1, vir_force, mdatoms, fr, vsite, stepWork);
+
+ combineMtsForces(mdatoms->homenr,
+ force.unpaddedArrayRef(),
+ forceView->forceMtsCombined(),
+ inputrec.mtsLevels[1].stepFactor);
+ }
+ }
+
+ if (stepWork.computeEnergy)
+ {
+ /* Compute the final potential energy terms */
+ accumulatePotentialEnergies(enerd, lambda, inputrec.fepvals.get());
+
+ if (!EI_TPI(inputrec.eI))
+ {
+ checkPotentialEnergyValidity(step, *enerd, inputrec);
+ }
+ }
+
+ /* In case we don't have constraints and are using GPUs, the next balancing
+ * region starts here.
+ * Some "special" work at the end of do_force_cuts?, such as vsite spread,
+ * virial calculation and COM pulling, is thus not included in
+ * the balance timing, which is ok as most tasks do communication.
+ */
+ ddBalanceRegionHandler.openBeforeForceComputationCpu(DdAllowBalanceRegionReopen::no);
+ }