forgeo-gmlib 0.6.2__cp39-cp39-musllinux_1_2_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (554) hide show
  1. forgeo/gmlib/GeologicalModel3D.py +758 -0
  2. forgeo/gmlib/__init__.py +9 -0
  3. forgeo/gmlib/_version.py +34 -0
  4. forgeo/gmlib/architecture/__init__.py +1 -0
  5. forgeo/gmlib/architecture/core.py +130 -0
  6. forgeo/gmlib/common.cpython-39-x86_64-linux-gnu.so +0 -0
  7. forgeo/gmlib/fault_network.py +171 -0
  8. forgeo/gmlib/geomodeller_data.py +101 -0
  9. forgeo/gmlib/geomodeller_project.py +396 -0
  10. forgeo/gmlib/myxmltools.py +30 -0
  11. forgeo/gmlib/pypotential2D.cpython-39-x86_64-linux-gnu.so +0 -0
  12. forgeo/gmlib/pypotential3D.cpython-39-x86_64-linux-gnu.so +0 -0
  13. forgeo/gmlib/tesselate.py +236 -0
  14. forgeo/gmlib/tesselate_deprecated.py +249 -0
  15. forgeo/gmlib/topography_reader.py +198 -0
  16. forgeo/gmlib/utils/__init__.py +0 -0
  17. forgeo/gmlib/utils/append_data.py +508 -0
  18. forgeo/gmlib/utils/export.py +45 -0
  19. forgeo/gmlib/utils/normalized_gradient.py +40 -0
  20. forgeo/gmlib/utils/tools.py +35 -0
  21. forgeo_gmlib-0.6.2.dist-info/METADATA +23 -0
  22. forgeo_gmlib-0.6.2.dist-info/RECORD +554 -0
  23. forgeo_gmlib-0.6.2.dist-info/WHEEL +5 -0
  24. forgeo_gmlib-0.6.2.dist-info/licenses/LICENSE +661 -0
  25. include/eigen3/Eigen/Cholesky +45 -0
  26. include/eigen3/Eigen/CholmodSupport +48 -0
  27. include/eigen3/Eigen/Core +385 -0
  28. include/eigen3/Eigen/Dense +7 -0
  29. include/eigen3/Eigen/Eigen +2 -0
  30. include/eigen3/Eigen/Eigenvalues +60 -0
  31. include/eigen3/Eigen/Geometry +59 -0
  32. include/eigen3/Eigen/Householder +29 -0
  33. include/eigen3/Eigen/IterativeLinearSolvers +48 -0
  34. include/eigen3/Eigen/Jacobi +32 -0
  35. include/eigen3/Eigen/KLUSupport +41 -0
  36. include/eigen3/Eigen/LU +47 -0
  37. include/eigen3/Eigen/MetisSupport +35 -0
  38. include/eigen3/Eigen/OrderingMethods +70 -0
  39. include/eigen3/Eigen/PaStiXSupport +49 -0
  40. include/eigen3/Eigen/PardisoSupport +35 -0
  41. include/eigen3/Eigen/QR +50 -0
  42. include/eigen3/Eigen/QtAlignedMalloc +39 -0
  43. include/eigen3/Eigen/SPQRSupport +34 -0
  44. include/eigen3/Eigen/SVD +50 -0
  45. include/eigen3/Eigen/Sparse +34 -0
  46. include/eigen3/Eigen/SparseCholesky +37 -0
  47. include/eigen3/Eigen/SparseCore +69 -0
  48. include/eigen3/Eigen/SparseLU +48 -0
  49. include/eigen3/Eigen/SparseQR +36 -0
  50. include/eigen3/Eigen/StdDeque +27 -0
  51. include/eigen3/Eigen/StdList +26 -0
  52. include/eigen3/Eigen/StdVector +27 -0
  53. include/eigen3/Eigen/SuperLUSupport +64 -0
  54. include/eigen3/Eigen/UmfPackSupport +40 -0
  55. include/eigen3/Eigen/src/Cholesky/LDLT.h +688 -0
  56. include/eigen3/Eigen/src/Cholesky/LLT.h +558 -0
  57. include/eigen3/Eigen/src/Cholesky/LLT_LAPACKE.h +99 -0
  58. include/eigen3/Eigen/src/CholmodSupport/CholmodSupport.h +682 -0
  59. include/eigen3/Eigen/src/Core/ArithmeticSequence.h +406 -0
  60. include/eigen3/Eigen/src/Core/Array.h +425 -0
  61. include/eigen3/Eigen/src/Core/ArrayBase.h +226 -0
  62. include/eigen3/Eigen/src/Core/ArrayWrapper.h +209 -0
  63. include/eigen3/Eigen/src/Core/Assign.h +90 -0
  64. include/eigen3/Eigen/src/Core/AssignEvaluator.h +1010 -0
  65. include/eigen3/Eigen/src/Core/Assign_MKL.h +178 -0
  66. include/eigen3/Eigen/src/Core/BandMatrix.h +353 -0
  67. include/eigen3/Eigen/src/Core/Block.h +463 -0
  68. include/eigen3/Eigen/src/Core/BooleanRedux.h +164 -0
  69. include/eigen3/Eigen/src/Core/CommaInitializer.h +164 -0
  70. include/eigen3/Eigen/src/Core/ConditionEstimator.h +175 -0
  71. include/eigen3/Eigen/src/Core/CoreEvaluators.h +1741 -0
  72. include/eigen3/Eigen/src/Core/CoreIterators.h +132 -0
  73. include/eigen3/Eigen/src/Core/CwiseBinaryOp.h +183 -0
  74. include/eigen3/Eigen/src/Core/CwiseNullaryOp.h +1001 -0
  75. include/eigen3/Eigen/src/Core/CwiseTernaryOp.h +197 -0
  76. include/eigen3/Eigen/src/Core/CwiseUnaryOp.h +103 -0
  77. include/eigen3/Eigen/src/Core/CwiseUnaryView.h +132 -0
  78. include/eigen3/Eigen/src/Core/DenseBase.h +701 -0
  79. include/eigen3/Eigen/src/Core/DenseCoeffsBase.h +685 -0
  80. include/eigen3/Eigen/src/Core/DenseStorage.h +652 -0
  81. include/eigen3/Eigen/src/Core/Diagonal.h +259 -0
  82. include/eigen3/Eigen/src/Core/DiagonalMatrix.h +391 -0
  83. include/eigen3/Eigen/src/Core/DiagonalProduct.h +28 -0
  84. include/eigen3/Eigen/src/Core/Dot.h +313 -0
  85. include/eigen3/Eigen/src/Core/EigenBase.h +160 -0
  86. include/eigen3/Eigen/src/Core/ForceAlignedAccess.h +150 -0
  87. include/eigen3/Eigen/src/Core/Fuzzy.h +155 -0
  88. include/eigen3/Eigen/src/Core/GeneralProduct.h +465 -0
  89. include/eigen3/Eigen/src/Core/GenericPacketMath.h +1040 -0
  90. include/eigen3/Eigen/src/Core/GlobalFunctions.h +194 -0
  91. include/eigen3/Eigen/src/Core/IO.h +258 -0
  92. include/eigen3/Eigen/src/Core/IndexedView.h +247 -0
  93. include/eigen3/Eigen/src/Core/Inverse.h +117 -0
  94. include/eigen3/Eigen/src/Core/Map.h +171 -0
  95. include/eigen3/Eigen/src/Core/MapBase.h +310 -0
  96. include/eigen3/Eigen/src/Core/MathFunctions.h +2212 -0
  97. include/eigen3/Eigen/src/Core/MathFunctionsImpl.h +200 -0
  98. include/eigen3/Eigen/src/Core/Matrix.h +578 -0
  99. include/eigen3/Eigen/src/Core/MatrixBase.h +541 -0
  100. include/eigen3/Eigen/src/Core/NestByValue.h +85 -0
  101. include/eigen3/Eigen/src/Core/NoAlias.h +109 -0
  102. include/eigen3/Eigen/src/Core/NumTraits.h +351 -0
  103. include/eigen3/Eigen/src/Core/PartialReduxEvaluator.h +237 -0
  104. include/eigen3/Eigen/src/Core/PermutationMatrix.h +605 -0
  105. include/eigen3/Eigen/src/Core/PlainObjectBase.h +1128 -0
  106. include/eigen3/Eigen/src/Core/Product.h +191 -0
  107. include/eigen3/Eigen/src/Core/ProductEvaluators.h +1179 -0
  108. include/eigen3/Eigen/src/Core/Random.h +218 -0
  109. include/eigen3/Eigen/src/Core/Redux.h +515 -0
  110. include/eigen3/Eigen/src/Core/Ref.h +381 -0
  111. include/eigen3/Eigen/src/Core/Replicate.h +142 -0
  112. include/eigen3/Eigen/src/Core/Reshaped.h +454 -0
  113. include/eigen3/Eigen/src/Core/ReturnByValue.h +119 -0
  114. include/eigen3/Eigen/src/Core/Reverse.h +217 -0
  115. include/eigen3/Eigen/src/Core/Select.h +164 -0
  116. include/eigen3/Eigen/src/Core/SelfAdjointView.h +365 -0
  117. include/eigen3/Eigen/src/Core/SelfCwiseBinaryOp.h +47 -0
  118. include/eigen3/Eigen/src/Core/Solve.h +188 -0
  119. include/eigen3/Eigen/src/Core/SolveTriangular.h +235 -0
  120. include/eigen3/Eigen/src/Core/SolverBase.h +168 -0
  121. include/eigen3/Eigen/src/Core/StableNorm.h +251 -0
  122. include/eigen3/Eigen/src/Core/StlIterators.h +463 -0
  123. include/eigen3/Eigen/src/Core/Stride.h +120 -0
  124. include/eigen3/Eigen/src/Core/Swap.h +68 -0
  125. include/eigen3/Eigen/src/Core/Transpose.h +464 -0
  126. include/eigen3/Eigen/src/Core/Transpositions.h +386 -0
  127. include/eigen3/Eigen/src/Core/TriangularMatrix.h +994 -0
  128. include/eigen3/Eigen/src/Core/VectorBlock.h +96 -0
  129. include/eigen3/Eigen/src/Core/VectorwiseOp.h +784 -0
  130. include/eigen3/Eigen/src/Core/Visitor.h +381 -0
  131. include/eigen3/Eigen/src/Core/arch/AVX/Complex.h +368 -0
  132. include/eigen3/Eigen/src/Core/arch/AVX/MathFunctions.h +228 -0
  133. include/eigen3/Eigen/src/Core/arch/AVX/PacketMath.h +1588 -0
  134. include/eigen3/Eigen/src/Core/arch/AVX/TypeCasting.h +115 -0
  135. include/eigen3/Eigen/src/Core/arch/AVX512/Complex.h +384 -0
  136. include/eigen3/Eigen/src/Core/arch/AVX512/MathFunctions.h +361 -0
  137. include/eigen3/Eigen/src/Core/arch/AVX512/PacketMath.h +2270 -0
  138. include/eigen3/Eigen/src/Core/arch/AVX512/TypeCasting.h +89 -0
  139. include/eigen3/Eigen/src/Core/arch/AltiVec/Complex.h +415 -0
  140. include/eigen3/Eigen/src/Core/arch/AltiVec/MathFunctions.h +119 -0
  141. include/eigen3/Eigen/src/Core/arch/AltiVec/MatrixProduct.h +2776 -0
  142. include/eigen3/Eigen/src/Core/arch/AltiVec/MatrixProductCommon.h +159 -0
  143. include/eigen3/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h +627 -0
  144. include/eigen3/Eigen/src/Core/arch/AltiVec/MatrixVectorProduct.h +2400 -0
  145. include/eigen3/Eigen/src/Core/arch/AltiVec/PacketMath.h +2743 -0
  146. include/eigen3/Eigen/src/Core/arch/CUDA/Complex.h +269 -0
  147. include/eigen3/Eigen/src/Core/arch/Default/BFloat16.h +688 -0
  148. include/eigen3/Eigen/src/Core/arch/Default/ConjHelper.h +117 -0
  149. include/eigen3/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h +1662 -0
  150. include/eigen3/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h +116 -0
  151. include/eigen3/Eigen/src/Core/arch/Default/Half.h +950 -0
  152. include/eigen3/Eigen/src/Core/arch/Default/Settings.h +49 -0
  153. include/eigen3/Eigen/src/Core/arch/Default/TypeCasting.h +120 -0
  154. include/eigen3/Eigen/src/Core/arch/GPU/MathFunctions.h +103 -0
  155. include/eigen3/Eigen/src/Core/arch/GPU/PacketMath.h +1646 -0
  156. include/eigen3/Eigen/src/Core/arch/GPU/TypeCasting.h +79 -0
  157. include/eigen3/Eigen/src/Core/arch/HIP/hcc/math_constants.h +23 -0
  158. include/eigen3/Eigen/src/Core/arch/MSA/Complex.h +645 -0
  159. include/eigen3/Eigen/src/Core/arch/MSA/MathFunctions.h +387 -0
  160. include/eigen3/Eigen/src/Core/arch/MSA/PacketMath.h +1233 -0
  161. include/eigen3/Eigen/src/Core/arch/NEON/Complex.h +560 -0
  162. include/eigen3/Eigen/src/Core/arch/NEON/GeneralBlockPanelKernel.h +183 -0
  163. include/eigen3/Eigen/src/Core/arch/NEON/MathFunctions.h +75 -0
  164. include/eigen3/Eigen/src/Core/arch/NEON/PacketMath.h +4653 -0
  165. include/eigen3/Eigen/src/Core/arch/NEON/TypeCasting.h +1424 -0
  166. include/eigen3/Eigen/src/Core/arch/SSE/Complex.h +338 -0
  167. include/eigen3/Eigen/src/Core/arch/SSE/MathFunctions.h +199 -0
  168. include/eigen3/Eigen/src/Core/arch/SSE/PacketMath.h +1505 -0
  169. include/eigen3/Eigen/src/Core/arch/SSE/TypeCasting.h +142 -0
  170. include/eigen3/Eigen/src/Core/arch/SVE/MathFunctions.h +44 -0
  171. include/eigen3/Eigen/src/Core/arch/SVE/PacketMath.h +752 -0
  172. include/eigen3/Eigen/src/Core/arch/SVE/TypeCasting.h +49 -0
  173. include/eigen3/Eigen/src/Core/arch/SYCL/InteropHeaders.h +232 -0
  174. include/eigen3/Eigen/src/Core/arch/SYCL/MathFunctions.h +301 -0
  175. include/eigen3/Eigen/src/Core/arch/SYCL/PacketMath.h +670 -0
  176. include/eigen3/Eigen/src/Core/arch/SYCL/SyclMemoryModel.h +694 -0
  177. include/eigen3/Eigen/src/Core/arch/SYCL/TypeCasting.h +85 -0
  178. include/eigen3/Eigen/src/Core/arch/ZVector/Complex.h +428 -0
  179. include/eigen3/Eigen/src/Core/arch/ZVector/MathFunctions.h +233 -0
  180. include/eigen3/Eigen/src/Core/arch/ZVector/PacketMath.h +1060 -0
  181. include/eigen3/Eigen/src/Core/functors/AssignmentFunctors.h +177 -0
  182. include/eigen3/Eigen/src/Core/functors/BinaryFunctors.h +541 -0
  183. include/eigen3/Eigen/src/Core/functors/NullaryFunctors.h +189 -0
  184. include/eigen3/Eigen/src/Core/functors/StlFunctors.h +166 -0
  185. include/eigen3/Eigen/src/Core/functors/TernaryFunctors.h +25 -0
  186. include/eigen3/Eigen/src/Core/functors/UnaryFunctors.h +1131 -0
  187. include/eigen3/Eigen/src/Core/products/GeneralBlockPanelKernel.h +2645 -0
  188. include/eigen3/Eigen/src/Core/products/GeneralMatrixMatrix.h +517 -0
  189. include/eigen3/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h +322 -0
  190. include/eigen3/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h +145 -0
  191. include/eigen3/Eigen/src/Core/products/GeneralMatrixMatrix_BLAS.h +124 -0
  192. include/eigen3/Eigen/src/Core/products/GeneralMatrixVector.h +523 -0
  193. include/eigen3/Eigen/src/Core/products/GeneralMatrixVector_BLAS.h +136 -0
  194. include/eigen3/Eigen/src/Core/products/Parallelizer.h +180 -0
  195. include/eigen3/Eigen/src/Core/products/SelfadjointMatrixMatrix.h +544 -0
  196. include/eigen3/Eigen/src/Core/products/SelfadjointMatrixMatrix_BLAS.h +295 -0
  197. include/eigen3/Eigen/src/Core/products/SelfadjointMatrixVector.h +262 -0
  198. include/eigen3/Eigen/src/Core/products/SelfadjointMatrixVector_BLAS.h +118 -0
  199. include/eigen3/Eigen/src/Core/products/SelfadjointProduct.h +133 -0
  200. include/eigen3/Eigen/src/Core/products/SelfadjointRank2Update.h +94 -0
  201. include/eigen3/Eigen/src/Core/products/TriangularMatrixMatrix.h +472 -0
  202. include/eigen3/Eigen/src/Core/products/TriangularMatrixMatrix_BLAS.h +317 -0
  203. include/eigen3/Eigen/src/Core/products/TriangularMatrixVector.h +350 -0
  204. include/eigen3/Eigen/src/Core/products/TriangularMatrixVector_BLAS.h +255 -0
  205. include/eigen3/Eigen/src/Core/products/TriangularSolverMatrix.h +337 -0
  206. include/eigen3/Eigen/src/Core/products/TriangularSolverMatrix_BLAS.h +167 -0
  207. include/eigen3/Eigen/src/Core/products/TriangularSolverVector.h +148 -0
  208. include/eigen3/Eigen/src/Core/util/BlasUtil.h +583 -0
  209. include/eigen3/Eigen/src/Core/util/ConfigureVectorization.h +521 -0
  210. include/eigen3/Eigen/src/Core/util/Constants.h +563 -0
  211. include/eigen3/Eigen/src/Core/util/DisableStupidWarnings.h +138 -0
  212. include/eigen3/Eigen/src/Core/util/ForwardDeclarations.h +322 -0
  213. include/eigen3/Eigen/src/Core/util/IndexedViewHelper.h +186 -0
  214. include/eigen3/Eigen/src/Core/util/IntegralConstant.h +272 -0
  215. include/eigen3/Eigen/src/Core/util/MKL_support.h +137 -0
  216. include/eigen3/Eigen/src/Core/util/Macros.h +1511 -0
  217. include/eigen3/Eigen/src/Core/util/Memory.h +1202 -0
  218. include/eigen3/Eigen/src/Core/util/Meta.h +812 -0
  219. include/eigen3/Eigen/src/Core/util/NonMPL2.h +3 -0
  220. include/eigen3/Eigen/src/Core/util/ReenableStupidWarnings.h +31 -0
  221. include/eigen3/Eigen/src/Core/util/ReshapedHelper.h +51 -0
  222. include/eigen3/Eigen/src/Core/util/StaticAssert.h +221 -0
  223. include/eigen3/Eigen/src/Core/util/SymbolicIndex.h +293 -0
  224. include/eigen3/Eigen/src/Core/util/XprHelper.h +856 -0
  225. include/eigen3/Eigen/src/Eigenvalues/ComplexEigenSolver.h +345 -0
  226. include/eigen3/Eigen/src/Eigenvalues/ComplexSchur.h +462 -0
  227. include/eigen3/Eigen/src/Eigenvalues/ComplexSchur_LAPACKE.h +91 -0
  228. include/eigen3/Eigen/src/Eigenvalues/EigenSolver.h +622 -0
  229. include/eigen3/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h +417 -0
  230. include/eigen3/Eigen/src/Eigenvalues/GeneralizedSelfAdjointEigenSolver.h +226 -0
  231. include/eigen3/Eigen/src/Eigenvalues/HessenbergDecomposition.h +374 -0
  232. include/eigen3/Eigen/src/Eigenvalues/MatrixBaseEigenvalues.h +158 -0
  233. include/eigen3/Eigen/src/Eigenvalues/RealQZ.h +657 -0
  234. include/eigen3/Eigen/src/Eigenvalues/RealSchur.h +557 -0
  235. include/eigen3/Eigen/src/Eigenvalues/RealSchur_LAPACKE.h +77 -0
  236. include/eigen3/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h +904 -0
  237. include/eigen3/Eigen/src/Eigenvalues/SelfAdjointEigenSolver_LAPACKE.h +87 -0
  238. include/eigen3/Eigen/src/Eigenvalues/Tridiagonalization.h +560 -0
  239. include/eigen3/Eigen/src/Geometry/AlignedBox.h +486 -0
  240. include/eigen3/Eigen/src/Geometry/AngleAxis.h +247 -0
  241. include/eigen3/Eigen/src/Geometry/EulerAngles.h +114 -0
  242. include/eigen3/Eigen/src/Geometry/Homogeneous.h +501 -0
  243. include/eigen3/Eigen/src/Geometry/Hyperplane.h +282 -0
  244. include/eigen3/Eigen/src/Geometry/OrthoMethods.h +235 -0
  245. include/eigen3/Eigen/src/Geometry/ParametrizedLine.h +232 -0
  246. include/eigen3/Eigen/src/Geometry/Quaternion.h +870 -0
  247. include/eigen3/Eigen/src/Geometry/Rotation2D.h +199 -0
  248. include/eigen3/Eigen/src/Geometry/RotationBase.h +206 -0
  249. include/eigen3/Eigen/src/Geometry/Scaling.h +188 -0
  250. include/eigen3/Eigen/src/Geometry/Transform.h +1566 -0
  251. include/eigen3/Eigen/src/Geometry/Translation.h +202 -0
  252. include/eigen3/Eigen/src/Geometry/Umeyama.h +168 -0
  253. include/eigen3/Eigen/src/Geometry/arch/Geometry_SIMD.h +168 -0
  254. include/eigen3/Eigen/src/Householder/BlockHouseholder.h +110 -0
  255. include/eigen3/Eigen/src/Householder/Householder.h +176 -0
  256. include/eigen3/Eigen/src/Householder/HouseholderSequence.h +553 -0
  257. include/eigen3/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h +226 -0
  258. include/eigen3/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h +212 -0
  259. include/eigen3/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h +227 -0
  260. include/eigen3/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h +394 -0
  261. include/eigen3/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h +453 -0
  262. include/eigen3/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h +444 -0
  263. include/eigen3/Eigen/src/IterativeLinearSolvers/LeastSquareConjugateGradient.h +198 -0
  264. include/eigen3/Eigen/src/IterativeLinearSolvers/SolveWithGuess.h +117 -0
  265. include/eigen3/Eigen/src/Jacobi/Jacobi.h +483 -0
  266. include/eigen3/Eigen/src/KLUSupport/KLUSupport.h +358 -0
  267. include/eigen3/Eigen/src/LU/Determinant.h +117 -0
  268. include/eigen3/Eigen/src/LU/FullPivLU.h +877 -0
  269. include/eigen3/Eigen/src/LU/InverseImpl.h +432 -0
  270. include/eigen3/Eigen/src/LU/PartialPivLU.h +624 -0
  271. include/eigen3/Eigen/src/LU/PartialPivLU_LAPACKE.h +83 -0
  272. include/eigen3/Eigen/src/LU/arch/InverseSize4.h +363 -0
  273. include/eigen3/Eigen/src/MetisSupport/MetisSupport.h +137 -0
  274. include/eigen3/Eigen/src/OrderingMethods/Amd.h +435 -0
  275. include/eigen3/Eigen/src/OrderingMethods/Eigen_Colamd.h +1863 -0
  276. include/eigen3/Eigen/src/OrderingMethods/Ordering.h +153 -0
  277. include/eigen3/Eigen/src/PaStiXSupport/PaStiXSupport.h +678 -0
  278. include/eigen3/Eigen/src/PardisoSupport/PardisoSupport.h +545 -0
  279. include/eigen3/Eigen/src/QR/ColPivHouseholderQR.h +674 -0
  280. include/eigen3/Eigen/src/QR/ColPivHouseholderQR_LAPACKE.h +97 -0
  281. include/eigen3/Eigen/src/QR/CompleteOrthogonalDecomposition.h +635 -0
  282. include/eigen3/Eigen/src/QR/FullPivHouseholderQR.h +713 -0
  283. include/eigen3/Eigen/src/QR/HouseholderQR.h +434 -0
  284. include/eigen3/Eigen/src/QR/HouseholderQR_LAPACKE.h +68 -0
  285. include/eigen3/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h +335 -0
  286. include/eigen3/Eigen/src/SVD/BDCSVD.h +1377 -0
  287. include/eigen3/Eigen/src/SVD/JacobiSVD.h +813 -0
  288. include/eigen3/Eigen/src/SVD/JacobiSVD_LAPACKE.h +91 -0
  289. include/eigen3/Eigen/src/SVD/SVDBase.h +376 -0
  290. include/eigen3/Eigen/src/SVD/UpperBidiagonalization.h +415 -0
  291. include/eigen3/Eigen/src/SparseCholesky/SimplicialCholesky.h +697 -0
  292. include/eigen3/Eigen/src/SparseCholesky/SimplicialCholesky_impl.h +174 -0
  293. include/eigen3/Eigen/src/SparseCore/AmbiVector.h +378 -0
  294. include/eigen3/Eigen/src/SparseCore/CompressedStorage.h +274 -0
  295. include/eigen3/Eigen/src/SparseCore/ConservativeSparseSparseProduct.h +352 -0
  296. include/eigen3/Eigen/src/SparseCore/MappedSparseMatrix.h +67 -0
  297. include/eigen3/Eigen/src/SparseCore/SparseAssign.h +270 -0
  298. include/eigen3/Eigen/src/SparseCore/SparseBlock.h +566 -0
  299. include/eigen3/Eigen/src/SparseCore/SparseColEtree.h +206 -0
  300. include/eigen3/Eigen/src/SparseCore/SparseCompressedBase.h +370 -0
  301. include/eigen3/Eigen/src/SparseCore/SparseCwiseBinaryOp.h +722 -0
  302. include/eigen3/Eigen/src/SparseCore/SparseCwiseUnaryOp.h +150 -0
  303. include/eigen3/Eigen/src/SparseCore/SparseDenseProduct.h +342 -0
  304. include/eigen3/Eigen/src/SparseCore/SparseDiagonalProduct.h +138 -0
  305. include/eigen3/Eigen/src/SparseCore/SparseDot.h +98 -0
  306. include/eigen3/Eigen/src/SparseCore/SparseFuzzy.h +29 -0
  307. include/eigen3/Eigen/src/SparseCore/SparseMap.h +306 -0
  308. include/eigen3/Eigen/src/SparseCore/SparseMatrix.h +1518 -0
  309. include/eigen3/Eigen/src/SparseCore/SparseMatrixBase.h +399 -0
  310. include/eigen3/Eigen/src/SparseCore/SparsePermutation.h +178 -0
  311. include/eigen3/Eigen/src/SparseCore/SparseProduct.h +182 -0
  312. include/eigen3/Eigen/src/SparseCore/SparseRedux.h +49 -0
  313. include/eigen3/Eigen/src/SparseCore/SparseRef.h +397 -0
  314. include/eigen3/Eigen/src/SparseCore/SparseSelfAdjointView.h +659 -0
  315. include/eigen3/Eigen/src/SparseCore/SparseSolverBase.h +124 -0
  316. include/eigen3/Eigen/src/SparseCore/SparseSparseProductWithPruning.h +198 -0
  317. include/eigen3/Eigen/src/SparseCore/SparseTranspose.h +92 -0
  318. include/eigen3/Eigen/src/SparseCore/SparseTriangularView.h +189 -0
  319. include/eigen3/Eigen/src/SparseCore/SparseUtil.h +186 -0
  320. include/eigen3/Eigen/src/SparseCore/SparseVector.h +480 -0
  321. include/eigen3/Eigen/src/SparseCore/SparseView.h +254 -0
  322. include/eigen3/Eigen/src/SparseCore/TriangularSolver.h +315 -0
  323. include/eigen3/Eigen/src/SparseLU/SparseLU.h +925 -0
  324. include/eigen3/Eigen/src/SparseLU/SparseLUImpl.h +66 -0
  325. include/eigen3/Eigen/src/SparseLU/SparseLU_Memory.h +226 -0
  326. include/eigen3/Eigen/src/SparseLU/SparseLU_Structs.h +110 -0
  327. include/eigen3/Eigen/src/SparseLU/SparseLU_SupernodalMatrix.h +374 -0
  328. include/eigen3/Eigen/src/SparseLU/SparseLU_Utils.h +80 -0
  329. include/eigen3/Eigen/src/SparseLU/SparseLU_column_bmod.h +181 -0
  330. include/eigen3/Eigen/src/SparseLU/SparseLU_column_dfs.h +179 -0
  331. include/eigen3/Eigen/src/SparseLU/SparseLU_copy_to_ucol.h +107 -0
  332. include/eigen3/Eigen/src/SparseLU/SparseLU_heap_relax_snode.h +121 -0
  333. include/eigen3/Eigen/src/SparseLU/SparseLU_kernel_bmod.h +129 -0
  334. include/eigen3/Eigen/src/SparseLU/SparseLU_panel_bmod.h +222 -0
  335. include/eigen3/Eigen/src/SparseLU/SparseLU_panel_dfs.h +258 -0
  336. include/eigen3/Eigen/src/SparseLU/SparseLU_pivotL.h +137 -0
  337. include/eigen3/Eigen/src/SparseLU/SparseLU_pruneL.h +136 -0
  338. include/eigen3/Eigen/src/SparseLU/SparseLU_relax_snode.h +83 -0
  339. include/eigen3/Eigen/src/SparseQR/SparseQR.h +758 -0
  340. include/eigen3/Eigen/src/StlSupport/StdDeque.h +116 -0
  341. include/eigen3/Eigen/src/StlSupport/StdList.h +106 -0
  342. include/eigen3/Eigen/src/StlSupport/StdVector.h +131 -0
  343. include/eigen3/Eigen/src/StlSupport/details.h +84 -0
  344. include/eigen3/Eigen/src/SuperLUSupport/SuperLUSupport.h +1025 -0
  345. include/eigen3/Eigen/src/UmfPackSupport/UmfPackSupport.h +642 -0
  346. include/eigen3/Eigen/src/misc/Image.h +82 -0
  347. include/eigen3/Eigen/src/misc/Kernel.h +79 -0
  348. include/eigen3/Eigen/src/misc/RealSvd2x2.h +55 -0
  349. include/eigen3/Eigen/src/misc/blas.h +440 -0
  350. include/eigen3/Eigen/src/misc/lapack.h +152 -0
  351. include/eigen3/Eigen/src/misc/lapacke.h +16292 -0
  352. include/eigen3/Eigen/src/misc/lapacke_mangling.h +17 -0
  353. include/eigen3/Eigen/src/plugins/ArrayCwiseBinaryOps.h +431 -0
  354. include/eigen3/Eigen/src/plugins/ArrayCwiseUnaryOps.h +696 -0
  355. include/eigen3/Eigen/src/plugins/BlockMethods.h +1442 -0
  356. include/eigen3/Eigen/src/plugins/CommonCwiseBinaryOps.h +115 -0
  357. include/eigen3/Eigen/src/plugins/CommonCwiseUnaryOps.h +177 -0
  358. include/eigen3/Eigen/src/plugins/IndexedViewMethods.h +262 -0
  359. include/eigen3/Eigen/src/plugins/MatrixCwiseBinaryOps.h +184 -0
  360. include/eigen3/Eigen/src/plugins/MatrixCwiseUnaryOps.h +95 -0
  361. include/eigen3/Eigen/src/plugins/ReshapedMethods.h +149 -0
  362. include/eigen3/signature_of_eigen3_matrix_library +1 -0
  363. include/eigen3/unsupported/Eigen/AdolcForward +159 -0
  364. include/eigen3/unsupported/Eigen/AlignedVector3 +234 -0
  365. include/eigen3/unsupported/Eigen/ArpackSupport +30 -0
  366. include/eigen3/unsupported/Eigen/AutoDiff +48 -0
  367. include/eigen3/unsupported/Eigen/BVH +95 -0
  368. include/eigen3/unsupported/Eigen/CXX11/Tensor +137 -0
  369. include/eigen3/unsupported/Eigen/CXX11/TensorSymmetry +42 -0
  370. include/eigen3/unsupported/Eigen/CXX11/ThreadPool +74 -0
  371. include/eigen3/unsupported/Eigen/CXX11/src/Tensor/Tensor.h +554 -0
  372. include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h +327 -0
  373. include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h +242 -0
  374. include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h +1176 -0
  375. include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h +1559 -0
  376. include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h +1083 -0
  377. include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h +510 -0
  378. include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h +373 -0
  379. include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h +1019 -0
  380. include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h +73 -0
  381. include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h +6 -0
  382. include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorContractionGpu.h +1413 -0
  383. include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h +575 -0
  384. include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h +1650 -0
  385. include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h +1679 -0
  386. include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h +455 -0
  387. include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h +1126 -0
  388. include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h +536 -0
  389. include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h +213 -0
  390. include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h +342 -0
  391. include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h +137 -0
  392. include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h +6 -0
  393. include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h +104 -0
  394. include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceGpu.h +389 -0
  395. include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h +1048 -0
  396. include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h +409 -0
  397. include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorDimensionList.h +234 -0
  398. include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h +493 -0
  399. include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h +229 -0
  400. include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h +980 -0
  401. include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h +701 -0
  402. include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h +389 -0
  403. include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h +669 -0
  404. include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h +377 -0
  405. include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h +232 -0
  406. include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h +191 -0
  407. include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h +488 -0
  408. include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h +297 -0
  409. include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorGlobalFunctions.h +33 -0
  410. include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h +99 -0
  411. include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaUndefines.h +44 -0
  412. include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorIO.h +79 -0
  413. include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h +602 -0
  414. include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h +735 -0
  415. include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h +244 -0
  416. include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorInitializer.h +82 -0
  417. include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h +257 -0
  418. include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h +213 -0
  419. include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h +98 -0
  420. include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h +327 -0
  421. include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h +311 -0
  422. include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h +1098 -0
  423. include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h +705 -0
  424. include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h +286 -0
  425. include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h +317 -0
  426. include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h +1000 -0
  427. include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h +6 -0
  428. include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h +973 -0
  429. include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h +582 -0
  430. include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h +454 -0
  431. include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h +462 -0
  432. include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h +528 -0
  433. include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorScanSycl.h +513 -0
  434. include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h +466 -0
  435. include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h +157 -0
  436. include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h +341 -0
  437. include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h +299 -0
  438. include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h +264 -0
  439. include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h +249 -0
  440. include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h +628 -0
  441. include/eigen3/unsupported/Eigen/CXX11/src/TensorSymmetry/DynamicSymmetry.h +293 -0
  442. include/eigen3/unsupported/Eigen/CXX11/src/TensorSymmetry/StaticSymmetry.h +236 -0
  443. include/eigen3/unsupported/Eigen/CXX11/src/TensorSymmetry/Symmetry.h +338 -0
  444. include/eigen3/unsupported/Eigen/CXX11/src/TensorSymmetry/util/TemplateGroupTheory.h +669 -0
  445. include/eigen3/unsupported/Eigen/CXX11/src/ThreadPool/Barrier.h +67 -0
  446. include/eigen3/unsupported/Eigen/CXX11/src/ThreadPool/EventCount.h +249 -0
  447. include/eigen3/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h +486 -0
  448. include/eigen3/unsupported/Eigen/CXX11/src/ThreadPool/RunQueue.h +236 -0
  449. include/eigen3/unsupported/Eigen/CXX11/src/ThreadPool/ThreadCancel.h +23 -0
  450. include/eigen3/unsupported/Eigen/CXX11/src/ThreadPool/ThreadEnvironment.h +40 -0
  451. include/eigen3/unsupported/Eigen/CXX11/src/ThreadPool/ThreadLocal.h +301 -0
  452. include/eigen3/unsupported/Eigen/CXX11/src/ThreadPool/ThreadPoolInterface.h +48 -0
  453. include/eigen3/unsupported/Eigen/CXX11/src/ThreadPool/ThreadYield.h +20 -0
  454. include/eigen3/unsupported/Eigen/CXX11/src/util/CXX11Meta.h +538 -0
  455. include/eigen3/unsupported/Eigen/CXX11/src/util/CXX11Workarounds.h +88 -0
  456. include/eigen3/unsupported/Eigen/CXX11/src/util/EmulateArray.h +261 -0
  457. include/eigen3/unsupported/Eigen/CXX11/src/util/MaxSizeVector.h +158 -0
  458. include/eigen3/unsupported/Eigen/EulerAngles +43 -0
  459. include/eigen3/unsupported/Eigen/FFT +420 -0
  460. include/eigen3/unsupported/Eigen/IterativeSolvers +51 -0
  461. include/eigen3/unsupported/Eigen/KroneckerProduct +36 -0
  462. include/eigen3/unsupported/Eigen/LevenbergMarquardt +49 -0
  463. include/eigen3/unsupported/Eigen/MPRealSupport +213 -0
  464. include/eigen3/unsupported/Eigen/MatrixFunctions +504 -0
  465. include/eigen3/unsupported/Eigen/MoreVectorization +24 -0
  466. include/eigen3/unsupported/Eigen/NonLinearOptimization +140 -0
  467. include/eigen3/unsupported/Eigen/NumericalDiff +56 -0
  468. include/eigen3/unsupported/Eigen/OpenGLSupport +322 -0
  469. include/eigen3/unsupported/Eigen/Polynomials +137 -0
  470. include/eigen3/unsupported/Eigen/Skyline +39 -0
  471. include/eigen3/unsupported/Eigen/SparseExtra +54 -0
  472. include/eigen3/unsupported/Eigen/SpecialFunctions +103 -0
  473. include/eigen3/unsupported/Eigen/Splines +35 -0
  474. include/eigen3/unsupported/Eigen/src/AutoDiff/AutoDiffJacobian.h +108 -0
  475. include/eigen3/unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h +730 -0
  476. include/eigen3/unsupported/Eigen/src/AutoDiff/AutoDiffVector.h +220 -0
  477. include/eigen3/unsupported/Eigen/src/BVH/BVAlgorithms.h +293 -0
  478. include/eigen3/unsupported/Eigen/src/BVH/KdBVH.h +223 -0
  479. include/eigen3/unsupported/Eigen/src/Eigenvalues/ArpackSelfAdjointEigenSolver.h +790 -0
  480. include/eigen3/unsupported/Eigen/src/EulerAngles/EulerAngles.h +356 -0
  481. include/eigen3/unsupported/Eigen/src/EulerAngles/EulerSystem.h +306 -0
  482. include/eigen3/unsupported/Eigen/src/FFT/ei_fftw_impl.h +261 -0
  483. include/eigen3/unsupported/Eigen/src/FFT/ei_kissfft_impl.h +449 -0
  484. include/eigen3/unsupported/Eigen/src/IterativeSolvers/ConstrainedConjGrad.h +187 -0
  485. include/eigen3/unsupported/Eigen/src/IterativeSolvers/DGMRES.h +511 -0
  486. include/eigen3/unsupported/Eigen/src/IterativeSolvers/GMRES.h +335 -0
  487. include/eigen3/unsupported/Eigen/src/IterativeSolvers/IDRS.h +436 -0
  488. include/eigen3/unsupported/Eigen/src/IterativeSolvers/IncompleteLU.h +90 -0
  489. include/eigen3/unsupported/Eigen/src/IterativeSolvers/IterationController.h +154 -0
  490. include/eigen3/unsupported/Eigen/src/IterativeSolvers/MINRES.h +267 -0
  491. include/eigen3/unsupported/Eigen/src/IterativeSolvers/Scaling.h +193 -0
  492. include/eigen3/unsupported/Eigen/src/KroneckerProduct/KroneckerTensorProduct.h +305 -0
  493. include/eigen3/unsupported/Eigen/src/LevenbergMarquardt/LMcovar.h +84 -0
  494. include/eigen3/unsupported/Eigen/src/LevenbergMarquardt/LMonestep.h +202 -0
  495. include/eigen3/unsupported/Eigen/src/LevenbergMarquardt/LMpar.h +160 -0
  496. include/eigen3/unsupported/Eigen/src/LevenbergMarquardt/LMqrsolv.h +188 -0
  497. include/eigen3/unsupported/Eigen/src/LevenbergMarquardt/LevenbergMarquardt.h +396 -0
  498. include/eigen3/unsupported/Eigen/src/MatrixFunctions/MatrixExponential.h +441 -0
  499. include/eigen3/unsupported/Eigen/src/MatrixFunctions/MatrixFunction.h +569 -0
  500. include/eigen3/unsupported/Eigen/src/MatrixFunctions/MatrixLogarithm.h +373 -0
  501. include/eigen3/unsupported/Eigen/src/MatrixFunctions/MatrixPower.h +705 -0
  502. include/eigen3/unsupported/Eigen/src/MatrixFunctions/MatrixSquareRoot.h +368 -0
  503. include/eigen3/unsupported/Eigen/src/MatrixFunctions/StemFunction.h +117 -0
  504. include/eigen3/unsupported/Eigen/src/MoreVectorization/MathFunctions.h +95 -0
  505. include/eigen3/unsupported/Eigen/src/NonLinearOptimization/HybridNonLinearSolver.h +601 -0
  506. include/eigen3/unsupported/Eigen/src/NonLinearOptimization/LevenbergMarquardt.h +657 -0
  507. include/eigen3/unsupported/Eigen/src/NonLinearOptimization/chkder.h +66 -0
  508. include/eigen3/unsupported/Eigen/src/NonLinearOptimization/covar.h +70 -0
  509. include/eigen3/unsupported/Eigen/src/NonLinearOptimization/dogleg.h +107 -0
  510. include/eigen3/unsupported/Eigen/src/NonLinearOptimization/fdjac1.h +79 -0
  511. include/eigen3/unsupported/Eigen/src/NonLinearOptimization/lmpar.h +298 -0
  512. include/eigen3/unsupported/Eigen/src/NonLinearOptimization/qrsolv.h +91 -0
  513. include/eigen3/unsupported/Eigen/src/NonLinearOptimization/r1mpyq.h +30 -0
  514. include/eigen3/unsupported/Eigen/src/NonLinearOptimization/r1updt.h +99 -0
  515. include/eigen3/unsupported/Eigen/src/NonLinearOptimization/rwupdt.h +49 -0
  516. include/eigen3/unsupported/Eigen/src/NumericalDiff/NumericalDiff.h +130 -0
  517. include/eigen3/unsupported/Eigen/src/Polynomials/Companion.h +280 -0
  518. include/eigen3/unsupported/Eigen/src/Polynomials/PolynomialSolver.h +429 -0
  519. include/eigen3/unsupported/Eigen/src/Polynomials/PolynomialUtils.h +143 -0
  520. include/eigen3/unsupported/Eigen/src/Skyline/SkylineInplaceLU.h +352 -0
  521. include/eigen3/unsupported/Eigen/src/Skyline/SkylineMatrix.h +862 -0
  522. include/eigen3/unsupported/Eigen/src/Skyline/SkylineMatrixBase.h +212 -0
  523. include/eigen3/unsupported/Eigen/src/Skyline/SkylineProduct.h +295 -0
  524. include/eigen3/unsupported/Eigen/src/Skyline/SkylineStorage.h +259 -0
  525. include/eigen3/unsupported/Eigen/src/Skyline/SkylineUtil.h +89 -0
  526. include/eigen3/unsupported/Eigen/src/SparseExtra/BlockOfDynamicSparseMatrix.h +122 -0
  527. include/eigen3/unsupported/Eigen/src/SparseExtra/BlockSparseMatrix.h +1079 -0
  528. include/eigen3/unsupported/Eigen/src/SparseExtra/DynamicSparseMatrix.h +404 -0
  529. include/eigen3/unsupported/Eigen/src/SparseExtra/MarketIO.h +282 -0
  530. include/eigen3/unsupported/Eigen/src/SparseExtra/MatrixMarketIterator.h +247 -0
  531. include/eigen3/unsupported/Eigen/src/SparseExtra/RandomSetter.h +349 -0
  532. include/eigen3/unsupported/Eigen/src/SpecialFunctions/BesselFunctionsArrayAPI.h +286 -0
  533. include/eigen3/unsupported/Eigen/src/SpecialFunctions/BesselFunctionsBFloat16.h +68 -0
  534. include/eigen3/unsupported/Eigen/src/SpecialFunctions/BesselFunctionsFunctors.h +357 -0
  535. include/eigen3/unsupported/Eigen/src/SpecialFunctions/BesselFunctionsHalf.h +66 -0
  536. include/eigen3/unsupported/Eigen/src/SpecialFunctions/BesselFunctionsImpl.h +1959 -0
  537. include/eigen3/unsupported/Eigen/src/SpecialFunctions/BesselFunctionsPacketMath.h +118 -0
  538. include/eigen3/unsupported/Eigen/src/SpecialFunctions/HipVectorCompatibility.h +67 -0
  539. include/eigen3/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsArrayAPI.h +167 -0
  540. include/eigen3/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsBFloat16.h +58 -0
  541. include/eigen3/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsFunctors.h +330 -0
  542. include/eigen3/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsHalf.h +58 -0
  543. include/eigen3/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h +2051 -0
  544. include/eigen3/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsPacketMath.h +79 -0
  545. include/eigen3/unsupported/Eigen/src/SpecialFunctions/arch/AVX/BesselFunctions.h +46 -0
  546. include/eigen3/unsupported/Eigen/src/SpecialFunctions/arch/AVX/SpecialFunctions.h +16 -0
  547. include/eigen3/unsupported/Eigen/src/SpecialFunctions/arch/AVX512/BesselFunctions.h +51 -0
  548. include/eigen3/unsupported/Eigen/src/SpecialFunctions/arch/AVX512/SpecialFunctions.h +16 -0
  549. include/eigen3/unsupported/Eigen/src/SpecialFunctions/arch/GPU/SpecialFunctions.h +369 -0
  550. include/eigen3/unsupported/Eigen/src/SpecialFunctions/arch/NEON/BesselFunctions.h +54 -0
  551. include/eigen3/unsupported/Eigen/src/SpecialFunctions/arch/NEON/SpecialFunctions.h +34 -0
  552. include/eigen3/unsupported/Eigen/src/Splines/Spline.h +507 -0
  553. include/eigen3/unsupported/Eigen/src/Splines/SplineFitting.h +431 -0
  554. include/eigen3/unsupported/Eigen/src/Splines/SplineFwd.h +93 -0
@@ -0,0 +1,973 @@
1
+ // This file is part of Eigen, a lightweight C++ template library
2
+ // for linear algebra.
3
+ //
4
+ // Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
5
+ //
6
+ // This Source Code Form is subject to the terms of the Mozilla
7
+ // Public License v. 2.0. If a copy of the MPL was not distributed
8
+ // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
9
+
10
+ #ifndef EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_GPU_H
11
+ #define EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_GPU_H
12
+
13
+ namespace Eigen {
14
+ namespace internal {
15
+
16
+
17
+ #if defined(EIGEN_USE_GPU) && defined(EIGEN_GPUCC)
18
+ // Full reducers for GPU, don't vectorize for now
19
+
20
+ // Reducer function that enables multiple gpu thread to safely accumulate at the same
21
+ // output address. It basically reads the current value of the output variable, and
22
+ // attempts to update it with the new value. If in the meantime another gpu thread
23
+ // updated the content of the output address it will try again.
24
+ template <typename T, typename R>
25
+ __device__ EIGEN_ALWAYS_INLINE void atomicReduce(T* output, T accum, R& reducer) {
26
+ #if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300)
27
+ if (sizeof(T) == 4)
28
+ {
29
+ unsigned int oldval = *reinterpret_cast<unsigned int*>(output);
30
+ unsigned int newval = oldval;
31
+ reducer.reduce(accum, reinterpret_cast<T*>(&newval));
32
+ if (newval == oldval) {
33
+ return;
34
+ }
35
+ unsigned int readback;
36
+ while ((readback = atomicCAS((unsigned int*)output, oldval, newval)) != oldval) {
37
+ oldval = readback;
38
+ newval = oldval;
39
+ reducer.reduce(accum, reinterpret_cast<T*>(&newval));
40
+ if (newval == oldval) {
41
+ return;
42
+ }
43
+ }
44
+ }
45
+ else if (sizeof(T) == 8) {
46
+ unsigned long long oldval = *reinterpret_cast<unsigned long long*>(output);
47
+ unsigned long long newval = oldval;
48
+ reducer.reduce(accum, reinterpret_cast<T*>(&newval));
49
+ if (newval == oldval) {
50
+ return;
51
+ }
52
+ unsigned long long readback;
53
+ while ((readback = atomicCAS((unsigned long long*)output, oldval, newval)) != oldval) {
54
+ oldval = readback;
55
+ newval = oldval;
56
+ reducer.reduce(accum, reinterpret_cast<T*>(&newval));
57
+ if (newval == oldval) {
58
+ return;
59
+ }
60
+ }
61
+ }
62
+ else {
63
+ gpu_assert(0 && "Wordsize not supported");
64
+ }
65
+ #else // EIGEN_CUDA_ARCH >= 300
66
+ gpu_assert(0 && "Shouldn't be called on unsupported device");
67
+ #endif // EIGEN_CUDA_ARCH >= 300
68
+ }
69
+
70
+ // We extend atomicExch to support extra data types
71
+ template <typename Type>
72
+ __device__ inline Type atomicExchCustom(Type* address, Type val) {
73
+ return atomicExch(address, val);
74
+ }
75
+
76
+ template <>
77
+ __device__ inline double atomicExchCustom(double* address, double val) {
78
+ unsigned long long int* address_as_ull = reinterpret_cast<unsigned long long int*>(address);
79
+ return __longlong_as_double(atomicExch(address_as_ull, __double_as_longlong(val)));
80
+ }
81
+
82
+ #ifdef EIGEN_HAS_GPU_FP16
83
+ template <typename R>
84
+ __device__ inline void atomicReduce(half2* output, half2 accum, R& reducer) {
85
+ unsigned int oldval = *reinterpret_cast<unsigned int*>(output);
86
+ unsigned int newval = oldval;
87
+ reducer.reducePacket(accum, reinterpret_cast<half2*>(&newval));
88
+ if (newval == oldval) {
89
+ return;
90
+ }
91
+ unsigned int readback;
92
+ while ((readback = atomicCAS((unsigned int*)output, oldval, newval)) != oldval) {
93
+ oldval = readback;
94
+ newval = oldval;
95
+ reducer.reducePacket(accum, reinterpret_cast<half2*>(&newval));
96
+ if (newval == oldval) {
97
+ return;
98
+ }
99
+ }
100
+ }
101
+ #ifdef EIGEN_GPU_COMPILE_PHASE
102
+ // reduction should be associative since reduction is not atomic in wide vector but atomic in half2 operations
103
+ template <typename R>
104
+ __device__ inline void atomicReduce(Packet4h2* output, Packet4h2 accum, R& reducer) {
105
+ half2* houtput=reinterpret_cast<half2*>(output);
106
+ half2* haccum=reinterpret_cast<half2*>(&accum);
107
+ for(int i=0;i<4;++i){
108
+ atomicReduce(houtput+i,*(haccum+i),reducer);
109
+ }
110
+ }
111
+ #endif // EIGEN_GPU_COMPILE_PHASE
112
+ #endif // EIGEN_HAS_GPU_FP16
113
+
114
+ template <>
115
+ __device__ inline void atomicReduce(float* output, float accum, SumReducer<float>&) {
116
+ #if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300)
117
+ atomicAdd(output, accum);
118
+ #else // EIGEN_CUDA_ARCH >= 300
119
+ gpu_assert(0 && "Shouldn't be called on unsupported device");
120
+ #endif // EIGEN_CUDA_ARCH >= 300
121
+ }
122
+
123
+
124
+ template <typename CoeffType, typename Index>
125
+ __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitKernel(const CoeffType val, Index num_preserved_coeffs, CoeffType* output) {
126
+ const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
127
+ const Index num_threads = blockDim.x * gridDim.x;
128
+ for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) {
129
+ output[i] = val;
130
+ }
131
+ }
132
+
133
+
134
+ template <int BlockSize, int NumPerThread, typename Self,
135
+ typename Reducer, typename Index>
136
+ __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernel(Reducer reducer, const Self input, Index num_coeffs,
137
+ typename Self::CoeffReturnType* output, unsigned int* semaphore) {
138
+ #if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300)
139
+ // Initialize the output value
140
+ const Index first_index = blockIdx.x * BlockSize * NumPerThread + threadIdx.x;
141
+ if (gridDim.x == 1) {
142
+ if (first_index == 0) {
143
+ *output = reducer.initialize();
144
+ }
145
+ }
146
+ else {
147
+ if (threadIdx.x == 0) {
148
+ unsigned int block = atomicCAS(semaphore, 0u, 1u);
149
+ if (block == 0) {
150
+ // We're the first block to run, initialize the output value
151
+ atomicExchCustom(output, reducer.initialize());
152
+ __threadfence();
153
+ atomicExch(semaphore, 2u);
154
+ }
155
+ else {
156
+ // Wait for the first block to initialize the output value.
157
+ // Use atomicCAS here to ensure that the reads aren't cached
158
+ unsigned int val;
159
+ do {
160
+ val = atomicCAS(semaphore, 2u, 2u);
161
+ }
162
+ while (val < 2u);
163
+ }
164
+ }
165
+ }
166
+
167
+ __syncthreads();
168
+
169
+ eigen_assert(gridDim.x == 1 || *semaphore >= 2u);
170
+
171
+ typename Self::CoeffReturnType accum = reducer.initialize();
172
+ Index max_iter = numext::mini<Index>(num_coeffs - first_index, NumPerThread*BlockSize);
173
+ for (Index i = 0; i < max_iter; i+=BlockSize) {
174
+ const Index index = first_index + i;
175
+ eigen_assert(index < num_coeffs);
176
+ typename Self::CoeffReturnType val = input.m_impl.coeff(index);
177
+ reducer.reduce(val, &accum);
178
+ }
179
+
180
+ #pragma unroll
181
+ for (int offset = warpSize/2; offset > 0; offset /= 2) {
182
+ #if defined(EIGEN_HIPCC)
183
+ // use std::is_floating_point to determine the type of reduced_val
184
+ // This is needed because when Type == double, hipcc will give a "call to __shfl_down is ambguous" error
185
+ // and list the float and int versions of __shfl_down as the candidate functions.
186
+ if (std::is_floating_point<typename Self::CoeffReturnType>::value) {
187
+ reducer.reduce(__shfl_down(static_cast<float>(accum), offset, warpSize), &accum);
188
+ } else {
189
+ reducer.reduce(__shfl_down(static_cast<int>(accum), offset, warpSize), &accum);
190
+ }
191
+ #elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
192
+ reducer.reduce(__shfl_down(accum, offset, warpSize), &accum);
193
+ #else
194
+ reducer.reduce(__shfl_down_sync(0xFFFFFFFF, accum, offset, warpSize), &accum);
195
+ #endif
196
+ }
197
+
198
+ if ((threadIdx.x & (warpSize - 1)) == 0) {
199
+ atomicReduce(output, accum, reducer);
200
+ }
201
+
202
+ if (gridDim.x > 1 && threadIdx.x == 0) {
203
+ // Let the last block reset the semaphore
204
+ atomicInc(semaphore, gridDim.x + 1);
205
+ #if defined(EIGEN_HIPCC)
206
+ __threadfence_system();
207
+ #endif
208
+ }
209
+ #else // EIGEN_CUDA_ARCH >= 300
210
+ gpu_assert(0 && "Shouldn't be called on unsupported device");
211
+ #endif // EIGEN_CUDA_ARCH >= 300
212
+ }
213
+
214
+
215
+ #ifdef EIGEN_HAS_GPU_FP16
216
+ template <typename Self,
217
+ typename Reducer, typename Index>
218
+ __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitFullReduxKernelHalfFloat(
219
+ Reducer reducer, const Self input, Index num_coeffs, half* scratch) {
220
+ eigen_assert(blockDim.x == 1);
221
+ eigen_assert(gridDim.x == 1);
222
+ typedef packet_traits<Eigen::half>::type packet_type;
223
+ Index packet_remainder =
224
+ num_coeffs % Index(unpacket_traits<packet_type>::size);
225
+ if (packet_remainder != 0) {
226
+ half2* h2scratch = reinterpret_cast<half2*>(scratch);
227
+ for (Index i = num_coeffs - packet_remainder; i + 2 <= num_coeffs; i += 2) {
228
+ *h2scratch =
229
+ __halves2half2(input.coeff(i), input.coeff(i + 1));
230
+ h2scratch++;
231
+ }
232
+ if ((num_coeffs & 1) != 0) {
233
+ half lastCoeff = input.coeff(num_coeffs - 1);
234
+ *h2scratch = __halves2half2(lastCoeff, reducer.initialize());
235
+ }
236
+ } else {
237
+ packet_type reduce = reducer.template initializePacket<packet_type>();
238
+ internal::pstoreu(scratch, reduce);
239
+ }
240
+ }
241
+
242
+ template <typename Self,
243
+ typename Reducer, typename Index>
244
+ __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs, half* output) {
245
+ const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
246
+ const Index num_threads = blockDim.x * gridDim.x;
247
+ typedef typename packet_traits<Eigen::half>::type PacketType;
248
+
249
+ const Index num_packets =
250
+ num_coeffs / Index(unpacket_traits<PacketType>::size);
251
+ PacketType* p_output = reinterpret_cast<PacketType*>(output);
252
+ for (Index i = thread_id; i < num_packets; i += num_threads) {
253
+ p_output[i] = reducer.template initializePacket<PacketType>();
254
+ }
255
+ Index packet_remainder =
256
+ num_coeffs % Index(unpacket_traits<PacketType>::size);
257
+ if (thread_id < packet_remainder) {
258
+ output[num_coeffs - packet_remainder + thread_id] = reducer.initialize();
259
+ }
260
+ }
261
+
262
+ template <int BlockSize, int NumPerThread, typename Self,
263
+ typename Reducer, typename Index>
264
+ __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernelHalfFloat(
265
+ Reducer reducer, const Self input, Index num_coeffs,
266
+ half* output, half* scratch) {
267
+ typedef typename packet_traits<Eigen::half>::type PacketType;
268
+ const int packet_width = unpacket_traits<PacketType>::size;
269
+ eigen_assert(NumPerThread % packet_width == 0);
270
+ const Index first_index =
271
+ blockIdx.x * BlockSize * NumPerThread + packet_width * threadIdx.x;
272
+
273
+ // Initialize the output value if it wasn't initialized by the ReductionInitKernel
274
+
275
+ if (gridDim.x == 1) {
276
+ if (first_index == 0) {
277
+ int rem = num_coeffs % packet_width;
278
+ if (rem != 0) {
279
+ half2* p_scratch = reinterpret_cast<half2*>(scratch);
280
+ pstoreu(scratch, reducer.template initializePacket<PacketType>());
281
+ for (int i = 0; i < rem / 2; i++) {
282
+ *p_scratch = __halves2half2(
283
+ input.coeff(num_coeffs - packet_width + 2 * i),
284
+ input.coeff(num_coeffs - packet_width + 2 * i + 1));
285
+ p_scratch++;
286
+ }
287
+ if ((num_coeffs & 1) != 0) {
288
+ half last = input.coeff(num_coeffs - 1);
289
+ *p_scratch = __halves2half2(last, reducer.initialize());
290
+ }
291
+ } else {
292
+ PacketType reduce = reducer.template initializePacket<PacketType>();
293
+ pstoreu(scratch, reduce);
294
+ }
295
+ }
296
+ __syncthreads();
297
+ }
298
+
299
+ PacketType accum = reducer.template initializePacket<PacketType>();
300
+ const Index max_iter =
301
+ numext::mini<Index>((num_coeffs - first_index) / packet_width,
302
+ NumPerThread * BlockSize / packet_width);
303
+ for (Index i = 0; i < max_iter; i += BlockSize) {
304
+ const Index index = first_index + packet_width * i;
305
+ eigen_assert(index + packet_width < num_coeffs);
306
+ PacketType val = input.template packet<Unaligned>(index);
307
+ reducer.reducePacket(val, &accum);
308
+ }
309
+
310
+ #pragma unroll
311
+ for (int offset = warpSize/2; offset > 0; offset /= 2) {
312
+ #if defined(EIGEN_HIPCC)
313
+ PacketType r1;
314
+ half2* hr = reinterpret_cast<half2*>(&r1);
315
+ half2* hacc = reinterpret_cast<half2*>(&accum);
316
+ for (int i = 0; i < packet_width / 2; i++) {
317
+ // FIXME : remove this workaround once we have native half/half2 support for __shfl_down
318
+ union { int i; half2 h; } wka_in, wka_out;
319
+ wka_in.h = hacc[i];
320
+ wka_out.i = __shfl_down(wka_in.i, offset, warpSize);
321
+ hr[i] = wka_out.h;
322
+ }
323
+ reducer.reducePacket(r1, &accum);
324
+ #elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
325
+ PacketType r1;
326
+ half2* hr = reinterpret_cast<half2*>(&r1);
327
+ half2* hacc = reinterpret_cast<half2*>(&accum);
328
+ for (int i = 0; i < packet_width / 2; i++) {
329
+ hr[i] = __shfl_down(hacc[i], offset, warpSize);
330
+ }
331
+ reducer.reducePacket(r1, &accum);
332
+ #else
333
+ PacketType r1;
334
+ half2* hr = reinterpret_cast<half2*>(&r1);
335
+ half2* hacc = reinterpret_cast<half2*>(&accum);
336
+ for (int i = 0; i < packet_width / 2; i++) {
337
+ hr[i] = __shfl_down_sync(0xFFFFFFFF, hacc[i], (unsigned)offset, warpSize);
338
+ }
339
+ reducer.reducePacket(r1, &accum);
340
+
341
+ #endif
342
+ }
343
+
344
+ if ((threadIdx.x & (warpSize - 1)) == 0) {
345
+ atomicReduce(reinterpret_cast<PacketType*>(scratch), accum, reducer);
346
+ }
347
+
348
+ __syncthreads();
349
+ half2* rv1 = reinterpret_cast<half2*>(scratch);
350
+ if (packet_width > 2) {
351
+ reducer.reducePacket(rv1[2], rv1);
352
+ reducer.reducePacket(rv1[3], rv1 + 1);
353
+ reducer.reducePacket(rv1[1], rv1);
354
+ }
355
+ if (gridDim.x == 1) {
356
+ if (first_index == 0) {
357
+ half tmp = __low2half(*rv1);
358
+ reducer.reduce(__high2half(*rv1), &tmp);
359
+ *output = tmp;
360
+ }
361
+ }
362
+ }
363
+
364
+ template <typename Op>
365
+ __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionCleanupKernelHalfFloat(Op reducer, half* output, half* scratch) {
366
+ eigen_assert(threadIdx.x == 1);
367
+ typedef packet_traits<Eigen::half>::type packet_type;
368
+ if (unpacket_traits<packet_type>::size == 1) {
369
+ *output = *scratch;
370
+ } else {
371
+ half2* pscratch = reinterpret_cast<half2*>(scratch);
372
+ half tmp = __float2half(0.f);
373
+ for (int i = 0; i < unpacket_traits<packet_type>::size; i += 2) {
374
+ reducer.reduce(__low2half(*pscratch), &tmp);
375
+ reducer.reduce(__high2half(*pscratch), &tmp);
376
+ pscratch++;
377
+ }
378
+ *output = tmp;
379
+ }
380
+ }
381
+
382
+ #endif // EIGEN_HAS_GPU_FP16
383
+
384
+ template <typename Self, typename Op, typename OutputType, bool PacketAccess, typename Enabled = void>
385
+ struct FullReductionLauncher {
386
+ static void run(const Self&, Op&, const GpuDevice&, OutputType*, typename Self::Index) {
387
+ gpu_assert(false && "Should only be called on doubles, floats and half floats");
388
+ }
389
+ };
390
+
391
+ // Specialization for float and double
392
+ template <typename Self, typename Op, typename OutputType, bool PacketAccess>
393
+ struct FullReductionLauncher<
394
+ Self, Op, OutputType, PacketAccess,
395
+ typename internal::enable_if<
396
+ internal::is_same<float, OutputType>::value ||
397
+ internal::is_same<double, OutputType>::value,
398
+ void>::type> {
399
+ static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs) {
400
+
401
+ typedef typename Self::Index Index;
402
+ const int block_size = 256;
403
+ const int num_per_thread = 128;
404
+ const int num_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
405
+
406
+ unsigned int* semaphore = NULL;
407
+ if (num_blocks > 1) {
408
+ semaphore = device.semaphore();
409
+ }
410
+
411
+ LAUNCH_GPU_KERNEL((FullReductionKernel<block_size, num_per_thread, Self, Op, Index>),
412
+ num_blocks, block_size, 0, device, reducer, self, num_coeffs, output, semaphore);
413
+ }
414
+ };
415
+
416
+ #ifdef EIGEN_HAS_GPU_FP16
417
+ template <typename Self, typename Op>
418
+ struct FullReductionLauncher<Self, Op, Eigen::half, false> {
419
+ static void run(const Self&, Op&, const GpuDevice&, half*, typename Self::Index) {
420
+ gpu_assert(false && "Should not be called since there is no packet accessor");
421
+ }
422
+ };
423
+
424
+ template <typename Self, typename Op>
425
+ struct FullReductionLauncher<Self, Op, Eigen::half, true> {
426
+ static void run(const Self& self, Op& reducer, const GpuDevice& device, half* output, typename Self::Index num_coeffs) {
427
+ typedef typename Self::Index Index;
428
+
429
+ const int block_size = 256;
430
+ const int num_per_thread = 128;
431
+ const int num_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
432
+ half* scratch = static_cast<half*>(device.scratchpad());
433
+
434
+ if (num_blocks > 1) {
435
+ // We initialize the output and the scrathpad outside the reduction kernel when we can't be sure that there
436
+ // won't be a race conditions between multiple thread blocks.
437
+ LAUNCH_GPU_KERNEL((ReductionInitFullReduxKernelHalfFloat<Self, Op, Index>),
438
+ 1, 1, 0, device, reducer, self, num_coeffs, scratch);
439
+ }
440
+
441
+ LAUNCH_GPU_KERNEL((FullReductionKernelHalfFloat<block_size, num_per_thread, Self, Op, Index>),
442
+ num_blocks, block_size, 0, device, reducer, self, num_coeffs, output, scratch);
443
+
444
+ if (num_blocks > 1) {
445
+ LAUNCH_GPU_KERNEL((ReductionCleanupKernelHalfFloat<Op>),
446
+ 1, 1, 0, device, reducer, output, scratch);
447
+ }
448
+ }
449
+ };
450
+ #endif // EIGEN_HAS_GPU_FP16
451
+
452
+
453
+ template <typename Self, typename Op, bool Vectorizable>
454
+ struct FullReducer<Self, Op, GpuDevice, Vectorizable> {
455
+ // Unfortunately nvidia doesn't support well exotic types such as complex,
456
+ // so reduce the scope of the optimized version of the code to the simple cases
457
+ // of doubles, floats and half floats
458
+ #ifdef EIGEN_HAS_GPU_FP16
459
+ static const bool HasOptimizedImplementation = !Self::ReducerTraits::IsStateful &&
460
+ (internal::is_same<typename Self::CoeffReturnType, float>::value ||
461
+ internal::is_same<typename Self::CoeffReturnType, double>::value ||
462
+ (internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value && reducer_traits<Op, GpuDevice>::PacketAccess));
463
+ #else // EIGEN_HAS_GPU_FP16
464
+ static const bool HasOptimizedImplementation = !Self::ReducerTraits::IsStateful &&
465
+ (internal::is_same<typename Self::CoeffReturnType, float>::value ||
466
+ internal::is_same<typename Self::CoeffReturnType, double>::value);
467
+ #endif // EIGEN_HAS_GPU_FP16
468
+
469
+ template <typename OutputType>
470
+ static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output) {
471
+ gpu_assert(HasOptimizedImplementation && "Should only be called on doubles, floats or half floats");
472
+ const Index num_coeffs = array_prod(self.m_impl.dimensions());
473
+ // Don't crash when we're called with an input tensor of size 0.
474
+ if (num_coeffs == 0) {
475
+ return;
476
+ }
477
+
478
+ FullReductionLauncher<Self, Op, OutputType, reducer_traits<Op, GpuDevice>::PacketAccess>::run(self, reducer, device, output, num_coeffs);
479
+ }
480
+ };
481
+
482
+
483
+ template <int NumPerThread, typename Self,
484
+ typename Reducer, typename Index>
485
+ __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernel(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
486
+ typename Self::CoeffReturnType* output) {
487
+ #if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300)
488
+ typedef typename Self::CoeffReturnType Type;
489
+ eigen_assert(blockDim.y == 1);
490
+ eigen_assert(blockDim.z == 1);
491
+ eigen_assert(gridDim.y == 1);
492
+ eigen_assert(gridDim.z == 1);
493
+
494
+ const int unroll_times = 16;
495
+ eigen_assert(NumPerThread % unroll_times == 0);
496
+
497
+ const Index input_col_blocks = divup<Index>(num_coeffs_to_reduce, blockDim.x * NumPerThread);
498
+ const Index num_input_blocks = input_col_blocks * num_preserved_coeffs;
499
+
500
+ const Index num_threads = blockDim.x * gridDim.x;
501
+ const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
502
+
503
+ // Initialize the output values if they weren't initialized by the ReductionInitKernel
504
+ if (gridDim.x == 1) {
505
+ for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) {
506
+ output[i] = reducer.initialize();
507
+ }
508
+ __syncthreads();
509
+ }
510
+
511
+ for (Index i = blockIdx.x; i < num_input_blocks; i += gridDim.x) {
512
+ const Index row = i / input_col_blocks;
513
+
514
+ if (row < num_preserved_coeffs) {
515
+ const Index col_block = i % input_col_blocks;
516
+ const Index col_begin = col_block * blockDim.x * NumPerThread + threadIdx.x;
517
+
518
+ Type reduced_val = reducer.initialize();
519
+
520
+ for (Index j = 0; j < NumPerThread; j += unroll_times) {
521
+ const Index last_col = col_begin + blockDim.x * (j + unroll_times - 1);
522
+ if (last_col >= num_coeffs_to_reduce) {
523
+ for (Index col = col_begin + blockDim.x * j; col < num_coeffs_to_reduce; col += blockDim.x) {
524
+ const Type val = input.m_impl.coeff(row * num_coeffs_to_reduce + col);
525
+ reducer.reduce(val, &reduced_val);
526
+ }
527
+ break;
528
+ } else {
529
+ // Faster version of the loop with no branches after unrolling.
530
+ #pragma unroll
531
+ for (int k = 0; k < unroll_times; ++k) {
532
+ const Index col = col_begin + blockDim.x * (j + k);
533
+ reducer.reduce(input.m_impl.coeff(row * num_coeffs_to_reduce + col), &reduced_val);
534
+ }
535
+ }
536
+ }
537
+
538
+ #pragma unroll
539
+ for (int offset = warpSize/2; offset > 0; offset /= 2) {
540
+ #if defined(EIGEN_HIPCC)
541
+ // use std::is_floating_point to determine the type of reduced_val
542
+ // This is needed because when Type == double, hipcc will give a "call to __shfl_down is ambguous" error
543
+ // and list the float and int versions of __shfl_down as the candidate functions.
544
+ if (std::is_floating_point<Type>::value) {
545
+ reducer.reduce(__shfl_down(static_cast<float>(reduced_val), offset), &reduced_val);
546
+ } else {
547
+ reducer.reduce(__shfl_down(static_cast<int>(reduced_val), offset), &reduced_val);
548
+ }
549
+ #elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
550
+ reducer.reduce(__shfl_down(reduced_val, offset), &reduced_val);
551
+ #else
552
+ reducer.reduce(__shfl_down_sync(0xFFFFFFFF, reduced_val, offset), &reduced_val);
553
+ #endif
554
+ }
555
+
556
+ if ((threadIdx.x & (warpSize - 1)) == 0) {
557
+ atomicReduce(&(output[row]), reduced_val, reducer);
558
+ }
559
+ }
560
+ }
561
+ #else // EIGEN_CUDA_ARCH >= 300
562
+ gpu_assert(0 && "Shouldn't be called on unsupported device");
563
+ #endif // EIGEN_CUDA_ARCH >= 300
564
+ }
565
+
566
+ #ifdef EIGEN_HAS_GPU_FP16
567
+ // Inner-dimension reduction kernel for half floats: each warp reduces two adjacent output rows at a time using packets of halves, and one thread per warp merges the resulting pair into `output` through a single half2 atomicReduce.
567
+ template <int NumPerThread, typename Self,
568
+ typename Reducer, typename Index>
569
+ __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
570
+ half* output) {
571
+ eigen_assert(blockDim.y == 1); // only the x dimension of blocks and grid is used for indexing below
572
+ eigen_assert(blockDim.z == 1);
573
+ eigen_assert(gridDim.y == 1);
574
+ eigen_assert(gridDim.z == 1);
575
+ // Packet geometry: one unrolled step consumes unroll_times packets of packet_width halves, i.e. 16 half values per thread.
576
+ typedef typename packet_traits<Eigen::half>::type PacketType;
577
+ const int packet_width = unpacket_traits<PacketType>::size;
578
+ const int unroll_times = 16 / packet_width;
579
+ eigen_assert(NumPerThread % unroll_times == 0);
580
+ eigen_assert(unroll_times % 2 == 0);
581
+ // Work is split into (row pair, column block) tiles; num_input_blocks is the number of such tiles.
582
+ const Index input_col_blocks = divup<Index>(num_coeffs_to_reduce, blockDim.x * NumPerThread * 2);
583
+ const Index num_input_blocks = divup<Index>(input_col_blocks * num_preserved_coeffs, 2);
584
+
585
+ const Index num_threads = blockDim.x * gridDim.x;
586
+ const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
587
+
588
+ // Initialize the output values if they weren't initialized by the ReductionInitKernel
589
+ if (gridDim.x == 1) { // single block only: the barrier below synchronizes just this block
590
+ Index i = packet_width * thread_id;
591
+ for (; i + packet_width <= num_preserved_coeffs;
592
+ i += packet_width * num_threads) {
593
+ PacketType* poutput = reinterpret_cast<PacketType*>(output + i);
594
+ *poutput = reducer.template initializePacket<PacketType>();
595
+ }
596
+ if (i < num_preserved_coeffs) { // NOTE(review): writes only output[i]; if more than one element can remain past the last full packet (packet_width > 2) the others are not initialized here - confirm
597
+ output[i] = reducer.initialize();
598
+ }
599
+ __syncthreads();
600
+ }
601
+ // Each block walks the tiles with a stride of gridDim.x.
602
+ for (Index i = blockIdx.x; i < num_input_blocks; i += gridDim.x) {
603
+ const Index row = 2 * (i / input_col_blocks); // everybody takes 2 rows
604
+
605
+ if (row + 1 < num_preserved_coeffs) { // needs a complete pair of rows; an unpaired last row is not processed here
606
+ const Index col_block = i % input_col_blocks;
607
+ const Index col_begin =
608
+ packet_width * (col_block * blockDim.x * NumPerThread + threadIdx.x);
609
+ // One packet accumulator per row of the pair.
610
+ PacketType reduced_val1 = reducer.template initializePacket<PacketType>();
611
+ PacketType reduced_val2 = reducer.template initializePacket<PacketType>();
612
+
613
+ for (Index j = 0; j < NumPerThread; j += unroll_times) {
614
+ const Index last_col =
615
+ col_begin + blockDim.x * (j + unroll_times - 1) * packet_width;
616
+ if (last_col >= num_coeffs_to_reduce) { // this unrolled step would reach past the end of the row: use the bounds-checked path
617
+ Index col = col_begin + blockDim.x * j; // NOTE(review): start and stride here use blockDim.x without the packet_width factor applied in the unrolled path below - confirm this is intended
618
+ for (; col + packet_width <= num_coeffs_to_reduce;
619
+ col += blockDim.x) {
620
+ const PacketType val1 = input.m_impl.template packet<Unaligned>(
621
+ row * num_coeffs_to_reduce + col);
622
+ reducer.reducePacket(val1, &reduced_val1);
623
+ const PacketType val2 = input.m_impl.template packet<Unaligned>(
624
+ (row + 1) * num_coeffs_to_reduce + col);
625
+ reducer.reducePacket(val2, &reduced_val2);
626
+ }
627
+ if (col < num_coeffs_to_reduce) { // fewer than packet_width elements remain: assemble a partial packet by hand
628
+ PacketType r1 = reducer.template initializePacket<PacketType>();
629
+ PacketType r2 = reducer.template initializePacket<PacketType>();
630
+ half2* hr1 = reinterpret_cast<half2*>(&r1);
631
+ half2* hr2 = reinterpret_cast<half2*>(&r2);
632
+ while (col + 1 < num_coeffs_to_reduce) { // copy the remaining elements two at a time as half2 pairs
633
+ *hr1 = __halves2half2(
634
+ input.m_impl.coeff(row * num_coeffs_to_reduce + col),
635
+ input.m_impl.coeff(row * num_coeffs_to_reduce + col + 1));
636
+ *hr2 = __halves2half2(
637
+ input.m_impl.coeff((row + 1) * num_coeffs_to_reduce + col),
638
+ input.m_impl.coeff((row + 1) * num_coeffs_to_reduce + col +
639
+ 1));
640
+ hr1++;
641
+ hr2++;
642
+ col += 2;
643
+ }
644
+ if (col < num_coeffs_to_reduce) {
645
+ // Peel: a single element is left over; pair it with the reducer's initial value.
646
+ const half last1 =
647
+ input.m_impl.coeff(row * num_coeffs_to_reduce + col);
648
+ *hr1 = __halves2half2(last1, reducer.initialize());
649
+ const half last2 =
650
+ input.m_impl.coeff((row + 1) * num_coeffs_to_reduce + col);
651
+ *hr2 = __halves2half2(last2, reducer.initialize());
652
+ }
653
+ reducer.reducePacket(r1, &reduced_val1);
654
+ reducer.reducePacket(r2, &reduced_val2);
655
+ }
656
+ break; // the bounds-checked path ends this thread's loop over j
657
+ } else {
658
+ // Faster version of the loop with no branches after unrolling.
659
+ #pragma unroll
660
+ for (int k = 0; k < unroll_times; ++k) {
661
+ const Index col = col_begin + blockDim.x * (j + k) * packet_width;
662
+ reducer.reducePacket(input.m_impl.template packet<Unaligned>(
663
+ row * num_coeffs_to_reduce + col),
664
+ &reduced_val1);
665
+ reducer.reducePacket(input.m_impl.template packet<Unaligned>(
666
+ (row + 1) * num_coeffs_to_reduce + col),
667
+ &reduced_val2);
668
+ }
669
+ }
670
+ }
671
+ // Warp-level reduction: fold in the accumulators shuffled down by `offset` lanes, halving offset each round.
672
+ #pragma unroll
673
+ for (int offset = warpSize/2; offset > 0; offset /= 2) {
674
+ #if defined(EIGEN_HIPCC)
675
+ PacketType r1;
676
+ PacketType r2;
677
+ half2* hr1 = reinterpret_cast<half2*>(&r1);
678
+ half2* hr2 = reinterpret_cast<half2*>(&r2);
679
+ half2* rv1 = reinterpret_cast<half2*>(&reduced_val1);
680
+ half2* rv2 = reinterpret_cast<half2*>(&reduced_val2);
681
+ for (int i = 0; i < packet_width / 2; i++) { // this int i shadows the outer tile index i
682
+ // FIXME : remove this workaround once we have native half/half2 support for __shfl_down
683
+ union { int i; half2 h; } wka_in1, wka_out1;
684
+ wka_in1.h = rv1[i];
685
+ wka_out1.i = __shfl_down(wka_in1.i, offset, warpSize);
686
+ hr1[i] = wka_out1.h;
687
+
688
+ union { int i; half2 h; } wka_in2, wka_out2;
689
+ wka_in2.h = rv2[i];
690
+ wka_out2.i = __shfl_down(wka_in2.i, offset, warpSize);
691
+ hr2[i] = wka_out2.h;
692
+ }
693
+ reducer.reducePacket(r1, &reduced_val1);
694
+ reducer.reducePacket(r2, &reduced_val2);
695
+ #elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000 // older CUDA SDK: __shfl_down applied directly to half2
696
+ PacketType r1;
697
+ PacketType r2;
698
+ half2* hr1 = reinterpret_cast<half2*>(&r1);
699
+ half2* hr2 = reinterpret_cast<half2*>(&r2);
700
+ half2* rv1 = reinterpret_cast<half2*>(&reduced_val1);
701
+ half2* rv2 = reinterpret_cast<half2*>(&reduced_val2);
702
+ for (int i = 0; i < packet_width / 2; i++) {
703
+ hr1[i] = __shfl_down(rv1[i], offset, warpSize);
704
+ hr2[i] = __shfl_down(rv2[i], offset, warpSize);
705
+ }
706
+ reducer.reducePacket(r1, &reduced_val1);
707
+ reducer.reducePacket(r2, &reduced_val2);
708
+ #else // remaining toolchains: __shfl_down_sync with the 0xFFFFFFFF lane mask
709
+ PacketType r1;
710
+ PacketType r2;
711
+ half2* hr1 = reinterpret_cast<half2*>(&r1);
712
+ half2* hr2 = reinterpret_cast<half2*>(&r2);
713
+ half2* rr1 = reinterpret_cast<half2*>(&reduced_val1);
714
+ half2* rr2 = reinterpret_cast<half2*>(&reduced_val2);
715
+ for (int i = 0; i < packet_width / 2; i++) {
716
+ hr1[i] =
717
+ __shfl_down_sync(0xFFFFFFFF, rr1[i], (unsigned)offset, warpSize);
718
+ hr2[i] =
719
+ __shfl_down_sync(0xFFFFFFFF, rr2[i], (unsigned)offset, warpSize);
720
+ }
721
+ reducer.reducePacket(r1, &reduced_val1);
722
+ reducer.reducePacket(r2, &reduced_val2);
723
+
724
+ #endif
725
+ }
726
+ half2* rv1 = reinterpret_cast<half2*>(&reduced_val1);
727
+ half2* rv2 = reinterpret_cast<half2*>(&reduced_val2);
728
+ half2 val;
729
+ if (packet_width > 2) { // folds rv[2], rv[3] and rv[1] into rv[0]; presumably PacketType holds four half2 in this case - confirm
730
+ reducer.reducePacket(rv1[2], rv1);
731
+ reducer.reducePacket(rv1[3], rv1 + 1);
732
+ reducer.reducePacket(rv1[1], rv1);
733
+ reducer.reducePacket(rv2[2], rv2);
734
+ reducer.reducePacket(rv2[3], rv2 + 1);
735
+ reducer.reducePacket(rv2[1], rv2);
736
+ }
737
+ half val1 = __low2half(*rv1); // combine the low and high halves of each row's half2 into one scalar per row
738
+ reducer.reduce(__high2half(*rv1), &val1);
739
+ half val2 = __low2half(*rv2);
740
+ reducer.reduce(__high2half(*rv2), &val2);
741
+ val = __halves2half2(val1, val2);
742
+ if ((threadIdx.x & (warpSize - 1)) == 0) { // only threads whose index is a multiple of warpSize publish (assuming warpSize is a power of two)
743
+ half* loc = output + row;
744
+ atomicReduce((half2*)loc, val, reducer); // row is even (2 * ...), so val carries the results for rows row and row + 1
745
+ }
746
+ }
747
+ }
748
+ }
750
+
751
+ #endif // EIGEN_HAS_GPU_FP16
752
+ // Primary InnerReductionLauncher template, used when no specialization matches OutputType: it launches nothing, asserts, and returns true (elsewhere in this file `return true` accompanies falling back to the slower code path).
753
+ template <typename Self, typename Op, typename OutputType, bool PacketAccess, typename Enabled = void>
754
+ struct InnerReductionLauncher {
755
+ static EIGEN_DEVICE_FUNC bool run(const Self&, Op&, const GpuDevice&, OutputType*, typename Self::Index, typename Self::Index) {
756
+ gpu_assert(false && "Should only be called to reduce doubles, floats and half floats on a gpu device");
757
+ return true;
758
+ }
759
+ };
760
+
761
+ // Specialization for float and double: sizes the grid on the host and launches InnerReductionKernel, preceded by ReductionInitKernel when more than one block is used.
762
+ template <typename Self, typename Op, typename OutputType, bool PacketAccess>
763
+ struct InnerReductionLauncher<
764
+ Self, Op, OutputType, PacketAccess,
765
+ typename internal::enable_if<
766
+ internal::is_same<float, OutputType>::value ||
767
+ internal::is_same<double, OutputType>::value,
768
+ void>::type> {
769
+ static bool run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
770
+ typedef typename Self::Index Index;
771
+ // Grid size: one block per block_size * num_per_thread input coefficients, capped at multiprocessors * threads-per-multiprocessor / block_size.
772
+ const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
773
+ const int block_size = 256;
774
+ const int num_per_thread = 128;
775
+ const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread); // NOTE(review): num_coeffs is an Index but goes through divup<int>; presumably narrowed for very large inputs - confirm
776
+ const int max_blocks = device.getNumGpuMultiProcessors() *
777
+ device.maxGpuThreadsPerMultiProcessor() / block_size;
778
+ const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
779
+
780
+ if (num_blocks > 1) {
781
+ // We initialize the outputs outside the reduction kernel when we can't be sure that there
782
+ // won't be race conditions between multiple thread blocks.
783
+ const int dyn_blocks = divup<int>(num_preserved_vals, 1024); // these three locals shadow the outer ones and only size the init launch
784
+ const int max_blocks = device.getNumGpuMultiProcessors() *
785
+ device.maxGpuThreadsPerMultiProcessor() / 1024;
786
+ const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
787
+ LAUNCH_GPU_KERNEL((ReductionInitKernel<OutputType, Index>),
788
+ num_blocks, 1024, 0, device, reducer.initialize(),
789
+ num_preserved_vals, output);
790
+ }
791
+ // The reduction launch uses the outer num_blocks and block_size computed above.
792
+ LAUNCH_GPU_KERNEL((InnerReductionKernel<num_per_thread, Self, Op, Index>),
793
+ num_blocks, block_size, 0, device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output);
794
+
795
+ return false; // presumably signals that the reduction was handled here (the fallback paths return true) - confirm against the caller
796
+ }
797
+ };
798
+
799
+ #ifdef EIGEN_HAS_GPU_FP16
800
+ template <typename Self, typename Op>
801
+ struct InnerReductionLauncher<Self, Op, Eigen::half, false> { // half output with PacketAccess == false: nothing is launched
802
+ static bool run(const Self&, Op&, const GpuDevice&, half*, typename Self::Index, typename Self::Index) {
803
+ gpu_assert(false && "Should not be called since there is no packet accessor");
804
+ return true; // same value the other launchers return when reverting to the slower code path
805
+ }
806
+ };
807
+ // Half-float launcher with packet access: launches InnerReductionKernelHalfFloat, which reduces output rows in pairs, so an odd number of preserved values is rejected up front.
808
+ template <typename Self, typename Op>
809
+ struct InnerReductionLauncher<Self, Op, Eigen::half, true> {
810
+ static bool run(const Self& self, Op& reducer, const GpuDevice& device, half* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
811
+ typedef typename Self::Index Index;
812
+
813
+ if (num_preserved_vals % 2 != 0) {
814
+ // Not supported yet, revert to the slower code path
815
+ return true;
816
+ }
817
+ // Grid size: one block per block_size * num_per_thread input coefficients, capped at multiprocessors * threads-per-multiprocessor / block_size.
818
+ const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
819
+ const int block_size = /*256*/128;
820
+ const int num_per_thread = /*128*/64;
821
+ const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread); // NOTE(review): num_coeffs is an Index but goes through divup<int>; presumably narrowed for very large inputs - confirm
822
+ const int max_blocks = device.getNumGpuMultiProcessors() *
823
+ device.maxGpuThreadsPerMultiProcessor() / block_size;
824
+ const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
825
+
826
+ if (num_blocks > 1) {
827
+ // We initialize the outputs outside the reduction kernel when we can't be sure that there
828
+ // won't be race conditions between multiple thread blocks.
829
+ LAUNCH_GPU_KERNEL((ReductionInitKernelHalfFloat<Self, Op, Index>),
830
+ 1, 1, 0, device, reducer, self, num_preserved_vals, output);
831
+ } // NOTE(review): the init launch above passes 1, 1 where the other launches pass block count and block size, i.e. presumably a single thread - confirm
832
+
833
+ LAUNCH_GPU_KERNEL((InnerReductionKernelHalfFloat<num_per_thread, Self, Op, Index>),
834
+ num_blocks, block_size, 0, device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output);
835
+
836
+ return false; // presumably signals that the reduction was handled here (the fallback paths return true) - confirm against the caller
837
+ }
838
+ };
839
+ #endif // EIGEN_HAS_GPU_FP16
840
+
841
+ // GpuDevice specialization of InnerReducer: declines empty inputs and short reductions by returning true, otherwise forwards to the InnerReductionLauncher selected by OutputType and packet access.
842
+ template <typename Self, typename Op>
843
+ struct InnerReducer<Self, Op, GpuDevice> {
844
+ // Unfortunately nvidia doesn't support well exotic types such as complex,
845
+ // so reduce the scope of the optimized version of the code to the simple case
846
+ // of floats, doubles and (when the reducer has packet access) half floats.
847
+ #ifdef EIGEN_HAS_GPU_FP16
848
+ static const bool HasOptimizedImplementation = !Self::ReducerTraits::IsStateful &&
849
+ (internal::is_same<typename Self::CoeffReturnType, float>::value ||
850
+ internal::is_same<typename Self::CoeffReturnType, double>::value ||
851
+ (internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value && reducer_traits<Op, GpuDevice>::PacketAccess));
852
+ #else // EIGEN_HAS_GPU_FP16
853
+ static const bool HasOptimizedImplementation = !Self::ReducerTraits::IsStateful &&
854
+ (internal::is_same<typename Self::CoeffReturnType, float>::value ||
855
+ internal::is_same<typename Self::CoeffReturnType, double>::value);
856
+ #endif // EIGEN_HAS_GPU_FP16
857
+ // Returns true when nothing was launched here; otherwise returns whatever the selected launcher returns.
858
+ template <typename OutputType>
859
+ static bool run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
860
+ gpu_assert(HasOptimizedImplementation && "Should only be called on doubles, floats or half floats");
861
+ const Index num_coeffs = array_prod(self.m_impl.dimensions());
862
+ // Don't crash when we're called with an input tensor of size 0.
863
+ if (num_coeffs == 0) {
864
+ return true;
865
+ }
866
+ // It's faster to use the usual code.
867
+ if (num_coeffs_to_reduce <= 128) {
868
+ return true;
869
+ }
870
+
871
+ return InnerReductionLauncher<Self, Op, OutputType, reducer_traits<Op, GpuDevice>::PacketAccess>::run(self, reducer, device, output, num_coeffs_to_reduce, num_preserved_vals);
872
+ }
873
+ };
874
+ // Outer-dimension reduction kernel: each work item reduces up to NumPerThread consecutive rows of one column (element (j, col) is read at j * num_preserved_coeffs + col) and merges the partial result into output[col] with atomicReduce.
875
+ template <int NumPerThread, typename Self,
876
+ typename Reducer, typename Index>
877
+ __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void OuterReductionKernel(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
878
+ typename Self::CoeffReturnType* output) {
879
+ const Index num_threads = blockDim.x * gridDim.x;
880
+ const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
881
+ // Initialize the output values if they weren't initialized by the ReductionInitKernel
882
+ if (gridDim.x == 1) { // single block only: the barrier below synchronizes just this block
883
+ for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) {
884
+ output[i] = reducer.initialize();
885
+ }
886
+ __syncthreads();
887
+ }
888
+
889
+ // Do the reduction: max_iter work items, one per (column, chunk of NumPerThread rows).
890
+ const Index max_iter = num_preserved_coeffs * divup<Index>(num_coeffs_to_reduce, NumPerThread);
891
+ for (Index i = thread_id; i < max_iter; i += num_threads) {
892
+ const Index input_col = i % num_preserved_coeffs;
893
+ const Index input_row = (i / num_preserved_coeffs) * NumPerThread;
894
+ typename Self::CoeffReturnType reduced_val = reducer.initialize();
895
+ const Index max_row = numext::mini(input_row + NumPerThread, num_coeffs_to_reduce); // clamp the last chunk to the number of rows
896
+ for (Index j = input_row; j < max_row; j++) {
897
+ typename Self::CoeffReturnType val = input.m_impl.coeff(j * num_preserved_coeffs + input_col);
898
+ reducer.reduce(val, &reduced_val);
899
+ }
900
+ atomicReduce(&(output[input_col]), reduced_val, reducer); // several work items can share input_col, hence the atomic merge
901
+ }
902
+ }
903
+
904
+ // GpuDevice specialization of OuterReducer: a catch-all run() template that asserts and returns true, plus a float* overload that sizes the grid and launches OuterReductionKernel.
905
+ template <typename Self, typename Op>
906
+ struct OuterReducer<Self, Op, GpuDevice> {
907
+ // Unfortunately nvidia doesn't support well exotic types such as complex,
908
+ // so reduce the scope of the optimized version of the code to the simple case
909
+ // of floats and doubles. NOTE(review): only a float* overload of run() is defined below, so a double* output presumably selects the asserting template - confirm.
910
+ static const bool HasOptimizedImplementation = !Self::ReducerTraits::IsStateful &&
911
+ (internal::is_same<typename Self::CoeffReturnType, float>::value ||
912
+ internal::is_same<typename Self::CoeffReturnType, double>::value);
913
+ template <typename Device, typename OutputType>
914
+ static
915
+ #if !defined(EIGEN_HIPCC)
916
+ // FIXME : leaving this EIGEN_DEVICE_FUNC in, results in the following runtime error
917
+ // (in the cxx11_tensor_reduction_gpu test)
918
+ //
919
+ // terminate called after throwing an instance of 'std::runtime_error'
920
+ // what(): No device code available for function: _ZN5Eigen8internal20OuterReductionKernelIL...
921
+ //
922
+ // don't know why this happens (and why is it a runtime error instead of a compile time error)
923
+ //
924
+ // this will be fixed by HIP PR#457
925
+ EIGEN_DEVICE_FUNC
926
+ #endif
927
+ bool run(const Self&, Op&, const Device&, OutputType*, typename Self::Index, typename Self::Index) {
928
+ gpu_assert(false && "Should only be called to reduce doubles or floats on a gpu device");
929
+ return true;
930
+ }
931
+ // Overload for float output: returns true without launching for short reductions, otherwise launches the kernels and returns false.
932
+ static bool run(const Self& self, Op& reducer, const GpuDevice& device, float* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
933
+ typedef typename Self::Index Index;
934
+
935
+ // It's faster to use the usual code.
936
+ if (num_coeffs_to_reduce <= 32) {
937
+ return true;
938
+ }
939
+ // Grid size: one block per block_size * num_per_thread input coefficients, capped at multiprocessors * threads-per-multiprocessor / block_size.
940
+ const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
941
+ const int block_size = 256;
942
+ const int num_per_thread = 16;
943
+ const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread); // NOTE(review): num_coeffs is an Index but goes through divup<int>; presumably narrowed for very large inputs - confirm
944
+ const int max_blocks = device.getNumGpuMultiProcessors() *
945
+ device.maxGpuThreadsPerMultiProcessor() / block_size;
946
+ const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
947
+
948
+ if (num_blocks > 1) {
949
+ // With more than one block the outputs are initialized by a separate kernel launch; the reduction
950
+ // kernel only initializes them itself when it runs as a single block (gridDim.x == 1).
951
+ const int dyn_blocks = divup<int>(num_preserved_vals, 1024); // these three locals shadow the outer ones and only size the init launch
952
+ const int max_blocks = device.getNumGpuMultiProcessors() *
953
+ device.maxGpuThreadsPerMultiProcessor() / 1024;
954
+ const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
955
+ LAUNCH_GPU_KERNEL((ReductionInitKernel<float, Index>),
956
+ num_blocks, 1024, 0, device, reducer.initialize(),
957
+ num_preserved_vals, output);
958
+ }
959
+ // The reduction launch uses the outer num_blocks and block_size computed above.
960
+ LAUNCH_GPU_KERNEL((OuterReductionKernel<num_per_thread, Self, Op, Index>),
961
+ num_blocks, block_size, 0, device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output);
962
+
963
+ return false; // presumably signals that the reduction was handled here (the fallback paths return true) - confirm against the caller
964
+ }
965
+ };
966
+
967
+ #endif // defined(EIGEN_USE_GPU) && defined(EIGEN_GPUCC)
968
+
969
+
970
+ } // end namespace internal
971
+ } // end namespace Eigen
972
+
973
+ #endif // EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_GPU_H