tomoto 0.2.3 → 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (347) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +6 -0
  3. data/README.md +8 -10
  4. data/ext/tomoto/extconf.rb +6 -2
  5. data/ext/tomoto/{ext.cpp → tomoto.cpp} +1 -1
  6. data/lib/tomoto/version.rb +1 -1
  7. data/lib/tomoto.rb +5 -1
  8. data/vendor/EigenRand/EigenRand/Core.h +10 -10
  9. data/vendor/EigenRand/EigenRand/Dists/Basic.h +208 -9
  10. data/vendor/EigenRand/EigenRand/Dists/Discrete.h +52 -31
  11. data/vendor/EigenRand/EigenRand/Dists/GammaPoisson.h +9 -8
  12. data/vendor/EigenRand/EigenRand/Dists/NormalExp.h +28 -21
  13. data/vendor/EigenRand/EigenRand/EigenRand +11 -6
  14. data/vendor/EigenRand/EigenRand/Macro.h +13 -7
  15. data/vendor/EigenRand/EigenRand/MorePacketMath.h +348 -740
  16. data/vendor/EigenRand/EigenRand/MvDists/Multinomial.h +5 -3
  17. data/vendor/EigenRand/EigenRand/MvDists/MvNormal.h +9 -3
  18. data/vendor/EigenRand/EigenRand/PacketFilter.h +11 -253
  19. data/vendor/EigenRand/EigenRand/PacketRandomEngine.h +21 -47
  20. data/vendor/EigenRand/EigenRand/RandUtils.h +50 -344
  21. data/vendor/EigenRand/EigenRand/arch/AVX/MorePacketMath.h +619 -0
  22. data/vendor/EigenRand/EigenRand/arch/AVX/PacketFilter.h +149 -0
  23. data/vendor/EigenRand/EigenRand/arch/AVX/RandUtils.h +228 -0
  24. data/vendor/EigenRand/EigenRand/arch/NEON/MorePacketMath.h +473 -0
  25. data/vendor/EigenRand/EigenRand/arch/NEON/PacketFilter.h +142 -0
  26. data/vendor/EigenRand/EigenRand/arch/NEON/RandUtils.h +126 -0
  27. data/vendor/EigenRand/EigenRand/arch/SSE/MorePacketMath.h +501 -0
  28. data/vendor/EigenRand/EigenRand/arch/SSE/PacketFilter.h +133 -0
  29. data/vendor/EigenRand/EigenRand/arch/SSE/RandUtils.h +120 -0
  30. data/vendor/EigenRand/EigenRand/doc.h +24 -12
  31. data/vendor/EigenRand/README.md +57 -4
  32. data/vendor/eigen/COPYING.APACHE +203 -0
  33. data/vendor/eigen/COPYING.BSD +1 -1
  34. data/vendor/eigen/COPYING.MINPACK +51 -52
  35. data/vendor/eigen/Eigen/Cholesky +0 -1
  36. data/vendor/eigen/Eigen/Core +112 -265
  37. data/vendor/eigen/Eigen/Eigenvalues +2 -3
  38. data/vendor/eigen/Eigen/Geometry +5 -8
  39. data/vendor/eigen/Eigen/Householder +0 -1
  40. data/vendor/eigen/Eigen/Jacobi +0 -1
  41. data/vendor/eigen/Eigen/KLUSupport +41 -0
  42. data/vendor/eigen/Eigen/LU +2 -5
  43. data/vendor/eigen/Eigen/OrderingMethods +0 -3
  44. data/vendor/eigen/Eigen/PaStiXSupport +1 -0
  45. data/vendor/eigen/Eigen/PardisoSupport +0 -0
  46. data/vendor/eigen/Eigen/QR +2 -3
  47. data/vendor/eigen/Eigen/QtAlignedMalloc +0 -1
  48. data/vendor/eigen/Eigen/SVD +0 -1
  49. data/vendor/eigen/Eigen/Sparse +0 -2
  50. data/vendor/eigen/Eigen/SparseCholesky +0 -8
  51. data/vendor/eigen/Eigen/SparseLU +4 -0
  52. data/vendor/eigen/Eigen/SparseQR +0 -1
  53. data/vendor/eigen/Eigen/src/Cholesky/LDLT.h +42 -27
  54. data/vendor/eigen/Eigen/src/Cholesky/LLT.h +39 -23
  55. data/vendor/eigen/Eigen/src/CholmodSupport/CholmodSupport.h +90 -47
  56. data/vendor/eigen/Eigen/src/Core/ArithmeticSequence.h +413 -0
  57. data/vendor/eigen/Eigen/src/Core/Array.h +99 -11
  58. data/vendor/eigen/Eigen/src/Core/ArrayBase.h +3 -3
  59. data/vendor/eigen/Eigen/src/Core/ArrayWrapper.h +21 -21
  60. data/vendor/eigen/Eigen/src/Core/Assign.h +1 -1
  61. data/vendor/eigen/Eigen/src/Core/AssignEvaluator.h +125 -50
  62. data/vendor/eigen/Eigen/src/Core/Assign_MKL.h +10 -10
  63. data/vendor/eigen/Eigen/src/Core/BandMatrix.h +16 -16
  64. data/vendor/eigen/Eigen/src/Core/Block.h +56 -60
  65. data/vendor/eigen/Eigen/src/Core/BooleanRedux.h +29 -31
  66. data/vendor/eigen/Eigen/src/Core/CommaInitializer.h +7 -3
  67. data/vendor/eigen/Eigen/src/Core/CoreEvaluators.h +325 -272
  68. data/vendor/eigen/Eigen/src/Core/CoreIterators.h +5 -0
  69. data/vendor/eigen/Eigen/src/Core/CwiseBinaryOp.h +21 -22
  70. data/vendor/eigen/Eigen/src/Core/CwiseNullaryOp.h +153 -18
  71. data/vendor/eigen/Eigen/src/Core/CwiseUnaryOp.h +6 -6
  72. data/vendor/eigen/Eigen/src/Core/CwiseUnaryView.h +14 -10
  73. data/vendor/eigen/Eigen/src/Core/DenseBase.h +132 -42
  74. data/vendor/eigen/Eigen/src/Core/DenseCoeffsBase.h +25 -21
  75. data/vendor/eigen/Eigen/src/Core/DenseStorage.h +153 -71
  76. data/vendor/eigen/Eigen/src/Core/Diagonal.h +21 -23
  77. data/vendor/eigen/Eigen/src/Core/DiagonalMatrix.h +50 -2
  78. data/vendor/eigen/Eigen/src/Core/DiagonalProduct.h +1 -1
  79. data/vendor/eigen/Eigen/src/Core/Dot.h +10 -10
  80. data/vendor/eigen/Eigen/src/Core/EigenBase.h +10 -9
  81. data/vendor/eigen/Eigen/src/Core/ForceAlignedAccess.h +8 -4
  82. data/vendor/eigen/Eigen/src/Core/Fuzzy.h +3 -3
  83. data/vendor/eigen/Eigen/src/Core/GeneralProduct.h +20 -10
  84. data/vendor/eigen/Eigen/src/Core/GenericPacketMath.h +599 -152
  85. data/vendor/eigen/Eigen/src/Core/GlobalFunctions.h +40 -33
  86. data/vendor/eigen/Eigen/src/Core/IO.h +40 -7
  87. data/vendor/eigen/Eigen/src/Core/IndexedView.h +237 -0
  88. data/vendor/eigen/Eigen/src/Core/Inverse.h +9 -10
  89. data/vendor/eigen/Eigen/src/Core/Map.h +7 -7
  90. data/vendor/eigen/Eigen/src/Core/MapBase.h +10 -3
  91. data/vendor/eigen/Eigen/src/Core/MathFunctions.h +767 -125
  92. data/vendor/eigen/Eigen/src/Core/MathFunctionsImpl.h +118 -19
  93. data/vendor/eigen/Eigen/src/Core/Matrix.h +131 -25
  94. data/vendor/eigen/Eigen/src/Core/MatrixBase.h +21 -3
  95. data/vendor/eigen/Eigen/src/Core/NestByValue.h +25 -50
  96. data/vendor/eigen/Eigen/src/Core/NoAlias.h +4 -3
  97. data/vendor/eigen/Eigen/src/Core/NumTraits.h +107 -20
  98. data/vendor/eigen/Eigen/src/Core/PartialReduxEvaluator.h +232 -0
  99. data/vendor/eigen/Eigen/src/Core/PermutationMatrix.h +3 -31
  100. data/vendor/eigen/Eigen/src/Core/PlainObjectBase.h +152 -59
  101. data/vendor/eigen/Eigen/src/Core/Product.h +30 -25
  102. data/vendor/eigen/Eigen/src/Core/ProductEvaluators.h +192 -125
  103. data/vendor/eigen/Eigen/src/Core/Random.h +37 -1
  104. data/vendor/eigen/Eigen/src/Core/Redux.h +180 -170
  105. data/vendor/eigen/Eigen/src/Core/Ref.h +121 -23
  106. data/vendor/eigen/Eigen/src/Core/Replicate.h +8 -8
  107. data/vendor/eigen/Eigen/src/Core/Reshaped.h +454 -0
  108. data/vendor/eigen/Eigen/src/Core/ReturnByValue.h +7 -5
  109. data/vendor/eigen/Eigen/src/Core/Reverse.h +18 -12
  110. data/vendor/eigen/Eigen/src/Core/Select.h +8 -6
  111. data/vendor/eigen/Eigen/src/Core/SelfAdjointView.h +33 -20
  112. data/vendor/eigen/Eigen/src/Core/Solve.h +14 -14
  113. data/vendor/eigen/Eigen/src/Core/SolveTriangular.h +16 -16
  114. data/vendor/eigen/Eigen/src/Core/SolverBase.h +41 -3
  115. data/vendor/eigen/Eigen/src/Core/StableNorm.h +100 -70
  116. data/vendor/eigen/Eigen/src/Core/StlIterators.h +463 -0
  117. data/vendor/eigen/Eigen/src/Core/Stride.h +9 -4
  118. data/vendor/eigen/Eigen/src/Core/Swap.h +5 -4
  119. data/vendor/eigen/Eigen/src/Core/Transpose.h +88 -27
  120. data/vendor/eigen/Eigen/src/Core/Transpositions.h +26 -47
  121. data/vendor/eigen/Eigen/src/Core/TriangularMatrix.h +93 -75
  122. data/vendor/eigen/Eigen/src/Core/VectorBlock.h +5 -5
  123. data/vendor/eigen/Eigen/src/Core/VectorwiseOp.h +159 -70
  124. data/vendor/eigen/Eigen/src/Core/Visitor.h +137 -29
  125. data/vendor/eigen/Eigen/src/Core/arch/AVX/Complex.h +50 -129
  126. data/vendor/eigen/Eigen/src/Core/arch/AVX/MathFunctions.h +126 -337
  127. data/vendor/eigen/Eigen/src/Core/arch/AVX/PacketMath.h +1092 -155
  128. data/vendor/eigen/Eigen/src/Core/arch/AVX/TypeCasting.h +65 -1
  129. data/vendor/eigen/Eigen/src/Core/arch/AVX512/Complex.h +422 -0
  130. data/vendor/eigen/Eigen/src/Core/arch/AVX512/MathFunctions.h +207 -236
  131. data/vendor/eigen/Eigen/src/Core/arch/AVX512/PacketMath.h +1482 -495
  132. data/vendor/eigen/Eigen/src/Core/arch/AVX512/TypeCasting.h +89 -0
  133. data/vendor/eigen/Eigen/src/Core/arch/AltiVec/Complex.h +152 -165
  134. data/vendor/eigen/Eigen/src/Core/arch/AltiVec/MathFunctions.h +19 -251
  135. data/vendor/eigen/Eigen/src/Core/arch/AltiVec/MatrixProduct.h +2937 -0
  136. data/vendor/eigen/Eigen/src/Core/arch/AltiVec/MatrixProductCommon.h +221 -0
  137. data/vendor/eigen/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h +629 -0
  138. data/vendor/eigen/Eigen/src/Core/arch/AltiVec/PacketMath.h +2042 -392
  139. data/vendor/eigen/Eigen/src/Core/arch/CUDA/Complex.h +235 -80
  140. data/vendor/eigen/Eigen/src/Core/arch/Default/BFloat16.h +700 -0
  141. data/vendor/eigen/Eigen/src/Core/arch/Default/ConjHelper.h +102 -14
  142. data/vendor/eigen/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h +1649 -0
  143. data/vendor/eigen/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h +110 -0
  144. data/vendor/eigen/Eigen/src/Core/arch/Default/Half.h +942 -0
  145. data/vendor/eigen/Eigen/src/Core/arch/Default/Settings.h +1 -1
  146. data/vendor/eigen/Eigen/src/Core/arch/Default/TypeCasting.h +120 -0
  147. data/vendor/eigen/Eigen/src/Core/arch/{CUDA → GPU}/MathFunctions.h +16 -4
  148. data/vendor/eigen/Eigen/src/Core/arch/GPU/PacketMath.h +1685 -0
  149. data/vendor/eigen/Eigen/src/Core/arch/GPU/TypeCasting.h +80 -0
  150. data/vendor/eigen/Eigen/src/Core/arch/HIP/hcc/math_constants.h +23 -0
  151. data/vendor/eigen/Eigen/src/Core/arch/MSA/Complex.h +648 -0
  152. data/vendor/eigen/Eigen/src/Core/arch/MSA/MathFunctions.h +387 -0
  153. data/vendor/eigen/Eigen/src/Core/arch/MSA/PacketMath.h +1233 -0
  154. data/vendor/eigen/Eigen/src/Core/arch/NEON/Complex.h +313 -219
  155. data/vendor/eigen/Eigen/src/Core/arch/NEON/GeneralBlockPanelKernel.h +183 -0
  156. data/vendor/eigen/Eigen/src/Core/arch/NEON/MathFunctions.h +54 -70
  157. data/vendor/eigen/Eigen/src/Core/arch/NEON/PacketMath.h +4376 -549
  158. data/vendor/eigen/Eigen/src/Core/arch/NEON/TypeCasting.h +1419 -0
  159. data/vendor/eigen/Eigen/src/Core/arch/SSE/Complex.h +59 -179
  160. data/vendor/eigen/Eigen/src/Core/arch/SSE/MathFunctions.h +65 -428
  161. data/vendor/eigen/Eigen/src/Core/arch/SSE/PacketMath.h +893 -283
  162. data/vendor/eigen/Eigen/src/Core/arch/SSE/TypeCasting.h +65 -0
  163. data/vendor/eigen/Eigen/src/Core/arch/SVE/MathFunctions.h +44 -0
  164. data/vendor/eigen/Eigen/src/Core/arch/SVE/PacketMath.h +752 -0
  165. data/vendor/eigen/Eigen/src/Core/arch/SVE/TypeCasting.h +49 -0
  166. data/vendor/eigen/Eigen/src/Core/arch/SYCL/InteropHeaders.h +232 -0
  167. data/vendor/eigen/Eigen/src/Core/arch/SYCL/MathFunctions.h +301 -0
  168. data/vendor/eigen/Eigen/src/Core/arch/SYCL/PacketMath.h +670 -0
  169. data/vendor/eigen/Eigen/src/Core/arch/SYCL/SyclMemoryModel.h +694 -0
  170. data/vendor/eigen/Eigen/src/Core/arch/SYCL/TypeCasting.h +85 -0
  171. data/vendor/eigen/Eigen/src/Core/arch/ZVector/Complex.h +212 -183
  172. data/vendor/eigen/Eigen/src/Core/arch/ZVector/MathFunctions.h +101 -5
  173. data/vendor/eigen/Eigen/src/Core/arch/ZVector/PacketMath.h +510 -395
  174. data/vendor/eigen/Eigen/src/Core/functors/AssignmentFunctors.h +11 -2
  175. data/vendor/eigen/Eigen/src/Core/functors/BinaryFunctors.h +112 -46
  176. data/vendor/eigen/Eigen/src/Core/functors/NullaryFunctors.h +31 -30
  177. data/vendor/eigen/Eigen/src/Core/functors/StlFunctors.h +32 -2
  178. data/vendor/eigen/Eigen/src/Core/functors/UnaryFunctors.h +355 -16
  179. data/vendor/eigen/Eigen/src/Core/products/GeneralBlockPanelKernel.h +1075 -586
  180. data/vendor/eigen/Eigen/src/Core/products/GeneralMatrixMatrix.h +49 -24
  181. data/vendor/eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h +41 -35
  182. data/vendor/eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h +6 -6
  183. data/vendor/eigen/Eigen/src/Core/products/GeneralMatrixMatrix_BLAS.h +4 -2
  184. data/vendor/eigen/Eigen/src/Core/products/GeneralMatrixVector.h +382 -483
  185. data/vendor/eigen/Eigen/src/Core/products/Parallelizer.h +22 -5
  186. data/vendor/eigen/Eigen/src/Core/products/SelfadjointMatrixMatrix.h +53 -30
  187. data/vendor/eigen/Eigen/src/Core/products/SelfadjointMatrixMatrix_BLAS.h +16 -8
  188. data/vendor/eigen/Eigen/src/Core/products/SelfadjointMatrixVector.h +8 -6
  189. data/vendor/eigen/Eigen/src/Core/products/SelfadjointProduct.h +4 -4
  190. data/vendor/eigen/Eigen/src/Core/products/SelfadjointRank2Update.h +5 -4
  191. data/vendor/eigen/Eigen/src/Core/products/TriangularMatrixMatrix.h +33 -27
  192. data/vendor/eigen/Eigen/src/Core/products/TriangularMatrixMatrix_BLAS.h +14 -12
  193. data/vendor/eigen/Eigen/src/Core/products/TriangularSolverMatrix.h +36 -34
  194. data/vendor/eigen/Eigen/src/Core/products/TriangularSolverMatrix_BLAS.h +8 -4
  195. data/vendor/eigen/Eigen/src/Core/products/TriangularSolverVector.h +13 -10
  196. data/vendor/eigen/Eigen/src/Core/util/BlasUtil.h +304 -119
  197. data/vendor/eigen/Eigen/src/Core/util/ConfigureVectorization.h +512 -0
  198. data/vendor/eigen/Eigen/src/Core/util/Constants.h +25 -9
  199. data/vendor/eigen/Eigen/src/Core/util/DisableStupidWarnings.h +26 -3
  200. data/vendor/eigen/Eigen/src/Core/util/ForwardDeclarations.h +29 -9
  201. data/vendor/eigen/Eigen/src/Core/util/IndexedViewHelper.h +186 -0
  202. data/vendor/eigen/Eigen/src/Core/util/IntegralConstant.h +272 -0
  203. data/vendor/eigen/Eigen/src/Core/util/MKL_support.h +8 -1
  204. data/vendor/eigen/Eigen/src/Core/util/Macros.h +709 -246
  205. data/vendor/eigen/Eigen/src/Core/util/Memory.h +222 -52
  206. data/vendor/eigen/Eigen/src/Core/util/Meta.h +355 -77
  207. data/vendor/eigen/Eigen/src/Core/util/ReenableStupidWarnings.h +5 -1
  208. data/vendor/eigen/Eigen/src/Core/util/ReshapedHelper.h +51 -0
  209. data/vendor/eigen/Eigen/src/Core/util/StaticAssert.h +8 -5
  210. data/vendor/eigen/Eigen/src/Core/util/SymbolicIndex.h +293 -0
  211. data/vendor/eigen/Eigen/src/Core/util/XprHelper.h +65 -30
  212. data/vendor/eigen/Eigen/src/Eigenvalues/ComplexEigenSolver.h +1 -1
  213. data/vendor/eigen/Eigen/src/Eigenvalues/ComplexSchur.h +7 -4
  214. data/vendor/eigen/Eigen/src/Eigenvalues/EigenSolver.h +2 -2
  215. data/vendor/eigen/Eigen/src/Eigenvalues/GeneralizedSelfAdjointEigenSolver.h +1 -1
  216. data/vendor/eigen/Eigen/src/Eigenvalues/HessenbergDecomposition.h +2 -2
  217. data/vendor/eigen/Eigen/src/Eigenvalues/MatrixBaseEigenvalues.h +2 -2
  218. data/vendor/eigen/Eigen/src/Eigenvalues/RealQZ.h +9 -6
  219. data/vendor/eigen/Eigen/src/Eigenvalues/RealSchur.h +21 -9
  220. data/vendor/eigen/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h +77 -43
  221. data/vendor/eigen/Eigen/src/Eigenvalues/Tridiagonalization.h +20 -15
  222. data/vendor/eigen/Eigen/src/Geometry/AlignedBox.h +99 -5
  223. data/vendor/eigen/Eigen/src/Geometry/AngleAxis.h +4 -4
  224. data/vendor/eigen/Eigen/src/Geometry/EulerAngles.h +3 -3
  225. data/vendor/eigen/Eigen/src/Geometry/Homogeneous.h +15 -11
  226. data/vendor/eigen/Eigen/src/Geometry/Hyperplane.h +1 -1
  227. data/vendor/eigen/Eigen/src/Geometry/OrthoMethods.h +3 -2
  228. data/vendor/eigen/Eigen/src/Geometry/ParametrizedLine.h +39 -2
  229. data/vendor/eigen/Eigen/src/Geometry/Quaternion.h +70 -14
  230. data/vendor/eigen/Eigen/src/Geometry/Rotation2D.h +3 -3
  231. data/vendor/eigen/Eigen/src/Geometry/Scaling.h +23 -5
  232. data/vendor/eigen/Eigen/src/Geometry/Transform.h +88 -67
  233. data/vendor/eigen/Eigen/src/Geometry/Translation.h +6 -12
  234. data/vendor/eigen/Eigen/src/Geometry/Umeyama.h +1 -1
  235. data/vendor/eigen/Eigen/src/Geometry/arch/Geometry_SIMD.h +168 -0
  236. data/vendor/eigen/Eigen/src/Householder/BlockHouseholder.h +9 -2
  237. data/vendor/eigen/Eigen/src/Householder/Householder.h +8 -4
  238. data/vendor/eigen/Eigen/src/Householder/HouseholderSequence.h +123 -48
  239. data/vendor/eigen/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h +15 -15
  240. data/vendor/eigen/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h +7 -23
  241. data/vendor/eigen/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h +5 -22
  242. data/vendor/eigen/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h +41 -47
  243. data/vendor/eigen/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h +51 -60
  244. data/vendor/eigen/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h +70 -20
  245. data/vendor/eigen/Eigen/src/IterativeLinearSolvers/LeastSquareConjugateGradient.h +2 -20
  246. data/vendor/eigen/Eigen/src/IterativeLinearSolvers/SolveWithGuess.h +11 -9
  247. data/vendor/eigen/Eigen/src/Jacobi/Jacobi.h +31 -10
  248. data/vendor/eigen/Eigen/src/KLUSupport/KLUSupport.h +358 -0
  249. data/vendor/eigen/Eigen/src/LU/Determinant.h +35 -19
  250. data/vendor/eigen/Eigen/src/LU/FullPivLU.h +29 -43
  251. data/vendor/eigen/Eigen/src/LU/InverseImpl.h +25 -8
  252. data/vendor/eigen/Eigen/src/LU/PartialPivLU.h +71 -58
  253. data/vendor/eigen/Eigen/src/LU/arch/InverseSize4.h +351 -0
  254. data/vendor/eigen/Eigen/src/OrderingMethods/Amd.h +7 -17
  255. data/vendor/eigen/Eigen/src/OrderingMethods/Eigen_Colamd.h +297 -277
  256. data/vendor/eigen/Eigen/src/OrderingMethods/Ordering.h +6 -10
  257. data/vendor/eigen/Eigen/src/PaStiXSupport/PaStiXSupport.h +1 -1
  258. data/vendor/eigen/Eigen/src/PardisoSupport/PardisoSupport.h +9 -7
  259. data/vendor/eigen/Eigen/src/QR/ColPivHouseholderQR.h +41 -20
  260. data/vendor/eigen/Eigen/src/QR/CompleteOrthogonalDecomposition.h +100 -27
  261. data/vendor/eigen/Eigen/src/QR/FullPivHouseholderQR.h +59 -22
  262. data/vendor/eigen/Eigen/src/QR/HouseholderQR.h +48 -23
  263. data/vendor/eigen/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h +25 -3
  264. data/vendor/eigen/Eigen/src/SVD/BDCSVD.h +183 -63
  265. data/vendor/eigen/Eigen/src/SVD/JacobiSVD.h +22 -14
  266. data/vendor/eigen/Eigen/src/SVD/SVDBase.h +83 -22
  267. data/vendor/eigen/Eigen/src/SVD/UpperBidiagonalization.h +3 -3
  268. data/vendor/eigen/Eigen/src/SparseCholesky/SimplicialCholesky.h +17 -9
  269. data/vendor/eigen/Eigen/src/SparseCholesky/SimplicialCholesky_impl.h +12 -37
  270. data/vendor/eigen/Eigen/src/SparseCore/AmbiVector.h +3 -2
  271. data/vendor/eigen/Eigen/src/SparseCore/CompressedStorage.h +16 -0
  272. data/vendor/eigen/Eigen/src/SparseCore/ConservativeSparseSparseProduct.h +6 -6
  273. data/vendor/eigen/Eigen/src/SparseCore/SparseAssign.h +81 -27
  274. data/vendor/eigen/Eigen/src/SparseCore/SparseBlock.h +25 -57
  275. data/vendor/eigen/Eigen/src/SparseCore/SparseCompressedBase.h +40 -11
  276. data/vendor/eigen/Eigen/src/SparseCore/SparseCwiseBinaryOp.h +11 -15
  277. data/vendor/eigen/Eigen/src/SparseCore/SparseCwiseUnaryOp.h +4 -2
  278. data/vendor/eigen/Eigen/src/SparseCore/SparseDenseProduct.h +30 -8
  279. data/vendor/eigen/Eigen/src/SparseCore/SparseMatrix.h +126 -11
  280. data/vendor/eigen/Eigen/src/SparseCore/SparseMatrixBase.h +5 -12
  281. data/vendor/eigen/Eigen/src/SparseCore/SparseProduct.h +13 -1
  282. data/vendor/eigen/Eigen/src/SparseCore/SparseRef.h +7 -7
  283. data/vendor/eigen/Eigen/src/SparseCore/SparseSelfAdjointView.h +5 -2
  284. data/vendor/eigen/Eigen/src/SparseCore/SparseUtil.h +8 -0
  285. data/vendor/eigen/Eigen/src/SparseCore/SparseVector.h +1 -1
  286. data/vendor/eigen/Eigen/src/SparseCore/SparseView.h +1 -0
  287. data/vendor/eigen/Eigen/src/SparseLU/SparseLU.h +162 -12
  288. data/vendor/eigen/Eigen/src/SparseLU/SparseLU_Memory.h +1 -1
  289. data/vendor/eigen/Eigen/src/SparseLU/SparseLU_SupernodalMatrix.h +76 -2
  290. data/vendor/eigen/Eigen/src/SparseLU/SparseLU_column_dfs.h +2 -2
  291. data/vendor/eigen/Eigen/src/SparseLU/SparseLU_gemm_kernel.h +1 -1
  292. data/vendor/eigen/Eigen/src/SparseLU/SparseLU_panel_bmod.h +1 -1
  293. data/vendor/eigen/Eigen/src/SparseQR/SparseQR.h +19 -6
  294. data/vendor/eigen/Eigen/src/StlSupport/StdDeque.h +2 -12
  295. data/vendor/eigen/Eigen/src/StlSupport/StdList.h +2 -2
  296. data/vendor/eigen/Eigen/src/StlSupport/StdVector.h +2 -2
  297. data/vendor/eigen/Eigen/src/SuperLUSupport/SuperLUSupport.h +6 -8
  298. data/vendor/eigen/Eigen/src/UmfPackSupport/UmfPackSupport.h +175 -39
  299. data/vendor/eigen/Eigen/src/misc/lapacke.h +5 -4
  300. data/vendor/eigen/Eigen/src/plugins/ArrayCwiseBinaryOps.h +28 -2
  301. data/vendor/eigen/Eigen/src/plugins/ArrayCwiseUnaryOps.h +155 -11
  302. data/vendor/eigen/Eigen/src/plugins/BlockMethods.h +626 -242
  303. data/vendor/eigen/Eigen/src/plugins/CommonCwiseUnaryOps.h +14 -0
  304. data/vendor/eigen/Eigen/src/plugins/IndexedViewMethods.h +262 -0
  305. data/vendor/eigen/Eigen/src/plugins/MatrixCwiseBinaryOps.h +4 -4
  306. data/vendor/eigen/Eigen/src/plugins/MatrixCwiseUnaryOps.h +10 -0
  307. data/vendor/eigen/Eigen/src/plugins/ReshapedMethods.h +149 -0
  308. data/vendor/eigen/README.md +2 -0
  309. data/vendor/eigen/bench/btl/README +1 -1
  310. data/vendor/eigen/bench/tensors/README +6 -7
  311. data/vendor/eigen/ci/README.md +56 -0
  312. data/vendor/eigen/demos/mix_eigen_and_c/README +1 -1
  313. data/vendor/eigen/unsupported/Eigen/CXX11/src/Tensor/README.md +213 -158
  314. data/vendor/eigen/unsupported/README.txt +1 -1
  315. data/vendor/tomotopy/README.kr.rst +21 -0
  316. data/vendor/tomotopy/README.rst +20 -0
  317. data/vendor/tomotopy/src/Labeling/FoRelevance.cpp +2 -2
  318. data/vendor/tomotopy/src/Labeling/Phraser.hpp +1 -1
  319. data/vendor/tomotopy/src/TopicModel/CTModel.hpp +2 -1
  320. data/vendor/tomotopy/src/TopicModel/DMRModel.hpp +2 -1
  321. data/vendor/tomotopy/src/TopicModel/DTModel.hpp +1 -1
  322. data/vendor/tomotopy/src/TopicModel/GDMRModel.hpp +2 -2
  323. data/vendor/tomotopy/src/TopicModel/HDP.h +1 -0
  324. data/vendor/tomotopy/src/TopicModel/HDPModel.hpp +53 -2
  325. data/vendor/tomotopy/src/TopicModel/HLDAModel.hpp +1 -1
  326. data/vendor/tomotopy/src/TopicModel/HPAModel.hpp +1 -0
  327. data/vendor/tomotopy/src/TopicModel/LDACVB0Model.hpp +2 -2
  328. data/vendor/tomotopy/src/TopicModel/LDAModel.hpp +16 -5
  329. data/vendor/tomotopy/src/TopicModel/LLDAModel.hpp +1 -0
  330. data/vendor/tomotopy/src/TopicModel/MGLDAModel.hpp +1 -0
  331. data/vendor/tomotopy/src/TopicModel/PLDAModel.hpp +1 -0
  332. data/vendor/tomotopy/src/TopicModel/PT.h +3 -1
  333. data/vendor/tomotopy/src/TopicModel/PTModel.hpp +31 -1
  334. data/vendor/tomotopy/src/TopicModel/SLDAModel.hpp +2 -2
  335. data/vendor/tomotopy/src/TopicModel/TopicModel.hpp +7 -5
  336. data/vendor/tomotopy/src/Utils/EigenAddonOps.hpp +36 -1
  337. data/vendor/tomotopy/src/Utils/exception.h +6 -0
  338. data/vendor/tomotopy/src/Utils/sample.hpp +14 -12
  339. data/vendor/tomotopy/src/Utils/sse_gamma.h +0 -3
  340. metadata +60 -14
  341. data/vendor/eigen/Eigen/CMakeLists.txt +0 -19
  342. data/vendor/eigen/Eigen/src/Core/arch/CUDA/Half.h +0 -674
  343. data/vendor/eigen/Eigen/src/Core/arch/CUDA/PacketMath.h +0 -333
  344. data/vendor/eigen/Eigen/src/Core/arch/CUDA/PacketMathHalf.h +0 -1124
  345. data/vendor/eigen/Eigen/src/Core/arch/CUDA/TypeCasting.h +0 -212
  346. data/vendor/eigen/Eigen/src/Geometry/arch/Geometry_SSE.h +0 -161
  347. data/vendor/eigen/Eigen/src/LU/arch/Inverse_SSE.h +0 -338
@@ -0,0 +1,1685 @@
1
+ // This file is part of Eigen, a lightweight C++ template library
2
+ // for linear algebra.
3
+ //
4
+ // Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
5
+ //
6
+ // This Source Code Form is subject to the terms of the Mozilla
7
+ // Public License v. 2.0. If a copy of the MPL was not distributed
8
+ // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
9
+
10
+ #ifndef EIGEN_PACKET_MATH_GPU_H
11
+ #define EIGEN_PACKET_MATH_GPU_H
12
+
13
+ namespace Eigen {
14
+
15
+ namespace internal {
16
+
17
+ // Read-only data cached load available: defined for any HIP device compilation, or for CUDA when EIGEN_CUDA_ARCH >= 350.
18
+ #if defined(EIGEN_HIP_DEVICE_COMPILE) || (defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350)
19
+ #define EIGEN_GPU_HAS_LDG 1
20
+ #endif
21
+
22
+ // FP16 math available on CUDA: defined only when EIGEN_CUDA_ARCH >= 530.
23
+ #if (defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530)
24
+ #define EIGEN_CUDA_HAS_FP16_ARITHMETIC 1
25
+ #endif
26
+
27
+ #if defined(EIGEN_HIP_DEVICE_COMPILE) || defined(EIGEN_CUDA_HAS_FP16_ARITHMETIC)
28
+ #define EIGEN_GPU_HAS_FP16_ARITHMETIC 1
29
+ #endif // Backend-neutral FP16 flag: set for HIP device compilation or when the CUDA FP16 flag above is set.
30
+
31
+ // Make sure this is only available when targeting a GPU: we don't want to
32
+ // introduce conflicts between these packet_traits definitions and the ones
33
+ // we'll use on the host side (SSE, AVX, ...)
34
+ #if defined(EIGEN_GPUCC) && defined(EIGEN_USE_GPU)
35
+
36
+ template<> struct is_arithmetic<float4> { enum { value = true }; }; // Flags the float4 vector type as arithmetic.
37
+ template<> struct is_arithmetic<double2> { enum { value = true }; }; // Flags the double2 vector type as arithmetic.
38
+
39
+ template<> struct packet_traits<float> : default_packet_traits // Packet traits for float on the GPU; entries below override the inherited defaults.
40
+ {
41
+ typedef float4 type; // Full packet type: four floats.
42
+ typedef float4 half; // Same type as the full packet; see HasHalfPacket = 0 below.
43
+ enum {
44
+ Vectorizable = 1,
45
+ AlignedOnScalar = 1,
46
+ size=4, // Number of float components in float4.
47
+ HasHalfPacket = 0, // No distinct half-size packet is declared.
48
+
49
+ HasDiv = 1,
50
+ HasSin = 0, // Sine is explicitly switched off for this packet type.
51
+ HasCos = 0, // Cosine is explicitly switched off for this packet type.
52
+ HasLog = 1,
53
+ HasExp = 1,
54
+ HasSqrt = 1,
55
+ HasRsqrt = 1,
56
+ HasLGamma = 1,
57
+ HasDiGamma = 1,
58
+ HasZeta = 1,
59
+ HasPolygamma = 1,
60
+ HasErf = 1,
61
+ HasErfc = 1,
62
+ HasNdtri = 1,
63
+ HasBessel = 1,
64
+ HasIGamma = 1,
65
+ HasIGammaDerA = 1,
66
+ HasGammaSampleDerAlpha = 1,
67
+ HasIGammac = 1,
68
+ HasBetaInc = 1,
69
+
70
+ HasBlend = 0, // Blend is explicitly switched off.
71
+ HasFloor = 1,
72
+ };
73
+ };
74
+
75
+ template<> struct packet_traits<double> : default_packet_traits // Packet traits for double on the GPU; entries below override the inherited defaults.
76
+ {
77
+ typedef double2 type; // Full packet type: two doubles.
78
+ typedef double2 half; // Same type as the full packet; see HasHalfPacket = 0 below.
79
+ enum {
80
+ Vectorizable = 1,
81
+ AlignedOnScalar = 1,
82
+ size=2, // Number of double components in double2.
83
+ HasHalfPacket = 0, // No distinct half-size packet is declared.
84
+
85
+ HasDiv = 1, // Unlike the float specialization, HasSin/HasCos are not overridden in this enum.
86
+ HasLog = 1,
87
+ HasExp = 1,
88
+ HasSqrt = 1,
89
+ HasRsqrt = 1,
90
+ HasLGamma = 1,
91
+ HasDiGamma = 1,
92
+ HasZeta = 1,
93
+ HasPolygamma = 1,
94
+ HasErf = 1,
95
+ HasErfc = 1,
96
+ HasNdtri = 1,
97
+ HasBessel = 1,
98
+ HasIGamma = 1,
99
+ HasIGammaDerA = 1,
100
+ HasGammaSampleDerAlpha = 1,
101
+ HasIGammac = 1,
102
+ HasBetaInc = 1,
103
+
104
+ HasBlend = 0, // Blend is explicitly switched off.
105
+ HasFloor = 1,
106
+ };
107
+ };
108
+
109
+
110
+ template<> struct unpacket_traits<float4> { typedef float type; enum {size=4, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef float4 half; }; // Reverse mapping float4 -> float: 4 components, Aligned16, no masked load/store.
111
+ template<> struct unpacket_traits<double2> { typedef double type; enum {size=2, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef double2 half; }; // Reverse mapping double2 -> double: 2 components, Aligned16, no masked load/store.
112
+
113
+ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pset1<float4>(const float& from) { // Broadcast: returns a float4 whose four components all equal `from`.
114
+ return make_float4(from, from, from, from);
115
+ }
116
+ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pset1<double2>(const double& from) { // Broadcast: returns a double2 whose two components both equal `from`.
117
+ return make_double2(from, from);
118
+ }
119
+
120
+ // We need to distinguish ‘clang as the CUDA compiler’ from ‘clang as the host compiler,
121
+ // invoked by NVCC’ (e.g. on macOS). The former needs to see both the host and device implementations
122
+ // of the functions, while the latter can only deal with one of them.
123
+ #if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIPCC) || (defined(EIGEN_CUDACC) && EIGEN_COMP_CLANG && !EIGEN_COMP_NVCC)
124
+ namespace { // File-local scalar helpers used by the packet-level pand/por/pxor/pandnot/pcmp_* specializations.
125
+
126
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float bitwise_and(const float& a,
127
+ const float& b) { // ANDs the integer bit patterns of a and b and returns the result viewed as a float again.
128
+ return __int_as_float(__float_as_int(a) & __float_as_int(b));
129
+ }
130
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bitwise_and(const double& a,
131
+ const double& b) { // Double overload: same AND, going through the long long view of each operand.
132
+ return __longlong_as_double(__double_as_longlong(a) &
133
+ __double_as_longlong(b));
134
+ }
135
+
136
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float bitwise_or(const float& a,
137
+ const float& b) { // ORs the integer bit patterns of a and b and returns the result viewed as a float again.
138
+ return __int_as_float(__float_as_int(a) | __float_as_int(b));
139
+ }
140
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bitwise_or(const double& a,
141
+ const double& b) { // Double overload: same OR, going through the long long view of each operand.
142
+ return __longlong_as_double(__double_as_longlong(a) |
143
+ __double_as_longlong(b));
144
+ }
145
+
146
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float bitwise_xor(const float& a,
147
+ const float& b) { // XORs the integer bit patterns of a and b and returns the result viewed as a float again.
148
+ return __int_as_float(__float_as_int(a) ^ __float_as_int(b));
149
+ }
150
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bitwise_xor(const double& a,
151
+ const double& b) { // Double overload: same XOR, going through the long long view of each operand.
152
+ return __longlong_as_double(__double_as_longlong(a) ^
153
+ __double_as_longlong(b));
154
+ }
155
+
156
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float bitwise_andnot(const float& a,
157
+ const float& b) { // Computes a & ~b on the bit patterns: the complement is applied to the SECOND operand.
158
+ return __int_as_float(__float_as_int(a) & ~__float_as_int(b));
159
+ }
160
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bitwise_andnot(const double& a,
161
+ const double& b) { // Double overload of a & ~b; again only b is complemented.
162
+ return __longlong_as_double(__double_as_longlong(a) &
163
+ ~__double_as_longlong(b));
164
+ }
165
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float eq_mask(const float& a,
166
+ const float& b) { // Equality mask: the 0xffffffff pattern viewed as float when a == b, otherwise the zero pattern.
167
+ return __int_as_float(a == b ? 0xffffffffu : 0u);
168
+ }
169
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double eq_mask(const double& a,
170
+ const double& b) { // Double overload: the 64-bit all-ones pattern when a == b, otherwise the zero pattern.
171
+ return __longlong_as_double(a == b ? 0xffffffffffffffffull : 0ull);
172
+ }
173
+
174
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float lt_mask(const float& a,
175
+ const float& b) { // Less-than mask: the 0xffffffff pattern viewed as float when a < b, otherwise the zero pattern.
176
+ return __int_as_float(a < b ? 0xffffffffu : 0u);
177
+ }
178
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double lt_mask(const double& a,
179
+ const double& b) { // Double overload: the 64-bit all-ones pattern when a < b, otherwise the zero pattern.
180
+ return __longlong_as_double(a < b ? 0xffffffffffffffffull : 0ull);
181
+ }
182
+
183
+ } // namespace
184
+
185
+ template <>
186
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pand<float4>(const float4& a,
187
+ const float4& b) { // Component-wise bitwise AND of two float4 packets (x, y, z, w), via bitwise_and.
188
+ return make_float4(bitwise_and(a.x, b.x), bitwise_and(a.y, b.y),
189
+ bitwise_and(a.z, b.z), bitwise_and(a.w, b.w));
190
+ }
191
+ template <>
192
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pand<double2>(const double2& a,
193
+ const double2& b) { // Component-wise bitwise AND of two double2 packets (x, y), via bitwise_and.
194
+ return make_double2(bitwise_and(a.x, b.x), bitwise_and(a.y, b.y));
195
+ }
196
+
197
+ template <>
198
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 por<float4>(const float4& a,
199
+ const float4& b) { // Component-wise bitwise OR of two float4 packets (x, y, z, w), via bitwise_or.
200
+ return make_float4(bitwise_or(a.x, b.x), bitwise_or(a.y, b.y),
201
+ bitwise_or(a.z, b.z), bitwise_or(a.w, b.w));
202
+ }
203
+ template <>
204
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 por<double2>(const double2& a,
205
+ const double2& b) { // Component-wise bitwise OR of two double2 packets (x, y), via bitwise_or.
206
+ return make_double2(bitwise_or(a.x, b.x), bitwise_or(a.y, b.y));
207
+ }
208
+
209
+ template <>
210
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pxor<float4>(const float4& a,
211
+ const float4& b) { // Component-wise bitwise XOR of two float4 packets (x, y, z, w), via bitwise_xor.
212
+ return make_float4(bitwise_xor(a.x, b.x), bitwise_xor(a.y, b.y),
213
+ bitwise_xor(a.z, b.z), bitwise_xor(a.w, b.w));
214
+ }
215
+ template <>
216
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pxor<double2>(const double2& a,
217
+ const double2& b) { // Component-wise bitwise XOR of two double2 packets (x, y), via bitwise_xor.
218
+ return make_double2(bitwise_xor(a.x, b.x), bitwise_xor(a.y, b.y));
219
+ }
220
+
221
+ template <>
222
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pandnot<float4>(const float4& a,
223
+ const float4& b) { // Component-wise a & ~b for float4 packets; bitwise_andnot complements the component taken from b.
224
+ return make_float4(bitwise_andnot(a.x, b.x), bitwise_andnot(a.y, b.y),
225
+ bitwise_andnot(a.z, b.z), bitwise_andnot(a.w, b.w));
226
+ }
227
+ template <>
228
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2
229
+ pandnot<double2>(const double2& a, const double2& b) { // Component-wise a & ~b for double2 packets; only b's component is complemented.
230
+ return make_double2(bitwise_andnot(a.x, b.x), bitwise_andnot(a.y, b.y));
231
+ }
232
+
233
+ template <>
234
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pcmp_eq<float4>(const float4& a,
235
+ const float4& b) { // Component-wise equality: each output component is eq_mask of the matching inputs.
236
+ return make_float4(eq_mask(a.x, b.x), eq_mask(a.y, b.y), eq_mask(a.z, b.z),
237
+ eq_mask(a.w, b.w));
238
+ }
239
+ template <>
240
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pcmp_lt<float4>(const float4& a,
241
+ const float4& b) { // Component-wise a < b: each output component is lt_mask of the matching inputs.
242
+ return make_float4(lt_mask(a.x, b.x), lt_mask(a.y, b.y), lt_mask(a.z, b.z),
243
+ lt_mask(a.w, b.w));
244
+ }
245
+ template <>
246
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2
247
+ pcmp_eq<double2>(const double2& a, const double2& b) { // double2 equality mask, built per component with eq_mask.
248
+ return make_double2(eq_mask(a.x, b.x), eq_mask(a.y, b.y));
249
+ }
250
+ template <>
251
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2
252
+ pcmp_lt<double2>(const double2& a, const double2& b) { // double2 less-than mask, built per component with lt_mask.
253
+ return make_double2(lt_mask(a.x, b.x), lt_mask(a.y, b.y));
254
+ }
255
+ #endif // defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIPCC) || (defined(EIGEN_CUDACC) && EIGEN_COMP_CLANG && !EIGEN_COMP_NVCC)
256
+
257
+ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 plset<float4>(const float& a) {
258
+ return make_float4(a, a+1, a+2, a+3);
259
+ }
260
+ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 plset<double2>(const double& a) {
261
+ return make_double2(a, a+1);
262
+ }
263
+
264
+ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 padd<float4>(const float4& a, const float4& b) {
265
+ return make_float4(a.x+b.x, a.y+b.y, a.z+b.z, a.w+b.w);
266
+ }
267
+ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 padd<double2>(const double2& a, const double2& b) {
268
+ return make_double2(a.x+b.x, a.y+b.y);
269
+ }
270
+
271
+ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 psub<float4>(const float4& a, const float4& b) {
272
+ return make_float4(a.x-b.x, a.y-b.y, a.z-b.z, a.w-b.w);
273
+ }
274
+ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 psub<double2>(const double2& a, const double2& b) {
275
+ return make_double2(a.x-b.x, a.y-b.y);
276
+ }
277
+
278
+ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pnegate(const float4& a) {
279
+ return make_float4(-a.x, -a.y, -a.z, -a.w);
280
+ }
281
+ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pnegate(const double2& a) {
282
+ return make_double2(-a.x, -a.y);
283
+ }
284
+
285
+ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pconj(const float4& a) { return a; }
286
+ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pconj(const double2& a) { return a; }
287
+
288
+ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pmul<float4>(const float4& a, const float4& b) {
289
+ return make_float4(a.x*b.x, a.y*b.y, a.z*b.z, a.w*b.w);
290
+ }
291
+ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pmul<double2>(const double2& a, const double2& b) {
292
+ return make_double2(a.x*b.x, a.y*b.y);
293
+ }
294
+
295
+ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pdiv<float4>(const float4& a, const float4& b) {
296
+ return make_float4(a.x/b.x, a.y/b.y, a.z/b.z, a.w/b.w);
297
+ }
298
+ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pdiv<double2>(const double2& a, const double2& b) {
299
+ return make_double2(a.x/b.x, a.y/b.y);
300
+ }
301
+
302
+ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pmin<float4>(const float4& a, const float4& b) {
303
+ return make_float4(fminf(a.x, b.x), fminf(a.y, b.y), fminf(a.z, b.z), fminf(a.w, b.w));
304
+ }
305
+ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pmin<double2>(const double2& a, const double2& b) {
306
+ return make_double2(fmin(a.x, b.x), fmin(a.y, b.y));
307
+ }
308
+
309
+ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pmax<float4>(const float4& a, const float4& b) {
310
+ return make_float4(fmaxf(a.x, b.x), fmaxf(a.y, b.y), fmaxf(a.z, b.z), fmaxf(a.w, b.w));
311
+ }
312
+ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pmax<double2>(const double2& a, const double2& b) {
313
+ return make_double2(fmax(a.x, b.x), fmax(a.y, b.y));
314
+ }
315
+
316
+ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pload<float4>(const float* from) {
317
+ return *reinterpret_cast<const float4*>(from);
318
+ }
319
+
320
+ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pload<double2>(const double* from) {
321
+ return *reinterpret_cast<const double2*>(from);
322
+ }
323
+
324
+ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 ploadu<float4>(const float* from) {
325
+ return make_float4(from[0], from[1], from[2], from[3]);
326
+ }
327
+ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 ploadu<double2>(const double* from) {
328
+ return make_double2(from[0], from[1]);
329
+ }
330
+
331
+ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 ploaddup<float4>(const float* from) {
332
+ return make_float4(from[0], from[0], from[1], from[1]);
333
+ }
334
+ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 ploaddup<double2>(const double* from) {
335
+ return make_double2(from[0], from[0]);
336
+ }
337
+
338
+ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore<float>(float* to, const float4& from) {
339
+ *reinterpret_cast<float4*>(to) = from;
340
+ }
341
+
342
+ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore<double>(double* to, const double2& from) {
343
+ *reinterpret_cast<double2*>(to) = from;
344
+ }
345
+
346
+ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const float4& from) {
347
+ to[0] = from.x;
348
+ to[1] = from.y;
349
+ to[2] = from.z;
350
+ to[3] = from.w;
351
+ }
352
+
353
+ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const double2& from) {
354
+ to[0] = from.x;
355
+ to[1] = from.y;
356
+ }
357
+
358
+ template<>
359
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Aligned>(const float* from) {
360
+ #if defined(EIGEN_GPU_HAS_LDG)
361
+ return __ldg((const float4*)from);
362
+ #else
363
+ return make_float4(from[0], from[1], from[2], from[3]);
364
+ #endif
365
+ }
366
+ template<>
367
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro<double2, Aligned>(const double* from) {
368
+ #if defined(EIGEN_GPU_HAS_LDG)
369
+ return __ldg((const double2*)from);
370
+ #else
371
+ return make_double2(from[0], from[1]);
372
+ #endif
373
+ }
374
+
375
+ template<>
376
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Unaligned>(const float* from) {
377
+ #if defined(EIGEN_GPU_HAS_LDG)
378
+ return make_float4(__ldg(from+0), __ldg(from+1), __ldg(from+2), __ldg(from+3));
379
+ #else
380
+ return make_float4(from[0], from[1], from[2], from[3]);
381
+ #endif
382
+ }
383
+ template<>
384
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro<double2, Unaligned>(const double* from) {
385
+ #if defined(EIGEN_GPU_HAS_LDG)
386
+ return make_double2(__ldg(from+0), __ldg(from+1));
387
+ #else
388
+ return make_double2(from[0], from[1]);
389
+ #endif
390
+ }
391
+
392
+ template<> EIGEN_DEVICE_FUNC inline float4 pgather<float, float4>(const float* from, Index stride) {
393
+ return make_float4(from[0*stride], from[1*stride], from[2*stride], from[3*stride]);
394
+ }
395
+
396
+ template<> EIGEN_DEVICE_FUNC inline double2 pgather<double, double2>(const double* from, Index stride) {
397
+ return make_double2(from[0*stride], from[1*stride]);
398
+ }
399
+
400
+ template<> EIGEN_DEVICE_FUNC inline void pscatter<float, float4>(float* to, const float4& from, Index stride) {
401
+ to[stride*0] = from.x;
402
+ to[stride*1] = from.y;
403
+ to[stride*2] = from.z;
404
+ to[stride*3] = from.w;
405
+ }
406
+ template<> EIGEN_DEVICE_FUNC inline void pscatter<double, double2>(double* to, const double2& from, Index stride) {
407
+ to[stride*0] = from.x;
408
+ to[stride*1] = from.y;
409
+ }
410
+
411
+ template<> EIGEN_DEVICE_FUNC inline float pfirst<float4>(const float4& a) {
412
+ return a.x;
413
+ }
414
+ template<> EIGEN_DEVICE_FUNC inline double pfirst<double2>(const double2& a) {
415
+ return a.x;
416
+ }
417
+
418
+ template<> EIGEN_DEVICE_FUNC inline float predux<float4>(const float4& a) {
419
+ return a.x + a.y + a.z + a.w;
420
+ }
421
+ template<> EIGEN_DEVICE_FUNC inline double predux<double2>(const double2& a) {
422
+ return a.x + a.y;
423
+ }
424
+
425
+ template<> EIGEN_DEVICE_FUNC inline float predux_max<float4>(const float4& a) {
426
+ return fmaxf(fmaxf(a.x, a.y), fmaxf(a.z, a.w));
427
+ }
428
+ template<> EIGEN_DEVICE_FUNC inline double predux_max<double2>(const double2& a) {
429
+ return fmax(a.x, a.y);
430
+ }
431
+
432
+ template<> EIGEN_DEVICE_FUNC inline float predux_min<float4>(const float4& a) {
433
+ return fminf(fminf(a.x, a.y), fminf(a.z, a.w));
434
+ }
435
+ template<> EIGEN_DEVICE_FUNC inline double predux_min<double2>(const double2& a) {
436
+ return fmin(a.x, a.y);
437
+ }
438
+
439
+ template<> EIGEN_DEVICE_FUNC inline float predux_mul<float4>(const float4& a) {
440
+ return a.x * a.y * a.z * a.w;
441
+ }
442
+ template<> EIGEN_DEVICE_FUNC inline double predux_mul<double2>(const double2& a) {
443
+ return a.x * a.y;
444
+ }
445
+
446
+ template<> EIGEN_DEVICE_FUNC inline float4 pabs<float4>(const float4& a) {
447
+ return make_float4(fabsf(a.x), fabsf(a.y), fabsf(a.z), fabsf(a.w));
448
+ }
449
+ template<> EIGEN_DEVICE_FUNC inline double2 pabs<double2>(const double2& a) {
450
+ return make_double2(fabs(a.x), fabs(a.y));
451
+ }
452
+
453
+ template<> EIGEN_DEVICE_FUNC inline float4 pfloor<float4>(const float4& a) {
454
+ return make_float4(floorf(a.x), floorf(a.y), floorf(a.z), floorf(a.w));
455
+ }
456
+ template<> EIGEN_DEVICE_FUNC inline double2 pfloor<double2>(const double2& a) {
457
+ return make_double2(floor(a.x), floor(a.y));
458
+ }
459
+
460
+ EIGEN_DEVICE_FUNC inline void
461
+ ptranspose(PacketBlock<float4,4>& kernel) {
462
+ float tmp = kernel.packet[0].y;
463
+ kernel.packet[0].y = kernel.packet[1].x;
464
+ kernel.packet[1].x = tmp;
465
+
466
+ tmp = kernel.packet[0].z;
467
+ kernel.packet[0].z = kernel.packet[2].x;
468
+ kernel.packet[2].x = tmp;
469
+
470
+ tmp = kernel.packet[0].w;
471
+ kernel.packet[0].w = kernel.packet[3].x;
472
+ kernel.packet[3].x = tmp;
473
+
474
+ tmp = kernel.packet[1].z;
475
+ kernel.packet[1].z = kernel.packet[2].y;
476
+ kernel.packet[2].y = tmp;
477
+
478
+ tmp = kernel.packet[1].w;
479
+ kernel.packet[1].w = kernel.packet[3].y;
480
+ kernel.packet[3].y = tmp;
481
+
482
+ tmp = kernel.packet[2].w;
483
+ kernel.packet[2].w = kernel.packet[3].z;
484
+ kernel.packet[3].z = tmp;
485
+ }
486
+
487
+ EIGEN_DEVICE_FUNC inline void
488
+ ptranspose(PacketBlock<double2,2>& kernel) {
489
+ double tmp = kernel.packet[0].y;
490
+ kernel.packet[0].y = kernel.packet[1].x;
491
+ kernel.packet[1].x = tmp;
492
+ }
493
+
494
+ #endif // defined(EIGEN_GPUCC) && defined(EIGEN_USE_GPU)
495
+
496
+ // Packet4h2 must be defined in the macro without EIGEN_CUDA_ARCH, meaning
497
+ // its corresponding packet_traits<Eigen::half> must be visible on host.
498
+ #if defined(EIGEN_HAS_CUDA_FP16) || defined(EIGEN_HAS_HIP_FP16)
499
+
500
+ typedef ulonglong2 Packet4h2;
501
+ template<> struct unpacket_traits<Packet4h2> { typedef Eigen::half type; enum {size=8, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet4h2 half; };
502
+ template<> struct is_arithmetic<Packet4h2> { enum { value = true }; };
503
+
504
+ template<> struct unpacket_traits<half2> { typedef Eigen::half type; enum {size=2, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef half2 half; };
505
+ template<> struct is_arithmetic<half2> { enum { value = true }; };
506
+
507
+ template<> struct packet_traits<Eigen::half> : default_packet_traits
508
+ {
509
+ typedef Packet4h2 type;
510
+ typedef Packet4h2 half;
511
+ enum {
512
+ Vectorizable = 1,
513
+ AlignedOnScalar = 1,
514
+ size=8,
515
+ HasHalfPacket = 0,
516
+ HasAdd = 1,
517
+ HasSub = 1,
518
+ HasMul = 1,
519
+ HasDiv = 1,
520
+ HasSqrt = 1,
521
+ HasRsqrt = 1,
522
+ HasExp = 1,
523
+ HasExpm1 = 1,
524
+ HasLog = 1,
525
+ HasLog1p = 1
526
+ };
527
+ };
528
+
529
+ namespace {
530
+ // This is equivalent to make_half2, which is undocumented and doesn't seem to always exist.
531
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 combine_half(const __half& a, const __half& b) {
532
+ #if defined(EIGEN_GPU_COMPILE_PHASE)
533
+ return __halves2half2(a, b);
534
+ #else
535
+ // Round-about way since __halves2half2 is a __device__ function.
536
+ return __floats2half2_rn(__half2float(a), __half2float(b));
537
+ #endif
538
+ }
539
+
540
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE __half get_half2_low(const half2& a) {
541
+ #if defined(EIGEN_GPU_COMPILE_PHASE)
542
+ return __low2half(a);
543
+ #else
544
+ return __float2half(__low2float(a));
545
+ #endif
546
+ }
547
+
548
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE __half get_half2_high(const half2& a) {
549
+ #if defined(EIGEN_GPU_COMPILE_PHASE)
550
+ return __high2half(a);
551
+ #else
552
+ return __float2half(__high2float(a));
553
+ #endif
554
+ }
555
+ } // namespace
556
+
557
+ template<>
558
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pset1<half2>(const Eigen::half& from) {
559
+ #if defined(EIGEN_GPU_COMPILE_PHASE)
560
+ return __half2half2(from);
561
+ #else
562
+ const float f = __half2float(from);
563
+ return __floats2half2_rn(f, f);
564
+ #endif
565
+ }
566
+
567
+ template <>
568
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2
569
+ pset1<Packet4h2>(const Eigen::half& from) {
570
+ Packet4h2 r;
571
+ half2* p_alias = reinterpret_cast<half2*>(&r);
572
+ p_alias[0] = pset1<half2>(from);
573
+ p_alias[1] = pset1<half2>(from);
574
+ p_alias[2] = pset1<half2>(from);
575
+ p_alias[3] = pset1<half2>(from);
576
+ return r;
577
+ }
578
+
579
+ // We now need this visible on both host and device.
580
+ // #if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIPCC) || (defined(EIGEN_CUDACC) && EIGEN_COMP_CLANG && !EIGEN_COMP_NVCC)
581
+ namespace {
582
+
583
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pload(const Eigen::half* from) {
584
+ return *reinterpret_cast<const half2*>(from);
585
+ }
586
+
587
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ploadu(const Eigen::half* from) {
588
+ return combine_half(from[0], from[1]);
589
+ }
590
+
591
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ploaddup(const Eigen::half* from) {
592
+ return combine_half(from[0], from[0]);
593
+ }
594
+
595
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore(Eigen::half* to,
596
+ const half2& from) {
597
+ *reinterpret_cast<half2*>(to) = from;
598
+ }
599
+
600
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu(Eigen::half* to,
601
+ const half2& from) {
602
+ to[0] = get_half2_low(from);
603
+ to[1] = get_half2_high(from);
604
+ }
605
+
606
+
607
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro_aligned(
608
+ const Eigen::half* from) {
609
+ #if defined(EIGEN_GPU_HAS_LDG)
610
+ // Input is guaranteed to be properly aligned.
611
+ return __ldg(reinterpret_cast<const half2*>(from));
612
+ #else
613
+ return combine_half(*(from+0), *(from+1));
614
+ #endif
615
+ }
616
+
617
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro_unaligned(
618
+ const Eigen::half* from) {
619
+ #if defined(EIGEN_GPU_HAS_LDG)
620
+ return __halves2half2(__ldg(from+0), __ldg(from+1));
621
+ #else
622
+ return combine_half(*(from+0), *(from+1));
623
+ #endif
624
+ }
625
+
626
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pgather(const Eigen::half* from,
627
+ Index stride) {
628
+ return combine_half(from[0*stride], from[1*stride]);
629
+ }
630
+
631
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter(
632
+ Eigen::half* to, const half2& from, Index stride) {
633
+ to[stride*0] = get_half2_low(from);
634
+ to[stride*1] = get_half2_high(from);
635
+ }
636
+
637
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half pfirst(const half2& a) {
638
+ return get_half2_low(a);
639
+ }
640
+
641
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pabs(const half2& a) {
642
+ half a1 = get_half2_low(a);
643
+ half a2 = get_half2_high(a);
644
+ half result1 = half_impl::raw_uint16_to_half(a1.x & 0x7FFF);
645
+ half result2 = half_impl::raw_uint16_to_half(a2.x & 0x7FFF);
646
+ return combine_half(result1, result2);
647
+ }
648
+
649
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ptrue(const half2& /*a*/) {
650
+ half true_half = half_impl::raw_uint16_to_half(0xffffu);
651
+ return pset1<half2>(true_half);
652
+ }
653
+
654
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pzero(const half2& /*a*/) {
655
+ half false_half = half_impl::raw_uint16_to_half(0x0000u);
656
+ return pset1<half2>(false_half);
657
+ }
658
+
659
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void
660
+ ptranspose(PacketBlock<half2,2>& kernel) {
661
+ __half a1 = get_half2_low(kernel.packet[0]);
662
+ __half a2 = get_half2_high(kernel.packet[0]);
663
+ __half b1 = get_half2_low(kernel.packet[1]);
664
+ __half b2 = get_half2_high(kernel.packet[1]);
665
+ kernel.packet[0] = combine_half(a1, b1);
666
+ kernel.packet[1] = combine_half(a2, b2);
667
+ }
668
+
669
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plset(const Eigen::half& a) {
670
+ #if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
671
+ return __halves2half2(a, __hadd(a, __float2half(1.0f)));
672
+ #else
673
+ float f = __half2float(a) + 1.0f;
674
+ return combine_half(a, __float2half(f));
675
+ #endif
676
+ }
677
+
678
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pselect(const half2& mask,
679
+ const half2& a,
680
+ const half2& b) {
681
+ half mask_low = get_half2_low(mask);
682
+ half mask_high = get_half2_high(mask);
683
+ half result_low = mask_low == half(0) ? get_half2_low(b) : get_half2_low(a);
684
+ half result_high = mask_high == half(0) ? get_half2_high(b) : get_half2_high(a);
685
+ return combine_half(result_low, result_high);
686
+ }
687
+
688
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pcmp_eq(const half2& a,
689
+ const half2& b) {
690
+ half true_half = half_impl::raw_uint16_to_half(0xffffu);
691
+ half false_half = half_impl::raw_uint16_to_half(0x0000u);
692
+ half a1 = get_half2_low(a);
693
+ half a2 = get_half2_high(a);
694
+ half b1 = get_half2_low(b);
695
+ half b2 = get_half2_high(b);
696
+ half eq1 = __half2float(a1) == __half2float(b1) ? true_half : false_half;
697
+ half eq2 = __half2float(a2) == __half2float(b2) ? true_half : false_half;
698
+ return combine_half(eq1, eq2);
699
+ }
700
+
701
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pcmp_lt(const half2& a,
702
+ const half2& b) {
703
+ half true_half = half_impl::raw_uint16_to_half(0xffffu);
704
+ half false_half = half_impl::raw_uint16_to_half(0x0000u);
705
+ half a1 = get_half2_low(a);
706
+ half a2 = get_half2_high(a);
707
+ half b1 = get_half2_low(b);
708
+ half b2 = get_half2_high(b);
709
+ half eq1 = __half2float(a1) < __half2float(b1) ? true_half : false_half;
710
+ half eq2 = __half2float(a2) < __half2float(b2) ? true_half : false_half;
711
+ return combine_half(eq1, eq2);
712
+ }
713
+
714
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pand(const half2& a,
715
+ const half2& b) {
716
+ half a1 = get_half2_low(a);
717
+ half a2 = get_half2_high(a);
718
+ half b1 = get_half2_low(b);
719
+ half b2 = get_half2_high(b);
720
+ half result1 = half_impl::raw_uint16_to_half(a1.x & b1.x);
721
+ half result2 = half_impl::raw_uint16_to_half(a2.x & b2.x);
722
+ return combine_half(result1, result2);
723
+ }
724
+
725
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 por(const half2& a,
726
+ const half2& b) {
727
+ half a1 = get_half2_low(a);
728
+ half a2 = get_half2_high(a);
729
+ half b1 = get_half2_low(b);
730
+ half b2 = get_half2_high(b);
731
+ half result1 = half_impl::raw_uint16_to_half(a1.x | b1.x);
732
+ half result2 = half_impl::raw_uint16_to_half(a2.x | b2.x);
733
+ return combine_half(result1, result2);
734
+ }
735
+
736
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pxor(const half2& a,
737
+ const half2& b) {
738
+ half a1 = get_half2_low(a);
739
+ half a2 = get_half2_high(a);
740
+ half b1 = get_half2_low(b);
741
+ half b2 = get_half2_high(b);
742
+ half result1 = half_impl::raw_uint16_to_half(a1.x ^ b1.x);
743
+ half result2 = half_impl::raw_uint16_to_half(a2.x ^ b2.x);
744
+ return combine_half(result1, result2);
745
+ }
746
+
747
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pandnot(const half2& a,
748
+ const half2& b) {
749
+ half a1 = get_half2_low(a);
750
+ half a2 = get_half2_high(a);
751
+ half b1 = get_half2_low(b);
752
+ half b2 = get_half2_high(b);
753
+ half result1 = half_impl::raw_uint16_to_half(a1.x & ~b1.x);
754
+ half result2 = half_impl::raw_uint16_to_half(a2.x & ~b2.x);
755
+ return combine_half(result1, result2);
756
+ }
757
+
758
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 padd(const half2& a,
759
+ const half2& b) {
760
+ #if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
761
+ return __hadd2(a, b);
762
+ #else
763
+ float a1 = __low2float(a);
764
+ float a2 = __high2float(a);
765
+ float b1 = __low2float(b);
766
+ float b2 = __high2float(b);
767
+ float r1 = a1 + b1;
768
+ float r2 = a2 + b2;
769
+ return __floats2half2_rn(r1, r2);
770
+ #endif
771
+ }
772
+
773
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 psub(const half2& a,
774
+ const half2& b) {
775
+ #if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
776
+ return __hsub2(a, b);
777
+ #else
778
+ float a1 = __low2float(a);
779
+ float a2 = __high2float(a);
780
+ float b1 = __low2float(b);
781
+ float b2 = __high2float(b);
782
+ float r1 = a1 - b1;
783
+ float r2 = a2 - b2;
784
+ return __floats2half2_rn(r1, r2);
785
+ #endif
786
+ }
787
+
788
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pnegate(const half2& a) {
789
+ #if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
790
+ return __hneg2(a);
791
+ #else
792
+ float a1 = __low2float(a);
793
+ float a2 = __high2float(a);
794
+ return __floats2half2_rn(-a1, -a2);
795
+ #endif
796
+ }
797
+
798
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pconj(const half2& a) { return a; }
799
+
800
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmul(const half2& a,
801
+ const half2& b) {
802
+ #if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
803
+ return __hmul2(a, b);
804
+ #else
805
+ float a1 = __low2float(a);
806
+ float a2 = __high2float(a);
807
+ float b1 = __low2float(b);
808
+ float b2 = __high2float(b);
809
+ float r1 = a1 * b1;
810
+ float r2 = a2 * b2;
811
+ return __floats2half2_rn(r1, r2);
812
+ #endif
813
+ }
814
+
815
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmadd(const half2& a,
816
+ const half2& b,
817
+ const half2& c) {
818
+ #if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
819
+ return __hfma2(a, b, c);
820
+ #else
821
+ float a1 = __low2float(a);
822
+ float a2 = __high2float(a);
823
+ float b1 = __low2float(b);
824
+ float b2 = __high2float(b);
825
+ float c1 = __low2float(c);
826
+ float c2 = __high2float(c);
827
+ float r1 = a1 * b1 + c1;
828
+ float r2 = a2 * b2 + c2;
829
+ return __floats2half2_rn(r1, r2);
830
+ #endif
831
+ }
832
+
833
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pdiv(const half2& a,
834
+ const half2& b) {
835
+ #if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
836
+ return __h2div(a, b);
837
+ #else
838
+ float a1 = __low2float(a);
839
+ float a2 = __high2float(a);
840
+ float b1 = __low2float(b);
841
+ float b2 = __high2float(b);
842
+ float r1 = a1 / b1;
843
+ float r2 = a2 / b2;
844
+ return __floats2half2_rn(r1, r2);
845
+ #endif
846
+ }
847
+
848
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmin(const half2& a,
849
+ const half2& b) {
850
+ float a1 = __low2float(a);
851
+ float a2 = __high2float(a);
852
+ float b1 = __low2float(b);
853
+ float b2 = __high2float(b);
854
+ __half r1 = a1 < b1 ? get_half2_low(a) : get_half2_low(b);
855
+ __half r2 = a2 < b2 ? get_half2_high(a) : get_half2_high(b);
856
+ return combine_half(r1, r2);
857
+ }
858
+
859
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmax(const half2& a,
860
+ const half2& b) {
861
+ float a1 = __low2float(a);
862
+ float a2 = __high2float(a);
863
+ float b1 = __low2float(b);
864
+ float b2 = __high2float(b);
865
+ __half r1 = a1 > b1 ? get_half2_low(a) : get_half2_low(b);
866
+ __half r2 = a2 > b2 ? get_half2_high(a) : get_half2_high(b);
867
+ return combine_half(r1, r2);
868
+ }
869
+
870
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux(const half2& a) {
871
+ #if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
872
+ return __hadd(__low2half(a), __high2half(a));
873
+ #else
874
+ float a1 = __low2float(a);
875
+ float a2 = __high2float(a);
876
+ return Eigen::half(__float2half(a1 + a2));
877
+ #endif
878
+ }
879
+
880
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_max(const half2& a) {
881
+ #if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
882
+ __half first = __low2half(a);
883
+ __half second = __high2half(a);
884
+ return __hgt(first, second) ? first : second;
885
+ #else
886
+ float a1 = __low2float(a);
887
+ float a2 = __high2float(a);
888
+ return a1 > a2 ? get_half2_low(a) : get_half2_high(a);
889
+ #endif
890
+ }
891
+
892
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_min(const half2& a) {
893
+ #if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
894
+ __half first = __low2half(a);
895
+ __half second = __high2half(a);
896
+ return __hlt(first, second) ? first : second;
897
+ #else
898
+ float a1 = __low2float(a);
899
+ float a2 = __high2float(a);
900
+ return a1 < a2 ? get_half2_low(a) : get_half2_high(a);
901
+ #endif
902
+ }
903
+
904
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_mul(const half2& a) {
905
+ #if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
906
+ return __hmul(__low2half(a), __high2half(a));
907
+ #else
908
+ float a1 = __low2float(a);
909
+ float a2 = __high2float(a);
910
+ return Eigen::half(__float2half(a1 * a2));
911
+ #endif
912
+ }
913
+
914
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plog1p(const half2& a) {
915
+ float a1 = __low2float(a);
916
+ float a2 = __high2float(a);
917
+ float r1 = log1pf(a1);
918
+ float r2 = log1pf(a2);
919
+ return __floats2half2_rn(r1, r2);
920
+ }
921
+
922
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pexpm1(const half2& a) {
923
+ float a1 = __low2float(a);
924
+ float a2 = __high2float(a);
925
+ float r1 = expm1f(a1);
926
+ float r2 = expm1f(a2);
927
+ return __floats2half2_rn(r1, r2);
928
+ }
929
+
930
+ #if (EIGEN_CUDA_SDK_VER >= 80000 && defined(EIGEN_CUDA_HAS_FP16_ARITHMETIC)) || \
931
+ defined(EIGEN_HIP_DEVICE_COMPILE)
932
+
933
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
934
+ half2 plog(const half2& a) {
935
+ return h2log(a);
936
+ }
937
+
938
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
939
+ half2 pexp(const half2& a) {
940
+ return h2exp(a);
941
+ }
942
+
943
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
944
+ half2 psqrt(const half2& a) {
945
+ return h2sqrt(a);
946
+ }
947
+
948
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
949
+ half2 prsqrt(const half2& a) {
950
+ return h2rsqrt(a);
951
+ }
952
+
953
+ #else
954
+
955
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plog(const half2& a) {
956
+ float a1 = __low2float(a);
957
+ float a2 = __high2float(a);
958
+ float r1 = logf(a1);
959
+ float r2 = logf(a2);
960
+ return __floats2half2_rn(r1, r2);
961
+ }
962
+
963
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pexp(const half2& a) {
964
+ float a1 = __low2float(a);
965
+ float a2 = __high2float(a);
966
+ float r1 = expf(a1);
967
+ float r2 = expf(a2);
968
+ return __floats2half2_rn(r1, r2);
969
+ }
970
+
971
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 psqrt(const half2& a) {
972
+ float a1 = __low2float(a);
973
+ float a2 = __high2float(a);
974
+ float r1 = sqrtf(a1);
975
+ float r2 = sqrtf(a2);
976
+ return __floats2half2_rn(r1, r2);
977
+ }
978
+
979
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 prsqrt(const half2& a) {
980
+ float a1 = __low2float(a);
981
+ float a2 = __high2float(a);
982
+ float r1 = rsqrtf(a1);
983
+ float r2 = rsqrtf(a2);
984
+ return __floats2half2_rn(r1, r2);
985
+ }
986
+ #endif
987
+ } // namespace
988
+
989
+ template <>
990
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2
991
+ pload<Packet4h2>(const Eigen::half* from) {
992
+ return *reinterpret_cast<const Packet4h2*>(from);
993
+ }
994
+
995
+ // unaligned load;
996
+ template <>
997
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2
998
+ ploadu<Packet4h2>(const Eigen::half* from) {
999
+ Packet4h2 r;
1000
+ half2* p_alias = reinterpret_cast<half2*>(&r);
1001
+ p_alias[0] = ploadu(from + 0);
1002
+ p_alias[1] = ploadu(from + 2);
1003
+ p_alias[2] = ploadu(from + 4);
1004
+ p_alias[3] = ploadu(from + 6);
1005
+ return r;
1006
+ }
1007
+
1008
+ template <>
1009
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2
1010
+ ploaddup<Packet4h2>(const Eigen::half* from) {
1011
+ Packet4h2 r;
1012
+ half2* p_alias = reinterpret_cast<half2*>(&r);
1013
+ p_alias[0] = ploaddup(from + 0);
1014
+ p_alias[1] = ploaddup(from + 1);
1015
+ p_alias[2] = ploaddup(from + 2);
1016
+ p_alias[3] = ploaddup(from + 3);
1017
+ return r;
1018
+ }
1019
+
1020
+ template <>
1021
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore<Eigen::half>(
1022
+ Eigen::half* to, const Packet4h2& from) {
1023
+ *reinterpret_cast<Packet4h2*>(to) = from;
1024
+ }
1025
+
1026
+ template <>
1027
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(
1028
+ Eigen::half* to, const Packet4h2& from) {
1029
+ const half2* from_alias = reinterpret_cast<const half2*>(&from);
1030
+ pstoreu(to + 0,from_alias[0]);
1031
+ pstoreu(to + 2,from_alias[1]);
1032
+ pstoreu(to + 4,from_alias[2]);
1033
+ pstoreu(to + 6,from_alias[3]);
1034
+ }
1035
+
1036
+ template <>
1037
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet4h2
1038
+ ploadt_ro<Packet4h2, Aligned>(const Eigen::half* from) {
1039
+ #if defined(EIGEN_GPU_HAS_LDG)
1040
+ Packet4h2 r;
1041
+ r = __ldg(reinterpret_cast<const Packet4h2*>(from));
1042
+ return r;
1043
+ #else
1044
+ Packet4h2 r;
1045
+ half2* r_alias = reinterpret_cast<half2*>(&r);
1046
+ r_alias[0] = ploadt_ro_aligned(from + 0);
1047
+ r_alias[1] = ploadt_ro_aligned(from + 2);
1048
+ r_alias[2] = ploadt_ro_aligned(from + 4);
1049
+ r_alias[3] = ploadt_ro_aligned(from + 6);
1050
+ return r;
1051
+ #endif
1052
+ }
1053
+
1054
+ template <>
1055
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet4h2
1056
+ ploadt_ro<Packet4h2, Unaligned>(const Eigen::half* from) {
1057
+ Packet4h2 r;
1058
+ half2* r_alias = reinterpret_cast<half2*>(&r);
1059
+ r_alias[0] = ploadt_ro_unaligned(from + 0);
1060
+ r_alias[1] = ploadt_ro_unaligned(from + 2);
1061
+ r_alias[2] = ploadt_ro_unaligned(from + 4);
1062
+ r_alias[3] = ploadt_ro_unaligned(from + 6);
1063
+ return r;
1064
+ }
1065
+
1066
+ template <>
1067
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2
1068
+ pgather<Eigen::half, Packet4h2>(const Eigen::half* from, Index stride) {
1069
+ Packet4h2 r;
1070
+ half2* p_alias = reinterpret_cast<half2*>(&r);
1071
+ p_alias[0] = combine_half(from[0 * stride], from[1 * stride]);
1072
+ p_alias[1] = combine_half(from[2 * stride], from[3 * stride]);
1073
+ p_alias[2] = combine_half(from[4 * stride], from[5 * stride]);
1074
+ p_alias[3] = combine_half(from[6 * stride], from[7 * stride]);
1075
+ return r;
1076
+ }
1077
+
1078
+ template <>
1079
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<Eigen::half, Packet4h2>(
1080
+ Eigen::half* to, const Packet4h2& from, Index stride) {
1081
+ const half2* from_alias = reinterpret_cast<const half2*>(&from);
1082
+ pscatter(to + stride * 0, from_alias[0], stride);
1083
+ pscatter(to + stride * 2, from_alias[1], stride);
1084
+ pscatter(to + stride * 4, from_alias[2], stride);
1085
+ pscatter(to + stride * 6, from_alias[3], stride);
1086
+ }
1087
+
1088
+ template <>
1089
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half pfirst<Packet4h2>(
1090
+ const Packet4h2& a) {
1091
+ return pfirst(*(reinterpret_cast<const half2*>(&a)));
1092
+ }
1093
+
1094
+ template <>
1095
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pabs<Packet4h2>(
1096
+ const Packet4h2& a) {
1097
+ Packet4h2 r;
1098
+ half2* p_alias = reinterpret_cast<half2*>(&r);
1099
+ const half2* a_alias = reinterpret_cast<const half2*>(&a);
1100
+ p_alias[0] = pabs(a_alias[0]);
1101
+ p_alias[1] = pabs(a_alias[1]);
1102
+ p_alias[2] = pabs(a_alias[2]);
1103
+ p_alias[3] = pabs(a_alias[3]);
1104
+ return r;
1105
+ }
1106
+
1107
+ template <>
1108
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 ptrue<Packet4h2>(
1109
+ const Packet4h2& /*a*/) {
1110
+ half true_half = half_impl::raw_uint16_to_half(0xffffu);
1111
+ return pset1<Packet4h2>(true_half);
1112
+ }
1113
+
1114
+ template <>
1115
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pzero<Packet4h2>(const Packet4h2& /*a*/) {
1116
+ half false_half = half_impl::raw_uint16_to_half(0x0000u);
1117
+ return pset1<Packet4h2>(false_half);
1118
+ }
1119
+
1120
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose_double(
1121
+ double* d_row0, double* d_row1, double* d_row2, double* d_row3,
1122
+ double* d_row4, double* d_row5, double* d_row6, double* d_row7) {
1123
+ double d_tmp;
1124
+ d_tmp = d_row0[1];
1125
+ d_row0[1] = d_row4[0];
1126
+ d_row4[0] = d_tmp;
1127
+
1128
+ d_tmp = d_row1[1];
1129
+ d_row1[1] = d_row5[0];
1130
+ d_row5[0] = d_tmp;
1131
+
1132
+ d_tmp = d_row2[1];
1133
+ d_row2[1] = d_row6[0];
1134
+ d_row6[0] = d_tmp;
1135
+
1136
+ d_tmp = d_row3[1];
1137
+ d_row3[1] = d_row7[0];
1138
+ d_row7[0] = d_tmp;
1139
+ }
1140
+
1141
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose_half2(
1142
+ half2* f_row0, half2* f_row1, half2* f_row2, half2* f_row3) {
1143
+ half2 f_tmp;
1144
+ f_tmp = f_row0[1];
1145
+ f_row0[1] = f_row2[0];
1146
+ f_row2[0] = f_tmp;
1147
+
1148
+ f_tmp = f_row1[1];
1149
+ f_row1[1] = f_row3[0];
1150
+ f_row3[0] = f_tmp;
1151
+ }
1152
+
1153
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void
1154
+ ptranspose_half(half2& f0, half2& f1) {
1155
+ __half a1 = get_half2_low(f0);
1156
+ __half a2 = get_half2_high(f0);
1157
+ __half b1 = get_half2_low(f1);
1158
+ __half b2 = get_half2_high(f1);
1159
+ f0 = combine_half(a1, b1);
1160
+ f1 = combine_half(a2, b2);
1161
+ }
1162
+
1163
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void
1164
+ ptranspose(PacketBlock<Packet4h2,8>& kernel) {
1165
+ double* d_row0 = reinterpret_cast<double*>(&kernel.packet[0]);
1166
+ double* d_row1 = reinterpret_cast<double*>(&kernel.packet[1]);
1167
+ double* d_row2 = reinterpret_cast<double*>(&kernel.packet[2]);
1168
+ double* d_row3 = reinterpret_cast<double*>(&kernel.packet[3]);
1169
+ double* d_row4 = reinterpret_cast<double*>(&kernel.packet[4]);
1170
+ double* d_row5 = reinterpret_cast<double*>(&kernel.packet[5]);
1171
+ double* d_row6 = reinterpret_cast<double*>(&kernel.packet[6]);
1172
+ double* d_row7 = reinterpret_cast<double*>(&kernel.packet[7]);
1173
+ ptranspose_double(d_row0, d_row1, d_row2, d_row3,
1174
+ d_row4, d_row5, d_row6, d_row7);
1175
+
1176
+
1177
+ half2* f_row0 = reinterpret_cast<half2*>(d_row0);
1178
+ half2* f_row1 = reinterpret_cast<half2*>(d_row1);
1179
+ half2* f_row2 = reinterpret_cast<half2*>(d_row2);
1180
+ half2* f_row3 = reinterpret_cast<half2*>(d_row3);
1181
+ ptranspose_half2(f_row0, f_row1, f_row2, f_row3);
1182
+ ptranspose_half(f_row0[0], f_row1[0]);
1183
+ ptranspose_half(f_row0[1], f_row1[1]);
1184
+ ptranspose_half(f_row2[0], f_row3[0]);
1185
+ ptranspose_half(f_row2[1], f_row3[1]);
1186
+
1187
+ f_row0 = reinterpret_cast<half2*>(d_row0 + 1);
1188
+ f_row1 = reinterpret_cast<half2*>(d_row1 + 1);
1189
+ f_row2 = reinterpret_cast<half2*>(d_row2 + 1);
1190
+ f_row3 = reinterpret_cast<half2*>(d_row3 + 1);
1191
+ ptranspose_half2(f_row0, f_row1, f_row2, f_row3);
1192
+ ptranspose_half(f_row0[0], f_row1[0]);
1193
+ ptranspose_half(f_row0[1], f_row1[1]);
1194
+ ptranspose_half(f_row2[0], f_row3[0]);
1195
+ ptranspose_half(f_row2[1], f_row3[1]);
1196
+
1197
+ f_row0 = reinterpret_cast<half2*>(d_row4);
1198
+ f_row1 = reinterpret_cast<half2*>(d_row5);
1199
+ f_row2 = reinterpret_cast<half2*>(d_row6);
1200
+ f_row3 = reinterpret_cast<half2*>(d_row7);
1201
+ ptranspose_half2(f_row0, f_row1, f_row2, f_row3);
1202
+ ptranspose_half(f_row0[0], f_row1[0]);
1203
+ ptranspose_half(f_row0[1], f_row1[1]);
1204
+ ptranspose_half(f_row2[0], f_row3[0]);
1205
+ ptranspose_half(f_row2[1], f_row3[1]);
1206
+
1207
+ f_row0 = reinterpret_cast<half2*>(d_row4 + 1);
1208
+ f_row1 = reinterpret_cast<half2*>(d_row5 + 1);
1209
+ f_row2 = reinterpret_cast<half2*>(d_row6 + 1);
1210
+ f_row3 = reinterpret_cast<half2*>(d_row7 + 1);
1211
+ ptranspose_half2(f_row0, f_row1, f_row2, f_row3);
1212
+ ptranspose_half(f_row0[0], f_row1[0]);
1213
+ ptranspose_half(f_row0[1], f_row1[1]);
1214
+ ptranspose_half(f_row2[0], f_row3[0]);
1215
+ ptranspose_half(f_row2[1], f_row3[1]);
1216
+
1217
+ }
1218
+
1219
+ template <>
1220
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2
1221
+ plset<Packet4h2>(const Eigen::half& a) {
1222
+ #if defined(EIGEN_HIP_DEVICE_COMPILE)
1223
+
1224
+ Packet4h2 r;
1225
+ half2* p_alias = reinterpret_cast<half2*>(&r);
1226
+ p_alias[0] = __halves2half2(a, __hadd(a, __float2half(1.0f)));
1227
+ p_alias[1] = __halves2half2(__hadd(a, __float2half(2.0f)),
1228
+ __hadd(a, __float2half(3.0f)));
1229
+ p_alias[2] = __halves2half2(__hadd(a, __float2half(4.0f)),
1230
+ __hadd(a, __float2half(5.0f)));
1231
+ p_alias[3] = __halves2half2(__hadd(a, __float2half(6.0f)),
1232
+ __hadd(a, __float2half(7.0f)));
1233
+ return r;
1234
+ #elif defined(EIGEN_CUDA_HAS_FP16_ARITHMETIC)
1235
+ Packet4h2 r;
1236
+ half2* r_alias = reinterpret_cast<half2*>(&r);
1237
+
1238
+ half2 b = pset1<half2>(a);
1239
+ half2 c;
1240
+ half2 half_offset0 = __halves2half2(__float2half(0.0f),__float2half(2.0f));
1241
+ half2 half_offset1 = __halves2half2(__float2half(4.0f),__float2half(6.0f));
1242
+
1243
+ c = __hadd2(b, half_offset0);
1244
+ r_alias[0] = plset(__low2half(c));
1245
+ r_alias[1] = plset(__high2half(c));
1246
+
1247
+ c = __hadd2(b, half_offset1);
1248
+ r_alias[2] = plset(__low2half(c));
1249
+ r_alias[3] = plset(__high2half(c));
1250
+
1251
+ return r;
1252
+
1253
+ #else
1254
+ float f = __half2float(a);
1255
+ Packet4h2 r;
1256
+ half2* p_alias = reinterpret_cast<half2*>(&r);
1257
+ p_alias[0] = combine_half(a, __float2half(f + 1.0f));
1258
+ p_alias[1] = combine_half(__float2half(f + 2.0f), __float2half(f + 3.0f));
1259
+ p_alias[2] = combine_half(__float2half(f + 4.0f), __float2half(f + 5.0f));
1260
+ p_alias[3] = combine_half(__float2half(f + 6.0f), __float2half(f + 7.0f));
1261
+ return r;
1262
+ #endif
1263
+ }
1264
+
1265
+ template <>
1266
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2
1267
+ pselect<Packet4h2>(const Packet4h2& mask, const Packet4h2& a,
1268
+ const Packet4h2& b) {
1269
+ Packet4h2 r;
1270
+ half2* r_alias = reinterpret_cast<half2*>(&r);
1271
+ const half2* mask_alias = reinterpret_cast<const half2*>(&mask);
1272
+ const half2* a_alias = reinterpret_cast<const half2*>(&a);
1273
+ const half2* b_alias = reinterpret_cast<const half2*>(&b);
1274
+ r_alias[0] = pselect(mask_alias[0], a_alias[0], b_alias[0]);
1275
+ r_alias[1] = pselect(mask_alias[1], a_alias[1], b_alias[1]);
1276
+ r_alias[2] = pselect(mask_alias[2], a_alias[2], b_alias[2]);
1277
+ r_alias[3] = pselect(mask_alias[3], a_alias[3], b_alias[3]);
1278
+ return r;
1279
+ }
1280
+
1281
+ template <>
1282
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2
1283
+ pcmp_eq<Packet4h2>(const Packet4h2& a, const Packet4h2& b) {
1284
+ Packet4h2 r;
1285
+ half2* r_alias = reinterpret_cast<half2*>(&r);
1286
+ const half2* a_alias = reinterpret_cast<const half2*>(&a);
1287
+ const half2* b_alias = reinterpret_cast<const half2*>(&b);
1288
+ r_alias[0] = pcmp_eq(a_alias[0], b_alias[0]);
1289
+ r_alias[1] = pcmp_eq(a_alias[1], b_alias[1]);
1290
+ r_alias[2] = pcmp_eq(a_alias[2], b_alias[2]);
1291
+ r_alias[3] = pcmp_eq(a_alias[3], b_alias[3]);
1292
+ return r;
1293
+ }
1294
+
1295
+ template <>
1296
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pand<Packet4h2>(
1297
+ const Packet4h2& a, const Packet4h2& b) {
1298
+ Packet4h2 r;
1299
+ half2* r_alias = reinterpret_cast<half2*>(&r);
1300
+ const half2* a_alias = reinterpret_cast<const half2*>(&a);
1301
+ const half2* b_alias = reinterpret_cast<const half2*>(&b);
1302
+ r_alias[0] = pand(a_alias[0], b_alias[0]);
1303
+ r_alias[1] = pand(a_alias[1], b_alias[1]);
1304
+ r_alias[2] = pand(a_alias[2], b_alias[2]);
1305
+ r_alias[3] = pand(a_alias[3], b_alias[3]);
1306
+ return r;
1307
+ }
1308
+
1309
+ template <>
1310
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 por<Packet4h2>(
1311
+ const Packet4h2& a, const Packet4h2& b) {
1312
+ Packet4h2 r;
1313
+ half2* r_alias = reinterpret_cast<half2*>(&r);
1314
+ const half2* a_alias = reinterpret_cast<const half2*>(&a);
1315
+ const half2* b_alias = reinterpret_cast<const half2*>(&b);
1316
+ r_alias[0] = por(a_alias[0], b_alias[0]);
1317
+ r_alias[1] = por(a_alias[1], b_alias[1]);
1318
+ r_alias[2] = por(a_alias[2], b_alias[2]);
1319
+ r_alias[3] = por(a_alias[3], b_alias[3]);
1320
+ return r;
1321
+ }
1322
+
1323
+ template <>
1324
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pxor<Packet4h2>(
1325
+ const Packet4h2& a, const Packet4h2& b) {
1326
+ Packet4h2 r;
1327
+ half2* r_alias = reinterpret_cast<half2*>(&r);
1328
+ const half2* a_alias = reinterpret_cast<const half2*>(&a);
1329
+ const half2* b_alias = reinterpret_cast<const half2*>(&b);
1330
+ r_alias[0] = pxor(a_alias[0], b_alias[0]);
1331
+ r_alias[1] = pxor(a_alias[1], b_alias[1]);
1332
+ r_alias[2] = pxor(a_alias[2], b_alias[2]);
1333
+ r_alias[3] = pxor(a_alias[3], b_alias[3]);
1334
+ return r;
1335
+ }
1336
+
1337
+ template <>
1338
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2
1339
+ pandnot<Packet4h2>(const Packet4h2& a, const Packet4h2& b) {
1340
+ Packet4h2 r;
1341
+ half2* r_alias = reinterpret_cast<half2*>(&r);
1342
+ const half2* a_alias = reinterpret_cast<const half2*>(&a);
1343
+ const half2* b_alias = reinterpret_cast<const half2*>(&b);
1344
+ r_alias[0] = pandnot(a_alias[0], b_alias[0]);
1345
+ r_alias[1] = pandnot(a_alias[1], b_alias[1]);
1346
+ r_alias[2] = pandnot(a_alias[2], b_alias[2]);
1347
+ r_alias[3] = pandnot(a_alias[3], b_alias[3]);
1348
+ return r;
1349
+ }
1350
+
1351
+ template <>
1352
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 padd<Packet4h2>(
1353
+ const Packet4h2& a, const Packet4h2& b) {
1354
+ Packet4h2 r;
1355
+ half2* r_alias = reinterpret_cast<half2*>(&r);
1356
+ const half2* a_alias = reinterpret_cast<const half2*>(&a);
1357
+ const half2* b_alias = reinterpret_cast<const half2*>(&b);
1358
+ r_alias[0] = padd(a_alias[0], b_alias[0]);
1359
+ r_alias[1] = padd(a_alias[1], b_alias[1]);
1360
+ r_alias[2] = padd(a_alias[2], b_alias[2]);
1361
+ r_alias[3] = padd(a_alias[3], b_alias[3]);
1362
+ return r;
1363
+ }
1364
+
1365
+ template <>
1366
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 psub<Packet4h2>(
1367
+ const Packet4h2& a, const Packet4h2& b) {
1368
+ Packet4h2 r;
1369
+ half2* r_alias = reinterpret_cast<half2*>(&r);
1370
+ const half2* a_alias = reinterpret_cast<const half2*>(&a);
1371
+ const half2* b_alias = reinterpret_cast<const half2*>(&b);
1372
+ r_alias[0] = psub(a_alias[0], b_alias[0]);
1373
+ r_alias[1] = psub(a_alias[1], b_alias[1]);
1374
+ r_alias[2] = psub(a_alias[2], b_alias[2]);
1375
+ r_alias[3] = psub(a_alias[3], b_alias[3]);
1376
+ return r;
1377
+ }
1378
+
1379
+ template <>
1380
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pnegate(const Packet4h2& a) {
1381
+ Packet4h2 r;
1382
+ half2* r_alias = reinterpret_cast<half2*>(&r);
1383
+ const half2* a_alias = reinterpret_cast<const half2*>(&a);
1384
+ r_alias[0] = pnegate(a_alias[0]);
1385
+ r_alias[1] = pnegate(a_alias[1]);
1386
+ r_alias[2] = pnegate(a_alias[2]);
1387
+ r_alias[3] = pnegate(a_alias[3]);
1388
+ return r;
1389
+ }
1390
+
1391
+ template <>
1392
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pconj(const Packet4h2& a) {
1393
+ return a;
1394
+ }
1395
+
1396
+ template <>
1397
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pmul<Packet4h2>(
1398
+ const Packet4h2& a, const Packet4h2& b) {
1399
+ Packet4h2 r;
1400
+ half2* r_alias = reinterpret_cast<half2*>(&r);
1401
+ const half2* a_alias = reinterpret_cast<const half2*>(&a);
1402
+ const half2* b_alias = reinterpret_cast<const half2*>(&b);
1403
+ r_alias[0] = pmul(a_alias[0], b_alias[0]);
1404
+ r_alias[1] = pmul(a_alias[1], b_alias[1]);
1405
+ r_alias[2] = pmul(a_alias[2], b_alias[2]);
1406
+ r_alias[3] = pmul(a_alias[3], b_alias[3]);
1407
+ return r;
1408
+ }
1409
+
1410
+ template <>
1411
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pmadd<Packet4h2>(
1412
+ const Packet4h2& a, const Packet4h2& b, const Packet4h2& c) {
1413
+ Packet4h2 r;
1414
+ half2* r_alias = reinterpret_cast<half2*>(&r);
1415
+ const half2* a_alias = reinterpret_cast<const half2*>(&a);
1416
+ const half2* b_alias = reinterpret_cast<const half2*>(&b);
1417
+ const half2* c_alias = reinterpret_cast<const half2*>(&c);
1418
+ r_alias[0] = pmadd(a_alias[0], b_alias[0], c_alias[0]);
1419
+ r_alias[1] = pmadd(a_alias[1], b_alias[1], c_alias[1]);
1420
+ r_alias[2] = pmadd(a_alias[2], b_alias[2], c_alias[2]);
1421
+ r_alias[3] = pmadd(a_alias[3], b_alias[3], c_alias[3]);
1422
+ return r;
1423
+ }
1424
+
1425
+ template <>
1426
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pdiv<Packet4h2>(
1427
+ const Packet4h2& a, const Packet4h2& b) {
1428
+ Packet4h2 r;
1429
+ half2* r_alias = reinterpret_cast<half2*>(&r);
1430
+ const half2* a_alias = reinterpret_cast<const half2*>(&a);
1431
+ const half2* b_alias = reinterpret_cast<const half2*>(&b);
1432
+ r_alias[0] = pdiv(a_alias[0], b_alias[0]);
1433
+ r_alias[1] = pdiv(a_alias[1], b_alias[1]);
1434
+ r_alias[2] = pdiv(a_alias[2], b_alias[2]);
1435
+ r_alias[3] = pdiv(a_alias[3], b_alias[3]);
1436
+ return r;
1437
+ }
1438
+
1439
+ template <>
1440
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pmin<Packet4h2>(
1441
+ const Packet4h2& a, const Packet4h2& b) {
1442
+ Packet4h2 r;
1443
+ half2* r_alias = reinterpret_cast<half2*>(&r);
1444
+ const half2* a_alias = reinterpret_cast<const half2*>(&a);
1445
+ const half2* b_alias = reinterpret_cast<const half2*>(&b);
1446
+ r_alias[0] = pmin(a_alias[0], b_alias[0]);
1447
+ r_alias[1] = pmin(a_alias[1], b_alias[1]);
1448
+ r_alias[2] = pmin(a_alias[2], b_alias[2]);
1449
+ r_alias[3] = pmin(a_alias[3], b_alias[3]);
1450
+ return r;
1451
+ }
1452
+
1453
+ template <>
1454
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pmax<Packet4h2>(
1455
+ const Packet4h2& a, const Packet4h2& b) {
1456
+ Packet4h2 r;
1457
+ half2* r_alias = reinterpret_cast<half2*>(&r);
1458
+ const half2* a_alias = reinterpret_cast<const half2*>(&a);
1459
+ const half2* b_alias = reinterpret_cast<const half2*>(&b);
1460
+ r_alias[0] = pmax(a_alias[0], b_alias[0]);
1461
+ r_alias[1] = pmax(a_alias[1], b_alias[1]);
1462
+ r_alias[2] = pmax(a_alias[2], b_alias[2]);
1463
+ r_alias[3] = pmax(a_alias[3], b_alias[3]);
1464
+ return r;
1465
+ }
1466
+
1467
+ template <>
1468
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux<Packet4h2>(
1469
+ const Packet4h2& a) {
1470
+ const half2* a_alias = reinterpret_cast<const half2*>(&a);
1471
+
1472
+ return predux(a_alias[0]) + predux(a_alias[1]) +
1473
+ predux(a_alias[2]) + predux(a_alias[3]);
1474
+ }
1475
+
1476
+ template <>
1477
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_max<Packet4h2>(
1478
+ const Packet4h2& a) {
1479
+ const half2* a_alias = reinterpret_cast<const half2*>(&a);
1480
+ half2 m0 = combine_half(predux_max(a_alias[0]),
1481
+ predux_max(a_alias[1]));
1482
+ half2 m1 = combine_half(predux_max(a_alias[2]),
1483
+ predux_max(a_alias[3]));
1484
+ __half first = predux_max(m0);
1485
+ __half second = predux_max(m1);
1486
+ #if defined(EIGEN_CUDA_HAS_FP16_ARITHMETIC)
1487
+ return (__hgt(first, second) ? first : second);
1488
+ #else
1489
+ float ffirst = __half2float(first);
1490
+ float fsecond = __half2float(second);
1491
+ return (ffirst > fsecond)? first: second;
1492
+ #endif
1493
+ }
1494
+
1495
+ template <>
1496
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_min<Packet4h2>(
1497
+ const Packet4h2& a) {
1498
+ const half2* a_alias = reinterpret_cast<const half2*>(&a);
1499
+ half2 m0 = combine_half(predux_min(a_alias[0]),
1500
+ predux_min(a_alias[1]));
1501
+ half2 m1 = combine_half(predux_min(a_alias[2]),
1502
+ predux_min(a_alias[3]));
1503
+ __half first = predux_min(m0);
1504
+ __half second = predux_min(m1);
1505
+ #if defined(EIGEN_CUDA_HAS_FP16_ARITHMETIC)
1506
+ return (__hlt(first, second) ? first : second);
1507
+ #else
1508
+ float ffirst = __half2float(first);
1509
+ float fsecond = __half2float(second);
1510
+ return (ffirst < fsecond)? first: second;
1511
+ #endif
1512
+ }
1513
+
1514
+ // likely overflow/underflow
1515
+ template <>
1516
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_mul<Packet4h2>(
1517
+ const Packet4h2& a) {
1518
+ const half2* a_alias = reinterpret_cast<const half2*>(&a);
1519
+ return predux_mul(pmul(pmul(a_alias[0], a_alias[1]),
1520
+ pmul(a_alias[2], a_alias[3])));
1521
+ }
1522
+
1523
+ template <>
1524
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2
1525
+ plog1p<Packet4h2>(const Packet4h2& a) {
1526
+ Packet4h2 r;
1527
+ half2* r_alias = reinterpret_cast<half2*>(&r);
1528
+ const half2* a_alias = reinterpret_cast<const half2*>(&a);
1529
+ r_alias[0] = plog1p(a_alias[0]);
1530
+ r_alias[1] = plog1p(a_alias[1]);
1531
+ r_alias[2] = plog1p(a_alias[2]);
1532
+ r_alias[3] = plog1p(a_alias[3]);
1533
+ return r;
1534
+ }
1535
+
1536
+ template <>
1537
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2
1538
+ pexpm1<Packet4h2>(const Packet4h2& a) {
1539
+ Packet4h2 r;
1540
+ half2* r_alias = reinterpret_cast<half2*>(&r);
1541
+ const half2* a_alias = reinterpret_cast<const half2*>(&a);
1542
+ r_alias[0] = pexpm1(a_alias[0]);
1543
+ r_alias[1] = pexpm1(a_alias[1]);
1544
+ r_alias[2] = pexpm1(a_alias[2]);
1545
+ r_alias[3] = pexpm1(a_alias[3]);
1546
+ return r;
1547
+ }
1548
+
1549
+ template <>
1550
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 plog<Packet4h2>(const Packet4h2& a) {
1551
+ Packet4h2 r;
1552
+ half2* r_alias = reinterpret_cast<half2*>(&r);
1553
+ const half2* a_alias = reinterpret_cast<const half2*>(&a);
1554
+ r_alias[0] = plog(a_alias[0]);
1555
+ r_alias[1] = plog(a_alias[1]);
1556
+ r_alias[2] = plog(a_alias[2]);
1557
+ r_alias[3] = plog(a_alias[3]);
1558
+ return r;
1559
+ }
1560
+
1561
+ template <>
1562
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pexp<Packet4h2>(const Packet4h2& a) {
1563
+ Packet4h2 r;
1564
+ half2* r_alias = reinterpret_cast<half2*>(&r);
1565
+ const half2* a_alias = reinterpret_cast<const half2*>(&a);
1566
+ r_alias[0] = pexp(a_alias[0]);
1567
+ r_alias[1] = pexp(a_alias[1]);
1568
+ r_alias[2] = pexp(a_alias[2]);
1569
+ r_alias[3] = pexp(a_alias[3]);
1570
+ return r;
1571
+ }
1572
+
1573
+ template <>
1574
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 psqrt<Packet4h2>(const Packet4h2& a) {
1575
+ Packet4h2 r;
1576
+ half2* r_alias = reinterpret_cast<half2*>(&r);
1577
+ const half2* a_alias = reinterpret_cast<const half2*>(&a);
1578
+ r_alias[0] = psqrt(a_alias[0]);
1579
+ r_alias[1] = psqrt(a_alias[1]);
1580
+ r_alias[2] = psqrt(a_alias[2]);
1581
+ r_alias[3] = psqrt(a_alias[3]);
1582
+ return r;
1583
+ }
1584
+
1585
+ template <>
1586
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2
1587
+ prsqrt<Packet4h2>(const Packet4h2& a) {
1588
+ Packet4h2 r;
1589
+ half2* r_alias = reinterpret_cast<half2*>(&r);
1590
+ const half2* a_alias = reinterpret_cast<const half2*>(&a);
1591
+ r_alias[0] = prsqrt(a_alias[0]);
1592
+ r_alias[1] = prsqrt(a_alias[1]);
1593
+ r_alias[2] = prsqrt(a_alias[2]);
1594
+ r_alias[3] = prsqrt(a_alias[3]);
1595
+ return r;
1596
+ }
1597
+
1598
+ // The following specialized padd, pmul, pdiv, pmin, pmax, pset1 are needed for
1599
+ // the implementation of GPU half reduction.
1600
+ template<>
1601
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 padd<half2>(const half2& a,
1602
+ const half2& b) {
1603
+ #if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
1604
+ return __hadd2(a, b);
1605
+ #else
1606
+ float a1 = __low2float(a);
1607
+ float a2 = __high2float(a);
1608
+ float b1 = __low2float(b);
1609
+ float b2 = __high2float(b);
1610
+ float r1 = a1 + b1;
1611
+ float r2 = a2 + b2;
1612
+ return __floats2half2_rn(r1, r2);
1613
+ #endif
1614
+ }
1615
+
1616
+ template<>
1617
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmul<half2>(const half2& a,
1618
+ const half2& b) {
1619
+ #if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
1620
+ return __hmul2(a, b);
1621
+ #else
1622
+ float a1 = __low2float(a);
1623
+ float a2 = __high2float(a);
1624
+ float b1 = __low2float(b);
1625
+ float b2 = __high2float(b);
1626
+ float r1 = a1 * b1;
1627
+ float r2 = a2 * b2;
1628
+ return __floats2half2_rn(r1, r2);
1629
+ #endif
1630
+ }
1631
+
1632
+ template<>
1633
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pdiv<half2>(const half2& a,
1634
+ const half2& b) {
1635
+ #if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
1636
+ return __h2div(a, b);
1637
+ #else
1638
+ float a1 = __low2float(a);
1639
+ float a2 = __high2float(a);
1640
+ float b1 = __low2float(b);
1641
+ float b2 = __high2float(b);
1642
+ float r1 = a1 / b1;
1643
+ float r2 = a2 / b2;
1644
+ return __floats2half2_rn(r1, r2);
1645
+ #endif
1646
+ }
1647
+
1648
+ template<>
1649
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmin<half2>(const half2& a,
1650
+ const half2& b) {
1651
+ float a1 = __low2float(a);
1652
+ float a2 = __high2float(a);
1653
+ float b1 = __low2float(b);
1654
+ float b2 = __high2float(b);
1655
+ __half r1 = a1 < b1 ? get_half2_low(a) : get_half2_low(b);
1656
+ __half r2 = a2 < b2 ? get_half2_high(a) : get_half2_high(b);
1657
+ return combine_half(r1, r2);
1658
+ }
1659
+
1660
+ template<>
1661
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmax<half2>(const half2& a,
1662
+ const half2& b) {
1663
+ float a1 = __low2float(a);
1664
+ float a2 = __high2float(a);
1665
+ float b1 = __low2float(b);
1666
+ float b2 = __high2float(b);
1667
+ __half r1 = a1 > b1 ? get_half2_low(a) : get_half2_low(b);
1668
+ __half r2 = a2 > b2 ? get_half2_high(a) : get_half2_high(b);
1669
+ return combine_half(r1, r2);
1670
+ }
1671
+
1672
+ // #endif // defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIPCC) || (defined(EIGEN_CUDACC) && EIGEN_COMP_CLANG && !EIGEN_COMP_NVCC)
1673
+
1674
+ #endif // defined(EIGEN_HAS_CUDA_FP16) || defined(EIGEN_HAS_HIP_FP16)
1675
+
1676
+ #undef EIGEN_GPU_HAS_LDG
1677
+ #undef EIGEN_CUDA_HAS_FP16_ARITHMETIC
1678
+ #undef EIGEN_GPU_HAS_FP16_ARITHMETIC
1679
+
1680
+ } // end namespace internal
1681
+
1682
+ } // end namespace Eigen
1683
+
1684
+
1685
+ #endif // EIGEN_PACKET_MATH_GPU_H