@smake/eigen 1.0.2 → 1.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (435) hide show
  1. package/README.md +1 -1
  2. package/eigen/Eigen/AccelerateSupport +52 -0
  3. package/eigen/Eigen/Cholesky +18 -21
  4. package/eigen/Eigen/CholmodSupport +28 -28
  5. package/eigen/Eigen/Core +235 -326
  6. package/eigen/Eigen/Eigenvalues +16 -14
  7. package/eigen/Eigen/Geometry +21 -24
  8. package/eigen/Eigen/Householder +9 -8
  9. package/eigen/Eigen/IterativeLinearSolvers +8 -4
  10. package/eigen/Eigen/Jacobi +14 -14
  11. package/eigen/Eigen/KLUSupport +43 -0
  12. package/eigen/Eigen/LU +16 -20
  13. package/eigen/Eigen/MetisSupport +12 -12
  14. package/eigen/Eigen/OrderingMethods +54 -54
  15. package/eigen/Eigen/PaStiXSupport +23 -20
  16. package/eigen/Eigen/PardisoSupport +17 -14
  17. package/eigen/Eigen/QR +18 -21
  18. package/eigen/Eigen/QtAlignedMalloc +5 -13
  19. package/eigen/Eigen/SPQRSupport +21 -14
  20. package/eigen/Eigen/SVD +23 -18
  21. package/eigen/Eigen/Sparse +1 -4
  22. package/eigen/Eigen/SparseCholesky +18 -23
  23. package/eigen/Eigen/SparseCore +18 -17
  24. package/eigen/Eigen/SparseLU +12 -8
  25. package/eigen/Eigen/SparseQR +16 -14
  26. package/eigen/Eigen/StdDeque +5 -2
  27. package/eigen/Eigen/StdList +5 -2
  28. package/eigen/Eigen/StdVector +5 -2
  29. package/eigen/Eigen/SuperLUSupport +30 -24
  30. package/eigen/Eigen/ThreadPool +80 -0
  31. package/eigen/Eigen/UmfPackSupport +19 -17
  32. package/eigen/Eigen/Version +14 -0
  33. package/eigen/Eigen/src/AccelerateSupport/AccelerateSupport.h +423 -0
  34. package/eigen/Eigen/src/AccelerateSupport/InternalHeaderCheck.h +3 -0
  35. package/eigen/Eigen/src/Cholesky/InternalHeaderCheck.h +3 -0
  36. package/eigen/Eigen/src/Cholesky/LDLT.h +377 -401
  37. package/eigen/Eigen/src/Cholesky/LLT.h +332 -360
  38. package/eigen/Eigen/src/Cholesky/LLT_LAPACKE.h +81 -56
  39. package/eigen/Eigen/src/CholmodSupport/CholmodSupport.h +620 -521
  40. package/eigen/Eigen/src/CholmodSupport/InternalHeaderCheck.h +3 -0
  41. package/eigen/Eigen/src/Core/ArithmeticSequence.h +239 -0
  42. package/eigen/Eigen/src/Core/Array.h +341 -294
  43. package/eigen/Eigen/src/Core/ArrayBase.h +190 -203
  44. package/eigen/Eigen/src/Core/ArrayWrapper.h +127 -171
  45. package/eigen/Eigen/src/Core/Assign.h +30 -40
  46. package/eigen/Eigen/src/Core/AssignEvaluator.h +711 -589
  47. package/eigen/Eigen/src/Core/Assign_MKL.h +130 -125
  48. package/eigen/Eigen/src/Core/BandMatrix.h +268 -283
  49. package/eigen/Eigen/src/Core/Block.h +375 -398
  50. package/eigen/Eigen/src/Core/CommaInitializer.h +86 -97
  51. package/eigen/Eigen/src/Core/ConditionEstimator.h +51 -53
  52. package/eigen/Eigen/src/Core/CoreEvaluators.h +1356 -1026
  53. package/eigen/Eigen/src/Core/CoreIterators.h +73 -59
  54. package/eigen/Eigen/src/Core/CwiseBinaryOp.h +114 -132
  55. package/eigen/Eigen/src/Core/CwiseNullaryOp.h +726 -617
  56. package/eigen/Eigen/src/Core/CwiseTernaryOp.h +77 -103
  57. package/eigen/Eigen/src/Core/CwiseUnaryOp.h +56 -68
  58. package/eigen/Eigen/src/Core/CwiseUnaryView.h +132 -95
  59. package/eigen/Eigen/src/Core/DenseBase.h +632 -571
  60. package/eigen/Eigen/src/Core/DenseCoeffsBase.h +511 -624
  61. package/eigen/Eigen/src/Core/DenseStorage.h +512 -509
  62. package/eigen/Eigen/src/Core/DeviceWrapper.h +153 -0
  63. package/eigen/Eigen/src/Core/Diagonal.h +169 -210
  64. package/eigen/Eigen/src/Core/DiagonalMatrix.h +351 -274
  65. package/eigen/Eigen/src/Core/DiagonalProduct.h +12 -10
  66. package/eigen/Eigen/src/Core/Dot.h +172 -222
  67. package/eigen/Eigen/src/Core/EigenBase.h +75 -85
  68. package/eigen/Eigen/src/Core/Fill.h +138 -0
  69. package/eigen/Eigen/src/Core/FindCoeff.h +464 -0
  70. package/eigen/Eigen/src/Core/ForceAlignedAccess.h +90 -109
  71. package/eigen/Eigen/src/Core/Fuzzy.h +82 -105
  72. package/eigen/Eigen/src/Core/GeneralProduct.h +327 -263
  73. package/eigen/Eigen/src/Core/GenericPacketMath.h +1472 -360
  74. package/eigen/Eigen/src/Core/GlobalFunctions.h +194 -151
  75. package/eigen/Eigen/src/Core/IO.h +147 -139
  76. package/eigen/Eigen/src/Core/IndexedView.h +321 -0
  77. package/eigen/Eigen/src/Core/InnerProduct.h +260 -0
  78. package/eigen/Eigen/src/Core/InternalHeaderCheck.h +3 -0
  79. package/eigen/Eigen/src/Core/Inverse.h +56 -66
  80. package/eigen/Eigen/src/Core/Map.h +124 -142
  81. package/eigen/Eigen/src/Core/MapBase.h +256 -281
  82. package/eigen/Eigen/src/Core/MathFunctions.h +1620 -938
  83. package/eigen/Eigen/src/Core/MathFunctionsImpl.h +233 -71
  84. package/eigen/Eigen/src/Core/Matrix.h +491 -416
  85. package/eigen/Eigen/src/Core/MatrixBase.h +468 -453
  86. package/eigen/Eigen/src/Core/NestByValue.h +66 -85
  87. package/eigen/Eigen/src/Core/NoAlias.h +79 -85
  88. package/eigen/Eigen/src/Core/NumTraits.h +235 -148
  89. package/eigen/Eigen/src/Core/PartialReduxEvaluator.h +253 -0
  90. package/eigen/Eigen/src/Core/PermutationMatrix.h +461 -511
  91. package/eigen/Eigen/src/Core/PlainObjectBase.h +871 -894
  92. package/eigen/Eigen/src/Core/Product.h +260 -139
  93. package/eigen/Eigen/src/Core/ProductEvaluators.h +863 -714
  94. package/eigen/Eigen/src/Core/Random.h +161 -136
  95. package/eigen/Eigen/src/Core/RandomImpl.h +262 -0
  96. package/eigen/Eigen/src/Core/RealView.h +250 -0
  97. package/eigen/Eigen/src/Core/Redux.h +366 -336
  98. package/eigen/Eigen/src/Core/Ref.h +308 -209
  99. package/eigen/Eigen/src/Core/Replicate.h +94 -106
  100. package/eigen/Eigen/src/Core/Reshaped.h +398 -0
  101. package/eigen/Eigen/src/Core/ReturnByValue.h +49 -55
  102. package/eigen/Eigen/src/Core/Reverse.h +136 -145
  103. package/eigen/Eigen/src/Core/Select.h +70 -140
  104. package/eigen/Eigen/src/Core/SelfAdjointView.h +262 -285
  105. package/eigen/Eigen/src/Core/SelfCwiseBinaryOp.h +23 -20
  106. package/eigen/Eigen/src/Core/SkewSymmetricMatrix3.h +382 -0
  107. package/eigen/Eigen/src/Core/Solve.h +97 -111
  108. package/eigen/Eigen/src/Core/SolveTriangular.h +131 -129
  109. package/eigen/Eigen/src/Core/SolverBase.h +138 -101
  110. package/eigen/Eigen/src/Core/StableNorm.h +156 -160
  111. package/eigen/Eigen/src/Core/StlIterators.h +619 -0
  112. package/eigen/Eigen/src/Core/Stride.h +91 -88
  113. package/eigen/Eigen/src/Core/Swap.h +70 -38
  114. package/eigen/Eigen/src/Core/Transpose.h +295 -273
  115. package/eigen/Eigen/src/Core/Transpositions.h +272 -317
  116. package/eigen/Eigen/src/Core/TriangularMatrix.h +670 -755
  117. package/eigen/Eigen/src/Core/VectorBlock.h +59 -72
  118. package/eigen/Eigen/src/Core/VectorwiseOp.h +668 -630
  119. package/eigen/Eigen/src/Core/Visitor.h +480 -216
  120. package/eigen/Eigen/src/Core/arch/AVX/Complex.h +407 -293
  121. package/eigen/Eigen/src/Core/arch/AVX/MathFunctions.h +79 -388
  122. package/eigen/Eigen/src/Core/arch/AVX/PacketMath.h +2935 -491
  123. package/eigen/Eigen/src/Core/arch/AVX/Reductions.h +353 -0
  124. package/eigen/Eigen/src/Core/arch/AVX/TypeCasting.h +279 -22
  125. package/eigen/Eigen/src/Core/arch/AVX512/Complex.h +472 -0
  126. package/eigen/Eigen/src/Core/arch/AVX512/GemmKernel.h +1245 -0
  127. package/eigen/Eigen/src/Core/arch/AVX512/MathFunctions.h +85 -333
  128. package/eigen/Eigen/src/Core/arch/AVX512/MathFunctionsFP16.h +75 -0
  129. package/eigen/Eigen/src/Core/arch/AVX512/PacketMath.h +2490 -649
  130. package/eigen/Eigen/src/Core/arch/AVX512/PacketMathFP16.h +1413 -0
  131. package/eigen/Eigen/src/Core/arch/AVX512/Reductions.h +297 -0
  132. package/eigen/Eigen/src/Core/arch/AVX512/TrsmKernel.h +1167 -0
  133. package/eigen/Eigen/src/Core/arch/AVX512/TrsmUnrolls.inc +1219 -0
  134. package/eigen/Eigen/src/Core/arch/AVX512/TypeCasting.h +277 -0
  135. package/eigen/Eigen/src/Core/arch/AVX512/TypeCastingFP16.h +130 -0
  136. package/eigen/Eigen/src/Core/arch/AltiVec/Complex.h +521 -298
  137. package/eigen/Eigen/src/Core/arch/AltiVec/MathFunctions.h +39 -280
  138. package/eigen/Eigen/src/Core/arch/AltiVec/MatrixProduct.h +3686 -0
  139. package/eigen/Eigen/src/Core/arch/AltiVec/MatrixProductCommon.h +205 -0
  140. package/eigen/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h +901 -0
  141. package/eigen/Eigen/src/Core/arch/AltiVec/MatrixProductMMAbfloat16.h +742 -0
  142. package/eigen/Eigen/src/Core/arch/AltiVec/MatrixVectorProduct.inc +2818 -0
  143. package/eigen/Eigen/src/Core/arch/AltiVec/PacketMath.h +3391 -723
  144. package/eigen/Eigen/src/Core/arch/AltiVec/TypeCasting.h +153 -0
  145. package/eigen/Eigen/src/Core/arch/Default/BFloat16.h +866 -0
  146. package/eigen/Eigen/src/Core/arch/Default/ConjHelper.h +113 -14
  147. package/eigen/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h +2634 -0
  148. package/eigen/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h +227 -0
  149. package/eigen/Eigen/src/Core/arch/Default/Half.h +1091 -0
  150. package/eigen/Eigen/src/Core/arch/Default/Settings.h +11 -13
  151. package/eigen/Eigen/src/Core/arch/GPU/Complex.h +244 -0
  152. package/eigen/Eigen/src/Core/arch/GPU/MathFunctions.h +104 -0
  153. package/eigen/Eigen/src/Core/arch/GPU/PacketMath.h +1712 -0
  154. package/eigen/Eigen/src/Core/arch/GPU/Tuple.h +268 -0
  155. package/eigen/Eigen/src/Core/arch/GPU/TypeCasting.h +77 -0
  156. package/eigen/Eigen/src/Core/arch/HIP/hcc/math_constants.h +23 -0
  157. package/eigen/Eigen/src/Core/arch/HVX/PacketMath.h +1088 -0
  158. package/eigen/Eigen/src/Core/arch/LSX/Complex.h +520 -0
  159. package/eigen/Eigen/src/Core/arch/LSX/GeneralBlockPanelKernel.h +23 -0
  160. package/eigen/Eigen/src/Core/arch/LSX/MathFunctions.h +43 -0
  161. package/eigen/Eigen/src/Core/arch/LSX/PacketMath.h +2866 -0
  162. package/eigen/Eigen/src/Core/arch/LSX/TypeCasting.h +526 -0
  163. package/eigen/Eigen/src/Core/arch/MSA/Complex.h +620 -0
  164. package/eigen/Eigen/src/Core/arch/MSA/MathFunctions.h +379 -0
  165. package/eigen/Eigen/src/Core/arch/MSA/PacketMath.h +1237 -0
  166. package/eigen/Eigen/src/Core/arch/NEON/Complex.h +531 -289
  167. package/eigen/Eigen/src/Core/arch/NEON/GeneralBlockPanelKernel.h +243 -0
  168. package/eigen/Eigen/src/Core/arch/NEON/MathFunctions.h +50 -73
  169. package/eigen/Eigen/src/Core/arch/NEON/PacketMath.h +5915 -579
  170. package/eigen/Eigen/src/Core/arch/NEON/TypeCasting.h +1642 -0
  171. package/eigen/Eigen/src/Core/arch/NEON/UnaryFunctors.h +57 -0
  172. package/eigen/Eigen/src/Core/arch/SSE/Complex.h +366 -334
  173. package/eigen/Eigen/src/Core/arch/SSE/MathFunctions.h +40 -514
  174. package/eigen/Eigen/src/Core/arch/SSE/PacketMath.h +2164 -675
  175. package/eigen/Eigen/src/Core/arch/SSE/Reductions.h +324 -0
  176. package/eigen/Eigen/src/Core/arch/SSE/TypeCasting.h +188 -35
  177. package/eigen/Eigen/src/Core/arch/SVE/MathFunctions.h +48 -0
  178. package/eigen/Eigen/src/Core/arch/SVE/PacketMath.h +674 -0
  179. package/eigen/Eigen/src/Core/arch/SVE/TypeCasting.h +52 -0
  180. package/eigen/Eigen/src/Core/arch/SYCL/InteropHeaders.h +227 -0
  181. package/eigen/Eigen/src/Core/arch/SYCL/MathFunctions.h +303 -0
  182. package/eigen/Eigen/src/Core/arch/SYCL/PacketMath.h +576 -0
  183. package/eigen/Eigen/src/Core/arch/SYCL/TypeCasting.h +83 -0
  184. package/eigen/Eigen/src/Core/arch/ZVector/Complex.h +434 -261
  185. package/eigen/Eigen/src/Core/arch/ZVector/MathFunctions.h +160 -53
  186. package/eigen/Eigen/src/Core/arch/ZVector/PacketMath.h +1073 -605
  187. package/eigen/Eigen/src/Core/functors/AssignmentFunctors.h +123 -117
  188. package/eigen/Eigen/src/Core/functors/BinaryFunctors.h +594 -322
  189. package/eigen/Eigen/src/Core/functors/NullaryFunctors.h +204 -118
  190. package/eigen/Eigen/src/Core/functors/StlFunctors.h +110 -97
  191. package/eigen/Eigen/src/Core/functors/TernaryFunctors.h +34 -7
  192. package/eigen/Eigen/src/Core/functors/UnaryFunctors.h +1158 -530
  193. package/eigen/Eigen/src/Core/products/GeneralBlockPanelKernel.h +2329 -1333
  194. package/eigen/Eigen/src/Core/products/GeneralMatrixMatrix.h +328 -364
  195. package/eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h +191 -178
  196. package/eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h +85 -82
  197. package/eigen/Eigen/src/Core/products/GeneralMatrixMatrix_BLAS.h +154 -73
  198. package/eigen/Eigen/src/Core/products/GeneralMatrixVector.h +396 -542
  199. package/eigen/Eigen/src/Core/products/GeneralMatrixVector_BLAS.h +80 -77
  200. package/eigen/Eigen/src/Core/products/Parallelizer.h +208 -92
  201. package/eigen/Eigen/src/Core/products/SelfadjointMatrixMatrix.h +331 -375
  202. package/eigen/Eigen/src/Core/products/SelfadjointMatrixMatrix_BLAS.h +206 -224
  203. package/eigen/Eigen/src/Core/products/SelfadjointMatrixVector.h +139 -146
  204. package/eigen/Eigen/src/Core/products/SelfadjointMatrixVector_BLAS.h +58 -61
  205. package/eigen/Eigen/src/Core/products/SelfadjointProduct.h +71 -71
  206. package/eigen/Eigen/src/Core/products/SelfadjointRank2Update.h +48 -46
  207. package/eigen/Eigen/src/Core/products/TriangularMatrixMatrix.h +294 -369
  208. package/eigen/Eigen/src/Core/products/TriangularMatrixMatrix_BLAS.h +246 -238
  209. package/eigen/Eigen/src/Core/products/TriangularMatrixVector.h +244 -247
  210. package/eigen/Eigen/src/Core/products/TriangularMatrixVector_BLAS.h +212 -192
  211. package/eigen/Eigen/src/Core/products/TriangularSolverMatrix.h +328 -275
  212. package/eigen/Eigen/src/Core/products/TriangularSolverMatrix_BLAS.h +108 -109
  213. package/eigen/Eigen/src/Core/products/TriangularSolverVector.h +70 -93
  214. package/eigen/Eigen/src/Core/util/Assert.h +158 -0
  215. package/eigen/Eigen/src/Core/util/BlasUtil.h +413 -290
  216. package/eigen/Eigen/src/Core/util/ConfigureVectorization.h +543 -0
  217. package/eigen/Eigen/src/Core/util/Constants.h +314 -263
  218. package/eigen/Eigen/src/Core/util/DisableStupidWarnings.h +130 -78
  219. package/eigen/Eigen/src/Core/util/EmulateArray.h +270 -0
  220. package/eigen/Eigen/src/Core/util/ForwardDeclarations.h +450 -224
  221. package/eigen/Eigen/src/Core/util/GpuHipCudaDefines.inc +101 -0
  222. package/eigen/Eigen/src/Core/util/GpuHipCudaUndefines.inc +45 -0
  223. package/eigen/Eigen/src/Core/util/IndexedViewHelper.h +487 -0
  224. package/eigen/Eigen/src/Core/util/IntegralConstant.h +279 -0
  225. package/eigen/Eigen/src/Core/util/MKL_support.h +39 -30
  226. package/eigen/Eigen/src/Core/util/Macros.h +939 -646
  227. package/eigen/Eigen/src/Core/util/MaxSizeVector.h +139 -0
  228. package/eigen/Eigen/src/Core/util/Memory.h +1042 -650
  229. package/eigen/Eigen/src/Core/util/Meta.h +618 -426
  230. package/eigen/Eigen/src/Core/util/MoreMeta.h +638 -0
  231. package/eigen/Eigen/src/Core/util/ReenableStupidWarnings.h +32 -19
  232. package/eigen/Eigen/src/Core/util/ReshapedHelper.h +51 -0
  233. package/eigen/Eigen/src/Core/util/Serializer.h +209 -0
  234. package/eigen/Eigen/src/Core/util/StaticAssert.h +51 -164
  235. package/eigen/Eigen/src/Core/util/SymbolicIndex.h +445 -0
  236. package/eigen/Eigen/src/Core/util/XprHelper.h +793 -538
  237. package/eigen/Eigen/src/Eigenvalues/ComplexEigenSolver.h +246 -277
  238. package/eigen/Eigen/src/Eigenvalues/ComplexSchur.h +299 -319
  239. package/eigen/Eigen/src/Eigenvalues/ComplexSchur_LAPACKE.h +52 -48
  240. package/eigen/Eigen/src/Eigenvalues/EigenSolver.h +413 -456
  241. package/eigen/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h +309 -325
  242. package/eigen/Eigen/src/Eigenvalues/GeneralizedSelfAdjointEigenSolver.h +157 -171
  243. package/eigen/Eigen/src/Eigenvalues/HessenbergDecomposition.h +292 -310
  244. package/eigen/Eigen/src/Eigenvalues/InternalHeaderCheck.h +3 -0
  245. package/eigen/Eigen/src/Eigenvalues/MatrixBaseEigenvalues.h +91 -107
  246. package/eigen/Eigen/src/Eigenvalues/RealQZ.h +539 -606
  247. package/eigen/Eigen/src/Eigenvalues/RealSchur.h +348 -382
  248. package/eigen/Eigen/src/Eigenvalues/RealSchur_LAPACKE.h +41 -35
  249. package/eigen/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h +579 -600
  250. package/eigen/Eigen/src/Eigenvalues/SelfAdjointEigenSolver_LAPACKE.h +47 -44
  251. package/eigen/Eigen/src/Eigenvalues/Tridiagonalization.h +434 -461
  252. package/eigen/Eigen/src/Geometry/AlignedBox.h +307 -214
  253. package/eigen/Eigen/src/Geometry/AngleAxis.h +135 -137
  254. package/eigen/Eigen/src/Geometry/EulerAngles.h +163 -74
  255. package/eigen/Eigen/src/Geometry/Homogeneous.h +289 -333
  256. package/eigen/Eigen/src/Geometry/Hyperplane.h +152 -161
  257. package/eigen/Eigen/src/Geometry/InternalHeaderCheck.h +3 -0
  258. package/eigen/Eigen/src/Geometry/OrthoMethods.h +168 -145
  259. package/eigen/Eigen/src/Geometry/ParametrizedLine.h +141 -104
  260. package/eigen/Eigen/src/Geometry/Quaternion.h +595 -497
  261. package/eigen/Eigen/src/Geometry/Rotation2D.h +110 -108
  262. package/eigen/Eigen/src/Geometry/RotationBase.h +148 -145
  263. package/eigen/Eigen/src/Geometry/Scaling.h +115 -90
  264. package/eigen/Eigen/src/Geometry/Transform.h +896 -953
  265. package/eigen/Eigen/src/Geometry/Translation.h +100 -98
  266. package/eigen/Eigen/src/Geometry/Umeyama.h +79 -84
  267. package/eigen/Eigen/src/Geometry/arch/Geometry_SIMD.h +154 -0
  268. package/eigen/Eigen/src/Householder/BlockHouseholder.h +54 -42
  269. package/eigen/Eigen/src/Householder/Householder.h +104 -122
  270. package/eigen/Eigen/src/Householder/HouseholderSequence.h +416 -382
  271. package/eigen/Eigen/src/Householder/InternalHeaderCheck.h +3 -0
  272. package/eigen/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h +153 -166
  273. package/eigen/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h +127 -138
  274. package/eigen/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h +95 -124
  275. package/eigen/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h +269 -267
  276. package/eigen/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h +246 -259
  277. package/eigen/Eigen/src/IterativeLinearSolvers/InternalHeaderCheck.h +3 -0
  278. package/eigen/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h +218 -217
  279. package/eigen/Eigen/src/IterativeLinearSolvers/LeastSquareConjugateGradient.h +80 -103
  280. package/eigen/Eigen/src/IterativeLinearSolvers/SolveWithGuess.h +59 -63
  281. package/eigen/Eigen/src/Jacobi/InternalHeaderCheck.h +3 -0
  282. package/eigen/Eigen/src/Jacobi/Jacobi.h +256 -291
  283. package/eigen/Eigen/src/KLUSupport/InternalHeaderCheck.h +3 -0
  284. package/eigen/Eigen/src/KLUSupport/KLUSupport.h +339 -0
  285. package/eigen/Eigen/src/LU/Determinant.h +60 -63
  286. package/eigen/Eigen/src/LU/FullPivLU.h +561 -626
  287. package/eigen/Eigen/src/LU/InternalHeaderCheck.h +3 -0
  288. package/eigen/Eigen/src/LU/InverseImpl.h +213 -275
  289. package/eigen/Eigen/src/LU/PartialPivLU.h +407 -435
  290. package/eigen/Eigen/src/LU/PartialPivLU_LAPACKE.h +54 -40
  291. package/eigen/Eigen/src/LU/arch/InverseSize4.h +353 -0
  292. package/eigen/Eigen/src/MetisSupport/InternalHeaderCheck.h +3 -0
  293. package/eigen/Eigen/src/MetisSupport/MetisSupport.h +81 -93
  294. package/eigen/Eigen/src/OrderingMethods/Amd.h +250 -282
  295. package/eigen/Eigen/src/OrderingMethods/Eigen_Colamd.h +950 -1103
  296. package/eigen/Eigen/src/OrderingMethods/InternalHeaderCheck.h +3 -0
  297. package/eigen/Eigen/src/OrderingMethods/Ordering.h +111 -122
  298. package/eigen/Eigen/src/PaStiXSupport/InternalHeaderCheck.h +3 -0
  299. package/eigen/Eigen/src/PaStiXSupport/PaStiXSupport.h +524 -570
  300. package/eigen/Eigen/src/PardisoSupport/InternalHeaderCheck.h +3 -0
  301. package/eigen/Eigen/src/PardisoSupport/PardisoSupport.h +385 -429
  302. package/eigen/Eigen/src/QR/ColPivHouseholderQR.h +494 -473
  303. package/eigen/Eigen/src/QR/ColPivHouseholderQR_LAPACKE.h +120 -56
  304. package/eigen/Eigen/src/QR/CompleteOrthogonalDecomposition.h +223 -137
  305. package/eigen/Eigen/src/QR/FullPivHouseholderQR.h +517 -460
  306. package/eigen/Eigen/src/QR/HouseholderQR.h +412 -278
  307. package/eigen/Eigen/src/QR/HouseholderQR_LAPACKE.h +32 -23
  308. package/eigen/Eigen/src/QR/InternalHeaderCheck.h +3 -0
  309. package/eigen/Eigen/src/SPQRSupport/InternalHeaderCheck.h +3 -0
  310. package/eigen/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h +263 -261
  311. package/eigen/Eigen/src/SVD/BDCSVD.h +872 -679
  312. package/eigen/Eigen/src/SVD/BDCSVD_LAPACKE.h +174 -0
  313. package/eigen/Eigen/src/SVD/InternalHeaderCheck.h +3 -0
  314. package/eigen/Eigen/src/SVD/JacobiSVD.h +585 -543
  315. package/eigen/Eigen/src/SVD/JacobiSVD_LAPACKE.h +85 -49
  316. package/eigen/Eigen/src/SVD/SVDBase.h +281 -160
  317. package/eigen/Eigen/src/SVD/UpperBidiagonalization.h +202 -237
  318. package/eigen/Eigen/src/SparseCholesky/InternalHeaderCheck.h +3 -0
  319. package/eigen/Eigen/src/SparseCholesky/SimplicialCholesky.h +769 -590
  320. package/eigen/Eigen/src/SparseCholesky/SimplicialCholesky_impl.h +318 -129
  321. package/eigen/Eigen/src/SparseCore/AmbiVector.h +202 -251
  322. package/eigen/Eigen/src/SparseCore/CompressedStorage.h +184 -236
  323. package/eigen/Eigen/src/SparseCore/ConservativeSparseSparseProduct.h +140 -184
  324. package/eigen/Eigen/src/SparseCore/InternalHeaderCheck.h +3 -0
  325. package/eigen/Eigen/src/SparseCore/SparseAssign.h +174 -111
  326. package/eigen/Eigen/src/SparseCore/SparseBlock.h +408 -477
  327. package/eigen/Eigen/src/SparseCore/SparseColEtree.h +100 -112
  328. package/eigen/Eigen/src/SparseCore/SparseCompressedBase.h +531 -280
  329. package/eigen/Eigen/src/SparseCore/SparseCwiseBinaryOp.h +559 -347
  330. package/eigen/Eigen/src/SparseCore/SparseCwiseUnaryOp.h +100 -108
  331. package/eigen/Eigen/src/SparseCore/SparseDenseProduct.h +185 -191
  332. package/eigen/Eigen/src/SparseCore/SparseDiagonalProduct.h +71 -71
  333. package/eigen/Eigen/src/SparseCore/SparseDot.h +49 -47
  334. package/eigen/Eigen/src/SparseCore/SparseFuzzy.h +13 -11
  335. package/eigen/Eigen/src/SparseCore/SparseMap.h +243 -253
  336. package/eigen/Eigen/src/SparseCore/SparseMatrix.h +1614 -1142
  337. package/eigen/Eigen/src/SparseCore/SparseMatrixBase.h +403 -357
  338. package/eigen/Eigen/src/SparseCore/SparsePermutation.h +186 -115
  339. package/eigen/Eigen/src/SparseCore/SparseProduct.h +100 -91
  340. package/eigen/Eigen/src/SparseCore/SparseRedux.h +22 -24
  341. package/eigen/Eigen/src/SparseCore/SparseRef.h +268 -295
  342. package/eigen/Eigen/src/SparseCore/SparseSelfAdjointView.h +371 -414
  343. package/eigen/Eigen/src/SparseCore/SparseSolverBase.h +78 -87
  344. package/eigen/Eigen/src/SparseCore/SparseSparseProductWithPruning.h +81 -95
  345. package/eigen/Eigen/src/SparseCore/SparseTranspose.h +62 -71
  346. package/eigen/Eigen/src/SparseCore/SparseTriangularView.h +132 -144
  347. package/eigen/Eigen/src/SparseCore/SparseUtil.h +146 -115
  348. package/eigen/Eigen/src/SparseCore/SparseVector.h +426 -372
  349. package/eigen/Eigen/src/SparseCore/SparseView.h +164 -193
  350. package/eigen/Eigen/src/SparseCore/TriangularSolver.h +129 -170
  351. package/eigen/Eigen/src/SparseLU/InternalHeaderCheck.h +3 -0
  352. package/eigen/Eigen/src/SparseLU/SparseLU.h +814 -618
  353. package/eigen/Eigen/src/SparseLU/SparseLUImpl.h +61 -48
  354. package/eigen/Eigen/src/SparseLU/SparseLU_Memory.h +102 -118
  355. package/eigen/Eigen/src/SparseLU/SparseLU_Structs.h +38 -35
  356. package/eigen/Eigen/src/SparseLU/SparseLU_SupernodalMatrix.h +273 -255
  357. package/eigen/Eigen/src/SparseLU/SparseLU_Utils.h +44 -49
  358. package/eigen/Eigen/src/SparseLU/SparseLU_column_bmod.h +104 -108
  359. package/eigen/Eigen/src/SparseLU/SparseLU_column_dfs.h +90 -101
  360. package/eigen/Eigen/src/SparseLU/SparseLU_copy_to_ucol.h +57 -58
  361. package/eigen/Eigen/src/SparseLU/SparseLU_heap_relax_snode.h +43 -55
  362. package/eigen/Eigen/src/SparseLU/SparseLU_kernel_bmod.h +74 -71
  363. package/eigen/Eigen/src/SparseLU/SparseLU_panel_bmod.h +125 -133
  364. package/eigen/Eigen/src/SparseLU/SparseLU_panel_dfs.h +136 -159
  365. package/eigen/Eigen/src/SparseLU/SparseLU_pivotL.h +51 -52
  366. package/eigen/Eigen/src/SparseLU/SparseLU_pruneL.h +67 -73
  367. package/eigen/Eigen/src/SparseLU/SparseLU_relax_snode.h +24 -26
  368. package/eigen/Eigen/src/SparseQR/InternalHeaderCheck.h +3 -0
  369. package/eigen/Eigen/src/SparseQR/SparseQR.h +451 -490
  370. package/eigen/Eigen/src/StlSupport/StdDeque.h +28 -105
  371. package/eigen/Eigen/src/StlSupport/StdList.h +28 -84
  372. package/eigen/Eigen/src/StlSupport/StdVector.h +28 -108
  373. package/eigen/Eigen/src/StlSupport/details.h +48 -50
  374. package/eigen/Eigen/src/SuperLUSupport/InternalHeaderCheck.h +3 -0
  375. package/eigen/Eigen/src/SuperLUSupport/SuperLUSupport.h +634 -732
  376. package/eigen/Eigen/src/ThreadPool/Barrier.h +70 -0
  377. package/eigen/Eigen/src/ThreadPool/CoreThreadPoolDevice.h +336 -0
  378. package/eigen/Eigen/src/ThreadPool/EventCount.h +241 -0
  379. package/eigen/Eigen/src/ThreadPool/ForkJoin.h +140 -0
  380. package/eigen/Eigen/src/ThreadPool/InternalHeaderCheck.h +4 -0
  381. package/eigen/Eigen/src/ThreadPool/NonBlockingThreadPool.h +587 -0
  382. package/eigen/Eigen/src/ThreadPool/RunQueue.h +230 -0
  383. package/eigen/Eigen/src/ThreadPool/ThreadCancel.h +21 -0
  384. package/eigen/Eigen/src/ThreadPool/ThreadEnvironment.h +43 -0
  385. package/eigen/Eigen/src/ThreadPool/ThreadLocal.h +289 -0
  386. package/eigen/Eigen/src/ThreadPool/ThreadPoolInterface.h +50 -0
  387. package/eigen/Eigen/src/ThreadPool/ThreadYield.h +16 -0
  388. package/eigen/Eigen/src/UmfPackSupport/InternalHeaderCheck.h +3 -0
  389. package/eigen/Eigen/src/UmfPackSupport/UmfPackSupport.h +480 -380
  390. package/eigen/Eigen/src/misc/Image.h +41 -43
  391. package/eigen/Eigen/src/misc/InternalHeaderCheck.h +3 -0
  392. package/eigen/Eigen/src/misc/Kernel.h +39 -41
  393. package/eigen/Eigen/src/misc/RealSvd2x2.h +19 -21
  394. package/eigen/Eigen/src/misc/blas.h +83 -426
  395. package/eigen/Eigen/src/misc/lapacke.h +9976 -16182
  396. package/eigen/Eigen/src/misc/lapacke_helpers.h +163 -0
  397. package/eigen/Eigen/src/misc/lapacke_mangling.h +4 -5
  398. package/eigen/Eigen/src/plugins/ArrayCwiseBinaryOps.inc +344 -0
  399. package/eigen/Eigen/src/plugins/ArrayCwiseUnaryOps.inc +544 -0
  400. package/eigen/Eigen/src/plugins/BlockMethods.inc +1370 -0
  401. package/eigen/Eigen/src/plugins/CommonCwiseBinaryOps.inc +116 -0
  402. package/eigen/Eigen/src/plugins/CommonCwiseUnaryOps.inc +167 -0
  403. package/eigen/Eigen/src/plugins/IndexedViewMethods.inc +192 -0
  404. package/eigen/Eigen/src/plugins/InternalHeaderCheck.inc +3 -0
  405. package/eigen/Eigen/src/plugins/MatrixCwiseBinaryOps.inc +331 -0
  406. package/eigen/Eigen/src/plugins/MatrixCwiseUnaryOps.inc +118 -0
  407. package/eigen/Eigen/src/plugins/ReshapedMethods.inc +133 -0
  408. package/lib/LibEigen.d.ts +4 -0
  409. package/lib/LibEigen.js +14 -0
  410. package/lib/index.d.ts +1 -1
  411. package/lib/index.js +7 -3
  412. package/package.json +2 -10
  413. package/eigen/Eigen/CMakeLists.txt +0 -19
  414. package/eigen/Eigen/src/Core/BooleanRedux.h +0 -164
  415. package/eigen/Eigen/src/Core/arch/CUDA/Complex.h +0 -103
  416. package/eigen/Eigen/src/Core/arch/CUDA/Half.h +0 -675
  417. package/eigen/Eigen/src/Core/arch/CUDA/MathFunctions.h +0 -91
  418. package/eigen/Eigen/src/Core/arch/CUDA/PacketMath.h +0 -333
  419. package/eigen/Eigen/src/Core/arch/CUDA/PacketMathHalf.h +0 -1124
  420. package/eigen/Eigen/src/Core/arch/CUDA/TypeCasting.h +0 -212
  421. package/eigen/Eigen/src/Core/util/NonMPL2.h +0 -3
  422. package/eigen/Eigen/src/Geometry/arch/Geometry_SSE.h +0 -161
  423. package/eigen/Eigen/src/LU/arch/Inverse_SSE.h +0 -338
  424. package/eigen/Eigen/src/SparseCore/MappedSparseMatrix.h +0 -67
  425. package/eigen/Eigen/src/SparseLU/SparseLU_gemm_kernel.h +0 -280
  426. package/eigen/Eigen/src/misc/lapack.h +0 -152
  427. package/eigen/Eigen/src/plugins/ArrayCwiseBinaryOps.h +0 -332
  428. package/eigen/Eigen/src/plugins/ArrayCwiseUnaryOps.h +0 -552
  429. package/eigen/Eigen/src/plugins/BlockMethods.h +0 -1058
  430. package/eigen/Eigen/src/plugins/CommonCwiseBinaryOps.h +0 -115
  431. package/eigen/Eigen/src/plugins/CommonCwiseUnaryOps.h +0 -163
  432. package/eigen/Eigen/src/plugins/MatrixCwiseBinaryOps.h +0 -152
  433. package/eigen/Eigen/src/plugins/MatrixCwiseUnaryOps.h +0 -85
  434. package/lib/eigen.d.ts +0 -2
  435. package/lib/eigen.js +0 -15
@@ -0,0 +1,1712 @@
1
+ // This file is part of Eigen, a lightweight C++ template library
2
+ // for linear algebra.
3
+ //
4
+ // Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
5
+ //
6
+ // This Source Code Form is subject to the terms of the Mozilla
7
+ // Public License v. 2.0. If a copy of the MPL was not distributed
8
+ // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
9
+
10
+ #ifndef EIGEN_PACKET_MATH_GPU_H
11
+ #define EIGEN_PACKET_MATH_GPU_H
12
+
13
+ // IWYU pragma: private
14
+ #include "../../InternalHeaderCheck.h"
15
+
16
+ namespace Eigen {
17
+
18
+ namespace internal {
19
+
20
+ // Read-only data cached load available.
21
+ #if defined(EIGEN_HIP_DEVICE_COMPILE) || (defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350)
22
+ #define EIGEN_GPU_HAS_LDG 1
23
+ #endif
24
+
25
+ // FP16 math available.
26
+ #if (defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530)
27
+ #define EIGEN_CUDA_HAS_FP16_ARITHMETIC 1
28
+ #endif
29
+
30
+ #if defined(EIGEN_HIP_DEVICE_COMPILE) || defined(EIGEN_CUDA_HAS_FP16_ARITHMETIC)
31
+ #define EIGEN_GPU_HAS_FP16_ARITHMETIC 1
32
+ #endif
33
+
34
+ // We need to distinguish ‘clang as the CUDA compiler’ from ‘clang as the host compiler,
35
+ // invoked by NVCC’ (e.g. on MacOS). The former needs to see both host and device implementation
36
+ // of the functions, while the latter can only deal with one of them.
37
+ #if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIPCC) || (defined(EIGEN_CUDACC) && EIGEN_COMP_CLANG && !EIGEN_COMP_NVCC)
38
+ #define EIGEN_HAS_GPU_DEVICE_FUNCTIONS 1
39
+ #else
40
+ #define EIGEN_HAS_GPU_DEVICE_FUNCTIONS 0
41
+ #endif
42
+
43
+ // Make sure this is only available when targeting a GPU: we don't want to
44
+ // introduce conflicts between these packet_traits definitions and the ones
45
+ // we'll use on the host side (SSE, AVX, ...)
46
+ #if defined(EIGEN_GPUCC) && defined(EIGEN_USE_GPU)
47
+
48
+ template <>
49
+ struct is_arithmetic<float4> {
50
+ enum { value = true };
51
+ };
52
+ template <>
53
+ struct is_arithmetic<double2> {
54
+ enum { value = true };
55
+ };
56
+
57
+ template <>
58
+ struct packet_traits<float> : default_packet_traits {
59
+ typedef float4 type;
60
+ typedef float4 half;
61
+ enum {
62
+ Vectorizable = 1,
63
+ AlignedOnScalar = 1,
64
+ size = 4,
65
+
66
+ HasDiv = 1,
67
+ HasSin = 0,
68
+ HasCos = 0,
69
+ HasLog = 1,
70
+ HasExp = 1,
71
+ HasSqrt = 1,
72
+ HasRsqrt = 1,
73
+ HasLGamma = 1,
74
+ HasDiGamma = 1,
75
+ HasZeta = 1,
76
+ HasPolygamma = 1,
77
+ HasErf = 1,
78
+ HasErfc = 1,
79
+ HasNdtri = 1,
80
+ HasBessel = 1,
81
+ HasIGamma = 1,
82
+ HasIGammaDerA = 1,
83
+ HasGammaSampleDerAlpha = 1,
84
+ HasIGammac = 1,
85
+ HasBetaInc = 1,
86
+
87
+ HasBlend = 0,
88
+ HasFloor = 1,
89
+ HasCmp = EIGEN_HAS_GPU_DEVICE_FUNCTIONS
90
+ };
91
+ };
92
+
93
+ template <>
94
+ struct packet_traits<double> : default_packet_traits {
95
+ typedef double2 type;
96
+ typedef double2 half;
97
+ enum {
98
+ Vectorizable = 1,
99
+ AlignedOnScalar = 1,
100
+ size = 2,
101
+
102
+ HasDiv = 1,
103
+ HasLog = 1,
104
+ HasExp = 1,
105
+ HasSqrt = 1,
106
+ HasRsqrt = 1,
107
+ HasLGamma = 1,
108
+ HasDiGamma = 1,
109
+ HasZeta = 1,
110
+ HasPolygamma = 1,
111
+ HasErf = 1,
112
+ HasErfc = 1,
113
+ HasNdtri = 1,
114
+ HasBessel = 1,
115
+ HasIGamma = 1,
116
+ HasIGammaDerA = 1,
117
+ HasGammaSampleDerAlpha = 1,
118
+ HasIGammac = 1,
119
+ HasBetaInc = 1,
120
+ HasBlend = 0,
121
+ };
122
+ };
123
+
124
+ template <>
125
+ struct unpacket_traits<float4> {
126
+ typedef float type;
127
+ enum {
128
+ size = 4,
129
+ alignment = Aligned16,
130
+ vectorizable = true,
131
+ masked_load_available = false,
132
+ masked_store_available = false
133
+ };
134
+ typedef float4 half;
135
+ };
136
+ template <>
137
+ struct unpacket_traits<double2> {
138
+ typedef double type;
139
+ enum {
140
+ size = 2,
141
+ alignment = Aligned16,
142
+ vectorizable = true,
143
+ masked_load_available = false,
144
+ masked_store_available = false
145
+ };
146
+ typedef double2 half;
147
+ };
148
+
149
+ template <>
150
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pset1<float4>(const float& from) {
151
+ return make_float4(from, from, from, from);
152
+ }
153
+ template <>
154
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pset1<double2>(const double& from) {
155
+ return make_double2(from, from);
156
+ }
157
+
158
+ #if EIGEN_HAS_GPU_DEVICE_FUNCTIONS
159
+
160
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float bitwise_and(const float& a, const float& b) {
161
+ return __int_as_float(__float_as_int(a) & __float_as_int(b));
162
+ }
163
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bitwise_and(const double& a, const double& b) {
164
+ return __longlong_as_double(__double_as_longlong(a) & __double_as_longlong(b));
165
+ }
166
+
167
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float bitwise_or(const float& a, const float& b) {
168
+ return __int_as_float(__float_as_int(a) | __float_as_int(b));
169
+ }
170
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bitwise_or(const double& a, const double& b) {
171
+ return __longlong_as_double(__double_as_longlong(a) | __double_as_longlong(b));
172
+ }
173
+
174
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float bitwise_xor(const float& a, const float& b) {
175
+ return __int_as_float(__float_as_int(a) ^ __float_as_int(b));
176
+ }
177
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bitwise_xor(const double& a, const double& b) {
178
+ return __longlong_as_double(__double_as_longlong(a) ^ __double_as_longlong(b));
179
+ }
180
+
181
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float bitwise_andnot(const float& a, const float& b) {
182
+ return __int_as_float(__float_as_int(a) & ~__float_as_int(b));
183
+ }
184
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bitwise_andnot(const double& a, const double& b) {
185
+ return __longlong_as_double(__double_as_longlong(a) & ~__double_as_longlong(b));
186
+ }
187
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float eq_mask(const float& a, const float& b) {
188
+ return __int_as_float(a == b ? 0xffffffffu : 0u);
189
+ }
190
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double eq_mask(const double& a, const double& b) {
191
+ return __longlong_as_double(a == b ? 0xffffffffffffffffull : 0ull);
192
+ }
193
+
194
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float lt_mask(const float& a, const float& b) {
195
+ return __int_as_float(a < b ? 0xffffffffu : 0u);
196
+ }
197
+
198
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double lt_mask(const double& a, const double& b) {
199
+ return __longlong_as_double(a < b ? 0xffffffffffffffffull : 0ull);
200
+ }
201
+
202
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float le_mask(const float& a, const float& b) {
203
+ return __int_as_float(a <= b ? 0xffffffffu : 0u);
204
+ }
205
+
206
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double le_mask(const double& a, const double& b) {
207
+ return __longlong_as_double(a <= b ? 0xffffffffffffffffull : 0ull);
208
+ }
209
+
210
+ template <>
211
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pand<float4>(const float4& a, const float4& b) {
212
+ return make_float4(bitwise_and(a.x, b.x), bitwise_and(a.y, b.y), bitwise_and(a.z, b.z), bitwise_and(a.w, b.w));
213
+ }
214
+ template <>
215
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pand<double2>(const double2& a, const double2& b) {
216
+ return make_double2(bitwise_and(a.x, b.x), bitwise_and(a.y, b.y));
217
+ }
218
+
219
+ template <>
220
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 por<float4>(const float4& a, const float4& b) {
221
+ return make_float4(bitwise_or(a.x, b.x), bitwise_or(a.y, b.y), bitwise_or(a.z, b.z), bitwise_or(a.w, b.w));
222
+ }
223
+ template <>
224
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 por<double2>(const double2& a, const double2& b) {
225
+ return make_double2(bitwise_or(a.x, b.x), bitwise_or(a.y, b.y));
226
+ }
227
+
228
+ template <>
229
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pxor<float4>(const float4& a, const float4& b) {
230
+ return make_float4(bitwise_xor(a.x, b.x), bitwise_xor(a.y, b.y), bitwise_xor(a.z, b.z), bitwise_xor(a.w, b.w));
231
+ }
232
+ template <>
233
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pxor<double2>(const double2& a, const double2& b) {
234
+ return make_double2(bitwise_xor(a.x, b.x), bitwise_xor(a.y, b.y));
235
+ }
236
+
237
+ template <>
238
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pandnot<float4>(const float4& a, const float4& b) {
239
+ return make_float4(bitwise_andnot(a.x, b.x), bitwise_andnot(a.y, b.y), bitwise_andnot(a.z, b.z),
240
+ bitwise_andnot(a.w, b.w));
241
+ }
242
+ template <>
243
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pandnot<double2>(const double2& a, const double2& b) {
244
+ return make_double2(bitwise_andnot(a.x, b.x), bitwise_andnot(a.y, b.y));
245
+ }
246
+
247
+ template <>
248
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pcmp_eq<float4>(const float4& a, const float4& b) {
249
+ return make_float4(eq_mask(a.x, b.x), eq_mask(a.y, b.y), eq_mask(a.z, b.z), eq_mask(a.w, b.w));
250
+ }
251
+ template <>
252
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pcmp_lt<float4>(const float4& a, const float4& b) {
253
+ return make_float4(lt_mask(a.x, b.x), lt_mask(a.y, b.y), lt_mask(a.z, b.z), lt_mask(a.w, b.w));
254
+ }
255
+ template <>
256
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pcmp_le<float4>(const float4& a, const float4& b) {
257
+ return make_float4(le_mask(a.x, b.x), le_mask(a.y, b.y), le_mask(a.z, b.z), le_mask(a.w, b.w));
258
+ }
259
+ template <>
260
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pcmp_eq<double2>(const double2& a, const double2& b) {
261
+ return make_double2(eq_mask(a.x, b.x), eq_mask(a.y, b.y));
262
+ }
263
+ template <>
264
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pcmp_lt<double2>(const double2& a, const double2& b) {
265
+ return make_double2(lt_mask(a.x, b.x), lt_mask(a.y, b.y));
266
+ }
267
+ template <>
268
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pcmp_le<double2>(const double2& a, const double2& b) {
269
+ return make_double2(le_mask(a.x, b.x), le_mask(a.y, b.y));
270
+ }
271
+ #endif // EIGEN_HAS_GPU_DEVICE_FUNCTIONS
272
+
273
+ template <>
274
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 plset<float4>(const float& a) {
275
+ return make_float4(a, a + 1, a + 2, a + 3);
276
+ }
277
+ template <>
278
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 plset<double2>(const double& a) {
279
+ return make_double2(a, a + 1);
280
+ }
281
+
282
+ template <>
283
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 padd<float4>(const float4& a, const float4& b) {
284
+ return make_float4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w);
285
+ }
286
+ template <>
287
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 padd<double2>(const double2& a, const double2& b) {
288
+ return make_double2(a.x + b.x, a.y + b.y);
289
+ }
290
+
291
+ template <>
292
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 psub<float4>(const float4& a, const float4& b) {
293
+ return make_float4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w);
294
+ }
295
+ template <>
296
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 psub<double2>(const double2& a, const double2& b) {
297
+ return make_double2(a.x - b.x, a.y - b.y);
298
+ }
299
+
300
+ template <>
301
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pnegate(const float4& a) {
302
+ return make_float4(-a.x, -a.y, -a.z, -a.w);
303
+ }
304
+ template <>
305
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pnegate(const double2& a) {
306
+ return make_double2(-a.x, -a.y);
307
+ }
308
+
309
+ template <>
310
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pconj(const float4& a) {
311
+ return a;
312
+ }
313
+ template <>
314
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pconj(const double2& a) {
315
+ return a;
316
+ }
317
+
318
+ template <>
319
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pmul<float4>(const float4& a, const float4& b) {
320
+ return make_float4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w);
321
+ }
322
+ template <>
323
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pmul<double2>(const double2& a, const double2& b) {
324
+ return make_double2(a.x * b.x, a.y * b.y);
325
+ }
326
+
327
+ template <>
328
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pdiv<float4>(const float4& a, const float4& b) {
329
+ return make_float4(a.x / b.x, a.y / b.y, a.z / b.z, a.w / b.w);
330
+ }
331
+ template <>
332
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pdiv<double2>(const double2& a, const double2& b) {
333
+ return make_double2(a.x / b.x, a.y / b.y);
334
+ }
335
+
336
+ template <>
337
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pmin<float4>(const float4& a, const float4& b) {
338
+ return make_float4(fminf(a.x, b.x), fminf(a.y, b.y), fminf(a.z, b.z), fminf(a.w, b.w));
339
+ }
340
+ template <>
341
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pmin<double2>(const double2& a, const double2& b) {
342
+ return make_double2(fmin(a.x, b.x), fmin(a.y, b.y));
343
+ }
344
+
345
+ template <>
346
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pmax<float4>(const float4& a, const float4& b) {
347
+ return make_float4(fmaxf(a.x, b.x), fmaxf(a.y, b.y), fmaxf(a.z, b.z), fmaxf(a.w, b.w));
348
+ }
349
+ template <>
350
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pmax<double2>(const double2& a, const double2& b) {
351
+ return make_double2(fmax(a.x, b.x), fmax(a.y, b.y));
352
+ }
353
+
354
+ template <>
355
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pload<float4>(const float* from) {
356
+ return *reinterpret_cast<const float4*>(from);
357
+ }
358
+
359
+ template <>
360
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pload<double2>(const double* from) {
361
+ return *reinterpret_cast<const double2*>(from);
362
+ }
363
+
364
+ template <>
365
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 ploadu<float4>(const float* from) {
366
+ return make_float4(from[0], from[1], from[2], from[3]);
367
+ }
368
+ template <>
369
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 ploadu<double2>(const double* from) {
370
+ return make_double2(from[0], from[1]);
371
+ }
372
+
373
+ template <>
374
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 ploaddup<float4>(const float* from) {
375
+ return make_float4(from[0], from[0], from[1], from[1]);
376
+ }
377
+ template <>
378
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 ploaddup<double2>(const double* from) {
379
+ return make_double2(from[0], from[0]);
380
+ }
381
+
382
+ template <>
383
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore<float>(float* to, const float4& from) {
384
+ *reinterpret_cast<float4*>(to) = from;
385
+ }
386
+
387
+ template <>
388
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore<double>(double* to, const double2& from) {
389
+ *reinterpret_cast<double2*>(to) = from;
390
+ }
391
+
392
+ template <>
393
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const float4& from) {
394
+ to[0] = from.x;
395
+ to[1] = from.y;
396
+ to[2] = from.z;
397
+ to[3] = from.w;
398
+ }
399
+
400
+ template <>
401
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const double2& from) {
402
+ to[0] = from.x;
403
+ to[1] = from.y;
404
+ }
405
+
406
+ template <>
407
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Aligned>(const float* from) {
408
+ #if defined(EIGEN_GPU_HAS_LDG)
409
+ return __ldg(reinterpret_cast<const float4*>(from));
410
+ #else
411
+ return make_float4(from[0], from[1], from[2], from[3]);
412
+ #endif
413
+ }
414
+ template <>
415
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro<double2, Aligned>(const double* from) {
416
+ #if defined(EIGEN_GPU_HAS_LDG)
417
+ return __ldg(reinterpret_cast<const double2*>(from));
418
+ #else
419
+ return make_double2(from[0], from[1]);
420
+ #endif
421
+ }
422
+
423
+ template <>
424
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Unaligned>(const float* from) {
425
+ #if defined(EIGEN_GPU_HAS_LDG)
426
+ return make_float4(__ldg(from + 0), __ldg(from + 1), __ldg(from + 2), __ldg(from + 3));
427
+ #else
428
+ return make_float4(from[0], from[1], from[2], from[3]);
429
+ #endif
430
+ }
431
+ template <>
432
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro<double2, Unaligned>(const double* from) {
433
+ #if defined(EIGEN_GPU_HAS_LDG)
434
+ return make_double2(__ldg(from + 0), __ldg(from + 1));
435
+ #else
436
+ return make_double2(from[0], from[1]);
437
+ #endif
438
+ }
439
+
440
+ template <>
441
+ EIGEN_DEVICE_FUNC inline float4 pgather<float, float4>(const float* from, Index stride) {
442
+ return make_float4(from[0 * stride], from[1 * stride], from[2 * stride], from[3 * stride]);
443
+ }
444
+
445
+ template <>
446
+ EIGEN_DEVICE_FUNC inline double2 pgather<double, double2>(const double* from, Index stride) {
447
+ return make_double2(from[0 * stride], from[1 * stride]);
448
+ }
449
+
450
+ template <>
451
+ EIGEN_DEVICE_FUNC inline void pscatter<float, float4>(float* to, const float4& from, Index stride) {
452
+ to[stride * 0] = from.x;
453
+ to[stride * 1] = from.y;
454
+ to[stride * 2] = from.z;
455
+ to[stride * 3] = from.w;
456
+ }
457
+ template <>
458
+ EIGEN_DEVICE_FUNC inline void pscatter<double, double2>(double* to, const double2& from, Index stride) {
459
+ to[stride * 0] = from.x;
460
+ to[stride * 1] = from.y;
461
+ }
462
+
463
+ template <>
464
+ EIGEN_DEVICE_FUNC inline float pfirst<float4>(const float4& a) {
465
+ return a.x;
466
+ }
467
+ template <>
468
+ EIGEN_DEVICE_FUNC inline double pfirst<double2>(const double2& a) {
469
+ return a.x;
470
+ }
471
+
472
+ template <>
473
+ EIGEN_DEVICE_FUNC inline float predux<float4>(const float4& a) {
474
+ return a.x + a.y + a.z + a.w;
475
+ }
476
+ template <>
477
+ EIGEN_DEVICE_FUNC inline double predux<double2>(const double2& a) {
478
+ return a.x + a.y;
479
+ }
480
+
481
+ template <>
482
+ EIGEN_DEVICE_FUNC inline float predux_max<float4>(const float4& a) {
483
+ return fmaxf(fmaxf(a.x, a.y), fmaxf(a.z, a.w));
484
+ }
485
+ template <>
486
+ EIGEN_DEVICE_FUNC inline double predux_max<double2>(const double2& a) {
487
+ return fmax(a.x, a.y);
488
+ }
489
+
490
+ template <>
491
+ EIGEN_DEVICE_FUNC inline float predux_min<float4>(const float4& a) {
492
+ return fminf(fminf(a.x, a.y), fminf(a.z, a.w));
493
+ }
494
+ template <>
495
+ EIGEN_DEVICE_FUNC inline double predux_min<double2>(const double2& a) {
496
+ return fmin(a.x, a.y);
497
+ }
498
+
499
+ template <>
500
+ EIGEN_DEVICE_FUNC inline float predux_mul<float4>(const float4& a) {
501
+ return a.x * a.y * a.z * a.w;
502
+ }
503
+ template <>
504
+ EIGEN_DEVICE_FUNC inline double predux_mul<double2>(const double2& a) {
505
+ return a.x * a.y;
506
+ }
507
+
508
+ template <>
509
+ EIGEN_DEVICE_FUNC inline float4 pabs<float4>(const float4& a) {
510
+ return make_float4(fabsf(a.x), fabsf(a.y), fabsf(a.z), fabsf(a.w));
511
+ }
512
+ template <>
513
+ EIGEN_DEVICE_FUNC inline double2 pabs<double2>(const double2& a) {
514
+ return make_double2(fabs(a.x), fabs(a.y));
515
+ }
516
+
517
+ template <>
518
+ EIGEN_DEVICE_FUNC inline float4 pfloor<float4>(const float4& a) {
519
+ return make_float4(floorf(a.x), floorf(a.y), floorf(a.z), floorf(a.w));
520
+ }
521
+ template <>
522
+ EIGEN_DEVICE_FUNC inline double2 pfloor<double2>(const double2& a) {
523
+ return make_double2(floor(a.x), floor(a.y));
524
+ }
525
+
526
+ template <>
527
+ EIGEN_DEVICE_FUNC inline float4 pceil<float4>(const float4& a) {
528
+ return make_float4(ceilf(a.x), ceilf(a.y), ceilf(a.z), ceilf(a.w));
529
+ }
530
+ template <>
531
+ EIGEN_DEVICE_FUNC inline double2 pceil<double2>(const double2& a) {
532
+ return make_double2(ceil(a.x), ceil(a.y));
533
+ }
534
+
535
+ template <>
536
+ EIGEN_DEVICE_FUNC inline float4 print<float4>(const float4& a) {
537
+ return make_float4(rintf(a.x), rintf(a.y), rintf(a.z), rintf(a.w));
538
+ }
539
+ template <>
540
+ EIGEN_DEVICE_FUNC inline double2 print<double2>(const double2& a) {
541
+ return make_double2(rint(a.x), rint(a.y));
542
+ }
543
+
544
+ template <>
545
+ EIGEN_DEVICE_FUNC inline float4 ptrunc<float4>(const float4& a) {
546
+ return make_float4(truncf(a.x), truncf(a.y), truncf(a.z), truncf(a.w));
547
+ }
548
+ template <>
549
+ EIGEN_DEVICE_FUNC inline double2 ptrunc<double2>(const double2& a) {
550
+ return make_double2(trunc(a.x), trunc(a.y));
551
+ }
552
+
553
+ EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<float4, 4>& kernel) {
554
+ float tmp = kernel.packet[0].y;
555
+ kernel.packet[0].y = kernel.packet[1].x;
556
+ kernel.packet[1].x = tmp;
557
+
558
+ tmp = kernel.packet[0].z;
559
+ kernel.packet[0].z = kernel.packet[2].x;
560
+ kernel.packet[2].x = tmp;
561
+
562
+ tmp = kernel.packet[0].w;
563
+ kernel.packet[0].w = kernel.packet[3].x;
564
+ kernel.packet[3].x = tmp;
565
+
566
+ tmp = kernel.packet[1].z;
567
+ kernel.packet[1].z = kernel.packet[2].y;
568
+ kernel.packet[2].y = tmp;
569
+
570
+ tmp = kernel.packet[1].w;
571
+ kernel.packet[1].w = kernel.packet[3].y;
572
+ kernel.packet[3].y = tmp;
573
+
574
+ tmp = kernel.packet[2].w;
575
+ kernel.packet[2].w = kernel.packet[3].z;
576
+ kernel.packet[3].z = tmp;
577
+ }
578
+
579
+ EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<double2, 2>& kernel) {
580
+ double tmp = kernel.packet[0].y;
581
+ kernel.packet[0].y = kernel.packet[1].x;
582
+ kernel.packet[1].x = tmp;
583
+ }
584
+
585
+ #endif // defined(EIGEN_GPUCC) && defined(EIGEN_USE_GPU)
586
+
587
+ // Half-packet functions are not available on the host for CUDA 9.0-9.2, only
588
+ // on device. There is no benefit to using them on the host anyways, since they are
589
+ // emulated.
590
+ #if (defined(EIGEN_HAS_CUDA_FP16) || defined(EIGEN_HAS_HIP_FP16)) && defined(EIGEN_GPU_COMPILE_PHASE)
591
+
592
+ typedef ulonglong2 Packet4h2;
593
+ template <>
594
+ struct unpacket_traits<Packet4h2> {
595
+ typedef Eigen::half type;
596
+ enum {
597
+ size = 8,
598
+ alignment = Aligned16,
599
+ vectorizable = true,
600
+ masked_load_available = false,
601
+ masked_store_available = false
602
+ };
603
+ typedef Packet4h2 half;
604
+ };
605
+ template <>
606
+ struct is_arithmetic<Packet4h2> {
607
+ enum { value = true };
608
+ };
609
+
610
+ template <>
611
+ struct unpacket_traits<half2> {
612
+ typedef Eigen::half type;
613
+ enum {
614
+ size = 2,
615
+ alignment = Aligned16,
616
+ vectorizable = true,
617
+ masked_load_available = false,
618
+ masked_store_available = false
619
+ };
620
+ typedef half2 half;
621
+ };
622
+ template <>
623
+ struct is_arithmetic<half2> {
624
+ enum { value = true };
625
+ };
626
+
627
+ template <>
628
+ struct packet_traits<Eigen::half> : default_packet_traits {
629
+ typedef Packet4h2 type;
630
+ typedef Packet4h2 half;
631
+ enum {
632
+ Vectorizable = 1,
633
+ AlignedOnScalar = 1,
634
+ size = 8,
635
+ HasAdd = 1,
636
+ HasSub = 1,
637
+ HasMul = 1,
638
+ HasDiv = 1,
639
+ HasSqrt = 1,
640
+ HasRsqrt = 1,
641
+ HasExp = 1,
642
+ HasExpm1 = 1,
643
+ HasLog = 1,
644
+ HasLog1p = 1
645
+ };
646
+ };
647
+
648
+ template <>
649
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pset1<half2>(const Eigen::half& from) {
650
+ return __half2half2(from);
651
+ }
652
+
653
+ template <>
654
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pset1<Packet4h2>(const Eigen::half& from) {
655
+ Packet4h2 r;
656
+ half2* p_alias = reinterpret_cast<half2*>(&r);
657
+ p_alias[0] = pset1<half2>(from);
658
+ p_alias[1] = pset1<half2>(from);
659
+ p_alias[2] = pset1<half2>(from);
660
+ p_alias[3] = pset1<half2>(from);
661
+ return r;
662
+ }
663
+
664
+ namespace {
665
+
666
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pload(const Eigen::half* from) {
667
+ return *reinterpret_cast<const half2*>(from);
668
+ }
669
+
670
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ploadu(const Eigen::half* from) { return __halves2half2(from[0], from[1]); }
671
+
672
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ploaddup(const Eigen::half* from) {
673
+ return __halves2half2(from[0], from[0]);
674
+ }
675
+
676
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore(Eigen::half* to, const half2& from) {
677
+ *reinterpret_cast<half2*>(to) = from;
678
+ }
679
+
680
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu(Eigen::half* to, const half2& from) {
681
+ to[0] = __low2half(from);
682
+ to[1] = __high2half(from);
683
+ }
684
+
685
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro_aligned(const Eigen::half* from) {
686
+ #if defined(EIGEN_GPU_HAS_LDG)
687
+ // Input is guaranteed to be properly aligned.
688
+ return __ldg(reinterpret_cast<const half2*>(from));
689
+ #else
690
+ return __halves2half2(*(from + 0), *(from + 1));
691
+ #endif
692
+ }
693
+
694
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro_unaligned(const Eigen::half* from) {
695
+ #if defined(EIGEN_GPU_HAS_LDG)
696
+ return __halves2half2(__ldg(from + 0), __ldg(from + 1));
697
+ #else
698
+ return __halves2half2(*(from + 0), *(from + 1));
699
+ #endif
700
+ }
701
+
702
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pgather(const Eigen::half* from, Index stride) {
703
+ return __halves2half2(from[0 * stride], from[1 * stride]);
704
+ }
705
+
706
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter(Eigen::half* to, const half2& from, Index stride) {
707
+ to[stride * 0] = __low2half(from);
708
+ to[stride * 1] = __high2half(from);
709
+ }
710
+
711
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half pfirst(const half2& a) { return __low2half(a); }
712
+
713
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pabs(const half2& a) {
714
+ half a1 = __low2half(a);
715
+ half a2 = __high2half(a);
716
+ half result1 = half_impl::raw_uint16_to_half(a1.x & 0x7FFF);
717
+ half result2 = half_impl::raw_uint16_to_half(a2.x & 0x7FFF);
718
+ return __halves2half2(result1, result2);
719
+ }
720
+
721
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ptrue(const half2& /*a*/) {
722
+ half true_half = half_impl::raw_uint16_to_half(0xffffu);
723
+ return pset1<half2>(true_half);
724
+ }
725
+
726
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pzero(const half2& /*a*/) {
727
+ half false_half = half_impl::raw_uint16_to_half(0x0000u);
728
+ return pset1<half2>(false_half);
729
+ }
730
+
731
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<half2, 2>& kernel) {
732
+ __half a1 = __low2half(kernel.packet[0]);
733
+ __half a2 = __high2half(kernel.packet[0]);
734
+ __half b1 = __low2half(kernel.packet[1]);
735
+ __half b2 = __high2half(kernel.packet[1]);
736
+ kernel.packet[0] = __halves2half2(a1, b1);
737
+ kernel.packet[1] = __halves2half2(a2, b2);
738
+ }
739
+
740
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plset(const Eigen::half& a) {
741
+ #if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
742
+ return __halves2half2(a, __hadd(a, __float2half(1.0f)));
743
+ #else
744
+ float f = __half2float(a) + 1.0f;
745
+ return __halves2half2(a, __float2half(f));
746
+ #endif
747
+ }
748
+
749
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pselect(const half2& mask, const half2& a, const half2& b) {
750
+ half mask_low = __low2half(mask);
751
+ half mask_high = __high2half(mask);
752
+ half result_low = mask_low == half(0) ? __low2half(b) : __low2half(a);
753
+ half result_high = mask_high == half(0) ? __high2half(b) : __high2half(a);
754
+ return __halves2half2(result_low, result_high);
755
+ }
756
+
757
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pcmp_eq(const half2& a, const half2& b) {
758
+ half true_half = half_impl::raw_uint16_to_half(0xffffu);
759
+ half false_half = half_impl::raw_uint16_to_half(0x0000u);
760
+ half a1 = __low2half(a);
761
+ half a2 = __high2half(a);
762
+ half b1 = __low2half(b);
763
+ half b2 = __high2half(b);
764
+ half eq1 = __half2float(a1) == __half2float(b1) ? true_half : false_half;
765
+ half eq2 = __half2float(a2) == __half2float(b2) ? true_half : false_half;
766
+ return __halves2half2(eq1, eq2);
767
+ }
768
+
769
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pcmp_lt(const half2& a, const half2& b) {
770
+ half true_half = half_impl::raw_uint16_to_half(0xffffu);
771
+ half false_half = half_impl::raw_uint16_to_half(0x0000u);
772
+ half a1 = __low2half(a);
773
+ half a2 = __high2half(a);
774
+ half b1 = __low2half(b);
775
+ half b2 = __high2half(b);
776
+ half eq1 = __half2float(a1) < __half2float(b1) ? true_half : false_half;
777
+ half eq2 = __half2float(a2) < __half2float(b2) ? true_half : false_half;
778
+ return __halves2half2(eq1, eq2);
779
+ }
780
+
781
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pcmp_le(const half2& a, const half2& b) {
782
+ half true_half = half_impl::raw_uint16_to_half(0xffffu);
783
+ half false_half = half_impl::raw_uint16_to_half(0x0000u);
784
+ half a1 = __low2half(a);
785
+ half a2 = __high2half(a);
786
+ half b1 = __low2half(b);
787
+ half b2 = __high2half(b);
788
+ half eq1 = __half2float(a1) <= __half2float(b1) ? true_half : false_half;
789
+ half eq2 = __half2float(a2) <= __half2float(b2) ? true_half : false_half;
790
+ return __halves2half2(eq1, eq2);
791
+ }
792
+
793
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pand(const half2& a, const half2& b) {
794
+ half a1 = __low2half(a);
795
+ half a2 = __high2half(a);
796
+ half b1 = __low2half(b);
797
+ half b2 = __high2half(b);
798
+ half result1 = half_impl::raw_uint16_to_half(a1.x & b1.x);
799
+ half result2 = half_impl::raw_uint16_to_half(a2.x & b2.x);
800
+ return __halves2half2(result1, result2);
801
+ }
802
+
803
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 por(const half2& a, const half2& b) {
804
+ half a1 = __low2half(a);
805
+ half a2 = __high2half(a);
806
+ half b1 = __low2half(b);
807
+ half b2 = __high2half(b);
808
+ half result1 = half_impl::raw_uint16_to_half(a1.x | b1.x);
809
+ half result2 = half_impl::raw_uint16_to_half(a2.x | b2.x);
810
+ return __halves2half2(result1, result2);
811
+ }
812
+
813
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pxor(const half2& a, const half2& b) {
814
+ half a1 = __low2half(a);
815
+ half a2 = __high2half(a);
816
+ half b1 = __low2half(b);
817
+ half b2 = __high2half(b);
818
+ half result1 = half_impl::raw_uint16_to_half(a1.x ^ b1.x);
819
+ half result2 = half_impl::raw_uint16_to_half(a2.x ^ b2.x);
820
+ return __halves2half2(result1, result2);
821
+ }
822
+
823
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pandnot(const half2& a, const half2& b) {
824
+ half a1 = __low2half(a);
825
+ half a2 = __high2half(a);
826
+ half b1 = __low2half(b);
827
+ half b2 = __high2half(b);
828
+ half result1 = half_impl::raw_uint16_to_half(a1.x & ~b1.x);
829
+ half result2 = half_impl::raw_uint16_to_half(a2.x & ~b2.x);
830
+ return __halves2half2(result1, result2);
831
+ }
832
+
833
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 padd(const half2& a, const half2& b) {
834
+ #if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
835
+ return __hadd2(a, b);
836
+ #else
837
+ float a1 = __low2float(a);
838
+ float a2 = __high2float(a);
839
+ float b1 = __low2float(b);
840
+ float b2 = __high2float(b);
841
+ float r1 = a1 + b1;
842
+ float r2 = a2 + b2;
843
+ return __floats2half2_rn(r1, r2);
844
+ #endif
845
+ }
846
+
847
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 psub(const half2& a, const half2& b) {
848
+ #if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
849
+ return __hsub2(a, b);
850
+ #else
851
+ float a1 = __low2float(a);
852
+ float a2 = __high2float(a);
853
+ float b1 = __low2float(b);
854
+ float b2 = __high2float(b);
855
+ float r1 = a1 - b1;
856
+ float r2 = a2 - b2;
857
+ return __floats2half2_rn(r1, r2);
858
+ #endif
859
+ }
860
+
861
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pnegate(const half2& a) {
862
+ #if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
863
+ return __hneg2(a);
864
+ #else
865
+ float a1 = __low2float(a);
866
+ float a2 = __high2float(a);
867
+ return __floats2half2_rn(-a1, -a2);
868
+ #endif
869
+ }
870
+
871
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pconj(const half2& a) { return a; }
872
+
873
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmul(const half2& a, const half2& b) {
874
+ #if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
875
+ return __hmul2(a, b);
876
+ #else
877
+ float a1 = __low2float(a);
878
+ float a2 = __high2float(a);
879
+ float b1 = __low2float(b);
880
+ float b2 = __high2float(b);
881
+ float r1 = a1 * b1;
882
+ float r2 = a2 * b2;
883
+ return __floats2half2_rn(r1, r2);
884
+ #endif
885
+ }
886
+
887
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmadd(const half2& a, const half2& b, const half2& c) {
888
+ #if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
889
+ return __hfma2(a, b, c);
890
+ #else
891
+ float a1 = __low2float(a);
892
+ float a2 = __high2float(a);
893
+ float b1 = __low2float(b);
894
+ float b2 = __high2float(b);
895
+ float c1 = __low2float(c);
896
+ float c2 = __high2float(c);
897
+ float r1 = a1 * b1 + c1;
898
+ float r2 = a2 * b2 + c2;
899
+ return __floats2half2_rn(r1, r2);
900
+ #endif
901
+ }
902
+
903
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pdiv(const half2& a, const half2& b) {
904
+ #if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
905
+ return __h2div(a, b);
906
+ #else
907
+ float a1 = __low2float(a);
908
+ float a2 = __high2float(a);
909
+ float b1 = __low2float(b);
910
+ float b2 = __high2float(b);
911
+ float r1 = a1 / b1;
912
+ float r2 = a2 / b2;
913
+ return __floats2half2_rn(r1, r2);
914
+ #endif
915
+ }
916
+
917
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmin(const half2& a, const half2& b) {
918
+ float a1 = __low2float(a);
919
+ float a2 = __high2float(a);
920
+ float b1 = __low2float(b);
921
+ float b2 = __high2float(b);
922
+ __half r1 = a1 < b1 ? __low2half(a) : __low2half(b);
923
+ __half r2 = a2 < b2 ? __high2half(a) : __high2half(b);
924
+ return __halves2half2(r1, r2);
925
+ }
926
+
927
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmax(const half2& a, const half2& b) {
928
+ float a1 = __low2float(a);
929
+ float a2 = __high2float(a);
930
+ float b1 = __low2float(b);
931
+ float b2 = __high2float(b);
932
+ __half r1 = a1 > b1 ? __low2half(a) : __low2half(b);
933
+ __half r2 = a2 > b2 ? __high2half(a) : __high2half(b);
934
+ return __halves2half2(r1, r2);
935
+ }
936
+
937
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux(const half2& a) {
938
+ #if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
939
+ return __hadd(__low2half(a), __high2half(a));
940
+ #else
941
+ float a1 = __low2float(a);
942
+ float a2 = __high2float(a);
943
+ return Eigen::half(__float2half(a1 + a2));
944
+ #endif
945
+ }
946
+
947
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_max(const half2& a) {
948
+ #if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
949
+ __half first = __low2half(a);
950
+ __half second = __high2half(a);
951
+ return __hgt(first, second) ? first : second;
952
+ #else
953
+ float a1 = __low2float(a);
954
+ float a2 = __high2float(a);
955
+ return a1 > a2 ? __low2half(a) : __high2half(a);
956
+ #endif
957
+ }
958
+
959
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_min(const half2& a) {
960
+ #if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
961
+ __half first = __low2half(a);
962
+ __half second = __high2half(a);
963
+ return __hlt(first, second) ? first : second;
964
+ #else
965
+ float a1 = __low2float(a);
966
+ float a2 = __high2float(a);
967
+ return a1 < a2 ? __low2half(a) : __high2half(a);
968
+ #endif
969
+ }
970
+
971
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_mul(const half2& a) {
972
+ #if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
973
+ return __hmul(__low2half(a), __high2half(a));
974
+ #else
975
+ float a1 = __low2float(a);
976
+ float a2 = __high2float(a);
977
+ return Eigen::half(__float2half(a1 * a2));
978
+ #endif
979
+ }
980
+
981
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plog1p(const half2& a) {
982
+ float a1 = __low2float(a);
983
+ float a2 = __high2float(a);
984
+ float r1 = log1pf(a1);
985
+ float r2 = log1pf(a2);
986
+ return __floats2half2_rn(r1, r2);
987
+ }
988
+
989
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pexpm1(const half2& a) {
990
+ float a1 = __low2float(a);
991
+ float a2 = __high2float(a);
992
+ float r1 = expm1f(a1);
993
+ float r2 = expm1f(a2);
994
+ return __floats2half2_rn(r1, r2);
995
+ }
996
+
997
+ #if (EIGEN_CUDA_SDK_VER >= 80000 && defined(EIGEN_CUDA_HAS_FP16_ARITHMETIC)) || defined(EIGEN_HIP_DEVICE_COMPILE)
998
+
999
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plog(const half2& a) { return h2log(a); }
1000
+
1001
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pexp(const half2& a) { return h2exp(a); }
1002
+
1003
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 psqrt(const half2& a) { return h2sqrt(a); }
1004
+
1005
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 prsqrt(const half2& a) { return h2rsqrt(a); }
1006
+
1007
+ #else
1008
+
1009
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plog(const half2& a) {
1010
+ float a1 = __low2float(a);
1011
+ float a2 = __high2float(a);
1012
+ float r1 = logf(a1);
1013
+ float r2 = logf(a2);
1014
+ return __floats2half2_rn(r1, r2);
1015
+ }
1016
+
1017
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pexp(const half2& a) {
1018
+ float a1 = __low2float(a);
1019
+ float a2 = __high2float(a);
1020
+ float r1 = expf(a1);
1021
+ float r2 = expf(a2);
1022
+ return __floats2half2_rn(r1, r2);
1023
+ }
1024
+
1025
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 psqrt(const half2& a) {
1026
+ float a1 = __low2float(a);
1027
+ float a2 = __high2float(a);
1028
+ float r1 = sqrtf(a1);
1029
+ float r2 = sqrtf(a2);
1030
+ return __floats2half2_rn(r1, r2);
1031
+ }
1032
+
1033
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 prsqrt(const half2& a) {
1034
+ float a1 = __low2float(a);
1035
+ float a2 = __high2float(a);
1036
+ float r1 = rsqrtf(a1);
1037
+ float r2 = rsqrtf(a2);
1038
+ return __floats2half2_rn(r1, r2);
1039
+ }
1040
+ #endif
1041
+ } // namespace
1042
+
1043
+ template <>
1044
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pload<Packet4h2>(const Eigen::half* from) {
1045
+ return *reinterpret_cast<const Packet4h2*>(from);
1046
+ }
1047
+
1048
+ // unaligned load;
1049
+ template <>
1050
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 ploadu<Packet4h2>(const Eigen::half* from) {
1051
+ Packet4h2 r;
1052
+ half2* p_alias = reinterpret_cast<half2*>(&r);
1053
+ p_alias[0] = ploadu(from + 0);
1054
+ p_alias[1] = ploadu(from + 2);
1055
+ p_alias[2] = ploadu(from + 4);
1056
+ p_alias[3] = ploadu(from + 6);
1057
+ return r;
1058
+ }
1059
+
1060
+ template <>
1061
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 ploaddup<Packet4h2>(const Eigen::half* from) {
1062
+ Packet4h2 r;
1063
+ half2* p_alias = reinterpret_cast<half2*>(&r);
1064
+ p_alias[0] = ploaddup(from + 0);
1065
+ p_alias[1] = ploaddup(from + 1);
1066
+ p_alias[2] = ploaddup(from + 2);
1067
+ p_alias[3] = ploaddup(from + 3);
1068
+ return r;
1069
+ }
1070
+
1071
+ template <>
1072
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore<Eigen::half>(Eigen::half* to, const Packet4h2& from) {
1073
+ *reinterpret_cast<Packet4h2*>(to) = from;
1074
+ }
1075
+
1076
+ template <>
1077
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(Eigen::half* to, const Packet4h2& from) {
1078
+ const half2* from_alias = reinterpret_cast<const half2*>(&from);
1079
+ pstoreu(to + 0, from_alias[0]);
1080
+ pstoreu(to + 2, from_alias[1]);
1081
+ pstoreu(to + 4, from_alias[2]);
1082
+ pstoreu(to + 6, from_alias[3]);
1083
+ }
1084
+
1085
+ template <>
1086
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet4h2 ploadt_ro<Packet4h2, Aligned>(const Eigen::half* from) {
1087
+ #if defined(EIGEN_GPU_HAS_LDG)
1088
+ Packet4h2 r;
1089
+ r = __ldg(reinterpret_cast<const Packet4h2*>(from));
1090
+ return r;
1091
+ #else
1092
+ Packet4h2 r;
1093
+ half2* r_alias = reinterpret_cast<half2*>(&r);
1094
+ r_alias[0] = ploadt_ro_aligned(from + 0);
1095
+ r_alias[1] = ploadt_ro_aligned(from + 2);
1096
+ r_alias[2] = ploadt_ro_aligned(from + 4);
1097
+ r_alias[3] = ploadt_ro_aligned(from + 6);
1098
+ return r;
1099
+ #endif
1100
+ }
1101
+
1102
+ template <>
1103
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet4h2 ploadt_ro<Packet4h2, Unaligned>(const Eigen::half* from) {
1104
+ Packet4h2 r;
1105
+ half2* r_alias = reinterpret_cast<half2*>(&r);
1106
+ r_alias[0] = ploadt_ro_unaligned(from + 0);
1107
+ r_alias[1] = ploadt_ro_unaligned(from + 2);
1108
+ r_alias[2] = ploadt_ro_unaligned(from + 4);
1109
+ r_alias[3] = ploadt_ro_unaligned(from + 6);
1110
+ return r;
1111
+ }
1112
+
1113
+ template <>
1114
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pgather<Eigen::half, Packet4h2>(const Eigen::half* from, Index stride) {
1115
+ Packet4h2 r;
1116
+ half2* p_alias = reinterpret_cast<half2*>(&r);
1117
+ p_alias[0] = __halves2half2(from[0 * stride], from[1 * stride]);
1118
+ p_alias[1] = __halves2half2(from[2 * stride], from[3 * stride]);
1119
+ p_alias[2] = __halves2half2(from[4 * stride], from[5 * stride]);
1120
+ p_alias[3] = __halves2half2(from[6 * stride], from[7 * stride]);
1121
+ return r;
1122
+ }
1123
+
1124
+ template <>
1125
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<Eigen::half, Packet4h2>(Eigen::half* to, const Packet4h2& from,
1126
+ Index stride) {
1127
+ const half2* from_alias = reinterpret_cast<const half2*>(&from);
1128
+ pscatter(to + stride * 0, from_alias[0], stride);
1129
+ pscatter(to + stride * 2, from_alias[1], stride);
1130
+ pscatter(to + stride * 4, from_alias[2], stride);
1131
+ pscatter(to + stride * 6, from_alias[3], stride);
1132
+ }
1133
+
1134
+ template <>
1135
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half pfirst<Packet4h2>(const Packet4h2& a) {
1136
+ return pfirst(*(reinterpret_cast<const half2*>(&a)));
1137
+ }
1138
+
1139
+ template <>
1140
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pabs<Packet4h2>(const Packet4h2& a) {
1141
+ Packet4h2 r;
1142
+ half2* p_alias = reinterpret_cast<half2*>(&r);
1143
+ const half2* a_alias = reinterpret_cast<const half2*>(&a);
1144
+ p_alias[0] = pabs(a_alias[0]);
1145
+ p_alias[1] = pabs(a_alias[1]);
1146
+ p_alias[2] = pabs(a_alias[2]);
1147
+ p_alias[3] = pabs(a_alias[3]);
1148
+ return r;
1149
+ }
1150
+
1151
+ template <>
1152
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 ptrue<Packet4h2>(const Packet4h2& /*a*/) {
1153
+ half true_half = half_impl::raw_uint16_to_half(0xffffu);
1154
+ return pset1<Packet4h2>(true_half);
1155
+ }
1156
+
1157
+ template <>
1158
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pzero<Packet4h2>(const Packet4h2& /*a*/) {
1159
+ half false_half = half_impl::raw_uint16_to_half(0x0000u);
1160
+ return pset1<Packet4h2>(false_half);
1161
+ }
1162
+
1163
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose_double(double* d_row0, double* d_row1, double* d_row2,
1164
+ double* d_row3, double* d_row4, double* d_row5,
1165
+ double* d_row6, double* d_row7) {
1166
+ double d_tmp;
1167
+ d_tmp = d_row0[1];
1168
+ d_row0[1] = d_row4[0];
1169
+ d_row4[0] = d_tmp;
1170
+
1171
+ d_tmp = d_row1[1];
1172
+ d_row1[1] = d_row5[0];
1173
+ d_row5[0] = d_tmp;
1174
+
1175
+ d_tmp = d_row2[1];
1176
+ d_row2[1] = d_row6[0];
1177
+ d_row6[0] = d_tmp;
1178
+
1179
+ d_tmp = d_row3[1];
1180
+ d_row3[1] = d_row7[0];
1181
+ d_row7[0] = d_tmp;
1182
+ }
1183
+
1184
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose_half2(half2* f_row0, half2* f_row1, half2* f_row2,
1185
+ half2* f_row3) {
1186
+ half2 f_tmp;
1187
+ f_tmp = f_row0[1];
1188
+ f_row0[1] = f_row2[0];
1189
+ f_row2[0] = f_tmp;
1190
+
1191
+ f_tmp = f_row1[1];
1192
+ f_row1[1] = f_row3[0];
1193
+ f_row3[0] = f_tmp;
1194
+ }
1195
+
1196
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose_half(half2& f0, half2& f1) {
1197
+ __half a1 = __low2half(f0);
1198
+ __half a2 = __high2half(f0);
1199
+ __half b1 = __low2half(f1);
1200
+ __half b2 = __high2half(f1);
1201
+ f0 = __halves2half2(a1, b1);
1202
+ f1 = __halves2half2(a2, b2);
1203
+ }
1204
+
1205
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4h2, 8>& kernel) {
1206
+ double* d_row0 = reinterpret_cast<double*>(&kernel.packet[0]);
1207
+ double* d_row1 = reinterpret_cast<double*>(&kernel.packet[1]);
1208
+ double* d_row2 = reinterpret_cast<double*>(&kernel.packet[2]);
1209
+ double* d_row3 = reinterpret_cast<double*>(&kernel.packet[3]);
1210
+ double* d_row4 = reinterpret_cast<double*>(&kernel.packet[4]);
1211
+ double* d_row5 = reinterpret_cast<double*>(&kernel.packet[5]);
1212
+ double* d_row6 = reinterpret_cast<double*>(&kernel.packet[6]);
1213
+ double* d_row7 = reinterpret_cast<double*>(&kernel.packet[7]);
1214
+ ptranspose_double(d_row0, d_row1, d_row2, d_row3, d_row4, d_row5, d_row6, d_row7);
1215
+
1216
+ half2* f_row0 = reinterpret_cast<half2*>(d_row0);
1217
+ half2* f_row1 = reinterpret_cast<half2*>(d_row1);
1218
+ half2* f_row2 = reinterpret_cast<half2*>(d_row2);
1219
+ half2* f_row3 = reinterpret_cast<half2*>(d_row3);
1220
+ ptranspose_half2(f_row0, f_row1, f_row2, f_row3);
1221
+ ptranspose_half(f_row0[0], f_row1[0]);
1222
+ ptranspose_half(f_row0[1], f_row1[1]);
1223
+ ptranspose_half(f_row2[0], f_row3[0]);
1224
+ ptranspose_half(f_row2[1], f_row3[1]);
1225
+
1226
+ f_row0 = reinterpret_cast<half2*>(d_row0 + 1);
1227
+ f_row1 = reinterpret_cast<half2*>(d_row1 + 1);
1228
+ f_row2 = reinterpret_cast<half2*>(d_row2 + 1);
1229
+ f_row3 = reinterpret_cast<half2*>(d_row3 + 1);
1230
+ ptranspose_half2(f_row0, f_row1, f_row2, f_row3);
1231
+ ptranspose_half(f_row0[0], f_row1[0]);
1232
+ ptranspose_half(f_row0[1], f_row1[1]);
1233
+ ptranspose_half(f_row2[0], f_row3[0]);
1234
+ ptranspose_half(f_row2[1], f_row3[1]);
1235
+
1236
+ f_row0 = reinterpret_cast<half2*>(d_row4);
1237
+ f_row1 = reinterpret_cast<half2*>(d_row5);
1238
+ f_row2 = reinterpret_cast<half2*>(d_row6);
1239
+ f_row3 = reinterpret_cast<half2*>(d_row7);
1240
+ ptranspose_half2(f_row0, f_row1, f_row2, f_row3);
1241
+ ptranspose_half(f_row0[0], f_row1[0]);
1242
+ ptranspose_half(f_row0[1], f_row1[1]);
1243
+ ptranspose_half(f_row2[0], f_row3[0]);
1244
+ ptranspose_half(f_row2[1], f_row3[1]);
1245
+
1246
+ f_row0 = reinterpret_cast<half2*>(d_row4 + 1);
1247
+ f_row1 = reinterpret_cast<half2*>(d_row5 + 1);
1248
+ f_row2 = reinterpret_cast<half2*>(d_row6 + 1);
1249
+ f_row3 = reinterpret_cast<half2*>(d_row7 + 1);
1250
+ ptranspose_half2(f_row0, f_row1, f_row2, f_row3);
1251
+ ptranspose_half(f_row0[0], f_row1[0]);
1252
+ ptranspose_half(f_row0[1], f_row1[1]);
1253
+ ptranspose_half(f_row2[0], f_row3[0]);
1254
+ ptranspose_half(f_row2[1], f_row3[1]);
1255
+ }
1256
+
1257
+ template <>
1258
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 plset<Packet4h2>(const Eigen::half& a) {
1259
+ #if defined(EIGEN_HIP_DEVICE_COMPILE)
1260
+
1261
+ Packet4h2 r;
1262
+ half2* p_alias = reinterpret_cast<half2*>(&r);
1263
+ p_alias[0] = __halves2half2(a, __hadd(a, __float2half(1.0f)));
1264
+ p_alias[1] = __halves2half2(__hadd(a, __float2half(2.0f)), __hadd(a, __float2half(3.0f)));
1265
+ p_alias[2] = __halves2half2(__hadd(a, __float2half(4.0f)), __hadd(a, __float2half(5.0f)));
1266
+ p_alias[3] = __halves2half2(__hadd(a, __float2half(6.0f)), __hadd(a, __float2half(7.0f)));
1267
+ return r;
1268
+ #elif defined(EIGEN_CUDA_HAS_FP16_ARITHMETIC)
1269
+ Packet4h2 r;
1270
+ half2* r_alias = reinterpret_cast<half2*>(&r);
1271
+
1272
+ half2 b = pset1<half2>(a);
1273
+ half2 c;
1274
+ half2 half_offset0 = __halves2half2(__float2half(0.0f), __float2half(2.0f));
1275
+ half2 half_offset1 = __halves2half2(__float2half(4.0f), __float2half(6.0f));
1276
+
1277
+ c = __hadd2(b, half_offset0);
1278
+ r_alias[0] = plset(__low2half(c));
1279
+ r_alias[1] = plset(__high2half(c));
1280
+
1281
+ c = __hadd2(b, half_offset1);
1282
+ r_alias[2] = plset(__low2half(c));
1283
+ r_alias[3] = plset(__high2half(c));
1284
+
1285
+ return r;
1286
+
1287
+ #else
1288
+ float f = __half2float(a);
1289
+ Packet4h2 r;
1290
+ half2* p_alias = reinterpret_cast<half2*>(&r);
1291
+ p_alias[0] = __halves2half2(a, __float2half(f + 1.0f));
1292
+ p_alias[1] = __halves2half2(__float2half(f + 2.0f), __float2half(f + 3.0f));
1293
+ p_alias[2] = __halves2half2(__float2half(f + 4.0f), __float2half(f + 5.0f));
1294
+ p_alias[3] = __halves2half2(__float2half(f + 6.0f), __float2half(f + 7.0f));
1295
+ return r;
1296
+ #endif
1297
+ }
1298
+
1299
+ template <>
1300
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pselect<Packet4h2>(const Packet4h2& mask, const Packet4h2& a,
1301
+ const Packet4h2& b) {
1302
+ Packet4h2 r;
1303
+ half2* r_alias = reinterpret_cast<half2*>(&r);
1304
+ const half2* mask_alias = reinterpret_cast<const half2*>(&mask);
1305
+ const half2* a_alias = reinterpret_cast<const half2*>(&a);
1306
+ const half2* b_alias = reinterpret_cast<const half2*>(&b);
1307
+ r_alias[0] = pselect(mask_alias[0], a_alias[0], b_alias[0]);
1308
+ r_alias[1] = pselect(mask_alias[1], a_alias[1], b_alias[1]);
1309
+ r_alias[2] = pselect(mask_alias[2], a_alias[2], b_alias[2]);
1310
+ r_alias[3] = pselect(mask_alias[3], a_alias[3], b_alias[3]);
1311
+ return r;
1312
+ }
1313
+
1314
+ template <>
1315
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pcmp_eq<Packet4h2>(const Packet4h2& a, const Packet4h2& b) {
1316
+ Packet4h2 r;
1317
+ half2* r_alias = reinterpret_cast<half2*>(&r);
1318
+ const half2* a_alias = reinterpret_cast<const half2*>(&a);
1319
+ const half2* b_alias = reinterpret_cast<const half2*>(&b);
1320
+ r_alias[0] = pcmp_eq(a_alias[0], b_alias[0]);
1321
+ r_alias[1] = pcmp_eq(a_alias[1], b_alias[1]);
1322
+ r_alias[2] = pcmp_eq(a_alias[2], b_alias[2]);
1323
+ r_alias[3] = pcmp_eq(a_alias[3], b_alias[3]);
1324
+ return r;
1325
+ }
1326
+
1327
+ template <>
1328
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pcmp_lt<Packet4h2>(const Packet4h2& a, const Packet4h2& b) {
1329
+ Packet4h2 r;
1330
+ half2* r_alias = reinterpret_cast<half2*>(&r);
1331
+ const half2* a_alias = reinterpret_cast<const half2*>(&a);
1332
+ const half2* b_alias = reinterpret_cast<const half2*>(&b);
1333
+ r_alias[0] = pcmp_lt(a_alias[0], b_alias[0]);
1334
+ r_alias[1] = pcmp_lt(a_alias[1], b_alias[1]);
1335
+ r_alias[2] = pcmp_lt(a_alias[2], b_alias[2]);
1336
+ r_alias[3] = pcmp_lt(a_alias[3], b_alias[3]);
1337
+ return r;
1338
+ }
1339
+
1340
+ template <>
1341
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pcmp_le<Packet4h2>(const Packet4h2& a, const Packet4h2& b) {
1342
+ Packet4h2 r;
1343
+ half2* r_alias = reinterpret_cast<half2*>(&r);
1344
+ const half2* a_alias = reinterpret_cast<const half2*>(&a);
1345
+ const half2* b_alias = reinterpret_cast<const half2*>(&b);
1346
+ r_alias[0] = pcmp_le(a_alias[0], b_alias[0]);
1347
+ r_alias[1] = pcmp_le(a_alias[1], b_alias[1]);
1348
+ r_alias[2] = pcmp_le(a_alias[2], b_alias[2]);
1349
+ r_alias[3] = pcmp_le(a_alias[3], b_alias[3]);
1350
+ return r;
1351
+ }
1352
+
1353
+ template <>
1354
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pand<Packet4h2>(const Packet4h2& a, const Packet4h2& b) {
1355
+ Packet4h2 r;
1356
+ half2* r_alias = reinterpret_cast<half2*>(&r);
1357
+ const half2* a_alias = reinterpret_cast<const half2*>(&a);
1358
+ const half2* b_alias = reinterpret_cast<const half2*>(&b);
1359
+ r_alias[0] = pand(a_alias[0], b_alias[0]);
1360
+ r_alias[1] = pand(a_alias[1], b_alias[1]);
1361
+ r_alias[2] = pand(a_alias[2], b_alias[2]);
1362
+ r_alias[3] = pand(a_alias[3], b_alias[3]);
1363
+ return r;
1364
+ }
1365
+
1366
+ template <>
1367
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 por<Packet4h2>(const Packet4h2& a, const Packet4h2& b) {
1368
+ Packet4h2 r;
1369
+ half2* r_alias = reinterpret_cast<half2*>(&r);
1370
+ const half2* a_alias = reinterpret_cast<const half2*>(&a);
1371
+ const half2* b_alias = reinterpret_cast<const half2*>(&b);
1372
+ r_alias[0] = por(a_alias[0], b_alias[0]);
1373
+ r_alias[1] = por(a_alias[1], b_alias[1]);
1374
+ r_alias[2] = por(a_alias[2], b_alias[2]);
1375
+ r_alias[3] = por(a_alias[3], b_alias[3]);
1376
+ return r;
1377
+ }
1378
+
1379
+ template <>
1380
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pxor<Packet4h2>(const Packet4h2& a, const Packet4h2& b) {
1381
+ Packet4h2 r;
1382
+ half2* r_alias = reinterpret_cast<half2*>(&r);
1383
+ const half2* a_alias = reinterpret_cast<const half2*>(&a);
1384
+ const half2* b_alias = reinterpret_cast<const half2*>(&b);
1385
+ r_alias[0] = pxor(a_alias[0], b_alias[0]);
1386
+ r_alias[1] = pxor(a_alias[1], b_alias[1]);
1387
+ r_alias[2] = pxor(a_alias[2], b_alias[2]);
1388
+ r_alias[3] = pxor(a_alias[3], b_alias[3]);
1389
+ return r;
1390
+ }
1391
+
1392
+ template <>
1393
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pandnot<Packet4h2>(const Packet4h2& a, const Packet4h2& b) {
1394
+ Packet4h2 r;
1395
+ half2* r_alias = reinterpret_cast<half2*>(&r);
1396
+ const half2* a_alias = reinterpret_cast<const half2*>(&a);
1397
+ const half2* b_alias = reinterpret_cast<const half2*>(&b);
1398
+ r_alias[0] = pandnot(a_alias[0], b_alias[0]);
1399
+ r_alias[1] = pandnot(a_alias[1], b_alias[1]);
1400
+ r_alias[2] = pandnot(a_alias[2], b_alias[2]);
1401
+ r_alias[3] = pandnot(a_alias[3], b_alias[3]);
1402
+ return r;
1403
+ }
1404
+
1405
+ template <>
1406
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 padd<Packet4h2>(const Packet4h2& a, const Packet4h2& b) {
1407
+ Packet4h2 r;
1408
+ half2* r_alias = reinterpret_cast<half2*>(&r);
1409
+ const half2* a_alias = reinterpret_cast<const half2*>(&a);
1410
+ const half2* b_alias = reinterpret_cast<const half2*>(&b);
1411
+ r_alias[0] = padd(a_alias[0], b_alias[0]);
1412
+ r_alias[1] = padd(a_alias[1], b_alias[1]);
1413
+ r_alias[2] = padd(a_alias[2], b_alias[2]);
1414
+ r_alias[3] = padd(a_alias[3], b_alias[3]);
1415
+ return r;
1416
+ }
1417
+
1418
+ template <>
1419
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 psub<Packet4h2>(const Packet4h2& a, const Packet4h2& b) {
1420
+ Packet4h2 r;
1421
+ half2* r_alias = reinterpret_cast<half2*>(&r);
1422
+ const half2* a_alias = reinterpret_cast<const half2*>(&a);
1423
+ const half2* b_alias = reinterpret_cast<const half2*>(&b);
1424
+ r_alias[0] = psub(a_alias[0], b_alias[0]);
1425
+ r_alias[1] = psub(a_alias[1], b_alias[1]);
1426
+ r_alias[2] = psub(a_alias[2], b_alias[2]);
1427
+ r_alias[3] = psub(a_alias[3], b_alias[3]);
1428
+ return r;
1429
+ }
1430
+
1431
+ template <>
1432
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pnegate(const Packet4h2& a) {
1433
+ Packet4h2 r;
1434
+ half2* r_alias = reinterpret_cast<half2*>(&r);
1435
+ const half2* a_alias = reinterpret_cast<const half2*>(&a);
1436
+ r_alias[0] = pnegate(a_alias[0]);
1437
+ r_alias[1] = pnegate(a_alias[1]);
1438
+ r_alias[2] = pnegate(a_alias[2]);
1439
+ r_alias[3] = pnegate(a_alias[3]);
1440
+ return r;
1441
+ }
1442
+
1443
+ template <>
1444
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pconj(const Packet4h2& a) {
1445
+ return a;
1446
+ }
1447
+
1448
+ template <>
1449
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pmul<Packet4h2>(const Packet4h2& a, const Packet4h2& b) {
1450
+ Packet4h2 r;
1451
+ half2* r_alias = reinterpret_cast<half2*>(&r);
1452
+ const half2* a_alias = reinterpret_cast<const half2*>(&a);
1453
+ const half2* b_alias = reinterpret_cast<const half2*>(&b);
1454
+ r_alias[0] = pmul(a_alias[0], b_alias[0]);
1455
+ r_alias[1] = pmul(a_alias[1], b_alias[1]);
1456
+ r_alias[2] = pmul(a_alias[2], b_alias[2]);
1457
+ r_alias[3] = pmul(a_alias[3], b_alias[3]);
1458
+ return r;
1459
+ }
1460
+
1461
+ template <>
1462
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pmadd<Packet4h2>(const Packet4h2& a, const Packet4h2& b,
1463
+ const Packet4h2& c) {
1464
+ Packet4h2 r;
1465
+ half2* r_alias = reinterpret_cast<half2*>(&r);
1466
+ const half2* a_alias = reinterpret_cast<const half2*>(&a);
1467
+ const half2* b_alias = reinterpret_cast<const half2*>(&b);
1468
+ const half2* c_alias = reinterpret_cast<const half2*>(&c);
1469
+ r_alias[0] = pmadd(a_alias[0], b_alias[0], c_alias[0]);
1470
+ r_alias[1] = pmadd(a_alias[1], b_alias[1], c_alias[1]);
1471
+ r_alias[2] = pmadd(a_alias[2], b_alias[2], c_alias[2]);
1472
+ r_alias[3] = pmadd(a_alias[3], b_alias[3], c_alias[3]);
1473
+ return r;
1474
+ }
1475
+
1476
+ template <>
1477
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pdiv<Packet4h2>(const Packet4h2& a, const Packet4h2& b) {
1478
+ Packet4h2 r;
1479
+ half2* r_alias = reinterpret_cast<half2*>(&r);
1480
+ const half2* a_alias = reinterpret_cast<const half2*>(&a);
1481
+ const half2* b_alias = reinterpret_cast<const half2*>(&b);
1482
+ r_alias[0] = pdiv(a_alias[0], b_alias[0]);
1483
+ r_alias[1] = pdiv(a_alias[1], b_alias[1]);
1484
+ r_alias[2] = pdiv(a_alias[2], b_alias[2]);
1485
+ r_alias[3] = pdiv(a_alias[3], b_alias[3]);
1486
+ return r;
1487
+ }
1488
+
1489
+ template <>
1490
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pmin<Packet4h2>(const Packet4h2& a, const Packet4h2& b) {
1491
+ Packet4h2 r;
1492
+ half2* r_alias = reinterpret_cast<half2*>(&r);
1493
+ const half2* a_alias = reinterpret_cast<const half2*>(&a);
1494
+ const half2* b_alias = reinterpret_cast<const half2*>(&b);
1495
+ r_alias[0] = pmin(a_alias[0], b_alias[0]);
1496
+ r_alias[1] = pmin(a_alias[1], b_alias[1]);
1497
+ r_alias[2] = pmin(a_alias[2], b_alias[2]);
1498
+ r_alias[3] = pmin(a_alias[3], b_alias[3]);
1499
+ return r;
1500
+ }
1501
+
1502
+ template <>
1503
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pmax<Packet4h2>(const Packet4h2& a, const Packet4h2& b) {
1504
+ Packet4h2 r;
1505
+ half2* r_alias = reinterpret_cast<half2*>(&r);
1506
+ const half2* a_alias = reinterpret_cast<const half2*>(&a);
1507
+ const half2* b_alias = reinterpret_cast<const half2*>(&b);
1508
+ r_alias[0] = pmax(a_alias[0], b_alias[0]);
1509
+ r_alias[1] = pmax(a_alias[1], b_alias[1]);
1510
+ r_alias[2] = pmax(a_alias[2], b_alias[2]);
1511
+ r_alias[3] = pmax(a_alias[3], b_alias[3]);
1512
+ return r;
1513
+ }
1514
+
1515
+ template <>
1516
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux<Packet4h2>(const Packet4h2& a) {
1517
+ const half2* a_alias = reinterpret_cast<const half2*>(&a);
1518
+
1519
+ return predux(a_alias[0]) + predux(a_alias[1]) + predux(a_alias[2]) + predux(a_alias[3]);
1520
+ }
1521
+
1522
+ template <>
1523
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_max<Packet4h2>(const Packet4h2& a) {
1524
+ const half2* a_alias = reinterpret_cast<const half2*>(&a);
1525
+ half2 m0 = __halves2half2(predux_max(a_alias[0]), predux_max(a_alias[1]));
1526
+ half2 m1 = __halves2half2(predux_max(a_alias[2]), predux_max(a_alias[3]));
1527
+ __half first = predux_max(m0);
1528
+ __half second = predux_max(m1);
1529
+ #if defined(EIGEN_CUDA_HAS_FP16_ARITHMETIC)
1530
+ return (__hgt(first, second) ? first : second);
1531
+ #else
1532
+ float ffirst = __half2float(first);
1533
+ float fsecond = __half2float(second);
1534
+ return (ffirst > fsecond) ? first : second;
1535
+ #endif
1536
+ }
1537
+
1538
+ template <>
1539
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_min<Packet4h2>(const Packet4h2& a) {
1540
+ const half2* a_alias = reinterpret_cast<const half2*>(&a);
1541
+ half2 m0 = __halves2half2(predux_min(a_alias[0]), predux_min(a_alias[1]));
1542
+ half2 m1 = __halves2half2(predux_min(a_alias[2]), predux_min(a_alias[3]));
1543
+ __half first = predux_min(m0);
1544
+ __half second = predux_min(m1);
1545
+ #if defined(EIGEN_CUDA_HAS_FP16_ARITHMETIC)
1546
+ return (__hlt(first, second) ? first : second);
1547
+ #else
1548
+ float ffirst = __half2float(first);
1549
+ float fsecond = __half2float(second);
1550
+ return (ffirst < fsecond) ? first : second;
1551
+ #endif
1552
+ }
1553
+
1554
+ // likely overflow/underflow
1555
+ template <>
1556
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_mul<Packet4h2>(const Packet4h2& a) {
1557
+ const half2* a_alias = reinterpret_cast<const half2*>(&a);
1558
+ return predux_mul(pmul(pmul(a_alias[0], a_alias[1]), pmul(a_alias[2], a_alias[3])));
1559
+ }
1560
+
1561
+ template <>
1562
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 plog1p<Packet4h2>(const Packet4h2& a) {
1563
+ Packet4h2 r;
1564
+ half2* r_alias = reinterpret_cast<half2*>(&r);
1565
+ const half2* a_alias = reinterpret_cast<const half2*>(&a);
1566
+ r_alias[0] = plog1p(a_alias[0]);
1567
+ r_alias[1] = plog1p(a_alias[1]);
1568
+ r_alias[2] = plog1p(a_alias[2]);
1569
+ r_alias[3] = plog1p(a_alias[3]);
1570
+ return r;
1571
+ }
1572
+
1573
+ template <>
1574
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pexpm1<Packet4h2>(const Packet4h2& a) {
1575
+ Packet4h2 r;
1576
+ half2* r_alias = reinterpret_cast<half2*>(&r);
1577
+ const half2* a_alias = reinterpret_cast<const half2*>(&a);
1578
+ r_alias[0] = pexpm1(a_alias[0]);
1579
+ r_alias[1] = pexpm1(a_alias[1]);
1580
+ r_alias[2] = pexpm1(a_alias[2]);
1581
+ r_alias[3] = pexpm1(a_alias[3]);
1582
+ return r;
1583
+ }
1584
+
1585
+ template <>
1586
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 plog<Packet4h2>(const Packet4h2& a) {
1587
+ Packet4h2 r;
1588
+ half2* r_alias = reinterpret_cast<half2*>(&r);
1589
+ const half2* a_alias = reinterpret_cast<const half2*>(&a);
1590
+ r_alias[0] = plog(a_alias[0]);
1591
+ r_alias[1] = plog(a_alias[1]);
1592
+ r_alias[2] = plog(a_alias[2]);
1593
+ r_alias[3] = plog(a_alias[3]);
1594
+ return r;
1595
+ }
1596
+
1597
+ template <>
1598
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pexp<Packet4h2>(const Packet4h2& a) {
1599
+ Packet4h2 r;
1600
+ half2* r_alias = reinterpret_cast<half2*>(&r);
1601
+ const half2* a_alias = reinterpret_cast<const half2*>(&a);
1602
+ r_alias[0] = pexp(a_alias[0]);
1603
+ r_alias[1] = pexp(a_alias[1]);
1604
+ r_alias[2] = pexp(a_alias[2]);
1605
+ r_alias[3] = pexp(a_alias[3]);
1606
+ return r;
1607
+ }
1608
+
1609
+ template <>
1610
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 psqrt<Packet4h2>(const Packet4h2& a) {
1611
+ Packet4h2 r;
1612
+ half2* r_alias = reinterpret_cast<half2*>(&r);
1613
+ const half2* a_alias = reinterpret_cast<const half2*>(&a);
1614
+ r_alias[0] = psqrt(a_alias[0]);
1615
+ r_alias[1] = psqrt(a_alias[1]);
1616
+ r_alias[2] = psqrt(a_alias[2]);
1617
+ r_alias[3] = psqrt(a_alias[3]);
1618
+ return r;
1619
+ }
1620
+
1621
+ template <>
1622
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 prsqrt<Packet4h2>(const Packet4h2& a) {
1623
+ Packet4h2 r;
1624
+ half2* r_alias = reinterpret_cast<half2*>(&r);
1625
+ const half2* a_alias = reinterpret_cast<const half2*>(&a);
1626
+ r_alias[0] = prsqrt(a_alias[0]);
1627
+ r_alias[1] = prsqrt(a_alias[1]);
1628
+ r_alias[2] = prsqrt(a_alias[2]);
1629
+ r_alias[3] = prsqrt(a_alias[3]);
1630
+ return r;
1631
+ }
1632
+
1633
+ // The following specialized padd, pmul, pdiv, pmin, pmax, pset1 are needed for
1634
+ // the implementation of GPU half reduction.
1635
+ template <>
1636
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 padd<half2>(const half2& a, const half2& b) {
1637
+ #if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
1638
+ return __hadd2(a, b);
1639
+ #else
1640
+ float a1 = __low2float(a);
1641
+ float a2 = __high2float(a);
1642
+ float b1 = __low2float(b);
1643
+ float b2 = __high2float(b);
1644
+ float r1 = a1 + b1;
1645
+ float r2 = a2 + b2;
1646
+ return __floats2half2_rn(r1, r2);
1647
+ #endif
1648
+ }
1649
+
1650
+ template <>
1651
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmul<half2>(const half2& a, const half2& b) {
1652
+ #if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
1653
+ return __hmul2(a, b);
1654
+ #else
1655
+ float a1 = __low2float(a);
1656
+ float a2 = __high2float(a);
1657
+ float b1 = __low2float(b);
1658
+ float b2 = __high2float(b);
1659
+ float r1 = a1 * b1;
1660
+ float r2 = a2 * b2;
1661
+ return __floats2half2_rn(r1, r2);
1662
+ #endif
1663
+ }
1664
+
1665
+ template <>
1666
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pdiv<half2>(const half2& a, const half2& b) {
1667
+ #if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
1668
+ return __h2div(a, b);
1669
+ #else
1670
+ float a1 = __low2float(a);
1671
+ float a2 = __high2float(a);
1672
+ float b1 = __low2float(b);
1673
+ float b2 = __high2float(b);
1674
+ float r1 = a1 / b1;
1675
+ float r2 = a2 / b2;
1676
+ return __floats2half2_rn(r1, r2);
1677
+ #endif
1678
+ }
1679
+
1680
+ template <>
1681
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmin<half2>(const half2& a, const half2& b) {
1682
+ float a1 = __low2float(a);
1683
+ float a2 = __high2float(a);
1684
+ float b1 = __low2float(b);
1685
+ float b2 = __high2float(b);
1686
+ __half r1 = a1 < b1 ? __low2half(a) : __low2half(b);
1687
+ __half r2 = a2 < b2 ? __high2half(a) : __high2half(b);
1688
+ return __halves2half2(r1, r2);
1689
+ }
1690
+
1691
+ template <>
1692
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmax<half2>(const half2& a, const half2& b) {
1693
+ float a1 = __low2float(a);
1694
+ float a2 = __high2float(a);
1695
+ float b1 = __low2float(b);
1696
+ float b2 = __high2float(b);
1697
+ __half r1 = a1 > b1 ? __low2half(a) : __low2half(b);
1698
+ __half r2 = a2 > b2 ? __high2half(a) : __high2half(b);
1699
+ return __halves2half2(r1, r2);
1700
+ }
1701
+
1702
+ #endif // (defined(EIGEN_HAS_CUDA_FP16) || defined(EIGEN_HAS_HIP_FP16)) && defined(EIGEN_GPU_COMPILE_PHASE)
1703
+
1704
+ #undef EIGEN_GPU_HAS_LDG
1705
+ #undef EIGEN_CUDA_HAS_FP16_ARITHMETIC
1706
+ #undef EIGEN_GPU_HAS_FP16_ARITHMETIC
1707
+
1708
+ } // end namespace internal
1709
+
1710
+ } // end namespace Eigen
1711
+
1712
+ #endif // EIGEN_PACKET_MATH_GPU_H