@smake/eigen 1.1.0 → 1.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (431) hide show
  1. package/README.md +1 -1
  2. package/eigen/Eigen/AccelerateSupport +52 -0
  3. package/eigen/Eigen/Cholesky +18 -20
  4. package/eigen/Eigen/CholmodSupport +28 -28
  5. package/eigen/Eigen/Core +187 -120
  6. package/eigen/Eigen/Eigenvalues +16 -13
  7. package/eigen/Eigen/Geometry +18 -18
  8. package/eigen/Eigen/Householder +9 -7
  9. package/eigen/Eigen/IterativeLinearSolvers +8 -4
  10. package/eigen/Eigen/Jacobi +14 -13
  11. package/eigen/Eigen/KLUSupport +23 -21
  12. package/eigen/Eigen/LU +15 -16
  13. package/eigen/Eigen/MetisSupport +12 -12
  14. package/eigen/Eigen/OrderingMethods +54 -51
  15. package/eigen/Eigen/PaStiXSupport +23 -21
  16. package/eigen/Eigen/PardisoSupport +17 -14
  17. package/eigen/Eigen/QR +18 -20
  18. package/eigen/Eigen/QtAlignedMalloc +5 -12
  19. package/eigen/Eigen/SPQRSupport +21 -14
  20. package/eigen/Eigen/SVD +23 -17
  21. package/eigen/Eigen/Sparse +1 -2
  22. package/eigen/Eigen/SparseCholesky +18 -15
  23. package/eigen/Eigen/SparseCore +18 -17
  24. package/eigen/Eigen/SparseLU +9 -9
  25. package/eigen/Eigen/SparseQR +16 -14
  26. package/eigen/Eigen/StdDeque +5 -2
  27. package/eigen/Eigen/StdList +5 -2
  28. package/eigen/Eigen/StdVector +5 -2
  29. package/eigen/Eigen/SuperLUSupport +30 -24
  30. package/eigen/Eigen/ThreadPool +80 -0
  31. package/eigen/Eigen/UmfPackSupport +19 -17
  32. package/eigen/Eigen/Version +14 -0
  33. package/eigen/Eigen/src/AccelerateSupport/AccelerateSupport.h +423 -0
  34. package/eigen/Eigen/src/AccelerateSupport/InternalHeaderCheck.h +3 -0
  35. package/eigen/Eigen/src/Cholesky/InternalHeaderCheck.h +3 -0
  36. package/eigen/Eigen/src/Cholesky/LDLT.h +366 -405
  37. package/eigen/Eigen/src/Cholesky/LLT.h +323 -367
  38. package/eigen/Eigen/src/Cholesky/LLT_LAPACKE.h +81 -56
  39. package/eigen/Eigen/src/CholmodSupport/CholmodSupport.h +585 -529
  40. package/eigen/Eigen/src/CholmodSupport/InternalHeaderCheck.h +3 -0
  41. package/eigen/Eigen/src/Core/ArithmeticSequence.h +143 -317
  42. package/eigen/Eigen/src/Core/Array.h +329 -370
  43. package/eigen/Eigen/src/Core/ArrayBase.h +190 -203
  44. package/eigen/Eigen/src/Core/ArrayWrapper.h +126 -170
  45. package/eigen/Eigen/src/Core/Assign.h +30 -40
  46. package/eigen/Eigen/src/Core/AssignEvaluator.h +651 -604
  47. package/eigen/Eigen/src/Core/Assign_MKL.h +125 -120
  48. package/eigen/Eigen/src/Core/BandMatrix.h +267 -282
  49. package/eigen/Eigen/src/Core/Block.h +371 -390
  50. package/eigen/Eigen/src/Core/CommaInitializer.h +85 -100
  51. package/eigen/Eigen/src/Core/ConditionEstimator.h +51 -53
  52. package/eigen/Eigen/src/Core/CoreEvaluators.h +1214 -937
  53. package/eigen/Eigen/src/Core/CoreIterators.h +72 -63
  54. package/eigen/Eigen/src/Core/CwiseBinaryOp.h +112 -129
  55. package/eigen/Eigen/src/Core/CwiseNullaryOp.h +676 -702
  56. package/eigen/Eigen/src/Core/CwiseTernaryOp.h +77 -103
  57. package/eigen/Eigen/src/Core/CwiseUnaryOp.h +55 -67
  58. package/eigen/Eigen/src/Core/CwiseUnaryView.h +127 -92
  59. package/eigen/Eigen/src/Core/DenseBase.h +630 -658
  60. package/eigen/Eigen/src/Core/DenseCoeffsBase.h +511 -628
  61. package/eigen/Eigen/src/Core/DenseStorage.h +511 -590
  62. package/eigen/Eigen/src/Core/DeviceWrapper.h +153 -0
  63. package/eigen/Eigen/src/Core/Diagonal.h +168 -207
  64. package/eigen/Eigen/src/Core/DiagonalMatrix.h +346 -317
  65. package/eigen/Eigen/src/Core/DiagonalProduct.h +12 -10
  66. package/eigen/Eigen/src/Core/Dot.h +167 -217
  67. package/eigen/Eigen/src/Core/EigenBase.h +74 -85
  68. package/eigen/Eigen/src/Core/Fill.h +138 -0
  69. package/eigen/Eigen/src/Core/FindCoeff.h +464 -0
  70. package/eigen/Eigen/src/Core/ForceAlignedAccess.h +90 -113
  71. package/eigen/Eigen/src/Core/Fuzzy.h +82 -105
  72. package/eigen/Eigen/src/Core/GeneralProduct.h +315 -261
  73. package/eigen/Eigen/src/Core/GenericPacketMath.h +1182 -520
  74. package/eigen/Eigen/src/Core/GlobalFunctions.h +193 -157
  75. package/eigen/Eigen/src/Core/IO.h +131 -156
  76. package/eigen/Eigen/src/Core/IndexedView.h +209 -125
  77. package/eigen/Eigen/src/Core/InnerProduct.h +260 -0
  78. package/eigen/Eigen/src/Core/InternalHeaderCheck.h +3 -0
  79. package/eigen/Eigen/src/Core/Inverse.h +50 -59
  80. package/eigen/Eigen/src/Core/Map.h +123 -141
  81. package/eigen/Eigen/src/Core/MapBase.h +255 -282
  82. package/eigen/Eigen/src/Core/MathFunctions.h +1247 -1201
  83. package/eigen/Eigen/src/Core/MathFunctionsImpl.h +162 -99
  84. package/eigen/Eigen/src/Core/Matrix.h +463 -494
  85. package/eigen/Eigen/src/Core/MatrixBase.h +468 -470
  86. package/eigen/Eigen/src/Core/NestByValue.h +58 -52
  87. package/eigen/Eigen/src/Core/NoAlias.h +79 -86
  88. package/eigen/Eigen/src/Core/NumTraits.h +206 -206
  89. package/eigen/Eigen/src/Core/PartialReduxEvaluator.h +163 -142
  90. package/eigen/Eigen/src/Core/PermutationMatrix.h +461 -511
  91. package/eigen/Eigen/src/Core/PlainObjectBase.h +858 -972
  92. package/eigen/Eigen/src/Core/Product.h +246 -130
  93. package/eigen/Eigen/src/Core/ProductEvaluators.h +779 -671
  94. package/eigen/Eigen/src/Core/Random.h +153 -164
  95. package/eigen/Eigen/src/Core/RandomImpl.h +262 -0
  96. package/eigen/Eigen/src/Core/RealView.h +250 -0
  97. package/eigen/Eigen/src/Core/Redux.h +334 -314
  98. package/eigen/Eigen/src/Core/Ref.h +259 -257
  99. package/eigen/Eigen/src/Core/Replicate.h +92 -104
  100. package/eigen/Eigen/src/Core/Reshaped.h +215 -271
  101. package/eigen/Eigen/src/Core/ReturnByValue.h +47 -55
  102. package/eigen/Eigen/src/Core/Reverse.h +133 -148
  103. package/eigen/Eigen/src/Core/Select.h +68 -140
  104. package/eigen/Eigen/src/Core/SelfAdjointView.h +254 -290
  105. package/eigen/Eigen/src/Core/SelfCwiseBinaryOp.h +23 -20
  106. package/eigen/Eigen/src/Core/SkewSymmetricMatrix3.h +382 -0
  107. package/eigen/Eigen/src/Core/Solve.h +88 -102
  108. package/eigen/Eigen/src/Core/SolveTriangular.h +126 -124
  109. package/eigen/Eigen/src/Core/SolverBase.h +132 -133
  110. package/eigen/Eigen/src/Core/StableNorm.h +113 -147
  111. package/eigen/Eigen/src/Core/StlIterators.h +404 -248
  112. package/eigen/Eigen/src/Core/Stride.h +90 -92
  113. package/eigen/Eigen/src/Core/Swap.h +70 -39
  114. package/eigen/Eigen/src/Core/Transpose.h +258 -295
  115. package/eigen/Eigen/src/Core/Transpositions.h +270 -333
  116. package/eigen/Eigen/src/Core/TriangularMatrix.h +642 -743
  117. package/eigen/Eigen/src/Core/VectorBlock.h +59 -72
  118. package/eigen/Eigen/src/Core/VectorwiseOp.h +653 -704
  119. package/eigen/Eigen/src/Core/Visitor.h +464 -308
  120. package/eigen/Eigen/src/Core/arch/AVX/Complex.h +380 -187
  121. package/eigen/Eigen/src/Core/arch/AVX/MathFunctions.h +65 -163
  122. package/eigen/Eigen/src/Core/arch/AVX/PacketMath.h +2145 -638
  123. package/eigen/Eigen/src/Core/arch/AVX/Reductions.h +353 -0
  124. package/eigen/Eigen/src/Core/arch/AVX/TypeCasting.h +253 -60
  125. package/eigen/Eigen/src/Core/arch/AVX512/Complex.h +278 -228
  126. package/eigen/Eigen/src/Core/arch/AVX512/GemmKernel.h +1245 -0
  127. package/eigen/Eigen/src/Core/arch/AVX512/MathFunctions.h +48 -269
  128. package/eigen/Eigen/src/Core/arch/AVX512/MathFunctionsFP16.h +75 -0
  129. package/eigen/Eigen/src/Core/arch/AVX512/PacketMath.h +1597 -754
  130. package/eigen/Eigen/src/Core/arch/AVX512/PacketMathFP16.h +1413 -0
  131. package/eigen/Eigen/src/Core/arch/AVX512/Reductions.h +297 -0
  132. package/eigen/Eigen/src/Core/arch/AVX512/TrsmKernel.h +1167 -0
  133. package/eigen/Eigen/src/Core/arch/AVX512/TrsmUnrolls.inc +1219 -0
  134. package/eigen/Eigen/src/Core/arch/AVX512/TypeCasting.h +229 -41
  135. package/eigen/Eigen/src/Core/arch/AVX512/TypeCastingFP16.h +130 -0
  136. package/eigen/Eigen/src/Core/arch/AltiVec/Complex.h +420 -184
  137. package/eigen/Eigen/src/Core/arch/AltiVec/MathFunctions.h +40 -49
  138. package/eigen/Eigen/src/Core/arch/AltiVec/MatrixProduct.h +2962 -2213
  139. package/eigen/Eigen/src/Core/arch/AltiVec/MatrixProductCommon.h +196 -212
  140. package/eigen/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h +713 -441
  141. package/eigen/Eigen/src/Core/arch/AltiVec/MatrixProductMMAbfloat16.h +742 -0
  142. package/eigen/Eigen/src/Core/arch/AltiVec/MatrixVectorProduct.inc +2818 -0
  143. package/eigen/Eigen/src/Core/arch/AltiVec/PacketMath.h +2380 -1362
  144. package/eigen/Eigen/src/Core/arch/AltiVec/TypeCasting.h +153 -0
  145. package/eigen/Eigen/src/Core/arch/Default/BFloat16.h +390 -224
  146. package/eigen/Eigen/src/Core/arch/Default/ConjHelper.h +78 -67
  147. package/eigen/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h +1784 -799
  148. package/eigen/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h +167 -50
  149. package/eigen/Eigen/src/Core/arch/Default/Half.h +528 -379
  150. package/eigen/Eigen/src/Core/arch/Default/Settings.h +10 -12
  151. package/eigen/Eigen/src/Core/arch/GPU/Complex.h +244 -0
  152. package/eigen/Eigen/src/Core/arch/GPU/MathFunctions.h +41 -40
  153. package/eigen/Eigen/src/Core/arch/GPU/PacketMath.h +550 -523
  154. package/eigen/Eigen/src/Core/arch/GPU/Tuple.h +268 -0
  155. package/eigen/Eigen/src/Core/arch/GPU/TypeCasting.h +27 -30
  156. package/eigen/Eigen/src/Core/arch/HIP/hcc/math_constants.h +8 -8
  157. package/eigen/Eigen/src/Core/arch/HVX/PacketMath.h +1088 -0
  158. package/eigen/Eigen/src/Core/arch/LSX/Complex.h +520 -0
  159. package/eigen/Eigen/src/Core/arch/LSX/GeneralBlockPanelKernel.h +23 -0
  160. package/eigen/Eigen/src/Core/arch/LSX/MathFunctions.h +43 -0
  161. package/eigen/Eigen/src/Core/arch/LSX/PacketMath.h +2866 -0
  162. package/eigen/Eigen/src/Core/arch/LSX/TypeCasting.h +526 -0
  163. package/eigen/Eigen/src/Core/arch/MSA/Complex.h +54 -82
  164. package/eigen/Eigen/src/Core/arch/MSA/MathFunctions.h +84 -92
  165. package/eigen/Eigen/src/Core/arch/MSA/PacketMath.h +51 -47
  166. package/eigen/Eigen/src/Core/arch/NEON/Complex.h +454 -306
  167. package/eigen/Eigen/src/Core/arch/NEON/GeneralBlockPanelKernel.h +175 -115
  168. package/eigen/Eigen/src/Core/arch/NEON/MathFunctions.h +23 -30
  169. package/eigen/Eigen/src/Core/arch/NEON/PacketMath.h +4366 -2857
  170. package/eigen/Eigen/src/Core/arch/NEON/TypeCasting.h +616 -393
  171. package/eigen/Eigen/src/Core/arch/NEON/UnaryFunctors.h +57 -0
  172. package/eigen/Eigen/src/Core/arch/SSE/Complex.h +350 -198
  173. package/eigen/Eigen/src/Core/arch/SSE/MathFunctions.h +38 -149
  174. package/eigen/Eigen/src/Core/arch/SSE/PacketMath.h +1791 -912
  175. package/eigen/Eigen/src/Core/arch/SSE/Reductions.h +324 -0
  176. package/eigen/Eigen/src/Core/arch/SSE/TypeCasting.h +128 -40
  177. package/eigen/Eigen/src/Core/arch/SVE/MathFunctions.h +10 -6
  178. package/eigen/Eigen/src/Core/arch/SVE/PacketMath.h +156 -234
  179. package/eigen/Eigen/src/Core/arch/SVE/TypeCasting.h +6 -3
  180. package/eigen/Eigen/src/Core/arch/SYCL/InteropHeaders.h +27 -32
  181. package/eigen/Eigen/src/Core/arch/SYCL/MathFunctions.h +119 -117
  182. package/eigen/Eigen/src/Core/arch/SYCL/PacketMath.h +325 -419
  183. package/eigen/Eigen/src/Core/arch/SYCL/TypeCasting.h +15 -17
  184. package/eigen/Eigen/src/Core/arch/ZVector/Complex.h +325 -181
  185. package/eigen/Eigen/src/Core/arch/ZVector/MathFunctions.h +94 -83
  186. package/eigen/Eigen/src/Core/arch/ZVector/PacketMath.h +811 -458
  187. package/eigen/Eigen/src/Core/functors/AssignmentFunctors.h +121 -124
  188. package/eigen/Eigen/src/Core/functors/BinaryFunctors.h +576 -370
  189. package/eigen/Eigen/src/Core/functors/NullaryFunctors.h +194 -109
  190. package/eigen/Eigen/src/Core/functors/StlFunctors.h +95 -112
  191. package/eigen/Eigen/src/Core/functors/TernaryFunctors.h +34 -7
  192. package/eigen/Eigen/src/Core/functors/UnaryFunctors.h +1038 -749
  193. package/eigen/Eigen/src/Core/products/GeneralBlockPanelKernel.h +1883 -1375
  194. package/eigen/Eigen/src/Core/products/GeneralMatrixMatrix.h +312 -370
  195. package/eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h +189 -176
  196. package/eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h +84 -81
  197. package/eigen/Eigen/src/Core/products/GeneralMatrixMatrix_BLAS.h +154 -73
  198. package/eigen/Eigen/src/Core/products/GeneralMatrixVector.h +292 -337
  199. package/eigen/Eigen/src/Core/products/GeneralMatrixVector_BLAS.h +80 -77
  200. package/eigen/Eigen/src/Core/products/Parallelizer.h +207 -105
  201. package/eigen/Eigen/src/Core/products/SelfadjointMatrixMatrix.h +327 -388
  202. package/eigen/Eigen/src/Core/products/SelfadjointMatrixMatrix_BLAS.h +206 -224
  203. package/eigen/Eigen/src/Core/products/SelfadjointMatrixVector.h +138 -147
  204. package/eigen/Eigen/src/Core/products/SelfadjointMatrixVector_BLAS.h +58 -61
  205. package/eigen/Eigen/src/Core/products/SelfadjointProduct.h +71 -71
  206. package/eigen/Eigen/src/Core/products/SelfadjointRank2Update.h +48 -47
  207. package/eigen/Eigen/src/Core/products/TriangularMatrixMatrix.h +294 -369
  208. package/eigen/Eigen/src/Core/products/TriangularMatrixMatrix_BLAS.h +246 -238
  209. package/eigen/Eigen/src/Core/products/TriangularMatrixVector.h +244 -247
  210. package/eigen/Eigen/src/Core/products/TriangularMatrixVector_BLAS.h +212 -192
  211. package/eigen/Eigen/src/Core/products/TriangularSolverMatrix.h +328 -277
  212. package/eigen/Eigen/src/Core/products/TriangularSolverMatrix_BLAS.h +108 -109
  213. package/eigen/Eigen/src/Core/products/TriangularSolverVector.h +68 -94
  214. package/eigen/Eigen/src/Core/util/Assert.h +158 -0
  215. package/eigen/Eigen/src/Core/util/BlasUtil.h +342 -303
  216. package/eigen/Eigen/src/Core/util/ConfigureVectorization.h +348 -317
  217. package/eigen/Eigen/src/Core/util/Constants.h +297 -262
  218. package/eigen/Eigen/src/Core/util/DisableStupidWarnings.h +130 -90
  219. package/eigen/Eigen/src/Core/util/EmulateArray.h +270 -0
  220. package/eigen/Eigen/src/Core/util/ForwardDeclarations.h +449 -247
  221. package/eigen/Eigen/src/Core/util/GpuHipCudaDefines.inc +101 -0
  222. package/eigen/Eigen/src/Core/util/GpuHipCudaUndefines.inc +45 -0
  223. package/eigen/Eigen/src/Core/util/IndexedViewHelper.h +417 -116
  224. package/eigen/Eigen/src/Core/util/IntegralConstant.h +211 -204
  225. package/eigen/Eigen/src/Core/util/MKL_support.h +39 -37
  226. package/eigen/Eigen/src/Core/util/Macros.h +655 -773
  227. package/eigen/Eigen/src/Core/util/MaxSizeVector.h +139 -0
  228. package/eigen/Eigen/src/Core/util/Memory.h +970 -748
  229. package/eigen/Eigen/src/Core/util/Meta.h +581 -633
  230. package/eigen/Eigen/src/Core/util/MoreMeta.h +638 -0
  231. package/eigen/Eigen/src/Core/util/ReenableStupidWarnings.h +32 -19
  232. package/eigen/Eigen/src/Core/util/ReshapedHelper.h +17 -17
  233. package/eigen/Eigen/src/Core/util/Serializer.h +209 -0
  234. package/eigen/Eigen/src/Core/util/StaticAssert.h +50 -166
  235. package/eigen/Eigen/src/Core/util/SymbolicIndex.h +377 -225
  236. package/eigen/Eigen/src/Core/util/XprHelper.h +784 -547
  237. package/eigen/Eigen/src/Eigenvalues/ComplexEigenSolver.h +246 -277
  238. package/eigen/Eigen/src/Eigenvalues/ComplexSchur.h +299 -319
  239. package/eigen/Eigen/src/Eigenvalues/ComplexSchur_LAPACKE.h +52 -48
  240. package/eigen/Eigen/src/Eigenvalues/EigenSolver.h +413 -456
  241. package/eigen/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h +309 -325
  242. package/eigen/Eigen/src/Eigenvalues/GeneralizedSelfAdjointEigenSolver.h +157 -171
  243. package/eigen/Eigen/src/Eigenvalues/HessenbergDecomposition.h +292 -310
  244. package/eigen/Eigen/src/Eigenvalues/InternalHeaderCheck.h +3 -0
  245. package/eigen/Eigen/src/Eigenvalues/MatrixBaseEigenvalues.h +89 -105
  246. package/eigen/Eigen/src/Eigenvalues/RealQZ.h +537 -607
  247. package/eigen/Eigen/src/Eigenvalues/RealSchur.h +342 -381
  248. package/eigen/Eigen/src/Eigenvalues/RealSchur_LAPACKE.h +41 -35
  249. package/eigen/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h +541 -595
  250. package/eigen/Eigen/src/Eigenvalues/SelfAdjointEigenSolver_LAPACKE.h +47 -44
  251. package/eigen/Eigen/src/Eigenvalues/Tridiagonalization.h +430 -462
  252. package/eigen/Eigen/src/Geometry/AlignedBox.h +226 -227
  253. package/eigen/Eigen/src/Geometry/AngleAxis.h +131 -133
  254. package/eigen/Eigen/src/Geometry/EulerAngles.h +163 -74
  255. package/eigen/Eigen/src/Geometry/Homogeneous.h +285 -333
  256. package/eigen/Eigen/src/Geometry/Hyperplane.h +151 -160
  257. package/eigen/Eigen/src/Geometry/InternalHeaderCheck.h +3 -0
  258. package/eigen/Eigen/src/Geometry/OrthoMethods.h +168 -146
  259. package/eigen/Eigen/src/Geometry/ParametrizedLine.h +127 -127
  260. package/eigen/Eigen/src/Geometry/Quaternion.h +566 -506
  261. package/eigen/Eigen/src/Geometry/Rotation2D.h +107 -105
  262. package/eigen/Eigen/src/Geometry/RotationBase.h +148 -145
  263. package/eigen/Eigen/src/Geometry/Scaling.h +113 -106
  264. package/eigen/Eigen/src/Geometry/Transform.h +858 -936
  265. package/eigen/Eigen/src/Geometry/Translation.h +94 -92
  266. package/eigen/Eigen/src/Geometry/Umeyama.h +79 -84
  267. package/eigen/Eigen/src/Geometry/arch/Geometry_SIMD.h +90 -104
  268. package/eigen/Eigen/src/Householder/BlockHouseholder.h +51 -46
  269. package/eigen/Eigen/src/Householder/Householder.h +102 -124
  270. package/eigen/Eigen/src/Householder/HouseholderSequence.h +412 -453
  271. package/eigen/Eigen/src/Householder/InternalHeaderCheck.h +3 -0
  272. package/eigen/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h +149 -162
  273. package/eigen/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h +124 -119
  274. package/eigen/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h +92 -104
  275. package/eigen/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h +251 -243
  276. package/eigen/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h +224 -228
  277. package/eigen/Eigen/src/IterativeLinearSolvers/InternalHeaderCheck.h +3 -0
  278. package/eigen/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h +178 -227
  279. package/eigen/Eigen/src/IterativeLinearSolvers/LeastSquareConjugateGradient.h +79 -84
  280. package/eigen/Eigen/src/IterativeLinearSolvers/SolveWithGuess.h +54 -60
  281. package/eigen/Eigen/src/Jacobi/InternalHeaderCheck.h +3 -0
  282. package/eigen/Eigen/src/Jacobi/Jacobi.h +252 -308
  283. package/eigen/Eigen/src/KLUSupport/InternalHeaderCheck.h +3 -0
  284. package/eigen/Eigen/src/KLUSupport/KLUSupport.h +208 -227
  285. package/eigen/Eigen/src/LU/Determinant.h +50 -69
  286. package/eigen/Eigen/src/LU/FullPivLU.h +545 -596
  287. package/eigen/Eigen/src/LU/InternalHeaderCheck.h +3 -0
  288. package/eigen/Eigen/src/LU/InverseImpl.h +206 -285
  289. package/eigen/Eigen/src/LU/PartialPivLU.h +390 -428
  290. package/eigen/Eigen/src/LU/PartialPivLU_LAPACKE.h +54 -40
  291. package/eigen/Eigen/src/LU/arch/InverseSize4.h +72 -70
  292. package/eigen/Eigen/src/MetisSupport/InternalHeaderCheck.h +3 -0
  293. package/eigen/Eigen/src/MetisSupport/MetisSupport.h +81 -93
  294. package/eigen/Eigen/src/OrderingMethods/Amd.h +243 -265
  295. package/eigen/Eigen/src/OrderingMethods/Eigen_Colamd.h +831 -1004
  296. package/eigen/Eigen/src/OrderingMethods/InternalHeaderCheck.h +3 -0
  297. package/eigen/Eigen/src/OrderingMethods/Ordering.h +112 -119
  298. package/eigen/Eigen/src/PaStiXSupport/InternalHeaderCheck.h +3 -0
  299. package/eigen/Eigen/src/PaStiXSupport/PaStiXSupport.h +524 -570
  300. package/eigen/Eigen/src/PardisoSupport/InternalHeaderCheck.h +3 -0
  301. package/eigen/Eigen/src/PardisoSupport/PardisoSupport.h +385 -430
  302. package/eigen/Eigen/src/QR/ColPivHouseholderQR.h +479 -479
  303. package/eigen/Eigen/src/QR/ColPivHouseholderQR_LAPACKE.h +120 -56
  304. package/eigen/Eigen/src/QR/CompleteOrthogonalDecomposition.h +166 -153
  305. package/eigen/Eigen/src/QR/FullPivHouseholderQR.h +495 -475
  306. package/eigen/Eigen/src/QR/HouseholderQR.h +394 -285
  307. package/eigen/Eigen/src/QR/HouseholderQR_LAPACKE.h +32 -23
  308. package/eigen/Eigen/src/QR/InternalHeaderCheck.h +3 -0
  309. package/eigen/Eigen/src/SPQRSupport/InternalHeaderCheck.h +3 -0
  310. package/eigen/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h +244 -264
  311. package/eigen/Eigen/src/SVD/BDCSVD.h +817 -713
  312. package/eigen/Eigen/src/SVD/BDCSVD_LAPACKE.h +174 -0
  313. package/eigen/Eigen/src/SVD/InternalHeaderCheck.h +3 -0
  314. package/eigen/Eigen/src/SVD/JacobiSVD.h +577 -543
  315. package/eigen/Eigen/src/SVD/JacobiSVD_LAPACKE.h +85 -49
  316. package/eigen/Eigen/src/SVD/SVDBase.h +242 -182
  317. package/eigen/Eigen/src/SVD/UpperBidiagonalization.h +200 -235
  318. package/eigen/Eigen/src/SparseCholesky/InternalHeaderCheck.h +3 -0
  319. package/eigen/Eigen/src/SparseCholesky/SimplicialCholesky.h +765 -594
  320. package/eigen/Eigen/src/SparseCholesky/SimplicialCholesky_impl.h +308 -94
  321. package/eigen/Eigen/src/SparseCore/AmbiVector.h +202 -251
  322. package/eigen/Eigen/src/SparseCore/CompressedStorage.h +184 -252
  323. package/eigen/Eigen/src/SparseCore/ConservativeSparseSparseProduct.h +134 -178
  324. package/eigen/Eigen/src/SparseCore/InternalHeaderCheck.h +3 -0
  325. package/eigen/Eigen/src/SparseCore/SparseAssign.h +149 -140
  326. package/eigen/Eigen/src/SparseCore/SparseBlock.h +403 -440
  327. package/eigen/Eigen/src/SparseCore/SparseColEtree.h +100 -112
  328. package/eigen/Eigen/src/SparseCore/SparseCompressedBase.h +525 -303
  329. package/eigen/Eigen/src/SparseCore/SparseCwiseBinaryOp.h +555 -339
  330. package/eigen/Eigen/src/SparseCore/SparseCwiseUnaryOp.h +100 -108
  331. package/eigen/Eigen/src/SparseCore/SparseDenseProduct.h +169 -197
  332. package/eigen/Eigen/src/SparseCore/SparseDiagonalProduct.h +71 -71
  333. package/eigen/Eigen/src/SparseCore/SparseDot.h +49 -47
  334. package/eigen/Eigen/src/SparseCore/SparseFuzzy.h +13 -11
  335. package/eigen/Eigen/src/SparseCore/SparseMap.h +243 -253
  336. package/eigen/Eigen/src/SparseCore/SparseMatrix.h +1603 -1245
  337. package/eigen/Eigen/src/SparseCore/SparseMatrixBase.h +403 -350
  338. package/eigen/Eigen/src/SparseCore/SparsePermutation.h +186 -115
  339. package/eigen/Eigen/src/SparseCore/SparseProduct.h +94 -97
  340. package/eigen/Eigen/src/SparseCore/SparseRedux.h +22 -24
  341. package/eigen/Eigen/src/SparseCore/SparseRef.h +268 -295
  342. package/eigen/Eigen/src/SparseCore/SparseSelfAdjointView.h +370 -416
  343. package/eigen/Eigen/src/SparseCore/SparseSolverBase.h +78 -87
  344. package/eigen/Eigen/src/SparseCore/SparseSparseProductWithPruning.h +81 -95
  345. package/eigen/Eigen/src/SparseCore/SparseTranspose.h +62 -71
  346. package/eigen/Eigen/src/SparseCore/SparseTriangularView.h +132 -144
  347. package/eigen/Eigen/src/SparseCore/SparseUtil.h +138 -115
  348. package/eigen/Eigen/src/SparseCore/SparseVector.h +426 -372
  349. package/eigen/Eigen/src/SparseCore/SparseView.h +164 -193
  350. package/eigen/Eigen/src/SparseCore/TriangularSolver.h +129 -170
  351. package/eigen/Eigen/src/SparseLU/InternalHeaderCheck.h +3 -0
  352. package/eigen/Eigen/src/SparseLU/SparseLU.h +756 -710
  353. package/eigen/Eigen/src/SparseLU/SparseLUImpl.h +61 -48
  354. package/eigen/Eigen/src/SparseLU/SparseLU_Memory.h +102 -118
  355. package/eigen/Eigen/src/SparseLU/SparseLU_Structs.h +38 -35
  356. package/eigen/Eigen/src/SparseLU/SparseLU_SupernodalMatrix.h +245 -301
  357. package/eigen/Eigen/src/SparseLU/SparseLU_Utils.h +44 -49
  358. package/eigen/Eigen/src/SparseLU/SparseLU_column_bmod.h +104 -108
  359. package/eigen/Eigen/src/SparseLU/SparseLU_column_dfs.h +89 -100
  360. package/eigen/Eigen/src/SparseLU/SparseLU_copy_to_ucol.h +57 -58
  361. package/eigen/Eigen/src/SparseLU/SparseLU_heap_relax_snode.h +43 -55
  362. package/eigen/Eigen/src/SparseLU/SparseLU_kernel_bmod.h +74 -71
  363. package/eigen/Eigen/src/SparseLU/SparseLU_panel_bmod.h +124 -132
  364. package/eigen/Eigen/src/SparseLU/SparseLU_panel_dfs.h +136 -159
  365. package/eigen/Eigen/src/SparseLU/SparseLU_pivotL.h +51 -52
  366. package/eigen/Eigen/src/SparseLU/SparseLU_pruneL.h +67 -73
  367. package/eigen/Eigen/src/SparseLU/SparseLU_relax_snode.h +24 -26
  368. package/eigen/Eigen/src/SparseQR/InternalHeaderCheck.h +3 -0
  369. package/eigen/Eigen/src/SparseQR/SparseQR.h +450 -502
  370. package/eigen/Eigen/src/StlSupport/StdDeque.h +28 -93
  371. package/eigen/Eigen/src/StlSupport/StdList.h +28 -84
  372. package/eigen/Eigen/src/StlSupport/StdVector.h +28 -108
  373. package/eigen/Eigen/src/StlSupport/details.h +48 -50
  374. package/eigen/Eigen/src/SuperLUSupport/InternalHeaderCheck.h +3 -0
  375. package/eigen/Eigen/src/SuperLUSupport/SuperLUSupport.h +634 -730
  376. package/eigen/Eigen/src/ThreadPool/Barrier.h +70 -0
  377. package/eigen/Eigen/src/ThreadPool/CoreThreadPoolDevice.h +336 -0
  378. package/eigen/Eigen/src/ThreadPool/EventCount.h +241 -0
  379. package/eigen/Eigen/src/ThreadPool/ForkJoin.h +140 -0
  380. package/eigen/Eigen/src/ThreadPool/InternalHeaderCheck.h +4 -0
  381. package/eigen/Eigen/src/ThreadPool/NonBlockingThreadPool.h +587 -0
  382. package/eigen/Eigen/src/ThreadPool/RunQueue.h +230 -0
  383. package/eigen/Eigen/src/ThreadPool/ThreadCancel.h +21 -0
  384. package/eigen/Eigen/src/ThreadPool/ThreadEnvironment.h +43 -0
  385. package/eigen/Eigen/src/ThreadPool/ThreadLocal.h +289 -0
  386. package/eigen/Eigen/src/ThreadPool/ThreadPoolInterface.h +50 -0
  387. package/eigen/Eigen/src/ThreadPool/ThreadYield.h +16 -0
  388. package/eigen/Eigen/src/UmfPackSupport/InternalHeaderCheck.h +3 -0
  389. package/eigen/Eigen/src/UmfPackSupport/UmfPackSupport.h +428 -464
  390. package/eigen/Eigen/src/misc/Image.h +41 -43
  391. package/eigen/Eigen/src/misc/InternalHeaderCheck.h +3 -0
  392. package/eigen/Eigen/src/misc/Kernel.h +39 -41
  393. package/eigen/Eigen/src/misc/RealSvd2x2.h +19 -21
  394. package/eigen/Eigen/src/misc/blas.h +83 -426
  395. package/eigen/Eigen/src/misc/lapacke.h +9972 -16179
  396. package/eigen/Eigen/src/misc/lapacke_helpers.h +163 -0
  397. package/eigen/Eigen/src/misc/lapacke_mangling.h +4 -5
  398. package/eigen/Eigen/src/plugins/ArrayCwiseBinaryOps.inc +344 -0
  399. package/eigen/Eigen/src/plugins/ArrayCwiseUnaryOps.inc +544 -0
  400. package/eigen/Eigen/src/plugins/{BlockMethods.h → BlockMethods.inc} +434 -506
  401. package/eigen/Eigen/src/plugins/CommonCwiseBinaryOps.inc +116 -0
  402. package/eigen/Eigen/src/plugins/{CommonCwiseUnaryOps.h → CommonCwiseUnaryOps.inc} +58 -68
  403. package/eigen/Eigen/src/plugins/IndexedViewMethods.inc +192 -0
  404. package/eigen/Eigen/src/plugins/InternalHeaderCheck.inc +3 -0
  405. package/eigen/Eigen/src/plugins/MatrixCwiseBinaryOps.inc +331 -0
  406. package/eigen/Eigen/src/plugins/MatrixCwiseUnaryOps.inc +118 -0
  407. package/eigen/Eigen/src/plugins/ReshapedMethods.inc +133 -0
  408. package/package.json +1 -1
  409. package/eigen/COPYING.APACHE +0 -203
  410. package/eigen/COPYING.BSD +0 -26
  411. package/eigen/COPYING.GPL +0 -674
  412. package/eigen/COPYING.LGPL +0 -502
  413. package/eigen/COPYING.MINPACK +0 -51
  414. package/eigen/COPYING.MPL2 +0 -373
  415. package/eigen/COPYING.README +0 -18
  416. package/eigen/Eigen/src/Core/BooleanRedux.h +0 -162
  417. package/eigen/Eigen/src/Core/arch/CUDA/Complex.h +0 -258
  418. package/eigen/Eigen/src/Core/arch/Default/TypeCasting.h +0 -120
  419. package/eigen/Eigen/src/Core/arch/SYCL/SyclMemoryModel.h +0 -694
  420. package/eigen/Eigen/src/Core/util/NonMPL2.h +0 -3
  421. package/eigen/Eigen/src/SparseCore/MappedSparseMatrix.h +0 -67
  422. package/eigen/Eigen/src/SparseLU/SparseLU_gemm_kernel.h +0 -280
  423. package/eigen/Eigen/src/misc/lapack.h +0 -152
  424. package/eigen/Eigen/src/plugins/ArrayCwiseBinaryOps.h +0 -358
  425. package/eigen/Eigen/src/plugins/ArrayCwiseUnaryOps.h +0 -696
  426. package/eigen/Eigen/src/plugins/CommonCwiseBinaryOps.h +0 -115
  427. package/eigen/Eigen/src/plugins/IndexedViewMethods.h +0 -262
  428. package/eigen/Eigen/src/plugins/MatrixCwiseBinaryOps.h +0 -152
  429. package/eigen/Eigen/src/plugins/MatrixCwiseUnaryOps.h +0 -95
  430. package/eigen/Eigen/src/plugins/ReshapedMethods.h +0 -149
  431. package/eigen/README.md +0 -5
@@ -10,6 +10,9 @@
10
10
  #ifndef EIGEN_PACKET_MATH_GPU_H
11
11
  #define EIGEN_PACKET_MATH_GPU_H
12
12
 
13
+ // IWYU pragma: private
14
+ #include "../../InternalHeaderCheck.h"
15
+
13
16
  namespace Eigen {
14
17
 
15
18
  namespace internal {
@@ -28,29 +31,43 @@ namespace internal {
28
31
  #define EIGEN_GPU_HAS_FP16_ARITHMETIC 1
29
32
  #endif
30
33
 
34
+ // We need to distinguish ‘clang as the CUDA compiler’ from ‘clang as the host compiler,
35
+ // invoked by NVCC’ (e.g. on MacOS). The former needs to see both host and device implementation
36
+ // of the functions, while the latter can only deal with one of them.
37
+ #if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIPCC) || (defined(EIGEN_CUDACC) && EIGEN_COMP_CLANG && !EIGEN_COMP_NVCC)
38
+ #define EIGEN_HAS_GPU_DEVICE_FUNCTIONS 1
39
+ #else
40
+ #define EIGEN_HAS_GPU_DEVICE_FUNCTIONS 0
41
+ #endif
42
+
31
43
  // Make sure this is only available when targeting a GPU: we don't want to
32
44
  // introduce conflicts between these packet_traits definitions and the ones
33
45
  // we'll use on the host side (SSE, AVX, ...)
34
46
  #if defined(EIGEN_GPUCC) && defined(EIGEN_USE_GPU)
35
47
 
36
- template<> struct is_arithmetic<float4> { enum { value = true }; };
37
- template<> struct is_arithmetic<double2> { enum { value = true }; };
48
+ template <>
49
+ struct is_arithmetic<float4> {
50
+ enum { value = true };
51
+ };
52
+ template <>
53
+ struct is_arithmetic<double2> {
54
+ enum { value = true };
55
+ };
38
56
 
39
- template<> struct packet_traits<float> : default_packet_traits
40
- {
57
+ template <>
58
+ struct packet_traits<float> : default_packet_traits {
41
59
  typedef float4 type;
42
60
  typedef float4 half;
43
61
  enum {
44
62
  Vectorizable = 1,
45
63
  AlignedOnScalar = 1,
46
- size=4,
47
- HasHalfPacket = 0,
48
-
49
- HasDiv = 1,
50
- HasSin = 0,
51
- HasCos = 0,
52
- HasLog = 1,
53
- HasExp = 1,
64
+ size = 4,
65
+
66
+ HasDiv = 1,
67
+ HasSin = 0,
68
+ HasCos = 0,
69
+ HasLog = 1,
70
+ HasExp = 1,
54
71
  HasSqrt = 1,
55
72
  HasRsqrt = 1,
56
73
  HasLGamma = 1,
@@ -69,22 +86,22 @@ template<> struct packet_traits<float> : default_packet_traits
69
86
 
70
87
  HasBlend = 0,
71
88
  HasFloor = 1,
89
+ HasCmp = EIGEN_HAS_GPU_DEVICE_FUNCTIONS
72
90
  };
73
91
  };
74
92
 
75
- template<> struct packet_traits<double> : default_packet_traits
76
- {
93
+ template <>
94
+ struct packet_traits<double> : default_packet_traits {
77
95
  typedef double2 type;
78
96
  typedef double2 half;
79
97
  enum {
80
98
  Vectorizable = 1,
81
99
  AlignedOnScalar = 1,
82
- size=2,
83
- HasHalfPacket = 0,
100
+ size = 2,
84
101
 
85
- HasDiv = 1,
86
- HasLog = 1,
87
- HasExp = 1,
102
+ HasDiv = 1,
103
+ HasLog = 1,
104
+ HasExp = 1,
88
105
  HasSqrt = 1,
89
106
  HasRsqrt = 1,
90
107
  HasLGamma = 1,
@@ -100,365 +117,440 @@ template<> struct packet_traits<double> : default_packet_traits
100
117
  HasGammaSampleDerAlpha = 1,
101
118
  HasIGammac = 1,
102
119
  HasBetaInc = 1,
103
-
104
120
  HasBlend = 0,
105
- HasFloor = 1,
106
121
  };
107
122
  };
108
123
 
124
+ template <>
125
+ struct unpacket_traits<float4> {
126
+ typedef float type;
127
+ enum {
128
+ size = 4,
129
+ alignment = Aligned16,
130
+ vectorizable = true,
131
+ masked_load_available = false,
132
+ masked_store_available = false
133
+ };
134
+ typedef float4 half;
135
+ };
136
+ template <>
137
+ struct unpacket_traits<double2> {
138
+ typedef double type;
139
+ enum {
140
+ size = 2,
141
+ alignment = Aligned16,
142
+ vectorizable = true,
143
+ masked_load_available = false,
144
+ masked_store_available = false
145
+ };
146
+ typedef double2 half;
147
+ };
109
148
 
110
- template<> struct unpacket_traits<float4> { typedef float type; enum {size=4, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef float4 half; };
111
- template<> struct unpacket_traits<double2> { typedef double type; enum {size=2, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef double2 half; };
112
-
113
- template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pset1<float4>(const float& from) {
149
+ template <>
150
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pset1<float4>(const float& from) {
114
151
  return make_float4(from, from, from, from);
115
152
  }
116
- template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pset1<double2>(const double& from) {
153
+ template <>
154
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pset1<double2>(const double& from) {
117
155
  return make_double2(from, from);
118
156
  }
119
157
 
120
- // We need to distinguish ‘clang as the CUDA compiler’ from ‘clang as the host compiler,
121
- // invoked by NVCC’ (e.g. on MacOS). The former needs to see both host and device implementation
122
- // of the functions, while the latter can only deal with one of them.
123
- #if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIPCC) || (defined(EIGEN_CUDACC) && EIGEN_COMP_CLANG && !EIGEN_COMP_NVCC)
124
- namespace {
158
+ #if EIGEN_HAS_GPU_DEVICE_FUNCTIONS
125
159
 
126
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float bitwise_and(const float& a,
127
- const float& b) {
160
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float bitwise_and(const float& a, const float& b) {
128
161
  return __int_as_float(__float_as_int(a) & __float_as_int(b));
129
162
  }
130
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bitwise_and(const double& a,
131
- const double& b) {
132
- return __longlong_as_double(__double_as_longlong(a) &
133
- __double_as_longlong(b));
163
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bitwise_and(const double& a, const double& b) {
164
+ return __longlong_as_double(__double_as_longlong(a) & __double_as_longlong(b));
134
165
  }
135
166
 
136
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float bitwise_or(const float& a,
137
- const float& b) {
167
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float bitwise_or(const float& a, const float& b) {
138
168
  return __int_as_float(__float_as_int(a) | __float_as_int(b));
139
169
  }
140
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bitwise_or(const double& a,
141
- const double& b) {
142
- return __longlong_as_double(__double_as_longlong(a) |
143
- __double_as_longlong(b));
170
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bitwise_or(const double& a, const double& b) {
171
+ return __longlong_as_double(__double_as_longlong(a) | __double_as_longlong(b));
144
172
  }
145
173
 
146
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float bitwise_xor(const float& a,
147
- const float& b) {
174
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float bitwise_xor(const float& a, const float& b) {
148
175
  return __int_as_float(__float_as_int(a) ^ __float_as_int(b));
149
176
  }
150
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bitwise_xor(const double& a,
151
- const double& b) {
152
- return __longlong_as_double(__double_as_longlong(a) ^
153
- __double_as_longlong(b));
177
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bitwise_xor(const double& a, const double& b) {
178
+ return __longlong_as_double(__double_as_longlong(a) ^ __double_as_longlong(b));
154
179
  }
155
180
 
156
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float bitwise_andnot(const float& a,
157
- const float& b) {
181
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float bitwise_andnot(const float& a, const float& b) {
158
182
  return __int_as_float(__float_as_int(a) & ~__float_as_int(b));
159
183
  }
160
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bitwise_andnot(const double& a,
161
- const double& b) {
162
- return __longlong_as_double(__double_as_longlong(a) &
163
- ~__double_as_longlong(b));
184
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bitwise_andnot(const double& a, const double& b) {
185
+ return __longlong_as_double(__double_as_longlong(a) & ~__double_as_longlong(b));
164
186
  }
165
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float eq_mask(const float& a,
166
- const float& b) {
187
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float eq_mask(const float& a, const float& b) {
167
188
  return __int_as_float(a == b ? 0xffffffffu : 0u);
168
189
  }
169
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double eq_mask(const double& a,
170
- const double& b) {
190
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double eq_mask(const double& a, const double& b) {
171
191
  return __longlong_as_double(a == b ? 0xffffffffffffffffull : 0ull);
172
192
  }
173
193
 
174
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float lt_mask(const float& a,
175
- const float& b) {
194
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float lt_mask(const float& a, const float& b) {
176
195
  return __int_as_float(a < b ? 0xffffffffu : 0u);
177
196
  }
178
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double lt_mask(const double& a,
179
- const double& b) {
197
+
198
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double lt_mask(const double& a, const double& b) {
180
199
  return __longlong_as_double(a < b ? 0xffffffffffffffffull : 0ull);
181
200
  }
182
201
 
183
- } // namespace
202
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float le_mask(const float& a, const float& b) {
203
+ return __int_as_float(a <= b ? 0xffffffffu : 0u);
204
+ }
205
+
206
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double le_mask(const double& a, const double& b) {
207
+ return __longlong_as_double(a <= b ? 0xffffffffffffffffull : 0ull);
208
+ }
184
209
 
185
210
  template <>
186
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pand<float4>(const float4& a,
187
- const float4& b) {
188
- return make_float4(bitwise_and(a.x, b.x), bitwise_and(a.y, b.y),
189
- bitwise_and(a.z, b.z), bitwise_and(a.w, b.w));
211
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pand<float4>(const float4& a, const float4& b) {
212
+ return make_float4(bitwise_and(a.x, b.x), bitwise_and(a.y, b.y), bitwise_and(a.z, b.z), bitwise_and(a.w, b.w));
190
213
  }
191
214
  template <>
192
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pand<double2>(const double2& a,
193
- const double2& b) {
215
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pand<double2>(const double2& a, const double2& b) {
194
216
  return make_double2(bitwise_and(a.x, b.x), bitwise_and(a.y, b.y));
195
217
  }
196
218
 
197
219
  template <>
198
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 por<float4>(const float4& a,
199
- const float4& b) {
200
- return make_float4(bitwise_or(a.x, b.x), bitwise_or(a.y, b.y),
201
- bitwise_or(a.z, b.z), bitwise_or(a.w, b.w));
220
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 por<float4>(const float4& a, const float4& b) {
221
+ return make_float4(bitwise_or(a.x, b.x), bitwise_or(a.y, b.y), bitwise_or(a.z, b.z), bitwise_or(a.w, b.w));
202
222
  }
203
223
  template <>
204
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 por<double2>(const double2& a,
205
- const double2& b) {
224
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 por<double2>(const double2& a, const double2& b) {
206
225
  return make_double2(bitwise_or(a.x, b.x), bitwise_or(a.y, b.y));
207
226
  }
208
227
 
209
228
  template <>
210
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pxor<float4>(const float4& a,
211
- const float4& b) {
212
- return make_float4(bitwise_xor(a.x, b.x), bitwise_xor(a.y, b.y),
213
- bitwise_xor(a.z, b.z), bitwise_xor(a.w, b.w));
229
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pxor<float4>(const float4& a, const float4& b) {
230
+ return make_float4(bitwise_xor(a.x, b.x), bitwise_xor(a.y, b.y), bitwise_xor(a.z, b.z), bitwise_xor(a.w, b.w));
214
231
  }
215
232
  template <>
216
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pxor<double2>(const double2& a,
217
- const double2& b) {
233
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pxor<double2>(const double2& a, const double2& b) {
218
234
  return make_double2(bitwise_xor(a.x, b.x), bitwise_xor(a.y, b.y));
219
235
  }
220
236
 
221
237
  template <>
222
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pandnot<float4>(const float4& a,
223
- const float4& b) {
224
- return make_float4(bitwise_andnot(a.x, b.x), bitwise_andnot(a.y, b.y),
225
- bitwise_andnot(a.z, b.z), bitwise_andnot(a.w, b.w));
238
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pandnot<float4>(const float4& a, const float4& b) {
239
+ return make_float4(bitwise_andnot(a.x, b.x), bitwise_andnot(a.y, b.y), bitwise_andnot(a.z, b.z),
240
+ bitwise_andnot(a.w, b.w));
226
241
  }
227
242
  template <>
228
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2
229
- pandnot<double2>(const double2& a, const double2& b) {
243
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pandnot<double2>(const double2& a, const double2& b) {
230
244
  return make_double2(bitwise_andnot(a.x, b.x), bitwise_andnot(a.y, b.y));
231
245
  }
232
246
 
233
247
  template <>
234
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pcmp_eq<float4>(const float4& a,
235
- const float4& b) {
236
- return make_float4(eq_mask(a.x, b.x), eq_mask(a.y, b.y), eq_mask(a.z, b.z),
237
- eq_mask(a.w, b.w));
248
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pcmp_eq<float4>(const float4& a, const float4& b) {
249
+ return make_float4(eq_mask(a.x, b.x), eq_mask(a.y, b.y), eq_mask(a.z, b.z), eq_mask(a.w, b.w));
238
250
  }
239
251
  template <>
240
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pcmp_lt<float4>(const float4& a,
241
- const float4& b) {
242
- return make_float4(lt_mask(a.x, b.x), lt_mask(a.y, b.y), lt_mask(a.z, b.z),
243
- lt_mask(a.w, b.w));
252
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pcmp_lt<float4>(const float4& a, const float4& b) {
253
+ return make_float4(lt_mask(a.x, b.x), lt_mask(a.y, b.y), lt_mask(a.z, b.z), lt_mask(a.w, b.w));
244
254
  }
245
255
  template <>
246
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2
247
- pcmp_eq<double2>(const double2& a, const double2& b) {
256
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pcmp_le<float4>(const float4& a, const float4& b) {
257
+ return make_float4(le_mask(a.x, b.x), le_mask(a.y, b.y), le_mask(a.z, b.z), le_mask(a.w, b.w));
258
+ }
259
+ template <>
260
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pcmp_eq<double2>(const double2& a, const double2& b) {
248
261
  return make_double2(eq_mask(a.x, b.x), eq_mask(a.y, b.y));
249
262
  }
250
263
  template <>
251
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2
252
- pcmp_lt<double2>(const double2& a, const double2& b) {
264
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pcmp_lt<double2>(const double2& a, const double2& b) {
253
265
  return make_double2(lt_mask(a.x, b.x), lt_mask(a.y, b.y));
254
266
  }
255
- #endif // defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIPCC) || (defined(EIGEN_CUDACC) && EIGEN_COMP_CLANG && !EIGEN_COMP_NVCC)
267
+ template <>
268
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pcmp_le<double2>(const double2& a, const double2& b) {
269
+ return make_double2(le_mask(a.x, b.x), le_mask(a.y, b.y));
270
+ }
271
+ #endif // EIGEN_HAS_GPU_DEVICE_FUNCTIONS
256
272
 
257
- template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 plset<float4>(const float& a) {
258
- return make_float4(a, a+1, a+2, a+3);
273
+ template <>
274
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 plset<float4>(const float& a) {
275
+ return make_float4(a, a + 1, a + 2, a + 3);
259
276
  }
260
- template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 plset<double2>(const double& a) {
261
- return make_double2(a, a+1);
277
+ template <>
278
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 plset<double2>(const double& a) {
279
+ return make_double2(a, a + 1);
262
280
  }
263
281
 
264
- template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 padd<float4>(const float4& a, const float4& b) {
265
- return make_float4(a.x+b.x, a.y+b.y, a.z+b.z, a.w+b.w);
282
+ template <>
283
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 padd<float4>(const float4& a, const float4& b) {
284
+ return make_float4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w);
266
285
  }
267
- template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 padd<double2>(const double2& a, const double2& b) {
268
- return make_double2(a.x+b.x, a.y+b.y);
286
+ template <>
287
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 padd<double2>(const double2& a, const double2& b) {
288
+ return make_double2(a.x + b.x, a.y + b.y);
269
289
  }
270
290
 
271
- template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 psub<float4>(const float4& a, const float4& b) {
272
- return make_float4(a.x-b.x, a.y-b.y, a.z-b.z, a.w-b.w);
291
+ template <>
292
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 psub<float4>(const float4& a, const float4& b) {
293
+ return make_float4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w);
273
294
  }
274
- template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 psub<double2>(const double2& a, const double2& b) {
275
- return make_double2(a.x-b.x, a.y-b.y);
295
+ template <>
296
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 psub<double2>(const double2& a, const double2& b) {
297
+ return make_double2(a.x - b.x, a.y - b.y);
276
298
  }
277
299
 
278
- template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pnegate(const float4& a) {
300
+ template <>
301
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pnegate(const float4& a) {
279
302
  return make_float4(-a.x, -a.y, -a.z, -a.w);
280
303
  }
281
- template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pnegate(const double2& a) {
304
+ template <>
305
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pnegate(const double2& a) {
282
306
  return make_double2(-a.x, -a.y);
283
307
  }
284
308
 
285
- template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pconj(const float4& a) { return a; }
286
- template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pconj(const double2& a) { return a; }
309
+ template <>
310
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pconj(const float4& a) {
311
+ return a;
312
+ }
313
+ template <>
314
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pconj(const double2& a) {
315
+ return a;
316
+ }
287
317
 
288
- template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pmul<float4>(const float4& a, const float4& b) {
289
- return make_float4(a.x*b.x, a.y*b.y, a.z*b.z, a.w*b.w);
318
+ template <>
319
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pmul<float4>(const float4& a, const float4& b) {
320
+ return make_float4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w);
290
321
  }
291
- template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pmul<double2>(const double2& a, const double2& b) {
292
- return make_double2(a.x*b.x, a.y*b.y);
322
+ template <>
323
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pmul<double2>(const double2& a, const double2& b) {
324
+ return make_double2(a.x * b.x, a.y * b.y);
293
325
  }
294
326
 
295
- template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pdiv<float4>(const float4& a, const float4& b) {
296
- return make_float4(a.x/b.x, a.y/b.y, a.z/b.z, a.w/b.w);
327
+ template <>
328
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pdiv<float4>(const float4& a, const float4& b) {
329
+ return make_float4(a.x / b.x, a.y / b.y, a.z / b.z, a.w / b.w);
297
330
  }
298
- template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pdiv<double2>(const double2& a, const double2& b) {
299
- return make_double2(a.x/b.x, a.y/b.y);
331
+ template <>
332
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pdiv<double2>(const double2& a, const double2& b) {
333
+ return make_double2(a.x / b.x, a.y / b.y);
300
334
  }
301
335
 
302
- template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pmin<float4>(const float4& a, const float4& b) {
336
+ template <>
337
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pmin<float4>(const float4& a, const float4& b) {
303
338
  return make_float4(fminf(a.x, b.x), fminf(a.y, b.y), fminf(a.z, b.z), fminf(a.w, b.w));
304
339
  }
305
- template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pmin<double2>(const double2& a, const double2& b) {
340
+ template <>
341
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pmin<double2>(const double2& a, const double2& b) {
306
342
  return make_double2(fmin(a.x, b.x), fmin(a.y, b.y));
307
343
  }
308
344
 
309
- template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pmax<float4>(const float4& a, const float4& b) {
345
+ template <>
346
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pmax<float4>(const float4& a, const float4& b) {
310
347
  return make_float4(fmaxf(a.x, b.x), fmaxf(a.y, b.y), fmaxf(a.z, b.z), fmaxf(a.w, b.w));
311
348
  }
312
- template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pmax<double2>(const double2& a, const double2& b) {
349
+ template <>
350
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pmax<double2>(const double2& a, const double2& b) {
313
351
  return make_double2(fmax(a.x, b.x), fmax(a.y, b.y));
314
352
  }
315
353
 
316
- template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pload<float4>(const float* from) {
354
+ template <>
355
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pload<float4>(const float* from) {
317
356
  return *reinterpret_cast<const float4*>(from);
318
357
  }
319
358
 
320
- template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pload<double2>(const double* from) {
359
+ template <>
360
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pload<double2>(const double* from) {
321
361
  return *reinterpret_cast<const double2*>(from);
322
362
  }
323
363
 
324
- template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 ploadu<float4>(const float* from) {
364
+ template <>
365
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 ploadu<float4>(const float* from) {
325
366
  return make_float4(from[0], from[1], from[2], from[3]);
326
367
  }
327
- template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 ploadu<double2>(const double* from) {
368
+ template <>
369
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 ploadu<double2>(const double* from) {
328
370
  return make_double2(from[0], from[1]);
329
371
  }
330
372
 
331
- template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 ploaddup<float4>(const float* from) {
373
+ template <>
374
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 ploaddup<float4>(const float* from) {
332
375
  return make_float4(from[0], from[0], from[1], from[1]);
333
376
  }
334
- template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 ploaddup<double2>(const double* from) {
377
+ template <>
378
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 ploaddup<double2>(const double* from) {
335
379
  return make_double2(from[0], from[0]);
336
380
  }
337
381
 
338
- template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore<float>(float* to, const float4& from) {
382
+ template <>
383
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore<float>(float* to, const float4& from) {
339
384
  *reinterpret_cast<float4*>(to) = from;
340
385
  }
341
386
 
342
- template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore<double>(double* to, const double2& from) {
387
+ template <>
388
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore<double>(double* to, const double2& from) {
343
389
  *reinterpret_cast<double2*>(to) = from;
344
390
  }
345
391
 
346
- template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const float4& from) {
392
+ template <>
393
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const float4& from) {
347
394
  to[0] = from.x;
348
395
  to[1] = from.y;
349
396
  to[2] = from.z;
350
397
  to[3] = from.w;
351
398
  }
352
399
 
353
- template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const double2& from) {
400
+ template <>
401
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const double2& from) {
354
402
  to[0] = from.x;
355
403
  to[1] = from.y;
356
404
  }
357
405
 
358
- template<>
406
+ template <>
359
407
  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Aligned>(const float* from) {
360
408
  #if defined(EIGEN_GPU_HAS_LDG)
361
- return __ldg((const float4*)from);
409
+ return __ldg(reinterpret_cast<const float4*>(from));
362
410
  #else
363
411
  return make_float4(from[0], from[1], from[2], from[3]);
364
412
  #endif
365
413
  }
366
- template<>
414
+ template <>
367
415
  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro<double2, Aligned>(const double* from) {
368
416
  #if defined(EIGEN_GPU_HAS_LDG)
369
- return __ldg((const double2*)from);
417
+ return __ldg(reinterpret_cast<const double2*>(from));
370
418
  #else
371
419
  return make_double2(from[0], from[1]);
372
420
  #endif
373
421
  }
374
422
 
375
- template<>
423
+ template <>
376
424
  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Unaligned>(const float* from) {
377
425
  #if defined(EIGEN_GPU_HAS_LDG)
378
- return make_float4(__ldg(from+0), __ldg(from+1), __ldg(from+2), __ldg(from+3));
426
+ return make_float4(__ldg(from + 0), __ldg(from + 1), __ldg(from + 2), __ldg(from + 3));
379
427
  #else
380
428
  return make_float4(from[0], from[1], from[2], from[3]);
381
429
  #endif
382
430
  }
383
- template<>
431
+ template <>
384
432
  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro<double2, Unaligned>(const double* from) {
385
433
  #if defined(EIGEN_GPU_HAS_LDG)
386
- return make_double2(__ldg(from+0), __ldg(from+1));
434
+ return make_double2(__ldg(from + 0), __ldg(from + 1));
387
435
  #else
388
436
  return make_double2(from[0], from[1]);
389
437
  #endif
390
438
  }
391
439
 
392
- template<> EIGEN_DEVICE_FUNC inline float4 pgather<float, float4>(const float* from, Index stride) {
393
- return make_float4(from[0*stride], from[1*stride], from[2*stride], from[3*stride]);
440
+ template <>
441
+ EIGEN_DEVICE_FUNC inline float4 pgather<float, float4>(const float* from, Index stride) {
442
+ return make_float4(from[0 * stride], from[1 * stride], from[2 * stride], from[3 * stride]);
394
443
  }
395
444
 
396
- template<> EIGEN_DEVICE_FUNC inline double2 pgather<double, double2>(const double* from, Index stride) {
397
- return make_double2(from[0*stride], from[1*stride]);
445
+ template <>
446
+ EIGEN_DEVICE_FUNC inline double2 pgather<double, double2>(const double* from, Index stride) {
447
+ return make_double2(from[0 * stride], from[1 * stride]);
398
448
  }
399
449
 
400
- template<> EIGEN_DEVICE_FUNC inline void pscatter<float, float4>(float* to, const float4& from, Index stride) {
401
- to[stride*0] = from.x;
402
- to[stride*1] = from.y;
403
- to[stride*2] = from.z;
404
- to[stride*3] = from.w;
450
+ template <>
451
+ EIGEN_DEVICE_FUNC inline void pscatter<float, float4>(float* to, const float4& from, Index stride) {
452
+ to[stride * 0] = from.x;
453
+ to[stride * 1] = from.y;
454
+ to[stride * 2] = from.z;
455
+ to[stride * 3] = from.w;
405
456
  }
406
- template<> EIGEN_DEVICE_FUNC inline void pscatter<double, double2>(double* to, const double2& from, Index stride) {
407
- to[stride*0] = from.x;
408
- to[stride*1] = from.y;
457
+ template <>
458
+ EIGEN_DEVICE_FUNC inline void pscatter<double, double2>(double* to, const double2& from, Index stride) {
459
+ to[stride * 0] = from.x;
460
+ to[stride * 1] = from.y;
409
461
  }
410
462
 
411
- template<> EIGEN_DEVICE_FUNC inline float pfirst<float4>(const float4& a) {
463
+ template <>
464
+ EIGEN_DEVICE_FUNC inline float pfirst<float4>(const float4& a) {
412
465
  return a.x;
413
466
  }
414
- template<> EIGEN_DEVICE_FUNC inline double pfirst<double2>(const double2& a) {
467
+ template <>
468
+ EIGEN_DEVICE_FUNC inline double pfirst<double2>(const double2& a) {
415
469
  return a.x;
416
470
  }
417
471
 
418
- template<> EIGEN_DEVICE_FUNC inline float predux<float4>(const float4& a) {
472
+ template <>
473
+ EIGEN_DEVICE_FUNC inline float predux<float4>(const float4& a) {
419
474
  return a.x + a.y + a.z + a.w;
420
475
  }
421
- template<> EIGEN_DEVICE_FUNC inline double predux<double2>(const double2& a) {
476
+ template <>
477
+ EIGEN_DEVICE_FUNC inline double predux<double2>(const double2& a) {
422
478
  return a.x + a.y;
423
479
  }
424
480
 
425
- template<> EIGEN_DEVICE_FUNC inline float predux_max<float4>(const float4& a) {
481
+ template <>
482
+ EIGEN_DEVICE_FUNC inline float predux_max<float4>(const float4& a) {
426
483
  return fmaxf(fmaxf(a.x, a.y), fmaxf(a.z, a.w));
427
484
  }
428
- template<> EIGEN_DEVICE_FUNC inline double predux_max<double2>(const double2& a) {
485
+ template <>
486
+ EIGEN_DEVICE_FUNC inline double predux_max<double2>(const double2& a) {
429
487
  return fmax(a.x, a.y);
430
488
  }
431
489
 
432
- template<> EIGEN_DEVICE_FUNC inline float predux_min<float4>(const float4& a) {
490
+ template <>
491
+ EIGEN_DEVICE_FUNC inline float predux_min<float4>(const float4& a) {
433
492
  return fminf(fminf(a.x, a.y), fminf(a.z, a.w));
434
493
  }
435
- template<> EIGEN_DEVICE_FUNC inline double predux_min<double2>(const double2& a) {
494
+ template <>
495
+ EIGEN_DEVICE_FUNC inline double predux_min<double2>(const double2& a) {
436
496
  return fmin(a.x, a.y);
437
497
  }
438
498
 
439
- template<> EIGEN_DEVICE_FUNC inline float predux_mul<float4>(const float4& a) {
499
+ template <>
500
+ EIGEN_DEVICE_FUNC inline float predux_mul<float4>(const float4& a) {
440
501
  return a.x * a.y * a.z * a.w;
441
502
  }
442
- template<> EIGEN_DEVICE_FUNC inline double predux_mul<double2>(const double2& a) {
503
+ template <>
504
+ EIGEN_DEVICE_FUNC inline double predux_mul<double2>(const double2& a) {
443
505
  return a.x * a.y;
444
506
  }
445
507
 
446
- template<> EIGEN_DEVICE_FUNC inline float4 pabs<float4>(const float4& a) {
508
+ template <>
509
+ EIGEN_DEVICE_FUNC inline float4 pabs<float4>(const float4& a) {
447
510
  return make_float4(fabsf(a.x), fabsf(a.y), fabsf(a.z), fabsf(a.w));
448
511
  }
449
- template<> EIGEN_DEVICE_FUNC inline double2 pabs<double2>(const double2& a) {
512
+ template <>
513
+ EIGEN_DEVICE_FUNC inline double2 pabs<double2>(const double2& a) {
450
514
  return make_double2(fabs(a.x), fabs(a.y));
451
515
  }
452
516
 
453
- template<> EIGEN_DEVICE_FUNC inline float4 pfloor<float4>(const float4& a) {
517
+ template <>
518
+ EIGEN_DEVICE_FUNC inline float4 pfloor<float4>(const float4& a) {
454
519
  return make_float4(floorf(a.x), floorf(a.y), floorf(a.z), floorf(a.w));
455
520
  }
456
- template<> EIGEN_DEVICE_FUNC inline double2 pfloor<double2>(const double2& a) {
521
+ template <>
522
+ EIGEN_DEVICE_FUNC inline double2 pfloor<double2>(const double2& a) {
457
523
  return make_double2(floor(a.x), floor(a.y));
458
524
  }
459
525
 
460
- EIGEN_DEVICE_FUNC inline void
461
- ptranspose(PacketBlock<float4,4>& kernel) {
526
+ template <>
527
+ EIGEN_DEVICE_FUNC inline float4 pceil<float4>(const float4& a) {
528
+ return make_float4(ceilf(a.x), ceilf(a.y), ceilf(a.z), ceilf(a.w));
529
+ }
530
+ template <>
531
+ EIGEN_DEVICE_FUNC inline double2 pceil<double2>(const double2& a) {
532
+ return make_double2(ceil(a.x), ceil(a.y));
533
+ }
534
+
535
+ template <>
536
+ EIGEN_DEVICE_FUNC inline float4 print<float4>(const float4& a) {
537
+ return make_float4(rintf(a.x), rintf(a.y), rintf(a.z), rintf(a.w));
538
+ }
539
+ template <>
540
+ EIGEN_DEVICE_FUNC inline double2 print<double2>(const double2& a) {
541
+ return make_double2(rint(a.x), rint(a.y));
542
+ }
543
+
544
+ template <>
545
+ EIGEN_DEVICE_FUNC inline float4 ptrunc<float4>(const float4& a) {
546
+ return make_float4(truncf(a.x), truncf(a.y), truncf(a.z), truncf(a.w));
547
+ }
548
+ template <>
549
+ EIGEN_DEVICE_FUNC inline double2 ptrunc<double2>(const double2& a) {
550
+ return make_double2(trunc(a.x), trunc(a.y));
551
+ }
552
+
553
+ EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<float4, 4>& kernel) {
462
554
  float tmp = kernel.packet[0].y;
463
555
  kernel.packet[0].y = kernel.packet[1].x;
464
556
  kernel.packet[1].x = tmp;
@@ -484,89 +576,82 @@ ptranspose(PacketBlock<float4,4>& kernel) {
484
576
  kernel.packet[3].z = tmp;
485
577
  }
486
578
 
487
- EIGEN_DEVICE_FUNC inline void
488
- ptranspose(PacketBlock<double2,2>& kernel) {
579
+ EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<double2, 2>& kernel) {
489
580
  double tmp = kernel.packet[0].y;
490
581
  kernel.packet[0].y = kernel.packet[1].x;
491
582
  kernel.packet[1].x = tmp;
492
583
  }
493
584
 
494
- #endif // defined(EIGEN_GPUCC) && defined(EIGEN_USE_GPU)
585
+ #endif // defined(EIGEN_GPUCC) && defined(EIGEN_USE_GPU)
495
586
 
496
- // Packet4h2 must be defined in the macro without EIGEN_CUDA_ARCH, meaning
497
- // its corresponding packet_traits<Eigen::half> must be visible on host.
498
- #if defined(EIGEN_HAS_CUDA_FP16) || defined(EIGEN_HAS_HIP_FP16)
587
+ // Half-packet functions are not available on the host for CUDA 9.0-9.2, only
588
+ // on device. There is no benefit to using them on the host anyways, since they are
589
+ // emulated.
590
+ #if (defined(EIGEN_HAS_CUDA_FP16) || defined(EIGEN_HAS_HIP_FP16)) && defined(EIGEN_GPU_COMPILE_PHASE)
499
591
 
500
592
  typedef ulonglong2 Packet4h2;
501
- template<> struct unpacket_traits<Packet4h2> { typedef Eigen::half type; enum {size=8, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet4h2 half; };
502
- template<> struct is_arithmetic<Packet4h2> { enum { value = true }; };
593
+ template <>
594
+ struct unpacket_traits<Packet4h2> {
595
+ typedef Eigen::half type;
596
+ enum {
597
+ size = 8,
598
+ alignment = Aligned16,
599
+ vectorizable = true,
600
+ masked_load_available = false,
601
+ masked_store_available = false
602
+ };
603
+ typedef Packet4h2 half;
604
+ };
605
+ template <>
606
+ struct is_arithmetic<Packet4h2> {
607
+ enum { value = true };
608
+ };
503
609
 
504
- template<> struct unpacket_traits<half2> { typedef Eigen::half type; enum {size=2, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef half2 half; };
505
- template<> struct is_arithmetic<half2> { enum { value = true }; };
610
+ template <>
611
+ struct unpacket_traits<half2> {
612
+ typedef Eigen::half type;
613
+ enum {
614
+ size = 2,
615
+ alignment = Aligned16,
616
+ vectorizable = true,
617
+ masked_load_available = false,
618
+ masked_store_available = false
619
+ };
620
+ typedef half2 half;
621
+ };
622
+ template <>
623
+ struct is_arithmetic<half2> {
624
+ enum { value = true };
625
+ };
506
626
 
507
- template<> struct packet_traits<Eigen::half> : default_packet_traits
508
- {
627
+ template <>
628
+ struct packet_traits<Eigen::half> : default_packet_traits {
509
629
  typedef Packet4h2 type;
510
630
  typedef Packet4h2 half;
511
631
  enum {
512
632
  Vectorizable = 1,
513
633
  AlignedOnScalar = 1,
514
- size=8,
515
- HasHalfPacket = 0,
516
- HasAdd = 1,
517
- HasSub = 1,
518
- HasMul = 1,
519
- HasDiv = 1,
520
- HasSqrt = 1,
521
- HasRsqrt = 1,
522
- HasExp = 1,
523
- HasExpm1 = 1,
524
- HasLog = 1,
525
- HasLog1p = 1
634
+ size = 8,
635
+ HasAdd = 1,
636
+ HasSub = 1,
637
+ HasMul = 1,
638
+ HasDiv = 1,
639
+ HasSqrt = 1,
640
+ HasRsqrt = 1,
641
+ HasExp = 1,
642
+ HasExpm1 = 1,
643
+ HasLog = 1,
644
+ HasLog1p = 1
526
645
  };
527
646
  };
528
647
 
529
- namespace {
530
- // This is equivalent to make_half2, which is undocumented and doesn't seem to always exist.
531
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 combine_half(const __half& a, const __half& b) {
532
- #if defined(EIGEN_GPU_COMPILE_PHASE)
533
- return __halves2half2(a, b);
534
- #else
535
- // Round-about way since __halves2half2 is a __device__ function.
536
- return __floats2half2_rn(__half2float(a), __half2float(b));
537
- #endif
538
- }
539
-
540
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE __half get_half2_low(const half2& a) {
541
- #if defined(EIGEN_GPU_COMPILE_PHASE)
542
- return __low2half(a);
543
- #else
544
- return __float2half(__low2float(a));
545
- #endif
546
- }
547
-
548
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE __half get_half2_high(const half2& a) {
549
- #if defined(EIGEN_GPU_COMPILE_PHASE)
550
- return __high2half(a);
551
- #else
552
- return __float2half(__high2float(a));
553
- #endif
554
- }
555
- } // namespace
556
-
557
- template<>
648
+ template <>
558
649
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pset1<half2>(const Eigen::half& from) {
559
- #if defined(EIGEN_GPU_COMPILE_PHASE)
560
650
  return __half2half2(from);
561
- #else
562
- const float f = __half2float(from);
563
- return __floats2half2_rn(f, f);
564
- #endif
565
651
  }
566
652
 
567
653
  template <>
568
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2
569
- pset1<Packet4h2>(const Eigen::half& from) {
654
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pset1<Packet4h2>(const Eigen::half& from) {
570
655
  Packet4h2 r;
571
656
  half2* p_alias = reinterpret_cast<half2*>(&r);
572
657
  p_alias[0] = pset1<half2>(from);
@@ -576,74 +661,61 @@ pset1<Packet4h2>(const Eigen::half& from) {
576
661
  return r;
577
662
  }
578
663
 
579
- // We now need this visible on both host and device.
580
- // #if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIPCC) || (defined(EIGEN_CUDACC) && EIGEN_COMP_CLANG && !EIGEN_COMP_NVCC)
581
664
  namespace {
582
665
 
583
666
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pload(const Eigen::half* from) {
584
667
  return *reinterpret_cast<const half2*>(from);
585
668
  }
586
669
 
587
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ploadu(const Eigen::half* from) {
588
- return combine_half(from[0], from[1]);
589
- }
670
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ploadu(const Eigen::half* from) { return __halves2half2(from[0], from[1]); }
590
671
 
591
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ploaddup(const Eigen::half* from) {
592
- return combine_half(from[0], from[0]);
672
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ploaddup(const Eigen::half* from) {
673
+ return __halves2half2(from[0], from[0]);
593
674
  }
594
675
 
595
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore(Eigen::half* to,
596
- const half2& from) {
676
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore(Eigen::half* to, const half2& from) {
597
677
  *reinterpret_cast<half2*>(to) = from;
598
678
  }
599
679
 
600
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu(Eigen::half* to,
601
- const half2& from) {
602
- to[0] = get_half2_low(from);
603
- to[1] = get_half2_high(from);
680
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu(Eigen::half* to, const half2& from) {
681
+ to[0] = __low2half(from);
682
+ to[1] = __high2half(from);
604
683
  }
605
684
 
606
-
607
- EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro_aligned(
608
- const Eigen::half* from) {
685
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro_aligned(const Eigen::half* from) {
609
686
  #if defined(EIGEN_GPU_HAS_LDG)
610
687
  // Input is guaranteed to be properly aligned.
611
688
  return __ldg(reinterpret_cast<const half2*>(from));
612
689
  #else
613
- return combine_half(*(from+0), *(from+1));
690
+ return __halves2half2(*(from + 0), *(from + 1));
614
691
  #endif
615
692
  }
616
693
 
617
- EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro_unaligned(
618
- const Eigen::half* from) {
694
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro_unaligned(const Eigen::half* from) {
619
695
  #if defined(EIGEN_GPU_HAS_LDG)
620
- return __halves2half2(__ldg(from+0), __ldg(from+1));
696
+ return __halves2half2(__ldg(from + 0), __ldg(from + 1));
621
697
  #else
622
- return combine_half(*(from+0), *(from+1));
698
+ return __halves2half2(*(from + 0), *(from + 1));
623
699
  #endif
624
700
  }
625
701
 
626
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pgather(const Eigen::half* from,
627
- Index stride) {
628
- return combine_half(from[0*stride], from[1*stride]);
702
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pgather(const Eigen::half* from, Index stride) {
703
+ return __halves2half2(from[0 * stride], from[1 * stride]);
629
704
  }
630
705
 
631
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter(
632
- Eigen::half* to, const half2& from, Index stride) {
633
- to[stride*0] = get_half2_low(from);
634
- to[stride*1] = get_half2_high(from);
706
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter(Eigen::half* to, const half2& from, Index stride) {
707
+ to[stride * 0] = __low2half(from);
708
+ to[stride * 1] = __high2half(from);
635
709
  }
636
710
 
637
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half pfirst(const half2& a) {
638
- return get_half2_low(a);
639
- }
711
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half pfirst(const half2& a) { return __low2half(a); }
640
712
 
641
713
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pabs(const half2& a) {
642
- half a1 = get_half2_low(a);
643
- half a2 = get_half2_high(a);
714
+ half a1 = __low2half(a);
715
+ half a2 = __high2half(a);
644
716
  half result1 = half_impl::raw_uint16_to_half(a1.x & 0x7FFF);
645
717
  half result2 = half_impl::raw_uint16_to_half(a2.x & 0x7FFF);
646
- return combine_half(result1, result2);
718
+ return __halves2half2(result1, result2);
647
719
  }
648
720
 
649
721
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ptrue(const half2& /*a*/) {
@@ -656,14 +728,13 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pzero(const half2& /*a*/) {
656
728
  return pset1<half2>(false_half);
657
729
  }
658
730
 
659
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void
660
- ptranspose(PacketBlock<half2,2>& kernel) {
661
- __half a1 = get_half2_low(kernel.packet[0]);
662
- __half a2 = get_half2_high(kernel.packet[0]);
663
- __half b1 = get_half2_low(kernel.packet[1]);
664
- __half b2 = get_half2_high(kernel.packet[1]);
665
- kernel.packet[0] = combine_half(a1, b1);
666
- kernel.packet[1] = combine_half(a2, b2);
731
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<half2, 2>& kernel) {
732
+ __half a1 = __low2half(kernel.packet[0]);
733
+ __half a2 = __high2half(kernel.packet[0]);
734
+ __half b1 = __low2half(kernel.packet[1]);
735
+ __half b2 = __high2half(kernel.packet[1]);
736
+ kernel.packet[0] = __halves2half2(a1, b1);
737
+ kernel.packet[1] = __halves2half2(a2, b2);
667
738
  }
668
739
 
669
740
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plset(const Eigen::half& a) {
@@ -671,92 +742,95 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plset(const Eigen::half& a) {
671
742
  return __halves2half2(a, __hadd(a, __float2half(1.0f)));
672
743
  #else
673
744
  float f = __half2float(a) + 1.0f;
674
- return combine_half(a, __float2half(f));
745
+ return __halves2half2(a, __float2half(f));
675
746
  #endif
676
747
  }
677
748
 
678
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pselect(const half2& mask,
679
- const half2& a,
680
- const half2& b) {
681
- half mask_low = get_half2_low(mask);
682
- half mask_high = get_half2_high(mask);
683
- half result_low = mask_low == half(0) ? get_half2_low(b) : get_half2_low(a);
684
- half result_high = mask_high == half(0) ? get_half2_high(b) : get_half2_high(a);
685
- return combine_half(result_low, result_high);
749
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pselect(const half2& mask, const half2& a, const half2& b) {
750
+ half mask_low = __low2half(mask);
751
+ half mask_high = __high2half(mask);
752
+ half result_low = mask_low == half(0) ? __low2half(b) : __low2half(a);
753
+ half result_high = mask_high == half(0) ? __high2half(b) : __high2half(a);
754
+ return __halves2half2(result_low, result_high);
686
755
  }
687
756
 
688
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pcmp_eq(const half2& a,
689
- const half2& b) {
757
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pcmp_eq(const half2& a, const half2& b) {
690
758
  half true_half = half_impl::raw_uint16_to_half(0xffffu);
691
759
  half false_half = half_impl::raw_uint16_to_half(0x0000u);
692
- half a1 = get_half2_low(a);
693
- half a2 = get_half2_high(a);
694
- half b1 = get_half2_low(b);
695
- half b2 = get_half2_high(b);
760
+ half a1 = __low2half(a);
761
+ half a2 = __high2half(a);
762
+ half b1 = __low2half(b);
763
+ half b2 = __high2half(b);
696
764
  half eq1 = __half2float(a1) == __half2float(b1) ? true_half : false_half;
697
765
  half eq2 = __half2float(a2) == __half2float(b2) ? true_half : false_half;
698
- return combine_half(eq1, eq2);
766
+ return __halves2half2(eq1, eq2);
699
767
  }
700
768
 
701
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pcmp_lt(const half2& a,
702
- const half2& b) {
769
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pcmp_lt(const half2& a, const half2& b) {
703
770
  half true_half = half_impl::raw_uint16_to_half(0xffffu);
704
771
  half false_half = half_impl::raw_uint16_to_half(0x0000u);
705
- half a1 = get_half2_low(a);
706
- half a2 = get_half2_high(a);
707
- half b1 = get_half2_low(b);
708
- half b2 = get_half2_high(b);
772
+ half a1 = __low2half(a);
773
+ half a2 = __high2half(a);
774
+ half b1 = __low2half(b);
775
+ half b2 = __high2half(b);
709
776
  half eq1 = __half2float(a1) < __half2float(b1) ? true_half : false_half;
710
777
  half eq2 = __half2float(a2) < __half2float(b2) ? true_half : false_half;
711
- return combine_half(eq1, eq2);
778
+ return __halves2half2(eq1, eq2);
712
779
  }
713
780
 
714
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pand(const half2& a,
715
- const half2& b) {
716
- half a1 = get_half2_low(a);
717
- half a2 = get_half2_high(a);
718
- half b1 = get_half2_low(b);
719
- half b2 = get_half2_high(b);
781
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pcmp_le(const half2& a, const half2& b) {
782
+ half true_half = half_impl::raw_uint16_to_half(0xffffu);
783
+ half false_half = half_impl::raw_uint16_to_half(0x0000u);
784
+ half a1 = __low2half(a);
785
+ half a2 = __high2half(a);
786
+ half b1 = __low2half(b);
787
+ half b2 = __high2half(b);
788
+ half eq1 = __half2float(a1) <= __half2float(b1) ? true_half : false_half;
789
+ half eq2 = __half2float(a2) <= __half2float(b2) ? true_half : false_half;
790
+ return __halves2half2(eq1, eq2);
791
+ }
792
+
793
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pand(const half2& a, const half2& b) {
794
+ half a1 = __low2half(a);
795
+ half a2 = __high2half(a);
796
+ half b1 = __low2half(b);
797
+ half b2 = __high2half(b);
720
798
  half result1 = half_impl::raw_uint16_to_half(a1.x & b1.x);
721
799
  half result2 = half_impl::raw_uint16_to_half(a2.x & b2.x);
722
- return combine_half(result1, result2);
800
+ return __halves2half2(result1, result2);
723
801
  }
724
802
 
725
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 por(const half2& a,
726
- const half2& b) {
727
- half a1 = get_half2_low(a);
728
- half a2 = get_half2_high(a);
729
- half b1 = get_half2_low(b);
730
- half b2 = get_half2_high(b);
803
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 por(const half2& a, const half2& b) {
804
+ half a1 = __low2half(a);
805
+ half a2 = __high2half(a);
806
+ half b1 = __low2half(b);
807
+ half b2 = __high2half(b);
731
808
  half result1 = half_impl::raw_uint16_to_half(a1.x | b1.x);
732
809
  half result2 = half_impl::raw_uint16_to_half(a2.x | b2.x);
733
- return combine_half(result1, result2);
810
+ return __halves2half2(result1, result2);
734
811
  }
735
812
 
736
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pxor(const half2& a,
737
- const half2& b) {
738
- half a1 = get_half2_low(a);
739
- half a2 = get_half2_high(a);
740
- half b1 = get_half2_low(b);
741
- half b2 = get_half2_high(b);
813
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pxor(const half2& a, const half2& b) {
814
+ half a1 = __low2half(a);
815
+ half a2 = __high2half(a);
816
+ half b1 = __low2half(b);
817
+ half b2 = __high2half(b);
742
818
  half result1 = half_impl::raw_uint16_to_half(a1.x ^ b1.x);
743
819
  half result2 = half_impl::raw_uint16_to_half(a2.x ^ b2.x);
744
- return combine_half(result1, result2);
820
+ return __halves2half2(result1, result2);
745
821
  }
746
822
 
747
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pandnot(const half2& a,
748
- const half2& b) {
749
- half a1 = get_half2_low(a);
750
- half a2 = get_half2_high(a);
751
- half b1 = get_half2_low(b);
752
- half b2 = get_half2_high(b);
823
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pandnot(const half2& a, const half2& b) {
824
+ half a1 = __low2half(a);
825
+ half a2 = __high2half(a);
826
+ half b1 = __low2half(b);
827
+ half b2 = __high2half(b);
753
828
  half result1 = half_impl::raw_uint16_to_half(a1.x & ~b1.x);
754
829
  half result2 = half_impl::raw_uint16_to_half(a2.x & ~b2.x);
755
- return combine_half(result1, result2);
830
+ return __halves2half2(result1, result2);
756
831
  }
757
832
 
758
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 padd(const half2& a,
759
- const half2& b) {
833
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 padd(const half2& a, const half2& b) {
760
834
  #if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
761
835
  return __hadd2(a, b);
762
836
  #else
@@ -770,8 +844,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 padd(const half2& a,
770
844
  #endif
771
845
  }
772
846
 
773
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 psub(const half2& a,
774
- const half2& b) {
847
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 psub(const half2& a, const half2& b) {
775
848
  #if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
776
849
  return __hsub2(a, b);
777
850
  #else
@@ -797,8 +870,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pnegate(const half2& a) {
797
870
 
798
871
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pconj(const half2& a) { return a; }
799
872
 
800
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmul(const half2& a,
801
- const half2& b) {
873
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmul(const half2& a, const half2& b) {
802
874
  #if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
803
875
  return __hmul2(a, b);
804
876
  #else
@@ -812,11 +884,9 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmul(const half2& a,
812
884
  #endif
813
885
  }
814
886
 
815
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmadd(const half2& a,
816
- const half2& b,
817
- const half2& c) {
887
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmadd(const half2& a, const half2& b, const half2& c) {
818
888
  #if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
819
- return __hfma2(a, b, c);
889
+ return __hfma2(a, b, c);
820
890
  #else
821
891
  float a1 = __low2float(a);
822
892
  float a2 = __high2float(a);
@@ -830,8 +900,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmadd(const half2& a,
830
900
  #endif
831
901
  }
832
902
 
833
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pdiv(const half2& a,
834
- const half2& b) {
903
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pdiv(const half2& a, const half2& b) {
835
904
  #if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
836
905
  return __h2div(a, b);
837
906
  #else
@@ -845,26 +914,24 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pdiv(const half2& a,
845
914
  #endif
846
915
  }
847
916
 
848
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmin(const half2& a,
849
- const half2& b) {
917
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmin(const half2& a, const half2& b) {
850
918
  float a1 = __low2float(a);
851
919
  float a2 = __high2float(a);
852
920
  float b1 = __low2float(b);
853
921
  float b2 = __high2float(b);
854
- __half r1 = a1 < b1 ? get_half2_low(a) : get_half2_low(b);
855
- __half r2 = a2 < b2 ? get_half2_high(a) : get_half2_high(b);
856
- return combine_half(r1, r2);
922
+ __half r1 = a1 < b1 ? __low2half(a) : __low2half(b);
923
+ __half r2 = a2 < b2 ? __high2half(a) : __high2half(b);
924
+ return __halves2half2(r1, r2);
857
925
  }
858
926
 
859
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmax(const half2& a,
860
- const half2& b) {
927
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmax(const half2& a, const half2& b) {
861
928
  float a1 = __low2float(a);
862
929
  float a2 = __high2float(a);
863
930
  float b1 = __low2float(b);
864
931
  float b2 = __high2float(b);
865
- __half r1 = a1 > b1 ? get_half2_low(a) : get_half2_low(b);
866
- __half r2 = a2 > b2 ? get_half2_high(a) : get_half2_high(b);
867
- return combine_half(r1, r2);
932
+ __half r1 = a1 > b1 ? __low2half(a) : __low2half(b);
933
+ __half r2 = a2 > b2 ? __high2half(a) : __high2half(b);
934
+ return __halves2half2(r1, r2);
868
935
  }
869
936
 
870
937
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux(const half2& a) {
@@ -885,7 +952,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_max(const half2& a) {
885
952
  #else
886
953
  float a1 = __low2float(a);
887
954
  float a2 = __high2float(a);
888
- return a1 > a2 ? get_half2_low(a) : get_half2_high(a);
955
+ return a1 > a2 ? __low2half(a) : __high2half(a);
889
956
  #endif
890
957
  }
891
958
 
@@ -897,7 +964,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_min(const half2& a) {
897
964
  #else
898
965
  float a1 = __low2float(a);
899
966
  float a2 = __high2float(a);
900
- return a1 < a2 ? get_half2_low(a) : get_half2_high(a);
967
+ return a1 < a2 ? __low2half(a) : __high2half(a);
901
968
  #endif
902
969
  }
903
970
 
@@ -927,28 +994,15 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pexpm1(const half2& a) {
927
994
  return __floats2half2_rn(r1, r2);
928
995
  }
929
996
 
930
- #if (EIGEN_CUDA_SDK_VER >= 80000 && defined(EIGEN_CUDA_HAS_FP16_ARITHMETIC)) || \
931
- defined(EIGEN_HIP_DEVICE_COMPILE)
997
+ #if (EIGEN_CUDA_SDK_VER >= 80000 && defined(EIGEN_CUDA_HAS_FP16_ARITHMETIC)) || defined(EIGEN_HIP_DEVICE_COMPILE)
932
998
 
933
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
934
- half2 plog(const half2& a) {
935
- return h2log(a);
936
- }
999
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plog(const half2& a) { return h2log(a); }
937
1000
 
938
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
939
- half2 pexp(const half2& a) {
940
- return h2exp(a);
941
- }
1001
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pexp(const half2& a) { return h2exp(a); }
942
1002
 
943
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
944
- half2 psqrt(const half2& a) {
945
- return h2sqrt(a);
946
- }
1003
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 psqrt(const half2& a) { return h2sqrt(a); }
947
1004
 
948
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
949
- half2 prsqrt(const half2& a) {
950
- return h2rsqrt(a);
951
- }
1005
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 prsqrt(const half2& a) { return h2rsqrt(a); }
952
1006
 
953
1007
  #else
954
1008
 
@@ -984,18 +1038,16 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 prsqrt(const half2& a) {
984
1038
  return __floats2half2_rn(r1, r2);
985
1039
  }
986
1040
  #endif
987
- } // namespace
1041
+ } // namespace
988
1042
 
989
1043
  template <>
990
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2
991
- pload<Packet4h2>(const Eigen::half* from) {
1044
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pload<Packet4h2>(const Eigen::half* from) {
992
1045
  return *reinterpret_cast<const Packet4h2*>(from);
993
1046
  }
994
1047
 
995
1048
  // unaligned load;
996
1049
  template <>
997
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2
998
- ploadu<Packet4h2>(const Eigen::half* from) {
1050
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 ploadu<Packet4h2>(const Eigen::half* from) {
999
1051
  Packet4h2 r;
1000
1052
  half2* p_alias = reinterpret_cast<half2*>(&r);
1001
1053
  p_alias[0] = ploadu(from + 0);
@@ -1006,8 +1058,7 @@ ploadu<Packet4h2>(const Eigen::half* from) {
1006
1058
  }
1007
1059
 
1008
1060
  template <>
1009
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2
1010
- ploaddup<Packet4h2>(const Eigen::half* from) {
1061
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 ploaddup<Packet4h2>(const Eigen::half* from) {
1011
1062
  Packet4h2 r;
1012
1063
  half2* p_alias = reinterpret_cast<half2*>(&r);
1013
1064
  p_alias[0] = ploaddup(from + 0);
@@ -1018,24 +1069,21 @@ ploaddup<Packet4h2>(const Eigen::half* from) {
1018
1069
  }
1019
1070
 
1020
1071
  template <>
1021
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore<Eigen::half>(
1022
- Eigen::half* to, const Packet4h2& from) {
1072
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore<Eigen::half>(Eigen::half* to, const Packet4h2& from) {
1023
1073
  *reinterpret_cast<Packet4h2*>(to) = from;
1024
1074
  }
1025
1075
 
1026
1076
  template <>
1027
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(
1028
- Eigen::half* to, const Packet4h2& from) {
1077
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(Eigen::half* to, const Packet4h2& from) {
1029
1078
  const half2* from_alias = reinterpret_cast<const half2*>(&from);
1030
- pstoreu(to + 0,from_alias[0]);
1031
- pstoreu(to + 2,from_alias[1]);
1032
- pstoreu(to + 4,from_alias[2]);
1033
- pstoreu(to + 6,from_alias[3]);
1079
+ pstoreu(to + 0, from_alias[0]);
1080
+ pstoreu(to + 2, from_alias[1]);
1081
+ pstoreu(to + 4, from_alias[2]);
1082
+ pstoreu(to + 6, from_alias[3]);
1034
1083
  }
1035
1084
 
1036
1085
  template <>
1037
- EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet4h2
1038
- ploadt_ro<Packet4h2, Aligned>(const Eigen::half* from) {
1086
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet4h2 ploadt_ro<Packet4h2, Aligned>(const Eigen::half* from) {
1039
1087
  #if defined(EIGEN_GPU_HAS_LDG)
1040
1088
  Packet4h2 r;
1041
1089
  r = __ldg(reinterpret_cast<const Packet4h2*>(from));
@@ -1052,8 +1100,7 @@ ploadt_ro<Packet4h2, Aligned>(const Eigen::half* from) {
1052
1100
  }
1053
1101
 
1054
1102
  template <>
1055
- EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet4h2
1056
- ploadt_ro<Packet4h2, Unaligned>(const Eigen::half* from) {
1103
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet4h2 ploadt_ro<Packet4h2, Unaligned>(const Eigen::half* from) {
1057
1104
  Packet4h2 r;
1058
1105
  half2* r_alias = reinterpret_cast<half2*>(&r);
1059
1106
  r_alias[0] = ploadt_ro_unaligned(from + 0);
@@ -1064,20 +1111,19 @@ ploadt_ro<Packet4h2, Unaligned>(const Eigen::half* from) {
1064
1111
  }
1065
1112
 
1066
1113
  template <>
1067
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2
1068
- pgather<Eigen::half, Packet4h2>(const Eigen::half* from, Index stride) {
1114
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pgather<Eigen::half, Packet4h2>(const Eigen::half* from, Index stride) {
1069
1115
  Packet4h2 r;
1070
1116
  half2* p_alias = reinterpret_cast<half2*>(&r);
1071
- p_alias[0] = combine_half(from[0 * stride], from[1 * stride]);
1072
- p_alias[1] = combine_half(from[2 * stride], from[3 * stride]);
1073
- p_alias[2] = combine_half(from[4 * stride], from[5 * stride]);
1074
- p_alias[3] = combine_half(from[6 * stride], from[7 * stride]);
1117
+ p_alias[0] = __halves2half2(from[0 * stride], from[1 * stride]);
1118
+ p_alias[1] = __halves2half2(from[2 * stride], from[3 * stride]);
1119
+ p_alias[2] = __halves2half2(from[4 * stride], from[5 * stride]);
1120
+ p_alias[3] = __halves2half2(from[6 * stride], from[7 * stride]);
1075
1121
  return r;
1076
1122
  }
1077
1123
 
1078
1124
  template <>
1079
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<Eigen::half, Packet4h2>(
1080
- Eigen::half* to, const Packet4h2& from, Index stride) {
1125
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<Eigen::half, Packet4h2>(Eigen::half* to, const Packet4h2& from,
1126
+ Index stride) {
1081
1127
  const half2* from_alias = reinterpret_cast<const half2*>(&from);
1082
1128
  pscatter(to + stride * 0, from_alias[0], stride);
1083
1129
  pscatter(to + stride * 2, from_alias[1], stride);
@@ -1086,14 +1132,12 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<Eigen::half, Packet4h2>(
1086
1132
  }
1087
1133
 
1088
1134
  template <>
1089
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half pfirst<Packet4h2>(
1090
- const Packet4h2& a) {
1135
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half pfirst<Packet4h2>(const Packet4h2& a) {
1091
1136
  return pfirst(*(reinterpret_cast<const half2*>(&a)));
1092
1137
  }
1093
1138
 
1094
1139
  template <>
1095
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pabs<Packet4h2>(
1096
- const Packet4h2& a) {
1140
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pabs<Packet4h2>(const Packet4h2& a) {
1097
1141
  Packet4h2 r;
1098
1142
  half2* p_alias = reinterpret_cast<half2*>(&r);
1099
1143
  const half2* a_alias = reinterpret_cast<const half2*>(&a);
@@ -1105,8 +1149,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pabs<Packet4h2>(
1105
1149
  }
1106
1150
 
1107
1151
  template <>
1108
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 ptrue<Packet4h2>(
1109
- const Packet4h2& /*a*/) {
1152
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 ptrue<Packet4h2>(const Packet4h2& /*a*/) {
1110
1153
  half true_half = half_impl::raw_uint16_to_half(0xffffu);
1111
1154
  return pset1<Packet4h2>(true_half);
1112
1155
  }
@@ -1117,9 +1160,9 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pzero<Packet4h2>(const Packet4h2
1117
1160
  return pset1<Packet4h2>(false_half);
1118
1161
  }
1119
1162
 
1120
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose_double(
1121
- double* d_row0, double* d_row1, double* d_row2, double* d_row3,
1122
- double* d_row4, double* d_row5, double* d_row6, double* d_row7) {
1163
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose_double(double* d_row0, double* d_row1, double* d_row2,
1164
+ double* d_row3, double* d_row4, double* d_row5,
1165
+ double* d_row6, double* d_row7) {
1123
1166
  double d_tmp;
1124
1167
  d_tmp = d_row0[1];
1125
1168
  d_row0[1] = d_row4[0];
@@ -1138,8 +1181,8 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose_double(
1138
1181
  d_row7[0] = d_tmp;
1139
1182
  }
1140
1183
 
1141
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose_half2(
1142
- half2* f_row0, half2* f_row1, half2* f_row2, half2* f_row3) {
1184
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose_half2(half2* f_row0, half2* f_row1, half2* f_row2,
1185
+ half2* f_row3) {
1143
1186
  half2 f_tmp;
1144
1187
  f_tmp = f_row0[1];
1145
1188
  f_row0[1] = f_row2[0];
@@ -1150,18 +1193,16 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose_half2(
1150
1193
  f_row3[0] = f_tmp;
1151
1194
  }
1152
1195
 
1153
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void
1154
- ptranspose_half(half2& f0, half2& f1) {
1155
- __half a1 = get_half2_low(f0);
1156
- __half a2 = get_half2_high(f0);
1157
- __half b1 = get_half2_low(f1);
1158
- __half b2 = get_half2_high(f1);
1159
- f0 = combine_half(a1, b1);
1160
- f1 = combine_half(a2, b2);
1196
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose_half(half2& f0, half2& f1) {
1197
+ __half a1 = __low2half(f0);
1198
+ __half a2 = __high2half(f0);
1199
+ __half b1 = __low2half(f1);
1200
+ __half b2 = __high2half(f1);
1201
+ f0 = __halves2half2(a1, b1);
1202
+ f1 = __halves2half2(a2, b2);
1161
1203
  }
1162
1204
 
1163
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void
1164
- ptranspose(PacketBlock<Packet4h2,8>& kernel) {
1205
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4h2, 8>& kernel) {
1165
1206
  double* d_row0 = reinterpret_cast<double*>(&kernel.packet[0]);
1166
1207
  double* d_row1 = reinterpret_cast<double*>(&kernel.packet[1]);
1167
1208
  double* d_row2 = reinterpret_cast<double*>(&kernel.packet[2]);
@@ -1170,9 +1211,7 @@ ptranspose(PacketBlock<Packet4h2,8>& kernel) {
1170
1211
  double* d_row5 = reinterpret_cast<double*>(&kernel.packet[5]);
1171
1212
  double* d_row6 = reinterpret_cast<double*>(&kernel.packet[6]);
1172
1213
  double* d_row7 = reinterpret_cast<double*>(&kernel.packet[7]);
1173
- ptranspose_double(d_row0, d_row1, d_row2, d_row3,
1174
- d_row4, d_row5, d_row6, d_row7);
1175
-
1214
+ ptranspose_double(d_row0, d_row1, d_row2, d_row3, d_row4, d_row5, d_row6, d_row7);
1176
1215
 
1177
1216
  half2* f_row0 = reinterpret_cast<half2*>(d_row0);
1178
1217
  half2* f_row1 = reinterpret_cast<half2*>(d_row1);
@@ -1213,23 +1252,18 @@ ptranspose(PacketBlock<Packet4h2,8>& kernel) {
1213
1252
  ptranspose_half(f_row0[1], f_row1[1]);
1214
1253
  ptranspose_half(f_row2[0], f_row3[0]);
1215
1254
  ptranspose_half(f_row2[1], f_row3[1]);
1216
-
1217
1255
  }
1218
1256
 
1219
1257
  template <>
1220
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2
1221
- plset<Packet4h2>(const Eigen::half& a) {
1258
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 plset<Packet4h2>(const Eigen::half& a) {
1222
1259
  #if defined(EIGEN_HIP_DEVICE_COMPILE)
1223
1260
 
1224
1261
  Packet4h2 r;
1225
1262
  half2* p_alias = reinterpret_cast<half2*>(&r);
1226
1263
  p_alias[0] = __halves2half2(a, __hadd(a, __float2half(1.0f)));
1227
- p_alias[1] = __halves2half2(__hadd(a, __float2half(2.0f)),
1228
- __hadd(a, __float2half(3.0f)));
1229
- p_alias[2] = __halves2half2(__hadd(a, __float2half(4.0f)),
1230
- __hadd(a, __float2half(5.0f)));
1231
- p_alias[3] = __halves2half2(__hadd(a, __float2half(6.0f)),
1232
- __hadd(a, __float2half(7.0f)));
1264
+ p_alias[1] = __halves2half2(__hadd(a, __float2half(2.0f)), __hadd(a, __float2half(3.0f)));
1265
+ p_alias[2] = __halves2half2(__hadd(a, __float2half(4.0f)), __hadd(a, __float2half(5.0f)));
1266
+ p_alias[3] = __halves2half2(__hadd(a, __float2half(6.0f)), __hadd(a, __float2half(7.0f)));
1233
1267
  return r;
1234
1268
  #elif defined(EIGEN_CUDA_HAS_FP16_ARITHMETIC)
1235
1269
  Packet4h2 r;
@@ -1237,8 +1271,8 @@ plset<Packet4h2>(const Eigen::half& a) {
1237
1271
 
1238
1272
  half2 b = pset1<half2>(a);
1239
1273
  half2 c;
1240
- half2 half_offset0 = __halves2half2(__float2half(0.0f),__float2half(2.0f));
1241
- half2 half_offset1 = __halves2half2(__float2half(4.0f),__float2half(6.0f));
1274
+ half2 half_offset0 = __halves2half2(__float2half(0.0f), __float2half(2.0f));
1275
+ half2 half_offset1 = __halves2half2(__float2half(4.0f), __float2half(6.0f));
1242
1276
 
1243
1277
  c = __hadd2(b, half_offset0);
1244
1278
  r_alias[0] = plset(__low2half(c));
@@ -1254,18 +1288,17 @@ plset<Packet4h2>(const Eigen::half& a) {
1254
1288
  float f = __half2float(a);
1255
1289
  Packet4h2 r;
1256
1290
  half2* p_alias = reinterpret_cast<half2*>(&r);
1257
- p_alias[0] = combine_half(a, __float2half(f + 1.0f));
1258
- p_alias[1] = combine_half(__float2half(f + 2.0f), __float2half(f + 3.0f));
1259
- p_alias[2] = combine_half(__float2half(f + 4.0f), __float2half(f + 5.0f));
1260
- p_alias[3] = combine_half(__float2half(f + 6.0f), __float2half(f + 7.0f));
1291
+ p_alias[0] = __halves2half2(a, __float2half(f + 1.0f));
1292
+ p_alias[1] = __halves2half2(__float2half(f + 2.0f), __float2half(f + 3.0f));
1293
+ p_alias[2] = __halves2half2(__float2half(f + 4.0f), __float2half(f + 5.0f));
1294
+ p_alias[3] = __halves2half2(__float2half(f + 6.0f), __float2half(f + 7.0f));
1261
1295
  return r;
1262
1296
  #endif
1263
1297
  }
1264
1298
 
1265
1299
  template <>
1266
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2
1267
- pselect<Packet4h2>(const Packet4h2& mask, const Packet4h2& a,
1268
- const Packet4h2& b) {
1300
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pselect<Packet4h2>(const Packet4h2& mask, const Packet4h2& a,
1301
+ const Packet4h2& b) {
1269
1302
  Packet4h2 r;
1270
1303
  half2* r_alias = reinterpret_cast<half2*>(&r);
1271
1304
  const half2* mask_alias = reinterpret_cast<const half2*>(&mask);
@@ -1279,8 +1312,7 @@ pselect<Packet4h2>(const Packet4h2& mask, const Packet4h2& a,
1279
1312
  }
1280
1313
 
1281
1314
  template <>
1282
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2
1283
- pcmp_eq<Packet4h2>(const Packet4h2& a, const Packet4h2& b) {
1315
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pcmp_eq<Packet4h2>(const Packet4h2& a, const Packet4h2& b) {
1284
1316
  Packet4h2 r;
1285
1317
  half2* r_alias = reinterpret_cast<half2*>(&r);
1286
1318
  const half2* a_alias = reinterpret_cast<const half2*>(&a);
@@ -1293,8 +1325,33 @@ pcmp_eq<Packet4h2>(const Packet4h2& a, const Packet4h2& b) {
1293
1325
  }
1294
1326
 
1295
1327
  template <>
1296
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pand<Packet4h2>(
1297
- const Packet4h2& a, const Packet4h2& b) {
1328
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pcmp_lt<Packet4h2>(const Packet4h2& a, const Packet4h2& b) {
1329
+ Packet4h2 r;
1330
+ half2* r_alias = reinterpret_cast<half2*>(&r);
1331
+ const half2* a_alias = reinterpret_cast<const half2*>(&a);
1332
+ const half2* b_alias = reinterpret_cast<const half2*>(&b);
1333
+ r_alias[0] = pcmp_lt(a_alias[0], b_alias[0]);
1334
+ r_alias[1] = pcmp_lt(a_alias[1], b_alias[1]);
1335
+ r_alias[2] = pcmp_lt(a_alias[2], b_alias[2]);
1336
+ r_alias[3] = pcmp_lt(a_alias[3], b_alias[3]);
1337
+ return r;
1338
+ }
1339
+
1340
+ template <>
1341
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pcmp_le<Packet4h2>(const Packet4h2& a, const Packet4h2& b) {
1342
+ Packet4h2 r;
1343
+ half2* r_alias = reinterpret_cast<half2*>(&r);
1344
+ const half2* a_alias = reinterpret_cast<const half2*>(&a);
1345
+ const half2* b_alias = reinterpret_cast<const half2*>(&b);
1346
+ r_alias[0] = pcmp_le(a_alias[0], b_alias[0]);
1347
+ r_alias[1] = pcmp_le(a_alias[1], b_alias[1]);
1348
+ r_alias[2] = pcmp_le(a_alias[2], b_alias[2]);
1349
+ r_alias[3] = pcmp_le(a_alias[3], b_alias[3]);
1350
+ return r;
1351
+ }
1352
+
1353
+ template <>
1354
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pand<Packet4h2>(const Packet4h2& a, const Packet4h2& b) {
1298
1355
  Packet4h2 r;
1299
1356
  half2* r_alias = reinterpret_cast<half2*>(&r);
1300
1357
  const half2* a_alias = reinterpret_cast<const half2*>(&a);
@@ -1307,8 +1364,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pand<Packet4h2>(
1307
1364
  }
1308
1365
 
1309
1366
  template <>
1310
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 por<Packet4h2>(
1311
- const Packet4h2& a, const Packet4h2& b) {
1367
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 por<Packet4h2>(const Packet4h2& a, const Packet4h2& b) {
1312
1368
  Packet4h2 r;
1313
1369
  half2* r_alias = reinterpret_cast<half2*>(&r);
1314
1370
  const half2* a_alias = reinterpret_cast<const half2*>(&a);
@@ -1321,8 +1377,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 por<Packet4h2>(
1321
1377
  }
1322
1378
 
1323
1379
  template <>
1324
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pxor<Packet4h2>(
1325
- const Packet4h2& a, const Packet4h2& b) {
1380
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pxor<Packet4h2>(const Packet4h2& a, const Packet4h2& b) {
1326
1381
  Packet4h2 r;
1327
1382
  half2* r_alias = reinterpret_cast<half2*>(&r);
1328
1383
  const half2* a_alias = reinterpret_cast<const half2*>(&a);
@@ -1335,8 +1390,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pxor<Packet4h2>(
1335
1390
  }
1336
1391
 
1337
1392
  template <>
1338
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2
1339
- pandnot<Packet4h2>(const Packet4h2& a, const Packet4h2& b) {
1393
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pandnot<Packet4h2>(const Packet4h2& a, const Packet4h2& b) {
1340
1394
  Packet4h2 r;
1341
1395
  half2* r_alias = reinterpret_cast<half2*>(&r);
1342
1396
  const half2* a_alias = reinterpret_cast<const half2*>(&a);
@@ -1349,8 +1403,7 @@ pandnot<Packet4h2>(const Packet4h2& a, const Packet4h2& b) {
1349
1403
  }
1350
1404
 
1351
1405
  template <>
1352
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 padd<Packet4h2>(
1353
- const Packet4h2& a, const Packet4h2& b) {
1406
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 padd<Packet4h2>(const Packet4h2& a, const Packet4h2& b) {
1354
1407
  Packet4h2 r;
1355
1408
  half2* r_alias = reinterpret_cast<half2*>(&r);
1356
1409
  const half2* a_alias = reinterpret_cast<const half2*>(&a);
@@ -1363,8 +1416,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 padd<Packet4h2>(
1363
1416
  }
1364
1417
 
1365
1418
  template <>
1366
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 psub<Packet4h2>(
1367
- const Packet4h2& a, const Packet4h2& b) {
1419
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 psub<Packet4h2>(const Packet4h2& a, const Packet4h2& b) {
1368
1420
  Packet4h2 r;
1369
1421
  half2* r_alias = reinterpret_cast<half2*>(&r);
1370
1422
  const half2* a_alias = reinterpret_cast<const half2*>(&a);
@@ -1394,8 +1446,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pconj(const Packet4h2& a) {
1394
1446
  }
1395
1447
 
1396
1448
  template <>
1397
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pmul<Packet4h2>(
1398
- const Packet4h2& a, const Packet4h2& b) {
1449
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pmul<Packet4h2>(const Packet4h2& a, const Packet4h2& b) {
1399
1450
  Packet4h2 r;
1400
1451
  half2* r_alias = reinterpret_cast<half2*>(&r);
1401
1452
  const half2* a_alias = reinterpret_cast<const half2*>(&a);
@@ -1408,8 +1459,8 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pmul<Packet4h2>(
1408
1459
  }
1409
1460
 
1410
1461
  template <>
1411
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pmadd<Packet4h2>(
1412
- const Packet4h2& a, const Packet4h2& b, const Packet4h2& c) {
1462
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pmadd<Packet4h2>(const Packet4h2& a, const Packet4h2& b,
1463
+ const Packet4h2& c) {
1413
1464
  Packet4h2 r;
1414
1465
  half2* r_alias = reinterpret_cast<half2*>(&r);
1415
1466
  const half2* a_alias = reinterpret_cast<const half2*>(&a);
@@ -1423,8 +1474,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pmadd<Packet4h2>(
1423
1474
  }
1424
1475
 
1425
1476
  template <>
1426
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pdiv<Packet4h2>(
1427
- const Packet4h2& a, const Packet4h2& b) {
1477
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pdiv<Packet4h2>(const Packet4h2& a, const Packet4h2& b) {
1428
1478
  Packet4h2 r;
1429
1479
  half2* r_alias = reinterpret_cast<half2*>(&r);
1430
1480
  const half2* a_alias = reinterpret_cast<const half2*>(&a);
@@ -1437,8 +1487,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pdiv<Packet4h2>(
1437
1487
  }
1438
1488
 
1439
1489
  template <>
1440
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pmin<Packet4h2>(
1441
- const Packet4h2& a, const Packet4h2& b) {
1490
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pmin<Packet4h2>(const Packet4h2& a, const Packet4h2& b) {
1442
1491
  Packet4h2 r;
1443
1492
  half2* r_alias = reinterpret_cast<half2*>(&r);
1444
1493
  const half2* a_alias = reinterpret_cast<const half2*>(&a);
@@ -1451,8 +1500,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pmin<Packet4h2>(
1451
1500
  }
1452
1501
 
1453
1502
  template <>
1454
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pmax<Packet4h2>(
1455
- const Packet4h2& a, const Packet4h2& b) {
1503
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pmax<Packet4h2>(const Packet4h2& a, const Packet4h2& b) {
1456
1504
  Packet4h2 r;
1457
1505
  half2* r_alias = reinterpret_cast<half2*>(&r);
1458
1506
  const half2* a_alias = reinterpret_cast<const half2*>(&a);
@@ -1465,64 +1513,53 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pmax<Packet4h2>(
1465
1513
  }
1466
1514
 
1467
1515
  template <>
1468
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux<Packet4h2>(
1469
- const Packet4h2& a) {
1516
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux<Packet4h2>(const Packet4h2& a) {
1470
1517
  const half2* a_alias = reinterpret_cast<const half2*>(&a);
1471
1518
 
1472
- return predux(a_alias[0]) + predux(a_alias[1]) +
1473
- predux(a_alias[2]) + predux(a_alias[3]);
1519
+ return predux(a_alias[0]) + predux(a_alias[1]) + predux(a_alias[2]) + predux(a_alias[3]);
1474
1520
  }
1475
1521
 
1476
1522
  template <>
1477
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_max<Packet4h2>(
1478
- const Packet4h2& a) {
1523
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_max<Packet4h2>(const Packet4h2& a) {
1479
1524
  const half2* a_alias = reinterpret_cast<const half2*>(&a);
1480
- half2 m0 = combine_half(predux_max(a_alias[0]),
1481
- predux_max(a_alias[1]));
1482
- half2 m1 = combine_half(predux_max(a_alias[2]),
1483
- predux_max(a_alias[3]));
1484
- __half first = predux_max(m0);
1525
+ half2 m0 = __halves2half2(predux_max(a_alias[0]), predux_max(a_alias[1]));
1526
+ half2 m1 = __halves2half2(predux_max(a_alias[2]), predux_max(a_alias[3]));
1527
+ __half first = predux_max(m0);
1485
1528
  __half second = predux_max(m1);
1486
1529
  #if defined(EIGEN_CUDA_HAS_FP16_ARITHMETIC)
1487
1530
  return (__hgt(first, second) ? first : second);
1488
1531
  #else
1489
- float ffirst = __half2float(first);
1532
+ float ffirst = __half2float(first);
1490
1533
  float fsecond = __half2float(second);
1491
- return (ffirst > fsecond)? first: second;
1534
+ return (ffirst > fsecond) ? first : second;
1492
1535
  #endif
1493
1536
  }
1494
1537
 
1495
1538
  template <>
1496
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_min<Packet4h2>(
1497
- const Packet4h2& a) {
1539
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_min<Packet4h2>(const Packet4h2& a) {
1498
1540
  const half2* a_alias = reinterpret_cast<const half2*>(&a);
1499
- half2 m0 = combine_half(predux_min(a_alias[0]),
1500
- predux_min(a_alias[1]));
1501
- half2 m1 = combine_half(predux_min(a_alias[2]),
1502
- predux_min(a_alias[3]));
1503
- __half first = predux_min(m0);
1541
+ half2 m0 = __halves2half2(predux_min(a_alias[0]), predux_min(a_alias[1]));
1542
+ half2 m1 = __halves2half2(predux_min(a_alias[2]), predux_min(a_alias[3]));
1543
+ __half first = predux_min(m0);
1504
1544
  __half second = predux_min(m1);
1505
1545
  #if defined(EIGEN_CUDA_HAS_FP16_ARITHMETIC)
1506
1546
  return (__hlt(first, second) ? first : second);
1507
1547
  #else
1508
- float ffirst = __half2float(first);
1548
+ float ffirst = __half2float(first);
1509
1549
  float fsecond = __half2float(second);
1510
- return (ffirst < fsecond)? first: second;
1550
+ return (ffirst < fsecond) ? first : second;
1511
1551
  #endif
1512
1552
  }
1513
1553
 
1514
1554
  // likely overflow/underflow
1515
1555
  template <>
1516
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_mul<Packet4h2>(
1517
- const Packet4h2& a) {
1556
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_mul<Packet4h2>(const Packet4h2& a) {
1518
1557
  const half2* a_alias = reinterpret_cast<const half2*>(&a);
1519
- return predux_mul(pmul(pmul(a_alias[0], a_alias[1]),
1520
- pmul(a_alias[2], a_alias[3])));
1558
+ return predux_mul(pmul(pmul(a_alias[0], a_alias[1]), pmul(a_alias[2], a_alias[3])));
1521
1559
  }
1522
1560
 
1523
1561
  template <>
1524
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2
1525
- plog1p<Packet4h2>(const Packet4h2& a) {
1562
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 plog1p<Packet4h2>(const Packet4h2& a) {
1526
1563
  Packet4h2 r;
1527
1564
  half2* r_alias = reinterpret_cast<half2*>(&r);
1528
1565
  const half2* a_alias = reinterpret_cast<const half2*>(&a);
@@ -1534,8 +1571,7 @@ plog1p<Packet4h2>(const Packet4h2& a) {
1534
1571
  }
1535
1572
 
1536
1573
  template <>
1537
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2
1538
- pexpm1<Packet4h2>(const Packet4h2& a) {
1574
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pexpm1<Packet4h2>(const Packet4h2& a) {
1539
1575
  Packet4h2 r;
1540
1576
  half2* r_alias = reinterpret_cast<half2*>(&r);
1541
1577
  const half2* a_alias = reinterpret_cast<const half2*>(&a);
@@ -1583,8 +1619,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 psqrt<Packet4h2>(const Packet4h2
1583
1619
  }
1584
1620
 
1585
1621
  template <>
1586
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2
1587
- prsqrt<Packet4h2>(const Packet4h2& a) {
1622
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 prsqrt<Packet4h2>(const Packet4h2& a) {
1588
1623
  Packet4h2 r;
1589
1624
  half2* r_alias = reinterpret_cast<half2*>(&r);
1590
1625
  const half2* a_alias = reinterpret_cast<const half2*>(&a);
@@ -1597,9 +1632,8 @@ prsqrt<Packet4h2>(const Packet4h2& a) {
1597
1632
 
1598
1633
  // The following specialized padd, pmul, pdiv, pmin, pmax, pset1 are needed for
1599
1634
  // the implementation of GPU half reduction.
1600
- template<>
1601
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 padd<half2>(const half2& a,
1602
- const half2& b) {
1635
+ template <>
1636
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 padd<half2>(const half2& a, const half2& b) {
1603
1637
  #if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
1604
1638
  return __hadd2(a, b);
1605
1639
  #else
@@ -1613,9 +1647,8 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 padd<half2>(const half2& a,
1613
1647
  #endif
1614
1648
  }
1615
1649
 
1616
- template<>
1617
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmul<half2>(const half2& a,
1618
- const half2& b) {
1650
+ template <>
1651
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmul<half2>(const half2& a, const half2& b) {
1619
1652
  #if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
1620
1653
  return __hmul2(a, b);
1621
1654
  #else
@@ -1629,9 +1662,8 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmul<half2>(const half2& a,
1629
1662
  #endif
1630
1663
  }
1631
1664
 
1632
- template<>
1633
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pdiv<half2>(const half2& a,
1634
- const half2& b) {
1665
+ template <>
1666
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pdiv<half2>(const half2& a, const half2& b) {
1635
1667
  #if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
1636
1668
  return __h2div(a, b);
1637
1669
  #else
@@ -1645,41 +1677,36 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pdiv<half2>(const half2& a,
1645
1677
  #endif
1646
1678
  }
1647
1679
 
1648
- template<>
1649
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmin<half2>(const half2& a,
1650
- const half2& b) {
1680
+ template <>
1681
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmin<half2>(const half2& a, const half2& b) {
1651
1682
  float a1 = __low2float(a);
1652
1683
  float a2 = __high2float(a);
1653
1684
  float b1 = __low2float(b);
1654
1685
  float b2 = __high2float(b);
1655
- __half r1 = a1 < b1 ? get_half2_low(a) : get_half2_low(b);
1656
- __half r2 = a2 < b2 ? get_half2_high(a) : get_half2_high(b);
1657
- return combine_half(r1, r2);
1686
+ __half r1 = a1 < b1 ? __low2half(a) : __low2half(b);
1687
+ __half r2 = a2 < b2 ? __high2half(a) : __high2half(b);
1688
+ return __halves2half2(r1, r2);
1658
1689
  }
1659
1690
 
1660
- template<>
1661
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmax<half2>(const half2& a,
1662
- const half2& b) {
1691
+ template <>
1692
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmax<half2>(const half2& a, const half2& b) {
1663
1693
  float a1 = __low2float(a);
1664
1694
  float a2 = __high2float(a);
1665
1695
  float b1 = __low2float(b);
1666
1696
  float b2 = __high2float(b);
1667
- __half r1 = a1 > b1 ? get_half2_low(a) : get_half2_low(b);
1668
- __half r2 = a2 > b2 ? get_half2_high(a) : get_half2_high(b);
1669
- return combine_half(r1, r2);
1697
+ __half r1 = a1 > b1 ? __low2half(a) : __low2half(b);
1698
+ __half r2 = a2 > b2 ? __high2half(a) : __high2half(b);
1699
+ return __halves2half2(r1, r2);
1670
1700
  }
1671
1701
 
1672
- // #endif // defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIPCC) || (defined(EIGEN_CUDACC) && EIGEN_COMP_CLANG && !EIGEN_COMP_NVCC)
1673
-
1674
- #endif // defined(EIGEN_HAS_CUDA_FP16) || defined(EIGEN_HAS_HIP_FP16)
1702
+ #endif // (defined(EIGEN_HAS_CUDA_FP16) || defined(EIGEN_HAS_HIP_FP16)) && defined(EIGEN_GPU_COMPILE_PHASE)
1675
1703
 
1676
1704
  #undef EIGEN_GPU_HAS_LDG
1677
1705
  #undef EIGEN_CUDA_HAS_FP16_ARITHMETIC
1678
1706
  #undef EIGEN_GPU_HAS_FP16_ARITHMETIC
1679
1707
 
1680
- } // end namespace internal
1681
-
1682
- } // end namespace Eigen
1708
+ } // end namespace internal
1683
1709
 
1710
+ } // end namespace Eigen
1684
1711
 
1685
- #endif // EIGEN_PACKET_MATH_GPU_H
1712
+ #endif // EIGEN_PACKET_MATH_GPU_H